├── airbyte ├── py.typed ├── _processors │ ├── sql │ │ ├── __init__.py │ │ ├── motherduck.py │ │ └── postgres.py │ └── __init__.py ├── _executors │ ├── __init__.py │ ├── local.py │ └── docker.py ├── _writers │ ├── __init__.py │ ├── jsonl.py │ └── base.py ├── version.py ├── shared │ ├── __init__.py │ └── state_writers.py ├── _util │ ├── __init__.py │ ├── venv_util.py │ ├── api_duck_types.py │ ├── text_util.py │ ├── connector_info.py │ ├── hashing.py │ ├── api_imports.py │ ├── pip_util.py │ ├── temp_files.py │ ├── name_normalizers.py │ └── document_rendering.py ├── datasets │ ├── __init__.py │ ├── _map.py │ ├── _inmemory.py │ ├── _lazy.py │ └── _base.py ├── caches │ ├── generic.py │ ├── __init__.py │ ├── duckdb.py │ ├── postgres.py │ ├── _state_backend_base.py │ ├── bigquery.py │ ├── snowflake.py │ └── motherduck.py ├── sources │ ├── registry.py │ └── __init__.py ├── cloud │ ├── constants.py │ ├── __init__.py │ └── auth.py ├── secrets │ ├── prompt.py │ ├── google_colab.py │ ├── custom.py │ ├── env_vars.py │ └── config.py ├── experimental │ └── __init__.py ├── mcp │ ├── server.py │ ├── _annotations.py │ └── prompts.py ├── callbacks.py ├── documents.py ├── _batch_handles.py ├── strategies.py └── destinations │ └── __init__.py ├── tests ├── docs_tests │ ├── __init__.py │ └── test_docs_checked_in.py ├── lint_tests │ ├── __init__.py │ ├── test_mypy.py │ └── test_ruff.py ├── unit_tests │ ├── __init__.py │ ├── test_exceptions.py │ ├── test_lowcode_connectors.py │ ├── test_caches.py │ ├── test_processors.py │ └── test_pip_helpers.py ├── integration_tests │ ├── __init__.py │ ├── cloud │ │ ├── __init__.py │ │ ├── test_cloud_workspaces.py │ │ └── test_cloud_sync.py │ ├── secrets │ │ ├── __init__.py │ │ └── test_gsm_secrets.py │ ├── destinations │ │ └── __init__.py │ ├── fixtures │ │ ├── source-test │ │ │ ├── source_test │ │ │ │ └── __init__.py │ │ │ ├── setup.py │ │ │ └── metadata.yaml │ │ ├── invalid_config.json │ │ ├── valid_config.json │ │ ├── source-broken │ │ │ ├── source_broken │ │ │ │ └── run.py │ │ │ ├── setup.py │ │ │ └── metadata.yaml │ │ └── registry.json │ ├── test_bigquery_cache.py │ ├── test_install.py │ ├── test_validation.py │ ├── test_registry_spec.py │ └── test_config_change_callback.py └── pyproject.toml ├── docs ├── .gitignore ├── templates │ └── theme.css ├── generate.py └── faq.md ├── poetry.toml ├── .gitattributes ├── .viztracerrc ├── .github ├── mcp-ci-tests.mcp.json ├── workflows │ ├── actionlint.yml │ ├── release_drafter.yml │ ├── label-community-prs.yml │ ├── pydoc_preview.yml │ ├── poe-command.yml │ ├── auto_merge_notification.yml │ ├── autofix.yml │ ├── slash_command_dispatch.yml │ ├── semantic_pr_check.yml │ ├── pydoc_publish.yml │ ├── welcome-message.yml │ ├── python_lint.yml │ └── pypi_publish.yml ├── dependabot.yml ├── release-drafter.yml ├── pr-welcome-internal.md └── pr-welcome-community.md ├── pyrefly.toml ├── examples ├── run_faker_samples.py ├── pyproject.toml ├── run_file_source.py ├── run_test_source_single_stream.py ├── run_pokeapi.py ├── run_test_source.py ├── run_spacex.py ├── run_get_documents_from_github.py ├── run_faker.py ├── run_github.py ├── run_github_samples.py ├── run_faker_to_motherduck.py ├── run_snowflake_faker.py ├── run_downloadable_yaml_source.py ├── run_sync_to_destination_w_cache.py ├── run_bigquery_destination.py ├── run_sync_to_destination_from_read_result.py ├── run_bigquery_faker.py ├── run_sync_to_destination_wo_cache.py ├── run_gsm_connector_secret_fetch.py └── run_integ_test_source.py ├── .gitignore ├── bin └── test_mcp_tool.py └── 
LICENSE.md /airbyte/py.typed: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/docs_tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/lint_tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/unit_tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /docs/.gitignore: -------------------------------------------------------------------------------- 1 | generated 2 | -------------------------------------------------------------------------------- /tests/integration_tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/integration_tests/cloud/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/integration_tests/secrets/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/integration_tests/destinations/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /poetry.toml: -------------------------------------------------------------------------------- 1 | [virtualenvs] 2 | in-project = true 3 | -------------------------------------------------------------------------------- /tests/integration_tests/fixtures/source-test/source_test/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/integration_tests/fixtures/invalid_config.json: -------------------------------------------------------------------------------- 1 | { "apiKey": "wrong" } 2 | -------------------------------------------------------------------------------- /tests/integration_tests/fixtures/valid_config.json: -------------------------------------------------------------------------------- 1 | { "apiKey": "test" } 2 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | # Hide diffs in auto-generated files 2 | docs/generated/**/* linguist-generated=true 3 | -------------------------------------------------------------------------------- /airbyte/_processors/sql/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Airbyte, Inc., all rights reserved. 2 | """SQL processors.""" 3 | -------------------------------------------------------------------------------- /airbyte/_executors/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Airbyte, Inc., all rights reserved. 2 | """Support for connector executors. 
This is currently a non-public API.""" 3 | -------------------------------------------------------------------------------- /airbyte/_processors/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Airbyte, Inc., all rights reserved. 2 | """Internal processors for PyAirbyte.""" 3 | 4 | from __future__ import annotations 5 | -------------------------------------------------------------------------------- /tests/integration_tests/fixtures/source-broken/source_broken/run.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Airbyte, Inc., all rights reserved. 2 | from __future__ import annotations 3 | 4 | 5 | def run(): 6 | raise Exception("Could not run") 7 | -------------------------------------------------------------------------------- /.viztracerrc: -------------------------------------------------------------------------------- 1 | ; Default config settings for viztracer 2 | ; https://viztracer.readthedocs.io/en/latest/basic_usage.html#configuration-file 3 | 4 | [default] 5 | max_stack_depth = 20 6 | unique_output_file = True 7 | output_file = viztracer_report.json 8 | tracer_entries = 5_000_000 9 | -------------------------------------------------------------------------------- /.github/mcp-ci-tests.mcp.json: -------------------------------------------------------------------------------- 1 | { 2 | "mcpServers": { 3 | "airbyte-mcp": { 4 | "type": "stdio", 5 | "command": "poetry", 6 | "args": [ 7 | "run", 8 | "airbyte-mcp" 9 | ], 10 | "env": { 11 | "GCP_GSM_CREDENTIALS": "${GCP_GSM_CREDENTIALS}", 12 | "AIRBYTE_CLOUD_MCP_SAFE_MODE": "1" 13 | } 14 | } 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /airbyte/_writers/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Airbyte, Inc., all rights reserved. 2 | """File processors.""" 3 | 4 | from __future__ import annotations 5 | 6 | from airbyte._batch_handles import BatchHandle 7 | from airbyte._writers.jsonl import FileWriterBase, JsonlWriter 8 | 9 | 10 | __all__ = [ 11 | "BatchHandle", 12 | "FileWriterBase", 13 | "JsonlWriter", 14 | ] 15 | -------------------------------------------------------------------------------- /airbyte/version.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Airbyte, Inc., all rights reserved. 2 | """Support for PyAirbyte version checks.""" 3 | 4 | from __future__ import annotations 5 | 6 | import importlib.metadata 7 | 8 | 9 | airbyte_version = importlib.metadata.version("airbyte") 10 | 11 | 12 | def get_version() -> str: 13 | """Return the version of PyAirbyte.""" 14 | return airbyte_version 15 | -------------------------------------------------------------------------------- /airbyte/shared/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Airbyte, Inc., all rights reserved. 2 | """Module for future CDK components. 3 | 4 | Components here are planned to move to the CDK. 5 | 6 | TODO!: Add GitHub link here before merging. 
7 | """ 8 | 9 | from __future__ import annotations 10 | 11 | from airbyte.shared.sql_processor import SqlProcessorBase 12 | 13 | 14 | __all__ = [ 15 | "SqlProcessorBase", 16 | ] 17 | -------------------------------------------------------------------------------- /pyrefly.toml: -------------------------------------------------------------------------------- 1 | # Pyrefly Type Checker Configuration 2 | # https://pyrefly.org/en/docs/configuration/ 3 | 4 | python-version = "3.10" 5 | project-includes = ["airbyte"] 6 | project-excludes = [ 7 | "tests/integration_tests/fixtures/source-broken/**", 8 | "tests/integration_tests/fixtures/source-test/**", 9 | "docs/**", 10 | "tests/**", 11 | ] 12 | ignore-missing-imports = [ 13 | "airbyte_protocol", 14 | "airbyte_protocol.models", 15 | "*", 16 | ] 17 | -------------------------------------------------------------------------------- /examples/run_faker_samples.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Airbyte, Inc., all rights reserved. 2 | """A simple test of PyAirbyte, using the Faker source connector. 3 | 4 | Usage (from PyAirbyte root directory): 5 | > poetry run python ./examples/run_faker_samples.py 6 | """ 7 | 8 | import airbyte as ab 9 | 10 | 11 | source = ab.get_source( 12 | "source-faker", 13 | config={"count": 200_000}, 14 | streams="*", 15 | ) 16 | 17 | # Print samples of the streams. 18 | source.print_samples() 19 | -------------------------------------------------------------------------------- /airbyte/_util/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Airbyte, Inc., all rights reserved. 2 | """Internal utility functions for dealing with pip. 3 | 4 | Note: This module is for internal use only and it should not be depended upon for production use. 5 | It is subject to change without notice. 6 | """ 7 | 8 | from __future__ import annotations 9 | 10 | from airbyte._util.pip_util import connector_pip_url, github_pip_url 11 | 12 | 13 | __all__ = [ 14 | "connector_pip_url", 15 | "github_pip_url", 16 | ] 17 | -------------------------------------------------------------------------------- /tests/pyproject.toml: -------------------------------------------------------------------------------- 1 | # This file defines lint exceptions for pytest tests and examples. 2 | # The 'tests' and 'examples' directories can both share this same exception list. 3 | 4 | [tool.ruff] 5 | preview = true 6 | 7 | [tool.ruff.lint] 8 | extend-ignore = [ 9 | "ANN201", # Type annotations not required for test functions 10 | "F841", # Allow assignments that are not referenced 11 | "SLF001", # Accessing private members ok for tests 12 | "DTZ005", # Don't require timezone-aware datetimes for tests 13 | ] 14 | -------------------------------------------------------------------------------- /examples/pyproject.toml: -------------------------------------------------------------------------------- 1 | # This file defines lint exceptions for pytest tests and examples. 2 | # The 'tests' and 'examples' directories can both share this same exception list. 
3 | 4 | [tool.ruff] 5 | preview = true 6 | 7 | [tool.ruff.lint] 8 | extend-ignore = [ 9 | "ANN201", # Type annotations not required for test functions 10 | "F841", # Allow assignments that are not referenced 11 | "SLF001", # Accessing private members ok for tests 12 | "DTZ005", # Don't require timezone-aware datetimes for tests 13 | ] 14 | -------------------------------------------------------------------------------- /airbyte/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Airbyte, Inc., all rights reserved. 2 | """PyAirbyte dataset classes.""" 3 | 4 | from __future__ import annotations 5 | 6 | from airbyte.datasets._base import DatasetBase 7 | from airbyte.datasets._lazy import LazyDataset 8 | from airbyte.datasets._map import DatasetMap 9 | from airbyte.datasets._sql import CachedDataset, SQLDataset 10 | 11 | 12 | __all__ = [ 13 | "CachedDataset", 14 | "DatasetBase", 15 | "DatasetMap", 16 | "LazyDataset", 17 | "SQLDataset", 18 | ] 19 | -------------------------------------------------------------------------------- /tests/integration_tests/fixtures/source-test/setup.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2023 Airbyte, Inc., all rights reserved. 3 | # 4 | from __future__ import annotations 5 | 6 | from setuptools import setup 7 | 8 | setup( 9 | name="airbyte-source-test", 10 | version="0.0.1", 11 | description="Test Source", 12 | author="Airbyte", 13 | author_email="contact@airbyte.io", 14 | packages=["source_test"], 15 | entry_points={ 16 | "console_scripts": [ 17 | "source-test=source_test.run:run", 18 | ], 19 | }, 20 | ) 21 | -------------------------------------------------------------------------------- /tests/integration_tests/fixtures/source-broken/setup.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2023 Airbyte, Inc., all rights reserved. 3 | # 4 | from __future__ import annotations 5 | 6 | from setuptools import setup 7 | 8 | setup( 9 | name="airbyte-source-broken", 10 | version="0.0.1", 11 | description="Test Source", 12 | author="Airbyte", 13 | author_email="contact@airbyte.io", 14 | packages=["source_broken"], 15 | entry_points={ 16 | "console_scripts": [ 17 | "source-broken=source_broken.run:run", 18 | ], 19 | }, 20 | ) 21 | -------------------------------------------------------------------------------- /airbyte/_util/venv_util.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Airbyte, Inc., all rights reserved.
2 | """Internal helper functions for working with temporary files.""" 3 | 4 | from __future__ import annotations 5 | 6 | from typing import TYPE_CHECKING 7 | 8 | from airbyte._util.meta import is_windows 9 | 10 | 11 | if TYPE_CHECKING: 12 | from pathlib import Path 13 | 14 | 15 | def get_bin_dir(venv_path: Path, /) -> Path: 16 | """Get the directory where executables are installed.""" 17 | if is_windows(): 18 | return venv_path / "Scripts" 19 | 20 | return venv_path / "bin" 21 | -------------------------------------------------------------------------------- /tests/integration_tests/fixtures/source-test/metadata.yaml: -------------------------------------------------------------------------------- 1 | data: 2 | connectorSubtype: api 3 | connectorType: source 4 | definitionId: 47f17145-fe20-4ef5-a548-e29b048adf84 5 | dockerImageTag: 0.0.0 6 | dockerRepository: airbyte/source-test 7 | githubIssueLabel: source-test 8 | name: Test 9 | releaseDate: 2023-08-25 10 | releaseStage: alpha 11 | supportLevel: community 12 | documentationUrl: https://docs.airbyte.com/integrations/sources/apify-dataset 13 | remoteRegistries: 14 | pypi: 15 | enabled: true 16 | packageName: airbyte-source-test 17 | metadataSpecVersion: "1.0" 18 | -------------------------------------------------------------------------------- /tests/integration_tests/fixtures/source-broken/metadata.yaml: -------------------------------------------------------------------------------- 1 | data: 2 | connectorSubtype: api 3 | connectorType: source 4 | definitionId: 47f17145-fe20-4ef5-a548-e29b048adf84 5 | dockerImageTag: 0.0.0 6 | dockerRepository: airbyte/source-broken 7 | githubIssueLabel: source-broken 8 | name: Test 9 | releaseDate: 2023-08-25 10 | releaseStage: alpha 11 | supportLevel: community 12 | documentationUrl: https://docs.airbyte.com/integrations/sources/apify-dataset 13 | remoteRegistries: 14 | pypi: 15 | enabled: true 16 | packageName: airbyte-source-broken 17 | metadataSpecVersion: "1.0" 18 | -------------------------------------------------------------------------------- /airbyte/caches/generic.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Airbyte, Inc., all rights reserved. 2 | """A Generic SQL Cache implementation.""" 3 | 4 | from __future__ import annotations 5 | 6 | from overrides import overrides 7 | 8 | from airbyte.caches.base import CacheBase 9 | from airbyte.secrets.base import SecretString 10 | 11 | 12 | class GenericSQLCacheConfig(CacheBase): 13 | """Allows configuring 'sql_alchemy_url' directly.""" 14 | 15 | sql_alchemy_url: SecretString | str 16 | 17 | @overrides 18 | def get_sql_alchemy_url(self) -> SecretString: 19 | """Returns a SQL Alchemy URL.""" 20 | return SecretString(self.sql_alchemy_url) 21 | -------------------------------------------------------------------------------- /examples/run_file_source.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Airbyte, Inc., all rights reserved. 2 | """A simple test of PyAirbyte, using the File source connector. 3 | 4 | Usage (from PyAirbyte root directory): 5 | > poetry run python ./examples/run_file.py 6 | 7 | No setup is needed, but you may need to delete the .venv-source-file folder 8 | if your installation gets interrupted or corrupted. 
9 | """ 10 | 11 | from __future__ import annotations 12 | 13 | import airbyte as ab 14 | 15 | 16 | source = ab.get_source( 17 | "source-file", 18 | install_if_missing=True, 19 | ) 20 | source.check() 21 | 22 | # print(list(source.get_records("pokemon"))) 23 | source.read(cache=ab.new_local_cache("poke")) 24 | -------------------------------------------------------------------------------- /airbyte/sources/registry.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Airbyte, Inc., all rights reserved. 2 | """Backwards compatibility shim for airbyte.sources.registry. 3 | 4 | This module re-exports symbols from airbyte.registry for backwards compatibility. 5 | New code should import from airbyte.registry directly. 6 | """ 7 | 8 | from __future__ import annotations 9 | 10 | from airbyte.registry import ( 11 | ConnectorMetadata, 12 | InstallType, 13 | Language, 14 | get_available_connectors, 15 | get_connector_metadata, 16 | ) 17 | 18 | 19 | __all__ = [ 20 | "ConnectorMetadata", 21 | "InstallType", 22 | "Language", 23 | "get_available_connectors", 24 | "get_connector_metadata", 25 | ] 26 | -------------------------------------------------------------------------------- /airbyte/_util/api_duck_types.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Airbyte, Inc., all rights reserved. 2 | """A set of duck-typed classes for working with the Airbyte API.""" 3 | 4 | from __future__ import annotations 5 | 6 | from typing import TYPE_CHECKING, Protocol 7 | 8 | 9 | if TYPE_CHECKING: 10 | import requests 11 | 12 | 13 | class AirbyteApiResponseDuckType(Protocol): 14 | """Used for duck-typing various Airbyte API responses.""" 15 | 16 | content_type: str 17 | r"""HTTP response content type for this operation""" 18 | status_code: int 19 | r"""HTTP response status code for this operation""" 20 | raw_response: requests.Response 21 | r"""Raw HTTP response; suitable for custom response parsing""" 22 | -------------------------------------------------------------------------------- /examples/run_test_source_single_stream.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
2 | from __future__ import annotations 3 | 4 | import os 5 | 6 | import airbyte as ab 7 | 8 | 9 | # preparation (from PyAirbyte main folder): 10 | # python -m venv .venv-source-test 11 | # source .venv-source-test/bin/activate 12 | # pip install -e ./tests/integration_tests/fixtures/source-test 13 | # In separate terminal: 14 | # poetry run python examples/run_test_source_single_stream.py 15 | 16 | os.environ["AIRBYTE_LOCAL_REGISTRY"] = ( 17 | "./tests/integration_tests/fixtures/registry.json" 18 | ) 19 | 20 | source = ab.get_source("source-test", config={"apiKey": "test"}) 21 | 22 | print(list(source.read(streams=["stream1"]))) 23 | -------------------------------------------------------------------------------- /.github/workflows/actionlint.yml: -------------------------------------------------------------------------------- 1 | name: GitHub action linting 2 | 3 | on: 4 | pull_request: 5 | paths: 6 | - '.github/workflows/**' 7 | 8 | permissions: 9 | contents: read 10 | pull-requests: write 11 | 12 | jobs: 13 | actionlint: 14 | name: actionlint 15 | runs-on: ubuntu-latest 16 | steps: 17 | - name: Checkout code 18 | uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955 # v4.3.0 19 | - name: Run actionlint 20 | uses: reviewdog/action-actionlint@95395aac8c053577d0bc67eb7b74936c660c6f66 # v1.67.0 21 | with: 22 | github_token: ${{ secrets.GITHUB_TOKEN }} 23 | reporter: github-pr-review 24 | level: error 25 | fail_on_error: true 26 | -------------------------------------------------------------------------------- /airbyte/_util/text_util.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Airbyte, Inc., all rights reserved. 2 | """Utility functions for working with text.""" 3 | 4 | from __future__ import annotations 5 | 6 | import ulid 7 | 8 | 9 | def generate_ulid() -> str: 10 | """Generate a new ULID.""" 11 | return str(ulid.ULID()) 12 | 13 | 14 | def generate_random_suffix() -> str: 15 | """Generate a random suffix for use in temporary names. 16 | 17 | By default, this function generates a ULID and returns a 9-character string 18 | which will be monotonically sortable. It is not guaranteed to be unique but 19 | is sufficient for small-scale and medium-scale use cases. 20 | """ 21 | ulid_str = generate_ulid().lower() 22 | return ulid_str[:6] + ulid_str[-3:] 23 | -------------------------------------------------------------------------------- /.github/workflows/release_drafter.yml: -------------------------------------------------------------------------------- 1 | name: Release Drafter 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | 8 | env: 9 | AIRBYTE_ANALYTICS_ID: ${{ vars.AIRBYTE_ANALYTICS_ID }} 10 | 11 | permissions: 12 | contents: read 13 | 14 | jobs: 15 | update_release_draft: 16 | permissions: 17 | contents: write 18 | pull-requests: write 19 | runs-on: ubuntu-latest 20 | steps: 21 | # Drafts the next Release notes as Pull Requests are merged into "main" 22 | - uses: release-drafter/release-drafter@b1476f6e6eb133afa41ed8589daba6dc69b4d3f5 # v6.1.0 23 | with: 24 | config-name: release-drafter.yml 25 | disable-autolabeler: false 26 | env: 27 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 28 | -------------------------------------------------------------------------------- /examples/run_pokeapi.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Airbyte, Inc., all rights reserved. 2 | """A simple test of PyAirbyte, using the PokeAPI source connector.
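Because `source_manifest=True` is passed below, the connector is expected to run from its declarative (low-code) manifest rather than from a locally installed connector package.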
3 | 4 | Usage (from PyAirbyte root directory): 5 | > poetry run python ./examples/run_pokeapi.py 6 | 7 | No setup is needed, but you may need to delete the .venv-source-pokeapi folder 8 | if your installation gets interrupted or corrupted. 9 | """ 10 | 11 | from __future__ import annotations 12 | 13 | import airbyte as ab 14 | from airbyte import get_source 15 | 16 | 17 | source = get_source( 18 | "source-pokeapi", 19 | config={"pokemon_name": "bulbasaur"}, 20 | source_manifest=True, 21 | ) 22 | source.check() 23 | 24 | # print(list(source.get_records("pokemon"))) 25 | source.read(cache=ab.new_local_cache("poke")) 26 | -------------------------------------------------------------------------------- /tests/lint_tests/test_mypy.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Airbyte, Inc., all rights reserved. 2 | from __future__ import annotations 3 | 4 | import subprocess 5 | 6 | import pytest 7 | 8 | 9 | @pytest.mark.linting 10 | def test_mypy_typing(): 11 | # Run the pyrefly check command 12 | check_result = subprocess.run( 13 | ["poetry", "run", "pyrefly", "check"], 14 | stdout=subprocess.PIPE, 15 | stderr=subprocess.PIPE, 16 | ) 17 | 18 | # Assert that the Pyrefly command exited without errors (exit code 0) 19 | assert check_result.returncode == 0, ( 20 | "Pyrefly checks failed:\n" 21 | + f"{check_result.stdout.decode()}\n{check_result.stderr.decode()}\n\n" 22 | + "Run `poetry run pyrefly check` to see all failures." 23 | ) 24 | -------------------------------------------------------------------------------- /airbyte/_util/connector_info.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Airbyte, Inc., all rights reserved. 2 | """Connector info classes for PyAirbyte. 3 | 4 | Used for telemetry and logging. 5 | """ 6 | 7 | from __future__ import annotations 8 | 9 | from dataclasses import asdict, dataclass 10 | from typing import Any 11 | 12 | 13 | @dataclass 14 | class RuntimeInfoBase: 15 | def to_dict(self) -> dict[str, Any]: 16 | return {k: v for k, v in asdict(self).items() if v is not None} 17 | 18 | 19 | @dataclass 20 | class WriterRuntimeInfo(RuntimeInfoBase): 21 | type: str 22 | config_hash: str | None = None 23 | 24 | 25 | @dataclass(kw_only=True) 26 | class ConnectorRuntimeInfo(RuntimeInfoBase): 27 | name: str 28 | executor_type: str | None = None 29 | version: str | None = None 30 | config_hash: str | None = None 31 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | # To get started with Dependabot version updates, you'll need to specify which 2 | # package ecosystems to update and where the package manifests are located. 3 | # Please see the documentation for all configuration options: 4 | # https://docs.github.com/code-security/dependabot/dependabot-version-updates/configuration-options-for-the-dependabot.yml-file 5 | 6 | version: 2 7 | updates: 8 | - package-ecosystem: "pip" 9 | directory: "/" # Location of package manifests 10 | schedule: 11 | interval: "weekly" 12 | 13 | - package-ecosystem: "github-actions" 14 | # Workflow files stored in the default location of `.github/workflows`. (You don't need to specify `/.github/workflows` for `directory`. You can use `directory: "/"`.) 
15 | directory: "/" 16 | schedule: 17 | interval: "weekly" 18 | -------------------------------------------------------------------------------- /airbyte/cloud/constants.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Airbyte, Inc., all rights reserved. 2 | """Useful constants for working with Airbyte Cloud features in PyAirbyte.""" 3 | 4 | from __future__ import annotations 5 | 6 | from airbyte._util.api_imports import JobStatusEnum 7 | 8 | 9 | FINAL_STATUSES: set[JobStatusEnum] = { 10 | JobStatusEnum.SUCCEEDED, 11 | JobStatusEnum.FAILED, 12 | JobStatusEnum.CANCELLED, 13 | } 14 | """The set of `.JobStatusEnum` strings that indicate a sync job has completed.""" 15 | 16 | FAILED_STATUSES: set[JobStatusEnum] = { 17 | JobStatusEnum.FAILED, 18 | JobStatusEnum.CANCELLED, 19 | } 20 | """The set of `.JobStatusEnum` strings that indicate a sync job has failed.""" 21 | 22 | READABLE_DESTINATION_TYPES: set[str] = { 23 | "bigquery", 24 | "snowflake", 25 | } 26 | """List of Airbyte Cloud destinations that PyAirbyte is able to read from.""" 27 | -------------------------------------------------------------------------------- /tests/docs_tests/test_docs_checked_in.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Airbyte, Inc., all rights reserved. 2 | 3 | import os 4 | 5 | import docs.generate as generate 6 | import pytest 7 | 8 | 9 | @pytest.mark.filterwarnings("ignore") 10 | def test_docs_generation(): 11 | """ 12 | Docs need to be able to be generated via `poetry run poe docs-generate`. 13 | 14 | This test runs the docs generation and ensures that it can complete successfully. 15 | 16 | Generation often produces warnings that are not relevant, so we suppress warnings in this test. 17 | """ 18 | 19 | generate.run() 20 | 21 | # compare the generated docs with the checked in docs 22 | diff = os.system("git diff --exit-code docs/generated") 23 | 24 | # if there is a diff, fail the test 25 | assert diff == 0, ( 26 | "Docs are out of date. Please run `poetry run poe docs-generate` and commit the changes." 27 | ) 28 | -------------------------------------------------------------------------------- /airbyte/secrets/prompt.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Airbyte, Inc., all rights reserved. 2 | """Secret manager that prompts the user to enter a secret.""" 3 | 4 | from __future__ import annotations 5 | 6 | import contextlib 7 | from getpass import getpass 8 | 9 | from airbyte.secrets.base import SecretManager, SecretSourceEnum, SecretString 10 | 11 | 12 | class SecretsPrompt(SecretManager): 13 | """Secret manager that prompts the user to enter a secret.""" 14 | 15 | name = SecretSourceEnum.PROMPT.value 16 | 17 | def get_secret( 18 | self, 19 | secret_name: str, 20 | ) -> SecretString | None: 21 | """Prompt the user to enter a secret. 22 | 23 | As a security measure, the secret is not echoed to the terminal when typed. 24 | """ 25 | with contextlib.suppress(Exception): 26 | return SecretString(getpass(f"Enter the value for secret '{secret_name}': ")) 27 | 28 | return None 29 | -------------------------------------------------------------------------------- /airbyte/datasets/_map.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Airbyte, Inc., all rights reserved. 2 | 3 | """A generic interface for a set of streams. 4 | 5 | TODO: This is a work in progress. 
It is not yet used by any other code. 6 | TODO: Implement before release, or delete. 7 | """ 8 | 9 | from __future__ import annotations 10 | 11 | from collections.abc import Iterator, Mapping 12 | from typing import TYPE_CHECKING 13 | 14 | 15 | if TYPE_CHECKING: 16 | from airbyte.datasets._base import DatasetBase 17 | 18 | 19 | class DatasetMap(Mapping): 20 | """A generic interface for a set of streams or datasets.""" 21 | 22 | def __init__(self) -> None: 23 | self._datasets: dict[str, DatasetBase] = {} 24 | 25 | def __getitem__(self, key: str) -> DatasetBase: 26 | return self._datasets[key] 27 | 28 | def __iter__(self) -> Iterator[str]: 29 | return iter(self._datasets) 30 | 31 | def __len__(self) -> int: 32 | return len(self._datasets) 33 | -------------------------------------------------------------------------------- /tests/integration_tests/test_bigquery_cache.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Airbyte, Inc., all rights reserved. 2 | """Unit tests specific to BigQuery caches.""" 3 | 4 | from __future__ import annotations 5 | 6 | import pytest 7 | 8 | import airbyte as ab 9 | 10 | 11 | @pytest.mark.requires_creds 12 | def test_bigquery_props( 13 | new_bigquery_cache: ab.BigQueryCache, 14 | ) -> None: 15 | """Test that the BigQueryCache properties are set correctly.""" 16 | # assert new_bigquery_cache.credentials_path.endswith(".json") 17 | assert new_bigquery_cache.dataset_name == new_bigquery_cache.schema_name, ( 18 | "Dataset name should be the same as schema name." 19 | ) 20 | assert new_bigquery_cache.schema_name != "airbyte_raw", ( 21 | "Schema name should not be the default value." 22 | ) 23 | 24 | assert new_bigquery_cache.get_database_name() == new_bigquery_cache.project_name, ( 25 | "Database name should be the same as project name." 26 | ) 27 | -------------------------------------------------------------------------------- /examples/run_test_source.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Airbyte, Inc., all rights reserved. 2 | from __future__ import annotations 3 | 4 | import os 5 | 6 | import airbyte as ab 7 | 8 | # preparation (from PyAirbyte main folder): 9 | # python -m venv .venv-source-test 10 | # source .venv-source-test/bin/activate 11 | # pip install -e ./tests/integration_tests/fixtures/source-test 12 | # In separate terminal: 13 | # poetry run python examples/run_test_source.py 14 | 15 | os.environ["AIRBYTE_LOCAL_REGISTRY"] = ( 16 | "./tests/integration_tests/fixtures/registry.json" 17 | ) 18 | 19 | source = ab.get_source("source-test", config={"apiKey": "test"}) 20 | cache = ab.new_local_cache("cache_test") 21 | 22 | source.check() 23 | 24 | print(source.get_available_streams()) 25 | 26 | result = source.read(cache) 27 | 28 | print(result.processed_records) 29 | print(list(result["stream1"])) 30 | 31 | different_cache = ab.new_local_cache("cache_test") 32 | print(list(different_cache["stream1"])) 33 | -------------------------------------------------------------------------------- /examples/run_spacex.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
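2 | """A simple test of PyAirbyte, using the SpaceX API source connector. Usage (from PyAirbyte root directory): > poetry run python ./examples/run_spacex.py """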
2 | from __future__ import annotations 3 | 4 | from itertools import islice 5 | 6 | import airbyte as ab 7 | 8 | 9 | # preparation (from PyAirbyte main folder): 10 | # python -m venv .venv-source-spacex-api 11 | # source .venv-source-spacex-api/bin/activate 12 | # pip install -e ../airbyte-integrations/connectors/source-spacex-api 13 | # In separate terminal: 14 | # poetry run python examples/run_spacex.py 15 | 16 | source = ab.get_source( 17 | "source-spacex-api", 18 | config={"id": "605b4b6aaa5433645e37d03f"}, 19 | install_if_missing=True, 20 | ) 21 | cache = ab.new_local_cache() 22 | 23 | source.check() 24 | 25 | source.select_streams(["launches", "rockets", "capsules"]) 26 | 27 | result = source.read(cache) 28 | 29 | print(list(islice(source.get_records("capsules"), 10))) 30 | 31 | for name, records in result.cache.streams.items(): 32 | print(f"Stream {name}: {len(list(records))} records") 33 | -------------------------------------------------------------------------------- /.github/release-drafter.yml: -------------------------------------------------------------------------------- 1 | name-template: 'v$RESOLVED_VERSION' 2 | tag-template: 'v$RESOLVED_VERSION' 3 | categories: 4 | - title: '🚀 Features' 5 | labels: 6 | - 'feature' 7 | - 'enhancement' 8 | - title: '🐛 Bug Fixes' 9 | labels: 10 | - 'fix' 11 | - 'bugfix' 12 | - 'bug' 13 | - title: '🧰 Maintenance' 14 | label: 'chore' 15 | change-template: '- $TITLE @$AUTHOR (#$NUMBER)' 16 | change-title-escapes: '\<*_&' # You can add # and @ to disable mentions, and add ` to disable code blocks. 17 | version-resolver: 18 | major: 19 | labels: 20 | - 'major' 21 | minor: 22 | labels: 23 | - 'minor' 24 | patch: 25 | labels: 26 | - 'patch' 27 | default: patch 28 | template: | 29 | ## Changes 30 | 31 | $CHANGES 32 | autolabeler: 33 | - label: 'chore' 34 | title: 35 | - '/chore\:/i' 36 | - label: 'bug' 37 | title: 38 | - '/fix\:/i' 39 | - label: 'enhancement' 40 | title: 41 | - '/feature/i' 42 | -------------------------------------------------------------------------------- /.github/workflows/label-community-prs.yml: -------------------------------------------------------------------------------- 1 | name: Label Community PRs 2 | 3 | # This workflow automatically adds the "community" label to PRs from forks. 4 | # This enables automatic tracking on the Community PRs project board. 5 | 6 | on: 7 | pull_request_target: 8 | types: 9 | - opened 10 | - reopened 11 | 12 | jobs: 13 | label-community-pr: 14 | name: Add "Community" Label to PR 15 | # Only run for PRs from forks 16 | if: github.event.pull_request.head.repo.fork == true 17 | runs-on: ubuntu-24.04 18 | permissions: 19 | issues: write 20 | pull-requests: write 21 | steps: 22 | - name: Add community label 23 | # This action uses GitHub's addLabels API, which is idempotent. 24 | # If the label already exists, the API call succeeds without error. 25 | uses: actions-ecosystem/action-add-labels@bd52874380e3909a1ac983768df6976535ece7f8 # v1.1.0 26 | with: 27 | github_token: ${{ secrets.GITHUB_TOKEN }} 28 | labels: community 29 | -------------------------------------------------------------------------------- /airbyte/experimental/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Airbyte, Inc., all rights reserved. 2 | """Experimental features which may change.
3 | 4 | > **NOTE:** 5 | > The following "experimental" features are now "stable" and can be accessed directly from the 6 | `airbyte.get_source()` method: 7 | > - Docker sources, using the `docker_image` argument. 8 | > - Yaml sources, using the `source_manifest` argument. 9 | 10 | ## About Experimental Features 11 | 12 | Experimental features may change without notice between minor versions of PyAirbyte. Although rare, 13 | they may also be entirely removed or refactored in future versions of PyAirbyte. Experimental 14 | features may also be less stable than other features, and may not be as well-tested. 15 | 16 | You can help improve this product by reporting issues and providing feedback for improvements in our 17 | [GitHub issue tracker](https://github.com/airbytehq/pyairbyte/issues). 18 | """ 19 | 20 | from __future__ import annotations 21 | 22 | from airbyte.sources.util import get_source 23 | 24 | 25 | __all__ = [ 26 | "get_source", 27 | ] 28 | -------------------------------------------------------------------------------- /examples/run_get_documents_from_github.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Airbyte, Inc., all rights reserved. 2 | """This examples script demonstrates how to render documents from a source.""" 3 | 4 | from __future__ import annotations 5 | 6 | import airbyte as ab 7 | import rich 8 | 9 | 10 | def main() -> None: 11 | read_result = ab.get_source( 12 | "source-github", 13 | config={ 14 | "repositories": ["airbytehq/quickstarts"], 15 | "credentials": { 16 | "personal_access_token": ab.get_secret("GITHUB_PERSONAL_ACCESS_TOKEN") 17 | }, 18 | }, 19 | streams=["issues"], 20 | ).read() 21 | 22 | for doc in read_result["issues"].to_documents( 23 | title_property="title", 24 | content_properties=["body"], 25 | metadata_properties=["state", "url", "number"], 26 | # primary_key_properties=["id"], 27 | # cursor_property="updated_at", 28 | render_metadata=True, 29 | ): 30 | rich.print(rich.markdown.Markdown(str(doc) + "\n\n" + str("-" * 40))) 31 | 32 | 33 | if __name__ == "__main__": 34 | main() 35 | -------------------------------------------------------------------------------- /.github/workflows/pydoc_preview.yml: -------------------------------------------------------------------------------- 1 | name: Generate Docs 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | pull_request: {} 8 | 9 | env: 10 | AIRBYTE_ANALYTICS_ID: ${{ vars.AIRBYTE_ANALYTICS_ID }} 11 | 12 | jobs: 13 | preview_docs: 14 | runs-on: ubuntu-latest 15 | 16 | steps: 17 | - name: Checkout code 18 | uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 19 | - name: Set up Python 20 | uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 21 | with: 22 | python-version: '3.10' 23 | - name: Set up Poetry 24 | uses: Gr1N/setup-poetry@48b0f77c8c1b1b19cb962f0f00dff7b4be8f81ec # v9 25 | with: 26 | poetry-version: "2.2.0" 27 | 28 | - name: Install dependencies 29 | run: poetry install 30 | 31 | - name: Generate documentation 32 | run: | 33 | poetry run poe docs-generate 34 | 35 | - name: Upload artifact 36 | uses: actions/upload-pages-artifact@7b1f4a764d45c48632c6b24a0339c27f5614fb0b # v4.0.0 37 | with: 38 | # Upload entire repository 39 | path: 'docs/generated' 40 | -------------------------------------------------------------------------------- /.github/workflows/poe-command.yml: -------------------------------------------------------------------------------- 1 | name: On-Demand Poe Task 2 | 3 | on: 
4 | workflow_dispatch: 5 | inputs: 6 | comment-id: 7 | description: "Optional comment-id of the slash command. Ignore if not applicable." 8 | required: false 9 | pr: 10 | description: "PR Number" 11 | required: false 12 | 13 | permissions: 14 | contents: write 15 | pull-requests: write 16 | 17 | jobs: 18 | run-poe-command: 19 | runs-on: ubuntu-latest 20 | steps: 21 | - name: Authenticate as GitHub App 22 | uses: actions/create-github-app-token@v2 23 | id: get-app-token 24 | with: 25 | owner: "airbytehq" 26 | repositories: "PyAirbyte" 27 | app-id: ${{ secrets.OCTAVIA_BOT_APP_ID }} 28 | private-key: ${{ secrets.OCTAVIA_BOT_PRIVATE_KEY }} 29 | 30 | - name: Run Poe Slash Command Processor 31 | uses: aaronsteers/poe-command-processor@v1.3.3 32 | with: 33 | pr: ${{ github.event.inputs.pr }} 34 | comment-id: ${{ github.event.inputs.comment-id }} 35 | github-token: ${{ steps.get-app-token.outputs.token }} 36 | -------------------------------------------------------------------------------- /examples/run_faker.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Airbyte, Inc., all rights reserved. 2 | """A simple test of PyAirbyte, using the Faker source connector. 3 | 4 | Usage (from PyAirbyte root directory): 5 | > poetry run python ./examples/run_faker.py 6 | 7 | No setup is needed, but you may need to delete the .venv-source-faker folder 8 | if your installation gets interrupted or corrupted. 9 | """ 10 | 11 | from __future__ import annotations 12 | 13 | import airbyte as ab 14 | 15 | SCALE = 200_000 # Number of records to generate between users and purchases. 16 | FORCE_FULL_REFRESH = True # Whether to force a full refresh on the source. 17 | 18 | 19 | cache = ab.get_default_cache() 20 | source = ab.get_source( 21 | "source-faker", 22 | config={"count": SCALE / 2}, 23 | install_if_missing=True, 24 | ) 25 | source.check() 26 | source.select_streams(["products", "users", "purchases"]) 27 | 28 | result = source.read( 29 | cache=cache, 30 | force_full_refresh=FORCE_FULL_REFRESH, 31 | ) 32 | 33 | print("Read complete. Validating results...") 34 | for name, records in result.streams.items(): 35 | print(f"Stream {name}: {len(records)} records") 36 | -------------------------------------------------------------------------------- /airbyte/sources/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Airbyte, Inc., all rights reserved. 
2 | """Sources connectors module for PyAirbyte.""" 3 | 4 | from __future__ import annotations 5 | 6 | from typing import TYPE_CHECKING 7 | 8 | from airbyte.registry import ( 9 | ConnectorMetadata, 10 | get_available_connectors, 11 | get_connector_metadata, 12 | ) 13 | from airbyte.sources.base import Source 14 | from airbyte.sources.util import ( 15 | get_benchmark_source, 16 | get_source, 17 | ) 18 | 19 | 20 | # Submodules imported here for documentation reasons: https://github.com/mitmproxy/pdoc/issues/757 21 | if TYPE_CHECKING: 22 | # ruff: noqa: TC004 # imports used for more than type checking 23 | from airbyte.sources import ( 24 | base, 25 | registry, 26 | util, 27 | ) 28 | 29 | __all__ = [ 30 | # Submodules 31 | "base", 32 | "registry", 33 | "util", 34 | # Factories 35 | "get_source", 36 | "get_benchmark_source", 37 | # Helper Functions 38 | "get_available_connectors", 39 | "get_connector_metadata", 40 | # Classes 41 | "Source", 42 | "ConnectorMetadata", 43 | ] 44 | -------------------------------------------------------------------------------- /tests/integration_tests/test_install.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Airbyte, Inc., all rights reserved. 2 | from __future__ import annotations 3 | 4 | import pytest 5 | 6 | from airbyte.sources.util import get_source 7 | from airbyte import exceptions as exc 8 | 9 | 10 | def test_install_failure_log_pypi(): 11 | """Test that the install log is created and contains the expected content.""" 12 | with pytest.raises(exc.AirbyteConnectorNotRegisteredError): 13 | source = get_source("source-not-found") 14 | 15 | with pytest.raises(exc.AirbyteConnectorInstallationError) as exc_info: 16 | source = get_source( 17 | "source-not-found", 18 | pip_url="https://pypi.org/project/airbyte-not-found", 19 | install_if_missing=True, 20 | ) 21 | 22 | # Check that the stderr log contains the expected content from a failed install (pip or uv) 23 | err_msg = str(exc_info.value.__cause__.log_text) 24 | assert any([ 25 | "Cannot unpack file" in err_msg, 26 | "Could not install requirement" in err_msg, 27 | "Failed to parse" in err_msg, 28 | "Expected direct URL" in err_msg, 29 | ]) 30 | -------------------------------------------------------------------------------- /docs/templates/theme.css: -------------------------------------------------------------------------------- 1 | /* pdoc color scheme - Airbyte branded version */ 2 | :root { 3 | --pdoc-background: #fff; 4 | } 5 | 6 | .pdoc { 7 | /* Airbyte purple color scheme */ 8 | --text: hsl(240, 19%, 18%); 9 | --muted: #6c757d; 10 | --link: #615eff; 11 | --link-hover: #3f3bff; 12 | --code: hsl(240, 100%, 98%); 13 | --active: #fff598; 14 | 15 | --accent: hsl(240, 25%, 98%); 16 | --accent2: #c1c1c1; 17 | 18 | --nav-hover: rgba(97, 94, 255, 0.1); 19 | --name: #615eff; 20 | --def: #3f3bff; 21 | --annotation: #007020; 22 | } 23 | 24 | /* Dark mode theme */ 25 | @media (prefers-color-scheme: dark) { 26 | :root { 27 | --pdoc-background: #212529; 28 | } 29 | 30 | .pdoc { 31 | --text: hsl(240, 10%, 90%); 32 | --muted: #9d9d9d; 33 | --link: #9492ff; 34 | --link-hover: #c8c7ff; 35 | --code: hsl(252, 25%, 18%); 36 | --active: #555; 37 | 38 | --accent: hsl(240, 14%, 14%); 39 | --accent2: #555; 40 | 41 | --nav-hover: rgba(0, 0, 0, 0.1); 42 | --name: #9492ff; 43 | --def: #8381ff; 44 | --annotation: #00c037; 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /airbyte/_util/hashing.py: 
-------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Airbyte, Inc., all rights reserved. 2 | """Hashing utils for PyAirbyte.""" 3 | 4 | from __future__ import annotations 5 | 6 | import hashlib 7 | from collections.abc import Mapping 8 | 9 | 10 | HASH_SEED = "PyAirbyte:" 11 | """Additional seed for randomizing one-way hashed strings.""" 12 | 13 | 14 | def one_way_hash( 15 | obj: Mapping | list | object, 16 | /, 17 | ) -> str: 18 | """Return a one-way hash of the given object. 19 | 20 | To ensure a unique domain of hashes, we prepend a seed to the string before hashing. 21 | """ 22 | string_to_hash: str 23 | if isinstance(obj, Mapping): 24 | # Recursively sort and convert nested dictionaries to tuples of key-value pairs 25 | string_to_hash = str(sorted((k, one_way_hash(v)) for k, v in obj.items())) 26 | 27 | elif isinstance(obj, list): 28 | # Recursively hash elements of the list 29 | string_to_hash = str([one_way_hash(item) for item in obj]) 30 | 31 | else: 32 | # Convert the object to a string 33 | string_to_hash = str(obj) 34 | 35 | return hashlib.sha256((HASH_SEED + str(string_to_hash)).encode()).hexdigest() 36 | -------------------------------------------------------------------------------- /examples/run_github.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Airbyte, Inc., all rights reserved. 2 | """A simple test of PyAirbyte, using the GitHub source connector. 3 | 4 | Usage (from PyAirbyte root directory): 5 | > poetry run python ./examples/run_github.py 6 | 7 | No setup is needed, but you may need to delete the .venv-source-github folder 8 | if your installation gets interrupted or corrupted. 9 | """ 10 | 11 | from __future__ import annotations 12 | 13 | import airbyte as ab 14 | 15 | 16 | # Create a token here: https://github.com/settings/tokens 17 | GITHUB_TOKEN = ab.get_secret("GITHUB_PERSONAL_ACCESS_TOKEN") 18 | 19 | 20 | source = ab.get_source("source-github") 21 | source.set_config({ 22 | "repositories": ["airbytehq/airbyte-lib-private-beta"], 23 | "credentials": {"personal_access_token": GITHUB_TOKEN}, 24 | }) 25 | source.check() 26 | source.select_streams([ 27 | "issues", 28 | "pull_requests", 29 | "commits", 30 | "collaborators", 31 | "deployments", 32 | ]) 33 | 34 | result = source.read(cache=ab.new_local_cache("github")) 35 | print(result.processed_records) 36 | 37 | for name, records in result.streams.items(): 38 | print(f"Stream {name}: {len(records)} records") 39 | -------------------------------------------------------------------------------- /tests/unit_tests/test_exceptions.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
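# Verifies that every exception class exposed by `airbyte.exceptions` can be instantiated and follows the project's naming and message conventions asserted below.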
2 | from __future__ import annotations 3 | 4 | import inspect 5 | import pytest 6 | import airbyte.exceptions as exceptions_module 7 | 8 | 9 | def test_exceptions(): 10 | exception_classes = [ 11 | (name, obj) 12 | for name, obj in inspect.getmembers(exceptions_module) 13 | if inspect.isclass(obj) and name.endswith("Error") 14 | ] 15 | assert "AirbyteError" in [name for name, _ in exception_classes] 16 | assert "NotAnError" not in [name for name, _ in exception_classes] 17 | for name, obj in exception_classes: 18 | instance = obj() 19 | message = instance.get_message() 20 | assert isinstance(message, str), "No message for class: " + name 21 | assert message.count("\n") == 0 22 | assert message != "" 23 | assert message.strip() == message 24 | assert any([name.startswith(prefix) for prefix in ["Airbyte", "PyAirbyte"]]), ( 25 | f"{name} does not start with Airbyte or PyAirbyte" 26 | ) 27 | assert name.endswith("Error") 28 | 29 | 30 | if __name__ == "__main__": 31 | pytest.main() 32 | -------------------------------------------------------------------------------- /examples/run_github_samples.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Airbyte, Inc., all rights reserved. 2 | """A simple test of PyAirbyte, using the GitHub source connector. 3 | 4 | Usage (from PyAirbyte root directory): 5 | > poetry run python ./examples/run_github_samples.py 6 | 7 | No setup is needed, but you may need to delete the .venv-source-github folder 8 | if your installation gets interrupted or corrupted. 9 | """ 10 | 11 | from __future__ import annotations 12 | 13 | import airbyte as ab 14 | 15 | 16 | # Create a token here: https://github.com/settings/tokens 17 | GITHUB_TOKEN = ab.get_secret("GITHUB_PERSONAL_ACCESS_TOKEN") 18 | 19 | FAILING_STREAMS = [ 20 | "collaborators", 21 | "issue_timeline_events", # key error: 'converted_to_discussion' 22 | "projects_v2", 23 | "team_members", 24 | "team_memberships", 25 | "teams", 26 | ] 27 | 28 | source = ab.get_source( 29 | "source-github", 30 | config={ 31 | "repositories": ["airbytehq/airbyte-lib-private-beta"], 32 | "credentials": {"personal_access_token": GITHUB_TOKEN}, 33 | }, 34 | streams="*", 35 | ) 36 | 37 | streams = list(set(source.get_available_streams()) - set(FAILING_STREAMS)) 38 | 39 | source.print_samples(streams=streams) 40 | -------------------------------------------------------------------------------- /.github/workflows/auto_merge_notification.yml: -------------------------------------------------------------------------------- 1 | # When a PR has the auto-merge feature enabled or disabled, this workflow adds or removes 2 | # warning text at the bottom of the PR description.
3 | 4 | name: "Add Auto-Merge Notification Text" 5 | on: 6 | pull_request: 7 | types: [auto_merge_enabled, auto_merge_disabled] 8 | 9 | jobs: 10 | update-description: 11 | runs-on: ubuntu-latest 12 | permissions: 13 | pull-requests: write 14 | steps: 15 | - name: Add Auto-Merge Notice 16 | if: github.event.action == 'auto_merge_enabled' 17 | uses: bcgov/action-pr-description-add@14338bfe0278ead273b3c1189e5aa286ff6709c4 # v2.0.0 18 | with: 19 | add_markdown: | 20 | > [!IMPORTANT] 21 | > **Auto-merge enabled.** 22 | > 23 | > _This PR is set to merge automatically when all requirements are met._ 24 | 25 | - name: Remove Auto-Merge Notice 26 | if: github.event.action == 'auto_merge_disabled' 27 | uses: bcgov/action-pr-description-add@14338bfe0278ead273b3c1189e5aa286ff6709c4 # v2.0.0 28 | with: 29 | add_markdown: | 30 | > [!NOTE] 31 | > **Auto-merge may have been disabled. Please check the PR status to confirm.** 32 | -------------------------------------------------------------------------------- /docs/generate.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | # Copyright (c) 2023 Airbyte, Inc., all rights reserved. 4 | """Generate docs for all public modules in PyAirbyte and save them to docs/generated. 5 | 6 | Usage: 7 | poetry run python docs/generate.py 8 | 9 | """ 10 | 11 | from __future__ import annotations 12 | 13 | import pathlib 14 | import shutil 15 | 16 | import pdoc 17 | 18 | 19 | def run() -> None: 20 | """Generate docs for all public modules in PyAirbyte and save them to docs/generated.""" 21 | public_modules = ["airbyte", "airbyte/cli.py"] 22 | 23 | # recursively delete the docs/generated folder if it exists 24 | if pathlib.Path("docs/generated").exists(): 25 | shutil.rmtree("docs/generated") 26 | 27 | pdoc.render.configure( 28 | template_directory=pathlib.Path("docs/templates"), 29 | show_source=True, 30 | search=True, 31 | logo="https://docs.airbyte.com/img/pyairbyte-logo-dark.png", 32 | favicon="https://docs.airbyte.com/img/favicon.png", 33 | mermaid=True, 34 | docformat="google", 35 | ) 36 | pdoc.pdoc( 37 | *public_modules, 38 | output_directory=pathlib.Path("docs/generated"), 39 | ) 40 | 41 | 42 | if __name__ == "__main__": 43 | run() 44 | -------------------------------------------------------------------------------- /examples/run_faker_to_motherduck.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Airbyte, Inc., all rights reserved. 2 | """A sample execution script which loads data from `source-faker` to a MotherDuck-backed cache. 3 | 4 | Usage (from repo root): 5 | poetry install 6 | poetry run python examples/run_faker_to_motherduck.py 7 | """ 8 | 9 | from __future__ import annotations 10 | 11 | import airbyte as ab 12 | from airbyte.caches import MotherDuckCache 13 | 14 | 15 | MOTHERDUCK_API_KEY = ab.get_secret("MOTHERDUCK_API_KEY") 16 | """This is the API key for the MotherDuck service. 17 | 18 | It can be auto-detected in env vars and/or a .env file in the root of the project. 19 | 20 | If will be prompted (and masked during input) if not found in either location. 
21 | """ 22 | 23 | 24 | source = ab.get_source( 25 | "source-faker", 26 | config={"count": 10000, "seed": 0, "parallelism": 1, "always_updated": False}, 27 | install_if_missing=True, 28 | ) 29 | source.check() 30 | source.select_all_streams() 31 | 32 | cache = MotherDuckCache( 33 | database="airbyte_test", 34 | schema_name="faker_data", 35 | api_key=MOTHERDUCK_API_KEY, 36 | ) 37 | 38 | result = source.read(cache=cache, force_full_refresh=True) 39 | 40 | for name, records in result.streams.items(): 41 | print(f"Stream {name}: {len(records)} records") 42 | -------------------------------------------------------------------------------- /airbyte/caches/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Airbyte, Inc., all rights reserved. 2 | """Base module for all caches.""" 3 | 4 | from __future__ import annotations 5 | 6 | from typing import TYPE_CHECKING 7 | 8 | from airbyte.caches.base import CacheBase 9 | from airbyte.caches.bigquery import BigQueryCache 10 | from airbyte.caches.duckdb import DuckDBCache 11 | from airbyte.caches.motherduck import MotherDuckCache 12 | from airbyte.caches.postgres import PostgresCache 13 | from airbyte.caches.snowflake import SnowflakeCache 14 | from airbyte.caches.util import get_default_cache, new_local_cache 15 | 16 | 17 | # Submodules imported here for documentation reasons: https://github.com/mitmproxy/pdoc/issues/757 18 | if TYPE_CHECKING: 19 | # ruff: noqa: TC004 20 | from airbyte.caches import base, bigquery, duckdb, motherduck, postgres, snowflake, util 21 | 22 | # We export these classes for easy access: `airbyte.caches...` 23 | __all__ = [ 24 | # Factories 25 | "get_default_cache", 26 | "new_local_cache", 27 | # Classes 28 | "BigQueryCache", 29 | "CacheBase", 30 | "DuckDBCache", 31 | "MotherDuckCache", 32 | "PostgresCache", 33 | "SnowflakeCache", 34 | # Submodules, 35 | "util", 36 | "bigquery", 37 | "duckdb", 38 | "motherduck", 39 | "postgres", 40 | "snowflake", 41 | "base", 42 | ] 43 | -------------------------------------------------------------------------------- /.github/pr-welcome-internal.md: -------------------------------------------------------------------------------- 1 | ## 👋 Greetings, Airbyte Team Member! 2 | 3 | Here are some helpful tips and reminders for your convenience. 4 | 5 | ### Testing This PyAirbyte Version 6 | 7 | You can test this version of PyAirbyte using the following: 8 | 9 | ```bash 10 | # Run PyAirbyte CLI from this branch: 11 | uvx --from 'git+https://github.com/airbytehq/PyAirbyte.git@{{ .branch_name }}' pyairbyte --help 12 | 13 | # Install PyAirbyte from this branch for development: 14 | pip install 'git+https://github.com/airbytehq/PyAirbyte.git@{{ .branch_name }}' 15 | ``` 16 | 17 | ### Helpful Resources 18 | 19 | - [PyAirbyte Documentation](https://docs.airbyte.com/using-airbyte/pyairbyte/getting-started) 20 | - [API Reference](https://airbytehq.github.io/PyAirbyte/) 21 | 22 | ### PR Slash Commands 23 | 24 | Airbyte Maintainers can execute the following slash commands on your PR: 25 | 26 | - `/fix-pr` - Fixes most formatting and linting issues 27 | - `/poetry-lock` - Updates poetry.lock file 28 | - `/test-pr` - Runs tests with the updated PyAirbyte 29 | - `/prerelease` - Builds and publishes a prerelease version to PyPI 30 | 31 | ### Community Support 32 | 33 | Questions? Join the [#pyairbyte channel](https://airbytehq.slack.com/archives/C06FZ238P8W) in our Slack workspace. 
34 |
35 | [📝 _Edit this welcome message._](https://github.com/airbytehq/PyAirbyte/blob/main/.github/pr-welcome-internal.md)
36 |
-------------------------------------------------------------------------------- /.github/workflows/autofix.yml: --------------------------------------------------------------------------------
1 | name: Auto-fix when '/autofix' Slash Command is used
2 |
3 | on:
4 |   workflow_dispatch: {}
5 |   repository_dispatch:
6 |     types: [autofix-command]
7 |
8 | env:
9 |   AIRBYTE_ANALYTICS_ID: ${{ vars.AIRBYTE_ANALYTICS_ID }}
10 |
11 | jobs:
12 |   python-autofix:
13 |     runs-on: ubuntu-latest
14 |     steps:
15 |       - name: Checkout code
16 |         uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
17 |       - name: Set up Python
18 |         uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0
19 |         with:
20 |           python-version: "3.10" # Quoted so YAML doesn't parse 3.10 as the float 3.1.
21 |       - name: Set up Poetry
22 |         uses: Gr1N/setup-poetry@48b0f77c8c1b1b19cb962f0f00dff7b4be8f81ec # v9
23 |         with:
24 |           poetry-version: "2.2.0"
25 |
26 |       - name: Install dependencies
27 |         run: poetry install
28 |
29 |       - name: Format code
30 |         run: poetry run ruff format .
31 |
32 |       - name: Commit changes
33 |         run: |
34 |           git config --global user.name "Airbyte Automation Bot"
35 |           git config --global user.email "no-reply@airbyte.io"
36 |           git add .
37 |           git diff-index --quiet HEAD || git commit -m "Format code with ruff format"
38 |
39 |       - name: Push changes
40 |         uses: ad-m/github-push-action@77c5b412c50b723d2a4fbc6d71fb5723bcd439aa # v1.0.0
41 |         with:
42 |           github_token: ${{ secrets.GITHUB_TOKEN }}
43 |
-------------------------------------------------------------------------------- /airbyte/secrets/google_colab.py: --------------------------------------------------------------------------------
1 | # Copyright (c) 2024 Airbyte, Inc., all rights reserved.
2 | """Secrets manager for Google Colab user secrets."""
3 |
4 | from __future__ import annotations
5 |
6 | from airbyte.secrets.base import SecretManager, SecretSourceEnum, SecretString
7 |
8 |
9 | class ColabSecretManager(SecretManager):
10 |     """Secret manager that retrieves secrets from Google Colab user secrets."""
11 |
12 |     name = SecretSourceEnum.GOOGLE_COLAB.value
13 |
14 |     def __init__(self) -> None:
15 |         """Initialize the Google Colab secret manager."""
16 |         try:
17 |             from google.colab import (  # pyright: ignore[reportMissingImports] # noqa: PLC0415
18 |                 userdata as colab_userdata,
19 |             )
20 |
21 |             self.colab_userdata = colab_userdata
22 |         except ImportError:
23 |             self.colab_userdata = None
24 |
25 |         super().__init__()
26 |
27 |     def get_secret(self, secret_name: str) -> SecretString | None:
28 |         """Get a named secret from Google Colab user secrets."""
29 |         if self.colab_userdata is None:
30 |             # The module doesn't exist. We probably aren't in Colab.
31 |             return None
32 |
33 |         try:
34 |             return SecretString(self.colab_userdata.get(secret_name))
35 |         except Exception:
36 |             # Secret name not found. Continue.
37 |             return None
38 |
-------------------------------------------------------------------------------- /examples/run_snowflake_faker.py: --------------------------------------------------------------------------------
1 | # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
2 | """ 3 | Usage: 4 | poetry install 5 | poetry run python examples/run_snowflake_faker.py 6 | """ 7 | 8 | from __future__ import annotations 9 | 10 | import airbyte as ab 11 | from airbyte.caches import SnowflakeCache 12 | from airbyte.secrets.google_gsm import GoogleGSMSecretManager 13 | 14 | 15 | SCALE = 10_000 16 | 17 | 18 | AIRBYTE_INTERNAL_GCP_PROJECT = "dataline-integration-testing" 19 | secret_mgr = GoogleGSMSecretManager( 20 | project=AIRBYTE_INTERNAL_GCP_PROJECT, 21 | credentials_json=ab.get_secret("GCP_GSM_CREDENTIALS"), 22 | ) 23 | 24 | secret = secret_mgr.get_secret( 25 | secret_name="AIRBYTE_LIB_SNOWFLAKE_CREDS", 26 | ) 27 | assert secret is not None, "Secret not found." 28 | secret_config = secret.parse_json() 29 | 30 | 31 | cache = SnowflakeCache( 32 | account=secret_config["account"], 33 | username=secret_config["username"], 34 | password=secret_config["password"], 35 | database=secret_config["database"], 36 | warehouse=secret_config["warehouse"], 37 | role=secret_config["role"], 38 | ) 39 | 40 | source = ab.get_source( 41 | "source-faker", 42 | config={ 43 | "count": SCALE, 44 | }, 45 | install_if_missing=True, 46 | streams="*", 47 | ) 48 | source.check() 49 | 50 | result = source.read(cache) 51 | 52 | for name in ["products"]: 53 | print(f"Stream {name}: {len(list(result[name]))} records") 54 | -------------------------------------------------------------------------------- /airbyte/datasets/_inmemory.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Airbyte, Inc., all rights reserved. 2 | """In-memory dataset class.""" 3 | 4 | from __future__ import annotations 5 | 6 | from typing import TYPE_CHECKING, Any 7 | 8 | from overrides import overrides 9 | 10 | from airbyte.datasets import DatasetBase 11 | 12 | 13 | if TYPE_CHECKING: 14 | from collections.abc import Iterator 15 | 16 | from airbyte_protocol.models import ConfiguredAirbyteStream 17 | 18 | 19 | class InMemoryDataset(DatasetBase): 20 | """A dataset that is held in memory. 21 | 22 | This dataset is useful for testing and debugging purposes, but should not be used with any 23 | large datasets. 24 | """ 25 | 26 | def __init__( 27 | self, 28 | records: list[dict[str, Any]], 29 | stream_metadata: ConfiguredAirbyteStream, 30 | ) -> None: 31 | """Initialize the dataset with a list of records.""" 32 | # Should already be a list, but we convert it to a list just in case an iterator is passed. 33 | self._records: list[dict[str, Any]] = list(records) 34 | super().__init__( 35 | stream_metadata=stream_metadata, 36 | ) 37 | 38 | @overrides 39 | def __iter__(self) -> Iterator[dict[str, Any]]: 40 | """Return the iterator of records.""" 41 | return iter(self._records) 42 | 43 | def __len__(self) -> int: 44 | """Return the number of records in the dataset.""" 45 | return len(self._records) 46 | -------------------------------------------------------------------------------- /tests/unit_tests/test_lowcode_connectors.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
2 | from __future__ import annotations
3 |
4 | from pathlib import Path
5 | import sys
6 |
7 | import pytest
8 | from airbyte import get_source
9 | from airbyte._util.meta import is_windows
10 |
11 | UNIT_TEST_DB_PATH: Path = Path(".cache") / "unit_tests" / "test_db.duckdb"
12 |
13 |
14 | @pytest.mark.parametrize(
15 |     "connector_name, config",
16 |     [
17 |         ("source-pokeapi", {"pokemon_name": "ditto"}),
18 |     ],
19 | )
20 | @pytest.mark.xfail(condition=is_windows(), reason="Test expected to fail on Windows.")
21 | @pytest.mark.skipif(
22 |     sys.version_info >= (3, 12),
23 |     reason="Test fails in Python 3.12 as the PokeAPI interface is blocked for bots/CI runners",
24 | )
25 | def test_nocode_execution(connector_name: str, config: dict) -> None:
26 |     source = get_source(
27 |         name=connector_name,
28 |         config=config,
29 |         source_manifest=True,
30 |     )
31 |     source.check()
32 |     source.select_all_streams()
33 |     read_result = source.read()
34 |     for name, records in read_result.streams.items():
35 |         assert name
36 |         assert len(records) > 0, f"No records were returned from the '{name}' stream."
37 |
38 |     # Confirm we can read twice:
39 |     read_result_2 = source.read()
40 |     for name, records in read_result_2.streams.items():
41 |         assert name
42 |         assert len(records) > 0, f"No records were returned from the '{name}' stream."
43 |
-------------------------------------------------------------------------------- /airbyte/_processors/sql/motherduck.py: --------------------------------------------------------------------------------
1 | # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
2 | """A MotherDuck implementation of the cache, built on the DuckDB implementation."""
3 |
4 | from __future__ import annotations
5 |
6 | import warnings
7 | from typing import TYPE_CHECKING
8 |
9 | from duckdb_engine import DuckDBEngineWarning
10 | from overrides import overrides
11 |
12 | from airbyte._processors.sql.duckdb import DuckDBSqlProcessor
13 | from airbyte._writers.jsonl import JsonlWriter
14 |
15 |
16 | if TYPE_CHECKING:
17 |     from airbyte.caches.motherduck import MotherDuckCache
18 |
19 |
20 | # Suppress warnings from DuckDB about reflection on indices.
21 | # https://github.com/Mause/duckdb_engine/issues/905
22 | warnings.filterwarnings(
23 |     "ignore",
24 |     message="duckdb-engine doesn't yet support reflection on indices",
25 |     category=DuckDBEngineWarning,
26 | )
27 |
28 |
29 | class MotherDuckSqlProcessor(DuckDBSqlProcessor):
30 |     """A SQL processor for MotherDuck."""
31 |
32 |     supports_merge_insert = False
33 |     file_writer_class = JsonlWriter
34 |     cache: MotherDuckCache
35 |
36 |     @overrides
37 |     def _setup(self) -> None:
38 |         """Do any necessary setup, if applicable.
39 |
40 |         Note: The DuckDB parent class requires pre-creation of local directory structure. We
41 |         don't need to do that here, so we override the method to be a no-op.
42 |         """
43 |         # No setup to do and no need to pre-create local file storage.
44 |         pass
45 |
-------------------------------------------------------------------------------- /airbyte/_util/api_imports.py: --------------------------------------------------------------------------------
1 | # Copyright (c) 2024 Airbyte, Inc., all rights reserved.
2 | """Imported classes from the Airbyte API.
3 |
4 | Any classes that are imported from the Airbyte API should be imported here.
5 | This allows for easy access to these classes in other modules, especially
6 | for type hinting purposes.
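For example, a function can reference these types without importing `airbyte_api` directly.
A minimal sketch (here `check_job` is an illustrative helper, not part of this module, and it
assumes the `status` field on `JobResponse`):

```python
from airbyte._util.api_imports import JobResponse, JobStatusEnum

def check_job(job: JobResponse) -> JobStatusEnum:
    return job.status
```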
7 | 8 | Design Guidelines: 9 | - No modules except `api_util` and `api_imports` should import from `airbyte_api`. 10 | - If a module needs to import from `airbyte_api`, it should import from `api_imports` (this module) 11 | instead. 12 | - This module is divided into two sections: internal-use classes and public-use classes. 13 | - Public-use classes should be carefully reviewed to ensure that they are necessary for public use 14 | and that we are willing to support them as part of PyAirbyte. 15 | """ 16 | # Ignore import sorting in this file. Manual grouping is more important. 17 | # ruff: noqa: I001 18 | 19 | from __future__ import annotations 20 | 21 | # Internal-Use Classes 22 | 23 | # These classes are used internally to cache API responses. 24 | from airbyte_api.models import ( 25 | ConnectionResponse, 26 | DestinationResponse, 27 | JobResponse, 28 | ) 29 | 30 | # Public-Use Classes 31 | 32 | # This class is used to represent the status of a job. It may be used in 33 | # type hints for public functions that return a job status. 34 | from airbyte_api.models import JobStatusEnum # Alias not needed 35 | 36 | 37 | __all__: list[str] = [ 38 | "ConnectionResponse", 39 | "DestinationResponse", 40 | "JobResponse", 41 | "JobStatusEnum", 42 | ] 43 | -------------------------------------------------------------------------------- /.github/pr-welcome-community.md: -------------------------------------------------------------------------------- 1 | ## 👋 Welcome to PyAirbyte! 2 | 3 | Thank you for your contribution from **{{ .repo_name }}**! We're excited to have you in the Airbyte community. 4 | 5 | ### Testing This PyAirbyte Version 6 | 7 | You can test this version of PyAirbyte using the following: 8 | 9 | ```bash 10 | # Run PyAirbyte CLI from this branch: 11 | uvx --from 'git+https://github.com/{{ .repo_name }}.git@{{ .branch_name }}' pyairbyte --help 12 | 13 | # Install PyAirbyte from this branch for development: 14 | pip install 'git+https://github.com/{{ .repo_name }}.git@{{ .branch_name }}' 15 | ``` 16 | 17 | ### Helpful Resources 18 | 19 | - [Contributing Guidelines](https://github.com/airbytehq/PyAirbyte/blob/main/docs/CONTRIBUTING.md) 20 | - [PyAirbyte Documentation](https://docs.airbyte.com/using-airbyte/pyairbyte/getting-started) 21 | - [API Reference](https://airbytehq.github.io/PyAirbyte/) 22 | 23 | ### PR Slash Commands 24 | 25 | As needed or by request, Airbyte Maintainers can execute the following slash commands on your PR: 26 | 27 | - `/fix-pr` - Fixes most formatting and linting issues 28 | - `/poetry-lock` - Updates poetry.lock file 29 | - `/test-pr` - Runs tests with the updated PyAirbyte 30 | - `/prerelease` - Builds and publishes a prerelease version to PyPI 31 | 32 | ### Community Support 33 | 34 | If you have any questions, feel free to ask in the PR comments or join our community: 35 | - [Airbyte Slack](https://airbytehq.slack.com/) - Join the [#pyairbyte channel](https://airbytehq.slack.com/archives/C06FZ238P8W) 36 | - [GitHub Discussions](https://github.com/airbytehq/PyAirbyte/discussions) 37 | -------------------------------------------------------------------------------- /tests/integration_tests/test_validation.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
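"""Integration tests for `airbyte.validate.validate()` using the local fixture connectors."""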
2 | from __future__ import annotations 3 | 4 | 5 | import pytest 6 | from airbyte.validate import validate 7 | 8 | 9 | @pytest.fixture(scope="module", autouse=True) 10 | def autouse_source_test_installation(source_test_installation): 11 | return 12 | 13 | 14 | @pytest.fixture(scope="function", autouse=True) 15 | def autouse_source_test_registry(source_test_registry): 16 | return 17 | 18 | 19 | def test_validate_success(): 20 | validate( 21 | "./tests/integration_tests/fixtures/source-test", 22 | "./tests/integration_tests/fixtures/valid_config.json", 23 | validate_install_only=False, 24 | ) 25 | 26 | 27 | def test_validate_check_failure(): 28 | with pytest.raises(Exception): 29 | validate( 30 | "./tests/integration_tests/fixtures/source-test", 31 | "./tests/integration_tests/fixtures/invalid_config.json", 32 | validate_install_only=False, 33 | ) 34 | 35 | 36 | def test_validate_success_install_only(): 37 | validate( 38 | "./tests/integration_tests/fixtures/source-test", 39 | "./tests/integration_tests/fixtures/invalid_config.json", 40 | validate_install_only=True, 41 | ) 42 | 43 | 44 | def test_validate_config_failure(): 45 | with pytest.raises(Exception): 46 | validate( 47 | "./tests/integration_tests/fixtures/source-broken", 48 | "./tests/integration_tests/fixtures/valid_config.json", 49 | validate_install_only=True, 50 | ) 51 | -------------------------------------------------------------------------------- /airbyte/mcp/server.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Airbyte, Inc., all rights reserved. 2 | """Experimental MCP (Model Context Protocol) server for PyAirbyte connector management.""" 3 | 4 | import asyncio 5 | import sys 6 | 7 | from fastmcp import FastMCP 8 | 9 | from airbyte._util.meta import set_mcp_mode 10 | from airbyte.mcp._util import initialize_secrets 11 | from airbyte.mcp.cloud_ops import register_cloud_ops_tools 12 | from airbyte.mcp.connector_registry import register_connector_registry_tools 13 | from airbyte.mcp.local_ops import register_local_ops_tools 14 | from airbyte.mcp.prompts import register_prompts 15 | 16 | 17 | set_mcp_mode() 18 | initialize_secrets() 19 | 20 | app: FastMCP = FastMCP("airbyte-mcp") 21 | """The Airbyte MCP Server application instance.""" 22 | 23 | register_connector_registry_tools(app) 24 | register_local_ops_tools(app) 25 | register_cloud_ops_tools(app) 26 | register_prompts(app) 27 | 28 | 29 | def main() -> None: 30 | """@private Main entry point for the MCP server. 31 | 32 | This function starts the FastMCP server to handle MCP requests. 33 | 34 | It should not be called directly; instead, consult the MCP client documentation 35 | for instructions on how to connect to the server. 36 | """ 37 | print("Starting Airbyte MCP server.", file=sys.stderr) 38 | try: 39 | asyncio.run(app.run_stdio_async()) 40 | except KeyboardInterrupt: 41 | print("Airbyte MCP server interrupted by user.", file=sys.stderr) 42 | except Exception as ex: 43 | print(f"Error running Airbyte MCP server: {ex}", file=sys.stderr) 44 | sys.exit(1) 45 | 46 | print("Airbyte MCP server stopped.", file=sys.stderr) 47 | 48 | 49 | if __name__ == "__main__": 50 | main() 51 | -------------------------------------------------------------------------------- /examples/run_downloadable_yaml_source.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Airbyte, Inc., all rights reserved. 2 | """A test of PyAirbyte calling a declarative manifest. 
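
This script lists all manifest-only ("yaml") connectors from the registry, attempts to load each
one, reports any load failures grouped by error type, and then runs `source-pokeapi` end-to-end.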
3 |
4 | Usage (from PyAirbyte root directory):
5 | > poetry run python examples/run_downloadable_yaml_source.py
6 |
7 | """
8 |
9 | from __future__ import annotations
10 |
11 | import airbyte as ab
12 | from airbyte import get_source
13 |
14 | yaml_connectors: list[str] = ab.get_available_connectors(install_type="yaml")
15 |
16 | print(
17 |     f"Downloadable yaml sources ({len(yaml_connectors)}): \n- "
18 |     + "\n- ".join(yaml_connectors)
19 | )
20 |
21 | failed_installs: dict[str, list[str]] = {}
22 |
23 | for yaml_connector in yaml_connectors:
24 |     try:
25 |         _ = get_source(yaml_connector, source_manifest=True)
26 |     except Exception as ex:
27 |         exception_type = type(ex).__name__
28 |         if exception_type in failed_installs:
29 |             failed_installs[exception_type].append(yaml_connector)
30 |         else:
31 |             failed_installs[exception_type] = [yaml_connector]
32 |
33 | # Print any connector failures, grouped by the error message
34 | for error, connectors_failed in failed_installs.items():
35 |     print(
36 |         f"\nInstallation Errors ({len(connectors_failed)}): {error}\n- "
37 |         + "\n- ".join(connectors_failed)
38 |         + "\n"
39 |     )
40 |
41 | print("Running declarative source...")
42 | source = get_source(
43 |     "source-pokeapi",
44 |     config={
45 |         "pokemon_name": "ditto",
46 |     },
47 |     source_manifest=True,
48 | )
49 | source.check()
50 | source.select_all_streams()
51 |
52 | result = source.read()
53 |
54 | for name, records in result.streams.items():
55 |     print(f"Stream {name}: {len(records)} records")
56 |
-------------------------------------------------------------------------------- /examples/run_sync_to_destination_w_cache.py: --------------------------------------------------------------------------------
1 | # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
2 |
3 | """Test a sync to an Airbyte destination.
4 |
5 | Usage:
6 | ```
7 | poetry run python examples/run_sync_to_destination_w_cache.py
8 | ```
9 | """
10 |
11 | from __future__ import annotations
12 |
13 | import datetime
14 |
15 | import airbyte as ab
16 |
17 | SCALE = 200_000
18 |
19 |
20 | def get_my_source() -> ab.Source:
21 |     return ab.get_source(
22 |         "source-faker",
23 |         config={
24 |             "count": SCALE,
25 |             "seed": 1234,
26 |             "parallelism": 16,
27 |         },
28 |         streams="*",
29 |     )
30 |
31 |
32 | def get_my_destination() -> ab.Destination:
33 |     return ab.get_destination(
34 |         name="destination-duckdb",
35 |         config={
36 |             # This path is relative to the container:
37 |             "destination_path": "/local/temp/db.duckdb",
38 |         },
39 |         docker_image=True,
40 |         # OR:
41 |         # pip_url="git+https://github.com/airbytehq/airbyte.git#subdirectory=airbyte-integrations/connectors/destination-duckdb",
42 |     )
43 |
44 |
45 | def main() -> None:
46 |     """Test writing from the source to the destination."""
47 |     source = get_my_source()
48 |     source.check()
49 |     destination = get_my_destination()
50 |     destination.check()
51 |     write_result: ab.WriteResult = destination.write(
52 |         source_data=source,
53 |         cache=ab.new_local_cache(),
54 |     )
55 |     print(
56 |         f"Completed writing {write_result.processed_records:,} records "
57 |         f"to destination at {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}."
58 |     )
59 |
60 |
61 | if __name__ == "__main__":
62 |     main()
63 |
-------------------------------------------------------------------------------- /docs/faq.md: --------------------------------------------------------------------------------
1 | # PyAirbyte Frequently Asked Questions
2 |
3 | **1. Does PyAirbyte replace Airbyte?**
4 |
5 | No.
PyAirbyte is a Python library that allows you to use Airbyte connectors in Python, but it does
6 | not have orchestration or scheduling capabilities, nor does it provide logging, alerting, or other
7 | features for managing data pipelines in production. Airbyte is a full-fledged data integration
8 | platform that provides connectors, orchestration, and scheduling capabilities.
9 |
10 | **2. What is the PyAirbyte cache? Is it a destination?**
11 |
12 | Yes and no. You can think of it as a built-in destination implementation, but we avoid the word
13 | "destination" in our docs to prevent confusion with our certified destinations list
14 | [here](https://docs.airbyte.com/integrations/destinations/).
15 |
16 | **3. Does PyAirbyte work with data orchestration frameworks like Airflow, Dagster, and Snowpark,
17 | etc.?**
18 |
19 | Yes, it should. Please give it a try and report any problems you see. Also, drop us a note if it
20 | works for you!
21 |
22 | **4. Can I use PyAirbyte to develop or test when developing Airbyte sources?**
23 |
24 | Yes, you can. PyAirbyte makes it easy to test connectors in Python, and you can use it to develop
25 | new local connectors as well as to test existing, already-published ones.
26 |
27 | **5. Can I develop traditional ETL pipelines with PyAirbyte?**
28 |
29 | Yes. Just pick the cache type matching the destination - like `SnowflakeCache` for landing data in
30 | Snowflake.
31 |
32 | **6. Can PyAirbyte import a connector from a local directory that has python project files, or does
33 | it have to be installed from PyPI?**
34 |
35 | Yes, PyAirbyte can use any local install that has a CLI - and will automatically find connectors by
36 | name if they are on PATH.
37 |
-------------------------------------------------------------------------------- /airbyte/_writers/jsonl.py: --------------------------------------------------------------------------------
1 | # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
2 | """A JSONL file writer implementation."""
3 |
4 | from __future__ import annotations
5 |
6 | import gzip
7 | import json
8 | from typing import IO, TYPE_CHECKING, cast
9 |
10 | import orjson
11 | from overrides import overrides
12 |
13 | from airbyte._writers.file_writers import (
14 |     FileWriterBase,
15 | )
16 |
17 |
18 | if TYPE_CHECKING:
19 |     from pathlib import Path
20 |
21 |     from airbyte.records import StreamRecord
22 |
23 |
24 | class JsonlWriter(FileWriterBase):
25 |     """A JSONL file writer implementation."""
26 |
27 |     default_cache_file_suffix = ".jsonl.gz"
28 |     prune_extra_fields = True
29 |
30 |     @overrides
31 |     def _open_new_file(
32 |         self,
33 |         file_path: Path,
34 |     ) -> IO[str]:
35 |         """Open a new file for writing."""
36 |         return cast(
37 |             "IO[str]",
38 |             gzip.open(  # Avoiding context manager
39 |                 file_path,
40 |                 mode="wt",
41 |                 encoding="utf-8",
42 |             ),
43 |         )
44 |
45 |     @overrides
46 |     def _write_record_dict(
47 |         self,
48 |         record_dict: StreamRecord,
49 |         open_file_writer: IO[str],
50 |     ) -> None:
51 |         # If the record is too nested, `orjson` will fail with error `TypeError: Recursion
52 |         # limit reached`. If so, fall back to the slower `json.dumps`.
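        # Note: the fallback's `default=` hook only handles values that expose an
        # `isoformat()` method (e.g. datetime objects); any other unserializable
        # type will still raise.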
53 | try: 54 | open_file_writer.write(orjson.dumps(record_dict).decode(encoding="utf-8") + "\n") 55 | except TypeError: 56 | # Using isoformat method for datetime serialization 57 | open_file_writer.write( 58 | json.dumps(record_dict, default=lambda _: _.isoformat()) + "\n", 59 | ) 60 | -------------------------------------------------------------------------------- /examples/run_bigquery_destination.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Airbyte, Inc., all rights reserved. 2 | """ 3 | Usage: 4 | poetry install 5 | poetry run python examples/run_bigquery_destination.py 6 | """ 7 | 8 | from __future__ import annotations 9 | 10 | import tempfile 11 | import warnings 12 | 13 | import airbyte as ab 14 | from airbyte.secrets.google_gsm import GoogleGSMSecretManager 15 | 16 | 17 | warnings.filterwarnings("ignore", message="Cannot create BigQuery Storage client") 18 | 19 | 20 | AIRBYTE_INTERNAL_GCP_PROJECT = "dataline-integration-testing" 21 | SECRET_NAME = "SECRET_DESTINATION-BIGQUERY_CREDENTIALS__CREDS" 22 | 23 | bigquery_destination_secret: dict = ( 24 | GoogleGSMSecretManager( # type: ignore[union-attr] 25 | project=AIRBYTE_INTERNAL_GCP_PROJECT, 26 | credentials_json=ab.get_secret("GCP_GSM_CREDENTIALS"), 27 | ) 28 | .get_secret(SECRET_NAME) 29 | .parse_json() 30 | ) 31 | 32 | 33 | def main() -> None: 34 | source = ab.get_source( 35 | "source-faker", 36 | config={"count": 1000, "seed": 0, "parallelism": 1, "always_updated": False}, 37 | install_if_missing=True, 38 | ) 39 | source.check() 40 | source.select_all_streams() 41 | 42 | with tempfile.NamedTemporaryFile(mode="w+", delete=False, encoding="utf-8") as temp: 43 | # Write credentials to the temp file 44 | temp.write(bigquery_destination_secret["credentials_json"]) 45 | temp.flush() 46 | temp.close() 47 | 48 | destination = ab.get_destination( 49 | "destination-bigquery", 50 | config={**bigquery_destination_secret, "dataset_id": "pyairbyte_tests"}, 51 | ) 52 | write_result = destination.write( 53 | source, 54 | # cache=False, # Toggle comment to test with/without caching 55 | ) 56 | 57 | 58 | if __name__ == "__main__": 59 | main() 60 | -------------------------------------------------------------------------------- /airbyte/callbacks.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Airbyte, Inc., all rights reserved. 2 | """Callbacks for working with PyAirbyte.""" 3 | 4 | from __future__ import annotations 5 | 6 | from collections.abc import Callable 7 | from typing import Any 8 | 9 | 10 | ConfigChangeCallback = Callable[[dict[str, Any]], None] 11 | """Callback for when the configuration changes while the connector is running. 12 | 13 | This callback can be passed to supporting functions like `airbyte.get_source()` and 14 | `airbyte.get_destination()` to take action whenever configuration changes. 15 | The callback will be called with the new configuration as the only argument. 16 | 17 | The most common use case for this callback is for connectors with OAuth APIs to pass updated 18 | refresh tokens when the previous token is about to expire. 19 | 20 | Note that the dictionary passed will contain the entire configuration, not just the changed fields. 
21 |
22 | Example Usage:
23 |
24 | ```python
25 | from pathlib import Path
26 | from typing import Any
27 |
28 | import airbyte as ab
29 | import yaml
30 |
31 | config_file = Path("path/to/my/config.yaml")
32 | config_dict = yaml.safe_load(config_file.read_text())
33 |
34 | # Define the callback function:
35 | def config_callback(new_config: dict[str, Any]) -> None:
36 |     # Write the new config back to the config file:
37 |     config_file.write_text(yaml.safe_dump(new_config))
38 |
39 | # Pass in the callback function when creating the source:
40 | source = ab.get_source(
41 |     "source-faker",
42 |     config=config_dict,
43 |     config_change_callback=config_callback,
44 | )
45 | # Now read as usual. If config changes during sync, the callback will be called.
46 | source.read()
47 | ```
48 |
49 | For more information on the underlying Airbyte protocol, please see documentation on the
50 | [`CONNECTOR_CONFIG`](https://docs.airbyte.com/understanding-airbyte/airbyte-protocol#airbytecontrolconnectorconfigmessage)
51 | control messages.
52 | """
53 |
-------------------------------------------------------------------------------- /airbyte/_util/pip_util.py: --------------------------------------------------------------------------------
1 | # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
2 | """Internal utility functions for dealing with pip."""
3 |
4 | from __future__ import annotations
5 |
6 |
7 | def github_pip_url(
8 |     owner: str = "airbytehq",
9 |     repo: str = "airbyte",
10 |     *,
11 |     package_name: str,
12 |     branch_or_ref: str | None = None,
13 |     subdirectory: str | None = None,
14 | ) -> str:
15 |     """Return the pip URL for a GitHub repository.
16 |
17 |     Results will look like:
18 |     - `git+https://github.com/airbytehq/airbyte.git#egg=airbyte-lib&subdirectory=airbyte-lib`
19 |     - `git+https://github.com/airbytehq/airbyte.git@master#egg=airbyte-lib&subdirectory=airbyte-lib`
20 |     - `git+https://github.com/airbytehq/airbyte.git@my-branch#egg=source-github
21 |       &subdirectory=airbyte-integrations/connectors/source-github`
22 |     """
23 |     result = f"git+https://github.com/{owner}/{repo}.git"
24 |
25 |     if branch_or_ref:
26 |         result += f"@{branch_or_ref}"
27 |
28 |     next_delimiter = "#"
29 |     if package_name:
30 |         result += f"{next_delimiter}egg={package_name}"
31 |         next_delimiter = "&"
32 |
33 |     if subdirectory:
34 |         result += f"{next_delimiter}subdirectory={subdirectory}"
35 |
36 |     return result
37 |
38 |
39 | def connector_pip_url(
40 |     connector_name: str,
41 |     /,
42 |     branch: str,
43 |     *,
44 |     owner: str | None = None,
45 | ) -> str:
46 |     """Return a pip URL for a connector in the main `airbytehq/airbyte` git repo."""
47 |     owner = owner or "airbytehq"
48 |     if not connector_name.startswith("source-") and not connector_name.startswith("destination-"):
49 |         connector_name = "source-" + connector_name
50 |
51 |     return github_pip_url(
52 |         owner=owner,
53 |         repo="airbyte",
54 |         branch_or_ref=branch,
55 |         package_name=connector_name,
56 |         subdirectory=f"airbyte-integrations/connectors/{connector_name}",
57 |     )
58 |
-------------------------------------------------------------------------------- /airbyte/caches/duckdb.py: --------------------------------------------------------------------------------
1 | # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
2 | """A DuckDB implementation of the PyAirbyte cache.
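This is also the cache type PyAirbyte uses by default: `airbyte.caches.util.get_default_cache()`
returns a `DuckDBCache` backed by a local database file.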
3 |
4 | ## Usage Example
5 |
6 | ```python
7 | import airbyte as ab
8 | from airbyte.caches import DuckDBCache
9 |
10 | cache = DuckDBCache(
11 |     db_path="/path/to/my/duckdb-file",
12 |     schema_name="myschema",
13 | )
14 | ```
15 | """
16 |
17 | from __future__ import annotations
18 |
19 | import warnings
20 | from typing import TYPE_CHECKING, ClassVar
21 |
22 | from airbyte_api.models import DestinationDuckdb
23 | from duckdb_engine import DuckDBEngineWarning
24 |
25 | from airbyte._processors.sql.duckdb import DuckDBConfig, DuckDBSqlProcessor
26 | from airbyte.caches.base import CacheBase
27 | from airbyte.destinations._translate_cache_to_dest import duckdb_cache_to_destination_configuration
28 |
29 |
30 | if TYPE_CHECKING:
31 |     from airbyte.shared.sql_processor import SqlProcessorBase
32 |
33 |
34 | # Suppress warnings from DuckDB about reflection on indices.
35 | # https://github.com/Mause/duckdb_engine/issues/905
36 | warnings.filterwarnings(
37 |     "ignore",
38 |     message="duckdb-engine doesn't yet support reflection on indices",
39 |     category=DuckDBEngineWarning,
40 | )
41 |
42 |
43 | class DuckDBCache(DuckDBConfig, CacheBase):
44 |     """A DuckDB cache."""
45 |
46 |     _sql_processor_class: ClassVar[type[SqlProcessorBase]] = DuckDBSqlProcessor
47 |
48 |     paired_destination_name: ClassVar[str | None] = "destination-duckdb"
49 |     paired_destination_config_class: ClassVar[type | None] = DestinationDuckdb
50 |
51 |     @property
52 |     def paired_destination_config(self) -> DestinationDuckdb:
53 |         """Return a dictionary of destination configuration values."""
54 |         return duckdb_cache_to_destination_configuration(cache=self)
55 |
56 |
57 | # Expose the Cache class and also the Config class.
58 | __all__ = [
59 |     "DuckDBCache",
60 |     "DuckDBConfig",
61 | ]
62 |
-------------------------------------------------------------------------------- /.github/workflows/slash_command_dispatch.yml: --------------------------------------------------------------------------------
1 | name: Slash Command Dispatch
2 |
3 | on:
4 |   issue_comment:
5 |     types: [created]
6 |
7 | env:
8 |   AIRBYTE_ANALYTICS_ID: ${{ vars.AIRBYTE_ANALYTICS_ID }}
9 |
10 | jobs:
11 |   slashCommandDispatch:
12 |     # Only allow slash commands on pull requests (not on issues)
13 |     if: ${{ github.event.issue.pull_request }}
14 |     runs-on: ubuntu-latest
15 |     steps:
16 |
17 |       - name: Authenticate as GitHub App
18 |         uses: actions/create-github-app-token@67018539274d69449ef7c02e8e71183d1719ab42 # v2.1.4
19 |         id: get-app-token
20 |         with:
21 |           owner: "airbytehq"
22 |           repositories: "PyAirbyte"
23 |           app-id: ${{ secrets.OCTAVIA_BOT_APP_ID }}
24 |           private-key: ${{ secrets.OCTAVIA_BOT_PRIVATE_KEY }}
25 |       - name: Slash Command Dispatch
26 |         id: dispatch
27 |         uses: peter-evans/slash-command-dispatch@13bc09769d122a64f75aa5037256f6f2d78be8c4 # v4.0.0
28 |         with:
29 |           repository: ${{ github.repository }}
30 |           token: ${{ steps.get-app-token.outputs.token }}
31 |           dispatch-type: workflow
32 |           issue-type: pull-request
33 |           commands: |
34 |             fix-pr
35 |             test-pr
36 |             poetry-lock
37 |             poe
38 |             prerelease
39 |           static-args: |
40 |             pr=${{ github.event.issue.number }}
41 |             comment-id=${{ github.event.comment.id }}
42 |
43 |           # Only run for users with 'write' permission on the main repository
44 |           permission: write
45 |
46 |       - name: Edit comment with error message
47 |         if: steps.dispatch.outputs.error-message
48 |         uses: peter-evans/create-or-update-comment@e8674b075228eee787fea43ef493e45ece1004c9 # v5.0.0
49 |         with:
50 |           comment-id: ${{ github.event.comment.id }}
51 |           body: |
52 |             > Error:
${{ steps.dispatch.outputs.error-message }} 53 | -------------------------------------------------------------------------------- /.github/workflows/semantic_pr_check.yml: -------------------------------------------------------------------------------- 1 | name: "Verify Semantic PR Title" 2 | 3 | on: 4 | pull_request: 5 | types: 6 | - opened 7 | - edited 8 | - synchronize 9 | - ready_for_review 10 | 11 | env: 12 | AIRBYTE_ANALYTICS_ID: ${{ vars.AIRBYTE_ANALYTICS_ID }} 13 | 14 | permissions: 15 | pull-requests: read 16 | 17 | jobs: 18 | validate_pr_title: 19 | name: Validate PR title 20 | runs-on: ubuntu-latest 21 | steps: 22 | - uses: amannn/action-semantic-pull-request@48f256284bd46cdaab1048c3721360e808335d50 # v6.1.1 23 | if: ${{ github.event.pull_request.draft == false }} 24 | env: 25 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 26 | with: 27 | # Configure which types are allowed (newline-delimited). 28 | # These are intentionally case-insensitive, allowing title casing or all lowercase. 29 | # See: https://github.com/commitizen/conventional-commit-types/blob/master/index.json 30 | types: | 31 | fix 32 | Fix 33 | feat 34 | Feat 35 | docs 36 | Docs 37 | ci 38 | CI 39 | chore 40 | Chore 41 | build 42 | Build 43 | test 44 | Test 45 | 46 | # # We don't use scopes as of now 47 | # scopes: | 48 | # core 49 | # ui 50 | 51 | - name: Check for "do not merge" in PR title 52 | if: ${{ github.event.pull_request.draft == false }} 53 | uses: actions/github-script@f28e40c7f34bde8b3046d885e986cb6290c5673b # v7.1.0 54 | with: 55 | script: | 56 | const title = context.payload.pull_request.title.toLowerCase(); 57 | if (title.includes('do not merge') || title.includes('do-not-merge')) { 58 | core.setFailed('PR title contains "do not merge" or "do-not-merge". Please remove this before merging.'); 59 | } 60 | -------------------------------------------------------------------------------- /airbyte/mcp/_annotations.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Airbyte, Inc., all rights reserved. 2 | """MCP tool annotation constants. 3 | 4 | These constants define the standard MCP annotations for tools, following the 5 | FastMCP 2.2.7+ specification. 6 | 7 | For more information, see: 8 | https://gofastmcp.com/concepts/tools#mcp-annotations 9 | """ 10 | 11 | from __future__ import annotations 12 | 13 | 14 | READ_ONLY_HINT = "readOnlyHint" 15 | """Indicates if the tool only reads data without making any changes. 16 | 17 | When True, the tool performs read-only operations and does not modify any state. 18 | When False, the tool may write, create, update, or delete data. 19 | 20 | FastMCP default if not specified: False 21 | """ 22 | 23 | DESTRUCTIVE_HINT = "destructiveHint" 24 | """Signals if the tool's changes are destructive (updates or deletes existing data). 25 | 26 | This hint is only relevant for non-read-only tools (readOnlyHint=False). 27 | When True, the tool modifies or deletes existing data in a way that may be 28 | difficult or impossible to reverse. 29 | When False, the tool creates new data or performs non-destructive operations. 30 | 31 | FastMCP default if not specified: True 32 | """ 33 | 34 | IDEMPOTENT_HINT = "idempotentHint" 35 | """Indicates if repeated calls with the same parameters have the same effect. 36 | 37 | When True, calling the tool multiple times with identical parameters produces 38 | the same result and side effects as calling it once. 39 | When False, each call may produce different results or side effects. 
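
For example, a tool that looks up a connector in the registry is idempotent, while a tool that
triggers a new sync run is not.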
40 | 41 | FastMCP default if not specified: False 42 | """ 43 | 44 | OPEN_WORLD_HINT = "openWorldHint" 45 | """Specifies if the tool interacts with external systems. 46 | 47 | When True, the tool communicates with external services, APIs, or systems 48 | outside the local environment (e.g., cloud APIs, remote databases, internet). 49 | When False, the tool only operates on local state or resources. 50 | 51 | FastMCP default if not specified: True 52 | """ 53 | -------------------------------------------------------------------------------- /tests/lint_tests/test_ruff.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Airbyte, Inc., all rights reserved. 2 | from __future__ import annotations 3 | 4 | import subprocess 5 | 6 | import pytest 7 | 8 | 9 | @pytest.mark.linting 10 | def test_ruff_linting(): 11 | # Run the check command 12 | check_result = subprocess.run( 13 | ["poetry", "run", "ruff", "check", "."], 14 | stdout=subprocess.PIPE, 15 | stderr=subprocess.PIPE, 16 | ) 17 | 18 | # Assert that the Ruff command exited without errors (exit code 0) 19 | assert check_result.returncode == 0, ( 20 | "Ruff checks failed:\n\n" 21 | + f"{check_result.stdout.decode()}\n{check_result.stderr.decode()}\n\n" 22 | + "Run `poetry run ruff check .` to view all issues." 23 | ) 24 | 25 | 26 | @pytest.mark.linting 27 | def test_ruff_linting_fixable(): 28 | # Run the check command 29 | fix_diff_result = subprocess.run( 30 | ["poetry", "run", "ruff", "check", "--fix", "--diff", "."], 31 | stdout=subprocess.PIPE, 32 | stderr=subprocess.PIPE, 33 | ) 34 | 35 | # Assert that the Ruff command exited without errors (exit code 0) 36 | assert fix_diff_result.returncode == 0, ( 37 | "Ruff checks revealed fixable issues:\n\n" 38 | + f"{fix_diff_result.stdout.decode()}\n{fix_diff_result.stderr.decode()}\n\n" 39 | + "Run `poetry run ruff check --fix .` to attempt automatic fixes." 40 | ) 41 | 42 | 43 | @pytest.mark.linting 44 | def test_ruff_format(): 45 | # Define the command to run Ruff 46 | command = ["poetry", "run", "ruff", "format", "--check", "--diff"] 47 | 48 | # Run the command 49 | result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE) 50 | 51 | # Assert that the Ruff command exited without errors (exit code 0) 52 | assert result.returncode == 0, ( 53 | f"Ruff checks failed:\n\n{result.stdout.decode()}\n{result.stderr.decode()}\n\n" 54 | + "Run `poetry run ruff format .` to attempt automatic fixes." 55 | ) 56 | -------------------------------------------------------------------------------- /.github/workflows/pydoc_publish.yml: -------------------------------------------------------------------------------- 1 | name: Publish Docs 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | 8 | # Allows you to run this workflow manually from the Actions tab 9 | workflow_dispatch: 10 | 11 | env: 12 | AIRBYTE_ANALYTICS_ID: ${{ vars.AIRBYTE_ANALYTICS_ID }} 13 | 14 | # Sets permissions of the GITHUB_TOKEN to allow deployment to GitHub Pages 15 | permissions: 16 | contents: read 17 | pages: write 18 | id-token: write 19 | 20 | # Allow only one concurrent deployment, skipping runs queued between the run in-progress and latest queued. 21 | # However, do NOT cancel in-progress runs as we want to allow these production deployments to complete. 
22 | concurrency: 23 | group: "pages" 24 | cancel-in-progress: false 25 | 26 | jobs: 27 | publish_docs: 28 | runs-on: ubuntu-latest 29 | environment: 30 | name: "github-pages" 31 | url: ${{ steps.deployment.outputs.page_url }} 32 | 33 | steps: 34 | - name: Checkout code 35 | uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 36 | - name: Set up Python 37 | uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 38 | with: 39 | python-version: '3.10' 40 | - name: Set up Poetry 41 | uses: Gr1N/setup-poetry@48b0f77c8c1b1b19cb962f0f00dff7b4be8f81ec # v9 42 | with: 43 | poetry-version: "2.2.0" 44 | - name: Setup Pages 45 | uses: actions/configure-pages@983d7736d9b0ae728b81ab479565c72886d7745b # v5.0.0 46 | 47 | - name: Install dependencies 48 | run: poetry install 49 | 50 | - name: Generate documentation 51 | run: | 52 | poetry run poe docs-generate 53 | 54 | - name: Upload artifact 55 | uses: actions/upload-pages-artifact@7b1f4a764d45c48632c6b24a0339c27f5614fb0b # v4.0.0 56 | with: 57 | # Upload entire repository 58 | path: 'docs/generated' 59 | 60 | - name: Deploy to GitHub Pages 61 | id: deployment 62 | uses: actions/deploy-pages@d6db90164ac5ed86f2b6aed7e0febac5b3c0c03e # v4.0.5 63 | -------------------------------------------------------------------------------- /examples/run_sync_to_destination_from_read_result.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Airbyte, Inc., all rights reserved. 2 | 3 | """Test a sync to an Airbyte destination. 4 | 5 | Usage: 6 | ``` 7 | poetry run python examples/run_sync_to_destination_from_read_result.py 8 | ``` 9 | """ 10 | 11 | from __future__ import annotations 12 | 13 | import datetime 14 | 15 | import airbyte as ab 16 | 17 | SCALE = 200_000 18 | 19 | 20 | def get_my_source() -> ab.Source: 21 | return ab.get_source( 22 | "source-faker", 23 | config={ 24 | "count": SCALE, 25 | "seed": 1234, 26 | "parallelism": 16, 27 | }, 28 | streams=["purchases"], 29 | ) 30 | 31 | 32 | def get_my_destination() -> ab.Destination: 33 | return ab.get_destination( 34 | name="destination-duckdb", 35 | config={ 36 | # This path is relative to the container: 37 | "destination_path": "/local/temp/db.duckdb", 38 | }, 39 | docker_image="airbyte/destination-duckdb:latest", 40 | # OR: 41 | # pip_url="git+https://github.com/airbytehq/airbyte.git#subdirectory=airbyte-integrations/connectors/destination-duckdb", 42 | ) 43 | 44 | 45 | def main() -> None: 46 | """Test writing from the source to the destination.""" 47 | source = get_my_source() 48 | source.check() 49 | destination = get_my_destination() 50 | destination.check() 51 | 52 | read_result: ab.ReadResult = source.read( 53 | cache=ab.new_local_cache(), 54 | ) 55 | print( 56 | "Completed reading from source at " 57 | f"{datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}. " 58 | "Writing to destination..." 59 | ) 60 | write_result: ab.WriteResult = destination.write( 61 | source_data=read_result, 62 | ) 63 | print( 64 | f"Completed writing {write_result.processed_records:,} records " 65 | f"to destination at {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}." 
66 | ) 67 | 68 | 69 | if __name__ == "__main__": 70 | main() 71 | -------------------------------------------------------------------------------- /.github/workflows/welcome-message.yml: -------------------------------------------------------------------------------- 1 | name: Community PR Welcome Message 2 | 3 | # This workflow automatically adds a welcome message to PRs from community contributors (forks) 4 | # It includes PyAirbyte usage instructions and other helpful resources for testing changes 5 | # 6 | # MANUAL TESTING INSTRUCTIONS: 7 | # To manually test this workflow, temporarily uncomment the "synchronize" event type as a workflow trigger. 8 | # Then the workflow will run for all new commits. 9 | # 10 | # Before merging, remember to again comment-out the "synchronize" clause and uncomment the `if:` condition. 11 | 12 | on: 13 | pull_request_target: 14 | types: 15 | - opened 16 | - reopened 17 | # Toggle this line, uncommenting for testing: 18 | # - synchronize 19 | 20 | jobs: 21 | welcome-contributor: 22 | name: PR Welcome Message 23 | permissions: 24 | contents: read 25 | issues: write 26 | pull-requests: write 27 | runs-on: ubuntu-24.04 28 | steps: 29 | - name: Checkout repository 30 | uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 31 | 32 | - name: Resolve workflow variables 33 | id: vars 34 | uses: aaronsteers/resolve-ci-vars-action@2e56afab0344bbe03c047dfa39bae559d0291472 # v0.1.6 35 | 36 | - name: Render template 37 | id: template 38 | uses: chuhlomin/render-template@807354a04d9300c9c2ac177c0aa41556c92b3f75 # v1.10 39 | with: 40 | # Use a different template for internal vs forks (community) 41 | template: ${{ steps.vars.outputs.pr-source-is-fork == 'true' && '.github/pr-welcome-community.md' || '.github/pr-welcome-internal.md' }} 42 | vars: | 43 | repo_name: ${{ steps.vars.outputs.pr-source-repo-name-full }} 44 | branch_name: ${{ steps.vars.outputs.pr-source-git-branch }} 45 | 46 | - name: Create comment 47 | uses: peter-evans/create-or-update-comment@e8674b075228eee787fea43ef493e45ece1004c9 # v5.0.0 48 | with: 49 | issue-number: ${{ github.event.pull_request.number }} 50 | body: ${{ steps.template.outputs.result }} 51 | -------------------------------------------------------------------------------- /tests/integration_tests/secrets/test_gsm_secrets.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Airbyte, Inc., all rights reserved. 
2 | """Tests for the GSM secrets manager.""" 3 | 4 | from __future__ import annotations 5 | 6 | from airbyte.secrets.base import SecretHandle 7 | from airbyte.secrets.google_gsm import GoogleGSMSecretManager 8 | 9 | 10 | def test_get_gsm_secret(ci_secret_manager: GoogleGSMSecretManager) -> dict: 11 | assert ci_secret_manager.get_secret( 12 | "SECRET_DESTINATION_DUCKDB__MOTHERDUCK__CREDS", 13 | ).parse_json() 14 | 15 | 16 | def test_get_gsm_secrets_with_filter(ci_secret_manager: GoogleGSMSecretManager) -> None: 17 | """Test fetching connector secrets.""" 18 | secrets = ci_secret_manager.fetch_secrets( 19 | filter_string="labels.connector=source-bigquery", 20 | ) 21 | assert secrets is not None 22 | secrets_list = list(secrets) 23 | assert len(secrets_list) > 0 24 | assert secrets_list[0].get_value().is_json() 25 | 26 | 27 | def test_get_gsm_secrets_by_label(ci_secret_manager: GoogleGSMSecretManager) -> None: 28 | """Test fetching connector secrets.""" 29 | secrets = ci_secret_manager.fetch_secrets_by_label( 30 | label_key="connector", 31 | label_value="source-salesforce", 32 | ) 33 | assert secrets is not None 34 | secrets_list = list(secrets) 35 | assert len(secrets_list) > 0 36 | assert secrets_list[0].get_value().is_json() 37 | 38 | 39 | def test_get_connector_secrets(ci_secret_manager: GoogleGSMSecretManager) -> None: 40 | """Test fetching connector secrets.""" 41 | secrets = ci_secret_manager.fetch_connector_secrets("source-salesforce") 42 | assert secrets is not None 43 | secrets_list = list(secrets) 44 | assert len(secrets_list) > 0 45 | assert secrets_list[0].get_value().is_json() 46 | 47 | 48 | def test_first_connector_secret(ci_secret_manager: GoogleGSMSecretManager) -> None: 49 | """Test fetching connector secrets.""" 50 | secret = ci_secret_manager.fetch_connector_secret("source-salesforce") 51 | assert secret is not None 52 | assert isinstance(secret, SecretHandle) 53 | assert secret.get_value().is_json() 54 | -------------------------------------------------------------------------------- /airbyte/documents.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Airbyte, Inc., all rights reserved. 2 | """This module contains the `Documents` class for converting Airbyte records into documents. 3 | 4 | Generally you will not create `Documents` objects directly. Instead, you can use one of the 5 | following methods to generate documents from records: 6 | 7 | - `Source.get_documents()`: Get an iterable of documents from a source. 8 | - `Dataset.to_documents()`: Get an iterable of documents from a dataset. 9 | """ 10 | 11 | from __future__ import annotations 12 | 13 | from typing import TYPE_CHECKING, Any 14 | 15 | from pydantic import BaseModel, Field 16 | 17 | 18 | if TYPE_CHECKING: 19 | import datetime 20 | 21 | 22 | MAX_SINGLE_LINE_LENGTH = 60 23 | AIRBYTE_DOCUMENT_RENDERING = "airbyte_document_rendering" 24 | TITLE_PROPERTY = "title_property" 25 | CONTENT_PROPS = "content_properties" 26 | METADATA_PROPERTIES = "metadata_properties" 27 | 28 | 29 | class Document(BaseModel): 30 | """A PyAirbyte document is a specific projection on top of a record. 31 | 32 | Documents have the following structure: 33 | - id (str): A unique string identifier for the document. 34 | - content (str): A string representing the record when rendered as a document. 35 | - metadata (dict[str, Any]): Associated metadata about the document, such as the record's IDs 36 | and/or URLs. 
37 | 38 | This class is duck-typed to be compatible with LangChain project's `Document` class. 39 | """ 40 | 41 | id: str | None = Field(default=None) 42 | content: str 43 | metadata: dict[str, Any] 44 | last_modified: datetime.datetime | None = Field(default=None) 45 | 46 | def __str__(self) -> str: 47 | """Return a string representation of the document.""" 48 | return self.content 49 | 50 | @property 51 | def page_content(self) -> str: 52 | """Return the content of the document. 53 | 54 | This is an alias for the `content` property, and is provided for duck-type compatibility 55 | with the LangChain project's `Document` class. 56 | """ 57 | return self.content 58 | 59 | 60 | __all__ = [ 61 | "Document", 62 | ] 63 | -------------------------------------------------------------------------------- /airbyte/_executors/local.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Airbyte, Inc., all rights reserved. 2 | from __future__ import annotations 3 | 4 | from typing import TYPE_CHECKING, NoReturn 5 | 6 | from airbyte import exceptions as exc 7 | from airbyte._executors.base import Executor 8 | 9 | 10 | if TYPE_CHECKING: 11 | from pathlib import Path 12 | 13 | 14 | class PathExecutor(Executor): 15 | def __init__( 16 | self, 17 | name: str | None = None, 18 | *, 19 | path: Path, 20 | target_version: str | None = None, 21 | ) -> None: 22 | """Initialize a connector executor that runs a connector from a local path. 23 | 24 | If path is simply the name of the connector, it will be expected to exist in the current 25 | PATH or in the current working directory. 26 | """ 27 | self.path: Path = path 28 | name = name or path.name 29 | super().__init__(name=name, target_version=target_version) 30 | 31 | def ensure_installation( 32 | self, 33 | *, 34 | auto_fix: bool = True, 35 | ) -> None: 36 | """Ensure that the connector executable can be found. 37 | 38 | The auto_fix parameter is ignored for this executor type. 39 | """ 40 | _ = auto_fix 41 | try: 42 | self.execute(["spec"]) 43 | except Exception as e: 44 | raise exc.AirbyteConnectorExecutableNotFoundError( 45 | connector_name=self.name, 46 | ) from e 47 | 48 | def install(self) -> NoReturn: 49 | raise exc.AirbyteConnectorInstallationError( 50 | message="Connector cannot be installed because it is not managed by PyAirbyte.", 51 | connector_name=self.name, 52 | ) 53 | 54 | def uninstall(self) -> NoReturn: 55 | raise exc.AirbyteConnectorInstallationError( 56 | message="Connector cannot be uninstalled because it is not managed by PyAirbyte.", 57 | connector_name=self.name, 58 | ) 59 | 60 | @property 61 | def _cli(self) -> list[str]: 62 | """Get the base args of the CLI executable.""" 63 | return [str(self.path)] 64 | -------------------------------------------------------------------------------- /examples/run_bigquery_faker.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
2 | """ 3 | Usage: 4 | poetry install 5 | poetry run python examples/run_bigquery_faker.py 6 | """ 7 | 8 | from __future__ import annotations 9 | 10 | import tempfile 11 | import warnings 12 | 13 | import airbyte as ab 14 | from airbyte.caches.bigquery import BigQueryCache 15 | from airbyte.secrets.google_gsm import GoogleGSMSecretManager 16 | 17 | 18 | warnings.filterwarnings("ignore", message="Cannot create BigQuery Storage client") 19 | 20 | 21 | AIRBYTE_INTERNAL_GCP_PROJECT = "dataline-integration-testing" 22 | SECRET_NAME = "SECRET_DESTINATION-BIGQUERY_CREDENTIALS__CREDS" 23 | 24 | bigquery_destination_secret: dict = ( 25 | GoogleGSMSecretManager( # type: ignore[union-attr] 26 | project=AIRBYTE_INTERNAL_GCP_PROJECT, 27 | credentials_json=ab.get_secret("GCP_GSM_CREDENTIALS"), 28 | ) 29 | .get_secret(SECRET_NAME) 30 | .parse_json() 31 | ) 32 | 33 | 34 | def main() -> None: 35 | source = ab.get_source( 36 | "source-faker", 37 | config={"count": 1000, "seed": 0, "parallelism": 1, "always_updated": False}, 38 | install_if_missing=True, 39 | ) 40 | source.check() 41 | source.select_all_streams() 42 | 43 | with tempfile.NamedTemporaryFile(mode="w+", delete=False, encoding="utf-8") as temp: 44 | # Write credentials to the temp file 45 | temp.write(bigquery_destination_secret["credentials_json"]) 46 | temp.flush() 47 | temp.close() 48 | 49 | cache = BigQueryCache( 50 | project_name=bigquery_destination_secret["project_id"], 51 | dataset_name=bigquery_destination_secret.get( 52 | "dataset_id", "pyairbyte_integtest" 53 | ), 54 | credentials_path=temp.name, 55 | ) 56 | 57 | result = source.read(cache) 58 | 59 | # Read a second time to make sure table swaps and incremental are working. 60 | result = source.read(cache) 61 | 62 | for name, records in result.streams.items(): 63 | print(f"Stream {name}: {len(records)} records") 64 | 65 | 66 | if __name__ == "__main__": 67 | main() 68 | -------------------------------------------------------------------------------- /airbyte/caches/postgres.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Airbyte, Inc., all rights reserved. 2 | """A Postgres implementation of the PyAirbyte cache. 3 | 4 | ## Usage Example 5 | 6 | ```python 7 | from airbyte as ab 8 | from airbyte.caches import PostgresCache 9 | 10 | cache = PostgresCache( 11 | host="myhost", 12 | port=5432, 13 | username="myusername", 14 | password=ab.get_secret("POSTGRES_PASSWORD"), 15 | database="mydatabase", 16 | ) 17 | ``` 18 | """ 19 | 20 | from __future__ import annotations 21 | 22 | from typing import TYPE_CHECKING, ClassVar 23 | 24 | from airbyte_api.models import DestinationPostgres 25 | 26 | from airbyte._processors.sql.postgres import PostgresConfig, PostgresSqlProcessor 27 | from airbyte.caches.base import CacheBase 28 | from airbyte.destinations._translate_cache_to_dest import ( 29 | postgres_cache_to_destination_configuration, 30 | ) 31 | 32 | 33 | if TYPE_CHECKING: 34 | from airbyte.shared.sql_processor import SqlProcessorBase 35 | 36 | 37 | class PostgresCache(PostgresConfig, CacheBase): 38 | """Configuration for the Postgres cache. 39 | 40 | Also inherits config from the JsonlWriter, which is responsible for writing files to disk. 
41 | """ 42 | 43 | _sql_processor_class: ClassVar[type[SqlProcessorBase]] = PostgresSqlProcessor 44 | 45 | paired_destination_name: ClassVar[str | None] = "destination-bigquery" 46 | paired_destination_config_class: ClassVar[type | None] = DestinationPostgres 47 | 48 | @property 49 | def paired_destination_config(self) -> DestinationPostgres: 50 | """Return a dictionary of destination configuration values.""" 51 | return postgres_cache_to_destination_configuration(cache=self) 52 | 53 | def clone_as_cloud_destination_config(self) -> DestinationPostgres: 54 | """Return a DestinationPostgres instance with the same configuration.""" 55 | return DestinationPostgres( 56 | host=self.host, 57 | port=self.port, 58 | username=self.username, 59 | password=self.password, 60 | database=self.database, 61 | ) 62 | 63 | 64 | # Expose the Cache class and also the Config class. 65 | __all__ = [ 66 | "PostgresCache", 67 | "PostgresConfig", 68 | ] 69 | -------------------------------------------------------------------------------- /airbyte/secrets/custom.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Airbyte, Inc., all rights reserved. 2 | """Custom secret manager that retrieves secrets from a custom source.""" 3 | 4 | from __future__ import annotations 5 | 6 | from abc import ABC 7 | 8 | from airbyte.secrets.base import SecretManager 9 | from airbyte.secrets.config import clear_secret_sources, register_secret_manager 10 | 11 | 12 | class CustomSecretManager(SecretManager, ABC): 13 | """Custom secret manager that retrieves secrets from a custom source. 14 | 15 | This class is a convenience class that can be used to create custom secret 16 | managers. By default, custom secrets managers are auto-registered during 17 | creation. 18 | """ 19 | 20 | auto_register = True 21 | replace_existing = False 22 | as_backup = False 23 | 24 | def __init__(self) -> None: 25 | """Initialize the custom secret manager.""" 26 | super().__init__() 27 | if self.auto_register: 28 | self.register() 29 | 30 | def register( 31 | self, 32 | *, 33 | replace_existing: bool | None = None, 34 | as_backup: bool | None = None, 35 | ) -> None: 36 | """Register the secret manager as global secret source. 37 | 38 | This makes the secret manager available to the `get_secret` function and 39 | allows it to be used automatically as a source for secrets. 40 | 41 | If `replace_existing` is `True`, the secret manager will replace all existing 42 | secrets sources, including the default secret managers such as environment 43 | variables, dotenv files, and Google Colab secrets. If `replace_existing` is 44 | None or not provided, the default behavior will be used from the `replace_existing` 45 | of the class (`False` unless overridden by the subclass). 46 | """ 47 | if replace_existing is None: 48 | replace_existing = self.replace_existing 49 | 50 | if as_backup is None: 51 | as_backup = self.as_backup 52 | 53 | if replace_existing: 54 | clear_secret_sources() 55 | 56 | register_secret_manager( 57 | self, 58 | as_backup=as_backup, 59 | replace_existing=replace_existing, 60 | ) 61 | -------------------------------------------------------------------------------- /airbyte/caches/_state_backend_base.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
2 | 
3 | """State backend implementation."""
4 | 
5 | from __future__ import annotations
6 | 
7 | import abc
8 | from typing import TYPE_CHECKING
9 | 
10 | 
11 | if TYPE_CHECKING:
12 |     from airbyte_protocol.models import (
13 |         AirbyteStreamState,
14 |     )
15 | 
16 |     from airbyte.shared.sql_processor import SqlConfig
17 |     from airbyte.shared.state_providers import StateProviderBase
18 |     from airbyte.shared.state_writers import StateWriterBase
19 | 
20 | 
21 | class StateBackendBase(abc.ABC):
22 |     """A class which manages the stream state for synced data.
23 | 
24 |     The backend is responsible for storing and retrieving the state of streams. It generates
25 |     `StateProvider` objects, which are paired to a specific source and table prefix.
26 |     """
27 | 
28 |     _sql_config: SqlConfig
29 | 
30 |     def __init__(self) -> None:
31 |         """Initialize the state manager with a static catalog state."""
32 |         self._state_artifacts: list[AirbyteStreamState] | None = None
33 | 
34 |     @abc.abstractmethod
35 |     def get_state_provider(
36 |         self,
37 |         source_name: str,
38 |         table_prefix: str,
39 |         *,
40 |         refresh: bool = True,
41 |         destination_name: str | None = None,
42 |     ) -> StateProviderBase:
43 |         """Return the state provider."""
44 |         ...
45 | 
46 |     @abc.abstractmethod
47 |     def get_state_writer(
48 |         self,
49 |         source_name: str,
50 |         destination_name: str | None = None,
51 |     ) -> StateWriterBase:
52 |         """Return a state writer for a named source.
53 | 
54 |         The backend's table prefix will also be applied to the state writer.
55 |         """
56 |         ...
57 | 
58 |     def _initialize_backend(
59 |         self,
60 |         *,
61 |         force_refresh: bool = False,
62 |     ) -> None:
63 |         """Do any needed initialization, for instance to load state artifacts from the cache.
64 | 
65 |         By default, this method does nothing. Subclasses may override this method to load state
66 |         artifacts or perform other initialization tasks.
67 |         """
68 |         _ = force_refresh  # Unused
69 | 
--------------------------------------------------------------------------------
/airbyte/caches/bigquery.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2024 Airbyte, Inc., all rights reserved.
2 | """A BigQuery implementation of the cache.
3 | 
4 | ## Usage Example
5 | 
6 | ```python
7 | import airbyte as ab
8 | from airbyte.caches import BigQueryCache
9 | 
10 | cache = BigQueryCache(
11 |     project_name="myproject",
12 |     dataset_name="mydataset",
13 |     credentials_path="path/to/credentials.json",
14 | )
15 | ```
16 | """
17 | 
18 | from __future__ import annotations
19 | 
20 | from typing import TYPE_CHECKING, ClassVar, NoReturn
21 | 
22 | from airbyte_api.models import DestinationBigquery
23 | 
24 | from airbyte._processors.sql.bigquery import BigQueryConfig, BigQuerySqlProcessor
25 | from airbyte.caches.base import (
26 |     CacheBase,
27 | )
28 | from airbyte.constants import DEFAULT_ARROW_MAX_CHUNK_SIZE
29 | from airbyte.destinations._translate_cache_to_dest import (
30 |     bigquery_cache_to_destination_configuration,
31 | )
32 | 
33 | 
34 | if TYPE_CHECKING:
35 |     from airbyte.shared.sql_processor import SqlProcessorBase
36 | 
37 | 
38 | class BigQueryCache(BigQueryConfig, CacheBase):
39 |     """The BigQuery cache implementation."""
40 | 
41 |     _sql_processor_class: ClassVar[type[SqlProcessorBase]] = BigQuerySqlProcessor
42 | 
43 |     paired_destination_name: ClassVar[str | None] = "destination-bigquery"
44 |     paired_destination_config_class: ClassVar[type | None] = DestinationBigquery
45 | 
46 |     @property
47 |     def paired_destination_config(self) -> DestinationBigquery:
48 |         """Return a dictionary of destination configuration values."""
49 |         return bigquery_cache_to_destination_configuration(cache=self)
50 | 
51 |     def get_arrow_dataset(
52 |         self,
53 |         stream_name: str,
54 |         *,
55 |         max_chunk_size: int = DEFAULT_ARROW_MAX_CHUNK_SIZE,
56 |     ) -> NoReturn:
57 |         """Raises NotImplementedError; BigQuery doesn't support `pd.read_sql_table`.
58 | 
59 |         See: https://github.com/airbytehq/PyAirbyte/issues/165
60 |         """
61 |         raise NotImplementedError(
62 |             "BigQuery doesn't currently support to_arrow. "
63 |             "Please consider using a different cache implementation for these functionalities."
64 |         )
65 | 
66 | 
67 | # Expose the Cache class and also the Config class.
68 | __all__ = [
69 |     "BigQueryCache",
70 |     "BigQueryConfig",
71 | ]
72 | 
--------------------------------------------------------------------------------
/examples/run_sync_to_destination_wo_cache.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
2 | 
3 | """Test a sync to an Airbyte destination.
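
This example reads the `issues` stream from `source-github` and writes it to a local
DuckDB destination; the DuckDB cache below is used only to track sync state (note
`cache=False` in the `destination.write(...)` call).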
4 | 5 | Usage: 6 | ``` 7 | poetry run python examples/run_sync_to_destination_wo_cache.py 8 | ``` 9 | """ 10 | 11 | from __future__ import annotations 12 | 13 | import datetime 14 | 15 | import airbyte as ab 16 | 17 | SCALE = 200_000 18 | 19 | 20 | def get_my_source() -> ab.Source: 21 | # Create a token here: https://github.com/settings/tokens 22 | # Then export as env var `GITHUB_PERSONAL_ACCESS_TOKEN` 23 | github_pat = ab.get_secret("GITHUB_PERSONAL_ACCESS_TOKEN") 24 | assert str(github_pat), "Could not locate Github PAT" 25 | source = ab.get_source( 26 | "source-github", 27 | config={ 28 | "repositories": ["airbytehq/PyAirbyte"], 29 | "credentials": { 30 | "personal_access_token": github_pat, 31 | }, 32 | }, 33 | ) 34 | source.check() 35 | source.select_streams(["issues"]) 36 | return source 37 | 38 | 39 | def get_cache() -> ab.DuckDBCache: 40 | return ab.new_local_cache( 41 | cache_name="state_cache", 42 | ) 43 | 44 | 45 | def get_my_destination() -> ab.Destination: 46 | return ab.get_destination( 47 | name="destination-duckdb", 48 | config={ 49 | # This path is relative to the container: 50 | "destination_path": "/local/temp/db.duckdb", 51 | }, 52 | docker_image="airbyte/destination-duckdb:latest", 53 | # OR: 54 | # pip_url="git+https://github.com/airbytehq/airbyte.git#subdirectory=airbyte-integrations/connectors/destination-duckdb", 55 | ) 56 | 57 | 58 | def main() -> None: 59 | """Test writing from the source to the destination.""" 60 | source = get_my_source() 61 | source.check() 62 | destination = get_my_destination() 63 | destination.check() 64 | state_cache = get_cache() 65 | write_result: ab.WriteResult = destination.write( 66 | source, 67 | cache=False, 68 | state_cache=state_cache, 69 | ) 70 | print( 71 | f"Completed writing {write_result.processed_records:,} records " 72 | f"to destination at {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}." 73 | ) 74 | 75 | 76 | if __name__ == "__main__": 77 | main() 78 | -------------------------------------------------------------------------------- /tests/unit_tests/test_caches.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
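"""Unit tests for the DuckDB cache configuration."""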
2 | from __future__ import annotations 3 | 4 | from pathlib import Path 5 | 6 | 7 | from airbyte.caches.base import CacheBase 8 | from airbyte.caches.duckdb import DuckDBCache 9 | 10 | UNIT_TEST_DB_PATH: Path = Path(".cache") / "unit_tests" / "test_db.duckdb" 11 | 12 | 13 | def test_duck_db_cache_config_initialization(): 14 | config = DuckDBCache(db_path=UNIT_TEST_DB_PATH, schema_name="test_schema") 15 | assert config.db_path == Path(UNIT_TEST_DB_PATH) 16 | assert config.schema_name == "test_schema" 17 | 18 | 19 | def test_duck_db_cache_config_default_schema_name(): 20 | config = DuckDBCache(db_path=UNIT_TEST_DB_PATH) 21 | assert config.schema_name == "main" 22 | 23 | 24 | def test_get_sql_alchemy_url(): 25 | config = DuckDBCache(db_path=UNIT_TEST_DB_PATH, schema_name="test_schema") 26 | assert config.get_sql_alchemy_url() == f"duckdb:///{UNIT_TEST_DB_PATH}" 27 | 28 | 29 | def test_get_sql_alchemy_url_with_default_schema_name(): 30 | config = DuckDBCache(db_path=UNIT_TEST_DB_PATH) 31 | assert config.get_sql_alchemy_url() == f"duckdb:///{UNIT_TEST_DB_PATH}" 32 | 33 | 34 | def test_duck_db_cache_config_inheritance(): 35 | assert issubclass(DuckDBCache, CacheBase) 36 | 37 | 38 | def test_duck_db_cache_config_get_sql_alchemy_url(): 39 | config = DuckDBCache(db_path=UNIT_TEST_DB_PATH, schema_name="test_schema") 40 | assert config.get_sql_alchemy_url() == f"duckdb:///{UNIT_TEST_DB_PATH}" 41 | 42 | 43 | def test_duck_db_cache_config_get_database_name(): 44 | config = DuckDBCache(db_path=UNIT_TEST_DB_PATH, schema_name="test_schema") 45 | assert config.get_database_name() == "test_db" 46 | 47 | 48 | def test_duck_db_cache_base_inheritance(): 49 | assert issubclass(DuckDBCache, CacheBase) 50 | 51 | 52 | def test_duck_db_cache_config_get_sql_alchemy_url_with_default_schema_name(): 53 | config = DuckDBCache(db_path=UNIT_TEST_DB_PATH) 54 | assert config.get_sql_alchemy_url() == f"duckdb:///{UNIT_TEST_DB_PATH}" 55 | 56 | 57 | def test_duck_db_cache_config_get_database_name_with_default_schema_name(): 58 | config = DuckDBCache(db_path=UNIT_TEST_DB_PATH) 59 | assert config.get_database_name() == "test_db" 60 | 61 | 62 | def test_duck_db_cache_config_inheritance_from_sql_cache_config_base(): 63 | assert issubclass(DuckDBCache, CacheBase) 64 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # temp files 2 | temp 3 | .temp 4 | 5 | # logs 6 | logs/ 7 | 8 | # Viztracer log files 9 | viztracer_report.json 10 | 11 | # Packaged docs 12 | docs/*.zip 13 | 14 | # Misc 15 | .DS_Store 16 | 17 | # Directories and subdirectories called '.secrets' and the top-level '/secrets' directory 18 | .secrets 19 | /secrets 20 | 21 | # Virtual Environments 22 | .venv 23 | .venv-* 24 | 25 | # Byte-compiled / optimized / DLL files 26 | __pycache__/ 27 | *.py[cod] 28 | *$py.class 29 | 30 | # Distribution / packaging 31 | .Python 32 | build/ 33 | develop-eggs/ 34 | dist/ 35 | downloads/ 36 | eggs/ 37 | .eggs/ 38 | lib/ 39 | lib64/ 40 | parts/ 41 | sdist/ 42 | var/ 43 | wheels/ 44 | pip-wheel-metadata/ 45 | share/python-wheels/ 46 | *.egg-info/ 47 | .installed.cfg 48 | *.egg 49 | MANIFEST 50 | 51 | # PyInstaller 52 | # Usually these files are written by a python script from a template 53 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
54 | *.manifest
55 | *.spec
56 | 
57 | # Installer logs
58 | pip-log.txt
59 | pip-delete-this-directory.txt
60 | 
61 | # Unit test / coverage reports
62 | htmlcov/
63 | .tox/
64 | .nox/
65 | .coverage
66 | .coverage.*
67 | .cache
68 | nosetests.xml
69 | coverage.xml
70 | *.cover
71 | .hypothesis/
72 | .pytest_cache/
73 | cover/
74 | 
75 | # Translations
76 | *.mo
77 | *.pot
78 | 
79 | # Logs:
80 | *.log
81 | 
82 | # Generated docs
83 | docs/_build/
84 | 
85 | # Jupyter Notebook
86 | .ipynb_checkpoints
87 | 
88 | # IPython
89 | profile_default/
90 | ipython_config.py
91 | 
92 | # pyenv
93 | .python-version
94 | 
95 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
96 | __pypackages__/
97 | 
98 | # SageMath parsed files
99 | *.sage.py
100 | 
101 | # Environment Vars & Secrets
102 | .env
103 | .envrc
104 | .env.local
105 | .venv
106 | venv/
107 | ENV/
108 | env.bak/
109 | venv.bak/
110 | 
111 | # Spyder project settings
112 | .spyderproject
113 | .spyproject
114 | 
115 | # Rope project settings
116 | .ropeproject
117 | 
118 | # mkdocs documentation
119 | /site
120 | 
121 | # Type checkers (mypy, pyrefly)
122 | .mypy_cache/
123 | .dmypy.json
124 | dmypy.json
125 | 
126 | # Pyre type checker
127 | .pyre/
128 | 
129 | # pytype static type analyzer
130 | .pytype/
131 | 
132 | # Cython debug symbols
133 | cython_debug/
134 | 
135 | # Pycharm
136 | .idea
137 | 
--------------------------------------------------------------------------------
/examples/run_gsm_connector_secret_fetch.py:
--------------------------------------------------------------------------------
1 | """Simple script to download connector secrets from Google Secret Manager (GSM).
2 | 
3 | Secrets will be located based on the `connector` label in the GSM secret metadata, and they
4 | will be written to the connector's secrets directory based upon the `filename` label.
5 | 
6 | Filename is appended with `.json` and the secret is written to that file.
7 | 
8 | As a safety measure, the script only writes secrets when the Airbyte repo root can be
9 | found; if it can't, the script fails with an error. The connector's secrets directory is
10 | created if missing; users should ensure it is excluded from git before running the script.
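
For example (illustrative names), a GSM secret labeled `connector=source-github` and
`filename=config` would be written to that connector's `secrets/config.json`.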
11 | 
12 | Usage:
13 |     poetry run python examples/run_gsm_connector_secret_fetch.py
14 |     poetry run python examples/run_gsm_connector_secret_fetch.py source-github
15 | """
16 | 
17 | from __future__ import annotations
18 | 
19 | import sys
20 | from pathlib import Path
21 | 
22 | import airbyte as ab
23 | from airbyte.secrets import GoogleGSMSecretManager, SecretHandle
24 | 
25 | AIRBYTE_INTERNAL_GCP_PROJECT = "dataline-integration-testing"
26 | CONNECTOR_NAME = sys.argv[1] if len(sys.argv) > 1 else "source-klaviyo"
27 | 
28 | AIRBYTE_REPO_ROOT = Path(__file__).parent.parent.parent / "airbyte"
29 | 
30 | 
31 | CONNECTOR_SECRETS_DIR = (
32 |     AIRBYTE_REPO_ROOT
33 |     / "airbyte-integrations"
34 |     / "connectors"
35 |     / CONNECTOR_NAME
36 |     / "secrets"
37 | )
38 | if not AIRBYTE_REPO_ROOT.exists():
39 |     raise FileNotFoundError(f"Airbyte repo root does not exist: {AIRBYTE_REPO_ROOT}")
40 | if not CONNECTOR_SECRETS_DIR.exists():
41 |     CONNECTOR_SECRETS_DIR.mkdir(parents=True, exist_ok=True)
42 | 
43 | 
44 | def main() -> None:
45 |     secret_mgr = GoogleGSMSecretManager(
46 |         project=AIRBYTE_INTERNAL_GCP_PROJECT,
47 |         credentials_json=ab.get_secret("GCP_GSM_CREDENTIALS"),
48 |     )
49 | 
50 |     secret: SecretHandle
51 |     for secret in secret_mgr.fetch_connector_secrets(
52 |         connector_name=CONNECTOR_NAME,
53 |     ):
54 |         filename_base = "config"  # Default filename if not overridden
55 |         if "filename" in secret.labels:
56 |             filename_base = secret.labels["filename"]
57 |         secret_file_path = CONNECTOR_SECRETS_DIR / f"{filename_base}.json"
58 |         secret.write_to_file(secret_file_path)
59 | 
60 | 
61 | if __name__ == "__main__":
62 |     main()
63 | 
--------------------------------------------------------------------------------
/airbyte/mcp/prompts.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2024 Airbyte, Inc., all rights reserved.
2 | """MCP prompt definitions for the PyAirbyte MCP server.
3 | 
4 | This module defines prompts that can be invoked by MCP clients to perform
5 | common workflows.
6 | """
7 | 
8 | from __future__ import annotations
9 | 
10 | from typing import TYPE_CHECKING, Annotated
11 | 
12 | from pydantic import Field
13 | 
14 | 
15 | if TYPE_CHECKING:
16 |     from fastmcp import FastMCP
17 | 
18 | 
19 | TEST_MY_TOOLS_GUIDANCE = """
20 | Test all available tools in this MCP server to confirm they are working properly.
21 | 
22 | Guidelines:
23 | - Iterate through each tool systematically
24 | - Use read-only operations whenever possible
25 | - For tools that modify data, use test/safe modes or skip if no safe testing method exists
26 | - Avoid creating persistent side effects (e.g., don't create real resources, connections, or data)
27 | - Document which tools were tested and their status
28 | - Report any errors or issues encountered
29 | - Provide a summary of the test results at the end
30 | 
31 | Focus on validating that tools:
32 | 1. Accept their required parameters correctly
33 | 2. Return expected output formats
34 | 3. Handle errors gracefully
35 | 4. Connect to required services (if applicable)
36 | 
37 | Be efficient and practical in your testing approach.
38 | """.strip()
39 | 
40 | 
41 | def test_my_tools_prompt(
42 |     scope: Annotated[
43 |         str | None,
44 |         Field(
45 |             description=(
46 |                 "Optional free-form text to focus or constrain testing. "
47 |                 "This can be a single word, a sentence, or a paragraph "
48 |                 "describing the desired scope or constraints."
49 |             ),
50 |         ),
51 |     ] = None,
52 | ) -> list[dict[str, str]]:
53 |     """Generate a prompt that instructs the agent to test available tools."""
54 |     content = TEST_MY_TOOLS_GUIDANCE
55 | 
56 |     if scope:
57 |         content = f"{content}\n\n---\n\nAdditional scope or constraints:\n{scope}"
58 | 
59 |     return [
60 |         {
61 |             "role": "user",
62 |             "content": content,
63 |         }
64 |     ]
65 | 
66 | 
67 | def register_prompts(app: FastMCP) -> None:
68 |     """Register all prompts with the FastMCP app."""
69 |     app.prompt(
70 |         name="test-my-tools",
71 |         description="Test all available MCP tools to confirm they are working properly",
72 |     )(test_my_tools_prompt)
73 | 
--------------------------------------------------------------------------------
/bin/test_mcp_tool.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # Copyright (c) 2024 Airbyte, Inc., all rights reserved.
3 | """One-liner CLI tool for testing PyAirbyte MCP tools directly with JSON arguments.
4 | 
5 | Usage:
6 |     poe mcp-tool-test <tool_name> '<json_args>'
7 | 
8 | Examples:
9 |     poe mcp-tool-test list_connectors '{}'
10 |     poe mcp-tool-test get_config_spec '{"connector_name": "source-pokeapi"}'
11 |     poe mcp-tool-test validate_config \
12 |         '{"connector_name": "source-pokeapi", "config": {"pokemon_name": "pikachu"}}'
13 |     poe mcp-tool-test run_sync \
14 |         '{"connector_name": "source-pokeapi", "config": {"pokemon_name": "pikachu"}}'
15 | 
16 |     poe mcp-tool-test check_airbyte_cloud_workspace '{}'
17 |     poe mcp-tool-test list_deployed_cloud_connections '{}'
18 |     poe mcp-tool-test get_cloud_sync_status \
19 |         '{"connection_id": "0791e193-811b-4fcf-91c3-f8c5963e74a0", "include_attempts": true}'
20 |     poe mcp-tool-test get_cloud_sync_logs \
21 |         '{"connection_id": "0791e193-811b-4fcf-91c3-f8c5963e74a0"}'
22 | """
23 | 
24 | import asyncio
25 | import json
26 | import sys
27 | import traceback
28 | from typing import Any
29 | 
30 | from fastmcp import Client
31 | 
32 | from airbyte.mcp.server import app
33 | 
34 | 
35 | MIN_ARGS = 3
36 | 
37 | 
38 | async def call_mcp_tool(tool_name: str, args: dict[str, Any]) -> object:
39 |     """Call an MCP tool using the FastMCP client."""
40 |     async with Client(app) as client:
41 |         return await client.call_tool(tool_name, args)
42 | 
43 | 
44 | def main() -> None:
45 |     """Main entry point for the MCP tool tester."""
46 |     if len(sys.argv) < MIN_ARGS:
47 |         print(__doc__, file=sys.stderr)
48 |         sys.exit(1)
49 | 
50 |     tool_name = sys.argv[1]
51 |     json_args = sys.argv[2]
52 | 
53 |     try:
54 |         args: dict[str, Any] = json.loads(json_args)
55 |     except json.JSONDecodeError as e:
56 |         print(f"Error parsing JSON arguments: {e}", file=sys.stderr)
57 |         sys.exit(1)
58 | 
59 |     try:
60 |         result = asyncio.run(call_mcp_tool(tool_name, args))
61 | 
62 |         if hasattr(result, "text"):
63 |             print(result.text)
64 |         else:
65 |             print(str(result))
66 | 
67 |     except Exception as e:
68 |         print(f"Error executing tool '{tool_name}': {e}", file=sys.stderr)
69 |         traceback.print_exc()
70 |         sys.exit(1)
71 | 
72 | 
73 | if __name__ == "__main__":
74 |     main()
75 | 
--------------------------------------------------------------------------------
/airbyte/secrets/env_vars.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2024 Airbyte, Inc., all rights reserved.
2 | """Secret manager that retrieves secrets from environment variables and `.env` files.""" 3 | 4 | from __future__ import annotations 5 | 6 | import os 7 | from typing import TYPE_CHECKING 8 | 9 | from dotenv import dotenv_values 10 | 11 | from airbyte.secrets.base import SecretManager, SecretSourceEnum, SecretString 12 | 13 | 14 | if TYPE_CHECKING: 15 | from pathlib import Path 16 | 17 | 18 | class EnvVarSecretManager(SecretManager): 19 | """Secret manager that retrieves secrets from environment variables.""" 20 | 21 | name = SecretSourceEnum.ENV.value 22 | 23 | def get_secret(self, secret_name: str) -> SecretString | None: 24 | """Get a named secret from the environment.""" 25 | if secret_name not in os.environ: 26 | return None 27 | 28 | return SecretString(os.environ[secret_name]) 29 | 30 | 31 | class DotenvSecretManager(SecretManager): 32 | """Secret manager that retrieves secrets from a `.env` file.""" 33 | 34 | dotenv_path: Path | None = None 35 | 36 | @property 37 | def name(self) -> str: # type: ignore[override] 38 | """Get name of secret manager.""" 39 | if self.dotenv_path: 40 | return f"{SecretSourceEnum.DOTENV.value}:{self.dotenv_path}" 41 | return SecretSourceEnum.DOTENV.value 42 | 43 | def __init__( 44 | self, 45 | dotenv_path: Path | None = None, 46 | ) -> None: 47 | """Initialize a new .env Secret Manager, with optionally specified file path.""" 48 | self.dotenv_path = dotenv_path 49 | 50 | def get_secret(self, secret_name: str) -> SecretString | None: 51 | """Get a named secret from the `.env` file.""" 52 | try: 53 | dotenv_vars: dict[str, str | None] = dotenv_values( 54 | dotenv_path=self.dotenv_path, 55 | ) 56 | except Exception: 57 | # Can't locate or parse a .env file 58 | return None 59 | 60 | if secret_name not in dotenv_vars: 61 | # Secret not found 62 | return None 63 | 64 | return SecretString(dotenv_vars[secret_name]) 65 | 66 | def list_secrets_names(self) -> list[str]: 67 | """List all secrets available in the .env file.""" 68 | dotenv_vars: dict[str, str | None] = dotenv_values( 69 | dotenv_path=self.dotenv_path, 70 | ) 71 | return list(dotenv_vars.keys()) 72 | -------------------------------------------------------------------------------- /tests/integration_tests/test_registry_spec.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Airbyte, Inc., all rights reserved. 
2 | """Integration tests for registry spec helper functions.""" 3 | 4 | from __future__ import annotations 5 | 6 | import pytest 7 | 8 | from airbyte._util.registry_spec import ( 9 | get_connector_spec_from_registry, 10 | validate_connector_config_from_registry, 11 | ) 12 | 13 | 14 | @pytest.mark.parametrize( 15 | "connector_name,platform,version", 16 | [ 17 | ("source-faker", "oss", None), 18 | ("source-faker", "cloud", None), 19 | ("destination-duckdb", "oss", None), 20 | ("source-faker", "oss", "6.2.0"), 21 | ("source-faker", "cloud", "6.2.0"), 22 | ], 23 | ) 24 | def test_get_connector_spec_from_registry( 25 | connector_name: str, 26 | platform: str, 27 | version: str | None, 28 | ) -> None: 29 | """Test fetching connector specs from the registry.""" 30 | spec = get_connector_spec_from_registry( 31 | connector_name, 32 | platform=platform, 33 | version=version, 34 | ) 35 | 36 | assert spec is not None 37 | assert isinstance(spec, dict) 38 | assert "type" in spec 39 | assert spec["type"] == "object" 40 | 41 | 42 | @pytest.mark.parametrize( 43 | "connector_name,config,expected_valid", 44 | [ 45 | ( 46 | "source-faker", 47 | { 48 | "count": 100, 49 | "seed": 12345, 50 | "parallelism": 1, 51 | }, 52 | True, 53 | ), 54 | ( 55 | "source-faker", 56 | { 57 | "count": "not_a_number", 58 | }, 59 | False, 60 | ), 61 | ( 62 | "source-faker", 63 | { 64 | "count": 0, 65 | }, 66 | False, 67 | ), 68 | ], 69 | ) 70 | def test_validate_connector_config_from_registry( 71 | connector_name: str, 72 | config: dict, 73 | expected_valid: bool, 74 | ) -> None: 75 | """Test validating connector configs against registry specs.""" 76 | is_valid, error_message = validate_connector_config_from_registry( 77 | connector_name, 78 | config, 79 | platform="oss", 80 | ) 81 | 82 | assert is_valid == expected_valid 83 | 84 | if expected_valid: 85 | assert error_message is None 86 | else: 87 | assert error_message is not None 88 | assert isinstance(error_message, str) 89 | -------------------------------------------------------------------------------- /airbyte/cloud/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Airbyte, Inc., all rights reserved. 2 | """PyAirbyte classes and methods for interacting with the Airbyte Cloud API. 3 | 4 | You can use this module to interact with Airbyte Cloud, OSS, and Enterprise. 5 | 6 | ## Examples 7 | 8 | ### Basic Sync Example: 9 | 10 | ```python 11 | import airbyte as ab 12 | from airbyte import cloud 13 | 14 | # Initialize an Airbyte Cloud workspace object 15 | workspace = cloud.CloudWorkspace( 16 | workspace_id="123", 17 | api_key=ab.get_secret("AIRBYTE_CLOUD_API_KEY"), 18 | ) 19 | 20 | # Run a sync job on Airbyte Cloud 21 | connection = workspace.get_connection(connection_id="456") 22 | sync_result = connection.run_sync() 23 | print(sync_result.get_job_status()) 24 | ``` 25 | 26 | ### Example Read From Cloud Destination: 27 | 28 | If your destination is supported, you can read records directly from the 29 | `SyncResult` object. Currently this is supported in Snowflake and BigQuery only. 30 | 31 | 32 | ```python 33 | # Assuming we've already created a `connection` object... 34 | 35 | # Get the latest job result and print the stream names 36 | sync_result = connection.get_sync_result() 37 | print(sync_result.stream_names) 38 | 39 | # Get a dataset from the sync result 40 | dataset: CachedDataset = sync_result.get_dataset("users") 41 | 42 | # Get a SQLAlchemy table to use in SQL queries... 
43 | users_table = dataset.to_sql_table() 44 | print(f"Table name: {users_table.name}") 45 | 46 | # Or iterate over the dataset directly 47 | for record in dataset: 48 | print(record) 49 | ``` 50 | """ 51 | 52 | from __future__ import annotations 53 | 54 | from typing import TYPE_CHECKING 55 | 56 | from airbyte.cloud.client_config import CloudClientConfig 57 | from airbyte.cloud.connections import CloudConnection 58 | from airbyte.cloud.constants import JobStatusEnum 59 | from airbyte.cloud.sync_results import SyncResult 60 | from airbyte.cloud.workspaces import CloudWorkspace 61 | 62 | 63 | # Submodules imported here for documentation reasons: https://github.com/mitmproxy/pdoc/issues/757 64 | if TYPE_CHECKING: 65 | # ruff: noqa: TC004 66 | from airbyte.cloud import client_config, connections, constants, sync_results, workspaces 67 | 68 | 69 | __all__ = [ 70 | # Submodules 71 | "workspaces", 72 | "connections", 73 | "constants", 74 | "client_config", 75 | "sync_results", 76 | # Classes 77 | "CloudWorkspace", 78 | "CloudConnection", 79 | "CloudClientConfig", 80 | "SyncResult", 81 | # Enums 82 | "JobStatusEnum", 83 | ] 84 | -------------------------------------------------------------------------------- /airbyte/secrets/config.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Airbyte, Inc., all rights reserved. 2 | """This module provides customization of how PyAirbyte locates secrets.""" 3 | 4 | from __future__ import annotations 5 | 6 | from airbyte._util import meta 7 | from airbyte.secrets.base import SecretManager, SecretSourceEnum 8 | from airbyte.secrets.env_vars import DotenvSecretManager, EnvVarSecretManager 9 | from airbyte.secrets.google_colab import ColabSecretManager 10 | from airbyte.secrets.prompt import SecretsPrompt 11 | 12 | 13 | _SECRETS_SOURCES: list[SecretManager] = [] 14 | 15 | 16 | def _get_secret_sources() -> list[SecretManager]: 17 | """Initialize the default secret sources.""" 18 | if len(_SECRETS_SOURCES) == 0: 19 | # Initialize the default secret sources 20 | _SECRETS_SOURCES.extend( 21 | [ 22 | EnvVarSecretManager(), 23 | DotenvSecretManager(), 24 | ] 25 | ) 26 | if meta.is_colab(): 27 | _SECRETS_SOURCES.append(ColabSecretManager()) 28 | 29 | if meta.is_interactive(): 30 | _SECRETS_SOURCES.append(SecretsPrompt()) 31 | 32 | return _SECRETS_SOURCES.copy() 33 | 34 | 35 | # Ensure the default secret sources are initialized 36 | _ = _get_secret_sources() 37 | 38 | 39 | def register_secret_manager( 40 | secret_manager: SecretManager, 41 | *, 42 | as_backup: bool = False, 43 | replace_existing: bool = False, 44 | ) -> None: 45 | """Register a custom secret manager.""" 46 | if replace_existing: 47 | clear_secret_sources() 48 | 49 | if as_backup: 50 | # Add to end of list 51 | _SECRETS_SOURCES.append(secret_manager) 52 | else: 53 | # Add to beginning of list 54 | _SECRETS_SOURCES.insert(0, secret_manager) 55 | 56 | 57 | def clear_secret_sources() -> None: 58 | """Clear all secret sources.""" 59 | _SECRETS_SOURCES.clear() 60 | 61 | 62 | def disable_secret_source(source: SecretManager | SecretSourceEnum) -> None: 63 | """Disable one of the default secrets sources. 64 | 65 | This function can accept either a `SecretManager` instance, a `SecretSourceEnum` enum value, or 66 | a string representing the name of the source to disable. 
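
    Example, as a minimal illustrative sketch (assumes the default sources are still
    registered and that `SecretSourceEnum` is importable from `airbyte.secrets`):

    ```python
    from airbyte.secrets import SecretSourceEnum
    from airbyte.secrets.config import disable_secret_source

    # Stop PyAirbyte from interactively prompting for missing secrets:
    disable_secret_source(SecretSourceEnum.PROMPT)
    ```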
67 | """ 68 | if isinstance(source, SecretManager) and source in _SECRETS_SOURCES: 69 | _SECRETS_SOURCES.remove(source) 70 | return 71 | 72 | # Else, remove by name 73 | for existing_source in list(_SECRETS_SOURCES).copy(): 74 | if str(existing_source) == str(source): 75 | _SECRETS_SOURCES.remove(existing_source) 76 | -------------------------------------------------------------------------------- /airbyte/datasets/_lazy.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Airbyte, Inc., all rights reserved. 2 | from __future__ import annotations 3 | 4 | from typing import TYPE_CHECKING, Any 5 | 6 | from overrides import overrides 7 | 8 | from airbyte.datasets import DatasetBase 9 | from airbyte.datasets._inmemory import InMemoryDataset 10 | 11 | 12 | if TYPE_CHECKING: 13 | import threading 14 | from collections.abc import Iterator, Mapping 15 | 16 | from airbyte_protocol.models import ConfiguredAirbyteStream 17 | 18 | from airbyte import progress 19 | 20 | 21 | class LazyDataset(DatasetBase): 22 | """A dataset that is loaded incrementally from a source or a SQL query.""" 23 | 24 | def __init__( 25 | self, 26 | iterator: Iterator[dict[str, Any]], 27 | *, 28 | stream_metadata: ConfiguredAirbyteStream, 29 | stop_event: threading.Event | None, 30 | progress_tracker: progress.ProgressTracker, 31 | ) -> None: 32 | self._stop_event: threading.Event | None = stop_event or None 33 | self._progress_tracker = progress_tracker 34 | self._iterator: Iterator[dict[str, Any]] = iterator 35 | super().__init__( 36 | stream_metadata=stream_metadata, 37 | ) 38 | 39 | @overrides 40 | def __iter__(self) -> Iterator[dict[str, Any]]: 41 | return self._iterator 42 | 43 | def __next__(self) -> Mapping[str, Any]: 44 | try: 45 | return next(self._iterator) 46 | except StopIteration: 47 | # The iterator is exhausted, tell the producer they can stop if they are still 48 | # producing records. (Esp. when an artificial limit is reached.) 49 | self._progress_tracker.log_success() 50 | if self._stop_event: 51 | self._stop_event.set() 52 | raise 53 | 54 | def fetch_all(self) -> InMemoryDataset: 55 | """Fetch all records to memory and return an InMemoryDataset.""" 56 | return InMemoryDataset( 57 | records=list(self._iterator), 58 | stream_metadata=self._stream_metadata, 59 | ) 60 | 61 | def close(self) -> None: 62 | """Stop the dataset iterator. 63 | 64 | This method is used to signal the dataset to stop fetching records, for example 65 | when the dataset is being fetched incrementally and the user wants to stop the 66 | fetching process. 67 | """ 68 | if self._stop_event: 69 | self._stop_event.set() 70 | 71 | def __del__(self) -> None: 72 | """Close the dataset when the object is deleted.""" 73 | self.close() 74 | -------------------------------------------------------------------------------- /airbyte/_batch_handles.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Airbyte, Inc., all rights reserved. 
2 | """Batch handle class.""" 3 | 4 | from __future__ import annotations 5 | 6 | from contextlib import suppress 7 | from typing import IO, TYPE_CHECKING 8 | 9 | 10 | if TYPE_CHECKING: 11 | from collections.abc import Callable 12 | from pathlib import Path 13 | 14 | 15 | class BatchHandle: 16 | """A handle for a batch of records.""" 17 | 18 | def __init__( 19 | self, 20 | stream_name: str, 21 | batch_id: str, 22 | files: list[Path], 23 | file_opener: Callable[[Path], IO[str]], 24 | ) -> None: 25 | """Initialize the batch handle.""" 26 | self._stream_name = stream_name 27 | self._batch_id = batch_id 28 | self._files = files 29 | self._record_count = 0 30 | assert self._files, "A batch must have at least one file." 31 | self._open_file_writer: IO[str] = file_opener(self._files[0]) 32 | 33 | # Marker for whether the batch has been finalized. 34 | self.finalized: bool = False 35 | 36 | @property 37 | def files(self) -> list[Path]: 38 | """Return the files.""" 39 | return self._files 40 | 41 | @property 42 | def batch_id(self) -> str: 43 | """Return the batch ID.""" 44 | return self._batch_id 45 | 46 | @property 47 | def stream_name(self) -> str: 48 | """Return the stream name.""" 49 | return self._stream_name 50 | 51 | @property 52 | def record_count(self) -> int: 53 | """Return the record count.""" 54 | return self._record_count 55 | 56 | def increment_record_count(self) -> None: 57 | """Increment the record count.""" 58 | self._record_count += 1 59 | 60 | @property 61 | def open_file_writer(self) -> IO[str] | None: 62 | """Return the open file writer, if any, or None.""" 63 | return self._open_file_writer 64 | 65 | def close_files(self) -> None: 66 | """Close the file writer.""" 67 | if self.open_file_writer is None: 68 | return 69 | 70 | with suppress(Exception): 71 | self.open_file_writer.close() 72 | 73 | def delete_files(self) -> None: 74 | """Delete the files. 75 | 76 | If any files are open, they will be closed first. 77 | If any files are missing, they will be ignored. 78 | """ 79 | self.close_files() 80 | for file in self.files: 81 | file.unlink(missing_ok=True) 82 | 83 | def __del__(self) -> None: 84 | """Upon deletion, close the file writer.""" 85 | self.close_files() 86 | -------------------------------------------------------------------------------- /tests/unit_tests/test_processors.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
2 | from __future__ import annotations 3 | 4 | from pathlib import Path 5 | from typing import Optional 6 | import pytest_mock 7 | from airbyte.caches.snowflake import SnowflakeSqlProcessor, SnowflakeConfig 8 | from airbyte_protocol.models import ConfiguredAirbyteCatalog 9 | from airbyte.secrets.base import SecretString 10 | from airbyte.shared.catalog_providers import CatalogProvider 11 | 12 | 13 | def test_snowflake_cache_config_data_retention_time_in_days( 14 | mocker: pytest_mock.MockFixture, 15 | ): 16 | expected_cmd = """ 17 | CREATE TABLE airbyte_raw."table_name" ( 18 | col_name type 19 | ) 20 | DATA_RETENTION_TIME_IN_DAYS = 1 21 | """ 22 | 23 | def _execute_sql(cmd): 24 | global actual_cmd 25 | actual_cmd = cmd 26 | 27 | mocker.patch.object(SnowflakeSqlProcessor, "_execute_sql", side_effect=_execute_sql) 28 | config = _build_mocked_snowflake_processor(mocker, data_retention_time_in_days=1) 29 | config._create_table(table_name="table_name", column_definition_str="col_name type") 30 | 31 | assert actual_cmd == expected_cmd 32 | 33 | 34 | def test_snowflake_cache_config_no_data_retention_time_in_days( 35 | mocker: pytest_mock.MockFixture, 36 | ): 37 | expected_cmd = """ 38 | CREATE TABLE airbyte_raw."table_name" ( 39 | col_name type 40 | ) 41 | \n """ 42 | 43 | def _execute_sql(cmd): 44 | global actual_cmd 45 | actual_cmd = cmd 46 | 47 | mocker.patch.object(SnowflakeSqlProcessor, "_execute_sql", side_effect=_execute_sql) 48 | config = _build_mocked_snowflake_processor(mocker) 49 | config._create_table(table_name="table_name", column_definition_str="col_name type") 50 | 51 | assert actual_cmd == expected_cmd 52 | 53 | 54 | def _build_mocked_snowflake_processor( 55 | mocker: pytest_mock.MockFixture, data_retention_time_in_days: Optional[int] = None 56 | ): 57 | sql_config = SnowflakeConfig( 58 | account="foo", 59 | username="foo", 60 | password=SecretString("foo"), 61 | warehouse="foo", 62 | database="foo", 63 | role="foo", 64 | data_retention_time_in_days=data_retention_time_in_days, 65 | ) 66 | 67 | mocker.patch.object( 68 | SnowflakeSqlProcessor, "_ensure_schema_exists", return_value=None 69 | ) 70 | return SnowflakeSqlProcessor( 71 | catalog_provider=CatalogProvider(ConfiguredAirbyteCatalog(streams=[])), 72 | temp_dir=Path(), 73 | temp_file_cleanup=True, 74 | sql_config=sql_config, 75 | ) 76 | -------------------------------------------------------------------------------- /airbyte/_util/temp_files.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Airbyte, Inc., all rights reserved. 
2 | """Internal helper functions for working with temporary files.""" 3 | 4 | from __future__ import annotations 5 | 6 | import json 7 | import stat 8 | import tempfile 9 | import time 10 | import warnings 11 | from contextlib import contextmanager, suppress 12 | from pathlib import Path 13 | from typing import TYPE_CHECKING, Any 14 | 15 | from airbyte.constants import TEMP_DIR_OVERRIDE 16 | 17 | 18 | if TYPE_CHECKING: 19 | from collections.abc import Generator 20 | 21 | 22 | @contextmanager 23 | def as_temp_files(files_contents: list[dict | str]) -> Generator[list[str], Any, None]: 24 | """Write the given contents to temporary files and yield the file paths as strings.""" 25 | temp_files: list[Any] = [] 26 | try: 27 | for content in files_contents: 28 | use_json = isinstance(content, dict) 29 | temp_file = tempfile.NamedTemporaryFile( # noqa: SIM115 # Avoiding context manager 30 | mode="w+t", 31 | delete=False, 32 | encoding="utf-8", 33 | dir=TEMP_DIR_OVERRIDE or None, 34 | suffix=".json" if use_json else ".txt", 35 | ) 36 | temp_file.write( 37 | json.dumps(content) if isinstance(content, dict) else content, 38 | ) 39 | temp_file.flush() 40 | # Grant "read" permission to all users 41 | Path(temp_file.name).chmod(stat.S_IRUSR | stat.S_IRGRP | stat.S_IROTH) 42 | 43 | # Don't close the file yet (breaks Windows) 44 | # temp_file.close() 45 | temp_files.append(temp_file) 46 | yield [file.name for file in temp_files] 47 | finally: 48 | for temp_file in temp_files: 49 | max_attempts = 5 50 | for attempt in range(max_attempts): 51 | try: 52 | with suppress(Exception): 53 | temp_file.close() 54 | 55 | Path(temp_file.name).unlink(missing_ok=True) 56 | 57 | break # File was deleted successfully. Move on. 58 | except Exception as ex: 59 | if attempt < max_attempts - 1: 60 | time.sleep(1) # File might not be closed yet. Wait and try again. 61 | else: 62 | # Something went wrong and the file could not be deleted. Warn the user. 63 | warnings.warn( 64 | f"Failed to remove temporary file: '{temp_file.name}'. {ex}", 65 | stacklevel=2, 66 | ) 67 | -------------------------------------------------------------------------------- /airbyte/_processors/sql/postgres.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Airbyte, Inc., all rights reserved. 2 | """A Postgres implementation of the cache.""" 3 | 4 | from __future__ import annotations 5 | 6 | import functools 7 | 8 | from overrides import overrides 9 | 10 | from airbyte._util.name_normalizers import LowerCaseNormalizer 11 | from airbyte._writers.jsonl import JsonlWriter 12 | from airbyte.secrets.base import SecretString 13 | from airbyte.shared.sql_processor import SqlConfig, SqlProcessorBase 14 | 15 | 16 | class PostgresConfig(SqlConfig): 17 | """Configuration for the Postgres cache. 18 | 19 | Also inherits config from the JsonlWriter, which is responsible for writing files to disk. 20 | """ 21 | 22 | host: str 23 | port: int 24 | database: str 25 | username: str 26 | password: SecretString | str 27 | 28 | @overrides 29 | def get_sql_alchemy_url(self) -> SecretString: 30 | """Return the SQLAlchemy URL to use.""" 31 | return SecretString( 32 | f"postgresql://{self.username}:{self.password}@{self.host}:{self.port}/{self.database}" 33 | ) 34 | 35 | @overrides 36 | def get_database_name(self) -> str: 37 | """Return the name of the database.""" 38 | return self.database 39 | 40 | 41 | class PostgresNormalizer(LowerCaseNormalizer): 42 | """A name normalizer for Postgres. 
43 | 
44 |     Postgres has specific field name length limits:
45 |     - Table names are limited to 63 characters.
46 |     - Column names are limited to 63 characters.
47 | 
48 |     The Postgres normalizer inherits from the default LowerCaseNormalizer class, and
49 |     additionally truncates column and table names to 63 characters.
50 |     """
51 | 
52 |     @staticmethod
53 |     @functools.cache
54 |     def normalize(name: str) -> str:
55 |         """Normalize the name, truncating to 63 characters."""
56 |         return LowerCaseNormalizer.normalize(name)[:63]
57 | 
58 | 
59 | class PostgresSqlProcessor(SqlProcessorBase):
60 |     """A Postgres implementation of the cache.
61 | 
62 |     Jsonl is used for local file storage before bulk loading.
63 |     Unlike the Snowflake implementation, we can't use the COPY command to load data,
64 |     so we insert as values instead.
65 | 
66 |     TODO: Add optimized bulk load path for Postgres. Could use an alternate file writer
67 |     or another import method. (Relatively low priority, since for now it works fine as-is.)
68 |     """
69 | 
70 |     supports_merge_insert = False
71 |     file_writer_class = JsonlWriter
72 |     sql_config: PostgresConfig
73 | 
74 |     normalizer = PostgresNormalizer  # pyrefly: ignore[bad-override]
75 |     """A Postgres-specific name normalizer for table and column name normalization."""
76 | 
--------------------------------------------------------------------------------
/tests/unit_tests/test_pip_helpers.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
2 | from __future__ import annotations
3 | 
4 | import pytest
5 | from airbyte._util import github_pip_url, connector_pip_url
6 | 
7 | 
8 | @pytest.mark.parametrize(
9 |     "owner, repo, branch_or_ref, package_name, subdirectory, expected",
10 |     [
11 |         (
12 |             "airbytehq",
13 |             "airbyte",
14 |             None,
15 |             None,
16 |             None,
17 |             "git+https://github.com/airbytehq/airbyte.git",
18 |         ),
19 |         (
20 |             "airbytehq",
21 |             "airbyte",
22 |             "master",
23 |             None,
24 |             None,
25 |             "git+https://github.com/airbytehq/airbyte.git@master",
26 |         ),
27 |         (
28 |             "airbytehq",
29 |             "airbyte",
30 |             "my-branch",
31 |             None,
32 |             None,
33 |             "git+https://github.com/airbytehq/airbyte.git@my-branch",
34 |         ),
35 |         (
36 |             "airbytehq",
37 |             "airbyte",
38 |             "my-branch",
39 |             "airbyte-lib",
40 |             None,
41 |             "git+https://github.com/airbytehq/airbyte.git@my-branch#egg=airbyte-lib",
42 |         ),
43 |         (
44 |             "airbytehq",
45 |             "airbyte",
46 |             "my-branch",
47 |             "airbyte-lib",
48 |             "airbyte-lib",
49 |             "git+https://github.com/airbytehq/airbyte.git@my-branch#egg=airbyte-lib&subdirectory=airbyte-lib",
50 |         ),
51 |     ],
52 | )
53 | def test_github_pip_url(
54 |     owner, repo, branch_or_ref, package_name, subdirectory, expected
55 | ):
56 |     result = github_pip_url(
57 |         owner,
58 |         repo,
59 |         branch_or_ref=branch_or_ref,
60 |         package_name=package_name,
61 |         subdirectory=subdirectory,
62 |     )
63 |     assert result == expected
64 | 
65 | 
66 | @pytest.mark.parametrize(
67 |     "connector_name, branch, owner, expected",
68 |     [
69 |         (
70 |             "source-coin-api",
71 |             "my-branch",
72 |             None,
73 |             "git+https://github.com/airbytehq/airbyte.git@my-branch#egg=source-coin-api&subdirectory=airbyte-integrations/connectors/source-coin-api",
74 |         ),
75 |         (
76 |             "source-coin-api",
77 |             "my-branch",
78 |             "my-fork",
79 |             "git+https://github.com/my-fork/airbyte.git@my-branch#egg=source-coin-api&subdirectory=airbyte-integrations/connectors/source-coin-api",
80 |         ),
81 |     ],
82 | )
83 | def test_connector_pip_url(connector_name, branch, owner, expected):
84 |     result = connector_pip_url(connector_name, branch, owner=owner)
85 |     assert result == expected
86 | 
--------------------------------------------------------------------------------
/.github/workflows/python_lint.yml:
--------------------------------------------------------------------------------
1 | name: Run Linters
2 | 
3 | on:
4 |   push:
5 |     branches:
6 |       - main
7 |   pull_request: {}
8 | 
9 | env:
10 |   AIRBYTE_ANALYTICS_ID: ${{ vars.AIRBYTE_ANALYTICS_ID }}
11 | 
12 | permissions:
13 |   contents: read
14 | jobs:
15 |   ruff-lint-check:
16 |     name: Ruff Lint Check
17 |     runs-on: ubuntu-latest
18 |     steps:
19 |       # Common steps:
20 |       - name: Checkout code
21 |         uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
22 |       - name: Set up Python
23 |         uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0
24 |         with:
25 |           python-version: '3.10'
26 |       - name: Set up Poetry
27 |         uses: Gr1N/setup-poetry@48b0f77c8c1b1b19cb962f0f00dff7b4be8f81ec # v9
28 |         with:
29 |           poetry-version: "2.2.0"
30 |       - name: Install dependencies
31 |         run: poetry install
32 | 
33 |       # Job-specific step(s):
34 |       - name: Lint code
35 |         run: poetry run ruff check .
36 | 
37 |   ruff-format-check:
38 |     name: Ruff Format Check
39 |     runs-on: ubuntu-latest
40 |     steps:
41 |       # Common steps:
42 |       - name: Checkout code
43 |         uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
44 |       - name: Set up Python
45 |         uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0
46 |         with:
47 |           python-version: '3.10'
48 |       - name: Set up Poetry
49 |         uses: Gr1N/setup-poetry@48b0f77c8c1b1b19cb962f0f00dff7b4be8f81ec # v9
50 |         with:
51 |           poetry-version: "2.2.0"
52 |       - name: Install dependencies
53 |         run: poetry install
54 | 
55 |       # Job-specific step(s):
56 |       - name: Check code format
57 |         run: poetry run ruff format --diff .
58 | 
59 |   python-type-checks:
60 |     name: Python Type Checks
61 |     runs-on: ubuntu-latest
62 |     steps:
63 |       # Common steps:
64 |       - name: Checkout code
65 |         uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
66 |       - name: Set up Python
67 |         uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0
68 |         with:
69 |           python-version: '3.10'
70 |       - name: Set up Poetry
71 |         uses: Gr1N/setup-poetry@48b0f77c8c1b1b19cb962f0f00dff7b4be8f81ec # v9
72 |         with:
73 |           poetry-version: "2.2.0"
74 |       - name: Install dependencies
75 |         run: poetry install
76 | 
77 |       # Job-specific step(s):
78 |       - name: Run Pyrefly Check
79 |         run: poetry run pyrefly check
80 | 
--------------------------------------------------------------------------------
/airbyte/caches/snowflake.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
2 | """A Snowflake implementation of the PyAirbyte cache.
3 | 
4 | ## Usage Example
5 | 
6 | # Password connection:
7 | 
8 | ```python
9 | import airbyte as ab
10 | from airbyte.caches import SnowflakeCache
11 | 
12 | cache = SnowflakeCache(
13 |     account="myaccount",
14 |     username="myusername",
15 |     password=ab.get_secret("SNOWFLAKE_PASSWORD"),  # optional
16 |     warehouse="mywarehouse",
17 |     database="mydatabase",
18 |     role="myrole",
19 |     schema_name="myschema",
20 | )
21 | ```
22 | 
23 | # Private key connection:
24 | 
25 | ```python
26 | import airbyte as ab
27 | from airbyte.caches import SnowflakeCache
28 | 
29 | cache = SnowflakeCache(
30 |     account="myaccount",
31 |     username="myusername",
32 |     private_key=ab.get_secret("SNOWFLAKE_PRIVATE_KEY"),
33 |     private_key_passphrase=ab.get_secret("SNOWFLAKE_PRIVATE_KEY_PASSPHRASE"),  # optional
34 |     warehouse="mywarehouse",
35 |     database="mydatabase",
36 |     role="myrole",
37 |     schema_name="myschema",
38 | )
39 | ```
40 | 
41 | # Private key path connection:
42 | 
43 | ```python
44 | import airbyte as ab
45 | from airbyte.caches import SnowflakeCache
46 | 
47 | cache = SnowflakeCache(
48 |     account="myaccount",
49 |     username="myusername",
50 |     private_key_path="path/to/my/private_key.pem",
51 |     private_key_passphrase=ab.get_secret("SNOWFLAKE_PRIVATE_KEY_PASSPHRASE"),  # optional
52 |     warehouse="mywarehouse",
53 |     database="mydatabase",
54 |     role="myrole",
55 |     schema_name="myschema",
56 | )
57 | ```
58 | """
59 | 
60 | from __future__ import annotations
61 | 
62 | from typing import ClassVar
63 | 
64 | from airbyte_api.models import DestinationSnowflake
65 | 
66 | from airbyte._processors.sql.snowflake import SnowflakeConfig, SnowflakeSqlProcessor
67 | from airbyte.caches.base import CacheBase
68 | from airbyte.destinations._translate_cache_to_dest import (
69 |     snowflake_cache_to_destination_configuration,
70 | )
71 | from airbyte.shared.sql_processor import RecordDedupeMode, SqlProcessorBase
72 | 
73 | 
74 | class SnowflakeCache(SnowflakeConfig, CacheBase):
75 |     """Configuration for the Snowflake cache."""
76 | 
77 |     dedupe_mode: RecordDedupeMode = RecordDedupeMode.APPEND
78 | 
79 |     _sql_processor_class: ClassVar[type[SqlProcessorBase]] = SnowflakeSqlProcessor
80 | 
81 |     paired_destination_name: ClassVar[str | None] = "destination-snowflake"
82 |     paired_destination_config_class: ClassVar[type | None] = DestinationSnowflake
83 | 
84 |     @property
85 |     def paired_destination_config(self) -> DestinationSnowflake:
86 |         """Return a dictionary of destination configuration values."""
87 |         return snowflake_cache_to_destination_configuration(cache=self)
88 | 
89 | 
90 | # Expose the Cache class and also the Config class.
91 | __all__ = [
92 |     "SnowflakeCache",
93 |     "SnowflakeConfig",
94 | ]
95 | 
--------------------------------------------------------------------------------
/tests/integration_tests/fixtures/registry.json:
--------------------------------------------------------------------------------
1 | {
2 |   "sources": [
3 |     {
4 |       "sourceDefinitionId": "9f32dab3-77cb-45a1-9d33-347aa5fbe363",
5 |       "name": "Test Source",
6 |       "dockerRepository": "airbyte/source-test",
7 |       "dockerImageTag": "0.0.1",
8 |       "documentationUrl": "https://docs.airbyte.com/integrations/sources/test",
9 |       "icon": "test.svg",
10 |       "iconUrl": "https://connectors.airbyte.com/files/metadata/airbyte/source-test/latest/icon.svg",
11 |       "sourceType": "api",
12 |       "remoteRegistries": {
13 |         "pypi": {
14 |           "packageName": "airbyte-source-test",
15 |           "enabled": true
16 |         }
17 |       },
18 |       "spec": {
19 |         "documentationUrl": "https://docs.airbyte.com/integrations/sources/test",
20 |         "connectionSpecification": {
21 |           "$schema": "http://json-schema.org/draft-07/schema#",
22 |           "type": "object",
23 |           "properties": {
24 |             "apiKey": {
25 |               "type": "string",
26 |               "title": "API Key",
27 |               "description": "The API key for the service"
28 |             }
29 |           }
30 |         }
31 |       },
32 |       "tombstone": false,
33 |       "public": true,
34 |       "custom": false,
35 |       "releaseStage": "alpha",
36 |       "supportLevel": "community",
37 |       "ab_internal": {
38 |         "sl": 100,
39 |         "ql": 200
40 |       },
41 |       "tags": ["language:python"],
42 |       "githubIssueLabel": "source-test",
43 |       "license": "MIT"
44 |     },
45 |     {
46 |       "sourceDefinitionId": "9f32dab3-77cb-45a1-9d33-347aa5fbe333",
47 |       "name": "Docker-only source",
48 |       "dockerRepository": "airbyte/source-docker-only",
49 |       "dockerImageTag": "0.0.1",
50 |       "documentationUrl": "https://docs.airbyte.com/integrations/sources/test",
51 |       "icon": "test.svg",
52 |       "iconUrl": "https://connectors.airbyte.com/files/metadata/airbyte/source-test/latest/icon.svg",
53 |       "sourceType": "api",
54 |       "remoteRegistries": {
55 |       },
56 |       "spec": {
57 |         "documentationUrl": "https://docs.airbyte.com/integrations/sources/test",
58 |         "connectionSpecification": {
59 |           "$schema": "http://json-schema.org/draft-07/schema#",
60 |           "type": "object",
61 |           "properties": {
62 |             "apiKey": {
63 |               "type": "string",
64 |               "title": "API Key",
65 |               "description": "The API key for the service"
66 |             }
67 |           }
68 |         }
69 |       },
70 |       "tombstone": false,
71 |       "public": true,
72 |       "custom": false,
73 |       "releaseStage": "alpha",
74 |       "supportLevel": "community",
75 |       "ab_internal": {
76 |         "sl": 100,
77 |         "ql": 200
78 |       },
79 |       "tags": [
80 |         "language:java"
81 |       ],
82 |       "githubIssueLabel": "source-source-docker-only",
83 |       "license": "MIT"
84 |     }
85 |   ],
86 |   "destinations": []
87 | }
88 | 
--------------------------------------------------------------------------------
/airbyte/strategies.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
2 | 
3 | """Read and write strategies for PyAirbyte."""
4 | 
5 | from __future__ import annotations
6 | 
7 | from enum import Enum
8 | 
9 | from airbyte_protocol.models import DestinationSyncMode
10 | 
11 | 
12 | _MERGE = "merge"
13 | _REPLACE = "replace"
14 | _APPEND = "append"
15 | _AUTO = "auto"
16 | 
17 | 
18 | class WriteStrategy(str, Enum):
19 |     """Write strategies for PyAirbyte.
20 | 
21 |     Write strategies set a preferred method for writing data to a destination. The actual
22 |     method used may differ based on the capabilities of the destination.
23 | 24 | If a destination does not support the preferred method, it will fall back to the next best 25 | method. 26 | """ 27 | 28 | MERGE = _MERGE 29 | """Merge new records with existing records. 30 | 31 | This requires a primary key to be set on the stream. 32 | If no primary key is set, this will raise an exception. 33 | 34 | To apply this strategy in cases where some destination streams don't have a primary key, 35 | please use the `auto` strategy instead. 36 | """ 37 | 38 | APPEND = _APPEND 39 | """Append new records to existing records.""" 40 | 41 | REPLACE = _REPLACE 42 | """Replace existing records with new records.""" 43 | 44 | AUTO = _AUTO 45 | """Automatically determine the best strategy to use. 46 | 47 | This will use the following logic: 48 | - If there's a primary key, use merge. 49 | - Else, if there's an incremental key, use append. 50 | - Else, use full replace (table swap). 51 | """ 52 | 53 | 54 | class WriteMethod(str, Enum): 55 | """Write methods for PyAirbyte. 56 | 57 | Unlike write strategies, write methods are expected to be fully resolved and do not require any 58 | additional logic to determine the best method to use. 59 | 60 | If a destination does not support the declared method, it will raise an exception. 61 | """ 62 | 63 | MERGE = _MERGE 64 | """Merge new records with existing records. 65 | 66 | This requires a primary key to be set on the stream. 67 | If no primary key is set, this will raise an exception. 68 | 69 | To apply this strategy in cases where some destination streams don't have a primary key, 70 | please use the `auto` strategy instead. 71 | """ 72 | 73 | APPEND = _APPEND 74 | """Append new records to existing records.""" 75 | 76 | REPLACE = _REPLACE 77 | """Replace existing records with new records.""" 78 | 79 | @property 80 | def destination_sync_mode(self) -> DestinationSyncMode: 81 | """Convert the write method to a destination sync mode.""" 82 | if self == WriteMethod.MERGE: 83 | return DestinationSyncMode.append_dedup 84 | 85 | if self == WriteMethod.APPEND: 86 | return DestinationSyncMode.append 87 | 88 | if self == WriteMethod.REPLACE: 89 | return DestinationSyncMode.overwrite 90 | 91 | msg = f"Unknown write method: {self}" # type: ignore [unreachable] 92 | raise ValueError(msg) 93 | -------------------------------------------------------------------------------- /airbyte/cloud/auth.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Airbyte, Inc., all rights reserved. 2 | """Authentication-related constants and utilities for the Airbyte Cloud.""" 3 | 4 | from airbyte import constants 5 | from airbyte.secrets import SecretString 6 | from airbyte.secrets.util import get_secret, try_get_secret 7 | 8 | 9 | def resolve_cloud_bearer_token( 10 | input_value: str | SecretString | None = None, 11 | /, 12 | ) -> SecretString | None: 13 | """Get the Airbyte Cloud bearer token from the environment. 14 | 15 | Unlike other resolve functions, this returns None if no bearer token is found, 16 | since bearer token authentication is optional (client credentials can be used instead). 17 | 18 | Args: 19 | input_value: Optional explicit bearer token value. If provided, it will be 20 | returned directly (wrapped in SecretString if needed). 21 | 22 | Returns: 23 | The bearer token as a SecretString, or None if not found. 
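
    Example, as an illustrative sketch (the exact env var name is defined by
    `constants.CLOUD_BEARER_TOKEN_ENV_VAR`):

    ```python
    token = resolve_cloud_bearer_token()
    if token is None:
        ...  # Fall back to client-credentials auth instead.
    ```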
24 | """ 25 | if input_value is not None: 26 | return SecretString(input_value) 27 | 28 | result = try_get_secret(constants.CLOUD_BEARER_TOKEN_ENV_VAR, default=None) 29 | if result: 30 | return SecretString(result) 31 | return None 32 | 33 | 34 | def resolve_cloud_client_secret( 35 | input_value: str | SecretString | None = None, 36 | /, 37 | ) -> SecretString: 38 | """Get the Airbyte Cloud client secret from the environment.""" 39 | return get_secret(constants.CLOUD_CLIENT_SECRET_ENV_VAR, default=input_value) 40 | 41 | 42 | def resolve_cloud_client_id( 43 | input_value: str | SecretString | None = None, 44 | /, 45 | ) -> SecretString: 46 | """Get the Airbyte Cloud client ID from the environment.""" 47 | return get_secret(constants.CLOUD_CLIENT_ID_ENV_VAR, default=input_value) 48 | 49 | 50 | def resolve_cloud_api_url( 51 | input_value: str | None = None, 52 | /, 53 | ) -> str: 54 | """Get the Airbyte Cloud API URL from the environment, or return the default.""" 55 | return str( 56 | try_get_secret(constants.CLOUD_API_ROOT_ENV_VAR, default=input_value) 57 | or constants.CLOUD_API_ROOT 58 | ) 59 | 60 | 61 | def resolve_cloud_workspace_id( 62 | input_value: str | None = None, 63 | /, 64 | ) -> str: 65 | """Get the Airbyte Cloud workspace ID from the environment, or return None if not set.""" 66 | return str(get_secret(constants.CLOUD_WORKSPACE_ID_ENV_VAR, default=input_value)) 67 | 68 | 69 | def resolve_cloud_config_api_url( 70 | input_value: str | None = None, 71 | /, 72 | ) -> str | None: 73 | """Get the Airbyte Cloud Config API URL from the environment, or return None if not set. 74 | 75 | The Config API is a separate internal API used for certain operations like 76 | connector builder projects and custom source definitions. 77 | 78 | Returns: 79 | The Config API URL if set via environment variable or input, None otherwise. 80 | """ 81 | result = try_get_secret(constants.CLOUD_CONFIG_API_ROOT_ENV_VAR, default=input_value) 82 | if result: 83 | return str(result) 84 | return None 85 | -------------------------------------------------------------------------------- /airbyte/caches/motherduck.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Airbyte, Inc., all rights reserved. 2 | """A MotherDuck implementation of the PyAirbyte cache, built on DuckDB. 
3 | 4 | ## Usage Example 5 | 6 | ```python 7 | import airbyte as ab 8 | from airbyte.caches import MotherDuckCache 9 | 10 | cache = MotherDuckCache( 11 | database="mydatabase", 12 | schema_name="myschema", 13 | api_key=ab.get_secret("MOTHERDUCK_API_KEY"), 14 | ) 15 | ``` """ 16 | 17 | from __future__ import annotations 18 | 19 | import warnings 20 | from typing import TYPE_CHECKING, ClassVar 21 | 22 | from airbyte_api.models import DestinationDuckdb 23 | from duckdb_engine import DuckDBEngineWarning 24 | from overrides import overrides 25 | from pydantic import Field 26 | 27 | from airbyte._processors.sql.duckdb import DuckDBConfig 28 | from airbyte._processors.sql.motherduck import MotherDuckSqlProcessor 29 | from airbyte.caches.duckdb import DuckDBCache 30 | from airbyte.destinations._translate_cache_to_dest import ( 31 | motherduck_cache_to_destination_configuration, 32 | ) 33 | from airbyte.secrets import SecretString 34 | 35 | 36 | if TYPE_CHECKING: 37 | from airbyte.shared.sql_processor import SqlProcessorBase 38 | 39 | 40 | class MotherDuckConfig(DuckDBConfig): 41 | """Configuration for the MotherDuck cache.""" 42 | 43 | database: str = Field() 44 | api_key: SecretString = Field() 45 | db_path: str = Field(default="md:") # pyrefly: ignore[bad-override] 46 | _paired_destination_name: str = "destination-motherduck" 47 | 48 | @overrides 49 | def get_sql_alchemy_url(self) -> SecretString: 50 | """Return the SQLAlchemy URL to use.""" 51 | # Suppress warnings from DuckDB about reflection on indices. 52 | # https://github.com/Mause/duckdb_engine/issues/905 53 | warnings.filterwarnings( 54 | "ignore", 55 | message="duckdb-engine doesn't yet support reflection on indices", 56 | category=DuckDBEngineWarning, 57 | ) 58 | 59 | return SecretString( 60 | f"duckdb:///md:{self.database}?motherduck_token={self.api_key}" 61 | # Not sure why this doesn't work. We have to override later in the flow. 62 | # f"&schema={self.schema_name}" 63 | ) 64 | 65 | @overrides 66 | def get_database_name(self) -> str: 67 | """Return the name of the database.""" 68 | return self.database 69 | 70 | 71 | class MotherDuckCache(MotherDuckConfig, DuckDBCache): 72 | """Cache that uses MotherDuck for external persistent storage.""" 73 | 74 | _sql_processor_class: ClassVar[type[SqlProcessorBase]] = MotherDuckSqlProcessor 75 | 76 | paired_destination_name: ClassVar[str | None] = "destination-motherduck" 77 | paired_destination_config_class: ClassVar[type | None] = DestinationDuckdb 78 | 79 | @property 80 | def paired_destination_config(self) -> DestinationDuckdb: 81 | """Return the paired destination configuration for this cache.""" 82 | return motherduck_cache_to_destination_configuration(cache=self) 83 | 84 | 85 | # Expose the Cache class and also the Config class. 86 | __all__ = [ 87 | "MotherDuckCache", 88 | "MotherDuckConfig", 89 | ] 90 | -------------------------------------------------------------------------------- /examples/run_integ_test_source.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Airbyte, Inc., all rights reserved. 2 | """This script will run any source that is registered in the Airbyte integration tests.
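An optional second argument selects specific streams as a comma-separated list (parsed from `sys.argv[2]` at the bottom of this script). For example (stream names are illustrative):

    poetry run python examples/run_integ_test_source.py source-github issues,pull_requests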
3 | 4 | 5 | Usage: 6 | poetry run python examples/run_integ_test_source.py source-coin-api 7 | poetry run python examples/run_integ_test_source.py source-github 8 | poetry run python examples/run_integ_test_source.py source-google-analytics-v4 9 | poetry run python examples/run_integ_test_source.py source-klaviyo 10 | poetry run python examples/run_integ_test_source.py source-shopify 11 | 12 | """ 13 | 14 | from __future__ import annotations 15 | 16 | import sys 17 | 18 | import airbyte as ab 19 | from airbyte.secrets.google_gsm import GoogleGSMSecretManager 20 | 21 | 22 | AIRBYTE_INTERNAL_GCP_PROJECT = "dataline-integration-testing" 23 | SECRET_NAME = "SECRET_DESTINATION-BIGQUERY_CREDENTIALS__CREDS" 24 | 25 | secret_mgr = GoogleGSMSecretManager( 26 | project=AIRBYTE_INTERNAL_GCP_PROJECT, 27 | credentials_json=ab.get_secret("GCP_GSM_CREDENTIALS"), 28 | ) 29 | 30 | 31 | def get_secret_name(connector_name: str) -> str: 32 | """Get the secret name for the given connector. 33 | 34 | Some names are hard-coded, if the naming convention is not followed. 35 | """ 36 | if connector_name.lower() == "source-google-analytics-v4": 37 | return "SECRET_SOURCE_GOOGLE_ANALYTICS_V4_CLOUD__CREDS" 38 | 39 | if connector_name.lower() == "source-shopify": 40 | return "SECRET_SOURCE-SHOPIFY__CREDS" 41 | 42 | return f"SECRET_{connector_name.upper()}_CREDS" 43 | 44 | 45 | def main( 46 | connector_name: str, 47 | secret_name: str, 48 | streams: list[str] | None, 49 | ) -> None: 50 | secret = secret_mgr.get_secret( 51 | secret_name=secret_name, 52 | ) 53 | assert secret is not None, f"Secret {secret_name} not found." 54 | config = secret.parse_json() 55 | source = ab.get_source( 56 | connector_name, 57 | config=config, 58 | install_if_missing=True, 59 | ) 60 | if streams: 61 | source.select_streams(streams) 62 | else: 63 | source.select_all_streams() 64 | cache = ab.new_local_cache() 65 | try: 66 | read_result = source.read(cache=cache) 67 | print( 68 | f"Read from `{connector_name}` was successful. ", 69 | f"Cache results were saved to: {cache.cache_dir}", 70 | f"Streams list: {', '.join(read_result.streams.keys())}", 71 | ) 72 | except Exception: 73 | print( 74 | f"Read from `{connector_name}` failed. ", 75 | f"Cache files are located at: {cache.cache_dir}", 76 | ) 77 | raise 78 | 79 | 80 | if __name__ == "__main__": 81 | # Get first arg from CLI 82 | connector_name = sys.argv[1] 83 | streams_csv = sys.argv[2] if len(sys.argv) > 2 else None # noqa: PLR2004 84 | streams = None 85 | if streams_csv: 86 | streams = streams_csv.split(",") 87 | # TODO: We can optionally take a second arg to override the default secret name. 88 | secret_name = get_secret_name(connector_name) 89 | main(connector_name, streams=streams, secret_name=secret_name) 90 | -------------------------------------------------------------------------------- /airbyte/_writers/base.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Airbyte, Inc., all rights reserved. 
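# Implementation sketch (illustrative only; `PrintWriter` is hypothetical): a concrete
# writer typically implements just `_write_airbyte_message_stream`, for example:
#
#     class PrintWriter(AirbyteWriterInterface):
#         def _write_airbyte_message_stream(
#             self, stdin, *, catalog_provider, write_strategy,
#             state_writer=None, progress_tracker,
#         ) -> None:
#             for message in stdin:  # lines from IO[str], or AirbyteMessage objects
#                 print(message)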
2 | """Write interfaces for PyAirbyte.""" 3 | 4 | from __future__ import annotations 5 | 6 | import abc 7 | from typing import IO, TYPE_CHECKING 8 | 9 | from airbyte._util.connector_info import WriterRuntimeInfo 10 | 11 | 12 | if TYPE_CHECKING: 13 | from airbyte._message_iterators import AirbyteMessageIterator 14 | from airbyte.progress import ProgressTracker 15 | from airbyte.shared.catalog_providers import CatalogProvider 16 | from airbyte.shared.state_writers import StateWriterBase 17 | from airbyte.strategies import WriteStrategy 18 | 19 | 20 | class AirbyteWriterInterface(abc.ABC): 21 | """An interface for writing Airbyte messages.""" 22 | 23 | @property 24 | def name(self) -> str: 25 | """Return the name of the writer. 26 | 27 | This is used for logging and state tracking. 28 | """ 29 | if hasattr(self, "_name"): 30 | return self._name 31 | 32 | return self.__class__.__name__ 33 | 34 | def _get_writer_runtime_info(self) -> WriterRuntimeInfo: 35 | """Get metadata for telemetry and performance logging.""" 36 | return WriterRuntimeInfo( 37 | type=type(self).__name__, 38 | config_hash=self.config_hash, 39 | ) 40 | 41 | @property 42 | def config_hash(self) -> str | None: 43 | """Return a hash of the writer configuration. 44 | 45 | This is used for logging and state tracking. 46 | """ 47 | return None 48 | 49 | def _write_airbyte_io_stream( 50 | self, 51 | stdin: IO[str], 52 | *, 53 | catalog_provider: CatalogProvider, 54 | write_strategy: WriteStrategy, 55 | state_writer: StateWriterBase | None = None, 56 | progress_tracker: ProgressTracker, 57 | ) -> None: 58 | """Read from the connector and write to the cache. 59 | 60 | This is a specialized version of `_write_airbyte_message_stream` that reads from an IO 61 | stream. Writers can override this method to provide custom behavior for reading from an IO 62 | stream, without paying the cost of converting the stream to an AirbyteMessageIterator. 63 | """ 64 | self._write_airbyte_message_stream( 65 | stdin, 66 | catalog_provider=catalog_provider, 67 | write_strategy=write_strategy, 68 | state_writer=state_writer, 69 | progress_tracker=progress_tracker, 70 | ) 71 | 72 | @abc.abstractmethod 73 | def _write_airbyte_message_stream( 74 | self, 75 | stdin: IO[str] | AirbyteMessageIterator, 76 | *, 77 | catalog_provider: CatalogProvider, 78 | write_strategy: WriteStrategy, 79 | state_writer: StateWriterBase | None = None, 80 | progress_tracker: ProgressTracker, 81 | ) -> None: 82 | """Write the incoming data. 83 | 84 | Note: Callers should use `_write_airbyte_io_stream` instead of this method if 85 | `stdin` is always an IO stream. This ensures that the most efficient method is used for 86 | writing the incoming stream. 87 | """ 88 | ... 89 | -------------------------------------------------------------------------------- /airbyte/datasets/_base.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
2 | from __future__ import annotations 3 | 4 | from abc import ABC, abstractmethod 5 | from typing import TYPE_CHECKING, Any, cast 6 | 7 | from pandas import DataFrame 8 | 9 | from airbyte_protocol.models.airbyte_protocol import ConfiguredAirbyteStream 10 | 11 | from airbyte._util.document_rendering import DocumentRenderer 12 | from airbyte.constants import DEFAULT_ARROW_MAX_CHUNK_SIZE 13 | 14 | 15 | if TYPE_CHECKING: 16 | from collections.abc import Iterable, Iterator 17 | 18 | from pyarrow.dataset import Dataset 19 | 20 | from airbyte_protocol.models import ConfiguredAirbyteStream 21 | 22 | from airbyte.documents import Document 23 | 24 | 25 | class DatasetBase(ABC): 26 | """Base implementation for all datasets.""" 27 | 28 | def __init__(self, stream_metadata: ConfiguredAirbyteStream) -> None: 29 | self._stream_metadata = stream_metadata 30 | 31 | @abstractmethod 32 | def __iter__(self) -> Iterator[dict[str, Any]]: 33 | """Return the iterator of records.""" 34 | raise NotImplementedError 35 | 36 | def to_pandas(self) -> DataFrame: 37 | """Return a pandas DataFrame representation of the dataset. 38 | 39 | The base implementation simply passes the record iterator to the pandas `DataFrame` constructor. 40 | """ 41 | # Technically, we return an iterator of Mapping objects. However, pandas 42 | # expects an iterator of dict objects. This cast is safe because we know 43 | # duck typing is correct for this use case. 44 | return DataFrame(cast("Iterator[dict[str, Any]]", self)) 45 | 46 | def to_arrow( 47 | self, 48 | *, 49 | max_chunk_size: int = DEFAULT_ARROW_MAX_CHUNK_SIZE, 50 | ) -> Dataset: 51 | """Return an Arrow Dataset representation of the dataset. 52 | 53 | This method should be implemented by subclasses. 54 | """ 55 | raise NotImplementedError("Not implemented in base class") 56 | 57 | def to_documents( 58 | self, 59 | title_property: str | None = None, 60 | content_properties: list[str] | None = None, 61 | metadata_properties: list[str] | None = None, 62 | *, 63 | render_metadata: bool = False, 64 | ) -> Iterable[Document]: 65 | """Return the iterator of documents. 66 | 67 | If metadata_properties is not set, all properties that are not content will be added to 68 | the metadata. 69 | 70 | If render_metadata is True, metadata will be rendered in the document, as well as the 71 | main content. Otherwise, metadata will be attached to the document but not rendered. 72 | """ 73 | renderer = DocumentRenderer( 74 | title_property=title_property, 75 | content_properties=content_properties, 76 | metadata_properties=metadata_properties, 77 | render_metadata=render_metadata, 78 | ) 79 | yield from renderer.render_documents(self) 80 | 81 | @property 82 | def column_names(self) -> list[str]: 83 | """Return the list of top-level column names.""" 84 | return list(self._stream_metadata.stream.json_schema["properties"].keys()) 85 | -------------------------------------------------------------------------------- /airbyte/shared/state_writers.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
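# Usage sketch (illustrative): all writers cache the latest state message per stream
# and delegate persistence to `_write_state`, so a round-trip looks like:
#
#     writer = StdOutStateWriter()
#     writer.write_state(state_message)  # caches by stream name, then prints JSON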
2 | 3 | """State writer implementation.""" 4 | 5 | from __future__ import annotations 6 | 7 | import abc 8 | from typing import TYPE_CHECKING, NoReturn, final 9 | 10 | from airbyte.shared.state_providers import StateProviderBase 11 | 12 | 13 | if TYPE_CHECKING: 14 | from airbyte_protocol.models import AirbyteStateMessage 15 | 16 | 17 | class StateWriterBase(StateProviderBase, abc.ABC): 18 | """A class to write state artifacts. 19 | 20 | This class is used to write state artifacts to a state store. It also serves as a provider 21 | of cached state artifacts. 22 | """ 23 | 24 | def __init__(self) -> None: 25 | """Initialize the state writer.""" 26 | self._latest_stream_state_messages: dict[str, AirbyteStateMessage] = {} 27 | """The latest state message seen for each stream.""" 28 | 29 | @property 30 | def _state_message_artifacts( 31 | self, 32 | ) -> list[AirbyteStateMessage]: 33 | """Return all state artifacts.""" 34 | return list(self._latest_stream_state_messages.values()) 35 | 36 | @_state_message_artifacts.setter 37 | def _state_message_artifacts(self, value: list[AirbyteStateMessage]) -> NoReturn: 38 | """Override as no-op / not-implemented.""" 39 | _ = value 40 | raise NotImplementedError("The `_state_message_artifacts` property cannot be set") 41 | 42 | @final 43 | def write_state( 44 | self, 45 | state_message: AirbyteStateMessage, 46 | ) -> None: 47 | """Save or 'write' a state artifact. 48 | 49 | This method is final and should not be overridden. Subclasses should instead overwrite 50 | the `_write_state` method. 51 | """ 52 | if state_message.stream: 53 | self._latest_stream_state_messages[state_message.stream.stream_descriptor.name] = ( 54 | state_message 55 | ) 56 | 57 | self._write_state(state_message) 58 | 59 | @abc.abstractmethod 60 | def _write_state( 61 | self, 62 | state_message: AirbyteStateMessage, 63 | ) -> None: 64 | """Save or 'write' a state artifact.""" 65 | ... 66 | 67 | 68 | class StdOutStateWriter(StateWriterBase): 69 | """A state writer that writes state artifacts to stdout. 70 | 71 | This is useful when we want PyAirbyte to behave like a "Destination" in the Airbyte protocol. 72 | """ 73 | 74 | def _write_state( 75 | self, 76 | state_message: AirbyteStateMessage, 77 | ) -> None: 78 | """Save or 'write' a state artifact.""" 79 | print(state_message.model_dump_json()) 80 | 81 | 82 | class NoOpStateWriter(StateWriterBase): 83 | """A state writer that does not write state artifacts. 84 | 85 | Even though state messages are not sent anywhere, they are still stored in memory and 86 | can be accessed using the `state_message_artifacts` property and other methods inherited 87 | from the `StateProviderBase` class 88 | """ 89 | 90 | def _write_state( 91 | self, 92 | state_message: AirbyteStateMessage, 93 | ) -> None: 94 | """Save or 'write' a state artifact.""" 95 | _ = state_message 96 | pass 97 | -------------------------------------------------------------------------------- /airbyte/_util/name_normalizers.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
2 | """Name normalizer classes.""" 3 | 4 | from __future__ import annotations 5 | 6 | import abc 7 | import functools 8 | import re 9 | from typing import TYPE_CHECKING 10 | 11 | from airbyte import exceptions as exc 12 | 13 | 14 | if TYPE_CHECKING: 15 | from collections.abc import Iterable 16 | 17 | 18 | class NameNormalizerBase(abc.ABC): 19 | """Abstract base class for name normalizers.""" 20 | 21 | @staticmethod 22 | @abc.abstractmethod 23 | def normalize(name: str) -> str: 24 | """Return the normalized name.""" 25 | ... 26 | 27 | @classmethod 28 | def normalize_set(cls, str_iter: Iterable[str]) -> set[str]: 29 | """Converts string iterable to a set of lower case strings.""" 30 | return {cls.normalize(s) for s in str_iter} 31 | 32 | @classmethod 33 | def normalize_list(cls, str_iter: Iterable[str]) -> list[str]: 34 | """Converts string iterable to a list of lower case strings.""" 35 | return [cls.normalize(s) for s in str_iter] 36 | 37 | @classmethod 38 | def check_matched(cls, name1: str, name2: str) -> bool: 39 | """Return True if the two names match after each is normalized.""" 40 | return cls.normalize(name1) == cls.normalize(name2) 41 | 42 | @classmethod 43 | def check_normalized(cls, name: str) -> bool: 44 | """Return True if the name is already normalized.""" 45 | return cls.normalize(name) == name 46 | 47 | 48 | class LowerCaseNormalizer(NameNormalizerBase): 49 | """A name normalizer that converts names to lower case.""" 50 | 51 | @staticmethod 52 | @functools.cache 53 | def normalize(name: str) -> str: # pyrefly: ignore[bad-override] # pyrefly decorator issue 54 | """Return the normalized name. 55 | 56 | - All non-alphanumeric characters are replaced with underscores. 57 | - Any names that start with a numeric ("1", "2", "123", "1b" etc.) are prefixed 58 | with and underscore ("_1", "_2", "_123", "_1b" etc.) 59 | 60 | Examples: 61 | - "Hello World!" -> "hello_world" 62 | - "Hello, World!" -> "hello__world" 63 | - "Hello - World" -> "hello___world" 64 | - "___Hello, World___" -> "___hello__world___" 65 | - "Average Sales (%)" -> "average_sales____" 66 | - "Average Sales (#)" -> "average_sales____" 67 | - "+1" -> "_1" 68 | - "-1" -> "_1" 69 | """ 70 | result = name 71 | 72 | # Replace all non-alphanumeric characters with underscores. 73 | result = re.sub(r"[^A-Za-z0-9]", "_", result.lower()) 74 | 75 | # Check if name starts with a number and prepend "_" if it does. 76 | if result and result[0].isdigit(): 77 | # Most databases do not allow identifiers to start with a number. 
78 | result = f"_{result}" 79 | 80 | if not result.replace("_", ""): 81 | raise exc.PyAirbyteNameNormalizationError( 82 | message="Name cannot be empty after normalization.", 83 | raw_name=name, 84 | normalization_result=result, 85 | ) 86 | 87 | return result 88 | 89 | 90 | __all__ = [ 91 | "NameNormalizerBase", 92 | "LowerCaseNormalizer", 93 | ] 94 | -------------------------------------------------------------------------------- /.github/workflows/pypi_publish.yml: -------------------------------------------------------------------------------- 1 | name: Build and/or Publish 2 | 3 | on: 4 | push: 5 | 6 | workflow_dispatch: 7 | inputs: 8 | git_ref: 9 | description: 'Git ref (SHA or branch) to checkout and build' 10 | required: false 11 | type: string 12 | version_override: 13 | description: 'Version to use (overrides dynamic versioning)' 14 | required: false 15 | type: string 16 | publish: 17 | description: 'Whether to publish to PyPI (true/false)' 18 | required: false 19 | type: string 20 | default: 'false' 21 | 22 | workflow_call: 23 | inputs: 24 | git_ref: 25 | description: 'Git ref (SHA or branch) to checkout and build' 26 | required: true 27 | type: string 28 | version_override: 29 | description: 'Version to use (overrides dynamic versioning)' 30 | required: false 31 | type: string 32 | publish: 33 | description: 'Whether to publish to PyPI' 34 | required: false 35 | type: boolean 36 | default: false 37 | 38 | env: 39 | AIRBYTE_ANALYTICS_ID: ${{ vars.AIRBYTE_ANALYTICS_ID }} 40 | 41 | jobs: 42 | build: 43 | runs-on: ubuntu-latest 44 | steps: 45 | - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 46 | with: 47 | ref: ${{ inputs.git_ref || github.ref }} 48 | fetch-depth: 0 49 | - name: Prepare version override 50 | id: version 51 | run: | 52 | echo "override=${{ inputs.version_override }}" >> $GITHUB_OUTPUT 53 | echo "has_override=${{ inputs.version_override != '' }}" >> $GITHUB_OUTPUT 54 | - name: Build package (with version override) 55 | if: steps.version.outputs.has_override == 'true' 56 | uses: hynek/build-and-inspect-python-package@efb823f52190ad02594531168b7a2d5790e66516 # v2.14.0 57 | env: 58 | POETRY_DYNAMIC_VERSIONING_BYPASS: ${{ steps.version.outputs.override }} 59 | - name: Build package (dynamic version) 60 | if: steps.version.outputs.has_override != 'true' 61 | uses: hynek/build-and-inspect-python-package@efb823f52190ad02594531168b7a2d5790e66516 # v2.14.0 62 | 63 | publish: 64 | name: Publish to PyPI 65 | runs-on: ubuntu-latest 66 | needs: [build] 67 | permissions: 68 | id-token: write # IMPORTANT: this permission is mandatory for trusted publishing 69 | contents: write # Needed to upload artifacts to the release 70 | environment: 71 | name: PyPi 72 | url: https://pypi.org/p/airbyte 73 | # Publish when: (1) triggered by a tag push, OR (2) called with publish=true (handles both boolean and string) 74 | if: startsWith(github.ref, 'refs/tags/') || inputs.publish == true || inputs.publish == 'true' 75 | steps: 76 | - uses: actions/download-artifact@018cc2cf5baa6db3ef3c5f8a56943fffe632ef53 # v6.0.0 77 | with: 78 | name: Packages 79 | path: dist 80 | - name: Upload wheel to release 81 | # Only upload to GitHub release when triggered by a tag 82 | if: startsWith(github.ref, 'refs/tags/') 83 | uses: svenstaro/upload-release-action@81c65b7cd4de9b2570615ce3aad67a41de5b1a13 # latest 84 | with: 85 | repo_token: ${{ secrets.GITHUB_TOKEN }} 86 | file: dist/*.whl 87 | tag: ${{ github.ref }} 88 | overwrite: true 89 | file_glob: true 90 | 91 | - name: Publish 92 | 
uses: pypa/gh-action-pypi-publish@ed0c53931b1dc9bd32cbe73a98c7f6766f8a527e # v1.13.0 93 | -------------------------------------------------------------------------------- /airbyte/destinations/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Airbyte, Inc., all rights reserved. 2 | """Destinations module. 3 | 4 | This module contains classes and methods for interacting with Airbyte destinations. You can use this 5 | module to create custom destinations, or to interact with existing destinations. 6 | 7 | ## Getting Started 8 | 9 | To get started with destinations, you can use the `get_destination()` method to create a destination 10 | object. This method takes a destination name and configuration, and returns a destination object 11 | that you can use to write data to the destination. 12 | 13 | ```python 14 | import airbyte as ab 15 | 16 | my_destination = ab.get_destination( 17 | "destination-foo", 18 | config={"api_key": "my_api_key"}, 19 | docker_image=True, 20 | ) 21 | ``` 22 | 23 | ## Writing Data to a Destination 24 | 25 | To write data to a destination, you can use the `Destination.write()` method. This method 26 | takes either an `airbyte.Source` or an `airbyte.ReadResult` object. 27 | 28 | ## Writing to a destination from a source 29 | 30 | To write directly from a source, simply pass the source object to the `Destination.write()` method: 31 | 32 | ```python 33 | my_source = get_source(...) 34 | my_destination = get_destination(...) 35 | my_destination.write(my_source) 36 | ``` 37 | 38 | ## Writing from a read result 39 | 40 | To write from a read result, you can use the following pattern. First, read data from the source, 41 | then write the data to the destination, using the `ReadResult` object as a buffer between the source 42 | and destination: 43 | 44 | ```python 45 | # First read data from the source: 46 | my_source = get_source(...) 47 | read_result = my_source.read(...) 48 | 49 | # Optionally, you can validate data before writing it: 50 | # ...misc validation code here... 51 | 52 | # Then write the data to the destination: 53 | my_destination.write(read_result) 54 | ``` 55 | 56 | ## Using Docker and Python-based Connectors 57 | 58 | By default, the `get_destination()` method will look for a Python-based connector. If you want to 59 | use a Docker-based connector, you can set the `docker_image` parameter to `True`: 60 | 61 | ```python 62 | my_destination = ab.get_destination( 63 | "destination-foo", 64 | config={"api_key": "my_api_key"}, 65 | docker_image=True, 66 | ) 67 | ``` 68 | 69 | **Note:** Unlike source connectors, most destination connectors are written in Java, and for this 70 | reason are only available as Docker-based connectors. If you need to load to a SQL database and your 71 | runtime does not support docker, you may want to use the `airbyte.caches` module to load data to 72 | a SQL cache. Caches are mostly identical to destinations in behavior, and are implemented internally 73 | to PyAirbyte so they can run anywhere that PyAirbyte can run.
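## Smoke-Testing a Pipeline

This module also exposes `get_noop_destination()`, which can be used to exercise a pipeline
end-to-end without persisting records anywhere. A hedged sketch (the no-argument call shown
is assumed):

```python
from airbyte.destinations import get_noop_destination

my_source = ab.get_source(...)
get_noop_destination().write(my_source)
```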
74 | """ 75 | 76 | from __future__ import annotations 77 | 78 | from typing import TYPE_CHECKING 79 | 80 | from airbyte.destinations.base import Destination 81 | from airbyte.destinations.util import ( 82 | get_destination, 83 | get_noop_destination, 84 | ) 85 | 86 | 87 | # Submodules imported here for documentation reasons: https://github.com/mitmproxy/pdoc/issues/757 88 | if TYPE_CHECKING: 89 | # ruff: noqa: TC004 # imports used for more than type checking 90 | from airbyte.destinations import util 91 | 92 | 93 | __all__ = [ 94 | # Modules 95 | "util", 96 | # Methods 97 | "get_destination", 98 | "get_noop_destination", 99 | # Classes 100 | "Destination", 101 | ] 102 | -------------------------------------------------------------------------------- /tests/integration_tests/test_config_change_callback.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Airbyte, Inc., all rights reserved. 2 | 3 | """Integration tests which test destination capabilities using the JSONL destination (docker-based).""" 4 | 5 | from __future__ import annotations 6 | 7 | from typing import Any 8 | from unittest.mock import patch 9 | 10 | import pytest 11 | from airbyte import Destination, Source, get_destination, get_source 12 | 13 | from airbyte_protocol.models import ( 14 | AirbyteControlConnectorConfigMessage, 15 | AirbyteControlMessage, 16 | AirbyteMessage, 17 | OrchestratorType, 18 | Type, 19 | ) 20 | 21 | 22 | def config_change_callback(config: dict[str, Any]) -> None: 23 | print(f"Updated config: {config}") 24 | 25 | 26 | @pytest.fixture 27 | def new_duckdb_destination() -> Destination: 28 | """Return a new JSONL destination.""" 29 | return get_destination( 30 | name="destination-duckdb", 31 | config={ 32 | # This path is relative to the container: 33 | "destination_path": "/local/temp/db.duckdb", 34 | }, 35 | config_change_callback=config_change_callback, 36 | ) 37 | 38 | 39 | @pytest.fixture 40 | def new_source_faker(*, use_docker: bool) -> Source: 41 | return get_source( 42 | "source-faker", 43 | config={ 44 | "count": 100, 45 | "seed": 1234, 46 | "parallelism": 16, 47 | }, 48 | install_if_missing=True, 49 | streams=["products"], 50 | config_change_callback=config_change_callback, 51 | docker_image=use_docker, 52 | ) 53 | 54 | 55 | def test_source_config_callback( 56 | new_duckdb_destination: Destination, 57 | new_source_faker: Source, 58 | ) -> None: 59 | with patch.object( 60 | new_source_faker, "config_change_callback" 61 | ) as mock_config_change_callback: 62 | updated_config = { 63 | "count": 1000, 64 | "seed": 1234, 65 | "parallelism": 16, 66 | } 67 | airbyte_source_control_message = AirbyteMessage( 68 | type=Type.CONTROL, 69 | control=AirbyteControlMessage( 70 | type=OrchestratorType.CONNECTOR_CONFIG, 71 | emitted_at=0, 72 | connectorConfig=AirbyteControlConnectorConfigMessage( 73 | config=updated_config 74 | ), 75 | ), 76 | ) 77 | 78 | new_source_faker._peek_airbyte_message(airbyte_source_control_message) 79 | mock_config_change_callback.assert_called_once_with(updated_config) 80 | 81 | 82 | def test_destination_config_callback( 83 | new_duckdb_destination: Destination, 84 | new_source_faker: Source, 85 | ) -> None: 86 | with patch.object( 87 | new_duckdb_destination, "config_change_callback" 88 | ) as mock_config_change_callback: 89 | updated_config = { 90 | "destination_path": "/local/temp/db.duckdb", 91 | } 92 | airbyte_destination_control_message = AirbyteMessage( 93 | type=Type.CONTROL, 94 | control=AirbyteControlMessage( 95 | 
type=OrchestratorType.CONNECTOR_CONFIG, 96 | emitted_at=0, 97 | connectorConfig=AirbyteControlConnectorConfigMessage( 98 | config=updated_config 99 | ), 100 | ), 101 | ) 102 | 103 | new_duckdb_destination._peek_airbyte_message( 104 | airbyte_destination_control_message 105 | ) 106 | mock_config_change_callback.assert_called_once_with(updated_config) 107 | -------------------------------------------------------------------------------- /tests/integration_tests/cloud/test_cloud_workspaces.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Airbyte, Inc., all rights reserved. 2 | """Cloud Workspace integration tests. 3 | 4 | These tests are designed to be run against a running instance of the Airbyte API. 5 | """ 6 | 7 | from __future__ import annotations 8 | 9 | import pytest 10 | 11 | import airbyte as ab 12 | from airbyte.cloud import CloudWorkspace 13 | from airbyte.cloud.connections import CloudConnection 14 | from airbyte.cloud.connectors import CloudSource 15 | 16 | 17 | def test_deploy_destination( 18 | cloud_workspace: CloudWorkspace, 19 | deployable_dummy_destination: ab.Destination, 20 | ) -> None: 21 | """Test deploying a destination to a workspace.""" 22 | cloud_destination = cloud_workspace.deploy_destination( 23 | name="test-destination-deleteme", 24 | destination=deployable_dummy_destination, 25 | random_name_suffix=True, 26 | ) 27 | cloud_workspace.permanently_delete_destination(cloud_destination) 28 | 29 | 30 | def test_deploy_source( 31 | cloud_workspace: CloudWorkspace, 32 | *, 33 | use_docker: bool, 34 | ) -> None: 35 | """Test deploying a source to a workspace.""" 36 | source = ab.get_source( 37 | "source-faker", 38 | config={"count": 100}, 39 | docker_image=use_docker, 40 | ) 41 | source.check() 42 | cloud_source: CloudSource = cloud_workspace.deploy_source( 43 | name="test-faker-source-deleteme", 44 | source=source, 45 | unique=False, 46 | ) 47 | cloud_workspace.permanently_delete_source(cloud_source) 48 | 49 | 50 | def test_deploy_dummy_source( 51 | deployable_dummy_source: ab.Source, 52 | cloud_workspace: CloudWorkspace, 53 | ) -> None: 54 | """Test deploying a source to a workspace.""" 55 | deployable_dummy_source.check() 56 | 57 | cloud_source: CloudSource = cloud_workspace.deploy_source( 58 | name="test-source-deleteme", 59 | source=deployable_dummy_source, 60 | unique=False, 61 | ) 62 | cloud_workspace.permanently_delete_source(cloud_source) 63 | 64 | 65 | @pytest.mark.skip( 66 | "Test is flaky. TODO: Fix upstream Cloud API issue with missing secrets. 
" 67 | "See: https://github.com/airbytehq/airbyte-internal-issues/issues/15502" 68 | ) 69 | def test_deploy_connection( 70 | cloud_workspace: CloudWorkspace, 71 | deployable_dummy_source: ab.Source, 72 | deployable_dummy_destination: ab.Destination, 73 | ) -> None: 74 | """Test deploying a source and cache to a workspace as a new connection.""" 75 | stream_names = deployable_dummy_source.get_selected_streams() 76 | cloud_source = cloud_workspace.deploy_source( 77 | name="test-source-deleteme", 78 | source=deployable_dummy_source, 79 | random_name_suffix=True, 80 | ) 81 | cloud_destination = cloud_workspace.deploy_destination( 82 | name="test-destination-deleteme", 83 | destination=deployable_dummy_destination, 84 | random_name_suffix=True, 85 | ) 86 | 87 | connection: CloudConnection = cloud_workspace.deploy_connection( 88 | connection_name="test-connection-deleteme", 89 | source=cloud_source, 90 | destination=cloud_destination, 91 | selected_streams=stream_names, 92 | table_prefix="zzz_deleteme_", 93 | ) 94 | assert set(connection.stream_names) == set(stream_names) 95 | assert connection.table_prefix == "zzz_deleteme_" 96 | cloud_workspace.permanently_delete_connection( 97 | connection=connection, 98 | cascade_delete_source=True, 99 | cascade_delete_destination=True, 100 | ) 101 | -------------------------------------------------------------------------------- /tests/integration_tests/cloud/test_cloud_sync.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Airbyte, Inc., all rights reserved. 2 | """Cloud Workspace integration tests. 3 | 4 | These tests are designed to be run against a running instance of the Airbyte API. 5 | """ 6 | 7 | from __future__ import annotations 8 | 9 | from dataclasses import asdict 10 | 11 | import airbyte as ab 12 | import pytest 13 | from airbyte._util import text_util 14 | from airbyte.cloud import CloudWorkspace 15 | from airbyte.cloud.sync_results import SyncResult 16 | from airbyte.destinations.base import Destination 17 | 18 | 19 | @pytest.fixture 20 | def pre_created_connection_id() -> str: 21 | return "80857d37-1f21-4500-a802-f5ac08d1a3dd" 22 | 23 | 24 | @pytest.mark.super_slow 25 | @pytest.mark.parametrize( 26 | "pre_created_connection_id", 27 | [ 28 | "80857d37-1f21-4500-a802-f5ac08d1a3dd", 29 | ], 30 | ) 31 | def test_run_connection( 32 | cloud_workspace: CloudWorkspace, 33 | pre_created_connection_id: str, 34 | ) -> None: 35 | """Test running a connection.""" 36 | sync_result: SyncResult = cloud_workspace.get_connection( 37 | pre_created_connection_id 38 | ).run_sync() 39 | assert sync_result.is_job_complete() 40 | assert sync_result.stream_names 41 | 42 | 43 | def test_get_previous_sync_result( 44 | cloud_workspace: CloudWorkspace, 45 | pre_created_connection_id: str, 46 | ) -> None: 47 | """Test running a connection.""" 48 | sync_result: SyncResult = cloud_workspace.get_connection( 49 | connection_id=pre_created_connection_id, 50 | ).get_previous_sync_logs()[0] 51 | assert sync_result.is_job_complete() 52 | assert sync_result.get_job_status() 53 | assert sync_result.stream_names 54 | 55 | 56 | @pytest.mark.super_slow 57 | @pytest.mark.skip( 58 | reason="This test is not yet working correctly. Config is invalid, missing property 'host'." 
59 | ) 60 | def test_deploy_and_run_connection( 61 | cloud_workspace: CloudWorkspace, 62 | new_deployable_destination, 63 | with_bigquery_credentials_env_vars, 64 | with_snowflake_password_env_var, 65 | *, 66 | use_docker: bool, 67 | ) -> None: 68 | """Test deploying a source and cache to a workspace as a new connection.""" 69 | source = ab.get_source( 70 | "source-faker", 71 | config={"count": 100}, 72 | docker_image=use_docker, 73 | ) 74 | cloud_source = cloud_workspace.deploy_source( 75 | name=f"test-source-{text_util.generate_random_suffix()}", 76 | source=source, 77 | ) 78 | if not isinstance(new_deployable_destination, (dict, Destination)): 79 | try: 80 | new_deployable_destination = asdict(new_deployable_destination) 81 | except Exception as ex: 82 | raise ValueError( 83 | "new_deployable_destination must be a dictionary or a dataclass. " 84 | f"Instead, it is a {type(new_deployable_destination)}." 85 | ) from ex 86 | 87 | cloud_destination = cloud_workspace.deploy_destination( 88 | name=f"test-destination-{text_util.generate_random_suffix()}", 89 | destination=new_deployable_destination, 90 | ) 91 | connection = cloud_workspace.deploy_connection( 92 | connection_name=f"test-connection-{text_util.generate_random_suffix()}", 93 | source=cloud_source, 94 | destination=cloud_destination, 95 | selected_streams=source.get_available_streams(), 96 | ) 97 | sync_result = connection.run_sync() 98 | 99 | cache = sync_result.get_sql_cache() 100 | assert list(cache.streams.keys()) 101 | assert not cache.streams["users"].to_pandas().empty 102 | 103 | cloud_workspace.permanently_delete_connection(connection) 104 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | # ELv2 2 | 3 | Elastic License 2.0 \(ELv2\) 4 | 5 | **Acceptance** By using the software, you agree to all of the terms and conditions below. 6 | 7 | **Copyright License** The licensor grants you a non-exclusive, royalty-free, worldwide, non-sublicensable, non-transferable license to use, copy, distribute, make available, and prepare derivative works of the software, in each case subject to the limitations and conditions below. 8 | 9 | **Limitations** You may not provide the software to third parties as a hosted or managed service, where the service provides users with access to any substantial set of the features or functionality of the software. 10 | 11 | You may not move, change, disable, or circumvent the license key functionality in the software, and you may not remove or obscure any functionality in the software that is protected by the license key. 12 | 13 | You may not alter, remove, or obscure any licensing, copyright, or other notices of the licensor in the software. Any use of the licensor’s trademarks is subject to applicable law. 14 | 15 | **Patents** The licensor grants you a license, under any patent claims the licensor can license, or becomes able to license, to make, have made, use, sell, offer for sale, import and have imported the software, in each case subject to the limitations and conditions in this license. This license does not cover any patent claims that you cause to be infringed by modifications or additions to the software. If you or your company make any written claim that the software infringes or contributes to infringement of any patent, your patent license for the software granted under these terms ends immediately.
If your company makes such a claim, your patent license ends immediately for work on behalf of your company. 16 | 17 | **Notices** You must ensure that anyone who gets a copy of any part of the software from you also gets a copy of these terms. 18 | 19 | If you modify the software, you must include in any modified copies of the software prominent notices stating that you have modified the software. 20 | 21 | **No Other Rights** These terms do not imply any licenses other than those expressly granted in these terms. 22 | 23 | **Termination** If you use the software in violation of these terms, such use is not licensed, and your licenses will automatically terminate. If the licensor provides you with a notice of your violation, and you cease all violation of this license no later than 30 days after you receive that notice, your licenses will be reinstated retroactively. However, if you violate these terms after such reinstatement, any additional violation of these terms will cause your licenses to terminate automatically and permanently. 24 | 25 | **No Liability** As far as the law allows, the software comes as is, without any warranty or condition, and the licensor will not be liable to you for any damages arising out of these terms or the use or nature of the software, under any kind of legal claim. 26 | 27 | **Definitions** The _licensor_ is the entity offering these terms, and the _software_ is the software the licensor makes available under these terms, including any portion of it. 28 | 29 | _you_ refers to the individual or entity agreeing to these terms. 30 | 31 | _your company_ is any legal entity, sole proprietorship, or other kind of organization that you work for, plus all organizations that have control over, are under the control of, or are under common control with that organization. _control_ means ownership of substantially all the assets of an entity, or the power to direct its management and policies by vote, contract, or otherwise. Control can be direct or indirect. 32 | 33 | _your licenses_ are all the licenses granted to you for the software under these terms. 34 | 35 | _use_ means anything you do with the software requiring one of your licenses. 36 | 37 | _trademark_ means trademarks, service marks, and similar rights. 38 | 39 | -------------------------------------------------------------------------------- /airbyte/_util/document_rendering.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Airbyte, Inc., all rights reserved. 2 | """Methods for converting Airbyte records into documents.""" 3 | 4 | from __future__ import annotations 5 | 6 | from typing import TYPE_CHECKING, Any 7 | 8 | import yaml 9 | from pydantic import BaseModel 10 | 11 | from airbyte.documents import Document 12 | 13 | 14 | if TYPE_CHECKING: 15 | from collections.abc import Iterable 16 | 17 | 18 | def _to_title_case(name: str, /) -> str: 19 | """Convert a string to title case. 20 | 21 | Unlike Python's built-in `str.title` method, this function doesn't lowercase the rest of the 22 | string. This is useful for converting "snake_case" to "Title Case" without negatively affecting 23 | strings that are already in title case or camel case. 
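Examples (illustrative, derived from the implementation below):

    _to_title_case("snake_case") -> "Snake Case"
    _to_title_case("alreadyCamel") -> "AlreadyCamel"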
24 | """ 25 | return " ".join(word[0].upper() + word[1:] for word in name.split("_")) 26 | 27 | 28 | class CustomRenderingInstructions(BaseModel): 29 | """Instructions for rendering a stream's records as documents.""" 30 | 31 | title_property: str | None = None 32 | content_properties: list[str] 33 | frontmatter_properties: list[str] 34 | metadata_properties: list[str] 35 | 36 | 37 | class DocumentRenderer(BaseModel): 38 | """Instructions for rendering a stream's records as documents.""" 39 | 40 | title_property: str | None = None 41 | content_properties: list[str] | None = None 42 | metadata_properties: list[str] | None = None 43 | render_metadata: bool = False 44 | 45 | # TODO: Add primary key and cursor key support: 46 | # https://github.com/airbytehq/pyairbyte/issues/319 47 | # primary_key_properties: list[str] 48 | # cursor_property: str | None 49 | 50 | def render_document(self, record: dict[str, Any]) -> Document: 51 | """Render a record as a document. 52 | 53 | The document will be rendered as a markdown document, with content, frontmatter, and an 54 | optional title. If there are multiple properties to render as content, they will be rendered 55 | beneath H2 section headers. If there is only one property to render as content, it will be 56 | rendered without a section header. If a title property is specified, it will be rendered as 57 | an H1 header at the top of the document. 58 | 59 | Returns: 60 | A tuple of (content: str, metadata: dict). 61 | """ 62 | content = "" 63 | if not self.metadata_properties: 64 | self.metadata_properties = [ 65 | key 66 | for key in record 67 | if key not in (self.content_properties or []) and key != self.title_property 68 | ] 69 | if self.title_property: 70 | content += f"# {record[self.title_property]}\n\n" 71 | if self.render_metadata or not self.content_properties: 72 | content += "```yaml\n" 73 | content += yaml.dump({key: record[key] for key in self.metadata_properties}) 74 | content += "```\n" 75 | 76 | if not self.content_properties: 77 | pass 78 | elif len(self.content_properties) == 1: 79 | # Only one property to render as content; no need for section headers. 80 | content += str(record[self.content_properties[0]]) 81 | else: 82 | # Multiple properties to render as content; use H2 section headers. 83 | content += "\n".join( 84 | f"## {_to_title_case(key)}\n\n{record[key]}\n\n" for key in self.content_properties 85 | ) 86 | 87 | return Document( 88 | # id=doc_id, # TODD: Add support for primary key and doc ID generation. 89 | content=content, 90 | metadata={key: record[key] for key in self.metadata_properties}, 91 | ) 92 | 93 | def render_documents(self, records: Iterable[dict[str, Any]]) -> Iterable[Document]: 94 | """Render an iterable of records as documents.""" 95 | yield from (self.render_document(record=record) for record in records) 96 | -------------------------------------------------------------------------------- /airbyte/_executors/docker.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
2 | from __future__ import annotations 3 | 4 | import logging 5 | import shutil 6 | import subprocess 7 | from contextlib import suppress 8 | from pathlib import Path 9 | 10 | from airbyte import exceptions as exc 11 | from airbyte._executors.base import Executor 12 | 13 | 14 | logger = logging.getLogger("airbyte") 15 | 16 | 17 | DEFAULT_AIRBYTE_CONTAINER_TEMP_DIR = "/airbyte/tmp" 18 | """Default temp dir in an Airbyte connector's Docker image.""" 19 | 20 | 21 | class DockerExecutor(Executor): 22 | def __init__( 23 | self, 24 | name: str, 25 | image_name_full: str, 26 | *, 27 | executable: list[str], 28 | target_version: str | None = None, 29 | volumes: dict[Path, str] | None = None, 30 | ) -> None: 31 | self.executable: list[str] = executable 32 | self.volumes: dict[Path, str] = volumes or {} 33 | self.image_name_full: str = image_name_full 34 | super().__init__(name=name, target_version=target_version) 35 | 36 | def ensure_installation( 37 | self, 38 | *, 39 | auto_fix: bool = True, 40 | ) -> None: 41 | """Ensure that the connector executable can be found. 42 | 43 | The auto_fix parameter is ignored for this executor type. 44 | """ 45 | _ = auto_fix 46 | try: 47 | assert ( 48 | shutil.which("docker") is not None 49 | ), "Docker couldn't be found on your system. Please install it." 50 | self.execute(["spec"]) 51 | except Exception as e: 52 | raise exc.AirbyteConnectorExecutableNotFoundError( 53 | connector_name=self.name, 54 | ) from e 55 | 56 | def install(self) -> None: 57 | """Install the connector. 58 | 59 | For Docker images, this is currently a no-op. In the future we might 60 | pull the Docker image in this step. 61 | """ 62 | pass 63 | 64 | def uninstall(self) -> None: 65 | """Uninstall the connector. 66 | 67 | For Docker images, this operation runs a `docker rmi` command to remove the image. 68 | 69 | We suppress any errors that occur during the removal process. 70 | """ 71 | with suppress(subprocess.CalledProcessError): 72 | subprocess.check_output( 73 | ["docker", "rmi", self.image_name_full], 74 | ) 75 | 76 | @property 77 | def _cli(self) -> list[str]: 78 | """Get the base args of the CLI executable.""" 79 | return self.executable 80 | 81 | def map_cli_args(self, args: list[str]) -> list[str]: 82 | """Map local file paths to the container's volume paths.""" 83 | new_args = [] 84 | for arg in args: 85 | if Path(arg).exists(): 86 | # This is a file path and we need to map it to the same file within the 87 | # relative path of the file within the container's volume. 88 | for local_volume, container_path in self.volumes.items(): 89 | if Path(arg).is_relative_to(local_volume): 90 | logger.debug( 91 | f"Found file input path `{arg}` " 92 | f"relative to container-mapped volume: {local_volume}" 93 | ) 94 | mapped_path = Path(container_path) / Path(arg).relative_to(local_volume) 95 | logger.debug(f"Mapping `{arg}` -> `{mapped_path}`") 96 | new_args.append(str(mapped_path)) 97 | break 98 | else: 99 | # For-else: no break reached, meaning no volume mapping matched this file path. 100 | logger.warning( 101 | f"File path `{arg}` is not relative to any volume path " 102 | f"in the provided volume mappings: {self.volumes}. " 103 | "The file may not be available to the container at runtime.
104 | ) 105 | new_args.append(arg) 106 | 107 | else: 108 | new_args.append(arg) 109 | 110 | if args != new_args: 111 | logger.debug( 112 | f"Mapping local-to-container CLI args: {args} -> {new_args} " 113 | f"based upon volume definitions: {self.volumes}" 114 | ) 115 | 116 | return new_args 117 | --------------------------------------------------------------------------------