├── duckcp ├── helper │ ├── __init__.py │ ├── collection.py │ ├── validation.py │ ├── click.py │ ├── serialization.py │ ├── digest.py │ ├── http.py │ ├── fs.py │ └── sql.py ├── transform │ ├── __init__.py │ ├── database_transform.py │ ├── duckdb_transform.py │ ├── file_transform.py │ └── bitable_transform.py ├── entity │ ├── __init__.py │ ├── task.py │ ├── task_transformer.py │ ├── transform_context.py │ ├── snapshot.py │ ├── storage.py │ ├── transformer.py │ ├── credential.py │ ├── repository.py │ ├── connection.py │ ├── statement.py │ └── executor.py ├── service │ ├── __init__.py │ ├── snapshot_service.py │ ├── meta_service.py │ ├── authentication_service.py │ ├── storage_service.py │ ├── repository_service.py │ ├── task_service.py │ └── transformer_service.py ├── projection │ ├── __init__.py │ ├── task_projection.py │ ├── repository_projection.py │ ├── storage_projection.py │ ├── transformer_projection.py │ └── task_transformer_projection.py ├── typing │ ├── __init__.py │ ├── authentication_token_type.py │ ├── authenticator_type.py │ ├── supports_get_item_protocol.py │ ├── transform_type.py │ ├── credential_refresher_type.py │ ├── record_constructor_protocol.py │ ├── connection_protocol.py │ └── cursor_protocol.py ├── migration │ ├── __init__.py │ ├── 006-tasks.sql │ ├── 001-repositories.sql │ ├── 005-snapshots.sql │ ├── 004-credentials.sql │ ├── 002-storages.sql │ ├── 007-tasks-transformers.sql │ └── 003-transformers.sql ├── constant │ └── __init__.py ├── configuration │ ├── __init__.py │ ├── meta_configuration.py │ └── logging_configuration.py ├── __init__.py ├── repository │ ├── duckdb_repository.py │ ├── file_repository.py │ ├── postgres_repository.py │ ├── odps_repository.py │ ├── sqlite_repository.py │ ├── __init__.py │ └── bitable_repository.py ├── boot │ ├── meta_command.py │ ├── __init__.py │ ├── task_command.py │ ├── transformer_command.py │ ├── repository_command.py │ └── storage_command.py └── feishu │ ├── __init__.py │ └── bitable.py ├── app.py ├── docs ├── app-apis.png ├── app-create.png ├── app-version.png ├── bitable-chart.png ├── app-credentials.png ├── app-permissions.png ├── bitable-add-app.png ├── bitable-fields.png ├── bitable-search-app.png ├── app-grant-authorities.png ├── app-tenant-access-token.png └── feishu-bitable-getting-started.md ├── .gitignore ├── pyproject.toml └── README.md /duckcp/helper/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /duckcp/transform/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /duckcp/entity/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | 领域实体 3 | """ 4 | -------------------------------------------------------------------------------- /duckcp/service/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | 核心服务 3 | """ 4 | -------------------------------------------------------------------------------- /duckcp/projection/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | 实体用于终端展示的投影 3 | """ -------------------------------------------------------------------------------- /duckcp/typing/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | 自定义类型或协议。 3 | """ 4 | 
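The `duckcp.typing` package above is documented as holding the project's custom types and protocols. As a generic illustration of the structural, `Protocol`-based typing style it relies on (a minimal sketch with made-up names, not code from this repository):

```python
from typing import Protocol


class SupportsClose(Protocol):
    """Any object with a matching close() method satisfies this protocol."""

    def close(self) -> None: ...


class FakeHandle:
    """Never inherits from SupportsClose, yet is accepted wherever one is expected."""

    def close(self) -> None:
        print('closed')


def release(resource: SupportsClose) -> None:
    # Structural typing: only the object's shape matters, not its base classes.
    resource.close()


release(FakeHandle())  # prints: closed
```

The repository's own protocols, such as `ConnectionProtocol` and `CursorProtocol` shown later in this listing, are consumed the same way by the concrete repository classes.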
-------------------------------------------------------------------------------- /duckcp/migration/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | 元信息数据库迁移脚本 3 | """ 4 | -------------------------------------------------------------------------------- /duckcp/constant/__init__.py: -------------------------------------------------------------------------------- 1 | APP = 'duckcp' 2 | IDENTIFIER = f'com.yinfn.{APP}' 3 | -------------------------------------------------------------------------------- /app.py: -------------------------------------------------------------------------------- 1 | from duckcp import main 2 | 3 | if __name__ == '__main__': 4 | main() 5 | -------------------------------------------------------------------------------- /docs/app-apis.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/redraiment/duckcp/HEAD/docs/app-apis.png -------------------------------------------------------------------------------- /docs/app-create.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/redraiment/duckcp/HEAD/docs/app-create.png -------------------------------------------------------------------------------- /docs/app-version.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/redraiment/duckcp/HEAD/docs/app-version.png -------------------------------------------------------------------------------- /docs/bitable-chart.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/redraiment/duckcp/HEAD/docs/bitable-chart.png -------------------------------------------------------------------------------- /docs/app-credentials.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/redraiment/duckcp/HEAD/docs/app-credentials.png -------------------------------------------------------------------------------- /docs/app-permissions.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/redraiment/duckcp/HEAD/docs/app-permissions.png -------------------------------------------------------------------------------- /docs/bitable-add-app.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/redraiment/duckcp/HEAD/docs/bitable-add-app.png -------------------------------------------------------------------------------- /docs/bitable-fields.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/redraiment/duckcp/HEAD/docs/bitable-fields.png -------------------------------------------------------------------------------- /docs/bitable-search-app.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/redraiment/duckcp/HEAD/docs/bitable-search-app.png -------------------------------------------------------------------------------- /docs/app-grant-authorities.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/redraiment/duckcp/HEAD/docs/app-grant-authorities.png -------------------------------------------------------------------------------- /docs/app-tenant-access-token.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/redraiment/duckcp/HEAD/docs/app-tenant-access-token.png -------------------------------------------------------------------------------- /duckcp/configuration/__init__.py: -------------------------------------------------------------------------------- 1 | class Configuration: 2 | """ 3 | 全局配置信息 4 | """ 5 | file: str | None = None # 元数据的保存路径 6 | -------------------------------------------------------------------------------- /duckcp/typing/authentication_token_type.py: -------------------------------------------------------------------------------- 1 | """ 2 | 用于鉴权(认证)的信息 3 | """ 4 | from typing import Any 5 | 6 | type AuthenticationToken = dict[str, Any] 7 | -------------------------------------------------------------------------------- /duckcp/typing/authenticator_type.py: -------------------------------------------------------------------------------- 1 | """ 2 | 取得授信(访问凭证)的函数 3 | """ 4 | from typing import Callable, Optional 5 | 6 | type Authenticator = Callable[[], Optional[str]] 7 | -------------------------------------------------------------------------------- /duckcp/projection/task_projection.py: -------------------------------------------------------------------------------- 1 | from typing import NamedTuple 2 | 3 | 4 | class TaskProjection(NamedTuple): 5 | code: str # 编码 6 | transformers: int # 关联迁移的数量 7 | -------------------------------------------------------------------------------- /duckcp/entity/task.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | from typing import NamedTuple 3 | 4 | 5 | class Task(NamedTuple): 6 | id: int 7 | code: str # 编码 8 | created_at: datetime 9 | updated_at: datetime 10 | -------------------------------------------------------------------------------- /duckcp/projection/repository_projection.py: -------------------------------------------------------------------------------- 1 | from typing import NamedTuple 2 | 3 | 4 | class RepositoryProjection(NamedTuple): 5 | kind: str # 类型 6 | code: str # 编码 7 | storages: int # 关联存储数量 8 | transformers: int # 关联迁移数量 9 | -------------------------------------------------------------------------------- /duckcp/projection/storage_projection.py: -------------------------------------------------------------------------------- 1 | from typing import NamedTuple 2 | 3 | 4 | class StorageProjection(NamedTuple): 5 | repository_kind: str # 所属仓库类型 6 | repository_code: str # 所属仓库编码 7 | code: str # 编码 8 | transformers: int # 关联迁移数量 9 | -------------------------------------------------------------------------------- /duckcp/helper/collection.py: -------------------------------------------------------------------------------- 1 | """ 2 | 数据结构帮助函数。 3 | """ 4 | 5 | 6 | def chunk[T](data: list[T], size: int) -> list[list[T]]: 7 | """ 8 | 将数据按照每[size]个一组分组。 9 | """ 10 | return [data[index:index + size] for index in range(0, len(data), size)] 11 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # IntelliJ IDEA 2 | /.idea/ 3 | 4 | # Python 5 | __pycache__/ 6 | *.py[oc] 7 | /.venv/ 8 | 9 | # Testing 10 | test/ 11 | 12 | # Runtime 13 | *.log 14 | 15 | # Build 16 | /build/ 17 | /dist/ 18 | /wheels/ 19 | *.egg-info 20 | *.spec 21 | /starter 22 | 
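A quick usage sketch for the `chunk` helper from `duckcp/helper/collection.py` shown earlier in this listing; the sample data is made up, and the expected output follows directly from the slicing implementation above:

```python
from duckcp.helper.collection import chunk

rows = list(range(7))
print(chunk(rows, 3))   # [[0, 1, 2], [3, 4, 5], [6]] -- the last group may be smaller
print(chunk(rows, 10))  # [[0, 1, 2, 3, 4, 5, 6]]     -- a single undersized group
print(chunk([], 3))     # []                          -- empty input yields no groups
```

Callers that must respect a per-request size limit, such as batch write APIs, can simply loop over the returned groups.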
-------------------------------------------------------------------------------- /duckcp/typing/supports_get_item_protocol.py: -------------------------------------------------------------------------------- 1 | from typing import Protocol, Any 2 | 3 | 4 | class SupportsGetItemProtocol(Protocol): 5 | """ 6 | 一个抽象基类,含一个抽象方法 __getitem__,该方法与其返回类型协变。 7 | """ 8 | 9 | def __getitem__(self, index) -> Any: 10 | ... 11 | -------------------------------------------------------------------------------- /duckcp/entity/task_transformer.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | from typing import NamedTuple 3 | 4 | 5 | class TaskTransformer(NamedTuple): 6 | task_id: int # 所属任务 7 | transformer_id: int # 所属迁移 8 | sort: int # 排序 9 | created_at: datetime 10 | updated_at: datetime 11 | -------------------------------------------------------------------------------- /duckcp/entity/transform_context.py: -------------------------------------------------------------------------------- 1 | from typing import NamedTuple 2 | 3 | 4 | class TransformContext(NamedTuple): 5 | """ 6 | 迁移的上下文。 7 | """ 8 | source_repository_code: str # 来源仓库编码 9 | target_repository_code: str # 去向仓库编码 10 | target_storage_code: str # 目标存储编码 11 | -------------------------------------------------------------------------------- /duckcp/typing/transform_type.py: -------------------------------------------------------------------------------- 1 | from typing import Callable 2 | 3 | from duckcp.entity.repository import Repository 4 | from duckcp.entity.statement import Statement 5 | from duckcp.entity.storage import Storage 6 | 7 | # 定义迁移函数接口 8 | type Transform[T: Repository] = Callable[[Statement, T, Storage], None] 9 | -------------------------------------------------------------------------------- /duckcp/entity/snapshot.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | from typing import NamedTuple, Any 3 | 4 | 5 | class Snapshot(NamedTuple): 6 | id: int 7 | storage_id: int # 所属存储单元 8 | checksum: str # 摘要 9 | records: list[Any] # 记录 10 | created_at: datetime 11 | updated_at: datetime 12 | -------------------------------------------------------------------------------- /duckcp/typing/credential_refresher_type.py: -------------------------------------------------------------------------------- 1 | """ 2 | 访问凭证的刷新函数 3 | """ 4 | from datetime import datetime 5 | from typing import Callable 6 | 7 | from duckcp.typing.authentication_token_type import AuthenticationToken 8 | 9 | type CredentialRefresher = Callable[[AuthenticationToken], tuple[str, datetime]] 10 | -------------------------------------------------------------------------------- /duckcp/entity/storage.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | from typing import NamedTuple, Any 3 | 4 | 5 | class Storage(NamedTuple): 6 | id: int 7 | repository_id: int # 所属仓库 8 | code: str # 编码 9 | properties: dict[str, Any] # 介质信息 10 | created_at: datetime 11 | updated_at: datetime 12 | -------------------------------------------------------------------------------- /duckcp/entity/transformer.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | from typing import NamedTuple 3 | 4 | 5 | class Transformer(NamedTuple): 6 | id: int 7 | code: str # 编号 8 | source_id: int # 来源仓库 9 | target_id: int # 目标表格 10 | 
script_file: str # 迁移脚本 11 | created_at: datetime 12 | updated_at: datetime 13 | -------------------------------------------------------------------------------- /duckcp/entity/credential.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | from typing import NamedTuple 3 | 4 | 5 | class Credential(NamedTuple): 6 | id: int 7 | platform_code: str # 所属平台 8 | app_code: str # 应用编码 9 | access_token: str # 访问凭证 10 | expired_at: datetime 11 | created_at: datetime 12 | updated_at: datetime 13 | -------------------------------------------------------------------------------- /duckcp/migration/006-tasks.sql: -------------------------------------------------------------------------------- 1 | -- 任务:多个迁移的集合;方便同一时刻按顺序执行多个迁移 2 | create table tasks ( 3 | id integer primary key autoincrement, 4 | code text not null unique, -- 编码 5 | created_at timestamp default (datetime(current_timestamp, 'localtime')) not null, 6 | updated_at timestamp default (datetime(current_timestamp, 'localtime')) not null 7 | ); 8 | -------------------------------------------------------------------------------- /duckcp/typing/record_constructor_protocol.py: -------------------------------------------------------------------------------- 1 | """ 2 | 查询结果返回记录的构造函数。 3 | """ 4 | from typing import Protocol, Any, Sequence 5 | 6 | 7 | class RecordConstructorProtocol[T: tuple[Any, ...]](Protocol): 8 | """ 9 | 一个抽象基类,含一个抽象方法 __call__,该方法用于构造所需的记录类型。 10 | """ 11 | 12 | def __call__(self, record: Sequence[Any]) -> T: 13 | ... 14 | -------------------------------------------------------------------------------- /duckcp/helper/validation.py: -------------------------------------------------------------------------------- 1 | from rich.prompt import Prompt 2 | 3 | 4 | def ensure(condition: bool, message: str = '参数异常'): 5 | """ 6 | Assert的运行时版本。 7 | """ 8 | if not condition: 9 | raise AssertionError(message) 10 | 11 | 12 | def confirm(prompt: str, default: str = 'n') -> bool: 13 | return Prompt.ask(prompt, choices=['y', 'n'], default=default).lower() == 'y' 14 | -------------------------------------------------------------------------------- /duckcp/projection/transformer_projection.py: -------------------------------------------------------------------------------- 1 | from typing import NamedTuple 2 | 3 | 4 | class TransformerProjection(NamedTuple): 5 | code: str # 编码 6 | source_repository_kind: str # 来源仓库类型 7 | source_repository_code: str # 来源仓库编码 8 | target_repository_kind: str # 目标仓库类型 9 | target_repository_code: str # 目标仓库编码 10 | target_storage_code: str # 目标存储编码 11 | script_file: str # 迁移脚本 12 | tasks: int # 关联任务数量 13 | -------------------------------------------------------------------------------- /duckcp/typing/connection_protocol.py: -------------------------------------------------------------------------------- 1 | from typing import Protocol 2 | 3 | from duckcp.typing.cursor_protocol import CursorProtocol 4 | 5 | 6 | class ConnectionProtocol(Protocol): 7 | """ 8 | 一个抽象基类,定义数据库连接协议。 9 | """ 10 | 11 | def close(self): 12 | """ 13 | 断开连接。 14 | """ 15 | ... 16 | 17 | def cursor(self) -> CursorProtocol: 18 | """ 19 | 创建新的游标对象。 20 | """ 21 | ... 
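Because `ConnectionProtocol` above is a structural `Protocol`, any driver connection that exposes `close()` and `cursor()` satisfies it without subclassing. A hypothetical sketch (the driver class below is invented for illustration and is not part of this repository):

```python
from duckcp.typing.connection_protocol import ConnectionProtocol
from duckcp.typing.cursor_protocol import CursorProtocol


class ExampleDriverConnection:
    """A made-up driver connection; it never imports or inherits ConnectionProtocol."""

    def close(self):
        pass

    def cursor(self) -> CursorProtocol:
        raise NotImplementedError('a real driver would return its cursor object here')


connection: ConnectionProtocol = ExampleDriverConnection()  # accepted by a static type checker
connection.close()
```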
22 | -------------------------------------------------------------------------------- /duckcp/projection/task_transformer_projection.py: -------------------------------------------------------------------------------- 1 | from typing import NamedTuple 2 | 3 | 4 | class TaskTransformerProjection(NamedTuple): 5 | task_code: str # 任务编码 6 | sort: int # 执行顺序 7 | transformer_code: str # 迁移编码 8 | source_repository_kind: str # 来源仓库类型 9 | source_repository_code: str # 来源仓库编码 10 | target_repository_kind: str # 目标仓库类型 11 | target_repository_code: str # 目标仓库编码 12 | target_storage_code: str # 目标存储编码 13 | script_file: str # 迁移脚本 14 | -------------------------------------------------------------------------------- /duckcp/__init__.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import sys 3 | from logging import DEBUG 4 | 5 | from duckcp.boot import app, meta_command, repository_command, storage_command, transformer_command, task_command 6 | 7 | logger = logging.getLogger(__name__) 8 | 9 | 10 | def main(): 11 | try: 12 | app() 13 | except Exception as e: 14 | if logger.isEnabledFor(DEBUG): 15 | logger.exception(e) 16 | else: 17 | logger.error('%s', e) 18 | sys.exit(1) 19 | -------------------------------------------------------------------------------- /duckcp/migration/001-repositories.sql: -------------------------------------------------------------------------------- 1 | -- 数据仓库 2 | create table if not exists repositories ( 3 | id integer primary key autoincrement, 4 | kind text not null, -- 仓库类型 5 | code text not null unique, -- 编码 6 | properties jsonb not null, -- 连接信息 7 | created_at timestamp default (datetime(current_timestamp, 'localtime')) not null, 8 | updated_at timestamp default (datetime(current_timestamp, 'localtime')) not null, 9 | check ( kind in ('postgres', 'odps', 'duckdb', 'sqlite', 'bitable', 'file') ) 10 | ); 11 | -------------------------------------------------------------------------------- /duckcp/helper/click.py: -------------------------------------------------------------------------------- 1 | from json import JSONDecodeError 2 | 3 | from click import ParamType 4 | 5 | from duckcp.helper.serialization import json_decode 6 | 7 | 8 | class JSONParamType(ParamType): 9 | """ 10 | JSON类型。 11 | """ 12 | name = 'json' 13 | 14 | def convert(self, value, parameter, context): 15 | try: 16 | return json_decode(value) 17 | except JSONDecodeError: 18 | self.fail(f"'{value}'不是有效的JSON格式", parameter, context) 19 | 20 | 21 | JSON = JSONParamType() 22 | -------------------------------------------------------------------------------- /duckcp/migration/005-snapshots.sql: -------------------------------------------------------------------------------- 1 | -- 数据快照:由系统管理 2 | create table if not exists snapshots ( 3 | id integer primary key autoincrement, 4 | storage_id bigint not null unique references storages (id) -- 所属存储单元 5 | on update cascade 6 | on delete cascade, 7 | checksum text default '' not null, -- 摘要 8 | records jsonb default '[]' not null, -- 记录 9 | created_at timestamp default (datetime(current_timestamp, 'localtime')) not null, 10 | updated_at timestamp default (datetime(current_timestamp, 'localtime')) not null 11 | ); 12 | -------------------------------------------------------------------------------- /duckcp/migration/004-credentials.sql: -------------------------------------------------------------------------------- 1 | -- 开放平台访问凭证:由系统管理 2 | create table if not exists credentials ( 3 | id integer primary key 
autoincrement, 4 | platform_code text not null, -- 开放平台 5 | app_code text not null, -- 应用编码 6 | access_token text default '' not null, -- 访问凭证 7 | expired_at timestamp default (datetime(current_timestamp, 'localtime')) not null, 8 | created_at timestamp default (datetime(current_timestamp, 'localtime')) not null, 9 | updated_at timestamp default (datetime(current_timestamp, 'localtime')) not null, 10 | unique (platform_code, app_code) 11 | ); 12 | -------------------------------------------------------------------------------- /duckcp/migration/002-storages.sql: -------------------------------------------------------------------------------- 1 | -- 存储单元 2 | create table if not exists storages ( 3 | id integer primary key autoincrement, 4 | repository_id bigint not null references repositories (id) -- 所属仓库 5 | on update cascade 6 | on delete cascade, 7 | code text not null, -- 编码 8 | properties jsonb not null, -- 介质信息 9 | created_at timestamp default (datetime(current_timestamp, 'localtime')) not null, 10 | updated_at timestamp default (datetime(current_timestamp, 'localtime')) not null, 11 | unique (repository_id, code) 12 | ); 13 | -------------------------------------------------------------------------------- /duckcp/repository/duckdb_repository.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | import duckdb 4 | from duckdb.duckdb import DuckDBPyConnection 5 | 6 | from duckcp.entity.repository import Repository 7 | 8 | logger = logging.getLogger(__name__) 9 | 10 | 11 | class DuckDBRepository(Repository): 12 | """ 13 | DuckDB类型仓库。 14 | """ 15 | 16 | def establish_connection(self) -> DuckDBPyConnection: 17 | """ 18 | 创建DuckDB数据库连接。 19 | """ 20 | file = self.properties['file'] if self.properties and 'file' in self.properties else ':memory:' 21 | logger.debug('file=%s', file) 22 | return duckdb.connect(file) 23 | -------------------------------------------------------------------------------- /duckcp/migration/007-tasks-transformers.sql: -------------------------------------------------------------------------------- 1 | -- 定时任务关联的迁移 2 | create table if not exists tasks_transformers ( 3 | task_id bigint not null references tasks (id) -- 所属任务 4 | on update cascade 5 | on delete cascade, 6 | transformer_id bigint not null references transformers (id) -- 所属迁移 7 | on update cascade 8 | on delete cascade, 9 | sort int not null, -- 执行顺序 10 | created_at timestamp default (datetime(current_timestamp, 'localtime')) not null, 11 | updated_at timestamp default (datetime(current_timestamp, 'localtime')) not null, 12 | primary key (task_id, transformer_id), 13 | unique (transformer_id, task_id) 14 | ); 15 | -------------------------------------------------------------------------------- /duckcp/migration/003-transformers.sql: -------------------------------------------------------------------------------- 1 | -- 数据迁移 2 | create table if not exists transformers ( 3 | id integer primary key autoincrement, 4 | code text not null unique, -- 编码 5 | source_id bigint not null references repositories (id) -- 数据来源 6 | on update cascade 7 | on delete cascade, 8 | target_id bigint not null references storages (id) -- 数据去向 9 | on update cascade 10 | on delete cascade, 11 | script_file text not null, -- 迁移脚本 12 | created_at timestamp default (datetime(current_timestamp, 'localtime')) not null, 13 | updated_at timestamp default (datetime(current_timestamp, 'localtime')) not null, 14 | unique (source_id, target_id) 15 | ); 16 | 
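A usage sketch for the `DuckDBRepository` class shown a little earlier in this listing. The field values are hypothetical; the `Repository` base class (a `NamedTuple` carrying `kind`, `code` and `properties`) appears later under `duckcp/entity/repository.py`:

```python
from duckcp.repository.duckdb_repository import DuckDBRepository

# ':memory:' keeps the sketch self-contained; omitting the 'file' property would
# likewise fall back to an in-memory database, per establish_connection() above.
repository = DuckDBRepository(kind='duckdb', code='warehouse', properties={'file': ':memory:'})

connection = repository.establish_connection()     # a plain duckdb.DuckDBPyConnection
print(connection.execute('select 42').fetchall())  # [(42,)]
connection.close()
```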
-------------------------------------------------------------------------------- /duckcp/boot/meta_command.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | import click 4 | 5 | from duckcp.boot import app 6 | from duckcp.service import meta_service 7 | 8 | logger = logging.getLogger(__name__) 9 | 10 | 11 | @app.group(help='管理元信息数据库') 12 | @click.help_option('-h', '--help', help='展示帮助信息') 13 | def meta(): 14 | pass 15 | 16 | 17 | @meta.command('delete', help='删除元信息数据库') 18 | @click.help_option('-h', '--help', help='展示帮助信息') 19 | def meta_delete(): 20 | meta_service.meta_delete() 21 | 22 | 23 | @meta.command('create', help='创建元信息数据库') 24 | @click.option('-f', '--force', is_flag=True, help='强制重建') 25 | @click.help_option('-h', '--help', help='展示帮助信息') 26 | def meta_create(force: bool): 27 | if force: 28 | meta_service.meta_delete() 29 | meta_service.meta_create() 30 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "duckcp" 3 | version = "0.1.3" 4 | description = "数据同步工具" 5 | readme = "README.md" 6 | requires-python = ">=3.12" # 类型别名 7 | dependencies = [ 8 | "click>=8.2.1", # Python >=3.10 9 | "duckdb>=1.3.2", # Python >= 3.9 10 | "pandas>=2.3.1", # Python >=3.9 11 | "psycopg2-binary>=2.9.10", # Python >=3.8 12 | "pyodps>=0.12.4", # Python >=3.7 13 | "rich>=14.1.0", # Python >=3.8 14 | "sqlglot>=27.4.1", # Python >=3.9 15 | ] 16 | 17 | [dependency-groups] 18 | dev = [ 19 | "ipython>=9.3.0", # Python >=3.11 20 | "pyinstaller>=6.14.1", # Python >=3.8 21 | ] 22 | 23 | [project.scripts] 24 | duckcp = "duckcp:main" 25 | 26 | [build-system] 27 | requires = ["uv_build"] 28 | build-backend = "uv_build" 29 | 30 | [tool.uv.build-backend] 31 | module-root = "" 32 | -------------------------------------------------------------------------------- /duckcp/helper/serialization.py: -------------------------------------------------------------------------------- 1 | """ 2 | 序列化与反序列化帮助函数。 3 | """ 4 | from decimal import Decimal 5 | from json import JSONEncoder, dumps, loads 6 | from typing import Any 7 | 8 | 9 | class JsonEncoder(JSONEncoder): 10 | """ 11 | 序列化JSON的数据: 12 | - 添加Decimal类型支持。 13 | - 其他类型数据通过`vars`支持。 14 | """ 15 | 16 | def default(self, value) -> any: 17 | return float(value) if isinstance(value, Decimal) else vars(value) 18 | 19 | 20 | def json_encode(data: Any) -> str: 21 | """ 22 | 将数据序列化成JSON字符串: 23 | - 保留中文 24 | - 去掉多于空格 25 | - 支持自定义的类型 26 | """ 27 | return dumps(data, separators=(',', ':'), ensure_ascii=False, cls=JsonEncoder) 28 | 29 | 30 | def json_decode(data: str | bytes) -> any: 31 | """ 32 | 将JSON字符串反序列化成对象 33 | """ 34 | return loads(data) 35 | -------------------------------------------------------------------------------- /duckcp/entity/repository.py: -------------------------------------------------------------------------------- 1 | from abc import abstractmethod 2 | from datetime import datetime 3 | from typing import NamedTuple, Any 4 | 5 | from duckcp.entity.connection import Connection 6 | from duckcp.typing.connection_protocol import ConnectionProtocol 7 | 8 | 9 | class Repository(NamedTuple): 10 | id: int = None 11 | kind: str = None # 类型 12 | code: str = None # 编码 13 | properties: dict[str, Any] = None # 连接信息 14 | created_at: datetime = None 15 | updated_at: datetime = None 16 | 17 | @abstractmethod 18 | def establish_connection[T: ConnectionProtocol](self) 
-> T: 19 | """ 20 | 建立新的数据库连接,并返回原始的连接类型。 21 | """ 22 | pass 23 | 24 | def connect(self) -> Connection: 25 | """ 26 | 建立新的数据库连接。 27 | """ 28 | return Connection(self.establish_connection()) 29 | -------------------------------------------------------------------------------- /duckcp/typing/cursor_protocol.py: -------------------------------------------------------------------------------- 1 | from typing import Protocol, Sequence, Any 2 | 3 | from duckcp.typing.supports_get_item_protocol import SupportsGetItemProtocol 4 | 5 | 6 | class CursorProtocol(Protocol): 7 | """ 8 | 一个抽象基类,定义数据库游标协议。 9 | """ 10 | 11 | @property 12 | def description(self) -> Sequence[SupportsGetItemProtocol]: 13 | ... 14 | 15 | def close(self): 16 | """ 17 | 关闭游标 18 | """ 19 | ... 20 | 21 | def executemany(self, sql: str, parameters: list[Sequence[Any]]): 22 | """ 23 | 批量执行。 24 | """ 25 | ... 26 | 27 | def execute(self, sql: str, parameters: Sequence[Any] = None): 28 | """ 29 | 单句执行。 30 | """ 31 | ... 32 | 33 | def fetchall(self) -> list[Sequence[Any]]: 34 | """ 35 | 获取查询结果。 36 | """ 37 | ... 38 | -------------------------------------------------------------------------------- /duckcp/service/snapshot_service.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from typing import Optional 3 | 4 | from duckcp.configuration import meta_configuration as metadata 5 | from duckcp.entity.snapshot import Snapshot 6 | 7 | logger = logging.getLogger(__name__) 8 | 9 | 10 | def snapshot_find(storage_id: int) -> Optional[Snapshot]: 11 | """ 12 | 找到存储单元的快照记录。 13 | """ 14 | with metadata.connect() as meta: 15 | return meta.record('select * from snapshots where storage_id = ?', storage_id, constructor=Snapshot._make) 16 | 17 | 18 | def take_snapshot(storage_id: int, checksum: str, records: list[str]): 19 | """ 20 | 保存快照。 21 | """ 22 | with metadata.connect() as meta: 23 | snapshot = meta.record(''' 24 | insert or replace into snapshots 25 | (storage_id, checksum, records) 26 | values 27 | (?, ?, ?) 
28 | returning * 29 | ''', storage_id, checksum, records, constructor=Snapshot._make) 30 | logger.info('创建快照(%s)', storage_id) 31 | logger.debug('snapshot=%s', snapshot) 32 | -------------------------------------------------------------------------------- /duckcp/repository/file_repository.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from collections.abc import Iterator 3 | from contextlib import contextmanager 4 | 5 | import duckdb 6 | from duckdb.duckdb import DuckDBPyConnection 7 | 8 | from duckcp.entity.connection import Connection 9 | from duckcp.entity.repository import Repository 10 | from duckcp.helper.fs import WorkDirectory 11 | from duckcp.helper.validation import ensure 12 | 13 | logger = logging.getLogger(__name__) 14 | 15 | 16 | class FileRepository(Repository): 17 | """ 18 | 文件类型仓库。 19 | """ 20 | 21 | def establish_connection(self) -> DuckDBPyConnection: 22 | """ 23 | 创建DuckDB内存数据库连接。 24 | """ 25 | return duckdb.connect(':memory:') 26 | 27 | @contextmanager 28 | def connect(self) -> Iterator[Connection]: 29 | """ 30 | 建立在目标目录下的临时连接。 31 | """ 32 | ensure(bool(self.properties), '缺少连接参数') 33 | ensure(bool(self.properties.get('folder')), '缺少文件夹') 34 | folder = self.properties.get('folder') 35 | with WorkDirectory(folder): 36 | yield Connection(self.establish_connection()) 37 | -------------------------------------------------------------------------------- /duckcp/entity/connection.py: -------------------------------------------------------------------------------- 1 | from typing import Self 2 | 3 | from duckcp.entity.executor import Executor 4 | from duckcp.entity.statement import Statement 5 | from duckcp.typing.connection_protocol import ConnectionProtocol 6 | 7 | 8 | class Connection: 9 | """ 10 | 数据库连接对象。 11 | """ 12 | connection: ConnectionProtocol 13 | 14 | def __init__(self, connection: ConnectionProtocol): 15 | self.connection = connection 16 | 17 | def __enter__(self) -> Self: 18 | """ 19 | 生命周期开始。 20 | """ 21 | return self 22 | 23 | def __exit__(self, exception_class, exception, traceback): 24 | """ 25 | 生命周期结束。 26 | """ 27 | self.close() 28 | 29 | def close(self): 30 | """ 31 | 断开连接。 32 | """ 33 | self.connection.close() 34 | 35 | def executor(self) -> Executor: 36 | """ 37 | 创建新的语句对象,对于执行查询语句。 38 | """ 39 | return Executor(self.connection.cursor()) 40 | 41 | def prepare(self, sql: str) -> Statement: 42 | """ 43 | 准备SQL语句用于后续执行。 44 | """ 45 | return Statement(self.executor(), sql) 46 | -------------------------------------------------------------------------------- /duckcp/repository/postgres_repository.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | import psycopg2 4 | 5 | from duckcp.entity.repository import Repository 6 | from duckcp.helper.validation import ensure 7 | 8 | logger = logging.getLogger(__name__) 9 | 10 | 11 | class PostgresRepository(Repository): 12 | """ 13 | Postgres类型仓库。 14 | """ 15 | 16 | def establish_connection(self) -> psycopg2.extensions.connection: 17 | """ 18 | 创建Postgres连接。 19 | """ 20 | ensure(bool(self.properties), '缺少连接参数') 21 | host = self.properties.get('host') 22 | port = self.properties.get('port') 23 | ensure(bool(self.properties.get('database')), '缺少数据库名称') 24 | database = self.properties.get('database') 25 | username = self.properties.get('username') 26 | password = self.properties.get('password') 27 | logger.debug('host=%s, port=%s, database=%s, username=%s', host, port, database, username) 28 | 29 | 
return psycopg2.connect( 30 | host=host, 31 | port=port, 32 | database=database, 33 | user=username, 34 | password=password 35 | ) 36 | -------------------------------------------------------------------------------- /duckcp/repository/odps_repository.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from odps import ODPS, dbapi as odps 4 | 5 | from duckcp.entity.repository import Repository 6 | from duckcp.helper.validation import ensure 7 | 8 | logger = logging.getLogger(__name__) 9 | 10 | 11 | class OdpsRepository(Repository): 12 | """ 13 | MaxCompute(ODPS)类型仓库。 14 | """ 15 | 16 | def establish_connection(self) -> odps.Connection: 17 | """ 18 | 创建Odps仓库连接。 19 | """ 20 | ensure(bool(self.properties), '缺少连接参数') 21 | ensure(bool(self.properties.get('end_point')), '缺少接入点') 22 | end_point = self.properties.get('end_point') 23 | ensure(bool(self.properties.get('project')), '缺少项目名') 24 | project = self.properties.get('project') 25 | ensure(bool(self.properties.get('access_key')), '缺少访问编码') 26 | access_key = self.properties.get('access_key') 27 | ensure(bool(self.properties.get('access_secret')), '缺少访问密钥') 28 | access_secret = self.properties.get('access_secret') 29 | 30 | logger.debug('end_point=%s, project=%s, access_key=%s', end_point, project, access_key) 31 | db = ODPS(access_key, access_secret, project, end_point) 32 | return odps.connect(db) 33 | -------------------------------------------------------------------------------- /duckcp/boot/__init__.py: -------------------------------------------------------------------------------- 1 | import click 2 | from click import Choice, Tuple 3 | 4 | from duckcp.configuration.logging_configuration import enable_logging_configuration 5 | from duckcp.configuration.meta_configuration import enable_metadata_configuration 6 | 7 | 8 | @click.group(help='数据同步工具') 9 | @click.option('-c', '--config-file', metavar='FILE', help='配置文件') 10 | @click.option('-o', '--logging-file', metavar='FILE', help='日志文件') 11 | @click.option('-l', '--logging', type=Tuple([str, Choice([ 12 | 'DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL', 13 | ], case_sensitive=False)]), metavar='...', multiple=True, nargs=2, help='日志等级') 14 | @click.option('-m', '--message-only', is_flag=True, help='日志只输出内容') 15 | @click.option('-v', '--verbose', is_flag=True, help='开启详细日志') 16 | @click.option('-q', '--quiet', is_flag=True, help='关闭所有日志') 17 | @click.version_option('v0.1.3', '-V', '--version', help='展示版本信息') 18 | @click.help_option('-h', '--help', help='展示帮助信息') 19 | def app(config_file: str, logging_file: str, logging: list[tuple[str, str]], message_only: bool, verbose: bool, quiet: bool) -> None: 20 | enable_logging_configuration(logging_file, logging, message_only, not quiet and verbose, quiet) 21 | enable_metadata_configuration(config_file) 22 | -------------------------------------------------------------------------------- /duckcp/transform/database_transform.py: -------------------------------------------------------------------------------- 1 | """ 2 | 数据迁移至管系统数据库表,原理如下: 3 | 1. 在来源仓库上执行SQL。 4 | 2. 根据查询结果生成DELETE语句与INSERT语句。 5 | 3. 先执行删除语句清空表。 6 | 4. 
再执行插入语句新增记录。 7 | """ 8 | import logging 9 | 10 | from duckcp.entity.repository import Repository 11 | from duckcp.entity.statement import Statement 12 | from duckcp.entity.storage import Storage 13 | from duckcp.helper.sql import delete_from, insert_into 14 | 15 | logger = logging.getLogger(__name__) 16 | 17 | 18 | def database_transform(statement: Statement, repository: Repository, storage: Storage): 19 | """ 20 | 将数据源迁移到关系型数据库表中。 21 | """ 22 | catalog = storage.properties.get('catalog') 23 | schema = storage.properties.get('schema') 24 | table = storage.properties['table'] 25 | 26 | # 通过sqlglot生成SQL,避免字符串转义或SQL注入等问题。 27 | with repository.connect() as connection: 28 | with connection.executor() as executor: 29 | sql = delete_from(catalog, schema, table).sql() 30 | logger.info('清空表(%s)', sql) 31 | executor.execute(sql) 32 | 33 | columns, records = statement.execute() 34 | sql = insert_into(catalog, schema, table, columns).sql() 35 | logger.info('批量添加数据(%s)', sql) 36 | executor.batch(sql, records) 37 | -------------------------------------------------------------------------------- /duckcp/service/meta_service.py: -------------------------------------------------------------------------------- 1 | """ 2 | 元信息数据库管理服务。 3 | """ 4 | import logging 5 | from importlib.resources import path 6 | from os import unlink, chmod 7 | from os.path import exists 8 | 9 | from duckcp import migration 10 | from duckcp.configuration import meta_configuration as metadata, Configuration 11 | 12 | logger = logging.getLogger(__name__) 13 | 14 | 15 | def meta_create(): 16 | """ 17 | 创建元信息数据库。 18 | """ 19 | if not exists(Configuration.file): 20 | logger.info('配置文件(%s)初始化', Configuration.file) 21 | with metadata.connect() as meta: 22 | with path(migration) as folder: 23 | for script in sorted(folder.glob('**/*.sql')): 24 | logger.info('执行脚本(%s)', script.name) 25 | meta.execute(script.read_text(encoding='utf-8')) 26 | chmod(Configuration.file, 0o600) # 配置文件里包含部分敏感信息,因此只允许当前用户访问 27 | else: 28 | logger.warning('配置文件(%s)已存在', Configuration.file) 29 | 30 | 31 | def meta_delete(): 32 | """ 33 | 删除元信息数据库。 34 | """ 35 | if exists(Configuration.file): 36 | logger.info('删除配置文件(%s)', Configuration.file) 37 | unlink(Configuration.file) 38 | else: 39 | logger.info('配置文件(%s)不存在;忽略删除操作', Configuration.file) 40 | -------------------------------------------------------------------------------- /duckcp/configuration/meta_configuration.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from contextlib import contextmanager 3 | from os import makedirs 4 | from os.path import dirname, exists 5 | from typing import Optional, Iterator 6 | 7 | from duckcp.configuration import Configuration 8 | from duckcp.constant import IDENTIFIER 9 | from duckcp.entity.executor import Executor 10 | from duckcp.helper.fs import absolute_path, config 11 | from duckcp.repository.sqlite_repository import SqliteRepository 12 | 13 | logger = logging.getLogger(__name__) 14 | 15 | 16 | def enable_metadata_configuration(file: Optional[str] = None): 17 | """ 18 | 元数据配置。 19 | """ 20 | file = absolute_path(file) if file is not None else config(IDENTIFIER, 'configuration.db') 21 | folder = dirname(file) 22 | if not exists(folder): 23 | logger.info('创建目录(%s)', folder) 24 | makedirs(folder) 25 | Configuration.file = file 26 | logger.debug('元信息文件(%s)', file) 27 | 28 | 29 | @contextmanager 30 | def connect() -> Iterator[Executor]: 31 | """ 32 | 链接元信息数据库。 33 | """ 34 | repository = 
SqliteRepository(properties={'file': Configuration.file})
35 |     with repository.connect() as connection:
36 |         with connection.executor() as executor:
37 |             yield executor
38 | 
--------------------------------------------------------------------------------
/duckcp/transform/duckdb_transform.py:
--------------------------------------------------------------------------------
1 | """
2 | 数据迁移至DuckDB数据库表,原理如下:
3 | 1. 在来源仓库上执行SQL,并将查询结果封装成DataFrame。
4 | 2. 将DataFrame映射成DuckDB的只读视图。
5 | 3. 执行`create or replace table ... from ...`替换目标表内的数据。
6 | """
7 | import logging
8 | 
9 | from duckcp.entity.statement import Statement
10 | from duckcp.entity.storage import Storage
11 | from duckcp.helper.sql import create_or_replace_table
12 | from duckcp.repository.duckdb_repository import DuckDBRepository
13 | 
14 | logger = logging.getLogger(__name__)
15 | 
16 | 
17 | def duckdb_transform(statement: Statement, repository: DuckDBRepository, storage: Storage):
18 |     """
19 |     将数据源迁移到DuckDB数据库表中。
20 |     """
21 |     catalog = storage.properties.get('catalog')
22 |     schema = storage.properties.get('schema')
23 |     table = storage.properties['table']
24 | 
25 |     with repository.establish_connection() as connection:
26 |         with connection.cursor() as cursor:
27 |             data = statement()
28 |             cursor.execute(f' set global pandas_analyze_sample = {len(data)} ')
29 |             cursor.register(storage.code, data) # 表的全名是`temp.main.`
30 |             # 通过sqlglot生成SQL,避免字符串转义或SQL注入等问题。
31 |             ast = create_or_replace_table(catalog, schema, table, storage.code)
32 |             sql = ast.sql(dialect='duckdb')
33 |             logger.debug('sql=%s', sql)
34 |             cursor.execute(sql)
35 | 
--------------------------------------------------------------------------------
/duckcp/transform/file_transform.py:
--------------------------------------------------------------------------------
1 | """
2 | 数据迁移至本地文件,原理如下:
3 | 1. 在来源仓库上执行SQL,并将查询结果封装成DataFrame。
4 | 2. 将DataFrame映射成DuckDB的只读视图。
5 | 3. 执行`COPY ... to ...`导出数据到本地文件。
6 | """
7 | import logging
8 | 
9 | from duckcp.entity.statement import Statement
10 | from duckcp.entity.storage import Storage
11 | from duckcp.helper.fs import WorkDirectory
12 | from duckcp.helper.sql import copy_to
13 | from duckcp.repository.duckdb_repository import DuckDBRepository
14 | 
15 | logger = logging.getLogger(__name__)
16 | 
17 | 
18 | def file_transform(statement: Statement, repository: DuckDBRepository, storage: Storage):
19 |     """
20 |     将数据源迁移到本地文件中。
21 |     """
22 |     folder = repository.properties['folder']
23 |     with WorkDirectory(folder):
24 |         with repository.establish_connection() as connection:
25 |             with connection.cursor() as cursor:
26 |                 data = statement()
27 |                 cursor.execute(f' set global pandas_analyze_sample = {len(data)} ')
28 |                 cursor.register(storage.code, data) # 表的全名是`temp.main.
` 29 | file_name = storage.properties.pop('file') 30 | # 通过sqlglot生成SQL,避免字符串转义或SQL注入等问题。 31 | ast = copy_to(storage.code, file_name, storage.properties) 32 | sql = ast.sql(dialect='duckdb') 33 | logger.debug('sql=%s', sql) 34 | cursor.execute(sql) 35 | -------------------------------------------------------------------------------- /duckcp/feishu/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | 飞书开放接口 3 | """ 4 | import logging 5 | from datetime import datetime, timedelta 6 | from typing import TypedDict 7 | 8 | from duckcp.helper import http 9 | from duckcp.typing.authentication_token_type import AuthenticationToken 10 | 11 | OPEN_API = 'https://open.feishu.cn/open-apis' 12 | ACCESS_TOKEN_API = f'{OPEN_API}/auth/v3/tenant_access_token/internal/' 13 | 14 | logger = logging.getLogger(__name__) 15 | 16 | 17 | class FeiShuError(Exception): 18 | """ 19 | 飞书多维表格接口执行失败时,抛出本异常。 20 | """ 21 | 22 | def __init__(self, api: str, message: str): 23 | super().__init__(f'{api}请求失败:{message}') 24 | 25 | 26 | class CredentialResponse(TypedDict): 27 | """ 28 | 飞书应用凭证响应。 29 | """ 30 | code: int # 错误编码。 31 | msg: str # 错误消息。 32 | expire: int # 有效时长;单位秒。 33 | tenant_access_token: str # 租户级别凭证。 34 | 35 | 36 | def tenant_access_token(token: AuthenticationToken) -> tuple[str, datetime]: 37 | """ 38 | 获得租户级别的访问凭证 39 | @param token: 认证信息 40 | """ 41 | logger.debug('token=%s', token) 42 | response: CredentialResponse = http.post(ACCESS_TOKEN_API, params={ 43 | 'app_id': token['access_key'], 44 | 'app_secret': token['access_secret'], 45 | }) 46 | if response['code'] == 0: 47 | access_token = response['tenant_access_token'] 48 | expired_at = datetime.now() + timedelta(seconds=response['expire'] - 30) 49 | return access_token, expired_at 50 | else: 51 | raise FeiShuError('刷新凭证', response.get('msg', '')) 52 | -------------------------------------------------------------------------------- /duckcp/repository/sqlite_repository.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import sqlite3 3 | from typing import Any 4 | 5 | from duckcp.entity.repository import Repository 6 | from duckcp.helper.serialization import json_encode, json_decode 7 | 8 | logger = logging.getLogger(__name__) 9 | 10 | 11 | def to_bool(value: bytes) -> bool: 12 | """ 13 | 将SQLite返回的Bytes类型布尔值转成Python的布尔值。 14 | """ 15 | return bool(int(value)) if value is not None else None 16 | 17 | 18 | def to_value(value: bytes) -> Any: 19 | """ 20 | 将SQLite的JSON转成Python的值。 21 | """ 22 | return json_decode(value) if value is not None else None 23 | 24 | 25 | def to_json(value: Any) -> str: 26 | """ 27 | 将Python的值转成SQLite的JSON。 28 | """ 29 | return json_encode(value) if value is not None else None 30 | 31 | 32 | sqlite3.register_converter('boolean', to_bool) 33 | sqlite3.register_converter('bool', to_bool) 34 | sqlite3.register_converter('jsonb', to_value) 35 | sqlite3.register_converter('json', to_value) 36 | sqlite3.register_adapter(dict, to_json) 37 | sqlite3.register_adapter(list, to_json) 38 | 39 | 40 | class SqliteRepository(Repository): 41 | """ 42 | Sqlite类型仓库。 43 | """ 44 | 45 | def establish_connection(self) -> sqlite3.Connection: 46 | """ 47 | 创建Sqlite连接。 48 | """ 49 | file = self.properties['file'] if self.properties and 'file' in self.properties else ':memory:' 50 | logger.debug('file=%s', file) 51 | connection = sqlite3.connect(file, detect_types=sqlite3.PARSE_DECLTYPES, autocommit=True) 52 | connection.execute('PRAGMA foreign_keys=ON') 
# 启用on delete cascade 53 | return connection 54 | -------------------------------------------------------------------------------- /duckcp/helper/digest.py: -------------------------------------------------------------------------------- 1 | """ 2 | 摘要算法帮助函数 3 | """ 4 | import hashlib 5 | from typing import Callable 6 | 7 | 8 | # bytes版本 9 | 10 | def hash_encode_bytes(algorithm: Callable, data: bytes) -> bytes: 11 | """ 12 | 通过Hash算法编码明文数据。 13 | :param algorithm: hashlib包里的算法。 14 | :param data: 明文字节流。 15 | :return: 密文字节流。 16 | """ 17 | encoder = algorithm() 18 | encoder.update(data) 19 | return encoder.digest() 20 | 21 | 22 | def md5_bytes(data: bytes) -> bytes: 23 | """ 24 | 计算明文数据的MD5摘要。 25 | :param data: 明文字节流。 26 | :return: MD5字节流。 27 | """ 28 | return hash_encode_bytes(hashlib.md5, data) 29 | 30 | 31 | def sha256_bytes(data: bytes) -> bytes: 32 | """ 33 | 计算明文数据的SHA256摘要。 34 | :param data: 明文字节流。 35 | :return: SHA256字节流。 36 | """ 37 | return hash_encode_bytes(hashlib.sha256, data) 38 | 39 | 40 | def sha512_bytes(data: bytes) -> bytes: 41 | """ 42 | 计算明文数据的SHA512摘要。 43 | :param data: 明文字节流。 44 | :return: SHA512字节流。 45 | """ 46 | return hash_encode_bytes(hashlib.sha512, data) 47 | 48 | 49 | # str 版本 50 | 51 | def hash_encode(algorithm: Callable, data: str) -> str: 52 | """ 53 | 通过Hash算法编码明文数据。 54 | :param algorithm: hashlib包里的算法。 55 | :param data: 明文字符串。 56 | :return: 密文字符串。 57 | """ 58 | return hash_encode_bytes(algorithm, data.encode('utf-8')).hex() 59 | 60 | 61 | def md5(data: str) -> str: 62 | """ 63 | 计算明文数据的MD5摘要。 64 | :param data: 明文字符串。 65 | :return: MD5字符串。 66 | """ 67 | return hash_encode(hashlib.md5, data) 68 | 69 | 70 | def sha256(data: str) -> str: 71 | """ 72 | 计算明文数据的SHA256摘要。 73 | :param data: 明文字符串。 74 | :return: SHA256字符串。 75 | """ 76 | return hash_encode(hashlib.sha256, data) 77 | 78 | 79 | def sha512(data: str) -> str: 80 | """ 81 | 计算明文数据的SHA512摘要。 82 | :param data: 明文字符串。 83 | :return: SHA512字符串。 84 | """ 85 | return hash_encode(hashlib.sha512, data) 86 | -------------------------------------------------------------------------------- /duckcp/service/authentication_service.py: -------------------------------------------------------------------------------- 1 | """ 2 | 开放平台认证服务。 3 | """ 4 | import logging 5 | from typing import Optional 6 | 7 | from duckcp.configuration import meta_configuration as metadata 8 | from duckcp.entity.credential import Credential 9 | from duckcp.helper.validation import ensure 10 | from duckcp.typing.authentication_token_type import AuthenticationToken 11 | from duckcp.typing.authenticator_type import Authenticator 12 | from duckcp.typing.credential_refresher_type import CredentialRefresher 13 | 14 | logger = logging.getLogger(__name__) 15 | 16 | 17 | def authenticate(platform_code: str, app_code: str, token: AuthenticationToken, refresher: CredentialRefresher) -> Authenticator: 18 | """ 19 | 认证并获取最新的访问凭证。 20 | """ 21 | logger.debug('platform_code=%s, app_code=%s, token=%s', platform_code, app_code, token) 22 | ensure(platform_code is not None and app_code is not None and token is not None, '缺少应用信息') 23 | 24 | def authenticator() -> Optional[str]: 25 | """ 26 | 认证闭包:包含了认证信息。 27 | """ 28 | with metadata.connect() as meta: 29 | access_token = meta.value(''' 30 | select 31 | access_token 32 | from 33 | credentials 34 | where 35 | platform_code = ? 36 | and app_code = ? 
37 |                     and datetime(current_timestamp, 'localtime') < expired_at
38 |             ''', platform_code, app_code)
39 |             if access_token is None:
40 |                 access_token, expired_at = refresher(token)
41 |                 credential = meta.record('''
42 |                     insert or replace into credentials
43 |                     (platform_code, app_code, access_token, expired_at)
44 |                     values
45 |                     (?, ?, ?, ?)
46 |                     returning *
47 |                 ''', platform_code, app_code, access_token, expired_at, constructor=Credential._make)
48 |                 logger.info('刷新凭证(%s, %s)', platform_code, app_code)
49 |                 logger.debug('credential=%s', credential)
50 |             return access_token
51 | 
52 |     return authenticator
53 | 
--------------------------------------------------------------------------------
/docs/feishu-bitable-getting-started.md:
--------------------------------------------------------------------------------
1 | 飞书多维表格配置
2 | ====
3 | 
4 | 本文档介绍如何在飞书开放平台上创建应用,以及关联授权该应用通过接口编辑多维表格。
5 | 
6 | ## 前置准备
7 | 
8 | 1. 首先进入[飞书主页](https://www.feishu.cn)注册一个飞书账号。
9 | 2. 然后创建一个新的企业(不需要认证);或者加入已注册的企业,并获得开发者权限。
10 | 3. 接着可以按照下文的步骤创建企业应用并关联多维表格。
11 | 
12 | ## 一、创建企业应用
13 | 
14 | 1. 进入[飞书开发者后台](https://open.feishu.cn/app)页面,如下图所示。
15 | 2. 点击『创建企业自建应用』按钮,填写信息并点击『创建』。
16 | 
17 | ![创建企业自建应用](https://raw.githubusercontent.com/redraiment/duckcp/master/docs/app-create.png)
18 | 
19 | ## 二、添加应用权限
20 | 
21 | 新创建的应用没有任何权限,duckcp同步数据需要调用以下三个接口:
22 | 
23 | 1. [查询记录接口](https://open.feishu.cn/document/docs/bitable-v1/app-table-record/search)
24 | 2. [新增多条记录接口](https://open.feishu.cn/document/server-docs/docs/bitable-v1/app-table-record/batch_create)
25 | 3. [删除多条记录接口](https://open.feishu.cn/document/server-docs/docs/bitable-v1/app-table-record/batch_delete)
26 | 
27 | 若当前应用尚未获得相关的权限,右上角『权限配置』选项卡就会出现红色感叹号,如下图所示:
28 | 
29 | ![接口列表](https://raw.githubusercontent.com/redraiment/duckcp/master/docs/app-apis.png)
30 | 
31 | 切换至『权限配置』选项卡,选中状态是『未开通』的选项,点击右上角的『批量开通』——如下图所示——并在弹出窗口中点击『开通』
32 | 
33 | ![批量开通权限](https://raw.githubusercontent.com/redraiment/duckcp/master/docs/app-grant-authorities.png)
34 | 
35 | **特别注意**:开通权限前先确定当前使用的Access Token是否为『Tenant Access Token』;若不是,点击『切换为tenant_access_token』按钮。如下图所示:
36 | 
37 | ![切换访问凭证类型](https://raw.githubusercontent.com/redraiment/duckcp/master/docs/app-tenant-access-token.png)
38 | 
39 | 在开放平台后台的应用详情页面中,切换到『开发配置』-『权限管理』,能看到当前应用申请的所有权限。如下图所示:
40 | 
41 | ![权限管理](https://raw.githubusercontent.com/redraiment/duckcp/master/docs/app-permissions.png)
42 | 
43 | ## 三、创建应用版本
44 | 
45 | 应用需要创建新版本并发布,申请的权限才能生效。
46 | 
47 | 在开放平台后台的应用详情页面中,切换到『应用发布』-『版本管理与发布』,再点击右上角的『创建版本』按钮,就能新建版本。如下图所示:
48 | 
49 | ![创建应用版本](https://raw.githubusercontent.com/redraiment/duckcp/master/docs/app-version.png)
50 | 
51 | ## 四、创建多维表格
52 | 
53 | 1. 进入[飞书企业云盘](https://www.feishu.cn/product/drive)产品页面。
54 | 2. 点击『新建』-『多维表格』按钮。
55 | 3. 在弹出的窗口里点击『新建空白表格』按钮。
56 | 
57 | ## 五、添加文档应用
58 | 
59 | 1. 点击右上角『...』按钮。
60 | 2. 进入『... 更多』菜单。
61 | 3. 
点击『添加文档应用』。 62 | 63 | ![添加文档应用](https://raw.githubusercontent.com/redraiment/duckcp/master/docs/bitable-add-app.png) 64 | 65 | 在弹出的『文档应用』窗口中搜索上面创建的企业应用名称,并为其添加『可编辑』权限。如下图所示: 66 | 67 | ![搜索应用](https://raw.githubusercontent.com/redraiment/duckcp/master/docs/bitable-search-app.png) 68 | -------------------------------------------------------------------------------- /duckcp/helper/http.py: -------------------------------------------------------------------------------- 1 | """ 2 | HTTP客户端帮助函数。 3 | """ 4 | import logging 5 | from typing import Any 6 | from urllib.parse import urlparse, parse_qs, urlencode, urlunparse 7 | from urllib.request import urlopen, Request 8 | 9 | from duckcp.helper.serialization import json_decode, json_encode 10 | 11 | logger = logging.getLogger(__name__) 12 | 13 | 14 | class HttpClientError(Exception): 15 | """ 16 | HTTP客户端执行失败时,抛出本异常。 17 | """ 18 | 19 | def __init__(self, url: str, message: str): 20 | super().__init__(f'请求{url}失败:{message}') 21 | 22 | 23 | def request(body: Request, timeout: float = 10) -> Any: 24 | with urlopen(body, timeout=timeout) as response: 25 | if response.status // 100 == 2: # 2xx 26 | data = response.read().decode() 27 | return json_decode(data) 28 | else: 29 | raise HttpClientError(body.full_url, response.status) 30 | 31 | 32 | def _append_search(url: str, query: dict) -> str: 33 | """ 34 | 将查询条件合并到URL中。 35 | """ 36 | if query is not None: 37 | uri = urlparse(url) 38 | query = {**parse_qs(uri.query), **query} # 合并参数,并且用新参数覆盖已有参数 39 | search = urlencode(query, doseq=True) 40 | return str(urlunparse(uri._replace(query=search))) 41 | else: 42 | return url 43 | 44 | 45 | def get(url: str, headers: dict = None, query: dict = None, timeout: float = 10) -> Any: 46 | """ 47 | 发起HTTP GET请求。 48 | """ 49 | logger.debug('url=%s, headers=%s, params=%s, timeout=%s', url, headers, query, timeout) 50 | 51 | if headers is None: 52 | headers = {} 53 | url = _append_search(url, query) 54 | 55 | return request(Request(url, method='GET', headers=headers), timeout) 56 | 57 | 58 | def post(url: str, headers: dict = None, query: dict = None, params: dict = None, timeout: float = 10) -> Any: 59 | """ 60 | 发起HTTP POST请求。 61 | """ 62 | logger.debug('url=%s, headers=%s, params=%s, timeout=%s', url, headers, params, timeout) 63 | if headers is None: 64 | headers = {} 65 | if 'Content-Type' not in headers: 66 | headers['Content-Type'] = 'application/json; charset=utf-8' 67 | url = _append_search(url, query) 68 | data = json_encode(params).encode() if params is not None else None 69 | 70 | return request(Request(url, method='POST', headers=headers, data=data), timeout) 71 | -------------------------------------------------------------------------------- /duckcp/transform/bitable_transform.py: -------------------------------------------------------------------------------- 1 | """ 2 | 数据迁移至多维表格,原理如下: 3 | 1. 在来源仓库上执行SQL。 4 | 2. 对比查询结果与快照是否一致。 5 | 3. 若快照已失效,则清空多维表格的记录。 6 | 4. 清空多维表格后,再添加新记录到多维表格。 7 | 5. 
记录保存完成后,讲记录编码保存至本地缓存。 8 | """ 9 | import logging 10 | from typing import Any 11 | 12 | from duckcp.entity.statement import Statement 13 | from duckcp.entity.storage import Storage 14 | from duckcp.feishu.bitable import batch_delete, batch_create, Record 15 | from duckcp.helper.digest import sha256 16 | from duckcp.repository.bitable_repository import BiTableRepository 17 | from duckcp.service import snapshot_service 18 | 19 | logger = logging.getLogger(__name__) 20 | 21 | 22 | def digest(records: list[dict[str, Any]]) -> str: 23 | """ 24 | 计算记录的摘要。 25 | """ 26 | content = '\n'.join([ 27 | f'{name}:{value}' 28 | for record in records 29 | for name, value in sorted(record.items()) 30 | ]) 31 | return sha256(content) 32 | 33 | 34 | def bitable_transform(statement: Statement, repository: BiTableRepository, storage: Storage): 35 | """ 36 | 将数据源迁移到多维表格中。 37 | """ 38 | authenticator = repository.authenticator 39 | logger.debug('storage=%s', storage) 40 | document = storage.properties['document'] 41 | table = storage.properties['table'] 42 | logger.debug('document=%s, table=%s', document, table) 43 | 44 | # 1. 获取数据并计算摘要 45 | records = statement.all() 46 | checksum = digest(records) 47 | logger.debug('records=%s, checksum=%s', records, checksum) 48 | 49 | # 2. 对比快照 50 | snapshot = snapshot_service.snapshot_find(storage.id) 51 | if snapshot is not None and snapshot.checksum != checksum and bool(snapshot.records): 52 | # 快照失效,先清空历史数据 53 | batch_delete(authenticator(), document, table, snapshot.records) 54 | logger.info('清空飞书文档(%s)多维表格(%s)', document, table) 55 | 56 | # 3. 保存数据 57 | if snapshot is None or snapshot.checksum != checksum and bool(records): 58 | # 无快照或快照失效,则添加数据 59 | records = batch_create(authenticator(), document, table, [ 60 | Record(fields=record) 61 | for record in records 62 | ]) 63 | snapshot_service.take_snapshot(storage.id, checksum, [record['record_id'] for record in records]) 64 | logger.info('保存飞书文档(%s)多维表格(%s)记录%s条', document, table, len(records)) 65 | else: 66 | logger.info('飞书文档(%s)多维表格(%s)数据未变化', document, table) 67 | -------------------------------------------------------------------------------- /duckcp/entity/statement.py: -------------------------------------------------------------------------------- 1 | from typing import Self, Sequence, Any, Optional 2 | 3 | from pandas import DataFrame 4 | 5 | from duckcp.entity.executor import Executor 6 | from duckcp.typing.record_constructor_protocol import RecordConstructorProtocol 7 | 8 | 9 | class Statement: 10 | """ 11 | 语句:预置SQL语句的执行器。 12 | """ 13 | executor: Executor 14 | sql: str 15 | 16 | def __init__(self, executor: Executor, sql: str): 17 | self.executor = executor 18 | self.sql = sql 19 | 20 | def __enter__(self) -> Self: 21 | """ 22 | 生命周期开始。 23 | """ 24 | return self 25 | 26 | def __exit__(self, exception_class, exception, traceback): 27 | """ 28 | 生命周期结束。 29 | """ 30 | self.close() 31 | 32 | def close(self): 33 | """ 34 | 关闭本次会话/游标。 35 | """ 36 | self.executor.close() 37 | 38 | def batch(self, parameters: list[Sequence[Any]]): 39 | """ 40 | 批量执行语句,无返回结果。 41 | """ 42 | self.executor.batch(self.sql, parameters) 43 | 44 | def execute(self, *parameters: Any) -> tuple[list[str], list[Sequence[Any]]]: 45 | """ 46 | 执行单条语句,返回原始的结果。 47 | - 头信息:列名和类型。 48 | - 记录:原始数据。 49 | """ 50 | return self.executor.execute(self.sql, *parameters) 51 | 52 | def __call__(self, *parameters: Any) -> DataFrame: 53 | """ 54 | 执行查询语句,返回DataFrame结构。 55 | """ 56 | return self.executor(self.sql, *parameters) 57 | 58 | def records[T: tuple[Any, 
...]](self, *parameters: Any, constructor: RecordConstructorProtocol[T] = None) -> list[T]: 59 | """ 60 | 执行查询语句,返回NamedTuple形式的记录数据。 61 | """ 62 | return self.executor.records(self.sql, *parameters, constructor=constructor) 63 | 64 | def record[T: tuple[Any, ...]](self, *parameters: Any, constructor: RecordConstructorProtocol[T] = None) -> Optional[T]: 65 | """ 66 | 执行查询语句,并返回第一行NamedTuple形式的记录数据。 67 | """ 68 | return self.executor.record(self.sql, *parameters, constructor=constructor) 69 | 70 | def value(self, *parameters: Any) -> Optional[Any]: 71 | """ 72 | 执行查询语句,并返回第一行第一列数据。 73 | """ 74 | return self.executor.value(self.sql, *parameters) 75 | 76 | def values(self, *parameters: Any) -> list[Any]: 77 | """ 78 | 执行查询语句,并返回所有行的第一列数据。 79 | """ 80 | return self.executor.values(self.sql, *parameters) 81 | 82 | def all(self, *parameters: Any) -> list[dict[str, Any]]: 83 | """ 84 | 执行查询语句,返回字典结构的记录。 85 | """ 86 | return self.executor.all(self.sql, *parameters) 87 | 88 | def one(self, *parameters: Any) -> dict[str, Any]: 89 | """ 90 | 执行查询语句,返回字典结构的第一行记录。 91 | """ 92 | return self.executor.one(self.sql, *parameters) 93 | -------------------------------------------------------------------------------- /duckcp/configuration/logging_configuration.py: -------------------------------------------------------------------------------- 1 | from logging.config import dictConfig 2 | from os import makedirs 3 | from os.path import exists, dirname 4 | 5 | import click 6 | from rich.text import Text 7 | 8 | 9 | def highlighter(text: Text) -> Text: 10 | """ 11 | 自定义的高亮:根据日志级别展示不同的色彩。 12 | """ 13 | fragments = text.plain.split(' ', 5) 14 | if len(fragments) == 6: 15 | (date, time, level, fn, separator, message) = fragments 16 | if level == 'DEBUG': 17 | style = 'dim black' 18 | elif level == 'INFO': 19 | style = 'cyan' 20 | elif level == 'WARNING': 21 | style = 'yellow' 22 | elif level == 'ERROR': 23 | style = 'bold red' 24 | elif level == 'CRITICAL': 25 | style = 'red on yellow' 26 | else: 27 | style = 'black' 28 | record = f'[dim][cyan]{date} {time}[/cyan] [{style}]{level}[/{style}] {fn} {separator}[/dim] [{style}]{message}[/{style}]' 29 | return Text.from_markup(record) 30 | else: 31 | return text 32 | 33 | 34 | def enable_logging_configuration( 35 | file: str | None, 36 | levels: list[tuple[str, str]], 37 | message_only: bool, 38 | verbose: bool, 39 | quiet: bool 40 | ): 41 | """ 42 | 日志输出配置。 43 | """ 44 | default_level = 'DEBUG' if verbose else 'INFO' 45 | formatter = 'message' if message_only else 'base' 46 | 47 | if quiet: 48 | handler = { 49 | 'class': 'logging.NullHandler' 50 | } 51 | elif file is None or file == '': # 不输出文件,则默认输出到控制台 52 | handler = { 53 | 'class': 'rich.logging.RichHandler', 54 | 'level': default_level, 55 | 'formatter': formatter, 56 | 'highlighter': highlighter, 57 | 'show_time': False, 58 | 'show_level': False, 59 | 'rich_tracebacks': True, 60 | 'tracebacks_suppress': [click], 61 | } 62 | else: 63 | folder = dirname(file) 64 | if not exists(folder): 65 | makedirs(folder) 66 | handler = { 67 | 'class': 'logging.handlers.TimedRotatingFileHandler', 68 | 'level': default_level, 69 | 'formatter': formatter, 70 | 'filename': file, 71 | 'when': 'D', 72 | 'backupCount': 30, 73 | } 74 | 75 | dictConfig({ 76 | 'version': 1, 77 | 'formatters': { 78 | 'message': { 79 | 'format': '%(message)s', 80 | 'datefmt': '%Y-%m-%d %H:%M:%S', 81 | }, 82 | 'base': { 83 | 'format': '%(asctime)s.%(msecs)03d %(levelname)s %(name)s#%(funcName)s : %(message)s', 84 | 'datefmt': '%Y-%m-%d %H:%M:%S', 85 | }, 86 | 
}, 87 | 'handlers': { 88 | 'handler': handler, 89 | }, 90 | 'root': { 91 | 'level': default_level, 92 | 'propagate': False, 93 | 'handlers': ['handler'], 94 | }, 95 | 'loggers': { 96 | 'duckcp': { 97 | 'level': default_level, 98 | 'propagate': False, 99 | 'handlers': ['handler'], 100 | }, 101 | '__main__': { 102 | 'level': default_level, 103 | 'propagate': False, 104 | 'handlers': ['handler'], 105 | }, 106 | **{name: { 107 | 'level': level.upper(), 108 | 'propagate': False, 109 | 'handlers': ['handler'], 110 | } for name, level in levels} 111 | }, 112 | }) 113 | -------------------------------------------------------------------------------- /duckcp/boot/task_command.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from click import help_option, argument, option, INT 4 | from rich.console import Console 5 | from rich.table import Table 6 | 7 | from duckcp.boot import app 8 | from duckcp.service import task_service 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | 13 | @app.group(short_help='管理任务', help='管理任务定时调度的任务') 14 | @help_option('-h', '--help', help='展示帮助信息') 15 | def task(): 16 | pass 17 | 18 | 19 | @task.command('create', help='创建任务') 20 | @argument('name', metavar='NAME') 21 | @help_option('-h', '--help', help='展示帮助信息') 22 | def task_create(name: str): 23 | logger.debug('name=%s', name) 24 | task_service.task_create(name) 25 | 26 | 27 | @task.command('delete', help='删除任务') 28 | @argument('name', metavar='NAME') 29 | @help_option('-h', '--help', help='展示帮助信息') 30 | def task_delete(name: str): 31 | logger.debug('name=%s', name) 32 | task_service.task_delete(name) 33 | 34 | 35 | @task.command('list', help='列出所有任务') 36 | @help_option('-h', '--help', help='展示帮助信息') 37 | def task_list(): 38 | table = Table(title='任务列表') 39 | table.add_column('名称', no_wrap=True) 40 | table.add_column('关联迁移', justify='right') 41 | for row in task_service.task_list(): 42 | table.add_row( 43 | row.code, 44 | str(row.transformers), 45 | ) 46 | 47 | console = Console() 48 | console.print(table) 49 | 50 | 51 | @task.command('execute', help='执行任务') 52 | @argument('name', metavar='NAME') 53 | @help_option('-h', '--help', help='展示帮助信息') 54 | def task_execute(name: str): 55 | logger.debug('name=%s', name) 56 | task_service.task_execute(name) 57 | 58 | 59 | @task.command('bind', help='绑定迁移') 60 | @argument('name', metavar='NAME') 61 | @option('-t', '--transformer', metavar='TRANSFORMER', required=True, help='关联迁移') 62 | @option('-i', '--index', metavar='NUMBER', type=INT, help='执行顺序;默认最后') 63 | @help_option('-h', '--help', help='展示帮助信息') 64 | def task_bind(name: str, transformer: str, index: int): 65 | logger.debug('name=%s, transformer=%s', name, transformer) 66 | task_service.task_bind(name, transformer, index) 67 | 68 | 69 | @task.command('unbind', help='解绑迁移') 70 | @argument('name', metavar='NAME') 71 | @option('-t', '--transformer', metavar='TRANSFORMER', required=True, help='关联迁移') 72 | @help_option('-h', '--help', help='展示帮助信息') 73 | def task_unbind(name: str, transformer: str): 74 | logger.debug('name=%s, transformer=%s', name, transformer) 75 | task_service.task_unbind(name, transformer) 76 | 77 | 78 | @task.command('transformers', help='列出所有迁移') 79 | @argument('name', required=False, metavar='NAME') 80 | @help_option('-h', '--help', help='展示帮助信息') 81 | def task_transformer_list(name: str): 82 | logger.debug('name=%s', name) 83 | table = Table(title='任务迁移列表') 84 | if name is None: 85 | table.add_column('任务名称', no_wrap=True) 86 | 
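# 未指定任务名称时才额外展示『任务名称』列;以下各列在两种场景下通用。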
table.add_column('执行顺序', justify='right') 87 | table.add_column('来源仓库类型') 88 | table.add_column('来源仓库编码') 89 | table.add_column('目标仓库类型') 90 | table.add_column('目标仓库编码') 91 | table.add_column('目标存储编码') 92 | table.add_column('迁移脚本') 93 | for row in task_service.task_transformer_list(name): 94 | record = [ 95 | str(row.sort), 96 | row.source_repository_kind, 97 | row.source_repository_code, 98 | row.target_repository_kind, 99 | row.target_repository_code, 100 | row.target_storage_code, 101 | row.script_file 102 | ] 103 | if name is None: 104 | record.insert(0, row.task_code) 105 | table.add_row(*record) 106 | 107 | console = Console() 108 | console.print(table) 109 | -------------------------------------------------------------------------------- /duckcp/helper/fs.py: -------------------------------------------------------------------------------- 1 | """ 2 | 文件系统帮助函数。 3 | """ 4 | import logging 5 | from functools import lru_cache 6 | from os import makedirs, unlink, getcwd, chdir 7 | from os.path import abspath, expanduser, expandvars, exists, join, dirname 8 | from platform import system 9 | from shutil import rmtree, move 10 | 11 | logger = logging.getLogger(__name__) 12 | 13 | 14 | class UnsupportedPlatformError(Exception): 15 | """ 16 | 当前操作系统不支持时,抛出本异常 17 | """ 18 | 19 | def __init__(self, platform: str): 20 | super().__init__(f'不支持操作系统`{platform}`') 21 | 22 | 23 | # 文件 24 | 25 | 26 | def move_file(source: str, target: str): 27 | """ 28 | 移动文件。 29 | """ 30 | if exists(source): 31 | logger.info('移动文件(%s)至(%s)', source, target) 32 | move(source, target) 33 | 34 | 35 | def remove_file(file: str): 36 | """ 37 | 删除已存在的文件。 38 | """ 39 | if exists(file): 40 | logger.info('删除文件(%s)', file) 41 | unlink(file) 42 | 43 | 44 | def slurp(file: str) -> str: 45 | """ 46 | 读取文件内容。 47 | """ 48 | with open(file) as f: 49 | return f.read() 50 | 51 | 52 | def spit(file: str, content: str): 53 | """ 54 | 写入文件内容。 55 | """ 56 | with open(file, 'w') as f: 57 | return f.write(content) 58 | 59 | 60 | # 路径 61 | 62 | class WorkDirectory: 63 | """ 64 | 临时切换当前工作目录。 65 | """ 66 | origin: str # 当前工作目录 67 | directory: str # 目标工作目录 68 | 69 | def __init__(self, directory: str): 70 | self.origin = getcwd() 71 | self.directory = absolute_path(directory) 72 | 73 | def __enter__(self): 74 | """ 75 | 将工作目录临时切换到指定的目录 76 | """ 77 | chdir(self.directory) 78 | 79 | def __exit__(self, exception_class, exception, tracebacks): 80 | """ 81 | 将工作目录临时切换回原始目录 82 | """ 83 | chdir(self.origin) 84 | 85 | 86 | def absolute_path(path: str) -> str: 87 | """ 88 | 将路径转成绝对路径。 89 | """ 90 | return abspath(expanduser(expandvars(path))) 91 | 92 | 93 | def ensure_folder(path: str, parent: bool = False) -> str: 94 | """ 95 | 确保指定的文件夹或父级文件夹存在。 96 | """ 97 | folder = dirname(path) if parent else path 98 | if not exists(folder): 99 | logger.info('创建目录(%s)', folder) 100 | makedirs(folder) 101 | return path 102 | 103 | 104 | def remove_folder(folder: str): 105 | """ 106 | 递归地删除目录。 107 | """ 108 | if exists(folder): 109 | logger.info('删除目录(%s)', folder) 110 | rmtree(folder) 111 | 112 | 113 | # XDG 114 | 115 | @lru_cache() 116 | def xdg(folder: str, *path: str) -> str: 117 | """ 118 | XDG Base Directory Specification。 119 | """ 120 | file = join(absolute_path(folder), *path) 121 | parent = dirname(file) 122 | if not exists(parent): 123 | makedirs(parent) 124 | return file 125 | 126 | 127 | @lru_cache() 128 | def cache(*path: str) -> str: 129 | """ 130 | 基于缓存目录的路径。 131 | """ 132 | match system(): 133 | case 'Linux': 134 | folder = f'~/.cache' 135 | case 
'Darwin': 136 | folder = f'~/Library/Caches' 137 | case 'Windows': 138 | folder = f'%LOCALAPPDATA%\\Temp' 139 | case _: 140 | raise UnsupportedPlatformError(system()) 141 | return xdg(folder, *path) 142 | 143 | 144 | @lru_cache() 145 | def config(*path: str) -> str: 146 | """ 147 | 基于配置目录的路径。 148 | """ 149 | match system(): 150 | case 'Linux': 151 | folder = f'~/.config' 152 | case 'Darwin': 153 | folder = f'~/Library/Application Support' 154 | case 'Windows': 155 | folder = f'%LOCALAPPDATA%' 156 | case _: 157 | raise UnsupportedPlatformError(system()) 158 | return xdg(folder, *path) 159 | -------------------------------------------------------------------------------- /duckcp/boot/transformer_command.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from click import help_option, argument, option, Choice 4 | from rich.console import Console 5 | from rich.table import Table 6 | 7 | from duckcp.boot import app 8 | from duckcp.repository import RepositoryKind 9 | from duckcp.service import transformer_service 10 | 11 | logger = logging.getLogger(__name__) 12 | 13 | 14 | @app.group(help='管理迁移') 15 | @help_option('-h', '--help', help='展示帮助信息') 16 | def transformer(): 17 | pass 18 | 19 | 20 | @transformer.command('create', help='创建迁移') 21 | @argument('name', metavar='NAME') 22 | @option('-s', '--source', metavar='REPOSITORY', required=True, help='来源仓库') 23 | @option('-t', '--target', metavar='REPOSITORY', required=True, help='目标仓库') 24 | @option('-o', '--storage', metavar='STORAGE', required=True, help='目标存储单元') 25 | @option('-f', '--script', metavar='FILE', required=True, help='迁移脚本') 26 | @help_option('-h', '--help', help='展示帮助信息') 27 | def transformer_create(name: str, source: str, target: str, storage: str, script: str): 28 | logger.debug('name=%s, source=%s, target=%s, storage=%s, script=%s', name, source, target, storage, script) 29 | transformer_service.transformer_create(name, source, target, storage, script) 30 | 31 | 32 | @transformer.command('update', help='更新迁移信息') 33 | @argument('name', metavar='NAME') 34 | @option('-s', '--source', metavar='REPOSITORY', help='来源仓库') 35 | @option('-t', '--target', metavar='REPOSITORY', help='目标仓库') 36 | @option('-o', '--storage', metavar='STORAGE', help='目标存储单元') 37 | @option('-f', '--script', metavar='FILE', help='迁移脚本') 38 | @help_option('-h', '--help', help='展示帮助信息') 39 | def transformer_update(name: str, source: str, target: str, storage: str, script: str): 40 | logger.debug('name=%s, source=%s, target=%s, storage=%s, script=%s', name, source, target, storage, script) 41 | transformer_service.transformer_update(name, source, target, storage, script) 42 | 43 | 44 | @transformer.command('delete', help='删除迁移;更新作业') 45 | @argument('name', metavar='NAME') 46 | @help_option('-h', '--help', help='展示帮助信息') 47 | def transformer_delete(name: str): 48 | logger.debug('name=%s', name) 49 | transformer_service.transformer_delete(name) 50 | 51 | 52 | @transformer.command('list', help='列出所有迁移') 53 | @option('--source-kind', type=Choice(RepositoryKind.codes()), help='来源仓库类型') 54 | @option('--source-repository', metavar='REPOSITORY', help='来源仓库名称') 55 | @option('--target-kind', type=Choice(RepositoryKind.codes()), help='目标仓库类型') 56 | @option('--target-repository', metavar='REPOSITORY', help='目标仓库名称') 57 | @option('--target-storage', metavar='STORAGE', help='目标存储单元') 58 | @help_option('-h', '--help', help='展示帮助信息') 59 | def transformer_list( 60 | source_kind: str, 61 | source_repository: str, 62 | 
target_kind: str, 63 | target_repository: str, 64 | target_storage: str, 65 | ): 66 | table = Table(title='迁移列表') 67 | table.add_column('名称', no_wrap=True) 68 | table.add_column('来源类型') 69 | table.add_column('来源仓库') 70 | table.add_column('目标类型') 71 | table.add_column('目标仓库') 72 | table.add_column('存储单元') 73 | table.add_column('迁移脚本') 74 | table.add_column('关联任务', justify='right') 75 | for row in transformer_service.transformer_list(source_kind, source_repository, target_kind, target_repository, target_storage): 76 | table.add_row( 77 | row.code, 78 | row.source_repository_kind, 79 | row.source_repository_code, 80 | row.target_repository_kind, 81 | row.target_repository_code, 82 | row.target_storage_code, 83 | row.script_file, 84 | str(row.tasks), 85 | ) 86 | 87 | console = Console() 88 | console.print(table) 89 | 90 | 91 | @transformer.command('execute', help='执行迁移') 92 | @argument('name', metavar='NAME') 93 | @help_option('-h', '--help', help='展示帮助信息') 94 | def transformer_execute(name: str): 95 | logger.debug('name=%s', name) 96 | transformer_service.transformer_execute(name) 97 | -------------------------------------------------------------------------------- /duckcp/entity/executor.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import sqlite3 3 | from collections import namedtuple 4 | from typing import Self, Any, Sequence, Optional 5 | 6 | from pandas import DataFrame 7 | 8 | from duckcp.typing.cursor_protocol import CursorProtocol 9 | from duckcp.typing.record_constructor_protocol import RecordConstructorProtocol 10 | 11 | logger = logging.getLogger(__name__) 12 | 13 | 14 | class Executor: 15 | """ 16 | 执行器:执行SQL语句。 17 | """ 18 | cursor: CursorProtocol 19 | 20 | def __init__(self, cursor: CursorProtocol): 21 | self.cursor = cursor 22 | 23 | def __enter__(self) -> Self: 24 | """ 25 | 生命周期开始。 26 | """ 27 | return self 28 | 29 | def __exit__(self, exception_class, exception, traceback): 30 | """ 31 | 生命周期结束。 32 | """ 33 | self.close() 34 | 35 | def close(self): 36 | """ 37 | 关闭本次会话/游标。 38 | """ 39 | self.cursor.close() 40 | 41 | def batch(self, sql: str, parameters: list[Sequence[Any]]): 42 | """ 43 | 批量执行语句,无返回结果。 44 | """ 45 | logger.debug('sql=%s, parameters=%s', sql, parameters) 46 | self.cursor.executemany(sql, parameters) 47 | 48 | def execute(self, sql: str, *parameters: Any) -> tuple[list[str], list[Sequence[Any]]]: 49 | """ 50 | 执行单条语句,返回原始的结果。 51 | - 头信息:列名和类型。 52 | - 记录:原始数据。 53 | """ 54 | logger.debug('sql=%s, parameters=%s', sql, parameters) 55 | if not parameters: # 勿删:不同类型Cursor中参数默认值不同,无法统一处理 56 | parameters = [] if isinstance(self.cursor, sqlite3.Cursor) else None 57 | self.cursor.execute(sql, parameters) 58 | columns = [column[0] for column in self.cursor.description] if self.cursor.description else [] 59 | records = self.cursor.fetchall() 60 | return columns, records 61 | 62 | def __call__(self, sql: str, *parameters: Any) -> DataFrame: 63 | """ 64 | 执行查询语句,返回DataFrame结构。 65 | """ 66 | columns, records = self.execute(sql, *parameters) 67 | return DataFrame(records, columns=columns) 68 | 69 | def records[T: tuple[Any, ...]](self, sql: str, *parameters: Any, constructor: RecordConstructorProtocol[T] = None) -> list[T]: 70 | """ 71 | 执行查询语句,返回NamedTuple形式的记录数据。 72 | """ 73 | columns, records = self.execute(sql, *parameters) 74 | if constructor is None: 75 | record_class = namedtuple('_DuckCP_Record', columns) 76 | return [record_class(*record) for record in records] 77 | else: 78 | return [constructor(record) 
for record in records] 79 | 80 | def record[T: tuple[Any, ...]](self, sql: str, *parameters: Any, constructor: RecordConstructorProtocol[T] = None) -> Optional[T]: 81 | """ 82 | 执行查询语句,并返回第一行NamedTuple形式的记录数据。 83 | """ 84 | records = self.records(sql, *parameters, constructor=constructor) 85 | return records[0] if records else None 86 | 87 | def value(self, sql: str, *parameters: Any) -> Optional[Any]: 88 | """ 89 | 执行查询语句,并返回第一行第一列数据。 90 | """ 91 | record = self.record(sql, *parameters) 92 | return record[0] if record else None 93 | 94 | def values(self, sql: str, *parameters: Any) -> list[Any]: 95 | """ 96 | 执行查询语句,并返回所有行的第一列数据。 97 | """ 98 | return [record[0] for record in self.records(sql, *parameters)] 99 | 100 | def all(self, sql: str, *parameters: Any) -> list[dict[str, Any]]: 101 | """ 102 | 执行查询语句,返回字典结构的记录。 103 | """ 104 | columns, records = self.execute(sql, *parameters) 105 | return [ 106 | {column: value for column, value in zip(columns, record)} 107 | for record in records 108 | ] 109 | 110 | def one(self, sql: str, *parameters: Any) -> dict[str, Any]: 111 | """ 112 | 执行查询语句,返回字典结构的第一行记录。 113 | """ 114 | records = self.all(sql, *parameters) 115 | return records[0] if records else None 116 | -------------------------------------------------------------------------------- /duckcp/repository/__init__.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from collections.abc import Sequence 3 | from enum import Enum 4 | from typing import Any, NamedTuple 5 | 6 | from duckcp.repository.bitable_repository import BiTableRepository 7 | from duckcp.repository.duckdb_repository import DuckDBRepository 8 | from duckcp.repository.file_repository import FileRepository 9 | from duckcp.repository.odps_repository import OdpsRepository 10 | from duckcp.repository.postgres_repository import PostgresRepository 11 | from duckcp.repository.sqlite_repository import SqliteRepository 12 | from duckcp.transform.bitable_transform import bitable_transform 13 | from duckcp.transform.database_transform import database_transform 14 | from duckcp.transform.duckdb_transform import duckdb_transform 15 | from duckcp.transform.file_transform import file_transform 16 | from duckcp.typing.transform_type import Transform 17 | 18 | logger = logging.getLogger(__name__) 19 | 20 | 21 | class RepositoryKind(Enum): 22 | """ 23 | 仓库类型。用于管理不同类型仓库的选项. 
24 | """ 25 | Postgres = ( 26 | 'postgres', 27 | PostgresRepository, 28 | ['database'], 29 | ['table'], 30 | database_transform, 31 | ) 32 | Odps = ( 33 | 'odps', 34 | OdpsRepository, 35 | ['end_point', 'project', 'access_key', 'access_secret'], 36 | ['table'], 37 | database_transform, 38 | ) 39 | BiTable = ( 40 | 'bitable', 41 | BiTableRepository, 42 | ['access_key', 'access_secret'], 43 | ['document', 'table'], 44 | bitable_transform, 45 | ) 46 | DuckDB = ( 47 | 'duckdb', 48 | DuckDBRepository, 49 | ['file'], 50 | ['table'], 51 | duckdb_transform, 52 | ) 53 | Sqlite = ( 54 | 'sqlite', 55 | SqliteRepository, 56 | ['file'], 57 | ['table'], 58 | database_transform, 59 | ) 60 | File = ( 61 | 'file', 62 | FileRepository, 63 | ['folder'], 64 | ['file'], 65 | file_transform, 66 | ) 67 | 68 | @staticmethod 69 | def codes() -> list[str]: 70 | """ 71 | 仓库类型的编码:用于交互选择仓库类型。 72 | """ 73 | return [kind.code for kind in RepositoryKind] 74 | 75 | @staticmethod 76 | def of(code: str) -> 'RepositoryKind': 77 | """ 78 | 根据编码获取仓库类型。 79 | """ 80 | for kind in RepositoryKind: 81 | if kind.code == code: 82 | return kind 83 | else: 84 | raise ValueError(f'仓库类型({code})不支持') 85 | 86 | @staticmethod 87 | def ensure(code: str): 88 | """ 89 | 确认支持仓库编码对应的仓库类型。 90 | """ 91 | RepositoryKind.of(code) 92 | 93 | @property 94 | def code(self) -> str: 95 | """ 96 | 当前仓库类型的唯一编码。 97 | """ 98 | return self.value[0] 99 | 100 | @property 101 | def repository(self) -> type[NamedTuple]: 102 | """ 103 | 当前类型的仓库。 104 | """ 105 | return self.value[1] 106 | 107 | @property 108 | def required_connection_options(self) -> list[str]: 109 | """ 110 | 当前类型仓库必要的连接选项。 111 | """ 112 | return self.value[2] 113 | 114 | def ensure_connection_properties(self, properties: dict[str, Any]): 115 | """ 116 | 确保仓库类型需要的连接选项有提供。 117 | """ 118 | for name in self.required_connection_options: 119 | option = f'--{name.replace("_", "-")}' 120 | if not bool(properties.get(name)): 121 | raise AssertionError(f'{self.code}类型仓库缺少`{option}`') 122 | 123 | @property 124 | def required_medium_options(self) -> list[str]: 125 | """ 126 | 当前类型仓库的存储单元中必要的介质选项。 127 | """ 128 | return self.value[3] 129 | 130 | def ensure_medium_properties(self, properties: dict[str, Any]): 131 | """ 132 | 确保仓库类型的存储单元需要的介质选项有提供。 133 | """ 134 | for name in self.required_medium_options: 135 | option = f'--{name.replace("_", "-")}' 136 | if not bool(properties.get(name)): 137 | raise AssertionError(f'{self.code}类型仓库的存储缺少`{option}`') 138 | 139 | @property 140 | def transform(self) -> Transform: 141 | """ 142 | 当前类型仓库的迁移函数。 143 | """ 144 | return self.value[4] 145 | 146 | 147 | def repository_constructor[T: tuple](record: Sequence[Any]) -> T: 148 | """ 149 | 根据记录的仓库类型,创建对应类型的仓库。 150 | """ 151 | logger.debug('record=%s', record) 152 | kind = RepositoryKind.of(record[1]) 153 | repository_class = kind.repository 154 | return repository_class._make(record) 155 | -------------------------------------------------------------------------------- /duckcp/feishu/bitable.py: -------------------------------------------------------------------------------- 1 | """ 2 | 多维表格接口 3 | """ 4 | import logging 5 | from typing import TypedDict, NotRequired, Any 6 | 7 | from duckcp.feishu import OPEN_API, FeiShuError 8 | from duckcp.helper import http 9 | from duckcp.helper.collection import chunk 10 | 11 | LIST_FIELDS_API = f'{OPEN_API}/bitable/v1/apps/{{document}}/tables/{{table}}/fields' 12 | LIST_RECORDS_API = f'{OPEN_API}/bitable/v1/apps/{{document}}/tables/{{table}}/records/search' 13 | BATCH_CREATE_API = 
f'{OPEN_API}/bitable/v1/apps/{{document}}/tables/{{table}}/records/batch_create' 14 | BATCH_DELETE_API = f'{OPEN_API}/bitable/v1/apps/{{document}}/tables/{{table}}/records/batch_delete' 15 | logger = logging.getLogger(__name__) 16 | 17 | 18 | class Record(TypedDict): 19 | """ 20 | 多维表格单行记录。 21 | """ 22 | record_id: NotRequired[str] # 记录编号:创建时可忽略。 23 | fields: dict[str, Any] # 字段数据。 24 | 25 | 26 | class Field(TypedDict): 27 | """ 28 | 多维表格字段。 29 | """ 30 | field_id: str 31 | field_name: str 32 | type: int 33 | ui_type: str 34 | is_primary: bool 35 | is_hidden: bool 36 | description: str 37 | 38 | 39 | class Page[T](TypedDict): 40 | """ 41 | 分页查询结果 42 | """ 43 | has_more: bool 44 | page_token: str 45 | total: int 46 | items: list[T] 47 | 48 | 49 | class BatchDeleteRecord(TypedDict): 50 | """ 51 | 批量删除结果响应。 52 | """ 53 | deleted: bool # 是否删除成功。 54 | record_id: str # 记录编号。 55 | 56 | 57 | class Batch[T](TypedDict): 58 | """ 59 | 批量操作返回的结果。 60 | """ 61 | records: list[T] # 记录集。 62 | 63 | 64 | class Response[T](TypedDict): 65 | """ 66 | 操作结果响应。 67 | """ 68 | code: int # 错误编码。 69 | msg: str # 错误消息。 70 | data: T # 结果。 71 | 72 | 73 | def list_fields(access_token: str, document: str, table: str) -> list[Field]: 74 | url = LIST_FIELDS_API.format(document=document, table=table) 75 | response: Response[Page[Field]] = http.get(url, headers={'Authorization': f'Bearer {access_token}'}, query={'page_size': 100}) 76 | if response['code'] == 0: 77 | return response['data']['items'] 78 | else: 79 | raise FeiShuError('获取字段', response.get('msg', '')) 80 | 81 | 82 | def list_records(access_token: str, document: str, table: str) -> list[Record]: 83 | """ 84 | 分页查询所有记录。 85 | """ 86 | url = LIST_RECORDS_API.format(document=document, table=table) 87 | headers = {'Authorization': f'Bearer {access_token}'} 88 | query: dict[str, Any] = {'page_size': 500} 89 | records: list[Record] = [] 90 | while True: 91 | response: Response[Page[Record]] = http.post(url, headers=headers, query=query, params={}) 92 | if response['code'] == 0: 93 | records.extend(response['data']['items']) 94 | if response['data']['has_more']: 95 | query['page_token'] = response['data']['page_token'] 96 | else: 97 | break 98 | else: 99 | raise FeiShuError('获取记录', response.get('msg', '')) 100 | return records 101 | 102 | 103 | def batch_create(access_token: str, document: str, table: str, records: list[Record]) -> list[Record]: 104 | """ 105 | 批量创建记录。 106 | @param access_token: 访问凭证。 107 | @param document: 多维文档编号。 108 | @param table: 多维表格编号。 109 | @param records: 记录编号集合。 110 | @return: 成功创建的记录。 111 | """ 112 | url = BATCH_CREATE_API.format(document=document, table=table) 113 | result: list[Record] = [] 114 | for bucket in chunk(records, 1000): 115 | response: Response[Batch[Record]] = http.post(url, headers={'Authorization': f'Bearer {access_token}'}, params={'records': bucket}) 116 | if response['code'] == 0: 117 | result.extend(response['data']['records']) 118 | else: 119 | raise FeiShuError('批量创建', response.get('msg', '')) 120 | logger.debug('rows=%s', len(result)) 121 | return result 122 | 123 | 124 | def batch_delete(access_token: str, document: str, table: str, records: list[str]) -> list[str]: 125 | """ 126 | 批量删除记录。 127 | @param access_token: 访问凭证。 128 | @param document: 多维文档编号。 129 | @param table: 多维表格编号。 130 | @param records: 记录编号集合。 131 | @return: 删除成功的记录编号集合。 132 | """ 133 | url = BATCH_DELETE_API.format(document=document, table=table) 134 | result = [] 135 | for bucket in chunk(records, 500): 136 | response: 
Response[Batch[BatchDeleteRecord]] = http.post(url, headers={'Authorization': f'Bearer {access_token}'}, params={'records': bucket}) 137 | if response['code'] == 0: 138 | result.extend([record['record_id'] for record in response['data']['records'] if record['deleted']]) 139 | else: 140 | raise FeiShuError('批量删除', response.get('msg', '')) 141 | logger.debug('rows=%s', len(result)) 142 | return result 143 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | DuckCP 2 | ===== 3 | 4 | 同步数据的小工具,支持以下类型的数据源之间同步数据: 5 | 6 | - PostgreSQL数据库:以及兼容的数据库。例如Hologres。 7 | - MaxCompute数据库:即ODPS。 8 | - SQLite数据库:文件型OLTP数据库。 9 | - DuckDB数据库:文件型OLAP数据库。 10 | - 本地文件:基于DuckDB实现,支持CSV、Parquet、JSON三种格式的读写。 11 | - 飞书多维表格:基于DuckDB实现。多维表格被映射成DuckDB的只读View,可执行SQL查询。 12 | 13 | ## 一、功能简介 14 | 15 | DuckCP按照以下步骤同步数据: 16 | 17 | 1. 连接数据源,并执行SQL查询数据。 18 | 2. **清空**目标存储单元里的存量数据。 19 | 3. 将查询结果批量保存到目标存储单元内。 20 | 21 | ## 二、安装与使用 22 | 23 | * 安装方法:`pip install duckcp==0.1.3` 24 | * 使用方法:`duckcp -h` 25 | 26 | ## 三、案例演示 27 | 28 | 假设在`data`目录下有一个`programmers.csv`文件,保存了每位程序员使用的编程语言。内容如下: 29 | 30 | | id | name | language | 31 | |----|---------|------------| 32 | | 1 | Joe | Java | 33 | | 2 | Alice | JavaScript | 34 | | 3 | Leon | C/C++ | 35 | | 4 | William | Java | 36 | | 5 | James | C/C++ | 37 | | 6 | Enson | C/C++ | 38 | 39 | 本案例将演示从上述CSV文件中读取数据,统计使用各种编程语言的程序员人数,并将统计结果保存到飞书多维表格中,表格包含以下两个字段: 40 | 41 | 1. 编程语言:文本类型。 42 | 2. 程序员人数:整数类型。 43 | 44 | 如下图所示: 45 | 46 | ![多维表格字段](https://raw.githubusercontent.com/redraiment/duckcp/master/docs/bitable-fields.png) 47 | 48 | 多维表格将自动基于统计结果绘制图表,如下图所示: 49 | 50 | ![多维表格图表](https://raw.githubusercontent.com/redraiment/duckcp/master/docs/bitable-chart.png) 51 | 52 | 开始后续的操作之前,请先参考[飞书多维表格配置](https://github.com/redraiment/duckcp/blob/main/docs/feishu-bitable-getting-started.md)创建飞书企业自建应用,并且添加多维表格的文档应用。 53 | 54 | ### 3.1 初始化 55 | 56 | DuckCP在本地SQLite3数据库中管理以下元信息: 57 | 58 | - 数据仓库(Repository):定义各类数据库的连接信息。 59 | - 存储单元(Storage):定义数据仓库内的存储单元。例如数据库的表、目录下的文件等。 60 | - 迁移任务(Transformer):定义来源仓库、目标仓库、迁移脚本(SQL)等迁移信息。 61 | - 迁移作业(Task):定义可同时执行的迁移任务,及其执行顺序。 62 | 63 | 使用以下命令创建元信息数据库: 64 | 65 | ```shell 66 | duckcp meta create 67 | ``` 68 | 69 | 元信息数据库默认保存在以下路径: 70 | 71 | * Linux系统:`$HOME/.config/com.yinfn.duckcp/configuration.db` 72 | * macOS系统:`$HOME/Library/Application\ Support/com.yinfn.duckcp/configuration.db` 73 | * Windows系统:`%LOCALAPPDATA%\com.yinfn.duckcp\configuration.db` 74 | 75 | 可以通过全局选项`-c/--config-file`指定数据库文件路径: 76 | 77 | ```shell 78 | duckcp -c meta create 79 | ``` 80 | 81 | ### 3.2 创建数据仓库 82 | 83 | 本案例中需要创建两个数据仓库: 84 | 85 | 1. 文件类型(kind=`file`)数据仓库:即前文中数据文件所在的目录`data`。取名为『文件仓库』。 86 | 2. 多维表格(kind=`bitable`)数据仓库:可管理多维表格的飞书开放平台应用。取名为『多维表格』。 87 | 88 | 命令如下: 89 | 90 | ```shell 91 | duckcp repository create 文件仓库 -k file --folder data 92 | duckcp repository create 多维表格 -k bitable --access-key --access-secret 93 | ``` 94 | 95 | 其中文件类型(`-k file`)仓库的连接选项: 96 | 97 | - `--folder `:CSV等数据文件所在的目录。本例中『FOLDER』为『data』。 98 | 99 | 多维表格类型(`-k bitable`)仓库的连接选项: 100 | 101 | - `--access-key `:飞书开放平台中应用凭证的『App ID』。 102 | - `--access-secret `:飞书开放平台中应用凭证的『App Secret』。 103 | 104 | 飞书开放平台上创建应用并凭证的获取方式步骤如下: 105 | 106 | 1. 进入[飞书开发者后台](https://open.feishu.cn/app)页面。 107 | 2. 点击『创建企业自建应用』按钮,填写信息并点击『创建』。 108 | 3. 
进入应用详情页面后,点击左侧『凭证与基础信息』菜单,可看到如下图所示的应用凭证。 109 | 110 | ![飞书应用凭证](https://raw.githubusercontent.com/redraiment/duckcp/master/docs/feishu-open-platform-credentials.png) 111 | 112 | 不同类型的仓库连接选项不一样,细节请参见`duckcp repository create -h`。 113 | 114 | ### 3.3 创建存储单元 115 | 116 | 作为数据来源的『文件仓库』不需要创建存储单元;而作为存储目标的数据仓库『多维表格』则必须创建具体的『存储单元』,即数据最终保存在仓库内的哪张表或哪个文件。 117 | 118 | 本例中,数据最终保存到多维表格『程序员分布表』数据表中。创建存储单元的方法如下: 119 | 120 | ```shell 121 | duckcp storage create 程序员分布表 -r 多维表格 --document <DOCUMENT> --table <TABLE>
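# 其中 <DOCUMENT> 为多维表格文档编码,<TABLE> 为数据表编码,获取方法见下文。
# 示例(编码取自下文 URL 中的演示值,实际使用时请替换为自己的编码):
# duckcp storage create 程序员分布表 -r 多维表格 --document D3yhboIwZazNERsGfDscLt5onee --table tblrfAQHyWUlNG1q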
122 | ``` 123 | 124 | 存储介质选项包括: 125 | 126 | - `-r/--repository <REPOSITORY>`:指定所属的数据仓库。本例中『REPOSITORY』为『多维表格』。 127 | - `--document <DOCUMENT>`:飞书多维表格文档的编码。 128 | - `--table <TABLE>
`:飞书多维表格数据表的编码。 129 | 130 | 飞书多维表格文档与数据表的编码获取步骤如下: 131 | 132 | 1. 进入[飞书企业云盘](https://www.feishu.cn/product/drive)产品页面。 133 | 2. 点击『进入飞书』按钮。 134 | 3. 点击『新建』-『多维表格』按钮。 135 | 4. 此时会新的页面,其中URL类似: 136 | 137 | ``` 138 | https://yinfn-tech.feishu.cn/base/D3yhboIwZazNERsGfDscLt5onee?table=tblrfAQHyWUlNG1q&view=vewwQlhsgf 139 | ``` 140 | 141 | 其中: 142 | 143 | - `/base/`之后的路径参数就是文档编码:即`D3yhboIwZazNERsGfDscLt5onee`。 144 | - 查询参数`table`的值就是数据表编码,即`tblrfAQHyWUlNG1q`。。 145 | 146 | ### 3.4 创建迁移 147 | 148 | 创建迁移之前,首先需要创建一个SQL迁移脚本。在本例中我在`data`目录下创建了一个`迁移脚本.sql`文件,内容如下: 149 | 150 | ```sql 151 | select 152 | "language" as "编程语言", 153 | count(*) as "程序员人数" 154 | from 155 | read_csv('programmers.csv') 156 | group by 157 | "language" 158 | order by 159 | "程序员人数" desc 160 | ``` 161 | 162 | DuckCP的文件类型仓库本质上是一个临时的DuckDB数据库,因此读取CSV使用DuckDB内置的`read_csv`函数。 163 | 164 | 接着可以创建迁移,指定从文件数据源中用迁移脚本读取数据,并保存至多维表格的数据表中。方法如下: 165 | 166 | ```shell 167 | duckcp transformer create 数据统计 -s 文件仓库 -t 多维表格 -o 程序员分布表 -f data/迁移脚本.sql 168 | ``` 169 | 170 | 其中选项包括: 171 | 172 | - `-s/--source REPOSITORY`:指定来源数据仓库。本例中『REPOSITORY』为『文件仓库』。 173 | - `-t/--target REPOSITORY`:指定目标数据仓库。本例中『REPOSITORY』为『多维表格』。 174 | - `-o/--storage STORAGE`:指定目标存储单元。本例中『STORAGE』为『多维表格』的『程序员分布表』。 175 | - `-f/--script FILE`:指定迁移脚本,用于从来源数据仓库内读取数据和加工数据。本例中『FILE』为『data/迁移脚本.sql』。 176 | 177 | ### 3.5 执行迁移 178 | 179 | 最后,可以执行前文创建的迁移。方法如下: 180 | 181 | ```shell 182 | duckcp transformer execute 数据统计 183 | ``` 184 | 185 | ## 问题反馈 186 | 187 | DuckCP在2023年9月开始在公司内部使用,前后重写超过6次,近期(2025年6月)才开始筹备开源。 188 | 代码难免有错误和不足,欢迎在Github上提交问题,或发邮件至 redraiment@gmail.com 交流。 189 | 190 | 感谢! 191 | -------------------------------------------------------------------------------- /duckcp/helper/sql.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from collections.abc import Sequence, Mapping 3 | from numbers import Number 4 | from typing import Optional, Any 5 | 6 | from sqlglot import parse, Expression 7 | from sqlglot.dialects.duckdb import DuckDB 8 | from sqlglot.expressions import With, CTE, Table, Create, Identifier, From, Delete, Insert, Schema, Values, Tuple, Copy, Literal, CopyParameter, Var, Boolean, Struct, Array, Null, PropertyEQ 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | 13 | def to_expression(instance: Any) -> Expression: 14 | """ 15 | 将Python值转成表达式。 16 | """ 17 | if instance is None: 18 | return Null() 19 | elif isinstance(instance, bool): 20 | return Boolean(this=instance) 21 | elif isinstance(instance, str): 22 | return Literal.string(instance) 23 | elif isinstance(instance, Number): 24 | return Literal.number(instance) 25 | elif isinstance(instance, Sequence): 26 | return Array(expressions=[to_expression(value) for value in instance]) 27 | elif isinstance(instance, Mapping): 28 | return Struct(expressions=[PropertyEQ( 29 | this=Identifier(this=name, quoted=True), 30 | expression=to_expression(value), 31 | ) for name, value in instance.items()]) 32 | else: 33 | raise ValueError(f'未知数据({instance})类型({type(instance)})') 34 | 35 | 36 | def extract_tables(sql: str) -> set[str]: 37 | """ 38 | 从SQL中提取真正的表名,忽略CTE、子查询等临时的表名。 39 | """ 40 | logger.debug('sql=%s', sql) 41 | tables = set() # 真正的表 42 | for statement in parse(sql, dialect=DuckDB) or []: 43 | ignores = set() # 提取CTE别名,忽略这些名字 44 | if expression := statement.find(With): 45 | for cte in expression.find_all(CTE): # 先依次遍历CTE 46 | for table in cte.find_all(Table): 47 | if table.name not in ignores: 48 | tables.add(table.name) 49 | ignores.add(cte.alias) 50 | for 
expression in statement.iter_expressions(): 51 | if not isinstance(expression, With): # 排除With语句;前面已处理 52 | for table in expression.find_all(Table): 53 | if table.name not in ignores: 54 | tables.add(table.name) 55 | return tables 56 | 57 | 58 | def create_or_replace_table( 59 | catalog: Optional[str], 60 | schema: Optional[str], 61 | table: str, 62 | source: str, 63 | ) -> Expression: 64 | """ 65 | 创建DuckDB方言的create or replace table语句。 66 | """ 67 | logger.debug('catalog=%s, schema=%s, table=%s, source=%s', catalog, schema, table, source) 68 | return Create( 69 | this=Table( 70 | this=Identifier(this=table, quoted=True), 71 | db=Identifier(this=schema, quoted=True) if schema else None, 72 | catalog=Identifier(this=catalog, quoted=True) if catalog else None), 73 | kind='table', 74 | replace=True, 75 | expression=From( 76 | this=Table( 77 | this=Identifier(this=source, quoted=True), 78 | db=Identifier(this='main', quoted=False), 79 | catalog=Identifier(this='temp', quoted=False)))) 80 | 81 | 82 | def delete_from( 83 | catalog: Optional[str], 84 | schema: Optional[str], 85 | table: str, 86 | ) -> Expression: 87 | """ 88 | 创建通用的删除表所有数据语句。 89 | """ 90 | logger.debug('catalog=%s, schema=%s, table=%s', catalog, schema, table) 91 | return Delete( 92 | this=Table( 93 | this=Identifier(this=table, quoted=True), 94 | db=Identifier(this=schema, quoted=True) if schema else None, 95 | catalog=Identifier(this=catalog, quoted=True) if catalog else None)) 96 | 97 | 98 | def insert_into( 99 | catalog: Optional[str], 100 | schema: Optional[str], 101 | table: str, 102 | columns: list[str], 103 | ) -> Expression: 104 | """ 105 | 创建通用的新增记录语句。 106 | """ 107 | logger.debug('catalog=%s, schema=%s, table=%s, columns=%s', catalog, schema, table, columns) 108 | return Insert( 109 | this=Schema( 110 | this=Table( 111 | this=Identifier(this=table, quoted=True), 112 | db=Identifier(this=schema, quoted=True) if schema else None, 113 | catalog=Identifier(this=catalog, quoted=True) if catalog else None), 114 | expressions=[Identifier(this=column, quoted=True) for column in columns]), 115 | expression=Values( 116 | expressions=[Tuple( 117 | expressions=['?'] * len(columns) 118 | )])) 119 | 120 | 121 | def copy_to( 122 | table_name: str, 123 | file_name: str, 124 | parameters: dict[str, Any], 125 | ) -> Expression: 126 | """ 127 | 创建DuckDB方言的COPY语句。 128 | """ 129 | return Copy( 130 | this=Table( 131 | this=Identifier(this=table_name, quoted=True), 132 | db=Identifier(this='main', quoted=True), 133 | catalog=Identifier(this='temp', quoted=True)), 134 | files=[Literal.string(file_name)], 135 | params=[ 136 | CopyParameter(this=Var(this=name), expression=to_expression(value)) 137 | for name, value in parameters.items() 138 | ]) 139 | -------------------------------------------------------------------------------- /duckcp/repository/bitable_repository.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from typing import Any, Sequence, cast, NamedTuple 3 | 4 | import duckdb 5 | from duckdb.duckdb import DuckDBPyConnection 6 | from pandas import DataFrame 7 | 8 | from duckcp.configuration import meta_configuration as metadata 9 | from duckcp.entity.connection import Connection 10 | from duckcp.entity.executor import Executor 11 | from duckcp.entity.repository import Repository 12 | from duckcp.feishu import tenant_access_token, bitable 13 | from duckcp.helper.sql import extract_tables 14 | from duckcp.helper.validation import ensure 15 | from 
duckcp.service.authentication_service import Authenticator, authenticate 16 | from duckcp.typing.connection_protocol import ConnectionProtocol 17 | from duckcp.typing.supports_get_item_protocol import SupportsGetItemProtocol 18 | 19 | logger = logging.getLogger(__name__) 20 | 21 | 22 | class BiTable(NamedTuple): 23 | name: str # 在SQL中使用的表名 24 | code: str # 飞书多维表格编码 25 | document_code: str # 飞书多维文档编码 26 | 27 | 28 | class BiTableCursor: 29 | """ 30 | 飞书多维表格游标:增删改等变更操作只影响本地缓存数据;不会同步至多维表格。 31 | """ 32 | cursor: duckdb.DuckDBPyConnection 33 | authenticator: Authenticator # 授信服务。 34 | tables: dict[str, BiTable] 35 | 36 | def __init__(self, cursor: DuckDBPyConnection, authenticator: Authenticator, tables: dict[str, BiTable]): 37 | self.cursor = cursor 38 | self.authenticator = authenticator 39 | self.tables = tables 40 | 41 | @property 42 | def description(self) -> Sequence[SupportsGetItemProtocol]: 43 | return self.cursor.description 44 | 45 | def close(self): 46 | """ 47 | 关闭游标 48 | """ 49 | self.cursor.close() 50 | 51 | def __prepare(self, sql: str): 52 | """ 53 | 根据SQL语句中查询用到的表,动态地从远程多维表格中实时加载最新的数据。 54 | """ 55 | for table_name in extract_tables(sql): 56 | if table := self.tables.get(table_name): 57 | records = bitable.list_records(self.authenticator(), table.document_code, table.code) 58 | frame = DataFrame([{ 59 | **record['fields'], 60 | 'id': record['record_id'], 61 | } for record in records]) 62 | self.cursor.execute(f' set global pandas_analyze_sample = {len(frame)} ') 63 | self.cursor.register(table.name, frame) 64 | 65 | def executemany(self, sql: str, parameters: list[Sequence[Any]]): 66 | """ 67 | 批量执行。 68 | """ 69 | self.__prepare(sql) 70 | self.cursor.executemany(sql, parameters) 71 | 72 | def execute(self, sql: str, parameters: Sequence[Any]): 73 | """ 74 | 单句执行。 75 | """ 76 | self.__prepare(sql) 77 | return self.cursor.execute(sql, parameters) 78 | 79 | def fetchall(self) -> list[Sequence[Any]]: 80 | """ 81 | 获取查询结果。 82 | """ 83 | return self.cursor.fetchall() 84 | 85 | 86 | class BiTableConnection(Connection): 87 | """ 88 | 飞书多维表格连接:获取访问凭证。 89 | """ 90 | authenticator: Authenticator # 授信服务。 91 | tables: dict[str, BiTable] 92 | 93 | def __init__(self, connection: ConnectionProtocol, authenticator: Authenticator, tables: dict[str, BiTable]): 94 | super().__init__(connection) 95 | self.authenticator = authenticator 96 | self.tables = tables 97 | 98 | def executor(self) -> Executor: 99 | """ 100 | 创建新的语句对象,对于执行查询语句。 101 | """ 102 | cursor = cast(DuckDBPyConnection, self.connection.cursor()) 103 | return Executor(BiTableCursor(cursor, self.authenticator, self.tables)) 104 | 105 | 106 | class BiTableRepository(Repository): 107 | """ 108 | 飞书多维表格类型仓库。 109 | """ 110 | 111 | def establish_connection(self) -> DuckDBPyConnection: 112 | """ 113 | 创建DuckDB内存数据库连接。 114 | """ 115 | return duckdb.connect(':memory:') 116 | 117 | @property 118 | def authenticator(self) -> Authenticator: 119 | """ 120 | 飞书开放平台的身份校验器。 121 | """ 122 | ensure(bool(self.properties), '缺少连接参数') 123 | ensure(bool(self.properties.get('access_key')), '缺少访问编码') 124 | access_key = self.properties.get('access_key') 125 | ensure(bool(self.properties.get('access_secret')), '缺少访问密钥') 126 | access_secret = self.properties.get('access_secret') 127 | logger.debug('access_key=%s', access_key) 128 | return authenticate('feishu', access_key, { 129 | 'access_key': access_key, 130 | 'access_secret': access_secret, 131 | }, tenant_access_token) 132 | 133 | def connect(self) -> Connection: 134 | """ 135 | 连接多维表格。 136 | """ 137 | tables = 
{} 138 | if self.id is not None: 139 | with metadata.connect() as meta: 140 | tables = { 141 | table.name: table 142 | for table in meta.records(''' 143 | select 144 | code as name, 145 | properties->>'table' as code, 146 | properties->>'document' as document_code 147 | from 148 | storages 149 | where 150 | repository_id = ? 151 | ''', self.id, constructor=BiTable._make) 152 | } 153 | connection = self.establish_connection() 154 | return BiTableConnection(connection, self.authenticator, tables) 155 | -------------------------------------------------------------------------------- /duckcp/service/storage_service.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from typing import Optional, Any 3 | 4 | from duckcp.configuration import meta_configuration as metadata 5 | from duckcp.entity.snapshot import Snapshot 6 | from duckcp.entity.storage import Storage 7 | from duckcp.helper.validation import ensure 8 | from duckcp.projection.repository_projection import RepositoryProjection 9 | from duckcp.projection.storage_projection import StorageProjection 10 | from duckcp.repository import RepositoryKind 11 | from duckcp.service import repository_service 12 | 13 | logger = logging.getLogger(__name__) 14 | 15 | 16 | def storage_find(repository_code: str, code: str) -> Optional[Storage]: 17 | """ 18 | 根据编码查找存储单元。 19 | """ 20 | with metadata.connect() as meta: 21 | return meta.record(''' 22 | select 23 | storages.* 24 | from 25 | storages 26 | inner join 27 | repositories 28 | on 29 | storages.repository_id = repositories.id 30 | and repositories.code = ? 31 | and storages.code = ? 32 | ''', repository_code, code, constructor=Storage._make) 33 | 34 | 35 | def storage_exists(repository_code: str, code: str) -> bool: 36 | """ 37 | 判断编码对应的存储单元是否已存在。 38 | """ 39 | return storage_find(repository_code, code) is not None 40 | 41 | 42 | def storage_create(repository_code: str, code: str, properties: dict[str, Any]): 43 | """ 44 | 添加仓库的存储单元。 45 | """ 46 | logger.debug('repository_code=%s, code=%s, properties=%s', repository_code, code, properties) 47 | ensure(repository_code is not None, '缺少仓库名称') 48 | ensure(code is not None, f'仓库({repository_code})缺少存储单元名称') 49 | properties = {name: value for name, value in properties.items() if bool(value)} if properties else {} 50 | ensure(bool(properties), f'仓库({repository_code})的存储单元({code})缺少存储参数') 51 | 52 | repository = repository_service.repository_find(repository_code) 53 | ensure(repository is not None, f'仓库({repository_code})不存在') 54 | kind = RepositoryKind.of(repository.kind) 55 | kind.ensure_medium_properties(properties) 56 | ensure(not storage_exists(repository.code, code), f'仓库({repository_code})的存储单元({code})已存在') 57 | 58 | with metadata.connect() as meta: 59 | storage = meta.record(''' 60 | insert into storages 61 | (repository_id, code, properties) 62 | values 63 | (?, ?, ?) 
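-- returning * 返回新插入的行,随后经 Storage._make 构造为实体,仅用于日志输出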
64 | returning * 65 | ''', repository.id, code, properties, constructor=Storage._make) 66 | logger.info('创建仓库(%s)的存储单元(%s)', repository.code, code) 67 | logger.debug('storage=%s', storage) 68 | 69 | 70 | def storage_update(repository_code: str, code: str, properties: dict[str, Any]): 71 | """ 72 | 更新仓库的存储单元信息。 73 | """ 74 | logger.debug('repository_code=%s, code=%s, properties=%s', repository_code, code, properties) 75 | ensure(repository_code is not None, '缺少仓库名称') 76 | ensure(code is not None, f'仓库({repository_code})缺少存储单元名称') 77 | properties = {key: value for key, value in properties.items() if value is not None} if properties else {} 78 | ensure(bool(properties), f'仓库({repository_code})的存储单元({code})缺少更新内容') 79 | 80 | repository = repository_service.repository_find(repository_code) 81 | ensure(repository is not None, f'仓库({repository_code})不存在') 82 | storage = storage_find(repository_code, code) 83 | ensure(storage is not None, f'仓库({repository_code})的存储单元({code})不存在') 84 | kind = RepositoryKind.of(repository.kind) 85 | properties = {**storage.properties, **properties} 86 | kind.ensure_medium_properties(properties) 87 | 88 | with metadata.connect() as meta: 89 | storage = meta.record(''' 90 | update 91 | storages 92 | set 93 | properties = ?, 94 | updated_at = datetime(current_timestamp, 'localtime') 95 | where 96 | id = ? 97 | returning * 98 | ''', { 99 | key: value 100 | for key, value in properties.items() 101 | if bool(value) # 移除手工强制设为空值的项 102 | }, storage.id, constructor=Storage._make) 103 | logger.info('更新仓库(%s)的存储单元(%s)', repository.code, code) 104 | logger.debug('storage=%s', storage) 105 | 106 | 107 | def storage_delete(repository_code: str, code: str): 108 | """ 109 | 删除存储单元。 110 | """ 111 | logger.debug('repository_code=%s, code=%s', repository_code, code) 112 | ensure(repository_code is not None, '缺少仓库名称') 113 | ensure(code is not None, f'仓库({repository_code})缺少存储单元名称') 114 | 115 | repository = repository_service.repository_find(repository_code) 116 | ensure(repository is not None, f'仓库({repository_code})不存在') 117 | storage = storage_find(repository_code, code) 118 | ensure(storage is not None, f'仓库({repository_code})的存储单元({code})不存在') 119 | 120 | with metadata.connect() as meta: 121 | storage = meta.record(''' 122 | delete from storages where id = ? 
returning * 123 | ''', storage.id, constructor=Storage._make) 124 | logger.info('删除仓库(%s)的存储单元(%s)', repository.code, code) 125 | logger.debug('storage=%s', storage) 126 | 127 | 128 | def storage_list( 129 | repository_kind: Optional[str] = None, 130 | repository_code: Optional[str] = None 131 | ) -> list[StorageProjection]: 132 | """ 133 | 列出符合条件的存储单元。 134 | """ 135 | logger.debug('repository_kind=%s, repository_code=%s', repository_kind, repository_code) 136 | 137 | filters = [] 138 | parameters = [] 139 | if repository_kind is not None: 140 | RepositoryKind.ensure(repository_kind) 141 | filters.append('and repositories.kind = ?') 142 | parameters.append(repository_kind) 143 | if repository_code is not None: 144 | filters.append('and repositories.code = ?') 145 | parameters.append(repository_code) 146 | 147 | with metadata.connect() as meta: 148 | return meta.records(f''' 149 | with storages_transformers as ( 150 | select 151 | target_id as storage_id, 152 | count(*) as transformers 153 | from 154 | transformers 155 | group by 156 | target_id 157 | ) 158 | select 159 | repositories.kind as repository_kind, 160 | repositories.code as repository_code, 161 | storages.code, 162 | coalesce(storages_transformers.transformers, 0) as transformers 163 | from 164 | storages 165 | inner join 166 | repositories 167 | on 168 | storages.repository_id = repositories.id 169 | {' '.join(filters)} 170 | left join 171 | storages_transformers 172 | on 173 | storages.id = storages_transformers.storage_id 174 | order by 175 | repositories.kind, 176 | repositories.code, 177 | storages.code 178 | ''', *parameters, constructor=StorageProjection._make) 179 | -------------------------------------------------------------------------------- /duckcp/service/repository_service.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from contextlib import contextmanager 3 | from typing import Any, Optional, Sequence, Iterator 4 | 5 | from duckcp.configuration import meta_configuration as metadata 6 | from duckcp.entity.executor import Executor 7 | from duckcp.entity.repository import Repository 8 | from duckcp.helper.validation import ensure 9 | from duckcp.projection.repository_projection import RepositoryProjection 10 | from duckcp.repository import repository_constructor, RepositoryKind 11 | 12 | logger = logging.getLogger(__name__) 13 | 14 | 15 | def repository_find[T: Repository](code: str) -> Optional[T]: 16 | """ 17 | 根据编码查找仓库。 18 | """ 19 | with metadata.connect() as meta: 20 | return meta.record('select * from repositories where code = ?', code, constructor=repository_constructor) 21 | 22 | 23 | def repository_exists(code: str) -> bool: 24 | """ 25 | 判断编码对应的仓库是否已存在。 26 | """ 27 | return repository_find(code) is not None 28 | 29 | 30 | def repository_storages(code: str) -> int: 31 | """ 32 | 统计仓库的存储数。 33 | """ 34 | with metadata.connect() as meta: 35 | return meta.value(''' 36 | select 37 | count(*) as tables 38 | from 39 | repositories 40 | inner join 41 | storages 42 | on 43 | repositories.id = storages.repository_id 44 | and repositories.code = ? 45 | ''', code) 46 | 47 | 48 | def repository_transformers(code: str) -> int: 49 | """ 50 | 统计仓库的迁移数。 51 | """ 52 | with metadata.connect() as meta: 53 | return meta.value(''' 54 | select 55 | count(*) as transformers 56 | from 57 | repositories 58 | inner join 59 | transformers 60 | on 61 | repositories.id = transformers.source_id 62 | and repositories.code = ? 
63 | ''', code) 64 | 65 | 66 | def repository_create(code: str, kind_code: str, properties: dict[str, Any]): 67 | """ 68 | 创建仓库。 69 | """ 70 | logger.debug('code=%s, kind_code=%s, properties=%s', code, kind_code, properties) 71 | ensure(code is not None, '缺少仓库名称') 72 | ensure(kind_code is not None, f'仓库({code})缺少类型') 73 | kind = RepositoryKind.of(kind_code) 74 | ensure(bool(properties), f'仓库({code})缺少连接参数') 75 | kind.ensure_connection_properties(properties) 76 | ensure(not repository_exists(code), f'仓库({code})已存在') 77 | 78 | with metadata.connect() as meta: 79 | repository = meta.record(''' 80 | insert into repositories 81 | (code, kind, properties) 82 | values 83 | (?, ?, ?) 84 | returning * 85 | ''', code, kind.code, { 86 | key: value 87 | for key, value in properties.items() 88 | if bool(value) 89 | }, constructor=repository_constructor) 90 | logger.info('创建仓库(%s)', code) 91 | logger.debug('repository=%s', repository) 92 | 93 | 94 | def repository_update(code: str, kind_code: str, properties: dict[str, Any]): 95 | """ 96 | 更新仓库信息。 97 | """ 98 | logger.debug('code=%s, kind_code=%s, properties=%s', code, kind_code, properties) 99 | ensure(code is not None, '缺少仓库名称') 100 | properties = {key: value for key, value in properties.items() if value is not None} if properties is not None else {} 101 | ensure(kind_code is not None or bool(properties), f'仓库({code})缺少更新内容') 102 | 103 | repository = repository_find(code) 104 | ensure(repository is not None, f'仓库({code})不存在') 105 | 106 | kind = RepositoryKind.of(kind_code or repository.kind) 107 | properties = {**repository.properties, **properties} 108 | kind.ensure_connection_properties(properties) 109 | 110 | with metadata.connect() as meta: 111 | repository = meta.record(''' 112 | update 113 | repositories 114 | set 115 | kind = ?, 116 | properties = ?, 117 | updated_at = current_timestamp 118 | where 119 | code = ? 120 | returning * 121 | ''', kind.code, { 122 | key: value 123 | for key, value in properties.items() 124 | if bool(value) # 移除手工强制设为空值的项 125 | }, code, constructor=repository_constructor) 126 | logger.info('更新仓库(%s)', code) 127 | logger.debug('repository=%s', repository) 128 | 129 | 130 | def repository_delete(code: str): 131 | """ 132 | 删除仓库。 133 | - 解绑数据表。 134 | - 删除迁移。 135 | """ 136 | logger.debug('code=%s', code) 137 | ensure(code is not None, '缺少仓库名称') 138 | ensure(repository_exists(code), f'仓库({code})不存在') 139 | with metadata.connect() as meta: 140 | repository = meta.record(''' 141 | delete from repositories where code = ? 
returning * 142 | ''', code, constructor=repository_constructor) 143 | logger.info('删除仓库(%s)', code) 144 | logger.debug('repository=%s', repository) 145 | 146 | 147 | def repository_list(kind: Optional[str] = None) -> list[RepositoryProjection]: 148 | """ 149 | 列出符合条件的仓库。 150 | """ 151 | logger.debug('kind=%s', kind) 152 | filters = [] 153 | parameters = [] 154 | if kind is not None: 155 | RepositoryKind.ensure(kind) 156 | filters.append('where repositories.kind = ?') 157 | parameters.append(kind) 158 | 159 | with metadata.connect() as meta: 160 | return meta.records(f''' 161 | with repositories_transformers as ( 162 | select 163 | repositories.id as repository_id, 164 | count(*) as transformers 165 | from 166 | repositories 167 | inner join 168 | transformers 169 | on 170 | repositories.id = transformers.source_id 171 | group by 172 | repositories.id 173 | ), repositories_storages as ( 174 | select 175 | repositories.id as repository_id, 176 | count(*) as storages 177 | from 178 | repositories 179 | inner join 180 | storages 181 | on 182 | repositories.id = storages.repository_id 183 | group by 184 | repositories.id 185 | ) 186 | select 187 | repositories.kind, 188 | repositories.code, 189 | coalesce(repositories_storages.storages, 0) as storages, 190 | coalesce(repositories_transformers.transformers, 0) as transformers 191 | from 192 | repositories 193 | left join 194 | repositories_transformers 195 | on 196 | repositories.id = repositories_transformers.repository_id 197 | left join 198 | repositories_storages 199 | on 200 | repositories.id = repositories_storages.repository_id 201 | {' '.join(filters)} 202 | order by 203 | repositories.kind, 204 | repositories.code 205 | ''', *parameters, constructor=RepositoryProjection._make) 206 | 207 | 208 | @contextmanager 209 | def repository_connect(code: str) -> Iterator[Executor]: 210 | """ 211 | 连接指定的仓库。 212 | """ 213 | logger.debug('code=%s', code) 214 | repository = repository_find(code) 215 | ensure(repository is not None, f'仓库({code})不存在') 216 | 217 | with repository.connect() as connection: 218 | with connection.executor() as executor: 219 | yield executor 220 | 221 | 222 | def repository_execute(code: str, sql: str) -> tuple[list[str], list[Sequence[Any]]]: 223 | """ 224 | 在仓库上执行查询脚本,并返回查询结果。 225 | """ 226 | logger.debug('code=%s, sql=%s', code, sql) 227 | with repository_connect(code) as executor: 228 | return executor.execute(sql) 229 | -------------------------------------------------------------------------------- /duckcp/boot/repository_command.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from os.path import exists 3 | 4 | import click 5 | from click import help_option, option, argument, Choice 6 | from rich.console import Console 7 | from rich.table import Table 8 | 9 | from duckcp.boot import app 10 | from duckcp.helper.fs import slurp, absolute_path 11 | from duckcp.helper.validation import ensure, confirm 12 | from duckcp.repository import RepositoryKind 13 | from duckcp.service import repository_service 14 | 15 | logger = logging.getLogger(__name__) 16 | 17 | 18 | @app.group(help='管理仓库') 19 | @help_option('-h', '--help', help='展示帮助信息') 20 | def repository(): 21 | pass 22 | 23 | 24 | @repository.command('create', help='创建仓库') 25 | @argument('name', metavar='NAME') 26 | @option('-k', '--kind', type=Choice(RepositoryKind.codes()), required=True, help='仓库类型') 27 | # Postgres 28 | @option('--host', metavar='HOST', help='主机;用于[postgres]') 29 | @option('--port', 
type=click.INT, metavar='PORT', help='端口;用于[postgres]') 30 | @option('--database', metavar='DATABASE', help='数据库名;用于[postgres]') 31 | @option('--username', metavar='USERNAME', help='登入用户;用于[postgres]') 32 | @option('--password', metavar='PASSWORD', help='登入密码;用于[postgres]') 33 | # ODPS 34 | @option('--end-point', metavar='END-POINT', help='地址;用于[odps]') 35 | @option('--project', metavar='PROJECT', help='项目;用于[odps]') 36 | # ODPS;BiTable 37 | @option('--access-key', metavar='KEY', help='凭证编码;用于[odps;bitable]') 38 | @option('--access-secret', metavar='SECRET', help='凭证密钥;用于[odps;bitable]') 39 | # DuckDB; SQLite 40 | @option('--file', metavar='FILE', help='文件;用于[duckdb;sqlite]') 41 | # File 42 | @option('--folder', metavar='FOLDER', help='目录;用于[file]') 43 | # Others 44 | @help_option('-h', '--help', help='展示帮助信息') 45 | def repository_create( 46 | name: str, 47 | kind: str, 48 | # Postgres 49 | host: str, 50 | port: int, 51 | database: str, 52 | username: str, 53 | password: str, 54 | # ODPS; BiTable 55 | end_point: str, 56 | project: str, 57 | access_key: str, 58 | access_secret: str, 59 | # DuckDB; SQLite 60 | file: str, 61 | # File 62 | folder: str, 63 | ): 64 | logger.debug( 65 | 'name=%s, kind=%s, host=%s, port=%s, database=%s, username=%s, end_point=%s, project=%s, access_key=%s, file=%s, folder=%s', 66 | name, kind, 67 | host, port, database, username, 68 | end_point, project, access_key, 69 | file, folder, 70 | ) 71 | repository_service.repository_create(name, kind, { 72 | 'host': host or None, 73 | 'port': port or None, 74 | 'database': database or None, 75 | 'username': username or None, 76 | 'password': password or None, 77 | 'end_point': end_point or None, 78 | 'project': project or None, 79 | 'access_key': access_key or None, 80 | 'access_secret': access_secret or None, 81 | 'file': absolute_path(file) if file else None, 82 | 'folder': absolute_path(folder) if folder else None, 83 | }) 84 | 85 | 86 | @repository.command('update', help='更新仓库信息') 87 | @argument('name', metavar='NAME') 88 | @option('-k', '--kind', type=Choice(RepositoryKind.codes()), help='仓库类型') 89 | # Postgres 90 | @option('--host', metavar='HOST', help='主机;用于[postgres]') 91 | @option('--port', type=click.INT, metavar='PORT', help='端口;用于[postgres]') 92 | @option('--database', metavar='DATABASE', help='数据库名;用于[postgres]') 93 | @option('--username', metavar='USERNAME', help='登入用户;用于[postgres]') 94 | @option('--password', metavar='PASSWORD', help='登入密码;用于[postgres]') 95 | # ODPS 96 | @option('--end-point', metavar='END-POINT', help='地址;用于[odps]') 97 | @option('--project', metavar='PROJECT', help='项目;用于[odps]') 98 | # ODPS;BiTable 99 | @option('--access-key', metavar='KEY', help='凭证编码;用于[odps;bitable]') 100 | @option('--access-secret', metavar='SECRET', help='凭证密钥;用于[odps;bitable]') 101 | # DuckDB; SQLite 102 | @option('--file', metavar='FILE', help='文件;用于[duckdb;sqlite]') 103 | # File 104 | @option('--folder', metavar='FOLDER', help='目录;用于[file]') 105 | # Others 106 | @help_option('-h', '--help', help='展示帮助信息') 107 | def repository_update( 108 | name: str, 109 | kind: str, 110 | # Postgres 111 | host: str, 112 | port: int, 113 | database: str, 114 | username: str, 115 | password: str, 116 | # ODPS;BiTable 117 | end_point: str, 118 | project: str, 119 | access_key: str, 120 | access_secret: str, 121 | # DuckDB; SQLite 122 | file: str, 123 | # File 124 | folder: str, 125 | ): 126 | logger.debug( 127 | 'name=%s, kind=%s, host=%s, port=%s, database=%s, username=%s, end_point=%s, project=%s, access_key=%s, file=%s, 
folder=%s', 128 | name, kind, 129 | host, port, database, username, 130 | end_point, project, access_key, 131 | file, folder, 132 | ) 133 | repository_service.repository_update(name, kind, { 134 | 'host': host, 135 | 'port': port, 136 | 'database': database, 137 | 'username': username, 138 | 'password': password, 139 | 'end_point': end_point, 140 | 'project': project, 141 | 'access_key': access_key, 142 | 'access_secret': access_secret, 143 | 'file': absolute_path(file) if file else file, 144 | 'folder': absolute_path(folder) if folder else folder, 145 | }) 146 | 147 | 148 | @repository.command('delete', help='注销仓库;删除迁移') 149 | @argument('name', metavar='NAME') 150 | @option('-y', '--yes', is_flag=True, help='自动同意') 151 | @help_option('-h', '--help', help='展示帮助信息') 152 | def repository_delete(name: str, yes: bool): 153 | logger.debug('name=%s, yes=%s', name, yes) 154 | transformers = repository_service.repository_transformers(name) 155 | if transformers > 0 and not yes and not confirm(f'仓库({name})还有{transformers}个关联迁移,确认删除?'): 156 | return 157 | tables = repository_service.repository_storages(name) 158 | if tables > 0 and not yes and not confirm(f'仓库({name})还有{tables}个关联表,确认删除?'): 159 | return 160 | repository_service.repository_delete(name) 161 | 162 | 163 | @repository.command('list', help='列出所有仓库') 164 | @option('-k', '--kind', type=Choice(RepositoryKind.codes()), help='仓库类型') 165 | @help_option('-h', '--help', help='展示帮助信息') 166 | def repository_list(kind: str): 167 | table = Table(title='仓库列表') 168 | table.add_column('类型', no_wrap=True) 169 | table.add_column('名称', no_wrap=True) 170 | table.add_column('关联存储', justify='right') 171 | table.add_column('关联迁移', justify='right') 172 | for row in repository_service.repository_list(kind): 173 | table.add_row( 174 | row.kind, 175 | row.code, 176 | str(row.storages), 177 | str(row.transformers), 178 | ) 179 | 180 | console = Console() 181 | console.print(table) 182 | 183 | 184 | @repository.command('execute', help='执行脚本') 185 | @argument('name', metavar='NAME') 186 | @option('-f', '--script', metavar='FILE', help='查询数据的SQL脚本文件') 187 | @option('-e', '--expression', metavar='SQL', help='查询数据的SQL') 188 | @help_option('-h', '--help', help='展示帮助信息') 189 | def repository_execute(name: str, script: str, expression: str): 190 | logger.debug('name=%s, script=%s, expression=%s', name, script, expression) 191 | if script is None and expression is None: 192 | raise ValueError('请指定查询SQL的脚本文件或查询语句') 193 | if script is not None and expression is None: 194 | ensure(exists(script), f'脚本文件({script})不存在') 195 | expression = slurp(script) 196 | 197 | table = Table(title=name) 198 | columns, records = repository_service.repository_execute(name, expression) 199 | 200 | for column in columns: 201 | table.add_column(column) 202 | 203 | for record in records: 204 | table.add_row(*map(str, record)) 205 | 206 | console = Console() 207 | console.print(table) 208 | -------------------------------------------------------------------------------- /duckcp/service/task_service.py: -------------------------------------------------------------------------------- 1 | """ 2 | 定时任务调度服务。 3 | """ 4 | import logging 5 | from typing import Optional 6 | 7 | from duckcp.configuration import meta_configuration as metadata 8 | from duckcp.entity.task import Task 9 | from duckcp.entity.task_transformer import TaskTransformer 10 | from duckcp.helper.validation import ensure 11 | from duckcp.projection.task_projection import TaskProjection 12 | from duckcp.projection.task_transformer_projection
import TaskTransformerProjection 13 | from duckcp.service import transformer_service 14 | 15 | logger = logging.getLogger(__name__) 16 | 17 | 18 | def task_find(code: str) -> Optional[Task]: 19 | """ 20 | 根据编码查找任务。 21 | """ 22 | with metadata.connect() as meta: 23 | return meta.record('select * from tasks where code = ?', code, constructor=Task._make) 24 | 25 | 26 | def task_exists(code: str) -> bool: 27 | """ 28 | 判断编码对应的任务是否已存在。 29 | """ 30 | return task_find(code) is not None 31 | 32 | 33 | def task_transformer_find(task_id: int, transformer_id: int) -> TaskTransformer: 34 | """ 35 | 判断任务与迁移是否已绑定。 36 | """ 37 | with metadata.connect() as meta: 38 | return meta.record('select * from tasks_transformers where task_id = ? and transformer_id = ?', task_id, transformer_id, constructor=TaskTransformer._make) 39 | 40 | 41 | def task_transformers(task_id: int) -> int: 42 | """ 43 | 任务关联的迁移数。 44 | """ 45 | with metadata.connect() as meta: 46 | return meta.value('select count(*) as transformers from tasks_transformers where task_id = ?', task_id) 47 | 48 | 49 | def task_create(code: str): 50 | """ 51 | 创建任务。 52 | """ 53 | logger.debug('code=%s', code) 54 | ensure(code is not None, '缺少任务名称') 55 | ensure(not task_exists(code), f'任务({code})已存在') 56 | with metadata.connect() as meta: 57 | task = meta.record(''' 58 | insert into tasks (code) values (?) returning * 59 | ''', code, constructor=Task._make) 60 | logger.info('创建任务(%s)', code) 61 | logger.debug('task=%s', task) 62 | 63 | 64 | def task_delete(code: str): 65 | """ 66 | 删除任务。 67 | """ 68 | logger.debug('code=%s', code) 69 | ensure(code is not None, '缺少任务名称') 70 | ensure(task_exists(code), f'任务({code})不存在') 71 | with metadata.connect() as meta: 72 | task = meta.record(''' 73 | delete from tasks where code = ? returning * 74 | ''', code, constructor=Task._make) 75 | logger.info('删除任务(%s)', code) 76 | logger.debug('task=%s', task) 77 | 78 | 79 | def task_list() -> list[TaskProjection]: 80 | """ 81 | 罗列所有任务。 82 | """ 83 | with metadata.connect() as meta: 84 | return meta.records(''' 85 | with task_transformers as ( 86 | select 87 | task_id, 88 | count(*) as transformers 89 | from 90 | tasks_transformers 91 | group by 92 | task_id 93 | ) 94 | select 95 | tasks.code, 96 | coalesce(task_transformers.transformers, 0) as transformers 97 | from 98 | tasks 99 | left join 100 | task_transformers 101 | on 102 | tasks.id = task_transformers.task_id 103 | group by 104 | tasks.code 105 | order by 106 | tasks.code 107 | ''', constructor=TaskProjection._make) 108 | 109 | 110 | def task_execute(code: str): 111 | """ 112 | 执行迁移任务。 113 | """ 114 | logger.debug('code=%s', code) 115 | ensure(task_exists(code), f'任务({code})不存在') 116 | with metadata.connect() as meta: 117 | for transformer_code in meta.values(''' 118 | select 119 | transformers.code 120 | from 121 | tasks 122 | inner join 123 | tasks_transformers 124 | on 125 | tasks.id = tasks_transformers.task_id 126 | and tasks.code = ? 
127 | inner join 128 | transformers 129 | on 130 | tasks_transformers.transformer_id = transformers.id 131 | order by 132 | tasks_transformers.sort 133 | ''', code): 134 | transformer_service.transformer_execute(transformer_code) 135 | 136 | 137 | def task_bind(code: str, transformer_code: str, sort: int): 138 | """ 139 | 迁移与任务绑定。 140 | """ 141 | logger.debug('code=%s, transformer_code=%s', code, transformer_code) 142 | task = task_find(code) 143 | ensure(task is not None, f'任务({code})不存在') 144 | transformer = transformer_service.transformer_find(transformer_code) 145 | ensure(transformer is not None, f'迁移({transformer_code})不存在') 146 | ensure(task_transformer_find(task.id, transformer.id) is None, f'任务({code})与迁移({transformer_code})已绑定') 147 | with metadata.connect() as meta: 148 | transformers = task_transformers(task.id) 149 | if sort is not None and 0 < sort <= transformers: 150 | meta.execute(''' 151 | update 152 | tasks_transformers 153 | set 154 | sort = sort + 1, 155 | updated_at = datetime(current_timestamp, 'localtime') 156 | where 157 | task_id = ? 158 | and sort >= ? 159 | ''', task.id, sort) 160 | logger.info('后移序号(%s-%s)的迁移', sort, transformers) 161 | else: 162 | sort = transformers + 1 163 | task_transformer = meta.record(''' 164 | insert into tasks_transformers 165 | (task_id, transformer_id, sort) 166 | values 167 | (?, ?, ?) 168 | returning * 169 | ''', task.id, transformer.id, sort) 170 | logger.info('绑定任务(%s)与迁移(%s)', code, transformer_code) 171 | logger.debug('task_transformer=%s', task_transformer) 172 | 173 | 174 | def task_unbind(code: str, transformer_code: str): 175 | """ 176 | 迁移与任务解绑。 177 | """ 178 | logger.debug('code=%s, transformer_code=%s', code, transformer_code) 179 | task = task_find(code) 180 | ensure(task is not None, f'任务({code})不存在') 181 | transformer = transformer_service.transformer_find(transformer_code) 182 | ensure(transformer is not None, f'迁移({transformer_code})不存在') 183 | task_transformer = task_transformer_find(task.id, transformer.id) 184 | ensure(task_transformer is not None, f'任务({code})与迁移({transformer_code})未绑定') 185 | with metadata.connect() as meta: 186 | transformers = task_transformers(task.id) 187 | if task_transformer.sort < transformers: 188 | meta.execute(''' 189 | update 190 | tasks_transformers 191 | set 192 | sort = sort - 1, 193 | updated_at = datetime(current_timestamp, 'localtime') 194 | where 195 | task_id = ? 196 | and sort > ? 197 | ''', task.id, task_transformer.sort) 198 | logger.info('前移序号(%s-%s)的迁移', task_transformer.sort + 1, transformers) 199 | task_transformer = meta.record(''' 200 | delete from tasks_transformers where task_id = ? and transformer_id = ? 
returning * 201 | ''', task.id, transformer.id) 202 | logger.info('解绑任务(%s)与迁移(%s)', code, transformer_code) 203 | logger.debug('task_transformer=%s', task_transformer) 204 | 205 | 206 | def task_transformer_list(code: str) -> list[TaskTransformerProjection]: 207 | """ 208 | 列出任务内所有迁移。 209 | """ 210 | logger.debug('code=%s', code) 211 | filters = [] 212 | parameters = [] 213 | if code is not None: 214 | ensure(task_exists(code), f'任务({code})不存在') 215 | filters.append('and tasks.code = ?') 216 | parameters.append(code) 217 | 218 | with metadata.connect() as meta: 219 | return meta.records(f''' 220 | select 221 | tasks.code as task_code, 222 | tasks_transformers.sort, 223 | transformers.code as transformer_code, 224 | sources.kind as source_repository_kind, 225 | sources.code as source_repository_code, 226 | targets.kind as target_repository_kind, 227 | targets.code as target_repository_code, 228 | storages.code as target_storage_code, 229 | transformers.script_file 230 | from 231 | tasks 232 | inner join 233 | tasks_transformers 234 | on 235 | tasks.id = tasks_transformers.task_id 236 | {' '.join(filters)} 237 | inner join 238 | transformers 239 | on 240 | tasks_transformers.transformer_id = transformers.id 241 | inner join 242 | repositories as sources 243 | on 244 | transformers.source_id = sources.id 245 | inner join 246 | storages 247 | on 248 | transformers.target_id = storages.id 249 | inner join 250 | repositories as targets 251 | on 252 | storages.repository_id = targets.id 253 | order by 254 | tasks.code, 255 | tasks_transformers.sort 256 | ''', *parameters, constructor=TaskTransformerProjection._make) 257 | -------------------------------------------------------------------------------- /duckcp/service/transformer_service.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from os.path import exists 3 | from typing import Optional 4 | 5 | from duckcp.configuration import meta_configuration as metadata 6 | from duckcp.entity.transform_context import TransformContext 7 | from duckcp.entity.transformer import Transformer 8 | from duckcp.helper.fs import absolute_path, slurp 9 | from duckcp.helper.validation import ensure 10 | from duckcp.projection.transformer_projection import TransformerProjection 11 | from duckcp.repository import RepositoryKind 12 | from duckcp.service import repository_service, storage_service 13 | 14 | logger = logging.getLogger(__name__) 15 | 16 | 17 | def transformer_find(code: str) -> Optional[Transformer]: 18 | """ 19 | 根据编码查找迁移。 20 | """ 21 | with metadata.connect() as meta: 22 | return meta.record('select * from transformers where code = ?', code, constructor=Transformer._make) 23 | 24 | 25 | def transformer_exists(code: str) -> bool: 26 | """ 27 | 判断编码对应的迁移是否已存在。 28 | """ 29 | return transformer_find(code) is not None 30 | 31 | 32 | def transformer_create( 33 | code: str, 34 | source_repository_code: str, 35 | target_repository_code: str, 36 | target_storage_code: str, 37 | script_file: str, 38 | ): 39 | """ 40 | 添加迁移。 41 | """ 42 | logger.debug( 43 | 'code=%s, source_repository_code=%s, target_repository_code=%s, target_storage_code=%s, script_file=%s', 44 | code, source_repository_code, 45 | target_repository_code, target_storage_code, 46 | script_file 47 | ) 48 | ensure(code is not None, '缺少迁移名称') 49 | ensure(source_repository_code is not None, f'迁移({code})缺少来源仓库名称') 50 | ensure(target_repository_code is not None, f'迁移({code})缺少目标仓库名称') 51 | ensure(target_storage_code is not None, 
f'迁移({code})缺少目标存储单元') 52 | ensure(script_file is not None, f'迁移({code})缺少迁移脚本') 53 | 54 | ensure(not transformer_exists(code), f'迁移({code})已存在') 55 | repository = repository_service.repository_find(source_repository_code) 56 | ensure(repository is not None, f'来源仓库({source_repository_code})不存在') 57 | storage = storage_service.storage_find(target_repository_code, target_storage_code) 58 | ensure(storage is not None, f'目标仓库({target_repository_code})的存储单元({target_storage_code})不存在') 59 | script_file = absolute_path(script_file) 60 | 61 | with metadata.connect() as meta: 62 | transformer = meta.record(''' 63 | insert into transformers 64 | (code, source_id, target_id, script_file) 65 | values 66 | (?, ?, ?, ?) 67 | returning * 68 | ''', code, repository.id, storage.id, script_file, constructor=Transformer._make) 69 | logger.info('创建迁移(%s)', code) 70 | logger.debug('transformer=%s', transformer) 71 | 72 | 73 | def transformer_update( 74 | code: str, 75 | source_repository_code: str, 76 | target_repository_code: str, 77 | target_storage_code: str, 78 | script_file: str 79 | ): 80 | """ 81 | 更新迁移信息。 82 | """ 83 | logger.debug( 84 | 'code=%s, source_repository_code=%s, target_repository_code=%s, target_storage_code=%s, script_file=%s', 85 | code, source_repository_code, 86 | target_repository_code, target_storage_code, 87 | script_file 88 | ) 89 | ensure(code is not None, '缺少迁移名称') 90 | ensure( 91 | source_repository_code is not None 92 | or (target_repository_code is not None and target_storage_code is not None) 93 | or script_file is not None, 94 | '缺少更新内容' 95 | ) 96 | 97 | transformer = transformer_find(code) 98 | ensure(transformer is not None, f'迁移({code})不存在') 99 | 100 | if source_repository_code is not None: 101 | repository = repository_service.repository_find(source_repository_code) 102 | ensure(repository is not None, f'来源仓库({source_repository_code})不存在') 103 | source_id = repository.id 104 | else: 105 | source_id = transformer.source_id 106 | 107 | if target_repository_code is not None and target_storage_code is not None: 108 | storage = storage_service.storage_find(target_repository_code, target_storage_code) 109 | ensure(storage is not None, f'目标仓库({target_repository_code})的存储单元({target_storage_code})不存在') 110 | target_id = storage.id 111 | else: 112 | target_id = transformer.target_id 113 | 114 | if script_file is not None: 115 | script_file = absolute_path(script_file) 116 | else: 117 | script_file = transformer.script_file 118 | 119 | with metadata.connect() as meta: 120 | transformer = meta.record(''' 121 | update 122 | transformers 123 | set 124 | source_id = ?, 125 | target_id = ?, 126 | script_file = ?, 127 | updated_at = datetime(current_timestamp, 'localtime') 128 | where 129 | code = ? 130 | returning * 131 | ''', source_id, target_id, script_file, code, constructor=Transformer._make) 132 | logger.info('更新迁移(%s)', code) 133 | logger.debug('transformer=%s', transformer) 134 | 135 | 136 | def transformer_delete(code: str): 137 | """ 138 | 删除迁移。 139 | """ 140 | logger.debug('code=%s', code) 141 | ensure(transformer_exists(code), f'迁移({code})不存在') 142 | with metadata.connect() as meta: 143 | transformer = meta.record(''' 144 | delete from transformers where code = ? 
returning * 145 | ''', code, constructor=Transformer._make) 146 | logger.info('删除迁移(%s)', code) 147 | logger.debug('transformer=%s', transformer) 148 | 149 | 150 | def transformer_list( 151 | source_repository_kind: Optional[str] = None, 152 | source_repository_code: Optional[str] = None, 153 | target_repository_kind: Optional[str] = None, 154 | target_repository_code: Optional[str] = None, 155 | target_storage_code: Optional[str] = None, 156 | ) -> list[TransformerProjection]: 157 | """ 158 | 列出所有迁移。 159 | """ 160 | logger.debug( 161 | 'source_repository_kind=%s, source_repository_code=%s, target_repository_kind=%s, target_repository_code=%s, target_storage_code=%s', 162 | source_repository_kind, source_repository_code, 163 | target_repository_kind, target_repository_code, 164 | target_storage_code, 165 | ) 166 | 167 | filters = [] 168 | parameters = [] 169 | if source_repository_kind is not None: 170 | RepositoryKind.ensure(source_repository_kind) 171 | filters.append('and sources.kind = ?') 172 | parameters.append(source_repository_kind) 173 | if source_repository_code is not None: 174 | filters.append('and sources.code = ?') 175 | parameters.append(source_repository_code) 176 | if target_repository_kind is not None: 177 | RepositoryKind.ensure(target_repository_kind) 178 | filters.append('and targets.kind = ?') 179 | parameters.append(target_repository_kind) 180 | if target_repository_code is not None: 181 | filters.append('and targets.code = ?') 182 | parameters.append(target_repository_code) 183 | if target_storage_code is not None: 184 | filters.append('and storages.code = ?') 185 | parameters.append(target_storage_code) 186 | 187 | with metadata.connect() as meta: 188 | return meta.records(f''' 189 | with transformers_tasks as ( 190 | select 191 | transformer_id as id, 192 | count(*) as tasks 193 | from 194 | tasks_transformers 195 | group by 196 | transformer_id 197 | ) 198 | select 199 | transformers.code, 200 | sources.kind as source_repository_kind, 201 | sources.code as source_repository_code, 202 | targets.kind as target_repository_kind, 203 | targets.code as target_repository_code, 204 | storages.code as target_storage_code, 205 | transformers.script_file, 206 | coalesce(transformers_tasks.tasks, 0) as tasks 207 | from 208 | transformers 209 | inner join 210 | repositories as sources 211 | on 212 | transformers.source_id = sources.id 213 | inner join 214 | storages 215 | on 216 | transformers.target_id = storages.id 217 | inner join 218 | repositories as targets 219 | on 220 | storages.repository_id = targets.id 221 | {' '.join(filters)} 222 | left join 223 | transformers_tasks 224 | on 225 | transformers.id = transformers_tasks.id 226 | order by 227 | transformers.code 228 | ''', *parameters, constructor=TransformerProjection._make) 229 | 230 | 231 | # 执行迁移 232 | 233 | def transformer_execute(code: str): 234 | """ 235 | 执行迁移。 236 | """ 237 | logger.debug('code=%s', code) 238 | transformer = transformer_find(code) 239 | ensure(transformer is not None, f'迁移({code})不存在') 240 | ensure(exists(transformer.script_file), f'迁移脚本({transformer.script_file})不存在') 241 | sql = slurp(transformer.script_file) 242 | 243 | with metadata.connect() as meta: 244 | context = meta.record(''' 245 | select 246 | sources.code as source_repository_code, 247 | targets.code as target_repository_code, 248 | storages.code as target_storage_code 249 | from 250 | transformers 251 | inner join 252 | repositories as sources 253 | on 254 | transformers.source_id = sources.id 255 | and transformers.id = ? 
256 | inner join 257 | storages 258 | on 259 | transformers.target_id = storages.id 260 | inner join 261 | repositories as targets 262 | on 263 | storages.repository_id = targets.id 264 | ''', transformer.id, constructor=TransformContext._make) 265 | source_repository = repository_service.repository_find(context.source_repository_code) 266 | target_repository = repository_service.repository_find(context.target_repository_code) 267 | target_storage = storage_service.storage_find(context.target_repository_code, context.target_storage_code) 268 | kind = RepositoryKind.of(target_repository.kind) 269 | 270 | with source_repository.connect() as source_connection: 271 | with source_connection.prepare(sql) as statement: 272 | kind.transform(statement, target_repository, target_storage) 273 | logger.info('从仓库(%s)迁移数据到仓库(%s)的存储单元(%s)', source_repository.code, target_repository.code, target_storage.code) 274 | -------------------------------------------------------------------------------- /duckcp/boot/storage_command.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from typing import Any 3 | 4 | from click import help_option, option, argument, INT, Choice 5 | from rich.console import Console 6 | from rich.table import Table 7 | 8 | from duckcp.boot import app 9 | from duckcp.helper.click import JSON 10 | from duckcp.repository import RepositoryKind 11 | from duckcp.service import storage_service 12 | 13 | logger = logging.getLogger(__name__) 14 | 15 | 16 | @app.group(help='管理存储单元') 17 | @help_option('-h', '--help', help='展示帮助信息') 18 | def storage(): 19 | pass 20 | 21 | 22 | @storage.command('create', help='创建存储单元') 23 | @argument('name', metavar='NAME') 24 | @option('-r', '--repository', metavar='REPOSITORY', help='所属仓库') 25 | # Postgres;DuckDB;ODPS 26 | @option('--catalog', metavar='CATALOG', help='目录;用于[postgres;duckdb;odps]') 27 | # Postgres;DuckDB;ODPS;SQLite 28 | @option('--schema', metavar='SCHEMA', help='模式;用于[postgres;duckdb;odps;sqlite]') 29 | # Postgres;DuckDB;ODPS;SQLite;BiTable 30 | @option('--table', metavar='TABLE', help='表;用于[postgres;duckdb;odps;sqlite;bitable]') 31 | # BiTable 32 | @option('--document', metavar='DOCUMENT', help='多维表格文档;用于[bitable]') 33 | # File 34 | @option('--file', metavar='FILE', help='文件名;用于[file]') 35 | @option('--format', metavar='FORMAT', type=Choice(['csv', 'parquet', 'json']), help='文件格式;用于[file]') 36 | @option('--compression', metavar='ALGORITHM', type=Choice(['gzip', 'zstd', 'snappy', 'brotli', 'lz4', 'lz4_raw']), help='压缩算法;用于[file]') 37 | @option('--compression-level', metavar='LEVEL', type=INT, help='压缩等级;用于[file][parquet]') 38 | @option('--parquet-version', metavar='VERSION', type=Choice(['V1', 'V2']), help='Parquet文件版本;用于[file][parquet]') 39 | @option('--field-ids', metavar='JSON', type=JSON, help='字段的编号;用于[file][parquet]') 40 | @option('--row-group-size', metavar='SIZE', type=INT, help='最大行数;用于[file][parquet]') 41 | @option('--row-group-size-bytes', metavar='SIZE', type=INT, help='每行组最大字节数;用于[file][parquet]') 42 | @option('--row-group-per-file', metavar='SIZE', type=INT, help='每个文件最大的行组数;用于[file][parquet]') 43 | @option('--header/--no-header', is_flag=True, default=None, help='是否包含字段名;用于[file][csv]') 44 | @option('--delimiter', metavar='DELIMITER', help='字段分隔符;用于[file][csv]') 45 | @option('--quote-char', metavar='CHAR', help='引号字符;用于[file][csv]') 46 | @option('--escape-char', metavar='CHAR', help='转义字符;用于[file][csv]') 47 | @option('--null-literal', metavar='LITERAL', 
help='空值字面量;用于[file][csv]') 48 | @option('--force-quote', metavar='COLUMN', multiple=True, help='给指定列强制添加引号;用于[file][csv]') 49 | @option('--prefix', metavar='PREFIX', help='前置内容;用于[file][csv]') 50 | @option('--suffix', metavar='SUFFIX', help='后置内容;用于[file][csv]') 51 | @option('--date-format', metavar='FORMAT', help='日期格式;用于[file][csv;json]') 52 | @option('--timestamp-format', metavar='FORMAT', help='时间戳格式;用于[file][csv;json]') 53 | @option('--array/--no-array', is_flag=True, default=None, help='写入JSON数组;用于[file][json]') 54 | @option('--per-thread-output', is_flag=True, default=None, help='是否每个线程写入独立文件;用于[file][thread]') 55 | @option('--file-size-bytes', metavar='BYTES', type=INT, help='每个独立文件最大字节数;用于[file][thread]') 56 | @option('--partition-by', metavar='COLUMN', multiple=True, help='分区列;用于[file][partition]') 57 | @option('--filename-pattern', metavar='PATTERN', help='文件名模式;用于[file][partition]') 58 | @option('--file-extension', metavar='EXTENSION', help='扩展名;用于[file][partition]') 59 | @option('--write-partition-columns', is_flag=True, default=None, help='是否包含分区列;用于[file][partition]') 60 | @option('--use-tmp-file', is_flag=True, default=None, help='若原始文件存在,是否先写入临时文件;用于[file][partition]') 61 | @option('--delete-before-write', is_flag=True, default=None, help='写入之前删除整个目录;用于[file][partition]') 62 | @option('--overwrite/--no-overwrite', is_flag=True, default=None, help='覆盖已存在的文件;用于[file][partition]') 63 | @option('--append/--no-append', is_flag=True, default=None, help='追加到已存在的文件;用于[file][partition]') 64 | @option('--preserve-order', is_flag=True, default=None, help='是否保留原始顺序;用于[file]') 65 | # Others 66 | @help_option('-h', '--help', help='展示帮助信息') 67 | def storage_create( 68 | name: str, 69 | repository: str, 70 | # Postgres;DuckDB;ODPS 71 | catalog: str, # 目录;用于[postgres;duckdb;odps] 72 | # Postgres;DuckDB;ODPS;SQLite 73 | schema: str, # 模式;用于[postgres;duckdb;odps;sqlite] 74 | # Postgres;DuckDB;ODPS;SQLite;BiTable 75 | table: str, # 表;用于[postgres;duckdb;odps;sqlite;bitable] 76 | # BiTable 77 | document: str, # 多维表格文档;用于[bitable] 78 | # File 79 | file: str, # 文件名;用于[file] 80 | format: str, # 文件格式;用于[file] 81 | compression: str, # 压缩算法;用于[file] 82 | compression_level: int, # 压缩等级;用于[file][parquet] 83 | parquet_version: str, # Parquet文件版本;用于[file][parquet] 84 | field_ids: dict[str, Any], # 字段的编号;用于[file][parquet] 85 | row_group_size: int, # 最大行数;用于[file][parquet] 86 | row_group_size_bytes: int, # 每行组最大字节数;用于[file][parquet] 87 | row_group_per_file: int, # 每个文件最大的行组数;用于[file][parquet] 88 | header: bool, # 是否包含字段名;用于[file][csv] 89 | delimiter: str, # 字段分隔符;用于[file][csv] 90 | quote_char: str, # 引号字符;用于[file][csv] 91 | escape_char: str, # 转义字符;用于[file][csv] 92 | null_literal: str, # 空值字面量;用于[file][csv] 93 | force_quote: list[str], # 给指定列强制添加引号;用于[file][csv] 94 | prefix: str, # 前置内容;用于[file][csv] 95 | suffix: str, # 后置内容;用于[file][csv] 96 | date_format: str, # 日期格式;用于[file][csv;json] 97 | timestamp_format: str, # 时间戳格式;用于[file][csv;json] 98 | array: bool, # 写入JSON数组;用于[file][json] 99 | per_thread_output: bool, # 是否每个线程写入独立文件;用于[file][thread] 100 | file_size_bytes: int, # 每个独立文件最大字节数;用于[file][thread] 101 | partition_by: list[str], # 分区列;用于[file][partition] 102 | filename_pattern: str, # 文件名模式;用于[file][partition] 103 | file_extension: str, # 扩展名;用于[file][partition] 104 | write_partition_columns: bool, # 是否包含分区列;用于[file][partition] 105 | use_tmp_file: bool, # 若原始文件存在,是否先写入临时文件;用于[file][partition] 106 | delete_before_write: bool, # 写入之前删除整个目录;用于[file][partition] 107 | overwrite: bool, # 
覆盖已存在的文件;用于[file][partition] 108 | append: bool, # 追加到已存在的文件;用于[file][partition] 109 | preserve_order: bool, # 是否保留原始顺序;用于[file] 110 | ): 111 | logger.debug( 112 | 'name=%s, repository=%s, catalog=%s, schema=%s, table=%s, document=%s, file=%s, format=%s, compression=%s, compression_level=%s, parquet_version=%s, field_ids=%s, row_group_size=%s, row_group_size_bytes=%s, row_group_per_file=%s, header=%s, delimiter=%s, quote_char=%s, escape_char=%s, null_literal=%s, force_quote=%s, prefix=%s, suffix=%s, date_format=%s, timestamp_format=%s, array=%s, per_thread_output=%s, file_size_bytes=%s, partition_by=%s, filename_pattern=%s, file_extension=%s, write_partition_columns=%s, use_tmp_file=%s, delete_before_write=%s, overwrite=%s, append=%s, preserve_order=%s', 113 | name, repository, catalog, schema, table, document, 114 | file, format, compression, compression_level, 115 | parquet_version, field_ids, row_group_size, row_group_size_bytes, row_group_per_file, 116 | header, delimiter, quote_char, escape_char, null_literal, force_quote, prefix, suffix, 117 | date_format, timestamp_format, array, 118 | per_thread_output, file_size_bytes, 119 | partition_by, filename_pattern, file_extension, write_partition_columns, use_tmp_file, 120 | delete_before_write, overwrite, append, preserve_order, 121 | ) 122 | storage_service.storage_create(repository, name, { 123 | 'catalog': catalog, 124 | 'schema': schema, 125 | 'table': table, 126 | 'document': document, 127 | 'file': file, 128 | 'format': format, 129 | 'compression': compression, 130 | 'compression_level': compression_level, 131 | 'parquet_version': parquet_version, 132 | 'field_ids': field_ids, 133 | 'row_group_size': row_group_size, 134 | 'row_group_size_bytes': row_group_size_bytes, 135 | 'row_group_per_file': row_group_per_file, 136 | 'header': header, 137 | 'delimiter': delimiter, 138 | 'quote': quote_char, 139 | 'escape': escape_char, 140 | 'nullstr': null_literal, 141 | 'force_quote': force_quote, 142 | 'prefix': prefix, 143 | 'suffix': suffix, 144 | 'dateformat': date_format, 145 | 'timestampformat': timestamp_format, 146 | 'array': array, 147 | 'per_thread_output': per_thread_output, 148 | 'file_size_bytes': file_size_bytes, 149 | 'partition_by': partition_by, 150 | 'filename_pattern': filename_pattern, 151 | 'file_extension': file_extension, 152 | 'write_partition_columns': write_partition_columns, 153 | 'use_tmp_file': use_tmp_file, 154 | 'overwrite': delete_before_write, 155 | 'overwrite_or_ignore': overwrite, 156 | 'append': append, 157 | 'preserve_order': preserve_order, 158 | }) 159 | 160 | 161 | @storage.command('update', help='更新存储单元信息') 162 | @argument('name', metavar='NAME') 163 | @option('-r', '--repository', metavar='REPOSITORY', help='所属仓库') 164 | # Postgres;DuckDB;ODPS 165 | @option('--catalog', metavar='CATALOG', help='目录;用于[postgres;duckdb;odps]') 166 | # Postgres;DuckDB;ODPS;SQLite 167 | @option('--schema', metavar='SCHEMA', help='模式;用于[postgres;duckdb;odps;sqlite]') 168 | # Postgres;DuckDB;ODPS;SQLite;BiTable 169 | @option('--table', metavar='TABLE', help='表;用于[postgres;duckdb;odps;sqlite;bitable]') 170 | # BiTable 171 | @option('--document', metavar='DOCUMENT', help='多维表格文档;用于[bitable]') 172 | # File 173 | @option('--file', metavar='FILE', help='文件名;用于[file]') 174 | @option('--format', metavar='FORMAT', type=Choice(['csv', 'parquet', 'json']), help='文件格式;用于[file]') 175 | @option('--compression', metavar='ALGORITHM', type=Choice(['gzip', 'zstd', 'snappy', 'brotli', 'lz4', 'lz4_raw']), help='压缩算法;用于[file]') 176 | 
@option('--compression-level', metavar='LEVEL', type=INT, help='压缩等级;用于[file][parquet]') 177 | @option('--parquet-version', metavar='VERSION', type=Choice(['V1', 'V2']), help='Parquet文件版本;用于[file][parquet]') 178 | @option('--field-ids', metavar='JSON', type=JSON, help='字段的编号;用于[file][parquet]') 179 | @option('--row-group-size', metavar='SIZE', type=INT, help='最大行数;用于[file][parquet]') 180 | @option('--row-group-size-bytes', metavar='SIZE', type=INT, help='每行组最大字节数;用于[file][parquet]') 181 | @option('--row-group-per-file', metavar='SIZE', type=INT, help='每个文件最大的行组数;用于[file][parquet]') 182 | @option('--header/--no-header', is_flag=True, default=None, help='是否包含字段名;用于[file][csv]') 183 | @option('--delimiter', metavar='DELIMITER', help='字段分隔符;用于[file][csv]') 184 | @option('--quote-char', metavar='CHAR', help='引号字符;用于[file][csv]') 185 | @option('--escape-char', metavar='CHAR', help='转义字符;用于[file][csv]') 186 | @option('--null-literal', metavar='LITERAL', help='空值字面量;用于[file][csv]') 187 | @option('--force-quote', metavar='COLUMN', multiple=True, help='给指定列强制添加引号;用于[file][csv]') 188 | @option('--prefix', metavar='PREFIX', help='前置内容;用于[file][csv]') 189 | @option('--suffix', metavar='SUFFIX', help='后置内容;用于[file][csv]') 190 | @option('--date-format', metavar='FORMAT', help='日期格式;用于[file][csv;json]') 191 | @option('--timestamp-format', metavar='FORMAT', help='时间戳格式;用于[file][csv;json]') 192 | @option('--array/--no-array', is_flag=True, default=None, help='写入JSON数组;用于[file][json]') 193 | @option('--per-thread-output', is_flag=True, default=None, help='是否每个线程写入独立文件;用于[file][thread]') 194 | @option('--file-size-bytes', metavar='BYTES', type=INT, help='每个独立文件最大字节数;用于[file][thread]') 195 | @option('--partition-by', metavar='COLUMN', multiple=True, help='分区列;用于[file][partition]') 196 | @option('--filename-pattern', metavar='PATTERN', help='文件名模式;用于[file][partition]') 197 | @option('--file-extension', metavar='EXTENSION', help='扩展名;用于[file][partition]') 198 | @option('--write-partition-columns', is_flag=True, default=None, help='是否包含分区列;用于[file][partition]') 199 | @option('--use-tmp-file', is_flag=True, default=None, help='若原始文件存在,是否先写入临时文件;用于[file][partition]') 200 | @option('--delete-before-write', is_flag=True, default=None, help='写入之前删除整个目录;用于[file][partition]') 201 | @option('--overwrite/--no-overwrite', is_flag=True, default=None, help='覆盖已存在的文件;用于[file][partition]') 202 | @option('--append/--no-append', is_flag=True, default=None, help='追加到已存在的文件;用于[file][partition]') 203 | @option('--preserve-order', is_flag=True, default=None, help='是否保留原始顺序;用于[file]') 204 | # Others 205 | @help_option('-h', '--help', help='展示帮助信息') 206 | def storage_update( 207 | name: str, 208 | repository: str, 209 | # Postgres;DuckDB;ODPS 210 | catalog: str, # 目录;用于[postgres;duckdb;odps] 211 | # Postgres;DuckDB;ODPS;SQLite 212 | schema: str, # 模式;用于[postgres;duckdb;odps;sqlite] 213 | # Postgres;DuckDB;ODPS;SQLite;BiTable 214 | table: str, # 表;用于[postgres;duckdb;odps;sqlite;bitable] 215 | # BiTable 216 | document: str, # 多维表格文档;用于[bitable] 217 | # File 218 | file: str, # 文件名;用于[file] 219 | format: str, # 文件格式;用于[file] 220 | compression: str, # 压缩算法;用于[file] 221 | compression_level: int, # 压缩等级;用于[file][parquet] 222 | parquet_version: str, # Parquet文件版本;用于[file][parquet] 223 | field_ids: dict[str, Any], # 字段的编号;用于[file][parquet] 224 | row_group_size: int, # 最大行数;用于[file][parquet] 225 | row_group_size_bytes: int, # 每行组最大字节数;用于[file][parquet] 226 | row_group_per_file: int, # 每个文件最大的行组数;用于[file][parquet] 227 | header: bool, # 
是否包含字段名;用于[file][csv] 228 | delimiter: str, # 字段分隔符;用于[file][csv] 229 | quote_char: str, # 引号字符;用于[file][csv] 230 | escape_char: str, # 转义字符;用于[file][csv] 231 | null_literal: str, # 空值字面量;用于[file][csv] 232 | force_quote: list[str], # 给指定列强制添加引号;用于[file][csv] 233 | prefix: str, # 前置内容;用于[file][csv] 234 | suffix: str, # 后置内容;用于[file][csv] 235 | date_format: str, # 日期格式;用于[file][csv;json] 236 | timestamp_format: str, # 时间戳格式;用于[file][csv;json] 237 | array: bool, # 写入JSON数组;用于[file][json] 238 | per_thread_output: bool, # 是否每个线程写入独立文件;用于[file][thread] 239 | file_size_bytes: int, # 每个独立文件最大字节数;用于[file][thread] 240 | partition_by: list[str], # 分区列;用于[file][partition] 241 | filename_pattern: str, # 文件名模式;用于[file][partition] 242 | file_extension: str, # 扩展名;用于[file][partition] 243 | write_partition_columns: bool, # 是否包含分区列;用于[file][partition] 244 | use_tmp_file: bool, # 若原始文件存在,是否先写入临时文件;用于[file][partition] 245 | delete_before_write: bool, # 写入之前删除整个目录;用于[file][partition] 246 | overwrite: bool, # 覆盖已存在的文件;用于[file][partition] 247 | append: bool, # 追加到已存在的文件;用于[file][partition] 248 | preserve_order: bool, # 是否保留原始顺序;用于[file] 249 | ): 250 | logger.debug( 251 | 'name=%s, repository=%s, catalog=%s, schema=%s, table=%s, document=%s, file=%s, format=%s, compression=%s, compression_level=%s, parquet_version=%s, field_ids=%s, row_group_size=%s, row_group_size_bytes=%s, row_group_per_file=%s, header=%s, delimiter=%s, quote_char=%s, escape_char=%s, null_literal=%s, force_quote=%s, prefix=%s, suffix=%s, date_format=%s, timestamp_format=%s, array=%s, per_thread_output=%s, file_size_bytes=%s, partition_by=%s, filename_pattern=%s, file_extension=%s, write_partition_columns=%s, use_tmp_file=%s, delete_before_write=%s, overwrite=%s, append=%s, preserve_order=%s', 252 | name, repository, catalog, schema, table, document, 253 | file, format, compression, compression_level, 254 | parquet_version, field_ids, row_group_size, row_group_size_bytes, row_group_per_file, 255 | header, delimiter, quote_char, escape_char, null_literal, force_quote, prefix, suffix, 256 | date_format, timestamp_format, array, 257 | per_thread_output, file_size_bytes, 258 | partition_by, filename_pattern, file_extension, write_partition_columns, use_tmp_file, 259 | delete_before_write, overwrite, append, preserve_order, 260 | ) 261 | storage_service.storage_update(repository, name, { 262 | 'catalog': catalog, 263 | 'schema': schema, 264 | 'table': table, 265 | 'document': document, 266 | 'file': file, 267 | 'format': format, 268 | 'compression': compression, 269 | 'compression_level': compression_level, 270 | 'parquet_version': parquet_version, 271 | 'field_ids': field_ids, 272 | 'row_group_size': row_group_size, 273 | 'row_group_size_bytes': row_group_size_bytes, 274 | 'row_group_per_file': row_group_per_file, 275 | 'header': header, 276 | 'delimiter': delimiter, 277 | 'quote': quote_char, 278 | 'escape': escape_char, 279 | 'nullstr': null_literal, 280 | 'force_quote': force_quote, 281 | 'prefix': prefix, 282 | 'suffix': suffix, 283 | 'dateformat': date_format, 284 | 'timestampformat': timestamp_format, 285 | 'array': array, 286 | 'per_thread_output': per_thread_output, 287 | 'file_size_bytes': file_size_bytes, 288 | 'partition_by': partition_by, 289 | 'filename_pattern': filename_pattern, 290 | 'file_extension': file_extension, 291 | 'write_partition_columns': write_partition_columns, 292 | 'use_tmp_file': use_tmp_file, 293 | 'overwrite': delete_before_write, 294 | 'overwrite_or_ignore': overwrite, 295 | 'append': append, 296 | 
'preserve_order': preserve_order, 297 | }) 298 | 299 | 300 | @storage.command('delete', help='删除存储单元;删除迁移') 301 | @argument('name', metavar='NAME') 302 | @option('-r', '--repository', metavar='REPOSITORY', help='所属仓库') 303 | @help_option('-h', '--help', help='展示帮助信息') 304 | def storage_delete(name: str, repository: str): 305 | logger.debug('name=%s, repository=%s', name, repository) 306 | storage_service.storage_delete(repository, name) 307 | 308 | 309 | @storage.command('list', help='列出所有存储单元') 310 | @option('-k', '--kind', type=Choice(RepositoryKind.codes()), help='仓库类型') 311 | @option('-r', '--repository', metavar='REPOSITORY', help='所属仓库') 312 | @help_option('-h', '--help', help='展示帮助信息') 313 | def storage_list(kind: str, repository: str): 314 | logger.debug('kind=%s, repository=%s', kind, repository) 315 | table = Table(title='存储单元列表') 316 | table.add_column('仓库类型', no_wrap=True) 317 | table.add_column('仓库名称', no_wrap=True) 318 | table.add_column('存储单元', no_wrap=True) 319 | table.add_column('关联迁移', justify='right') 320 | for row in storage_service.storage_list(kind, repository): 321 | table.add_row( 322 | row.repository_kind, 323 | row.repository_code, 324 | row.code, 325 | str(row.transformers), 326 | ) 327 | 328 | console = Console() 329 | console.print(table) 330 | --------------------------------------------------------------------------------
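
The command modules above are thin wrappers over the service layer, so the same flow can be scripted directly in Python. In particular, task_service.task_bind keeps a dense 1-based sort order per task (inserting at an occupied position first shifts the later bindings up by one), and task_service.task_execute then runs the bound transformers in ascending sort. Below is a minimal sketch of that flow, not part of the repository: the task code 'nightly', the transformer codes 'extract-orders' and 'publish-report', the repository code 'orders-db', and the query string are illustrative assumptions, and the metadata database, the repository, and both transformers are assumed to have been created beforehand.

from duckcp.service import repository_service, task_service

# Sketch only: all codes below are assumed to already exist in the metadata store.
task_service.task_create('nightly')

# Append to the end of the task: with sort=None the binding falls through to transformers + 1.
task_service.task_bind('nightly', 'publish-report', None)

# Insert at position 1: existing bindings with sort >= 1 are shifted up before the new row is written.
task_service.task_bind('nightly', 'extract-orders', 1)

# Runs 'extract-orders' first, then 'publish-report', following tasks_transformers.sort.
task_service.task_execute('nightly')

# Ad-hoc query against a registered repository; returns the column names and the result rows,
# which the repository execute command renders as a rich table.
columns, records = repository_service.repository_execute('orders-db', 'select 1 as ok')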