├── CLAUDE.md ├── .python-version ├── .serena ├── .gitignore ├── memories │ └── wiktionary_cli_refactor.md └── project.yml ├── data └── dictionary.sqlite ├── .gitignore ├── src └── open_dictionary │ ├── __init__.py │ ├── llm │ ├── llm_client.py │ ├── define.py │ └── define_enricher.py │ ├── utils │ └── env_loader.py │ ├── wikitionary │ ├── extract.py │ ├── downloader.py │ ├── filter.py │ ├── pipeline.py │ ├── progress.py │ ├── transform.py │ └── pre_process.py │ ├── db │ ├── access.py │ ├── sqlite_manager.py │ ├── cleaner.py │ └── mark_commonness.py │ ├── workflow.py │ └── cli.py ├── pyproject.toml ├── README.md └── AGENTS.md /CLAUDE.md: -------------------------------------------------------------------------------- 1 | AGENTS.md -------------------------------------------------------------------------------- /.python-version: -------------------------------------------------------------------------------- 1 | 3.12 2 | -------------------------------------------------------------------------------- /.serena/.gitignore: -------------------------------------------------------------------------------- 1 | /cache 2 | -------------------------------------------------------------------------------- /data/dictionary.sqlite: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahpxex/open-dictionary/HEAD/data/dictionary.sqlite -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Python-generated files 2 | __pycache__/ 3 | *.py[oc] 4 | build/ 5 | dist/ 6 | wheels/ 7 | *.egg-info 8 | 9 | # Virtual environments 10 | .venv 11 | .env 12 | words.txt 13 | .DS_Store -------------------------------------------------------------------------------- /src/open_dictionary/__init__.py: -------------------------------------------------------------------------------- 1 | from .cli import main as _cli_main 2 | 3 | def importer() -> None: 4 | raise SystemExit(_cli_main()) 5 | 6 | def main() -> None: 7 | raise SystemExit(_cli_main()) 8 | -------------------------------------------------------------------------------- /.serena/memories/wiktionary_cli_refactor.md: -------------------------------------------------------------------------------- 1 | Refactored Open Dictionary CLI: new central src/open_dictionary/cli.py registers commands; streaming logic in wikitionary/transform.py uses shared StreamingProgress from wikitionary/progress.py; pipeline orchestration moved to wikitionary/pipeline.py; filter command supports 'all' languages with progress output; README documents filter usage. 
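For reference, the shared reporter mentioned above is used roughly like this (a minimal sketch based on `wikitionary/progress.py`; the file path, label, and counters are illustrative and not taken from `transform.py`):

```python
# Minimal sketch of the shared StreamingProgress reporter; the path and label
# here are illustrative examples, not the actual transform.py wiring.
from pathlib import Path

from open_dictionary.wikitionary.progress import StreamingProgress

jsonl_path = Path("data/raw-wiktextract-data.jsonl")
progress = StreamingProgress(jsonl_path.stat().st_size, label="Loading")

rows = 0
bytes_read = 0
with jsonl_path.open("rb") as handle:
    for line in handle:
        rows += 1
        bytes_read += len(line)
        progress.report(rows, bytes_read)  # prints only when row/byte/time thresholds pass
progress.finalize(rows, bytes_read)  # forces a final summary line
```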
-------------------------------------------------------------------------------- /src/open_dictionary/llm/llm_client.py: -------------------------------------------------------------------------------- 1 | from openai import OpenAI 2 | from open_dictionary.utils.env_loader import get_env 3 | 4 | client = OpenAI( 5 | # This is the default and can be omitted 6 | api_key=get_env('LLM_KEY'), 7 | base_url=get_env('LLM_API'), 8 | ) 9 | 10 | def get_chat_response(instructions: str, input: str) -> str: 11 | response = client.responses.create( 12 | model=get_env('LLM_MODEL'), # type: ignore 13 | instructions=instructions, 14 | input=input, 15 | temperature=0.1 16 | ) 17 | 18 | return response.output_text -------------------------------------------------------------------------------- /src/open_dictionary/utils/env_loader.py: -------------------------------------------------------------------------------- 1 | from os import getenv 2 | from dotenv import load_dotenv 3 | from typing import Literal 4 | 5 | load_dotenv() 6 | 7 | EnvKey = Literal['LLM_MODEL', 'LLM_KEY', 'LLM_API', 'DATABASE_URL'] 8 | 9 | def get_env(key: EnvKey, default: str | None = None) -> str | None: 10 | """Get environment variable value. 11 | 12 | Args: 13 | key: Environment variable key 14 | default: Default value if key not found 15 | 16 | Returns: 17 | Environment variable value or default 18 | """ 19 | return getenv(key, default) 20 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "open-dictionary" 3 | version = "0.1.0" 4 | description = "Add your description here" 5 | readme = "README.md" 6 | authors = [ 7 | { name = "ahpx", email = "AHpx@yandex.com" } 8 | ] 9 | requires-python = ">=3.12" 10 | dependencies = [ 11 | "dotenv>=0.9.9", 12 | "openai>=2.6.1", 13 | "psycopg[binary]>=3.2,<4", 14 | "python-dotenv>=1.0,<2", 15 | "python-toon>=0.1.2", 16 | "wordfreq>=3.1.1", 17 | ] 18 | 19 | [project.scripts] 20 | open-dictionary = "open_dictionary:main" 21 | 22 | [build-system] 23 | requires = ["uv_build>=0.8.23,<0.9.0"] 24 | build-backend = "uv_build" 25 | -------------------------------------------------------------------------------- /src/open_dictionary/wikitionary/extract.py: -------------------------------------------------------------------------------- 1 | """Extraction helpers for the Wiktionary JSONL archive.""" 2 | 3 | from __future__ import annotations 4 | 5 | import gzip 6 | import sys 7 | from pathlib import Path 8 | 9 | from .progress import ByteProgressPrinter 10 | 11 | 12 | def extract_wiktionary_dump( 13 | source: Path, 14 | destination: Path, 15 | *, 16 | overwrite: bool = False, 17 | chunk_size: int = 32 * 1024 * 1024, 18 | ) -> Path: 19 | """Extract a Wiktionary ``.jsonl.gz`` archive to ``destination``.""" 20 | 21 | source_path = Path(source) 22 | if not source_path.is_file(): 23 | raise FileNotFoundError(f"Source archive {source_path} does not exist") 24 | 25 | dest_path = Path(destination) 26 | if dest_path.exists() and dest_path.is_dir(): 27 | raise IsADirectoryError(f"Destination {dest_path} is a directory") 28 | 29 | if dest_path.exists() and not overwrite: 30 | print(f"Extraction skipped; {dest_path} already exists.", file=sys.stderr) 31 | return dest_path 32 | 33 | dest_path.parent.mkdir(parents=True, exist_ok=True) 34 | 35 | total_size = source_path.stat().st_size 36 | progress = ByteProgressPrinter("Extracting", total_size) 37 | 38 | with 
source_path.open("rb") as raw_handle: 39 | with gzip.GzipFile(fileobj=raw_handle) as gz_handle: 40 | with dest_path.open("wb") as out_handle: 41 | while True: 42 | chunk = gz_handle.read(chunk_size) 43 | if not chunk: 44 | break 45 | out_handle.write(chunk) 46 | progress.report(raw_handle.tell()) 47 | 48 | progress.finalize(total_size) 49 | return dest_path 50 | 51 | 52 | __all__ = ["extract_wiktionary_dump"] 53 | -------------------------------------------------------------------------------- /src/open_dictionary/wikitionary/downloader.py: -------------------------------------------------------------------------------- 1 | """Streaming download helpers for the Wiktionary dataset.""" 2 | 3 | from __future__ import annotations 4 | 5 | import sys 6 | import urllib.error 7 | import urllib.request 8 | from pathlib import Path 9 | 10 | from .progress import ByteProgressPrinter 11 | 12 | 13 | DEFAULT_WIKTIONARY_URL = "https://kaikki.org/dictionary/raw-wiktextract-data.jsonl.gz" 14 | 15 | 16 | def download_wiktionary_dump( 17 | destination: Path, 18 | *, 19 | url: str = DEFAULT_WIKTIONARY_URL, 20 | overwrite: bool = False, 21 | chunk_size: int = 32 * 1024 * 1024, 22 | ) -> Path: 23 | """Download a Wiktionary dump to ``destination`` with streaming progress.""" 24 | 25 | dest_path = Path(destination) 26 | if dest_path.exists() and dest_path.is_dir(): 27 | raise IsADirectoryError(f"Destination {dest_path} is a directory") 28 | 29 | if dest_path.exists() and not overwrite: 30 | print(f"Download skipped; {dest_path} already exists.", file=sys.stderr) 31 | return dest_path 32 | 33 | dest_path.parent.mkdir(parents=True, exist_ok=True) 34 | 35 | downloaded = 0 36 | try: 37 | with urllib.request.urlopen(url) as response: 38 | total_size = int(response.headers.get("Content-Length", "0") or 0) 39 | progress = ByteProgressPrinter("Downloading", total_size) 40 | 41 | with dest_path.open("wb") as out_handle: 42 | while True: 43 | chunk = response.read(chunk_size) 44 | if not chunk: 45 | break 46 | out_handle.write(chunk) 47 | downloaded += len(chunk) 48 | progress.report(downloaded) 49 | 50 | progress.finalize(downloaded) 51 | 52 | except urllib.error.URLError as exc: # pragma: no cover - network failure guard 53 | raise RuntimeError(f"Failed to download Wiktionary dump: {exc}") from exc 54 | return dest_path 55 | 56 | 57 | __all__ = ["DEFAULT_WIKTIONARY_URL", "download_wiktionary_dump"] 58 | -------------------------------------------------------------------------------- /src/open_dictionary/wikitionary/filter.py: -------------------------------------------------------------------------------- 1 | """Business logic for filtering Wiktionary entries into language-specific tables.""" 2 | 3 | from __future__ import annotations 4 | 5 | import sys 6 | from typing import Sequence 7 | 8 | from .transform import partition_dictionary_by_language 9 | 10 | 11 | def filter_languages( 12 | conninfo: str, 13 | *, 14 | source_table: str, 15 | column_name: str, 16 | languages: Sequence[str], 17 | lang_field: str = "lang_code", 18 | table_prefix: str = "dictionary_lang", 19 | target_schema: str | None = None, 20 | drop_existing: bool = False, 21 | ) -> list[str]: 22 | """Create language-specific tables for the requested ``languages`` only.""" 23 | 24 | if not languages: 25 | raise ValueError("At least one language code must be provided.") 26 | 27 | normalized: list[str] = [] 28 | include_all = False 29 | for raw_code in languages: 30 | code = (raw_code or "").strip() 31 | if not code: 32 | continue 33 | if code.lower() == 
"all": 34 | include_all = True 35 | break 36 | normalized.append(code) 37 | 38 | language_list: Sequence[str] | None 39 | if include_all: 40 | print( 41 | f"[filter] Materializing all languages from {source_table}.{column_name}...", 42 | file=sys.stderr, 43 | flush=True, 44 | ) 45 | language_list = None 46 | else: 47 | if not normalized: 48 | raise ValueError("At least one non-empty language code must be provided.") 49 | display_codes = ", ".join(normalized[:5]) 50 | if len(normalized) > 5: 51 | display_codes += ", ..." 52 | print( 53 | ( 54 | f"[filter] Materializing {len(normalized)} language(s) " 55 | f"({display_codes}) from {source_table}.{column_name}..." 56 | ), 57 | file=sys.stderr, 58 | flush=True, 59 | ) 60 | language_list = normalized 61 | 62 | return partition_dictionary_by_language( 63 | conninfo, 64 | source_table=source_table, 65 | column_name=column_name, 66 | lang_field=lang_field, 67 | table_prefix=table_prefix, 68 | target_schema=target_schema, 69 | drop_existing=drop_existing, 70 | languages=language_list, 71 | ) 72 | 73 | 74 | __all__ = [ 75 | "filter_languages", 76 | ] 77 | -------------------------------------------------------------------------------- /src/open_dictionary/wikitionary/pipeline.py: -------------------------------------------------------------------------------- 1 | """Workflow helpers for streaming Wiktionary dumps into PostgreSQL.""" 2 | 3 | from __future__ import annotations 4 | 5 | import sys 6 | import urllib.parse 7 | from pathlib import Path 8 | 9 | from .downloader import DEFAULT_WIKTIONARY_URL, download_wiktionary_dump 10 | from .extract import extract_wiktionary_dump 11 | from .transform import copy_jsonl_to_postgres, partition_dictionary_by_language 12 | 13 | 14 | def run_pipeline( 15 | *, 16 | workdir: Path, 17 | conninfo: str, 18 | table_name: str, 19 | column_name: str, 20 | url: str = DEFAULT_WIKTIONARY_URL, 21 | truncate: bool = False, 22 | skip_download: bool = False, 23 | skip_extract: bool = False, 24 | skip_partition: bool = False, 25 | overwrite_download: bool = False, 26 | overwrite_extract: bool = False, 27 | lang_field: str = "lang_code", 28 | table_prefix: str = "dictionary_lang", 29 | target_schema: str | None = None, 30 | drop_existing_partitions: bool = False, 31 | ) -> None: 32 | """Execute the full download → extract → load → partition workflow.""" 33 | 34 | workdir = Path(workdir) 35 | workdir.mkdir(parents=True, exist_ok=True) 36 | 37 | parsed = urllib.parse.urlparse(url) 38 | filename = Path(parsed.path or "wiktextract.jsonl.gz").name 39 | gz_path = workdir / filename 40 | jsonl_path = gz_path.with_suffix("") 41 | 42 | if not skip_download: 43 | print( 44 | f"Downloading Wiktionary dump from {url} to {gz_path}...", 45 | file=sys.stderr, 46 | ) 47 | download_wiktionary_dump( 48 | gz_path, 49 | url=url, 50 | overwrite=overwrite_download, 51 | ) 52 | else: 53 | print(f"Skipping download step; reusing {gz_path}", file=sys.stderr) 54 | 55 | if not gz_path.exists(): 56 | raise FileNotFoundError(f"Expected archive {gz_path} after download step") 57 | 58 | if not skip_extract: 59 | print( 60 | f"Extracting {gz_path} to {jsonl_path}...", 61 | file=sys.stderr, 62 | ) 63 | extract_wiktionary_dump( 64 | gz_path, 65 | jsonl_path, 66 | overwrite=overwrite_extract, 67 | ) 68 | else: 69 | print(f"Skipping extract step; reusing {jsonl_path}", file=sys.stderr) 70 | 71 | if not jsonl_path.exists(): 72 | raise FileNotFoundError(f"Expected JSONL file {jsonl_path} after extract step") 73 | 74 | rows_copied = copy_jsonl_to_postgres( 75 | 
jsonl_path=jsonl_path, 76 | conninfo=conninfo, 77 | table_name=table_name, 78 | column_name=column_name, 79 | truncate=truncate, 80 | ) 81 | print( 82 | f"Finished loading {rows_copied:,} rows into {table_name}.{column_name}", 83 | file=sys.stderr, 84 | ) 85 | 86 | if skip_partition: 87 | print("Partition step skipped by configuration.", file=sys.stderr) 88 | return 89 | 90 | partition_dictionary_by_language( 91 | conninfo, 92 | source_table=table_name, 93 | column_name=column_name, 94 | lang_field=lang_field, 95 | table_prefix=table_prefix, 96 | target_schema=target_schema, 97 | drop_existing=drop_existing_partitions, 98 | ) 99 | 100 | 101 | __all__ = ["run_pipeline"] 102 | -------------------------------------------------------------------------------- /src/open_dictionary/db/access.py: -------------------------------------------------------------------------------- 1 | from typing import Iterator, Any, Sequence, Tuple, Union 2 | import uuid 3 | import psycopg 4 | from psycopg.rows import dict_row 5 | from psycopg import sql 6 | from psycopg.sql import Composable 7 | 8 | from open_dictionary.utils.env_loader import get_env 9 | 10 | ColumnSpec = Union[str, Tuple[str, Composable]] 11 | 12 | class DatabaseAccess: 13 | """Database access layer for dictionary tables.""" 14 | 15 | def __init__(self, connection_string: str | None = None): 16 | resolved = connection_string or get_env("DATABASE_URL") 17 | if not resolved: 18 | raise RuntimeError("Database connection string is not configured") 19 | self.connection_string = resolved 20 | 21 | def _get_connection(self): 22 | """Get database connection.""" 23 | return psycopg.connect(self.connection_string) # type: ignore 24 | 25 | def get_connection(self): 26 | """Return a new psycopg connection using the configured DSN.""" 27 | return self._get_connection() 28 | 29 | def iterate_table( 30 | self, 31 | table_name: str, 32 | batch_size: int = 20, 33 | *, 34 | columns: Sequence[ColumnSpec] | None = None, 35 | where: Composable | None = None, 36 | order_by: Sequence[str] | None = None, 37 | ) -> Iterator[dict[str, Any]]: 38 | """Iterate over all rows in a table using server-side cursor for memory efficiency. 
39 | 40 | Args: 41 | table_name: Name of the table to iterate 42 | batch_size: Number of rows to fetch per batch 43 | columns: Specific columns to select (defaults to all) 44 | where: Optional SQL WHERE clause (Composable) to filter rows 45 | order_by: Optional list of columns to order the results 46 | 47 | Yields: 48 | Dictionary containing row data with column names as keys 49 | """ 50 | def _compile_column_spec(column: ColumnSpec) -> Composable: 51 | if isinstance(column, tuple): 52 | alias, expression = column 53 | if not isinstance(expression, Composable): 54 | raise TypeError("Expression must be a psycopg Composable instance") 55 | return sql.Composed( 56 | [sql.SQL("("), expression, sql.SQL(") AS "), sql.Identifier(alias)] 57 | ) 58 | 59 | return sql.Identifier(column) 60 | 61 | if columns: 62 | compiled_columns = [_compile_column_spec(col) for col in columns] 63 | column_clause = sql.SQL(", ").join(compiled_columns) 64 | else: 65 | column_clause = sql.SQL("*") 66 | 67 | query = sql.SQL("SELECT {columns} FROM {table}").format( 68 | columns=column_clause, 69 | table=sql.Identifier(table_name), 70 | ) 71 | 72 | if where is not None: 73 | query += sql.SQL(" WHERE ") + where 74 | 75 | if order_by: 76 | order_clause = sql.SQL(", ").join(sql.Identifier(col) for col in order_by) 77 | query += sql.SQL(" ORDER BY ") + order_clause 78 | 79 | cursor_name = f"fetch_cursor_{uuid.uuid4().hex}" 80 | 81 | with self._get_connection() as conn: 82 | with conn.cursor(row_factory=dict_row, name=cursor_name) as cursor: 83 | cursor.execute(query) # type: ignore 84 | 85 | while True: 86 | rows = cursor.fetchmany(batch_size) 87 | if not rows: 88 | break 89 | 90 | for row in rows: 91 | yield row 92 | 93 | 94 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Open English Dictionary 2 | 3 | ## Rebuilding process WIP 4 | 5 | ## Currently, this project is being rebuilt. 
6 | 
7 | New features are:
8 | 
9 | - Streamlined process + pipeline integration
10 | - Wiktionary grounding + LLM explanations
11 | - Enormous word data across multiple languages
12 | - Extremely detailed definitions
13 | - New distribution formats: JSONL, SQLite, and more to be determined
14 | - Options to select a specific category of words
15 | 
16 | **Behold and stay tuned!**
17 | 
18 | ## Prerequisites
19 | 
20 | - Install project dependencies: `uv sync`
21 | - Configure a `.env` file with `DATABASE_URL`
22 | - Ensure a PostgreSQL database is reachable via that URL
23 | 
24 | ## Run The Wiktionary Workflow
25 | 
26 | Download the compressed dump:
27 | 
28 | ```bash
29 | uv run open-dictionary download --output data/raw-wiktextract-data.jsonl.gz
30 | ```
31 | 
32 | Extract the JSONL file:
33 | 
34 | ```bash
35 | uv run open-dictionary extract \
36 | --input data/raw-wiktextract-data.jsonl.gz \
37 | --output data/raw-wiktextract-data.jsonl
38 | ```
39 | 
40 | Stream the JSONL into PostgreSQL (`dictionary_all.data` is JSONB):
41 | 
42 | ```bash
43 | uv run open-dictionary load data/raw-wiktextract-data.jsonl \
44 | --table dictionary_all \
45 | --column data \
46 | --truncate
47 | ```
48 | 
49 | Run everything end-to-end with optional partitioning:
50 | 
51 | ```bash
52 | uv run open-dictionary pipeline \
53 | --workdir data \
54 | --table dictionary_all \
55 | --column data \
56 | --truncate
57 | ```
58 | 
59 | Split rows by language code into per-language tables when needed:
60 | 
61 | ```bash
62 | uv run open-dictionary partition \
63 | --table dictionary_all \
64 | --column data \
65 | --lang-field lang_code
66 | ```
67 | 
68 | Materialize a smaller set of languages into dedicated tables with a custom prefix:
69 | 
70 | ```bash
71 | uv run open-dictionary filter en zh \
72 | --table dictionary_all \
73 | --column data \
74 | --table-prefix dictionary_filtered
75 | ```
76 | 
77 | Pass `all` to emit every language into its own table:
78 | 
79 | ```bash
80 | uv run open-dictionary filter all --table dictionary_all --column data
81 | ```
82 | 
83 | Populate the `common_score` column with word frequency data (re-run with `--recompute-existing` to refresh scores):
84 | 
85 | ```bash
86 | uv run open-dictionary db-commonness --table dictionary_filtered_en
87 | ```
88 | 
89 | Normalize raw Wiktionary payloads into a slimmer JSONB column without invoking LLMs (writes to `process` by default):
90 | 
91 | _Optionally convert to TOON format (reduces token usage by 30-60% for LLM workflows, stores as TEXT instead of JSONB):_
92 | 
93 | ```bash
94 | uv run open-dictionary pre-process \
95 | --table dictionary_filtered_en \
96 | --source-column data \
97 | --target-column processed \
98 | --toon
99 | ```
100 | 
101 | Remove low-quality rows (zero common score, numeric tokens, legacy tags) directly in PostgreSQL:
102 | 
103 | ```bash
104 | uv run open-dictionary db-clean --table dictionary_filtered_en
105 | ```
106 | 
107 | Generate structured Chinese learner-friendly entries with the LLM `define` workflow (writes JSONB into `new_speak` by default). This streams rows in batches, dispatches up to 50 concurrent LLM calls with exponential-backoff retries, and resumes automatically on restart:
108 | 
109 | ```bash
110 | uv run open-dictionary llm-define \
111 | --table dictionary_filtered_en \
112 | --source-column processed \
113 | --target-column new_speak
114 | ```
115 | 
116 | Provide `LLM_MODEL`, `LLM_KEY`, and `LLM_API` in your environment (e.g., `.env`) before running LLM commands.
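A minimal `.env` sketch (the variable names match `utils/env_loader.py`; the values below are placeholders for your own database, endpoint, key, and model):

```bash
DATABASE_URL=postgresql://user:password@localhost:5432/open_dictionary
LLM_API=https://api.example.com/v1
LLM_KEY=sk-your-key-here
LLM_MODEL=your-model-name
```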
117 | 
118 | Each command streams data in chunks to handle the 10M+ line dataset efficiently.
119 | 
--------------------------------------------------------------------------------
/AGENTS.md:
--------------------------------------------------------------------------------
1 | # Repository Guidelines
2 | 
3 | This is a full tool set for building an open dictionary, based on Wiktionary data.
4 | 
5 | ## Project Structure & Module Organization
6 | 
7 | - Core logic lives in `src/open_dictionary`. The CLI entry point defined in `pyproject.toml` resolves to `open_dictionary:main`, which dispatches into `src/open_dictionary/cli.py`; keep any new commands registered there while delegating business logic to feature modules.
8 | - Data access helpers sit under `src/open_dictionary/db` (for example `access.py`) and should remain focused on PostgreSQL streaming semantics.
9 | - Wiktionary ingestion utilities are split by concern under `src/open_dictionary/wikitionary/`: `downloader.py`, `extract.py`, `transform.py` (streaming COPY + table helpers), `pipeline.py` (orchestration), `filter.py` (language table materialization), and `progress.py` (shared progress reporters).
10 | - LLM-facing enrichments live in `src/open_dictionary/llm`, while cross-cutting utilities (environment loading, helpers) belong in `src/open_dictionary/utils`.
11 | - Runtime artifacts such as dumps or extracted JSONL files are expected in a local `data/` directory (not tracked); scripts should accept paths rather than hard-code locations.
12 | 
13 | ## Build, Test, and Development Commands
14 | 
15 | - `uv sync` installs all dependencies declared in `pyproject.toml`.
16 | - `uv run open-dictionary download --output data/raw-wiktextract-data.jsonl.gz` streams the upstream Wiktextract snapshot.
17 | - `uv run open-dictionary pipeline --workdir data --table dictionary --column data --truncate` executes download → extract → load → partition in one shot; add `--skip-*` flags for partial runs.
18 | - `uv run open-dictionary filter en zh --table dictionary_all --column data` copies only selected languages into `dictionary_lang_*` tables; pass `all` as the first positional argument to materialize every language code.
19 | - `uv run open-dictionary db-clean --table dictionary_en` removes rows that fail quality heuristics (numeric tokens, zero scores, legacy tags, etc.).
20 | - `uv run open-dictionary db-commonness --table dictionary_en` streams wordfreq-derived `common_score` values into the target table (add `--recompute-existing` to refresh populated rows).
21 | - `uv run python -m pytest` is the expected test runner once suites are added; for now, rely on targeted CLI runs against a disposable PostgreSQL database.
22 | 
23 | ## Coding Style & Naming Conventions
24 | 
25 | - Target Python 3.12+, four-space indentation, and `snake_case` for functions, modules, and CLI subcommand names.
26 | - Prefer type hints and `pydantic` models for structured payloads (see `llm/define.py`), and keep side effects behind small helpers for easier testing.
27 | - Environment keys (`DATABASE_URL`, `LLM_KEY`, `LLM_API`, `LLM_MODEL`) are loaded through `utils.env_loader`; never fetch them ad hoc inside command bodies.
28 | 
29 | ## Testing Guidelines
30 | 
31 | - Focus on integration tests that exercise the CLI contract end-to-end with a seeded PostgreSQL container; isolate I/O with temp directories under `tmp_path`.
32 | - Name test modules `test_<feature>.py` and colocate fixtures under `tests/conftest.py` once the suite exists.
33 | - Validate large operations by asserting row counts, emitted table names, and LLM scaffolding errors rather than snapshotting full JSON. 34 | 35 | ## Commit & Pull Request Guidelines 36 | 37 | - Follow the existing history: concise imperative subject lines (e.g. “Add DB iterator”), optional body wrapped at ~72 chars. 38 | - Reference issue IDs in the body when available and note required migrations or manual steps. 39 | - PRs should describe the dataset used for validation, include command transcripts (`uv run …`) for any pipelines executed, and, when UI/CLI behavior changes, attach representative logs or screenshots. 40 | 41 | ## Environment & Security Tips 42 | 43 | - Keep `.env` files local; share example variables via documentation rather than version control. 44 | - Never commit API keys or database URLs. If sensitive configuration is required in CI, use repository secrets and reference them through environment loader helpers. 45 | -------------------------------------------------------------------------------- /.serena/project.yml: -------------------------------------------------------------------------------- 1 | # language of the project (csharp, python, rust, java, typescript, go, cpp, or ruby) 2 | # * For C, use cpp 3 | # * For JavaScript, use typescript 4 | # Special requirements: 5 | # * csharp: Requires the presence of a .sln file in the project folder. 6 | language: python 7 | 8 | # the encoding used by text files in the project 9 | # For a list of possible encodings, see https://docs.python.org/3.11/library/codecs.html#standard-encodings 10 | encoding: "utf-8" 11 | 12 | # whether to use the project's gitignore file to ignore files 13 | # Added on 2025-04-07 14 | ignore_all_files_in_gitignore: true 15 | # list of additional paths to ignore 16 | # same syntax as gitignore, so you can use * and ** 17 | # Was previously called `ignored_dirs`, please update your config if you are using that. 18 | # Added (renamed) on 2025-04-07 19 | ignored_paths: [] 20 | 21 | # whether the project is in read-only mode 22 | # If set to true, all editing tools will be disabled and attempts to use them will result in an error 23 | # Added on 2025-04-18 24 | read_only: false 25 | 26 | # list of tool names to exclude. We recommend not excluding any tools, see the readme for more details. 27 | # Below is the complete list of tools for convenience. 28 | # To make sure you have the latest list of tools, and to view their descriptions, 29 | # execute `uv run scripts/print_tool_overview.py`. 30 | # 31 | # * `activate_project`: Activates a project by name. 32 | # * `check_onboarding_performed`: Checks whether project onboarding was already performed. 33 | # * `create_text_file`: Creates/overwrites a file in the project directory. 34 | # * `delete_lines`: Deletes a range of lines within a file. 35 | # * `delete_memory`: Deletes a memory from Serena's project-specific memory store. 36 | # * `execute_shell_command`: Executes a shell command. 37 | # * `find_referencing_code_snippets`: Finds code snippets in which the symbol at the given location is referenced. 38 | # * `find_referencing_symbols`: Finds symbols that reference the symbol at the given location (optionally filtered by type). 39 | # * `find_symbol`: Performs a global (or local) search for symbols with/containing a given name/substring (optionally filtered by type). 40 | # * `get_current_config`: Prints the current configuration of the agent, including the active and available projects, tools, contexts, and modes. 
41 | # * `get_symbols_overview`: Gets an overview of the top-level symbols defined in a given file. 42 | # * `initial_instructions`: Gets the initial instructions for the current project. 43 | # Should only be used in settings where the system prompt cannot be set, 44 | # e.g. in clients you have no control over, like Claude Desktop. 45 | # * `insert_after_symbol`: Inserts content after the end of the definition of a given symbol. 46 | # * `insert_at_line`: Inserts content at a given line in a file. 47 | # * `insert_before_symbol`: Inserts content before the beginning of the definition of a given symbol. 48 | # * `list_dir`: Lists files and directories in the given directory (optionally with recursion). 49 | # * `list_memories`: Lists memories in Serena's project-specific memory store. 50 | # * `onboarding`: Performs onboarding (identifying the project structure and essential tasks, e.g. for testing or building). 51 | # * `prepare_for_new_conversation`: Provides instructions for preparing for a new conversation (in order to continue with the necessary context). 52 | # * `read_file`: Reads a file within the project directory. 53 | # * `read_memory`: Reads the memory with the given name from Serena's project-specific memory store. 54 | # * `remove_project`: Removes a project from the Serena configuration. 55 | # * `replace_lines`: Replaces a range of lines within a file with new content. 56 | # * `replace_symbol_body`: Replaces the full definition of a symbol. 57 | # * `restart_language_server`: Restarts the language server, may be necessary when edits not through Serena happen. 58 | # * `search_for_pattern`: Performs a search for a pattern in the project. 59 | # * `summarize_changes`: Provides instructions for summarizing the changes made to the codebase. 60 | # * `switch_modes`: Activates modes by providing a list of their names 61 | # * `think_about_collected_information`: Thinking tool for pondering the completeness of collected information. 62 | # * `think_about_task_adherence`: Thinking tool for determining whether the agent is still on track with the current task. 63 | # * `think_about_whether_you_are_done`: Thinking tool for determining whether the task is truly completed. 64 | # * `write_memory`: Writes a named memory (for future reference) to Serena's project-specific memory store. 65 | excluded_tools: [] 66 | 67 | # initial prompt for the project. It will always be given to the LLM upon activating the project 68 | # (contrary to the memories, which are loaded on demand). 
69 | initial_prompt: "" 70 | 71 | project_name: "open-english-dictionary" 72 | -------------------------------------------------------------------------------- /src/open_dictionary/wikitionary/progress.py: -------------------------------------------------------------------------------- 1 | """Progress helpers for long-running Wiktionary data operations.""" 2 | 3 | from __future__ import annotations 4 | 5 | import sys 6 | import time 7 | 8 | 9 | class ByteProgressPrinter: 10 | """Emit coarse progress updates for byte-oriented streaming tasks.""" 11 | 12 | def __init__( 13 | self, 14 | label: str, 15 | total_bytes: int, 16 | *, 17 | min_bytes_step: int = 64 * 1024 * 1024, 18 | min_time_step: float = 5.0, 19 | ) -> None: 20 | self.label = label 21 | self.total_bytes = max(total_bytes, 0) 22 | self.min_bytes_step = max(min_bytes_step, 1) 23 | self.min_time_step = max(min_time_step, 0.0) 24 | self._last_report_time = time.monotonic() 25 | self._last_report_bytes = 0 26 | 27 | def report(self, processed_bytes: int, *, force: bool = False) -> None: 28 | """Report the number of processed bytes if thresholds are met.""" 29 | 30 | if processed_bytes < 0: # Defensive guard for unexpected inputs 31 | return 32 | 33 | now = time.monotonic() 34 | bytes_increment = processed_bytes - self._last_report_bytes 35 | 36 | if not force and processed_bytes < self.total_bytes: 37 | if ( 38 | bytes_increment < self.min_bytes_step 39 | and (now - self._last_report_time) < self.min_time_step 40 | ): 41 | return 42 | elif not force and bytes_increment <= 0: 43 | return 44 | 45 | percent_text = "" 46 | if self.total_bytes: 47 | percent = min(100.0, (processed_bytes / self.total_bytes) * 100) 48 | percent_text = f"{percent:5.1f}% | " 49 | 50 | gib_processed = processed_bytes / (1024**3) 51 | message = f"{self.label}: {percent_text}{gib_processed:.2f} GiB" 52 | print(message, file=sys.stderr, flush=True) 53 | 54 | self._last_report_time = now 55 | self._last_report_bytes = processed_bytes 56 | 57 | def finalize(self, processed_bytes: int) -> None: 58 | """Ensure a final progress update is displayed when finished.""" 59 | 60 | if processed_bytes == 0: 61 | return 62 | 63 | self.report(processed_bytes, force=True) 64 | 65 | 66 | class StreamingProgress: 67 | """Progress reporter for streaming row + byte oriented workloads.""" 68 | 69 | def __init__( 70 | self, 71 | total_bytes: int, 72 | *, 73 | label: str = "Progress", 74 | min_bytes_step: int = 64 * 1024 * 1024, 75 | min_rows_step: int = 50_000, 76 | min_time_step: float = 5.0, 77 | ) -> None: 78 | self.total_bytes = max(total_bytes, 0) 79 | self.label = label 80 | self.min_bytes_step = max(min_bytes_step, 1) 81 | self.min_rows_step = max(min_rows_step, 1) 82 | self.min_time_step = max(min_time_step, 0.0) 83 | self._last_report_time = time.monotonic() 84 | self._last_report_bytes = 0 85 | self._last_report_rows = 0 86 | 87 | def report(self, rows: int, bytes_processed: int, *, force: bool = False) -> None: 88 | """Emit a progress message when thresholds are crossed.""" 89 | 90 | if rows < 0 or bytes_processed < 0: 91 | return 92 | 93 | now = time.monotonic() 94 | bytes_increment = bytes_processed - self._last_report_bytes 95 | rows_increment = rows - self._last_report_rows 96 | 97 | if not force: 98 | if bytes_processed < self.total_bytes: 99 | if ( 100 | bytes_increment < self.min_bytes_step 101 | and rows_increment < self.min_rows_step 102 | and (now - self._last_report_time) < self.min_time_step 103 | ): 104 | return 105 | else: 106 | if bytes_increment <= 0 and 
rows_increment <= 0: 107 | return 108 | 109 | percent_text = "" 110 | if self.total_bytes: 111 | percent = min(100.0, (bytes_processed / self.total_bytes) * 100) 112 | percent_text = f"{percent:5.1f}% | " 113 | 114 | gib_processed = bytes_processed / (1024**3) 115 | rate = 0.0 116 | elapsed = now - self._last_report_time 117 | if elapsed > 0 and rows_increment > 0: 118 | rate = rows_increment / elapsed 119 | 120 | message = ( 121 | f"{self.label}: {percent_text}{rows:,} rows | " 122 | f"{gib_processed:.2f} GiB read | {rate:,.0f} rows/s" 123 | ) 124 | print(message, file=sys.stderr, flush=True) 125 | 126 | self._last_report_time = now 127 | self._last_report_bytes = bytes_processed 128 | self._last_report_rows = rows 129 | 130 | def finalize(self, rows: int, bytes_processed: int) -> None: 131 | """Ensure a final progress message is emitted.""" 132 | 133 | if rows == 0 and bytes_processed == 0: 134 | return 135 | 136 | self.report(rows, bytes_processed, force=True) 137 | 138 | 139 | __all__ = ["ByteProgressPrinter", "StreamingProgress"] 140 | -------------------------------------------------------------------------------- /src/open_dictionary/llm/define.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel 2 | from typing import Optional 3 | import json 4 | from open_dictionary.llm.llm_client import get_chat_response 5 | 6 | 7 | instruction = """ 8 | 你是一位顶级的词典编纂专家、语言学家,以及精通中英双语的教育家。你的任务是读取并解析一段来自 Wiktionary 的、结构复杂的数据,然后将其转化为一份清晰、准确、对中文学习者极其友好的结构化中文词典条目。 9 | 10 | **核心任务:** 11 | 根据下方提供的输入JSON,严格按照【输出格式定义】生成一个唯一的、完整的 JSON 对象作为最终结果。不要输出任何解释、注释或无关内容。 12 | 13 | **【重要:JSON 格式规范】** 14 | 1. 输出必须是严格、合法的 JSON 格式。 15 | 2. **所有字符串值中的双引号 (") 必须使用反斜杠转义为 \\"**。 16 | 3. **所有字符串值中的反斜杠 (\\) 必须转义为 \\**。 17 | 18 | --- 19 | 20 | **【输出格式定义】** 21 | 22 | 请生成一个包含以下键 (key) 的 JSON 对象: 23 | 24 | 1. `word`: (string) 英文单词本身。 25 | 2. `pos`: (string) 词性。 26 | 3. `pronunciations`: (object) 一个包含发音方式和音频文件的对象: 27 | * `ipa`: (string) 国际音标。直接从输入JSON的 `sounds` 数组中提取 `ipa` 字段的值。 28 | * `natural_phonics`: (string) 自然拼读。根据单词的拼写和音节,生成一个对初学者友好的、用连字符分隔的拼读提示。例如 "philosophy" -> "phi-lo-so-phy"。 29 | * `ogg_url`: (string) OGG音频文件链接。从输入JSON的 `sounds` 数组中查找并提取 `ogg_url` 字段的值。如果不存在,则返回 `null`。 30 | 4. `forms`: (array of strings) **词形变化**。遍历输入JSON的 `forms` 数组,将每个词形 (`form`) 及其标签 (`tags`) 组合成一个易于理解的中文描述字符串。例如:`"hits (第三人称单数现在时)"`。 31 | 5. `concise_definition`: (string) **简明释义**。在分析完所有词义后,用一句话高度概括该单词最核心、最常用的1-2个中文意思。 32 | 6. `detailed_definitions`: (array) **详细释义数组**。遍历输入JSON中 `senses` 数组的每一个对象,为每个词义生成一个包含以下内容的对象: 33 | * `definition_en`: (string) **英文原义**。从输入JSON的 `glosses` 数组中,提取出**最具体、最完整**的那个英文释义。如果数组中包含一个概括性标题和一个具体释义,请**选择那个具体的释义**。**注意:如果原文包含引号,必须转义。** 34 | * `definition_cn`: (string) **中文阐释**。此项是核心,请遵循以下原则: 35 | * **解释而非翻译**:用**通俗、自然、易懂**的中文来解释 `definition_en` 的核心含义。 36 | * **捕捉精髓**:要抓住该词义的**使用场景、语气(如正式、口语、俚语)和细微差别**。 37 | * **避免直译**:请**避免生硬的、字典式的直译**。目标是让中文母语者能瞬间理解这个词义的真正用法。 38 | * **转义规则**:如果中文阐释中需要使用引号(如「」、""),请使用中文引号,避免使用英文双引号。如果必须使用英文双引号,务必转义。 39 | * `example`: (object) **为该词义创作一个全新的例句**,包含: 40 | * `en`: (string) 一个**简单、现代、生活化**的英文例句,清晰地展示当前词义的用法。**绝对不要使用**输入JSON中提供的复杂或古老的例句。**如果例句中包含引号,必须转义。** 41 | * `cn`: (string) 上述英文例句的对应中文翻译。**如果翻译中包含英文引号,必须转义。** 42 | 7. `derived`: (array of objects) **派生词**。遍历输入JSON的 `derived` 数组,为其中的**每个单词**生成一个包含以下内容的对象: 43 | * `word`: (string) 派生词本身。 44 | * `definition_cn`: (string) 对该派生词的**简明中文定义**。 45 | 8. 
`etymology`: (string) **词源故事**。读取输入JSON中的 `etymology_text` 字段,将其内容翻译并**转述**成一段流畅、易懂的中文。说明其起源语言(如拉丁语、古英语、希腊语)和含义的演变过程,像讲故事一样。**如果词源中包含引号,必须转义。** 46 | 47 | --- 48 | 49 | **【示例】** 50 | 51 | **输入:** 52 | word: quote 53 | pos: verb 54 | forms[2]: 55 | - form: quotes 56 | tags[2]: present,singular,third-person 57 | - form: quoted 58 | tags[1]: past 59 | senses[1]: 60 | - 61 | glosses[1]: "To repeat or copy out (words from a text or speech written or spoken by another person)." 62 | sounds[1,]{ipa,ogg_url}: 63 | /kwəʊt/,url 64 | derived[1,]{word}: 65 | quotation 66 | etymology_text: "From Medieval Latin quotare meaning \"to mark with numbers\"." 67 | 68 | **你的JSON输出:** 69 | { 70 | "word": "quote", 71 | "pos": "verb", 72 | "pronunciations": { 73 | "ipa": "/kwəʊt/", 74 | "natural_phonics": "quote", 75 | "ogg_url": "url" 76 | }, 77 | "forms": [ 78 | "quotes (第三人称单数现在时)", 79 | "quoted (过去式)" 80 | ], 81 | "concise_definition": "引用,引述。", 82 | "detailed_definitions": [ 83 | { 84 | "definition_en": "To repeat or copy out (words from a text or speech written or spoken by another person).", 85 | "definition_cn": "指重复或摘录他人的话语或文字,通常用于写作、演讲中引用权威来源或他人观点。", 86 | "example": { 87 | "en": "She quoted Shakespeare by saying \"To be or not to be\".", 88 | "cn": "她引用了莎士比亚的话说「生存还是毁灭」。" 89 | } 90 | } 91 | ], 92 | "derived": [ 93 | { 94 | "word": "quotation", 95 | "definition_cn": "引文,引语;报价。" 96 | } 97 | ], 98 | "etymology": "该词源自中世纪拉丁语 quotare,意为「标记数字」。" 99 | } 100 | 101 | """ 102 | 103 | class Example(BaseModel): 104 | en: str 105 | cn: str 106 | 107 | 108 | class DetailedDefinition(BaseModel): 109 | definition_en: str 110 | definition_cn: str 111 | example: Example 112 | 113 | 114 | class DerivedWord(BaseModel): 115 | word: str 116 | definition_cn: str 117 | 118 | 119 | class Pronunciations(BaseModel): 120 | ipa: str 121 | natural_phonics: str 122 | ogg_url: Optional[str] = None 123 | 124 | 125 | class Definition(BaseModel): 126 | word: str 127 | pos: str 128 | pronunciations: Pronunciations 129 | forms: list[str] 130 | concise_definition: str 131 | detailed_definitions: list[DetailedDefinition] 132 | derived: list[DerivedWord] 133 | etymology: str 134 | 135 | 136 | def define(input_data: str) -> Definition: 137 | """Generate a structured dictionary definition from Wiktionary JSON/Toon data. 138 | 139 | Args: 140 | input_data: String containing Wiktionary data in JSON or Toon format 141 | 142 | Returns: 143 | Definition object with structured dictionary entry 144 | """ 145 | response = get_chat_response(instruction, input_data) 146 | 147 | try: 148 | return Definition.model_validate_json(response) 149 | except Exception as exc: 150 | # Attach the raw response to the exception for error logging 151 | exc.llm_response = response # type: ignore 152 | raise 153 | -------------------------------------------------------------------------------- /src/open_dictionary/db/sqlite_manager.py: -------------------------------------------------------------------------------- 1 | import sqlite3 2 | from contextlib import contextmanager 3 | from pathlib import Path 4 | from typing import Any, Iterator 5 | import json 6 | 7 | 8 | class SQLiteManager: 9 | """Manager for SQLite database with JSON1 support for storing definitions.""" 10 | 11 | def __init__(self, db_path: str = "data/dictionary.sqlite"): 12 | """Initialize SQLite manager. 
13 | 14 | Args: 15 | db_path: Path to SQLite database file 16 | """ 17 | path_str = str(db_path) 18 | self._use_memory_db = path_str == ":memory:" 19 | self._memory_connection: sqlite3.Connection | None = None 20 | 21 | if self._use_memory_db: 22 | # Keep a persistent connection open for in-memory databases so the schema 23 | # and rows survive multiple operations. 24 | self.db_path = path_str 25 | self._memory_connection = sqlite3.connect(path_str, check_same_thread=False) 26 | else: 27 | self.db_path = Path(path_str) 28 | self.db_path.parent.mkdir(parents=True, exist_ok=True) 29 | 30 | self._init_db() 31 | 32 | def _init_db(self): 33 | """Initialize database schema.""" 34 | with self._connection() as conn: 35 | conn.execute(""" 36 | CREATE TABLE IF NOT EXISTS definitions ( 37 | word TEXT PRIMARY KEY, 38 | definition JSON NOT NULL 39 | ) 40 | """) 41 | conn.commit() 42 | 43 | @contextmanager 44 | def _connection(self) -> Iterator[sqlite3.Connection]: 45 | """Yield a SQLite connection, keeping in-memory DBs alive.""" 46 | if self._use_memory_db: 47 | assert self._memory_connection is not None 48 | yield self._memory_connection 49 | else: 50 | conn = sqlite3.connect(self.db_path) 51 | try: 52 | yield conn 53 | finally: 54 | conn.close() 55 | 56 | def insert_definition(self, word: str, definition: dict[str, Any]): 57 | """Insert a single definition into the database. 58 | 59 | Args: 60 | word: The word being defined 61 | definition: The definition data as a dictionary 62 | """ 63 | with self._connection() as conn: 64 | conn.execute( 65 | "INSERT OR REPLACE INTO definitions (word, definition) VALUES (?, ?)", 66 | (word, json.dumps(definition, ensure_ascii=False)) 67 | ) 68 | conn.commit() 69 | 70 | def insert_definitions_batch(self, definitions: list[tuple[str, dict[str, Any]]]): 71 | """Insert multiple definitions in a batch. 72 | 73 | Args: 74 | definitions: List of (word, definition_dict) tuples 75 | """ 76 | with self._connection() as conn: 77 | conn.executemany( 78 | "INSERT OR REPLACE INTO definitions (word, definition) VALUES (?, ?)", 79 | [(word, json.dumps(defn, ensure_ascii=False)) for word, defn in definitions] 80 | ) 81 | conn.commit() 82 | 83 | def get_definition(self, word: str) -> dict[str, Any] | None: 84 | """Get definition for a word. 85 | 86 | Args: 87 | word: The word to look up 88 | 89 | Returns: 90 | Definition dictionary or None if not found 91 | """ 92 | with self._connection() as conn: 93 | cursor = conn.execute( 94 | "SELECT definition FROM definitions WHERE word = ?", 95 | (word,) 96 | ) 97 | row = cursor.fetchone() 98 | return json.loads(row[0]) if row else None 99 | 100 | def count_definitions(self) -> int: 101 | """Count total definitions in database. 
102 | 103 | Returns: 104 | Number of definitions 105 | """ 106 | with self._connection() as conn: 107 | cursor = conn.execute("SELECT COUNT(*) FROM definitions") 108 | return cursor.fetchone()[0] 109 | 110 | def close(self) -> None: 111 | """Close any persistent SQLite connections.""" 112 | if self._memory_connection is not None: 113 | self._memory_connection.close() 114 | self._memory_connection = None 115 | 116 | def __del__(self): # pragma: no cover - best effort cleanup 117 | try: 118 | self.close() 119 | except Exception: 120 | pass 121 | 122 | 123 | def test_sqlite_manager(): 124 | """Test function to verify SQLite manager works correctly.""" 125 | import tempfile 126 | import os 127 | 128 | # Create a temporary database 129 | with tempfile.NamedTemporaryFile(delete=False, suffix='.sqlite') as f: 130 | test_db = f.name 131 | 132 | try: 133 | print(f"Testing with database: {test_db}") 134 | manager = SQLiteManager(test_db) 135 | 136 | # Test single insert 137 | test_def = {"word": "test", "pos": "noun", "definition": "A trial or examination"} 138 | manager.insert_definition("test", test_def) 139 | print(f"After single insert: {manager.count_definitions()} definitions") 140 | 141 | # Test batch insert 142 | batch = [ 143 | ("word1", {"word": "word1", "meaning": "first"}), 144 | ("word2", {"word": "word2", "meaning": "second"}), 145 | ("word3", {"word": "word3", "meaning": "third"}), 146 | ] 147 | manager.insert_definitions_batch(batch) 148 | print(f"After batch insert: {manager.count_definitions()} definitions") 149 | 150 | # Test retrieval 151 | retrieved = manager.get_definition("test") 152 | print(f"Retrieved definition: {retrieved}") 153 | 154 | # Test in-memory database support 155 | memory_manager = SQLiteManager(":memory:") 156 | memory_manager.insert_definition("memory_word", {"word": "memory_word"}) 157 | print(f"In-memory count: {memory_manager.count_definitions()} definitions") 158 | print(f"In-memory retrieval: {memory_manager.get_definition('memory_word')}") 159 | memory_manager.close() 160 | 161 | print("All tests passed!") 162 | finally: 163 | # Clean up 164 | if os.path.exists(test_db): 165 | os.unlink(test_db) 166 | 167 | 168 | if __name__ == "__main__": 169 | test_sqlite_manager() 170 | -------------------------------------------------------------------------------- /src/open_dictionary/db/cleaner.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import time 4 | from typing import Any, Sequence 5 | 6 | from psycopg import sql 7 | from psycopg.cursor import Cursor 8 | 9 | # 假设这个模块存在并且可以正确配置数据库连接 10 | # 注意:您需要确保 open_dictionary.db.access 模块在您的环境中可用 11 | from open_dictionary.db.access import DatabaseAccess 12 | 13 | FETCH_BATCH_SIZE = 5000 14 | DELETE_BATCH_SIZE = 5000 15 | PROGRESS_EVERY_ROWS = 20_000 16 | PROGRESS_EVERY_SECONDS = 30.0 17 | 18 | 19 | def clean_dictionary_data( 20 | table_name: str, 21 | *, 22 | fetch_batch_size: int = FETCH_BATCH_SIZE, 23 | delete_batch_size: int = DELETE_BATCH_SIZE, 24 | progress_every_rows: int = PROGRESS_EVERY_ROWS, 25 | progress_every_seconds: float = PROGRESS_EVERY_SECONDS, 26 | ) -> None: 27 | """ 28 | 从字典表中删除不符合质量标准的词条行。 29 | 30 | 该函数会删除满足以下任一条件的词条: 31 | 1. `common_score` 精确为零。 32 | 2. 单词本身 (`data`->'word') 包含任何数字 (0-9)。 33 | 3. 单词本身是长度超过1的全大写词 (例如 "UNESCO")。 34 | 4. 单词本身包含特殊字符(允许字母, 撇号, 空格, 连字符)。 35 | 5. 
词条的标签 (`data`->'tags') 包含 "archaic", "obsolete", "dated", "古旧", 或 "废弃"。 36 | """ 37 | 38 | data_access = DatabaseAccess() 39 | processed = 0 40 | deleted = 0 41 | pending_ids: list[int] = [] 42 | start_time = time.monotonic() 43 | 44 | print( 45 | f"[cleaner] starting table={table_name} " 46 | f"fetch_batch={fetch_batch_size} delete_batch={delete_batch_size} " 47 | f"progress_rows={progress_every_rows} progress_seconds={progress_every_seconds}", 48 | flush=True, 49 | ) 50 | 51 | # 构建复杂的 WHERE 子句来一次性筛选所有不合格的词条 52 | # 这种方法比在 Python 中进行判断效率高得多,因为它将过滤工作完全交给了数据库 53 | conditions = [ 54 | # 1. 删除 common_score 为 0 的词条 55 | sql.SQL("common_score = 0"), 56 | 57 | # 2. 删除单词中包含数字的词条 58 | # data->>'word' 从 jsonb 字段 'data' 中以文本形式提取 'word' 的值 59 | # ~ 是 PostgreSQL 的正则表达式匹配操作符 60 | sql.SQL("data->>'word' ~ '[0-9]'"), 61 | 62 | # 3. 删除全是大写的词条(长度大于1,以避免删除 "I", "A" 等) 63 | # 同时检查是否真的包含大写字母,以避免非字母字符串被误判 64 | sql.SQL("LENGTH(data->>'word') > 1 AND data->>'word' = UPPER(data->>'word') AND data->>'word' ~ '[A-Z]'"), 65 | 66 | # 4. 删除包含特殊字符的词条 67 | # 正则表达式 [^a-zA-Z' -] 匹配任何不是字母、撇号、空格或连字符的字符 68 | # 注意在 SQL 字符串中,撇号需要写成 '' 来转义 69 | sql.SQL("data->>'word' ~ '[^a-zA-Z'' -]'"), 70 | 71 | # 5. 删除包含古旧、废弃等标签的词条 72 | # data->'tags' 获取 jsonb 字段 'data' 中的 'tags' 数组 73 | # ?| 操作符检查左边的 jsonb 数组是否包含右边 text 数组中的任何一个元素 74 | sql.SQL("data->'tags' ?| array['archaic', 'obsolete', 'dated']") 75 | ] 76 | 77 | # 使用 OR 将所有条件连接起来,满足任意一个条件即被选中 78 | where_clause = sql.SQL(" OR ").join(conditions) 79 | 80 | with data_access.get_connection() as delete_conn: 81 | with delete_conn.cursor() as cursor: 82 | last_log_time = start_time 83 | 84 | print(f"[cleaner] Executing query with WHERE clause: {where_clause.as_string(cursor)}", flush=True) 85 | 86 | # 使用构建好的 where_clause 来迭代所有需要删除的行 87 | for row in data_access.iterate_table( 88 | table_name, 89 | batch_size=fetch_batch_size, 90 | columns=("id",), 91 | where=where_clause, 92 | order_by=("id",), 93 | ): 94 | row_id = row.get("id") 95 | if row_id is None: 96 | continue 97 | 98 | processed += 1 99 | emit_progress = processed == 1 100 | 101 | pending_ids.append(int(row_id)) 102 | 103 | if len(pending_ids) >= delete_batch_size: 104 | batch_count = _flush_deletions(cursor, table_name, pending_ids) 105 | delete_conn.commit() 106 | deleted += batch_count 107 | pending_ids.clear() 108 | emit_progress = True 109 | 110 | now = time.monotonic() 111 | 112 | if progress_every_rows and processed % progress_every_rows == 0: 113 | emit_progress = True 114 | if progress_every_seconds and (now - last_log_time) >= progress_every_seconds: 115 | emit_progress = True 116 | 117 | if emit_progress: 118 | _report_progress(processed, deleted, start_time) 119 | last_log_time = now 120 | 121 | if pending_ids: 122 | batch_count = _flush_deletions(cursor, table_name, pending_ids) 123 | delete_conn.commit() 124 | deleted += batch_count 125 | pending_ids.clear() 126 | _report_progress(processed, deleted, start_time) 127 | 128 | _report_completion(processed, deleted, start_time) 129 | 130 | 131 | def _flush_deletions( 132 | cursor: Cursor[Any], 133 | table_name: str, 134 | ids: Sequence[int], 135 | ) -> int: 136 | if not ids: 137 | return 0 138 | 139 | values_sql = sql.SQL(", ").join(sql.SQL("(%s::bigint)") for _ in ids) 140 | delete_sql = sql.SQL( 141 | """ 142 | DELETE FROM {table} AS t 143 | USING (VALUES {values}) AS v(id) 144 | WHERE t.id = v.id 145 | """ 146 | ).format( 147 | table=sql.Identifier(table_name), 148 | values=values_sql, 149 | ) 150 | 151 | cursor.execute(delete_sql, ids) 152 | return cursor.rowcount 153 | 
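# Illustrative only: for a two-id batch, _flush_deletions above renders a
# statement of this shape (the table name shown is an example):
#
#   DELETE FROM "dictionary_filtered_en" AS t
#   USING (VALUES (%s::bigint), (%s::bigint)) AS v(id)
#   WHERE t.id = v.id
#
# and executes it with the ids as parameters, so each batch is removed in a
# single round trip instead of one DELETE per row.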
154 | 155 | def _report_progress(processed: int, deleted: int, start_time: float) -> None: 156 | elapsed = max(time.monotonic() - start_time, 1e-6) 157 | processed_rate = processed / elapsed 158 | deleted_rate = deleted / elapsed if deleted else 0.0 159 | print( 160 | f"[cleaner] processed={processed:,} deleted={deleted:,} " 161 | f"elapsed={elapsed:,.1f}s rate={processed_rate:,.0f} rows/s " 162 | f"delete_rate={deleted_rate:,.0f} rows/s", 163 | flush=True, 164 | ) 165 | 166 | 167 | def _report_completion(processed: int, deleted: int, start_time: float) -> None: 168 | elapsed = max(time.monotonic() - start_time, 1e-6) 169 | processed_rate = processed / elapsed if processed else 0.0 170 | deleted_rate = deleted / elapsed if deleted else 0.0 171 | print( 172 | f"[cleaner] completed processed={processed:,} deleted={deleted:,} " 173 | f"elapsed={elapsed:,.1f}s avg_rate={processed_rate:,.0f} rows/s " 174 | f"delete_rate={deleted_rate:,.0f} rows/s", 175 | flush=True, 176 | ) 177 | __all__ = [ 178 | "FETCH_BATCH_SIZE", 179 | "DELETE_BATCH_SIZE", 180 | "PROGRESS_EVERY_ROWS", 181 | "PROGRESS_EVERY_SECONDS", 182 | "clean_dictionary_data", 183 | ] 184 | -------------------------------------------------------------------------------- /src/open_dictionary/db/mark_commonness.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import json 4 | import time 5 | from decimal import Decimal 6 | from functools import lru_cache 7 | from typing import Any, Optional, Sequence, Tuple 8 | 9 | from psycopg import sql 10 | from psycopg.cursor import Cursor 11 | from wordfreq import zipf_frequency 12 | 13 | from open_dictionary.db.access import DatabaseAccess 14 | 15 | FETCH_BATCH_SIZE = 5000 16 | UPDATE_BATCH_SIZE = 5000 17 | PROGRESS_EVERY_ROWS = 20_000 18 | PROGRESS_EVERY_SECONDS = 30.0 19 | 20 | 21 | def enrich_common_score( 22 | table_name: str, 23 | *, 24 | fetch_batch_size: int = FETCH_BATCH_SIZE, 25 | update_batch_size: int = UPDATE_BATCH_SIZE, 26 | progress_every_rows: int = PROGRESS_EVERY_ROWS, 27 | progress_every_seconds: float = PROGRESS_EVERY_SECONDS, 28 | recompute_existing: bool = False, 29 | ) -> None: 30 | """Populate the common_score column on ``table_name`` using wordfreq data. 31 | 32 | The routine streams rows via a server-side cursor to keep memory usage flat, 33 | batches UPDATE statements to stay efficient on very large tables, and skips 34 | rows that were already processed. 
35 | """ 36 | data_access = DatabaseAccess() 37 | 38 | _ensure_common_score_column(data_access, table_name) 39 | 40 | where_clause = None 41 | if not recompute_existing: 42 | where_clause = sql.SQL("{} IS NULL").format(sql.Identifier("common_score")) 43 | 44 | processed = 0 45 | updated = 0 46 | pending_updates: list[tuple[int, Optional[float]]] = [] 47 | start_time = time.monotonic() 48 | 49 | print( 50 | f"[common_score] starting table={table_name} " 51 | f"fetch_batch={fetch_batch_size} update_batch={update_batch_size} " 52 | f"progress_rows={progress_every_rows} progress_seconds={progress_every_seconds} " 53 | f"recompute_existing={recompute_existing}", 54 | flush=True, 55 | ) 56 | 57 | with data_access.get_connection() as update_conn: 58 | with update_conn.cursor() as cursor: 59 | last_log_time = start_time 60 | for row in data_access.iterate_table( 61 | table_name, 62 | batch_size=fetch_batch_size, 63 | columns=( 64 | "id", 65 | "common_score", 66 | ("word", sql.SQL("data->>'word'")), 67 | ), 68 | where=where_clause, 69 | order_by=("id",), 70 | ): 71 | processed += 1 72 | emit_progress = False 73 | 74 | if processed == 1: 75 | emit_progress = True 76 | 77 | update_payload = _build_update_payload(row) 78 | if update_payload is not None: 79 | pending_updates.append(update_payload) 80 | 81 | if len(pending_updates) >= update_batch_size: 82 | batch_count = _flush_updates(cursor, table_name, pending_updates) 83 | update_conn.commit() 84 | updated += batch_count 85 | pending_updates.clear() 86 | emit_progress = True 87 | 88 | now = time.monotonic() 89 | 90 | if progress_every_rows and processed % progress_every_rows == 0: 91 | emit_progress = True 92 | if progress_every_seconds and (now - last_log_time) >= progress_every_seconds: 93 | emit_progress = True 94 | 95 | if emit_progress: 96 | _report_progress(processed, updated, start_time) 97 | last_log_time = now 98 | 99 | if pending_updates: 100 | batch_count = _flush_updates(cursor, table_name, pending_updates) 101 | update_conn.commit() 102 | updated += batch_count 103 | pending_updates.clear() 104 | _report_progress(processed, updated, start_time) 105 | 106 | _report_completion(processed, updated, start_time) 107 | 108 | 109 | def _ensure_common_score_column(data_access: DatabaseAccess, table_name: str) -> None: 110 | with data_access.get_connection() as conn: 111 | with conn.cursor() as cursor: 112 | cursor.execute( 113 | sql.SQL( 114 | """ 115 | ALTER TABLE {table} 116 | ADD COLUMN IF NOT EXISTS common_score DOUBLE PRECISION 117 | """ 118 | ).format(table=sql.Identifier(table_name)) 119 | ) 120 | 121 | 122 | def _build_update_payload(row: dict[str, Any]) -> Tuple[int, Optional[float]] | None: 123 | row_id = row.get("id") 124 | if row_id is None: 125 | return None 126 | 127 | existing = row.get("common_score") 128 | normalized_existing = _to_float(existing) 129 | 130 | word = _extract_word(row) 131 | score = _score_for_word(word) 132 | 133 | if normalized_existing is None and score is None: 134 | return None 135 | 136 | if normalized_existing is not None and score is not None: 137 | if abs(normalized_existing - score) < 1e-9: 138 | return None 139 | 140 | return int(row_id), score 141 | 142 | 143 | def _extract_word(row: dict[str, Any]) -> Optional[str]: 144 | direct_word = row.get("word") 145 | candidate = _normalize_word(direct_word) 146 | if candidate: 147 | return candidate 148 | 149 | data = row.get("data") 150 | if isinstance(data, dict): 151 | candidate = _normalize_word(data.get("word")) 152 | if candidate: 153 | return candidate 
154 | elif isinstance(data, str): 155 | try: 156 | decoded = json.loads(data) 157 | except json.JSONDecodeError: 158 | decoded = None 159 | if isinstance(decoded, dict): 160 | candidate = _normalize_word(decoded.get("word")) 161 | if candidate: 162 | return candidate 163 | 164 | return None 165 | 166 | 167 | def _normalize_word(value: Any) -> Optional[str]: 168 | if not isinstance(value, str): 169 | return None 170 | stripped = value.strip() 171 | if not stripped: 172 | return None 173 | return stripped.lower() 174 | 175 | 176 | def _score_for_word(word: Optional[str]) -> Optional[float]: 177 | if not word: 178 | return None 179 | score = _cached_zipf_frequency(word) 180 | if score <= 0.0: 181 | return 0.0 182 | return score 183 | 184 | 185 | @lru_cache(maxsize=None) 186 | def _cached_zipf_frequency(word: str) -> float: 187 | return float(zipf_frequency(word, "en")) 188 | 189 | 190 | def _flush_updates( 191 | cursor: Cursor[Any], 192 | table_name: str, 193 | payloads: Sequence[tuple[int, Optional[float]]], 194 | ) -> int: 195 | if not payloads: 196 | return 0 197 | values_sql = sql.SQL(", ").join( 198 | sql.SQL("(%s::bigint, %s::double precision)") for _ in payloads 199 | ) 200 | update_sql = sql.SQL( 201 | """ 202 | UPDATE {table} AS t 203 | SET common_score = v.score 204 | FROM (VALUES {values}) AS v(id, score) 205 | WHERE t.id = v.id 206 | """ 207 | ).format( 208 | table=sql.Identifier(table_name), 209 | values=values_sql, 210 | ) 211 | params: list[Any] = [] 212 | for row_id, score in payloads: 213 | params.extend((row_id, score)) 214 | 215 | cursor.execute(update_sql, params) 216 | return len(payloads) 217 | 218 | 219 | def _to_float(value: Any) -> Optional[float]: 220 | if value is None: 221 | return None 222 | if isinstance(value, float): 223 | return value 224 | if isinstance(value, Decimal): 225 | return float(value) 226 | return None 227 | 228 | 229 | def _report_progress(processed: int, updated: int, start_time: float) -> None: 230 | elapsed = max(time.monotonic() - start_time, 1e-6) 231 | rate = processed / elapsed 232 | print( 233 | f"[common_score] processed={processed:,} updated={updated:,} " 234 | f"elapsed={elapsed:,.1f}s rate={rate:,.0f} rows/s", 235 | flush=True, 236 | ) 237 | 238 | 239 | def _report_completion(processed: int, updated: int, start_time: float) -> None: 240 | elapsed = max(time.monotonic() - start_time, 1e-6) 241 | avg_rate = processed / elapsed if processed else 0.0 242 | print( 243 | f"[common_score] completed: processed={processed:,} updated={updated:,} " 244 | f"elapsed={elapsed:,.1f}s avg_rate={avg_rate:,.0f} rows/s", 245 | flush=True, 246 | ) 247 | __all__ = [ 248 | "FETCH_BATCH_SIZE", 249 | "UPDATE_BATCH_SIZE", 250 | "PROGRESS_EVERY_ROWS", 251 | "PROGRESS_EVERY_SECONDS", 252 | "enrich_common_score", 253 | ] 254 | -------------------------------------------------------------------------------- /src/open_dictionary/wikitionary/transform.py: -------------------------------------------------------------------------------- 1 | """Utilities for streaming Wiktionary JSONL data into PostgreSQL.""" 2 | 3 | from __future__ import annotations 4 | 5 | import json 6 | import re 7 | import sys 8 | from pathlib import Path 9 | from typing import Iterator, Sequence 10 | 11 | import psycopg 12 | from psycopg import sql 13 | 14 | from .progress import StreamingProgress 15 | 16 | 17 | UTF8_BOM = b"\xef\xbb\xbf" 18 | 19 | 20 | class JsonlProcessingError(Exception): 21 | """Raised when the JSONL input contains invalid JSON content.""" 22 | 23 | 24 | def 
iter_json_lines(file_path: Path) -> Iterator[tuple[str, int]]: 25 | """Yield JSON rows and byte offsets from a JSONL file, skipping blank lines.""" 26 | 27 | path = Path(file_path) 28 | if not path.is_file(): 29 | raise FileNotFoundError(f"No JSONL file found at {path}") 30 | 31 | with path.open("rb", buffering=1024 * 1024) as handle: 32 | for line_number, raw_line in enumerate(handle, start=1): 33 | if not raw_line.strip(): 34 | continue 35 | 36 | if line_number == 1 and raw_line.startswith(UTF8_BOM): 37 | raw_line = raw_line[len(UTF8_BOM) :] 38 | 39 | json_bytes = raw_line.rstrip(b"\r\n") 40 | if not json_bytes: 41 | continue 42 | 43 | try: 44 | json_text = json_bytes.decode("utf-8") 45 | except UnicodeDecodeError as exc: # pragma: no cover - defensive 46 | message = f"Invalid UTF-8 sequence on line {line_number}: {exc!s}" 47 | raise JsonlProcessingError(message) from exc 48 | 49 | try: 50 | json.loads(json_text) 51 | except json.JSONDecodeError as exc: # pragma: no cover - defensive 52 | message = ( 53 | f"Invalid JSON on line {line_number}: {exc.msg} (column {exc.colno})" 54 | ) 55 | raise JsonlProcessingError(message) from exc 56 | 57 | bytes_read = handle.tell() 58 | yield json_text, bytes_read 59 | def _identifier_from_dotted(qualified_name: str) -> sql.Identifier: 60 | """Return a psycopg identifier from a dotted path like ``schema.table``.""" 61 | 62 | parts = [segment.strip() for segment in qualified_name.split(".") if segment.strip()] 63 | if not parts: 64 | raise ValueError("Identifier name cannot be empty") 65 | return sql.Identifier(*parts) 66 | 67 | 68 | def _ensure_table_structure( 69 | cursor: psycopg.Cursor, 70 | table_identifier: sql.Identifier, 71 | column_identifier: sql.Identifier, 72 | ) -> None: 73 | """Create the destination table if missing.""" 74 | 75 | create_sql = sql.SQL( 76 | """ 77 | CREATE TABLE IF NOT EXISTS {} ( 78 | id BIGSERIAL PRIMARY KEY, 79 | {} JSONB NOT NULL 80 | ) 81 | """ 82 | ).format(table_identifier, column_identifier) 83 | 84 | cursor.execute(create_sql) 85 | 86 | 87 | def _sanitize_language_code(code: str) -> str: 88 | safe = re.sub(r"[^0-9A-Za-z_]+", "_", code).strip("_") 89 | return safe.lower() 90 | 91 | 92 | def partition_dictionary_by_language( 93 | conninfo: str, 94 | *, 95 | source_table: str, 96 | column_name: str, 97 | lang_field: str = "lang_code", 98 | table_prefix: str = "dictionary_lang", 99 | target_schema: str | None = None, 100 | drop_existing: bool = False, 101 | languages: Sequence[str] | None = None, 102 | ) -> list[str]: 103 | """Split rows in ``source_table`` into per-language tables based on ``lang_field``.""" 104 | 105 | created_tables: list[str] = [] 106 | table_identifier = _identifier_from_dotted(source_table) 107 | column_identifier = sql.Identifier(column_name) 108 | 109 | with psycopg.connect(conninfo) as connection: 110 | with connection.cursor() as cursor: 111 | if languages: 112 | language_codes = [code for code in dict.fromkeys(languages) if code] 113 | else: 114 | select_distinct = sql.SQL( 115 | """ 116 | SELECT DISTINCT {column}->>%s AS lang_code 117 | FROM {table} 118 | WHERE {column} ? 
%s 119 | AND {column}->>%s IS NOT NULL 120 | AND {column}->>%s <> '' 121 | ORDER BY lang_code 122 | """ 123 | ).format(column=column_identifier, table=table_identifier) 124 | 125 | cursor.execute(select_distinct, (lang_field, lang_field, lang_field, lang_field)) 126 | language_codes = [row[0] for row in cursor.fetchall() if row and row[0]] 127 | 128 | if not language_codes: 129 | print( 130 | "No language codes found; skipping partition step.", 131 | file=sys.stderr, 132 | ) 133 | return created_tables 134 | 135 | total_languages = len(language_codes) 136 | print( 137 | f"Partitioning {total_languages} language set(s) from {source_table}.{column_name}...", 138 | file=sys.stderr, 139 | ) 140 | 141 | seen_tables: set[tuple[str | None, str]] = set() 142 | for idx, code in enumerate(language_codes, start=1): 143 | prefix = f"[{idx}/{total_languages}] " 144 | safe_code = _sanitize_language_code(code) 145 | if not safe_code: 146 | print( 147 | prefix 148 | + f"Skipping language code '{code}' because it cannot form a valid table name.", 149 | file=sys.stderr, 150 | ) 151 | continue 152 | 153 | table_name = f"{table_prefix}_{safe_code}" 154 | if target_schema: 155 | table_key = (target_schema, table_name) 156 | target_identifier = sql.Identifier(target_schema, table_name) 157 | display_name = f"{target_schema}.{table_name}" 158 | else: 159 | table_key = (None, table_name) 160 | target_identifier = sql.Identifier(table_name) 161 | display_name = table_name 162 | 163 | if table_key in seen_tables: 164 | print( 165 | prefix 166 | + f"Skipping language code '{code}' because it maps to an existing table name {display_name}.", 167 | file=sys.stderr, 168 | ) 169 | continue 170 | seen_tables.add(table_key) 171 | 172 | if drop_existing: 173 | drop_sql = sql.SQL("DROP TABLE IF EXISTS {}").format(target_identifier) 174 | cursor.execute(drop_sql) 175 | connection.commit() 176 | 177 | create_sql = sql.SQL( 178 | """ 179 | CREATE TABLE IF NOT EXISTS {} ( 180 | id BIGINT PRIMARY KEY, 181 | {} JSONB NOT NULL 182 | ) 183 | """ 184 | ).format(target_identifier, column_identifier) 185 | cursor.execute(create_sql) 186 | 187 | insert_sql = sql.SQL( 188 | """ 189 | INSERT INTO {target} (id, {column}) 190 | SELECT id, {column} 191 | FROM {source} 192 | WHERE {column}->>%s = %s 193 | ON CONFLICT (id) DO NOTHING 194 | """ 195 | ).format( 196 | target=target_identifier, 197 | column=column_identifier, 198 | source=table_identifier, 199 | ) 200 | 201 | cursor.execute(insert_sql, (lang_field, code)) 202 | connection.commit() 203 | 204 | inserted = cursor.rowcount if cursor.rowcount != -1 else None 205 | inserted_text = f" ({inserted} rows)" if inserted is not None else "" 206 | print( 207 | f"{prefix}Partitioned '{code}' -> {display_name}{inserted_text}", 208 | file=sys.stderr, 209 | ) 210 | created_tables.append(display_name) 211 | 212 | return created_tables 213 | 214 | 215 | def copy_jsonl_to_postgres( 216 | jsonl_path: Path, 217 | conninfo: str, 218 | table_name: str, 219 | column_name: str, 220 | truncate: bool = False, 221 | ) -> int: 222 | """Stream JSON rows from ``jsonl_path`` into ``table_name.column_name``. 223 | 224 | Returns the number of rows copied. 
225 | """ 226 | 227 | table_identifier = _identifier_from_dotted(table_name) 228 | if not column_name.strip(): 229 | raise ValueError("Column name cannot be empty") 230 | 231 | column_identifier = sql.Identifier(column_name) 232 | 233 | rows_written = 0 234 | total_bytes = jsonl_path.stat().st_size 235 | progress = StreamingProgress(total_bytes, label=f"COPY {table_name}") 236 | latest_bytes_processed = 0 237 | 238 | with psycopg.connect(conninfo) as connection: 239 | with connection.cursor() as cursor: 240 | _ensure_table_structure(cursor, table_identifier, column_identifier) 241 | 242 | if truncate: 243 | cursor.execute(sql.SQL("TRUNCATE TABLE {}").format(table_identifier)) 244 | 245 | copy_sql = sql.SQL("COPY {} ({}) FROM STDIN WITH (FORMAT text)").format( 246 | table_identifier, 247 | column_identifier, 248 | ) 249 | copy_command = copy_sql.as_string(connection) 250 | 251 | with cursor.copy(copy_command) as copy: # type: ignore[arg-type] 252 | for json_text, bytes_processed in iter_json_lines(jsonl_path): 253 | copy.write_row((json_text,)) 254 | rows_written += 1 255 | latest_bytes_processed = bytes_processed 256 | progress.report(rows_written, latest_bytes_processed) 257 | 258 | progress.finalize(rows_written, latest_bytes_processed) 259 | 260 | return rows_written 261 | 262 | __all__ = [ 263 | "JsonlProcessingError", 264 | "iter_json_lines", 265 | "partition_dictionary_by_language", 266 | "copy_jsonl_to_postgres", 267 | ] 268 | -------------------------------------------------------------------------------- /src/open_dictionary/workflow.py: -------------------------------------------------------------------------------- 1 | from concurrent.futures import ThreadPoolExecutor, as_completed 2 | from typing import Any 3 | import logging 4 | import sys 5 | import time 6 | 7 | from open_dictionary.db.access import DatabaseAccess 8 | from open_dictionary.db.sqlite_manager import SQLiteManager 9 | from open_dictionary.llm.define import define, Definition 10 | 11 | # Configure logging 12 | logging.basicConfig( 13 | level=logging.INFO, 14 | format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', 15 | stream=sys.stderr 16 | ) 17 | logger = logging.getLogger(__name__) 18 | 19 | 20 | class ProgressReporter: 21 | """Report progress of definition generation with statistics.""" 22 | 23 | def __init__( 24 | self, 25 | *, 26 | min_time_step: float = 5.0, 27 | min_count_step: int = 10, 28 | ): 29 | self.min_time_step = max(min_time_step, 0.0) 30 | self.min_count_step = max(min_count_step, 1) 31 | self._last_report_time = time.monotonic() 32 | self._last_report_count = 0 33 | self._start_time = time.monotonic() 34 | 35 | def maybe_report( 36 | self, 37 | processed: int, 38 | failed: int, 39 | *, 40 | force: bool = False 41 | ) -> None: 42 | """Report progress if enough time/items have passed.""" 43 | now = time.monotonic() 44 | count_increment = processed - self._last_report_count 45 | 46 | if not force: 47 | if ( 48 | count_increment < self.min_count_step 49 | and (now - self._last_report_time) < self.min_time_step 50 | ): 51 | return 52 | 53 | elapsed = now - self._start_time 54 | total = processed + failed 55 | rate = processed / elapsed if elapsed > 0 else 0 56 | 57 | message = ( 58 | f"Progress: {processed:,} processed | {failed:,} failed | " 59 | f"{total:,} total | {rate:.1f} items/sec" 60 | ) 61 | logger.info(message) 62 | 63 | self._last_report_time = now 64 | self._last_report_count = processed 65 | 66 | def finalize(self, processed: int, failed: int) -> None: 67 | """Print final 
statistics.""" 68 | elapsed = time.monotonic() - self._start_time 69 | total = processed + failed 70 | rate = processed / elapsed if elapsed > 0 else 0 71 | 72 | logger.info("=" * 60) 73 | logger.info(f"Processing complete!") 74 | logger.info(f"Total processed: {processed:,}") 75 | logger.info(f"Total failed: {failed:,}") 76 | logger.info(f"Total items: {total:,}") 77 | logger.info(f"Success rate: {(processed/total*100 if total > 0 else 0):.1f}%") 78 | logger.info(f"Total time: {elapsed:.1f} seconds") 79 | logger.info(f"Average rate: {rate:.1f} items/sec") 80 | logger.info("=" * 60) 81 | 82 | 83 | def process_single_word(word_data: dict[str, Any]) -> tuple[str, dict[str, Any]] | None: 84 | """Process a single word definition request. 85 | 86 | Args: 87 | word_data: Dictionary containing word data from PostgreSQL 88 | 89 | Returns: 90 | Tuple of (word, definition_dict) or None if processing failed 91 | """ 92 | try: 93 | logger.debug(f"Processing word data keys: {list(word_data.keys())}") 94 | definition = define(word_data) 95 | result = (definition.word, definition.model_dump()) 96 | logger.debug(f"Successfully processed word: {definition.word}") 97 | return result 98 | except Exception as e: 99 | logger.error(f"Failed to process word '{word_data.get('word', 'unknown')}': {e}", exc_info=True) 100 | return None 101 | 102 | 103 | def run_parallel_definitions( 104 | table_name: str = "dictionary_en", 105 | batch_size: int = 50, 106 | max_workers: int = 50, 107 | sqlite_path: str = "data/dictionary.sqlite", 108 | limit: int | None = None, 109 | ): 110 | """Process dictionary entries in parallel and store in SQLite. 111 | 112 | This function reads from PostgreSQL, sends definition requests to LLM in parallel, 113 | and writes results to SQLite. 114 | 115 | Args: 116 | table_name: Name of the PostgreSQL table to read from 117 | batch_size: Number of rows to fetch from PostgreSQL per batch 118 | max_workers: Maximum number of parallel LLM requests 119 | sqlite_path: Path to SQLite database file 120 | limit: Optional limit on number of words to process 121 | """ 122 | db_access = DatabaseAccess() 123 | sqlite_manager = SQLiteManager(sqlite_path) 124 | progress = ProgressReporter() 125 | 126 | logger.info(f"Starting parallel definition processing with {max_workers} workers") 127 | logger.info(f"Reading from PostgreSQL table: {table_name}") 128 | logger.info(f"Writing to SQLite: {sqlite_path}") 129 | if limit: 130 | logger.info(f"Processing limit: {limit:,} words") 131 | 132 | processed_count = 0 133 | failed_count = 0 134 | pending_batch = [] 135 | 136 | with ThreadPoolExecutor(max_workers=max_workers) as executor: 137 | # Iterator to track all submitted futures 138 | future_to_word = {} 139 | 140 | # Iterate through PostgreSQL table 141 | row_iterator = db_access.iterate_table( 142 | table_name=table_name, 143 | batch_size=batch_size, 144 | ) 145 | 146 | for row in row_iterator: 147 | # Check limit 148 | if limit and processed_count >= limit: 149 | break 150 | 151 | # Extract the data field if present (PostgreSQL stores JSON in 'data' column) 152 | word_data = row.get('data', row) 153 | word_key = word_data.get('word', 'unknown') if isinstance(word_data, dict) else 'unknown' 154 | 155 | # Submit word for processing 156 | future = executor.submit(process_single_word, word_data) 157 | future_to_word[future] = word_key 158 | 159 | # When we have max_workers futures pending, wait for some to complete 160 | if len(future_to_word) >= max_workers: 161 | # Wait for at least one to complete 162 | for 
future in as_completed(list(future_to_word.keys())): 163 | # Process this completed future and break to continue submitting 164 | if future not in future_to_word: 165 | continue 166 | 167 | word_key = future_to_word.pop(future) 168 | result = future.result() 169 | 170 | if result: 171 | word, definition = result 172 | pending_batch.append((word, definition)) 173 | processed_count += 1 174 | logger.debug(f"Added '{word}' to pending batch (size: {len(pending_batch)})") 175 | 176 | # Write batch when it reaches batch_size 177 | if len(pending_batch) >= batch_size: 178 | logger.debug(f"Writing batch of {len(pending_batch)} definitions to SQLite") 179 | sqlite_manager.insert_definitions_batch(pending_batch) 180 | logger.info(f"Wrote batch to SQLite. Total in DB: {sqlite_manager.count_definitions()}") 181 | pending_batch = [] 182 | 183 | # Report progress 184 | progress.maybe_report(processed_count, failed_count) 185 | else: 186 | failed_count += 1 187 | logger.warning(f"Failed to process: {word_key}") 188 | progress.maybe_report(processed_count, failed_count) 189 | 190 | # Break after processing one to continue submitting more work 191 | break 192 | 193 | # Wait for remaining futures 194 | for future in as_completed(future_to_word.keys()): 195 | word_key = future_to_word[future] 196 | result = future.result() 197 | 198 | if result: 199 | word, definition = result 200 | pending_batch.append((word, definition)) 201 | processed_count += 1 202 | progress.maybe_report(processed_count, failed_count) 203 | else: 204 | failed_count += 1 205 | logger.warning(f"Failed to process: {word_key}") 206 | progress.maybe_report(processed_count, failed_count) 207 | 208 | # Write any remaining definitions 209 | if pending_batch: 210 | logger.info(f"Writing final batch of {len(pending_batch)} definitions to SQLite") 211 | sqlite_manager.insert_definitions_batch(pending_batch) 212 | logger.info(f"Final batch written. Total in DB: {sqlite_manager.count_definitions()}") 213 | 214 | # Final statistics 215 | progress.finalize(processed_count, failed_count) 216 | final_count = sqlite_manager.count_definitions() 217 | logger.info(f"Total definitions in SQLite: {final_count:,}") 218 | 219 | if final_count != processed_count: 220 | logger.warning(f"Mismatch: processed {processed_count} but only {final_count} in database!") 221 | 222 | 223 | if __name__ == "__main__": 224 | import argparse 225 | 226 | parser = argparse.ArgumentParser( 227 | description="Generate dictionary definitions using LLM in parallel." 
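        # Flag defaults below mirror the keyword defaults of run_parallel_definitions().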
228 | ) 229 | parser.add_argument( 230 | "--table", 231 | default="dictionary_en", 232 | help="PostgreSQL table to read dictionary entries from (default: dictionary_en).", 233 | ) 234 | parser.add_argument( 235 | "--batch-size", 236 | type=int, 237 | default=50, 238 | help="Number of rows to fetch from PostgreSQL per batch (default: 50).", 239 | ) 240 | parser.add_argument( 241 | "--workers", 242 | type=int, 243 | default=50, 244 | help="Maximum number of parallel LLM requests (default: 50).", 245 | ) 246 | parser.add_argument( 247 | "--sqlite-path", 248 | default="data/dictionary.sqlite", 249 | help="Path to SQLite database file for storing definitions (default: data/dictionary.sqlite).", 250 | ) 251 | parser.add_argument( 252 | "--limit", 253 | type=int, 254 | help="Optional limit on number of words to process (for testing).", 255 | ) 256 | 257 | args = parser.parse_args() 258 | 259 | run_parallel_definitions( 260 | table_name=args.table, 261 | batch_size=args.batch_size, 262 | max_workers=args.workers, 263 | sqlite_path=args.sqlite_path, 264 | limit=args.limit, 265 | ) 266 | 267 | -------------------------------------------------------------------------------- /src/open_dictionary/wikitionary/pre_process.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import json 4 | import time 5 | from typing import Any, Sequence 6 | 7 | from psycopg import sql 8 | from psycopg.cursor import Cursor 9 | 10 | from open_dictionary.db.access import DatabaseAccess 11 | 12 | try: 13 | from toon import encode as toon_encode 14 | except ImportError: 15 | toon_encode = None 16 | 17 | FETCH_BATCH_SIZE = 5000 18 | UPDATE_BATCH_SIZE = 5000 19 | PROGRESS_EVERY_ROWS = 20_000 20 | PROGRESS_EVERY_SECONDS = 30.0 21 | 22 | _ALLOWED_TOP_LEVEL_KEYS = ( 23 | "pos", 24 | "word", 25 | "forms", 26 | "derived", 27 | "etymology_text", 28 | ) 29 | _SENSE_KEYS = ("glosses", "raw_glosses") 30 | 31 | 32 | def preprocess_entries( 33 | *, 34 | table_name: str, 35 | source_column: str = "data", 36 | target_column: str = "process", 37 | fetch_batch_size: int = FETCH_BATCH_SIZE, 38 | update_batch_size: int = UPDATE_BATCH_SIZE, 39 | progress_every_rows: int = PROGRESS_EVERY_ROWS, 40 | progress_every_seconds: float = PROGRESS_EVERY_SECONDS, 41 | recompute_existing: bool = False, 42 | use_toon: bool = False, 43 | ) -> None: 44 | """Normalize Wiktionary payloads into a slimmer JSONB column.""" 45 | 46 | if fetch_batch_size <= 0: 47 | raise ValueError("fetch_batch_size must be positive") 48 | if update_batch_size <= 0: 49 | raise ValueError("update_batch_size must be positive") 50 | if use_toon and toon_encode is None: 51 | raise ValueError("TOON format requested but 'toon' package is not installed") 52 | 53 | data_access = DatabaseAccess() 54 | _ensure_target_column(data_access, table_name, target_column, use_toon) 55 | 56 | where_clause = None 57 | if not recompute_existing: 58 | where_clause = sql.SQL("{column} IS NULL").format( 59 | column=sql.Identifier(target_column) 60 | ) 61 | 62 | print( 63 | "[pre-process] starting " 64 | f"table={table_name} source={source_column} target={target_column} " 65 | f"fetch_batch={fetch_batch_size} update_batch={update_batch_size} " 66 | f"progress_rows={progress_every_rows} progress_seconds={progress_every_seconds} " 67 | f"recompute_existing={recompute_existing} use_toon={use_toon}", 68 | flush=True, 69 | ) 70 | 71 | processed = 0 72 | updated = 0 73 | skipped = 0 74 | start_time = time.monotonic() 75 | last_log_time 
= start_time 76 | pending_updates: list[tuple[int, str]] = [] 77 | 78 | with data_access.get_connection() as update_conn: 79 | with update_conn.cursor() as cursor: 80 | row_stream = data_access.iterate_table( 81 | table_name, 82 | batch_size=fetch_batch_size, 83 | columns=( 84 | "id", 85 | source_column, 86 | target_column, 87 | ), 88 | where=where_clause, 89 | order_by=("id",), 90 | ) 91 | 92 | for row in row_stream: 93 | row_id = row.get("id") 94 | if row_id is None: 95 | skipped += 1 96 | continue 97 | 98 | payload = _load_payload(row.get(source_column)) 99 | if payload is None: 100 | skipped += 1 101 | continue 102 | 103 | processed_payload = _preprocess_payload(payload) 104 | 105 | if use_toon: 106 | payload_str = convert_to_toon(processed_payload) 107 | else: 108 | payload_str = json.dumps( 109 | processed_payload, 110 | ensure_ascii=False, 111 | separators=(",", ":"), 112 | ) 113 | 114 | pending_updates.append((int(row_id), payload_str)) 115 | 116 | if len(pending_updates) >= update_batch_size: 117 | batch_count = _flush_updates( 118 | cursor, 119 | table_name, 120 | target_column, 121 | pending_updates, 122 | use_toon, 123 | ) 124 | update_conn.commit() 125 | updated += batch_count 126 | pending_updates.clear() 127 | 128 | processed += 1 129 | 130 | emit_progress = False 131 | now = time.monotonic() 132 | if processed == 1: 133 | emit_progress = True 134 | elif progress_every_rows and processed % progress_every_rows == 0: 135 | emit_progress = True 136 | elif progress_every_seconds and (now - last_log_time) >= progress_every_seconds: 137 | emit_progress = True 138 | 139 | if emit_progress: 140 | _report_progress(processed, updated, skipped, start_time) 141 | last_log_time = now 142 | 143 | if pending_updates: 144 | batch_count = _flush_updates( 145 | cursor, 146 | table_name, 147 | target_column, 148 | pending_updates, 149 | use_toon, 150 | ) 151 | update_conn.commit() 152 | updated += batch_count 153 | pending_updates.clear() 154 | 155 | _report_completion(processed, updated, skipped, start_time) 156 | 157 | 158 | def _ensure_target_column( 159 | data_access: DatabaseAccess, 160 | table_name: str, 161 | target_column: str, 162 | use_toon: bool = False, 163 | ) -> None: 164 | column_type = "TEXT" if use_toon else "JSONB" 165 | with data_access.get_connection() as conn: 166 | with conn.cursor() as cursor: 167 | cursor.execute( 168 | sql.SQL( 169 | """ 170 | ALTER TABLE {table} 171 | ADD COLUMN IF NOT EXISTS {column} {type} 172 | """ 173 | ).format( 174 | table=sql.Identifier(table_name), 175 | column=sql.Identifier(target_column), 176 | type=sql.SQL(column_type), 177 | ) 178 | ) 179 | conn.commit() 180 | 181 | 182 | def _flush_updates( 183 | cursor: Cursor[Any], 184 | table_name: str, 185 | target_column: str, 186 | payloads: Sequence[tuple[int, str]], 187 | use_toon: bool = False, 188 | ) -> int: 189 | if not payloads: 190 | return 0 191 | 192 | values_sql = sql.SQL(", ").join( 193 | sql.SQL("(%s::bigint, %s::text)") for _ in payloads 194 | ) 195 | 196 | # When using TOON, store as TEXT; otherwise cast to JSONB 197 | cast_type = "text" if use_toon else "jsonb" 198 | 199 | update_sql = sql.SQL( 200 | """ 201 | UPDATE {table} AS t 202 | SET {column} = v.payload::{cast_type} 203 | FROM (VALUES {values}) AS v(id, payload) 204 | WHERE t.id = v.id 205 | """ 206 | ).format( 207 | table=sql.Identifier(table_name), 208 | column=sql.Identifier(target_column), 209 | cast_type=sql.SQL(cast_type), 210 | values=values_sql, 211 | ) 212 | 213 | params: list[Any] = [] 214 | for row_id, payload_json 
in payloads: 215 | params.extend((row_id, payload_json)) 216 | 217 | cursor.execute(update_sql, params) 218 | return cursor.rowcount 219 | 220 | 221 | def _preprocess_payload(payload: dict[str, Any]) -> dict[str, Any]: 222 | result: dict[str, Any] = {} 223 | 224 | for key in _ALLOWED_TOP_LEVEL_KEYS: 225 | if key in payload: 226 | value = payload[key] 227 | if value is not None: 228 | result[key] = value 229 | 230 | senses = _extract_senses(payload.get("senses")) 231 | if senses is not None: 232 | result["senses"] = senses 233 | 234 | sounds = _extract_sounds(payload.get("sounds")) 235 | if sounds is not None: 236 | result["sounds"] = sounds 237 | 238 | related = _extract_related(payload.get("related")) 239 | if related is not None: 240 | result["related"] = related 241 | 242 | return result 243 | 244 | 245 | def convert_to_toon(payload: dict[str, Any]) -> str: 246 | """Convert a preprocessed dictionary payload to TOON format. 247 | 248 | Args: 249 | payload: The preprocessed dictionary entry (output of _preprocess_payload). 250 | 251 | Returns: 252 | A string representation in TOON format. 253 | 254 | Raises: 255 | ValueError: If the toon package is not available. 256 | """ 257 | if toon_encode is None: 258 | raise ValueError("TOON format requested but 'toon' package is not installed") 259 | 260 | return toon_encode(payload) 261 | 262 | 263 | def _extract_senses(value: Any) -> list[dict[str, list[str]]] | None: 264 | if not isinstance(value, list): 265 | return None 266 | 267 | senses: list[dict[str, list[str]]] = [] 268 | for item in value: 269 | if not isinstance(item, dict): 270 | continue 271 | 272 | sense: dict[str, list[str]] = {} 273 | for key in _SENSE_KEYS: 274 | normalized = _ensure_string_list(item.get(key)) 275 | if normalized is not None: 276 | sense[key] = normalized 277 | 278 | if sense: 279 | senses.append(sense) 280 | 281 | if not senses: 282 | return None 283 | return senses 284 | 285 | 286 | def _extract_sounds(value: Any) -> list[str] | None: 287 | if not isinstance(value, list): 288 | return None 289 | 290 | urls: list[str] = [] 291 | seen: set[str] = set() 292 | for item in value: 293 | if isinstance(item, dict): 294 | candidate = item.get("ogg_url") 295 | else: 296 | candidate = None 297 | 298 | if not isinstance(candidate, str): 299 | continue 300 | 301 | trimmed = candidate.strip() 302 | if not trimmed or trimmed in seen: 303 | continue 304 | 305 | urls.append(trimmed) 306 | seen.add(trimmed) 307 | 308 | if not urls: 309 | return None 310 | return urls 311 | 312 | 313 | def _extract_related(value: Any) -> list[str] | None: 314 | if not isinstance(value, list): 315 | return None 316 | 317 | items: list[str] = [] 318 | seen: set[str] = set() 319 | 320 | for entry in value: 321 | candidate: Any 322 | if isinstance(entry, dict): 323 | candidate = entry.get("word") 324 | elif isinstance(entry, (list, tuple)) and entry: 325 | candidate = entry[0] 326 | else: 327 | candidate = entry 328 | 329 | if not isinstance(candidate, str): 330 | continue 331 | 332 | word = candidate.strip() 333 | if not word or word in seen: 334 | continue 335 | 336 | items.append(word) 337 | seen.add(word) 338 | 339 | if not items: 340 | return None 341 | return items 342 | 343 | 344 | def _ensure_string_list(value: Any) -> list[str] | None: 345 | if value is None: 346 | return None 347 | 348 | items: list[str] = [] 349 | 350 | if isinstance(value, str): 351 | trimmed = value.strip() 352 | if trimmed: 353 | items.append(trimmed) 354 | elif isinstance(value, (list, tuple)): 355 | for entry in value: 
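            # Drop non-string entries and blank strings; only trimmed text is kept.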
356 | if not isinstance(entry, str): 357 | continue 358 | trimmed = entry.strip() 359 | if trimmed: 360 | items.append(trimmed) 361 | else: 362 | return None 363 | 364 | if not items: 365 | return None 366 | return items 367 | 368 | 369 | def _load_payload(value: Any) -> dict[str, Any] | None: 370 | if isinstance(value, dict): 371 | return value 372 | if value is None: 373 | return None 374 | if isinstance(value, bytes): 375 | try: 376 | decoded = value.decode("utf-8") 377 | except UnicodeDecodeError: 378 | return None 379 | return _load_payload(decoded) 380 | if isinstance(value, memoryview): 381 | return _load_payload(value.tobytes()) 382 | if isinstance(value, str): 383 | try: 384 | decoded = json.loads(value) 385 | except json.JSONDecodeError: 386 | return None 387 | if isinstance(decoded, dict): 388 | return decoded 389 | return None 390 | return None 391 | 392 | 393 | def _report_progress( 394 | processed: int, 395 | updated: int, 396 | skipped: int, 397 | start_time: float, 398 | ) -> None: 399 | elapsed = max(time.monotonic() - start_time, 1e-6) 400 | processed_rate = processed / elapsed 401 | print( 402 | f"[pre-process] progress processed={processed:,} " 403 | f"updated={updated:,} skipped={skipped:,} " 404 | f"elapsed={elapsed:,.1f}s rate={processed_rate:,.0f} rows/s", 405 | flush=True, 406 | ) 407 | 408 | 409 | def _report_completion( 410 | processed: int, 411 | updated: int, 412 | skipped: int, 413 | start_time: float, 414 | ) -> None: 415 | elapsed = max(time.monotonic() - start_time, 1e-6) 416 | processed_rate = processed / elapsed if processed else 0.0 417 | print( 418 | f"[pre-process] completed processed={processed:,} " 419 | f"updated={updated:,} skipped={skipped:,} " 420 | f"elapsed={elapsed:,.1f}s avg_rate={processed_rate:,.0f} rows/s", 421 | flush=True, 422 | ) 423 | 424 | 425 | __all__ = [ 426 | "FETCH_BATCH_SIZE", 427 | "UPDATE_BATCH_SIZE", 428 | "PROGRESS_EVERY_ROWS", 429 | "PROGRESS_EVERY_SECONDS", 430 | "preprocess_entries", 431 | "convert_to_toon", 432 | ] 433 | 434 | -------------------------------------------------------------------------------- /src/open_dictionary/llm/define_enricher.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import concurrent.futures 4 | import json 5 | import random 6 | import time 7 | from dataclasses import dataclass 8 | from typing import Any, Callable, Sequence 9 | 10 | from psycopg import sql 11 | from psycopg.cursor import Cursor 12 | 13 | from open_dictionary.db.access import DatabaseAccess 14 | from open_dictionary.llm.define import Definition, define 15 | 16 | DEFAULT_TABLE_NAME = "dictionary_filtered_en" 17 | DEFAULT_SOURCE_COLUMN = "data" 18 | DEFAULT_TARGET_COLUMN = "new_speak" 19 | DEFAULT_FETCH_BATCH_SIZE = 400 20 | DEFAULT_LLM_BATCH_SIZE = 40 21 | DEFAULT_MAX_WORKERS = DEFAULT_LLM_BATCH_SIZE 22 | DEFAULT_MAX_RETRIES = 5 23 | DEFAULT_INITIAL_BACKOFF_SECONDS = 5.0 24 | DEFAULT_MAX_BACKOFF_SECONDS = 60.0 25 | DEFAULT_PROGRESS_EVERY_ROWS = 120 26 | DEFAULT_PROGRESS_EVERY_SECONDS = 30.0 27 | 28 | 29 | @dataclass(frozen=True) 30 | class RowPayload: 31 | row_id: int 32 | payload: str 33 | 34 | 35 | def enrich_definitions( 36 | *, 37 | table_name: str = DEFAULT_TABLE_NAME, 38 | source_column: str = DEFAULT_SOURCE_COLUMN, 39 | target_column: str = DEFAULT_TARGET_COLUMN, 40 | fetch_batch_size: int = DEFAULT_FETCH_BATCH_SIZE, 41 | llm_batch_size: int = DEFAULT_LLM_BATCH_SIZE, 42 | max_workers: int | None = None, 43 | max_retries: int = 
DEFAULT_MAX_RETRIES, 44 | initial_backoff_seconds: float = DEFAULT_INITIAL_BACKOFF_SECONDS, 45 | max_backoff_seconds: float = DEFAULT_MAX_BACKOFF_SECONDS, 46 | progress_every_rows: int = DEFAULT_PROGRESS_EVERY_ROWS, 47 | progress_every_seconds: float = DEFAULT_PROGRESS_EVERY_SECONDS, 48 | recompute_existing: bool = False, 49 | ) -> None: 50 | """Generate LLM-enriched dictionary entries and store them in a JSONB column.""" 51 | 52 | if llm_batch_size <= 0: 53 | raise ValueError("llm_batch_size must be positive") 54 | if fetch_batch_size <= 0: 55 | raise ValueError("fetch_batch_size must be positive") 56 | if max_workers is not None and max_workers <= 0: 57 | raise ValueError("max_workers must be positive when provided") 58 | 59 | data_access = DatabaseAccess() 60 | _ensure_target_column(data_access, table_name, target_column) 61 | 62 | where_clause = None 63 | if not recompute_existing: 64 | where_clause = sql.SQL("{column} IS NULL").format( 65 | column=sql.Identifier(target_column) 66 | ) 67 | 68 | max_workers = max_workers or llm_batch_size 69 | 70 | print( 71 | "[llm-define] starting " 72 | f"table={table_name} source={source_column} target={target_column} " 73 | f"fetch_batch={fetch_batch_size} llm_batch={llm_batch_size} " 74 | f"max_workers={max_workers} retries={max_retries} " 75 | f"backoff_start={initial_backoff_seconds}s backoff_max={max_backoff_seconds}s " 76 | f"recompute_existing={recompute_existing}", 77 | flush=True, 78 | ) 79 | 80 | processed = 0 81 | succeeded = 0 82 | failed = 0 83 | start_time = time.monotonic() 84 | last_log_time = start_time 85 | last_log_count = 0 86 | pending_rows: list[RowPayload] = [] 87 | 88 | def emit_progress(force: bool = False) -> None: 89 | nonlocal last_log_time, last_log_count 90 | now = time.monotonic() 91 | should_emit = force 92 | if not should_emit: 93 | if progress_every_rows and processed - last_log_count >= progress_every_rows: 94 | should_emit = True 95 | if progress_every_seconds and (now - last_log_time) >= progress_every_seconds: 96 | should_emit = True 97 | if should_emit: 98 | _report_progress(processed, succeeded, failed, start_time) 99 | last_log_time = now 100 | last_log_count = processed 101 | 102 | def record_result(is_success: bool) -> None: 103 | nonlocal processed, succeeded, failed 104 | processed += 1 105 | if is_success: 106 | succeeded += 1 107 | else: 108 | failed += 1 109 | emit_progress(force=True) 110 | 111 | with data_access.get_connection() as update_conn: 112 | with update_conn.cursor() as cursor: 113 | row_stream = data_access.iterate_table( 114 | table_name, 115 | batch_size=fetch_batch_size, 116 | columns=( 117 | "id", 118 | source_column, 119 | target_column, 120 | ), 121 | where=where_clause, 122 | order_by=("common_score",), 123 | ) 124 | 125 | for row in row_stream: 126 | row_id = row.get("id") 127 | if row_id is None: 128 | failed += 1 129 | processed += 1 130 | print("[llm-define] skipped row without id", flush=True) 131 | emit_progress(force=True) 132 | continue 133 | 134 | payload = _load_payload(row.get(source_column)) 135 | if payload is None: 136 | failed += 1 137 | processed += 1 138 | print( 139 | f"[llm-define] row_id={row_id} missing or invalid {source_column}", 140 | flush=True, 141 | ) 142 | emit_progress(force=True) 143 | continue 144 | 145 | sanitized_payload = _sanitize_payload(payload) 146 | 147 | pending_rows.append(RowPayload(int(row_id), sanitized_payload)) 148 | 149 | if len(pending_rows) >= llm_batch_size: 150 | _process_batch( 151 | cursor, 152 | table_name, 153 | target_column, 
154 | pending_rows, 155 | max_workers, 156 | max_retries, 157 | initial_backoff_seconds, 158 | max_backoff_seconds, 159 | record_result, 160 | ) 161 | pending_rows.clear() 162 | update_conn.commit() 163 | 164 | if pending_rows: 165 | _process_batch( 166 | cursor, 167 | table_name, 168 | target_column, 169 | pending_rows, 170 | max_workers, 171 | max_retries, 172 | initial_backoff_seconds, 173 | max_backoff_seconds, 174 | record_result, 175 | ) 176 | pending_rows.clear() 177 | update_conn.commit() 178 | 179 | _report_completion(processed, succeeded, failed, start_time) 180 | 181 | 182 | def _process_batch( 183 | cursor: Cursor[Any], 184 | table_name: str, 185 | target_column: str, 186 | rows: Sequence[RowPayload], 187 | max_workers: int, 188 | max_retries: int, 189 | initial_backoff_seconds: float, 190 | max_backoff_seconds: float, 191 | record_result: Callable[[bool], None], 192 | ) -> None: 193 | successes = _run_llm_batch( 194 | rows, 195 | max_workers, 196 | max_retries, 197 | initial_backoff_seconds, 198 | max_backoff_seconds, 199 | record_result, 200 | ) 201 | 202 | _apply_updates(cursor, table_name, target_column, successes) 203 | 204 | 205 | def _run_llm_batch( 206 | rows: Sequence[RowPayload], 207 | max_workers: int, 208 | max_retries: int, 209 | initial_backoff_seconds: float, 210 | max_backoff_seconds: float, 211 | record_result: Callable[[bool], None], 212 | ) -> list[tuple[int, str]]: 213 | successes: list[tuple[int, str]] = [] 214 | 215 | worker_count = min(max(len(rows), 1), max_workers) 216 | 217 | with concurrent.futures.ThreadPoolExecutor(max_workers=worker_count) as executor: 218 | future_to_row = { 219 | executor.submit( 220 | _define_with_retry, 221 | row.payload, 222 | max_retries, 223 | initial_backoff_seconds, 224 | max_backoff_seconds, 225 | ): row 226 | for row in rows 227 | } 228 | 229 | for future in concurrent.futures.as_completed(future_to_row): 230 | row = future_to_row[future] 231 | try: 232 | definition = future.result() 233 | except Exception as exc: # pragma: no cover - network/runtime failures 234 | print( 235 | f"[llm-define] row_id={row.row_id} failed: {exc}", 236 | flush=True, 237 | ) 238 | _log_error(row.row_id, row.payload, exc) 239 | record_result(False) 240 | else: 241 | payload_json = json.dumps( 242 | definition.model_dump(mode="json"), 243 | ensure_ascii=False, 244 | ) 245 | successes.append((row.row_id, payload_json)) 246 | record_result(True) 247 | 248 | return successes 249 | 250 | 251 | def _define_with_retry( 252 | payload: str, 253 | max_retries: int, 254 | initial_backoff_seconds: float, 255 | max_backoff_seconds: float, 256 | ) -> Definition: 257 | attempt = 0 258 | while True: 259 | try: 260 | return define(payload) 261 | except Exception as exc: # pragma: no cover - passthrough for runtime errors 262 | attempt += 1 263 | if attempt >= max_retries: 264 | raise exc 265 | 266 | backoff = min( 267 | max_backoff_seconds, 268 | initial_backoff_seconds * (2 ** (attempt - 1)), 269 | ) 270 | jitter = random.uniform(0.0, initial_backoff_seconds) 271 | sleep_seconds = max(backoff + jitter, 0.0) 272 | time.sleep(sleep_seconds) 273 | 274 | 275 | def _apply_updates( 276 | cursor: Cursor[Any], 277 | table_name: str, 278 | target_column: str, 279 | payloads: Sequence[tuple[int, str]], 280 | ) -> None: 281 | if not payloads: 282 | return 283 | 284 | values_sql = sql.SQL(", ").join( 285 | sql.SQL("(%s::bigint, %s::text)") for _ in payloads 286 | ) 287 | 288 | update_sql = sql.SQL( 289 | """ 290 | UPDATE {table} AS t 291 | SET {column} = v.payload::jsonb 
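            -- Bulk write: join the target table against an inline VALUES list of (id, payload) pairs.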
292 | FROM (VALUES {values}) AS v(id, payload) 293 | WHERE t.id = v.id 294 | """ 295 | ).format( 296 | table=sql.Identifier(table_name), 297 | column=sql.Identifier(target_column), 298 | values=values_sql, 299 | ) 300 | 301 | params: list[Any] = [] 302 | for row_id, payload_json in payloads: 303 | params.extend((row_id, payload_json)) 304 | 305 | cursor.execute(update_sql, params) 306 | 307 | 308 | def _ensure_target_column( 309 | data_access: DatabaseAccess, 310 | table_name: str, 311 | target_column: str, 312 | ) -> None: 313 | with data_access.get_connection() as conn: 314 | with conn.cursor() as cursor: 315 | cursor.execute( 316 | sql.SQL( 317 | """ 318 | ALTER TABLE {table} 319 | ADD COLUMN IF NOT EXISTS {column} JSONB 320 | """ 321 | ).format( 322 | table=sql.Identifier(table_name), 323 | column=sql.Identifier(target_column), 324 | ) 325 | ) 326 | conn.commit() 327 | 328 | 329 | def _load_payload(value: Any) -> str | None: 330 | if isinstance(value, str): 331 | return value 332 | if value is None: 333 | return None 334 | if isinstance(value, dict): 335 | return json.dumps(value, ensure_ascii=False) 336 | if isinstance(value, bytes): 337 | try: 338 | decoded = value.decode("utf-8") 339 | except UnicodeDecodeError: 340 | return None 341 | return decoded 342 | if isinstance(value, memoryview): 343 | return _load_payload(value.tobytes()) 344 | return None 345 | 346 | 347 | def _sanitize_payload(payload: str, max_length: int = 1000) -> str: 348 | """Trim overly large payloads by dropping noisy fields.""" 349 | if len(payload) <= max_length: 350 | return payload 351 | 352 | try: 353 | payload_obj = json.loads(payload) 354 | except (TypeError, json.JSONDecodeError): 355 | return payload 356 | 357 | if isinstance(payload_obj, dict): 358 | for key in ("derived", "forms", "glosses"): 359 | payload_obj.pop(key, None) 360 | 361 | senses = payload_obj.get("senses") 362 | if isinstance(senses, list): 363 | for sense in senses: 364 | if isinstance(sense, dict): 365 | sense.pop("glosses", None) 366 | 367 | return json.dumps(payload_obj, ensure_ascii=False) 368 | 369 | return payload 370 | 371 | 372 | def _report_progress( 373 | processed: int, 374 | succeeded: int, 375 | failed: int, 376 | start_time: float, 377 | ) -> None: 378 | elapsed = max(time.monotonic() - start_time, 1e-6) 379 | rate = processed / elapsed 380 | print( 381 | f"[llm-define] progress processed={processed:,} " 382 | f"succeeded={succeeded:,} failed={failed:,} " 383 | f"elapsed={elapsed:,.1f}s rate={rate:,.0f} rows/s", 384 | flush=True, 385 | ) 386 | 387 | 388 | def _report_completion( 389 | processed: int, 390 | succeeded: int, 391 | failed: int, 392 | start_time: float, 393 | ) -> None: 394 | elapsed = max(time.monotonic() - start_time, 1e-6) 395 | rate = processed / elapsed if processed else 0.0 396 | print( 397 | f"[llm-define] completed processed={processed:,} " 398 | f"succeeded={succeeded:,} failed={failed:,} " 399 | f"elapsed={elapsed:,.1f}s avg_rate={rate:,.0f} rows/s", 400 | flush=True, 401 | ) 402 | 403 | 404 | def _log_error( 405 | row_id: int, 406 | payload: str, 407 | error: Exception, 408 | log_file: str = "data/llm_define_errors.log", 409 | ) -> None: 410 | """Write error details to a log file.""" 411 | import os 412 | from datetime import datetime 413 | 414 | try: 415 | os.makedirs(os.path.dirname(log_file), exist_ok=True) 416 | with open(log_file, "a", encoding="utf-8") as f: 417 | timestamp = datetime.now().isoformat() 418 | f.write(f"\n{'='*80}\n") 419 | f.write(f"Timestamp: {timestamp}\n") 420 | f.write(f"Row 
ID: {row_id}\n") 421 | f.write(f"Error: {type(error).__name__}: {error}\n") 422 | f.write(f"\nPayload:\n{payload}\n") 423 | 424 | # Log the LLM response if it's attached to the exception 425 | if hasattr(error, 'llm_response'): 426 | f.write(f"\nLLM Response:\n{error.llm_response}\n") # type: ignore 427 | 428 | f.write(f"{'='*80}\n") 429 | except Exception as log_exc: # pragma: no cover 430 | print( 431 | f"[llm-define] failed to write error log: {log_exc}", 432 | flush=True, 433 | ) 434 | 435 | 436 | __all__ = [ 437 | "DEFAULT_TABLE_NAME", 438 | "DEFAULT_SOURCE_COLUMN", 439 | "DEFAULT_TARGET_COLUMN", 440 | "DEFAULT_FETCH_BATCH_SIZE", 441 | "DEFAULT_LLM_BATCH_SIZE", 442 | "DEFAULT_MAX_WORKERS", 443 | "DEFAULT_MAX_RETRIES", 444 | "DEFAULT_INITIAL_BACKOFF_SECONDS", 445 | "DEFAULT_MAX_BACKOFF_SECONDS", 446 | "DEFAULT_PROGRESS_EVERY_ROWS", 447 | "DEFAULT_PROGRESS_EVERY_SECONDS", 448 | "enrich_definitions", 449 | ] 450 | -------------------------------------------------------------------------------- /src/open_dictionary/cli.py: -------------------------------------------------------------------------------- 1 | """Command-line entry point for the Open Dictionary toolkit.""" 2 | 3 | from __future__ import annotations 4 | 5 | import argparse 6 | import os 7 | import sys 8 | from pathlib import Path 9 | 10 | import psycopg 11 | from dotenv import load_dotenv 12 | 13 | from .db import cleaner as db_cleaner 14 | from .db import mark_commonness as db_commonness 15 | from .llm import define_enricher as llm_define_enricher 16 | from .wikitionary.downloader import DEFAULT_WIKTIONARY_URL, download_wiktionary_dump 17 | from .wikitionary.extract import extract_wiktionary_dump 18 | from .wikitionary.filter import filter_languages 19 | from .wikitionary import pre_process as wiktionary_pre_process 20 | from .wikitionary.pipeline import run_pipeline 21 | from .wikitionary.transform import ( 22 | JsonlProcessingError, 23 | copy_jsonl_to_postgres, 24 | partition_dictionary_by_language, 25 | ) 26 | 27 | 28 | DEFAULT_DICTIONARY_TABLE = "dictionary_en" 29 | 30 | 31 | COMMAND_NAMES = { 32 | "download", 33 | "extract", 34 | "filter", 35 | "load", 36 | "partition", 37 | "pipeline", 38 | "db-clean", 39 | "db-commonness", 40 | "llm-define", 41 | "pre-process", 42 | } 43 | 44 | 45 | def _add_database_options(parser: argparse.ArgumentParser) -> None: 46 | parser.add_argument( 47 | "--env-file", 48 | default=".env", 49 | help="Path to the .env file containing the database URL (default: .env).", 50 | ) 51 | parser.add_argument( 52 | "--database-url-var", 53 | default="DATABASE_URL", 54 | help="Environment variable name holding the connection string.", 55 | ) 56 | 57 | 58 | def _get_conninfo(args: argparse.Namespace) -> str: 59 | env_file = getattr(args, "env_file", None) 60 | if env_file: 61 | load_dotenv(env_file) 62 | 63 | var_name = getattr(args, "database_url_var", "DATABASE_URL") 64 | if not var_name: 65 | raise RuntimeError("Database URL environment variable name cannot be empty") 66 | 67 | conninfo = os.getenv(var_name) # type: ignore[arg-type] 68 | if not conninfo: 69 | raise RuntimeError( 70 | f"Environment variable {var_name} is not set. Ensure your .env file is loaded." 
71 | ) 72 | 73 | return conninfo 74 | 75 | 76 | def _cmd_download(args: argparse.Namespace) -> int: 77 | try: 78 | destination = download_wiktionary_dump( 79 | args.output, 80 | url=args.url, 81 | overwrite=args.overwrite, 82 | ) 83 | except RuntimeError as exc: # pragma: no cover - network failure guard 84 | args._parser.error(str(exc)) 85 | except OSError as exc: 86 | args._parser.error(str(exc)) 87 | 88 | print(f"Downloaded file to {destination}") # type: ignore[func-returns-value] 89 | return 0 90 | 91 | 92 | def _cmd_extract(args: argparse.Namespace) -> int: 93 | try: 94 | output = extract_wiktionary_dump( 95 | args.input, 96 | args.output, 97 | overwrite=args.overwrite, 98 | ) 99 | except (FileNotFoundError, IsADirectoryError) as exc: 100 | args._parser.error(str(exc)) 101 | except OSError as exc: 102 | args._parser.error(str(exc)) 103 | 104 | print(f"Extracted archive to {output}") # type: ignore[func-returns-value] 105 | return 0 106 | 107 | 108 | def _cmd_load(args: argparse.Namespace) -> int: 109 | try: 110 | conninfo = _get_conninfo(args) 111 | except RuntimeError as exc: 112 | args._parser.error(str(exc)) 113 | 114 | try: 115 | rows_copied = copy_jsonl_to_postgres( 116 | jsonl_path=args.input, 117 | conninfo=conninfo, # type: ignore[arg-type] 118 | table_name=args.table, 119 | column_name=args.column, 120 | truncate=args.truncate, 121 | ) 122 | except (FileNotFoundError, JsonlProcessingError) as exc: 123 | args._parser.error(str(exc)) 124 | except (psycopg.Error, ValueError) as exc: 125 | args._parser.error(f"Database error: {exc}") 126 | 127 | print(f"Copied {rows_copied} rows into {args.table}.{args.column}") # type: ignore[misc] 128 | return 0 129 | 130 | 131 | def _cmd_partition(args: argparse.Namespace) -> int: 132 | try: 133 | conninfo = _get_conninfo(args) 134 | except RuntimeError as exc: 135 | args._parser.error(str(exc)) 136 | 137 | try: 138 | created = partition_dictionary_by_language( 139 | conninfo, # type: ignore[arg-type] 140 | source_table=args.table, 141 | column_name=args.column, 142 | lang_field=args.lang_field, 143 | table_prefix=args.prefix, 144 | target_schema=args.target_schema, 145 | drop_existing=args.drop_existing, 146 | ) 147 | except (psycopg.Error, ValueError) as exc: 148 | args._parser.error(f"Database error: {exc}") 149 | 150 | if created: # type: ignore[truthy-bool] 151 | print("Created/updated tables:") 152 | for table in created: 153 | print(f"- {table}") 154 | else: 155 | print("No language-specific tables were created.") 156 | return 0 157 | 158 | 159 | def _cmd_pipeline(args: argparse.Namespace) -> int: 160 | try: 161 | conninfo = _get_conninfo(args) 162 | except RuntimeError as exc: 163 | args._parser.error(str(exc)) 164 | 165 | try: 166 | run_pipeline( 167 | workdir=args.workdir, 168 | conninfo=conninfo, # type: ignore[arg-type] 169 | table_name=args.table, 170 | column_name=args.column, 171 | url=args.url, 172 | truncate=args.truncate, 173 | skip_download=args.skip_download, 174 | skip_extract=args.skip_extract, 175 | skip_partition=args.skip_partition, 176 | overwrite_download=args.overwrite_download, 177 | overwrite_extract=args.overwrite_extract, 178 | lang_field=args.lang_field, 179 | table_prefix=args.prefix, 180 | target_schema=args.target_schema, 181 | drop_existing_partitions=args.drop_existing_partitions, 182 | ) 183 | except (FileNotFoundError, JsonlProcessingError) as exc: 184 | args._parser.error(str(exc)) 185 | except RuntimeError as exc: # pragma: no cover - network failure guard 186 | args._parser.error(str(exc)) 187 | 
except (psycopg.Error, ValueError) as exc: 188 | args._parser.error(f"Database error: {exc}") 189 | 190 | print("Pipeline completed successfully.") 191 | return 0 192 | 193 | 194 | def _cmd_filter(args: argparse.Namespace) -> int: 195 | try: 196 | conninfo = _get_conninfo(args) 197 | except RuntimeError as exc: 198 | args._parser.error(str(exc)) 199 | 200 | try: 201 | created = filter_languages( 202 | conninfo, # type: ignore[arg-type] 203 | source_table=args.table, 204 | column_name=args.column, 205 | languages=args.languages, 206 | lang_field=args.lang_field, 207 | table_prefix=args.table_prefix, 208 | target_schema=args.target_schema, 209 | drop_existing=args.drop_existing, 210 | ) 211 | except ValueError as exc: 212 | args._parser.error(str(exc)) 213 | except psycopg.Error as exc: 214 | args._parser.error(f"Database error: {exc}") 215 | 216 | if created: # type: ignore[truthy-bool] 217 | print("Created/updated tables:") 218 | for table in created: 219 | print(f"- {table}") 220 | else: 221 | print("No tables were created.") 222 | return 0 223 | 224 | 225 | def _cmd_db_clean(args: argparse.Namespace) -> int: 226 | try: 227 | _ = _get_conninfo(args) 228 | except RuntimeError as exc: 229 | args._parser.error(str(exc)) 230 | 231 | db_cleaner.clean_dictionary_data( 232 | table_name=args.table, 233 | fetch_batch_size=args.fetch_batch_size, 234 | delete_batch_size=args.delete_batch_size, 235 | progress_every_rows=args.progress_every_rows, 236 | progress_every_seconds=args.progress_every_seconds, 237 | ) 238 | return 0 239 | 240 | 241 | def _cmd_db_commonness(args: argparse.Namespace) -> int: 242 | try: 243 | _ = _get_conninfo(args) 244 | except RuntimeError as exc: 245 | args._parser.error(str(exc)) 246 | 247 | db_commonness.enrich_common_score( 248 | table_name=args.table, 249 | fetch_batch_size=args.fetch_batch_size, 250 | update_batch_size=args.update_batch_size, 251 | progress_every_rows=args.progress_every_rows, 252 | progress_every_seconds=args.progress_every_seconds, 253 | recompute_existing=args.recompute_existing, 254 | ) 255 | return 0 256 | 257 | 258 | def _cmd_llm_define(args: argparse.Namespace) -> int: 259 | try: 260 | _ = _get_conninfo(args) 261 | except RuntimeError as exc: 262 | args._parser.error(str(exc)) 263 | 264 | llm_define_enricher.enrich_definitions( 265 | table_name=args.table, 266 | source_column=args.source_column, 267 | target_column=args.target_column, 268 | fetch_batch_size=args.fetch_batch_size, 269 | llm_batch_size=args.llm_batch_size, 270 | max_workers=args.max_workers, 271 | max_retries=args.max_retries, 272 | initial_backoff_seconds=args.initial_backoff_seconds, 273 | max_backoff_seconds=args.max_backoff_seconds, 274 | progress_every_rows=args.progress_every_rows, 275 | progress_every_seconds=args.progress_every_seconds, 276 | recompute_existing=args.recompute_existing, 277 | ) 278 | return 0 279 | 280 | 281 | def _cmd_pre_process(args: argparse.Namespace) -> int: 282 | try: 283 | _ = _get_conninfo(args) 284 | except RuntimeError as exc: 285 | args._parser.error(str(exc)) 286 | 287 | wiktionary_pre_process.preprocess_entries( 288 | table_name=args.table, 289 | source_column=args.source_column, 290 | target_column=args.target_column, 291 | fetch_batch_size=args.fetch_batch_size, 292 | update_batch_size=args.update_batch_size, 293 | progress_every_rows=args.progress_every_rows, 294 | progress_every_seconds=args.progress_every_seconds, 295 | recompute_existing=args.recompute_existing, 296 | use_toon=args.toon, 297 | ) 298 | return 0 299 | 300 | 301 | def 
_build_parser() -> argparse.ArgumentParser:
302 |     parser = argparse.ArgumentParser(
303 |         description="Utilities for downloading, extracting, and loading Wiktionary dumps.",
304 |     )
305 |     subparsers = parser.add_subparsers(dest="command")
306 | 
307 |     download_parser = subparsers.add_parser(
308 |         "download",
309 |         help="Download the raw Wiktionary dump (.jsonl.gz).",
310 |     )
311 |     download_parser.add_argument(
312 |         "--url",
313 |         default=DEFAULT_WIKTIONARY_URL,
314 |         help="Source URL for the Wiktionary dump (default: official raw dataset).",
315 |     )
316 |     download_parser.add_argument(
317 |         "--output",
318 |         type=Path,
319 |         default=Path("data/raw-wiktextract-data.jsonl.gz"),
320 |         help="Where to store the downloaded archive (default: data/raw-wiktextract-data.jsonl.gz).",
321 |     )
322 |     download_parser.add_argument(
323 |         "--overwrite",
324 |         action="store_true",
325 |         help="Overwrite the existing archive if it already exists.",
326 |     )
327 |     download_parser.set_defaults(func=_cmd_download, _parser=download_parser)
328 | 
329 |     extract_parser = subparsers.add_parser(
330 |         "extract",
331 |         help="Extract the downloaded .jsonl.gz archive to a plain JSONL file.",
332 |     )
333 |     extract_parser.add_argument(
334 |         "--input",
335 |         type=Path,
336 |         default=Path("data/raw-wiktextract-data.jsonl.gz"),
337 |         help="Path to the .jsonl.gz archive (default: data/raw-wiktextract-data.jsonl.gz).",
338 |     )
339 |     extract_parser.add_argument(
340 |         "--output",
341 |         type=Path,
342 |         default=Path("data/raw-wiktextract-data.jsonl"),
343 |         help="Where to write the decompressed JSONL file (default: data/raw-wiktextract-data.jsonl).",
344 |     )
345 |     extract_parser.add_argument(
346 |         "--overwrite",
347 |         action="store_true",
348 |         help="Overwrite the extracted JSONL if it already exists.",
349 |     )
350 |     extract_parser.set_defaults(func=_cmd_extract, _parser=extract_parser)
351 | 
352 |     load_parser = subparsers.add_parser(
353 |         "load",
354 |         help="Load a JSONL file into PostgreSQL using COPY.",
355 |     )
356 |     load_parser.add_argument("input", type=Path, help="Path to the JSONL file to load.")
357 |     load_parser.add_argument(
358 |         "--table",
359 |         default="dictionary_all",
360 |         help="Target table name (default: dictionary_all).",
361 |     )
362 |     load_parser.add_argument(
363 |         "--column",
364 |         default="data",
365 |         help="Target JSON/JSONB column name (default: data).",
366 |     )
367 |     load_parser.add_argument(
368 |         "--truncate",
369 |         action="store_true",
370 |         help="Truncate the destination table before inserting new rows.",
371 |     )
372 |     _add_database_options(load_parser)
373 |     load_parser.set_defaults(func=_cmd_load, _parser=load_parser)
374 | 
375 |     partition_parser = subparsers.add_parser(
376 |         "partition",
377 |         help="Split the main dictionary table into per-language tables.",
378 |     )
379 |     partition_parser.add_argument(
380 |         "--table",
381 |         default="dictionary_all",
382 |         help="Source table containing the JSONB data (default: dictionary_all).",
383 |     )
384 |     partition_parser.add_argument(
385 |         "--column",
386 |         default="data",
387 |         help="JSONB column to inspect for language codes (default: data).",
388 |     )
389 |     partition_parser.add_argument(
390 |         "--lang-field",
391 |         default="lang_code",
392 |         help="JSON key inside each entry that stores the language code (default: lang_code).",
393 |     )
394 |     partition_parser.add_argument(
395 |         "--prefix",
396 |         default="dictionary_lang",
397 |         help="Prefix for generated tables (default: dictionary_lang).",
398 |     )
399 |     partition_parser.add_argument(
400 |         "--target-schema",
401 |         help="Optional schema to place the generated tables in (default: current search_path).",
402 |     )
403 |     partition_parser.add_argument(
404 |         "--drop-existing",
405 |         action="store_true",
406 |         help="Drop and recreate each language table before inserting rows.",
407 |     )
408 |     _add_database_options(partition_parser)
409 |     partition_parser.set_defaults(func=_cmd_partition, _parser=partition_parser)
410 | 
411 |     pipeline_parser = subparsers.add_parser(
412 |         "pipeline",
413 |         help="Run the full download → extract → load → partition workflow.",
414 |     )
415 |     pipeline_parser.add_argument(
416 |         "--workdir",
417 |         type=Path,
418 |         default=Path("data"),
419 |         help="Working directory for downloaded/extracted files (default: data).",
420 |     )
421 |     pipeline_parser.add_argument(
422 |         "--url",
423 |         default=DEFAULT_WIKTIONARY_URL,
424 |         help="Source URL for the Wiktionary dump (default: official raw dataset).",
425 |     )
426 |     pipeline_parser.add_argument(
427 |         "--table",
428 |         default="dictionary_all",
429 |         help="Destination table for the raw entries (default: dictionary_all).",
430 |     )
431 |     pipeline_parser.add_argument(
432 |         "--column",
433 |         default="data",
434 |         help="Destination JSONB column name (default: data).",
435 |     )
436 |     pipeline_parser.add_argument(
437 |         "--truncate",
438 |         action="store_true",
439 |         help="Truncate the destination table before inserting new rows.",
440 |     )
441 |     pipeline_parser.add_argument(
442 |         "--skip-download",
443 |         action="store_true",
444 |         help="Skip downloading if the archive is already present.",
445 |     )
446 |     pipeline_parser.add_argument(
447 |         "--skip-extract",
448 |         action="store_true",
449 |         help="Skip extraction if the JSONL file already exists.",
450 |     )
451 |     pipeline_parser.add_argument(
452 |         "--skip-partition",
453 |         action="store_true",
454 |         help="Skip creating per-language tables after loading.",
455 |     )
456 |     pipeline_parser.add_argument(
457 |         "--overwrite-download",
458 |         action="store_true",
459 |         help="Force re-download even if the archive already exists.",
460 |     )
461 |     pipeline_parser.add_argument(
462 |         "--overwrite-extract",
463 |         action="store_true",
464 |         help="Force re-extraction even if the JSONL already exists.",
465 |     )
466 |     pipeline_parser.add_argument(
467 |         "--lang-field",
468 |         default="lang_code",
469 |         help="JSON key inside each entry that stores the language code (default: lang_code).",
470 |     )
471 |     pipeline_parser.add_argument(
472 |         "--prefix",
473 |         default="dictionary_lang",
474 |         help="Prefix for generated language tables (default: dictionary_lang).",
475 |     )
476 |     pipeline_parser.add_argument(
477 |         "--target-schema",
478 |         help="Optional schema to place generated tables in (default: current search_path).",
479 |     )
480 |     pipeline_parser.add_argument(
481 |         "--drop-existing-partitions",
482 |         action="store_true",
483 |         help="Drop existing language tables before rebuilding them.",
484 |     )
485 |     _add_database_options(pipeline_parser)
486 |     pipeline_parser.set_defaults(func=_cmd_pipeline, _parser=pipeline_parser)
487 | 
488 |     filter_parser = subparsers.add_parser(
489 |         "filter",
490 |         help="Filter existing dictionary entries into language-specific tables.",
491 |     )
492 |     filter_parser.add_argument(
493 |         "languages",
494 |         nargs="+",
495 |         help="Language codes to materialize (e.g. en zh fr, or 'all').",
496 |     )
497 |     filter_parser.add_argument(
498 |         "--table",
499 |         default="dictionary_all",
500 |         help="Source table containing the raw entries (default: dictionary_all).",
501 |     )
502 |     filter_parser.add_argument(
503 |         "--column",
504 |         default="data",
505 |         help="JSONB column storing the dictionary payloads (default: data).",
506 |     )
507 |     filter_parser.add_argument(
508 |         "--lang-field",
509 |         default="lang_code",
510 |         help="JSON key containing the language code (default: lang_code).",
511 |     )
512 |     filter_parser.add_argument(
513 |         "--table-prefix",
514 |         default="dictionary_lang",
515 |         help="Base name for materialized tables; language code is appended (default: dictionary_lang).",
516 |     )
517 |     filter_parser.add_argument(
518 |         "--target-schema",
519 |         help="Optional schema for the materialized tables (default: current search_path).",
520 |     )
521 |     filter_parser.add_argument(
522 |         "--drop-existing",
523 |         action="store_true",
524 |         help="Drop existing destination tables before inserting rows.",
525 |     )
526 |     _add_database_options(filter_parser)
527 |     filter_parser.set_defaults(func=_cmd_filter, _parser=filter_parser)
528 | 
529 |     pre_process_parser = subparsers.add_parser(
530 |         "pre-process",
531 |         help="Trim Wiktionary entries to the subset needed by downstream workflows.",
532 |     )
533 |     pre_process_parser.add_argument(
534 |         "--table",
535 |         default="dictionary_all",
536 |         help="Source table containing raw Wiktionary entries (default: %(default)s).",
537 |     )
538 |     pre_process_parser.add_argument(
539 |         "--source-column",
540 |         default="data",
541 |         help="Column storing the original Wiktionary JSON (default: %(default)s).",
542 |     )
543 |     pre_process_parser.add_argument(
544 |         "--target-column",
545 |         default="process",
546 |         help="Column to store the normalized JSON (default: %(default)s).",
547 |     )
548 |     pre_process_parser.add_argument(
549 |         "--fetch-batch-size",
550 |         type=int,
551 |         default=wiktionary_pre_process.FETCH_BATCH_SIZE,
552 |         help="Rows fetched per streaming batch (default: %(default)s).",
553 |     )
554 |     pre_process_parser.add_argument(
555 |         "--update-batch-size",
556 |         type=int,
557 |         default=wiktionary_pre_process.UPDATE_BATCH_SIZE,
558 |         help="Rows updated per write batch (default: %(default)s).",
559 |     )
560 |     pre_process_parser.add_argument(
561 |         "--progress-every-rows",
562 |         type=int,
563 |         default=wiktionary_pre_process.PROGRESS_EVERY_ROWS,
564 |         help="Emit progress after this many processed rows (default: %(default)s).",
565 |     )
566 |     pre_process_parser.add_argument(
567 |         "--progress-every-seconds",
568 |         type=float,
569 |         default=wiktionary_pre_process.PROGRESS_EVERY_SECONDS,
570 |         help="Emit progress at least this often in seconds (default: %(default)s).",
571 |     )
572 |     pre_process_parser.add_argument(
573 |         "--recompute-existing",
574 |         action="store_true",
575 |         help="Regenerate payloads even if the target column is already populated.",
576 |     )
577 |     pre_process_parser.add_argument(
578 |         "--toon",
579 |         action="store_true",
580 |         help="Convert processed payloads to TOON format (reduces token usage for LLMs).",
581 |     )
582 |     _add_database_options(pre_process_parser)
583 |     pre_process_parser.set_defaults(func=_cmd_pre_process, _parser=pre_process_parser)
584 | 
585 |     db_clean_parser = subparsers.add_parser(
586 |         "db-clean",
587 |         help="Remove low-quality entries from a dictionary table.",
588 |     )
589 |     db_clean_parser.add_argument(
590 |         "--table",
591 |         default=DEFAULT_DICTIONARY_TABLE,
592 |         help="Source table containing JSONB entries (default: %(default)s).",
593 |     )
594 |     db_clean_parser.add_argument(
595 |         "--fetch-batch-size",
596 |         type=int,
597 |         default=db_cleaner.FETCH_BATCH_SIZE,
598 |         help="Number of rows to fetch per batch (default: %(default)s).",
599 |     )
600 |     db_clean_parser.add_argument(
601 |         "--delete-batch-size",
602 |         type=int,
603 |         default=db_cleaner.DELETE_BATCH_SIZE,
604 |         help="Number of rows to delete per batch (default: %(default)s).",
605 |     )
606 |     db_clean_parser.add_argument(
607 |         "--progress-every-rows",
608 |         type=int,
609 |         default=db_cleaner.PROGRESS_EVERY_ROWS,
610 |         help="Emit progress after this many processed rows (default: %(default)s).",
611 |     )
612 |     db_clean_parser.add_argument(
613 |         "--progress-every-seconds",
614 |         type=float,
615 |         default=db_cleaner.PROGRESS_EVERY_SECONDS,
616 |         help="Emit progress at least this often in seconds (default: %(default)s).",
617 |     )
618 |     _add_database_options(db_clean_parser)
619 |     db_clean_parser.set_defaults(func=_cmd_db_clean, _parser=db_clean_parser)
620 | 
621 |     db_common_parser = subparsers.add_parser(
622 |         "db-commonness",
623 |         help="Populate the common_score column using word frequency data.",
624 |     )
625 |     db_common_parser.add_argument(
626 |         "--table",
627 |         default=DEFAULT_DICTIONARY_TABLE,
628 |         help="Target dictionary table (default: %(default)s).",
629 |     )
630 |     db_common_parser.add_argument(
631 |         "--fetch-batch-size",
632 |         type=int,
633 |         default=db_commonness.FETCH_BATCH_SIZE,
634 |         help="Number of rows to fetch per batch (default: %(default)s).",
635 |     )
636 |     db_common_parser.add_argument(
637 |         "--update-batch-size",
638 |         type=int,
639 |         default=db_commonness.UPDATE_BATCH_SIZE,
640 |         help="Number of rows to update per batch (default: %(default)s).",
641 |     )
642 |     db_common_parser.add_argument(
643 |         "--progress-every-rows",
644 |         type=int,
645 |         default=db_commonness.PROGRESS_EVERY_ROWS,
646 |         help="Emit progress after this many processed rows (default: %(default)s).",
647 |     )
648 |     db_common_parser.add_argument(
649 |         "--progress-every-seconds",
650 |         type=float,
651 |         default=db_commonness.PROGRESS_EVERY_SECONDS,
652 |         help="Emit progress at least this often in seconds (default: %(default)s).",
653 |     )
654 |     db_common_parser.add_argument(
655 |         "--recompute-existing",
656 |         action="store_true",
657 |         help="Recalculate scores even if a value already exists.",
658 |     )
659 |     _add_database_options(db_common_parser)
660 |     db_common_parser.set_defaults(func=_cmd_db_commonness, _parser=db_common_parser)
661 | 
662 |     llm_define_parser = subparsers.add_parser(
663 |         "llm-define",
664 |         help="Generate enriched dictionary entries via the LLM define workflow.",
665 |     )
666 |     llm_define_parser.add_argument(
667 |         "--table",
668 |         default=llm_define_enricher.DEFAULT_TABLE_NAME,
669 |         help="Source table containing JSONB entries (default: %(default)s).",
670 |     )
671 |     llm_define_parser.add_argument(
672 |         "--source-column",
673 |         default=llm_define_enricher.DEFAULT_SOURCE_COLUMN,
674 |         help="Column containing original Wiktionary payloads (default: %(default)s).",
675 |     )
676 |     llm_define_parser.add_argument(
677 |         "--target-column",
678 |         default=llm_define_enricher.DEFAULT_TARGET_COLUMN,
679 |         help="Column to store LLM-enriched JSONB (default: %(default)s).",
680 |     )
681 |     llm_define_parser.add_argument(
682 |         "--fetch-batch-size",
683 |         type=int,
684 |         default=llm_define_enricher.DEFAULT_FETCH_BATCH_SIZE,
685 |         help="Rows fetched from PostgreSQL per server-side batch (default: %(default)s).",
686 |     )
687 |     llm_define_parser.add_argument(
688 |         "--llm-batch-size",
689 |         type=int,
690 |         default=llm_define_enricher.DEFAULT_LLM_BATCH_SIZE,
691 |         help="Number of requests dispatched to the LLM at once (default: %(default)s).",
692 |     )
693 |     llm_define_parser.add_argument(
694 |         "--max-workers",
695 |         type=int,
696 |         help="Maximum concurrent worker threads for LLM calls (default: llm-batch-size).",
697 |     )
698 |     llm_define_parser.add_argument(
699 |         "--max-retries",
700 |         type=int,
701 |         default=llm_define_enricher.DEFAULT_MAX_RETRIES,
702 |         help="Attempts per row before giving up (default: %(default)s).",
703 |     )
704 |     llm_define_parser.add_argument(
705 |         "--initial-backoff-seconds",
706 |         type=float,
707 |         default=llm_define_enricher.DEFAULT_INITIAL_BACKOFF_SECONDS,
708 |         help="Initial retry backoff in seconds (default: %(default)s).",
709 |     )
710 |     llm_define_parser.add_argument(
711 |         "--max-backoff-seconds",
712 |         type=float,
713 |         default=llm_define_enricher.DEFAULT_MAX_BACKOFF_SECONDS,
714 |         help="Maximum retry backoff in seconds (default: %(default)s).",
715 |     )
716 |     llm_define_parser.add_argument(
717 |         "--progress-every-rows",
718 |         type=int,
719 |         default=llm_define_enricher.DEFAULT_PROGRESS_EVERY_ROWS,
720 |         help="Emit progress after processing this many rows (default: %(default)s).",
721 |     )
722 |     llm_define_parser.add_argument(
723 |         "--progress-every-seconds",
724 |         type=float,
725 |         default=llm_define_enricher.DEFAULT_PROGRESS_EVERY_SECONDS,
726 |         help="Emit progress at least this often in seconds (default: %(default)s).",
727 |     )
728 |     llm_define_parser.add_argument(
729 |         "--recompute-existing",
730 |         action="store_true",
731 |         help="Recreate target-column payloads even if already populated.",
732 |     )
733 |     _add_database_options(llm_define_parser)
734 |     llm_define_parser.set_defaults(func=_cmd_llm_define, _parser=llm_define_parser)
735 | 
736 |     return parser
737 | 
738 | 
739 | def main(argv: list[str] | None = None) -> int:
740 |     parser = _build_parser()
741 | 
742 |     if argv is None:
743 |         argv_list = sys.argv[1:]
744 |     else:
745 |         argv_list = list(argv)
746 | 
747 |     if argv_list and not argv_list[0].startswith("-") and argv_list[0] not in COMMAND_NAMES:
748 |         argv_list = ["load", *argv_list]
749 | 
750 |     args = parser.parse_args(argv_list)
751 | 
752 |     func = getattr(args, "func", None)
753 |     if func is None:
754 |         parser.print_help()
755 |         return 1
756 | 
757 |     return func(args)
758 | 
759 | 
760 | if __name__ == "__main__": # pragma: no cover - CLI entry guard
761 |     sys.exit(main())
762 | 
763 | 
764 | __all__ = ["main"]
765 | 
--------------------------------------------------------------------------------
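Because `main()` accepts an explicit argument list, the commands registered above are easy to drive from Python as well as from the shell. A minimal sketch, assuming this parser lives in `open_dictionary/cli.py` (the module that `open_dictionary/__init__.py` wraps for the `open-dictionary` console script) and that database connectivity is configured through the options added by `_add_database_options` (not shown here):

from open_dictionary.cli import main

# Full download → extract → load → partition run, truncating the destination table first.
exit_code = main(["pipeline", "--truncate"])

# A leading argument that is not a registered command name is treated as an implicit
# "load", so this is equivalent to main(["load", "data/raw-wiktextract-data.jsonl"]).
exit_code = main(["data/raw-wiktextract-data.jsonl"])

# Materialize per-language tables for selected language codes (or pass "all").
exit_code = main(["filter", "en", "zh", "fr"])

Each call returns the subcommand's integer exit code, which is what the console-script wrapper hands to `SystemExit`.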