├── CLAUDE.md ├── .python-version ├── .serena ├── .gitignore ├── memories │ └── wiktionary_cli_refactor.md └── project.yml ├── data └── dictionary.sqlite ├── .gitignore ├── src └── open_dictionary │ ├── __init__.py │ ├── llm │ ├── llm_client.py │ ├── define.py │ └── define_enricher.py │ ├── utils │ └── env_loader.py │ ├── wikitionary │ ├── extract.py │ ├── downloader.py │ ├── filter.py │ ├── pipeline.py │ ├── progress.py │ ├── transform.py │ └── pre_process.py │ ├── db │ ├── access.py │ ├── sqlite_manager.py │ ├── cleaner.py │ └── mark_commonness.py │ ├── workflow.py │ └── cli.py ├── pyproject.toml ├── README.md └── AGENTS.md /CLAUDE.md: -------------------------------------------------------------------------------- 1 | AGENTS.md -------------------------------------------------------------------------------- /.python-version: -------------------------------------------------------------------------------- 1 | 3.12 2 | -------------------------------------------------------------------------------- /.serena/.gitignore: -------------------------------------------------------------------------------- 1 | /cache 2 | -------------------------------------------------------------------------------- /data/dictionary.sqlite: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahpxex/open-dictionary/HEAD/data/dictionary.sqlite -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Python-generated files 2 | __pycache__/ 3 | *.py[oc] 4 | build/ 5 | dist/ 6 | wheels/ 7 | *.egg-info 8 | 9 | # Virtual environments 10 | .venv 11 | .env 12 | words.txt 13 | .DS_Store -------------------------------------------------------------------------------- /src/open_dictionary/__init__.py: -------------------------------------------------------------------------------- 1 | from .cli import main as _cli_main 2 | 3 | def importer() -> None: 4 | raise SystemExit(_cli_main()) 5 | 6 | def main() -> None: 7 | raise SystemExit(_cli_main()) 8 | -------------------------------------------------------------------------------- /.serena/memories/wiktionary_cli_refactor.md: -------------------------------------------------------------------------------- 1 | Refactored Open Dictionary CLI: new central src/open_dictionary/cli.py registers commands; streaming logic in wikitionary/transform.py uses shared StreamingProgress from wikitionary/progress.py; pipeline orchestration moved to wikitionary/pipeline.py; filter command supports 'all' languages with progress output; README documents filter usage. 
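For reference, the shared reporter mentioned above is used roughly like this (a minimal sketch based on `wikitionary/progress.py`; the file path, label, and counters are illustrative and not taken from `transform.py`):

```python
# Minimal sketch of the shared StreamingProgress reporter; the path and label
# here are illustrative examples, not the actual transform.py wiring.
from pathlib import Path

from open_dictionary.wikitionary.progress import StreamingProgress

jsonl_path = Path("data/raw-wiktextract-data.jsonl")
progress = StreamingProgress(jsonl_path.stat().st_size, label="Loading")

rows = 0
bytes_read = 0
with jsonl_path.open("rb") as handle:
    for line in handle:
        rows += 1
        bytes_read += len(line)
        progress.report(rows, bytes_read)  # prints only when row/byte/time thresholds pass
progress.finalize(rows, bytes_read)  # forces a final summary line
```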
-------------------------------------------------------------------------------- /src/open_dictionary/llm/llm_client.py: -------------------------------------------------------------------------------- 1 | from openai import OpenAI 2 | from open_dictionary.utils.env_loader import get_env 3 | 4 | client = OpenAI( 5 | # This is the default and can be omitted 6 | api_key=get_env('LLM_KEY'), 7 | base_url=get_env('LLM_API'), 8 | ) 9 | 10 | def get_chat_response(instructions: str, input: str) -> str: 11 | response = client.responses.create( 12 | model=get_env('LLM_MODEL'), # type: ignore 13 | instructions=instructions, 14 | input=input, 15 | temperature=0.1 16 | ) 17 | 18 | return response.output_text -------------------------------------------------------------------------------- /src/open_dictionary/utils/env_loader.py: -------------------------------------------------------------------------------- 1 | from os import getenv 2 | from dotenv import load_dotenv 3 | from typing import Literal 4 | 5 | load_dotenv() 6 | 7 | EnvKey = Literal['LLM_MODEL', 'LLM_KEY', 'LLM_API', 'DATABASE_URL'] 8 | 9 | def get_env(key: EnvKey, default: str | None = None) -> str | None: 10 | """Get environment variable value. 11 | 12 | Args: 13 | key: Environment variable key 14 | default: Default value if key not found 15 | 16 | Returns: 17 | Environment variable value or default 18 | """ 19 | return getenv(key, default) 20 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "open-dictionary" 3 | version = "0.1.0" 4 | description = "Add your description here" 5 | readme = "README.md" 6 | authors = [ 7 | { name = "ahpx", email = "AHpx@yandex.com" } 8 | ] 9 | requires-python = ">=3.12" 10 | dependencies = [ 11 | "dotenv>=0.9.9", 12 | "openai>=2.6.1", 13 | "psycopg[binary]>=3.2,<4", 14 | "python-dotenv>=1.0,<2", 15 | "python-toon>=0.1.2", 16 | "wordfreq>=3.1.1", 17 | ] 18 | 19 | [project.scripts] 20 | open-dictionary = "open_dictionary:main" 21 | 22 | [build-system] 23 | requires = ["uv_build>=0.8.23,<0.9.0"] 24 | build-backend = "uv_build" 25 | -------------------------------------------------------------------------------- /src/open_dictionary/wikitionary/extract.py: -------------------------------------------------------------------------------- 1 | """Extraction helpers for the Wiktionary JSONL archive.""" 2 | 3 | from __future__ import annotations 4 | 5 | import gzip 6 | import sys 7 | from pathlib import Path 8 | 9 | from .progress import ByteProgressPrinter 10 | 11 | 12 | def extract_wiktionary_dump( 13 | source: Path, 14 | destination: Path, 15 | *, 16 | overwrite: bool = False, 17 | chunk_size: int = 32 * 1024 * 1024, 18 | ) -> Path: 19 | """Extract a Wiktionary ``.jsonl.gz`` archive to ``destination``.""" 20 | 21 | source_path = Path(source) 22 | if not source_path.is_file(): 23 | raise FileNotFoundError(f"Source archive {source_path} does not exist") 24 | 25 | dest_path = Path(destination) 26 | if dest_path.exists() and dest_path.is_dir(): 27 | raise IsADirectoryError(f"Destination {dest_path} is a directory") 28 | 29 | if dest_path.exists() and not overwrite: 30 | print(f"Extraction skipped; {dest_path} already exists.", file=sys.stderr) 31 | return dest_path 32 | 33 | dest_path.parent.mkdir(parents=True, exist_ok=True) 34 | 35 | total_size = source_path.stat().st_size 36 | progress = ByteProgressPrinter("Extracting", total_size) 37 | 38 | with 
source_path.open("rb") as raw_handle: 39 | with gzip.GzipFile(fileobj=raw_handle) as gz_handle: 40 | with dest_path.open("wb") as out_handle: 41 | while True: 42 | chunk = gz_handle.read(chunk_size) 43 | if not chunk: 44 | break 45 | out_handle.write(chunk) 46 | progress.report(raw_handle.tell()) 47 | 48 | progress.finalize(total_size) 49 | return dest_path 50 | 51 | 52 | __all__ = ["extract_wiktionary_dump"] 53 | -------------------------------------------------------------------------------- /src/open_dictionary/wikitionary/downloader.py: -------------------------------------------------------------------------------- 1 | """Streaming download helpers for the Wiktionary dataset.""" 2 | 3 | from __future__ import annotations 4 | 5 | import sys 6 | import urllib.error 7 | import urllib.request 8 | from pathlib import Path 9 | 10 | from .progress import ByteProgressPrinter 11 | 12 | 13 | DEFAULT_WIKTIONARY_URL = "https://kaikki.org/dictionary/raw-wiktextract-data.jsonl.gz" 14 | 15 | 16 | def download_wiktionary_dump( 17 | destination: Path, 18 | *, 19 | url: str = DEFAULT_WIKTIONARY_URL, 20 | overwrite: bool = False, 21 | chunk_size: int = 32 * 1024 * 1024, 22 | ) -> Path: 23 | """Download a Wiktionary dump to ``destination`` with streaming progress.""" 24 | 25 | dest_path = Path(destination) 26 | if dest_path.exists() and dest_path.is_dir(): 27 | raise IsADirectoryError(f"Destination {dest_path} is a directory") 28 | 29 | if dest_path.exists() and not overwrite: 30 | print(f"Download skipped; {dest_path} already exists.", file=sys.stderr) 31 | return dest_path 32 | 33 | dest_path.parent.mkdir(parents=True, exist_ok=True) 34 | 35 | downloaded = 0 36 | try: 37 | with urllib.request.urlopen(url) as response: 38 | total_size = int(response.headers.get("Content-Length", "0") or 0) 39 | progress = ByteProgressPrinter("Downloading", total_size) 40 | 41 | with dest_path.open("wb") as out_handle: 42 | while True: 43 | chunk = response.read(chunk_size) 44 | if not chunk: 45 | break 46 | out_handle.write(chunk) 47 | downloaded += len(chunk) 48 | progress.report(downloaded) 49 | 50 | progress.finalize(downloaded) 51 | 52 | except urllib.error.URLError as exc: # pragma: no cover - network failure guard 53 | raise RuntimeError(f"Failed to download Wiktionary dump: {exc}") from exc 54 | return dest_path 55 | 56 | 57 | __all__ = ["DEFAULT_WIKTIONARY_URL", "download_wiktionary_dump"] 58 | -------------------------------------------------------------------------------- /src/open_dictionary/wikitionary/filter.py: -------------------------------------------------------------------------------- 1 | """Business logic for filtering Wiktionary entries into language-specific tables.""" 2 | 3 | from __future__ import annotations 4 | 5 | import sys 6 | from typing import Sequence 7 | 8 | from .transform import partition_dictionary_by_language 9 | 10 | 11 | def filter_languages( 12 | conninfo: str, 13 | *, 14 | source_table: str, 15 | column_name: str, 16 | languages: Sequence[str], 17 | lang_field: str = "lang_code", 18 | table_prefix: str = "dictionary_lang", 19 | target_schema: str | None = None, 20 | drop_existing: bool = False, 21 | ) -> list[str]: 22 | """Create language-specific tables for the requested ``languages`` only.""" 23 | 24 | if not languages: 25 | raise ValueError("At least one language code must be provided.") 26 | 27 | normalized: list[str] = [] 28 | include_all = False 29 | for raw_code in languages: 30 | code = (raw_code or "").strip() 31 | if not code: 32 | continue 33 | if code.lower() == 
"all": 34 | include_all = True 35 | break 36 | normalized.append(code) 37 | 38 | language_list: Sequence[str] | None 39 | if include_all: 40 | print( 41 | f"[filter] Materializing all languages from {source_table}.{column_name}...", 42 | file=sys.stderr, 43 | flush=True, 44 | ) 45 | language_list = None 46 | else: 47 | if not normalized: 48 | raise ValueError("At least one non-empty language code must be provided.") 49 | display_codes = ", ".join(normalized[:5]) 50 | if len(normalized) > 5: 51 | display_codes += ", ..." 52 | print( 53 | ( 54 | f"[filter] Materializing {len(normalized)} language(s) " 55 | f"({display_codes}) from {source_table}.{column_name}..." 56 | ), 57 | file=sys.stderr, 58 | flush=True, 59 | ) 60 | language_list = normalized 61 | 62 | return partition_dictionary_by_language( 63 | conninfo, 64 | source_table=source_table, 65 | column_name=column_name, 66 | lang_field=lang_field, 67 | table_prefix=table_prefix, 68 | target_schema=target_schema, 69 | drop_existing=drop_existing, 70 | languages=language_list, 71 | ) 72 | 73 | 74 | __all__ = [ 75 | "filter_languages", 76 | ] 77 | -------------------------------------------------------------------------------- /src/open_dictionary/wikitionary/pipeline.py: -------------------------------------------------------------------------------- 1 | """Workflow helpers for streaming Wiktionary dumps into PostgreSQL.""" 2 | 3 | from __future__ import annotations 4 | 5 | import sys 6 | import urllib.parse 7 | from pathlib import Path 8 | 9 | from .downloader import DEFAULT_WIKTIONARY_URL, download_wiktionary_dump 10 | from .extract import extract_wiktionary_dump 11 | from .transform import copy_jsonl_to_postgres, partition_dictionary_by_language 12 | 13 | 14 | def run_pipeline( 15 | *, 16 | workdir: Path, 17 | conninfo: str, 18 | table_name: str, 19 | column_name: str, 20 | url: str = DEFAULT_WIKTIONARY_URL, 21 | truncate: bool = False, 22 | skip_download: bool = False, 23 | skip_extract: bool = False, 24 | skip_partition: bool = False, 25 | overwrite_download: bool = False, 26 | overwrite_extract: bool = False, 27 | lang_field: str = "lang_code", 28 | table_prefix: str = "dictionary_lang", 29 | target_schema: str | None = None, 30 | drop_existing_partitions: bool = False, 31 | ) -> None: 32 | """Execute the full download → extract → load → partition workflow.""" 33 | 34 | workdir = Path(workdir) 35 | workdir.mkdir(parents=True, exist_ok=True) 36 | 37 | parsed = urllib.parse.urlparse(url) 38 | filename = Path(parsed.path or "wiktextract.jsonl.gz").name 39 | gz_path = workdir / filename 40 | jsonl_path = gz_path.with_suffix("") 41 | 42 | if not skip_download: 43 | print( 44 | f"Downloading Wiktionary dump from {url} to {gz_path}...", 45 | file=sys.stderr, 46 | ) 47 | download_wiktionary_dump( 48 | gz_path, 49 | url=url, 50 | overwrite=overwrite_download, 51 | ) 52 | else: 53 | print(f"Skipping download step; reusing {gz_path}", file=sys.stderr) 54 | 55 | if not gz_path.exists(): 56 | raise FileNotFoundError(f"Expected archive {gz_path} after download step") 57 | 58 | if not skip_extract: 59 | print( 60 | f"Extracting {gz_path} to {jsonl_path}...", 61 | file=sys.stderr, 62 | ) 63 | extract_wiktionary_dump( 64 | gz_path, 65 | jsonl_path, 66 | overwrite=overwrite_extract, 67 | ) 68 | else: 69 | print(f"Skipping extract step; reusing {jsonl_path}", file=sys.stderr) 70 | 71 | if not jsonl_path.exists(): 72 | raise FileNotFoundError(f"Expected JSONL file {jsonl_path} after extract step") 73 | 74 | rows_copied = copy_jsonl_to_postgres( 75 | 
jsonl_path=jsonl_path, 76 | conninfo=conninfo, 77 | table_name=table_name, 78 | column_name=column_name, 79 | truncate=truncate, 80 | ) 81 | print( 82 | f"Finished loading {rows_copied:,} rows into {table_name}.{column_name}", 83 | file=sys.stderr, 84 | ) 85 | 86 | if skip_partition: 87 | print("Partition step skipped by configuration.", file=sys.stderr) 88 | return 89 | 90 | partition_dictionary_by_language( 91 | conninfo, 92 | source_table=table_name, 93 | column_name=column_name, 94 | lang_field=lang_field, 95 | table_prefix=table_prefix, 96 | target_schema=target_schema, 97 | drop_existing=drop_existing_partitions, 98 | ) 99 | 100 | 101 | __all__ = ["run_pipeline"] 102 | -------------------------------------------------------------------------------- /src/open_dictionary/db/access.py: -------------------------------------------------------------------------------- 1 | from typing import Iterator, Any, Sequence, Tuple, Union 2 | import uuid 3 | import psycopg 4 | from psycopg.rows import dict_row 5 | from psycopg import sql 6 | from psycopg.sql import Composable 7 | 8 | from open_dictionary.utils.env_loader import get_env 9 | 10 | ColumnSpec = Union[str, Tuple[str, Composable]] 11 | 12 | class DatabaseAccess: 13 | """Database access layer for dictionary tables.""" 14 | 15 | def __init__(self, connection_string: str | None = None): 16 | resolved = connection_string or get_env("DATABASE_URL") 17 | if not resolved: 18 | raise RuntimeError("Database connection string is not configured") 19 | self.connection_string = resolved 20 | 21 | def _get_connection(self): 22 | """Get database connection.""" 23 | return psycopg.connect(self.connection_string) # type: ignore 24 | 25 | def get_connection(self): 26 | """Return a new psycopg connection using the configured DSN.""" 27 | return self._get_connection() 28 | 29 | def iterate_table( 30 | self, 31 | table_name: str, 32 | batch_size: int = 20, 33 | *, 34 | columns: Sequence[ColumnSpec] | None = None, 35 | where: Composable | None = None, 36 | order_by: Sequence[str] | None = None, 37 | ) -> Iterator[dict[str, Any]]: 38 | """Iterate over all rows in a table using server-side cursor for memory efficiency. 
39 | 40 | Args: 41 | table_name: Name of the table to iterate 42 | batch_size: Number of rows to fetch per batch 43 | columns: Specific columns to select (defaults to all) 44 | where: Optional SQL WHERE clause (Composable) to filter rows 45 | order_by: Optional list of columns to order the results 46 | 47 | Yields: 48 | Dictionary containing row data with column names as keys 49 | """ 50 | def _compile_column_spec(column: ColumnSpec) -> Composable: 51 | if isinstance(column, tuple): 52 | alias, expression = column 53 | if not isinstance(expression, Composable): 54 | raise TypeError("Expression must be a psycopg Composable instance") 55 | return sql.Composed( 56 | [sql.SQL("("), expression, sql.SQL(") AS "), sql.Identifier(alias)] 57 | ) 58 | 59 | return sql.Identifier(column) 60 | 61 | if columns: 62 | compiled_columns = [_compile_column_spec(col) for col in columns] 63 | column_clause = sql.SQL(", ").join(compiled_columns) 64 | else: 65 | column_clause = sql.SQL("*") 66 | 67 | query = sql.SQL("SELECT {columns} FROM {table}").format( 68 | columns=column_clause, 69 | table=sql.Identifier(table_name), 70 | ) 71 | 72 | if where is not None: 73 | query += sql.SQL(" WHERE ") + where 74 | 75 | if order_by: 76 | order_clause = sql.SQL(", ").join(sql.Identifier(col) for col in order_by) 77 | query += sql.SQL(" ORDER BY ") + order_clause 78 | 79 | cursor_name = f"fetch_cursor_{uuid.uuid4().hex}" 80 | 81 | with self._get_connection() as conn: 82 | with conn.cursor(row_factory=dict_row, name=cursor_name) as cursor: 83 | cursor.execute(query) # type: ignore 84 | 85 | while True: 86 | rows = cursor.fetchmany(batch_size) 87 | if not rows: 88 | break 89 | 90 | for row in rows: 91 | yield row 92 | 93 | 94 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Open English Dictionary 2 | 3 | ## Rebuilding process WIP 4 | 5 | ## Currently, this project is being rebuilt. 
6 | 
7 | New features are:
8 | 
9 | - Streamlined process + pipeline integration
10 | - Wiktionary grounding + LLM explanations
11 | - Enormous word data across multiple languages
12 | - Extremely detailed definitions
13 | - New distribution formats: JSONL, SQLite, and more to be determined
14 | - Options to select a specific category of words
15 | 
16 | **Behold and stay tuned!**
17 | 
18 | ## Prerequisites
19 | 
20 | - Install project dependencies: `uv sync`
21 | - Configure a `.env` file with `DATABASE_URL`
22 | - Ensure a PostgreSQL database is reachable via that URL
23 | 
24 | ## Run The Wiktionary Workflow
25 | 
26 | Download the compressed dump:
27 | 
28 | ```bash
29 | uv run open-dictionary download --output data/raw-wiktextract-data.jsonl.gz
30 | ```
31 | 
32 | Extract the JSONL file:
33 | 
34 | ```bash
35 | uv run open-dictionary extract \
36 | --input data/raw-wiktextract-data.jsonl.gz \
37 | --output data/raw-wiktextract-data.jsonl
38 | ```
39 | 
40 | Stream the JSONL into PostgreSQL (`dictionary_all.data` is JSONB):
41 | 
42 | ```bash
43 | uv run open-dictionary load data/raw-wiktextract-data.jsonl \
44 | --table dictionary_all \
45 | --column data \
46 | --truncate
47 | ```
48 | 
49 | Run everything end-to-end with optional partitioning:
50 | 
51 | ```bash
52 | uv run open-dictionary pipeline \
53 | --workdir data \
54 | --table dictionary_all \
55 | --column data \
56 | --truncate
57 | ```
58 | 
59 | Split rows by language code into per-language tables when needed:
60 | 
61 | ```bash
62 | uv run open-dictionary partition \
63 | --table dictionary_all \
64 | --column data \
65 | --lang-field lang_code
66 | ```
67 | 
68 | Materialize a smaller set of languages into dedicated tables with a custom prefix:
69 | 
70 | ```bash
71 | uv run open-dictionary filter en zh \
72 | --table dictionary_all \
73 | --column data \
74 | --table-prefix dictionary_filtered
75 | ```
76 | 
77 | Pass `all` to emit every language into its own table:
78 | 
79 | ```bash
80 | uv run open-dictionary filter all --table dictionary_all --column data
81 | ```
82 | 
83 | Populate the `common_score` column with word frequency data (re-run with `--recompute-existing` to refresh scores):
84 | 
85 | ```bash
86 | uv run open-dictionary db-commonness --table dictionary_filtered_en
87 | ```
88 | 
89 | Normalize raw Wiktionary payloads into a slimmer JSONB column without invoking LLMs (writes to `process` by default):
90 | 
91 | _Optionally convert to TOON format (reduces token usage by 30-60% for LLM workflows, stores as TEXT instead of JSONB):_
92 | 
93 | ```bash
94 | uv run open-dictionary pre-process \
95 | --table dictionary_filtered_en \
96 | --source-column data \
97 | --target-column processed \
98 | --toon
99 | ```
100 | 
101 | Remove low-quality rows (zero common score, numeric tokens, legacy tags) directly in PostgreSQL:
102 | 
103 | ```bash
104 | uv run open-dictionary db-clean --table dictionary_filtered_en
105 | ```
106 | 
107 | Generate structured Chinese learner-friendly entries with the LLM `define` workflow (writes JSONB into `new_speak` by default). This streams rows in batches, dispatches up to 50 concurrent LLM calls with exponential-backoff retries, and resumes automatically on restart:
108 | 
109 | ```bash
110 | uv run open-dictionary llm-define \
111 | --table dictionary_filtered_en \
112 | --source-column processed \
113 | --target-column new_speak
114 | ```
115 | 
116 | Provide `LLM_MODEL`, `LLM_KEY`, and `LLM_API` in your environment (e.g., `.env`) before running LLM commands.
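A minimal `.env` sketch (the variable names match `utils/env_loader.py`; the values below are placeholders for your own database, endpoint, key, and model):

```bash
DATABASE_URL=postgresql://user:password@localhost:5432/open_dictionary
LLM_API=https://api.example.com/v1
LLM_KEY=sk-your-key-here
LLM_MODEL=your-model-name
```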
117 | 
118 | Each command streams data in chunks to handle the 10M+ line dataset efficiently.
119 | 
--------------------------------------------------------------------------------
/AGENTS.md:
--------------------------------------------------------------------------------
1 | # Repository Guidelines
2 | 
3 | This is a full tool set for building an open dictionary, based on Wiktionary data.
4 | 
5 | ## Project Structure & Module Organization
6 | 
7 | - Core logic lives in `src/open_dictionary`. The CLI entry point defined in `pyproject.toml` resolves to `open_dictionary:main`, which dispatches into `src/open_dictionary/cli.py`; keep any new commands registered there while delegating business logic to feature modules.
8 | - Data access helpers sit under `src/open_dictionary/db` (for example `access.py`) and should remain focused on PostgreSQL streaming semantics.
9 | - Wiktionary ingestion utilities are split by concern under `src/open_dictionary/wikitionary/`: `downloader.py`, `extract.py`, `transform.py` (streaming COPY + table helpers), `pipeline.py` (orchestration), `filter.py` (language table materialization), and `progress.py` (shared progress reporters).
10 | - LLM-facing enrichments live in `src/open_dictionary/llm`, while cross-cutting utilities (environment loading, helpers) belong in `src/open_dictionary/utils`.
11 | - Runtime artifacts such as dumps or extracted JSONL files are expected in a local `data/` directory (not tracked); scripts should accept paths rather than hard-code locations.
12 | 
13 | ## Build, Test, and Development Commands
14 | 
15 | - `uv sync` installs all dependencies declared in `pyproject.toml`.
16 | - `uv run open-dictionary download --output data/raw-wiktextract-data.jsonl.gz` streams the upstream Wiktextract snapshot.
17 | - `uv run open-dictionary pipeline --workdir data --table dictionary --column data --truncate` executes download → extract → load → partition in one shot; add `--skip-*` flags for partial runs.
18 | - `uv run open-dictionary filter en zh --table dictionary_all --column data` copies only selected languages into `dictionary_lang_*` tables; pass `all` as the first positional argument to materialize every language code.
19 | - `uv run open-dictionary db-clean --table dictionary_en` removes rows that fail quality heuristics (numeric tokens, zero scores, legacy tags, etc.).
20 | - `uv run open-dictionary db-commonness --table dictionary_en` streams wordfreq-derived `common_score` values into the target table (add `--recompute-existing` to refresh populated rows).
21 | - `uv run python -m pytest` is the expected test runner once suites are added; for now, rely on targeted CLI runs against a disposable PostgreSQL database.
22 | 
23 | ## Coding Style & Naming Conventions
24 | 
25 | - Target Python 3.12+, four-space indentation, and `snake_case` for functions, modules, and CLI subcommand names.
26 | - Prefer type hints and `pydantic` models for structured payloads (see `llm/define.py`), and keep side effects behind small helpers for easier testing.
27 | - Environment keys (`DATABASE_URL`, `LLM_KEY`, `LLM_API`, `LLM_MODEL`) are loaded through `utils.env_loader`; never fetch them ad hoc inside command bodies.
28 | 
29 | ## Testing Guidelines
30 | 
31 | - Focus on integration tests that exercise the CLI contract end-to-end with a seeded PostgreSQL container; isolate I/O with temp directories under `tmp_path`.
32 | - Name test modules `test_<feature>.py` and colocate fixtures under `tests/conftest.py` once the suite exists.
33 | - Validate large operations by asserting row counts, emitted table names, and LLM scaffolding errors rather than snapshotting full JSON. 34 | 35 | ## Commit & Pull Request Guidelines 36 | 37 | - Follow the existing history: concise imperative subject lines (e.g. “Add DB iterator”), optional body wrapped at ~72 chars. 38 | - Reference issue IDs in the body when available and note required migrations or manual steps. 39 | - PRs should describe the dataset used for validation, include command transcripts (`uv run …`) for any pipelines executed, and, when UI/CLI behavior changes, attach representative logs or screenshots. 40 | 41 | ## Environment & Security Tips 42 | 43 | - Keep `.env` files local; share example variables via documentation rather than version control. 44 | - Never commit API keys or database URLs. If sensitive configuration is required in CI, use repository secrets and reference them through environment loader helpers. 45 | -------------------------------------------------------------------------------- /.serena/project.yml: -------------------------------------------------------------------------------- 1 | # language of the project (csharp, python, rust, java, typescript, go, cpp, or ruby) 2 | # * For C, use cpp 3 | # * For JavaScript, use typescript 4 | # Special requirements: 5 | # * csharp: Requires the presence of a .sln file in the project folder. 6 | language: python 7 | 8 | # the encoding used by text files in the project 9 | # For a list of possible encodings, see https://docs.python.org/3.11/library/codecs.html#standard-encodings 10 | encoding: "utf-8" 11 | 12 | # whether to use the project's gitignore file to ignore files 13 | # Added on 2025-04-07 14 | ignore_all_files_in_gitignore: true 15 | # list of additional paths to ignore 16 | # same syntax as gitignore, so you can use * and ** 17 | # Was previously called `ignored_dirs`, please update your config if you are using that. 18 | # Added (renamed) on 2025-04-07 19 | ignored_paths: [] 20 | 21 | # whether the project is in read-only mode 22 | # If set to true, all editing tools will be disabled and attempts to use them will result in an error 23 | # Added on 2025-04-18 24 | read_only: false 25 | 26 | # list of tool names to exclude. We recommend not excluding any tools, see the readme for more details. 27 | # Below is the complete list of tools for convenience. 28 | # To make sure you have the latest list of tools, and to view their descriptions, 29 | # execute `uv run scripts/print_tool_overview.py`. 30 | # 31 | # * `activate_project`: Activates a project by name. 32 | # * `check_onboarding_performed`: Checks whether project onboarding was already performed. 33 | # * `create_text_file`: Creates/overwrites a file in the project directory. 34 | # * `delete_lines`: Deletes a range of lines within a file. 35 | # * `delete_memory`: Deletes a memory from Serena's project-specific memory store. 36 | # * `execute_shell_command`: Executes a shell command. 37 | # * `find_referencing_code_snippets`: Finds code snippets in which the symbol at the given location is referenced. 38 | # * `find_referencing_symbols`: Finds symbols that reference the symbol at the given location (optionally filtered by type). 39 | # * `find_symbol`: Performs a global (or local) search for symbols with/containing a given name/substring (optionally filtered by type). 40 | # * `get_current_config`: Prints the current configuration of the agent, including the active and available projects, tools, contexts, and modes. 
41 | # * `get_symbols_overview`: Gets an overview of the top-level symbols defined in a given file. 42 | # * `initial_instructions`: Gets the initial instructions for the current project. 43 | # Should only be used in settings where the system prompt cannot be set, 44 | # e.g. in clients you have no control over, like Claude Desktop. 45 | # * `insert_after_symbol`: Inserts content after the end of the definition of a given symbol. 46 | # * `insert_at_line`: Inserts content at a given line in a file. 47 | # * `insert_before_symbol`: Inserts content before the beginning of the definition of a given symbol. 48 | # * `list_dir`: Lists files and directories in the given directory (optionally with recursion). 49 | # * `list_memories`: Lists memories in Serena's project-specific memory store. 50 | # * `onboarding`: Performs onboarding (identifying the project structure and essential tasks, e.g. for testing or building). 51 | # * `prepare_for_new_conversation`: Provides instructions for preparing for a new conversation (in order to continue with the necessary context). 52 | # * `read_file`: Reads a file within the project directory. 53 | # * `read_memory`: Reads the memory with the given name from Serena's project-specific memory store. 54 | # * `remove_project`: Removes a project from the Serena configuration. 55 | # * `replace_lines`: Replaces a range of lines within a file with new content. 56 | # * `replace_symbol_body`: Replaces the full definition of a symbol. 57 | # * `restart_language_server`: Restarts the language server, may be necessary when edits not through Serena happen. 58 | # * `search_for_pattern`: Performs a search for a pattern in the project. 59 | # * `summarize_changes`: Provides instructions for summarizing the changes made to the codebase. 60 | # * `switch_modes`: Activates modes by providing a list of their names 61 | # * `think_about_collected_information`: Thinking tool for pondering the completeness of collected information. 62 | # * `think_about_task_adherence`: Thinking tool for determining whether the agent is still on track with the current task. 63 | # * `think_about_whether_you_are_done`: Thinking tool for determining whether the task is truly completed. 64 | # * `write_memory`: Writes a named memory (for future reference) to Serena's project-specific memory store. 65 | excluded_tools: [] 66 | 67 | # initial prompt for the project. It will always be given to the LLM upon activating the project 68 | # (contrary to the memories, which are loaded on demand). 
69 | initial_prompt: "" 70 | 71 | project_name: "open-english-dictionary" 72 | -------------------------------------------------------------------------------- /src/open_dictionary/wikitionary/progress.py: -------------------------------------------------------------------------------- 1 | """Progress helpers for long-running Wiktionary data operations.""" 2 | 3 | from __future__ import annotations 4 | 5 | import sys 6 | import time 7 | 8 | 9 | class ByteProgressPrinter: 10 | """Emit coarse progress updates for byte-oriented streaming tasks.""" 11 | 12 | def __init__( 13 | self, 14 | label: str, 15 | total_bytes: int, 16 | *, 17 | min_bytes_step: int = 64 * 1024 * 1024, 18 | min_time_step: float = 5.0, 19 | ) -> None: 20 | self.label = label 21 | self.total_bytes = max(total_bytes, 0) 22 | self.min_bytes_step = max(min_bytes_step, 1) 23 | self.min_time_step = max(min_time_step, 0.0) 24 | self._last_report_time = time.monotonic() 25 | self._last_report_bytes = 0 26 | 27 | def report(self, processed_bytes: int, *, force: bool = False) -> None: 28 | """Report the number of processed bytes if thresholds are met.""" 29 | 30 | if processed_bytes < 0: # Defensive guard for unexpected inputs 31 | return 32 | 33 | now = time.monotonic() 34 | bytes_increment = processed_bytes - self._last_report_bytes 35 | 36 | if not force and processed_bytes < self.total_bytes: 37 | if ( 38 | bytes_increment < self.min_bytes_step 39 | and (now - self._last_report_time) < self.min_time_step 40 | ): 41 | return 42 | elif not force and bytes_increment <= 0: 43 | return 44 | 45 | percent_text = "" 46 | if self.total_bytes: 47 | percent = min(100.0, (processed_bytes / self.total_bytes) * 100) 48 | percent_text = f"{percent:5.1f}% | " 49 | 50 | gib_processed = processed_bytes / (1024**3) 51 | message = f"{self.label}: {percent_text}{gib_processed:.2f} GiB" 52 | print(message, file=sys.stderr, flush=True) 53 | 54 | self._last_report_time = now 55 | self._last_report_bytes = processed_bytes 56 | 57 | def finalize(self, processed_bytes: int) -> None: 58 | """Ensure a final progress update is displayed when finished.""" 59 | 60 | if processed_bytes == 0: 61 | return 62 | 63 | self.report(processed_bytes, force=True) 64 | 65 | 66 | class StreamingProgress: 67 | """Progress reporter for streaming row + byte oriented workloads.""" 68 | 69 | def __init__( 70 | self, 71 | total_bytes: int, 72 | *, 73 | label: str = "Progress", 74 | min_bytes_step: int = 64 * 1024 * 1024, 75 | min_rows_step: int = 50_000, 76 | min_time_step: float = 5.0, 77 | ) -> None: 78 | self.total_bytes = max(total_bytes, 0) 79 | self.label = label 80 | self.min_bytes_step = max(min_bytes_step, 1) 81 | self.min_rows_step = max(min_rows_step, 1) 82 | self.min_time_step = max(min_time_step, 0.0) 83 | self._last_report_time = time.monotonic() 84 | self._last_report_bytes = 0 85 | self._last_report_rows = 0 86 | 87 | def report(self, rows: int, bytes_processed: int, *, force: bool = False) -> None: 88 | """Emit a progress message when thresholds are crossed.""" 89 | 90 | if rows < 0 or bytes_processed < 0: 91 | return 92 | 93 | now = time.monotonic() 94 | bytes_increment = bytes_processed - self._last_report_bytes 95 | rows_increment = rows - self._last_report_rows 96 | 97 | if not force: 98 | if bytes_processed < self.total_bytes: 99 | if ( 100 | bytes_increment < self.min_bytes_step 101 | and rows_increment < self.min_rows_step 102 | and (now - self._last_report_time) < self.min_time_step 103 | ): 104 | return 105 | else: 106 | if bytes_increment <= 0 and 
rows_increment <= 0: 107 | return 108 | 109 | percent_text = "" 110 | if self.total_bytes: 111 | percent = min(100.0, (bytes_processed / self.total_bytes) * 100) 112 | percent_text = f"{percent:5.1f}% | " 113 | 114 | gib_processed = bytes_processed / (1024**3) 115 | rate = 0.0 116 | elapsed = now - self._last_report_time 117 | if elapsed > 0 and rows_increment > 0: 118 | rate = rows_increment / elapsed 119 | 120 | message = ( 121 | f"{self.label}: {percent_text}{rows:,} rows | " 122 | f"{gib_processed:.2f} GiB read | {rate:,.0f} rows/s" 123 | ) 124 | print(message, file=sys.stderr, flush=True) 125 | 126 | self._last_report_time = now 127 | self._last_report_bytes = bytes_processed 128 | self._last_report_rows = rows 129 | 130 | def finalize(self, rows: int, bytes_processed: int) -> None: 131 | """Ensure a final progress message is emitted.""" 132 | 133 | if rows == 0 and bytes_processed == 0: 134 | return 135 | 136 | self.report(rows, bytes_processed, force=True) 137 | 138 | 139 | __all__ = ["ByteProgressPrinter", "StreamingProgress"] 140 | -------------------------------------------------------------------------------- /src/open_dictionary/llm/define.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel 2 | from typing import Optional 3 | import json 4 | from open_dictionary.llm.llm_client import get_chat_response 5 | 6 | 7 | instruction = """ 8 | 你是一位顶级的词典编纂专家、语言学家,以及精通中英双语的教育家。你的任务是读取并解析一段来自 Wiktionary 的、结构复杂的数据,然后将其转化为一份清晰、准确、对中文学习者极其友好的结构化中文词典条目。 9 | 10 | **核心任务:** 11 | 根据下方提供的输入JSON,严格按照【输出格式定义】生成一个唯一的、完整的 JSON 对象作为最终结果。不要输出任何解释、注释或无关内容。 12 | 13 | **【重要:JSON 格式规范】** 14 | 1. 输出必须是严格、合法的 JSON 格式。 15 | 2. **所有字符串值中的双引号 (") 必须使用反斜杠转义为 \\"**。 16 | 3. **所有字符串值中的反斜杠 (\\) 必须转义为 \\**。 17 | 18 | --- 19 | 20 | **【输出格式定义】** 21 | 22 | 请生成一个包含以下键 (key) 的 JSON 对象: 23 | 24 | 1. `word`: (string) 英文单词本身。 25 | 2. `pos`: (string) 词性。 26 | 3. `pronunciations`: (object) 一个包含发音方式和音频文件的对象: 27 | * `ipa`: (string) 国际音标。直接从输入JSON的 `sounds` 数组中提取 `ipa` 字段的值。 28 | * `natural_phonics`: (string) 自然拼读。根据单词的拼写和音节,生成一个对初学者友好的、用连字符分隔的拼读提示。例如 "philosophy" -> "phi-lo-so-phy"。 29 | * `ogg_url`: (string) OGG音频文件链接。从输入JSON的 `sounds` 数组中查找并提取 `ogg_url` 字段的值。如果不存在,则返回 `null`。 30 | 4. `forms`: (array of strings) **词形变化**。遍历输入JSON的 `forms` 数组,将每个词形 (`form`) 及其标签 (`tags`) 组合成一个易于理解的中文描述字符串。例如:`"hits (第三人称单数现在时)"`。 31 | 5. `concise_definition`: (string) **简明释义**。在分析完所有词义后,用一句话高度概括该单词最核心、最常用的1-2个中文意思。 32 | 6. `detailed_definitions`: (array) **详细释义数组**。遍历输入JSON中 `senses` 数组的每一个对象,为每个词义生成一个包含以下内容的对象: 33 | * `definition_en`: (string) **英文原义**。从输入JSON的 `glosses` 数组中,提取出**最具体、最完整**的那个英文释义。如果数组中包含一个概括性标题和一个具体释义,请**选择那个具体的释义**。**注意:如果原文包含引号,必须转义。** 34 | * `definition_cn`: (string) **中文阐释**。此项是核心,请遵循以下原则: 35 | * **解释而非翻译**:用**通俗、自然、易懂**的中文来解释 `definition_en` 的核心含义。 36 | * **捕捉精髓**:要抓住该词义的**使用场景、语气(如正式、口语、俚语)和细微差别**。 37 | * **避免直译**:请**避免生硬的、字典式的直译**。目标是让中文母语者能瞬间理解这个词义的真正用法。 38 | * **转义规则**:如果中文阐释中需要使用引号(如「」、""),请使用中文引号,避免使用英文双引号。如果必须使用英文双引号,务必转义。 39 | * `example`: (object) **为该词义创作一个全新的例句**,包含: 40 | * `en`: (string) 一个**简单、现代、生活化**的英文例句,清晰地展示当前词义的用法。**绝对不要使用**输入JSON中提供的复杂或古老的例句。**如果例句中包含引号,必须转义。** 41 | * `cn`: (string) 上述英文例句的对应中文翻译。**如果翻译中包含英文引号,必须转义。** 42 | 7. `derived`: (array of objects) **派生词**。遍历输入JSON的 `derived` 数组,为其中的**每个单词**生成一个包含以下内容的对象: 43 | * `word`: (string) 派生词本身。 44 | * `definition_cn`: (string) 对该派生词的**简明中文定义**。 45 | 8. 
`etymology`: (string) **词源故事**。读取输入JSON中的 `etymology_text` 字段,将其内容翻译并**转述**成一段流畅、易懂的中文。说明其起源语言(如拉丁语、古英语、希腊语)和含义的演变过程,像讲故事一样。**如果词源中包含引号,必须转义。** 46 | 47 | --- 48 | 49 | **【示例】** 50 | 51 | **输入:** 52 | word: quote 53 | pos: verb 54 | forms[2]: 55 | - form: quotes 56 | tags[2]: present,singular,third-person 57 | - form: quoted 58 | tags[1]: past 59 | senses[1]: 60 | - 61 | glosses[1]: "To repeat or copy out (words from a text or speech written or spoken by another person)." 62 | sounds[1,]{ipa,ogg_url}: 63 | /kwəʊt/,url 64 | derived[1,]{word}: 65 | quotation 66 | etymology_text: "From Medieval Latin quotare meaning \"to mark with numbers\"." 67 | 68 | **你的JSON输出:** 69 | { 70 | "word": "quote", 71 | "pos": "verb", 72 | "pronunciations": { 73 | "ipa": "/kwəʊt/", 74 | "natural_phonics": "quote", 75 | "ogg_url": "url" 76 | }, 77 | "forms": [ 78 | "quotes (第三人称单数现在时)", 79 | "quoted (过去式)" 80 | ], 81 | "concise_definition": "引用,引述。", 82 | "detailed_definitions": [ 83 | { 84 | "definition_en": "To repeat or copy out (words from a text or speech written or spoken by another person).", 85 | "definition_cn": "指重复或摘录他人的话语或文字,通常用于写作、演讲中引用权威来源或他人观点。", 86 | "example": { 87 | "en": "She quoted Shakespeare by saying \"To be or not to be\".", 88 | "cn": "她引用了莎士比亚的话说「生存还是毁灭」。" 89 | } 90 | } 91 | ], 92 | "derived": [ 93 | { 94 | "word": "quotation", 95 | "definition_cn": "引文,引语;报价。" 96 | } 97 | ], 98 | "etymology": "该词源自中世纪拉丁语 quotare,意为「标记数字」。" 99 | } 100 | 101 | """ 102 | 103 | class Example(BaseModel): 104 | en: str 105 | cn: str 106 | 107 | 108 | class DetailedDefinition(BaseModel): 109 | definition_en: str 110 | definition_cn: str 111 | example: Example 112 | 113 | 114 | class DerivedWord(BaseModel): 115 | word: str 116 | definition_cn: str 117 | 118 | 119 | class Pronunciations(BaseModel): 120 | ipa: str 121 | natural_phonics: str 122 | ogg_url: Optional[str] = None 123 | 124 | 125 | class Definition(BaseModel): 126 | word: str 127 | pos: str 128 | pronunciations: Pronunciations 129 | forms: list[str] 130 | concise_definition: str 131 | detailed_definitions: list[DetailedDefinition] 132 | derived: list[DerivedWord] 133 | etymology: str 134 | 135 | 136 | def define(input_data: str) -> Definition: 137 | """Generate a structured dictionary definition from Wiktionary JSON/Toon data. 138 | 139 | Args: 140 | input_data: String containing Wiktionary data in JSON or Toon format 141 | 142 | Returns: 143 | Definition object with structured dictionary entry 144 | """ 145 | response = get_chat_response(instruction, input_data) 146 | 147 | try: 148 | return Definition.model_validate_json(response) 149 | except Exception as exc: 150 | # Attach the raw response to the exception for error logging 151 | exc.llm_response = response # type: ignore 152 | raise 153 | -------------------------------------------------------------------------------- /src/open_dictionary/db/sqlite_manager.py: -------------------------------------------------------------------------------- 1 | import sqlite3 2 | from contextlib import contextmanager 3 | from pathlib import Path 4 | from typing import Any, Iterator 5 | import json 6 | 7 | 8 | class SQLiteManager: 9 | """Manager for SQLite database with JSON1 support for storing definitions.""" 10 | 11 | def __init__(self, db_path: str = "data/dictionary.sqlite"): 12 | """Initialize SQLite manager. 
13 | 14 | Args: 15 | db_path: Path to SQLite database file 16 | """ 17 | path_str = str(db_path) 18 | self._use_memory_db = path_str == ":memory:" 19 | self._memory_connection: sqlite3.Connection | None = None 20 | 21 | if self._use_memory_db: 22 | # Keep a persistent connection open for in-memory databases so the schema 23 | # and rows survive multiple operations. 24 | self.db_path = path_str 25 | self._memory_connection = sqlite3.connect(path_str, check_same_thread=False) 26 | else: 27 | self.db_path = Path(path_str) 28 | self.db_path.parent.mkdir(parents=True, exist_ok=True) 29 | 30 | self._init_db() 31 | 32 | def _init_db(self): 33 | """Initialize database schema.""" 34 | with self._connection() as conn: 35 | conn.execute(""" 36 | CREATE TABLE IF NOT EXISTS definitions ( 37 | word TEXT PRIMARY KEY, 38 | definition JSON NOT NULL 39 | ) 40 | """) 41 | conn.commit() 42 | 43 | @contextmanager 44 | def _connection(self) -> Iterator[sqlite3.Connection]: 45 | """Yield a SQLite connection, keeping in-memory DBs alive.""" 46 | if self._use_memory_db: 47 | assert self._memory_connection is not None 48 | yield self._memory_connection 49 | else: 50 | conn = sqlite3.connect(self.db_path) 51 | try: 52 | yield conn 53 | finally: 54 | conn.close() 55 | 56 | def insert_definition(self, word: str, definition: dict[str, Any]): 57 | """Insert a single definition into the database. 58 | 59 | Args: 60 | word: The word being defined 61 | definition: The definition data as a dictionary 62 | """ 63 | with self._connection() as conn: 64 | conn.execute( 65 | "INSERT OR REPLACE INTO definitions (word, definition) VALUES (?, ?)", 66 | (word, json.dumps(definition, ensure_ascii=False)) 67 | ) 68 | conn.commit() 69 | 70 | def insert_definitions_batch(self, definitions: list[tuple[str, dict[str, Any]]]): 71 | """Insert multiple definitions in a batch. 72 | 73 | Args: 74 | definitions: List of (word, definition_dict) tuples 75 | """ 76 | with self._connection() as conn: 77 | conn.executemany( 78 | "INSERT OR REPLACE INTO definitions (word, definition) VALUES (?, ?)", 79 | [(word, json.dumps(defn, ensure_ascii=False)) for word, defn in definitions] 80 | ) 81 | conn.commit() 82 | 83 | def get_definition(self, word: str) -> dict[str, Any] | None: 84 | """Get definition for a word. 85 | 86 | Args: 87 | word: The word to look up 88 | 89 | Returns: 90 | Definition dictionary or None if not found 91 | """ 92 | with self._connection() as conn: 93 | cursor = conn.execute( 94 | "SELECT definition FROM definitions WHERE word = ?", 95 | (word,) 96 | ) 97 | row = cursor.fetchone() 98 | return json.loads(row[0]) if row else None 99 | 100 | def count_definitions(self) -> int: 101 | """Count total definitions in database. 
102 | 103 | Returns: 104 | Number of definitions 105 | """ 106 | with self._connection() as conn: 107 | cursor = conn.execute("SELECT COUNT(*) FROM definitions") 108 | return cursor.fetchone()[0] 109 | 110 | def close(self) -> None: 111 | """Close any persistent SQLite connections.""" 112 | if self._memory_connection is not None: 113 | self._memory_connection.close() 114 | self._memory_connection = None 115 | 116 | def __del__(self): # pragma: no cover - best effort cleanup 117 | try: 118 | self.close() 119 | except Exception: 120 | pass 121 | 122 | 123 | def test_sqlite_manager(): 124 | """Test function to verify SQLite manager works correctly.""" 125 | import tempfile 126 | import os 127 | 128 | # Create a temporary database 129 | with tempfile.NamedTemporaryFile(delete=False, suffix='.sqlite') as f: 130 | test_db = f.name 131 | 132 | try: 133 | print(f"Testing with database: {test_db}") 134 | manager = SQLiteManager(test_db) 135 | 136 | # Test single insert 137 | test_def = {"word": "test", "pos": "noun", "definition": "A trial or examination"} 138 | manager.insert_definition("test", test_def) 139 | print(f"After single insert: {manager.count_definitions()} definitions") 140 | 141 | # Test batch insert 142 | batch = [ 143 | ("word1", {"word": "word1", "meaning": "first"}), 144 | ("word2", {"word": "word2", "meaning": "second"}), 145 | ("word3", {"word": "word3", "meaning": "third"}), 146 | ] 147 | manager.insert_definitions_batch(batch) 148 | print(f"After batch insert: {manager.count_definitions()} definitions") 149 | 150 | # Test retrieval 151 | retrieved = manager.get_definition("test") 152 | print(f"Retrieved definition: {retrieved}") 153 | 154 | # Test in-memory database support 155 | memory_manager = SQLiteManager(":memory:") 156 | memory_manager.insert_definition("memory_word", {"word": "memory_word"}) 157 | print(f"In-memory count: {memory_manager.count_definitions()} definitions") 158 | print(f"In-memory retrieval: {memory_manager.get_definition('memory_word')}") 159 | memory_manager.close() 160 | 161 | print("All tests passed!") 162 | finally: 163 | # Clean up 164 | if os.path.exists(test_db): 165 | os.unlink(test_db) 166 | 167 | 168 | if __name__ == "__main__": 169 | test_sqlite_manager() 170 | -------------------------------------------------------------------------------- /src/open_dictionary/db/cleaner.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import time 4 | from typing import Any, Sequence 5 | 6 | from psycopg import sql 7 | from psycopg.cursor import Cursor 8 | 9 | # 假设这个模块存在并且可以正确配置数据库连接 10 | # 注意:您需要确保 open_dictionary.db.access 模块在您的环境中可用 11 | from open_dictionary.db.access import DatabaseAccess 12 | 13 | FETCH_BATCH_SIZE = 5000 14 | DELETE_BATCH_SIZE = 5000 15 | PROGRESS_EVERY_ROWS = 20_000 16 | PROGRESS_EVERY_SECONDS = 30.0 17 | 18 | 19 | def clean_dictionary_data( 20 | table_name: str, 21 | *, 22 | fetch_batch_size: int = FETCH_BATCH_SIZE, 23 | delete_batch_size: int = DELETE_BATCH_SIZE, 24 | progress_every_rows: int = PROGRESS_EVERY_ROWS, 25 | progress_every_seconds: float = PROGRESS_EVERY_SECONDS, 26 | ) -> None: 27 | """ 28 | 从字典表中删除不符合质量标准的词条行。 29 | 30 | 该函数会删除满足以下任一条件的词条: 31 | 1. `common_score` 精确为零。 32 | 2. 单词本身 (`data`->'word') 包含任何数字 (0-9)。 33 | 3. 单词本身是长度超过1的全大写词 (例如 "UNESCO")。 34 | 4. 单词本身包含特殊字符(允许字母, 撇号, 空格, 连字符)。 35 | 5. 
词条的标签 (`data`->'tags') 包含 "archaic", "obsolete", "dated", "古旧", 或 "废弃"。 36 | """ 37 | 38 | data_access = DatabaseAccess() 39 | processed = 0 40 | deleted = 0 41 | pending_ids: list[int] = [] 42 | start_time = time.monotonic() 43 | 44 | print( 45 | f"[cleaner] starting table={table_name} " 46 | f"fetch_batch={fetch_batch_size} delete_batch={delete_batch_size} " 47 | f"progress_rows={progress_every_rows} progress_seconds={progress_every_seconds}", 48 | flush=True, 49 | ) 50 | 51 | # 构建复杂的 WHERE 子句来一次性筛选所有不合格的词条 52 | # 这种方法比在 Python 中进行判断效率高得多,因为它将过滤工作完全交给了数据库 53 | conditions = [ 54 | # 1. 删除 common_score 为 0 的词条 55 | sql.SQL("common_score = 0"), 56 | 57 | # 2. 删除单词中包含数字的词条 58 | # data->>'word' 从 jsonb 字段 'data' 中以文本形式提取 'word' 的值 59 | # ~ 是 PostgreSQL 的正则表达式匹配操作符 60 | sql.SQL("data->>'word' ~ '[0-9]'"), 61 | 62 | # 3. 删除全是大写的词条(长度大于1,以避免删除 "I", "A" 等) 63 | # 同时检查是否真的包含大写字母,以避免非字母字符串被误判 64 | sql.SQL("LENGTH(data->>'word') > 1 AND data->>'word' = UPPER(data->>'word') AND data->>'word' ~ '[A-Z]'"), 65 | 66 | # 4. 删除包含特殊字符的词条 67 | # 正则表达式 [^a-zA-Z' -] 匹配任何不是字母、撇号、空格或连字符的字符 68 | # 注意在 SQL 字符串中,撇号需要写成 '' 来转义 69 | sql.SQL("data->>'word' ~ '[^a-zA-Z'' -]'"), 70 | 71 | # 5. 删除包含古旧、废弃等标签的词条 72 | # data->'tags' 获取 jsonb 字段 'data' 中的 'tags' 数组 73 | # ?| 操作符检查左边的 jsonb 数组是否包含右边 text 数组中的任何一个元素 74 | sql.SQL("data->'tags' ?| array['archaic', 'obsolete', 'dated']") 75 | ] 76 | 77 | # 使用 OR 将所有条件连接起来,满足任意一个条件即被选中 78 | where_clause = sql.SQL(" OR ").join(conditions) 79 | 80 | with data_access.get_connection() as delete_conn: 81 | with delete_conn.cursor() as cursor: 82 | last_log_time = start_time 83 | 84 | print(f"[cleaner] Executing query with WHERE clause: {where_clause.as_string(cursor)}", flush=True) 85 | 86 | # 使用构建好的 where_clause 来迭代所有需要删除的行 87 | for row in data_access.iterate_table( 88 | table_name, 89 | batch_size=fetch_batch_size, 90 | columns=("id",), 91 | where=where_clause, 92 | order_by=("id",), 93 | ): 94 | row_id = row.get("id") 95 | if row_id is None: 96 | continue 97 | 98 | processed += 1 99 | emit_progress = processed == 1 100 | 101 | pending_ids.append(int(row_id)) 102 | 103 | if len(pending_ids) >= delete_batch_size: 104 | batch_count = _flush_deletions(cursor, table_name, pending_ids) 105 | delete_conn.commit() 106 | deleted += batch_count 107 | pending_ids.clear() 108 | emit_progress = True 109 | 110 | now = time.monotonic() 111 | 112 | if progress_every_rows and processed % progress_every_rows == 0: 113 | emit_progress = True 114 | if progress_every_seconds and (now - last_log_time) >= progress_every_seconds: 115 | emit_progress = True 116 | 117 | if emit_progress: 118 | _report_progress(processed, deleted, start_time) 119 | last_log_time = now 120 | 121 | if pending_ids: 122 | batch_count = _flush_deletions(cursor, table_name, pending_ids) 123 | delete_conn.commit() 124 | deleted += batch_count 125 | pending_ids.clear() 126 | _report_progress(processed, deleted, start_time) 127 | 128 | _report_completion(processed, deleted, start_time) 129 | 130 | 131 | def _flush_deletions( 132 | cursor: Cursor[Any], 133 | table_name: str, 134 | ids: Sequence[int], 135 | ) -> int: 136 | if not ids: 137 | return 0 138 | 139 | values_sql = sql.SQL(", ").join(sql.SQL("(%s::bigint)") for _ in ids) 140 | delete_sql = sql.SQL( 141 | """ 142 | DELETE FROM {table} AS t 143 | USING (VALUES {values}) AS v(id) 144 | WHERE t.id = v.id 145 | """ 146 | ).format( 147 | table=sql.Identifier(table_name), 148 | values=values_sql, 149 | ) 150 | 151 | cursor.execute(delete_sql, ids) 152 | return cursor.rowcount 153 | 
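# Illustrative only: for a two-id batch, _flush_deletions above renders a
# statement of this shape (the table name shown is an example):
#
#   DELETE FROM "dictionary_filtered_en" AS t
#   USING (VALUES (%s::bigint), (%s::bigint)) AS v(id)
#   WHERE t.id = v.id
#
# and executes it with the ids as parameters, so each batch is removed in a
# single round trip instead of one DELETE per row.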
154 | 155 | def _report_progress(processed: int, deleted: int, start_time: float) -> None: 156 | elapsed = max(time.monotonic() - start_time, 1e-6) 157 | processed_rate = processed / elapsed 158 | deleted_rate = deleted / elapsed if deleted else 0.0 159 | print( 160 | f"[cleaner] processed={processed:,} deleted={deleted:,} " 161 | f"elapsed={elapsed:,.1f}s rate={processed_rate:,.0f} rows/s " 162 | f"delete_rate={deleted_rate:,.0f} rows/s", 163 | flush=True, 164 | ) 165 | 166 | 167 | def _report_completion(processed: int, deleted: int, start_time: float) -> None: 168 | elapsed = max(time.monotonic() - start_time, 1e-6) 169 | processed_rate = processed / elapsed if processed else 0.0 170 | deleted_rate = deleted / elapsed if deleted else 0.0 171 | print( 172 | f"[cleaner] completed processed={processed:,} deleted={deleted:,} " 173 | f"elapsed={elapsed:,.1f}s avg_rate={processed_rate:,.0f} rows/s " 174 | f"delete_rate={deleted_rate:,.0f} rows/s", 175 | flush=True, 176 | ) 177 | __all__ = [ 178 | "FETCH_BATCH_SIZE", 179 | "DELETE_BATCH_SIZE", 180 | "PROGRESS_EVERY_ROWS", 181 | "PROGRESS_EVERY_SECONDS", 182 | "clean_dictionary_data", 183 | ] 184 | -------------------------------------------------------------------------------- /src/open_dictionary/db/mark_commonness.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import json 4 | import time 5 | from decimal import Decimal 6 | from functools import lru_cache 7 | from typing import Any, Optional, Sequence, Tuple 8 | 9 | from psycopg import sql 10 | from psycopg.cursor import Cursor 11 | from wordfreq import zipf_frequency 12 | 13 | from open_dictionary.db.access import DatabaseAccess 14 | 15 | FETCH_BATCH_SIZE = 5000 16 | UPDATE_BATCH_SIZE = 5000 17 | PROGRESS_EVERY_ROWS = 20_000 18 | PROGRESS_EVERY_SECONDS = 30.0 19 | 20 | 21 | def enrich_common_score( 22 | table_name: str, 23 | *, 24 | fetch_batch_size: int = FETCH_BATCH_SIZE, 25 | update_batch_size: int = UPDATE_BATCH_SIZE, 26 | progress_every_rows: int = PROGRESS_EVERY_ROWS, 27 | progress_every_seconds: float = PROGRESS_EVERY_SECONDS, 28 | recompute_existing: bool = False, 29 | ) -> None: 30 | """Populate the common_score column on ``table_name`` using wordfreq data. 31 | 32 | The routine streams rows via a server-side cursor to keep memory usage flat, 33 | batches UPDATE statements to stay efficient on very large tables, and skips 34 | rows that were already processed. 
35 | """ 36 | data_access = DatabaseAccess() 37 | 38 | _ensure_common_score_column(data_access, table_name) 39 | 40 | where_clause = None 41 | if not recompute_existing: 42 | where_clause = sql.SQL("{} IS NULL").format(sql.Identifier("common_score")) 43 | 44 | processed = 0 45 | updated = 0 46 | pending_updates: list[tuple[int, Optional[float]]] = [] 47 | start_time = time.monotonic() 48 | 49 | print( 50 | f"[common_score] starting table={table_name} " 51 | f"fetch_batch={fetch_batch_size} update_batch={update_batch_size} " 52 | f"progress_rows={progress_every_rows} progress_seconds={progress_every_seconds} " 53 | f"recompute_existing={recompute_existing}", 54 | flush=True, 55 | ) 56 | 57 | with data_access.get_connection() as update_conn: 58 | with update_conn.cursor() as cursor: 59 | last_log_time = start_time 60 | for row in data_access.iterate_table( 61 | table_name, 62 | batch_size=fetch_batch_size, 63 | columns=( 64 | "id", 65 | "common_score", 66 | ("word", sql.SQL("data->>'word'")), 67 | ), 68 | where=where_clause, 69 | order_by=("id",), 70 | ): 71 | processed += 1 72 | emit_progress = False 73 | 74 | if processed == 1: 75 | emit_progress = True 76 | 77 | update_payload = _build_update_payload(row) 78 | if update_payload is not None: 79 | pending_updates.append(update_payload) 80 | 81 | if len(pending_updates) >= update_batch_size: 82 | batch_count = _flush_updates(cursor, table_name, pending_updates) 83 | update_conn.commit() 84 | updated += batch_count 85 | pending_updates.clear() 86 | emit_progress = True 87 | 88 | now = time.monotonic() 89 | 90 | if progress_every_rows and processed % progress_every_rows == 0: 91 | emit_progress = True 92 | if progress_every_seconds and (now - last_log_time) >= progress_every_seconds: 93 | emit_progress = True 94 | 95 | if emit_progress: 96 | _report_progress(processed, updated, start_time) 97 | last_log_time = now 98 | 99 | if pending_updates: 100 | batch_count = _flush_updates(cursor, table_name, pending_updates) 101 | update_conn.commit() 102 | updated += batch_count 103 | pending_updates.clear() 104 | _report_progress(processed, updated, start_time) 105 | 106 | _report_completion(processed, updated, start_time) 107 | 108 | 109 | def _ensure_common_score_column(data_access: DatabaseAccess, table_name: str) -> None: 110 | with data_access.get_connection() as conn: 111 | with conn.cursor() as cursor: 112 | cursor.execute( 113 | sql.SQL( 114 | """ 115 | ALTER TABLE {table} 116 | ADD COLUMN IF NOT EXISTS common_score DOUBLE PRECISION 117 | """ 118 | ).format(table=sql.Identifier(table_name)) 119 | ) 120 | 121 | 122 | def _build_update_payload(row: dict[str, Any]) -> Tuple[int, Optional[float]] | None: 123 | row_id = row.get("id") 124 | if row_id is None: 125 | return None 126 | 127 | existing = row.get("common_score") 128 | normalized_existing = _to_float(existing) 129 | 130 | word = _extract_word(row) 131 | score = _score_for_word(word) 132 | 133 | if normalized_existing is None and score is None: 134 | return None 135 | 136 | if normalized_existing is not None and score is not None: 137 | if abs(normalized_existing - score) < 1e-9: 138 | return None 139 | 140 | return int(row_id), score 141 | 142 | 143 | def _extract_word(row: dict[str, Any]) -> Optional[str]: 144 | direct_word = row.get("word") 145 | candidate = _normalize_word(direct_word) 146 | if candidate: 147 | return candidate 148 | 149 | data = row.get("data") 150 | if isinstance(data, dict): 151 | candidate = _normalize_word(data.get("word")) 152 | if candidate: 153 | return candidate 
154 | elif isinstance(data, str): 155 | try: 156 | decoded = json.loads(data) 157 | except json.JSONDecodeError: 158 | decoded = None 159 | if isinstance(decoded, dict): 160 | candidate = _normalize_word(decoded.get("word")) 161 | if candidate: 162 | return candidate 163 | 164 | return None 165 | 166 | 167 | def _normalize_word(value: Any) -> Optional[str]: 168 | if not isinstance(value, str): 169 | return None 170 | stripped = value.strip() 171 | if not stripped: 172 | return None 173 | return stripped.lower() 174 | 175 | 176 | def _score_for_word(word: Optional[str]) -> Optional[float]: 177 | if not word: 178 | return None 179 | score = _cached_zipf_frequency(word) 180 | if score <= 0.0: 181 | return 0.0 182 | return score 183 | 184 | 185 | @lru_cache(maxsize=None) 186 | def _cached_zipf_frequency(word: str) -> float: 187 | return float(zipf_frequency(word, "en")) 188 | 189 | 190 | def _flush_updates( 191 | cursor: Cursor[Any], 192 | table_name: str, 193 | payloads: Sequence[tuple[int, Optional[float]]], 194 | ) -> int: 195 | if not payloads: 196 | return 0 197 | values_sql = sql.SQL(", ").join( 198 | sql.SQL("(%s::bigint, %s::double precision)") for _ in payloads 199 | ) 200 | update_sql = sql.SQL( 201 | """ 202 | UPDATE {table} AS t 203 | SET common_score = v.score 204 | FROM (VALUES {values}) AS v(id, score) 205 | WHERE t.id = v.id 206 | """ 207 | ).format( 208 | table=sql.Identifier(table_name), 209 | values=values_sql, 210 | ) 211 | params: list[Any] = [] 212 | for row_id, score in payloads: 213 | params.extend((row_id, score)) 214 | 215 | cursor.execute(update_sql, params) 216 | return len(payloads) 217 | 218 | 219 | def _to_float(value: Any) -> Optional[float]: 220 | if value is None: 221 | return None 222 | if isinstance(value, float): 223 | return value 224 | if isinstance(value, Decimal): 225 | return float(value) 226 | return None 227 | 228 | 229 | def _report_progress(processed: int, updated: int, start_time: float) -> None: 230 | elapsed = max(time.monotonic() - start_time, 1e-6) 231 | rate = processed / elapsed 232 | print( 233 | f"[common_score] processed={processed:,} updated={updated:,} " 234 | f"elapsed={elapsed:,.1f}s rate={rate:,.0f} rows/s", 235 | flush=True, 236 | ) 237 | 238 | 239 | def _report_completion(processed: int, updated: int, start_time: float) -> None: 240 | elapsed = max(time.monotonic() - start_time, 1e-6) 241 | avg_rate = processed / elapsed if processed else 0.0 242 | print( 243 | f"[common_score] completed: processed={processed:,} updated={updated:,} " 244 | f"elapsed={elapsed:,.1f}s avg_rate={avg_rate:,.0f} rows/s", 245 | flush=True, 246 | ) 247 | __all__ = [ 248 | "FETCH_BATCH_SIZE", 249 | "UPDATE_BATCH_SIZE", 250 | "PROGRESS_EVERY_ROWS", 251 | "PROGRESS_EVERY_SECONDS", 252 | "enrich_common_score", 253 | ] 254 | -------------------------------------------------------------------------------- /src/open_dictionary/wikitionary/transform.py: -------------------------------------------------------------------------------- 1 | """Utilities for streaming Wiktionary JSONL data into PostgreSQL.""" 2 | 3 | from __future__ import annotations 4 | 5 | import json 6 | import re 7 | import sys 8 | from pathlib import Path 9 | from typing import Iterator, Sequence 10 | 11 | import psycopg 12 | from psycopg import sql 13 | 14 | from .progress import StreamingProgress 15 | 16 | 17 | UTF8_BOM = b"\xef\xbb\xbf" 18 | 19 | 20 | class JsonlProcessingError(Exception): 21 | """Raised when the JSONL input contains invalid JSON content.""" 22 | 23 | 24 | def 
iter_json_lines(file_path: Path) -> Iterator[tuple[str, int]]: 25 | """Yield JSON rows and byte offsets from a JSONL file, skipping blank lines.""" 26 | 27 | path = Path(file_path) 28 | if not path.is_file(): 29 | raise FileNotFoundError(f"No JSONL file found at {path}") 30 | 31 | with path.open("rb", buffering=1024 * 1024) as handle: 32 | for line_number, raw_line in enumerate(handle, start=1): 33 | if not raw_line.strip(): 34 | continue 35 | 36 | if line_number == 1 and raw_line.startswith(UTF8_BOM): 37 | raw_line = raw_line[len(UTF8_BOM) :] 38 | 39 | json_bytes = raw_line.rstrip(b"\r\n") 40 | if not json_bytes: 41 | continue 42 | 43 | try: 44 | json_text = json_bytes.decode("utf-8") 45 | except UnicodeDecodeError as exc: # pragma: no cover - defensive 46 | message = f"Invalid UTF-8 sequence on line {line_number}: {exc!s}" 47 | raise JsonlProcessingError(message) from exc 48 | 49 | try: 50 | json.loads(json_text) 51 | except json.JSONDecodeError as exc: # pragma: no cover - defensive 52 | message = ( 53 | f"Invalid JSON on line {line_number}: {exc.msg} (column {exc.colno})" 54 | ) 55 | raise JsonlProcessingError(message) from exc 56 | 57 | bytes_read = handle.tell() 58 | yield json_text, bytes_read 59 | def _identifier_from_dotted(qualified_name: str) -> sql.Identifier: 60 | """Return a psycopg identifier from a dotted path like ``schema.table``.""" 61 | 62 | parts = [segment.strip() for segment in qualified_name.split(".") if segment.strip()] 63 | if not parts: 64 | raise ValueError("Identifier name cannot be empty") 65 | return sql.Identifier(*parts) 66 | 67 | 68 | def _ensure_table_structure( 69 | cursor: psycopg.Cursor, 70 | table_identifier: sql.Identifier, 71 | column_identifier: sql.Identifier, 72 | ) -> None: 73 | """Create the destination table if missing.""" 74 | 75 | create_sql = sql.SQL( 76 | """ 77 | CREATE TABLE IF NOT EXISTS {} ( 78 | id BIGSERIAL PRIMARY KEY, 79 | {} JSONB NOT NULL 80 | ) 81 | """ 82 | ).format(table_identifier, column_identifier) 83 | 84 | cursor.execute(create_sql) 85 | 86 | 87 | def _sanitize_language_code(code: str) -> str: 88 | safe = re.sub(r"[^0-9A-Za-z_]+", "_", code).strip("_") 89 | return safe.lower() 90 | 91 | 92 | def partition_dictionary_by_language( 93 | conninfo: str, 94 | *, 95 | source_table: str, 96 | column_name: str, 97 | lang_field: str = "lang_code", 98 | table_prefix: str = "dictionary_lang", 99 | target_schema: str | None = None, 100 | drop_existing: bool = False, 101 | languages: Sequence[str] | None = None, 102 | ) -> list[str]: 103 | """Split rows in ``source_table`` into per-language tables based on ``lang_field``.""" 104 | 105 | created_tables: list[str] = [] 106 | table_identifier = _identifier_from_dotted(source_table) 107 | column_identifier = sql.Identifier(column_name) 108 | 109 | with psycopg.connect(conninfo) as connection: 110 | with connection.cursor() as cursor: 111 | if languages: 112 | language_codes = [code for code in dict.fromkeys(languages) if code] 113 | else: 114 | select_distinct = sql.SQL( 115 | """ 116 | SELECT DISTINCT {column}->>%s AS lang_code 117 | FROM {table} 118 | WHERE {column} ? 
%s 119 | AND {column}->>%s IS NOT NULL 120 | AND {column}->>%s <> '' 121 | ORDER BY lang_code 122 | """ 123 | ).format(column=column_identifier, table=table_identifier) 124 | 125 | cursor.execute(select_distinct, (lang_field, lang_field, lang_field, lang_field)) 126 | language_codes = [row[0] for row in cursor.fetchall() if row and row[0]] 127 | 128 | if not language_codes: 129 | print( 130 | "No language codes found; skipping partition step.", 131 | file=sys.stderr, 132 | ) 133 | return created_tables 134 | 135 | total_languages = len(language_codes) 136 | print( 137 | f"Partitioning {total_languages} language set(s) from {source_table}.{column_name}...", 138 | file=sys.stderr, 139 | ) 140 | 141 | seen_tables: set[tuple[str | None, str]] = set() 142 | for idx, code in enumerate(language_codes, start=1): 143 | prefix = f"[{idx}/{total_languages}] " 144 | safe_code = _sanitize_language_code(code) 145 | if not safe_code: 146 | print( 147 | prefix 148 | + f"Skipping language code '{code}' because it cannot form a valid table name.", 149 | file=sys.stderr, 150 | ) 151 | continue 152 | 153 | table_name = f"{table_prefix}_{safe_code}" 154 | if target_schema: 155 | table_key = (target_schema, table_name) 156 | target_identifier = sql.Identifier(target_schema, table_name) 157 | display_name = f"{target_schema}.{table_name}" 158 | else: 159 | table_key = (None, table_name) 160 | target_identifier = sql.Identifier(table_name) 161 | display_name = table_name 162 | 163 | if table_key in seen_tables: 164 | print( 165 | prefix 166 | + f"Skipping language code '{code}' because it maps to an existing table name {display_name}.", 167 | file=sys.stderr, 168 | ) 169 | continue 170 | seen_tables.add(table_key) 171 | 172 | if drop_existing: 173 | drop_sql = sql.SQL("DROP TABLE IF EXISTS {}").format(target_identifier) 174 | cursor.execute(drop_sql) 175 | connection.commit() 176 | 177 | create_sql = sql.SQL( 178 | """ 179 | CREATE TABLE IF NOT EXISTS {} ( 180 | id BIGINT PRIMARY KEY, 181 | {} JSONB NOT NULL 182 | ) 183 | """ 184 | ).format(target_identifier, column_identifier) 185 | cursor.execute(create_sql) 186 | 187 | insert_sql = sql.SQL( 188 | """ 189 | INSERT INTO {target} (id, {column}) 190 | SELECT id, {column} 191 | FROM {source} 192 | WHERE {column}->>%s = %s 193 | ON CONFLICT (id) DO NOTHING 194 | """ 195 | ).format( 196 | target=target_identifier, 197 | column=column_identifier, 198 | source=table_identifier, 199 | ) 200 | 201 | cursor.execute(insert_sql, (lang_field, code)) 202 | connection.commit() 203 | 204 | inserted = cursor.rowcount if cursor.rowcount != -1 else None 205 | inserted_text = f" ({inserted} rows)" if inserted is not None else "" 206 | print( 207 | f"{prefix}Partitioned '{code}' -> {display_name}{inserted_text}", 208 | file=sys.stderr, 209 | ) 210 | created_tables.append(display_name) 211 | 212 | return created_tables 213 | 214 | 215 | def copy_jsonl_to_postgres( 216 | jsonl_path: Path, 217 | conninfo: str, 218 | table_name: str, 219 | column_name: str, 220 | truncate: bool = False, 221 | ) -> int: 222 | """Stream JSON rows from ``jsonl_path`` into ``table_name.column_name``. 223 | 224 | Returns the number of rows copied. 
225 | """ 226 | 227 | table_identifier = _identifier_from_dotted(table_name) 228 | if not column_name.strip(): 229 | raise ValueError("Column name cannot be empty") 230 | 231 | column_identifier = sql.Identifier(column_name) 232 | 233 | rows_written = 0 234 | total_bytes = jsonl_path.stat().st_size 235 | progress = StreamingProgress(total_bytes, label=f"COPY {table_name}") 236 | latest_bytes_processed = 0 237 | 238 | with psycopg.connect(conninfo) as connection: 239 | with connection.cursor() as cursor: 240 | _ensure_table_structure(cursor, table_identifier, column_identifier) 241 | 242 | if truncate: 243 | cursor.execute(sql.SQL("TRUNCATE TABLE {}").format(table_identifier)) 244 | 245 | copy_sql = sql.SQL("COPY {} ({}) FROM STDIN WITH (FORMAT text)").format( 246 | table_identifier, 247 | column_identifier, 248 | ) 249 | copy_command = copy_sql.as_string(connection) 250 | 251 | with cursor.copy(copy_command) as copy: # type: ignore[arg-type] 252 | for json_text, bytes_processed in iter_json_lines(jsonl_path): 253 | copy.write_row((json_text,)) 254 | rows_written += 1 255 | latest_bytes_processed = bytes_processed 256 | progress.report(rows_written, latest_bytes_processed) 257 | 258 | progress.finalize(rows_written, latest_bytes_processed) 259 | 260 | return rows_written 261 | 262 | __all__ = [ 263 | "JsonlProcessingError", 264 | "iter_json_lines", 265 | "partition_dictionary_by_language", 266 | "copy_jsonl_to_postgres", 267 | ] 268 | -------------------------------------------------------------------------------- /src/open_dictionary/workflow.py: -------------------------------------------------------------------------------- 1 | from concurrent.futures import ThreadPoolExecutor, as_completed 2 | from typing import Any 3 | import logging 4 | import sys 5 | import time 6 | 7 | from open_dictionary.db.access import DatabaseAccess 8 | from open_dictionary.db.sqlite_manager import SQLiteManager 9 | from open_dictionary.llm.define import define, Definition 10 | 11 | # Configure logging 12 | logging.basicConfig( 13 | level=logging.INFO, 14 | format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', 15 | stream=sys.stderr 16 | ) 17 | logger = logging.getLogger(__name__) 18 | 19 | 20 | class ProgressReporter: 21 | """Report progress of definition generation with statistics.""" 22 | 23 | def __init__( 24 | self, 25 | *, 26 | min_time_step: float = 5.0, 27 | min_count_step: int = 10, 28 | ): 29 | self.min_time_step = max(min_time_step, 0.0) 30 | self.min_count_step = max(min_count_step, 1) 31 | self._last_report_time = time.monotonic() 32 | self._last_report_count = 0 33 | self._start_time = time.monotonic() 34 | 35 | def maybe_report( 36 | self, 37 | processed: int, 38 | failed: int, 39 | *, 40 | force: bool = False 41 | ) -> None: 42 | """Report progress if enough time/items have passed.""" 43 | now = time.monotonic() 44 | count_increment = processed - self._last_report_count 45 | 46 | if not force: 47 | if ( 48 | count_increment < self.min_count_step 49 | and (now - self._last_report_time) < self.min_time_step 50 | ): 51 | return 52 | 53 | elapsed = now - self._start_time 54 | total = processed + failed 55 | rate = processed / elapsed if elapsed > 0 else 0 56 | 57 | message = ( 58 | f"Progress: {processed:,} processed | {failed:,} failed | " 59 | f"{total:,} total | {rate:.1f} items/sec" 60 | ) 61 | logger.info(message) 62 | 63 | self._last_report_time = now 64 | self._last_report_count = processed 65 | 66 | def finalize(self, processed: int, failed: int) -> None: 67 | """Print final 
statistics.""" 68 | elapsed = time.monotonic() - self._start_time 69 | total = processed + failed 70 | rate = processed / elapsed if elapsed > 0 else 0 71 | 72 | logger.info("=" * 60) 73 | logger.info(f"Processing complete!") 74 | logger.info(f"Total processed: {processed:,}") 75 | logger.info(f"Total failed: {failed:,}") 76 | logger.info(f"Total items: {total:,}") 77 | logger.info(f"Success rate: {(processed/total*100 if total > 0 else 0):.1f}%") 78 | logger.info(f"Total time: {elapsed:.1f} seconds") 79 | logger.info(f"Average rate: {rate:.1f} items/sec") 80 | logger.info("=" * 60) 81 | 82 | 83 | def process_single_word(word_data: dict[str, Any]) -> tuple[str, dict[str, Any]] | None: 84 | """Process a single word definition request. 85 | 86 | Args: 87 | word_data: Dictionary containing word data from PostgreSQL 88 | 89 | Returns: 90 | Tuple of (word, definition_dict) or None if processing failed 91 | """ 92 | try: 93 | logger.debug(f"Processing word data keys: {list(word_data.keys())}") 94 | definition = define(word_data) 95 | result = (definition.word, definition.model_dump()) 96 | logger.debug(f"Successfully processed word: {definition.word}") 97 | return result 98 | except Exception as e: 99 | logger.error(f"Failed to process word '{word_data.get('word', 'unknown')}': {e}", exc_info=True) 100 | return None 101 | 102 | 103 | def run_parallel_definitions( 104 | table_name: str = "dictionary_en", 105 | batch_size: int = 50, 106 | max_workers: int = 50, 107 | sqlite_path: str = "data/dictionary.sqlite", 108 | limit: int | None = None, 109 | ): 110 | """Process dictionary entries in parallel and store in SQLite. 111 | 112 | This function reads from PostgreSQL, sends definition requests to LLM in parallel, 113 | and writes results to SQLite. 114 | 115 | Args: 116 | table_name: Name of the PostgreSQL table to read from 117 | batch_size: Number of rows to fetch from PostgreSQL per batch 118 | max_workers: Maximum number of parallel LLM requests 119 | sqlite_path: Path to SQLite database file 120 | limit: Optional limit on number of words to process 121 | """ 122 | db_access = DatabaseAccess() 123 | sqlite_manager = SQLiteManager(sqlite_path) 124 | progress = ProgressReporter() 125 | 126 | logger.info(f"Starting parallel definition processing with {max_workers} workers") 127 | logger.info(f"Reading from PostgreSQL table: {table_name}") 128 | logger.info(f"Writing to SQLite: {sqlite_path}") 129 | if limit: 130 | logger.info(f"Processing limit: {limit:,} words") 131 | 132 | processed_count = 0 133 | failed_count = 0 134 | pending_batch = [] 135 | 136 | with ThreadPoolExecutor(max_workers=max_workers) as executor: 137 | # Iterator to track all submitted futures 138 | future_to_word = {} 139 | 140 | # Iterate through PostgreSQL table 141 | row_iterator = db_access.iterate_table( 142 | table_name=table_name, 143 | batch_size=batch_size, 144 | ) 145 | 146 | for row in row_iterator: 147 | # Check limit 148 | if limit and processed_count >= limit: 149 | break 150 | 151 | # Extract the data field if present (PostgreSQL stores JSON in 'data' column) 152 | word_data = row.get('data', row) 153 | word_key = word_data.get('word', 'unknown') if isinstance(word_data, dict) else 'unknown' 154 | 155 | # Submit word for processing 156 | future = executor.submit(process_single_word, word_data) 157 | future_to_word[future] = word_key 158 | 159 | # When we have max_workers futures pending, wait for some to complete 160 | if len(future_to_word) >= max_workers: 161 | # Wait for at least one to complete 162 | for 
future in as_completed(list(future_to_word.keys())): 163 | # Process this completed future and break to continue submitting 164 | if future not in future_to_word: 165 | continue 166 | 167 | word_key = future_to_word.pop(future) 168 | result = future.result() 169 | 170 | if result: 171 | word, definition = result 172 | pending_batch.append((word, definition)) 173 | processed_count += 1 174 | logger.debug(f"Added '{word}' to pending batch (size: {len(pending_batch)})") 175 | 176 | # Write batch when it reaches batch_size 177 | if len(pending_batch) >= batch_size: 178 | logger.debug(f"Writing batch of {len(pending_batch)} definitions to SQLite") 179 | sqlite_manager.insert_definitions_batch(pending_batch) 180 | logger.info(f"Wrote batch to SQLite. Total in DB: {sqlite_manager.count_definitions()}") 181 | pending_batch = [] 182 | 183 | # Report progress 184 | progress.maybe_report(processed_count, failed_count) 185 | else: 186 | failed_count += 1 187 | logger.warning(f"Failed to process: {word_key}") 188 | progress.maybe_report(processed_count, failed_count) 189 | 190 | # Break after processing one to continue submitting more work 191 | break 192 | 193 | # Wait for remaining futures 194 | for future in as_completed(future_to_word.keys()): 195 | word_key = future_to_word[future] 196 | result = future.result() 197 | 198 | if result: 199 | word, definition = result 200 | pending_batch.append((word, definition)) 201 | processed_count += 1 202 | progress.maybe_report(processed_count, failed_count) 203 | else: 204 | failed_count += 1 205 | logger.warning(f"Failed to process: {word_key}") 206 | progress.maybe_report(processed_count, failed_count) 207 | 208 | # Write any remaining definitions 209 | if pending_batch: 210 | logger.info(f"Writing final batch of {len(pending_batch)} definitions to SQLite") 211 | sqlite_manager.insert_definitions_batch(pending_batch) 212 | logger.info(f"Final batch written. Total in DB: {sqlite_manager.count_definitions()}") 213 | 214 | # Final statistics 215 | progress.finalize(processed_count, failed_count) 216 | final_count = sqlite_manager.count_definitions() 217 | logger.info(f"Total definitions in SQLite: {final_count:,}") 218 | 219 | if final_count != processed_count: 220 | logger.warning(f"Mismatch: processed {processed_count} but only {final_count} in database!") 221 | 222 | 223 | if __name__ == "__main__": 224 | import argparse 225 | 226 | parser = argparse.ArgumentParser( 227 | description="Generate dictionary definitions using LLM in parallel." 
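        # Flag defaults below mirror the keyword defaults of run_parallel_definitions().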
228 | ) 229 | parser.add_argument( 230 | "--table", 231 | default="dictionary_en", 232 | help="PostgreSQL table to read dictionary entries from (default: dictionary_en).", 233 | ) 234 | parser.add_argument( 235 | "--batch-size", 236 | type=int, 237 | default=50, 238 | help="Number of rows to fetch from PostgreSQL per batch (default: 50).", 239 | ) 240 | parser.add_argument( 241 | "--workers", 242 | type=int, 243 | default=50, 244 | help="Maximum number of parallel LLM requests (default: 50).", 245 | ) 246 | parser.add_argument( 247 | "--sqlite-path", 248 | default="data/dictionary.sqlite", 249 | help="Path to SQLite database file for storing definitions (default: data/dictionary.sqlite).", 250 | ) 251 | parser.add_argument( 252 | "--limit", 253 | type=int, 254 | help="Optional limit on number of words to process (for testing).", 255 | ) 256 | 257 | args = parser.parse_args() 258 | 259 | run_parallel_definitions( 260 | table_name=args.table, 261 | batch_size=args.batch_size, 262 | max_workers=args.workers, 263 | sqlite_path=args.sqlite_path, 264 | limit=args.limit, 265 | ) 266 | 267 | -------------------------------------------------------------------------------- /src/open_dictionary/wikitionary/pre_process.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import json 4 | import time 5 | from typing import Any, Sequence 6 | 7 | from psycopg import sql 8 | from psycopg.cursor import Cursor 9 | 10 | from open_dictionary.db.access import DatabaseAccess 11 | 12 | try: 13 | from toon import encode as toon_encode 14 | except ImportError: 15 | toon_encode = None 16 | 17 | FETCH_BATCH_SIZE = 5000 18 | UPDATE_BATCH_SIZE = 5000 19 | PROGRESS_EVERY_ROWS = 20_000 20 | PROGRESS_EVERY_SECONDS = 30.0 21 | 22 | _ALLOWED_TOP_LEVEL_KEYS = ( 23 | "pos", 24 | "word", 25 | "forms", 26 | "derived", 27 | "etymology_text", 28 | ) 29 | _SENSE_KEYS = ("glosses", "raw_glosses") 30 | 31 | 32 | def preprocess_entries( 33 | *, 34 | table_name: str, 35 | source_column: str = "data", 36 | target_column: str = "process", 37 | fetch_batch_size: int = FETCH_BATCH_SIZE, 38 | update_batch_size: int = UPDATE_BATCH_SIZE, 39 | progress_every_rows: int = PROGRESS_EVERY_ROWS, 40 | progress_every_seconds: float = PROGRESS_EVERY_SECONDS, 41 | recompute_existing: bool = False, 42 | use_toon: bool = False, 43 | ) -> None: 44 | """Normalize Wiktionary payloads into a slimmer JSONB column.""" 45 | 46 | if fetch_batch_size <= 0: 47 | raise ValueError("fetch_batch_size must be positive") 48 | if update_batch_size <= 0: 49 | raise ValueError("update_batch_size must be positive") 50 | if use_toon and toon_encode is None: 51 | raise ValueError("TOON format requested but 'toon' package is not installed") 52 | 53 | data_access = DatabaseAccess() 54 | _ensure_target_column(data_access, table_name, target_column, use_toon) 55 | 56 | where_clause = None 57 | if not recompute_existing: 58 | where_clause = sql.SQL("{column} IS NULL").format( 59 | column=sql.Identifier(target_column) 60 | ) 61 | 62 | print( 63 | "[pre-process] starting " 64 | f"table={table_name} source={source_column} target={target_column} " 65 | f"fetch_batch={fetch_batch_size} update_batch={update_batch_size} " 66 | f"progress_rows={progress_every_rows} progress_seconds={progress_every_seconds} " 67 | f"recompute_existing={recompute_existing} use_toon={use_toon}", 68 | flush=True, 69 | ) 70 | 71 | processed = 0 72 | updated = 0 73 | skipped = 0 74 | start_time = time.monotonic() 75 | last_log_time 
= start_time 76 | pending_updates: list[tuple[int, str]] = [] 77 | 78 | with data_access.get_connection() as update_conn: 79 | with update_conn.cursor() as cursor: 80 | row_stream = data_access.iterate_table( 81 | table_name, 82 | batch_size=fetch_batch_size, 83 | columns=( 84 | "id", 85 | source_column, 86 | target_column, 87 | ), 88 | where=where_clause, 89 | order_by=("id",), 90 | ) 91 | 92 | for row in row_stream: 93 | row_id = row.get("id") 94 | if row_id is None: 95 | skipped += 1 96 | continue 97 | 98 | payload = _load_payload(row.get(source_column)) 99 | if payload is None: 100 | skipped += 1 101 | continue 102 | 103 | processed_payload = _preprocess_payload(payload) 104 | 105 | if use_toon: 106 | payload_str = convert_to_toon(processed_payload) 107 | else: 108 | payload_str = json.dumps( 109 | processed_payload, 110 | ensure_ascii=False, 111 | separators=(",", ":"), 112 | ) 113 | 114 | pending_updates.append((int(row_id), payload_str)) 115 | 116 | if len(pending_updates) >= update_batch_size: 117 | batch_count = _flush_updates( 118 | cursor, 119 | table_name, 120 | target_column, 121 | pending_updates, 122 | use_toon, 123 | ) 124 | update_conn.commit() 125 | updated += batch_count 126 | pending_updates.clear() 127 | 128 | processed += 1 129 | 130 | emit_progress = False 131 | now = time.monotonic() 132 | if processed == 1: 133 | emit_progress = True 134 | elif progress_every_rows and processed % progress_every_rows == 0: 135 | emit_progress = True 136 | elif progress_every_seconds and (now - last_log_time) >= progress_every_seconds: 137 | emit_progress = True 138 | 139 | if emit_progress: 140 | _report_progress(processed, updated, skipped, start_time) 141 | last_log_time = now 142 | 143 | if pending_updates: 144 | batch_count = _flush_updates( 145 | cursor, 146 | table_name, 147 | target_column, 148 | pending_updates, 149 | use_toon, 150 | ) 151 | update_conn.commit() 152 | updated += batch_count 153 | pending_updates.clear() 154 | 155 | _report_completion(processed, updated, skipped, start_time) 156 | 157 | 158 | def _ensure_target_column( 159 | data_access: DatabaseAccess, 160 | table_name: str, 161 | target_column: str, 162 | use_toon: bool = False, 163 | ) -> None: 164 | column_type = "TEXT" if use_toon else "JSONB" 165 | with data_access.get_connection() as conn: 166 | with conn.cursor() as cursor: 167 | cursor.execute( 168 | sql.SQL( 169 | """ 170 | ALTER TABLE {table} 171 | ADD COLUMN IF NOT EXISTS {column} {type} 172 | """ 173 | ).format( 174 | table=sql.Identifier(table_name), 175 | column=sql.Identifier(target_column), 176 | type=sql.SQL(column_type), 177 | ) 178 | ) 179 | conn.commit() 180 | 181 | 182 | def _flush_updates( 183 | cursor: Cursor[Any], 184 | table_name: str, 185 | target_column: str, 186 | payloads: Sequence[tuple[int, str]], 187 | use_toon: bool = False, 188 | ) -> int: 189 | if not payloads: 190 | return 0 191 | 192 | values_sql = sql.SQL(", ").join( 193 | sql.SQL("(%s::bigint, %s::text)") for _ in payloads 194 | ) 195 | 196 | # When using TOON, store as TEXT; otherwise cast to JSONB 197 | cast_type = "text" if use_toon else "jsonb" 198 | 199 | update_sql = sql.SQL( 200 | """ 201 | UPDATE {table} AS t 202 | SET {column} = v.payload::{cast_type} 203 | FROM (VALUES {values}) AS v(id, payload) 204 | WHERE t.id = v.id 205 | """ 206 | ).format( 207 | table=sql.Identifier(table_name), 208 | column=sql.Identifier(target_column), 209 | cast_type=sql.SQL(cast_type), 210 | values=values_sql, 211 | ) 212 | 213 | params: list[Any] = [] 214 | for row_id, payload_json 
in payloads: 215 | params.extend((row_id, payload_json)) 216 | 217 | cursor.execute(update_sql, params) 218 | return cursor.rowcount 219 | 220 | 221 | def _preprocess_payload(payload: dict[str, Any]) -> dict[str, Any]: 222 | result: dict[str, Any] = {} 223 | 224 | for key in _ALLOWED_TOP_LEVEL_KEYS: 225 | if key in payload: 226 | value = payload[key] 227 | if value is not None: 228 | result[key] = value 229 | 230 | senses = _extract_senses(payload.get("senses")) 231 | if senses is not None: 232 | result["senses"] = senses 233 | 234 | sounds = _extract_sounds(payload.get("sounds")) 235 | if sounds is not None: 236 | result["sounds"] = sounds 237 | 238 | related = _extract_related(payload.get("related")) 239 | if related is not None: 240 | result["related"] = related 241 | 242 | return result 243 | 244 | 245 | def convert_to_toon(payload: dict[str, Any]) -> str: 246 | """Convert a preprocessed dictionary payload to TOON format. 247 | 248 | Args: 249 | payload: The preprocessed dictionary entry (output of _preprocess_payload). 250 | 251 | Returns: 252 | A string representation in TOON format. 253 | 254 | Raises: 255 | ValueError: If the toon package is not available. 256 | """ 257 | if toon_encode is None: 258 | raise ValueError("TOON format requested but 'toon' package is not installed") 259 | 260 | return toon_encode(payload) 261 | 262 | 263 | def _extract_senses(value: Any) -> list[dict[str, list[str]]] | None: 264 | if not isinstance(value, list): 265 | return None 266 | 267 | senses: list[dict[str, list[str]]] = [] 268 | for item in value: 269 | if not isinstance(item, dict): 270 | continue 271 | 272 | sense: dict[str, list[str]] = {} 273 | for key in _SENSE_KEYS: 274 | normalized = _ensure_string_list(item.get(key)) 275 | if normalized is not None: 276 | sense[key] = normalized 277 | 278 | if sense: 279 | senses.append(sense) 280 | 281 | if not senses: 282 | return None 283 | return senses 284 | 285 | 286 | def _extract_sounds(value: Any) -> list[str] | None: 287 | if not isinstance(value, list): 288 | return None 289 | 290 | urls: list[str] = [] 291 | seen: set[str] = set() 292 | for item in value: 293 | if isinstance(item, dict): 294 | candidate = item.get("ogg_url") 295 | else: 296 | candidate = None 297 | 298 | if not isinstance(candidate, str): 299 | continue 300 | 301 | trimmed = candidate.strip() 302 | if not trimmed or trimmed in seen: 303 | continue 304 | 305 | urls.append(trimmed) 306 | seen.add(trimmed) 307 | 308 | if not urls: 309 | return None 310 | return urls 311 | 312 | 313 | def _extract_related(value: Any) -> list[str] | None: 314 | if not isinstance(value, list): 315 | return None 316 | 317 | items: list[str] = [] 318 | seen: set[str] = set() 319 | 320 | for entry in value: 321 | candidate: Any 322 | if isinstance(entry, dict): 323 | candidate = entry.get("word") 324 | elif isinstance(entry, (list, tuple)) and entry: 325 | candidate = entry[0] 326 | else: 327 | candidate = entry 328 | 329 | if not isinstance(candidate, str): 330 | continue 331 | 332 | word = candidate.strip() 333 | if not word or word in seen: 334 | continue 335 | 336 | items.append(word) 337 | seen.add(word) 338 | 339 | if not items: 340 | return None 341 | return items 342 | 343 | 344 | def _ensure_string_list(value: Any) -> list[str] | None: 345 | if value is None: 346 | return None 347 | 348 | items: list[str] = [] 349 | 350 | if isinstance(value, str): 351 | trimmed = value.strip() 352 | if trimmed: 353 | items.append(trimmed) 354 | elif isinstance(value, (list, tuple)): 355 | for entry in value: 
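            # Drop non-string entries and blank strings; only trimmed text is kept.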
356 | if not isinstance(entry, str): 357 | continue 358 | trimmed = entry.strip() 359 | if trimmed: 360 | items.append(trimmed) 361 | else: 362 | return None 363 | 364 | if not items: 365 | return None 366 | return items 367 | 368 | 369 | def _load_payload(value: Any) -> dict[str, Any] | None: 370 | if isinstance(value, dict): 371 | return value 372 | if value is None: 373 | return None 374 | if isinstance(value, bytes): 375 | try: 376 | decoded = value.decode("utf-8") 377 | except UnicodeDecodeError: 378 | return None 379 | return _load_payload(decoded) 380 | if isinstance(value, memoryview): 381 | return _load_payload(value.tobytes()) 382 | if isinstance(value, str): 383 | try: 384 | decoded = json.loads(value) 385 | except json.JSONDecodeError: 386 | return None 387 | if isinstance(decoded, dict): 388 | return decoded 389 | return None 390 | return None 391 | 392 | 393 | def _report_progress( 394 | processed: int, 395 | updated: int, 396 | skipped: int, 397 | start_time: float, 398 | ) -> None: 399 | elapsed = max(time.monotonic() - start_time, 1e-6) 400 | processed_rate = processed / elapsed 401 | print( 402 | f"[pre-process] progress processed={processed:,} " 403 | f"updated={updated:,} skipped={skipped:,} " 404 | f"elapsed={elapsed:,.1f}s rate={processed_rate:,.0f} rows/s", 405 | flush=True, 406 | ) 407 | 408 | 409 | def _report_completion( 410 | processed: int, 411 | updated: int, 412 | skipped: int, 413 | start_time: float, 414 | ) -> None: 415 | elapsed = max(time.monotonic() - start_time, 1e-6) 416 | processed_rate = processed / elapsed if processed else 0.0 417 | print( 418 | f"[pre-process] completed processed={processed:,} " 419 | f"updated={updated:,} skipped={skipped:,} " 420 | f"elapsed={elapsed:,.1f}s avg_rate={processed_rate:,.0f} rows/s", 421 | flush=True, 422 | ) 423 | 424 | 425 | __all__ = [ 426 | "FETCH_BATCH_SIZE", 427 | "UPDATE_BATCH_SIZE", 428 | "PROGRESS_EVERY_ROWS", 429 | "PROGRESS_EVERY_SECONDS", 430 | "preprocess_entries", 431 | "convert_to_toon", 432 | ] 433 | 434 | -------------------------------------------------------------------------------- /src/open_dictionary/llm/define_enricher.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import concurrent.futures 4 | import json 5 | import random 6 | import time 7 | from dataclasses import dataclass 8 | from typing import Any, Callable, Sequence 9 | 10 | from psycopg import sql 11 | from psycopg.cursor import Cursor 12 | 13 | from open_dictionary.db.access import DatabaseAccess 14 | from open_dictionary.llm.define import Definition, define 15 | 16 | DEFAULT_TABLE_NAME = "dictionary_filtered_en" 17 | DEFAULT_SOURCE_COLUMN = "data" 18 | DEFAULT_TARGET_COLUMN = "new_speak" 19 | DEFAULT_FETCH_BATCH_SIZE = 400 20 | DEFAULT_LLM_BATCH_SIZE = 40 21 | DEFAULT_MAX_WORKERS = DEFAULT_LLM_BATCH_SIZE 22 | DEFAULT_MAX_RETRIES = 5 23 | DEFAULT_INITIAL_BACKOFF_SECONDS = 5.0 24 | DEFAULT_MAX_BACKOFF_SECONDS = 60.0 25 | DEFAULT_PROGRESS_EVERY_ROWS = 120 26 | DEFAULT_PROGRESS_EVERY_SECONDS = 30.0 27 | 28 | 29 | @dataclass(frozen=True) 30 | class RowPayload: 31 | row_id: int 32 | payload: str 33 | 34 | 35 | def enrich_definitions( 36 | *, 37 | table_name: str = DEFAULT_TABLE_NAME, 38 | source_column: str = DEFAULT_SOURCE_COLUMN, 39 | target_column: str = DEFAULT_TARGET_COLUMN, 40 | fetch_batch_size: int = DEFAULT_FETCH_BATCH_SIZE, 41 | llm_batch_size: int = DEFAULT_LLM_BATCH_SIZE, 42 | max_workers: int | None = None, 43 | max_retries: int = 
DEFAULT_MAX_RETRIES, 44 | initial_backoff_seconds: float = DEFAULT_INITIAL_BACKOFF_SECONDS, 45 | max_backoff_seconds: float = DEFAULT_MAX_BACKOFF_SECONDS, 46 | progress_every_rows: int = DEFAULT_PROGRESS_EVERY_ROWS, 47 | progress_every_seconds: float = DEFAULT_PROGRESS_EVERY_SECONDS, 48 | recompute_existing: bool = False, 49 | ) -> None: 50 | """Generate LLM-enriched dictionary entries and store them in a JSONB column.""" 51 | 52 | if llm_batch_size <= 0: 53 | raise ValueError("llm_batch_size must be positive") 54 | if fetch_batch_size <= 0: 55 | raise ValueError("fetch_batch_size must be positive") 56 | if max_workers is not None and max_workers <= 0: 57 | raise ValueError("max_workers must be positive when provided") 58 | 59 | data_access = DatabaseAccess() 60 | _ensure_target_column(data_access, table_name, target_column) 61 | 62 | where_clause = None 63 | if not recompute_existing: 64 | where_clause = sql.SQL("{column} IS NULL").format( 65 | column=sql.Identifier(target_column) 66 | ) 67 | 68 | max_workers = max_workers or llm_batch_size 69 | 70 | print( 71 | "[llm-define] starting " 72 | f"table={table_name} source={source_column} target={target_column} " 73 | f"fetch_batch={fetch_batch_size} llm_batch={llm_batch_size} " 74 | f"max_workers={max_workers} retries={max_retries} " 75 | f"backoff_start={initial_backoff_seconds}s backoff_max={max_backoff_seconds}s " 76 | f"recompute_existing={recompute_existing}", 77 | flush=True, 78 | ) 79 | 80 | processed = 0 81 | succeeded = 0 82 | failed = 0 83 | start_time = time.monotonic() 84 | last_log_time = start_time 85 | last_log_count = 0 86 | pending_rows: list[RowPayload] = [] 87 | 88 | def emit_progress(force: bool = False) -> None: 89 | nonlocal last_log_time, last_log_count 90 | now = time.monotonic() 91 | should_emit = force 92 | if not should_emit: 93 | if progress_every_rows and processed - last_log_count >= progress_every_rows: 94 | should_emit = True 95 | if progress_every_seconds and (now - last_log_time) >= progress_every_seconds: 96 | should_emit = True 97 | if should_emit: 98 | _report_progress(processed, succeeded, failed, start_time) 99 | last_log_time = now 100 | last_log_count = processed 101 | 102 | def record_result(is_success: bool) -> None: 103 | nonlocal processed, succeeded, failed 104 | processed += 1 105 | if is_success: 106 | succeeded += 1 107 | else: 108 | failed += 1 109 | emit_progress(force=True) 110 | 111 | with data_access.get_connection() as update_conn: 112 | with update_conn.cursor() as cursor: 113 | row_stream = data_access.iterate_table( 114 | table_name, 115 | batch_size=fetch_batch_size, 116 | columns=( 117 | "id", 118 | source_column, 119 | target_column, 120 | ), 121 | where=where_clause, 122 | order_by=("common_score",), 123 | ) 124 | 125 | for row in row_stream: 126 | row_id = row.get("id") 127 | if row_id is None: 128 | failed += 1 129 | processed += 1 130 | print("[llm-define] skipped row without id", flush=True) 131 | emit_progress(force=True) 132 | continue 133 | 134 | payload = _load_payload(row.get(source_column)) 135 | if payload is None: 136 | failed += 1 137 | processed += 1 138 | print( 139 | f"[llm-define] row_id={row_id} missing or invalid {source_column}", 140 | flush=True, 141 | ) 142 | emit_progress(force=True) 143 | continue 144 | 145 | sanitized_payload = _sanitize_payload(payload) 146 | 147 | pending_rows.append(RowPayload(int(row_id), sanitized_payload)) 148 | 149 | if len(pending_rows) >= llm_batch_size: 150 | _process_batch( 151 | cursor, 152 | table_name, 153 | target_column, 
154 | pending_rows, 155 | max_workers, 156 | max_retries, 157 | initial_backoff_seconds, 158 | max_backoff_seconds, 159 | record_result, 160 | ) 161 | pending_rows.clear() 162 | update_conn.commit() 163 | 164 | if pending_rows: 165 | _process_batch( 166 | cursor, 167 | table_name, 168 | target_column, 169 | pending_rows, 170 | max_workers, 171 | max_retries, 172 | initial_backoff_seconds, 173 | max_backoff_seconds, 174 | record_result, 175 | ) 176 | pending_rows.clear() 177 | update_conn.commit() 178 | 179 | _report_completion(processed, succeeded, failed, start_time) 180 | 181 | 182 | def _process_batch( 183 | cursor: Cursor[Any], 184 | table_name: str, 185 | target_column: str, 186 | rows: Sequence[RowPayload], 187 | max_workers: int, 188 | max_retries: int, 189 | initial_backoff_seconds: float, 190 | max_backoff_seconds: float, 191 | record_result: Callable[[bool], None], 192 | ) -> None: 193 | successes = _run_llm_batch( 194 | rows, 195 | max_workers, 196 | max_retries, 197 | initial_backoff_seconds, 198 | max_backoff_seconds, 199 | record_result, 200 | ) 201 | 202 | _apply_updates(cursor, table_name, target_column, successes) 203 | 204 | 205 | def _run_llm_batch( 206 | rows: Sequence[RowPayload], 207 | max_workers: int, 208 | max_retries: int, 209 | initial_backoff_seconds: float, 210 | max_backoff_seconds: float, 211 | record_result: Callable[[bool], None], 212 | ) -> list[tuple[int, str]]: 213 | successes: list[tuple[int, str]] = [] 214 | 215 | worker_count = min(max(len(rows), 1), max_workers) 216 | 217 | with concurrent.futures.ThreadPoolExecutor(max_workers=worker_count) as executor: 218 | future_to_row = { 219 | executor.submit( 220 | _define_with_retry, 221 | row.payload, 222 | max_retries, 223 | initial_backoff_seconds, 224 | max_backoff_seconds, 225 | ): row 226 | for row in rows 227 | } 228 | 229 | for future in concurrent.futures.as_completed(future_to_row): 230 | row = future_to_row[future] 231 | try: 232 | definition = future.result() 233 | except Exception as exc: # pragma: no cover - network/runtime failures 234 | print( 235 | f"[llm-define] row_id={row.row_id} failed: {exc}", 236 | flush=True, 237 | ) 238 | _log_error(row.row_id, row.payload, exc) 239 | record_result(False) 240 | else: 241 | payload_json = json.dumps( 242 | definition.model_dump(mode="json"), 243 | ensure_ascii=False, 244 | ) 245 | successes.append((row.row_id, payload_json)) 246 | record_result(True) 247 | 248 | return successes 249 | 250 | 251 | def _define_with_retry( 252 | payload: str, 253 | max_retries: int, 254 | initial_backoff_seconds: float, 255 | max_backoff_seconds: float, 256 | ) -> Definition: 257 | attempt = 0 258 | while True: 259 | try: 260 | return define(payload) 261 | except Exception as exc: # pragma: no cover - passthrough for runtime errors 262 | attempt += 1 263 | if attempt >= max_retries: 264 | raise exc 265 | 266 | backoff = min( 267 | max_backoff_seconds, 268 | initial_backoff_seconds * (2 ** (attempt - 1)), 269 | ) 270 | jitter = random.uniform(0.0, initial_backoff_seconds) 271 | sleep_seconds = max(backoff + jitter, 0.0) 272 | time.sleep(sleep_seconds) 273 | 274 | 275 | def _apply_updates( 276 | cursor: Cursor[Any], 277 | table_name: str, 278 | target_column: str, 279 | payloads: Sequence[tuple[int, str]], 280 | ) -> None: 281 | if not payloads: 282 | return 283 | 284 | values_sql = sql.SQL(", ").join( 285 | sql.SQL("(%s::bigint, %s::text)") for _ in payloads 286 | ) 287 | 288 | update_sql = sql.SQL( 289 | """ 290 | UPDATE {table} AS t 291 | SET {column} = v.payload::jsonb 
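            -- Bulk write: join the target table against an inline VALUES list of (id, payload) pairs.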
292 | FROM (VALUES {values}) AS v(id, payload) 293 | WHERE t.id = v.id 294 | """ 295 | ).format( 296 | table=sql.Identifier(table_name), 297 | column=sql.Identifier(target_column), 298 | values=values_sql, 299 | ) 300 | 301 | params: list[Any] = [] 302 | for row_id, payload_json in payloads: 303 | params.extend((row_id, payload_json)) 304 | 305 | cursor.execute(update_sql, params) 306 | 307 | 308 | def _ensure_target_column( 309 | data_access: DatabaseAccess, 310 | table_name: str, 311 | target_column: str, 312 | ) -> None: 313 | with data_access.get_connection() as conn: 314 | with conn.cursor() as cursor: 315 | cursor.execute( 316 | sql.SQL( 317 | """ 318 | ALTER TABLE {table} 319 | ADD COLUMN IF NOT EXISTS {column} JSONB 320 | """ 321 | ).format( 322 | table=sql.Identifier(table_name), 323 | column=sql.Identifier(target_column), 324 | ) 325 | ) 326 | conn.commit() 327 | 328 | 329 | def _load_payload(value: Any) -> str | None: 330 | if isinstance(value, str): 331 | return value 332 | if value is None: 333 | return None 334 | if isinstance(value, dict): 335 | return json.dumps(value, ensure_ascii=False) 336 | if isinstance(value, bytes): 337 | try: 338 | decoded = value.decode("utf-8") 339 | except UnicodeDecodeError: 340 | return None 341 | return decoded 342 | if isinstance(value, memoryview): 343 | return _load_payload(value.tobytes()) 344 | return None 345 | 346 | 347 | def _sanitize_payload(payload: str, max_length: int = 1000) -> str: 348 | """Trim overly large payloads by dropping noisy fields.""" 349 | if len(payload) <= max_length: 350 | return payload 351 | 352 | try: 353 | payload_obj = json.loads(payload) 354 | except (TypeError, json.JSONDecodeError): 355 | return payload 356 | 357 | if isinstance(payload_obj, dict): 358 | for key in ("derived", "forms", "glosses"): 359 | payload_obj.pop(key, None) 360 | 361 | senses = payload_obj.get("senses") 362 | if isinstance(senses, list): 363 | for sense in senses: 364 | if isinstance(sense, dict): 365 | sense.pop("glosses", None) 366 | 367 | return json.dumps(payload_obj, ensure_ascii=False) 368 | 369 | return payload 370 | 371 | 372 | def _report_progress( 373 | processed: int, 374 | succeeded: int, 375 | failed: int, 376 | start_time: float, 377 | ) -> None: 378 | elapsed = max(time.monotonic() - start_time, 1e-6) 379 | rate = processed / elapsed 380 | print( 381 | f"[llm-define] progress processed={processed:,} " 382 | f"succeeded={succeeded:,} failed={failed:,} " 383 | f"elapsed={elapsed:,.1f}s rate={rate:,.0f} rows/s", 384 | flush=True, 385 | ) 386 | 387 | 388 | def _report_completion( 389 | processed: int, 390 | succeeded: int, 391 | failed: int, 392 | start_time: float, 393 | ) -> None: 394 | elapsed = max(time.monotonic() - start_time, 1e-6) 395 | rate = processed / elapsed if processed else 0.0 396 | print( 397 | f"[llm-define] completed processed={processed:,} " 398 | f"succeeded={succeeded:,} failed={failed:,} " 399 | f"elapsed={elapsed:,.1f}s avg_rate={rate:,.0f} rows/s", 400 | flush=True, 401 | ) 402 | 403 | 404 | def _log_error( 405 | row_id: int, 406 | payload: str, 407 | error: Exception, 408 | log_file: str = "data/llm_define_errors.log", 409 | ) -> None: 410 | """Write error details to a log file.""" 411 | import os 412 | from datetime import datetime 413 | 414 | try: 415 | os.makedirs(os.path.dirname(log_file), exist_ok=True) 416 | with open(log_file, "a", encoding="utf-8") as f: 417 | timestamp = datetime.now().isoformat() 418 | f.write(f"\n{'='*80}\n") 419 | f.write(f"Timestamp: {timestamp}\n") 420 | f.write(f"Row 
ID: {row_id}\n") 421 | f.write(f"Error: {type(error).__name__}: {error}\n") 422 | f.write(f"\nPayload:\n{payload}\n") 423 | 424 | # Log the LLM response if it's attached to the exception 425 | if hasattr(error, 'llm_response'): 426 | f.write(f"\nLLM Response:\n{error.llm_response}\n") # type: ignore 427 | 428 | f.write(f"{'='*80}\n") 429 | except Exception as log_exc: # pragma: no cover 430 | print( 431 | f"[llm-define] failed to write error log: {log_exc}", 432 | flush=True, 433 | ) 434 | 435 | 436 | __all__ = [ 437 | "DEFAULT_TABLE_NAME", 438 | "DEFAULT_SOURCE_COLUMN", 439 | "DEFAULT_TARGET_COLUMN", 440 | "DEFAULT_FETCH_BATCH_SIZE", 441 | "DEFAULT_LLM_BATCH_SIZE", 442 | "DEFAULT_MAX_WORKERS", 443 | "DEFAULT_MAX_RETRIES", 444 | "DEFAULT_INITIAL_BACKOFF_SECONDS", 445 | "DEFAULT_MAX_BACKOFF_SECONDS", 446 | "DEFAULT_PROGRESS_EVERY_ROWS", 447 | "DEFAULT_PROGRESS_EVERY_SECONDS", 448 | "enrich_definitions", 449 | ] 450 | -------------------------------------------------------------------------------- /src/open_dictionary/cli.py: -------------------------------------------------------------------------------- 1 | """Command-line entry point for the Open Dictionary toolkit.""" 2 | 3 | from __future__ import annotations 4 | 5 | import argparse 6 | import os 7 | import sys 8 | from pathlib import Path 9 | 10 | import psycopg 11 | from dotenv import load_dotenv 12 | 13 | from .db import cleaner as db_cleaner 14 | from .db import mark_commonness as db_commonness 15 | from .llm import define_enricher as llm_define_enricher 16 | from .wikitionary.downloader import DEFAULT_WIKTIONARY_URL, download_wiktionary_dump 17 | from .wikitionary.extract import extract_wiktionary_dump 18 | from .wikitionary.filter import filter_languages 19 | from .wikitionary import pre_process as wiktionary_pre_process 20 | from .wikitionary.pipeline import run_pipeline 21 | from .wikitionary.transform import ( 22 | JsonlProcessingError, 23 | copy_jsonl_to_postgres, 24 | partition_dictionary_by_language, 25 | ) 26 | 27 | 28 | DEFAULT_DICTIONARY_TABLE = "dictionary_en" 29 | 30 | 31 | COMMAND_NAMES = { 32 | "download", 33 | "extract", 34 | "filter", 35 | "load", 36 | "partition", 37 | "pipeline", 38 | "db-clean", 39 | "db-commonness", 40 | "llm-define", 41 | "pre-process", 42 | } 43 | 44 | 45 | def _add_database_options(parser: argparse.ArgumentParser) -> None: 46 | parser.add_argument( 47 | "--env-file", 48 | default=".env", 49 | help="Path to the .env file containing the database URL (default: .env).", 50 | ) 51 | parser.add_argument( 52 | "--database-url-var", 53 | default="DATABASE_URL", 54 | help="Environment variable name holding the connection string.", 55 | ) 56 | 57 | 58 | def _get_conninfo(args: argparse.Namespace) -> str: 59 | env_file = getattr(args, "env_file", None) 60 | if env_file: 61 | load_dotenv(env_file) 62 | 63 | var_name = getattr(args, "database_url_var", "DATABASE_URL") 64 | if not var_name: 65 | raise RuntimeError("Database URL environment variable name cannot be empty") 66 | 67 | conninfo = os.getenv(var_name) # type: ignore[arg-type] 68 | if not conninfo: 69 | raise RuntimeError( 70 | f"Environment variable {var_name} is not set. Ensure your .env file is loaded." 
71 | ) 72 | 73 | return conninfo 74 | 75 | 76 | def _cmd_download(args: argparse.Namespace) -> int: 77 | try: 78 | destination = download_wiktionary_dump( 79 | args.output, 80 | url=args.url, 81 | overwrite=args.overwrite, 82 | ) 83 | except RuntimeError as exc: # pragma: no cover - network failure guard 84 | args._parser.error(str(exc)) 85 | except OSError as exc: 86 | args._parser.error(str(exc)) 87 | 88 | print(f"Downloaded file to {destination}") # type: ignore[func-returns-value] 89 | return 0 90 | 91 | 92 | def _cmd_extract(args: argparse.Namespace) -> int: 93 | try: 94 | output = extract_wiktionary_dump( 95 | args.input, 96 | args.output, 97 | overwrite=args.overwrite, 98 | ) 99 | except (FileNotFoundError, IsADirectoryError) as exc: 100 | args._parser.error(str(exc)) 101 | except OSError as exc: 102 | args._parser.error(str(exc)) 103 | 104 | print(f"Extracted archive to {output}") # type: ignore[func-returns-value] 105 | return 0 106 | 107 | 108 | def _cmd_load(args: argparse.Namespace) -> int: 109 | try: 110 | conninfo = _get_conninfo(args) 111 | except RuntimeError as exc: 112 | args._parser.error(str(exc)) 113 | 114 | try: 115 | rows_copied = copy_jsonl_to_postgres( 116 | jsonl_path=args.input, 117 | conninfo=conninfo, # type: ignore[arg-type] 118 | table_name=args.table, 119 | column_name=args.column, 120 | truncate=args.truncate, 121 | ) 122 | except (FileNotFoundError, JsonlProcessingError) as exc: 123 | args._parser.error(str(exc)) 124 | except (psycopg.Error, ValueError) as exc: 125 | args._parser.error(f"Database error: {exc}") 126 | 127 | print(f"Copied {rows_copied} rows into {args.table}.{args.column}") # type: ignore[misc] 128 | return 0 129 | 130 | 131 | def _cmd_partition(args: argparse.Namespace) -> int: 132 | try: 133 | conninfo = _get_conninfo(args) 134 | except RuntimeError as exc: 135 | args._parser.error(str(exc)) 136 | 137 | try: 138 | created = partition_dictionary_by_language( 139 | conninfo, # type: ignore[arg-type] 140 | source_table=args.table, 141 | column_name=args.column, 142 | lang_field=args.lang_field, 143 | table_prefix=args.prefix, 144 | target_schema=args.target_schema, 145 | drop_existing=args.drop_existing, 146 | ) 147 | except (psycopg.Error, ValueError) as exc: 148 | args._parser.error(f"Database error: {exc}") 149 | 150 | if created: # type: ignore[truthy-bool] 151 | print("Created/updated tables:") 152 | for table in created: 153 | print(f"- {table}") 154 | else: 155 | print("No language-specific tables were created.") 156 | return 0 157 | 158 | 159 | def _cmd_pipeline(args: argparse.Namespace) -> int: 160 | try: 161 | conninfo = _get_conninfo(args) 162 | except RuntimeError as exc: 163 | args._parser.error(str(exc)) 164 | 165 | try: 166 | run_pipeline( 167 | workdir=args.workdir, 168 | conninfo=conninfo, # type: ignore[arg-type] 169 | table_name=args.table, 170 | column_name=args.column, 171 | url=args.url, 172 | truncate=args.truncate, 173 | skip_download=args.skip_download, 174 | skip_extract=args.skip_extract, 175 | skip_partition=args.skip_partition, 176 | overwrite_download=args.overwrite_download, 177 | overwrite_extract=args.overwrite_extract, 178 | lang_field=args.lang_field, 179 | table_prefix=args.prefix, 180 | target_schema=args.target_schema, 181 | drop_existing_partitions=args.drop_existing_partitions, 182 | ) 183 | except (FileNotFoundError, JsonlProcessingError) as exc: 184 | args._parser.error(str(exc)) 185 | except RuntimeError as exc: # pragma: no cover - network failure guard 186 | args._parser.error(str(exc)) 187 | 
except (psycopg.Error, ValueError) as exc: 188 | args._parser.error(f"Database error: {exc}") 189 | 190 | print("Pipeline completed successfully.") 191 | return 0 192 | 193 | 194 | def _cmd_filter(args: argparse.Namespace) -> int: 195 | try: 196 | conninfo = _get_conninfo(args) 197 | except RuntimeError as exc: 198 | args._parser.error(str(exc)) 199 | 200 | try: 201 | created = filter_languages( 202 | conninfo, # type: ignore[arg-type] 203 | source_table=args.table, 204 | column_name=args.column, 205 | languages=args.languages, 206 | lang_field=args.lang_field, 207 | table_prefix=args.table_prefix, 208 | target_schema=args.target_schema, 209 | drop_existing=args.drop_existing, 210 | ) 211 | except ValueError as exc: 212 | args._parser.error(str(exc)) 213 | except psycopg.Error as exc: 214 | args._parser.error(f"Database error: {exc}") 215 | 216 | if created: # type: ignore[truthy-bool] 217 | print("Created/updated tables:") 218 | for table in created: 219 | print(f"- {table}") 220 | else: 221 | print("No tables were created.") 222 | return 0 223 | 224 | 225 | def _cmd_db_clean(args: argparse.Namespace) -> int: 226 | try: 227 | _ = _get_conninfo(args) 228 | except RuntimeError as exc: 229 | args._parser.error(str(exc)) 230 | 231 | db_cleaner.clean_dictionary_data( 232 | table_name=args.table, 233 | fetch_batch_size=args.fetch_batch_size, 234 | delete_batch_size=args.delete_batch_size, 235 | progress_every_rows=args.progress_every_rows, 236 | progress_every_seconds=args.progress_every_seconds, 237 | ) 238 | return 0 239 | 240 | 241 | def _cmd_db_commonness(args: argparse.Namespace) -> int: 242 | try: 243 | _ = _get_conninfo(args) 244 | except RuntimeError as exc: 245 | args._parser.error(str(exc)) 246 | 247 | db_commonness.enrich_common_score( 248 | table_name=args.table, 249 | fetch_batch_size=args.fetch_batch_size, 250 | update_batch_size=args.update_batch_size, 251 | progress_every_rows=args.progress_every_rows, 252 | progress_every_seconds=args.progress_every_seconds, 253 | recompute_existing=args.recompute_existing, 254 | ) 255 | return 0 256 | 257 | 258 | def _cmd_llm_define(args: argparse.Namespace) -> int: 259 | try: 260 | _ = _get_conninfo(args) 261 | except RuntimeError as exc: 262 | args._parser.error(str(exc)) 263 | 264 | llm_define_enricher.enrich_definitions( 265 | table_name=args.table, 266 | source_column=args.source_column, 267 | target_column=args.target_column, 268 | fetch_batch_size=args.fetch_batch_size, 269 | llm_batch_size=args.llm_batch_size, 270 | max_workers=args.max_workers, 271 | max_retries=args.max_retries, 272 | initial_backoff_seconds=args.initial_backoff_seconds, 273 | max_backoff_seconds=args.max_backoff_seconds, 274 | progress_every_rows=args.progress_every_rows, 275 | progress_every_seconds=args.progress_every_seconds, 276 | recompute_existing=args.recompute_existing, 277 | ) 278 | return 0 279 | 280 | 281 | def _cmd_pre_process(args: argparse.Namespace) -> int: 282 | try: 283 | _ = _get_conninfo(args) 284 | except RuntimeError as exc: 285 | args._parser.error(str(exc)) 286 | 287 | wiktionary_pre_process.preprocess_entries( 288 | table_name=args.table, 289 | source_column=args.source_column, 290 | target_column=args.target_column, 291 | fetch_batch_size=args.fetch_batch_size, 292 | update_batch_size=args.update_batch_size, 293 | progress_every_rows=args.progress_every_rows, 294 | progress_every_seconds=args.progress_every_seconds, 295 | recompute_existing=args.recompute_existing, 296 | use_toon=args.toon, 297 | ) 298 | return 0 299 | 300 | 301 | def 
_build_parser() -> argparse.ArgumentParser:
302 |     parser = argparse.ArgumentParser(
303 |         description="Utilities for downloading, extracting, and loading Wiktionary dumps.",
304 |     )
305 |     subparsers = parser.add_subparsers(dest="command")
306 | 
307 |     download_parser = subparsers.add_parser(
308 |         "download",
309 |         help="Download the raw Wiktionary dump (.jsonl.gz).",
310 |     )
311 |     download_parser.add_argument(
312 |         "--url",
313 |         default=DEFAULT_WIKTIONARY_URL,
314 |         help="Source URL for the Wiktionary dump (default: official raw dataset).",
315 |     )
316 |     download_parser.add_argument(
317 |         "--output",
318 |         type=Path,
319 |         default=Path("data/raw-wiktextract-data.jsonl.gz"),
320 |         help="Where to store the downloaded archive (default: data/raw-wiktextract-data.jsonl.gz).",
321 |     )
322 |     download_parser.add_argument(
323 |         "--overwrite",
324 |         action="store_true",
325 |         help="Overwrite the existing archive if it already exists.",
326 |     )
327 |     download_parser.set_defaults(func=_cmd_download, _parser=download_parser)
328 | 
329 |     extract_parser = subparsers.add_parser(
330 |         "extract",
331 |         help="Extract the downloaded .jsonl.gz archive to a plain JSONL file.",
332 |     )
333 |     extract_parser.add_argument(
334 |         "--input",
335 |         type=Path,
336 |         default=Path("data/raw-wiktextract-data.jsonl.gz"),
337 |         help="Path to the .jsonl.gz archive (default: data/raw-wiktextract-data.jsonl.gz).",
338 |     )
339 |     extract_parser.add_argument(
340 |         "--output",
341 |         type=Path,
342 |         default=Path("data/raw-wiktextract-data.jsonl"),
343 |         help="Where to write the decompressed JSONL file (default: data/raw-wiktextract-data.jsonl).",
344 |     )
345 |     extract_parser.add_argument(
346 |         "--overwrite",
347 |         action="store_true",
348 |         help="Overwrite the extracted JSONL if it already exists.",
349 |     )
350 |     extract_parser.set_defaults(func=_cmd_extract, _parser=extract_parser)
351 | 
352 |     load_parser = subparsers.add_parser(
353 |         "load",
354 |         help="Load a JSONL file into PostgreSQL using COPY.",
355 |     )
356 |     load_parser.add_argument("input", type=Path, help="Path to the JSONL file to load.")
357 |     load_parser.add_argument(
358 |         "--table",
359 |         default="dictionary_all",
360 |         help="Target table name (default: dictionary_all).",
361 |     )
362 |     load_parser.add_argument(
363 |         "--column",
364 |         default="data",
365 |         help="Target JSON/JSONB column name (default: data).",
366 |     )
367 |     load_parser.add_argument(
368 |         "--truncate",
369 |         action="store_true",
370 |         help="Truncate the destination table before inserting new rows.",
371 |     )
372 |     _add_database_options(load_parser)
373 |     load_parser.set_defaults(func=_cmd_load, _parser=load_parser)
374 | 
375 |     partition_parser = subparsers.add_parser(
376 |         "partition",
377 |         help="Split the main dictionary table into per-language tables.",
378 |     )
379 |     partition_parser.add_argument(
380 |         "--table",
381 |         default="dictionary_all",
382 |         help="Source table containing the JSONB data (default: dictionary_all).",
383 |     )
384 |     partition_parser.add_argument(
385 |         "--column",
386 |         default="data",
387 |         help="JSONB column to inspect for language codes (default: data).",
388 |     )
389 |     partition_parser.add_argument(
390 |         "--lang-field",
391 |         default="lang_code",
392 |         help="JSON key inside each entry that stores the language code (default: lang_code).",
393 |     )
394 |     partition_parser.add_argument(
395 |         "--prefix",
396 |         default="dictionary_lang",
397 |         help="Prefix for generated tables (default: dictionary_lang).",
398 |     )
399 |     partition_parser.add_argument(
400 |         "--target-schema",
401 |         help="Optional schema to place the generated tables in (default: current search_path).",
402 |     )
403 |     partition_parser.add_argument(
404 |         "--drop-existing",
405 |         action="store_true",
406 |         help="Drop and recreate each language table before inserting rows.",
407 |     )
408 |     _add_database_options(partition_parser)
409 |     partition_parser.set_defaults(func=_cmd_partition, _parser=partition_parser)
410 | 
411 |     pipeline_parser = subparsers.add_parser(
412 |         "pipeline",
413 |         help="Run the full download → extract → load → partition workflow.",
414 |     )
415 |     pipeline_parser.add_argument(
416 |         "--workdir",
417 |         type=Path,
418 |         default=Path("data"),
419 |         help="Working directory for downloaded/extracted files (default: data).",
420 |     )
421 |     pipeline_parser.add_argument(
422 |         "--url",
423 |         default=DEFAULT_WIKTIONARY_URL,
424 |         help="Source URL for the Wiktionary dump (default: official raw dataset).",
425 |     )
426 |     pipeline_parser.add_argument(
427 |         "--table",
428 |         default="dictionary_all",
429 |         help="Destination table for the raw entries (default: dictionary_all).",
430 |     )
431 |     pipeline_parser.add_argument(
432 |         "--column",
433 |         default="data",
434 |         help="Destination JSONB column name (default: data).",
435 |     )
436 |     pipeline_parser.add_argument(
437 |         "--truncate",
438 |         action="store_true",
439 |         help="Truncate the destination table before inserting new rows.",
440 |     )
441 |     pipeline_parser.add_argument(
442 |         "--skip-download",
443 |         action="store_true",
444 |         help="Skip downloading if the archive is already present.",
445 |     )
446 |     pipeline_parser.add_argument(
447 |         "--skip-extract",
448 |         action="store_true",
449 |         help="Skip extraction if the JSONL file already exists.",
450 |     )
451 |     pipeline_parser.add_argument(
452 |         "--skip-partition",
453 |         action="store_true",
454 |         help="Skip creating per-language tables after loading.",
455 |     )
456 |     pipeline_parser.add_argument(
457 |         "--overwrite-download",
458 |         action="store_true",
459 |         help="Force re-download even if the archive already exists.",
460 |     )
461 |     pipeline_parser.add_argument(
462 |         "--overwrite-extract",
463 |         action="store_true",
464 |         help="Force re-extraction even if the JSONL already exists.",
465 |     )
466 |     pipeline_parser.add_argument(
467 |         "--lang-field",
468 |         default="lang_code",
469 |         help="JSON key inside each entry that stores the language code (default: lang_code).",
470 |     )
471 |     pipeline_parser.add_argument(
472 |         "--prefix",
473 |         default="dictionary_lang",
474 |         help="Prefix for generated language tables (default: dictionary_lang).",
475 |     )
476 |     pipeline_parser.add_argument(
477 |         "--target-schema",
478 |         help="Optional schema to place generated tables in (default: current search_path).",
479 |     )
480 |     pipeline_parser.add_argument(
481 |         "--drop-existing-partitions",
482 |         action="store_true",
483 |         help="Drop existing language tables before rebuilding them.",
484 |     )
485 |     _add_database_options(pipeline_parser)
486 |     pipeline_parser.set_defaults(func=_cmd_pipeline, _parser=pipeline_parser)
487 | 
488 |     filter_parser = subparsers.add_parser(
489 |         "filter",
490 |         help="Filter existing dictionary entries into language-specific tables.",
491 |     )
492 |     filter_parser.add_argument(
493 |         "languages",
494 |         nargs="+",
495 |         help="Language codes to materialize (e.g. en zh fr, or 'all').",
496 |     )
497 |     filter_parser.add_argument(
498 |         "--table",
499 |         default="dictionary_all",
500 |         help="Source table containing the raw entries (default: dictionary_all).",
501 |     )
502 |     filter_parser.add_argument(
503 |         "--column",
504 |         default="data",
505 |         help="JSONB column storing the dictionary payloads (default: data).",
506 |     )
507 |     filter_parser.add_argument(
508 |         "--lang-field",
509 |         default="lang_code",
510 |         help="JSON key containing the language code (default: lang_code).",
511 |     )
512 |     filter_parser.add_argument(
513 |         "--table-prefix",
514 |         default="dictionary_lang",
515 |         help="Base name for materialized tables; language code is appended (default: dictionary_lang).",
516 |     )
517 |     filter_parser.add_argument(
518 |         "--target-schema",
519 |         help="Optional schema for the materialized tables (default: current search_path).",
520 |     )
521 |     filter_parser.add_argument(
522 |         "--drop-existing",
523 |         action="store_true",
524 |         help="Drop existing destination tables before inserting rows.",
525 |     )
526 |     _add_database_options(filter_parser)
527 |     filter_parser.set_defaults(func=_cmd_filter, _parser=filter_parser)
528 | 
529 |     pre_process_parser = subparsers.add_parser(
530 |         "pre-process",
531 |         help="Trim Wiktionary entries to the subset needed by downstream workflows.",
532 |     )
533 |     pre_process_parser.add_argument(
534 |         "--table",
535 |         default="dictionary_all",
536 |         help="Source table containing raw Wiktionary entries (default: %(default)s).",
537 |     )
538 |     pre_process_parser.add_argument(
539 |         "--source-column",
540 |         default="data",
541 |         help="Column storing the original Wiktionary JSON (default: %(default)s).",
542 |     )
543 |     pre_process_parser.add_argument(
544 |         "--target-column",
545 |         default="process",
546 |         help="Column to store the normalized JSON (default: %(default)s).",
547 |     )
548 |     pre_process_parser.add_argument(
549 |         "--fetch-batch-size",
550 |         type=int,
551 |         default=wiktionary_pre_process.FETCH_BATCH_SIZE,
552 |         help="Rows fetched per streaming batch (default: %(default)s).",
553 |     )
554 |     pre_process_parser.add_argument(
555 |         "--update-batch-size",
556 |         type=int,
557 |         default=wiktionary_pre_process.UPDATE_BATCH_SIZE,
558 |         help="Rows updated per write batch (default: %(default)s).",
559 |     )
560 |     pre_process_parser.add_argument(
561 |         "--progress-every-rows",
562 |         type=int,
563 |         default=wiktionary_pre_process.PROGRESS_EVERY_ROWS,
564 |         help="Emit progress after this many processed rows (default: %(default)s).",
565 |     )
566 |     pre_process_parser.add_argument(
567 |         "--progress-every-seconds",
568 |         type=float,
569 |         default=wiktionary_pre_process.PROGRESS_EVERY_SECONDS,
570 |         help="Emit progress at least this often in seconds (default: %(default)s).",
571 |     )
572 |     pre_process_parser.add_argument(
573 |         "--recompute-existing",
574 |         action="store_true",
575 |         help="Regenerate payloads even if the target column is already populated.",
576 |     )
577 |     pre_process_parser.add_argument(
578 |         "--toon",
579 |         action="store_true",
580 |         help="Convert processed payloads to TOON format (reduces token usage for LLMs).",
581 |     )
582 |     _add_database_options(pre_process_parser)
583 |     pre_process_parser.set_defaults(func=_cmd_pre_process, _parser=pre_process_parser)
584 | 
585 |     db_clean_parser = subparsers.add_parser(
586 |         "db-clean",
587 |         help="Remove low-quality entries from a dictionary table.",
588 |     )
589 |     db_clean_parser.add_argument(
590 |         "--table",
591 |         default=DEFAULT_DICTIONARY_TABLE,
592 |         help="Source table containing JSONB entries (default: %(default)s).",
593 |     )
594 |     db_clean_parser.add_argument(
595 |         "--fetch-batch-size",
596 |         type=int,
597 |         default=db_cleaner.FETCH_BATCH_SIZE,
598 |         help="Number of rows to fetch per batch (default: %(default)s).",
599 |     )
600 |     db_clean_parser.add_argument(
601 |         "--delete-batch-size",
602 |         type=int,
603 |         default=db_cleaner.DELETE_BATCH_SIZE,
604 |         help="Number of rows to delete per batch (default: %(default)s).",
605 |     )
606 |     db_clean_parser.add_argument(
607 |         "--progress-every-rows",
608 |         type=int,
609 |         default=db_cleaner.PROGRESS_EVERY_ROWS,
610 |         help="Emit progress after this many processed rows (default: %(default)s).",
611 |     )
612 |     db_clean_parser.add_argument(
613 |         "--progress-every-seconds",
614 |         type=float,
615 |         default=db_cleaner.PROGRESS_EVERY_SECONDS,
616 |         help="Emit progress at least this often in seconds (default: %(default)s).",
617 |     )
618 |     _add_database_options(db_clean_parser)
619 |     db_clean_parser.set_defaults(func=_cmd_db_clean, _parser=db_clean_parser)
620 | 
621 |     db_common_parser = subparsers.add_parser(
622 |         "db-commonness",
623 |         help="Populate the common_score column using word frequency data.",
624 |     )
625 |     db_common_parser.add_argument(
626 |         "--table",
627 |         default=DEFAULT_DICTIONARY_TABLE,
628 |         help="Target dictionary table (default: %(default)s).",
629 |     )
630 |     db_common_parser.add_argument(
631 |         "--fetch-batch-size",
632 |         type=int,
633 |         default=db_commonness.FETCH_BATCH_SIZE,
634 |         help="Number of rows to fetch per batch (default: %(default)s).",
635 |     )
636 |     db_common_parser.add_argument(
637 |         "--update-batch-size",
638 |         type=int,
639 |         default=db_commonness.UPDATE_BATCH_SIZE,
640 |         help="Number of rows to update per batch (default: %(default)s).",
641 |     )
642 |     db_common_parser.add_argument(
643 |         "--progress-every-rows",
644 |         type=int,
645 |         default=db_commonness.PROGRESS_EVERY_ROWS,
646 |         help="Emit progress after this many processed rows (default: %(default)s).",
647 |     )
648 |     db_common_parser.add_argument(
649 |         "--progress-every-seconds",
650 |         type=float,
651 |         default=db_commonness.PROGRESS_EVERY_SECONDS,
652 |         help="Emit progress at least this often in seconds (default: %(default)s).",
653 |     )
654 |     db_common_parser.add_argument(
655 |         "--recompute-existing",
656 |         action="store_true",
657 |         help="Recalculate scores even if a value already exists.",
658 |     )
659 |     _add_database_options(db_common_parser)
660 |     db_common_parser.set_defaults(func=_cmd_db_commonness, _parser=db_common_parser)
661 | 
662 |     llm_define_parser = subparsers.add_parser(
663 |         "llm-define",
664 |         help="Generate enriched dictionary entries via the LLM define workflow.",
665 |     )
666 |     llm_define_parser.add_argument(
667 |         "--table",
668 |         default=llm_define_enricher.DEFAULT_TABLE_NAME,
669 |         help="Source table containing JSONB entries (default: %(default)s).",
670 |     )
671 |     llm_define_parser.add_argument(
672 |         "--source-column",
673 |         default=llm_define_enricher.DEFAULT_SOURCE_COLUMN,
674 |         help="Column containing original Wiktionary payloads (default: %(default)s).",
675 |     )
676 |     llm_define_parser.add_argument(
677 |         "--target-column",
678 |         default=llm_define_enricher.DEFAULT_TARGET_COLUMN,
679 |         help="Column to store LLM-enriched JSONB (default: %(default)s).",
680 |     )
681 |     llm_define_parser.add_argument(
682 |         "--fetch-batch-size",
683 |         type=int,
684 |         default=llm_define_enricher.DEFAULT_FETCH_BATCH_SIZE,
685 |         help="Rows fetched from PostgreSQL per server-side batch (default: %(default)s).",
686 |     )
687 |     llm_define_parser.add_argument(
688 |         "--llm-batch-size",
689 |         type=int,
690 |         default=llm_define_enricher.DEFAULT_LLM_BATCH_SIZE,
691 |         help="Number of requests dispatched to the LLM at once (default: %(default)s).",
692 |     )
693 |     llm_define_parser.add_argument(
694 |         "--max-workers",
695 |         type=int,
696 |         help="Maximum concurrent worker threads for LLM calls (default: llm-batch-size).",
697 |     )
698 |     llm_define_parser.add_argument(
699 |         "--max-retries",
700 |         type=int,
701 |         default=llm_define_enricher.DEFAULT_MAX_RETRIES,
702 |         help="Attempts per row before giving up (default: %(default)s).",
703 |     )
704 |     llm_define_parser.add_argument(
705 |         "--initial-backoff-seconds",
706 |         type=float,
707 |         default=llm_define_enricher.DEFAULT_INITIAL_BACKOFF_SECONDS,
708 |         help="Initial retry backoff in seconds (default: %(default)s).",
709 |     )
710 |     llm_define_parser.add_argument(
711 |         "--max-backoff-seconds",
712 |         type=float,
713 |         default=llm_define_enricher.DEFAULT_MAX_BACKOFF_SECONDS,
714 |         help="Maximum retry backoff in seconds (default: %(default)s).",
715 |     )
716 |     llm_define_parser.add_argument(
717 |         "--progress-every-rows",
718 |         type=int,
719 |         default=llm_define_enricher.DEFAULT_PROGRESS_EVERY_ROWS,
720 |         help="Emit progress after processing this many rows (default: %(default)s).",
721 |     )
722 |     llm_define_parser.add_argument(
723 |         "--progress-every-seconds",
724 |         type=float,
725 |         default=llm_define_enricher.DEFAULT_PROGRESS_EVERY_SECONDS,
726 |         help="Emit progress at least this often in seconds (default: %(default)s).",
727 |     )
728 |     llm_define_parser.add_argument(
729 |         "--recompute-existing",
730 |         action="store_true",
731 |         help="Recreate target-column payloads even if already populated.",
732 |     )
733 |     _add_database_options(llm_define_parser)
734 |     llm_define_parser.set_defaults(func=_cmd_llm_define, _parser=llm_define_parser)
735 | 
736 |     return parser
737 | 
738 | 
739 | def main(argv: list[str] | None = None) -> int:
740 |     parser = _build_parser()
741 | 
742 |     if argv is None:
743 |         argv_list = sys.argv[1:]
744 |     else:
745 |         argv_list = list(argv)
746 | 
747 |     if argv_list and not argv_list[0].startswith("-") and argv_list[0] not in COMMAND_NAMES:
748 |         argv_list = ["load", *argv_list]
749 | 
750 |     args = parser.parse_args(argv_list)
751 | 
752 |     func = getattr(args, "func", None)
753 |     if func is None:
754 |         parser.print_help()
755 |         return 1
756 | 
757 |     return func(args)
758 | 
759 | 
760 | if __name__ == "__main__": # pragma: no cover - CLI entry guard
761 |     sys.exit(main())
762 | 
763 | 
764 | __all__ = ["main"]
765 | 
--------------------------------------------------------------------------------
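Because `main()` accepts an explicit argument list, the commands registered above are easy to drive from Python as well as from the shell. A minimal sketch, assuming this parser lives in `open_dictionary/cli.py` (the module that `open_dictionary/__init__.py` wraps for the `open-dictionary` console script) and that database connectivity is configured through the options added by `_add_database_options` (not shown here):

from open_dictionary.cli import main

# Full download → extract → load → partition run, truncating the destination table first.
exit_code = main(["pipeline", "--truncate"])

# A leading argument that is not a registered command name is treated as an implicit
# "load", so this is equivalent to main(["load", "data/raw-wiktextract-data.jsonl"]).
exit_code = main(["data/raw-wiktextract-data.jsonl"])

# Materialize per-language tables for selected language codes (or pass "all").
exit_code = main(["filter", "en", "zh", "fr"])

Each call returns the subcommand's integer exit code, which is what the console-script wrapper hands to `SystemExit`.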