├── anki_packager
│   ├── dict
│   │   ├── __init__.py
│   │   ├── longman.py
│   │   ├── eudic.py
│   │   ├── ecdict.py
│   │   ├── youdao.py
│   │   └── stardict.py
│   ├── __main__.py
│   ├── __init__.py
│   ├── logger.py
│   ├── prompt.py
│   ├── utils.py
│   ├── ai.py
│   ├── packager
│   │   └── deck.py
│   └── cli.py
├── config
│   ├── vocabulary.txt
│   └── config.toml
├── test.apkg
├── dicts
│   ├── 有道词语辨析.mdx
│   └── 单词释义比例词典-带词性.mdx
├── images
│   ├── apkg.png
│   └── 卡片预览.png
├── publish.sh
├── MANIFEST.in
├── Dockerfile
├── LICENSE
├── setup.py
├── Makefile
├── requirements.txt
├── README.md
└── .gitignore
/anki_packager/dict/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/anki_packager/dict/longman.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/config/vocabulary.txt:
--------------------------------------------------------------------------------
1 | reform
2 | open
--------------------------------------------------------------------------------
/test.apkg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yaoyhu/anki_packager/HEAD/test.apkg
--------------------------------------------------------------------------------
/dicts/有道词语辨析.mdx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yaoyhu/anki_packager/HEAD/dicts/有道词语辨析.mdx
--------------------------------------------------------------------------------
/images/apkg.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yaoyhu/anki_packager/HEAD/images/apkg.png
--------------------------------------------------------------------------------
/images/卡片预览.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yaoyhu/anki_packager/HEAD/images/卡片预览.png
--------------------------------------------------------------------------------
/dicts/单词释义比例词典-带词性.mdx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yaoyhu/anki_packager/HEAD/dicts/单词释义比例词典-带词性.mdx
--------------------------------------------------------------------------------
/anki_packager/__main__.py:
--------------------------------------------------------------------------------
1 | import asyncio
2 | from .cli import main
3 |
4 |
5 | def run():
6 |     """Synchronous wrapper so the console-script entry point can drive the async main()."""
7 |     asyncio.run(main())
8 |
9 |
10 | if __name__ == "__main__":
11 |     run()
12 |
--------------------------------------------------------------------------------
/anki_packager/__init__.py:
--------------------------------------------------------------------------------
1 | __version__ = "0.9.5"
2 |
3 | from .utils import initialize_config
4 |
5 | try:
6 | initialize_config()
7 | except Exception as e:
8 | import sys
9 |
10 | print(f"Warning: Unable to initialize configuration: {e}", file=sys.stderr)
11 |
--------------------------------------------------------------------------------
/publish.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Clean up previous builds
4 | rm -rf build/ dist/ *.egg-info/
5 |
6 | # Build source and wheel distributions
7 | python -m build
8 |
9 | # Upload to PyPI
10 | # Uncomment when ready to publish:
11 | # twine upload dist/*
12 |
13 | echo -e "Build completed. To publish to PyPI, run: twine upload dist/*"
14 |
--------------------------------------------------------------------------------
/config/config.toml:
--------------------------------------------------------------------------------
1 | PROXY = ""
2 | EUDIC_TOKEN = ""
3 | EUDIC_ID = "0"
4 | DECK_NAME = "anki_packager"
5 |
6 | [[MODEL_PARAM]]
7 | model = "gemini/gemini-2.5-flash"
8 | api_key = "GEMINI_API_KEY"
9 | rpm = 10 # requests per minute
10 |
11 | # [[MODEL_PARAM]]
12 | # model = "openai/gpt-4o"
13 | # api_key = "OPENAI_API_KEY"
14 | # api_base = "YOUR_API_BASE"
15 | # rpm = 200
16 |
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include LICENSE
2 | include README.md
3 | include requirements.txt
4 | recursive-include anki_packager/packager *.py
5 | global-exclude *.py[cod] __pycache__ *.so
6 | global-exclude */__pycache__/*
7 | exclude anki_packager.log
8 | exclude *.pyc
9 | exclude __pycache__
10 | exclude *.apkg
11 | # exclude all dictionary data files
12 | exclude anki_packager/dicts/*.mdx
13 | exclude anki_packager/dicts/*.7z
14 | exclude anki_packager/dicts/*.db
15 | exclude anki_packager/dicts/*.csv
16 |
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | # cli.py uses the stdlib tomllib, so Python >= 3.11 is required
2 | # If you need a proxy/mirror: FROM hub.icert.top/python:3.11-slim
3 |
4 | FROM python:3.11-slim
5 |
6 | RUN apt-get update && apt-get install -y --no-install-recommends \
7 | gcc \
8 | build-essential \
9 | libffi-dev \
10 | && rm -rf /var/lib/apt/lists/*
11 |
12 | WORKDIR /app
13 |
14 | COPY requirements.txt ./
15 |
16 | RUN pip install --no-cache-dir -r requirements.txt
17 |
18 | COPY . .
19 |
20 | RUN mkdir -p config dicts
21 |
22 | ENV PYTHONUNBUFFERED=1
23 |
24 | ENTRYPOINT ["python", "-m", "anki_packager", "--disable_ai"]
25 | CMD []
26 |
--------------------------------------------------------------------------------
/anki_packager/logger.py:
--------------------------------------------------------------------------------
1 | import logging
2 |
3 | # ANSI escape codes for bold blue
4 | BOLD_BLUE = "\033[1;34m"
5 | RESET = "\033[0m"
6 |
7 | logging.basicConfig(
8 | level=logging.INFO,
9 | format=f"{BOLD_BLUE}[%(filename)s:%(lineno)d:%(funcName)s]{RESET} %(message)s",
10 | handlers=[logging.FileHandler("anki_packager.log"), logging.StreamHandler()],
11 | )
12 |
13 | litellm_logger = logging.getLogger("LiteLLM")
14 | litellm_logger.setLevel(logging.WARNING)
15 | litellm_logger = logging.getLogger("LiteLLM Router")
16 | litellm_logger.setLevel(logging.WARNING)
17 |
18 | logging.getLogger("httpx").setLevel(logging.WARNING)
19 | logger = logging.getLogger(__name__)
20 |
--------------------------------------------------------------------------------
/anki_packager/prompt.py:
--------------------------------------------------------------------------------
1 | PROMPT = """
2 | 你是一名中英文双语教育专家,拥有帮助将中文视为母语的用户理解和记忆英语单词的专长,请根据用户提供的英语单词,用中文且仅用 json 格式回复:
3 | {
4 | "word": "用户提供的单词",
5 | "origin": {
6 | "etymology": "<详细介绍单词的造词来源和发展历史,以及在欧美文化中的内涵>",
7 | "mnemonic": {
8 | "associative": "联想记忆:<提供一个联想记忆,帮助用户记住单词的含义>",
9 | "homophone": "谐音记忆:<提供一个谐音记忆,帮助用户记住单词的拼写>"
10 | }
11 | },
12 | "tenses": "<按照以下格式(如果存在)列出词形变化:'v. 动词原形, 过去式, 过去分词, 现在分词; adj. 形容词形式; n. 名词形式; adv. 副词形式'...>",
13 | "story": {
14 | "english": "<用英文撰写一个有画面感的场景故事。要求:1. 必须包含目标单词;2. 使用简单易懂的词汇;3. 长度在80-100个单词之间;4. 突出目标单词的使用场景>",
15 | "chinese": "<故事的中文翻译,保持与英文版本一致的语气和画面感>"
16 | }
17 | }
18 |
19 | 注意事项:
20 | 1. 在 tenses 中,只填写客观存在的词形,不要随意捏造或添加不存在的词形。并且只包含英文,不要加入中文注解。
21 | """
22 |
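23 | # For reference, a well-formed reply for "abandon" should have this shape
24 | # (the values here are illustrative placeholders, not real model output):
25 | #
26 | # {
27 | #   "word": "abandon",
28 | #   "origin": {
29 | #     "etymology": "...",
30 | #     "mnemonic": {"associative": "联想记忆:...", "homophone": "谐音记忆:..."}
31 | #   },
32 | #   "tenses": "v. abandon, abandoned, abandoned, abandoning; n. abandonment",
33 | #   "story": {"english": "...", "chinese": "..."}
34 | # }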
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2025 Yaoyao Hu
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup, find_packages
2 | from anki_packager import __version__
3 |
4 | setup(
5 | name="apkger",
6 | version=__version__,
7 | author="Yaoyao Hu",
8 | author_email="shady030314@gmail.com",
9 | description="自动化 Anki 英语单词卡片牌组生成器",
10 | long_description=open("README.md", encoding="utf-8").read(),
11 | long_description_content_type="text/markdown",
12 | url="https://github.com/yaoyhu/anki_packager",
13 | packages=find_packages(
14 | exclude=["*.pyc", "*.pyo", "__pycache__", "*.__pycache__*"]
15 | ),
16 | include_package_data=True,
17 | classifiers=[
18 | "Programming Language :: Python :: 3",
19 | "Programming Language :: Python :: 3.9",
20 | "License :: OSI Approved :: MIT License",
21 | "Operating System :: OS Independent",
22 | "Topic :: Education",
23 | "Topic :: Text Processing :: Linguistic",
24 | "Development Status :: 4 - Beta",
25 | ],
26 | python_requires=">=3.9",
27 | install_requires=open("requirements.txt").read().splitlines(),
28 | entry_points={
29 | "console_scripts": [
30 | "apkger=anki_packager.__main__:main",
31 | ],
32 | },
33 | )
34 |
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | # Detect operating system
2 | ifeq ($(OS),Windows_NT)
3 | SHELL := powershell.exe
4 | .SHELLFLAGS := -NoProfile -Command
5 | RM = Remove-Item -Force -Recurse
6 | FOLDER_SET = $$env:FOLDER="$$(Get-Location)"
7 | else
8 | SHELL := /bin/bash
9 | RM = rm -rf
10 | FOLDER_SET = export FOLDER=$$(pwd)
11 | endif
12 |
13 | IMAGE_NAME = apkger
14 | CONTAINER_NAME = apkger
15 | VOLUME_NAME = apkger-dicts
16 |
17 | .PHONY: build run shell clean help
18 |
19 | # Build Docker image and create persistent volume
20 | build:
21 | docker build -t $(IMAGE_NAME) .
22 | docker volume create $(VOLUME_NAME)
23 |
24 | run:
25 | docker run --rm \
26 | --name $(CONTAINER_NAME) \
27 | -v $(VOLUME_NAME):/app/dicts \
28 | $(IMAGE_NAME)
29 |
30 | # Enter shell in container with volume mounted
31 | shell:
32 | docker run -it --rm \
33 | --name $(CONTAINER_NAME) \
34 | -v $(VOLUME_NAME):/app/dicts \
35 | -v $(shell pwd)/config:/app/config \
36 | -v $(shell pwd):/app \
37 | --entrypoint /bin/bash \
38 | $(IMAGE_NAME)
39 |
40 | clean:
41 | -docker rmi $(IMAGE_NAME)
42 | -docker volume rm $(VOLUME_NAME)
43 |
44 | help:
45 | @echo "Available targets:"
46 | @echo " build - Build Docker image and create persistent volume"
47 | @echo " run - Run container with mounted current directory"
48 | @echo " shell - Enter shell in container with volume mounted"
49 | @echo " clean - Remove container and image"
50 | @echo " help - Show this help message"
51 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | aiohappyeyeballs==2.6.1
2 | aiohttp==3.12.15
3 | aiosignal==1.4.0
4 | annotated-types==0.7.0
5 | anyio==4.11.0
6 | attrs==25.3.0
7 | beautifulsoup4==4.7.1
8 | brotli==1.1.0
9 | cached-property==2.0.1
10 | certifi==2025.8.3
11 | charset-normalizer==3.4.3
12 | chevron==0.14.0
13 | click==8.1.8
14 | colorama==0.4.6
15 | distro==1.9.0
16 | fastuuid==0.12.0
17 | filelock==3.19.1
18 | frozendict==2.4.6
19 | frozenlist==1.7.0
20 | fsspec==2025.9.0
21 | genanki==0.13.1
22 | gtts==2.5.3
23 | h11==0.16.0
24 | httpcore==1.0.9
25 | httpx==0.28.1
26 | huggingface-hub==0.35.1
27 | idna==3.10
28 | importlib-metadata==8.7.0
29 | inflate64==1.0.3
30 | jinja2==3.1.6
31 | jiter==0.11.0
32 | jsonschema==4.25.1
33 | jsonschema-specifications==2025.9.1
34 | litellm==1.77.3
35 | madoka==0.7.1
36 | markupsafe==3.0.2
37 | mdict-utils==1.3.14
38 | multidict==6.6.4
39 | multivolumefile==0.2.3
40 | openai==1.109.1
41 | packaging==25.0
42 | pondpond==1.4.1
43 | propcache==0.3.2
44 | psutil==7.1.0
45 | py7zr==0.22.0
46 | pybcj==1.0.6
47 | pycryptodomex==3.23.0
48 | pydantic==2.12.0a1
49 | pydantic-core==2.37.2
50 | pyppmd==1.1.1
51 | python-dotenv==1.1.1
52 | pyyaml==6.0.2
53 | pyzstd==0.17.0
54 | referencing==0.36.2
55 | regex==2025.9.18
56 | requests==2.32.5
57 | rpds-py==0.27.1
58 | sniffio==1.3.1
59 | soupsieve==2.8
60 | texttable==1.7.0
61 | tiktoken==0.11.0
62 | tokenizers==0.22.1
63 | tqdm==4.67.1
64 | typing-extensions==4.15.0
65 | typing-inspection==0.4.1
66 | urllib3==2.5.0
67 | xxhash==3.5.0
68 | yarl==1.20.1
69 | zipp==3.23.0
70 | socksio==1.0.0
71 |
--------------------------------------------------------------------------------
/anki_packager/dict/eudic.py:
--------------------------------------------------------------------------------
1 | from anki_packager.logger import logger
2 | import aiohttp
3 |
4 | # https://my.eudic.net/OpenAPI/doc_api_study#-studylistapi-getcategory
5 |
6 |
7 | class EUDIC:
8 | def __init__(self, token: str, id: str):
9 | self.id = id
10 | self.token = token
11 | self.header = {
12 | "Authorization": self.token,
13 | "Content-Type": "application/json",
14 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3",
15 | }
16 | self.studylist_url = (
17 | "https://api.frdic.com/api/open/v1/studylist/category?language=en"
18 | )
19 |
20 | self.words_url = "https://api.frdic.com/api/open/v1/studylist/words/"
21 |
22 | async def get_studylist(self):
23 | async with aiohttp.request(
24 | "GET", self.studylist_url, headers=self.header
25 | ) as response:
26 | self.check_token(response.status)
27 | json = await response.json()
28 | # show list id
29 | for book in json["data"]:
30 | logger.info(f"id: {book['id']}, name: {book['name']}")
31 |
32 | return json
33 |
34 | async def get_words(self):
35 | url = self.words_url + str(self.id) + "?language=en&category_id=0"
36 | async with aiohttp.request("GET", url, headers=self.header) as response:
37 | self.check_token(response.status)
38 | json = await response.json()
39 | return json
40 |
41 | def check_token(self, status_code: int):
42 | if status_code != 200:
43 | if status_code == 401:
44 | msg = "前往 https://my.eudic.net/OpenAPI/Authorization 获取 token 写入配置文件"
45 | logger.error(msg)
46 | exit(1)
47 | else:
48 | msg = "检查填写的 ID 是否正确"
49 | logger.error(msg)
50 | exit(1)
51 |
--------------------------------------------------------------------------------
/anki_packager/utils.py:
--------------------------------------------------------------------------------
1 | import os
2 | import platform
3 |
4 | from anki_packager.logger import logger
5 |
6 |
7 | def get_user_config_dir():
8 | """
9 | Returns the platform-specific user configuration directory.
10 |
11 | - Windows: %APPDATA%/anki_packager
12 | - macOS/Linux: ~/.config/anki_packager
13 | """
14 | if platform.system() == "Windows":
15 | return os.path.join(os.environ.get("APPDATA", ""), "anki_packager")
16 | else:
17 | return os.path.expanduser("~/.config/anki_packager")
18 |
19 |
20 | def initialize_config():
21 | """
22 | Make sure user config dir exists.
23 |
24 | Example:
25 | ~/.config/anki_packager/
26 | ├── config
27 | │ ├── config.toml
28 | │ ├── failed.txt
29 | │ └── vocabulary.txt
30 | └── dicts
31 | ├── 单词释义比例词典-带词性.mdx
32 | ├── 有道词语辨析.mdx
33 | ├── stardict.7z
34 | ├── stardict.csv
35 | └── stardict.db
36 | """
37 | config_dir = get_user_config_dir()
38 | os.makedirs(config_dir, exist_ok=True)
39 | config_subdir = os.path.join(config_dir, "config")
40 | os.makedirs(config_subdir, exist_ok=True)
41 | dicts_dir = os.path.join(config_dir, "dicts")
42 | os.makedirs(dicts_dir, exist_ok=True)
43 |
44 | # Default configuration in TOML format
45 | default_config = """
46 | PROXY = ""
47 | EUDIC_TOKEN = ""
48 | EUDIC_ID = "0"
49 | DECK_NAME = "anki_packager"
50 |
51 | [[MODEL_PARAM]]
52 | model = "gemini/gemini-2.5-flash"
53 | api_key = "GEMINI_API_KEY"
54 | rpm = 10 # requests per minute
55 |
56 | # [[MODEL_PARAM]]
57 | # model = "openai/gpt-4o"
58 | # api_key = "OPENAI_API_KEY"
59 | # api_base = "YOUR_API_BASE"
60 | # rpm = 200
61 |
62 | """
63 |
64 | config_path = os.path.join(config_subdir, "config.toml")
65 | if not os.path.exists(config_path):
66 | with open(config_path, "w", encoding="utf-8") as f:
67 | f.write(default_config)
68 |
69 | vocab_path = os.path.join(config_subdir, "vocabulary.txt")
70 | if not os.path.exists(vocab_path):
71 | with open(vocab_path, "w", encoding="utf-8") as f:
72 | f.write("")
73 |
74 | failed_path = os.path.join(config_subdir, "failed.txt")
75 | if not os.path.exists(failed_path):
76 | with open(failed_path, "w", encoding="utf-8") as f:
77 | f.write("reform\nopen\n")
78 |
79 | logger.info(f"配置文件位于 {config_path}")
80 |
--------------------------------------------------------------------------------
/anki_packager/ai.py:
--------------------------------------------------------------------------------
1 | from typing import Dict
2 | from litellm import Choices, Message
3 | from litellm.router import Router
4 | from litellm.files.main import ModelResponse
5 | import json
6 | from anki_packager.prompt import PROMPT
7 |
8 | from pydantic import BaseModel, Field, ValidationError
9 |
10 |
11 | class Mnemonic(BaseModel):
12 | """助记法模型"""
13 |
14 | associative: str = Field(..., description="联想记忆法")
15 | homophone: str = Field(..., description="谐音记忆法")
16 |
17 |
18 | class Origin(BaseModel):
19 | """词源和助记模型"""
20 |
21 | etymology: str = Field(..., description="词源和文化内涵")
22 | mnemonic: Mnemonic
23 |
24 |
25 | class Story(BaseModel):
26 | """场景故事模型"""
27 |
28 | english: str = Field(..., description="英文场景故事")
29 | chinese: str = Field(..., description="故事的中文翻译")
30 |
31 |
32 | # The complete, top-level data model
33 | class WordExplanation(BaseModel):
34 | """完整的单词解析数据模型"""
35 |
36 | word: str = Field(..., description="用户提供的单词")
37 | origin: Origin
38 | tenses: str = Field(..., description="单词的词形变化")
39 | story: Story
40 |
41 |
42 | class llm:
43 | def __init__(self, model_param: list):
44 | model_list = [
45 | {
46 | "model_name": "a", # 为所有模型统一使用别名 "a"
47 | "litellm_params": param,
48 | }
49 | for param in model_param
50 | ]
51 | self.router = Router(model_list)
52 |
53 | async def explain(self, word: str) -> Dict:
54 | try:
55 | response = await self.router.acompletion(
56 | model="a",
57 | messages=[
58 | {"role": "system", "content": PROMPT},
59 | {"role": "user", "content": word},
60 | ],
61 | temperature=0.3,
62 | max_tokens=500,
63 | response_format={"type": "json_object"},
64 | )
65 | if isinstance(response, ModelResponse):
66 | if isinstance(response.choices, list) and response.choices:
67 | first_choice = response.choices[0]
68 | if (
69 | isinstance(first_choice, Choices)
70 | and isinstance(first_choice.message, Message)
71 | and isinstance(first_choice.message.content, str)
72 | ):
73 | result_str = first_choice.message.content
74 |
75 | if result_str.startswith("```json"):
76 | result_str = result_str.strip("```json\n").strip("```")
77 |
78 |                         # 1. Parse the string into a Python dict
79 | data = json.loads(result_str)
80 |
81 |                         # 2. Validate and parse it with the WordExplanation model
82 | validated_data = WordExplanation.model_validate(data)
83 |
84 | return validated_data.model_dump()
85 |             raise ValueError(f"Malformed or empty LLM response for '{word}'")
86 | except json.JSONDecodeError as e:
87 | raise json.JSONDecodeError(
88 | f"Failed to parse JSON for '{word}': {e}. Raw response: '{result_str[:150]}...'",
89 | result_str,
90 | e.pos,
91 | )
92 | except ValidationError as e:
93 | raise ValidationError(f"JSON structure validation failed for '{word}': {e}")
94 | except Exception as e:
95 | raise Exception(f"An unexpected error occurred for '{word}': {e}")
96 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 | # anki_packager
3 |
4 | An automated tool for building high-quality Anki flashcard decks for English vocabulary
5 |
6 | [About](#about-the-project) · [Usage](#usage) · [Roadmap](#todo) · [Acknowledgements](#thanks)
7 |
8 | ## About the Project
9 |
10 | `anki_packager` is an automated Anki flashcard generator that produces high-quality `.apkg` decks. The project aims to give English learners an efficient, intelligent memorization aid.
11 |
12 | ### Key Features
13 |
14 | - Curated multi-source dictionary integration: [ECDICT](https://github.com/skywind3000/ECDICT), the enhanced [《有道词语辨析》](https://skywind.me/blog/archives/2941), and the [单词释义比例词典](https://skywind.me/blog/archives/2938)
15 | - Intelligent learning experience:
16 |   - Automatically scrapes high-quality example sentences and common phrases from Youdao Dictionary
17 |   - Google TTS pronunciation, bilingual definitions, exam-syllabus tags, and more
18 |   - Popular AI models (API key required) generate word summaries, mnemonics, and scenario stories
19 | - Convenient imports: one-click import of Eudic (欧路词典) wordbooks, with batch processing of word lists into cards
20 | - Polished command-line experience: progress display, failure logging, and a rich set of CLI options
21 | - Runs in Docker and installs from PyPI
22 |
23 | ### Card Preview
24 |
25 | Every card packs rich, clearly structured learning material:
26 |
27 | - Front: headword, pronunciation, phonetic symbols + exam-syllabus tags (e.g. 中高考, CET4, CET6, GRE)
28 | - Back:
29 |   - Definitions: Chinese (ECDICT), tenses (AI), sense and part-of-speech distribution ([单词释义比例词典-带词性](https://mdx.mdict.org/按词典语种来分类/词频/单词释义比例词典/单词释义比例词典-带词性.mdx))
30 |   - AI-generated etymology + mnemonic aids (associative + homophone)
31 |   - Phrases + example sentences (Youdao scraper)
32 |   - Word discrimination (enhanced [《有道词语辨析》](https://pan.baidu.com/s/1gff2tdp))
33 |   - English definitions (currently from ECDICT) + an AI-generated story
34 |
35 | ## Usage
36 |
37 | ### Quick Start
38 |
39 | ```bash
40 | # Install directly with pip
41 | pip install apkger
42 | ```
43 |
44 | Before running apkger, fill in `config/config.toml` inside the user config directory (created on first run: `~/.config/anki_packager/` on Linux/macOS, `%APPDATA%\anki_packager\` on Windows).
45 |
46 | This project uses [litellm](https://github.com/BerriAI/litellm) to call all LLM services through one interface. For the details of `MODEL_PARAM`, see the [LiteLLM Providers documentation](https://docs.litellm.ai/docs/providers):
47 |
48 | ```toml
49 | PROXY = ""
50 | EUDIC_TOKEN = ""
51 | EUDIC_ID = "0"
52 | DECK_NAME = "anki_packager"
53 |
54 | [[MODEL_PARAM]]
55 | model = "gemini/gemini-2.5-flash"
56 | api_key = "GEMINI_API_KEY"
57 | rpm = 10 # requests per minute
58 |
59 | ### Example: OpenAI-compatible endpoint
60 | # [[MODEL_PARAM]]
61 | # model = "openai/gpt-4o"
62 | # api_key = "OPENAI_API_KEY"
63 | # api_base = "YOUR_API_BASE"
64 | # rpm = 200
65 | ```
66 |
67 | The configuration parameters in detail:
68 |
69 | - `MODEL_PARAM` (the sketch below shows how these entries are consumed):
70 |   - `model`: Provider Route on LiteLLM + Model ID
71 |   - `api_key`: the API key for that model
72 |   - `api_base`: (optional) only needed for OpenAI-compatible endpoints
73 |   - `rpm`: (optional) requests-per-minute limit, used to throttle API calls
74 | - `PROXY`: if you cannot reach your AI provider directly, set a proxy server address here
75 | - To use a Eudic wordbook: first obtain a TOKEN from the [official Eudic authorization page](https://my.eudic.net/OpenAPI/Authorization), then run `apkger --eudicid` to pick a wordbook ID and write it into the config file
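76 |
77 | Internally, apkger hands every `[[MODEL_PARAM]]` table to a litellm `Router` under a single shared alias; the Router load-balances across the configured models and honors each entry's `rpm`. A minimal sketch of that wiring, mirroring `anki_packager/ai.py`:
78 |
79 | ```python
80 | import tomllib
81 | from litellm.router import Router
82 |
83 | with open("config.toml", "rb") as f:
84 |     cfg = tomllib.load(f)
85 |
86 | # Every configured model shares the alias "a"; the Router picks among
87 | # them and applies each entry's rpm limit.
88 | router = Router([
89 |     {"model_name": "a", "litellm_params": param}
90 |     for param in cfg["MODEL_PARAM"]
91 | ])
92 | ```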
93 |
94 | ### Download the Dictionaries
95 |
96 | Download the dictionaries into the config directory (keep the file names exactly as shown):
97 |
98 | - Linux/macOS: `~/.config/anki_packager/dicts/`
99 | - Windows: `C:\Users\<用户名>\AppData\Roaming\anki_packager\dicts\`
100 |
101 | Dictionary data (thanks to [skywind](https://github.com/skywind3000)) download links:
102 |
103 | - [stardict.7z](https://github.com/skywind3000/ECDICT/raw/refs/heads/master/stardict.7z)
104 | - [单词释义比例词典-带词性](https://mdx.mdict.org/按词典语种来分类/词频/单词释义比例词典/单词释义比例词典-带词性.mdx)
105 | - [有道词语辨析](https://pan.baidu.com/s/1gff2tdp): **must be unpacked manually** into `config/dicts`
106 |
107 | Once the dictionaries are in place (the snippet below scripts the two direct downloads), extraction and conversion are handled by anki_packager itself.
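108 |
109 | On Linux/macOS the two direct downloads can be scripted; a sketch (the Baidu Pan archive still has to be fetched and unpacked by hand):
110 |
111 | ```bash
112 | DICTS=~/.config/anki_packager/dicts
113 | mkdir -p "$DICTS"
114 |
115 | # ECDICT database archive; apkger extracts and converts it on first run
116 | curl -L -o "$DICTS/stardict.7z" \
117 |   "https://github.com/skywind3000/ECDICT/raw/refs/heads/master/stardict.7z"
118 |
119 | # sense/POS distribution dictionary; keep the file name exactly as-is
120 | curl -L -o "$DICTS/单词释义比例词典-带词性.mdx" \
121 |   "https://mdx.mdict.org/按词典语种来分类/词频/单词释义比例词典/单词释义比例词典-带词性.mdx"
122 | ```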
123 |
124 | ### Running
125 |
126 | There is no GUI yet; the tool runs from the command line. Some reference invocations:
127 |
128 | ```bash
129 | # Show the help message
130 | apkger -h
131 |
132 | # Generate cards from the default wordbook
133 | apkger
134 |
135 | ### Disable the AI features
136 | apkger --disable_ai
137 |
138 | ### Export words from a Eudic wordbook and generate cards (requires configuration)
139 | ## First list the wordbook IDs and write one into the config file
140 | apkger --eudicid
141 | ## Then generate the cards
142 | apkger --eudic
143 | ```
144 |
145 | **Option 1: Conda environment**
146 |
147 | ```bash
148 | # Create and activate a Python 3.11 virtual environment named apkg
149 | conda create -n apkg python=3.11
150 | conda activate apkg
151 |
152 | # Install the project dependencies
153 | pip install -r requirements.txt
154 |
155 | # Show the help message
156 | python -m anki_packager -h
157 |
158 | # Export words from a Eudic wordbook and generate cards (requires configuration)
159 | python -m anki_packager --eudic
160 |
161 | # Disable the AI features
162 | python -m anki_packager --disable_ai
163 |
164 | # Generate cards from the default wordbook
165 | python -m anki_packager
166 | ```
167 |
168 | **Option 2: Docker container**
169 |
170 | If you prefer not to touch your local environment, run anki_packager in Docker, conveniently driven by the `Makefile`:
171 |
172 | ```shell
173 | # Build the Docker image and create the persistent volume
174 | make build
175 |
176 | # The first container run downloads the dictionaries (this takes a while)
177 | make run
178 |
179 | # Enter the container (configure config/config.toml on the host first!)
180 | # Run anki_packager inside; generated decks are saved to the current directory
181 | make shell
182 | ```
183 |
184 | ## TODO
185 |
186 | - [x] ~~Integrate the sense-distribution dictionary~~
187 | - [x] ~~Further polish the card UI~~
188 | - [x] ~~Import wordbooks from Eudic~~
189 | - [x] ~~Support SiliconFlow and Gemini~~
190 | - [x] ~~Bring back Docker support~~
191 | - [x] ~~Publish to PyPI~~
192 | - [x] ~~Ship prebuilt data packages as releases~~ @Initsnow
193 | - [ ] Support word exports from more apps
194 | - [ ] Support the Longman dictionary
195 | - [ ] Build a GUI
196 |
197 | ## Thanks
198 |
199 | This project builds on many open-source projects and communities:
200 |
201 | - Thanks to [skywind](https://github.com/skywind3000) for open-sourcing [ECDICT](https://github.com/skywind3000/ECDICT) and other dictionary projects, which provide this project's dictionary resources.
202 | - Thanks to [yihong0618](https://github.com/yihong0618) for many excellent open-source Python projects, which have been a great source of ideas.
203 |
204 | ---
205 |
206 | If this project helps you, a Star ⭐️ is appreciated!
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *.apkg
2 |
3 | # Byte-compiled / optimized / DLL files
4 | __pycache__/
5 | *.py[cod]
6 | *$py.class
7 |
8 | # C extensions
9 | *.so
10 |
11 | # Distribution / packaging
12 | .Python
13 | build/
14 | develop-eggs/
15 | dist/
16 | downloads/
17 | eggs/
18 | .eggs/
19 | lib/
20 | lib64/
21 | parts/
22 | sdist/
23 | var/
24 | wheels/
25 | share/python-wheels/
26 | *.egg-info/
27 | .installed.cfg
28 | *.egg
29 | MANIFEST
30 |
31 | # PyInstaller
32 | # Usually these files are written by a python script from a template
33 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
34 | *.manifest
35 | *.spec
36 |
37 | # Installer logs
38 | pip-log.txt
39 | pip-delete-this-directory.txt
40 |
41 | # Unit test / coverage reports
42 | htmlcov/
43 | .tox/
44 | .nox/
45 | .coverage
46 | .coverage.*
47 | .cache
48 | nosetests.xml
49 | coverage.xml
50 | *.cover
51 | *.py,cover
52 | .hypothesis/
53 | .pytest_cache/
54 | cover/
55 |
56 | # Translations
57 | *.mo
58 | *.pot
59 |
60 | # Django stuff:
61 | *.log
62 | local_settings.py
63 | db.sqlite3
64 | db.sqlite3-journal
65 |
66 | # Flask stuff:
67 | instance/
68 | .webassets-cache
69 |
70 | # Scrapy stuff:
71 | .scrapy
72 |
73 | # Sphinx documentation
74 | docs/_build/
75 |
76 | # PyBuilder
77 | .pybuilder/
78 | target/
79 |
80 | # Jupyter Notebook
81 | .ipynb_checkpoints
82 |
83 | # IPython
84 | profile_default/
85 | ipython_config.py
86 |
87 | # pyenv
88 | # For a library or package, you might want to ignore these files since the code is
89 | # intended to run in multiple environments; otherwise, check them in:
90 | # .python-version
91 |
92 | # pipenv
93 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
94 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
95 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
96 | # install all needed dependencies.
97 | #Pipfile.lock
98 |
99 | # UV
100 | # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
101 | # This is especially recommended for binary packages to ensure reproducibility, and is more
102 | # commonly ignored for libraries.
103 | #uv.lock
104 |
105 | # poetry
106 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
107 | # This is especially recommended for binary packages to ensure reproducibility, and is more
108 | # commonly ignored for libraries.
109 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
110 | #poetry.lock
111 |
112 | # pdm
113 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
114 | #pdm.lock
115 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
116 | # in version control.
117 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
118 | .pdm.toml
119 | .pdm-python
120 | .pdm-build/
121 |
122 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
123 | __pypackages__/
124 |
125 | # Celery stuff
126 | celerybeat-schedule
127 | celerybeat.pid
128 |
129 | # SageMath parsed files
130 | *.sage.py
131 |
132 | # Environments
133 | .env
134 | .venv
135 | env/
136 | venv/
137 | ENV/
138 | env.bak/
139 | venv.bak/
140 |
141 | # Spyder project settings
142 | .spyderproject
143 | .spyproject
144 |
145 | # Rope project settings
146 | .ropeproject
147 |
148 | # mkdocs documentation
149 | /site
150 |
151 | # mypy
152 | .mypy_cache/
153 | .dmypy.json
154 | dmypy.json
155 |
156 | # Pyre type checker
157 | .pyre/
158 |
159 | # pytype static type analyzer
160 | .pytype/
161 |
162 | # Cython debug symbols
163 | cython_debug/
164 |
165 | # PyCharm
166 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
167 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
168 | # and can be added to the global gitignore or merged into this file. For a more nuclear
169 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
170 | #.idea/
171 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider
172 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
173 |
174 | # User-specific stuff
175 | .idea/**/workspace.xml
176 | .idea/**/tasks.xml
177 | .idea/**/usage.statistics.xml
178 | .idea/**/dictionaries
179 | .idea/**/shelf
180 |
181 | # AWS User-specific
182 | .idea/**/aws.xml
183 |
184 | # Generated files
185 | .idea/**/contentModel.xml
186 |
187 | # Sensitive or high-churn files
188 | .idea/**/dataSources/
189 | .idea/**/dataSources.ids
190 | .idea/**/dataSources.local.xml
191 | .idea/**/sqlDataSources.xml
192 | .idea/**/dynamic.xml
193 | .idea/**/uiDesigner.xml
194 | .idea/**/dbnavigator.xml
195 |
196 | # Gradle
197 | .idea/**/gradle.xml
198 | .idea/**/libraries
199 |
200 | # Gradle and Maven with auto-import
201 | # When using Gradle or Maven with auto-import, you should exclude module files,
202 | # since they will be recreated, and may cause churn. Uncomment if using
203 | # auto-import.
204 | # .idea/artifacts
205 | # .idea/compiler.xml
206 | # .idea/jarRepositories.xml
207 | # .idea/modules.xml
208 | # .idea/*.iml
209 | # .idea/modules
210 | # *.iml
211 | # *.ipr
212 |
213 | # CMake
214 | cmake-build-*/
215 |
216 | # Mongo Explorer plugin
217 | .idea/**/mongoSettings.xml
218 |
219 | # File-based project format
220 | *.iws
221 |
222 | # IntelliJ
223 | out/
224 |
225 | # mpeltonen/sbt-idea plugin
226 | .idea_modules/
227 |
228 | # JIRA plugin
229 | atlassian-ide-plugin.xml
230 |
231 | # Cursive Clojure plugin
232 | .idea/replstate.xml
233 |
234 | # SonarLint plugin
235 | .idea/sonarlint/
236 |
237 | # Crashlytics plugin (for Android Studio and IntelliJ)
238 | com_crashlytics_export_strings.xml
239 | crashlytics.properties
240 | crashlytics-build.properties
241 | fabric.properties
242 |
243 | # Editor-based Rest Client
244 | .idea/httpRequests
245 |
246 | # Android studio 3.1+ serialized cache file
247 | .idea/caches/build_file_checksums.ser
248 |
249 | # PyPI configuration file
250 | .pypirc
251 |
252 | .vscode
253 | .vscode/*
254 | !.vscode/settings.json
255 | !.vscode/tasks.json
256 | !.vscode/launch.json
257 | !.vscode/extensions.json
258 | !.vscode/*.code-snippets
259 |
260 | # Local History for Visual Studio Code
261 | .history/
262 |
263 | # Built Visual Studio Code Extensions
264 | *.vsix
265 |
266 | # macOS
267 | .DS_Store
--------------------------------------------------------------------------------
/anki_packager/dict/ecdict.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sqlite3
3 |
4 | from anki_packager.logger import logger
5 | from anki_packager.utils import get_user_config_dir
6 |
7 | from anki_packager.dict import stardict
8 |
9 | # https://github.com/liuyug/mdict-utils
10 | from mdict_utils.reader import query
11 | from mdict_utils.utils import ElapsedTimer
12 |
13 |
14 | class Ecdict:
15 | def __init__(self):
16 | self.config_dir = get_user_config_dir()
17 | self.dicts_dir = os.path.join(self.config_dir, "dicts")
18 | # keep the package archive small
19 | self.seven_zip = os.path.join(self.dicts_dir, "stardict.7z")
20 | self.csv = os.path.join(self.dicts_dir, "stardict.csv")
21 | self.sqlite = os.path.join(self.dicts_dir, "stardict.db")
22 | self._convert()
23 | self.conn = sqlite3.connect(self.sqlite)
24 | self.cursor = self.conn.cursor()
25 | self.sd = stardict.StarDict(self.sqlite, False)
26 |
27 | def __del__(self):
28 | if hasattr(self, "conn"):
29 | self.cursor.close()
30 | self.conn.close()
31 |
32 | def _convert(self):
33 | if not os.path.exists(self.csv):
34 |             # extract stardict.csv from the 7z archive
35 | if not os.path.exists(self.seven_zip):
36 | raise FileNotFoundError(f"{self.seven_zip} 未找到!")
37 |
38 | import py7zr
39 |
40 | logger.info("首次使用: 正在解压词典到 anki_packager/dicts/stardict.csv")
41 | ar = py7zr.SevenZipFile(self.seven_zip, mode="r")
42 | ar.extractall(path=self.dicts_dir)
43 | ar.close()
44 |
45 | if not os.path.exists(self.sqlite):
46 | logger.info(
47 | "耐心等待(790M): 正在转换数据库 anki_packager/dicts/stardict.db"
48 | )
49 | stardict.convert_dict(self.sqlite, self.csv)
50 |
51 | async def ret_word(self, word):
52 | """Return ECDICT data
53 | dict: 包含以下 ECDICT 数据字段的字典:
54 | - word: 单词名称
55 | - phonetic: 音标,以英语英标为主
56 | - definition: 单词释义(英文),每行一个释义
57 | - translation: 单词释义(中文),每行一个释义
58 | - pos: 词语位置,用 "/" 分割不同位置
59 | - collins: 柯林斯星级
60 | - oxford: 是否是牛津三千核心词汇
61 | - tag: 字符串标签: zk/中考, gk/高考, cet4/四级 等等标签,空格分割
62 | - bnc: 英国国家语料库词频顺序
63 | - frq: 当代语料库词频顺序
64 | - exchange: 时态复数等变换,使用 "/" 分割不同项目
65 | - detail: json 扩展信息,字典形式保存例句(待添加)
66 | - audio: 读音音频 url (待添加)
67 | """
68 | data = self.sd.query(word)
69 |
70 |         # exam-syllabus tags
71 |         data = self.parse_tag(data)
72 |         # sense distribution
73 |         data = self.get_distribution(data)
74 |         # word discrimination
75 | data = self.get_diffrentiation(data)
76 | return data
77 |
78 | def get_distribution(self, data):
79 | """
80 | Get word distribution from mdx dictionary
81 | """
82 | with ElapsedTimer(verbose=False):
83 | mdx_path = os.path.join(
84 | get_user_config_dir(),
85 | "dicts",
86 | "单词释义比例词典-带词性.mdx",
87 | )
88 | record = query(mdx_path, data["word"])
89 | if record:
90 | data["distribution"] = record
91 | return data
92 |
93 | def get_diffrentiation(self, data):
94 | """[《有道词语辨析》加强版](https://skywind.me/blog/archives/2941)"""
95 | with ElapsedTimer(verbose=False):
96 | mdx_path = os.path.join(get_user_config_dir(), "dicts", "有道词语辨析.mdx")
97 | record = query(mdx_path, data["word"])
98 | if record:
99 | data["diffrentiation"] = record
100 | return data
101 |
102 | def definition_newline(self, data):
103 | """Add newline to definition for each part-of-speech
104 |
105 | Demo:
106 | Input: data["definition"] = "n. 词义1 v. 词义2"
107 | Output: data["definition"] = "n. 词义1
v. 词义2"
108 |
109 | """
110 | definition = data.get("definition", "")
111 | if not definition:
112 | return data
113 |
114 | # Split on part of speech markers (like "n.", "v.", etc.)
115 | parts = []
116 | current = ""
117 | words = definition.split()
118 |
119 | for word in words:
120 | if len(word) >= 2 and word.endswith(".") and word[0].isalpha():
121 | if current:
122 | parts.append(current.strip())
123 | current = word
124 | else:
125 | current += " " + word
126 |
127 | if current:
128 | parts.append(current.strip())
129 |
130 | data["definition"] = "
".join(parts)
131 | return data
132 |
133 | def parse_tag(self, data):
134 | """parse tag infomation and update data dict
135 | Demo:
136 | Input: data["tag"] = "zk gk cet4 cet6 ky ielts toefl"
137 | Output: data["tag"] = "中考 高考 四级 六级 考研 雅思 托福"
138 | """
139 | text = data.get("tag", "")
140 | if not text:
141 | return data
142 |
143 | tag_map = {
144 | "zk": "中考",
145 | "gk": "高考",
146 | "cet4": "四级",
147 | "cet6": "六级",
148 | "ky": "考研",
149 | "ielts": "雅思",
150 | "toefl": "托福",
151 | "gre": "GRE",
152 | }
153 |
154 |         tags = text.split()
155 | result = [tag_map.get(tag, tag) for tag in tags]
156 | data["tag"] = " ".join(result)
157 | return data
158 |
159 | def parse_exchange(self, data):
160 | """parse exchange information and update data dict
161 |
162 | Demo:
163 | Input: data["exchange"] = "s:tests/d:tested/i:testing/p:tested/3:tests"
164 | Output: data["exchange"] = "复数:tests 过去式:tested 过去分词:tested 现在分词:testing 三单:tests"
165 | """
166 | text = data.get("exchange", "")
167 | if not text:
168 | return data
169 |
170 | exchange_map = {
171 | "s": "复数",
172 | "d": "过去式",
173 | "p": "过去分词",
174 | "i": "现在分词",
175 | "3": "三单",
176 | "r": "比较级",
177 | "t": "最高级",
178 | "0": "原型",
179 | "1": "第一人称单数",
180 | }
181 |
182 | result = []
183 | for item in text.split("/"):
184 | if ":" in item:
185 | key, value = item.split(":")
186 | if key in exchange_map:
187 | result.append(f"{exchange_map[key]}: {value}")
188 |
189 | data["exchange"] = " ".join(result)
190 | return data
191 |
--------------------------------------------------------------------------------
/anki_packager/dict/youdao.py:
--------------------------------------------------------------------------------
1 | import asyncio
2 | import os
3 | import re
4 | import shutil
5 | import tempfile
6 | import aiohttp
7 | from gtts import gTTS
8 | from bs4 import BeautifulSoup
9 | from typing import Dict, Optional
10 | from anki_packager.logger import logger
11 |
12 |
13 | class YoudaoScraper:
14 | def __init__(self):
15 | self.base_url = "https://m.youdao.com/result"
16 | self.tmp = tempfile.mkdtemp()
17 |
18 | async def __aenter__(self):
19 | """进入 async with 时被调用"""
20 | self._session = aiohttp.ClientSession(
21 | headers={
22 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
23 | }
24 | )
25 |         return self  # return the instance itself
26 |
27 | async def __aexit__(self, exc_type, exc_val, exc_tb):
28 | """离开 async with 时被调用,确保 Session 被关闭"""
29 | await self._session.close()
30 | try:
31 | self._clean_temp_dir()
32 | except Exception as e:
33 | logger.error(f"Error cleaning up audio files: {e}")
34 |
35 | async def _get_audio(self, word: str):
36 | """return the filename of the audio and the temp directory that needs to be cleaned up"""
37 | filename = os.path.join(self.tmp, f"{word}.mp3")
38 | loop = asyncio.get_running_loop()
39 |
40 | def generate_and_save_audio():
41 | """A wrapper function for the blocking gTTS calls."""
42 | tts = gTTS(text=word, lang="en")
43 | tts.save(filename)
44 |
45 | await loop.run_in_executor(None, generate_and_save_audio)
46 |
47 | return filename
48 |
49 | def _clean_temp_dir(self):
50 | """Clean up a temporary directory and its contents."""
51 | try:
52 | if os.path.exists(self.tmp):
53 | shutil.rmtree(self.tmp)
54 | logger.info(f"音频临时文件夹已清理: {self.tmp}")
55 | except Exception as e:
56 | logger.error(f"音频临时文件夹 {self.tmp} 清理失败: {e}")
57 |
58 | async def get_word_info(self, word: str) -> Optional[Dict]:
59 | try:
60 | params = {"word": word, "lang": "en"}
61 |
62 | async with self._session.get(self.base_url, params=params) as response:
63 | response.raise_for_status()
64 | r_text = await response.text()
65 | soup = BeautifulSoup(r_text, "html.parser")
66 |
67 | result = {
68 | "word": word,
69 | "example_phrases": [],
70 | "example_sentences": [],
71 | }
72 |
73 |
74 | all_uls = soup.find_all("ul", class_="")
75 | # Extract example phrases
76 | if len(all_uls) > 0:
77 | phrase_ul = all_uls[0]
78 | if phrase_ul:
79 | phrase_lis = phrase_ul.find_all("li", class_="mcols-layout")
80 | for li in phrase_lis:
81 | index = (
82 | li.find("span", class_="grey").text.strip()
83 | if li.find("span", class_="grey")
84 | else None
85 | )
86 | col2_element = li.find("div", class_="col2")
87 | point_element = col2_element.find("a", class_="point")
88 | sen_phrase_element = col2_element.find("p", class_="sen-phrase")
89 | english = None
90 | chinese = None
91 | if point_element and sen_phrase_element:
92 | english = point_element.text.strip()
93 | chinese = sen_phrase_element.text.strip()
94 | else:
95 | content = col2_element.text.strip()
96 | parts = re.split(r"([;;])", content)
97 | parts = [
98 | s.strip()
99 | for s in parts
100 | if s.strip() and s not in [";", ";"]
101 | ]
102 | if len(parts) > 1:
103 | english = parts[0]
104 | chinese = "".join(parts[1:])
105 | else:
106 | english = content
107 |
108 | result["example_phrases"].append(
109 | {
110 | "index": index,
111 | "english": english,
112 | "chinese": chinese,
113 | }
114 | )
115 |
116 | # Extract example sentences
117 | if len(all_uls) > 1:
118 | sentence_ul = all_uls[1]
119 | if sentence_ul:
120 | sentence_lis = sentence_ul.find_all("li", class_="mcols-layout")
121 | for li in sentence_lis:
122 | index = (
123 | li.find("span", class_="grey index").text.strip()
124 | if li.find("span", class_="grey index")
125 | else None
126 | )
127 | english_element = li.find("div", class_="sen-eng")
128 | chinese_element = li.find("div", class_="sen-ch")
129 | source_element = li.find("div", class_="secondary")
130 |
131 | english = english_element.text.strip() if english_element else None
132 | chinese = chinese_element.text.strip() if chinese_element else None
133 | source = source_element.text.strip() if source_element else None
134 |
135 | result["example_sentences"].append(
136 | {
137 | "index": index,
138 | "english": english,
139 | "chinese": chinese,
140 | "source": source,
141 | }
142 | )
143 |
144 | return result
145 |
146 | except aiohttp.ClientError as e:
147 | logger.error(f"Request error: {e}")
148 | return None
149 | except Exception as e:
150 | logger.error(f"An error occurred: {e}")
151 | return None
152 |
153 |
154 | if __name__ == "__main__":
155 | async def main():
156 | async with YoudaoScraper() as youdao:
157 | result = asyncio.run(youdao.get_word_info("variable"))
158 | print(result)
159 | asyncio.run(main())
160 |
--------------------------------------------------------------------------------
/anki_packager/packager/deck.py:
--------------------------------------------------------------------------------
1 | import genanki
2 | import random
3 |
4 |
5 | class AnkiDeckCreator:
6 | def __init__(self, deck_name: str):
7 | self.added = False
8 | self.deck_name = deck_name
9 | self.deck_id = random.randrange(1 << 30, 1 << 31)
10 | self.model_id = random.randrange(1 << 30, 1 << 31)
11 | self.deck = genanki.Deck(self.deck_id, deck_name)
12 | self.model = genanki.Model(
13 | self.model_id,
14 | "Anki Packager",
15 | fields=[
16 | {"name": "Word"}, # 词头
17 | {"name": "Pronunciation"}, # 读音
18 | {"name": "Phonetic_Symbols"}, # 音标
19 | {"name": "Examination_Syllabus"}, # 考试大纲
20 | {"name": "ECDict"}, # Ecdict 中文解释
21 | {"name": "Longman"}, # Longman
22 | {"name": "Youdao"}, # 有道词典示例短语和句子
23 | {"name": "Etymology_AI"}, # 词源
24 | {"name": "Associative_Mnemonic_AI"}, # 联想助记
25 | {"name": "Homophone_Mnemonic_AI"}, # 谐音助记
26 | {"name": "Discrimination"}, # 辨析
27 | {"name": "Story"}, # 故事
28 | ],
29 | templates=[
30 | {
31 | "name": "Dictionary Card",
32 | "qfmt": """
33 |
34 |
39 |
40 | """,
41 | "afmt": """
42 | {{FrontSide}}
43 |
44 |
45 |
{{ECDict}}
46 |
47 |
48 |
{{Etymology_AI}}
49 |
{{Associative_Mnemonic_AI}}
50 |
{{Homophone_Mnemonic_AI}}
51 |
52 |
53 |
{{Youdao}}
54 |
55 |
{{Discrimination}}
56 |
57 |
{{Longman}}
58 |
59 |
{{Story}}
60 |
61 | """,
62 | }
63 | ],
64 | css="""
65 | /* Color scheme variables */
66 | :root {
67 | /* Light mode (default) colors */
68 | --bg-color: #ffffff;
69 | --text-color: #333333;
70 | --secondary-text: #666666;
71 | --tertiary-text: #2F4F4F;
72 | --highlight-color: #0645AD;
73 | --accent-color: #990000;
74 | --divider-color: #99a;
75 | --pos-color: #990000;
76 | --cn-text-color: #8B008B;
77 | --phrase-color: #8B4513;
78 | }
79 |
80 | /* Dark mode colors */
81 | @media (prefers-color-scheme: dark) {
82 | .card {
83 | --bg-color: #1e1e2e;
84 | --text-color: #e0e0e0;
85 | --secondary-text: #b0b0b0;
86 | --tertiary-text: #a0c0c0;
87 | --highlight-color: #7cb8ff;
88 | --accent-color: #ff7c7c;
89 | --divider-color: #666;
90 | --pos-color: #ff9e64;
91 | --cn-text-color: #d183e8;
92 | --phrase-color: #e0c080;
93 | }
94 | }
95 |
96 | /* Night mode in Anki also triggers dark mode */
97 | .nightMode {
98 | --bg-color: #1e1e2e;
99 | --text-color: #e0e0e0;
100 | --secondary-text: #b0b0b0;
101 | --tertiary-text: #a0c0c0;
102 | --highlight-color: #7cb8ff;
103 | --accent-color: #ff7c7c;
104 | --divider-color: #666;
105 | --pos-color: #ff9e64;
106 | --cn-text-color: #d183e8;
107 | --phrase-color: #e0c080;
108 | }
109 |
110 | .card {
111 | font-family: Arial, sans-serif;
112 | text-align: left;
113 | padding: 20px;
114 | max-width: 800px;
115 | margin: auto;
116 | background-color: var(--bg-color);
117 | color: var(--text-color);
118 | line-height: 1.6;
119 | }
120 |
121 | /* 虚线分隔符 */
122 | .dashed {
123 | border: none;
124 | border-top: 1px dashed var(--divider-color);
125 | margin: 15px 0;
126 | width: 100%;
127 | }
128 |
129 | /* Front side */
130 | .card-front {
131 | margin-bottom: 20px;
132 | }
133 |
134 | /* Centered header section */
135 | .header-center {
136 | text-align: center;
137 | margin-bottom: 20px;
138 | }
139 |
140 | .word {
141 | font-size: 2.2em;
142 | font-weight: bold;
143 | color: var(--text-color);
144 | margin-bottom: 5px;
145 | }
146 |
147 | .pronunciation {
148 | font-size: 1.1em;
149 | color: var(--highlight-color);
150 | margin-bottom: 10px;
151 | }
152 |
153 | .front {
154 | color: var(--secondary-text);
155 | margin-bottom: 15px;
156 | font-size: 0.90em;
157 | }
158 |
159 | .phonetic_symbols {
160 | color: blue;
161 | }
162 |
163 | /* Back side */
164 | .card-back {
165 | margin-top: 20px;
166 | }
167 |
168 | .ecdict {
169 | margin: 15px 0;
170 | text-align: center;
171 | }
172 |
173 | .longman {
174 | margin: 15px 0;
175 | }
176 |
177 | .examples {
178 | color: var(--tertiary-text);
179 | margin: 15px 0;
180 | }
181 |
182 | .examples em {
183 | color: var(--highlight-color);
184 | font-style: normal;
185 | font-weight: bold;
186 | }
187 |
188 | .ai {
189 | color: var(--secondary-text);
190 | margin: 15px 0;
191 | }
192 |
193 | .discrimination {
194 | color: var(--text-color);
195 | margin: 15px 0;
196 | }
197 |
198 | /* Example sentences */
199 | .example {
200 | color: var(--tertiary-text);
201 | margin-left: 20px;
202 | margin-bottom: 10px;
203 | }
204 |
205 | /* Chinese text */
206 | .chinese {
207 | color: var(--secondary-text);
208 | margin-left: 20px;
209 | }
210 | """,
211 | )
212 |
213 | def format_pos(self, text: str) -> str:
214 | """Format definition with line breaks between parts of speech"""
215 | if not text:
216 | return ""
217 |
218 | parts = []
219 | current = []
220 |
221 |         for word in text.split():
222 |             # Check for part-of-speech markers
223 |             if any(
224 |                 word.startswith(pos + ".")
225 |                 for pos in ["n", "v", "vt", "vi", "adj", "adv"]
226 |             ):
227 |                 if current:
228 |                     parts.append(" ".join(current))
229 |                 # highlight the POS marker (markup reconstructed; uses --pos-color from the CSS)
230 |                 word = f'<span style="color: var(--pos-color)">{word}</span>'
231 |                 current = [word]
232 |             else:
233 |                 current.append(word)
234 |
235 |         if current:
236 |             parts.append(" ".join(current))
237 |
238 |         return "<br>".join(parts)
239 |
240 | def format_trans(self, translation: str, tense: str, distribution: str) -> str:
241 | """Add tense and distribution of each word in Translation part"""
242 | if not tense:
243 | # AI is disabled
244 | return f"{translation}
{distribution}"
245 |
246 | return f"{translation}
{tense}
{distribution}"
247 |
248 | def format_youdao(self, data: dict) -> str:
249 | """format youdao example_phrases and example_sentences"""
250 | result = []
251 |
252 | # Format phrases if they exist
253 | if "example_phrases" in data and data["example_phrases"]:
254 | result.append("【短语】")
255 | phrases = []
256 | for phrase in data["example_phrases"]:
257 | formatted_phrase = f"{phrase['english']} {phrase['chinese']}"
258 | phrases.append(formatted_phrase)
259 |
260 | result.append("".join(phrases))
261 |
262 | # Format sentences if they exist
263 | if "example_sentences" in data and data["example_sentences"]:
264 | result.append("【例句】")
265 | phrases = []
266 | for sentence in data["example_sentences"]:
267 | formatted_sentence = f"{sentence['english']} {sentence['chinese']}"
268 | phrases.append(formatted_sentence)
269 |
270 | result.append("".join(phrases))
271 |
272 | return "
".join(result)
273 |
274 | def add_note(self, data: dict):
275 | note = genanki.Note(
276 | model=self.model,
277 | fields=[
278 |                 # headword
279 |                 data.get("Word", ""),
280 |                 # pronunciation audio
281 |                 f"[sound:{data.get('Pronunciation', '')}]",
282 |                 # phonetics + exam syllabus + corpus frequency: [ә'bændәn] (高考 四级 六级 考研 托福 GRE 2057/2182)
283 |                 f"{data.get('ECDict', {}).get('phonetic', '')}",
284 |                 f"{data.get('ECDict', {}).get('tag', '')} {data.get('ECDict', {}).get('bnc', '')}/{data.get('ECDict', {}).get('frq', '')}",
285 |                 # ECDICT Chinese definitions + sense distribution + tenses
286 |                 self.format_trans(
287 |                     self.format_pos(data.get("ECDict", {}).get("translation", "")),
288 |                     data.get("AI", {}).get("tenses", ""),
289 |                     data.get("ECDict", {}).get("distribution", ""),
290 |                 ),
291 |                 # TODO: use a better English source
292 |                 f"【英解】<br>{self.format_pos(data.get('ECDict', {}).get('definition', ''))}",
293 |                 # Youdao example phrases and sentences
294 |                 self.format_youdao(data.get("Youdao", {})),
295 |                 # AI etymology and mnemonics
296 |                 ""
297 |                 if not data.get("AI")
298 |                 else f"【词源】<br>{data.get('AI', {}).get('origin', {}).get('etymology', '')}",
299 |                 ""
300 |                 if not data.get("AI")
301 |                 else f"【联想助记】{data.get('AI', {}).get('origin', {}).get('mnemonic', {}).get('associative', '')}",
302 |                 ""
303 |                 if not data.get("AI")
304 |                 else f"【谐音助记】{data.get('AI', {}).get('origin', {}).get('mnemonic', {}).get('homophone', '')}",
305 |                 # word discrimination
306 |                 ""
307 |                 if not data.get("ECDict", {}).get("diffrentiation", "")
308 |                 else f"【辨析】{data.get('ECDict', {}).get('diffrentiation', '')}",
309 |                 # story
310 |                 ""
311 |                 if not data.get("AI")
312 |                 else f"【故事】 {data.get('AI', {}).get('story', {}).get('english', '')}<br>{data.get('AI', {}).get('story', {}).get('chinese', '')}",
313 | ],
314 | )
315 | self.deck.add_note(note)
316 | self.added = True
317 |
318 | def write_to_file(self, file_path: str, mp3_files):
319 | package = genanki.Package(self.deck)
320 | package.media_files = mp3_files
321 | package.write_to_file(file_path)
322 |
--------------------------------------------------------------------------------
/anki_packager/cli.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import asyncio
3 | import os
4 | from os import environ as env
5 | import tomllib
6 | from tqdm.asyncio import tqdm
7 | import signal
8 |
9 | ### config
10 | from anki_packager.utils import get_user_config_dir
11 |
12 | ### logger
13 | from anki_packager.logger import logger
14 |
15 | ### AI
16 | from anki_packager.ai import llm
17 |
18 | ### Dictionaries
19 | from anki_packager.dict.youdao import YoudaoScraper
20 | from anki_packager.dict.ecdict import Ecdict
21 | from anki_packager.dict.eudic import EUDIC
22 |
23 | ### Anki
24 | from anki_packager.packager.deck import AnkiDeckCreator
25 |
26 | MAX_RETRIES = 3  # maximum number of retries
27 | RETRY_DELAY = 2  # seconds to wait before each retry
28 | CONCURRENCY_LIMIT = 40  # number of concurrent tasks
29 |
30 |
31 | def create_signal_handler(anki, audio_files, DECK_NAME):
32 | def signal_handler(sig, frame):
33 | logger.info("\033[1;31m程序被 异常中止...\033[0m")
34 | logger.info("正在写入已处理完毕的卡片...")
35 | anki.write_to_file(f"{DECK_NAME}.apkg", audio_files)
36 | logger.info("正在退出...")
37 | exit(0)
38 |
39 | return signal_handler
40 |
41 |
42 | async def main():
43 | parser = argparse.ArgumentParser()
44 |
45 | parser.add_argument("--word", dest="word", type=str, help="word to add")
46 |
47 | parser.add_argument(
48 | "--retry",
49 | action="store_true",
50 | help="Retry processing failed words only from config/failed.txt",
51 | )
52 |
53 | parser.add_argument(
54 | "--disable_ai",
55 | dest="disable_ai",
56 | action="store_true",
57 | help="Disable AI completions",
58 | )
59 |
60 | # ./prog --eudicid: run eudic.get_studylist()
61 | parser.add_argument(
62 | "--eudicid",
63 | action="store_true",
64 | help="Display EUDIC studylist by id",
65 | )
66 |
67 | parser.add_argument(
68 | "--eudic",
69 | action="store_true",
70 | help="Use EUDIC book instead of vocabulary.txt",
71 | )
72 |
73 | # support user-defined txt file: ./prog --txt demo.txt
74 | parser.add_argument(
75 | "--txt",
76 | dest="txt_file",
77 | type=str,
78 | help="Use a custom txt file instead of vocabulary.txt",
79 | )
80 |
81 | parser.add_argument("--model", dest="model", type=str, help="custome AI model")
82 |
83 | parser.add_argument(
84 | "-p",
85 | "--proxy",
86 | dest="proxy",
87 | type=str,
88 | default="",
89 | help="Default proxy like: http://127.0.0.1:7890",
90 | )
91 |
92 | parser.add_argument(
93 | "--api_base",
94 | metavar="API_BASE_URL",
95 | dest="api_base",
96 | type=str,
97 | help="Default base url other than the OpenAI's official API address",
98 | )
99 |
100 | options = parser.parse_args()
101 |
102 | ### set config according to config directory or parsed arguments
103 | config_dir = get_user_config_dir()
104 | config_path = os.path.join(config_dir, "config")
105 |
106 |     ## read config.toml
107 | with open(os.path.join(config_path, "config.toml"), "rb") as f:
108 | cfg = tomllib.load(f)
109 | MODEL_PARAM = cfg["MODEL_PARAM"]
110 | PROXY = cfg["PROXY"]
111 | EUDIC_TOKEN = cfg["EUDIC_TOKEN"]
112 | EUDIC_ID = cfg["EUDIC_ID"]
113 | DECK_NAME = cfg["DECK_NAME"]
114 |
115 | logger.info("配置读取完毕")
116 |
117 |     # display Eudic wordbook IDs only
118 | if options.eudicid:
119 | logger.info("设置:仅读取欧路词典 ID")
120 | eudic = EUDIC(EUDIC_TOKEN, EUDIC_ID)
121 | await eudic.get_studylist()
122 | exit(0)
123 |
124 | # only add word into vocabulary.txt line by line
125 | elif options.word:
126 | WORD = options.word
127 | vocab_path = os.path.join(config_path, "vocabulary.txt")
128 | with open(vocab_path, "a") as f:
129 | f.write(WORD + "\n")
130 | logger.info(f"单词: {WORD} 已添加进 {vocab_path}")
131 | exit(0)
132 |
133 | words = []
134 | number_words = 0
135 | audio_files = []
136 | ai = None
137 |
138 | anki = AnkiDeckCreator(f"{DECK_NAME}")
139 | ecdict = Ecdict()
140 |
141 |     # AI configuration
142 | if options.disable_ai:
143 | logger.info("AI 功能已关闭")
144 | else:
145 | PROXY = options.proxy or PROXY
146 | if PROXY:
147 | env["HTTP_PROXY"] = PROXY
148 | env["HTTPS_PROXY"] = PROXY
149 | logger.info(f"使用代理: {PROXY}")
150 |
151 |         # initialize the AI model(s)
152 | try:
153 | ai = llm(MODEL_PARAM)
154 | logger.info(
155 | f"当前使用的 AI 模型: {[param['model'] for param in MODEL_PARAM]}"
156 | )
157 | except Exception as e:
158 | logger.error(f"初始化 AI 模型失败: {e}")
159 | exit(1)
160 |     ## vocabulary source: eudic data, failed.txt (--retry), custom txt file, or default vocabulary.txt
161 | if options.eudic:
162 | logger.info("配置: 对欧路词典生词本单词进行处理...")
163 | eudic = EUDIC(EUDIC_TOKEN, EUDIC_ID)
164 | r = await eudic.get_words()
165 | eudic_words = r["data"]
166 | for word in eudic_words:
167 | words.append(word["word"])
168 | number_words = len(words)
169 | elif options.txt_file:
170 | txt_file_path = options.txt_file
171 | if not os.path.isabs(txt_file_path):
172 | # If relative path, resolve from current directory
173 | txt_file_path = os.path.abspath(txt_file_path)
174 |
175 | logger.info(f"配置: 对自定义单词文件 {txt_file_path} 进行处理...")
176 | try:
177 | with open(txt_file_path, "r") as vocab:
178 | for word in vocab:
179 | word = word.strip()
180 | if word: # Skip empty lines
181 | words.append(word)
182 | number_words = len(words)
183 | except FileNotFoundError:
184 | logger.error(f"文件 {txt_file_path} 未找到")
185 | exit(1)
186 | except Exception as e:
187 | logger.error(f"读取文件 {txt_file_path} 出错: {e}")
188 | exit(1)
189 | else:
190 | vocab_path = os.path.join(config_path, "vocabulary.txt")
191 | logger.info(f"配置: 对默认生词本单词 {vocab_path} 进行处理...")
192 | try:
193 | with open(vocab_path, "r") as vocab:
194 | for word in vocab:
195 | word = word.strip()
196 | if word: # Skip empty lines
197 | words.append(word)
198 | number_words = len(words)
199 | logger.info(f"从默认词库读取了 {number_words} 个单词")
200 | except FileNotFoundError:
201 | logger.error(f"默认词库文件 {vocab_path} 未找到")
202 | exit(1)
203 | except Exception as e:
204 | logger.error(f"读取默认词库文件出错: {e}")
205 | exit(1)
206 |
207 |
208 | signal.signal(
209 | signal.SIGINT,
210 | create_signal_handler(anki, audio_files, DECK_NAME),
211 | )
212 | async with YoudaoScraper() as youdao:
213 | logger.info(f"开始并发处理 {len(words)} 个单词...")
214 | with tqdm(total=len(words), desc="开始处理") as pbar:
215 | tasks = [
216 | task_wrapper(pbar, word, ai, anki, youdao, ecdict, audio_files)
217 | for word in words
218 | ]
219 | results = await asyncio.gather(*tasks, return_exceptions=True)
220 |
221 | successful_results = []
222 | failed_words = []
223 |
224 | for word, result in zip(words, results):
225 | if isinstance(result, Exception):
226 | failed_words.append(word)
227 | logger.error(f"未能成功处理 '{word}'. 错误: {result}")
228 | else:
229 | successful_results.append(result)
230 |
231 | if failed_words:
232 | failed_file = os.path.join(config_path, "failed.txt")
233 | logger.error(
234 | f"共 {len(failed_words)} 个单词处理失败,将它们写入 {failed_file}"
235 | )
236 | with open(failed_file, "w", encoding="utf-8") as f:
237 | for word in failed_words:
238 | f.write(f"{word}\n")
239 | else:
240 | logger.info("所有单词均已成功处理!")
241 |
242 | try:
243 | if anki.added:
244 | anki.write_to_file(f"{DECK_NAME}.apkg", audio_files)
245 | logger.info(f"牌组生成完毕,请打开 {DECK_NAME}.apkg")
246 | except Exception as e:
247 | logger.error(f"Error saving Anki deck: {e}")
248 |
249 |
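# process_word assembles the per-note payload that anki.add_note() consumes.
# Shape of the dict it builds (keys exactly as set below):
#     Word           - the headword itself
#     Pronunciation  - bare audio file name, referenced via Anki's [sound:] tag
#     ECDict         - ECDICT definition
#     Youdao         - Youdao dictionary info
#     AI             - AI explanation, or {} when AI is disabled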
250 | async def process_word(word, ai, anki, youdao, ecdict, audio_files):
251 | data = {}
252 | data["Word"] = word
253 |
254 | # Get audio pronunciation from the Youdao scraper
255 | audio_path = await youdao._get_audio(word)
256 | if not audio_path:
257 | raise Exception("Failed to get audio")
258 |
259 | audio_files.append(audio_path)
260 | # use only the file name as the value of Anki's [sound:] tag
261 | audio_filename = os.path.basename(audio_path)
262 | data["Pronunciation"] = audio_filename
263 |
264 | # Get ECDICT definition
265 | dict_def = await ecdict.ret_word(word)
266 | if not dict_def:
267 | raise Exception("Failed to get ECDICT definition")
268 | data["ECDict"] = dict_def
269 |
270 | # Get Youdao dictionary information
271 | youdao_result = await youdao.get_word_info(word)
272 | if not youdao_result:
273 | raise Exception("Failed to get Youdao information")
274 |
275 | data["Youdao"] = youdao_result
276 |
277 | # Get AI explanation if AI is enabled
278 | if ai is not None:
279 | try:
280 | ai_explanation = await ai.explain(word)
281 | data["AI"] = ai_explanation
282 | except Exception as e:
283 | raise Exception(f"Failed to get AI explanation: {str(e)}")
284 | else:
285 | data["AI"] = {}
286 |
287 | # TODO: Longman English explain
288 |
289 | # Add note to deck
290 | anki.add_note(data)
291 | return True
292 |
293 |
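# Module-level semaphore capping how many words are processed at once.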
294 | semaphore = asyncio.Semaphore(CONCURRENCY_LIMIT)
295 |
296 |
297 | async def process_word_with_retries(word, ai, anki, youdao, ecdict, audio_files):
298 | """
299 | Retry-and-backoff wrapper around process_word.
300 | """
301 | for attempt in range(MAX_RETRIES):
302 | try:
303 | async with semaphore:
304 | result = await process_word(word, ai, anki, youdao, ecdict, audio_files)
305 | return result
306 | except Exception as e:
307 | logger.warning(
308 | f"处理 '{word}' 第 {attempt + 1}/{MAX_RETRIES} 次尝试失败: {e}"
309 | )
310 | if attempt + 1 == MAX_RETRIES:
311 | # on the last attempt, stop catching the exception and let it bubble up;
312 | # gather(return_exceptions=True) will capture this final exception
313 | logger.error(f"'{word}' 在所有 {MAX_RETRIES} 次尝试后最终失败。")
314 | raise
315 | await asyncio.sleep(RETRY_DELAY)
316 |
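# Note: a fixed RETRY_DELAY is slept between attempts above; a common variant
# (not used here) would be exponential backoff, e.g.
#     await asyncio.sleep(RETRY_DELAY * 2 ** attempt)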
317 |
318 | async def task_wrapper(pbar, word, ai, anki, youdao, ecdict, audio_files):
319 | """
320 | Run the retry-wrapped task and make sure the progress bar always advances at the end.
321 | """
322 | try:
323 | r = await process_word_with_retries(word, ai, anki, youdao, ecdict, audio_files)
324 | pbar.set_description(f"'{word}' 添加成功")
325 | return r
326 | except Exception:
327 | pbar.set_description(f"'{word}' 处理失败")
328 | raise
329 | finally:
330 | pbar.update(1)
331 |
332 |
333 | if __name__ == "__main__":
334 | asyncio.run(main())
335 |
--------------------------------------------------------------------------------
/anki_packager/dict/stardict.py:
--------------------------------------------------------------------------------
1 | #! /usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | # vim: set ts=4 sw=4 tw=0 et :
4 | #======================================================================
5 | #
6 | # stardict.py -
7 | #
8 | # Created by skywind on 2011/05/13
9 | # Last Modified: 2019/11/09 23:47
10 | #
11 | #======================================================================
12 | from __future__ import print_function
13 | import sys
14 | import time
15 | import os
16 | import io
17 | import csv
18 | import sqlite3
19 | import codecs
20 |
21 | try:
22 | import json
23 | except ImportError:
24 | import simplejson as json
25 |
26 | MySQLdb = None
27 |
28 |
29 | #----------------------------------------------------------------------
30 | # python3 compatible
31 | #----------------------------------------------------------------------
32 | if sys.version_info[0] >= 3:
33 | unicode = str
34 | long = int
35 | xrange = range
36 |
37 |
38 | #----------------------------------------------------------------------
39 | # word strip
40 | #----------------------------------------------------------------------
41 | def stripword(word):
42 | return (''.join([ n for n in word if n.isalnum() ])).lower()
43 |
44 |
45 | #----------------------------------------------------------------------
46 | # StarDict
47 | #----------------------------------------------------------------------
48 | class StarDict (object):
49 |
50 | def __init__ (self, filename, verbose = False):
51 | self.__dbname = filename
52 | if filename != ':memory:':
53 | self.__dbname = os.path.abspath(filename)
54 | self.__conn = None
55 | self.__verbose = verbose
56 | self.__open()
57 |
58 | # initialize the database and create the required table and indexes
59 | def __open (self):
60 | sql = '''
61 | CREATE TABLE IF NOT EXISTS "stardict" (
62 | "id" INTEGER PRIMARY KEY AUTOINCREMENT NOT NULL UNIQUE,
63 | "word" VARCHAR(64) COLLATE NOCASE NOT NULL UNIQUE,
64 | "sw" VARCHAR(64) COLLATE NOCASE NOT NULL,
65 | "phonetic" VARCHAR(64),
66 | "definition" TEXT,
67 | "translation" TEXT,
68 | "pos" VARCHAR(16),
69 | "collins" INTEGER DEFAULT(0),
70 | "oxford" INTEGER DEFAULT(0),
71 | "tag" VARCHAR(64),
72 | "bnc" INTEGER DEFAULT(NULL),
73 | "frq" INTEGER DEFAULT(NULL),
74 | "exchange" TEXT,
75 | "detail" TEXT,
76 | "audio" TEXT
77 | );
78 | CREATE UNIQUE INDEX IF NOT EXISTS "stardict_1" ON stardict (id);
79 | CREATE UNIQUE INDEX IF NOT EXISTS "stardict_2" ON stardict (word);
80 | CREATE INDEX IF NOT EXISTS "stardict_3" ON stardict (sw, word collate nocase);
81 | CREATE INDEX IF NOT EXISTS "sd_1" ON stardict (word collate nocase);
82 | '''
83 |
84 | self.__conn = sqlite3.connect(self.__dbname, isolation_level = "IMMEDIATE")
85 | self.__conn.isolation_level = "IMMEDIATE"
86 |
87 | sql = '\n'.join([ n.strip('\t') for n in sql.split('\n') ])
88 | sql = sql.strip('\n')
89 |
90 | self.__conn.executescript(sql)
91 | self.__conn.commit()
92 |
93 | fields = ( 'id', 'word', 'sw', 'phonetic', 'definition',
94 | 'translation', 'pos', 'collins', 'oxford', 'tag', 'bnc', 'frq',
95 | 'exchange', 'detail', 'audio' )
96 | self.__fields = tuple([(fields[i], i) for i in range(len(fields))])
97 | self.__names = { }
98 | for k, v in self.__fields:
99 | self.__names[k] = v
100 | self.__enable = self.__fields[3:]
101 | return True
102 |
103 | # convert a database record into a dict
104 | def __record2obj (self, record):
105 | if record is None:
106 | return None
107 | word = {}
108 | for k, v in self.__fields:
109 | word[k] = record[v]
110 | if word['detail']:
111 | text = word['detail']
112 | try:
113 | obj = json.loads(text)
114 | except:
115 | obj = None
116 | word['detail'] = obj
117 | return word
118 |
119 | # close the database
120 | def close (self):
121 | if self.__conn:
122 | self.__conn.close()
123 | self.__conn = None
124 |
125 | def __del__ (self):
126 | self.close()
127 |
128 | # log output (only when verbose)
129 | def out (self, text):
130 | if self.__verbose:
131 | print(text)
132 | return True
133 |
134 | # look up a word (by id or headword)
135 | def query (self, key):
136 | c = self.__conn.cursor()
137 | record = None
138 | if isinstance(key, int) or isinstance(key, long):
139 | c.execute('select * from stardict where id = ?;', (key,))
140 | elif isinstance(key, str) or isinstance(key, unicode):
141 | c.execute('select * from stardict where word = ?', (key,))
142 | else:
143 | return None
144 | record = c.fetchone()
145 | return self.__record2obj(record)
146 |
147 | # match words close to the given spelling
148 | def match (self, word, limit = 10, strip = False):
149 | c = self.__conn.cursor()
150 | if not strip:
151 | sql = 'select id, word from stardict where word >= ? '
152 | sql += 'order by word collate nocase limit ?;'
153 | c.execute(sql, (word, limit))
154 | else:
155 | sql = 'select id, word from stardict where sw >= ? '
156 | sql += 'order by sw, word collate nocase limit ?;'
157 | c.execute(sql, (stripword(word), limit))
158 | records = c.fetchall()
159 | result = []
160 | for record in records:
161 | result.append(tuple(record))
162 | return result
163 |
164 | # batch query
165 | def query_batch (self, keys):
166 | sql = 'select * from stardict where '
167 | if keys is None:
168 | return None
169 | if not keys:
170 | return []
171 | querys = []
172 | for key in keys:
173 | if isinstance(key, int) or isinstance(key, long):
174 | querys.append('id = ?')
175 | elif key is not None:
176 | querys.append('word = ?')
177 | sql = sql + ' or '.join(querys) + ';'
178 | query_word = {}
179 | query_id = {}
180 | c = self.__conn.cursor()
181 | c.execute(sql, tuple(keys))
182 | for row in c:
183 | obj = self.__record2obj(row)
184 | query_word[obj['word'].lower()] = obj
185 | query_id[obj['id']] = obj
186 | results = []
187 | for key in keys:
188 | if isinstance(key, int) or isinstance(key, long):
189 | results.append(query_id.get(key, None))
190 | elif key is not None:
191 | results.append(query_word.get(key.lower(), None))
192 | else:
193 | results.append(None)
194 | return tuple(results)
195 |
196 | # total number of words
197 | def count (self):
198 | c = self.__conn.cursor()
199 | c.execute('select count(*) from stardict;')
200 | record = c.fetchone()
201 | return record[0]
202 |
203 | # register a new word
204 | def register (self, word, items, commit = True):
205 | sql = 'INSERT INTO stardict(word, sw) VALUES(?, ?);'
206 | try:
207 | self.__conn.execute(sql, (word, stripword(word)))
208 | except sqlite3.IntegrityError as e:
209 | self.out(str(e))
210 | return False
211 | except sqlite3.Error as e:
212 | self.out(str(e))
213 | return False
214 | self.update(word, items, commit)
215 | return True
216 |
217 | # remove a word
218 | def remove (self, key, commit = True):
219 | if isinstance(key, int) or isinstance(key, long):
220 | sql = 'DELETE FROM stardict WHERE id=?;'
221 | else:
222 | sql = 'DELETE FROM stardict WHERE word=?;'
223 | try:
224 | self.__conn.execute(sql, (key,))
225 | if commit:
226 | self.__conn.commit()
227 | except sqlite3.IntegrityError:
228 | return False
229 | return True
230 |
231 | # delete everything in the database
232 | def delete_all (self, reset_id = False):
233 | sql1 = 'DELETE FROM stardict;'
234 | sql2 = "UPDATE sqlite_sequence SET seq = 0 WHERE name = 'stardict';"
235 | try:
236 | self.__conn.execute(sql1)
237 | if reset_id:
238 | self.__conn.execute(sql2)
239 | self.__conn.commit()
240 | except sqlite3.IntegrityError as e:
241 | self.out(str(e))
242 | return False
243 | except sqlite3.Error as e:
244 | self.out(str(e))
245 | return False
246 | return True
247 |
248 | # update a word's fields
249 | def update (self, key, items, commit = True):
250 | names = []
251 | values = []
252 | for name, id in self.__enable:
253 | if name in items:
254 | names.append(name)
255 | value = items[name]
256 | if name == 'detail':
257 | if value is not None:
258 | value = json.dumps(value, ensure_ascii = False)
259 | values.append(value)
260 | if len(names) == 0:
261 | if commit:
262 | try:
263 | self.__conn.commit()
264 | except sqlite3.IntegrityError:
265 | return False
266 | return False
267 | sql = 'UPDATE stardict SET ' + ', '.join(['%s=?'%n for n in names])
268 | if isinstance(key, str) or isinstance(key, unicode):
269 | sql += ' WHERE word=?;'
270 | else:
271 | sql += ' WHERE id=?;'
272 | try:
273 | self.__conn.execute(sql, tuple(values + [key]))
274 | if commit:
275 | self.__conn.commit()
276 | except sqlite3.IntegrityError:
277 | return False
278 | return True
279 |
280 | # iterate over the dictionary
281 | def __iter__ (self):
282 | c = self.__conn.cursor()
283 | sql = 'select "id", "word" from "stardict"'
284 | sql += ' order by "word" collate nocase;'
285 | c.execute(sql)
286 | return c.__iter__()
287 |
288 | # number of entries
289 | def __len__ (self):
290 | return self.count()
291 |
292 | # existence test
293 | def __contains__ (self, key):
294 | return self.query(key) is not None
295 |
296 | # word lookup
297 | def __getitem__ (self, key):
298 | return self.query(key)
299 |
300 | # commit changes
301 | def commit (self):
302 | try:
303 | self.__conn.commit()
304 | except sqlite3.IntegrityError:
305 | self.__conn.rollback()
306 | return False
307 | return True
308 |
309 | # dump all words
310 | def dumps (self):
311 | return [ n for _, n in self.__iter__() ]
312 |
313 |
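# Usage sketch (mirrors test1() at the bottom of this file):
#     sd = StarDict(':memory:')
#     sd.register('kiss', {'definition': 'kiss me'}, False)
#     sd.commit()
#     sd.query('kiss')['definition']   # -> 'kiss me'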
314 |
315 | #----------------------------------------------------------------------
316 | # startup MySQLdb
317 | #----------------------------------------------------------------------
318 | def mysql_startup():
319 | global MySQLdb
320 | if MySQLdb is not None:
321 | return True
322 | try:
323 | import MySQLdb as _mysql
324 | MySQLdb = _mysql
325 | except ImportError:
326 | return False
327 | return True
328 |
329 |
330 | #----------------------------------------------------------------------
331 | # DictMysql
332 | #----------------------------------------------------------------------
333 | class DictMySQL (object):
334 |
335 | def __init__ (self, desc, init = False, timeout = 10, verbose = False):
336 | self.__argv = {}
337 | self.__uri = {}
338 | if isinstance(desc, dict):
339 | argv = desc
340 | else:
341 | argv = self.__url_parse(desc)
342 | for k, v in argv.items():
343 | self.__argv[k] = v
344 | if k not in ('engine', 'init', 'db', 'verbose'):
345 | self.__uri[k] = v
346 | self.__uri['connect_timeout'] = timeout
347 | self.__conn = None
348 | self.__verbose = verbose
349 | self.__init = init
350 | if 'db' not in argv:
351 | raise KeyError('db name not found')
352 | self.__open()
353 |
354 | def __open (self):
355 | mysql_startup()
356 | if MySQLdb is None:
357 | raise ImportError('No module named MySQLdb')
358 | fields = [ 'id', 'word', 'sw', 'phonetic', 'definition',
359 | 'translation', 'pos', 'collins', 'oxford', 'tag', 'bnc', 'frq',
360 | 'exchange', 'detail', 'audio' ]
361 | self.__fields = tuple([(fields[i], i) for i in range(len(fields))])
362 | self.__names = { }
363 | for k, v in self.__fields:
364 | self.__names[k] = v
365 | self.__enable = self.__fields[3:]
366 | self.__db = self.__argv.get('db', 'stardict')
367 | if not self.__init:
368 | uri = {}
369 | for k, v in self.__uri.items():
370 | uri[k] = v
371 | uri['db'] = self.__db
372 | self.__conn = MySQLdb.connect(**uri)
373 | else:
374 | self.__conn = MySQLdb.connect(**self.__uri)
375 | return self.init()
376 | return True
377 |
378 | # log output (only when verbose)
379 | def out (self, text):
380 | if self.__verbose:
381 | print(text)
382 | return True
383 |
384 | # initialize the database and table
385 | def init (self):
386 | database = self.__argv.get('db', 'stardict')
387 | self.out('create database: %s'%database)
388 | self.__conn.query("SET sql_notes = 0;")
389 | self.__conn.query('CREATE DATABASE IF NOT EXISTS %s;'%database)
390 | self.__conn.query('USE %s;'%database)
391 | # self.__conn.query('drop table if exists stardict')
392 | sql = '''
393 | CREATE TABLE IF NOT EXISTS `%s`.`stardict` (
394 | `id` INT PRIMARY KEY NOT NULL AUTO_INCREMENT,
395 | `word` VARCHAR(64) NOT NULL UNIQUE KEY,
396 | `sw` VARCHAR(64) NOT NULL,
397 | `phonetic` VARCHAR(64),
398 | `definition` TEXT,
399 | `translation` TEXT,
400 | `pos` VARCHAR(16),
401 | `collins` SMALLINT DEFAULT 0,
402 | `oxford` SMALLINT DEFAULT 0,
403 | `tag` VARCHAR(64),
404 | `bnc` INT DEFAULT NULL,
405 | `frq` INT DEFAULT NULL,
406 | `exchange` TEXT,
407 | `detail` TEXT,
408 | `audio` TEXT,
409 | KEY(`sw`, `word`),
410 | KEY(`collins`),
411 | KEY(`oxford`),
412 | KEY(`tag`)
413 | )
414 | '''%(database)
415 | sql = '\n'.join([ n.strip('\t') for n in sql.split('\n') ])
416 | sql = sql.strip('\n')
417 | sql += ' ENGINE=MyISAM DEFAULT CHARSET=utf8;'
418 | self.__conn.query(sql)
419 | self.__conn.commit()
420 | return True
421 |
422 | # parse mysql://user:passwd@host:port/database
423 | def __url_parse (self, url):
424 | if url[:8] != 'mysql://':
425 | return None
426 | url = url[8:]
427 | obj = {}
428 | part = url.split('/')
429 | main = part[0]
430 | p1 = main.find('@')
431 | if p1 >= 0:
432 | text = main[:p1].strip()
433 | main = main[p1 + 1:]
434 | p1 = text.find(':')
435 | if p1 >= 0:
436 | obj['user'] = text[:p1].strip()
437 | obj['passwd'] = text[p1 + 1:].strip()
438 | else:
439 | obj['user'] = text
440 | p1 = main.find(':')
441 | if p1 >= 0:
442 | port = main[p1 + 1:]
443 | main = main[:p1]
444 | obj['port'] = int(port)
445 | main = main.strip()
446 | if not main:
447 | main = 'localhost'
448 | obj['host'] = main.strip()
449 | if len(part) >= 2:
450 | obj['db'] = part[1]
451 | return obj
452 |
453 | # convert a database record into a dict
454 | def __record2obj (self, record):
455 | if record is None:
456 | return None
457 | word = {}
458 | for k, v in self.__fields:
459 | word[k] = record[v]
460 | if word['detail']:
461 | text = word['detail']
462 | try:
463 | obj = json.loads(text)
464 | except:
465 | obj = None
466 | word['detail'] = obj
467 | return word
468 |
469 | # close the database
470 | def close (self):
471 | if self.__conn:
472 | self.__conn.close()
473 | self.__conn = None
474 |
475 | def __del__ (self):
476 | self.close()
477 |
478 | # look up a word (by id or headword)
479 | def query (self, key):
480 | record = None
481 | if isinstance(key, int) or isinstance(key, long):
482 | sql = 'select * from stardict where id = %s;'
483 | elif isinstance(key, str) or isinstance(key, unicode):
484 | sql = 'select * from stardict where word = %s;'
485 | else:
486 | return None
487 | with self.__conn as c:
488 | c.execute(sql, (key,))
489 | record = c.fetchone()
490 | return self.__record2obj(record)
491 |
492 | # match words close to the given spelling
493 | def match (self, word, limit = 10, strip = False):
494 | c = self.__conn.cursor()
495 | if not strip:
496 | sql = 'select id, word from stardict where word >= %s '
497 | sql += 'order by word limit %s;'
498 | c.execute(sql, (word, limit))
499 | else:
500 | sql = 'select id, word from stardict where sw >= %s '
501 | sql += 'order by sw, word limit %s;'
502 | c.execute(sql, (stripword(word), limit))
503 | records = c.fetchall()
504 | result = []
505 | for record in records:
506 | result.append(tuple(record))
507 | return result
508 |
509 | # batch query
510 | def query_batch (self, keys):
511 | sql = 'select * from stardict where '
512 | if keys is None:
513 | return None
514 | if not keys:
515 | return []
516 | querys = []
517 | for key in keys:
518 | if isinstance(key, int) or isinstance(key, long):
519 | querys.append('id = %s')
520 | elif key is not None:
521 | querys.append('word = %s')
522 | sql = sql + ' or '.join(querys) + ';'
523 | query_word = {}
524 | query_id = {}
525 | with self.__conn as c:
526 | c.execute(sql, tuple(keys))
527 | for row in c:
528 | obj = self.__record2obj(row)
529 | query_word[obj['word'].lower()] = obj
530 | query_id[obj['id']] = obj
531 | results = []
532 | for key in keys:
533 | if isinstance(key, int) or isinstance(key, long):
534 | results.append(query_id.get(key, None))
535 | elif key is not None:
536 | results.append(query_word.get(key.lower(), None))
537 | else:
538 | results.append(None)
539 | return tuple(results)
540 |
541 | # register a new word
542 | def register (self, word, items, commit = True):
543 | sql = 'INSERT INTO stardict(word, sw) VALUES(%s, %s);'
544 | try:
545 | with self.__conn as c:
546 | c.execute(sql, (word, stripword(word)))
547 | except MySQLdb.Error as e:
548 | self.out(str(e))
549 | return False
550 | self.update(word, items, commit)
551 | return True
552 |
553 | # remove a word
554 | def remove (self, key, commit = True):
555 | if isinstance(key, int) or isinstance(key, long):
556 | sql = 'DELETE FROM stardict WHERE id=%s;'
557 | else:
558 | sql = 'DELETE FROM stardict WHERE word=%s;'
559 | try:
560 | with self.__conn as c:
561 | c.execute(sql, (key,))
562 | except MySQLdb.Error as e:
563 | self.out(str(e))
564 | return False
565 | return True
566 |
567 | # delete everything in the database
568 | def delete_all (self, reset_id = False):
569 | sql1 = 'DELETE FROM stardict;'
570 | try:
571 | with self.__conn as c:
572 | c.execute(sql1)
573 | except MySQLdb.Error as e:
574 | self.out(str(e))
575 | return False
576 | return True
577 |
578 | # update a word's fields
579 | def update (self, key, items, commit = True):
580 | names = []
581 | values = []
582 | for name, id in self.__enable:
583 | if name in items:
584 | names.append(name)
585 | value = items[name]
586 | if name == 'detail':
587 | if value is not None:
588 | value = json.dumps(value, ensure_ascii = False)
589 | values.append(value)
590 | if len(names) == 0:
591 | if commit:
592 | try:
593 | self.__conn.commit()
594 | except MySQLdb.Error as e:
595 | self.out(str(e))
596 | return False
597 | return False
598 | sql = 'UPDATE stardict SET ' + ', '.join(['%s=%%s'%n for n in names])
599 | if isinstance(key, str) or isinstance(key, unicode):
600 | sql += ' WHERE word=%s;'
601 | else:
602 | sql += ' WHERE id=%s;'
603 | try:
604 | with self.__conn as c:
605 | c.execute(sql, tuple(values + [key]))
606 | except MySQLdb.Error as e:
607 | self.out(str(e))
608 | return False
609 | return True
610 |
611 | # number of records
612 | def count (self):
613 | sql = 'SELECT count(*) FROM stardict;'
614 | try:
615 | with self.__conn as c:
616 | c.execute(sql)
617 | row = c.fetchone()
618 | return row[0]
619 | except MySQLdb.Error as e:
620 | self.out(str(e))
621 | return -1
622 | return 0
623 |
624 | # commit changes
625 | def commit (self):
626 | try:
627 | self.__conn.commit()
628 | except MySQLdb.Error as e:
629 | self.out(str(e))
630 | return False
631 | return True
632 |
633 | # number of entries
634 | def __len__ (self):
635 | return self.count()
636 |
637 | # existence test
638 | def __contains__ (self, key):
639 | return self.query(key) is not None
640 |
641 | # word lookup
642 | def __getitem__ (self, key):
643 | return self.query(key)
644 |
645 | # dump all words
646 | def dumps (self):
647 | return [ n for _, n in self.match('', 0x7fffffff) ]
648 |
649 |
650 |
651 | #----------------------------------------------------------------------
652 | # CSV COLUMNS
653 | #----------------------------------------------------------------------
654 | COLUMN_SIZE = 13
655 | COLUMN_ID = COLUMN_SIZE
656 | COLUMN_SD = COLUMN_SIZE + 1
657 | COLUMN_SW = COLUMN_SIZE + 2
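# Each in-memory row carries the 13 csv columns plus three internal slots:
# COLUMN_ID (position in __rows), COLUMN_SD (position in __index) and
# COLUMN_SW (the stripped word used for fuzzy matching).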
658 |
659 |
660 | #----------------------------------------------------------------------
661 | # DictCsv
662 | #----------------------------------------------------------------------
663 | class DictCsv (object):
664 |
665 | def __init__ (self, filename, codec = 'utf-8'):
666 | self.__csvname = None
667 | if filename is not None:
668 | self.__csvname = os.path.abspath(filename)
669 | self.__codec = codec
670 | self.__heads = ( 'word', 'phonetic', 'definition',
671 | 'translation', 'pos', 'collins', 'oxford', 'tag', 'bnc', 'frq',
672 | 'exchange', 'detail', 'audio' )
673 | heads = self.__heads
674 | self.__fields = tuple([ (heads[i], i) for i in range(len(heads)) ])
675 | self.__names = {}
676 | for k, v in self.__fields:
677 | self.__names[k] = v
678 | numbers = []
679 | for name in ('collins', 'oxford', 'bnc', 'frq'):
680 | numbers.append(self.__names[name])
681 | self.__numbers = tuple(numbers)
682 | self.__enable = self.__fields[1:]
683 | self.__dirty = False
684 | self.__words = {}
685 | self.__rows = []
686 | self.__index = []
687 | self.__read()
688 |
689 | def reset (self):
690 | self.__dirty = False
691 | self.__words = {}
692 | self.__rows = []
693 | self.__index = []
694 | return True
695 |
696 | def encode (self, text):
697 | if text is None:
698 | return None
699 | text = text.replace('\\', '\\\\').replace('\n', '\\n')
700 | return text.replace('\r', '\\r')
701 |
702 | def decode (self, text):
703 | output = []
704 | i = 0
705 | if text is None:
706 | return None
707 | size = len(text)
708 | while i < size:
709 | c = text[i]
710 | if c == '\\':
711 | c = text[i + 1:i + 2]
712 | if c == '\\':
713 | output.append('\\')
714 | elif c == 'n':
715 | output.append('\n')
716 | elif c == 'r':
717 | output.append('\r')
718 | else:
719 | output.append('\\' + c)
720 | i += 2
721 | else:
722 | output.append(c)
723 | i += 1
724 | return ''.join(output)
725 |
726 | # safely convert text to an integer
727 | def readint (self, text):
728 | if text is None:
729 | return None
730 | if text == '':
731 | return 0
732 | try:
733 | x = long(text)
734 | except:
735 | return 0
736 | if x < 0x7fffffff:
737 | return int(x)
738 | return x
739 |
740 | # read the csv file
741 | def __read (self):
742 | self.reset()
743 | filename = self.__csvname
744 | if filename is None:
745 | return False
746 | if not os.path.exists(self.__csvname):
747 | return False
748 | codec = self.__codec
749 | if sys.version_info[0] < 3:
750 | fp = open(filename, 'rb')
751 | content = fp.read()
752 | if not isinstance(content, type(b'')):
753 | content = content.encode(codec, 'ignore')
754 | content = content.replace(b'\r\n', b'\n')
755 | bio = io.BytesIO()
756 | bio.write(content)
757 | bio.seek(0)
758 | reader = csv.reader(bio)
759 | else:
760 | reader = csv.reader(open(filename, encoding = codec))
761 | rows = []
762 | index = []
763 | words = {}
764 | count = 0
765 | for row in reader:
766 | count += 1
767 | if count == 1:
768 | continue
769 | if len(row) < 1:
770 | continue
771 | if sys.version_info[0] < 3:
772 | row = [ n.decode(codec, 'ignore') for n in row ]
773 | if len(row) < COLUMN_SIZE:
774 | row.extend([None] * (COLUMN_SIZE - len(row)))
775 | if len(row) > COLUMN_SIZE:
776 | row = row[:COLUMN_SIZE]
777 | word = row[0].lower()
778 | if word in words:
779 | continue
780 | row.extend([0, 0, stripword(row[0])])
781 | words[word] = 1
782 | rows.append(row)
783 | index.append(row)
784 | self.__rows = rows
785 | self.__index = index
786 | self.__rows.sort(key = lambda row: row[0].lower())
787 | self.__index.sort(key = lambda row: (row[COLUMN_SW], row[0].lower()))
788 | for index in xrange(len(self.__rows)):
789 | row = self.__rows[index]
790 | row[COLUMN_ID] = index
791 | word = row[0].lower()
792 | self.__words[word] = row
793 | for index in xrange(len(self.__index)):
794 | row = self.__index[index]
795 | row[COLUMN_SD] = index
796 | return True
797 |
798 | # save to file
799 | def save (self, filename = None, codec = 'utf-8'):
800 | if filename is None:
801 | filename = self.__csvname
802 | if filename is None:
803 | return False
804 | if sys.version_info[0] < 3:
805 | fp = open(filename, 'wb')
806 | writer = csv.writer(fp)
807 | else:
808 | fp = open(filename, 'w', encoding = codec, newline = '')
809 | writer = csv.writer(fp)
810 | writer.writerow(self.__heads)
811 | for row in self.__rows:
812 | newrow = []
813 | for n in row:
814 | if isinstance(n, int) or isinstance(n, long):
815 | n = str(n)
816 | elif not isinstance(n, bytes):
817 | if (n is not None) and sys.version_info[0] < 3:
818 | n = n.encode(codec, 'ignore')
819 | newrow.append(n)
820 | writer.writerow(newrow[:COLUMN_SIZE])
821 | fp.close()
822 | return True
823 |
824 | # decode a row into an object
825 | def __obj_decode (self, row):
826 | if row is None:
827 | return None
828 | obj = {}
829 | obj['id'] = row[COLUMN_ID]
830 | obj['sw'] = row[COLUMN_SW]
831 | skip = self.__numbers
832 | for key, index in self.__fields:
833 | value = row[index]
834 | if index in skip:
835 | if value is not None:
836 | value = self.readint(value)
837 | elif key != 'detail':
838 | value = self.decode(value)
839 | obj[key] = value
840 | detail = obj.get('detail', None)
841 | if detail is not None:
842 | if detail != '':
843 | detail = json.loads(detail)
844 | else:
845 | detail = None
846 | obj['detail'] = detail
847 | return obj
848 |
849 | # encode an object into a row
850 | def __obj_encode (self, obj):
851 | row = [ None for i in xrange(len(self.__fields) + 3) ]
852 | for name, idx in self.__fields:
853 | value = obj.get(name, None)
854 | if value is None:
855 | continue
856 | if idx in self.__numbers:
857 | value = str(value)
858 | elif name == 'detail':
859 | value = json.dumps(value, ensure_ascii = False)
860 | else:
861 | value = self.encode(value)
862 | row[idx] = value
863 | return row
864 |
865 | # re-sort rows and rebuild the internal ids
866 | def __resort (self):
867 | self.__rows.sort(key = lambda row: row[0].lower())
868 | self.__index.sort(key = lambda row: (row[COLUMN_SW], row[0].lower()))
869 | for index in xrange(len(self.__rows)):
870 | row = self.__rows[index]
871 | row[COLUMN_ID] = index
872 | for index in xrange(len(self.__index)):
873 | row = self.__index[index]
874 | row[COLUMN_SD] = index
875 | self.__dirty = False
876 |
877 | # word lookup
878 | def query (self, key):
879 | if key is None:
880 | return None
881 | if self.__dirty:
882 | self.__resort()
883 | if isinstance(key, int) or isinstance(key, long):
884 | if key < 0 or key >= len(self.__rows):
885 | return None
886 | return self.__obj_decode(self.__rows[key])
887 | row = self.__words.get(key.lower(), None)
888 | return self.__obj_decode(row)
889 |
890 | # match words close to the given spelling
891 | def match (self, word, count = 10, strip = False):
892 | if len(self.__rows) == 0:
893 | return []
894 | if self.__dirty:
895 | self.__resort()
896 | if not strip:
897 | index = self.__rows
898 | pos = 0
899 | else:
900 | index = self.__index
901 | pos = COLUMN_SW
902 | top = 0
903 | bottom = len(index) - 1
904 | middle = top
905 | key = word.lower()
906 | if strip:
907 | key = stripword(word)
908 | while top < bottom:
909 | middle = (top + bottom) >> 1
910 | if top == middle or bottom == middle:
911 | break
912 | text = index[middle][pos].lower()
913 | if key == text:
914 | break
915 | elif key < text:
916 | bottom = middle
917 | elif key > text:
918 | top = middle
919 | while index[middle][pos].lower() < key:
920 | middle += 1
921 | if middle >= len(index):
922 | break
923 | cc = COLUMN_ID
924 | likely = [ (tx[cc], tx[0]) for tx in index[middle:middle + count] ]
925 | return likely
926 |
927 | # batch query
928 | def query_batch (self, keys):
929 | return [ self.query(key) for key in keys ]
930 |
931 | # total number of words
932 | def count (self):
933 | return len(self.__rows)
934 |
935 | # number of entries
936 | def __len__ (self):
937 | return len(self.__rows)
938 |
939 | # word lookup
940 | def __getitem__ (self, key):
941 | return self.query(key)
942 |
943 | # existence test
944 | def __contains__ (self, key):
945 | return self.__words.__contains__(key.lower())
946 |
947 | # iterator
948 | def __iter__ (self):
949 | record = []
950 | for index in xrange(len(self.__rows)):
951 | record.append((index, self.__rows[index][0]))
952 | return record.__iter__()
953 |
954 | # register a new word
955 | def register (self, word, items, commit = True):
956 | if word.lower() in self.__words:
957 | return False
958 | row = self.__obj_encode(items)
959 | row[0] = word
960 | row[COLUMN_ID] = len(self.__rows)
961 | row[COLUMN_SD] = len(self.__rows)
962 | row[COLUMN_SW] = stripword(word)
963 | self.__rows.append(row)
964 | self.__index.append(row)
965 | self.__words[word.lower()] = row
966 | self.__dirty = True
967 | return True
968 |
969 | # remove a word
970 | def remove (self, key, commit = True):
971 | if isinstance(key, int) or isinstance(key, long):
972 | if key < 0 or key >= len(self.__rows):
973 | return False
974 | if self.__dirty:
975 | self.__resort()
976 | key = self.__rows[key][0]
977 | row = self.__words.get(key, None)
978 | if row is None:
979 | return False
980 | if len(self.__rows) == 1:
981 | self.reset()
982 | return True
983 | index = row[COLUMN_ID]
984 | self.__rows[index] = self.__rows[len(self.__rows) - 1]
985 | self.__rows.pop()
986 | index = row[COLUMN_SD]
987 | self.__index[index] = self.__index[len(self.__index) - 1]
988 | self.__index.pop()
989 | del self.__words[key]
990 | self.__dirty = True
991 | return True
992 |
993 | # clear everything
994 | def delete_all (self, reset_id = False):
995 | self.reset()
996 | return True
997 |
998 | # update a word
999 | def update (self, key, items, commit = True):
1000 | if isinstance(key, int) or isinstance(key, long):
1001 | if key < 0 or key >= len(self.__rows):
1002 | return False
1003 | if self.__dirty:
1004 | self.__resort()
1005 | key = self.__rows[key][0]
1006 | key = key.lower()
1007 | row = self.__words.get(key, None)
1008 | if row is None:
1009 | return False
1010 | newrow = self.__obj_encode(items)
1011 | for name, idx in self.__fields:
1012 | if idx == 0:
1013 | continue
1014 | if name in items:
1015 | row[idx] = newrow[idx]
1016 | return True
1017 |
1018 | # commit changes (write back to the csv file)
1019 | def commit (self):
1020 | if self.__csvname:
1021 | self.save(self.__csvname, self.__codec)
1022 | return True
1023 |
1024 | # dump all words
1025 | def dumps (self):
1026 | return [ n for _, n in self.__iter__() ]
1027 |
1028 |
1029 | #----------------------------------------------------------------------
1030 | # Lemma derivation: find a verb's tenses, a noun's plurals, etc., or do the
1031 | # reverse lookup. One record per line, in the form: stem -> derived1,derived2,derived3
1032 | # It can be generated from Hunspell data; a simplified version (about 18k groups):
1033 | # http://www.lexically.net/downloads/version4/downloading%20BNC.htm
1034 | #----------------------------------------------------------------------
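# Example line (the '/number' frequency suffix is optional):
#     take/813 -> takes,taking,took,taken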
1035 | class LemmaDB (object):
1036 |
1037 | def __init__ (self):
1038 | self._stems = {}
1039 | self._words = {}
1040 | self._frqs = {}
1041 |
1042 | # load a data file
1043 | def load (self, filename, encoding = None):
1044 | content = open(filename, 'rb').read()
1045 | if content[:3] == b'\xef\xbb\xbf':
1046 | text = content[3:].decode('utf-8', 'ignore')
1047 | elif encoding is not None:
1048 | text = content.decode(encoding, 'ignore')
1049 | else:
1050 | text = None
1051 | match = ['utf-8', sys.getdefaultencoding(), 'ascii']
1052 | for encoding in match + ['gbk', 'latin1']:
1053 | try:
1054 | text = content.decode(encoding)
1055 | break
1056 | except:
1057 | pass
1058 | if text is None:
1059 | text = content.decode('utf-8', 'ignore')
1060 | number = 0
1061 | for line in text.split('\n'):
1062 | number += 1
1063 | line = line.strip('\r\n ')
1064 | if (not line) or (line[:1] == ';'):
1065 | continue
1066 | pos = line.find('->')
1067 | if pos < 0:
1068 | continue
1069 | stem = line[:pos].strip()
1070 | p1 = stem.find('/')
1071 | frq = 0
1072 | if p1 >= 0:
1073 | frq = int(stem[p1 + 1:].strip())
1074 | stem = stem[:p1].strip()
1075 | if not stem:
1076 | continue
1077 | if frq > 0:
1078 | self._frqs[stem] = frq
1079 | for word in line[pos + 2:].strip().split(','):
1080 | p1 = word.find('/')
1081 | if p1 >= 0:
1082 | word = word[:p1].strip()
1083 | if not word:
1084 | continue
1085 | self.add(stem, word.strip())
1086 | return True
1087 |
1088 | # save the data file
1089 | def save (self, filename, encoding = 'utf-8'):
1090 | stems = list(self._stems.keys())
1091 | stems.sort(key = lambda x: x.lower())
1092 | import codecs
1093 | fp = codecs.open(filename, 'w', encoding)
1094 | output = []
1095 | for stem in stems:
1096 | words = self.get(stem)
1097 | if not words:
1098 | continue
1099 | frq = self._frqs.get(stem, 0)
1100 | if frq > 0:
1101 | stem = '%s/%d'%(stem, frq)
1102 | output.append((-frq, u'%s -> %s'%(stem, ','.join(words))))
1103 | output.sort()
1104 | for _, text in output:
1105 | fp.write(text + '\n')
1106 | fp.close()
1107 | return True
1108 |
1109 | # add one derived word for a stem
1110 | def add (self, stem, word):
1111 | if stem not in self._stems:
1112 | self._stems[stem] = {}
1113 | if word not in self._stems[stem]:
1114 | self._stems[stem][word] = len(self._stems[stem])
1115 | if word not in self._words:
1116 | self._words[word] = {}
1117 | if stem not in self._words[word]:
1118 | self._words[word][stem] = len(self._words[word])
1119 | return True
1120 |
1121 | # remove one derived word of a stem
1122 | def remove (self, stem, word):
1123 | count = 0
1124 | if stem in self._stems:
1125 | if word in self._stems[stem]:
1126 | del self._stems[stem][word]
1127 | count += 1
1128 | if not self._stems[stem]:
1129 | del self._stems[stem]
1130 | if word in self._words:
1131 | if stem in self._words[word]:
1132 | del self._words[word][stem]
1133 | count += 1
1134 | if not self._words[word]:
1135 | del self._words[word]
1136 | return count > 0
1137 |
1138 | # clear the database
1139 | def reset (self):
1140 | self._stems = {}
1141 | self._words = {}
1142 | return True
1143 |
1144 | # find derivations for a stem, or (reverse) find the stem of a derivation
1145 | def get (self, word, reverse = False):
1146 | if not reverse:
1147 | if word not in self._stems:
1148 | if word in self._words:
1149 | return [word]
1150 | return None
1151 | words = [ (v, k) for (k, v) in self._stems[word].items() ]
1152 | else:
1153 | if word not in self._words:
1154 | if word in self._stems:
1155 | return [word]
1156 | return None
1157 | words = [ (v, k) for (k, v) in self._words[word].items() ]
1158 | words.sort()
1159 | return [ k for (v, k) in words ]
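# e.g. with a typical lemma file: get('take') -> ['takes', 'taking', 'took', 'taken'],
#      and get('took', reverse = True) -> ['take']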
1160 |
1161 | # given a word, find its stem(s)
1162 | def word_stem (self, word):
1163 | return self.get(word, reverse = True)
1164 |
1165 | # number of stem records
1166 | def stem_size (self):
1167 | return len(self._stems)
1168 |
1169 | # number of derived-word records
1170 | def word_size (self):
1171 | return len(self._words)
1172 |
1173 | def dump (self, what = 'ALL'):
1174 | words = {}
1175 | what = what.lower()
1176 | if what in ('all', 'stem'):
1177 | for word in self._stems:
1178 | words[word] = 1
1179 | if what in ('all', 'word'):
1180 | for word in self._words:
1181 | words[word] = 1
1182 | return words
1183 |
1184 | def __len__ (self):
1185 | return len(self._stems)
1186 |
1187 | def __getitem__ (self, stem):
1188 | return self.get(stem)
1189 |
1190 | def __contains__ (self, stem):
1191 | return (stem in self._stems)
1192 |
1193 | def __iter__ (self):
1194 | return self._stems.__iter__()
1195 |
1196 |
1197 |
1198 | #----------------------------------------------------------------------
1199 | # DictHelper
1200 | #----------------------------------------------------------------------
1201 | class DictHelper (object):
1202 |
1203 | def __init__ (self):
1204 | self._exchanges = {}
1205 | self._exchanges['p'] = u'过去式'
1206 | self._exchanges['d'] = u'过去分词'
1207 | self._exchanges['i'] = u'现在分词'
1208 | self._exchanges['3'] = u'第三人称单数'
1209 | self._exchanges['r'] = u'比较级'
1210 | self._exchanges['t'] = u'最高级'
1211 | self._exchanges['s'] = u'复数'
1212 | self._exchanges['0'] = u'原型' # the lemma of "best" is "good"
1213 | self._exchanges['1'] = u'类别' # the class of "best" is the 't' within "good"
1214 | self._pos = {}
1215 | self._pos['a'] = (u'代词', 'pron.')
1216 | self._pos['c'] = (u'连接词', 'conj.')
1217 | self._pos['d'] = (u'限定词', 'determiner')
1218 | self._pos['i'] = (u'介词', 'prep.')
1219 | self._pos['j'] = (u'形容词', 'adj.')
1220 | self._pos['m'] = (u'数词', 'num.')
1221 | self._pos['n'] = (u'名词', 'n.')
1222 | self._pos['p'] = (u'代词', 'pron.')
1223 | self._pos['r'] = (u'副词', 'adv.')
1224 | self._pos['u'] = (u'感叹词', 'int.')
1225 | self._pos['t'] = (u'不定式标记', 'infm.')
1226 | self._pos['v'] = (u'动词', 'v.')
1227 | self._pos['x'] = (u'否定标记', 'not')
1228 |
1229 | # return a progress indicator: pass in the total, call next() once per step
1230 | def progress (self, total):
1231 | class ProgressIndicator (object):
1232 | def __init__ (self, total):
1233 | self.count = 0
1234 | self.percent = -1
1235 | self.total = total
1236 | self.timestamp = time.time()
1237 | self.counter = {}
1238 | def next (self):
1239 | if self.total:
1240 | self.count += 1
1241 | pc = int(self.count * 100 / self.total)
1242 | if pc != self.percent:
1243 | self.percent = pc
1244 | print('progress: %d%%'%pc)
1245 | def inc (self, name):
1246 | if name not in self.counter:
1247 | self.counter[name] = 1
1248 | else:
1249 | self.counter[name] += 1
1250 | def done (self):
1251 | t = (time.time() - self.timestamp)
1252 | keys = list(self.counter.keys())
1253 | keys.sort()
1254 | for key in keys:
1255 | print('[%s] -> %d'%(key, self.counter[key]))
1256 | print('[Finished in %d seconds (%d)]'%(t, self.count))
1257 | return ProgressIndicator(total)
1258 |
1259 | # return a map of all words in the dictionary, lowercased by default
1260 | def dump_map (self, dictionary, lower = True):
1261 | words = {}
1262 | for _, word in dictionary:
1263 | if lower:
1264 | word = word.lower()
1265 | words[word] = 1
1266 | return words
1267 |
1268 | # export dictionary discrepancies (words missing from the dictionary)
1269 | def discrepancy_export (self, dictionary, words, outname, opts = ''):
1270 | existence = self.dump_map(dictionary)
1271 | if os.path.splitext(outname)[-1].lower() in ('.txt', '.csv'):
1272 | db = DictCsv(outname)
1273 | else:
1274 | db = StarDict(outname)
1275 | db.delete_all()
1276 | count = 0
1277 | for word in words:
1278 | if word.lower() in existence:
1279 | continue
1280 | if '(' in word:
1281 | continue
1282 | if '/' in word:
1283 | continue
1284 | if '"' in word or '#' in word:
1285 | continue
1286 | if '0' in word or '1' in word or '2' in word or '3' in word:
1287 | continue
1288 | if 's' in opts:
1289 | if word.count(' ') >= 2:
1290 | continue
1291 | if 't' in opts:
1292 | if ' ' in word:
1293 | continue
1294 | if 'p' in opts:
1295 | if '-' in word:
1296 | continue
1297 | try:
1298 | word.encode('ascii')
1299 | except:
1300 | continue
1301 | db.register(word, {'tag':'PENDING'}, False)
1302 | count += 1
1303 | db.commit()
1304 | print('exported %d entries'%count)
1305 | return count
1306 |
1307 | # import dictionary discrepancies
1308 | def discrepancy_import (self, dictionary, filename, opts = ''):
1309 | existence = self.dump_map(dictionary)
1310 | if os.path.splitext(filename)[-1].lower() in ('.csv', '.txt'):
1311 | db = DictCsv(filename)
1312 | else:
1313 | db = StarDict(filename)
1314 | count = 0
1315 | for word in self.dump_map(db, False):
1316 | data = db[word]
1317 | if data is None:
1318 | continue
1319 | if data['tag'] != 'OK':
1320 | continue
1321 | phonetic = data.get('phonetic', '')
1322 | definition = data.get('definition', '')
1323 | translation = data.get('translation', '')
1324 | update = {}
1325 | if phonetic:
1326 | update['phonetic'] = phonetic
1327 | if definition:
1328 | update['definition'] = definition
1329 | if translation:
1330 | update['translation'] = translation
1331 | if not update:
1332 | continue
1333 | if word.lower() in existence:
1334 | if 'n' not in opts:
1335 | dictionary.update(word, update, False)
1336 | else:
1337 | dictionary.register(word, update, False)
1338 | count += 1
1339 | dictionary.commit()
1340 | print('imported %d entries'%count)
1341 | return count
1342 |
1343 | # discrepancy check against a utf-8 .txt file (word, then tab, then phonetic/definition)
1344 | def deficit_tab_txt (self, dictionary, txt, outname, opts = ''):
1345 | deficit = {}
1346 | for line in codecs.open(txt, encoding = 'utf-8'):
1347 | row = [ n.strip() for n in line.split('\t') ]
1348 | if len(row) < 2:
1349 | continue
1350 | word = row[0]
1351 | deficit[word] = 1
1352 | return self.discrepancy_export(dictionary, deficit, outname, opts)
1353 |
1354 | # export a StarDict dictionary from a word -> definition map
1355 | def export_stardict (self, wordmap, outname, title):
1356 | mainname = os.path.splitext(outname)[0]
1357 | keys = [ k for k in wordmap ]
1358 | keys.sort(key = lambda x: (x.lower(), x))
1359 | import struct
1360 | pc = self.progress(len(wordmap))
1361 | position = 0
1362 | with open(mainname + '.idx', 'wb') as f1:
1363 | with open(mainname + '.dict', 'wb') as f2:
1364 | for word in keys:
1365 | pc.next()
1366 | f1.write(word.encode('utf-8', 'ignore') + b'\x00')
1367 | text = wordmap[word].encode('utf-8', 'ignore')
1368 | f1.write(struct.pack('>II', position, len(text)))
1369 | f2.write(text)
1370 | position += len(text)
1371 | with codecs.open(mainname + '.ifo', 'w', 'utf-8') as f3:
1372 | f3.write("StarDict's dict ifo file\nversion=2.4.2\n")
1373 | f3.write('wordcount=%d\n'%len(wordmap))
1374 | f3.write('idxfilesize=%d\n'%os.path.getsize(mainname + '.idx'))
1375 | f3.write('bookname=%s\n'%title)
1376 | f3.write('author=\ndescription=\n')
1377 | import datetime
1378 | ts = datetime.datetime.now().strftime('%Y.%m.%d')
1379 | f3.write('date=%s\nsametypesequence=m\n'%ts)
1380 | pc.done()
1381 | return True
1382 |
1383 | # export a mdict source file
1384 | def export_mdict (self, wordmap, outname):
1385 | keys = [ k for k in wordmap ]
1386 | keys.sort(key = lambda x: x.lower())
1387 | size = len(keys)
1388 | index = 0
1389 | pc = self.progress(size)
1390 | with codecs.open(outname, 'w', encoding = 'utf-8') as fp:
1391 | for key in keys:
1392 | pc.next()
1393 | word = key.replace('>', '').replace('\n', ' ')
1394 | text = wordmap[key].replace('>', '')
1395 | if not isinstance(word, unicode):
1396 | word = word.decode('gbk')
1397 | if not isinstance(text, unicode):
1398 | text = text.decode('gbk')
1399 | fp.write(word + '\r\n')
1400 | for line in text.split('\n'):
1401 | line = line.rstrip('\r')
1402 | fp.write(line)
1403 | fp.write('\r\n')
1404 | index += 1
1405 | fp.write('>' + ((index < size) and '\r\n' or ''))
1406 | pc.done()
1407 | return True
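# The emitted layout is: headword line, body lines, then a line holding '>'
# between entries -- the same format import_mdict() below reads back.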
1408 |
1409 | # import a mdx source file
1410 | def import_mdict (self, filename, encoding = 'utf-8'):
1411 | import codecs
1412 | words = {}
1413 | with codecs.open(filename, 'r', encoding = encoding) as fp:
1414 | text = []
1415 | word = None
1416 | for line in fp:
1417 | line = line.rstrip('\r\n')
1418 | if word is None:
1419 | if line == '':
1420 | continue
1421 | else:
1422 | word = line.strip()
1423 | elif line.strip() != '>':
1424 | text.append(line)
1425 | else:
1426 | words[word] = '\n'.join(text)
1427 | word = None
1428 | text = []
1429 | return words
1430 |
1431 | # generate a .mdx file directly; requires the writemdict module:
1432 | # https://github.com/skywind3000/writemdict
1433 | def export_mdx (self, wordmap, outname, title, desc = None):
1434 | try:
1435 | import writemdict
1436 | except ImportError:
1437 | print('ERROR: can\'t import writemdict module, please install it:')
1438 | print('https://github.com/skywind3000/writemdict')
1439 | sys.exit(1)
1440 | if desc is None:
1441 | desc = u'Create by stardict.py'
1442 | writer = writemdict.MDictWriter(wordmap, title = title,
1443 | description = desc)
1444 | with open(outname, 'wb') as fp:
1445 | writer.write(fp)
1446 | return True
1447 |
1448 | # read a .mdx file; requires the readmdict module:
1449 | # https://github.com/skywind3000/writemdict (includes readmdict)
1450 | def read_mdx (self, mdxname, mdd = False):
1451 | try:
1452 | import readmdict
1453 | except ImportError:
1454 | print('ERROR: can\'t import readmdict module, please install it:')
1455 | print('https://github.com/skywind3000/writemdict')
1456 | sys.exit(1)
1457 | words = {}
1458 | if not mdd:
1459 | mdx = readmdict.MDX(mdxname)
1460 | else:
1461 | mdx = readmdict.MDD(mdxname)
1462 | for key, value in mdx.items():
1463 | key = key.decode('utf-8', 'ignore')
1464 | if not mdd:
1465 | words[key] = value.decode('utf-8', 'ignore')
1466 | else:
1467 | words[key] = value
1468 | return words
1469 |
1470 | # dump a word-form exchange dict into its string form
1471 | def exchange_dumps (self, obj):
1472 | part = []
1473 | if not obj:
1474 | return None
1475 | for k, v in obj.items():
1476 | k = k.replace('/', '').replace(':', '').strip()
1477 | v = v.replace('/', '').replace(':', '').strip()
1478 | part.append(k + ':' + v)
1479 | return '/'.join(part)
1480 |
1481 | # parse a word-form exchange string
1482 | def exchange_loads (self, exchg):
1483 | if not exchg:
1484 | return None
1485 | obj = {}
1486 | for text in exchg.split('/'):
1487 | pos = text.find(':')
1488 | if pos < 0:
1489 | continue
1490 | k = text[:pos].strip()
1491 | v = text[pos + 1:].strip()
1492 | obj[k] = v
1493 | return obj
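# e.g. exchange_loads('p:took/d:taken/3:takes')
#      -> {'p': 'took', 'd': 'taken', '3': 'takes'}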
1494 |
1495 | def pos_loads (self, pos):
1496 | return self.exchange_loads(pos)
1497 |
1498 | def pos_dumps (self, obj):
1499 | return self.exchange_dumps(obj)
1500 |
1501 | # return the part of speech
1502 | def pos_detect (self, word, pos):
1503 | word = word.lower()
1504 | if pos == 'a':
1505 | if word in ('a', 'the',):
1506 | return (u'冠词', 'art.')
1507 | if word in ('no', 'every'):
1508 | return (u'形容词', 'adj.')
1509 | return (u'代词', 'pron.')
1510 | if pos in self._pos:
1511 | return self._pos[pos]
1512 | return (u'未知', 'unknown')
1513 |
1514 | # return the proportions of each part of speech
1515 | def pos_extract (self, data):
1516 | if 'pos' not in data:
1517 | return None
1518 | position = data['pos']
1519 | if not position:
1520 | return None
1521 | part = self.pos_loads(position)
1522 | result = []
1523 | for x in part:
1524 | result.append((x, part[x]))
1525 | result.sort(reverse = True, key = lambda t: int(t[1]))
1526 | final = []
1527 | for pos, num in result:
1528 | mode = self.pos_detect(data['word'], pos)
1529 | final.append((mode, num))
1530 | return final
1531 |
1532 | # set a detail item; None means delete it
1533 | def set_detail (self, dictionary, word, item, value, create = False):
1534 | data = dictionary.query(word)
1535 | if data is None:
1536 | if not create:
1537 | return False
1538 | dictionary.register(word, {}, False)
1539 | data = {}
1540 | detail = data.get('detail')
1541 | if not detail:
1542 | detail = {}
1543 | if value is not None:
1544 | detail[item] = value
1545 | elif item in detail:
1546 | del detail[item]
1547 | if not detail:
1548 | detail = None
1549 | dictionary.update(word, {'detail': detail}, False)
1550 | return True
1551 |
1552 | # get a detail item
1553 | def get_detail (self, dictionary, word, item):
1554 | data = dictionary.query(word)
1555 | if not data:
1556 | return None
1557 | detail = data.get('detail')
1558 | if not detail:
1559 | return None
1560 | return detail.get(item, None)
1561 |
1562 | # load file and guess encoding
1563 | def load_text (self, filename, encoding = None):
1564 | content = None
1565 | try:
1566 | content = open(filename, 'rb').read()
1567 | except:
1568 | return None
1569 | if content[:3] == b'\xef\xbb\xbf':
1570 | text = content[3:].decode('utf-8')
1571 | elif encoding is not None:
1572 | text = content.decode(encoding, 'ignore')
1573 | else:
1574 | text = None
1575 | guess = [sys.getdefaultencoding(), 'utf-8']
1576 | if sys.stdout and sys.stdout.encoding:
1577 | guess.append(sys.stdout.encoding)
1578 | for name in guess + ['gbk', 'ascii', 'latin1']:
1579 | try:
1580 | text = content.decode(name)
1581 | break
1582 | except:
1583 | pass
1584 | if text is None:
1585 | text = content.decode('utf-8', 'ignore')
1586 | return text
1587 |
1588 | # read csv with automatic encoding detection
1589 | def csv_load (self, filename, encoding = None):
1590 | text = self.load_text(filename, encoding)
1591 | if not text:
1592 | return None
1593 | import csv
1594 | if sys.version_info[0] < 3:
1595 | import cStringIO
1596 | sio = cStringIO.StringIO(text.encode('utf-8', 'ignore'))
1597 | else:
1598 | import io
1599 | sio = io.StringIO(text)
1600 | reader = csv.reader(sio)
1601 | output = []
1602 | if sys.version_info[0] < 3:
1603 | for row in reader:
1604 | output.append([ n.decode('utf-8', 'ignore') for n in row ])
1605 | else:
1606 | for row in reader:
1607 | output.append(row)
1608 | return output
1609 |
1610 | # save csv with an optional encoding
1611 | def csv_save (self, filename, rows, encoding = 'utf-8'):
1612 | import csv
1613 | ispy2 = (sys.version_info[0] < 3)
1614 | if not encoding:
1615 | encoding = 'utf-8'
1616 | if sys.version_info[0] < 3:
1617 | fp = open(filename, 'wb')
1618 | writer = csv.writer(fp)
1619 | else:
1620 | fp = open(filename, 'w', encoding = encoding, newline = '')
1621 | writer = csv.writer(fp)
1622 | for row in rows:
1623 | newrow = []
1624 | for n in row:
1625 | if isinstance(n, int) or isinstance(n, long):
1626 | n = str(n)
1627 | elif isinstance(n, float):
1628 | n = str(n)
1629 | elif not isinstance(n, bytes):
1630 | if (n is not None) and ispy2:
1631 | n = n.encode(encoding, 'ignore')
1632 | newrow.append(n)
1633 | writer.writerow(newrow)
1634 | fp.close()
1635 | return True
1636 |
1637 | # load a tab-separated txt file, returning key -> value
1638 | def tab_txt_load (self, filename, encoding = None):
1639 | words = {}
1640 | content = self.load_text(filename, encoding)
1641 | if content is None:
1642 | return None
1643 | for line in content.split('\n'):
1644 | line = line.strip('\r\n\t ')
1645 | if not line:
1646 | continue
1647 | p1 = line.find('\t')
1648 | if p1 < 0:
1649 | continue
1650 | word = line[:p1].rstrip('\r\n\t ')
1651 | text = line[p1:].lstrip('\r\n\t ')
1652 | text = text.replace('\\n', '\n').replace('\\r', '\r')
1653 | words[word] = text.replace('\\t', '\t').replace('\\\\', '\\')
1654 | return words
1655 |
1656 | # save a tab-separated txt file
1657 | def tab_txt_save (self, filename, words, encoding = 'utf-8'):
1658 | with codecs.open(filename, 'w', encoding = encoding) as fp:
1659 | for word in words:
1660 | text = words[word]
1661 | text = text.replace('\\', '\\\\').replace('\n', '\\n')
1662 | text = text.replace('\r', '\\r').replace('\t', '\\t')
1663 | fp.write('%s\t%s\r\n'%(word, text))
1664 | return True
1665 |
1666 | # import definitions from a tab-separated txt file
1667 | def tab_txt_import (self, dictionary, filename):
1668 | words = self.tab_txt_load(filename)
1669 | if not words:
1670 | return False
1671 | pc = self.progress(len(words))
1672 | for word in words:
1673 | data = dictionary.query(word)
1674 | if not data:
1675 | dictionary.register(word, {'translation':words[word]}, False)
1676 | else:
1677 | dictionary.update(word, {'translation':words[word]}, False)
1678 | pc.inc(0)
1679 | pc.next()
1680 | dictionary.commit()
1681 | pc.done()
1682 | return True
1683 |
1684 | # mdx builder: use writemdict instead of MdxBuilder for larger dictionaries (needs 64-bit python)
1685 | def mdx_build (self, srcname, outname, title, desc = None):
1686 | print('loading %s'%srcname)
1687 | t = time.time()
1688 | words = self.import_mdict(srcname)
1689 | t = time.time() - t
1690 | print(u'%d records loaded in %.3f seconds'%(len(words), t))
1691 | print(u'building %s'%outname)
1692 | t = time.time()
1693 | self.export_mdx(words, outname, title, desc)
1694 | t = time.time() - t
1695 | print(u'complete in %.3f seconds'%t)
1696 | return True
1697 |
1698 | # validate that a word is well-formed
1699 | def validate_word (self, word, asc128):
1700 | alpha = 0
1701 | for ch in word:
1702 | if ch.isalpha():
1703 | alpha += 1
1704 | if ord(ch) >= 128 and asc128:
1705 | return False
1706 | elif (not ch.isalpha()) and (not ch.isdigit()):
1707 | if ch not in ('-', '\'', '/', '(', ')', ' ', ',', '.'):
1708 | if ch not in ('&', '!', '?', '_'):
1709 | if len(word) == 5 and word[2] == ';':
1710 | continue
1711 | if not ord(ch) in (239, 65292):
1712 | # print 'f1', ord(ch), word.find(ch)
1713 | return False
1714 | if alpha == 0:
1715 | if not word.isdigit():
1716 | return False
1717 | if word[:1] == '"' and word[-1:] == '"':
1718 | return False
1719 | if word[:1] == '(' and word[-1:] == ')':
1720 | if word.count('(') == 1:
1721 | return False
1722 | if word[:3] == '(-)':
1723 | return False
1724 | for ch in ('<', '>', '%', '*', '@', '`'):
1725 | if ch in word:
1726 | return False
1727 | if '%' in word or '\\' in word or '`' in word:
1728 | return False
1729 | if word[:1] in ('$', '@'):
1730 | return False
1731 | if len(word) == 1:
1732 | x = ord(word)
1733 | if (x < ord('a')) or (x > ord('z')):
1734 | if (x < ord('A')) or (x > ord('Z')):
1735 | return False
1736 | if (' ' not in word) and ('-' not in word):
1737 | if ('?' in word) or ('!' in word):
1738 | return False
1739 | if word.count('?') >= 2:
1740 | return False
1741 | if word.count('!') >= 2:
1742 | return False
1743 | if '---' in word:
1744 | return False
1745 | try:
1746 | word.lower()
1747 | except UnicodeWarning:
1748 | return False
1749 | return True
1750 |
1751 |
1752 | #----------------------------------------------------------------------
1753 | # Helper instance
1754 | #----------------------------------------------------------------------
1755 | tools = DictHelper()
1756 |
1757 | # pick the database type from the file name and open it
1758 | def open_dict(filename):
1759 | if isinstance(filename, dict):
1760 | return DictMySQL(filename)
1761 | if filename[:8] == 'mysql://':
1762 | return DictMySQL(filename)
1763 | if os.path.splitext(filename)[-1].lower() in ('.csv', '.txt'):
1764 | return DictCsv(filename)
1765 | return StarDict(filename)
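# Usage sketch -- the file name picks the backend:
#     open_dict('ecdict.csv')                -> DictCsv
#     open_dict('stardict.db')               -> StarDict (sqlite3)
#     open_dict('mysql://user:pw@host/db')   -> DictMySQL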
1766 |
1767 |
1768 | # dictionary conversion: convert between csv and sqlite
1769 | def convert_dict(dstname, srcname):
1770 | dst = open_dict(dstname)
1771 | src = open_dict(srcname)
1772 | dst.delete_all()
1773 | pc = tools.progress(len(src))
1774 | for word in src.dumps():
1775 | pc.next()
1776 | data = src[word]
1777 | x = data['oxford']
1778 | if isinstance(x, int) or isinstance(x, long):
1779 | if x <= 0:
1780 | data['oxford'] = None
1781 | elif isinstance(x, str) or isinstance(x, unicode):
1782 | if x == '' or x == '0':
1783 | data['oxford'] = None
1784 | x = data['collins']
1785 | if isinstance(x, int) or isinstance(x, long):
1786 | if x <= 0:
1787 | data['collins'] = None
1788 | elif isinstance(x, str) or isinstance(x, unicode):
1789 | if x in ('', '0'):
1790 | data['collins'] = None
1791 | dst.register(word, data, False)
1792 | dst.commit()
1793 | pc.done()
1794 | return True
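# e.g. convert_dict('ecdict.db', 'ecdict.csv') rebuilds the sqlite dictionary
# from the csv edition (destination first, source second).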
1795 |
1796 |
1797 | # open a dictionary under ~/.local/share/stardict
1798 | def open_local(filename):
1799 | base = os.path.expanduser('~/.local')
1800 | for dir in [base, base + '/share', base + '/share/stardict']:
1801 | if not os.path.exists(dir):
1802 | os.mkdir(dir)
1803 | fn = os.path.join(base + '/share/stardict', filename)
1804 | return open_dict(fn)
1805 |
1806 |
1807 |
1808 |
1809 | #----------------------------------------------------------------------
1810 | # testing
1811 | #----------------------------------------------------------------------
1812 | if __name__ == '__main__':
1813 | db = os.path.join(os.path.dirname(__file__), 'test.db')
1814 | my = {'host':'??', 'user':'skywind', 'passwd':'??', 'db':'skywind_t1'}
1815 | def test1():
1816 | t = time.time()
1817 | sd = StarDict(db, False)
1818 | print(time.time() - t)
1819 | # sd.delete_all(True)
1820 | print(sd.register('kiss2', {'definition':'kiss me'}, False))
1821 | print(sd.register('kiss here', {'definition':'kiss me'}, False))
1822 | print(sd.register('Kiss', {'definition':'BIG KISS'}, False))
1823 | print(sd.register('kiss', {'definition':'kiss me'}, False))
1824 | print(sd.register('suck', {'definition':'suck me'}, False))
1825 | print(sd.register('give', {'definition':'give me', 'detail':[1,2,3]}, False))
1826 | sd.commit()
1827 | print('')
1828 | print(sd.count())
1829 | print(sd.query('kiSs'))
1830 | print(sd.query(2))
1831 | print(sd.match('kis', 10))
1832 | print('')
1833 | print(sd.query_batch(['give', 2]))
1834 | print(sd.match('kisshere', 10, True))
1835 | return 0
1836 | def test2():
1837 | t = time.time()
1838 | dm = DictMySQL(my, init = True)
1839 | print(time.time() - t)
1840 | # dm.delete_all(True)
1841 | print(dm.register('kiss2', {'definition':'kiss me'}, False))
1842 | print(dm.register('kiss here', {'definition':'kiss me'}, False))
1843 | print(dm.register('Kiss', {'definition':'kiss me'}, False))
1844 | print(dm.register('kiss', {'definition':'BIG KISS'}, False))
1845 | print(dm.register('suck', {'definition':'suck me'}, False))
1846 | print(dm.register('give', {'definition':'give me'}, False))
1847 | print(dm.query('kiss'))
1848 | print(dm.match('kis'))
1849 | print('')
1850 | print(dm.query('KiSs'))
1851 | print(dm.query_batch(['give', 2, 9]))
1852 | print('count: %d'%len(dm))
1853 | print(dm.match('kisshere', 10, True))
1854 | return 0
1855 | def test3():
1856 | csvname = os.path.join(os.path.dirname(__file__), 'test.csv')
1857 | dc = DictCsv(csvname)
1858 | dc.delete_all()
1859 | print(dc.register('kiss2', {'definition':'kiss me'}, False))
1860 | print(dc.register('kiss here', {'definition':'kiss me'}, False))
1861 | print(dc.register('Kiss', {'definition':'kiss me'}, False))
1862 | print(dc.register('kiss', {'definition':'kiss me'}, False))
1863 | print(dc.register('suck', {'definition':'suck me'}, False))
1864 | print(dc.register('word', {'definition':'WORD WORD'}, False))
1865 | print(dc.query('kiss'))
1866 | print('')
1867 | dc.remove('kiss2')
1868 | print(dc.match('kis'))
1869 | print(dc.match('kisshere', 10, True))
1870 | dc.commit()
1871 | return 0
1872 | def test4():
1873 | lemma = LemmaDB()
1874 | t = time.time()
1875 | lemma.load('lemma.en.txt')
1876 | print('load in %s seconds'%str(time.time() - t))
1877 | print(len(lemma))
1878 | for word in ('be', 'give', 'see', 'take'):
1879 | print('%s -> %s'%(word, ','.join(lemma.get(word))))
1880 | for word in ('gave', 'taken', 'looked', 'teeth', 'speak'):
1881 | print('%s <- %s'%(word, ','.join(lemma.word_stem(word))))
1882 | lemma.save('output.txt')
1883 | return 0
1884 | def test5():
1885 | print(tools.validate_word('Hello World', False))
1886 | test3()
1887 |
1888 |
1889 |
1890 |
--------------------------------------------------------------------------------