├── anki_packager
│   ├── dict
│   │   ├── __init__.py
│   │   ├── longman.py
│   │   ├── eudic.py
│   │   ├── ecdict.py
│   │   ├── youdao.py
│   │   └── stardict.py
│   ├── __main__.py
│   ├── __init__.py
│   ├── logger.py
│   ├── prompt.py
│   ├── utils.py
│   ├── ai.py
│   ├── packager
│   │   └── deck.py
│   └── cli.py
├── config
│   ├── vocabulary.txt
│   └── config.toml
├── test.apkg
├── dicts
│   ├── 有道词语辨析.mdx
│   └── 单词释义比例词典-带词性.mdx
├── images
│   ├── apkg.png
│   └── 卡片预览.png
├── publish.sh
├── MANIFEST.in
├── Dockerfile
├── LICENSE
├── setup.py
├── Makefile
├── requirements.txt
├── README.md
└── .gitignore
/anki_packager/dict/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/anki_packager/dict/longman.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/config/vocabulary.txt:
--------------------------------------------------------------------------------
1 | reform
2 | open
--------------------------------------------------------------------------------
/test.apkg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yaoyhu/anki_packager/HEAD/test.apkg
--------------------------------------------------------------------------------
/dicts/有道词语辨析.mdx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yaoyhu/anki_packager/HEAD/dicts/有道词语辨析.mdx
--------------------------------------------------------------------------------
/images/apkg.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yaoyhu/anki_packager/HEAD/images/apkg.png
--------------------------------------------------------------------------------
/images/卡片预览.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yaoyhu/anki_packager/HEAD/images/卡片预览.png
--------------------------------------------------------------------------------
/dicts/单词释义比例词典-带词性.mdx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yaoyhu/anki_packager/HEAD/dicts/单词释义比例词典-带词性.mdx
--------------------------------------------------------------------------------
/anki_packager/__main__.py:
--------------------------------------------------------------------------------
1 | import asyncio
2 | from .cli import main
3 |
4 |
5 | def run():
6 |     """Synchronous wrapper so the console-script entry point can drive the async main()."""
7 |     asyncio.run(main())
8 |
9 |
10 | if __name__ == "__main__":
11 |     run()
12 |
--------------------------------------------------------------------------------
/anki_packager/__init__.py:
--------------------------------------------------------------------------------
1 | __version__ = "0.9.5"
2 |
3 | from .utils import initialize_config
4 |
5 | try:
6 | initialize_config()
7 | except Exception as e:
8 | import sys
9 |
10 | print(f"Warning: Unable to initialize configuration: {e}", file=sys.stderr)
11 |
--------------------------------------------------------------------------------
/publish.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Clean up previous builds
4 | rm -rf build/ dist/ *.egg-info/
5 |
6 | # Build source and wheel distributions
7 | python -m build
8 |
9 | # Upload to PyPI
10 | # Uncomment when ready to publish:
11 | # twine upload dist/*
12 |
13 | echo -e "Build completed. To publish to PyPI, run: twine upload dist/*"
14 |
--------------------------------------------------------------------------------
/config/config.toml:
--------------------------------------------------------------------------------
1 | PROXY = ""
2 | EUDIC_TOKEN = ""
3 | EUDIC_ID = "0"
4 | DECK_NAME = "anki_packager"
5 |
6 | [[MODEL_PARAM]]
7 | model = "gemini/gemini-2.5-flash"
8 | api_key = "GEMINI_API_KEY"
9 | rpm = 10 # requests per minute
10 |
11 | # [[MODEL_PARAM]]
12 | # model = "openai/gpt-4o"
13 | # api_key = "OPENAI_API_KEY"
14 | # api_base = "YOUR_API_BASE"
15 | # rpm = 200
16 |
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include LICENSE
2 | include README.md
3 | include requirements.txt
4 | recursive-include anki_packager/packager *.py
5 | global-exclude *.py[cod] __pycache__ *.so
6 | global-exclude */__pycache__/*
7 | exclude anki_packager.log
8 | exclude *.pyc
9 | exclude __pycache__
10 | exclude *.apkg
11 | # exclude all dictionary data files
12 | exclude anki_packager/dicts/*.mdx
13 | exclude anki_packager/dicts/*.7z
14 | exclude anki_packager/dicts/*.db
15 | exclude anki_packager/dicts/*.csv
16 |
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | # cli.py uses the stdlib tomllib, so Python >= 3.11 is required
2 | # If you need a proxy/mirror: FROM hub.icert.top/python:3.11-slim
3 |
4 | FROM python:3.11-slim
5 |
6 | RUN apt-get update && apt-get install -y --no-install-recommends \
7 | gcc \
8 | build-essential \
9 | libffi-dev \
10 | && rm -rf /var/lib/apt/lists/*
11 |
12 | WORKDIR /app
13 |
14 | COPY requirements.txt ./
15 |
16 | RUN pip install --no-cache-dir -r requirements.txt
17 |
18 | COPY . .
19 |
20 | RUN mkdir -p config dicts
21 |
22 | ENV PYTHONUNBUFFERED=1
23 |
24 | ENTRYPOINT ["python", "-m", "anki_packager", "--disable_ai"]
25 | CMD []
26 |
--------------------------------------------------------------------------------
/anki_packager/logger.py:
--------------------------------------------------------------------------------
1 | import logging
2 |
3 | # ANSI escape codes for bold blue
4 | BOLD_BLUE = "\033[1;34m"
5 | RESET = "\033[0m"
6 |
7 | logging.basicConfig(
8 | level=logging.INFO,
9 | format=f"{BOLD_BLUE}[%(filename)s:%(lineno)d:%(funcName)s]{RESET} %(message)s",
10 | handlers=[logging.FileHandler("anki_packager.log"), logging.StreamHandler()],
11 | )
12 |
13 | litellm_logger = logging.getLogger("LiteLLM")
14 | litellm_logger.setLevel(logging.WARNING)
15 | litellm_logger = logging.getLogger("LiteLLM Router")
16 | litellm_logger.setLevel(logging.WARNING)
17 |
18 | logging.getLogger("httpx").setLevel(logging.WARNING)
19 | logger = logging.getLogger(__name__)
20 |
--------------------------------------------------------------------------------
/anki_packager/prompt.py:
--------------------------------------------------------------------------------
1 | PROMPT = """
2 | 你是一名中英文双语教育专家,拥有帮助将中文视为母语的用户理解和记忆英语单词的专长,请根据用户提供的英语单词,用中文且仅用 json 格式回复:
3 | {
4 | "word": "用户提供的单词",
5 | "origin": {
6 | "etymology": "<详细介绍单词的造词来源和发展历史,以及在欧美文化中的内涵>",
7 | "mnemonic": {
8 | "associative": "联想记忆:<提供一个联想记忆,帮助用户记住单词的含义>",
9 | "homophone": "谐音记忆:<提供一个谐音记忆,帮助用户记住单词的拼写>"
10 | }
11 | },
12 | "tenses": "<按照以下格式(如果存在)列出词形变化:'v. 动词原形, 过去式, 过去分词, 现在分词; adj. 形容词形式; n. 名词形式; adv. 副词形式'...>",
13 | "story": {
14 | "english": "<用英文撰写一个有画面感的场景故事。要求:1. 必须包含目标单词;2. 使用简单易懂的词汇;3. 长度在80-100个单词之间;4. 突出目标单词的使用场景>",
15 | "chinese": "<故事的中文翻译,保持与英文版本一致的语气和画面感>"
16 | }
17 | }
18 |
19 | 注意事项:
20 | 1. 在 tenses 中,只填写客观存在的词形,不要随意捏造或添加不存在的词形。并且只包含英文,不要加入中文注解。
21 | """
22 |
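23 | # For reference, a well-formed reply for "abandon" should have this shape
24 | # (the values here are illustrative placeholders, not real model output):
25 | #
26 | # {
27 | #   "word": "abandon",
28 | #   "origin": {
29 | #     "etymology": "...",
30 | #     "mnemonic": {"associative": "联想记忆:...", "homophone": "谐音记忆:..."}
31 | #   },
32 | #   "tenses": "v. abandon, abandoned, abandoned, abandoning; n. abandonment",
33 | #   "story": {"english": "...", "chinese": "..."}
34 | # }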
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2025 Yaoyao Hu
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup, find_packages
2 | from anki_packager import __version__
3 |
4 | setup(
5 | name="apkger",
6 | version=__version__,
7 | author="Yaoyao Hu",
8 | author_email="shady030314@gmail.com",
9 | description="自动化 Anki 英语单词卡片牌组生成器",
10 | long_description=open("README.md", encoding="utf-8").read(),
11 | long_description_content_type="text/markdown",
12 | url="https://github.com/yaoyhu/anki_packager",
13 | packages=find_packages(
14 | exclude=["*.pyc", "*.pyo", "__pycache__", "*.__pycache__*"]
15 | ),
16 | include_package_data=True,
17 | classifiers=[
18 | "Programming Language :: Python :: 3",
19 | "Programming Language :: Python :: 3.9",
20 | "License :: OSI Approved :: MIT License",
21 | "Operating System :: OS Independent",
22 | "Topic :: Education",
23 | "Topic :: Text Processing :: Linguistic",
24 | "Development Status :: 4 - Beta",
25 | ],
26 | python_requires=">=3.9",
27 | install_requires=open("requirements.txt").read().splitlines(),
28 | entry_points={
29 | "console_scripts": [
30 | "apkger=anki_packager.__main__:main",
31 | ],
32 | },
33 | )
34 |
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | # Detect operating system
2 | ifeq ($(OS),Windows_NT)
3 | SHELL := powershell.exe
4 | .SHELLFLAGS := -NoProfile -Command
5 | RM = Remove-Item -Force -Recurse
6 | FOLDER_SET = $$env:FOLDER="$$(Get-Location)"
7 | else
8 | SHELL := /bin/bash
9 | RM = rm -rf
10 | FOLDER_SET = export FOLDER=$$(pwd)
11 | endif
12 |
13 | IMAGE_NAME = apkger
14 | CONTAINER_NAME = apkger
15 | VOLUME_NAME = apkger-dicts
16 |
17 | .PHONY: build run shell clean help
18 |
19 | # Build Docker image and create persistent volume
20 | build:
21 | docker build -t $(IMAGE_NAME) .
22 | docker volume create $(VOLUME_NAME)
23 |
24 | run:
25 | docker run --rm \
26 | --name $(CONTAINER_NAME) \
27 | -v $(VOLUME_NAME):/app/dicts \
28 | $(IMAGE_NAME)
29 |
30 | # Enter shell in container with volume mounted
31 | shell:
32 | docker run -it --rm \
33 | --name $(CONTAINER_NAME) \
34 | -v $(VOLUME_NAME):/app/dicts \
35 | -v $(shell pwd)/config:/app/config \
36 | -v $(shell pwd):/app \
37 | --entrypoint /bin/bash \
38 | $(IMAGE_NAME)
39 |
40 | clean:
41 | -docker rmi $(IMAGE_NAME)
42 | -docker volume rm $(VOLUME_NAME)
43 |
44 | help:
45 | @echo "Available targets:"
46 | @echo " build - Build Docker image and create persistent volume"
47 | @echo " run - Run container with mounted current directory"
48 | @echo " shell - Enter shell in container with volume mounted"
49 | @echo " clean - Remove container and image"
50 | @echo " help - Show this help message"
51 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | aiohappyeyeballs==2.6.1
2 | aiohttp==3.12.15
3 | aiosignal==1.4.0
4 | annotated-types==0.7.0
5 | anyio==4.11.0
6 | attrs==25.3.0
7 | beautifulsoup4==4.7.1
8 | brotli==1.1.0
9 | cached-property==2.0.1
10 | certifi==2025.8.3
11 | charset-normalizer==3.4.3
12 | chevron==0.14.0
13 | click==8.1.8
14 | colorama==0.4.6
15 | distro==1.9.0
16 | fastuuid==0.12.0
17 | filelock==3.19.1
18 | frozendict==2.4.6
19 | frozenlist==1.7.0
20 | fsspec==2025.9.0
21 | genanki==0.13.1
22 | gtts==2.5.3
23 | h11==0.16.0
24 | httpcore==1.0.9
25 | httpx==0.28.1
26 | huggingface-hub==0.35.1
27 | idna==3.10
28 | importlib-metadata==8.7.0
29 | inflate64==1.0.3
30 | jinja2==3.1.6
31 | jiter==0.11.0
32 | jsonschema==4.25.1
33 | jsonschema-specifications==2025.9.1
34 | litellm==1.77.3
35 | madoka==0.7.1
36 | markupsafe==3.0.2
37 | mdict-utils==1.3.14
38 | multidict==6.6.4
39 | multivolumefile==0.2.3
40 | openai==1.109.1
41 | packaging==25.0
42 | pondpond==1.4.1
43 | propcache==0.3.2
44 | psutil==7.1.0
45 | py7zr==0.22.0
46 | pybcj==1.0.6
47 | pycryptodomex==3.23.0
48 | pydantic==2.12.0a1
49 | pydantic-core==2.37.2
50 | pyppmd==1.1.1
51 | python-dotenv==1.1.1
52 | pyyaml==6.0.2
53 | pyzstd==0.17.0
54 | referencing==0.36.2
55 | regex==2025.9.18
56 | requests==2.32.5
57 | rpds-py==0.27.1
58 | sniffio==1.3.1
59 | soupsieve==2.8
60 | texttable==1.7.0
61 | tiktoken==0.11.0
62 | tokenizers==0.22.1
63 | tqdm==4.67.1
64 | typing-extensions==4.15.0
65 | typing-inspection==0.4.1
66 | urllib3==2.5.0
67 | xxhash==3.5.0
68 | yarl==1.20.1
69 | zipp==3.23.0
70 | socksio==1.0.0
71 |
--------------------------------------------------------------------------------
/anki_packager/dict/eudic.py:
--------------------------------------------------------------------------------
1 | from anki_packager.logger import logger
2 | import aiohttp
3 |
4 | # https://my.eudic.net/OpenAPI/doc_api_study#-studylistapi-getcategory
5 |
6 |
7 | class EUDIC:
8 | def __init__(self, token: str, id: str):
9 | self.id = id
10 | self.token = token
11 | self.header = {
12 | "Authorization": self.token,
13 | "Content-Type": "application/json",
14 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3",
15 | }
16 | self.studylist_url = (
17 | "https://api.frdic.com/api/open/v1/studylist/category?language=en"
18 | )
19 |
20 | self.words_url = "https://api.frdic.com/api/open/v1/studylist/words/"
21 |
22 | async def get_studylist(self):
23 | async with aiohttp.request(
24 | "GET", self.studylist_url, headers=self.header
25 | ) as response:
26 | self.check_token(response.status)
27 | json = await response.json()
28 | # show list id
29 | for book in json["data"]:
30 | logger.info(f"id: {book['id']}, name: {book['name']}")
31 |
32 | return json
33 |
34 | async def get_words(self):
35 | url = self.words_url + str(self.id) + "?language=en&category_id=0"
36 | async with aiohttp.request("GET", url, headers=self.header) as response:
37 | self.check_token(response.status)
38 | json = await response.json()
39 | return json
40 |
41 | def check_token(self, status_code: int):
42 | if status_code != 200:
43 | if status_code == 401:
44 | msg = "前往 https://my.eudic.net/OpenAPI/Authorization 获取 token 写入配置文件"
45 | logger.error(msg)
46 | exit(1)
47 | else:
48 | msg = "检查填写的 ID 是否正确"
49 | logger.error(msg)
50 | exit(1)
51 |
--------------------------------------------------------------------------------
/anki_packager/utils.py:
--------------------------------------------------------------------------------
1 | import os
2 | import platform
3 |
4 | from anki_packager.logger import logger
5 |
6 |
7 | def get_user_config_dir():
8 | """
9 | Returns the platform-specific user configuration directory.
10 |
11 | - Windows: %APPDATA%/anki_packager
12 | - macOS/Linux: ~/.config/anki_packager
13 | """
14 | if platform.system() == "Windows":
15 | return os.path.join(os.environ.get("APPDATA", ""), "anki_packager")
16 | else:
17 | return os.path.expanduser("~/.config/anki_packager")
18 |
19 |
20 | def initialize_config():
21 | """
22 | Make sure user config dir exists.
23 |
24 | Example:
25 | ~/.config/anki_packager/
26 | ├── config
27 | │ ├── config.toml
28 | │ ├── failed.txt
29 | │ └── vocabulary.txt
30 | └── dicts
31 | ├── 单词释义比例词典-带词性.mdx
32 | ├── 有道词语辨析.mdx
33 | ├── stardict.7z
34 | ├── stardict.csv
35 | └── stardict.db
36 | """
37 | config_dir = get_user_config_dir()
38 | os.makedirs(config_dir, exist_ok=True)
39 | config_subdir = os.path.join(config_dir, "config")
40 | os.makedirs(config_subdir, exist_ok=True)
41 | dicts_dir = os.path.join(config_dir, "dicts")
42 | os.makedirs(dicts_dir, exist_ok=True)
43 |
44 | # Default configuration in TOML format
45 | default_config = """
46 | PROXY = ""
47 | EUDIC_TOKEN = ""
48 | EUDIC_ID = "0"
49 | DECK_NAME = "anki_packager"
50 |
51 | [[MODEL_PARAM]]
52 | model = "gemini/gemini-2.5-flash"
53 | api_key = "GEMINI_API_KEY"
54 | rpm = 10 # requests per minute
55 |
56 | # [[MODEL_PARAM]]
57 | # model = "openai/gpt-4o"
58 | # api_key = "OPENAI_API_KEY"
59 | # api_base = "YOUR_API_BASE"
60 | # rpm = 200
61 |
62 | """
63 |
64 | config_path = os.path.join(config_subdir, "config.toml")
65 | if not os.path.exists(config_path):
66 | with open(config_path, "w", encoding="utf-8") as f:
67 | f.write(default_config)
68 |
69 | vocab_path = os.path.join(config_subdir, "vocabulary.txt")
70 | if not os.path.exists(vocab_path):
71 | with open(vocab_path, "w", encoding="utf-8") as f:
72 | f.write("")
73 |
74 | failed_path = os.path.join(config_subdir, "failed.txt")
75 | if not os.path.exists(failed_path):
76 | with open(failed_path, "w", encoding="utf-8") as f:
77 | f.write("reform\nopen\n")
78 |
79 | logger.info(f"配置文件位于 {config_path}")
80 |
--------------------------------------------------------------------------------
/anki_packager/ai.py:
--------------------------------------------------------------------------------
1 | from typing import Dict
2 | from litellm import Choices, Message
3 | from litellm.router import Router
4 | from litellm.files.main import ModelResponse
5 | import json
6 | from anki_packager.prompt import PROMPT
7 |
8 | from pydantic import BaseModel, Field, ValidationError
9 |
10 |
11 | class Mnemonic(BaseModel):
12 | """助记法模型"""
13 |
14 | associative: str = Field(..., description="联想记忆法")
15 | homophone: str = Field(..., description="谐音记忆法")
16 |
17 |
18 | class Origin(BaseModel):
19 | """词源和助记模型"""
20 |
21 | etymology: str = Field(..., description="词源和文化内涵")
22 | mnemonic: Mnemonic
23 |
24 |
25 | class Story(BaseModel):
26 | """场景故事模型"""
27 |
28 | english: str = Field(..., description="英文场景故事")
29 | chinese: str = Field(..., description="故事的中文翻译")
30 |
31 |
32 | # The complete, top-level data model
33 | class WordExplanation(BaseModel):
34 | """完整的单词解析数据模型"""
35 |
36 | word: str = Field(..., description="用户提供的单词")
37 | origin: Origin
38 | tenses: str = Field(..., description="单词的词形变化")
39 | story: Story
40 |
41 |
42 | class llm:
43 | def __init__(self, model_param: list):
44 | model_list = [
45 | {
46 | "model_name": "a", # 为所有模型统一使用别名 "a"
47 | "litellm_params": param,
48 | }
49 | for param in model_param
50 | ]
51 | self.router = Router(model_list)
52 |
53 | async def explain(self, word: str) -> Dict:
54 | try:
55 | response = await self.router.acompletion(
56 | model="a",
57 | messages=[
58 | {"role": "system", "content": PROMPT},
59 | {"role": "user", "content": word},
60 | ],
61 | temperature=0.3,
62 | max_tokens=500,
63 | response_format={"type": "json_object"},
64 | )
65 | if isinstance(response, ModelResponse):
66 | if isinstance(response.choices, list) and response.choices:
67 | first_choice = response.choices[0]
68 | if (
69 | isinstance(first_choice, Choices)
70 | and isinstance(first_choice.message, Message)
71 | and isinstance(first_choice.message.content, str)
72 | ):
73 | result_str = first_choice.message.content
74 |
75 | if result_str.startswith("```json"):
76 | result_str = result_str.strip("```json\n").strip("```")
77 |
78 |                         # 1. Parse the string into a Python dict
79 | data = json.loads(result_str)
80 |
81 |                         # 2. Validate and parse it with the WordExplanation model
82 | validated_data = WordExplanation.model_validate(data)
83 |
84 | return validated_data.model_dump()
85 |             raise ValueError(f"Malformed or empty LLM response for '{word}'")
86 | except json.JSONDecodeError as e:
87 | raise json.JSONDecodeError(
88 | f"Failed to parse JSON for '{word}': {e}. Raw response: '{result_str[:150]}...'",
89 | result_str,
90 | e.pos,
91 | )
92 | except ValidationError as e:
93 | raise ValidationError(f"JSON structure validation failed for '{word}': {e}")
94 | except Exception as e:
95 | raise Exception(f"An unexpected error occurred for '{word}': {e}")
96 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 | # anki_packager
3 |
4 | An automated tool for building high-quality Anki flashcard decks for English vocabulary
5 |
6 | [About](#about-the-project) · [Usage](#usage) · [Roadmap](#todo) · [Acknowledgements](#thanks)
7 |
8 | ## About the Project
9 |
10 | `anki_packager` is an automated Anki flashcard generator that produces high-quality `.apkg` decks. The project aims to give English learners an efficient, intelligent memorization aid.
11 |
12 | ### Key Features
13 |
14 | - Curated multi-source dictionary integration: [ECDICT](https://github.com/skywind3000/ECDICT), the enhanced [《有道词语辨析》](https://skywind.me/blog/archives/2941), and the [单词释义比例词典](https://skywind.me/blog/archives/2938)
15 | - Intelligent learning experience:
16 |   - Automatically scrapes high-quality example sentences and common phrases from Youdao Dictionary
17 |   - Google TTS pronunciation, bilingual definitions, exam-syllabus tags, and more
18 |   - Popular AI models (API key required) generate word summaries, mnemonics, and scenario stories
19 | - Convenient imports: one-click import of Eudic (欧路词典) wordbooks, with batch processing of word lists into cards
20 | - Polished command-line experience: progress display, failure logging, and a rich set of CLI options
21 | - Runs in Docker and installs from PyPI
22 |
23 | ### Card Preview
24 |
25 | Every card packs rich, clearly structured learning material:
26 |
27 | - Front: headword, pronunciation, phonetic symbols + exam-syllabus tags (e.g. 中高考, CET4, CET6, GRE)
28 | - Back:
29 |   - Definitions: Chinese (ECDICT), tenses (AI), sense and part-of-speech distribution ([单词释义比例词典-带词性](https://mdx.mdict.org/按词典语种来分类/词频/单词释义比例词典/单词释义比例词典-带词性.mdx))
30 |   - AI-generated etymology + mnemonic aids (associative + homophone)
31 |   - Phrases + example sentences (Youdao scraper)
32 |   - Word discrimination (enhanced [《有道词语辨析》](https://pan.baidu.com/s/1gff2tdp))
33 |   - English definitions (currently from ECDICT) + an AI-generated story
34 |
35 | ## Usage
36 |
37 | ### Quick Start
38 |
39 | ```bash
40 | # Install directly with pip
41 | pip install apkger
42 | ```
43 |
44 | Before running apkger, fill in `config/config.toml` inside the user config directory (created on first run: `~/.config/anki_packager/` on Linux/macOS, `%APPDATA%\anki_packager\` on Windows).
45 |
46 | This project uses [litellm](https://github.com/BerriAI/litellm) to call all LLM services through one interface. For the details of `MODEL_PARAM`, see the [LiteLLM Providers documentation](https://docs.litellm.ai/docs/providers):
47 |
48 | ```toml
49 | PROXY = ""
50 | EUDIC_TOKEN = ""
51 | EUDIC_ID = "0"
52 | DECK_NAME = "anki_packager"
53 |
54 | [[MODEL_PARAM]]
55 | model = "gemini/gemini-2.5-flash"
56 | api_key = "GEMINI_API_KEY"
57 | rpm = 10 # requests per minute
58 |
59 | ### Example: OpenAI-compatible endpoint
60 | # [[MODEL_PARAM]]
61 | # model = "openai/gpt-4o"
62 | # api_key = "OPENAI_API_KEY"
63 | # api_base = "YOUR_API_BASE"
64 | # rpm = 200
65 | ```
66 |
67 | The configuration parameters in detail:
68 |
69 | - `MODEL_PARAM` (the sketch below shows how these entries are consumed):
70 |   - `model`: Provider Route on LiteLLM + Model ID
71 |   - `api_key`: the API key for that model
72 |   - `api_base`: (optional) only needed for OpenAI-compatible endpoints
73 |   - `rpm`: (optional) requests-per-minute limit, used to throttle API calls
74 | - `PROXY`: if you cannot reach your AI provider directly, set a proxy server address here
75 | - To use a Eudic wordbook: first obtain a TOKEN from the [official Eudic authorization page](https://my.eudic.net/OpenAPI/Authorization), then run `apkger --eudicid` to pick a wordbook ID and write it into the config file
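76 |
77 | Internally, apkger hands every `[[MODEL_PARAM]]` table to a litellm `Router` under a single shared alias; the Router load-balances across the configured models and honors each entry's `rpm`. A minimal sketch of that wiring, mirroring `anki_packager/ai.py`:
78 |
79 | ```python
80 | import tomllib
81 | from litellm.router import Router
82 |
83 | with open("config.toml", "rb") as f:
84 |     cfg = tomllib.load(f)
85 |
86 | # Every configured model shares the alias "a"; the Router picks among
87 | # them and applies each entry's rpm limit.
88 | router = Router([
89 |     {"model_name": "a", "litellm_params": param}
90 |     for param in cfg["MODEL_PARAM"]
91 | ])
92 | ```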
93 |
94 | ### Download the Dictionaries
95 |
96 | Download the dictionaries into the config directory (keep the file names exactly as shown):
97 |
98 | - Linux/macOS: `~/.config/anki_packager/dicts/`
99 | - Windows: `C:\Users\<用户名>\AppData\Roaming\anki_packager\dicts\`
100 |
101 | Dictionary data (thanks to [skywind](https://github.com/skywind3000)) download links:
102 |
103 | - [stardict.7z](https://github.com/skywind3000/ECDICT/raw/refs/heads/master/stardict.7z)
104 | - [单词释义比例词典-带词性](https://mdx.mdict.org/按词典语种来分类/词频/单词释义比例词典/单词释义比例词典-带词性.mdx)
105 | - [有道词语辨析](https://pan.baidu.com/s/1gff2tdp): **must be unpacked manually** into `config/dicts`
106 |
107 | Once the dictionaries are in place (the snippet below scripts the two direct downloads), extraction and conversion are handled by anki_packager itself.
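108 |
109 | On Linux/macOS the two direct downloads can be scripted; a sketch (the Baidu Pan archive still has to be fetched and unpacked by hand):
110 |
111 | ```bash
112 | DICTS=~/.config/anki_packager/dicts
113 | mkdir -p "$DICTS"
114 |
115 | # ECDICT database archive; apkger extracts and converts it on first run
116 | curl -L -o "$DICTS/stardict.7z" \
117 |   "https://github.com/skywind3000/ECDICT/raw/refs/heads/master/stardict.7z"
118 |
119 | # sense/POS distribution dictionary; keep the file name exactly as-is
120 | curl -L -o "$DICTS/单词释义比例词典-带词性.mdx" \
121 |   "https://mdx.mdict.org/按词典语种来分类/词频/单词释义比例词典/单词释义比例词典-带词性.mdx"
122 | ```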
123 |
124 | ### Running
125 |
126 | There is no GUI yet; the tool runs from the command line. Some reference invocations:
127 |
128 | ```bash
129 | # Show the help message
130 | apkger -h
131 |
132 | # Generate cards from the default wordbook
133 | apkger
134 |
135 | ### Disable the AI features
136 | apkger --disable_ai
137 |
138 | ### Export words from a Eudic wordbook and generate cards (requires configuration)
139 | ## First list the wordbook IDs and write one into the config file
140 | apkger --eudicid
141 | ## Then generate the cards
142 | apkger --eudic
143 | ```
144 |
145 | **Option 1: Conda environment**
146 |
147 | ```bash
148 | # Create and activate a Python 3.11 virtual environment named apkg
149 | conda create -n apkg python=3.11
150 | conda activate apkg
151 |
152 | # Install the project dependencies
153 | pip install -r requirements.txt
154 |
155 | # Show the help message
156 | python -m anki_packager -h
157 |
158 | # Export words from a Eudic wordbook and generate cards (requires configuration)
159 | python -m anki_packager --eudic
160 |
161 | # Disable the AI features
162 | python -m anki_packager --disable_ai
163 |
164 | # Generate cards from the default wordbook
165 | python -m anki_packager
166 | ```
167 |
168 | **Option 2: Docker container**
169 |
170 | If you prefer not to touch your local environment, run anki_packager in Docker, conveniently driven by the `Makefile`:
171 |
172 | ```shell
173 | # Build the Docker image and create the persistent volume
174 | make build
175 |
176 | # The first container run downloads the dictionaries (this takes a while)
177 | make run
178 |
179 | # Enter the container (configure config/config.toml on the host first!)
180 | # Run anki_packager inside; generated decks are saved to the current directory
181 | make shell
182 | ```
183 |
184 | ## TODO
185 |
186 | - [x] ~~Integrate the sense-distribution dictionary~~
187 | - [x] ~~Further polish the card UI~~
188 | - [x] ~~Import wordbooks from Eudic~~
189 | - [x] ~~Support SiliconFlow and Gemini~~
190 | - [x] ~~Bring back Docker support~~
191 | - [x] ~~Publish to PyPI~~
192 | - [x] ~~Ship prebuilt data packages as releases~~ @Initsnow
193 | - [ ] Support word exports from more apps
194 | - [ ] Support the Longman dictionary
195 | - [ ] Build a GUI
196 |
197 | ## Thanks
198 |
199 | This project builds on many open-source projects and communities:
200 |
201 | - Thanks to [skywind](https://github.com/skywind3000) for open-sourcing [ECDICT](https://github.com/skywind3000/ECDICT) and other dictionary projects, which provide this project's dictionary resources.
202 | - Thanks to [yihong0618](https://github.com/yihong0618) for many excellent open-source Python projects, which have been a great source of ideas.
203 |
204 | ---
205 |
206 | If this project helps you, a Star ⭐️ is appreciated!
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *.apkg
2 |
3 | # Byte-compiled / optimized / DLL files
4 | __pycache__/
5 | *.py[cod]
6 | *$py.class
7 |
8 | # C extensions
9 | *.so
10 |
11 | # Distribution / packaging
12 | .Python
13 | build/
14 | develop-eggs/
15 | dist/
16 | downloads/
17 | eggs/
18 | .eggs/
19 | lib/
20 | lib64/
21 | parts/
22 | sdist/
23 | var/
24 | wheels/
25 | share/python-wheels/
26 | *.egg-info/
27 | .installed.cfg
28 | *.egg
29 | MANIFEST
30 |
31 | # PyInstaller
32 | # Usually these files are written by a python script from a template
33 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
34 | *.manifest
35 | *.spec
36 |
37 | # Installer logs
38 | pip-log.txt
39 | pip-delete-this-directory.txt
40 |
41 | # Unit test / coverage reports
42 | htmlcov/
43 | .tox/
44 | .nox/
45 | .coverage
46 | .coverage.*
47 | .cache
48 | nosetests.xml
49 | coverage.xml
50 | *.cover
51 | *.py,cover
52 | .hypothesis/
53 | .pytest_cache/
54 | cover/
55 |
56 | # Translations
57 | *.mo
58 | *.pot
59 |
60 | # Django stuff:
61 | *.log
62 | local_settings.py
63 | db.sqlite3
64 | db.sqlite3-journal
65 |
66 | # Flask stuff:
67 | instance/
68 | .webassets-cache
69 |
70 | # Scrapy stuff:
71 | .scrapy
72 |
73 | # Sphinx documentation
74 | docs/_build/
75 |
76 | # PyBuilder
77 | .pybuilder/
78 | target/
79 |
80 | # Jupyter Notebook
81 | .ipynb_checkpoints
82 |
83 | # IPython
84 | profile_default/
85 | ipython_config.py
86 |
87 | # pyenv
88 | # For a library or package, you might want to ignore these files since the code is
89 | # intended to run in multiple environments; otherwise, check them in:
90 | # .python-version
91 |
92 | # pipenv
93 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
94 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
95 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
96 | # install all needed dependencies.
97 | #Pipfile.lock
98 |
99 | # UV
100 | # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
101 | # This is especially recommended for binary packages to ensure reproducibility, and is more
102 | # commonly ignored for libraries.
103 | #uv.lock
104 |
105 | # poetry
106 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
107 | # This is especially recommended for binary packages to ensure reproducibility, and is more
108 | # commonly ignored for libraries.
109 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
110 | #poetry.lock
111 |
112 | # pdm
113 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
114 | #pdm.lock
115 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
116 | # in version control.
117 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
118 | .pdm.toml
119 | .pdm-python
120 | .pdm-build/
121 |
122 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
123 | __pypackages__/
124 |
125 | # Celery stuff
126 | celerybeat-schedule
127 | celerybeat.pid
128 |
129 | # SageMath parsed files
130 | *.sage.py
131 |
132 | # Environments
133 | .env
134 | .venv
135 | env/
136 | venv/
137 | ENV/
138 | env.bak/
139 | venv.bak/
140 |
141 | # Spyder project settings
142 | .spyderproject
143 | .spyproject
144 |
145 | # Rope project settings
146 | .ropeproject
147 |
148 | # mkdocs documentation
149 | /site
150 |
151 | # mypy
152 | .mypy_cache/
153 | .dmypy.json
154 | dmypy.json
155 |
156 | # Pyre type checker
157 | .pyre/
158 |
159 | # pytype static type analyzer
160 | .pytype/
161 |
162 | # Cython debug symbols
163 | cython_debug/
164 |
165 | # PyCharm
166 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
167 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
168 | # and can be added to the global gitignore or merged into this file. For a more nuclear
169 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
170 | #.idea/
171 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider
172 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
173 |
174 | # User-specific stuff
175 | .idea/**/workspace.xml
176 | .idea/**/tasks.xml
177 | .idea/**/usage.statistics.xml
178 | .idea/**/dictionaries
179 | .idea/**/shelf
180 |
181 | # AWS User-specific
182 | .idea/**/aws.xml
183 |
184 | # Generated files
185 | .idea/**/contentModel.xml
186 |
187 | # Sensitive or high-churn files
188 | .idea/**/dataSources/
189 | .idea/**/dataSources.ids
190 | .idea/**/dataSources.local.xml
191 | .idea/**/sqlDataSources.xml
192 | .idea/**/dynamic.xml
193 | .idea/**/uiDesigner.xml
194 | .idea/**/dbnavigator.xml
195 |
196 | # Gradle
197 | .idea/**/gradle.xml
198 | .idea/**/libraries
199 |
200 | # Gradle and Maven with auto-import
201 | # When using Gradle or Maven with auto-import, you should exclude module files,
202 | # since they will be recreated, and may cause churn. Uncomment if using
203 | # auto-import.
204 | # .idea/artifacts
205 | # .idea/compiler.xml
206 | # .idea/jarRepositories.xml
207 | # .idea/modules.xml
208 | # .idea/*.iml
209 | # .idea/modules
210 | # *.iml
211 | # *.ipr
212 |
213 | # CMake
214 | cmake-build-*/
215 |
216 | # Mongo Explorer plugin
217 | .idea/**/mongoSettings.xml
218 |
219 | # File-based project format
220 | *.iws
221 |
222 | # IntelliJ
223 | out/
224 |
225 | # mpeltonen/sbt-idea plugin
226 | .idea_modules/
227 |
228 | # JIRA plugin
229 | atlassian-ide-plugin.xml
230 |
231 | # Cursive Clojure plugin
232 | .idea/replstate.xml
233 |
234 | # SonarLint plugin
235 | .idea/sonarlint/
236 |
237 | # Crashlytics plugin (for Android Studio and IntelliJ)
238 | com_crashlytics_export_strings.xml
239 | crashlytics.properties
240 | crashlytics-build.properties
241 | fabric.properties
242 |
243 | # Editor-based Rest Client
244 | .idea/httpRequests
245 |
246 | # Android studio 3.1+ serialized cache file
247 | .idea/caches/build_file_checksums.ser
248 |
249 | # PyPI configuration file
250 | .pypirc
251 |
252 | .vscode
253 | .vscode/*
254 | !.vscode/settings.json
255 | !.vscode/tasks.json
256 | !.vscode/launch.json
257 | !.vscode/extensions.json
258 | !.vscode/*.code-snippets
259 |
260 | # Local History for Visual Studio Code
261 | .history/
262 |
263 | # Built Visual Studio Code Extensions
264 | *.vsix
265 |
266 | # macOS
267 | .DS_Store
--------------------------------------------------------------------------------
/anki_packager/dict/ecdict.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sqlite3
3 |
4 | from anki_packager.logger import logger
5 | from anki_packager.utils import get_user_config_dir
6 |
7 | from anki_packager.dict import stardict
8 |
9 | # https://github.com/liuyug/mdict-utils
10 | from mdict_utils.reader import query
11 | from mdict_utils.utils import ElapsedTimer
12 |
13 |
14 | class Ecdict:
15 | def __init__(self):
16 | self.config_dir = get_user_config_dir()
17 | self.dicts_dir = os.path.join(self.config_dir, "dicts")
18 | # keep the package archive small
19 | self.seven_zip = os.path.join(self.dicts_dir, "stardict.7z")
20 | self.csv = os.path.join(self.dicts_dir, "stardict.csv")
21 | self.sqlite = os.path.join(self.dicts_dir, "stardict.db")
22 | self._convert()
23 | self.conn = sqlite3.connect(self.sqlite)
24 | self.cursor = self.conn.cursor()
25 | self.sd = stardict.StarDict(self.sqlite, False)
26 |
27 | def __del__(self):
28 | if hasattr(self, "conn"):
29 | self.cursor.close()
30 | self.conn.close()
31 |
32 | def _convert(self):
33 | if not os.path.exists(self.csv):
34 |             # extract stardict.csv from the 7z archive
35 | if not os.path.exists(self.seven_zip):
36 | raise FileNotFoundError(f"{self.seven_zip} 未找到!")
37 |
38 | import py7zr
39 |
40 | logger.info("首次使用: 正在解压词典到 anki_packager/dicts/stardict.csv")
41 | ar = py7zr.SevenZipFile(self.seven_zip, mode="r")
42 | ar.extractall(path=self.dicts_dir)
43 | ar.close()
44 |
45 | if not os.path.exists(self.sqlite):
46 | logger.info(
47 | "耐心等待(790M): 正在转换数据库 anki_packager/dicts/stardict.db"
48 | )
49 | stardict.convert_dict(self.sqlite, self.csv)
50 |
51 | async def ret_word(self, word):
52 | """Return ECDICT data
53 | dict: 包含以下 ECDICT 数据字段的字典:
54 | - word: 单词名称
55 | - phonetic: 音标,以英语英标为主
56 | - definition: 单词释义(英文),每行一个释义
57 | - translation: 单词释义(中文),每行一个释义
58 | - pos: 词语位置,用 "/" 分割不同位置
59 | - collins: 柯林斯星级
60 | - oxford: 是否是牛津三千核心词汇
61 | - tag: 字符串标签: zk/中考, gk/高考, cet4/四级 等等标签,空格分割
62 | - bnc: 英国国家语料库词频顺序
63 | - frq: 当代语料库词频顺序
64 | - exchange: 时态复数等变换,使用 "/" 分割不同项目
65 | - detail: json 扩展信息,字典形式保存例句(待添加)
66 | - audio: 读音音频 url (待添加)
67 | """
68 | data = self.sd.query(word)
69 |
70 |         # exam-syllabus tags
71 |         data = self.parse_tag(data)
72 |         # sense distribution
73 |         data = self.get_distribution(data)
74 |         # word discrimination
75 | data = self.get_diffrentiation(data)
76 | return data
77 |
78 | def get_distribution(self, data):
79 | """
80 | Get word distribution from mdx dictionary
81 | """
82 | with ElapsedTimer(verbose=False):
83 | mdx_path = os.path.join(
84 | get_user_config_dir(),
85 | "dicts",
86 | "单词释义比例词典-带词性.mdx",
87 | )
88 | record = query(mdx_path, data["word"])
89 | if record:
90 | data["distribution"] = record
91 | return data
92 |
93 | def get_diffrentiation(self, data):
94 | """[《有道词语辨析》加强版](https://skywind.me/blog/archives/2941)"""
95 | with ElapsedTimer(verbose=False):
96 | mdx_path = os.path.join(get_user_config_dir(), "dicts", "有道词语辨析.mdx")
97 | record = query(mdx_path, data["word"])
98 | if record:
99 | data["diffrentiation"] = record
100 | return data
101 |
102 | def definition_newline(self, data):
103 | """Add newline to definition for each part-of-speech
104 |
105 | Demo:
106 | Input: data["definition"] = "n. 词义1 v. 词义2"
107 | Output: data["definition"] = "n. 词义1
v. 词义2"
108 |
109 | """
110 | definition = data.get("definition", "")
111 | if not definition:
112 | return data
113 |
114 | # Split on part of speech markers (like "n.", "v.", etc.)
115 | parts = []
116 | current = ""
117 | words = definition.split()
118 |
119 | for word in words:
120 | if len(word) >= 2 and word.endswith(".") and word[0].isalpha():
121 | if current:
122 | parts.append(current.strip())
123 | current = word
124 | else:
125 | current += " " + word
126 |
127 | if current:
128 | parts.append(current.strip())
129 |
130 | data["definition"] = "
".join(parts)
131 | return data
132 |
133 | def parse_tag(self, data):
134 | """parse tag infomation and update data dict
135 | Demo:
136 | Input: data["tag"] = "zk gk cet4 cet6 ky ielts toefl"
137 | Output: data["tag"] = "中考 高考 四级 六级 考研 雅思 托福"
138 | """
139 | text = data.get("tag", "")
140 | if not text:
141 | return data
142 |
143 | tag_map = {
144 | "zk": "中考",
145 | "gk": "高考",
146 | "cet4": "四级",
147 | "cet6": "六级",
148 | "ky": "考研",
149 | "ielts": "雅思",
150 | "toefl": "托福",
151 | "gre": "GRE",
152 | }
153 |
154 |         tags = text.split()
155 | result = [tag_map.get(tag, tag) for tag in tags]
156 | data["tag"] = " ".join(result)
157 | return data
158 |
159 | def parse_exchange(self, data):
160 | """parse exchange information and update data dict
161 |
162 | Demo:
163 | Input: data["exchange"] = "s:tests/d:tested/i:testing/p:tested/3:tests"
164 | Output: data["exchange"] = "复数:tests 过去式:tested 过去分词:tested 现在分词:testing 三单:tests"
165 | """
166 | text = data.get("exchange", "")
167 | if not text:
168 | return data
169 |
170 | exchange_map = {
171 | "s": "复数",
172 | "d": "过去式",
173 | "p": "过去分词",
174 | "i": "现在分词",
175 | "3": "三单",
176 | "r": "比较级",
177 | "t": "最高级",
178 | "0": "原型",
179 | "1": "第一人称单数",
180 | }
181 |
182 | result = []
183 | for item in text.split("/"):
184 | if ":" in item:
185 | key, value = item.split(":")
186 | if key in exchange_map:
187 | result.append(f"{exchange_map[key]}: {value}")
188 |
189 | data["exchange"] = " ".join(result)
190 | return data
191 |
--------------------------------------------------------------------------------
/anki_packager/dict/youdao.py:
--------------------------------------------------------------------------------
1 | import asyncio
2 | import os
3 | import re
4 | import shutil
5 | import tempfile
6 | import aiohttp
7 | from gtts import gTTS
8 | from bs4 import BeautifulSoup
9 | from typing import Dict, Optional
10 | from anki_packager.logger import logger
11 |
12 |
13 | class YoudaoScraper:
14 | def __init__(self):
15 | self.base_url = "https://m.youdao.com/result"
16 | self.tmp = tempfile.mkdtemp()
17 |
18 | async def __aenter__(self):
19 | """进入 async with 时被调用"""
20 | self._session = aiohttp.ClientSession(
21 | headers={
22 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
23 | }
24 | )
25 |         return self  # return the instance itself
26 |
27 | async def __aexit__(self, exc_type, exc_val, exc_tb):
28 | """离开 async with 时被调用,确保 Session 被关闭"""
29 | await self._session.close()
30 | try:
31 | self._clean_temp_dir()
32 | except Exception as e:
33 | logger.error(f"Error cleaning up audio files: {e}")
34 |
35 | async def _get_audio(self, word: str):
36 | """return the filename of the audio and the temp directory that needs to be cleaned up"""
37 | filename = os.path.join(self.tmp, f"{word}.mp3")
38 | loop = asyncio.get_running_loop()
39 |
40 | def generate_and_save_audio():
41 | """A wrapper function for the blocking gTTS calls."""
42 | tts = gTTS(text=word, lang="en")
43 | tts.save(filename)
44 |
45 | await loop.run_in_executor(None, generate_and_save_audio)
46 |
47 | return filename
48 |
49 | def _clean_temp_dir(self):
50 | """Clean up a temporary directory and its contents."""
51 | try:
52 | if os.path.exists(self.tmp):
53 | shutil.rmtree(self.tmp)
54 | logger.info(f"音频临时文件夹已清理: {self.tmp}")
55 | except Exception as e:
56 | logger.error(f"音频临时文件夹 {self.tmp} 清理失败: {e}")
57 |
58 | async def get_word_info(self, word: str) -> Optional[Dict]:
59 | try:
60 | params = {"word": word, "lang": "en"}
61 |
62 | async with self._session.get(self.base_url, params=params) as response:
63 | response.raise_for_status()
64 | r_text = await response.text()
65 | soup = BeautifulSoup(r_text, "html.parser")
66 |
67 | result = {
68 | "word": word,
69 | "example_phrases": [],
70 | "example_sentences": [],
71 | }
72 |
73 |
74 | all_uls = soup.find_all("ul", class_="")
75 | # Extract example phrases
76 | if len(all_uls) > 0:
77 | phrase_ul = all_uls[0]
78 | if phrase_ul:
79 | phrase_lis = phrase_ul.find_all("li", class_="mcols-layout")
80 | for li in phrase_lis:
81 | index = (
82 | li.find("span", class_="grey").text.strip()
83 | if li.find("span", class_="grey")
84 | else None
85 | )
86 | col2_element = li.find("div", class_="col2")
87 | point_element = col2_element.find("a", class_="point")
88 | sen_phrase_element = col2_element.find("p", class_="sen-phrase")
89 | english = None
90 | chinese = None
91 | if point_element and sen_phrase_element:
92 | english = point_element.text.strip()
93 | chinese = sen_phrase_element.text.strip()
94 | else:
95 | content = col2_element.text.strip()
96 | parts = re.split(r"([;;])", content)
97 | parts = [
98 | s.strip()
99 | for s in parts
100 | if s.strip() and s not in [";", ";"]
101 | ]
102 | if len(parts) > 1:
103 | english = parts[0]
104 | chinese = "".join(parts[1:])
105 | else:
106 | english = content
107 |
108 | result["example_phrases"].append(
109 | {
110 | "index": index,
111 | "english": english,
112 | "chinese": chinese,
113 | }
114 | )
115 |
116 | # Extract example sentences
117 | if len(all_uls) > 1:
118 | sentence_ul = all_uls[1]
119 | if sentence_ul:
120 | sentence_lis = sentence_ul.find_all("li", class_="mcols-layout")
121 | for li in sentence_lis:
122 | index = (
123 | li.find("span", class_="grey index").text.strip()
124 | if li.find("span", class_="grey index")
125 | else None
126 | )
127 | english_element = li.find("div", class_="sen-eng")
128 | chinese_element = li.find("div", class_="sen-ch")
129 | source_element = li.find("div", class_="secondary")
130 |
131 | english = english_element.text.strip() if english_element else None
132 | chinese = chinese_element.text.strip() if chinese_element else None
133 | source = source_element.text.strip() if source_element else None
134 |
135 | result["example_sentences"].append(
136 | {
137 | "index": index,
138 | "english": english,
139 | "chinese": chinese,
140 | "source": source,
141 | }
142 | )
143 |
144 | return result
145 |
146 | except aiohttp.ClientError as e:
147 | logger.error(f"Request error: {e}")
148 | return None
149 | except Exception as e:
150 | logger.error(f"An error occurred: {e}")
151 | return None
152 |
153 |
154 | if __name__ == "__main__":
155 | async def main():
156 | async with YoudaoScraper() as youdao:
157 | result = asyncio.run(youdao.get_word_info("variable"))
158 | print(result)
159 | asyncio.run(main())
160 |
--------------------------------------------------------------------------------
/anki_packager/packager/deck.py:
--------------------------------------------------------------------------------
1 | import genanki
2 | import random
3 |
4 |
5 | class AnkiDeckCreator:
6 | def __init__(self, deck_name: str):
7 | self.added = False
8 | self.deck_name = deck_name
9 | self.deck_id = random.randrange(1 << 30, 1 << 31)
10 | self.model_id = random.randrange(1 << 30, 1 << 31)
11 | self.deck = genanki.Deck(self.deck_id, deck_name)
12 | self.model = genanki.Model(
13 | self.model_id,
14 | "Anki Packager",
15 | fields=[
16 | {"name": "Word"}, # 词头
17 | {"name": "Pronunciation"}, # 读音
18 | {"name": "Phonetic_Symbols"}, # 音标
19 | {"name": "Examination_Syllabus"}, # 考试大纲
20 | {"name": "ECDict"}, # Ecdict 中文解释
21 | {"name": "Longman"}, # Longman
22 | {"name": "Youdao"}, # 有道词典示例短语和句子
23 | {"name": "Etymology_AI"}, # 词源
24 | {"name": "Associative_Mnemonic_AI"}, # 联想助记
25 | {"name": "Homophone_Mnemonic_AI"}, # 谐音助记
26 | {"name": "Discrimination"}, # 辨析
27 | {"name": "Story"}, # 故事
28 | ],
29 | templates=[
30 | {
31 | "name": "Dictionary Card",
32 | "qfmt": """
33 |
34 |
39 |
40 | """,
41 | "afmt": """
42 | {{FrontSide}}
43 |
44 |
45 |
{{ECDict}}
46 |
47 |
48 |
{{Etymology_AI}}
49 |
{{Associative_Mnemonic_AI}}
50 |
{{Homophone_Mnemonic_AI}}
51 |
52 |
53 |
{{Youdao}}
54 |
55 |
{{Discrimination}}
56 |
57 |
{{Longman}}
58 |
59 |
{{Story}}
60 |
61 | """,
62 | }
63 | ],
64 | css="""
65 | /* Color scheme variables */
66 | :root {
67 | /* Light mode (default) colors */
68 | --bg-color: #ffffff;
69 | --text-color: #333333;
70 | --secondary-text: #666666;
71 | --tertiary-text: #2F4F4F;
72 | --highlight-color: #0645AD;
73 | --accent-color: #990000;
74 | --divider-color: #99a;
75 | --pos-color: #990000;
76 | --cn-text-color: #8B008B;
77 | --phrase-color: #8B4513;
78 | }
79 |
80 | /* Dark mode colors */
81 | @media (prefers-color-scheme: dark) {
82 | .card {
83 | --bg-color: #1e1e2e;
84 | --text-color: #e0e0e0;
85 | --secondary-text: #b0b0b0;
86 | --tertiary-text: #a0c0c0;
87 | --highlight-color: #7cb8ff;
88 | --accent-color: #ff7c7c;
89 | --divider-color: #666;
90 | --pos-color: #ff9e64;
91 | --cn-text-color: #d183e8;
92 | --phrase-color: #e0c080;
93 | }
94 | }
95 |
96 | /* Night mode in Anki also triggers dark mode */
97 | .nightMode {
98 | --bg-color: #1e1e2e;
99 | --text-color: #e0e0e0;
100 | --secondary-text: #b0b0b0;
101 | --tertiary-text: #a0c0c0;
102 | --highlight-color: #7cb8ff;
103 | --accent-color: #ff7c7c;
104 | --divider-color: #666;
105 | --pos-color: #ff9e64;
106 | --cn-text-color: #d183e8;
107 | --phrase-color: #e0c080;
108 | }
109 |
110 | .card {
111 | font-family: Arial, sans-serif;
112 | text-align: left;
113 | padding: 20px;
114 | max-width: 800px;
115 | margin: auto;
116 | background-color: var(--bg-color);
117 | color: var(--text-color);
118 | line-height: 1.6;
119 | }
120 |
121 | /* 虚线分隔符 */
122 | .dashed {
123 | border: none;
124 | border-top: 1px dashed var(--divider-color);
125 | margin: 15px 0;
126 | width: 100%;
127 | }
128 |
129 | /* Front side */
130 | .card-front {
131 | margin-bottom: 20px;
132 | }
133 |
134 | /* Centered header section */
135 | .header-center {
136 | text-align: center;
137 | margin-bottom: 20px;
138 | }
139 |
140 | .word {
141 | font-size: 2.2em;
142 | font-weight: bold;
143 | color: var(--text-color);
144 | margin-bottom: 5px;
145 | }
146 |
147 | .pronunciation {
148 | font-size: 1.1em;
149 | color: var(--highlight-color);
150 | margin-bottom: 10px;
151 | }
152 |
153 | .front {
154 | color: var(--secondary-text);
155 | margin-bottom: 15px;
156 | font-size: 0.90em;
157 | }
158 |
159 | .phonetic_symbols {
160 | color: blue;
161 | }
162 |
163 | /* Back side */
164 | .card-back {
165 | margin-top: 20px;
166 | }
167 |
168 | .ecdict {
169 | margin: 15px 0;
170 | text-align: center;
171 | }
172 |
173 | .longman {
174 | margin: 15px 0;
175 | }
176 |
177 | .examples {
178 | color: var(--tertiary-text);
179 | margin: 15px 0;
180 | }
181 |
182 | .examples em {
183 | color: var(--highlight-color);
184 | font-style: normal;
185 | font-weight: bold;
186 | }
187 |
188 | .ai {
189 | color: var(--secondary-text);
190 | margin: 15px 0;
191 | }
192 |
193 | .discrimination {
194 | color: var(--text-color);
195 | margin: 15px 0;
196 | }
197 |
198 | /* Example sentences */
199 | .example {
200 | color: var(--tertiary-text);
201 | margin-left: 20px;
202 | margin-bottom: 10px;
203 | }
204 |
205 | /* Chinese text */
206 | .chinese {
207 | color: var(--secondary-text);
208 | margin-left: 20px;
209 | }
210 | """,
211 | )
212 |
213 | def format_pos(self, text: str) -> str:
214 | """Format definition with line breaks between parts of speech"""
215 | if not text:
216 | return ""
217 |
218 | parts = []
219 | current = []
220 |
221 |         for word in text.split():
222 |             # Check for part-of-speech markers
223 |             if any(
224 |                 word.startswith(pos + ".")
225 |                 for pos in ["n", "v", "vt", "vi", "adj", "adv"]
226 |             ):
227 |                 if current:
228 |                     parts.append(" ".join(current))
229 |                 # highlight the POS marker (markup reconstructed; uses --pos-color from the CSS)
230 |                 word = f'<span style="color: var(--pos-color)">{word}</span>'
231 |                 current = [word]
232 |             else:
233 |                 current.append(word)
234 |
235 |         if current:
236 |             parts.append(" ".join(current))
237 |
238 |         return "<br>".join(parts)
239 |
240 | def format_trans(self, translation: str, tense: str, distribution: str) -> str:
241 | """Add tense and distribution of each word in Translation part"""
242 | if not tense:
243 | # AI is disabled
244 | return f"{translation}
{distribution}"
245 |
246 | return f"{translation}
{tense}
{distribution}"
247 |
248 | def format_youdao(self, data: dict) -> str:
249 | """format youdao example_phrases and example_sentences"""
250 | result = []
251 |
252 | # Format phrases if they exist
253 | if "example_phrases" in data and data["example_phrases"]:
254 | result.append("【短语】")
255 | phrases = []
256 | for phrase in data["example_phrases"]:
257 | formatted_phrase = f"{phrase['english']} {phrase['chinese']}"
258 | phrases.append(formatted_phrase)
259 |
260 | result.append("".join(phrases))
261 |
262 | # Format sentences if they exist
263 | if "example_sentences" in data and data["example_sentences"]:
264 | result.append("【例句】")
265 | phrases = []
266 | for sentence in data["example_sentences"]:
267 | formatted_sentence = f"{sentence['english']} {sentence['chinese']}"
268 | phrases.append(formatted_sentence)
269 |
270 | result.append("".join(phrases))
271 |
272 | return "
".join(result)
273 |
274 | def add_note(self, data: dict):
275 | note = genanki.Note(
276 | model=self.model,
277 | fields=[
278 |                 # headword
279 |                 data.get("Word", ""),
280 |                 # pronunciation audio
281 |                 f"[sound:{data.get('Pronunciation', '')}]",
282 |                 # phonetics + exam syllabus + corpus frequency: [ә'bændәn] (高考 四级 六级 考研 托福 GRE 2057/2182)
283 |                 f"{data.get('ECDict', {}).get('phonetic', '')}",
284 |                 f"{data.get('ECDict', {}).get('tag', '')} {data.get('ECDict', {}).get('bnc', '')}/{data.get('ECDict', {}).get('frq', '')}",
285 |                 # ECDICT Chinese definitions + sense distribution + tenses
286 |                 self.format_trans(
287 |                     self.format_pos(data.get("ECDict", {}).get("translation", "")),
288 |                     data.get("AI", {}).get("tenses", ""),
289 |                     data.get("ECDict", {}).get("distribution", ""),
290 |                 ),
291 |                 # TODO: use a better English source
292 |                 f"【英解】<br>{self.format_pos(data.get('ECDict', {}).get('definition', ''))}",
293 |                 # Youdao example phrases and sentences
294 |                 self.format_youdao(data.get("Youdao", {})),
295 |                 # AI etymology and mnemonics
296 |                 ""
297 |                 if not data.get("AI")
298 |                 else f"【词源】<br>{data.get('AI', {}).get('origin', {}).get('etymology', '')}",
299 |                 ""
300 |                 if not data.get("AI")
301 |                 else f"【联想助记】{data.get('AI', {}).get('origin', {}).get('mnemonic', {}).get('associative', '')}",
302 |                 ""
303 |                 if not data.get("AI")
304 |                 else f"【谐音助记】{data.get('AI', {}).get('origin', {}).get('mnemonic', {}).get('homophone', '')}",
305 |                 # word discrimination
306 |                 ""
307 |                 if not data.get("ECDict", {}).get("diffrentiation", "")
308 |                 else f"【辨析】{data.get('ECDict', {}).get('diffrentiation', '')}",
309 |                 # story
310 |                 ""
311 |                 if not data.get("AI")
312 |                 else f"【故事】 {data.get('AI', {}).get('story', {}).get('english', '')}<br>{data.get('AI', {}).get('story', {}).get('chinese', '')}",
313 | ],
314 | )
315 | self.deck.add_note(note)
316 | self.added = True
317 |
318 | def write_to_file(self, file_path: str, mp3_files):
319 | package = genanki.Package(self.deck)
320 | package.media_files = mp3_files
321 | package.write_to_file(file_path)
322 |
--------------------------------------------------------------------------------
/anki_packager/cli.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import asyncio
3 | import os
4 | from os import environ as env
5 | import tomllib
6 | from tqdm.asyncio import tqdm
7 | import signal
8 |
9 | ### config
10 | from anki_packager.utils import get_user_config_dir
11 |
12 | ### logger
13 | from anki_packager.logger import logger
14 |
15 | ### AI
16 | from anki_packager.ai import llm
17 |
18 | ### Dictionaries
19 | from anki_packager.dict.youdao import YoudaoScraper
20 | from anki_packager.dict.ecdict import Ecdict
21 | from anki_packager.dict.eudic import EUDIC
22 |
23 | ### Anki
24 | from anki_packager.packager.deck import AnkiDeckCreator
25 |
26 | MAX_RETRIES = 3  # maximum number of retries
27 | RETRY_DELAY = 2  # seconds to wait before each retry
28 | CONCURRENCY_LIMIT = 40  # number of concurrent tasks
29 |
30 |
31 | def create_signal_handler(anki, audio_files, DECK_NAME):
32 | def signal_handler(sig, frame):
33 | logger.info("\033[1;31m程序被 异常中止...\033[0m")
34 | logger.info("正在写入已处理完毕的卡片...")
35 | anki.write_to_file(f"{DECK_NAME}.apkg", audio_files)
36 | logger.info("正在退出...")
37 | exit(0)
38 |
39 | return signal_handler
40 |
41 |
42 | async def main():
43 | parser = argparse.ArgumentParser()
44 |
45 | parser.add_argument("--word", dest="word", type=str, help="word to add")
46 |
47 | parser.add_argument(
48 | "--retry",
49 | action="store_true",
50 | help="Retry processing failed words only from config/failed.txt",
51 | )
52 |
53 | parser.add_argument(
54 | "--disable_ai",
55 | dest="disable_ai",
56 | action="store_true",
57 | help="Disable AI completions",
58 | )
59 |
60 | # ./prog --eudicid: run eudic.get_studylist()
61 | parser.add_argument(
62 | "--eudicid",
63 | action="store_true",
64 | help="Display EUDIC studylist by id",
65 | )
66 |
67 | parser.add_argument(
68 | "--eudic",
69 | action="store_true",
70 | help="Use EUDIC book instead of vocabulary.txt",
71 | )
72 |
73 | # support user-defined txt file: ./prog --txt demo.txt
74 | parser.add_argument(
75 | "--txt",
76 | dest="txt_file",
77 | type=str,
78 | help="Use a custom txt file instead of vocabulary.txt",
79 | )
80 |
81 | parser.add_argument("--model", dest="model", type=str, help="custome AI model")
82 |
83 | parser.add_argument(
84 | "-p",
85 | "--proxy",
86 | dest="proxy",
87 | type=str,
88 | default="",
89 | help="Default proxy like: http://127.0.0.1:7890",
90 | )
91 |
92 | parser.add_argument(
93 | "--api_base",
94 | metavar="API_BASE_URL",
95 | dest="api_base",
96 | type=str,
97 | help="Default base url other than the OpenAI's official API address",
98 | )
99 |
100 | options = parser.parse_args()
101 |
102 | ### set config according to config directory or parsed arguments
103 | config_dir = get_user_config_dir()
104 | config_path = os.path.join(config_dir, "config")
105 |
106 |     ## read config.toml
107 | with open(os.path.join(config_path, "config.toml"), "rb") as f:
108 | cfg = tomllib.load(f)
109 | MODEL_PARAM = cfg["MODEL_PARAM"]
110 | PROXY = cfg["PROXY"]
111 | EUDIC_TOKEN = cfg["EUDIC_TOKEN"]
112 | EUDIC_ID = cfg["EUDIC_ID"]
113 | DECK_NAME = cfg["DECK_NAME"]
114 |
115 | logger.info("配置读取完毕")
116 |
117 |     # display Eudic wordbook IDs only
118 | if options.eudicid:
119 | logger.info("设置:仅读取欧路词典 ID")
120 | eudic = EUDIC(EUDIC_TOKEN, EUDIC_ID)
121 | await eudic.get_studylist()
122 | exit(0)
123 |
124 | # only add word into vocabulary.txt line by line
125 | elif options.word:
126 | WORD = options.word
127 | vocab_path = os.path.join(config_path, "vocabulary.txt")
128 | with open(vocab_path, "a") as f:
129 | f.write(WORD + "\n")
130 | logger.info(f"单词: {WORD} 已添加进 {vocab_path}")
131 | exit(0)
132 |
133 | words = []
134 | number_words = 0
135 | audio_files = []
136 | ai = None
137 |
138 | anki = AnkiDeckCreator(f"{DECK_NAME}")
139 | ecdict = Ecdict()
140 |
141 |     # AI configuration
142 | if options.disable_ai:
143 | logger.info("AI 功能已关闭")
144 | else:
145 | PROXY = options.proxy or PROXY
146 | if PROXY:
147 | env["HTTP_PROXY"] = PROXY
148 | env["HTTPS_PROXY"] = PROXY
149 | logger.info(f"使用代理: {PROXY}")
150 |
151 |         # initialize the AI model(s)
152 | try:
153 | ai = llm(MODEL_PARAM)
154 | logger.info(
155 | f"当前使用的 AI 模型: {[param['model'] for param in MODEL_PARAM]}"
156 | )
157 | except Exception as e:
158 | logger.error(f"初始化 AI 模型失败: {e}")
159 | exit(1)
160 |     ## vocabulary source: eudic data, failed.txt (--retry), custom txt file, or default vocabulary.txt
161 | if options.eudic:
162 | logger.info("配置: 对欧路词典生词本单词进行处理...")
163 | eudic = EUDIC(EUDIC_TOKEN, EUDIC_ID)
164 | r = await eudic.get_words()
165 | eudic_words = r["data"]
166 | for word in eudic_words:
167 | words.append(word["word"])
168 | number_words = len(words)
169 | elif options.txt_file:
170 | txt_file_path = options.txt_file
171 | if not os.path.isabs(txt_file_path):
172 | # If relative path, resolve from current directory
173 | txt_file_path = os.path.abspath(txt_file_path)
174 |
175 | logger.info(f"配置: 对自定义单词文件 {txt_file_path} 进行处理...")
176 | try:
177 | with open(txt_file_path, "r") as vocab:
178 | for word in vocab:
179 | word = word.strip()
180 | if word: # Skip empty lines
181 | words.append(word)
182 | number_words = len(words)
183 | except FileNotFoundError:
184 | logger.error(f"文件 {txt_file_path} 未找到")
185 | exit(1)
186 | except Exception as e:
187 | logger.error(f"读取文件 {txt_file_path} 出错: {e}")
188 | exit(1)
189 | else:
190 | vocab_path = os.path.join(config_path, "vocabulary.txt")
191 | logger.info(f"配置: 对默认生词本单词 {vocab_path} 进行处理...")
192 | try:
193 | with open(vocab_path, "r") as vocab:
194 | for word in vocab:
195 | word = word.strip()
196 | if word: # Skip empty lines
197 | words.append(word)
198 | number_words = len(words)
199 | logger.info(f"从默认词库读取了 {number_words} 个单词")
200 | except FileNotFoundError:
201 | logger.error(f"默认词库文件 {vocab_path} 未找到")
202 | exit(1)
203 | except Exception as e:
204 | logger.error(f"读取默认词库文件出错: {e}")
205 | exit(1)
206 |
207 |
208 | signal.signal(
209 | signal.SIGINT,
210 | create_signal_handler(anki, audio_files, DECK_NAME),
211 | )
212 | async with YoudaoScraper() as youdao:
213 | logger.info(f"开始并发处理 {len(words)} 个单词...")
214 | with tqdm(total=len(words), desc="开始处理") as pbar:
215 | tasks = [
216 | task_wrapper(pbar, word, ai, anki, youdao, ecdict, audio_files)
217 | for word in words
218 | ]
219 | results = await asyncio.gather(*tasks, return_exceptions=True)
220 |
221 | successful_results = []
222 | failed_words = []
223 |
224 | for word, result in zip(words, results):
225 | if isinstance(result, Exception):
226 | failed_words.append(word)
227 | logger.error(f"未能成功处理 '{word}'. 错误: {result}")
228 | else:
229 | successful_results.append(result)
230 |
231 | if failed_words:
232 | failed_file = os.path.join(config_path, "failed.txt")
233 | logger.error(
234 | f"共 {len(failed_words)} 个单词处理失败,将它们写入 {failed_file}"
235 | )
236 | with open(failed_file, "w", encoding="utf-8") as f:
237 | for word in failed_words:
238 | f.write(f"{word}\n")
239 | else:
240 | logger.info("所有单词均已成功处理!")
241 |
242 | try:
243 | if anki.added:
244 | anki.write_to_file(f"{DECK_NAME}.apkg", audio_files)
245 | logger.info(f"牌组生成完毕,请打开 {DECK_NAME}.apkg")
246 | except Exception as e:
247 | logger.error(f"Error saving Anki deck: {e}")
248 |
249 |
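# process_word assembles the per-note payload that anki.add_note() consumes.
# Shape of the dict it builds (keys exactly as set below):
#     Word           - the headword itself
#     Pronunciation  - bare audio file name, referenced via Anki's [sound:] tag
#     ECDict         - ECDICT definition
#     Youdao         - Youdao dictionary info
#     AI             - AI explanation, or {} when AI is disabled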
250 | async def process_word(word, ai, anki, youdao, ecdict, audio_files):
251 | data = {}
252 | data["Word"] = word
253 |
254 | # Get audio pronunciation from the Youdao scraper
255 | audio_path = await youdao._get_audio(word)
256 | if not audio_path:
257 | raise Exception("Failed to get audio")
258 |
259 | audio_files.append(audio_path)
260 | # use only the file name as the value of Anki's [sound:] tag
261 | audio_filename = os.path.basename(audio_path)
262 | data["Pronunciation"] = audio_filename
263 |
264 | # Get ECDICT definition
265 | dict_def = await ecdict.ret_word(word)
266 | if not dict_def:
267 | raise Exception("Failed to get ECDICT definition")
268 | data["ECDict"] = dict_def
269 |
270 | # Get Youdao dictionary information
271 | youdao_result = await youdao.get_word_info(word)
272 | if not youdao_result:
273 | raise Exception("Failed to get Youdao information")
274 |
275 | data["Youdao"] = youdao_result
276 |
277 | # Get AI explanation if AI is enabled
278 | if ai is not None:
279 | try:
280 | ai_explanation = await ai.explain(word)
281 | data["AI"] = ai_explanation
282 | except Exception as e:
283 | raise Exception(f"Failed to get AI explanation: {str(e)}")
284 | else:
285 | data["AI"] = {}
286 |
287 | # TODO: Longman English explain
288 |
289 | # Add note to deck
290 | anki.add_note(data)
291 | return True
292 |
293 |
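# Module-level semaphore capping how many words are processed at once.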
294 | semaphore = asyncio.Semaphore(CONCURRENCY_LIMIT)
295 |
296 |
297 | async def process_word_with_retries(word, ai, anki, youdao, ecdict, audio_files):
298 | """
299 | Retry-and-backoff wrapper around process_word.
300 | """
301 | for attempt in range(MAX_RETRIES):
302 | try:
303 | async with semaphore:
304 | result = await process_word(word, ai, anki, youdao, ecdict, audio_files)
305 | return result
306 | except Exception as e:
307 | logger.warning(
308 | f"处理 '{word}' 第 {attempt + 1}/{MAX_RETRIES} 次尝试失败: {e}"
309 | )
310 | if attempt + 1 == MAX_RETRIES:
311 | # on the last attempt, stop catching the exception and let it bubble up;
312 | # gather(return_exceptions=True) will capture this final exception
313 | logger.error(f"'{word}' 在所有 {MAX_RETRIES} 次尝试后最终失败。")
314 | raise
315 | await asyncio.sleep(RETRY_DELAY)
316 |
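# Note: a fixed RETRY_DELAY is slept between attempts above; a common variant
# (not used here) would be exponential backoff, e.g.
#     await asyncio.sleep(RETRY_DELAY * 2 ** attempt)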
317 |
318 | async def task_wrapper(pbar, word, ai, anki, youdao, ecdict, audio_files):
319 | """
320 | Run the retry-wrapped task and make sure the progress bar always advances at the end.
321 | """
322 | try:
323 | r = await process_word_with_retries(word, ai, anki, youdao, ecdict, audio_files)
324 | pbar.set_description(f"'{word}' 添加成功")
325 | return r
326 | except Exception:
327 | pbar.set_description(f"'{word}' 处理失败")
328 | raise
329 | finally:
330 | pbar.update(1)
331 |
332 |
333 | if __name__ == "__main__":
334 | asyncio.run(main())
335 |
--------------------------------------------------------------------------------
/anki_packager/dict/stardict.py:
--------------------------------------------------------------------------------
1 | #! /usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | # vim: set ts=4 sw=4 tw=0 et :
4 | #======================================================================
5 | #
6 | # stardict.py -
7 | #
8 | # Created by skywind on 2011/05/13
9 | # Last Modified: 2019/11/09 23:47
10 | #
11 | #======================================================================
12 | from __future__ import print_function
13 | import sys
14 | import time
15 | import os
16 | import io
17 | import csv
18 | import sqlite3
19 | import codecs
20 |
21 | try:
22 | import json
23 | except ImportError:
24 | import simplejson as json
25 |
26 | MySQLdb = None
27 |
28 |
29 | #----------------------------------------------------------------------
30 | # python3 compatible
31 | #----------------------------------------------------------------------
32 | if sys.version_info[0] >= 3:
33 | unicode = str
34 | long = int
35 | xrange = range
36 |
37 |
38 | #----------------------------------------------------------------------
39 | # word strip
40 | #----------------------------------------------------------------------
41 | def stripword(word):
42 | return (''.join([ n for n in word if n.isalnum() ])).lower()
43 |
44 |
45 | #----------------------------------------------------------------------
46 | # StarDict
47 | #----------------------------------------------------------------------
48 | class StarDict (object):
49 |
50 | def __init__ (self, filename, verbose = False):
51 | self.__dbname = filename
52 | if filename != ':memory:':
53 | self.__dbname = os.path.abspath(filename)
54 | self.__conn = None
55 | self.__verbose = verbose
56 | self.__open()
57 |
58 | # initialize the database and create the required table and indexes
59 | def __open (self):
60 | sql = '''
61 | CREATE TABLE IF NOT EXISTS "stardict" (
62 | "id" INTEGER PRIMARY KEY AUTOINCREMENT NOT NULL UNIQUE,
63 | "word" VARCHAR(64) COLLATE NOCASE NOT NULL UNIQUE,
64 | "sw" VARCHAR(64) COLLATE NOCASE NOT NULL,
65 | "phonetic" VARCHAR(64),
66 | "definition" TEXT,
67 | "translation" TEXT,
68 | "pos" VARCHAR(16),
69 | "collins" INTEGER DEFAULT(0),
70 | "oxford" INTEGER DEFAULT(0),
71 | "tag" VARCHAR(64),
72 | "bnc" INTEGER DEFAULT(NULL),
73 | "frq" INTEGER DEFAULT(NULL),
74 | "exchange" TEXT,
75 | "detail" TEXT,
76 | "audio" TEXT
77 | );
78 | CREATE UNIQUE INDEX IF NOT EXISTS "stardict_1" ON stardict (id);
79 | CREATE UNIQUE INDEX IF NOT EXISTS "stardict_2" ON stardict (word);
80 | CREATE INDEX IF NOT EXISTS "stardict_3" ON stardict (sw, word collate nocase);
81 | CREATE INDEX IF NOT EXISTS "sd_1" ON stardict (word collate nocase);
82 | '''
83 |
84 | self.__conn = sqlite3.connect(self.__dbname, isolation_level = "IMMEDIATE")
85 | self.__conn.isolation_level = "IMMEDIATE"
86 |
87 | sql = '\n'.join([ n.strip('\t') for n in sql.split('\n') ])
88 | sql = sql.strip('\n')
89 |
90 | self.__conn.executescript(sql)
91 | self.__conn.commit()
92 |
93 | fields = ( 'id', 'word', 'sw', 'phonetic', 'definition',
94 | 'translation', 'pos', 'collins', 'oxford', 'tag', 'bnc', 'frq',
95 | 'exchange', 'detail', 'audio' )
96 | self.__fields = tuple([(fields[i], i) for i in range(len(fields))])
97 | self.__names = { }
98 | for k, v in self.__fields:
99 | self.__names[k] = v
100 | self.__enable = self.__fields[3:]
101 | return True
102 |
103 | # convert a database record into a dict
104 | def __record2obj (self, record):
105 | if record is None:
106 | return None
107 | word = {}
108 | for k, v in self.__fields:
109 | word[k] = record[v]
110 | if word['detail']:
111 | text = word['detail']
112 | try:
113 | obj = json.loads(text)
114 | except:
115 | obj = None
116 | word['detail'] = obj
117 | return word
118 |
119 | # close the database
120 | def close (self):
121 | if self.__conn:
122 | self.__conn.close()
123 | self.__conn = None
124 |
125 | def __del__ (self):
126 | self.close()
127 |
128 | # log output (only when verbose)
129 | def out (self, text):
130 | if self.__verbose:
131 | print(text)
132 | return True
133 |
134 | # look up a word (by id or headword)
135 | def query (self, key):
136 | c = self.__conn.cursor()
137 | record = None
138 | if isinstance(key, int) or isinstance(key, long):
139 | c.execute('select * from stardict where id = ?;', (key,))
140 | elif isinstance(key, str) or isinstance(key, unicode):
141 | c.execute('select * from stardict where word = ?', (key,))
142 | else:
143 | return None
144 | record = c.fetchone()
145 | return self.__record2obj(record)
146 |
147 | # match words close to the given spelling
148 | def match (self, word, limit = 10, strip = False):
149 | c = self.__conn.cursor()
150 | if not strip:
151 | sql = 'select id, word from stardict where word >= ? '
152 | sql += 'order by word collate nocase limit ?;'
153 | c.execute(sql, (word, limit))
154 | else:
155 | sql = 'select id, word from stardict where sw >= ? '
156 | sql += 'order by sw, word collate nocase limit ?;'
157 | c.execute(sql, (stripword(word), limit))
158 | records = c.fetchall()
159 | result = []
160 | for record in records:
161 | result.append(tuple(record))
162 | return result
163 |
164 | # batch query
165 | def query_batch (self, keys):
166 | sql = 'select * from stardict where '
167 | if keys is None:
168 | return None
169 | if not keys:
170 | return []
171 | querys = []
172 | for key in keys:
173 | if isinstance(key, int) or isinstance(key, long):
174 | querys.append('id = ?')
175 | elif key is not None:
176 | querys.append('word = ?')
177 | sql = sql + ' or '.join(querys) + ';'
178 | query_word = {}
179 | query_id = {}
180 | c = self.__conn.cursor()
181 | c.execute(sql, tuple(keys))
182 | for row in c:
183 | obj = self.__record2obj(row)
184 | query_word[obj['word'].lower()] = obj
185 | query_id[obj['id']] = obj
186 | results = []
187 | for key in keys:
188 | if isinstance(key, int) or isinstance(key, long):
189 | results.append(query_id.get(key, None))
190 | elif key is not None:
191 | results.append(query_word.get(key.lower(), None))
192 | else:
193 | results.append(None)
194 | return tuple(results)
195 |
196 | # total number of words
197 | def count (self):
198 | c = self.__conn.cursor()
199 | c.execute('select count(*) from stardict;')
200 | record = c.fetchone()
201 | return record[0]
202 |
203 | # register a new word
204 | def register (self, word, items, commit = True):
205 | sql = 'INSERT INTO stardict(word, sw) VALUES(?, ?);'
206 | try:
207 | self.__conn.execute(sql, (word, stripword(word)))
208 | except sqlite3.IntegrityError as e:
209 | self.out(str(e))
210 | return False
211 | except sqlite3.Error as e:
212 | self.out(str(e))
213 | return False
214 | self.update(word, items, commit)
215 | return True
216 |
217 | # remove a word
218 | def remove (self, key, commit = True):
219 | if isinstance(key, int) or isinstance(key, long):
220 | sql = 'DELETE FROM stardict WHERE id=?;'
221 | else:
222 | sql = 'DELETE FROM stardict WHERE word=?;'
223 | try:
224 | self.__conn.execute(sql, (key,))
225 | if commit:
226 | self.__conn.commit()
227 | except sqlite3.IntegrityError:
228 | return False
229 | return True
230 |
231 | # delete everything in the database
232 | def delete_all (self, reset_id = False):
233 | sql1 = 'DELETE FROM stardict;'
234 | sql2 = "UPDATE sqlite_sequence SET seq = 0 WHERE name = 'stardict';"
235 | try:
236 | self.__conn.execute(sql1)
237 | if reset_id:
238 | self.__conn.execute(sql2)
239 | self.__conn.commit()
240 | except sqlite3.IntegrityError as e:
241 | self.out(str(e))
242 | return False
243 | except sqlite3.Error as e:
244 | self.out(str(e))
245 | return False
246 | return True
247 |
248 | # update a word's fields
249 | def update (self, key, items, commit = True):
250 | names = []
251 | values = []
252 | for name, id in self.__enable:
253 | if name in items:
254 | names.append(name)
255 | value = items[name]
256 | if name == 'detail':
257 | if value is not None:
258 | value = json.dumps(value, ensure_ascii = False)
259 | values.append(value)
260 | if len(names) == 0:
261 | if commit:
262 | try:
263 | self.__conn.commit()
264 | except sqlite3.IntegrityError:
265 | return False
266 | return False
267 | sql = 'UPDATE stardict SET ' + ', '.join(['%s=?'%n for n in names])
268 | if isinstance(key, str) or isinstance(key, unicode):
269 | sql += ' WHERE word=?;'
270 | else:
271 | sql += ' WHERE id=?;'
272 | try:
273 | self.__conn.execute(sql, tuple(values + [key]))
274 | if commit:
275 | self.__conn.commit()
276 | except sqlite3.IntegrityError:
277 | return False
278 | return True
279 |
280 | # iterate over the dictionary
281 | def __iter__ (self):
282 | c = self.__conn.cursor()
283 | sql = 'select "id", "word" from "stardict"'
284 | sql += ' order by "word" collate nocase;'
285 | c.execute(sql)
286 | return c.__iter__()
287 |
288 | # number of entries
289 | def __len__ (self):
290 | return self.count()
291 |
292 | # existence test
293 | def __contains__ (self, key):
294 | return self.query(key) is not None
295 |
296 | # word lookup
297 | def __getitem__ (self, key):
298 | return self.query(key)
299 |
300 | # commit changes
301 | def commit (self):
302 | try:
303 | self.__conn.commit()
304 | except sqlite3.IntegrityError:
305 | self.__conn.rollback()
306 | return False
307 | return True
308 |
309 | # dump all words
310 | def dumps (self):
311 | return [ n for _, n in self.__iter__() ]
312 |
313 |
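# Usage sketch (mirrors test1() at the bottom of this file):
#     sd = StarDict(':memory:')
#     sd.register('kiss', {'definition': 'kiss me'}, False)
#     sd.commit()
#     sd.query('kiss')['definition']   # -> 'kiss me'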
314 |
315 | #----------------------------------------------------------------------
316 | # startup MySQLdb
317 | #----------------------------------------------------------------------
318 | def mysql_startup():
319 | global MySQLdb
320 | if MySQLdb is not None:
321 | return True
322 | try:
323 | import MySQLdb as _mysql
324 | MySQLdb = _mysql
325 | except ImportError:
326 | return False
327 | return True
328 |
329 |
330 | #----------------------------------------------------------------------
331 | # DictMysql
332 | #----------------------------------------------------------------------
333 | class DictMySQL (object):
334 |
335 | def __init__ (self, desc, init = False, timeout = 10, verbose = False):
336 | self.__argv = {}
337 | self.__uri = {}
338 | if isinstance(desc, dict):
339 | argv = desc
340 | else:
341 | argv = self.__url_parse(desc)
342 | for k, v in argv.items():
343 | self.__argv[k] = v
344 | if k not in ('engine', 'init', 'db', 'verbose'):
345 | self.__uri[k] = v
346 | self.__uri['connect_timeout'] = timeout
347 | self.__conn = None
348 | self.__verbose = verbose
349 | self.__init = init
350 | if 'db' not in argv:
351 | raise KeyError('db name not found')
352 | self.__open()
353 |
354 | def __open (self):
355 | mysql_startup()
356 | if MySQLdb is None:
357 | raise ImportError('No module named MySQLdb')
358 | fields = [ 'id', 'word', 'sw', 'phonetic', 'definition',
359 | 'translation', 'pos', 'collins', 'oxford', 'tag', 'bnc', 'frq',
360 | 'exchange', 'detail', 'audio' ]
361 | self.__fields = tuple([(fields[i], i) for i in range(len(fields))])
362 | self.__names = { }
363 | for k, v in self.__fields:
364 | self.__names[k] = v
365 | self.__enable = self.__fields[3:]
366 | self.__db = self.__argv.get('db', 'stardict')
367 | if not self.__init:
368 | uri = {}
369 | for k, v in self.__uri.items():
370 | uri[k] = v
371 | uri['db'] = self.__db
372 | self.__conn = MySQLdb.connect(**uri)
373 | else:
374 | self.__conn = MySQLdb.connect(**self.__uri)
375 | return self.init()
376 | return True
377 |
378 | # log output (only when verbose)
379 | def out (self, text):
380 | if self.__verbose:
381 | print(text)
382 | return True
383 |
384 | # initialize the database and table
385 | def init (self):
386 | database = self.__argv.get('db', 'stardict')
387 | self.out('create database: %s'%database)
388 | self.__conn.query("SET sql_notes = 0;")
389 | self.__conn.query('CREATE DATABASE IF NOT EXISTS %s;'%database)
390 | self.__conn.query('USE %s;'%database)
391 | # self.__conn.query('drop table if exists stardict')
392 | sql = '''
393 | CREATE TABLE IF NOT EXISTS `%s`.`stardict` (
394 | `id` INT PRIMARY KEY NOT NULL AUTO_INCREMENT,
395 | `word` VARCHAR(64) NOT NULL UNIQUE KEY,
396 | `sw` VARCHAR(64) NOT NULL,
397 | `phonetic` VARCHAR(64),
398 | `definition` TEXT,
399 | `translation` TEXT,
400 | `pos` VARCHAR(16),
401 | `collins` SMALLINT DEFAULT 0,
402 | `oxford` SMALLINT DEFAULT 0,
403 | `tag` VARCHAR(64),
404 | `bnc` INT DEFAULT NULL,
405 | `frq` INT DEFAULT NULL,
406 | `exchange` TEXT,
407 | `detail` TEXT,
408 | `audio` TEXT,
409 | KEY(`sw`, `word`),
410 | KEY(`collins`),
411 | KEY(`oxford`),
412 | KEY(`tag`)
413 | )
414 | '''%(database)
415 | sql = '\n'.join([ n.strip('\t') for n in sql.split('\n') ])
416 | sql = sql.strip('\n')
417 | sql += ' ENGINE=MyISAM DEFAULT CHARSET=utf8;'
418 | self.__conn.query(sql)
419 | self.__conn.commit()
420 | return True
421 |
422 | # parse mysql://user:passwd@host:port/database
423 | def __url_parse (self, url):
424 | if url[:8] != 'mysql://':
425 | return None
426 | url = url[8:]
427 | obj = {}
428 | part = url.split('/')
429 | main = part[0]
430 | p1 = main.find('@')
431 | if p1 >= 0:
432 | text = main[:p1].strip()
433 | main = main[p1 + 1:]
434 | p1 = text.find(':')
435 | if p1 >= 0:
436 | obj['user'] = text[:p1].strip()
437 | obj['passwd'] = text[p1 + 1:].strip()
438 | else:
439 | obj['user'] = text
440 | p1 = main.find(':')
441 | if p1 >= 0:
442 | port = main[p1 + 1:]
443 | main = main[:p1]
444 | obj['port'] = int(port)
445 | main = main.strip()
446 | if not main:
447 | main = 'localhost'
448 | obj['host'] = main.strip()
449 | if len(part) >= 2:
450 | obj['db'] = part[1]
451 | return obj
452 |
453 | # convert a database record into a dict
454 | def __record2obj (self, record):
455 | if record is None:
456 | return None
457 | word = {}
458 | for k, v in self.__fields:
459 | word[k] = record[v]
460 | if word['detail']:
461 | text = word['detail']
462 | try:
463 | obj = json.loads(text)
464 | except:
465 | obj = None
466 | word['detail'] = obj
467 | return word
468 |
469 | # close the database
470 | def close (self):
471 | if self.__conn:
472 | self.__conn.close()
473 | self.__conn = None
474 |
475 | def __del__ (self):
476 | self.close()
477 |
478 | # look up a word (by id or headword)
479 | def query (self, key):
480 | record = None
481 | if isinstance(key, int) or isinstance(key, long):
482 | sql = 'select * from stardict where id = %s;'
483 | elif isinstance(key, str) or isinstance(key, unicode):
484 | sql = 'select * from stardict where word = %s;'
485 | else:
486 | return None
487 | with self.__conn as c:
488 | c.execute(sql, (key,))
489 | record = c.fetchone()
490 | return self.__record2obj(record)
491 |
492 | # match words close to the given spelling
493 | def match (self, word, limit = 10, strip = False):
494 | c = self.__conn.cursor()
495 | if not strip:
496 | sql = 'select id, word from stardict where word >= %s '
497 | sql += 'order by word limit %s;'
498 | c.execute(sql, (word, limit))
499 | else:
500 | sql = 'select id, word from stardict where sw >= %s '
501 | sql += 'order by sw, word limit %s;'
502 | c.execute(sql, (stripword(word), limit))
503 | records = c.fetchall()
504 | result = []
505 | for record in records:
506 | result.append(tuple(record))
507 | return result
508 |
509 | # batch query
510 | def query_batch (self, keys):
511 | sql = 'select * from stardict where '
512 | if keys is None:
513 | return None
514 | if not keys:
515 | return []
516 | querys = []
517 | for key in keys:
518 | if isinstance(key, int) or isinstance(key, long):
519 | querys.append('id = %s')
520 | elif key is not None:
521 | querys.append('word = %s')
522 | sql = sql + ' or '.join(querys) + ';'
523 | query_word = {}
524 | query_id = {}
525 | with self.__conn as c:
526 | c.execute(sql, tuple(keys))
527 | for row in c:
528 | obj = self.__record2obj(row)
529 | query_word[obj['word'].lower()] = obj
530 | query_id[obj['id']] = obj
531 | results = []
532 | for key in keys:
533 | if isinstance(key, int) or isinstance(key, long):
534 | results.append(query_id.get(key, None))
535 | elif key is not None:
536 | results.append(query_word.get(key.lower(), None))
537 | else:
538 | results.append(None)
539 | return tuple(results)
540 |
541 | # register a new word
542 | def register (self, word, items, commit = True):
543 | sql = 'INSERT INTO stardict(word, sw) VALUES(%s, %s);'
544 | try:
545 | with self.__conn as c:
546 | c.execute(sql, (word, stripword(word)))
547 | except MySQLdb.Error as e:
548 | self.out(str(e))
549 | return False
550 | self.update(word, items, commit)
551 | return True
552 |
553 | # remove a word
554 | def remove (self, key, commit = True):
555 | if isinstance(key, int) or isinstance(key, long):
556 | sql = 'DELETE FROM stardict WHERE id=%s;'
557 | else:
558 | sql = 'DELETE FROM stardict WHERE word=%s;'
559 | try:
560 | with self.__conn as c:
561 | c.execute(sql, (key,))
562 | except MySQLdb.Error as e:
563 | self.out(str(e))
564 | return False
565 | return True
566 |
567 | # delete everything in the database
568 | def delete_all (self, reset_id = False):
569 | sql1 = 'DELETE FROM stardict;'
570 | try:
571 | with self.__conn as c:
572 | c.execute(sql1)
573 | except MySQLdb.Error as e:
574 | self.out(str(e))
575 | return False
576 | return True
577 |
578 | # update a word's fields
579 | def update (self, key, items, commit = True):
580 | names = []
581 | values = []
582 | for name, id in self.__enable:
583 | if name in items:
584 | names.append(name)
585 | value = items[name]
586 | if name == 'detail':
587 | if value is not None:
588 | value = json.dumps(value, ensure_ascii = False)
589 | values.append(value)
590 | if len(names) == 0:
591 | if commit:
592 | try:
593 | self.__conn.commit()
594 | except MySQLdb.Error as e:
595 | self.out(str(e))
596 | return False
597 | return False
598 | sql = 'UPDATE stardict SET ' + ', '.join(['%s=%%s'%n for n in names])
599 | if isinstance(key, str) or isinstance(key, unicode):
600 | sql += ' WHERE word=%s;'
601 | else:
602 | sql += ' WHERE id=%s;'
603 | try:
604 | with self.__conn as c:
605 | c.execute(sql, tuple(values + [key]))
606 | except MySQLdb.Error as e:
607 | self.out(str(e))
608 | return False
609 | return True
610 |
611 | # number of records
612 | def count (self):
613 | sql = 'SELECT count(*) FROM stardict;'
614 | try:
615 | with self.__conn as c:
616 | c.execute(sql)
617 | row = c.fetchone()
618 | return row[0]
619 | except MySQLdb.Error as e:
620 | self.out(str(e))
621 | return -1
622 | return 0
623 |
624 | # commit changes
625 | def commit (self):
626 | try:
627 | self.__conn.commit()
628 | except MySQLdb.Error as e:
629 | self.out(str(e))
630 | return False
631 | return True
632 |
633 | # number of entries
634 | def __len__ (self):
635 | return self.count()
636 |
637 | # existence test
638 | def __contains__ (self, key):
639 | return self.query(key) is not None
640 |
641 | # word lookup
642 | def __getitem__ (self, key):
643 | return self.query(key)
644 |
645 | # dump all words
646 | def dumps (self):
647 | return [ n for _, n in self.match('', 0x7fffffff) ]
648 |
649 |
650 |
651 | #----------------------------------------------------------------------
652 | # CSV COLUMNS
653 | #----------------------------------------------------------------------
654 | COLUMN_SIZE = 13
655 | COLUMN_ID = COLUMN_SIZE
656 | COLUMN_SD = COLUMN_SIZE + 1
657 | COLUMN_SW = COLUMN_SIZE + 2
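# Each in-memory row carries the 13 csv columns plus three internal slots:
# COLUMN_ID (position in __rows), COLUMN_SD (position in __index) and
# COLUMN_SW (the stripped word used for fuzzy matching).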
658 |
659 |
660 | #----------------------------------------------------------------------
661 | # DictCsv
662 | #----------------------------------------------------------------------
663 | class DictCsv (object):
664 |
665 | def __init__ (self, filename, codec = 'utf-8'):
666 | self.__csvname = None
667 | if filename is not None:
668 | self.__csvname = os.path.abspath(filename)
669 | self.__codec = codec
670 | self.__heads = ( 'word', 'phonetic', 'definition',
671 | 'translation', 'pos', 'collins', 'oxford', 'tag', 'bnc', 'frq',
672 | 'exchange', 'detail', 'audio' )
673 | heads = self.__heads
674 | self.__fields = tuple([ (heads[i], i) for i in range(len(heads)) ])
675 | self.__names = {}
676 | for k, v in self.__fields:
677 | self.__names[k] = v
678 | numbers = []
679 | for name in ('collins', 'oxford', 'bnc', 'frq'):
680 | numbers.append(self.__names[name])
681 | self.__numbers = tuple(numbers)
682 | self.__enable = self.__fields[1:]
683 | self.__dirty = False
684 | self.__words = {}
685 | self.__rows = []
686 | self.__index = []
687 | self.__read()
688 |
689 | def reset (self):
690 | self.__dirty = False
691 | self.__words = {}
692 | self.__rows = []
693 | self.__index = []
694 | return True
695 |
696 | def encode (self, text):
697 | if text is None:
698 | return None
699 | text = text.replace('\\', '\\\\').replace('\n', '\\n')
700 | return text.replace('\r', '\\r')
701 |
702 | def decode (self, text):
703 | output = []
704 | i = 0
705 | if text is None:
706 | return None
707 | size = len(text)
708 | while i < size:
709 | c = text[i]
710 | if c == '\\':
711 | c = text[i + 1:i + 2]
712 | if c == '\\':
713 | output.append('\\')
714 | elif c == 'n':
715 | output.append('\n')
716 | elif c == 'r':
717 | output.append('\r')
718 | else:
719 | output.append('\\' + c)
720 | i += 2
721 | else:
722 | output.append(c)
723 | i += 1
724 | return ''.join(output)
725 |
726 | # safely convert text to an integer
727 | def readint (self, text):
728 | if text is None:
729 | return None
730 | if text == '':
731 | return 0
732 | try:
733 | x = long(text)
734 | except:
735 | return 0
736 | if x < 0x7fffffff:
737 | return int(x)
738 | return x
739 |
740 | # read the csv file
741 | def __read (self):
742 | self.reset()
743 | filename = self.__csvname
744 | if filename is None:
745 | return False
746 | if not os.path.exists(self.__csvname):
747 | return False
748 | codec = self.__codec
749 | if sys.version_info[0] < 3:
750 | fp = open(filename, 'rb')
751 | content = fp.read()
752 | if not isinstance(content, type(b'')):
753 | content = content.encode(codec, 'ignore')
754 | content = content.replace(b'\r\n', b'\n')
755 | bio = io.BytesIO()
756 | bio.write(content)
757 | bio.seek(0)
758 | reader = csv.reader(bio)
759 | else:
760 | reader = csv.reader(open(filename, encoding = codec))
761 | rows = []
762 | index = []
763 | words = {}
764 | count = 0
765 | for row in reader:
766 | count += 1
767 | if count == 1:
768 | continue
769 | if len(row) < 1:
770 | continue
771 | if sys.version_info[0] < 3:
772 | row = [ n.decode(codec, 'ignore') for n in row ]
773 | if len(row) < COLUMN_SIZE:
774 | row.extend([None] * (COLUMN_SIZE - len(row)))
775 | if len(row) > COLUMN_SIZE:
776 | row = row[:COLUMN_SIZE]
777 | word = row[0].lower()
778 | if word in words:
779 | continue
780 | row.extend([0, 0, stripword(row[0])])
781 | words[word] = 1
782 | rows.append(row)
783 | index.append(row)
784 | self.__rows = rows
785 | self.__index = index
786 | self.__rows.sort(key = lambda row: row[0].lower())
787 | self.__index.sort(key = lambda row: (row[COLUMN_SW], row[0].lower()))
788 | for index in xrange(len(self.__rows)):
789 | row = self.__rows[index]
790 | row[COLUMN_ID] = index
791 | word = row[0].lower()
792 | self.__words[word] = row
793 | for index in xrange(len(self.__index)):
794 | row = self.__index[index]
795 | row[COLUMN_SD] = index
796 | return True
797 |
798 | # save to file
799 | def save (self, filename = None, codec = 'utf-8'):
800 | if filename is None:
801 | filename = self.__csvname
802 | if filename is None:
803 | return False
804 | if sys.version_info[0] < 3:
805 | fp = open(filename, 'wb')
806 | writer = csv.writer(fp)
807 | else:
808 | fp = open(filename, 'w', encoding = codec, newline = '')
809 | writer = csv.writer(fp)
810 | writer.writerow(self.__heads)
811 | for row in self.__rows:
812 | newrow = []
813 | for n in row:
814 | if isinstance(n, int) or isinstance(n, long):
815 | n = str(n)
816 | elif not isinstance(n, bytes):
817 | if (n is not None) and sys.version_info[0] < 3:
818 | n = n.encode(codec, 'ignore')
819 | newrow.append(n)
820 | writer.writerow(newrow[:COLUMN_SIZE])
821 | fp.close()
822 | return True
823 |
824 | # decode a row into an object
825 | def __obj_decode (self, row):
826 | if row is None:
827 | return None
828 | obj = {}
829 | obj['id'] = row[COLUMN_ID]
830 | obj['sw'] = row[COLUMN_SW]
831 | skip = self.__numbers
832 | for key, index in self.__fields:
833 | value = row[index]
834 | if index in skip:
835 | if value is not None:
836 | value = self.readint(value)
837 | elif key != 'detail':
838 | value = self.decode(value)
839 | obj[key] = value
840 | detail = obj.get('detail', None)
841 | if detail is not None:
842 | if detail != '':
843 | detail = json.loads(detail)
844 | else:
845 | detail = None
846 | obj['detail'] = detail
847 | return obj
848 |
849 | # encode an object into a row
850 | def __obj_encode (self, obj):
851 | row = [ None for i in xrange(len(self.__fields) + 3) ]
852 | for name, idx in self.__fields:
853 | value = obj.get(name, None)
854 | if value is None:
855 | continue
856 | if idx in self.__numbers:
857 | value = str(value)
858 | elif name == 'detail':
859 | value = json.dumps(value, ensure_ascii = False)
860 | else:
861 | value = self.encode(value)
862 | row[idx] = value
863 | return row
864 |
865 | # re-sort rows and rebuild the internal ids
866 | def __resort (self):
867 | self.__rows.sort(key = lambda row: row[0].lower())
868 | self.__index.sort(key = lambda row: (row[COLUMN_SW], row[0].lower()))
869 | for index in xrange(len(self.__rows)):
870 | row = self.__rows[index]
871 | row[COLUMN_ID] = index
872 | for index in xrange(len(self.__index)):
873 | row = self.__index[index]
874 | row[COLUMN_SD] = index
875 | self.__dirty = False
876 |
877 | # word lookup
878 | def query (self, key):
879 | if key is None:
880 | return None
881 | if self.__dirty:
882 | self.__resort()
883 | if isinstance(key, int) or isinstance(key, long):
884 | if key < 0 or key >= len(self.__rows):
885 | return None
886 | return self.__obj_decode(self.__rows[key])
887 | row = self.__words.get(key.lower(), None)
888 | return self.__obj_decode(row)
889 |
890 | # match words close to the given spelling
891 | def match (self, word, count = 10, strip = False):
892 | if len(self.__rows) == 0:
893 | return []
894 | if self.__dirty:
895 | self.__resort()
896 | if not strip:
897 | index = self.__rows
898 | pos = 0
899 | else:
900 | index = self.__index
901 | pos = COLUMN_SW
902 | top = 0
903 | bottom = len(index) - 1
904 | middle = top
905 | key = word.lower()
906 | if strip:
907 | key = stripword(word)
908 | while top < bottom:
909 | middle = (top + bottom) >> 1
910 | if top == middle or bottom == middle:
911 | break
912 | text = index[middle][pos].lower()
913 | if key == text:
914 | break
915 | elif key < text:
916 | bottom = middle
917 | elif key > text:
918 | top = middle
919 | while index[middle][pos].lower() < key:
920 | middle += 1
921 | if middle >= len(index):
922 | break
923 | cc = COLUMN_ID
924 | likely = [ (tx[cc], tx[0]) for tx in index[middle:middle + count] ]
925 | return likely
926 |
927 | # batch query
928 | def query_batch (self, keys):
929 | return [ self.query(key) for key in keys ]
930 |
931 | # total number of words
932 | def count (self):
933 | return len(self.__rows)
934 |
935 | # number of entries
936 | def __len__ (self):
937 | return len(self.__rows)
938 |
939 | # word lookup
940 | def __getitem__ (self, key):
941 | return self.query(key)
942 |
943 | # existence test
944 | def __contains__ (self, key):
945 | return self.__words.__contains__(key.lower())
946 |
947 | # iterator
948 | def __iter__ (self):
949 | record = []
950 | for index in xrange(len(self.__rows)):
951 | record.append((index, self.__rows[index][0]))
952 | return record.__iter__()
953 |
954 | # register a new word
955 | def register (self, word, items, commit = True):
956 | if word.lower() in self.__words:
957 | return False
958 | row = self.__obj_encode(items)
959 | row[0] = word
960 | row[COLUMN_ID] = len(self.__rows)
961 | row[COLUMN_SD] = len(self.__rows)
962 | row[COLUMN_SW] = stripword(word)
963 | self.__rows.append(row)
964 | self.__index.append(row)
965 | self.__words[word.lower()] = row
966 | self.__dirty = True
967 | return True
968 |
969 | # remove a word
970 | def remove (self, key, commit = True):
971 | if isinstance(key, int) or isinstance(key, long):
972 | if key < 0 or key >= len(self.__rows):
973 | return False
974 | if self.__dirty:
975 | self.__resort()
976 | key = self.__rows[key][0]
977 | row = self.__words.get(key, None)
978 | if row is None:
979 | return False
980 | if len(self.__rows) == 1:
981 | self.reset()
982 | return True
983 | index = row[COLUMN_ID]
984 | self.__rows[index] = self.__rows[len(self.__rows) - 1]
985 | self.__rows.pop()
986 | index = row[COLUMN_SD]
987 | self.__index[index] = self.__index[len(self.__index) - 1]
988 | self.__index.pop()
989 | del self.__words[key]
990 | self.__dirty = True
991 | return True
992 |
993 | # clear everything
994 | def delete_all (self, reset_id = False):
995 | self.reset()
996 | return True
997 |
998 | # update a word
999 | def update (self, key, items, commit = True):
1000 | if isinstance(key, int) or isinstance(key, long):
1001 | if key < 0 or key >= len(self.__rows):
1002 | return False
1003 | if self.__dirty:
1004 | self.__resort()
1005 | key = self.__rows[key][0]
1006 | key = key.lower()
1007 | row = self.__words.get(key, None)
1008 | if row is None:
1009 | return False
1010 | newrow = self.__obj_encode(items)
1011 | for name, idx in self.__fields:
1012 | if idx == 0:
1013 | continue
1014 | if name in items:
1015 | row[idx] = newrow[idx]
1016 | return True
1017 |
1018 | # commit changes (write back to the csv file)
1019 | def commit (self):
1020 | if self.__csvname:
1021 | self.save(self.__csvname, self.__codec)
1022 | return True
1023 |
1024 | # dump all words
1025 | def dumps (self):
1026 | return [ n for _, n in self.__iter__() ]
1027 |
1028 |
1029 | #----------------------------------------------------------------------
1030 | # Lemma derivation: find a verb's tenses, a noun's plurals, etc., or do the
1031 | # reverse lookup. One record per line, in the form: stem -> derived1,derived2,derived3
1032 | # It can be generated from Hunspell data; a simplified version (about 18k groups):
1033 | # http://www.lexically.net/downloads/version4/downloading%20BNC.htm
1034 | #----------------------------------------------------------------------
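# Example line (the '/number' frequency suffix is optional):
#     take/813 -> takes,taking,took,taken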
1035 | class LemmaDB (object):
1036 |
1037 | def __init__ (self):
1038 | self._stems = {}
1039 | self._words = {}
1040 | self._frqs = {}
1041 |
1042 | # load a data file
1043 | def load (self, filename, encoding = None):
1044 | content = open(filename, 'rb').read()
1045 | if content[:3] == b'\xef\xbb\xbf':
1046 | text = content[3:].decode('utf-8', 'ignore')
1047 | elif encoding is not None:
1048 | text = content.decode(encoding, 'ignore')
1049 | else:
1050 | text = None
1051 | match = ['utf-8', sys.getdefaultencoding(), 'ascii']
1052 | for encoding in match + ['gbk', 'latin1']:
1053 | try:
1054 | text = content.decode(encoding)
1055 | break
1056 | except:
1057 | pass
1058 | if text is None:
1059 | text = content.decode('utf-8', 'ignore')
1060 | number = 0
1061 | for line in text.split('\n'):
1062 | number += 1
1063 | line = line.strip('\r\n ')
1064 | if (not line) or (line[:1] == ';'):
1065 | continue
1066 | pos = line.find('->')
1067 | if pos < 0:
1068 | continue
1069 | stem = line[:pos].strip()
1070 | p1 = stem.find('/')
1071 | frq = 0
1072 | if p1 >= 0:
1073 | frq = int(stem[p1 + 1:].strip())
1074 | stem = stem[:p1].strip()
1075 | if not stem:
1076 | continue
1077 | if frq > 0:
1078 | self._frqs[stem] = frq
1079 | for word in line[pos + 2:].strip().split(','):
1080 | p1 = word.find('/')
1081 | if p1 >= 0:
1082 | word = word[:p1].strip()
1083 | if not word:
1084 | continue
1085 | self.add(stem, word.strip())
1086 | return True
1087 |
1088 | # save the data file
1089 | def save (self, filename, encoding = 'utf-8'):
1090 | stems = list(self._stems.keys())
1091 | stems.sort(key = lambda x: x.lower())
1092 | import codecs
1093 | fp = codecs.open(filename, 'w', encoding)
1094 | output = []
1095 | for stem in stems:
1096 | words = self.get(stem)
1097 | if not words:
1098 | continue
1099 | frq = self._frqs.get(stem, 0)
1100 | if frq > 0:
1101 | stem = '%s/%d'%(stem, frq)
1102 | output.append((-frq, u'%s -> %s'%(stem, ','.join(words))))
1103 | output.sort()
1104 | for _, text in output:
1105 | fp.write(text + '\n')
1106 | fp.close()
1107 | return True
1108 |
1109 | # add one derived word for a stem
1110 | def add (self, stem, word):
1111 | if stem not in self._stems:
1112 | self._stems[stem] = {}
1113 | if word not in self._stems[stem]:
1114 | self._stems[stem][word] = len(self._stems[stem])
1115 | if word not in self._words:
1116 | self._words[word] = {}
1117 | if stem not in self._words[word]:
1118 | self._words[word][stem] = len(self._words[word])
1119 | return True
1120 |
1121 | # remove one derived word of a stem
1122 | def remove (self, stem, word):
1123 | count = 0
1124 | if stem in self._stems:
1125 | if word in self._stems[stem]:
1126 | del self._stems[stem][word]
1127 | count += 1
1128 | if not self._stems[stem]:
1129 | del self._stems[stem]
1130 | if word in self._words:
1131 | if stem in self._words[word]:
1132 | del self._words[word][stem]
1133 | count += 1
1134 | if not self._words[word]:
1135 | del self._words[word]
1136 | return count > 0
1137 |
1138 | # clear the database
1139 | def reset (self):
1140 | self._stems = {}
1141 | self._words = {}
1142 | return True
1143 |
1144 | # find derivations for a stem, or (reverse) find the stem of a derivation
1145 | def get (self, word, reverse = False):
1146 | if not reverse:
1147 | if word not in self._stems:
1148 | if word in self._words:
1149 | return [word]
1150 | return None
1151 | words = [ (v, k) for (k, v) in self._stems[word].items() ]
1152 | else:
1153 | if word not in self._words:
1154 | if word in self._stems:
1155 | return [word]
1156 | return None
1157 | words = [ (v, k) for (k, v) in self._words[word].items() ]
1158 | words.sort()
1159 | return [ k for (v, k) in words ]
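# e.g. with a typical lemma file: get('take') -> ['takes', 'taking', 'took', 'taken'],
#      and get('took', reverse = True) -> ['take']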
1160 |
1161 | # given a word, find its stem(s)
1162 | def word_stem (self, word):
1163 | return self.get(word, reverse = True)
1164 |
1165 | # number of stem records
1166 | def stem_size (self):
1167 | return len(self._stems)
1168 |
1169 | # number of derived-word records
1170 | def word_size (self):
1171 | return len(self._words)
1172 |
1173 | def dump (self, what = 'ALL'):
1174 | words = {}
1175 | what = what.lower()
1176 | if what in ('all', 'stem'):
1177 | for word in self._stems:
1178 | words[word] = 1
1179 | if what in ('all', 'word'):
1180 | for word in self._words:
1181 | words[word] = 1
1182 | return words
1183 |
1184 | def __len__ (self):
1185 | return len(self._stems)
1186 |
1187 | def __getitem__ (self, stem):
1188 | return self.get(stem)
1189 |
1190 | def __contains__ (self, stem):
1191 | return (stem in self._stems)
1192 |
1193 | def __iter__ (self):
1194 | return self._stems.__iter__()
1195 |
1196 |
1197 |
1198 | #----------------------------------------------------------------------
1199 | # DictHelper
1200 | #----------------------------------------------------------------------
1201 | class DictHelper (object):
1202 |
1203 | def __init__ (self):
1204 | self._exchanges = {}
1205 | self._exchanges['p'] = u'过去式'
1206 | self._exchanges['d'] = u'过去分词'
1207 | self._exchanges['i'] = u'现在分词'
1208 | self._exchanges['3'] = u'第三人称单数'
1209 | self._exchanges['r'] = u'比较级'
1210 | self._exchanges['t'] = u'最高级'
1211 | self._exchanges['s'] = u'复数'
1212 | self._exchanges['0'] = u'原型' # the lemma of "best" is "good"
1213 | self._exchanges['1'] = u'类别' # the class of "best" is the 't' within "good"
1214 | self._pos = {}
1215 | self._pos['a'] = (u'代词', 'pron.')
1216 | self._pos['c'] = (u'连接词', 'conj.')
1217 | self._pos['d'] = (u'限定词', 'determiner')
1218 | self._pos['i'] = (u'介词', 'prep.')
1219 | self._pos['j'] = (u'形容词', 'adj.')
1220 | self._pos['m'] = (u'数词', 'num.')
1221 | self._pos['n'] = (u'名词', 'n.')
1222 | self._pos['p'] = (u'代词', 'pron.')
1223 | self._pos['r'] = (u'副词', 'adv.')
1224 | self._pos['u'] = (u'感叹词', 'int.')
1225 | self._pos['t'] = (u'不定式标记', 'infm.')
1226 | self._pos['v'] = (u'动词', 'v.')
1227 | self._pos['x'] = (u'否定标记', 'not')
1228 |
1229 | # return a progress indicator: pass in the total, call next() once per step
1230 | def progress (self, total):
1231 | class ProgressIndicator (object):
1232 | def __init__ (self, total):
1233 | self.count = 0
1234 | self.percent = -1
1235 | self.total = total
1236 | self.timestamp = time.time()
1237 | self.counter = {}
1238 | def next (self):
1239 | if self.total:
1240 | self.count += 1
1241 | pc = int(self.count * 100 / self.total)
1242 | if pc != self.percent:
1243 | self.percent = pc
1244 | print('progress: %d%%'%pc)
1245 | def inc (self, name):
1246 | if name not in self.counter:
1247 | self.counter[name] = 1
1248 | else:
1249 | self.counter[name] += 1
1250 | def done (self):
1251 | t = (time.time() - self.timestamp)
1252 | keys = list(self.counter.keys())
1253 | keys.sort()
1254 | for key in keys:
1255 | print('[%s] -> %d'%(key, self.counter[key]))
1256 | print('[Finished in %d seconds (%d)]'%(t, self.count))
1257 | return ProgressIndicator(total)
1258 |
1259 | # return a map of all words in the dictionary, lowercased by default
1260 | def dump_map (self, dictionary, lower = True):
1261 | words = {}
1262 | for _, word in dictionary:
1263 | if lower:
1264 | word = word.lower()
1265 | words[word] = 1
1266 | return words
1267 |
1268 | # export dictionary discrepancies (words missing from the dictionary)
1269 | def discrepancy_export (self, dictionary, words, outname, opts = ''):
1270 | existence = self.dump_map(dictionary)
1271 | if os.path.splitext(outname)[-1].lower() in ('.txt', '.csv'):
1272 | db = DictCsv(outname)
1273 | else:
1274 | db = StarDict(outname)
1275 | db.delete_all()
1276 | count = 0
1277 | for word in words:
1278 | if word.lower() in existence:
1279 | continue
1280 | if '(' in word:
1281 | continue
1282 | if '/' in word:
1283 | continue
1284 | if '"' in word or '#' in word:
1285 | continue
1286 | if '0' in word or '1' in word or '2' in word or '3' in word:
1287 | continue
1288 | if 's' in opts:
1289 | if word.count(' ') >= 2:
1290 | continue
1291 | if 't' in opts:
1292 | if ' ' in word:
1293 | continue
1294 | if 'p' in opts:
1295 | if '-' in word:
1296 | continue
1297 | try:
1298 | word.encode('ascii')
1299 | except:
1300 | continue
1301 | db.register(word, {'tag':'PENDING'}, False)
1302 | count += 1
1303 | db.commit()
1304 | print('exported %d entries'%count)
1305 | return count
1306 |
1307 | # import dictionary discrepancies
1308 | def discrepancy_import (self, dictionary, filename, opts = ''):
1309 | existence = self.dump_map(dictionary)
1310 | if os.path.splitext(filename)[-1].lower() in ('.csv', '.txt'):
1311 | db = DictCsv(filename)
1312 | else:
1313 | db = StarDict(filename)
1314 | count = 0
1315 | for word in self.dump_map(db, False):
1316 | data = db[word]
1317 | if data is None:
1318 | continue
1319 | if data['tag'] != 'OK':
1320 | continue
1321 | phonetic = data.get('phonetic', '')
1322 | definition = data.get('definition', '')
1323 | translation = data.get('translation', '')
1324 | update = {}
1325 | if phonetic:
1326 | update['phonetic'] = phonetic
1327 | if definition:
1328 | update['definition'] = definition
1329 | if translation:
1330 | update['translation'] = translation
1331 | if not update:
1332 | continue
1333 | if word.lower() in existence:
1334 | if 'n' not in opts:
1335 | dictionary.update(word, update, False)
1336 | else:
1337 | dictionary.register(word, update, False)
1338 | count += 1
1339 | dictionary.commit()
1340 | print('imported %d entries'%count)
1341 | return count
1342 |
1343 | # discrepancy check against a utf-8 .txt file (word, then tab, then phonetic/definition)
1344 | def deficit_tab_txt (self, dictionary, txt, outname, opts = ''):
1345 | deficit = {}
1346 | for line in codecs.open(txt, encoding = 'utf-8'):
1347 | row = [ n.strip() for n in line.split('\t') ]
1348 | if len(row) < 2:
1349 | continue
1350 | word = row[0]
1351 | deficit[word] = 1
1352 | return self.discrepancy_export(dictionary, deficit, outname, opts)
1353 |
1354 | # export a StarDict dictionary from a word -> definition map
1355 | def export_stardict (self, wordmap, outname, title):
1356 | mainname = os.path.splitext(outname)[0]
1357 | keys = [ k for k in wordmap ]
1358 | keys.sort(key = lambda x: (x.lower(), x))
1359 | import struct
1360 | pc = self.progress(len(wordmap))
1361 | position = 0
1362 | with open(mainname + '.idx', 'wb') as f1:
1363 | with open(mainname + '.dict', 'wb') as f2:
1364 | for word in keys:
1365 | pc.next()
1366 | f1.write(word.encode('utf-8', 'ignore') + b'\x00')
1367 | text = wordmap[word].encode('utf-8', 'ignore')
1368 | f1.write(struct.pack('>II', position, len(text)))
1369 | f2.write(text)
1370 | position += len(text)
1371 | with codecs.open(mainname + '.ifo', 'w', 'utf-8') as f3:
1372 | f3.write("StarDict's dict ifo file\nversion=2.4.2\n")
1373 | f3.write('wordcount=%d\n'%len(wordmap))
1374 | f3.write('idxfilesize=%d\n'%os.path.getsize(mainname + '.idx'))
1375 | f3.write('bookname=%s\n'%title)
1376 | f3.write('author=\ndescription=\n')
1377 | import datetime
1378 | ts = datetime.datetime.now().strftime('%Y.%m.%d')
1379 | f3.write('date=%s\nsametypesequence=m\n'%ts)
1380 | pc.done()
1381 | return True
1382 |
1383 | # export a mdict source file
1384 | def export_mdict (self, wordmap, outname):
1385 | keys = [ k for k in wordmap ]
1386 | keys.sort(key = lambda x: x.lower())
1387 | size = len(keys)
1388 | index = 0
1389 | pc = self.progress(size)
1390 | with codecs.open(outname, 'w', encoding = 'utf-8') as fp:
1391 | for key in keys:
1392 | pc.next()
1393 | word = key.replace('>', '').replace('\n', ' ')
1394 | text = wordmap[key].replace('>', '')
1395 | if not isinstance(word, unicode):
1396 | word = word.decode('gbk')
1397 | if not isinstance(text, unicode):
1398 | text = text.decode('gbk')
1399 | fp.write(word + '\r\n')
1400 | for line in text.split('\n'):
1401 | line = line.rstrip('\r')
1402 | fp.write(line)
1403 | fp.write('\r\n')
1404 | index += 1
1405 | fp.write('>' + ((index < size) and '\r\n' or ''))
1406 | pc.done()
1407 | return True
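# The emitted layout is: headword line, body lines, then a line holding '>'
# between entries -- the same format import_mdict() below reads back.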
1408 |
1409 | # import a mdx source file
1410 | def import_mdict (self, filename, encoding = 'utf-8'):
1411 | import codecs
1412 | words = {}
1413 | with codecs.open(filename, 'r', encoding = encoding) as fp:
1414 | text = []
1415 | word = None
1416 | for line in fp:
1417 | line = line.rstrip('\r\n')
1418 | if word is None:
1419 | if line == '':
1420 | continue
1421 | else:
1422 | word = line.strip()
1423 | elif line.strip() != '>':
1424 | text.append(line)
1425 | else:
1426 | words[word] = '\n'.join(text)
1427 | word = None
1428 | text = []
1429 | return words
1430 |
1431 | # generate a .mdx file directly; requires the writemdict module:
1432 | # https://github.com/skywind3000/writemdict
1433 | def export_mdx (self, wordmap, outname, title, desc = None):
1434 | try:
1435 | import writemdict
1436 | except ImportError:
1437 | print('ERROR: can\'t import writemdict module, please install it:')
1438 | print('https://github.com/skywind3000/writemdict')
1439 | sys.exit(1)
1440 | if desc is None:
1441 | desc = u'Create by stardict.py'
1442 | writer = writemdict.MDictWriter(wordmap, title = title,
1443 | description = desc)
1444 | with open(outname, 'wb') as fp:
1445 | writer.write(fp)
1446 | return True
1447 |
1448 | # read a .mdx file; requires the readmdict module:
1449 | # https://github.com/skywind3000/writemdict (includes readmdict)
1450 | def read_mdx (self, mdxname, mdd = False):
1451 | try:
1452 | import readmdict
1453 | except ImportError:
1454 | print('ERROR: can\'t import readmdict module, please install it:')
1455 | print('https://github.com/skywind3000/writemdict')
1456 | sys.exit(1)
1457 | words = {}
1458 | if not mdd:
1459 | mdx = readmdict.MDX(mdxname)
1460 | else:
1461 | mdx = readmdict.MDD(mdxname)
1462 | for key, value in mdx.items():
1463 | key = key.decode('utf-8', 'ignore')
1464 | if not mdd:
1465 | words[key] = value.decode('utf-8', 'ignore')
1466 | else:
1467 | words[key] = value
1468 | return words
1469 |
1470 | # dump a word-form exchange dict into its string form
1471 | def exchange_dumps (self, obj):
1472 | part = []
1473 | if not obj:
1474 | return None
1475 | for k, v in obj.items():
1476 | k = k.replace('/', '').replace(':', '').strip()
1477 | v = v.replace('/', '').replace(':', '').strip()
1478 | part.append(k + ':' + v)
1479 | return '/'.join(part)
1480 |
1481 | # parse a word-form exchange string
1482 | def exchange_loads (self, exchg):
1483 | if not exchg:
1484 | return None
1485 | obj = {}
1486 | for text in exchg.split('/'):
1487 | pos = text.find(':')
1488 | if pos < 0:
1489 | continue
1490 | k = text[:pos].strip()
1491 | v = text[pos + 1:].strip()
1492 | obj[k] = v
1493 | return obj
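# e.g. exchange_loads('p:took/d:taken/3:takes')
#      -> {'p': 'took', 'd': 'taken', '3': 'takes'}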
1494 |
1495 | def pos_loads (self, pos):
1496 | return self.exchange_loads(pos)
1497 |
1498 | def pos_dumps (self, obj):
1499 | return self.exchange_dumps(obj)
1500 |
1501 | # return the part of speech
1502 | def pos_detect (self, word, pos):
1503 | word = word.lower()
1504 | if pos == 'a':
1505 | if word in ('a', 'the',):
1506 | return (u'冠词', 'art.')
1507 | if word in ('no', 'every'):
1508 | return (u'形容词', 'adj.')
1509 | return (u'代词', 'pron.')
1510 | if pos in self._pos:
1511 | return self._pos[pos]
1512 | return (u'未知', 'unknown')
1513 |
1514 | # return the proportions of each part of speech
1515 | def pos_extract (self, data):
1516 | if 'pos' not in data:
1517 | return None
1518 | position = data['pos']
1519 | if not position:
1520 | return None
1521 | part = self.pos_loads(position)
1522 | result = []
1523 | for x in part:
1524 | result.append((x, part[x]))
1525 | result.sort(reverse = True, key = lambda t: int(t[1]))
1526 | final = []
1527 | for pos, num in result:
1528 | mode = self.pos_detect(data['word'], pos)
1529 | final.append((mode, num))
1530 | return final
1531 |
1532 | # set a detail item; None means delete it
1533 | def set_detail (self, dictionary, word, item, value, create = False):
1534 | data = dictionary.query(word)
1535 | if data is None:
1536 | if not create:
1537 | return False
1538 | dictionary.register(word, {}, False)
1539 | data = {}
1540 | detail = data.get('detail')
1541 | if not detail:
1542 | detail = {}
1543 | if value is not None:
1544 | detail[item] = value
1545 | elif item in detail:
1546 | del detail[item]
1547 | if not detail:
1548 | detail = None
1549 | dictionary.update(word, {'detail': detail}, False)
1550 | return True
1551 |
1552 | # get a detail item
1553 | def get_detail (self, dictionary, word, item):
1554 | data = dictionary.query(word)
1555 | if not data:
1556 | return None
1557 | detail = data.get('detail')
1558 | if not detail:
1559 | return None
1560 | return detail.get(item, None)
1561 |
1562 | # load file and guess encoding
1563 | def load_text (self, filename, encoding = None):
1564 | content = None
1565 | try:
1566 | content = open(filename, 'rb').read()
1567 | except:
1568 | return None
1569 | if content[:3] == b'\xef\xbb\xbf':
1570 | text = content[3:].decode('utf-8')
1571 | elif encoding is not None:
1572 | text = content.decode(encoding, 'ignore')
1573 | else:
1574 | text = None
1575 | guess = [sys.getdefaultencoding(), 'utf-8']
1576 | if sys.stdout and sys.stdout.encoding:
1577 | guess.append(sys.stdout.encoding)
1578 | for name in guess + ['gbk', 'ascii', 'latin1']:
1579 | try:
1580 | text = content.decode(name)
1581 | break
1582 | except:
1583 | pass
1584 | if text is None:
1585 | text = content.decode('utf-8', 'ignore')
1586 | return text
1587 |
1588 | # read csv with automatic encoding detection
1589 | def csv_load (self, filename, encoding = None):
1590 | text = self.load_text(filename, encoding)
1591 | if not text:
1592 | return None
1593 | import csv
1594 | if sys.version_info[0] < 3:
1595 | import cStringIO
1596 | sio = cStringIO.StringIO(text.encode('utf-8', 'ignore'))
1597 | else:
1598 | import io
1599 | sio = io.StringIO(text)
1600 | reader = csv.reader(sio)
1601 | output = []
1602 | if sys.version_info[0] < 3:
1603 | for row in reader:
1604 | output.append([ n.decode('utf-8', 'ignore') for n in row ])
1605 | else:
1606 | for row in reader:
1607 | output.append(row)
1608 | return output
1609 |
1610 | # save csv with an optional encoding
1611 | def csv_save (self, filename, rows, encoding = 'utf-8'):
1612 | import csv
1613 | ispy2 = (sys.version_info[0] < 3)
1614 | if not encoding:
1615 | encoding = 'utf-8'
1616 | if sys.version_info[0] < 3:
1617 | fp = open(filename, 'wb')
1618 | writer = csv.writer(fp)
1619 | else:
1620 | fp = open(filename, 'w', encoding = encoding, newline = '')
1621 | writer = csv.writer(fp)
1622 | for row in rows:
1623 | newrow = []
1624 | for n in row:
1625 | if isinstance(n, int) or isinstance(n, long):
1626 | n = str(n)
1627 | elif isinstance(n, float):
1628 | n = str(n)
1629 | elif not isinstance(n, bytes):
1630 | if (n is not None) and ispy2:
1631 | n = n.encode(encoding, 'ignore')
1632 | newrow.append(n)
1633 | writer.writerow(newrow)
1634 | fp.close()
1635 | return True
1636 |
1637 | # load a tab-separated txt file, returning key -> value
1638 | def tab_txt_load (self, filename, encoding = None):
1639 | words = {}
1640 | content = self.load_text(filename, encoding)
1641 | if content is None:
1642 | return None
1643 | for line in content.split('\n'):
1644 | line = line.strip('\r\n\t ')
1645 | if not line:
1646 | continue
1647 | p1 = line.find('\t')
1648 | if p1 < 0:
1649 | continue
1650 | word = line[:p1].rstrip('\r\n\t ')
1651 | text = line[p1:].lstrip('\r\n\t ')
1652 | text = text.replace('\\n', '\n').replace('\\r', '\r')
1653 | words[word] = text.replace('\\t', '\t').replace('\\\\', '\\')
1654 | return words
1655 |
1656 | # save a tab-separated txt file
1657 | def tab_txt_save (self, filename, words, encoding = 'utf-8'):
1658 | with codecs.open(filename, 'w', encoding = encoding) as fp:
1659 | for word in words:
1660 | text = words[word]
1661 | text = text.replace('\\', '\\\\').replace('\n', '\\n')
1662 | text = text.replace('\r', '\\r').replace('\t', '\\t')
1663 | fp.write('%s\t%s\r\n'%(word, text))
1664 | return True
1665 |
1666 | # import definitions from a tab-separated txt file
1667 | def tab_txt_import (self, dictionary, filename):
1668 | words = self.tab_txt_load(filename)
1669 | if not words:
1670 | return False
1671 | pc = self.progress(len(words))
1672 | for word in words:
1673 | data = dictionary.query(word)
1674 | if not data:
1675 | dictionary.register(word, {'translation':words[word]}, False)
1676 | else:
1677 | dictionary.update(word, {'translation':words[word]}, False)
1678 | pc.inc(0)
1679 | pc.next()
1680 | dictionary.commit()
1681 | pc.done()
1682 | return True
1683 |
1684 | # mdx builder: use writemdict instead of MdxBuilder for larger dictionaries (needs 64-bit python)
1685 | def mdx_build (self, srcname, outname, title, desc = None):
1686 | print('loading %s'%srcname)
1687 | t = time.time()
1688 | words = self.import_mdict(srcname)
1689 | t = time.time() - t
1690 | print(u'%d records loaded in %.3f seconds'%(len(words), t))
1691 | print(u'building %s'%outname)
1692 | t = time.time()
1693 | self.export_mdx(words, outname, title, desc)
1694 | t = time.time() - t
1695 | print(u'complete in %.3f seconds'%t)
1696 | return True
1697 |
1698 | # validate that a word is well-formed
1699 | def validate_word (self, word, asc128):
1700 | alpha = 0
1701 | for ch in word:
1702 | if ch.isalpha():
1703 | alpha += 1
1704 | if ord(ch) >= 128 and asc128:
1705 | return False
1706 | elif (not ch.isalpha()) and (not ch.isdigit()):
1707 | if ch not in ('-', '\'', '/', '(', ')', ' ', ',', '.'):
1708 | if ch not in ('&', '!', '?', '_'):
1709 | if len(word) == 5 and word[2] == ';':
1710 | continue
1711 | if not ord(ch) in (239, 65292):
1712 | # print 'f1', ord(ch), word.find(ch)
1713 | return False
1714 | if alpha == 0:
1715 | if not word.isdigit():
1716 | return False
1717 | if word[:1] == '"' and word[-1:] == '"':
1718 | return False
1719 | if word[:1] == '(' and word[-1:] == ')':
1720 | if word.count('(') == 1:
1721 | return False
1722 | if word[:3] == '(-)':
1723 | return False
1724 | for ch in ('<', '>', '%', '*', '@', '`'):
1725 | if ch in word:
1726 | return False
1727 | if '%' in word or '\\' in word or '`' in word:
1728 | return False
1729 | if word[:1] in ('$', '@'):
1730 | return False
1731 | if len(word) == 1:
1732 | x = ord(word)
1733 | if (x < ord('a')) or (x > ord('z')):
1734 | if (x < ord('A')) or (x > ord('Z')):
1735 | return False
1736 | if (' ' not in word) and ('-' not in word):
1737 | if ('?' in word) or ('!' in word):
1738 | return False
1739 | if word.count('?') >= 2:
1740 | return False
1741 | if word.count('!') >= 2:
1742 | return False
1743 | if '---' in word:
1744 | return False
1745 | try:
1746 | word.lower()
1747 | except UnicodeWarning:
1748 | return False
1749 | return True
1750 |
1751 |
1752 | #----------------------------------------------------------------------
1753 | # Helper instance
1754 | #----------------------------------------------------------------------
1755 | tools = DictHelper()
1756 |
1757 | # pick the database type from the file name and open it
1758 | def open_dict(filename):
1759 | if isinstance(filename, dict):
1760 | return DictMySQL(filename)
1761 | if filename[:8] == 'mysql://':
1762 | return DictMySQL(filename)
1763 | if os.path.splitext(filename)[-1].lower() in ('.csv', '.txt'):
1764 | return DictCsv(filename)
1765 | return StarDict(filename)
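# Usage sketch -- the file name picks the backend:
#     open_dict('ecdict.csv')                -> DictCsv
#     open_dict('stardict.db')               -> StarDict (sqlite3)
#     open_dict('mysql://user:pw@host/db')   -> DictMySQL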
1766 |
1767 |
1768 | # dictionary conversion: convert between csv and sqlite
1769 | def convert_dict(dstname, srcname):
1770 | dst = open_dict(dstname)
1771 | src = open_dict(srcname)
1772 | dst.delete_all()
1773 | pc = tools.progress(len(src))
1774 | for word in src.dumps():
1775 | pc.next()
1776 | data = src[word]
1777 | x = data['oxford']
1778 | if isinstance(x, int) or isinstance(x, long):
1779 | if x <= 0:
1780 | data['oxford'] = None
1781 | elif isinstance(x, str) or isinstance(x, unicode):
1782 | if x == '' or x == '0':
1783 | data['oxford'] = None
1784 | x = data['collins']
1785 | if isinstance(x, int) or isinstance(x, long):
1786 | if x <= 0:
1787 | data['collins'] = None
1788 | elif isinstance(x, str) or isinstance(x, unicode):
1789 | if x in ('', '0'):
1790 | data['collins'] = None
1791 | dst.register(word, data, False)
1792 | dst.commit()
1793 | pc.done()
1794 | return True
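# e.g. convert_dict('ecdict.db', 'ecdict.csv') rebuilds the sqlite dictionary
# from the csv edition (destination first, source second).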
1795 |
1796 |
1797 | # open a dictionary under ~/.local/share/stardict
1798 | def open_local(filename):
1799 | base = os.path.expanduser('~/.local')
1800 | for dir in [base, base + '/share', base + '/share/stardict']:
1801 | if not os.path.exists(dir):
1802 | os.mkdir(dir)
1803 | fn = os.path.join(base + '/share/stardict', filename)
1804 | return open_dict(fn)
1805 |
1806 |
1807 |
1808 |
1809 | #----------------------------------------------------------------------
1810 | # testing
1811 | #----------------------------------------------------------------------
1812 | if __name__ == '__main__':
1813 | db = os.path.join(os.path.dirname(__file__), 'test.db')
1814 | my = {'host':'??', 'user':'skywind', 'passwd':'??', 'db':'skywind_t1'}
1815 | def test1():
1816 | t = time.time()
1817 | sd = StarDict(db, False)
1818 | print(time.time() - t)
1819 | # sd.delete_all(True)
1820 | print(sd.register('kiss2', {'definition':'kiss me'}, False))
1821 | print(sd.register('kiss here', {'definition':'kiss me'}, False))
1822 | print(sd.register('Kiss', {'definition':'BIG KISS'}, False))
1823 | print(sd.register('kiss', {'definition':'kiss me'}, False))
1824 | print(sd.register('suck', {'definition':'suck me'}, False))
1825 | print(sd.register('give', {'definition':'give me', 'detail':[1,2,3]}, False))
1826 | sd.commit()
1827 | print('')
1828 | print(sd.count())
1829 | print(sd.query('kiSs'))
1830 | print(sd.query(2))
1831 | print(sd.match('kis', 10))
1832 | print('')
1833 | print(sd.query_batch(['give', 2]))
1834 | print(sd.match('kisshere', 10, True))
1835 | return 0
1836 | def test2():
1837 | t = time.time()
1838 | dm = DictMySQL(my, init = True)
1839 | print(time.time() - t)
1840 | # dm.delete_all(True)
1841 | print(dm.register('kiss2', {'definition':'kiss me'}, False))
1842 | print(dm.register('kiss here', {'definition':'kiss me'}, False))
1843 | print(dm.register('Kiss', {'definition':'kiss me'}, False))
1844 | print(dm.register('kiss', {'definition':'BIG KISS'}, False))
1845 | print(dm.register('suck', {'definition':'suck me'}, False))
1846 | print(dm.register('give', {'definition':'give me'}, False))
1847 | print(dm.query('kiss'))
1848 | print(dm.match('kis'))
1849 | print('')
1850 | print(dm.query('KiSs'))
1851 | print(dm.query_batch(['give', 2, 9]))
1852 | print('count: %d'%len(dm))
1853 | print(dm.match('kisshere', 10, True))
1854 | return 0
1855 | def test3():
1856 | csvname = os.path.join(os.path.dirname(__file__), 'test.csv')
1857 | dc = DictCsv(csvname)
1858 | dc.delete_all()
1859 | print(dc.register('kiss2', {'definition':'kiss me'}, False))
1860 | print(dc.register('kiss here', {'definition':'kiss me'}, False))
1861 | print(dc.register('Kiss', {'definition':'kiss me'}, False))
1862 | print(dc.register('kiss', {'definition':'kiss me'}, False))
1863 | print(dc.register('suck', {'definition':'suck me'}, False))
1864 | print(dc.register('word', {'definition':'WORD WORD'}, False))
1865 | print(dc.query('kiss'))
1866 | print('')
1867 | dc.remove('kiss2')
1868 | print(dc.match('kis'))
1869 | print(dc.match('kisshere', 10, True))
1870 | dc.commit()
1871 | return 0
1872 | def test4():
1873 | lemma = LemmaDB()
1874 | t = time.time()
1875 | lemma.load('lemma.en.txt')
1876 | print('load in %s seconds'%str(time.time() - t))
1877 | print(len(lemma))
1878 | for word in ('be', 'give', 'see', 'take'):
1879 | print('%s -> %s'%(word, ','.join(lemma.get(word))))
1880 | for word in ('gave', 'taken', 'looked', 'teeth', 'speak'):
1881 | print('%s <- %s'%(word, ','.join(lemma.word_stem(word))))
1882 | lemma.save('output.txt')
1883 | return 0
1884 | def test5():
1885 | print(tools.validate_word('Hello World', False))
1886 | test3()
1887 |
1888 |
1889 |
1890 |
--------------------------------------------------------------------------------