├── anki_packager
│   ├── dict
│   │   ├── __init__.py
│   │   ├── longman.py
│   │   ├── eudic.py
│   │   ├── ecdict.py
│   │   ├── youdao.py
│   │   └── stardict.py
│   ├── __main__.py
│   ├── __init__.py
│   ├── logger.py
│   ├── prompt.py
│   ├── utils.py
│   ├── ai.py
│   ├── packager
│   │   └── deck.py
│   └── cli.py
├── config
│   ├── vocabulary.txt
│   └── config.toml
├── test.apkg
├── dicts
│   ├── 有道词语辨析.mdx
│   └── 单词释义比例词典-带词性.mdx
├── images
│   ├── apkg.png
│   └── 卡片预览.png
├── publish.sh
├── MANIFEST.in
├── Dockerfile
├── LICENSE
├── setup.py
├── Makefile
├── requirements.txt
├── README.md
└── .gitignore

--------------------------------------------------------------------------------
/anki_packager/dict/__init__.py:
--------------------------------------------------------------------------------
 1 | 

--------------------------------------------------------------------------------
/anki_packager/dict/longman.py:
--------------------------------------------------------------------------------
 1 | 

--------------------------------------------------------------------------------
/config/vocabulary.txt:
--------------------------------------------------------------------------------
 1 | reform
 2 | open

--------------------------------------------------------------------------------
/test.apkg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yaoyhu/anki_packager/HEAD/test.apkg

--------------------------------------------------------------------------------
/dicts/有道词语辨析.mdx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yaoyhu/anki_packager/HEAD/dicts/有道词语辨析.mdx

--------------------------------------------------------------------------------
/images/apkg.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yaoyhu/anki_packager/HEAD/images/apkg.png

--------------------------------------------------------------------------------
/images/卡片预览.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yaoyhu/anki_packager/HEAD/images/卡片预览.png

--------------------------------------------------------------------------------
/dicts/单词释义比例词典-带词性.mdx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yaoyhu/anki_packager/HEAD/dicts/单词释义比例词典-带词性.mdx

--------------------------------------------------------------------------------
/anki_packager/__main__.py:
--------------------------------------------------------------------------------
 1 | import asyncio
 2 | from .cli import main as _async_main
 3 | 
 4 | 
 5 | # 同步包装:cli.main 是协程函数,setup.py 的 console_scripts 入口
 6 | # (apkger=anki_packager.__main__:main)需要一个普通可调用对象,
 7 | # 否则直接调用只会返回一个未运行的 coroutine。
 8 | def main():
 9 |     asyncio.run(_async_main())
10 | 
11 | 
12 | if __name__ == "__main__":
13 |     main()

--------------------------------------------------------------------------------
/anki_packager/__init__.py:
--------------------------------------------------------------------------------
 1 | __version__ = "0.9.5"
 2 | 
 3 | from .utils import initialize_config
 4 | 
 5 | try:
 6 |     initialize_config()
 7 | except Exception as e:
 8 |     import sys
 9 | 
10 |     print(f"Warning: Unable to initialize configuration: {e}", file=sys.stderr)

--------------------------------------------------------------------------------
/publish.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # Clean up previous builds
 4 | rm -rf build/ dist/ *.egg-info/
 5 | 
 6 | # Build source and wheel distributions
 7 | python -m build
 8 | 
 9 | # Upload to PyPI
10 | # Uncomment when ready to publish:
11 | # twine upload dist/*
12 | 
13 | echo -e "Build completed. \nTo publish to PyPI, run: twine upload dist/*"
14 | 
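补充一个最小的编程式调用示意(文档外新增示例,非仓库文件):`apkger` 命令行入口最终执行的是 `cli.main`,参数来自 `sys.argv`,因此也可以在 Python 脚本中直接驱动它。`--disable_ai` 是 cli.py 中真实存在的参数;"已通过 pip 安装本包"是此示例的假设前提。

```python
# 示意:以编程方式调用 apkger,等价于命令行 `apkger --disable_ai`
import sys

from anki_packager.__main__ import main

sys.argv = ["apkger", "--disable_ai"]  # 首项仅作程序名占位
main()  # 内部通过 asyncio.run() 驱动异步的 cli.main()
```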
--------------------------------------------------------------------------------
/config/config.toml:
--------------------------------------------------------------------------------
 1 | PROXY = ""
 2 | EUDIC_TOKEN = ""
 3 | EUDIC_ID = "0"
 4 | DECK_NAME = "anki_packager"
 5 | 
 6 | [[MODEL_PARAM]]
 7 | model = "gemini/gemini-2.5-flash"
 8 | api_key = "GEMINI_API_KEY"
 9 | rpm = 10 # 每分钟请求次数
10 | 
11 | # [[MODEL_PARAM]]
12 | # model = "openai/gpt-4o"
13 | # api_key = "OPENAI_API_KEY"
14 | # api_base = "YOUR_API_BASE"
15 | # rpm = 200
16 | 

--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
 1 | include LICENSE
 2 | include README.md
 3 | include requirements.txt
 4 | recursive-include anki_packager/packager *.py
 5 | global-exclude *.py[cod] __pycache__ *.so
 6 | global-exclude */__pycache__/*
 7 | exclude anki_packager.log
 8 | exclude *.pyc
 9 | exclude __pycache__
10 | exclude *.apkg
11 | # 排除所有字典数据文件
12 | exclude anki_packager/dicts/*.mdx
13 | exclude anki_packager/dicts/*.7z
14 | exclude anki_packager/dicts/*.db
15 | exclude anki_packager/dicts/*.csv
16 | 

--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
 1 | # 你可能需要代理
 2 | # FROM hub.icert.top/python:3.10.16-slim
 3 | 
 4 | FROM python:3.10.16-slim
 5 | 
 6 | RUN apt-get update && apt-get install -y --no-install-recommends \
 7 |     gcc \
 8 |     build-essential \
 9 |     libffi-dev \
10 |     && rm -rf /var/lib/apt/lists/*
11 | 
12 | WORKDIR /app
13 | 
14 | COPY requirements.txt ./
15 | 
16 | RUN pip install --no-cache-dir -r requirements.txt
17 | 
18 | COPY . .
19 | 
20 | RUN mkdir -p config dicts
21 | 
22 | ENV PYTHONUNBUFFERED=1
23 | 
24 | ENTRYPOINT ["python", "-m", "anki_packager", "--disable_ai"]
25 | CMD []
26 | 

--------------------------------------------------------------------------------
/anki_packager/logger.py:
--------------------------------------------------------------------------------
 1 | import logging
 2 | 
 3 | # ANSI escape codes for bold blue
 4 | BOLD_BLUE = "\033[1;34m"
 5 | RESET = "\033[0m"
 6 | 
 7 | logging.basicConfig(
 8 |     level=logging.INFO,
 9 |     format=f"{BOLD_BLUE}[%(filename)s:%(lineno)d:%(funcName)s]{RESET} %(message)s",
10 |     handlers=[logging.FileHandler("anki_packager.log"), logging.StreamHandler()],
11 | )
12 | 
13 | litellm_logger = logging.getLogger("LiteLLM")
14 | litellm_logger.setLevel(logging.WARNING)
15 | litellm_logger = logging.getLogger("LiteLLM Router")
16 | litellm_logger.setLevel(logging.WARNING)
17 | 
18 | logging.getLogger("httpx").setLevel(logging.WARNING)
19 | logger = logging.getLogger(__name__)
20 | 

--------------------------------------------------------------------------------
/anki_packager/prompt.py:
--------------------------------------------------------------------------------
 1 | PROMPT = """
 2 | 你是一名中英文双语教育专家,拥有帮助将中文视为母语的用户理解和记忆英语单词的专长,请根据用户提供的英语单词,用中文且仅用 json 格式回复:
 3 | {
 4 |     "word": "用户提供的单词",
 5 |     "origin": {
 6 |         "etymology": "<详细介绍单词的造词来源和发展历史,以及在欧美文化中的内涵>",
 7 |         "mnemonic": {
 8 |             "associative": "联想记忆:<提供一个联想记忆,帮助用户记住单词的含义>",
 9 |             "homophone": "谐音记忆:<提供一个谐音记忆,帮助用户记住单词的拼写>"
10 |         }
11 |     },
12 |     "tenses": "<按照以下格式(如果存在)列出词形变化:'v. 动词原形, 过去式, 过去分词, 现在分词; adj. 形容词形式; n. 名词形式; adv. 副词形式'...>",
13 |     "story": {
14 |         "english": "<用英文撰写一个有画面感的场景故事。要求:1. 必须包含目标单词;2. 使用简单易懂的词汇;3. 长度在80-100个单词之间;4. 突出目标单词的使用场景>",
15 |         "chinese": "<故事的中文翻译,保持与英文版本一致的语气和画面感>"
16 |     }
17 | }
18 | 
19 | 注意事项:
20 | 1. 在 tenses 中,只填写客观存在的词形,不要随意捏造或添加不存在的词形。并且只包含英文,不要加入中文注解。
21 | """
22 | 
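PROMPT 要求模型仅返回 JSON,字段需与 ai.py 中的 `WordExplanation` 模型一一对应。下面给出一个约定结构的示意(示例数据为手写假设,仅用于说明字段形状):

```python
# 示意:PROMPT 约定的返回 JSON 结构,可被 ai.py 的 WordExplanation 校验通过
import json

sample = json.loads("""
{
  "word": "reform",
  "origin": {
    "etymology": "re-(再次)+ form(形成),源自拉丁语 reformare。",
    "mnemonic": {
      "associative": "联想记忆:重新(re)塑形(form),即改革。",
      "homophone": "谐音记忆:'瑞福' —— 改革带来福气。"
    }
  },
  "tenses": "v. reform, reformed, reformed, reforming; n. reform",
  "story": {
    "english": "The mayor promised to reform the old school system.",
    "chinese": "市长承诺改革陈旧的学校体制。"
  }
}
""")
print(sample["origin"]["mnemonic"]["associative"])
```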
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2025 Yaoyao Hu
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 

--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | from setuptools import setup, find_packages
 2 | from anki_packager import __version__
 3 | 
 4 | setup(
 5 |     name="apkger",
 6 |     version=__version__,
 7 |     author="Yaoyao Hu",
 8 |     author_email="shady030314@gmail.com",
 9 |     description="自动化 Anki 英语单词卡片牌组生成器",
10 |     long_description=open("README.md", encoding="utf-8").read(),
11 |     long_description_content_type="text/markdown",
12 |     url="https://github.com/yaoyhu/anki_packager",
13 |     packages=find_packages(
14 |         exclude=["*.pyc", "*.pyo", "__pycache__", "*.__pycache__*"]
15 |     ),
16 |     include_package_data=True,
17 |     classifiers=[
18 |         "Programming Language :: Python :: 3",
19 |         "Programming Language :: Python :: 3.9",
20 |         "License :: OSI Approved :: MIT License",
21 |         "Operating System :: OS Independent",
22 |         "Topic :: Education",
23 |         "Topic :: Text Processing :: Linguistic",
24 |         "Development Status :: 4 - Beta",
25 |     ],
26 |     python_requires=">=3.9",
27 |     install_requires=open("requirements.txt").read().splitlines(),
28 |     entry_points={
29 |         "console_scripts": [
30 |             # __main__.main 是同步包装函数(见 anki_packager/__main__.py)
31 |             "apkger=anki_packager.__main__:main",
32 |         ],
33 |     },
34 | )
35 | 
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | # Detect operating system
 2 | ifeq ($(OS),Windows_NT)
 3 |     SHELL := powershell.exe
 4 |     .SHELLFLAGS := -NoProfile -Command
 5 |     RM = Remove-Item -Force -Recurse
 6 |     FOLDER_SET = $$env:FOLDER="$$(Get-Location)"
 7 | else
 8 |     SHELL := /bin/bash
 9 |     RM = rm -rf
10 |     FOLDER_SET = export FOLDER=$$(pwd)
11 | endif
12 | 
13 | IMAGE_NAME = apkger
14 | CONTAINER_NAME = apkger
15 | VOLUME_NAME = apkger-dicts
16 | 
17 | .PHONY: build run shell clean help
18 | 
19 | # Build Docker image and create persistent volume
20 | build:
21 | 	docker build -t $(IMAGE_NAME) .
22 | 	docker volume create $(VOLUME_NAME)
23 | 
24 | run:
25 | 	docker run --rm \
26 | 		--name $(CONTAINER_NAME) \
27 | 		-v $(VOLUME_NAME):/app/dicts \
28 | 		$(IMAGE_NAME)
29 | 
30 | # Enter shell in container with volume mounted
31 | shell:
32 | 	docker run -it --rm \
33 | 		--name $(CONTAINER_NAME) \
34 | 		-v $(VOLUME_NAME):/app/dicts \
35 | 		-v $(shell pwd)/config:/app/config \
36 | 		-v $(shell pwd):/app \
37 | 		--entrypoint /bin/bash \
38 | 		$(IMAGE_NAME)
39 | 
40 | clean:
41 | 	-docker rmi $(IMAGE_NAME)
42 | 	-docker volume rm $(VOLUME_NAME)
43 | 
44 | help:
45 | 	@echo "Available targets:"
46 | 	@echo "  build  - Build Docker image and create persistent volume"
47 | 	@echo "  run    - Run container with mounted current directory"
48 | 	@echo "  shell  - Enter shell in container with volume mounted"
49 | 	@echo "  clean  - Remove container and image"
50 | 	@echo "  help   - Show this help message"
51 | 

--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | aiohappyeyeballs==2.6.1
 2 | aiohttp==3.12.15
 3 | aiosignal==1.4.0
 4 | annotated-types==0.7.0
 5 | anyio==4.11.0
 6 | attrs==25.3.0
 7 | beautifulsoup4==4.7.1
 8 | brotli==1.1.0
 9 | cached-property==2.0.1
10 | certifi==2025.8.3
11 | charset-normalizer==3.4.3
12 | chevron==0.14.0
13 | click==8.1.8
14 | colorama==0.4.6
15 | distro==1.9.0
16 | fastuuid==0.12.0
17 | filelock==3.19.1
18 | frozendict==2.4.6
19 | frozenlist==1.7.0
20 | fsspec==2025.9.0
21 | genanki==0.13.1
22 | gtts==2.5.3
23 | h11==0.16.0
24 | httpcore==1.0.9
25 | httpx==0.28.1
26 | huggingface-hub==0.35.1
27 | idna==3.10
28 | importlib-metadata==8.7.0
29 | inflate64==1.0.3
30 | jinja2==3.1.6
31 | jiter==0.11.0
32 | jsonschema==4.25.1
33 | jsonschema-specifications==2025.9.1
34 | litellm==1.77.3
35 | madoka==0.7.1
36 | markupsafe==3.0.2
37 | mdict-utils==1.3.14
38 | multidict==6.6.4
39 | multivolumefile==0.2.3
40 | openai==1.109.1
41 | packaging==25.0
42 | pondpond==1.4.1
43 | propcache==0.3.2
44 | psutil==7.1.0
45 | py7zr==0.22.0
46 | pybcj==1.0.6
47 | pycryptodomex==3.23.0
48 | pydantic==2.12.0a1
49 | pydantic-core==2.37.2
50 | pyppmd==1.1.1
51 | python-dotenv==1.1.1
52 | pyyaml==6.0.2
53 | pyzstd==0.17.0
54 | referencing==0.36.2
55 | regex==2025.9.18
56 | requests==2.32.5
57 | rpds-py==0.27.1
58 | sniffio==1.3.1
59 | soupsieve==2.8
60 | texttable==1.7.0
61 | tiktoken==0.11.0
62 | tokenizers==0.22.1
63 | tqdm==4.67.1
64 | typing-extensions==4.15.0
65 | typing-inspection==0.4.1
66 | urllib3==2.5.0
67 | xxhash==3.5.0
68 | yarl==1.20.1
69 | zipp==3.23.0
70 | socksio==1.0.0
71 | 
--------------------------------------------------------------------------------
/anki_packager/dict/eudic.py:
--------------------------------------------------------------------------------
 1 | from anki_packager.logger import logger
 2 | import aiohttp
 3 | 
 4 | # https://my.eudic.net/OpenAPI/doc_api_study#-studylistapi-getcategory
 5 | 
 6 | 
 7 | class EUDIC:
 8 |     def __init__(self, token: str, id: str):
 9 |         self.id = id
10 |         self.token = token
11 |         self.header = {
12 |             "Authorization": self.token,
13 |             "Content-Type": "application/json",
14 |             "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3",
15 |         }
16 |         self.studylist_url = (
17 |             "https://api.frdic.com/api/open/v1/studylist/category?language=en"
18 |         )
19 | 
20 |         self.words_url = "https://api.frdic.com/api/open/v1/studylist/words/"
21 | 
22 |     async def get_studylist(self):
23 |         async with aiohttp.request(
24 |             "GET", self.studylist_url, headers=self.header
25 |         ) as response:
26 |             self.check_token(response.status)
27 |             json = await response.json()
28 |             # show list id
29 |             for book in json["data"]:
30 |                 logger.info(f"id: {book['id']}, name: {book['name']}")
31 | 
32 |             return json
33 | 
34 |     async def get_words(self):
35 |         url = self.words_url + str(self.id) + "?language=en&category_id=0"
36 |         async with aiohttp.request("GET", url, headers=self.header) as response:
37 |             self.check_token(response.status)
38 |             json = await response.json()
39 |             return json
40 | 
41 |     def check_token(self, status_code: int):
42 |         if status_code != 200:
43 |             if status_code == 401:
44 |                 msg = "前往 https://my.eudic.net/OpenAPI/Authorization 获取 token 写入配置文件"
45 |                 logger.error(msg)
46 |                 exit(1)
47 |             else:
48 |                 msg = "检查填写的 ID 是否正确"
49 |                 logger.error(msg)
50 |                 exit(1)
51 | 

--------------------------------------------------------------------------------
/anki_packager/utils.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import platform
 3 | 
 4 | from anki_packager.logger import logger
 5 | 
 6 | 
 7 | def get_user_config_dir():
 8 |     """
 9 |     Returns the platform-specific user configuration directory.
10 | 
11 |     - Windows: %APPDATA%/anki_packager
12 |     - macOS/Linux: ~/.config/anki_packager
13 |     """
14 |     if platform.system() == "Windows":
15 |         return os.path.join(os.environ.get("APPDATA", ""), "anki_packager")
16 |     else:
17 |         return os.path.expanduser("~/.config/anki_packager")
18 | 
19 | 
20 | def initialize_config():
21 |     """
22 |     Make sure user config dir exists.
23 | 
24 |     Example:
25 |     ~/.config/anki_packager/
26 |     ├── config
27 |     │   ├── config.toml
28 |     │   ├── failed.txt
29 |     │   └── vocabulary.txt
30 |     └── dicts
31 |         ├── 单词释义比例词典-带词性.mdx
32 |         ├── 有道词语辨析.mdx
33 |         ├── stardict.7z
34 |         ├── stardict.csv
35 |         └── stardict.db
36 |     """
37 |     config_dir = get_user_config_dir()
38 |     os.makedirs(config_dir, exist_ok=True)
39 |     config_subdir = os.path.join(config_dir, "config")
40 |     os.makedirs(config_subdir, exist_ok=True)
41 |     dicts_dir = os.path.join(config_dir, "dicts")
42 |     os.makedirs(dicts_dir, exist_ok=True)
43 | 
44 |     # Default configuration in TOML format
45 |     default_config = """
46 | PROXY = ""
47 | EUDIC_TOKEN = ""
48 | EUDIC_ID = "0"
49 | DECK_NAME = "anki_packager"
50 | 
51 | [[MODEL_PARAM]]
52 | model = "gemini/gemini-2.5-flash"
53 | api_key = "GEMINI_API_KEY"
54 | rpm = 10 # 每分钟请求次数
55 | 
56 | # [[MODEL_PARAM]]
57 | # model = "openai/gpt-4o"
58 | # api_key = "OPENAI_API_KEY"
59 | # api_base = "YOUR_API_BASE"
60 | # rpm = 200
61 | 
62 | """
63 | 
64 |     config_path = os.path.join(config_subdir, "config.toml")
65 |     if not os.path.exists(config_path):
66 |         with open(config_path, "w", encoding="utf-8") as f:
67 |             f.write(default_config)
68 | 
69 |     vocab_path = os.path.join(config_subdir, "vocabulary.txt")
70 |     if not os.path.exists(vocab_path):
71 |         with open(vocab_path, "w", encoding="utf-8") as f:
72 |             f.write("")
73 | 
74 |     failed_path = os.path.join(config_subdir, "failed.txt")
75 |     if not os.path.exists(failed_path):
76 |         with open(failed_path, "w", encoding="utf-8") as f:
77 |             f.write("reform\nopen\n")
78 | 
79 |     logger.info(f"配置文件位于 {config_path}")
80 | 
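下面补充一个单独调用 eudic.py 封装的最小示意(文档外新增示例,非仓库文件):`get_studylist`/`get_words` 均为上文已定义的方法,返回值结构与 cli.py 中 `r["data"]` 的用法一致;token 取值为假设占位。

```python
# 示意:单独使用 EUDIC 封装拉取欧路词典生词本
# 假设:token 已按 README 指引获取;"0" 为默认生词本 ID
import asyncio

from anki_packager.dict.eudic import EUDIC


async def demo():
    eudic = EUDIC(token="YOUR_EUDIC_TOKEN", id="0")
    await eudic.get_studylist()        # 打印所有生词本的 id 与名称
    payload = await eudic.get_words()  # 单词列表位于 payload["data"]
    print([w["word"] for w in payload["data"][:3]])


asyncio.run(demo())
```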
--------------------------------------------------------------------------------
/anki_packager/ai.py:
--------------------------------------------------------------------------------
 1 | from typing import Dict
 2 | from litellm import Choices, Message
 3 | from litellm.router import Router
 4 | from litellm.files.main import ModelResponse
 5 | import json
 6 | from anki_packager.prompt import PROMPT
 7 | 
 8 | from pydantic import BaseModel, Field, ValidationError
 9 | 
10 | 
11 | class Mnemonic(BaseModel):
12 |     """助记法模型"""
13 | 
14 |     associative: str = Field(..., description="联想记忆法")
15 |     homophone: str = Field(..., description="谐音记忆法")
16 | 
17 | 
18 | class Origin(BaseModel):
19 |     """词源和助记模型"""
20 | 
21 |     etymology: str = Field(..., description="词源和文化内涵")
22 |     mnemonic: Mnemonic
23 | 
24 | 
25 | class Story(BaseModel):
26 |     """场景故事模型"""
27 | 
28 |     english: str = Field(..., description="英文场景故事")
29 |     chinese: str = Field(..., description="故事的中文翻译")
30 | 
31 | 
32 | # 最终的、最顶层的完整数据模型
33 | class WordExplanation(BaseModel):
34 |     """完整的单词解析数据模型"""
35 | 
36 |     word: str = Field(..., description="用户提供的单词")
37 |     origin: Origin
38 |     tenses: str = Field(..., description="单词的词形变化")
39 |     story: Story
40 | 
41 | 
42 | class llm:
43 |     def __init__(self, model_param: list):
44 |         model_list = [
45 |             {
46 |                 "model_name": "a",  # 为所有模型统一使用别名 "a"
47 |                 "litellm_params": param,
48 |             }
49 |             for param in model_param
50 |         ]
51 |         self.router = Router(model_list)
52 | 
53 |     async def explain(self, word: str) -> Dict:
54 |         try:
55 |             response = await self.router.acompletion(
56 |                 model="a",
57 |                 messages=[
58 |                     {"role": "system", "content": PROMPT},
59 |                     {"role": "user", "content": word},
60 |                 ],
61 |                 temperature=0.3,
62 |                 max_tokens=500,
63 |                 response_format={"type": "json_object"},
64 |             )
65 |             if isinstance(response, ModelResponse):
66 |                 if isinstance(response.choices, list) and response.choices:
67 |                     first_choice = response.choices[0]
68 |                     if (
69 |                         isinstance(first_choice, Choices)
70 |                         and isinstance(first_choice.message, Message)
71 |                         and isinstance(first_choice.message.content, str)
72 |                     ):
73 |                         result_str = first_choice.message.content
74 | 
75 |                         # 去掉可能包裹的 ```json 代码块。strip() 按字符集合删除,
76 |                         # 可能误删首尾的合法字符,改用 removeprefix/removesuffix
77 |                         if result_str.startswith("```json"):
78 |                             result_str = (
79 |                                 result_str.removeprefix("```json")
80 |                                 .removesuffix("```")
81 |                                 .strip()
82 |                             )
83 | 
84 |                         # 1. 将字符串解析为 Python 字典
85 |                         data = json.loads(result_str)
86 | 
87 |                         # 2. 使用 WordExplanation 模型进行验证和解析
88 |                         validated_data = WordExplanation.model_validate(data)
89 | 
90 |                         return validated_data.model_dump()
91 | 
92 |         except json.JSONDecodeError as e:
93 |             raise json.JSONDecodeError(
94 |                 f"Failed to parse JSON for '{word}': {e}. Raw response: '{result_str[:150]}...'",
95 |                 result_str,
96 |                 e.pos,
97 |             )
98 |         except ValidationError as e:
99 |             # pydantic 的 ValidationError 不能直接用字符串构造,
100 |             # 这里改抛 ValueError 并用 from 保留原始异常链
101 |             raise ValueError(
102 |                 f"JSON structure validation failed for '{word}': {e}"
103 |             ) from e
104 |         except Exception as e:
105 |             raise Exception(f"An unexpected error occurred for '{word}': {e}")
106 | 

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | 

3 |

4 | Logo 5 |
anki_packager 6 |

7 |

8 | 自动化 Anki 英语单词高质量卡片牌组生成工具 9 |
10 | 关于项目 11 | · 12 | 使用指南 13 | · 14 | 开发计划 15 | · 16 | 致谢 17 |

18 |
 19 | 
 20 | ## 关于项目
 21 | 
 22 | `anki_packager` 是一款自动化的 Anki 单词卡片生成工具,能够自动创建高质量的 `.apkg` 牌组。本项目致力于为英语学习者提供一个高效、智能的记忆辅助工具。
 23 | 
 24 | ### 核心特性
 25 | 
 26 | - 多源精选词典整合:[ECDICT](https://github.com/skywind3000/ECDICT)、[《有道词语辨析》加强版](https://skywind.me/blog/archives/2941)、[单词释义比例词典](https://skywind.me/blog/archives/2938)
 27 | - 智能化学习体验:
 28 |   - 自动抓取有道词典优质例句和常用短语
 29 |   - 支持谷歌 TTS 发音、中英双解、考纲标记等功能
 30 |   - 支持流行 AI 模型(需要 API-KEY)对单词进行总结、助记及情境故事生成
 31 | - 便捷的数据导入:支持欧路词典生词本一键导入并批量处理单词列表,自动生成卡片
 32 | - 优良的命令行体验:显示处理进度,支持记录错误、支持丰富的命令行参数
 33 | - 支持 Docker 运行、支持 PyPI 安装
 34 | 
 35 | ### 卡片预览
 36 | 
 37 | 每张单词卡片包含丰富的学习资源,结构清晰,内容全面:
 38 | 
 39 | - 正面:词头、发音、音标 + 考试大纲标签(如 中高考、CET4、CET6、GRE 等)
 40 | - 背面:
 41 |   - 释义:中文(ECDICT)、时态(AI)、释义和词性比例([单词释义比例词典-带词性](https://mdx.mdict.org/按词典语种来分类/词频/单词释义比例词典/单词释义比例词典-带词性.mdx))
 42 |   - AI 生成词根 + 辅助记忆(联想记忆 + 谐音记忆)
 43 |   - 短语 + 例句(有道爬虫)
 44 |   - 单词辨析([《有道词语辨析》加强版](https://pan.baidu.com/s/1gff2tdp))
 45 |   - 英文释义(目前来自 ECDICT)+ AI 生成故事
 46 | 
 47 | ![背面](images/卡片预览.png)
 48 | 
 49 | ## 使用
 50 | 
 51 | ### 快速开始
 52 | 
 53 | ```bash
 54 | # 直接使用 pip 安装
 55 | pip install apkger
 56 | ```
 57 | 
 58 | 在使用 apkger 之前,你需要先在 `config/config.toml` 文件中填写相关配置信息:
 59 | 
 60 | 本项目使用 [litellm](https://github.com/BerriAI/litellm) 统一调用 LLM 服务。关于 `MODEL_PARAM` 的详细配置方法,请参考 [LiteLLM Providers 文档](https://docs.litellm.ai/docs/providers)。
 61 | 
 62 | ```toml
 63 | PROXY = ""
 64 | EUDIC_TOKEN = ""
 65 | EUDIC_ID = "0"
 66 | DECK_NAME = "anki_packager"
 67 | 
 68 | [[MODEL_PARAM]]
 69 | model = "gemini/gemini-2.5-flash"
 70 | api_key = "GEMINI_API_KEY"
 71 | rpm = 10 # 每分钟请求次数
 72 | 
 73 | ### OpenAI-Compatible Endpoints 示例
 74 | # [[MODEL_PARAM]]
 75 | # model = "openai/gpt-4o"
 76 | # api_key = "OPENAI_API_KEY"
 77 | # api_base = "YOUR_API_BASE"
 78 | # rpm = 200
 79 | ```
 80 | 
 81 | 下面是关于配置文件中各参数的详细说明:
 82 | 
 83 | - `MODEL_PARAM`:
 84 |   - `model`: Provider Route on LiteLLM + Model ID
 85 |   - `api_key`: 对应模型的 API 密钥。
 86 |   - `api_base`: (可选) 仅在模型为 OpenAI-Compatible Endpoints 时需要填写
 87 |   - `rpm`: (可选) 每分钟的请求次数限制,用于控制 API 调用频率。
 88 | - `PROXY`: 如果你无法直接连接到 AI 服务提供商,可以在这里设置代理服务器地址
 89 | 
 90 | - 如果需要使用欧路词典生词本:先按照[欧路官方说明](https://my.eudic.net/OpenAPI/Authorization)获取 TOKEN,然后使用 `apkger --eudicid` 查看生词本 ID 并写入配置文件
 91 | 
 92 | ### 下载字典
 93 | 
 94 | 下载字典到配置目录中(注意文件名不要错):
 95 | 
 96 | - Linux/MacOS: `~/.config/anki_packager/dicts/`
 97 | - Windows: `C:\Users\<用户名>\AppData\Roaming\anki_packager\dicts\`
 98 | 
 99 | 字典数据(感谢 [skywind](https://github.com/skywind3000))下载地址:
100 | 
101 | - [stardict.7z](https://github.com/skywind3000/ECDICT/raw/refs/heads/master/stardict.7z)
102 | - [单词释义比例词典-带词性](https://mdx.mdict.org/按词典语种来分类/词频/单词释义比例词典/单词释义比例词典-带词性.mdx)
103 | - [有道词语辨析](https://pan.baidu.com/s/1gff2tdp):**需要手动解压**放入 `config/dicts`
104 | 
105 | 字典下载完毕后,解压和处理交给 anki_packager 即可。
106 | 
107 | ### 运行
108 | 
109 | 目前软件没有 UI 界面,只支持命令行运行,下面给出一些参考:
110 | 
111 | ```bash
112 | # 查看帮助信息
113 | apkger -h
114 | 
115 | # 从默认生词本读词生成卡片
116 | apkger
117 | 
118 | ### 关闭 AI 功能
119 | apkger --disable_ai
120 | 
121 | ### 从欧路词典生词本导出单词,生成卡片(需要先配置)
122 | ## 先查看 ID 写入配置文件
123 | apkger --eudicid
124 | ## 生成卡片
125 | apkger --eudic
126 | ```
127 | 
128 | 
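除上述常用命令外,cli.py 中还提供了两个参数可作补充(`demo.txt` 为示例文件名,来自 cli.py 注释中的用法示意):

```bash
# 将单个生词追加进默认生词本 vocabulary.txt
apkger --word serendipity

# 从自定义单词列表文件生成卡片(每行一个单词)
apkger --txt demo.txt
```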
129 | 方式一:Conda 环境 130 | 131 | ```bash 132 | # 创建并激活一个名为 apkg 的 Python 3.9 虚拟环境 133 | conda create -n apkg python=3.9 134 | conda activate apkg 135 | 136 | # 安装项目依赖 137 | pip install -r requirements.txt 138 | 139 | # 查看帮助信息 140 | python -m anki_packager -h 141 | 142 | # 从欧路词典生词本导出单词,生成卡片(需要先配置) 143 | python -m anki_packager --eudic 144 | 145 | # 关闭 AI 功能 146 | python -m anki_packager --disable_ai 147 | 148 | # 从生词本读词生成卡片 149 | python -m anki_packager 150 | ``` 151 | 152 |
153 | 154 |
155 | 方式二:Docker 容器 156 | 157 | 如果你希望避免污染本地环境,可以使用 Docker 运行 anki_packager,可以配合 `Makefile` 使用: 158 | 159 | ```shell 160 | # 构建 Docker 镜像 和 创建持久化卷 161 | make build 162 | 163 | # 第一次运行容器下载词典(需要一点时间) 164 | make run 165 | 166 | # 进入容器(注意!需要在主机先配置 config/config.toml) 167 | # 在容器中运行 anki_packager,生成的牌组会保存在当前目录中 168 | make shell 169 | ``` 170 | 171 |
172 | 
173 | ## TODO
174 | 
175 | - [x] ~~集成单词释义比例词典~~
176 | - [x] ~~进一步优化单词卡片 UI~~
177 | - [x] ~~从欧路词典导入生词~~
178 | - [x] ~~支持 SiliconFlow、Gemini~~
179 | - [x] ~~重新支持 Docker~~
180 | - [x] ~~发布到 PyPI~~
181 | - [x] ~~训练现成的数据包发布 release~~ @Initsnow
182 | - [ ] 支持更多软件生词导出
183 | - [ ] 支持 Longman 词典
184 | - [ ] 开发 GUI
185 | 
186 | ## Thanks
187 | 
188 | 本项目得到了众多开源项目和社区的支持:
189 | 
190 | - 感谢 [skywind](https://github.com/skywind3000) 开源的 [ECDICT](https://github.com/skywind3000/ECDICT) 以及其他词典项目,为本项目提供了丰富的词典资源。
191 | - 感谢 [yihong0618](https://github.com/yihong0618) 开源的众多优秀 Python 项目,从中获益良多。
192 | 
193 | ---
194 | 
195 | 

如果这个项目对你有帮助,欢迎 Star ⭐️

196 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.apkg 2 | 3 | # Byte-compiled / optimized / DLL files 4 | __pycache__/ 5 | *.py[cod] 6 | *$py.class 7 | 8 | # C extensions 9 | *.so 10 | 11 | # Distribution / packaging 12 | .Python 13 | build/ 14 | develop-eggs/ 15 | dist/ 16 | downloads/ 17 | eggs/ 18 | .eggs/ 19 | lib/ 20 | lib64/ 21 | parts/ 22 | sdist/ 23 | var/ 24 | wheels/ 25 | share/python-wheels/ 26 | *.egg-info/ 27 | .installed.cfg 28 | *.egg 29 | MANIFEST 30 | 31 | # PyInstaller 32 | # Usually these files are written by a python script from a template 33 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 34 | *.manifest 35 | *.spec 36 | 37 | # Installer logs 38 | pip-log.txt 39 | pip-delete-this-directory.txt 40 | 41 | # Unit test / coverage reports 42 | htmlcov/ 43 | .tox/ 44 | .nox/ 45 | .coverage 46 | .coverage.* 47 | .cache 48 | nosetests.xml 49 | coverage.xml 50 | *.cover 51 | *.py,cover 52 | .hypothesis/ 53 | .pytest_cache/ 54 | cover/ 55 | 56 | # Translations 57 | *.mo 58 | *.pot 59 | 60 | # Django stuff: 61 | *.log 62 | local_settings.py 63 | db.sqlite3 64 | db.sqlite3-journal 65 | 66 | # Flask stuff: 67 | instance/ 68 | .webassets-cache 69 | 70 | # Scrapy stuff: 71 | .scrapy 72 | 73 | # Sphinx documentation 74 | docs/_build/ 75 | 76 | # PyBuilder 77 | .pybuilder/ 78 | target/ 79 | 80 | # Jupyter Notebook 81 | .ipynb_checkpoints 82 | 83 | # IPython 84 | profile_default/ 85 | ipython_config.py 86 | 87 | # pyenv 88 | # For a library or package, you might want to ignore these files since the code is 89 | # intended to run in multiple environments; otherwise, check them in: 90 | # .python-version 91 | 92 | # pipenv 93 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 94 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 95 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 96 | # install all needed dependencies. 97 | #Pipfile.lock 98 | 99 | # UV 100 | # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control. 101 | # This is especially recommended for binary packages to ensure reproducibility, and is more 102 | # commonly ignored for libraries. 103 | #uv.lock 104 | 105 | # poetry 106 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 107 | # This is especially recommended for binary packages to ensure reproducibility, and is more 108 | # commonly ignored for libraries. 109 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 110 | #poetry.lock 111 | 112 | # pdm 113 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 114 | #pdm.lock 115 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 116 | # in version control. 117 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control 118 | .pdm.toml 119 | .pdm-python 120 | .pdm-build/ 121 | 122 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 123 | __pypackages__/ 124 | 125 | # Celery stuff 126 | celerybeat-schedule 127 | celerybeat.pid 128 | 129 | # SageMath parsed files 130 | *.sage.py 131 | 132 | # Environments 133 | .env 134 | .venv 135 | env/ 136 | venv/ 137 | ENV/ 138 | env.bak/ 139 | venv.bak/ 140 | 141 | # Spyder project settings 142 | .spyderproject 143 | .spyproject 144 | 145 | # Rope project settings 146 | .ropeproject 147 | 148 | # mkdocs documentation 149 | /site 150 | 151 | # mypy 152 | .mypy_cache/ 153 | .dmypy.json 154 | dmypy.json 155 | 156 | # Pyre type checker 157 | .pyre/ 158 | 159 | # pytype static type analyzer 160 | .pytype/ 161 | 162 | # Cython debug symbols 163 | cython_debug/ 164 | 165 | # PyCharm 166 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 167 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 168 | # and can be added to the global gitignore or merged into this file. For a more nuclear 169 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 170 | #.idea/ 171 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider 172 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 173 | 174 | # User-specific stuff 175 | .idea/**/workspace.xml 176 | .idea/**/tasks.xml 177 | .idea/**/usage.statistics.xml 178 | .idea/**/dictionaries 179 | .idea/**/shelf 180 | 181 | # AWS User-specific 182 | .idea/**/aws.xml 183 | 184 | # Generated files 185 | .idea/**/contentModel.xml 186 | 187 | # Sensitive or high-churn files 188 | .idea/**/dataSources/ 189 | .idea/**/dataSources.ids 190 | .idea/**/dataSources.local.xml 191 | .idea/**/sqlDataSources.xml 192 | .idea/**/dynamic.xml 193 | .idea/**/uiDesigner.xml 194 | .idea/**/dbnavigator.xml 195 | 196 | # Gradle 197 | .idea/**/gradle.xml 198 | .idea/**/libraries 199 | 200 | # Gradle and Maven with auto-import 201 | # When using Gradle or Maven with auto-import, you should exclude module files, 202 | # since they will be recreated, and may cause churn. Uncomment if using 203 | # auto-import. 
204 | # .idea/artifacts 205 | # .idea/compiler.xml 206 | # .idea/jarRepositories.xml 207 | # .idea/modules.xml 208 | # .idea/*.iml 209 | # .idea/modules 210 | # *.iml 211 | # *.ipr 212 | 213 | # CMake 214 | cmake-build-*/ 215 | 216 | # Mongo Explorer plugin 217 | .idea/**/mongoSettings.xml 218 | 219 | # File-based project format 220 | *.iws 221 | 222 | # IntelliJ 223 | out/ 224 | 225 | # mpeltonen/sbt-idea plugin 226 | .idea_modules/ 227 | 228 | # JIRA plugin 229 | atlassian-ide-plugin.xml 230 | 231 | # Cursive Clojure plugin 232 | .idea/replstate.xml 233 | 234 | # SonarLint plugin 235 | .idea/sonarlint/ 236 | 237 | # Crashlytics plugin (for Android Studio and IntelliJ) 238 | com_crashlytics_export_strings.xml 239 | crashlytics.properties 240 | crashlytics-build.properties 241 | fabric.properties 242 | 243 | # Editor-based Rest Client 244 | .idea/httpRequests 245 | 246 | # Android studio 3.1+ serialized cache file 247 | .idea/caches/build_file_checksums.ser 248 | 249 | # PyPI configuration file 250 | .pypirc 251 | 252 | .vscode 253 | .vscode/* 254 | !.vscode/settings.json 255 | !.vscode/tasks.json 256 | !.vscode/launch.json 257 | !.vscode/extensions.json 258 | !.vscode/*.code-snippets 259 | 260 | # Local History for Visual Studio Code 261 | .history/ 262 | 263 | # Built Visual Studio Code Extensions 264 | *.vsix 265 | 266 | # macOS shit 267 | .DS_Store -------------------------------------------------------------------------------- /anki_packager/dict/ecdict.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sqlite3 3 | 4 | from anki_packager.logger import logger 5 | from anki_packager.utils import get_user_config_dir 6 | 7 | from anki_packager.dict import stardict 8 | 9 | # https://github.com/liuyug/mdict-utils 10 | from mdict_utils.reader import query 11 | from mdict_utils.utils import ElapsedTimer 12 | 13 | 14 | class Ecdict: 15 | def __init__(self): 16 | self.config_dir = get_user_config_dir() 17 | self.dicts_dir = os.path.join(self.config_dir, "dicts") 18 | # keep the package archive small 19 | self.seven_zip = os.path.join(self.dicts_dir, "stardict.7z") 20 | self.csv = os.path.join(self.dicts_dir, "stardict.csv") 21 | self.sqlite = os.path.join(self.dicts_dir, "stardict.db") 22 | self._convert() 23 | self.conn = sqlite3.connect(self.sqlite) 24 | self.cursor = self.conn.cursor() 25 | self.sd = stardict.StarDict(self.sqlite, False) 26 | 27 | def __del__(self): 28 | if hasattr(self, "conn"): 29 | self.cursor.close() 30 | self.conn.close() 31 | 32 | def _convert(self): 33 | if not os.path.exists(self.csv): 34 | # unzip stardict.csv in 7zip 35 | if not os.path.exists(self.seven_zip): 36 | raise FileNotFoundError(f"{self.seven_zip} 未找到!") 37 | 38 | import py7zr 39 | 40 | logger.info("首次使用: 正在解压词典到 anki_packager/dicts/stardict.csv") 41 | ar = py7zr.SevenZipFile(self.seven_zip, mode="r") 42 | ar.extractall(path=self.dicts_dir) 43 | ar.close() 44 | 45 | if not os.path.exists(self.sqlite): 46 | logger.info( 47 | "耐心等待(790M): 正在转换数据库 anki_packager/dicts/stardict.db" 48 | ) 49 | stardict.convert_dict(self.sqlite, self.csv) 50 | 51 | async def ret_word(self, word): 52 | """Return ECDICT data 53 | dict: 包含以下 ECDICT 数据字段的字典: 54 | - word: 单词名称 55 | - phonetic: 音标,以英语英标为主 56 | - definition: 单词释义(英文),每行一个释义 57 | - translation: 单词释义(中文),每行一个释义 58 | - pos: 词语位置,用 "/" 分割不同位置 59 | - collins: 柯林斯星级 60 | - oxford: 是否是牛津三千核心词汇 61 | - tag: 字符串标签: zk/中考, gk/高考, cet4/四级 等等标签,空格分割 62 | - bnc: 英国国家语料库词频顺序 63 | - frq: 当代语料库词频顺序 64 | - exchange: 时态复数等变换,使用 
"/" 分割不同项目 65 | - detail: json 扩展信息,字典形式保存例句(待添加) 66 | - audio: 读音音频 url (待添加) 67 | """ 68 | data = self.sd.query(word) 69 | 70 | # 考纲标签 71 | data = self.parse_tag(data) 72 | # 释义分布 73 | data = self.get_distribution(data) 74 | # 词语辨析 75 | data = self.get_diffrentiation(data) 76 | return data 77 | 78 | def get_distribution(self, data): 79 | """ 80 | Get word distribution from mdx dictionary 81 | """ 82 | with ElapsedTimer(verbose=False): 83 | mdx_path = os.path.join( 84 | get_user_config_dir(), 85 | "dicts", 86 | "单词释义比例词典-带词性.mdx", 87 | ) 88 | record = query(mdx_path, data["word"]) 89 | if record: 90 | data["distribution"] = record 91 | return data 92 | 93 | def get_diffrentiation(self, data): 94 | """[《有道词语辨析》加强版](https://skywind.me/blog/archives/2941)""" 95 | with ElapsedTimer(verbose=False): 96 | mdx_path = os.path.join(get_user_config_dir(), "dicts", "有道词语辨析.mdx") 97 | record = query(mdx_path, data["word"]) 98 | if record: 99 | data["diffrentiation"] = record 100 | return data 101 | 102 | def definition_newline(self, data): 103 | """Add newline to definition for each part-of-speech 104 | 105 | Demo: 106 | Input: data["definition"] = "n. 词义1 v. 词义2" 107 | Output: data["definition"] = "n. 词义1
v. 词义2" 108 | 109 | """ 110 | definition = data.get("definition", "") 111 | if not definition: 112 | return data 113 | 114 | # Split on part of speech markers (like "n.", "v.", etc.) 115 | parts = [] 116 | current = "" 117 | words = definition.split() 118 | 119 | for word in words: 120 | if len(word) >= 2 and word.endswith(".") and word[0].isalpha(): 121 | if current: 122 | parts.append(current.strip()) 123 | current = word 124 | else: 125 | current += " " + word 126 | 127 | if current: 128 | parts.append(current.strip()) 129 | 130 | data["definition"] = "
".join(parts) 131 | return data 132 | 133 | def parse_tag(self, data): 134 | """parse tag infomation and update data dict 135 | Demo: 136 | Input: data["tag"] = "zk gk cet4 cet6 ky ielts toefl" 137 | Output: data["tag"] = "中考 高考 四级 六级 考研 雅思 托福" 138 | """ 139 | text = data.get("tag", "") 140 | if not text: 141 | return data 142 | 143 | tag_map = { 144 | "zk": "中考", 145 | "gk": "高考", 146 | "cet4": "四级", 147 | "cet6": "六级", 148 | "ky": "考研", 149 | "ielts": "雅思", 150 | "toefl": "托福", 151 | "gre": "GRE", 152 | } 153 | 154 | tags: str = text.split() 155 | result = [tag_map.get(tag, tag) for tag in tags] 156 | data["tag"] = " ".join(result) 157 | return data 158 | 159 | def parse_exchange(self, data): 160 | """parse exchange information and update data dict 161 | 162 | Demo: 163 | Input: data["exchange"] = "s:tests/d:tested/i:testing/p:tested/3:tests" 164 | Output: data["exchange"] = "复数:tests 过去式:tested 过去分词:tested 现在分词:testing 三单:tests" 165 | """ 166 | text = data.get("exchange", "") 167 | if not text: 168 | return data 169 | 170 | exchange_map = { 171 | "s": "复数", 172 | "d": "过去式", 173 | "p": "过去分词", 174 | "i": "现在分词", 175 | "3": "三单", 176 | "r": "比较级", 177 | "t": "最高级", 178 | "0": "原型", 179 | "1": "第一人称单数", 180 | } 181 | 182 | result = [] 183 | for item in text.split("/"): 184 | if ":" in item: 185 | key, value = item.split(":") 186 | if key in exchange_map: 187 | result.append(f"{exchange_map[key]}: {value}") 188 | 189 | data["exchange"] = " ".join(result) 190 | return data 191 | -------------------------------------------------------------------------------- /anki_packager/dict/youdao.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import os 3 | import re 4 | import shutil 5 | import tempfile 6 | import aiohttp 7 | from gtts import gTTS 8 | from bs4 import BeautifulSoup 9 | from typing import Dict, Optional 10 | from anki_packager.logger import logger 11 | 12 | 13 | class YoudaoScraper: 14 | def __init__(self): 15 | self.base_url = "https://m.youdao.com/result" 16 | self.tmp = tempfile.mkdtemp() 17 | 18 | async def __aenter__(self): 19 | """进入 async with 时被调用""" 20 | self._session = aiohttp.ClientSession( 21 | headers={ 22 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3" 23 | } 24 | ) 25 | return self # 返回实例本身 26 | 27 | async def __aexit__(self, exc_type, exc_val, exc_tb): 28 | """离开 async with 时被调用,确保 Session 被关闭""" 29 | await self._session.close() 30 | try: 31 | self._clean_temp_dir() 32 | except Exception as e: 33 | logger.error(f"Error cleaning up audio files: {e}") 34 | 35 | async def _get_audio(self, word: str): 36 | """return the filename of the audio and the temp directory that needs to be cleaned up""" 37 | filename = os.path.join(self.tmp, f"{word}.mp3") 38 | loop = asyncio.get_running_loop() 39 | 40 | def generate_and_save_audio(): 41 | """A wrapper function for the blocking gTTS calls.""" 42 | tts = gTTS(text=word, lang="en") 43 | tts.save(filename) 44 | 45 | await loop.run_in_executor(None, generate_and_save_audio) 46 | 47 | return filename 48 | 49 | def _clean_temp_dir(self): 50 | """Clean up a temporary directory and its contents.""" 51 | try: 52 | if os.path.exists(self.tmp): 53 | shutil.rmtree(self.tmp) 54 | logger.info(f"音频临时文件夹已清理: {self.tmp}") 55 | except Exception as e: 56 | logger.error(f"音频临时文件夹 {self.tmp} 清理失败: {e}") 57 | 58 | async def get_word_info(self, word: str) -> Optional[Dict]: 59 | try: 60 | params = {"word": word, 
"lang": "en"} 61 | 62 | async with self._session.get(self.base_url, params=params) as response: 63 | response.raise_for_status() 64 | r_text = await response.text() 65 | soup = BeautifulSoup(r_text, "html.parser") 66 | 67 | result = { 68 | "word": word, 69 | "example_phrases": [], 70 | "example_sentences": [], 71 | } 72 | 73 | 74 | all_uls = soup.find_all("ul", class_="") 75 | # Extract example phrases 76 | if len(all_uls) > 0: 77 | phrase_ul = all_uls[0] 78 | if phrase_ul: 79 | phrase_lis = phrase_ul.find_all("li", class_="mcols-layout") 80 | for li in phrase_lis: 81 | index = ( 82 | li.find("span", class_="grey").text.strip() 83 | if li.find("span", class_="grey") 84 | else None 85 | ) 86 | col2_element = li.find("div", class_="col2") 87 | point_element = col2_element.find("a", class_="point") 88 | sen_phrase_element = col2_element.find("p", class_="sen-phrase") 89 | english = None 90 | chinese = None 91 | if point_element and sen_phrase_element: 92 | english = point_element.text.strip() 93 | chinese = sen_phrase_element.text.strip() 94 | else: 95 | content = col2_element.text.strip() 96 | parts = re.split(r"([;;])", content) 97 | parts = [ 98 | s.strip() 99 | for s in parts 100 | if s.strip() and s not in [";", ";"] 101 | ] 102 | if len(parts) > 1: 103 | english = parts[0] 104 | chinese = "".join(parts[1:]) 105 | else: 106 | english = content 107 | 108 | result["example_phrases"].append( 109 | { 110 | "index": index, 111 | "english": english, 112 | "chinese": chinese, 113 | } 114 | ) 115 | 116 | # Extract example sentences 117 | if len(all_uls) > 1: 118 | sentence_ul = all_uls[1] 119 | if sentence_ul: 120 | sentence_lis = sentence_ul.find_all("li", class_="mcols-layout") 121 | for li in sentence_lis: 122 | index = ( 123 | li.find("span", class_="grey index").text.strip() 124 | if li.find("span", class_="grey index") 125 | else None 126 | ) 127 | english_element = li.find("div", class_="sen-eng") 128 | chinese_element = li.find("div", class_="sen-ch") 129 | source_element = li.find("div", class_="secondary") 130 | 131 | english = english_element.text.strip() if english_element else None 132 | chinese = chinese_element.text.strip() if chinese_element else None 133 | source = source_element.text.strip() if source_element else None 134 | 135 | result["example_sentences"].append( 136 | { 137 | "index": index, 138 | "english": english, 139 | "chinese": chinese, 140 | "source": source, 141 | } 142 | ) 143 | 144 | return result 145 | 146 | except aiohttp.ClientError as e: 147 | logger.error(f"Request error: {e}") 148 | return None 149 | except Exception as e: 150 | logger.error(f"An error occurred: {e}") 151 | return None 152 | 153 | 154 | if __name__ == "__main__": 155 | async def main(): 156 | async with YoudaoScraper() as youdao: 157 | result = asyncio.run(youdao.get_word_info("variable")) 158 | print(result) 159 | asyncio.run(main()) 160 | -------------------------------------------------------------------------------- /anki_packager/packager/deck.py: -------------------------------------------------------------------------------- 1 | import genanki 2 | import random 3 | 4 | 5 | class AnkiDeckCreator: 6 | def __init__(self, deck_name: str): 7 | self.added = False 8 | self.deck_name = deck_name 9 | self.deck_id = random.randrange(1 << 30, 1 << 31) 10 | self.model_id = random.randrange(1 << 30, 1 << 31) 11 | self.deck = genanki.Deck(self.deck_id, deck_name) 12 | self.model = genanki.Model( 13 | self.model_id, 14 | "Anki Packager", 15 | fields=[ 16 | {"name": "Word"}, # 词头 17 | {"name": 
"Pronunciation"}, # 读音 18 | {"name": "Phonetic_Symbols"}, # 音标 19 | {"name": "Examination_Syllabus"}, # 考试大纲 20 | {"name": "ECDict"}, # Ecdict 中文解释 21 | {"name": "Longman"}, # Longman 22 | {"name": "Youdao"}, # 有道词典示例短语和句子 23 | {"name": "Etymology_AI"}, # 词源 24 | {"name": "Associative_Mnemonic_AI"}, # 联想助记 25 | {"name": "Homophone_Mnemonic_AI"}, # 谐音助记 26 | {"name": "Discrimination"}, # 辨析 27 | {"name": "Story"}, # 故事 28 | ], 29 | templates=[ 30 | { 31 | "name": "Dictionary Card", 32 | "qfmt": """ 33 |
34 |
35 |
{{Word}}
36 |
[{{Phonetic_Symbols}}] ({{Examination_Syllabus}})
37 |
[{{Pronunciation}}]
38 |
39 |
40 | """, 41 | "afmt": """ 42 | {{FrontSide}} 43 |
44 |
45 |
{{ECDict}}
46 |
47 |
48 |
{{Etymology_AI}}
49 |
{{Associative_Mnemonic_AI}}
50 |
{{Homophone_Mnemonic_AI}}
51 |
52 |
53 |
{{Youdao}}
54 |
55 |
{{Discrimination}}
56 |
57 |
{{Longman}}
58 |
59 |
{{Story}}
60 |
61 | """, 62 | } 63 | ], 64 | css=""" 65 | /* Color scheme variables */ 66 | :root { 67 | /* Light mode (default) colors */ 68 | --bg-color: #ffffff; 69 | --text-color: #333333; 70 | --secondary-text: #666666; 71 | --tertiary-text: #2F4F4F; 72 | --highlight-color: #0645AD; 73 | --accent-color: #990000; 74 | --divider-color: #99a; 75 | --pos-color: #990000; 76 | --cn-text-color: #8B008B; 77 | --phrase-color: #8B4513; 78 | } 79 | 80 | /* Dark mode colors */ 81 | @media (prefers-color-scheme: dark) { 82 | .card { 83 | --bg-color: #1e1e2e; 84 | --text-color: #e0e0e0; 85 | --secondary-text: #b0b0b0; 86 | --tertiary-text: #a0c0c0; 87 | --highlight-color: #7cb8ff; 88 | --accent-color: #ff7c7c; 89 | --divider-color: #666; 90 | --pos-color: #ff9e64; 91 | --cn-text-color: #d183e8; 92 | --phrase-color: #e0c080; 93 | } 94 | } 95 | 96 | /* Night mode in Anki also triggers dark mode */ 97 | .nightMode { 98 | --bg-color: #1e1e2e; 99 | --text-color: #e0e0e0; 100 | --secondary-text: #b0b0b0; 101 | --tertiary-text: #a0c0c0; 102 | --highlight-color: #7cb8ff; 103 | --accent-color: #ff7c7c; 104 | --divider-color: #666; 105 | --pos-color: #ff9e64; 106 | --cn-text-color: #d183e8; 107 | --phrase-color: #e0c080; 108 | } 109 | 110 | .card { 111 | font-family: Arial, sans-serif; 112 | text-align: left; 113 | padding: 20px; 114 | max-width: 800px; 115 | margin: auto; 116 | background-color: var(--bg-color); 117 | color: var(--text-color); 118 | line-height: 1.6; 119 | } 120 | 121 | /* 虚线分隔符 */ 122 | .dashed { 123 | border: none; 124 | border-top: 1px dashed var(--divider-color); 125 | margin: 15px 0; 126 | width: 100%; 127 | } 128 | 129 | /* Front side */ 130 | .card-front { 131 | margin-bottom: 20px; 132 | } 133 | 134 | /* Centered header section */ 135 | .header-center { 136 | text-align: center; 137 | margin-bottom: 20px; 138 | } 139 | 140 | .word { 141 | font-size: 2.2em; 142 | font-weight: bold; 143 | color: var(--text-color); 144 | margin-bottom: 5px; 145 | } 146 | 147 | .pronunciation { 148 | font-size: 1.1em; 149 | color: var(--highlight-color); 150 | margin-bottom: 10px; 151 | } 152 | 153 | .front { 154 | color: var(--secondary-text); 155 | margin-bottom: 15px; 156 | font-size: 0.90em; 157 | } 158 | 159 | .phonetic_symbols { 160 | color: blue; 161 | } 162 | 163 | /* Back side */ 164 | .card-back { 165 | margin-top: 20px; 166 | } 167 | 168 | .ecdict { 169 | margin: 15px 0; 170 | text-align: center; 171 | } 172 | 173 | .longman { 174 | margin: 15px 0; 175 | } 176 | 177 | .examples { 178 | color: var(--tertiary-text); 179 | margin: 15px 0; 180 | } 181 | 182 | .examples em { 183 | color: var(--highlight-color); 184 | font-style: normal; 185 | font-weight: bold; 186 | } 187 | 188 | .ai { 189 | color: var(--secondary-text); 190 | margin: 15px 0; 191 | } 192 | 193 | .discrimination { 194 | color: var(--text-color); 195 | margin: 15px 0; 196 | } 197 | 198 | /* Example sentences */ 199 | .example { 200 | color: var(--tertiary-text); 201 | margin-left: 20px; 202 | margin-bottom: 10px; 203 | } 204 | 205 | /* Chinese text */ 206 | .chinese { 207 | color: var(--secondary-text); 208 | margin-left: 20px; 209 | } 210 | """, 211 | ) 212 | 213 | def format_pos(self, text: str) -> str: 214 | """Format definition with line breaks between parts of speech""" 215 | if not text: 216 | return "" 217 | 218 | parts = [] 219 | current = [] 220 | 221 | for word in text.split(): 222 | # Check for part of speech markers 223 | if any( 224 | word.startswith(pos + ".") 225 | for pos in ["n", "v", "vt", "vi", "adj", "adv"] 226 | ): 227 | if 
current: 228 | parts.append(" ".join(current)) 229 | word = f"{word}" 230 | current = [word] 231 | else: 232 | word = f"{word}" 233 | current.append(word) 234 | 235 | if current: 236 | parts.append(" ".join(current)) 237 | 238 | return "
".join(parts) 239 | 240 | def format_trans(self, translation: str, tense: str, distribution: str) -> str: 241 | """Add tense and distribution of each word in Translation part""" 242 | if not tense: 243 | # AI is disabled 244 | return f"{translation}

{distribution}" 245 | 246 | return f"{translation}

{tense}

{distribution}" 247 | 248 | def format_youdao(self, data: dict) -> str: 249 | """format youdao example_phrases and example_sentences""" 250 | result = [] 251 | 252 | # Format phrases if they exist 253 | if "example_phrases" in data and data["example_phrases"]: 254 | result.append("【短语】") 255 | phrases = [] 256 | for phrase in data["example_phrases"]: 257 | formatted_phrase = f"
  • {phrase['english']} {phrase['chinese']}
  • " 258 | phrases.append(formatted_phrase) 259 | 260 | result.append("".join(phrases)) 261 | 262 | # Format sentences if they exist 263 | if "example_sentences" in data and data["example_sentences"]: 264 | result.append("【例句】") 265 | phrases = [] 266 | for sentence in data["example_sentences"]: 267 | formatted_sentence = f"
  • {sentence['english']} {sentence['chinese']}
  • " 268 | phrases.append(formatted_sentence) 269 | 270 | result.append("".join(phrases)) 271 | 272 | return "
    ".join(result) 273 | 274 | def add_note(self, data: dict): 275 | note = genanki.Note( 276 | model=self.model, 277 | fields=[ 278 | # 词头 279 | data.get("Word", ""), 280 | # 读音 281 | f"[sound:{data.get('Pronunciation', '')}]", 282 | # 音标 + 考试大纲 + 语料库词频: [ә'bændәn] (高考 四级 六级 考研 托福 GRE 2057/2182) 283 | f"{data.get('ECDict', {}).get('phonetic', '')}", 284 | f"{data.get('ECDict', {}).get('tag', '')} {data.get('ECDict', {}).get('bnc', '')}/{data.get('ECDict', {}).get('frq', '')}", 285 | # Ecdict 中文解释 + 释义分布 + 时态 286 | self.format_trans( 287 | self.format_pos(data.get("ECDict", {}).get("translation", "")), 288 | data.get("ECDict", {}).get("distribution", ""), 289 | data.get("AI", {}).get("tenses", ""), 290 | ), 291 | # TODO: use better English source 292 | f"【英解】
    {self.format_pos(data.get('ECDict', {}).get('definition', ''))}", 293 | # 有道词典示例短语和句子 294 | self.format_youdao(data.get("Youdao", {})), 295 | # AI词源、助记 296 | "" 297 | if not data.get("AI") 298 | else f"【词源】
    {data.get('AI', {}).get('origin', {}).get('etymology', '')}", 299 | "" 300 | if not data.get("AI") 301 | else f"【联想助记】{data.get('AI', {}).get('origin', {}).get('mnemonic', {}).get('associative', '')}", 302 | "" 303 | if not data.get("AI") 304 | else f"【谐音助记】{data.get('AI', {}).get('origin', {}).get('mnemonic', {}).get('homophone', '')}", 305 | # 词语辨析 306 | "" 307 | if not data.get("ECDict", {}).get("diffrentiation", "") 308 | else f"【辨析】{data.get('ECDict', {}).get('diffrentiation', '')}", 309 | # 故事 310 | "" 311 | if not data.get("AI") 312 | else f"【故事】 {data.get('AI', {}).get('story', {}).get('english', '')}

    {data.get('AI', {}).get('story', {}).get('chinese', '')}", 313 | ], 314 | ) 315 | self.deck.add_note(note) 316 | self.added = True 317 | 318 | def write_to_file(self, file_path: str, mp3_files): 319 | package = genanki.Package(self.deck) 320 | package.media_files = mp3_files 321 | package.write_to_file(file_path) 322 | -------------------------------------------------------------------------------- /anki_packager/cli.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import asyncio 3 | import os 4 | from os import environ as env 5 | import tomllib 6 | from tqdm.asyncio import tqdm 7 | import signal 8 | 9 | ### config 10 | from anki_packager.utils import get_user_config_dir 11 | 12 | ### logger 13 | from anki_packager.logger import logger 14 | 15 | ### AI 16 | from anki_packager.ai import llm 17 | 18 | ### Dictionaries 19 | from anki_packager.dict.youdao import YoudaoScraper 20 | from anki_packager.dict.ecdict import Ecdict 21 | from anki_packager.dict.eudic import EUDIC 22 | 23 | ### Anki 24 | from anki_packager.packager.deck import AnkiDeckCreator 25 | 26 | MAX_RETRIES = 3 # 最大重试次数 27 | RETRY_DELAY = 2 # 每次重试前的等待时间(秒) 28 | CONCURRENCY_LIMIT = 40 # 并发数 29 | 30 | 31 | def create_signal_handler(anki, audio_files, DECK_NAME): 32 | def signal_handler(sig, frame): 33 | logger.info("\033[1;31m程序被 异常中止...\033[0m") 34 | logger.info("正在写入已处理完毕的卡片...") 35 | anki.write_to_file(f"{DECK_NAME}.apkg", audio_files) 36 | logger.info("正在退出...") 37 | exit(0) 38 | 39 | return signal_handler 40 | 41 | 42 | async def main(): 43 | parser = argparse.ArgumentParser() 44 | 45 | parser.add_argument("--word", dest="word", type=str, help="word to add") 46 | 47 | parser.add_argument( 48 | "--retry", 49 | action="store_true", 50 | help="Retry processing failed words only from config/failed.txt", 51 | ) 52 | 53 | parser.add_argument( 54 | "--disable_ai", 55 | dest="disable_ai", 56 | action="store_true", 57 | help="Disable AI completions", 58 | ) 59 | 60 | # ./prog --eudicid: run eudic.get_studylist() 61 | parser.add_argument( 62 | "--eudicid", 63 | action="store_true", 64 | help="Display EUDIC studylist by id", 65 | ) 66 | 67 | parser.add_argument( 68 | "--eudic", 69 | action="store_true", 70 | help="Use EUDIC book instead of vocabulary.txt", 71 | ) 72 | 73 | # support user-defined txt file: ./prog --txt demo.txt 74 | parser.add_argument( 75 | "--txt", 76 | dest="txt_file", 77 | type=str, 78 | help="Use a custom txt file instead of vocabulary.txt", 79 | ) 80 | 81 | parser.add_argument("--model", dest="model", type=str, help="custome AI model") 82 | 83 | parser.add_argument( 84 | "-p", 85 | "--proxy", 86 | dest="proxy", 87 | type=str, 88 | default="", 89 | help="Default proxy like: http://127.0.0.1:7890", 90 | ) 91 | 92 | parser.add_argument( 93 | "--api_base", 94 | metavar="API_BASE_URL", 95 | dest="api_base", 96 | type=str, 97 | help="Default base url other than the OpenAI's official API address", 98 | ) 99 | 100 | options = parser.parse_args() 101 | 102 | ### set config according to config directory or parsed arguments 103 | config_dir = get_user_config_dir() 104 | config_path = os.path.join(config_dir, "config") 105 | 106 | ## 1. 
read config.toml 107 | with open(os.path.join(config_path, "config.toml"), "rb") as f: 108 | cfg = tomllib.load(f) 109 | MODEL_PARAM = cfg["MODEL_PARAM"] 110 | PROXY = cfg["PROXY"] 111 | EUDIC_TOKEN = cfg["EUDIC_TOKEN"] 112 | EUDIC_ID = cfg["EUDIC_ID"] 113 | DECK_NAME = cfg["DECK_NAME"] 114 | 115 | logger.info("配置读取完毕") 116 | 117 | # display eudict id only 118 | if options.eudicid: 119 | logger.info("设置:仅读取欧路词典 ID") 120 | eudic = EUDIC(EUDIC_TOKEN, EUDIC_ID) 121 | await eudic.get_studylist() 122 | exit(0) 123 | 124 | # only add word into vocabulary.txt line by line 125 | elif options.word: 126 | WORD = options.word 127 | vocab_path = os.path.join(config_path, "vocabulary.txt") 128 | with open(vocab_path, "a") as f: 129 | f.write(WORD + "\n") 130 | logger.info(f"单词: {WORD} 已添加进 {vocab_path}") 131 | exit(0) 132 | 133 | words = [] 134 | number_words = 0 135 | audio_files = [] 136 | ai = None 137 | 138 | anki = AnkiDeckCreator(f"{DECK_NAME}") 139 | ecdict = Ecdict() 140 | 141 | # AI 配置 142 | if options.disable_ai: 143 | logger.info("AI 功能已关闭") 144 | else: 145 | PROXY = options.proxy or PROXY 146 | if PROXY: 147 | env["HTTP_PROXY"] = PROXY 148 | env["HTTPS_PROXY"] = PROXY 149 | logger.info(f"使用代理: {PROXY}") 150 | 151 | # 初始化 AI 模型 152 | try: 153 | ai = llm(MODEL_PARAM) 154 | logger.info( 155 | f"当前使用的 AI 模型: {[param['model'] for param in MODEL_PARAM]}" 156 | ) 157 | except Exception as e: 158 | logger.error(f"初始化 AI 模型失败: {e}") 159 | exit(1) 160 | ## 4. vocabulary source: eudic data, custom txt file, or default vocabulary.txt 161 | if options.eudic: 162 | logger.info("配置: 对欧路词典生词本单词进行处理...") 163 | eudic = EUDIC(EUDIC_TOKEN, EUDIC_ID) 164 | r = await eudic.get_words() 165 | eudic_words = r["data"] 166 | for word in eudic_words: 167 | words.append(word["word"]) 168 | number_words = len(words) 169 | elif options.txt_file: 170 | txt_file_path = options.txt_file 171 | if not os.path.isabs(txt_file_path): 172 | # If relative path, resolve from current directory 173 | txt_file_path = os.path.abspath(txt_file_path) 174 | 175 | logger.info(f"配置: 对自定义单词文件 {txt_file_path} 进行处理...") 176 | try: 177 | with open(txt_file_path, "r") as vocab: 178 | for word in vocab: 179 | word = word.strip() 180 | if word: # Skip empty lines 181 | words.append(word) 182 | number_words = len(words) 183 | except FileNotFoundError: 184 | logger.error(f"文件 {txt_file_path} 未找到") 185 | exit(1) 186 | except Exception as e: 187 | logger.error(f"读取文件 {txt_file_path} 出错: {e}") 188 | exit(1) 189 | else: 190 | vocab_path = os.path.join(config_path, "vocabulary.txt") 191 | logger.info(f"配置: 对默认生词本单词 {vocab_path} 进行处理...") 192 | try: 193 | with open(vocab_path, "r") as vocab: 194 | for word in vocab: 195 | word = word.strip() 196 | if word: # Skip empty lines 197 | words.append(word) 198 | number_words = len(words) 199 | logger.info(f"从默认词库读取了 {number_words} 个单词") 200 | except FileNotFoundError: 201 | logger.error(f"默认词库文件 {vocab_path} 未找到") 202 | exit(1) 203 | except Exception as e: 204 | logger.error(f"读取默认词库文件出错: {e}") 205 | exit(1) 206 | vocab.close() 207 | 208 | signal.signal( 209 | signal.SIGINT, 210 | create_signal_handler(anki, audio_files, DECK_NAME), 211 | ) 212 | async with YoudaoScraper() as youdao: 213 | logger.info(f"开始并发处理 {len(words)} 个单词...") 214 | with tqdm(total=len(words), desc="开始处理") as pbar: 215 | tasks = [ 216 | task_wrapper(pbar, word, ai, anki, youdao, ecdict, audio_files) 217 | for word in words 218 | ] 219 | results = await asyncio.gather(*tasks, return_exceptions=True) 220 | 221 | successful_results = [] 222 | 
failed_words = [] 223 | 224 | for word, result in zip(words, results): 225 | if isinstance(result, Exception): 226 | failed_words.append(word) 227 | logger.error(f"未能成功处理 '{word}'. 错误: {result}") 228 | else: 229 | successful_results.append(result) 230 | 231 | if failed_words: 232 | failed_file = os.path.join(config_path, "failed.txt") 233 | logger.error( 234 | f"共 {len(failed_words)} 个单词处理失败,将它们写入 {failed_file}" 235 | ) 236 | with open(failed_file, "w", encoding="utf-8") as f: 237 | for word in failed_words: 238 | f.write(f"{word}\n") 239 | else: 240 | logger.info("所有单词均已成功处理!") 241 | 242 | try: 243 | if anki.added: 244 | anki.write_to_file(f"{DECK_NAME}.apkg", audio_files) 245 | logger.info(f"牌组生成完毕,请打开 {DECK_NAME}.apkg") 246 | except Exception as e: 247 | logger.error(f"Error saving Anki deck: {e}") 248 | 249 | 250 | async def process_word(word, ai, anki, youdao, ecdict, audio_files): 251 | data = {} 252 | data["Word"] = word 253 | 254 | # Get audio pronunciation from gtts 255 | audio_path = await youdao._get_audio(word) 256 | if not audio_path: 257 | raise Exception("Failed to get audio") 258 | 259 | audio_files.append(audio_path) 260 | # 只使用文件名作为 sound 标签的值 261 | audio_filename = os.path.basename(audio_path) 262 | data["Pronunciation"] = audio_filename 263 | 264 | # Get ECDICT definition 265 | dict_def = await ecdict.ret_word(word) 266 | if not dict_def: 267 | raise Exception("Failed to get ECDICT definition") 268 | data["ECDict"] = dict_def 269 | 270 | # Get Youdao dictionary information 271 | youdao_result = await youdao.get_word_info(word) 272 | if not youdao_result: 273 | raise Exception("Failed to get Youdao information") 274 | 275 | data["Youdao"] = youdao_result 276 | 277 | # Get AI explanation if AI is enabled 278 | if ai is not None: 279 | try: 280 | ai_explanation = await ai.explain(word) 281 | data["AI"] = ai_explanation 282 | except Exception as e: 283 | raise Exception(f"Failed to get AI explanation: {str(e)}") 284 | else: 285 | data["AI"] = {} 286 | 287 | # TODO: Longman English explain 288 | 289 | # Add note to deck 290 | anki.add_note(data) 291 | return True 292 | 293 | 294 | semaphore = asyncio.Semaphore(CONCURRENCY_LIMIT) 295 | 296 | 297 | async def process_word_with_retries(word, ai, anki, youdao, ecdict, audio_files): 298 | """ 299 | 包含了重试和退避逻辑 300 | """ 301 | for attempt in range(MAX_RETRIES): 302 | try: 303 | async with semaphore: 304 | result = await process_word(word, ai, anki, youdao, ecdict, audio_files) 305 | return result 306 | except Exception as e: 307 | logger.warning( 308 | f"处理 '{word}' 第 {attempt + 1}/{MAX_RETRIES} 次尝试失败: {e}" 309 | ) 310 | if attempt + 1 == MAX_RETRIES: 311 | # 如果是最后一次尝试,则不再捕获异常,让它冒泡出去 312 | # gather(return_exceptions=True) 会捕获这个最终的异常 313 | logger.error(f"'{word}' 在所有 {MAX_RETRIES} 次尝试后最终失败。") 314 | raise 315 | await asyncio.sleep(RETRY_DELAY) 316 | 317 | 318 | async def task_wrapper(pbar, word, ai, anki, youdao, ecdict, audio_files): 319 | """ 320 | 运行带重试逻辑的任务,并确保进度条在最后总会更新。 321 | """ 322 | try: 323 | r = await process_word_with_retries(word, ai, anki, youdao, ecdict, audio_files) 324 | pbar.set_description(f"'{word}' 添加成功") 325 | return r 326 | except Exception: 327 | pbar.set_description(f"'{word}' 处理失败") 328 | raise 329 | finally: 330 | pbar.update(1) 331 | 332 | 333 | if __name__ == "__main__": 334 | asyncio.run(main()) 335 | -------------------------------------------------------------------------------- /anki_packager/dict/stardict.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # vim: set ts=4 sw=4 tw=0 et : 4 | #====================================================================== 5 | # 6 | # stardict.py - 7 | # 8 | # Created by skywind on 2011/05/13 9 | # Last Modified: 2019/11/09 23:47 10 | # 11 | #====================================================================== 12 | from __future__ import print_function 13 | import sys 14 | import time 15 | import os 16 | import io 17 | import csv 18 | import sqlite3 19 | import codecs 20 | 21 | try: 22 | import json 23 | except: 24 | import simplejson as json 25 | 26 | MySQLdb = None 27 | 28 | 29 | #---------------------------------------------------------------------- 30 | # python3 compatible 31 | #---------------------------------------------------------------------- 32 | if sys.version_info[0] >= 3: 33 | unicode = str 34 | long = int 35 | xrange = range 36 | 37 | 38 | #---------------------------------------------------------------------- 39 | # word strip 40 | #---------------------------------------------------------------------- 41 | def stripword(word): 42 | return (''.join([ n for n in word if n.isalnum() ])).lower() 43 | 44 | 45 | #---------------------------------------------------------------------- 46 | # StarDict 47 | #---------------------------------------------------------------------- 48 | class StarDict (object): 49 | 50 | def __init__ (self, filename, verbose = False): 51 | self.__dbname = filename 52 | if filename != ':memory:': 53 | os.path.abspath(filename) 54 | self.__conn = None 55 | self.__verbose = verbose 56 | self.__open() 57 | 58 | # 初始化并创建必要的表格和索引 59 | def __open (self): 60 | sql = ''' 61 | CREATE TABLE IF NOT EXISTS "stardict" ( 62 | "id" INTEGER PRIMARY KEY AUTOINCREMENT NOT NULL UNIQUE, 63 | "word" VARCHAR(64) COLLATE NOCASE NOT NULL UNIQUE, 64 | "sw" VARCHAR(64) COLLATE NOCASE NOT NULL, 65 | "phonetic" VARCHAR(64), 66 | "definition" TEXT, 67 | "translation" TEXT, 68 | "pos" VARCHAR(16), 69 | "collins" INTEGER DEFAULT(0), 70 | "oxford" INTEGER DEFAULT(0), 71 | "tag" VARCHAR(64), 72 | "bnc" INTEGER DEFAULT(NULL), 73 | "frq" INTEGER DEFAULT(NULL), 74 | "exchange" TEXT, 75 | "detail" TEXT, 76 | "audio" TEXT 77 | ); 78 | CREATE UNIQUE INDEX IF NOT EXISTS "stardict_1" ON stardict (id); 79 | CREATE UNIQUE INDEX IF NOT EXISTS "stardict_2" ON stardict (word); 80 | CREATE INDEX IF NOT EXISTS "stardict_3" ON stardict (sw, word collate nocase); 81 | CREATE INDEX IF NOT EXISTS "sd_1" ON stardict (word collate nocase); 82 | ''' 83 | 84 | self.__conn = sqlite3.connect(self.__dbname, isolation_level = "IMMEDIATE") 85 | self.__conn.isolation_level = "IMMEDIATE" 86 | 87 | sql = '\n'.join([ n.strip('\t') for n in sql.split('\n') ]) 88 | sql = sql.strip('\n') 89 | 90 | self.__conn.executescript(sql) 91 | self.__conn.commit() 92 | 93 | fields = ( 'id', 'word', 'sw', 'phonetic', 'definition', 94 | 'translation', 'pos', 'collins', 'oxford', 'tag', 'bnc', 'frq', 95 | 'exchange', 'detail', 'audio' ) 96 | self.__fields = tuple([(fields[i], i) for i in range(len(fields))]) 97 | self.__names = { } 98 | for k, v in self.__fields: 99 | self.__names[k] = v 100 | self.__enable = self.__fields[3:] 101 | return True 102 | 103 | # 数据库记录转化为字典 104 | def __record2obj (self, record): 105 | if record is None: 106 | return None 107 | word = {} 108 | for k, v in self.__fields: 109 | word[k] = record[v] 110 | if word['detail']: 111 | text = word['detail'] 112 | try: 113 | obj = json.loads(text) 114 | except: 115 | obj = None 116 | word['detail'] = obj 117 | return word 118 | 
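# ----------------------------------------------------------------------
# Usage sketch (illustrative only; assumes a writable path): the three
# core operations of StarDict are register / query / match. The `word`
# column is declared COLLATE NOCASE, so lookups are case-insensitive.
#
#     sd = StarDict('/tmp/dict.db')
#     sd.register('apple', {'phonetic': "'aepl", 'translation': 'n. 苹果'})
#     print(sd.query('Apple')['translation'])   # case-insensitive hit
#     print(sd.match('app', 5))                 # [(id, word), ...] pairs
#     sd.close()
# ----------------------------------------------------------------------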
119 | # 关闭数据库 120 | def close (self): 121 | if self.__conn: 122 | self.__conn.close() 123 | self.__conn = None 124 | 125 | def __del__ (self): 126 | self.close() 127 | 128 | # 输出日志 129 | def out (self, text): 130 | if self.__verbose: 131 | print(text) 132 | return True 133 | 134 | # 查询单词 135 | def query (self, key): 136 | c = self.__conn.cursor() 137 | record = None 138 | if isinstance(key, int) or isinstance(key, long): 139 | c.execute('select * from stardict where id = ?;', (key,)) 140 | elif isinstance(key, str) or isinstance(key, unicode): 141 | c.execute('select * from stardict where word = ?', (key,)) 142 | else: 143 | return None 144 | record = c.fetchone() 145 | return self.__record2obj(record) 146 | 147 | # 查询单词匹配 148 | def match (self, word, limit = 10, strip = False): 149 | c = self.__conn.cursor() 150 | if not strip: 151 | sql = 'select id, word from stardict where word >= ? ' 152 | sql += 'order by word collate nocase limit ?;' 153 | c.execute(sql, (word, limit)) 154 | else: 155 | sql = 'select id, word from stardict where sw >= ? ' 156 | sql += 'order by sw, word collate nocase limit ?;' 157 | c.execute(sql, (stripword(word), limit)) 158 | records = c.fetchall() 159 | result = [] 160 | for record in records: 161 | result.append(tuple(record)) 162 | return result 163 | 164 | # 批量查询 165 | def query_batch (self, keys): 166 | sql = 'select * from stardict where ' 167 | if keys is None: 168 | return None 169 | if not keys: 170 | return [] 171 | querys = [] 172 | for key in keys: 173 | if isinstance(key, int) or isinstance(key, long): 174 | querys.append('id = ?') 175 | elif key is not None: 176 | querys.append('word = ?') 177 | sql = sql + ' or '.join(querys) + ';' 178 | query_word = {} 179 | query_id = {} 180 | c = self.__conn.cursor() 181 | c.execute(sql, tuple(keys)) 182 | for row in c: 183 | obj = self.__record2obj(row) 184 | query_word[obj['word'].lower()] = obj 185 | query_id[obj['id']] = obj 186 | results = [] 187 | for key in keys: 188 | if isinstance(key, int) or isinstance(key, long): 189 | results.append(query_id.get(key, None)) 190 | elif key is not None: 191 | results.append(query_word.get(key.lower(), None)) 192 | else: 193 | results.append(None) 194 | return tuple(results) 195 | 196 | # 取得单词总数 197 | def count (self): 198 | c = self.__conn.cursor() 199 | c.execute('select count(*) from stardict;') 200 | record = c.fetchone() 201 | return record[0] 202 | 203 | # 注册新单词 204 | def register (self, word, items, commit = True): 205 | sql = 'INSERT INTO stardict(word, sw) VALUES(?, ?);' 206 | try: 207 | self.__conn.execute(sql, (word, stripword(word))) 208 | except sqlite3.IntegrityError as e: 209 | self.out(str(e)) 210 | return False 211 | except sqlite3.Error as e: 212 | self.out(str(e)) 213 | return False 214 | self.update(word, items, commit) 215 | return True 216 | 217 | # 删除单词 218 | def remove (self, key, commit = True): 219 | if isinstance(key, int) or isinstance(key, long): 220 | sql = 'DELETE FROM stardict WHERE id=?;' 221 | else: 222 | sql = 'DELETE FROM stardict WHERE word=?;' 223 | try: 224 | self.__conn.execute(sql, (key,)) 225 | if commit: 226 | self.__conn.commit() 227 | except sqlite3.IntegrityError: 228 | return False 229 | return True 230 | 231 | # 清空数据库 232 | def delete_all (self, reset_id = False): 233 | sql1 = 'DELETE FROM stardict;' 234 | sql2 = "UPDATE sqlite_sequence SET seq = 0 WHERE name = 'stardict';" 235 | try: 236 | self.__conn.execute(sql1) 237 | if reset_id: 238 | self.__conn.execute(sql2) 239 | self.__conn.commit() 240 | except 
sqlite3.IntegrityError as e: 241 | self.out(str(e)) 242 | return False 243 | except sqlite3.Error as e: 244 | self.out(str(e)) 245 | return False 246 | return True 247 | 248 | # 更新单词数据 249 | def update (self, key, items, commit = True): 250 | names = [] 251 | values = [] 252 | for name, id in self.__enable: 253 | if name in items: 254 | names.append(name) 255 | value = items[name] 256 | if name == 'detail': 257 | if value is not None: 258 | value = json.dumps(value, ensure_ascii = False) 259 | values.append(value) 260 | if len(names) == 0: 261 | if commit: 262 | try: 263 | self.__conn.commit() 264 | except sqlite3.IntegrityError: 265 | return False 266 | return False 267 | sql = 'UPDATE stardict SET ' + ', '.join(['%s=?'%n for n in names]) 268 | if isinstance(key, str) or isinstance(key, unicode): 269 | sql += ' WHERE word=?;' 270 | else: 271 | sql += ' WHERE id=?;' 272 | try: 273 | self.__conn.execute(sql, tuple(values + [key])) 274 | if commit: 275 | self.__conn.commit() 276 | except sqlite3.IntegrityError: 277 | return False 278 | return True 279 | 280 | # 浏览词典 281 | def __iter__ (self): 282 | c = self.__conn.cursor() 283 | sql = 'select "id", "word" from "stardict"' 284 | sql += ' order by "word" collate nocase;' 285 | c.execute(sql) 286 | return c.__iter__() 287 | 288 | # 取得长度 289 | def __len__ (self): 290 | return self.count() 291 | 292 | # 检测存在 293 | def __contains__ (self, key): 294 | return self.query(key) is not None 295 | 296 | # 查询单词 297 | def __getitem__ (self, key): 298 | return self.query(key) 299 | 300 | # 提交变更 301 | def commit (self): 302 | try: 303 | self.__conn.commit() 304 | except sqlite3.IntegrityError: 305 | self.__conn.rollback() 306 | return False 307 | return True 308 | 309 | # 取得所有单词 310 | def dumps (self): 311 | return [ n for _, n in self.__iter__() ] 312 | 313 | 314 | 315 | #---------------------------------------------------------------------- 316 | # startup MySQLdb 317 | #---------------------------------------------------------------------- 318 | def mysql_startup(): 319 | global MySQLdb 320 | if MySQLdb is not None: 321 | return True 322 | try: 323 | import MySQLdb as _mysql 324 | MySQLdb = _mysql 325 | except ImportError: 326 | return False 327 | return True 328 | 329 | 330 | #---------------------------------------------------------------------- 331 | # DictMysql 332 | #---------------------------------------------------------------------- 333 | class DictMySQL (object): 334 | 335 | def __init__ (self, desc, init = False, timeout = 10, verbose = False): 336 | self.__argv = {} 337 | self.__uri = {} 338 | if isinstance(desc, dict): 339 | argv = desc 340 | else: 341 | argv = self.__url_parse(desc) 342 | for k, v in argv.items(): 343 | self.__argv[k] = v 344 | if k not in ('engine', 'init', 'db', 'verbose'): 345 | self.__uri[k] = v 346 | self.__uri['connect_timeout'] = timeout 347 | self.__conn = None 348 | self.__verbose = verbose 349 | self.__init = init 350 | if 'db' not in argv: 351 | raise KeyError('not find db name') 352 | self.__open() 353 | 354 | def __open (self): 355 | mysql_startup() 356 | if MySQLdb is None: 357 | raise ImportError('No module named MySQLdb') 358 | fields = [ 'id', 'word', 'sw', 'phonetic', 'definition', 359 | 'translation', 'pos', 'collins', 'oxford', 'tag', 'bnc', 'frq', 360 | 'exchange', 'detail', 'audio' ] 361 | self.__fields = tuple([(fields[i], i) for i in range(len(fields))]) 362 | self.__names = { } 363 | for k, v in self.__fields: 364 | self.__names[k] = v 365 | self.__enable = self.__fields[3:] 366 | self.__db = 
self.__argv.get('db', 'stardict') 367 | if not self.__init: 368 | uri = {} 369 | for k, v in self.__uri.items(): 370 | uri[k] = v 371 | uri['db'] = self.__db 372 | self.__conn = MySQLdb.connect(**uri) 373 | else: 374 | self.__conn = MySQLdb.connect(**self.__uri) 375 | return self.init() 376 | return True 377 | 378 | # 输出日志 379 | def out (self, text): 380 | if self.__verbose: 381 | print(text) 382 | return True 383 | 384 | # 初始化数据库与表格 385 | def init (self): 386 | database = self.__argv.get('db', 'stardict') 387 | self.out('create database: %s'%database) 388 | self.__conn.query("SET sql_notes = 0;") 389 | self.__conn.query('CREATE DATABASE IF NOT EXISTS %s;'%database) 390 | self.__conn.query('USE %s;'%database) 391 | # self.__conn.query('drop table if exists stardict') 392 | sql = ''' 393 | CREATE TABLE IF NOT EXISTS `%s`.`stardict` ( 394 | `id` INT PRIMARY KEY NOT NULL AUTO_INCREMENT, 395 | `word` VARCHAR(64) NOT NULL UNIQUE KEY, 396 | `sw` VARCHAR(64) NOT NULL, 397 | `phonetic` VARCHAR(64), 398 | `definition` TEXT, 399 | `translation` TEXT, 400 | `pos` VARCHAR(16), 401 | `collins` SMALLINT DEFAULT 0, 402 | `oxford` SMALLINT DEFAULT 0, 403 | `tag` VARCHAR(64), 404 | `bnc` INT DEFAULT NULL, 405 | `frq` INT DEFAULT NULL, 406 | `exchange` TEXT, 407 | `detail` TEXT, 408 | `audio` TEXT, 409 | KEY(`sw`, `word`), 410 | KEY(`collins`), 411 | KEY(`oxford`), 412 | KEY(`tag`) 413 | ) 414 | '''%(database) 415 | sql = '\n'.join([ n.strip('\t') for n in sql.split('\n') ]) 416 | sql = sql.strip('\n') 417 | sql += ' ENGINE=MyISAM DEFAULT CHARSET=utf8;' 418 | self.__conn.query(sql) 419 | self.__conn.commit() 420 | return True 421 | 422 | # 读取 mysql://user:passwd@host:port/database 423 | def __url_parse (self, url): 424 | if url[:8] != 'mysql://': 425 | return None 426 | url = url[8:] 427 | obj = {} 428 | part = url.split('/') 429 | main = part[0] 430 | p1 = main.find('@') 431 | if p1 >= 0: 432 | text = main[:p1].strip() 433 | main = main[p1 + 1:] 434 | p1 = text.find(':') 435 | if p1 >= 0: 436 | obj['user'] = text[:p1].strip() 437 | obj['passwd'] = text[p1 + 1:].strip() 438 | else: 439 | obj['user'] = text 440 | p1 = main.find(':') 441 | if p1 >= 0: 442 | port = main[p1 + 1:] 443 | main = main[:p1] 444 | obj['port'] = int(port) 445 | main = main.strip() 446 | if not main: 447 | main = 'localhost' 448 | obj['host'] = main.strip() 449 | if len(part) >= 2: 450 | obj['db'] = part[1] 451 | return obj 452 | 453 | # 数据库记录转化为字典 454 | def __record2obj (self, record): 455 | if record is None: 456 | return None 457 | word = {} 458 | for k, v in self.__fields: 459 | word[k] = record[v] 460 | if word['detail']: 461 | text = word['detail'] 462 | try: 463 | obj = json.loads(text) 464 | except: 465 | obj = None 466 | word['detail'] = obj 467 | return word 468 | 469 | # 关闭数据库 470 | def close (self): 471 | if self.__conn: 472 | self.__conn.close() 473 | self.__conn = None 474 | 475 | def __del__ (self): 476 | self.close() 477 | 478 | # 查询单词 479 | def query (self, key): 480 | record = None 481 | if isinstance(key, int) or isinstance(key, long): 482 | sql = 'select * from stardict where id = %s;' 483 | elif isinstance(key, str) or isinstance(key, unicode): 484 | sql = 'select * from stardict where word = %s;' 485 | else: 486 | return None 487 | with self.__conn as c: 488 | c.execute(sql, (key,)) 489 | record = c.fetchone() 490 | return self.__record2obj(record) 491 | 492 | # 查询单词匹配 493 | def match (self, word, limit = 10, strip = False): 494 | c = self.__conn.cursor() 495 | if not strip: 496 | sql = 'select id, word from 
stardict where word >= %s ' 497 | sql += 'order by word limit %s;' 498 | c.execute(sql, (word, limit)) 499 | else: 500 | sql = 'select id, word from stardict where sw >= %s ' 501 | sql += 'order by sw, word limit %s;' 502 | c.execute(sql, (stripword(word), limit)) 503 | records = c.fetchall() 504 | result = [] 505 | for record in records: 506 | result.append(tuple(record)) 507 | return result 508 | 509 | # 批量查询 510 | def query_batch (self, keys): 511 | sql = 'select * from stardict where ' 512 | if keys is None: 513 | return None 514 | if not keys: 515 | return [] 516 | querys = [] 517 | for key in keys: 518 | if isinstance(key, int) or isinstance(key, long): 519 | querys.append('id = %s') 520 | elif key is not None: 521 | querys.append('word = %s') 522 | sql = sql + ' or '.join(querys) + ';' 523 | query_word = {} 524 | query_id = {} 525 | with self.__conn as c: 526 | c.execute(sql, tuple(keys)) 527 | for row in c: 528 | obj = self.__record2obj(row) 529 | query_word[obj['word'].lower()] = obj 530 | query_id[obj['id']] = obj 531 | results = [] 532 | for key in keys: 533 | if isinstance(key, int) or isinstance(key, long): 534 | results.append(query_id.get(key, None)) 535 | elif key is not None: 536 | results.append(query_word.get(key.lower(), None)) 537 | else: 538 | results.append(None) 539 | return tuple(results) 540 | 541 | # 注册新单词 542 | def register (self, word, items, commit = True): 543 | sql = 'INSERT INTO stardict(word, sw) VALUES(%s, %s);' 544 | try: 545 | with self.__conn as c: 546 | c.execute(sql, (word, stripword(word))) 547 | except MySQLdb.Error as e: 548 | self.out(str(e)) 549 | return False 550 | self.update(word, items, commit) 551 | return True 552 | 553 | # 删除单词 554 | def remove (self, key, commit = True): 555 | if isinstance(key, int) or isinstance(key, long): 556 | sql = 'DELETE FROM stardict WHERE id=%s;' 557 | else: 558 | sql = 'DELETE FROM stardict WHERE word=%s;' 559 | try: 560 | with self.__conn as c: 561 | c.execute(sql, (key,)) 562 | except MySQLdb.Error as e: 563 | self.out(str(e)) 564 | return False 565 | return True 566 | 567 | # 清空数据库 568 | def delete_all (self, reset_id = False): 569 | sql1 = 'DELETE FROM stardict;' 570 | try: 571 | with self.__conn as c: 572 | c.execute(sql1) 573 | except MySQLdb.Error as e: 574 | self.out(str(e)) 575 | return False 576 | return True 577 | 578 | # 更新单词数据 579 | def update (self, key, items, commit = True): 580 | names = [] 581 | values = [] 582 | for name, id in self.__enable: 583 | if name in items: 584 | names.append(name) 585 | value = items[name] 586 | if name == 'detail': 587 | if value is not None: 588 | value = json.dumps(value, ensure_ascii = False) 589 | values.append(value) 590 | if len(names) == 0: 591 | if commit: 592 | try: 593 | self.__conn.commit() 594 | except MySQLdb.Error as e: 595 | self.out(str(e)) 596 | return False 597 | return False 598 | sql = 'UPDATE stardict SET ' + ', '.join(['%s=%%s'%n for n in names]) 599 | if isinstance(key, str) or isinstance(key, unicode): 600 | sql += ' WHERE word=%s;' 601 | else: 602 | sql += ' WHERE id=%s;' 603 | try: 604 | with self.__conn as c: 605 | c.execute(sql, tuple(values + [key])) 606 | except MySQLdb.Error as e: 607 | self.out(str(e)) 608 | return False 609 | return True 610 | 611 | # 取得数据量 612 | def count (self): 613 | sql = 'SELECT count(*) FROM stardict;' 614 | try: 615 | with self.__conn as c: 616 | c.execute(sql) 617 | row = c.fetchone() 618 | return row[0] 619 | except MySQLdb.Error as e: 620 | self.out(str(e)) 621 | return -1 622 | return 0 623 | 624 | # 
提交数据 625 | def commit (self): 626 | try: 627 | self.__conn.commit() 628 | except MySQLdb.Error as e: 629 | self.out(str(e)) 630 | return False 631 | return True 632 | 633 | # 取得长度 634 | def __len__ (self): 635 | return self.count() 636 | 637 | # 检测存在 638 | def __contains__ (self, key): 639 | return self.query(key) is not None 640 | 641 | # 查询单词 642 | def __getitem__ (self, key): 643 | return self.query(key) 644 | 645 | # 取得所有单词 646 | def dumps (self): 647 | return [ n for _, n in self.__iter__() ] 648 | 649 | 650 | 651 | #---------------------------------------------------------------------- 652 | # CSV COLUMNS 653 | #---------------------------------------------------------------------- 654 | COLUMN_SIZE = 13 655 | COLUMN_ID = COLUMN_SIZE 656 | COLUMN_SD = COLUMN_SIZE + 1 657 | COLUMN_SW = COLUMN_SIZE + 2 658 | 659 | 660 | #---------------------------------------------------------------------- 661 | # DictCsv 662 | #---------------------------------------------------------------------- 663 | class DictCsv (object): 664 | 665 | def __init__ (self, filename, codec = 'utf-8'): 666 | self.__csvname = None 667 | if filename is not None: 668 | self.__csvname = os.path.abspath(filename) 669 | self.__codec = codec 670 | self.__heads = ( 'word', 'phonetic', 'definition', 671 | 'translation', 'pos', 'collins', 'oxford', 'tag', 'bnc', 'frq', 672 | 'exchange', 'detail', 'audio' ) 673 | heads = self.__heads 674 | self.__fields = tuple([ (heads[i], i) for i in range(len(heads)) ]) 675 | self.__names = {} 676 | for k, v in self.__fields: 677 | self.__names[k] = v 678 | numbers = [] 679 | for name in ('collins', 'oxford', 'bnc', 'frq'): 680 | numbers.append(self.__names[name]) 681 | self.__numbers = tuple(numbers) 682 | self.__enable = self.__fields[1:] 683 | self.__dirty = False 684 | self.__words = {} 685 | self.__rows = [] 686 | self.__index = [] 687 | self.__read() 688 | 689 | def reset (self): 690 | self.__dirty = False 691 | self.__words = {} 692 | self.__rows = [] 693 | self.__index = [] 694 | return True 695 | 696 | def encode (self, text): 697 | if text is None: 698 | return None 699 | text = text.replace('\\', '\\\\').replace('\n', '\\n') 700 | return text.replace('\r', '\\r') 701 | 702 | def decode (self, text): 703 | output = [] 704 | i = 0 705 | if text is None: 706 | return None 707 | size = len(text) 708 | while i < size: 709 | c = text[i] 710 | if c == '\\': 711 | c = text[i + 1:i + 2] 712 | if c == '\\': 713 | output.append('\\') 714 | elif c == 'n': 715 | output.append('\n') 716 | elif c == 'r': 717 | output.append('\r') 718 | else: 719 | output.append('\\' + c) 720 | i += 2 721 | else: 722 | output.append(c) 723 | i += 1 724 | return ''.join(output) 725 | 726 | # 安全转行整数 727 | def readint (self, text): 728 | if text is None: 729 | return None 730 | if text == '': 731 | return 0 732 | try: 733 | x = long(text) 734 | except: 735 | return 0 736 | if x < 0x7fffffff: 737 | return int(x) 738 | return x 739 | 740 | # 读取文件 741 | def __read (self): 742 | self.reset() 743 | filename = self.__csvname 744 | if filename is None: 745 | return False 746 | if not os.path.exists(self.__csvname): 747 | return False 748 | codec = self.__codec 749 | if sys.version_info[0] < 3: 750 | fp = open(filename, 'rb') 751 | content = fp.read() 752 | if not isinstance(content, type(b'')): 753 | content = content.encode(codec, 'ignore') 754 | content = content.replace(b'\r\n', b'\n') 755 | bio = io.BytesIO() 756 | bio.write(content) 757 | bio.seek(0) 758 | reader = csv.reader(bio) 759 | else: 760 | reader = 
csv.reader(open(filename, encoding = codec)) 761 | rows = [] 762 | index = [] 763 | words = {} 764 | count = 0 765 | for row in reader: 766 | count += 1 767 | if count == 1: 768 | continue 769 | if len(row) < 1: 770 | continue 771 | if sys.version_info[0] < 3: 772 | row = [ n.decode(codec, 'ignore') for n in row ] 773 | if len(row) < COLUMN_SIZE: 774 | row.extend([None] * (COLUMN_SIZE - len(row))) 775 | if len(row) > COLUMN_SIZE: 776 | row = row[:COLUMN_SIZE] 777 | word = row[0].lower() 778 | if word in words: 779 | continue 780 | row.extend([0, 0, stripword(row[0])]) 781 | words[word] = 1 782 | rows.append(row) 783 | index.append(row) 784 | self.__rows = rows 785 | self.__index = index 786 | self.__rows.sort(key = lambda row: row[0].lower()) 787 | self.__index.sort(key = lambda row: (row[COLUMN_SW], row[0].lower())) 788 | for index in xrange(len(self.__rows)): 789 | row = self.__rows[index] 790 | row[COLUMN_ID] = index 791 | word = row[0].lower() 792 | self.__words[word] = row 793 | for index in xrange(len(self.__index)): 794 | row = self.__index[index] 795 | row[COLUMN_SD] = index 796 | return True 797 | 798 | # 保存文件 799 | def save (self, filename = None, codec = 'utf-8'): 800 | if filename is None: 801 | filename = self.__csvname 802 | if filename is None: 803 | return False 804 | if sys.version_info[0] < 3: 805 | fp = open(filename, 'wb') 806 | writer = csv.writer(fp) 807 | else: 808 | fp = open(filename, 'w', encoding = codec, newline = '') 809 | writer = csv.writer(fp) 810 | writer.writerow(self.__heads) 811 | for row in self.__rows: 812 | newrow = [] 813 | for n in row: 814 | if isinstance(n, int) or isinstance(n, long): 815 | n = str(n) 816 | elif not isinstance(n, bytes): 817 | if (n is not None) and sys.version_info[0] < 3: 818 | n = n.encode(codec, 'ignore') 819 | newrow.append(n) 820 | writer.writerow(newrow[:COLUMN_SIZE]) 821 | fp.close() 822 | return True 823 | 824 | # 对象解码 825 | def __obj_decode (self, row): 826 | if row is None: 827 | return None 828 | obj = {} 829 | obj['id'] = row[COLUMN_ID] 830 | obj['sw'] = row[COLUMN_SW] 831 | skip = self.__numbers 832 | for key, index in self.__fields: 833 | value = row[index] 834 | if index in skip: 835 | if value is not None: 836 | value = self.readint(value) 837 | elif key != 'detail': 838 | value = self.decode(value) 839 | obj[key] = value 840 | detail = obj.get('detail', None) 841 | if detail is not None: 842 | if detail != '': 843 | detail = json.loads(detail) 844 | else: 845 | detail = None 846 | obj['detail'] = detail 847 | return obj 848 | 849 | # 对象编码 850 | def __obj_encode (self, obj): 851 | row = [ None for i in xrange(len(self.__fields) + 3) ] 852 | for name, idx in self.__fields: 853 | value = obj.get(name, None) 854 | if value is None: 855 | continue 856 | if idx in self.__numbers: 857 | value = str(value) 858 | elif name == 'detail': 859 | value = json.dumps(value, ensure_ascii = False) 860 | else: 861 | value = self.encode(value) 862 | row[idx] = value 863 | return row 864 | 865 | # 重新排序 866 | def __resort (self): 867 | self.__rows.sort(key = lambda row: row[0].lower()) 868 | self.__index.sort(key = lambda row: (row[COLUMN_SW], row[0].lower())) 869 | for index in xrange(len(self.__rows)): 870 | row = self.__rows[index] 871 | row[COLUMN_ID] = index 872 | for index in xrange(len(self.__index)): 873 | row = self.__index[index] 874 | row[COLUMN_SD] = index 875 | self.__dirty = False 876 | 877 | # 查询单词 878 | def query (self, key): 879 | if key is None: 880 | return None 881 | if self.__dirty: 882 | self.__resort() 883 | if 
isinstance(key, int) or isinstance(key, long): 884 | if key < 0 or key >= len(self.__rows): 885 | return None 886 | return self.__obj_decode(self.__rows[key]) 887 | row = self.__words.get(key.lower(), None) 888 | return self.__obj_decode(row) 889 | 890 | # 查询单词匹配 891 | def match (self, word, count = 10, strip = False): 892 | if len(self.__rows) == 0: 893 | return [] 894 | if self.__dirty: 895 | self.__resort() 896 | if not strip: 897 | index = self.__rows 898 | pos = 0 899 | else: 900 | index = self.__index 901 | pos = COLUMN_SW 902 | top = 0 903 | bottom = len(index) - 1 904 | middle = top 905 | key = word.lower() 906 | if strip: 907 | key = stripword(word) 908 | while top < bottom: 909 | middle = (top + bottom) >> 1 910 | if top == middle or bottom == middle: 911 | break 912 | text = index[middle][pos].lower() 913 | if key == text: 914 | break 915 | elif key < text: 916 | bottom = middle 917 | elif key > text: 918 | top = middle 919 | while index[middle][pos].lower() < key: 920 | middle += 1 921 | if middle >= len(index): 922 | break 923 | cc = COLUMN_ID 924 | likely = [ (tx[cc], tx[0]) for tx in index[middle:middle + count] ] 925 | return likely 926 | 927 | # 批量查询 928 | def query_batch (self, keys): 929 | return [ self.query(key) for key in keys ] 930 | 931 | # 单词总量 932 | def count (self): 933 | return len(self.__rows) 934 | 935 | # 取得长度 936 | def __len__ (self): 937 | return len(self.__rows) 938 | 939 | # 取得单词 940 | def __getitem__ (self, key): 941 | return self.query(key) 942 | 943 | # 是否存在 944 | def __contains__ (self, key): 945 | return self.__words.__contains__(key.lower()) 946 | 947 | # 迭代器 948 | def __iter__ (self): 949 | record = [] 950 | for index in xrange(len(self.__rows)): 951 | record.append((index, self.__rows[index][0])) 952 | return record.__iter__() 953 | 954 | # 注册新单词 955 | def register (self, word, items, commit = True): 956 | if word.lower() in self.__words: 957 | return False 958 | row = self.__obj_encode(items) 959 | row[0] = word 960 | row[COLUMN_ID] = len(self.__rows) 961 | row[COLUMN_SD] = len(self.__rows) 962 | row[COLUMN_SW] = stripword(word) 963 | self.__rows.append(row) 964 | self.__index.append(row) 965 | self.__words[word.lower()] = row 966 | self.__dirty = True 967 | return True 968 | 969 | # 删除单词 970 | def remove (self, key, commit = True): 971 | if isinstance(key, int) or isinstance(key, long): 972 | if key < 0 or key >= len(self.__rows): 973 | return False 974 | if self.__dirty: 975 | self.__resort() 976 | key = self.__rows[key][0] 977 | row = self.__words.get(key, None) 978 | if row is None: 979 | return False 980 | if len(self.__rows) == 1: 981 | self.reset() 982 | return True 983 | index = row[COLUMN_ID] 984 | self.__rows[index] = self.__rows[len(self.__rows) - 1] 985 | self.__rows.pop() 986 | index = row[COLUMN_SD] 987 | self.__index[index] = self.__index[len(self.__rows) - 1] 988 | self.__index.pop() 989 | del self.__words[key] 990 | self.__dirty = True 991 | return True 992 | 993 | # 清空所有 994 | def delete_all (self, reset_id = False): 995 | self.reset() 996 | return True 997 | 998 | # 更改单词 999 | def update (self, key, items, commit = True): 1000 | if isinstance(key, int) or isinstance(key, long): 1001 | if key < 0 or key >= len(self.__rows): 1002 | return False 1003 | if self.__dirty: 1004 | self.__resort() 1005 | key = self.__rows[key][0] 1006 | key = key.lower() 1007 | row = self.__words.get(key, None) 1008 | if row is None: 1009 | return False 1010 | newrow = self.__obj_encode(items) 1011 | for name, idx in self.__fields: 1012 | if idx == 0: 
1013 | continue 1014 | if name in items: 1015 | row[idx] = newrow[idx] 1016 | return True 1017 | 1018 | # 提交变更 1019 | def commit (self): 1020 | if self.__csvname: 1021 | self.save(self.__csvname, self.__codec) 1022 | return True 1023 | 1024 | # 取得所有单词 1025 | def dumps (self): 1026 | return [ n for _, n in self.__iter__() ] 1027 | 1028 | 1029 | #---------------------------------------------------------------------- 1030 | # 词形衍生:查找动词的各种时态,名词的复数等,或反向查找 1031 | # 格式为每行一条数据:根词汇 -> 衍生1,衍生2,衍生3 1032 | # 可以用 Hunspell数据生成,下面有个日本人做的简版(1.8万组数据): 1033 | # http://www.lexically.net/downloads/version4/downloading%20BNC.htm 1034 | #---------------------------------------------------------------------- 1035 | class LemmaDB (object): 1036 | 1037 | def __init__ (self): 1038 | self._stems = {} 1039 | self._words = {} 1040 | self._frqs = {} 1041 | 1042 | # 读取数据 1043 | def load (self, filename, encoding = None): 1044 | content = open(filename, 'rb').read() 1045 | if content[:3] == b'\xef\xbb\xbf': 1046 | content = content[3:].decode('utf-8', 'ignore') 1047 | elif encoding is not None: 1048 | text = content.decode(encoding, 'ignore') 1049 | else: 1050 | text = None 1051 | match = ['utf-8', sys.getdefaultencoding(), 'ascii'] 1052 | for encoding in match + ['gbk', 'latin1']: 1053 | try: 1054 | text = content.decode(encoding) 1055 | break 1056 | except: 1057 | pass 1058 | if text is None: 1059 | text = content.decode('utf-8', 'ignore') 1060 | number = 0 1061 | for line in text.split('\n'): 1062 | number += 1 1063 | line = line.strip('\r\n ') 1064 | if (not line) or (line[:1] == ';'): 1065 | continue 1066 | pos = line.find('->') 1067 | if not pos: 1068 | continue 1069 | stem = line[:pos].strip() 1070 | p1 = stem.find('/') 1071 | frq = 0 1072 | if p1 >= 0: 1073 | frq = int(stem[p1 + 1:].strip()) 1074 | stem = stem[:p1].strip() 1075 | if not stem: 1076 | continue 1077 | if frq > 0: 1078 | self._frqs[stem] = frq 1079 | for word in line[pos + 2:].strip().split(','): 1080 | p1 = word.find('/') 1081 | if p1 >= 0: 1082 | word = word[:p1].strip() 1083 | if not word: 1084 | continue 1085 | self.add(stem, word.strip()) 1086 | return True 1087 | 1088 | # 保存数据文件 1089 | def save (self, filename, encoding = 'utf-8'): 1090 | stems = list(self._stems.keys()) 1091 | stems.sort(key = lambda x: x.lower()) 1092 | import codecs 1093 | fp = codecs.open(filename, 'w', encoding) 1094 | output = [] 1095 | for stem in stems: 1096 | words = self.get(stem) 1097 | if not words: 1098 | continue 1099 | frq = self._frqs.get(stem, 0) 1100 | if frq > 0: 1101 | stem = '%s/%d'%(stem, frq) 1102 | output.append((-frq, u'%s -> %s'%(stem, ','.join(words)))) 1103 | output.sort() 1104 | for _, text in output: 1105 | fp.write(text + '\n') 1106 | fp.close() 1107 | return True 1108 | 1109 | # 添加一个词根的一个衍生词 1110 | def add (self, stem, word): 1111 | if stem not in self._stems: 1112 | self._stems[stem] = {} 1113 | if word not in self._stems[stem]: 1114 | self._stems[stem][word] = len(self._stems[stem]) 1115 | if word not in self._words: 1116 | self._words[word] = {} 1117 | if stem not in self._words[word]: 1118 | self._words[word][stem] = len(self._words[word]) 1119 | return True 1120 | 1121 | # 删除一个词根的一个衍生词 1122 | def remove (self, stem, word): 1123 | count = 0 1124 | if stem in self._stems: 1125 | if word in self._stems[stem]: 1126 | del self._stems[stem][word] 1127 | count += 1 1128 | if not self._stems[stem]: 1129 | del self._stems[stem] 1130 | if word in self._words: 1131 | if stem in self._words[word]: 1132 | del self._words[word][stem] 1133 | count += 1 
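# ----------------------------------------------------------------------
# LemmaDB usage sketch (illustrative; assumes a lemma file in the
# documented `stem -> derived1,derived2` format, e.g. a line such as
# "take/128 -> took,taken,takes,taking"):
#
#     lemma = LemmaDB()
#     lemma.load('lemma.en.txt')
#     print(lemma.get('take'))        # ['took', 'taken', 'takes', 'taking']
#     print(lemma.word_stem('took'))  # ['take']
# ----------------------------------------------------------------------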
1134 | if not self._words[word]: 1135 | del self._words[word] 1136 | return (count > 0) and True or False 1137 | 1138 | # 清空数据库 1139 | def reset (self): 1140 | self._stems = {} 1141 | self._words = {} 1142 | return True 1143 | 1144 | # 根据词根找衍生,或者根据衍生反向找词根 1145 | def get (self, word, reverse = False): 1146 | if not reverse: 1147 | if word not in self._stems: 1148 | if word in self._words: 1149 | return [word] 1150 | return None 1151 | words = [ (v, k) for (k, v) in self._stems[word].items() ] 1152 | else: 1153 | if word not in self._words: 1154 | if word in self._stems: 1155 | return [word] 1156 | return None 1157 | words = [ (v, k) for (k, v) in self._words[word].items() ] 1158 | words.sort() 1159 | return [ k for (v, k) in words ] 1160 | 1161 | # 知道一个单词求它的词根 1162 | def word_stem (self, word): 1163 | return self.get(word, reverse = True) 1164 | 1165 | # 总共多少条词根数据 1166 | def stem_size (self): 1167 | return len(self._stems) 1168 | 1169 | # 总共多少条衍生数据 1170 | def word_size (self): 1171 | return len(self._words) 1172 | 1173 | def dump (self, what = 'ALL'): 1174 | words = {} 1175 | what = what.lower() 1176 | if what in ('all', 'stem'): 1177 | for word in self._stems: 1178 | words[word] = 1 1179 | if what in ('all', 'word'): 1180 | for word in self._words: 1181 | words[word] = 1 1182 | return words 1183 | 1184 | def __len__ (self): 1185 | return len(self._stems) 1186 | 1187 | def __getitem__ (self, stem): 1188 | return self.get(stem) 1189 | 1190 | def __contains__ (self, stem): 1191 | return (stem in self._stems) 1192 | 1193 | def __iter__ (self): 1194 | return self._stems.__iter__() 1195 | 1196 | 1197 | 1198 | #---------------------------------------------------------------------- 1199 | # DictHelper 1200 | #---------------------------------------------------------------------- 1201 | class DictHelper (object): 1202 | 1203 | def __init__ (self): 1204 | self._exchanges = {} 1205 | self._exchanges['p'] = u'过去式' 1206 | self._exchanges['d'] = u'过去分词' 1207 | self._exchanges['i'] = u'现在分词' 1208 | self._exchanges['3'] = u'第三人称单数' 1209 | self._exchanges['r'] = u'比较级' 1210 | self._exchanges['t'] = u'最高级' 1211 | self._exchanges['s'] = u'复数' 1212 | self._exchanges['0'] = u'原型' # best 的原型是 good 1213 | self._exchanges['1'] = u'类别' # best 的类别是 good 里的 t 1214 | self._pos = {} 1215 | self._pos['a'] = (u'代词', 'pron.') 1216 | self._pos['c'] = (u'连接词', 'conj.') 1217 | self._pos['d'] = (u'限定词', 'determiner') 1218 | self._pos['i'] = (u'介词', 'prep.') 1219 | self._pos['j'] = (u'形容词', 'adj.') 1220 | self._pos['m'] = (u'数词', 'num.') 1221 | self._pos['n'] = (u'名词', 'n.') 1222 | self._pos['p'] = (u'代词', 'pron.') 1223 | self._pos['r'] = (u'副词', 'adv.') 1224 | self._pos['u'] = (u'感叹词', 'int.') 1225 | self._pos['t'] = (u'不定式标记', 'infm.') 1226 | self._pos['v'] = (u'动词', 'v.') 1227 | self._pos['x'] = (u'否定标记', 'not') 1228 | 1229 | # 返回一个进度指示条,传入总量,每走一格调用一次 next 1230 | def progress (self, total): 1231 | class ProgressIndicator (object): 1232 | def __init__ (self, total): 1233 | self.count = 0 1234 | self.percent = -1 1235 | self.total = total 1236 | self.timestamp = time.time() 1237 | self.counter = {} 1238 | def next (self): 1239 | if self.total: 1240 | self.count += 1 1241 | pc = int(self.count * 100 / self.total) 1242 | if pc != self.percent: 1243 | self.percent = pc 1244 | print('progress: %d%%'%pc) 1245 | def inc (self, name): 1246 | if name not in self.counter: 1247 | self.counter[name] = 1 1248 | else: 1249 | self.counter[name] += 1 1250 | def done (self): 1251 | t = (time.time() - self.timestamp) 1252 | keys = 
list(self.counter.keys()) 1253 | keys.sort() 1254 | for key in keys: 1255 | print('[%s] -> %d'%(key, self.counter[key])) 1256 | print('[Finished in %d seconds (%d)]'%(t, self.count)) 1257 | return ProgressIndicator(total) 1258 | 1259 | # 返回词典里所有词的 map,默认转为小写 1260 | def dump_map (self, dictionary, lower = True): 1261 | words = {} 1262 | for _, word in dictionary: 1263 | if lower: 1264 | word = word.lower() 1265 | words[word] = 1 1266 | return words 1267 | 1268 | # 字典差异导出 1269 | def discrepancy_export (self, dictionary, words, outname, opts = ''): 1270 | existence = self.dump_map(dictionary) 1271 | if os.path.splitext(outname)[-1].lower() in ('.txt', '.csv'): 1272 | db = DictCsv(outname) 1273 | else: 1274 | db = StarDict(outname) 1275 | db.delete_all() 1276 | count = 0 1277 | for word in words: 1278 | if word.lower() in existence: 1279 | continue 1280 | if '(' in word: 1281 | continue 1282 | if '/' in word: 1283 | continue 1284 | if '"' in word or '#' in word: 1285 | continue 1286 | if '0' in word or '1' in word or '2' in word or '3' in word: 1287 | continue 1288 | if 's' in opts: 1289 | if word.count(' ') >= 2: 1290 | continue 1291 | if 't' in opts: 1292 | if ' ' in word: 1293 | continue 1294 | if 'p' in opts: 1295 | if '-' in word: 1296 | continue 1297 | try: 1298 | word.encode('ascii') 1299 | except: 1300 | continue 1301 | db.register(word, {'tag':'PENDING'}, False) 1302 | count += 1 1303 | db.commit() 1304 | print('exported %d entries'%count) 1305 | return count 1306 | 1307 | # 字典差异导入 1308 | def discrepancy_import (self, dictionary, filename, opts = ''): 1309 | existence = self.dump_map(dictionary) 1310 | if os.path.splitext(filename)[-1].lower() in ('.csv', '.txt'): 1311 | db = DictCsv(filename) 1312 | else: 1313 | db = StarDict(filename) 1314 | count = 0 1315 | for word in self.dump_map(db, False): 1316 | data = db[word] 1317 | if data is None: 1318 | continue 1319 | if data['tag'] != 'OK': 1320 | continue 1321 | phonetic = data.get('phonetic', '') 1322 | definition = data.get('definition', '') 1323 | translation = data.get('translation', '') 1324 | update = {} 1325 | if phonetic: 1326 | update['phonetic'] = phonetic 1327 | if definition: 1328 | update['definition'] = definition 1329 | if translation: 1330 | update['translation'] = translation 1331 | if not update: 1332 | continue 1333 | if word.lower() in existence: 1334 | if 'n' not in opts: 1335 | dictionary.update(word, update, False) 1336 | else: 1337 | dictionary.register(word, update, False) 1338 | count += 1 1339 | dictionary.commit() 1340 | print('imported %d entries'%count) 1341 | return count 1342 | 1343 | # 差异比较(utf-8 的.txt 文件,单词和后面音标释义用tab分割) 1344 | def deficit_tab_txt (self, dictionary, txt, outname, opts = ''): 1345 | deficit = {} 1346 | for line in codecs.open(txt, encoding = 'utf-8'): 1347 | row = [ n.strip() for n in line.split('\t') ] 1348 | if len(row) < 2: 1349 | continue 1350 | word = row[0] 1351 | deficit[word] = 1 1352 | return self.deficit_export(dictionary, deficit, outname, opts) 1353 | 1354 | # 导出星际译王的词典文件,根据一个单词到释义的字典 1355 | def export_stardict (self, wordmap, outname, title): 1356 | mainname = os.path.splitext(outname)[0] 1357 | keys = [ k for k in wordmap ] 1358 | keys.sort(key = lambda x: (x.lower(), x)) 1359 | import struct 1360 | pc = self.progress(len(wordmap)) 1361 | position = 0 1362 | with open(mainname + '.idx', 'wb') as f1: 1363 | with open(mainname + '.dict', 'wb') as f2: 1364 | for word in keys: 1365 | pc.next() 1366 | f1.write(word.encode('utf-8', 'ignore') + b'\x00') 1367 | text = 
wordmap[word].encode('utf-8', 'ignore') 1368 | f1.write(struct.pack('>II', position, len(text))) 1369 | f2.write(text) 1370 | position += len(text) 1371 | with open(mainname + '.ifo', 'wb') as f3: 1372 | f3.write("StarDict's dict ifo file\nversion=2.4.2\n") 1373 | f3.write('wordcount=%d\n'%len(wordmap)) 1374 | f3.write('idxfilesize=%d\n'%f1.tell()) 1375 | f3.write('bookname=%s\n'%title.encode('utf-8', 'ignore')) 1376 | f3.write('author=\ndescription=\n') 1377 | import datetime 1378 | ts = datetime.datetime.now().strftime('%Y.%m.%d') 1379 | f3.write('date=%s\nsametypesequence=m\n'%ts) 1380 | pc.done() 1381 | return True 1382 | 1383 | # 导出 mdict 的源文件 1384 | def export_mdict (self, wordmap, outname): 1385 | keys = [ k for k in wordmap ] 1386 | keys.sort(key = lambda x: x.lower()) 1387 | size = len(keys) 1388 | index = 0 1389 | pc = self.progress(size) 1390 | with codecs.open(outname, 'w', encoding = 'utf-8') as fp: 1391 | for key in keys: 1392 | pc.next() 1393 | word = key.replace('', '').replace('\n', ' ') 1394 | text = wordmap[key].replace('', '') 1395 | if not isinstance(word, unicode): 1396 | word = word.decode('gbk') 1397 | if not isinstance(text, unicode): 1398 | text = text.decode('gbk') 1399 | fp.write(word + '\r\n') 1400 | for line in text.split('\n'): 1401 | line = line.rstrip('\r') 1402 | fp.write(line) 1403 | fp.write('\r\n') 1404 | index += 1 1405 | fp.write('' + ((index < size) and '\r\n' or '')) 1406 | pc.done() 1407 | return True 1408 | 1409 | # 导入mdx源文件 1410 | def import_mdict (self, filename, encoding = 'utf-8'): 1411 | import codecs 1412 | words = {} 1413 | with codecs.open(filename, 'r', encoding = encoding) as fp: 1414 | text = [] 1415 | word = None 1416 | for line in fp: 1417 | line = line.rstrip('\r\n') 1418 | if word is None: 1419 | if line == '': 1420 | continue 1421 | else: 1422 | word = line.strip() 1423 | elif line.strip() != '': 1424 | text.append(line) 1425 | else: 1426 | words[word] = '\n'.join(text) 1427 | word = None 1428 | text = [] 1429 | return words 1430 | 1431 | # 直接生成 .mdx文件,需要 writemdict 支持: 1432 | # https://github.com/skywind3000/writemdict 1433 | def export_mdx (self, wordmap, outname, title, desc = None): 1434 | try: 1435 | import writemdict 1436 | except ImportError: 1437 | print('ERROR: can\'t import writemdict module, please install it:') 1438 | print('https://github.com/skywind3000/writemdict') 1439 | sys.exit(1) 1440 | if desc is None: 1441 | desc = u'Create by stardict.py' 1442 | writer = writemdict.MDictWriter(wordmap, title = title, 1443 | description = desc) 1444 | with open(outname, 'wb') as fp: 1445 | writer.write(fp) 1446 | return True 1447 | 1448 | # 读取 .mdx 文件,需要 readmdict 支持: 1449 | # https://github.com/skywind3000/writemdict (包含readmdict) 1450 | def read_mdx (self, mdxname, mdd = False): 1451 | try: 1452 | import readmdict 1453 | except ImportError: 1454 | print('ERROR: can\'t import readmdict module, please install it:') 1455 | print('https://github.com/skywind3000/writemdict') 1456 | sys.exit(1) 1457 | words = {} 1458 | if not mdd: 1459 | mdx = readmdict.MDX(mdxname) 1460 | else: 1461 | mdx = readmdict.MDD(mdxname) 1462 | for key, value in mdx.items(): 1463 | key = key.decode('utf-8', 'ignore') 1464 | if not mdd: 1465 | words[key] = value.decode('utf-8', 'ignore') 1466 | else: 1467 | words[key] = value 1468 | return words 1469 | 1470 | # 导出词形变换字符串 1471 | def exchange_dumps (self, obj): 1472 | part = [] 1473 | if not obj: 1474 | return None 1475 | for k, v in obj.items(): 1476 | k = k.replace('/', '').replace(':', '').strip() 1477 | 
v = v.replace('/', '').replace(':', '').strip() 1478 | part.append(k + ':' + v) 1479 | return '/'.join(part) 1480 | 1481 | # 读取词形变换字符串 1482 | def exchange_loads (self, exchg): 1483 | if not exchg: 1484 | return None 1485 | obj = {} 1486 | for text in exchg.split('/'): 1487 | pos = text.find(':') 1488 | if pos < 0: 1489 | continue 1490 | k = text[:pos].strip() 1491 | v = text[pos + 1:].strip() 1492 | obj[k] = v 1493 | return obj 1494 | 1495 | def pos_loads (self, pos): 1496 | return self.exchange_loads(pos) 1497 | 1498 | def pos_dumps (self, obj): 1499 | return self.exchange_dumps(obj) 1500 | 1501 | # 返回词性 1502 | def pos_detect (self, word, pos): 1503 | word = word.lower() 1504 | if pos == 'a': 1505 | if word in ('a', 'the',): 1506 | return (u'冠词', 'art.') 1507 | if word in ('no', 'every'): 1508 | return (u'形容词', 'adj.') 1509 | return (u'代词', 'pron.') 1510 | if pos in self._pos: 1511 | return self._pos[pos] 1512 | return (u'未知', 'unknow') 1513 | 1514 | # 返回词形比例 1515 | def pos_extract (self, data): 1516 | if 'pos' not in data: 1517 | return None 1518 | position = data['pos'] 1519 | if not position: 1520 | return None 1521 | part = self.pos_loads(position) 1522 | result = [] 1523 | for x in part: 1524 | result.append((x, part[x])) 1525 | result.sort(reverse = True, key = lambda t: int(t[1])) 1526 | final = [] 1527 | for pos, num in result: 1528 | mode = self.pos_detect(data['word'], pos) 1529 | final.append((mode, num)) 1530 | return final 1531 | 1532 | # 设置详细内容,None代表删除 1533 | def set_detail (self, dictionary, word, item, value, create = False): 1534 | data = dictionary.query(word) 1535 | if data is None: 1536 | if not create: 1537 | return False 1538 | dictionary.register(word, {}, False) 1539 | data = {} 1540 | detail = data.get('detail') 1541 | if not detail: 1542 | detail = {} 1543 | if value is not None: 1544 | detail[item] = value 1545 | elif item in detail: 1546 | del detail[item] 1547 | if not detail: 1548 | detail = None 1549 | dictionary.update(word, {'detail': detail}, False) 1550 | return True 1551 | 1552 | # 取得详细内容 1553 | def get_detail (self, dictionary, word, item): 1554 | data = dictionary.query(word) 1555 | if not data: 1556 | return None 1557 | detail = data.get('detail') 1558 | if not detail: 1559 | return None 1560 | return detail.get(item, None) 1561 | 1562 | # load file and guess encoding 1563 | def load_text (self, filename, encoding = None): 1564 | content = None 1565 | try: 1566 | content = open(filename, 'rb').read() 1567 | except: 1568 | return None 1569 | if content[:3] == b'\xef\xbb\xbf': 1570 | text = content[3:].decode('utf-8') 1571 | elif encoding is not None: 1572 | text = content.decode(encoding, 'ignore') 1573 | else: 1574 | text = None 1575 | guess = [sys.getdefaultencoding(), 'utf-8'] 1576 | if sys.stdout and sys.stdout.encoding: 1577 | guess.append(sys.stdout.encoding) 1578 | for name in guess + ['gbk', 'ascii', 'latin1']: 1579 | try: 1580 | text = content.decode(name) 1581 | break 1582 | except: 1583 | pass 1584 | if text is None: 1585 | text = content.decode('utf-8', 'ignore') 1586 | return text 1587 | 1588 | # csv 读取,自动检测编码 1589 | def csv_load (self, filename, encoding = None): 1590 | text = self.load_text(filename, encoding) 1591 | if not text: 1592 | return None 1593 | import csv 1594 | if sys.version_info[0] < 3: 1595 | import cStringIO 1596 | sio = cStringIO.StringIO(text.encode('utf-8', 'ignore')) 1597 | else: 1598 | import io 1599 | sio = io.StringIO(text) 1600 | reader = csv.reader(sio) 1601 | output = [] 1602 | if sys.version_info[0] < 3: 1603 
| for row in reader: 1604 | output.append([ n.decode('utf-8', 'ignore') for n in row ]) 1605 | else: 1606 | for row in reader: 1607 | output.append(row) 1608 | return output 1609 | 1610 | # csv保存,可以指定编码 1611 | def csv_save (self, filename, rows, encoding = 'utf-8'): 1612 | import csv 1613 | ispy2 = (sys.version_info[0] < 3) 1614 | if not encoding: 1615 | encoding = 'utf-8' 1616 | if sys.version_info[0] < 3: 1617 | fp = open(filename, 'wb') 1618 | writer = csv.writer(fp) 1619 | else: 1620 | fp = open(filename, 'w', encoding = encoding, newline = '') 1621 | writer = csv.writer(fp) 1622 | for row in rows: 1623 | newrow = [] 1624 | for n in row: 1625 | if isinstance(n, int) or isinstance(n, long): 1626 | n = str(n) 1627 | elif isinstance(n, float): 1628 | n = str(n) 1629 | elif not isinstance(n, bytes): 1630 | if (n is not None) and ispy2: 1631 | n = n.encode(encoding, 'ignore') 1632 | newrow.append(n) 1633 | writer.writerow(newrow) 1634 | fp.close() 1635 | return True 1636 | 1637 | # 加载 tab 分割的 txt 文件, 返回 key, value 1638 | def tab_txt_load (self, filename, encoding = None): 1639 | words = {} 1640 | content = self.load_text(filename, encoding) 1641 | if content is None: 1642 | return None 1643 | for line in content.split('\n'): 1644 | line = line.strip('\r\n\t ') 1645 | if not line: 1646 | continue 1647 | p1 = line.find('\t') 1648 | if p1 < 0: 1649 | continue 1650 | word = line[:p1].rstrip('\r\n\t ') 1651 | text = line[p1:].lstrip('\r\n\t ') 1652 | text = text.replace('\\n', '\n').replace('\\r', '\r') 1653 | words[word] = text.replace('\\t', '\t').replace('\\\\', '\\') 1654 | return words 1655 | 1656 | # 保存 tab 分割的 txt文件 1657 | def tab_txt_save (self, filename, words, encoding = 'utf-8'): 1658 | with codecs.open(filename, 'w', encoding = encoding) as fp: 1659 | for word in words: 1660 | text = words[word] 1661 | text = text.replace('\\', '\\\\').replace('\n', '\\n') 1662 | text = text.replace('\r', '\\r').replace('\t', '\\t') 1663 | fp.write('%s\t%s\r\n'%(word, text)) 1664 | return True 1665 | 1666 | # Tab 分割的 txt文件释义导入 1667 | def tab_txt_import (self, dictionary, filename): 1668 | words = self.tab_txt_load(filename) 1669 | if not words: 1670 | return False 1671 | pc = self.progress(len(words)) 1672 | for word in words: 1673 | data = dictionary.query(word) 1674 | if not data: 1675 | dictionary.register(word, {'translation':words[word]}, False) 1676 | else: 1677 | dictionary.update(word, {'translation':words[word]}, False) 1678 | pc.inc(0) 1679 | pc.next() 1680 | dictionary.commit() 1681 | pc.done() 1682 | return True 1683 | 1684 | # mdx-builder 使用writemdict代替MdxBuilder处理较大词典(需64为python) 1685 | def mdx_build (self, srcname, outname, title, desc = None): 1686 | print('loading %s'%srcname) 1687 | t = time.time() 1688 | words = self.import_mdict(srcname) 1689 | t = time.time() - t 1690 | print(u'%d records loaded in %.3f seconds'%(len(words), t)) 1691 | print(u'building %s'%outname) 1692 | t = time.time() 1693 | self.export_mdx(words, outname, title, desc) 1694 | t = time.time() - t 1695 | print(u'complete in %.3f seconds'%t) 1696 | return True 1697 | 1698 | # 验证单词合法性 1699 | def validate_word (self, word, asc128): 1700 | alpha = 0 1701 | for ch in word: 1702 | if ch.isalpha(): 1703 | alpha += 1 1704 | if ord(ch) >= 128 and asc128: 1705 | return False 1706 | elif (not ch.isalpha()) and (not ch.isdigit()): 1707 | if ch not in ('-', '\'', '/', '(', ')', ' ', ',', '.'): 1708 | if ch not in ('&', '!', '?', '_'): 1709 | if len(word) == 5 and word[2] == ';': 1710 | continue 1711 | if not ord(ch) in (239, 
65292): 1712 | # print 'f1', ord(ch), word.find(ch) 1713 | return False 1714 | if alpha == 0: 1715 | if not word.isdigit(): 1716 | return False 1717 | if word[:1] == '"' and word[-1:] == '"': 1718 | return False 1719 | if word[:1] == '(' and word[-1:] == ')': 1720 | if word.count('(') == 1: 1721 | return False 1722 | if word[:3] == '(-)': 1723 | return False 1724 | for ch in ('<', '>', '%', '*', '@', '`'): 1725 | if ch in word: 1726 | return False 1727 | if '%' in word or '\\' in word or '`' in word: 1728 | return False 1729 | if word[:1] in ('$', '@'): 1730 | return False 1731 | if len(word) == 1: 1732 | x = ord(word) 1733 | if (x < ord('a')) or (x > ord('z')): 1734 | if (x < ord('A')) or (x > ord('Z')): 1735 | return False 1736 | if (' ' not in word) and ('-' not in word): 1737 | if ('?' in word) or ('!' in word): 1738 | return False 1739 | if word.count('?') >= 2: 1740 | return False 1741 | if word.count('!') >= 2: 1742 | return False 1743 | if '---' in word: 1744 | return False 1745 | try: 1746 | word.lower() 1747 | except UnicodeWarning: 1748 | return False 1749 | return True 1750 | 1751 | 1752 | #---------------------------------------------------------------------- 1753 | # Helper instance 1754 | #---------------------------------------------------------------------- 1755 | tools = DictHelper() 1756 | 1757 | # 根据文件名自动判断数据库类型并打开 1758 | def open_dict(filename): 1759 | if isinstance(filename, dict): 1760 | return DictMySQL(filename) 1761 | if filename[:8] == 'mysql://': 1762 | return DictMySQL(filename) 1763 | if os.path.splitext(filename)[-1].lower() in ('.csv', '.txt'): 1764 | return DictCsv(filename) 1765 | return StarDict(filename) 1766 | 1767 | 1768 | # 字典转化,csv sqlite之间互转 1769 | def convert_dict(dstname, srcname): 1770 | dst = open_dict(dstname) 1771 | src = open_dict(srcname) 1772 | dst.delete_all() 1773 | pc = tools.progress(len(src)) 1774 | for word in src.dumps(): 1775 | pc.next() 1776 | data = src[word] 1777 | x = data['oxford'] 1778 | if isinstance(x, int) or isinstance(x, long): 1779 | if x <= 0: 1780 | data['oxford'] = None 1781 | elif isinstance(x, str) or isinstance(x, unicode): 1782 | if x == '' or x == '0': 1783 | data['oxford'] = None 1784 | x = data['collins'] 1785 | if isinstance(x, int) or isinstance(x, long): 1786 | if x <= 0: 1787 | data['collins'] = None 1788 | elif isinstance(x, str) or isinstance(x, unicode): 1789 | if x in ('', '0'): 1790 | data['collins'] = None 1791 | dst.register(word, data, False) 1792 | dst.commit() 1793 | pc.done() 1794 | return True 1795 | 1796 | 1797 | # 从 ~/.local/share/stardict 下面打开词典 1798 | def open_local(filename): 1799 | base = os.path.expanduser('~/.local') 1800 | for dir in [base, base + '/share', base + '/share/stardict']: 1801 | if not os.path.exists(dir): 1802 | os.mkdir(dir) 1803 | fn = os.path.join(base + '/share/stardict', filename) 1804 | return open_dict(fn) 1805 | 1806 | 1807 | 1808 | 1809 | #---------------------------------------------------------------------- 1810 | # testing 1811 | #---------------------------------------------------------------------- 1812 | if __name__ == '__main__': 1813 | db = os.path.join(os.path.dirname(__file__), 'test.db') 1814 | my = {'host':'??', 'user':'skywind', 'passwd':'??', 'db':'skywind_t1'} 1815 | def test1(): 1816 | t = time.time() 1817 | sd = StarDict(db, False) 1818 | print(time.time() - t) 1819 | # sd.delete_all(True) 1820 | print(sd.register('kiss2', {'definition':'kiss me'}, False)) 1821 | print(sd.register('kiss here', {'definition':'kiss me'}, False)) 1822 | 
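# Note: the `word` column is UNIQUE with COLLATE NOCASE, so the
# register('Kiss', ...) below succeeds while the subsequent
# register('kiss', ...) hits an IntegrityError and returns False --
# which is why query('kiSs') afterwards returns the 'BIG KISS' row.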
print(sd.register('Kiss', {'definition':'BIG KISS'}, False)) 1823 | print(sd.register('kiss', {'definition':'kiss me'}, False)) 1824 | print(sd.register('suck', {'definition':'suck me'}, False)) 1825 | print(sd.register('give', {'definition':'give me', 'detail':[1,2,3]}, False)) 1826 | sd.commit() 1827 | print('') 1828 | print(sd.count()) 1829 | print(sd.query('kiSs')) 1830 | print(sd.query(2)) 1831 | print(sd.match('kis', 10)) 1832 | print('') 1833 | print(sd.query_batch(['give', 2])) 1834 | print(sd.match('kisshere', 10, True)) 1835 | return 0 1836 | def test2(): 1837 | t = time.time() 1838 | dm = DictMySQL(my, init = True) 1839 | print(time.time() - t) 1840 | # dm.delete_all(True) 1841 | print(dm.register('kiss2', {'definition':'kiss me'}, False)) 1842 | print(dm.register('kiss here', {'definition':'kiss me'}, False)) 1843 | print(dm.register('Kiss', {'definition':'kiss me'}, False)) 1844 | print(dm.register('kiss', {'definition':'BIG KISS'}, False)) 1845 | print(dm.register('suck', {'definition':'suck me'}, False)) 1846 | print(dm.register('give', {'definition':'give me'}, False)) 1847 | print(dm.query('kiss')) 1848 | print(dm.match('kis')) 1849 | print('') 1850 | print(dm.query('KiSs')) 1851 | print(dm.query_batch(['give', 2, 9])) 1852 | print('count: %d'%len(dm)) 1853 | print(dm.match('kisshere', 10, True)) 1854 | return 0 1855 | def test3(): 1856 | csvname = os.path.join(os.path.dirname(__file__), 'test.csv') 1857 | dc = DictCsv(csvname) 1858 | dc.delete_all() 1859 | print(dc.register('kiss2', {'definition':'kiss me'}, False)) 1860 | print(dc.register('kiss here', {'definition':'kiss me'}, False)) 1861 | print(dc.register('Kiss', {'definition':'kiss me'}, False)) 1862 | print(dc.register('kiss', {'definition':'kiss me'}, False)) 1863 | print(dc.register('suck', {'definition':'suck me'}, False)) 1864 | print(dc.register('word', {'definition':'WORD WORD'}, False)) 1865 | print(dc.query('kiss')) 1866 | print('') 1867 | dc.remove('kiss2') 1868 | print(dc.match('kis')) 1869 | print(dc.match('kisshere', 10, True)) 1870 | dc.commit() 1871 | return 0 1872 | def test4(): 1873 | lemma = LemmaDB() 1874 | t = time.time() 1875 | lemma.load('lemma.en.txt') 1876 | print('load in %s seconds'%str(time.time() - t)) 1877 | print(len(lemma)) 1878 | for word in ('be', 'give', 'see', 'take'): 1879 | print('%s -> %s'%(word, ','.join(lemma.get(word)))) 1880 | for word in ('gave', 'taken', 'looked', 'teeth', 'speak'): 1881 | print('%s <- %s'%(word, ','.join(lemma.word_stem(word)))) 1882 | lemma.save('output.txt') 1883 | return 0 1884 | def test5(): 1885 | print(tools.validate_word('Hello World', False)) 1886 | test3() 1887 | 1888 | 1889 | 1890 | --------------------------------------------------------------------------------
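Taken together, `open_dict` and `convert_dict` give a one-line migration path between the csv and sqlite backends: the target format is picked purely from the file extension. A minimal sketch, assuming the repository layout above and an `ecdict.csv` in the working directory (both file names illustrative):

    from anki_packager.dict.stardict import convert_dict, open_dict

    # build a sqlite dictionary from the csv source; '.db' selects StarDict
    convert_dict('ecdict.db', 'ecdict.csv')

    db = open_dict('ecdict.db')   # returns a StarDict instance
    print(len(db))                # number of entries
    print(db['apple'])            # full record dict, or None if missing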