3 | RPGMaker_LLM_Translator 4 |

├── Mtool
    ├── requirements.txt
    ├── main.py
    └── main_dev.py
├── .gitignore
├── Translator++
    ├── pic
    │   ├── 1.png
    │   ├── 2.png
    │   └── 3.png
    ├── manual.py
    ├── manual2.py
    ├── 根据路径添加黄绿标签.js
    ├── 绿色标签添加路径翻译.js
    ├── README.md
    ├── api.py
    └── llm.py
└── README.md


/Mtool/requirements.txt:
--------------------------------------------------------------------------------
1 | pip
2 | wheel
3 | setuptools
4 | requests
5 | tqdm
6 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | __pycache__
2 | *.gguf
3 | *.log
4 | ManualTransFile.json
5 | TranslatedFile.json
6 | TransFile/


--------------------------------------------------------------------------------
/Translator++/pic/1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fkiliver/RPGMaker_LLM_Translator/HEAD/Translator++/pic/1.png


--------------------------------------------------------------------------------
/Translator++/pic/2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fkiliver/RPGMaker_LLM_Translator/HEAD/Translator++/pic/2.png


--------------------------------------------------------------------------------
/Translator++/pic/3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fkiliver/RPGMaker_LLM_Translator/HEAD/Translator++/pic/3.png


--------------------------------------------------------------------------------
/Translator++/manual.py:
--------------------------------------------------------------------------------
 1 | # 一个手动运行的脚本，用于将 MTool 导出的 ManualTransFile.json 进行批量翻译
 2 | 
 3 | from concurrent.futures import ThreadPoolExecutor
 4 | from llm import LLM, translate
 5 | from itertools import repeat
 6 | from tqdm import tqdm
 7 | import json
 8 | 
 9 | llm = LLM("sakura", "sakura-14b-qwen2.5-v1.0-q6k.gguf", 4, ["0", "1", "2", "3"])
10 | # 全局字典，只会将相关项传入模型
11 | global_dicts = ()
12 | 
13 | with open("ManualTransFile.json", "r", encoding="utf-8") as fp:
14 |     data = json.load(fp)
15 |     raw_texts = list(data.keys())
16 | 
17 | with ThreadPoolExecutor(4) as executor:
18 |     iterator = executor.map(translate, repeat(llm), list(data.keys()), repeat(()), repeat(()), repeat(global_dicts))
19 |     results = list(tqdm(iterator, total=len(raw_texts)))
20 | 
21 | with open("TranslatedFile.json", "w", encoding="utf-8") as fp:
22 |     json.dump(dict(zip(raw_texts, results)), fp, ensure_ascii=False, indent=4)
23 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | <div align="center">
 2 | <h1>
 3 |   RPGMaker_LLM_Translator
 4 | </h1>
 5 | </div>
 6 | 
 7 | # 介绍
 8 | 这是一个基于Mtool/Translator++和Sakura模型的RPGMaker游戏本地翻译器，能够提供高质量离线日文翻译  
 9 | 建议使用[Sakura-13B-Galgame翻译模型](https://github.com/SakuraLLM/Sakura-13B-Galgame)，当前支持版本为Sakura v0.8/v0.9/v0.10pre1/Galtransl-v2.6
10 | 
11 | 项目经过重构，支持Mtool和Translator和最新版本Sakura模型。
12 | 
13 | ## TODO
14 | - [x] 添加退化检测（仅MTool）
15 | - [x] 添加历史上文（仅MTool）
16 | - [x] 添加prompt字典（仅MTool）
17 | - [x] 添加并发
18 | - [x] 添加对Sakura v0.10支持
19 | - [x] 添加对Sakura v1.0支持
20 | - [x] 添加对Galtransl-v2.6支持
21 | 
22 | ## 快速开始
23 | 首先需要部署Sakura模型，推荐使用Galtransl模型
24 | 请参考[Sakura模型部署教程](https://github.com/SakuraLLM/SakuraLLM/wiki)
25 | 
26 | ### Mtool
27 | 部署教程：详见[本仓库wiki](https://github.com/fkiliver/RPGMaker_LLM_Translator/wiki)
28 | 
29 | ### Translator++
30 | 详见[本仓库wiki](https://github.com/fkiliver/RPGMaker_LLM_Translator/wiki)
31 | 
32 | 在Translator++上安装ChatGPT插件
33 | ![image](https://github.com/user-attachments/assets/b77fc7e6-cb04-4efc-8488-203ac74224ac)
34 | 
35 | 然后便可以开始翻译了
36 | 


--------------------------------------------------------------------------------
/Translator++/manual2.py:
--------------------------------------------------------------------------------
 1 | # 一个手动运行的脚本，用于将 AutoTranslator 导出的文件进行批量翻译
 2 | # 会自动遍历并翻译 TransFile 文件夹下的所有文件
 3 | 
 4 | from concurrent.futures import ThreadPoolExecutor
 5 | from llm import LLM, translate
 6 | from itertools import repeat
 7 | from tqdm import tqdm
 8 | import json
 9 | import os
10 | 
11 | folder = "../TransFile"
12 | llm = LLM("sakura", "sakura-14b-qwen2.5-v1.0-q6k.gguf", 4, ["0", "1", "2", "3"])
13 | # 全局字典，只会将相关项传入模型
14 | global_dicts = ()
15 | 
16 | for filename in tqdm(os.listdir(folder)):
17 |     filepath = f"{folder}/{filename}"
18 |     with open(filepath, "r", encoding="utf-8") as fp:
19 |         raw_texts = [x.split("=")[0] for x in fp.readlines()]
20 | 
21 |     with ThreadPoolExecutor(4) as executor:
22 |         iterator = executor.map(translate, repeat(llm), raw_texts, repeat(()), repeat(()), repeat(global_dicts))
23 |         results = list(tqdm(iterator, total=len(raw_texts)))
24 | 
25 |     with open(filepath, "w", encoding="utf-8") as fp:
26 |         for i in range(len(raw_texts)):
27 |             fp.write(f"{raw_texts[i]}={results[i]}\n")
28 | 


--------------------------------------------------------------------------------
/Translator++/根据路径添加黄绿标签.js:
--------------------------------------------------------------------------------
 1 | if (!Array.isArray(this.context)) {
 2 |     return;
 3 | }
 4 | const regexs = [
 5 |     /^Actors\/\d+\/note$/,
 6 |     /^Animations.*?$/,
 7 |     /^Armors\/\d+\/note$/,
 8 |     /^CommonEvents\/\d+\/name$/,
 9 |     /^CommonEvents\/\d+\/list\/\d+\/comment$/,
10 |     /^Enemies\/\d+\/note$/,
11 |     /^Items\/\d+\/note$/,
12 |     /^Map\d{3}\/events\/\d+\/(name|note)$/,
13 |     /^Mapinfos.*?$/,
14 |     /^Skills\/\d+\/note$/,
15 |     /^States\/\d+\/note$/,
16 |     /^System\/switches\/\d+$/,
17 |     /^System\/variables\/\d+$/,
18 |     /^Tilesets.*?$/,
19 |     /^Troops\/\d+\/name$/,
20 |     /^Weapons\/\d+\/note$/,
21 |     /^.*?MZ Plugin Command.*?$/,
22 |     /^.*?Control Variables.*?$/
23 | ];
24 | var count = 0;
25 | for (const context of this.context) {
26 |     for (const regex of regexs) {
27 |         if (regex.test(context)) {
28 |             count++;
29 |             break;
30 |         }
31 |     }
32 | }
33 | var index = this.tags.indexOf("yellow");
34 | if (index > -1) {
35 |     this.tags.splice(index, 1);
36 | }
37 | index = this.tags.indexOf("green");
38 | if (index > -1) {
39 |     this.tags.splice(index, 1);
40 | }
41 | if (count === this.context.length) {
42 |     this.tags.push("yellow");
43 | } else if (count > 0) {
44 |     this.tags.push("green");
45 | }
46 | 


--------------------------------------------------------------------------------
/Translator++/绿色标签添加路径翻译.js:
--------------------------------------------------------------------------------
 1 | if (!this.tags.includes("green")) {
 2 |     return;
 3 | }
 4 | if (!Array.isArray(this.context)) {
 5 |     return;
 6 | }
 7 | const regexs = [
 8 |     /^Actors\/\d+\/note$/,
 9 |     /^Animations.*?$/,
10 |     /^Armors\/\d+\/note$/,
11 |     /^CommonEvents\/\d+\/name$/,
12 |     /^CommonEvents\/\d+\/list\/\d+\/comment$/,
13 |     /^Enemies\/\d+\/note$/,
14 |     /^Items\/\d+\/note$/,
15 |     /^Map\d{3}\/events\/\d+\/(name|note)$/,
16 |     /^Mapinfos.*?$/,
17 |     /^Skills\/\d+\/note$/,
18 |     /^States\/\d+\/note$/,
19 |     /^System\/switches\/\d+$/,
20 |     /^System\/variables\/\d+$/,
21 |     /^Tilesets.*?$/,
22 |     /^Troops\/\d+\/name$/,
23 |     /^Weapons\/\d+\/note$/,
24 |     /^.*?MZ Plugin Command.*?$/,
25 |     /^.*?Control Variables.*?$/
26 | ];
27 | if (!Array.isArray(this.parameters)) {
28 |     this.parameters = []
29 |     for (let i = 0; i < this.context.length; i++) {
30 |         this.parameters.push({
31 |             contextStr: this.context[i]
32 |         });
33 |     }
34 | }
35 | for (let i = 0; i < this.context.length; i++) {
36 |     let context = this.context[i];
37 |     this.parameters[i]["translation"] = "";
38 |     for (const regex of regexs) {
39 |         if (regex.test(context)) {
40 |             this.parameters[i]["translation"] = this.cells[0];
41 |             break;
42 |         }
43 |     }
44 | }
45 | trans.project.files[this.file].parameters[this.rowId] = this.parameters
46 | 


--------------------------------------------------------------------------------
/Translator++/README.md:
--------------------------------------------------------------------------------
  1 | # Translator++工作流
  2 | 
  3 | 由于RPGMaker制作的游戏在文本细节上各不相同，在翻译了数个不同的游戏后，我总结了一套比较优秀的工作流，希望可以帮大家获得更好的翻译质量。
  4 | 
  5 | **本文内容有较高上手门槛**
  6 | 
  7 | ## Translator++设置
  8 | 
  9 | 首先是自定义控制符，在翻译时，所有符合这些内容的文本都会被替换为`$dat[1]`这样的格式。由于各个游戏的控制符格式不同，官方默认的这些可能有未覆盖到的，需要单独处理。如图所示。
 10 | 
 11 | ![](pic/1.png)
 12 | 
 13 | 以下是一些遇到过的情况，可以根据实际情况决定是否采用：
 14 | 
 15 | - 在每一个正则表达式后添加`\d*`，这样可以将控制符后的数字也包含进去，避免`\C[1]1000`被后端替换为`控制符11000`。
 16 | - 删除第四行的`\!`，这个不关键。
 17 | 
 18 | 或者，你也可以直接将2至4行整体替换为
 19 | 
 20 | ```re
 21 | /(\\[a-zA-Z0-9]+(?:\[.*?\]|<.*?\>)\d*|\\[a-zA-Z\{\}\\\$\.\|<\>\^]\d*)+/gi
 22 | ```
 23 | 
 24 | 如果想得到更好的效果，我建议将更复杂的逻辑和提示词拼接工作放到Python后端处理。所以在OpenAI ChatGPT插件设置中，我建议清空**System Message Template**，将**Body Message Template**设置为仅包含`${SOURCE_TEXT}`，如图所示。
 25 | 
 26 | ![](pic/2.png)
 27 | 
 28 | 还有一些其它设置，例如如果想使用这个文件夹中的api，还需要将**Target URL**设置为`http://127.0.0.1:1500/v1/chat/completions`, **Batch Delay**设置为1，**Max Characters per Batch**设置为65536，**Max row per concurrent requests**尽量调大。
 29 | 
 30 | ## 为特定路径的文本打标签
 31 | 
 32 | MTools翻译的一个缺点就是会把所有字符串都翻译了，而Translator++也会读取很多无意义的字符串。翻译这些字符串不仅耗时，而且可能会破坏一些游戏逻辑。可以右键行，通过**Row Properties**查看字符串的路径，如图所示。
 33 | 
 34 | ![](pic/3.png)
 35 | 
 36 | Translator++拥有js脚本执行功能，选中需要执行脚本的文件，右键，在**With XX Selected -> Run Automation -> For Each Row**执行脚本。
 37 | 
 38 | 更多执行细节，请参考[官方文档](https://dreamsavior.net/docs/translator/execute-script/pin-your-automation-to-quickly-launch-from-translator/)。
 39 | 
 40 | 我推荐首先使用[脚本1](根据路径添加黄绿标签.js)对每行打标签，黄色为所有上下文路径都不需要翻译，绿色为仅有部分上下文路径需要翻译。
 41 | 
 42 | 注意，由于每个游戏的差异，没有一劳永逸的正则表达式列表。为了提高翻译质量，建议开始翻以前人工浏览一遍，增减需要的正则表达式。
 43 | 
 44 | ## 开始翻译
 45 | 
 46 | 翻译的时候，红色和蓝色标签是Translator++加上的，记得和黄色的标签一起加入**黑名单**，这些行都不处理。
 47 | 
 48 | ## Python后端
 49 | 
 50 | 虽然重复造轮子不是好行为，但是一个简单的Python后端就可以做到很多事情，还是值得简单造一个轮子的。
 51 | 
 52 | [llm.py](llm.py) 和 [api.py](api.py) 这两个文件实现了一些简单的功能，文件注释写的比较详细，这里就不再赘述代码细节，只简单介绍。
 53 | 
 54 | ### 使用方式
 55 | 
 56 | 库依赖不多，主要就需要安装一个 [llama-cpp-python](https://llama-cpp-python.readthedocs.io/en/latest/) 和一个 FastAPI。
 57 | 
 58 | 在修改了 [api.py](api.py) 的一些参数之后，只需要简单 `python api.py` 即可启动。
 59 | 
 60 | ```py
 61 | port = 1500
 62 | logging.basicConfig(filename="log.log")
 63 | history_deque = deque(maxlen=3)
 64 | llm = LLM("galtransl", "Sakura-GalTransl-7B-v3-Q5_K_S.gguf", 8, ["0", "1", "2", "3", "0", "1", "2", "3"])
 65 | app = FastAPI()
 66 | dicts = [
 67 |     {"src": "控制符", "dst": "控制符"}
 68 | ]
 69 | ```
 70 | 
 71 | port为服务启动的端口号。
 72 | 
 73 | basicConfig可以设置日志文件名，日志会记录控制符和行数翻译前后不一致的部分，供人工更正。
 74 | 
 75 | history_deque控制最大提供给LLM的上文数量。
 76 | 
 77 | LLM的参数都有接口说明，值得一提的是工作进程数和CUDA列表：
 78 | 
 79 | - 如果显存足够，建议一张卡上跑两个工作进程，可以吃满显卡算力，不推荐更多。
 80 | - 如果有多张卡，可以每张卡上都跑单独的工作进程，这个配置是4张4090的参考配置。
 81 | - 这边的工作进程越多，Translator++就应该设置越大的**Max row per concurrent requests**，以减少上下文切换的损耗。
 82 | 
 83 | app一般不用修改。
 84 | 
 85 | dicts是提供给模型的字典，如果要使用这个后端，至少保留控制符这个说明。
 86 | 
 87 | 如果不想深究，下面的小节可以跳过，直接看结束翻译段落即可。
 88 | 
 89 | ### 控制符格式
 90 | 
 91 | 代码中有一个处理，就是将Translator++的`${dat[1]}`这样的控制符全部替换为`控制符1`这样的文本，翻译完之后再替换回去。有什么用呢？请看例子：
 92 | 
 93 | > 味方単体に１ターン『${dat[1]}無敵』を付与
 94 | 
 95 | 这段文本，如果直接让LLM翻译，很可能会丢失掉`${dat[1]}`这样的控制符，或者是插入在错误的位置。我也试过将前后分别翻译再拼接，反而会丢失上下文。这个问题卡了我很久，一度想让我去再训练一个可以处理控制符的模型。某一天我观察到LLM会倾向于原样输出中文文本，这给了我灵感，如果将控制符改成中文：
 96 | 
 97 | > 味方単体に１ターン『控制符1無敵』を付与
 98 | 
 99 | 它就会翻译出正常的结果，并且把控制符放在合适的位置。哪怕是这种多控制符的文本：
100 | 
101 | > 控制符1敵全体にダメージを与え『控制符2心傷』『控制符3心弱』状態にする。
102 | 
103 | 经过测试也可以正确翻译并处理控制符的位置。
104 | 
105 | ### SG说明格式
106 | 
107 | 代码中还有对`<SGXX:XX>`格式的说明的处理，例如：
108 | 
109 | > <SG説明:生徒達に命令する事で、
110 | > 生徒達はＣＰを増やしたりします。
111 | > 増やしたＣＰは、スキルツリー呪力領域の開放や、
112 | > アイテム合成に使えます。>
113 | > <SG共通説明:自由行動の説明です>
114 | > <SGカテゴリ:\I[247]行動パート>
115 | 
116 | 这个里面的key是不能翻译的，而value是需要翻译的，所以代码对其进行了简单的提取处理。
117 | 
118 | ## 结束翻译
119 | 
120 | 翻译完成后，记得将日志中记录的错误进行简单的人工修正。
121 | 
122 | 然后使用[脚本2](绿色标签添加路径翻译.js)将绿色标签的上下文翻译自动设置完。
123 | 
124 | 最后就可以直接注入翻译开始游戏。
125 | 


--------------------------------------------------------------------------------
/Translator++/api.py:
--------------------------------------------------------------------------------
  1 | from collections import Counter, deque
  2 | from concurrent.futures import ThreadPoolExecutor
  3 | from fastapi import FastAPI, Request
  4 | from llm import LLM, translate
  5 | import logging
  6 | import uvicorn
  7 | import json
  8 | import re
  9 | 
 10 | port = 1500
 11 | logging.basicConfig(filename="log.log")
 12 | history_deque = deque(maxlen=3)
 13 | llm = LLM("galtransl", "Sakura-GalTransl-7B-v3-Q5_K_S.gguf", 8, ["0", "1", "2", "3", "0", "1", "2", "3"])
 14 | app = FastAPI()
 15 | # 全局字典，只会将相关项传入模型
 16 | global_dicts = [
 17 |     {"src": "原文", "dst": "译文", "info": "说明（可选）"}
 18 | ]
 19 | 
 20 | def text_translate(text: str, history: tuple[str]) -> str:
 21 |     """预处理文本并执行翻译
 22 |     
 23 |     Args:
 24 |         text (str): 可能包含`${dat[数字]}`格式控制符的文本
 25 |         history (tuple[str]): 历史翻译上下文（需传入可哈希的tuple）
 26 |         
 27 |     Returns:
 28 |         str: 翻译后的文本
 29 |         
 30 |     Note:
 31 |         1. 自动转换 ${dat[1]} ↔ 控制符1 的格式
 32 |         2. 校验翻译前后控制符数量和行数是否一致，最多重试10次
 33 |         3. 超过最多重试次数时会记录警告日志
 34 |     """
 35 |     pattern1 = r"\$\{dat\[(\d+)\]\}"
 36 |     pattern2 = r"控制符(\d+)"
 37 | 
 38 |     # 重试时控制符会继续向后标号，以提供不同的原文来提高成功率
 39 |     counter = 0
 40 |     def replace_to_chinese(match):
 41 |         nonlocal counter
 42 |         counter += 1
 43 |         placeholder = "控制符" + str(counter)
 44 |         dat_mapping[placeholder] = match.group(0)
 45 |         return placeholder
 46 |     
 47 |     def replace_back_to_dat(match):
 48 |         placeholder = match.group(0)
 49 |         return dat_mapping.get(placeholder, placeholder)
 50 | 
 51 |     retry = True
 52 |     retry_counter = 0
 53 |     while retry and retry_counter < 10:
 54 |         dat_mapping = {}
 55 |         retry = False
 56 |         retry_counter += 1
 57 | 
 58 |         before = Counter(re.findall(pattern1, text))
 59 |         line_num = len(text.splitlines())
 60 |         result = re.sub(pattern1, replace_to_chinese, text)
 61 |         dat_dicts = ({"src": key, "dst": key} for key in dat_mapping.keys())
 62 |         result = translate(llm, result, history, dat_dicts, global_dicts)
 63 |         result = re.sub(pattern2, replace_back_to_dat, result)
 64 |         after = Counter(re.findall(pattern1, result))
 65 | 
 66 |         if before != after:
 67 |             # logging.warning(f"{before} != {after}\n{text}\n{result}")
 68 |             retry = True
 69 |         elif line_num != len(result.splitlines()):
 70 |             # logging.warning(f"line_num mismatch\n{text}\n{result}")
 71 |             retry = True
 72 |     if retry:
 73 |         logging.warning(f"stop retry after {retry_counter} attempts\n{text}\n{result}")
 74 |     # elif retry_counter > 1:
 75 |     #     logging.warning(f"get correct translation after {retry_counter} attempts\n{text}\n{result}")
 76 | 
 77 |     return result
 78 | 
 79 | def data_translate(data: str, history: tuple[str]) -> str:
 80 |     """处理包含<SG标签>的复合数据翻译
 81 |     
 82 |     Args:
 83 |         data (str): 可能包含<SG...>标签的文本
 84 |         history (tuple[str]): 历史翻译上下文（需传入可哈希的tuple）
 85 |         
 86 |     Returns:
 87 |         str: 翻译后的完整文本
 88 |         
 89 |     Note:
 90 |         1. 优先提取<SG...:内容>结构进行分段翻译
 91 |         2. 无标签时直接调用text_translate
 92 |         3. 保持原标签结构不变只翻译内容部分
 93 |     """
 94 |     pattern = r"<SG.*?>"
 95 |     finds = re.findall(pattern, data, re.DOTALL)
 96 |     if len(finds) > 0:
 97 |         for raw in finds:
 98 |             index = raw.find(":")
 99 |             if index == -1:
100 |                 continue
101 |             text = raw[index + 1 : -1]
102 |             text = text_translate(text, history)
103 |             data = data.replace(raw, f"{raw[:index]}:{text}>")
104 |     else:
105 |         data = text_translate(data, history)
106 |     return data
107 | 
108 | @app.post("/v1/chat/completions")
109 | async def read_item(request: Request):
110 |     """批量翻译API端点（POST方法）
111 |     
112 |     Args:
113 |         request (Request): FastAPI请求对象，需包含：
114 |         {
115 |             "messages": [{
116 |                 "role": "user",
117 |                 "content": "[\"text1\", \"text2\"]"  # JSON字符串数组
118 |             }]
119 |         }
120 |         
121 |     Returns:
122 |         dict: 格式化的响应数据：
123 |         {
124 |             "choices": [{
125 |                 "message": {
126 |                     "content": "[\"trans1\", \"trans2\"]"  # JSON字符串数组
127 |                 }
128 |             }]
129 |         }
130 |         
131 |     Note:
132 |         1. 使用ThreadPoolExecutor实现多文本并发翻译
133 |         2. 维护全局history_deque保存最近3条历史记录
134 |         3. 每个文本会附带其之前3条文本作为上文
135 |     """
136 |     data = await request.json()
137 |     data = data["messages"][0]["content"]
138 |     data = json.loads(data)
139 |     history = []
140 |     for d in data:
141 |         history.append(tuple(history_deque))
142 |         history_deque.append(d)
143 |     with ThreadPoolExecutor(len(data)) as executor:
144 |         data = executor.map(data_translate, data, history)
145 |     return {"choices": [{"message": {"content": json.dumps(list(data))}}]}
146 | 
147 | @app.get("/")
148 | def read_item(text: str):
149 |     """单条文本翻译API端点（GET方法）
150 |     
151 |     Args:
152 |         text (str): 通过URL参数传递的待翻译文本
153 |         
154 |     Returns:
155 |         str: 直接返回翻译结果字符串
156 |     """
157 |     result = translate(llm, text, (), (), ())
158 |     return result
159 | 
160 | if __name__ == '__main__':
161 |     uvicorn.run(app, port=port)
162 | 


--------------------------------------------------------------------------------
/Translator++/llm.py:
--------------------------------------------------------------------------------
  1 | from llama_cpp import Llama
  2 | from multiprocessing import Pool
  3 | from functools import lru_cache
  4 | import os
  5 | 
  6 | def contains_japanese(text):
  7 |     """检查文本是否包含日文片假名
  8 |     
  9 |     Args:
 10 |         text (str): 待检测的文本
 11 |         
 12 |     Returns:
 13 |         bool: 如果文本中包含日文片假名（Unicode范围3040-30FF）返回True，否则返回False
 14 |     """
 15 |     for char in text:
 16 |         if "\u3040" <= char <= "\u30FF":
 17 |             return True
 18 |     return False
 19 | 
 20 | def _init_worker(model_path: str, cuda_device: str):
 21 |     """
 22 |     初始化工作进程的LLM模型
 23 | 
 24 |     Args:
 25 |         model_path (str): 模型文件路径
 26 |         cuda_device (str): 指定使用的CUDA设备ID
 27 |     """
 28 |     global worker_model
 29 |     print(f"PID: {os.getpid()} CUDA: {cuda_device}")
 30 |     os.environ["CUDA_VISIBLE_DEVICES"] = cuda_device
 31 |     worker_model = Llama(model_path, n_gpu_layers=-1, n_ctx=2048, verbose=False)
 32 | 
 33 | def _get_glossary(gpt_dicts: list[dict]) -> str:
 34 |     """
 35 |     将术语字典列表格式化为字符串
 36 | 
 37 |     Args:
 38 |         gpt_dicts (list[dict]): 术语字典列表，每个字典应包含:
 39 |             - src: 源语言术语
 40 |             - dst: 目标语言翻译
 41 |             - info(可选): 附加信息
 42 | 
 43 |     Returns:
 44 |         str: 格式化后的术语表字符串，每行格式为"src->dst #info"或"src->dst"
 45 | 
 46 |     Example:
 47 |         >>> _get_glossary([{"src": "猫", "dst": "cat", "info": "动物"}])
 48 |         >>> '猫->cat #动物\\n'
 49 |     """
 50 |     glossary = ""
 51 |     for gpt in gpt_dicts:
 52 |         if "info" in gpt.keys():
 53 |             glossary += "{}->{} #{}\n".format(gpt["src"], gpt["dst"], gpt["info"])
 54 |         else:
 55 |             glossary += "{}->{}\n".format(gpt["src"], gpt["dst"])
 56 |     return glossary
 57 | 
 58 | def _process_translate(model_name: str, text: str, history: list[dict] = [], gpt_dicts: list[dict] = []) -> str:
 59 |     """
 60 |     执行单条文本的翻译
 61 | 
 62 |     Args:
 63 |         model_name (str): 模型名称，支持"sakura"或"galtransl"
 64 |         text (str): 待翻译的日文文本
 65 |         history (list[dict], optional): 对话历史记录
 66 |         gpt_dicts (list[dict], optional): 术语字典列表
 67 | 
 68 |     Returns:
 69 |         str: 翻译后的中文文本
 70 |     """
 71 |     messages = []
 72 |     if model_name == "sakura":
 73 |         messages.append({"role": "system", "content": "你是一个轻小说翻译模型，可以流畅通顺地以日本轻小说的风格将日文翻译成简体中文，并联系上下文正确使用人称代词，不擅自添加原文中没有的代词。"})
 74 |         for item in history:
 75 |             messages.append({"role": "assistant", "content": item})
 76 |         if len(gpt_dicts) == 0:
 77 |             user_prompt = "将下面的日文文本翻译成中文：" + text
 78 |         else:
 79 |             user_prompt = "根据以下术语表（可以为空）：\n"
 80 |             user_prompt += _get_glossary(gpt_dicts)
 81 |             user_prompt += "将下面的日文文本根据对应关系和备注翻译成中文：" + text
 82 |     
 83 |     elif model_name == "galtransl":
 84 |         messages.append({"role": "system", "content": "你是一个视觉小说翻译模型，可以通顺地使用给定的术语表以指定的风格将日文翻译成简体中文，并联系上下文正确使用人称代词，注意"})
 85 |         user_prompt = "历史翻译：\n" + "\n".join(history) + "\n"
 86 |         if len(gpt_dicts) != 0:
 87 |             user_prompt += "参考以下术语表（可为空，格式为src->dst #备注）：\n"
 88 |             user_prompt += _get_glossary(gpt_dicts)
 89 |         user_prompt += "根据以上术语表的对应关系和备注，结合历史剧情和上下文，将下面的文本从日文翻译成简体中文：\n" + text
 90 |     
 91 |     messages.append({"role": "user", "content": user_prompt})
 92 |     if model_name == "sakura":
 93 |         res = worker_model.create_chat_completion(messages=messages, temperature=0.1, top_p=0.3, repeat_penalty=1, max_tokens=512, frequency_penalty=0.2)
 94 |     elif model_name == "galtransl":
 95 |         res = worker_model.create_chat_completion(messages=messages, temperature=0.6, top_p=0.8, repeat_penalty=1, max_tokens=512, frequency_penalty=0.1)
 96 |     return res["choices"][0]["message"]["content"]
 97 | 
 98 | class LLM:
 99 |     """
100 |     多进程LLM翻译器主类
101 | 
102 |     Attributes:
103 |         model_name (str): 模型名称
104 |         pool (multiprocessing.Pool): 工作进程池
105 |     """
106 |     def __init__(self, model_name: str, model_path: str, num_process: int, cuda_device: list[str]):
107 |         """
108 |         初始化LLM翻译器
109 | 
110 |         Args:
111 |             model_name (str): 模型名称 ("sakura" | "galtransl")
112 |             model_path (str): 模型文件路径
113 |             num_process (int): 工作进程数
114 |             cuda_device (list[str]): 每个进程使用的CUDA设备ID列表
115 | 
116 |         Note:
117 |             - cuda_device列表长度应与num_process匹配
118 |         """
119 |         self.model_name = model_name
120 |         self.pool = Pool(num_process)
121 |         init_args = [(model_path, cuda_device[i]) for i in range(num_process)]
122 |         self.pool.starmap(_init_worker, init_args)
123 |     
124 |     def translate(self, text: str, history: list[dict] = [], gpt_dicts: list[dict] = []):
125 |         """
126 |         提交单个翻译任务到进程池
127 | 
128 |         Args:
129 |             text (str): 待翻译文本
130 |             history (list[dict], optional): 历史对话
131 |             gpt_dicts (list[dict], optional): 术语表
132 | 
133 |         Returns:
134 |             multiprocessing.pool.AsyncResult: 异步结果对象
135 |         """
136 |         return self.pool.apply_async(_process_translate, (self.model_name, text, history, gpt_dicts))
137 |     
138 |     def batch_translate(self, datas: list[dict]) -> list[str]:
139 |         """
140 |         批量翻译文本
141 | 
142 |         Args:
143 |             datas (list[dict]): 待翻译数据列表，每个元素应包含:
144 |                 - text: 待翻译文本
145 |                 - history: 历史对话
146 |                 - gpt_dicts: 术语表
147 | 
148 |         Returns:
149 |             list[str]: 翻译结果列表，顺序与输入一致
150 | 
151 |         Note:
152 |             - 每个 key 都必须有值，即使是空列表
153 | 
154 |         Example:
155 |             >>> translator.batch_translate([{"text": "こんにちは", "history": [], "gpt_dicts": []}])
156 |             >>> ['你好']
157 |         """
158 |         tasks = [self.__translate(data["text"], data["history"], data["gpt_dicts"]) for data in datas]
159 |         results = [task.get() for task in tasks]
160 |         return results
161 | 
162 | @lru_cache(maxsize=1024)
163 | def translate(llm: LLM, text: str, history: tuple[str], local_dicts: tuple[str], global_dicts: tuple[str]) -> str:
164 |     """带缓存的单条文本翻译核心函数
165 |     
166 |     Args:
167 |         llm (LLM): 多进程LLM翻译器实例
168 |         text (str): 待翻译文本（自动替换全角空格为半角空格）
169 |         history (tuple[str]): 历史翻译上下文（需传入可哈希的tuple）
170 |         local_dicts (tuple[str]): 局部字典（需传入可哈希的tuple）无论文本中是否出现都会传入翻译器
171 |         global_dicts (tuple[str]): 全局字典（需传入可哈希的tuple）只会将文本中出现的部分传入翻译器
172 |         
173 |     Returns:
174 |         str: 翻译后的中文文本
175 |         
176 |     Note:
177 |         1. 使用LRU缓存（最多1024条）加速重复文本翻译
178 |         2. 非日文文本会直接返回原内容
179 |         3. 实际调用llm.translate()执行翻译
180 |     """
181 |     text = text.replace("\u3000", "  ")
182 |     if not contains_japanese(text):
183 |         return text
184 |     gpt_dicts = list(local_dicts)
185 |     for item in global_dicts:
186 |         if item["src"] in text:
187 |             gpt_dicts.append(item)
188 |     result = llm.translate(text, history, gpt_dicts).get()
189 |     return result
190 | 


--------------------------------------------------------------------------------
/Mtool/main.py:
--------------------------------------------------------------------------------
  1 | import json
  2 | import requests
  3 | import re
  4 | import os
  5 | import pandas as pd
  6 | from tqdm import tqdm
  7 | import unicodedata
  8 | import csv
  9 | import sys
 10 | from concurrent.futures import ThreadPoolExecutor, as_completed
 11 | import threading
 12 | 
 13 | # 读取全局配置信息
 14 | def load_config():
 15 |     if not os.path.exists("config.json"):
 16 |         config_data = {
 17 |             "last_processed": 0,
 18 |             "task_list": ["ManualTransFile.json"],
 19 |             "endpoint": ["http://127.0.0.1:5000/v1/chat/completions"],
 20 |             "model_type": "Sgaltransl",
 21 |             "model_version": "2.6",
 22 |             "use_dict": False,
 23 |             "dict": {},
 24 |             "dict_mode": "Partial",
 25 |             "save_frequency": 100,
 26 |             "shutdown": 0,
 27 |             "max_workers": 1,
 28 |             "context_size": 0
 29 |         }
 30 |         with open("config.json", 'w') as file:
 31 |             json.dump(config_data, file, indent=4)
 32 |     with open('config.json', 'r', encoding='utf-8') as file:
 33 |         return json.load(file)
 34 | 
 35 | # 初始化字典
 36 | def initialize_dict(dict_str):
 37 |     if not dict_str:
 38 |         return {}, ""
 39 |     try:
 40 |         dict_data = json.loads(dict_str)
 41 |         dict_converted = {}
 42 |         for key, value in dict_data.items():
 43 |             if isinstance(value, list) and len(value) > 0:
 44 |                 if len(value) == 1:
 45 |                     dict_converted[key] = [value[0], ""]
 46 |                 else:
 47 |                     dict_converted[key] = value[:2]
 48 |             else:
 49 |                 dict_converted[key] = [value, ""]
 50 |         dict_strings = get_dict_string_list(dict_converted)
 51 |         return dict_converted, "\n".join(dict_strings)
 52 |     except Exception as e:
 53 |         print(f"Error initializing dictionary: {e}")
 54 |         return {}, ""
 55 | 
 56 | # 获取字典字符串列表
 57 | def get_dict_string_list(kv_pairs):
 58 |     dict_list = []
 59 |     for key, value in kv_pairs.items():
 60 |         src = key
 61 |         dst = value[0]
 62 |         info = value[1]
 63 |         if info:
 64 |             dict_list.append(f"{src}->{dst} #{info}")
 65 |         else:
 66 |             dict_list.append(f"{src}->{dst}")
 67 |     return dict_list
 68 | 
 69 | # 模型版本管理
 70 | def get_translation_model(model_name, model_version):
 71 |     if model_name.lower() == "sakura":
 72 |         if model_version == "0.8":
 73 |             return "SakuraV0_8"
 74 |         elif model_version == "0.9":
 75 |             return "SakuraV0_9"
 76 |         elif model_version == "0.10":
 77 |             return "SakuraV0_10"
 78 |         elif model_version == "1.0":
 79 |             return "SakuraV1_0"
 80 |         else:
 81 |             return "SakuraV1_0"
 82 |     elif model_name.lower() == "sakura32b":
 83 |         if model_version == "0.10":
 84 |             return "Sakura32bV0_10"
 85 |         else:
 86 |             return "Sakura32bV0_10"
 87 |     elif model_name.lower() == "galtransl":
 88 |         if model_version == "2.6":
 89 |             return "GalTranslV2_6"
 90 |         elif model_version == "3.0":
 91 |             return "GalTranslV3"
 92 |         else:
 93 |             return "GalTranslV2_6"
 94 |     else:
 95 |         return "SakuraV1_0"
 96 | 
 97 | # 检查文本是否包含日文字符
 98 | def contains_japanese(text):
 99 |     text = unicodedata.normalize('NFKC', text)
100 |     return bool(re.search(r'[\u3040-\u30ff\u3400-\u4DBF\u4E00-\u9FFF]', text)), text
101 | 
102 | # 分割文本段落
103 | def split_text_with_newlines(text):
104 |     paragraphs = re.split(r'(\r\n|\r|\n)', text)
105 |     return paragraphs
106 | 
107 | # 判断是否是文件路径
108 | def is_file_path(text):
109 |     # 基于文本特征判断是否是文件路径
110 |     return bool(re.search(r'\.[a-zA-Z0-9]{3}$', text))
111 | 
112 | # 符号管理工具类
113 | def fix_translation_end(original, translation):
114 |     if translation.endswith("。") and not original.endswith("。"):
115 |         translation = translation[:-1]
116 |     if translation.endswith("。」") and not original.endswith("。」"):
117 |         translation = translation[:-2] + "」"
118 |     return translation
119 | 
120 | def unescape_translation(original, translation):
121 |     if "\r" not in original:
122 |         translation = translation.replace("\r", "\r")
123 |     if "\n" not in original:
124 |         translation = translation.replace("\n", "\n")
125 |     if "\t" not in original:
126 |         translation = translation.replace("\t", "\t")
127 |     return translation
128 | 
129 | # 翻译文本，按段落翻译
130 | def translate_text_by_paragraph(text, index, api_idx=0, config=None, previous_translations=None):
131 |     # 如果是文件路径或者文件，直接跳过
132 |     if is_file_path(text):
133 |         return text
134 |     
135 |     contains_jp, updated_text = contains_japanese(text)
136 |     if contains_jp:
137 |         segments = split_text_with_newlines(updated_text)
138 |         translated_segments = []
139 |         for segment in segments:
140 |             if segment in ['\r\n', '\r', '\n']:
141 |                 translated_segments.append(segment)
142 |             else:
143 |                 if segment:
144 |                     translated_segments.append(translate_text(segment, index, api_idx=api_idx, config=config, previous_translations=previous_translations))
145 |                 else:
146 |                     translated_segments.append(segment)
147 |         translated_text = ''.join(translated_segments)
148 |         return translated_text
149 |     else:
150 |         return text
151 | 
152 | # 调用API进行翻译
153 | def translate_text(text, index, api_idx=0, attempt=1, config=None, previous_translations=None):
154 |     try:
155 |         endpoint = config['endpoint'][api_idx]
156 |         model_type = get_translation_model(config['model_type'], config['model_version'])
157 |         context_size = config.get('context_size', 0)
158 |         context = previous_translations[-context_size:] if previous_translations else []
159 |         data = make_request_json(text, model_type, config['use_dict'], config['dict_mode'], config['dict'], context)
160 |         response = requests.post(endpoint, json=data)
161 |         response.raise_for_status()
162 | 
163 |         response_data = response.json()
164 |         completion_tokens = response_data.get("usage", {}).get("completion_tokens", 0)
165 |         max_tokens = data["max_tokens"]
166 | 
167 |         # 检查是否发生退化，重试时调整 frequency_penalty
168 |         if completion_tokens == max_tokens:
169 |             print("模型可能发生退化，调整 frequency_penalty 并重试...")
170 |             data["frequency_penalty"] = 0.8
171 |             response = requests.post(endpoint, json=data)
172 |             response.raise_for_status()
173 |             response_data = response.json()
174 | 
175 |     except requests.RequestException as e:
176 |         print(f'请求翻译API错误: {e}')
177 |         return ""
178 |     
179 |     translated_text = response_data.get("choices")[0].get("message", {}).get("content", "")
180 |     translated_text = translated_text.replace("将下面的日文文本翻译成中文：", "").replace("<|im_end|>", "")
181 |     translated_text = fix_translation_end(text, translated_text)
182 |     translated_text = unescape_translation(text, translated_text)
183 |     print(f"原文: {text}\n翻译: {translated_text}\n")  # 调试信息，输出翻译前后的文本
184 |     return translated_text
185 | 
186 | # 处理翻译请求的JSON构造
187 | def make_request_json(text, model_type, use_dict, dict_mode, dict_data, context):    
188 |     messages = []
189 |     
190 |     if model_type == "SakuraV0_8":
191 |         messages.append({"role": "system", "content": "你是一个简单的日文翻译模型，将日文翻译成简体中文。"})
192 |         messages.append({"role": "user", "content": f"将下面的日文文本翻译成中文：{text}"})
193 |     else:
194 |         if model_type == "SakuraV0_9":
195 |             messages.append({"role": "system", "content": "你是一个轻小说翻译模型，可以流畅地将日文翻译成简体中文，并正确使用人称代词。"})
196 |         elif model_type == "SakuraV0_10":
197 |             messages.append({"role": "system", "content": "你是一个轻小说翻译模型，可以流畅通顺地以日本轻小说的风格将日文翻译成简体中文，并联系上下文正确使用人称代词，不擅自添加原文中没有的代词。"})
198 |         elif model_type == "SakuraV1_0":
199 |             messages.append({"role": "system", "content": "你是一个轻小说翻译模型，可以流畅通顺地以日本轻小说的风格将日文翻译成简体中文，并联系上下文正确使用人称代词，不擅自添加原文中没有的代词。"})
200 |         elif model_type == "GalTranslV2_6":
201 |             messages.append({"role": "system", "content": "你是一个视觉小说翻译模型，可以通顺地使用给定的术语表以指定的风格将日文翻译成简体中文，并联系上下文正确使用人称代词。"})
202 |         elif model_type == "GalTranslV3":
203 |             messages.append({"role": "system", "content": "你是一个视觉小说翻译模型，可以通顺地使用给定的术语表以指定的风格将日文翻译成简体中文，并联系上下文正确使用人称代词。"})
204 |         else:
205 |             messages.append({"role": "system", "content": "你是一个轻小说翻译模型，可以流畅通顺地将日文翻译成简体中文。"})
206 |         
207 |         if context:
208 |             history_text = "历史翻译：" + "\n".join(context)
209 |         else:
210 |             history_text = ""
211 |         
212 |         if model_type == "GalTranslV3":
213 |             if use_dict:
214 |                 dict_str = '\n'.join([f"{k}->{v[0]}" for k, v in dict_data.items()])
215 |                 user_content = f"{history_text}\n参考以下术语表\n{dict_str}\n根据以上术语表的对应关系和备注，结合历史剧情和上下文，将下面的文本从日文翻译成简体中文：\n{text}"
216 |             else:
217 |                 user_content = f"{history_text}\n结合历史剧情和上下文，将下面的文本从日文翻译成简体中文：\n{text}"
218 |             messages.append({"role": "user", "content": user_content})
219 |         else:
220 |             if context:
221 |                 for c in context:
222 |                     messages.append({"role": "assistant", "content": c})
223 |             
224 |             if use_dict:
225 |                 dict_str = '\n'.join([f"{k}->{v[0]}" for k, v in dict_data.items()])
226 |                 messages.append({"role": "user", "content": f"根据上文和以下术语表：\n{dict_str}\n将下面的日文文本翻译成中文：{text}"})
227 |             else:
228 |                 messages.append({"role": "user", "content": f"根据上文，将下面的日文文本翻译成中文：{text}"})
229 |     
230 |     temperature = 0.6 if model_type == "GalTranslV3" else 0.2
231 |     
232 |     data = {
233 |         "model": "sukinishiro",
234 |         "messages": messages,
235 |         "temperature": temperature,
236 |         "top_p": 0.3,
237 |         "max_tokens": 384,
238 |         "frequency_penalty": 0.2,
239 |         "do_sample": True,
240 |         "num_beams": 1,
241 |         "repetition_penalty": 1.0
242 |     }
243 |     return data
244 | 
245 | # 保存翻译进度
246 | def save_progress(data, filename, index, task_list):
247 |     if filename.endswith(".json"):
248 |         with open(filename, 'w', encoding='utf-8') as file:
249 |             json.dump(data, file, ensure_ascii=False, indent=4)
250 |     elif filename.endswith(".csv"):
251 |         data.to_csv(filename, index=False, quoting=csv.QUOTE_ALL)
252 |     config = load_config()
253 |     config['last_processed'] = index
254 |     config['task_list'] = task_list
255 |     with open('config.json', 'w', encoding='utf-8') as file:
256 |         json.dump(config, file, indent=4)
257 | 
258 | # 主流程
259 | def main():
260 |     config = load_config()
261 |     if not config['endpoint']:
262 |         print("请配置API endpoint后再运行程序。")
263 |         return
264 |     
265 |     # 初始化字典
266 |     dict_data, full_dict_str = initialize_dict(json.dumps(config.get('dict', {})))
267 |     config['dict'] = dict_data
268 |     
269 |     task_list = config['task_list']
270 |     if not task_list:
271 |         print("未找到待翻译文件，请更新config.json。")
272 |         return
273 | 
274 |     for task_name in task_list:
275 |         if not os.path.exists(task_name):
276 |             print(f"文件{task_name}不存在，跳过。")
277 |             continue
278 | 
279 |         if task_name.endswith(".json"):
280 |             with open(task_name, 'r', encoding='utf-8') as file:
281 |                 data = json.load(file)
282 |             json_keys = list(data.keys())
283 |         elif task_name.endswith(".csv"):
284 |             data = pd.read_csv(task_name, encoding='utf-8')
285 |             data['Original Text'] = data['Original Text'].astype(str)
286 |             data['Machine translation'] = data['Machine translation'].astype(str)
287 |         else:
288 |             print(f"不支持的文件类型: {task_name}")
289 |             continue
290 | 
291 |         total_keys = len(data)
292 |         start_index = config['last_processed']
293 |         api_num = len(config['endpoint'])
294 |         previous_translations = []
295 |         with ThreadPoolExecutor(max_workers=config['max_workers']) as executor:
296 |             future_to_index = {}
297 |             for i in range(start_index, total_keys):
298 |                 key = json_keys[i] if task_name.endswith(".json") else data.loc[i, 'Original Text']
299 |                 api_index = i % api_num
300 |                 future = executor.submit(translate_text_by_paragraph, key, i, api_index, config, previous_translations)
301 |                 future_to_index[future] = i
302 |             for future in tqdm(as_completed(future_to_index), total=len(future_to_index), desc="任务进度"):
303 |                 index = future_to_index[future]
304 |                 try:
305 |                     translated_text = future.result()
306 |                     previous_translations.append(translated_text)
307 |                     if len(previous_translations) > config.get('context_size', 0):
308 |                         previous_translations.pop(0)
309 |                     if task_name.endswith(".json"):
310 |                         data[json_keys[index]] = translated_text
311 |                     if task_name.endswith(".csv"):
312 |                         data.loc[index, 'Machine translation'] = translated_text
313 |                     if (index + 1) % config['save_frequency'] == 0 or index + 1 == total_keys:
314 |                         save_progress(data, task_name, index + 1, task_list)
315 |                 except Exception as exc:
316 |                     print(f'{index + 1}行翻译发生异常: {exc}')
317 | 
318 | if __name__ == "__main__":
319 |     main()
320 | 


--------------------------------------------------------------------------------
/Mtool/main_dev.py:
--------------------------------------------------------------------------------
  1 | import json
  2 | import requests
  3 | import re
  4 | import os
  5 | import pandas as pd
  6 | from tqdm import tqdm
  7 | import unicodedata
  8 | import csv
  9 | import sys
 10 | import shutil
 11 | from concurrent.futures import ThreadPoolExecutor, as_completed
 12 | import threading
 13 | import time
 14 | 
 15 | # 全局变量，用于控制进度条显示
 16 | progress_bars = {}
 17 | progress_lock = threading.Lock()
 18 | debug_output = []  # 用于存储调试输出
 19 | 
 20 | # 读取全局配置信息
 21 | def load_config():
 22 |     if not os.path.exists("config.json"):
 23 |         config_data = {
 24 |             "last_processed": 0,
 25 |             "task_list": ["ManualTransFile.json"],
 26 |             "endpoint": ["http://127.0.0.1:5000/v1/chat/completions"],
 27 |             "model_type": "Sgaltransl",
 28 |             "model_version": "2.6",
 29 |             "use_dict": False,
 30 |             "dict": {},
 31 |             "dict_mode": "Partial",
 32 |             "save_frequency": 100,
 33 |             "shutdown": 0,
 34 |             "max_workers": 1,
 35 |             "context_size": 0
 36 |         }
 37 |         with open("config.json", 'w') as file:
 38 |             json.dump(config_data, file, indent=4)
 39 |     with open('config.json', 'r', encoding='utf-8') as file:
 40 |         return json.load(file)
 41 | 
 42 | # 初始化字典
 43 | def initialize_dict(dict_str):
 44 |     if not dict_str:
 45 |         return {}, ""
 46 |     try:
 47 |         dict_data = json.loads(dict_str)
 48 |         dict_converted = {}
 49 |         for key, value in dict_data.items():
 50 |             if isinstance(value, list) and len(value) > 0:
 51 |                 if len(value) == 1:
 52 |                     dict_converted[key] = [value[0], ""]
 53 |                 else:
 54 |                     dict_converted[key] = value[:2]
 55 |             else:
 56 |                 dict_converted[key] = [value, ""]
 57 |         dict_strings = get_dict_string_list(dict_converted)
 58 |         return dict_converted, "\n".join(dict_strings)
 59 |     except Exception as e:
 60 |         console_print(f"Error initializing dictionary: {e}")
 61 |         return {}, ""
 62 | 
 63 | # 获取字典字符串列表
 64 | def get_dict_string_list(kv_pairs):
 65 |     dict_list = []
 66 |     for key, value in kv_pairs.items():
 67 |         src = key
 68 |         dst = value[0]
 69 |         info = value[1]
 70 |         if info:
 71 |             dict_list.append(f"{src}->{dst} #{info}")
 72 |         else:
 73 |             dict_list.append(f"{src}->{dst}")
 74 |     return dict_list
 75 | 
 76 | # 模型版本管理
 77 | def get_translation_model(model_name, model_version):
 78 |     if model_name.lower() == "sakura":
 79 |         if model_version == "0.8":
 80 |             return "SakuraV0_8"
 81 |         elif model_version == "0.9":
 82 |             return "SakuraV0_9"
 83 |         elif model_version == "0.10":
 84 |             return "SakuraV0_10"
 85 |         elif model_version == "1.0":
 86 |             return "SakuraV1_0"
 87 |         else:
 88 |             return "SakuraV1_0"
 89 |     elif model_name.lower() == "sakura32b":
 90 |         if model_version == "0.10":
 91 |             return "Sakura32bV0_10"
 92 |         else:
 93 |             return "Sakura32bV0_10"
 94 |     elif model_name.lower() == "galtransl":
 95 |         if model_version == "2.6":
 96 |             return "GalTranslV2_6"
 97 |         elif model_version == "3.0":
 98 |             return "GalTranslV3"
 99 |         else:
100 |             return "GalTranslV2_6"
101 |     else:
102 |         return "SakuraV1_0"
103 | 
104 | # 检查文本是否包含日文字符
105 | def contains_japanese(text):
106 |     text = unicodedata.normalize('NFKC', text)
107 |     return bool(re.search(r'[\u3040-\u30ff\u3400-\u4DBF\u4E00-\u9FFF]', text)), text
108 | 
109 | # 判断文本是否为纯英文（不包含中文字符）
110 | def is_pure_english(text):
111 |     # 检查文本是否仅包含英文字母、数字、标点和空白字符
112 |     # 如果包含中文字符则返回False
113 |     return not bool(re.search(r'[\u4e00-\u9fff]', text))
114 | 
115 | # 分割文本段落
116 | def split_text_with_newlines(text):
117 |     paragraphs = re.split(r'(\r\n|\r|\n)', text)
118 |     return paragraphs
119 | 
120 | # 判断是否是文件路径
121 | def is_file_path(text):
122 |     # 基于文本特征判断是否是文件路径
123 |     return bool(re.search(r'\.[a-zA-Z0-9]{3}$', text))
124 | 
125 | # 符号管理工具类
126 | def fix_translation_end(original, translation):
127 |     if translation.endswith("。") and not original.endswith("。"):
128 |         translation = translation[:-1]
129 |     if translation.endswith("。」") and not original.endswith("。」"):
130 |         translation = translation[:-2] + "」"
131 |     return translation
132 | 
133 | def unescape_translation(original, translation):
134 |     if "\r" not in original:
135 |         translation = translation.replace("\r", "\r")
136 |     if "\n" not in original:
137 |         translation = translation.replace("\n", "\n")
138 |     if "\t" not in original:
139 |         translation = translation.replace("\t", "\t")
140 |     return translation
141 | 
142 | # 自定义用于调试输出的函数
143 | def console_print(*args, **kwargs):
144 |     message = " ".join(map(str, args))
145 |     with progress_lock:
146 |         # 将消息存入调试输出列表
147 |         debug_output.append(message)
148 |         # 限制调试输出列表长度
149 |         if len(debug_output) > 20:
150 |             debug_output.pop(0)
151 |         
152 |         # 清屏并重新打印所有内容
153 |         print("\033[H\033[J", end="")  # 清屏
154 |         
155 |         # 打印调试输出
156 |         for line in debug_output:
157 |             print(line)
158 |         
159 |         # 打印空行分隔
160 |         rows, columns = shutil.get_terminal_size()
161 |         print("\n" * 3)  # 空出进度条区域
162 |         
163 |         # 刷新所有进度条
164 |         refresh_all_progress_bars()
165 | 
166 | # 刷新所有进度条
167 | def refresh_all_progress_bars():
168 |     for bar in progress_bars.values():
169 |         if bar:
170 |             bar.refresh()
171 | 
172 | # 翻译文本，按段落翻译
173 | def translate_text_by_paragraph(text, index, api_idx=0, config=None, previous_translations=None):
174 |     # 如果是文件路径或者文件，直接跳过
175 |     if is_file_path(text):
176 |         return text
177 |     
178 |     contains_jp, updated_text = contains_japanese(text)
179 |     if contains_jp:
180 |         segments = split_text_with_newlines(updated_text)
181 |         translated_segments = []
182 |         for segment in segments:
183 |             if segment in ['\r\n', '\r', '\n']:
184 |                 translated_segments.append(segment)
185 |             else:
186 |                 if segment:
187 |                     translated_segments.append(translate_text(segment, index, api_idx=api_idx, config=config, previous_translations=previous_translations))
188 |                 else:
189 |                     translated_segments.append(segment)
190 |         translated_text = ''.join(translated_segments)
191 |         return translated_text
192 |     else:
193 |         return text
194 | 
195 | # 调用API进行翻译
196 | def translate_text(text, index, api_idx=0, attempt=1, config=None, previous_translations=None):
197 |     try:
198 |         endpoint = config['endpoint'][api_idx]
199 |         model_type = get_translation_model(config['model_type'], config['model_version'])
200 |         context_size = config.get('context_size', 0)
201 |         context = previous_translations[-context_size:] if previous_translations else []
202 |         data = make_request_json(text, model_type, config['use_dict'], config['dict_mode'], config['dict'], context)
203 |         response = requests.post(endpoint, json=data)
204 |         response.raise_for_status()
205 | 
206 |         response_data = response.json()
207 |         completion_tokens = response_data.get("usage", {}).get("completion_tokens", 0)
208 |         max_tokens = data["max_tokens"]
209 | 
210 |         # 检查是否发生退化，重试时调整 frequency_penalty
211 |         if completion_tokens == max_tokens:
212 |             console_print("模型可能发生退化，调整 frequency_penalty 并重试...")
213 |             data["frequency_penalty"] = 0.8
214 |             response = requests.post(endpoint, json=data)
215 |             response.raise_for_status()
216 |             response_data = response.json()
217 | 
218 |     except requests.RequestException as e:
219 |         console_print(f'请求翻译API错误: {e}')
220 |         return ""
221 |     
222 |     translated_text = response_data.get("choices")[0].get("message", {}).get("content", "")
223 |     translated_text = translated_text.replace("将下面的日文文本翻译成中文：", "").replace("<|im_end|>", "")
224 |     translated_text = fix_translation_end(text, translated_text)
225 |     translated_text = unescape_translation(text, translated_text)
226 |     
227 |     # 检查翻译结果是否为纯英文，如果是则记录行号
228 |     if is_pure_english(translated_text):
229 |         console_print(f"警告：行号 {index} 的翻译结果为纯英文：'{translated_text}'")
230 |         
231 |         with open("english_translations.log", "a", encoding="utf-8") as log_file:
232 |             log_file.write(f"行号: {index}, 原文: {text}, 翻译: {translated_text}\n")
233 |     
234 |     console_print(f"原文: {text}\n翻译: {translated_text}\n")  # 调试信息，输出翻译前后的文本
235 |     return translated_text
236 | 
237 | # 处理翻译请求的JSON构造
238 | def make_request_json(text, model_type, use_dict, dict_mode, dict_data, context):    
239 |     messages = []
240 |     
241 |     if model_type == "SakuraV0_8":
242 |         messages.append({"role": "system", "content": "你是一个简单的日文翻译模型，将日文翻译成简体中文。"})
243 |         messages.append({"role": "user", "content": f"将下面的日文文本翻译成中文：{text}"})
244 |     else:
245 |         if model_type == "SakuraV0_9":
246 |             messages.append({"role": "system", "content": "你是一个轻小说翻译模型，可以流畅地将日文翻译成简体中文，并正确使用人称代词。"})
247 |         elif model_type == "SakuraV0_10":
248 |             messages.append({"role": "system", "content": "你是一个轻小说翻译模型，可以流畅通顺地以日本轻小说的风格将日文翻译成简体中文，并联系上下文正确使用人称代词，不擅自添加原文中没有的代词。"})
249 |         elif model_type == "SakuraV1_0":
250 |             messages.append({"role": "system", "content": "你是一个轻小说翻译模型，可以流畅通顺地以日本轻小说的风格将日文翻译成简体中文，并联系上下文正确使用人称代词，不擅自添加原文中没有的代词。"})
251 |         elif model_type == "GalTranslV2_6":
252 |             messages.append({"role": "system", "content": "你是一个视觉小说翻译模型，可以通顺地使用给定的术语表以指定的风格将日文翻译成简体中文，并联系上下文正确使用人称代词。"})
253 |         elif model_type == "GalTranslV3":
254 |             messages.append({"role": "system", "content": "你是一个视觉小说翻译模型，可以通顺地使用给定的术语表以指定的风格将日文翻译成简体中文，并联系上下文正确使用人称代词。"})
255 |         else:
256 |             messages.append({"role": "system", "content": "你是一个轻小说翻译模型，可以流畅通顺地将日文翻译成简体中文。"})
257 |         
258 |         if context:
259 |             history_text = "历史翻译：" + "\n".join(context)
260 |         else:
261 |             history_text = ""
262 |         
263 |         if model_type == "GalTranslV3":
264 |             if use_dict:
265 |                 dict_str = '\n'.join([f"{k}->{v[0]}" for k, v in dict_data.items()])
266 |                 user_content = f"{history_text}\n参考以下术语表\n{dict_str}\n根据以上术语表的对应关系和备注，结合历史剧情和上下文，将下面的文本从日文翻译成简体中文：\n{text}"
267 |             else:
268 |                 user_content = f"{history_text}\n结合历史剧情和上下文，将下面的文本从日文翻译成简体中文：\n{text}"
269 |             messages.append({"role": "user", "content": user_content})
270 |         else:
271 |             if context:
272 |                 for c in context:
273 |                     messages.append({"role": "assistant", "content": c})
274 |             
275 |             if use_dict:
276 |                 dict_str = '\n'.join([f"{k}->{v[0]}" for k, v in dict_data.items()])
277 |                 messages.append({"role": "user", "content": f"参考以下术语表：\n{dict_str}\n根据以上术语表的对应关系和备注，结合历史剧情和上下文，将下面的文本从日文翻译成简体中文：{text}"})
278 |             else:
279 |                 messages.append({"role": "user", "content": f"结合历史剧情和上下文，将下面的文本从日文翻译成简体中文：{text}"})
280 |     
281 |     temperature = 0.6 if model_type == "GalTranslV3" else 0.2
282 |     
283 |     data = {
284 |         "model": "sukinishiro",
285 |         "messages": messages,
286 |         "temperature": temperature,
287 |         "top_p": 0.3,
288 |         "max_tokens": 512,
289 |         "frequency_penalty": 0.2,
290 |         "do_sample": True,
291 |         "num_beams": 1,
292 |         "repetition_penalty": 1.0
293 |     }
294 |     return data
295 | 
296 | # 进度管理类
297 | class TranslationProgress:
298 |     def __init__(self, task_name, total_items, num_threads):
299 |         self.progress_file = f"{task_name}.progress.json"
300 |         self.task_name = task_name
301 |         self.total_items = total_items
302 |         self.num_threads = num_threads
303 |         self.lock = threading.Lock()
304 |         self.initialize()
305 |     
306 |     def initialize(self):
307 |         if os.path.exists(self.progress_file):
308 |             with open(self.progress_file, 'r', encoding='utf-8') as file:
309 |                 self.progress_data = json.load(file)
310 |         else:
311 |             # 创建新的进度文件
312 |             chunk_size = self.total_items // self.num_threads
313 |             remainder = self.total_items % self.num_threads
314 |             
315 |             threads_info = []
316 |             start_idx = 0
317 |             
318 |             for i in range(self.num_threads):
319 |                 # 计算每个线程的起止范围
320 |                 end_idx = start_idx + chunk_size - 1
321 |                 if i == self.num_threads - 1:
322 |                     end_idx += remainder
323 |                 
324 |                 threads_info.append({
325 |                     "thread_id": i,
326 |                     "start_index": start_idx,
327 |                     "end_index": end_idx,
328 |                     "current_index": start_idx,
329 |                     "previous_translations": []
330 |                 })
331 |                 
332 |                 start_idx = end_idx + 1
333 |             
334 |             self.progress_data = {
335 |                 "task_name": self.task_name,
336 |                 "total_items": self.total_items,
337 |                 "num_threads": self.num_threads,
338 |                 "threads": threads_info
339 |             }
340 |             self.save()
341 |     
342 |     def update_progress(self, thread_id, current_index, translation=None, context_size=0):
343 |         with self.lock:
344 |             thread_info = self.progress_data["threads"][thread_id]
345 |             thread_info["current_index"] = current_index
346 |             
347 |             # 更新历史翻译记录
348 |             if translation and context_size > 0:
349 |                 thread_info.setdefault("previous_translations", [])
350 |                 thread_info["previous_translations"].append(translation)
351 |                 # 仅保留最近的N条翻译
352 |                 if len(thread_info["previous_translations"]) > context_size:
353 |                     thread_info["previous_translations"] = thread_info["previous_translations"][-context_size:]
354 |             
355 |             self.save()
356 |     
357 |     def get_thread_info(self, thread_id):
358 |         return self.progress_data["threads"][thread_id]
359 |     
360 |     def get_previous_translations(self, thread_id):
361 |         thread_info = self.progress_data["threads"][thread_id]
362 |         return thread_info.get("previous_translations", [])
363 |     
364 |     def is_completed(self):
365 |         for thread_info in self.progress_data["threads"]:
366 |             if thread_info["current_index"] <= thread_info["end_index"]:
367 |                 return False
368 |         return True
369 |     
370 |     def save(self):
371 |         with open(self.progress_file, 'w', encoding='utf-8') as file:
372 |             json.dump(self.progress_data, file, ensure_ascii=False, indent=4)
373 | 
374 | # 翻译工作线程函数
375 | def translate_worker(thread_id, task_name, data, json_keys, progress_manager, config):
376 |     thread_info = progress_manager.get_thread_info(thread_id)
377 |     start_index = thread_info["current_index"]
378 |     end_index = thread_info["end_index"]
379 |     api_num = len(config['endpoint'])
380 |     
381 |     # 获取该线程的历史翻译记录
382 |     previous_translations = progress_manager.get_previous_translations(thread_id)
383 |     
384 |     # 使用tqdm创建带有更多信息的进度条
385 |     with progress_lock:
386 |         pbar = tqdm(
387 |             total=end_index - thread_info["start_index"] + 1,
388 |             desc=f"线程 {thread_id}",
389 |             position=thread_id,
390 |             leave=True,
391 |             ncols=100,  # 增加宽度以容纳更多信息
392 |             bar_format='{l_bar}{bar:20}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}]'
393 |         )
394 |         progress_bars[thread_id] = pbar
395 |     
396 |     # 计算已完成的工作量并更新进度条
397 |     completed = start_index - thread_info["start_index"]
398 |     if completed > 0:
399 |         pbar.update(completed)
400 |     
401 |     for i in range(start_index, end_index + 1):
402 |         api_index = thread_id % api_num  # 使用线程ID来分配API端点
403 |         
404 |         if task_name.endswith(".json"):
405 |             key = json_keys[i]
406 |             original_text = key
407 |         else:  # CSV文件
408 |             original_text = data.loc[i, 'Original Text']
409 |         
410 |         translated_text = translate_text_by_paragraph(
411 |             original_text, i, api_index, config, previous_translations
412 |         )
413 |         
414 |         # 更新数据
415 |         if task_name.endswith(".json"):
416 |             data[json_keys[i]] = translated_text
417 |         else:  # CSV文件
418 |             data.loc[i, 'Machine translation'] = translated_text
419 |         
420 |         # 更新进度和历史翻译
421 |         progress_manager.update_progress(
422 |             thread_id, i + 1, translated_text, config.get('context_size', 0)
423 |         )
424 |         
425 |         # 更新进度条
426 |         with progress_lock:
427 |             pbar.update(1)
428 |         
429 |         # 定期保存整个翻译文件
430 |         if (i + 1) % config['save_frequency'] == 0 or i + 1 > end_index:
431 |             save_translation_data(data, task_name)
432 |             console_print(f"线程 {thread_id}: 已保存进度 {i + 1}/{end_index + 1}")
433 |     
434 |     # 完成后关闭进度条并从字典中移除
435 |     with progress_lock:
436 |         pbar.close()
437 |         progress_bars[thread_id] = None
438 | 
439 | # 保存翻译数据
440 | def save_translation_data(data, filename):
441 |     if filename.endswith(".json"):
442 |         with open(filename, 'w', encoding='utf-8') as file:
443 |             json.dump(data, file, ensure_ascii=False, indent=4)
444 |     elif filename.endswith(".csv"):
445 |         data.to_csv(filename, index=False, quoting=csv.QUOTE_ALL)
446 | 
447 | # 初始化终端显示
448 | def setup_terminal():
449 |     # 清屏
450 |     os.system('cls' if os.name == 'nt' else 'clear')
451 |     # 将光标移到顶部
452 |     print("\033[H", end="")
453 | 
454 | # 主函数
455 | def main():
456 |     # 初始化终端显示
457 |     setup_terminal()
458 |     
459 |     config = load_config()
460 |     if not config['endpoint']:
461 |         console_print("请配置API endpoint后再运行程序。")
462 |         return
463 |     
464 |     # 初始化字典
465 |     dict_data, full_dict_str = initialize_dict(json.dumps(config.get('dict', {})))
466 |     config['dict'] = dict_data
467 |     
468 |     task_list = config['task_list']
469 |     if not task_list:
470 |         console_print("未找到待翻译文件，请更新config.json。")
471 |         return
472 | 
473 |     for task_name in task_list:
474 |         if not os.path.exists(task_name):
475 |             console_print(f"文件{task_name}不存在，跳过。")
476 |             continue
477 | 
478 |         # 加载数据
479 |         if task_name.endswith(".json"):
480 |             with open(task_name, 'r', encoding='utf-8') as file:
481 |                 data = json.load(file)
482 |             json_keys = list(data.keys())
483 |             total_items = len(json_keys)
484 |         elif task_name.endswith(".csv"):
485 |             data = pd.read_csv(task_name, encoding='utf-8')
486 |             data['Original Text'] = data['Original Text'].astype(str)
487 |             data['Machine translation'] = data['Machine translation'].astype(str)
488 |             total_items = len(data)
489 |             json_keys = None
490 |         else:
491 |             console_print(f"不支持的文件类型: {task_name}")
492 |             continue
493 | 
494 |         # 创建或加载进度管理器
495 |         num_threads = config['max_workers']
496 |         progress_manager = TranslationProgress(task_name, total_items, num_threads)
497 |         
498 |         console_print(f"开始处理任务: {task_name} (总条目: {total_items})")
499 |         console_print("调试信息将显示在顶部，进度条显示在底部")
500 |         time.sleep(1)  # 给用户时间阅读信息
501 |         
502 |         # 创建并启动工作线程
503 |         threads = []
504 |         for thread_id in range(num_threads):
505 |             thread = threading.Thread(
506 |                 target=translate_worker,
507 |                 args=(thread_id, task_name, data, json_keys, progress_manager, config)
508 |             )
509 |             threads.append(thread)
510 |             thread.start()
511 |             thread_info = progress_manager.get_thread_info(thread_id)
512 |             console_print(f"线程 {thread_id} 已启动，处理范围: {thread_info['start_index']} - {thread_info['end_index']}, 当前进度: {thread_info['current_index']}")
513 |         
514 |         # 等待所有线程完成
515 |         for thread in threads:
516 |             thread.join()
517 |         
518 |         console_print(f"任务 {task_name} 翻译完成")
519 |         
520 |         # 任务完成后，可以删除进度文件或保留作为记录
521 |         # os.remove(f"{task_name}.progress.json")
522 | 
523 | if __name__ == "__main__":
524 |     try:
525 |         main()
526 |     except KeyboardInterrupt:
527 |         # 处理Ctrl+C中断
528 |         print("\n程序被用户中断，正在保存进度...")
529 |         # 这里可以添加保存进度的代码
530 |     except Exception as e:
531 |         print(f"程序发生异常: {e}")
532 |     finally:
533 |         # 确保在程序退出时清理终端
534 |         print("\033[?25h")  # 确保光标可见
535 | 


--------------------------------------------------------------------------------