├── Mtool
│   ├── requirements.txt
│   ├── main.py
│   └── main_dev.py
├── Translator++
│   ├── pic
│   │   ├── 1.png
│   │   ├── 2.png
│   │   └── 3.png
│   ├── manual.py
│   ├── manual2.py
│   ├── 根据路径添加黄绿标签.js
│   ├── 绿色标签添加路径翻译.js
│   ├── README.md
│   ├── api.py
│   └── llm.py
├── .gitignore
└── README.md
/Mtool/requirements.txt:
--------------------------------------------------------------------------------
1 | pip
2 | wheel
3 | setuptools
4 | requests
5 | tqdm
6 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | __pycache__
2 | *.gguf
3 | *.log
4 | ManualTransFile.json
5 | TranslatedFile.json
6 | TransFile/
--------------------------------------------------------------------------------
/Translator++/pic/1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fkiliver/RPGMaker_LLM_Translator/HEAD/Translator++/pic/1.png
--------------------------------------------------------------------------------
/Translator++/pic/2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fkiliver/RPGMaker_LLM_Translator/HEAD/Translator++/pic/2.png
--------------------------------------------------------------------------------
/Translator++/pic/3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fkiliver/RPGMaker_LLM_Translator/HEAD/Translator++/pic/3.png
--------------------------------------------------------------------------------
/Translator++/manual.py:
--------------------------------------------------------------------------------
1 | # A manually-run script that batch-translates the ManualTransFile.json exported by MTool
2 |
3 | from concurrent.futures import ThreadPoolExecutor
4 | from llm import LLM, translate
5 | from itertools import repeat
6 | from tqdm import tqdm
7 | import json
8 |
9 | llm = LLM("sakura", "sakura-14b-qwen2.5-v1.0-q6k.gguf", 4, ["0", "1", "2", "3"])  # 4 worker processes, one per CUDA device; see the LLM class in llm.py
10 | # Global dictionary; only entries that appear in the text are passed to the model
11 | global_dicts = ()
12 |
13 | with open("ManualTransFile.json", "r", encoding="utf-8") as fp:
14 | data = json.load(fp)
15 | raw_texts = list(data.keys())
16 |
17 | with ThreadPoolExecutor(4) as executor:
18 | iterator = executor.map(translate, repeat(llm), list(data.keys()), repeat(()), repeat(()), repeat(global_dicts))
19 | results = list(tqdm(iterator, total=len(raw_texts)))
20 |
21 | with open("TranslatedFile.json", "w", encoding="utf-8") as fp:
22 | json.dump(dict(zip(raw_texts, results)), fp, ensure_ascii=False, indent=4)
23 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 | # RPGMaker_LLM_Translator
4 |
5 |
6 |
7 | # Introduction
8 | This is a local translator for RPG Maker games based on MTool/Translator++ and the Sakura models, providing high-quality offline Japanese translation.
9 | The [Sakura-13B-Galgame translation model](https://github.com/SakuraLLM/Sakura-13B-Galgame) is recommended; currently supported versions are Sakura v0.8/v0.9/v0.10pre1 and GalTransl-v2.6.
10 |
11 | The project has been refactored and supports MTool, Translator++, and the latest Sakura models.
12 |
13 | ## TODO
14 | - [x] Add degeneration detection (MTool only)
15 | - [x] Add history/context support (MTool only)
16 | - [x] Add prompt dictionary (MTool only)
17 | - [x] Add concurrency
18 | - [x] Add support for Sakura v0.10
19 | - [x] Add support for Sakura v1.0
20 | - [x] Add support for GalTransl-v2.6
21 |
22 | ## Quick start
23 | First deploy a Sakura model; the GalTransl model is recommended.
24 | See the [Sakura model deployment guide](https://github.com/SakuraLLM/SakuraLLM/wiki).
25 |
26 | ### Mtool
27 | Setup guide: see [this repository's wiki](https://github.com/fkiliver/RPGMaker_LLM_Translator/wiki). An example of the `config.json` that `Mtool/main.py` creates on first run is shown below.
28 |
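The values below match the defaults written by `main.py` on first run (shown here with `model_type` set to `galtransl` for a GalTransl 2.6 backend). When `use_dict` is enabled, entries in `dict` take the form `"source term": ["translation", "optional note"]`.

```json
{
    "last_processed": 0,
    "task_list": ["ManualTransFile.json"],
    "endpoint": ["http://127.0.0.1:5000/v1/chat/completions"],
    "model_type": "galtransl",
    "model_version": "2.6",
    "use_dict": false,
    "dict": {},
    "dict_mode": "Partial",
    "save_frequency": 100,
    "shutdown": 0,
    "max_workers": 1,
    "context_size": 0
}
```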
29 | ### Translator++
30 | See [this repository's wiki](https://github.com/fkiliver/RPGMaker_LLM_Translator/wiki); a more detailed workflow write-up is in [Translator++/README.md](Translator++/README.md).
31 |
32 | Install the ChatGPT add-on in Translator++:
33 | 
34 |
35 | Then you can start translating.
36 |
--------------------------------------------------------------------------------
/Translator++/manual2.py:
--------------------------------------------------------------------------------
1 | # A manually-run script that batch-translates the files exported by AutoTranslator
2 | # It walks the TransFile folder and translates every file in it
3 |
4 | from concurrent.futures import ThreadPoolExecutor
5 | from llm import LLM, translate
6 | from itertools import repeat
7 | from tqdm import tqdm
8 | import json
9 | import os
10 |
11 | folder = "../TransFile"
12 | llm = LLM("sakura", "sakura-14b-qwen2.5-v1.0-q6k.gguf", 4, ["0", "1", "2", "3"])
13 | # Global dictionary; only entries that appear in the text are passed to the model
14 | global_dicts = ()
15 |
16 | for filename in tqdm(os.listdir(folder)):
17 | filepath = f"{folder}/{filename}"
18 | with open(filepath, "r", encoding="utf-8") as fp:
19 |         raw_texts = [x.rstrip("\n").split("=")[0] for x in fp.readlines()]  # the key is everything before the first "="
20 |
21 | with ThreadPoolExecutor(4) as executor:
22 | iterator = executor.map(translate, repeat(llm), raw_texts, repeat(()), repeat(()), repeat(global_dicts))
23 | results = list(tqdm(iterator, total=len(raw_texts)))
24 |
25 | with open(filepath, "w", encoding="utf-8") as fp:
26 | for i in range(len(raw_texts)):
27 | fp.write(f"{raw_texts[i]}={results[i]}\n")
28 |
--------------------------------------------------------------------------------
/Translator++/根据路径添加黄绿标签.js:
--------------------------------------------------------------------------------
1 | if (!Array.isArray(this.context)) {
2 | return;
3 | }
4 | const regexs = [
5 | /^Actors\/\d+\/note$/,
6 | /^Animations.*?$/,
7 | /^Armors\/\d+\/note$/,
8 | /^CommonEvents\/\d+\/name$/,
9 | /^CommonEvents\/\d+\/list\/\d+\/comment$/,
10 | /^Enemies\/\d+\/note$/,
11 | /^Items\/\d+\/note$/,
12 | /^Map\d{3}\/events\/\d+\/(name|note)$/,
13 | /^Mapinfos.*?$/,
14 | /^Skills\/\d+\/note$/,
15 | /^States\/\d+\/note$/,
16 | /^System\/switches\/\d+$/,
17 | /^System\/variables\/\d+$/,
18 | /^Tilesets.*?$/,
19 | /^Troops\/\d+\/name$/,
20 | /^Weapons\/\d+\/note$/,
21 | /^.*?MZ Plugin Command.*?$/,
22 | /^.*?Control Variables.*?$/
23 | ];
24 | var count = 0;
25 | for (const context of this.context) {
26 | for (const regex of regexs) {
27 | if (regex.test(context)) {
28 | count++;
29 | break;
30 | }
31 | }
32 | }
33 | var index = this.tags.indexOf("yellow");
34 | if (index > -1) {
35 | this.tags.splice(index, 1);
36 | }
37 | index = this.tags.indexOf("green");
38 | if (index > -1) {
39 | this.tags.splice(index, 1);
40 | }
41 | if (count === this.context.length) {
42 | this.tags.push("yellow");
43 | } else if (count > 0) {
44 | this.tags.push("green");
45 | }
46 |
--------------------------------------------------------------------------------
/Translator++/绿色标签添加路径翻译.js:
--------------------------------------------------------------------------------
1 | if (!this.tags.includes("green")) {
2 | return;
3 | }
4 | if (!Array.isArray(this.context)) {
5 | return;
6 | }
7 | const regexs = [
8 | /^Actors\/\d+\/note$/,
9 | /^Animations.*?$/,
10 | /^Armors\/\d+\/note$/,
11 | /^CommonEvents\/\d+\/name$/,
12 | /^CommonEvents\/\d+\/list\/\d+\/comment$/,
13 | /^Enemies\/\d+\/note$/,
14 | /^Items\/\d+\/note$/,
15 | /^Map\d{3}\/events\/\d+\/(name|note)$/,
16 | /^Mapinfos.*?$/,
17 | /^Skills\/\d+\/note$/,
18 | /^States\/\d+\/note$/,
19 | /^System\/switches\/\d+$/,
20 | /^System\/variables\/\d+$/,
21 | /^Tilesets.*?$/,
22 | /^Troops\/\d+\/name$/,
23 | /^Weapons\/\d+\/note$/,
24 | /^.*?MZ Plugin Command.*?$/,
25 | /^.*?Control Variables.*?$/
26 | ];
27 | if (!Array.isArray(this.parameters)) {
28 | this.parameters = []
29 | for (let i = 0; i < this.context.length; i++) {
30 | this.parameters.push({
31 | contextStr: this.context[i]
32 | });
33 | }
34 | }
35 | for (let i = 0; i < this.context.length; i++) {
36 | let context = this.context[i];
37 | this.parameters[i]["translation"] = "";
38 | for (const regex of regexs) {
39 | if (regex.test(context)) {
40 | this.parameters[i]["translation"] = this.cells[0];
41 | break;
42 | }
43 | }
44 | }
45 | trans.project.files[this.file].parameters[this.rowId] = this.parameters
46 |
--------------------------------------------------------------------------------
/Translator++/README.md:
--------------------------------------------------------------------------------
1 | # Translator++ workflow
2 |
3 | Games made with RPG Maker differ in the fine details of their text. After translating several different games, I have settled on a workflow that works fairly well, and I hope it helps you get better translation quality.
4 |
5 | **This guide has a fairly steep learning curve.**
6 |
7 | ## Translator++ settings
8 |
9 | First, the custom escape characters (control codes). During translation, every piece of text matching these patterns is replaced with a placeholder such as `${dat[1]}`. Because each game formats its control codes differently, the official defaults may miss some of them, and those have to be handled separately, as shown below.
10 |
11 | 
12 |
13 | Below are some situations I have run into; decide for yourself whether to apply them:
14 |
15 | - Append `\d*` to each regular expression so the digits following a control code are captured as well; this prevents `\C[1]1000` from being turned into `控制符11000` by the backend (see the sanity check after the pattern below).
16 | - Delete the `\!` on the fourth row; this one is not critical.
17 |
18 | Alternatively, you can simply replace rows 2 through 4 as a whole with
19 |
20 | ```re
21 | /(\\[a-zA-Z0-9]+(?:\[.*?\]|<.*?\>)\d*|\\[a-zA-Z\{\}\\\$\.\|<\>\^]\d*)+/gi
22 | ```
23 |
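As a quick sanity check (illustrative only; Translator++ itself applies the JS-flavoured regex above), the same expression written as a Python raw string does capture the trailing digits together with the control code:

```py
import re

# the combined pattern from above, rewritten as a Python regex
pattern = re.compile(
    r"(\\[a-zA-Z0-9]+(?:\[.*?\]|<.*?\>)\d*|\\[a-zA-Z\{\}\\\$\.\|<\>\^]\d*)+",
    re.IGNORECASE)

sample = r"\C[1]1000のダメージ!"
print([m.group(0) for m in pattern.finditer(sample)])  # ['\\C[1]1000']
```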
24 | For better results, I recommend moving the more complex logic and prompt assembly into the Python backend. In the OpenAI ChatGPT add-on settings, therefore, I suggest clearing the **System Message Template** and setting the **Body Message Template** to contain only `${SOURCE_TEXT}`, as shown below.
25 |
26 | 
27 |
28 | There are a few other settings as well: to use the API in this folder, set **Target URL** to `http://127.0.0.1:1500/v1/chat/completions`, **Batch Delay** to 1, **Max Characters per Batch** to 65536, and make **Max row per concurrent requests** as large as you reasonably can. With these settings, each request that reaches the backend looks roughly like the example below.
29 |
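With the **Body Message Template** reduced to `${SOURCE_TEXT}`, what reaches the backend is a chat-completions request whose single user message carries the whole batch as a JSON string array, which is what [api.py](api.py) expects. Roughly (the Japanese lines are made-up examples):

```json
{
    "messages": [
        {
            "role": "user",
            "content": "[\"はい、分かりました。\", \"${dat[1]}のダメージ!\"]"
        }
    ]
}
```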
30 | ## Tagging text by its path
31 |
32 | One drawback of translating with MTool is that it translates every string, and Translator++ likewise reads in many meaningless strings. Translating them is not only time-consuming but can also break game logic. You can right-click a row and open **Row Properties** to inspect the string's path, as shown below.
33 |
34 | 
35 |
36 | Translator++ can run JS scripts: select the files you want to process, right-click, and run a script via **With XX Selected -> Run Automation -> For Each Row**.
37 |
38 | For more details on running scripts, see the [official documentation](https://dreamsavior.net/docs/translator/execute-script/pin-your-automation-to-quickly-launch-from-translator/).
39 |
40 | I recommend first tagging every row with [script 1](根据路径添加黄绿标签.js): yellow means none of the row's context paths need translation, green means only some of them do. (A small path-matching check is sketched below.)
41 |
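To get a feel for what the script treats as do-not-translate paths, here is a small illustrative check using two of its patterns rewritten as Python regexes (the sample paths are made up):

```py
import re

patterns = [re.compile(r"^Actors/\d+/note$"),
            re.compile(r"^Map\d{3}/events/\d+/(name|note)$")]

for path in ["Actors/3/note",              # matches -> counted as do-not-translate
             "Map001/events/12/name",      # matches
             "Map001/events/12/pages/0"]:  # no match -> the row stays translatable
    print(path, any(p.match(path) for p in patterns))
```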
42 | Note that because every game is different, there is no once-and-for-all list of regular expressions. To get better translation quality, skim the strings manually before you start and add or remove patterns as needed.
43 |
44 | ## Start translating
45 |
46 | When translating, note that the red and blue tags are added by Translator++ itself; add them to the **blacklist** together with the yellow tag so that none of those rows are processed.
47 |
48 | ## Python backend
49 |
50 | Reinventing the wheel is usually bad practice, but a simple Python backend can do a lot, so this is one wheel worth building.
51 |
52 | [llm.py](llm.py) and [api.py](api.py) implement some simple functionality. The comments in those files are fairly detailed, so I will not go over the code here and only give a brief overview.
53 |
54 | ### Usage
55 |
56 | There are few dependencies: mainly [llama-cpp-python](https://llama-cpp-python.readthedocs.io/en/latest/) and FastAPI (api.py also uses uvicorn to serve the app).
57 |
58 | After adjusting a few parameters in [api.py](api.py), simply run `python api.py` to start it.
59 |
60 | ```py
61 | port = 1500
62 | logging.basicConfig(filename="log.log")
63 | history_deque = deque(maxlen=3)
64 | llm = LLM("galtransl", "Sakura-GalTransl-7B-v3-Q5_K_S.gguf", 8, ["0", "1", "2", "3", "0", "1", "2", "3"])
65 | app = FastAPI()
66 | dicts = [
67 | {"src": "控制符", "dst": "控制符"}
68 | ]
69 | ```
70 |
71 | `port` is the port the service listens on.
72 |
73 | `basicConfig` sets the log file name; the log records cases where control codes or line counts differ between the source and the translation, for manual correction.
74 |
75 | `history_deque` caps how many previous lines are provided to the LLM as context.
76 |
77 | The `LLM` parameters are documented in its docstrings; the worker-process count and the CUDA device list deserve a mention:
78 |
79 | - If you have enough VRAM, run two worker processes per card to saturate the GPU; more than that is not recommended.
80 | - With multiple cards, run separate worker processes on each card; the configuration above is a reference setup for four RTX 4090s (see the example after this list for a smaller machine).
81 | - The more worker processes you run here, the larger **Max row per concurrent requests** should be in Translator++, to reduce the cost of context switching.
82 |
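For example, on a hypothetical machine with two GPUs and two workers per card, the constructor call would become:

```py
llm = LLM("galtransl", "Sakura-GalTransl-7B-v3-Q5_K_S.gguf", 4, ["0", "1", "0", "1"])
```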
83 | `app` normally does not need to be changed.
84 |
85 | `dicts` is the glossary provided to the model; if you use this backend, keep at least the `控制符` entry so the placeholder word is carried through untranslated. (The glossary format is sketched below.)
86 |
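For reference, `_get_glossary()` in [llm.py](llm.py) turns these entries into the `src->dst #info` lines that are prepended to the prompt; a quick check from the Translator++ folder (the second entry is a made-up example):

```py
>>> from llm import _get_glossary
>>> _get_glossary([{"src": "控制符", "dst": "控制符"},
...                {"src": "アリス", "dst": "爱丽丝", "info": "人名"}])
'控制符->控制符\nアリス->爱丽丝 #人名\n'
```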
87 | If you do not want to dig deeper, you can skip the next subsections and jump straight to the Finishing up section.
88 |
89 | ### Control-code placeholder format
90 |
91 | One step in the code replaces every Translator++ control code such as `${dat[1]}` with text like `控制符1`, and swaps it back after translation. Why bother? Consider this example:
92 |
93 | > 味方単体に1ターン『${dat[1]}無敵』を付与
94 |
95 | If you hand this text to the LLM as-is, it will very likely drop the `${dat[1]}` placeholder or insert it in the wrong place. I also tried translating the parts before and after it separately and stitching them back together, but that loses context. This problem blocked me for a long time; at one point I considered training a model that could handle control codes. Then one day I noticed that the LLM tends to copy Chinese text through unchanged, which gave me the idea: what if the control code is rewritten as Chinese?
96 |
97 | > 味方単体に1ターン『控制符1無敵』を付与
98 |
99 | It then produces a normal translation and puts the placeholder in a sensible position. Even text with multiple control codes, such as:
100 |
101 | > 控制符1敵全体にダメージを与え『控制符2心傷』『控制符3心弱』状態にする。
102 |
103 | is translated correctly in my tests, with the placeholders kept in the right positions.
104 |
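A minimal sketch of this placeholder round-trip (the two regexes are the same ones used in [api.py](api.py); the model call itself is omitted):

```py
import re

DAT = re.compile(r"\$\{dat\[(\d+)\]\}")  # ${dat[n]} placeholders from Translator++
CN = re.compile(r"控制符(\d+)")           # the Chinese stand-ins sent to the model

def mask(text):
    """Replace each ${dat[n]} with 控制符k and remember the mapping."""
    mapping = {}
    def repl(match):
        placeholder = f"控制符{len(mapping) + 1}"
        mapping[placeholder] = match.group(0)
        return placeholder
    return DAT.sub(repl, text), mapping

def unmask(text, mapping):
    """Restore the original ${dat[n]} codes after translation."""
    return CN.sub(lambda m: mapping.get(m.group(0), m.group(0)), text)

masked, mapping = mask("味方単体に1ターン『${dat[1]}無敵』を付与")
# masked == "味方単体に1ターン『控制符1無敵』を付与"  ->  this is what the model sees
# after translation, unmask(translated, mapping) puts ${dat[1]} back
```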
105 | ### SG description format
106 |
107 | The code also handles descriptions written as angle-bracket `key:value` tags, for example:
108 |
109 | > 生徒達はCPを増やしたりします。
111 | > 増やしたCPは、スキルツリー呪力領域の開放や、
112 | > アイテム合成に使えます。>
113 | >
114 | >
115 |
116 | The key inside these tags must not be translated while the value must be, so the code does a simple extraction and translates only the value.
117 |
118 | ## Finishing up
119 |
120 | Once translation is done, remember to manually fix the errors recorded in the log; an entry looks roughly like the example below.
121 |
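With Python's default logging format, an entry in `log.log` looks roughly like this (the translated line is a made-up example of a dropped placeholder):

```
WARNING:root:stop retry after 10 attempts
味方単体に1ターン『${dat[1]}無敵』を付与
对我方单体附加1回合无敌
```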
122 | Then use [script 2](绿色标签添加路径翻译.js) to automatically fill in the per-context translations for the green-tagged rows.
123 |
124 | Finally, inject the translation and start the game.
125 |
--------------------------------------------------------------------------------
/Translator++/api.py:
--------------------------------------------------------------------------------
1 | from collections import Counter, deque
2 | from concurrent.futures import ThreadPoolExecutor
3 | from fastapi import FastAPI, Request
4 | from llm import LLM, translate
5 | import logging
6 | import uvicorn
7 | import json
8 | import re
9 |
10 | port = 1500
11 | logging.basicConfig(filename="log.log")
12 | history_deque = deque(maxlen=3)
13 | llm = LLM("galtransl", "Sakura-GalTransl-7B-v3-Q5_K_S.gguf", 8, ["0", "1", "2", "3", "0", "1", "2", "3"])
14 | app = FastAPI()
15 | # Global dictionary; only entries that appear in the text are passed to the model
16 | global_dicts = [
17 | {"src": "原文", "dst": "译文", "info": "说明(可选)"}
18 | ]
19 |
20 | def text_translate(text: str, history: tuple[str]) -> str:
21 |     """Pre-process the text and run the translation
22 |
23 |     Args:
24 |         text (str): text that may contain `${dat[number]}`-style control codes
25 |         history (tuple[str]): previous translations used as context (must be a hashable tuple)
26 |
27 |     Returns:
28 |         str: the translated text
29 |
30 |     Note:
31 |         1. Automatically converts ${dat[1]} ↔ 控制符1 in both directions
32 |         2. Checks that the control-code counts and line counts match before and after translation, retrying up to 10 times
33 |         3. Logs a warning once the retry limit is exceeded
34 |     """
35 | pattern1 = r"\$\{dat\[(\d+)\]\}"
36 | pattern2 = r"控制符(\d+)"
37 |
38 |     # On retries the placeholder numbering keeps counting up, so each attempt sees slightly different source text, which raises the success rate
39 | counter = 0
40 | def replace_to_chinese(match):
41 | nonlocal counter
42 | counter += 1
43 | placeholder = "控制符" + str(counter)
44 | dat_mapping[placeholder] = match.group(0)
45 | return placeholder
46 |
47 | def replace_back_to_dat(match):
48 | placeholder = match.group(0)
49 | return dat_mapping.get(placeholder, placeholder)
50 |
51 | retry = True
52 | retry_counter = 0
53 | while retry and retry_counter < 10:
54 | dat_mapping = {}
55 | retry = False
56 | retry_counter += 1
57 |
58 | before = Counter(re.findall(pattern1, text))
59 | line_num = len(text.splitlines())
60 | result = re.sub(pattern1, replace_to_chinese, text)
61 | dat_dicts = ({"src": key, "dst": key} for key in dat_mapping.keys())
62 | result = translate(llm, result, history, dat_dicts, global_dicts)
63 | result = re.sub(pattern2, replace_back_to_dat, result)
64 | after = Counter(re.findall(pattern1, result))
65 |
66 | if before != after:
67 | # logging.warning(f"{before} != {after}\n{text}\n{result}")
68 | retry = True
69 | elif line_num != len(result.splitlines()):
70 | # logging.warning(f"line_num mismatch\n{text}\n{result}")
71 | retry = True
72 | if retry:
73 | logging.warning(f"stop retry after {retry_counter} attempts\n{text}\n{result}")
74 | # elif retry_counter > 1:
75 | # logging.warning(f"get correct translation after {retry_counter} attempts\n{text}\n{result}")
76 |
77 | return result
78 |
79 | def data_translate(data: str, history: tuple[str]) -> str:
80 |     """Translate composite data that may contain tag-formatted descriptions
81 |
82 |     Args:
83 |         data (str): text that may contain tags (the SG description format described in the README)
84 |         history (tuple[str]): previous translations used as context (must be a hashable tuple)
85 |
86 |     Returns:
87 |         str: the fully translated text
88 |
89 |     Note:
90 |         1. If tags are present, their values are extracted and translated segment by segment
91 |         2. Without tags, the text is passed straight to text_translate
92 |         3. The tag structure is kept unchanged; only the content is translated
93 |     """
94 | pattern = r""
95 | finds = re.findall(pattern, data, re.DOTALL)
96 | if len(finds) > 0:
97 | for raw in finds:
98 | index = raw.find(":")
99 | if index == -1:
100 | continue
101 | text = raw[index + 1 : -1]
102 | text = text_translate(text, history)
103 | data = data.replace(raw, f"{raw[:index]}:{text}>")
104 | else:
105 | data = text_translate(data, history)
106 | return data
107 |
108 | @app.post("/v1/chat/completions")
109 | async def read_item(request: Request):
110 |     """Batch translation API endpoint (POST)
111 |
112 |     Args:
113 |         request (Request): FastAPI request object, expected to contain:
114 |             {
115 |                 "messages": [{
116 |                     "role": "user",
117 |                     "content": "[\"text1\", \"text2\"]"  # JSON string array
118 |                 }]
119 |             }
120 |
121 |     Returns:
122 |         dict: response data shaped as:
123 |             {
124 |                 "choices": [{
125 |                     "message": {
126 |                         "content": "[\"trans1\", \"trans2\"]"  # JSON string array
127 |                     }
128 |                 }]
129 |             }
130 |
131 |     Note:
132 |         1. Uses a ThreadPoolExecutor to translate the texts concurrently
133 |         2. Maintains the global history_deque holding the 3 most recent lines
134 |         3. Each text is given the 3 texts preceding it as context
135 |     """
136 | data = await request.json()
137 | data = data["messages"][0]["content"]
138 | data = json.loads(data)
139 | history = []
140 | for d in data:
141 | history.append(tuple(history_deque))
142 | history_deque.append(d)
143 | with ThreadPoolExecutor(len(data)) as executor:
144 | data = executor.map(data_translate, data, history)
145 | return {"choices": [{"message": {"content": json.dumps(list(data))}}]}
146 |
147 | @app.get("/")
148 | def read_item(text: str):
149 |     """Single-text translation API endpoint (GET)
150 |
151 |     Args:
152 |         text (str): text to translate, passed as a URL query parameter
153 |
154 |     Returns:
155 |         str: the translated string
156 |     """
157 | result = translate(llm, text, (), (), ())
158 | return result
159 |
160 | if __name__ == '__main__':
161 | uvicorn.run(app, port=port)
162 |
--------------------------------------------------------------------------------
/Translator++/llm.py:
--------------------------------------------------------------------------------
1 | from llama_cpp import Llama
2 | from multiprocessing import Pool
3 | from functools import lru_cache
4 | import os
5 |
6 | def contains_japanese(text):
7 |     """Check whether the text contains Japanese kana
8 |
9 |     Args:
10 |         text (str): text to check
11 |
12 |     Returns:
13 |         bool: True if the text contains hiragana or katakana (Unicode range 3040-30FF), otherwise False
14 |     """
15 | for char in text:
16 | if "\u3040" <= char <= "\u30FF":
17 | return True
18 | return False
19 |
20 | def _init_worker(model_path: str, cuda_device: str):
21 |     """
22 |     Initialize the LLM model inside a worker process
23 |
24 |     Args:
25 |         model_path (str): path to the model file
26 |         cuda_device (str): CUDA device ID this worker should use
27 |     """
28 | global worker_model
29 | print(f"PID: {os.getpid()} CUDA: {cuda_device}")
30 | os.environ["CUDA_VISIBLE_DEVICES"] = cuda_device
31 | worker_model = Llama(model_path, n_gpu_layers=-1, n_ctx=2048, verbose=False)
32 |
33 | def _get_glossary(gpt_dicts: list[dict]) -> str:
34 |     """
35 |     Format a list of glossary entries into a string
36 |
37 |     Args:
38 |         gpt_dicts (list[dict]): glossary entries, each containing:
39 |             - src: source-language term
40 |             - dst: target-language translation
41 |             - info (optional): extra note
42 |
43 |     Returns:
44 |         str: formatted glossary, one "src->dst #info" or "src->dst" entry per line
45 |
46 |     Example:
47 |         >>> _get_glossary([{"src": "猫", "dst": "cat", "info": "动物"}])
48 |         '猫->cat #动物\\n'
49 |     """
50 | glossary = ""
51 | for gpt in gpt_dicts:
52 | if "info" in gpt.keys():
53 | glossary += "{}->{} #{}\n".format(gpt["src"], gpt["dst"], gpt["info"])
54 | else:
55 | glossary += "{}->{}\n".format(gpt["src"], gpt["dst"])
56 | return glossary
57 |
58 | def _process_translate(model_name: str, text: str, history: list[dict] = [], gpt_dicts: list[dict] = []) -> str:
59 |     """
60 |     Translate a single piece of text
61 |
62 |     Args:
63 |         model_name (str): model name, "sakura" or "galtransl"
64 |         text (str): Japanese text to translate
65 |         history (list[dict], optional): previous conversation turns
66 |         gpt_dicts (list[dict], optional): glossary entries
67 |
68 |     Returns:
69 |         str: the translated Simplified Chinese text
70 |     """
71 | messages = []
72 | if model_name == "sakura":
73 | messages.append({"role": "system", "content": "你是一个轻小说翻译模型,可以流畅通顺地以日本轻小说的风格将日文翻译成简体中文,并联系上下文正确使用人称代词,不擅自添加原文中没有的代词。"})
74 | for item in history:
75 | messages.append({"role": "assistant", "content": item})
76 | if len(gpt_dicts) == 0:
77 | user_prompt = "将下面的日文文本翻译成中文:" + text
78 | else:
79 | user_prompt = "根据以下术语表(可以为空):\n"
80 | user_prompt += _get_glossary(gpt_dicts)
81 | user_prompt += "将下面的日文文本根据对应关系和备注翻译成中文:" + text
82 |
83 | elif model_name == "galtransl":
84 | messages.append({"role": "system", "content": "你是一个视觉小说翻译模型,可以通顺地使用给定的术语表以指定的风格将日文翻译成简体中文,并联系上下文正确使用人称代词,注意"})
85 | user_prompt = "历史翻译:\n" + "\n".join(history) + "\n"
86 | if len(gpt_dicts) != 0:
87 | user_prompt += "参考以下术语表(可为空,格式为src->dst #备注):\n"
88 | user_prompt += _get_glossary(gpt_dicts)
89 | user_prompt += "根据以上术语表的对应关系和备注,结合历史剧情和上下文,将下面的文本从日文翻译成简体中文:\n" + text
90 |
91 | messages.append({"role": "user", "content": user_prompt})
92 | if model_name == "sakura":
93 | res = worker_model.create_chat_completion(messages=messages, temperature=0.1, top_p=0.3, repeat_penalty=1, max_tokens=512, frequency_penalty=0.2)
94 | elif model_name == "galtransl":
95 | res = worker_model.create_chat_completion(messages=messages, temperature=0.6, top_p=0.8, repeat_penalty=1, max_tokens=512, frequency_penalty=0.1)
96 | return res["choices"][0]["message"]["content"]
97 |
98 | class LLM:
99 |     """
100 |     Multi-process LLM translator
101 |
102 |     Attributes:
103 |         model_name (str): model name
104 |         pool (multiprocessing.Pool): worker process pool
105 |     """
106 | def __init__(self, model_name: str, model_path: str, num_process: int, cuda_device: list[str]):
107 |         """
108 |         Initialize the LLM translator
109 |
110 |         Args:
111 |             model_name (str): model name ("sakura" | "galtransl")
112 |             model_path (str): path to the model file
113 |             num_process (int): number of worker processes
114 |             cuda_device (list[str]): CUDA device ID for each worker process
115 |
116 |         Note:
117 |             - the length of cuda_device should match num_process
118 |         """
119 | self.model_name = model_name
120 | self.pool = Pool(num_process)
121 | init_args = [(model_path, cuda_device[i]) for i in range(num_process)]
122 | self.pool.starmap(_init_worker, init_args)
123 |
124 | def translate(self, text: str, history: list[dict] = [], gpt_dicts: list[dict] = []):
125 |         """
126 |         Submit a single translation task to the process pool
127 |
128 |         Args:
129 |             text (str): text to translate
130 |             history (list[dict], optional): previous conversation turns
131 |             gpt_dicts (list[dict], optional): glossary entries
132 |
133 |         Returns:
134 |             multiprocessing.pool.AsyncResult: async result object
135 |         """
136 | return self.pool.apply_async(_process_translate, (self.model_name, text, history, gpt_dicts))
137 |
138 | def batch_translate(self, datas: list[dict]) -> list[str]:
139 |         """
140 |         Translate a batch of texts
141 |
142 |         Args:
143 |             datas (list[dict]): items to translate, each containing:
144 |                 - text: text to translate
145 |                 - history: previous conversation turns
146 |                 - gpt_dicts: glossary entries
147 |
148 |         Returns:
149 |             list[str]: translation results, in the same order as the input
150 |
151 |         Note:
152 |             - every key must be present, even if its value is an empty list
153 |
154 |         Example:
155 |             >>> translator.batch_translate([{"text": "こんにちは", "history": [], "gpt_dicts": []}])
156 |             ['你好']
157 |         """
158 |         tasks = [self.translate(data["text"], data["history"], data["gpt_dicts"]) for data in datas]  # submit every task before waiting on any result
159 | results = [task.get() for task in tasks]
160 | return results
161 |
162 | @lru_cache(maxsize=1024)
163 | def translate(llm: LLM, text: str, history: tuple[str], local_dicts: tuple[str], global_dicts: tuple[str]) -> str:
164 |     """Cached single-text translation entry point
165 |
166 |     Args:
167 |         llm (LLM): multi-process LLM translator instance
168 |         text (str): text to translate (full-width spaces are replaced with half-width spaces)
169 |         history (tuple[str]): previous translations used as context (must be a hashable tuple)
170 |         local_dicts (tuple[str]): local glossary (hashable tuple); always passed to the translator whether or not its terms appear in the text
171 |         global_dicts (tuple[str]): global glossary (hashable tuple); only entries whose terms appear in the text are passed to the translator
172 |
173 |     Returns:
174 |         str: the translated Simplified Chinese text
175 |
176 |     Note:
177 |         1. Uses an LRU cache (up to 1024 entries) to speed up repeated texts
178 |         2. Text without Japanese is returned unchanged
179 |         3. The actual translation is done by llm.translate()
180 |     """
181 | text = text.replace("\u3000", " ")
182 | if not contains_japanese(text):
183 | return text
184 | gpt_dicts = list(local_dicts)
185 | for item in global_dicts:
186 | if item["src"] in text:
187 | gpt_dicts.append(item)
188 | result = llm.translate(text, history, gpt_dicts).get()
189 | return result
190 |
--------------------------------------------------------------------------------
/Mtool/main.py:
--------------------------------------------------------------------------------
1 | import json
2 | import requests
3 | import re
4 | import os
5 | import pandas as pd
6 | from tqdm import tqdm
7 | import unicodedata
8 | import csv
9 | import sys
10 | from concurrent.futures import ThreadPoolExecutor, as_completed
11 | import threading
12 |
13 | # Load the global configuration
14 | def load_config():
15 | if not os.path.exists("config.json"):
16 | config_data = {
17 | "last_processed": 0,
18 | "task_list": ["ManualTransFile.json"],
19 | "endpoint": ["http://127.0.0.1:5000/v1/chat/completions"],
20 |         "model_type": "galtransl",
21 | "model_version": "2.6",
22 | "use_dict": False,
23 | "dict": {},
24 | "dict_mode": "Partial",
25 | "save_frequency": 100,
26 | "shutdown": 0,
27 | "max_workers": 1,
28 | "context_size": 0
29 | }
30 | with open("config.json", 'w') as file:
31 | json.dump(config_data, file, indent=4)
32 | with open('config.json', 'r', encoding='utf-8') as file:
33 | return json.load(file)
34 |
35 | # Initialize the dictionary
36 | def initialize_dict(dict_str):
37 | if not dict_str:
38 | return {}, ""
39 | try:
40 | dict_data = json.loads(dict_str)
41 | dict_converted = {}
42 | for key, value in dict_data.items():
43 | if isinstance(value, list) and len(value) > 0:
44 | if len(value) == 1:
45 | dict_converted[key] = [value[0], ""]
46 | else:
47 | dict_converted[key] = value[:2]
48 | else:
49 | dict_converted[key] = [value, ""]
50 | dict_strings = get_dict_string_list(dict_converted)
51 | return dict_converted, "\n".join(dict_strings)
52 | except Exception as e:
53 | print(f"Error initializing dictionary: {e}")
54 | return {}, ""
55 |
56 | # Build the list of glossary strings
57 | def get_dict_string_list(kv_pairs):
58 | dict_list = []
59 | for key, value in kv_pairs.items():
60 | src = key
61 | dst = value[0]
62 | info = value[1]
63 | if info:
64 | dict_list.append(f"{src}->{dst} #{info}")
65 | else:
66 | dict_list.append(f"{src}->{dst}")
67 | return dict_list
68 |
69 | # Model version management
70 | def get_translation_model(model_name, model_version):
71 | if model_name.lower() == "sakura":
72 | if model_version == "0.8":
73 | return "SakuraV0_8"
74 | elif model_version == "0.9":
75 | return "SakuraV0_9"
76 | elif model_version == "0.10":
77 | return "SakuraV0_10"
78 | elif model_version == "1.0":
79 | return "SakuraV1_0"
80 | else:
81 | return "SakuraV1_0"
82 | elif model_name.lower() == "sakura32b":
83 | if model_version == "0.10":
84 | return "Sakura32bV0_10"
85 | else:
86 | return "Sakura32bV0_10"
87 | elif model_name.lower() == "galtransl":
88 | if model_version == "2.6":
89 | return "GalTranslV2_6"
90 | elif model_version == "3.0":
91 | return "GalTranslV3"
92 | else:
93 | return "GalTranslV2_6"
94 | else:
95 | return "SakuraV1_0"
96 |
97 | # Check whether the text contains Japanese characters
98 | def contains_japanese(text):
99 | text = unicodedata.normalize('NFKC', text)
100 | return bool(re.search(r'[\u3040-\u30ff\u3400-\u4DBF\u4E00-\u9FFF]', text)), text
101 |
102 | # Split text into paragraphs
103 | def split_text_with_newlines(text):
104 | paragraphs = re.split(r'(\r\n|\r|\n)', text)
105 | return paragraphs
106 |
107 | # Check whether the text looks like a file path
108 | def is_file_path(text):
109 |     # Judge from the text's shape (it ends in a three-character file extension)
110 | return bool(re.search(r'\.[a-zA-Z0-9]{3}$', text))
111 |
112 | # Punctuation fix-up helpers
113 | def fix_translation_end(original, translation):
114 | if translation.endswith("。") and not original.endswith("。"):
115 | translation = translation[:-1]
116 | if translation.endswith("。」") and not original.endswith("。」"):
117 | translation = translation[:-2] + "」"
118 | return translation
119 |
120 | def unescape_translation(original, translation):
121 |     if "\r" not in original:
122 |         translation = translation.replace("\r", "")  # drop control characters the model added that are absent from the original
123 |     if "\n" not in original:
124 |         translation = translation.replace("\n", "")
125 |     if "\t" not in original:
126 |         translation = translation.replace("\t", "")
127 |     return translation
128 |
129 | # Translate text paragraph by paragraph
130 | def translate_text_by_paragraph(text, index, api_idx=0, config=None, previous_translations=None):
131 |     # Skip file paths and file names entirely
132 | if is_file_path(text):
133 | return text
134 |
135 | contains_jp, updated_text = contains_japanese(text)
136 | if contains_jp:
137 | segments = split_text_with_newlines(updated_text)
138 | translated_segments = []
139 | for segment in segments:
140 | if segment in ['\r\n', '\r', '\n']:
141 | translated_segments.append(segment)
142 | else:
143 | if segment:
144 | translated_segments.append(translate_text(segment, index, api_idx=api_idx, config=config, previous_translations=previous_translations))
145 | else:
146 | translated_segments.append(segment)
147 | translated_text = ''.join(translated_segments)
148 | return translated_text
149 | else:
150 | return text
151 |
152 | # Call the translation API
153 | def translate_text(text, index, api_idx=0, attempt=1, config=None, previous_translations=None):
154 | try:
155 | endpoint = config['endpoint'][api_idx]
156 | model_type = get_translation_model(config['model_type'], config['model_version'])
157 | context_size = config.get('context_size', 0)
158 | context = previous_translations[-context_size:] if previous_translations else []
159 | data = make_request_json(text, model_type, config['use_dict'], config['dict_mode'], config['dict'], context)
160 | response = requests.post(endpoint, json=data)
161 | response.raise_for_status()
162 |
163 | response_data = response.json()
164 | completion_tokens = response_data.get("usage", {}).get("completion_tokens", 0)
165 | max_tokens = data["max_tokens"]
166 |
167 |         # Check for degeneration; on retry, raise frequency_penalty
168 | if completion_tokens == max_tokens:
169 | print("模型可能发生退化,调整 frequency_penalty 并重试...")
170 | data["frequency_penalty"] = 0.8
171 | response = requests.post(endpoint, json=data)
172 | response.raise_for_status()
173 | response_data = response.json()
174 |
175 | except requests.RequestException as e:
176 | print(f'请求翻译API错误: {e}')
177 | return ""
178 |
179 | translated_text = response_data.get("choices")[0].get("message", {}).get("content", "")
180 | translated_text = translated_text.replace("将下面的日文文本翻译成中文:", "").replace("<|im_end|>", "")
181 | translated_text = fix_translation_end(text, translated_text)
182 | translated_text = unescape_translation(text, translated_text)
183 | print(f"原文: {text}\n翻译: {translated_text}\n") # 调试信息,输出翻译前后的文本
184 | return translated_text
185 |
186 | # Build the JSON body for a translation request
187 | def make_request_json(text, model_type, use_dict, dict_mode, dict_data, context):
188 | messages = []
189 |
190 | if model_type == "SakuraV0_8":
191 | messages.append({"role": "system", "content": "你是一个简单的日文翻译模型,将日文翻译成简体中文。"})
192 | messages.append({"role": "user", "content": f"将下面的日文文本翻译成中文:{text}"})
193 | else:
194 | if model_type == "SakuraV0_9":
195 | messages.append({"role": "system", "content": "你是一个轻小说翻译模型,可以流畅地将日文翻译成简体中文,并正确使用人称代词。"})
196 | elif model_type == "SakuraV0_10":
197 | messages.append({"role": "system", "content": "你是一个轻小说翻译模型,可以流畅通顺地以日本轻小说的风格将日文翻译成简体中文,并联系上下文正确使用人称代词,不擅自添加原文中没有的代词。"})
198 | elif model_type == "SakuraV1_0":
199 | messages.append({"role": "system", "content": "你是一个轻小说翻译模型,可以流畅通顺地以日本轻小说的风格将日文翻译成简体中文,并联系上下文正确使用人称代词,不擅自添加原文中没有的代词。"})
200 | elif model_type == "GalTranslV2_6":
201 | messages.append({"role": "system", "content": "你是一个视觉小说翻译模型,可以通顺地使用给定的术语表以指定的风格将日文翻译成简体中文,并联系上下文正确使用人称代词。"})
202 | elif model_type == "GalTranslV3":
203 | messages.append({"role": "system", "content": "你是一个视觉小说翻译模型,可以通顺地使用给定的术语表以指定的风格将日文翻译成简体中文,并联系上下文正确使用人称代词。"})
204 | else:
205 | messages.append({"role": "system", "content": "你是一个轻小说翻译模型,可以流畅通顺地将日文翻译成简体中文。"})
206 |
207 | if context:
208 | history_text = "历史翻译:" + "\n".join(context)
209 | else:
210 | history_text = ""
211 |
212 | if model_type == "GalTranslV3":
213 | if use_dict:
214 | dict_str = '\n'.join([f"{k}->{v[0]}" for k, v in dict_data.items()])
215 | user_content = f"{history_text}\n参考以下术语表\n{dict_str}\n根据以上术语表的对应关系和备注,结合历史剧情和上下文,将下面的文本从日文翻译成简体中文:\n{text}"
216 | else:
217 | user_content = f"{history_text}\n结合历史剧情和上下文,将下面的文本从日文翻译成简体中文:\n{text}"
218 | messages.append({"role": "user", "content": user_content})
219 | else:
220 | if context:
221 | for c in context:
222 | messages.append({"role": "assistant", "content": c})
223 |
224 | if use_dict:
225 | dict_str = '\n'.join([f"{k}->{v[0]}" for k, v in dict_data.items()])
226 | messages.append({"role": "user", "content": f"根据上文和以下术语表:\n{dict_str}\n将下面的日文文本翻译成中文:{text}"})
227 | else:
228 | messages.append({"role": "user", "content": f"根据上文,将下面的日文文本翻译成中文:{text}"})
229 |
230 | temperature = 0.6 if model_type == "GalTranslV3" else 0.2
231 |
232 | data = {
233 | "model": "sukinishiro",
234 | "messages": messages,
235 | "temperature": temperature,
236 | "top_p": 0.3,
237 | "max_tokens": 384,
238 | "frequency_penalty": 0.2,
239 | "do_sample": True,
240 | "num_beams": 1,
241 | "repetition_penalty": 1.0
242 | }
243 | return data
244 |
245 | # Save translation progress
246 | def save_progress(data, filename, index, task_list):
247 | if filename.endswith(".json"):
248 | with open(filename, 'w', encoding='utf-8') as file:
249 | json.dump(data, file, ensure_ascii=False, indent=4)
250 | elif filename.endswith(".csv"):
251 | data.to_csv(filename, index=False, quoting=csv.QUOTE_ALL)
252 | config = load_config()
253 | config['last_processed'] = index
254 | config['task_list'] = task_list
255 | with open('config.json', 'w', encoding='utf-8') as file:
256 | json.dump(config, file, indent=4)
257 |
258 | # Main flow
259 | def main():
260 | config = load_config()
261 | if not config['endpoint']:
262 | print("请配置API endpoint后再运行程序。")
263 | return
264 |
265 |     # Initialize the dictionary
266 | dict_data, full_dict_str = initialize_dict(json.dumps(config.get('dict', {})))
267 | config['dict'] = dict_data
268 |
269 | task_list = config['task_list']
270 | if not task_list:
271 | print("未找到待翻译文件,请更新config.json。")
272 | return
273 |
274 | for task_name in task_list:
275 | if not os.path.exists(task_name):
276 | print(f"文件{task_name}不存在,跳过。")
277 | continue
278 |
279 | if task_name.endswith(".json"):
280 | with open(task_name, 'r', encoding='utf-8') as file:
281 | data = json.load(file)
282 | json_keys = list(data.keys())
283 | elif task_name.endswith(".csv"):
284 | data = pd.read_csv(task_name, encoding='utf-8')
285 | data['Original Text'] = data['Original Text'].astype(str)
286 | data['Machine translation'] = data['Machine translation'].astype(str)
287 | else:
288 | print(f"不支持的文件类型: {task_name}")
289 | continue
290 |
291 | total_keys = len(data)
292 | start_index = config['last_processed']
293 | api_num = len(config['endpoint'])
294 | previous_translations = []
295 | with ThreadPoolExecutor(max_workers=config['max_workers']) as executor:
296 | future_to_index = {}
297 | for i in range(start_index, total_keys):
298 | key = json_keys[i] if task_name.endswith(".json") else data.loc[i, 'Original Text']
299 | api_index = i % api_num
300 | future = executor.submit(translate_text_by_paragraph, key, i, api_index, config, previous_translations)
301 | future_to_index[future] = i
302 | for future in tqdm(as_completed(future_to_index), total=len(future_to_index), desc="任务进度"):
303 | index = future_to_index[future]
304 | try:
305 | translated_text = future.result()
306 | previous_translations.append(translated_text)
307 | if len(previous_translations) > config.get('context_size', 0):
308 | previous_translations.pop(0)
309 | if task_name.endswith(".json"):
310 | data[json_keys[index]] = translated_text
311 | if task_name.endswith(".csv"):
312 | data.loc[index, 'Machine translation'] = translated_text
313 | if (index + 1) % config['save_frequency'] == 0 or index + 1 == total_keys:
314 | save_progress(data, task_name, index + 1, task_list)
315 | except Exception as exc:
316 | print(f'{index + 1}行翻译发生异常: {exc}')
317 |
318 | if __name__ == "__main__":
319 | main()
320 |
--------------------------------------------------------------------------------
/Mtool/main_dev.py:
--------------------------------------------------------------------------------
1 | import json
2 | import requests
3 | import re
4 | import os
5 | import pandas as pd
6 | from tqdm import tqdm
7 | import unicodedata
8 | import csv
9 | import sys
10 | import shutil
11 | from concurrent.futures import ThreadPoolExecutor, as_completed
12 | import threading
13 | import time
14 |
15 | # Global state controlling the progress-bar display
16 | progress_bars = {}
17 | progress_lock = threading.Lock()
18 | debug_output = [] # buffer for debug output lines
19 |
20 | # Load the global configuration
21 | def load_config():
22 | if not os.path.exists("config.json"):
23 | config_data = {
24 | "last_processed": 0,
25 | "task_list": ["ManualTransFile.json"],
26 | "endpoint": ["http://127.0.0.1:5000/v1/chat/completions"],
27 |         "model_type": "galtransl",
28 | "model_version": "2.6",
29 | "use_dict": False,
30 | "dict": {},
31 | "dict_mode": "Partial",
32 | "save_frequency": 100,
33 | "shutdown": 0,
34 | "max_workers": 1,
35 | "context_size": 0
36 | }
37 | with open("config.json", 'w') as file:
38 | json.dump(config_data, file, indent=4)
39 | with open('config.json', 'r', encoding='utf-8') as file:
40 | return json.load(file)
41 |
42 | # Initialize the dictionary
43 | def initialize_dict(dict_str):
44 | if not dict_str:
45 | return {}, ""
46 | try:
47 | dict_data = json.loads(dict_str)
48 | dict_converted = {}
49 | for key, value in dict_data.items():
50 | if isinstance(value, list) and len(value) > 0:
51 | if len(value) == 1:
52 | dict_converted[key] = [value[0], ""]
53 | else:
54 | dict_converted[key] = value[:2]
55 | else:
56 | dict_converted[key] = [value, ""]
57 | dict_strings = get_dict_string_list(dict_converted)
58 | return dict_converted, "\n".join(dict_strings)
59 | except Exception as e:
60 | console_print(f"Error initializing dictionary: {e}")
61 | return {}, ""
62 |
63 | # Build the list of glossary strings
64 | def get_dict_string_list(kv_pairs):
65 | dict_list = []
66 | for key, value in kv_pairs.items():
67 | src = key
68 | dst = value[0]
69 | info = value[1]
70 | if info:
71 | dict_list.append(f"{src}->{dst} #{info}")
72 | else:
73 | dict_list.append(f"{src}->{dst}")
74 | return dict_list
75 |
76 | # Model version management
77 | def get_translation_model(model_name, model_version):
78 | if model_name.lower() == "sakura":
79 | if model_version == "0.8":
80 | return "SakuraV0_8"
81 | elif model_version == "0.9":
82 | return "SakuraV0_9"
83 | elif model_version == "0.10":
84 | return "SakuraV0_10"
85 | elif model_version == "1.0":
86 | return "SakuraV1_0"
87 | else:
88 | return "SakuraV1_0"
89 | elif model_name.lower() == "sakura32b":
90 | if model_version == "0.10":
91 | return "Sakura32bV0_10"
92 | else:
93 | return "Sakura32bV0_10"
94 | elif model_name.lower() == "galtransl":
95 | if model_version == "2.6":
96 | return "GalTranslV2_6"
97 | elif model_version == "3.0":
98 | return "GalTranslV3"
99 | else:
100 | return "GalTranslV2_6"
101 | else:
102 | return "SakuraV1_0"
103 |
104 | # Check whether the text contains Japanese characters
105 | def contains_japanese(text):
106 | text = unicodedata.normalize('NFKC', text)
107 | return bool(re.search(r'[\u3040-\u30ff\u3400-\u4DBF\u4E00-\u9FFF]', text)), text
108 |
109 | # Check whether the text is pure English (contains no Chinese characters)
110 | def is_pure_english(text):
111 |     # Only looks for CJK ideographs: the text counts as pure English
112 |     # when it contains no Chinese characters, otherwise this returns False
113 | return not bool(re.search(r'[\u4e00-\u9fff]', text))
114 |
115 | # Split text into paragraphs
116 | def split_text_with_newlines(text):
117 | paragraphs = re.split(r'(\r\n|\r|\n)', text)
118 | return paragraphs
119 |
120 | # Check whether the text looks like a file path
121 | def is_file_path(text):
122 |     # Judge from the text's shape (it ends in a three-character file extension)
123 | return bool(re.search(r'\.[a-zA-Z0-9]{3}$', text))
124 |
125 | # Punctuation fix-up helpers
126 | def fix_translation_end(original, translation):
127 | if translation.endswith("。") and not original.endswith("。"):
128 | translation = translation[:-1]
129 | if translation.endswith("。」") and not original.endswith("。」"):
130 | translation = translation[:-2] + "」"
131 | return translation
132 |
133 | def unescape_translation(original, translation):
134 |     if "\r" not in original:
135 |         translation = translation.replace("\r", "")  # drop control characters the model added that are absent from the original
136 |     if "\n" not in original:
137 |         translation = translation.replace("\n", "")
138 |     if "\t" not in original:
139 |         translation = translation.replace("\t", "")
140 |     return translation
141 |
142 | # Custom debug-output helper
143 | def console_print(*args, **kwargs):
144 | message = " ".join(map(str, args))
145 | with progress_lock:
146 |         # Append the message to the debug output buffer
147 |         debug_output.append(message)
148 |         # Cap the buffer length
149 |         if len(debug_output) > 20:
150 |             debug_output.pop(0)
151 |
152 |         # Clear the screen and reprint everything
153 |         print("\033[H\033[J", end="") # clear screen
154 |
155 |         # Print the buffered debug output
156 |         for line in debug_output:
157 |             print(line)
158 |
159 |         # Print blank lines as a separator
160 |         rows, columns = shutil.get_terminal_size()
161 |         print("\n" * 3) # leave room for the progress bars
162 |
163 |         # Refresh all progress bars
164 | refresh_all_progress_bars()
165 |
166 | # Refresh all progress bars
167 | def refresh_all_progress_bars():
168 | for bar in progress_bars.values():
169 | if bar:
170 | bar.refresh()
171 |
172 | # Translate text paragraph by paragraph
173 | def translate_text_by_paragraph(text, index, api_idx=0, config=None, previous_translations=None):
174 |     # Skip file paths and file names entirely
175 | if is_file_path(text):
176 | return text
177 |
178 | contains_jp, updated_text = contains_japanese(text)
179 | if contains_jp:
180 | segments = split_text_with_newlines(updated_text)
181 | translated_segments = []
182 | for segment in segments:
183 | if segment in ['\r\n', '\r', '\n']:
184 | translated_segments.append(segment)
185 | else:
186 | if segment:
187 | translated_segments.append(translate_text(segment, index, api_idx=api_idx, config=config, previous_translations=previous_translations))
188 | else:
189 | translated_segments.append(segment)
190 | translated_text = ''.join(translated_segments)
191 | return translated_text
192 | else:
193 | return text
194 |
195 | # Call the translation API
196 | def translate_text(text, index, api_idx=0, attempt=1, config=None, previous_translations=None):
197 | try:
198 | endpoint = config['endpoint'][api_idx]
199 | model_type = get_translation_model(config['model_type'], config['model_version'])
200 | context_size = config.get('context_size', 0)
201 | context = previous_translations[-context_size:] if previous_translations else []
202 | data = make_request_json(text, model_type, config['use_dict'], config['dict_mode'], config['dict'], context)
203 | response = requests.post(endpoint, json=data)
204 | response.raise_for_status()
205 |
206 | response_data = response.json()
207 | completion_tokens = response_data.get("usage", {}).get("completion_tokens", 0)
208 | max_tokens = data["max_tokens"]
209 |
210 |         # Check for degeneration; on retry, raise frequency_penalty
211 | if completion_tokens == max_tokens:
212 | console_print("模型可能发生退化,调整 frequency_penalty 并重试...")
213 | data["frequency_penalty"] = 0.8
214 | response = requests.post(endpoint, json=data)
215 | response.raise_for_status()
216 | response_data = response.json()
217 |
218 | except requests.RequestException as e:
219 | console_print(f'请求翻译API错误: {e}')
220 | return ""
221 |
222 | translated_text = response_data.get("choices")[0].get("message", {}).get("content", "")
223 | translated_text = translated_text.replace("将下面的日文文本翻译成中文:", "").replace("<|im_end|>", "")
224 | translated_text = fix_translation_end(text, translated_text)
225 | translated_text = unescape_translation(text, translated_text)
226 |
227 |     # If the translation came back as pure English, log the row number
228 | if is_pure_english(translated_text):
229 | console_print(f"警告:行号 {index} 的翻译结果为纯英文:'{translated_text}'")
230 |
231 | with open("english_translations.log", "a", encoding="utf-8") as log_file:
232 | log_file.write(f"行号: {index}, 原文: {text}, 翻译: {translated_text}\n")
233 |
234 | console_print(f"原文: {text}\n翻译: {translated_text}\n") # 调试信息,输出翻译前后的文本
235 | return translated_text
236 |
237 | # Build the JSON body for a translation request
238 | def make_request_json(text, model_type, use_dict, dict_mode, dict_data, context):
239 | messages = []
240 |
241 | if model_type == "SakuraV0_8":
242 | messages.append({"role": "system", "content": "你是一个简单的日文翻译模型,将日文翻译成简体中文。"})
243 | messages.append({"role": "user", "content": f"将下面的日文文本翻译成中文:{text}"})
244 | else:
245 | if model_type == "SakuraV0_9":
246 | messages.append({"role": "system", "content": "你是一个轻小说翻译模型,可以流畅地将日文翻译成简体中文,并正确使用人称代词。"})
247 | elif model_type == "SakuraV0_10":
248 | messages.append({"role": "system", "content": "你是一个轻小说翻译模型,可以流畅通顺地以日本轻小说的风格将日文翻译成简体中文,并联系上下文正确使用人称代词,不擅自添加原文中没有的代词。"})
249 | elif model_type == "SakuraV1_0":
250 | messages.append({"role": "system", "content": "你是一个轻小说翻译模型,可以流畅通顺地以日本轻小说的风格将日文翻译成简体中文,并联系上下文正确使用人称代词,不擅自添加原文中没有的代词。"})
251 | elif model_type == "GalTranslV2_6":
252 | messages.append({"role": "system", "content": "你是一个视觉小说翻译模型,可以通顺地使用给定的术语表以指定的风格将日文翻译成简体中文,并联系上下文正确使用人称代词。"})
253 | elif model_type == "GalTranslV3":
254 | messages.append({"role": "system", "content": "你是一个视觉小说翻译模型,可以通顺地使用给定的术语表以指定的风格将日文翻译成简体中文,并联系上下文正确使用人称代词。"})
255 | else:
256 | messages.append({"role": "system", "content": "你是一个轻小说翻译模型,可以流畅通顺地将日文翻译成简体中文。"})
257 |
258 | if context:
259 | history_text = "历史翻译:" + "\n".join(context)
260 | else:
261 | history_text = ""
262 |
263 | if model_type == "GalTranslV3":
264 | if use_dict:
265 | dict_str = '\n'.join([f"{k}->{v[0]}" for k, v in dict_data.items()])
266 | user_content = f"{history_text}\n参考以下术语表\n{dict_str}\n根据以上术语表的对应关系和备注,结合历史剧情和上下文,将下面的文本从日文翻译成简体中文:\n{text}"
267 | else:
268 | user_content = f"{history_text}\n结合历史剧情和上下文,将下面的文本从日文翻译成简体中文:\n{text}"
269 | messages.append({"role": "user", "content": user_content})
270 | else:
271 | if context:
272 | for c in context:
273 | messages.append({"role": "assistant", "content": c})
274 |
275 | if use_dict:
276 | dict_str = '\n'.join([f"{k}->{v[0]}" for k, v in dict_data.items()])
277 | messages.append({"role": "user", "content": f"参考以下术语表:\n{dict_str}\n根据以上术语表的对应关系和备注,结合历史剧情和上下文,将下面的文本从日文翻译成简体中文:{text}"})
278 | else:
279 | messages.append({"role": "user", "content": f"结合历史剧情和上下文,将下面的文本从日文翻译成简体中文:{text}"})
280 |
281 | temperature = 0.6 if model_type == "GalTranslV3" else 0.2
282 |
283 | data = {
284 | "model": "sukinishiro",
285 | "messages": messages,
286 | "temperature": temperature,
287 | "top_p": 0.3,
288 | "max_tokens": 512,
289 | "frequency_penalty": 0.2,
290 | "do_sample": True,
291 | "num_beams": 1,
292 | "repetition_penalty": 1.0
293 | }
294 | return data
295 |
296 | # Progress-management class
297 | class TranslationProgress:
298 | def __init__(self, task_name, total_items, num_threads):
299 | self.progress_file = f"{task_name}.progress.json"
300 | self.task_name = task_name
301 | self.total_items = total_items
302 | self.num_threads = num_threads
303 | self.lock = threading.Lock()
304 | self.initialize()
305 |
306 | def initialize(self):
307 | if os.path.exists(self.progress_file):
308 | with open(self.progress_file, 'r', encoding='utf-8') as file:
309 | self.progress_data = json.load(file)
310 | else:
311 |             # Create a new progress file
312 | chunk_size = self.total_items // self.num_threads
313 | remainder = self.total_items % self.num_threads
314 |
315 | threads_info = []
316 | start_idx = 0
317 |
318 | for i in range(self.num_threads):
319 |                 # Compute this thread's start/end range
320 | end_idx = start_idx + chunk_size - 1
321 | if i == self.num_threads - 1:
322 | end_idx += remainder
323 |
324 | threads_info.append({
325 | "thread_id": i,
326 | "start_index": start_idx,
327 | "end_index": end_idx,
328 | "current_index": start_idx,
329 | "previous_translations": []
330 | })
331 |
332 | start_idx = end_idx + 1
333 |
334 | self.progress_data = {
335 | "task_name": self.task_name,
336 | "total_items": self.total_items,
337 | "num_threads": self.num_threads,
338 | "threads": threads_info
339 | }
340 | self.save()
341 |
342 | def update_progress(self, thread_id, current_index, translation=None, context_size=0):
343 | with self.lock:
344 | thread_info = self.progress_data["threads"][thread_id]
345 | thread_info["current_index"] = current_index
346 |
347 |             # Update the translation history
348 | if translation and context_size > 0:
349 | thread_info.setdefault("previous_translations", [])
350 | thread_info["previous_translations"].append(translation)
351 |                 # Keep only the most recent N translations
352 | if len(thread_info["previous_translations"]) > context_size:
353 | thread_info["previous_translations"] = thread_info["previous_translations"][-context_size:]
354 |
355 | self.save()
356 |
357 | def get_thread_info(self, thread_id):
358 | return self.progress_data["threads"][thread_id]
359 |
360 | def get_previous_translations(self, thread_id):
361 | thread_info = self.progress_data["threads"][thread_id]
362 | return thread_info.get("previous_translations", [])
363 |
364 | def is_completed(self):
365 | for thread_info in self.progress_data["threads"]:
366 | if thread_info["current_index"] <= thread_info["end_index"]:
367 | return False
368 | return True
369 |
370 | def save(self):
371 | with open(self.progress_file, 'w', encoding='utf-8') as file:
372 | json.dump(self.progress_data, file, ensure_ascii=False, indent=4)
373 |
374 | # Translation worker-thread function
375 | def translate_worker(thread_id, task_name, data, json_keys, progress_manager, config):
376 | thread_info = progress_manager.get_thread_info(thread_id)
377 | start_index = thread_info["current_index"]
378 | end_index = thread_info["end_index"]
379 | api_num = len(config['endpoint'])
380 |
381 |     # Get this thread's translation history
382 | previous_translations = progress_manager.get_previous_translations(thread_id)
383 |
384 |     # Create a tqdm progress bar with extra information
385 | with progress_lock:
386 | pbar = tqdm(
387 | total=end_index - thread_info["start_index"] + 1,
388 | desc=f"线程 {thread_id}",
389 | position=thread_id,
390 | leave=True,
391 |             ncols=100, # wider bar to fit more information
392 | bar_format='{l_bar}{bar:20}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}]'
393 | )
394 | progress_bars[thread_id] = pbar
395 |
396 |     # Account for already-completed work and update the bar
397 | completed = start_index - thread_info["start_index"]
398 | if completed > 0:
399 | pbar.update(completed)
400 |
401 | for i in range(start_index, end_index + 1):
402 |         api_index = thread_id % api_num # assign an API endpoint by thread ID
403 |
404 | if task_name.endswith(".json"):
405 | key = json_keys[i]
406 | original_text = key
407 |         else: # CSV file
408 | original_text = data.loc[i, 'Original Text']
409 |
410 | translated_text = translate_text_by_paragraph(
411 | original_text, i, api_index, config, previous_translations
412 | )
413 |
414 |         # Update the data
415 | if task_name.endswith(".json"):
416 | data[json_keys[i]] = translated_text
417 |         else: # CSV file
418 | data.loc[i, 'Machine translation'] = translated_text
419 |
420 |         # Update progress and translation history
421 | progress_manager.update_progress(
422 | thread_id, i + 1, translated_text, config.get('context_size', 0)
423 | )
424 |
425 |         # Update the progress bar
426 | with progress_lock:
427 | pbar.update(1)
428 |
429 |         # Periodically save the whole translation file
430 | if (i + 1) % config['save_frequency'] == 0 or i + 1 > end_index:
431 | save_translation_data(data, task_name)
432 | console_print(f"线程 {thread_id}: 已保存进度 {i + 1}/{end_index + 1}")
433 |
434 |     # When done, close the progress bar and remove it from the registry
435 | with progress_lock:
436 | pbar.close()
437 | progress_bars[thread_id] = None
438 |
439 | # Save the translation data
440 | def save_translation_data(data, filename):
441 | if filename.endswith(".json"):
442 | with open(filename, 'w', encoding='utf-8') as file:
443 | json.dump(data, file, ensure_ascii=False, indent=4)
444 | elif filename.endswith(".csv"):
445 | data.to_csv(filename, index=False, quoting=csv.QUOTE_ALL)
446 |
447 | # Initialize the terminal display
448 | def setup_terminal():
449 |     # Clear the screen
450 |     os.system('cls' if os.name == 'nt' else 'clear')
451 |     # Move the cursor to the top
452 |     print("\033[H", end="")
453 |
454 | # Main entry point
455 | def main():
456 |     # Initialize the terminal display
457 | setup_terminal()
458 |
459 | config = load_config()
460 | if not config['endpoint']:
461 | console_print("请配置API endpoint后再运行程序。")
462 | return
463 |
464 |     # Initialize the dictionary
465 | dict_data, full_dict_str = initialize_dict(json.dumps(config.get('dict', {})))
466 | config['dict'] = dict_data
467 |
468 | task_list = config['task_list']
469 | if not task_list:
470 | console_print("未找到待翻译文件,请更新config.json。")
471 | return
472 |
473 | for task_name in task_list:
474 | if not os.path.exists(task_name):
475 | console_print(f"文件{task_name}不存在,跳过。")
476 | continue
477 |
478 |         # Load the data
479 | if task_name.endswith(".json"):
480 | with open(task_name, 'r', encoding='utf-8') as file:
481 | data = json.load(file)
482 | json_keys = list(data.keys())
483 | total_items = len(json_keys)
484 | elif task_name.endswith(".csv"):
485 | data = pd.read_csv(task_name, encoding='utf-8')
486 | data['Original Text'] = data['Original Text'].astype(str)
487 | data['Machine translation'] = data['Machine translation'].astype(str)
488 | total_items = len(data)
489 | json_keys = None
490 | else:
491 | console_print(f"不支持的文件类型: {task_name}")
492 | continue
493 |
494 |         # Create or load the progress manager
495 | num_threads = config['max_workers']
496 | progress_manager = TranslationProgress(task_name, total_items, num_threads)
497 |
498 | console_print(f"开始处理任务: {task_name} (总条目: {total_items})")
499 | console_print("调试信息将显示在顶部,进度条显示在底部")
500 |         time.sleep(1) # give the user a moment to read the messages
501 |
502 |         # Create and start the worker threads
503 | threads = []
504 | for thread_id in range(num_threads):
505 | thread = threading.Thread(
506 | target=translate_worker,
507 | args=(thread_id, task_name, data, json_keys, progress_manager, config)
508 | )
509 | threads.append(thread)
510 | thread.start()
511 | thread_info = progress_manager.get_thread_info(thread_id)
512 | console_print(f"线程 {thread_id} 已启动,处理范围: {thread_info['start_index']} - {thread_info['end_index']}, 当前进度: {thread_info['current_index']}")
513 |
514 |         # Wait for all threads to finish
515 | for thread in threads:
516 | thread.join()
517 |
518 | console_print(f"任务 {task_name} 翻译完成")
519 |
520 |         # After the task completes you can delete the progress file or keep it as a record
521 | # os.remove(f"{task_name}.progress.json")
522 |
523 | if __name__ == "__main__":
524 | try:
525 | main()
526 | except KeyboardInterrupt:
527 |         # Handle Ctrl+C interruption
528 | print("\n程序被用户中断,正在保存进度...")
529 |         # Progress-saving code could be added here
530 | except Exception as e:
531 | print(f"程序发生异常: {e}")
532 | finally:
533 |         # Make sure the terminal is cleaned up on exit
534 |         print("\033[?25h") # make sure the cursor is visible
535 |
--------------------------------------------------------------------------------