├── Mtool
    ├── requirements.txt
    ├── main.py
    └── main_dev.py
├── .gitignore
├── Translator++
    ├── pic
    │   ├── 1.png
    │   ├── 2.png
    │   └── 3.png
    ├── manual.py
    ├── manual2.py
    ├── 根据路径添加黄绿标签.js
    ├── 绿色标签添加路径翻译.js
    ├── README.md
    ├── api.py
    └── llm.py
└── README.md

/Mtool/requirements.txt:
--------------------------------------------------------------------------------
1 | pip
2 | wheel
3 | setuptools
4 | requests
5 | tqdm
6 | 
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | __pycache__
2 | *.gguf
3 | *.log
4 | ManualTransFile.json
5 | TranslatedFile.json
6 | TransFile/
--------------------------------------------------------------------------------
/Translator++/pic/1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fkiliver/RPGMaker_LLM_Translator/HEAD/Translator++/pic/1.png
--------------------------------------------------------------------------------
/Translator++/pic/2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fkiliver/RPGMaker_LLM_Translator/HEAD/Translator++/pic/2.png
--------------------------------------------------------------------------------
/Translator++/pic/3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fkiliver/RPGMaker_LLM_Translator/HEAD/Translator++/pic/3.png
--------------------------------------------------------------------------------
/Translator++/manual.py:
--------------------------------------------------------------------------------
1 | # 一个手动运行的脚本,用于将 MTool 导出的 ManualTransFile.json 进行批量翻译
2 | 
3 | from concurrent.futures import ThreadPoolExecutor
4 | from llm import LLM, translate
5 | from itertools import repeat
6 | from tqdm import tqdm
7 | import json
8 | 
9 | llm = LLM("sakura", "sakura-14b-qwen2.5-v1.0-q6k.gguf", 4, ["0", "1", "2", "3"])
10 | # 全局字典,只会将相关项传入模型
11 | global_dicts = ()
12 | 
13 | with open("ManualTransFile.json", "r", encoding="utf-8") as fp:
14 |     data = json.load(fp)
15 |     raw_texts = list(data.keys())
16 | 
17 | with ThreadPoolExecutor(4) as executor:
18 |     iterator = executor.map(translate, repeat(llm), raw_texts, repeat(()), repeat(()), repeat(global_dicts))
19 |     results = list(tqdm(iterator, total=len(raw_texts)))
20 | 
21 | with open("TranslatedFile.json", "w", encoding="utf-8") as fp:
22 |     json.dump(dict(zip(raw_texts, results)), fp, ensure_ascii=False, indent=4)
23 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | 
2 | 
3 | # RPGMaker_LLM_Translator
4 | 
5 | 
6 | 
7 | # 介绍
8 | 这是一个基于Mtool/Translator++和Sakura模型的RPGMaker游戏本地翻译器,能够提供高质量离线日文翻译
9 | 建议使用[Sakura-13B-Galgame翻译模型](https://github.com/SakuraLLM/Sakura-13B-Galgame),当前支持版本为Sakura v0.8/v0.9/v0.10pre1/Galtransl-v2.6
10 | 
11 | 项目经过重构,支持Mtool、Translator++和最新版本Sakura模型。
12 | 
13 | ## TODO
14 | - [x] 添加退化检测(仅MTool)
15 | - [x] 添加历史上文(仅MTool)
16 | - [x] 添加prompt字典(仅MTool)
17 | - [x] 添加并发
18 | - [x] 添加对Sakura v0.10支持
19 | - [x] 添加对Sakura v1.0支持
20 | - [x] 添加对Galtransl-v2.6支持
21 | 
22 | ## 快速开始
23 | 首先需要部署Sakura模型,推荐使用Galtransl模型
24 | 请参考[Sakura模型部署教程](https://github.com/SakuraLLM/SakuraLLM/wiki)
25 | 
26 | ### Mtool
27 | 部署教程:详见[本仓库wiki](https://github.com/fkiliver/RPGMaker_LLM_Translator/wiki)
28 | 
29 | ### Translator++
30 | 详见[本仓库wiki](https://github.com/fkiliver/RPGMaker_LLM_Translator/wiki)
31 | 
32 | 在Translator++上安装ChatGPT插件
33 | ![image](https://github.com/user-attachments/assets/b77fc7e6-cb04-4efc-8488-203ac74224ac)
34 | 
35 | 然后便可以开始翻译了
36 | 
--------------------------------------------------------------------------------
/Translator++/manual2.py:
--------------------------------------------------------------------------------
1 | # 一个手动运行的脚本,用于将 AutoTranslator 导出的文件进行批量翻译
2 | # 会自动遍历并翻译 TransFile 文件夹下的所有文件
3 | 
4 | from concurrent.futures import ThreadPoolExecutor
5 | from llm import LLM, translate
6 | from itertools import repeat
7 | from tqdm import tqdm
8 | import json
9 | import os
10 | 
11 | folder = "../TransFile"
12 | llm = LLM("sakura", "sakura-14b-qwen2.5-v1.0-q6k.gguf", 4, ["0", "1", "2", "3"])
13 | # 全局字典,只会将相关项传入模型
14 | global_dicts = ()
15 | 
16 | for filename in tqdm(os.listdir(folder)):
17 |     filepath = f"{folder}/{filename}"
18 |     with open(filepath, "r", encoding="utf-8") as fp:
19 |         raw_texts = [x.split("=")[0] for x in fp.readlines()]
20 | 
21 |     with ThreadPoolExecutor(4) as executor:
22 |         iterator = executor.map(translate, repeat(llm), raw_texts, repeat(()), repeat(()), repeat(global_dicts))
23 |         results = list(tqdm(iterator, total=len(raw_texts)))
24 | 
25 |     with open(filepath, "w", encoding="utf-8") as fp:
26 |         for i in range(len(raw_texts)):
27 |             fp.write(f"{raw_texts[i]}={results[i]}\n")
28 | 
--------------------------------------------------------------------------------
/Translator++/根据路径添加黄绿标签.js:
--------------------------------------------------------------------------------
1 | if (!Array.isArray(this.context)) {
2 |     return;
3 | }
4 | const regexs = [
5 |     /^Actors\/\d+\/note$/,
6 |     /^Animations.*?$/,
7 |     /^Armors\/\d+\/note$/,
8 |     /^CommonEvents\/\d+\/name$/,
9 |     /^CommonEvents\/\d+\/list\/\d+\/comment$/,
10 |     /^Enemies\/\d+\/note$/,
11 |     /^Items\/\d+\/note$/,
12 |     /^Map\d{3}\/events\/\d+\/(name|note)$/,
13 |     /^Mapinfos.*?$/,
14 |     /^Skills\/\d+\/note$/,
15 |     /^States\/\d+\/note$/,
16 |     /^System\/switches\/\d+$/,
17 |     /^System\/variables\/\d+$/,
18 |     /^Tilesets.*?$/,
19 |     /^Troops\/\d+\/name$/,
20 |     /^Weapons\/\d+\/note$/,
21 |     /^.*?MZ Plugin Command.*?$/,
22 |     /^.*?Control Variables.*?$/
23 | ];
24 | var count = 0;
25 | for (const context of this.context) {
26 |     for (const regex of regexs) {
27 |         if (regex.test(context)) {
28 |             count++;
29 |             break;
30 |         }
31 |     }
32 | }
33 | var index = this.tags.indexOf("yellow");
34 | if (index > -1) {
35 |     this.tags.splice(index, 1);
36 | }
37 | index = this.tags.indexOf("green");
38 | if (index > -1) {
39 |     this.tags.splice(index, 1);
40 | }
41 | if (count === this.context.length) {
42 |     this.tags.push("yellow");
43 | } else if (count > 0) {
44 |     this.tags.push("green");
45 | }
46 | 
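// 补充注释(基于上述脚本逻辑与 Translator++/README.md 中的用法说明整理;正则列表并非通用配置,需按具体游戏增减):
// - 用法:选中需要处理的文件后,通过 "With XX Selected -> Run Automation -> For Each Row" 对每一行执行本脚本;
// - 逻辑:统计 this.context 中有多少条上下文路径命中上面的正则(命中的路径一般不需要翻译),
//   全部命中则打 "yellow" 标签(整行不需要翻译),部分命中则打 "green" 标签(该行仅有部分上下文路径需要翻译);
// - 每次执行前会先移除已有的 yellow/green 标签再重新判断,因此重复执行是安全的。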
-------------------------------------------------------------------------------- /Translator++/绿色标签添加路径翻译.js: -------------------------------------------------------------------------------- 1 | if (!this.tags.includes("green")) { 2 | return; 3 | } 4 | if (!Array.isArray(this.context)) { 5 | return; 6 | } 7 | const regexs = [ 8 | /^Actors\/\d+\/note$/, 9 | /^Animations.*?$/, 10 | /^Armors\/\d+\/note$/, 11 | /^CommonEvents\/\d+\/name$/, 12 | /^CommonEvents\/\d+\/list\/\d+\/comment$/, 13 | /^Enemies\/\d+\/note$/, 14 | /^Items\/\d+\/note$/, 15 | /^Map\d{3}\/events\/\d+\/(name|note)$/, 16 | /^Mapinfos.*?$/, 17 | /^Skills\/\d+\/note$/, 18 | /^States\/\d+\/note$/, 19 | /^System\/switches\/\d+$/, 20 | /^System\/variables\/\d+$/, 21 | /^Tilesets.*?$/, 22 | /^Troops\/\d+\/name$/, 23 | /^Weapons\/\d+\/note$/, 24 | /^.*?MZ Plugin Command.*?$/, 25 | /^.*?Control Variables.*?$/ 26 | ]; 27 | if (!Array.isArray(this.parameters)) { 28 | this.parameters = [] 29 | for (let i = 0; i < this.context.length; i++) { 30 | this.parameters.push({ 31 | contextStr: this.context[i] 32 | }); 33 | } 34 | } 35 | for (let i = 0; i < this.context.length; i++) { 36 | let context = this.context[i]; 37 | this.parameters[i]["translation"] = ""; 38 | for (const regex of regexs) { 39 | if (regex.test(context)) { 40 | this.parameters[i]["translation"] = this.cells[0]; 41 | break; 42 | } 43 | } 44 | } 45 | trans.project.files[this.file].parameters[this.rowId] = this.parameters 46 | -------------------------------------------------------------------------------- /Translator++/README.md: -------------------------------------------------------------------------------- 1 | # Translator++工作流 2 | 3 | 由于RPGMaker制作的游戏在文本细节上各不相同,在翻译了数个不同的游戏后,我总结了一套比较优秀的工作流,希望可以帮大家获得更好的翻译质量。 4 | 5 | **本文内容有较高上手门槛** 6 | 7 | ## Translator++设置 8 | 9 | 首先是自定义控制符,在翻译时,所有符合这些内容的文本都会被替换为`$dat[1]`这样的格式。由于各个游戏的控制符格式不同,官方默认的这些可能有未覆盖到的,需要单独处理。如图所示。 10 | 11 | ![](pic/1.png) 12 | 13 | 以下是一些遇到过的情况,可以根据实际情况决定是否采用: 14 | 15 | - 在每一个正则表达式后添加`\d*`,这样可以将控制符后的数字也包含进去,避免`\C[1]1000`被后端替换为`控制符11000`。 16 | - 删除第四行的`\!`,这个不关键。 17 | 18 | 或者,你也可以直接将2至4行整体替换为 19 | 20 | ```re 21 | /(\\[a-zA-Z0-9]+(?:\[.*?\]|<.*?\>)\d*|\\[a-zA-Z\{\}\\\$\.\|<\>\^]\d*)+/gi 22 | ``` 23 | 24 | 如果想得到更好的效果,我建议将更复杂的逻辑和提示词拼接工作放到Python后端处理。所以在OpenAI ChatGPT插件设置中,我建议清空**System Message Template**,将**Body Message Template**设置为仅包含`${SOURCE_TEXT}`,如图所示。 25 | 26 | ![](pic/2.png) 27 | 28 | 还有一些其它设置,例如如果想使用这个文件夹中的api,还需要将**Target URL**设置为`http://127.0.0.1:1500/v1/chat/completions`, **Batch Delay**设置为1,**Max Characters per Batch**设置为65536,**Max row per concurrent requests**尽量调大。 29 | 30 | ## 为特定路径的文本打标签 31 | 32 | MTools翻译的一个缺点就是会把所有字符串都翻译了,而Translator++也会读取很多无意义的字符串。翻译这些字符串不仅耗时,而且可能会破坏一些游戏逻辑。可以右键行,通过**Row Properties**查看字符串的路径,如图所示。 33 | 34 | ![](pic/3.png) 35 | 36 | Translator++拥有js脚本执行功能,选中需要执行脚本的文件,右键,在**With XX Selected -> Run Automation -> For Each Row**执行脚本。 37 | 38 | 更多执行细节,请参考[官方文档](https://dreamsavior.net/docs/translator/execute-script/pin-your-automation-to-quickly-launch-from-translator/)。 39 | 40 | 我推荐首先使用[脚本1](根据路径添加黄绿标签.js)对每行打标签,黄色为所有上下文路径都不需要翻译,绿色为仅有部分上下文路径需要翻译。 41 | 42 | 注意,由于每个游戏的差异,没有一劳永逸的正则表达式列表。为了提高翻译质量,建议开始翻以前人工浏览一遍,增减需要的正则表达式。 43 | 44 | ## 开始翻译 45 | 46 | 翻译的时候,红色和蓝色标签是Translator++加上的,记得和黄色的标签一起加入**黑名单**,这些行都不处理。 47 | 48 | ## Python后端 49 | 50 | 虽然重复造轮子不是好行为,但是一个简单的Python后端就可以做到很多事情,还是值得简单造一个轮子的。 51 | 52 | [llm.py](llm.py) 和 [api.py](api.py) 这两个文件实现了一些简单的功能,文件注释写的比较详细,这里就不再赘述代码细节,只简单介绍。 53 | 54 | ### 使用方式 55 | 56 | 库依赖不多,主要就需要安装一个 [llama-cpp-python](https://llama-cpp-python.readthedocs.io/en/latest/) 
和一个 FastAPI。 57 | 58 | 在修改了 [api.py](api.py) 的一些参数之后,只需要简单 `python api.py` 即可启动。 59 | 60 | ```py 61 | port = 1500 62 | logging.basicConfig(filename="log.log") 63 | history_deque = deque(maxlen=3) 64 | llm = LLM("galtransl", "Sakura-GalTransl-7B-v3-Q5_K_S.gguf", 8, ["0", "1", "2", "3", "0", "1", "2", "3"]) 65 | app = FastAPI() 66 | dicts = [ 67 | {"src": "控制符", "dst": "控制符"} 68 | ] 69 | ``` 70 | 71 | port为服务启动的端口号。 72 | 73 | basicConfig可以设置日志文件名,日志会记录控制符和行数翻译前后不一致的部分,供人工更正。 74 | 75 | history_deque控制最大提供给LLM的上文数量。 76 | 77 | LLM的参数都有接口说明,值得一提的是工作进程数和CUDA列表: 78 | 79 | - 如果显存足够,建议一张卡上跑两个工作进程,可以吃满显卡算力,不推荐更多。 80 | - 如果有多张卡,可以每张卡上都跑单独的工作进程,这个配置是4张4090的参考配置。 81 | - 这边的工作进程越多,Translator++就应该设置越大的**Max row per concurrent requests**,以减少上下文切换的损耗。 82 | 83 | app一般不用修改。 84 | 85 | dicts是提供给模型的字典,如果要使用这个后端,至少保留控制符这个说明。 86 | 87 | 如果不想深究,下面的小节可以跳过,直接看结束翻译段落即可。 88 | 89 | ### 控制符格式 90 | 91 | 代码中有一个处理,就是将Translator++的`${dat[1]}`这样的控制符全部替换为`控制符1`这样的文本,翻译完之后再替换回去。有什么用呢?请看例子: 92 | 93 | > 味方単体に1ターン『${dat[1]}無敵』を付与 94 | 95 | 这段文本,如果直接让LLM翻译,很可能会丢失掉`${dat[1]}`这样的控制符,或者是插入在错误的位置。我也试过将前后分别翻译再拼接,反而会丢失上下文。这个问题卡了我很久,一度想让我去再训练一个可以处理控制符的模型。某一天我观察到LLM会倾向于原样输出中文文本,这给了我灵感,如果将控制符改成中文: 96 | 97 | > 味方単体に1ターン『控制符1無敵』を付与 98 | 99 | 它就会翻译出正常的结果,并且把控制符放在合适的位置。哪怕是这种多控制符的文本: 100 | 101 | > 控制符1敵全体にダメージを与え『控制符2心傷』『控制符3心弱』状態にする。 102 | 103 | 经过测试也可以正确翻译并处理控制符的位置。 104 | 105 | ### SG说明格式 106 | 107 | 代码中还有对``格式的说明的处理,例如: 108 | 109 | > 生徒達はCPを増やしたりします。 111 | > 増やしたCPは、スキルツリー呪力領域の開放や、 112 | > アイテム合成に使えます。> 113 | > 114 | > 115 | 116 | 这个里面的key是不能翻译的,而value是需要翻译的,所以代码对其进行了简单的提取处理。 117 | 118 | ## 结束翻译 119 | 120 | 翻译完成后,记得将日志中记录的错误进行简单的人工修正。 121 | 122 | 然后使用[脚本2](绿色标签添加路径翻译.js)将绿色标签的上下文翻译自动设置完。 123 | 124 | 最后就可以直接注入翻译开始游戏。 125 | -------------------------------------------------------------------------------- /Translator++/api.py: -------------------------------------------------------------------------------- 1 | from collections import Counter, deque 2 | from concurrent.futures import ThreadPoolExecutor 3 | from fastapi import FastAPI, Request 4 | from llm import LLM, translate 5 | import logging 6 | import uvicorn 7 | import json 8 | import re 9 | 10 | port = 1500 11 | logging.basicConfig(filename="log.log") 12 | history_deque = deque(maxlen=3) 13 | llm = LLM("galtransl", "Sakura-GalTransl-7B-v3-Q5_K_S.gguf", 8, ["0", "1", "2", "3", "0", "1", "2", "3"]) 14 | app = FastAPI() 15 | # 全局字典,只会将相关项传入模型 16 | global_dicts = [ 17 | {"src": "原文", "dst": "译文", "info": "说明(可选)"} 18 | ] 19 | 20 | def text_translate(text: str, history: tuple[str]) -> str: 21 | """预处理文本并执行翻译 22 | 23 | Args: 24 | text (str): 可能包含`${dat[数字]}`格式控制符的文本 25 | history (tuple[str]): 历史翻译上下文(需传入可哈希的tuple) 26 | 27 | Returns: 28 | str: 翻译后的文本 29 | 30 | Note: 31 | 1. 自动转换 ${dat[1]} ↔ 控制符1 的格式 32 | 2. 校验翻译前后控制符数量和行数是否一致,最多重试10次 33 | 3. 
超过最多重试次数时会记录警告日志 34 | """ 35 | pattern1 = r"\$\{dat\[(\d+)\]\}" 36 | pattern2 = r"控制符(\d+)" 37 | 38 | # 重试时控制符会继续向后标号,以提供不同的原文来提高成功率 39 | counter = 0 40 | def replace_to_chinese(match): 41 | nonlocal counter 42 | counter += 1 43 | placeholder = "控制符" + str(counter) 44 | dat_mapping[placeholder] = match.group(0) 45 | return placeholder 46 | 47 | def replace_back_to_dat(match): 48 | placeholder = match.group(0) 49 | return dat_mapping.get(placeholder, placeholder) 50 | 51 | retry = True 52 | retry_counter = 0 53 | while retry and retry_counter < 10: 54 | dat_mapping = {} 55 | retry = False 56 | retry_counter += 1 57 | 58 | before = Counter(re.findall(pattern1, text)) 59 | line_num = len(text.splitlines()) 60 | result = re.sub(pattern1, replace_to_chinese, text) 61 | dat_dicts = ({"src": key, "dst": key} for key in dat_mapping.keys()) 62 | result = translate(llm, result, history, dat_dicts, global_dicts) 63 | result = re.sub(pattern2, replace_back_to_dat, result) 64 | after = Counter(re.findall(pattern1, result)) 65 | 66 | if before != after: 67 | # logging.warning(f"{before} != {after}\n{text}\n{result}") 68 | retry = True 69 | elif line_num != len(result.splitlines()): 70 | # logging.warning(f"line_num mismatch\n{text}\n{result}") 71 | retry = True 72 | if retry: 73 | logging.warning(f"stop retry after {retry_counter} attempts\n{text}\n{result}") 74 | # elif retry_counter > 1: 75 | # logging.warning(f"get correct translation after {retry_counter} attempts\n{text}\n{result}") 76 | 77 | return result 78 | 79 | def data_translate(data: str, history: tuple[str]) -> str: 80 | """处理包含的复合数据翻译 81 | 82 | Args: 83 | data (str): 可能包含标签的文本 84 | history (tuple[str]): 历史翻译上下文(需传入可哈希的tuple) 85 | 86 | Returns: 87 | str: 翻译后的完整文本 88 | 89 | Note: 90 | 1. 优先提取结构进行分段翻译 91 | 2. 无标签时直接调用text_translate 92 | 3. 保持原标签结构不变只翻译内容部分 93 | """ 94 | pattern = r"" 95 | finds = re.findall(pattern, data, re.DOTALL) 96 | if len(finds) > 0: 97 | for raw in finds: 98 | index = raw.find(":") 99 | if index == -1: 100 | continue 101 | text = raw[index + 1 : -1] 102 | text = text_translate(text, history) 103 | data = data.replace(raw, f"{raw[:index]}:{text}>") 104 | else: 105 | data = text_translate(data, history) 106 | return data 107 | 108 | @app.post("/v1/chat/completions") 109 | async def read_item(request: Request): 110 | """批量翻译API端点(POST方法) 111 | 112 | Args: 113 | request (Request): FastAPI请求对象,需包含: 114 | { 115 | "messages": [{ 116 | "role": "user", 117 | "content": "[\"text1\", \"text2\"]" # JSON字符串数组 118 | }] 119 | } 120 | 121 | Returns: 122 | dict: 格式化的响应数据: 123 | { 124 | "choices": [{ 125 | "message": { 126 | "content": "[\"trans1\", \"trans2\"]" # JSON字符串数组 127 | } 128 | }] 129 | } 130 | 131 | Note: 132 | 1. 使用ThreadPoolExecutor实现多文本并发翻译 133 | 2. 维护全局history_deque保存最近3条历史记录 134 | 3. 
每个文本会附带其之前3条文本作为上文 135 | """ 136 | data = await request.json() 137 | data = data["messages"][0]["content"] 138 | data = json.loads(data) 139 | history = [] 140 | for d in data: 141 | history.append(tuple(history_deque)) 142 | history_deque.append(d) 143 | with ThreadPoolExecutor(len(data)) as executor: 144 | data = executor.map(data_translate, data, history) 145 | return {"choices": [{"message": {"content": json.dumps(list(data))}}]} 146 | 147 | @app.get("/") 148 | def read_item(text: str): 149 | """单条文本翻译API端点(GET方法) 150 | 151 | Args: 152 | text (str): 通过URL参数传递的待翻译文本 153 | 154 | Returns: 155 | str: 直接返回翻译结果字符串 156 | """ 157 | result = translate(llm, text, (), (), ()) 158 | return result 159 | 160 | if __name__ == '__main__': 161 | uvicorn.run(app, port=port) 162 | -------------------------------------------------------------------------------- /Translator++/llm.py: -------------------------------------------------------------------------------- 1 | from llama_cpp import Llama 2 | from multiprocessing import Pool 3 | from functools import lru_cache 4 | import os 5 | 6 | def contains_japanese(text): 7 | """检查文本是否包含日文片假名 8 | 9 | Args: 10 | text (str): 待检测的文本 11 | 12 | Returns: 13 | bool: 如果文本中包含日文片假名(Unicode范围3040-30FF)返回True,否则返回False 14 | """ 15 | for char in text: 16 | if "\u3040" <= char <= "\u30FF": 17 | return True 18 | return False 19 | 20 | def _init_worker(model_path: str, cuda_device: str): 21 | """ 22 | 初始化工作进程的LLM模型 23 | 24 | Args: 25 | model_path (str): 模型文件路径 26 | cuda_device (str): 指定使用的CUDA设备ID 27 | """ 28 | global worker_model 29 | print(f"PID: {os.getpid()} CUDA: {cuda_device}") 30 | os.environ["CUDA_VISIBLE_DEVICES"] = cuda_device 31 | worker_model = Llama(model_path, n_gpu_layers=-1, n_ctx=2048, verbose=False) 32 | 33 | def _get_glossary(gpt_dicts: list[dict]) -> str: 34 | """ 35 | 将术语字典列表格式化为字符串 36 | 37 | Args: 38 | gpt_dicts (list[dict]): 术语字典列表,每个字典应包含: 39 | - src: 源语言术语 40 | - dst: 目标语言翻译 41 | - info(可选): 附加信息 42 | 43 | Returns: 44 | str: 格式化后的术语表字符串,每行格式为"src->dst #info"或"src->dst" 45 | 46 | Example: 47 | >>> _get_glossary([{"src": "猫", "dst": "cat", "info": "动物"}]) 48 | >>> '猫->cat #动物\\n' 49 | """ 50 | glossary = "" 51 | for gpt in gpt_dicts: 52 | if "info" in gpt.keys(): 53 | glossary += "{}->{} #{}\n".format(gpt["src"], gpt["dst"], gpt["info"]) 54 | else: 55 | glossary += "{}->{}\n".format(gpt["src"], gpt["dst"]) 56 | return glossary 57 | 58 | def _process_translate(model_name: str, text: str, history: list[dict] = [], gpt_dicts: list[dict] = []) -> str: 59 | """ 60 | 执行单条文本的翻译 61 | 62 | Args: 63 | model_name (str): 模型名称,支持"sakura"或"galtransl" 64 | text (str): 待翻译的日文文本 65 | history (list[dict], optional): 对话历史记录 66 | gpt_dicts (list[dict], optional): 术语字典列表 67 | 68 | Returns: 69 | str: 翻译后的中文文本 70 | """ 71 | messages = [] 72 | if model_name == "sakura": 73 | messages.append({"role": "system", "content": "你是一个轻小说翻译模型,可以流畅通顺地以日本轻小说的风格将日文翻译成简体中文,并联系上下文正确使用人称代词,不擅自添加原文中没有的代词。"}) 74 | for item in history: 75 | messages.append({"role": "assistant", "content": item}) 76 | if len(gpt_dicts) == 0: 77 | user_prompt = "将下面的日文文本翻译成中文:" + text 78 | else: 79 | user_prompt = "根据以下术语表(可以为空):\n" 80 | user_prompt += _get_glossary(gpt_dicts) 81 | user_prompt += "将下面的日文文本根据对应关系和备注翻译成中文:" + text 82 | 83 | elif model_name == "galtransl": 84 | messages.append({"role": "system", "content": "你是一个视觉小说翻译模型,可以通顺地使用给定的术语表以指定的风格将日文翻译成简体中文,并联系上下文正确使用人称代词,注意"}) 85 | user_prompt = "历史翻译:\n" + "\n".join(history) + "\n" 86 | if len(gpt_dicts) != 0: 87 | user_prompt += "参考以下术语表(可为空,格式为src->dst 
#备注):\n" 88 | user_prompt += _get_glossary(gpt_dicts) 89 | user_prompt += "根据以上术语表的对应关系和备注,结合历史剧情和上下文,将下面的文本从日文翻译成简体中文:\n" + text 90 | 91 | messages.append({"role": "user", "content": user_prompt}) 92 | if model_name == "sakura": 93 | res = worker_model.create_chat_completion(messages=messages, temperature=0.1, top_p=0.3, repeat_penalty=1, max_tokens=512, frequency_penalty=0.2) 94 | elif model_name == "galtransl": 95 | res = worker_model.create_chat_completion(messages=messages, temperature=0.6, top_p=0.8, repeat_penalty=1, max_tokens=512, frequency_penalty=0.1) 96 | return res["choices"][0]["message"]["content"] 97 | 98 | class LLM: 99 | """ 100 | 多进程LLM翻译器主类 101 | 102 | Attributes: 103 | model_name (str): 模型名称 104 | pool (multiprocessing.Pool): 工作进程池 105 | """ 106 | def __init__(self, model_name: str, model_path: str, num_process: int, cuda_device: list[str]): 107 | """ 108 | 初始化LLM翻译器 109 | 110 | Args: 111 | model_name (str): 模型名称 ("sakura" | "galtransl") 112 | model_path (str): 模型文件路径 113 | num_process (int): 工作进程数 114 | cuda_device (list[str]): 每个进程使用的CUDA设备ID列表 115 | 116 | Note: 117 | - cuda_device列表长度应与num_process匹配 118 | """ 119 | self.model_name = model_name 120 | self.pool = Pool(num_process) 121 | init_args = [(model_path, cuda_device[i]) for i in range(num_process)] 122 | self.pool.starmap(_init_worker, init_args) 123 | 124 | def translate(self, text: str, history: list[dict] = [], gpt_dicts: list[dict] = []): 125 | """ 126 | 提交单个翻译任务到进程池 127 | 128 | Args: 129 | text (str): 待翻译文本 130 | history (list[dict], optional): 历史对话 131 | gpt_dicts (list[dict], optional): 术语表 132 | 133 | Returns: 134 | multiprocessing.pool.AsyncResult: 异步结果对象 135 | """ 136 | return self.pool.apply_async(_process_translate, (self.model_name, text, history, gpt_dicts)) 137 | 138 | def batch_translate(self, datas: list[dict]) -> list[str]: 139 | """ 140 | 批量翻译文本 141 | 142 | Args: 143 | datas (list[dict]): 待翻译数据列表,每个元素应包含: 144 | - text: 待翻译文本 145 | - history: 历史对话 146 | - gpt_dicts: 术语表 147 | 148 | Returns: 149 | list[str]: 翻译结果列表,顺序与输入一致 150 | 151 | Note: 152 | - 每个 key 都必须有值,即使是空列表 153 | 154 | Example: 155 | >>> translator.batch_translate([{"text": "こんにちは", "history": [], "gpt_dicts": []}]) 156 | >>> ['你好'] 157 | """ 158 | tasks = [self.__translate(data["text"], data["history"], data["gpt_dicts"]) for data in datas] 159 | results = [task.get() for task in tasks] 160 | return results 161 | 162 | @lru_cache(maxsize=1024) 163 | def translate(llm: LLM, text: str, history: tuple[str], local_dicts: tuple[str], global_dicts: tuple[str]) -> str: 164 | """带缓存的单条文本翻译核心函数 165 | 166 | Args: 167 | llm (LLM): 多进程LLM翻译器实例 168 | text (str): 待翻译文本(自动替换全角空格为半角空格) 169 | history (tuple[str]): 历史翻译上下文(需传入可哈希的tuple) 170 | local_dicts (tuple[str]): 局部字典(需传入可哈希的tuple)无论文本中是否出现都会传入翻译器 171 | global_dicts (tuple[str]): 全局字典(需传入可哈希的tuple)只会将文本中出现的部分传入翻译器 172 | 173 | Returns: 174 | str: 翻译后的中文文本 175 | 176 | Note: 177 | 1. 使用LRU缓存(最多1024条)加速重复文本翻译 178 | 2. 非日文文本会直接返回原内容 179 | 3. 
实际调用llm.translate()执行翻译 180 | """ 181 | text = text.replace("\u3000", " ") 182 | if not contains_japanese(text): 183 | return text 184 | gpt_dicts = list(local_dicts) 185 | for item in global_dicts: 186 | if item["src"] in text: 187 | gpt_dicts.append(item) 188 | result = llm.translate(text, history, gpt_dicts).get() 189 | return result 190 | -------------------------------------------------------------------------------- /Mtool/main.py: -------------------------------------------------------------------------------- 1 | import json 2 | import requests 3 | import re 4 | import os 5 | import pandas as pd 6 | from tqdm import tqdm 7 | import unicodedata 8 | import csv 9 | import sys 10 | from concurrent.futures import ThreadPoolExecutor, as_completed 11 | import threading 12 | 13 | # 读取全局配置信息 14 | def load_config(): 15 | if not os.path.exists("config.json"): 16 | config_data = { 17 | "last_processed": 0, 18 | "task_list": ["ManualTransFile.json"], 19 | "endpoint": ["http://127.0.0.1:5000/v1/chat/completions"], 20 | "model_type": "Sgaltransl", 21 | "model_version": "2.6", 22 | "use_dict": False, 23 | "dict": {}, 24 | "dict_mode": "Partial", 25 | "save_frequency": 100, 26 | "shutdown": 0, 27 | "max_workers": 1, 28 | "context_size": 0 29 | } 30 | with open("config.json", 'w') as file: 31 | json.dump(config_data, file, indent=4) 32 | with open('config.json', 'r', encoding='utf-8') as file: 33 | return json.load(file) 34 | 35 | # 初始化字典 36 | def initialize_dict(dict_str): 37 | if not dict_str: 38 | return {}, "" 39 | try: 40 | dict_data = json.loads(dict_str) 41 | dict_converted = {} 42 | for key, value in dict_data.items(): 43 | if isinstance(value, list) and len(value) > 0: 44 | if len(value) == 1: 45 | dict_converted[key] = [value[0], ""] 46 | else: 47 | dict_converted[key] = value[:2] 48 | else: 49 | dict_converted[key] = [value, ""] 50 | dict_strings = get_dict_string_list(dict_converted) 51 | return dict_converted, "\n".join(dict_strings) 52 | except Exception as e: 53 | print(f"Error initializing dictionary: {e}") 54 | return {}, "" 55 | 56 | # 获取字典字符串列表 57 | def get_dict_string_list(kv_pairs): 58 | dict_list = [] 59 | for key, value in kv_pairs.items(): 60 | src = key 61 | dst = value[0] 62 | info = value[1] 63 | if info: 64 | dict_list.append(f"{src}->{dst} #{info}") 65 | else: 66 | dict_list.append(f"{src}->{dst}") 67 | return dict_list 68 | 69 | # 模型版本管理 70 | def get_translation_model(model_name, model_version): 71 | if model_name.lower() == "sakura": 72 | if model_version == "0.8": 73 | return "SakuraV0_8" 74 | elif model_version == "0.9": 75 | return "SakuraV0_9" 76 | elif model_version == "0.10": 77 | return "SakuraV0_10" 78 | elif model_version == "1.0": 79 | return "SakuraV1_0" 80 | else: 81 | return "SakuraV1_0" 82 | elif model_name.lower() == "sakura32b": 83 | if model_version == "0.10": 84 | return "Sakura32bV0_10" 85 | else: 86 | return "Sakura32bV0_10" 87 | elif model_name.lower() == "galtransl": 88 | if model_version == "2.6": 89 | return "GalTranslV2_6" 90 | elif model_version == "3.0": 91 | return "GalTranslV3" 92 | else: 93 | return "GalTranslV2_6" 94 | else: 95 | return "SakuraV1_0" 96 | 97 | # 检查文本是否包含日文字符 98 | def contains_japanese(text): 99 | text = unicodedata.normalize('NFKC', text) 100 | return bool(re.search(r'[\u3040-\u30ff\u3400-\u4DBF\u4E00-\u9FFF]', text)), text 101 | 102 | # 分割文本段落 103 | def split_text_with_newlines(text): 104 | paragraphs = re.split(r'(\r\n|\r|\n)', text) 105 | return paragraphs 106 | 107 | # 判断是否是文件路径 108 | def is_file_path(text): 109 | # 
基于文本特征判断是否是文件路径 110 | return bool(re.search(r'\.[a-zA-Z0-9]{3}$', text)) 111 | 112 | # 符号管理工具类 113 | def fix_translation_end(original, translation): 114 | if translation.endswith("。") and not original.endswith("。"): 115 | translation = translation[:-1] 116 | if translation.endswith("。」") and not original.endswith("。」"): 117 | translation = translation[:-2] + "」" 118 | return translation 119 | 120 | def unescape_translation(original, translation): 121 | if "\r" not in original: 122 | translation = translation.replace("\r", "\r") 123 | if "\n" not in original: 124 | translation = translation.replace("\n", "\n") 125 | if "\t" not in original: 126 | translation = translation.replace("\t", "\t") 127 | return translation 128 | 129 | # 翻译文本,按段落翻译 130 | def translate_text_by_paragraph(text, index, api_idx=0, config=None, previous_translations=None): 131 | # 如果是文件路径或者文件,直接跳过 132 | if is_file_path(text): 133 | return text 134 | 135 | contains_jp, updated_text = contains_japanese(text) 136 | if contains_jp: 137 | segments = split_text_with_newlines(updated_text) 138 | translated_segments = [] 139 | for segment in segments: 140 | if segment in ['\r\n', '\r', '\n']: 141 | translated_segments.append(segment) 142 | else: 143 | if segment: 144 | translated_segments.append(translate_text(segment, index, api_idx=api_idx, config=config, previous_translations=previous_translations)) 145 | else: 146 | translated_segments.append(segment) 147 | translated_text = ''.join(translated_segments) 148 | return translated_text 149 | else: 150 | return text 151 | 152 | # 调用API进行翻译 153 | def translate_text(text, index, api_idx=0, attempt=1, config=None, previous_translations=None): 154 | try: 155 | endpoint = config['endpoint'][api_idx] 156 | model_type = get_translation_model(config['model_type'], config['model_version']) 157 | context_size = config.get('context_size', 0) 158 | context = previous_translations[-context_size:] if previous_translations else [] 159 | data = make_request_json(text, model_type, config['use_dict'], config['dict_mode'], config['dict'], context) 160 | response = requests.post(endpoint, json=data) 161 | response.raise_for_status() 162 | 163 | response_data = response.json() 164 | completion_tokens = response_data.get("usage", {}).get("completion_tokens", 0) 165 | max_tokens = data["max_tokens"] 166 | 167 | # 检查是否发生退化,重试时调整 frequency_penalty 168 | if completion_tokens == max_tokens: 169 | print("模型可能发生退化,调整 frequency_penalty 并重试...") 170 | data["frequency_penalty"] = 0.8 171 | response = requests.post(endpoint, json=data) 172 | response.raise_for_status() 173 | response_data = response.json() 174 | 175 | except requests.RequestException as e: 176 | print(f'请求翻译API错误: {e}') 177 | return "" 178 | 179 | translated_text = response_data.get("choices")[0].get("message", {}).get("content", "") 180 | translated_text = translated_text.replace("将下面的日文文本翻译成中文:", "").replace("<|im_end|>", "") 181 | translated_text = fix_translation_end(text, translated_text) 182 | translated_text = unescape_translation(text, translated_text) 183 | print(f"原文: {text}\n翻译: {translated_text}\n") # 调试信息,输出翻译前后的文本 184 | return translated_text 185 | 186 | # 处理翻译请求的JSON构造 187 | def make_request_json(text, model_type, use_dict, dict_mode, dict_data, context): 188 | messages = [] 189 | 190 | if model_type == "SakuraV0_8": 191 | messages.append({"role": "system", "content": "你是一个简单的日文翻译模型,将日文翻译成简体中文。"}) 192 | messages.append({"role": "user", "content": f"将下面的日文文本翻译成中文:{text}"}) 193 | else: 194 | if model_type == "SakuraV0_9": 195 | 
messages.append({"role": "system", "content": "你是一个轻小说翻译模型,可以流畅地将日文翻译成简体中文,并正确使用人称代词。"}) 196 | elif model_type == "SakuraV0_10": 197 | messages.append({"role": "system", "content": "你是一个轻小说翻译模型,可以流畅通顺地以日本轻小说的风格将日文翻译成简体中文,并联系上下文正确使用人称代词,不擅自添加原文中没有的代词。"}) 198 | elif model_type == "SakuraV1_0": 199 | messages.append({"role": "system", "content": "你是一个轻小说翻译模型,可以流畅通顺地以日本轻小说的风格将日文翻译成简体中文,并联系上下文正确使用人称代词,不擅自添加原文中没有的代词。"}) 200 | elif model_type == "GalTranslV2_6": 201 | messages.append({"role": "system", "content": "你是一个视觉小说翻译模型,可以通顺地使用给定的术语表以指定的风格将日文翻译成简体中文,并联系上下文正确使用人称代词。"}) 202 | elif model_type == "GalTranslV3": 203 | messages.append({"role": "system", "content": "你是一个视觉小说翻译模型,可以通顺地使用给定的术语表以指定的风格将日文翻译成简体中文,并联系上下文正确使用人称代词。"}) 204 | else: 205 | messages.append({"role": "system", "content": "你是一个轻小说翻译模型,可以流畅通顺地将日文翻译成简体中文。"}) 206 | 207 | if context: 208 | history_text = "历史翻译:" + "\n".join(context) 209 | else: 210 | history_text = "" 211 | 212 | if model_type == "GalTranslV3": 213 | if use_dict: 214 | dict_str = '\n'.join([f"{k}->{v[0]}" for k, v in dict_data.items()]) 215 | user_content = f"{history_text}\n参考以下术语表\n{dict_str}\n根据以上术语表的对应关系和备注,结合历史剧情和上下文,将下面的文本从日文翻译成简体中文:\n{text}" 216 | else: 217 | user_content = f"{history_text}\n结合历史剧情和上下文,将下面的文本从日文翻译成简体中文:\n{text}" 218 | messages.append({"role": "user", "content": user_content}) 219 | else: 220 | if context: 221 | for c in context: 222 | messages.append({"role": "assistant", "content": c}) 223 | 224 | if use_dict: 225 | dict_str = '\n'.join([f"{k}->{v[0]}" for k, v in dict_data.items()]) 226 | messages.append({"role": "user", "content": f"根据上文和以下术语表:\n{dict_str}\n将下面的日文文本翻译成中文:{text}"}) 227 | else: 228 | messages.append({"role": "user", "content": f"根据上文,将下面的日文文本翻译成中文:{text}"}) 229 | 230 | temperature = 0.6 if model_type == "GalTranslV3" else 0.2 231 | 232 | data = { 233 | "model": "sukinishiro", 234 | "messages": messages, 235 | "temperature": temperature, 236 | "top_p": 0.3, 237 | "max_tokens": 384, 238 | "frequency_penalty": 0.2, 239 | "do_sample": True, 240 | "num_beams": 1, 241 | "repetition_penalty": 1.0 242 | } 243 | return data 244 | 245 | # 保存翻译进度 246 | def save_progress(data, filename, index, task_list): 247 | if filename.endswith(".json"): 248 | with open(filename, 'w', encoding='utf-8') as file: 249 | json.dump(data, file, ensure_ascii=False, indent=4) 250 | elif filename.endswith(".csv"): 251 | data.to_csv(filename, index=False, quoting=csv.QUOTE_ALL) 252 | config = load_config() 253 | config['last_processed'] = index 254 | config['task_list'] = task_list 255 | with open('config.json', 'w', encoding='utf-8') as file: 256 | json.dump(config, file, indent=4) 257 | 258 | # 主流程 259 | def main(): 260 | config = load_config() 261 | if not config['endpoint']: 262 | print("请配置API endpoint后再运行程序。") 263 | return 264 | 265 | # 初始化字典 266 | dict_data, full_dict_str = initialize_dict(json.dumps(config.get('dict', {}))) 267 | config['dict'] = dict_data 268 | 269 | task_list = config['task_list'] 270 | if not task_list: 271 | print("未找到待翻译文件,请更新config.json。") 272 | return 273 | 274 | for task_name in task_list: 275 | if not os.path.exists(task_name): 276 | print(f"文件{task_name}不存在,跳过。") 277 | continue 278 | 279 | if task_name.endswith(".json"): 280 | with open(task_name, 'r', encoding='utf-8') as file: 281 | data = json.load(file) 282 | json_keys = list(data.keys()) 283 | elif task_name.endswith(".csv"): 284 | data = pd.read_csv(task_name, encoding='utf-8') 285 | data['Original Text'] = data['Original Text'].astype(str) 286 | data['Machine 
translation'] = data['Machine translation'].astype(str) 287 | else: 288 | print(f"不支持的文件类型: {task_name}") 289 | continue 290 | 291 | total_keys = len(data) 292 | start_index = config['last_processed'] 293 | api_num = len(config['endpoint']) 294 | previous_translations = [] 295 | with ThreadPoolExecutor(max_workers=config['max_workers']) as executor: 296 | future_to_index = {} 297 | for i in range(start_index, total_keys): 298 | key = json_keys[i] if task_name.endswith(".json") else data.loc[i, 'Original Text'] 299 | api_index = i % api_num 300 | future = executor.submit(translate_text_by_paragraph, key, i, api_index, config, previous_translations) 301 | future_to_index[future] = i 302 | for future in tqdm(as_completed(future_to_index), total=len(future_to_index), desc="任务进度"): 303 | index = future_to_index[future] 304 | try: 305 | translated_text = future.result() 306 | previous_translations.append(translated_text) 307 | if len(previous_translations) > config.get('context_size', 0): 308 | previous_translations.pop(0) 309 | if task_name.endswith(".json"): 310 | data[json_keys[index]] = translated_text 311 | if task_name.endswith(".csv"): 312 | data.loc[index, 'Machine translation'] = translated_text 313 | if (index + 1) % config['save_frequency'] == 0 or index + 1 == total_keys: 314 | save_progress(data, task_name, index + 1, task_list) 315 | except Exception as exc: 316 | print(f'{index + 1}行翻译发生异常: {exc}') 317 | 318 | if __name__ == "__main__": 319 | main() 320 | -------------------------------------------------------------------------------- /Mtool/main_dev.py: -------------------------------------------------------------------------------- 1 | import json 2 | import requests 3 | import re 4 | import os 5 | import pandas as pd 6 | from tqdm import tqdm 7 | import unicodedata 8 | import csv 9 | import sys 10 | import shutil 11 | from concurrent.futures import ThreadPoolExecutor, as_completed 12 | import threading 13 | import time 14 | 15 | # 全局变量,用于控制进度条显示 16 | progress_bars = {} 17 | progress_lock = threading.Lock() 18 | debug_output = [] # 用于存储调试输出 19 | 20 | # 读取全局配置信息 21 | def load_config(): 22 | if not os.path.exists("config.json"): 23 | config_data = { 24 | "last_processed": 0, 25 | "task_list": ["ManualTransFile.json"], 26 | "endpoint": ["http://127.0.0.1:5000/v1/chat/completions"], 27 | "model_type": "Sgaltransl", 28 | "model_version": "2.6", 29 | "use_dict": False, 30 | "dict": {}, 31 | "dict_mode": "Partial", 32 | "save_frequency": 100, 33 | "shutdown": 0, 34 | "max_workers": 1, 35 | "context_size": 0 36 | } 37 | with open("config.json", 'w') as file: 38 | json.dump(config_data, file, indent=4) 39 | with open('config.json', 'r', encoding='utf-8') as file: 40 | return json.load(file) 41 | 42 | # 初始化字典 43 | def initialize_dict(dict_str): 44 | if not dict_str: 45 | return {}, "" 46 | try: 47 | dict_data = json.loads(dict_str) 48 | dict_converted = {} 49 | for key, value in dict_data.items(): 50 | if isinstance(value, list) and len(value) > 0: 51 | if len(value) == 1: 52 | dict_converted[key] = [value[0], ""] 53 | else: 54 | dict_converted[key] = value[:2] 55 | else: 56 | dict_converted[key] = [value, ""] 57 | dict_strings = get_dict_string_list(dict_converted) 58 | return dict_converted, "\n".join(dict_strings) 59 | except Exception as e: 60 | console_print(f"Error initializing dictionary: {e}") 61 | return {}, "" 62 | 63 | # 获取字典字符串列表 64 | def get_dict_string_list(kv_pairs): 65 | dict_list = [] 66 | for key, value in kv_pairs.items(): 67 | src = key 68 | dst = value[0] 69 | info = 
value[1] 70 | if info: 71 | dict_list.append(f"{src}->{dst} #{info}") 72 | else: 73 | dict_list.append(f"{src}->{dst}") 74 | return dict_list 75 | 76 | # 模型版本管理 77 | def get_translation_model(model_name, model_version): 78 | if model_name.lower() == "sakura": 79 | if model_version == "0.8": 80 | return "SakuraV0_8" 81 | elif model_version == "0.9": 82 | return "SakuraV0_9" 83 | elif model_version == "0.10": 84 | return "SakuraV0_10" 85 | elif model_version == "1.0": 86 | return "SakuraV1_0" 87 | else: 88 | return "SakuraV1_0" 89 | elif model_name.lower() == "sakura32b": 90 | if model_version == "0.10": 91 | return "Sakura32bV0_10" 92 | else: 93 | return "Sakura32bV0_10" 94 | elif model_name.lower() == "galtransl": 95 | if model_version == "2.6": 96 | return "GalTranslV2_6" 97 | elif model_version == "3.0": 98 | return "GalTranslV3" 99 | else: 100 | return "GalTranslV2_6" 101 | else: 102 | return "SakuraV1_0" 103 | 104 | # 检查文本是否包含日文字符 105 | def contains_japanese(text): 106 | text = unicodedata.normalize('NFKC', text) 107 | return bool(re.search(r'[\u3040-\u30ff\u3400-\u4DBF\u4E00-\u9FFF]', text)), text 108 | 109 | # 判断文本是否为纯英文(不包含中文字符) 110 | def is_pure_english(text): 111 | # 检查文本是否仅包含英文字母、数字、标点和空白字符 112 | # 如果包含中文字符则返回False 113 | return not bool(re.search(r'[\u4e00-\u9fff]', text)) 114 | 115 | # 分割文本段落 116 | def split_text_with_newlines(text): 117 | paragraphs = re.split(r'(\r\n|\r|\n)', text) 118 | return paragraphs 119 | 120 | # 判断是否是文件路径 121 | def is_file_path(text): 122 | # 基于文本特征判断是否是文件路径 123 | return bool(re.search(r'\.[a-zA-Z0-9]{3}$', text)) 124 | 125 | # 符号管理工具类 126 | def fix_translation_end(original, translation): 127 | if translation.endswith("。") and not original.endswith("。"): 128 | translation = translation[:-1] 129 | if translation.endswith("。」") and not original.endswith("。」"): 130 | translation = translation[:-2] + "」" 131 | return translation 132 | 133 | def unescape_translation(original, translation): 134 | if "\r" not in original: 135 | translation = translation.replace("\r", "\r") 136 | if "\n" not in original: 137 | translation = translation.replace("\n", "\n") 138 | if "\t" not in original: 139 | translation = translation.replace("\t", "\t") 140 | return translation 141 | 142 | # 自定义用于调试输出的函数 143 | def console_print(*args, **kwargs): 144 | message = " ".join(map(str, args)) 145 | with progress_lock: 146 | # 将消息存入调试输出列表 147 | debug_output.append(message) 148 | # 限制调试输出列表长度 149 | if len(debug_output) > 20: 150 | debug_output.pop(0) 151 | 152 | # 清屏并重新打印所有内容 153 | print("\033[H\033[J", end="") # 清屏 154 | 155 | # 打印调试输出 156 | for line in debug_output: 157 | print(line) 158 | 159 | # 打印空行分隔 160 | rows, columns = shutil.get_terminal_size() 161 | print("\n" * 3) # 空出进度条区域 162 | 163 | # 刷新所有进度条 164 | refresh_all_progress_bars() 165 | 166 | # 刷新所有进度条 167 | def refresh_all_progress_bars(): 168 | for bar in progress_bars.values(): 169 | if bar: 170 | bar.refresh() 171 | 172 | # 翻译文本,按段落翻译 173 | def translate_text_by_paragraph(text, index, api_idx=0, config=None, previous_translations=None): 174 | # 如果是文件路径或者文件,直接跳过 175 | if is_file_path(text): 176 | return text 177 | 178 | contains_jp, updated_text = contains_japanese(text) 179 | if contains_jp: 180 | segments = split_text_with_newlines(updated_text) 181 | translated_segments = [] 182 | for segment in segments: 183 | if segment in ['\r\n', '\r', '\n']: 184 | translated_segments.append(segment) 185 | else: 186 | if segment: 187 | translated_segments.append(translate_text(segment, index, api_idx=api_idx, config=config, 
previous_translations=previous_translations)) 188 | else: 189 | translated_segments.append(segment) 190 | translated_text = ''.join(translated_segments) 191 | return translated_text 192 | else: 193 | return text 194 | 195 | # 调用API进行翻译 196 | def translate_text(text, index, api_idx=0, attempt=1, config=None, previous_translations=None): 197 | try: 198 | endpoint = config['endpoint'][api_idx] 199 | model_type = get_translation_model(config['model_type'], config['model_version']) 200 | context_size = config.get('context_size', 0) 201 | context = previous_translations[-context_size:] if previous_translations else [] 202 | data = make_request_json(text, model_type, config['use_dict'], config['dict_mode'], config['dict'], context) 203 | response = requests.post(endpoint, json=data) 204 | response.raise_for_status() 205 | 206 | response_data = response.json() 207 | completion_tokens = response_data.get("usage", {}).get("completion_tokens", 0) 208 | max_tokens = data["max_tokens"] 209 | 210 | # 检查是否发生退化,重试时调整 frequency_penalty 211 | if completion_tokens == max_tokens: 212 | console_print("模型可能发生退化,调整 frequency_penalty 并重试...") 213 | data["frequency_penalty"] = 0.8 214 | response = requests.post(endpoint, json=data) 215 | response.raise_for_status() 216 | response_data = response.json() 217 | 218 | except requests.RequestException as e: 219 | console_print(f'请求翻译API错误: {e}') 220 | return "" 221 | 222 | translated_text = response_data.get("choices")[0].get("message", {}).get("content", "") 223 | translated_text = translated_text.replace("将下面的日文文本翻译成中文:", "").replace("<|im_end|>", "") 224 | translated_text = fix_translation_end(text, translated_text) 225 | translated_text = unescape_translation(text, translated_text) 226 | 227 | # 检查翻译结果是否为纯英文,如果是则记录行号 228 | if is_pure_english(translated_text): 229 | console_print(f"警告:行号 {index} 的翻译结果为纯英文:'{translated_text}'") 230 | 231 | with open("english_translations.log", "a", encoding="utf-8") as log_file: 232 | log_file.write(f"行号: {index}, 原文: {text}, 翻译: {translated_text}\n") 233 | 234 | console_print(f"原文: {text}\n翻译: {translated_text}\n") # 调试信息,输出翻译前后的文本 235 | return translated_text 236 | 237 | # 处理翻译请求的JSON构造 238 | def make_request_json(text, model_type, use_dict, dict_mode, dict_data, context): 239 | messages = [] 240 | 241 | if model_type == "SakuraV0_8": 242 | messages.append({"role": "system", "content": "你是一个简单的日文翻译模型,将日文翻译成简体中文。"}) 243 | messages.append({"role": "user", "content": f"将下面的日文文本翻译成中文:{text}"}) 244 | else: 245 | if model_type == "SakuraV0_9": 246 | messages.append({"role": "system", "content": "你是一个轻小说翻译模型,可以流畅地将日文翻译成简体中文,并正确使用人称代词。"}) 247 | elif model_type == "SakuraV0_10": 248 | messages.append({"role": "system", "content": "你是一个轻小说翻译模型,可以流畅通顺地以日本轻小说的风格将日文翻译成简体中文,并联系上下文正确使用人称代词,不擅自添加原文中没有的代词。"}) 249 | elif model_type == "SakuraV1_0": 250 | messages.append({"role": "system", "content": "你是一个轻小说翻译模型,可以流畅通顺地以日本轻小说的风格将日文翻译成简体中文,并联系上下文正确使用人称代词,不擅自添加原文中没有的代词。"}) 251 | elif model_type == "GalTranslV2_6": 252 | messages.append({"role": "system", "content": "你是一个视觉小说翻译模型,可以通顺地使用给定的术语表以指定的风格将日文翻译成简体中文,并联系上下文正确使用人称代词。"}) 253 | elif model_type == "GalTranslV3": 254 | messages.append({"role": "system", "content": "你是一个视觉小说翻译模型,可以通顺地使用给定的术语表以指定的风格将日文翻译成简体中文,并联系上下文正确使用人称代词。"}) 255 | else: 256 | messages.append({"role": "system", "content": "你是一个轻小说翻译模型,可以流畅通顺地将日文翻译成简体中文。"}) 257 | 258 | if context: 259 | history_text = "历史翻译:" + "\n".join(context) 260 | else: 261 | history_text = "" 262 | 263 | if model_type == "GalTranslV3": 264 | if use_dict: 265 | 
dict_str = '\n'.join([f"{k}->{v[0]}" for k, v in dict_data.items()]) 266 | user_content = f"{history_text}\n参考以下术语表\n{dict_str}\n根据以上术语表的对应关系和备注,结合历史剧情和上下文,将下面的文本从日文翻译成简体中文:\n{text}" 267 | else: 268 | user_content = f"{history_text}\n结合历史剧情和上下文,将下面的文本从日文翻译成简体中文:\n{text}" 269 | messages.append({"role": "user", "content": user_content}) 270 | else: 271 | if context: 272 | for c in context: 273 | messages.append({"role": "assistant", "content": c}) 274 | 275 | if use_dict: 276 | dict_str = '\n'.join([f"{k}->{v[0]}" for k, v in dict_data.items()]) 277 | messages.append({"role": "user", "content": f"参考以下术语表:\n{dict_str}\n根据以上术语表的对应关系和备注,结合历史剧情和上下文,将下面的文本从日文翻译成简体中文:{text}"}) 278 | else: 279 | messages.append({"role": "user", "content": f"结合历史剧情和上下文,将下面的文本从日文翻译成简体中文:{text}"}) 280 | 281 | temperature = 0.6 if model_type == "GalTranslV3" else 0.2 282 | 283 | data = { 284 | "model": "sukinishiro", 285 | "messages": messages, 286 | "temperature": temperature, 287 | "top_p": 0.3, 288 | "max_tokens": 512, 289 | "frequency_penalty": 0.2, 290 | "do_sample": True, 291 | "num_beams": 1, 292 | "repetition_penalty": 1.0 293 | } 294 | return data 295 | 296 | # 进度管理类 297 | class TranslationProgress: 298 | def __init__(self, task_name, total_items, num_threads): 299 | self.progress_file = f"{task_name}.progress.json" 300 | self.task_name = task_name 301 | self.total_items = total_items 302 | self.num_threads = num_threads 303 | self.lock = threading.Lock() 304 | self.initialize() 305 | 306 | def initialize(self): 307 | if os.path.exists(self.progress_file): 308 | with open(self.progress_file, 'r', encoding='utf-8') as file: 309 | self.progress_data = json.load(file) 310 | else: 311 | # 创建新的进度文件 312 | chunk_size = self.total_items // self.num_threads 313 | remainder = self.total_items % self.num_threads 314 | 315 | threads_info = [] 316 | start_idx = 0 317 | 318 | for i in range(self.num_threads): 319 | # 计算每个线程的起止范围 320 | end_idx = start_idx + chunk_size - 1 321 | if i == self.num_threads - 1: 322 | end_idx += remainder 323 | 324 | threads_info.append({ 325 | "thread_id": i, 326 | "start_index": start_idx, 327 | "end_index": end_idx, 328 | "current_index": start_idx, 329 | "previous_translations": [] 330 | }) 331 | 332 | start_idx = end_idx + 1 333 | 334 | self.progress_data = { 335 | "task_name": self.task_name, 336 | "total_items": self.total_items, 337 | "num_threads": self.num_threads, 338 | "threads": threads_info 339 | } 340 | self.save() 341 | 342 | def update_progress(self, thread_id, current_index, translation=None, context_size=0): 343 | with self.lock: 344 | thread_info = self.progress_data["threads"][thread_id] 345 | thread_info["current_index"] = current_index 346 | 347 | # 更新历史翻译记录 348 | if translation and context_size > 0: 349 | thread_info.setdefault("previous_translations", []) 350 | thread_info["previous_translations"].append(translation) 351 | # 仅保留最近的N条翻译 352 | if len(thread_info["previous_translations"]) > context_size: 353 | thread_info["previous_translations"] = thread_info["previous_translations"][-context_size:] 354 | 355 | self.save() 356 | 357 | def get_thread_info(self, thread_id): 358 | return self.progress_data["threads"][thread_id] 359 | 360 | def get_previous_translations(self, thread_id): 361 | thread_info = self.progress_data["threads"][thread_id] 362 | return thread_info.get("previous_translations", []) 363 | 364 | def is_completed(self): 365 | for thread_info in self.progress_data["threads"]: 366 | if thread_info["current_index"] <= thread_info["end_index"]: 367 | return 
False 368 | return True 369 | 370 | def save(self): 371 | with open(self.progress_file, 'w', encoding='utf-8') as file: 372 | json.dump(self.progress_data, file, ensure_ascii=False, indent=4) 373 | 374 | # 翻译工作线程函数 375 | def translate_worker(thread_id, task_name, data, json_keys, progress_manager, config): 376 | thread_info = progress_manager.get_thread_info(thread_id) 377 | start_index = thread_info["current_index"] 378 | end_index = thread_info["end_index"] 379 | api_num = len(config['endpoint']) 380 | 381 | # 获取该线程的历史翻译记录 382 | previous_translations = progress_manager.get_previous_translations(thread_id) 383 | 384 | # 使用tqdm创建带有更多信息的进度条 385 | with progress_lock: 386 | pbar = tqdm( 387 | total=end_index - thread_info["start_index"] + 1, 388 | desc=f"线程 {thread_id}", 389 | position=thread_id, 390 | leave=True, 391 | ncols=100, # 增加宽度以容纳更多信息 392 | bar_format='{l_bar}{bar:20}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}]' 393 | ) 394 | progress_bars[thread_id] = pbar 395 | 396 | # 计算已完成的工作量并更新进度条 397 | completed = start_index - thread_info["start_index"] 398 | if completed > 0: 399 | pbar.update(completed) 400 | 401 | for i in range(start_index, end_index + 1): 402 | api_index = thread_id % api_num # 使用线程ID来分配API端点 403 | 404 | if task_name.endswith(".json"): 405 | key = json_keys[i] 406 | original_text = key 407 | else: # CSV文件 408 | original_text = data.loc[i, 'Original Text'] 409 | 410 | translated_text = translate_text_by_paragraph( 411 | original_text, i, api_index, config, previous_translations 412 | ) 413 | 414 | # 更新数据 415 | if task_name.endswith(".json"): 416 | data[json_keys[i]] = translated_text 417 | else: # CSV文件 418 | data.loc[i, 'Machine translation'] = translated_text 419 | 420 | # 更新进度和历史翻译 421 | progress_manager.update_progress( 422 | thread_id, i + 1, translated_text, config.get('context_size', 0) 423 | ) 424 | 425 | # 更新进度条 426 | with progress_lock: 427 | pbar.update(1) 428 | 429 | # 定期保存整个翻译文件 430 | if (i + 1) % config['save_frequency'] == 0 or i + 1 > end_index: 431 | save_translation_data(data, task_name) 432 | console_print(f"线程 {thread_id}: 已保存进度 {i + 1}/{end_index + 1}") 433 | 434 | # 完成后关闭进度条并从字典中移除 435 | with progress_lock: 436 | pbar.close() 437 | progress_bars[thread_id] = None 438 | 439 | # 保存翻译数据 440 | def save_translation_data(data, filename): 441 | if filename.endswith(".json"): 442 | with open(filename, 'w', encoding='utf-8') as file: 443 | json.dump(data, file, ensure_ascii=False, indent=4) 444 | elif filename.endswith(".csv"): 445 | data.to_csv(filename, index=False, quoting=csv.QUOTE_ALL) 446 | 447 | # 初始化终端显示 448 | def setup_terminal(): 449 | # 清屏 450 | os.system('cls' if os.name == 'nt' else 'clear') 451 | # 将光标移到顶部 452 | print("\033[H", end="") 453 | 454 | # 主函数 455 | def main(): 456 | # 初始化终端显示 457 | setup_terminal() 458 | 459 | config = load_config() 460 | if not config['endpoint']: 461 | console_print("请配置API endpoint后再运行程序。") 462 | return 463 | 464 | # 初始化字典 465 | dict_data, full_dict_str = initialize_dict(json.dumps(config.get('dict', {}))) 466 | config['dict'] = dict_data 467 | 468 | task_list = config['task_list'] 469 | if not task_list: 470 | console_print("未找到待翻译文件,请更新config.json。") 471 | return 472 | 473 | for task_name in task_list: 474 | if not os.path.exists(task_name): 475 | console_print(f"文件{task_name}不存在,跳过。") 476 | continue 477 | 478 | # 加载数据 479 | if task_name.endswith(".json"): 480 | with open(task_name, 'r', encoding='utf-8') as file: 481 | data = json.load(file) 482 | json_keys = list(data.keys()) 483 | total_items = 
len(json_keys) 484 | elif task_name.endswith(".csv"): 485 | data = pd.read_csv(task_name, encoding='utf-8') 486 | data['Original Text'] = data['Original Text'].astype(str) 487 | data['Machine translation'] = data['Machine translation'].astype(str) 488 | total_items = len(data) 489 | json_keys = None 490 | else: 491 | console_print(f"不支持的文件类型: {task_name}") 492 | continue 493 | 494 | # 创建或加载进度管理器 495 | num_threads = config['max_workers'] 496 | progress_manager = TranslationProgress(task_name, total_items, num_threads) 497 | 498 | console_print(f"开始处理任务: {task_name} (总条目: {total_items})") 499 | console_print("调试信息将显示在顶部,进度条显示在底部") 500 | time.sleep(1) # 给用户时间阅读信息 501 | 502 | # 创建并启动工作线程 503 | threads = [] 504 | for thread_id in range(num_threads): 505 | thread = threading.Thread( 506 | target=translate_worker, 507 | args=(thread_id, task_name, data, json_keys, progress_manager, config) 508 | ) 509 | threads.append(thread) 510 | thread.start() 511 | thread_info = progress_manager.get_thread_info(thread_id) 512 | console_print(f"线程 {thread_id} 已启动,处理范围: {thread_info['start_index']} - {thread_info['end_index']}, 当前进度: {thread_info['current_index']}") 513 | 514 | # 等待所有线程完成 515 | for thread in threads: 516 | thread.join() 517 | 518 | console_print(f"任务 {task_name} 翻译完成") 519 | 520 | # 任务完成后,可以删除进度文件或保留作为记录 521 | # os.remove(f"{task_name}.progress.json") 522 | 523 | if __name__ == "__main__": 524 | try: 525 | main() 526 | except KeyboardInterrupt: 527 | # 处理Ctrl+C中断 528 | print("\n程序被用户中断,正在保存进度...") 529 | # 这里可以添加保存进度的代码 530 | except Exception as e: 531 | print(f"程序发生异常: {e}") 532 | finally: 533 | # 确保在程序退出时清理终端 534 | print("\033[?25h") # 确保光标可见 535 | --------------------------------------------------------------------------------
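补充:启动 Translator++/api.py 后端(参数按需修改后执行 `python api.py`)之后,可以用下面的最小示例验证两个接口是否可用。示例仅为示意:假设服务按 api.py 中的默认端口 1500 运行在本地,示例文本为任意占位内容,且环境中已安装 requests。

```py
import json
import requests

# GET / :单条文本翻译,直接返回译文字符串(对应 api.py 中 GET 方式的 read_item)
resp = requests.get("http://127.0.0.1:1500/", params={"text": "こんにちは"})
resp.raise_for_status()
print(resp.json())

# POST /v1/chat/completions :Translator++ 的 ChatGPT 插件调用的批量接口,
# messages[0].content 为 JSON 字符串数组,返回结构与 OpenAI Chat 接口兼容
payload = {"messages": [{"role": "user", "content": json.dumps(["テスト1", "テスト2"], ensure_ascii=False)}]}
resp = requests.post("http://127.0.0.1:1500/v1/chat/completions", json=payload)
resp.raise_for_status()
print(json.loads(resp.json()["choices"][0]["message"]["content"]))
```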