├── baidu ├── __init__.py ├── main.py └── translate.py ├── google ├── __init__.py └── main.py ├── youdao ├── __init__.py ├── youdaotrans.py └── main.py ├── .gitignore ├── requirements.txt ├── input ├── schemas.json └── keywords.txt ├── README.md ├── main.py └── back_translate.py /baidu/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /google/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /youdao/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | /input/questions.json 3 | /output/* 4 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | pyexecjs==1.5.1 2 | requests==2.22.0 3 | googletrans==2.4.0 -------------------------------------------------------------------------------- /input/schemas.json: -------------------------------------------------------------------------------- 1 | { 2 | "baidu": [ 3 | ["zh-CN", "en", "zh-CN"], 4 | ["zh-CN", "ja", "zh-CN"], 5 | ["zh-CN", "ko", "zh-CN"], 6 | ["zh-CN", "fr", "zh-CN"], 7 | ["zh-CN", "es", "zh-CN"], 8 | ["zh-CN", "th", "zh-CN"], 9 | ["zh-CN", "de", "zh-CN"], 10 | ["zh-CN", "zh-TW", "zh-CN"] 11 | ], 12 | "youdao": [ 13 | ["zh-CN", "en", "zh-CN"], 14 | ["zh-CN", "ko", "zh-CN"], 15 | ["zh-CN", "fr", "zh-CN"] 16 | ] 17 | } -------------------------------------------------------------------------------- /input/keywords.txt: -------------------------------------------------------------------------------- 
1 | 项目 2 | 研发项目 3 | 集团委托项目 4 | 重大 5 | 重点 6 | 一般支撑项目 7 | 自立项目 8 | 国拨资金项目 9 | 教育部基金项目 10 | I类项目 11 | II类项目 12 | III类项目 13 | IV类项目 14 | 战略型产品 15 | 能力型产品 16 | 支撑型产品 17 | 资本化项目 18 | 院主管 19 | 项目总监 20 | 所主管 21 | 项目经理 22 | 院级 23 | 院级项目 24 | 项目线角色 25 | 所级 26 | 所级项目 27 | 科技管理部 28 | 采购部 29 | 合作部 30 | 财务部 31 | 申报 32 | 决策层级 33 | 委托项目 34 | 开题 35 | 开题材料 36 | 交付 37 | 里程碑 38 | 外协 39 | 评审会 40 | 评审 41 | 开题评审 42 | PMS 43 | 结题 44 | 结题材料 45 | 结题评审 46 | 流程 47 | 归档 48 | 附件 49 | 白皮书 50 | 后评估 51 | 国拨项目 -------------------------------------------------------------------------------- /youdao/youdaotrans.py: -------------------------------------------------------------------------------- 1 | import json 2 | import requests 3 | 4 | 5 | class Translator: 6 | @staticmethod 7 | def translate(text, src, dst): 8 | src = src.replace("-", "_") 9 | dst = dst.replace("-", "_") 10 | tp = src.upper() + "2" + dst.upper() 11 | url = f"http://fanyi.youdao.com/translate?&doctype=json&type={tp}&i={text}" 12 | resp = requests.get(url) 13 | return json.loads(resp.content) 14 | 15 | 16 | def test(): 17 | translator = Translator() 18 | print(translator.translate("你叫什么名字", "zh-CN", "en")) 19 | 20 | 21 | if __name__ == "__main__": 22 | test() -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## Back Translation 2 | 3 | 调用各个翻译平台的API对各个问题进行回译 4 | 5 | ### 如何使用 6 | 7 | 1. 安装依赖环境: `pip install -r requirements.txt` 8 | 2. 
def main(args):
    """Augment every question of a tab-separated file by back translation.

    Parameters
    ----------
    args: argparse.Namespace
        Must provide ``input_file`` (one ``question<TAB>answer`` record per
        line), ``output_file`` and ``schema_file`` (JSON mapping a platform
        name to a list of language chains).  ``keywords_file`` is optional;
        when given, its non-empty lines enable keyword masking.

    Raises
    ------
    ValueError
        If a non-empty input line contains no tab separator.
    """
    with open(args.schema_file, "r", encoding="utf-8") as f_schema:
        schemas = json.load(f_schema)

    # ``"keywords_file" in args`` is always true for an argparse Namespace
    # (the attribute exists with value None when the option is omitted), which
    # made the original call ``open(None)``.  Test the value instead.
    keywords_file = getattr(args, "keywords_file", None)
    if keywords_file:
        with open(keywords_file, "r", encoding="utf-8") as f_kw:
            # Drop blank lines: an empty keyword would match every question.
            keywords = [kw for kw in (line.strip() for line in f_kw) if kw]
    else:
        keywords = None

    with open(args.input_file, "r", encoding="utf-8") as f_in:
        lines = [line.strip() for line in f_in if line.strip()]

    with open(args.output_file, "w", encoding="utf-8") as f_out:
        for record_no, line in enumerate(tqdm.tqdm(lines), start=1):
            ques, sep, ans = line.partition("\t")
            if not sep:
                raise ValueError(
                    f"record {record_no} of {args.input_file} has no tab separator: {line!r}")
            result_dict = back_translate(ques, schemas, keywords)
            for arg_ques in result_dict.values():
                f_out.write(f"{arg_ques}\t{ans}\n")
            # Flush per question so partial results survive an interruption.
            f_out.flush()


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--input_file", type=str, required=True)
    parser.add_argument("--output_file", type=str, required=True)
    parser.add_argument("--schema_file", type=str, required=True)
    parser.add_argument("--keywords_file", type=str, help="Required if keyword mask is applied")
    args = parser.parse_args()

    main(args)
def back_translate(text: str,
                   schemas: Dict[str, List[List[str]]],
                   keywords: List[str] = None,
                   handle_func: Callable[[Dict[str, str]], Any] = lambda x: x) -> Any:
    """
    Augment one sentence by back translation over several platforms and
    language chains, producing multiple paraphrases.

    Parameters
    ----------
    text: ``str``
        The input sentence.
    schemas: ``Dict[str, List[List[str]]]``
        Maps a platform name to the language chains to run on it, see
        ``input/schemas.json``.  For every platform ``<name>`` the module
        ``<name>.main`` is imported and its ``back_translate`` function used.
    keywords: ``List[str]``, optional, default=``None``
        When given, keyword masking is applied as well: every keyword that
        occurs in ``text`` is replaced by ``UNK`` before translating and put
        back afterwards.  Empty and duplicate keywords are ignored.
    handle_func: ``Callable[[Dict[str, str]], Any]``, optional, default=``lambda x: x``
        Post-processes the result dict, e.g. to drop duplicates, reshape the
        output, cap the number of results or filter with a matching model.

    Returns
    -------
    Whatever ``handle_func`` returns for the dict
    ``{"origin": text, "<platform> <chain>": ..., "<platform> <chain> kw_mask<kw>": ...}``.

    Raises
    ------
    ImportError
        If a platform named in ``schemas`` has no ``<platform>.main`` module.
        Failures of individual translations are logged and skipped.
    """
    import importlib
    import logging
    import re

    logger = logging.getLogger(__name__)
    # Translators may change the case of the mask ("Unk", "unk", ...).
    placeholder = re.compile("unk", re.IGNORECASE)

    # Loop-invariant: de-duplicate while keeping the caller's order (stable
    # result order, unlike ``set``) and skip "" which is "in" every string and
    # would insert the mask between all characters.
    hit_keywords = [kw for kw in dict.fromkeys(keywords or ()) if kw and kw in text]

    res = {"origin": text}
    for platform, schema_list in schemas.items():
        trans_func = importlib.import_module(f"{platform}.main").back_translate
        for schema in schema_list:
            schema_key = "->".join(map(str, schema))
            try:
                res[f"{platform} {schema_key}"] = trans_func(text, lang_list=schema)
            except Exception as exc:
                # Best effort: one failing chain must not lose the others.
                logger.warning("back translation failed (%s %s): %r", platform, schema_key, exc)

            for keyword in hit_keywords:
                try:
                    masked_res = trans_func(text.replace(keyword, "UNK"), lang_list=schema)
                except Exception as exc:
                    logger.warning("masked back translation failed (%s %s, keyword %r): %r",
                                   platform, schema_key, keyword, exc)
                    continue
                # Keep the sentence only if the mask survived the round trip;
                # otherwise the keyword could not be restored.
                if placeholder.search(masked_res):
                    res[f"{platform} {schema_key} kw_mask{keyword}"] = placeholder.sub(
                        lambda _match, kw=keyword: kw, masked_res)

    return handle_func(res)
52 | def test(): 53 | def handle_res(res): 54 | no_repeat = list(set(res.values())) 55 | for item in no_repeat: 56 | print(item) 57 | return no_repeat 58 | schemas = json.load(open("./input/schemas.json", "r")) 59 | keywords = [line.strip() for line in open("./input/keywords.txt", "r")] 60 | result = back_translate("后评估工作如何开展?", schemas, keywords) 61 | print(json.dumps(result, indent=4, ensure_ascii=False)) 62 | print(json.dumps(handle_res(result), indent=4, ensure_ascii=False)) 63 | 64 | 65 | if __name__ == "__main__": 66 | test() -------------------------------------------------------------------------------- /youdao/main.py: -------------------------------------------------------------------------------- 1 | from typing import Iterable 2 | 3 | import time 4 | import random 5 | import tqdm 6 | import json 7 | import collections 8 | 9 | from youdao import youdaotrans 10 | 11 | 12 | translator = youdaotrans.Translator() 13 | 14 | # Mapping from standard lang id (see README.md for more details) 15 | # to target (youdao) lang id 16 | _LANG_MAPPING = { 17 | "zh-CN": "zh-CN", 18 | "en": "en", 19 | "ja": "ja", 20 | "ko": "kr", 21 | "fr": "fr" 22 | } 23 | 24 | 25 | def trans(text: str, 26 | src: str, 27 | dst: str, 28 | sleep_mean: float = 1.0, 29 | sleep_dev: float = 0.3) -> str: 30 | """ 31 | 调用有道翻译接口进行单次翻译。 32 | 33 | 参数: 34 | ------ 35 | text: ``str`` 36 | 要翻译的文本 37 | src: ``str`` 38 | 源语言代码,详见``README.md`` 39 | dst: ``str`` 40 | 目标语言代码,详见``README.md`` 41 | sleep_mean: ``float``, optional, default=``1.0`` 42 | 延迟均值(为了防止ip被封,需要延迟) 43 | sleep_dev: ``float``, optional, default=``0.3`` 44 | 延迟标准差 45 | """ 46 | # TODO: 设计一个自适应的算法来控制延迟 47 | time.sleep(max(0.1, random.gauss(sleep_mean, sleep_dev))) 48 | res = translator.translate(text, dst=_LANG_MAPPING[dst], src=_LANG_MAPPING[src]) 49 | return res["translateResult"][0][0]["tgt"] 50 | 51 | def back_translate(text: str, 52 | lang_list: Iterable[str], 53 | sleep_mean: float = 1.0, 54 | sleep_dev: float = 0.3) -> str: 55 | """ 
56 | 调用谷歌翻译接口进行单次回译。 57 | 58 | 参数: 59 | ------ 60 | text: ``str`` 61 | 要回译的文本 62 | lang_list: ``Iterable[str]`` 63 | 要回译的语言代码列表。例如要将一句话从中文翻译到英文再翻译回中文,则指定: 64 | ``lang_list=("zh-CN", "en", "zh-CN"])`` 65 | sleep_mean: ``float``, optional, default=``1.0`` 66 | 延迟均值(为了防止ip被封,需要延迟) 67 | sleep_dev: ``float``, optional, default=``0.3`` 68 | 延迟标准差 69 | """ 70 | assert len(lang_list) >= 2 71 | current_text = text 72 | for i in range(len(lang_list) - 1): 73 | current_text = trans(current_text, src=lang_list[i], dst=lang_list[i+1], 74 | sleep_mean=sleep_mean, sleep_dev=sleep_dev) 75 | return current_text 76 | 77 | def main(): 78 | input_file = "../input/questions.json" 79 | output_file = "../output/youdao.json" 80 | schemas = [ 81 | ('zh-CN', 'en', 'zh-CN'), 82 | ('zh-CN', 'ja', 'zh-CN'), 83 | ('zh-CN', 'ko', 'zh-CN'), 84 | ('zh-CN', 'fr', 'zh-CN') 85 | ] 86 | 87 | input_json = json.load(open(input_file, "r", encoding="utf-8")) 88 | output_json = collections.OrderedDict() 89 | for i in tqdm.tqdm(range(102)): 90 | key = str(i + 1) 91 | text = input_json[key] 92 | item_result = collections.OrderedDict() 93 | item_result["origin"] = text 94 | for schema in schemas: 95 | try: 96 | item_result[" -> ".join(schema)] = back_translate(text, lang_list=schema) 97 | except Exception: 98 | print("您的请求速度过快") 99 | output_json[key] = item_result 100 | json.dump(output_json, open(output_file, "w", encoding="utf-8"), indent=4, ensure_ascii=False) 101 | 102 | def test(): 103 | schemas = [ 104 | ('zh-CN', 'en', 'zh-CN'), 105 | ('zh-CN', 'ja', 'zh-CN'), 106 | ('zh-CN', 'ko', 'zh-CN'), 107 | ('zh-CN', 'fr', 'zh-CN') 108 | ] 109 | origin = "UNK是指什么" 110 | for schema in schemas: 111 | print(" -> ".join(schema), ":", back_translate(text=origin, lang_list=schema)) 112 | 113 | 114 | if __name__ == "__main__": 115 | test() 116 | # main() -------------------------------------------------------------------------------- /google/main.py: 
-------------------------------------------------------------------------------- 1 | from typing import Iterable 2 | 3 | import googletrans 4 | import time 5 | import random 6 | import tqdm 7 | import json 8 | import collections 9 | 10 | 11 | translator = googletrans.Translator() 12 | 13 | # Mapping from standard lang id (see README.md for more details) 14 | # to target (google) lang id 15 | _LANG_MAPPING = { 16 | "zh-CN": "zh-CN", 17 | "en": "en", 18 | "ja": "ja", 19 | "ko": "ko", 20 | "fr": "fr", 21 | "es": "es", 22 | "th": "th", 23 | "de": "de", 24 | "zh-TW": "zh-TW" 25 | } 26 | 27 | 28 | def trans(text: str, 29 | src: str, 30 | dst: str, 31 | sleep_mean: float = 1.0, 32 | sleep_dev: float = 0.3) -> str: 33 | """ 34 | 调用谷歌翻译接口进行单次翻译。 35 | 36 | 参数: 37 | ------ 38 | text: ``str`` 39 | 要翻译的文本 40 | src: ``str`` 41 | 源语言代码,详见``README.md`` 42 | dst: ``str`` 43 | 目标语言代码,详见``README.md`` 44 | sleep_mean: ``float``, optional, default=``1.0`` 45 | 延迟均值(为了防止ip被封,需要延迟) 46 | sleep_dev: ``float``, optional, default=``0.3`` 47 | 延迟标准差 48 | """ 49 | # TODO: 设计一个自适应的算法来控制延迟 50 | time.sleep(max(0.1, random.gauss(sleep_mean, sleep_dev))) 51 | res = translator.translate(text, dest=_LANG_MAPPING[dst], src=_LANG_MAPPING[src]) 52 | return res.text 53 | 54 | def back_translate(text: str, 55 | lang_list: Iterable[str], 56 | sleep_mean: float = 1.0, 57 | sleep_dev: float = 0.3) -> str: 58 | """ 59 | 调用谷歌翻译接口进行单次回译。 60 | 61 | 参数: 62 | ------ 63 | text: ``str`` 64 | 要回译的文本 65 | lang_list: ``Iterable[str]`` 66 | 要回译的语言代码列表。例如要将一句话从中文翻译到英文再翻译回中文,则指定: 67 | ``lang_list=("zh-CN", "en", "zh-CN"])`` 68 | sleep_mean: ``float``, optional, default=``1.0`` 69 | 延迟均值(为了防止ip被封,需要延迟) 70 | sleep_dev: ``float``, optional, default=``0.3`` 71 | 延迟标准差 72 | """ 73 | assert len(lang_list) >= 2 74 | current_text = text 75 | for i in range(len(lang_list) - 1): 76 | current_text = trans(current_text, src=lang_list[i], dst=lang_list[i+1], 77 | sleep_mean=sleep_mean, sleep_dev=sleep_dev) 78 | return current_text 79 
def main(input_file: str = None, output_file: str = None):
    """
    Back-translate every question of a JSON file with Google and dump the
    results as JSON.

    Parameters
    ----------
    input_file: ``str``, optional
        JSON object mapping a key to a question text.  Defaults to
        ``input/questions.json`` next to this package.
    output_file: ``str``, optional
        Destination JSON file.  Defaults to ``output/google.json`` next to
        this package; the directory is created when missing.
    """
    import os

    # Resolve defaults relative to this file, not the working directory.
    here = os.path.dirname(os.path.abspath(__file__))
    if input_file is None:
        input_file = os.path.join(here, "..", "input", "questions.json")
    if output_file is None:
        output_file = os.path.join(here, "..", "output", "google.json")
    schemas = [
        ('zh-CN', 'en', 'zh-CN'),
        ('zh-CN', 'ja', 'zh-CN'),
        ('zh-CN', 'ko', 'zh-CN'),
        ('zh-CN', 'fr', 'zh-CN'),
        ('zh-CN', 'es', 'zh-CN'),
        ('zh-CN', 'th', 'zh-CN'),
        ('zh-CN', 'de', 'zh-CN'),
        ('zh-CN', 'zh-TW', 'zh-CN')
    ]

    with open(input_file, "r", encoding="utf-8") as f_in:
        input_json = json.load(f_in)
    # Make sure the result can be written before spending time on requests.
    os.makedirs(os.path.dirname(os.path.abspath(output_file)), exist_ok=True)

    output_json = collections.OrderedDict()
    # Process every entry in file order instead of assuming keys "1".."102".
    for key, text in tqdm.tqdm(input_json.items()):
        item_result = collections.OrderedDict()
        item_result["origin"] = text
        for schema in schemas:
            try:
                item_result[" -> ".join(schema)] = back_translate(text, lang_list=schema)
            except Exception as exc:
                # Usually rate limiting; show the real cause as well.
                print("您的请求速度过快", repr(exc))
        output_json[key] = item_result
    with open(output_file, "w", encoding="utf-8") as f_out:
        json.dump(output_json, f_out, indent=4, ensure_ascii=False)


def test():
    """Smoke test: back-translate one sentence over every chain."""
    schemas = [
        ('zh-CN', 'en', 'zh-CN'),
        ('zh-CN', 'ja', 'zh-CN'),
        ('zh-CN', 'ko', 'zh-CN'),
        ('zh-CN', 'fr', 'zh-CN'),
        ('zh-CN', 'es', 'zh-CN'),
        ('zh-CN', 'th', 'zh-CN'),
        ('zh-CN', 'de', 'zh-CN'),
        ('zh-CN', 'zh-TW', 'zh-CN')
    ]
    origin = "什么是研发项目"
    for schema in schemas:
        print(" -> ".join(schema), ":", back_translate(text=origin, lang_list=schema))


if __name__ == "__main__":
    test()
    # main()
_LANG_MAPPING = { 17 | "zh-CN": "zh", 18 | "en": "en", 19 | "ja": "jp", 20 | "ko": "kor", 21 | "fr": "fra", 22 | "es": "spa", 23 | "th": "th", 24 | "de": "de", 25 | "zh-TW": "cht" 26 | } 27 | 28 | 29 | def trans(text: str, 30 | src: str, 31 | dst: str, 32 | sleep_mean: float = 1.0, 33 | sleep_dev: float = 0.3) -> str: 34 | """ 35 | 调用百度翻译接口进行单次翻译。 36 | 37 | 参数: 38 | ------ 39 | text: ``str`` 40 | 要翻译的文本 41 | src: ``str`` 42 | 源语言代码,详见``README.md`` 43 | dst: ``str`` 44 | 目标语言代码,详见``README.md`` 45 | sleep_mean: ``float``, optional, default=``1.0`` 46 | 延迟均值(为了防止ip被封,需要延迟) 47 | sleep_dev: ``float``, optional, default=``0.3`` 48 | 延迟标准差 49 | """ 50 | # TODO: 设计一个自适应的算法来控制延迟 51 | time.sleep(max(0.1, random.gauss(sleep_mean, sleep_dev))) 52 | res = translator.dictionary(text, dst=_LANG_MAPPING[dst], src=_LANG_MAPPING[src]) 53 | return res['trans_result']['data'][0]['dst'] 54 | 55 | def back_translate(text: str, 56 | lang_list: Iterable[str], 57 | sleep_mean: float = 1.0, 58 | sleep_dev: float = 0.3) -> str: 59 | """ 60 | 调用百度翻译接口进行单次回译。 61 | 62 | 参数: 63 | ------ 64 | text: ``str`` 65 | 要回译的文本 66 | lang_list: ``Iterable[str]`` 67 | 要回译的语言代码列表。例如要将一句话从中文翻译到英文再翻译回中文,则指定: 68 | ``lang_list=("zh-CN", "en", "zh-CN"])`` 69 | sleep_mean: ``float``, optional, default=``1.0`` 70 | 延迟均值(为了防止ip被封,需要延迟) 71 | sleep_dev: ``float``, optional, default=``0.3`` 72 | 延迟标准差 73 | """ 74 | assert len(lang_list) >= 2 75 | current_text = text 76 | for i in range(len(lang_list) - 1): 77 | current_text = trans(current_text, src=lang_list[i], dst=lang_list[i+1], 78 | sleep_mean=sleep_mean, sleep_dev=sleep_dev) 79 | return current_text 80 | 81 | def main(): 82 | input_file = "../input/questions.json" 83 | output_file = "../output/baidu.json" 84 | schemas = [ 85 | ('zh-CN', 'en', 'zh-CN'), 86 | ('zh-CN', 'ja', 'zh-CN'), 87 | ('zh-CN', 'ko', 'zh-CN'), 88 | ('zh-CN', 'fr', 'zh-CN'), 89 | ('zh-CN', 'es', 'zh-CN'), 90 | ('zh-CN', 'th', 'zh-CN'), 91 | ('zh-CN', 'de', 'zh-CN'), 92 | ('zh-CN', 'zh-TW', 
'zh-CN') 93 | ] 94 | 95 | input_json = json.load(open(input_file, "r", encoding="utf-8")) 96 | output_json = collections.OrderedDict() 97 | for i in tqdm.tqdm(range(102)): 98 | key = str(i + 1) 99 | text = input_json[key] 100 | item_result = collections.OrderedDict() 101 | item_result["origin"] = text 102 | for schema in schemas: 103 | try: 104 | item_result[" -> ".join(schema)] = back_translate(text, lang_list=schema) 105 | except Exception as e: 106 | print("您的请求速度过快") 107 | output_json[key] = item_result 108 | json.dump(output_json, open(output_file, "w", encoding="utf-8"), indent=4, ensure_ascii=False) 109 | 110 | def test(): 111 | schemas = [ 112 | ('zh-CN', 'en', 'zh-CN'), 113 | ('zh-CN', 'ja', 'zh-CN'), 114 | ('zh-CN', 'ko', 'zh-CN'), 115 | ('zh-CN', 'fr', 'zh-CN'), 116 | ('zh-CN', 'es', 'zh-CN'), 117 | ('zh-CN', 'th', 'zh-CN'), 118 | ('zh-CN', 'de', 'zh-CN'), 119 | ('zh-CN', 'zh-TW', 'zh-CN') 120 | ] 121 | origin = "什么是研发项目" 122 | for schema in schemas: 123 | print(" -> ".join(schema), ":", back_translate(text=origin, lang_list=schema)) 124 | 125 | 126 | if __name__ == "__main__": 127 | test() 128 | # main() -------------------------------------------------------------------------------- /baidu/translate.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import execjs 3 | import requests 4 | import re 5 | 6 | JS_CODE = """ 7 | function a(r, o) { 8 | for (var t = 0; t < o.length - 2; t += 3) { 9 | var a = o.charAt(t + 2); 10 | a = a >= "a" ? a.charCodeAt(0) - 87 : Number(a), 11 | a = "+" === o.charAt(t + 1) ? r >>> a: r << a, 12 | r = "+" === o.charAt(t) ? r + a & 4294967295 : r ^ a 13 | } 14 | return r 15 | } 16 | var C = null; 17 | var token = function(r, _gtk) { 18 | var o = r.length; 19 | o > 30 && (r = "" + r.substr(0, 10) + r.substr(Math.floor(o / 2) - 5, 10) + r.substring(r.length, r.length - 10)); 20 | var t = void 0, 21 | t = null !== C ? 
class Dict:
    """Session-based client for the Baidu Translate web endpoints.

    The constructor loads ``https://fanyi.baidu.com`` to obtain the ``token``
    and ``gtk`` values that every translation request must be signed with.
    """

    def __init__(self, timeout=10.0):
        """Create the HTTP session, compile the signing script and fetch tokens.

        ``timeout`` is the number of seconds each HTTP request may take.
        """
        self.sess = requests.Session()
        self.headers = {
            'User-Agent':
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
        }
        self.timeout = timeout
        self.token = None
        self.gtk = None
        self.javascript = execjs.compile(JS_CODE)

        # Obtain token and gtk.  The page has to be loaded twice to make sure
        # the token is current, otherwise the API answers with error 998.
        self.loadMainPage()
        self.loadMainPage()

    @staticmethod
    def _parse_main_page(html):
        """Extract ``(token, gtk)`` from the main page HTML.

        Raises ``IndexError`` (the exception the original lookup produced)
        with an explanatory message when either value is missing.
        """
        token = re.search(r"token: '(.*?)',", html)
        gtk = re.search(r"window.gtk = '(.*?)';", html)
        if token is None or gtk is None:
            raise IndexError("token/gtk not found on the baidu translate main page")
        return token.group(1), gtk.group(1)

    def loadMainPage(self):
        """
        load main page : https://fanyi.baidu.com/
        and get token, gtk
        """
        url = 'https://fanyi.baidu.com'

        r = self.sess.get(url, headers=self.headers, timeout=self.timeout)
        self.token, self.gtk = self._parse_main_page(r.text)

    def langdetect(self, query):
        """
        post query to https://fanyi.baidu.com/langdetect
        return json like
        {"error":0,"msg":"success","lan":"en"}

        Returns the detected language code, or ``None`` when the reply does
        not report success.
        """
        url = 'https://fanyi.baidu.com/langdetect'
        data = {'query': query}
        r = self.sess.post(url=url, data=data, timeout=self.timeout)

        payload = r.json()
        if 'msg' in payload and payload['msg'] == 'success':
            return payload['lan']
        return None

    def dictionary(self, query, dst='zh', src=None):
        """
        get translate result from https://fanyi.baidu.com/v2transapi

        ``src`` is auto-detected through ``langdetect`` when omitted.
        Returns the decoded JSON reply, or ``None`` for a non-200 response.
        Raises ``Exception`` when the reply carries an ``error`` field.
        """
        url = 'https://fanyi.baidu.com/v2transapi'

        if not src:
            src = self.langdetect(query)

        # Error 998 means the main page has to be reloaded to get a new
        # token, so refresh and retry once before giving up.
        for attempt in range(2):
            sign = self.javascript.call('token', query, self.gtk)
            data = {
                'from': src,
                'to': dst,
                'query': query,
                'simple_means_flag': 3,
                'sign': sign,
                'token': self.token,
            }
            r = self.sess.post(url=url, data=data, timeout=self.timeout)

            if r.status_code != 200:
                return None
            payload = r.json()
            if 'error' not in payload:
                return payload
            if attempt == 0 and str(payload['error']) == '998':
                self.loadMainPage()
                continue
            raise Exception('baidu sdk error: {}'.format(payload['error']))
        return None