├── requirements.txt ├── README.md └── translate.py /requirements.txt: -------------------------------------------------------------------------------- 1 | tiktoken 2 | openai 3 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # GPT-LN-Translator 💬📚 2 | 3 | This is a command-line tool for translating novels using OpenAI's GPT language models. The tool allows you to translate a TXT file from any language to English using the OpenAI API. 4 | 5 | ## Usage 🚀 6 | 7 | To use the tool, you will need to provide the following arguments: 8 | 9 | - `--input`: The path to the TXT file that you want to translate. 10 | - `--lang-out`: The language of the output text (default: English). 11 | - `--length-limit`: The maximum length of the prompt (default: 4000; you can go up to 8000 with gpt-4, but the API has a little trouble with long prompts, so I recommend staying at or below 6000 with it). 12 | - `--openai-key`: Your OpenAI API key. 13 | - `--openai-model`: The OpenAI API engine to use (default: gpt-3.5-turbo; other options: text-davinci-003, gpt-4). 14 | 15 | Here's an example of how to use the tool: 16 | ```python translate.py --input novel.txt --lang-out English --length-limit 4000 --openai-key YOUR_KEY --openai-model gpt-3.5-turbo``` 17 | 18 | ## Installation 🛠️ 19 | 20 | 1. Clone the repository: 21 | ```git clone https://github.com/Snowad14/GPT-LN-Translator.git``` 22 | 23 | 2. Install the dependencies: 24 | ```pip install -r requirements.txt``` 25 | 26 | ## Explanation of Code 🤔 27 | 28 | My code uses a coefficient, calculated from previous translations, that estimates approximately how many words the model will produce. 
This allows to find the ideal number of Japanese characters to put in the prompt so that it can give a complete translation in the desired language with the maximum possible context 29 | -------------------------------------------------------------------------------- /translate.py: -------------------------------------------------------------------------------- 1 | import argparse, tiktoken, openai, math, concurrent.futures, uuid 2 | 3 | parser = argparse.ArgumentParser() 4 | parser.add_argument('--input', type=str, required=True, help='TXT File path that will be translated') 5 | parser.add_argument('--lang-out', type=str, default='English', help='Directory for storing model') 6 | parser.add_argument('--length-limit', type=int, default=4000, help='Max Length of the prompt') 7 | parser.add_argument('--openai-key', type=str, required=False, help='OpenAI API Key') 8 | parser.add_argument('--openai-model', type=str, default='gpt-3.5-turbo', help='OpenAI API Engine') # gpt-3.5-turbo, text-davinci-003, gpt-4 9 | args = parser.parse_args() 10 | openai.api_key = args.openai_key 11 | 12 | # Language Coefficient are the coefficients that allow to estimate the number of output tokens of the translation 13 | LANGUAGE_COEF = { 14 | "French" : 0.7, # JP In : [1935, 1942, 1919, 1925, 1894, 1938, 1938] | FR Out : [1359, 1244, 1208, 1208, 1258, 1221, 1115] | AVG : 0.6385 15 | "English": 0.5 # JP In : [1935, 1942, 1919, 1925, 1894, 1938] | EN Out : [917, 918, 868, 861, 930, 932] | AVG : 0.4697 16 | } 17 | 18 | CHOOSED_COEF = LANGUAGE_COEF.get(args.lang_out) if LANGUAGE_COEF.get(args.lang_out) else exit(f"Language not supported, please choose one language of the list : {list(LANGUAGE_COEF.keys())}") 19 | MODEL_ENCODER = tiktoken.encoding_for_model(args.openai_model) 20 | PROMPT_TEMPLATE = f"""I want you to act as an {args.lang_out} translator, spelling corrector and improver. 
I will speak to you in any language and you will detect the language, translate it and answer in the corrected and improved version of my text, in {args.lang_out}. I want you to replace my simplified A0-level words and sentences with more beautiful and elegant, upper level {args.lang_out} words and sentences. Keep the meaning same, but make them more literary. I want you to only reply the correction, the improvements and nothing else, do not write explanations. Don't forget to skip lines to space the text""" 21 | MAX_PROMPT_TOKEN = math.ceil(args.length_limit / (1 + CHOOSED_COEF)) 22 | CHAT_BASED_MODEL = ["gpt-3.5-turbo", "gpt-4"] 23 | 24 | def getTokensCountFromString(string): 25 | return len(MODEL_ENCODER.encode(string)) 26 | 27 | def getTokensCountFromChatBased(messages): 28 | num_tokens = 0 29 | for message in messages: 30 | num_tokens += 4 31 | for key, value in message.items(): 32 | num_tokens += getTokensCountFromString(value) 33 | if key == "name": 34 | num_tokens += -1 35 | num_tokens += 2 36 | return num_tokens 37 | 38 | def craftMessageWithPrompt(prompt): 39 | return [ 40 | {"role": "system", "content": PROMPT_TEMPLATE}, 41 | {"role": "user", "content": prompt} 42 | ] 43 | 44 | def translate(content): 45 | index, message = content 46 | print(f"Starting translation task #{index}") 47 | response = openai.ChatCompletion.create( 48 | model=args.openai_model, 49 | messages=message, 50 | temperature=0.7, 51 | request_timeout=1200 52 | ) 53 | if response['usage']['total_tokens'] < args.length_limit + 97: 54 | print(f"Finished translation task #{index} with {response['usage']['total_tokens']} tokens used") 55 | else: 56 | print(f"Warning for Translation task #{index} : Used {response['usage']['total_tokens']} tokens : the prompt is too long, the translation will miss some words") 57 | with open(f"{index}.txt", "w", encoding="utf-8", errors="ignore") as writer: 58 | writer.write(response['choices'][0]['message']['content']) 59 | return 
response['choices'][0]['message']['content'] 60 | 61 | text_parts = [] 62 | prompt = "" 63 | num_line = len(open(args.input, "r", encoding="utf-8", errors="ignore").readlines()) 64 | for c, line in enumerate(open(args.input, "r", encoding="utf-8", errors="ignore")): 65 | if args.openai_model in CHAT_BASED_MODEL: 66 | messages= craftMessageWithPrompt(prompt) 67 | next_messages= craftMessageWithPrompt(prompt + line) 68 | if getTokensCountFromChatBased(messages) <= MAX_PROMPT_TOKEN and getTokensCountFromChatBased(next_messages) <= MAX_PROMPT_TOKEN: 69 | prompt += line 70 | if c == num_line - 1: 71 | text_parts.append(prompt) 72 | else: 73 | text_parts.append(prompt) 74 | prompt = line 75 | else: 76 | exit("Model not supported, please use a chat based model") 77 | 78 | print(f"Text divided in {len(text_parts)} parts, starting translation with {args.openai_model} model") 79 | 80 | with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor: 81 | translated_messages = list(executor.map(translate, [(index, craftMessageWithPrompt(text_part)) for index, text_part in enumerate(text_parts)])) 82 | 83 | with open("output.txt", "w", encoding="utf-8", errors="ignore") as writer: 84 | for part in translated_messages: 85 | writer.write(part) 86 | 87 | 88 | --------------------------------------------------------------------------------