├── README.md
├── config.json
├── requirements.txt
├── script.py
└── voices
    └── example.wav

/README.md:
--------------------------------------------------------------------------------

# Install from the Original Repo:
https://github.com/kanttouchthis/text_generation_webui_xtts

# Example
This took about 3 seconds to render on a 4090.
![output](https://github.com/RandomInternetPreson/text_generation_webui_xtt_Alts/assets/6488699/cbcf7952-93bb-4ec9-8540-53e38baf310a)

https://github.com/RandomInternetPreson/text_generation_webui_xtt_Alts/assets/6488699/a455b6ff-a6c4-41a0-abed-65f37bf6dd90

# The original repo should be compatible with Windows now; install from there:
Original repo: https://github.com/kanttouchthis/text_generation_webui_xtts
Use these instructions if you need help installing extensions.

# Installation (Windows)

https://github.com/RandomInternetPreson/text_generation_webui_xtt_Alts/assets/6488699/30185557-d9ec-431d-bf56-20ffbaa0bd31

Clone this repo with git, or clone it using the text-generation-webui software:
```
git clone https://github.com/kanttouchthis/text_generation_webui_xtts

Or just enter the URL in the "Install or update an extension" field on the "Session" tab (remember to press Enter):
https://github.com/kanttouchthis/text_generation_webui_xtts
```
Activate your environment.
```
Go to the folder where you unzipped the one-click installer and click on cmd_windows.bat.
```
Install the dependencies for TTS.
```
Via the command window that has now popped up, navigate to the text_generation_webui_xtts extension in the "extensions" folder in the textgen install directory. To do this, enter:

cd your-directory-here

cd means "change directory", and your directory will look something like this: L:\OobNov13\text-generation-webui-main\extensions\text_generation_webui_xtts
So you would put "cd L:\OobNov13\text-generation-webui-main\extensions\text_generation_webui_xtts" in the command window (without the quotes).

Once you have navigated to the extension's install directory from the perspective of the command window, enter the following:

pip install -r requirements.txt
```
Install TTS. Its version requirements cause issues, so we install the dependencies above without version pins.
```
With the command window still open (reopen it if you have closed it), enter the text below:
pip install TTS --no-dependencies
```

# Installation
Clone this repo:
```
cd extensions
git clone https://github.com/kanttouchthis/text_generation_webui_xtts
```
Activate your environment. For example:
```
conda activate textgen
```
Install the dependencies for TTS.
```
pip install -r requirements.txt
```
Install TTS. Its version requirements cause issues, so we install the dependencies above without version pins.
```
pip install TTS --no-dependencies
```

# Usage
Once you have finished the steps above, you can add some voices to the voices folder. A voice can be any short (3-6 second) wav clip of someone talking. Make sure it's high-quality audio with no long gaps.
Then run the webui with `--extensions xtts` and select your voice/language and other settings at the bottom. You may have to accept the terms and conditions via the console the first time you run it.
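# Quick install check
Since `pip install TTS --no-dependencies` skips Coqui's own version pins, it can be worth a quick smoke test before launching the webui. A minimal sketch, run from the extension folder (the output filename is just an example):
```
from TTS.api import TTS

# Downloads xtts_v2 on first use, then clones the bundled example voice.
tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2", gpu=True)
tts.tts_to_file(text="Installation check.",
                file_path="test.wav",                # throwaway output file
                speaker_wav=["voices/example.wav"],  # any short reference clip works
                language="en")
```
If this produces a playable test.wav, the extension's own model loading should work too.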
--------------------------------------------------------------------------------
/config.json:
--------------------------------------------------------------------------------
{
    "activate": true,
    "autoplay": true,
    "show_text": true,
    "combine": true,
    "voice": "example.wav",
    "narrator": "example.wav",
    "language": "English",
    "available_languages": {
        "English": "en",
        "Spanish": "es",
        "French": "fr",
        "German": "de",
        "Italian": "it",
        "Portuguese": "pt",
        "Polish": "pl",
        "Turkish": "tr",
        "Russian": "ru",
        "Dutch": "nl",
        "Czech": "cs",
        "Arabic": "ar",
        "Chinese": "zh-cn",
        "Japanese": "ja",
        "Hungarian": "hu",
        "Korean": "ko"
    }
}
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
cython
scipy
soundfile
librosa
scikit-learn
numba
inflect
tqdm
anyascii
pyyaml
fsspec
aiohttp
packaging
pysbd
coqpit
unidecode
num2words
trainer
# bangla
bangla
bnnumerizer
bnunicodenormalizer
# de es fr
gruut[de,es,fr]
# korean
jamo
nltk
g2pkk
hangul_romanize
# chinese
jieba
pypinyin
# japanese
spacy[ja]
cutlet
unidic-lite
mecab-python3
--------------------------------------------------------------------------------
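Note: in script.py below, the human-readable "language" setting from config.json is resolved to the ISO code that XTTS expects via the "available_languages" map. A minimal sketch of that lookup (the config path is assumed to be relative to the extension directory):
```
import json

# Load the extension config shown above.
with open("config.json") as f:
    params = json.load(f)

languages = params["available_languages"]
print(languages[params["language"]])  # "English" -> "en", passed to tts_to_file()
```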
/script.py:
--------------------------------------------------------------------------------
from TTS.api import TTS
import os
import json
import time
from pathlib import Path
import gradio as gr
import soundfile as sf
import numpy as np
from modules import shared

streaming_state = shared.args.no_stream

tts = None
this_dir = os.path.dirname(os.path.abspath(__file__))
params = json.load(open(f"{this_dir}/config.json"))
languages = params["available_languages"]
voice_presets = sorted(os.listdir(f"{this_dir}/voices"))
narrator_presets = ["None", "Skip"] + voice_presets


def preprocess(raw_input):
    # Undo the HTML escaping applied by the webui so entity names are not read aloud.
    raw_input = raw_input.replace("&amp;", "&")
    raw_input = raw_input.replace("&lt;", "<")
    raw_input = raw_input.replace("&gt;", ">")
    raw_input = raw_input.replace("&quot;", '"')
    raw_input = raw_input.replace("&#x27;", "'")
    raw_input = raw_input.strip("\"")
    return raw_input


def preprocess_narrator(raw_input):
    raw_input = preprocess(raw_input)
    raw_input = raw_input.replace("***", "*")
    raw_input = raw_input.replace("**", "*")
    narrated_text = raw_input.split("*")
    return raw_input, narrated_text


def combine(audiofiles):
    audio = np.array([])
    for audiofile in audiofiles:
        audio = np.concatenate((audio, sf.read(audiofile)[0]))
    return audio


def history_modifier(history):
    # Strip the autoplay attribute from the last message so old clips don't replay.
    if len(history["internal"]) > 0:
        history["visible"][-1] = [
            history["visible"][-1][0],
            history["visible"][-1][1].replace(
                "controls autoplay style=\"height: 30px;\">", "controls style=\"height: 30px;\">")
        ]
    return history


def format_html(audiofiles):
    # NOTE: the <audio> markup in this file was lost during extraction; the tags
    # below are reconstructions, and the "file/..." src prefix is an assumption
    # based on how text-generation-webui serves extension files.
    if params["combine"]:
        autoplay = "autoplay" if params["autoplay"] else ""
        combined = combine(audiofiles)
        time_label = audiofiles[0].split("/")[-1].split("_")[0]
        sf.write(f"{this_dir}/generated/{time_label}_combined.wav",
                 combined, 24000)
        return f'<audio src="file/{this_dir}/generated/{time_label}_combined.wav" controls {autoplay} style="height: 30px;"></audio>'
    else:
        string = ""
        for audiofile in audiofiles:
            string += f'<audio src="file/{audiofile}" controls style="height: 30px;"></audio>'
        return string


def input_modifier(string):
    if not params["activate"]:
        shared.processing_message = "*Is typing...*"
        return string
    shared.processing_message = "*Is recording a voice message...*"
    shared.args.no_stream = True
    return string


def tts_char(string):
    global tts
    if not params["activate"]:
        return string

    if tts is None:
        print("[XTTS] Loading XTTS...")
        tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2", gpu=True)

    ttstext = preprocess(string)
    time_label = int(time.time())
    tts.tts_to_file(text=ttstext,
                    file_path=f"{this_dir}/generated/{time_label}.wav",
                    speaker_wav=[f"{this_dir}/voices/{params['voice']}"],
                    language=languages[params["language"]])

    autoplay = "autoplay" if params["autoplay"] else ""

    # Reconstructed markup (see the note in format_html).
    string = f'<audio src="file/{this_dir}/generated/{time_label}.wav" controls {autoplay} style="height: 30px;"></audio>'
    if params["show_text"]:
        string += f"<br>{ttstext}"

    shared.args.no_stream = streaming_state
    return string


def tts_narrator(string):
    global tts
    if not params["activate"]:
        return string

    if tts is None:
        print("[XTTS] Loading XTTS...")
        tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2", gpu=True)

    ttstext, turns = preprocess_narrator(string)
    voices = (params["voice"], params["narrator"])
    audiofiles = []
    time_label = int(time.time())
    for i, turn in enumerate(turns):
        if turn.strip() == "":
            continue
        voice = voices[i % 2]
        if voice == "Skip":
            continue
        tts.tts_to_file(text=turn,
                        file_path=f"{this_dir}/generated/{time_label}_{i:03d}.wav",
                        speaker_wav=[f"{this_dir}/voices/{voice}"],
                        language=languages[params["language"]])
        audiofiles.append(
            f"{this_dir}/generated/{time_label}_{i:03d}.wav")

    string = format_html(audiofiles)
    if params["show_text"]:
        string += f"<br>{ttstext}"
    shared.args.no_stream = streaming_state
    return string


def output_modifier(string):
    if params["narrator"] == "None":
        return tts_char(string)
    else:
        return tts_narrator(string)


def setup():
    global tts
    print("[XTTS] Loading XTTS...")
    tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2", gpu=True)
    print("[XTTS] Creating directories (if they don't exist)...")
    if not Path(f"{this_dir}/generated").exists():
        Path(f"{this_dir}/generated").mkdir(parents=True)
    print("[XTTS] Done!")


def ui():
    with gr.Accordion("XTTS"):
        with gr.Row():
            activate = gr.Checkbox(
                value=params["activate"], label="Activate TTS")
            autoplay = gr.Checkbox(value=params["autoplay"], label="Autoplay")
            show_text = gr.Checkbox(
                value=params["show_text"], label="Show text")
            combine_audio = gr.Checkbox(
                value=params["combine"], label="Combine audio")
        with gr.Row():
            voice = gr.Dropdown(
                voice_presets, label="Voice Wav", value=params["voice"])
            narrator = gr.Dropdown(
                narrator_presets, label="Narrator Wav", value=params["narrator"])
            language = gr.Dropdown(
                languages.keys(), label="Language", value=params["language"])

    activate.change(lambda x: params.update({"activate": x}), activate, None)
    autoplay.change(lambda x: params.update({"autoplay": x}), autoplay, None)
    show_text.change(lambda x: params.update(
        {"show_text": x}), show_text, None)
    combine_audio.change(lambda x: params.update(
        {"combine": x}), combine_audio, None)

    voice.change(lambda x: params.update({"voice": x}), voice, None)
    narrator.change(lambda x: params.update({"narrator": x}), narrator, None)
    language.change(lambda x: params.update({"language": x}), language, None)
--------------------------------------------------------------------------------
/voices/example.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RandomInternetPreson/text_generation_webui_xtt_Alts/f379b3561e2a02853c15b493a410af774eefff06/voices/example.wav
--------------------------------------------------------------------------------