├── ping.wav
├── pong.wav
├── mymoegoe
│   ├── text
│   │   ├── cleaners.py
│   │   ├── __init__.py
│   │   └── english.py
│   ├── commons.py
│   ├── tts.py
│   ├── transforms.py
│   ├── attentions.py
│   ├── modules.py
│   └── models.py
├── requirements.txt
├── README.md
├── xtts
│   └── stream.py
└── launch.py
/ping.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/April93/ai-voice-assistant/HEAD/ping.wav
--------------------------------------------------------------------------------
/pong.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/April93/ai-voice-assistant/HEAD/pong.wav
--------------------------------------------------------------------------------
/mymoegoe/text/cleaners.py:
--------------------------------------------------------------------------------
1 | import re
2 | def cjke_cleaners2(text):
3 | from mymoegoe.text.english import english_to_ipa2
4 | text = re.sub(r'^(.*?)$',
5 | lambda x: english_to_ipa2(x.group(1))+' ', text)
6 | text = re.sub(r'\s+$', '', text)
7 | text = re.sub(r'([^\.,!\?\-…~])$', r'\1.', text)
8 | return text
9 | 
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | cloudscraper==1.2.68
2 | eng_to_ipa==0.0.2
3 | inflect==5.6.0
4 | numpy==1.23.5
5 | openai
6 | Pillow==9.5.0
7 | PySoundFile==0.9.0.post1
8 | pyttsx3==2.90
9 | Requests==2.31.0
10 | scipy==1.10.1
11 | sounddevice==0.4.5
12 | soundfile==0.12.1
13 | SpeechRecognition==3.9.0
14 | torch==2.0.1
15 | Unidecode==1.3.6
16 | WhisperMic
17 | 
--------------------------------------------------------------------------------
/mymoegoe/text/__init__.py:
--------------------------------------------------------------------------------
1 | """ from https://github.com/keithito/tacotron """
2 | from mymoegoe.text import cleaners
3 | 
4 | 
5 | def text_to_sequence(text, symbols, cleaner_names):
6 | '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
7 | Args: 8 | text: string to convert to a sequence 9 | cleaner_names: names of the cleaner functions to run the text through 10 | Returns: 11 | List of integers corresponding to the symbols in the text 12 | ''' 13 | _symbol_to_id = {s: i for i, s in enumerate(symbols)} 14 | 15 | sequence = [] 16 | 17 | clean_text = _clean_text(text, cleaner_names) 18 | for symbol in clean_text: 19 | if symbol not in _symbol_to_id.keys(): 20 | continue 21 | symbol_id = _symbol_to_id[symbol] 22 | sequence += [symbol_id] 23 | return sequence 24 | 25 | 26 | def _clean_text(text, cleaner_names): 27 | for name in cleaner_names: 28 | cleaner = getattr(cleaners, name) 29 | if not cleaner: 30 | raise Exception('Unknown cleaner: %s' % name) 31 | text = cleaner(text) 32 | return text 33 | -------------------------------------------------------------------------------- /mymoegoe/commons.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.nn import functional as F 3 | import torch.jit 4 | 5 | 6 | def script_method(fn, _rcb=None): 7 | return fn 8 | 9 | 10 | def script(obj, optimize=True, _frames_up=0, _rcb=None): 11 | return obj 12 | 13 | 14 | torch.jit.script_method = script_method 15 | torch.jit.script = script 16 | 17 | 18 | def init_weights(m, mean=0.0, std=0.01): 19 | classname = m.__class__.__name__ 20 | if classname.find("Conv") != -1: 21 | m.weight.data.normal_(mean, std) 22 | 23 | 24 | def get_padding(kernel_size, dilation=1): 25 | return int((kernel_size*dilation - dilation)/2) 26 | 27 | 28 | def intersperse(lst, item): 29 | result = [item] * (len(lst) * 2 + 1) 30 | result[1::2] = lst 31 | return result 32 | 33 | 34 | def slice_segments(x, ids_str, segment_size=4): 35 | ret = torch.zeros_like(x[:, :, :segment_size]) 36 | for i in range(x.size(0)): 37 | idx_str = ids_str[i] 38 | idx_end = idx_str + segment_size 39 | ret[i] = x[i, :, idx_str:idx_end] 40 | return ret 41 | 42 | 43 | def rand_slice_segments(x, x_lengths=None, segment_size=4): 44 | b, d, t = x.size() 45 | if x_lengths is None: 46 | x_lengths = t 47 | ids_str_max = x_lengths - segment_size + 1 48 | ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long) 49 | ret = slice_segments(x, ids_str, segment_size) 50 | return ret, ids_str 51 | 52 | 53 | def subsequent_mask(length): 54 | mask = torch.tril(torch.ones(length, length)).unsqueeze(0).unsqueeze(0) 55 | return mask 56 | 57 | 58 | @torch.jit.script 59 | def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels): 60 | n_channels_int = n_channels[0] 61 | in_act = input_a + input_b 62 | t_act = torch.tanh(in_act[:, :n_channels_int, :]) 63 | s_act = torch.sigmoid(in_act[:, n_channels_int:, :]) 64 | acts = t_act * s_act 65 | return acts 66 | 67 | 68 | def convert_pad_shape(pad_shape): 69 | l = pad_shape[::-1] 70 | pad_shape = [item for sublist in l for item in sublist] 71 | return pad_shape 72 | 73 | 74 | def sequence_mask(length, max_length=None): 75 | if max_length is None: 76 | max_length = length.max() 77 | x = torch.arange(max_length, dtype=length.dtype, device=length.device) 78 | return x.unsqueeze(0) < length.unsqueeze(1) 79 | 80 | 81 | def generate_path(duration, mask): 82 | """ 83 | duration: [b, 1, t_x] 84 | mask: [b, 1, t_y, t_x] 85 | """ 86 | device = duration.device 87 | 88 | b, _, t_y, t_x = mask.shape 89 | cum_duration = torch.cumsum(duration, -1) 90 | 91 | cum_duration_flat = cum_duration.view(b * t_x) 92 | path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype) 93 | path = path.view(b, 
t_x, t_y)
94 | path = path - F.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1]
95 | path = path.unsqueeze(1).transpose(2,3) * mask
96 | return path
97 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # ai-voice-assistant
2 | 
3 | I hooked up you.com's YouChat to speech-to-text and text-to-speech services to create an AI voice assistant. I'm using this alongside VB-Cable and VMagicMirror to create an anime character chatbot. The code supports both text and voice input, a configurable voice, wake word, and prompt context, and of course VB-Cable output. See `-h` for launch parameters.
4 | 
5 | 
6 | ## Optional setup
7 | 
8 | ### Local language model with Oobabooga Web UI
9 | 
10 | You can use this with [Oobabooga's Web UI](https://github.com/oobabooga/text-generation-webui/) for a local LLM (instead of YouChat) by launching with `--ooba`. When using this, you can also load TavernAI png/webp or Pygmalion/Oobabooga json character cards with `--chara filename.png`. The script assumes you are running Oobabooga with `--extensions api` on the default port 5000.
11 | 
12 | ### Local language model with LM Studio
13 | 
14 | You can use this with [LM Studio](https://lmstudio.ai/) for a local LLM (instead of YouChat) by launching with `--openai`. When using this, you can also load TavernAI png/webp or Pygmalion/Oobabooga json character cards with `--chara filename.png`. The script assumes you're running LM Studio and have started the local inference server on the default port 1234.
15 | 
16 | ### Local speech recognition with Vosk
17 | 
18 | Use `--vosk` to run with Vosk speech recognition instead of the default Whisper. Download a model from [here](https://alphacephei.com/vosk/models) and place it in the script's folder, renamed to `model`. Remember to launch with `--voiceinput` when using speech-to-text input.
19 | 
20 | ### Moegoe TTS
21 | 
22 | A modified version of MoeGoe is included in the repo. To use it, place a compatible `.pth` and `.json` model in `mymoegoe/models/`, making sure both files share the same name, then launch the script with `--moegoe`. The script assumes a model named `g.pth` and `g.json`. You can select a different model with `--mgmodel modelname` (no file extension), for example `--mgmodel g`. If the voice is too fast or too slow, adjust it with `--voicespeed 1.0`, changing the number as needed (higher is slower). A popular model with thousands of anime voices can be found [here.](https://huggingface.co/spaces/skytnt/moe-tts/tree/main/saved_model/15)
23 | 
24 | ### XTTS
25 | 
26 | Use XTTS-v2 as the TTS engine by launching the script with `--xtts`. The script requires the XTTS model to be placed in `xtts/models`; the expected default model folder name is `base v2.0.2`. You can use a reference voice by placing a wav file in `xtts/voices` and launching with both `--xtts` and `--voice filename` (without the wav extension). By default, `xtts/voices/en_sample.wav` is used as the reference. The XTTS-v2 model can be found [here.](https://huggingface.co/coqui/XTTS-v2)
27 | 
28 | ### Anime character visualization with VMagicMirror and VB-Cable
29 | 
30 | On Windows and Mac it's possible to install [VB-Cable](https://vb-audio.com/Cable/) and [VMagicMirror](https://github.com/malaybaku/VMagicMirror/) to send the TTS output to an on-screen anime character. Launch the script with `--vbcable` to route TTS to VB-Cable, then run VMagicMirror and set its microphone to the virtual VB-Cable microphone.
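Putting it together, here are a few illustrative invocations, assuming `launch.py` is the entry point and using placeholder file and model names — adjust them to your own setup:

- `python launch.py --voiceinput --wakeword='computer'` — voice input with the default TTS and YouChat.
- `python launch.py --openai --chara chara.png --moegoe --mgmodel g --vbcable` — LM Studio as the LLM, a character card, MoeGoe TTS, and audio routed to VB-Cable.
- `python launch.py --xtts --voice mysample --voiceinput --alwayslisten` — XTTS with a custom reference voice (`xtts/voices/mysample.wav`) and always-on listening.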
31 | 
32 | ## Launch Arguments
33 | 
34 | | Launch Argument | Description |
35 | | ------------- |:-------------:|
36 | |`--vbcable`|Send audio to the VB-Cable virtual microphone.|
37 | |`--voiceinput`|Interact with the AI using your voice instead of text.|
38 | |`--pc='string'`|Set a prompt context to prepend to prompts. Optionally it can be treated as fake chat history (see `--pcaschat`).|
39 | |`--pcaschat`|Sets the prompt context to be a fake chat history.|
40 | |`--caphistory=number`|Caps chat history length. Default is 4. Set to -1 to disable.|
41 | |`--voice=number/string`|Set the TTS voice.|
42 | |`--voices`|List voices on your computer.|
43 | |`--wakeword='string'`|Sets the wake word when using voice input.|
44 | |`--alwayslisten`|Always listen for input instead of using a wake word.|
45 | |`--ooba`|Use the local Oobabooga web UI as the LLM instead of YouChat.|
46 | |`--openai`|Use the OpenAI API (e.g. LM Studio's local server) as the LLM instead of YouChat.|
47 | |`--vosk`|Use local Vosk as the STT engine.|
48 | |`--googlestt`|Use Google's online service as the STT engine.|
49 | |`--chara='filename'`|Load a TavernAI character card or Oobabooga character JSON file.|
50 | |`--moegoe`|Use MoeGoe as the TTS engine instead of the default TTS.|
51 | |`--xtts`|Use XTTS as the TTS engine instead of the default TTS.|
52 | |`--bootmsg='string'`|What to say when booting up.|
53 | |`--wakeprompt`|Like `--alwayslisten`, but only prompts when the wake word is included.|
54 | |`--nowakeping`|Doesn't ping when starting to listen for the wake word.|
55 | |`--voicespeed=number`|Speed of MoeGoe TTS. Higher is slower. Default is 1.|
56 | |`--mgmodel='filename'`|Sets the filename of the MoeGoe model. Default is `g`.|
57 | |`--template='string'`|Specify a prompt template (chatml or phi3). Default is the typical chat format.|
58 | |`-v`|Print debug info.|
--------------------------------------------------------------------------------
/mymoegoe/text/english.py:
--------------------------------------------------------------------------------
1 | """ from https://github.com/keithito/tacotron """
2 | 
3 | '''
4 | Cleaners are transformations that run over the input text at both training and eval time.
5 | 
6 | Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners"
7 | hyperparameter. Some cleaners are English-specific. You'll typically want to use:
8 | 1. "english_cleaners" for English text
9 | 2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using
10 | the Unidecode library (https://pypi.python.org/pypi/Unidecode)
11 | 3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update
12 | the symbols in symbols.py to match your data).
13 | '''
14 | 
15 | 
16 | # Regular expression matching whitespace:
17 | 
18 | 
19 | import re
20 | import inflect
21 | from unidecode import unidecode
22 | import eng_to_ipa as ipa
23 | _inflect = inflect.engine()
24 | _comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])')
25 | _decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)')
26 | _pounds_re = re.compile(r'£([0-9\,]*[0-9]+)')
27 | _dollars_re = re.compile(r'\$([0-9\.\,]*[0-9]+)')
28 | _ordinal_re = re.compile(r'[0-9]+(st|nd|rd|th)')
29 | _number_re = re.compile(r'[0-9]+')
30 | 
31 | # List of (regular expression, replacement) pairs for abbreviations:
32 | _abbreviations = [(re.compile('\\b%s\\.' 
% x[0], re.IGNORECASE), x[1]) for x in [ 33 | ('mrs', 'misess'), 34 | ('mr', 'mister'), 35 | ('dr', 'doctor'), 36 | ('st', 'saint'), 37 | ('co', 'company'), 38 | ('jr', 'junior'), 39 | ('maj', 'major'), 40 | ('gen', 'general'), 41 | ('drs', 'doctors'), 42 | ('rev', 'reverend'), 43 | ('lt', 'lieutenant'), 44 | ('hon', 'honorable'), 45 | ('sgt', 'sergeant'), 46 | ('capt', 'captain'), 47 | ('esq', 'esquire'), 48 | ('ltd', 'limited'), 49 | ('col', 'colonel'), 50 | ('ft', 'fort'), 51 | ]] 52 | 53 | 54 | # List of (ipa, lazy ipa) pairs: 55 | _lazy_ipa = [(re.compile('%s' % x[0]), x[1]) for x in [ 56 | ('r', 'ɹ'), 57 | ('æ', 'e'), 58 | ('ɑ', 'a'), 59 | ('ɔ', 'o'), 60 | ('ð', 'z'), 61 | ('θ', 's'), 62 | ('ɛ', 'e'), 63 | ('ɪ', 'i'), 64 | ('ʊ', 'u'), 65 | ('ʒ', 'ʥ'), 66 | ('ʤ', 'ʥ'), 67 | ('ˈ', '↓'), 68 | ]] 69 | 70 | # List of (ipa, lazy ipa2) pairs: 71 | _lazy_ipa2 = [(re.compile('%s' % x[0]), x[1]) for x in [ 72 | ('r', 'ɹ'), 73 | ('ð', 'z'), 74 | ('θ', 's'), 75 | ('ʒ', 'ʑ'), 76 | ('ʤ', 'dʑ'), 77 | ('ˈ', '↓'), 78 | ]] 79 | 80 | # List of (ipa, ipa2) pairs 81 | _ipa_to_ipa2 = [(re.compile('%s' % x[0]), x[1]) for x in [ 82 | ('r', 'ɹ'), 83 | ('ʤ', 'dʒ'), 84 | ('ʧ', 'tʃ') 85 | ]] 86 | 87 | 88 | def expand_abbreviations(text): 89 | for regex, replacement in _abbreviations: 90 | text = re.sub(regex, replacement, text) 91 | return text 92 | 93 | 94 | def collapse_whitespace(text): 95 | return re.sub(r'\s+', ' ', text) 96 | 97 | 98 | def _remove_commas(m): 99 | return m.group(1).replace(',', '') 100 | 101 | 102 | def _expand_decimal_point(m): 103 | return m.group(1).replace('.', ' point ') 104 | 105 | 106 | def _expand_dollars(m): 107 | match = m.group(1) 108 | parts = match.split('.') 109 | if len(parts) > 2: 110 | return match + ' dollars' # Unexpected format 111 | dollars = int(parts[0]) if parts[0] else 0 112 | cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0 113 | if dollars and cents: 114 | dollar_unit = 'dollar' if dollars == 1 else 'dollars' 115 | cent_unit = 'cent' if cents == 1 else 'cents' 116 | return '%s %s, %s %s' % (dollars, dollar_unit, cents, cent_unit) 117 | elif dollars: 118 | dollar_unit = 'dollar' if dollars == 1 else 'dollars' 119 | return '%s %s' % (dollars, dollar_unit) 120 | elif cents: 121 | cent_unit = 'cent' if cents == 1 else 'cents' 122 | return '%s %s' % (cents, cent_unit) 123 | else: 124 | return 'zero dollars' 125 | 126 | 127 | def _expand_ordinal(m): 128 | return _inflect.number_to_words(m.group(0)) 129 | 130 | 131 | def _expand_number(m): 132 | num = int(m.group(0)) 133 | if num > 1000 and num < 3000: 134 | if num == 2000: 135 | return 'two thousand' 136 | elif num > 2000 and num < 2010: 137 | return 'two thousand ' + _inflect.number_to_words(num % 100) 138 | elif num % 100 == 0: 139 | return _inflect.number_to_words(num // 100) + ' hundred' 140 | else: 141 | return _inflect.number_to_words(num, andword='', zero='oh', group=2).replace(', ', ' ') 142 | else: 143 | return _inflect.number_to_words(num, andword='') 144 | 145 | 146 | def normalize_numbers(text): 147 | text = re.sub(_comma_number_re, _remove_commas, text) 148 | text = re.sub(_pounds_re, r'\1 pounds', text) 149 | text = re.sub(_dollars_re, _expand_dollars, text) 150 | text = re.sub(_decimal_number_re, _expand_decimal_point, text) 151 | text = re.sub(_ordinal_re, _expand_ordinal, text) 152 | text = re.sub(_number_re, _expand_number, text) 153 | return text 154 | 155 | 156 | def mark_dark_l(text): 157 | return re.sub(r'l([^aeiouæɑɔəɛɪʊ ]*(?: |$))', lambda x: 'ɫ'+x.group(1), text) 158 | 159 | 160 | 
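# Illustrative walkthrough of the pipeline below: english_to_ipa lowercases and
# transliterates the text, then runs expand_abbreviations and normalize_numbers before
# handing it to eng_to_ipa, so e.g. "Dr. Smith owes $3.50" becomes
# "doctor smith owes three dollars, fifty cents" prior to phonemization. The *_ipa2 and
# *_lazy_ipa variants then post-process the IPA string with the mapping tables above.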
def english_to_ipa(text): 161 | text = unidecode(text).lower() 162 | text = expand_abbreviations(text) 163 | text = normalize_numbers(text) 164 | phonemes = ipa.convert(text) 165 | phonemes = collapse_whitespace(phonemes) 166 | return phonemes 167 | 168 | 169 | def english_to_lazy_ipa(text): 170 | text = english_to_ipa(text) 171 | for regex, replacement in _lazy_ipa: 172 | text = re.sub(regex, replacement, text) 173 | return text 174 | 175 | 176 | def english_to_ipa2(text): 177 | text = english_to_ipa(text) 178 | text = mark_dark_l(text) 179 | for regex, replacement in _ipa_to_ipa2: 180 | text = re.sub(regex, replacement, text) 181 | return text.replace('...', '…') 182 | 183 | 184 | def english_to_lazy_ipa2(text): 185 | text = english_to_ipa(text) 186 | for regex, replacement in _lazy_ipa2: 187 | text = re.sub(regex, replacement, text) 188 | return text 189 | -------------------------------------------------------------------------------- /mymoegoe/tts.py: -------------------------------------------------------------------------------- 1 | #Needed to write wav file, no added install cost 2 | import wave 3 | import struct 4 | 5 | 6 | #Internal Reqs, no added install cost 7 | from mymoegoe.text import text_to_sequence, _clean_text 8 | from mymoegoe.models import SynthesizerTrn 9 | import mymoegoe.commons as commons 10 | 11 | #re is common 12 | import re 13 | 14 | #Torch is common 15 | from torch import no_grad, LongTensor 16 | 17 | #utils imports. Json and torch both common 18 | from json import loads 19 | from torch import load, FloatTensor 20 | import torch 21 | from scipy.io.wavfile import write 22 | import os 23 | 24 | #Utils 25 | #--------------- 26 | class HParams(): 27 | def __init__(self, **kwargs): 28 | for k, v in kwargs.items(): 29 | if type(v) == dict: 30 | v = HParams(**v) 31 | self[k] = v 32 | 33 | def keys(self): 34 | return self.__dict__.keys() 35 | 36 | def items(self): 37 | return self.__dict__.items() 38 | 39 | def values(self): 40 | return self.__dict__.values() 41 | 42 | def __len__(self): 43 | return len(self.__dict__) 44 | 45 | def __getitem__(self, key): 46 | return getattr(self, key) 47 | 48 | def __setitem__(self, key, value): 49 | return setattr(self, key, value) 50 | 51 | def __contains__(self, key): 52 | return key in self.__dict__ 53 | 54 | def __repr__(self): 55 | return self.__dict__.__repr__() 56 | 57 | 58 | def load_checkpoint(checkpoint_path, model): 59 | checkpoint_dict = load(checkpoint_path, map_location="cpu") 60 | iteration = checkpoint_dict['iteration'] 61 | saved_state_dict = checkpoint_dict['model'] 62 | if hasattr(model, 'module'): 63 | state_dict = model.module.state_dict() 64 | else: 65 | state_dict = model.state_dict() 66 | new_state_dict= {} 67 | for k, v in state_dict.items(): 68 | try: 69 | new_state_dict[k] = saved_state_dict[k] 70 | except: 71 | print("Not in dictionary: ", k) 72 | #logging.info("%s is not in the checkpoint" % k) 73 | new_state_dict[k] = v 74 | if hasattr(model, 'module'): 75 | model.module.load_state_dict(new_state_dict) 76 | else: 77 | model.load_state_dict(new_state_dict) 78 | #logging.info("Loaded checkpoint '{}' (iteration {})" .format( 79 | # checkpoint_path, iteration)) 80 | return 81 | 82 | 83 | def get_hparams_from_file(config_path): 84 | with open(config_path, "r", encoding="utf-8") as f: 85 | data = f.read() 86 | config = loads(data) 87 | 88 | hparams = HParams(**config) 89 | return hparams 90 | 91 | 92 | 93 | #Script 94 | #--------- 95 | 96 | #Model Loading 97 | mchoice = "g" 98 | model = 
"mymoegoe/models/"+mchoice+".pth" 99 | config = "mymoegoe/models/"+mchoice+".json" 100 | 101 | #Set speaker/voice, usually 0. Along with wav destination 102 | speaker_id = 0 103 | #out_path = "temp.wav" 104 | 105 | #Default Audio Settings 106 | defaultlength = 1 107 | defaultnoisescale = 0.667 108 | defaultnoisedeviation = 0.8 109 | 110 | #Audio Settings 111 | length_scale = 1 #length scale 112 | noise_scale = 0.5 #noise scale - phoneme length? 113 | noise_scale_w = 0.1 #deviation of noise - emotionality? 114 | 115 | #Input Text 116 | #text = "" 117 | 118 | n_symbols = 0 119 | hps_ms = None 120 | net_g_ms = None 121 | 122 | def loadtts(mgmodel): 123 | global model, config, mchoice 124 | mchoice = mgmodel 125 | script_path = os.path.abspath(__file__) 126 | directory = os.path.dirname(script_path) 127 | model = os.path.join(directory,"models/"+mgmodel+".pth") 128 | config = os.path.join(directory,"models/"+mgmodel+".json") 129 | global n_symbols, hps_ms, net_g_ms 130 | #Load params from the config 131 | hps_ms = get_hparams_from_file(config) 132 | 133 | #Seems to get number of speakers 134 | n_speakers = hps_ms.data.n_speakers if 'n_speakers' in hps_ms.data.keys() else 0 135 | #Seems to get number of symbols? 136 | n_symbols = len(hps_ms.symbols) if 'symbols' in hps_ms.keys() else 0 137 | #Get the speakers. 138 | speakers = hps_ms.speakers if 'speakers' in hps_ms.keys() else ['0'] 139 | #Emotion embedding stuff, seems unneeded 140 | emotion_embedding = hps_ms.data.emotion_embedding if 'emotion_embedding' in hps_ms.data.keys() else False 141 | 142 | #Some model loading stuff? 143 | net_g_ms = SynthesizerTrn( 144 | n_symbols, 145 | hps_ms.data.filter_length // 2 + 1, 146 | hps_ms.train.segment_size // hps_ms.data.hop_length, 147 | n_speakers=n_speakers, 148 | emotion_embedding=emotion_embedding, 149 | **hps_ms.model) 150 | net_g_ms.cuda() 151 | _ = net_g_ms.eval() 152 | load_checkpoint(model, net_g_ms) 153 | 154 | def tts(text, out_path="temp.wav", voice=speaker_id, speed=length_scale): 155 | speaker_id = voice 156 | length_scale = speed 157 | 158 | if n_symbols != 0: 159 | 160 | #Clean Text 161 | #text = text.replace("\"","") 162 | text_norm = text_to_sequence(text, hps_ms.symbols, hps_ms.data.text_cleaners) 163 | if hps_ms.data.add_blank: 164 | text_norm = commons.intersperse(text_norm, 0) 165 | text_norm = LongTensor(text_norm) 166 | stn_tst = text_norm 167 | #--------------- 168 | 169 | 170 | with no_grad(): 171 | 172 | #Generate the TTS audio 173 | x_tst = stn_tst.unsqueeze(0).cuda() 174 | x_tst_lengths = LongTensor([stn_tst.size(0)]).cuda() 175 | sid = LongTensor([speaker_id]).cuda() 176 | audio = net_g_ms.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=noise_scale, 177 | noise_scale_w=noise_scale_w, length_scale=length_scale)[0][0, 0].data.cpu().float().numpy() 178 | 179 | write(out_path, hps_ms.data.sampling_rate, audio) 180 | # # Save Wav File 181 | # with wave.open(out_path, 'wb') as wav_file: 182 | # # Set audio file parameters 183 | # wav_file.setnchannels(1) # Mono audio 184 | # wav_file.setsampwidth(2) # 16-bit audio 185 | # wav_file.setframerate(hps_ms.data.sampling_rate) # Sample Rate 186 | 187 | # # Write audio data to file 188 | # for sample in audio: 189 | # # Convert sample to 16-bit signed integer format 190 | # sample = max(-1, min(1, sample)) # Clamp sample to range [-1, 1] 191 | # sample = int(sample * 32767) # Scale sample to range [-32767, 32767] 192 | # packed_sample = struct.pack(' None: 23 | with open(filename, "wb") as f: 24 | f.write(audio) 25 | 26 | 27 | #Model 
Params 28 | basemodelname = "xtts/models/base v2.0.2/" 29 | modelname = basemodelname 30 | reference = "xtts/voices/en_sample.wav" 31 | 32 | config = None 33 | model = None 34 | 35 | #Load Model 36 | def loadModel(modelname="base v2.0.2", voice="en_sample"): 37 | global model, config, reference 38 | model_path = "xtts/models/"+modelname+"/" 39 | reference = "xtts/voices/"+voice+".wav" 40 | configname = model_path+"config.json" 41 | config = XttsConfig() 42 | config.load_json(configname) 43 | model = Xtts.init_from_config(config) 44 | model.load_checkpoint(config, checkpoint_dir=model_path, eval=True) 45 | model.cuda() 46 | print("TTS Model Loaded.") 47 | 48 | 49 | #clone speaker 50 | def predict_speaker(wav_file): 51 | """Compute conditioning inputs from reference audio file.""" 52 | gpt_cond_latent, speaker_embedding = model.get_conditioning_latents( 53 | wav_file 54 | ) 55 | return { 56 | "gpt_cond_latent": gpt_cond_latent.cpu().squeeze().half().tolist(), 57 | "speaker_embedding": speaker_embedding.cpu().squeeze().half().tolist(), 58 | } 59 | 60 | 61 | #Processing tts wav stuff for stream 62 | def postprocess(wav): 63 | """Post process the output waveform""" 64 | if isinstance(wav, list): 65 | wav = torch.cat(wav, dim=0) 66 | wav = wav.clone().detach().cpu().numpy() 67 | wav = wav[None, : int(wav.shape[0])] 68 | wav = np.clip(wav, -1, 1) 69 | wav = (wav * 32767).astype(np.int16) 70 | return wav 71 | def encode_audio_common(frame_input, encode_base64=True, sample_rate=24000, sample_width=2, channels=1): 72 | """Return base64 encoded audio""" 73 | wav_buf = io.BytesIO() 74 | with wave.open(wav_buf, "wb") as vfout: 75 | vfout.setnchannels(channels) 76 | vfout.setsampwidth(sample_width) 77 | vfout.setframerate(sample_rate) 78 | vfout.writeframes(frame_input) 79 | 80 | wav_buf.seek(0) 81 | if encode_base64: 82 | b64_encoded = base64.b64encode(wav_buf.getbuffer()).decode("utf-8") 83 | return b64_encoded 84 | else: 85 | return wav_buf.read() 86 | 87 | #Seems to generate the streamed tts output 88 | def predict_streaming_generator(parsed_input: dict): 89 | speaker_embedding = torch.tensor(parsed_input["speaker_embedding"]).unsqueeze(0).unsqueeze(-1) 90 | gpt_cond_latent = torch.tensor(parsed_input["gpt_cond_latent"]).reshape((-1, 1024)).unsqueeze(0) 91 | text = parsed_input["text"] 92 | language = parsed_input["language"] 93 | 94 | stream_chunk_size = int(parsed_input["stream_chunk_size"]) 95 | add_wav_header = False#parsed_input["add_wav_header"] 96 | 97 | 98 | chunks = model.inference_stream( 99 | text, 100 | language, 101 | gpt_cond_latent, 102 | speaker_embedding, 103 | stream_chunk_size=stream_chunk_size, 104 | enable_text_splitting=True 105 | ) 106 | 107 | for i, chunk in enumerate(chunks): 108 | chunk = postprocess(chunk) 109 | if i == 0 and add_wav_header: 110 | #This breaks playaudiostream but works for ffplay? 111 | yield encode_audio_common(b"", encode_base64=False) 112 | yield chunk.tobytes() 113 | else: 114 | yield chunk.tobytes() 115 | 116 | #Plays the tts output live? 
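# Typical flow (see the __main__ block below): loadModel() loads the XTTS checkpoint,
# get_speaker()/predict_speaker() turn a reference wav into conditioning latents,
# tts() drives predict_streaming_generator() and yields raw 16-bit PCM chunks,
# and stream_ffplay() pipes those chunks into ffplay for playback (or ffmpeg to save).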
117 | def stream_ffplay(audio_stream, output_file=None, save=False): 118 | if not save: 119 | ffplay_cmd = ["ffplay", "-nodisp", "-probesize", "1024", "-autoexit", "-"] 120 | else: 121 | print("Saving to ", output_file) 122 | ffplay_cmd = ["ffmpeg", "-probesize", "1024", "-i", "-", output_file] 123 | 124 | ffplay_proc = subprocess.Popen(ffplay_cmd, stdin=subprocess.PIPE) 125 | for chunk in audio_stream: 126 | if chunk is not None: 127 | ffplay_proc.stdin.write(chunk) 128 | 129 | # close on finish 130 | ffplay_proc.stdin.close() 131 | ffplay_proc.wait() 132 | 133 | 134 | def tts(text, speaker, language, stream_chunk_size, verbose=False) -> Iterator[bytes]: 135 | start = time.perf_counter() 136 | speaker["text"] = text 137 | speaker["language"] = language 138 | speaker["stream_chunk_size"] = stream_chunk_size # you can reduce it to get faster response, but degrade quality 139 | 140 | 141 | if verbose: 142 | end = time.perf_counter() 143 | print(f"Time to make POST: {end-start}s", file=sys.stderr) 144 | 145 | first = True 146 | #for chunk in res.iter_content(chunk_size=512): 147 | for chunk in predict_streaming_generator(speaker): 148 | if first: 149 | if verbose: 150 | end = time.perf_counter() 151 | print(f"Time to first chunk: {end-start}s", file=sys.stderr) 152 | first = False 153 | if chunk: 154 | yield chunk 155 | 156 | #print("⏱️ response.elapsed:", res.elapsed) 157 | 158 | 159 | def get_speaker(ref_audio): 160 | wav_file = open(ref_audio, "rb") 161 | response = predict_speaker(wav_file) 162 | return response 163 | 164 | 165 | if __name__ == "__main__": 166 | parser = argparse.ArgumentParser() 167 | parser.add_argument( 168 | "--text", 169 | default="It took me quite a long time to develop a voice and now that I have it I am not going to be silent.", 170 | help="text input for TTS" 171 | ) 172 | parser.add_argument( 173 | "--language", 174 | default="en", 175 | help="Language to use default is 'en' (English)" 176 | ) 177 | parser.add_argument( 178 | "--output_file", 179 | default=None, 180 | help="Save TTS output to given filename" 181 | ) 182 | parser.add_argument( 183 | "--ref_file", 184 | default=None, 185 | help="Reference audio file to use, when not given will use default" 186 | ) 187 | parser.add_argument( 188 | "--stream_chunk_size", 189 | default="20", 190 | help="Stream chunk size , 20 default, reducing will get faster latency but may degrade quality" 191 | ) 192 | args = parser.parse_args() 193 | 194 | loadModel() 195 | 196 | with open("./default_speaker.json", "r") as file: 197 | speaker = json.load(file) 198 | 199 | if args.ref_file is not None: 200 | print("Computing the latents for a new reference...") 201 | speaker = get_speaker(args.ref_file) 202 | 203 | audio = stream_ffplay( 204 | tts( 205 | args.text, 206 | speaker, 207 | args.language, 208 | args.stream_chunk_size 209 | ), 210 | args.output_file, 211 | save=bool(args.output_file) 212 | ) 213 | audio = stream_ffplay( 214 | tts( 215 | "This should play after the first one.", 216 | speaker, 217 | args.language, 218 | args.stream_chunk_size 219 | ), 220 | args.output_file, 221 | save=bool(args.output_file) 222 | ) 223 | -------------------------------------------------------------------------------- /mymoegoe/transforms.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.nn import functional as F 3 | 4 | import numpy as np 5 | 6 | 7 | DEFAULT_MIN_BIN_WIDTH = 1e-3 8 | DEFAULT_MIN_BIN_HEIGHT = 1e-3 9 | DEFAULT_MIN_DERIVATIVE = 1e-3 10 | 11 | 12 | def 
piecewise_rational_quadratic_transform(inputs, 13 | unnormalized_widths, 14 | unnormalized_heights, 15 | unnormalized_derivatives, 16 | inverse=False, 17 | tails=None, 18 | tail_bound=1., 19 | min_bin_width=DEFAULT_MIN_BIN_WIDTH, 20 | min_bin_height=DEFAULT_MIN_BIN_HEIGHT, 21 | min_derivative=DEFAULT_MIN_DERIVATIVE): 22 | 23 | if tails is None: 24 | spline_fn = rational_quadratic_spline 25 | spline_kwargs = {} 26 | else: 27 | spline_fn = unconstrained_rational_quadratic_spline 28 | spline_kwargs = { 29 | 'tails': tails, 30 | 'tail_bound': tail_bound 31 | } 32 | 33 | outputs, logabsdet = spline_fn( 34 | inputs=inputs, 35 | unnormalized_widths=unnormalized_widths, 36 | unnormalized_heights=unnormalized_heights, 37 | unnormalized_derivatives=unnormalized_derivatives, 38 | inverse=inverse, 39 | min_bin_width=min_bin_width, 40 | min_bin_height=min_bin_height, 41 | min_derivative=min_derivative, 42 | **spline_kwargs 43 | ) 44 | return outputs, logabsdet 45 | 46 | 47 | def searchsorted(bin_locations, inputs, eps=1e-6): 48 | bin_locations[..., -1] += eps 49 | return torch.sum( 50 | inputs[..., None] >= bin_locations, 51 | dim=-1 52 | ) - 1 53 | 54 | 55 | def unconstrained_rational_quadratic_spline(inputs, 56 | unnormalized_widths, 57 | unnormalized_heights, 58 | unnormalized_derivatives, 59 | inverse=False, 60 | tails='linear', 61 | tail_bound=1., 62 | min_bin_width=DEFAULT_MIN_BIN_WIDTH, 63 | min_bin_height=DEFAULT_MIN_BIN_HEIGHT, 64 | min_derivative=DEFAULT_MIN_DERIVATIVE): 65 | inside_interval_mask = (inputs >= -tail_bound) & (inputs <= tail_bound) 66 | outside_interval_mask = ~inside_interval_mask 67 | 68 | outputs = torch.zeros_like(inputs) 69 | logabsdet = torch.zeros_like(inputs) 70 | 71 | if tails == 'linear': 72 | unnormalized_derivatives = F.pad(unnormalized_derivatives, pad=(1, 1)) 73 | constant = np.log(np.exp(1 - min_derivative) - 1) 74 | unnormalized_derivatives[..., 0] = constant 75 | unnormalized_derivatives[..., -1] = constant 76 | 77 | outputs[outside_interval_mask] = inputs[outside_interval_mask] 78 | logabsdet[outside_interval_mask] = 0 79 | else: 80 | raise RuntimeError('{} tails are not implemented.'.format(tails)) 81 | 82 | outputs[inside_interval_mask], logabsdet[inside_interval_mask] = rational_quadratic_spline( 83 | inputs=inputs[inside_interval_mask], 84 | unnormalized_widths=unnormalized_widths[inside_interval_mask, :], 85 | unnormalized_heights=unnormalized_heights[inside_interval_mask, :], 86 | unnormalized_derivatives=unnormalized_derivatives[inside_interval_mask, :], 87 | inverse=inverse, 88 | left=-tail_bound, right=tail_bound, bottom=-tail_bound, top=tail_bound, 89 | min_bin_width=min_bin_width, 90 | min_bin_height=min_bin_height, 91 | min_derivative=min_derivative 92 | ) 93 | 94 | return outputs, logabsdet 95 | 96 | def rational_quadratic_spline(inputs, 97 | unnormalized_widths, 98 | unnormalized_heights, 99 | unnormalized_derivatives, 100 | inverse=False, 101 | left=0., right=1., bottom=0., top=1., 102 | min_bin_width=DEFAULT_MIN_BIN_WIDTH, 103 | min_bin_height=DEFAULT_MIN_BIN_HEIGHT, 104 | min_derivative=DEFAULT_MIN_DERIVATIVE): 105 | if torch.min(inputs) < left or torch.max(inputs) > right: 106 | raise ValueError('Input to a transform is not within its domain') 107 | 108 | num_bins = unnormalized_widths.shape[-1] 109 | 110 | if min_bin_width * num_bins > 1.0: 111 | raise ValueError('Minimal bin width too large for the number of bins') 112 | if min_bin_height * num_bins > 1.0: 113 | raise ValueError('Minimal bin height too large for the number of bins') 114 | 
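# Bin widths and heights below are a softmax over the unnormalized parameters, floored at
# min_bin_width / min_bin_height, then cumulatively summed and rescaled to [left, right]
# and [bottom, top]; knot derivatives are kept positive via softplus plus min_derivative.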
115 | widths = F.softmax(unnormalized_widths, dim=-1) 116 | widths = min_bin_width + (1 - min_bin_width * num_bins) * widths 117 | cumwidths = torch.cumsum(widths, dim=-1) 118 | cumwidths = F.pad(cumwidths, pad=(1, 0), mode='constant', value=0.0) 119 | cumwidths = (right - left) * cumwidths + left 120 | cumwidths[..., 0] = left 121 | cumwidths[..., -1] = right 122 | widths = cumwidths[..., 1:] - cumwidths[..., :-1] 123 | 124 | derivatives = min_derivative + F.softplus(unnormalized_derivatives) 125 | 126 | heights = F.softmax(unnormalized_heights, dim=-1) 127 | heights = min_bin_height + (1 - min_bin_height * num_bins) * heights 128 | cumheights = torch.cumsum(heights, dim=-1) 129 | cumheights = F.pad(cumheights, pad=(1, 0), mode='constant', value=0.0) 130 | cumheights = (top - bottom) * cumheights + bottom 131 | cumheights[..., 0] = bottom 132 | cumheights[..., -1] = top 133 | heights = cumheights[..., 1:] - cumheights[..., :-1] 134 | 135 | if inverse: 136 | bin_idx = searchsorted(cumheights, inputs)[..., None] 137 | else: 138 | bin_idx = searchsorted(cumwidths, inputs)[..., None] 139 | 140 | input_cumwidths = cumwidths.gather(-1, bin_idx)[..., 0] 141 | input_bin_widths = widths.gather(-1, bin_idx)[..., 0] 142 | 143 | input_cumheights = cumheights.gather(-1, bin_idx)[..., 0] 144 | delta = heights / widths 145 | input_delta = delta.gather(-1, bin_idx)[..., 0] 146 | 147 | input_derivatives = derivatives.gather(-1, bin_idx)[..., 0] 148 | input_derivatives_plus_one = derivatives[..., 1:].gather(-1, bin_idx)[..., 0] 149 | 150 | input_heights = heights.gather(-1, bin_idx)[..., 0] 151 | 152 | if inverse: 153 | a = (((inputs - input_cumheights) * (input_derivatives 154 | + input_derivatives_plus_one 155 | - 2 * input_delta) 156 | + input_heights * (input_delta - input_derivatives))) 157 | b = (input_heights * input_derivatives 158 | - (inputs - input_cumheights) * (input_derivatives 159 | + input_derivatives_plus_one 160 | - 2 * input_delta)) 161 | c = - input_delta * (inputs - input_cumheights) 162 | 163 | discriminant = b.pow(2) - 4 * a * c 164 | assert (discriminant >= 0).all() 165 | 166 | root = (2 * c) / (-b - torch.sqrt(discriminant)) 167 | outputs = root * input_bin_widths + input_cumwidths 168 | 169 | theta_one_minus_theta = root * (1 - root) 170 | denominator = input_delta + ((input_derivatives + input_derivatives_plus_one - 2 * input_delta) 171 | * theta_one_minus_theta) 172 | derivative_numerator = input_delta.pow(2) * (input_derivatives_plus_one * root.pow(2) 173 | + 2 * input_delta * theta_one_minus_theta 174 | + input_derivatives * (1 - root).pow(2)) 175 | logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator) 176 | 177 | return outputs, -logabsdet 178 | else: 179 | theta = (inputs - input_cumwidths) / input_bin_widths 180 | theta_one_minus_theta = theta * (1 - theta) 181 | 182 | numerator = input_heights * (input_delta * theta.pow(2) 183 | + input_derivatives * theta_one_minus_theta) 184 | denominator = input_delta + ((input_derivatives + input_derivatives_plus_one - 2 * input_delta) 185 | * theta_one_minus_theta) 186 | outputs = input_cumheights + numerator / denominator 187 | 188 | derivative_numerator = input_delta.pow(2) * (input_derivatives_plus_one * theta.pow(2) 189 | + 2 * input_delta * theta_one_minus_theta 190 | + input_derivatives * (1 - theta).pow(2)) 191 | logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator) 192 | 193 | return outputs, logabsdet 194 | -------------------------------------------------------------------------------- 
/mymoegoe/attentions.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | from torch import nn 4 | from torch.nn import functional as F 5 | 6 | import mymoegoe.commons as commons 7 | from mymoegoe.modules import LayerNorm 8 | 9 | 10 | class Encoder(nn.Module): 11 | def __init__(self, hidden_channels, filter_channels, n_heads, n_layers, kernel_size=1, p_dropout=0., window_size=4, **kwargs): 12 | super().__init__() 13 | self.hidden_channels = hidden_channels 14 | self.filter_channels = filter_channels 15 | self.n_heads = n_heads 16 | self.n_layers = n_layers 17 | self.kernel_size = kernel_size 18 | self.p_dropout = p_dropout 19 | self.window_size = window_size 20 | 21 | self.drop = nn.Dropout(p_dropout) 22 | self.attn_layers = nn.ModuleList() 23 | self.norm_layers_1 = nn.ModuleList() 24 | self.ffn_layers = nn.ModuleList() 25 | self.norm_layers_2 = nn.ModuleList() 26 | for i in range(self.n_layers): 27 | self.attn_layers.append(MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout, window_size=window_size)) 28 | self.norm_layers_1.append(LayerNorm(hidden_channels)) 29 | self.ffn_layers.append(FFN(hidden_channels, hidden_channels, filter_channels, kernel_size, p_dropout=p_dropout)) 30 | self.norm_layers_2.append(LayerNorm(hidden_channels)) 31 | 32 | def forward(self, x, x_mask): 33 | attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1) 34 | x = x * x_mask 35 | for i in range(self.n_layers): 36 | y = self.attn_layers[i](x, x, attn_mask) 37 | y = self.drop(y) 38 | x = self.norm_layers_1[i](x + y) 39 | 40 | y = self.ffn_layers[i](x, x_mask) 41 | y = self.drop(y) 42 | x = self.norm_layers_2[i](x + y) 43 | x = x * x_mask 44 | return x 45 | 46 | 47 | class Decoder(nn.Module): 48 | def __init__(self, hidden_channels, filter_channels, n_heads, n_layers, kernel_size=1, p_dropout=0., proximal_bias=False, proximal_init=True, **kwargs): 49 | super().__init__() 50 | self.hidden_channels = hidden_channels 51 | self.filter_channels = filter_channels 52 | self.n_heads = n_heads 53 | self.n_layers = n_layers 54 | self.kernel_size = kernel_size 55 | self.p_dropout = p_dropout 56 | self.proximal_bias = proximal_bias 57 | self.proximal_init = proximal_init 58 | 59 | self.drop = nn.Dropout(p_dropout) 60 | self.self_attn_layers = nn.ModuleList() 61 | self.norm_layers_0 = nn.ModuleList() 62 | self.encdec_attn_layers = nn.ModuleList() 63 | self.norm_layers_1 = nn.ModuleList() 64 | self.ffn_layers = nn.ModuleList() 65 | self.norm_layers_2 = nn.ModuleList() 66 | for i in range(self.n_layers): 67 | self.self_attn_layers.append(MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout, proximal_bias=proximal_bias, proximal_init=proximal_init)) 68 | self.norm_layers_0.append(LayerNorm(hidden_channels)) 69 | self.encdec_attn_layers.append(MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout)) 70 | self.norm_layers_1.append(LayerNorm(hidden_channels)) 71 | self.ffn_layers.append(FFN(hidden_channels, hidden_channels, filter_channels, kernel_size, p_dropout=p_dropout, causal=True)) 72 | self.norm_layers_2.append(LayerNorm(hidden_channels)) 73 | 74 | def forward(self, x, x_mask, h, h_mask): 75 | """ 76 | x: decoder input 77 | h: encoder output 78 | """ 79 | self_attn_mask = commons.subsequent_mask(x_mask.size(2)).to(device=x.device, dtype=x.dtype) 80 | encdec_attn_mask = h_mask.unsqueeze(2) * x_mask.unsqueeze(-1) 81 | x = x * x_mask 82 | for i in range(self.n_layers): 83 | y = 
self.self_attn_layers[i](x, x, self_attn_mask) 84 | y = self.drop(y) 85 | x = self.norm_layers_0[i](x + y) 86 | 87 | y = self.encdec_attn_layers[i](x, h, encdec_attn_mask) 88 | y = self.drop(y) 89 | x = self.norm_layers_1[i](x + y) 90 | 91 | y = self.ffn_layers[i](x, x_mask) 92 | y = self.drop(y) 93 | x = self.norm_layers_2[i](x + y) 94 | x = x * x_mask 95 | return x 96 | 97 | 98 | class MultiHeadAttention(nn.Module): 99 | def __init__(self, channels, out_channels, n_heads, p_dropout=0., window_size=None, heads_share=True, block_length=None, proximal_bias=False, proximal_init=False): 100 | super().__init__() 101 | assert channels % n_heads == 0 102 | 103 | self.channels = channels 104 | self.out_channels = out_channels 105 | self.n_heads = n_heads 106 | self.p_dropout = p_dropout 107 | self.window_size = window_size 108 | self.heads_share = heads_share 109 | self.block_length = block_length 110 | self.proximal_bias = proximal_bias 111 | self.proximal_init = proximal_init 112 | self.attn = None 113 | 114 | self.k_channels = channels // n_heads 115 | self.conv_q = nn.Conv1d(channels, channels, 1) 116 | self.conv_k = nn.Conv1d(channels, channels, 1) 117 | self.conv_v = nn.Conv1d(channels, channels, 1) 118 | self.conv_o = nn.Conv1d(channels, out_channels, 1) 119 | self.drop = nn.Dropout(p_dropout) 120 | 121 | if window_size is not None: 122 | n_heads_rel = 1 if heads_share else n_heads 123 | rel_stddev = self.k_channels**-0.5 124 | self.emb_rel_k = nn.Parameter(torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev) 125 | self.emb_rel_v = nn.Parameter(torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev) 126 | 127 | nn.init.xavier_uniform_(self.conv_q.weight) 128 | nn.init.xavier_uniform_(self.conv_k.weight) 129 | nn.init.xavier_uniform_(self.conv_v.weight) 130 | if proximal_init: 131 | with torch.no_grad(): 132 | self.conv_k.weight.copy_(self.conv_q.weight) 133 | self.conv_k.bias.copy_(self.conv_q.bias) 134 | 135 | def forward(self, x, c, attn_mask=None): 136 | q = self.conv_q(x) 137 | k = self.conv_k(c) 138 | v = self.conv_v(c) 139 | 140 | x, self.attn = self.attention(q, k, v, mask=attn_mask) 141 | 142 | x = self.conv_o(x) 143 | return x 144 | 145 | def attention(self, query, key, value, mask=None): 146 | # reshape [b, d, t] -> [b, n_h, t, d_k] 147 | b, d, t_s, t_t = (*key.size(), query.size(2)) 148 | query = query.view(b, self.n_heads, self.k_channels, t_t).transpose(2, 3) 149 | key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3) 150 | value = value.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3) 151 | 152 | scores = torch.matmul(query / math.sqrt(self.k_channels), key.transpose(-2, -1)) 153 | if self.window_size is not None: 154 | assert t_s == t_t, "Relative attention is only available for self-attention." 155 | key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, t_s) 156 | rel_logits = self._matmul_with_relative_keys(query /math.sqrt(self.k_channels), key_relative_embeddings) 157 | scores_local = self._relative_position_to_absolute_position(rel_logits) 158 | scores = scores + scores_local 159 | if self.proximal_bias: 160 | assert t_s == t_t, "Proximal bias is only available for self-attention." 161 | scores = scores + self._attention_bias_proximal(t_s).to(device=scores.device, dtype=scores.dtype) 162 | if mask is not None: 163 | scores = scores.masked_fill(mask == 0, -1e4) 164 | if self.block_length is not None: 165 | assert t_s == t_t, "Local attention is only available for self-attention." 
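# block_length restricts attention to a band of width block_length around the diagonal.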
166 | block_mask = torch.ones_like(scores).triu(-self.block_length).tril(self.block_length) 167 | scores = scores.masked_fill(block_mask == 0, -1e4) 168 | p_attn = F.softmax(scores, dim=-1) # [b, n_h, t_t, t_s] 169 | p_attn = self.drop(p_attn) 170 | output = torch.matmul(p_attn, value) 171 | if self.window_size is not None: 172 | relative_weights = self._absolute_position_to_relative_position(p_attn) 173 | value_relative_embeddings = self._get_relative_embeddings(self.emb_rel_v, t_s) 174 | output = output + self._matmul_with_relative_values(relative_weights, value_relative_embeddings) 175 | output = output.transpose(2, 3).contiguous().view(b, d, t_t) # [b, n_h, t_t, d_k] -> [b, d, t_t] 176 | return output, p_attn 177 | 178 | def _matmul_with_relative_values(self, x, y): 179 | """ 180 | x: [b, h, l, m] 181 | y: [h or 1, m, d] 182 | ret: [b, h, l, d] 183 | """ 184 | ret = torch.matmul(x, y.unsqueeze(0)) 185 | return ret 186 | 187 | def _matmul_with_relative_keys(self, x, y): 188 | """ 189 | x: [b, h, l, d] 190 | y: [h or 1, m, d] 191 | ret: [b, h, l, m] 192 | """ 193 | ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1)) 194 | return ret 195 | 196 | def _get_relative_embeddings(self, relative_embeddings, length): 197 | max_relative_position = 2 * self.window_size + 1 198 | # Pad first before slice to avoid using cond ops. 199 | pad_length = max(length - (self.window_size + 1), 0) 200 | slice_start_position = max((self.window_size + 1) - length, 0) 201 | slice_end_position = slice_start_position + 2 * length - 1 202 | if pad_length > 0: 203 | padded_relative_embeddings = F.pad( 204 | relative_embeddings, 205 | commons.convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]])) 206 | else: 207 | padded_relative_embeddings = relative_embeddings 208 | used_relative_embeddings = padded_relative_embeddings[:,slice_start_position:slice_end_position] 209 | return used_relative_embeddings 210 | 211 | def _relative_position_to_absolute_position(self, x): 212 | """ 213 | x: [b, h, l, 2*l-1] 214 | ret: [b, h, l, l] 215 | """ 216 | batch, heads, length, _ = x.size() 217 | # Concat columns of pad to shift from relative to absolute indexing. 218 | x = F.pad(x, commons.convert_pad_shape([[0,0],[0,0],[0,0],[0,1]])) 219 | 220 | # Concat extra elements so to add up to shape (len+1, 2*len-1). 221 | x_flat = x.view([batch, heads, length * 2 * length]) 222 | x_flat = F.pad(x_flat, commons.convert_pad_shape([[0,0],[0,0],[0,length-1]])) 223 | 224 | # Reshape and slice out the padded elements. 225 | x_final = x_flat.view([batch, heads, length+1, 2*length-1])[:, :, :length, length-1:] 226 | return x_final 227 | 228 | def _absolute_position_to_relative_position(self, x): 229 | """ 230 | x: [b, h, l, l] 231 | ret: [b, h, l, 2*l-1] 232 | """ 233 | batch, heads, length, _ = x.size() 234 | # padd along column 235 | x = F.pad(x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length-1]])) 236 | x_flat = x.view([batch, heads, length**2 + length*(length -1)]) 237 | # add 0's in the beginning that will skew the elements after reshape 238 | x_flat = F.pad(x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [length, 0]])) 239 | x_final = x_flat.view([batch, heads, length, 2*length])[:,:,:,1:] 240 | return x_final 241 | 242 | def _attention_bias_proximal(self, length): 243 | """Bias for self-attention to encourage attention to close positions. 244 | Args: 245 | length: an integer scalar. 
246 | Returns: 247 | a Tensor with shape [1, 1, length, length] 248 | """ 249 | r = torch.arange(length, dtype=torch.float32) 250 | diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1) 251 | return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0) 252 | 253 | 254 | class FFN(nn.Module): 255 | def __init__(self, in_channels, out_channels, filter_channels, kernel_size, p_dropout=0., activation=None, causal=False): 256 | super().__init__() 257 | self.in_channels = in_channels 258 | self.out_channels = out_channels 259 | self.filter_channels = filter_channels 260 | self.kernel_size = kernel_size 261 | self.p_dropout = p_dropout 262 | self.activation = activation 263 | self.causal = causal 264 | 265 | if causal: 266 | self.padding = self._causal_padding 267 | else: 268 | self.padding = self._same_padding 269 | 270 | self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size) 271 | self.conv_2 = nn.Conv1d(filter_channels, out_channels, kernel_size) 272 | self.drop = nn.Dropout(p_dropout) 273 | 274 | def forward(self, x, x_mask): 275 | x = self.conv_1(self.padding(x * x_mask)) 276 | if self.activation == "gelu": 277 | x = x * torch.sigmoid(1.702 * x) 278 | else: 279 | x = torch.relu(x) 280 | x = self.drop(x) 281 | x = self.conv_2(self.padding(x * x_mask)) 282 | return x * x_mask 283 | 284 | def _causal_padding(self, x): 285 | if self.kernel_size == 1: 286 | return x 287 | pad_l = self.kernel_size - 1 288 | pad_r = 0 289 | padding = [[0, 0], [0, 0], [pad_l, pad_r]] 290 | x = F.pad(x, commons.convert_pad_shape(padding)) 291 | return x 292 | 293 | def _same_padding(self, x): 294 | if self.kernel_size == 1: 295 | return x 296 | pad_l = (self.kernel_size - 1) // 2 297 | pad_r = self.kernel_size // 2 298 | padding = [[0, 0], [0, 0], [pad_l, pad_r]] 299 | x = F.pad(x, commons.convert_pad_shape(padding)) 300 | return x 301 | -------------------------------------------------------------------------------- /mymoegoe/modules.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | from torch import nn 4 | from torch.nn import functional as F 5 | 6 | from torch.nn import Conv1d 7 | from torch.nn.utils import weight_norm, remove_weight_norm 8 | 9 | import mymoegoe.commons as commons 10 | from mymoegoe.commons import init_weights, get_padding 11 | from mymoegoe.transforms import piecewise_rational_quadratic_transform 12 | 13 | 14 | LRELU_SLOPE = 0.1 15 | 16 | 17 | class LayerNorm(nn.Module): 18 | def __init__(self, channels, eps=1e-5): 19 | super().__init__() 20 | self.channels = channels 21 | self.eps = eps 22 | 23 | self.gamma = nn.Parameter(torch.ones(channels)) 24 | self.beta = nn.Parameter(torch.zeros(channels)) 25 | 26 | def forward(self, x): 27 | x = x.transpose(1, -1) 28 | x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps) 29 | return x.transpose(1, -1) 30 | 31 | 32 | class ConvReluNorm(nn.Module): 33 | def __init__(self, in_channels, hidden_channels, out_channels, kernel_size, n_layers, p_dropout): 34 | super().__init__() 35 | self.in_channels = in_channels 36 | self.hidden_channels = hidden_channels 37 | self.out_channels = out_channels 38 | self.kernel_size = kernel_size 39 | self.n_layers = n_layers 40 | self.p_dropout = p_dropout 41 | assert n_layers > 1, "Number of layers should be larger than 0." 
42 | 43 | self.conv_layers = nn.ModuleList() 44 | self.norm_layers = nn.ModuleList() 45 | self.conv_layers.append(nn.Conv1d(in_channels, hidden_channels, kernel_size, padding=kernel_size//2)) 46 | self.norm_layers.append(LayerNorm(hidden_channels)) 47 | self.relu_drop = nn.Sequential( 48 | nn.ReLU(), 49 | nn.Dropout(p_dropout)) 50 | for _ in range(n_layers-1): 51 | self.conv_layers.append(nn.Conv1d(hidden_channels, hidden_channels, kernel_size, padding=kernel_size//2)) 52 | self.norm_layers.append(LayerNorm(hidden_channels)) 53 | self.proj = nn.Conv1d(hidden_channels, out_channels, 1) 54 | self.proj.weight.data.zero_() 55 | self.proj.bias.data.zero_() 56 | 57 | def forward(self, x, x_mask): 58 | x_org = x 59 | for i in range(self.n_layers): 60 | x = self.conv_layers[i](x * x_mask) 61 | x = self.norm_layers[i](x) 62 | x = self.relu_drop(x) 63 | x = x_org + self.proj(x) 64 | return x * x_mask 65 | 66 | 67 | class DDSConv(nn.Module): 68 | """ 69 | Dilated and Depth-Separable Convolution 70 | """ 71 | def __init__(self, channels, kernel_size, n_layers, p_dropout=0.): 72 | super().__init__() 73 | self.channels = channels 74 | self.kernel_size = kernel_size 75 | self.n_layers = n_layers 76 | self.p_dropout = p_dropout 77 | 78 | self.drop = nn.Dropout(p_dropout) 79 | self.convs_sep = nn.ModuleList() 80 | self.convs_1x1 = nn.ModuleList() 81 | self.norms_1 = nn.ModuleList() 82 | self.norms_2 = nn.ModuleList() 83 | for i in range(n_layers): 84 | dilation = kernel_size ** i 85 | padding = (kernel_size * dilation - dilation) // 2 86 | self.convs_sep.append(nn.Conv1d(channels, channels, kernel_size, 87 | groups=channels, dilation=dilation, padding=padding 88 | )) 89 | self.convs_1x1.append(nn.Conv1d(channels, channels, 1)) 90 | self.norms_1.append(LayerNorm(channels)) 91 | self.norms_2.append(LayerNorm(channels)) 92 | 93 | def forward(self, x, x_mask, g=None): 94 | if g is not None: 95 | x = x + g 96 | for i in range(self.n_layers): 97 | y = self.convs_sep[i](x * x_mask) 98 | y = self.norms_1[i](y) 99 | y = F.gelu(y) 100 | y = self.convs_1x1[i](y) 101 | y = self.norms_2[i](y) 102 | y = F.gelu(y) 103 | y = self.drop(y) 104 | x = x + y 105 | return x * x_mask 106 | 107 | 108 | class WN(torch.nn.Module): 109 | def __init__(self, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=0, p_dropout=0): 110 | super(WN, self).__init__() 111 | assert(kernel_size % 2 == 1) 112 | self.hidden_channels =hidden_channels 113 | self.kernel_size = kernel_size, 114 | self.dilation_rate = dilation_rate 115 | self.n_layers = n_layers 116 | self.gin_channels = gin_channels 117 | self.p_dropout = p_dropout 118 | 119 | self.in_layers = torch.nn.ModuleList() 120 | self.res_skip_layers = torch.nn.ModuleList() 121 | self.drop = nn.Dropout(p_dropout) 122 | 123 | if gin_channels != 0: 124 | cond_layer = torch.nn.Conv1d(gin_channels, 2*hidden_channels*n_layers, 1) 125 | self.cond_layer = torch.nn.utils.weight_norm(cond_layer, name='weight') 126 | 127 | for i in range(n_layers): 128 | dilation = dilation_rate ** i 129 | padding = int((kernel_size * dilation - dilation) / 2) 130 | in_layer = torch.nn.Conv1d(hidden_channels, 2*hidden_channels, kernel_size, 131 | dilation=dilation, padding=padding) 132 | in_layer = torch.nn.utils.weight_norm(in_layer, name='weight') 133 | self.in_layers.append(in_layer) 134 | 135 | # last one is not necessary 136 | if i < n_layers - 1: 137 | res_skip_channels = 2 * hidden_channels 138 | else: 139 | res_skip_channels = hidden_channels 140 | 141 | res_skip_layer = 
torch.nn.Conv1d(hidden_channels, res_skip_channels, 1) 142 | res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name='weight') 143 | self.res_skip_layers.append(res_skip_layer) 144 | 145 | def forward(self, x, x_mask, g=None, **kwargs): 146 | output = torch.zeros_like(x) 147 | n_channels_tensor = torch.IntTensor([self.hidden_channels]) 148 | 149 | if g is not None: 150 | g = self.cond_layer(g) 151 | 152 | for i in range(self.n_layers): 153 | x_in = self.in_layers[i](x) 154 | if g is not None: 155 | cond_offset = i * 2 * self.hidden_channels 156 | g_l = g[:,cond_offset:cond_offset+2*self.hidden_channels,:] 157 | else: 158 | g_l = torch.zeros_like(x_in) 159 | 160 | acts = commons.fused_add_tanh_sigmoid_multiply( 161 | x_in, 162 | g_l, 163 | n_channels_tensor) 164 | acts = self.drop(acts) 165 | 166 | res_skip_acts = self.res_skip_layers[i](acts) 167 | if i < self.n_layers - 1: 168 | res_acts = res_skip_acts[:,:self.hidden_channels,:] 169 | x = (x + res_acts) * x_mask 170 | output = output + res_skip_acts[:,self.hidden_channels:,:] 171 | else: 172 | output = output + res_skip_acts 173 | return output * x_mask 174 | 175 | def remove_weight_norm(self): 176 | if self.gin_channels != 0: 177 | torch.nn.utils.remove_weight_norm(self.cond_layer) 178 | for l in self.in_layers: 179 | torch.nn.utils.remove_weight_norm(l) 180 | for l in self.res_skip_layers: 181 | torch.nn.utils.remove_weight_norm(l) 182 | 183 | 184 | class ResBlock1(torch.nn.Module): 185 | def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)): 186 | super(ResBlock1, self).__init__() 187 | self.convs1 = nn.ModuleList([ 188 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0], 189 | padding=get_padding(kernel_size, dilation[0]))), 190 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1], 191 | padding=get_padding(kernel_size, dilation[1]))), 192 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[2], 193 | padding=get_padding(kernel_size, dilation[2]))) 194 | ]) 195 | self.convs1.apply(init_weights) 196 | 197 | self.convs2 = nn.ModuleList([ 198 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, 199 | padding=get_padding(kernel_size, 1))), 200 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, 201 | padding=get_padding(kernel_size, 1))), 202 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, 203 | padding=get_padding(kernel_size, 1))) 204 | ]) 205 | self.convs2.apply(init_weights) 206 | 207 | def forward(self, x, x_mask=None): 208 | for c1, c2 in zip(self.convs1, self.convs2): 209 | xt = F.leaky_relu(x, LRELU_SLOPE) 210 | if x_mask is not None: 211 | xt = xt * x_mask 212 | xt = c1(xt) 213 | xt = F.leaky_relu(xt, LRELU_SLOPE) 214 | if x_mask is not None: 215 | xt = xt * x_mask 216 | xt = c2(xt) 217 | x = xt + x 218 | if x_mask is not None: 219 | x = x * x_mask 220 | return x 221 | 222 | def remove_weight_norm(self): 223 | for l in self.convs1: 224 | remove_weight_norm(l) 225 | for l in self.convs2: 226 | remove_weight_norm(l) 227 | 228 | 229 | class ResBlock2(torch.nn.Module): 230 | def __init__(self, channels, kernel_size=3, dilation=(1, 3)): 231 | super(ResBlock2, self).__init__() 232 | self.convs = nn.ModuleList([ 233 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0], 234 | padding=get_padding(kernel_size, dilation[0]))), 235 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1], 236 | padding=get_padding(kernel_size, dilation[1]))) 237 | ]) 
238 | self.convs.apply(init_weights) 239 | 240 | def forward(self, x, x_mask=None): 241 | for c in self.convs: 242 | xt = F.leaky_relu(x, LRELU_SLOPE) 243 | if x_mask is not None: 244 | xt = xt * x_mask 245 | xt = c(xt) 246 | x = xt + x 247 | if x_mask is not None: 248 | x = x * x_mask 249 | return x 250 | 251 | def remove_weight_norm(self): 252 | for l in self.convs: 253 | remove_weight_norm(l) 254 | 255 | 256 | class Log(nn.Module): 257 | def forward(self, x, x_mask, reverse=False, **kwargs): 258 | if not reverse: 259 | y = torch.log(torch.clamp_min(x, 1e-5)) * x_mask 260 | logdet = torch.sum(-y, [1, 2]) 261 | return y, logdet 262 | else: 263 | x = torch.exp(x) * x_mask 264 | return x 265 | 266 | 267 | class Flip(nn.Module): 268 | def forward(self, x, *args, reverse=False, **kwargs): 269 | x = torch.flip(x, [1]) 270 | if not reverse: 271 | logdet = torch.zeros(x.size(0)).to(dtype=x.dtype, device=x.device) 272 | return x, logdet 273 | else: 274 | return x 275 | 276 | 277 | class ElementwiseAffine(nn.Module): 278 | def __init__(self, channels): 279 | super().__init__() 280 | self.channels = channels 281 | self.m = nn.Parameter(torch.zeros(channels,1)) 282 | self.logs = nn.Parameter(torch.zeros(channels,1)) 283 | 284 | def forward(self, x, x_mask, reverse=False, **kwargs): 285 | if not reverse: 286 | y = self.m + torch.exp(self.logs) * x 287 | y = y * x_mask 288 | logdet = torch.sum(self.logs * x_mask, [1,2]) 289 | return y, logdet 290 | else: 291 | x = (x - self.m) * torch.exp(-self.logs) * x_mask 292 | return x 293 | 294 | 295 | class ResidualCouplingLayer(nn.Module): 296 | def __init__(self, 297 | channels, 298 | hidden_channels, 299 | kernel_size, 300 | dilation_rate, 301 | n_layers, 302 | p_dropout=0, 303 | gin_channels=0, 304 | mean_only=False): 305 | assert channels % 2 == 0, "channels should be divisible by 2" 306 | super().__init__() 307 | self.channels = channels 308 | self.hidden_channels = hidden_channels 309 | self.kernel_size = kernel_size 310 | self.dilation_rate = dilation_rate 311 | self.n_layers = n_layers 312 | self.half_channels = channels // 2 313 | self.mean_only = mean_only 314 | 315 | self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1) 316 | self.enc = WN(hidden_channels, kernel_size, dilation_rate, n_layers, p_dropout=p_dropout, gin_channels=gin_channels) 317 | self.post = nn.Conv1d(hidden_channels, self.half_channels * (2 - mean_only), 1) 318 | self.post.weight.data.zero_() 319 | self.post.bias.data.zero_() 320 | 321 | def forward(self, x, x_mask, g=None, reverse=False): 322 | x0, x1 = torch.split(x, [self.half_channels]*2, 1) 323 | h = self.pre(x0) * x_mask 324 | h = self.enc(h, x_mask, g=g) 325 | stats = self.post(h) * x_mask 326 | if not self.mean_only: 327 | m, logs = torch.split(stats, [self.half_channels]*2, 1) 328 | else: 329 | m = stats 330 | logs = torch.zeros_like(m) 331 | 332 | if not reverse: 333 | x1 = m + x1 * torch.exp(logs) * x_mask 334 | x = torch.cat([x0, x1], 1) 335 | logdet = torch.sum(logs, [1,2]) 336 | return x, logdet 337 | else: 338 | x1 = (x1 - m) * torch.exp(-logs) * x_mask 339 | x = torch.cat([x0, x1], 1) 340 | return x 341 | 342 | 343 | class ConvFlow(nn.Module): 344 | def __init__(self, in_channels, filter_channels, kernel_size, n_layers, num_bins=10, tail_bound=5.0): 345 | super().__init__() 346 | self.in_channels = in_channels 347 | self.filter_channels = filter_channels 348 | self.kernel_size = kernel_size 349 | self.n_layers = n_layers 350 | self.num_bins = num_bins 351 | self.tail_bound = tail_bound 352 | 
self.half_channels = in_channels // 2 353 | 354 | self.pre = nn.Conv1d(self.half_channels, filter_channels, 1) 355 | self.convs = DDSConv(filter_channels, kernel_size, n_layers, p_dropout=0.) 356 | self.proj = nn.Conv1d(filter_channels, self.half_channels * (num_bins * 3 - 1), 1) 357 | self.proj.weight.data.zero_() 358 | self.proj.bias.data.zero_() 359 | 360 | def forward(self, x, x_mask, g=None, reverse=False): 361 | x0, x1 = torch.split(x, [self.half_channels]*2, 1) 362 | h = self.pre(x0) 363 | h = self.convs(h, x_mask, g=g) 364 | h = self.proj(h) * x_mask 365 | 366 | b, c, t = x0.shape 367 | h = h.reshape(b, c, -1, t).permute(0, 1, 3, 2) # [b, cx?, t] -> [b, c, t, ?] 368 | 369 | unnormalized_widths = h[..., :self.num_bins] / math.sqrt(self.filter_channels) 370 | unnormalized_heights = h[..., self.num_bins:2*self.num_bins] / math.sqrt(self.filter_channels) 371 | unnormalized_derivatives = h[..., 2 * self.num_bins:] 372 | 373 | x1, logabsdet = piecewise_rational_quadratic_transform(x1, 374 | unnormalized_widths, 375 | unnormalized_heights, 376 | unnormalized_derivatives, 377 | inverse=reverse, 378 | tails='linear', 379 | tail_bound=self.tail_bound 380 | ) 381 | 382 | x = torch.cat([x0, x1], 1) * x_mask 383 | logdet = torch.sum(logabsdet * x_mask, [1,2]) 384 | if not reverse: 385 | return x, logdet 386 | else: 387 | return x 388 | -------------------------------------------------------------------------------- /mymoegoe/models.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | from torch import nn 4 | from torch.nn import functional as F 5 | 6 | import mymoegoe.commons as commons 7 | import mymoegoe.modules as modules 8 | import mymoegoe.attentions as attentions 9 | 10 | from torch.nn import Conv1d, ConvTranspose1d 11 | from torch.nn.utils import weight_norm 12 | from mymoegoe.commons import init_weights 13 | 14 | 15 | class StochasticDurationPredictor(nn.Module): 16 | def __init__(self, in_channels, filter_channels, kernel_size, p_dropout, n_flows=4, gin_channels=0): 17 | super().__init__() 18 | filter_channels = in_channels # it needs to be removed from future version. 
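# The layers assigned below build two normalizing-flow stacks over a 2-channel input:
# self.flows (an ElementwiseAffine followed by alternating ConvFlow/Flip steps) models
# the duration distribution conditioned on the detached text-encoder output, while
# self.post_flows and the post_* convolutions define the variational posterior that is
# only used in the training (non-reverse) branch of forward().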
19 | self.in_channels = in_channels 20 | self.filter_channels = filter_channels 21 | self.kernel_size = kernel_size 22 | self.p_dropout = p_dropout 23 | self.n_flows = n_flows 24 | self.gin_channels = gin_channels 25 | 26 | self.log_flow = modules.Log() 27 | self.flows = nn.ModuleList() 28 | self.flows.append(modules.ElementwiseAffine(2)) 29 | for i in range(n_flows): 30 | self.flows.append(modules.ConvFlow(2, filter_channels, kernel_size, n_layers=3)) 31 | self.flows.append(modules.Flip()) 32 | 33 | self.post_pre = nn.Conv1d(1, filter_channels, 1) 34 | self.post_proj = nn.Conv1d(filter_channels, filter_channels, 1) 35 | self.post_convs = modules.DDSConv(filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout) 36 | self.post_flows = nn.ModuleList() 37 | self.post_flows.append(modules.ElementwiseAffine(2)) 38 | for i in range(4): 39 | self.post_flows.append(modules.ConvFlow(2, filter_channels, kernel_size, n_layers=3)) 40 | self.post_flows.append(modules.Flip()) 41 | 42 | self.pre = nn.Conv1d(in_channels, filter_channels, 1) 43 | self.proj = nn.Conv1d(filter_channels, filter_channels, 1) 44 | self.convs = modules.DDSConv(filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout) 45 | if gin_channels != 0: 46 | self.cond = nn.Conv1d(gin_channels, filter_channels, 1) 47 | 48 | def forward(self, x, x_mask, w=None, g=None, reverse=False, noise_scale=1.0): 49 | x = torch.detach(x) 50 | x = self.pre(x) 51 | if g is not None: 52 | g = torch.detach(g) 53 | x = x + self.cond(g) 54 | x = self.convs(x, x_mask) 55 | x = self.proj(x) * x_mask 56 | 57 | if not reverse: 58 | flows = self.flows 59 | assert w is not None 60 | 61 | logdet_tot_q = 0 62 | h_w = self.post_pre(w) 63 | h_w = self.post_convs(h_w, x_mask) 64 | h_w = self.post_proj(h_w) * x_mask 65 | e_q = torch.randn(w.size(0), 2, w.size(2)).to(device=x.device, dtype=x.dtype) * x_mask 66 | z_q = e_q 67 | for flow in self.post_flows: 68 | z_q, logdet_q = flow(z_q, x_mask, g=(x + h_w)) 69 | logdet_tot_q += logdet_q 70 | z_u, z1 = torch.split(z_q, [1, 1], 1) 71 | u = torch.sigmoid(z_u) * x_mask 72 | z0 = (w - u) * x_mask 73 | logdet_tot_q += torch.sum((F.logsigmoid(z_u) + F.logsigmoid(-z_u)) * x_mask, [1,2]) 74 | logq = torch.sum(-0.5 * (math.log(2*math.pi) + (e_q**2)) * x_mask, [1,2]) - logdet_tot_q 75 | 76 | logdet_tot = 0 77 | z0, logdet = self.log_flow(z0, x_mask) 78 | logdet_tot += logdet 79 | z = torch.cat([z0, z1], 1) 80 | for flow in flows: 81 | z, logdet = flow(z, x_mask, g=x, reverse=reverse) 82 | logdet_tot = logdet_tot + logdet 83 | nll = torch.sum(0.5 * (math.log(2*math.pi) + (z**2)) * x_mask, [1,2]) - logdet_tot 84 | return nll + logq # [b] 85 | else: 86 | flows = list(reversed(self.flows)) 87 | flows = flows[:-2] + [flows[-1]] # remove a useless vflow 88 | z = torch.randn(x.size(0), 2, x.size(2)).to(device=x.device, dtype=x.dtype) * noise_scale 89 | for flow in flows: 90 | z = flow(z, x_mask, g=x, reverse=reverse) 91 | z0, z1 = torch.split(z, [1, 1], 1) 92 | logw = z0 93 | return logw 94 | 95 | 96 | class DurationPredictor(nn.Module): 97 | def __init__(self, in_channels, filter_channels, kernel_size, p_dropout, gin_channels=0): 98 | super().__init__() 99 | 100 | self.in_channels = in_channels 101 | self.filter_channels = filter_channels 102 | self.kernel_size = kernel_size 103 | self.p_dropout = p_dropout 104 | self.gin_channels = gin_channels 105 | 106 | self.drop = nn.Dropout(p_dropout) 107 | self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size, padding=kernel_size//2) 108 | self.norm_1 = 
modules.LayerNorm(filter_channels) 109 | self.conv_2 = nn.Conv1d(filter_channels, filter_channels, kernel_size, padding=kernel_size//2) 110 | self.norm_2 = modules.LayerNorm(filter_channels) 111 | self.proj = nn.Conv1d(filter_channels, 1, 1) 112 | 113 | if gin_channels != 0: 114 | self.cond = nn.Conv1d(gin_channels, in_channels, 1) 115 | 116 | def forward(self, x, x_mask, g=None): 117 | x = torch.detach(x) 118 | if g is not None: 119 | g = torch.detach(g) 120 | x = x + self.cond(g) 121 | x = self.conv_1(x * x_mask) 122 | x = torch.relu(x) 123 | x = self.norm_1(x) 124 | x = self.drop(x) 125 | x = self.conv_2(x * x_mask) 126 | x = torch.relu(x) 127 | x = self.norm_2(x) 128 | x = self.drop(x) 129 | x = self.proj(x * x_mask) 130 | return x * x_mask 131 | 132 | 133 | class TextEncoder(nn.Module): 134 | def __init__(self, 135 | n_vocab, 136 | out_channels, 137 | hidden_channels, 138 | filter_channels, 139 | n_heads, 140 | n_layers, 141 | kernel_size, 142 | p_dropout, 143 | emotion_embedding): 144 | super().__init__() 145 | self.n_vocab = n_vocab 146 | self.out_channels = out_channels 147 | self.hidden_channels = hidden_channels 148 | self.filter_channels = filter_channels 149 | self.n_heads = n_heads 150 | self.n_layers = n_layers 151 | self.kernel_size = kernel_size 152 | self.p_dropout = p_dropout 153 | self.emotion_embedding = emotion_embedding 154 | 155 | if self.n_vocab!=0: 156 | self.emb = nn.Embedding(n_vocab, hidden_channels) 157 | if emotion_embedding: 158 | self.emo_proj = nn.Linear(1024, hidden_channels) 159 | nn.init.normal_(self.emb.weight, 0.0, hidden_channels**-0.5) 160 | 161 | self.encoder = attentions.Encoder( 162 | hidden_channels, 163 | filter_channels, 164 | n_heads, 165 | n_layers, 166 | kernel_size, 167 | p_dropout) 168 | self.proj= nn.Conv1d(hidden_channels, out_channels * 2, 1) 169 | 170 | def forward(self, x, x_lengths, emotion_embedding=None): 171 | if self.n_vocab!=0: 172 | x = self.emb(x) * math.sqrt(self.hidden_channels) # [b, t, h] 173 | if emotion_embedding is not None: 174 | x = x + self.emo_proj(emotion_embedding.unsqueeze(1)) 175 | x = torch.transpose(x, 1, -1) # [b, h, t] 176 | x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype) 177 | 178 | x = self.encoder(x * x_mask, x_mask) 179 | stats = self.proj(x) * x_mask 180 | 181 | m, logs = torch.split(stats, self.out_channels, dim=1) 182 | return x, m, logs, x_mask 183 | 184 | 185 | class ResidualCouplingBlock(nn.Module): 186 | def __init__(self, 187 | channels, 188 | hidden_channels, 189 | kernel_size, 190 | dilation_rate, 191 | n_layers, 192 | n_flows=4, 193 | gin_channels=0): 194 | super().__init__() 195 | self.channels = channels 196 | self.hidden_channels = hidden_channels 197 | self.kernel_size = kernel_size 198 | self.dilation_rate = dilation_rate 199 | self.n_layers = n_layers 200 | self.n_flows = n_flows 201 | self.gin_channels = gin_channels 202 | 203 | self.flows = nn.ModuleList() 204 | for i in range(n_flows): 205 | self.flows.append(modules.ResidualCouplingLayer(channels, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels, mean_only=True)) 206 | self.flows.append(modules.Flip()) 207 | 208 | def forward(self, x, x_mask, g=None, reverse=False): 209 | if not reverse: 210 | for flow in self.flows: 211 | x, _ = flow(x, x_mask, g=g, reverse=reverse) 212 | else: 213 | for flow in reversed(self.flows): 214 | x = flow(x, x_mask, g=g, reverse=reverse) 215 | return x 216 | 217 | 218 | class PosteriorEncoder(nn.Module): 219 | def __init__(self, 220 | 
in_channels, 221 | out_channels, 222 | hidden_channels, 223 | kernel_size, 224 | dilation_rate, 225 | n_layers, 226 | gin_channels=0): 227 | super().__init__() 228 | self.in_channels = in_channels 229 | self.out_channels = out_channels 230 | self.hidden_channels = hidden_channels 231 | self.kernel_size = kernel_size 232 | self.dilation_rate = dilation_rate 233 | self.n_layers = n_layers 234 | self.gin_channels = gin_channels 235 | 236 | self.pre = nn.Conv1d(in_channels, hidden_channels, 1) 237 | self.enc = modules.WN(hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels) 238 | self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) 239 | 240 | def forward(self, x, x_lengths, g=None): 241 | x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype) 242 | x = self.pre(x) * x_mask 243 | x = self.enc(x, x_mask, g=g) 244 | stats = self.proj(x) * x_mask 245 | m, logs = torch.split(stats, self.out_channels, dim=1) 246 | z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask 247 | return z, m, logs, x_mask 248 | 249 | 250 | class Generator(torch.nn.Module): 251 | def __init__(self, initial_channel, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, gin_channels=0): 252 | super(Generator, self).__init__() 253 | self.num_kernels = len(resblock_kernel_sizes) 254 | self.num_upsamples = len(upsample_rates) 255 | self.conv_pre = Conv1d(initial_channel, upsample_initial_channel, 7, 1, padding=3) 256 | resblock = modules.ResBlock1 if resblock == '1' else modules.ResBlock2 257 | 258 | self.ups = nn.ModuleList() 259 | for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)): 260 | self.ups.append(weight_norm( 261 | ConvTranspose1d(upsample_initial_channel//(2**i), upsample_initial_channel//(2**(i+1)), 262 | k, u, padding=(k-u)//2))) 263 | 264 | self.resblocks = nn.ModuleList() 265 | for i in range(len(self.ups)): 266 | ch = upsample_initial_channel//(2**(i+1)) 267 | for j, (k, d) in enumerate(zip(resblock_kernel_sizes, resblock_dilation_sizes)): 268 | self.resblocks.append(resblock(ch, k, d)) 269 | 270 | self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False) 271 | self.ups.apply(init_weights) 272 | 273 | if gin_channels != 0: 274 | self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1) 275 | 276 | def forward(self, x, g=None): 277 | x = self.conv_pre(x) 278 | if g is not None: 279 | x = x + self.cond(g) 280 | 281 | for i in range(self.num_upsamples): 282 | x = F.leaky_relu(x, modules.LRELU_SLOPE) 283 | x = self.ups[i](x) 284 | xs = None 285 | for j in range(self.num_kernels): 286 | if xs is None: 287 | xs = self.resblocks[i*self.num_kernels+j](x) 288 | else: 289 | xs += self.resblocks[i*self.num_kernels+j](x) 290 | x = xs / self.num_kernels 291 | x = F.leaky_relu(x) 292 | x = self.conv_post(x) 293 | x = torch.tanh(x) 294 | 295 | return x 296 | 297 | 298 | class SynthesizerTrn(nn.Module): 299 | """ 300 | Synthesizer for Training 301 | """ 302 | 303 | def __init__(self, 304 | n_vocab, 305 | spec_channels, 306 | segment_size, 307 | inter_channels, 308 | hidden_channels, 309 | filter_channels, 310 | n_heads, 311 | n_layers, 312 | kernel_size, 313 | p_dropout, 314 | resblock, 315 | resblock_kernel_sizes, 316 | resblock_dilation_sizes, 317 | upsample_rates, 318 | upsample_initial_channel, 319 | upsample_kernel_sizes, 320 | n_speakers=0, 321 | gin_channels=0, 322 | use_sdp=True, 323 | emotion_embedding=False, 324 | **kwargs): 325 | 326 | super().__init__() 
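# The sub-modules assigned below follow the usual VITS layout: enc_p is the text
# (prior) encoder, dec is the HiFi-GAN-style waveform generator, enc_q is the
# posterior encoder over the input spectrograms, flow maps between posterior and
# prior latents, dp is the (stochastic) duration predictor, and emb_g embeds the
# speaker id when n_speakers > 1.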
327 | self.n_vocab = n_vocab 328 | self.spec_channels = spec_channels 329 | self.inter_channels = inter_channels 330 | self.hidden_channels = hidden_channels 331 | self.filter_channels = filter_channels 332 | self.n_heads = n_heads 333 | self.n_layers = n_layers 334 | self.kernel_size = kernel_size 335 | self.p_dropout = p_dropout 336 | self.resblock = resblock 337 | self.resblock_kernel_sizes = resblock_kernel_sizes 338 | self.resblock_dilation_sizes = resblock_dilation_sizes 339 | self.upsample_rates = upsample_rates 340 | self.upsample_initial_channel = upsample_initial_channel 341 | self.upsample_kernel_sizes = upsample_kernel_sizes 342 | self.segment_size = segment_size 343 | self.n_speakers = n_speakers 344 | self.gin_channels = gin_channels 345 | 346 | self.use_sdp = use_sdp 347 | 348 | self.enc_p = TextEncoder(n_vocab, 349 | inter_channels, 350 | hidden_channels, 351 | filter_channels, 352 | n_heads, 353 | n_layers, 354 | kernel_size, 355 | p_dropout, 356 | emotion_embedding) 357 | self.dec = Generator(inter_channels, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, gin_channels=gin_channels) 358 | self.enc_q = PosteriorEncoder(spec_channels, inter_channels, hidden_channels, 5, 1, 16, gin_channels=gin_channels) 359 | self.flow = ResidualCouplingBlock(inter_channels, hidden_channels, 5, 1, 4, gin_channels=gin_channels) 360 | 361 | if use_sdp: 362 | self.dp = StochasticDurationPredictor(hidden_channels, 192, 3, 0.5, 4, gin_channels=gin_channels) 363 | else: 364 | self.dp = DurationPredictor(hidden_channels, 256, 3, 0.5, gin_channels=gin_channels) 365 | 366 | if n_speakers > 1: 367 | self.emb_g = nn.Embedding(n_speakers, gin_channels) 368 | 369 | def infer(self, x, x_lengths, sid=None, noise_scale=1, length_scale=1, noise_scale_w=1., max_len=None, emotion_embedding=None): 370 | x, m_p, logs_p, x_mask = self.enc_p(x, x_lengths, emotion_embedding) 371 | if self.n_speakers > 0: 372 | g = self.emb_g(sid).unsqueeze(-1) # [b, h, 1] 373 | else: 374 | g = None 375 | 376 | if self.use_sdp: 377 | logw = self.dp(x, x_mask, g=g, reverse=True, noise_scale=noise_scale_w) 378 | else: 379 | logw = self.dp(x, x_mask, g=g) 380 | w = torch.exp(logw) * x_mask * length_scale 381 | w_ceil = torch.ceil(w) 382 | y_lengths = torch.clamp_min(torch.sum(w_ceil, [1, 2]), 1).long() 383 | y_mask = torch.unsqueeze(commons.sequence_mask(y_lengths, None), 1).to(x_mask.dtype) 384 | attn_mask = torch.unsqueeze(x_mask, 2) * torch.unsqueeze(y_mask, -1) 385 | attn = commons.generate_path(w_ceil, attn_mask) 386 | 387 | m_p = torch.matmul(attn.squeeze(1), m_p.transpose(1, 2)).transpose(1, 2) # [b, t', t], [b, t, d] -> [b, d, t'] 388 | logs_p = torch.matmul(attn.squeeze(1), logs_p.transpose(1, 2)).transpose(1, 2) # [b, t', t], [b, t, d] -> [b, d, t'] 389 | 390 | z_p = m_p + torch.randn_like(m_p) * torch.exp(logs_p) * noise_scale 391 | z = self.flow(z_p, y_mask, g=g, reverse=True) 392 | o = self.dec((z * y_mask)[:,:,:max_len], g=g) 393 | return o, attn, y_mask, (z, z_p, m_p, logs_p) 394 | 395 | def voice_conversion(self, y, y_lengths, sid_src, sid_tgt): 396 | assert self.n_speakers > 0, "n_speakers have to be larger than 0." 
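# Voice conversion reuses the trained components: the source utterance is encoded with
# the source speaker embedding, mapped through the flow into the prior latent space,
# then the flow is inverted with the target speaker embedding and the result is decoded
# by dec. A minimal usage sketch (hypothetical names and shapes, assuming a multi-speaker
# checkpoint is already loaded into net_g):
#   spec = ...                                    # [1, spec_channels, T] source spectrogram
#   spec_len = torch.LongTensor([spec.size(2)])
#   sid_src, sid_tgt = torch.LongTensor([0]), torch.LongTensor([1])
#   audio, _, _ = net_g.voice_conversion(spec, spec_len, sid_src, sid_tgt)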
397 | g_src = self.emb_g(sid_src).unsqueeze(-1) 398 | g_tgt = self.emb_g(sid_tgt).unsqueeze(-1) 399 | z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g_src) 400 | z_p = self.flow(z, y_mask, g=g_src) 401 | z_hat = self.flow(z_p, y_mask, g=g_tgt, reverse=True) 402 | o_hat = self.dec(z_hat * y_mask, g=g_tgt) 403 | return o_hat, y_mask, (z, z_p, z_hat) 404 | 405 | -------------------------------------------------------------------------------- /launch.py: -------------------------------------------------------------------------------- 1 | #Speech recognition library, very important 2 | import speech_recognition as sr 3 | #Alternative speech recognition with whisper 4 | from whisper_mic.whisper_mic import WhisperMic 5 | #pyttsx3 is our tts engine 6 | import pyttsx3 7 | #Load other TTS 8 | import mymoegoe.tts as mytts 9 | import xtts.tts as xtts 10 | import xtts.stream as xttsstream 11 | #Pygame is used to play the wav audio files that pyttsx3 generates 12 | # import os 13 | # os.environ['PYGAME_HIDE_SUPPORT_PROMPT'] = "hide" 14 | # from pygame import mixer, _sdl2 as devices 15 | #These are tools used for interfacing with youchat and other data. json, regex, and cloudflare scraper 16 | import json 17 | import cloudscraper 18 | import re 19 | import random 20 | import threading 21 | #Ooba Reqs 22 | import requests 23 | #LM Studio and Other OpenAI compatibles Reqs 24 | import openai 25 | #Character Card Reqs 26 | from PIL import Image 27 | from PIL.ExifTags import TAGS 28 | import base64 29 | #terminal arg libs 30 | import sys 31 | import getopt 32 | import time 33 | #Needed for piping to vbcable and playing TTS 34 | import sounddevice as sd 35 | import soundfile as sf 36 | import numpy as np 37 | import os 38 | #-------------------------------------- 39 | argsv = sys.argv[1:] 40 | options, args = getopt.getopt(argsv, 'hv', 41 | ["vbcable", "voiceinput", "pc=", "pcaschat", "caphistory=", "voice=", "voices", "wakeword=", 42 | "alwayslisten", "ooba", "openai", "vosk", "googlestt", "chara=", "moegoe", "xtts", "bootmsg=", "wakeprompt", "nowakeping", 'voicespeed=', 'mgmodel=', 'template=']) 43 | 44 | #Config variables 45 | vbcable = False 46 | textinput = True 47 | wakeword = "computer" 48 | promptcontext = "" 49 | promptcontextaschat = False 50 | caphistory = 4 51 | voice = None 52 | alwayslisten = False 53 | waketext = "" 54 | ooba = False 55 | openaiapi = False 56 | vosk = False 57 | googlestt = False 58 | ttsengine = "pyttsx3" 59 | wakeprompt = False 60 | wakeping = True 61 | charafilename = "" 62 | speed = 1 63 | bootmsg = "Booting Up" 64 | mgmodel = "g" 65 | xttsmodel = "base v2.0.2" 66 | verbose = False 67 | chatml = False 68 | phi3 = False 69 | streamchunks = 20 70 | 71 | # Put your URI end point:port here for your openai inference server (such as LM Studio) 72 | openai.api_base='http://localhost:1234/v1' 73 | # Put in an empty API Key for LM stuido 74 | openai.api_key='' 75 | openaimodel = "local model" 76 | 77 | script_path = os.path.abspath(__file__) 78 | directory = os.path.dirname(script_path) 79 | 80 | for opt, arg in options: 81 | if opt == "-h": 82 | print("--vbcable: Send audio to vb-cable virtual microphone.") 83 | print("--voiceinput: Interact with the AI using your voice instead of text.") 84 | print("--pc='string': set a prompt context. To prepend to prompts. Optionally can be set as fake history.") 85 | print("--pcaschat: Sets prompt context to be a fake chat history.") 86 | print("--caphistory=number: Caps chat history length. Default is 4. 
Set to -1 to disable.") 87 | print("--voice=number/string: Set the TTS voice.") 88 | print("--voices: List voices on your computer.") 89 | print("--wakeword='string': Sets the wake word when using voice input.") 90 | print("--alwayslisten: Always listen for input, not using a wake word.") 91 | print("--ooba: Use local oobabooga webui as LLM instead of YouChat.") 92 | print("--openai: Use openai api as LLM instead of YouChat.") 93 | print("--vosk: Use local vosk as STT.") 94 | print("--googlestt: Use google's online service as STT.") 95 | print("--chara='filename': Load tavernai character card or oobabooga character json file.") 96 | print("--moegoe: Use moegoe as TTS instead of default TTS.") 97 | print("--xtts: Use xtts as TTS instead of default TTS.") 98 | print("--bootmsg='string': What to say when booting up.") 99 | print("--wakeprompt: Like alwayslisten, but doesn't prompt unless wakeword is included.") 100 | print("--nowakeping: Doesn't ping when starting to listen for wake word") 101 | print("--voicespeed=number: Speed of moegoe tts. Higher=slower. default is 1.") 102 | print("--mgmodel='filename': set the filename of the moegoe model. default is g") 103 | print("--template='string': specify a prompt template (chatml or phi3). default typical chat format.") 104 | print("-v: Print debug info.") 105 | sys.exit(2) 106 | elif opt == '--vbcable': 107 | vbcable = True 108 | elif opt == '--voiceinput': 109 | textinput = False 110 | elif opt == '--pc': 111 | promptcontext = "["+arg+"]" 112 | elif opt == "--pcaschat": 113 | promptcontextaschat = True 114 | elif opt == "--caphistory": 115 | caphistory = int(arg) 116 | elif opt == '--voice': 117 | voice = arg 118 | elif opt == '--voices': 119 | engine = pyttsx3.init() 120 | voices = engine.getProperty('voices') 121 | for v in voices: 122 | print (v) 123 | sys.exit(2) 124 | elif opt == '--wakeword': 125 | wakeword = arg 126 | elif opt == '--alwayslisten': 127 | alwayslisten = True 128 | elif opt == "--ooba": 129 | ooba = True 130 | elif opt == "--openai": 131 | openaiapi = True 132 | elif opt == "--vosk": 133 | vosk = True 134 | elif opt == "--googlestt": 135 | googlestt = True 136 | elif opt == "--chara": 137 | charafilename = arg 138 | elif opt == "--moegoe": 139 | ttsengine = "moegoe" 140 | elif opt == "--xtts": 141 | ttsengine = "xtts" 142 | elif opt == "--bootmsg": 143 | bootmsg = arg 144 | elif opt == "--wakeprompt": 145 | wakeprompt = True 146 | elif opt == "--nowakeping": 147 | wakeping = False 148 | elif opt == "--voicespeed": 149 | speed = float(arg) 150 | elif opt == "--mgmodel": 151 | mgmodel = arg 152 | xttsmodel = arg 153 | elif opt == "--template": 154 | if arg == "chatml": 155 | chatml = True 156 | if arg == "phi3": 157 | phi3 = True 158 | elif opt == "-v": 159 | verbose = True 160 | 161 | 162 | # Find VB-Cable device IDs 163 | vbcable_output = None 164 | vbcable_input = None 165 | if vbcable: 166 | for device in sd.query_devices(): 167 | if 'CABLE Output' in device['name'] and device['max_input_channels'] == 2 and vbcable_output == None: 168 | if verbose: 169 | print("Found Cable Output.", device['name'], device['index']) 170 | vbcable_output = device["index"] 171 | if 'CABLE Input' in device['name'] and device['max_output_channels'] == 2 and vbcable_input == None: 172 | if verbose: 173 | print("Found Cable Input.", device['name'], device['index']) 174 | vbcable_input = device["index"] 175 | 176 | #New function to load and play the tts outputs. 
Check for vbcable vs speaker 177 | def playaudio(): 178 | audiofile = os.path.join(directory,"temp.wav") 179 | if os.path.isfile(audiofile): 180 | data, fs = sf.read(audiofile, dtype='float32') 181 | data_stereo = np.tile(data, (2, 1)).T.copy(order='C') 182 | delay = int(fs * 0.2) # 200ms delay 183 | zeros = np.zeros((delay, 2)) 184 | sd.play(zeros, fs, blocking=True, device=sd.default.device) 185 | sd.play(data, fs, device=sd.default.device) 186 | if vbcable: 187 | with sd.OutputStream(device=vbcable_input, 188 | samplerate=fs, 189 | channels=2) as stream: 190 | stream.write(data_stereo) 191 | sd.wait() 192 | else: 193 | print("Generated TTS audio temp.wav not found!") 194 | def playstream(audio_stream, stream1, stream2): 195 | for chunk in audio_stream: 196 | #Get chunk data into np format (pcm audio samples) 197 | audio_pcm = np.frombuffer(chunk, dtype=np.int16) 198 | # Convert PCM audio samples to 32-bit floating-point values 199 | #data_float = audio_pcm.astype(np.float32) / 32768.0 200 | stream1.write(audio_pcm) 201 | if stream2 != None: 202 | stream2.write(audio_pcm) 203 | 204 | def playaudiostream(audio_stream): 205 | # xttsstream.stream_ffplay(audio_stream) 206 | with sd.OutputStream(device=sd.default.device, 207 | samplerate=24000, 208 | channels=1, blocksize=44544, latency=1, dtype='int16') as stream1: 209 | if vbcable: 210 | with sd.OutputStream(device=vbcable_input, 211 | samplerate=24000, 212 | channels=1, blocksize=44544, latency=1, dtype='int16') as stream2: 213 | playstream(audio_stream, stream1, stream2) 214 | else: 215 | playstream(audio_stream, stream1, None) 216 | 217 | def playchime(pingpong="ping"): 218 | data, fs = sf.read(os.path.join(directory,pingpong+".wav"), dtype='float32') 219 | sd.play(data, fs, device=sd.default.device) 220 | 221 | #Here we initialize python's audio output 222 | #It checks to see if we enabled vb-cable to pipe the audio to vmagicmirror 223 | #Be sure to turn on listening to the vb-cable mic if you wish to hear the ai speak, otherwise it's silent 224 | # if vbcable: 225 | # mixer.init(devicename = "CABLE Input (VB-Audio Virtual Cable)") 226 | # else: 227 | # mixer.init() 228 | #a debug print to check our audio devices 229 | #print("Outputs:", devices.audio.get_audio_device_names()[0]) 230 | 231 | 232 | #Here we initialize the tts with a default boot message 233 | #voices[2].id is to get the voice we want. 234 | if ttsengine == "pyttsx3": 235 | if voice == None: 236 | voice = 0 237 | engine = pyttsx3.init() 238 | voices = engine.getProperty('voices') 239 | if len(voices) == 0: 240 | print("No TTS voices detected. 
Please install a TTS voice on your OS.") 241 | sys.exit(2) 242 | engine.setProperty('voice', voices[int(voice)].id) 243 | engine.save_to_file(bootmsg, os.path.join(directory,"temp.wav")) 244 | engine.runAndWait(); 245 | playaudio() 246 | #-------------------- 247 | 248 | if ttsengine == "moegoe": 249 | if voice == None: 250 | voice = 0 251 | mytts.loadtts(mgmodel) 252 | mytts.tts(bootmsg, os.path.join(directory,"temp.wav"), voice=int(voice), speed=speed) 253 | playaudio() 254 | 255 | if ttsengine == "xtts": 256 | if voice == None: 257 | voice = "en_sample" 258 | xttsstream.loadModel(xttsmodel, voice=voice) 259 | speaker = xttsstream.get_speaker(xttsstream.reference) 260 | audio = playaudiostream(xttsstream.tts(bootmsg, speaker, "en", streamchunks)) 261 | #xtts.loadModel(xttsmodel, voice=voice) 262 | #xtts.generateSpeech(bootmsg, os.path.join(directory,"temp.wav")) 263 | #playaudio() 264 | 265 | #Load sfx 266 | # ping = mixer.Sound("ping.wav") 267 | # pong = mixer.Sound("pong.wav") 268 | 269 | #Initialize the cloudflare scraper that we use for youchat requests 270 | if not ooba and not openaiapi: 271 | scraper = cloudscraper.create_scraper(ecdhCurve='secp384r1') 272 | 273 | #New traceid function. This fetches the needed traceid for youchat to function 274 | def getinitialtraceid(): 275 | headers = { 276 | 'Accept': 'text/event-stream', 277 | 'Connection': 'keep-alive', 278 | 'Sec-Fetch-Mode': 'cors', 279 | 'Sec-Fetch-Site': 'same-origin', 280 | 'Sec-GPC': '1', 281 | 'Referer': 'https://you.com/search?q=hello&fromSearchBar=true&tbm=youchat' 282 | } 283 | payload = {'q': "hello"} 284 | try: 285 | response = scraper.get("https://you.com/search", params=payload, headers=headers) 286 | except cloudscraper.exceptions.CloudflareChallengeError as e: 287 | return "Sorry, there was a cloudflare error. Please try again." 288 | 289 | data = response.text 290 | match = re.search(r'"initialTraceId":"(.+?)"', data) 291 | first_capture_group = match.group(1) 292 | #print("traceid:", first_capture_group) 293 | return first_capture_group 294 | if not ooba and not openaiapi: 295 | traceid = getinitialtraceid() 296 | randuuid = str(random.random())[2:] 297 | #print("Random UUID:", randuuid) 298 | #--------------------------- 299 | 300 | #sendq is the youchat api request. Just enter prompt for the parameter and we get the response back 301 | #chat variable is kept updated with chat history 302 | chat=[] 303 | def sendq(question): 304 | global chat, traceid, randuuid 305 | headers = { 306 | 'Accept': 'text/event-stream', 307 | 'Connection': 'keep-alive', 308 | 'Sec-Fetch-Mode': 'cors', 309 | 'Sec-Fetch-Site': 'same-origin', 310 | 'Sec-GPC': '1', 311 | 'Referer': 'https://you.com/search?q=hello&fromSearchBar=true&tbm=youchat', 312 | 'Cookie': ('uuid_guest='+randuuid+";").encode() 313 | } 314 | if promptcontextaschat: 315 | chat.append({"question":'"'+promptcontext+'"', "answer":''}) 316 | payload = { 317 | 'q': question, 318 | 'chat': str(chat), 319 | 'queryTraceId': traceid, 320 | 'domain': 'youchat', 321 | 'page': '1', 322 | 'count': '10', 323 | 'safeSearch': 'Off', 324 | 'onShoppingPage': 'false', 325 | 'freshness':'Month', 326 | 'mkt':'', 327 | 'responseFilter': 'WebPages,Translations,TimeZone,Computation,RelatedSearches' 328 | } 329 | try: 330 | response = scraper.get("https://you.com/api/streamingSearch", params=payload, headers=headers, stream=True) 331 | except cloudscraper.exceptions.CloudflareChallengeError as e: 332 | return "Sorry, there was a cloudflare error. Please try again." 
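# The streamingSearch endpoint replies with server-sent events; a useful line looks
# roughly like this (illustrative, not captured from a real response):
#   data: {"youChatToken": "Hello"}
# The loop below concatenates the youChatToken fragments and stops at the
# end-of-stream sentinel value.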
333 | 334 | output = "" 335 | for line in response.iter_lines(): 336 | if line: 337 | decoded_line = line.decode("utf-8") 338 | if decoded_line != "{}": 339 | key, value = decoded_line.split(":", 1) 340 | key = key.strip() 341 | value = value.strip() 342 | if key == "data": 343 | if value == "I'm Mr. Meeseeks. Look at me.": 344 | break 345 | data = json.loads(value) 346 | if "youChatToken" in data: 347 | output += data["youChatToken"] 348 | else: 349 | return "Sorry, the AI server is too busy. An error has occurred. Please try again." 350 | if caphistory >= 0: 351 | if len(chat) > caphistory: 352 | chat = chat[:0-caphistory] 353 | chat.append({"question":'"'+question+'"', "answer":'"'+output+'"'}) 354 | return output 355 | 356 | #Initialize Character Persona Details for Ooba LLM 357 | yourname = "You" 358 | charactername = "Friend" 359 | characterpersona = "" 360 | worldscenario = "Reply as the user's friend. Make up details about yourself. Do not mention or say you are an AI. Respond only with one or two sentences and keeps responses brief." 361 | exampledialogue = "" 362 | exampledialogue = re.sub(r'{{char}}', charactername, exampledialogue) 363 | exampledialogue = re.sub(r'{{user}}', yourname, exampledialogue) 364 | greeting = "" 365 | 366 | def loadcharacard(filename): 367 | global charactername, characterpersona, worldscenario, exampledialogue, greeting 368 | if verbose: 369 | print("PNG/WEBP character file loading...") 370 | # load the image 371 | img = Image.open(filename) 372 | exif_data = img._getexif() 373 | img.load() 374 | chara = "" 375 | if filename[-4:] == ".png": 376 | chara = img.info["chara"] 377 | decoded_bytes = base64.b64decode(chara) 378 | decoded_string = decoded_bytes.decode('utf-8') 379 | chara = decoded_string 380 | if filename[-4:] == "webp": 381 | for tag_id, value in exif_data.items(): 382 | tag = TAGS.get(tag_id, tag_id) 383 | if tag == "UserComment": 384 | chara = value[8:] 385 | 386 | charajson = json.loads(chara) 387 | print("Loading "+charajson['name']) 388 | charactername = charajson['name'] 389 | characterpersona = charajson['description']+"\nPersonality: "+charajson['personality'] 390 | characterpersona = re.sub(r'{{char}}', charactername, characterpersona) 391 | characterpersona = re.sub(r'{{user}}', yourname, characterpersona) 392 | worldscenario = charajson['scenario'] 393 | worldscenario = re.sub(r'{{char}}', charactername, worldscenario) 394 | worldscenario = re.sub(r'{{user}}', yourname, worldscenario) 395 | greeting = charajson['first_mes'] 396 | greeting = re.sub(r'{{char}}', charactername, greeting) 397 | greeting = re.sub(r'{{user}}', yourname, greeting) 398 | exampledialogue = charajson['mes_example'] 399 | exampledialogue = re.sub(r'{{char}}', charactername, exampledialogue) 400 | exampledialogue = re.sub(r'{{user}}', yourname, exampledialogue) 401 | 402 | def loadoobacharjson(filename): 403 | global charactername, characterpersona, worldscenario, exampledialogue, greeting 404 | if verbose: 405 | print("JSON character file loading...") 406 | with open(filename, encoding="utf-8") as f: 407 | data = json.load(f) 408 | print("Loading "+data['char_name']) 409 | charactername = data['char_name'] 410 | characterpersona = data['char_persona'] 411 | characterpersona = re.sub(r'{{char}}', charactername, characterpersona) 412 | characterpersona = re.sub(r'{{user}}', yourname, characterpersona) 413 | worldscenario = data['world_scenario'] 414 | worldscenario = re.sub(r'{{char}}', charactername, worldscenario) 415 | worldscenario = re.sub(r'{{user}}', 
yourname, worldscenario) 416 | greeting = data['char_greeting'] 417 | greeting = re.sub(r'{{char}}', charactername, greeting) 418 | greeting = re.sub(r'{{user}}', yourname, greeting) 419 | exampledialogue = data['example_dialogue'] 420 | exampledialogue = re.sub(r'{{char}}', charactername, exampledialogue) 421 | exampledialogue = re.sub(r'{{user}}', yourname, exampledialogue) 422 | 423 | def loadchara(filename): 424 | if verbose: 425 | print("Chara file extension:", filename[-4:]) 426 | if filename[-4:] == "json": 427 | loadoobacharjson(filename) 428 | elif filename[-4:] == ".png" or filename[-4:] == "webp": 429 | loadcharacard(filename) 430 | else: 431 | print("Could not detect character format...") 432 | 433 | if charafilename != "": 434 | loadchara(charafilename) 435 | 436 | if greeting != "": 437 | print(charactername+": "+greeting) 438 | chat.append({"question":'', "answer":greeting}) 439 | out = re.sub("\n", "", greeting) 440 | out = re.sub("[\"\']", "", out) 441 | out = re.sub("[^\x00-\x7F]+", "", out) 442 | out = re.sub("[<>]", "", out) 443 | out = re.sub("-", " - ", out) 444 | if ttsengine == "pyttsx3": 445 | engine.save_to_file(out, os.path.join(directory,"temp.wav")) 446 | engine.runAndWait(); 447 | if ttsengine == "moegoe": 448 | mytts.tts(out, os.path.join(directory,"temp.wav"), voice=int(voice), speed=speed) 449 | if ttsengine == "xtts": 450 | audio = playaudiostream(xttsstream.tts(out, speaker, "en", streamchunks)) 451 | #xtts.generateSpeech(out, os.path.join(directory,"temp.wav")) 452 | if ttsengine != "xtts": 453 | playaudio() 454 | 455 | #Creates the prompt for non-youchat apis 456 | def createprompt(question): 457 | global chat, yourname, charactername 458 | prompt = "" 459 | 460 | #ChatML 461 | promptuserstart = "<|im_start|>" 462 | promptend = "<|im_end|>" 463 | promptassistantstart = "<|im_start|>" 464 | 465 | #Phi 3 466 | if phi3: 467 | promptuserstart = "<|user|>" 468 | promptend = "<|end|>" 469 | promptassistantstart = "<|assistant|>" 470 | 471 | if chatml: 472 | prompt = promptuserstart+"system\n" 473 | yourname = "user" 474 | charactername = "assistant" 475 | if phi3: 476 | prompt = "<|system|>"#promptuserstart+"\n" 477 | #Handle legacy prompt context 478 | if promptcontextaschat: 479 | chat.append({"question":'"'+promptcontext+'"', "answer":''}) 480 | else: 481 | prompt = promptcontext+"\n" 482 | 483 | #characterpersona = "You are chatting with Bot. Bot is an AI assistant that helps answer your questions." 
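# For reference, with chat history omitted the assembled prompt looks roughly like one
# of these (illustrative; {braces} mark substituted content, and speaker names default
# to "You"/"Friend" outside the chatml/phi3 templates):
#   chatml:  <|im_start|>system\n{character context}<|im_end|>\n<|im_start|>user\n{question}<|im_end|>\n<|im_start|>assistant\n
#   phi3:    <|system|>{character context}<|user|>\n{question}<|end|>\n<|assistant|>\n
#   default: {prompt context}\n{character context}\nYou: {question}\nFriend: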
484 | 485 | #Add Character Context 486 | if characterpersona != "": 487 | prompt += charactername+"'s Persona: "+characterpersona+"\n" 488 | if worldscenario != "": 489 | prompt += "Scenario: "+worldscenario+"\n" 490 | if exampledialogue != "" and not chatml and not phi3: 491 | prompt += ""+"\n"+exampledialogue+"\n" 492 | if (characterpersona != "" or worldscenario != "" or exampledialogue != "") and not chatml and not phi3: 493 | prompt += "" 494 | 495 | if chatml: 496 | prompt += promptend+"\n" 497 | 498 | #Add Chat History to Prompt 499 | for ch in chat: 500 | if ch["question"] != "": 501 | if chatml: 502 | prompt += promptuserstart+yourname+"\n"+ch["question"]+promptend+"\n" 503 | elif phi3: 504 | prompt += promptuserstart+"\n"+ch["question"]+promptend+"\n" 505 | else: 506 | prompt += '\n'+yourname+': '+ch["question"] 507 | if ch["answer"] != "": 508 | if chatml: 509 | prompt += promptassistantstart+charactername+"\n"+ch["answer"]+promptend+"\n" 510 | elif phi3: 511 | prompt += promptassistantstart+"\n"+ch["answer"]+promptend+"\n" 512 | else: 513 | prompt += '\n'+charactername+': '+ch["answer"] 514 | 515 | #Add newest chat to prompt 516 | if chatml: 517 | prompt += promptuserstart+yourname+"\n"+question+promptend+"\n"+promptassistantstart+charactername+"\n" 518 | elif phi3: 519 | prompt += promptuserstart+"\n"+question+promptend+"\n"+promptassistantstart+"\n" 520 | else: 521 | prompt += '\n'+yourname+': ' 522 | prompt += question 523 | prompt += '\n'+charactername+': ' 524 | return prompt 525 | 526 | #openaisendq is the openai api request. 527 | def openaisendq(question): 528 | global chat 529 | 530 | prompt = createprompt(question) 531 | 532 | #Set stopping strings. This tells LLM to stop writing. 533 | stopping_strings = ["\n"+yourname, "\n"+charactername, "", "", "", "<|im_end|>", "<|im_start|>", "<|user|>", "<|end|>", "<|assistant|>"] 534 | 535 | #formatted_prompt = f"{yourname}: {question}\n{charactername}:" 536 | messages = [{"role": "user", "content": prompt}] 537 | response = openai.ChatCompletion.create( 538 | model=openaimodel, 539 | messages=messages, 540 | stop=stopping_strings, 541 | #temperature=0.0 542 | # temperature=0.7, 543 | # rep_pen = 1.18, 544 | # top_p = 1 545 | ) 546 | output = response.choices[0].message["content"] 547 | #Append message to chat history 548 | if caphistory >= 0: 549 | if len(chat) > caphistory: 550 | chat = chat[:0-caphistory] 551 | chat.append({"question":'"'+question+'"', "answer":'"'+output+'"'}) 552 | return output 553 | 554 | #oobasendq is the oobabooga api request. Just enter prompt for the parameter and we get the response back 555 | def oobasendq(question): 556 | global chat 557 | 558 | prompt = createprompt(question) 559 | 560 | #Set stopping strings. This tells LLM to stop writing. 
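# Stopping on the "\n"+yourname and "\n"+charactername prefixes keeps the model from
# continuing the transcript and writing the user's next turn itself.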
561 | stopping_strings = ["\n"+yourname, "\n"+charactername] 562 | 563 | #print(prompt) 564 | #Send the request 565 | data = {"prompt": prompt, "stopping_strings": stopping_strings, "temperature": 0.7, "rep_pen": 1.18, "top_p":1} 566 | response = requests.post('http://127.0.0.1:5000/api/v1/generate', data=json.dumps(data)) 567 | if response.status_code == 200: 568 | 569 | #Get the output from the response 570 | if verbose: 571 | print(response.content) 572 | jsondata = json.loads(response.content.decode('utf-8')) 573 | output = str(jsondata['results'][0]['text']).strip() 574 | 575 | #Append message to chat history 576 | if caphistory >= 0: 577 | if len(chat) > caphistory: 578 | chat = chat[:0-caphistory] 579 | chat.append({"question":'"'+question+'"', "answer":'"'+output+'"'}) 580 | 581 | return output 582 | else: 583 | return "Error" 584 | 585 | 586 | def getaudiovosknew(r, m, wake=False): 587 | global ping 588 | print("New Vosk Recognizer") 589 | text = "" 590 | firstit = True 591 | while text == "": 592 | with m as source: 593 | r.adjust_for_ambient_noise(source) 594 | if firstit: 595 | if (wake and wakeping) or (not wake): 596 | playchime("ping") 597 | firstit = False 598 | print("Listening for Vosk!") 599 | audio = r.listen(source) 600 | try: 601 | text = r.recognize_vosk(audio) 602 | text = text.lower() 603 | except: 604 | print("Failed to recognize") 605 | text = "" 606 | output = json.loads(text)["text"] 607 | print("Detected speech:", output) 608 | return output 609 | 610 | def getaudiogooglenew(r, m, wake=False): 611 | global ping 612 | print("New Google Recognizer") 613 | text = "" 614 | firstit = True 615 | while text == "": 616 | with m as source: 617 | r.adjust_for_ambient_noise(source) 618 | if firstit: 619 | if (wake and wakeping) or (not wake): 620 | playchime("ping") 621 | firstit = False 622 | print("Listening for Google!") 623 | audio = r.listen(source) 624 | try: 625 | text = r.recognize_google(audio) 626 | text = text.lower() 627 | except: 628 | print("Failed to recognize") 629 | text = "" 630 | print("Detected speech:", text) 631 | return text 632 | 633 | #Main function. 
Two different options: whether we wish to use text input or voice 634 | if textinput: 635 | while True: 636 | #get input string 637 | input_string = input(yourname+": ") 638 | combinedprompt = promptcontext+input_string 639 | if promptcontextaschat: 640 | combinedprompt = input_string 641 | start_time = time.time() 642 | 643 | #Send prompt to LLM 644 | if ooba: 645 | out = oobasendq(input_string) 646 | elif openaiapi: 647 | out = openaisendq(input_string) 648 | else: 649 | #Youchat 650 | out = sendq(combinedprompt) 651 | out = re.sub(r'\[.+?\]\(.+?\)', '', out) 652 | 653 | #Print response 654 | print(charactername+":", out) 655 | end_time = time.time() 656 | elapsed_time = end_time - start_time 657 | if verbose: 658 | print("Text-Gen time: ", elapsed_time, "seconds") 659 | 660 | #Clear out string to ensure TTS doesn't crash 661 | out = re.sub("\n", "", out) 662 | out = re.sub("[\"\']", "", out) 663 | out = re.sub("[^\x00-\x7F]+", "", out) 664 | out = re.sub("[<>]", "", out) 665 | out = re.sub("-", " ", out) 666 | if out and out != "": 667 | # Text to speech to a file 668 | #tts.tts_to_file(text=out, file_path="temp.wav") 669 | if ttsengine == "pyttsx3": 670 | engine.save_to_file(out, os.path.join(directory,"temp.wav")) 671 | engine.runAndWait(); 672 | elif ttsengine == "moegoe": 673 | mytts.tts(out, os.path.join(directory,"temp.wav"), voice=int(voice), speed=speed) 674 | elif ttsengine == "xtts": 675 | #xtts.generateSpeech(out, os.path.join(directory,"temp.wav")) 676 | end_time = time.time() 677 | elapsed_time = end_time - start_time 678 | if verbose: 679 | print("Elapsed time: ", elapsed_time, "seconds") 680 | audio = playaudiostream(xttsstream.tts(out, speaker, "en", streamchunks)) 681 | 682 | #Calculate and print time if verbose 683 | 684 | if ttsengine != "xtts": 685 | end_time = time.time() 686 | elapsed_time = end_time - start_time 687 | if verbose: 688 | print("Elapsed time: ", elapsed_time, "seconds") 689 | threadaudio = threading.Thread(target=playaudio) 690 | threadaudio.start() 691 | threadaudio.join() 692 | #playaudio() 693 | #mixer.music.load("temp.wav") 694 | #mixer.music.play() 695 | else: 696 | stop_listening = None 697 | #start microphone recognition 698 | if vosk or googlestt: 699 | r = sr.Recognizer() 700 | m = sr.Microphone() 701 | else: 702 | mic = WhisperMic(model="base.en") 703 | # def callback(recognizer, audio): 704 | # global waketext 705 | # #recognizer.adjust_for_ambient_noise(source) 706 | # try: 707 | # if vosk: 708 | # waketext = recognizer.recognize_vosk(audio) 709 | # waketext = json.loads(waketext)["text"] 710 | # else: 711 | # waketext = recognizer.recognize_google(audio) 712 | # waketext = waketext.lower() 713 | # if verbose: 714 | # print("Wake Word Check: {}".format(waketext)) 715 | # except: 716 | # waketext = "" 717 | # print("Failed to recognize!") 718 | 719 | if vosk or googlestt: 720 | with m as source: 721 | r.adjust_for_ambient_noise(source) 722 | #stop_listening = r.listen_in_background(m, callback) 723 | while True: 724 | 725 | #Listen for Wake Word 726 | # waketext = "" 727 | # if stop_listening: 728 | # stop_listening(wait_for_stop=False) 729 | 730 | waketext = "" 731 | 732 | def listenwake(): 733 | global waketext, r, m 734 | #print(r, m) 735 | if vosk: 736 | waketext = getaudiovosknew(r,m, True) 737 | elif googlestt: 738 | waketext = getaudiogooglenew(r,m, True) 739 | else: 740 | waketext = mic.listen() 741 | waketext = waketext.lower() 742 | print("Detected speech:", waketext) 743 | if wakeping: 744 | playchime("ping") 745 | 746 | if 
alwayslisten == True: 747 | while waketext == "": 748 | listenwake() 749 | continue 750 | textg = waketext 751 | if wakeping: 752 | playchime("pong") 753 | else: 754 | while wakeword not in waketext: 755 | listenwake() 756 | continue 757 | if wakeprompt: 758 | textg = waketext 759 | else: 760 | waketext = "" 761 | 762 | #stop_listening(wait_for_stop=False) 763 | #---------------------------------------- 764 | 765 | 766 | #Listen for Prompt 767 | if alwayslisten == False and wakeprompt == False: 768 | if vosk: 769 | textg = getaudiovosknew(r,m) 770 | elif googlestt: 771 | textg = getaudiogooglenew(r,m) 772 | else: 773 | textg = mic.listen() 774 | if wakeping: 775 | playchime("pong") 776 | #---------------------- 777 | 778 | #Send prompt to youchat and print output 779 | print(yourname+":", textg) 780 | start_time = time.time() 781 | combinedprompt = promptcontext+textg 782 | if promptcontextaschat: 783 | combinedprompt = textg 784 | if ooba: 785 | out = oobasendq(textg) 786 | elif openaiapi: 787 | out = openaisendq(textg) 788 | else: 789 | #Youchat 790 | out = sendq(combinedprompt) 791 | out = re.sub(r'\[.+?\]\(.+?\)', '', out) 792 | print(charactername+":", out) 793 | #---------------------- 794 | 795 | #TTS Response 796 | #mixer.music.unload() 797 | #Clear out string to ensure TTS doesn't crash 798 | out = re.sub("\n", "", out) 799 | out = re.sub("\"", "", out) 800 | out = re.sub("[^\x00-\x7F]+", "", out) 801 | out = re.sub("[<>]", "", out) 802 | out = re.sub("-", " ", out) 803 | if out and out != "": 804 | if ttsengine == "pyttsx3": 805 | engine.save_to_file(out, os.path.join(directory,"temp.wav")) 806 | engine.runAndWait(); 807 | elif ttsengine == "moegoe": 808 | mytts.tts(out, os.path.join(directory,"temp.wav"), voice=int(voice), speed=speed) 809 | elif ttsengine == "xtts": 810 | #xtts.generateSpeech(out, os.path.join(directory,"temp.wav")) 811 | end_time = time.time() 812 | elapsed_time = end_time - start_time 813 | if verbose: 814 | print("Elapsed time: ", elapsed_time, "seconds") 815 | audio = playaudiostream(xttsstream.tts(out, speaker, "en", streamchunks)) 816 | 817 | #Calculate and print time if verbose 818 | if ttsengine != "xtts": 819 | end_time = time.time() 820 | elapsed_time = end_time - start_time 821 | if verbose: 822 | print("Elapsed time: ", elapsed_time, "seconds") 823 | threadaudio = threading.Thread(target=playaudio) 824 | threadaudio.start() 825 | threadaudio.join() 826 | #playaudio() 827 | #------------------- --------------------------------------------------------------------------------
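Example invocations (illustrative; the flags come from the option parsing in launch.py above, and values such as the character file and model names are placeholders):

    python launch.py --bootmsg="Hello there" --pc="You are a helpful assistant."
    python launch.py --voiceinput --wakeword="computer" --moegoe --mgmodel=g --voice=0
    python launch.py --ooba --chara=character.png --xtts --voice=en_sample --template=chatml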