├── ping.wav
├── pong.wav
├── mymoegoe
│   ├── text
│   │   ├── cleaners.py
│   │   ├── __init__.py
│   │   └── english.py
│   ├── commons.py
│   ├── tts.py
│   ├── transforms.py
│   ├── attentions.py
│   ├── modules.py
│   └── models.py
├── requirements.txt
├── README.md
├── xtts
│   └── stream.py
└── launch.py
/ping.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/April93/ai-voice-assistant/HEAD/ping.wav
--------------------------------------------------------------------------------
/pong.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/April93/ai-voice-assistant/HEAD/pong.wav
--------------------------------------------------------------------------------
/mymoegoe/text/cleaners.py:
--------------------------------------------------------------------------------
1 | import re
2 | def cjke_cleaners2(text):
3 | from mymoegoe.text.english import english_to_ipa2
4 | text = re.sub(r'^(.*?)$',
5 | lambda x: english_to_ipa2(x.group(1))+' ', text)
6 | text = re.sub(r'\s+$', '', text)
7 | text = re.sub(r'([^\.,!\?\-…~])$', r'\1.', text)
8 | return text
9 | 
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | cloudscraper==1.2.68
2 | eng_to_ipa==0.0.2
3 | inflect==5.6.0
4 | numpy==1.23.5
5 | openai
6 | Pillow==9.5.0
7 | PySoundFile==0.9.0.post1
8 | pyttsx3==2.90
9 | Requests==2.31.0
10 | scipy==1.10.1
11 | sounddevice==0.4.5
12 | soundfile==0.12.1
13 | SpeechRecognition==3.9.0
14 | torch==2.0.1
15 | Unidecode==1.3.6
16 | WhisperMic
17 | 
--------------------------------------------------------------------------------
/mymoegoe/text/__init__.py:
--------------------------------------------------------------------------------
1 | """ from https://github.com/keithito/tacotron """
2 | from mymoegoe.text import cleaners
3 | 
4 | 
5 | def text_to_sequence(text, symbols, cleaner_names):
6 | '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
7 | Args: 8 | text: string to convert to a sequence 9 | cleaner_names: names of the cleaner functions to run the text through 10 | Returns: 11 | List of integers corresponding to the symbols in the text 12 | ''' 13 | _symbol_to_id = {s: i for i, s in enumerate(symbols)} 14 | 15 | sequence = [] 16 | 17 | clean_text = _clean_text(text, cleaner_names) 18 | for symbol in clean_text: 19 | if symbol not in _symbol_to_id.keys(): 20 | continue 21 | symbol_id = _symbol_to_id[symbol] 22 | sequence += [symbol_id] 23 | return sequence 24 | 25 | 26 | def _clean_text(text, cleaner_names): 27 | for name in cleaner_names: 28 | cleaner = getattr(cleaners, name) 29 | if not cleaner: 30 | raise Exception('Unknown cleaner: %s' % name) 31 | text = cleaner(text) 32 | return text 33 | -------------------------------------------------------------------------------- /mymoegoe/commons.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.nn import functional as F 3 | import torch.jit 4 | 5 | 6 | def script_method(fn, _rcb=None): 7 | return fn 8 | 9 | 10 | def script(obj, optimize=True, _frames_up=0, _rcb=None): 11 | return obj 12 | 13 | 14 | torch.jit.script_method = script_method 15 | torch.jit.script = script 16 | 17 | 18 | def init_weights(m, mean=0.0, std=0.01): 19 | classname = m.__class__.__name__ 20 | if classname.find("Conv") != -1: 21 | m.weight.data.normal_(mean, std) 22 | 23 | 24 | def get_padding(kernel_size, dilation=1): 25 | return int((kernel_size*dilation - dilation)/2) 26 | 27 | 28 | def intersperse(lst, item): 29 | result = [item] * (len(lst) * 2 + 1) 30 | result[1::2] = lst 31 | return result 32 | 33 | 34 | def slice_segments(x, ids_str, segment_size=4): 35 | ret = torch.zeros_like(x[:, :, :segment_size]) 36 | for i in range(x.size(0)): 37 | idx_str = ids_str[i] 38 | idx_end = idx_str + segment_size 39 | ret[i] = x[i, :, idx_str:idx_end] 40 | return ret 41 | 42 | 43 | def rand_slice_segments(x, x_lengths=None, segment_size=4): 44 | b, d, t = x.size() 45 | if x_lengths is None: 46 | x_lengths = t 47 | ids_str_max = x_lengths - segment_size + 1 48 | ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long) 49 | ret = slice_segments(x, ids_str, segment_size) 50 | return ret, ids_str 51 | 52 | 53 | def subsequent_mask(length): 54 | mask = torch.tril(torch.ones(length, length)).unsqueeze(0).unsqueeze(0) 55 | return mask 56 | 57 | 58 | @torch.jit.script 59 | def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels): 60 | n_channels_int = n_channels[0] 61 | in_act = input_a + input_b 62 | t_act = torch.tanh(in_act[:, :n_channels_int, :]) 63 | s_act = torch.sigmoid(in_act[:, n_channels_int:, :]) 64 | acts = t_act * s_act 65 | return acts 66 | 67 | 68 | def convert_pad_shape(pad_shape): 69 | l = pad_shape[::-1] 70 | pad_shape = [item for sublist in l for item in sublist] 71 | return pad_shape 72 | 73 | 74 | def sequence_mask(length, max_length=None): 75 | if max_length is None: 76 | max_length = length.max() 77 | x = torch.arange(max_length, dtype=length.dtype, device=length.device) 78 | return x.unsqueeze(0) < length.unsqueeze(1) 79 | 80 | 81 | def generate_path(duration, mask): 82 | """ 83 | duration: [b, 1, t_x] 84 | mask: [b, 1, t_y, t_x] 85 | """ 86 | device = duration.device 87 | 88 | b, _, t_y, t_x = mask.shape 89 | cum_duration = torch.cumsum(duration, -1) 90 | 91 | cum_duration_flat = cum_duration.view(b * t_x) 92 | path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype) 93 | path = path.view(b, 
t_x, t_y)
94 | path = path - F.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1]
95 | path = path.unsqueeze(1).transpose(2,3) * mask
96 | return path
97 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # ai-voice-assistant
2 | 
3 | I hooked up you.com's YouChat to speech-to-text and text-to-speech services to create an AI voice assistant. I'm using this alongside VB-Cable and VMagicMirror to create an anime character chatbot. The code supports both text and voice input, a configurable voice, wake word, and prompt context, and of course VB-Cable output. See `-h` for launch parameters.
4 | 
5 | 
6 | ## Optional setup
7 | 
8 | ### Local language model with Oobabooga Web UI
9 | 
10 | You can use this with [Oobabooga's Web UI](https://github.com/oobabooga/text-generation-webui/) for a local LLM (instead of YouChat) by launching with `--ooba`. When using this, you can also load TavernAI png/webp or Pygmalion/Oobabooga json character cards with `--chara filename.png`. The script assumes you are running Oobabooga with `--extensions api` on the default port 5000.
11 | 
12 | ### Local language model with LM Studio
13 | 
14 | You can use this with [LM Studio](https://lmstudio.ai/) for a local LLM (instead of YouChat) by launching with `--openai`. When using this, you can also load TavernAI png/webp or Pygmalion/Oobabooga json character cards with `--chara filename.png`. The script assumes you're running LM Studio and have started the local inference server on the default port 1234.
15 | 
16 | ### Local speech recognition with Vosk
17 | 
18 | Use `--vosk` to run with Vosk speech recognition instead of the default Whisper. Download a model from [here](https://alphacephei.com/vosk/models) and place it in the script's folder, renamed to `model`. Remember to launch with `--voiceinput` when using speech-to-text input.
19 | 
20 | ### Moegoe TTS
21 | 
22 | A modified version of MoeGoe is included in the repo. To use it, place a compatible `.pth` and `.json` model in `mymoegoe/models/`, making sure both files share the same name, then launch the script with `--moegoe`. The script assumes a model named `g.pth` and `g.json`. You can select a different model with `--mgmodel modelname` (no file extension), for example `--mgmodel g`. If the voice is too fast or too slow, adjust it with `--voicespeed 1.0`, changing the number as needed (higher is slower). A popular model with thousands of anime voices can be found [here.](https://huggingface.co/spaces/skytnt/moe-tts/tree/main/saved_model/15)
23 | 
24 | ### XTTS
25 | 
26 | Use XTTS-v2 as the TTS engine by launching the script with `--xtts`. The script requires the XTTS model to be placed in `xtts/models`; the expected default model folder name is `base v2.0.2`. You can use a reference voice by placing a wav file in `xtts/voices` and launching with both `--xtts` and `--voice filename` (without the wav extension). By default, `xtts/voices/en_sample.wav` is used as the reference. The XTTS-v2 model can be found [here.](https://huggingface.co/coqui/XTTS-v2)
27 | 
28 | ### Anime character visualization with VMagicMirror and VB-Cable
29 | 
30 | On Windows and Mac it's possible to install [VB-Cable](https://vb-audio.com/Cable/) and [VMagicMirror](https://github.com/malaybaku/VMagicMirror/) to send the TTS output to an on-screen anime character. Launch the script with `--vbcable` to route TTS to VB-Cable, then run VMagicMirror and set its microphone to the virtual VB-Cable microphone.
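Putting it together, here are a few illustrative invocations, assuming `launch.py` is the entry point and using placeholder file and model names — adjust them to your own setup:

- `python launch.py --voiceinput --wakeword='computer'` — voice input with the default TTS and YouChat.
- `python launch.py --openai --chara chara.png --moegoe --mgmodel g --vbcable` — LM Studio as the LLM, a character card, MoeGoe TTS, and audio routed to VB-Cable.
- `python launch.py --xtts --voice mysample --voiceinput --alwayslisten` — XTTS with a custom reference voice (`xtts/voices/mysample.wav`) and always-on listening.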
31 | 
32 | ## Launch Arguments
33 | 
34 | | Launch Argument | Description |
35 | | ------------- |:-------------:|
36 | |`--vbcable`|Send audio to the VB-Cable virtual microphone.|
37 | |`--voiceinput`|Interact with the AI using your voice instead of text.|
38 | |`--pc='string'`|Set a prompt context to prepend to prompts. Optionally it can be treated as fake chat history (see `--pcaschat`).|
39 | |`--pcaschat`|Sets the prompt context to be a fake chat history.|
40 | |`--caphistory=number`|Caps chat history length. Default is 4. Set to -1 to disable.|
41 | |`--voice=number/string`|Set the TTS voice.|
42 | |`--voices`|List voices on your computer.|
43 | |`--wakeword='string'`|Sets the wake word when using voice input.|
44 | |`--alwayslisten`|Always listen for input instead of using a wake word.|
45 | |`--ooba`|Use the local Oobabooga web UI as the LLM instead of YouChat.|
46 | |`--openai`|Use the OpenAI API (e.g. LM Studio's local server) as the LLM instead of YouChat.|
47 | |`--vosk`|Use local Vosk as the STT engine.|
48 | |`--googlestt`|Use Google's online service as the STT engine.|
49 | |`--chara='filename'`|Load a TavernAI character card or Oobabooga character JSON file.|
50 | |`--moegoe`|Use MoeGoe as the TTS engine instead of the default TTS.|
51 | |`--xtts`|Use XTTS as the TTS engine instead of the default TTS.|
52 | |`--bootmsg='string'`|What to say when booting up.|
53 | |`--wakeprompt`|Like `--alwayslisten`, but only prompts when the wake word is included.|
54 | |`--nowakeping`|Doesn't ping when starting to listen for the wake word.|
55 | |`--voicespeed=number`|Speed of MoeGoe TTS. Higher is slower. Default is 1.|
56 | |`--mgmodel='filename'`|Sets the filename of the MoeGoe model. Default is `g`.|
57 | |`--template='string'`|Specify a prompt template (chatml or phi3). Default is the typical chat format.|
58 | |`-v`|Print debug info.|
--------------------------------------------------------------------------------
/mymoegoe/text/english.py:
--------------------------------------------------------------------------------
1 | """ from https://github.com/keithito/tacotron """
2 | 
3 | '''
4 | Cleaners are transformations that run over the input text at both training and eval time.
5 | 
6 | Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners"
7 | hyperparameter. Some cleaners are English-specific. You'll typically want to use:
8 | 1. "english_cleaners" for English text
9 | 2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using
10 | the Unidecode library (https://pypi.python.org/pypi/Unidecode)
11 | 3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update
12 | the symbols in symbols.py to match your data).
13 | '''
14 | 
15 | 
16 | # Regular expression matching whitespace:
17 | 
18 | 
19 | import re
20 | import inflect
21 | from unidecode import unidecode
22 | import eng_to_ipa as ipa
23 | _inflect = inflect.engine()
24 | _comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])')
25 | _decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)')
26 | _pounds_re = re.compile(r'£([0-9\,]*[0-9]+)')
27 | _dollars_re = re.compile(r'\$([0-9\.\,]*[0-9]+)')
28 | _ordinal_re = re.compile(r'[0-9]+(st|nd|rd|th)')
29 | _number_re = re.compile(r'[0-9]+')
30 | 
31 | # List of (regular expression, replacement) pairs for abbreviations:
32 | _abbreviations = [(re.compile('\\b%s\\.' 
% x[0], re.IGNORECASE), x[1]) for x in [ 33 | ('mrs', 'misess'), 34 | ('mr', 'mister'), 35 | ('dr', 'doctor'), 36 | ('st', 'saint'), 37 | ('co', 'company'), 38 | ('jr', 'junior'), 39 | ('maj', 'major'), 40 | ('gen', 'general'), 41 | ('drs', 'doctors'), 42 | ('rev', 'reverend'), 43 | ('lt', 'lieutenant'), 44 | ('hon', 'honorable'), 45 | ('sgt', 'sergeant'), 46 | ('capt', 'captain'), 47 | ('esq', 'esquire'), 48 | ('ltd', 'limited'), 49 | ('col', 'colonel'), 50 | ('ft', 'fort'), 51 | ]] 52 | 53 | 54 | # List of (ipa, lazy ipa) pairs: 55 | _lazy_ipa = [(re.compile('%s' % x[0]), x[1]) for x in [ 56 | ('r', 'ɹ'), 57 | ('æ', 'e'), 58 | ('ɑ', 'a'), 59 | ('ɔ', 'o'), 60 | ('ð', 'z'), 61 | ('θ', 's'), 62 | ('ɛ', 'e'), 63 | ('ɪ', 'i'), 64 | ('ʊ', 'u'), 65 | ('ʒ', 'ʥ'), 66 | ('ʤ', 'ʥ'), 67 | ('ˈ', '↓'), 68 | ]] 69 | 70 | # List of (ipa, lazy ipa2) pairs: 71 | _lazy_ipa2 = [(re.compile('%s' % x[0]), x[1]) for x in [ 72 | ('r', 'ɹ'), 73 | ('ð', 'z'), 74 | ('θ', 's'), 75 | ('ʒ', 'ʑ'), 76 | ('ʤ', 'dʑ'), 77 | ('ˈ', '↓'), 78 | ]] 79 | 80 | # List of (ipa, ipa2) pairs 81 | _ipa_to_ipa2 = [(re.compile('%s' % x[0]), x[1]) for x in [ 82 | ('r', 'ɹ'), 83 | ('ʤ', 'dʒ'), 84 | ('ʧ', 'tʃ') 85 | ]] 86 | 87 | 88 | def expand_abbreviations(text): 89 | for regex, replacement in _abbreviations: 90 | text = re.sub(regex, replacement, text) 91 | return text 92 | 93 | 94 | def collapse_whitespace(text): 95 | return re.sub(r'\s+', ' ', text) 96 | 97 | 98 | def _remove_commas(m): 99 | return m.group(1).replace(',', '') 100 | 101 | 102 | def _expand_decimal_point(m): 103 | return m.group(1).replace('.', ' point ') 104 | 105 | 106 | def _expand_dollars(m): 107 | match = m.group(1) 108 | parts = match.split('.') 109 | if len(parts) > 2: 110 | return match + ' dollars' # Unexpected format 111 | dollars = int(parts[0]) if parts[0] else 0 112 | cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0 113 | if dollars and cents: 114 | dollar_unit = 'dollar' if dollars == 1 else 'dollars' 115 | cent_unit = 'cent' if cents == 1 else 'cents' 116 | return '%s %s, %s %s' % (dollars, dollar_unit, cents, cent_unit) 117 | elif dollars: 118 | dollar_unit = 'dollar' if dollars == 1 else 'dollars' 119 | return '%s %s' % (dollars, dollar_unit) 120 | elif cents: 121 | cent_unit = 'cent' if cents == 1 else 'cents' 122 | return '%s %s' % (cents, cent_unit) 123 | else: 124 | return 'zero dollars' 125 | 126 | 127 | def _expand_ordinal(m): 128 | return _inflect.number_to_words(m.group(0)) 129 | 130 | 131 | def _expand_number(m): 132 | num = int(m.group(0)) 133 | if num > 1000 and num < 3000: 134 | if num == 2000: 135 | return 'two thousand' 136 | elif num > 2000 and num < 2010: 137 | return 'two thousand ' + _inflect.number_to_words(num % 100) 138 | elif num % 100 == 0: 139 | return _inflect.number_to_words(num // 100) + ' hundred' 140 | else: 141 | return _inflect.number_to_words(num, andword='', zero='oh', group=2).replace(', ', ' ') 142 | else: 143 | return _inflect.number_to_words(num, andword='') 144 | 145 | 146 | def normalize_numbers(text): 147 | text = re.sub(_comma_number_re, _remove_commas, text) 148 | text = re.sub(_pounds_re, r'\1 pounds', text) 149 | text = re.sub(_dollars_re, _expand_dollars, text) 150 | text = re.sub(_decimal_number_re, _expand_decimal_point, text) 151 | text = re.sub(_ordinal_re, _expand_ordinal, text) 152 | text = re.sub(_number_re, _expand_number, text) 153 | return text 154 | 155 | 156 | def mark_dark_l(text): 157 | return re.sub(r'l([^aeiouæɑɔəɛɪʊ ]*(?: |$))', lambda x: 'ɫ'+x.group(1), text) 158 | 159 | 160 | 
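# Illustrative walkthrough of the pipeline below: english_to_ipa lowercases and
# transliterates the text, then runs expand_abbreviations and normalize_numbers before
# handing it to eng_to_ipa, so e.g. "Dr. Smith owes $3.50" becomes
# "doctor smith owes three dollars, fifty cents" prior to phonemization. The *_ipa2 and
# *_lazy_ipa variants then post-process the IPA string with the mapping tables above.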
def english_to_ipa(text): 161 | text = unidecode(text).lower() 162 | text = expand_abbreviations(text) 163 | text = normalize_numbers(text) 164 | phonemes = ipa.convert(text) 165 | phonemes = collapse_whitespace(phonemes) 166 | return phonemes 167 | 168 | 169 | def english_to_lazy_ipa(text): 170 | text = english_to_ipa(text) 171 | for regex, replacement in _lazy_ipa: 172 | text = re.sub(regex, replacement, text) 173 | return text 174 | 175 | 176 | def english_to_ipa2(text): 177 | text = english_to_ipa(text) 178 | text = mark_dark_l(text) 179 | for regex, replacement in _ipa_to_ipa2: 180 | text = re.sub(regex, replacement, text) 181 | return text.replace('...', '…') 182 | 183 | 184 | def english_to_lazy_ipa2(text): 185 | text = english_to_ipa(text) 186 | for regex, replacement in _lazy_ipa2: 187 | text = re.sub(regex, replacement, text) 188 | return text 189 | -------------------------------------------------------------------------------- /mymoegoe/tts.py: -------------------------------------------------------------------------------- 1 | #Needed to write wav file, no added install cost 2 | import wave 3 | import struct 4 | 5 | 6 | #Internal Reqs, no added install cost 7 | from mymoegoe.text import text_to_sequence, _clean_text 8 | from mymoegoe.models import SynthesizerTrn 9 | import mymoegoe.commons as commons 10 | 11 | #re is common 12 | import re 13 | 14 | #Torch is common 15 | from torch import no_grad, LongTensor 16 | 17 | #utils imports. Json and torch both common 18 | from json import loads 19 | from torch import load, FloatTensor 20 | import torch 21 | from scipy.io.wavfile import write 22 | import os 23 | 24 | #Utils 25 | #--------------- 26 | class HParams(): 27 | def __init__(self, **kwargs): 28 | for k, v in kwargs.items(): 29 | if type(v) == dict: 30 | v = HParams(**v) 31 | self[k] = v 32 | 33 | def keys(self): 34 | return self.__dict__.keys() 35 | 36 | def items(self): 37 | return self.__dict__.items() 38 | 39 | def values(self): 40 | return self.__dict__.values() 41 | 42 | def __len__(self): 43 | return len(self.__dict__) 44 | 45 | def __getitem__(self, key): 46 | return getattr(self, key) 47 | 48 | def __setitem__(self, key, value): 49 | return setattr(self, key, value) 50 | 51 | def __contains__(self, key): 52 | return key in self.__dict__ 53 | 54 | def __repr__(self): 55 | return self.__dict__.__repr__() 56 | 57 | 58 | def load_checkpoint(checkpoint_path, model): 59 | checkpoint_dict = load(checkpoint_path, map_location="cpu") 60 | iteration = checkpoint_dict['iteration'] 61 | saved_state_dict = checkpoint_dict['model'] 62 | if hasattr(model, 'module'): 63 | state_dict = model.module.state_dict() 64 | else: 65 | state_dict = model.state_dict() 66 | new_state_dict= {} 67 | for k, v in state_dict.items(): 68 | try: 69 | new_state_dict[k] = saved_state_dict[k] 70 | except: 71 | print("Not in dictionary: ", k) 72 | #logging.info("%s is not in the checkpoint" % k) 73 | new_state_dict[k] = v 74 | if hasattr(model, 'module'): 75 | model.module.load_state_dict(new_state_dict) 76 | else: 77 | model.load_state_dict(new_state_dict) 78 | #logging.info("Loaded checkpoint '{}' (iteration {})" .format( 79 | # checkpoint_path, iteration)) 80 | return 81 | 82 | 83 | def get_hparams_from_file(config_path): 84 | with open(config_path, "r", encoding="utf-8") as f: 85 | data = f.read() 86 | config = loads(data) 87 | 88 | hparams = HParams(**config) 89 | return hparams 90 | 91 | 92 | 93 | #Script 94 | #--------- 95 | 96 | #Model Loading 97 | mchoice = "g" 98 | model = 
"mymoegoe/models/"+mchoice+".pth" 99 | config = "mymoegoe/models/"+mchoice+".json" 100 | 101 | #Set speaker/voice, usually 0. Along with wav destination 102 | speaker_id = 0 103 | #out_path = "temp.wav" 104 | 105 | #Default Audio Settings 106 | defaultlength = 1 107 | defaultnoisescale = 0.667 108 | defaultnoisedeviation = 0.8 109 | 110 | #Audio Settings 111 | length_scale = 1 #length scale 112 | noise_scale = 0.5 #noise scale - phoneme length? 113 | noise_scale_w = 0.1 #deviation of noise - emotionality? 114 | 115 | #Input Text 116 | #text = "" 117 | 118 | n_symbols = 0 119 | hps_ms = None 120 | net_g_ms = None 121 | 122 | def loadtts(mgmodel): 123 | global model, config, mchoice 124 | mchoice = mgmodel 125 | script_path = os.path.abspath(__file__) 126 | directory = os.path.dirname(script_path) 127 | model = os.path.join(directory,"models/"+mgmodel+".pth") 128 | config = os.path.join(directory,"models/"+mgmodel+".json") 129 | global n_symbols, hps_ms, net_g_ms 130 | #Load params from the config 131 | hps_ms = get_hparams_from_file(config) 132 | 133 | #Seems to get number of speakers 134 | n_speakers = hps_ms.data.n_speakers if 'n_speakers' in hps_ms.data.keys() else 0 135 | #Seems to get number of symbols? 136 | n_symbols = len(hps_ms.symbols) if 'symbols' in hps_ms.keys() else 0 137 | #Get the speakers. 138 | speakers = hps_ms.speakers if 'speakers' in hps_ms.keys() else ['0'] 139 | #Emotion embedding stuff, seems unneeded 140 | emotion_embedding = hps_ms.data.emotion_embedding if 'emotion_embedding' in hps_ms.data.keys() else False 141 | 142 | #Some model loading stuff? 143 | net_g_ms = SynthesizerTrn( 144 | n_symbols, 145 | hps_ms.data.filter_length // 2 + 1, 146 | hps_ms.train.segment_size // hps_ms.data.hop_length, 147 | n_speakers=n_speakers, 148 | emotion_embedding=emotion_embedding, 149 | **hps_ms.model) 150 | net_g_ms.cuda() 151 | _ = net_g_ms.eval() 152 | load_checkpoint(model, net_g_ms) 153 | 154 | def tts(text, out_path="temp.wav", voice=speaker_id, speed=length_scale): 155 | speaker_id = voice 156 | length_scale = speed 157 | 158 | if n_symbols != 0: 159 | 160 | #Clean Text 161 | #text = text.replace("\"","") 162 | text_norm = text_to_sequence(text, hps_ms.symbols, hps_ms.data.text_cleaners) 163 | if hps_ms.data.add_blank: 164 | text_norm = commons.intersperse(text_norm, 0) 165 | text_norm = LongTensor(text_norm) 166 | stn_tst = text_norm 167 | #--------------- 168 | 169 | 170 | with no_grad(): 171 | 172 | #Generate the TTS audio 173 | x_tst = stn_tst.unsqueeze(0).cuda() 174 | x_tst_lengths = LongTensor([stn_tst.size(0)]).cuda() 175 | sid = LongTensor([speaker_id]).cuda() 176 | audio = net_g_ms.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=noise_scale, 177 | noise_scale_w=noise_scale_w, length_scale=length_scale)[0][0, 0].data.cpu().float().numpy() 178 | 179 | write(out_path, hps_ms.data.sampling_rate, audio) 180 | # # Save Wav File 181 | # with wave.open(out_path, 'wb') as wav_file: 182 | # # Set audio file parameters 183 | # wav_file.setnchannels(1) # Mono audio 184 | # wav_file.setsampwidth(2) # 16-bit audio 185 | # wav_file.setframerate(hps_ms.data.sampling_rate) # Sample Rate 186 | 187 | # # Write audio data to file 188 | # for sample in audio: 189 | # # Convert sample to 16-bit signed integer format 190 | # sample = max(-1, min(1, sample)) # Clamp sample to range [-1, 1] 191 | # sample = int(sample * 32767) # Scale sample to range [-32767, 32767] 192 | # packed_sample = struct.pack(' None: 23 | with open(filename, "wb") as f: 24 | f.write(audio) 25 | 26 | 27 | #Model 
Params 28 | basemodelname = "xtts/models/base v2.0.2/" 29 | modelname = basemodelname 30 | reference = "xtts/voices/en_sample.wav" 31 | 32 | config = None 33 | model = None 34 | 35 | #Load Model 36 | def loadModel(modelname="base v2.0.2", voice="en_sample"): 37 | global model, config, reference 38 | model_path = "xtts/models/"+modelname+"/" 39 | reference = "xtts/voices/"+voice+".wav" 40 | configname = model_path+"config.json" 41 | config = XttsConfig() 42 | config.load_json(configname) 43 | model = Xtts.init_from_config(config) 44 | model.load_checkpoint(config, checkpoint_dir=model_path, eval=True) 45 | model.cuda() 46 | print("TTS Model Loaded.") 47 | 48 | 49 | #clone speaker 50 | def predict_speaker(wav_file): 51 | """Compute conditioning inputs from reference audio file.""" 52 | gpt_cond_latent, speaker_embedding = model.get_conditioning_latents( 53 | wav_file 54 | ) 55 | return { 56 | "gpt_cond_latent": gpt_cond_latent.cpu().squeeze().half().tolist(), 57 | "speaker_embedding": speaker_embedding.cpu().squeeze().half().tolist(), 58 | } 59 | 60 | 61 | #Processing tts wav stuff for stream 62 | def postprocess(wav): 63 | """Post process the output waveform""" 64 | if isinstance(wav, list): 65 | wav = torch.cat(wav, dim=0) 66 | wav = wav.clone().detach().cpu().numpy() 67 | wav = wav[None, : int(wav.shape[0])] 68 | wav = np.clip(wav, -1, 1) 69 | wav = (wav * 32767).astype(np.int16) 70 | return wav 71 | def encode_audio_common(frame_input, encode_base64=True, sample_rate=24000, sample_width=2, channels=1): 72 | """Return base64 encoded audio""" 73 | wav_buf = io.BytesIO() 74 | with wave.open(wav_buf, "wb") as vfout: 75 | vfout.setnchannels(channels) 76 | vfout.setsampwidth(sample_width) 77 | vfout.setframerate(sample_rate) 78 | vfout.writeframes(frame_input) 79 | 80 | wav_buf.seek(0) 81 | if encode_base64: 82 | b64_encoded = base64.b64encode(wav_buf.getbuffer()).decode("utf-8") 83 | return b64_encoded 84 | else: 85 | return wav_buf.read() 86 | 87 | #Seems to generate the streamed tts output 88 | def predict_streaming_generator(parsed_input: dict): 89 | speaker_embedding = torch.tensor(parsed_input["speaker_embedding"]).unsqueeze(0).unsqueeze(-1) 90 | gpt_cond_latent = torch.tensor(parsed_input["gpt_cond_latent"]).reshape((-1, 1024)).unsqueeze(0) 91 | text = parsed_input["text"] 92 | language = parsed_input["language"] 93 | 94 | stream_chunk_size = int(parsed_input["stream_chunk_size"]) 95 | add_wav_header = False#parsed_input["add_wav_header"] 96 | 97 | 98 | chunks = model.inference_stream( 99 | text, 100 | language, 101 | gpt_cond_latent, 102 | speaker_embedding, 103 | stream_chunk_size=stream_chunk_size, 104 | enable_text_splitting=True 105 | ) 106 | 107 | for i, chunk in enumerate(chunks): 108 | chunk = postprocess(chunk) 109 | if i == 0 and add_wav_header: 110 | #This breaks playaudiostream but works for ffplay? 111 | yield encode_audio_common(b"", encode_base64=False) 112 | yield chunk.tobytes() 113 | else: 114 | yield chunk.tobytes() 115 | 116 | #Plays the tts output live? 
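# Typical flow (see the __main__ block below): loadModel() loads the XTTS checkpoint,
# get_speaker()/predict_speaker() turn a reference wav into conditioning latents,
# tts() drives predict_streaming_generator() and yields raw 16-bit PCM chunks,
# and stream_ffplay() pipes those chunks into ffplay for playback (or ffmpeg to save).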
117 | def stream_ffplay(audio_stream, output_file=None, save=False): 118 | if not save: 119 | ffplay_cmd = ["ffplay", "-nodisp", "-probesize", "1024", "-autoexit", "-"] 120 | else: 121 | print("Saving to ", output_file) 122 | ffplay_cmd = ["ffmpeg", "-probesize", "1024", "-i", "-", output_file] 123 | 124 | ffplay_proc = subprocess.Popen(ffplay_cmd, stdin=subprocess.PIPE) 125 | for chunk in audio_stream: 126 | if chunk is not None: 127 | ffplay_proc.stdin.write(chunk) 128 | 129 | # close on finish 130 | ffplay_proc.stdin.close() 131 | ffplay_proc.wait() 132 | 133 | 134 | def tts(text, speaker, language, stream_chunk_size, verbose=False) -> Iterator[bytes]: 135 | start = time.perf_counter() 136 | speaker["text"] = text 137 | speaker["language"] = language 138 | speaker["stream_chunk_size"] = stream_chunk_size # you can reduce it to get faster response, but degrade quality 139 | 140 | 141 | if verbose: 142 | end = time.perf_counter() 143 | print(f"Time to make POST: {end-start}s", file=sys.stderr) 144 | 145 | first = True 146 | #for chunk in res.iter_content(chunk_size=512): 147 | for chunk in predict_streaming_generator(speaker): 148 | if first: 149 | if verbose: 150 | end = time.perf_counter() 151 | print(f"Time to first chunk: {end-start}s", file=sys.stderr) 152 | first = False 153 | if chunk: 154 | yield chunk 155 | 156 | #print("⏱️ response.elapsed:", res.elapsed) 157 | 158 | 159 | def get_speaker(ref_audio): 160 | wav_file = open(ref_audio, "rb") 161 | response = predict_speaker(wav_file) 162 | return response 163 | 164 | 165 | if __name__ == "__main__": 166 | parser = argparse.ArgumentParser() 167 | parser.add_argument( 168 | "--text", 169 | default="It took me quite a long time to develop a voice and now that I have it I am not going to be silent.", 170 | help="text input for TTS" 171 | ) 172 | parser.add_argument( 173 | "--language", 174 | default="en", 175 | help="Language to use default is 'en' (English)" 176 | ) 177 | parser.add_argument( 178 | "--output_file", 179 | default=None, 180 | help="Save TTS output to given filename" 181 | ) 182 | parser.add_argument( 183 | "--ref_file", 184 | default=None, 185 | help="Reference audio file to use, when not given will use default" 186 | ) 187 | parser.add_argument( 188 | "--stream_chunk_size", 189 | default="20", 190 | help="Stream chunk size , 20 default, reducing will get faster latency but may degrade quality" 191 | ) 192 | args = parser.parse_args() 193 | 194 | loadModel() 195 | 196 | with open("./default_speaker.json", "r") as file: 197 | speaker = json.load(file) 198 | 199 | if args.ref_file is not None: 200 | print("Computing the latents for a new reference...") 201 | speaker = get_speaker(args.ref_file) 202 | 203 | audio = stream_ffplay( 204 | tts( 205 | args.text, 206 | speaker, 207 | args.language, 208 | args.stream_chunk_size 209 | ), 210 | args.output_file, 211 | save=bool(args.output_file) 212 | ) 213 | audio = stream_ffplay( 214 | tts( 215 | "This should play after the first one.", 216 | speaker, 217 | args.language, 218 | args.stream_chunk_size 219 | ), 220 | args.output_file, 221 | save=bool(args.output_file) 222 | ) 223 | -------------------------------------------------------------------------------- /mymoegoe/transforms.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.nn import functional as F 3 | 4 | import numpy as np 5 | 6 | 7 | DEFAULT_MIN_BIN_WIDTH = 1e-3 8 | DEFAULT_MIN_BIN_HEIGHT = 1e-3 9 | DEFAULT_MIN_DERIVATIVE = 1e-3 10 | 11 | 12 | def 
piecewise_rational_quadratic_transform(inputs, 13 | unnormalized_widths, 14 | unnormalized_heights, 15 | unnormalized_derivatives, 16 | inverse=False, 17 | tails=None, 18 | tail_bound=1., 19 | min_bin_width=DEFAULT_MIN_BIN_WIDTH, 20 | min_bin_height=DEFAULT_MIN_BIN_HEIGHT, 21 | min_derivative=DEFAULT_MIN_DERIVATIVE): 22 | 23 | if tails is None: 24 | spline_fn = rational_quadratic_spline 25 | spline_kwargs = {} 26 | else: 27 | spline_fn = unconstrained_rational_quadratic_spline 28 | spline_kwargs = { 29 | 'tails': tails, 30 | 'tail_bound': tail_bound 31 | } 32 | 33 | outputs, logabsdet = spline_fn( 34 | inputs=inputs, 35 | unnormalized_widths=unnormalized_widths, 36 | unnormalized_heights=unnormalized_heights, 37 | unnormalized_derivatives=unnormalized_derivatives, 38 | inverse=inverse, 39 | min_bin_width=min_bin_width, 40 | min_bin_height=min_bin_height, 41 | min_derivative=min_derivative, 42 | **spline_kwargs 43 | ) 44 | return outputs, logabsdet 45 | 46 | 47 | def searchsorted(bin_locations, inputs, eps=1e-6): 48 | bin_locations[..., -1] += eps 49 | return torch.sum( 50 | inputs[..., None] >= bin_locations, 51 | dim=-1 52 | ) - 1 53 | 54 | 55 | def unconstrained_rational_quadratic_spline(inputs, 56 | unnormalized_widths, 57 | unnormalized_heights, 58 | unnormalized_derivatives, 59 | inverse=False, 60 | tails='linear', 61 | tail_bound=1., 62 | min_bin_width=DEFAULT_MIN_BIN_WIDTH, 63 | min_bin_height=DEFAULT_MIN_BIN_HEIGHT, 64 | min_derivative=DEFAULT_MIN_DERIVATIVE): 65 | inside_interval_mask = (inputs >= -tail_bound) & (inputs <= tail_bound) 66 | outside_interval_mask = ~inside_interval_mask 67 | 68 | outputs = torch.zeros_like(inputs) 69 | logabsdet = torch.zeros_like(inputs) 70 | 71 | if tails == 'linear': 72 | unnormalized_derivatives = F.pad(unnormalized_derivatives, pad=(1, 1)) 73 | constant = np.log(np.exp(1 - min_derivative) - 1) 74 | unnormalized_derivatives[..., 0] = constant 75 | unnormalized_derivatives[..., -1] = constant 76 | 77 | outputs[outside_interval_mask] = inputs[outside_interval_mask] 78 | logabsdet[outside_interval_mask] = 0 79 | else: 80 | raise RuntimeError('{} tails are not implemented.'.format(tails)) 81 | 82 | outputs[inside_interval_mask], logabsdet[inside_interval_mask] = rational_quadratic_spline( 83 | inputs=inputs[inside_interval_mask], 84 | unnormalized_widths=unnormalized_widths[inside_interval_mask, :], 85 | unnormalized_heights=unnormalized_heights[inside_interval_mask, :], 86 | unnormalized_derivatives=unnormalized_derivatives[inside_interval_mask, :], 87 | inverse=inverse, 88 | left=-tail_bound, right=tail_bound, bottom=-tail_bound, top=tail_bound, 89 | min_bin_width=min_bin_width, 90 | min_bin_height=min_bin_height, 91 | min_derivative=min_derivative 92 | ) 93 | 94 | return outputs, logabsdet 95 | 96 | def rational_quadratic_spline(inputs, 97 | unnormalized_widths, 98 | unnormalized_heights, 99 | unnormalized_derivatives, 100 | inverse=False, 101 | left=0., right=1., bottom=0., top=1., 102 | min_bin_width=DEFAULT_MIN_BIN_WIDTH, 103 | min_bin_height=DEFAULT_MIN_BIN_HEIGHT, 104 | min_derivative=DEFAULT_MIN_DERIVATIVE): 105 | if torch.min(inputs) < left or torch.max(inputs) > right: 106 | raise ValueError('Input to a transform is not within its domain') 107 | 108 | num_bins = unnormalized_widths.shape[-1] 109 | 110 | if min_bin_width * num_bins > 1.0: 111 | raise ValueError('Minimal bin width too large for the number of bins') 112 | if min_bin_height * num_bins > 1.0: 113 | raise ValueError('Minimal bin height too large for the number of bins') 114 | 
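# Bin widths and heights below are a softmax over the unnormalized parameters, floored at
# min_bin_width / min_bin_height, then cumulatively summed and rescaled to [left, right]
# and [bottom, top]; knot derivatives are kept positive via softplus plus min_derivative.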
115 | widths = F.softmax(unnormalized_widths, dim=-1) 116 | widths = min_bin_width + (1 - min_bin_width * num_bins) * widths 117 | cumwidths = torch.cumsum(widths, dim=-1) 118 | cumwidths = F.pad(cumwidths, pad=(1, 0), mode='constant', value=0.0) 119 | cumwidths = (right - left) * cumwidths + left 120 | cumwidths[..., 0] = left 121 | cumwidths[..., -1] = right 122 | widths = cumwidths[..., 1:] - cumwidths[..., :-1] 123 | 124 | derivatives = min_derivative + F.softplus(unnormalized_derivatives) 125 | 126 | heights = F.softmax(unnormalized_heights, dim=-1) 127 | heights = min_bin_height + (1 - min_bin_height * num_bins) * heights 128 | cumheights = torch.cumsum(heights, dim=-1) 129 | cumheights = F.pad(cumheights, pad=(1, 0), mode='constant', value=0.0) 130 | cumheights = (top - bottom) * cumheights + bottom 131 | cumheights[..., 0] = bottom 132 | cumheights[..., -1] = top 133 | heights = cumheights[..., 1:] - cumheights[..., :-1] 134 | 135 | if inverse: 136 | bin_idx = searchsorted(cumheights, inputs)[..., None] 137 | else: 138 | bin_idx = searchsorted(cumwidths, inputs)[..., None] 139 | 140 | input_cumwidths = cumwidths.gather(-1, bin_idx)[..., 0] 141 | input_bin_widths = widths.gather(-1, bin_idx)[..., 0] 142 | 143 | input_cumheights = cumheights.gather(-1, bin_idx)[..., 0] 144 | delta = heights / widths 145 | input_delta = delta.gather(-1, bin_idx)[..., 0] 146 | 147 | input_derivatives = derivatives.gather(-1, bin_idx)[..., 0] 148 | input_derivatives_plus_one = derivatives[..., 1:].gather(-1, bin_idx)[..., 0] 149 | 150 | input_heights = heights.gather(-1, bin_idx)[..., 0] 151 | 152 | if inverse: 153 | a = (((inputs - input_cumheights) * (input_derivatives 154 | + input_derivatives_plus_one 155 | - 2 * input_delta) 156 | + input_heights * (input_delta - input_derivatives))) 157 | b = (input_heights * input_derivatives 158 | - (inputs - input_cumheights) * (input_derivatives 159 | + input_derivatives_plus_one 160 | - 2 * input_delta)) 161 | c = - input_delta * (inputs - input_cumheights) 162 | 163 | discriminant = b.pow(2) - 4 * a * c 164 | assert (discriminant >= 0).all() 165 | 166 | root = (2 * c) / (-b - torch.sqrt(discriminant)) 167 | outputs = root * input_bin_widths + input_cumwidths 168 | 169 | theta_one_minus_theta = root * (1 - root) 170 | denominator = input_delta + ((input_derivatives + input_derivatives_plus_one - 2 * input_delta) 171 | * theta_one_minus_theta) 172 | derivative_numerator = input_delta.pow(2) * (input_derivatives_plus_one * root.pow(2) 173 | + 2 * input_delta * theta_one_minus_theta 174 | + input_derivatives * (1 - root).pow(2)) 175 | logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator) 176 | 177 | return outputs, -logabsdet 178 | else: 179 | theta = (inputs - input_cumwidths) / input_bin_widths 180 | theta_one_minus_theta = theta * (1 - theta) 181 | 182 | numerator = input_heights * (input_delta * theta.pow(2) 183 | + input_derivatives * theta_one_minus_theta) 184 | denominator = input_delta + ((input_derivatives + input_derivatives_plus_one - 2 * input_delta) 185 | * theta_one_minus_theta) 186 | outputs = input_cumheights + numerator / denominator 187 | 188 | derivative_numerator = input_delta.pow(2) * (input_derivatives_plus_one * theta.pow(2) 189 | + 2 * input_delta * theta_one_minus_theta 190 | + input_derivatives * (1 - theta).pow(2)) 191 | logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator) 192 | 193 | return outputs, logabsdet 194 | -------------------------------------------------------------------------------- 
/mymoegoe/attentions.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | from torch import nn 4 | from torch.nn import functional as F 5 | 6 | import mymoegoe.commons as commons 7 | from mymoegoe.modules import LayerNorm 8 | 9 | 10 | class Encoder(nn.Module): 11 | def __init__(self, hidden_channels, filter_channels, n_heads, n_layers, kernel_size=1, p_dropout=0., window_size=4, **kwargs): 12 | super().__init__() 13 | self.hidden_channels = hidden_channels 14 | self.filter_channels = filter_channels 15 | self.n_heads = n_heads 16 | self.n_layers = n_layers 17 | self.kernel_size = kernel_size 18 | self.p_dropout = p_dropout 19 | self.window_size = window_size 20 | 21 | self.drop = nn.Dropout(p_dropout) 22 | self.attn_layers = nn.ModuleList() 23 | self.norm_layers_1 = nn.ModuleList() 24 | self.ffn_layers = nn.ModuleList() 25 | self.norm_layers_2 = nn.ModuleList() 26 | for i in range(self.n_layers): 27 | self.attn_layers.append(MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout, window_size=window_size)) 28 | self.norm_layers_1.append(LayerNorm(hidden_channels)) 29 | self.ffn_layers.append(FFN(hidden_channels, hidden_channels, filter_channels, kernel_size, p_dropout=p_dropout)) 30 | self.norm_layers_2.append(LayerNorm(hidden_channels)) 31 | 32 | def forward(self, x, x_mask): 33 | attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1) 34 | x = x * x_mask 35 | for i in range(self.n_layers): 36 | y = self.attn_layers[i](x, x, attn_mask) 37 | y = self.drop(y) 38 | x = self.norm_layers_1[i](x + y) 39 | 40 | y = self.ffn_layers[i](x, x_mask) 41 | y = self.drop(y) 42 | x = self.norm_layers_2[i](x + y) 43 | x = x * x_mask 44 | return x 45 | 46 | 47 | class Decoder(nn.Module): 48 | def __init__(self, hidden_channels, filter_channels, n_heads, n_layers, kernel_size=1, p_dropout=0., proximal_bias=False, proximal_init=True, **kwargs): 49 | super().__init__() 50 | self.hidden_channels = hidden_channels 51 | self.filter_channels = filter_channels 52 | self.n_heads = n_heads 53 | self.n_layers = n_layers 54 | self.kernel_size = kernel_size 55 | self.p_dropout = p_dropout 56 | self.proximal_bias = proximal_bias 57 | self.proximal_init = proximal_init 58 | 59 | self.drop = nn.Dropout(p_dropout) 60 | self.self_attn_layers = nn.ModuleList() 61 | self.norm_layers_0 = nn.ModuleList() 62 | self.encdec_attn_layers = nn.ModuleList() 63 | self.norm_layers_1 = nn.ModuleList() 64 | self.ffn_layers = nn.ModuleList() 65 | self.norm_layers_2 = nn.ModuleList() 66 | for i in range(self.n_layers): 67 | self.self_attn_layers.append(MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout, proximal_bias=proximal_bias, proximal_init=proximal_init)) 68 | self.norm_layers_0.append(LayerNorm(hidden_channels)) 69 | self.encdec_attn_layers.append(MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout)) 70 | self.norm_layers_1.append(LayerNorm(hidden_channels)) 71 | self.ffn_layers.append(FFN(hidden_channels, hidden_channels, filter_channels, kernel_size, p_dropout=p_dropout, causal=True)) 72 | self.norm_layers_2.append(LayerNorm(hidden_channels)) 73 | 74 | def forward(self, x, x_mask, h, h_mask): 75 | """ 76 | x: decoder input 77 | h: encoder output 78 | """ 79 | self_attn_mask = commons.subsequent_mask(x_mask.size(2)).to(device=x.device, dtype=x.dtype) 80 | encdec_attn_mask = h_mask.unsqueeze(2) * x_mask.unsqueeze(-1) 81 | x = x * x_mask 82 | for i in range(self.n_layers): 83 | y = 
self.self_attn_layers[i](x, x, self_attn_mask) 84 | y = self.drop(y) 85 | x = self.norm_layers_0[i](x + y) 86 | 87 | y = self.encdec_attn_layers[i](x, h, encdec_attn_mask) 88 | y = self.drop(y) 89 | x = self.norm_layers_1[i](x + y) 90 | 91 | y = self.ffn_layers[i](x, x_mask) 92 | y = self.drop(y) 93 | x = self.norm_layers_2[i](x + y) 94 | x = x * x_mask 95 | return x 96 | 97 | 98 | class MultiHeadAttention(nn.Module): 99 | def __init__(self, channels, out_channels, n_heads, p_dropout=0., window_size=None, heads_share=True, block_length=None, proximal_bias=False, proximal_init=False): 100 | super().__init__() 101 | assert channels % n_heads == 0 102 | 103 | self.channels = channels 104 | self.out_channels = out_channels 105 | self.n_heads = n_heads 106 | self.p_dropout = p_dropout 107 | self.window_size = window_size 108 | self.heads_share = heads_share 109 | self.block_length = block_length 110 | self.proximal_bias = proximal_bias 111 | self.proximal_init = proximal_init 112 | self.attn = None 113 | 114 | self.k_channels = channels // n_heads 115 | self.conv_q = nn.Conv1d(channels, channels, 1) 116 | self.conv_k = nn.Conv1d(channels, channels, 1) 117 | self.conv_v = nn.Conv1d(channels, channels, 1) 118 | self.conv_o = nn.Conv1d(channels, out_channels, 1) 119 | self.drop = nn.Dropout(p_dropout) 120 | 121 | if window_size is not None: 122 | n_heads_rel = 1 if heads_share else n_heads 123 | rel_stddev = self.k_channels**-0.5 124 | self.emb_rel_k = nn.Parameter(torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev) 125 | self.emb_rel_v = nn.Parameter(torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev) 126 | 127 | nn.init.xavier_uniform_(self.conv_q.weight) 128 | nn.init.xavier_uniform_(self.conv_k.weight) 129 | nn.init.xavier_uniform_(self.conv_v.weight) 130 | if proximal_init: 131 | with torch.no_grad(): 132 | self.conv_k.weight.copy_(self.conv_q.weight) 133 | self.conv_k.bias.copy_(self.conv_q.bias) 134 | 135 | def forward(self, x, c, attn_mask=None): 136 | q = self.conv_q(x) 137 | k = self.conv_k(c) 138 | v = self.conv_v(c) 139 | 140 | x, self.attn = self.attention(q, k, v, mask=attn_mask) 141 | 142 | x = self.conv_o(x) 143 | return x 144 | 145 | def attention(self, query, key, value, mask=None): 146 | # reshape [b, d, t] -> [b, n_h, t, d_k] 147 | b, d, t_s, t_t = (*key.size(), query.size(2)) 148 | query = query.view(b, self.n_heads, self.k_channels, t_t).transpose(2, 3) 149 | key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3) 150 | value = value.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3) 151 | 152 | scores = torch.matmul(query / math.sqrt(self.k_channels), key.transpose(-2, -1)) 153 | if self.window_size is not None: 154 | assert t_s == t_t, "Relative attention is only available for self-attention." 155 | key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, t_s) 156 | rel_logits = self._matmul_with_relative_keys(query /math.sqrt(self.k_channels), key_relative_embeddings) 157 | scores_local = self._relative_position_to_absolute_position(rel_logits) 158 | scores = scores + scores_local 159 | if self.proximal_bias: 160 | assert t_s == t_t, "Proximal bias is only available for self-attention." 161 | scores = scores + self._attention_bias_proximal(t_s).to(device=scores.device, dtype=scores.dtype) 162 | if mask is not None: 163 | scores = scores.masked_fill(mask == 0, -1e4) 164 | if self.block_length is not None: 165 | assert t_s == t_t, "Local attention is only available for self-attention." 
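# block_length restricts attention to a band of width block_length around the diagonal.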
166 | block_mask = torch.ones_like(scores).triu(-self.block_length).tril(self.block_length) 167 | scores = scores.masked_fill(block_mask == 0, -1e4) 168 | p_attn = F.softmax(scores, dim=-1) # [b, n_h, t_t, t_s] 169 | p_attn = self.drop(p_attn) 170 | output = torch.matmul(p_attn, value) 171 | if self.window_size is not None: 172 | relative_weights = self._absolute_position_to_relative_position(p_attn) 173 | value_relative_embeddings = self._get_relative_embeddings(self.emb_rel_v, t_s) 174 | output = output + self._matmul_with_relative_values(relative_weights, value_relative_embeddings) 175 | output = output.transpose(2, 3).contiguous().view(b, d, t_t) # [b, n_h, t_t, d_k] -> [b, d, t_t] 176 | return output, p_attn 177 | 178 | def _matmul_with_relative_values(self, x, y): 179 | """ 180 | x: [b, h, l, m] 181 | y: [h or 1, m, d] 182 | ret: [b, h, l, d] 183 | """ 184 | ret = torch.matmul(x, y.unsqueeze(0)) 185 | return ret 186 | 187 | def _matmul_with_relative_keys(self, x, y): 188 | """ 189 | x: [b, h, l, d] 190 | y: [h or 1, m, d] 191 | ret: [b, h, l, m] 192 | """ 193 | ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1)) 194 | return ret 195 | 196 | def _get_relative_embeddings(self, relative_embeddings, length): 197 | max_relative_position = 2 * self.window_size + 1 198 | # Pad first before slice to avoid using cond ops. 199 | pad_length = max(length - (self.window_size + 1), 0) 200 | slice_start_position = max((self.window_size + 1) - length, 0) 201 | slice_end_position = slice_start_position + 2 * length - 1 202 | if pad_length > 0: 203 | padded_relative_embeddings = F.pad( 204 | relative_embeddings, 205 | commons.convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]])) 206 | else: 207 | padded_relative_embeddings = relative_embeddings 208 | used_relative_embeddings = padded_relative_embeddings[:,slice_start_position:slice_end_position] 209 | return used_relative_embeddings 210 | 211 | def _relative_position_to_absolute_position(self, x): 212 | """ 213 | x: [b, h, l, 2*l-1] 214 | ret: [b, h, l, l] 215 | """ 216 | batch, heads, length, _ = x.size() 217 | # Concat columns of pad to shift from relative to absolute indexing. 218 | x = F.pad(x, commons.convert_pad_shape([[0,0],[0,0],[0,0],[0,1]])) 219 | 220 | # Concat extra elements so to add up to shape (len+1, 2*len-1). 221 | x_flat = x.view([batch, heads, length * 2 * length]) 222 | x_flat = F.pad(x_flat, commons.convert_pad_shape([[0,0],[0,0],[0,length-1]])) 223 | 224 | # Reshape and slice out the padded elements. 225 | x_final = x_flat.view([batch, heads, length+1, 2*length-1])[:, :, :length, length-1:] 226 | return x_final 227 | 228 | def _absolute_position_to_relative_position(self, x): 229 | """ 230 | x: [b, h, l, l] 231 | ret: [b, h, l, 2*l-1] 232 | """ 233 | batch, heads, length, _ = x.size() 234 | # padd along column 235 | x = F.pad(x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length-1]])) 236 | x_flat = x.view([batch, heads, length**2 + length*(length -1)]) 237 | # add 0's in the beginning that will skew the elements after reshape 238 | x_flat = F.pad(x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [length, 0]])) 239 | x_final = x_flat.view([batch, heads, length, 2*length])[:,:,:,1:] 240 | return x_final 241 | 242 | def _attention_bias_proximal(self, length): 243 | """Bias for self-attention to encourage attention to close positions. 244 | Args: 245 | length: an integer scalar. 
246 | Returns: 247 | a Tensor with shape [1, 1, length, length] 248 | """ 249 | r = torch.arange(length, dtype=torch.float32) 250 | diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1) 251 | return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0) 252 | 253 | 254 | class FFN(nn.Module): 255 | def __init__(self, in_channels, out_channels, filter_channels, kernel_size, p_dropout=0., activation=None, causal=False): 256 | super().__init__() 257 | self.in_channels = in_channels 258 | self.out_channels = out_channels 259 | self.filter_channels = filter_channels 260 | self.kernel_size = kernel_size 261 | self.p_dropout = p_dropout 262 | self.activation = activation 263 | self.causal = causal 264 | 265 | if causal: 266 | self.padding = self._causal_padding 267 | else: 268 | self.padding = self._same_padding 269 | 270 | self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size) 271 | self.conv_2 = nn.Conv1d(filter_channels, out_channels, kernel_size) 272 | self.drop = nn.Dropout(p_dropout) 273 | 274 | def forward(self, x, x_mask): 275 | x = self.conv_1(self.padding(x * x_mask)) 276 | if self.activation == "gelu": 277 | x = x * torch.sigmoid(1.702 * x) 278 | else: 279 | x = torch.relu(x) 280 | x = self.drop(x) 281 | x = self.conv_2(self.padding(x * x_mask)) 282 | return x * x_mask 283 | 284 | def _causal_padding(self, x): 285 | if self.kernel_size == 1: 286 | return x 287 | pad_l = self.kernel_size - 1 288 | pad_r = 0 289 | padding = [[0, 0], [0, 0], [pad_l, pad_r]] 290 | x = F.pad(x, commons.convert_pad_shape(padding)) 291 | return x 292 | 293 | def _same_padding(self, x): 294 | if self.kernel_size == 1: 295 | return x 296 | pad_l = (self.kernel_size - 1) // 2 297 | pad_r = self.kernel_size // 2 298 | padding = [[0, 0], [0, 0], [pad_l, pad_r]] 299 | x = F.pad(x, commons.convert_pad_shape(padding)) 300 | return x 301 | -------------------------------------------------------------------------------- /mymoegoe/modules.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | from torch import nn 4 | from torch.nn import functional as F 5 | 6 | from torch.nn import Conv1d 7 | from torch.nn.utils import weight_norm, remove_weight_norm 8 | 9 | import mymoegoe.commons as commons 10 | from mymoegoe.commons import init_weights, get_padding 11 | from mymoegoe.transforms import piecewise_rational_quadratic_transform 12 | 13 | 14 | LRELU_SLOPE = 0.1 15 | 16 | 17 | class LayerNorm(nn.Module): 18 | def __init__(self, channels, eps=1e-5): 19 | super().__init__() 20 | self.channels = channels 21 | self.eps = eps 22 | 23 | self.gamma = nn.Parameter(torch.ones(channels)) 24 | self.beta = nn.Parameter(torch.zeros(channels)) 25 | 26 | def forward(self, x): 27 | x = x.transpose(1, -1) 28 | x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps) 29 | return x.transpose(1, -1) 30 | 31 | 32 | class ConvReluNorm(nn.Module): 33 | def __init__(self, in_channels, hidden_channels, out_channels, kernel_size, n_layers, p_dropout): 34 | super().__init__() 35 | self.in_channels = in_channels 36 | self.hidden_channels = hidden_channels 37 | self.out_channels = out_channels 38 | self.kernel_size = kernel_size 39 | self.n_layers = n_layers 40 | self.p_dropout = p_dropout 41 | assert n_layers > 1, "Number of layers should be larger than 0." 
42 | 43 | self.conv_layers = nn.ModuleList() 44 | self.norm_layers = nn.ModuleList() 45 | self.conv_layers.append(nn.Conv1d(in_channels, hidden_channels, kernel_size, padding=kernel_size//2)) 46 | self.norm_layers.append(LayerNorm(hidden_channels)) 47 | self.relu_drop = nn.Sequential( 48 | nn.ReLU(), 49 | nn.Dropout(p_dropout)) 50 | for _ in range(n_layers-1): 51 | self.conv_layers.append(nn.Conv1d(hidden_channels, hidden_channels, kernel_size, padding=kernel_size//2)) 52 | self.norm_layers.append(LayerNorm(hidden_channels)) 53 | self.proj = nn.Conv1d(hidden_channels, out_channels, 1) 54 | self.proj.weight.data.zero_() 55 | self.proj.bias.data.zero_() 56 | 57 | def forward(self, x, x_mask): 58 | x_org = x 59 | for i in range(self.n_layers): 60 | x = self.conv_layers[i](x * x_mask) 61 | x = self.norm_layers[i](x) 62 | x = self.relu_drop(x) 63 | x = x_org + self.proj(x) 64 | return x * x_mask 65 | 66 | 67 | class DDSConv(nn.Module): 68 | """ 69 | Dilated and Depth-Separable Convolution 70 | """ 71 | def __init__(self, channels, kernel_size, n_layers, p_dropout=0.): 72 | super().__init__() 73 | self.channels = channels 74 | self.kernel_size = kernel_size 75 | self.n_layers = n_layers 76 | self.p_dropout = p_dropout 77 | 78 | self.drop = nn.Dropout(p_dropout) 79 | self.convs_sep = nn.ModuleList() 80 | self.convs_1x1 = nn.ModuleList() 81 | self.norms_1 = nn.ModuleList() 82 | self.norms_2 = nn.ModuleList() 83 | for i in range(n_layers): 84 | dilation = kernel_size ** i 85 | padding = (kernel_size * dilation - dilation) // 2 86 | self.convs_sep.append(nn.Conv1d(channels, channels, kernel_size, 87 | groups=channels, dilation=dilation, padding=padding 88 | )) 89 | self.convs_1x1.append(nn.Conv1d(channels, channels, 1)) 90 | self.norms_1.append(LayerNorm(channels)) 91 | self.norms_2.append(LayerNorm(channels)) 92 | 93 | def forward(self, x, x_mask, g=None): 94 | if g is not None: 95 | x = x + g 96 | for i in range(self.n_layers): 97 | y = self.convs_sep[i](x * x_mask) 98 | y = self.norms_1[i](y) 99 | y = F.gelu(y) 100 | y = self.convs_1x1[i](y) 101 | y = self.norms_2[i](y) 102 | y = F.gelu(y) 103 | y = self.drop(y) 104 | x = x + y 105 | return x * x_mask 106 | 107 | 108 | class WN(torch.nn.Module): 109 | def __init__(self, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=0, p_dropout=0): 110 | super(WN, self).__init__() 111 | assert(kernel_size % 2 == 1) 112 | self.hidden_channels =hidden_channels 113 | self.kernel_size = kernel_size, 114 | self.dilation_rate = dilation_rate 115 | self.n_layers = n_layers 116 | self.gin_channels = gin_channels 117 | self.p_dropout = p_dropout 118 | 119 | self.in_layers = torch.nn.ModuleList() 120 | self.res_skip_layers = torch.nn.ModuleList() 121 | self.drop = nn.Dropout(p_dropout) 122 | 123 | if gin_channels != 0: 124 | cond_layer = torch.nn.Conv1d(gin_channels, 2*hidden_channels*n_layers, 1) 125 | self.cond_layer = torch.nn.utils.weight_norm(cond_layer, name='weight') 126 | 127 | for i in range(n_layers): 128 | dilation = dilation_rate ** i 129 | padding = int((kernel_size * dilation - dilation) / 2) 130 | in_layer = torch.nn.Conv1d(hidden_channels, 2*hidden_channels, kernel_size, 131 | dilation=dilation, padding=padding) 132 | in_layer = torch.nn.utils.weight_norm(in_layer, name='weight') 133 | self.in_layers.append(in_layer) 134 | 135 | # last one is not necessary 136 | if i < n_layers - 1: 137 | res_skip_channels = 2 * hidden_channels 138 | else: 139 | res_skip_channels = hidden_channels 140 | 141 | res_skip_layer = 
torch.nn.Conv1d(hidden_channels, res_skip_channels, 1) 142 | res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name='weight') 143 | self.res_skip_layers.append(res_skip_layer) 144 | 145 | def forward(self, x, x_mask, g=None, **kwargs): 146 | output = torch.zeros_like(x) 147 | n_channels_tensor = torch.IntTensor([self.hidden_channels]) 148 | 149 | if g is not None: 150 | g = self.cond_layer(g) 151 | 152 | for i in range(self.n_layers): 153 | x_in = self.in_layers[i](x) 154 | if g is not None: 155 | cond_offset = i * 2 * self.hidden_channels 156 | g_l = g[:,cond_offset:cond_offset+2*self.hidden_channels,:] 157 | else: 158 | g_l = torch.zeros_like(x_in) 159 | 160 | acts = commons.fused_add_tanh_sigmoid_multiply( 161 | x_in, 162 | g_l, 163 | n_channels_tensor) 164 | acts = self.drop(acts) 165 | 166 | res_skip_acts = self.res_skip_layers[i](acts) 167 | if i < self.n_layers - 1: 168 | res_acts = res_skip_acts[:,:self.hidden_channels,:] 169 | x = (x + res_acts) * x_mask 170 | output = output + res_skip_acts[:,self.hidden_channels:,:] 171 | else: 172 | output = output + res_skip_acts 173 | return output * x_mask 174 | 175 | def remove_weight_norm(self): 176 | if self.gin_channels != 0: 177 | torch.nn.utils.remove_weight_norm(self.cond_layer) 178 | for l in self.in_layers: 179 | torch.nn.utils.remove_weight_norm(l) 180 | for l in self.res_skip_layers: 181 | torch.nn.utils.remove_weight_norm(l) 182 | 183 | 184 | class ResBlock1(torch.nn.Module): 185 | def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)): 186 | super(ResBlock1, self).__init__() 187 | self.convs1 = nn.ModuleList([ 188 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0], 189 | padding=get_padding(kernel_size, dilation[0]))), 190 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1], 191 | padding=get_padding(kernel_size, dilation[1]))), 192 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[2], 193 | padding=get_padding(kernel_size, dilation[2]))) 194 | ]) 195 | self.convs1.apply(init_weights) 196 | 197 | self.convs2 = nn.ModuleList([ 198 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, 199 | padding=get_padding(kernel_size, 1))), 200 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, 201 | padding=get_padding(kernel_size, 1))), 202 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, 203 | padding=get_padding(kernel_size, 1))) 204 | ]) 205 | self.convs2.apply(init_weights) 206 | 207 | def forward(self, x, x_mask=None): 208 | for c1, c2 in zip(self.convs1, self.convs2): 209 | xt = F.leaky_relu(x, LRELU_SLOPE) 210 | if x_mask is not None: 211 | xt = xt * x_mask 212 | xt = c1(xt) 213 | xt = F.leaky_relu(xt, LRELU_SLOPE) 214 | if x_mask is not None: 215 | xt = xt * x_mask 216 | xt = c2(xt) 217 | x = xt + x 218 | if x_mask is not None: 219 | x = x * x_mask 220 | return x 221 | 222 | def remove_weight_norm(self): 223 | for l in self.convs1: 224 | remove_weight_norm(l) 225 | for l in self.convs2: 226 | remove_weight_norm(l) 227 | 228 | 229 | class ResBlock2(torch.nn.Module): 230 | def __init__(self, channels, kernel_size=3, dilation=(1, 3)): 231 | super(ResBlock2, self).__init__() 232 | self.convs = nn.ModuleList([ 233 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0], 234 | padding=get_padding(kernel_size, dilation[0]))), 235 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1], 236 | padding=get_padding(kernel_size, dilation[1]))) 237 | ]) 
238 | self.convs.apply(init_weights) 239 | 240 | def forward(self, x, x_mask=None): 241 | for c in self.convs: 242 | xt = F.leaky_relu(x, LRELU_SLOPE) 243 | if x_mask is not None: 244 | xt = xt * x_mask 245 | xt = c(xt) 246 | x = xt + x 247 | if x_mask is not None: 248 | x = x * x_mask 249 | return x 250 | 251 | def remove_weight_norm(self): 252 | for l in self.convs: 253 | remove_weight_norm(l) 254 | 255 | 256 | class Log(nn.Module): 257 | def forward(self, x, x_mask, reverse=False, **kwargs): 258 | if not reverse: 259 | y = torch.log(torch.clamp_min(x, 1e-5)) * x_mask 260 | logdet = torch.sum(-y, [1, 2]) 261 | return y, logdet 262 | else: 263 | x = torch.exp(x) * x_mask 264 | return x 265 | 266 | 267 | class Flip(nn.Module): 268 | def forward(self, x, *args, reverse=False, **kwargs): 269 | x = torch.flip(x, [1]) 270 | if not reverse: 271 | logdet = torch.zeros(x.size(0)).to(dtype=x.dtype, device=x.device) 272 | return x, logdet 273 | else: 274 | return x 275 | 276 | 277 | class ElementwiseAffine(nn.Module): 278 | def __init__(self, channels): 279 | super().__init__() 280 | self.channels = channels 281 | self.m = nn.Parameter(torch.zeros(channels,1)) 282 | self.logs = nn.Parameter(torch.zeros(channels,1)) 283 | 284 | def forward(self, x, x_mask, reverse=False, **kwargs): 285 | if not reverse: 286 | y = self.m + torch.exp(self.logs) * x 287 | y = y * x_mask 288 | logdet = torch.sum(self.logs * x_mask, [1,2]) 289 | return y, logdet 290 | else: 291 | x = (x - self.m) * torch.exp(-self.logs) * x_mask 292 | return x 293 | 294 | 295 | class ResidualCouplingLayer(nn.Module): 296 | def __init__(self, 297 | channels, 298 | hidden_channels, 299 | kernel_size, 300 | dilation_rate, 301 | n_layers, 302 | p_dropout=0, 303 | gin_channels=0, 304 | mean_only=False): 305 | assert channels % 2 == 0, "channels should be divisible by 2" 306 | super().__init__() 307 | self.channels = channels 308 | self.hidden_channels = hidden_channels 309 | self.kernel_size = kernel_size 310 | self.dilation_rate = dilation_rate 311 | self.n_layers = n_layers 312 | self.half_channels = channels // 2 313 | self.mean_only = mean_only 314 | 315 | self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1) 316 | self.enc = WN(hidden_channels, kernel_size, dilation_rate, n_layers, p_dropout=p_dropout, gin_channels=gin_channels) 317 | self.post = nn.Conv1d(hidden_channels, self.half_channels * (2 - mean_only), 1) 318 | self.post.weight.data.zero_() 319 | self.post.bias.data.zero_() 320 | 321 | def forward(self, x, x_mask, g=None, reverse=False): 322 | x0, x1 = torch.split(x, [self.half_channels]*2, 1) 323 | h = self.pre(x0) * x_mask 324 | h = self.enc(h, x_mask, g=g) 325 | stats = self.post(h) * x_mask 326 | if not self.mean_only: 327 | m, logs = torch.split(stats, [self.half_channels]*2, 1) 328 | else: 329 | m = stats 330 | logs = torch.zeros_like(m) 331 | 332 | if not reverse: 333 | x1 = m + x1 * torch.exp(logs) * x_mask 334 | x = torch.cat([x0, x1], 1) 335 | logdet = torch.sum(logs, [1,2]) 336 | return x, logdet 337 | else: 338 | x1 = (x1 - m) * torch.exp(-logs) * x_mask 339 | x = torch.cat([x0, x1], 1) 340 | return x 341 | 342 | 343 | class ConvFlow(nn.Module): 344 | def __init__(self, in_channels, filter_channels, kernel_size, n_layers, num_bins=10, tail_bound=5.0): 345 | super().__init__() 346 | self.in_channels = in_channels 347 | self.filter_channels = filter_channels 348 | self.kernel_size = kernel_size 349 | self.n_layers = n_layers 350 | self.num_bins = num_bins 351 | self.tail_bound = tail_bound 352 | 
self.half_channels = in_channels // 2 353 | 354 | self.pre = nn.Conv1d(self.half_channels, filter_channels, 1) 355 | self.convs = DDSConv(filter_channels, kernel_size, n_layers, p_dropout=0.) 356 | self.proj = nn.Conv1d(filter_channels, self.half_channels * (num_bins * 3 - 1), 1) 357 | self.proj.weight.data.zero_() 358 | self.proj.bias.data.zero_() 359 | 360 | def forward(self, x, x_mask, g=None, reverse=False): 361 | x0, x1 = torch.split(x, [self.half_channels]*2, 1) 362 | h = self.pre(x0) 363 | h = self.convs(h, x_mask, g=g) 364 | h = self.proj(h) * x_mask 365 | 366 | b, c, t = x0.shape 367 | h = h.reshape(b, c, -1, t).permute(0, 1, 3, 2) # [b, cx?, t] -> [b, c, t, ?] 368 | 369 | unnormalized_widths = h[..., :self.num_bins] / math.sqrt(self.filter_channels) 370 | unnormalized_heights = h[..., self.num_bins:2*self.num_bins] / math.sqrt(self.filter_channels) 371 | unnormalized_derivatives = h[..., 2 * self.num_bins:] 372 | 373 | x1, logabsdet = piecewise_rational_quadratic_transform(x1, 374 | unnormalized_widths, 375 | unnormalized_heights, 376 | unnormalized_derivatives, 377 | inverse=reverse, 378 | tails='linear', 379 | tail_bound=self.tail_bound 380 | ) 381 | 382 | x = torch.cat([x0, x1], 1) * x_mask 383 | logdet = torch.sum(logabsdet * x_mask, [1,2]) 384 | if not reverse: 385 | return x, logdet 386 | else: 387 | return x 388 | -------------------------------------------------------------------------------- /mymoegoe/models.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | from torch import nn 4 | from torch.nn import functional as F 5 | 6 | import mymoegoe.commons as commons 7 | import mymoegoe.modules as modules 8 | import mymoegoe.attentions as attentions 9 | 10 | from torch.nn import Conv1d, ConvTranspose1d 11 | from torch.nn.utils import weight_norm 12 | from mymoegoe.commons import init_weights 13 | 14 | 15 | class StochasticDurationPredictor(nn.Module): 16 | def __init__(self, in_channels, filter_channels, kernel_size, p_dropout, n_flows=4, gin_channels=0): 17 | super().__init__() 18 | filter_channels = in_channels # it needs to be removed from future version. 
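# The layers assigned below build two normalizing-flow stacks over a 2-channel input:
# self.flows (an ElementwiseAffine followed by alternating ConvFlow/Flip steps) models
# the duration distribution conditioned on the detached text-encoder output, while
# self.post_flows and the post_* convolutions define the variational posterior that is
# only used in the training (non-reverse) branch of forward().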
19 | self.in_channels = in_channels 20 | self.filter_channels = filter_channels 21 | self.kernel_size = kernel_size 22 | self.p_dropout = p_dropout 23 | self.n_flows = n_flows 24 | self.gin_channels = gin_channels 25 | 26 | self.log_flow = modules.Log() 27 | self.flows = nn.ModuleList() 28 | self.flows.append(modules.ElementwiseAffine(2)) 29 | for i in range(n_flows): 30 | self.flows.append(modules.ConvFlow(2, filter_channels, kernel_size, n_layers=3)) 31 | self.flows.append(modules.Flip()) 32 | 33 | self.post_pre = nn.Conv1d(1, filter_channels, 1) 34 | self.post_proj = nn.Conv1d(filter_channels, filter_channels, 1) 35 | self.post_convs = modules.DDSConv(filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout) 36 | self.post_flows = nn.ModuleList() 37 | self.post_flows.append(modules.ElementwiseAffine(2)) 38 | for i in range(4): 39 | self.post_flows.append(modules.ConvFlow(2, filter_channels, kernel_size, n_layers=3)) 40 | self.post_flows.append(modules.Flip()) 41 | 42 | self.pre = nn.Conv1d(in_channels, filter_channels, 1) 43 | self.proj = nn.Conv1d(filter_channels, filter_channels, 1) 44 | self.convs = modules.DDSConv(filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout) 45 | if gin_channels != 0: 46 | self.cond = nn.Conv1d(gin_channels, filter_channels, 1) 47 | 48 | def forward(self, x, x_mask, w=None, g=None, reverse=False, noise_scale=1.0): 49 | x = torch.detach(x) 50 | x = self.pre(x) 51 | if g is not None: 52 | g = torch.detach(g) 53 | x = x + self.cond(g) 54 | x = self.convs(x, x_mask) 55 | x = self.proj(x) * x_mask 56 | 57 | if not reverse: 58 | flows = self.flows 59 | assert w is not None 60 | 61 | logdet_tot_q = 0 62 | h_w = self.post_pre(w) 63 | h_w = self.post_convs(h_w, x_mask) 64 | h_w = self.post_proj(h_w) * x_mask 65 | e_q = torch.randn(w.size(0), 2, w.size(2)).to(device=x.device, dtype=x.dtype) * x_mask 66 | z_q = e_q 67 | for flow in self.post_flows: 68 | z_q, logdet_q = flow(z_q, x_mask, g=(x + h_w)) 69 | logdet_tot_q += logdet_q 70 | z_u, z1 = torch.split(z_q, [1, 1], 1) 71 | u = torch.sigmoid(z_u) * x_mask 72 | z0 = (w - u) * x_mask 73 | logdet_tot_q += torch.sum((F.logsigmoid(z_u) + F.logsigmoid(-z_u)) * x_mask, [1,2]) 74 | logq = torch.sum(-0.5 * (math.log(2*math.pi) + (e_q**2)) * x_mask, [1,2]) - logdet_tot_q 75 | 76 | logdet_tot = 0 77 | z0, logdet = self.log_flow(z0, x_mask) 78 | logdet_tot += logdet 79 | z = torch.cat([z0, z1], 1) 80 | for flow in flows: 81 | z, logdet = flow(z, x_mask, g=x, reverse=reverse) 82 | logdet_tot = logdet_tot + logdet 83 | nll = torch.sum(0.5 * (math.log(2*math.pi) + (z**2)) * x_mask, [1,2]) - logdet_tot 84 | return nll + logq # [b] 85 | else: 86 | flows = list(reversed(self.flows)) 87 | flows = flows[:-2] + [flows[-1]] # remove a useless vflow 88 | z = torch.randn(x.size(0), 2, x.size(2)).to(device=x.device, dtype=x.dtype) * noise_scale 89 | for flow in flows: 90 | z = flow(z, x_mask, g=x, reverse=reverse) 91 | z0, z1 = torch.split(z, [1, 1], 1) 92 | logw = z0 93 | return logw 94 | 95 | 96 | class DurationPredictor(nn.Module): 97 | def __init__(self, in_channels, filter_channels, kernel_size, p_dropout, gin_channels=0): 98 | super().__init__() 99 | 100 | self.in_channels = in_channels 101 | self.filter_channels = filter_channels 102 | self.kernel_size = kernel_size 103 | self.p_dropout = p_dropout 104 | self.gin_channels = gin_channels 105 | 106 | self.drop = nn.Dropout(p_dropout) 107 | self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size, padding=kernel_size//2) 108 | self.norm_1 = 
modules.LayerNorm(filter_channels) 109 | self.conv_2 = nn.Conv1d(filter_channels, filter_channels, kernel_size, padding=kernel_size//2) 110 | self.norm_2 = modules.LayerNorm(filter_channels) 111 | self.proj = nn.Conv1d(filter_channels, 1, 1) 112 | 113 | if gin_channels != 0: 114 | self.cond = nn.Conv1d(gin_channels, in_channels, 1) 115 | 116 | def forward(self, x, x_mask, g=None): 117 | x = torch.detach(x) 118 | if g is not None: 119 | g = torch.detach(g) 120 | x = x + self.cond(g) 121 | x = self.conv_1(x * x_mask) 122 | x = torch.relu(x) 123 | x = self.norm_1(x) 124 | x = self.drop(x) 125 | x = self.conv_2(x * x_mask) 126 | x = torch.relu(x) 127 | x = self.norm_2(x) 128 | x = self.drop(x) 129 | x = self.proj(x * x_mask) 130 | return x * x_mask 131 | 132 | 133 | class TextEncoder(nn.Module): 134 | def __init__(self, 135 | n_vocab, 136 | out_channels, 137 | hidden_channels, 138 | filter_channels, 139 | n_heads, 140 | n_layers, 141 | kernel_size, 142 | p_dropout, 143 | emotion_embedding): 144 | super().__init__() 145 | self.n_vocab = n_vocab 146 | self.out_channels = out_channels 147 | self.hidden_channels = hidden_channels 148 | self.filter_channels = filter_channels 149 | self.n_heads = n_heads 150 | self.n_layers = n_layers 151 | self.kernel_size = kernel_size 152 | self.p_dropout = p_dropout 153 | self.emotion_embedding = emotion_embedding 154 | 155 | if self.n_vocab!=0: 156 | self.emb = nn.Embedding(n_vocab, hidden_channels) 157 | if emotion_embedding: 158 | self.emo_proj = nn.Linear(1024, hidden_channels) 159 | nn.init.normal_(self.emb.weight, 0.0, hidden_channels**-0.5) 160 | 161 | self.encoder = attentions.Encoder( 162 | hidden_channels, 163 | filter_channels, 164 | n_heads, 165 | n_layers, 166 | kernel_size, 167 | p_dropout) 168 | self.proj= nn.Conv1d(hidden_channels, out_channels * 2, 1) 169 | 170 | def forward(self, x, x_lengths, emotion_embedding=None): 171 | if self.n_vocab!=0: 172 | x = self.emb(x) * math.sqrt(self.hidden_channels) # [b, t, h] 173 | if emotion_embedding is not None: 174 | x = x + self.emo_proj(emotion_embedding.unsqueeze(1)) 175 | x = torch.transpose(x, 1, -1) # [b, h, t] 176 | x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype) 177 | 178 | x = self.encoder(x * x_mask, x_mask) 179 | stats = self.proj(x) * x_mask 180 | 181 | m, logs = torch.split(stats, self.out_channels, dim=1) 182 | return x, m, logs, x_mask 183 | 184 | 185 | class ResidualCouplingBlock(nn.Module): 186 | def __init__(self, 187 | channels, 188 | hidden_channels, 189 | kernel_size, 190 | dilation_rate, 191 | n_layers, 192 | n_flows=4, 193 | gin_channels=0): 194 | super().__init__() 195 | self.channels = channels 196 | self.hidden_channels = hidden_channels 197 | self.kernel_size = kernel_size 198 | self.dilation_rate = dilation_rate 199 | self.n_layers = n_layers 200 | self.n_flows = n_flows 201 | self.gin_channels = gin_channels 202 | 203 | self.flows = nn.ModuleList() 204 | for i in range(n_flows): 205 | self.flows.append(modules.ResidualCouplingLayer(channels, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels, mean_only=True)) 206 | self.flows.append(modules.Flip()) 207 | 208 | def forward(self, x, x_mask, g=None, reverse=False): 209 | if not reverse: 210 | for flow in self.flows: 211 | x, _ = flow(x, x_mask, g=g, reverse=reverse) 212 | else: 213 | for flow in reversed(self.flows): 214 | x = flow(x, x_mask, g=g, reverse=reverse) 215 | return x 216 | 217 | 218 | class PosteriorEncoder(nn.Module): 219 | def __init__(self, 220 | 
in_channels, 221 | out_channels, 222 | hidden_channels, 223 | kernel_size, 224 | dilation_rate, 225 | n_layers, 226 | gin_channels=0): 227 | super().__init__() 228 | self.in_channels = in_channels 229 | self.out_channels = out_channels 230 | self.hidden_channels = hidden_channels 231 | self.kernel_size = kernel_size 232 | self.dilation_rate = dilation_rate 233 | self.n_layers = n_layers 234 | self.gin_channels = gin_channels 235 | 236 | self.pre = nn.Conv1d(in_channels, hidden_channels, 1) 237 | self.enc = modules.WN(hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels) 238 | self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) 239 | 240 | def forward(self, x, x_lengths, g=None): 241 | x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype) 242 | x = self.pre(x) * x_mask 243 | x = self.enc(x, x_mask, g=g) 244 | stats = self.proj(x) * x_mask 245 | m, logs = torch.split(stats, self.out_channels, dim=1) 246 | z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask 247 | return z, m, logs, x_mask 248 | 249 | 250 | class Generator(torch.nn.Module): 251 | def __init__(self, initial_channel, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, gin_channels=0): 252 | super(Generator, self).__init__() 253 | self.num_kernels = len(resblock_kernel_sizes) 254 | self.num_upsamples = len(upsample_rates) 255 | self.conv_pre = Conv1d(initial_channel, upsample_initial_channel, 7, 1, padding=3) 256 | resblock = modules.ResBlock1 if resblock == '1' else modules.ResBlock2 257 | 258 | self.ups = nn.ModuleList() 259 | for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)): 260 | self.ups.append(weight_norm( 261 | ConvTranspose1d(upsample_initial_channel//(2**i), upsample_initial_channel//(2**(i+1)), 262 | k, u, padding=(k-u)//2))) 263 | 264 | self.resblocks = nn.ModuleList() 265 | for i in range(len(self.ups)): 266 | ch = upsample_initial_channel//(2**(i+1)) 267 | for j, (k, d) in enumerate(zip(resblock_kernel_sizes, resblock_dilation_sizes)): 268 | self.resblocks.append(resblock(ch, k, d)) 269 | 270 | self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False) 271 | self.ups.apply(init_weights) 272 | 273 | if gin_channels != 0: 274 | self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1) 275 | 276 | def forward(self, x, g=None): 277 | x = self.conv_pre(x) 278 | if g is not None: 279 | x = x + self.cond(g) 280 | 281 | for i in range(self.num_upsamples): 282 | x = F.leaky_relu(x, modules.LRELU_SLOPE) 283 | x = self.ups[i](x) 284 | xs = None 285 | for j in range(self.num_kernels): 286 | if xs is None: 287 | xs = self.resblocks[i*self.num_kernels+j](x) 288 | else: 289 | xs += self.resblocks[i*self.num_kernels+j](x) 290 | x = xs / self.num_kernels 291 | x = F.leaky_relu(x) 292 | x = self.conv_post(x) 293 | x = torch.tanh(x) 294 | 295 | return x 296 | 297 | 298 | class SynthesizerTrn(nn.Module): 299 | """ 300 | Synthesizer for Training 301 | """ 302 | 303 | def __init__(self, 304 | n_vocab, 305 | spec_channels, 306 | segment_size, 307 | inter_channels, 308 | hidden_channels, 309 | filter_channels, 310 | n_heads, 311 | n_layers, 312 | kernel_size, 313 | p_dropout, 314 | resblock, 315 | resblock_kernel_sizes, 316 | resblock_dilation_sizes, 317 | upsample_rates, 318 | upsample_initial_channel, 319 | upsample_kernel_sizes, 320 | n_speakers=0, 321 | gin_channels=0, 322 | use_sdp=True, 323 | emotion_embedding=False, 324 | **kwargs): 325 | 326 | super().__init__() 
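# The sub-modules assigned below follow the usual VITS layout: enc_p is the text
# (prior) encoder, dec is the HiFi-GAN-style waveform generator, enc_q is the
# posterior encoder over the input spectrograms, flow maps between posterior and
# prior latents, dp is the (stochastic) duration predictor, and emb_g embeds the
# speaker id when n_speakers > 1.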
327 | self.n_vocab = n_vocab 328 | self.spec_channels = spec_channels 329 | self.inter_channels = inter_channels 330 | self.hidden_channels = hidden_channels 331 | self.filter_channels = filter_channels 332 | self.n_heads = n_heads 333 | self.n_layers = n_layers 334 | self.kernel_size = kernel_size 335 | self.p_dropout = p_dropout 336 | self.resblock = resblock 337 | self.resblock_kernel_sizes = resblock_kernel_sizes 338 | self.resblock_dilation_sizes = resblock_dilation_sizes 339 | self.upsample_rates = upsample_rates 340 | self.upsample_initial_channel = upsample_initial_channel 341 | self.upsample_kernel_sizes = upsample_kernel_sizes 342 | self.segment_size = segment_size 343 | self.n_speakers = n_speakers 344 | self.gin_channels = gin_channels 345 | 346 | self.use_sdp = use_sdp 347 | 348 | self.enc_p = TextEncoder(n_vocab, 349 | inter_channels, 350 | hidden_channels, 351 | filter_channels, 352 | n_heads, 353 | n_layers, 354 | kernel_size, 355 | p_dropout, 356 | emotion_embedding) 357 | self.dec = Generator(inter_channels, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, gin_channels=gin_channels) 358 | self.enc_q = PosteriorEncoder(spec_channels, inter_channels, hidden_channels, 5, 1, 16, gin_channels=gin_channels) 359 | self.flow = ResidualCouplingBlock(inter_channels, hidden_channels, 5, 1, 4, gin_channels=gin_channels) 360 | 361 | if use_sdp: 362 | self.dp = StochasticDurationPredictor(hidden_channels, 192, 3, 0.5, 4, gin_channels=gin_channels) 363 | else: 364 | self.dp = DurationPredictor(hidden_channels, 256, 3, 0.5, gin_channels=gin_channels) 365 | 366 | if n_speakers > 1: 367 | self.emb_g = nn.Embedding(n_speakers, gin_channels) 368 | 369 | def infer(self, x, x_lengths, sid=None, noise_scale=1, length_scale=1, noise_scale_w=1., max_len=None, emotion_embedding=None): 370 | x, m_p, logs_p, x_mask = self.enc_p(x, x_lengths, emotion_embedding) 371 | if self.n_speakers > 0: 372 | g = self.emb_g(sid).unsqueeze(-1) # [b, h, 1] 373 | else: 374 | g = None 375 | 376 | if self.use_sdp: 377 | logw = self.dp(x, x_mask, g=g, reverse=True, noise_scale=noise_scale_w) 378 | else: 379 | logw = self.dp(x, x_mask, g=g) 380 | w = torch.exp(logw) * x_mask * length_scale 381 | w_ceil = torch.ceil(w) 382 | y_lengths = torch.clamp_min(torch.sum(w_ceil, [1, 2]), 1).long() 383 | y_mask = torch.unsqueeze(commons.sequence_mask(y_lengths, None), 1).to(x_mask.dtype) 384 | attn_mask = torch.unsqueeze(x_mask, 2) * torch.unsqueeze(y_mask, -1) 385 | attn = commons.generate_path(w_ceil, attn_mask) 386 | 387 | m_p = torch.matmul(attn.squeeze(1), m_p.transpose(1, 2)).transpose(1, 2) # [b, t', t], [b, t, d] -> [b, d, t'] 388 | logs_p = torch.matmul(attn.squeeze(1), logs_p.transpose(1, 2)).transpose(1, 2) # [b, t', t], [b, t, d] -> [b, d, t'] 389 | 390 | z_p = m_p + torch.randn_like(m_p) * torch.exp(logs_p) * noise_scale 391 | z = self.flow(z_p, y_mask, g=g, reverse=True) 392 | o = self.dec((z * y_mask)[:,:,:max_len], g=g) 393 | return o, attn, y_mask, (z, z_p, m_p, logs_p) 394 | 395 | def voice_conversion(self, y, y_lengths, sid_src, sid_tgt): 396 | assert self.n_speakers > 0, "n_speakers have to be larger than 0." 
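# Voice conversion reuses the trained components: the source utterance is encoded with
# the source speaker embedding, mapped through the flow into the prior latent space,
# then the flow is inverted with the target speaker embedding and the result is decoded
# by dec. A minimal usage sketch (hypothetical names and shapes, assuming a multi-speaker
# checkpoint is already loaded into net_g):
#   spec = ...                                    # [1, spec_channels, T] source spectrogram
#   spec_len = torch.LongTensor([spec.size(2)])
#   sid_src, sid_tgt = torch.LongTensor([0]), torch.LongTensor([1])
#   audio, _, _ = net_g.voice_conversion(spec, spec_len, sid_src, sid_tgt)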
397 | g_src = self.emb_g(sid_src).unsqueeze(-1) 398 | g_tgt = self.emb_g(sid_tgt).unsqueeze(-1) 399 | z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g_src) 400 | z_p = self.flow(z, y_mask, g=g_src) 401 | z_hat = self.flow(z_p, y_mask, g=g_tgt, reverse=True) 402 | o_hat = self.dec(z_hat * y_mask, g=g_tgt) 403 | return o_hat, y_mask, (z, z_p, z_hat) 404 | 405 | -------------------------------------------------------------------------------- /launch.py: -------------------------------------------------------------------------------- 1 | #Speech recognition library, very important 2 | import speech_recognition as sr 3 | #Alternative speech recognition with whisper 4 | from whisper_mic.whisper_mic import WhisperMic 5 | #pyttsx3 is our tts engine 6 | import pyttsx3 7 | #Load other TTS 8 | import mymoegoe.tts as mytts 9 | import xtts.tts as xtts 10 | import xtts.stream as xttsstream 11 | #Pygame is used to play the wav audio files that pyttsx3 generates 12 | # import os 13 | # os.environ['PYGAME_HIDE_SUPPORT_PROMPT'] = "hide" 14 | # from pygame import mixer, _sdl2 as devices 15 | #These are tools used for interfacing with youchat and other data. json, regex, and cloudflare scraper 16 | import json 17 | import cloudscraper 18 | import re 19 | import random 20 | import threading 21 | #Ooba Reqs 22 | import requests 23 | #LM Studio and Other OpenAI compatibles Reqs 24 | import openai 25 | #Character Card Reqs 26 | from PIL import Image 27 | from PIL.ExifTags import TAGS 28 | import base64 29 | #terminal arg libs 30 | import sys 31 | import getopt 32 | import time 33 | #Needed for piping to vbcable and playing TTS 34 | import sounddevice as sd 35 | import soundfile as sf 36 | import numpy as np 37 | import os 38 | #-------------------------------------- 39 | argsv = sys.argv[1:] 40 | options, args = getopt.getopt(argsv, 'hv', 41 | ["vbcable", "voiceinput", "pc=", "pcaschat", "caphistory=", "voice=", "voices", "wakeword=", 42 | "alwayslisten", "ooba", "openai", "vosk", "googlestt", "chara=", "moegoe", "xtts", "bootmsg=", "wakeprompt", "nowakeping", 'voicespeed=', 'mgmodel=', 'template=']) 43 | 44 | #Config variables 45 | vbcable = False 46 | textinput = True 47 | wakeword = "computer" 48 | promptcontext = "" 49 | promptcontextaschat = False 50 | caphistory = 4 51 | voice = None 52 | alwayslisten = False 53 | waketext = "" 54 | ooba = False 55 | openaiapi = False 56 | vosk = False 57 | googlestt = False 58 | ttsengine = "pyttsx3" 59 | wakeprompt = False 60 | wakeping = True 61 | charafilename = "" 62 | speed = 1 63 | bootmsg = "Booting Up" 64 | mgmodel = "g" 65 | xttsmodel = "base v2.0.2" 66 | verbose = False 67 | chatml = False 68 | phi3 = False 69 | streamchunks = 20 70 | 71 | # Put your URI end point:port here for your openai inference server (such as LM Studio) 72 | openai.api_base='http://localhost:1234/v1' 73 | # Put in an empty API Key for LM stuido 74 | openai.api_key='' 75 | openaimodel = "local model" 76 | 77 | script_path = os.path.abspath(__file__) 78 | directory = os.path.dirname(script_path) 79 | 80 | for opt, arg in options: 81 | if opt == "-h": 82 | print("--vbcable: Send audio to vb-cable virtual microphone.") 83 | print("--voiceinput: Interact with the AI using your voice instead of text.") 84 | print("--pc='string': set a prompt context. To prepend to prompts. Optionally can be set as fake history.") 85 | print("--pcaschat: Sets prompt context to be a fake chat history.") 86 | print("--caphistory=number: Caps chat history length. Default is 4. 
Set to -1 to disable.") 87 | print("--voice=number/string: Set the TTS voice.") 88 | print("--voices: List voices on your computer.") 89 | print("--wakeword='string': Sets the wake word when using voice input.") 90 | print("--alwayslisten: Always listen for input, not using a wake word.") 91 | print("--ooba: Use local oobabooga webui as LLM instead of YouChat.") 92 | print("--openai: Use openai api as LLM instead of YouChat.") 93 | print("--vosk: Use local vosk as STT.") 94 | print("--googlestt: Use google's online service as STT.") 95 | print("--chara='filename': Load tavernai character card or oobabooga character json file.") 96 | print("--moegoe: Use moegoe as TTS instead of default TTS.") 97 | print("--xtts: Use xtts as TTS instead of default TTS.") 98 | print("--bootmsg='string': What to say when booting up.") 99 | print("--wakeprompt: Like alwayslisten, but doesn't prompt unless wakeword is included.") 100 | print("--nowakeping: Doesn't ping when starting to listen for wake word") 101 | print("--voicespeed=number: Speed of moegoe tts. Higher=slower. default is 1.") 102 | print("--mgmodel='filename': set the filename of the moegoe model. default is g") 103 | print("--template='string': specify a prompt template (chatml or phi3). default typical chat format.") 104 | print("-v: Print debug info.") 105 | sys.exit(2) 106 | elif opt == '--vbcable': 107 | vbcable = True 108 | elif opt == '--voiceinput': 109 | textinput = False 110 | elif opt == '--pc': 111 | promptcontext = "["+arg+"]" 112 | elif opt == "--pcaschat": 113 | promptcontextaschat = True 114 | elif opt == "--caphistory": 115 | caphistory = int(arg) 116 | elif opt == '--voice': 117 | voice = arg 118 | elif opt == '--voices': 119 | engine = pyttsx3.init() 120 | voices = engine.getProperty('voices') 121 | for v in voices: 122 | print (v) 123 | sys.exit(2) 124 | elif opt == '--wakeword': 125 | wakeword = arg 126 | elif opt == '--alwayslisten': 127 | alwayslisten = True 128 | elif opt == "--ooba": 129 | ooba = True 130 | elif opt == "--openai": 131 | openaiapi = True 132 | elif opt == "--vosk": 133 | vosk = True 134 | elif opt == "--googlestt": 135 | googlestt = True 136 | elif opt == "--chara": 137 | charafilename = arg 138 | elif opt == "--moegoe": 139 | ttsengine = "moegoe" 140 | elif opt == "--xtts": 141 | ttsengine = "xtts" 142 | elif opt == "--bootmsg": 143 | bootmsg = arg 144 | elif opt == "--wakeprompt": 145 | wakeprompt = True 146 | elif opt == "--nowakeping": 147 | wakeping = False 148 | elif opt == "--voicespeed": 149 | speed = float(arg) 150 | elif opt == "--mgmodel": 151 | mgmodel = arg 152 | xttsmodel = arg 153 | elif opt == "--template": 154 | if arg == "chatml": 155 | chatml = True 156 | if arg == "phi3": 157 | phi3 = True 158 | elif opt == "-v": 159 | verbose = True 160 | 161 | 162 | # Find VB-Cable device IDs 163 | vbcable_output = None 164 | vbcable_input = None 165 | if vbcable: 166 | for device in sd.query_devices(): 167 | if 'CABLE Output' in device['name'] and device['max_input_channels'] == 2 and vbcable_output == None: 168 | if verbose: 169 | print("Found Cable Output.", device['name'], device['index']) 170 | vbcable_output = device["index"] 171 | if 'CABLE Input' in device['name'] and device['max_output_channels'] == 2 and vbcable_input == None: 172 | if verbose: 173 | print("Found Cable Input.", device['name'], device['index']) 174 | vbcable_input = device["index"] 175 | 176 | #New function to load and play the tts outputs. 
Check for vbcable vs speaker 177 | def playaudio(): 178 | audiofile = os.path.join(directory,"temp.wav") 179 | if os.path.isfile(audiofile): 180 | data, fs = sf.read(audiofile, dtype='float32') 181 | data_stereo = np.tile(data, (2, 1)).T.copy(order='C') 182 | delay = int(fs * 0.2) # 200ms delay 183 | zeros = np.zeros((delay, 2)) 184 | sd.play(zeros, fs, blocking=True, device=sd.default.device) 185 | sd.play(data, fs, device=sd.default.device) 186 | if vbcable: 187 | with sd.OutputStream(device=vbcable_input, 188 | samplerate=fs, 189 | channels=2) as stream: 190 | stream.write(data_stereo) 191 | sd.wait() 192 | else: 193 | print("Generated TTS audio temp.wav not found!") 194 | def playstream(audio_stream, stream1, stream2): 195 | for chunk in audio_stream: 196 | #Get chunk data into np format (pcm audio samples) 197 | audio_pcm = np.frombuffer(chunk, dtype=np.int16) 198 | # Convert PCM audio samples to 32-bit floating-point values 199 | #data_float = audio_pcm.astype(np.float32) / 32768.0 200 | stream1.write(audio_pcm) 201 | if stream2 != None: 202 | stream2.write(audio_pcm) 203 | 204 | def playaudiostream(audio_stream): 205 | # xttsstream.stream_ffplay(audio_stream) 206 | with sd.OutputStream(device=sd.default.device, 207 | samplerate=24000, 208 | channels=1, blocksize=44544, latency=1, dtype='int16') as stream1: 209 | if vbcable: 210 | with sd.OutputStream(device=vbcable_input, 211 | samplerate=24000, 212 | channels=1, blocksize=44544, latency=1, dtype='int16') as stream2: 213 | playstream(audio_stream, stream1, stream2) 214 | else: 215 | playstream(audio_stream, stream1, None) 216 | 217 | def playchime(pingpong="ping"): 218 | data, fs = sf.read(os.path.join(directory,pingpong+".wav"), dtype='float32') 219 | sd.play(data, fs, device=sd.default.device) 220 | 221 | #Here we initialize python's audio output 222 | #It checks to see if we enabled vb-cable to pipe the audio to vmagicmirror 223 | #Be sure to turn on listening to the vb-cable mic if you wish to hear the ai speak, otherwise it's silent 224 | # if vbcable: 225 | # mixer.init(devicename = "CABLE Input (VB-Audio Virtual Cable)") 226 | # else: 227 | # mixer.init() 228 | #a debug print to check our audio devices 229 | #print("Outputs:", devices.audio.get_audio_device_names()[0]) 230 | 231 | 232 | #Here we initialize the tts with a default boot message 233 | #voices[2].id is to get the voice we want. 234 | if ttsengine == "pyttsx3": 235 | if voice == None: 236 | voice = 0 237 | engine = pyttsx3.init() 238 | voices = engine.getProperty('voices') 239 | if len(voices) == 0: 240 | print("No TTS voices detected. 
Please install a TTS voice on your OS.") 241 | sys.exit(2) 242 | engine.setProperty('voice', voices[int(voice)].id) 243 | engine.save_to_file(bootmsg, os.path.join(directory,"temp.wav")) 244 | engine.runAndWait(); 245 | playaudio() 246 | #-------------------- 247 | 248 | if ttsengine == "moegoe": 249 | if voice == None: 250 | voice = 0 251 | mytts.loadtts(mgmodel) 252 | mytts.tts(bootmsg, os.path.join(directory,"temp.wav"), voice=int(voice), speed=speed) 253 | playaudio() 254 | 255 | if ttsengine == "xtts": 256 | if voice == None: 257 | voice = "en_sample" 258 | xttsstream.loadModel(xttsmodel, voice=voice) 259 | speaker = xttsstream.get_speaker(xttsstream.reference) 260 | audio = playaudiostream(xttsstream.tts(bootmsg, speaker, "en", streamchunks)) 261 | #xtts.loadModel(xttsmodel, voice=voice) 262 | #xtts.generateSpeech(bootmsg, os.path.join(directory,"temp.wav")) 263 | #playaudio() 264 | 265 | #Load sfx 266 | # ping = mixer.Sound("ping.wav") 267 | # pong = mixer.Sound("pong.wav") 268 | 269 | #Initialize the cloudflare scraper that we use for youchat requests 270 | if not ooba and not openaiapi: 271 | scraper = cloudscraper.create_scraper(ecdhCurve='secp384r1') 272 | 273 | #New traceid function. This fetches the needed traceid for youchat to function 274 | def getinitialtraceid(): 275 | headers = { 276 | 'Accept': 'text/event-stream', 277 | 'Connection': 'keep-alive', 278 | 'Sec-Fetch-Mode': 'cors', 279 | 'Sec-Fetch-Site': 'same-origin', 280 | 'Sec-GPC': '1', 281 | 'Referer': 'https://you.com/search?q=hello&fromSearchBar=true&tbm=youchat' 282 | } 283 | payload = {'q': "hello"} 284 | try: 285 | response = scraper.get("https://you.com/search", params=payload, headers=headers) 286 | except cloudscraper.exceptions.CloudflareChallengeError as e: 287 | return "Sorry, there was a cloudflare error. Please try again." 288 | 289 | data = response.text 290 | match = re.search(r'"initialTraceId":"(.+?)"', data) 291 | first_capture_group = match.group(1) 292 | #print("traceid:", first_capture_group) 293 | return first_capture_group 294 | if not ooba and not openaiapi: 295 | traceid = getinitialtraceid() 296 | randuuid = str(random.random())[2:] 297 | #print("Random UUID:", randuuid) 298 | #--------------------------- 299 | 300 | #sendq is the youchat api request. Just enter prompt for the parameter and we get the response back 301 | #chat variable is kept updated with chat history 302 | chat=[] 303 | def sendq(question): 304 | global chat, traceid, randuuid 305 | headers = { 306 | 'Accept': 'text/event-stream', 307 | 'Connection': 'keep-alive', 308 | 'Sec-Fetch-Mode': 'cors', 309 | 'Sec-Fetch-Site': 'same-origin', 310 | 'Sec-GPC': '1', 311 | 'Referer': 'https://you.com/search?q=hello&fromSearchBar=true&tbm=youchat', 312 | 'Cookie': ('uuid_guest='+randuuid+";").encode() 313 | } 314 | if promptcontextaschat: 315 | chat.append({"question":'"'+promptcontext+'"', "answer":''}) 316 | payload = { 317 | 'q': question, 318 | 'chat': str(chat), 319 | 'queryTraceId': traceid, 320 | 'domain': 'youchat', 321 | 'page': '1', 322 | 'count': '10', 323 | 'safeSearch': 'Off', 324 | 'onShoppingPage': 'false', 325 | 'freshness':'Month', 326 | 'mkt':'', 327 | 'responseFilter': 'WebPages,Translations,TimeZone,Computation,RelatedSearches' 328 | } 329 | try: 330 | response = scraper.get("https://you.com/api/streamingSearch", params=payload, headers=headers, stream=True) 331 | except cloudscraper.exceptions.CloudflareChallengeError as e: 332 | return "Sorry, there was a cloudflare error. Please try again." 
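# The streamingSearch endpoint replies with server-sent events; a useful line looks
# roughly like this (illustrative, not captured from a real response):
#   data: {"youChatToken": "Hello"}
# The loop below concatenates the youChatToken fragments and stops at the
# end-of-stream sentinel value.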
333 | 334 | output = "" 335 | for line in response.iter_lines(): 336 | if line: 337 | decoded_line = line.decode("utf-8") 338 | if decoded_line != "{}": 339 | key, value = decoded_line.split(":", 1) 340 | key = key.strip() 341 | value = value.strip() 342 | if key == "data": 343 | if value == "I'm Mr. Meeseeks. Look at me.": 344 | break 345 | data = json.loads(value) 346 | if "youChatToken" in data: 347 | output += data["youChatToken"] 348 | else: 349 | return "Sorry, the AI server is too busy. An error has occurred. Please try again." 350 | if caphistory >= 0: 351 | if len(chat) > caphistory: 352 | chat = chat[:0-caphistory] 353 | chat.append({"question":'"'+question+'"', "answer":'"'+output+'"'}) 354 | return output 355 | 356 | #Initialize Character Persona Details for Ooba LLM 357 | yourname = "You" 358 | charactername = "Friend" 359 | characterpersona = "" 360 | worldscenario = "Reply as the user's friend. Make up details about yourself. Do not mention or say you are an AI. Respond only with one or two sentences and keeps responses brief." 361 | exampledialogue = "" 362 | exampledialogue = re.sub(r'{{char}}', charactername, exampledialogue) 363 | exampledialogue = re.sub(r'{{user}}', yourname, exampledialogue) 364 | greeting = "" 365 | 366 | def loadcharacard(filename): 367 | global charactername, characterpersona, worldscenario, exampledialogue, greeting 368 | if verbose: 369 | print("PNG/WEBP character file loading...") 370 | # load the image 371 | img = Image.open(filename) 372 | exif_data = img._getexif() 373 | img.load() 374 | chara = "" 375 | if filename[-4:] == ".png": 376 | chara = img.info["chara"] 377 | decoded_bytes = base64.b64decode(chara) 378 | decoded_string = decoded_bytes.decode('utf-8') 379 | chara = decoded_string 380 | if filename[-4:] == "webp": 381 | for tag_id, value in exif_data.items(): 382 | tag = TAGS.get(tag_id, tag_id) 383 | if tag == "UserComment": 384 | chara = value[8:] 385 | 386 | charajson = json.loads(chara) 387 | print("Loading "+charajson['name']) 388 | charactername = charajson['name'] 389 | characterpersona = charajson['description']+"\nPersonality: "+charajson['personality'] 390 | characterpersona = re.sub(r'{{char}}', charactername, characterpersona) 391 | characterpersona = re.sub(r'{{user}}', yourname, characterpersona) 392 | worldscenario = charajson['scenario'] 393 | worldscenario = re.sub(r'{{char}}', charactername, worldscenario) 394 | worldscenario = re.sub(r'{{user}}', yourname, worldscenario) 395 | greeting = charajson['first_mes'] 396 | greeting = re.sub(r'{{char}}', charactername, greeting) 397 | greeting = re.sub(r'{{user}}', yourname, greeting) 398 | exampledialogue = charajson['mes_example'] 399 | exampledialogue = re.sub(r'{{char}}', charactername, exampledialogue) 400 | exampledialogue = re.sub(r'{{user}}', yourname, exampledialogue) 401 | 402 | def loadoobacharjson(filename): 403 | global charactername, characterpersona, worldscenario, exampledialogue, greeting 404 | if verbose: 405 | print("JSON character file loading...") 406 | with open(filename, encoding="utf-8") as f: 407 | data = json.load(f) 408 | print("Loading "+data['char_name']) 409 | charactername = data['char_name'] 410 | characterpersona = data['char_persona'] 411 | characterpersona = re.sub(r'{{char}}', charactername, characterpersona) 412 | characterpersona = re.sub(r'{{user}}', yourname, characterpersona) 413 | worldscenario = data['world_scenario'] 414 | worldscenario = re.sub(r'{{char}}', charactername, worldscenario) 415 | worldscenario = re.sub(r'{{user}}', 
yourname, worldscenario) 416 | greeting = data['char_greeting'] 417 | greeting = re.sub(r'{{char}}', charactername, greeting) 418 | greeting = re.sub(r'{{user}}', yourname, greeting) 419 | exampledialogue = data['example_dialogue'] 420 | exampledialogue = re.sub(r'{{char}}', charactername, exampledialogue) 421 | exampledialogue = re.sub(r'{{user}}', yourname, exampledialogue) 422 | 423 | def loadchara(filename): 424 | if verbose: 425 | print("Chara file extension:", filename[-4:]) 426 | if filename[-4:] == "json": 427 | loadoobacharjson(filename) 428 | elif filename[-4:] == ".png" or filename[-4:] == "webp": 429 | loadcharacard(filename) 430 | else: 431 | print("Could not detect character format...") 432 | 433 | if charafilename != "": 434 | loadchara(charafilename) 435 | 436 | if greeting != "": 437 | print(charactername+": "+greeting) 438 | chat.append({"question":'', "answer":greeting}) 439 | out = re.sub("\n", "", greeting) 440 | out = re.sub("[\"\']", "", out) 441 | out = re.sub("[^\x00-\x7F]+", "", out) 442 | out = re.sub("[<>]", "", out) 443 | out = re.sub("-", " - ", out) 444 | if ttsengine == "pyttsx3": 445 | engine.save_to_file(out, os.path.join(directory,"temp.wav")) 446 | engine.runAndWait(); 447 | if ttsengine == "moegoe": 448 | mytts.tts(out, os.path.join(directory,"temp.wav"), voice=int(voice), speed=speed) 449 | if ttsengine == "xtts": 450 | audio = playaudiostream(xttsstream.tts(out, speaker, "en", streamchunks)) 451 | #xtts.generateSpeech(out, os.path.join(directory,"temp.wav")) 452 | if ttsengine != "xtts": 453 | playaudio() 454 | 455 | #Creates the prompt for non-youchat apis 456 | def createprompt(question): 457 | global chat, yourname, charactername 458 | prompt = "" 459 | 460 | #ChatML 461 | promptuserstart = "<|im_start|>" 462 | promptend = "<|im_end|>" 463 | promptassistantstart = "<|im_start|>" 464 | 465 | #Phi 3 466 | if phi3: 467 | promptuserstart = "<|user|>" 468 | promptend = "<|end|>" 469 | promptassistantstart = "<|assistant|>" 470 | 471 | if chatml: 472 | prompt = promptuserstart+"system\n" 473 | yourname = "user" 474 | charactername = "assistant" 475 | if phi3: 476 | prompt = "<|system|>"#promptuserstart+"\n" 477 | #Handle legacy prompt context 478 | if promptcontextaschat: 479 | chat.append({"question":'"'+promptcontext+'"', "answer":''}) 480 | else: 481 | prompt = promptcontext+"\n" 482 | 483 | #characterpersona = "You are chatting with Bot. Bot is an AI assistant that helps answer your questions." 
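# For reference, with chat history omitted the assembled prompt looks roughly like one
# of these (illustrative; {braces} mark substituted content, and speaker names default
# to "You"/"Friend" outside the chatml/phi3 templates):
#   chatml:  <|im_start|>system\n{character context}<|im_end|>\n<|im_start|>user\n{question}<|im_end|>\n<|im_start|>assistant\n
#   phi3:    <|system|>{character context}<|user|>\n{question}<|end|>\n<|assistant|>\n
#   default: {prompt context}\n{character context}\nYou: {question}\nFriend: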
484 | 485 | #Add Character Context 486 | if characterpersona != "": 487 | prompt += charactername+"'s Persona: "+characterpersona+"\n" 488 | if worldscenario != "": 489 | prompt += "Scenario: "+worldscenario+"\n" 490 | if exampledialogue != "" and not chatml and not phi3: 491 | prompt += ""+"\n"+exampledialogue+"\n" 492 | if (characterpersona != "" or worldscenario != "" or exampledialogue != "") and not chatml and not phi3: 493 | prompt += "" 494 | 495 | if chatml: 496 | prompt += promptend+"\n" 497 | 498 | #Add Chat History to Prompt 499 | for ch in chat: 500 | if ch["question"] != "": 501 | if chatml: 502 | prompt += promptuserstart+yourname+"\n"+ch["question"]+promptend+"\n" 503 | elif phi3: 504 | prompt += promptuserstart+"\n"+ch["question"]+promptend+"\n" 505 | else: 506 | prompt += '\n'+yourname+': '+ch["question"] 507 | if ch["answer"] != "": 508 | if chatml: 509 | prompt += promptassistantstart+charactername+"\n"+ch["answer"]+promptend+"\n" 510 | elif phi3: 511 | prompt += promptassistantstart+"\n"+ch["answer"]+promptend+"\n" 512 | else: 513 | prompt += '\n'+charactername+': '+ch["answer"] 514 | 515 | #Add newest chat to prompt 516 | if chatml: 517 | prompt += promptuserstart+yourname+"\n"+question+promptend+"\n"+promptassistantstart+charactername+"\n" 518 | elif phi3: 519 | prompt += promptuserstart+"\n"+question+promptend+"\n"+promptassistantstart+"\n" 520 | else: 521 | prompt += '\n'+yourname+': ' 522 | prompt += question 523 | prompt += '\n'+charactername+': ' 524 | return prompt 525 | 526 | #openaisendq is the openai api request. 527 | def openaisendq(question): 528 | global chat 529 | 530 | prompt = createprompt(question) 531 | 532 | #Set stopping strings. This tells LLM to stop writing. 533 | stopping_strings = ["\n"+yourname, "\n"+charactername, "", "", "", "<|im_end|>", "<|im_start|>", "<|user|>", "<|end|>", "<|assistant|>"] 534 | 535 | #formatted_prompt = f"{yourname}: {question}\n{charactername}:" 536 | messages = [{"role": "user", "content": prompt}] 537 | response = openai.ChatCompletion.create( 538 | model=openaimodel, 539 | messages=messages, 540 | stop=stopping_strings, 541 | #temperature=0.0 542 | # temperature=0.7, 543 | # rep_pen = 1.18, 544 | # top_p = 1 545 | ) 546 | output = response.choices[0].message["content"] 547 | #Append message to chat history 548 | if caphistory >= 0: 549 | if len(chat) > caphistory: 550 | chat = chat[:0-caphistory] 551 | chat.append({"question":'"'+question+'"', "answer":'"'+output+'"'}) 552 | return output 553 | 554 | #oobasendq is the oobabooga api request. Just enter prompt for the parameter and we get the response back 555 | def oobasendq(question): 556 | global chat 557 | 558 | prompt = createprompt(question) 559 | 560 | #Set stopping strings. This tells LLM to stop writing. 
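# Stopping on the "\n"+yourname and "\n"+charactername prefixes keeps the model from
# continuing the transcript and writing the user's next turn itself.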
561 | stopping_strings = ["\n"+yourname, "\n"+charactername] 562 | 563 | #print(prompt) 564 | #Send the request 565 | data = {"prompt": prompt, "stopping_strings": stopping_strings, "temperature": 0.7, "rep_pen": 1.18, "top_p":1} 566 | response = requests.post('http://127.0.0.1:5000/api/v1/generate', data=json.dumps(data)) 567 | if response.status_code == 200: 568 | 569 | #Get the output from the response 570 | if verbose: 571 | print(response.content) 572 | jsondata = json.loads(response.content.decode('utf-8')) 573 | output = str(jsondata['results'][0]['text']).strip() 574 | 575 | #Append message to chat history 576 | if caphistory >= 0: 577 | if len(chat) > caphistory: 578 | chat = chat[:0-caphistory] 579 | chat.append({"question":'"'+question+'"', "answer":'"'+output+'"'}) 580 | 581 | return output 582 | else: 583 | return "Error" 584 | 585 | 586 | def getaudiovosknew(r, m, wake=False): 587 | global ping 588 | print("New Vosk Recognizer") 589 | text = "" 590 | firstit = True 591 | while text == "": 592 | with m as source: 593 | r.adjust_for_ambient_noise(source) 594 | if firstit: 595 | if (wake and wakeping) or (not wake): 596 | playchime("ping") 597 | firstit = False 598 | print("Listening for Vosk!") 599 | audio = r.listen(source) 600 | try: 601 | text = r.recognize_vosk(audio) 602 | text = text.lower() 603 | except: 604 | print("Failed to recognize") 605 | text = "" 606 | output = json.loads(text)["text"] 607 | print("Detected speech:", output) 608 | return output 609 | 610 | def getaudiogooglenew(r, m, wake=False): 611 | global ping 612 | print("New Google Recognizer") 613 | text = "" 614 | firstit = True 615 | while text == "": 616 | with m as source: 617 | r.adjust_for_ambient_noise(source) 618 | if firstit: 619 | if (wake and wakeping) or (not wake): 620 | playchime("ping") 621 | firstit = False 622 | print("Listening for Google!") 623 | audio = r.listen(source) 624 | try: 625 | text = r.recognize_google(audio) 626 | text = text.lower() 627 | except: 628 | print("Failed to recognize") 629 | text = "" 630 | print("Detected speech:", text) 631 | return text 632 | 633 | #Main function. 
Two different options: whether we wish to use text input or voice 634 | if textinput: 635 | while True: 636 | #get input string 637 | input_string = input(yourname+": ") 638 | combinedprompt = promptcontext+input_string 639 | if promptcontextaschat: 640 | combinedprompt = input_string 641 | start_time = time.time() 642 | 643 | #Send prompt to LLM 644 | if ooba: 645 | out = oobasendq(input_string) 646 | elif openaiapi: 647 | out = openaisendq(input_string) 648 | else: 649 | #Youchat 650 | out = sendq(combinedprompt) 651 | out = re.sub(r'\[.+?\]\(.+?\)', '', out) 652 | 653 | #Print response 654 | print(charactername+":", out) 655 | end_time = time.time() 656 | elapsed_time = end_time - start_time 657 | if verbose: 658 | print("Text-Gen time: ", elapsed_time, "seconds") 659 | 660 | #Clear out string to ensure TTS doesn't crash 661 | out = re.sub("\n", "", out) 662 | out = re.sub("[\"\']", "", out) 663 | out = re.sub("[^\x00-\x7F]+", "", out) 664 | out = re.sub("[<>]", "", out) 665 | out = re.sub("-", " ", out) 666 | if out and out != "": 667 | # Text to speech to a file 668 | #tts.tts_to_file(text=out, file_path="temp.wav") 669 | if ttsengine == "pyttsx3": 670 | engine.save_to_file(out, os.path.join(directory,"temp.wav")) 671 | engine.runAndWait(); 672 | elif ttsengine == "moegoe": 673 | mytts.tts(out, os.path.join(directory,"temp.wav"), voice=int(voice), speed=speed) 674 | elif ttsengine == "xtts": 675 | #xtts.generateSpeech(out, os.path.join(directory,"temp.wav")) 676 | end_time = time.time() 677 | elapsed_time = end_time - start_time 678 | if verbose: 679 | print("Elapsed time: ", elapsed_time, "seconds") 680 | audio = playaudiostream(xttsstream.tts(out, speaker, "en", streamchunks)) 681 | 682 | #Calculate and print time if verbose 683 | 684 | if ttsengine != "xtts": 685 | end_time = time.time() 686 | elapsed_time = end_time - start_time 687 | if verbose: 688 | print("Elapsed time: ", elapsed_time, "seconds") 689 | threadaudio = threading.Thread(target=playaudio) 690 | threadaudio.start() 691 | threadaudio.join() 692 | #playaudio() 693 | #mixer.music.load("temp.wav") 694 | #mixer.music.play() 695 | else: 696 | stop_listening = None 697 | #start microphone recognition 698 | if vosk or googlestt: 699 | r = sr.Recognizer() 700 | m = sr.Microphone() 701 | else: 702 | mic = WhisperMic(model="base.en") 703 | # def callback(recognizer, audio): 704 | # global waketext 705 | # #recognizer.adjust_for_ambient_noise(source) 706 | # try: 707 | # if vosk: 708 | # waketext = recognizer.recognize_vosk(audio) 709 | # waketext = json.loads(waketext)["text"] 710 | # else: 711 | # waketext = recognizer.recognize_google(audio) 712 | # waketext = waketext.lower() 713 | # if verbose: 714 | # print("Wake Word Check: {}".format(waketext)) 715 | # except: 716 | # waketext = "" 717 | # print("Failed to recognize!") 718 | 719 | if vosk or googlestt: 720 | with m as source: 721 | r.adjust_for_ambient_noise(source) 722 | #stop_listening = r.listen_in_background(m, callback) 723 | while True: 724 | 725 | #Listen for Wake Word 726 | # waketext = "" 727 | # if stop_listening: 728 | # stop_listening(wait_for_stop=False) 729 | 730 | waketext = "" 731 | 732 | def listenwake(): 733 | global waketext, r, m 734 | #print(r, m) 735 | if vosk: 736 | waketext = getaudiovosknew(r,m, True) 737 | elif googlestt: 738 | waketext = getaudiogooglenew(r,m, True) 739 | else: 740 | waketext = mic.listen() 741 | waketext = waketext.lower() 742 | print("Detected speech:", waketext) 743 | if wakeping: 744 | playchime("ping") 745 | 746 | if 
alwayslisten == True: 747 | while waketext == "": 748 | listenwake() 749 | continue 750 | textg = waketext 751 | if wakeping: 752 | playchime("pong") 753 | else: 754 | while wakeword not in waketext: 755 | listenwake() 756 | continue 757 | if wakeprompt: 758 | textg = waketext 759 | else: 760 | waketext = "" 761 | 762 | #stop_listening(wait_for_stop=False) 763 | #---------------------------------------- 764 | 765 | 766 | #Listen for Prompt 767 | if alwayslisten == False and wakeprompt == False: 768 | if vosk: 769 | textg = getaudiovosknew(r,m) 770 | elif googlestt: 771 | textg = getaudiogooglenew(r,m) 772 | else: 773 | textg = mic.listen() 774 | if wakeping: 775 | playchime("pong") 776 | #---------------------- 777 | 778 | #Send prompt to youchat and print output 779 | print(yourname+":", textg) 780 | start_time = time.time() 781 | combinedprompt = promptcontext+textg 782 | if promptcontextaschat: 783 | combinedprompt = textg 784 | if ooba: 785 | out = oobasendq(textg) 786 | elif openaiapi: 787 | out = openaisendq(textg) 788 | else: 789 | #Youchat 790 | out = sendq(combinedprompt) 791 | out = re.sub(r'\[.+?\]\(.+?\)', '', out) 792 | print(charactername+":", out) 793 | #---------------------- 794 | 795 | #TTS Response 796 | #mixer.music.unload() 797 | #Clear out string to ensure TTS doesn't crash 798 | out = re.sub("\n", "", out) 799 | out = re.sub("\"", "", out) 800 | out = re.sub("[^\x00-\x7F]+", "", out) 801 | out = re.sub("[<>]", "", out) 802 | out = re.sub("-", " ", out) 803 | if out and out != "": 804 | if ttsengine == "pyttsx3": 805 | engine.save_to_file(out, os.path.join(directory,"temp.wav")) 806 | engine.runAndWait(); 807 | elif ttsengine == "moegoe": 808 | mytts.tts(out, os.path.join(directory,"temp.wav"), voice=int(voice), speed=speed) 809 | elif ttsengine == "xtts": 810 | #xtts.generateSpeech(out, os.path.join(directory,"temp.wav")) 811 | end_time = time.time() 812 | elapsed_time = end_time - start_time 813 | if verbose: 814 | print("Elapsed time: ", elapsed_time, "seconds") 815 | audio = playaudiostream(xttsstream.tts(out, speaker, "en", streamchunks)) 816 | 817 | #Calculate and print time if verbose 818 | if ttsengine != "xtts": 819 | end_time = time.time() 820 | elapsed_time = end_time - start_time 821 | if verbose: 822 | print("Elapsed time: ", elapsed_time, "seconds") 823 | threadaudio = threading.Thread(target=playaudio) 824 | threadaudio.start() 825 | threadaudio.join() 826 | #playaudio() 827 | #------------------- --------------------------------------------------------------------------------
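Example invocations (illustrative; the flags come from the option parsing in launch.py above, and values such as the character file and model names are placeholders):

    python launch.py --bootmsg="Hello there" --pc="You are a helpful assistant."
    python launch.py --voiceinput --wakeword="computer" --moegoe --mgmodel=g --voice=0
    python launch.py --ooba --chara=character.png --xtts --voice=en_sample --template=chatml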