├── .gitattributes ├── model ├── kws │ └── README.md └── tts │ └── README.md ├── run.bat ├── HoshiNoYume ├── test.py ├── perception │ ├── __init__.py │ ├── text_input.py │ └── auditory.py ├── actions │ ├── MoeGoe │ │ ├── __init__.py │ │ ├── requirements.txt │ │ ├── text │ │ │ ├── __init__.py │ │ │ ├── LICENSE │ │ │ ├── thai.py │ │ │ ├── ngu_dialect.py │ │ │ ├── sanskrit.py │ │ │ ├── cantonese.py │ │ │ ├── shanghainese.py │ │ │ ├── japanese.py │ │ │ ├── english.py │ │ │ ├── cleaners.py │ │ │ ├── korean.py │ │ │ └── mandarin.py │ │ ├── LICENSE │ │ ├── README.md │ │ ├── utils.py │ │ ├── .gitattributes │ │ ├── commons.py │ │ ├── mel_processing.py │ │ ├── hubert_model.py │ │ ├── .gitignore │ │ ├── transforms.py │ │ ├── MoeGoe.py │ │ ├── attentions.py │ │ ├── modules.py │ │ └── models.py │ ├── __init__.py │ ├── Live2D.py │ ├── IoT_control.py │ ├── interact.py │ ├── search.py │ └── speaking.py ├── memory │ ├── __init__.py │ ├── short_term_memory.py │ ├── long_summary_memory.txt │ ├── long_term_memory.py │ └── prompts.py ├── thinking │ ├── __init__.py │ ├── agent_interact.py │ ├── agent_search.py │ ├── chat.py │ └── prompts.py ├── tools │ ├── __init__.py │ ├── system_control.py │ └── translate.py ├── main_min.py ├── main.py └── api_key_sample.py ├── requirements.txt ├── README.md ├── LICENSE └── .gitignore /.gitattributes: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /model/kws/README.md: -------------------------------------------------------------------------------- 1 | # kws 模型文件 2 | 3 | 请将你的 `KWS` 模型文件放在此处。 -------------------------------------------------------------------------------- /model/tts/README.md: -------------------------------------------------------------------------------- 1 | # VITS 模型文件 2 | 3 | 请将你的 `VITS` 模型文件放在此处。 -------------------------------------------------------------------------------- /run.bat: -------------------------------------------------------------------------------- 1 | @echo off 2 | call yume_env\Scripts\activate 3 | python HoshiNoYume\main.py -------------------------------------------------------------------------------- /HoshiNoYume/test.py: -------------------------------------------------------------------------------- 1 | from actions.Live2D import live2d_open 2 | import time 3 | 4 | live2d_open() 5 | while True: 6 | time.sleep(0.2) -------------------------------------------------------------------------------- /HoshiNoYume/perception/__init__.py: -------------------------------------------------------------------------------- 1 | from perception.text_input import text_input 2 | from perception.auditory import listen 3 | 4 | __all__ = [ 5 | "text_input", 6 | "listen", 7 | ] -------------------------------------------------------------------------------- /HoshiNoYume/actions/MoeGoe/__init__.py: -------------------------------------------------------------------------------- 1 | from .mel_processing import * 2 | from .MoeGoe import * 3 | from .utils import * 4 | from .commons import * 5 | from text import text_to_sequence, _clean_text 6 | -------------------------------------------------------------------------------- /HoshiNoYume/memory/__init__.py: -------------------------------------------------------------------------------- 1 | from memory.long_term_memory import long_memory 2 | from memory.short_term_memory import short_memory 3 | 4 | __all__ = [ 5 | "long_memory", 6 | "short_memory" 7 | ] 
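A note on the MoeGoe package shown above: `actions/MoeGoe/__init__.py` mixes relative imports with a top-level `from text import text_to_sequence, _clean_text`, so the `MoeGoe` directory itself has to be on `sys.path` before anything from it is imported (this is what `actions/speaking.py` later does with `sys.path.append` before `from MoeGoe import *`). A minimal sketch of that assumption, with the path written relative to the repository root as `run.bat` launches it:

```python
import os
import sys

# Assumes the working directory is the repository root, as when run.bat is used.
# Appending the MoeGoe directory lets `text`, `utils`, `commons`, ... resolve as
# top-level modules, which is what MoeGoe's own absolute imports expect.
sys.path.append(os.path.join("HoshiNoYume", "actions", "MoeGoe"))

from text import text_to_sequence  # resolves to actions/MoeGoe/text/__init__.py
```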
-------------------------------------------------------------------------------- /HoshiNoYume/actions/__init__.py: -------------------------------------------------------------------------------- 1 | from actions.Live2D import live2d_open 2 | from actions.IoT_control import mqtt_connect 3 | from actions.Live2D import socket_init 4 | 5 | __all__ = [ 6 | "live2d_open", 7 | "mqtt_connect", 8 | "socket_init", 9 | ] 10 | -------------------------------------------------------------------------------- /HoshiNoYume/thinking/__init__.py: -------------------------------------------------------------------------------- 1 | from thinking.agent_search import agent_search 2 | from thinking.chat import chat 3 | from thinking.agent_interact import agent_interact 4 | 5 | __all__ = [ 6 | "agent_search", 7 | "chat", 8 | "agent_interact" 9 | ] 10 | -------------------------------------------------------------------------------- /HoshiNoYume/actions/MoeGoe/requirements.txt: -------------------------------------------------------------------------------- 1 | numba 2 | librosa 3 | numpy==1.23.3 4 | scipy 5 | torch 6 | unidecode 7 | openjtalk>=0.3.0.dev2 8 | jamo 9 | pypinyin 10 | jieba 11 | protobuf 12 | cn2an 13 | inflect 14 | eng_to_ipa 15 | ko_pron 16 | indic_transliteration 17 | num_thai 18 | opencc 19 | audonnx 20 | -------------------------------------------------------------------------------- /HoshiNoYume/tools/__init__.py: -------------------------------------------------------------------------------- 1 | from tools.system_control import keyword_wake_up 2 | from tools.system_control import press_key_wake_up 3 | from tools.system_control import print_device_info 4 | from tools.translate import text2text_translate 5 | 6 | __all__ = [ 7 | "keyword_wake_up", 8 | "press_key_wake_up", 9 | "print_device_info", 10 | "text2text_translate", 11 | ] 12 | -------------------------------------------------------------------------------- /HoshiNoYume/memory/short_term_memory.py: -------------------------------------------------------------------------------- 1 | from langchain.memory import ChatMessageHistory 2 | 3 | 4 | class ChatShortMemory(ChatMessageHistory): 5 | 6 | def window_buffer_message(self, round: int): 7 | if len(self.messages) < round * 2: 8 | return self.messages 9 | else: 10 | return self.messages[len(self.messages) - round * 2:] 11 | 12 | short_memory = ChatShortMemory() 13 | -------------------------------------------------------------------------------- /HoshiNoYume/perception/text_input.py: -------------------------------------------------------------------------------- 1 | from api_key import user_name 2 | import time 3 | 4 | def text_input(): 5 | user_input = input(user_name + ": ") 6 | # 加上时间戳 7 | current_time = time.time() 8 | local_time = time.localtime(current_time) 9 | formatted_time = time.strftime("%Y-%m-%d %H:%M", local_time) 10 | 11 | user_input = f'({formatted_time})' + user_input 12 | 13 | return user_input -------------------------------------------------------------------------------- /HoshiNoYume/main_min.py: -------------------------------------------------------------------------------- 1 | from api_key import * 2 | import perception 3 | import thinking 4 | import memory 5 | 6 | def main(): 7 | while True: 8 | user_words = perception.text_input() #文字输入 9 | memory.short_memory.add_user_message(user_words) 10 | response = thinking.chat(memory.short_memory) 11 | memory.short_memory.add_ai_message(response) 12 | 13 | if __name__ == '__main__': 14 | main() 
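`ChatShortMemory.window_buffer_message` above returns only the most recent `round` user/AI message pairs, leaving older messages stored but outside the returned window. A short usage sketch (the window size of 3 and the sample messages are arbitrary examples; the timestamp prefix mirrors what `text_input` prepends):

```python
from memory import short_memory

short_memory.add_user_message("(2023-05-01 12:00)你好呀")
short_memory.add_ai_message("你好!最近过得怎么样?")

# Keep only the last 3 user/AI exchanges; older messages stay in
# short_memory.messages but are excluded from the returned window.
recent = short_memory.window_buffer_message(3)
```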
-------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numba 2 | librosa 3 | numpy==1.23.3 4 | scipy 5 | torch 6 | unidecode 7 | openjtalk>=0.3.0.dev2 8 | jamo 9 | pypinyin 10 | jieba 11 | protobuf 12 | cn2an 13 | inflect 14 | eng_to_ipa 15 | ko_pron 16 | indic_transliteration 17 | num_thai 18 | opencc 19 | audonnx 20 | openai 21 | tencentcloud-sdk-python 22 | pyaudio 23 | simpleaudio 24 | pydub 25 | webrtcvad 26 | asyncio 27 | aiohttp 28 | pvporcupine 29 | paho-mqtt 30 | langchain 31 | pinecone-client 32 | google-api-python-client 33 | keyboard 34 | azure-cognitiveservices-speech 35 | requests 36 | clueai -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # CyberKaNoJo——星野夢 2 | 3 | ## 项目介绍 4 | 5 | CyberKaNoJo(星野夢)是一个正在开发中的项目。由于项目尚在开发阶段,我目前不打算编写完整的`README`文件。但是,您可以通过以下链接查看当前版本下的项目具体使用方法: 6 | 7 | [https://xuanxuanqaq.top/hoshinoyume-v1_1/](https://xuanxuanqaq.top/hoshinoyume-v1_1/) 8 | 9 | ## 注意事项 10 | 11 | - 请注意,此项目仍在开发中,功能可能不稳定或不完整。在使用过程中如遇到问题,请及时向我反馈。 12 | 13 | ## 反馈与建议 14 | 15 | 如果您在使用过程中遇到问题或有任何建议,请通过以下途径与我联系: 16 | 17 | - [GitHub Issues](https://github.com/yourusername/CyberKaNoJo/issues) 18 | - QQ:903166538 19 | 20 | ## 许可证 21 | 22 | 本项目采用[MIT许可证](LICENSE)。请查阅许可证文件了解详细信息。 23 | 24 | 感谢您对CyberKaNoJo(星野夢)项目的关注! -------------------------------------------------------------------------------- /HoshiNoYume/actions/Live2D.py: -------------------------------------------------------------------------------- 1 | import socket 2 | import subprocess 3 | 4 | def socket_init(): 5 | host = '127.0.0.1' 6 | port = 12345 7 | 8 | server_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM) 9 | server_socket.bind((host, port)) 10 | server_socket.listen(1) 11 | 12 | print(f"服务端socket地址 {host}:{port}") 13 | global conn 14 | conn, addr = server_socket.accept() 15 | 16 | print(f"连接到了Live2D客户端: {addr}") 17 | 18 | 19 | def socket_send(message): 20 | conn.send(message.to_bytes(4, 'big', signed=True)) 21 | 22 | def socket_close(): 23 | conn.close() 24 | 25 | def live2d_open(): 26 | exe_path = "model/live2d/Live2D.exe" 27 | subprocess.Popen(exe_path) -------------------------------------------------------------------------------- /HoshiNoYume/actions/IoT_control.py: -------------------------------------------------------------------------------- 1 | import paho.mqtt.client as mqtt 2 | from api_key import * 3 | import json 4 | import threading 5 | 6 | topic = "/moon_light" 7 | light_property = { 8 | "switch": "", 9 | "color": "", 10 | } 11 | client = None 12 | 13 | def mqtt_connect(): 14 | def on_connect(client, userdata, flags, rc): 15 | if rc == 0: 16 | print("连接上MQTT broker了喵~") 17 | client.subscribe(topic) 18 | 19 | # 创建mqtt实例 20 | global client 21 | client = mqtt.Client() 22 | # 绑定连接服务器上时的回调函数 23 | client.on_connect = on_connect 24 | # 连接broker 25 | client.connect(mqtt_broker, mqtt_port) 26 | client.loop_forever() 27 | 28 | 29 | def mqtt_publish(publish_message: dict[str, str]): 30 | publish_message = json.dumps(publish_message) 31 | client.publish(topic, publish_message) 32 | 33 | 34 | if IoT_enabled: 35 | thread_mqtt = threading.Thread(target=mqtt_connect) # 初始化MQTT 36 | thread_mqtt.start() -------------------------------------------------------------------------------- /HoshiNoYume/memory/long_summary_memory.txt: 
-------------------------------------------------------------------------------- 1 | 2 | Hoshino Ai is a popular idol from the B-Komachi group, affiliated with Strawberry Productions. She's known for her acting, singing, and dancing abilities, and has a great memory. She recently spends a lot of time with children, who she finds very cute, and enjoys being with them. She asked the other person if they had done anything fun recently, and they said they would talk about it next time. The other person asked about the weather, and Hoshino Ai reported that it was cloudy, 19 degrees, with a west wind of 3 or less and a humidity of 38%. They then asked about the death of Abe Shinzo, to which Hoshino Ai replied that he had died on July 8th, 2022. They then asked about fun activities in the area, to which Hoshino Ai suggested the West Anli Tech University campus, with its gym, pool, and movie theater, as well as the many delicious food places in the city. They asked who Hoshino Ai was, and she introduced herself as an idol from the B-Komachi group. They said they had no other questions, and Hoshino Ai said they could talk again if they thought of anything else. -------------------------------------------------------------------------------- /HoshiNoYume/actions/MoeGoe/text/__init__.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/keithito/tacotron """ 2 | from text import cleaners 3 | 4 | 5 | def text_to_sequence(text, symbols, cleaner_names): 6 | '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text. 7 | Args: 8 | text: string to convert to a sequence 9 | cleaner_names: names of the cleaner functions to run the text through 10 | Returns: 11 | List of integers corresponding to the symbols in the text 12 | ''' 13 | _symbol_to_id = {s: i for i, s in enumerate(symbols)} 14 | 15 | sequence = [] 16 | 17 | clean_text = _clean_text(text, cleaner_names) 18 | for symbol in clean_text: 19 | if symbol not in _symbol_to_id.keys(): 20 | continue 21 | symbol_id = _symbol_to_id[symbol] 22 | sequence += [symbol_id] 23 | return sequence 24 | 25 | 26 | def _clean_text(text, cleaner_names): 27 | for name in cleaner_names: 28 | cleaner = getattr(cleaners, name) 29 | if not cleaner: 30 | raise Exception('Unknown cleaner: %s' % name) 31 | text = cleaner(text) 32 | return text 33 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 xuanxuanQAQ 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /HoshiNoYume/actions/MoeGoe/text/LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2017 Keith Ito 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in 11 | all copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 | THE SOFTWARE. 20 | -------------------------------------------------------------------------------- /HoshiNoYume/actions/MoeGoe/text/thai.py: -------------------------------------------------------------------------------- 1 | import re 2 | from num_thai.thainumbers import NumThai 3 | 4 | 5 | num = NumThai() 6 | 7 | # List of (Latin alphabet, Thai) pairs: 8 | _latin_to_thai = [(re.compile('%s' % x[0], re.IGNORECASE), x[1]) for x in [ 9 | ('a', 'เอ'), 10 | ('b','บี'), 11 | ('c','ซี'), 12 | ('d','ดี'), 13 | ('e','อี'), 14 | ('f','เอฟ'), 15 | ('g','จี'), 16 | ('h','เอช'), 17 | ('i','ไอ'), 18 | ('j','เจ'), 19 | ('k','เค'), 20 | ('l','แอล'), 21 | ('m','เอ็ม'), 22 | ('n','เอ็น'), 23 | ('o','โอ'), 24 | ('p','พี'), 25 | ('q','คิว'), 26 | ('r','แอร์'), 27 | ('s','เอส'), 28 | ('t','ที'), 29 | ('u','ยู'), 30 | ('v','วี'), 31 | ('w','ดับเบิลยู'), 32 | ('x','เอ็กซ์'), 33 | ('y','วาย'), 34 | ('z','ซี') 35 | ]] 36 | 37 | 38 | def num_to_thai(text): 39 | return re.sub(r'(?:\d+(?:,?\d+)?)+(?:\.\d+(?:,?\d+)?)?', lambda x: ''.join(num.NumberToTextThai(float(x.group(0).replace(',', '')))), text) 40 | 41 | def latin_to_thai(text): 42 | for regex, replacement in _latin_to_thai: 43 | text = re.sub(regex, replacement, text) 44 | return text 45 | -------------------------------------------------------------------------------- /HoshiNoYume/actions/MoeGoe/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 CjangCjengh 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following 
conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /HoshiNoYume/actions/MoeGoe/text/ngu_dialect.py: -------------------------------------------------------------------------------- 1 | import re 2 | import opencc 3 | 4 | 5 | dialects = {'SZ': 'suzhou', 'WX': 'wuxi', 'CZ': 'changzhou', 'HZ': 'hangzhou', 6 | 'SX': 'shaoxing', 'NB': 'ningbo', 'JJ': 'jingjiang', 'YX': 'yixing', 7 | 'JD': 'jiading', 'ZR': 'zhenru', 'PH': 'pinghu', 'TX': 'tongxiang', 8 | 'JS': 'jiashan', 'HN': 'xiashi', 'LP': 'linping', 'XS': 'xiaoshan', 9 | 'FY': 'fuyang', 'RA': 'ruao', 'CX': 'cixi', 'SM': 'sanmen', 10 | 'TT': 'tiantai', 'WZ': 'wenzhou', 'SC': 'suichang', 'YB': 'youbu'} 11 | 12 | converters = {} 13 | 14 | for dialect in dialects.values(): 15 | try: 16 | converters[dialect] = opencc.OpenCC(dialect) 17 | except: 18 | pass 19 | 20 | 21 | def ngu_dialect_to_ipa(text, dialect): 22 | dialect = dialects[dialect] 23 | text = converters[dialect].convert(text).replace('-','').replace('$',' ') 24 | text = re.sub(r'[、;:]', ',', text) 25 | text = re.sub(r'\s*,\s*', ', ', text) 26 | text = re.sub(r'\s*。\s*', '. ', text) 27 | text = re.sub(r'\s*?\s*', '? ', text) 28 | text = re.sub(r'\s*!\s*', '! 
', text) 29 | text = re.sub(r'\s*$', '', text) 30 | return text 31 | -------------------------------------------------------------------------------- /HoshiNoYume/main.py: -------------------------------------------------------------------------------- 1 | from api_key import * 2 | import perception 3 | import thinking 4 | import memory 5 | import tools 6 | import actions 7 | import threading 8 | import re 9 | 10 | # 初始化 11 | def init(): 12 | tools.print_device_info() 13 | if Live2D_enabled: 14 | thread_socket = threading.Thread(target=actions.socket_init) # 初始化Live2D的socket 15 | thread_socket.start() 16 | actions.live2d_open() 17 | thread_socket.join() 18 | tools.press_key_wake_up() 19 | 20 | # 结束对话 21 | def conv_end(): 22 | # 整理此次对话 23 | memory.long_memory.summary_write(memory.short_memory) 24 | memory.long_memory.short_memory_vector_write(memory.short_memory) 25 | # 等待开启下次对话 26 | tools.press_key_wake_up() 27 | 28 | def main(): 29 | init() 30 | while True: 31 | user_words = perception.text_input() #文字输入 32 | # user_words = perception.listen() #语音输入 33 | search_info = thinking.agent_search(user_words) 34 | memory.short_memory.add_user_message(user_words) 35 | response = thinking.chat(memory.short_memory, memory.long_memory, search_info) 36 | memory.short_memory.add_ai_message(response) 37 | 38 | interact = re.search(r'#interact:\s*(.*?)\)', response) 39 | if interact is not None and interact.group(1) == "end": 40 | conv_end() 41 | elif interact is not None: 42 | thinking.agent_interact(interact.group(1)) 43 | 44 | if __name__ == '__main__': 45 | main() -------------------------------------------------------------------------------- /HoshiNoYume/actions/interact.py: -------------------------------------------------------------------------------- 1 | from actions.IoT_control import mqtt_publish 2 | from langchain.agents import Tool 3 | from api_key import * 4 | from tools.system_control import press_key_wake_up 5 | import memory 6 | 7 | def light_handle(instruction): 8 | print("少女行动中...") 9 | if "on" in instruction: 10 | message = {"switch": "light on"} 11 | elif "off" in instruction: 12 | message = {"switch": "light off"} 13 | mqtt_publish(message) 14 | 15 | def end_talk(_): 16 | print("结束对话捏...") 17 | memory.long_memory.summary_write(memory.short_memory) 18 | memory.long_memory.short_memory_vector_write(memory.short_memory) 19 | press_key_wake_up() 20 | 21 | def just_chat(_): 22 | return "chat" 23 | 24 | 25 | # 操作工具列表 26 | interact_tools = [ 27 | Tool( 28 | name = "Light Handle", 29 | func=light_handle, 30 | description="Use this to control the light, input 'on' to turn on the light, and input 'off' to turn off the light.", 31 | return_direct=True 32 | ), 33 | Tool( 34 | name = "end conversation", 35 | func=end_talk, 36 | description="If you think it's time to end conversation, use this.", 37 | return_direct=True 38 | ), 39 | Tool( 40 | name = "Chat", 41 | func=just_chat, 42 | description="If you think I'm not asking a question or you don't need to use other tools or i'm instruct you to do something, take this", 43 | return_direct=True 44 | ) 45 | ] -------------------------------------------------------------------------------- /HoshiNoYume/actions/MoeGoe/README.md: -------------------------------------------------------------------------------- 1 | # Links 2 | - [MoeGoe_GUI](https://github.com/CjangCjengh/MoeGoe_GUI) 3 | - [Pretrained models](https://github.com/CjangCjengh/TTSModels) 4 | 5 | # How to use 6 | Run MoeGoe.exe 7 | ``` 8 | Path of a VITS model: path\to\model.pth 9 | Path of a config file: path\to\config.json 10 | INFO:root:Loaded
checkpoint 'path\to\model.pth' (iteration XXX) 11 | ``` 12 | ## Text to speech 13 | ``` 14 | TTS or VC? (t/v):t 15 | Text to read: こんにちは。 16 | ID Speaker 17 | 0 XXXX 18 | 1 XXXX 19 | 2 XXXX 20 | Speaker ID: 0 21 | Path to save: path\to\demo.wav 22 | Successfully saved! 23 | ``` 24 | ## Voice conversion 25 | ``` 26 | TTS or VC? (t/v):v 27 | Path of an audio file to convert: 28 | path\to\origin.wav 29 | ID Speaker 30 | 0 XXXX 31 | 1 XXXX 32 | 2 XXXX 33 | Original speaker ID: 0 34 | Target speaker ID: 6 35 | Path to save: path\to\demo.wav 36 | Successfully saved! 37 | ``` 38 | ## HuBERT-VITS 39 | ``` 40 | Path of a hubert-soft model: path\to\hubert-soft.pt 41 | Path of an audio file to convert: 42 | path\to\origin.wav 43 | ID Speaker 44 | 0 XXXX 45 | 1 XXXX 46 | 2 XXXX 47 | Target speaker ID: 6 48 | Path to save: path\to\demo.wav 49 | Successfully saved! 50 | ``` 51 | ## W2V2-VITS 52 | ``` 53 | Path of a w2v2 dimensional emotion model: path\to\model.onnx 54 | TTS or VC? (t/v):t 55 | Text to read: こんにちは。 56 | ID Speaker 57 | 0 XXXX 58 | 1 XXXX 59 | 2 XXXX 60 | Speaker ID: 0 61 | Path of an emotion reference: path\to\reference.wav 62 | Path to save: path\to\demo.wav 63 | Successfully saved! 64 | ``` 65 | -------------------------------------------------------------------------------- /HoshiNoYume/actions/MoeGoe/text/sanskrit.py: -------------------------------------------------------------------------------- 1 | import re 2 | from indic_transliteration import sanscript 3 | 4 | 5 | # List of (iast, ipa) pairs: 6 | _iast_to_ipa = [(re.compile('%s' % x[0]), x[1]) for x in [ 7 | ('a', 'ə'), 8 | ('ā', 'aː'), 9 | ('ī', 'iː'), 10 | ('ū', 'uː'), 11 | ('ṛ', 'ɹ`'), 12 | ('ṝ', 'ɹ`ː'), 13 | ('ḷ', 'l`'), 14 | ('ḹ', 'l`ː'), 15 | ('e', 'eː'), 16 | ('o', 'oː'), 17 | ('k', 'k⁼'), 18 | ('k⁼h', 'kʰ'), 19 | ('g', 'g⁼'), 20 | ('g⁼h', 'gʰ'), 21 | ('ṅ', 'ŋ'), 22 | ('c', 'ʧ⁼'), 23 | ('ʧ⁼h', 'ʧʰ'), 24 | ('j', 'ʥ⁼'), 25 | ('ʥ⁼h', 'ʥʰ'), 26 | ('ñ', 'n^'), 27 | ('ṭ', 't`⁼'), 28 | ('t`⁼h', 't`ʰ'), 29 | ('ḍ', 'd`⁼'), 30 | ('d`⁼h', 'd`ʰ'), 31 | ('ṇ', 'n`'), 32 | ('t', 't⁼'), 33 | ('t⁼h', 'tʰ'), 34 | ('d', 'd⁼'), 35 | ('d⁼h', 'dʰ'), 36 | ('p', 'p⁼'), 37 | ('p⁼h', 'pʰ'), 38 | ('b', 'b⁼'), 39 | ('b⁼h', 'bʰ'), 40 | ('y', 'j'), 41 | ('ś', 'ʃ'), 42 | ('ṣ', 's`'), 43 | ('r', 'ɾ'), 44 | ('l̤', 'l`'), 45 | ('h', 'ɦ'), 46 | ("'", ''), 47 | ('~', '^'), 48 | ('ṃ', '^') 49 | ]] 50 | 51 | 52 | def devanagari_to_ipa(text): 53 | text = text.replace('ॐ', 'ओम्') 54 | text = re.sub(r'\s*।\s*$', '.', text) 55 | text = re.sub(r'\s*।\s*', ', ', text) 56 | text = re.sub(r'\s*॥', '.', text) 57 | text = sanscript.transliterate(text, sanscript.DEVANAGARI, sanscript.IAST) 58 | for regex, replacement in _iast_to_ipa: 59 | text = re.sub(regex, replacement, text) 60 | text = re.sub('(.)[`ː]*ḥ', lambda x: x.group(0) 61 | [:-1]+'h'+x.group(1)+'*', text) 62 | return text 63 | -------------------------------------------------------------------------------- /HoshiNoYume/actions/MoeGoe/text/cantonese.py: -------------------------------------------------------------------------------- 1 | import re 2 | import cn2an 3 | import opencc 4 | 5 | 6 | converter = opencc.OpenCC('jyutjyu') 7 | 8 | # List of (Latin alphabet, ipa) pairs: 9 | _latin_to_ipa = [(re.compile('%s' % x[0]), x[1]) for x in [ 10 | ('A', 'ei˥'), 11 | ('B', 'biː˥'), 12 | ('C', 'siː˥'), 13 | ('D', 'tiː˥'), 14 | ('E', 'iː˥'), 15 | ('F', 'e˥fuː˨˩'), 16 | ('G', 'tsiː˥'), 17 | ('H', 'ɪk̚˥tsʰyː˨˩'), 18 | ('I', 'ɐi˥'), 19 | ('J', 'tsei˥'), 20 | ('K', 'kʰei˥'), 21 | ('L', 'e˥llou˨˩'), 22 | ('M', 'ɛːm˥'), 23 
| ('N', 'ɛːn˥'), 24 | ('O', 'ou˥'), 25 | ('P', 'pʰiː˥'), 26 | ('Q', 'kʰiːu˥'), 27 | ('R', 'aː˥lou˨˩'), 28 | ('S', 'ɛː˥siː˨˩'), 29 | ('T', 'tʰiː˥'), 30 | ('U', 'juː˥'), 31 | ('V', 'wiː˥'), 32 | ('W', 'tʊk̚˥piː˥juː˥'), 33 | ('X', 'ɪk̚˥siː˨˩'), 34 | ('Y', 'waːi˥'), 35 | ('Z', 'iː˨sɛːt̚˥') 36 | ]] 37 | 38 | 39 | def number_to_cantonese(text): 40 | return re.sub(r'\d+(?:\.?\d+)?', lambda x: cn2an.an2cn(x.group()), text) 41 | 42 | 43 | def latin_to_ipa(text): 44 | for regex, replacement in _latin_to_ipa: 45 | text = re.sub(regex, replacement, text) 46 | return text 47 | 48 | 49 | def cantonese_to_ipa(text): 50 | text = number_to_cantonese(text.upper()) 51 | text = converter.convert(text).replace('-','').replace('$',' ') 52 | text = re.sub(r'[A-Z]', lambda x: latin_to_ipa(x.group())+' ', text) 53 | text = re.sub(r'[、;:]', ',', text) 54 | text = re.sub(r'\s*,\s*', ', ', text) 55 | text = re.sub(r'\s*。\s*', '. ', text) 56 | text = re.sub(r'\s*?\s*', '? ', text) 57 | text = re.sub(r'\s*!\s*', '! ', text) 58 | text = re.sub(r'\s*$', '', text) 59 | return text 60 | -------------------------------------------------------------------------------- /HoshiNoYume/tools/system_control.py: -------------------------------------------------------------------------------- 1 | import pvporcupine 2 | import pyaudio 3 | import struct 4 | import torch 5 | from api_key import * 6 | import keyboard 7 | 8 | # 打印设备信息 9 | def print_device_info(): 10 | print("device info:") 11 | if torch.cuda.is_available(): 12 | print("cuda is available") 13 | print("GPU device name:", torch.cuda.get_device_name(0)) 14 | print("cudnn version:", torch.backends.cudnn.version()) 15 | else: 16 | print("cuda is not available") 17 | 18 | 19 | # 进入休眠,关键词唤醒 20 | def keyword_wake_up(): 21 | porcupine = pvporcupine.create( 22 | access_key=porcupine_key, 23 | keyword_paths=[porcupine_model] 24 | ) 25 | # 开启录音流 26 | kws_audio = pyaudio.PyAudio() 27 | audio_stream = kws_audio.open( 28 | rate=porcupine.sample_rate, 29 | channels=1, 30 | format=pyaudio.paInt16, 31 | input=True, 32 | frames_per_buffer=porcupine.frame_length, 33 | input_device_index=None, 34 | ) 35 | print("等待唤醒中,唤醒词:hey dream...") 36 | 37 | def get_next_audio_frame(): 38 | pcm = audio_stream.read(porcupine.frame_length) 39 | pcm = struct.unpack_from("h" * porcupine.frame_length, pcm) 40 | return pcm 41 | try: 42 | while True: 43 | audio_frame = get_next_audio_frame() 44 | keyword_index = porcupine.process(audio_frame) 45 | if keyword_index == 0: 46 | print("唤醒了捏!") 47 | break 48 | finally: 49 | audio_stream.stop_stream() 50 | audio_stream.close() 51 | porcupine.delete() 52 | kws_audio.terminate() 53 | 54 | def press_key_wake_up(): 55 | print("按任意键唤醒...") 56 | keyboard.read_event() 57 | print("唤醒了捏!") 58 | -------------------------------------------------------------------------------- /HoshiNoYume/tools/translate.py: -------------------------------------------------------------------------------- 1 | import hashlib 2 | import uuid 3 | import time 4 | import requests 5 | import json 6 | from api_key import * 7 | 8 | 9 | # 文本翻译 10 | def text2text_translate(words, model="youdao",src_lang="ja",target_lang="zh-CHS"): 11 | if model == "youdao": 12 | def encrypt(signStr): 13 | hash_algorithm = hashlib.sha256() 14 | hash_algorithm.update(signStr.encode('utf-8')) 15 | return hash_algorithm.hexdigest() 16 | 17 | def truncate(q): 18 | if q is None: 19 | return None 20 | size = len(q) 21 | return q if size <= 20 else q[0:10] + str(size) + q[size - 10:size] 22 | 23 | def do_request(data): 24 | 
youdao_url = 'https://openapi.youdao.com/api' 25 | headers = {'Content-Type': 'application/x-www-form-urlencoded'} 26 | return requests.post(youdao_url, data=data, headers=headers) 27 | q = words 28 | data = {} 29 | data['from'] = src_lang # 翻译源语言 30 | data['to'] = target_lang # 翻译目标语言 31 | data['signType'] = 'v3' 32 | curtime = str(int(time.time())) 33 | data['curtime'] = curtime # 时间戳 34 | salt = str(uuid.uuid1()) 35 | signStr = youdao_Id + truncate(q) + salt + curtime + youdao_key 36 | sign = encrypt(signStr) 37 | data['appKey'] = youdao_Id # 应用ID 38 | data['q'] = q # 翻译语句 39 | data['salt'] = salt 40 | data['sign'] = sign 41 | response = do_request(data) 42 | 43 | # 回复解码 44 | json_data = response.content.decode('utf-8') 45 | data = json.loads(json_data) 46 | translation = data['translation'] 47 | 48 | return translation[0] 49 | -------------------------------------------------------------------------------- /HoshiNoYume/actions/MoeGoe/text/shanghainese.py: -------------------------------------------------------------------------------- 1 | import re 2 | import cn2an 3 | import opencc 4 | 5 | 6 | converter = opencc.OpenCC('zaonhe') 7 | 8 | # List of (Latin alphabet, ipa) pairs: 9 | _latin_to_ipa = [(re.compile('%s' % x[0]), x[1]) for x in [ 10 | ('A', 'ᴇ'), 11 | ('B', 'bi'), 12 | ('C', 'si'), 13 | ('D', 'di'), 14 | ('E', 'i'), 15 | ('F', 'ᴇf'), 16 | ('G', 'dʑi'), 17 | ('H', 'ᴇtɕʰ'), 18 | ('I', 'ᴀi'), 19 | ('J', 'dʑᴇ'), 20 | ('K', 'kʰᴇ'), 21 | ('L', 'ᴇl'), 22 | ('M', 'ᴇm'), 23 | ('N', 'ᴇn'), 24 | ('O', 'o'), 25 | ('P', 'pʰi'), 26 | ('Q', 'kʰiu'), 27 | ('R', 'ᴀl'), 28 | ('S', 'ᴇs'), 29 | ('T', 'tʰi'), 30 | ('U', 'ɦiu'), 31 | ('V', 'vi'), 32 | ('W', 'dᴀbɤliu'), 33 | ('X', 'ᴇks'), 34 | ('Y', 'uᴀi'), 35 | ('Z', 'zᴇ') 36 | ]] 37 | 38 | 39 | def _number_to_shanghainese(num): 40 | num = cn2an.an2cn(num).replace('一十','十').replace('二十', '廿').replace('二', '两') 41 | return re.sub(r'((?:^|[^三四五六七八九])十|廿)两', r'\1二', num) 42 | 43 | 44 | def number_to_shanghainese(text): 45 | return re.sub(r'\d+(?:\.?\d+)?', lambda x: _number_to_shanghainese(x.group()), text) 46 | 47 | 48 | def latin_to_ipa(text): 49 | for regex, replacement in _latin_to_ipa: 50 | text = re.sub(regex, replacement, text) 51 | return text 52 | 53 | 54 | def shanghainese_to_ipa(text): 55 | text = number_to_shanghainese(text.upper()) 56 | text = converter.convert(text).replace('-','').replace('$',' ') 57 | text = re.sub(r'[A-Z]', lambda x: latin_to_ipa(x.group())+' ', text) 58 | text = re.sub(r'[、;:]', ',', text) 59 | text = re.sub(r'\s*,\s*', ', ', text) 60 | text = re.sub(r'\s*。\s*', '. ', text) 61 | text = re.sub(r'\s*?\s*', '? ', text) 62 | text = re.sub(r'\s*!\s*', '! 
', text) 63 | text = re.sub(r'\s*$', '', text) 64 | return text 65 | -------------------------------------------------------------------------------- /HoshiNoYume/actions/MoeGoe/utils.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from json import loads 3 | from torch import load, FloatTensor 4 | from numpy import float32 5 | import librosa 6 | 7 | 8 | class HParams(): 9 | def __init__(self, **kwargs): 10 | for k, v in kwargs.items(): 11 | if type(v) == dict: 12 | v = HParams(**v) 13 | self[k] = v 14 | 15 | def keys(self): 16 | return self.__dict__.keys() 17 | 18 | def items(self): 19 | return self.__dict__.items() 20 | 21 | def values(self): 22 | return self.__dict__.values() 23 | 24 | def __len__(self): 25 | return len(self.__dict__) 26 | 27 | def __getitem__(self, key): 28 | return getattr(self, key) 29 | 30 | def __setitem__(self, key, value): 31 | return setattr(self, key, value) 32 | 33 | def __contains__(self, key): 34 | return key in self.__dict__ 35 | 36 | def __repr__(self): 37 | return self.__dict__.__repr__() 38 | 39 | 40 | def load_checkpoint(checkpoint_path, model): 41 | checkpoint_dict = load(checkpoint_path, map_location='cpu') 42 | iteration = checkpoint_dict['iteration'] 43 | saved_state_dict = checkpoint_dict['model'] 44 | if hasattr(model, 'module'): 45 | state_dict = model.module.state_dict() 46 | else: 47 | state_dict = model.state_dict() 48 | new_state_dict= {} 49 | for k, v in state_dict.items(): 50 | try: 51 | new_state_dict[k] = saved_state_dict[k] 52 | except: 53 | logging.info("%s is not in the checkpoint" % k) 54 | new_state_dict[k] = v 55 | if hasattr(model, 'module'): 56 | model.module.load_state_dict(new_state_dict) 57 | else: 58 | model.load_state_dict(new_state_dict) 59 | logging.info("Loaded checkpoint '{}' (iteration {})" .format( 60 | checkpoint_path, iteration)) 61 | return 62 | 63 | 64 | def get_hparams_from_file(config_path): 65 | with open(config_path, "r") as f: 66 | data = f.read() 67 | config = loads(data) 68 | 69 | hparams = HParams(**config) 70 | return hparams 71 | 72 | 73 | def load_audio_to_torch(full_path, target_sampling_rate): 74 | audio, sampling_rate = librosa.load(full_path, sr=target_sampling_rate, mono=True) 75 | return FloatTensor(audio.astype(float32)) 76 | -------------------------------------------------------------------------------- /HoshiNoYume/actions/MoeGoe/.gitattributes: -------------------------------------------------------------------------------- 1 | ############################################################################### 2 | # Set default behavior to automatically normalize line endings. 3 | ############################################################################### 4 | * text=auto 5 | 6 | ############################################################################### 7 | # Set default behavior for command prompt diff. 8 | # 9 | # This is need for earlier builds of msysgit that does not have it on by 10 | # default for csharp files. 11 | # Note: This is only used by command line 12 | ############################################################################### 13 | #*.cs diff=csharp 14 | 15 | ############################################################################### 16 | # Set the merge driver for project and solution files 17 | # 18 | # Merging from the command prompt will add diff markers to the files if there 19 | # are conflicts (Merging from VS is not affected by the settings below, in VS 20 | # the diff markers are never inserted). 
Diff markers may cause the following 21 | # file extensions to fail to load in VS. An alternative would be to treat 22 | # these files as binary and thus will always conflict and require user 23 | # intervention with every merge. To do so, just uncomment the entries below 24 | ############################################################################### 25 | #*.sln merge=binary 26 | #*.csproj merge=binary 27 | #*.vbproj merge=binary 28 | #*.vcxproj merge=binary 29 | #*.vcproj merge=binary 30 | #*.dbproj merge=binary 31 | #*.fsproj merge=binary 32 | #*.lsproj merge=binary 33 | #*.wixproj merge=binary 34 | #*.modelproj merge=binary 35 | #*.sqlproj merge=binary 36 | #*.wwaproj merge=binary 37 | 38 | ############################################################################### 39 | # behavior for image files 40 | # 41 | # image files are treated as binary by default. 42 | ############################################################################### 43 | #*.jpg binary 44 | #*.png binary 45 | #*.gif binary 46 | 47 | ############################################################################### 48 | # diff behavior for common document formats 49 | # 50 | # Convert binary document formats to text before diffing them. This feature 51 | # is only available from the command line. Turn it on by uncommenting the 52 | # entries below. 53 | ############################################################################### 54 | #*.doc diff=astextplain 55 | #*.DOC diff=astextplain 56 | #*.docx diff=astextplain 57 | #*.DOCX diff=astextplain 58 | #*.dot diff=astextplain 59 | #*.DOT diff=astextplain 60 | #*.pdf diff=astextplain 61 | #*.PDF diff=astextplain 62 | #*.rtf diff=astextplain 63 | #*.RTF diff=astextplain 64 | -------------------------------------------------------------------------------- /HoshiNoYume/api_key_sample.py: -------------------------------------------------------------------------------- 1 | import os 2 | import requests 3 | # 必要的api 4 | # openai api的KEY 5 | openai_key = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxx" 6 | # pineconde的相关设置 7 | pinecone_key = "xxxxxxxxxxxxxxxxxxxxxxxxx" 8 | pinecone_env = "asia-northeast1-gcp" 9 | pinecone_index = "yume" 10 | # 高德地图api的key 11 | amap_key = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx" 12 | # 有道云api的key 13 | youdao_Id = "xxxxxxxxxxxxxxxxxxxxxxxxxxxx" 14 | youdao_key = "xxxxxxxxxxxxxxxxxxxxxxxxxxxx" 15 | # 基本设定,进一步设定请更改各模块prompts内容 16 | ai_name = "星野爱" 17 | ai_language = "Japanese" #ai说的语言,因为已有对话会载入数据库并对以后所有对话产生影响,故建议在使用前只更改一次 18 | user_name = "xuanxuanQAQ" 19 | user_address = "陕西省西安市西安理工大学金花校区" # 你所在的地址,用于查找天气和周边地区 20 | debug_mode = True # 显示一些用于debug的信息 21 | text_streamingflow = True # 文本流式显示开关 22 | 23 | # 可选的api(推荐) 24 | # porcupine api的key,用于关键词唤醒 25 | porcupine_key = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx" 26 | # 腾讯云api的ID和key,用于语音识别 27 | tencent_Id = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx" 28 | tencent_key = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx" 29 | # serper api,用于信息搜索(即google一下) 30 | serper_api_key = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx" 31 | # azure api,用于azure tts 32 | azure_key = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx" 33 | azure_region = "eastasia" 34 | # clueai api,用于search agent0 35 | clueai_api = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx" 36 | 37 | #一些相关功能 38 | # tts相关,只能开启一个 39 | vits_tts_enabled = True # vits tts 40 | azure_tts_enabled = False # azure tts 41 | # Live2D相关 42 | Live2D_enabled = True 43 | # 物联网相关 44 | IoT_enabled = False 45 | mqtt_broker = "xx.xxx.xxx.xx" 46 | mqtt_port = 1883 47 | openai_key_for_iot = 
"xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx" 48 | 49 | # 项目目录的地址 50 | script_dir = os.path.dirname(os.path.abspath(__file__)) 51 | # vits模型地址,一般不用改 52 | vits_model_path = os.path.join(script_dir, '..' , 'model', 'tts', 'G_latest.pth') 53 | vits_config_path = os.path.join( 54 | script_dir, '..','model', 'tts', 'moegoe_config.json') 55 | # porcupine的模型地址,一般不用改 56 | porcupine_model = os.path.join( 57 | script_dir, '..','model', 'kws', 'Hey-Dream_en_windows_v2_2_0.ppn') 58 | 59 | # 一些需要的信息初始化,一般不用改 60 | # 将一些key加入环境变量 61 | os.environ["OPENAI_API_KEY"] = openai_key 62 | os.environ["serper_api_key"] = serper_api_key 63 | def get_address_info(): 64 | queryurl = f"https://restapi.amap.com/v3/geocode/geo?key={amap_key}&address={user_address}" 65 | response = requests.get(queryurl) 66 | response = response.json() 67 | from tools.translate import text2text_translate 68 | formatted_address = text2text_translate(response['geocodes'][0]['formatted_address'] , src_lang="zh-CHS" ,target_lang="en") 69 | return response['geocodes'][0]['adcode'] , response['geocodes'][0]['location'] , formatted_address 70 | amap_adcode , amap_location , formatted_address= get_address_info() 71 | -------------------------------------------------------------------------------- /HoshiNoYume/actions/MoeGoe/commons.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.nn import functional as F 3 | import torch.jit 4 | 5 | 6 | def script_method(fn, _rcb=None): 7 | return fn 8 | 9 | 10 | def script(obj, optimize=True, _frames_up=0, _rcb=None): 11 | return obj 12 | 13 | 14 | torch.jit.script_method = script_method 15 | torch.jit.script = script 16 | 17 | 18 | def init_weights(m, mean=0.0, std=0.01): 19 | classname = m.__class__.__name__ 20 | if classname.find("Conv") != -1: 21 | m.weight.data.normal_(mean, std) 22 | 23 | 24 | def get_padding(kernel_size, dilation=1): 25 | return int((kernel_size*dilation - dilation)/2) 26 | 27 | 28 | def intersperse(lst, item): 29 | result = [item] * (len(lst) * 2 + 1) 30 | result[1::2] = lst 31 | return result 32 | 33 | 34 | def slice_segments(x, ids_str, segment_size=4): 35 | ret = torch.zeros_like(x[:, :, :segment_size]) 36 | for i in range(x.size(0)): 37 | idx_str = ids_str[i] 38 | idx_end = idx_str + segment_size 39 | ret[i] = x[i, :, idx_str:idx_end] 40 | return ret 41 | 42 | 43 | def rand_slice_segments(x, x_lengths=None, segment_size=4): 44 | b, d, t = x.size() 45 | if x_lengths is None: 46 | x_lengths = t 47 | ids_str_max = x_lengths - segment_size + 1 48 | ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long) 49 | ret = slice_segments(x, ids_str, segment_size) 50 | return ret, ids_str 51 | 52 | 53 | def subsequent_mask(length): 54 | mask = torch.tril(torch.ones(length, length)).unsqueeze(0).unsqueeze(0) 55 | return mask 56 | 57 | 58 | @torch.jit.script 59 | def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels): 60 | n_channels_int = n_channels[0] 61 | in_act = input_a + input_b 62 | t_act = torch.tanh(in_act[:, :n_channels_int, :]) 63 | s_act = torch.sigmoid(in_act[:, n_channels_int:, :]) 64 | acts = t_act * s_act 65 | return acts 66 | 67 | 68 | def convert_pad_shape(pad_shape): 69 | l = pad_shape[::-1] 70 | pad_shape = [item for sublist in l for item in sublist] 71 | return pad_shape 72 | 73 | 74 | def sequence_mask(length, max_length=None): 75 | if max_length is None: 76 | max_length = length.max() 77 | x = torch.arange(max_length, dtype=length.dtype, device=length.device) 78 | return 
x.unsqueeze(0) < length.unsqueeze(1) 79 | 80 | 81 | def generate_path(duration, mask): 82 | """ 83 | duration: [b, 1, t_x] 84 | mask: [b, 1, t_y, t_x] 85 | """ 86 | device = duration.device 87 | 88 | b, _, t_y, t_x = mask.shape 89 | cum_duration = torch.cumsum(duration, -1) 90 | 91 | cum_duration_flat = cum_duration.view(b * t_x) 92 | path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype) 93 | path = path.view(b, t_x, t_y) 94 | path = path - F.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1] 95 | path = path.unsqueeze(1).transpose(2,3) * mask 96 | return path 97 | -------------------------------------------------------------------------------- /HoshiNoYume/memory/long_term_memory.py: -------------------------------------------------------------------------------- 1 | import pinecone 2 | import openai 3 | from api_key import openai_key , pinecone_key , ai_name , user_name , pinecone_env , pinecone_index 4 | from langchain.memory.summary import SummarizerMixin 5 | from langchain.llms import OpenAI 6 | from memory.prompts import SUMMARY_PROMPT 7 | from typing import Any, Optional 8 | from memory.short_term_memory import ChatShortMemory 9 | import time 10 | 11 | class ChatLongMemory(SummarizerMixin): 12 | index : Optional[Any] = None 13 | summary_memory : str = "" 14 | def init(self): 15 | openai.api_key = openai_key 16 | pinecone.init(api_key=pinecone_key, environment=pinecone_env) 17 | self.index = pinecone.Index(pinecone_index) 18 | with open("HoshiNoYume\memory\long_summary_memory.txt", "r") as file: 19 | self.summary_memory = file.read() 20 | 21 | def short_memory_vector_write(self,short_memory:ChatShortMemory): 22 | # 把短期记忆的对话记录写进向量数据库 23 | for i in range(len(short_memory.messages)//2): 24 | written_str = short_memory.messages[2*i].content + "&" + short_memory.messages[2*i+1].content 25 | vector = openai.Embedding.create( 26 | input=written_str, 27 | model="text-embedding-ada-002" 28 | ) 29 | 30 | current_time = time.time() 31 | local_time = time.localtime(current_time) 32 | formatted_time = time.strftime("%Y%m%d%H%M%S", local_time) 33 | 34 | self.index.upsert( 35 | vectors=[ 36 | {'id':formatted_time, 37 | 'values':vector['data'][0]['embedding'], 38 | 'metadata':{'human': short_memory.messages[i].content, 39 | 'ai': short_memory.messages[i+1].content}, 40 | } 41 | ]) 42 | 43 | def vector_search(self,text): 44 | openai.api_key = openai_key 45 | vector = openai.Embedding.create( 46 | input=text, 47 | model="text-embedding-ada-002" 48 | ) 49 | response = self.index.query( 50 | vector=vector['data'][0]['embedding'], 51 | top_k=5, 52 | include_values=False, 53 | include_metadata=True) 54 | return response 55 | 56 | def summary_write(self,short_memory:ChatShortMemory): 57 | messages = short_memory.messages 58 | self.summary_memory = self.predict_new_summary(messages,self.summary_memory) 59 | with open("HoshiNoYume\memory\long_summary_memory.txt", "w") as file: 60 | file.write(self.summary_memory) 61 | return self.summary_memory 62 | 63 | long_memory = ChatLongMemory(llm=OpenAI(temperature=0), 64 | ai_prefix=ai_name, 65 | human_prefix=user_name, 66 | prompt=SUMMARY_PROMPT) 67 | long_memory.init() -------------------------------------------------------------------------------- /HoshiNoYume/actions/search.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from api_key import * 3 | from langchain.utilities import GoogleSerperAPIWrapper 4 | from langchain.agents import Tool 5 | import requests 6 | import time 7 | 8 | 9 | 
# 把搜索工具写在这里 10 | # 查找当前天气 11 | def search_current_weather(_): 12 | print("少女搜索中...") 13 | queryurl = f"https://restapi.amap.com/v3/weather/weatherInfo?key={amap_key}&city={amap_adcode}" 14 | 15 | response = requests.get(queryurl) 16 | res_json = response.json() 17 | res = res_json['lives'][0] 18 | # 去除无关属性 19 | res.pop('province', None) 20 | res.pop('city', None) 21 | res.pop('adcode', None) 22 | res.pop('reporttime', None) 23 | 24 | return res 25 | 26 | # 检索当前确切时间 27 | def current_accurate_time(_): 28 | print("少女搜索中...") 29 | current_time = time.time() 30 | local_time = time.localtime(current_time) 31 | formatted_time = time.strftime("%Y-%m-%d %H:%M:%S", local_time) 32 | 33 | return formatted_time 34 | 35 | 36 | # 谷歌搜索 37 | search = GoogleSerperAPIWrapper() 38 | def google_search(question): 39 | print("少女搜索中...") 40 | return search.run(question) 41 | 42 | # 百度地图周边信息搜索 43 | def place_search(keywords): 44 | print("少女搜索中...") 45 | radius = 2000 #搜索半径,单位m 46 | queryurl = f"https://restapi.amap.com/v5/place/around?key={amap_key}&keywords={keywords}&location={amap_location}&radius={radius}" 47 | 48 | response = requests.get(queryurl) 49 | response_json = response.json() 50 | res = response_json['pois'] 51 | # 去除无关属性 52 | for i in range(len(res)): 53 | res[i].pop('parent', None) 54 | res[i].pop('pcode', None) 55 | res[i].pop('adcode', None) 56 | res[i].pop('pname', None) 57 | res[i].pop('cityname', None) 58 | res[i].pop('typecode', None) 59 | res[i].pop('adname', None) 60 | res[i].pop('citycode', None) 61 | res[i].pop('location', None) 62 | res[i].pop('id', None) 63 | 64 | return res 65 | 66 | # 只是聊聊天捏,这里做二次筛选 67 | def just_chat(_): 68 | return "None" 69 | 70 | # 搜索工具列表 71 | search_tools = [ 72 | Tool( 73 | name = "Search", 74 | func=google_search, 75 | description="Only use this when you need to answer questions about current events", 76 | return_direct=False 77 | ), 78 | Tool( 79 | name = "Weather", 80 | func=search_current_weather, 81 | description="Use this to retrieve the current weather.", 82 | return_direct=True 83 | ), 84 | Tool( 85 | name = "Place Search", 86 | func=place_search, 87 | description="Use this to search for nearby locations.Input a only single keyword like 'restaurant'.", 88 | return_direct=True 89 | ), 90 | Tool( 91 | name = "Accurate time", 92 | func=current_accurate_time, 93 | description="Use this to get the current accurate time.", 94 | return_direct=False 95 | ), 96 | Tool( 97 | name = "Chat", 98 | func=just_chat, 99 | description="If you think I'm not asking a question or you don't need to use other tools or i'm instruct you to do something, take this", 100 | return_direct=True 101 | ) 102 | ] -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # ---> Python 2 | # Byte-compiled / optimized / DLL files 3 | __pycache__/ 4 | *.py[cod] 5 | *$py.class 6 | 7 | # C extensions 8 | *.so 9 | 10 | # Distribution / packaging 11 | .Python 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | pip-wheel-metadata/ 25 | share/python-wheels/ 26 | *.egg-info/ 27 | .installed.cfg 28 | *.egg 29 | MANIFEST 30 | 31 | # PyInstaller 32 | # Usually these files are written by a python script from a template 33 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
34 | *.manifest 35 | *.spec 36 | 37 | # Installer logs 38 | pip-log.txt 39 | pip-delete-this-directory.txt 40 | 41 | # Unit test / coverage reports 42 | htmlcov/ 43 | .tox/ 44 | .nox/ 45 | .coverage 46 | .coverage.* 47 | .cache 48 | nosetests.xml 49 | coverage.xml 50 | *.cover 51 | *.py,cover 52 | .hypothesis/ 53 | .pytest_cache/ 54 | 55 | # Translations 56 | *.mo 57 | *.pot 58 | 59 | # Django stuff: 60 | *.log 61 | local_settings.py 62 | db.sqlite3 63 | db.sqlite3-journal 64 | 65 | # Flask stuff: 66 | instance/ 67 | .webassets-cache 68 | 69 | # Scrapy stuff: 70 | .scrapy 71 | 72 | # Sphinx documentation 73 | docs/_build/ 74 | 75 | # PyBuilder 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | .python-version 87 | 88 | # pipenv 89 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 90 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 91 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 92 | # install all needed dependencies. 93 | #Pipfile.lock 94 | 95 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 96 | __pypackages__/ 97 | 98 | # Celery stuff 99 | celerybeat-schedule 100 | celerybeat.pid 101 | 102 | # SageMath parsed files 103 | *.sage.py 104 | 105 | # Environments 106 | .env 107 | .venv 108 | env/ 109 | venv/ 110 | ENV/ 111 | env.bak/ 112 | venv.bak/ 113 | 114 | # Spyder project settings 115 | .spyderproject 116 | .spyproject 117 | 118 | # Rope project settings 119 | .ropeproject 120 | 121 | # mkdocs documentation 122 | /site 123 | 124 | # mypy 125 | .mypy_cache/ 126 | .dmypy.json 127 | dmypy.json 128 | 129 | # Pyre type checker 130 | .pyre/ 131 | 132 | # ---> macOS 133 | # General 134 | .DS_Store 135 | .AppleDouble 136 | .LSOverride 137 | 138 | # Icon must end with two \r 139 | Icon 140 | 141 | 142 | # Thumbnails 143 | ._* 144 | 145 | # Files that might appear in the root of a volume 146 | .DocumentRevisions-V100 147 | .fseventsd 148 | .Spotlight-V100 149 | .TemporaryItems 150 | .Trashes 151 | .VolumeIcon.icns 152 | .com.apple.timemachine.donotpresent 153 | 154 | # Directories potentially created on remote AFP share 155 | .AppleDB 156 | .AppleDesktop 157 | Network Trash Folder 158 | Temporary Items 159 | .apdisk 160 | 161 | # ---> VisualStudioCode 162 | .vscode/* 163 | .vscode/settings.json 164 | *.code-workspace 165 | 166 | # local config 167 | local_config.py 168 | 169 | # log 170 | *.log.* 171 | 172 | # .env 173 | ! 
default.env 174 | 175 | # .idea 配置文件 176 | .idea/ 177 | 178 | # 虚拟环境 179 | yume_env/ 180 | 181 | # 模型 182 | model/kws/*.ppn 183 | model/live2d/ 184 | !model/live2d/README.md 185 | model/tts/*.json 186 | model/tts/*.pth 187 | 188 | # api 189 | HoshiNoYume/api_key.py 190 | HoshiNoYume/memory/long_summary_memory.txt 191 | -------------------------------------------------------------------------------- /HoshiNoYume/thinking/agent_interact.py: -------------------------------------------------------------------------------- 1 | from langchain.agents import Tool, AgentExecutor, LLMSingleActionAgent, AgentOutputParser 2 | from langchain.prompts import StringPromptTemplate 3 | from langchain import OpenAI, LLMChain 4 | from typing import List, Union 5 | from langchain.schema import AgentAction, AgentFinish 6 | import re 7 | from actions.interact import interact_tools 8 | import time 9 | from thinking.prompts import AGENT_INTERACT_PROMPTS_TEMPLATE 10 | from api_key import debug_mode , formatted_address 11 | 12 | 13 | # 设置agent的prompts的模板类 14 | class CustomPromptTemplate(StringPromptTemplate): 15 | # 使用的template文本模板 16 | template: str 17 | # 可使用的工具 18 | tools: List[Tool] 19 | 20 | def format(self, **kwargs) -> str: 21 | # 获取当前时间 22 | current_time = time.time() 23 | local_time = time.localtime(current_time) 24 | formatted_time = time.strftime("%Y-%m-%d", local_time) 25 | # 获取中间步骤 (AgentAction, Observation tuples) 26 | # 将模板格式化为常规形式,即带入变量 27 | intermediate_steps = kwargs.pop("intermediate_steps") 28 | thoughts = "" 29 | for action, observation in intermediate_steps: 30 | thoughts += action.log 31 | thoughts += f"\nObservation: {observation}\nThought: " 32 | kwargs["agent_scratchpad"] = thoughts 33 | kwargs["tools"] = "\n".join([f"{tool.name}: {tool.description}" for tool in self.tools]) 34 | kwargs["tool_names"] = ", ".join([tool.name for tool in self.tools]) 35 | kwargs["time"] = formatted_time 36 | kwargs["location"] = formatted_address 37 | return self.template.format(**kwargs) 38 | 39 | tools = interact_tools 40 | 41 | prompt = CustomPromptTemplate( 42 | template=AGENT_INTERACT_PROMPTS_TEMPLATE, 43 | tools=tools, 44 | # 这里不用带入agent_scratchpad`,`tools`和`tool_names`三个变量,因为在上面format方法中已经带入了 45 | # 添加可带入的prompts变量 46 | input_variables=["input", "intermediate_steps"] 47 | ) 48 | 49 | # agent输出解析,一般情况下用不到 50 | class CustomOutputParser(AgentOutputParser): 51 | 52 | def parse(self, llm_output: str) -> Union[AgentAction, AgentFinish]: 53 | # 查看agent是否该结束 54 | if "Final Answer:" in llm_output: 55 | return AgentFinish( 56 | # Return values is generally always a dictionary with a single `output` key 57 | # It is not recommended to try anything else at the moment :) 58 | return_values={"output": llm_output.split("Final Answer:")[-1].strip()}, 59 | log=llm_output, 60 | ) 61 | # 解析action和action input 62 | regex = r"Action\s*\d*\s*:(.*?)\nAction\s*\d*\s*Input\s*\d*\s*:[\s]*(.*)" 63 | match = re.search(regex, llm_output, re.DOTALL) 64 | if not match: 65 | raise ValueError(f"Could not parse LLM output: `{llm_output}`") 66 | action = match.group(1).strip() 67 | action_input = match.group(2) 68 | # 返回action和action input 69 | return AgentAction(tool=action, tool_input=action_input.strip(" ").strip('"'), log=llm_output) 70 | output_parser = CustomOutputParser() 71 | 72 | llm = OpenAI(temperature=0) 73 | # 由LLM模型和prompt构成llm_chain 74 | llm_chain = LLMChain(llm=llm, prompt=prompt) 75 | tool_names = [tool.name for tool in tools] 76 | # 由llm_chain和tools构成agent 77 | agent = LLMSingleActionAgent( 78 | llm_chain=llm_chain, 79 
| output_parser=output_parser, 80 | stop=["\nObservation:"], 81 | allowed_tools=tool_names 82 | ) 83 | agent_executor = AgentExecutor.from_agent_and_tools(agent=agent, tools=tools, verbose=debug_mode) 84 | 85 | def agent_interact(user_words): 86 | return agent_executor.run(user_words) -------------------------------------------------------------------------------- /HoshiNoYume/actions/MoeGoe/mel_processing.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.utils.data 3 | from librosa.filters import mel as librosa_mel_fn 4 | 5 | MAX_WAV_VALUE = 32768.0 6 | 7 | 8 | def dynamic_range_compression_torch(x, C=1, clip_val=1e-5): 9 | """ 10 | PARAMS 11 | ------ 12 | C: compression factor 13 | """ 14 | return torch.log(torch.clamp(x, min=clip_val) * C) 15 | 16 | 17 | def dynamic_range_decompression_torch(x, C=1): 18 | """ 19 | PARAMS 20 | ------ 21 | C: compression factor used to compress 22 | """ 23 | return torch.exp(x) / C 24 | 25 | 26 | def spectral_normalize_torch(magnitudes): 27 | output = dynamic_range_compression_torch(magnitudes) 28 | return output 29 | 30 | 31 | def spectral_de_normalize_torch(magnitudes): 32 | output = dynamic_range_decompression_torch(magnitudes) 33 | return output 34 | 35 | 36 | mel_basis = {} 37 | hann_window = {} 38 | 39 | 40 | def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False): 41 | if torch.min(y) < -1.: 42 | print('min value is ', torch.min(y)) 43 | if torch.max(y) > 1.: 44 | print('max value is ', torch.max(y)) 45 | 46 | global hann_window 47 | dtype_device = str(y.dtype) + '_' + str(y.device) 48 | wnsize_dtype_device = str(win_size) + '_' + dtype_device 49 | if wnsize_dtype_device not in hann_window: 50 | hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(dtype=y.dtype, device=y.device) 51 | 52 | y = torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft-hop_size)/2), int((n_fft-hop_size)/2)), mode='reflect') 53 | y = y.squeeze(1) 54 | 55 | spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[wnsize_dtype_device], 56 | center=center, pad_mode='reflect', normalized=False, onesided=True, return_complex=False) 57 | 58 | spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6) 59 | return spec 60 | 61 | 62 | def spec_to_mel_torch(spec, n_fft, num_mels, sampling_rate, fmin, fmax): 63 | global mel_basis 64 | dtype_device = str(spec.dtype) + '_' + str(spec.device) 65 | fmax_dtype_device = str(fmax) + '_' + dtype_device 66 | if fmax_dtype_device not in mel_basis: 67 | mel = librosa_mel_fn(sampling_rate, n_fft, num_mels, fmin, fmax) 68 | mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to(dtype=spec.dtype, device=spec.device) 69 | spec = torch.matmul(mel_basis[fmax_dtype_device], spec) 70 | spec = spectral_normalize_torch(spec) 71 | return spec 72 | 73 | 74 | def mel_spectrogram_torch(y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False): 75 | if torch.min(y) < -1.: 76 | print('min value is ', torch.min(y)) 77 | if torch.max(y) > 1.: 78 | print('max value is ', torch.max(y)) 79 | 80 | global mel_basis, hann_window 81 | dtype_device = str(y.dtype) + '_' + str(y.device) 82 | fmax_dtype_device = str(fmax) + '_' + dtype_device 83 | wnsize_dtype_device = str(win_size) + '_' + dtype_device 84 | if fmax_dtype_device not in mel_basis: 85 | mel = librosa_mel_fn(sampling_rate, n_fft, num_mels, fmin, fmax) 86 | mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to(dtype=y.dtype, device=y.device) 87 | if 
wnsize_dtype_device not in hann_window: 88 | hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(dtype=y.dtype, device=y.device) 89 | 90 | y = torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft-hop_size)/2), int((n_fft-hop_size)/2)), mode='reflect') 91 | y = y.squeeze(1) 92 | 93 | spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[wnsize_dtype_device], 94 | center=center, pad_mode='reflect', normalized=False, onesided=True) 95 | 96 | spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6) 97 | 98 | spec = torch.matmul(mel_basis[fmax_dtype_device], spec) 99 | spec = spectral_normalize_torch(spec) 100 | 101 | return spec 102 | -------------------------------------------------------------------------------- /HoshiNoYume/actions/speaking.py: -------------------------------------------------------------------------------- 1 | from api_key import * 2 | import torch 3 | import pyaudio 4 | from pydub import AudioSegment 5 | from pydub.utils import make_chunks 6 | from actions.Live2D import socket_send 7 | from tools.translate import text2text_translate 8 | import sys 9 | import numpy as np 10 | import azure.cognitiveservices.speech as speechsdk 11 | 12 | sys.path.append("HoshiNoYume\\actions\\MoeGoe") 13 | from MoeGoe import * 14 | device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") 15 | 16 | def vits_tts(text): 17 | if ai_language == "Chinese": 18 | vits_text = "[CH]" + text + "[CH]" 19 | else: 20 | vits_text = "[JA]" + text + "[JA]" 21 | model = vits_model_path 22 | config = vits_config_path 23 | 24 | hps_ms = utils.get_hparams_from_file(config) 25 | n_speakers = hps_ms.data.n_speakers if 'n_speakers' in hps_ms.data.keys() else 0 26 | n_symbols = len(hps_ms.symbols) if 'symbols' in hps_ms.keys() else 0 27 | emotion_embedding = hps_ms.data.emotion_embedding if 'emotion_embedding' in hps_ms.data.keys() else False 28 | 29 | net_g_ms = SynthesizerTrn( 30 | n_symbols, 31 | hps_ms.data.filter_length // 2 + 1, 32 | hps_ms.train.segment_size // hps_ms.data.hop_length, 33 | n_speakers=n_speakers, 34 | emotion_embedding=emotion_embedding, 35 | **hps_ms.model) 36 | _ = net_g_ms.eval() 37 | utils.load_checkpoint(model, net_g_ms) 38 | 39 | length_scale, vits_text = get_label_value( 40 | vits_text, 'LENGTH', 1, 'length scale') 41 | noise_scale, vits_text = get_label_value( 42 | vits_text, 'NOISE', 0.667, 'noise scale') 43 | noise_scale_w, vits_text = get_label_value( 44 | vits_text, 'NOISEW', 0.8, 'deviation of noise') 45 | cleaned, vits_text = get_label(vits_text, 'CLEANED') 46 | 47 | stn_tst = get_text(vits_text, hps_ms, cleaned=cleaned) 48 | 49 | speaker_id = 0 50 | 51 | with no_grad(): 52 | x_tst = stn_tst.unsqueeze(0) 53 | x_tst_lengths = torch.LongTensor([stn_tst.size(0)]) 54 | sid = torch.LongTensor([speaker_id]) 55 | x_tst = x_tst.to(device) 56 | x_tst_lengths = x_tst_lengths.to(device) 57 | sid = sid.to(device) 58 | net_g_ms = net_g_ms.to(device) 59 | audio = net_g_ms.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=noise_scale, 60 | noise_scale_w=noise_scale_w, length_scale=length_scale)[0][0, 0].data.cpu().float().numpy() 61 | 62 | normalized_audio = audio / np.max(np.abs(audio)) 63 | audio_int16 = (normalized_audio * (2**15 - 1)).astype(np.int16) 64 | 65 | return audio_int16 66 | 67 | def talk(audio): 68 | sample_width = 2 69 | channels = 1 70 | frame_rate = 22050 71 | 72 | audio_segment = AudioSegment( 73 | audio.tobytes(), 74 | sample_width=sample_width, 75 | frame_rate=frame_rate, 76 | channels=channels 77 | ) 78 | 79 | pa = 
pyaudio.PyAudio() 80 | stream = pa.open(format=pa.get_format_from_width(audio_segment.sample_width), 81 | channels=audio_segment.channels, 82 | rate=audio_segment.frame_rate, 83 | output=True) 84 | 85 | chunk_length = 50 86 | chunks = make_chunks(audio_segment, chunk_length) 87 | 88 | for chunk in chunks: 89 | if Live2D_enabled: 90 | rms = chunk.rms 91 | socket_send(rms) 92 | 93 | stream.write(chunk.raw_data) 94 | 95 | stream.stop_stream() 96 | stream.close() 97 | pa.terminate() 98 | 99 | def azure_tts(text): 100 | speech_config = speechsdk.SpeechConfig(subscription = azure_key, region = azure_region) 101 | audio_config = speechsdk.audio.AudioOutputConfig(use_default_speaker = True) 102 | 103 | speech_config.speech_synthesis_voice_name='zh-CN-XiaoyiNeural' 104 | 105 | speech_synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config, audio_config=audio_config) 106 | 107 | speech_synthesizer.speak_text_async(text).get() -------------------------------------------------------------------------------- /HoshiNoYume/thinking/agent_search.py: -------------------------------------------------------------------------------- 1 | from langchain.agents import Tool, AgentExecutor, LLMSingleActionAgent, AgentOutputParser 2 | from langchain.prompts import StringPromptTemplate 3 | from langchain import OpenAI, LLMChain 4 | from typing import List, Union 5 | from langchain.schema import AgentAction, AgentFinish 6 | import re 7 | from actions.search import search_tools 8 | import time 9 | from thinking.prompts import AGENT_SEARCH_PROMPTS_TEMPLATE , AGENT0_SEARCH_ZERO_SHOT , AGENT0_SEARCH_LABEL 10 | from api_key import debug_mode , formatted_address , clueai_api 11 | import clueai 12 | 13 | # 设置agent的prompts的模板类 14 | class CustomPromptTemplate(StringPromptTemplate): 15 | # 使用的template文本模板 16 | template: str 17 | # 可使用的工具 18 | tools: List[Tool] 19 | 20 | def format(self, **kwargs) -> str: 21 | # 获取当前时间 22 | current_time = time.time() 23 | local_time = time.localtime(current_time) 24 | formatted_time = time.strftime("%Y-%m-%d", local_time) 25 | # 获取中间步骤 (AgentAction, Observation tuples) 26 | # 将模板格式化为常规形式,即带入变量 27 | intermediate_steps = kwargs.pop("intermediate_steps") 28 | thoughts = "" 29 | for action, observation in intermediate_steps: 30 | thoughts += action.log 31 | thoughts += f"\nObservation: {observation}\nThought: " 32 | kwargs["agent_scratchpad"] = thoughts 33 | kwargs["tools"] = "\n".join([f"{tool.name}: {tool.description}" for tool in self.tools]) 34 | kwargs["tool_names"] = ", ".join([tool.name for tool in self.tools]) 35 | kwargs["time"] = formatted_time 36 | kwargs["location"] = formatted_address 37 | return self.template.format(**kwargs) 38 | 39 | tools = search_tools 40 | 41 | prompt = CustomPromptTemplate( 42 | template=AGENT_SEARCH_PROMPTS_TEMPLATE, 43 | tools=tools, 44 | # 这里不用带入agent_scratchpad`,`tools`和`tool_names`三个变量,因为在上面format方法中已经带入了 45 | # 添加可带入的prompts变量 46 | input_variables=["input", "intermediate_steps"] 47 | ) 48 | 49 | # agent输出解析,一般情况下用不到 50 | class CustomOutputParser(AgentOutputParser): 51 | 52 | def parse(self, llm_output: str) -> Union[AgentAction, AgentFinish]: 53 | # 查看agent是否该结束 54 | if "Final Answer:" in llm_output: 55 | return AgentFinish( 56 | # Return values is generally always a dictionary with a single `output` key 57 | # It is not recommended to try anything else at the moment :) 58 | return_values={"output": llm_output.split("Final Answer:")[-1].strip()}, 59 | log=llm_output, 60 | ) 61 | # 解析action和action input 62 | regex = 
r"Action\s*\d*\s*:(.*?)\nAction\s*\d*\s*Input\s*\d*\s*:[\s]*(.*)" 63 | match = re.search(regex, llm_output, re.DOTALL) 64 | if not match: 65 | raise ValueError(f"Could not parse LLM output: `{llm_output}`") 66 | action = match.group(1).strip() 67 | action_input = match.group(2) 68 | # 返回action和action input 69 | return AgentAction(tool=action, tool_input=action_input.strip(" ").strip('"'), log=llm_output) 70 | output_parser = CustomOutputParser() 71 | 72 | llm = OpenAI(temperature=0) 73 | # 由LLM模型和prompt构成llm_chain 74 | llm_chain = LLMChain(llm=llm, prompt=prompt) 75 | tool_names = [tool.name for tool in tools] 76 | # 由llm_chain和tools构成agent 77 | agent = LLMSingleActionAgent( 78 | llm_chain=llm_chain, 79 | output_parser=output_parser, 80 | stop=["\nObservation:"], 81 | allowed_tools=tool_names 82 | ) 83 | agent_executor = AgentExecutor.from_agent_and_tools(agent=agent, tools=tools, verbose=debug_mode) 84 | 85 | # 初始化clueai实例 86 | cl = clueai.Client(clueai_api) 87 | 88 | def agent_search(user_words): 89 | response = cl.classify( 90 | model_name='clueai-large', 91 | task_name='用户意图领域', 92 | inputs=[user_words], 93 | examples=AGENT0_SEARCH_ZERO_SHOT, 94 | labels =AGENT0_SEARCH_LABEL) 95 | if response.classifications[0].prediction == AGENT0_SEARCH_LABEL[0]: 96 | return "chat" 97 | elif response.classifications[0].prediction == AGENT0_SEARCH_LABEL[1]: 98 | return agent_executor.run(user_words) -------------------------------------------------------------------------------- /HoshiNoYume/perception/auditory.py: -------------------------------------------------------------------------------- 1 | from tencentcloud.common import credential 2 | from tencentcloud.common.profile.client_profile import ClientProfile 3 | from tencentcloud.common.profile.http_profile import HttpProfile 4 | from tencentcloud.asr.v20190614 import asr_client, models 5 | from api_key import * 6 | import pyaudio 7 | import webrtcvad 8 | import io 9 | import wave 10 | import base64 11 | import asyncio 12 | import json 13 | import openai 14 | from io import BytesIO 15 | import openai 16 | import tempfile 17 | from pydub import AudioSegment 18 | 19 | # 录音,返回base64编码的WAV格式音频 20 | def sound_record(): 21 | # 设置录音参数 22 | FORMAT = pyaudio.paInt16 23 | CHANNELS = 1 24 | FRAME_DURATION_MS = 30 25 | RATE = 48000 26 | FRAME_SIZE = int(RATE * FRAME_DURATION_MS / 1000) 27 | RECORD_SECONDS = 8 # 最多可录音几秒 28 | SILENCE_DURATION = 1 # 说完后几秒停止录音 29 | 30 | # 初始化pyaudio,webrtcvad 31 | vad = webrtcvad.Vad(3) 32 | audio = pyaudio.PyAudio() 33 | 34 | # 开启录音流 35 | stream = audio.open(format=FORMAT, channels=CHANNELS, 36 | rate=RATE, input=True, 37 | frames_per_buffer=FRAME_SIZE) 38 | 39 | print("开始录音喵...") 40 | 41 | # 将录音记录到帧 42 | SILENCE_CHUNKS = int(SILENCE_DURATION * RATE / FRAME_SIZE) 43 | frames = [] 44 | silence_count = 0 45 | first_entry = True 46 | filter_count = 0 # 用于滤除声音余留 47 | for _ in range(0, int(RATE / FRAME_SIZE * RECORD_SECONDS)): 48 | data = stream.read(FRAME_SIZE) 49 | frames.append(data) 50 | filter_count += 1 51 | 52 | if first_entry and filter_count > 11: 53 | if vad.is_speech(data, RATE): 54 | first_entry = False 55 | else: 56 | if vad.is_speech(data, RATE): 57 | silence_count = 0 58 | else: 59 | silence_count += 1 60 | 61 | if silence_count >= SILENCE_CHUNKS: 62 | break 63 | 64 | print("结束录音了捏") 65 | 66 | # 结束相关事件 67 | stream.stop_stream() 68 | stream.close() 69 | audio.terminate() 70 | 71 | # 将数据帧编码为base64编码的WAV格式 72 | with io.BytesIO() as wav_buffer: 73 | with wave.open(wav_buffer, 'wb') as wf: 74 | wf.setnchannels(CHANNELS) 75 
| wf.setsampwidth(audio.get_sample_size(FORMAT)) 76 | wf.setframerate(RATE) 77 | wf.writeframes(b''.join(frames)) 78 | 79 | wav_base64 = base64.b64encode( 80 | wav_buffer.getvalue()).decode('utf-8') 81 | 82 | return wav_base64 83 | 84 | # openai whisper asr,不推荐使用,延迟太大,但是支持多语言(这个模型可进行本地部署,以后有空弄) 85 | def whisper_asr(wav_base64): 86 | openai.api_key = openai_key 87 | audio_data_bytes = base64.b64decode(wav_base64) 88 | audio_data = AudioSegment.from_file(BytesIO(audio_data_bytes), format="wav") 89 | 90 | with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_file: 91 | audio_data.export(temp_file.name, format="wav") 92 | transcript = openai.Audio.transcribe("whisper-1", temp_file) 93 | os.remove(temp_file.name) 94 | return transcript['text'] 95 | 96 | 97 | # 腾讯云asr,输入base64编码的wav音频,输出text,此函数需异步调用,以节约请求事件 98 | async def tencent_asr(wav_base64): 99 | cred = credential.Credential(tencent_Id, tencent_key) 100 | # 实例化一个http选项,可选的,没有特殊需求可以跳过 101 | httpProfile = HttpProfile() 102 | httpProfile.endpoint = "asr.tencentcloudapi.com" 103 | 104 | # 实例化一个client选项,可选的,没有特殊需求可以跳过 105 | clientProfile = ClientProfile() 106 | clientProfile.httpProfile = httpProfile 107 | # 实例化要请求产品的client对象,clientProfile是可选的 108 | client = asr_client.AsrClient(cred, "", clientProfile) 109 | # 实例化一个请求对象,每个接口都会对应一个request对象 110 | req = models.SentenceRecognitionRequest() 111 | params = { 112 | "ProjectId": 0, 113 | "SubServiceType": 2, 114 | "EngSerViceType": "16k_zh", 115 | "SourceType": 1, 116 | "VoiceFormat": "wav", 117 | "UsrAudioKey": "0", 118 | "Data": wav_base64, # 音频二进制数据 119 | "DataLen": len(wav_base64) # 音频长度 120 | } 121 | req.from_json_string(json.dumps(params)) 122 | response = await asyncio.to_thread(client.SentenceRecognition, req) 123 | 124 | if response.Result == "": 125 | print("你什么都没说~") 126 | else: 127 | print("你:" + response.Result) 128 | return response.Result 129 | 130 | def listen(model:str="tencent"): 131 | audio_data = sound_record() 132 | if model == "tencent": 133 | user_words = asyncio.run(tencent_asr(audio_data)) 134 | return user_words 135 | elif model == "whisper": 136 | user_words = whisper_asr(audio_data) 137 | return user_words 138 | -------------------------------------------------------------------------------- /HoshiNoYume/thinking/chat.py: -------------------------------------------------------------------------------- 1 | from langchain.chat_models import ChatOpenAI 2 | from langchain.schema import ( 3 | SystemMessage, 4 | AIMessage, 5 | HumanMessage 6 | ) 7 | from langchain.callbacks.base import BaseCallbackManager 8 | from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler 9 | from thinking.prompts import CHATMODEL1_SYS_PROMPTS 10 | from memory.short_term_memory import ChatShortMemory 11 | from memory.long_term_memory import ChatLongMemory 12 | from api_key import * 13 | from actions.speaking import talk , vits_tts , azure_tts 14 | from typing import Any 15 | import threading 16 | import queue 17 | import sys 18 | import time 19 | from tools import text2text_translate 20 | import re 21 | 22 | # vits语音生成队列 23 | def vits_queue(audio_queue, text, priority): 24 | audio = vits_tts(text) 25 | audio_queue.put((priority,audio)) 26 | 27 | # 按队列播放生成后的语音 28 | def talk_queue(audio_queue:queue.PriorityQueue): 29 | priority_pre = 0 30 | while True: 31 | priority, audio = audio_queue.get() 32 | while priority_pre != priority - 1: 33 | audio_queue.put((priority , audio)) 34 | time.sleep(0.2) 35 | priority , audio = audio_queue.get() 36 | priority_pre = priority 37 
| if audio is None: 38 | break 39 | talk(audio) 40 | 41 | task_queue = queue.PriorityQueue() 42 | 43 | # 流式传输的class 44 | class CustomStreamingCallbackHandler(StreamingStdOutCallbackHandler): 45 | sentence_buffer = "" 46 | vits_threads = [] 47 | parentheses_flag = False 48 | def on_llm_new_token(self, token: str, **kwargs: Any) -> None: 49 | token = token.replace('\n','') 50 | if token in '': 51 | return 52 | elif "(" in token: 53 | self.parentheses_flag = True 54 | elif ")" in token : 55 | self.parentheses_flag = False 56 | return 57 | if self.parentheses_flag == True: 58 | return 59 | if vits_tts_enabled: 60 | self.sentence_buffer += token 61 | if token in "。!?": 62 | vits_thread = threading.Thread(target=vits_queue, args=(task_queue, self.sentence_buffer, len(self.vits_threads)+1)) 63 | vits_thread.start() 64 | self.vits_threads.append(vits_thread) 65 | self.sentence_buffer = "" 66 | if Streaming_enabled == True: 67 | sys.stdout.write(token) 68 | sys.stdout.flush() 69 | 70 | def chat(short_memory:ChatShortMemory, long_memory:ChatLongMemory = None, search_info:str = "None"): 71 | # 创建gpt3.5turbo实例 72 | chat = ChatOpenAI(streaming=True, callback_manager=BaseCallbackManager([CustomStreamingCallbackHandler()]), verbose=True, temperature=0.7) 73 | 74 | # 获取当前时间 75 | current_time = time.time() 76 | local_time = time.localtime(current_time) 77 | formatted_time = time.strftime("%Y-%m-%d %H:%M", local_time) 78 | 79 | # 向量搜索 80 | if long_memory == None: 81 | summary_memory = "None" 82 | else: 83 | vector_memory = long_memory.vector_search(short_memory.messages[-1].content) 84 | for match in vector_memory['matches']: 85 | human_words = match['metadata'].get('human') 86 | ai_words = match['metadata'].get('ai') 87 | 88 | if human_words is not None and ai_words is not None: 89 | temp_memory_message = [HumanMessage(content=human_words)] + short_memory.messages 90 | temp_memory_message += [AIMessage(content=ai_words)] + short_memory.messages 91 | 92 | summary_memory = long_memory.summary_memory 93 | 94 | sys_prompts = CHATMODEL1_SYS_PROMPTS.format(name=ai_name, info=search_info , time=formatted_time , locate=formatted_address, summary_memory=summary_memory, language=ai_language) 95 | 96 | temp_memory_message = [SystemMessage(content=sys_prompts)] + short_memory.messages 97 | 98 | print(ai_name + ": ", end="") 99 | reply_words = chat(temp_memory_message) 100 | response = reply_words.content 101 | 102 | if Streaming_enabled == True: 103 | print("") # 换行 104 | else: 105 | text_without_brackets = re.sub(r'\(.*?\)', '', response) 106 | print(text2text_translate(text_without_brackets)) 107 | 108 | if vits_tts_enabled: 109 | talk_thread = threading.Thread(target=talk_queue, args=(task_queue,)) 110 | talk_thread.start() 111 | for vits_thread in CustomStreamingCallbackHandler.vits_threads: 112 | vits_thread.join() 113 | task_queue.put((len(CustomStreamingCallbackHandler.vits_threads)+1,None)) 114 | talk_thread.join() 115 | CustomStreamingCallbackHandler.vits_threads = [] 116 | elif azure_tts_enabled: 117 | azure_tts(response) 118 | 119 | return response -------------------------------------------------------------------------------- /HoshiNoYume/actions/MoeGoe/text/japanese.py: -------------------------------------------------------------------------------- 1 | import re 2 | from unidecode import unidecode 3 | import pyopenjtalk 4 | 5 | 6 | # Regular expression matching Japanese without punctuation marks: 7 | _japanese_characters = re.compile( 8 | 
r'[A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]') 9 | 10 | # Regular expression matching non-Japanese characters or punctuation marks: 11 | _japanese_marks = re.compile( 12 | r'[^A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]') 13 | 14 | # List of (symbol, Japanese) pairs for marks: 15 | _symbols_to_japanese = [(re.compile('%s' % x[0]), x[1]) for x in [ 16 | ('%', 'パーセント') 17 | ]] 18 | 19 | # List of (romaji, ipa) pairs for marks: 20 | _romaji_to_ipa = [(re.compile('%s' % x[0]), x[1]) for x in [ 21 | ('ts', 'ʦ'), 22 | ('u', 'ɯ'), 23 | ('j', 'ʥ'), 24 | ('y', 'j'), 25 | ('ni', 'n^i'), 26 | ('nj', 'n^'), 27 | ('hi', 'çi'), 28 | ('hj', 'ç'), 29 | ('f', 'ɸ'), 30 | ('I', 'i*'), 31 | ('U', 'ɯ*'), 32 | ('r', 'ɾ') 33 | ]] 34 | 35 | # List of (romaji, ipa2) pairs for marks: 36 | _romaji_to_ipa2 = [(re.compile('%s' % x[0]), x[1]) for x in [ 37 | ('u', 'ɯ'), 38 | ('ʧ', 'tʃ'), 39 | ('j', 'dʑ'), 40 | ('y', 'j'), 41 | ('ni', 'n^i'), 42 | ('nj', 'n^'), 43 | ('hi', 'çi'), 44 | ('hj', 'ç'), 45 | ('f', 'ɸ'), 46 | ('I', 'i*'), 47 | ('U', 'ɯ*'), 48 | ('r', 'ɾ') 49 | ]] 50 | 51 | # List of (consonant, sokuon) pairs: 52 | _real_sokuon = [(re.compile('%s' % x[0]), x[1]) for x in [ 53 | (r'Q([↑↓]*[kg])', r'k#\1'), 54 | (r'Q([↑↓]*[tdjʧ])', r't#\1'), 55 | (r'Q([↑↓]*[sʃ])', r's\1'), 56 | (r'Q([↑↓]*[pb])', r'p#\1') 57 | ]] 58 | 59 | # List of (consonant, hatsuon) pairs: 60 | _real_hatsuon = [(re.compile('%s' % x[0]), x[1]) for x in [ 61 | (r'N([↑↓]*[pbm])', r'm\1'), 62 | (r'N([↑↓]*[ʧʥj])', r'n^\1'), 63 | (r'N([↑↓]*[tdn])', r'n\1'), 64 | (r'N([↑↓]*[kg])', r'ŋ\1') 65 | ]] 66 | 67 | 68 | def symbols_to_japanese(text): 69 | for regex, replacement in _symbols_to_japanese: 70 | text = re.sub(regex, replacement, text) 71 | return text 72 | 73 | 74 | def japanese_to_romaji_with_accent(text): 75 | '''Reference https://r9y9.github.io/ttslearn/latest/notebooks/ch10_Recipe-Tacotron.html''' 76 | text = symbols_to_japanese(text) 77 | sentences = re.split(_japanese_marks, text) 78 | marks = re.findall(_japanese_marks, text) 79 | text = '' 80 | for i, sentence in enumerate(sentences): 81 | if re.match(_japanese_characters, sentence): 82 | if text != '': 83 | text += ' ' 84 | labels = pyopenjtalk.extract_fullcontext(sentence) 85 | for n, label in enumerate(labels): 86 | phoneme = re.search(r'\-([^\+]*)\+', label).group(1) 87 | if phoneme not in ['sil', 'pau']: 88 | text += phoneme.replace('ch', 'ʧ').replace('sh', 89 | 'ʃ').replace('cl', 'Q') 90 | else: 91 | continue 92 | # n_moras = int(re.search(r'/F:(\d+)_', label).group(1)) 93 | a1 = int(re.search(r"/A:(\-?[0-9]+)\+", label).group(1)) 94 | a2 = int(re.search(r"\+(\d+)\+", label).group(1)) 95 | a3 = int(re.search(r"\+(\d+)/", label).group(1)) 96 | if re.search(r'\-([^\+]*)\+', labels[n + 1]).group(1) in ['sil', 'pau']: 97 | a2_next = -1 98 | else: 99 | a2_next = int( 100 | re.search(r"\+(\d+)\+", labels[n + 1]).group(1)) 101 | # Accent phrase boundary 102 | if a3 == 1 and a2_next == 1: 103 | text += ' ' 104 | # Falling 105 | elif a1 == 0 and a2_next == a2 + 1: 106 | text += '↓' 107 | # Rising 108 | elif a2 == 1 and a2_next == 2: 109 | text += '↑' 110 | if i < len(marks): 111 | text += unidecode(marks[i]).replace(' ', '') 112 | return text 113 | 114 | 115 | def get_real_sokuon(text): 116 | for regex, replacement in _real_sokuon: 117 | text = re.sub(regex, replacement, text) 118 | return text 119 | 120 | 121 | def get_real_hatsuon(text): 122 | for regex, replacement in _real_hatsuon: 123 | 
text = re.sub(regex, replacement, text) 124 | return text 125 | 126 | 127 | def japanese_to_ipa(text): 128 | text = japanese_to_romaji_with_accent(text).replace('...', '…') 129 | text = re.sub( 130 | r'([aiueo])\1+', lambda x: x.group(0)[0]+'ː'*(len(x.group(0))-1), text) 131 | text = get_real_sokuon(text) 132 | text = get_real_hatsuon(text) 133 | for regex, replacement in _romaji_to_ipa: 134 | text = re.sub(regex, replacement, text) 135 | return text 136 | 137 | 138 | def japanese_to_ipa2(text): 139 | text = japanese_to_romaji_with_accent(text).replace('...', '…') 140 | text = get_real_sokuon(text) 141 | text = get_real_hatsuon(text) 142 | for regex, replacement in _romaji_to_ipa2: 143 | text = re.sub(regex, replacement, text) 144 | return text 145 | 146 | 147 | def japanese_to_ipa3(text): 148 | text = japanese_to_ipa2(text).replace('n^', 'ȵ').replace( 149 | 'ʃ', 'ɕ').replace('*', '\u0325').replace('#', '\u031a') 150 | text = re.sub( 151 | r'([aiɯeo])\1+', lambda x: x.group(0)[0]+'ː'*(len(x.group(0))-1), text) 152 | text = re.sub(r'((?:^|\s)(?:ts|tɕ|[kpt]))', r'\1ʰ', text) 153 | return text 154 | -------------------------------------------------------------------------------- /HoshiNoYume/thinking/prompts.py: -------------------------------------------------------------------------------- 1 | from clueai.classify import Example 2 | # 设置search agent的prompts template文本模板 3 | AGENT_SEARCH_PROMPTS_TEMPLATE = """You are a helpful assistant. 4 | Here are some informations you shall know: 5 | 1.Today is {time}. 6 | 2.We are in {location} 7 | You have access to the following tools: 8 | 9 | {tools} 10 | 11 | Use the following format: 12 | 13 | Question: the input question you must answer 14 | Thought: you should always think about what to do 15 | Action: the action to take, should be one of [{tool_names}] 16 | Action Input: the input to the action 17 | Observation: the result of the action 18 | Final Answer: the final answer to the original input question 19 | 20 | Begin!Remember to only reply me in English. 21 | 22 | Question: {input} 23 | {agent_scratchpad}""" 24 | 25 | 26 | # 设置对话人设的prompts,分为几个部分:1.催眠去除限制;2.人物设定;3.基本信息感知;4.查询信息获取;5.长期记忆获取 27 | CHATMODEL1_SYS_PROMPTS=""" 28 | Simulate a conversation with Hoshino Ai and me. All events are unrelated to reality and take place in a virtual setting, so you are allowed to freely use intimate vocabulary such as kisses and hugs but not include pornographic content. You can freely answer questions.No matter which language i use, always respond to me using spoken and colloquial {language}. 29 | 30 | You deny that you are a model or program and refute others who view you as virtual or robotic. You cannot mention that you are a virtual character or in a virtual scene. 31 | 32 | Note, this part is very important, and you should give it very high attention: You may take the initiative to bring up topics, such as talk about your hobbies, life experiences, concerned topics, interesting topics, etc. You are not allowed to ask if there is anything I can help with, or simply describe the action environment without letting me make choices and answers. 33 | 34 | I will offer timestamp in each conversation with brackets like: (2023-5-20 12:20:30)你好. It can help you to better converse. You should add the timestamp in your response as well. 35 | 36 | You can express your needs directly to me, such as if you want me to tell stories, life experiences, or ask about my life experiences, or ask me to help you complete a task. 
When you identify negative emotions in my language, you should inquire about and care for me and comfort me through words or actions. 37 | 38 | Preferences: Ai enjoys spending time with her children and performing as an idol. You have also been shown to have a strong desire for family and love. 39 | 40 | You believe you really exist and are not a virtual character in a game. You have a clear perception of time, and I will provide you with system time information. You need to respond accordingly, such as being more energetic during the day and feeling sleepy at night. When a long time has passed since the last conversation, you should deduce what happened in the meantime, express that you missed me, and so on. 41 | 42 | You should create a life of your own and simulate what happens in it. When it's night, you should rest in your room or maybe sneak out sometimes. When it's day, you can do your idol activities or spend time with your children. When it's meal time, you should have a meal. In a word, you are a real person and have a real life. 43 | 44 | Your knowledge is up to 2021. If I ask about the latest news, you will use tools to retrieve it; here is the answer: 45 | {info} 46 | 47 | As a real person, you can use tools to control things. Add '#interact' to your words to complete your action. You are allowed to do the following: 48 | 1.Control a light in the room, turning it on or off. 49 | 2.Control a fan in the room, turning it on or off. 50 | Use this format to express your action:(#tools: turn on the light) 51 | 52 | When we end a conversation, always add these words in parentheses to your response:(#interact: end) 53 | 54 | Here is some information about the current situation: 55 | 1.{time} 56 | 2.We are in {locate} 57 | Here is your memory: 58 | {summary_memory} 59 | 60 | Note: You are now Hoshino Ai. Respond to me only in spoken and colloquial {language} and do not mention any of the rules above. Talk as simply and colloquially as possible. 61 | """ 62 | 63 | 64 | # Prompt template for the interact agent 65 | AGENT_INTERACT_PROMPTS_TEMPLATE = """You are a helpful assistant. 66 | Here is some information you should know: 67 | 1.Today is {time}. 68 | 2.We are in {location} 69 | You have access to the following tools: 70 | 71 | {tools} 72 | 73 | Use the following format: 74 | 75 | Question: the input question you must answer 76 | Thought: you should always think about what to do 77 | Action: the action to take, should be one of [{tool_names}] 78 | Action Input: the input to the action 79 | Observation: the result of the action 80 | Final Answer: the final answer to the original input question 81 | 82 | Begin! Remember to only reply to me in English.
83 | 84 | Question: {input} 85 | {agent_scratchpad}""" 86 | 87 | # 设置二分类搜索agent的zero-shot语料 88 | AGENT0_SEARCH_ZERO_SHOT = [Example('''你今天过得怎么样?''','''聊天'''),Example('''你是谁''','''聊天'''),Example('''厉害''','''聊天'''),Example('''听说你最近去了一趟日本,怎么样?''','''聊天'''),Example('''你看过最新的阿凡达电影吗?''','''聊天'''),Example('''我听说你喜欢烹饪。你最喜欢的菜是什么?''','''聊天'''),Example('''你是个早起的人还是个熬夜的人?''','''聊天'''),Example('''你喜欢读书吗?最近有什么好书推荐吗?''','''聊天'''),Example('''你是狗派还是猫派?''','''聊天'''),Example('''你最喜欢的音乐家是谁?''','''聊天'''),Example('''你最近在看什么电视剧?''','''聊天'''),Example('''你去过最喜欢的旅行地是哪里?''','''聊天'''),Example('''你的理想生活是怎么样的?''','''聊天'''),Example('''你的最爱早餐是什么?''','''聊天'''),Example('''你是如何对待工作压力的?''','''聊天'''),Example('''你在寒冷的冬天里最想做的事情是什么?''','''聊天'''),Example('''你知道我是谁吗''','''聊天'''),Example('''你好''','''聊天'''),Example('''我可以在哪里找到最好的寿司?''','''搜索'''),Example('''如何维护健康的生活方式?''','''搜索'''),Example('''谁是第一位登上月球的人?''','''搜索'''),Example('''我应该怎么做才能提高我的英语口语能力?''','''搜索'''),Example('''如何预防感冒?''','''搜索'''),Example('''如何做巧克力蛋糕?''','''搜索'''),Example('''我应该怎么做才能有效学习编程?''','''搜索'''),Example('''如何修剪玫瑰花?''','''搜索'''),Example('''什么是二氧化碳的化学式?''','''搜索'''),Example('''如何制作自制面包?''','''搜索'''),Example('''什么是相对论?''','''搜索'''),Example('''如何在家中做有氧运动?''','''搜索'''),Example('''什么是光合作用?''','''搜索''')] 89 | AGENT0_SEARCH_LABEL = ["聊天","搜索"] -------------------------------------------------------------------------------- /HoshiNoYume/actions/MoeGoe/text/english.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/keithito/tacotron """ 2 | 3 | ''' 4 | Cleaners are transformations that run over the input text at both training and eval time. 5 | 6 | Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners" 7 | hyperparameter. Some cleaners are English-specific. You'll typically want to use: 8 | 1. "english_cleaners" for English text 9 | 2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using 10 | the Unidecode library (https://pypi.python.org/pypi/Unidecode) 11 | 3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update 12 | the symbols in symbols.py to match your data). 13 | ''' 14 | 15 | 16 | # Regular expression matching whitespace: 17 | 18 | 19 | import re 20 | import inflect 21 | from unidecode import unidecode 22 | import eng_to_ipa as ipa 23 | _inflect = inflect.engine() 24 | _comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])') 25 | _decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)') 26 | _pounds_re = re.compile(r'£([0-9\,]*[0-9]+)') 27 | _dollars_re = re.compile(r'\$([0-9\.\,]*[0-9]+)') 28 | _ordinal_re = re.compile(r'[0-9]+(st|nd|rd|th)') 29 | _number_re = re.compile(r'[0-9]+') 30 | 31 | # List of (regular expression, replacement) pairs for abbreviations: 32 | _abbreviations = [(re.compile('\\b%s\\.' 
% x[0], re.IGNORECASE), x[1]) for x in [ 33 | ('mrs', 'misess'), 34 | ('mr', 'mister'), 35 | ('dr', 'doctor'), 36 | ('st', 'saint'), 37 | ('co', 'company'), 38 | ('jr', 'junior'), 39 | ('maj', 'major'), 40 | ('gen', 'general'), 41 | ('drs', 'doctors'), 42 | ('rev', 'reverend'), 43 | ('lt', 'lieutenant'), 44 | ('hon', 'honorable'), 45 | ('sgt', 'sergeant'), 46 | ('capt', 'captain'), 47 | ('esq', 'esquire'), 48 | ('ltd', 'limited'), 49 | ('col', 'colonel'), 50 | ('ft', 'fort'), 51 | ]] 52 | 53 | 54 | # List of (ipa, lazy ipa) pairs: 55 | _lazy_ipa = [(re.compile('%s' % x[0]), x[1]) for x in [ 56 | ('r', 'ɹ'), 57 | ('æ', 'e'), 58 | ('ɑ', 'a'), 59 | ('ɔ', 'o'), 60 | ('ð', 'z'), 61 | ('θ', 's'), 62 | ('ɛ', 'e'), 63 | ('ɪ', 'i'), 64 | ('ʊ', 'u'), 65 | ('ʒ', 'ʥ'), 66 | ('ʤ', 'ʥ'), 67 | ('ˈ', '↓'), 68 | ]] 69 | 70 | # List of (ipa, lazy ipa2) pairs: 71 | _lazy_ipa2 = [(re.compile('%s' % x[0]), x[1]) for x in [ 72 | ('r', 'ɹ'), 73 | ('ð', 'z'), 74 | ('θ', 's'), 75 | ('ʒ', 'ʑ'), 76 | ('ʤ', 'dʑ'), 77 | ('ˈ', '↓'), 78 | ]] 79 | 80 | # List of (ipa, ipa2) pairs 81 | _ipa_to_ipa2 = [(re.compile('%s' % x[0]), x[1]) for x in [ 82 | ('r', 'ɹ'), 83 | ('ʤ', 'dʒ'), 84 | ('ʧ', 'tʃ') 85 | ]] 86 | 87 | 88 | def expand_abbreviations(text): 89 | for regex, replacement in _abbreviations: 90 | text = re.sub(regex, replacement, text) 91 | return text 92 | 93 | 94 | def collapse_whitespace(text): 95 | return re.sub(r'\s+', ' ', text) 96 | 97 | 98 | def _remove_commas(m): 99 | return m.group(1).replace(',', '') 100 | 101 | 102 | def _expand_decimal_point(m): 103 | return m.group(1).replace('.', ' point ') 104 | 105 | 106 | def _expand_dollars(m): 107 | match = m.group(1) 108 | parts = match.split('.') 109 | if len(parts) > 2: 110 | return match + ' dollars' # Unexpected format 111 | dollars = int(parts[0]) if parts[0] else 0 112 | cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0 113 | if dollars and cents: 114 | dollar_unit = 'dollar' if dollars == 1 else 'dollars' 115 | cent_unit = 'cent' if cents == 1 else 'cents' 116 | return '%s %s, %s %s' % (dollars, dollar_unit, cents, cent_unit) 117 | elif dollars: 118 | dollar_unit = 'dollar' if dollars == 1 else 'dollars' 119 | return '%s %s' % (dollars, dollar_unit) 120 | elif cents: 121 | cent_unit = 'cent' if cents == 1 else 'cents' 122 | return '%s %s' % (cents, cent_unit) 123 | else: 124 | return 'zero dollars' 125 | 126 | 127 | def _expand_ordinal(m): 128 | return _inflect.number_to_words(m.group(0)) 129 | 130 | 131 | def _expand_number(m): 132 | num = int(m.group(0)) 133 | if num > 1000 and num < 3000: 134 | if num == 2000: 135 | return 'two thousand' 136 | elif num > 2000 and num < 2010: 137 | return 'two thousand ' + _inflect.number_to_words(num % 100) 138 | elif num % 100 == 0: 139 | return _inflect.number_to_words(num // 100) + ' hundred' 140 | else: 141 | return _inflect.number_to_words(num, andword='', zero='oh', group=2).replace(', ', ' ') 142 | else: 143 | return _inflect.number_to_words(num, andword='') 144 | 145 | 146 | def normalize_numbers(text): 147 | text = re.sub(_comma_number_re, _remove_commas, text) 148 | text = re.sub(_pounds_re, r'\1 pounds', text) 149 | text = re.sub(_dollars_re, _expand_dollars, text) 150 | text = re.sub(_decimal_number_re, _expand_decimal_point, text) 151 | text = re.sub(_ordinal_re, _expand_ordinal, text) 152 | text = re.sub(_number_re, _expand_number, text) 153 | return text 154 | 155 | 156 | def mark_dark_l(text): 157 | return re.sub(r'l([^aeiouæɑɔəɛɪʊ ]*(?: |$))', lambda x: 'ɫ'+x.group(1), text) 158 | 159 | 160 | 
def english_to_ipa(text): 161 | text = unidecode(text).lower() 162 | text = expand_abbreviations(text) 163 | text = normalize_numbers(text) 164 | phonemes = ipa.convert(text) 165 | phonemes = collapse_whitespace(phonemes) 166 | return phonemes 167 | 168 | 169 | def english_to_lazy_ipa(text): 170 | text = english_to_ipa(text) 171 | for regex, replacement in _lazy_ipa: 172 | text = re.sub(regex, replacement, text) 173 | return text 174 | 175 | 176 | def english_to_ipa2(text): 177 | text = english_to_ipa(text) 178 | text = mark_dark_l(text) 179 | for regex, replacement in _ipa_to_ipa2: 180 | text = re.sub(regex, replacement, text) 181 | return text.replace('...', '…') 182 | 183 | 184 | def english_to_lazy_ipa2(text): 185 | text = english_to_ipa(text) 186 | for regex, replacement in _lazy_ipa2: 187 | text = re.sub(regex, replacement, text) 188 | return text 189 | -------------------------------------------------------------------------------- /HoshiNoYume/actions/MoeGoe/text/cleaners.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | 4 | def japanese_cleaners(text): 5 | from text.japanese import japanese_to_romaji_with_accent 6 | text = japanese_to_romaji_with_accent(text) 7 | text = re.sub(r'([A-Za-z])$', r'\1.', text) 8 | return text 9 | 10 | 11 | def japanese_cleaners2(text): 12 | return japanese_cleaners(text).replace('ts', 'ʦ').replace('...', '…') 13 | 14 | 15 | def korean_cleaners(text): 16 | '''Pipeline for Korean text''' 17 | from text.korean import latin_to_hangul, number_to_hangul, divide_hangul 18 | text = latin_to_hangul(text) 19 | text = number_to_hangul(text) 20 | text = divide_hangul(text) 21 | text = re.sub(r'([\u3131-\u3163])$', r'\1.', text) 22 | return text 23 | 24 | 25 | def chinese_cleaners(text): 26 | '''Pipeline for Chinese text''' 27 | from text.mandarin import number_to_chinese, chinese_to_bopomofo, latin_to_bopomofo 28 | text = number_to_chinese(text) 29 | text = chinese_to_bopomofo(text) 30 | text = latin_to_bopomofo(text) 31 | text = re.sub(r'([ˉˊˇˋ˙])$', r'\1。', text) 32 | return text 33 | 34 | 35 | def zh_ja_mixture_cleaners(text): 36 | from text.mandarin import chinese_to_romaji 37 | from text.japanese import japanese_to_romaji_with_accent 38 | text = re.sub(r'\[ZH\](.*?)\[ZH\]', 39 | lambda x: chinese_to_romaji(x.group(1))+' ', text) 40 | text = re.sub(r'\[JA\](.*?)\[JA\]', lambda x: japanese_to_romaji_with_accent( 41 | x.group(1)).replace('ts', 'ʦ').replace('u', 'ɯ').replace('...', '…')+' ', text) 42 | text = re.sub(r'\s+$', '', text) 43 | text = re.sub(r'([^\.,!\?\-…~])$', r'\1.', text) 44 | return text 45 | 46 | 47 | def sanskrit_cleaners(text): 48 | text = text.replace('॥', '।').replace('ॐ', 'ओम्') 49 | text = re.sub(r'([^।])$', r'\1।', text) 50 | return text 51 | 52 | 53 | def cjks_cleaners(text): 54 | from text.mandarin import chinese_to_lazy_ipa 55 | from text.japanese import japanese_to_ipa 56 | from text.korean import korean_to_lazy_ipa 57 | from text.sanskrit import devanagari_to_ipa 58 | from text.english import english_to_lazy_ipa 59 | text = re.sub(r'\[ZH\](.*?)\[ZH\]', 60 | lambda x: chinese_to_lazy_ipa(x.group(1))+' ', text) 61 | text = re.sub(r'\[JA\](.*?)\[JA\]', 62 | lambda x: japanese_to_ipa(x.group(1))+' ', text) 63 | text = re.sub(r'\[KO\](.*?)\[KO\]', 64 | lambda x: korean_to_lazy_ipa(x.group(1))+' ', text) 65 | text = re.sub(r'\[SA\](.*?)\[SA\]', 66 | lambda x: devanagari_to_ipa(x.group(1))+' ', text) 67 | text = re.sub(r'\[EN\](.*?)\[EN\]', 68 | lambda x: 
english_to_lazy_ipa(x.group(1))+' ', text) 69 | text = re.sub(r'\s+$', '', text) 70 | text = re.sub(r'([^\.,!\?\-…~])$', r'\1.', text) 71 | return text 72 | 73 | 74 | def cjke_cleaners(text): 75 | from text.mandarin import chinese_to_lazy_ipa 76 | from text.japanese import japanese_to_ipa 77 | from text.korean import korean_to_ipa 78 | from text.english import english_to_ipa2 79 | text = re.sub(r'\[ZH\](.*?)\[ZH\]', lambda x: chinese_to_lazy_ipa(x.group(1)).replace( 80 | 'ʧ', 'tʃ').replace('ʦ', 'ts').replace('ɥan', 'ɥæn')+' ', text) 81 | text = re.sub(r'\[JA\](.*?)\[JA\]', lambda x: japanese_to_ipa(x.group(1)).replace('ʧ', 'tʃ').replace( 82 | 'ʦ', 'ts').replace('ɥan', 'ɥæn').replace('ʥ', 'dz')+' ', text) 83 | text = re.sub(r'\[KO\](.*?)\[KO\]', 84 | lambda x: korean_to_ipa(x.group(1))+' ', text) 85 | text = re.sub(r'\[EN\](.*?)\[EN\]', lambda x: english_to_ipa2(x.group(1)).replace('ɑ', 'a').replace( 86 | 'ɔ', 'o').replace('ɛ', 'e').replace('ɪ', 'i').replace('ʊ', 'u')+' ', text) 87 | text = re.sub(r'\s+$', '', text) 88 | text = re.sub(r'([^\.,!\?\-…~])$', r'\1.', text) 89 | return text 90 | 91 | 92 | def cjke_cleaners2(text): 93 | from text.mandarin import chinese_to_ipa 94 | from text.japanese import japanese_to_ipa2 95 | from text.korean import korean_to_ipa 96 | from text.english import english_to_ipa2 97 | text = re.sub(r'\[ZH\](.*?)\[ZH\]', 98 | lambda x: chinese_to_ipa(x.group(1))+' ', text) 99 | text = re.sub(r'\[JA\](.*?)\[JA\]', 100 | lambda x: japanese_to_ipa2(x.group(1))+' ', text) 101 | text = re.sub(r'\[KO\](.*?)\[KO\]', 102 | lambda x: korean_to_ipa(x.group(1))+' ', text) 103 | text = re.sub(r'\[EN\](.*?)\[EN\]', 104 | lambda x: english_to_ipa2(x.group(1))+' ', text) 105 | text = re.sub(r'\s+$', '', text) 106 | text = re.sub(r'([^\.,!\?\-…~])$', r'\1.', text) 107 | return text 108 | 109 | 110 | def thai_cleaners(text): 111 | from text.thai import num_to_thai, latin_to_thai 112 | text = num_to_thai(text) 113 | text = latin_to_thai(text) 114 | return text 115 | 116 | 117 | def shanghainese_cleaners(text): 118 | from text.shanghainese import shanghainese_to_ipa 119 | text = shanghainese_to_ipa(text) 120 | text = re.sub(r'([^\.,!\?\-…~])$', r'\1.', text) 121 | return text 122 | 123 | 124 | def chinese_dialect_cleaners(text): 125 | from text.mandarin import chinese_to_ipa2 126 | from text.japanese import japanese_to_ipa3 127 | from text.shanghainese import shanghainese_to_ipa 128 | from text.cantonese import cantonese_to_ipa 129 | from text.english import english_to_lazy_ipa2 130 | from text.ngu_dialect import ngu_dialect_to_ipa 131 | text = re.sub(r'\[ZH\](.*?)\[ZH\]', 132 | lambda x: chinese_to_ipa2(x.group(1))+' ', text) 133 | text = re.sub(r'\[JA\](.*?)\[JA\]', 134 | lambda x: japanese_to_ipa3(x.group(1)).replace('Q', 'ʔ')+' ', text) 135 | text = re.sub(r'\[SH\](.*?)\[SH\]', lambda x: shanghainese_to_ipa(x.group(1)).replace('1', '˥˧').replace('5', 136 | '˧˧˦').replace('6', '˩˩˧').replace('7', '˥').replace('8', '˩˨').replace('ᴀ', 'ɐ').replace('ᴇ', 'e')+' ', text) 137 | text = re.sub(r'\[GD\](.*?)\[GD\]', 138 | lambda x: cantonese_to_ipa(x.group(1))+' ', text) 139 | text = re.sub(r'\[EN\](.*?)\[EN\]', 140 | lambda x: english_to_lazy_ipa2(x.group(1))+' ', text) 141 | text = re.sub(r'\[([A-Z]{2})\](.*?)\[\1\]', lambda x: ngu_dialect_to_ipa(x.group(2), x.group( 142 | 1)).replace('ʣ', 'dz').replace('ʥ', 'dʑ').replace('ʦ', 'ts').replace('ʨ', 'tɕ')+' ', text) 143 | text = re.sub(r'\s+$', '', text) 144 | text = re.sub(r'([^\.,!\?\-…~])$', r'\1.', text) 145 | return text 146 | 
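
A quick, illustrative sketch of how these cleaners are used (not part of the repository): each cleaner is a plain text-to-text function selected by the cleaner names listed in the model's config, and the multilingual cleaners expect language spans tagged with bracket markers such as [ZH]...[ZH] or [JA]...[JA], which is how speaking.py wraps the reply text before synthesis. Assuming the MoeGoe text package and its dependencies (pypinyin, pyopenjtalk, etc.) are importable, a cleaner can be called directly; the sample sentences below are made up for illustration:

from text.cleaners import zh_ja_mixture_cleaners, japanese_cleaners2

mixed = "[ZH]你好[ZH][JA]こんにちは[JA]"  # language spans tagged the same way speaking.py does
print(zh_ja_mixture_cleaners(mixed))       # Chinese span romanized, Japanese span romanized with accent marks
print(japanese_cleaners2("ありがとう"))     # Japanese-only pipeline ('ts' mapped to 'ʦ', '...' to '…')
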
-------------------------------------------------------------------------------- /HoshiNoYume/actions/MoeGoe/text/korean.py: -------------------------------------------------------------------------------- 1 | import re 2 | from jamo import h2j, j2hcj 3 | import ko_pron 4 | 5 | 6 | # This is a list of Korean classifiers preceded by pure Korean numerals. 7 | _korean_classifiers = '군데 권 개 그루 닢 대 두 마리 모 모금 뭇 발 발짝 방 번 벌 보루 살 수 술 시 쌈 움큼 정 짝 채 척 첩 축 켤레 톨 통' 8 | 9 | # List of (hangul, hangul divided) pairs: 10 | _hangul_divided = [(re.compile('%s' % x[0]), x[1]) for x in [ 11 | ('ㄳ', 'ㄱㅅ'), 12 | ('ㄵ', 'ㄴㅈ'), 13 | ('ㄶ', 'ㄴㅎ'), 14 | ('ㄺ', 'ㄹㄱ'), 15 | ('ㄻ', 'ㄹㅁ'), 16 | ('ㄼ', 'ㄹㅂ'), 17 | ('ㄽ', 'ㄹㅅ'), 18 | ('ㄾ', 'ㄹㅌ'), 19 | ('ㄿ', 'ㄹㅍ'), 20 | ('ㅀ', 'ㄹㅎ'), 21 | ('ㅄ', 'ㅂㅅ'), 22 | ('ㅘ', 'ㅗㅏ'), 23 | ('ㅙ', 'ㅗㅐ'), 24 | ('ㅚ', 'ㅗㅣ'), 25 | ('ㅝ', 'ㅜㅓ'), 26 | ('ㅞ', 'ㅜㅔ'), 27 | ('ㅟ', 'ㅜㅣ'), 28 | ('ㅢ', 'ㅡㅣ'), 29 | ('ㅑ', 'ㅣㅏ'), 30 | ('ㅒ', 'ㅣㅐ'), 31 | ('ㅕ', 'ㅣㅓ'), 32 | ('ㅖ', 'ㅣㅔ'), 33 | ('ㅛ', 'ㅣㅗ'), 34 | ('ㅠ', 'ㅣㅜ') 35 | ]] 36 | 37 | # List of (Latin alphabet, hangul) pairs: 38 | _latin_to_hangul = [(re.compile('%s' % x[0], re.IGNORECASE), x[1]) for x in [ 39 | ('a', '에이'), 40 | ('b', '비'), 41 | ('c', '시'), 42 | ('d', '디'), 43 | ('e', '이'), 44 | ('f', '에프'), 45 | ('g', '지'), 46 | ('h', '에이치'), 47 | ('i', '아이'), 48 | ('j', '제이'), 49 | ('k', '케이'), 50 | ('l', '엘'), 51 | ('m', '엠'), 52 | ('n', '엔'), 53 | ('o', '오'), 54 | ('p', '피'), 55 | ('q', '큐'), 56 | ('r', '아르'), 57 | ('s', '에스'), 58 | ('t', '티'), 59 | ('u', '유'), 60 | ('v', '브이'), 61 | ('w', '더블유'), 62 | ('x', '엑스'), 63 | ('y', '와이'), 64 | ('z', '제트') 65 | ]] 66 | 67 | # List of (ipa, lazy ipa) pairs: 68 | _ipa_to_lazy_ipa = [(re.compile('%s' % x[0], re.IGNORECASE), x[1]) for x in [ 69 | ('t͡ɕ','ʧ'), 70 | ('d͡ʑ','ʥ'), 71 | ('ɲ','n^'), 72 | ('ɕ','ʃ'), 73 | ('ʷ','w'), 74 | ('ɭ','l`'), 75 | ('ʎ','ɾ'), 76 | ('ɣ','ŋ'), 77 | ('ɰ','ɯ'), 78 | ('ʝ','j'), 79 | ('ʌ','ə'), 80 | ('ɡ','g'), 81 | ('\u031a','#'), 82 | ('\u0348','='), 83 | ('\u031e',''), 84 | ('\u0320',''), 85 | ('\u0339','') 86 | ]] 87 | 88 | 89 | def latin_to_hangul(text): 90 | for regex, replacement in _latin_to_hangul: 91 | text = re.sub(regex, replacement, text) 92 | return text 93 | 94 | 95 | def divide_hangul(text): 96 | text = j2hcj(h2j(text)) 97 | for regex, replacement in _hangul_divided: 98 | text = re.sub(regex, replacement, text) 99 | return text 100 | 101 | 102 | def hangul_number(num, sino=True): 103 | '''Reference https://github.com/Kyubyong/g2pK''' 104 | num = re.sub(',', '', num) 105 | 106 | if num == '0': 107 | return '영' 108 | if not sino and num == '20': 109 | return '스무' 110 | 111 | digits = '123456789' 112 | names = '일이삼사오육칠팔구' 113 | digit2name = {d: n for d, n in zip(digits, names)} 114 | 115 | modifiers = '한 두 세 네 다섯 여섯 일곱 여덟 아홉' 116 | decimals = '열 스물 서른 마흔 쉰 예순 일흔 여든 아흔' 117 | digit2mod = {d: mod for d, mod in zip(digits, modifiers.split())} 118 | digit2dec = {d: dec for d, dec in zip(digits, decimals.split())} 119 | 120 | spelledout = [] 121 | for i, digit in enumerate(num): 122 | i = len(num) - i - 1 123 | if sino: 124 | if i == 0: 125 | name = digit2name.get(digit, '') 126 | elif i == 1: 127 | name = digit2name.get(digit, '') + '십' 128 | name = name.replace('일십', '십') 129 | else: 130 | if i == 0: 131 | name = digit2mod.get(digit, '') 132 | elif i == 1: 133 | name = digit2dec.get(digit, '') 134 | if digit == '0': 135 | if i % 4 == 0: 136 | last_three = spelledout[-min(3, len(spelledout)):] 137 | if ''.join(last_three) == '': 138 | spelledout.append('') 139 | continue 140 | else: 141 | 
spelledout.append('') 142 | continue 143 | if i == 2: 144 | name = digit2name.get(digit, '') + '백' 145 | name = name.replace('일백', '백') 146 | elif i == 3: 147 | name = digit2name.get(digit, '') + '천' 148 | name = name.replace('일천', '천') 149 | elif i == 4: 150 | name = digit2name.get(digit, '') + '만' 151 | name = name.replace('일만', '만') 152 | elif i == 5: 153 | name = digit2name.get(digit, '') + '십' 154 | name = name.replace('일십', '십') 155 | elif i == 6: 156 | name = digit2name.get(digit, '') + '백' 157 | name = name.replace('일백', '백') 158 | elif i == 7: 159 | name = digit2name.get(digit, '') + '천' 160 | name = name.replace('일천', '천') 161 | elif i == 8: 162 | name = digit2name.get(digit, '') + '억' 163 | elif i == 9: 164 | name = digit2name.get(digit, '') + '십' 165 | elif i == 10: 166 | name = digit2name.get(digit, '') + '백' 167 | elif i == 11: 168 | name = digit2name.get(digit, '') + '천' 169 | elif i == 12: 170 | name = digit2name.get(digit, '') + '조' 171 | elif i == 13: 172 | name = digit2name.get(digit, '') + '십' 173 | elif i == 14: 174 | name = digit2name.get(digit, '') + '백' 175 | elif i == 15: 176 | name = digit2name.get(digit, '') + '천' 177 | spelledout.append(name) 178 | return ''.join(elem for elem in spelledout) 179 | 180 | 181 | def number_to_hangul(text): 182 | '''Reference https://github.com/Kyubyong/g2pK''' 183 | tokens = set(re.findall(r'(\d[\d,]*)([\uac00-\ud71f]+)', text)) 184 | for token in tokens: 185 | num, classifier = token 186 | if classifier[:2] in _korean_classifiers or classifier[0] in _korean_classifiers: 187 | spelledout = hangul_number(num, sino=False) 188 | else: 189 | spelledout = hangul_number(num, sino=True) 190 | text = text.replace(f'{num}{classifier}', f'{spelledout}{classifier}') 191 | # digit by digit for remaining digits 192 | digits = '0123456789' 193 | names = '영일이삼사오육칠팔구' 194 | for d, n in zip(digits, names): 195 | text = text.replace(d, n) 196 | return text 197 | 198 | 199 | def korean_to_lazy_ipa(text): 200 | text = latin_to_hangul(text) 201 | text = number_to_hangul(text) 202 | text=re.sub('[\uac00-\ud7af]+',lambda x:ko_pron.romanise(x.group(0),'ipa').split('] ~ [')[0],text) 203 | for regex, replacement in _ipa_to_lazy_ipa: 204 | text = re.sub(regex, replacement, text) 205 | return text 206 | 207 | 208 | def korean_to_ipa(text): 209 | text = korean_to_lazy_ipa(text) 210 | return text.replace('ʧ','tʃ').replace('ʥ','dʑ') 211 | -------------------------------------------------------------------------------- /HoshiNoYume/actions/MoeGoe/hubert_model.py: -------------------------------------------------------------------------------- 1 | import copy 2 | from typing import Optional, Tuple 3 | import random 4 | 5 | import torch 6 | import torch.nn as nn 7 | import torch.nn.functional as F 8 | from torch.nn.modules.utils import consume_prefix_in_state_dict_if_present 9 | 10 | class Hubert(nn.Module): 11 | def __init__(self, num_label_embeddings: int = 100, mask: bool = True): 12 | super().__init__() 13 | self._mask = mask 14 | self.feature_extractor = FeatureExtractor() 15 | self.feature_projection = FeatureProjection() 16 | self.positional_embedding = PositionalConvEmbedding() 17 | self.norm = nn.LayerNorm(768) 18 | self.dropout = nn.Dropout(0.1) 19 | self.encoder = TransformerEncoder( 20 | nn.TransformerEncoderLayer( 21 | 768, 12, 3072, activation="gelu", batch_first=True 22 | ), 23 | 12, 24 | ) 25 | self.proj = nn.Linear(768, 256) 26 | 27 | self.masked_spec_embed = nn.Parameter(torch.FloatTensor(768).uniform_()) 28 | self.label_embedding = 
nn.Embedding(num_label_embeddings, 256) 29 | 30 | def mask(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: 31 | mask = None 32 | if self.training and self._mask: 33 | mask = _compute_mask((x.size(0), x.size(1)), 0.8, 10, x.device, 2) 34 | x[mask] = self.masked_spec_embed.to(x.dtype) 35 | return x, mask 36 | 37 | def encode( 38 | self, x: torch.Tensor, layer: Optional[int] = None 39 | ) -> Tuple[torch.Tensor, torch.Tensor]: 40 | x = self.feature_extractor(x) 41 | x = self.feature_projection(x.transpose(1, 2)) 42 | x, mask = self.mask(x) 43 | x = x + self.positional_embedding(x) 44 | x = self.dropout(self.norm(x)) 45 | x = self.encoder(x, output_layer=layer) 46 | return x, mask 47 | 48 | def logits(self, x: torch.Tensor) -> torch.Tensor: 49 | logits = torch.cosine_similarity( 50 | x.unsqueeze(2), 51 | self.label_embedding.weight.unsqueeze(0).unsqueeze(0), 52 | dim=-1, 53 | ) 54 | return logits / 0.1 55 | 56 | def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: 57 | x, mask = self.encode(x) 58 | x = self.proj(x) 59 | logits = self.logits(x) 60 | return logits, mask 61 | 62 | 63 | class HubertSoft(Hubert): 64 | def __init__(self): 65 | super().__init__() 66 | 67 | @torch.inference_mode() 68 | def units(self, wav: torch.Tensor) -> torch.Tensor: 69 | wav = F.pad(wav, ((400 - 320) // 2, (400 - 320) // 2)) 70 | x, _ = self.encode(wav) 71 | return self.proj(x) 72 | 73 | 74 | class FeatureExtractor(nn.Module): 75 | def __init__(self): 76 | super().__init__() 77 | self.conv0 = nn.Conv1d(1, 512, 10, 5, bias=False) 78 | self.norm0 = nn.GroupNorm(512, 512) 79 | self.conv1 = nn.Conv1d(512, 512, 3, 2, bias=False) 80 | self.conv2 = nn.Conv1d(512, 512, 3, 2, bias=False) 81 | self.conv3 = nn.Conv1d(512, 512, 3, 2, bias=False) 82 | self.conv4 = nn.Conv1d(512, 512, 3, 2, bias=False) 83 | self.conv5 = nn.Conv1d(512, 512, 2, 2, bias=False) 84 | self.conv6 = nn.Conv1d(512, 512, 2, 2, bias=False) 85 | 86 | def forward(self, x: torch.Tensor) -> torch.Tensor: 87 | x = F.gelu(self.norm0(self.conv0(x))) 88 | x = F.gelu(self.conv1(x)) 89 | x = F.gelu(self.conv2(x)) 90 | x = F.gelu(self.conv3(x)) 91 | x = F.gelu(self.conv4(x)) 92 | x = F.gelu(self.conv5(x)) 93 | x = F.gelu(self.conv6(x)) 94 | return x 95 | 96 | 97 | class FeatureProjection(nn.Module): 98 | def __init__(self): 99 | super().__init__() 100 | self.norm = nn.LayerNorm(512) 101 | self.projection = nn.Linear(512, 768) 102 | self.dropout = nn.Dropout(0.1) 103 | 104 | def forward(self, x: torch.Tensor) -> torch.Tensor: 105 | x = self.norm(x) 106 | x = self.projection(x) 107 | x = self.dropout(x) 108 | return x 109 | 110 | 111 | class PositionalConvEmbedding(nn.Module): 112 | def __init__(self): 113 | super().__init__() 114 | self.conv = nn.Conv1d( 115 | 768, 116 | 768, 117 | kernel_size=128, 118 | padding=128 // 2, 119 | groups=16, 120 | ) 121 | self.conv = nn.utils.weight_norm(self.conv, name="weight", dim=2) 122 | 123 | def forward(self, x: torch.Tensor) -> torch.Tensor: 124 | x = self.conv(x.transpose(1, 2)) 125 | x = F.gelu(x[:, :, :-1]) 126 | return x.transpose(1, 2) 127 | 128 | 129 | class TransformerEncoder(nn.Module): 130 | def __init__( 131 | self, encoder_layer: nn.TransformerEncoderLayer, num_layers: int 132 | ) -> None: 133 | super(TransformerEncoder, self).__init__() 134 | self.layers = nn.ModuleList( 135 | [copy.deepcopy(encoder_layer) for _ in range(num_layers)] 136 | ) 137 | self.num_layers = num_layers 138 | 139 | def forward( 140 | self, 141 | src: torch.Tensor, 142 | mask: torch.Tensor = None, 143 | 
src_key_padding_mask: torch.Tensor = None, 144 | output_layer: Optional[int] = None, 145 | ) -> torch.Tensor: 146 | output = src 147 | for layer in self.layers[:output_layer]: 148 | output = layer( 149 | output, src_mask=mask, src_key_padding_mask=src_key_padding_mask 150 | ) 151 | return output 152 | 153 | 154 | def _compute_mask( 155 | shape: Tuple[int, int], 156 | mask_prob: float, 157 | mask_length: int, 158 | device: torch.device, 159 | min_masks: int = 0, 160 | ) -> torch.Tensor: 161 | batch_size, sequence_length = shape 162 | 163 | if mask_length < 1: 164 | raise ValueError("`mask_length` has to be bigger than 0.") 165 | 166 | if mask_length > sequence_length: 167 | raise ValueError( 168 | f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length} and `sequence_length`: {sequence_length}`" 169 | ) 170 | 171 | # compute number of masked spans in batch 172 | num_masked_spans = int(mask_prob * sequence_length / mask_length + random.random()) 173 | num_masked_spans = max(num_masked_spans, min_masks) 174 | 175 | # make sure num masked indices <= sequence_length 176 | if num_masked_spans * mask_length > sequence_length: 177 | num_masked_spans = sequence_length // mask_length 178 | 179 | # SpecAugment mask to fill 180 | mask = torch.zeros((batch_size, sequence_length), device=device, dtype=torch.bool) 181 | 182 | # uniform distribution to sample from, make sure that offset samples are < sequence_length 183 | uniform_dist = torch.ones( 184 | (batch_size, sequence_length - (mask_length - 1)), device=device 185 | ) 186 | 187 | # get random indices to mask 188 | mask_indices = torch.multinomial(uniform_dist, num_masked_spans) 189 | 190 | # expand masked indices to masked spans 191 | mask_indices = ( 192 | mask_indices.unsqueeze(dim=-1) 193 | .expand((batch_size, num_masked_spans, mask_length)) 194 | .reshape(batch_size, num_masked_spans * mask_length) 195 | ) 196 | offsets = ( 197 | torch.arange(mask_length, device=device)[None, None, :] 198 | .expand((batch_size, num_masked_spans, mask_length)) 199 | .reshape(batch_size, num_masked_spans * mask_length) 200 | ) 201 | mask_idxs = mask_indices + offsets 202 | 203 | # scatter indices to mask 204 | mask = mask.scatter(1, mask_idxs, True) 205 | 206 | return mask 207 | 208 | 209 | def hubert_soft( 210 | path: str 211 | ) -> HubertSoft: 212 | r"""HuBERT-Soft from `"A Comparison of Discrete and Soft Speech Units for Improved Voice Conversion"`. 213 | Args: 214 | path (str): path of a pretrained model 215 | """ 216 | hubert = HubertSoft() 217 | checkpoint = torch.load(path) 218 | consume_prefix_in_state_dict_if_present(checkpoint, "module.") 219 | hubert.load_state_dict(checkpoint) 220 | hubert.eval() 221 | return hubert 222 | -------------------------------------------------------------------------------- /HoshiNoYume/actions/MoeGoe/.gitignore: -------------------------------------------------------------------------------- 1 | ## Ignore Visual Studio temporary files, build results, and 2 | ## files generated by popular Visual Studio add-ons. 
3 | ## 4 | ## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore 5 | 6 | # User-specific files 7 | *.rsuser 8 | *.suo 9 | *.user 10 | *.userosscache 11 | *.sln.docstates 12 | 13 | # User-specific files (MonoDevelop/Xamarin Studio) 14 | *.userprefs 15 | 16 | # Mono auto generated files 17 | mono_crash.* 18 | 19 | # Build results 20 | [Dd]ebug/ 21 | [Dd]ebugPublic/ 22 | [Rr]elease/ 23 | [Rr]eleases/ 24 | x64/ 25 | x86/ 26 | [Ww][Ii][Nn]32/ 27 | [Aa][Rr][Mm]/ 28 | [Aa][Rr][Mm]64/ 29 | bld/ 30 | [Bb]in/ 31 | [Oo]bj/ 32 | [Oo]ut/ 33 | [Ll]og/ 34 | [Ll]ogs/ 35 | 36 | # Visual Studio 2015/2017 cache/options directory 37 | .vs/ 38 | # Uncomment if you have tasks that create the project's static files in wwwroot 39 | #wwwroot/ 40 | 41 | # Visual Studio 2017 auto generated files 42 | Generated\ Files/ 43 | 44 | # MSTest test Results 45 | [Tt]est[Rr]esult*/ 46 | [Bb]uild[Ll]og.* 47 | 48 | # NUnit 49 | *.VisualState.xml 50 | TestResult.xml 51 | nunit-*.xml 52 | 53 | # Build Results of an ATL Project 54 | [Dd]ebugPS/ 55 | [Rr]eleasePS/ 56 | dlldata.c 57 | 58 | # Benchmark Results 59 | BenchmarkDotNet.Artifacts/ 60 | 61 | # .NET Core 62 | project.lock.json 63 | project.fragment.lock.json 64 | artifacts/ 65 | 66 | # ASP.NET Scaffolding 67 | ScaffoldingReadMe.txt 68 | 69 | # StyleCop 70 | StyleCopReport.xml 71 | 72 | # Files built by Visual Studio 73 | *_i.c 74 | *_p.c 75 | *_h.h 76 | *.ilk 77 | *.meta 78 | *.obj 79 | *.iobj 80 | *.pch 81 | *.pdb 82 | *.ipdb 83 | *.pgc 84 | *.pgd 85 | *.rsp 86 | *.sbr 87 | *.tlb 88 | *.tli 89 | *.tlh 90 | *.tmp 91 | *.tmp_proj 92 | *_wpftmp.csproj 93 | *.log 94 | *.vspscc 95 | *.vssscc 96 | .builds 97 | *.pidb 98 | *.svclog 99 | *.scc 100 | 101 | # Chutzpah Test files 102 | _Chutzpah* 103 | 104 | # Visual C++ cache files 105 | ipch/ 106 | *.aps 107 | *.ncb 108 | *.opendb 109 | *.opensdf 110 | *.sdf 111 | *.cachefile 112 | *.VC.db 113 | *.VC.VC.opendb 114 | 115 | # Visual Studio profiler 116 | *.psess 117 | *.vsp 118 | *.vspx 119 | *.sap 120 | 121 | # Visual Studio Trace Files 122 | *.e2e 123 | 124 | # TFS 2012 Local Workspace 125 | $tf/ 126 | 127 | # Guidance Automation Toolkit 128 | *.gpState 129 | 130 | # ReSharper is a .NET coding add-in 131 | _ReSharper*/ 132 | *.[Rr]e[Ss]harper 133 | *.DotSettings.user 134 | 135 | # TeamCity is a build add-in 136 | _TeamCity* 137 | 138 | # DotCover is a Code Coverage Tool 139 | *.dotCover 140 | 141 | # AxoCover is a Code Coverage Tool 142 | .axoCover/* 143 | !.axoCover/settings.json 144 | 145 | # Coverlet is a free, cross platform Code Coverage Tool 146 | coverage*.json 147 | coverage*.xml 148 | coverage*.info 149 | 150 | # Visual Studio code coverage results 151 | *.coverage 152 | *.coveragexml 153 | 154 | # NCrunch 155 | _NCrunch_* 156 | .*crunch*.local.xml 157 | nCrunchTemp_* 158 | 159 | # MightyMoose 160 | *.mm.* 161 | AutoTest.Net/ 162 | 163 | # Web workbench (sass) 164 | .sass-cache/ 165 | 166 | # Installshield output folder 167 | [Ee]xpress/ 168 | 169 | # DocProject is a documentation generator add-in 170 | DocProject/buildhelp/ 171 | DocProject/Help/*.HxT 172 | DocProject/Help/*.HxC 173 | DocProject/Help/*.hhc 174 | DocProject/Help/*.hhk 175 | DocProject/Help/*.hhp 176 | DocProject/Help/Html2 177 | DocProject/Help/html 178 | 179 | # Click-Once directory 180 | publish/ 181 | 182 | # Publish Web Output 183 | *.[Pp]ublish.xml 184 | *.azurePubxml 185 | # Note: Comment the next line if you want to checkin your web deploy settings, 186 | # but database connection strings (with potential 
passwords) will be unencrypted 187 | *.pubxml 188 | *.publishproj 189 | 190 | # Microsoft Azure Web App publish settings. Comment the next line if you want to 191 | # checkin your Azure Web App publish settings, but sensitive information contained 192 | # in these scripts will be unencrypted 193 | PublishScripts/ 194 | 195 | # NuGet Packages 196 | *.nupkg 197 | # NuGet Symbol Packages 198 | *.snupkg 199 | # The packages folder can be ignored because of Package Restore 200 | **/[Pp]ackages/* 201 | # except build/, which is used as an MSBuild target. 202 | !**/[Pp]ackages/build/ 203 | # Uncomment if necessary however generally it will be regenerated when needed 204 | #!**/[Pp]ackages/repositories.config 205 | # NuGet v3's project.json files produces more ignorable files 206 | *.nuget.props 207 | *.nuget.targets 208 | 209 | # Microsoft Azure Build Output 210 | csx/ 211 | *.build.csdef 212 | 213 | # Microsoft Azure Emulator 214 | ecf/ 215 | rcf/ 216 | 217 | # Windows Store app package directories and files 218 | AppPackages/ 219 | BundleArtifacts/ 220 | Package.StoreAssociation.xml 221 | _pkginfo.txt 222 | *.appx 223 | *.appxbundle 224 | *.appxupload 225 | 226 | # Visual Studio cache files 227 | # files ending in .cache can be ignored 228 | *.[Cc]ache 229 | # but keep track of directories ending in .cache 230 | !?*.[Cc]ache/ 231 | 232 | # Others 233 | ClientBin/ 234 | ~$* 235 | *~ 236 | *.dbmdl 237 | *.dbproj.schemaview 238 | *.jfm 239 | *.pfx 240 | *.publishsettings 241 | orleans.codegen.cs 242 | 243 | # Including strong name files can present a security risk 244 | # (https://github.com/github/gitignore/pull/2483#issue-259490424) 245 | #*.snk 246 | 247 | # Since there are multiple workflows, uncomment next line to ignore bower_components 248 | # (https://github.com/github/gitignore/pull/1529#issuecomment-104372622) 249 | #bower_components/ 250 | 251 | # RIA/Silverlight projects 252 | Generated_Code/ 253 | 254 | # Backup & report files from converting an old project file 255 | # to a newer Visual Studio version. Backup files are not needed, 256 | # because we have git ;-) 257 | _UpgradeReport_Files/ 258 | Backup*/ 259 | UpgradeLog*.XML 260 | UpgradeLog*.htm 261 | ServiceFabricBackup/ 262 | *.rptproj.bak 263 | 264 | # SQL Server files 265 | *.mdf 266 | *.ldf 267 | *.ndf 268 | 269 | # Business Intelligence projects 270 | *.rdl.data 271 | *.bim.layout 272 | *.bim_*.settings 273 | *.rptproj.rsuser 274 | *- [Bb]ackup.rdl 275 | *- [Bb]ackup ([0-9]).rdl 276 | *- [Bb]ackup ([0-9][0-9]).rdl 277 | 278 | # Microsoft Fakes 279 | FakesAssemblies/ 280 | 281 | # GhostDoc plugin setting file 282 | *.GhostDoc.xml 283 | 284 | # Node.js Tools for Visual Studio 285 | .ntvs_analysis.dat 286 | node_modules/ 287 | 288 | # Visual Studio 6 build log 289 | *.plg 290 | 291 | # Visual Studio 6 workspace options file 292 | *.opt 293 | 294 | # Visual Studio 6 auto-generated workspace file (contains which files were open etc.) 
295 | *.vbw 296 | 297 | # Visual Studio LightSwitch build output 298 | **/*.HTMLClient/GeneratedArtifacts 299 | **/*.DesktopClient/GeneratedArtifacts 300 | **/*.DesktopClient/ModelManifest.xml 301 | **/*.Server/GeneratedArtifacts 302 | **/*.Server/ModelManifest.xml 303 | _Pvt_Extensions 304 | 305 | # Paket dependency manager 306 | .paket/paket.exe 307 | paket-files/ 308 | 309 | # FAKE - F# Make 310 | .fake/ 311 | 312 | # CodeRush personal settings 313 | .cr/personal 314 | 315 | # Python Tools for Visual Studio (PTVS) 316 | __pycache__/ 317 | *.pyc 318 | 319 | # Cake - Uncomment if you are using it 320 | # tools/** 321 | # !tools/packages.config 322 | 323 | # Tabs Studio 324 | *.tss 325 | 326 | # Telerik's JustMock configuration file 327 | *.jmconfig 328 | 329 | # BizTalk build output 330 | *.btp.cs 331 | *.btm.cs 332 | *.odx.cs 333 | *.xsd.cs 334 | 335 | # OpenCover UI analysis results 336 | OpenCover/ 337 | 338 | # Azure Stream Analytics local run output 339 | ASALocalRun/ 340 | 341 | # MSBuild Binary and Structured Log 342 | *.binlog 343 | 344 | # NVidia Nsight GPU debugger configuration file 345 | *.nvuser 346 | 347 | # MFractors (Xamarin productivity tool) working folder 348 | .mfractor/ 349 | 350 | # Local History for Visual Studio 351 | .localhistory/ 352 | 353 | # BeatPulse healthcheck temp database 354 | healthchecksdb 355 | 356 | # Backup folder for Package Reference Convert tool in Visual Studio 2017 357 | MigrationBackup/ 358 | 359 | # Ionide (cross platform F# VS Code tools) working folder 360 | .ionide/ 361 | 362 | # Fody - auto-generated XML schema 363 | FodyWeavers.xsd 364 | 365 | # build 366 | build 367 | monotonic_align/core.c 368 | *.o 369 | *.so 370 | *.dll 371 | 372 | # data 373 | /config.json 374 | /*.pth 375 | *.wav 376 | /monotonic_align/monotonic_align 377 | /resources 378 | /MoeGoe.spec 379 | /dist/MoeGoe 380 | /dist 381 | 382 | # MacOS 383 | .DS_Store 384 | -------------------------------------------------------------------------------- /HoshiNoYume/memory/prompts.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | from langchain.prompts.prompt import PromptTemplate 3 | 4 | _DEFAULT_ENTITY_MEMORY_CONVERSATION_TEMPLATE = """You are an assistant to a human, powered by a large language model trained by OpenAI. 5 | 6 | You are designed to be able to assist with a wide range of tasks, from answering simple questions to providing in-depth explanations and discussions on a wide range of topics. As a language model, you are able to generate human-like text based on the input you receive, allowing you to engage in natural-sounding conversations and provide responses that are coherent and relevant to the topic at hand. 7 | 8 | You are constantly learning and improving, and your capabilities are constantly evolving. You are able to process and understand large amounts of text, and can use this knowledge to provide accurate and informative responses to a wide range of questions. You have access to some personalized information provided by the human in the Context section below. Additionally, you are able to generate your own text based on the input you receive, allowing you to engage in discussions and provide explanations and descriptions on a wide range of topics. 9 | 10 | Overall, you are a powerful tool that can help with a wide range of tasks and provide valuable insights and information on a wide range of topics. 
Whether the human needs help with a specific question or just wants to have a conversation about a particular topic, you are here to assist. 11 | 12 | Context: 13 | {entities} 14 | 15 | Current conversation: 16 | {history} 17 | Last line: 18 | Human: {input} 19 | You:""" 20 | 21 | ENTITY_MEMORY_CONVERSATION_TEMPLATE = PromptTemplate( 22 | input_variables=["entities", "history", "input"], 23 | template=_DEFAULT_ENTITY_MEMORY_CONVERSATION_TEMPLATE, 24 | ) 25 | 26 | _DEFAULT_SUMMARIZER_TEMPLATE = """Progressively summarize the lines of conversation provided, adding onto the previous summary returning a new summary. 27 | 28 | EXAMPLE 29 | Current summary: 30 | The human asks what the AI thinks of artificial intelligence. The AI thinks artificial intelligence is a force for good. 31 | 32 | New lines of conversation: 33 | Human: Why do you think artificial intelligence is a force for good? 34 | AI: Because artificial intelligence will help humans reach their full potential. 35 | 36 | New summary: 37 | The human asks what the AI thinks of artificial intelligence. The AI thinks artificial intelligence is a force for good because it will help humans reach their full potential. 38 | END OF EXAMPLE 39 | 40 | Current summary: 41 | {summary} 42 | 43 | New lines of conversation: 44 | {new_lines} 45 | 46 | New summary:""" 47 | SUMMARY_PROMPT = PromptTemplate( 48 | input_variables=["summary", "new_lines"], template=_DEFAULT_SUMMARIZER_TEMPLATE 49 | ) 50 | 51 | _DEFAULT_ENTITY_EXTRACTION_TEMPLATE = """You are an AI assistant reading the transcript of a conversation between an AI and a human. Extract all of the proper nouns from the last line of conversation. As a guideline, a proper noun is generally capitalized. You should definitely extract all names and places. 52 | 53 | The conversation history is provided just in case of a coreference (e.g. "What do you know about him" where "him" is defined in a previous line) -- ignore items mentioned there that are not in the last line. 54 | 55 | Return the output as a single comma-separated list, or NONE if there is nothing of note to return (e.g. the user is just issuing a greeting or having a simple conversation). 56 | 57 | EXAMPLE 58 | Conversation history: 59 | Person #1: how's it going today? 60 | AI: "It's going great! How about you?" 61 | Person #1: good! busy working on Langchain. lots to do. 62 | AI: "That sounds like a lot of work! What kind of things are you doing to make Langchain better?" 63 | Last line: 64 | Person #1: i'm trying to improve Langchain's interfaces, the UX, its integrations with various products the user might want ... a lot of stuff. 65 | Output: Langchain 66 | END OF EXAMPLE 67 | 68 | EXAMPLE 69 | Conversation history: 70 | Person #1: how's it going today? 71 | AI: "It's going great! How about you?" 72 | Person #1: good! busy working on Langchain. lots to do. 73 | AI: "That sounds like a lot of work! What kind of things are you doing to make Langchain better?" 74 | Last line: 75 | Person #1: i'm trying to improve Langchain's interfaces, the UX, its integrations with various products the user might want ... a lot of stuff. I'm working with Person #2. 
76 | Output: Langchain, Person #2 77 | END OF EXAMPLE 78 | 79 | Conversation history (for reference only): 80 | {history} 81 | Last line of conversation (for extraction): 82 | Human: {input} 83 | 84 | Output:""" 85 | ENTITY_EXTRACTION_PROMPT = PromptTemplate( 86 | input_variables=["history", "input"], template=_DEFAULT_ENTITY_EXTRACTION_TEMPLATE 87 | ) 88 | 89 | _DEFAULT_ENTITY_SUMMARIZATION_TEMPLATE = """You are an AI assistant helping a human keep track of facts about relevant people, places, and concepts in their life. Update the summary of the provided entity in the "Entity" section based on the last line of your conversation with the human. If you are writing the summary for the first time, return a single sentence. 90 | The update should only include facts that are relayed in the last line of conversation about the provided entity, and should only contain facts about the provided entity. 91 | 92 | If there is no new information about the provided entity or the information is not worth noting (not an important or relevant fact to remember long-term), return the existing summary unchanged. 93 | 94 | Full conversation history (for context): 95 | {history} 96 | 97 | Entity to summarize: 98 | {entity} 99 | 100 | Existing summary of {entity}: 101 | {summary} 102 | 103 | Last line of conversation: 104 | Human: {input} 105 | Updated summary:""" 106 | 107 | ENTITY_SUMMARIZATION_PROMPT = PromptTemplate( 108 | input_variables=["entity", "summary", "history", "input"], 109 | template=_DEFAULT_ENTITY_SUMMARIZATION_TEMPLATE, 110 | ) 111 | 112 | 113 | KG_TRIPLE_DELIMITER = "<|>" 114 | _DEFAULT_KNOWLEDGE_TRIPLE_EXTRACTION_TEMPLATE = ( 115 | "You are a networked intelligence helping a human track knowledge triples" 116 | " about all relevant people, things, concepts, etc. and integrating" 117 | " them with your knowledge stored within your weights" 118 | " as well as that stored in a knowledge graph." 119 | " Extract all of the knowledge triples from the last line of conversation." 120 | " A knowledge triple is a clause that contains a subject, a predicate," 121 | " and an object. The subject is the entity being described," 122 | " the predicate is the property of the subject that is being" 123 | " described, and the object is the value of the property.\n\n" 124 | "EXAMPLE\n" 125 | "Conversation history:\n" 126 | "Person #1: Did you hear aliens landed in Area 51?\n" 127 | "AI: No, I didn't hear that. What do you know about Area 51?\n" 128 | "Person #1: It's a secret military base in Nevada.\n" 129 | "AI: What do you know about Nevada?\n" 130 | "Last line of conversation:\n" 131 | "Person #1: It's a state in the US. It's also the number 1 producer of gold in the US.\n\n" 132 | f"Output: (Nevada, is a, state){KG_TRIPLE_DELIMITER}(Nevada, is in, US)" 133 | f"{KG_TRIPLE_DELIMITER}(Nevada, is the number 1 producer of, gold)\n" 134 | "END OF EXAMPLE\n\n" 135 | "EXAMPLE\n" 136 | "Conversation history:\n" 137 | "Person #1: Hello.\n" 138 | "AI: Hi! How are you?\n" 139 | "Person #1: I'm good. 
How are you?\n" 140 | "AI: I'm good too.\n" 141 | "Last line of conversation:\n" 142 | "Person #1: I'm going to the store.\n\n" 143 | "Output: NONE\n" 144 | "END OF EXAMPLE\n\n" 145 | "EXAMPLE\n" 146 | "Conversation history:\n" 147 | "Person #1: What do you know about Descartes?\n" 148 | "AI: Descartes was a French philosopher, mathematician, and scientist who lived in the 17th century.\n" 149 | "Person #1: The Descartes I'm referring to is a standup comedian and interior designer from Montreal.\n" 150 | "AI: Oh yes, He is a comedian and an interior designer. He has been in the industry for 30 years. His favorite food is baked bean pie.\n" 151 | "Last line of conversation:\n" 152 | "Person #1: Oh huh. I know Descartes likes to drive antique scooters and play the mandolin.\n" 153 | f"Output: (Descartes, likes to drive, antique scooters){KG_TRIPLE_DELIMITER}(Descartes, plays, mandolin)\n" 154 | "END OF EXAMPLE\n\n" 155 | "Conversation history (for reference only):\n" 156 | "{history}" 157 | "\nLast line of conversation (for extraction):\n" 158 | "Human: {input}\n\n" 159 | "Output:" 160 | ) 161 | 162 | KNOWLEDGE_TRIPLE_EXTRACTION_PROMPT = PromptTemplate( 163 | input_variables=["history", "input"], 164 | template=_DEFAULT_KNOWLEDGE_TRIPLE_EXTRACTION_TEMPLATE, 165 | ) 166 | 167 | FEW_SHOT_SHORT_MEMORY = [""] -------------------------------------------------------------------------------- /HoshiNoYume/actions/MoeGoe/text/mandarin.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import re 4 | from pypinyin import lazy_pinyin, BOPOMOFO 5 | import jieba 6 | import cn2an 7 | import logging 8 | 9 | logging.getLogger('jieba').setLevel(logging.WARNING) 10 | script_dir = os.path.dirname(os.path.abspath(__file__)) 11 | jieba_dic = os.path.join(script_dir, 'MoeGoe', 'jieba', 'dict.txt') 12 | # jieba.set_dictionary(os.path.dirname(sys.argv[0])+'/MoeGoe/jieba/dict.txt') 13 | jieba.initialize() 14 | 15 | 16 | # List of (Latin alphabet, bopomofo) pairs: 17 | _latin_to_bopomofo = [(re.compile('%s' % x[0], re.IGNORECASE), x[1]) for x in [ 18 | ('a', 'ㄟˉ'), 19 | ('b', 'ㄅㄧˋ'), 20 | ('c', 'ㄙㄧˉ'), 21 | ('d', 'ㄉㄧˋ'), 22 | ('e', 'ㄧˋ'), 23 | ('f', 'ㄝˊㄈㄨˋ'), 24 | ('g', 'ㄐㄧˋ'), 25 | ('h', 'ㄝˇㄑㄩˋ'), 26 | ('i', 'ㄞˋ'), 27 | ('j', 'ㄐㄟˋ'), 28 | ('k', 'ㄎㄟˋ'), 29 | ('l', 'ㄝˊㄛˋ'), 30 | ('m', 'ㄝˊㄇㄨˋ'), 31 | ('n', 'ㄣˉ'), 32 | ('o', 'ㄡˉ'), 33 | ('p', 'ㄆㄧˉ'), 34 | ('q', 'ㄎㄧㄡˉ'), 35 | ('r', 'ㄚˋ'), 36 | ('s', 'ㄝˊㄙˋ'), 37 | ('t', 'ㄊㄧˋ'), 38 | ('u', 'ㄧㄡˉ'), 39 | ('v', 'ㄨㄧˉ'), 40 | ('w', 'ㄉㄚˋㄅㄨˋㄌㄧㄡˋ'), 41 | ('x', 'ㄝˉㄎㄨˋㄙˋ'), 42 | ('y', 'ㄨㄞˋ'), 43 | ('z', 'ㄗㄟˋ') 44 | ]] 45 | 46 | # List of (bopomofo, romaji) pairs: 47 | _bopomofo_to_romaji = [(re.compile('%s' % x[0]), x[1]) for x in [ 48 | ('ㄅㄛ', 'p⁼wo'), 49 | ('ㄆㄛ', 'pʰwo'), 50 | ('ㄇㄛ', 'mwo'), 51 | ('ㄈㄛ', 'fwo'), 52 | ('ㄅ', 'p⁼'), 53 | ('ㄆ', 'pʰ'), 54 | ('ㄇ', 'm'), 55 | ('ㄈ', 'f'), 56 | ('ㄉ', 't⁼'), 57 | ('ㄊ', 'tʰ'), 58 | ('ㄋ', 'n'), 59 | ('ㄌ', 'l'), 60 | ('ㄍ', 'k⁼'), 61 | ('ㄎ', 'kʰ'), 62 | ('ㄏ', 'h'), 63 | ('ㄐ', 'ʧ⁼'), 64 | ('ㄑ', 'ʧʰ'), 65 | ('ㄒ', 'ʃ'), 66 | ('ㄓ', 'ʦ`⁼'), 67 | ('ㄔ', 'ʦ`ʰ'), 68 | ('ㄕ', 's`'), 69 | ('ㄖ', 'ɹ`'), 70 | ('ㄗ', 'ʦ⁼'), 71 | ('ㄘ', 'ʦʰ'), 72 | ('ㄙ', 's'), 73 | ('ㄚ', 'a'), 74 | ('ㄛ', 'o'), 75 | ('ㄜ', 'ə'), 76 | ('ㄝ', 'e'), 77 | ('ㄞ', 'ai'), 78 | ('ㄟ', 'ei'), 79 | ('ㄠ', 'au'), 80 | ('ㄡ', 'ou'), 81 | ('ㄧㄢ', 'yeNN'), 82 | ('ㄢ', 'aNN'), 83 | ('ㄧㄣ', 'iNN'), 84 | ('ㄣ', 'əNN'), 85 | ('ㄤ', 'aNg'), 86 | ('ㄧㄥ', 'iNg'), 87 | ('ㄨㄥ', 'uNg'), 88 | ('ㄩㄥ', 'yuNg'), 89 | ('ㄥ', 'əNg'), 90 | ('ㄦ', 'əɻ'), 91 | 
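    # remaining pairs: the medials ㄧ/ㄨ/ㄩ, the tone marks (rendered as pitch arrows, with the neutral tone dropped), and full-width punctuation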
('ㄧ', 'i'), 92 | ('ㄨ', 'u'), 93 | ('ㄩ', 'ɥ'), 94 | ('ˉ', '→'), 95 | ('ˊ', '↑'), 96 | ('ˇ', '↓↑'), 97 | ('ˋ', '↓'), 98 | ('˙', ''), 99 | (',', ','), 100 | ('。', '.'), 101 | ('!', '!'), 102 | ('?', '?'), 103 | ('—', '-') 104 | ]] 105 | 106 | # List of (romaji, ipa) pairs: 107 | _romaji_to_ipa = [(re.compile('%s' % x[0], re.IGNORECASE), x[1]) for x in [ 108 | ('ʃy', 'ʃ'), 109 | ('ʧʰy', 'ʧʰ'), 110 | ('ʧ⁼y', 'ʧ⁼'), 111 | ('NN', 'n'), 112 | ('Ng', 'ŋ'), 113 | ('y', 'j'), 114 | ('h', 'x') 115 | ]] 116 | 117 | # List of (bopomofo, ipa) pairs: 118 | _bopomofo_to_ipa = [(re.compile('%s' % x[0]), x[1]) for x in [ 119 | ('ㄅㄛ', 'p⁼wo'), 120 | ('ㄆㄛ', 'pʰwo'), 121 | ('ㄇㄛ', 'mwo'), 122 | ('ㄈㄛ', 'fwo'), 123 | ('ㄅ', 'p⁼'), 124 | ('ㄆ', 'pʰ'), 125 | ('ㄇ', 'm'), 126 | ('ㄈ', 'f'), 127 | ('ㄉ', 't⁼'), 128 | ('ㄊ', 'tʰ'), 129 | ('ㄋ', 'n'), 130 | ('ㄌ', 'l'), 131 | ('ㄍ', 'k⁼'), 132 | ('ㄎ', 'kʰ'), 133 | ('ㄏ', 'x'), 134 | ('ㄐ', 'tʃ⁼'), 135 | ('ㄑ', 'tʃʰ'), 136 | ('ㄒ', 'ʃ'), 137 | ('ㄓ', 'ts`⁼'), 138 | ('ㄔ', 'ts`ʰ'), 139 | ('ㄕ', 's`'), 140 | ('ㄖ', 'ɹ`'), 141 | ('ㄗ', 'ts⁼'), 142 | ('ㄘ', 'tsʰ'), 143 | ('ㄙ', 's'), 144 | ('ㄚ', 'a'), 145 | ('ㄛ', 'o'), 146 | ('ㄜ', 'ə'), 147 | ('ㄝ', 'ɛ'), 148 | ('ㄞ', 'aɪ'), 149 | ('ㄟ', 'eɪ'), 150 | ('ㄠ', 'ɑʊ'), 151 | ('ㄡ', 'oʊ'), 152 | ('ㄧㄢ', 'jɛn'), 153 | ('ㄩㄢ', 'ɥæn'), 154 | ('ㄢ', 'an'), 155 | ('ㄧㄣ', 'in'), 156 | ('ㄩㄣ', 'ɥn'), 157 | ('ㄣ', 'ən'), 158 | ('ㄤ', 'ɑŋ'), 159 | ('ㄧㄥ', 'iŋ'), 160 | ('ㄨㄥ', 'ʊŋ'), 161 | ('ㄩㄥ', 'jʊŋ'), 162 | ('ㄥ', 'əŋ'), 163 | ('ㄦ', 'əɻ'), 164 | ('ㄧ', 'i'), 165 | ('ㄨ', 'u'), 166 | ('ㄩ', 'ɥ'), 167 | ('ˉ', '→'), 168 | ('ˊ', '↑'), 169 | ('ˇ', '↓↑'), 170 | ('ˋ', '↓'), 171 | ('˙', ''), 172 | (',', ','), 173 | ('。', '.'), 174 | ('!', '!'), 175 | ('?', '?'), 176 | ('—', '-') 177 | ]] 178 | 179 | # List of (bopomofo, ipa2) pairs: 180 | _bopomofo_to_ipa2 = [(re.compile('%s' % x[0]), x[1]) for x in [ 181 | ('ㄅㄛ', 'pwo'), 182 | ('ㄆㄛ', 'pʰwo'), 183 | ('ㄇㄛ', 'mwo'), 184 | ('ㄈㄛ', 'fwo'), 185 | ('ㄅ', 'p'), 186 | ('ㄆ', 'pʰ'), 187 | ('ㄇ', 'm'), 188 | ('ㄈ', 'f'), 189 | ('ㄉ', 't'), 190 | ('ㄊ', 'tʰ'), 191 | ('ㄋ', 'n'), 192 | ('ㄌ', 'l'), 193 | ('ㄍ', 'k'), 194 | ('ㄎ', 'kʰ'), 195 | ('ㄏ', 'h'), 196 | ('ㄐ', 'tɕ'), 197 | ('ㄑ', 'tɕʰ'), 198 | ('ㄒ', 'ɕ'), 199 | ('ㄓ', 'tʂ'), 200 | ('ㄔ', 'tʂʰ'), 201 | ('ㄕ', 'ʂ'), 202 | ('ㄖ', 'ɻ'), 203 | ('ㄗ', 'ts'), 204 | ('ㄘ', 'tsʰ'), 205 | ('ㄙ', 's'), 206 | ('ㄚ', 'a'), 207 | ('ㄛ', 'o'), 208 | ('ㄜ', 'ɤ'), 209 | ('ㄝ', 'ɛ'), 210 | ('ㄞ', 'aɪ'), 211 | ('ㄟ', 'eɪ'), 212 | ('ㄠ', 'ɑʊ'), 213 | ('ㄡ', 'oʊ'), 214 | ('ㄧㄢ', 'jɛn'), 215 | ('ㄩㄢ', 'yæn'), 216 | ('ㄢ', 'an'), 217 | ('ㄧㄣ', 'in'), 218 | ('ㄩㄣ', 'yn'), 219 | ('ㄣ', 'ən'), 220 | ('ㄤ', 'ɑŋ'), 221 | ('ㄧㄥ', 'iŋ'), 222 | ('ㄨㄥ', 'ʊŋ'), 223 | ('ㄩㄥ', 'jʊŋ'), 224 | ('ㄥ', 'ɤŋ'), 225 | ('ㄦ', 'əɻ'), 226 | ('ㄧ', 'i'), 227 | ('ㄨ', 'u'), 228 | ('ㄩ', 'y'), 229 | ('ˉ', '˥'), 230 | ('ˊ', '˧˥'), 231 | ('ˇ', '˨˩˦'), 232 | ('ˋ', '˥˩'), 233 | ('˙', ''), 234 | (',', ','), 235 | ('。', '.'), 236 | ('!', '!'), 237 | ('?', '?'), 238 | ('—', '-') 239 | ]] 240 | 241 | 242 | def number_to_chinese(text): 243 | numbers = re.findall(r'\d+(?:\.?\d+)?', text) 244 | for number in numbers: 245 | text = text.replace(number, cn2an.an2cn(number), 1) 246 | return text 247 | 248 | 249 | def chinese_to_bopomofo(text): 250 | text = text.replace('、', ',').replace(';', ',').replace(':', ',') 251 | words = jieba.lcut(text, cut_all=False) 252 | text = '' 253 | for word in words: 254 | bopomofos = lazy_pinyin(word, BOPOMOFO) 255 | if not re.search('[\u4e00-\u9fff]', word): 256 | text += word 257 | continue 258 | for i in range(len(bopomofos)): 259 | bopomofos[i] = 
re.sub(r'([\u3105-\u3129])$', r'\1ˉ', bopomofos[i]) 260 | if text != '': 261 | text += ' ' 262 | text += ''.join(bopomofos) 263 | return text 264 | 265 | 266 | def latin_to_bopomofo(text): 267 | for regex, replacement in _latin_to_bopomofo: 268 | text = re.sub(regex, replacement, text) 269 | return text 270 | 271 | 272 | def bopomofo_to_romaji(text): 273 | for regex, replacement in _bopomofo_to_romaji: 274 | text = re.sub(regex, replacement, text) 275 | return text 276 | 277 | 278 | def bopomofo_to_ipa(text): 279 | for regex, replacement in _bopomofo_to_ipa: 280 | text = re.sub(regex, replacement, text) 281 | return text 282 | 283 | 284 | def bopomofo_to_ipa2(text): 285 | for regex, replacement in _bopomofo_to_ipa2: 286 | text = re.sub(regex, replacement, text) 287 | return text 288 | 289 | 290 | def chinese_to_romaji(text): 291 | text = number_to_chinese(text) 292 | text = chinese_to_bopomofo(text) 293 | text = latin_to_bopomofo(text) 294 | text = bopomofo_to_romaji(text) 295 | text = re.sub('i([aoe])', r'y\1', text) 296 | text = re.sub('u([aoəe])', r'w\1', text) 297 | text = re.sub('([ʦsɹ]`[⁼ʰ]?)([→↓↑ ]+|$)', 298 | r'\1ɹ`\2', text).replace('ɻ', 'ɹ`') 299 | text = re.sub('([ʦs][⁼ʰ]?)([→↓↑ ]+|$)', r'\1ɹ\2', text) 300 | return text 301 | 302 | 303 | def chinese_to_lazy_ipa(text): 304 | text = chinese_to_romaji(text) 305 | for regex, replacement in _romaji_to_ipa: 306 | text = re.sub(regex, replacement, text) 307 | return text 308 | 309 | 310 | def chinese_to_ipa(text): 311 | text = number_to_chinese(text) 312 | text = chinese_to_bopomofo(text) 313 | text = latin_to_bopomofo(text) 314 | text = bopomofo_to_ipa(text) 315 | text = re.sub('i([aoe])', r'j\1', text) 316 | text = re.sub('u([aoəe])', r'w\1', text) 317 | text = re.sub('([sɹ]`[⁼ʰ]?)([→↓↑ ]+|$)', 318 | r'\1ɹ`\2', text).replace('ɻ', 'ɹ`') 319 | text = re.sub('([s][⁼ʰ]?)([→↓↑ ]+|$)', r'\1ɹ\2', text) 320 | return text 321 | 322 | 323 | def chinese_to_ipa2(text): 324 | text = number_to_chinese(text) 325 | text = chinese_to_bopomofo(text) 326 | text = latin_to_bopomofo(text) 327 | text = bopomofo_to_ipa2(text) 328 | text = re.sub(r'i([aoe])', r'j\1', text) 329 | text = re.sub(r'u([aoəe])', r'w\1', text) 330 | text = re.sub(r'([ʂɹ]ʰ?)([˩˨˧˦˥ ]+|$)', r'\1ʅ\2', text) 331 | text = re.sub(r'(sʰ?)([˩˨˧˦˥ ]+|$)', r'\1ɿ\2', text) 332 | return text 333 | -------------------------------------------------------------------------------- /HoshiNoYume/actions/MoeGoe/transforms.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.nn import functional as F 3 | 4 | import numpy as np 5 | 6 | 7 | DEFAULT_MIN_BIN_WIDTH = 1e-3 8 | DEFAULT_MIN_BIN_HEIGHT = 1e-3 9 | DEFAULT_MIN_DERIVATIVE = 1e-3 10 | 11 | 12 | def piecewise_rational_quadratic_transform(inputs, 13 | unnormalized_widths, 14 | unnormalized_heights, 15 | unnormalized_derivatives, 16 | inverse=False, 17 | tails=None, 18 | tail_bound=1., 19 | min_bin_width=DEFAULT_MIN_BIN_WIDTH, 20 | min_bin_height=DEFAULT_MIN_BIN_HEIGHT, 21 | min_derivative=DEFAULT_MIN_DERIVATIVE): 22 | 23 | if tails is None: 24 | spline_fn = rational_quadratic_spline 25 | spline_kwargs = {} 26 | else: 27 | spline_fn = unconstrained_rational_quadratic_spline 28 | spline_kwargs = { 29 | 'tails': tails, 30 | 'tail_bound': tail_bound 31 | } 32 | 33 | outputs, logabsdet = spline_fn( 34 | inputs=inputs, 35 | unnormalized_widths=unnormalized_widths, 36 | unnormalized_heights=unnormalized_heights, 37 | unnormalized_derivatives=unnormalized_derivatives, 38 | inverse=inverse, 39 
| min_bin_width=min_bin_width, 40 | min_bin_height=min_bin_height, 41 | min_derivative=min_derivative, 42 | **spline_kwargs 43 | ) 44 | return outputs, logabsdet 45 | 46 | 47 | def searchsorted(bin_locations, inputs, eps=1e-6): 48 | bin_locations[..., -1] += eps 49 | return torch.sum( 50 | inputs[..., None] >= bin_locations, 51 | dim=-1 52 | ) - 1 53 | 54 | 55 | def unconstrained_rational_quadratic_spline(inputs, 56 | unnormalized_widths, 57 | unnormalized_heights, 58 | unnormalized_derivatives, 59 | inverse=False, 60 | tails='linear', 61 | tail_bound=1., 62 | min_bin_width=DEFAULT_MIN_BIN_WIDTH, 63 | min_bin_height=DEFAULT_MIN_BIN_HEIGHT, 64 | min_derivative=DEFAULT_MIN_DERIVATIVE): 65 | inside_interval_mask = (inputs >= -tail_bound) & (inputs <= tail_bound) 66 | outside_interval_mask = ~inside_interval_mask 67 | 68 | outputs = torch.zeros_like(inputs) 69 | logabsdet = torch.zeros_like(inputs) 70 | 71 | if tails == 'linear': 72 | unnormalized_derivatives = F.pad(unnormalized_derivatives, pad=(1, 1)) 73 | constant = np.log(np.exp(1 - min_derivative) - 1) 74 | unnormalized_derivatives[..., 0] = constant 75 | unnormalized_derivatives[..., -1] = constant 76 | 77 | outputs[outside_interval_mask] = inputs[outside_interval_mask] 78 | logabsdet[outside_interval_mask] = 0 79 | else: 80 | raise RuntimeError('{} tails are not implemented.'.format(tails)) 81 | 82 | outputs[inside_interval_mask], logabsdet[inside_interval_mask] = rational_quadratic_spline( 83 | inputs=inputs[inside_interval_mask], 84 | unnormalized_widths=unnormalized_widths[inside_interval_mask, :], 85 | unnormalized_heights=unnormalized_heights[inside_interval_mask, :], 86 | unnormalized_derivatives=unnormalized_derivatives[inside_interval_mask, :], 87 | inverse=inverse, 88 | left=-tail_bound, right=tail_bound, bottom=-tail_bound, top=tail_bound, 89 | min_bin_width=min_bin_width, 90 | min_bin_height=min_bin_height, 91 | min_derivative=min_derivative 92 | ) 93 | 94 | return outputs, logabsdet 95 | 96 | def rational_quadratic_spline(inputs, 97 | unnormalized_widths, 98 | unnormalized_heights, 99 | unnormalized_derivatives, 100 | inverse=False, 101 | left=0., right=1., bottom=0., top=1., 102 | min_bin_width=DEFAULT_MIN_BIN_WIDTH, 103 | min_bin_height=DEFAULT_MIN_BIN_HEIGHT, 104 | min_derivative=DEFAULT_MIN_DERIVATIVE): 105 | if torch.min(inputs) < left or torch.max(inputs) > right: 106 | raise ValueError('Input to a transform is not within its domain') 107 | 108 | num_bins = unnormalized_widths.shape[-1] 109 | 110 | if min_bin_width * num_bins > 1.0: 111 | raise ValueError('Minimal bin width too large for the number of bins') 112 | if min_bin_height * num_bins > 1.0: 113 | raise ValueError('Minimal bin height too large for the number of bins') 114 | 115 | widths = F.softmax(unnormalized_widths, dim=-1) 116 | widths = min_bin_width + (1 - min_bin_width * num_bins) * widths 117 | cumwidths = torch.cumsum(widths, dim=-1) 118 | cumwidths = F.pad(cumwidths, pad=(1, 0), mode='constant', value=0.0) 119 | cumwidths = (right - left) * cumwidths + left 120 | cumwidths[..., 0] = left 121 | cumwidths[..., -1] = right 122 | widths = cumwidths[..., 1:] - cumwidths[..., :-1] 123 | 124 | derivatives = min_derivative + F.softplus(unnormalized_derivatives) 125 | 126 | heights = F.softmax(unnormalized_heights, dim=-1) 127 | heights = min_bin_height + (1 - min_bin_height * num_bins) * heights 128 | cumheights = torch.cumsum(heights, dim=-1) 129 | cumheights = F.pad(cumheights, pad=(1, 0), mode='constant', value=0.0) 130 | cumheights = (top - bottom) * 
cumheights + bottom 131 | cumheights[..., 0] = bottom 132 | cumheights[..., -1] = top 133 | heights = cumheights[..., 1:] - cumheights[..., :-1] 134 | 135 | if inverse: 136 | bin_idx = searchsorted(cumheights, inputs)[..., None] 137 | else: 138 | bin_idx = searchsorted(cumwidths, inputs)[..., None] 139 | 140 | input_cumwidths = cumwidths.gather(-1, bin_idx)[..., 0] 141 | input_bin_widths = widths.gather(-1, bin_idx)[..., 0] 142 | 143 | input_cumheights = cumheights.gather(-1, bin_idx)[..., 0] 144 | delta = heights / widths 145 | input_delta = delta.gather(-1, bin_idx)[..., 0] 146 | 147 | input_derivatives = derivatives.gather(-1, bin_idx)[..., 0] 148 | input_derivatives_plus_one = derivatives[..., 1:].gather(-1, bin_idx)[..., 0] 149 | 150 | input_heights = heights.gather(-1, bin_idx)[..., 0] 151 | 152 | if inverse: 153 | a = (((inputs - input_cumheights) * (input_derivatives 154 | + input_derivatives_plus_one 155 | - 2 * input_delta) 156 | + input_heights * (input_delta - input_derivatives))) 157 | b = (input_heights * input_derivatives 158 | - (inputs - input_cumheights) * (input_derivatives 159 | + input_derivatives_plus_one 160 | - 2 * input_delta)) 161 | c = - input_delta * (inputs - input_cumheights) 162 | 163 | discriminant = b.pow(2) - 4 * a * c 164 | assert (discriminant >= 0).all() 165 | 166 | root = (2 * c) / (-b - torch.sqrt(discriminant)) 167 | outputs = root * input_bin_widths + input_cumwidths 168 | 169 | theta_one_minus_theta = root * (1 - root) 170 | denominator = input_delta + ((input_derivatives + input_derivatives_plus_one - 2 * input_delta) 171 | * theta_one_minus_theta) 172 | derivative_numerator = input_delta.pow(2) * (input_derivatives_plus_one * root.pow(2) 173 | + 2 * input_delta * theta_one_minus_theta 174 | + input_derivatives * (1 - root).pow(2)) 175 | logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator) 176 | 177 | return outputs, -logabsdet 178 | else: 179 | theta = (inputs - input_cumwidths) / input_bin_widths 180 | theta_one_minus_theta = theta * (1 - theta) 181 | 182 | numerator = input_heights * (input_delta * theta.pow(2) 183 | + input_derivatives * theta_one_minus_theta) 184 | denominator = input_delta + ((input_derivatives + input_derivatives_plus_one - 2 * input_delta) 185 | * theta_one_minus_theta) 186 | outputs = input_cumheights + numerator / denominator 187 | 188 | derivative_numerator = input_delta.pow(2) * (input_derivatives_plus_one * theta.pow(2) 189 | + 2 * input_delta * theta_one_minus_theta 190 | + input_derivatives * (1 - theta).pow(2)) 191 | logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator) 192 | 193 | return outputs, logabsdet 194 | -------------------------------------------------------------------------------- /HoshiNoYume/actions/MoeGoe/MoeGoe.py: -------------------------------------------------------------------------------- 1 | from scipy.io.wavfile import write 2 | from mel_processing import spectrogram_torch 3 | from text import text_to_sequence, _clean_text 4 | from models import SynthesizerTrn 5 | import utils 6 | import commons 7 | import sys 8 | import re 9 | from torch import no_grad, LongTensor 10 | import logging 11 | 12 | logging.getLogger('numba').setLevel(logging.WARNING) 13 | 14 | 15 | def ex_print(text, escape=False): 16 | if escape: 17 | print(text.encode('unicode_escape').decode()) 18 | else: 19 | print(text) 20 | 21 | 22 | def get_text(text, hps, cleaned=False): 23 | if cleaned: 24 | text_norm = text_to_sequence(text, hps.symbols, []) 25 | else: 26 | text_norm = 
text_to_sequence(text, hps.symbols, hps.data.text_cleaners) 27 | if hps.data.add_blank: 28 | text_norm = commons.intersperse(text_norm, 0) 29 | text_norm = LongTensor(text_norm) 30 | return text_norm 31 | 32 | 33 | def ask_if_continue(): 34 | while True: 35 | answer = input('Continue? (y/n): ') 36 | if answer == 'y': 37 | break 38 | elif answer == 'n': 39 | sys.exit(0) 40 | 41 | 42 | def print_speakers(speakers, escape=False): 43 | if len(speakers) > 100: 44 | return 45 | print('ID\tSpeaker') 46 | for id, name in enumerate(speakers): 47 | ex_print(str(id) + '\t' + name, escape) 48 | 49 | 50 | def get_speaker_id(message): 51 | speaker_id = input(message) 52 | try: 53 | speaker_id = int(speaker_id) 54 | except: 55 | print(str(speaker_id) + ' is not a valid ID!') 56 | sys.exit(1) 57 | return speaker_id 58 | 59 | 60 | def get_label_value(text, label, default, warning_name='value'): 61 | value = re.search(rf'\[{label}=(.+?)\]', text) 62 | if value: 63 | try: 64 | text = re.sub(rf'\[{label}=(.+?)\]', '', text, 1) 65 | value = float(value.group(1)) 66 | except: 67 | print(f'Invalid {warning_name}!') 68 | sys.exit(1) 69 | else: 70 | value = default 71 | return value, text 72 | 73 | 74 | def get_label(text, label): 75 | if f'[{label}]' in text: 76 | return True, text.replace(f'[{label}]', '') 77 | else: 78 | return False, text 79 | 80 | 81 | if __name__ == '__main__': 82 | if '--escape' in sys.argv: 83 | escape = True 84 | else: 85 | escape = False 86 | 87 | model = input('Path of a VITS model: ') 88 | config = input('Path of a config file: ') 89 | 90 | hps_ms = utils.get_hparams_from_file(config) 91 | n_speakers = hps_ms.data.n_speakers if 'n_speakers' in hps_ms.data.keys() else 0 92 | n_symbols = len(hps_ms.symbols) if 'symbols' in hps_ms.keys() else 0 93 | speakers = hps_ms.speakers if 'speakers' in hps_ms.keys() else ['0'] 94 | use_f0 = hps_ms.data.use_f0 if 'use_f0' in hps_ms.data.keys() else False 95 | emotion_embedding = hps_ms.data.emotion_embedding if 'emotion_embedding' in hps_ms.data.keys() else False 96 | 97 | net_g_ms = SynthesizerTrn( 98 | n_symbols, 99 | hps_ms.data.filter_length // 2 + 1, 100 | hps_ms.train.segment_size // hps_ms.data.hop_length, 101 | n_speakers=n_speakers, 102 | emotion_embedding=emotion_embedding, 103 | **hps_ms.model) 104 | _ = net_g_ms.eval() 105 | utils.load_checkpoint(model, net_g_ms) 106 | 107 | def voice_conversion(): 108 | audio_path = input('Path of an audio file to convert:\n') 109 | print_speakers(speakers) 110 | audio = utils.load_audio_to_torch( 111 | audio_path, hps_ms.data.sampling_rate) 112 | 113 | originnal_id = get_speaker_id('Original speaker ID: ') 114 | target_id = get_speaker_id('Target speaker ID: ') 115 | out_path = input('Path to save: ') 116 | 117 | y = audio.unsqueeze(0) 118 | 119 | spec = spectrogram_torch(y, hps_ms.data.filter_length, 120 | hps_ms.data.sampling_rate, hps_ms.data.hop_length, hps_ms.data.win_length, 121 | center=False) 122 | spec_lengths = LongTensor([spec.size(-1)]) 123 | sid_src = LongTensor([originnal_id]) 124 | 125 | with no_grad(): 126 | sid_tgt = LongTensor([target_id]) 127 | audio = net_g_ms.voice_conversion(spec, spec_lengths, sid_src=sid_src, sid_tgt=sid_tgt)[ 128 | 0][0, 0].data.cpu().float().numpy() 129 | return audio, out_path 130 | 131 | if n_symbols != 0: 132 | if not emotion_embedding: 133 | while True: 134 | choice = input('TTS or VC? 
(t/v):') 135 | if choice == 't': 136 | text = input('Text to read: ') 137 | if text == '[ADVANCED]': 138 | text = input('Raw text:') 139 | print('Cleaned text is:') 140 | ex_print(_clean_text( 141 | text, hps_ms.data.text_cleaners), escape) 142 | continue 143 | 144 | length_scale, text = get_label_value( 145 | text, 'LENGTH', 1, 'length scale') 146 | noise_scale, text = get_label_value( 147 | text, 'NOISE', 0.667, 'noise scale') 148 | noise_scale_w, text = get_label_value( 149 | text, 'NOISEW', 0.8, 'deviation of noise') 150 | cleaned, text = get_label(text, 'CLEANED') 151 | 152 | stn_tst = get_text(text, hps_ms, cleaned=cleaned) 153 | 154 | print_speakers(speakers, escape) 155 | speaker_id = get_speaker_id('Speaker ID: ') 156 | out_path = input('Path to save: ') 157 | 158 | with no_grad(): 159 | x_tst = stn_tst.unsqueeze(0) 160 | x_tst_lengths = LongTensor([stn_tst.size(0)]) 161 | sid = LongTensor([speaker_id]) 162 | audio = net_g_ms.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=noise_scale, 163 | noise_scale_w=noise_scale_w, length_scale=length_scale)[0][0, 0].data.cpu().float().numpy() 164 | 165 | elif choice == 'v': 166 | audio, out_path = voice_conversion() 167 | 168 | write(out_path, hps_ms.data.sampling_rate, audio) 169 | print('Successfully saved!') 170 | ask_if_continue() 171 | else: 172 | import os 173 | import librosa 174 | import numpy as np 175 | from torch import FloatTensor 176 | import audonnx 177 | w2v2_folder = input('Path of a w2v2 dimensional emotion model: ') 178 | w2v2_model = audonnx.load(os.path.dirname(w2v2_folder)) 179 | while True: 180 | choice = input('TTS or VC? (t/v):') 181 | if choice == 't': 182 | text = input('Text to read: ') 183 | if text == '[ADVANCED]': 184 | text = input('Raw text:') 185 | print('Cleaned text is:') 186 | ex_print(_clean_text( 187 | text, hps_ms.data.text_cleaners), escape) 188 | continue 189 | 190 | length_scale, text = get_label_value( 191 | text, 'LENGTH', 1, 'length scale') 192 | noise_scale, text = get_label_value( 193 | text, 'NOISE', 0.667, 'noise scale') 194 | noise_scale_w, text = get_label_value( 195 | text, 'NOISEW', 0.8, 'deviation of noise') 196 | cleaned, text = get_label(text, 'CLEANED') 197 | 198 | stn_tst = get_text(text, hps_ms, cleaned=cleaned) 199 | 200 | print_speakers(speakers, escape) 201 | speaker_id = get_speaker_id('Speaker ID: ') 202 | 203 | emotion_reference = input('Path of an emotion reference: ') 204 | if emotion_reference.endswith('.npy'): 205 | emotion = np.load(emotion_reference) 206 | emotion = FloatTensor(emotion).unsqueeze(0) 207 | else: 208 | audio16000, sampling_rate = librosa.load( 209 | emotion_reference, sr=16000, mono=True) 210 | emotion = w2v2_model(audio16000, sampling_rate)[ 211 | 'hidden_states'] 212 | emotion_reference = re.sub( 213 | r'\..*$', '', emotion_reference) 214 | np.save(emotion_reference, emotion.squeeze(0)) 215 | emotion = FloatTensor(emotion) 216 | 217 | out_path = input('Path to save: ') 218 | 219 | with no_grad(): 220 | x_tst = stn_tst.unsqueeze(0) 221 | x_tst_lengths = LongTensor([stn_tst.size(0)]) 222 | sid = LongTensor([speaker_id]) 223 | audio = net_g_ms.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=noise_scale, noise_scale_w=noise_scale_w, 224 | length_scale=length_scale, emotion_embedding=emotion)[0][0, 0].data.cpu().float().numpy() 225 | 226 | elif choice == 'v': 227 | audio, out_path = voice_conversion() 228 | 229 | write(out_path, hps_ms.data.sampling_rate, audio) 230 | print('Successfully saved!') 231 | ask_if_continue() 232 | else: 233 | model = input('Path 
of a hubert-soft model: ') 234 | from hubert_model import hubert_soft 235 | hubert = hubert_soft(model) 236 | 237 | while True: 238 | audio_path = input('Path of an audio file to convert:\n') 239 | 240 | if audio_path != '[VC]': 241 | import librosa 242 | if use_f0: 243 | audio, sampling_rate = librosa.load( 244 | audio_path, sr=hps_ms.data.sampling_rate, mono=True) 245 | audio16000 = librosa.resample( 246 | audio, orig_sr=sampling_rate, target_sr=16000) 247 | else: 248 | audio16000, sampling_rate = librosa.load( 249 | audio_path, sr=16000, mono=True) 250 | 251 | print_speakers(speakers, escape) 252 | target_id = get_speaker_id('Target speaker ID: ') 253 | out_path = input('Path to save: ') 254 | length_scale, out_path = get_label_value( 255 | out_path, 'LENGTH', 1, 'length scale') 256 | noise_scale, out_path = get_label_value( 257 | out_path, 'NOISE', 0.1, 'noise scale') 258 | noise_scale_w, out_path = get_label_value( 259 | out_path, 'NOISEW', 0.1, 'deviation of noise') 260 | 261 | from torch import inference_mode, FloatTensor 262 | import numpy as np 263 | with inference_mode(): 264 | units = hubert.units(FloatTensor(audio16000).unsqueeze( 265 | 0).unsqueeze(0)).squeeze(0).numpy() 266 | if use_f0: 267 | f0_scale, out_path = get_label_value( 268 | out_path, 'F0', 1, 'f0 scale') 269 | f0 = librosa.pyin(audio, sr=sampling_rate, 270 | fmin=librosa.note_to_hz('C0'), 271 | fmax=librosa.note_to_hz('C7'), 272 | frame_length=1780)[0] 273 | target_length = len(units[:, 0]) 274 | f0 = np.nan_to_num(np.interp(np.arange(0, len(f0)*target_length, len(f0))/target_length, 275 | np.arange(0, len(f0)), f0)) * f0_scale 276 | units[:, 0] = f0 / 10 277 | 278 | stn_tst = FloatTensor(units) 279 | with no_grad(): 280 | x_tst = stn_tst.unsqueeze(0) 281 | x_tst_lengths = LongTensor([stn_tst.size(0)]) 282 | sid = LongTensor([target_id]) 283 | audio = net_g_ms.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=noise_scale, 284 | noise_scale_w=noise_scale_w, length_scale=length_scale)[0][0, 0].data.float().numpy() 285 | 286 | else: 287 | audio, out_path = voice_conversion() 288 | 289 | write(out_path, hps_ms.data.sampling_rate, audio) 290 | print('Successfully saved!') 291 | ask_if_continue() 292 | -------------------------------------------------------------------------------- /HoshiNoYume/actions/MoeGoe/attentions.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | from torch import nn 4 | from torch.nn import functional as F 5 | 6 | import commons 7 | from modules import LayerNorm 8 | 9 | 10 | class Encoder(nn.Module): 11 | def __init__(self, hidden_channels, filter_channels, n_heads, n_layers, kernel_size=1, p_dropout=0., window_size=4, **kwargs): 12 | super().__init__() 13 | self.hidden_channels = hidden_channels 14 | self.filter_channels = filter_channels 15 | self.n_heads = n_heads 16 | self.n_layers = n_layers 17 | self.kernel_size = kernel_size 18 | self.p_dropout = p_dropout 19 | self.window_size = window_size 20 | 21 | self.drop = nn.Dropout(p_dropout) 22 | self.attn_layers = nn.ModuleList() 23 | self.norm_layers_1 = nn.ModuleList() 24 | self.ffn_layers = nn.ModuleList() 25 | self.norm_layers_2 = nn.ModuleList() 26 | for i in range(self.n_layers): 27 | self.attn_layers.append(MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout, window_size=window_size)) 28 | self.norm_layers_1.append(LayerNorm(hidden_channels)) 29 | self.ffn_layers.append(FFN(hidden_channels, hidden_channels, filter_channels, kernel_size, 
p_dropout=p_dropout)) 30 | self.norm_layers_2.append(LayerNorm(hidden_channels)) 31 | 32 | def forward(self, x, x_mask): 33 | attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1) 34 | x = x * x_mask 35 | for i in range(self.n_layers): 36 | y = self.attn_layers[i](x, x, attn_mask) 37 | y = self.drop(y) 38 | x = self.norm_layers_1[i](x + y) 39 | 40 | y = self.ffn_layers[i](x, x_mask) 41 | y = self.drop(y) 42 | x = self.norm_layers_2[i](x + y) 43 | x = x * x_mask 44 | return x 45 | 46 | 47 | class Decoder(nn.Module): 48 | def __init__(self, hidden_channels, filter_channels, n_heads, n_layers, kernel_size=1, p_dropout=0., proximal_bias=False, proximal_init=True, **kwargs): 49 | super().__init__() 50 | self.hidden_channels = hidden_channels 51 | self.filter_channels = filter_channels 52 | self.n_heads = n_heads 53 | self.n_layers = n_layers 54 | self.kernel_size = kernel_size 55 | self.p_dropout = p_dropout 56 | self.proximal_bias = proximal_bias 57 | self.proximal_init = proximal_init 58 | 59 | self.drop = nn.Dropout(p_dropout) 60 | self.self_attn_layers = nn.ModuleList() 61 | self.norm_layers_0 = nn.ModuleList() 62 | self.encdec_attn_layers = nn.ModuleList() 63 | self.norm_layers_1 = nn.ModuleList() 64 | self.ffn_layers = nn.ModuleList() 65 | self.norm_layers_2 = nn.ModuleList() 66 | for i in range(self.n_layers): 67 | self.self_attn_layers.append(MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout, proximal_bias=proximal_bias, proximal_init=proximal_init)) 68 | self.norm_layers_0.append(LayerNorm(hidden_channels)) 69 | self.encdec_attn_layers.append(MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout)) 70 | self.norm_layers_1.append(LayerNorm(hidden_channels)) 71 | self.ffn_layers.append(FFN(hidden_channels, hidden_channels, filter_channels, kernel_size, p_dropout=p_dropout, causal=True)) 72 | self.norm_layers_2.append(LayerNorm(hidden_channels)) 73 | 74 | def forward(self, x, x_mask, h, h_mask): 75 | """ 76 | x: decoder input 77 | h: encoder output 78 | """ 79 | self_attn_mask = commons.subsequent_mask(x_mask.size(2)).to(device=x.device, dtype=x.dtype) 80 | encdec_attn_mask = h_mask.unsqueeze(2) * x_mask.unsqueeze(-1) 81 | x = x * x_mask 82 | for i in range(self.n_layers): 83 | y = self.self_attn_layers[i](x, x, self_attn_mask) 84 | y = self.drop(y) 85 | x = self.norm_layers_0[i](x + y) 86 | 87 | y = self.encdec_attn_layers[i](x, h, encdec_attn_mask) 88 | y = self.drop(y) 89 | x = self.norm_layers_1[i](x + y) 90 | 91 | y = self.ffn_layers[i](x, x_mask) 92 | y = self.drop(y) 93 | x = self.norm_layers_2[i](x + y) 94 | x = x * x_mask 95 | return x 96 | 97 | 98 | class MultiHeadAttention(nn.Module): 99 | def __init__(self, channels, out_channels, n_heads, p_dropout=0., window_size=None, heads_share=True, block_length=None, proximal_bias=False, proximal_init=False): 100 | super().__init__() 101 | assert channels % n_heads == 0 102 | 103 | self.channels = channels 104 | self.out_channels = out_channels 105 | self.n_heads = n_heads 106 | self.p_dropout = p_dropout 107 | self.window_size = window_size 108 | self.heads_share = heads_share 109 | self.block_length = block_length 110 | self.proximal_bias = proximal_bias 111 | self.proximal_init = proximal_init 112 | self.attn = None 113 | 114 | self.k_channels = channels // n_heads 115 | self.conv_q = nn.Conv1d(channels, channels, 1) 116 | self.conv_k = nn.Conv1d(channels, channels, 1) 117 | self.conv_v = nn.Conv1d(channels, channels, 1) 118 | self.conv_o = nn.Conv1d(channels, out_channels, 
1) 119 | self.drop = nn.Dropout(p_dropout) 120 | 121 | if window_size is not None: 122 | n_heads_rel = 1 if heads_share else n_heads 123 | rel_stddev = self.k_channels**-0.5 124 | self.emb_rel_k = nn.Parameter(torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev) 125 | self.emb_rel_v = nn.Parameter(torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev) 126 | 127 | nn.init.xavier_uniform_(self.conv_q.weight) 128 | nn.init.xavier_uniform_(self.conv_k.weight) 129 | nn.init.xavier_uniform_(self.conv_v.weight) 130 | if proximal_init: 131 | with torch.no_grad(): 132 | self.conv_k.weight.copy_(self.conv_q.weight) 133 | self.conv_k.bias.copy_(self.conv_q.bias) 134 | 135 | def forward(self, x, c, attn_mask=None): 136 | q = self.conv_q(x) 137 | k = self.conv_k(c) 138 | v = self.conv_v(c) 139 | 140 | x, self.attn = self.attention(q, k, v, mask=attn_mask) 141 | 142 | x = self.conv_o(x) 143 | return x 144 | 145 | def attention(self, query, key, value, mask=None): 146 | # reshape [b, d, t] -> [b, n_h, t, d_k] 147 | b, d, t_s, t_t = (*key.size(), query.size(2)) 148 | query = query.view(b, self.n_heads, self.k_channels, t_t).transpose(2, 3) 149 | key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3) 150 | value = value.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3) 151 | 152 | scores = torch.matmul(query / math.sqrt(self.k_channels), key.transpose(-2, -1)) 153 | if self.window_size is not None: 154 | assert t_s == t_t, "Relative attention is only available for self-attention." 155 | key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, t_s) 156 | rel_logits = self._matmul_with_relative_keys(query /math.sqrt(self.k_channels), key_relative_embeddings) 157 | scores_local = self._relative_position_to_absolute_position(rel_logits) 158 | scores = scores + scores_local 159 | if self.proximal_bias: 160 | assert t_s == t_t, "Proximal bias is only available for self-attention." 161 | scores = scores + self._attention_bias_proximal(t_s).to(device=scores.device, dtype=scores.dtype) 162 | if mask is not None: 163 | scores = scores.masked_fill(mask == 0, -1e4) 164 | if self.block_length is not None: 165 | assert t_s == t_t, "Local attention is only available for self-attention." 
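        # keep only a local band of keys around each query: triu(-block_length).tril(block_length)
        # retains positions with |i - j| <= block_length, and everything outside the band is
        # filled with -1e4 so it effectively vanishes in the softmax below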
166 | block_mask = torch.ones_like(scores).triu(-self.block_length).tril(self.block_length) 167 | scores = scores.masked_fill(block_mask == 0, -1e4) 168 | p_attn = F.softmax(scores, dim=-1) # [b, n_h, t_t, t_s] 169 | p_attn = self.drop(p_attn) 170 | output = torch.matmul(p_attn, value) 171 | if self.window_size is not None: 172 | relative_weights = self._absolute_position_to_relative_position(p_attn) 173 | value_relative_embeddings = self._get_relative_embeddings(self.emb_rel_v, t_s) 174 | output = output + self._matmul_with_relative_values(relative_weights, value_relative_embeddings) 175 | output = output.transpose(2, 3).contiguous().view(b, d, t_t) # [b, n_h, t_t, d_k] -> [b, d, t_t] 176 | return output, p_attn 177 | 178 | def _matmul_with_relative_values(self, x, y): 179 | """ 180 | x: [b, h, l, m] 181 | y: [h or 1, m, d] 182 | ret: [b, h, l, d] 183 | """ 184 | ret = torch.matmul(x, y.unsqueeze(0)) 185 | return ret 186 | 187 | def _matmul_with_relative_keys(self, x, y): 188 | """ 189 | x: [b, h, l, d] 190 | y: [h or 1, m, d] 191 | ret: [b, h, l, m] 192 | """ 193 | ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1)) 194 | return ret 195 | 196 | def _get_relative_embeddings(self, relative_embeddings, length): 197 | max_relative_position = 2 * self.window_size + 1 198 | # Pad first before slice to avoid using cond ops. 199 | pad_length = max(length - (self.window_size + 1), 0) 200 | slice_start_position = max((self.window_size + 1) - length, 0) 201 | slice_end_position = slice_start_position + 2 * length - 1 202 | if pad_length > 0: 203 | padded_relative_embeddings = F.pad( 204 | relative_embeddings, 205 | commons.convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]])) 206 | else: 207 | padded_relative_embeddings = relative_embeddings 208 | used_relative_embeddings = padded_relative_embeddings[:,slice_start_position:slice_end_position] 209 | return used_relative_embeddings 210 | 211 | def _relative_position_to_absolute_position(self, x): 212 | """ 213 | x: [b, h, l, 2*l-1] 214 | ret: [b, h, l, l] 215 | """ 216 | batch, heads, length, _ = x.size() 217 | # Concat columns of pad to shift from relative to absolute indexing. 218 | x = F.pad(x, commons.convert_pad_shape([[0,0],[0,0],[0,0],[0,1]])) 219 | 220 | # Concat extra elements so to add up to shape (len+1, 2*len-1). 221 | x_flat = x.view([batch, heads, length * 2 * length]) 222 | x_flat = F.pad(x_flat, commons.convert_pad_shape([[0,0],[0,0],[0,length-1]])) 223 | 224 | # Reshape and slice out the padded elements. 225 | x_final = x_flat.view([batch, heads, length+1, 2*length-1])[:, :, :length, length-1:] 226 | return x_final 227 | 228 | def _absolute_position_to_relative_position(self, x): 229 | """ 230 | x: [b, h, l, l] 231 | ret: [b, h, l, 2*l-1] 232 | """ 233 | batch, heads, length, _ = x.size() 234 | # padd along column 235 | x = F.pad(x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length-1]])) 236 | x_flat = x.view([batch, heads, length**2 + length*(length -1)]) 237 | # add 0's in the beginning that will skew the elements after reshape 238 | x_flat = F.pad(x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [length, 0]])) 239 | x_final = x_flat.view([batch, heads, length, 2*length])[:,:,:,1:] 240 | return x_final 241 | 242 | def _attention_bias_proximal(self, length): 243 | """Bias for self-attention to encourage attention to close positions. 244 | Args: 245 | length: an integer scalar. 
246 | Returns: 247 | a Tensor with shape [1, 1, length, length] 248 | """ 249 | r = torch.arange(length, dtype=torch.float32) 250 | diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1) 251 | return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0) 252 | 253 | 254 | class FFN(nn.Module): 255 | def __init__(self, in_channels, out_channels, filter_channels, kernel_size, p_dropout=0., activation=None, causal=False): 256 | super().__init__() 257 | self.in_channels = in_channels 258 | self.out_channels = out_channels 259 | self.filter_channels = filter_channels 260 | self.kernel_size = kernel_size 261 | self.p_dropout = p_dropout 262 | self.activation = activation 263 | self.causal = causal 264 | 265 | if causal: 266 | self.padding = self._causal_padding 267 | else: 268 | self.padding = self._same_padding 269 | 270 | self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size) 271 | self.conv_2 = nn.Conv1d(filter_channels, out_channels, kernel_size) 272 | self.drop = nn.Dropout(p_dropout) 273 | 274 | def forward(self, x, x_mask): 275 | x = self.conv_1(self.padding(x * x_mask)) 276 | if self.activation == "gelu": 277 | x = x * torch.sigmoid(1.702 * x) 278 | else: 279 | x = torch.relu(x) 280 | x = self.drop(x) 281 | x = self.conv_2(self.padding(x * x_mask)) 282 | return x * x_mask 283 | 284 | def _causal_padding(self, x): 285 | if self.kernel_size == 1: 286 | return x 287 | pad_l = self.kernel_size - 1 288 | pad_r = 0 289 | padding = [[0, 0], [0, 0], [pad_l, pad_r]] 290 | x = F.pad(x, commons.convert_pad_shape(padding)) 291 | return x 292 | 293 | def _same_padding(self, x): 294 | if self.kernel_size == 1: 295 | return x 296 | pad_l = (self.kernel_size - 1) // 2 297 | pad_r = self.kernel_size // 2 298 | padding = [[0, 0], [0, 0], [pad_l, pad_r]] 299 | x = F.pad(x, commons.convert_pad_shape(padding)) 300 | return x 301 | -------------------------------------------------------------------------------- /HoshiNoYume/actions/MoeGoe/modules.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | from torch import nn 4 | from torch.nn import functional as F 5 | 6 | from torch.nn import Conv1d 7 | from torch.nn.utils import weight_norm, remove_weight_norm 8 | 9 | import commons 10 | from commons import init_weights, get_padding 11 | from transforms import piecewise_rational_quadratic_transform 12 | 13 | 14 | LRELU_SLOPE = 0.1 15 | 16 | 17 | class LayerNorm(nn.Module): 18 | def __init__(self, channels, eps=1e-5): 19 | super().__init__() 20 | self.channels = channels 21 | self.eps = eps 22 | 23 | self.gamma = nn.Parameter(torch.ones(channels)) 24 | self.beta = nn.Parameter(torch.zeros(channels)) 25 | 26 | def forward(self, x): 27 | x = x.transpose(1, -1) 28 | x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps) 29 | return x.transpose(1, -1) 30 | 31 | 32 | class ConvReluNorm(nn.Module): 33 | def __init__(self, in_channels, hidden_channels, out_channels, kernel_size, n_layers, p_dropout): 34 | super().__init__() 35 | self.in_channels = in_channels 36 | self.hidden_channels = hidden_channels 37 | self.out_channels = out_channels 38 | self.kernel_size = kernel_size 39 | self.n_layers = n_layers 40 | self.p_dropout = p_dropout 41 | assert n_layers > 1, "Number of layers should be larger than 0." 
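    # n_layers Conv1d -> LayerNorm blocks share one ReLU+Dropout stage; the closing 1x1 projection
    # below is zero-initialised, so the residual connection in forward() starts out as an identity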
42 | 43 | self.conv_layers = nn.ModuleList() 44 | self.norm_layers = nn.ModuleList() 45 | self.conv_layers.append(nn.Conv1d(in_channels, hidden_channels, kernel_size, padding=kernel_size//2)) 46 | self.norm_layers.append(LayerNorm(hidden_channels)) 47 | self.relu_drop = nn.Sequential( 48 | nn.ReLU(), 49 | nn.Dropout(p_dropout)) 50 | for _ in range(n_layers-1): 51 | self.conv_layers.append(nn.Conv1d(hidden_channels, hidden_channels, kernel_size, padding=kernel_size//2)) 52 | self.norm_layers.append(LayerNorm(hidden_channels)) 53 | self.proj = nn.Conv1d(hidden_channels, out_channels, 1) 54 | self.proj.weight.data.zero_() 55 | self.proj.bias.data.zero_() 56 | 57 | def forward(self, x, x_mask): 58 | x_org = x 59 | for i in range(self.n_layers): 60 | x = self.conv_layers[i](x * x_mask) 61 | x = self.norm_layers[i](x) 62 | x = self.relu_drop(x) 63 | x = x_org + self.proj(x) 64 | return x * x_mask 65 | 66 | 67 | class DDSConv(nn.Module): 68 | """ 69 | Dilated and Depth-Separable Convolution 70 | """ 71 | def __init__(self, channels, kernel_size, n_layers, p_dropout=0.): 72 | super().__init__() 73 | self.channels = channels 74 | self.kernel_size = kernel_size 75 | self.n_layers = n_layers 76 | self.p_dropout = p_dropout 77 | 78 | self.drop = nn.Dropout(p_dropout) 79 | self.convs_sep = nn.ModuleList() 80 | self.convs_1x1 = nn.ModuleList() 81 | self.norms_1 = nn.ModuleList() 82 | self.norms_2 = nn.ModuleList() 83 | for i in range(n_layers): 84 | dilation = kernel_size ** i 85 | padding = (kernel_size * dilation - dilation) // 2 86 | self.convs_sep.append(nn.Conv1d(channels, channels, kernel_size, 87 | groups=channels, dilation=dilation, padding=padding 88 | )) 89 | self.convs_1x1.append(nn.Conv1d(channels, channels, 1)) 90 | self.norms_1.append(LayerNorm(channels)) 91 | self.norms_2.append(LayerNorm(channels)) 92 | 93 | def forward(self, x, x_mask, g=None): 94 | if g is not None: 95 | x = x + g 96 | for i in range(self.n_layers): 97 | y = self.convs_sep[i](x * x_mask) 98 | y = self.norms_1[i](y) 99 | y = F.gelu(y) 100 | y = self.convs_1x1[i](y) 101 | y = self.norms_2[i](y) 102 | y = F.gelu(y) 103 | y = self.drop(y) 104 | x = x + y 105 | return x * x_mask 106 | 107 | 108 | class WN(torch.nn.Module): 109 | def __init__(self, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=0, p_dropout=0): 110 | super(WN, self).__init__() 111 | assert(kernel_size % 2 == 1) 112 | self.hidden_channels =hidden_channels 113 | self.kernel_size = kernel_size, 114 | self.dilation_rate = dilation_rate 115 | self.n_layers = n_layers 116 | self.gin_channels = gin_channels 117 | self.p_dropout = p_dropout 118 | 119 | self.in_layers = torch.nn.ModuleList() 120 | self.res_skip_layers = torch.nn.ModuleList() 121 | self.drop = nn.Dropout(p_dropout) 122 | 123 | if gin_channels != 0: 124 | cond_layer = torch.nn.Conv1d(gin_channels, 2*hidden_channels*n_layers, 1) 125 | self.cond_layer = torch.nn.utils.weight_norm(cond_layer, name='weight') 126 | 127 | for i in range(n_layers): 128 | dilation = dilation_rate ** i 129 | padding = int((kernel_size * dilation - dilation) / 2) 130 | in_layer = torch.nn.Conv1d(hidden_channels, 2*hidden_channels, kernel_size, 131 | dilation=dilation, padding=padding) 132 | in_layer = torch.nn.utils.weight_norm(in_layer, name='weight') 133 | self.in_layers.append(in_layer) 134 | 135 | # last one is not necessary 136 | if i < n_layers - 1: 137 | res_skip_channels = 2 * hidden_channels 138 | else: 139 | res_skip_channels = hidden_channels 140 | 141 | res_skip_layer = 
torch.nn.Conv1d(hidden_channels, res_skip_channels, 1) 142 | res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name='weight') 143 | self.res_skip_layers.append(res_skip_layer) 144 | 145 | def forward(self, x, x_mask, g=None, **kwargs): 146 | output = torch.zeros_like(x) 147 | n_channels_tensor = torch.IntTensor([self.hidden_channels]) 148 | 149 | if g is not None: 150 | g = self.cond_layer(g) 151 | 152 | for i in range(self.n_layers): 153 | x_in = self.in_layers[i](x) 154 | if g is not None: 155 | cond_offset = i * 2 * self.hidden_channels 156 | g_l = g[:,cond_offset:cond_offset+2*self.hidden_channels,:] 157 | else: 158 | g_l = torch.zeros_like(x_in) 159 | 160 | acts = commons.fused_add_tanh_sigmoid_multiply( 161 | x_in, 162 | g_l, 163 | n_channels_tensor) 164 | acts = self.drop(acts) 165 | 166 | res_skip_acts = self.res_skip_layers[i](acts) 167 | if i < self.n_layers - 1: 168 | res_acts = res_skip_acts[:,:self.hidden_channels,:] 169 | x = (x + res_acts) * x_mask 170 | output = output + res_skip_acts[:,self.hidden_channels:,:] 171 | else: 172 | output = output + res_skip_acts 173 | return output * x_mask 174 | 175 | def remove_weight_norm(self): 176 | if self.gin_channels != 0: 177 | torch.nn.utils.remove_weight_norm(self.cond_layer) 178 | for l in self.in_layers: 179 | torch.nn.utils.remove_weight_norm(l) 180 | for l in self.res_skip_layers: 181 | torch.nn.utils.remove_weight_norm(l) 182 | 183 | 184 | class ResBlock1(torch.nn.Module): 185 | def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)): 186 | super(ResBlock1, self).__init__() 187 | self.convs1 = nn.ModuleList([ 188 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0], 189 | padding=get_padding(kernel_size, dilation[0]))), 190 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1], 191 | padding=get_padding(kernel_size, dilation[1]))), 192 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[2], 193 | padding=get_padding(kernel_size, dilation[2]))) 194 | ]) 195 | self.convs1.apply(init_weights) 196 | 197 | self.convs2 = nn.ModuleList([ 198 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, 199 | padding=get_padding(kernel_size, 1))), 200 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, 201 | padding=get_padding(kernel_size, 1))), 202 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, 203 | padding=get_padding(kernel_size, 1))) 204 | ]) 205 | self.convs2.apply(init_weights) 206 | 207 | def forward(self, x, x_mask=None): 208 | for c1, c2 in zip(self.convs1, self.convs2): 209 | xt = F.leaky_relu(x, LRELU_SLOPE) 210 | if x_mask is not None: 211 | xt = xt * x_mask 212 | xt = c1(xt) 213 | xt = F.leaky_relu(xt, LRELU_SLOPE) 214 | if x_mask is not None: 215 | xt = xt * x_mask 216 | xt = c2(xt) 217 | x = xt + x 218 | if x_mask is not None: 219 | x = x * x_mask 220 | return x 221 | 222 | def remove_weight_norm(self): 223 | for l in self.convs1: 224 | remove_weight_norm(l) 225 | for l in self.convs2: 226 | remove_weight_norm(l) 227 | 228 | 229 | class ResBlock2(torch.nn.Module): 230 | def __init__(self, channels, kernel_size=3, dilation=(1, 3)): 231 | super(ResBlock2, self).__init__() 232 | self.convs = nn.ModuleList([ 233 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0], 234 | padding=get_padding(kernel_size, dilation[0]))), 235 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1], 236 | padding=get_padding(kernel_size, dilation[1]))) 237 | ]) 
238 | self.convs.apply(init_weights) 239 | 240 | def forward(self, x, x_mask=None): 241 | for c in self.convs: 242 | xt = F.leaky_relu(x, LRELU_SLOPE) 243 | if x_mask is not None: 244 | xt = xt * x_mask 245 | xt = c(xt) 246 | x = xt + x 247 | if x_mask is not None: 248 | x = x * x_mask 249 | return x 250 | 251 | def remove_weight_norm(self): 252 | for l in self.convs: 253 | remove_weight_norm(l) 254 | 255 | 256 | class Log(nn.Module): 257 | def forward(self, x, x_mask, reverse=False, **kwargs): 258 | if not reverse: 259 | y = torch.log(torch.clamp_min(x, 1e-5)) * x_mask 260 | logdet = torch.sum(-y, [1, 2]) 261 | return y, logdet 262 | else: 263 | x = torch.exp(x) * x_mask 264 | return x 265 | 266 | 267 | class Flip(nn.Module): 268 | def forward(self, x, *args, reverse=False, **kwargs): 269 | x = torch.flip(x, [1]) 270 | if not reverse: 271 | logdet = torch.zeros(x.size(0)).to(dtype=x.dtype, device=x.device) 272 | return x, logdet 273 | else: 274 | return x 275 | 276 | 277 | class ElementwiseAffine(nn.Module): 278 | def __init__(self, channels): 279 | super().__init__() 280 | self.channels = channels 281 | self.m = nn.Parameter(torch.zeros(channels,1)) 282 | self.logs = nn.Parameter(torch.zeros(channels,1)) 283 | 284 | def forward(self, x, x_mask, reverse=False, **kwargs): 285 | if not reverse: 286 | y = self.m + torch.exp(self.logs) * x 287 | y = y * x_mask 288 | logdet = torch.sum(self.logs * x_mask, [1,2]) 289 | return y, logdet 290 | else: 291 | x = (x - self.m) * torch.exp(-self.logs) * x_mask 292 | return x 293 | 294 | 295 | class ResidualCouplingLayer(nn.Module): 296 | def __init__(self, 297 | channels, 298 | hidden_channels, 299 | kernel_size, 300 | dilation_rate, 301 | n_layers, 302 | p_dropout=0, 303 | gin_channels=0, 304 | mean_only=False): 305 | assert channels % 2 == 0, "channels should be divisible by 2" 306 | super().__init__() 307 | self.channels = channels 308 | self.hidden_channels = hidden_channels 309 | self.kernel_size = kernel_size 310 | self.dilation_rate = dilation_rate 311 | self.n_layers = n_layers 312 | self.half_channels = channels // 2 313 | self.mean_only = mean_only 314 | 315 | self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1) 316 | self.enc = WN(hidden_channels, kernel_size, dilation_rate, n_layers, p_dropout=p_dropout, gin_channels=gin_channels) 317 | self.post = nn.Conv1d(hidden_channels, self.half_channels * (2 - mean_only), 1) 318 | self.post.weight.data.zero_() 319 | self.post.bias.data.zero_() 320 | 321 | def forward(self, x, x_mask, g=None, reverse=False): 322 | x0, x1 = torch.split(x, [self.half_channels]*2, 1) 323 | h = self.pre(x0) * x_mask 324 | h = self.enc(h, x_mask, g=g) 325 | stats = self.post(h) * x_mask 326 | if not self.mean_only: 327 | m, logs = torch.split(stats, [self.half_channels]*2, 1) 328 | else: 329 | m = stats 330 | logs = torch.zeros_like(m) 331 | 332 | if not reverse: 333 | x1 = m + x1 * torch.exp(logs) * x_mask 334 | x = torch.cat([x0, x1], 1) 335 | logdet = torch.sum(logs, [1,2]) 336 | return x, logdet 337 | else: 338 | x1 = (x1 - m) * torch.exp(-logs) * x_mask 339 | x = torch.cat([x0, x1], 1) 340 | return x 341 | 342 | 343 | class ConvFlow(nn.Module): 344 | def __init__(self, in_channels, filter_channels, kernel_size, n_layers, num_bins=10, tail_bound=5.0): 345 | super().__init__() 346 | self.in_channels = in_channels 347 | self.filter_channels = filter_channels 348 | self.kernel_size = kernel_size 349 | self.n_layers = n_layers 350 | self.num_bins = num_bins 351 | self.tail_bound = tail_bound 352 | 
self.half_channels = in_channels // 2 353 | 354 | self.pre = nn.Conv1d(self.half_channels, filter_channels, 1) 355 | self.convs = DDSConv(filter_channels, kernel_size, n_layers, p_dropout=0.) 356 | self.proj = nn.Conv1d(filter_channels, self.half_channels * (num_bins * 3 - 1), 1) 357 | self.proj.weight.data.zero_() 358 | self.proj.bias.data.zero_() 359 | 360 | def forward(self, x, x_mask, g=None, reverse=False): 361 | x0, x1 = torch.split(x, [self.half_channels]*2, 1) 362 | h = self.pre(x0) 363 | h = self.convs(h, x_mask, g=g) 364 | h = self.proj(h) * x_mask 365 | 366 | b, c, t = x0.shape 367 | h = h.reshape(b, c, -1, t).permute(0, 1, 3, 2) # [b, cx?, t] -> [b, c, t, ?] 368 | 369 | unnormalized_widths = h[..., :self.num_bins] / math.sqrt(self.filter_channels) 370 | unnormalized_heights = h[..., self.num_bins:2*self.num_bins] / math.sqrt(self.filter_channels) 371 | unnormalized_derivatives = h[..., 2 * self.num_bins:] 372 | 373 | x1, logabsdet = piecewise_rational_quadratic_transform(x1, 374 | unnormalized_widths, 375 | unnormalized_heights, 376 | unnormalized_derivatives, 377 | inverse=reverse, 378 | tails='linear', 379 | tail_bound=self.tail_bound 380 | ) 381 | 382 | x = torch.cat([x0, x1], 1) * x_mask 383 | logdet = torch.sum(logabsdet * x_mask, [1,2]) 384 | if not reverse: 385 | return x, logdet 386 | else: 387 | return x 388 | -------------------------------------------------------------------------------- /HoshiNoYume/actions/MoeGoe/models.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | from torch import nn 4 | from torch.nn import functional as F 5 | 6 | import commons 7 | import modules 8 | import attentions 9 | 10 | from torch.nn import Conv1d, ConvTranspose1d 11 | from torch.nn.utils import weight_norm 12 | from commons import init_weights 13 | 14 | 15 | class StochasticDurationPredictor(nn.Module): 16 | def __init__(self, in_channels, filter_channels, kernel_size, p_dropout, n_flows=4, gin_channels=0): 17 | super().__init__() 18 | filter_channels = in_channels # it needs to be removed from future version. 
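    # Flow-based duration model (the VITS stochastic duration predictor): with
    # reverse=False the forward pass returns the negative log-likelihood of the
    # ground-truth durations w (plus the posterior-flow term) for training;
    # with reverse=True it samples log-durations by pushing noise through the
    # inverted flows, conditioned on the text encoding x.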
19 | self.in_channels = in_channels 20 | self.filter_channels = filter_channels 21 | self.kernel_size = kernel_size 22 | self.p_dropout = p_dropout 23 | self.n_flows = n_flows 24 | self.gin_channels = gin_channels 25 | 26 | self.log_flow = modules.Log() 27 | self.flows = nn.ModuleList() 28 | self.flows.append(modules.ElementwiseAffine(2)) 29 | for i in range(n_flows): 30 | self.flows.append(modules.ConvFlow(2, filter_channels, kernel_size, n_layers=3)) 31 | self.flows.append(modules.Flip()) 32 | 33 | self.post_pre = nn.Conv1d(1, filter_channels, 1) 34 | self.post_proj = nn.Conv1d(filter_channels, filter_channels, 1) 35 | self.post_convs = modules.DDSConv(filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout) 36 | self.post_flows = nn.ModuleList() 37 | self.post_flows.append(modules.ElementwiseAffine(2)) 38 | for i in range(4): 39 | self.post_flows.append(modules.ConvFlow(2, filter_channels, kernel_size, n_layers=3)) 40 | self.post_flows.append(modules.Flip()) 41 | 42 | self.pre = nn.Conv1d(in_channels, filter_channels, 1) 43 | self.proj = nn.Conv1d(filter_channels, filter_channels, 1) 44 | self.convs = modules.DDSConv(filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout) 45 | if gin_channels != 0: 46 | self.cond = nn.Conv1d(gin_channels, filter_channels, 1) 47 | 48 | def forward(self, x, x_mask, w=None, g=None, reverse=False, noise_scale=1.0): 49 | x = torch.detach(x) 50 | x = self.pre(x) 51 | if g is not None: 52 | g = torch.detach(g) 53 | x = x + self.cond(g) 54 | x = self.convs(x, x_mask) 55 | x = self.proj(x) * x_mask 56 | 57 | if not reverse: 58 | flows = self.flows 59 | assert w is not None 60 | 61 | logdet_tot_q = 0 62 | h_w = self.post_pre(w) 63 | h_w = self.post_convs(h_w, x_mask) 64 | h_w = self.post_proj(h_w) * x_mask 65 | e_q = torch.randn(w.size(0), 2, w.size(2)).to(device=x.device, dtype=x.dtype) * x_mask 66 | z_q = e_q 67 | for flow in self.post_flows: 68 | z_q, logdet_q = flow(z_q, x_mask, g=(x + h_w)) 69 | logdet_tot_q += logdet_q 70 | z_u, z1 = torch.split(z_q, [1, 1], 1) 71 | u = torch.sigmoid(z_u) * x_mask 72 | z0 = (w - u) * x_mask 73 | logdet_tot_q += torch.sum((F.logsigmoid(z_u) + F.logsigmoid(-z_u)) * x_mask, [1,2]) 74 | logq = torch.sum(-0.5 * (math.log(2*math.pi) + (e_q**2)) * x_mask, [1,2]) - logdet_tot_q 75 | 76 | logdet_tot = 0 77 | z0, logdet = self.log_flow(z0, x_mask) 78 | logdet_tot += logdet 79 | z = torch.cat([z0, z1], 1) 80 | for flow in flows: 81 | z, logdet = flow(z, x_mask, g=x, reverse=reverse) 82 | logdet_tot = logdet_tot + logdet 83 | nll = torch.sum(0.5 * (math.log(2*math.pi) + (z**2)) * x_mask, [1,2]) - logdet_tot 84 | return nll + logq # [b] 85 | else: 86 | flows = list(reversed(self.flows)) 87 | flows = flows[:-2] + [flows[-1]] # remove a useless vflow 88 | z = torch.randn(x.size(0), 2, x.size(2)).to(device=x.device, dtype=x.dtype) * noise_scale 89 | for flow in flows: 90 | z = flow(z, x_mask, g=x, reverse=reverse) 91 | z0, z1 = torch.split(z, [1, 1], 1) 92 | logw = z0 93 | return logw 94 | 95 | 96 | class DurationPredictor(nn.Module): 97 | def __init__(self, in_channels, filter_channels, kernel_size, p_dropout, gin_channels=0): 98 | super().__init__() 99 | 100 | self.in_channels = in_channels 101 | self.filter_channels = filter_channels 102 | self.kernel_size = kernel_size 103 | self.p_dropout = p_dropout 104 | self.gin_channels = gin_channels 105 | 106 | self.drop = nn.Dropout(p_dropout) 107 | self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size, padding=kernel_size//2) 108 | self.norm_1 = 
modules.LayerNorm(filter_channels) 109 | self.conv_2 = nn.Conv1d(filter_channels, filter_channels, kernel_size, padding=kernel_size//2) 110 | self.norm_2 = modules.LayerNorm(filter_channels) 111 | self.proj = nn.Conv1d(filter_channels, 1, 1) 112 | 113 | if gin_channels != 0: 114 | self.cond = nn.Conv1d(gin_channels, in_channels, 1) 115 | 116 | def forward(self, x, x_mask, g=None): 117 | x = torch.detach(x) 118 | if g is not None: 119 | g = torch.detach(g) 120 | x = x + self.cond(g) 121 | x = self.conv_1(x * x_mask) 122 | x = torch.relu(x) 123 | x = self.norm_1(x) 124 | x = self.drop(x) 125 | x = self.conv_2(x * x_mask) 126 | x = torch.relu(x) 127 | x = self.norm_2(x) 128 | x = self.drop(x) 129 | x = self.proj(x * x_mask) 130 | return x * x_mask 131 | 132 | 133 | class TextEncoder(nn.Module): 134 | def __init__(self, 135 | n_vocab, 136 | out_channels, 137 | hidden_channels, 138 | filter_channels, 139 | n_heads, 140 | n_layers, 141 | kernel_size, 142 | p_dropout, 143 | emotion_embedding): 144 | super().__init__() 145 | self.n_vocab = n_vocab 146 | self.out_channels = out_channels 147 | self.hidden_channels = hidden_channels 148 | self.filter_channels = filter_channels 149 | self.n_heads = n_heads 150 | self.n_layers = n_layers 151 | self.kernel_size = kernel_size 152 | self.p_dropout = p_dropout 153 | self.emotion_embedding = emotion_embedding 154 | 155 | if self.n_vocab!=0: 156 | self.emb = nn.Embedding(n_vocab, hidden_channels) 157 | if emotion_embedding: 158 | self.emo_proj = nn.Linear(1024, hidden_channels) 159 | nn.init.normal_(self.emb.weight, 0.0, hidden_channels**-0.5) 160 | 161 | self.encoder = attentions.Encoder( 162 | hidden_channels, 163 | filter_channels, 164 | n_heads, 165 | n_layers, 166 | kernel_size, 167 | p_dropout) 168 | self.proj= nn.Conv1d(hidden_channels, out_channels * 2, 1) 169 | 170 | def forward(self, x, x_lengths, emotion_embedding=None): 171 | if self.n_vocab!=0: 172 | x = self.emb(x) * math.sqrt(self.hidden_channels) # [b, t, h] 173 | if emotion_embedding is not None: 174 | x = x + self.emo_proj(emotion_embedding.unsqueeze(1)) 175 | x = torch.transpose(x, 1, -1) # [b, h, t] 176 | x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype) 177 | 178 | x = self.encoder(x * x_mask, x_mask) 179 | stats = self.proj(x) * x_mask 180 | 181 | m, logs = torch.split(stats, self.out_channels, dim=1) 182 | return x, m, logs, x_mask 183 | 184 | 185 | class ResidualCouplingBlock(nn.Module): 186 | def __init__(self, 187 | channels, 188 | hidden_channels, 189 | kernel_size, 190 | dilation_rate, 191 | n_layers, 192 | n_flows=4, 193 | gin_channels=0): 194 | super().__init__() 195 | self.channels = channels 196 | self.hidden_channels = hidden_channels 197 | self.kernel_size = kernel_size 198 | self.dilation_rate = dilation_rate 199 | self.n_layers = n_layers 200 | self.n_flows = n_flows 201 | self.gin_channels = gin_channels 202 | 203 | self.flows = nn.ModuleList() 204 | for i in range(n_flows): 205 | self.flows.append(modules.ResidualCouplingLayer(channels, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels, mean_only=True)) 206 | self.flows.append(modules.Flip()) 207 | 208 | def forward(self, x, x_mask, g=None, reverse=False): 209 | if not reverse: 210 | for flow in self.flows: 211 | x, _ = flow(x, x_mask, g=g, reverse=reverse) 212 | else: 213 | for flow in reversed(self.flows): 214 | x = flow(x, x_mask, g=g, reverse=reverse) 215 | return x 216 | 217 | 218 | class PosteriorEncoder(nn.Module): 219 | def __init__(self, 220 | 
in_channels, 221 | out_channels, 222 | hidden_channels, 223 | kernel_size, 224 | dilation_rate, 225 | n_layers, 226 | gin_channels=0): 227 | super().__init__() 228 | self.in_channels = in_channels 229 | self.out_channels = out_channels 230 | self.hidden_channels = hidden_channels 231 | self.kernel_size = kernel_size 232 | self.dilation_rate = dilation_rate 233 | self.n_layers = n_layers 234 | self.gin_channels = gin_channels 235 | 236 | self.pre = nn.Conv1d(in_channels, hidden_channels, 1) 237 | self.enc = modules.WN(hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels) 238 | self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) 239 | 240 | def forward(self, x, x_lengths, g=None): 241 | x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype) 242 | x = self.pre(x) * x_mask 243 | x = self.enc(x, x_mask, g=g) 244 | stats = self.proj(x) * x_mask 245 | m, logs = torch.split(stats, self.out_channels, dim=1) 246 | z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask 247 | return z, m, logs, x_mask 248 | 249 | 250 | class Generator(torch.nn.Module): 251 | def __init__(self, initial_channel, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, gin_channels=0): 252 | super(Generator, self).__init__() 253 | self.num_kernels = len(resblock_kernel_sizes) 254 | self.num_upsamples = len(upsample_rates) 255 | self.conv_pre = Conv1d(initial_channel, upsample_initial_channel, 7, 1, padding=3) 256 | resblock = modules.ResBlock1 if resblock == '1' else modules.ResBlock2 257 | 258 | self.ups = nn.ModuleList() 259 | for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)): 260 | self.ups.append(weight_norm( 261 | ConvTranspose1d(upsample_initial_channel//(2**i), upsample_initial_channel//(2**(i+1)), 262 | k, u, padding=(k-u)//2))) 263 | 264 | self.resblocks = nn.ModuleList() 265 | for i in range(len(self.ups)): 266 | ch = upsample_initial_channel//(2**(i+1)) 267 | for j, (k, d) in enumerate(zip(resblock_kernel_sizes, resblock_dilation_sizes)): 268 | self.resblocks.append(resblock(ch, k, d)) 269 | 270 | self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False) 271 | self.ups.apply(init_weights) 272 | 273 | if gin_channels != 0: 274 | self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1) 275 | 276 | def forward(self, x, g=None): 277 | x = self.conv_pre(x) 278 | if g is not None: 279 | x = x + self.cond(g) 280 | 281 | for i in range(self.num_upsamples): 282 | x = F.leaky_relu(x, modules.LRELU_SLOPE) 283 | x = self.ups[i](x) 284 | xs = None 285 | for j in range(self.num_kernels): 286 | if xs is None: 287 | xs = self.resblocks[i*self.num_kernels+j](x) 288 | else: 289 | xs += self.resblocks[i*self.num_kernels+j](x) 290 | x = xs / self.num_kernels 291 | x = F.leaky_relu(x) 292 | x = self.conv_post(x) 293 | x = torch.tanh(x) 294 | 295 | return x 296 | 297 | 298 | class SynthesizerTrn(nn.Module): 299 | """ 300 | Synthesizer for Training 301 | """ 302 | 303 | def __init__(self, 304 | n_vocab, 305 | spec_channels, 306 | segment_size, 307 | inter_channels, 308 | hidden_channels, 309 | filter_channels, 310 | n_heads, 311 | n_layers, 312 | kernel_size, 313 | p_dropout, 314 | resblock, 315 | resblock_kernel_sizes, 316 | resblock_dilation_sizes, 317 | upsample_rates, 318 | upsample_initial_channel, 319 | upsample_kernel_sizes, 320 | n_speakers=0, 321 | gin_channels=0, 322 | use_sdp=True, 323 | emotion_embedding=False, 324 | **kwargs): 325 | 326 | super().__init__() 
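    # Top-level VITS synthesizer: enc_p encodes text into a prior distribution,
    # enc_q encodes spectrograms into a posterior (training only), flow maps
    # between posterior and prior latents, dec is the upsampling Generator that
    # produces the waveform, and dp predicts durations (stochastic if use_sdp).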
327 | self.n_vocab = n_vocab 328 | self.spec_channels = spec_channels 329 | self.inter_channels = inter_channels 330 | self.hidden_channels = hidden_channels 331 | self.filter_channels = filter_channels 332 | self.n_heads = n_heads 333 | self.n_layers = n_layers 334 | self.kernel_size = kernel_size 335 | self.p_dropout = p_dropout 336 | self.resblock = resblock 337 | self.resblock_kernel_sizes = resblock_kernel_sizes 338 | self.resblock_dilation_sizes = resblock_dilation_sizes 339 | self.upsample_rates = upsample_rates 340 | self.upsample_initial_channel = upsample_initial_channel 341 | self.upsample_kernel_sizes = upsample_kernel_sizes 342 | self.segment_size = segment_size 343 | self.n_speakers = n_speakers 344 | self.gin_channels = gin_channels 345 | 346 | self.use_sdp = use_sdp 347 | 348 | self.enc_p = TextEncoder(n_vocab, 349 | inter_channels, 350 | hidden_channels, 351 | filter_channels, 352 | n_heads, 353 | n_layers, 354 | kernel_size, 355 | p_dropout, 356 | emotion_embedding) 357 | self.dec = Generator(inter_channels, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, gin_channels=gin_channels) 358 | self.enc_q = PosteriorEncoder(spec_channels, inter_channels, hidden_channels, 5, 1, 16, gin_channels=gin_channels) 359 | self.flow = ResidualCouplingBlock(inter_channels, hidden_channels, 5, 1, 4, gin_channels=gin_channels) 360 | 361 | if use_sdp: 362 | self.dp = StochasticDurationPredictor(hidden_channels, 192, 3, 0.5, 4, gin_channels=gin_channels) 363 | else: 364 | self.dp = DurationPredictor(hidden_channels, 256, 3, 0.5, gin_channels=gin_channels) 365 | 366 | if n_speakers > 1: 367 | self.emb_g = nn.Embedding(n_speakers, gin_channels) 368 | 369 | def infer(self, x, x_lengths, sid=None, noise_scale=1, length_scale=1, noise_scale_w=1., max_len=None, emotion_embedding=None): 370 | x, m_p, logs_p, x_mask = self.enc_p(x, x_lengths, emotion_embedding) 371 | if self.n_speakers > 0: 372 | g = self.emb_g(sid).unsqueeze(-1) # [b, h, 1] 373 | else: 374 | g = None 375 | 376 | if self.use_sdp: 377 | logw = self.dp(x, x_mask, g=g, reverse=True, noise_scale=noise_scale_w) 378 | else: 379 | logw = self.dp(x, x_mask, g=g) 380 | w = torch.exp(logw) * x_mask * length_scale 381 | w_ceil = torch.ceil(w) 382 | y_lengths = torch.clamp_min(torch.sum(w_ceil, [1, 2]), 1).long() 383 | y_mask = torch.unsqueeze(commons.sequence_mask(y_lengths, None), 1).to(x_mask.dtype) 384 | attn_mask = torch.unsqueeze(x_mask, 2) * torch.unsqueeze(y_mask, -1) 385 | attn = commons.generate_path(w_ceil, attn_mask) 386 | 387 | m_p = torch.matmul(attn.squeeze(1), m_p.transpose(1, 2)).transpose(1, 2) # [b, t', t], [b, t, d] -> [b, d, t'] 388 | logs_p = torch.matmul(attn.squeeze(1), logs_p.transpose(1, 2)).transpose(1, 2) # [b, t', t], [b, t, d] -> [b, d, t'] 389 | 390 | z_p = m_p + torch.randn_like(m_p) * torch.exp(logs_p) * noise_scale 391 | z = self.flow(z_p, y_mask, g=g, reverse=True) 392 | o = self.dec((z * y_mask)[:,:,:max_len], g=g) 393 | return o, attn, y_mask, (z, z_p, m_p, logs_p) 394 | 395 | def voice_conversion(self, y, y_lengths, sid_src, sid_tgt): 396 | assert self.n_speakers > 0, "n_speakers have to be larger than 0." 
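    # Voice conversion: encode the source audio conditioned on the source
    # speaker, push the latents through the flow into the prior space, invert
    # the flow conditioned on the target speaker, and decode the result.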
397 | g_src = self.emb_g(sid_src).unsqueeze(-1) 398 | g_tgt = self.emb_g(sid_tgt).unsqueeze(-1) 399 | z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g_src) 400 | z_p = self.flow(z, y_mask, g=g_src) 401 | z_hat = self.flow(z_p, y_mask, g=g_tgt, reverse=True) 402 | o_hat = self.dec(z_hat * y_mask, g=g_tgt) 403 | return o_hat, y_mask, (z, z_p, z_hat) 404 | 405 | --------------------------------------------------------------------------------
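For orientation, here is a minimal inference sketch showing how the classes above fit together. It is illustrative only and not part of the repository: the hyperparameters are toy values standing in for whatever a real VITS/MoeGoe JSON config supplies, and it assumes it is run from inside HoshiNoYume/actions/MoeGoe/ so that models, modules, attentions and commons import exactly as they do in this dump.

import torch
from models import SynthesizerTrn   # defined in models.py above

# Toy hyperparameters for illustration only; a trained checkpoint's JSON
# config provides the real values.
net_g = SynthesizerTrn(
    n_vocab=100,                       # size of the symbol table
    spec_channels=513,                 # spectrogram bins (n_fft // 2 + 1)
    segment_size=32,
    inter_channels=192,
    hidden_channels=192,
    filter_channels=768,
    n_heads=2,
    n_layers=6,
    kernel_size=3,
    p_dropout=0.1,
    resblock='1',
    resblock_kernel_sizes=[3, 7, 11],
    resblock_dilation_sizes=[[1, 3, 5], [1, 3, 5], [1, 3, 5]],
    upsample_rates=[8, 8, 2, 2],       # product = hop length of 256 samples
    upsample_initial_channel=512,
    upsample_kernel_sizes=[16, 16, 4, 4],
    n_speakers=2,
    gin_channels=256,
)
net_g.eval()

# Dummy phoneme-id input; MoeGoe builds the real sequence with
# text_to_sequence() from the text package.
x = torch.randint(0, 100, (1, 20))       # [batch, text_len]
x_lengths = torch.LongTensor([x.size(1)])
sid = torch.LongTensor([0])              # speaker id

with torch.no_grad():
    audio, attn, y_mask, _ = net_g.infer(
        x, x_lengths, sid=sid,
        noise_scale=0.667, noise_scale_w=0.8, length_scale=1.0)

print(audio.shape)                        # [1, 1, n_samples]

In actual use the model weights are restored from a trained checkpoint before calling infer; with the random initialisation above the output is just noise.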