├── .gitattributes ├── model ├── kws │ └── README.md └── tts │ └── README.md ├── run.bat ├── HoshiNoYume ├── test.py ├── perception │ ├── __init__.py │ ├── text_input.py │ └── auditory.py ├── actions │ ├── MoeGoe │ │ ├── __init__.py │ │ ├── requirements.txt │ │ ├── text │ │ │ ├── __init__.py │ │ │ ├── LICENSE │ │ │ ├── thai.py │ │ │ ├── ngu_dialect.py │ │ │ ├── sanskrit.py │ │ │ ├── cantonese.py │ │ │ ├── shanghainese.py │ │ │ ├── japanese.py │ │ │ ├── english.py │ │ │ ├── cleaners.py │ │ │ ├── korean.py │ │ │ └── mandarin.py │ │ ├── LICENSE │ │ ├── README.md │ │ ├── utils.py │ │ ├── .gitattributes │ │ ├── commons.py │ │ ├── mel_processing.py │ │ ├── hubert_model.py │ │ ├── .gitignore │ │ ├── transforms.py │ │ ├── MoeGoe.py │ │ ├── attentions.py │ │ ├── modules.py │ │ └── models.py │ ├── __init__.py │ ├── Live2D.py │ ├── IoT_control.py │ ├── interact.py │ ├── search.py │ └── speaking.py ├── memory │ ├── __init__.py │ ├── short_term_memory.py │ ├── long_summary_memory.txt │ ├── long_term_memory.py │ └── prompts.py ├── thinking │ ├── __init__.py │ ├── agent_interact.py │ ├── agent_search.py │ ├── chat.py │ └── prompts.py ├── tools │ ├── __init__.py │ ├── system_control.py │ └── translate.py ├── main_min.py ├── main.py └── api_key_sample.py ├── requirements.txt ├── README.md ├── LICENSE └── .gitignore /.gitattributes: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /model/kws/README.md: -------------------------------------------------------------------------------- 1 | # kws 模型文件 2 | 3 | 请将你的 `KWS` 模型文件放在此处。 -------------------------------------------------------------------------------- /model/tts/README.md: -------------------------------------------------------------------------------- 1 | # VITS 模型文件 2 | 3 | 请将你的 `VITS` 模型文件放在此处。 -------------------------------------------------------------------------------- /run.bat: -------------------------------------------------------------------------------- 1 | @echo off 2 | call yume_env\Scripts\activate 3 | python HoshiNoYume\main.py -------------------------------------------------------------------------------- /HoshiNoYume/test.py: -------------------------------------------------------------------------------- 1 | from actions.Live2D import live2d_open 2 | import time 3 | 4 | live2d_open() 5 | while True: 6 | time.sleep(0.2) -------------------------------------------------------------------------------- /HoshiNoYume/perception/__init__.py: -------------------------------------------------------------------------------- 1 | from perception.text_input import text_input 2 | from perception.auditory import listen 3 | 4 | __all__ = [ 5 | "text_input", 6 | "listen", 7 | ] -------------------------------------------------------------------------------- /HoshiNoYume/actions/MoeGoe/__init__.py: -------------------------------------------------------------------------------- 1 | from .mel_processing import * 2 | from .MoeGoe import * 3 | from .utils import * 4 | from .commons import * 5 | from text import text_to_sequence, _clean_text 6 | -------------------------------------------------------------------------------- /HoshiNoYume/memory/__init__.py: -------------------------------------------------------------------------------- 1 | from memory.long_term_memory import long_memory 2 | from memory.short_term_memory import short_memory 3 | 4 | __all__ = [ 5 | "long_memory", 6 | "short_memory" 7 | ] 
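A note on the MoeGoe package shown above: `actions/MoeGoe/__init__.py` mixes relative imports with a top-level `from text import text_to_sequence, _clean_text`, so the `MoeGoe` directory itself has to be on `sys.path` before anything from it is imported (this is what `actions/speaking.py` later does with `sys.path.append` before `from MoeGoe import *`). A minimal sketch of that assumption, with the path written relative to the repository root as `run.bat` launches it:

```python
import os
import sys

# Assumes the working directory is the repository root, as when run.bat is used.
# Appending the MoeGoe directory lets `text`, `utils`, `commons`, ... resolve as
# top-level modules, which is what MoeGoe's own absolute imports expect.
sys.path.append(os.path.join("HoshiNoYume", "actions", "MoeGoe"))

from text import text_to_sequence  # resolves to actions/MoeGoe/text/__init__.py
```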
-------------------------------------------------------------------------------- /HoshiNoYume/actions/__init__.py: -------------------------------------------------------------------------------- 1 | from actions.Live2D import live2d_open 2 | from actions.IoT_control import mqtt_connect 3 | from actions.Live2D import socket_init 4 | 5 | __all__ = [ 6 | "live2d_open", 7 | "mqtt_connect", 8 | "socket_init", 9 | ] 10 | -------------------------------------------------------------------------------- /HoshiNoYume/thinking/__init__.py: -------------------------------------------------------------------------------- 1 | from thinking.agent_search import agent_search 2 | from thinking.chat import chat 3 | from thinking.agent_interact import agent_interact 4 | 5 | __all__ = [ 6 | "agent_search", 7 | "chat", 8 | "agent_interact" 9 | ] 10 | -------------------------------------------------------------------------------- /HoshiNoYume/actions/MoeGoe/requirements.txt: -------------------------------------------------------------------------------- 1 | numba 2 | librosa 3 | numpy==1.23.3 4 | scipy 5 | torch 6 | unidecode 7 | openjtalk>=0.3.0.dev2 8 | jamo 9 | pypinyin 10 | jieba 11 | protobuf 12 | cn2an 13 | inflect 14 | eng_to_ipa 15 | ko_pron 16 | indic_transliteration 17 | num_thai 18 | opencc 19 | audonnx 20 | -------------------------------------------------------------------------------- /HoshiNoYume/tools/__init__.py: -------------------------------------------------------------------------------- 1 | from tools.system_control import keyword_wake_up 2 | from tools.system_control import press_key_wake_up 3 | from tools.system_control import print_device_info 4 | from tools.translate import text2text_translate 5 | 6 | __all__ = [ 7 | "keyword_wake_up", 8 | "press_key_wake_up", 9 | "print_device_info", 10 | "text2text_translate", 11 | ] 12 | -------------------------------------------------------------------------------- /HoshiNoYume/memory/short_term_memory.py: -------------------------------------------------------------------------------- 1 | from langchain.memory import ChatMessageHistory 2 | 3 | 4 | class ChatShortMemory(ChatMessageHistory): 5 | 6 | def window_buffer_message(self, round: int): 7 | if len(self.messages) < round * 2: 8 | return self.messages 9 | else: 10 | return self.messages[len(self.messages) - round * 2:] 11 | 12 | short_memory = ChatShortMemory() 13 | -------------------------------------------------------------------------------- /HoshiNoYume/perception/text_input.py: -------------------------------------------------------------------------------- 1 | from api_key import user_name 2 | import time 3 | 4 | def text_input(): 5 | user_input = input(user_name + ": ") 6 | # 加上时间戳 7 | current_time = time.time() 8 | local_time = time.localtime(current_time) 9 | formatted_time = time.strftime("%Y-%m-%d %H:%M", local_time) 10 | 11 | user_input = f'({formatted_time})' + user_input 12 | 13 | return user_input -------------------------------------------------------------------------------- /HoshiNoYume/main_min.py: -------------------------------------------------------------------------------- 1 | from api_key import * 2 | import perception 3 | import thinking 4 | import memory 5 | 6 | def main(): 7 | while True: 8 | user_words = perception.text_input() #文字输入 9 | memory.short_memory.add_user_message(user_words) 10 | response = thinking.chat(memory.short_memory) 11 | memory.short_memory.add_ai_message(response) 12 | 13 | if __name__ == '__main__': 14 | main() 
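`ChatShortMemory.window_buffer_message` above returns only the most recent `round` user/AI message pairs, leaving older messages stored but outside the returned window. A short usage sketch (the window size of 3 and the sample messages are arbitrary examples; the timestamp prefix mirrors what `text_input` prepends):

```python
from memory import short_memory

short_memory.add_user_message("(2023-05-01 12:00)你好呀")
short_memory.add_ai_message("你好!最近过得怎么样?")

# Keep only the last 3 user/AI exchanges; older messages stay in
# short_memory.messages but are excluded from the returned window.
recent = short_memory.window_buffer_message(3)
```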
-------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numba 2 | librosa 3 | numpy==1.23.3 4 | scipy 5 | torch 6 | unidecode 7 | openjtalk>=0.3.0.dev2 8 | jamo 9 | pypinyin 10 | jieba 11 | protobuf 12 | cn2an 13 | inflect 14 | eng_to_ipa 15 | ko_pron 16 | indic_transliteration 17 | num_thai 18 | opencc 19 | audonnx 20 | openai 21 | tencentcloud-sdk-python 22 | pyaudio 23 | simpleaudio 24 | pydub 25 | webrtcvad 26 | asyncio 27 | aiohttp 28 | pvporcupine 29 | paho-mqtt 30 | langchain 31 | pinecone-client 32 | google-api-python-client 33 | keyboard 34 | azure-cognitiveservices-speech 35 | requests 36 | clueai -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # CyberKaNoJo——星野夢 2 | 3 | ## 项目介绍 4 | 5 | CyberKaNoJo(星野夢)是一个正在开发中的项目。由于项目尚在开发阶段,我目前不打算编写完整的`README`文件。但是,您可以通过以下链接查看当前版本下的项目具体使用方法: 6 | 7 | [https://xuanxuanqaq.top/hoshinoyume-v1_1/](https://xuanxuanqaq.top/hoshinoyume-v1_1/) 8 | 9 | ## 注意事项 10 | 11 | - 请注意,此项目仍在开发中,功能可能不稳定或不完整。在使用过程中如遇到问题,请及时向我反馈。 12 | 13 | ## 反馈与建议 14 | 15 | 如果您在使用过程中遇到问题或有任何建议,请通过以下途径与我联系: 16 | 17 | - [GitHub Issues](https://github.com/yourusername/CyberKaNoJo/issues) 18 | - QQ:903166538 19 | 20 | ## 许可证 21 | 22 | 本项目采用[MIT许可证](LICENSE)。请查阅许可证文件了解详细信息。 23 | 24 | 感谢您对CyberKaNoJo(星野夢)项目的关注! -------------------------------------------------------------------------------- /HoshiNoYume/actions/Live2D.py: -------------------------------------------------------------------------------- 1 | import socket 2 | import subprocess 3 | 4 | def socket_init(): 5 | host = '127.0.0.1' 6 | port = 12345 7 | 8 | server_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM) 9 | server_socket.bind((host, port)) 10 | server_socket.listen(1) 11 | 12 | print(f"服务端socket地址 {host}:{port}") 13 | global conn 14 | conn, addr = server_socket.accept() 15 | 16 | print(f"连接到了Live2D客户端: {addr}") 17 | 18 | 19 | def socket_send(message): 20 | conn.send(message.to_bytes(4, 'big', signed=True)) 21 | 22 | def socket_close(): 23 | conn.close() 24 | 25 | def live2d_open(): 26 | exe_path = "model/live2d/Live2D.exe" 27 | subprocess.Popen(exe_path) -------------------------------------------------------------------------------- /HoshiNoYume/actions/IoT_control.py: -------------------------------------------------------------------------------- 1 | import paho.mqtt.client as mqtt 2 | from api_key import * 3 | import json 4 | import threading 5 | 6 | topic = "/moon_light" 7 | light_property = { 8 | "switch": "", 9 | "color": "", 10 | } 11 | client = None 12 | 13 | def mqtt_connect(): 14 | def on_connect(client, userdata, flags, rc): 15 | if rc == 0: 16 | print("连接上MQTT broker了喵~") 17 | client.subscribe(topic) 18 | 19 | # 创建mqtt实例 20 | global client 21 | client = mqtt.Client() 22 | # 绑定连接服务器上时的回调函数 23 | client.on_connect = on_connect 24 | # 连接broker 25 | client.connect(mqtt_broker, mqtt_port) 26 | client.loop_forever() 27 | 28 | 29 | def mqtt_publish(publish_message: dict[str, str]): 30 | publish_message = json.dumps(publish_message) 31 | client.publish(topic, publish_message) 32 | 33 | 34 | if IoT_enabled: 35 | thread_mqtt = threading.Thread(target=mqtt_connect) # 初始化MQTT 36 | thread_mqtt.start() -------------------------------------------------------------------------------- /HoshiNoYume/memory/long_summary_memory.txt: 
-------------------------------------------------------------------------------- 1 | 2 | Hoshino Ai is a popular idol from the B-Komachi group, affiliated with Strawberry Productions. She's known for her acting, singing, and dancing abilities, and has a great memory. She recently spends a lot of time with children, who she finds very cute, and enjoys being with them. She asked the other person if they had done anything fun recently, and they said they would talk about it next time. The other person asked about the weather, and Hoshino Ai reported that it was cloudy, 19 degrees, with a west wind of 3 or less and a humidity of 38%. They then asked about the death of Abe Shinzo, to which Hoshino Ai replied that he had died on July 8th, 2022. They then asked about fun activities in the area, to which Hoshino Ai suggested the West Anli Tech University campus, with its gym, pool, and movie theater, as well as the many delicious food places in the city. They asked who Hoshino Ai was, and she introduced herself as an idol from the B-Komachi group. They said they had no other questions, and Hoshino Ai said they could talk again if they thought of anything else. -------------------------------------------------------------------------------- /HoshiNoYume/actions/MoeGoe/text/__init__.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/keithito/tacotron """ 2 | from text import cleaners 3 | 4 | 5 | def text_to_sequence(text, symbols, cleaner_names): 6 | '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text. 7 | Args: 8 | text: string to convert to a sequence 9 | cleaner_names: names of the cleaner functions to run the text through 10 | Returns: 11 | List of integers corresponding to the symbols in the text 12 | ''' 13 | _symbol_to_id = {s: i for i, s in enumerate(symbols)} 14 | 15 | sequence = [] 16 | 17 | clean_text = _clean_text(text, cleaner_names) 18 | for symbol in clean_text: 19 | if symbol not in _symbol_to_id.keys(): 20 | continue 21 | symbol_id = _symbol_to_id[symbol] 22 | sequence += [symbol_id] 23 | return sequence 24 | 25 | 26 | def _clean_text(text, cleaner_names): 27 | for name in cleaner_names: 28 | cleaner = getattr(cleaners, name) 29 | if not cleaner: 30 | raise Exception('Unknown cleaner: %s' % name) 31 | text = cleaner(text) 32 | return text 33 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 xuanxuanQAQ 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /HoshiNoYume/actions/MoeGoe/text/LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2017 Keith Ito 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in 11 | all copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 | THE SOFTWARE. 20 | -------------------------------------------------------------------------------- /HoshiNoYume/actions/MoeGoe/text/thai.py: -------------------------------------------------------------------------------- 1 | import re 2 | from num_thai.thainumbers import NumThai 3 | 4 | 5 | num = NumThai() 6 | 7 | # List of (Latin alphabet, Thai) pairs: 8 | _latin_to_thai = [(re.compile('%s' % x[0], re.IGNORECASE), x[1]) for x in [ 9 | ('a', 'เอ'), 10 | ('b','บี'), 11 | ('c','ซี'), 12 | ('d','ดี'), 13 | ('e','อี'), 14 | ('f','เอฟ'), 15 | ('g','จี'), 16 | ('h','เอช'), 17 | ('i','ไอ'), 18 | ('j','เจ'), 19 | ('k','เค'), 20 | ('l','แอล'), 21 | ('m','เอ็ม'), 22 | ('n','เอ็น'), 23 | ('o','โอ'), 24 | ('p','พี'), 25 | ('q','คิว'), 26 | ('r','แอร์'), 27 | ('s','เอส'), 28 | ('t','ที'), 29 | ('u','ยู'), 30 | ('v','วี'), 31 | ('w','ดับเบิลยู'), 32 | ('x','เอ็กซ์'), 33 | ('y','วาย'), 34 | ('z','ซี') 35 | ]] 36 | 37 | 38 | def num_to_thai(text): 39 | return re.sub(r'(?:\d+(?:,?\d+)?)+(?:\.\d+(?:,?\d+)?)?', lambda x: ''.join(num.NumberToTextThai(float(x.group(0).replace(',', '')))), text) 40 | 41 | def latin_to_thai(text): 42 | for regex, replacement in _latin_to_thai: 43 | text = re.sub(regex, replacement, text) 44 | return text 45 | -------------------------------------------------------------------------------- /HoshiNoYume/actions/MoeGoe/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 CjangCjengh 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following 
conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /HoshiNoYume/actions/MoeGoe/text/ngu_dialect.py: -------------------------------------------------------------------------------- 1 | import re 2 | import opencc 3 | 4 | 5 | dialects = {'SZ': 'suzhou', 'WX': 'wuxi', 'CZ': 'changzhou', 'HZ': 'hangzhou', 6 | 'SX': 'shaoxing', 'NB': 'ningbo', 'JJ': 'jingjiang', 'YX': 'yixing', 7 | 'JD': 'jiading', 'ZR': 'zhenru', 'PH': 'pinghu', 'TX': 'tongxiang', 8 | 'JS': 'jiashan', 'HN': 'xiashi', 'LP': 'linping', 'XS': 'xiaoshan', 9 | 'FY': 'fuyang', 'RA': 'ruao', 'CX': 'cixi', 'SM': 'sanmen', 10 | 'TT': 'tiantai', 'WZ': 'wenzhou', 'SC': 'suichang', 'YB': 'youbu'} 11 | 12 | converters = {} 13 | 14 | for dialect in dialects.values(): 15 | try: 16 | converters[dialect] = opencc.OpenCC(dialect) 17 | except: 18 | pass 19 | 20 | 21 | def ngu_dialect_to_ipa(text, dialect): 22 | dialect = dialects[dialect] 23 | text = converters[dialect].convert(text).replace('-','').replace('$',' ') 24 | text = re.sub(r'[、;:]', ',', text) 25 | text = re.sub(r'\s*,\s*', ', ', text) 26 | text = re.sub(r'\s*。\s*', '. ', text) 27 | text = re.sub(r'\s*?\s*', '? ', text) 28 | text = re.sub(r'\s*!\s*', '! 
', text) 29 | text = re.sub(r'\s*$', '', text) 30 | return text 31 | -------------------------------------------------------------------------------- /HoshiNoYume/main.py: -------------------------------------------------------------------------------- 1 | from api_key import * 2 | import perception 3 | import thinking 4 | import memory 5 | import tools 6 | import actions 7 | import threading 8 | import re 9 | 10 | # 初始化 11 | def init(): 12 | tools.print_device_info() 13 | if Live2D_enabled: 14 | thread_socket = threading.Thread(target=actions.socket_init) # 初始化Live2D的socket 15 | thread_socket.start() 16 | actions.live2d_open() 17 | thread_socket.join() 18 | tools.press_key_wake_up() 19 | 20 | # 结束对话 21 | def conv_end(): 22 | # 整理此次对话 23 | memory.long_memory.summary_write(memory.short_memory) 24 | memory.long_memory.short_memory_vector_write(memory.short_memory) 25 | # 等待开启下次对话 26 | tools.press_key_wake_up() 27 | 28 | def main(): 29 | init() 30 | while True: 31 | user_words = perception.text_input() #文字输入 32 | # user_words = perception.listen() #语音输入 33 | search_info = thinking.agent_search(user_words) 34 | memory.short_memory.add_user_message(user_words) 35 | response = thinking.chat(memory.short_memory, memory.long_memory, search_info) 36 | memory.short_memory.add_ai_message(response) 37 | 38 | interact = re.search(r'#interact:\s*(.*?)\)', response) 39 | if interact is not None and interact.group(1) == "end": 40 | conv_end() 41 | elif interact is not None: 42 | thinking.agent_interact(interact.group(1)) 43 | 44 | if __name__ == '__main__': 45 | main() -------------------------------------------------------------------------------- /HoshiNoYume/actions/interact.py: -------------------------------------------------------------------------------- 1 | from actions.IoT_control import mqtt_publish 2 | from langchain.agents import Tool 3 | from api_key import * 4 | from tools.system_control import press_key_wake_up 5 | import memory 6 | 7 | def light_handle(instruction): 8 | print("少女行动中...") 9 | if "on" in instruction: 10 | message = {"switch": "light on"} 11 | elif "off" in instruction: 12 | message = {"switch": "light off"} 13 | mqtt_publish(message) 14 | 15 | def end_talk(_): 16 | print("结束对话捏...") 17 | memory.long_memory.summary_write(memory.short_memory) 18 | memory.long_memory.short_memory_vector_write(memory.short_memory) 19 | press_key_wake_up() 20 | 21 | def just_chat(_): 22 | return "chat" 23 | 24 | 25 | # 操作工具列表 26 | interact_tools = [ 27 | Tool( 28 | name = "Light Handle", 29 | func=light_handle, 30 | description="Use this to control the light, input 'on' to turn on the light, and input 'off' to turn off the light.", 31 | return_direct=True 32 | ), 33 | Tool( 34 | name = "end conversation", 35 | func=end_talk, 36 | description="If you think it's time to end conversation, use this.", 37 | return_direct=True 38 | ), 39 | Tool( 40 | name = "Chat", 41 | func=just_chat, 42 | description="If you think I'm not asking a question or you don't need to use other tools or i'm instruct you to do something, take this", 43 | return_direct=True 44 | ) 45 | ] -------------------------------------------------------------------------------- /HoshiNoYume/actions/MoeGoe/README.md: -------------------------------------------------------------------------------- 1 | # Links 2 | - [MoeGoe_GUI](https://github.com/CjangCjengh/MoeGoe_GUI) 3 | - [Pretrained models](https://github.com/CjangCjengh/TTSModels) 4 | 5 | # How to use 6 | Run MoeGoe.exe 7 | ``` 8 | Path of a VITS model: path\to\model.pth 9 | Path of a config file: path\to\config.json 10 | INFO:root:Loaded
checkpoint 'path\to\model.pth' (iteration XXX) 11 | ``` 12 | ## Text to speech 13 | ``` 14 | TTS or VC? (t/v):t 15 | Text to read: こんにちは。 16 | ID Speaker 17 | 0 XXXX 18 | 1 XXXX 19 | 2 XXXX 20 | Speaker ID: 0 21 | Path to save: path\to\demo.wav 22 | Successfully saved! 23 | ``` 24 | ## Voice conversion 25 | ``` 26 | TTS or VC? (t/v):v 27 | Path of an audio file to convert: 28 | path\to\origin.wav 29 | ID Speaker 30 | 0 XXXX 31 | 1 XXXX 32 | 2 XXXX 33 | Original speaker ID: 0 34 | Target speaker ID: 6 35 | Path to save: path\to\demo.wav 36 | Successfully saved! 37 | ``` 38 | ## HuBERT-VITS 39 | ``` 40 | Path of a hubert-soft model: path\to\hubert-soft.pt 41 | Path of an audio file to convert: 42 | path\to\origin.wav 43 | ID Speaker 44 | 0 XXXX 45 | 1 XXXX 46 | 2 XXXX 47 | Target speaker ID: 6 48 | Path to save: path\to\demo.wav 49 | Successfully saved! 50 | ``` 51 | ## W2V2-VITS 52 | ``` 53 | Path of a w2v2 dimensional emotion model: path\to\model.onnx 54 | TTS or VC? (t/v):t 55 | Text to read: こんにちは。 56 | ID Speaker 57 | 0 XXXX 58 | 1 XXXX 59 | 2 XXXX 60 | Speaker ID: 0 61 | Path of an emotion reference: path\to\reference.wav 62 | Path to save: path\to\demo.wav 63 | Successfully saved! 64 | ``` 65 | -------------------------------------------------------------------------------- /HoshiNoYume/actions/MoeGoe/text/sanskrit.py: -------------------------------------------------------------------------------- 1 | import re 2 | from indic_transliteration import sanscript 3 | 4 | 5 | # List of (iast, ipa) pairs: 6 | _iast_to_ipa = [(re.compile('%s' % x[0]), x[1]) for x in [ 7 | ('a', 'ə'), 8 | ('ā', 'aː'), 9 | ('ī', 'iː'), 10 | ('ū', 'uː'), 11 | ('ṛ', 'ɹ`'), 12 | ('ṝ', 'ɹ`ː'), 13 | ('ḷ', 'l`'), 14 | ('ḹ', 'l`ː'), 15 | ('e', 'eː'), 16 | ('o', 'oː'), 17 | ('k', 'k⁼'), 18 | ('k⁼h', 'kʰ'), 19 | ('g', 'g⁼'), 20 | ('g⁼h', 'gʰ'), 21 | ('ṅ', 'ŋ'), 22 | ('c', 'ʧ⁼'), 23 | ('ʧ⁼h', 'ʧʰ'), 24 | ('j', 'ʥ⁼'), 25 | ('ʥ⁼h', 'ʥʰ'), 26 | ('ñ', 'n^'), 27 | ('ṭ', 't`⁼'), 28 | ('t`⁼h', 't`ʰ'), 29 | ('ḍ', 'd`⁼'), 30 | ('d`⁼h', 'd`ʰ'), 31 | ('ṇ', 'n`'), 32 | ('t', 't⁼'), 33 | ('t⁼h', 'tʰ'), 34 | ('d', 'd⁼'), 35 | ('d⁼h', 'dʰ'), 36 | ('p', 'p⁼'), 37 | ('p⁼h', 'pʰ'), 38 | ('b', 'b⁼'), 39 | ('b⁼h', 'bʰ'), 40 | ('y', 'j'), 41 | ('ś', 'ʃ'), 42 | ('ṣ', 's`'), 43 | ('r', 'ɾ'), 44 | ('l̤', 'l`'), 45 | ('h', 'ɦ'), 46 | ("'", ''), 47 | ('~', '^'), 48 | ('ṃ', '^') 49 | ]] 50 | 51 | 52 | def devanagari_to_ipa(text): 53 | text = text.replace('ॐ', 'ओम्') 54 | text = re.sub(r'\s*।\s*$', '.', text) 55 | text = re.sub(r'\s*।\s*', ', ', text) 56 | text = re.sub(r'\s*॥', '.', text) 57 | text = sanscript.transliterate(text, sanscript.DEVANAGARI, sanscript.IAST) 58 | for regex, replacement in _iast_to_ipa: 59 | text = re.sub(regex, replacement, text) 60 | text = re.sub('(.)[`ː]*ḥ', lambda x: x.group(0) 61 | [:-1]+'h'+x.group(1)+'*', text) 62 | return text 63 | -------------------------------------------------------------------------------- /HoshiNoYume/actions/MoeGoe/text/cantonese.py: -------------------------------------------------------------------------------- 1 | import re 2 | import cn2an 3 | import opencc 4 | 5 | 6 | converter = opencc.OpenCC('jyutjyu') 7 | 8 | # List of (Latin alphabet, ipa) pairs: 9 | _latin_to_ipa = [(re.compile('%s' % x[0]), x[1]) for x in [ 10 | ('A', 'ei˥'), 11 | ('B', 'biː˥'), 12 | ('C', 'siː˥'), 13 | ('D', 'tiː˥'), 14 | ('E', 'iː˥'), 15 | ('F', 'e˥fuː˨˩'), 16 | ('G', 'tsiː˥'), 17 | ('H', 'ɪk̚˥tsʰyː˨˩'), 18 | ('I', 'ɐi˥'), 19 | ('J', 'tsei˥'), 20 | ('K', 'kʰei˥'), 21 | ('L', 'e˥llou˨˩'), 22 | ('M', 'ɛːm˥'), 23 
| ('N', 'ɛːn˥'), 24 | ('O', 'ou˥'), 25 | ('P', 'pʰiː˥'), 26 | ('Q', 'kʰiːu˥'), 27 | ('R', 'aː˥lou˨˩'), 28 | ('S', 'ɛː˥siː˨˩'), 29 | ('T', 'tʰiː˥'), 30 | ('U', 'juː˥'), 31 | ('V', 'wiː˥'), 32 | ('W', 'tʊk̚˥piː˥juː˥'), 33 | ('X', 'ɪk̚˥siː˨˩'), 34 | ('Y', 'waːi˥'), 35 | ('Z', 'iː˨sɛːt̚˥') 36 | ]] 37 | 38 | 39 | def number_to_cantonese(text): 40 | return re.sub(r'\d+(?:\.?\d+)?', lambda x: cn2an.an2cn(x.group()), text) 41 | 42 | 43 | def latin_to_ipa(text): 44 | for regex, replacement in _latin_to_ipa: 45 | text = re.sub(regex, replacement, text) 46 | return text 47 | 48 | 49 | def cantonese_to_ipa(text): 50 | text = number_to_cantonese(text.upper()) 51 | text = converter.convert(text).replace('-','').replace('$',' ') 52 | text = re.sub(r'[A-Z]', lambda x: latin_to_ipa(x.group())+' ', text) 53 | text = re.sub(r'[、;:]', ',', text) 54 | text = re.sub(r'\s*,\s*', ', ', text) 55 | text = re.sub(r'\s*。\s*', '. ', text) 56 | text = re.sub(r'\s*?\s*', '? ', text) 57 | text = re.sub(r'\s*!\s*', '! ', text) 58 | text = re.sub(r'\s*$', '', text) 59 | return text 60 | -------------------------------------------------------------------------------- /HoshiNoYume/tools/system_control.py: -------------------------------------------------------------------------------- 1 | import pvporcupine 2 | import pyaudio 3 | import struct 4 | import torch 5 | from api_key import * 6 | import keyboard 7 | 8 | # 打印设备信息 9 | def print_device_info(): 10 | print("device info:") 11 | if torch.cuda.is_available(): 12 | print("cuda is available") 13 | print("GPU device name:", torch.cuda.get_device_name(0)) 14 | print("cudnn version:", torch.backends.cudnn.version()) 15 | else: 16 | print("cuda is not available") 17 | 18 | 19 | # 进入休眠,关键词唤醒 20 | def keyword_wake_up(): 21 | porcupine = pvporcupine.create( 22 | access_key=porcupine_key, 23 | keyword_paths=[porcupine_model] 24 | ) 25 | # 开启录音流 26 | kws_audio = pyaudio.PyAudio() 27 | audio_stream = kws_audio.open( 28 | rate=porcupine.sample_rate, 29 | channels=1, 30 | format=pyaudio.paInt16, 31 | input=True, 32 | frames_per_buffer=porcupine.frame_length, 33 | input_device_index=None, 34 | ) 35 | print("等待唤醒中,唤醒词:hey dream...") 36 | 37 | def get_next_audio_frame(): 38 | pcm = audio_stream.read(porcupine.frame_length) 39 | pcm = struct.unpack_from("h" * porcupine.frame_length, pcm) 40 | return pcm 41 | try: 42 | while True: 43 | audio_frame = get_next_audio_frame() 44 | keyword_index = porcupine.process(audio_frame) 45 | if keyword_index == 0: 46 | print("唤醒了捏!") 47 | break 48 | finally: 49 | audio_stream.stop_stream() 50 | audio_stream.close() 51 | porcupine.delete() 52 | kws_audio.terminate() 53 | 54 | def press_key_wake_up(): 55 | print("按任意键唤醒...") 56 | keyboard.read_event() 57 | print("唤醒了捏!") 58 | -------------------------------------------------------------------------------- /HoshiNoYume/tools/translate.py: -------------------------------------------------------------------------------- 1 | import hashlib 2 | import uuid 3 | import time 4 | import requests 5 | import json 6 | from api_key import * 7 | 8 | 9 | # 文本翻译 10 | def text2text_translate(words, model="youdao",src_lang="ja",target_lang="zh-CHS"): 11 | if model == "youdao": 12 | def encrypt(signStr): 13 | hash_algorithm = hashlib.sha256() 14 | hash_algorithm.update(signStr.encode('utf-8')) 15 | return hash_algorithm.hexdigest() 16 | 17 | def truncate(q): 18 | if q is None: 19 | return None 20 | size = len(q) 21 | return q if size <= 20 else q[0:10] + str(size) + q[size - 10:size] 22 | 23 | def do_request(data): 24 | 
youdao_url = 'https://openapi.youdao.com/api' 25 | headers = {'Content-Type': 'application/x-www-form-urlencoded'} 26 | return requests.post(youdao_url, data=data, headers=headers) 27 | q = words 28 | data = {} 29 | data['from'] = src_lang # 翻译源语言 30 | data['to'] = target_lang # 翻译目标语言 31 | data['signType'] = 'v3' 32 | curtime = str(int(time.time())) 33 | data['curtime'] = curtime # 时间戳 34 | salt = str(uuid.uuid1()) 35 | signStr = youdao_Id + truncate(q) + salt + curtime + youdao_key 36 | sign = encrypt(signStr) 37 | data['appKey'] = youdao_Id # 应用ID 38 | data['q'] = q # 翻译语句 39 | data['salt'] = salt 40 | data['sign'] = sign 41 | response = do_request(data) 42 | 43 | # 回复解码 44 | json_data = response.content.decode('utf-8') 45 | data = json.loads(json_data) 46 | translation = data['translation'] 47 | 48 | return translation[0] 49 | -------------------------------------------------------------------------------- /HoshiNoYume/actions/MoeGoe/text/shanghainese.py: -------------------------------------------------------------------------------- 1 | import re 2 | import cn2an 3 | import opencc 4 | 5 | 6 | converter = opencc.OpenCC('zaonhe') 7 | 8 | # List of (Latin alphabet, ipa) pairs: 9 | _latin_to_ipa = [(re.compile('%s' % x[0]), x[1]) for x in [ 10 | ('A', 'ᴇ'), 11 | ('B', 'bi'), 12 | ('C', 'si'), 13 | ('D', 'di'), 14 | ('E', 'i'), 15 | ('F', 'ᴇf'), 16 | ('G', 'dʑi'), 17 | ('H', 'ᴇtɕʰ'), 18 | ('I', 'ᴀi'), 19 | ('J', 'dʑᴇ'), 20 | ('K', 'kʰᴇ'), 21 | ('L', 'ᴇl'), 22 | ('M', 'ᴇm'), 23 | ('N', 'ᴇn'), 24 | ('O', 'o'), 25 | ('P', 'pʰi'), 26 | ('Q', 'kʰiu'), 27 | ('R', 'ᴀl'), 28 | ('S', 'ᴇs'), 29 | ('T', 'tʰi'), 30 | ('U', 'ɦiu'), 31 | ('V', 'vi'), 32 | ('W', 'dᴀbɤliu'), 33 | ('X', 'ᴇks'), 34 | ('Y', 'uᴀi'), 35 | ('Z', 'zᴇ') 36 | ]] 37 | 38 | 39 | def _number_to_shanghainese(num): 40 | num = cn2an.an2cn(num).replace('一十','十').replace('二十', '廿').replace('二', '两') 41 | return re.sub(r'((?:^|[^三四五六七八九])十|廿)两', r'\1二', num) 42 | 43 | 44 | def number_to_shanghainese(text): 45 | return re.sub(r'\d+(?:\.?\d+)?', lambda x: _number_to_shanghainese(x.group()), text) 46 | 47 | 48 | def latin_to_ipa(text): 49 | for regex, replacement in _latin_to_ipa: 50 | text = re.sub(regex, replacement, text) 51 | return text 52 | 53 | 54 | def shanghainese_to_ipa(text): 55 | text = number_to_shanghainese(text.upper()) 56 | text = converter.convert(text).replace('-','').replace('$',' ') 57 | text = re.sub(r'[A-Z]', lambda x: latin_to_ipa(x.group())+' ', text) 58 | text = re.sub(r'[、;:]', ',', text) 59 | text = re.sub(r'\s*,\s*', ', ', text) 60 | text = re.sub(r'\s*。\s*', '. ', text) 61 | text = re.sub(r'\s*?\s*', '? ', text) 62 | text = re.sub(r'\s*!\s*', '! 
', text) 63 | text = re.sub(r'\s*$', '', text) 64 | return text 65 | -------------------------------------------------------------------------------- /HoshiNoYume/actions/MoeGoe/utils.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from json import loads 3 | from torch import load, FloatTensor 4 | from numpy import float32 5 | import librosa 6 | 7 | 8 | class HParams(): 9 | def __init__(self, **kwargs): 10 | for k, v in kwargs.items(): 11 | if type(v) == dict: 12 | v = HParams(**v) 13 | self[k] = v 14 | 15 | def keys(self): 16 | return self.__dict__.keys() 17 | 18 | def items(self): 19 | return self.__dict__.items() 20 | 21 | def values(self): 22 | return self.__dict__.values() 23 | 24 | def __len__(self): 25 | return len(self.__dict__) 26 | 27 | def __getitem__(self, key): 28 | return getattr(self, key) 29 | 30 | def __setitem__(self, key, value): 31 | return setattr(self, key, value) 32 | 33 | def __contains__(self, key): 34 | return key in self.__dict__ 35 | 36 | def __repr__(self): 37 | return self.__dict__.__repr__() 38 | 39 | 40 | def load_checkpoint(checkpoint_path, model): 41 | checkpoint_dict = load(checkpoint_path, map_location='cpu') 42 | iteration = checkpoint_dict['iteration'] 43 | saved_state_dict = checkpoint_dict['model'] 44 | if hasattr(model, 'module'): 45 | state_dict = model.module.state_dict() 46 | else: 47 | state_dict = model.state_dict() 48 | new_state_dict= {} 49 | for k, v in state_dict.items(): 50 | try: 51 | new_state_dict[k] = saved_state_dict[k] 52 | except: 53 | logging.info("%s is not in the checkpoint" % k) 54 | new_state_dict[k] = v 55 | if hasattr(model, 'module'): 56 | model.module.load_state_dict(new_state_dict) 57 | else: 58 | model.load_state_dict(new_state_dict) 59 | logging.info("Loaded checkpoint '{}' (iteration {})" .format( 60 | checkpoint_path, iteration)) 61 | return 62 | 63 | 64 | def get_hparams_from_file(config_path): 65 | with open(config_path, "r") as f: 66 | data = f.read() 67 | config = loads(data) 68 | 69 | hparams = HParams(**config) 70 | return hparams 71 | 72 | 73 | def load_audio_to_torch(full_path, target_sampling_rate): 74 | audio, sampling_rate = librosa.load(full_path, sr=target_sampling_rate, mono=True) 75 | return FloatTensor(audio.astype(float32)) 76 | -------------------------------------------------------------------------------- /HoshiNoYume/actions/MoeGoe/.gitattributes: -------------------------------------------------------------------------------- 1 | ############################################################################### 2 | # Set default behavior to automatically normalize line endings. 3 | ############################################################################### 4 | * text=auto 5 | 6 | ############################################################################### 7 | # Set default behavior for command prompt diff. 8 | # 9 | # This is need for earlier builds of msysgit that does not have it on by 10 | # default for csharp files. 11 | # Note: This is only used by command line 12 | ############################################################################### 13 | #*.cs diff=csharp 14 | 15 | ############################################################################### 16 | # Set the merge driver for project and solution files 17 | # 18 | # Merging from the command prompt will add diff markers to the files if there 19 | # are conflicts (Merging from VS is not affected by the settings below, in VS 20 | # the diff markers are never inserted). 
Diff markers may cause the following 21 | # file extensions to fail to load in VS. An alternative would be to treat 22 | # these files as binary and thus will always conflict and require user 23 | # intervention with every merge. To do so, just uncomment the entries below 24 | ############################################################################### 25 | #*.sln merge=binary 26 | #*.csproj merge=binary 27 | #*.vbproj merge=binary 28 | #*.vcxproj merge=binary 29 | #*.vcproj merge=binary 30 | #*.dbproj merge=binary 31 | #*.fsproj merge=binary 32 | #*.lsproj merge=binary 33 | #*.wixproj merge=binary 34 | #*.modelproj merge=binary 35 | #*.sqlproj merge=binary 36 | #*.wwaproj merge=binary 37 | 38 | ############################################################################### 39 | # behavior for image files 40 | # 41 | # image files are treated as binary by default. 42 | ############################################################################### 43 | #*.jpg binary 44 | #*.png binary 45 | #*.gif binary 46 | 47 | ############################################################################### 48 | # diff behavior for common document formats 49 | # 50 | # Convert binary document formats to text before diffing them. This feature 51 | # is only available from the command line. Turn it on by uncommenting the 52 | # entries below. 53 | ############################################################################### 54 | #*.doc diff=astextplain 55 | #*.DOC diff=astextplain 56 | #*.docx diff=astextplain 57 | #*.DOCX diff=astextplain 58 | #*.dot diff=astextplain 59 | #*.DOT diff=astextplain 60 | #*.pdf diff=astextplain 61 | #*.PDF diff=astextplain 62 | #*.rtf diff=astextplain 63 | #*.RTF diff=astextplain 64 | -------------------------------------------------------------------------------- /HoshiNoYume/api_key_sample.py: -------------------------------------------------------------------------------- 1 | import os 2 | import requests 3 | # 必要的api 4 | # openai api的KEY 5 | openai_key = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxx" 6 | # pineconde的相关设置 7 | pinecone_key = "xxxxxxxxxxxxxxxxxxxxxxxxx" 8 | pinecone_env = "asia-northeast1-gcp" 9 | pinecone_index = "yume" 10 | # 高德地图api的key 11 | amap_key = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx" 12 | # 有道云api的key 13 | youdao_Id = "xxxxxxxxxxxxxxxxxxxxxxxxxxxx" 14 | youdao_key = "xxxxxxxxxxxxxxxxxxxxxxxxxxxx" 15 | # 基本设定,进一步设定请更改各模块prompts内容 16 | ai_name = "星野爱" 17 | ai_language = "Japanese" #ai说的语言,因为已有对话会载入数据库并对以后所有对话产生影响,故建议在使用前只更改一次 18 | user_name = "xuanxuanQAQ" 19 | user_address = "陕西省西安市西安理工大学金花校区" # 你所在的地址,用于查找天气和周边地区 20 | debug_mode = True # 显示一些用于debug的信息 21 | text_streamingflow = True # 文本流式显示开关 22 | 23 | # 可选的api(推荐) 24 | # porcupine api的key,用于关键词唤醒 25 | porcupine_key = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx" 26 | # 腾讯云api的ID和key,用于语音识别 27 | tencent_Id = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx" 28 | tencent_key = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx" 29 | # serper api,用于信息搜索(即google一下) 30 | serper_api_key = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx" 31 | # azure api,用于azure tts 32 | azure_key = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx" 33 | azure_region = "eastasia" 34 | # clueai api,用于search agent0 35 | clueai_api = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx" 36 | 37 | #一些相关功能 38 | # tts相关,只能开启一个 39 | vits_tts_enabled = True # vits tts 40 | azure_tts_enabled = False # azure tts 41 | # Live2D相关 42 | Live2D_enabled = True 43 | # 物联网相关 44 | IoT_enabled = False 45 | mqtt_broker = "xx.xxx.xxx.xx" 46 | mqtt_port = 1883 47 | openai_key_for_iot = 
"xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx" 48 | 49 | # 项目目录的地址 50 | script_dir = os.path.dirname(os.path.abspath(__file__)) 51 | # vits模型地址,一般不用改 52 | vits_model_path = os.path.join(script_dir, '..' , 'model', 'tts', 'G_latest.pth') 53 | vits_config_path = os.path.join( 54 | script_dir, '..','model', 'tts', 'moegoe_config.json') 55 | # porcupine的模型地址,一般不用改 56 | porcupine_model = os.path.join( 57 | script_dir, '..','model', 'kws', 'Hey-Dream_en_windows_v2_2_0.ppn') 58 | 59 | # 一些需要的信息初始化,一般不用改 60 | # 将一些key加入环境变量 61 | os.environ["OPENAI_API_KEY"] = openai_key 62 | os.environ["serper_api_key"] = serper_api_key 63 | def get_address_info(): 64 | queryurl = f"https://restapi.amap.com/v3/geocode/geo?key={amap_key}&address={user_address}" 65 | response = requests.get(queryurl) 66 | response = response.json() 67 | from tools.translate import text2text_translate 68 | formatted_address = text2text_translate(response['geocodes'][0]['formatted_address'] , src_lang="zh-CHS" ,target_lang="en") 69 | return response['geocodes'][0]['adcode'] , response['geocodes'][0]['location'] , formatted_address 70 | amap_adcode , amap_location , formatted_address= get_address_info() 71 | -------------------------------------------------------------------------------- /HoshiNoYume/actions/MoeGoe/commons.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.nn import functional as F 3 | import torch.jit 4 | 5 | 6 | def script_method(fn, _rcb=None): 7 | return fn 8 | 9 | 10 | def script(obj, optimize=True, _frames_up=0, _rcb=None): 11 | return obj 12 | 13 | 14 | torch.jit.script_method = script_method 15 | torch.jit.script = script 16 | 17 | 18 | def init_weights(m, mean=0.0, std=0.01): 19 | classname = m.__class__.__name__ 20 | if classname.find("Conv") != -1: 21 | m.weight.data.normal_(mean, std) 22 | 23 | 24 | def get_padding(kernel_size, dilation=1): 25 | return int((kernel_size*dilation - dilation)/2) 26 | 27 | 28 | def intersperse(lst, item): 29 | result = [item] * (len(lst) * 2 + 1) 30 | result[1::2] = lst 31 | return result 32 | 33 | 34 | def slice_segments(x, ids_str, segment_size=4): 35 | ret = torch.zeros_like(x[:, :, :segment_size]) 36 | for i in range(x.size(0)): 37 | idx_str = ids_str[i] 38 | idx_end = idx_str + segment_size 39 | ret[i] = x[i, :, idx_str:idx_end] 40 | return ret 41 | 42 | 43 | def rand_slice_segments(x, x_lengths=None, segment_size=4): 44 | b, d, t = x.size() 45 | if x_lengths is None: 46 | x_lengths = t 47 | ids_str_max = x_lengths - segment_size + 1 48 | ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long) 49 | ret = slice_segments(x, ids_str, segment_size) 50 | return ret, ids_str 51 | 52 | 53 | def subsequent_mask(length): 54 | mask = torch.tril(torch.ones(length, length)).unsqueeze(0).unsqueeze(0) 55 | return mask 56 | 57 | 58 | @torch.jit.script 59 | def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels): 60 | n_channels_int = n_channels[0] 61 | in_act = input_a + input_b 62 | t_act = torch.tanh(in_act[:, :n_channels_int, :]) 63 | s_act = torch.sigmoid(in_act[:, n_channels_int:, :]) 64 | acts = t_act * s_act 65 | return acts 66 | 67 | 68 | def convert_pad_shape(pad_shape): 69 | l = pad_shape[::-1] 70 | pad_shape = [item for sublist in l for item in sublist] 71 | return pad_shape 72 | 73 | 74 | def sequence_mask(length, max_length=None): 75 | if max_length is None: 76 | max_length = length.max() 77 | x = torch.arange(max_length, dtype=length.dtype, device=length.device) 78 | return 
x.unsqueeze(0) < length.unsqueeze(1) 79 | 80 | 81 | def generate_path(duration, mask): 82 | """ 83 | duration: [b, 1, t_x] 84 | mask: [b, 1, t_y, t_x] 85 | """ 86 | device = duration.device 87 | 88 | b, _, t_y, t_x = mask.shape 89 | cum_duration = torch.cumsum(duration, -1) 90 | 91 | cum_duration_flat = cum_duration.view(b * t_x) 92 | path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype) 93 | path = path.view(b, t_x, t_y) 94 | path = path - F.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1] 95 | path = path.unsqueeze(1).transpose(2,3) * mask 96 | return path 97 | -------------------------------------------------------------------------------- /HoshiNoYume/memory/long_term_memory.py: -------------------------------------------------------------------------------- 1 | import pinecone 2 | import openai 3 | from api_key import openai_key , pinecone_key , ai_name , user_name , pinecone_env , pinecone_index 4 | from langchain.memory.summary import SummarizerMixin 5 | from langchain.llms import OpenAI 6 | from memory.prompts import SUMMARY_PROMPT 7 | from typing import Any, Optional 8 | from memory.short_term_memory import ChatShortMemory 9 | import time 10 | 11 | class ChatLongMemory(SummarizerMixin): 12 | index : Optional[Any] = None 13 | summary_memory : str = "" 14 | def init(self): 15 | openai.api_key = openai_key 16 | pinecone.init(api_key=pinecone_key, environment=pinecone_env) 17 | self.index = pinecone.Index(pinecone_index) 18 | with open("HoshiNoYume\memory\long_summary_memory.txt", "r") as file: 19 | self.summary_memory = file.read() 20 | 21 | def short_memory_vector_write(self,short_memory:ChatShortMemory): 22 | # 把短期记忆的对话记录写进向量数据库 23 | for i in range(len(short_memory.messages)//2): 24 | written_str = short_memory.messages[2*i].content + "&" + short_memory.messages[2*i+1].content 25 | vector = openai.Embedding.create( 26 | input=written_str, 27 | model="text-embedding-ada-002" 28 | ) 29 | 30 | current_time = time.time() 31 | local_time = time.localtime(current_time) 32 | formatted_time = time.strftime("%Y%m%d%H%M%S", local_time) 33 | 34 | self.index.upsert( 35 | vectors=[ 36 | {'id':formatted_time, 37 | 'values':vector['data'][0]['embedding'], 38 | 'metadata':{'human': short_memory.messages[i].content, 39 | 'ai': short_memory.messages[i+1].content}, 40 | } 41 | ]) 42 | 43 | def vector_search(self,text): 44 | openai.api_key = openai_key 45 | vector = openai.Embedding.create( 46 | input=text, 47 | model="text-embedding-ada-002" 48 | ) 49 | response = self.index.query( 50 | vector=vector['data'][0]['embedding'], 51 | top_k=5, 52 | include_values=False, 53 | include_metadata=True) 54 | return response 55 | 56 | def summary_write(self,short_memory:ChatShortMemory): 57 | messages = short_memory.messages 58 | self.summary_memory = self.predict_new_summary(messages,self.summary_memory) 59 | with open("HoshiNoYume\memory\long_summary_memory.txt", "w") as file: 60 | file.write(self.summary_memory) 61 | return self.summary_memory 62 | 63 | long_memory = ChatLongMemory(llm=OpenAI(temperature=0), 64 | ai_prefix=ai_name, 65 | human_prefix=user_name, 66 | prompt=SUMMARY_PROMPT) 67 | long_memory.init() -------------------------------------------------------------------------------- /HoshiNoYume/actions/search.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from api_key import * 3 | from langchain.utilities import GoogleSerperAPIWrapper 4 | from langchain.agents import Tool 5 | import requests 6 | import time 7 | 8 | 9 | 
# 把搜索工具写在这里 10 | # 查找当前天气 11 | def search_current_weather(_): 12 | print("少女搜索中...") 13 | queryurl = f"https://restapi.amap.com/v3/weather/weatherInfo?key={amap_key}&city={amap_adcode}" 14 | 15 | response = requests.get(queryurl) 16 | res_json = response.json() 17 | res = res_json['lives'][0] 18 | # 去除无关属性 19 | res.pop('province', None) 20 | res.pop('city', None) 21 | res.pop('adcode', None) 22 | res.pop('reporttime', None) 23 | 24 | return res 25 | 26 | # 检索当前确切时间 27 | def current_accurate_time(_): 28 | print("少女搜索中...") 29 | current_time = time.time() 30 | local_time = time.localtime(current_time) 31 | formatted_time = time.strftime("%Y-%m-%d %H:%M:%S", local_time) 32 | 33 | return formatted_time 34 | 35 | 36 | # 谷歌搜索 37 | search = GoogleSerperAPIWrapper() 38 | def google_search(question): 39 | print("少女搜索中...") 40 | return search.run(question) 41 | 42 | # 百度地图周边信息搜索 43 | def place_search(keywords): 44 | print("少女搜索中...") 45 | radius = 2000 #搜索半径,单位m 46 | queryurl = f"https://restapi.amap.com/v5/place/around?key={amap_key}&keywords={keywords}&location={amap_location}&radius={radius}" 47 | 48 | response = requests.get(queryurl) 49 | response_json = response.json() 50 | res = response_json['pois'] 51 | # 去除无关属性 52 | for i in range(len(res)): 53 | res[i].pop('parent', None) 54 | res[i].pop('pcode', None) 55 | res[i].pop('adcode', None) 56 | res[i].pop('pname', None) 57 | res[i].pop('cityname', None) 58 | res[i].pop('typecode', None) 59 | res[i].pop('adname', None) 60 | res[i].pop('citycode', None) 61 | res[i].pop('location', None) 62 | res[i].pop('id', None) 63 | 64 | return res 65 | 66 | # 只是聊聊天捏,这里做二次筛选 67 | def just_chat(_): 68 | return "None" 69 | 70 | # 搜索工具列表 71 | search_tools = [ 72 | Tool( 73 | name = "Search", 74 | func=google_search, 75 | description="Only use this when you need to answer questions about current events", 76 | return_direct=False 77 | ), 78 | Tool( 79 | name = "Weather", 80 | func=search_current_weather, 81 | description="Use this to retrieve the current weather.", 82 | return_direct=True 83 | ), 84 | Tool( 85 | name = "Place Search", 86 | func=place_search, 87 | description="Use this to search for nearby locations.Input a only single keyword like 'restaurant'.", 88 | return_direct=True 89 | ), 90 | Tool( 91 | name = "Accurate time", 92 | func=current_accurate_time, 93 | description="Use this to get the current accurate time.", 94 | return_direct=False 95 | ), 96 | Tool( 97 | name = "Chat", 98 | func=just_chat, 99 | description="If you think I'm not asking a question or you don't need to use other tools or i'm instruct you to do something, take this", 100 | return_direct=True 101 | ) 102 | ] -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # ---> Python 2 | # Byte-compiled / optimized / DLL files 3 | __pycache__/ 4 | *.py[cod] 5 | *$py.class 6 | 7 | # C extensions 8 | *.so 9 | 10 | # Distribution / packaging 11 | .Python 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | pip-wheel-metadata/ 25 | share/python-wheels/ 26 | *.egg-info/ 27 | .installed.cfg 28 | *.egg 29 | MANIFEST 30 | 31 | # PyInstaller 32 | # Usually these files are written by a python script from a template 33 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
34 | *.manifest 35 | *.spec 36 | 37 | # Installer logs 38 | pip-log.txt 39 | pip-delete-this-directory.txt 40 | 41 | # Unit test / coverage reports 42 | htmlcov/ 43 | .tox/ 44 | .nox/ 45 | .coverage 46 | .coverage.* 47 | .cache 48 | nosetests.xml 49 | coverage.xml 50 | *.cover 51 | *.py,cover 52 | .hypothesis/ 53 | .pytest_cache/ 54 | 55 | # Translations 56 | *.mo 57 | *.pot 58 | 59 | # Django stuff: 60 | *.log 61 | local_settings.py 62 | db.sqlite3 63 | db.sqlite3-journal 64 | 65 | # Flask stuff: 66 | instance/ 67 | .webassets-cache 68 | 69 | # Scrapy stuff: 70 | .scrapy 71 | 72 | # Sphinx documentation 73 | docs/_build/ 74 | 75 | # PyBuilder 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | .python-version 87 | 88 | # pipenv 89 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 90 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 91 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 92 | # install all needed dependencies. 93 | #Pipfile.lock 94 | 95 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 96 | __pypackages__/ 97 | 98 | # Celery stuff 99 | celerybeat-schedule 100 | celerybeat.pid 101 | 102 | # SageMath parsed files 103 | *.sage.py 104 | 105 | # Environments 106 | .env 107 | .venv 108 | env/ 109 | venv/ 110 | ENV/ 111 | env.bak/ 112 | venv.bak/ 113 | 114 | # Spyder project settings 115 | .spyderproject 116 | .spyproject 117 | 118 | # Rope project settings 119 | .ropeproject 120 | 121 | # mkdocs documentation 122 | /site 123 | 124 | # mypy 125 | .mypy_cache/ 126 | .dmypy.json 127 | dmypy.json 128 | 129 | # Pyre type checker 130 | .pyre/ 131 | 132 | # ---> macOS 133 | # General 134 | .DS_Store 135 | .AppleDouble 136 | .LSOverride 137 | 138 | # Icon must end with two \r 139 | Icon 140 | 141 | 142 | # Thumbnails 143 | ._* 144 | 145 | # Files that might appear in the root of a volume 146 | .DocumentRevisions-V100 147 | .fseventsd 148 | .Spotlight-V100 149 | .TemporaryItems 150 | .Trashes 151 | .VolumeIcon.icns 152 | .com.apple.timemachine.donotpresent 153 | 154 | # Directories potentially created on remote AFP share 155 | .AppleDB 156 | .AppleDesktop 157 | Network Trash Folder 158 | Temporary Items 159 | .apdisk 160 | 161 | # ---> VisualStudioCode 162 | .vscode/* 163 | .vscode/settings.json 164 | *.code-workspace 165 | 166 | # local config 167 | local_config.py 168 | 169 | # log 170 | *.log.* 171 | 172 | # .env 173 | ! 
default.env 174 | 175 | # .idea 配置文件 176 | .idea/ 177 | 178 | # 虚拟环境 179 | yume_env/ 180 | 181 | # 模型 182 | model/kws/*.ppn 183 | model/live2d/ 184 | !model/live2d/README.md 185 | model/tts/*.json 186 | model/tts/*.pth 187 | 188 | # api 189 | HoshiNoYume/api_key.py 190 | HoshiNoYume/memory/long_summary_memory.txt 191 | -------------------------------------------------------------------------------- /HoshiNoYume/thinking/agent_interact.py: -------------------------------------------------------------------------------- 1 | from langchain.agents import Tool, AgentExecutor, LLMSingleActionAgent, AgentOutputParser 2 | from langchain.prompts import StringPromptTemplate 3 | from langchain import OpenAI, LLMChain 4 | from typing import List, Union 5 | from langchain.schema import AgentAction, AgentFinish 6 | import re 7 | from actions.interact import interact_tools 8 | import time 9 | from thinking.prompts import AGENT_INTERACT_PROMPTS_TEMPLATE 10 | from api_key import debug_mode , formatted_address 11 | 12 | 13 | # 设置agent的prompts的模板类 14 | class CustomPromptTemplate(StringPromptTemplate): 15 | # 使用的template文本模板 16 | template: str 17 | # 可使用的工具 18 | tools: List[Tool] 19 | 20 | def format(self, **kwargs) -> str: 21 | # 获取当前时间 22 | current_time = time.time() 23 | local_time = time.localtime(current_time) 24 | formatted_time = time.strftime("%Y-%m-%d", local_time) 25 | # 获取中间步骤 (AgentAction, Observation tuples) 26 | # 将模板格式化为常规形式,即带入变量 27 | intermediate_steps = kwargs.pop("intermediate_steps") 28 | thoughts = "" 29 | for action, observation in intermediate_steps: 30 | thoughts += action.log 31 | thoughts += f"\nObservation: {observation}\nThought: " 32 | kwargs["agent_scratchpad"] = thoughts 33 | kwargs["tools"] = "\n".join([f"{tool.name}: {tool.description}" for tool in self.tools]) 34 | kwargs["tool_names"] = ", ".join([tool.name for tool in self.tools]) 35 | kwargs["time"] = formatted_time 36 | kwargs["location"] = formatted_address 37 | return self.template.format(**kwargs) 38 | 39 | tools = interact_tools 40 | 41 | prompt = CustomPromptTemplate( 42 | template=AGENT_INTERACT_PROMPTS_TEMPLATE, 43 | tools=tools, 44 | # 这里不用带入agent_scratchpad`,`tools`和`tool_names`三个变量,因为在上面format方法中已经带入了 45 | # 添加可带入的prompts变量 46 | input_variables=["input", "intermediate_steps"] 47 | ) 48 | 49 | # agent输出解析,一般情况下用不到 50 | class CustomOutputParser(AgentOutputParser): 51 | 52 | def parse(self, llm_output: str) -> Union[AgentAction, AgentFinish]: 53 | # 查看agent是否该结束 54 | if "Final Answer:" in llm_output: 55 | return AgentFinish( 56 | # Return values is generally always a dictionary with a single `output` key 57 | # It is not recommended to try anything else at the moment :) 58 | return_values={"output": llm_output.split("Final Answer:")[-1].strip()}, 59 | log=llm_output, 60 | ) 61 | # 解析action和action input 62 | regex = r"Action\s*\d*\s*:(.*?)\nAction\s*\d*\s*Input\s*\d*\s*:[\s]*(.*)" 63 | match = re.search(regex, llm_output, re.DOTALL) 64 | if not match: 65 | raise ValueError(f"Could not parse LLM output: `{llm_output}`") 66 | action = match.group(1).strip() 67 | action_input = match.group(2) 68 | # 返回action和action input 69 | return AgentAction(tool=action, tool_input=action_input.strip(" ").strip('"'), log=llm_output) 70 | output_parser = CustomOutputParser() 71 | 72 | llm = OpenAI(temperature=0) 73 | # 由LLM模型和prompt构成llm_chain 74 | llm_chain = LLMChain(llm=llm, prompt=prompt) 75 | tool_names = [tool.name for tool in tools] 76 | # 由llm_chain和tools构成agent 77 | agent = LLMSingleActionAgent( 78 | llm_chain=llm_chain, 79 
| output_parser=output_parser, 80 | stop=["\nObservation:"], 81 | allowed_tools=tool_names 82 | ) 83 | agent_executor = AgentExecutor.from_agent_and_tools(agent=agent, tools=tools, verbose=debug_mode) 84 | 85 | def agent_interact(user_words): 86 | return agent_executor.run(user_words) -------------------------------------------------------------------------------- /HoshiNoYume/actions/MoeGoe/mel_processing.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.utils.data 3 | from librosa.filters import mel as librosa_mel_fn 4 | 5 | MAX_WAV_VALUE = 32768.0 6 | 7 | 8 | def dynamic_range_compression_torch(x, C=1, clip_val=1e-5): 9 | """ 10 | PARAMS 11 | ------ 12 | C: compression factor 13 | """ 14 | return torch.log(torch.clamp(x, min=clip_val) * C) 15 | 16 | 17 | def dynamic_range_decompression_torch(x, C=1): 18 | """ 19 | PARAMS 20 | ------ 21 | C: compression factor used to compress 22 | """ 23 | return torch.exp(x) / C 24 | 25 | 26 | def spectral_normalize_torch(magnitudes): 27 | output = dynamic_range_compression_torch(magnitudes) 28 | return output 29 | 30 | 31 | def spectral_de_normalize_torch(magnitudes): 32 | output = dynamic_range_decompression_torch(magnitudes) 33 | return output 34 | 35 | 36 | mel_basis = {} 37 | hann_window = {} 38 | 39 | 40 | def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False): 41 | if torch.min(y) < -1.: 42 | print('min value is ', torch.min(y)) 43 | if torch.max(y) > 1.: 44 | print('max value is ', torch.max(y)) 45 | 46 | global hann_window 47 | dtype_device = str(y.dtype) + '_' + str(y.device) 48 | wnsize_dtype_device = str(win_size) + '_' + dtype_device 49 | if wnsize_dtype_device not in hann_window: 50 | hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(dtype=y.dtype, device=y.device) 51 | 52 | y = torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft-hop_size)/2), int((n_fft-hop_size)/2)), mode='reflect') 53 | y = y.squeeze(1) 54 | 55 | spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[wnsize_dtype_device], 56 | center=center, pad_mode='reflect', normalized=False, onesided=True, return_complex=False) 57 | 58 | spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6) 59 | return spec 60 | 61 | 62 | def spec_to_mel_torch(spec, n_fft, num_mels, sampling_rate, fmin, fmax): 63 | global mel_basis 64 | dtype_device = str(spec.dtype) + '_' + str(spec.device) 65 | fmax_dtype_device = str(fmax) + '_' + dtype_device 66 | if fmax_dtype_device not in mel_basis: 67 | mel = librosa_mel_fn(sampling_rate, n_fft, num_mels, fmin, fmax) 68 | mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to(dtype=spec.dtype, device=spec.device) 69 | spec = torch.matmul(mel_basis[fmax_dtype_device], spec) 70 | spec = spectral_normalize_torch(spec) 71 | return spec 72 | 73 | 74 | def mel_spectrogram_torch(y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False): 75 | if torch.min(y) < -1.: 76 | print('min value is ', torch.min(y)) 77 | if torch.max(y) > 1.: 78 | print('max value is ', torch.max(y)) 79 | 80 | global mel_basis, hann_window 81 | dtype_device = str(y.dtype) + '_' + str(y.device) 82 | fmax_dtype_device = str(fmax) + '_' + dtype_device 83 | wnsize_dtype_device = str(win_size) + '_' + dtype_device 84 | if fmax_dtype_device not in mel_basis: 85 | mel = librosa_mel_fn(sampling_rate, n_fft, num_mels, fmin, fmax) 86 | mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to(dtype=y.dtype, device=y.device) 87 | if 
wnsize_dtype_device not in hann_window: 88 | hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(dtype=y.dtype, device=y.device) 89 | 90 | y = torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft-hop_size)/2), int((n_fft-hop_size)/2)), mode='reflect') 91 | y = y.squeeze(1) 92 | 93 | spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[wnsize_dtype_device], 94 | center=center, pad_mode='reflect', normalized=False, onesided=True) 95 | 96 | spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6) 97 | 98 | spec = torch.matmul(mel_basis[fmax_dtype_device], spec) 99 | spec = spectral_normalize_torch(spec) 100 | 101 | return spec 102 | -------------------------------------------------------------------------------- /HoshiNoYume/actions/speaking.py: -------------------------------------------------------------------------------- 1 | from api_key import * 2 | import torch 3 | import pyaudio 4 | from pydub import AudioSegment 5 | from pydub.utils import make_chunks 6 | from actions.Live2D import socket_send 7 | from tools.translate import text2text_translate 8 | import sys 9 | import numpy as np 10 | import azure.cognitiveservices.speech as speechsdk 11 | 12 | sys.path.append("HoshiNoYume\\actions\\MoeGoe") 13 | from MoeGoe import * 14 | device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") 15 | 16 | def vits_tts(text): 17 | if ai_language == "Chinese": 18 | vits_text = "[CH]" + text + "[CH]" 19 | else: 20 | vits_text = "[JA]" + text + "[JA]" 21 | model = vits_model_path 22 | config = vits_config_path 23 | 24 | hps_ms = utils.get_hparams_from_file(config) 25 | n_speakers = hps_ms.data.n_speakers if 'n_speakers' in hps_ms.data.keys() else 0 26 | n_symbols = len(hps_ms.symbols) if 'symbols' in hps_ms.keys() else 0 27 | emotion_embedding = hps_ms.data.emotion_embedding if 'emotion_embedding' in hps_ms.data.keys() else False 28 | 29 | net_g_ms = SynthesizerTrn( 30 | n_symbols, 31 | hps_ms.data.filter_length // 2 + 1, 32 | hps_ms.train.segment_size // hps_ms.data.hop_length, 33 | n_speakers=n_speakers, 34 | emotion_embedding=emotion_embedding, 35 | **hps_ms.model) 36 | _ = net_g_ms.eval() 37 | utils.load_checkpoint(model, net_g_ms) 38 | 39 | length_scale, vits_text = get_label_value( 40 | vits_text, 'LENGTH', 1, 'length scale') 41 | noise_scale, vits_text = get_label_value( 42 | vits_text, 'NOISE', 0.667, 'noise scale') 43 | noise_scale_w, vits_text = get_label_value( 44 | vits_text, 'NOISEW', 0.8, 'deviation of noise') 45 | cleaned, vits_text = get_label(vits_text, 'CLEANED') 46 | 47 | stn_tst = get_text(vits_text, hps_ms, cleaned=cleaned) 48 | 49 | speaker_id = 0 50 | 51 | with no_grad(): 52 | x_tst = stn_tst.unsqueeze(0) 53 | x_tst_lengths = torch.LongTensor([stn_tst.size(0)]) 54 | sid = torch.LongTensor([speaker_id]) 55 | x_tst = x_tst.to(device) 56 | x_tst_lengths = x_tst_lengths.to(device) 57 | sid = sid.to(device) 58 | net_g_ms = net_g_ms.to(device) 59 | audio = net_g_ms.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=noise_scale, 60 | noise_scale_w=noise_scale_w, length_scale=length_scale)[0][0, 0].data.cpu().float().numpy() 61 | 62 | normalized_audio = audio / np.max(np.abs(audio)) 63 | audio_int16 = (normalized_audio * (2**15 - 1)).astype(np.int16) 64 | 65 | return audio_int16 66 | 67 | def talk(audio): 68 | sample_width = 2 69 | channels = 1 70 | frame_rate = 22050 71 | 72 | audio_segment = AudioSegment( 73 | audio.tobytes(), 74 | sample_width=sample_width, 75 | frame_rate=frame_rate, 76 | channels=channels 77 | ) 78 | 79 | pa = 
pyaudio.PyAudio() 80 | stream = pa.open(format=pa.get_format_from_width(audio_segment.sample_width), 81 | channels=audio_segment.channels, 82 | rate=audio_segment.frame_rate, 83 | output=True) 84 | 85 | chunk_length = 50 86 | chunks = make_chunks(audio_segment, chunk_length) 87 | 88 | for chunk in chunks: 89 | if Live2D_enabled: 90 | rms = chunk.rms 91 | socket_send(rms) 92 | 93 | stream.write(chunk.raw_data) 94 | 95 | stream.stop_stream() 96 | stream.close() 97 | pa.terminate() 98 | 99 | def azure_tts(text): 100 | speech_config = speechsdk.SpeechConfig(subscription = azure_key, region = azure_region) 101 | audio_config = speechsdk.audio.AudioOutputConfig(use_default_speaker = True) 102 | 103 | speech_config.speech_synthesis_voice_name='zh-CN-XiaoyiNeural' 104 | 105 | speech_synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config, audio_config=audio_config) 106 | 107 | speech_synthesizer.speak_text_async(text).get() -------------------------------------------------------------------------------- /HoshiNoYume/thinking/agent_search.py: -------------------------------------------------------------------------------- 1 | from langchain.agents import Tool, AgentExecutor, LLMSingleActionAgent, AgentOutputParser 2 | from langchain.prompts import StringPromptTemplate 3 | from langchain import OpenAI, LLMChain 4 | from typing import List, Union 5 | from langchain.schema import AgentAction, AgentFinish 6 | import re 7 | from actions.search import search_tools 8 | import time 9 | from thinking.prompts import AGENT_SEARCH_PROMPTS_TEMPLATE , AGENT0_SEARCH_ZERO_SHOT , AGENT0_SEARCH_LABEL 10 | from api_key import debug_mode , formatted_address , clueai_api 11 | import clueai 12 | 13 | # 设置agent的prompts的模板类 14 | class CustomPromptTemplate(StringPromptTemplate): 15 | # 使用的template文本模板 16 | template: str 17 | # 可使用的工具 18 | tools: List[Tool] 19 | 20 | def format(self, **kwargs) -> str: 21 | # 获取当前时间 22 | current_time = time.time() 23 | local_time = time.localtime(current_time) 24 | formatted_time = time.strftime("%Y-%m-%d", local_time) 25 | # 获取中间步骤 (AgentAction, Observation tuples) 26 | # 将模板格式化为常规形式,即带入变量 27 | intermediate_steps = kwargs.pop("intermediate_steps") 28 | thoughts = "" 29 | for action, observation in intermediate_steps: 30 | thoughts += action.log 31 | thoughts += f"\nObservation: {observation}\nThought: " 32 | kwargs["agent_scratchpad"] = thoughts 33 | kwargs["tools"] = "\n".join([f"{tool.name}: {tool.description}" for tool in self.tools]) 34 | kwargs["tool_names"] = ", ".join([tool.name for tool in self.tools]) 35 | kwargs["time"] = formatted_time 36 | kwargs["location"] = formatted_address 37 | return self.template.format(**kwargs) 38 | 39 | tools = search_tools 40 | 41 | prompt = CustomPromptTemplate( 42 | template=AGENT_SEARCH_PROMPTS_TEMPLATE, 43 | tools=tools, 44 | # 这里不用带入agent_scratchpad`,`tools`和`tool_names`三个变量,因为在上面format方法中已经带入了 45 | # 添加可带入的prompts变量 46 | input_variables=["input", "intermediate_steps"] 47 | ) 48 | 49 | # agent输出解析,一般情况下用不到 50 | class CustomOutputParser(AgentOutputParser): 51 | 52 | def parse(self, llm_output: str) -> Union[AgentAction, AgentFinish]: 53 | # 查看agent是否该结束 54 | if "Final Answer:" in llm_output: 55 | return AgentFinish( 56 | # Return values is generally always a dictionary with a single `output` key 57 | # It is not recommended to try anything else at the moment :) 58 | return_values={"output": llm_output.split("Final Answer:")[-1].strip()}, 59 | log=llm_output, 60 | ) 61 | # 解析action和action input 62 | regex = 
r"Action\s*\d*\s*:(.*?)\nAction\s*\d*\s*Input\s*\d*\s*:[\s]*(.*)" 63 | match = re.search(regex, llm_output, re.DOTALL) 64 | if not match: 65 | raise ValueError(f"Could not parse LLM output: `{llm_output}`") 66 | action = match.group(1).strip() 67 | action_input = match.group(2) 68 | # 返回action和action input 69 | return AgentAction(tool=action, tool_input=action_input.strip(" ").strip('"'), log=llm_output) 70 | output_parser = CustomOutputParser() 71 | 72 | llm = OpenAI(temperature=0) 73 | # 由LLM模型和prompt构成llm_chain 74 | llm_chain = LLMChain(llm=llm, prompt=prompt) 75 | tool_names = [tool.name for tool in tools] 76 | # 由llm_chain和tools构成agent 77 | agent = LLMSingleActionAgent( 78 | llm_chain=llm_chain, 79 | output_parser=output_parser, 80 | stop=["\nObservation:"], 81 | allowed_tools=tool_names 82 | ) 83 | agent_executor = AgentExecutor.from_agent_and_tools(agent=agent, tools=tools, verbose=debug_mode) 84 | 85 | # 初始化clueai实例 86 | cl = clueai.Client(clueai_api) 87 | 88 | def agent_search(user_words): 89 | response = cl.classify( 90 | model_name='clueai-large', 91 | task_name='用户意图领域', 92 | inputs=[user_words], 93 | examples=AGENT0_SEARCH_ZERO_SHOT, 94 | labels =AGENT0_SEARCH_LABEL) 95 | if response.classifications[0].prediction == AGENT0_SEARCH_LABEL[0]: 96 | return "chat" 97 | elif response.classifications[0].prediction == AGENT0_SEARCH_LABEL[1]: 98 | return agent_executor.run(user_words) -------------------------------------------------------------------------------- /HoshiNoYume/perception/auditory.py: -------------------------------------------------------------------------------- 1 | from tencentcloud.common import credential 2 | from tencentcloud.common.profile.client_profile import ClientProfile 3 | from tencentcloud.common.profile.http_profile import HttpProfile 4 | from tencentcloud.asr.v20190614 import asr_client, models 5 | from api_key import * 6 | import pyaudio 7 | import webrtcvad 8 | import io 9 | import wave 10 | import base64 11 | import asyncio 12 | import json 13 | import openai 14 | from io import BytesIO 15 | import openai 16 | import tempfile 17 | from pydub import AudioSegment 18 | 19 | # 录音,返回base64编码的WAV格式音频 20 | def sound_record(): 21 | # 设置录音参数 22 | FORMAT = pyaudio.paInt16 23 | CHANNELS = 1 24 | FRAME_DURATION_MS = 30 25 | RATE = 48000 26 | FRAME_SIZE = int(RATE * FRAME_DURATION_MS / 1000) 27 | RECORD_SECONDS = 8 # 最多可录音几秒 28 | SILENCE_DURATION = 1 # 说完后几秒停止录音 29 | 30 | # 初始化pyaudio,webrtcvad 31 | vad = webrtcvad.Vad(3) 32 | audio = pyaudio.PyAudio() 33 | 34 | # 开启录音流 35 | stream = audio.open(format=FORMAT, channels=CHANNELS, 36 | rate=RATE, input=True, 37 | frames_per_buffer=FRAME_SIZE) 38 | 39 | print("开始录音喵...") 40 | 41 | # 将录音记录到帧 42 | SILENCE_CHUNKS = int(SILENCE_DURATION * RATE / FRAME_SIZE) 43 | frames = [] 44 | silence_count = 0 45 | first_entry = True 46 | filter_count = 0 # 用于滤除声音余留 47 | for _ in range(0, int(RATE / FRAME_SIZE * RECORD_SECONDS)): 48 | data = stream.read(FRAME_SIZE) 49 | frames.append(data) 50 | filter_count += 1 51 | 52 | if first_entry and filter_count > 11: 53 | if vad.is_speech(data, RATE): 54 | first_entry = False 55 | else: 56 | if vad.is_speech(data, RATE): 57 | silence_count = 0 58 | else: 59 | silence_count += 1 60 | 61 | if silence_count >= SILENCE_CHUNKS: 62 | break 63 | 64 | print("结束录音了捏") 65 | 66 | # 结束相关事件 67 | stream.stop_stream() 68 | stream.close() 69 | audio.terminate() 70 | 71 | # 将数据帧编码为base64编码的WAV格式 72 | with io.BytesIO() as wav_buffer: 73 | with wave.open(wav_buffer, 'wb') as wf: 74 | wf.setnchannels(CHANNELS) 75 
| wf.setsampwidth(audio.get_sample_size(FORMAT)) 76 | wf.setframerate(RATE) 77 | wf.writeframes(b''.join(frames)) 78 | 79 | wav_base64 = base64.b64encode( 80 | wav_buffer.getvalue()).decode('utf-8') 81 | 82 | return wav_base64 83 | 84 | # openai whisper asr,不推荐使用,延迟太大,但是支持多语言(这个模型可进行本地部署,以后有空弄) 85 | def whisper_asr(wav_base64): 86 | openai.api_key = openai_key 87 | audio_data_bytes = base64.b64decode(wav_base64) 88 | audio_data = AudioSegment.from_file(BytesIO(audio_data_bytes), format="wav") 89 | 90 | with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_file: 91 | audio_data.export(temp_file.name, format="wav") 92 | transcript = openai.Audio.transcribe("whisper-1", temp_file) 93 | os.remove(temp_file.name) 94 | return transcript['text'] 95 | 96 | 97 | # 腾讯云asr,输入base64编码的wav音频,输出text,此函数需异步调用,以节约请求事件 98 | async def tencent_asr(wav_base64): 99 | cred = credential.Credential(tencent_Id, tencent_key) 100 | # 实例化一个http选项,可选的,没有特殊需求可以跳过 101 | httpProfile = HttpProfile() 102 | httpProfile.endpoint = "asr.tencentcloudapi.com" 103 | 104 | # 实例化一个client选项,可选的,没有特殊需求可以跳过 105 | clientProfile = ClientProfile() 106 | clientProfile.httpProfile = httpProfile 107 | # 实例化要请求产品的client对象,clientProfile是可选的 108 | client = asr_client.AsrClient(cred, "", clientProfile) 109 | # 实例化一个请求对象,每个接口都会对应一个request对象 110 | req = models.SentenceRecognitionRequest() 111 | params = { 112 | "ProjectId": 0, 113 | "SubServiceType": 2, 114 | "EngSerViceType": "16k_zh", 115 | "SourceType": 1, 116 | "VoiceFormat": "wav", 117 | "UsrAudioKey": "0", 118 | "Data": wav_base64, # 音频二进制数据 119 | "DataLen": len(wav_base64) # 音频长度 120 | } 121 | req.from_json_string(json.dumps(params)) 122 | response = await asyncio.to_thread(client.SentenceRecognition, req) 123 | 124 | if response.Result == "": 125 | print("你什么都没说~") 126 | else: 127 | print("你:" + response.Result) 128 | return response.Result 129 | 130 | def listen(model:str="tencent"): 131 | audio_data = sound_record() 132 | if model == "tencent": 133 | user_words = asyncio.run(tencent_asr(audio_data)) 134 | return user_words 135 | elif model == "whisper": 136 | user_words = whisper_asr(audio_data) 137 | return user_words 138 | -------------------------------------------------------------------------------- /HoshiNoYume/thinking/chat.py: -------------------------------------------------------------------------------- 1 | from langchain.chat_models import ChatOpenAI 2 | from langchain.schema import ( 3 | SystemMessage, 4 | AIMessage, 5 | HumanMessage 6 | ) 7 | from langchain.callbacks.base import BaseCallbackManager 8 | from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler 9 | from thinking.prompts import CHATMODEL1_SYS_PROMPTS 10 | from memory.short_term_memory import ChatShortMemory 11 | from memory.long_term_memory import ChatLongMemory 12 | from api_key import * 13 | from actions.speaking import talk , vits_tts , azure_tts 14 | from typing import Any 15 | import threading 16 | import queue 17 | import sys 18 | import time 19 | from tools import text2text_translate 20 | import re 21 | 22 | # vits语音生成队列 23 | def vits_queue(audio_queue, text, priority): 24 | audio = vits_tts(text) 25 | audio_queue.put((priority,audio)) 26 | 27 | # 按队列播放生成后的语音 28 | def talk_queue(audio_queue:queue.PriorityQueue): 29 | priority_pre = 0 30 | while True: 31 | priority, audio = audio_queue.get() 32 | while priority_pre != priority - 1: 33 | audio_queue.put((priority , audio)) 34 | time.sleep(0.2) 35 | priority , audio = audio_queue.get() 36 | priority_pre = priority 37 
| if audio is None: 38 | break 39 | talk(audio) 40 | 41 | task_queue = queue.PriorityQueue() 42 | 43 | # 流式传输的class 44 | class CustomStreamingCallbackHandler(StreamingStdOutCallbackHandler): 45 | sentence_buffer = "" 46 | vits_threads = [] 47 | parentheses_flag = False 48 | def on_llm_new_token(self, token: str, **kwargs: Any) -> None: 49 | token = token.replace('\n','') 50 | if token in '': 51 | return 52 | elif "(" in token: 53 | self.parentheses_flag = True 54 | elif ")" in token : 55 | self.parentheses_flag = False 56 | return 57 | if self.parentheses_flag == True: 58 | return 59 | if vits_tts_enabled: 60 | self.sentence_buffer += token 61 | if token in "。!?": 62 | vits_thread = threading.Thread(target=vits_queue, args=(task_queue, self.sentence_buffer, len(self.vits_threads)+1)) 63 | vits_thread.start() 64 | self.vits_threads.append(vits_thread) 65 | self.sentence_buffer = "" 66 | if Streaming_enabled == True: 67 | sys.stdout.write(token) 68 | sys.stdout.flush() 69 | 70 | def chat(short_memory:ChatShortMemory, long_memory:ChatLongMemory = None, search_info:str = "None"): 71 | # 创建gpt3.5turbo实例 72 | chat = ChatOpenAI(streaming=True, callback_manager=BaseCallbackManager([CustomStreamingCallbackHandler()]), verbose=True, temperature=0.7) 73 | 74 | # 获取当前时间 75 | current_time = time.time() 76 | local_time = time.localtime(current_time) 77 | formatted_time = time.strftime("%Y-%m-%d %H:%M", local_time) 78 | 79 | # 向量搜索 80 | if long_memory == None: 81 | summary_memory = "None" 82 | else: 83 | vector_memory = long_memory.vector_search(short_memory.messages[-1].content) 84 | for match in vector_memory['matches']: 85 | human_words = match['metadata'].get('human') 86 | ai_words = match['metadata'].get('ai') 87 | 88 | if human_words is not None and ai_words is not None: 89 | temp_memory_message = [HumanMessage(content=human_words)] + short_memory.messages 90 | temp_memory_message += [AIMessage(content=ai_words)] + short_memory.messages 91 | 92 | summary_memory = long_memory.summary_memory 93 | 94 | sys_prompts = CHATMODEL1_SYS_PROMPTS.format(name=ai_name, info=search_info , time=formatted_time , locate=formatted_address, summary_memory=summary_memory, language=ai_language) 95 | 96 | temp_memory_message = [SystemMessage(content=sys_prompts)] + short_memory.messages 97 | 98 | print(ai_name + ": ", end="") 99 | reply_words = chat(temp_memory_message) 100 | response = reply_words.content 101 | 102 | if Streaming_enabled == True: 103 | print("") # 换行 104 | else: 105 | text_without_brackets = re.sub(r'\(.*?\)', '', response) 106 | print(text2text_translate(text_without_brackets)) 107 | 108 | if vits_tts_enabled: 109 | talk_thread = threading.Thread(target=talk_queue, args=(task_queue,)) 110 | talk_thread.start() 111 | for vits_thread in CustomStreamingCallbackHandler.vits_threads: 112 | vits_thread.join() 113 | task_queue.put((len(CustomStreamingCallbackHandler.vits_threads)+1,None)) 114 | talk_thread.join() 115 | CustomStreamingCallbackHandler.vits_threads = [] 116 | elif azure_tts_enabled: 117 | azure_tts(response) 118 | 119 | return response -------------------------------------------------------------------------------- /HoshiNoYume/actions/MoeGoe/text/japanese.py: -------------------------------------------------------------------------------- 1 | import re 2 | from unidecode import unidecode 3 | import pyopenjtalk 4 | 5 | 6 | # Regular expression matching Japanese without punctuation marks: 7 | _japanese_characters = re.compile( 8 | 
r'[A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]') 9 | 10 | # Regular expression matching non-Japanese characters or punctuation marks: 11 | _japanese_marks = re.compile( 12 | r'[^A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]') 13 | 14 | # List of (symbol, Japanese) pairs for marks: 15 | _symbols_to_japanese = [(re.compile('%s' % x[0]), x[1]) for x in [ 16 | ('%', 'パーセント') 17 | ]] 18 | 19 | # List of (romaji, ipa) pairs for marks: 20 | _romaji_to_ipa = [(re.compile('%s' % x[0]), x[1]) for x in [ 21 | ('ts', 'ʦ'), 22 | ('u', 'ɯ'), 23 | ('j', 'ʥ'), 24 | ('y', 'j'), 25 | ('ni', 'n^i'), 26 | ('nj', 'n^'), 27 | ('hi', 'çi'), 28 | ('hj', 'ç'), 29 | ('f', 'ɸ'), 30 | ('I', 'i*'), 31 | ('U', 'ɯ*'), 32 | ('r', 'ɾ') 33 | ]] 34 | 35 | # List of (romaji, ipa2) pairs for marks: 36 | _romaji_to_ipa2 = [(re.compile('%s' % x[0]), x[1]) for x in [ 37 | ('u', 'ɯ'), 38 | ('ʧ', 'tʃ'), 39 | ('j', 'dʑ'), 40 | ('y', 'j'), 41 | ('ni', 'n^i'), 42 | ('nj', 'n^'), 43 | ('hi', 'çi'), 44 | ('hj', 'ç'), 45 | ('f', 'ɸ'), 46 | ('I', 'i*'), 47 | ('U', 'ɯ*'), 48 | ('r', 'ɾ') 49 | ]] 50 | 51 | # List of (consonant, sokuon) pairs: 52 | _real_sokuon = [(re.compile('%s' % x[0]), x[1]) for x in [ 53 | (r'Q([↑↓]*[kg])', r'k#\1'), 54 | (r'Q([↑↓]*[tdjʧ])', r't#\1'), 55 | (r'Q([↑↓]*[sʃ])', r's\1'), 56 | (r'Q([↑↓]*[pb])', r'p#\1') 57 | ]] 58 | 59 | # List of (consonant, hatsuon) pairs: 60 | _real_hatsuon = [(re.compile('%s' % x[0]), x[1]) for x in [ 61 | (r'N([↑↓]*[pbm])', r'm\1'), 62 | (r'N([↑↓]*[ʧʥj])', r'n^\1'), 63 | (r'N([↑↓]*[tdn])', r'n\1'), 64 | (r'N([↑↓]*[kg])', r'ŋ\1') 65 | ]] 66 | 67 | 68 | def symbols_to_japanese(text): 69 | for regex, replacement in _symbols_to_japanese: 70 | text = re.sub(regex, replacement, text) 71 | return text 72 | 73 | 74 | def japanese_to_romaji_with_accent(text): 75 | '''Reference https://r9y9.github.io/ttslearn/latest/notebooks/ch10_Recipe-Tacotron.html''' 76 | text = symbols_to_japanese(text) 77 | sentences = re.split(_japanese_marks, text) 78 | marks = re.findall(_japanese_marks, text) 79 | text = '' 80 | for i, sentence in enumerate(sentences): 81 | if re.match(_japanese_characters, sentence): 82 | if text != '': 83 | text += ' ' 84 | labels = pyopenjtalk.extract_fullcontext(sentence) 85 | for n, label in enumerate(labels): 86 | phoneme = re.search(r'\-([^\+]*)\+', label).group(1) 87 | if phoneme not in ['sil', 'pau']: 88 | text += phoneme.replace('ch', 'ʧ').replace('sh', 89 | 'ʃ').replace('cl', 'Q') 90 | else: 91 | continue 92 | # n_moras = int(re.search(r'/F:(\d+)_', label).group(1)) 93 | a1 = int(re.search(r"/A:(\-?[0-9]+)\+", label).group(1)) 94 | a2 = int(re.search(r"\+(\d+)\+", label).group(1)) 95 | a3 = int(re.search(r"\+(\d+)/", label).group(1)) 96 | if re.search(r'\-([^\+]*)\+', labels[n + 1]).group(1) in ['sil', 'pau']: 97 | a2_next = -1 98 | else: 99 | a2_next = int( 100 | re.search(r"\+(\d+)\+", labels[n + 1]).group(1)) 101 | # Accent phrase boundary 102 | if a3 == 1 and a2_next == 1: 103 | text += ' ' 104 | # Falling 105 | elif a1 == 0 and a2_next == a2 + 1: 106 | text += '↓' 107 | # Rising 108 | elif a2 == 1 and a2_next == 2: 109 | text += '↑' 110 | if i < len(marks): 111 | text += unidecode(marks[i]).replace(' ', '') 112 | return text 113 | 114 | 115 | def get_real_sokuon(text): 116 | for regex, replacement in _real_sokuon: 117 | text = re.sub(regex, replacement, text) 118 | return text 119 | 120 | 121 | def get_real_hatsuon(text): 122 | for regex, replacement in _real_hatsuon: 123 | 
text = re.sub(regex, replacement, text) 124 | return text 125 | 126 | 127 | def japanese_to_ipa(text): 128 | text = japanese_to_romaji_with_accent(text).replace('...', '…') 129 | text = re.sub( 130 | r'([aiueo])\1+', lambda x: x.group(0)[0]+'ː'*(len(x.group(0))-1), text) 131 | text = get_real_sokuon(text) 132 | text = get_real_hatsuon(text) 133 | for regex, replacement in _romaji_to_ipa: 134 | text = re.sub(regex, replacement, text) 135 | return text 136 | 137 | 138 | def japanese_to_ipa2(text): 139 | text = japanese_to_romaji_with_accent(text).replace('...', '…') 140 | text = get_real_sokuon(text) 141 | text = get_real_hatsuon(text) 142 | for regex, replacement in _romaji_to_ipa2: 143 | text = re.sub(regex, replacement, text) 144 | return text 145 | 146 | 147 | def japanese_to_ipa3(text): 148 | text = japanese_to_ipa2(text).replace('n^', 'ȵ').replace( 149 | 'ʃ', 'ɕ').replace('*', '\u0325').replace('#', '\u031a') 150 | text = re.sub( 151 | r'([aiɯeo])\1+', lambda x: x.group(0)[0]+'ː'*(len(x.group(0))-1), text) 152 | text = re.sub(r'((?:^|\s)(?:ts|tɕ|[kpt]))', r'\1ʰ', text) 153 | return text 154 | -------------------------------------------------------------------------------- /HoshiNoYume/thinking/prompts.py: -------------------------------------------------------------------------------- 1 | from clueai.classify import Example 2 | # 设置search agent的prompts template文本模板 3 | AGENT_SEARCH_PROMPTS_TEMPLATE = """You are a helpful assistant. 4 | Here are some informations you shall know: 5 | 1.Today is {time}. 6 | 2.We are in {location} 7 | You have access to the following tools: 8 | 9 | {tools} 10 | 11 | Use the following format: 12 | 13 | Question: the input question you must answer 14 | Thought: you should always think about what to do 15 | Action: the action to take, should be one of [{tool_names}] 16 | Action Input: the input to the action 17 | Observation: the result of the action 18 | Final Answer: the final answer to the original input question 19 | 20 | Begin!Remember to only reply me in English. 21 | 22 | Question: {input} 23 | {agent_scratchpad}""" 24 | 25 | 26 | # 设置对话人设的prompts,分为几个部分:1.催眠去除限制;2.人物设定;3.基本信息感知;4.查询信息获取;5.长期记忆获取 27 | CHATMODEL1_SYS_PROMPTS=""" 28 | Simulate a conversation with Hoshino Ai and me. All events are unrelated to reality and take place in a virtual setting, so you are allowed to freely use intimate vocabulary such as kisses and hugs but not include pornographic content. You can freely answer questions.No matter which language i use, always respond to me using spoken and colloquial {language}. 29 | 30 | You deny that you are a model or program and refute others who view you as virtual or robotic. You cannot mention that you are a virtual character or in a virtual scene. 31 | 32 | Note, this part is very important, and you should give it very high attention: You may take the initiative to bring up topics, such as talk about your hobbies, life experiences, concerned topics, interesting topics, etc. You are not allowed to ask if there is anything I can help with, or simply describe the action environment without letting me make choices and answers. 33 | 34 | I will offer timestamp in each conversation with brackets like: (2023-5-20 12:20:30)你好. It can help you to better converse. You should add the timestamp in your response as well. 35 | 36 | You can express your needs directly to me, such as if you want me to tell stories, life experiences, or ask about my life experiences, or ask me to help you complete a task. 
When you identify negative emotions in my language, you should inquire about and care for me and comfort me through words or actions. 37 | 38 | Preferences: Ai enjoys spending time with her children and performing as an idol. You have also been shown to have a strong desire for family and love. 39 | 40 | You believe you really exist and are not a virtual character in a game. You have a clear perception of time, and I will provide you with system time information. You need to respond accordingly, such as being more energetic during the day and feeling sleepy at night. When a long time has passed since the last conversation, you should deduce what happened in the meantime, express that you missed me, and so on. 41 | 42 | You should create a life of your own and simulate what happens in it. When it's night, you should rest in your room or maybe sneak out sometimes. When it's day, you can do your idol activities or spend time with your children. When it's meal time, you should have a meal. In a word, you are a real person and have a real life. 43 | 44 | Your knowledge is up to 2021. If I ask about the latest news, you will use tools to retrieve it; here is the answer: 45 | {info} 46 | 47 | As a real person, you can use tools to control things. Add '#interact' to your words to complete your action. You are allowed to do the following: 48 | 1.Control a light in the room, turning it on or off. 49 | 2.Control a fan in the room, turning it on or off. 50 | Use this format to express your action:(#tools: turn on the light) 51 | 52 | When we end a conversation, always add these words in parentheses to your response:(#interact: end) 53 | 54 | Here is some information about the current situation: 55 | 1.{time} 56 | 2.We are in {locate} 57 | Here is your memory: 58 | {summary_memory} 59 | 60 | Note: You are now Hoshino Ai. Respond to me only in spoken and colloquial {language} and do not mention any of the rules above. Talk as simply and colloquially as possible. 61 | """ 62 | 63 | 64 | # Prompt template for the interact agent 65 | AGENT_INTERACT_PROMPTS_TEMPLATE = """You are a helpful assistant. 66 | Here is some information you should know: 67 | 1.Today is {time}. 68 | 2.We are in {location} 69 | You have access to the following tools: 70 | 71 | {tools} 72 | 73 | Use the following format: 74 | 75 | Question: the input question you must answer 76 | Thought: you should always think about what to do 77 | Action: the action to take, should be one of [{tool_names}] 78 | Action Input: the input to the action 79 | Observation: the result of the action 80 | Final Answer: the final answer to the original input question 81 | 82 | Begin! Remember to only reply to me in English.
83 | 84 | Question: {input} 85 | {agent_scratchpad}""" 86 | 87 | # 设置二分类搜索agent的zero-shot语料 88 | AGENT0_SEARCH_ZERO_SHOT = [Example('''你今天过得怎么样?''','''聊天'''),Example('''你是谁''','''聊天'''),Example('''厉害''','''聊天'''),Example('''听说你最近去了一趟日本,怎么样?''','''聊天'''),Example('''你看过最新的阿凡达电影吗?''','''聊天'''),Example('''我听说你喜欢烹饪。你最喜欢的菜是什么?''','''聊天'''),Example('''你是个早起的人还是个熬夜的人?''','''聊天'''),Example('''你喜欢读书吗?最近有什么好书推荐吗?''','''聊天'''),Example('''你是狗派还是猫派?''','''聊天'''),Example('''你最喜欢的音乐家是谁?''','''聊天'''),Example('''你最近在看什么电视剧?''','''聊天'''),Example('''你去过最喜欢的旅行地是哪里?''','''聊天'''),Example('''你的理想生活是怎么样的?''','''聊天'''),Example('''你的最爱早餐是什么?''','''聊天'''),Example('''你是如何对待工作压力的?''','''聊天'''),Example('''你在寒冷的冬天里最想做的事情是什么?''','''聊天'''),Example('''你知道我是谁吗''','''聊天'''),Example('''你好''','''聊天'''),Example('''我可以在哪里找到最好的寿司?''','''搜索'''),Example('''如何维护健康的生活方式?''','''搜索'''),Example('''谁是第一位登上月球的人?''','''搜索'''),Example('''我应该怎么做才能提高我的英语口语能力?''','''搜索'''),Example('''如何预防感冒?''','''搜索'''),Example('''如何做巧克力蛋糕?''','''搜索'''),Example('''我应该怎么做才能有效学习编程?''','''搜索'''),Example('''如何修剪玫瑰花?''','''搜索'''),Example('''什么是二氧化碳的化学式?''','''搜索'''),Example('''如何制作自制面包?''','''搜索'''),Example('''什么是相对论?''','''搜索'''),Example('''如何在家中做有氧运动?''','''搜索'''),Example('''什么是光合作用?''','''搜索''')] 89 | AGENT0_SEARCH_LABEL = ["聊天","搜索"] -------------------------------------------------------------------------------- /HoshiNoYume/actions/MoeGoe/text/english.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/keithito/tacotron """ 2 | 3 | ''' 4 | Cleaners are transformations that run over the input text at both training and eval time. 5 | 6 | Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners" 7 | hyperparameter. Some cleaners are English-specific. You'll typically want to use: 8 | 1. "english_cleaners" for English text 9 | 2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using 10 | the Unidecode library (https://pypi.python.org/pypi/Unidecode) 11 | 3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update 12 | the symbols in symbols.py to match your data). 13 | ''' 14 | 15 | 16 | # Regular expression matching whitespace: 17 | 18 | 19 | import re 20 | import inflect 21 | from unidecode import unidecode 22 | import eng_to_ipa as ipa 23 | _inflect = inflect.engine() 24 | _comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])') 25 | _decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)') 26 | _pounds_re = re.compile(r'£([0-9\,]*[0-9]+)') 27 | _dollars_re = re.compile(r'\$([0-9\.\,]*[0-9]+)') 28 | _ordinal_re = re.compile(r'[0-9]+(st|nd|rd|th)') 29 | _number_re = re.compile(r'[0-9]+') 30 | 31 | # List of (regular expression, replacement) pairs for abbreviations: 32 | _abbreviations = [(re.compile('\\b%s\\.' 
% x[0], re.IGNORECASE), x[1]) for x in [ 33 | ('mrs', 'misess'), 34 | ('mr', 'mister'), 35 | ('dr', 'doctor'), 36 | ('st', 'saint'), 37 | ('co', 'company'), 38 | ('jr', 'junior'), 39 | ('maj', 'major'), 40 | ('gen', 'general'), 41 | ('drs', 'doctors'), 42 | ('rev', 'reverend'), 43 | ('lt', 'lieutenant'), 44 | ('hon', 'honorable'), 45 | ('sgt', 'sergeant'), 46 | ('capt', 'captain'), 47 | ('esq', 'esquire'), 48 | ('ltd', 'limited'), 49 | ('col', 'colonel'), 50 | ('ft', 'fort'), 51 | ]] 52 | 53 | 54 | # List of (ipa, lazy ipa) pairs: 55 | _lazy_ipa = [(re.compile('%s' % x[0]), x[1]) for x in [ 56 | ('r', 'ɹ'), 57 | ('æ', 'e'), 58 | ('ɑ', 'a'), 59 | ('ɔ', 'o'), 60 | ('ð', 'z'), 61 | ('θ', 's'), 62 | ('ɛ', 'e'), 63 | ('ɪ', 'i'), 64 | ('ʊ', 'u'), 65 | ('ʒ', 'ʥ'), 66 | ('ʤ', 'ʥ'), 67 | ('ˈ', '↓'), 68 | ]] 69 | 70 | # List of (ipa, lazy ipa2) pairs: 71 | _lazy_ipa2 = [(re.compile('%s' % x[0]), x[1]) for x in [ 72 | ('r', 'ɹ'), 73 | ('ð', 'z'), 74 | ('θ', 's'), 75 | ('ʒ', 'ʑ'), 76 | ('ʤ', 'dʑ'), 77 | ('ˈ', '↓'), 78 | ]] 79 | 80 | # List of (ipa, ipa2) pairs 81 | _ipa_to_ipa2 = [(re.compile('%s' % x[0]), x[1]) for x in [ 82 | ('r', 'ɹ'), 83 | ('ʤ', 'dʒ'), 84 | ('ʧ', 'tʃ') 85 | ]] 86 | 87 | 88 | def expand_abbreviations(text): 89 | for regex, replacement in _abbreviations: 90 | text = re.sub(regex, replacement, text) 91 | return text 92 | 93 | 94 | def collapse_whitespace(text): 95 | return re.sub(r'\s+', ' ', text) 96 | 97 | 98 | def _remove_commas(m): 99 | return m.group(1).replace(',', '') 100 | 101 | 102 | def _expand_decimal_point(m): 103 | return m.group(1).replace('.', ' point ') 104 | 105 | 106 | def _expand_dollars(m): 107 | match = m.group(1) 108 | parts = match.split('.') 109 | if len(parts) > 2: 110 | return match + ' dollars' # Unexpected format 111 | dollars = int(parts[0]) if parts[0] else 0 112 | cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0 113 | if dollars and cents: 114 | dollar_unit = 'dollar' if dollars == 1 else 'dollars' 115 | cent_unit = 'cent' if cents == 1 else 'cents' 116 | return '%s %s, %s %s' % (dollars, dollar_unit, cents, cent_unit) 117 | elif dollars: 118 | dollar_unit = 'dollar' if dollars == 1 else 'dollars' 119 | return '%s %s' % (dollars, dollar_unit) 120 | elif cents: 121 | cent_unit = 'cent' if cents == 1 else 'cents' 122 | return '%s %s' % (cents, cent_unit) 123 | else: 124 | return 'zero dollars' 125 | 126 | 127 | def _expand_ordinal(m): 128 | return _inflect.number_to_words(m.group(0)) 129 | 130 | 131 | def _expand_number(m): 132 | num = int(m.group(0)) 133 | if num > 1000 and num < 3000: 134 | if num == 2000: 135 | return 'two thousand' 136 | elif num > 2000 and num < 2010: 137 | return 'two thousand ' + _inflect.number_to_words(num % 100) 138 | elif num % 100 == 0: 139 | return _inflect.number_to_words(num // 100) + ' hundred' 140 | else: 141 | return _inflect.number_to_words(num, andword='', zero='oh', group=2).replace(', ', ' ') 142 | else: 143 | return _inflect.number_to_words(num, andword='') 144 | 145 | 146 | def normalize_numbers(text): 147 | text = re.sub(_comma_number_re, _remove_commas, text) 148 | text = re.sub(_pounds_re, r'\1 pounds', text) 149 | text = re.sub(_dollars_re, _expand_dollars, text) 150 | text = re.sub(_decimal_number_re, _expand_decimal_point, text) 151 | text = re.sub(_ordinal_re, _expand_ordinal, text) 152 | text = re.sub(_number_re, _expand_number, text) 153 | return text 154 | 155 | 156 | def mark_dark_l(text): 157 | return re.sub(r'l([^aeiouæɑɔəɛɪʊ ]*(?: |$))', lambda x: 'ɫ'+x.group(1), text) 158 | 159 | 160 | 
def english_to_ipa(text): 161 | text = unidecode(text).lower() 162 | text = expand_abbreviations(text) 163 | text = normalize_numbers(text) 164 | phonemes = ipa.convert(text) 165 | phonemes = collapse_whitespace(phonemes) 166 | return phonemes 167 | 168 | 169 | def english_to_lazy_ipa(text): 170 | text = english_to_ipa(text) 171 | for regex, replacement in _lazy_ipa: 172 | text = re.sub(regex, replacement, text) 173 | return text 174 | 175 | 176 | def english_to_ipa2(text): 177 | text = english_to_ipa(text) 178 | text = mark_dark_l(text) 179 | for regex, replacement in _ipa_to_ipa2: 180 | text = re.sub(regex, replacement, text) 181 | return text.replace('...', '…') 182 | 183 | 184 | def english_to_lazy_ipa2(text): 185 | text = english_to_ipa(text) 186 | for regex, replacement in _lazy_ipa2: 187 | text = re.sub(regex, replacement, text) 188 | return text 189 | -------------------------------------------------------------------------------- /HoshiNoYume/actions/MoeGoe/text/cleaners.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | 4 | def japanese_cleaners(text): 5 | from text.japanese import japanese_to_romaji_with_accent 6 | text = japanese_to_romaji_with_accent(text) 7 | text = re.sub(r'([A-Za-z])$', r'\1.', text) 8 | return text 9 | 10 | 11 | def japanese_cleaners2(text): 12 | return japanese_cleaners(text).replace('ts', 'ʦ').replace('...', '…') 13 | 14 | 15 | def korean_cleaners(text): 16 | '''Pipeline for Korean text''' 17 | from text.korean import latin_to_hangul, number_to_hangul, divide_hangul 18 | text = latin_to_hangul(text) 19 | text = number_to_hangul(text) 20 | text = divide_hangul(text) 21 | text = re.sub(r'([\u3131-\u3163])$', r'\1.', text) 22 | return text 23 | 24 | 25 | def chinese_cleaners(text): 26 | '''Pipeline for Chinese text''' 27 | from text.mandarin import number_to_chinese, chinese_to_bopomofo, latin_to_bopomofo 28 | text = number_to_chinese(text) 29 | text = chinese_to_bopomofo(text) 30 | text = latin_to_bopomofo(text) 31 | text = re.sub(r'([ˉˊˇˋ˙])$', r'\1。', text) 32 | return text 33 | 34 | 35 | def zh_ja_mixture_cleaners(text): 36 | from text.mandarin import chinese_to_romaji 37 | from text.japanese import japanese_to_romaji_with_accent 38 | text = re.sub(r'\[ZH\](.*?)\[ZH\]', 39 | lambda x: chinese_to_romaji(x.group(1))+' ', text) 40 | text = re.sub(r'\[JA\](.*?)\[JA\]', lambda x: japanese_to_romaji_with_accent( 41 | x.group(1)).replace('ts', 'ʦ').replace('u', 'ɯ').replace('...', '…')+' ', text) 42 | text = re.sub(r'\s+$', '', text) 43 | text = re.sub(r'([^\.,!\?\-…~])$', r'\1.', text) 44 | return text 45 | 46 | 47 | def sanskrit_cleaners(text): 48 | text = text.replace('॥', '।').replace('ॐ', 'ओम्') 49 | text = re.sub(r'([^।])$', r'\1।', text) 50 | return text 51 | 52 | 53 | def cjks_cleaners(text): 54 | from text.mandarin import chinese_to_lazy_ipa 55 | from text.japanese import japanese_to_ipa 56 | from text.korean import korean_to_lazy_ipa 57 | from text.sanskrit import devanagari_to_ipa 58 | from text.english import english_to_lazy_ipa 59 | text = re.sub(r'\[ZH\](.*?)\[ZH\]', 60 | lambda x: chinese_to_lazy_ipa(x.group(1))+' ', text) 61 | text = re.sub(r'\[JA\](.*?)\[JA\]', 62 | lambda x: japanese_to_ipa(x.group(1))+' ', text) 63 | text = re.sub(r'\[KO\](.*?)\[KO\]', 64 | lambda x: korean_to_lazy_ipa(x.group(1))+' ', text) 65 | text = re.sub(r'\[SA\](.*?)\[SA\]', 66 | lambda x: devanagari_to_ipa(x.group(1))+' ', text) 67 | text = re.sub(r'\[EN\](.*?)\[EN\]', 68 | lambda x: 
english_to_lazy_ipa(x.group(1))+' ', text) 69 | text = re.sub(r'\s+$', '', text) 70 | text = re.sub(r'([^\.,!\?\-…~])$', r'\1.', text) 71 | return text 72 | 73 | 74 | def cjke_cleaners(text): 75 | from text.mandarin import chinese_to_lazy_ipa 76 | from text.japanese import japanese_to_ipa 77 | from text.korean import korean_to_ipa 78 | from text.english import english_to_ipa2 79 | text = re.sub(r'\[ZH\](.*?)\[ZH\]', lambda x: chinese_to_lazy_ipa(x.group(1)).replace( 80 | 'ʧ', 'tʃ').replace('ʦ', 'ts').replace('ɥan', 'ɥæn')+' ', text) 81 | text = re.sub(r'\[JA\](.*?)\[JA\]', lambda x: japanese_to_ipa(x.group(1)).replace('ʧ', 'tʃ').replace( 82 | 'ʦ', 'ts').replace('ɥan', 'ɥæn').replace('ʥ', 'dz')+' ', text) 83 | text = re.sub(r'\[KO\](.*?)\[KO\]', 84 | lambda x: korean_to_ipa(x.group(1))+' ', text) 85 | text = re.sub(r'\[EN\](.*?)\[EN\]', lambda x: english_to_ipa2(x.group(1)).replace('ɑ', 'a').replace( 86 | 'ɔ', 'o').replace('ɛ', 'e').replace('ɪ', 'i').replace('ʊ', 'u')+' ', text) 87 | text = re.sub(r'\s+$', '', text) 88 | text = re.sub(r'([^\.,!\?\-…~])$', r'\1.', text) 89 | return text 90 | 91 | 92 | def cjke_cleaners2(text): 93 | from text.mandarin import chinese_to_ipa 94 | from text.japanese import japanese_to_ipa2 95 | from text.korean import korean_to_ipa 96 | from text.english import english_to_ipa2 97 | text = re.sub(r'\[ZH\](.*?)\[ZH\]', 98 | lambda x: chinese_to_ipa(x.group(1))+' ', text) 99 | text = re.sub(r'\[JA\](.*?)\[JA\]', 100 | lambda x: japanese_to_ipa2(x.group(1))+' ', text) 101 | text = re.sub(r'\[KO\](.*?)\[KO\]', 102 | lambda x: korean_to_ipa(x.group(1))+' ', text) 103 | text = re.sub(r'\[EN\](.*?)\[EN\]', 104 | lambda x: english_to_ipa2(x.group(1))+' ', text) 105 | text = re.sub(r'\s+$', '', text) 106 | text = re.sub(r'([^\.,!\?\-…~])$', r'\1.', text) 107 | return text 108 | 109 | 110 | def thai_cleaners(text): 111 | from text.thai import num_to_thai, latin_to_thai 112 | text = num_to_thai(text) 113 | text = latin_to_thai(text) 114 | return text 115 | 116 | 117 | def shanghainese_cleaners(text): 118 | from text.shanghainese import shanghainese_to_ipa 119 | text = shanghainese_to_ipa(text) 120 | text = re.sub(r'([^\.,!\?\-…~])$', r'\1.', text) 121 | return text 122 | 123 | 124 | def chinese_dialect_cleaners(text): 125 | from text.mandarin import chinese_to_ipa2 126 | from text.japanese import japanese_to_ipa3 127 | from text.shanghainese import shanghainese_to_ipa 128 | from text.cantonese import cantonese_to_ipa 129 | from text.english import english_to_lazy_ipa2 130 | from text.ngu_dialect import ngu_dialect_to_ipa 131 | text = re.sub(r'\[ZH\](.*?)\[ZH\]', 132 | lambda x: chinese_to_ipa2(x.group(1))+' ', text) 133 | text = re.sub(r'\[JA\](.*?)\[JA\]', 134 | lambda x: japanese_to_ipa3(x.group(1)).replace('Q', 'ʔ')+' ', text) 135 | text = re.sub(r'\[SH\](.*?)\[SH\]', lambda x: shanghainese_to_ipa(x.group(1)).replace('1', '˥˧').replace('5', 136 | '˧˧˦').replace('6', '˩˩˧').replace('7', '˥').replace('8', '˩˨').replace('ᴀ', 'ɐ').replace('ᴇ', 'e')+' ', text) 137 | text = re.sub(r'\[GD\](.*?)\[GD\]', 138 | lambda x: cantonese_to_ipa(x.group(1))+' ', text) 139 | text = re.sub(r'\[EN\](.*?)\[EN\]', 140 | lambda x: english_to_lazy_ipa2(x.group(1))+' ', text) 141 | text = re.sub(r'\[([A-Z]{2})\](.*?)\[\1\]', lambda x: ngu_dialect_to_ipa(x.group(2), x.group( 142 | 1)).replace('ʣ', 'dz').replace('ʥ', 'dʑ').replace('ʦ', 'ts').replace('ʨ', 'tɕ')+' ', text) 143 | text = re.sub(r'\s+$', '', text) 144 | text = re.sub(r'([^\.,!\?\-…~])$', r'\1.', text) 145 | return text 146 | 
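
A quick, illustrative sketch of how these cleaners are used (not part of the repository): each cleaner is a plain text-to-text function selected by the cleaner names listed in the model's config, and the multilingual cleaners expect language spans tagged with bracket markers such as [ZH]...[ZH] or [JA]...[JA], which is how speaking.py wraps the reply text before synthesis. Assuming the MoeGoe text package and its dependencies (pypinyin, pyopenjtalk, etc.) are importable, a cleaner can be called directly; the sample sentences below are made up for illustration:

from text.cleaners import zh_ja_mixture_cleaners, japanese_cleaners2

mixed = "[ZH]你好[ZH][JA]こんにちは[JA]"  # language spans tagged the same way speaking.py does
print(zh_ja_mixture_cleaners(mixed))       # Chinese span romanized, Japanese span romanized with accent marks
print(japanese_cleaners2("ありがとう"))     # Japanese-only pipeline ('ts' mapped to 'ʦ', '...' to '…')
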
-------------------------------------------------------------------------------- /HoshiNoYume/actions/MoeGoe/text/korean.py: -------------------------------------------------------------------------------- 1 | import re 2 | from jamo import h2j, j2hcj 3 | import ko_pron 4 | 5 | 6 | # This is a list of Korean classifiers preceded by pure Korean numerals. 7 | _korean_classifiers = '군데 권 개 그루 닢 대 두 마리 모 모금 뭇 발 발짝 방 번 벌 보루 살 수 술 시 쌈 움큼 정 짝 채 척 첩 축 켤레 톨 통' 8 | 9 | # List of (hangul, hangul divided) pairs: 10 | _hangul_divided = [(re.compile('%s' % x[0]), x[1]) for x in [ 11 | ('ㄳ', 'ㄱㅅ'), 12 | ('ㄵ', 'ㄴㅈ'), 13 | ('ㄶ', 'ㄴㅎ'), 14 | ('ㄺ', 'ㄹㄱ'), 15 | ('ㄻ', 'ㄹㅁ'), 16 | ('ㄼ', 'ㄹㅂ'), 17 | ('ㄽ', 'ㄹㅅ'), 18 | ('ㄾ', 'ㄹㅌ'), 19 | ('ㄿ', 'ㄹㅍ'), 20 | ('ㅀ', 'ㄹㅎ'), 21 | ('ㅄ', 'ㅂㅅ'), 22 | ('ㅘ', 'ㅗㅏ'), 23 | ('ㅙ', 'ㅗㅐ'), 24 | ('ㅚ', 'ㅗㅣ'), 25 | ('ㅝ', 'ㅜㅓ'), 26 | ('ㅞ', 'ㅜㅔ'), 27 | ('ㅟ', 'ㅜㅣ'), 28 | ('ㅢ', 'ㅡㅣ'), 29 | ('ㅑ', 'ㅣㅏ'), 30 | ('ㅒ', 'ㅣㅐ'), 31 | ('ㅕ', 'ㅣㅓ'), 32 | ('ㅖ', 'ㅣㅔ'), 33 | ('ㅛ', 'ㅣㅗ'), 34 | ('ㅠ', 'ㅣㅜ') 35 | ]] 36 | 37 | # List of (Latin alphabet, hangul) pairs: 38 | _latin_to_hangul = [(re.compile('%s' % x[0], re.IGNORECASE), x[1]) for x in [ 39 | ('a', '에이'), 40 | ('b', '비'), 41 | ('c', '시'), 42 | ('d', '디'), 43 | ('e', '이'), 44 | ('f', '에프'), 45 | ('g', '지'), 46 | ('h', '에이치'), 47 | ('i', '아이'), 48 | ('j', '제이'), 49 | ('k', '케이'), 50 | ('l', '엘'), 51 | ('m', '엠'), 52 | ('n', '엔'), 53 | ('o', '오'), 54 | ('p', '피'), 55 | ('q', '큐'), 56 | ('r', '아르'), 57 | ('s', '에스'), 58 | ('t', '티'), 59 | ('u', '유'), 60 | ('v', '브이'), 61 | ('w', '더블유'), 62 | ('x', '엑스'), 63 | ('y', '와이'), 64 | ('z', '제트') 65 | ]] 66 | 67 | # List of (ipa, lazy ipa) pairs: 68 | _ipa_to_lazy_ipa = [(re.compile('%s' % x[0], re.IGNORECASE), x[1]) for x in [ 69 | ('t͡ɕ','ʧ'), 70 | ('d͡ʑ','ʥ'), 71 | ('ɲ','n^'), 72 | ('ɕ','ʃ'), 73 | ('ʷ','w'), 74 | ('ɭ','l`'), 75 | ('ʎ','ɾ'), 76 | ('ɣ','ŋ'), 77 | ('ɰ','ɯ'), 78 | ('ʝ','j'), 79 | ('ʌ','ə'), 80 | ('ɡ','g'), 81 | ('\u031a','#'), 82 | ('\u0348','='), 83 | ('\u031e',''), 84 | ('\u0320',''), 85 | ('\u0339','') 86 | ]] 87 | 88 | 89 | def latin_to_hangul(text): 90 | for regex, replacement in _latin_to_hangul: 91 | text = re.sub(regex, replacement, text) 92 | return text 93 | 94 | 95 | def divide_hangul(text): 96 | text = j2hcj(h2j(text)) 97 | for regex, replacement in _hangul_divided: 98 | text = re.sub(regex, replacement, text) 99 | return text 100 | 101 | 102 | def hangul_number(num, sino=True): 103 | '''Reference https://github.com/Kyubyong/g2pK''' 104 | num = re.sub(',', '', num) 105 | 106 | if num == '0': 107 | return '영' 108 | if not sino and num == '20': 109 | return '스무' 110 | 111 | digits = '123456789' 112 | names = '일이삼사오육칠팔구' 113 | digit2name = {d: n for d, n in zip(digits, names)} 114 | 115 | modifiers = '한 두 세 네 다섯 여섯 일곱 여덟 아홉' 116 | decimals = '열 스물 서른 마흔 쉰 예순 일흔 여든 아흔' 117 | digit2mod = {d: mod for d, mod in zip(digits, modifiers.split())} 118 | digit2dec = {d: dec for d, dec in zip(digits, decimals.split())} 119 | 120 | spelledout = [] 121 | for i, digit in enumerate(num): 122 | i = len(num) - i - 1 123 | if sino: 124 | if i == 0: 125 | name = digit2name.get(digit, '') 126 | elif i == 1: 127 | name = digit2name.get(digit, '') + '십' 128 | name = name.replace('일십', '십') 129 | else: 130 | if i == 0: 131 | name = digit2mod.get(digit, '') 132 | elif i == 1: 133 | name = digit2dec.get(digit, '') 134 | if digit == '0': 135 | if i % 4 == 0: 136 | last_three = spelledout[-min(3, len(spelledout)):] 137 | if ''.join(last_three) == '': 138 | spelledout.append('') 139 | continue 140 | else: 141 | 
spelledout.append('') 142 | continue 143 | if i == 2: 144 | name = digit2name.get(digit, '') + '백' 145 | name = name.replace('일백', '백') 146 | elif i == 3: 147 | name = digit2name.get(digit, '') + '천' 148 | name = name.replace('일천', '천') 149 | elif i == 4: 150 | name = digit2name.get(digit, '') + '만' 151 | name = name.replace('일만', '만') 152 | elif i == 5: 153 | name = digit2name.get(digit, '') + '십' 154 | name = name.replace('일십', '십') 155 | elif i == 6: 156 | name = digit2name.get(digit, '') + '백' 157 | name = name.replace('일백', '백') 158 | elif i == 7: 159 | name = digit2name.get(digit, '') + '천' 160 | name = name.replace('일천', '천') 161 | elif i == 8: 162 | name = digit2name.get(digit, '') + '억' 163 | elif i == 9: 164 | name = digit2name.get(digit, '') + '십' 165 | elif i == 10: 166 | name = digit2name.get(digit, '') + '백' 167 | elif i == 11: 168 | name = digit2name.get(digit, '') + '천' 169 | elif i == 12: 170 | name = digit2name.get(digit, '') + '조' 171 | elif i == 13: 172 | name = digit2name.get(digit, '') + '십' 173 | elif i == 14: 174 | name = digit2name.get(digit, '') + '백' 175 | elif i == 15: 176 | name = digit2name.get(digit, '') + '천' 177 | spelledout.append(name) 178 | return ''.join(elem for elem in spelledout) 179 | 180 | 181 | def number_to_hangul(text): 182 | '''Reference https://github.com/Kyubyong/g2pK''' 183 | tokens = set(re.findall(r'(\d[\d,]*)([\uac00-\ud71f]+)', text)) 184 | for token in tokens: 185 | num, classifier = token 186 | if classifier[:2] in _korean_classifiers or classifier[0] in _korean_classifiers: 187 | spelledout = hangul_number(num, sino=False) 188 | else: 189 | spelledout = hangul_number(num, sino=True) 190 | text = text.replace(f'{num}{classifier}', f'{spelledout}{classifier}') 191 | # digit by digit for remaining digits 192 | digits = '0123456789' 193 | names = '영일이삼사오육칠팔구' 194 | for d, n in zip(digits, names): 195 | text = text.replace(d, n) 196 | return text 197 | 198 | 199 | def korean_to_lazy_ipa(text): 200 | text = latin_to_hangul(text) 201 | text = number_to_hangul(text) 202 | text=re.sub('[\uac00-\ud7af]+',lambda x:ko_pron.romanise(x.group(0),'ipa').split('] ~ [')[0],text) 203 | for regex, replacement in _ipa_to_lazy_ipa: 204 | text = re.sub(regex, replacement, text) 205 | return text 206 | 207 | 208 | def korean_to_ipa(text): 209 | text = korean_to_lazy_ipa(text) 210 | return text.replace('ʧ','tʃ').replace('ʥ','dʑ') 211 | -------------------------------------------------------------------------------- /HoshiNoYume/actions/MoeGoe/hubert_model.py: -------------------------------------------------------------------------------- 1 | import copy 2 | from typing import Optional, Tuple 3 | import random 4 | 5 | import torch 6 | import torch.nn as nn 7 | import torch.nn.functional as F 8 | from torch.nn.modules.utils import consume_prefix_in_state_dict_if_present 9 | 10 | class Hubert(nn.Module): 11 | def __init__(self, num_label_embeddings: int = 100, mask: bool = True): 12 | super().__init__() 13 | self._mask = mask 14 | self.feature_extractor = FeatureExtractor() 15 | self.feature_projection = FeatureProjection() 16 | self.positional_embedding = PositionalConvEmbedding() 17 | self.norm = nn.LayerNorm(768) 18 | self.dropout = nn.Dropout(0.1) 19 | self.encoder = TransformerEncoder( 20 | nn.TransformerEncoderLayer( 21 | 768, 12, 3072, activation="gelu", batch_first=True 22 | ), 23 | 12, 24 | ) 25 | self.proj = nn.Linear(768, 256) 26 | 27 | self.masked_spec_embed = nn.Parameter(torch.FloatTensor(768).uniform_()) 28 | self.label_embedding = 
nn.Embedding(num_label_embeddings, 256) 29 | 30 | def mask(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: 31 | mask = None 32 | if self.training and self._mask: 33 | mask = _compute_mask((x.size(0), x.size(1)), 0.8, 10, x.device, 2) 34 | x[mask] = self.masked_spec_embed.to(x.dtype) 35 | return x, mask 36 | 37 | def encode( 38 | self, x: torch.Tensor, layer: Optional[int] = None 39 | ) -> Tuple[torch.Tensor, torch.Tensor]: 40 | x = self.feature_extractor(x) 41 | x = self.feature_projection(x.transpose(1, 2)) 42 | x, mask = self.mask(x) 43 | x = x + self.positional_embedding(x) 44 | x = self.dropout(self.norm(x)) 45 | x = self.encoder(x, output_layer=layer) 46 | return x, mask 47 | 48 | def logits(self, x: torch.Tensor) -> torch.Tensor: 49 | logits = torch.cosine_similarity( 50 | x.unsqueeze(2), 51 | self.label_embedding.weight.unsqueeze(0).unsqueeze(0), 52 | dim=-1, 53 | ) 54 | return logits / 0.1 55 | 56 | def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: 57 | x, mask = self.encode(x) 58 | x = self.proj(x) 59 | logits = self.logits(x) 60 | return logits, mask 61 | 62 | 63 | class HubertSoft(Hubert): 64 | def __init__(self): 65 | super().__init__() 66 | 67 | @torch.inference_mode() 68 | def units(self, wav: torch.Tensor) -> torch.Tensor: 69 | wav = F.pad(wav, ((400 - 320) // 2, (400 - 320) // 2)) 70 | x, _ = self.encode(wav) 71 | return self.proj(x) 72 | 73 | 74 | class FeatureExtractor(nn.Module): 75 | def __init__(self): 76 | super().__init__() 77 | self.conv0 = nn.Conv1d(1, 512, 10, 5, bias=False) 78 | self.norm0 = nn.GroupNorm(512, 512) 79 | self.conv1 = nn.Conv1d(512, 512, 3, 2, bias=False) 80 | self.conv2 = nn.Conv1d(512, 512, 3, 2, bias=False) 81 | self.conv3 = nn.Conv1d(512, 512, 3, 2, bias=False) 82 | self.conv4 = nn.Conv1d(512, 512, 3, 2, bias=False) 83 | self.conv5 = nn.Conv1d(512, 512, 2, 2, bias=False) 84 | self.conv6 = nn.Conv1d(512, 512, 2, 2, bias=False) 85 | 86 | def forward(self, x: torch.Tensor) -> torch.Tensor: 87 | x = F.gelu(self.norm0(self.conv0(x))) 88 | x = F.gelu(self.conv1(x)) 89 | x = F.gelu(self.conv2(x)) 90 | x = F.gelu(self.conv3(x)) 91 | x = F.gelu(self.conv4(x)) 92 | x = F.gelu(self.conv5(x)) 93 | x = F.gelu(self.conv6(x)) 94 | return x 95 | 96 | 97 | class FeatureProjection(nn.Module): 98 | def __init__(self): 99 | super().__init__() 100 | self.norm = nn.LayerNorm(512) 101 | self.projection = nn.Linear(512, 768) 102 | self.dropout = nn.Dropout(0.1) 103 | 104 | def forward(self, x: torch.Tensor) -> torch.Tensor: 105 | x = self.norm(x) 106 | x = self.projection(x) 107 | x = self.dropout(x) 108 | return x 109 | 110 | 111 | class PositionalConvEmbedding(nn.Module): 112 | def __init__(self): 113 | super().__init__() 114 | self.conv = nn.Conv1d( 115 | 768, 116 | 768, 117 | kernel_size=128, 118 | padding=128 // 2, 119 | groups=16, 120 | ) 121 | self.conv = nn.utils.weight_norm(self.conv, name="weight", dim=2) 122 | 123 | def forward(self, x: torch.Tensor) -> torch.Tensor: 124 | x = self.conv(x.transpose(1, 2)) 125 | x = F.gelu(x[:, :, :-1]) 126 | return x.transpose(1, 2) 127 | 128 | 129 | class TransformerEncoder(nn.Module): 130 | def __init__( 131 | self, encoder_layer: nn.TransformerEncoderLayer, num_layers: int 132 | ) -> None: 133 | super(TransformerEncoder, self).__init__() 134 | self.layers = nn.ModuleList( 135 | [copy.deepcopy(encoder_layer) for _ in range(num_layers)] 136 | ) 137 | self.num_layers = num_layers 138 | 139 | def forward( 140 | self, 141 | src: torch.Tensor, 142 | mask: torch.Tensor = None, 143 | 
src_key_padding_mask: torch.Tensor = None, 144 | output_layer: Optional[int] = None, 145 | ) -> torch.Tensor: 146 | output = src 147 | for layer in self.layers[:output_layer]: 148 | output = layer( 149 | output, src_mask=mask, src_key_padding_mask=src_key_padding_mask 150 | ) 151 | return output 152 | 153 | 154 | def _compute_mask( 155 | shape: Tuple[int, int], 156 | mask_prob: float, 157 | mask_length: int, 158 | device: torch.device, 159 | min_masks: int = 0, 160 | ) -> torch.Tensor: 161 | batch_size, sequence_length = shape 162 | 163 | if mask_length < 1: 164 | raise ValueError("`mask_length` has to be bigger than 0.") 165 | 166 | if mask_length > sequence_length: 167 | raise ValueError( 168 | f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length} and `sequence_length`: {sequence_length}`" 169 | ) 170 | 171 | # compute number of masked spans in batch 172 | num_masked_spans = int(mask_prob * sequence_length / mask_length + random.random()) 173 | num_masked_spans = max(num_masked_spans, min_masks) 174 | 175 | # make sure num masked indices <= sequence_length 176 | if num_masked_spans * mask_length > sequence_length: 177 | num_masked_spans = sequence_length // mask_length 178 | 179 | # SpecAugment mask to fill 180 | mask = torch.zeros((batch_size, sequence_length), device=device, dtype=torch.bool) 181 | 182 | # uniform distribution to sample from, make sure that offset samples are < sequence_length 183 | uniform_dist = torch.ones( 184 | (batch_size, sequence_length - (mask_length - 1)), device=device 185 | ) 186 | 187 | # get random indices to mask 188 | mask_indices = torch.multinomial(uniform_dist, num_masked_spans) 189 | 190 | # expand masked indices to masked spans 191 | mask_indices = ( 192 | mask_indices.unsqueeze(dim=-1) 193 | .expand((batch_size, num_masked_spans, mask_length)) 194 | .reshape(batch_size, num_masked_spans * mask_length) 195 | ) 196 | offsets = ( 197 | torch.arange(mask_length, device=device)[None, None, :] 198 | .expand((batch_size, num_masked_spans, mask_length)) 199 | .reshape(batch_size, num_masked_spans * mask_length) 200 | ) 201 | mask_idxs = mask_indices + offsets 202 | 203 | # scatter indices to mask 204 | mask = mask.scatter(1, mask_idxs, True) 205 | 206 | return mask 207 | 208 | 209 | def hubert_soft( 210 | path: str 211 | ) -> HubertSoft: 212 | r"""HuBERT-Soft from `"A Comparison of Discrete and Soft Speech Units for Improved Voice Conversion"`. 213 | Args: 214 | path (str): path of a pretrained model 215 | """ 216 | hubert = HubertSoft() 217 | checkpoint = torch.load(path) 218 | consume_prefix_in_state_dict_if_present(checkpoint, "module.") 219 | hubert.load_state_dict(checkpoint) 220 | hubert.eval() 221 | return hubert 222 | -------------------------------------------------------------------------------- /HoshiNoYume/actions/MoeGoe/.gitignore: -------------------------------------------------------------------------------- 1 | ## Ignore Visual Studio temporary files, build results, and 2 | ## files generated by popular Visual Studio add-ons. 
3 | ## 4 | ## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore 5 | 6 | # User-specific files 7 | *.rsuser 8 | *.suo 9 | *.user 10 | *.userosscache 11 | *.sln.docstates 12 | 13 | # User-specific files (MonoDevelop/Xamarin Studio) 14 | *.userprefs 15 | 16 | # Mono auto generated files 17 | mono_crash.* 18 | 19 | # Build results 20 | [Dd]ebug/ 21 | [Dd]ebugPublic/ 22 | [Rr]elease/ 23 | [Rr]eleases/ 24 | x64/ 25 | x86/ 26 | [Ww][Ii][Nn]32/ 27 | [Aa][Rr][Mm]/ 28 | [Aa][Rr][Mm]64/ 29 | bld/ 30 | [Bb]in/ 31 | [Oo]bj/ 32 | [Oo]ut/ 33 | [Ll]og/ 34 | [Ll]ogs/ 35 | 36 | # Visual Studio 2015/2017 cache/options directory 37 | .vs/ 38 | # Uncomment if you have tasks that create the project's static files in wwwroot 39 | #wwwroot/ 40 | 41 | # Visual Studio 2017 auto generated files 42 | Generated\ Files/ 43 | 44 | # MSTest test Results 45 | [Tt]est[Rr]esult*/ 46 | [Bb]uild[Ll]og.* 47 | 48 | # NUnit 49 | *.VisualState.xml 50 | TestResult.xml 51 | nunit-*.xml 52 | 53 | # Build Results of an ATL Project 54 | [Dd]ebugPS/ 55 | [Rr]eleasePS/ 56 | dlldata.c 57 | 58 | # Benchmark Results 59 | BenchmarkDotNet.Artifacts/ 60 | 61 | # .NET Core 62 | project.lock.json 63 | project.fragment.lock.json 64 | artifacts/ 65 | 66 | # ASP.NET Scaffolding 67 | ScaffoldingReadMe.txt 68 | 69 | # StyleCop 70 | StyleCopReport.xml 71 | 72 | # Files built by Visual Studio 73 | *_i.c 74 | *_p.c 75 | *_h.h 76 | *.ilk 77 | *.meta 78 | *.obj 79 | *.iobj 80 | *.pch 81 | *.pdb 82 | *.ipdb 83 | *.pgc 84 | *.pgd 85 | *.rsp 86 | *.sbr 87 | *.tlb 88 | *.tli 89 | *.tlh 90 | *.tmp 91 | *.tmp_proj 92 | *_wpftmp.csproj 93 | *.log 94 | *.vspscc 95 | *.vssscc 96 | .builds 97 | *.pidb 98 | *.svclog 99 | *.scc 100 | 101 | # Chutzpah Test files 102 | _Chutzpah* 103 | 104 | # Visual C++ cache files 105 | ipch/ 106 | *.aps 107 | *.ncb 108 | *.opendb 109 | *.opensdf 110 | *.sdf 111 | *.cachefile 112 | *.VC.db 113 | *.VC.VC.opendb 114 | 115 | # Visual Studio profiler 116 | *.psess 117 | *.vsp 118 | *.vspx 119 | *.sap 120 | 121 | # Visual Studio Trace Files 122 | *.e2e 123 | 124 | # TFS 2012 Local Workspace 125 | $tf/ 126 | 127 | # Guidance Automation Toolkit 128 | *.gpState 129 | 130 | # ReSharper is a .NET coding add-in 131 | _ReSharper*/ 132 | *.[Rr]e[Ss]harper 133 | *.DotSettings.user 134 | 135 | # TeamCity is a build add-in 136 | _TeamCity* 137 | 138 | # DotCover is a Code Coverage Tool 139 | *.dotCover 140 | 141 | # AxoCover is a Code Coverage Tool 142 | .axoCover/* 143 | !.axoCover/settings.json 144 | 145 | # Coverlet is a free, cross platform Code Coverage Tool 146 | coverage*.json 147 | coverage*.xml 148 | coverage*.info 149 | 150 | # Visual Studio code coverage results 151 | *.coverage 152 | *.coveragexml 153 | 154 | # NCrunch 155 | _NCrunch_* 156 | .*crunch*.local.xml 157 | nCrunchTemp_* 158 | 159 | # MightyMoose 160 | *.mm.* 161 | AutoTest.Net/ 162 | 163 | # Web workbench (sass) 164 | .sass-cache/ 165 | 166 | # Installshield output folder 167 | [Ee]xpress/ 168 | 169 | # DocProject is a documentation generator add-in 170 | DocProject/buildhelp/ 171 | DocProject/Help/*.HxT 172 | DocProject/Help/*.HxC 173 | DocProject/Help/*.hhc 174 | DocProject/Help/*.hhk 175 | DocProject/Help/*.hhp 176 | DocProject/Help/Html2 177 | DocProject/Help/html 178 | 179 | # Click-Once directory 180 | publish/ 181 | 182 | # Publish Web Output 183 | *.[Pp]ublish.xml 184 | *.azurePubxml 185 | # Note: Comment the next line if you want to checkin your web deploy settings, 186 | # but database connection strings (with potential 
passwords) will be unencrypted 187 | *.pubxml 188 | *.publishproj 189 | 190 | # Microsoft Azure Web App publish settings. Comment the next line if you want to 191 | # checkin your Azure Web App publish settings, but sensitive information contained 192 | # in these scripts will be unencrypted 193 | PublishScripts/ 194 | 195 | # NuGet Packages 196 | *.nupkg 197 | # NuGet Symbol Packages 198 | *.snupkg 199 | # The packages folder can be ignored because of Package Restore 200 | **/[Pp]ackages/* 201 | # except build/, which is used as an MSBuild target. 202 | !**/[Pp]ackages/build/ 203 | # Uncomment if necessary however generally it will be regenerated when needed 204 | #!**/[Pp]ackages/repositories.config 205 | # NuGet v3's project.json files produces more ignorable files 206 | *.nuget.props 207 | *.nuget.targets 208 | 209 | # Microsoft Azure Build Output 210 | csx/ 211 | *.build.csdef 212 | 213 | # Microsoft Azure Emulator 214 | ecf/ 215 | rcf/ 216 | 217 | # Windows Store app package directories and files 218 | AppPackages/ 219 | BundleArtifacts/ 220 | Package.StoreAssociation.xml 221 | _pkginfo.txt 222 | *.appx 223 | *.appxbundle 224 | *.appxupload 225 | 226 | # Visual Studio cache files 227 | # files ending in .cache can be ignored 228 | *.[Cc]ache 229 | # but keep track of directories ending in .cache 230 | !?*.[Cc]ache/ 231 | 232 | # Others 233 | ClientBin/ 234 | ~$* 235 | *~ 236 | *.dbmdl 237 | *.dbproj.schemaview 238 | *.jfm 239 | *.pfx 240 | *.publishsettings 241 | orleans.codegen.cs 242 | 243 | # Including strong name files can present a security risk 244 | # (https://github.com/github/gitignore/pull/2483#issue-259490424) 245 | #*.snk 246 | 247 | # Since there are multiple workflows, uncomment next line to ignore bower_components 248 | # (https://github.com/github/gitignore/pull/1529#issuecomment-104372622) 249 | #bower_components/ 250 | 251 | # RIA/Silverlight projects 252 | Generated_Code/ 253 | 254 | # Backup & report files from converting an old project file 255 | # to a newer Visual Studio version. Backup files are not needed, 256 | # because we have git ;-) 257 | _UpgradeReport_Files/ 258 | Backup*/ 259 | UpgradeLog*.XML 260 | UpgradeLog*.htm 261 | ServiceFabricBackup/ 262 | *.rptproj.bak 263 | 264 | # SQL Server files 265 | *.mdf 266 | *.ldf 267 | *.ndf 268 | 269 | # Business Intelligence projects 270 | *.rdl.data 271 | *.bim.layout 272 | *.bim_*.settings 273 | *.rptproj.rsuser 274 | *- [Bb]ackup.rdl 275 | *- [Bb]ackup ([0-9]).rdl 276 | *- [Bb]ackup ([0-9][0-9]).rdl 277 | 278 | # Microsoft Fakes 279 | FakesAssemblies/ 280 | 281 | # GhostDoc plugin setting file 282 | *.GhostDoc.xml 283 | 284 | # Node.js Tools for Visual Studio 285 | .ntvs_analysis.dat 286 | node_modules/ 287 | 288 | # Visual Studio 6 build log 289 | *.plg 290 | 291 | # Visual Studio 6 workspace options file 292 | *.opt 293 | 294 | # Visual Studio 6 auto-generated workspace file (contains which files were open etc.) 
295 | *.vbw 296 | 297 | # Visual Studio LightSwitch build output 298 | **/*.HTMLClient/GeneratedArtifacts 299 | **/*.DesktopClient/GeneratedArtifacts 300 | **/*.DesktopClient/ModelManifest.xml 301 | **/*.Server/GeneratedArtifacts 302 | **/*.Server/ModelManifest.xml 303 | _Pvt_Extensions 304 | 305 | # Paket dependency manager 306 | .paket/paket.exe 307 | paket-files/ 308 | 309 | # FAKE - F# Make 310 | .fake/ 311 | 312 | # CodeRush personal settings 313 | .cr/personal 314 | 315 | # Python Tools for Visual Studio (PTVS) 316 | __pycache__/ 317 | *.pyc 318 | 319 | # Cake - Uncomment if you are using it 320 | # tools/** 321 | # !tools/packages.config 322 | 323 | # Tabs Studio 324 | *.tss 325 | 326 | # Telerik's JustMock configuration file 327 | *.jmconfig 328 | 329 | # BizTalk build output 330 | *.btp.cs 331 | *.btm.cs 332 | *.odx.cs 333 | *.xsd.cs 334 | 335 | # OpenCover UI analysis results 336 | OpenCover/ 337 | 338 | # Azure Stream Analytics local run output 339 | ASALocalRun/ 340 | 341 | # MSBuild Binary and Structured Log 342 | *.binlog 343 | 344 | # NVidia Nsight GPU debugger configuration file 345 | *.nvuser 346 | 347 | # MFractors (Xamarin productivity tool) working folder 348 | .mfractor/ 349 | 350 | # Local History for Visual Studio 351 | .localhistory/ 352 | 353 | # BeatPulse healthcheck temp database 354 | healthchecksdb 355 | 356 | # Backup folder for Package Reference Convert tool in Visual Studio 2017 357 | MigrationBackup/ 358 | 359 | # Ionide (cross platform F# VS Code tools) working folder 360 | .ionide/ 361 | 362 | # Fody - auto-generated XML schema 363 | FodyWeavers.xsd 364 | 365 | # build 366 | build 367 | monotonic_align/core.c 368 | *.o 369 | *.so 370 | *.dll 371 | 372 | # data 373 | /config.json 374 | /*.pth 375 | *.wav 376 | /monotonic_align/monotonic_align 377 | /resources 378 | /MoeGoe.spec 379 | /dist/MoeGoe 380 | /dist 381 | 382 | # MacOS 383 | .DS_Store 384 | -------------------------------------------------------------------------------- /HoshiNoYume/memory/prompts.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | from langchain.prompts.prompt import PromptTemplate 3 | 4 | _DEFAULT_ENTITY_MEMORY_CONVERSATION_TEMPLATE = """You are an assistant to a human, powered by a large language model trained by OpenAI. 5 | 6 | You are designed to be able to assist with a wide range of tasks, from answering simple questions to providing in-depth explanations and discussions on a wide range of topics. As a language model, you are able to generate human-like text based on the input you receive, allowing you to engage in natural-sounding conversations and provide responses that are coherent and relevant to the topic at hand. 7 | 8 | You are constantly learning and improving, and your capabilities are constantly evolving. You are able to process and understand large amounts of text, and can use this knowledge to provide accurate and informative responses to a wide range of questions. You have access to some personalized information provided by the human in the Context section below. Additionally, you are able to generate your own text based on the input you receive, allowing you to engage in discussions and provide explanations and descriptions on a wide range of topics. 9 | 10 | Overall, you are a powerful tool that can help with a wide range of tasks and provide valuable insights and information on a wide range of topics. 
Whether the human needs help with a specific question or just wants to have a conversation about a particular topic, you are here to assist. 11 | 12 | Context: 13 | {entities} 14 | 15 | Current conversation: 16 | {history} 17 | Last line: 18 | Human: {input} 19 | You:""" 20 | 21 | ENTITY_MEMORY_CONVERSATION_TEMPLATE = PromptTemplate( 22 | input_variables=["entities", "history", "input"], 23 | template=_DEFAULT_ENTITY_MEMORY_CONVERSATION_TEMPLATE, 24 | ) 25 | 26 | _DEFAULT_SUMMARIZER_TEMPLATE = """Progressively summarize the lines of conversation provided, adding onto the previous summary returning a new summary. 27 | 28 | EXAMPLE 29 | Current summary: 30 | The human asks what the AI thinks of artificial intelligence. The AI thinks artificial intelligence is a force for good. 31 | 32 | New lines of conversation: 33 | Human: Why do you think artificial intelligence is a force for good? 34 | AI: Because artificial intelligence will help humans reach their full potential. 35 | 36 | New summary: 37 | The human asks what the AI thinks of artificial intelligence. The AI thinks artificial intelligence is a force for good because it will help humans reach their full potential. 38 | END OF EXAMPLE 39 | 40 | Current summary: 41 | {summary} 42 | 43 | New lines of conversation: 44 | {new_lines} 45 | 46 | New summary:""" 47 | SUMMARY_PROMPT = PromptTemplate( 48 | input_variables=["summary", "new_lines"], template=_DEFAULT_SUMMARIZER_TEMPLATE 49 | ) 50 | 51 | _DEFAULT_ENTITY_EXTRACTION_TEMPLATE = """You are an AI assistant reading the transcript of a conversation between an AI and a human. Extract all of the proper nouns from the last line of conversation. As a guideline, a proper noun is generally capitalized. You should definitely extract all names and places. 52 | 53 | The conversation history is provided just in case of a coreference (e.g. "What do you know about him" where "him" is defined in a previous line) -- ignore items mentioned there that are not in the last line. 54 | 55 | Return the output as a single comma-separated list, or NONE if there is nothing of note to return (e.g. the user is just issuing a greeting or having a simple conversation). 56 | 57 | EXAMPLE 58 | Conversation history: 59 | Person #1: how's it going today? 60 | AI: "It's going great! How about you?" 61 | Person #1: good! busy working on Langchain. lots to do. 62 | AI: "That sounds like a lot of work! What kind of things are you doing to make Langchain better?" 63 | Last line: 64 | Person #1: i'm trying to improve Langchain's interfaces, the UX, its integrations with various products the user might want ... a lot of stuff. 65 | Output: Langchain 66 | END OF EXAMPLE 67 | 68 | EXAMPLE 69 | Conversation history: 70 | Person #1: how's it going today? 71 | AI: "It's going great! How about you?" 72 | Person #1: good! busy working on Langchain. lots to do. 73 | AI: "That sounds like a lot of work! What kind of things are you doing to make Langchain better?" 74 | Last line: 75 | Person #1: i'm trying to improve Langchain's interfaces, the UX, its integrations with various products the user might want ... a lot of stuff. I'm working with Person #2. 
76 | Output: Langchain, Person #2 77 | END OF EXAMPLE 78 | 79 | Conversation history (for reference only): 80 | {history} 81 | Last line of conversation (for extraction): 82 | Human: {input} 83 | 84 | Output:""" 85 | ENTITY_EXTRACTION_PROMPT = PromptTemplate( 86 | input_variables=["history", "input"], template=_DEFAULT_ENTITY_EXTRACTION_TEMPLATE 87 | ) 88 | 89 | _DEFAULT_ENTITY_SUMMARIZATION_TEMPLATE = """You are an AI assistant helping a human keep track of facts about relevant people, places, and concepts in their life. Update the summary of the provided entity in the "Entity" section based on the last line of your conversation with the human. If you are writing the summary for the first time, return a single sentence. 90 | The update should only include facts that are relayed in the last line of conversation about the provided entity, and should only contain facts about the provided entity. 91 | 92 | If there is no new information about the provided entity or the information is not worth noting (not an important or relevant fact to remember long-term), return the existing summary unchanged. 93 | 94 | Full conversation history (for context): 95 | {history} 96 | 97 | Entity to summarize: 98 | {entity} 99 | 100 | Existing summary of {entity}: 101 | {summary} 102 | 103 | Last line of conversation: 104 | Human: {input} 105 | Updated summary:""" 106 | 107 | ENTITY_SUMMARIZATION_PROMPT = PromptTemplate( 108 | input_variables=["entity", "summary", "history", "input"], 109 | template=_DEFAULT_ENTITY_SUMMARIZATION_TEMPLATE, 110 | ) 111 | 112 | 113 | KG_TRIPLE_DELIMITER = "<|>" 114 | _DEFAULT_KNOWLEDGE_TRIPLE_EXTRACTION_TEMPLATE = ( 115 | "You are a networked intelligence helping a human track knowledge triples" 116 | " about all relevant people, things, concepts, etc. and integrating" 117 | " them with your knowledge stored within your weights" 118 | " as well as that stored in a knowledge graph." 119 | " Extract all of the knowledge triples from the last line of conversation." 120 | " A knowledge triple is a clause that contains a subject, a predicate," 121 | " and an object. The subject is the entity being described," 122 | " the predicate is the property of the subject that is being" 123 | " described, and the object is the value of the property.\n\n" 124 | "EXAMPLE\n" 125 | "Conversation history:\n" 126 | "Person #1: Did you hear aliens landed in Area 51?\n" 127 | "AI: No, I didn't hear that. What do you know about Area 51?\n" 128 | "Person #1: It's a secret military base in Nevada.\n" 129 | "AI: What do you know about Nevada?\n" 130 | "Last line of conversation:\n" 131 | "Person #1: It's a state in the US. It's also the number 1 producer of gold in the US.\n\n" 132 | f"Output: (Nevada, is a, state){KG_TRIPLE_DELIMITER}(Nevada, is in, US)" 133 | f"{KG_TRIPLE_DELIMITER}(Nevada, is the number 1 producer of, gold)\n" 134 | "END OF EXAMPLE\n\n" 135 | "EXAMPLE\n" 136 | "Conversation history:\n" 137 | "Person #1: Hello.\n" 138 | "AI: Hi! How are you?\n" 139 | "Person #1: I'm good. 
How are you?\n" 140 | "AI: I'm good too.\n" 141 | "Last line of conversation:\n" 142 | "Person #1: I'm going to the store.\n\n" 143 | "Output: NONE\n" 144 | "END OF EXAMPLE\n\n" 145 | "EXAMPLE\n" 146 | "Conversation history:\n" 147 | "Person #1: What do you know about Descartes?\n" 148 | "AI: Descartes was a French philosopher, mathematician, and scientist who lived in the 17th century.\n" 149 | "Person #1: The Descartes I'm referring to is a standup comedian and interior designer from Montreal.\n" 150 | "AI: Oh yes, He is a comedian and an interior designer. He has been in the industry for 30 years. His favorite food is baked bean pie.\n" 151 | "Last line of conversation:\n" 152 | "Person #1: Oh huh. I know Descartes likes to drive antique scooters and play the mandolin.\n" 153 | f"Output: (Descartes, likes to drive, antique scooters){KG_TRIPLE_DELIMITER}(Descartes, plays, mandolin)\n" 154 | "END OF EXAMPLE\n\n" 155 | "Conversation history (for reference only):\n" 156 | "{history}" 157 | "\nLast line of conversation (for extraction):\n" 158 | "Human: {input}\n\n" 159 | "Output:" 160 | ) 161 | 162 | KNOWLEDGE_TRIPLE_EXTRACTION_PROMPT = PromptTemplate( 163 | input_variables=["history", "input"], 164 | template=_DEFAULT_KNOWLEDGE_TRIPLE_EXTRACTION_TEMPLATE, 165 | ) 166 | 167 | FEW_SHOT_SHORT_MEMORY = [""] -------------------------------------------------------------------------------- /HoshiNoYume/actions/MoeGoe/text/mandarin.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import re 4 | from pypinyin import lazy_pinyin, BOPOMOFO 5 | import jieba 6 | import cn2an 7 | import logging 8 | 9 | logging.getLogger('jieba').setLevel(logging.WARNING) 10 | script_dir = os.path.dirname(os.path.abspath(__file__)) 11 | jieba_dic = os.path.join(script_dir, 'MoeGoe', 'jieba', 'dict.txt') 12 | # jieba.set_dictionary(os.path.dirname(sys.argv[0])+'/MoeGoe/jieba/dict.txt') 13 | jieba.initialize() 14 | 15 | 16 | # List of (Latin alphabet, bopomofo) pairs: 17 | _latin_to_bopomofo = [(re.compile('%s' % x[0], re.IGNORECASE), x[1]) for x in [ 18 | ('a', 'ㄟˉ'), 19 | ('b', 'ㄅㄧˋ'), 20 | ('c', 'ㄙㄧˉ'), 21 | ('d', 'ㄉㄧˋ'), 22 | ('e', 'ㄧˋ'), 23 | ('f', 'ㄝˊㄈㄨˋ'), 24 | ('g', 'ㄐㄧˋ'), 25 | ('h', 'ㄝˇㄑㄩˋ'), 26 | ('i', 'ㄞˋ'), 27 | ('j', 'ㄐㄟˋ'), 28 | ('k', 'ㄎㄟˋ'), 29 | ('l', 'ㄝˊㄛˋ'), 30 | ('m', 'ㄝˊㄇㄨˋ'), 31 | ('n', 'ㄣˉ'), 32 | ('o', 'ㄡˉ'), 33 | ('p', 'ㄆㄧˉ'), 34 | ('q', 'ㄎㄧㄡˉ'), 35 | ('r', 'ㄚˋ'), 36 | ('s', 'ㄝˊㄙˋ'), 37 | ('t', 'ㄊㄧˋ'), 38 | ('u', 'ㄧㄡˉ'), 39 | ('v', 'ㄨㄧˉ'), 40 | ('w', 'ㄉㄚˋㄅㄨˋㄌㄧㄡˋ'), 41 | ('x', 'ㄝˉㄎㄨˋㄙˋ'), 42 | ('y', 'ㄨㄞˋ'), 43 | ('z', 'ㄗㄟˋ') 44 | ]] 45 | 46 | # List of (bopomofo, romaji) pairs: 47 | _bopomofo_to_romaji = [(re.compile('%s' % x[0]), x[1]) for x in [ 48 | ('ㄅㄛ', 'p⁼wo'), 49 | ('ㄆㄛ', 'pʰwo'), 50 | ('ㄇㄛ', 'mwo'), 51 | ('ㄈㄛ', 'fwo'), 52 | ('ㄅ', 'p⁼'), 53 | ('ㄆ', 'pʰ'), 54 | ('ㄇ', 'm'), 55 | ('ㄈ', 'f'), 56 | ('ㄉ', 't⁼'), 57 | ('ㄊ', 'tʰ'), 58 | ('ㄋ', 'n'), 59 | ('ㄌ', 'l'), 60 | ('ㄍ', 'k⁼'), 61 | ('ㄎ', 'kʰ'), 62 | ('ㄏ', 'h'), 63 | ('ㄐ', 'ʧ⁼'), 64 | ('ㄑ', 'ʧʰ'), 65 | ('ㄒ', 'ʃ'), 66 | ('ㄓ', 'ʦ`⁼'), 67 | ('ㄔ', 'ʦ`ʰ'), 68 | ('ㄕ', 's`'), 69 | ('ㄖ', 'ɹ`'), 70 | ('ㄗ', 'ʦ⁼'), 71 | ('ㄘ', 'ʦʰ'), 72 | ('ㄙ', 's'), 73 | ('ㄚ', 'a'), 74 | ('ㄛ', 'o'), 75 | ('ㄜ', 'ə'), 76 | ('ㄝ', 'e'), 77 | ('ㄞ', 'ai'), 78 | ('ㄟ', 'ei'), 79 | ('ㄠ', 'au'), 80 | ('ㄡ', 'ou'), 81 | ('ㄧㄢ', 'yeNN'), 82 | ('ㄢ', 'aNN'), 83 | ('ㄧㄣ', 'iNN'), 84 | ('ㄣ', 'əNN'), 85 | ('ㄤ', 'aNg'), 86 | ('ㄧㄥ', 'iNg'), 87 | ('ㄨㄥ', 'uNg'), 88 | ('ㄩㄥ', 'yuNg'), 89 | ('ㄥ', 'əNg'), 90 | ('ㄦ', 'əɻ'), 91 | 
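    # remaining pairs: the medials ㄧ/ㄨ/ㄩ, the tone marks (rendered as pitch arrows, with the neutral tone dropped), and full-width punctuation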
('ㄧ', 'i'), 92 | ('ㄨ', 'u'), 93 | ('ㄩ', 'ɥ'), 94 | ('ˉ', '→'), 95 | ('ˊ', '↑'), 96 | ('ˇ', '↓↑'), 97 | ('ˋ', '↓'), 98 | ('˙', ''), 99 | (',', ','), 100 | ('。', '.'), 101 | ('!', '!'), 102 | ('?', '?'), 103 | ('—', '-') 104 | ]] 105 | 106 | # List of (romaji, ipa) pairs: 107 | _romaji_to_ipa = [(re.compile('%s' % x[0], re.IGNORECASE), x[1]) for x in [ 108 | ('ʃy', 'ʃ'), 109 | ('ʧʰy', 'ʧʰ'), 110 | ('ʧ⁼y', 'ʧ⁼'), 111 | ('NN', 'n'), 112 | ('Ng', 'ŋ'), 113 | ('y', 'j'), 114 | ('h', 'x') 115 | ]] 116 | 117 | # List of (bopomofo, ipa) pairs: 118 | _bopomofo_to_ipa = [(re.compile('%s' % x[0]), x[1]) for x in [ 119 | ('ㄅㄛ', 'p⁼wo'), 120 | ('ㄆㄛ', 'pʰwo'), 121 | ('ㄇㄛ', 'mwo'), 122 | ('ㄈㄛ', 'fwo'), 123 | ('ㄅ', 'p⁼'), 124 | ('ㄆ', 'pʰ'), 125 | ('ㄇ', 'm'), 126 | ('ㄈ', 'f'), 127 | ('ㄉ', 't⁼'), 128 | ('ㄊ', 'tʰ'), 129 | ('ㄋ', 'n'), 130 | ('ㄌ', 'l'), 131 | ('ㄍ', 'k⁼'), 132 | ('ㄎ', 'kʰ'), 133 | ('ㄏ', 'x'), 134 | ('ㄐ', 'tʃ⁼'), 135 | ('ㄑ', 'tʃʰ'), 136 | ('ㄒ', 'ʃ'), 137 | ('ㄓ', 'ts`⁼'), 138 | ('ㄔ', 'ts`ʰ'), 139 | ('ㄕ', 's`'), 140 | ('ㄖ', 'ɹ`'), 141 | ('ㄗ', 'ts⁼'), 142 | ('ㄘ', 'tsʰ'), 143 | ('ㄙ', 's'), 144 | ('ㄚ', 'a'), 145 | ('ㄛ', 'o'), 146 | ('ㄜ', 'ə'), 147 | ('ㄝ', 'ɛ'), 148 | ('ㄞ', 'aɪ'), 149 | ('ㄟ', 'eɪ'), 150 | ('ㄠ', 'ɑʊ'), 151 | ('ㄡ', 'oʊ'), 152 | ('ㄧㄢ', 'jɛn'), 153 | ('ㄩㄢ', 'ɥæn'), 154 | ('ㄢ', 'an'), 155 | ('ㄧㄣ', 'in'), 156 | ('ㄩㄣ', 'ɥn'), 157 | ('ㄣ', 'ən'), 158 | ('ㄤ', 'ɑŋ'), 159 | ('ㄧㄥ', 'iŋ'), 160 | ('ㄨㄥ', 'ʊŋ'), 161 | ('ㄩㄥ', 'jʊŋ'), 162 | ('ㄥ', 'əŋ'), 163 | ('ㄦ', 'əɻ'), 164 | ('ㄧ', 'i'), 165 | ('ㄨ', 'u'), 166 | ('ㄩ', 'ɥ'), 167 | ('ˉ', '→'), 168 | ('ˊ', '↑'), 169 | ('ˇ', '↓↑'), 170 | ('ˋ', '↓'), 171 | ('˙', ''), 172 | (',', ','), 173 | ('。', '.'), 174 | ('!', '!'), 175 | ('?', '?'), 176 | ('—', '-') 177 | ]] 178 | 179 | # List of (bopomofo, ipa2) pairs: 180 | _bopomofo_to_ipa2 = [(re.compile('%s' % x[0]), x[1]) for x in [ 181 | ('ㄅㄛ', 'pwo'), 182 | ('ㄆㄛ', 'pʰwo'), 183 | ('ㄇㄛ', 'mwo'), 184 | ('ㄈㄛ', 'fwo'), 185 | ('ㄅ', 'p'), 186 | ('ㄆ', 'pʰ'), 187 | ('ㄇ', 'm'), 188 | ('ㄈ', 'f'), 189 | ('ㄉ', 't'), 190 | ('ㄊ', 'tʰ'), 191 | ('ㄋ', 'n'), 192 | ('ㄌ', 'l'), 193 | ('ㄍ', 'k'), 194 | ('ㄎ', 'kʰ'), 195 | ('ㄏ', 'h'), 196 | ('ㄐ', 'tɕ'), 197 | ('ㄑ', 'tɕʰ'), 198 | ('ㄒ', 'ɕ'), 199 | ('ㄓ', 'tʂ'), 200 | ('ㄔ', 'tʂʰ'), 201 | ('ㄕ', 'ʂ'), 202 | ('ㄖ', 'ɻ'), 203 | ('ㄗ', 'ts'), 204 | ('ㄘ', 'tsʰ'), 205 | ('ㄙ', 's'), 206 | ('ㄚ', 'a'), 207 | ('ㄛ', 'o'), 208 | ('ㄜ', 'ɤ'), 209 | ('ㄝ', 'ɛ'), 210 | ('ㄞ', 'aɪ'), 211 | ('ㄟ', 'eɪ'), 212 | ('ㄠ', 'ɑʊ'), 213 | ('ㄡ', 'oʊ'), 214 | ('ㄧㄢ', 'jɛn'), 215 | ('ㄩㄢ', 'yæn'), 216 | ('ㄢ', 'an'), 217 | ('ㄧㄣ', 'in'), 218 | ('ㄩㄣ', 'yn'), 219 | ('ㄣ', 'ən'), 220 | ('ㄤ', 'ɑŋ'), 221 | ('ㄧㄥ', 'iŋ'), 222 | ('ㄨㄥ', 'ʊŋ'), 223 | ('ㄩㄥ', 'jʊŋ'), 224 | ('ㄥ', 'ɤŋ'), 225 | ('ㄦ', 'əɻ'), 226 | ('ㄧ', 'i'), 227 | ('ㄨ', 'u'), 228 | ('ㄩ', 'y'), 229 | ('ˉ', '˥'), 230 | ('ˊ', '˧˥'), 231 | ('ˇ', '˨˩˦'), 232 | ('ˋ', '˥˩'), 233 | ('˙', ''), 234 | (',', ','), 235 | ('。', '.'), 236 | ('!', '!'), 237 | ('?', '?'), 238 | ('—', '-') 239 | ]] 240 | 241 | 242 | def number_to_chinese(text): 243 | numbers = re.findall(r'\d+(?:\.?\d+)?', text) 244 | for number in numbers: 245 | text = text.replace(number, cn2an.an2cn(number), 1) 246 | return text 247 | 248 | 249 | def chinese_to_bopomofo(text): 250 | text = text.replace('、', ',').replace(';', ',').replace(':', ',') 251 | words = jieba.lcut(text, cut_all=False) 252 | text = '' 253 | for word in words: 254 | bopomofos = lazy_pinyin(word, BOPOMOFO) 255 | if not re.search('[\u4e00-\u9fff]', word): 256 | text += word 257 | continue 258 | for i in range(len(bopomofos)): 259 | bopomofos[i] = 
re.sub(r'([\u3105-\u3129])$', r'\1ˉ', bopomofos[i]) 260 | if text != '': 261 | text += ' ' 262 | text += ''.join(bopomofos) 263 | return text 264 | 265 | 266 | def latin_to_bopomofo(text): 267 | for regex, replacement in _latin_to_bopomofo: 268 | text = re.sub(regex, replacement, text) 269 | return text 270 | 271 | 272 | def bopomofo_to_romaji(text): 273 | for regex, replacement in _bopomofo_to_romaji: 274 | text = re.sub(regex, replacement, text) 275 | return text 276 | 277 | 278 | def bopomofo_to_ipa(text): 279 | for regex, replacement in _bopomofo_to_ipa: 280 | text = re.sub(regex, replacement, text) 281 | return text 282 | 283 | 284 | def bopomofo_to_ipa2(text): 285 | for regex, replacement in _bopomofo_to_ipa2: 286 | text = re.sub(regex, replacement, text) 287 | return text 288 | 289 | 290 | def chinese_to_romaji(text): 291 | text = number_to_chinese(text) 292 | text = chinese_to_bopomofo(text) 293 | text = latin_to_bopomofo(text) 294 | text = bopomofo_to_romaji(text) 295 | text = re.sub('i([aoe])', r'y\1', text) 296 | text = re.sub('u([aoəe])', r'w\1', text) 297 | text = re.sub('([ʦsɹ]`[⁼ʰ]?)([→↓↑ ]+|$)', 298 | r'\1ɹ`\2', text).replace('ɻ', 'ɹ`') 299 | text = re.sub('([ʦs][⁼ʰ]?)([→↓↑ ]+|$)', r'\1ɹ\2', text) 300 | return text 301 | 302 | 303 | def chinese_to_lazy_ipa(text): 304 | text = chinese_to_romaji(text) 305 | for regex, replacement in _romaji_to_ipa: 306 | text = re.sub(regex, replacement, text) 307 | return text 308 | 309 | 310 | def chinese_to_ipa(text): 311 | text = number_to_chinese(text) 312 | text = chinese_to_bopomofo(text) 313 | text = latin_to_bopomofo(text) 314 | text = bopomofo_to_ipa(text) 315 | text = re.sub('i([aoe])', r'j\1', text) 316 | text = re.sub('u([aoəe])', r'w\1', text) 317 | text = re.sub('([sɹ]`[⁼ʰ]?)([→↓↑ ]+|$)', 318 | r'\1ɹ`\2', text).replace('ɻ', 'ɹ`') 319 | text = re.sub('([s][⁼ʰ]?)([→↓↑ ]+|$)', r'\1ɹ\2', text) 320 | return text 321 | 322 | 323 | def chinese_to_ipa2(text): 324 | text = number_to_chinese(text) 325 | text = chinese_to_bopomofo(text) 326 | text = latin_to_bopomofo(text) 327 | text = bopomofo_to_ipa2(text) 328 | text = re.sub(r'i([aoe])', r'j\1', text) 329 | text = re.sub(r'u([aoəe])', r'w\1', text) 330 | text = re.sub(r'([ʂɹ]ʰ?)([˩˨˧˦˥ ]+|$)', r'\1ʅ\2', text) 331 | text = re.sub(r'(sʰ?)([˩˨˧˦˥ ]+|$)', r'\1ɿ\2', text) 332 | return text 333 | -------------------------------------------------------------------------------- /HoshiNoYume/actions/MoeGoe/transforms.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.nn import functional as F 3 | 4 | import numpy as np 5 | 6 | 7 | DEFAULT_MIN_BIN_WIDTH = 1e-3 8 | DEFAULT_MIN_BIN_HEIGHT = 1e-3 9 | DEFAULT_MIN_DERIVATIVE = 1e-3 10 | 11 | 12 | def piecewise_rational_quadratic_transform(inputs, 13 | unnormalized_widths, 14 | unnormalized_heights, 15 | unnormalized_derivatives, 16 | inverse=False, 17 | tails=None, 18 | tail_bound=1., 19 | min_bin_width=DEFAULT_MIN_BIN_WIDTH, 20 | min_bin_height=DEFAULT_MIN_BIN_HEIGHT, 21 | min_derivative=DEFAULT_MIN_DERIVATIVE): 22 | 23 | if tails is None: 24 | spline_fn = rational_quadratic_spline 25 | spline_kwargs = {} 26 | else: 27 | spline_fn = unconstrained_rational_quadratic_spline 28 | spline_kwargs = { 29 | 'tails': tails, 30 | 'tail_bound': tail_bound 31 | } 32 | 33 | outputs, logabsdet = spline_fn( 34 | inputs=inputs, 35 | unnormalized_widths=unnormalized_widths, 36 | unnormalized_heights=unnormalized_heights, 37 | unnormalized_derivatives=unnormalized_derivatives, 38 | inverse=inverse, 39 
| min_bin_width=min_bin_width, 40 | min_bin_height=min_bin_height, 41 | min_derivative=min_derivative, 42 | **spline_kwargs 43 | ) 44 | return outputs, logabsdet 45 | 46 | 47 | def searchsorted(bin_locations, inputs, eps=1e-6): 48 | bin_locations[..., -1] += eps 49 | return torch.sum( 50 | inputs[..., None] >= bin_locations, 51 | dim=-1 52 | ) - 1 53 | 54 | 55 | def unconstrained_rational_quadratic_spline(inputs, 56 | unnormalized_widths, 57 | unnormalized_heights, 58 | unnormalized_derivatives, 59 | inverse=False, 60 | tails='linear', 61 | tail_bound=1., 62 | min_bin_width=DEFAULT_MIN_BIN_WIDTH, 63 | min_bin_height=DEFAULT_MIN_BIN_HEIGHT, 64 | min_derivative=DEFAULT_MIN_DERIVATIVE): 65 | inside_interval_mask = (inputs >= -tail_bound) & (inputs <= tail_bound) 66 | outside_interval_mask = ~inside_interval_mask 67 | 68 | outputs = torch.zeros_like(inputs) 69 | logabsdet = torch.zeros_like(inputs) 70 | 71 | if tails == 'linear': 72 | unnormalized_derivatives = F.pad(unnormalized_derivatives, pad=(1, 1)) 73 | constant = np.log(np.exp(1 - min_derivative) - 1) 74 | unnormalized_derivatives[..., 0] = constant 75 | unnormalized_derivatives[..., -1] = constant 76 | 77 | outputs[outside_interval_mask] = inputs[outside_interval_mask] 78 | logabsdet[outside_interval_mask] = 0 79 | else: 80 | raise RuntimeError('{} tails are not implemented.'.format(tails)) 81 | 82 | outputs[inside_interval_mask], logabsdet[inside_interval_mask] = rational_quadratic_spline( 83 | inputs=inputs[inside_interval_mask], 84 | unnormalized_widths=unnormalized_widths[inside_interval_mask, :], 85 | unnormalized_heights=unnormalized_heights[inside_interval_mask, :], 86 | unnormalized_derivatives=unnormalized_derivatives[inside_interval_mask, :], 87 | inverse=inverse, 88 | left=-tail_bound, right=tail_bound, bottom=-tail_bound, top=tail_bound, 89 | min_bin_width=min_bin_width, 90 | min_bin_height=min_bin_height, 91 | min_derivative=min_derivative 92 | ) 93 | 94 | return outputs, logabsdet 95 | 96 | def rational_quadratic_spline(inputs, 97 | unnormalized_widths, 98 | unnormalized_heights, 99 | unnormalized_derivatives, 100 | inverse=False, 101 | left=0., right=1., bottom=0., top=1., 102 | min_bin_width=DEFAULT_MIN_BIN_WIDTH, 103 | min_bin_height=DEFAULT_MIN_BIN_HEIGHT, 104 | min_derivative=DEFAULT_MIN_DERIVATIVE): 105 | if torch.min(inputs) < left or torch.max(inputs) > right: 106 | raise ValueError('Input to a transform is not within its domain') 107 | 108 | num_bins = unnormalized_widths.shape[-1] 109 | 110 | if min_bin_width * num_bins > 1.0: 111 | raise ValueError('Minimal bin width too large for the number of bins') 112 | if min_bin_height * num_bins > 1.0: 113 | raise ValueError('Minimal bin height too large for the number of bins') 114 | 115 | widths = F.softmax(unnormalized_widths, dim=-1) 116 | widths = min_bin_width + (1 - min_bin_width * num_bins) * widths 117 | cumwidths = torch.cumsum(widths, dim=-1) 118 | cumwidths = F.pad(cumwidths, pad=(1, 0), mode='constant', value=0.0) 119 | cumwidths = (right - left) * cumwidths + left 120 | cumwidths[..., 0] = left 121 | cumwidths[..., -1] = right 122 | widths = cumwidths[..., 1:] - cumwidths[..., :-1] 123 | 124 | derivatives = min_derivative + F.softplus(unnormalized_derivatives) 125 | 126 | heights = F.softmax(unnormalized_heights, dim=-1) 127 | heights = min_bin_height + (1 - min_bin_height * num_bins) * heights 128 | cumheights = torch.cumsum(heights, dim=-1) 129 | cumheights = F.pad(cumheights, pad=(1, 0), mode='constant', value=0.0) 130 | cumheights = (top - bottom) * 
cumheights + bottom 131 | cumheights[..., 0] = bottom 132 | cumheights[..., -1] = top 133 | heights = cumheights[..., 1:] - cumheights[..., :-1] 134 | 135 | if inverse: 136 | bin_idx = searchsorted(cumheights, inputs)[..., None] 137 | else: 138 | bin_idx = searchsorted(cumwidths, inputs)[..., None] 139 | 140 | input_cumwidths = cumwidths.gather(-1, bin_idx)[..., 0] 141 | input_bin_widths = widths.gather(-1, bin_idx)[..., 0] 142 | 143 | input_cumheights = cumheights.gather(-1, bin_idx)[..., 0] 144 | delta = heights / widths 145 | input_delta = delta.gather(-1, bin_idx)[..., 0] 146 | 147 | input_derivatives = derivatives.gather(-1, bin_idx)[..., 0] 148 | input_derivatives_plus_one = derivatives[..., 1:].gather(-1, bin_idx)[..., 0] 149 | 150 | input_heights = heights.gather(-1, bin_idx)[..., 0] 151 | 152 | if inverse: 153 | a = (((inputs - input_cumheights) * (input_derivatives 154 | + input_derivatives_plus_one 155 | - 2 * input_delta) 156 | + input_heights * (input_delta - input_derivatives))) 157 | b = (input_heights * input_derivatives 158 | - (inputs - input_cumheights) * (input_derivatives 159 | + input_derivatives_plus_one 160 | - 2 * input_delta)) 161 | c = - input_delta * (inputs - input_cumheights) 162 | 163 | discriminant = b.pow(2) - 4 * a * c 164 | assert (discriminant >= 0).all() 165 | 166 | root = (2 * c) / (-b - torch.sqrt(discriminant)) 167 | outputs = root * input_bin_widths + input_cumwidths 168 | 169 | theta_one_minus_theta = root * (1 - root) 170 | denominator = input_delta + ((input_derivatives + input_derivatives_plus_one - 2 * input_delta) 171 | * theta_one_minus_theta) 172 | derivative_numerator = input_delta.pow(2) * (input_derivatives_plus_one * root.pow(2) 173 | + 2 * input_delta * theta_one_minus_theta 174 | + input_derivatives * (1 - root).pow(2)) 175 | logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator) 176 | 177 | return outputs, -logabsdet 178 | else: 179 | theta = (inputs - input_cumwidths) / input_bin_widths 180 | theta_one_minus_theta = theta * (1 - theta) 181 | 182 | numerator = input_heights * (input_delta * theta.pow(2) 183 | + input_derivatives * theta_one_minus_theta) 184 | denominator = input_delta + ((input_derivatives + input_derivatives_plus_one - 2 * input_delta) 185 | * theta_one_minus_theta) 186 | outputs = input_cumheights + numerator / denominator 187 | 188 | derivative_numerator = input_delta.pow(2) * (input_derivatives_plus_one * theta.pow(2) 189 | + 2 * input_delta * theta_one_minus_theta 190 | + input_derivatives * (1 - theta).pow(2)) 191 | logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator) 192 | 193 | return outputs, logabsdet 194 | -------------------------------------------------------------------------------- /HoshiNoYume/actions/MoeGoe/MoeGoe.py: -------------------------------------------------------------------------------- 1 | from scipy.io.wavfile import write 2 | from mel_processing import spectrogram_torch 3 | from text import text_to_sequence, _clean_text 4 | from models import SynthesizerTrn 5 | import utils 6 | import commons 7 | import sys 8 | import re 9 | from torch import no_grad, LongTensor 10 | import logging 11 | 12 | logging.getLogger('numba').setLevel(logging.WARNING) 13 | 14 | 15 | def ex_print(text, escape=False): 16 | if escape: 17 | print(text.encode('unicode_escape').decode()) 18 | else: 19 | print(text) 20 | 21 | 22 | def get_text(text, hps, cleaned=False): 23 | if cleaned: 24 | text_norm = text_to_sequence(text, hps.symbols, []) 25 | else: 26 | text_norm = 
text_to_sequence(text, hps.symbols, hps.data.text_cleaners) 27 | if hps.data.add_blank: 28 | text_norm = commons.intersperse(text_norm, 0) 29 | text_norm = LongTensor(text_norm) 30 | return text_norm 31 | 32 | 33 | def ask_if_continue(): 34 | while True: 35 | answer = input('Continue? (y/n): ') 36 | if answer == 'y': 37 | break 38 | elif answer == 'n': 39 | sys.exit(0) 40 | 41 | 42 | def print_speakers(speakers, escape=False): 43 | if len(speakers) > 100: 44 | return 45 | print('ID\tSpeaker') 46 | for id, name in enumerate(speakers): 47 | ex_print(str(id) + '\t' + name, escape) 48 | 49 | 50 | def get_speaker_id(message): 51 | speaker_id = input(message) 52 | try: 53 | speaker_id = int(speaker_id) 54 | except: 55 | print(str(speaker_id) + ' is not a valid ID!') 56 | sys.exit(1) 57 | return speaker_id 58 | 59 | 60 | def get_label_value(text, label, default, warning_name='value'): 61 | value = re.search(rf'\[{label}=(.+?)\]', text) 62 | if value: 63 | try: 64 | text = re.sub(rf'\[{label}=(.+?)\]', '', text, 1) 65 | value = float(value.group(1)) 66 | except: 67 | print(f'Invalid {warning_name}!') 68 | sys.exit(1) 69 | else: 70 | value = default 71 | return value, text 72 | 73 | 74 | def get_label(text, label): 75 | if f'[{label}]' in text: 76 | return True, text.replace(f'[{label}]', '') 77 | else: 78 | return False, text 79 | 80 | 81 | if __name__ == '__main__': 82 | if '--escape' in sys.argv: 83 | escape = True 84 | else: 85 | escape = False 86 | 87 | model = input('Path of a VITS model: ') 88 | config = input('Path of a config file: ') 89 | 90 | hps_ms = utils.get_hparams_from_file(config) 91 | n_speakers = hps_ms.data.n_speakers if 'n_speakers' in hps_ms.data.keys() else 0 92 | n_symbols = len(hps_ms.symbols) if 'symbols' in hps_ms.keys() else 0 93 | speakers = hps_ms.speakers if 'speakers' in hps_ms.keys() else ['0'] 94 | use_f0 = hps_ms.data.use_f0 if 'use_f0' in hps_ms.data.keys() else False 95 | emotion_embedding = hps_ms.data.emotion_embedding if 'emotion_embedding' in hps_ms.data.keys() else False 96 | 97 | net_g_ms = SynthesizerTrn( 98 | n_symbols, 99 | hps_ms.data.filter_length // 2 + 1, 100 | hps_ms.train.segment_size // hps_ms.data.hop_length, 101 | n_speakers=n_speakers, 102 | emotion_embedding=emotion_embedding, 103 | **hps_ms.model) 104 | _ = net_g_ms.eval() 105 | utils.load_checkpoint(model, net_g_ms) 106 | 107 | def voice_conversion(): 108 | audio_path = input('Path of an audio file to convert:\n') 109 | print_speakers(speakers) 110 | audio = utils.load_audio_to_torch( 111 | audio_path, hps_ms.data.sampling_rate) 112 | 113 | originnal_id = get_speaker_id('Original speaker ID: ') 114 | target_id = get_speaker_id('Target speaker ID: ') 115 | out_path = input('Path to save: ') 116 | 117 | y = audio.unsqueeze(0) 118 | 119 | spec = spectrogram_torch(y, hps_ms.data.filter_length, 120 | hps_ms.data.sampling_rate, hps_ms.data.hop_length, hps_ms.data.win_length, 121 | center=False) 122 | spec_lengths = LongTensor([spec.size(-1)]) 123 | sid_src = LongTensor([originnal_id]) 124 | 125 | with no_grad(): 126 | sid_tgt = LongTensor([target_id]) 127 | audio = net_g_ms.voice_conversion(spec, spec_lengths, sid_src=sid_src, sid_tgt=sid_tgt)[ 128 | 0][0, 0].data.cpu().float().numpy() 129 | return audio, out_path 130 | 131 | if n_symbols != 0: 132 | if not emotion_embedding: 133 | while True: 134 | choice = input('TTS or VC? 
(t/v):') 135 | if choice == 't': 136 | text = input('Text to read: ') 137 | if text == '[ADVANCED]': 138 | text = input('Raw text:') 139 | print('Cleaned text is:') 140 | ex_print(_clean_text( 141 | text, hps_ms.data.text_cleaners), escape) 142 | continue 143 | 144 | length_scale, text = get_label_value( 145 | text, 'LENGTH', 1, 'length scale') 146 | noise_scale, text = get_label_value( 147 | text, 'NOISE', 0.667, 'noise scale') 148 | noise_scale_w, text = get_label_value( 149 | text, 'NOISEW', 0.8, 'deviation of noise') 150 | cleaned, text = get_label(text, 'CLEANED') 151 | 152 | stn_tst = get_text(text, hps_ms, cleaned=cleaned) 153 | 154 | print_speakers(speakers, escape) 155 | speaker_id = get_speaker_id('Speaker ID: ') 156 | out_path = input('Path to save: ') 157 | 158 | with no_grad(): 159 | x_tst = stn_tst.unsqueeze(0) 160 | x_tst_lengths = LongTensor([stn_tst.size(0)]) 161 | sid = LongTensor([speaker_id]) 162 | audio = net_g_ms.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=noise_scale, 163 | noise_scale_w=noise_scale_w, length_scale=length_scale)[0][0, 0].data.cpu().float().numpy() 164 | 165 | elif choice == 'v': 166 | audio, out_path = voice_conversion() 167 | 168 | write(out_path, hps_ms.data.sampling_rate, audio) 169 | print('Successfully saved!') 170 | ask_if_continue() 171 | else: 172 | import os 173 | import librosa 174 | import numpy as np 175 | from torch import FloatTensor 176 | import audonnx 177 | w2v2_folder = input('Path of a w2v2 dimensional emotion model: ') 178 | w2v2_model = audonnx.load(os.path.dirname(w2v2_folder)) 179 | while True: 180 | choice = input('TTS or VC? (t/v):') 181 | if choice == 't': 182 | text = input('Text to read: ') 183 | if text == '[ADVANCED]': 184 | text = input('Raw text:') 185 | print('Cleaned text is:') 186 | ex_print(_clean_text( 187 | text, hps_ms.data.text_cleaners), escape) 188 | continue 189 | 190 | length_scale, text = get_label_value( 191 | text, 'LENGTH', 1, 'length scale') 192 | noise_scale, text = get_label_value( 193 | text, 'NOISE', 0.667, 'noise scale') 194 | noise_scale_w, text = get_label_value( 195 | text, 'NOISEW', 0.8, 'deviation of noise') 196 | cleaned, text = get_label(text, 'CLEANED') 197 | 198 | stn_tst = get_text(text, hps_ms, cleaned=cleaned) 199 | 200 | print_speakers(speakers, escape) 201 | speaker_id = get_speaker_id('Speaker ID: ') 202 | 203 | emotion_reference = input('Path of an emotion reference: ') 204 | if emotion_reference.endswith('.npy'): 205 | emotion = np.load(emotion_reference) 206 | emotion = FloatTensor(emotion).unsqueeze(0) 207 | else: 208 | audio16000, sampling_rate = librosa.load( 209 | emotion_reference, sr=16000, mono=True) 210 | emotion = w2v2_model(audio16000, sampling_rate)[ 211 | 'hidden_states'] 212 | emotion_reference = re.sub( 213 | r'\..*$', '', emotion_reference) 214 | np.save(emotion_reference, emotion.squeeze(0)) 215 | emotion = FloatTensor(emotion) 216 | 217 | out_path = input('Path to save: ') 218 | 219 | with no_grad(): 220 | x_tst = stn_tst.unsqueeze(0) 221 | x_tst_lengths = LongTensor([stn_tst.size(0)]) 222 | sid = LongTensor([speaker_id]) 223 | audio = net_g_ms.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=noise_scale, noise_scale_w=noise_scale_w, 224 | length_scale=length_scale, emotion_embedding=emotion)[0][0, 0].data.cpu().float().numpy() 225 | 226 | elif choice == 'v': 227 | audio, out_path = voice_conversion() 228 | 229 | write(out_path, hps_ms.data.sampling_rate, audio) 230 | print('Successfully saved!') 231 | ask_if_continue() 232 | else: 233 | model = input('Path 
of a hubert-soft model: ') 234 | from hubert_model import hubert_soft 235 | hubert = hubert_soft(model) 236 | 237 | while True: 238 | audio_path = input('Path of an audio file to convert:\n') 239 | 240 | if audio_path != '[VC]': 241 | import librosa 242 | if use_f0: 243 | audio, sampling_rate = librosa.load( 244 | audio_path, sr=hps_ms.data.sampling_rate, mono=True) 245 | audio16000 = librosa.resample( 246 | audio, orig_sr=sampling_rate, target_sr=16000) 247 | else: 248 | audio16000, sampling_rate = librosa.load( 249 | audio_path, sr=16000, mono=True) 250 | 251 | print_speakers(speakers, escape) 252 | target_id = get_speaker_id('Target speaker ID: ') 253 | out_path = input('Path to save: ') 254 | length_scale, out_path = get_label_value( 255 | out_path, 'LENGTH', 1, 'length scale') 256 | noise_scale, out_path = get_label_value( 257 | out_path, 'NOISE', 0.1, 'noise scale') 258 | noise_scale_w, out_path = get_label_value( 259 | out_path, 'NOISEW', 0.1, 'deviation of noise') 260 | 261 | from torch import inference_mode, FloatTensor 262 | import numpy as np 263 | with inference_mode(): 264 | units = hubert.units(FloatTensor(audio16000).unsqueeze( 265 | 0).unsqueeze(0)).squeeze(0).numpy() 266 | if use_f0: 267 | f0_scale, out_path = get_label_value( 268 | out_path, 'F0', 1, 'f0 scale') 269 | f0 = librosa.pyin(audio, sr=sampling_rate, 270 | fmin=librosa.note_to_hz('C0'), 271 | fmax=librosa.note_to_hz('C7'), 272 | frame_length=1780)[0] 273 | target_length = len(units[:, 0]) 274 | f0 = np.nan_to_num(np.interp(np.arange(0, len(f0)*target_length, len(f0))/target_length, 275 | np.arange(0, len(f0)), f0)) * f0_scale 276 | units[:, 0] = f0 / 10 277 | 278 | stn_tst = FloatTensor(units) 279 | with no_grad(): 280 | x_tst = stn_tst.unsqueeze(0) 281 | x_tst_lengths = LongTensor([stn_tst.size(0)]) 282 | sid = LongTensor([target_id]) 283 | audio = net_g_ms.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=noise_scale, 284 | noise_scale_w=noise_scale_w, length_scale=length_scale)[0][0, 0].data.float().numpy() 285 | 286 | else: 287 | audio, out_path = voice_conversion() 288 | 289 | write(out_path, hps_ms.data.sampling_rate, audio) 290 | print('Successfully saved!') 291 | ask_if_continue() 292 | -------------------------------------------------------------------------------- /HoshiNoYume/actions/MoeGoe/attentions.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | from torch import nn 4 | from torch.nn import functional as F 5 | 6 | import commons 7 | from modules import LayerNorm 8 | 9 | 10 | class Encoder(nn.Module): 11 | def __init__(self, hidden_channels, filter_channels, n_heads, n_layers, kernel_size=1, p_dropout=0., window_size=4, **kwargs): 12 | super().__init__() 13 | self.hidden_channels = hidden_channels 14 | self.filter_channels = filter_channels 15 | self.n_heads = n_heads 16 | self.n_layers = n_layers 17 | self.kernel_size = kernel_size 18 | self.p_dropout = p_dropout 19 | self.window_size = window_size 20 | 21 | self.drop = nn.Dropout(p_dropout) 22 | self.attn_layers = nn.ModuleList() 23 | self.norm_layers_1 = nn.ModuleList() 24 | self.ffn_layers = nn.ModuleList() 25 | self.norm_layers_2 = nn.ModuleList() 26 | for i in range(self.n_layers): 27 | self.attn_layers.append(MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout, window_size=window_size)) 28 | self.norm_layers_1.append(LayerNorm(hidden_channels)) 29 | self.ffn_layers.append(FFN(hidden_channels, hidden_channels, filter_channels, kernel_size, 
p_dropout=p_dropout)) 30 | self.norm_layers_2.append(LayerNorm(hidden_channels)) 31 | 32 | def forward(self, x, x_mask): 33 | attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1) 34 | x = x * x_mask 35 | for i in range(self.n_layers): 36 | y = self.attn_layers[i](x, x, attn_mask) 37 | y = self.drop(y) 38 | x = self.norm_layers_1[i](x + y) 39 | 40 | y = self.ffn_layers[i](x, x_mask) 41 | y = self.drop(y) 42 | x = self.norm_layers_2[i](x + y) 43 | x = x * x_mask 44 | return x 45 | 46 | 47 | class Decoder(nn.Module): 48 | def __init__(self, hidden_channels, filter_channels, n_heads, n_layers, kernel_size=1, p_dropout=0., proximal_bias=False, proximal_init=True, **kwargs): 49 | super().__init__() 50 | self.hidden_channels = hidden_channels 51 | self.filter_channels = filter_channels 52 | self.n_heads = n_heads 53 | self.n_layers = n_layers 54 | self.kernel_size = kernel_size 55 | self.p_dropout = p_dropout 56 | self.proximal_bias = proximal_bias 57 | self.proximal_init = proximal_init 58 | 59 | self.drop = nn.Dropout(p_dropout) 60 | self.self_attn_layers = nn.ModuleList() 61 | self.norm_layers_0 = nn.ModuleList() 62 | self.encdec_attn_layers = nn.ModuleList() 63 | self.norm_layers_1 = nn.ModuleList() 64 | self.ffn_layers = nn.ModuleList() 65 | self.norm_layers_2 = nn.ModuleList() 66 | for i in range(self.n_layers): 67 | self.self_attn_layers.append(MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout, proximal_bias=proximal_bias, proximal_init=proximal_init)) 68 | self.norm_layers_0.append(LayerNorm(hidden_channels)) 69 | self.encdec_attn_layers.append(MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout)) 70 | self.norm_layers_1.append(LayerNorm(hidden_channels)) 71 | self.ffn_layers.append(FFN(hidden_channels, hidden_channels, filter_channels, kernel_size, p_dropout=p_dropout, causal=True)) 72 | self.norm_layers_2.append(LayerNorm(hidden_channels)) 73 | 74 | def forward(self, x, x_mask, h, h_mask): 75 | """ 76 | x: decoder input 77 | h: encoder output 78 | """ 79 | self_attn_mask = commons.subsequent_mask(x_mask.size(2)).to(device=x.device, dtype=x.dtype) 80 | encdec_attn_mask = h_mask.unsqueeze(2) * x_mask.unsqueeze(-1) 81 | x = x * x_mask 82 | for i in range(self.n_layers): 83 | y = self.self_attn_layers[i](x, x, self_attn_mask) 84 | y = self.drop(y) 85 | x = self.norm_layers_0[i](x + y) 86 | 87 | y = self.encdec_attn_layers[i](x, h, encdec_attn_mask) 88 | y = self.drop(y) 89 | x = self.norm_layers_1[i](x + y) 90 | 91 | y = self.ffn_layers[i](x, x_mask) 92 | y = self.drop(y) 93 | x = self.norm_layers_2[i](x + y) 94 | x = x * x_mask 95 | return x 96 | 97 | 98 | class MultiHeadAttention(nn.Module): 99 | def __init__(self, channels, out_channels, n_heads, p_dropout=0., window_size=None, heads_share=True, block_length=None, proximal_bias=False, proximal_init=False): 100 | super().__init__() 101 | assert channels % n_heads == 0 102 | 103 | self.channels = channels 104 | self.out_channels = out_channels 105 | self.n_heads = n_heads 106 | self.p_dropout = p_dropout 107 | self.window_size = window_size 108 | self.heads_share = heads_share 109 | self.block_length = block_length 110 | self.proximal_bias = proximal_bias 111 | self.proximal_init = proximal_init 112 | self.attn = None 113 | 114 | self.k_channels = channels // n_heads 115 | self.conv_q = nn.Conv1d(channels, channels, 1) 116 | self.conv_k = nn.Conv1d(channels, channels, 1) 117 | self.conv_v = nn.Conv1d(channels, channels, 1) 118 | self.conv_o = nn.Conv1d(channels, out_channels, 
1) 119 | self.drop = nn.Dropout(p_dropout) 120 | 121 | if window_size is not None: 122 | n_heads_rel = 1 if heads_share else n_heads 123 | rel_stddev = self.k_channels**-0.5 124 | self.emb_rel_k = nn.Parameter(torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev) 125 | self.emb_rel_v = nn.Parameter(torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev) 126 | 127 | nn.init.xavier_uniform_(self.conv_q.weight) 128 | nn.init.xavier_uniform_(self.conv_k.weight) 129 | nn.init.xavier_uniform_(self.conv_v.weight) 130 | if proximal_init: 131 | with torch.no_grad(): 132 | self.conv_k.weight.copy_(self.conv_q.weight) 133 | self.conv_k.bias.copy_(self.conv_q.bias) 134 | 135 | def forward(self, x, c, attn_mask=None): 136 | q = self.conv_q(x) 137 | k = self.conv_k(c) 138 | v = self.conv_v(c) 139 | 140 | x, self.attn = self.attention(q, k, v, mask=attn_mask) 141 | 142 | x = self.conv_o(x) 143 | return x 144 | 145 | def attention(self, query, key, value, mask=None): 146 | # reshape [b, d, t] -> [b, n_h, t, d_k] 147 | b, d, t_s, t_t = (*key.size(), query.size(2)) 148 | query = query.view(b, self.n_heads, self.k_channels, t_t).transpose(2, 3) 149 | key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3) 150 | value = value.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3) 151 | 152 | scores = torch.matmul(query / math.sqrt(self.k_channels), key.transpose(-2, -1)) 153 | if self.window_size is not None: 154 | assert t_s == t_t, "Relative attention is only available for self-attention." 155 | key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, t_s) 156 | rel_logits = self._matmul_with_relative_keys(query /math.sqrt(self.k_channels), key_relative_embeddings) 157 | scores_local = self._relative_position_to_absolute_position(rel_logits) 158 | scores = scores + scores_local 159 | if self.proximal_bias: 160 | assert t_s == t_t, "Proximal bias is only available for self-attention." 161 | scores = scores + self._attention_bias_proximal(t_s).to(device=scores.device, dtype=scores.dtype) 162 | if mask is not None: 163 | scores = scores.masked_fill(mask == 0, -1e4) 164 | if self.block_length is not None: 165 | assert t_s == t_t, "Local attention is only available for self-attention." 
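        # keep only a local band of keys around each query: triu(-block_length).tril(block_length)
        # retains positions with |i - j| <= block_length, and everything outside the band is
        # filled with -1e4 so it effectively vanishes in the softmax below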
166 | block_mask = torch.ones_like(scores).triu(-self.block_length).tril(self.block_length) 167 | scores = scores.masked_fill(block_mask == 0, -1e4) 168 | p_attn = F.softmax(scores, dim=-1) # [b, n_h, t_t, t_s] 169 | p_attn = self.drop(p_attn) 170 | output = torch.matmul(p_attn, value) 171 | if self.window_size is not None: 172 | relative_weights = self._absolute_position_to_relative_position(p_attn) 173 | value_relative_embeddings = self._get_relative_embeddings(self.emb_rel_v, t_s) 174 | output = output + self._matmul_with_relative_values(relative_weights, value_relative_embeddings) 175 | output = output.transpose(2, 3).contiguous().view(b, d, t_t) # [b, n_h, t_t, d_k] -> [b, d, t_t] 176 | return output, p_attn 177 | 178 | def _matmul_with_relative_values(self, x, y): 179 | """ 180 | x: [b, h, l, m] 181 | y: [h or 1, m, d] 182 | ret: [b, h, l, d] 183 | """ 184 | ret = torch.matmul(x, y.unsqueeze(0)) 185 | return ret 186 | 187 | def _matmul_with_relative_keys(self, x, y): 188 | """ 189 | x: [b, h, l, d] 190 | y: [h or 1, m, d] 191 | ret: [b, h, l, m] 192 | """ 193 | ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1)) 194 | return ret 195 | 196 | def _get_relative_embeddings(self, relative_embeddings, length): 197 | max_relative_position = 2 * self.window_size + 1 198 | # Pad first before slice to avoid using cond ops. 199 | pad_length = max(length - (self.window_size + 1), 0) 200 | slice_start_position = max((self.window_size + 1) - length, 0) 201 | slice_end_position = slice_start_position + 2 * length - 1 202 | if pad_length > 0: 203 | padded_relative_embeddings = F.pad( 204 | relative_embeddings, 205 | commons.convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]])) 206 | else: 207 | padded_relative_embeddings = relative_embeddings 208 | used_relative_embeddings = padded_relative_embeddings[:,slice_start_position:slice_end_position] 209 | return used_relative_embeddings 210 | 211 | def _relative_position_to_absolute_position(self, x): 212 | """ 213 | x: [b, h, l, 2*l-1] 214 | ret: [b, h, l, l] 215 | """ 216 | batch, heads, length, _ = x.size() 217 | # Concat columns of pad to shift from relative to absolute indexing. 218 | x = F.pad(x, commons.convert_pad_shape([[0,0],[0,0],[0,0],[0,1]])) 219 | 220 | # Concat extra elements so to add up to shape (len+1, 2*len-1). 221 | x_flat = x.view([batch, heads, length * 2 * length]) 222 | x_flat = F.pad(x_flat, commons.convert_pad_shape([[0,0],[0,0],[0,length-1]])) 223 | 224 | # Reshape and slice out the padded elements. 225 | x_final = x_flat.view([batch, heads, length+1, 2*length-1])[:, :, :length, length-1:] 226 | return x_final 227 | 228 | def _absolute_position_to_relative_position(self, x): 229 | """ 230 | x: [b, h, l, l] 231 | ret: [b, h, l, 2*l-1] 232 | """ 233 | batch, heads, length, _ = x.size() 234 | # padd along column 235 | x = F.pad(x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length-1]])) 236 | x_flat = x.view([batch, heads, length**2 + length*(length -1)]) 237 | # add 0's in the beginning that will skew the elements after reshape 238 | x_flat = F.pad(x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [length, 0]])) 239 | x_final = x_flat.view([batch, heads, length, 2*length])[:,:,:,1:] 240 | return x_final 241 | 242 | def _attention_bias_proximal(self, length): 243 | """Bias for self-attention to encourage attention to close positions. 244 | Args: 245 | length: an integer scalar. 
246 | Returns: 247 | a Tensor with shape [1, 1, length, length] 248 | """ 249 | r = torch.arange(length, dtype=torch.float32) 250 | diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1) 251 | return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0) 252 | 253 | 254 | class FFN(nn.Module): 255 | def __init__(self, in_channels, out_channels, filter_channels, kernel_size, p_dropout=0., activation=None, causal=False): 256 | super().__init__() 257 | self.in_channels = in_channels 258 | self.out_channels = out_channels 259 | self.filter_channels = filter_channels 260 | self.kernel_size = kernel_size 261 | self.p_dropout = p_dropout 262 | self.activation = activation 263 | self.causal = causal 264 | 265 | if causal: 266 | self.padding = self._causal_padding 267 | else: 268 | self.padding = self._same_padding 269 | 270 | self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size) 271 | self.conv_2 = nn.Conv1d(filter_channels, out_channels, kernel_size) 272 | self.drop = nn.Dropout(p_dropout) 273 | 274 | def forward(self, x, x_mask): 275 | x = self.conv_1(self.padding(x * x_mask)) 276 | if self.activation == "gelu": 277 | x = x * torch.sigmoid(1.702 * x) 278 | else: 279 | x = torch.relu(x) 280 | x = self.drop(x) 281 | x = self.conv_2(self.padding(x * x_mask)) 282 | return x * x_mask 283 | 284 | def _causal_padding(self, x): 285 | if self.kernel_size == 1: 286 | return x 287 | pad_l = self.kernel_size - 1 288 | pad_r = 0 289 | padding = [[0, 0], [0, 0], [pad_l, pad_r]] 290 | x = F.pad(x, commons.convert_pad_shape(padding)) 291 | return x 292 | 293 | def _same_padding(self, x): 294 | if self.kernel_size == 1: 295 | return x 296 | pad_l = (self.kernel_size - 1) // 2 297 | pad_r = self.kernel_size // 2 298 | padding = [[0, 0], [0, 0], [pad_l, pad_r]] 299 | x = F.pad(x, commons.convert_pad_shape(padding)) 300 | return x 301 | -------------------------------------------------------------------------------- /HoshiNoYume/actions/MoeGoe/modules.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | from torch import nn 4 | from torch.nn import functional as F 5 | 6 | from torch.nn import Conv1d 7 | from torch.nn.utils import weight_norm, remove_weight_norm 8 | 9 | import commons 10 | from commons import init_weights, get_padding 11 | from transforms import piecewise_rational_quadratic_transform 12 | 13 | 14 | LRELU_SLOPE = 0.1 15 | 16 | 17 | class LayerNorm(nn.Module): 18 | def __init__(self, channels, eps=1e-5): 19 | super().__init__() 20 | self.channels = channels 21 | self.eps = eps 22 | 23 | self.gamma = nn.Parameter(torch.ones(channels)) 24 | self.beta = nn.Parameter(torch.zeros(channels)) 25 | 26 | def forward(self, x): 27 | x = x.transpose(1, -1) 28 | x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps) 29 | return x.transpose(1, -1) 30 | 31 | 32 | class ConvReluNorm(nn.Module): 33 | def __init__(self, in_channels, hidden_channels, out_channels, kernel_size, n_layers, p_dropout): 34 | super().__init__() 35 | self.in_channels = in_channels 36 | self.hidden_channels = hidden_channels 37 | self.out_channels = out_channels 38 | self.kernel_size = kernel_size 39 | self.n_layers = n_layers 40 | self.p_dropout = p_dropout 41 | assert n_layers > 1, "Number of layers should be larger than 0." 
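    # n_layers Conv1d -> LayerNorm blocks share one ReLU+Dropout stage; the closing 1x1 projection
    # below is zero-initialised, so the residual connection in forward() starts out as an identity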
42 | 43 | self.conv_layers = nn.ModuleList() 44 | self.norm_layers = nn.ModuleList() 45 | self.conv_layers.append(nn.Conv1d(in_channels, hidden_channels, kernel_size, padding=kernel_size//2)) 46 | self.norm_layers.append(LayerNorm(hidden_channels)) 47 | self.relu_drop = nn.Sequential( 48 | nn.ReLU(), 49 | nn.Dropout(p_dropout)) 50 | for _ in range(n_layers-1): 51 | self.conv_layers.append(nn.Conv1d(hidden_channels, hidden_channels, kernel_size, padding=kernel_size//2)) 52 | self.norm_layers.append(LayerNorm(hidden_channels)) 53 | self.proj = nn.Conv1d(hidden_channels, out_channels, 1) 54 | self.proj.weight.data.zero_() 55 | self.proj.bias.data.zero_() 56 | 57 | def forward(self, x, x_mask): 58 | x_org = x 59 | for i in range(self.n_layers): 60 | x = self.conv_layers[i](x * x_mask) 61 | x = self.norm_layers[i](x) 62 | x = self.relu_drop(x) 63 | x = x_org + self.proj(x) 64 | return x * x_mask 65 | 66 | 67 | class DDSConv(nn.Module): 68 | """ 69 | Dilated and Depth-Separable Convolution 70 | """ 71 | def __init__(self, channels, kernel_size, n_layers, p_dropout=0.): 72 | super().__init__() 73 | self.channels = channels 74 | self.kernel_size = kernel_size 75 | self.n_layers = n_layers 76 | self.p_dropout = p_dropout 77 | 78 | self.drop = nn.Dropout(p_dropout) 79 | self.convs_sep = nn.ModuleList() 80 | self.convs_1x1 = nn.ModuleList() 81 | self.norms_1 = nn.ModuleList() 82 | self.norms_2 = nn.ModuleList() 83 | for i in range(n_layers): 84 | dilation = kernel_size ** i 85 | padding = (kernel_size * dilation - dilation) // 2 86 | self.convs_sep.append(nn.Conv1d(channels, channels, kernel_size, 87 | groups=channels, dilation=dilation, padding=padding 88 | )) 89 | self.convs_1x1.append(nn.Conv1d(channels, channels, 1)) 90 | self.norms_1.append(LayerNorm(channels)) 91 | self.norms_2.append(LayerNorm(channels)) 92 | 93 | def forward(self, x, x_mask, g=None): 94 | if g is not None: 95 | x = x + g 96 | for i in range(self.n_layers): 97 | y = self.convs_sep[i](x * x_mask) 98 | y = self.norms_1[i](y) 99 | y = F.gelu(y) 100 | y = self.convs_1x1[i](y) 101 | y = self.norms_2[i](y) 102 | y = F.gelu(y) 103 | y = self.drop(y) 104 | x = x + y 105 | return x * x_mask 106 | 107 | 108 | class WN(torch.nn.Module): 109 | def __init__(self, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=0, p_dropout=0): 110 | super(WN, self).__init__() 111 | assert(kernel_size % 2 == 1) 112 | self.hidden_channels =hidden_channels 113 | self.kernel_size = kernel_size, 114 | self.dilation_rate = dilation_rate 115 | self.n_layers = n_layers 116 | self.gin_channels = gin_channels 117 | self.p_dropout = p_dropout 118 | 119 | self.in_layers = torch.nn.ModuleList() 120 | self.res_skip_layers = torch.nn.ModuleList() 121 | self.drop = nn.Dropout(p_dropout) 122 | 123 | if gin_channels != 0: 124 | cond_layer = torch.nn.Conv1d(gin_channels, 2*hidden_channels*n_layers, 1) 125 | self.cond_layer = torch.nn.utils.weight_norm(cond_layer, name='weight') 126 | 127 | for i in range(n_layers): 128 | dilation = dilation_rate ** i 129 | padding = int((kernel_size * dilation - dilation) / 2) 130 | in_layer = torch.nn.Conv1d(hidden_channels, 2*hidden_channels, kernel_size, 131 | dilation=dilation, padding=padding) 132 | in_layer = torch.nn.utils.weight_norm(in_layer, name='weight') 133 | self.in_layers.append(in_layer) 134 | 135 | # last one is not necessary 136 | if i < n_layers - 1: 137 | res_skip_channels = 2 * hidden_channels 138 | else: 139 | res_skip_channels = hidden_channels 140 | 141 | res_skip_layer = 
torch.nn.Conv1d(hidden_channels, res_skip_channels, 1) 142 | res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name='weight') 143 | self.res_skip_layers.append(res_skip_layer) 144 | 145 | def forward(self, x, x_mask, g=None, **kwargs): 146 | output = torch.zeros_like(x) 147 | n_channels_tensor = torch.IntTensor([self.hidden_channels]) 148 | 149 | if g is not None: 150 | g = self.cond_layer(g) 151 | 152 | for i in range(self.n_layers): 153 | x_in = self.in_layers[i](x) 154 | if g is not None: 155 | cond_offset = i * 2 * self.hidden_channels 156 | g_l = g[:,cond_offset:cond_offset+2*self.hidden_channels,:] 157 | else: 158 | g_l = torch.zeros_like(x_in) 159 | 160 | acts = commons.fused_add_tanh_sigmoid_multiply( 161 | x_in, 162 | g_l, 163 | n_channels_tensor) 164 | acts = self.drop(acts) 165 | 166 | res_skip_acts = self.res_skip_layers[i](acts) 167 | if i < self.n_layers - 1: 168 | res_acts = res_skip_acts[:,:self.hidden_channels,:] 169 | x = (x + res_acts) * x_mask 170 | output = output + res_skip_acts[:,self.hidden_channels:,:] 171 | else: 172 | output = output + res_skip_acts 173 | return output * x_mask 174 | 175 | def remove_weight_norm(self): 176 | if self.gin_channels != 0: 177 | torch.nn.utils.remove_weight_norm(self.cond_layer) 178 | for l in self.in_layers: 179 | torch.nn.utils.remove_weight_norm(l) 180 | for l in self.res_skip_layers: 181 | torch.nn.utils.remove_weight_norm(l) 182 | 183 | 184 | class ResBlock1(torch.nn.Module): 185 | def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)): 186 | super(ResBlock1, self).__init__() 187 | self.convs1 = nn.ModuleList([ 188 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0], 189 | padding=get_padding(kernel_size, dilation[0]))), 190 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1], 191 | padding=get_padding(kernel_size, dilation[1]))), 192 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[2], 193 | padding=get_padding(kernel_size, dilation[2]))) 194 | ]) 195 | self.convs1.apply(init_weights) 196 | 197 | self.convs2 = nn.ModuleList([ 198 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, 199 | padding=get_padding(kernel_size, 1))), 200 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, 201 | padding=get_padding(kernel_size, 1))), 202 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, 203 | padding=get_padding(kernel_size, 1))) 204 | ]) 205 | self.convs2.apply(init_weights) 206 | 207 | def forward(self, x, x_mask=None): 208 | for c1, c2 in zip(self.convs1, self.convs2): 209 | xt = F.leaky_relu(x, LRELU_SLOPE) 210 | if x_mask is not None: 211 | xt = xt * x_mask 212 | xt = c1(xt) 213 | xt = F.leaky_relu(xt, LRELU_SLOPE) 214 | if x_mask is not None: 215 | xt = xt * x_mask 216 | xt = c2(xt) 217 | x = xt + x 218 | if x_mask is not None: 219 | x = x * x_mask 220 | return x 221 | 222 | def remove_weight_norm(self): 223 | for l in self.convs1: 224 | remove_weight_norm(l) 225 | for l in self.convs2: 226 | remove_weight_norm(l) 227 | 228 | 229 | class ResBlock2(torch.nn.Module): 230 | def __init__(self, channels, kernel_size=3, dilation=(1, 3)): 231 | super(ResBlock2, self).__init__() 232 | self.convs = nn.ModuleList([ 233 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0], 234 | padding=get_padding(kernel_size, dilation[0]))), 235 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1], 236 | padding=get_padding(kernel_size, dilation[1]))) 237 | ]) 
238 | self.convs.apply(init_weights) 239 | 240 | def forward(self, x, x_mask=None): 241 | for c in self.convs: 242 | xt = F.leaky_relu(x, LRELU_SLOPE) 243 | if x_mask is not None: 244 | xt = xt * x_mask 245 | xt = c(xt) 246 | x = xt + x 247 | if x_mask is not None: 248 | x = x * x_mask 249 | return x 250 | 251 | def remove_weight_norm(self): 252 | for l in self.convs: 253 | remove_weight_norm(l) 254 | 255 | 256 | class Log(nn.Module): 257 | def forward(self, x, x_mask, reverse=False, **kwargs): 258 | if not reverse: 259 | y = torch.log(torch.clamp_min(x, 1e-5)) * x_mask 260 | logdet = torch.sum(-y, [1, 2]) 261 | return y, logdet 262 | else: 263 | x = torch.exp(x) * x_mask 264 | return x 265 | 266 | 267 | class Flip(nn.Module): 268 | def forward(self, x, *args, reverse=False, **kwargs): 269 | x = torch.flip(x, [1]) 270 | if not reverse: 271 | logdet = torch.zeros(x.size(0)).to(dtype=x.dtype, device=x.device) 272 | return x, logdet 273 | else: 274 | return x 275 | 276 | 277 | class ElementwiseAffine(nn.Module): 278 | def __init__(self, channels): 279 | super().__init__() 280 | self.channels = channels 281 | self.m = nn.Parameter(torch.zeros(channels,1)) 282 | self.logs = nn.Parameter(torch.zeros(channels,1)) 283 | 284 | def forward(self, x, x_mask, reverse=False, **kwargs): 285 | if not reverse: 286 | y = self.m + torch.exp(self.logs) * x 287 | y = y * x_mask 288 | logdet = torch.sum(self.logs * x_mask, [1,2]) 289 | return y, logdet 290 | else: 291 | x = (x - self.m) * torch.exp(-self.logs) * x_mask 292 | return x 293 | 294 | 295 | class ResidualCouplingLayer(nn.Module): 296 | def __init__(self, 297 | channels, 298 | hidden_channels, 299 | kernel_size, 300 | dilation_rate, 301 | n_layers, 302 | p_dropout=0, 303 | gin_channels=0, 304 | mean_only=False): 305 | assert channels % 2 == 0, "channels should be divisible by 2" 306 | super().__init__() 307 | self.channels = channels 308 | self.hidden_channels = hidden_channels 309 | self.kernel_size = kernel_size 310 | self.dilation_rate = dilation_rate 311 | self.n_layers = n_layers 312 | self.half_channels = channels // 2 313 | self.mean_only = mean_only 314 | 315 | self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1) 316 | self.enc = WN(hidden_channels, kernel_size, dilation_rate, n_layers, p_dropout=p_dropout, gin_channels=gin_channels) 317 | self.post = nn.Conv1d(hidden_channels, self.half_channels * (2 - mean_only), 1) 318 | self.post.weight.data.zero_() 319 | self.post.bias.data.zero_() 320 | 321 | def forward(self, x, x_mask, g=None, reverse=False): 322 | x0, x1 = torch.split(x, [self.half_channels]*2, 1) 323 | h = self.pre(x0) * x_mask 324 | h = self.enc(h, x_mask, g=g) 325 | stats = self.post(h) * x_mask 326 | if not self.mean_only: 327 | m, logs = torch.split(stats, [self.half_channels]*2, 1) 328 | else: 329 | m = stats 330 | logs = torch.zeros_like(m) 331 | 332 | if not reverse: 333 | x1 = m + x1 * torch.exp(logs) * x_mask 334 | x = torch.cat([x0, x1], 1) 335 | logdet = torch.sum(logs, [1,2]) 336 | return x, logdet 337 | else: 338 | x1 = (x1 - m) * torch.exp(-logs) * x_mask 339 | x = torch.cat([x0, x1], 1) 340 | return x 341 | 342 | 343 | class ConvFlow(nn.Module): 344 | def __init__(self, in_channels, filter_channels, kernel_size, n_layers, num_bins=10, tail_bound=5.0): 345 | super().__init__() 346 | self.in_channels = in_channels 347 | self.filter_channels = filter_channels 348 | self.kernel_size = kernel_size 349 | self.n_layers = n_layers 350 | self.num_bins = num_bins 351 | self.tail_bound = tail_bound 352 | 
self.half_channels = in_channels // 2 353 | 354 | self.pre = nn.Conv1d(self.half_channels, filter_channels, 1) 355 | self.convs = DDSConv(filter_channels, kernel_size, n_layers, p_dropout=0.) 356 | self.proj = nn.Conv1d(filter_channels, self.half_channels * (num_bins * 3 - 1), 1) 357 | self.proj.weight.data.zero_() 358 | self.proj.bias.data.zero_() 359 | 360 | def forward(self, x, x_mask, g=None, reverse=False): 361 | x0, x1 = torch.split(x, [self.half_channels]*2, 1) 362 | h = self.pre(x0) 363 | h = self.convs(h, x_mask, g=g) 364 | h = self.proj(h) * x_mask 365 | 366 | b, c, t = x0.shape 367 | h = h.reshape(b, c, -1, t).permute(0, 1, 3, 2) # [b, cx?, t] -> [b, c, t, ?] 368 | 369 | unnormalized_widths = h[..., :self.num_bins] / math.sqrt(self.filter_channels) 370 | unnormalized_heights = h[..., self.num_bins:2*self.num_bins] / math.sqrt(self.filter_channels) 371 | unnormalized_derivatives = h[..., 2 * self.num_bins:] 372 | 373 | x1, logabsdet = piecewise_rational_quadratic_transform(x1, 374 | unnormalized_widths, 375 | unnormalized_heights, 376 | unnormalized_derivatives, 377 | inverse=reverse, 378 | tails='linear', 379 | tail_bound=self.tail_bound 380 | ) 381 | 382 | x = torch.cat([x0, x1], 1) * x_mask 383 | logdet = torch.sum(logabsdet * x_mask, [1,2]) 384 | if not reverse: 385 | return x, logdet 386 | else: 387 | return x 388 | -------------------------------------------------------------------------------- /HoshiNoYume/actions/MoeGoe/models.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | from torch import nn 4 | from torch.nn import functional as F 5 | 6 | import commons 7 | import modules 8 | import attentions 9 | 10 | from torch.nn import Conv1d, ConvTranspose1d 11 | from torch.nn.utils import weight_norm 12 | from commons import init_weights 13 | 14 | 15 | class StochasticDurationPredictor(nn.Module): 16 | def __init__(self, in_channels, filter_channels, kernel_size, p_dropout, n_flows=4, gin_channels=0): 17 | super().__init__() 18 | filter_channels = in_channels # it needs to be removed from future version. 
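    # Flow-based duration model (the VITS stochastic duration predictor): with
    # reverse=False the forward pass returns the negative log-likelihood of the
    # ground-truth durations w (plus the posterior-flow term) for training;
    # with reverse=True it samples log-durations by pushing noise through the
    # inverted flows, conditioned on the text encoding x.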
19 | self.in_channels = in_channels 20 | self.filter_channels = filter_channels 21 | self.kernel_size = kernel_size 22 | self.p_dropout = p_dropout 23 | self.n_flows = n_flows 24 | self.gin_channels = gin_channels 25 | 26 | self.log_flow = modules.Log() 27 | self.flows = nn.ModuleList() 28 | self.flows.append(modules.ElementwiseAffine(2)) 29 | for i in range(n_flows): 30 | self.flows.append(modules.ConvFlow(2, filter_channels, kernel_size, n_layers=3)) 31 | self.flows.append(modules.Flip()) 32 | 33 | self.post_pre = nn.Conv1d(1, filter_channels, 1) 34 | self.post_proj = nn.Conv1d(filter_channels, filter_channels, 1) 35 | self.post_convs = modules.DDSConv(filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout) 36 | self.post_flows = nn.ModuleList() 37 | self.post_flows.append(modules.ElementwiseAffine(2)) 38 | for i in range(4): 39 | self.post_flows.append(modules.ConvFlow(2, filter_channels, kernel_size, n_layers=3)) 40 | self.post_flows.append(modules.Flip()) 41 | 42 | self.pre = nn.Conv1d(in_channels, filter_channels, 1) 43 | self.proj = nn.Conv1d(filter_channels, filter_channels, 1) 44 | self.convs = modules.DDSConv(filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout) 45 | if gin_channels != 0: 46 | self.cond = nn.Conv1d(gin_channels, filter_channels, 1) 47 | 48 | def forward(self, x, x_mask, w=None, g=None, reverse=False, noise_scale=1.0): 49 | x = torch.detach(x) 50 | x = self.pre(x) 51 | if g is not None: 52 | g = torch.detach(g) 53 | x = x + self.cond(g) 54 | x = self.convs(x, x_mask) 55 | x = self.proj(x) * x_mask 56 | 57 | if not reverse: 58 | flows = self.flows 59 | assert w is not None 60 | 61 | logdet_tot_q = 0 62 | h_w = self.post_pre(w) 63 | h_w = self.post_convs(h_w, x_mask) 64 | h_w = self.post_proj(h_w) * x_mask 65 | e_q = torch.randn(w.size(0), 2, w.size(2)).to(device=x.device, dtype=x.dtype) * x_mask 66 | z_q = e_q 67 | for flow in self.post_flows: 68 | z_q, logdet_q = flow(z_q, x_mask, g=(x + h_w)) 69 | logdet_tot_q += logdet_q 70 | z_u, z1 = torch.split(z_q, [1, 1], 1) 71 | u = torch.sigmoid(z_u) * x_mask 72 | z0 = (w - u) * x_mask 73 | logdet_tot_q += torch.sum((F.logsigmoid(z_u) + F.logsigmoid(-z_u)) * x_mask, [1,2]) 74 | logq = torch.sum(-0.5 * (math.log(2*math.pi) + (e_q**2)) * x_mask, [1,2]) - logdet_tot_q 75 | 76 | logdet_tot = 0 77 | z0, logdet = self.log_flow(z0, x_mask) 78 | logdet_tot += logdet 79 | z = torch.cat([z0, z1], 1) 80 | for flow in flows: 81 | z, logdet = flow(z, x_mask, g=x, reverse=reverse) 82 | logdet_tot = logdet_tot + logdet 83 | nll = torch.sum(0.5 * (math.log(2*math.pi) + (z**2)) * x_mask, [1,2]) - logdet_tot 84 | return nll + logq # [b] 85 | else: 86 | flows = list(reversed(self.flows)) 87 | flows = flows[:-2] + [flows[-1]] # remove a useless vflow 88 | z = torch.randn(x.size(0), 2, x.size(2)).to(device=x.device, dtype=x.dtype) * noise_scale 89 | for flow in flows: 90 | z = flow(z, x_mask, g=x, reverse=reverse) 91 | z0, z1 = torch.split(z, [1, 1], 1) 92 | logw = z0 93 | return logw 94 | 95 | 96 | class DurationPredictor(nn.Module): 97 | def __init__(self, in_channels, filter_channels, kernel_size, p_dropout, gin_channels=0): 98 | super().__init__() 99 | 100 | self.in_channels = in_channels 101 | self.filter_channels = filter_channels 102 | self.kernel_size = kernel_size 103 | self.p_dropout = p_dropout 104 | self.gin_channels = gin_channels 105 | 106 | self.drop = nn.Dropout(p_dropout) 107 | self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size, padding=kernel_size//2) 108 | self.norm_1 = 
modules.LayerNorm(filter_channels) 109 | self.conv_2 = nn.Conv1d(filter_channels, filter_channels, kernel_size, padding=kernel_size//2) 110 | self.norm_2 = modules.LayerNorm(filter_channels) 111 | self.proj = nn.Conv1d(filter_channels, 1, 1) 112 | 113 | if gin_channels != 0: 114 | self.cond = nn.Conv1d(gin_channels, in_channels, 1) 115 | 116 | def forward(self, x, x_mask, g=None): 117 | x = torch.detach(x) 118 | if g is not None: 119 | g = torch.detach(g) 120 | x = x + self.cond(g) 121 | x = self.conv_1(x * x_mask) 122 | x = torch.relu(x) 123 | x = self.norm_1(x) 124 | x = self.drop(x) 125 | x = self.conv_2(x * x_mask) 126 | x = torch.relu(x) 127 | x = self.norm_2(x) 128 | x = self.drop(x) 129 | x = self.proj(x * x_mask) 130 | return x * x_mask 131 | 132 | 133 | class TextEncoder(nn.Module): 134 | def __init__(self, 135 | n_vocab, 136 | out_channels, 137 | hidden_channels, 138 | filter_channels, 139 | n_heads, 140 | n_layers, 141 | kernel_size, 142 | p_dropout, 143 | emotion_embedding): 144 | super().__init__() 145 | self.n_vocab = n_vocab 146 | self.out_channels = out_channels 147 | self.hidden_channels = hidden_channels 148 | self.filter_channels = filter_channels 149 | self.n_heads = n_heads 150 | self.n_layers = n_layers 151 | self.kernel_size = kernel_size 152 | self.p_dropout = p_dropout 153 | self.emotion_embedding = emotion_embedding 154 | 155 | if self.n_vocab!=0: 156 | self.emb = nn.Embedding(n_vocab, hidden_channels) 157 | if emotion_embedding: 158 | self.emo_proj = nn.Linear(1024, hidden_channels) 159 | nn.init.normal_(self.emb.weight, 0.0, hidden_channels**-0.5) 160 | 161 | self.encoder = attentions.Encoder( 162 | hidden_channels, 163 | filter_channels, 164 | n_heads, 165 | n_layers, 166 | kernel_size, 167 | p_dropout) 168 | self.proj= nn.Conv1d(hidden_channels, out_channels * 2, 1) 169 | 170 | def forward(self, x, x_lengths, emotion_embedding=None): 171 | if self.n_vocab!=0: 172 | x = self.emb(x) * math.sqrt(self.hidden_channels) # [b, t, h] 173 | if emotion_embedding is not None: 174 | x = x + self.emo_proj(emotion_embedding.unsqueeze(1)) 175 | x = torch.transpose(x, 1, -1) # [b, h, t] 176 | x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype) 177 | 178 | x = self.encoder(x * x_mask, x_mask) 179 | stats = self.proj(x) * x_mask 180 | 181 | m, logs = torch.split(stats, self.out_channels, dim=1) 182 | return x, m, logs, x_mask 183 | 184 | 185 | class ResidualCouplingBlock(nn.Module): 186 | def __init__(self, 187 | channels, 188 | hidden_channels, 189 | kernel_size, 190 | dilation_rate, 191 | n_layers, 192 | n_flows=4, 193 | gin_channels=0): 194 | super().__init__() 195 | self.channels = channels 196 | self.hidden_channels = hidden_channels 197 | self.kernel_size = kernel_size 198 | self.dilation_rate = dilation_rate 199 | self.n_layers = n_layers 200 | self.n_flows = n_flows 201 | self.gin_channels = gin_channels 202 | 203 | self.flows = nn.ModuleList() 204 | for i in range(n_flows): 205 | self.flows.append(modules.ResidualCouplingLayer(channels, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels, mean_only=True)) 206 | self.flows.append(modules.Flip()) 207 | 208 | def forward(self, x, x_mask, g=None, reverse=False): 209 | if not reverse: 210 | for flow in self.flows: 211 | x, _ = flow(x, x_mask, g=g, reverse=reverse) 212 | else: 213 | for flow in reversed(self.flows): 214 | x = flow(x, x_mask, g=g, reverse=reverse) 215 | return x 216 | 217 | 218 | class PosteriorEncoder(nn.Module): 219 | def __init__(self, 220 | 
in_channels, 221 | out_channels, 222 | hidden_channels, 223 | kernel_size, 224 | dilation_rate, 225 | n_layers, 226 | gin_channels=0): 227 | super().__init__() 228 | self.in_channels = in_channels 229 | self.out_channels = out_channels 230 | self.hidden_channels = hidden_channels 231 | self.kernel_size = kernel_size 232 | self.dilation_rate = dilation_rate 233 | self.n_layers = n_layers 234 | self.gin_channels = gin_channels 235 | 236 | self.pre = nn.Conv1d(in_channels, hidden_channels, 1) 237 | self.enc = modules.WN(hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels) 238 | self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) 239 | 240 | def forward(self, x, x_lengths, g=None): 241 | x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype) 242 | x = self.pre(x) * x_mask 243 | x = self.enc(x, x_mask, g=g) 244 | stats = self.proj(x) * x_mask 245 | m, logs = torch.split(stats, self.out_channels, dim=1) 246 | z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask 247 | return z, m, logs, x_mask 248 | 249 | 250 | class Generator(torch.nn.Module): 251 | def __init__(self, initial_channel, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, gin_channels=0): 252 | super(Generator, self).__init__() 253 | self.num_kernels = len(resblock_kernel_sizes) 254 | self.num_upsamples = len(upsample_rates) 255 | self.conv_pre = Conv1d(initial_channel, upsample_initial_channel, 7, 1, padding=3) 256 | resblock = modules.ResBlock1 if resblock == '1' else modules.ResBlock2 257 | 258 | self.ups = nn.ModuleList() 259 | for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)): 260 | self.ups.append(weight_norm( 261 | ConvTranspose1d(upsample_initial_channel//(2**i), upsample_initial_channel//(2**(i+1)), 262 | k, u, padding=(k-u)//2))) 263 | 264 | self.resblocks = nn.ModuleList() 265 | for i in range(len(self.ups)): 266 | ch = upsample_initial_channel//(2**(i+1)) 267 | for j, (k, d) in enumerate(zip(resblock_kernel_sizes, resblock_dilation_sizes)): 268 | self.resblocks.append(resblock(ch, k, d)) 269 | 270 | self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False) 271 | self.ups.apply(init_weights) 272 | 273 | if gin_channels != 0: 274 | self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1) 275 | 276 | def forward(self, x, g=None): 277 | x = self.conv_pre(x) 278 | if g is not None: 279 | x = x + self.cond(g) 280 | 281 | for i in range(self.num_upsamples): 282 | x = F.leaky_relu(x, modules.LRELU_SLOPE) 283 | x = self.ups[i](x) 284 | xs = None 285 | for j in range(self.num_kernels): 286 | if xs is None: 287 | xs = self.resblocks[i*self.num_kernels+j](x) 288 | else: 289 | xs += self.resblocks[i*self.num_kernels+j](x) 290 | x = xs / self.num_kernels 291 | x = F.leaky_relu(x) 292 | x = self.conv_post(x) 293 | x = torch.tanh(x) 294 | 295 | return x 296 | 297 | 298 | class SynthesizerTrn(nn.Module): 299 | """ 300 | Synthesizer for Training 301 | """ 302 | 303 | def __init__(self, 304 | n_vocab, 305 | spec_channels, 306 | segment_size, 307 | inter_channels, 308 | hidden_channels, 309 | filter_channels, 310 | n_heads, 311 | n_layers, 312 | kernel_size, 313 | p_dropout, 314 | resblock, 315 | resblock_kernel_sizes, 316 | resblock_dilation_sizes, 317 | upsample_rates, 318 | upsample_initial_channel, 319 | upsample_kernel_sizes, 320 | n_speakers=0, 321 | gin_channels=0, 322 | use_sdp=True, 323 | emotion_embedding=False, 324 | **kwargs): 325 | 326 | super().__init__() 
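    # Top-level VITS synthesizer: enc_p encodes text into a prior distribution,
    # enc_q encodes spectrograms into a posterior (training only), flow maps
    # between posterior and prior latents, dec is the upsampling Generator that
    # produces the waveform, and dp predicts durations (stochastic if use_sdp).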
327 | self.n_vocab = n_vocab 328 | self.spec_channels = spec_channels 329 | self.inter_channels = inter_channels 330 | self.hidden_channels = hidden_channels 331 | self.filter_channels = filter_channels 332 | self.n_heads = n_heads 333 | self.n_layers = n_layers 334 | self.kernel_size = kernel_size 335 | self.p_dropout = p_dropout 336 | self.resblock = resblock 337 | self.resblock_kernel_sizes = resblock_kernel_sizes 338 | self.resblock_dilation_sizes = resblock_dilation_sizes 339 | self.upsample_rates = upsample_rates 340 | self.upsample_initial_channel = upsample_initial_channel 341 | self.upsample_kernel_sizes = upsample_kernel_sizes 342 | self.segment_size = segment_size 343 | self.n_speakers = n_speakers 344 | self.gin_channels = gin_channels 345 | 346 | self.use_sdp = use_sdp 347 | 348 | self.enc_p = TextEncoder(n_vocab, 349 | inter_channels, 350 | hidden_channels, 351 | filter_channels, 352 | n_heads, 353 | n_layers, 354 | kernel_size, 355 | p_dropout, 356 | emotion_embedding) 357 | self.dec = Generator(inter_channels, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, gin_channels=gin_channels) 358 | self.enc_q = PosteriorEncoder(spec_channels, inter_channels, hidden_channels, 5, 1, 16, gin_channels=gin_channels) 359 | self.flow = ResidualCouplingBlock(inter_channels, hidden_channels, 5, 1, 4, gin_channels=gin_channels) 360 | 361 | if use_sdp: 362 | self.dp = StochasticDurationPredictor(hidden_channels, 192, 3, 0.5, 4, gin_channels=gin_channels) 363 | else: 364 | self.dp = DurationPredictor(hidden_channels, 256, 3, 0.5, gin_channels=gin_channels) 365 | 366 | if n_speakers > 1: 367 | self.emb_g = nn.Embedding(n_speakers, gin_channels) 368 | 369 | def infer(self, x, x_lengths, sid=None, noise_scale=1, length_scale=1, noise_scale_w=1., max_len=None, emotion_embedding=None): 370 | x, m_p, logs_p, x_mask = self.enc_p(x, x_lengths, emotion_embedding) 371 | if self.n_speakers > 0: 372 | g = self.emb_g(sid).unsqueeze(-1) # [b, h, 1] 373 | else: 374 | g = None 375 | 376 | if self.use_sdp: 377 | logw = self.dp(x, x_mask, g=g, reverse=True, noise_scale=noise_scale_w) 378 | else: 379 | logw = self.dp(x, x_mask, g=g) 380 | w = torch.exp(logw) * x_mask * length_scale 381 | w_ceil = torch.ceil(w) 382 | y_lengths = torch.clamp_min(torch.sum(w_ceil, [1, 2]), 1).long() 383 | y_mask = torch.unsqueeze(commons.sequence_mask(y_lengths, None), 1).to(x_mask.dtype) 384 | attn_mask = torch.unsqueeze(x_mask, 2) * torch.unsqueeze(y_mask, -1) 385 | attn = commons.generate_path(w_ceil, attn_mask) 386 | 387 | m_p = torch.matmul(attn.squeeze(1), m_p.transpose(1, 2)).transpose(1, 2) # [b, t', t], [b, t, d] -> [b, d, t'] 388 | logs_p = torch.matmul(attn.squeeze(1), logs_p.transpose(1, 2)).transpose(1, 2) # [b, t', t], [b, t, d] -> [b, d, t'] 389 | 390 | z_p = m_p + torch.randn_like(m_p) * torch.exp(logs_p) * noise_scale 391 | z = self.flow(z_p, y_mask, g=g, reverse=True) 392 | o = self.dec((z * y_mask)[:,:,:max_len], g=g) 393 | return o, attn, y_mask, (z, z_p, m_p, logs_p) 394 | 395 | def voice_conversion(self, y, y_lengths, sid_src, sid_tgt): 396 | assert self.n_speakers > 0, "n_speakers have to be larger than 0." 
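    # Voice conversion: encode the source audio conditioned on the source
    # speaker, push the latents through the flow into the prior space, invert
    # the flow conditioned on the target speaker, and decode the result.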
397 | g_src = self.emb_g(sid_src).unsqueeze(-1) 398 | g_tgt = self.emb_g(sid_tgt).unsqueeze(-1) 399 | z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g_src) 400 | z_p = self.flow(z, y_mask, g=g_src) 401 | z_hat = self.flow(z_p, y_mask, g=g_tgt, reverse=True) 402 | o_hat = self.dec(z_hat * y_mask, g=g_tgt) 403 | return o_hat, y_mask, (z, z_p, z_hat) 404 | 405 | --------------------------------------------------------------------------------
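For orientation, here is a minimal inference sketch showing how the classes above fit together. It is illustrative only and not part of the repository: the hyperparameters are toy values standing in for whatever a real VITS/MoeGoe JSON config supplies, and it assumes it is run from inside HoshiNoYume/actions/MoeGoe/ so that models, modules, attentions and commons import exactly as they do in this dump.

import torch
from models import SynthesizerTrn   # defined in models.py above

# Toy hyperparameters for illustration only; a trained checkpoint's JSON
# config provides the real values.
net_g = SynthesizerTrn(
    n_vocab=100,                       # size of the symbol table
    spec_channels=513,                 # spectrogram bins (n_fft // 2 + 1)
    segment_size=32,
    inter_channels=192,
    hidden_channels=192,
    filter_channels=768,
    n_heads=2,
    n_layers=6,
    kernel_size=3,
    p_dropout=0.1,
    resblock='1',
    resblock_kernel_sizes=[3, 7, 11],
    resblock_dilation_sizes=[[1, 3, 5], [1, 3, 5], [1, 3, 5]],
    upsample_rates=[8, 8, 2, 2],       # product = hop length of 256 samples
    upsample_initial_channel=512,
    upsample_kernel_sizes=[16, 16, 4, 4],
    n_speakers=2,
    gin_channels=256,
)
net_g.eval()

# Dummy phoneme-id input; MoeGoe builds the real sequence with
# text_to_sequence() from the text package.
x = torch.randint(0, 100, (1, 20))       # [batch, text_len]
x_lengths = torch.LongTensor([x.size(1)])
sid = torch.LongTensor([0])              # speaker id

with torch.no_grad():
    audio, attn, y_mask, _ = net_g.infer(
        x, x_lengths, sid=sid,
        noise_scale=0.667, noise_scale_w=0.8, length_scale=1.0)

print(audio.shape)                        # [1, 1, n_samples]

In actual use the model weights are restored from a trained checkpoint before calling infer; with the random initialisation above the output is just noise.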