├── matcha ├── data │ ├── __init__.py │ └── components │ │ └── __init__.py ├── hifigan │ ├── __init__.py │ ├── env.py │ ├── config.py │ ├── LICENSE │ ├── xutils.py │ ├── denoiser.py │ ├── README.md │ └── meldataset.py ├── models │ ├── __init__.py │ ├── components │ │ ├── __init__.py │ │ └── flow_matching.py │ └── baselightningmodule.py └── onnx │ └── export.py ├── academicodec ├── __init__.py ├── models │ ├── encodec │ │ ├── __init__.py │ │ ├── dataset.py │ │ ├── net3.py │ │ ├── distributed │ │ │ ├── launch.py │ │ │ └── distributed.py │ │ └── test.py │ ├── hificodec │ │ ├── __init__.py │ │ ├── env.py │ │ ├── vqvae_tester.py │ │ ├── vqvae.py │ │ ├── vqvae_copy_syn.py │ │ └── meldataset.py │ └── soundstream │ │ ├── __init__.py │ │ ├── dataset.py │ │ └── models.py ├── quantization │ ├── __init__.py │ ├── distrib.py │ └── vq.py ├── modules │ ├── __init__.py │ ├── lstm.py │ ├── norm.py │ └── transformer.py ├── binary.py └── utils.py ├── cosyvoice ├── cli │ ├── __init__.py │ ├── zh_normalization │ │ ├── __init__.py │ │ ├── README.md │ │ ├── quantifier.py │ │ ├── phonecode.py │ │ ├── constants.py │ │ ├── chronology.py │ │ ├── text_normlization.py │ │ └── num.py │ └── model.py ├── dataset │ ├── __init__.py │ └── dataset.py ├── transformer │ ├── __init__.py │ ├── activation.py │ ├── label_smoothing_loss.py │ ├── positionwise_feed_forward.py │ ├── decoder_layer.py │ └── convolution.py ├── flow │ ├── length_regulator.py │ ├── flow_matching.py │ └── flow.py ├── hifigan │ └── f0_predictor.py ├── utils │ ├── class_utils.py │ └── common.py └── bin │ ├── inference.py │ └── train.py ├── data ├── cache │ └── 这里为语音合成缓存文件夹.txt └── model │ └── 这里存放CosyVoice模型.txt ├── example参考音频文本.txt ├── requirements.txt ├── api.py ├── LICENSE ├── README_CN.md └── README.md /matcha/data/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /academicodec/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /cosyvoice/cli/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data/cache/这里为语音合成缓存文件夹.txt: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /matcha/hifigan/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /matcha/models/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /cosyvoice/dataset/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /cosyvoice/transformer/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data/model/这里存放CosyVoice模型.txt: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- 
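The `data/cache/` and `data/model/` placeholders above mark where runtime artifacts are expected: synthesized audio is cached under `data/cache/`, and `api.py` loads its checkpoint from `data/model/CosyVoice-300M`. A minimal, hypothetical setup sketch for fetching that checkpoint with ModelScope (already listed in `requirements.txt`) is shown below; the model id `iic/CosyVoice-300M` and the `local_dir` argument follow the upstream CosyVoice instructions and are assumptions, not something this repository pins down.

```python
# Hypothetical setup sketch: download the CosyVoice-300M checkpoint into data/model/
# so that api.py can load it from 'data/model/CosyVoice-300M'. The model id and the
# local_dir argument are assumptions based on the upstream CosyVoice instructions.
from modelscope import snapshot_download

snapshot_download('iic/CosyVoice-300M', local_dir='data/model/CosyVoice-300M')
```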
/matcha/data/components/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /matcha/models/components/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /academicodec/models/encodec/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /academicodec/models/hificodec/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /academicodec/models/soundstream/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /example参考音频文本.txt: -------------------------------------------------------------------------------- 1 | 把这些文字替换为你的example.wav的参考音频文本 -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | fastapi 2 | modelscope 3 | torch 4 | torchaudio 5 | uvicorn -------------------------------------------------------------------------------- /academicodec/quantization/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # flake8: noqa 7 | from .vq import QuantizedResult 8 | from .vq import ResidualVectorQuantizer 9 | -------------------------------------------------------------------------------- /academicodec/models/hificodec/env.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | 4 | 5 | class AttrDict(dict): 6 | def __init__(self, *args, **kwargs): 7 | super(AttrDict, self).__init__(*args, **kwargs) 8 | self.__dict__ = self 9 | 10 | 11 | def build_env(config, config_name, path): 12 | t_path = os.path.join(path, config_name) 13 | if config != t_path: 14 | os.makedirs(path, exist_ok=True) 15 | shutil.copyfile(config, os.path.join(path, config_name)) 16 | -------------------------------------------------------------------------------- /matcha/hifigan/env.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/jik876/hifi-gan """ 2 | 3 | import os 4 | import shutil 5 | 6 | 7 | class AttrDict(dict): 8 | def __init__(self, *args, **kwargs): 9 | super().__init__(*args, **kwargs) 10 | self.__dict__ = self 11 | 12 | 13 | def build_env(config, config_name, path): 14 | t_path = os.path.join(path, config_name) 15 | if config != t_path: 16 | os.makedirs(path, exist_ok=True) 17 | shutil.copyfile(config, os.path.join(path, config_name)) 18 | -------------------------------------------------------------------------------- /cosyvoice/cli/zh_normalization/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | from .text_normlization import * 15 | -------------------------------------------------------------------------------- /academicodec/modules/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | """Torch modules.""" 7 | # flake8: noqa 8 | from .conv import NormConv1d 9 | from .conv import NormConv2d 10 | from .conv import NormConvTranspose1d 11 | from .conv import NormConvTranspose2d 12 | from .conv import pad1d 13 | from .conv import SConv1d 14 | from .conv import SConvTranspose1d 15 | from .conv import unpad1d 16 | from .lstm import SLSTM 17 | from .seanet import SEANetDecoder 18 | from .seanet import SEANetEncoder 19 | from .transformer import StreamingTransformerEncoder 20 | -------------------------------------------------------------------------------- /cosyvoice/cli/zh_normalization/README.md: -------------------------------------------------------------------------------- 1 | ## Supported NSW (Non-Standard-Word) Normalization 2 | 3 | |NSW type|raw|normalized| 4 | |:--|:-|:-| 5 | |serial number|电影中梁朝伟扮演的陈永仁的编号27149|电影中梁朝伟扮演的陈永仁的编号二七一四九| 6 | |cardinal|这块黄金重达324.75克
我们班的最高总分为583分|这块黄金重达三百二十四点七五克<br>我们班的最高总分为五百八十三分| 7 | |numeric range |12\~23<br>-1.5\~2|十二到二十三<br>负一点五到二| 8 | |date|她出生于86年8月18日,她弟弟出生于1995年3月1日|她出生于八六年八月十八日, 她弟弟出生于一九九五年三月一日| 9 | |time|等会请在12:05请通知我|等会请在十二点零五分请通知我 10 | |temperature|今天的最低气温达到-10°C|今天的最低气温达到零下十度 11 | |fraction|现场有7/12的观众投出了赞成票|现场有十二分之七的观众投出了赞成票| 12 | |percentage|明天有62%的概率降雨|明天有百分之六十二的概率降雨| 13 | |money|随便来几个价格12块5,34.5元,20.1万|随便来几个价格十二块五,三十四点五元,二十点一万| 14 | |telephone|这是固话0421-33441122<br>这是手机+86 18544139121|这是固话零四二一三三四四一一二二
这是手机八六一八五四四一三九一二一| 15 | ## References 16 | [Pull requests #658 of DeepSpeech](https://github.com/PaddlePaddle/DeepSpeech/pull/658/files) 17 | -------------------------------------------------------------------------------- /academicodec/modules/lstm.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | """LSTM layers module.""" 7 | from torch import nn 8 | 9 | 10 | class SLSTM(nn.Module): 11 | """ 12 | LSTM without worrying about the hidden state, nor the layout of the data. 13 | Expects input as convolutional layout. 14 | """ 15 | 16 | def __init__(self, dimension: int, num_layers: int=2, skip: bool=True): 17 | super().__init__() 18 | self.skip = skip 19 | self.lstm = nn.LSTM(dimension, dimension, num_layers) 20 | 21 | def forward(self, x): 22 | x = x.permute(2, 0, 1) 23 | y, _ = self.lstm(x) 24 | if self.skip: 25 | y = y + x 26 | y = y.permute(1, 2, 0) 27 | return y 28 | -------------------------------------------------------------------------------- /matcha/hifigan/config.py: -------------------------------------------------------------------------------- 1 | v1 = { 2 | "resblock": "1", 3 | "num_gpus": 0, 4 | "batch_size": 16, 5 | "learning_rate": 0.0004, 6 | "adam_b1": 0.8, 7 | "adam_b2": 0.99, 8 | "lr_decay": 0.999, 9 | "seed": 1234, 10 | "upsample_rates": [8, 8, 2, 2], 11 | "upsample_kernel_sizes": [16, 16, 4, 4], 12 | "upsample_initial_channel": 512, 13 | "resblock_kernel_sizes": [3, 7, 11], 14 | "resblock_dilation_sizes": [[1, 3, 5], [1, 3, 5], [1, 3, 5]], 15 | "resblock_initial_channel": 256, 16 | "segment_size": 8192, 17 | "num_mels": 80, 18 | "num_freq": 1025, 19 | "n_fft": 1024, 20 | "hop_size": 256, 21 | "win_size": 1024, 22 | "sampling_rate": 22050, 23 | "fmin": 0, 24 | "fmax": 8000, 25 | "fmax_loss": None, 26 | "num_workers": 4, 27 | "dist_config": {"dist_backend": "nccl", "dist_url": "tcp://localhost:54321", "world_size": 1}, 28 | } 29 | -------------------------------------------------------------------------------- /api.py: -------------------------------------------------------------------------------- 1 | import torchaudio 2 | import uvicorn 3 | from fastapi import FastAPI 4 | from fastapi.responses import FileResponse 5 | from cosyvoice.cli.cosyvoice import CosyVoice 6 | from cosyvoice.utils.file_utils import load_wav 7 | 8 | app = FastAPI() 9 | print("正在加载CosyVoice模型,请稍后...") 10 | model = CosyVoice('data/model/CosyVoice-300M') 11 | prompt_speech = load_wav('example.wav', 16000) 12 | with open('example参考音频文本.txt', 'r', encoding='utf-8') as file: 13 | lines = file.readlines() 14 | prompt_text = lines[0].strip() 15 | output_path = 'data/cache/cache.wav' 16 | 17 | 18 | @app.get("/cosyvoice/") 19 | def run_cosyvoice(text: str): 20 | results = model.inference_zero_shot(text, prompt_text, prompt_speech) 21 | tts_speech = results['tts_speech'] 22 | torchaudio.save(output_path, tts_speech, 22050) 23 | return FileResponse(output_path) 24 | 25 | 26 | print("本地CosyVoice语音合成大模型API服务器启动成功!") 27 | uvicorn.run(app, host="0.0.0.0", port=9881) 28 | -------------------------------------------------------------------------------- /academicodec/modules/norm.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 
3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | """Normalization modules.""" 7 | import typing as tp 8 | 9 | import einops 10 | import torch 11 | from torch import nn 12 | 13 | 14 | class ConvLayerNorm(nn.LayerNorm): 15 | """ 16 | Convolution-friendly LayerNorm that moves channels to last dimensions 17 | before running the normalization and moves them back to original position right after. 18 | """ 19 | 20 | def __init__(self, 21 | normalized_shape: tp.Union[int, tp.List[int], torch.Size], 22 | **kwargs): 23 | super().__init__(normalized_shape, **kwargs) 24 | 25 | def forward(self, x): 26 | x = einops.rearrange(x, 'b ... t -> b t ...') 27 | x = super().forward(x) 28 | x = einops.rearrange(x, 'b t ... -> b ... t') 29 | return x 30 | -------------------------------------------------------------------------------- /academicodec/models/encodec/dataset.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import random 3 | 4 | import torch 5 | import torchaudio 6 | from torch.utils.data import Dataset 7 | 8 | 9 | class NSynthDataset(Dataset): 10 | """Dataset to load NSynth data.""" 11 | 12 | def __init__(self, audio_dir): 13 | super().__init__() 14 | self.filenames = [] 15 | self.filenames.extend(glob.glob(audio_dir + "/*.wav")) 16 | print(len(self.filenames)) 17 | _, self.sr = torchaudio.load(self.filenames[0]) 18 | self.max_len = 24000 # 24000 19 | 20 | def __len__(self): 21 | return len(self.filenames) 22 | 23 | def __getitem__(self, index): 24 | ans = torch.zeros(1, self.max_len) 25 | audio = torchaudio.load(self.filenames[index])[0] 26 | if audio.shape[1] > self.max_len: 27 | st = random.randint(0, audio.shape[1] - self.max_len - 1) 28 | ed = st + self.max_len 29 | return audio[:, st:ed] 30 | else: 31 | ans[:, :audio.shape[1]] = audio 32 | return ans 33 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 枫影剑 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE.
22 | -------------------------------------------------------------------------------- /matcha/hifigan/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Jungil Kong 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /academicodec/models/hificodec/vqvae_tester.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import librosa 4 | import torch 5 | import torch.nn as nn 6 | 7 | from academicodec.models.hificodec.vqvae import VQVAE 8 | 9 | 10 | class VqvaeTester(nn.Module): 11 | def __init__(self, config_path, model_path, sample_rate=24000): 12 | super().__init__() 13 | self.vqvae = VQVAE(config_path, model_path, with_encoder=True) 14 | self.sample_rate = sample_rate 15 | 16 | @torch.no_grad() 17 | def forward(self, wav_path): 18 | # 单声道 19 | # wav.shape (T, ), 按照模型的 sr 读取 20 | wav, sr = librosa.load(wav_path, sr=self.sample_rate) 21 | fid = os.path.basename(wav_path)[:-4] 22 | wav = torch.tensor(wav).unsqueeze(0) 23 | wav = wav.cuda() 24 | # vq_codes is acoustic token 25 | vq_codes = self.vqvae.encode(wav) 26 | syn = self.vqvae(vq_codes) 27 | return fid, syn 28 | 29 | @torch.no_grad() 30 | def vq(self, wav_path): 31 | wav, sr = librosa.load(wav_path, sr=self.sample_rate) 32 | fid = os.path.basename(wav_path)[:-4] 33 | wav = torch.tensor(wav).unsqueeze(0) 34 | wav = wav.cuda() 35 | # vq_codes is acoustic token 36 | vq_codes = self.vqvae.encode(wav) 37 | return fid, vq_codes 38 | -------------------------------------------------------------------------------- /matcha/hifigan/xutils.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/jik876/hifi-gan """ 2 | 3 | import glob 4 | import os 5 | 6 | import matplotlib 7 | import torch 8 | from torch.nn.utils import weight_norm 9 | 10 | matplotlib.use("Agg") 11 | import matplotlib.pylab as plt 12 | 13 | 14 | def plot_spectrogram(spectrogram): 15 | fig, ax = plt.subplots(figsize=(10, 2)) 16 | im = ax.imshow(spectrogram, aspect="auto", origin="lower", interpolation="none") 17 | plt.colorbar(im, ax=ax) 18 | 19 | fig.canvas.draw() 20 | plt.close() 21 | 22 | return fig 23 | 24 | 25 | def init_weights(m, mean=0.0, std=0.01): 26 | classname = m.__class__.__name__ 27 | if classname.find("Conv") != -1: 28 | m.weight.data.normal_(mean, std) 29 | 
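# Usage note (an added sketch, not part of the original HiFi-GAN utilities): init_weights
# above and apply_weight_norm below are written to be passed to nn.Module.apply so that
# they recurse over every submodule, e.g.
#     generator.apply(init_weights)       # re-initialise every Conv* layer's weights
#     generator.apply(apply_weight_norm)  # wrap every Conv* layer in weight_norm
# where `generator` is any torch.nn.Module built from convolutional blocks.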
30 | 31 | def apply_weight_norm(m): 32 | classname = m.__class__.__name__ 33 | if classname.find("Conv") != -1: 34 | weight_norm(m) 35 | 36 | 37 | def get_padding(kernel_size, dilation=1): 38 | return int((kernel_size * dilation - dilation) / 2) 39 | 40 | 41 | def load_checkpoint(filepath, device): 42 | assert os.path.isfile(filepath) 43 | print(f"Loading '{filepath}'") 44 | checkpoint_dict = torch.load(filepath, map_location=device) 45 | print("Complete.") 46 | return checkpoint_dict 47 | 48 | 49 | def save_checkpoint(filepath, obj): 50 | print(f"Saving checkpoint to {filepath}") 51 | torch.save(obj, filepath) 52 | print("Complete.") 53 | 54 | 55 | def scan_checkpoint(cp_dir, prefix): 56 | pattern = os.path.join(cp_dir, prefix + "????????") 57 | cp_list = glob.glob(pattern) 58 | if len(cp_list) == 0: 59 | return None 60 | return sorted(cp_list)[-1] 61 | -------------------------------------------------------------------------------- /academicodec/models/hificodec/vqvae.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | import torch 4 | import torch.nn as nn 5 | 6 | from academicodec.models.hificodec.env import AttrDict 7 | from academicodec.models.hificodec.models import Encoder 8 | from academicodec.models.hificodec.models import Generator 9 | from academicodec.models.hificodec.models import Quantizer 10 | 11 | 12 | class VQVAE(nn.Module): 13 | def __init__(self, 14 | config_path, 15 | ckpt_path, 16 | with_encoder=False): 17 | super(VQVAE, self).__init__() 18 | ckpt = torch.load(ckpt_path) 19 | with open(config_path) as f: 20 | data = f.read() 21 | json_config = json.loads(data) 22 | self.h = AttrDict(json_config) 23 | self.quantizer = Quantizer(self.h) 24 | self.generator = Generator(self.h) 25 | self.generator.load_state_dict(ckpt['generator']) 26 | self.quantizer.load_state_dict(ckpt['quantizer']) 27 | if with_encoder: 28 | self.encoder = Encoder(self.h) 29 | self.encoder.load_state_dict(ckpt['encoder']) 30 | 31 | def forward(self, x): 32 | # x is the codebook 33 | # x.shape (B, T, Nq) 34 | quant_emb = self.quantizer.embed(x) 35 | return self.generator(quant_emb) 36 | 37 | def encode(self, x): 38 | batch_size = x.size(0) 39 | if len(x.shape) == 3 and x.shape[-1] == 1: 40 | x = x.squeeze(-1) 41 | c = self.encoder(x.unsqueeze(1)) 42 | q, loss_q, c = self.quantizer(c) 43 | c = [code.reshape(batch_size, -1) for code in c] 44 | # shape: [N, T, 4] 45 | return torch.stack(c, -1) 46 | -------------------------------------------------------------------------------- /README_CN.md: -------------------------------------------------------------------------------- 1 | # cosyvoice_simple_api 2 | 3 | ## 项目概述 4 | 5 | `cosyvoice_simple_api` 是一个基于阿里的 CosyVoice 开发的简易的语音合成 API 服务器项目。它允许用户轻松地将文本转换为有情感的语音输出,适用于创建有声读物、自动语音回复系统以及其他语音合成应用。 6 | 7 | ### 项目地址 8 | 9 | - CosyVoice 源地址:[FunAudioLLM/CosyVoice](https://github.com/FunAudioLLM/CosyVoice) 10 | - CosyVoice Windows 适配版(特别鸣谢刘悦):[v3ucn/CosyVoice_For_Windows](https://github.com/v3ucn/CosyVoice_For_Windows) 11 | - 本项目地址:[swordswind/cosyvoice_simple_api](https://github.com/swordswind/cosyvoice_simple_api) 12 | 13 | ## 运行方式 14 | 15 | 1. 确保你的系统中已安装 Python 环境。 16 | 2. 通过 `git clone` 或下载 ZIP 文件的方式获取项目代码。 17 | 3. 在项目根目录下,运行以下命令安装依赖: 18 | 19 | ```bash 20 | pip install -r requirements.txt 21 | ``` 22 | 23 | 4. 
在命令行中运行以下命令启动服务器: 24 | 25 | ```bash 26 | python api.py 27 | ``` 28 | 29 | ## 服务器地址 30 | 31 | CosyVoice 语音合成 API 服务器地址为:`http://你的电脑IP:9881/` 32 | 33 | ## API 接口 34 | 35 | ### 接口地址 36 | 37 | ``` 38 | /cosyvoice/ 39 | ``` 40 | 41 | ### 请求方式 42 | 43 | ``` 44 | GET 45 | ``` 46 | 47 | ### 请求参数 48 | 49 | - `text`:必填,要合成的主体文本。 50 | 51 | ## 使用示例 52 | 53 | 1. 在浏览器地址栏输入以下地址: 54 | 55 | ``` 56 | http://127.0.0.1:9881/cosyvoice/?text=你好,很高兴遇见你 57 | ``` 58 | 59 | 2. 按下回车键,服务器将返回输出格式为 wav 音频文件。 60 | 61 | ## 更换参考音频和参考音频文本 62 | 63 | 1. 将 `example.wav` 替换为自定义的参考音频,文件名保持不变。 64 | 2. 用记事本打开 `example参考音频文本.txt`,修改成新的自定义参考音频文本。 65 | 3. 修改完成后保存文件,并重新运行 `CosyVoice语音合成API服务器.bat` 文件。 66 | 67 | ## 技术栈 68 | 69 | - FastAPI:用于构建 API 服务器。 70 | - ModelScope:模型相关的库。 71 | - Torch:PyTorch,用于深度学习模型。 72 | - TorchAudio:用于音频处理。 73 | - Uvicorn:ASGI 服务器,用于运行 FastAPI 应用。 74 | 75 | ## 贡献 76 | 77 | 欢迎对本项目进行贡献,包括但不限于修复 bug、增加新功能、改进文档等。在提交 Pull Request 之前,请确保你的代码通过了所有测试,并且遵循项目的代码风格。 78 | 79 | ## 许可证 80 | 81 | 本项目采用 [MIT 许可证](LICENSE)。 -------------------------------------------------------------------------------- /cosyvoice/cli/zh_normalization/quantifier.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | import re 15 | 16 | from .num import num2str 17 | 18 | # 温度表达式,温度会影响负号的读法 19 | # -3°C 零下三度 20 | RE_TEMPERATURE = re.compile(r'(-?)(\d+(\.\d+)?)(°C|℃|度|摄氏度)') 21 | measure_dict = { 22 | "cm2": "平方厘米", 23 | "cm²": "平方厘米", 24 | "cm3": "立方厘米", 25 | "cm³": "立方厘米", 26 | "cm": "厘米", 27 | "db": "分贝", 28 | "ds": "毫秒", 29 | "kg": "千克", 30 | "km": "千米", 31 | "m2": "平方米", 32 | "m²": "平方米", 33 | "m³": "立方米", 34 | "m3": "立方米", 35 | "ml": "毫升", 36 | "m": "米", 37 | "mm": "毫米", 38 | "s": "秒" 39 | } 40 | 41 | 42 | def replace_temperature(match) -> str: 43 | """ 44 | Args: 45 | match (re.Match) 46 | Returns: 47 | str 48 | """ 49 | sign = match.group(1) 50 | temperature = match.group(2) 51 | unit = match.group(3) 52 | sign: str = "零下" if sign else "" 53 | temperature: str = num2str(temperature) 54 | unit: str = "摄氏度" if unit == "摄氏度" else "度" 55 | result = f"{sign}{temperature}{unit}" 56 | return result 57 | 58 | 59 | def replace_measure(sentence) -> str: 60 | for q_notation in measure_dict: 61 | if q_notation in sentence: 62 | sentence = sentence.replace(q_notation, measure_dict[q_notation]) 63 | return sentence 64 | -------------------------------------------------------------------------------- /academicodec/models/hificodec/vqvae_copy_syn.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import glob 3 | import json 4 | import os 5 | from pathlib import Path 6 | 7 | import soundfile as sf 8 | from tqdm import tqdm 9 | 10 | from academicodec.models.hificodec.vqvae_tester import VqvaeTester 11 | 12 | parser = argparse.ArgumentParser() 13 | 14 | #Path 15 | parser.add_argument('--outputdir', type=str, required=True) 16 | parser.add_argument('--model_path', type=str, required=True) 17 | parser.add_argument('--input_wavdir', type=str, required=True) 18 | parser.add_argument('--config_path', type=str, required=True) 19 | parser.add_argument('--num_gens', type=int, default=1024) 20 | 21 | #Data 22 | parser.add_argument('--sample_rate', type=int, default=24000) 23 | 24 | args = parser.parse_args() 25 | 26 | with open(args.config_path, 'r') as f: 27 | argdict = json.load(f) 28 | assert argdict['sampling_rate'] == args.sample_rate, \ 29 | f"Sampling rate not consistent, stated {args.sample_rate}, but the model is trained on {argdict['sample_rate']}" 30 | argdict.update(args.__dict__) 31 | args.__dict__ = argdict 32 | 33 | if __name__ == '__main__': 34 | Path(args.outputdir).mkdir(parents=True, exist_ok=True) 35 | print("Init model and load weights") 36 | model = VqvaeTester(config_path=args.config_path, model_path=args.model_path,sample_rate=args.sample_rate) 37 | model.cuda() 38 | model.vqvae.generator.remove_weight_norm() 39 | model.vqvae.encoder.remove_weight_norm() 40 | model.eval() 41 | print("Model ready") 42 | 43 | wav_paths = glob.glob(f"{args.input_wavdir}/*.wav")[:args.num_gens] 44 | print(f"Globbed {len(wav_paths)} wav files.") 45 | 46 | for wav_path in wav_paths: 47 | fid, wav = model(wav_path) 48 | wav = wav.squeeze().cpu().numpy() 49 | sf.write( 50 | os.path.join(args.outputdir, f'{fid}.wav'), wav, args.sample_rate) 51 | -------------------------------------------------------------------------------- /cosyvoice/flow/length_regulator.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Zhihao Du) 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | from typing import Tuple 15 | import torch.nn as nn 16 | from torch.nn import functional as F 17 | from cosyvoice.utils.mask import make_pad_mask 18 | 19 | 20 | class InterpolateRegulator(nn.Module): 21 | def __init__( 22 | self, 23 | channels: int, 24 | sampling_ratios: Tuple, 25 | out_channels: int = None, 26 | groups: int = 1, 27 | ): 28 | super().__init__() 29 | self.sampling_ratios = sampling_ratios 30 | out_channels = out_channels or channels 31 | model = nn.ModuleList([]) 32 | if len(sampling_ratios) > 0: 33 | for _ in sampling_ratios: 34 | module = nn.Conv1d(channels, channels, 3, 1, 1) 35 | norm = nn.GroupNorm(groups, channels) 36 | act = nn.Mish() 37 | model.extend([module, norm, act]) 38 | model.append( 39 | nn.Conv1d(channels, out_channels, 1, 1) 40 | ) 41 | self.model = nn.Sequential(*model) 42 | 43 | def forward(self, x, ylens=None): 44 | # x in (B, T, D) 45 | mask = (~make_pad_mask(ylens)).to(x).unsqueeze(-1) 46 | x = F.interpolate(x.transpose(1, 2).contiguous(), size=ylens.max(), mode='nearest') 47 | out = self.model(x).transpose(1, 2).contiguous() 48 | olens = ylens 49 | return out * mask, olens 50 | -------------------------------------------------------------------------------- /cosyvoice/cli/zh_normalization/phonecode.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | import re 15 | 16 | from .num import verbalize_digit 17 | 18 | # 规范化固话/手机号码 19 | # 手机 20 | # http://www.jihaoba.com/news/show/13680 21 | # 移动:139、138、137、136、135、134、159、158、157、150、151、152、188、187、182、183、184、178、198 22 | # 联通:130、131、132、156、155、186、185、176 23 | # 电信:133、153、189、180、181、177 24 | RE_MOBILE_PHONE = re.compile( 25 | r"(? 
str: 34 | if mobile: 35 | sp_parts = phone_string.strip('+').split() 36 | result = ','.join( 37 | [verbalize_digit(part, alt_one=True) for part in sp_parts]) 38 | return result 39 | else: 40 | sil_parts = phone_string.split('-') 41 | result = ','.join( 42 | [verbalize_digit(part, alt_one=True) for part in sil_parts]) 43 | return result 44 | 45 | 46 | def replace_phone(match) -> str: 47 | """ 48 | Args: 49 | match (re.Match) 50 | Returns: 51 | str 52 | """ 53 | return phone2str(match.group(0), mobile=False) 54 | 55 | 56 | def replace_mobile(match) -> str: 57 | """ 58 | Args: 59 | match (re.Match) 60 | Returns: 61 | str 62 | """ 63 | return phone2str(match.group(0)) 64 | -------------------------------------------------------------------------------- /cosyvoice/hifigan/f0_predictor.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Kai Hu) 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | import torch 15 | import torch.nn as nn 16 | from torch.nn.utils import weight_norm 17 | 18 | 19 | class ConvRNNF0Predictor(nn.Module): 20 | def __init__(self, 21 | num_class: int = 1, 22 | in_channels: int = 80, 23 | cond_channels: int = 512 24 | ): 25 | super().__init__() 26 | 27 | self.num_class = num_class 28 | self.condnet = nn.Sequential( 29 | weight_norm( 30 | nn.Conv1d(in_channels, cond_channels, kernel_size=3, padding=1) 31 | ), 32 | nn.ELU(), 33 | weight_norm( 34 | nn.Conv1d(cond_channels, cond_channels, kernel_size=3, padding=1) 35 | ), 36 | nn.ELU(), 37 | weight_norm( 38 | nn.Conv1d(cond_channels, cond_channels, kernel_size=3, padding=1) 39 | ), 40 | nn.ELU(), 41 | weight_norm( 42 | nn.Conv1d(cond_channels, cond_channels, kernel_size=3, padding=1) 43 | ), 44 | nn.ELU(), 45 | weight_norm( 46 | nn.Conv1d(cond_channels, cond_channels, kernel_size=3, padding=1) 47 | ), 48 | nn.ELU(), 49 | ) 50 | self.classifier = nn.Linear(in_features=cond_channels, out_features=self.num_class) 51 | 52 | def forward(self, x: torch.Tensor) -> torch.Tensor: 53 | x = self.condnet(x) 54 | x = x.transpose(1, 2) 55 | return torch.abs(self.classifier(x).squeeze(-1)) 56 | -------------------------------------------------------------------------------- /academicodec/models/soundstream/dataset.py: -------------------------------------------------------------------------------- 1 | # 和 Encodec* 的 dataset.py 有点类似但是不完全一样 2 | # 主要是 prob > 0.7 的时候多了 ans2 3 | import glob 4 | import random 5 | 6 | import torch 7 | import torchaudio 8 | from torch.utils.data import Dataset 9 | 10 | 11 | class NSynthDataset(Dataset): 12 | """Dataset to load NSynth data.""" 13 | 14 | def __init__(self, audio_dir): 15 | super().__init__() 16 | self.filenames = [] 17 | self.filenames.extend(glob.glob(audio_dir + "/*.wav")) 18 | print(len(self.filenames)) 19 | _, self.sr = torchaudio.load(self.filenames[0]) 20 | self.max_len = 24000 # 24000 21 | 22 | def __len__(self): 23 | return len(self.filenames) 24 | 25 | def 
__getitem__(self, index): 26 | #print(self.filenames[index]) 27 | prob = random.random() # (0,1) 28 | if prob > 0.7: 29 | # data augmentation 30 | ans1 = torch.zeros(1, self.max_len) 31 | ans2 = torch.zeros(1, self.max_len) 32 | audio1 = torchaudio.load(self.filenames[index])[0] 33 | index2 = random.randint(0, len(self.filenames) - 1) 34 | audio2 = torchaudio.load(self.filenames[index2])[0] 35 | if audio1.shape[1] > self.max_len: 36 | st = random.randint(0, audio1.shape[1] - self.max_len - 1) 37 | ed = st + self.max_len 38 | ans1 = audio1[:, st:ed] 39 | else: 40 | ans1[:, :audio1.shape[1]] = audio1 41 | if audio2.shape[1] > self.max_len: 42 | st = random.randint(0, audio2.shape[1] - self.max_len - 1) 43 | ed = st + self.max_len 44 | ans2 = audio2[:, st:ed] 45 | else: 46 | ans2[:, :audio2.shape[1]] = audio2 47 | ans = ans1 + ans2 48 | return ans 49 | else: 50 | ans = torch.zeros(1, self.max_len) 51 | audio = torchaudio.load(self.filenames[index])[0] 52 | if audio.shape[1] > self.max_len: 53 | st = random.randint(0, audio.shape[1] - self.max_len - 1) 54 | ed = st + self.max_len 55 | return audio[:, st:ed] 56 | else: 57 | ans[:, :audio.shape[1]] = audio 58 | return ans 59 | -------------------------------------------------------------------------------- /academicodec/models/encodec/net3.py: -------------------------------------------------------------------------------- 1 | import math 2 | import random 3 | 4 | import numpy as np 5 | import torch.nn as nn 6 | from academicodec.modules.seanet import SEANetDecoder 7 | from academicodec.modules.seanet import SEANetEncoder 8 | from academicodec.quantization import ResidualVectorQuantizer 9 | 10 | 11 | # Generator 12 | class SoundStream(nn.Module): 13 | def __init__(self, 14 | n_filters, 15 | D, 16 | target_bandwidths=[7.5, 15], 17 | ratios=[8, 5, 4, 2], 18 | sample_rate=24000, 19 | bins=1024, 20 | normalize=False): 21 | super().__init__() 22 | self.hop_length = np.prod(ratios) # 计算乘积 23 | self.encoder = SEANetEncoder( 24 | n_filters=n_filters, dimension=D, ratios=ratios) 25 | n_q = int(1000 * target_bandwidths[-1] // 26 | (math.ceil(sample_rate / self.hop_length) * 10)) 27 | self.frame_rate = math.ceil(sample_rate / np.prod(ratios)) # 75 28 | self.bits_per_codebook = int(math.log2(bins)) 29 | self.target_bandwidths = target_bandwidths 30 | self.quantizer = ResidualVectorQuantizer( 31 | dimension=D, n_q=n_q, bins=bins) 32 | self.decoder = SEANetDecoder( 33 | n_filters=n_filters, dimension=D, ratios=ratios) 34 | 35 | def get_last_layer(self): 36 | return self.decoder.layers[-1].weight 37 | 38 | def forward(self, x): 39 | e = self.encoder(x) 40 | max_idx = len(self.target_bandwidths) - 1 41 | bw = self.target_bandwidths[random.randint(0, max_idx)] 42 | quantized, codes, bandwidth, commit_loss = self.quantizer( 43 | e, self.frame_rate, bw) 44 | o = self.decoder(quantized) 45 | return o, commit_loss, None 46 | 47 | def encode(self, x, target_bw=None, st=None): 48 | e = self.encoder(x) 49 | if target_bw is None: 50 | bw = self.target_bandwidths[-1] 51 | else: 52 | bw = target_bw 53 | if st is None: 54 | st = 0 55 | codes = self.quantizer.encode(e, self.frame_rate, bw, st) 56 | return codes 57 | 58 | def decode(self, codes): 59 | quantized = self.quantizer.decode(codes) 60 | o = self.decoder(quantized) 61 | return o 62 | -------------------------------------------------------------------------------- /cosyvoice/cli/zh_normalization/constants.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 
PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | import re 15 | import string 16 | 17 | from pypinyin.constants import SUPPORT_UCS4 18 | 19 | # 全角半角转换 20 | # 英文字符全角 -> 半角映射表 (num: 52) 21 | F2H_ASCII_LETTERS = { 22 | ord(char) + 65248: ord(char) 23 | for char in string.ascii_letters 24 | } 25 | 26 | # 英文字符半角 -> 全角映射表 27 | H2F_ASCII_LETTERS = {value: key for key, value in F2H_ASCII_LETTERS.items()} 28 | 29 | # 数字字符全角 -> 半角映射表 (num: 10) 30 | F2H_DIGITS = {ord(char) + 65248: ord(char) for char in string.digits} 31 | # 数字字符半角 -> 全角映射表 32 | H2F_DIGITS = {value: key for key, value in F2H_DIGITS.items()} 33 | 34 | # 标点符号全角 -> 半角映射表 (num: 32) 35 | F2H_PUNCTUATIONS = {ord(char) + 65248: ord(char) for char in string.punctuation} 36 | # 标点符号半角 -> 全角映射表 37 | H2F_PUNCTUATIONS = {value: key for key, value in F2H_PUNCTUATIONS.items()} 38 | 39 | # 空格 (num: 1) 40 | F2H_SPACE = {'\u3000': ' '} 41 | H2F_SPACE = {' ': '\u3000'} 42 | 43 | # 非"有拼音的汉字"的字符串,可用于NSW提取 44 | if SUPPORT_UCS4: 45 | RE_NSW = re.compile(r'(?:[^' 46 | r'\u3007' # 〇 47 | r'\u3400-\u4dbf' # CJK扩展A:[3400-4DBF] 48 | r'\u4e00-\u9fff' # CJK基本:[4E00-9FFF] 49 | r'\uf900-\ufaff' # CJK兼容:[F900-FAFF] 50 | r'\U00020000-\U0002A6DF' # CJK扩展B:[20000-2A6DF] 51 | r'\U0002A703-\U0002B73F' # CJK扩展C:[2A700-2B73F] 52 | r'\U0002B740-\U0002B81D' # CJK扩展D:[2B740-2B81D] 53 | r'\U0002F80A-\U0002FA1F' # CJK兼容扩展:[2F800-2FA1F] 54 | r'])+') 55 | else: 56 | RE_NSW = re.compile( # pragma: no cover 57 | r'(?:[^' 58 | r'\u3007' # 〇 59 | r'\u3400-\u4dbf' # CJK扩展A:[3400-4DBF] 60 | r'\u4e00-\u9fff' # CJK基本:[4E00-9FFF] 61 | r'\uf900-\ufaff' # CJK兼容:[F900-FAFF] 62 | r'])+') 63 | -------------------------------------------------------------------------------- /cosyvoice/utils/class_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright [2023-11-28] 2 | # 2024 Alibaba Inc (authors: Xiang Lyu) 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | import torch 16 | 17 | from cosyvoice.transformer.activation import Swish 18 | from cosyvoice.transformer.subsampling import ( 19 | LinearNoSubsampling, 20 | EmbedinigNoSubsampling, 21 | Conv1dSubsampling2, 22 | Conv2dSubsampling4, 23 | Conv2dSubsampling6, 24 | Conv2dSubsampling8, 25 | ) 26 | from cosyvoice.transformer.embedding import (PositionalEncoding, 27 | RelPositionalEncoding, 28 | WhisperPositionalEncoding, 29 | LearnablePositionalEncoding, 30 | NoPositionalEncoding) 31 | from cosyvoice.transformer.attention import (MultiHeadedAttention, 32 | RelPositionMultiHeadedAttention) 33 | from cosyvoice.transformer.embedding import EspnetRelPositionalEncoding 34 | from cosyvoice.transformer.subsampling import LegacyLinearNoSubsampling 35 | 36 | 37 | COSYVOICE_ACTIVATION_CLASSES = { 38 | "hardtanh": torch.nn.Hardtanh, 39 | "tanh": torch.nn.Tanh, 40 | "relu": torch.nn.ReLU, 41 | "selu": torch.nn.SELU, 42 | "swish": getattr(torch.nn, "SiLU", Swish), 43 | "gelu": torch.nn.GELU, 44 | } 45 | 46 | COSYVOICE_SUBSAMPLE_CLASSES = { 47 | "linear": LinearNoSubsampling, 48 | "linear_legacy": LegacyLinearNoSubsampling, 49 | "embed": EmbedinigNoSubsampling, 50 | "conv1d2": Conv1dSubsampling2, 51 | "conv2d": Conv2dSubsampling4, 52 | "conv2d6": Conv2dSubsampling6, 53 | "conv2d8": Conv2dSubsampling8, 54 | 'paraformer_dummy': torch.nn.Identity 55 | } 56 | 57 | COSYVOICE_EMB_CLASSES = { 58 | "embed": PositionalEncoding, 59 | "abs_pos": PositionalEncoding, 60 | "rel_pos": RelPositionalEncoding, 61 | "rel_pos_espnet": EspnetRelPositionalEncoding, 62 | "no_pos": NoPositionalEncoding, 63 | "abs_pos_whisper": WhisperPositionalEncoding, 64 | "embed_learnable_pe": LearnablePositionalEncoding, 65 | } 66 | 67 | COSYVOICE_ATTENTION_CLASSES = { 68 | "selfattn": MultiHeadedAttention, 69 | "rel_selfattn": RelPositionMultiHeadedAttention, 70 | } 71 | -------------------------------------------------------------------------------- /matcha/hifigan/denoiser.py: -------------------------------------------------------------------------------- 1 | # Code modified from Rafael Valle's implementation https://github.com/NVIDIA/waveglow/blob/5bc2a53e20b3b533362f974cfa1ea0267ae1c2b1/denoiser.py 2 | 3 | """Waveglow style denoiser can be used to remove the artifacts from the HiFiGAN generated audio.""" 4 | import torch 5 | 6 | 7 | class Denoiser(torch.nn.Module): 8 | """Removes model bias from audio produced with waveglow""" 9 | 10 | def __init__(self, vocoder, filter_length=1024, n_overlap=4, win_length=1024, mode="zeros"): 11 | super().__init__() 12 | self.filter_length = filter_length 13 | self.hop_length = int(filter_length / n_overlap) 14 | self.win_length = win_length 15 | 16 | dtype, device = next(vocoder.parameters()).dtype, next(vocoder.parameters()).device 17 | self.device = device 18 | if mode == "zeros": 19 | mel_input = torch.zeros((1, 80, 88), dtype=dtype, device=device) 20 | elif mode == "normal": 21 | mel_input = torch.randn((1, 80, 88), dtype=dtype, device=device) 22 | else: 23 | raise Exception(f"Mode {mode} if not supported") 24 | 25 | def stft_fn(audio, n_fft, hop_length, win_length, window): 26 | spec = torch.stft( 27 | audio, 28 | n_fft=n_fft, 29 | hop_length=hop_length, 30 | win_length=win_length, 31 | window=window, 32 | return_complex=True, 33 | ) 34 | spec = torch.view_as_real(spec) 35 | return torch.sqrt(spec.pow(2).sum(-1)), torch.atan2(spec[..., -1], spec[..., 0]) 36 | 37 | self.stft = lambda x: stft_fn( 38 | audio=x, 39 | n_fft=self.filter_length, 40 | hop_length=self.hop_length, 41 | 
win_length=self.win_length, 42 | window=torch.hann_window(self.win_length, device=device), 43 | ) 44 | self.istft = lambda x, y: torch.istft( 45 | torch.complex(x * torch.cos(y), x * torch.sin(y)), 46 | n_fft=self.filter_length, 47 | hop_length=self.hop_length, 48 | win_length=self.win_length, 49 | window=torch.hann_window(self.win_length, device=device), 50 | ) 51 | 52 | with torch.no_grad(): 53 | bias_audio = vocoder(mel_input).float().squeeze(0) 54 | bias_spec, _ = self.stft(bias_audio) 55 | 56 | self.register_buffer("bias_spec", bias_spec[:, :, 0][:, :, None]) 57 | 58 | @torch.inference_mode() 59 | def forward(self, audio, strength=0.0005): 60 | audio_spec, audio_angles = self.stft(audio) 61 | audio_spec_denoised = audio_spec - self.bias_spec.to(audio.device) * strength 62 | audio_spec_denoised = torch.clamp(audio_spec_denoised, 0.0) 63 | audio_denoised = self.istft(audio_spec_denoised, audio_angles) 64 | return audio_denoised 65 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # cosyvoice_simple_api 2 | 3 | ## Project Overview 4 | 5 | `cosyvoice_simple_api` is a simple text-to-speech API server project developed based on Alibaba's CosyVoice. It allows users to easily convert text into emotionally rich voice output, suitable for creating audiobooks, automated voice response systems, and other text-to-speech applications. 6 | 7 | ### Project Addresses 8 | 9 | - CosyVoice Source Address: [FunAudioLLM/CosyVoice](https://github.com/FunAudioLLM/CosyVoice) 10 | - CosyVoice Windows Adaptation (Special thanks to Liu Yue): [v3ucn/CosyVoice_For_Windows](https://github.com/v3ucn/CosyVoice_For_Windows) 11 | - This Project Address: [swordswind/cosyvoice_simple_api](https://github.com/swordswind/cosyvoice_simple_api) 12 | 13 | ## Running Method 14 | 15 | 1. Ensure that a Python environment is installed in your system. 16 | 2. Obtain the project code via `git clone` or by downloading the ZIP file. 17 | 3. In the project root directory, run the following command to install dependencies: 18 | 19 | ```bash 20 | pip install -r requirements.txt 21 | ``` 22 | 23 | 4. Run the following command in the command line to start the server: 24 | 25 | ```bash 26 | python api.py 27 | ``` 28 | 29 | ## Server Address 30 | 31 | The CosyVoice text-to-speech API server address is: `http://your-computer-IP:9881/` 32 | 33 | ## API Interface 34 | 35 | ### Interface Address 36 | 37 | ``` 38 | /cosyvoice/ 39 | ``` 40 | 41 | ### Request Method 42 | 43 | ``` 44 | GET 45 | ``` 46 | 47 | ### Request Parameters 48 | 49 | - `text`: Required, the main text to be synthesized. 50 | 51 | ## Usage Example 52 | 53 | 1. Enter the following address in the browser's address bar: 54 | 55 | ``` 56 | http://127.0.0.1:9881/cosyvoice/?text=Hello, nice to meet you 57 | ``` 58 | 59 | 2. Press Enter, and the server will return a response in the format of a wav audio file. 60 | 61 | ## Changing Reference Audio and Reference Audio Text 62 | 63 | 1. Replace `example.wav` with your custom reference audio, keeping the file name unchanged. 64 | 2. Open `example_reference_audio_text.txt` with Notepad and modify it to your new custom reference audio text. 65 | 3. After modification, save the file and rerun the `CosyVoice Text-to-Speech API Server.bat` file. 66 | 67 | ## Technology Stack 68 | 69 | - FastAPI: Used for building the API server. 70 | - ModelScope: A library related to models. 
71 | - Torch: PyTorch, used for deep learning models. 72 | - TorchAudio: Used for audio processing. 73 | - Uvicorn: ASGI server, used to run FastAPI applications. 74 | 75 | ## Contribution 76 | 77 | Contributions to this project are welcome, including but not limited to fixing bugs, adding new features, and improving documentation. Before submitting a Pull Request, please ensure that your code passes all tests and adheres to the project's coding style. 78 | 79 | ## License 80 | 81 | This project is licensed under the [MIT License](LICENSE). -------------------------------------------------------------------------------- /cosyvoice/transformer/activation.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 Johns Hopkins University (Shinji Watanabe) 2 | # 2020 Northwestern Polytechnical University (Pengcheng Guo) 3 | # 2020 Mobvoi Inc (Binbin Zhang) 4 | # 2024 Alibaba Inc (Xiang Lyu) 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | """Swish() activation function for Conformer.""" 18 | 19 | import torch 20 | from torch import nn, sin, pow 21 | from torch.nn import Parameter 22 | 23 | 24 | class Swish(torch.nn.Module): 25 | """Construct an Swish object.""" 26 | 27 | def forward(self, x: torch.Tensor) -> torch.Tensor: 28 | """Return Swish activation function.""" 29 | return x * torch.sigmoid(x) 30 | 31 | 32 | # Implementation adapted from https://github.com/EdwardDixon/snake under the MIT license. 33 | # LICENSE is in incl_licenses directory. 34 | class Snake(nn.Module): 35 | ''' 36 | Implementation of a sine-based periodic activation function 37 | Shape: 38 | - Input: (B, C, T) 39 | - Output: (B, C, T), same shape as the input 40 | Parameters: 41 | - alpha - trainable parameter 42 | References: 43 | - This activation function is from this paper by Liu Ziyin, Tilman Hartwig, Masahito Ueda: 44 | https://arxiv.org/abs/2006.08195 45 | Examples: 46 | >>> a1 = snake(256) 47 | >>> x = torch.randn(256) 48 | >>> x = a1(x) 49 | ''' 50 | def __init__(self, in_features, alpha=1.0, alpha_trainable=True, alpha_logscale=False): 51 | ''' 52 | Initialization. 53 | INPUT: 54 | - in_features: shape of the input 55 | - alpha: trainable parameter 56 | alpha is initialized to 1 by default, higher values = higher-frequency. 57 | alpha will be trained along with the rest of your model. 58 | ''' 59 | super(Snake, self).__init__() 60 | self.in_features = in_features 61 | 62 | # initialize alpha 63 | self.alpha_logscale = alpha_logscale 64 | if self.alpha_logscale: # log scale alphas initialized to zeros 65 | self.alpha = Parameter(torch.zeros(in_features) * alpha) 66 | else: # linear scale alphas initialized to ones 67 | self.alpha = Parameter(torch.ones(in_features) * alpha) 68 | 69 | self.alpha.requires_grad = alpha_trainable 70 | 71 | self.no_div_by_zero = 0.000000001 72 | 73 | def forward(self, x): 74 | ''' 75 | Forward pass of the function. 76 | Applies the function to the input elementwise. 
77 | Snake ∶= x + 1/a * sin^2 (xa) 78 | ''' 79 | alpha = self.alpha.unsqueeze(0).unsqueeze(-1) # line up with x to [B, C, T] 80 | if self.alpha_logscale: 81 | alpha = torch.exp(alpha) 82 | x = x + (1.0 / (alpha + self.no_div_by_zero)) * pow(sin(x * alpha), 2) 83 | 84 | return x 85 | -------------------------------------------------------------------------------- /cosyvoice/utils/common.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) 2 | # 2024 Alibaba Inc (authors: Xiang Lyu) 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # Modified from ESPnet(https://github.com/espnet/espnet) 16 | """Unility functions for Transformer.""" 17 | 18 | from typing import List 19 | 20 | import torch 21 | 22 | IGNORE_ID = -1 23 | 24 | 25 | def pad_list(xs: List[torch.Tensor], pad_value: int): 26 | """Perform padding for the list of tensors. 27 | 28 | Args: 29 | xs (List): List of Tensors [(T_1, `*`), (T_2, `*`), ..., (T_B, `*`)]. 30 | pad_value (float): Value for padding. 31 | 32 | Returns: 33 | Tensor: Padded tensor (B, Tmax, `*`). 34 | 35 | Examples: 36 | >>> x = [torch.ones(4), torch.ones(2), torch.ones(1)] 37 | >>> x 38 | [tensor([1., 1., 1., 1.]), tensor([1., 1.]), tensor([1.])] 39 | >>> pad_list(x, 0) 40 | tensor([[1., 1., 1., 1.], 41 | [1., 1., 0., 0.], 42 | [1., 0., 0., 0.]]) 43 | 44 | """ 45 | max_len = max([len(item) for item in xs]) 46 | batchs = len(xs) 47 | ndim = xs[0].ndim 48 | if ndim == 1: 49 | pad_res = torch.zeros(batchs, 50 | max_len, 51 | dtype=xs[0].dtype, 52 | device=xs[0].device) 53 | elif ndim == 2: 54 | pad_res = torch.zeros(batchs, 55 | max_len, 56 | xs[0].shape[1], 57 | dtype=xs[0].dtype, 58 | device=xs[0].device) 59 | elif ndim == 3: 60 | pad_res = torch.zeros(batchs, 61 | max_len, 62 | xs[0].shape[1], 63 | xs[0].shape[2], 64 | dtype=xs[0].dtype, 65 | device=xs[0].device) 66 | else: 67 | raise ValueError(f"Unsupported ndim: {ndim}") 68 | pad_res.fill_(pad_value) 69 | for i in range(batchs): 70 | pad_res[i, :len(xs[i])] = xs[i] 71 | return pad_res 72 | 73 | 74 | def th_accuracy(pad_outputs: torch.Tensor, pad_targets: torch.Tensor, 75 | ignore_label: int) -> torch.Tensor: 76 | """Calculate accuracy. 77 | 78 | Args: 79 | pad_outputs (Tensor): Prediction tensors (B * Lmax, D). 80 | pad_targets (LongTensor): Target label tensors (B, Lmax). 81 | ignore_label (int): Ignore label id. 82 | 83 | Returns: 84 | torch.Tensor: Accuracy value (0.0 - 1.0). 
85 | 86 | """ 87 | pad_pred = pad_outputs.view(pad_targets.size(0), pad_targets.size(1), 88 | pad_outputs.size(1)).argmax(2) 89 | mask = pad_targets != ignore_label 90 | numerator = torch.sum( 91 | pad_pred.masked_select(mask) == pad_targets.masked_select(mask)) 92 | denominator = torch.sum(mask) 93 | return (numerator / denominator).detach() 94 | -------------------------------------------------------------------------------- /academicodec/models/encodec/distributed/launch.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------ 2 | # Diffsound 3 | # code based https://github.com/cientgu/VQ-Diffusion 4 | # ------------------------------------------ 5 | import distributed.distributed as dist_fn 6 | import torch 7 | from torch import distributed as dist 8 | from torch import multiprocessing as mp 9 | 10 | # import distributed as dist_fn 11 | 12 | 13 | def find_free_port(): 14 | import socket 15 | 16 | sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) 17 | 18 | sock.bind(("", 0)) 19 | port = sock.getsockname()[1] 20 | sock.close() 21 | 22 | return port 23 | 24 | 25 | def launch(fn, 26 | n_gpu_per_machine, 27 | n_machine=1, 28 | machine_rank=0, 29 | dist_url=None, 30 | args=()): 31 | world_size = n_machine * n_gpu_per_machine 32 | 33 | if world_size > 1: 34 | # if "OMP_NUM_THREADS" not in os.environ: 35 | # os.environ["OMP_NUM_THREADS"] = "1" 36 | if dist_url == "auto": 37 | if n_machine != 1: 38 | raise ValueError( 39 | 'dist_url="auto" not supported in multi-machine jobs') 40 | port = find_free_port() 41 | dist_url = f"tcp://127.0.0.1:{port}" 42 | print('dist_url ', dist_url) 43 | print('n_machine ', n_machine) 44 | print('args ', args) 45 | print('world_size ', world_size) 46 | print('machine_rank ', machine_rank) 47 | if n_machine > 1 and dist_url.startswith("file://"): 48 | raise ValueError( 49 | "file:// is not a reliable init method in multi-machine jobs. Prefer tcp://" 50 | ) 51 | 52 | mp.spawn( 53 | distributed_worker, 54 | nprocs=n_gpu_per_machine, 55 | args=(fn, world_size, n_gpu_per_machine, machine_rank, dist_url, 56 | args), 57 | daemon=False, ) 58 | # n_machine ? world_size 59 | else: 60 | local_rank = 0 61 | fn(local_rank, *args) 62 | 63 | 64 | def distributed_worker(local_rank, fn, world_size, n_gpu_per_machine, 65 | machine_rank, dist_url, args): 66 | if not torch.cuda.is_available(): 67 | raise OSError("CUDA is not available. 
Please check your environments") 68 | 69 | global_rank = machine_rank * n_gpu_per_machine + local_rank 70 | print('local_rank ', local_rank) 71 | print('global_rank ', global_rank) 72 | try: 73 | dist.init_process_group( 74 | backend="NCCL", 75 | init_method=dist_url, 76 | world_size=world_size, 77 | rank=global_rank, ) 78 | 79 | except Exception: 80 | raise OSError("failed to initialize NCCL groups") 81 | 82 | # changed 83 | dist_fn.synchronize() 84 | 85 | if n_gpu_per_machine > torch.cuda.device_count(): 86 | raise ValueError( 87 | f"specified n_gpu_per_machine larger than available device ({torch.cuda.device_count()})" 88 | ) 89 | 90 | torch.cuda.set_device(local_rank) 91 | 92 | if dist_fn.LOCAL_PROCESS_GROUP is not None: 93 | raise ValueError("torch.distributed.LOCAL_PROCESS_GROUP is not None") 94 | 95 | # change paert 96 | 97 | n_machine = world_size // n_gpu_per_machine 98 | for i in range(n_machine): 99 | ranks_on_i = list( 100 | range(i * n_gpu_per_machine, (i + 1) * n_gpu_per_machine)) 101 | pg = dist.new_group(ranks_on_i) 102 | 103 | if i == machine_rank: 104 | dist_fn.LOCAL_PROCESS_GROUP = pg 105 | 106 | fn(local_rank, *args) 107 | -------------------------------------------------------------------------------- /cosyvoice/transformer/label_smoothing_loss.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019 Shigeki Karita 2 | # 2020 Mobvoi Inc (Binbin Zhang) 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Label smoothing module.""" 16 | 17 | import torch 18 | from torch import nn 19 | 20 | 21 | class LabelSmoothingLoss(nn.Module): 22 | """Label-smoothing loss. 23 | 24 | In a standard CE loss, the label's data distribution is: 25 | [0,1,2] -> 26 | [ 27 | [1.0, 0.0, 0.0], 28 | [0.0, 1.0, 0.0], 29 | [0.0, 0.0, 1.0], 30 | ] 31 | 32 | In the smoothing version CE Loss,some probabilities 33 | are taken from the true label prob (1.0) and are divided 34 | among other labels. 35 | 36 | e.g. 
37 | smoothing=0.1 38 | [0,1,2] -> 39 | [ 40 | [0.9, 0.05, 0.05], 41 | [0.05, 0.9, 0.05], 42 | [0.05, 0.05, 0.9], 43 | ] 44 | 45 | Args: 46 | size (int): the number of class 47 | padding_idx (int): padding class id which will be ignored for loss 48 | smoothing (float): smoothing rate (0.0 means the conventional CE) 49 | normalize_length (bool): 50 | normalize loss by sequence length if True 51 | normalize loss by batch size if False 52 | """ 53 | 54 | def __init__(self, 55 | size: int, 56 | padding_idx: int, 57 | smoothing: float, 58 | normalize_length: bool = False): 59 | """Construct an LabelSmoothingLoss object.""" 60 | super(LabelSmoothingLoss, self).__init__() 61 | self.criterion = nn.KLDivLoss(reduction="none") 62 | self.padding_idx = padding_idx 63 | self.confidence = 1.0 - smoothing 64 | self.smoothing = smoothing 65 | self.size = size 66 | self.normalize_length = normalize_length 67 | 68 | def forward(self, x: torch.Tensor, target: torch.Tensor) -> torch.Tensor: 69 | """Compute loss between x and target. 70 | 71 | The model outputs and data labels tensors are flatten to 72 | (batch*seqlen, class) shape and a mask is applied to the 73 | padding part which should not be calculated for loss. 74 | 75 | Args: 76 | x (torch.Tensor): prediction (batch, seqlen, class) 77 | target (torch.Tensor): 78 | target signal masked with self.padding_id (batch, seqlen) 79 | Returns: 80 | loss (torch.Tensor) : The KL loss, scalar float value 81 | """ 82 | assert x.size(2) == self.size 83 | batch_size = x.size(0) 84 | x = x.view(-1, self.size) 85 | target = target.view(-1) 86 | # use zeros_like instead of torch.no_grad() for true_dist, 87 | # since no_grad() can not be exported by JIT 88 | true_dist = torch.zeros_like(x) 89 | true_dist.fill_(self.smoothing / (self.size - 1)) 90 | ignore = target == self.padding_idx # (B,) 91 | total = len(target) - ignore.sum().item() 92 | target = target.masked_fill(ignore, 0) # avoid -1 index 93 | true_dist.scatter_(1, target.unsqueeze(1), self.confidence) 94 | kl = self.criterion(torch.log_softmax(x, dim=1), true_dist) 95 | denom = total if self.normalize_length else batch_size 96 | return kl.masked_fill(ignore.unsqueeze(1), 0).sum() / denom 97 | -------------------------------------------------------------------------------- /cosyvoice/cli/zh_normalization/chronology.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
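A minimal usage sketch for the chronology helpers defined just below (the pattern/handler pairs such as RE_TIME/replace_time and RE_DATE2/replace_date2). The calling text_normlization module is assumed to apply them via Pattern.sub; here they are exercised directly on a toy string, as an illustration rather than the module's actual pipeline:

    # Hedged sketch: apply the regex/handler pairs from chronology.py directly.
    from cosyvoice.cli.zh_normalization.chronology import (
        RE_DATE2, RE_TIME, replace_date2, replace_time)

    text = "会议定于2024-01-05的8:30开始"
    text = RE_DATE2.sub(replace_date2, text)  # 2024-01-05 -> 二零二四年一月五日
    text = RE_TIME.sub(replace_time, text)    # 8:30 -> 八点半
    print(text)
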
14 | import re 15 | 16 | from .num import DIGITS 17 | from .num import num2str 18 | from .num import verbalize_cardinal 19 | from .num import verbalize_digit 20 | 21 | 22 | def _time_num2str(num_string: str) -> str: 23 | """A special case for verbalizing number in time.""" 24 | result = num2str(num_string.lstrip('0')) 25 | if num_string.startswith('0'): 26 | result = DIGITS['0'] + result 27 | return result 28 | 29 | 30 | # 时刻表达式 31 | RE_TIME = re.compile(r'([0-1]?[0-9]|2[0-3])' 32 | r':([0-5][0-9])' 33 | r'(:([0-5][0-9]))?') 34 | 35 | # 时间范围,如8:30-12:30 36 | RE_TIME_RANGE = re.compile(r'([0-1]?[0-9]|2[0-3])' 37 | r':([0-5][0-9])' 38 | r'(:([0-5][0-9]))?' 39 | r'(~|-)' 40 | r'([0-1]?[0-9]|2[0-3])' 41 | r':([0-5][0-9])' 42 | r'(:([0-5][0-9]))?') 43 | 44 | 45 | def replace_time(match) -> str: 46 | """ 47 | Args: 48 | match (re.Match) 49 | Returns: 50 | str 51 | """ 52 | 53 | is_range = len(match.groups()) > 5 54 | 55 | hour = match.group(1) 56 | minute = match.group(2) 57 | second = match.group(4) 58 | 59 | if is_range: 60 | hour_2 = match.group(6) 61 | minute_2 = match.group(7) 62 | second_2 = match.group(9) 63 | 64 | result = f"{num2str(hour)}点" 65 | if minute.lstrip('0'): 66 | if int(minute) == 30: 67 | result += "半" 68 | else: 69 | result += f"{_time_num2str(minute)}分" 70 | if second and second.lstrip('0'): 71 | result += f"{_time_num2str(second)}秒" 72 | 73 | if is_range: 74 | result += "至" 75 | result += f"{num2str(hour_2)}点" 76 | if minute_2.lstrip('0'): 77 | if int(minute) == 30: 78 | result += "半" 79 | else: 80 | result += f"{_time_num2str(minute_2)}分" 81 | if second_2 and second_2.lstrip('0'): 82 | result += f"{_time_num2str(second_2)}秒" 83 | 84 | return result 85 | 86 | 87 | RE_DATE = re.compile(r'(\d{4}|\d{2})年' 88 | r'((0?[1-9]|1[0-2])月)?' 89 | r'(((0?[1-9])|((1|2)[0-9])|30|31)([日号]))?') 90 | 91 | 92 | def replace_date(match) -> str: 93 | """ 94 | Args: 95 | match (re.Match) 96 | Returns: 97 | str 98 | """ 99 | year = match.group(1) 100 | month = match.group(3) 101 | day = match.group(5) 102 | result = "" 103 | if year: 104 | result += f"{verbalize_digit(year)}年" 105 | if month: 106 | result += f"{verbalize_cardinal(month)}月" 107 | if day: 108 | result += f"{verbalize_cardinal(day)}{match.group(9)}" 109 | return result 110 | 111 | 112 | # 用 / 或者 - 分隔的 YY/MM/DD 或者 YY-MM-DD 日期 113 | RE_DATE2 = re.compile( 114 | r'(\d{4})([- /.])(0[1-9]|1[012])\2(0[1-9]|[12][0-9]|3[01])') 115 | 116 | 117 | def replace_date2(match) -> str: 118 | """ 119 | Args: 120 | match (re.Match) 121 | Returns: 122 | str 123 | """ 124 | year = match.group(1) 125 | month = match.group(3) 126 | day = match.group(4) 127 | result = "" 128 | if year: 129 | result += f"{verbalize_digit(year)}年" 130 | if month: 131 | result += f"{verbalize_cardinal(month)}月" 132 | if day: 133 | result += f"{verbalize_cardinal(day)}日" 134 | return result 135 | -------------------------------------------------------------------------------- /academicodec/models/encodec/distributed/distributed.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------ 2 | # Diffsound 3 | # code based https://github.com/cientgu/VQ-Diffusion 4 | # ------------------------------------------ 5 | import pickle 6 | 7 | import torch 8 | from torch import distributed as dist 9 | from torch.utils import data 10 | 11 | LOCAL_PROCESS_GROUP = None 12 | 13 | 14 | def is_primary(): 15 | return get_rank() == 0 16 | 17 | 18 | def get_rank(): 19 | if not dist.is_available(): 20 | return 0 21 | 22 | 
if not dist.is_initialized(): 23 | return 0 24 | 25 | return dist.get_rank() 26 | 27 | 28 | def get_local_rank(): 29 | if not dist.is_available(): 30 | return 0 31 | 32 | if not dist.is_initialized(): 33 | return 0 34 | 35 | if LOCAL_PROCESS_GROUP is None: 36 | raise ValueError("tensorfn.distributed.LOCAL_PROCESS_GROUP is None") 37 | 38 | return dist.get_rank(group=LOCAL_PROCESS_GROUP) 39 | 40 | 41 | def synchronize(): 42 | if not dist.is_available(): 43 | return 44 | 45 | if not dist.is_initialized(): 46 | return 47 | 48 | world_size = dist.get_world_size() 49 | 50 | if world_size == 1: 51 | return 52 | 53 | dist.barrier() 54 | 55 | 56 | def get_world_size(): 57 | if not dist.is_available(): 58 | return 1 59 | 60 | if not dist.is_initialized(): 61 | return 1 62 | 63 | return dist.get_world_size() 64 | 65 | 66 | def is_distributed(): 67 | raise RuntimeError('Please debug this function!') 68 | return get_world_size() > 1 69 | 70 | 71 | def all_reduce(tensor, op=dist.ReduceOp.SUM, async_op=False): 72 | world_size = get_world_size() 73 | 74 | if world_size == 1: 75 | return tensor 76 | dist.all_reduce(tensor, op=op, async_op=async_op) 77 | 78 | return tensor 79 | 80 | 81 | def all_gather(data): 82 | world_size = get_world_size() 83 | 84 | if world_size == 1: 85 | return [data] 86 | 87 | buffer = pickle.dumps(data) 88 | storage = torch.ByteStorage.from_buffer(buffer) 89 | tensor = torch.ByteTensor(storage).to("cuda") 90 | 91 | local_size = torch.IntTensor([tensor.numel()]).to("cuda") 92 | size_list = [torch.IntTensor([1]).to("cuda") for _ in range(world_size)] 93 | dist.all_gather(size_list, local_size) 94 | size_list = [int(size.item()) for size in size_list] 95 | max_size = max(size_list) 96 | 97 | tensor_list = [] 98 | for _ in size_list: 99 | tensor_list.append(torch.ByteTensor(size=(max_size, )).to("cuda")) 100 | 101 | if local_size != max_size: 102 | padding = torch.ByteTensor(size=(max_size - local_size, )).to("cuda") 103 | tensor = torch.cat((tensor, padding), 0) 104 | 105 | dist.all_gather(tensor_list, tensor) 106 | 107 | data_list = [] 108 | 109 | for size, tensor in zip(size_list, tensor_list): 110 | buffer = tensor.cpu().numpy().tobytes()[:size] 111 | data_list.append(pickle.loads(buffer)) 112 | 113 | return data_list 114 | 115 | 116 | def reduce_dict(input_dict, average=True): 117 | world_size = get_world_size() 118 | 119 | if world_size < 2: 120 | return input_dict 121 | 122 | with torch.no_grad(): 123 | keys = [] 124 | values = [] 125 | 126 | for k in sorted(input_dict.keys()): 127 | keys.append(k) 128 | values.append(input_dict[k]) 129 | 130 | values = torch.stack(values, 0) 131 | dist.reduce(values, dst=0) 132 | 133 | if dist.get_rank() == 0 and average: 134 | values /= world_size 135 | 136 | reduced_dict = {k: v for k, v in zip(keys, values)} 137 | 138 | return reduced_dict 139 | 140 | 141 | def data_sampler(dataset, shuffle, distributed): 142 | if distributed: 143 | return data.distributed.DistributedSampler(dataset, shuffle=shuffle) 144 | 145 | if shuffle: 146 | return data.RandomSampler(dataset) 147 | 148 | else: 149 | return data.SequentialSampler(dataset) 150 | -------------------------------------------------------------------------------- /cosyvoice/transformer/positionwise_feed_forward.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019 Shigeki Karita 2 | # 2020 Mobvoi Inc (Binbin Zhang) 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in 
compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Positionwise feed forward layer definition.""" 16 | 17 | import torch 18 | 19 | 20 | class PositionwiseFeedForward(torch.nn.Module): 21 | """Positionwise feed forward layer. 22 | 23 | FeedForward are appied on each position of the sequence. 24 | The output dim is same with the input dim. 25 | 26 | Args: 27 | idim (int): Input dimenstion. 28 | hidden_units (int): The number of hidden units. 29 | dropout_rate (float): Dropout rate. 30 | activation (torch.nn.Module): Activation function 31 | """ 32 | 33 | def __init__( 34 | self, 35 | idim: int, 36 | hidden_units: int, 37 | dropout_rate: float, 38 | activation: torch.nn.Module = torch.nn.ReLU(), 39 | ): 40 | """Construct a PositionwiseFeedForward object.""" 41 | super(PositionwiseFeedForward, self).__init__() 42 | self.w_1 = torch.nn.Linear(idim, hidden_units) 43 | self.activation = activation 44 | self.dropout = torch.nn.Dropout(dropout_rate) 45 | self.w_2 = torch.nn.Linear(hidden_units, idim) 46 | 47 | def forward(self, xs: torch.Tensor) -> torch.Tensor: 48 | """Forward function. 49 | 50 | Args: 51 | xs: input tensor (B, L, D) 52 | Returns: 53 | output tensor, (B, L, D) 54 | """ 55 | return self.w_2(self.dropout(self.activation(self.w_1(xs)))) 56 | 57 | 58 | class MoEFFNLayer(torch.nn.Module): 59 | """ 60 | Mixture of expert with Positionwise feed forward layer 61 | See also figure 1 in https://arxiv.org/pdf/2305.15663.pdf 62 | The output dim is same with the input dim. 63 | 64 | Modified from https://github.com/Lightning-AI/lit-gpt/pull/823 65 | https://github.com/mistralai/mistral-src/blob/b46d6/moe_one_file_ref.py#L203-L219 66 | Args: 67 | n_expert: number of expert. 68 | n_expert_per_token: The actual number of experts used for each frame 69 | idim (int): Input dimenstion. 70 | hidden_units (int): The number of hidden units. 71 | dropout_rate (float): Dropout rate. 72 | activation (torch.nn.Module): Activation function 73 | """ 74 | 75 | def __init__( 76 | self, 77 | n_expert: int, 78 | n_expert_per_token: int, 79 | idim: int, 80 | hidden_units: int, 81 | dropout_rate: float, 82 | activation: torch.nn.Module = torch.nn.ReLU(), 83 | ): 84 | super(MoEFFNLayer, self).__init__() 85 | self.gate = torch.nn.Linear(idim, n_expert, bias=False) 86 | self.experts = torch.nn.ModuleList( 87 | PositionwiseFeedForward(idim, hidden_units, dropout_rate, 88 | activation) for _ in range(n_expert)) 89 | self.n_expert_per_token = n_expert_per_token 90 | 91 | def forward(self, xs: torch.Tensor) -> torch.Tensor: 92 | """Foward function. 
93 | Args: 94 | xs: input tensor (B, L, D) 95 | Returns: 96 | output tensor, (B, L, D) 97 | 98 | """ 99 | B, L, D = xs.size( 100 | ) # batch size, sequence length, embedding dimension (idim) 101 | xs = xs.view(-1, D) # (B*L, D) 102 | router = self.gate(xs) # (B*L, n_expert) 103 | logits, indices = torch.topk( 104 | router, self.n_expert_per_token 105 | ) # probs:(B*L, n_expert), indices: (B*L, n_expert) 106 | weights = torch.nn.functional.softmax( 107 | logits, dim=1, 108 | dtype=torch.float).to(dtype=xs.dtype) # (B*L, n_expert_per_token) 109 | output = torch.zeros_like(xs) # (B*L, D) 110 | for i, expert in enumerate(self.experts): 111 | mask = indices == i 112 | batch_idx, ith_expert = torch.where(mask) 113 | output[batch_idx] += weights[batch_idx, ith_expert, None] * expert( 114 | xs[batch_idx]) 115 | return output.view(B, L, D) 116 | -------------------------------------------------------------------------------- /academicodec/quantization/distrib.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | """Torch distributed utilities.""" 7 | import typing as tp 8 | 9 | import torch 10 | 11 | 12 | def rank(): 13 | if torch.distributed.is_initialized(): 14 | return torch.distributed.get_rank() 15 | else: 16 | return 0 17 | 18 | 19 | def world_size(): 20 | if torch.distributed.is_initialized(): 21 | return torch.distributed.get_world_size() 22 | else: 23 | return 1 24 | 25 | 26 | def is_distributed(): 27 | return world_size() > 1 28 | 29 | 30 | def all_reduce(tensor: torch.Tensor, op=torch.distributed.ReduceOp.SUM): 31 | if is_distributed(): 32 | return torch.distributed.all_reduce(tensor, op) 33 | 34 | 35 | def _is_complex_or_float(tensor): 36 | return torch.is_floating_point(tensor) or torch.is_complex(tensor) 37 | 38 | 39 | def _check_number_of_params(params: tp.List[torch.Tensor]): 40 | # utility function to check that the number of params in all workers is the same, 41 | # and thus avoid a deadlock with distributed all reduce. 42 | if not is_distributed() or not params: 43 | return 44 | #print('params[0].device ', params[0].device) 45 | tensor = torch.tensor( 46 | [len(params)], device=params[0].device, dtype=torch.long) 47 | all_reduce(tensor) 48 | if tensor.item() != len(params) * world_size(): 49 | # If not all the workers have the same number, for at least one of them, 50 | # this inequality will be verified. 51 | raise RuntimeError( 52 | f"Mismatch in number of params: ours is {len(params)}, " 53 | "at least one worker has a different one.") 54 | 55 | 56 | def broadcast_tensors(tensors: tp.Iterable[torch.Tensor], src: int=0): 57 | """Broadcast the tensors from the given parameters to all workers. 58 | This can be used to ensure that all workers have the same model to start with. 59 | """ 60 | if not is_distributed(): 61 | return 62 | tensors = [tensor for tensor in tensors if _is_complex_or_float(tensor)] 63 | _check_number_of_params(tensors) 64 | handles = [] 65 | for tensor in tensors: 66 | # src = int(rank()) # added code 67 | handle = torch.distributed.broadcast( 68 | tensor.data, src=src, async_op=True) 69 | handles.append(handle) 70 | for handle in handles: 71 | handle.wait() 72 | 73 | 74 | def sync_buffer(buffers, average=True): 75 | """ 76 | Sync grad for buffers. If average is False, broadcast instead of averaging. 
77 | """ 78 | if not is_distributed(): 79 | return 80 | handles = [] 81 | for buffer in buffers: 82 | if torch.is_floating_point(buffer.data): 83 | if average: 84 | handle = torch.distributed.all_reduce( 85 | buffer.data, 86 | op=torch.distributed.ReduceOp.SUM, 87 | async_op=True) 88 | else: 89 | handle = torch.distributed.broadcast( 90 | buffer.data, src=0, async_op=True) 91 | handles.append((buffer, handle)) 92 | for buffer, handle in handles: 93 | handle.wait() 94 | if average: 95 | buffer.data /= world_size 96 | 97 | 98 | def sync_grad(params): 99 | """ 100 | Simpler alternative to DistributedDataParallel, that doesn't rely 101 | on any black magic. For simple models it can also be as fast. 102 | Just call this on your model parameters after the call to backward! 103 | """ 104 | if not is_distributed(): 105 | return 106 | handles = [] 107 | for p in params: 108 | if p.grad is not None: 109 | handle = torch.distributed.all_reduce( 110 | p.grad.data, op=torch.distributed.ReduceOp.SUM, async_op=True) 111 | handles.append((p, handle)) 112 | for p, handle in handles: 113 | handle.wait() 114 | p.grad.data /= world_size() 115 | 116 | 117 | def average_metrics(metrics: tp.Dict[str, float], count=1.): 118 | """Average a dictionary of metrics across all workers, using the optional 119 | `count` as unormalized weight. 120 | """ 121 | if not is_distributed(): 122 | return metrics 123 | keys, values = zip(*metrics.items()) 124 | device = 'cuda' if torch.cuda.is_available() else 'cpu' 125 | tensor = torch.tensor( 126 | list(values) + [1], device=device, dtype=torch.float32) 127 | tensor *= count 128 | all_reduce(tensor) 129 | averaged = (tensor[:-1] / tensor[-1]).cpu().tolist() 130 | return dict(zip(keys, averaged)) 131 | -------------------------------------------------------------------------------- /matcha/models/components/flow_matching.py: -------------------------------------------------------------------------------- 1 | from abc import ABC 2 | 3 | import torch 4 | import torch.nn.functional as F 5 | 6 | from matcha.models.components.decoder import Decoder 7 | from matcha.utils.pylogger import get_pylogger 8 | 9 | log = get_pylogger(__name__) 10 | 11 | 12 | class BASECFM(torch.nn.Module, ABC): 13 | def __init__( 14 | self, 15 | n_feats, 16 | cfm_params, 17 | n_spks=1, 18 | spk_emb_dim=128, 19 | ): 20 | super().__init__() 21 | self.n_feats = n_feats 22 | self.n_spks = n_spks 23 | self.spk_emb_dim = spk_emb_dim 24 | self.solver = cfm_params.solver 25 | if hasattr(cfm_params, "sigma_min"): 26 | self.sigma_min = cfm_params.sigma_min 27 | else: 28 | self.sigma_min = 1e-4 29 | 30 | self.estimator = None 31 | 32 | @torch.inference_mode() 33 | def forward(self, mu, mask, n_timesteps, temperature=1.0, spks=None, cond=None): 34 | """Forward diffusion 35 | 36 | Args: 37 | mu (torch.Tensor): output of encoder 38 | shape: (batch_size, n_feats, mel_timesteps) 39 | mask (torch.Tensor): output_mask 40 | shape: (batch_size, 1, mel_timesteps) 41 | n_timesteps (int): number of diffusion steps 42 | temperature (float, optional): temperature for scaling noise. Defaults to 1.0. 43 | spks (torch.Tensor, optional): speaker ids. Defaults to None. 
44 | shape: (batch_size, spk_emb_dim) 45 | cond: Not used but kept for future purposes 46 | 47 | Returns: 48 | sample: generated mel-spectrogram 49 | shape: (batch_size, n_feats, mel_timesteps) 50 | """ 51 | z = torch.randn_like(mu) * temperature 52 | t_span = torch.linspace(0, 1, n_timesteps + 1, device=mu.device) 53 | return self.solve_euler(z, t_span=t_span, mu=mu, mask=mask, spks=spks, cond=cond) 54 | 55 | def solve_euler(self, x, t_span, mu, mask, spks, cond): 56 | """ 57 | Fixed euler solver for ODEs. 58 | Args: 59 | x (torch.Tensor): random noise 60 | t_span (torch.Tensor): n_timesteps interpolated 61 | shape: (n_timesteps + 1,) 62 | mu (torch.Tensor): output of encoder 63 | shape: (batch_size, n_feats, mel_timesteps) 64 | mask (torch.Tensor): output_mask 65 | shape: (batch_size, 1, mel_timesteps) 66 | spks (torch.Tensor, optional): speaker ids. Defaults to None. 67 | shape: (batch_size, spk_emb_dim) 68 | cond: Not used but kept for future purposes 69 | """ 70 | t, _, dt = t_span[0], t_span[-1], t_span[1] - t_span[0] 71 | 72 | # I am storing this because I can later plot it by putting a debugger here and saving it to a file 73 | # Or in future might add like a return_all_steps flag 74 | sol = [] 75 | 76 | for step in range(1, len(t_span)): 77 | dphi_dt = self.estimator(x, mask, mu, t, spks, cond) 78 | 79 | x = x + dt * dphi_dt 80 | t = t + dt 81 | sol.append(x) 82 | if step < len(t_span) - 1: 83 | dt = t_span[step + 1] - t 84 | 85 | return sol[-1] 86 | 87 | def compute_loss(self, x1, mask, mu, spks=None, cond=None): 88 | """Computes diffusion loss 89 | 90 | Args: 91 | x1 (torch.Tensor): Target 92 | shape: (batch_size, n_feats, mel_timesteps) 93 | mask (torch.Tensor): target mask 94 | shape: (batch_size, 1, mel_timesteps) 95 | mu (torch.Tensor): output of encoder 96 | shape: (batch_size, n_feats, mel_timesteps) 97 | spks (torch.Tensor, optional): speaker embedding. Defaults to None. 98 | shape: (batch_size, spk_emb_dim) 99 | 100 | Returns: 101 | loss: conditional flow matching loss 102 | y: conditional flow 103 | shape: (batch_size, n_feats, mel_timesteps) 104 | """ 105 | b, _, t = mu.shape 106 | 107 | # random timestep 108 | t = torch.rand([b, 1, 1], device=mu.device, dtype=mu.dtype) 109 | # sample noise p(x_0) 110 | z = torch.randn_like(x1) 111 | 112 | y = (1 - (1 - self.sigma_min) * t) * z + t * x1 113 | u = x1 - (1 - self.sigma_min) * z 114 | 115 | loss = F.mse_loss(self.estimator(y, mask, mu, t.squeeze(), spks), u, reduction="sum") / ( 116 | torch.sum(mask) * u.shape[1] 117 | ) 118 | return loss, y 119 | 120 | 121 | class CFM(BASECFM): 122 | def __init__(self, in_channels, out_channel, cfm_params, decoder_params, n_spks=1, spk_emb_dim=64): 123 | super().__init__( 124 | n_feats=in_channels, 125 | cfm_params=cfm_params, 126 | n_spks=n_spks, 127 | spk_emb_dim=spk_emb_dim, 128 | ) 129 | 130 | in_channels = in_channels + (spk_emb_dim if n_spks > 1 else 0) 131 | # Just change the architecture of the estimator here 132 | self.estimator = Decoder(in_channels=in_channels, out_channels=out_channel, **decoder_params) 133 | -------------------------------------------------------------------------------- /academicodec/quantization/vq.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
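A minimal sketch of how the ResidualVectorQuantizer defined below is typically driven. The shapes, the 50 Hz frame rate, and the presence of the companion academicodec.quantization.core_vq module (imported by this file) are assumptions for illustration; with untrained codebooks this is only a smoke test:

    # Hedged sketch: encode latent frames to code indices and decode them back.
    import torch
    from academicodec.quantization.vq import ResidualVectorQuantizer

    rvq = ResidualVectorQuantizer(dimension=256, n_q=8, bins=1024)
    x = torch.randn(2, 256, 50)               # (B, dimension, T) latent frames
    codes = rvq.encode(x, sample_rate=50)     # (n_q, B, T) codebook indices
    x_hat = rvq.decode(codes)                 # (B, dimension, T) quantized latents
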
6 | """Residual vector quantizer implementation.""" 7 | import math 8 | import typing as tp 9 | from dataclasses import dataclass 10 | from dataclasses import field 11 | 12 | import torch 13 | from torch import nn 14 | 15 | from academicodec.quantization.core_vq import ResidualVectorQuantization 16 | 17 | 18 | @dataclass 19 | class QuantizedResult: 20 | quantized: torch.Tensor 21 | codes: torch.Tensor 22 | bandwidth: torch.Tensor # bandwidth in kb/s used, per batch item. 23 | penalty: tp.Optional[torch.Tensor] = None 24 | metrics: dict = field(default_factory=dict) 25 | 26 | 27 | class ResidualVectorQuantizer(nn.Module): 28 | """Residual Vector Quantizer. 29 | Args: 30 | dimension (int): Dimension of the codebooks. 31 | n_q (int): Number of residual vector quantizers used. 32 | bins (int): Codebook size. 33 | decay (float): Decay for exponential moving average over the codebooks. 34 | kmeans_init (bool): Whether to use kmeans to initialize the codebooks. 35 | kmeans_iters (int): Number of iterations used for kmeans initialization. 36 | threshold_ema_dead_code (int): Threshold for dead code expiration. Replace any codes 37 | that have an exponential moving average cluster size less than the specified threshold with 38 | randomly selected vector from the current batch. 39 | """ 40 | 41 | def __init__( 42 | self, 43 | dimension: int=256, 44 | n_q: int=8, 45 | bins: int=1024, 46 | decay: float=0.99, 47 | kmeans_init: bool=True, 48 | kmeans_iters: int=50, 49 | threshold_ema_dead_code: int=2, ): 50 | super().__init__() 51 | self.n_q = n_q 52 | self.dimension = dimension 53 | self.bins = bins 54 | self.decay = decay 55 | self.kmeans_init = kmeans_init 56 | self.kmeans_iters = kmeans_iters 57 | self.threshold_ema_dead_code = threshold_ema_dead_code 58 | self.vq = ResidualVectorQuantization( 59 | dim=self.dimension, 60 | codebook_size=self.bins, 61 | num_quantizers=self.n_q, 62 | decay=self.decay, 63 | kmeans_init=self.kmeans_init, 64 | kmeans_iters=self.kmeans_iters, 65 | threshold_ema_dead_code=self.threshold_ema_dead_code, ) 66 | 67 | def forward(self, 68 | x: torch.Tensor, 69 | sample_rate: int, 70 | bandwidth: tp.Optional[float]=None) -> QuantizedResult: 71 | """Residual vector quantization on the given input tensor. 72 | Args: 73 | x (torch.Tensor): Input tensor. 74 | sample_rate (int): Sample rate of the input tensor. 75 | bandwidth (float): Target bandwidth. 76 | Returns: 77 | QuantizedResult: 78 | The quantized (or approximately quantized) representation with 79 | the associated bandwidth and any penalty term for the loss. 80 | """ 81 | bw_per_q = self.get_bandwidth_per_quantizer(sample_rate) 82 | n_q = self.get_num_quantizers_for_bandwidth(sample_rate, bandwidth) 83 | quantized, codes, commit_loss = self.vq(x, n_q=n_q) 84 | bw = torch.tensor(n_q * bw_per_q).to(x) 85 | return quantized, codes, bw, torch.mean(commit_loss) 86 | #return QuantizedResult(quantized, codes, bw, penalty=torch.mean(commit_loss)) 87 | 88 | def get_num_quantizers_for_bandwidth( 89 | self, sample_rate: int, bandwidth: tp.Optional[float]=None) -> int: 90 | """Return n_q based on specified target bandwidth. 91 | """ 92 | bw_per_q = self.get_bandwidth_per_quantizer(sample_rate) 93 | n_q = self.n_q 94 | if bandwidth and bandwidth > 0.: 95 | n_q = int(max(1, math.floor(bandwidth / bw_per_q))) 96 | return n_q 97 | 98 | def get_bandwidth_per_quantizer(self, sample_rate: int): 99 | """Return bandwidth per quantizer for a given input sample rate. 
100 | """ 101 | return math.log2(self.bins) * sample_rate / 1000 102 | 103 | def encode(self, 104 | x: torch.Tensor, 105 | sample_rate: int, 106 | bandwidth: tp.Optional[float]=None, 107 | st: tp.Optional[int]=None) -> torch.Tensor: 108 | """Encode a given input tensor with the specified sample rate at the given bandwidth. 109 | The RVQ encode method sets the appropriate number of quantizer to use 110 | and returns indices for each quantizer. 111 | """ 112 | n_q = self.get_num_quantizers_for_bandwidth(sample_rate, bandwidth) 113 | st = st or 0 114 | codes = self.vq.encode(x, n_q=n_q, st=st) 115 | return codes 116 | 117 | def decode(self, codes: torch.Tensor) -> torch.Tensor: 118 | """Decode the given codes to the quantized representation. 119 | """ 120 | quantized = self.vq.decode(codes) 121 | return quantized 122 | -------------------------------------------------------------------------------- /cosyvoice/transformer/decoder_layer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019 Shigeki Karita 2 | # 2020 Mobvoi Inc (Binbin Zhang) 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Decoder self-attention layer definition.""" 16 | from typing import Optional, Tuple 17 | 18 | import torch 19 | from torch import nn 20 | 21 | 22 | class DecoderLayer(nn.Module): 23 | """Single decoder layer module. 24 | 25 | Args: 26 | size (int): Input dimension. 27 | self_attn (torch.nn.Module): Self-attention module instance. 28 | `MultiHeadedAttention` instance can be used as the argument. 29 | src_attn (torch.nn.Module): Inter-attention module instance. 30 | `MultiHeadedAttention` instance can be used as the argument. 31 | If `None` is passed, Inter-attention is not used, such as 32 | CIF, GPT, and other decoder only model. 33 | feed_forward (torch.nn.Module): Feed-forward module instance. 34 | `PositionwiseFeedForward` instance can be used as the argument. 35 | dropout_rate (float): Dropout rate. 36 | normalize_before (bool): 37 | True: use layer_norm before each sub-block. 38 | False: to use layer_norm after each sub-block. 
39 | """ 40 | 41 | def __init__( 42 | self, 43 | size: int, 44 | self_attn: nn.Module, 45 | src_attn: Optional[nn.Module], 46 | feed_forward: nn.Module, 47 | dropout_rate: float, 48 | normalize_before: bool = True, 49 | ): 50 | """Construct an DecoderLayer object.""" 51 | super().__init__() 52 | self.size = size 53 | self.self_attn = self_attn 54 | self.src_attn = src_attn 55 | self.feed_forward = feed_forward 56 | self.norm1 = nn.LayerNorm(size, eps=1e-5) 57 | self.norm2 = nn.LayerNorm(size, eps=1e-5) 58 | self.norm3 = nn.LayerNorm(size, eps=1e-5) 59 | self.dropout = nn.Dropout(dropout_rate) 60 | self.normalize_before = normalize_before 61 | 62 | def forward( 63 | self, 64 | tgt: torch.Tensor, 65 | tgt_mask: torch.Tensor, 66 | memory: torch.Tensor, 67 | memory_mask: torch.Tensor, 68 | cache: Optional[torch.Tensor] = None 69 | ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: 70 | """Compute decoded features. 71 | 72 | Args: 73 | tgt (torch.Tensor): Input tensor (#batch, maxlen_out, size). 74 | tgt_mask (torch.Tensor): Mask for input tensor 75 | (#batch, maxlen_out). 76 | memory (torch.Tensor): Encoded memory 77 | (#batch, maxlen_in, size). 78 | memory_mask (torch.Tensor): Encoded memory mask 79 | (#batch, maxlen_in). 80 | cache (torch.Tensor): cached tensors. 81 | (#batch, maxlen_out - 1, size). 82 | 83 | Returns: 84 | torch.Tensor: Output tensor (#batch, maxlen_out, size). 85 | torch.Tensor: Mask for output tensor (#batch, maxlen_out). 86 | torch.Tensor: Encoded memory (#batch, maxlen_in, size). 87 | torch.Tensor: Encoded memory mask (#batch, maxlen_in). 88 | 89 | """ 90 | residual = tgt 91 | if self.normalize_before: 92 | tgt = self.norm1(tgt) 93 | 94 | if cache is None: 95 | tgt_q = tgt 96 | tgt_q_mask = tgt_mask 97 | else: 98 | # compute only the last frame query keeping dim: max_time_out -> 1 99 | assert cache.shape == ( 100 | tgt.shape[0], 101 | tgt.shape[1] - 1, 102 | self.size, 103 | ), "{cache.shape} == {(tgt.shape[0], tgt.shape[1] - 1, self.size)}" 104 | tgt_q = tgt[:, -1:, :] 105 | residual = residual[:, -1:, :] 106 | tgt_q_mask = tgt_mask[:, -1:, :] 107 | 108 | x = residual + self.dropout( 109 | self.self_attn(tgt_q, tgt, tgt, tgt_q_mask)[0]) 110 | if not self.normalize_before: 111 | x = self.norm1(x) 112 | 113 | if self.src_attn is not None: 114 | residual = x 115 | if self.normalize_before: 116 | x = self.norm2(x) 117 | x = residual + self.dropout( 118 | self.src_attn(x, memory, memory, memory_mask)[0]) 119 | if not self.normalize_before: 120 | x = self.norm2(x) 121 | 122 | residual = x 123 | if self.normalize_before: 124 | x = self.norm3(x) 125 | x = residual + self.dropout(self.feed_forward(x)) 126 | if not self.normalize_before: 127 | x = self.norm3(x) 128 | 129 | if cache is not None: 130 | x = torch.cat([cache, x], dim=1) 131 | 132 | return x, tgt_mask, memory, memory_mask 133 | -------------------------------------------------------------------------------- /academicodec/modules/transformer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | """A streamable transformer.""" 7 | import typing as tp 8 | 9 | import torch 10 | import torch.nn as nn 11 | import torch.nn.functional as F 12 | 13 | 14 | def create_sin_embedding(positions: torch.Tensor, 15 | dim: int, 16 | max_period: float=10000): 17 | """Create time embedding for the given positions, target dimension `dim`. 18 | """ 19 | # We aim for BTC format 20 | assert dim % 2 == 0 21 | half_dim = dim // 2 22 | adim = torch.arange(half_dim, device=positions.device).view(1, 1, -1) 23 | phase = positions / (max_period**(adim / (half_dim - 1))) 24 | return torch.cat( 25 | [ 26 | torch.cos(phase), 27 | torch.sin(phase), 28 | ], dim=-1) 29 | 30 | 31 | class StreamingTransformerEncoderLayer(nn.TransformerEncoderLayer): 32 | def forward(self, x: torch.Tensor, x_past: torch.Tensor, 33 | past_context: int): # type: ignore 34 | if self.norm_first: 35 | sa_input = self.norm1(x) 36 | x = x + self._sa_block(sa_input, x_past, past_context) 37 | x = x + self._ff_block(self.norm2(x)) 38 | else: 39 | sa_input = x 40 | x = self.norm1(x + self._sa_block(sa_input, x_past, past_context)) 41 | x = self.norm2(x + self._ff_block(x)) 42 | 43 | return x, sa_input 44 | 45 | # self-attention block 46 | def _sa_block(self, 47 | x: torch.Tensor, 48 | x_past: torch.Tensor, 49 | past_context: int): # type: ignore 50 | _, T, _ = x.shape 51 | _, H, _ = x_past.shape 52 | 53 | queries = x 54 | keys = torch.cat([x_past, x], dim=1) 55 | values = keys 56 | 57 | queries_pos = torch.arange(H, T + H, device=x.device).view(-1, 1) 58 | keys_pos = torch.arange(T + H, device=x.device).view(1, -1) 59 | delta = queries_pos - keys_pos 60 | valid_access = (delta >= 0) & (delta <= past_context) 61 | x = self.self_attn( 62 | queries, keys, values, attn_mask=~valid_access, 63 | need_weights=False)[0] 64 | return self.dropout1(x) 65 | 66 | 67 | class StreamingTransformerEncoder(nn.Module): 68 | """TransformerEncoder with streaming support. 69 | 70 | Args: 71 | dim (int): dimension of the data. 72 | hidden_scale (int): intermediate dimension of FF module is this times the dimension. 73 | num_heads (int): number of heads. 74 | num_layers (int): number of layers. 75 | max_period (float): maxium period of cosines in the positional embedding. 76 | past_context (int or None): receptive field for the causal mask, infinite if None. 77 | gelu (bool): if true uses GeLUs, otherwise use ReLUs. 78 | norm_in (bool): normalize the input. 79 | dropout (float): dropout probability. 80 | **kwargs: See `nn.TransformerEncoderLayer`. 
81 | """ 82 | 83 | def __init__(self, 84 | dim, 85 | hidden_scale: float=4., 86 | num_heads: int=8, 87 | num_layers: int=5, 88 | max_period: float=10000, 89 | past_context: int=1000, 90 | gelu: bool=True, 91 | norm_in: bool=True, 92 | dropout: float=0., 93 | **kwargs): 94 | super().__init__() 95 | assert dim % num_heads == 0 96 | hidden_dim = int(dim * hidden_scale) 97 | 98 | self.max_period = max_period 99 | self.past_context = past_context 100 | activation: tp.Any = F.gelu if gelu else F.relu 101 | 102 | self.norm_in: nn.Module 103 | if norm_in: 104 | self.norm_in = nn.LayerNorm(dim) 105 | else: 106 | self.norm_in = nn.Identity() 107 | 108 | self.layers = nn.ModuleList() 109 | for idx in range(num_layers): 110 | self.layers.append( 111 | StreamingTransformerEncoderLayer( 112 | dim, 113 | num_heads, 114 | hidden_dim, 115 | activation=activation, 116 | batch_first=True, 117 | dropout=dropout, 118 | **kwargs)) 119 | 120 | def forward(self, 121 | x: torch.Tensor, 122 | states: tp.Optional[tp.List[torch.Tensor]]=None, 123 | offset: tp.Union[int, torch.Tensor]=0): 124 | B, T, C = x.shape 125 | if states is None: 126 | states = [ 127 | torch.zeros_like(x[:, :1]) for _ in range(1 + len(self.layers)) 128 | ] 129 | 130 | positions = torch.arange(T, device=x.device).view(1, -1, 1) + offset 131 | pos_emb = create_sin_embedding(positions, C, max_period=self.max_period) 132 | 133 | new_state: tp.List[torch.Tensor] = [] 134 | x = self.norm_in(x) 135 | x = x + pos_emb 136 | 137 | for layer_state, layer in zip(states, self.layers): 138 | x, new_layer_state = layer(x, layer_state, self.past_context) 139 | new_layer_state = torch.cat([layer_state, new_layer_state], dim=1) 140 | new_state.append(new_layer_state[:, -self.past_context:, :]) 141 | return x, new_state, offset + T 142 | -------------------------------------------------------------------------------- /academicodec/models/soundstream/models.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from academicodec.modules import NormConv1d 5 | from academicodec.modules import NormConv2d 6 | from academicodec.utils import get_padding 7 | from torch.nn import AvgPool1d 8 | from torch.nn.utils import spectral_norm 9 | from torch.nn.utils import weight_norm 10 | 11 | LRELU_SLOPE = 0.1 12 | 13 | 14 | class DiscriminatorP(torch.nn.Module): 15 | def __init__(self, 16 | period, 17 | kernel_size=5, 18 | stride=3, 19 | use_spectral_norm=False, 20 | activation: str='LeakyReLU', 21 | activation_params: dict={'negative_slope': 0.2}): 22 | super(DiscriminatorP, self).__init__() 23 | self.period = period 24 | norm_f = weight_norm if use_spectral_norm is False else spectral_norm 25 | self.activation = getattr(torch.nn, activation)(**activation_params) 26 | self.convs = nn.ModuleList([ 27 | NormConv2d( 28 | 1, 29 | 32, (kernel_size, 1), (stride, 1), 30 | padding=(get_padding(5, 1), 0)), 31 | NormConv2d( 32 | 32, 33 | 32, (kernel_size, 1), (stride, 1), 34 | padding=(get_padding(5, 1), 0)), 35 | NormConv2d( 36 | 32, 37 | 32, (kernel_size, 1), (stride, 1), 38 | padding=(get_padding(5, 1), 0)), 39 | NormConv2d( 40 | 32, 41 | 32, (kernel_size, 1), (stride, 1), 42 | padding=(get_padding(5, 1), 0)), 43 | NormConv2d(32, 32, (kernel_size, 1), 1, padding=(2, 0)), 44 | ]) 45 | self.conv_post = NormConv2d(32, 1, (3, 1), 1, padding=(1, 0)) 46 | 47 | def forward(self, x): 48 | fmap = [] 49 | # 1d to 2d 50 | b, c, t = x.shape 51 | if t % self.period != 0: # pad first 52 
| n_pad = self.period - (t % self.period) 53 | x = F.pad(x, (0, n_pad), "reflect") 54 | t = t + n_pad 55 | x = x.view(b, c, t // self.period, self.period) 56 | 57 | for l in self.convs: 58 | x = l(x) 59 | x = self.activation(x) 60 | fmap.append(x) 61 | x = self.conv_post(x) 62 | fmap.append(x) 63 | x = torch.flatten(x, 1, -1) 64 | 65 | return x, fmap 66 | 67 | 68 | class MultiPeriodDiscriminator(torch.nn.Module): 69 | def __init__(self): 70 | super(MultiPeriodDiscriminator, self).__init__() 71 | self.discriminators = nn.ModuleList([ 72 | DiscriminatorP(2), 73 | DiscriminatorP(3), 74 | DiscriminatorP(5), 75 | DiscriminatorP(7), 76 | DiscriminatorP(11), 77 | ]) 78 | 79 | def forward(self, y, y_hat): 80 | y_d_rs = [] 81 | y_d_gs = [] 82 | fmap_rs = [] 83 | fmap_gs = [] 84 | for i, d in enumerate(self.discriminators): 85 | y_d_r, fmap_r = d(y) 86 | y_d_g, fmap_g = d(y_hat) 87 | y_d_rs.append(y_d_r) 88 | fmap_rs.append(fmap_r) 89 | y_d_gs.append(y_d_g) 90 | fmap_gs.append(fmap_g) 91 | return y_d_rs, y_d_gs, fmap_rs, fmap_gs 92 | 93 | 94 | class DiscriminatorS(torch.nn.Module): 95 | def __init__(self, 96 | use_spectral_norm=False, 97 | activation: str='LeakyReLU', 98 | activation_params: dict={'negative_slope': 0.2}): 99 | super(DiscriminatorS, self).__init__() 100 | self.activation = getattr(torch.nn, activation)(**activation_params) 101 | self.convs = nn.ModuleList([ 102 | NormConv1d(1, 32, 15, 1, padding=7), 103 | NormConv1d(32, 32, 41, 2, groups=4, padding=20), 104 | NormConv1d(32, 32, 41, 2, groups=16, padding=20), 105 | NormConv1d(32, 32, 41, 4, groups=16, padding=20), 106 | NormConv1d(32, 32, 41, 4, groups=16, padding=20), 107 | NormConv1d(32, 32, 41, 1, groups=16, padding=20), 108 | NormConv1d(32, 32, 5, 1, padding=2), 109 | ]) 110 | self.conv_post = NormConv1d(32, 1, 3, 1, padding=1) 111 | 112 | def forward(self, x): 113 | fmap = [] 114 | for l in self.convs: 115 | x = l(x) 116 | x = self.activation(x) 117 | fmap.append(x) 118 | x = self.conv_post(x) 119 | fmap.append(x) 120 | x = torch.flatten(x, 1, -1) 121 | return x, fmap 122 | 123 | 124 | class MultiScaleDiscriminator(torch.nn.Module): 125 | def __init__(self): 126 | super(MultiScaleDiscriminator, self).__init__() 127 | self.discriminators = nn.ModuleList([ 128 | DiscriminatorS(), 129 | DiscriminatorS(), 130 | DiscriminatorS(), 131 | ]) 132 | self.meanpools = nn.ModuleList( 133 | [AvgPool1d(4, 2, padding=2), AvgPool1d(4, 2, padding=2)]) 134 | 135 | def forward(self, y, y_hat): 136 | y_d_rs = [] 137 | y_d_gs = [] 138 | fmap_rs = [] 139 | fmap_gs = [] 140 | for i, d in enumerate(self.discriminators): 141 | if i != 0: 142 | y = self.meanpools[i - 1](y) 143 | y_hat = self.meanpools[i - 1](y_hat) 144 | y_d_r, fmap_r = d(y) 145 | y_d_g, fmap_g = d(y_hat) 146 | y_d_rs.append(y_d_r) 147 | fmap_rs.append(fmap_r) 148 | y_d_gs.append(y_d_g) 149 | fmap_gs.append(fmap_g) 150 | 151 | return y_d_rs, y_d_gs, fmap_rs, fmap_gs 152 | -------------------------------------------------------------------------------- /cosyvoice/bin/inference.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu) 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
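A minimal sketch of the core calls the inference script below makes when loading the three sub-models; the YAML path and checkpoint names are placeholders, not values taken from this repository:

    # Hedged sketch: build and load CosyVoiceModel the way inference.py does.
    from hyperpyyaml import load_hyperpyyaml
    from cosyvoice.cli.model import CosyVoiceModel

    with open('conf/cosyvoice.yaml') as f:            # placeholder config path
        configs = load_hyperpyyaml(f)
    model = CosyVoiceModel(configs['llm'], configs['flow'], configs['hift'])
    model.load('llm.pt', 'flow.pt', 'hift.pt')        # placeholder checkpoints
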
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from __future__ import print_function 16 | 17 | import argparse 18 | import logging 19 | logging.getLogger('matplotlib').setLevel(logging.WARNING) 20 | import os 21 | 22 | import torch 23 | from torch.utils.data import DataLoader 24 | import torchaudio 25 | from hyperpyyaml import load_hyperpyyaml 26 | from tqdm import tqdm 27 | from cosyvoice.cli.model import CosyVoiceModel 28 | 29 | from cosyvoice.dataset.dataset import Dataset 30 | 31 | def get_args(): 32 | parser = argparse.ArgumentParser(description='inference with your model') 33 | parser.add_argument('--config', required=True, help='config file') 34 | parser.add_argument('--prompt_data', required=True, help='prompt data file') 35 | parser.add_argument('--prompt_utt2data', required=True, help='prompt data file') 36 | parser.add_argument('--tts_text', required=True, help='tts input file') 37 | parser.add_argument('--llm_model', required=True, help='llm model file') 38 | parser.add_argument('--flow_model', required=True, help='flow model file') 39 | parser.add_argument('--hifigan_model', required=True, help='hifigan model file') 40 | parser.add_argument('--gpu', 41 | type=int, 42 | default=-1, 43 | help='gpu id for this rank, -1 for cpu') 44 | parser.add_argument('--mode', 45 | default='sft', 46 | choices=['sft', 'zero_shot'], 47 | help='inference mode') 48 | parser.add_argument('--result_dir', required=True, help='asr result file') 49 | args = parser.parse_args() 50 | print(args) 51 | return args 52 | 53 | 54 | def main(): 55 | args = get_args() 56 | logging.basicConfig(level=logging.DEBUG, 57 | format='%(asctime)s %(levelname)s %(message)s') 58 | os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu) 59 | 60 | # Init cosyvoice models from configs 61 | use_cuda = args.gpu >= 0 and torch.cuda.is_available() 62 | device = torch.device('cuda' if use_cuda else 'cpu') 63 | with open(args.config, 'r') as f: 64 | configs = load_hyperpyyaml(f) 65 | 66 | model = CosyVoiceModel(configs['llm'], configs['flow'], configs['hift']) 67 | model.load(args.llm_model, args.flow_model, args.hifigan_model) 68 | 69 | test_dataset = Dataset(args.prompt_data, data_pipeline=configs['data_pipeline'], mode='inference', shuffle=False, partition=False, tts_file=args.tts_text, prompt_utt2data=args.prompt_utt2data) 70 | test_data_loader = DataLoader(test_dataset, batch_size=None, num_workers=0) 71 | 72 | del configs 73 | os.makedirs(args.result_dir, exist_ok=True) 74 | fn = os.path.join(args.result_dir, 'wav.scp') 75 | f = open(fn, 'w') 76 | with torch.no_grad(): 77 | for batch_idx, batch in tqdm(enumerate(test_data_loader)): 78 | utts = batch["utts"] 79 | assert len(utts) == 1, "inference mode only support batchsize 1" 80 | text = batch["text"] 81 | text_token = batch["text_token"].to(device) 82 | text_token_len = batch["text_token_len"].to(device) 83 | tts_text = batch["tts_text"] 84 | tts_index = batch["tts_index"] 85 | tts_text_token = batch["tts_text_token"].to(device) 86 | tts_text_token_len = batch["tts_text_token_len"].to(device) 87 | speech_token = batch["speech_token"].to(device) 88 | speech_token_len = 
batch["speech_token_len"].to(device) 89 | speech_feat = batch["speech_feat"].to(device) 90 | speech_feat_len = batch["speech_feat_len"].to(device) 91 | utt_embedding = batch["utt_embedding"].to(device) 92 | spk_embedding = batch["spk_embedding"].to(device) 93 | if args.mode == 'sft': 94 | model_input = {'text': tts_text_token, 'text_len': tts_text_token_len, 95 | 'llm_embedding': spk_embedding, 'flow_embedding': spk_embedding} 96 | else: 97 | model_input = {'text': tts_text_token, 'text_len': tts_text_token_len, 98 | 'prompt_text': text_token, 'prompt_text_len': text_token_len, 99 | 'llm_prompt_speech_token': speech_token, 'llm_prompt_speech_token_len': speech_token_len, 100 | 'flow_prompt_speech_token': speech_token, 'flow_prompt_speech_token_len': speech_token_len, 101 | 'prompt_speech_feat': speech_feat, 'prompt_speech_feat_len': speech_feat_len, 102 | 'llm_embedding': utt_embedding, 'flow_embedding': utt_embedding} 103 | model_output = model.inference(**model_input) 104 | tts_key = '{}_{}'.format(utts[0], tts_index[0]) 105 | tts_fn = os.path.join(args.result_dir, '{}.wav'.format(tts_key)) 106 | torchaudio.save(tts_fn, model_output['tts_speech'], sample_rate=22050) 107 | f.write('{} {}\n'.format(tts_key, tts_fn)) 108 | f.flush() 109 | f.close() 110 | logging.info('Result wav.scp saved in {}'.format(fn)) 111 | 112 | 113 | if __name__ == '__main__': 114 | main() 115 | -------------------------------------------------------------------------------- /cosyvoice/dataset/dataset.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang) 2 | # 2024 Alibaba Inc (authors: Xiang Lyu) 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | import random 17 | import json 18 | import math 19 | from functools import partial 20 | 21 | import torch 22 | import torch.distributed as dist 23 | from torch.utils.data import IterableDataset 24 | from cosyvoice.utils.file_utils import read_lists, read_json_lists 25 | 26 | 27 | class Processor(IterableDataset): 28 | 29 | def __init__(self, source, f, *args, **kw): 30 | assert callable(f) 31 | self.source = source 32 | self.f = f 33 | self.args = args 34 | self.kw = kw 35 | 36 | def set_epoch(self, epoch): 37 | self.source.set_epoch(epoch) 38 | 39 | def __iter__(self): 40 | """ Return an iterator over the source dataset processed by the 41 | given processor. 
42 | """ 43 | assert self.source is not None 44 | assert callable(self.f) 45 | return self.f(iter(self.source), *self.args, **self.kw) 46 | 47 | def apply(self, f): 48 | assert callable(f) 49 | return Processor(self, f, *self.args, **self.kw) 50 | 51 | 52 | class DistributedSampler: 53 | 54 | def __init__(self, shuffle=True, partition=True): 55 | self.epoch = -1 56 | self.update() 57 | self.shuffle = shuffle 58 | self.partition = partition 59 | 60 | def update(self): 61 | assert dist.is_available() 62 | if dist.is_initialized(): 63 | self.rank = dist.get_rank() 64 | self.world_size = dist.get_world_size() 65 | else: 66 | self.rank = 0 67 | self.world_size = 1 68 | worker_info = torch.utils.data.get_worker_info() 69 | if worker_info is None: 70 | self.worker_id = 0 71 | self.num_workers = 1 72 | else: 73 | self.worker_id = worker_info.id 74 | self.num_workers = worker_info.num_workers 75 | return dict(rank=self.rank, 76 | world_size=self.world_size, 77 | worker_id=self.worker_id, 78 | num_workers=self.num_workers) 79 | 80 | def set_epoch(self, epoch): 81 | self.epoch = epoch 82 | 83 | def sample(self, data): 84 | """ Sample data according to rank/world_size/num_workers 85 | 86 | Args: 87 | data(List): input data list 88 | 89 | Returns: 90 | List: data list after sample 91 | """ 92 | data = list(range(len(data))) 93 | # force datalist even 94 | if self.partition: 95 | if self.shuffle: 96 | random.Random(self.epoch).shuffle(data) 97 | if len(data) < self.world_size: 98 | data = data * math.ceil(self.world_size / len(data)) 99 | data = data[:self.world_size] 100 | data = data[self.rank::self.world_size] 101 | if len(data) < self.num_workers: 102 | data = data * math.ceil(self.num_workers / len(data)) 103 | data = data[:self.num_workers] 104 | data = data[self.worker_id::self.num_workers] 105 | return data 106 | 107 | 108 | class DataList(IterableDataset): 109 | 110 | def __init__(self, lists, shuffle=True, partition=True): 111 | self.lists = lists 112 | self.sampler = DistributedSampler(shuffle, partition) 113 | 114 | def set_epoch(self, epoch): 115 | self.sampler.set_epoch(epoch) 116 | 117 | def __iter__(self): 118 | sampler_info = self.sampler.update() 119 | indexes = self.sampler.sample(self.lists) 120 | for index in indexes: 121 | data = dict(src=self.lists[index]) 122 | data.update(sampler_info) 123 | yield data 124 | 125 | 126 | def Dataset(data_list_file, 127 | data_pipeline, 128 | mode='train', 129 | shuffle=True, 130 | partition=True, 131 | tts_file='', 132 | prompt_utt2data=''): 133 | """ Construct dataset from arguments 134 | 135 | We have two shuffle stage in the Dataset. The first is global 136 | shuffle at shards tar/raw file level. The second is global shuffle 137 | at training samples level. 
138 | 139 | Args: 140 | data_type(str): raw/shard 141 | tokenizer (BaseTokenizer): tokenizer to tokenize 142 | partition(bool): whether to do data partition in terms of rank 143 | """ 144 | assert mode in ['train', 'inference'] 145 | lists = read_lists(data_list_file) 146 | if mode == 'inference': 147 | with open(tts_file) as f: 148 | tts_data = json.load(f) 149 | utt2lists = read_json_lists(prompt_utt2data) 150 | # filter unnecessary file in inference mode 151 | lists = list(set([utt2lists[utt] for utt in tts_data.keys() if utt2lists[utt] in lists])) 152 | dataset = DataList(lists, 153 | shuffle=shuffle, 154 | partition=partition) 155 | if mode == 'inference': 156 | # map partial arg tts_data in inference mode 157 | data_pipeline[0] = partial(data_pipeline[0], tts_data=tts_data) 158 | for func in data_pipeline: 159 | dataset = Processor(dataset, func, mode=mode) 160 | return dataset 161 | -------------------------------------------------------------------------------- /cosyvoice/bin/train.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu) 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | from __future__ import print_function 15 | import os,sys 16 | os.environ["PL_TORCH_DISTRIBUTED_BACKEND"] = "gloo" 17 | import argparse 18 | import datetime 19 | import logging 20 | logging.getLogger('matplotlib').setLevel(logging.WARNING) 21 | from copy import deepcopy 22 | import torch 23 | import torch.distributed as dist 24 | import deepspeed 25 | 26 | now_dir = os.getcwd() 27 | sys.path.append(now_dir) 28 | sys.path.append("%s/cosyvoice" % (now_dir)) 29 | 30 | from hyperpyyaml import load_hyperpyyaml 31 | 32 | from torch.distributed.elastic.multiprocessing.errors import record 33 | 34 | from cosyvoice.utils.executor import Executor 35 | from cosyvoice.utils.train_utils import ( 36 | init_distributed, 37 | init_dataset_and_dataloader, 38 | init_optimizer_and_scheduler, 39 | init_summarywriter, save_model, 40 | wrap_cuda_model, check_modify_and_save_config) 41 | 42 | 43 | def get_args(): 44 | parser = argparse.ArgumentParser(description='training your network') 45 | parser.add_argument('--train_engine', 46 | default='torch_ddp', 47 | choices=['torch_ddp', 'deepspeed'], 48 | help='Engine for paralleled training') 49 | parser.add_argument('--model', required=True, help='model which will be trained') 50 | parser.add_argument('--config', required=True, help='config file') 51 | parser.add_argument('--train_data', required=True, help='train data file') 52 | parser.add_argument('--cv_data', required=True, help='cv data file') 53 | parser.add_argument('--checkpoint', help='checkpoint model') 54 | parser.add_argument('--model_dir', required=True, help='save model dir') 55 | parser.add_argument('--tensorboard_dir', 56 | default='tensorboard', 57 | help='tensorboard log dir') 58 | parser.add_argument('--ddp.dist_backend', 59 | dest='dist_backend', 60 | default='gloo', 61 | 
choices=['nccl', 'gloo'], 62 | help='distributed backend') 63 | parser.add_argument('--num_workers', 64 | default=0, 65 | type=int, 66 | help='num of subprocess workers for reading') 67 | parser.add_argument('--prefetch', 68 | default=100, 69 | type=int, 70 | help='prefetch number') 71 | parser.add_argument('--pin_memory', 72 | action='store_true', 73 | default=False, 74 | help='Use pinned memory buffers used for reading') 75 | parser.add_argument('--deepspeed.save_states', 76 | dest='save_states', 77 | default='model_only', 78 | choices=['model_only', 'model+optimizer'], 79 | help='save model/optimizer states') 80 | parser.add_argument('--timeout', 81 | default=30, 82 | type=int, 83 | help='timeout (in seconds) of cosyvoice_join.') 84 | parser = deepspeed.add_config_arguments(parser) 85 | args = parser.parse_args() 86 | return args 87 | 88 | 89 | @record 90 | def main(): 91 | args = get_args() 92 | logging.basicConfig(level=logging.DEBUG, 93 | format='%(asctime)s %(levelname)s %(message)s') 94 | 95 | override_dict = {k: None for k in ['llm', 'flow', 'hift'] if k != args.model} 96 | with open(args.config, 'r') as f: 97 | configs = load_hyperpyyaml(f, overrides=override_dict) 98 | configs['train_conf'].update(vars(args)) 99 | 100 | # Init env for ddp 101 | init_distributed(args) 102 | 103 | # Get dataset & dataloader 104 | train_dataset, cv_dataset, train_data_loader, cv_data_loader = \ 105 | init_dataset_and_dataloader(args, configs) 106 | 107 | # Do some sanity checks and save config to arsg.model_dir 108 | configs = check_modify_and_save_config(args, configs) 109 | 110 | # Tensorboard summary 111 | writer = init_summarywriter(args) 112 | 113 | # load checkpoint 114 | model = configs[args.model] 115 | if args.checkpoint is not None: 116 | model.load_state_dict(torch.load(args.checkpoint, map_location='cpu')) 117 | 118 | # Dispatch model from cpu to gpu 119 | model = wrap_cuda_model(args, model) 120 | 121 | # Get optimizer & scheduler 122 | model, optimizer, scheduler = init_optimizer_and_scheduler(args, configs, model) 123 | 124 | # Save init checkpoints 125 | info_dict = deepcopy(configs['train_conf']) 126 | save_model(model, 'init', info_dict) 127 | 128 | # Get executor 129 | executor = Executor() 130 | 131 | # Start training loop 132 | for epoch in range(info_dict['max_epoch']): 133 | executor.epoch = epoch 134 | train_dataset.set_epoch(epoch) 135 | dist.barrier() 136 | group_join = dist.new_group(backend="gloo", timeout=datetime.timedelta(seconds=args.timeout)) 137 | executor.train_one_epoc(model, optimizer, scheduler, train_data_loader, cv_data_loader, writer, info_dict, group_join) 138 | dist.destroy_process_group(group_join) 139 | 140 | if __name__ == '__main__': 141 | main() 142 | -------------------------------------------------------------------------------- /cosyvoice/transformer/convolution.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) 2 | # 2024 Alibaba Inc (Xiang Lyu) 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # Modified from ESPnet(https://github.com/espnet/espnet) 16 | """ConvolutionModule definition.""" 17 | 18 | from typing import Tuple 19 | 20 | import torch 21 | from torch import nn 22 | 23 | 24 | class ConvolutionModule(nn.Module): 25 | """ConvolutionModule in Conformer model.""" 26 | 27 | def __init__(self, 28 | channels: int, 29 | kernel_size: int = 15, 30 | activation: nn.Module = nn.ReLU(), 31 | norm: str = "batch_norm", 32 | causal: bool = False, 33 | bias: bool = True): 34 | """Construct an ConvolutionModule object. 35 | Args: 36 | channels (int): The number of channels of conv layers. 37 | kernel_size (int): Kernel size of conv layers. 38 | causal (int): Whether use causal convolution or not 39 | """ 40 | super().__init__() 41 | 42 | self.pointwise_conv1 = nn.Conv1d( 43 | channels, 44 | 2 * channels, 45 | kernel_size=1, 46 | stride=1, 47 | padding=0, 48 | bias=bias, 49 | ) 50 | # self.lorder is used to distinguish if it's a causal convolution, 51 | # if self.lorder > 0: it's a causal convolution, the input will be 52 | # padded with self.lorder frames on the left in forward. 53 | # else: it's a symmetrical convolution 54 | if causal: 55 | padding = 0 56 | self.lorder = kernel_size - 1 57 | else: 58 | # kernel_size should be an odd number for none causal convolution 59 | assert (kernel_size - 1) % 2 == 0 60 | padding = (kernel_size - 1) // 2 61 | self.lorder = 0 62 | self.depthwise_conv = nn.Conv1d( 63 | channels, 64 | channels, 65 | kernel_size, 66 | stride=1, 67 | padding=padding, 68 | groups=channels, 69 | bias=bias, 70 | ) 71 | 72 | assert norm in ['batch_norm', 'layer_norm'] 73 | if norm == "batch_norm": 74 | self.use_layer_norm = False 75 | self.norm = nn.BatchNorm1d(channels) 76 | else: 77 | self.use_layer_norm = True 78 | self.norm = nn.LayerNorm(channels) 79 | 80 | self.pointwise_conv2 = nn.Conv1d( 81 | channels, 82 | channels, 83 | kernel_size=1, 84 | stride=1, 85 | padding=0, 86 | bias=bias, 87 | ) 88 | self.activation = activation 89 | 90 | def forward( 91 | self, 92 | x: torch.Tensor, 93 | mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), 94 | cache: torch.Tensor = torch.zeros((0, 0, 0)), 95 | ) -> Tuple[torch.Tensor, torch.Tensor]: 96 | """Compute convolution module. 97 | Args: 98 | x (torch.Tensor): Input tensor (#batch, time, channels). 99 | mask_pad (torch.Tensor): used for batch padding (#batch, 1, time), 100 | (0, 0, 0) means fake mask. 101 | cache (torch.Tensor): left context cache, it is only 102 | used in causal convolution (#batch, channels, cache_t), 103 | (0, 0, 0) meas fake cache. 104 | Returns: 105 | torch.Tensor: Output tensor (#batch, time, channels). 
106 | """ 107 | # exchange the temporal dimension and the feature dimension 108 | x = x.transpose(1, 2) # (#batch, channels, time) 109 | 110 | # mask batch padding 111 | if mask_pad.size(2) > 0: # time > 0 112 | x.masked_fill_(~mask_pad, 0.0) 113 | 114 | if self.lorder > 0: 115 | if cache.size(2) == 0: # cache_t == 0 116 | x = nn.functional.pad(x, (self.lorder, 0), 'constant', 0.0) 117 | else: 118 | assert cache.size(0) == x.size(0) # equal batch 119 | assert cache.size(1) == x.size(1) # equal channel 120 | x = torch.cat((cache, x), dim=2) 121 | assert (x.size(2) > self.lorder) 122 | new_cache = x[:, :, -self.lorder:] 123 | else: 124 | # It's better we just return None if no cache is required, 125 | # However, for JIT export, here we just fake one tensor instead of 126 | # None. 127 | new_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device) 128 | 129 | # GLU mechanism 130 | x = self.pointwise_conv1(x) # (batch, 2*channel, dim) 131 | x = nn.functional.glu(x, dim=1) # (batch, channel, dim) 132 | 133 | # 1D Depthwise Conv 134 | x = self.depthwise_conv(x) 135 | if self.use_layer_norm: 136 | x = x.transpose(1, 2) 137 | x = self.activation(self.norm(x)) 138 | if self.use_layer_norm: 139 | x = x.transpose(1, 2) 140 | x = self.pointwise_conv2(x) 141 | # mask batch padding 142 | if mask_pad.size(2) > 0: # time > 0 143 | x.masked_fill_(~mask_pad, 0.0) 144 | 145 | return x.transpose(1, 2), new_cache 146 | -------------------------------------------------------------------------------- /matcha/hifigan/README.md: -------------------------------------------------------------------------------- 1 | # HiFi-GAN: Generative Adversarial Networks for Efficient and High Fidelity Speech Synthesis 2 | 3 | ### Jungil Kong, Jaehyeon Kim, Jaekyoung Bae 4 | 5 | In our [paper](https://arxiv.org/abs/2010.05646), 6 | we proposed HiFi-GAN: a GAN-based model capable of generating high fidelity speech efficiently.
7 | We provide our implementation and pretrained models as open source in this repository. 8 | 9 | **Abstract :** 10 | Several recent work on speech synthesis have employed generative adversarial networks (GANs) to produce raw waveforms. 11 | Although such methods improve the sampling efficiency and memory usage, 12 | their sample quality has not yet reached that of autoregressive and flow-based generative models. 13 | In this work, we propose HiFi-GAN, which achieves both efficient and high-fidelity speech synthesis. 14 | As speech audio consists of sinusoidal signals with various periods, 15 | we demonstrate that modeling periodic patterns of an audio is crucial for enhancing sample quality. 16 | A subjective human evaluation (mean opinion score, MOS) of a single speaker dataset indicates that our proposed method 17 | demonstrates similarity to human quality while generating 22.05 kHz high-fidelity audio 167.9 times faster than 18 | real-time on a single V100 GPU. We further show the generality of HiFi-GAN to the mel-spectrogram inversion of unseen 19 | speakers and end-to-end speech synthesis. Finally, a small footprint version of HiFi-GAN generates samples 13.4 times 20 | faster than real-time on CPU with comparable quality to an autoregressive counterpart. 21 | 22 | Visit our [demo website](https://jik876.github.io/hifi-gan-demo/) for audio samples. 23 | 24 | ## Pre-requisites 25 | 26 | 1. Python >= 3.6 27 | 2. Clone this repository. 28 | 3. Install python requirements. Please refer [requirements.txt](requirements.txt) 29 | 4. Download and extract the [LJ Speech dataset](https://keithito.com/LJ-Speech-Dataset/). 30 | And move all wav files to `LJSpeech-1.1/wavs` 31 | 32 | ## Training 33 | 34 | ``` 35 | python train.py --config config_v1.json 36 | ``` 37 | 38 | To train V2 or V3 Generator, replace `config_v1.json` with `config_v2.json` or `config_v3.json`.
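For example (using the V2 generator):

```
python train.py --config config_v2.json
```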
39 | Checkpoints and a copy of the configuration file are saved in the `cp_hifigan` directory by default.
40 | You can change the path by adding the `--checkpoint_path` option. 41 | 42 | Validation loss during training with the V1 generator.
43 | ![validation loss](./validation_loss.png) 44 | 45 | ## Pretrained Model 46 | 47 | You can also use the pretrained models we provide.
48 | [Download pretrained models](https://drive.google.com/drive/folders/1-eEYTB5Av9jNql0WGBlRoi-WH2J7bp5Y?usp=sharing)
49 | Details of each folder are as follows: 50 | 51 | | Folder Name | Generator | Dataset | Fine-Tuned | 52 | | ------------ | --------- | --------- | ------------------------------------------------------ | 53 | | LJ_V1 | V1 | LJSpeech | No | 54 | | LJ_V2 | V2 | LJSpeech | No | 55 | | LJ_V3 | V3 | LJSpeech | No | 56 | | LJ_FT_T2_V1 | V1 | LJSpeech | Yes ([Tacotron2](https://github.com/NVIDIA/tacotron2)) | 57 | | LJ_FT_T2_V2 | V2 | LJSpeech | Yes ([Tacotron2](https://github.com/NVIDIA/tacotron2)) | 58 | | LJ_FT_T2_V3 | V3 | LJSpeech | Yes ([Tacotron2](https://github.com/NVIDIA/tacotron2)) | 59 | | VCTK_V1 | V1 | VCTK | No | 60 | | VCTK_V2 | V2 | VCTK | No | 61 | | VCTK_V3 | V3 | VCTK | No | 62 | | UNIVERSAL_V1 | V1 | Universal | No | 63 | 64 | We provide the universal model with discriminator weights that can be used as a base for transfer learning to other datasets. 65 | 66 | ## Fine-Tuning 67 | 68 | 1. Generate mel-spectrograms in numpy format using [Tacotron2](https://github.com/NVIDIA/tacotron2) with teacher-forcing.
69 | The file name of the generated mel-spectrogram should match that of the audio file, and the extension should be `.npy`.
70 | Example: 71 | ` Audio File : LJ001-0001.wav 72 | Mel-Spectrogram File : LJ001-0001.npy` 73 | 2. Create an `ft_dataset` folder and copy the generated mel-spectrogram files into it.
74 | 3. Run the following command. 75 | ``` 76 | python train.py --fine_tuning True --config config_v1.json 77 | ``` 78 | For other command-line options, please refer to the training section. 79 | 80 | ## Inference from wav file 81 | 82 | 1. Make a `test_files` directory and copy wav files into it. 83 | 2. Run the following command. 84 | ` python inference.py --checkpoint_file [generator checkpoint file path]` 85 | Generated wav files are saved in `generated_files` by default.
86 | You can change the path by adding the `--output_dir` option. 87 | 88 | ## Inference for end-to-end speech synthesis 89 | 90 | 1. Make a `test_mel_files` directory and copy the generated mel-spectrogram files into it.
91 | You can generate mel-spectrograms using [Tacotron2](https://github.com/NVIDIA/tacotron2), 92 | [Glow-TTS](https://github.com/jaywalnut310/glow-tts) and so forth. 93 | 2. Run the following command. 94 | ` python inference_e2e.py --checkpoint_file [generator checkpoint file path]` 95 | Generated wav files are saved in `generated_files_from_mel` by default.
96 | You can change the path by adding `--output_dir` option. 97 | 98 | ## Acknowledgements 99 | 100 | We referred to [WaveGlow](https://github.com/NVIDIA/waveglow), [MelGAN](https://github.com/descriptinc/melgan-neurips) 101 | and [Tacotron2](https://github.com/NVIDIA/tacotron2) to implement this. 102 | -------------------------------------------------------------------------------- /cosyvoice/flow/flow_matching.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Zhihao Du) 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | import torch 15 | import torch.nn.functional as F 16 | from matcha.models.components.flow_matching import BASECFM 17 | 18 | class ConditionalCFM(BASECFM): 19 | def __init__(self, in_channels, cfm_params, n_spks=1, spk_emb_dim=64, estimator: torch.nn.Module = None): 20 | super().__init__( 21 | n_feats=in_channels, 22 | cfm_params=cfm_params, 23 | n_spks=n_spks, 24 | spk_emb_dim=spk_emb_dim, 25 | ) 26 | self.t_scheduler = cfm_params.t_scheduler 27 | self.training_cfg_rate = cfm_params.training_cfg_rate 28 | self.inference_cfg_rate = cfm_params.inference_cfg_rate 29 | in_channels = in_channels + (spk_emb_dim if n_spks > 0 else 0) 30 | # Just change the architecture of the estimator here 31 | self.estimator = estimator 32 | 33 | @torch.inference_mode() 34 | def forward(self, mu, mask, n_timesteps, temperature=1.0, spks=None, cond=None): 35 | """Forward diffusion 36 | 37 | Args: 38 | mu (torch.Tensor): output of encoder 39 | shape: (batch_size, n_feats, mel_timesteps) 40 | mask (torch.Tensor): output_mask 41 | shape: (batch_size, 1, mel_timesteps) 42 | n_timesteps (int): number of diffusion steps 43 | temperature (float, optional): temperature for scaling noise. Defaults to 1.0. 44 | spks (torch.Tensor, optional): speaker ids. Defaults to None. 45 | shape: (batch_size, spk_emb_dim) 46 | cond: Not used but kept for future purposes 47 | 48 | Returns: 49 | sample: generated mel-spectrogram 50 | shape: (batch_size, n_feats, mel_timesteps) 51 | """ 52 | z = torch.randn_like(mu) * temperature 53 | t_span = torch.linspace(0, 1, n_timesteps + 1, device=mu.device) 54 | if self.t_scheduler == 'cosine': 55 | t_span = 1 - torch.cos(t_span * 0.5 * torch.pi) 56 | return self.solve_euler(z, t_span=t_span, mu=mu, mask=mask, spks=spks, cond=cond) 57 | 58 | def solve_euler(self, x, t_span, mu, mask, spks, cond): 59 | """ 60 | Fixed euler solver for ODEs. 61 | Args: 62 | x (torch.Tensor): random noise 63 | t_span (torch.Tensor): n_timesteps interpolated 64 | shape: (n_timesteps + 1,) 65 | mu (torch.Tensor): output of encoder 66 | shape: (batch_size, n_feats, mel_timesteps) 67 | mask (torch.Tensor): output_mask 68 | shape: (batch_size, 1, mel_timesteps) 69 | spks (torch.Tensor, optional): speaker ids. Defaults to None. 
70 | shape: (batch_size, spk_emb_dim) 71 | cond: Not used but kept for future purposes 72 | """ 73 | t, _, dt = t_span[0], t_span[-1], t_span[1] - t_span[0] 74 | 75 | # I am storing this because I can later plot it by putting a debugger here and saving it to a file 76 | # Or in future might add like a return_all_steps flag 77 | sol = [] 78 | 79 | for step in range(1, len(t_span)): 80 | dphi_dt = self.estimator(x, mask, mu, t, spks, cond) 81 | # Classifier-Free Guidance inference introduced in VoiceBox 82 | if self.inference_cfg_rate > 0: 83 | cfg_dphi_dt = self.estimator( 84 | x, mask, 85 | torch.zeros_like(mu), t, 86 | torch.zeros_like(spks) if spks is not None else None, 87 | torch.zeros_like(cond) 88 | ) 89 | dphi_dt = ((1.0 + self.inference_cfg_rate) * dphi_dt - 90 | self.inference_cfg_rate * cfg_dphi_dt) 91 | x = x + dt * dphi_dt 92 | t = t + dt 93 | sol.append(x) 94 | if step < len(t_span) - 1: 95 | dt = t_span[step + 1] - t 96 | 97 | return sol[-1] 98 | 99 | def compute_loss(self, x1, mask, mu, spks=None, cond=None): 100 | """Computes diffusion loss 101 | 102 | Args: 103 | x1 (torch.Tensor): Target 104 | shape: (batch_size, n_feats, mel_timesteps) 105 | mask (torch.Tensor): target mask 106 | shape: (batch_size, 1, mel_timesteps) 107 | mu (torch.Tensor): output of encoder 108 | shape: (batch_size, n_feats, mel_timesteps) 109 | spks (torch.Tensor, optional): speaker embedding. Defaults to None. 110 | shape: (batch_size, spk_emb_dim) 111 | 112 | Returns: 113 | loss: conditional flow matching loss 114 | y: conditional flow 115 | shape: (batch_size, n_feats, mel_timesteps) 116 | """ 117 | b, _, t = mu.shape 118 | 119 | # random timestep 120 | t = torch.rand([b, 1, 1], device=mu.device, dtype=mu.dtype) 121 | if self.t_scheduler == 'cosine': 122 | t = 1 - torch.cos(t * 0.5 * torch.pi) 123 | # sample noise p(x_0) 124 | z = torch.randn_like(x1) 125 | 126 | y = (1 - (1 - self.sigma_min) * t) * z + t * x1 127 | u = x1 - (1 - self.sigma_min) * z 128 | 129 | pred = self.estimator(y, mask, mu, t.squeeze(), spks, cond) 130 | loss = F.mse_loss(pred * mask, u * mask, reduction="sum") / (torch.sum(mask) * u.shape[1]) 131 | return loss, y 132 | -------------------------------------------------------------------------------- /academicodec/binary.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | """Raw binary format for Encodec compressed audio. Actual compression API is in `encodec.compress`.""" 7 | import io 8 | import json 9 | import struct 10 | import typing as tp 11 | 12 | # format is `ECDC` magic code, followed by the header size as uint32. 13 | # Then an uint8 indicates the protocol version (0.) 14 | # The header is then provided as json and should contain all required 15 | # informations for decoding. A raw stream of bytes is then provided 16 | # and should be interpretable using the json header. 
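# A minimal sketch of the resulting byte layout, mirroring `write_ecdc_header` below
# (the metadata dict is illustrative only, not a format requirement; note that, as packed
# below, the version byte precedes the header-length field):
#
#     import io, json, struct
#     meta = json.dumps({"m": "encodec_16k"}).encode('utf-8')
#     fo = io.BytesIO()
#     fo.write(struct.Struct('!4sBI').pack(b'ECDC', 0, len(meta)))  # magic, version byte, header length
#     fo.write(meta)                                                 # json header
#     # ...followed by the raw packed token bytes (see BitPacker / BitUnpacker below).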
17 | _encodec_header_struct = struct.Struct('!4sBI') 18 | _ENCODEC_MAGIC = b'ECDC' 19 | 20 | 21 | def write_ecdc_header(fo: tp.IO[bytes], metadata: tp.Any): 22 | meta_dumped = json.dumps(metadata).encode('utf-8') 23 | version = 0 24 | header = _encodec_header_struct.pack(_ENCODEC_MAGIC, version, 25 | len(meta_dumped)) 26 | fo.write(header) 27 | fo.write(meta_dumped) 28 | fo.flush() 29 | 30 | 31 | def _read_exactly(fo: tp.IO[bytes], size: int) -> bytes: 32 | buf = b"" 33 | while len(buf) < size: 34 | new_buf = fo.read(size) 35 | if not new_buf: 36 | raise EOFError("Impossible to read enough data from the stream, " 37 | f"{size} bytes remaining.") 38 | buf += new_buf 39 | size -= len(new_buf) 40 | return buf 41 | 42 | 43 | def read_ecdc_header(fo: tp.IO[bytes]): 44 | header_bytes = _read_exactly(fo, _encodec_header_struct.size) 45 | magic, version, meta_size = _encodec_header_struct.unpack(header_bytes) 46 | if magic != _ENCODEC_MAGIC: 47 | raise ValueError("File is not in ECDC format.") 48 | if version != 0: 49 | raise ValueError("Version not supported.") 50 | meta_bytes = _read_exactly(fo, meta_size) 51 | return json.loads(meta_bytes.decode('utf-8')) 52 | 53 | 54 | class BitPacker: 55 | """Simple bit packer to handle ints with a non standard width, e.g. 10 bits. 56 | Note that for some bandwidth (1.5, 3), the codebook representation 57 | will not cover an integer number of bytes. 58 | 59 | Args: 60 | bits (int): number of bits per value that will be pushed. 61 | fo (IO[bytes]): file-object to push the bytes to. 62 | """ 63 | 64 | def __init__(self, bits: int, fo: tp.IO[bytes]): 65 | self._current_value = 0 66 | self._current_bits = 0 67 | self.bits = bits 68 | self.fo = fo 69 | 70 | def push(self, value: int): 71 | """Push a new value to the stream. This will immediately 72 | write as many uint8 as possible to the underlying file-object.""" 73 | self._current_value += (value << self._current_bits) 74 | self._current_bits += self.bits 75 | while self._current_bits >= 8: 76 | lower_8bits = self._current_value & 0xff 77 | self._current_bits -= 8 78 | self._current_value >>= 8 79 | self.fo.write(bytes([lower_8bits])) 80 | 81 | def flush(self): 82 | """Flushes the remaining partial uint8, call this at the end 83 | of the stream to encode.""" 84 | if self._current_bits: 85 | self.fo.write(bytes([self._current_value])) 86 | self._current_value = 0 87 | self._current_bits = 0 88 | self.fo.flush() 89 | 90 | 91 | class BitUnpacker: 92 | """BitUnpacker does the opposite of `BitPacker`. 93 | 94 | Args: 95 | bits (int): number of bits of the values to decode. 96 | fo (IO[bytes]): file-object to push the bytes to. 97 | """ 98 | 99 | def __init__(self, bits: int, fo: tp.IO[bytes]): 100 | self.bits = bits 101 | self.fo = fo 102 | self._mask = (1 << bits) - 1 103 | self._current_value = 0 104 | self._current_bits = 0 105 | 106 | def pull(self) -> tp.Optional[int]: 107 | """ 108 | Pull a single value from the stream, potentially reading some 109 | extra bytes from the underlying file-object. 110 | Returns `None` when reaching the end of the stream. 
111 | """ 112 | while self._current_bits < self.bits: 113 | buf = self.fo.read(1) 114 | if not buf: 115 | return None 116 | character = buf[0] 117 | self._current_value += character << self._current_bits 118 | self._current_bits += 8 119 | 120 | out = self._current_value & self._mask 121 | self._current_value >>= self.bits 122 | self._current_bits -= self.bits 123 | return out 124 | 125 | 126 | def test(): 127 | import torch 128 | torch.manual_seed(1234) 129 | for rep in range(4): 130 | length: int = torch.randint(10, 2_000, (1, )).item() 131 | bits: int = torch.randint(1, 16, (1, )).item() 132 | tokens: tp.List[int] = torch.randint(2**bits, (length, )).tolist() 133 | rebuilt: tp.List[int] = [] 134 | buf = io.BytesIO() 135 | packer = BitPacker(bits, buf) 136 | for token in tokens: 137 | packer.push(token) 138 | packer.flush() 139 | buf.seek(0) 140 | unpacker = BitUnpacker(bits, buf) 141 | while True: 142 | value = unpacker.pull() 143 | if value is None: 144 | break 145 | rebuilt.append(value) 146 | assert len(rebuilt) >= len(tokens), (len(rebuilt), len(tokens)) 147 | # The flushing mechanism might lead to "ghost" values at the end of the stream. 148 | assert len(rebuilt) <= len(tokens) + 8 // bits, (len(rebuilt), 149 | len(tokens), bits) 150 | for idx, (a, b) in enumerate(zip(tokens, rebuilt)): 151 | assert a == b, (idx, a, b) 152 | 153 | 154 | if __name__ == '__main__': 155 | test() 156 | -------------------------------------------------------------------------------- /matcha/onnx/export.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import random 3 | from pathlib import Path 4 | 5 | import numpy as np 6 | import torch 7 | from lightning import LightningModule 8 | 9 | from matcha.cli import VOCODER_URLS, load_matcha, load_vocoder 10 | 11 | DEFAULT_OPSET = 15 12 | 13 | SEED = 1234 14 | random.seed(SEED) 15 | np.random.seed(SEED) 16 | torch.manual_seed(SEED) 17 | torch.cuda.manual_seed(SEED) 18 | torch.backends.cudnn.deterministic = True 19 | torch.backends.cudnn.benchmark = False 20 | 21 | 22 | class MatchaWithVocoder(LightningModule): 23 | def __init__(self, matcha, vocoder): 24 | super().__init__() 25 | self.matcha = matcha 26 | self.vocoder = vocoder 27 | 28 | def forward(self, x, x_lengths, scales, spks=None): 29 | mel, mel_lengths = self.matcha(x, x_lengths, scales, spks) 30 | wavs = self.vocoder(mel).clamp(-1, 1) 31 | lengths = mel_lengths * 256 32 | return wavs.squeeze(1), lengths 33 | 34 | 35 | def get_exportable_module(matcha, vocoder, n_timesteps): 36 | """ 37 | Return an appropriate `LighteningModule` and output-node names 38 | based on whether the vocoder is embedded in the final graph 39 | """ 40 | 41 | def onnx_forward_func(x, x_lengths, scales, spks=None): 42 | """ 43 | Custom forward function for accepting 44 | scaler parameters as tensors 45 | """ 46 | # Extract scaler parameters from tensors 47 | temperature = scales[0] 48 | length_scale = scales[1] 49 | output = matcha.synthesise(x, x_lengths, n_timesteps, temperature, spks, length_scale) 50 | return output["mel"], output["mel_lengths"] 51 | 52 | # Monkey-patch Matcha's forward function 53 | matcha.forward = onnx_forward_func 54 | 55 | if vocoder is None: 56 | model, output_names = matcha, ["mel", "mel_lengths"] 57 | else: 58 | model = MatchaWithVocoder(matcha, vocoder) 59 | output_names = ["wav", "wav_lengths"] 60 | return model, output_names 61 | 62 | 63 | def get_inputs(is_multi_speaker): 64 | """ 65 | Create dummy inputs for tracing 66 | """ 67 | 
dummy_input_length = 50 68 | x = torch.randint(low=0, high=20, size=(1, dummy_input_length), dtype=torch.long) 69 | x_lengths = torch.LongTensor([dummy_input_length]) 70 | 71 | # Scales 72 | temperature = 0.667 73 | length_scale = 1.0 74 | scales = torch.Tensor([temperature, length_scale]) 75 | 76 | model_inputs = [x, x_lengths, scales] 77 | input_names = [ 78 | "x", 79 | "x_lengths", 80 | "scales", 81 | ] 82 | 83 | if is_multi_speaker: 84 | spks = torch.LongTensor([1]) 85 | model_inputs.append(spks) 86 | input_names.append("spks") 87 | 88 | return tuple(model_inputs), input_names 89 | 90 | 91 | def main(): 92 | parser = argparse.ArgumentParser(description="Export 🍵 Matcha-TTS to ONNX") 93 | 94 | parser.add_argument( 95 | "checkpoint_path", 96 | type=str, 97 | help="Path to the model checkpoint", 98 | ) 99 | parser.add_argument("output", type=str, help="Path to output `.onnx` file") 100 | parser.add_argument( 101 | "--n-timesteps", type=int, default=5, help="Number of steps to use for reverse diffusion in decoder (default 5)" 102 | ) 103 | parser.add_argument( 104 | "--vocoder-name", 105 | type=str, 106 | choices=list(VOCODER_URLS.keys()), 107 | default=None, 108 | help="Name of the vocoder to embed in the ONNX graph", 109 | ) 110 | parser.add_argument( 111 | "--vocoder-checkpoint-path", 112 | type=str, 113 | default=None, 114 | help="Vocoder checkpoint to embed in the ONNX graph for an `e2e` like experience", 115 | ) 116 | parser.add_argument("--opset", type=int, default=DEFAULT_OPSET, help="ONNX opset version to use (default 15") 117 | 118 | args = parser.parse_args() 119 | 120 | print(f"[🍵] Loading Matcha checkpoint from {args.checkpoint_path}") 121 | print(f"Setting n_timesteps to {args.n_timesteps}") 122 | 123 | checkpoint_path = Path(args.checkpoint_path) 124 | matcha = load_matcha(checkpoint_path.stem, checkpoint_path, "cpu") 125 | 126 | if args.vocoder_name or args.vocoder_checkpoint_path: 127 | assert ( 128 | args.vocoder_name and args.vocoder_checkpoint_path 129 | ), "Both vocoder_name and vocoder-checkpoint are required when embedding the vocoder in the ONNX graph." 
130 | vocoder, _ = load_vocoder(args.vocoder_name, args.vocoder_checkpoint_path, "cpu") 131 | else: 132 | vocoder = None 133 | 134 | is_multi_speaker = matcha.n_spks > 1 135 | 136 | dummy_input, input_names = get_inputs(is_multi_speaker) 137 | model, output_names = get_exportable_module(matcha, vocoder, args.n_timesteps) 138 | 139 | # Set dynamic shape for inputs/outputs 140 | dynamic_axes = { 141 | "x": {0: "batch_size", 1: "time"}, 142 | "x_lengths": {0: "batch_size"}, 143 | } 144 | 145 | if vocoder is None: 146 | dynamic_axes.update( 147 | { 148 | "mel": {0: "batch_size", 2: "time"}, 149 | "mel_lengths": {0: "batch_size"}, 150 | } 151 | ) 152 | else: 153 | print("Embedding the vocoder in the ONNX graph") 154 | dynamic_axes.update( 155 | { 156 | "wav": {0: "batch_size", 1: "time"}, 157 | "wav_lengths": {0: "batch_size"}, 158 | } 159 | ) 160 | 161 | if is_multi_speaker: 162 | dynamic_axes["spks"] = {0: "batch_size"} 163 | 164 | # Create the output directory (if not exists) 165 | Path(args.output).parent.mkdir(parents=True, exist_ok=True) 166 | 167 | model.to_onnx( 168 | args.output, 169 | dummy_input, 170 | input_names=input_names, 171 | output_names=output_names, 172 | dynamic_axes=dynamic_axes, 173 | opset_version=args.opset, 174 | export_params=True, 175 | do_constant_folding=True, 176 | ) 177 | print(f"[🍵] ONNX model exported to {args.output}") 178 | 179 | 180 | if __name__ == "__main__": 181 | main() 182 | -------------------------------------------------------------------------------- /cosyvoice/cli/model.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu) 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
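# A minimal usage sketch of the CosyVoiceModel wrapper defined below. The helper that
# builds the three sub-modules, the checkpoint filenames and the 22050 Hz save rate are
# assumptions for illustration only, not fixed by this file:
#
#     llm, flow, hift = build_modules_from_yaml('cosyvoice.yaml')   # hypothetical helper
#     model = CosyVoiceModel(llm, flow, hift)
#     model.load('llm.pt', 'flow.pt', 'hift.pt')
#     out = model.inference(text, text_len, flow_embedding=spk_embedding)
#     torchaudio.save('out.wav', out['tts_speech'], 22050)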
14 | import torch 15 | 16 | class CosyVoiceModel: 17 | 18 | def __init__(self, 19 | llm: torch.nn.Module, 20 | flow: torch.nn.Module, 21 | hift: torch.nn.Module): 22 | self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 23 | self.llm = llm 24 | self.flow = flow 25 | self.hift = hift 26 | 27 | def load(self, llm_model, flow_model, hift_model): 28 | self.llm.load_state_dict(torch.load(llm_model, map_location=self.device)) 29 | self.llm.to(self.device).eval() 30 | self.flow.load_state_dict(torch.load(flow_model, map_location=self.device)) 31 | self.flow.to(self.device).eval() 32 | self.hift.load_state_dict(torch.load(hift_model, map_location=self.device)) 33 | self.hift.to(self.device).eval() 34 | 35 | def inference(self, text, text_len, flow_embedding, llm_embedding=torch.zeros(0, 192), 36 | prompt_text=torch.zeros(1, 0, dtype=torch.int32), prompt_text_len=torch.zeros(1, dtype=torch.int32), 37 | llm_prompt_speech_token=torch.zeros(1, 0, dtype=torch.int32), llm_prompt_speech_token_len=torch.zeros(1, dtype=torch.int32), 38 | flow_prompt_speech_token=torch.zeros(1, 0, dtype=torch.int32), flow_prompt_speech_token_len=torch.zeros(1, dtype=torch.int32), 39 | prompt_speech_feat=torch.zeros(1, 0, 80), prompt_speech_feat_len=torch.zeros(1, dtype=torch.int32)): 40 | tts_speech_token = self.llm.inference(text=text.to(self.device), 41 | text_len=text_len.to(self.device), 42 | prompt_text=prompt_text.to(self.device), 43 | prompt_text_len=prompt_text_len.to(self.device), 44 | prompt_speech_token=llm_prompt_speech_token.to(self.device), 45 | prompt_speech_token_len=llm_prompt_speech_token_len.to(self.device), 46 | embedding=llm_embedding.to(self.device), 47 | beam_size=1, 48 | sampling=25, 49 | max_token_text_ratio=30, 50 | min_token_text_ratio=3) 51 | tts_mel = self.flow.inference(token=tts_speech_token, 52 | token_len=torch.tensor([tts_speech_token.size(1)], dtype=torch.int32).to(self.device), 53 | prompt_token=flow_prompt_speech_token.to(self.device), 54 | prompt_token_len=flow_prompt_speech_token_len.to(self.device), 55 | prompt_feat=prompt_speech_feat.to(self.device), 56 | prompt_feat_len=prompt_speech_feat_len.to(self.device), 57 | embedding=flow_embedding.to(self.device)) 58 | tts_speech = self.hift.inference(mel=tts_mel).cpu() 59 | torch.cuda.empty_cache() 60 | return {'tts_speech': tts_speech} 61 | 62 | def inference_stream(self, text, text_len, flow_embedding, llm_embedding=torch.zeros(0, 192), 63 | prompt_text=torch.zeros(1, 0, dtype=torch.int32), prompt_text_len=torch.zeros(1, dtype=torch.int32), 64 | llm_prompt_speech_token=torch.zeros(1, 0, dtype=torch.int32), llm_prompt_speech_token_len=torch.zeros(1, dtype=torch.int32), 65 | flow_prompt_speech_token=torch.zeros(1, 0, dtype=torch.int32), flow_prompt_speech_token_len=torch.zeros(1, dtype=torch.int32), 66 | prompt_speech_feat=torch.zeros(1, 0, 80), prompt_speech_feat_len=torch.zeros(1, dtype=torch.int32)): 67 | try: 68 | tts_speech_token = next(self.llm.inference_stream(text=text.to(self.device), 69 | text_len=text_len.to(self.device), 70 | prompt_text=prompt_text.to(self.device), 71 | prompt_text_len=prompt_text_len.to(self.device), 72 | prompt_speech_token=llm_prompt_speech_token.to(self.device), 73 | prompt_speech_token_len=llm_prompt_speech_token_len.to(self.device), 74 | embedding=llm_embedding.to(self.device), 75 | beam_size=1, 76 | sampling=25, 77 | max_token_text_ratio=30, 78 | min_token_text_ratio=3)) 79 | except StopIteration: 80 | print("LLM inference stream exhausted") 81 | return 82 | 83 | try: 84 | tts_mel 
= next(self.flow.inference_stream(token=tts_speech_token, 85 | token_len=torch.tensor([tts_speech_token.size(1)], dtype=torch.int32).to(self.device), 86 | prompt_token=flow_prompt_speech_token.to(self.device), 87 | prompt_token_len=flow_prompt_speech_token_len.to(self.device), 88 | prompt_feat=prompt_speech_feat.to(self.device), 89 | prompt_feat_len=prompt_speech_feat_len.to(self.device), 90 | embedding=flow_embedding.to(self.device))) 91 | except StopIteration: 92 | print("Flow inference stream exhausted") 93 | return 94 | 95 | try: 96 | tts_speech = next(self.hift.inference_stream(mel=tts_mel)) 97 | except StopIteration: 98 | print("HIFT inference stream exhausted") 99 | return 100 | 101 | tts_speech = tts_speech.cpu() 102 | torch.cuda.empty_cache() 103 | yield {'tts_speech': tts_speech} -------------------------------------------------------------------------------- /cosyvoice/cli/zh_normalization/text_normlization.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | import re 15 | from typing import List 16 | 17 | from .char_convert import tranditional_to_simplified 18 | from .chronology import RE_DATE 19 | from .chronology import RE_DATE2 20 | from .chronology import RE_TIME 21 | from .chronology import RE_TIME_RANGE 22 | from .chronology import replace_date 23 | from .chronology import replace_date2 24 | from .chronology import replace_time 25 | from .constants import F2H_ASCII_LETTERS 26 | from .constants import F2H_DIGITS 27 | from .constants import F2H_SPACE 28 | from .num import RE_DECIMAL_NUM 29 | from .num import RE_DEFAULT_NUM 30 | from .num import RE_FRAC 31 | from .num import RE_INTEGER 32 | from .num import RE_NUMBER 33 | from .num import RE_PERCENTAGE 34 | from .num import RE_POSITIVE_QUANTIFIERS 35 | from .num import RE_RANGE 36 | from .num import replace_default_num 37 | from .num import replace_frac 38 | from .num import replace_negative_num 39 | from .num import replace_number 40 | from .num import replace_percentage 41 | from .num import replace_positive_quantifier 42 | from .num import replace_range 43 | from .phonecode import RE_MOBILE_PHONE 44 | from .phonecode import RE_NATIONAL_UNIFORM_NUMBER 45 | from .phonecode import RE_TELEPHONE 46 | from .phonecode import replace_mobile 47 | from .phonecode import replace_phone 48 | from .quantifier import RE_TEMPERATURE 49 | from .quantifier import replace_measure 50 | from .quantifier import replace_temperature 51 | 52 | 53 | class TextNormalizer(): 54 | def __init__(self): 55 | self.SENTENCE_SPLITOR = re.compile(r'([:、,;。?!,;?!][”’]?)') 56 | 57 | def _split(self, text: str, lang="zh") -> List[str]: 58 | """Split long text into sentences with sentence-splitting punctuations. 59 | Args: 60 | text (str): The input text. 61 | Returns: 62 | List[str]: Sentences. 
63 | """ 64 | # Only for pure Chinese here 65 | if lang == "zh": 66 | text = text.replace(" ", "") 67 | # 过滤掉特殊字符 68 | text = re.sub(r'[——《》【】<=>{}()()#&@“”^_|…\\]', '', text) 69 | text = self.SENTENCE_SPLITOR.sub(r'\1\n', text) 70 | text = text.strip() 71 | sentences = [sentence.strip() for sentence in re.split(r'\n+', text)] 72 | return sentences 73 | 74 | def _post_replace(self, sentence: str) -> str: 75 | sentence = sentence.replace('/', '每') 76 | sentence = sentence.replace('~', '至') 77 | sentence = sentence.replace('~', '至') 78 | sentence = sentence.replace('①', '一') 79 | sentence = sentence.replace('②', '二') 80 | sentence = sentence.replace('③', '三') 81 | sentence = sentence.replace('④', '四') 82 | sentence = sentence.replace('⑤', '五') 83 | sentence = sentence.replace('⑥', '六') 84 | sentence = sentence.replace('⑦', '七') 85 | sentence = sentence.replace('⑧', '八') 86 | sentence = sentence.replace('⑨', '九') 87 | sentence = sentence.replace('⑩', '十') 88 | sentence = sentence.replace('α', '阿尔法') 89 | sentence = sentence.replace('β', '贝塔') 90 | sentence = sentence.replace('γ', '伽玛').replace('Γ', '伽玛') 91 | sentence = sentence.replace('δ', '德尔塔').replace('Δ', '德尔塔') 92 | sentence = sentence.replace('ε', '艾普西龙') 93 | sentence = sentence.replace('ζ', '捷塔') 94 | sentence = sentence.replace('η', '依塔') 95 | sentence = sentence.replace('θ', '西塔').replace('Θ', '西塔') 96 | sentence = sentence.replace('ι', '艾欧塔') 97 | sentence = sentence.replace('κ', '喀帕') 98 | sentence = sentence.replace('λ', '拉姆达').replace('Λ', '拉姆达') 99 | sentence = sentence.replace('μ', '缪') 100 | sentence = sentence.replace('ν', '拗') 101 | sentence = sentence.replace('ξ', '克西').replace('Ξ', '克西') 102 | sentence = sentence.replace('ο', '欧米克伦') 103 | sentence = sentence.replace('π', '派').replace('Π', '派') 104 | sentence = sentence.replace('ρ', '肉') 105 | sentence = sentence.replace('ς', '西格玛').replace('Σ', '西格玛').replace( 106 | 'σ', '西格玛') 107 | sentence = sentence.replace('τ', '套') 108 | sentence = sentence.replace('υ', '宇普西龙') 109 | sentence = sentence.replace('φ', '服艾').replace('Φ', '服艾') 110 | sentence = sentence.replace('χ', '器') 111 | sentence = sentence.replace('ψ', '普赛').replace('Ψ', '普赛') 112 | sentence = sentence.replace('ω', '欧米伽').replace('Ω', '欧米伽') 113 | # re filter special characters, have one more character "-" than line 68 114 | sentence = re.sub(r'[-——《》【】<=>{}()()#&@“”^_|…\\]', '', sentence) 115 | return sentence 116 | 117 | def normalize_sentence(self, sentence: str) -> str: 118 | # basic character conversions 119 | sentence = tranditional_to_simplified(sentence) 120 | sentence = sentence.translate(F2H_ASCII_LETTERS).translate( 121 | F2H_DIGITS).translate(F2H_SPACE) 122 | 123 | # number related NSW verbalization 124 | sentence = RE_DATE.sub(replace_date, sentence) 125 | sentence = RE_DATE2.sub(replace_date2, sentence) 126 | 127 | # range first 128 | sentence = RE_TIME_RANGE.sub(replace_time, sentence) 129 | sentence = RE_TIME.sub(replace_time, sentence) 130 | 131 | sentence = RE_TEMPERATURE.sub(replace_temperature, sentence) 132 | sentence = replace_measure(sentence) 133 | sentence = RE_FRAC.sub(replace_frac, sentence) 134 | sentence = RE_PERCENTAGE.sub(replace_percentage, sentence) 135 | sentence = RE_MOBILE_PHONE.sub(replace_mobile, sentence) 136 | 137 | sentence = RE_TELEPHONE.sub(replace_phone, sentence) 138 | sentence = RE_NATIONAL_UNIFORM_NUMBER.sub(replace_phone, sentence) 139 | 140 | sentence = RE_RANGE.sub(replace_range, sentence) 141 | sentence = RE_INTEGER.sub(replace_negative_num, sentence) 142 | 
sentence = RE_DECIMAL_NUM.sub(replace_number, sentence) 143 | sentence = RE_POSITIVE_QUANTIFIERS.sub(replace_positive_quantifier, 144 | sentence) 145 | sentence = RE_DEFAULT_NUM.sub(replace_default_num, sentence) 146 | sentence = RE_NUMBER.sub(replace_number, sentence) 147 | sentence = self._post_replace(sentence) 148 | 149 | return sentence 150 | 151 | def normalize(self, text: str) -> List[str]: 152 | sentences = self._split(text) 153 | sentences = [self.normalize_sentence(sent) for sent in sentences] 154 | return sentences 155 | -------------------------------------------------------------------------------- /academicodec/models/encodec/test.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | """Command-line for audio compression.""" 7 | import argparse 8 | import os 9 | import sys 10 | import typing as tp 11 | from collections import OrderedDict 12 | from pathlib import Path 13 | 14 | import librosa 15 | import soundfile as sf 16 | import torch 17 | from academicodec.models.encodec.net3 import SoundStream 18 | 19 | 20 | def save_audio(wav: torch.Tensor, 21 | path: tp.Union[Path, str], 22 | sample_rate: int, 23 | rescale: bool=False): 24 | limit = 0.99 25 | mx = wav.abs().max() 26 | if rescale: 27 | wav = wav * min(limit / mx, 1) 28 | else: 29 | wav = wav.clamp(-limit, limit) 30 | wav = wav.squeeze().cpu().numpy() 31 | sf.write(path, wav, sample_rate) 32 | 33 | 34 | def get_parser(): 35 | parser = argparse.ArgumentParser( 36 | 'encodec', 37 | description='High fidelity neural audio codec. ' 38 | 'If input is a .ecdc, decompresses it. ' 39 | 'If input is .wav, compresses it. If output is also wav, ' 40 | 'do a compression/decompression cycle.') 41 | parser.add_argument( 42 | '--input', 43 | type=Path, 44 | help='Input file, whatever is supported by torchaudio on your system.') 45 | parser.add_argument( 46 | '--output', 47 | type=Path, 48 | nargs='?', 49 | help='Output file, otherwise inferred from input file.') 50 | parser.add_argument( 51 | '--resume_path', type=str, default='resume_path', help='resume_path') 52 | parser.add_argument( 53 | '--sr', type=int, default=16000, help='sample rate of model') 54 | parser.add_argument( 55 | '-r', 56 | '--rescale', 57 | action='store_true', 58 | help='Automatically rescale the output to avoid clipping.') 59 | parser.add_argument( 60 | '--ratios', 61 | type=int, 62 | nargs='+', 63 | # probs(ratios) = hop_size 64 | default=[8, 5, 4, 2], 65 | help='ratios of SoundStream, shoud be set for different hop_size (32d, 320, 240d, ...)' 66 | ) 67 | parser.add_argument( 68 | '--target_bandwidths', 69 | type=float, 70 | nargs='+', 71 | # default for 16k_320d 72 | default=[1, 1.5, 2, 4, 6, 12], 73 | help='target_bandwidths of net3.py') 74 | parser.add_argument( 75 | '--target_bw', 76 | type=float, 77 | # default for 16k_320d 78 | default=12, 79 | help='target_bw of net3.py') 80 | 81 | return parser 82 | 83 | 84 | def fatal(*args): 85 | print(*args, file=sys.stderr) 86 | sys.exit(1) 87 | 88 | 89 | # 这只是打印了但是没有真的 clip 90 | def check_clipping(wav, rescale): 91 | if rescale: 92 | return 93 | mx = wav.abs().max() 94 | limit = 0.99 95 | if mx > limit: 96 | print( 97 | f"Clipping!! max scale {mx}, limit is {limit}. 
" 98 | "To avoid clipping, use the `-r` option to rescale the output.", 99 | file=sys.stderr) 100 | 101 | 102 | def test_one(args, wav_root, store_root, rescale, soundstream): 103 | # torchaudio.load 的采样率为原始音频的采样率,不会自动下采样 104 | # wav, sr = torchaudio.load(wav_root) 105 | # # 取单声道, output shape [1, T] 106 | # wav = wav[0].unsqueeze(0) 107 | # # 重采样为模型的采样率 108 | # wav = torchaudio.transforms.Resample(orig_freq=sr, new_freq=args.sr)(wav) 109 | 110 | # load wav with librosa 111 | wav, sr = librosa.load(wav_root, sr=args.sr) 112 | wav = torch.tensor(wav).unsqueeze(0) 113 | 114 | # add batch axis 115 | wav = wav.unsqueeze(1).cuda() 116 | 117 | # compressing 118 | compressed = soundstream.encode(wav, target_bw=args.target_bw) 119 | print('finish compressing') 120 | out = soundstream.decode(compressed) 121 | out = out.detach().cpu().squeeze(0) 122 | check_clipping(out, rescale) 123 | save_audio(wav=out, path=store_root, sample_rate=args.sr, rescale=rescale) 124 | print('finish decompressing') 125 | 126 | 127 | def remove_encodec_weight_norm(model): 128 | from academicodec.modules import SConv1d 129 | from academicodec.modules.seanet import SConvTranspose1d 130 | from academicodec.modules.seanet import SEANetResnetBlock 131 | from torch.nn.utils import remove_weight_norm 132 | 133 | encoder = model.encoder.model 134 | for key in encoder._modules: 135 | if isinstance(encoder._modules[key], SEANetResnetBlock): 136 | remove_weight_norm(encoder._modules[key].shortcut.conv.conv) 137 | block_modules = encoder._modules[key].block._modules 138 | for skey in block_modules: 139 | if isinstance(block_modules[skey], SConv1d): 140 | remove_weight_norm(block_modules[skey].conv.conv) 141 | elif isinstance(encoder._modules[key], SConv1d): 142 | remove_weight_norm(encoder._modules[key].conv.conv) 143 | 144 | decoder = model.decoder.model 145 | for key in decoder._modules: 146 | if isinstance(decoder._modules[key], SEANetResnetBlock): 147 | remove_weight_norm(decoder._modules[key].shortcut.conv.conv) 148 | block_modules = decoder._modules[key].block._modules 149 | for skey in block_modules: 150 | if isinstance(block_modules[skey], SConv1d): 151 | remove_weight_norm(block_modules[skey].conv.conv) 152 | elif isinstance(decoder._modules[key], SConvTranspose1d): 153 | remove_weight_norm(decoder._modules[key].convtr.convtr) 154 | elif isinstance(decoder._modules[key], SConv1d): 155 | remove_weight_norm(decoder._modules[key].conv.conv) 156 | 157 | 158 | def test_batch(): 159 | args = get_parser().parse_args() 160 | print("args.target_bandwidths:", args.target_bandwidths) 161 | if not args.input.exists(): 162 | fatal(f"Input file {args.input} does not exist.") 163 | input_lists = os.listdir(args.input) 164 | input_lists.sort() 165 | soundstream = SoundStream( 166 | n_filters=32, 167 | D=512, 168 | ratios=args.ratios, 169 | sample_rate=args.sr, 170 | target_bandwidths=args.target_bandwidths) 171 | parameter_dict = torch.load(args.resume_path) 172 | new_state_dict = OrderedDict() 173 | # k 为 module.xxx.weight, v 为权重 174 | for k, v in parameter_dict.items(): 175 | # 截取`module.`后面的xxx.weight 176 | name = k[7:] 177 | new_state_dict[name] = v 178 | soundstream.load_state_dict(new_state_dict) # load model 179 | remove_encodec_weight_norm(soundstream) 180 | soundstream.cuda() 181 | soundstream.eval() 182 | os.makedirs(args.output, exist_ok=True) 183 | for audio in input_lists: 184 | test_one( 185 | args=args, 186 | wav_root=os.path.join(args.input, audio), 187 | store_root=os.path.join(args.output, audio), 188 | 
rescale=args.rescale, 189 | soundstream=soundstream) 190 | 191 | 192 | if __name__ == '__main__': 193 | test_batch() 194 | -------------------------------------------------------------------------------- /cosyvoice/cli/zh_normalization/num.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """ 15 | Rules to verbalize numbers into Chinese characters. 16 | https://zh.wikipedia.org/wiki/中文数字#現代中文 17 | """ 18 | import re 19 | from collections import OrderedDict 20 | from typing import List 21 | 22 | DIGITS = {str(i): tran for i, tran in enumerate('零一二三四五六七八九')} 23 | UNITS = OrderedDict({ 24 | 1: '十', 25 | 2: '百', 26 | 3: '千', 27 | 4: '万', 28 | 8: '亿', 29 | }) 30 | 31 | COM_QUANTIFIERS = '(封|艘|把|目|套|段|人|所|朵|匹|张|座|回|场|尾|条|个|首|阙|阵|网|炮|顶|丘|棵|只|支|袭|辆|挑|担|颗|壳|窠|曲|墙|群|腔|砣|座|客|贯|扎|捆|刀|令|打|手|罗|坡|山|岭|江|溪|钟|队|单|双|对|出|口|头|脚|板|跳|枝|件|贴|针|线|管|名|位|身|堂|课|本|页|家|户|层|丝|毫|厘|分|钱|两|斤|担|铢|石|钧|锱|忽|(千|毫|微)克|毫|厘|(公)分|分|寸|尺|丈|里|寻|常|铺|程|(千|分|厘|毫|微)米|米|撮|勺|合|升|斗|石|盘|碗|碟|叠|桶|笼|盆|盒|杯|钟|斛|锅|簋|篮|盘|桶|罐|瓶|壶|卮|盏|箩|箱|煲|啖|袋|钵|年|月|日|季|刻|时|周|天|秒|分|小时|旬|纪|岁|世|更|夜|春|夏|秋|冬|代|伏|辈|丸|泡|粒|颗|幢|堆|条|根|支|道|面|片|张|颗|块|元|(亿|千万|百万|万|千|百)|(亿|千万|百万|万|千|百|美|)元|(亿|千万|百万|万|千|百|十|)吨|(亿|千万|百万|万|千|百|)块|角|毛|分)' 32 | 33 | # 分数表达式 34 | RE_FRAC = re.compile(r'(-?)(\d+)/(\d+)') 35 | 36 | 37 | def replace_frac(match) -> str: 38 | """ 39 | Args: 40 | match (re.Match) 41 | Returns: 42 | str 43 | """ 44 | sign = match.group(1) 45 | nominator = match.group(2) 46 | denominator = match.group(3) 47 | sign: str = "负" if sign else "" 48 | nominator: str = num2str(nominator) 49 | denominator: str = num2str(denominator) 50 | result = f"{sign}{denominator}分之{nominator}" 51 | return result 52 | 53 | 54 | # 百分数表达式 55 | RE_PERCENTAGE = re.compile(r'(-?)(\d+(\.\d+)?)%') 56 | 57 | 58 | def replace_percentage(match) -> str: 59 | """ 60 | Args: 61 | match (re.Match) 62 | Returns: 63 | str 64 | """ 65 | sign = match.group(1) 66 | percent = match.group(2) 67 | sign: str = "负" if sign else "" 68 | percent: str = num2str(percent) 69 | result = f"{sign}百分之{percent}" 70 | return result 71 | 72 | 73 | # 整数表达式 74 | # 带负号的整数 -10 75 | RE_INTEGER = re.compile(r'(-)' r'(\d+)') 76 | 77 | 78 | def replace_negative_num(match) -> str: 79 | """ 80 | Args: 81 | match (re.Match) 82 | Returns: 83 | str 84 | """ 85 | sign = match.group(1) 86 | number = match.group(2) 87 | sign: str = "负" if sign else "" 88 | number: str = num2str(number) 89 | result = f"{sign}{number}" 90 | return result 91 | 92 | 93 | # 编号-无符号整形 94 | # 00078 95 | RE_DEFAULT_NUM = re.compile(r'\d{3}\d*') 96 | 97 | 98 | def replace_default_num(match): 99 | """ 100 | Args: 101 | match (re.Match) 102 | Returns: 103 | str 104 | """ 105 | number = match.group(0) 106 | return verbalize_digit(number, alt_one=True) 107 | 108 | 109 | # 数字表达式 110 | # 纯小数 111 | RE_DECIMAL_NUM = re.compile(r'(-?)((\d+)(\.\d+))' r'|(\.(\d+))') 112 | # 正整数 + 量词 113 | 
RE_POSITIVE_QUANTIFIERS = re.compile(r"(\d+)([多余几\+])?" + COM_QUANTIFIERS) 114 | RE_NUMBER = re.compile(r'(-?)((\d+)(\.\d+)?)' r'|(\.(\d+))') 115 | 116 | 117 | def replace_positive_quantifier(match) -> str: 118 | """ 119 | Args: 120 | match (re.Match) 121 | Returns: 122 | str 123 | """ 124 | number = match.group(1) 125 | match_2 = match.group(2) 126 | if match_2 == "+": 127 | match_2 = "多" 128 | match_2: str = match_2 if match_2 else "" 129 | quantifiers: str = match.group(3) 130 | number: str = num2str(number) 131 | result = f"{number}{match_2}{quantifiers}" 132 | return result 133 | 134 | 135 | def replace_number(match) -> str: 136 | """ 137 | Args: 138 | match (re.Match) 139 | Returns: 140 | str 141 | """ 142 | sign = match.group(1) 143 | number = match.group(2) 144 | pure_decimal = match.group(5) 145 | if pure_decimal: 146 | result = num2str(pure_decimal) 147 | else: 148 | sign: str = "负" if sign else "" 149 | number: str = num2str(number) 150 | result = f"{sign}{number}" 151 | return result 152 | 153 | 154 | # 范围表达式 155 | # match.group(1) and match.group(8) are copy from RE_NUMBER 156 | 157 | RE_RANGE = re.compile( 158 | r'((-?)((\d+)(\.\d+)?)|(\.(\d+)))[-~]((-?)((\d+)(\.\d+)?)|(\.(\d+)))') 159 | 160 | 161 | def replace_range(match) -> str: 162 | """ 163 | Args: 164 | match (re.Match) 165 | Returns: 166 | str 167 | """ 168 | first, second = match.group(1), match.group(8) 169 | first = RE_NUMBER.sub(replace_number, first) 170 | second = RE_NUMBER.sub(replace_number, second) 171 | result = f"{first}到{second}" 172 | return result 173 | 174 | 175 | def _get_value(value_string: str, use_zero: bool=True) -> List[str]: 176 | stripped = value_string.lstrip('0') 177 | if len(stripped) == 0: 178 | return [] 179 | elif len(stripped) == 1: 180 | if use_zero and len(stripped) < len(value_string): 181 | return [DIGITS['0'], DIGITS[stripped]] 182 | else: 183 | return [DIGITS[stripped]] 184 | else: 185 | largest_unit = next( 186 | power for power in reversed(UNITS.keys()) if power < len(stripped)) 187 | first_part = value_string[:-largest_unit] 188 | second_part = value_string[-largest_unit:] 189 | return _get_value(first_part) + [UNITS[largest_unit]] + _get_value( 190 | second_part) 191 | 192 | 193 | def verbalize_cardinal(value_string: str) -> str: 194 | if not value_string: 195 | return '' 196 | 197 | # 000 -> '零' , 0 -> '零' 198 | value_string = value_string.lstrip('0') 199 | if len(value_string) == 0: 200 | return DIGITS['0'] 201 | 202 | result_symbols = _get_value(value_string) 203 | # verbalized number starting with '一十*' is abbreviated as `十*` 204 | if len(result_symbols) >= 2 and result_symbols[0] == DIGITS[ 205 | '1'] and result_symbols[1] == UNITS[1]: 206 | result_symbols = result_symbols[1:] 207 | return ''.join(result_symbols) 208 | 209 | 210 | def verbalize_digit(value_string: str, alt_one=False) -> str: 211 | result_symbols = [DIGITS[digit] for digit in value_string] 212 | result = ''.join(result_symbols) 213 | if alt_one: 214 | result = result.replace("一", "幺") 215 | return result 216 | 217 | 218 | def num2str(value_string: str) -> str: 219 | integer_decimal = value_string.split('.') 220 | if len(integer_decimal) == 1: 221 | integer = integer_decimal[0] 222 | decimal = '' 223 | elif len(integer_decimal) == 2: 224 | integer, decimal = integer_decimal 225 | else: 226 | raise ValueError( 227 | f"The value string: '${value_string}' has more than one point in it." 
228 | ) 229 | 230 | result = verbalize_cardinal(integer) 231 | 232 | decimal = decimal.rstrip('0') 233 | if decimal: 234 | # '.22' is verbalized as '零点二二' 235 | # '3.20' is verbalized as '三点二 236 | result = result if result else "零" 237 | result += '点' + verbalize_digit(decimal) 238 | return result 239 | -------------------------------------------------------------------------------- /academicodec/utils.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import json 3 | import os 4 | import random 5 | import sys 6 | import time 7 | import warnings 8 | 9 | import matplotlib 10 | import numpy as np 11 | import torch 12 | import yaml 13 | from torch import distributed as dist 14 | from torch.nn.utils import weight_norm 15 | matplotlib.use("Agg") 16 | import matplotlib.pylab as plt 17 | import re 18 | import pathlib 19 | 20 | 21 | def seed_everything(seed, cudnn_deterministic=False): 22 | """ 23 | Function that sets seed for pseudo-random number generators in: 24 | pytorch, numpy, python.random 25 | 26 | Args: 27 | seed: the integer value seed for global random state 28 | """ 29 | if seed is not None: 30 | # print(f"Global seed set to {seed}") 31 | random.seed(seed) 32 | np.random.seed(seed) 33 | torch.manual_seed(seed) 34 | torch.cuda.manual_seed_all(seed) 35 | 36 | # if cudnn_deterministic: 37 | # torch.backends.cudnn.deterministic = True 38 | # warnings.warn('You have chosen to seed training. ' 39 | # 'This will turn on the CUDNN deterministic setting, ' 40 | # 'which can slow down your training considerably! ' 41 | # 'You may see unexpected behavior when restarting ' 42 | # 'from checkpoints.') 43 | 44 | 45 | def is_primary(): 46 | return get_rank() == 0 47 | 48 | 49 | def get_rank(): 50 | if not dist.is_available(): 51 | return 0 52 | if not dist.is_initialized(): 53 | return 0 54 | 55 | return dist.get_rank() 56 | 57 | 58 | def load_yaml_config(path): 59 | with open(path) as f: 60 | config = yaml.full_load(f) 61 | return config 62 | 63 | 64 | def save_config_to_yaml(config, path): 65 | assert path.endswith('.yaml') 66 | with open(path, 'w') as f: 67 | f.write(yaml.dump(config)) 68 | f.close() 69 | 70 | 71 | def save_dict_to_json(d, path, indent=None): 72 | json.dump(d, open(path, 'w'), indent=indent) 73 | 74 | 75 | def load_dict_from_json(path): 76 | return json.load(open(path, 'r')) 77 | 78 | 79 | def write_args(args, path): 80 | args_dict = dict((name, getattr(args, name)) for name in dir(args) 81 | if not name.startswith('_')) 82 | with open(path, 'a') as args_file: 83 | args_file.write('==> torch version: {}\n'.format(torch.__version__)) 84 | args_file.write( 85 | '==> cudnn version: {}\n'.format(torch.backends.cudnn.version())) 86 | args_file.write('==> Cmd:\n') 87 | args_file.write(str(sys.argv)) 88 | args_file.write('\n==> args:\n') 89 | for k, v in sorted(args_dict.items()): 90 | args_file.write(' %s: %s\n' % (str(k), str(v))) 91 | args_file.close() 92 | 93 | 94 | class Logger(object): 95 | def __init__(self, args): 96 | self.args = args 97 | self.save_dir = args.save_dir 98 | self.is_primary = is_primary() 99 | 100 | if self.is_primary: 101 | os.makedirs(self.save_dir, exist_ok=True) 102 | 103 | # save the args and config 104 | self.config_dir = os.path.join(self.save_dir, 'configs') 105 | os.makedirs(self.config_dir, exist_ok=True) 106 | file_name = os.path.join(self.config_dir, 'args.txt') 107 | write_args(args, file_name) 108 | 109 | log_dir = os.path.join(self.save_dir, 'logs') 110 | if not os.path.exists(log_dir): 111 | 
os.makedirs(log_dir, exist_ok=True) 112 | self.text_writer = open(os.path.join(log_dir, 'log.txt'), 113 | 'a') # 'w') 114 | if args.tensorboard: 115 | self.log_info('using tensorboard') 116 | self.tb_writer = torch.utils.tensorboard.SummaryWriter( 117 | log_dir=log_dir 118 | ) # tensorboard.SummaryWriter(log_dir=log_dir) 119 | else: 120 | self.tb_writer = None 121 | 122 | def save_config(self, config): 123 | if self.is_primary: 124 | save_config_to_yaml(config, 125 | os.path.join(self.config_dir, 'config.yaml')) 126 | 127 | def log_info(self, info, check_primary=True): 128 | if self.is_primary or (not check_primary): 129 | print(info) 130 | if self.is_primary: 131 | info = str(info) 132 | time_str = time.strftime('%Y-%m-%d-%H-%M') 133 | info = '{}: {}'.format(time_str, info) 134 | if not info.endswith('\n'): 135 | info += '\n' 136 | self.text_writer.write(info) 137 | self.text_writer.flush() 138 | 139 | def add_scalar(self, **kargs): 140 | """Log a scalar variable.""" 141 | if self.is_primary: 142 | if self.tb_writer is not None: 143 | self.tb_writer.add_scalar(**kargs) 144 | 145 | def add_scalars(self, **kargs): 146 | """Log a scalar variable.""" 147 | if self.is_primary: 148 | if self.tb_writer is not None: 149 | self.tb_writer.add_scalars(**kargs) 150 | 151 | def add_image(self, **kargs): 152 | """Log a scalar variable.""" 153 | if self.is_primary: 154 | if self.tb_writer is not None: 155 | self.tb_writer.add_image(**kargs) 156 | 157 | def add_images(self, **kargs): 158 | """Log a scalar variable.""" 159 | if self.is_primary: 160 | if self.tb_writer is not None: 161 | self.tb_writer.add_images(**kargs) 162 | 163 | def close(self): 164 | if self.is_primary: 165 | self.text_writer.close() 166 | self.tb_writer.close() 167 | 168 | 169 | def plot_spectrogram(spectrogram): 170 | fig, ax = plt.subplots(figsize=(10, 2)) 171 | im = ax.imshow( 172 | spectrogram, aspect="auto", origin="lower", interpolation='none') 173 | plt.colorbar(im, ax=ax) 174 | 175 | fig.canvas.draw() 176 | plt.close() 177 | 178 | return fig 179 | 180 | 181 | def init_weights(m, mean=0.0, std=0.01): 182 | classname = m.__class__.__name__ 183 | if classname.find("Conv") != -1: 184 | m.weight.data.normal_(mean, std) 185 | 186 | 187 | def apply_weight_norm(m): 188 | classname = m.__class__.__name__ 189 | if classname.find("Conv") != -1: 190 | weight_norm(m) 191 | 192 | 193 | def get_padding(kernel_size, dilation=1): 194 | return int((kernel_size * dilation - dilation) / 2) 195 | 196 | 197 | def load_checkpoint(filepath, device): 198 | assert os.path.isfile(filepath) 199 | print("Loading '{}'".format(filepath)) 200 | checkpoint_dict = torch.load(filepath, map_location=device) 201 | print("Complete.") 202 | return checkpoint_dict 203 | 204 | 205 | def save_checkpoint(filepath, obj, num_ckpt_keep=5): 206 | name = re.match(r'(do|g)_\d+', pathlib.Path(filepath).name).group(1) 207 | ckpts = sorted(pathlib.Path(filepath).parent.glob(f'{name}_*')) 208 | if len(ckpts) > num_ckpt_keep: 209 | [os.remove(c) for c in ckpts[:-num_ckpt_keep]] 210 | print("Saving checkpoint to {}".format(filepath)) 211 | torch.save(obj, filepath) 212 | print("Complete.") 213 | 214 | 215 | def scan_checkpoint(cp_dir, prefix): 216 | pattern = os.path.join(cp_dir, prefix + '????????') 217 | cp_list = glob.glob(pattern) 218 | if len(cp_list) == 0: 219 | return None 220 | return sorted(cp_list)[-1] 221 | -------------------------------------------------------------------------------- /matcha/hifigan/meldataset.py: 
-------------------------------------------------------------------------------- 1 | """ from https://github.com/jik876/hifi-gan """ 2 | 3 | import math 4 | import os 5 | import random 6 | 7 | import numpy as np 8 | import torch 9 | import torch.utils.data 10 | from librosa.filters import mel as librosa_mel_fn 11 | from librosa.util import normalize 12 | from scipy.io.wavfile import read 13 | 14 | MAX_WAV_VALUE = 32768.0 15 | 16 | 17 | def load_wav(full_path): 18 | sampling_rate, data = read(full_path) 19 | return data, sampling_rate 20 | 21 | 22 | def dynamic_range_compression(x, C=1, clip_val=1e-5): 23 | return np.log(np.clip(x, a_min=clip_val, a_max=None) * C) 24 | 25 | 26 | def dynamic_range_decompression(x, C=1): 27 | return np.exp(x) / C 28 | 29 | 30 | def dynamic_range_compression_torch(x, C=1, clip_val=1e-5): 31 | return torch.log(torch.clamp(x, min=clip_val) * C) 32 | 33 | 34 | def dynamic_range_decompression_torch(x, C=1): 35 | return torch.exp(x) / C 36 | 37 | 38 | def spectral_normalize_torch(magnitudes): 39 | output = dynamic_range_compression_torch(magnitudes) 40 | return output 41 | 42 | 43 | def spectral_de_normalize_torch(magnitudes): 44 | output = dynamic_range_decompression_torch(magnitudes) 45 | return output 46 | 47 | 48 | mel_basis = {} 49 | hann_window = {} 50 | 51 | 52 | def mel_spectrogram(y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False): 53 | if torch.min(y) < -1.0: 54 | print("min value is ", torch.min(y)) 55 | if torch.max(y) > 1.0: 56 | print("max value is ", torch.max(y)) 57 | 58 | global mel_basis, hann_window # pylint: disable=global-statement 59 | if fmax not in mel_basis: 60 | mel = librosa_mel_fn(sampling_rate, n_fft, num_mels, fmin, fmax) 61 | mel_basis[str(fmax) + "_" + str(y.device)] = torch.from_numpy(mel).float().to(y.device) 62 | hann_window[str(y.device)] = torch.hann_window(win_size).to(y.device) 63 | 64 | y = torch.nn.functional.pad( 65 | y.unsqueeze(1), (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)), mode="reflect" 66 | ) 67 | y = y.squeeze(1) 68 | 69 | spec = torch.view_as_real( 70 | torch.stft( 71 | y, 72 | n_fft, 73 | hop_length=hop_size, 74 | win_length=win_size, 75 | window=hann_window[str(y.device)], 76 | center=center, 77 | pad_mode="reflect", 78 | normalized=False, 79 | onesided=True, 80 | return_complex=True, 81 | ) 82 | ) 83 | 84 | spec = torch.sqrt(spec.pow(2).sum(-1) + (1e-9)) 85 | 86 | spec = torch.matmul(mel_basis[str(fmax) + "_" + str(y.device)], spec) 87 | spec = spectral_normalize_torch(spec) 88 | 89 | return spec 90 | 91 | 92 | def get_dataset_filelist(a): 93 | with open(a.input_training_file, encoding="utf-8") as fi: 94 | training_files = [ 95 | os.path.join(a.input_wavs_dir, x.split("|")[0] + ".wav") for x in fi.read().split("\n") if len(x) > 0 96 | ] 97 | 98 | with open(a.input_validation_file, encoding="utf-8") as fi: 99 | validation_files = [ 100 | os.path.join(a.input_wavs_dir, x.split("|")[0] + ".wav") for x in fi.read().split("\n") if len(x) > 0 101 | ] 102 | return training_files, validation_files 103 | 104 | 105 | class MelDataset(torch.utils.data.Dataset): 106 | def __init__( 107 | self, 108 | training_files, 109 | segment_size, 110 | n_fft, 111 | num_mels, 112 | hop_size, 113 | win_size, 114 | sampling_rate, 115 | fmin, 116 | fmax, 117 | split=True, 118 | shuffle=True, 119 | n_cache_reuse=1, 120 | device=None, 121 | fmax_loss=None, 122 | fine_tuning=False, 123 | base_mels_path=None, 124 | ): 125 | self.audio_files = training_files 126 | random.seed(1234) 127 | if shuffle: 
128 | random.shuffle(self.audio_files) 129 | self.segment_size = segment_size 130 | self.sampling_rate = sampling_rate 131 | self.split = split 132 | self.n_fft = n_fft 133 | self.num_mels = num_mels 134 | self.hop_size = hop_size 135 | self.win_size = win_size 136 | self.fmin = fmin 137 | self.fmax = fmax 138 | self.fmax_loss = fmax_loss 139 | self.cached_wav = None 140 | self.n_cache_reuse = n_cache_reuse 141 | self._cache_ref_count = 0 142 | self.device = device 143 | self.fine_tuning = fine_tuning 144 | self.base_mels_path = base_mels_path 145 | 146 | def __getitem__(self, index): 147 | filename = self.audio_files[index] 148 | if self._cache_ref_count == 0: 149 | audio, sampling_rate = load_wav(filename) 150 | audio = audio / MAX_WAV_VALUE 151 | if not self.fine_tuning: 152 | audio = normalize(audio) * 0.95 153 | self.cached_wav = audio 154 | if sampling_rate != self.sampling_rate: 155 | raise ValueError(f"{sampling_rate} SR doesn't match target {self.sampling_rate} SR") 156 | self._cache_ref_count = self.n_cache_reuse 157 | else: 158 | audio = self.cached_wav 159 | self._cache_ref_count -= 1 160 | 161 | audio = torch.FloatTensor(audio) 162 | audio = audio.unsqueeze(0) 163 | 164 | if not self.fine_tuning: 165 | if self.split: 166 | if audio.size(1) >= self.segment_size: 167 | max_audio_start = audio.size(1) - self.segment_size 168 | audio_start = random.randint(0, max_audio_start) 169 | audio = audio[:, audio_start : audio_start + self.segment_size] 170 | else: 171 | audio = torch.nn.functional.pad(audio, (0, self.segment_size - audio.size(1)), "constant") 172 | 173 | mel = mel_spectrogram( 174 | audio, 175 | self.n_fft, 176 | self.num_mels, 177 | self.sampling_rate, 178 | self.hop_size, 179 | self.win_size, 180 | self.fmin, 181 | self.fmax, 182 | center=False, 183 | ) 184 | else: 185 | mel = np.load(os.path.join(self.base_mels_path, os.path.splitext(os.path.split(filename)[-1])[0] + ".npy")) 186 | mel = torch.from_numpy(mel) 187 | 188 | if len(mel.shape) < 3: 189 | mel = mel.unsqueeze(0) 190 | 191 | if self.split: 192 | frames_per_seg = math.ceil(self.segment_size / self.hop_size) 193 | 194 | if audio.size(1) >= self.segment_size: 195 | mel_start = random.randint(0, mel.size(2) - frames_per_seg - 1) 196 | mel = mel[:, :, mel_start : mel_start + frames_per_seg] 197 | audio = audio[:, mel_start * self.hop_size : (mel_start + frames_per_seg) * self.hop_size] 198 | else: 199 | mel = torch.nn.functional.pad(mel, (0, frames_per_seg - mel.size(2)), "constant") 200 | audio = torch.nn.functional.pad(audio, (0, self.segment_size - audio.size(1)), "constant") 201 | 202 | mel_loss = mel_spectrogram( 203 | audio, 204 | self.n_fft, 205 | self.num_mels, 206 | self.sampling_rate, 207 | self.hop_size, 208 | self.win_size, 209 | self.fmin, 210 | self.fmax_loss, 211 | center=False, 212 | ) 213 | 214 | return (mel.squeeze(), audio.squeeze(0), filename, mel_loss.squeeze()) 215 | 216 | def __len__(self): 217 | return len(self.audio_files) 218 | -------------------------------------------------------------------------------- /matcha/models/baselightningmodule.py: -------------------------------------------------------------------------------- 1 | """ 2 | This is a base lightning module that can be used to train a model. 3 | The benefit of this abstraction is that all the logic outside of model definition can be reused for different models. 
4 | """ 5 | import inspect 6 | from abc import ABC 7 | from typing import Any, Dict 8 | 9 | import torch 10 | from lightning import LightningModule 11 | from lightning.pytorch.utilities import grad_norm 12 | 13 | from matcha import utils 14 | from matcha.utils.utils import plot_tensor 15 | 16 | log = utils.get_pylogger(__name__) 17 | 18 | 19 | class BaseLightningClass(LightningModule, ABC): 20 | def update_data_statistics(self, data_statistics): 21 | if data_statistics is None: 22 | data_statistics = { 23 | "mel_mean": 0.0, 24 | "mel_std": 1.0, 25 | } 26 | 27 | self.register_buffer("mel_mean", torch.tensor(data_statistics["mel_mean"])) 28 | self.register_buffer("mel_std", torch.tensor(data_statistics["mel_std"])) 29 | 30 | def configure_optimizers(self) -> Any: 31 | optimizer = self.hparams.optimizer(params=self.parameters()) 32 | if self.hparams.scheduler not in (None, {}): 33 | scheduler_args = {} 34 | # Manage last epoch for exponential schedulers 35 | if "last_epoch" in inspect.signature(self.hparams.scheduler.scheduler).parameters: 36 | if hasattr(self, "ckpt_loaded_epoch"): 37 | current_epoch = self.ckpt_loaded_epoch - 1 38 | else: 39 | current_epoch = -1 40 | 41 | scheduler_args.update({"optimizer": optimizer}) 42 | scheduler = self.hparams.scheduler.scheduler(**scheduler_args) 43 | scheduler.last_epoch = current_epoch 44 | return { 45 | "optimizer": optimizer, 46 | "lr_scheduler": { 47 | "scheduler": scheduler, 48 | "interval": self.hparams.scheduler.lightning_args.interval, 49 | "frequency": self.hparams.scheduler.lightning_args.frequency, 50 | "name": "learning_rate", 51 | }, 52 | } 53 | 54 | return {"optimizer": optimizer} 55 | 56 | def get_losses(self, batch): 57 | x, x_lengths = batch["x"], batch["x_lengths"] 58 | y, y_lengths = batch["y"], batch["y_lengths"] 59 | spks = batch["spks"] 60 | 61 | dur_loss, prior_loss, diff_loss = self( 62 | x=x, 63 | x_lengths=x_lengths, 64 | y=y, 65 | y_lengths=y_lengths, 66 | spks=spks, 67 | out_size=self.out_size, 68 | ) 69 | return { 70 | "dur_loss": dur_loss, 71 | "prior_loss": prior_loss, 72 | "diff_loss": diff_loss, 73 | } 74 | 75 | def on_load_checkpoint(self, checkpoint: Dict[str, Any]) -> None: 76 | self.ckpt_loaded_epoch = checkpoint["epoch"] # pylint: disable=attribute-defined-outside-init 77 | 78 | def training_step(self, batch: Any, batch_idx: int): 79 | loss_dict = self.get_losses(batch) 80 | self.log( 81 | "step", 82 | float(self.global_step), 83 | on_step=True, 84 | prog_bar=True, 85 | logger=True, 86 | sync_dist=True, 87 | ) 88 | 89 | self.log( 90 | "sub_loss/train_dur_loss", 91 | loss_dict["dur_loss"], 92 | on_step=True, 93 | on_epoch=True, 94 | logger=True, 95 | sync_dist=True, 96 | ) 97 | self.log( 98 | "sub_loss/train_prior_loss", 99 | loss_dict["prior_loss"], 100 | on_step=True, 101 | on_epoch=True, 102 | logger=True, 103 | sync_dist=True, 104 | ) 105 | self.log( 106 | "sub_loss/train_diff_loss", 107 | loss_dict["diff_loss"], 108 | on_step=True, 109 | on_epoch=True, 110 | logger=True, 111 | sync_dist=True, 112 | ) 113 | 114 | total_loss = sum(loss_dict.values()) 115 | self.log( 116 | "loss/train", 117 | total_loss, 118 | on_step=True, 119 | on_epoch=True, 120 | logger=True, 121 | prog_bar=True, 122 | sync_dist=True, 123 | ) 124 | 125 | return {"loss": total_loss, "log": loss_dict} 126 | 127 | def validation_step(self, batch: Any, batch_idx: int): 128 | loss_dict = self.get_losses(batch) 129 | self.log( 130 | "sub_loss/val_dur_loss", 131 | loss_dict["dur_loss"], 132 | on_step=True, 133 | on_epoch=True, 134 | logger=True, 
135 | sync_dist=True, 136 | ) 137 | self.log( 138 | "sub_loss/val_prior_loss", 139 | loss_dict["prior_loss"], 140 | on_step=True, 141 | on_epoch=True, 142 | logger=True, 143 | sync_dist=True, 144 | ) 145 | self.log( 146 | "sub_loss/val_diff_loss", 147 | loss_dict["diff_loss"], 148 | on_step=True, 149 | on_epoch=True, 150 | logger=True, 151 | sync_dist=True, 152 | ) 153 | 154 | total_loss = sum(loss_dict.values()) 155 | self.log( 156 | "loss/val", 157 | total_loss, 158 | on_step=True, 159 | on_epoch=True, 160 | logger=True, 161 | prog_bar=True, 162 | sync_dist=True, 163 | ) 164 | 165 | return total_loss 166 | 167 | def on_validation_end(self) -> None: 168 | if self.trainer.is_global_zero: 169 | one_batch = next(iter(self.trainer.val_dataloaders)) 170 | if self.current_epoch == 0: 171 | log.debug("Plotting original samples") 172 | for i in range(2): 173 | y = one_batch["y"][i].unsqueeze(0).to(self.device) 174 | self.logger.experiment.add_image( 175 | f"original/{i}", 176 | plot_tensor(y.squeeze().cpu()), 177 | self.current_epoch, 178 | dataformats="HWC", 179 | ) 180 | 181 | log.debug("Synthesising...") 182 | for i in range(2): 183 | x = one_batch["x"][i].unsqueeze(0).to(self.device) 184 | x_lengths = one_batch["x_lengths"][i].unsqueeze(0).to(self.device) 185 | spks = one_batch["spks"][i].unsqueeze(0).to(self.device) if one_batch["spks"] is not None else None 186 | output = self.synthesise(x[:, :x_lengths], x_lengths, n_timesteps=10, spks=spks) 187 | y_enc, y_dec = output["encoder_outputs"], output["decoder_outputs"] 188 | attn = output["attn"] 189 | self.logger.experiment.add_image( 190 | f"generated_enc/{i}", 191 | plot_tensor(y_enc.squeeze().cpu()), 192 | self.current_epoch, 193 | dataformats="HWC", 194 | ) 195 | self.logger.experiment.add_image( 196 | f"generated_dec/{i}", 197 | plot_tensor(y_dec.squeeze().cpu()), 198 | self.current_epoch, 199 | dataformats="HWC", 200 | ) 201 | self.logger.experiment.add_image( 202 | f"alignment/{i}", 203 | plot_tensor(attn.squeeze().cpu()), 204 | self.current_epoch, 205 | dataformats="HWC", 206 | ) 207 | 208 | def on_before_optimizer_step(self, optimizer): 209 | self.log_dict({f"grad_norm/{k}": v for k, v in grad_norm(self, norm_type=2).items()}) 210 | -------------------------------------------------------------------------------- /cosyvoice/flow/flow.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Zhihao Du) 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
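# Usage sketch (illustrative only, not part of the original file): the MaskedDiffWithXvec
# module defined below turns discrete speech tokens plus a speaker x-vector into a mel
# spectrogram that is then handed to the vocoder. A minimal, hypothetical call (the names
# `encoder`, `length_regulator`, `cfm_decoder` and `xvector` stand in for the instances
# built from the CosyVoice config):
#
#   flow = MaskedDiffWithXvec(encoder=encoder,
#                             length_regulator=length_regulator,
#                             decoder=cfm_decoder)
#   mel = flow.inference(token, token_len,
#                        prompt_token, prompt_token_len,
#                        prompt_feat, prompt_feat_len,
#                        xvector)      # (1, 80, T) with the default output_size
#
# The target mel length is computed as (token_len / 50 * 22050 / 256): 50 Hz speech tokens
# are stretched to the roughly 86 frames per second implied by a 22050 Hz sample rate and
# a 256-sample hop.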
14 | import logging 15 | from typing import Dict, Optional 16 | import torch 17 | import torch.nn as nn 18 | from torch.nn import functional as F 19 | from omegaconf import DictConfig 20 | from cosyvoice.utils.mask import make_pad_mask 21 | 22 | 23 | class MaskedDiffWithXvec(torch.nn.Module): 24 | def __init__(self, 25 | input_size: int = 512, 26 | output_size: int = 80, 27 | spk_embed_dim: int = 192, 28 | output_type: str = "mel", 29 | vocab_size: int = 4096, 30 | input_frame_rate: int = 50, 31 | only_mask_loss: bool = True, 32 | encoder: torch.nn.Module = None, 33 | length_regulator: torch.nn.Module = None, 34 | decoder: torch.nn.Module = None, 35 | decoder_conf: Dict = {'in_channels': 240, 'out_channel': 80, 'spk_emb_dim': 80, 'n_spks': 1, 'cfm_params': DictConfig({'sigma_min': 1e-06, 'solver': 'euler', 't_scheduler': 'cosine', 'training_cfg_rate': 0.2, 'inference_cfg_rate': 0.7, 'reg_loss_type': 'l1'}), 'decoder_params': {'channels': [256, 256], 'dropout': 0.0, 'attention_head_dim': 64, 'n_blocks': 4, 'num_mid_blocks': 12, 'num_heads': 8, 'act_fn': 'gelu'}}, 36 | mel_feat_conf: Dict = {'n_fft': 1024, 'num_mels': 80, 'sampling_rate': 22050, 'hop_size': 256, 'win_size': 1024, 'fmin': 0, 'fmax': 8000}): 37 | super().__init__() 38 | self.input_size = input_size 39 | self.output_size = output_size 40 | self.decoder_conf = decoder_conf 41 | self.mel_feat_conf = mel_feat_conf 42 | self.vocab_size = vocab_size 43 | self.output_type = output_type 44 | self.input_frame_rate = input_frame_rate 45 | logging.info(f"input frame rate={self.input_frame_rate}") 46 | self.input_embedding = nn.Embedding(vocab_size, input_size) 47 | self.spk_embed_affine_layer = torch.nn.Linear(spk_embed_dim, output_size) 48 | self.encoder = encoder 49 | self.encoder_proj = torch.nn.Linear(self.encoder.output_size(), output_size) 50 | self.decoder = decoder 51 | self.length_regulator = length_regulator 52 | self.only_mask_loss = only_mask_loss 53 | 54 | def forward( 55 | self, 56 | batch: dict, 57 | device: torch.device, 58 | ) -> Dict[str, Optional[torch.Tensor]]: 59 | token = batch['speech_token'].to(device) 60 | token_len = batch['speech_token_len'].to(device) 61 | feat = batch['speech_feat'].to(device) 62 | feat_len = batch['speech_feat_len'].to(device) 63 | embedding = batch['utt_embedding'].to(device) 64 | 65 | # xvec projection 66 | embedding = F.normalize(embedding, dim=1) 67 | embedding = self.spk_embed_affine_layer(embedding) 68 | 69 | # concat text and prompt_text 70 | mask = (~make_pad_mask(token_len)).float().unsqueeze(-1).to(device) 71 | token = self.input_embedding(torch.clamp(token, min=0)) * mask 72 | 73 | # text encode 74 | h, h_lengths = self.encoder(token, token_len) 75 | h = self.encoder_proj(h) 76 | h, h_lengths = self.length_regulator(h, feat_len) 77 | 78 | # get conditions 79 | conds = torch.zeros(feat.shape, device=token.device) 80 | conds = conds.transpose(1, 2) 81 | 82 | mask = (~make_pad_mask(feat_len)).to(h) 83 | feat = F.interpolate(feat.unsqueeze(dim=1), size=h.shape[1:], mode="nearest").squeeze(dim=1) 84 | loss, _ = self.decoder.compute_loss( 85 | feat.transpose(1, 2).contiguous(), 86 | mask.unsqueeze(1), 87 | h.transpose(1, 2).contiguous(), 88 | embedding, 89 | cond=conds 90 | ) 91 | return {'loss': loss} 92 | 93 | @torch.inference_mode() 94 | def inference(self, 95 | token, 96 | token_len, 97 | prompt_token, 98 | prompt_token_len, 99 | prompt_feat, 100 | prompt_feat_len, 101 | embedding): 102 | assert token.shape[0] == 1 103 | # xvec projection 104 | embedding = F.normalize(embedding, 
dim=1) 105 | embedding = self.spk_embed_affine_layer(embedding) 106 | 107 | # concat text and prompt_text 108 | token, token_len = torch.concat([prompt_token, token], dim=1), prompt_token_len + token_len 109 | mask = (~make_pad_mask(token_len)).float().unsqueeze(-1).to(embedding) 110 | token = self.input_embedding(torch.clamp(token, min=0)) * mask 111 | 112 | # text encode 113 | h, h_lengths = self.encoder(token, token_len) 114 | h = self.encoder_proj(h) 115 | feat_len = (token_len / 50 * 22050 / 256).int() 116 | h, h_lengths = self.length_regulator(h, feat_len) 117 | 118 | # get conditions 119 | conds = torch.zeros([1, feat_len.max().item(), self.output_size], device=token.device) 120 | if prompt_feat.shape[1] != 0: 121 | for i, j in enumerate(prompt_feat_len): 122 | conds[i, :j] = prompt_feat[i] 123 | conds = conds.transpose(1, 2) 124 | 125 | mask = (~make_pad_mask(feat_len)).to(h) 126 | feat = self.decoder( 127 | mu=h.transpose(1, 2).contiguous(), 128 | mask=mask.unsqueeze(1), 129 | spks=embedding, 130 | cond=conds, 131 | n_timesteps=10 132 | ) 133 | if prompt_feat.shape[1] != 0: 134 | feat = feat[:, :, prompt_feat.shape[1]:] 135 | return feat 136 | 137 | @torch.inference_mode() 138 | def inference_stream(self, 139 | token, 140 | token_len, 141 | prompt_token, 142 | prompt_token_len, 143 | prompt_feat, 144 | prompt_feat_len, 145 | embedding): 146 | assert token.shape[0] == 1 147 | # xvec projection 148 | embedding = F.normalize(embedding, dim=1) 149 | embedding = self.spk_embed_affine_layer(embedding) 150 | 151 | # concat text and prompt_text 152 | token, token_len = torch.concat([prompt_token, token], dim=1), prompt_token_len + token_len 153 | mask = (~make_pad_mask(token_len)).float().unsqueeze(-1).to(embedding) 154 | token = self.input_embedding(torch.clamp(token, min=0)) * mask 155 | 156 | # text encode 157 | h, h_lengths = self.encoder(token, token_len) 158 | h = self.encoder_proj(h) 159 | feat_len = (token_len / 50 * 22050 / 256).int() 160 | h, h_lengths = self.length_regulator(h, feat_len) 161 | 162 | # get conditions 163 | conds = torch.zeros([1, feat_len.max().item(), self.output_size], device=token.device) 164 | if prompt_feat.shape[1] != 0: 165 | for i, j in enumerate(prompt_feat_len): 166 | conds[i, :j] = prompt_feat[i] 167 | conds = conds.transpose(1, 2) 168 | 169 | mask = (~make_pad_mask(feat_len)).to(h) 170 | feat = self.decoder( 171 | mu=h.transpose(1, 2).contiguous(), 172 | mask=mask.unsqueeze(1), 173 | spks=embedding, 174 | cond=conds, 175 | n_timesteps=10 176 | ) 177 | if prompt_feat.shape[1] != 0: 178 | feat = feat[:, :, prompt_feat.shape[1]:] 179 | yield feat 180 | -------------------------------------------------------------------------------- /academicodec/models/hificodec/meldataset.py: -------------------------------------------------------------------------------- 1 | # code based on https://github.com/b04901014/MQTTS 2 | import math 3 | import os 4 | import random 5 | 6 | import librosa 7 | import numpy as np 8 | import torch.utils.data 9 | from librosa.filters import mel as librosa_mel_fn 10 | 11 | 12 | def load_wav(full_path, sr): 13 | wav, sr = librosa.load(full_path, sr=sr) 14 | return wav, sr 15 | 16 | 17 | def dynamic_range_compression(x, C=1, clip_val=1e-5): 18 | return np.log(np.clip(x, a_min=clip_val, a_max=None) * C) 19 | 20 | 21 | def dynamic_range_decompression(x, C=1): 22 | return np.exp(x) / C 23 | 24 | 25 | def dynamic_range_compression_torch(x, C=1, clip_val=1e-5): 26 | return torch.log(torch.clamp(x, min=clip_val) * C) 27 | 28 | 29 | def 
dynamic_range_decompression_torch(x, C=1): 30 | return torch.exp(x) / C 31 | 32 | 33 | def spectral_normalize_torch(magnitudes): 34 | output = dynamic_range_compression_torch(magnitudes) 35 | return output 36 | 37 | 38 | def spectral_de_normalize_torch(magnitudes): 39 | output = dynamic_range_decompression_torch(magnitudes) 40 | return output 41 | 42 | 43 | mel_basis = {} 44 | hann_window = {} 45 | 46 | 47 | def mel_spectrogram(y, 48 | n_fft, 49 | num_mels, 50 | sampling_rate, 51 | hop_size, 52 | win_size, 53 | fmin, 54 | fmax, 55 | center=False): 56 | if torch.min(y) < -1.: 57 | print('min value is ', torch.min(y)) 58 | if torch.max(y) > 1.: 59 | print('max value is ', torch.max(y)) 60 | 61 | global mel_basis, hann_window 62 | if fmax not in mel_basis: 63 | mel = librosa_mel_fn(sampling_rate, n_fft, num_mels, fmin, fmax) 64 | mel_basis[str(fmax) + '_' + 65 | str(y.device)] = torch.from_numpy(mel).float().to(y.device) 66 | hann_window[str(y.device)] = torch.hann_window(win_size).to(y.device) 67 | 68 | y = torch.nn.functional.pad( 69 | y.unsqueeze(1), (int((n_fft - hop_size) / 2), int( 70 | (n_fft - hop_size) / 2)), 71 | mode='reflect') 72 | y = y.squeeze(1) 73 | 74 | spec = torch.stft( 75 | y, 76 | n_fft, 77 | hop_length=hop_size, 78 | win_length=win_size, 79 | window=hann_window[str(y.device)], 80 | center=center, 81 | pad_mode='reflect', 82 | normalized=False, 83 | onesided=True) 84 | 85 | spec = torch.sqrt(spec.pow(2).sum(-1) + (1e-9)) 86 | 87 | spec = torch.matmul(mel_basis[str(fmax) + '_' + str(y.device)], spec) 88 | spec = spectral_normalize_torch(spec) 89 | 90 | return spec 91 | 92 | 93 | def get_dataset_filelist(a): 94 | with open(a.input_training_file, 'r') as f: 95 | training_files = [l.strip() for l in f] 96 | with open(a.input_validation_file, 'r') as f: 97 | validation_files = [l.strip() for l in f] 98 | return training_files, validation_files 99 | 100 | 101 | class MelDataset(torch.utils.data.Dataset): 102 | def __init__(self, 103 | training_files, 104 | segment_size, 105 | n_fft, 106 | num_mels, 107 | hop_size, 108 | win_size, 109 | sampling_rate, 110 | fmin, 111 | fmax, 112 | split=True, 113 | shuffle=True, 114 | n_cache_reuse=1, 115 | device=None, 116 | fmax_loss=None, 117 | fine_tuning=False, 118 | base_mels_path=None): 119 | self.audio_files = training_files 120 | random.seed(1234) 121 | if shuffle: 122 | random.shuffle(self.audio_files) 123 | self.segment_size = segment_size 124 | self.sampling_rate = sampling_rate 125 | self.split = split 126 | self.n_fft = n_fft 127 | self.num_mels = num_mels 128 | self.hop_size = hop_size 129 | self.win_size = win_size 130 | self.fmin = fmin 131 | self.fmax = fmax 132 | self.fmax_loss = fmax_loss 133 | self.cached_wav = None 134 | self.n_cache_reuse = n_cache_reuse 135 | self._cache_ref_count = 0 136 | self.device = device 137 | self.fine_tuning = fine_tuning 138 | self.base_mels_path = base_mels_path 139 | 140 | def __getitem__(self, index): 141 | filename = self.audio_files[index] 142 | if self._cache_ref_count == 0: 143 | try: 144 | # Note by yuantian: load with the sample_rate of config 145 | audio, sampling_rate = load_wav(filename, sr=self.sampling_rate) 146 | except Exception as e: 147 | print(f"Error on audio: {filename}") 148 | audio = np.random.normal(size=(160000, )) * 0.05 149 | sampling_rate = self.sampling_rate 150 | self.cached_wav = audio 151 | if sampling_rate != self.sampling_rate: 152 | raise ValueError("{} SR doesn't match target {} SR".format( 153 | sampling_rate, self.sampling_rate)) 154 | self._cache_ref_count 
= self.n_cache_reuse 155 | else: 156 | audio = self.cached_wav 157 | self._cache_ref_count -= 1 158 | 159 | audio = torch.FloatTensor(audio) 160 | audio = audio.unsqueeze(0) 161 | 162 | if not self.fine_tuning: 163 | if self.split: 164 | if audio.size(1) >= self.segment_size: 165 | max_audio_start = audio.size(1) - self.segment_size 166 | audio_start = random.randint(0, max_audio_start) 167 | audio = audio[:, audio_start:audio_start + 168 | self.segment_size] 169 | else: 170 | audio = torch.nn.functional.pad(audio, ( 171 | 0, self.segment_size - audio.size(1)), 'constant') 172 | 173 | mel = mel_spectrogram( 174 | audio, 175 | self.n_fft, 176 | self.num_mels, 177 | self.sampling_rate, 178 | self.hop_size, 179 | self.win_size, 180 | self.fmin, 181 | self.fmax, 182 | center=False) 183 | else: 184 | mel = np.load( 185 | os.path.join(self.base_mels_path, 186 | os.path.splitext(os.path.split(filename)[-1])[0] + 187 | '.npy')) 188 | mel = torch.from_numpy(mel) 189 | 190 | if len(mel.shape) < 3: 191 | mel = mel.unsqueeze(0) 192 | 193 | if self.split: 194 | frames_per_seg = math.ceil(self.segment_size / self.hop_size) 195 | 196 | if audio.size(1) >= self.segment_size: 197 | mel_start = random.randint(0, 198 | mel.size(2) - frames_per_seg - 1) 199 | mel = mel[:, :, mel_start:mel_start + frames_per_seg] 200 | audio = audio[:, mel_start * self.hop_size:( 201 | mel_start + frames_per_seg) * self.hop_size] 202 | else: 203 | mel = torch.nn.functional.pad(mel, ( 204 | 0, frames_per_seg - mel.size(2)), 'constant') 205 | audio = torch.nn.functional.pad(audio, ( 206 | 0, self.segment_size - audio.size(1)), 'constant') 207 | 208 | mel_loss = mel_spectrogram( 209 | audio, 210 | self.n_fft, 211 | self.num_mels, 212 | self.sampling_rate, 213 | self.hop_size, 214 | self.win_size, 215 | self.fmin, 216 | self.fmax_loss, 217 | center=False) 218 | 219 | return (mel.squeeze(), audio.squeeze(0), filename, mel_loss.squeeze()) 220 | 221 | def __len__(self): 222 | return len(self.audio_files) 223 | --------------------------------------------------------------------------------
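Both meldataset.py copies above expose the same mel_spectrogram() helper. The sketch below shows a typical call with HiFi-GAN-style settings; the parameter values are illustrative rather than taken from a config in this repository, and the call assumes a librosa version older than 0.10, since both copies pass positional arguments to librosa.filters.mel. The academicodec/models/hificodec copy additionally calls torch.stft without return_complex, which recent PyTorch versions reject, whereas the matcha/hifigan copy already wraps it in torch.view_as_real(torch.stft(..., return_complex=True)).

import torch
from matcha.hifigan.meldataset import mel_spectrogram

# One batch of one second of dummy audio in [-1, 1]; shape (batch, samples).
wav = torch.rand(1, 22050) * 2 - 1

# Illustrative HiFi-GAN-style settings; real values come from the model config.
mel = mel_spectrogram(
    wav,
    n_fft=1024, num_mels=80, sampling_rate=22050,
    hop_size=256, win_size=1024, fmin=0, fmax=8000,
    center=False,
)
print(mel.shape)  # (1, 80, ~86): roughly samples / hop_size frames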
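Usage sketch for the number verbalization helpers in cosyvoice/cli/zh_normalization/num.py above. This is illustrative only and assumes the DIGITS and UNITS tables defined earlier in that file map digits to 零一二三四五六七八九 and powers of ten to 十/百/千/万/亿, as in the upstream PaddleSpeech normalizer.

from cosyvoice.cli.zh_normalization.num import (
    RE_NUMBER, RE_RANGE, num2str, replace_number, replace_range, verbalize_digit)

num2str("15")                          # '十五' (a leading 一十 is abbreviated to 十)
num2str("3.20")                        # '三点二' (trailing zeros of the decimal part are dropped)
RE_NUMBER.sub(replace_number, "-4")    # '负四'
RE_RANGE.sub(replace_range, "5-8")     # '五到八'
verbalize_digit("110", alt_one=True)   # '幺幺零' (phone-style digit reading)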