├── matcha
│   ├── data
│   │   ├── __init__.py
│   │   └── components
│   │       └── __init__.py
│   ├── hifigan
│   │   ├── __init__.py
│   │   ├── env.py
│   │   ├── config.py
│   │   ├── LICENSE
│   │   ├── xutils.py
│   │   ├── denoiser.py
│   │   ├── README.md
│   │   └── meldataset.py
│   ├── models
│   │   ├── __init__.py
│   │   ├── components
│   │   │   ├── __init__.py
│   │   │   └── flow_matching.py
│   │   └── baselightningmodule.py
│   └── onnx
│       └── export.py
├── academicodec
│   ├── __init__.py
│   ├── models
│   │   ├── encodec
│   │   │   ├── __init__.py
│   │   │   ├── dataset.py
│   │   │   ├── net3.py
│   │   │   ├── distributed
│   │   │   │   ├── launch.py
│   │   │   │   └── distributed.py
│   │   │   └── test.py
│   │   ├── hificodec
│   │   │   ├── __init__.py
│   │   │   ├── env.py
│   │   │   ├── vqvae_tester.py
│   │   │   ├── vqvae.py
│   │   │   ├── vqvae_copy_syn.py
│   │   │   └── meldataset.py
│   │   └── soundstream
│   │       ├── __init__.py
│   │       ├── dataset.py
│   │       └── models.py
│   ├── quantization
│   │   ├── __init__.py
│   │   ├── distrib.py
│   │   └── vq.py
│   ├── modules
│   │   ├── __init__.py
│   │   ├── lstm.py
│   │   ├── norm.py
│   │   └── transformer.py
│   ├── binary.py
│   └── utils.py
├── cosyvoice
│   ├── cli
│   │   ├── __init__.py
│   │   ├── zh_normalization
│   │   │   ├── __init__.py
│   │   │   ├── README.md
│   │   │   ├── quantifier.py
│   │   │   ├── phonecode.py
│   │   │   ├── constants.py
│   │   │   ├── chronology.py
│   │   │   ├── text_normlization.py
│   │   │   └── num.py
│   │   └── model.py
│   ├── dataset
│   │   ├── __init__.py
│   │   └── dataset.py
│   ├── transformer
│   │   ├── __init__.py
│   │   ├── activation.py
│   │   ├── label_smoothing_loss.py
│   │   ├── positionwise_feed_forward.py
│   │   ├── decoder_layer.py
│   │   └── convolution.py
│   ├── flow
│   │   ├── length_regulator.py
│   │   ├── flow_matching.py
│   │   └── flow.py
│   ├── hifigan
│   │   └── f0_predictor.py
│   ├── utils
│   │   ├── class_utils.py
│   │   └── common.py
│   └── bin
│       ├── inference.py
│       └── train.py
├── data
│   ├── cache
│   │   └── 这里为语音合成缓存文件夹.txt
│   └── model
│       └── 这里存放CosyVoice模型.txt
├── example参考音频文本.txt
├── requirements.txt
├── api.py
├── LICENSE
├── README_CN.md
└── README.md
/matcha/data/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/academicodec/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/cosyvoice/cli/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/data/cache/这里为语音合成缓存文件夹.txt:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/matcha/hifigan/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/matcha/models/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/cosyvoice/dataset/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/cosyvoice/transformer/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/data/model/这里存放CosyVoice模型.txt:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/matcha/data/components/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/matcha/models/components/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/academicodec/models/encodec/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/academicodec/models/hificodec/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/academicodec/models/soundstream/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/example参考音频文本.txt:
--------------------------------------------------------------------------------
1 | 把这些文字替换为你的example.wav的参考音频文本
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | fastapi
2 | modelscope
3 | torch
4 | torchaudio
5 | uvicorn
--------------------------------------------------------------------------------
/academicodec/quantization/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Meta Platforms, Inc. and affiliates.
2 | # All rights reserved.
3 | #
4 | # This source code is licensed under the license found in the
5 | # LICENSE file in the root directory of this source tree.
6 | # flake8: noqa
7 | from .vq import QuantizedResult
8 | from .vq import ResidualVectorQuantizer
9 |
--------------------------------------------------------------------------------
/academicodec/models/hificodec/env.py:
--------------------------------------------------------------------------------
1 | import os
2 | import shutil
3 |
4 |
5 | class AttrDict(dict):
6 | def __init__(self, *args, **kwargs):
7 | super(AttrDict, self).__init__(*args, **kwargs)
8 | self.__dict__ = self
9 |
10 |
11 | def build_env(config, config_name, path):
12 | t_path = os.path.join(path, config_name)
13 | if config != t_path:
14 | os.makedirs(path, exist_ok=True)
15 | shutil.copyfile(config, os.path.join(path, config_name))
16 |
--------------------------------------------------------------------------------
/matcha/hifigan/env.py:
--------------------------------------------------------------------------------
1 | """ from https://github.com/jik876/hifi-gan """
2 |
3 | import os
4 | import shutil
5 |
6 |
7 | class AttrDict(dict):
8 | def __init__(self, *args, **kwargs):
9 | super().__init__(*args, **kwargs)
10 | self.__dict__ = self
11 |
12 |
13 | def build_env(config, config_name, path):
14 | t_path = os.path.join(path, config_name)
15 | if config != t_path:
16 | os.makedirs(path, exist_ok=True)
17 | shutil.copyfile(config, os.path.join(path, config_name))
18 |
--------------------------------------------------------------------------------
/cosyvoice/cli/zh_normalization/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | from .text_normlization import *
15 |
--------------------------------------------------------------------------------
/academicodec/modules/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Meta Platforms, Inc. and affiliates.
2 | # All rights reserved.
3 | #
4 | # This source code is licensed under the license found in the
5 | # LICENSE file in the root directory of this source tree.
6 | """Torch modules."""
7 | # flake8: noqa
8 | from .conv import NormConv1d
9 | from .conv import NormConv2d
10 | from .conv import NormConvTranspose1d
11 | from .conv import NormConvTranspose2d
12 | from .conv import pad1d
13 | from .conv import SConv1d
14 | from .conv import SConvTranspose1d
15 | from .conv import unpad1d
16 | from .lstm import SLSTM
17 | from .seanet import SEANetDecoder
18 | from .seanet import SEANetEncoder
19 | from .transformer import StreamingTransformerEncoder
20 |
--------------------------------------------------------------------------------
/cosyvoice/cli/zh_normalization/README.md:
--------------------------------------------------------------------------------
1 | ## Supported NSW (Non-Standard-Word) Normalization
2 |
3 | |NSW type|raw|normalized|
4 | |:--|:-|:-|
5 | |serial number|电影中梁朝伟扮演的陈永仁的编号27149|电影中梁朝伟扮演的陈永仁的编号二七一四九|
6 | |cardinal|这块黄金重达324.75克<br>我们班的最高总分为583分|这块黄金重达三百二十四点七五克<br>我们班的最高总分为五百八十三分|
7 | |numeric range |12\~23<br>-1.5\~2|十二到二十三<br>负一点五到二|
8 | |date|她出生于86年8月18日,她弟弟出生于1995年3月1日|她出生于八六年八月十八日, 她弟弟出生于一九九五年三月一日|
9 | |time|等会请在12:05请通知我|等会请在十二点零五分请通知我|
10 | |temperature|今天的最低气温达到-10°C|今天的最低气温达到零下十度|
11 | |fraction|现场有7/12的观众投出了赞成票|现场有十二分之七的观众投出了赞成票|
12 | |percentage|明天有62%的概率降雨|明天有百分之六十二的概率降雨|
13 | |money|随便来几个价格12块5,34.5元,20.1万|随便来几个价格十二块五,三十四点五元,二十点一万|
14 | |telephone|这是固话0421-33441122<br>这是手机+86 18544139121|这是固话零四二一三三四四一一二二<br>这是手机八六一八五四四一三九一二一|
15 | ## References
16 | [Pull requests #658 of DeepSpeech](https://github.com/PaddlePaddle/DeepSpeech/pull/658/files)
17 |
--------------------------------------------------------------------------------
/academicodec/modules/lstm.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Meta Platforms, Inc. and affiliates.
2 | # All rights reserved.
3 | #
4 | # This source code is licensed under the license found in the
5 | # LICENSE file in the root directory of this source tree.
6 | """LSTM layers module."""
7 | from torch import nn
8 |
9 |
10 | class SLSTM(nn.Module):
11 | """
12 | LSTM without worrying about the hidden state, nor the layout of the data.
13 | Expects input as convolutional layout.
14 | """
15 |
16 | def __init__(self, dimension: int, num_layers: int=2, skip: bool=True):
17 | super().__init__()
18 | self.skip = skip
19 | self.lstm = nn.LSTM(dimension, dimension, num_layers)
20 |
21 | def forward(self, x):
22 | x = x.permute(2, 0, 1)
23 | y, _ = self.lstm(x)
24 | if self.skip:
25 | y = y + x
26 | y = y.permute(1, 2, 0)
27 | return y
28 |
--------------------------------------------------------------------------------
/matcha/hifigan/config.py:
--------------------------------------------------------------------------------
1 | v1 = {
2 | "resblock": "1",
3 | "num_gpus": 0,
4 | "batch_size": 16,
5 | "learning_rate": 0.0004,
6 | "adam_b1": 0.8,
7 | "adam_b2": 0.99,
8 | "lr_decay": 0.999,
9 | "seed": 1234,
10 | "upsample_rates": [8, 8, 2, 2],
11 | "upsample_kernel_sizes": [16, 16, 4, 4],
12 | "upsample_initial_channel": 512,
13 | "resblock_kernel_sizes": [3, 7, 11],
14 | "resblock_dilation_sizes": [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
15 | "resblock_initial_channel": 256,
16 | "segment_size": 8192,
17 | "num_mels": 80,
18 | "num_freq": 1025,
19 | "n_fft": 1024,
20 | "hop_size": 256,
21 | "win_size": 1024,
22 | "sampling_rate": 22050,
23 | "fmin": 0,
24 | "fmax": 8000,
25 | "fmax_loss": None,
26 | "num_workers": 4,
27 | "dist_config": {"dist_backend": "nccl", "dist_url": "tcp://localhost:54321", "world_size": 1},
28 | }
29 |
--------------------------------------------------------------------------------
/api.py:
--------------------------------------------------------------------------------
1 | import torchaudio
2 | import uvicorn
3 | from fastapi import FastAPI
4 | from fastapi.responses import FileResponse
5 | from cosyvoice.cli.cosyvoice import CosyVoice
6 | from cosyvoice.utils.file_utils import load_wav
7 |
8 | app = FastAPI()
9 | print("正在加载CosyVoice模型,请稍后...")
10 | model = CosyVoice('data/model/CosyVoice-300M')
11 | prompt_speech = load_wav('example.wav', 16000)
12 | with open('example参考音频文本.txt', 'r', encoding='utf-8') as file:
13 | lines = file.readlines()
14 | prompt_text = lines[0].strip()
15 | output_path = 'data/cache/cache.wav'
16 |
17 |
18 | @app.get("/cosyvoice/")
19 | def run_cosyvoice(text: str):
20 | results = model.inference_zero_shot(text, prompt_text, prompt_speech)
21 | tts_speech = results['tts_speech']
22 | torchaudio.save(output_path, tts_speech, 22050)
23 | return FileResponse(output_path)
24 |
25 |
26 | print("本地CosyVoice语音合成大模型API服务器启动成功!")
27 | uvicorn.run(app, host="0.0.0.0", port=9881)
28 |
--------------------------------------------------------------------------------
/academicodec/modules/norm.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Meta Platforms, Inc. and affiliates.
2 | # All rights reserved.
3 | #
4 | # This source code is licensed under the license found in the
5 | # LICENSE file in the root directory of this source tree.
6 | """Normalization modules."""
7 | import typing as tp
8 |
9 | import einops
10 | import torch
11 | from torch import nn
12 |
13 |
14 | class ConvLayerNorm(nn.LayerNorm):
15 | """
16 | Convolution-friendly LayerNorm that moves channels to last dimensions
17 | before running the normalization and moves them back to original position right after.
18 | """
19 |
20 | def __init__(self,
21 | normalized_shape: tp.Union[int, tp.List[int], torch.Size],
22 | **kwargs):
23 | super().__init__(normalized_shape, **kwargs)
24 |
25 | def forward(self, x):
26 | x = einops.rearrange(x, 'b ... t -> b t ...')
27 | x = super().forward(x)
28 | x = einops.rearrange(x, 'b t ... -> b ... t')
29 | return x
30 |
--------------------------------------------------------------------------------
/academicodec/models/encodec/dataset.py:
--------------------------------------------------------------------------------
1 | import glob
2 | import random
3 |
4 | import torch
5 | import torchaudio
6 | from torch.utils.data import Dataset
7 |
8 |
9 | class NSynthDataset(Dataset):
10 | """Dataset to load NSynth data."""
11 |
12 | def __init__(self, audio_dir):
13 | super().__init__()
14 | self.filenames = []
15 | self.filenames.extend(glob.glob(audio_dir + "/*.wav"))
16 | print(len(self.filenames))
17 | _, self.sr = torchaudio.load(self.filenames[0])
18 | self.max_len = 24000 # 24000
19 |
20 | def __len__(self):
21 | return len(self.filenames)
22 |
23 | def __getitem__(self, index):
24 | ans = torch.zeros(1, self.max_len)
25 | audio = torchaudio.load(self.filenames[index])[0]
26 | if audio.shape[1] > self.max_len:
27 | st = random.randint(0, audio.shape[1] - self.max_len - 1)
28 | ed = st + self.max_len
29 | return audio[:, st:ed]
30 | else:
31 | ans[:, :audio.shape[1]] = audio
32 | return ans
33 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2024 枫影剑
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/matcha/hifigan/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2020 Jungil Kong
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/academicodec/models/hificodec/vqvae_tester.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | import librosa
4 | import torch
5 | import torch.nn as nn
6 |
7 | from academicodec.models.hificodec.vqvae import VQVAE
8 |
9 |
10 | class VqvaeTester(nn.Module):
11 | def __init__(self, config_path, model_path, sample_rate=24000):
12 | super().__init__()
13 | self.vqvae = VQVAE(config_path, model_path, with_encoder=True)
14 | self.sample_rate = sample_rate
15 |
16 | @torch.no_grad()
17 | def forward(self, wav_path):
18 | # 单声道
19 | # wav.shape (T, ), 按照模型的 sr 读取
20 | wav, sr = librosa.load(wav_path, sr=self.sample_rate)
21 | fid = os.path.basename(wav_path)[:-4]
22 | wav = torch.tensor(wav).unsqueeze(0)
23 | wav = wav.cuda()
24 | # vq_codes is acoustic token
25 | vq_codes = self.vqvae.encode(wav)
26 | syn = self.vqvae(vq_codes)
27 | return fid, syn
28 |
29 | @torch.no_grad()
30 | def vq(self, wav_path):
31 | wav, sr = librosa.load(wav_path, sr=self.sample_rate)
32 | fid = os.path.basename(wav_path)[:-4]
33 | wav = torch.tensor(wav).unsqueeze(0)
34 | wav = wav.cuda()
35 | # vq_codes is acoustic token
36 | vq_codes = self.vqvae.encode(wav)
37 | return fid, vq_codes
38 |
--------------------------------------------------------------------------------
/matcha/hifigan/xutils.py:
--------------------------------------------------------------------------------
1 | """ from https://github.com/jik876/hifi-gan """
2 |
3 | import glob
4 | import os
5 |
6 | import matplotlib
7 | import torch
8 | from torch.nn.utils import weight_norm
9 |
10 | matplotlib.use("Agg")
11 | import matplotlib.pylab as plt
12 |
13 |
14 | def plot_spectrogram(spectrogram):
15 | fig, ax = plt.subplots(figsize=(10, 2))
16 | im = ax.imshow(spectrogram, aspect="auto", origin="lower", interpolation="none")
17 | plt.colorbar(im, ax=ax)
18 |
19 | fig.canvas.draw()
20 | plt.close()
21 |
22 | return fig
23 |
24 |
25 | def init_weights(m, mean=0.0, std=0.01):
26 | classname = m.__class__.__name__
27 | if classname.find("Conv") != -1:
28 | m.weight.data.normal_(mean, std)
29 |
30 |
31 | def apply_weight_norm(m):
32 | classname = m.__class__.__name__
33 | if classname.find("Conv") != -1:
34 | weight_norm(m)
35 |
36 |
37 | def get_padding(kernel_size, dilation=1):
38 | return int((kernel_size * dilation - dilation) / 2)
39 |
40 |
41 | def load_checkpoint(filepath, device):
42 | assert os.path.isfile(filepath)
43 | print(f"Loading '{filepath}'")
44 | checkpoint_dict = torch.load(filepath, map_location=device)
45 | print("Complete.")
46 | return checkpoint_dict
47 |
48 |
49 | def save_checkpoint(filepath, obj):
50 | print(f"Saving checkpoint to {filepath}")
51 | torch.save(obj, filepath)
52 | print("Complete.")
53 |
54 |
55 | def scan_checkpoint(cp_dir, prefix):
56 | pattern = os.path.join(cp_dir, prefix + "????????")
57 | cp_list = glob.glob(pattern)
58 | if len(cp_list) == 0:
59 | return None
60 | return sorted(cp_list)[-1]
61 |
--------------------------------------------------------------------------------
/academicodec/models/hificodec/vqvae.py:
--------------------------------------------------------------------------------
1 | import json
2 |
3 | import torch
4 | import torch.nn as nn
5 |
6 | from academicodec.models.hificodec.env import AttrDict
7 | from academicodec.models.hificodec.models import Encoder
8 | from academicodec.models.hificodec.models import Generator
9 | from academicodec.models.hificodec.models import Quantizer
10 |
11 |
12 | class VQVAE(nn.Module):
13 | def __init__(self,
14 | config_path,
15 | ckpt_path,
16 | with_encoder=False):
17 | super(VQVAE, self).__init__()
18 | ckpt = torch.load(ckpt_path)
19 | with open(config_path) as f:
20 | data = f.read()
21 | json_config = json.loads(data)
22 | self.h = AttrDict(json_config)
23 | self.quantizer = Quantizer(self.h)
24 | self.generator = Generator(self.h)
25 | self.generator.load_state_dict(ckpt['generator'])
26 | self.quantizer.load_state_dict(ckpt['quantizer'])
27 | if with_encoder:
28 | self.encoder = Encoder(self.h)
29 | self.encoder.load_state_dict(ckpt['encoder'])
30 |
31 | def forward(self, x):
32 | # x is the codebook
33 | # x.shape (B, T, Nq)
34 | quant_emb = self.quantizer.embed(x)
35 | return self.generator(quant_emb)
36 |
37 | def encode(self, x):
38 | batch_size = x.size(0)
39 | if len(x.shape) == 3 and x.shape[-1] == 1:
40 | x = x.squeeze(-1)
41 | c = self.encoder(x.unsqueeze(1))
42 | q, loss_q, c = self.quantizer(c)
43 | c = [code.reshape(batch_size, -1) for code in c]
44 | # shape: [N, T, 4]
45 | return torch.stack(c, -1)
46 |
--------------------------------------------------------------------------------
/README_CN.md:
--------------------------------------------------------------------------------
1 | # cosyvoice_simple_api
2 |
3 | ## 项目概述
4 |
5 | `cosyvoice_simple_api` 是一个基于阿里的 CosyVoice 开发的简易的语音合成 API 服务器项目。它允许用户轻松地将文本转换为有情感的语音输出,适用于创建有声读物、自动语音回复系统以及其他语音合成应用。
6 |
7 | ### 项目地址
8 |
9 | - CosyVoice 源地址:[FunAudioLLM/CosyVoice](https://github.com/FunAudioLLM/CosyVoice)
10 | - CosyVoice Windows 适配版(特别鸣谢刘悦):[v3ucn/CosyVoice_For_Windows](https://github.com/v3ucn/CosyVoice_For_Windows)
11 | - 本项目地址:[swordswind/cosyvoice_simple_api](https://github.com/swordswind/cosyvoice_simple_api)
12 |
13 | ## 运行方式
14 |
15 | 1. 确保你的系统中已安装 Python 环境。
16 | 2. 通过 `git clone` 或下载 ZIP 文件的方式获取项目代码。
17 | 3. 在项目根目录下,运行以下命令安装依赖:
18 |
19 | ```bash
20 | pip install -r requirements.txt
21 | ```
22 |
23 | 4. 在命令行中运行以下命令启动服务器:
24 |
25 | ```bash
26 | python api.py
27 | ```
28 |
29 | ## 服务器地址
30 |
31 | CosyVoice 语音合成 API 服务器地址为:`http://你的电脑IP:9881/`
32 |
33 | ## API 接口
34 |
35 | ### 接口地址
36 |
37 | ```
38 | /cosyvoice/
39 | ```
40 |
41 | ### 请求方式
42 |
43 | ```
44 | GET
45 | ```
46 |
47 | ### 请求参数
48 |
49 | - `text`:必填,要合成的主体文本。
50 |
51 | ## 使用示例
52 |
53 | 1. 在浏览器地址栏输入以下地址:
54 |
55 | ```
56 | http://127.0.0.1:9881/cosyvoice/?text=你好,很高兴遇见你
57 | ```
58 |
59 | 2. 按下回车键,服务器将返回 wav 格式的音频文件。
60 |
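除了在浏览器中直接访问,也可以在 Python 中调用该接口。下面是一段示意性的客户端代码(假设服务器已在本机 9881 端口运行,且已额外安装 `requests` 库,它不在 `requirements.txt` 中):

```python
import requests

# 示意代码:请求本地 CosyVoice API,并把返回的 wav 音频保存到文件
# 假设:服务器地址为 127.0.0.1:9881,requests 为额外安装的第三方库
resp = requests.get(
    "http://127.0.0.1:9881/cosyvoice/",
    params={"text": "你好,很高兴遇见你"},
)
resp.raise_for_status()
with open("output.wav", "wb") as f:
    f.write(resp.content)
```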
61 | ## 更换参考音频和参考音频文本
62 |
63 | 1. 将 `example.wav` 替换为自定义的参考音频,文件名保持不变。
64 | 2. 用记事本打开 `example参考音频文本.txt`,修改成新的自定义参考音频文本。
65 | 3. 修改完成后保存文件,并重新运行 `CosyVoice语音合成API服务器.bat` 文件。
66 |
67 | ## 技术栈
68 |
69 | - FastAPI:用于构建 API 服务器。
70 | - ModelScope:模型相关的库。
71 | - Torch:PyTorch,用于深度学习模型。
72 | - TorchAudio:用于音频处理。
73 | - Uvicorn:ASGI 服务器,用于运行 FastAPI 应用。
74 |
75 | ## 贡献
76 |
77 | 欢迎对本项目进行贡献,包括但不限于修复 bug、增加新功能、改进文档等。在提交 Pull Request 之前,请确保你的代码通过了所有测试,并且遵循项目的代码风格。
78 |
79 | ## 许可证
80 |
81 | 本项目采用 [MIT 许可证](LICENSE)。
--------------------------------------------------------------------------------
/cosyvoice/cli/zh_normalization/quantifier.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | import re
15 |
16 | from .num import num2str
17 |
18 | # 温度表达式,温度会影响负号的读法
19 | # -3°C 零下三度
20 | RE_TEMPERATURE = re.compile(r'(-?)(\d+(\.\d+)?)(°C|℃|度|摄氏度)')
21 | measure_dict = {
22 | "cm2": "平方厘米",
23 | "cm²": "平方厘米",
24 | "cm3": "立方厘米",
25 | "cm³": "立方厘米",
26 | "cm": "厘米",
27 | "db": "分贝",
28 | "ds": "毫秒",
29 | "kg": "千克",
30 | "km": "千米",
31 | "m2": "平方米",
32 | "m²": "平方米",
33 | "m³": "立方米",
34 | "m3": "立方米",
35 | "ml": "毫升",
36 | "m": "米",
37 | "mm": "毫米",
38 | "s": "秒"
39 | }
40 |
41 |
42 | def replace_temperature(match) -> str:
43 | """
44 | Args:
45 | match (re.Match)
46 | Returns:
47 | str
48 | """
49 | sign = match.group(1)
50 | temperature = match.group(2)
51 | unit = match.group(4)
52 | sign: str = "零下" if sign else ""
53 | temperature: str = num2str(temperature)
54 | unit: str = "摄氏度" if unit == "摄氏度" else "度"
55 | result = f"{sign}{temperature}{unit}"
56 | return result
57 |
58 |
59 | def replace_measure(sentence) -> str:
60 | for q_notation in measure_dict:
61 | if q_notation in sentence:
62 | sentence = sentence.replace(q_notation, measure_dict[q_notation])
63 | return sentence
64 |
--------------------------------------------------------------------------------
/academicodec/models/hificodec/vqvae_copy_syn.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import glob
3 | import json
4 | import os
5 | from pathlib import Path
6 |
7 | import soundfile as sf
8 | from tqdm import tqdm
9 |
10 | from academicodec.models.hificodec.vqvae_tester import VqvaeTester
11 |
12 | parser = argparse.ArgumentParser()
13 |
14 | #Path
15 | parser.add_argument('--outputdir', type=str, required=True)
16 | parser.add_argument('--model_path', type=str, required=True)
17 | parser.add_argument('--input_wavdir', type=str, required=True)
18 | parser.add_argument('--config_path', type=str, required=True)
19 | parser.add_argument('--num_gens', type=int, default=1024)
20 |
21 | #Data
22 | parser.add_argument('--sample_rate', type=int, default=24000)
23 |
24 | args = parser.parse_args()
25 |
26 | with open(args.config_path, 'r') as f:
27 | argdict = json.load(f)
28 | assert argdict['sampling_rate'] == args.sample_rate, \
29 | f"Sampling rate not consistent, stated {args.sample_rate}, but the model is trained on {argdict['sample_rate']}"
30 | argdict.update(args.__dict__)
31 | args.__dict__ = argdict
32 |
33 | if __name__ == '__main__':
34 | Path(args.outputdir).mkdir(parents=True, exist_ok=True)
35 | print("Init model and load weights")
36 | model = VqvaeTester(config_path=args.config_path, model_path=args.model_path,sample_rate=args.sample_rate)
37 | model.cuda()
38 | model.vqvae.generator.remove_weight_norm()
39 | model.vqvae.encoder.remove_weight_norm()
40 | model.eval()
41 | print("Model ready")
42 |
43 | wav_paths = glob.glob(f"{args.input_wavdir}/*.wav")[:args.num_gens]
44 | print(f"Globbed {len(wav_paths)} wav files.")
45 |
46 | for wav_path in wav_paths:
47 | fid, wav = model(wav_path)
48 | wav = wav.squeeze().cpu().numpy()
49 | sf.write(
50 | os.path.join(args.outputdir, f'{fid}.wav'), wav, args.sample_rate)
51 |
--------------------------------------------------------------------------------
/cosyvoice/flow/length_regulator.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Zhihao Du)
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | from typing import Tuple
15 | import torch.nn as nn
16 | from torch.nn import functional as F
17 | from cosyvoice.utils.mask import make_pad_mask
18 |
19 |
20 | class InterpolateRegulator(nn.Module):
21 | def __init__(
22 | self,
23 | channels: int,
24 | sampling_ratios: Tuple,
25 | out_channels: int = None,
26 | groups: int = 1,
27 | ):
28 | super().__init__()
29 | self.sampling_ratios = sampling_ratios
30 | out_channels = out_channels or channels
31 | model = nn.ModuleList([])
32 | if len(sampling_ratios) > 0:
33 | for _ in sampling_ratios:
34 | module = nn.Conv1d(channels, channels, 3, 1, 1)
35 | norm = nn.GroupNorm(groups, channels)
36 | act = nn.Mish()
37 | model.extend([module, norm, act])
38 | model.append(
39 | nn.Conv1d(channels, out_channels, 1, 1)
40 | )
41 | self.model = nn.Sequential(*model)
42 |
43 | def forward(self, x, ylens=None):
44 | # x in (B, T, D)
45 | mask = (~make_pad_mask(ylens)).to(x).unsqueeze(-1)
46 | x = F.interpolate(x.transpose(1, 2).contiguous(), size=ylens.max(), mode='nearest')
47 | out = self.model(x).transpose(1, 2).contiguous()
48 | olens = ylens
49 | return out * mask, olens
50 |
--------------------------------------------------------------------------------
/cosyvoice/cli/zh_normalization/phonecode.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | import re
15 |
16 | from .num import verbalize_digit
17 |
18 | # 规范化固话/手机号码
19 | # 手机
20 | # http://www.jihaoba.com/news/show/13680
21 | # 移动:139、138、137、136、135、134、159、158、157、150、151、152、188、187、182、183、184、178、198
22 | # 联通:130、131、132、156、155、186、185、176
23 | # 电信:133、153、189、180、181、177
24 | RE_MOBILE_PHONE = re.compile(
25 | r"(? str:
34 | if mobile:
35 | sp_parts = phone_string.strip('+').split()
36 | result = ','.join(
37 | [verbalize_digit(part, alt_one=True) for part in sp_parts])
38 | return result
39 | else:
40 | sil_parts = phone_string.split('-')
41 | result = ','.join(
42 | [verbalize_digit(part, alt_one=True) for part in sil_parts])
43 | return result
44 |
45 |
46 | def replace_phone(match) -> str:
47 | """
48 | Args:
49 | match (re.Match)
50 | Returns:
51 | str
52 | """
53 | return phone2str(match.group(0), mobile=False)
54 |
55 |
56 | def replace_mobile(match) -> str:
57 | """
58 | Args:
59 | match (re.Match)
60 | Returns:
61 | str
62 | """
63 | return phone2str(match.group(0))
64 |
--------------------------------------------------------------------------------
/cosyvoice/hifigan/f0_predictor.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Kai Hu)
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | import torch
15 | import torch.nn as nn
16 | from torch.nn.utils import weight_norm
17 |
18 |
19 | class ConvRNNF0Predictor(nn.Module):
20 | def __init__(self,
21 | num_class: int = 1,
22 | in_channels: int = 80,
23 | cond_channels: int = 512
24 | ):
25 | super().__init__()
26 |
27 | self.num_class = num_class
28 | self.condnet = nn.Sequential(
29 | weight_norm(
30 | nn.Conv1d(in_channels, cond_channels, kernel_size=3, padding=1)
31 | ),
32 | nn.ELU(),
33 | weight_norm(
34 | nn.Conv1d(cond_channels, cond_channels, kernel_size=3, padding=1)
35 | ),
36 | nn.ELU(),
37 | weight_norm(
38 | nn.Conv1d(cond_channels, cond_channels, kernel_size=3, padding=1)
39 | ),
40 | nn.ELU(),
41 | weight_norm(
42 | nn.Conv1d(cond_channels, cond_channels, kernel_size=3, padding=1)
43 | ),
44 | nn.ELU(),
45 | weight_norm(
46 | nn.Conv1d(cond_channels, cond_channels, kernel_size=3, padding=1)
47 | ),
48 | nn.ELU(),
49 | )
50 | self.classifier = nn.Linear(in_features=cond_channels, out_features=self.num_class)
51 |
52 | def forward(self, x: torch.Tensor) -> torch.Tensor:
53 | x = self.condnet(x)
54 | x = x.transpose(1, 2)
55 | return torch.abs(self.classifier(x).squeeze(-1))
56 |
--------------------------------------------------------------------------------
/academicodec/models/soundstream/dataset.py:
--------------------------------------------------------------------------------
1 | # 和 Encodec* 的 dataset.py 有点类似但是不完全一样
2 | # 主要是 prob > 0.7 的时候多了 ans2
3 | import glob
4 | import random
5 |
6 | import torch
7 | import torchaudio
8 | from torch.utils.data import Dataset
9 |
10 |
11 | class NSynthDataset(Dataset):
12 | """Dataset to load NSynth data."""
13 |
14 | def __init__(self, audio_dir):
15 | super().__init__()
16 | self.filenames = []
17 | self.filenames.extend(glob.glob(audio_dir + "/*.wav"))
18 | print(len(self.filenames))
19 | _, self.sr = torchaudio.load(self.filenames[0])
20 | self.max_len = 24000 # 24000
21 |
22 | def __len__(self):
23 | return len(self.filenames)
24 |
25 | def __getitem__(self, index):
26 | #print(self.filenames[index])
27 | prob = random.random() # (0,1)
28 | if prob > 0.7:
29 | # data augmentation
30 | ans1 = torch.zeros(1, self.max_len)
31 | ans2 = torch.zeros(1, self.max_len)
32 | audio1 = torchaudio.load(self.filenames[index])[0]
33 | index2 = random.randint(0, len(self.filenames) - 1)
34 | audio2 = torchaudio.load(self.filenames[index2])[0]
35 | if audio1.shape[1] > self.max_len:
36 | st = random.randint(0, audio1.shape[1] - self.max_len - 1)
37 | ed = st + self.max_len
38 | ans1 = audio1[:, st:ed]
39 | else:
40 | ans1[:, :audio1.shape[1]] = audio1
41 | if audio2.shape[1] > self.max_len:
42 | st = random.randint(0, audio2.shape[1] - self.max_len - 1)
43 | ed = st + self.max_len
44 | ans2 = audio2[:, st:ed]
45 | else:
46 | ans2[:, :audio2.shape[1]] = audio2
47 | ans = ans1 + ans2
48 | return ans
49 | else:
50 | ans = torch.zeros(1, self.max_len)
51 | audio = torchaudio.load(self.filenames[index])[0]
52 | if audio.shape[1] > self.max_len:
53 | st = random.randint(0, audio.shape[1] - self.max_len - 1)
54 | ed = st + self.max_len
55 | return audio[:, st:ed]
56 | else:
57 | ans[:, :audio.shape[1]] = audio
58 | return ans
59 |
--------------------------------------------------------------------------------
/academicodec/models/encodec/net3.py:
--------------------------------------------------------------------------------
1 | import math
2 | import random
3 |
4 | import numpy as np
5 | import torch.nn as nn
6 | from academicodec.modules.seanet import SEANetDecoder
7 | from academicodec.modules.seanet import SEANetEncoder
8 | from academicodec.quantization import ResidualVectorQuantizer
9 |
10 |
11 | # Generator
12 | class SoundStream(nn.Module):
13 | def __init__(self,
14 | n_filters,
15 | D,
16 | target_bandwidths=[7.5, 15],
17 | ratios=[8, 5, 4, 2],
18 | sample_rate=24000,
19 | bins=1024,
20 | normalize=False):
21 | super().__init__()
22 | self.hop_length = np.prod(ratios) # 计算乘积
23 | self.encoder = SEANetEncoder(
24 | n_filters=n_filters, dimension=D, ratios=ratios)
25 | n_q = int(1000 * target_bandwidths[-1] //
26 | (math.ceil(sample_rate / self.hop_length) * 10))
27 | self.frame_rate = math.ceil(sample_rate / np.prod(ratios)) # 75
28 | self.bits_per_codebook = int(math.log2(bins))
29 | self.target_bandwidths = target_bandwidths
30 | self.quantizer = ResidualVectorQuantizer(
31 | dimension=D, n_q=n_q, bins=bins)
32 | self.decoder = SEANetDecoder(
33 | n_filters=n_filters, dimension=D, ratios=ratios)
34 |
35 | def get_last_layer(self):
36 | return self.decoder.layers[-1].weight
37 |
38 | def forward(self, x):
39 | e = self.encoder(x)
40 | max_idx = len(self.target_bandwidths) - 1
41 | bw = self.target_bandwidths[random.randint(0, max_idx)]
42 | quantized, codes, bandwidth, commit_loss = self.quantizer(
43 | e, self.frame_rate, bw)
44 | o = self.decoder(quantized)
45 | return o, commit_loss, None
46 |
47 | def encode(self, x, target_bw=None, st=None):
48 | e = self.encoder(x)
49 | if target_bw is None:
50 | bw = self.target_bandwidths[-1]
51 | else:
52 | bw = target_bw
53 | if st is None:
54 | st = 0
55 | codes = self.quantizer.encode(e, self.frame_rate, bw, st)
56 | return codes
57 |
58 | def decode(self, codes):
59 | quantized = self.quantizer.decode(codes)
60 | o = self.decoder(quantized)
61 | return o
62 |
--------------------------------------------------------------------------------
/cosyvoice/cli/zh_normalization/constants.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | import re
15 | import string
16 |
17 | from pypinyin.constants import SUPPORT_UCS4
18 |
19 | # 全角半角转换
20 | # 英文字符全角 -> 半角映射表 (num: 52)
21 | F2H_ASCII_LETTERS = {
22 | ord(char) + 65248: ord(char)
23 | for char in string.ascii_letters
24 | }
25 |
26 | # 英文字符半角 -> 全角映射表
27 | H2F_ASCII_LETTERS = {value: key for key, value in F2H_ASCII_LETTERS.items()}
28 |
29 | # 数字字符全角 -> 半角映射表 (num: 10)
30 | F2H_DIGITS = {ord(char) + 65248: ord(char) for char in string.digits}
31 | # 数字字符半角 -> 全角映射表
32 | H2F_DIGITS = {value: key for key, value in F2H_DIGITS.items()}
33 |
34 | # 标点符号全角 -> 半角映射表 (num: 32)
35 | F2H_PUNCTUATIONS = {ord(char) + 65248: ord(char) for char in string.punctuation}
36 | # 标点符号半角 -> 全角映射表
37 | H2F_PUNCTUATIONS = {value: key for key, value in F2H_PUNCTUATIONS.items()}
38 |
39 | # 空格 (num: 1)
40 | F2H_SPACE = {'\u3000': ' '}
41 | H2F_SPACE = {' ': '\u3000'}
42 |
43 | # 非"有拼音的汉字"的字符串,可用于NSW提取
44 | if SUPPORT_UCS4:
45 | RE_NSW = re.compile(r'(?:[^'
46 | r'\u3007' # 〇
47 | r'\u3400-\u4dbf' # CJK扩展A:[3400-4DBF]
48 | r'\u4e00-\u9fff' # CJK基本:[4E00-9FFF]
49 | r'\uf900-\ufaff' # CJK兼容:[F900-FAFF]
50 | r'\U00020000-\U0002A6DF' # CJK扩展B:[20000-2A6DF]
51 | r'\U0002A703-\U0002B73F' # CJK扩展C:[2A700-2B73F]
52 | r'\U0002B740-\U0002B81D' # CJK扩展D:[2B740-2B81D]
53 | r'\U0002F80A-\U0002FA1F' # CJK兼容扩展:[2F800-2FA1F]
54 | r'])+')
55 | else:
56 | RE_NSW = re.compile( # pragma: no cover
57 | r'(?:[^'
58 | r'\u3007' # 〇
59 | r'\u3400-\u4dbf' # CJK扩展A:[3400-4DBF]
60 | r'\u4e00-\u9fff' # CJK基本:[4E00-9FFF]
61 | r'\uf900-\ufaff' # CJK兼容:[F900-FAFF]
62 | r'])+')
63 |
--------------------------------------------------------------------------------
/cosyvoice/utils/class_utils.py:
--------------------------------------------------------------------------------
1 | # Copyright [2023-11-28]
2 | # 2024 Alibaba Inc (authors: Xiang Lyu)
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | import torch
16 |
17 | from cosyvoice.transformer.activation import Swish
18 | from cosyvoice.transformer.subsampling import (
19 | LinearNoSubsampling,
20 | EmbedinigNoSubsampling,
21 | Conv1dSubsampling2,
22 | Conv2dSubsampling4,
23 | Conv2dSubsampling6,
24 | Conv2dSubsampling8,
25 | )
26 | from cosyvoice.transformer.embedding import (PositionalEncoding,
27 | RelPositionalEncoding,
28 | WhisperPositionalEncoding,
29 | LearnablePositionalEncoding,
30 | NoPositionalEncoding)
31 | from cosyvoice.transformer.attention import (MultiHeadedAttention,
32 | RelPositionMultiHeadedAttention)
33 | from cosyvoice.transformer.embedding import EspnetRelPositionalEncoding
34 | from cosyvoice.transformer.subsampling import LegacyLinearNoSubsampling
35 |
36 |
37 | COSYVOICE_ACTIVATION_CLASSES = {
38 | "hardtanh": torch.nn.Hardtanh,
39 | "tanh": torch.nn.Tanh,
40 | "relu": torch.nn.ReLU,
41 | "selu": torch.nn.SELU,
42 | "swish": getattr(torch.nn, "SiLU", Swish),
43 | "gelu": torch.nn.GELU,
44 | }
45 |
46 | COSYVOICE_SUBSAMPLE_CLASSES = {
47 | "linear": LinearNoSubsampling,
48 | "linear_legacy": LegacyLinearNoSubsampling,
49 | "embed": EmbedinigNoSubsampling,
50 | "conv1d2": Conv1dSubsampling2,
51 | "conv2d": Conv2dSubsampling4,
52 | "conv2d6": Conv2dSubsampling6,
53 | "conv2d8": Conv2dSubsampling8,
54 | 'paraformer_dummy': torch.nn.Identity
55 | }
56 |
57 | COSYVOICE_EMB_CLASSES = {
58 | "embed": PositionalEncoding,
59 | "abs_pos": PositionalEncoding,
60 | "rel_pos": RelPositionalEncoding,
61 | "rel_pos_espnet": EspnetRelPositionalEncoding,
62 | "no_pos": NoPositionalEncoding,
63 | "abs_pos_whisper": WhisperPositionalEncoding,
64 | "embed_learnable_pe": LearnablePositionalEncoding,
65 | }
66 |
67 | COSYVOICE_ATTENTION_CLASSES = {
68 | "selfattn": MultiHeadedAttention,
69 | "rel_selfattn": RelPositionMultiHeadedAttention,
70 | }
71 |
--------------------------------------------------------------------------------
/matcha/hifigan/denoiser.py:
--------------------------------------------------------------------------------
1 | # Code modified from Rafael Valle's implementation https://github.com/NVIDIA/waveglow/blob/5bc2a53e20b3b533362f974cfa1ea0267ae1c2b1/denoiser.py
2 |
3 | """Waveglow style denoiser can be used to remove the artifacts from the HiFiGAN generated audio."""
4 | import torch
5 |
6 |
7 | class Denoiser(torch.nn.Module):
8 | """Removes model bias from audio produced with waveglow"""
9 |
10 | def __init__(self, vocoder, filter_length=1024, n_overlap=4, win_length=1024, mode="zeros"):
11 | super().__init__()
12 | self.filter_length = filter_length
13 | self.hop_length = int(filter_length / n_overlap)
14 | self.win_length = win_length
15 |
16 | dtype, device = next(vocoder.parameters()).dtype, next(vocoder.parameters()).device
17 | self.device = device
18 | if mode == "zeros":
19 | mel_input = torch.zeros((1, 80, 88), dtype=dtype, device=device)
20 | elif mode == "normal":
21 | mel_input = torch.randn((1, 80, 88), dtype=dtype, device=device)
22 | else:
23 | raise Exception(f"Mode {mode} if not supported")
24 |
25 | def stft_fn(audio, n_fft, hop_length, win_length, window):
26 | spec = torch.stft(
27 | audio,
28 | n_fft=n_fft,
29 | hop_length=hop_length,
30 | win_length=win_length,
31 | window=window,
32 | return_complex=True,
33 | )
34 | spec = torch.view_as_real(spec)
35 | return torch.sqrt(spec.pow(2).sum(-1)), torch.atan2(spec[..., -1], spec[..., 0])
36 |
37 | self.stft = lambda x: stft_fn(
38 | audio=x,
39 | n_fft=self.filter_length,
40 | hop_length=self.hop_length,
41 | win_length=self.win_length,
42 | window=torch.hann_window(self.win_length, device=device),
43 | )
44 | self.istft = lambda x, y: torch.istft(
45 | torch.complex(x * torch.cos(y), x * torch.sin(y)),
46 | n_fft=self.filter_length,
47 | hop_length=self.hop_length,
48 | win_length=self.win_length,
49 | window=torch.hann_window(self.win_length, device=device),
50 | )
51 |
52 | with torch.no_grad():
53 | bias_audio = vocoder(mel_input).float().squeeze(0)
54 | bias_spec, _ = self.stft(bias_audio)
55 |
56 | self.register_buffer("bias_spec", bias_spec[:, :, 0][:, :, None])
57 |
58 | @torch.inference_mode()
59 | def forward(self, audio, strength=0.0005):
60 | audio_spec, audio_angles = self.stft(audio)
61 | audio_spec_denoised = audio_spec - self.bias_spec.to(audio.device) * strength
62 | audio_spec_denoised = torch.clamp(audio_spec_denoised, 0.0)
63 | audio_denoised = self.istft(audio_spec_denoised, audio_angles)
64 | return audio_denoised
65 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # cosyvoice_simple_api
2 |
3 | ## Project Overview
4 |
5 | `cosyvoice_simple_api` is a simple text-to-speech API server project developed based on Alibaba's CosyVoice. It allows users to easily convert text into emotionally rich voice output, suitable for creating audiobooks, automated voice response systems, and other text-to-speech applications.
6 |
7 | ### Project Addresses
8 |
9 | - CosyVoice Source Address: [FunAudioLLM/CosyVoice](https://github.com/FunAudioLLM/CosyVoice)
10 | - CosyVoice Windows Adaptation (Special thanks to Liu Yue): [v3ucn/CosyVoice_For_Windows](https://github.com/v3ucn/CosyVoice_For_Windows)
11 | - This Project Address: [swordswind/cosyvoice_simple_api](https://github.com/swordswind/cosyvoice_simple_api)
12 |
13 | ## Running Method
14 |
15 | 1. Ensure that a Python environment is installed in your system.
16 | 2. Obtain the project code via `git clone` or by downloading the ZIP file.
17 | 3. In the project root directory, run the following command to install dependencies:
18 |
19 | ```bash
20 | pip install -r requirements.txt
21 | ```
22 |
23 | 4. Run the following command in the command line to start the server:
24 |
25 | ```bash
26 | python api.py
27 | ```
28 |
29 | ## Server Address
30 |
31 | The CosyVoice text-to-speech API server address is: `http://your-computer-IP:9881/`
32 |
33 | ## API Interface
34 |
35 | ### Interface Address
36 |
37 | ```
38 | /cosyvoice/
39 | ```
40 |
41 | ### Request Method
42 |
43 | ```
44 | GET
45 | ```
46 |
47 | ### Request Parameters
48 |
49 | - `text`: Required, the main text to be synthesized.
50 |
51 | ## Usage Example
52 |
53 | 1. Enter the following address in the browser's address bar:
54 |
55 | ```
56 | http://127.0.0.1:9881/cosyvoice/?text=Hello, nice to meet you
57 | ```
58 |
59 | 2. Press Enter, and the server will return a response in the format of a wav audio file.
60 |
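You can also call the endpoint from Python instead of the browser. The snippet below is only a sketch: it assumes the server is running locally on port 9881 and that the `requests` library (not listed in `requirements.txt`) is installed:

```python
import requests

# Sketch: query the local CosyVoice API and save the returned wav audio.
# Assumptions: server reachable at 127.0.0.1:9881, `requests` installed separately.
resp = requests.get(
    "http://127.0.0.1:9881/cosyvoice/",
    params={"text": "Hello, nice to meet you"},
)
resp.raise_for_status()
with open("output.wav", "wb") as f:
    f.write(resp.content)
```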
61 | ## Changing Reference Audio and Reference Audio Text
62 |
63 | 1. Replace `example.wav` with your custom reference audio, keeping the file name unchanged.
64 | 2. Open `example参考音频文本.txt` (the reference audio text file) with Notepad and modify it to your new custom reference audio text.
65 | 3. After modification, save the file and rerun the `CosyVoice Text-to-Speech API Server.bat` file.
66 |
67 | ## Technology Stack
68 |
69 | - FastAPI: Used for building the API server.
70 | - ModelScope: A library related to models.
71 | - Torch: PyTorch, used for deep learning models.
72 | - TorchAudio: Used for audio processing.
73 | - Uvicorn: ASGI server, used to run FastAPI applications.
74 |
75 | ## Contribution
76 |
77 | Contributions to this project are welcome, including but not limited to fixing bugs, adding new features, and improving documentation. Before submitting a Pull Request, please ensure that your code passes all tests and adheres to the project's coding style.
78 |
79 | ## License
80 |
81 | This project is licensed under the [MIT License](LICENSE).
--------------------------------------------------------------------------------
/cosyvoice/transformer/activation.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2020 Johns Hopkins University (Shinji Watanabe)
2 | # 2020 Northwestern Polytechnical University (Pengcheng Guo)
3 | # 2020 Mobvoi Inc (Binbin Zhang)
4 | # 2024 Alibaba Inc (Xiang Lyu)
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 | """Swish() activation function for Conformer."""
18 |
19 | import torch
20 | from torch import nn, sin, pow
21 | from torch.nn import Parameter
22 |
23 |
24 | class Swish(torch.nn.Module):
25 | """Construct an Swish object."""
26 |
27 | def forward(self, x: torch.Tensor) -> torch.Tensor:
28 | """Return Swish activation function."""
29 | return x * torch.sigmoid(x)
30 |
31 |
32 | # Implementation adapted from https://github.com/EdwardDixon/snake under the MIT license.
33 | # LICENSE is in incl_licenses directory.
34 | class Snake(nn.Module):
35 | '''
36 | Implementation of a sine-based periodic activation function
37 | Shape:
38 | - Input: (B, C, T)
39 | - Output: (B, C, T), same shape as the input
40 | Parameters:
41 | - alpha - trainable parameter
42 | References:
43 | - This activation function is from this paper by Liu Ziyin, Tilman Hartwig, Masahito Ueda:
44 | https://arxiv.org/abs/2006.08195
45 | Examples:
46 | >>> a1 = Snake(256)
47 | >>> x = torch.randn(256)
48 | >>> x = a1(x)
49 | '''
50 | def __init__(self, in_features, alpha=1.0, alpha_trainable=True, alpha_logscale=False):
51 | '''
52 | Initialization.
53 | INPUT:
54 | - in_features: shape of the input
55 | - alpha: trainable parameter
56 | alpha is initialized to 1 by default, higher values = higher-frequency.
57 | alpha will be trained along with the rest of your model.
58 | '''
59 | super(Snake, self).__init__()
60 | self.in_features = in_features
61 |
62 | # initialize alpha
63 | self.alpha_logscale = alpha_logscale
64 | if self.alpha_logscale: # log scale alphas initialized to zeros
65 | self.alpha = Parameter(torch.zeros(in_features) * alpha)
66 | else: # linear scale alphas initialized to ones
67 | self.alpha = Parameter(torch.ones(in_features) * alpha)
68 |
69 | self.alpha.requires_grad = alpha_trainable
70 |
71 | self.no_div_by_zero = 0.000000001
72 |
73 | def forward(self, x):
74 | '''
75 | Forward pass of the function.
76 | Applies the function to the input elementwise.
77 | Snake ∶= x + 1/a * sin^2 (xa)
78 | '''
79 | alpha = self.alpha.unsqueeze(0).unsqueeze(-1) # line up with x to [B, C, T]
80 | if self.alpha_logscale:
81 | alpha = torch.exp(alpha)
82 | x = x + (1.0 / (alpha + self.no_div_by_zero)) * pow(sin(x * alpha), 2)
83 |
84 | return x
85 |
--------------------------------------------------------------------------------
/cosyvoice/utils/common.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2020 Mobvoi Inc (Binbin Zhang)
2 | # 2024 Alibaba Inc (authors: Xiang Lyu)
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | # Modified from ESPnet(https://github.com/espnet/espnet)
16 | """Unility functions for Transformer."""
17 |
18 | from typing import List
19 |
20 | import torch
21 |
22 | IGNORE_ID = -1
23 |
24 |
25 | def pad_list(xs: List[torch.Tensor], pad_value: int):
26 | """Perform padding for the list of tensors.
27 |
28 | Args:
29 | xs (List): List of Tensors [(T_1, `*`), (T_2, `*`), ..., (T_B, `*`)].
30 | pad_value (float): Value for padding.
31 |
32 | Returns:
33 | Tensor: Padded tensor (B, Tmax, `*`).
34 |
35 | Examples:
36 | >>> x = [torch.ones(4), torch.ones(2), torch.ones(1)]
37 | >>> x
38 | [tensor([1., 1., 1., 1.]), tensor([1., 1.]), tensor([1.])]
39 | >>> pad_list(x, 0)
40 | tensor([[1., 1., 1., 1.],
41 | [1., 1., 0., 0.],
42 | [1., 0., 0., 0.]])
43 |
44 | """
45 | max_len = max([len(item) for item in xs])
46 | batchs = len(xs)
47 | ndim = xs[0].ndim
48 | if ndim == 1:
49 | pad_res = torch.zeros(batchs,
50 | max_len,
51 | dtype=xs[0].dtype,
52 | device=xs[0].device)
53 | elif ndim == 2:
54 | pad_res = torch.zeros(batchs,
55 | max_len,
56 | xs[0].shape[1],
57 | dtype=xs[0].dtype,
58 | device=xs[0].device)
59 | elif ndim == 3:
60 | pad_res = torch.zeros(batchs,
61 | max_len,
62 | xs[0].shape[1],
63 | xs[0].shape[2],
64 | dtype=xs[0].dtype,
65 | device=xs[0].device)
66 | else:
67 | raise ValueError(f"Unsupported ndim: {ndim}")
68 | pad_res.fill_(pad_value)
69 | for i in range(batchs):
70 | pad_res[i, :len(xs[i])] = xs[i]
71 | return pad_res
72 |
73 |
74 | def th_accuracy(pad_outputs: torch.Tensor, pad_targets: torch.Tensor,
75 | ignore_label: int) -> torch.Tensor:
76 | """Calculate accuracy.
77 |
78 | Args:
79 | pad_outputs (Tensor): Prediction tensors (B * Lmax, D).
80 | pad_targets (LongTensor): Target label tensors (B, Lmax).
81 | ignore_label (int): Ignore label id.
82 |
83 | Returns:
84 | torch.Tensor: Accuracy value (0.0 - 1.0).
85 |
86 | """
87 | pad_pred = pad_outputs.view(pad_targets.size(0), pad_targets.size(1),
88 | pad_outputs.size(1)).argmax(2)
89 | mask = pad_targets != ignore_label
90 | numerator = torch.sum(
91 | pad_pred.masked_select(mask) == pad_targets.masked_select(mask))
92 | denominator = torch.sum(mask)
93 | return (numerator / denominator).detach()
94 |
--------------------------------------------------------------------------------
/academicodec/models/encodec/distributed/launch.py:
--------------------------------------------------------------------------------
1 | # ------------------------------------------
2 | # Diffsound
3 | # code based https://github.com/cientgu/VQ-Diffusion
4 | # ------------------------------------------
5 | import distributed.distributed as dist_fn
6 | import torch
7 | from torch import distributed as dist
8 | from torch import multiprocessing as mp
9 |
10 | # import distributed as dist_fn
11 |
12 |
13 | def find_free_port():
14 | import socket
15 |
16 | sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
17 |
18 | sock.bind(("", 0))
19 | port = sock.getsockname()[1]
20 | sock.close()
21 |
22 | return port
23 |
24 |
25 | def launch(fn,
26 | n_gpu_per_machine,
27 | n_machine=1,
28 | machine_rank=0,
29 | dist_url=None,
30 | args=()):
31 | world_size = n_machine * n_gpu_per_machine
32 |
33 | if world_size > 1:
34 | # if "OMP_NUM_THREADS" not in os.environ:
35 | # os.environ["OMP_NUM_THREADS"] = "1"
36 | if dist_url == "auto":
37 | if n_machine != 1:
38 | raise ValueError(
39 | 'dist_url="auto" not supported in multi-machine jobs')
40 | port = find_free_port()
41 | dist_url = f"tcp://127.0.0.1:{port}"
42 | print('dist_url ', dist_url)
43 | print('n_machine ', n_machine)
44 | print('args ', args)
45 | print('world_size ', world_size)
46 | print('machine_rank ', machine_rank)
47 | if n_machine > 1 and dist_url.startswith("file://"):
48 | raise ValueError(
49 | "file:// is not a reliable init method in multi-machine jobs. Prefer tcp://"
50 | )
51 |
52 | mp.spawn(
53 | distributed_worker,
54 | nprocs=n_gpu_per_machine,
55 | args=(fn, world_size, n_gpu_per_machine, machine_rank, dist_url,
56 | args),
57 | daemon=False, )
58 | # n_machine ? world_size
59 | else:
60 | local_rank = 0
61 | fn(local_rank, *args)
62 |
63 |
64 | def distributed_worker(local_rank, fn, world_size, n_gpu_per_machine,
65 | machine_rank, dist_url, args):
66 | if not torch.cuda.is_available():
67 | raise OSError("CUDA is not available. Please check your environments")
68 |
69 | global_rank = machine_rank * n_gpu_per_machine + local_rank
70 | print('local_rank ', local_rank)
71 | print('global_rank ', global_rank)
72 | try:
73 | dist.init_process_group(
74 | backend="NCCL",
75 | init_method=dist_url,
76 | world_size=world_size,
77 | rank=global_rank, )
78 |
79 | except Exception:
80 | raise OSError("failed to initialize NCCL groups")
81 |
82 | # changed
83 | dist_fn.synchronize()
84 |
85 | if n_gpu_per_machine > torch.cuda.device_count():
86 | raise ValueError(
87 | f"specified n_gpu_per_machine larger than available device ({torch.cuda.device_count()})"
88 | )
89 |
90 | torch.cuda.set_device(local_rank)
91 |
92 | if dist_fn.LOCAL_PROCESS_GROUP is not None:
93 | raise ValueError("torch.distributed.LOCAL_PROCESS_GROUP is not None")
94 |
95 | # changed part
96 |
97 | n_machine = world_size // n_gpu_per_machine
98 | for i in range(n_machine):
99 | ranks_on_i = list(
100 | range(i * n_gpu_per_machine, (i + 1) * n_gpu_per_machine))
101 | pg = dist.new_group(ranks_on_i)
102 |
103 | if i == machine_rank:
104 | dist_fn.LOCAL_PROCESS_GROUP = pg
105 |
106 | fn(local_rank, *args)
107 |
--------------------------------------------------------------------------------
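A hedged sketch of how launch() above might be driven from a training script; the train function and config dict are placeholders. With n_gpu_per_machine=1 the world size is 1, so launch() simply calls the function inline; larger counts spawn one NCCL worker per GPU and require CUDA.

from academicodec.models.encodec.distributed.launch import launch

def train(local_rank, cfg):
    # a real training loop would build the model and dataloader here
    print(f"worker {local_rank} started with cfg={cfg}")

if __name__ == "__main__":
    # world_size == 1 here, so no process group is created and train() runs inline
    launch(train, n_gpu_per_machine=1, n_machine=1, machine_rank=0,
           dist_url="auto", args=({"lr": 1e-4},))
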
/cosyvoice/transformer/label_smoothing_loss.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2019 Shigeki Karita
2 | # 2020 Mobvoi Inc (Binbin Zhang)
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """Label smoothing module."""
16 |
17 | import torch
18 | from torch import nn
19 |
20 |
21 | class LabelSmoothingLoss(nn.Module):
22 | """Label-smoothing loss.
23 |
24 | In a standard CE loss, the label's data distribution is:
25 | [0,1,2] ->
26 | [
27 | [1.0, 0.0, 0.0],
28 | [0.0, 1.0, 0.0],
29 | [0.0, 0.0, 1.0],
30 | ]
31 |
32 | In the label-smoothed CE loss, some probability mass
33 | is taken from the true label prob (1.0) and divided
34 | among the other labels.
35 |
36 | e.g.
37 | smoothing=0.1
38 | [0,1,2] ->
39 | [
40 | [0.9, 0.05, 0.05],
41 | [0.05, 0.9, 0.05],
42 | [0.05, 0.05, 0.9],
43 | ]
44 |
45 | Args:
46 | size (int): the number of classes
47 | padding_idx (int): padding class id which will be ignored for loss
48 | smoothing (float): smoothing rate (0.0 means the conventional CE)
49 | normalize_length (bool):
50 | normalize loss by sequence length if True
51 | normalize loss by batch size if False
52 | """
53 |
54 | def __init__(self,
55 | size: int,
56 | padding_idx: int,
57 | smoothing: float,
58 | normalize_length: bool = False):
59 | """Construct an LabelSmoothingLoss object."""
60 | super(LabelSmoothingLoss, self).__init__()
61 | self.criterion = nn.KLDivLoss(reduction="none")
62 | self.padding_idx = padding_idx
63 | self.confidence = 1.0 - smoothing
64 | self.smoothing = smoothing
65 | self.size = size
66 | self.normalize_length = normalize_length
67 |
68 | def forward(self, x: torch.Tensor, target: torch.Tensor) -> torch.Tensor:
69 | """Compute loss between x and target.
70 |
71 | The model output and data label tensors are flattened to
72 | (batch*seqlen, class) shape and a mask is applied to the
73 | padding part, which should not contribute to the loss.
74 |
75 | Args:
76 | x (torch.Tensor): prediction (batch, seqlen, class)
77 | target (torch.Tensor):
78 | target signal masked with self.padding_id (batch, seqlen)
79 | Returns:
80 | loss (torch.Tensor) : The KL loss, scalar float value
81 | """
82 | assert x.size(2) == self.size
83 | batch_size = x.size(0)
84 | x = x.view(-1, self.size)
85 | target = target.view(-1)
86 | # use zeros_like instead of torch.no_grad() for true_dist,
87 | # since no_grad() can not be exported by JIT
88 | true_dist = torch.zeros_like(x)
89 | true_dist.fill_(self.smoothing / (self.size - 1))
90 | ignore = target == self.padding_idx # (B,)
91 | total = len(target) - ignore.sum().item()
92 | target = target.masked_fill(ignore, 0) # avoid -1 index
93 | true_dist.scatter_(1, target.unsqueeze(1), self.confidence)
94 | kl = self.criterion(torch.log_softmax(x, dim=1), true_dist)
95 | denom = total if self.normalize_length else batch_size
96 | return kl.masked_fill(ignore.unsqueeze(1), 0).sum() / denom
97 |
--------------------------------------------------------------------------------
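A minimal sketch exercising the loss above on random logits; the sizes and the -1 padding id are arbitrary choices for illustration.

import torch
from cosyvoice.transformer.label_smoothing_loss import LabelSmoothingLoss

criterion = LabelSmoothingLoss(size=5, padding_idx=-1, smoothing=0.1)
logits = torch.randn(2, 3, 5)                   # (batch, seqlen, class)
target = torch.tensor([[1, 2, -1], [0, 4, 3]])  # -1 marks a padded position
loss = criterion(logits, target)
print(loss.item())                              # scalar, normalized by batch size
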
/cosyvoice/cli/zh_normalization/chronology.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | import re
15 |
16 | from .num import DIGITS
17 | from .num import num2str
18 | from .num import verbalize_cardinal
19 | from .num import verbalize_digit
20 |
21 |
22 | def _time_num2str(num_string: str) -> str:
23 | """A special case for verbalizing number in time."""
24 | result = num2str(num_string.lstrip('0'))
25 | if num_string.startswith('0'):
26 | result = DIGITS['0'] + result
27 | return result
28 |
29 |
30 | # 时刻表达式
31 | RE_TIME = re.compile(r'([0-1]?[0-9]|2[0-3])'
32 | r':([0-5][0-9])'
33 | r'(:([0-5][0-9]))?')
34 |
35 | # 时间范围,如8:30-12:30
36 | RE_TIME_RANGE = re.compile(r'([0-1]?[0-9]|2[0-3])'
37 | r':([0-5][0-9])'
38 | r'(:([0-5][0-9]))?'
39 | r'(~|-)'
40 | r'([0-1]?[0-9]|2[0-3])'
41 | r':([0-5][0-9])'
42 | r'(:([0-5][0-9]))?')
43 |
44 |
45 | def replace_time(match) -> str:
46 | """
47 | Args:
48 | match (re.Match)
49 | Returns:
50 | str
51 | """
52 |
53 | is_range = len(match.groups()) > 5
54 |
55 | hour = match.group(1)
56 | minute = match.group(2)
57 | second = match.group(4)
58 |
59 | if is_range:
60 | hour_2 = match.group(6)
61 | minute_2 = match.group(7)
62 | second_2 = match.group(9)
63 |
64 | result = f"{num2str(hour)}点"
65 | if minute.lstrip('0'):
66 | if int(minute) == 30:
67 | result += "半"
68 | else:
69 | result += f"{_time_num2str(minute)}分"
70 | if second and second.lstrip('0'):
71 | result += f"{_time_num2str(second)}秒"
72 |
73 | if is_range:
74 | result += "至"
75 | result += f"{num2str(hour_2)}点"
76 | if minute_2.lstrip('0'):
77 | if int(minute_2) == 30:
78 | result += "半"
79 | else:
80 | result += f"{_time_num2str(minute_2)}分"
81 | if second_2 and second_2.lstrip('0'):
82 | result += f"{_time_num2str(second_2)}秒"
83 |
84 | return result
85 |
86 |
87 | RE_DATE = re.compile(r'(\d{4}|\d{2})年'
88 | r'((0?[1-9]|1[0-2])月)?'
89 | r'(((0?[1-9])|((1|2)[0-9])|30|31)([日号]))?')
90 |
91 |
92 | def replace_date(match) -> str:
93 | """
94 | Args:
95 | match (re.Match)
96 | Returns:
97 | str
98 | """
99 | year = match.group(1)
100 | month = match.group(3)
101 | day = match.group(5)
102 | result = ""
103 | if year:
104 | result += f"{verbalize_digit(year)}年"
105 | if month:
106 | result += f"{verbalize_cardinal(month)}月"
107 | if day:
108 | result += f"{verbalize_cardinal(day)}{match.group(9)}"
109 | return result
110 |
111 |
112 | # 用 / 或者 - 分隔的 YY/MM/DD 或者 YY-MM-DD 日期
113 | RE_DATE2 = re.compile(
114 | r'(\d{4})([- /.])(0[1-9]|1[012])\2(0[1-9]|[12][0-9]|3[01])')
115 |
116 |
117 | def replace_date2(match) -> str:
118 | """
119 | Args:
120 | match (re.Match)
121 | Returns:
122 | str
123 | """
124 | year = match.group(1)
125 | month = match.group(3)
126 | day = match.group(4)
127 | result = ""
128 | if year:
129 | result += f"{verbalize_digit(year)}年"
130 | if month:
131 | result += f"{verbalize_cardinal(month)}月"
132 | if day:
133 | result += f"{verbalize_cardinal(day)}日"
134 | return result
135 |
--------------------------------------------------------------------------------
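A short sketch of how these rules are applied with re.sub; the exact Chinese verbalizations come from the sibling num.py module, so the expected outputs in the comments are indicative rather than guaranteed.

from cosyvoice.cli.zh_normalization.chronology import (RE_DATE, RE_TIME,
                                                        replace_date,
                                                        replace_time)

print(RE_TIME.sub(replace_time, "会议定在8:30开始"))   # 8:30 -> 八点半
print(RE_DATE.sub(replace_date, "2024年5月1日发布"))   # -> 二零二四年五月一日
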
/academicodec/models/encodec/distributed/distributed.py:
--------------------------------------------------------------------------------
1 | # ------------------------------------------
2 | # Diffsound
3 | # code based on https://github.com/cientgu/VQ-Diffusion
4 | # ------------------------------------------
5 | import pickle
6 |
7 | import torch
8 | from torch import distributed as dist
9 | from torch.utils import data
10 |
11 | LOCAL_PROCESS_GROUP = None
12 |
13 |
14 | def is_primary():
15 | return get_rank() == 0
16 |
17 |
18 | def get_rank():
19 | if not dist.is_available():
20 | return 0
21 |
22 | if not dist.is_initialized():
23 | return 0
24 |
25 | return dist.get_rank()
26 |
27 |
28 | def get_local_rank():
29 | if not dist.is_available():
30 | return 0
31 |
32 | if not dist.is_initialized():
33 | return 0
34 |
35 | if LOCAL_PROCESS_GROUP is None:
36 | raise ValueError("tensorfn.distributed.LOCAL_PROCESS_GROUP is None")
37 |
38 | return dist.get_rank(group=LOCAL_PROCESS_GROUP)
39 |
40 |
41 | def synchronize():
42 | if not dist.is_available():
43 | return
44 |
45 | if not dist.is_initialized():
46 | return
47 |
48 | world_size = dist.get_world_size()
49 |
50 | if world_size == 1:
51 | return
52 |
53 | dist.barrier()
54 |
55 |
56 | def get_world_size():
57 | if not dist.is_available():
58 | return 1
59 |
60 | if not dist.is_initialized():
61 | return 1
62 |
63 | return dist.get_world_size()
64 |
65 |
66 | def is_distributed():
67 | raise RuntimeError('Please debug this function!')
68 | return get_world_size() > 1
69 |
70 |
71 | def all_reduce(tensor, op=dist.ReduceOp.SUM, async_op=False):
72 | world_size = get_world_size()
73 |
74 | if world_size == 1:
75 | return tensor
76 | dist.all_reduce(tensor, op=op, async_op=async_op)
77 |
78 | return tensor
79 |
80 |
81 | def all_gather(data):
82 | world_size = get_world_size()
83 |
84 | if world_size == 1:
85 | return [data]
86 |
87 | buffer = pickle.dumps(data)
88 | storage = torch.ByteStorage.from_buffer(buffer)
89 | tensor = torch.ByteTensor(storage).to("cuda")
90 |
91 | local_size = torch.IntTensor([tensor.numel()]).to("cuda")
92 | size_list = [torch.IntTensor([1]).to("cuda") for _ in range(world_size)]
93 | dist.all_gather(size_list, local_size)
94 | size_list = [int(size.item()) for size in size_list]
95 | max_size = max(size_list)
96 |
97 | tensor_list = []
98 | for _ in size_list:
99 | tensor_list.append(torch.ByteTensor(size=(max_size, )).to("cuda"))
100 |
101 | if local_size != max_size:
102 | padding = torch.ByteTensor(size=(max_size - local_size, )).to("cuda")
103 | tensor = torch.cat((tensor, padding), 0)
104 |
105 | dist.all_gather(tensor_list, tensor)
106 |
107 | data_list = []
108 |
109 | for size, tensor in zip(size_list, tensor_list):
110 | buffer = tensor.cpu().numpy().tobytes()[:size]
111 | data_list.append(pickle.loads(buffer))
112 |
113 | return data_list
114 |
115 |
116 | def reduce_dict(input_dict, average=True):
117 | world_size = get_world_size()
118 |
119 | if world_size < 2:
120 | return input_dict
121 |
122 | with torch.no_grad():
123 | keys = []
124 | values = []
125 |
126 | for k in sorted(input_dict.keys()):
127 | keys.append(k)
128 | values.append(input_dict[k])
129 |
130 | values = torch.stack(values, 0)
131 | dist.reduce(values, dst=0)
132 |
133 | if dist.get_rank() == 0 and average:
134 | values /= world_size
135 |
136 | reduced_dict = {k: v for k, v in zip(keys, values)}
137 |
138 | return reduced_dict
139 |
140 |
141 | def data_sampler(dataset, shuffle, distributed):
142 | if distributed:
143 | return data.distributed.DistributedSampler(dataset, shuffle=shuffle)
144 |
145 | if shuffle:
146 | return data.RandomSampler(dataset)
147 |
148 | else:
149 | return data.SequentialSampler(dataset)
150 |
--------------------------------------------------------------------------------
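Outside an initialized process group these helpers fall back to single-process defaults, which the sketch below relies on; it also assumes the directory imports cleanly as a package (no __init__.py is shown in the tree).

from academicodec.models.encodec.distributed import distributed as dist_fn

print(dist_fn.get_rank())               # 0 when torch.distributed is uninitialized
print(dist_fn.get_world_size())         # 1 in the same situation
print(dist_fn.all_gather({"step": 1}))  # single process: returns [{'step': 1}] as-is
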
/cosyvoice/transformer/positionwise_feed_forward.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2019 Shigeki Karita
2 | # 2020 Mobvoi Inc (Binbin Zhang)
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """Positionwise feed forward layer definition."""
16 |
17 | import torch
18 |
19 |
20 | class PositionwiseFeedForward(torch.nn.Module):
21 | """Positionwise feed forward layer.
22 |
23 | The feed-forward block is applied at each position of the sequence.
24 | The output dim is the same as the input dim.
25 |
26 | Args:
27 | idim (int): Input dimension.
28 | hidden_units (int): The number of hidden units.
29 | dropout_rate (float): Dropout rate.
30 | activation (torch.nn.Module): Activation function
31 | """
32 |
33 | def __init__(
34 | self,
35 | idim: int,
36 | hidden_units: int,
37 | dropout_rate: float,
38 | activation: torch.nn.Module = torch.nn.ReLU(),
39 | ):
40 | """Construct a PositionwiseFeedForward object."""
41 | super(PositionwiseFeedForward, self).__init__()
42 | self.w_1 = torch.nn.Linear(idim, hidden_units)
43 | self.activation = activation
44 | self.dropout = torch.nn.Dropout(dropout_rate)
45 | self.w_2 = torch.nn.Linear(hidden_units, idim)
46 |
47 | def forward(self, xs: torch.Tensor) -> torch.Tensor:
48 | """Forward function.
49 |
50 | Args:
51 | xs: input tensor (B, L, D)
52 | Returns:
53 | output tensor, (B, L, D)
54 | """
55 | return self.w_2(self.dropout(self.activation(self.w_1(xs))))
56 |
57 |
58 | class MoEFFNLayer(torch.nn.Module):
59 | """
60 | Mixture of expert with Positionwise feed forward layer
61 | See also figure 1 in https://arxiv.org/pdf/2305.15663.pdf
62 | The output dim is the same as the input dim.
63 |
64 | Modified from https://github.com/Lightning-AI/lit-gpt/pull/823
65 | https://github.com/mistralai/mistral-src/blob/b46d6/moe_one_file_ref.py#L203-L219
66 | Args:
67 | n_expert: number of experts.
68 | n_expert_per_token: The actual number of experts used for each frame
69 | idim (int): Input dimension.
70 | hidden_units (int): The number of hidden units.
71 | dropout_rate (float): Dropout rate.
72 | activation (torch.nn.Module): Activation function
73 | """
74 |
75 | def __init__(
76 | self,
77 | n_expert: int,
78 | n_expert_per_token: int,
79 | idim: int,
80 | hidden_units: int,
81 | dropout_rate: float,
82 | activation: torch.nn.Module = torch.nn.ReLU(),
83 | ):
84 | super(MoEFFNLayer, self).__init__()
85 | self.gate = torch.nn.Linear(idim, n_expert, bias=False)
86 | self.experts = torch.nn.ModuleList(
87 | PositionwiseFeedForward(idim, hidden_units, dropout_rate,
88 | activation) for _ in range(n_expert))
89 | self.n_expert_per_token = n_expert_per_token
90 |
91 | def forward(self, xs: torch.Tensor) -> torch.Tensor:
92 | """Foward function.
93 | Args:
94 | xs: input tensor (B, L, D)
95 | Returns:
96 | output tensor, (B, L, D)
97 |
98 | """
99 | B, L, D = xs.size(
100 | ) # batch size, sequence length, embedding dimension (idim)
101 | xs = xs.view(-1, D) # (B*L, D)
102 | router = self.gate(xs) # (B*L, n_expert)
103 | logits, indices = torch.topk(
104 | router, self.n_expert_per_token
105 | ) # logits, indices: (B*L, n_expert_per_token)
106 | weights = torch.nn.functional.softmax(
107 | logits, dim=1,
108 | dtype=torch.float).to(dtype=xs.dtype) # (B*L, n_expert_per_token)
109 | output = torch.zeros_like(xs) # (B*L, D)
110 | for i, expert in enumerate(self.experts):
111 | mask = indices == i
112 | batch_idx, ith_expert = torch.where(mask)
113 | output[batch_idx] += weights[batch_idx, ith_expert, None] * expert(
114 | xs[batch_idx])
115 | return output.view(B, L, D)
116 |
--------------------------------------------------------------------------------
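A quick shape check for the two blocks above; the dimensions are arbitrary and the MoE layer is only exercised to show that it preserves the (B, L, D) layout.

import torch
from cosyvoice.transformer.positionwise_feed_forward import (MoEFFNLayer,
                                                             PositionwiseFeedForward)

xs = torch.randn(2, 10, 64)                                  # (B, L, D)
ffn = PositionwiseFeedForward(idim=64, hidden_units=256, dropout_rate=0.1)
moe = MoEFFNLayer(n_expert=4, n_expert_per_token=2,
                  idim=64, hidden_units=256, dropout_rate=0.1)
print(ffn(xs).shape, moe(xs).shape)                          # both (2, 10, 64)
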
/academicodec/quantization/distrib.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Meta Platforms, Inc. and affiliates.
2 | # All rights reserved.
3 | #
4 | # This source code is licensed under the license found in the
5 | # LICENSE file in the root directory of this source tree.
6 | """Torch distributed utilities."""
7 | import typing as tp
8 |
9 | import torch
10 |
11 |
12 | def rank():
13 | if torch.distributed.is_initialized():
14 | return torch.distributed.get_rank()
15 | else:
16 | return 0
17 |
18 |
19 | def world_size():
20 | if torch.distributed.is_initialized():
21 | return torch.distributed.get_world_size()
22 | else:
23 | return 1
24 |
25 |
26 | def is_distributed():
27 | return world_size() > 1
28 |
29 |
30 | def all_reduce(tensor: torch.Tensor, op=torch.distributed.ReduceOp.SUM):
31 | if is_distributed():
32 | return torch.distributed.all_reduce(tensor, op)
33 |
34 |
35 | def _is_complex_or_float(tensor):
36 | return torch.is_floating_point(tensor) or torch.is_complex(tensor)
37 |
38 |
39 | def _check_number_of_params(params: tp.List[torch.Tensor]):
40 | # utility function to check that the number of params in all workers is the same,
41 | # and thus avoid a deadlock with distributed all reduce.
42 | if not is_distributed() or not params:
43 | return
44 | #print('params[0].device ', params[0].device)
45 | tensor = torch.tensor(
46 | [len(params)], device=params[0].device, dtype=torch.long)
47 | all_reduce(tensor)
48 | if tensor.item() != len(params) * world_size():
49 | # If not all the workers have the same number of params, this
50 | # inequality will hold for at least one of them.
51 | raise RuntimeError(
52 | f"Mismatch in number of params: ours is {len(params)}, "
53 | "at least one worker has a different one.")
54 |
55 |
56 | def broadcast_tensors(tensors: tp.Iterable[torch.Tensor], src: int=0):
57 | """Broadcast the tensors from the given parameters to all workers.
58 | This can be used to ensure that all workers have the same model to start with.
59 | """
60 | if not is_distributed():
61 | return
62 | tensors = [tensor for tensor in tensors if _is_complex_or_float(tensor)]
63 | _check_number_of_params(tensors)
64 | handles = []
65 | for tensor in tensors:
66 | # src = int(rank()) # added code
67 | handle = torch.distributed.broadcast(
68 | tensor.data, src=src, async_op=True)
69 | handles.append(handle)
70 | for handle in handles:
71 | handle.wait()
72 |
73 |
74 | def sync_buffer(buffers, average=True):
75 | """
76 | Sync buffers across workers. If average is False, broadcast instead of averaging.
77 | """
78 | if not is_distributed():
79 | return
80 | handles = []
81 | for buffer in buffers:
82 | if torch.is_floating_point(buffer.data):
83 | if average:
84 | handle = torch.distributed.all_reduce(
85 | buffer.data,
86 | op=torch.distributed.ReduceOp.SUM,
87 | async_op=True)
88 | else:
89 | handle = torch.distributed.broadcast(
90 | buffer.data, src=0, async_op=True)
91 | handles.append((buffer, handle))
92 | for buffer, handle in handles:
93 | handle.wait()
94 | if average:
95 | buffer.data /= world_size()
96 |
97 |
98 | def sync_grad(params):
99 | """
100 | Simpler alternative to DistributedDataParallel, that doesn't rely
101 | on any black magic. For simple models it can also be as fast.
102 | Just call this on your model parameters after the call to backward!
103 | """
104 | if not is_distributed():
105 | return
106 | handles = []
107 | for p in params:
108 | if p.grad is not None:
109 | handle = torch.distributed.all_reduce(
110 | p.grad.data, op=torch.distributed.ReduceOp.SUM, async_op=True)
111 | handles.append((p, handle))
112 | for p, handle in handles:
113 | handle.wait()
114 | p.grad.data /= world_size()
115 |
116 |
117 | def average_metrics(metrics: tp.Dict[str, float], count=1.):
118 | """Average a dictionary of metrics across all workers, using the optional
119 | `count` as unnormalized weight.
120 | """
121 | if not is_distributed():
122 | return metrics
123 | keys, values = zip(*metrics.items())
124 | device = 'cuda' if torch.cuda.is_available() else 'cpu'
125 | tensor = torch.tensor(
126 | list(values) + [1], device=device, dtype=torch.float32)
127 | tensor *= count
128 | all_reduce(tensor)
129 | averaged = (tensor[:-1] / tensor[-1]).cpu().tolist()
130 | return dict(zip(keys, averaged))
131 |
--------------------------------------------------------------------------------
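In a single-process run is_distributed() is False, so these helpers degrade to no-ops or identities; the sketch below only illustrates the calling convention and assumes the package initializes cleanly.

import torch
from academicodec.quantization import distrib

print(distrib.rank(), distrib.world_size())               # 0 1
print(distrib.average_metrics({"loss": 0.5}))             # returned unchanged
distrib.sync_grad([torch.nn.Parameter(torch.zeros(3))])   # no-op without a process group
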
/matcha/models/components/flow_matching.py:
--------------------------------------------------------------------------------
1 | from abc import ABC
2 |
3 | import torch
4 | import torch.nn.functional as F
5 |
6 | from matcha.models.components.decoder import Decoder
7 | from matcha.utils.pylogger import get_pylogger
8 |
9 | log = get_pylogger(__name__)
10 |
11 |
12 | class BASECFM(torch.nn.Module, ABC):
13 | def __init__(
14 | self,
15 | n_feats,
16 | cfm_params,
17 | n_spks=1,
18 | spk_emb_dim=128,
19 | ):
20 | super().__init__()
21 | self.n_feats = n_feats
22 | self.n_spks = n_spks
23 | self.spk_emb_dim = spk_emb_dim
24 | self.solver = cfm_params.solver
25 | if hasattr(cfm_params, "sigma_min"):
26 | self.sigma_min = cfm_params.sigma_min
27 | else:
28 | self.sigma_min = 1e-4
29 |
30 | self.estimator = None
31 |
32 | @torch.inference_mode()
33 | def forward(self, mu, mask, n_timesteps, temperature=1.0, spks=None, cond=None):
34 | """Forward diffusion
35 |
36 | Args:
37 | mu (torch.Tensor): output of encoder
38 | shape: (batch_size, n_feats, mel_timesteps)
39 | mask (torch.Tensor): output_mask
40 | shape: (batch_size, 1, mel_timesteps)
41 | n_timesteps (int): number of diffusion steps
42 | temperature (float, optional): temperature for scaling noise. Defaults to 1.0.
43 | spks (torch.Tensor, optional): speaker ids. Defaults to None.
44 | shape: (batch_size, spk_emb_dim)
45 | cond: Not used but kept for future purposes
46 |
47 | Returns:
48 | sample: generated mel-spectrogram
49 | shape: (batch_size, n_feats, mel_timesteps)
50 | """
51 | z = torch.randn_like(mu) * temperature
52 | t_span = torch.linspace(0, 1, n_timesteps + 1, device=mu.device)
53 | return self.solve_euler(z, t_span=t_span, mu=mu, mask=mask, spks=spks, cond=cond)
54 |
55 | def solve_euler(self, x, t_span, mu, mask, spks, cond):
56 | """
57 | Fixed-step Euler solver for ODEs.
58 | Args:
59 | x (torch.Tensor): random noise
60 | t_span (torch.Tensor): n_timesteps interpolated
61 | shape: (n_timesteps + 1,)
62 | mu (torch.Tensor): output of encoder
63 | shape: (batch_size, n_feats, mel_timesteps)
64 | mask (torch.Tensor): output_mask
65 | shape: (batch_size, 1, mel_timesteps)
66 | spks (torch.Tensor, optional): speaker ids. Defaults to None.
67 | shape: (batch_size, spk_emb_dim)
68 | cond: Not used but kept for future purposes
69 | """
70 | t, _, dt = t_span[0], t_span[-1], t_span[1] - t_span[0]
71 |
72 | # I am storing this because I can later plot it by putting a debugger here and saving it to a file
73 | # Or in future might add like a return_all_steps flag
74 | sol = []
75 |
76 | for step in range(1, len(t_span)):
77 | dphi_dt = self.estimator(x, mask, mu, t, spks, cond)
78 |
79 | x = x + dt * dphi_dt
80 | t = t + dt
81 | sol.append(x)
82 | if step < len(t_span) - 1:
83 | dt = t_span[step + 1] - t
84 |
85 | return sol[-1]
86 |
87 | def compute_loss(self, x1, mask, mu, spks=None, cond=None):
88 | """Computes diffusion loss
89 |
90 | Args:
91 | x1 (torch.Tensor): Target
92 | shape: (batch_size, n_feats, mel_timesteps)
93 | mask (torch.Tensor): target mask
94 | shape: (batch_size, 1, mel_timesteps)
95 | mu (torch.Tensor): output of encoder
96 | shape: (batch_size, n_feats, mel_timesteps)
97 | spks (torch.Tensor, optional): speaker embedding. Defaults to None.
98 | shape: (batch_size, spk_emb_dim)
99 |
100 | Returns:
101 | loss: conditional flow matching loss
102 | y: conditional flow
103 | shape: (batch_size, n_feats, mel_timesteps)
104 | """
105 | b, _, t = mu.shape
106 |
107 | # random timestep
108 | t = torch.rand([b, 1, 1], device=mu.device, dtype=mu.dtype)
109 | # sample noise p(x_0)
110 | z = torch.randn_like(x1)
111 |
112 | y = (1 - (1 - self.sigma_min) * t) * z + t * x1
113 | u = x1 - (1 - self.sigma_min) * z
114 |
115 | loss = F.mse_loss(self.estimator(y, mask, mu, t.squeeze(), spks), u, reduction="sum") / (
116 | torch.sum(mask) * u.shape[1]
117 | )
118 | return loss, y
119 |
120 |
121 | class CFM(BASECFM):
122 | def __init__(self, in_channels, out_channel, cfm_params, decoder_params, n_spks=1, spk_emb_dim=64):
123 | super().__init__(
124 | n_feats=in_channels,
125 | cfm_params=cfm_params,
126 | n_spks=n_spks,
127 | spk_emb_dim=spk_emb_dim,
128 | )
129 |
130 | in_channels = in_channels + (spk_emb_dim if n_spks > 1 else 0)
131 | # Just change the architecture of the estimator here
132 | self.estimator = Decoder(in_channels=in_channels, out_channels=out_channel, **decoder_params)
133 |
--------------------------------------------------------------------------------
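A standalone sketch of the fixed-step Euler integration that solve_euler() above performs, with a toy vector field standing in for the learned estimator; nothing here touches the Matcha classes themselves.

import torch

def toy_estimator(x, t):
    # dphi/dt = -x simply pulls the sample toward zero; purely illustrative.
    return -x

x = torch.randn(2, 80, 100)                # (batch, n_feats, mel_timesteps)
t_span = torch.linspace(0, 1, 11)          # 10 Euler steps from t=0 to t=1
t, dt = t_span[0], t_span[1] - t_span[0]
for step in range(1, len(t_span)):
    x = x + dt * toy_estimator(x, t)
    t = t + dt
    if step < len(t_span) - 1:
        dt = t_span[step + 1] - t
print(x.abs().mean())                      # magnitude shrinks toward zero
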
/academicodec/quantization/vq.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Meta Platforms, Inc. and affiliates.
2 | # All rights reserved.
3 | #
4 | # This source code is licensed under the license found in the
5 | # LICENSE file in the root directory of this source tree.
6 | """Residual vector quantizer implementation."""
7 | import math
8 | import typing as tp
9 | from dataclasses import dataclass
10 | from dataclasses import field
11 |
12 | import torch
13 | from torch import nn
14 |
15 | from academicodec.quantization.core_vq import ResidualVectorQuantization
16 |
17 |
18 | @dataclass
19 | class QuantizedResult:
20 | quantized: torch.Tensor
21 | codes: torch.Tensor
22 | bandwidth: torch.Tensor # bandwidth in kb/s used, per batch item.
23 | penalty: tp.Optional[torch.Tensor] = None
24 | metrics: dict = field(default_factory=dict)
25 |
26 |
27 | class ResidualVectorQuantizer(nn.Module):
28 | """Residual Vector Quantizer.
29 | Args:
30 | dimension (int): Dimension of the codebooks.
31 | n_q (int): Number of residual vector quantizers used.
32 | bins (int): Codebook size.
33 | decay (float): Decay for exponential moving average over the codebooks.
34 | kmeans_init (bool): Whether to use kmeans to initialize the codebooks.
35 | kmeans_iters (int): Number of iterations used for kmeans initialization.
36 | threshold_ema_dead_code (int): Threshold for dead code expiration. Replace any codes
37 | that have an exponential moving average cluster size less than the specified threshold with
38 | a randomly selected vector from the current batch.
39 | """
40 |
41 | def __init__(
42 | self,
43 | dimension: int=256,
44 | n_q: int=8,
45 | bins: int=1024,
46 | decay: float=0.99,
47 | kmeans_init: bool=True,
48 | kmeans_iters: int=50,
49 | threshold_ema_dead_code: int=2, ):
50 | super().__init__()
51 | self.n_q = n_q
52 | self.dimension = dimension
53 | self.bins = bins
54 | self.decay = decay
55 | self.kmeans_init = kmeans_init
56 | self.kmeans_iters = kmeans_iters
57 | self.threshold_ema_dead_code = threshold_ema_dead_code
58 | self.vq = ResidualVectorQuantization(
59 | dim=self.dimension,
60 | codebook_size=self.bins,
61 | num_quantizers=self.n_q,
62 | decay=self.decay,
63 | kmeans_init=self.kmeans_init,
64 | kmeans_iters=self.kmeans_iters,
65 | threshold_ema_dead_code=self.threshold_ema_dead_code, )
66 |
67 | def forward(self,
68 | x: torch.Tensor,
69 | sample_rate: int,
70 | bandwidth: tp.Optional[float]=None) -> QuantizedResult:
71 | """Residual vector quantization on the given input tensor.
72 | Args:
73 | x (torch.Tensor): Input tensor.
74 | sample_rate (int): Sample rate of the input tensor.
75 | bandwidth (float): Target bandwidth.
76 | Returns:
77 | QuantizedResult:
78 | The quantized (or approximately quantized) representation with
79 | the associated bandwidth and any penalty term for the loss.
80 | """
81 | bw_per_q = self.get_bandwidth_per_quantizer(sample_rate)
82 | n_q = self.get_num_quantizers_for_bandwidth(sample_rate, bandwidth)
83 | quantized, codes, commit_loss = self.vq(x, n_q=n_q)
84 | bw = torch.tensor(n_q * bw_per_q).to(x)
85 | return quantized, codes, bw, torch.mean(commit_loss)
86 | #return QuantizedResult(quantized, codes, bw, penalty=torch.mean(commit_loss))
87 |
88 | def get_num_quantizers_for_bandwidth(
89 | self, sample_rate: int, bandwidth: tp.Optional[float]=None) -> int:
90 | """Return n_q based on specified target bandwidth.
91 | """
92 | bw_per_q = self.get_bandwidth_per_quantizer(sample_rate)
93 | n_q = self.n_q
94 | if bandwidth and bandwidth > 0.:
95 | n_q = int(max(1, math.floor(bandwidth / bw_per_q)))
96 | return n_q
97 |
98 | def get_bandwidth_per_quantizer(self, sample_rate: int):
99 | """Return bandwidth per quantizer for a given input sample rate.
100 | """
101 | return math.log2(self.bins) * sample_rate / 1000
102 |
103 | def encode(self,
104 | x: torch.Tensor,
105 | sample_rate: int,
106 | bandwidth: tp.Optional[float]=None,
107 | st: tp.Optional[int]=None) -> torch.Tensor:
108 | """Encode a given input tensor with the specified sample rate at the given bandwidth.
109 | The RVQ encode method sets the appropriate number of quantizers to use
110 | and returns indices for each quantizer.
111 | """
112 | n_q = self.get_num_quantizers_for_bandwidth(sample_rate, bandwidth)
113 | st = st or 0
114 | codes = self.vq.encode(x, n_q=n_q, st=st)
115 | return codes
116 |
117 | def decode(self, codes: torch.Tensor) -> torch.Tensor:
118 | """Decode the given codes to the quantized representation.
119 | """
120 | quantized = self.vq.decode(codes)
121 | return quantized
122 |
--------------------------------------------------------------------------------
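A worked example of the bandwidth bookkeeping above, done with plain math so nothing needs to be instantiated; the 50 Hz value is an assumed frame rate (the method's argument is named sample_rate) and the other numbers mirror the constructor defaults.

import math

bins, frame_rate = 1024, 50
bw_per_q = math.log2(bins) * frame_rate / 1000    # 10 bits/frame * 50 fps = 0.5 kb/s
target_bw = 3.0                                   # target bandwidth in kb/s
n_q = max(1, math.floor(target_bw / bw_per_q))    # -> 6 quantizers are kept
print(bw_per_q, n_q)
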
/cosyvoice/transformer/decoder_layer.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2019 Shigeki Karita
2 | # 2020 Mobvoi Inc (Binbin Zhang)
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """Decoder self-attention layer definition."""
16 | from typing import Optional, Tuple
17 |
18 | import torch
19 | from torch import nn
20 |
21 |
22 | class DecoderLayer(nn.Module):
23 | """Single decoder layer module.
24 |
25 | Args:
26 | size (int): Input dimension.
27 | self_attn (torch.nn.Module): Self-attention module instance.
28 | `MultiHeadedAttention` instance can be used as the argument.
29 | src_attn (torch.nn.Module): Inter-attention module instance.
30 | `MultiHeadedAttention` instance can be used as the argument.
31 | If `None` is passed, inter-attention is not used, as in
32 | CIF, GPT, and other decoder-only models.
33 | feed_forward (torch.nn.Module): Feed-forward module instance.
34 | `PositionwiseFeedForward` instance can be used as the argument.
35 | dropout_rate (float): Dropout rate.
36 | normalize_before (bool):
37 | True: use layer_norm before each sub-block.
38 | False: use layer_norm after each sub-block.
39 | """
40 |
41 | def __init__(
42 | self,
43 | size: int,
44 | self_attn: nn.Module,
45 | src_attn: Optional[nn.Module],
46 | feed_forward: nn.Module,
47 | dropout_rate: float,
48 | normalize_before: bool = True,
49 | ):
50 | """Construct an DecoderLayer object."""
51 | super().__init__()
52 | self.size = size
53 | self.self_attn = self_attn
54 | self.src_attn = src_attn
55 | self.feed_forward = feed_forward
56 | self.norm1 = nn.LayerNorm(size, eps=1e-5)
57 | self.norm2 = nn.LayerNorm(size, eps=1e-5)
58 | self.norm3 = nn.LayerNorm(size, eps=1e-5)
59 | self.dropout = nn.Dropout(dropout_rate)
60 | self.normalize_before = normalize_before
61 |
62 | def forward(
63 | self,
64 | tgt: torch.Tensor,
65 | tgt_mask: torch.Tensor,
66 | memory: torch.Tensor,
67 | memory_mask: torch.Tensor,
68 | cache: Optional[torch.Tensor] = None
69 | ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
70 | """Compute decoded features.
71 |
72 | Args:
73 | tgt (torch.Tensor): Input tensor (#batch, maxlen_out, size).
74 | tgt_mask (torch.Tensor): Mask for input tensor
75 | (#batch, maxlen_out).
76 | memory (torch.Tensor): Encoded memory
77 | (#batch, maxlen_in, size).
78 | memory_mask (torch.Tensor): Encoded memory mask
79 | (#batch, maxlen_in).
80 | cache (torch.Tensor): cached tensors.
81 | (#batch, maxlen_out - 1, size).
82 |
83 | Returns:
84 | torch.Tensor: Output tensor (#batch, maxlen_out, size).
85 | torch.Tensor: Mask for output tensor (#batch, maxlen_out).
86 | torch.Tensor: Encoded memory (#batch, maxlen_in, size).
87 | torch.Tensor: Encoded memory mask (#batch, maxlen_in).
88 |
89 | """
90 | residual = tgt
91 | if self.normalize_before:
92 | tgt = self.norm1(tgt)
93 |
94 | if cache is None:
95 | tgt_q = tgt
96 | tgt_q_mask = tgt_mask
97 | else:
98 | # compute only the last frame query keeping dim: max_time_out -> 1
99 | assert cache.shape == (
100 | tgt.shape[0],
101 | tgt.shape[1] - 1,
102 | self.size,
103 | ), "{cache.shape} == {(tgt.shape[0], tgt.shape[1] - 1, self.size)}"
104 | tgt_q = tgt[:, -1:, :]
105 | residual = residual[:, -1:, :]
106 | tgt_q_mask = tgt_mask[:, -1:, :]
107 |
108 | x = residual + self.dropout(
109 | self.self_attn(tgt_q, tgt, tgt, tgt_q_mask)[0])
110 | if not self.normalize_before:
111 | x = self.norm1(x)
112 |
113 | if self.src_attn is not None:
114 | residual = x
115 | if self.normalize_before:
116 | x = self.norm2(x)
117 | x = residual + self.dropout(
118 | self.src_attn(x, memory, memory, memory_mask)[0])
119 | if not self.normalize_before:
120 | x = self.norm2(x)
121 |
122 | residual = x
123 | if self.normalize_before:
124 | x = self.norm3(x)
125 | x = residual + self.dropout(self.feed_forward(x))
126 | if not self.normalize_before:
127 | x = self.norm3(x)
128 |
129 | if cache is not None:
130 | x = torch.cat([cache, x], dim=1)
131 |
132 | return x, tgt_mask, memory, memory_mask
133 |
--------------------------------------------------------------------------------
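A minimal sketch wiring DecoderLayer together; the real multi-headed attention lives elsewhere in the package, so tiny stand-in modules that just return an (output, weights) tuple are used here, and all sizes are arbitrary.

import torch
from torch import nn
from cosyvoice.transformer.decoder_layer import DecoderLayer
from cosyvoice.transformer.positionwise_feed_forward import PositionwiseFeedForward

class IdentityAttn(nn.Module):
    # stand-in attention: echoes the query and returns dummy weights
    def forward(self, query, key, value, mask):
        return query, None

layer = DecoderLayer(size=64,
                     self_attn=IdentityAttn(),
                     src_attn=IdentityAttn(),
                     feed_forward=PositionwiseFeedForward(64, 256, 0.1),
                     dropout_rate=0.1)
tgt = torch.randn(2, 5, 64)
memory = torch.randn(2, 7, 64)
tgt_mask = torch.ones(2, 5, 5, dtype=torch.bool)
memory_mask = torch.ones(2, 5, 7, dtype=torch.bool)
x, *_ = layer(tgt, tgt_mask, memory, memory_mask)
print(x.shape)                                     # torch.Size([2, 5, 64])
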
/academicodec/modules/transformer.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Meta Platforms, Inc. and affiliates.
2 | # All rights reserved.
3 | #
4 | # This source code is licensed under the license found in the
5 | # LICENSE file in the root directory of this source tree.
6 | """A streamable transformer."""
7 | import typing as tp
8 |
9 | import torch
10 | import torch.nn as nn
11 | import torch.nn.functional as F
12 |
13 |
14 | def create_sin_embedding(positions: torch.Tensor,
15 | dim: int,
16 | max_period: float=10000):
17 | """Create time embedding for the given positions, target dimension `dim`.
18 | """
19 | # We aim for BTC format
20 | assert dim % 2 == 0
21 | half_dim = dim // 2
22 | adim = torch.arange(half_dim, device=positions.device).view(1, 1, -1)
23 | phase = positions / (max_period**(adim / (half_dim - 1)))
24 | return torch.cat(
25 | [
26 | torch.cos(phase),
27 | torch.sin(phase),
28 | ], dim=-1)
29 |
30 |
31 | class StreamingTransformerEncoderLayer(nn.TransformerEncoderLayer):
32 | def forward(self, x: torch.Tensor, x_past: torch.Tensor,
33 | past_context: int): # type: ignore
34 | if self.norm_first:
35 | sa_input = self.norm1(x)
36 | x = x + self._sa_block(sa_input, x_past, past_context)
37 | x = x + self._ff_block(self.norm2(x))
38 | else:
39 | sa_input = x
40 | x = self.norm1(x + self._sa_block(sa_input, x_past, past_context))
41 | x = self.norm2(x + self._ff_block(x))
42 |
43 | return x, sa_input
44 |
45 | # self-attention block
46 | def _sa_block(self,
47 | x: torch.Tensor,
48 | x_past: torch.Tensor,
49 | past_context: int): # type: ignore
50 | _, T, _ = x.shape
51 | _, H, _ = x_past.shape
52 |
53 | queries = x
54 | keys = torch.cat([x_past, x], dim=1)
55 | values = keys
56 |
57 | queries_pos = torch.arange(H, T + H, device=x.device).view(-1, 1)
58 | keys_pos = torch.arange(T + H, device=x.device).view(1, -1)
59 | delta = queries_pos - keys_pos
60 | valid_access = (delta >= 0) & (delta <= past_context)
61 | x = self.self_attn(
62 | queries, keys, values, attn_mask=~valid_access,
63 | need_weights=False)[0]
64 | return self.dropout1(x)
65 |
66 |
67 | class StreamingTransformerEncoder(nn.Module):
68 | """TransformerEncoder with streaming support.
69 |
70 | Args:
71 | dim (int): dimension of the data.
72 | hidden_scale (float): the intermediate dimension of the FF module is this times the dimension.
73 | num_heads (int): number of heads.
74 | num_layers (int): number of layers.
75 | max_period (float): maximum period of cosines in the positional embedding.
76 | past_context (int or None): receptive field for the causal mask, infinite if None.
77 | gelu (bool): if true uses GeLUs, otherwise use ReLUs.
78 | norm_in (bool): normalize the input.
79 | dropout (float): dropout probability.
80 | **kwargs: See `nn.TransformerEncoderLayer`.
81 | """
82 |
83 | def __init__(self,
84 | dim,
85 | hidden_scale: float=4.,
86 | num_heads: int=8,
87 | num_layers: int=5,
88 | max_period: float=10000,
89 | past_context: int=1000,
90 | gelu: bool=True,
91 | norm_in: bool=True,
92 | dropout: float=0.,
93 | **kwargs):
94 | super().__init__()
95 | assert dim % num_heads == 0
96 | hidden_dim = int(dim * hidden_scale)
97 |
98 | self.max_period = max_period
99 | self.past_context = past_context
100 | activation: tp.Any = F.gelu if gelu else F.relu
101 |
102 | self.norm_in: nn.Module
103 | if norm_in:
104 | self.norm_in = nn.LayerNorm(dim)
105 | else:
106 | self.norm_in = nn.Identity()
107 |
108 | self.layers = nn.ModuleList()
109 | for idx in range(num_layers):
110 | self.layers.append(
111 | StreamingTransformerEncoderLayer(
112 | dim,
113 | num_heads,
114 | hidden_dim,
115 | activation=activation,
116 | batch_first=True,
117 | dropout=dropout,
118 | **kwargs))
119 |
120 | def forward(self,
121 | x: torch.Tensor,
122 | states: tp.Optional[tp.List[torch.Tensor]]=None,
123 | offset: tp.Union[int, torch.Tensor]=0):
124 | B, T, C = x.shape
125 | if states is None:
126 | states = [
127 | torch.zeros_like(x[:, :1]) for _ in range(1 + len(self.layers))
128 | ]
129 |
130 | positions = torch.arange(T, device=x.device).view(1, -1, 1) + offset
131 | pos_emb = create_sin_embedding(positions, C, max_period=self.max_period)
132 |
133 | new_state: tp.List[torch.Tensor] = []
134 | x = self.norm_in(x)
135 | x = x + pos_emb
136 |
137 | for layer_state, layer in zip(states, self.layers):
138 | x, new_layer_state = layer(x, layer_state, self.past_context)
139 | new_layer_state = torch.cat([layer_state, new_layer_state], dim=1)
140 | new_state.append(new_layer_state[:, -self.past_context:, :])
141 | return x, new_state, offset + T
142 |
--------------------------------------------------------------------------------
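A quick check of the positional-embedding helper above; positions follow the (batch, time, 1) layout the encoder builds internally and the dimensions are arbitrary.

import torch
from academicodec.modules.transformer import create_sin_embedding

positions = torch.arange(16, dtype=torch.float32).view(1, -1, 1)
emb = create_sin_embedding(positions, dim=8)
print(emb.shape)        # torch.Size([1, 16, 8]) -- cos half then sin half
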
/academicodec/models/soundstream/models.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 | from academicodec.modules import NormConv1d
5 | from academicodec.modules import NormConv2d
6 | from academicodec.utils import get_padding
7 | from torch.nn import AvgPool1d
8 | from torch.nn.utils import spectral_norm
9 | from torch.nn.utils import weight_norm
10 |
11 | LRELU_SLOPE = 0.1
12 |
13 |
14 | class DiscriminatorP(torch.nn.Module):
15 | def __init__(self,
16 | period,
17 | kernel_size=5,
18 | stride=3,
19 | use_spectral_norm=False,
20 | activation: str='LeakyReLU',
21 | activation_params: dict={'negative_slope': 0.2}):
22 | super(DiscriminatorP, self).__init__()
23 | self.period = period
24 | norm_f = weight_norm if use_spectral_norm is False else spectral_norm
25 | self.activation = getattr(torch.nn, activation)(**activation_params)
26 | self.convs = nn.ModuleList([
27 | NormConv2d(
28 | 1,
29 | 32, (kernel_size, 1), (stride, 1),
30 | padding=(get_padding(5, 1), 0)),
31 | NormConv2d(
32 | 32,
33 | 32, (kernel_size, 1), (stride, 1),
34 | padding=(get_padding(5, 1), 0)),
35 | NormConv2d(
36 | 32,
37 | 32, (kernel_size, 1), (stride, 1),
38 | padding=(get_padding(5, 1), 0)),
39 | NormConv2d(
40 | 32,
41 | 32, (kernel_size, 1), (stride, 1),
42 | padding=(get_padding(5, 1), 0)),
43 | NormConv2d(32, 32, (kernel_size, 1), 1, padding=(2, 0)),
44 | ])
45 | self.conv_post = NormConv2d(32, 1, (3, 1), 1, padding=(1, 0))
46 |
47 | def forward(self, x):
48 | fmap = []
49 | # 1d to 2d
50 | b, c, t = x.shape
51 | if t % self.period != 0: # pad first
52 | n_pad = self.period - (t % self.period)
53 | x = F.pad(x, (0, n_pad), "reflect")
54 | t = t + n_pad
55 | x = x.view(b, c, t // self.period, self.period)
56 |
57 | for l in self.convs:
58 | x = l(x)
59 | x = self.activation(x)
60 | fmap.append(x)
61 | x = self.conv_post(x)
62 | fmap.append(x)
63 | x = torch.flatten(x, 1, -1)
64 |
65 | return x, fmap
66 |
67 |
68 | class MultiPeriodDiscriminator(torch.nn.Module):
69 | def __init__(self):
70 | super(MultiPeriodDiscriminator, self).__init__()
71 | self.discriminators = nn.ModuleList([
72 | DiscriminatorP(2),
73 | DiscriminatorP(3),
74 | DiscriminatorP(5),
75 | DiscriminatorP(7),
76 | DiscriminatorP(11),
77 | ])
78 |
79 | def forward(self, y, y_hat):
80 | y_d_rs = []
81 | y_d_gs = []
82 | fmap_rs = []
83 | fmap_gs = []
84 | for i, d in enumerate(self.discriminators):
85 | y_d_r, fmap_r = d(y)
86 | y_d_g, fmap_g = d(y_hat)
87 | y_d_rs.append(y_d_r)
88 | fmap_rs.append(fmap_r)
89 | y_d_gs.append(y_d_g)
90 | fmap_gs.append(fmap_g)
91 | return y_d_rs, y_d_gs, fmap_rs, fmap_gs
92 |
93 |
94 | class DiscriminatorS(torch.nn.Module):
95 | def __init__(self,
96 | use_spectral_norm=False,
97 | activation: str='LeakyReLU',
98 | activation_params: dict={'negative_slope': 0.2}):
99 | super(DiscriminatorS, self).__init__()
100 | self.activation = getattr(torch.nn, activation)(**activation_params)
101 | self.convs = nn.ModuleList([
102 | NormConv1d(1, 32, 15, 1, padding=7),
103 | NormConv1d(32, 32, 41, 2, groups=4, padding=20),
104 | NormConv1d(32, 32, 41, 2, groups=16, padding=20),
105 | NormConv1d(32, 32, 41, 4, groups=16, padding=20),
106 | NormConv1d(32, 32, 41, 4, groups=16, padding=20),
107 | NormConv1d(32, 32, 41, 1, groups=16, padding=20),
108 | NormConv1d(32, 32, 5, 1, padding=2),
109 | ])
110 | self.conv_post = NormConv1d(32, 1, 3, 1, padding=1)
111 |
112 | def forward(self, x):
113 | fmap = []
114 | for l in self.convs:
115 | x = l(x)
116 | x = self.activation(x)
117 | fmap.append(x)
118 | x = self.conv_post(x)
119 | fmap.append(x)
120 | x = torch.flatten(x, 1, -1)
121 | return x, fmap
122 |
123 |
124 | class MultiScaleDiscriminator(torch.nn.Module):
125 | def __init__(self):
126 | super(MultiScaleDiscriminator, self).__init__()
127 | self.discriminators = nn.ModuleList([
128 | DiscriminatorS(),
129 | DiscriminatorS(),
130 | DiscriminatorS(),
131 | ])
132 | self.meanpools = nn.ModuleList(
133 | [AvgPool1d(4, 2, padding=2), AvgPool1d(4, 2, padding=2)])
134 |
135 | def forward(self, y, y_hat):
136 | y_d_rs = []
137 | y_d_gs = []
138 | fmap_rs = []
139 | fmap_gs = []
140 | for i, d in enumerate(self.discriminators):
141 | if i != 0:
142 | y = self.meanpools[i - 1](y)
143 | y_hat = self.meanpools[i - 1](y_hat)
144 | y_d_r, fmap_r = d(y)
145 | y_d_g, fmap_g = d(y_hat)
146 | y_d_rs.append(y_d_r)
147 | fmap_rs.append(fmap_r)
148 | y_d_gs.append(y_d_g)
149 | fmap_gs.append(fmap_g)
150 |
151 | return y_d_rs, y_d_gs, fmap_rs, fmap_gs
152 |
--------------------------------------------------------------------------------
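A standalone sketch of the 1d-to-2d reshaping DiscriminatorP performs before its convolutions: the waveform is reflect-padded to a multiple of the period and folded into a (batch, channels, frames, period) grid; the waveform length and period below are arbitrary.

import torch
import torch.nn.functional as F

period = 5
x = torch.randn(1, 1, 22047)                 # (b, c, t) raw waveform
b, c, t = x.shape
if t % period != 0:                          # pad first, exactly as in forward()
    n_pad = period - (t % period)
    x = F.pad(x, (0, n_pad), "reflect")
    t = t + n_pad
x = x.view(b, c, t // period, period)
print(x.shape)                               # torch.Size([1, 1, 4410, 5])
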
/cosyvoice/bin/inference.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu)
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | from __future__ import print_function
16 |
17 | import argparse
18 | import logging
19 | logging.getLogger('matplotlib').setLevel(logging.WARNING)
20 | import os
21 |
22 | import torch
23 | from torch.utils.data import DataLoader
24 | import torchaudio
25 | from hyperpyyaml import load_hyperpyyaml
26 | from tqdm import tqdm
27 | from cosyvoice.cli.model import CosyVoiceModel
28 |
29 | from cosyvoice.dataset.dataset import Dataset
30 |
31 | def get_args():
32 | parser = argparse.ArgumentParser(description='inference with your model')
33 | parser.add_argument('--config', required=True, help='config file')
34 | parser.add_argument('--prompt_data', required=True, help='prompt data file')
35 | parser.add_argument('--prompt_utt2data', required=True, help='prompt utt2data mapping file')
36 | parser.add_argument('--tts_text', required=True, help='tts input file')
37 | parser.add_argument('--llm_model', required=True, help='llm model file')
38 | parser.add_argument('--flow_model', required=True, help='flow model file')
39 | parser.add_argument('--hifigan_model', required=True, help='hifigan model file')
40 | parser.add_argument('--gpu',
41 | type=int,
42 | default=-1,
43 | help='gpu id for this rank, -1 for cpu')
44 | parser.add_argument('--mode',
45 | default='sft',
46 | choices=['sft', 'zero_shot'],
47 | help='inference mode')
48 | parser.add_argument('--result_dir', required=True, help='dir to save tts results')
49 | args = parser.parse_args()
50 | print(args)
51 | return args
52 |
53 |
54 | def main():
55 | args = get_args()
56 | logging.basicConfig(level=logging.DEBUG,
57 | format='%(asctime)s %(levelname)s %(message)s')
58 | os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu)
59 |
60 | # Init cosyvoice models from configs
61 | use_cuda = args.gpu >= 0 and torch.cuda.is_available()
62 | device = torch.device('cuda' if use_cuda else 'cpu')
63 | with open(args.config, 'r') as f:
64 | configs = load_hyperpyyaml(f)
65 |
66 | model = CosyVoiceModel(configs['llm'], configs['flow'], configs['hift'])
67 | model.load(args.llm_model, args.flow_model, args.hifigan_model)
68 |
69 | test_dataset = Dataset(args.prompt_data, data_pipeline=configs['data_pipeline'], mode='inference', shuffle=False, partition=False, tts_file=args.tts_text, prompt_utt2data=args.prompt_utt2data)
70 | test_data_loader = DataLoader(test_dataset, batch_size=None, num_workers=0)
71 |
72 | del configs
73 | os.makedirs(args.result_dir, exist_ok=True)
74 | fn = os.path.join(args.result_dir, 'wav.scp')
75 | f = open(fn, 'w')
76 | with torch.no_grad():
77 | for batch_idx, batch in tqdm(enumerate(test_data_loader)):
78 | utts = batch["utts"]
79 | assert len(utts) == 1, "inference mode only support batchsize 1"
80 | text = batch["text"]
81 | text_token = batch["text_token"].to(device)
82 | text_token_len = batch["text_token_len"].to(device)
83 | tts_text = batch["tts_text"]
84 | tts_index = batch["tts_index"]
85 | tts_text_token = batch["tts_text_token"].to(device)
86 | tts_text_token_len = batch["tts_text_token_len"].to(device)
87 | speech_token = batch["speech_token"].to(device)
88 | speech_token_len = batch["speech_token_len"].to(device)
89 | speech_feat = batch["speech_feat"].to(device)
90 | speech_feat_len = batch["speech_feat_len"].to(device)
91 | utt_embedding = batch["utt_embedding"].to(device)
92 | spk_embedding = batch["spk_embedding"].to(device)
93 | if args.mode == 'sft':
94 | model_input = {'text': tts_text_token, 'text_len': tts_text_token_len,
95 | 'llm_embedding': spk_embedding, 'flow_embedding': spk_embedding}
96 | else:
97 | model_input = {'text': tts_text_token, 'text_len': tts_text_token_len,
98 | 'prompt_text': text_token, 'prompt_text_len': text_token_len,
99 | 'llm_prompt_speech_token': speech_token, 'llm_prompt_speech_token_len': speech_token_len,
100 | 'flow_prompt_speech_token': speech_token, 'flow_prompt_speech_token_len': speech_token_len,
101 | 'prompt_speech_feat': speech_feat, 'prompt_speech_feat_len': speech_feat_len,
102 | 'llm_embedding': utt_embedding, 'flow_embedding': utt_embedding}
103 | model_output = model.inference(**model_input)
104 | tts_key = '{}_{}'.format(utts[0], tts_index[0])
105 | tts_fn = os.path.join(args.result_dir, '{}.wav'.format(tts_key))
106 | torchaudio.save(tts_fn, model_output['tts_speech'], sample_rate=22050)
107 | f.write('{} {}\n'.format(tts_key, tts_fn))
108 | f.flush()
109 | f.close()
110 | logging.info('Result wav.scp saved in {}'.format(fn))
111 |
112 |
113 | if __name__ == '__main__':
114 | main()
115 |
--------------------------------------------------------------------------------
/cosyvoice/dataset/dataset.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang)
2 | # 2024 Alibaba Inc (authors: Xiang Lyu)
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
16 | import random
17 | import json
18 | import math
19 | from functools import partial
20 |
21 | import torch
22 | import torch.distributed as dist
23 | from torch.utils.data import IterableDataset
24 | from cosyvoice.utils.file_utils import read_lists, read_json_lists
25 |
26 |
27 | class Processor(IterableDataset):
28 |
29 | def __init__(self, source, f, *args, **kw):
30 | assert callable(f)
31 | self.source = source
32 | self.f = f
33 | self.args = args
34 | self.kw = kw
35 |
36 | def set_epoch(self, epoch):
37 | self.source.set_epoch(epoch)
38 |
39 | def __iter__(self):
40 | """ Return an iterator over the source dataset processed by the
41 | given processor.
42 | """
43 | assert self.source is not None
44 | assert callable(self.f)
45 | return self.f(iter(self.source), *self.args, **self.kw)
46 |
47 | def apply(self, f):
48 | assert callable(f)
49 | return Processor(self, f, *self.args, **self.kw)
50 |
51 |
52 | class DistributedSampler:
53 |
54 | def __init__(self, shuffle=True, partition=True):
55 | self.epoch = -1
56 | self.update()
57 | self.shuffle = shuffle
58 | self.partition = partition
59 |
60 | def update(self):
61 | assert dist.is_available()
62 | if dist.is_initialized():
63 | self.rank = dist.get_rank()
64 | self.world_size = dist.get_world_size()
65 | else:
66 | self.rank = 0
67 | self.world_size = 1
68 | worker_info = torch.utils.data.get_worker_info()
69 | if worker_info is None:
70 | self.worker_id = 0
71 | self.num_workers = 1
72 | else:
73 | self.worker_id = worker_info.id
74 | self.num_workers = worker_info.num_workers
75 | return dict(rank=self.rank,
76 | world_size=self.world_size,
77 | worker_id=self.worker_id,
78 | num_workers=self.num_workers)
79 |
80 | def set_epoch(self, epoch):
81 | self.epoch = epoch
82 |
83 | def sample(self, data):
84 | """ Sample data according to rank/world_size/num_workers
85 |
86 | Args:
87 | data(List): input data list
88 |
89 | Returns:
90 | List: data list after sample
91 | """
92 | data = list(range(len(data)))
93 | # force datalist even
94 | if self.partition:
95 | if self.shuffle:
96 | random.Random(self.epoch).shuffle(data)
97 | if len(data) < self.world_size:
98 | data = data * math.ceil(self.world_size / len(data))
99 | data = data[:self.world_size]
100 | data = data[self.rank::self.world_size]
101 | if len(data) < self.num_workers:
102 | data = data * math.ceil(self.num_workers / len(data))
103 | data = data[:self.num_workers]
104 | data = data[self.worker_id::self.num_workers]
105 | return data
106 |
107 |
108 | class DataList(IterableDataset):
109 |
110 | def __init__(self, lists, shuffle=True, partition=True):
111 | self.lists = lists
112 | self.sampler = DistributedSampler(shuffle, partition)
113 |
114 | def set_epoch(self, epoch):
115 | self.sampler.set_epoch(epoch)
116 |
117 | def __iter__(self):
118 | sampler_info = self.sampler.update()
119 | indexes = self.sampler.sample(self.lists)
120 | for index in indexes:
121 | data = dict(src=self.lists[index])
122 | data.update(sampler_info)
123 | yield data
124 |
125 |
126 | def Dataset(data_list_file,
127 | data_pipeline,
128 | mode='train',
129 | shuffle=True,
130 | partition=True,
131 | tts_file='',
132 | prompt_utt2data=''):
133 | """ Construct dataset from arguments
134 |
135 | We have two shuffle stages in the Dataset. The first is a global
136 | shuffle at the shards tar/raw file level. The second is a global shuffle
137 | at the training-sample level.
138 |
139 | Args:
140 | data_list_file(str): file that lists the data shards/raw files
141 | data_pipeline(list): processing functions applied in order
142 | partition(bool): whether to do data partition in terms of rank
143 | """
144 | assert mode in ['train', 'inference']
145 | lists = read_lists(data_list_file)
146 | if mode == 'inference':
147 | with open(tts_file) as f:
148 | tts_data = json.load(f)
149 | utt2lists = read_json_lists(prompt_utt2data)
150 | # filter unnecessary files in inference mode
151 | lists = list(set([utt2lists[utt] for utt in tts_data.keys() if utt2lists[utt] in lists]))
152 | dataset = DataList(lists,
153 | shuffle=shuffle,
154 | partition=partition)
155 | if mode == 'inference':
156 | # map partial arg tts_data in inference mode
157 | data_pipeline[0] = partial(data_pipeline[0], tts_data=tts_data)
158 | for func in data_pipeline:
159 | dataset = Processor(dataset, func, mode=mode)
160 | return dataset
161 |
--------------------------------------------------------------------------------
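A standalone sketch of the two-level slicing DistributedSampler.sample() applies: the index list is first partitioned across ranks and then across dataloader workers; the rank/worker values below are illustrative.

data = list(range(10))
rank, world_size = 0, 2
worker_id, num_workers = 1, 2

data = data[rank::world_size]          # rank 0 of 2   -> [0, 2, 4, 6, 8]
data = data[worker_id::num_workers]    # worker 1 of 2 -> [2, 6]
print(data)
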
/cosyvoice/bin/train.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu)
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | from __future__ import print_function
15 | import os,sys
16 | os.environ["PL_TORCH_DISTRIBUTED_BACKEND"] = "gloo"
17 | import argparse
18 | import datetime
19 | import logging
20 | logging.getLogger('matplotlib').setLevel(logging.WARNING)
21 | from copy import deepcopy
22 | import torch
23 | import torch.distributed as dist
24 | import deepspeed
25 |
26 | now_dir = os.getcwd()
27 | sys.path.append(now_dir)
28 | sys.path.append("%s/cosyvoice" % (now_dir))
29 |
30 | from hyperpyyaml import load_hyperpyyaml
31 |
32 | from torch.distributed.elastic.multiprocessing.errors import record
33 |
34 | from cosyvoice.utils.executor import Executor
35 | from cosyvoice.utils.train_utils import (
36 | init_distributed,
37 | init_dataset_and_dataloader,
38 | init_optimizer_and_scheduler,
39 | init_summarywriter, save_model,
40 | wrap_cuda_model, check_modify_and_save_config)
41 |
42 |
43 | def get_args():
44 | parser = argparse.ArgumentParser(description='training your network')
45 | parser.add_argument('--train_engine',
46 | default='torch_ddp',
47 | choices=['torch_ddp', 'deepspeed'],
48 | help='Engine for parallel training')
49 | parser.add_argument('--model', required=True, help='model which will be trained')
50 | parser.add_argument('--config', required=True, help='config file')
51 | parser.add_argument('--train_data', required=True, help='train data file')
52 | parser.add_argument('--cv_data', required=True, help='cv data file')
53 | parser.add_argument('--checkpoint', help='checkpoint model')
54 | parser.add_argument('--model_dir', required=True, help='save model dir')
55 | parser.add_argument('--tensorboard_dir',
56 | default='tensorboard',
57 | help='tensorboard log dir')
58 | parser.add_argument('--ddp.dist_backend',
59 | dest='dist_backend',
60 | default='gloo',
61 | choices=['nccl', 'gloo'],
62 | help='distributed backend')
63 | parser.add_argument('--num_workers',
64 | default=0,
65 | type=int,
66 | help='num of subprocess workers for reading')
67 | parser.add_argument('--prefetch',
68 | default=100,
69 | type=int,
70 | help='prefetch number')
71 | parser.add_argument('--pin_memory',
72 | action='store_true',
73 | default=False,
74 | help='Use pinned memory buffers for reading')
75 | parser.add_argument('--deepspeed.save_states',
76 | dest='save_states',
77 | default='model_only',
78 | choices=['model_only', 'model+optimizer'],
79 | help='save model/optimizer states')
80 | parser.add_argument('--timeout',
81 | default=30,
82 | type=int,
83 | help='timeout (in seconds) of cosyvoice_join.')
84 | parser = deepspeed.add_config_arguments(parser)
85 | args = parser.parse_args()
86 | return args
87 |
88 |
89 | @record
90 | def main():
91 | args = get_args()
92 | logging.basicConfig(level=logging.DEBUG,
93 | format='%(asctime)s %(levelname)s %(message)s')
94 |
95 | override_dict = {k: None for k in ['llm', 'flow', 'hift'] if k != args.model}
96 | with open(args.config, 'r') as f:
97 | configs = load_hyperpyyaml(f, overrides=override_dict)
98 | configs['train_conf'].update(vars(args))
99 |
100 | # Init env for ddp
101 | init_distributed(args)
102 |
103 | # Get dataset & dataloader
104 | train_dataset, cv_dataset, train_data_loader, cv_data_loader = \
105 | init_dataset_and_dataloader(args, configs)
106 |
107 | # Do some sanity checks and save the config to args.model_dir
108 | configs = check_modify_and_save_config(args, configs)
109 |
110 | # Tensorboard summary
111 | writer = init_summarywriter(args)
112 |
113 | # load checkpoint
114 | model = configs[args.model]
115 | if args.checkpoint is not None:
116 | model.load_state_dict(torch.load(args.checkpoint, map_location='cpu'))
117 |
118 | # Dispatch model from cpu to gpu
119 | model = wrap_cuda_model(args, model)
120 |
121 | # Get optimizer & scheduler
122 | model, optimizer, scheduler = init_optimizer_and_scheduler(args, configs, model)
123 |
124 | # Save init checkpoints
125 | info_dict = deepcopy(configs['train_conf'])
126 | save_model(model, 'init', info_dict)
127 |
128 | # Get executor
129 | executor = Executor()
130 |
131 | # Start training loop
132 | for epoch in range(info_dict['max_epoch']):
133 | executor.epoch = epoch
134 | train_dataset.set_epoch(epoch)
135 | dist.barrier()
136 | group_join = dist.new_group(backend="gloo", timeout=datetime.timedelta(seconds=args.timeout))
137 | executor.train_one_epoc(model, optimizer, scheduler, train_data_loader, cv_data_loader, writer, info_dict, group_join)
138 | dist.destroy_process_group(group_join)
139 |
140 | if __name__ == '__main__':
141 | main()
142 |
--------------------------------------------------------------------------------
/cosyvoice/transformer/convolution.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu)
2 | # 2024 Alibaba Inc (Xiang Lyu)
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | # Modified from ESPnet(https://github.com/espnet/espnet)
16 | """ConvolutionModule definition."""
17 |
18 | from typing import Tuple
19 |
20 | import torch
21 | from torch import nn
22 |
23 |
24 | class ConvolutionModule(nn.Module):
25 | """ConvolutionModule in Conformer model."""
26 |
27 | def __init__(self,
28 | channels: int,
29 | kernel_size: int = 15,
30 | activation: nn.Module = nn.ReLU(),
31 | norm: str = "batch_norm",
32 | causal: bool = False,
33 | bias: bool = True):
34 | """Construct an ConvolutionModule object.
35 | Args:
36 | channels (int): The number of channels of conv layers.
37 | kernel_size (int): Kernel size of conv layers.
38 | causal (bool): Whether to use causal convolution or not.
39 | """
40 | super().__init__()
41 |
42 | self.pointwise_conv1 = nn.Conv1d(
43 | channels,
44 | 2 * channels,
45 | kernel_size=1,
46 | stride=1,
47 | padding=0,
48 | bias=bias,
49 | )
50 | # self.lorder distinguishes causal from symmetric convolution:
51 | # if self.lorder > 0, the convolution is causal and the input is
52 | # padded with self.lorder frames on the left in forward();
53 | # otherwise it is a symmetric convolution.
54 | if causal:
55 | padding = 0
56 | self.lorder = kernel_size - 1
57 | else:
58 | # kernel_size should be an odd number for non-causal convolution
59 | assert (kernel_size - 1) % 2 == 0
60 | padding = (kernel_size - 1) // 2
61 | self.lorder = 0
62 | self.depthwise_conv = nn.Conv1d(
63 | channels,
64 | channels,
65 | kernel_size,
66 | stride=1,
67 | padding=padding,
68 | groups=channels,
69 | bias=bias,
70 | )
71 |
72 | assert norm in ['batch_norm', 'layer_norm']
73 | if norm == "batch_norm":
74 | self.use_layer_norm = False
75 | self.norm = nn.BatchNorm1d(channels)
76 | else:
77 | self.use_layer_norm = True
78 | self.norm = nn.LayerNorm(channels)
79 |
80 | self.pointwise_conv2 = nn.Conv1d(
81 | channels,
82 | channels,
83 | kernel_size=1,
84 | stride=1,
85 | padding=0,
86 | bias=bias,
87 | )
88 | self.activation = activation
89 |
90 | def forward(
91 | self,
92 | x: torch.Tensor,
93 | mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool),
94 | cache: torch.Tensor = torch.zeros((0, 0, 0)),
95 | ) -> Tuple[torch.Tensor, torch.Tensor]:
96 | """Compute convolution module.
97 | Args:
98 | x (torch.Tensor): Input tensor (#batch, time, channels).
99 | mask_pad (torch.Tensor): used for batch padding (#batch, 1, time),
100 | (0, 0, 0) means fake mask.
101 | cache (torch.Tensor): left context cache, it is only
102 | used in causal convolution (#batch, channels, cache_t),
103 | (0, 0, 0) means fake cache.
104 | Returns:
105 | torch.Tensor: Output tensor (#batch, time, channels).
106 | """
107 | # exchange the temporal dimension and the feature dimension
108 | x = x.transpose(1, 2) # (#batch, channels, time)
109 |
110 | # mask batch padding
111 | if mask_pad.size(2) > 0: # time > 0
112 | x.masked_fill_(~mask_pad, 0.0)
113 |
114 | if self.lorder > 0:
115 | if cache.size(2) == 0: # cache_t == 0
116 | x = nn.functional.pad(x, (self.lorder, 0), 'constant', 0.0)
117 | else:
118 | assert cache.size(0) == x.size(0) # equal batch
119 | assert cache.size(1) == x.size(1) # equal channel
120 | x = torch.cat((cache, x), dim=2)
121 | assert (x.size(2) > self.lorder)
122 | new_cache = x[:, :, -self.lorder:]
123 | else:
124 | # It would be better to return None when no cache is required;
125 | # however, for JIT export we return a fake (empty) tensor instead of
126 | # None.
127 | new_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device)
128 |
129 | # GLU mechanism
130 | x = self.pointwise_conv1(x) # (batch, 2*channel, dim)
131 | x = nn.functional.glu(x, dim=1) # (batch, channel, dim)
132 |
133 | # 1D Depthwise Conv
134 | x = self.depthwise_conv(x)
135 | if self.use_layer_norm:
136 | x = x.transpose(1, 2)
137 | x = self.activation(self.norm(x))
138 | if self.use_layer_norm:
139 | x = x.transpose(1, 2)
140 | x = self.pointwise_conv2(x)
141 | # mask batch padding
142 | if mask_pad.size(2) > 0: # time > 0
143 | x.masked_fill_(~mask_pad, 0.0)
144 |
145 | return x.transpose(1, 2), new_cache
146 |
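A minimal usage sketch of the causal/cache path, assuming the module is importable from this repository; the sizes are illustrative and the shapes in the comments follow from the code above:

```python
import torch
from cosyvoice.transformer.convolution import ConvolutionModule  # assumed import path

m = ConvolutionModule(channels=80, kernel_size=15, causal=True)
x = torch.randn(2, 100, 80)                          # (batch, time, channels)
y, cache = m(x)                                      # first chunk: an empty cache is faked internally
y2, cache2 = m(torch.randn(2, 50, 80), cache=cache)  # next chunk reuses the left-context cache
print(y.shape, cache.shape)                          # torch.Size([2, 100, 80]) torch.Size([2, 80, 14])
```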
--------------------------------------------------------------------------------
/matcha/hifigan/README.md:
--------------------------------------------------------------------------------
1 | # HiFi-GAN: Generative Adversarial Networks for Efficient and High Fidelity Speech Synthesis
2 |
3 | ### Jungil Kong, Jaehyeon Kim, Jaekyoung Bae
4 |
5 | In our [paper](https://arxiv.org/abs/2010.05646),
6 | we proposed HiFi-GAN: a GAN-based model capable of generating high fidelity speech efficiently.
7 | We provide our implementation and pretrained models as open source in this repository.
8 |
9 | **Abstract :**
10 | Several recent work on speech synthesis have employed generative adversarial networks (GANs) to produce raw waveforms.
11 | Although such methods improve the sampling efficiency and memory usage,
12 | their sample quality has not yet reached that of autoregressive and flow-based generative models.
13 | In this work, we propose HiFi-GAN, which achieves both efficient and high-fidelity speech synthesis.
14 | As speech audio consists of sinusoidal signals with various periods,
15 | we demonstrate that modeling periodic patterns of an audio is crucial for enhancing sample quality.
16 | A subjective human evaluation (mean opinion score, MOS) of a single speaker dataset indicates that our proposed method
17 | demonstrates similarity to human quality while generating 22.05 kHz high-fidelity audio 167.9 times faster than
18 | real-time on a single V100 GPU. We further show the generality of HiFi-GAN to the mel-spectrogram inversion of unseen
19 | speakers and end-to-end speech synthesis. Finally, a small footprint version of HiFi-GAN generates samples 13.4 times
20 | faster than real-time on CPU with comparable quality to an autoregressive counterpart.
21 |
22 | Visit our [demo website](https://jik876.github.io/hifi-gan-demo/) for audio samples.
23 |
24 | ## Pre-requisites
25 |
26 | 1. Python >= 3.6
27 | 2. Clone this repository.
28 | 3. Install Python requirements. Please refer to [requirements.txt](requirements.txt).
29 | 4. Download and extract the [LJ Speech dataset](https://keithito.com/LJ-Speech-Dataset/).
30 | Then move all wav files to `LJSpeech-1.1/wavs`.
31 |
32 | ## Training
33 |
34 | ```
35 | python train.py --config config_v1.json
36 | ```
37 |
38 | To train the V2 or V3 generator, replace `config_v1.json` with `config_v2.json` or `config_v3.json`.
39 | Checkpoints and a copy of the configuration file are saved in the `cp_hifigan` directory by default.
40 | You can change the path by adding the `--checkpoint_path` option.
41 |
42 | Validation loss during training with V1 generator.
43 | 
44 |
45 | ## Pretrained Model
46 |
47 | You can also use pretrained models we provide.
48 | [Download pretrained models](https://drive.google.com/drive/folders/1-eEYTB5Av9jNql0WGBlRoi-WH2J7bp5Y?usp=sharing)
49 | Details of each folder are as follows:
50 |
51 | | Folder Name | Generator | Dataset | Fine-Tuned |
52 | | ------------ | --------- | --------- | ------------------------------------------------------ |
53 | | LJ_V1 | V1 | LJSpeech | No |
54 | | LJ_V2 | V2 | LJSpeech | No |
55 | | LJ_V3 | V3 | LJSpeech | No |
56 | | LJ_FT_T2_V1 | V1 | LJSpeech | Yes ([Tacotron2](https://github.com/NVIDIA/tacotron2)) |
57 | | LJ_FT_T2_V2 | V2 | LJSpeech | Yes ([Tacotron2](https://github.com/NVIDIA/tacotron2)) |
58 | | LJ_FT_T2_V3 | V3 | LJSpeech | Yes ([Tacotron2](https://github.com/NVIDIA/tacotron2)) |
59 | | VCTK_V1 | V1 | VCTK | No |
60 | | VCTK_V2 | V2 | VCTK | No |
61 | | VCTK_V3 | V3 | VCTK | No |
62 | | UNIVERSAL_V1 | V1 | Universal | No |
63 |
64 | We provide the universal model with discriminator weights that can be used as a base for transfer learning to other datasets.
65 |
66 | ## Fine-Tuning
67 |
68 | 1. Generate mel-spectrograms in numpy format using [Tacotron2](https://github.com/NVIDIA/tacotron2) with teacher-forcing.
69 | The file name of the generated mel-spectrogram should match the audio file, and the extension should be `.npy`
70 | (a minimal naming sketch is shown at the end of this section). Example:
71 | `Audio File : LJ001-0001.wav`
72 | `Mel-Spectrogram File : LJ001-0001.npy`
73 | 2. Create an `ft_dataset` folder and copy the generated mel-spectrogram files into it.
74 | 3. Run the following command.
75 | ```
76 | python train.py --fine_tuning True --config config_v1.json
77 | ```
78 | For other command line options, please refer to the training section.
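A minimal naming sketch for step 1, using a placeholder array (in practice the mel comes from Tacotron2 teacher-forcing; the paths and shape here are assumptions):

```python
import numpy as np
from pathlib import Path

# Hypothetical example: LJ001-0001.wav -> ft_dataset/LJ001-0001.npy
wav_path = Path("LJSpeech-1.1/wavs/LJ001-0001.wav")
mel = np.zeros((80, 100), dtype=np.float32)  # placeholder for the teacher-forced mel

out_dir = Path("ft_dataset")
out_dir.mkdir(exist_ok=True)
np.save(out_dir / (wav_path.stem + ".npy"), mel)
```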
79 |
80 | ## Inference from wav file
81 |
82 | 1. Make a `test_files` directory and copy wav files into it.
83 | 2. Run the following command.
84 | ` python inference.py --checkpoint_file [generator checkpoint file path]`
85 | Generated wav files are saved in `generated_files` by default.
86 | You can change the path by adding `--output_dir` option.
87 |
88 | ## Inference for end-to-end speech synthesis
89 |
90 | 1. Make a `test_mel_files` directory and copy generated mel-spectrogram files into it.
91 | You can generate mel-spectrograms using [Tacotron2](https://github.com/NVIDIA/tacotron2),
92 | [Glow-TTS](https://github.com/jaywalnut310/glow-tts) and so forth.
93 | 2. Run the following command.
94 | ` python inference_e2e.py --checkpoint_file [generator checkpoint file path]`
95 | Generated wav files are saved in `generated_files_from_mel` by default.
96 | You can change the path by adding `--output_dir` option.
97 |
98 | ## Acknowledgements
99 |
100 | We referred to [WaveGlow](https://github.com/NVIDIA/waveglow), [MelGAN](https://github.com/descriptinc/melgan-neurips)
101 | and [Tacotron2](https://github.com/NVIDIA/tacotron2) to implement this.
102 |
--------------------------------------------------------------------------------
/cosyvoice/flow/flow_matching.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Zhihao Du)
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | import torch
15 | import torch.nn.functional as F
16 | from matcha.models.components.flow_matching import BASECFM
17 |
18 | class ConditionalCFM(BASECFM):
19 | def __init__(self, in_channels, cfm_params, n_spks=1, spk_emb_dim=64, estimator: torch.nn.Module = None):
20 | super().__init__(
21 | n_feats=in_channels,
22 | cfm_params=cfm_params,
23 | n_spks=n_spks,
24 | spk_emb_dim=spk_emb_dim,
25 | )
26 | self.t_scheduler = cfm_params.t_scheduler
27 | self.training_cfg_rate = cfm_params.training_cfg_rate
28 | self.inference_cfg_rate = cfm_params.inference_cfg_rate
29 | in_channels = in_channels + (spk_emb_dim if n_spks > 0 else 0)
30 | # Just change the architecture of the estimator here
31 | self.estimator = estimator
32 |
33 | @torch.inference_mode()
34 | def forward(self, mu, mask, n_timesteps, temperature=1.0, spks=None, cond=None):
35 | """Forward diffusion
36 |
37 | Args:
38 | mu (torch.Tensor): output of encoder
39 | shape: (batch_size, n_feats, mel_timesteps)
40 | mask (torch.Tensor): output_mask
41 | shape: (batch_size, 1, mel_timesteps)
42 | n_timesteps (int): number of diffusion steps
43 | temperature (float, optional): temperature for scaling noise. Defaults to 1.0.
44 | spks (torch.Tensor, optional): speaker embedding. Defaults to None.
45 | shape: (batch_size, spk_emb_dim)
46 | cond: Not used but kept for future purposes
47 |
48 | Returns:
49 | sample: generated mel-spectrogram
50 | shape: (batch_size, n_feats, mel_timesteps)
51 | """
52 | z = torch.randn_like(mu) * temperature
53 | t_span = torch.linspace(0, 1, n_timesteps + 1, device=mu.device)
54 | if self.t_scheduler == 'cosine':
55 | t_span = 1 - torch.cos(t_span * 0.5 * torch.pi)
56 | return self.solve_euler(z, t_span=t_span, mu=mu, mask=mask, spks=spks, cond=cond)
57 |
58 | def solve_euler(self, x, t_span, mu, mask, spks, cond):
59 | """
60 | Fixed-step Euler solver for the flow ODE.
61 | Args:
62 | x (torch.Tensor): random noise
63 | t_span (torch.Tensor): n_timesteps interpolated
64 | shape: (n_timesteps + 1,)
65 | mu (torch.Tensor): output of encoder
66 | shape: (batch_size, n_feats, mel_timesteps)
67 | mask (torch.Tensor): output_mask
68 | shape: (batch_size, 1, mel_timesteps)
69 | spks (torch.Tensor, optional): speaker embedding. Defaults to None.
70 | shape: (batch_size, spk_emb_dim)
71 | cond: Not used but kept for future purposes
72 | """
73 | t, _, dt = t_span[0], t_span[-1], t_span[1] - t_span[0]
74 |
75 | # Intermediate states are stored so they can be inspected (e.g. from a debugger);
76 | # a return_all_steps flag may be added in the future.
77 | sol = []
78 |
79 | for step in range(1, len(t_span)):
80 | dphi_dt = self.estimator(x, mask, mu, t, spks, cond)
81 | # Classifier-Free Guidance inference introduced in VoiceBox
82 | if self.inference_cfg_rate > 0:
83 | cfg_dphi_dt = self.estimator(
84 | x, mask,
85 | torch.zeros_like(mu), t,
86 | torch.zeros_like(spks) if spks is not None else None,
87 | torch.zeros_like(cond)
88 | )
89 | dphi_dt = ((1.0 + self.inference_cfg_rate) * dphi_dt -
90 | self.inference_cfg_rate * cfg_dphi_dt)
91 | x = x + dt * dphi_dt
92 | t = t + dt
93 | sol.append(x)
94 | if step < len(t_span) - 1:
95 | dt = t_span[step + 1] - t
96 |
97 | return sol[-1]
98 |
99 | def compute_loss(self, x1, mask, mu, spks=None, cond=None):
100 | """Computes diffusion loss
101 |
102 | Args:
103 | x1 (torch.Tensor): Target
104 | shape: (batch_size, n_feats, mel_timesteps)
105 | mask (torch.Tensor): target mask
106 | shape: (batch_size, 1, mel_timesteps)
107 | mu (torch.Tensor): output of encoder
108 | shape: (batch_size, n_feats, mel_timesteps)
109 | spks (torch.Tensor, optional): speaker embedding. Defaults to None.
110 | shape: (batch_size, spk_emb_dim)
111 |
112 | Returns:
113 | loss: conditional flow matching loss
114 | y: conditional flow
115 | shape: (batch_size, n_feats, mel_timesteps)
116 | """
117 | b, _, t = mu.shape
118 |
119 | # random timestep
120 | t = torch.rand([b, 1, 1], device=mu.device, dtype=mu.dtype)
121 | if self.t_scheduler == 'cosine':
122 | t = 1 - torch.cos(t * 0.5 * torch.pi)
123 | # sample noise p(x_0)
124 | z = torch.randn_like(x1)
125 |
126 | y = (1 - (1 - self.sigma_min) * t) * z + t * x1
127 | u = x1 - (1 - self.sigma_min) * z
128 |
129 | pred = self.estimator(y, mask, mu, t.squeeze(), spks, cond)
130 | loss = F.mse_loss(pred * mask, u * mask, reduction="sum") / (torch.sum(mask) * u.shape[1])
131 | return loss, y
132 |
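A minimal sketch of the conditional flow constructed in `compute_loss` above; `sigma_min` and the tensor shapes are illustrative assumptions (in the code `sigma_min` comes from `cfm_params`):

```python
import torch

sigma_min = 1e-4                     # assumed value for illustration
x1 = torch.randn(2, 80, 50)          # target mel-spectrogram
z = torch.randn_like(x1)             # noise sample x_0
t = torch.rand(2, 1, 1)              # random timestep per example

y = (1 - (1 - sigma_min) * t) * z + t * x1   # point on the straight path from z to x1
u = x1 - (1 - sigma_min) * z                 # time derivative of y: the regression target
```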
--------------------------------------------------------------------------------
/academicodec/binary.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Meta Platforms, Inc. and affiliates.
2 | # All rights reserved.
3 | #
4 | # This source code is licensed under the license found in the
5 | # LICENSE file in the root directory of this source tree.
6 | """Raw binary format for Encodec compressed audio. Actual compression API is in `encodec.compress`."""
7 | import io
8 | import json
9 | import struct
10 | import typing as tp
11 |
12 | # The format is the `ECDC` magic code, followed by the protocol version as a uint8
13 | # (currently 0), then the header size as a uint32.
14 | # The header is then provided as json and should contain all required
15 | # information for decoding. A raw stream of bytes is then provided
16 | # and should be interpretable using the json header.
17 | _encodec_header_struct = struct.Struct('!4sBI')
18 | _ENCODEC_MAGIC = b'ECDC'
19 |
20 |
21 | def write_ecdc_header(fo: tp.IO[bytes], metadata: tp.Any):
22 | meta_dumped = json.dumps(metadata).encode('utf-8')
23 | version = 0
24 | header = _encodec_header_struct.pack(_ENCODEC_MAGIC, version,
25 | len(meta_dumped))
26 | fo.write(header)
27 | fo.write(meta_dumped)
28 | fo.flush()
29 |
30 |
31 | def _read_exactly(fo: tp.IO[bytes], size: int) -> bytes:
32 | buf = b""
33 | while len(buf) < size:
34 | new_buf = fo.read(size)
35 | if not new_buf:
36 | raise EOFError("Impossible to read enough data from the stream, "
37 | f"{size} bytes remaining.")
38 | buf += new_buf
39 | size -= len(new_buf)
40 | return buf
41 |
42 |
43 | def read_ecdc_header(fo: tp.IO[bytes]):
44 | header_bytes = _read_exactly(fo, _encodec_header_struct.size)
45 | magic, version, meta_size = _encodec_header_struct.unpack(header_bytes)
46 | if magic != _ENCODEC_MAGIC:
47 | raise ValueError("File is not in ECDC format.")
48 | if version != 0:
49 | raise ValueError("Version not supported.")
50 | meta_bytes = _read_exactly(fo, meta_size)
51 | return json.loads(meta_bytes.decode('utf-8'))
52 |
53 |
54 | class BitPacker:
55 | """Simple bit packer to handle ints with a non standard width, e.g. 10 bits.
56 | Note that for some bandwidth (1.5, 3), the codebook representation
57 | will not cover an integer number of bytes.
58 |
59 | Args:
60 | bits (int): number of bits per value that will be pushed.
61 | fo (IO[bytes]): file-object to push the bytes to.
62 | """
63 |
64 | def __init__(self, bits: int, fo: tp.IO[bytes]):
65 | self._current_value = 0
66 | self._current_bits = 0
67 | self.bits = bits
68 | self.fo = fo
69 |
70 | def push(self, value: int):
71 | """Push a new value to the stream. This will immediately
72 | write as many uint8 as possible to the underlying file-object."""
73 | self._current_value += (value << self._current_bits)
74 | self._current_bits += self.bits
75 | while self._current_bits >= 8:
76 | lower_8bits = self._current_value & 0xff
77 | self._current_bits -= 8
78 | self._current_value >>= 8
79 | self.fo.write(bytes([lower_8bits]))
80 |
81 | def flush(self):
82 | """Flushes the remaining partial uint8, call this at the end
83 | of the stream to encode."""
84 | if self._current_bits:
85 | self.fo.write(bytes([self._current_value]))
86 | self._current_value = 0
87 | self._current_bits = 0
88 | self.fo.flush()
89 |
90 |
91 | class BitUnpacker:
92 | """BitUnpacker does the opposite of `BitPacker`.
93 |
94 | Args:
95 | bits (int): number of bits of the values to decode.
96 | fo (IO[bytes]): file-object to pull the bytes from.
97 | """
98 |
99 | def __init__(self, bits: int, fo: tp.IO[bytes]):
100 | self.bits = bits
101 | self.fo = fo
102 | self._mask = (1 << bits) - 1
103 | self._current_value = 0
104 | self._current_bits = 0
105 |
106 | def pull(self) -> tp.Optional[int]:
107 | """
108 | Pull a single value from the stream, potentially reading some
109 | extra bytes from the underlying file-object.
110 | Returns `None` when reaching the end of the stream.
111 | """
112 | while self._current_bits < self.bits:
113 | buf = self.fo.read(1)
114 | if not buf:
115 | return None
116 | character = buf[0]
117 | self._current_value += character << self._current_bits
118 | self._current_bits += 8
119 |
120 | out = self._current_value & self._mask
121 | self._current_value >>= self.bits
122 | self._current_bits -= self.bits
123 | return out
124 |
125 |
126 | def test():
127 | import torch
128 | torch.manual_seed(1234)
129 | for rep in range(4):
130 | length: int = torch.randint(10, 2_000, (1, )).item()
131 | bits: int = torch.randint(1, 16, (1, )).item()
132 | tokens: tp.List[int] = torch.randint(2**bits, (length, )).tolist()
133 | rebuilt: tp.List[int] = []
134 | buf = io.BytesIO()
135 | packer = BitPacker(bits, buf)
136 | for token in tokens:
137 | packer.push(token)
138 | packer.flush()
139 | buf.seek(0)
140 | unpacker = BitUnpacker(bits, buf)
141 | while True:
142 | value = unpacker.pull()
143 | if value is None:
144 | break
145 | rebuilt.append(value)
146 | assert len(rebuilt) >= len(tokens), (len(rebuilt), len(tokens))
147 | # The flushing mechanism might lead to "ghost" values at the end of the stream.
148 | assert len(rebuilt) <= len(tokens) + 8 // bits, (len(rebuilt),
149 | len(tokens), bits)
150 | for idx, (a, b) in enumerate(zip(tokens, rebuilt)):
151 | assert a == b, (idx, a, b)
152 |
153 |
154 | if __name__ == '__main__':
155 | test()
156 |
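A minimal round-trip sketch of the header helpers defined above (the metadata values are arbitrary):

```python
import io
from academicodec.binary import write_ecdc_header, read_ecdc_header  # assumed import path

buf = io.BytesIO()
write_ecdc_header(buf, {"sample_rate": 16000, "bandwidth": 6.0})
buf.seek(0)
print(read_ecdc_header(buf))  # {'sample_rate': 16000, 'bandwidth': 6.0}
```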
--------------------------------------------------------------------------------
/matcha/onnx/export.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import random
3 | from pathlib import Path
4 |
5 | import numpy as np
6 | import torch
7 | from lightning import LightningModule
8 |
9 | from matcha.cli import VOCODER_URLS, load_matcha, load_vocoder
10 |
11 | DEFAULT_OPSET = 15
12 |
13 | SEED = 1234
14 | random.seed(SEED)
15 | np.random.seed(SEED)
16 | torch.manual_seed(SEED)
17 | torch.cuda.manual_seed(SEED)
18 | torch.backends.cudnn.deterministic = True
19 | torch.backends.cudnn.benchmark = False
20 |
21 |
22 | class MatchaWithVocoder(LightningModule):
23 | def __init__(self, matcha, vocoder):
24 | super().__init__()
25 | self.matcha = matcha
26 | self.vocoder = vocoder
27 |
28 | def forward(self, x, x_lengths, scales, spks=None):
29 | mel, mel_lengths = self.matcha(x, x_lengths, scales, spks)
30 | wavs = self.vocoder(mel).clamp(-1, 1)
31 | lengths = mel_lengths * 256
32 | return wavs.squeeze(1), lengths
33 |
34 |
35 | def get_exportable_module(matcha, vocoder, n_timesteps):
36 | """
37 | Return an appropriate `LightningModule` and output-node names
38 | based on whether the vocoder is embedded in the final graph
39 | """
40 |
41 | def onnx_forward_func(x, x_lengths, scales, spks=None):
42 | """
43 | Custom forward function for accepting
44 | scalar parameters as tensors
45 | """
46 | # Extract scalar parameters from tensors
47 | temperature = scales[0]
48 | length_scale = scales[1]
49 | output = matcha.synthesise(x, x_lengths, n_timesteps, temperature, spks, length_scale)
50 | return output["mel"], output["mel_lengths"]
51 |
52 | # Monkey-patch Matcha's forward function
53 | matcha.forward = onnx_forward_func
54 |
55 | if vocoder is None:
56 | model, output_names = matcha, ["mel", "mel_lengths"]
57 | else:
58 | model = MatchaWithVocoder(matcha, vocoder)
59 | output_names = ["wav", "wav_lengths"]
60 | return model, output_names
61 |
62 |
63 | def get_inputs(is_multi_speaker):
64 | """
65 | Create dummy inputs for tracing
66 | """
67 | dummy_input_length = 50
68 | x = torch.randint(low=0, high=20, size=(1, dummy_input_length), dtype=torch.long)
69 | x_lengths = torch.LongTensor([dummy_input_length])
70 |
71 | # Scales
72 | temperature = 0.667
73 | length_scale = 1.0
74 | scales = torch.Tensor([temperature, length_scale])
75 |
76 | model_inputs = [x, x_lengths, scales]
77 | input_names = [
78 | "x",
79 | "x_lengths",
80 | "scales",
81 | ]
82 |
83 | if is_multi_speaker:
84 | spks = torch.LongTensor([1])
85 | model_inputs.append(spks)
86 | input_names.append("spks")
87 |
88 | return tuple(model_inputs), input_names
89 |
90 |
91 | def main():
92 | parser = argparse.ArgumentParser(description="Export 🍵 Matcha-TTS to ONNX")
93 |
94 | parser.add_argument(
95 | "checkpoint_path",
96 | type=str,
97 | help="Path to the model checkpoint",
98 | )
99 | parser.add_argument("output", type=str, help="Path to output `.onnx` file")
100 | parser.add_argument(
101 | "--n-timesteps", type=int, default=5, help="Number of steps to use for reverse diffusion in decoder (default 5)"
102 | )
103 | parser.add_argument(
104 | "--vocoder-name",
105 | type=str,
106 | choices=list(VOCODER_URLS.keys()),
107 | default=None,
108 | help="Name of the vocoder to embed in the ONNX graph",
109 | )
110 | parser.add_argument(
111 | "--vocoder-checkpoint-path",
112 | type=str,
113 | default=None,
114 | help="Vocoder checkpoint to embed in the ONNX graph for an `e2e` like experience",
115 | )
116 | parser.add_argument("--opset", type=int, default=DEFAULT_OPSET, help="ONNX opset version to use (default 15")
117 |
118 | args = parser.parse_args()
119 |
120 | print(f"[🍵] Loading Matcha checkpoint from {args.checkpoint_path}")
121 | print(f"Setting n_timesteps to {args.n_timesteps}")
122 |
123 | checkpoint_path = Path(args.checkpoint_path)
124 | matcha = load_matcha(checkpoint_path.stem, checkpoint_path, "cpu")
125 |
126 | if args.vocoder_name or args.vocoder_checkpoint_path:
127 | assert (
128 | args.vocoder_name and args.vocoder_checkpoint_path
129 | ), "Both vocoder_name and vocoder-checkpoint are required when embedding the vocoder in the ONNX graph."
130 | vocoder, _ = load_vocoder(args.vocoder_name, args.vocoder_checkpoint_path, "cpu")
131 | else:
132 | vocoder = None
133 |
134 | is_multi_speaker = matcha.n_spks > 1
135 |
136 | dummy_input, input_names = get_inputs(is_multi_speaker)
137 | model, output_names = get_exportable_module(matcha, vocoder, args.n_timesteps)
138 |
139 | # Set dynamic shape for inputs/outputs
140 | dynamic_axes = {
141 | "x": {0: "batch_size", 1: "time"},
142 | "x_lengths": {0: "batch_size"},
143 | }
144 |
145 | if vocoder is None:
146 | dynamic_axes.update(
147 | {
148 | "mel": {0: "batch_size", 2: "time"},
149 | "mel_lengths": {0: "batch_size"},
150 | }
151 | )
152 | else:
153 | print("Embedding the vocoder in the ONNX graph")
154 | dynamic_axes.update(
155 | {
156 | "wav": {0: "batch_size", 1: "time"},
157 | "wav_lengths": {0: "batch_size"},
158 | }
159 | )
160 |
161 | if is_multi_speaker:
162 | dynamic_axes["spks"] = {0: "batch_size"}
163 |
164 | # Create the output directory (if not exists)
165 | Path(args.output).parent.mkdir(parents=True, exist_ok=True)
166 |
167 | model.to_onnx(
168 | args.output,
169 | dummy_input,
170 | input_names=input_names,
171 | output_names=output_names,
172 | dynamic_axes=dynamic_axes,
173 | opset_version=args.opset,
174 | export_params=True,
175 | do_constant_folding=True,
176 | )
177 | print(f"[🍵] ONNX model exported to {args.output}")
178 |
179 |
180 | if __name__ == "__main__":
181 | main()
182 |
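A hypothetical sketch of running the exported graph with onnxruntime for the single-speaker, no-vocoder case; the file name "matcha.onnx" and the dummy inputs are assumptions, and the input/output names follow the export code above:

```python
import numpy as np
import onnxruntime as ort

sess = ort.InferenceSession("matcha.onnx", providers=["CPUExecutionProvider"])
x = np.random.randint(0, 20, size=(1, 50)).astype(np.int64)  # dummy token ids
inputs = {
    "x": x,
    "x_lengths": np.array([x.shape[1]], dtype=np.int64),
    "scales": np.array([0.667, 1.0], dtype=np.float32),       # temperature, length_scale
}
mel, mel_lengths = sess.run(None, inputs)
```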
--------------------------------------------------------------------------------
/cosyvoice/cli/model.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu)
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | import torch
15 |
16 | class CosyVoiceModel:
17 |
18 | def __init__(self,
19 | llm: torch.nn.Module,
20 | flow: torch.nn.Module,
21 | hift: torch.nn.Module):
22 | self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
23 | self.llm = llm
24 | self.flow = flow
25 | self.hift = hift
26 |
27 | def load(self, llm_model, flow_model, hift_model):
28 | self.llm.load_state_dict(torch.load(llm_model, map_location=self.device))
29 | self.llm.to(self.device).eval()
30 | self.flow.load_state_dict(torch.load(flow_model, map_location=self.device))
31 | self.flow.to(self.device).eval()
32 | self.hift.load_state_dict(torch.load(hift_model, map_location=self.device))
33 | self.hift.to(self.device).eval()
34 |
35 | def inference(self, text, text_len, flow_embedding, llm_embedding=torch.zeros(0, 192),
36 | prompt_text=torch.zeros(1, 0, dtype=torch.int32), prompt_text_len=torch.zeros(1, dtype=torch.int32),
37 | llm_prompt_speech_token=torch.zeros(1, 0, dtype=torch.int32), llm_prompt_speech_token_len=torch.zeros(1, dtype=torch.int32),
38 | flow_prompt_speech_token=torch.zeros(1, 0, dtype=torch.int32), flow_prompt_speech_token_len=torch.zeros(1, dtype=torch.int32),
39 | prompt_speech_feat=torch.zeros(1, 0, 80), prompt_speech_feat_len=torch.zeros(1, dtype=torch.int32)):
40 | tts_speech_token = self.llm.inference(text=text.to(self.device),
41 | text_len=text_len.to(self.device),
42 | prompt_text=prompt_text.to(self.device),
43 | prompt_text_len=prompt_text_len.to(self.device),
44 | prompt_speech_token=llm_prompt_speech_token.to(self.device),
45 | prompt_speech_token_len=llm_prompt_speech_token_len.to(self.device),
46 | embedding=llm_embedding.to(self.device),
47 | beam_size=1,
48 | sampling=25,
49 | max_token_text_ratio=30,
50 | min_token_text_ratio=3)
51 | tts_mel = self.flow.inference(token=tts_speech_token,
52 | token_len=torch.tensor([tts_speech_token.size(1)], dtype=torch.int32).to(self.device),
53 | prompt_token=flow_prompt_speech_token.to(self.device),
54 | prompt_token_len=flow_prompt_speech_token_len.to(self.device),
55 | prompt_feat=prompt_speech_feat.to(self.device),
56 | prompt_feat_len=prompt_speech_feat_len.to(self.device),
57 | embedding=flow_embedding.to(self.device))
58 | tts_speech = self.hift.inference(mel=tts_mel).cpu()
59 | torch.cuda.empty_cache()
60 | return {'tts_speech': tts_speech}
61 |
62 | def inference_stream(self, text, text_len, flow_embedding, llm_embedding=torch.zeros(0, 192),
63 | prompt_text=torch.zeros(1, 0, dtype=torch.int32), prompt_text_len=torch.zeros(1, dtype=torch.int32),
64 | llm_prompt_speech_token=torch.zeros(1, 0, dtype=torch.int32), llm_prompt_speech_token_len=torch.zeros(1, dtype=torch.int32),
65 | flow_prompt_speech_token=torch.zeros(1, 0, dtype=torch.int32), flow_prompt_speech_token_len=torch.zeros(1, dtype=torch.int32),
66 | prompt_speech_feat=torch.zeros(1, 0, 80), prompt_speech_feat_len=torch.zeros(1, dtype=torch.int32)):
67 | try:
68 | tts_speech_token = next(self.llm.inference_stream(text=text.to(self.device),
69 | text_len=text_len.to(self.device),
70 | prompt_text=prompt_text.to(self.device),
71 | prompt_text_len=prompt_text_len.to(self.device),
72 | prompt_speech_token=llm_prompt_speech_token.to(self.device),
73 | prompt_speech_token_len=llm_prompt_speech_token_len.to(self.device),
74 | embedding=llm_embedding.to(self.device),
75 | beam_size=1,
76 | sampling=25,
77 | max_token_text_ratio=30,
78 | min_token_text_ratio=3))
79 | except StopIteration:
80 | print("LLM inference stream exhausted")
81 | return
82 |
83 | try:
84 | tts_mel = next(self.flow.inference_stream(token=tts_speech_token,
85 | token_len=torch.tensor([tts_speech_token.size(1)], dtype=torch.int32).to(self.device),
86 | prompt_token=flow_prompt_speech_token.to(self.device),
87 | prompt_token_len=flow_prompt_speech_token_len.to(self.device),
88 | prompt_feat=prompt_speech_feat.to(self.device),
89 | prompt_feat_len=prompt_speech_feat_len.to(self.device),
90 | embedding=flow_embedding.to(self.device)))
91 | except StopIteration:
92 | print("Flow inference stream exhausted")
93 | return
94 |
95 | try:
96 | tts_speech = next(self.hift.inference_stream(mel=tts_mel))
97 | except StopIteration:
98 | print("HIFT inference stream exhausted")
99 | return
100 |
101 | tts_speech = tts_speech.cpu()
102 | torch.cuda.empty_cache()
103 | yield {'tts_speech': tts_speech}
--------------------------------------------------------------------------------
/cosyvoice/cli/zh_normalization/text_normlization.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | import re
15 | from typing import List
16 |
17 | from .char_convert import tranditional_to_simplified
18 | from .chronology import RE_DATE
19 | from .chronology import RE_DATE2
20 | from .chronology import RE_TIME
21 | from .chronology import RE_TIME_RANGE
22 | from .chronology import replace_date
23 | from .chronology import replace_date2
24 | from .chronology import replace_time
25 | from .constants import F2H_ASCII_LETTERS
26 | from .constants import F2H_DIGITS
27 | from .constants import F2H_SPACE
28 | from .num import RE_DECIMAL_NUM
29 | from .num import RE_DEFAULT_NUM
30 | from .num import RE_FRAC
31 | from .num import RE_INTEGER
32 | from .num import RE_NUMBER
33 | from .num import RE_PERCENTAGE
34 | from .num import RE_POSITIVE_QUANTIFIERS
35 | from .num import RE_RANGE
36 | from .num import replace_default_num
37 | from .num import replace_frac
38 | from .num import replace_negative_num
39 | from .num import replace_number
40 | from .num import replace_percentage
41 | from .num import replace_positive_quantifier
42 | from .num import replace_range
43 | from .phonecode import RE_MOBILE_PHONE
44 | from .phonecode import RE_NATIONAL_UNIFORM_NUMBER
45 | from .phonecode import RE_TELEPHONE
46 | from .phonecode import replace_mobile
47 | from .phonecode import replace_phone
48 | from .quantifier import RE_TEMPERATURE
49 | from .quantifier import replace_measure
50 | from .quantifier import replace_temperature
51 |
52 |
53 | class TextNormalizer():
54 | def __init__(self):
55 | self.SENTENCE_SPLITOR = re.compile(r'([:、,;。?!,;?!][”’]?)')
56 |
57 | def _split(self, text: str, lang="zh") -> List[str]:
58 | """Split long text into sentences with sentence-splitting punctuations.
59 | Args:
60 | text (str): The input text.
61 | Returns:
62 | List[str]: Sentences.
63 | """
64 | # Only for pure Chinese here
65 | if lang == "zh":
66 | text = text.replace(" ", "")
67 | # filter out special characters
68 | text = re.sub(r'[——《》【】<=>{}()()#&@“”^_|…\\]', '', text)
69 | text = self.SENTENCE_SPLITOR.sub(r'\1\n', text)
70 | text = text.strip()
71 | sentences = [sentence.strip() for sentence in re.split(r'\n+', text)]
72 | return sentences
73 |
74 | def _post_replace(self, sentence: str) -> str:
75 | sentence = sentence.replace('/', '每')
76 | sentence = sentence.replace('~', '至')
77 | sentence = sentence.replace('~', '至')
78 | sentence = sentence.replace('①', '一')
79 | sentence = sentence.replace('②', '二')
80 | sentence = sentence.replace('③', '三')
81 | sentence = sentence.replace('④', '四')
82 | sentence = sentence.replace('⑤', '五')
83 | sentence = sentence.replace('⑥', '六')
84 | sentence = sentence.replace('⑦', '七')
85 | sentence = sentence.replace('⑧', '八')
86 | sentence = sentence.replace('⑨', '九')
87 | sentence = sentence.replace('⑩', '十')
88 | sentence = sentence.replace('α', '阿尔法')
89 | sentence = sentence.replace('β', '贝塔')
90 | sentence = sentence.replace('γ', '伽玛').replace('Γ', '伽玛')
91 | sentence = sentence.replace('δ', '德尔塔').replace('Δ', '德尔塔')
92 | sentence = sentence.replace('ε', '艾普西龙')
93 | sentence = sentence.replace('ζ', '捷塔')
94 | sentence = sentence.replace('η', '依塔')
95 | sentence = sentence.replace('θ', '西塔').replace('Θ', '西塔')
96 | sentence = sentence.replace('ι', '艾欧塔')
97 | sentence = sentence.replace('κ', '喀帕')
98 | sentence = sentence.replace('λ', '拉姆达').replace('Λ', '拉姆达')
99 | sentence = sentence.replace('μ', '缪')
100 | sentence = sentence.replace('ν', '拗')
101 | sentence = sentence.replace('ξ', '克西').replace('Ξ', '克西')
102 | sentence = sentence.replace('ο', '欧米克伦')
103 | sentence = sentence.replace('π', '派').replace('Π', '派')
104 | sentence = sentence.replace('ρ', '肉')
105 | sentence = sentence.replace('ς', '西格玛').replace('Σ', '西格玛').replace(
106 | 'σ', '西格玛')
107 | sentence = sentence.replace('τ', '套')
108 | sentence = sentence.replace('υ', '宇普西龙')
109 | sentence = sentence.replace('φ', '服艾').replace('Φ', '服艾')
110 | sentence = sentence.replace('χ', '器')
111 | sentence = sentence.replace('ψ', '普赛').replace('Ψ', '普赛')
112 | sentence = sentence.replace('ω', '欧米伽').replace('Ω', '欧米伽')
113 | # regex filter for special characters; includes one more character ("-") than the filter at line 68
114 | sentence = re.sub(r'[-——《》【】<=>{}()()#&@“”^_|…\\]', '', sentence)
115 | return sentence
116 |
117 | def normalize_sentence(self, sentence: str) -> str:
118 | # basic character conversions
119 | sentence = tranditional_to_simplified(sentence)
120 | sentence = sentence.translate(F2H_ASCII_LETTERS).translate(
121 | F2H_DIGITS).translate(F2H_SPACE)
122 |
123 | # number related NSW verbalization
124 | sentence = RE_DATE.sub(replace_date, sentence)
125 | sentence = RE_DATE2.sub(replace_date2, sentence)
126 |
127 | # range first
128 | sentence = RE_TIME_RANGE.sub(replace_time, sentence)
129 | sentence = RE_TIME.sub(replace_time, sentence)
130 |
131 | sentence = RE_TEMPERATURE.sub(replace_temperature, sentence)
132 | sentence = replace_measure(sentence)
133 | sentence = RE_FRAC.sub(replace_frac, sentence)
134 | sentence = RE_PERCENTAGE.sub(replace_percentage, sentence)
135 | sentence = RE_MOBILE_PHONE.sub(replace_mobile, sentence)
136 |
137 | sentence = RE_TELEPHONE.sub(replace_phone, sentence)
138 | sentence = RE_NATIONAL_UNIFORM_NUMBER.sub(replace_phone, sentence)
139 |
140 | sentence = RE_RANGE.sub(replace_range, sentence)
141 | sentence = RE_INTEGER.sub(replace_negative_num, sentence)
142 | sentence = RE_DECIMAL_NUM.sub(replace_number, sentence)
143 | sentence = RE_POSITIVE_QUANTIFIERS.sub(replace_positive_quantifier,
144 | sentence)
145 | sentence = RE_DEFAULT_NUM.sub(replace_default_num, sentence)
146 | sentence = RE_NUMBER.sub(replace_number, sentence)
147 | sentence = self._post_replace(sentence)
148 |
149 | return sentence
150 |
151 | def normalize(self, text: str) -> List[str]:
152 | sentences = self._split(text)
153 | sentences = [self.normalize_sentence(sent) for sent in sentences]
154 | return sentences
155 |
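A minimal usage sketch, assuming the package (including the data-backed `char_convert` helper it imports) is importable from the repository root; the example sentence is arbitrary:

```python
from cosyvoice.cli.zh_normalization.text_normlization import TextNormalizer

tn = TextNormalizer()
for sent in tn.normalize("今天是2024年5月1日,气温23℃,电话13912345678。"):
    print(sent)
```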
--------------------------------------------------------------------------------
/academicodec/models/encodec/test.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Meta Platforms, Inc. and affiliates.
2 | # All rights reserved.
3 | #
4 | # This source code is licensed under the license found in the
5 | # LICENSE file in the root directory of this source tree.
6 | """Command-line for audio compression."""
7 | import argparse
8 | import os
9 | import sys
10 | import typing as tp
11 | from collections import OrderedDict
12 | from pathlib import Path
13 |
14 | import librosa
15 | import soundfile as sf
16 | import torch
17 | from academicodec.models.encodec.net3 import SoundStream
18 |
19 |
20 | def save_audio(wav: torch.Tensor,
21 | path: tp.Union[Path, str],
22 | sample_rate: int,
23 | rescale: bool=False):
24 | limit = 0.99
25 | mx = wav.abs().max()
26 | if rescale:
27 | wav = wav * min(limit / mx, 1)
28 | else:
29 | wav = wav.clamp(-limit, limit)
30 | wav = wav.squeeze().cpu().numpy()
31 | sf.write(path, wav, sample_rate)
32 |
33 |
34 | def get_parser():
35 | parser = argparse.ArgumentParser(
36 | 'encodec',
37 | description='High fidelity neural audio codec. '
38 | 'If input is a .ecdc, decompresses it. '
39 | 'If input is .wav, compresses it. If output is also wav, '
40 | 'do a compression/decompression cycle.')
41 | parser.add_argument(
42 | '--input',
43 | type=Path,
44 | help='Input file, whatever is supported by torchaudio on your system.')
45 | parser.add_argument(
46 | '--output',
47 | type=Path,
48 | nargs='?',
49 | help='Output file, otherwise inferred from input file.')
50 | parser.add_argument(
51 | '--resume_path', type=str, default='resume_path', help='path to the checkpoint to resume from')
52 | parser.add_argument(
53 | '--sr', type=int, default=16000, help='sample rate of model')
54 | parser.add_argument(
55 | '-r',
56 | '--rescale',
57 | action='store_true',
58 | help='Automatically rescale the output to avoid clipping.')
59 | parser.add_argument(
60 | '--ratios',
61 | type=int,
62 | nargs='+',
63 | # prod(ratios) = hop_size
64 | default=[8, 5, 4, 2],
65 | help='ratios of SoundStream, should be set for different hop_size (32d, 320, 240d, ...)'
66 | )
67 | parser.add_argument(
68 | '--target_bandwidths',
69 | type=float,
70 | nargs='+',
71 | # default for 16k_320d
72 | default=[1, 1.5, 2, 4, 6, 12],
73 | help='target_bandwidths of net3.py')
74 | parser.add_argument(
75 | '--target_bw',
76 | type=float,
77 | # default for 16k_320d
78 | default=12,
79 | help='target_bw of net3.py')
80 |
81 | return parser
82 |
83 |
84 | def fatal(*args):
85 | print(*args, file=sys.stderr)
86 | sys.exit(1)
87 |
88 |
89 | # this only prints a warning; it does not actually clip
90 | def check_clipping(wav, rescale):
91 | if rescale:
92 | return
93 | mx = wav.abs().max()
94 | limit = 0.99
95 | if mx > limit:
96 | print(
97 | f"Clipping!! max scale {mx}, limit is {limit}. "
98 | "To avoid clipping, use the `-r` option to rescale the output.",
99 | file=sys.stderr)
100 |
101 |
102 | def test_one(args, wav_root, store_root, rescale, soundstream):
103 | # torchaudio.load keeps the original sample rate of the audio and does not resample automatically
104 | # wav, sr = torchaudio.load(wav_root)
105 | # # take a single channel, output shape [1, T]
106 | # wav = wav[0].unsqueeze(0)
107 | # # resample to the model's sample rate
108 | # wav = torchaudio.transforms.Resample(orig_freq=sr, new_freq=args.sr)(wav)
109 |
110 | # load wav with librosa
111 | wav, sr = librosa.load(wav_root, sr=args.sr)
112 | wav = torch.tensor(wav).unsqueeze(0)
113 |
114 | # add channel axis: (1, T) -> (1, 1, T), and move to GPU
115 | wav = wav.unsqueeze(1).cuda()
116 |
117 | # compressing
118 | compressed = soundstream.encode(wav, target_bw=args.target_bw)
119 | print('finish compressing')
120 | out = soundstream.decode(compressed)
121 | out = out.detach().cpu().squeeze(0)
122 | check_clipping(out, rescale)
123 | save_audio(wav=out, path=store_root, sample_rate=args.sr, rescale=rescale)
124 | print('finish decompressing')
125 |
126 |
127 | def remove_encodec_weight_norm(model):
128 | from academicodec.modules import SConv1d
129 | from academicodec.modules.seanet import SConvTranspose1d
130 | from academicodec.modules.seanet import SEANetResnetBlock
131 | from torch.nn.utils import remove_weight_norm
132 |
133 | encoder = model.encoder.model
134 | for key in encoder._modules:
135 | if isinstance(encoder._modules[key], SEANetResnetBlock):
136 | remove_weight_norm(encoder._modules[key].shortcut.conv.conv)
137 | block_modules = encoder._modules[key].block._modules
138 | for skey in block_modules:
139 | if isinstance(block_modules[skey], SConv1d):
140 | remove_weight_norm(block_modules[skey].conv.conv)
141 | elif isinstance(encoder._modules[key], SConv1d):
142 | remove_weight_norm(encoder._modules[key].conv.conv)
143 |
144 | decoder = model.decoder.model
145 | for key in decoder._modules:
146 | if isinstance(decoder._modules[key], SEANetResnetBlock):
147 | remove_weight_norm(decoder._modules[key].shortcut.conv.conv)
148 | block_modules = decoder._modules[key].block._modules
149 | for skey in block_modules:
150 | if isinstance(block_modules[skey], SConv1d):
151 | remove_weight_norm(block_modules[skey].conv.conv)
152 | elif isinstance(decoder._modules[key], SConvTranspose1d):
153 | remove_weight_norm(decoder._modules[key].convtr.convtr)
154 | elif isinstance(decoder._modules[key], SConv1d):
155 | remove_weight_norm(decoder._modules[key].conv.conv)
156 |
157 |
158 | def test_batch():
159 | args = get_parser().parse_args()
160 | print("args.target_bandwidths:", args.target_bandwidths)
161 | if not args.input.exists():
162 | fatal(f"Input file {args.input} does not exist.")
163 | input_lists = os.listdir(args.input)
164 | input_lists.sort()
165 | soundstream = SoundStream(
166 | n_filters=32,
167 | D=512,
168 | ratios=args.ratios,
169 | sample_rate=args.sr,
170 | target_bandwidths=args.target_bandwidths)
171 | parameter_dict = torch.load(args.resume_path)
172 | new_state_dict = OrderedDict()
173 | # k is "module.xxx.weight", v is the weight tensor
174 | for k, v in parameter_dict.items():
175 | # strip the leading `module.` prefix, keeping xxx.weight
176 | name = k[7:]
177 | new_state_dict[name] = v
178 | soundstream.load_state_dict(new_state_dict) # load model
179 | remove_encodec_weight_norm(soundstream)
180 | soundstream.cuda()
181 | soundstream.eval()
182 | os.makedirs(args.output, exist_ok=True)
183 | for audio in input_lists:
184 | test_one(
185 | args=args,
186 | wav_root=os.path.join(args.input, audio),
187 | store_root=os.path.join(args.output, audio),
188 | rescale=args.rescale,
189 | soundstream=soundstream)
190 |
191 |
192 | if __name__ == '__main__':
193 | test_batch()
194 |
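A minimal sketch of the `module.` prefix stripping done above, which is needed when the checkpoint was saved from a DataParallel/DDP-wrapped model (the dictionary values are placeholders):

```python
state = {"module.encoder.weight": 1, "module.decoder.bias": 2}
cleaned = {k[len("module."):]: v for k, v in state.items()}
print(cleaned)  # {'encoder.weight': 1, 'decoder.bias': 2}
```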
--------------------------------------------------------------------------------
/cosyvoice/cli/zh_normalization/num.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """
15 | Rules to verbalize numbers into Chinese characters.
16 | https://zh.wikipedia.org/wiki/中文数字#現代中文
17 | """
18 | import re
19 | from collections import OrderedDict
20 | from typing import List
21 |
22 | DIGITS = {str(i): tran for i, tran in enumerate('零一二三四五六七八九')}
23 | UNITS = OrderedDict({
24 | 1: '十',
25 | 2: '百',
26 | 3: '千',
27 | 4: '万',
28 | 8: '亿',
29 | })
30 |
31 | COM_QUANTIFIERS = '(封|艘|把|目|套|段|人|所|朵|匹|张|座|回|场|尾|条|个|首|阙|阵|网|炮|顶|丘|棵|只|支|袭|辆|挑|担|颗|壳|窠|曲|墙|群|腔|砣|座|客|贯|扎|捆|刀|令|打|手|罗|坡|山|岭|江|溪|钟|队|单|双|对|出|口|头|脚|板|跳|枝|件|贴|针|线|管|名|位|身|堂|课|本|页|家|户|层|丝|毫|厘|分|钱|两|斤|担|铢|石|钧|锱|忽|(千|毫|微)克|毫|厘|(公)分|分|寸|尺|丈|里|寻|常|铺|程|(千|分|厘|毫|微)米|米|撮|勺|合|升|斗|石|盘|碗|碟|叠|桶|笼|盆|盒|杯|钟|斛|锅|簋|篮|盘|桶|罐|瓶|壶|卮|盏|箩|箱|煲|啖|袋|钵|年|月|日|季|刻|时|周|天|秒|分|小时|旬|纪|岁|世|更|夜|春|夏|秋|冬|代|伏|辈|丸|泡|粒|颗|幢|堆|条|根|支|道|面|片|张|颗|块|元|(亿|千万|百万|万|千|百)|(亿|千万|百万|万|千|百|美|)元|(亿|千万|百万|万|千|百|十|)吨|(亿|千万|百万|万|千|百|)块|角|毛|分)'
32 |
33 | # fraction expressions
34 | RE_FRAC = re.compile(r'(-?)(\d+)/(\d+)')
35 |
36 |
37 | def replace_frac(match) -> str:
38 | """
39 | Args:
40 | match (re.Match)
41 | Returns:
42 | str
43 | """
44 | sign = match.group(1)
45 | nominator = match.group(2)
46 | denominator = match.group(3)
47 | sign: str = "负" if sign else ""
48 | nominator: str = num2str(nominator)
49 | denominator: str = num2str(denominator)
50 | result = f"{sign}{denominator}分之{nominator}"
51 | return result
52 |
53 |
54 | # percentage expressions
55 | RE_PERCENTAGE = re.compile(r'(-?)(\d+(\.\d+)?)%')
56 |
57 |
58 | def replace_percentage(match) -> str:
59 | """
60 | Args:
61 | match (re.Match)
62 | Returns:
63 | str
64 | """
65 | sign = match.group(1)
66 | percent = match.group(2)
67 | sign: str = "负" if sign else ""
68 | percent: str = num2str(percent)
69 | result = f"{sign}百分之{percent}"
70 | return result
71 |
72 |
73 | # integer expressions
74 | # negative integers, e.g. -10
75 | RE_INTEGER = re.compile(r'(-)' r'(\d+)')
76 |
77 |
78 | def replace_negative_num(match) -> str:
79 | """
80 | Args:
81 | match (re.Match)
82 | Returns:
83 | str
84 | """
85 | sign = match.group(1)
86 | number = match.group(2)
87 | sign: str = "负" if sign else ""
88 | number: str = num2str(number)
89 | result = f"{sign}{number}"
90 | return result
91 |
92 |
93 | # identifier-style unsigned integers
94 | # e.g. 00078
95 | RE_DEFAULT_NUM = re.compile(r'\d{3}\d*')
96 |
97 |
98 | def replace_default_num(match):
99 | """
100 | Args:
101 | match (re.Match)
102 | Returns:
103 | str
104 | """
105 | number = match.group(0)
106 | return verbalize_digit(number, alt_one=True)
107 |
108 |
109 | # number expressions
110 | # pure decimals
111 | RE_DECIMAL_NUM = re.compile(r'(-?)((\d+)(\.\d+))' r'|(\.(\d+))')
112 | # positive integer + quantifier
113 | RE_POSITIVE_QUANTIFIERS = re.compile(r"(\d+)([多余几\+])?" + COM_QUANTIFIERS)
114 | RE_NUMBER = re.compile(r'(-?)((\d+)(\.\d+)?)' r'|(\.(\d+))')
115 |
116 |
117 | def replace_positive_quantifier(match) -> str:
118 | """
119 | Args:
120 | match (re.Match)
121 | Returns:
122 | str
123 | """
124 | number = match.group(1)
125 | match_2 = match.group(2)
126 | if match_2 == "+":
127 | match_2 = "多"
128 | match_2: str = match_2 if match_2 else ""
129 | quantifiers: str = match.group(3)
130 | number: str = num2str(number)
131 | result = f"{number}{match_2}{quantifiers}"
132 | return result
133 |
134 |
135 | def replace_number(match) -> str:
136 | """
137 | Args:
138 | match (re.Match)
139 | Returns:
140 | str
141 | """
142 | sign = match.group(1)
143 | number = match.group(2)
144 | pure_decimal = match.group(5)
145 | if pure_decimal:
146 | result = num2str(pure_decimal)
147 | else:
148 | sign: str = "负" if sign else ""
149 | number: str = num2str(number)
150 | result = f"{sign}{number}"
151 | return result
152 |
153 |
154 | # range expressions
155 | # match.group(1) and match.group(8) are copies from RE_NUMBER
156 |
157 | RE_RANGE = re.compile(
158 | r'((-?)((\d+)(\.\d+)?)|(\.(\d+)))[-~]((-?)((\d+)(\.\d+)?)|(\.(\d+)))')
159 |
160 |
161 | def replace_range(match) -> str:
162 | """
163 | Args:
164 | match (re.Match)
165 | Returns:
166 | str
167 | """
168 | first, second = match.group(1), match.group(8)
169 | first = RE_NUMBER.sub(replace_number, first)
170 | second = RE_NUMBER.sub(replace_number, second)
171 | result = f"{first}到{second}"
172 | return result
173 |
174 |
175 | def _get_value(value_string: str, use_zero: bool=True) -> List[str]:
176 | stripped = value_string.lstrip('0')
177 | if len(stripped) == 0:
178 | return []
179 | elif len(stripped) == 1:
180 | if use_zero and len(stripped) < len(value_string):
181 | return [DIGITS['0'], DIGITS[stripped]]
182 | else:
183 | return [DIGITS[stripped]]
184 | else:
185 | largest_unit = next(
186 | power for power in reversed(UNITS.keys()) if power < len(stripped))
187 | first_part = value_string[:-largest_unit]
188 | second_part = value_string[-largest_unit:]
189 | return _get_value(first_part) + [UNITS[largest_unit]] + _get_value(
190 | second_part)
191 |
192 |
193 | def verbalize_cardinal(value_string: str) -> str:
194 | if not value_string:
195 | return ''
196 |
197 | # 000 -> '零' , 0 -> '零'
198 | value_string = value_string.lstrip('0')
199 | if len(value_string) == 0:
200 | return DIGITS['0']
201 |
202 | result_symbols = _get_value(value_string)
203 | # verbalized number starting with '一十*' is abbreviated as `十*`
204 | if len(result_symbols) >= 2 and result_symbols[0] == DIGITS[
205 | '1'] and result_symbols[1] == UNITS[1]:
206 | result_symbols = result_symbols[1:]
207 | return ''.join(result_symbols)
208 |
209 |
210 | def verbalize_digit(value_string: str, alt_one=False) -> str:
211 | result_symbols = [DIGITS[digit] for digit in value_string]
212 | result = ''.join(result_symbols)
213 | if alt_one:
214 | result = result.replace("一", "幺")
215 | return result
216 |
217 |
218 | def num2str(value_string: str) -> str:
219 | integer_decimal = value_string.split('.')
220 | if len(integer_decimal) == 1:
221 | integer = integer_decimal[0]
222 | decimal = ''
223 | elif len(integer_decimal) == 2:
224 | integer, decimal = integer_decimal
225 | else:
226 | raise ValueError(
227 | f"The value string: '${value_string}' has more than one point in it."
228 | )
229 |
230 | result = verbalize_cardinal(integer)
231 |
232 | decimal = decimal.rstrip('0')
233 | if decimal:
234 | # '.22' is verbalized as '零点二二'
235 | # '3.20' is verbalized as '三点二'
236 | result = result if result else "零"
237 | result += '点' + verbalize_digit(decimal)
238 | return result
239 |
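A minimal usage sketch of the helpers above, assuming the repository root is on `sys.path`:

```python
from cosyvoice.cli.zh_normalization.num import num2str, verbalize_digit

print(num2str("10086.50"))                    # 一万零八十六点五
print(verbalize_digit("2024", alt_one=True))  # 二零二四 (一 is replaced by 幺 when present)
```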
--------------------------------------------------------------------------------
/academicodec/utils.py:
--------------------------------------------------------------------------------
1 | import glob
2 | import json
3 | import os
4 | import random
5 | import sys
6 | import time
7 | import warnings
8 |
9 | import matplotlib
10 | import numpy as np
11 | import torch
12 | import yaml
13 | from torch import distributed as dist
14 | from torch.nn.utils import weight_norm
15 | matplotlib.use("Agg")
16 | import matplotlib.pylab as plt
17 | import re
18 | import pathlib
19 |
20 |
21 | def seed_everything(seed, cudnn_deterministic=False):
22 | """
23 | Function that sets seed for pseudo-random number generators in:
24 | pytorch, numpy, python.random
25 |
26 | Args:
27 | seed: the integer value seed for global random state
28 | """
29 | if seed is not None:
30 | # print(f"Global seed set to {seed}")
31 | random.seed(seed)
32 | np.random.seed(seed)
33 | torch.manual_seed(seed)
34 | torch.cuda.manual_seed_all(seed)
35 |
36 | # if cudnn_deterministic:
37 | # torch.backends.cudnn.deterministic = True
38 | # warnings.warn('You have chosen to seed training. '
39 | # 'This will turn on the CUDNN deterministic setting, '
40 | # 'which can slow down your training considerably! '
41 | # 'You may see unexpected behavior when restarting '
42 | # 'from checkpoints.')
43 |
44 |
45 | def is_primary():
46 | return get_rank() == 0
47 |
48 |
49 | def get_rank():
50 | if not dist.is_available():
51 | return 0
52 | if not dist.is_initialized():
53 | return 0
54 |
55 | return dist.get_rank()
56 |
57 |
58 | def load_yaml_config(path):
59 | with open(path) as f:
60 | config = yaml.full_load(f)
61 | return config
62 |
63 |
64 | def save_config_to_yaml(config, path):
65 | assert path.endswith('.yaml')
66 | with open(path, 'w') as f:
67 | f.write(yaml.dump(config))
68 |         # the with-block closes the file; no explicit close() is needed
69 |
70 |
71 | def save_dict_to_json(d, path, indent=None):
72 | json.dump(d, open(path, 'w'), indent=indent)
73 |
74 |
75 | def load_dict_from_json(path):
76 | return json.load(open(path, 'r'))
77 |
78 |
79 | def write_args(args, path):
80 | args_dict = dict((name, getattr(args, name)) for name in dir(args)
81 | if not name.startswith('_'))
82 | with open(path, 'a') as args_file:
83 | args_file.write('==> torch version: {}\n'.format(torch.__version__))
84 | args_file.write(
85 | '==> cudnn version: {}\n'.format(torch.backends.cudnn.version()))
86 | args_file.write('==> Cmd:\n')
87 | args_file.write(str(sys.argv))
88 | args_file.write('\n==> args:\n')
89 | for k, v in sorted(args_dict.items()):
90 | args_file.write(' %s: %s\n' % (str(k), str(v)))
91 | args_file.close()
92 |
93 |
94 | class Logger(object):
95 | def __init__(self, args):
96 | self.args = args
97 | self.save_dir = args.save_dir
98 | self.is_primary = is_primary()
99 |
100 | if self.is_primary:
101 | os.makedirs(self.save_dir, exist_ok=True)
102 |
103 | # save the args and config
104 | self.config_dir = os.path.join(self.save_dir, 'configs')
105 | os.makedirs(self.config_dir, exist_ok=True)
106 | file_name = os.path.join(self.config_dir, 'args.txt')
107 | write_args(args, file_name)
108 |
109 | log_dir = os.path.join(self.save_dir, 'logs')
110 | if not os.path.exists(log_dir):
111 | os.makedirs(log_dir, exist_ok=True)
112 | self.text_writer = open(os.path.join(log_dir, 'log.txt'),
113 | 'a') # 'w')
114 | if args.tensorboard:
115 | self.log_info('using tensorboard')
116 |                 from torch.utils.tensorboard import SummaryWriter
117 |                 # imported explicitly: `import torch` alone does not expose torch.utils.tensorboard
118 |                 self.tb_writer = SummaryWriter(log_dir=log_dir)
119 | else:
120 | self.tb_writer = None
121 |
122 | def save_config(self, config):
123 | if self.is_primary:
124 | save_config_to_yaml(config,
125 | os.path.join(self.config_dir, 'config.yaml'))
126 |
127 | def log_info(self, info, check_primary=True):
128 | if self.is_primary or (not check_primary):
129 | print(info)
130 | if self.is_primary:
131 | info = str(info)
132 | time_str = time.strftime('%Y-%m-%d-%H-%M')
133 | info = '{}: {}'.format(time_str, info)
134 | if not info.endswith('\n'):
135 | info += '\n'
136 | self.text_writer.write(info)
137 | self.text_writer.flush()
138 |
139 | def add_scalar(self, **kargs):
140 | """Log a scalar variable."""
141 | if self.is_primary:
142 | if self.tb_writer is not None:
143 | self.tb_writer.add_scalar(**kargs)
144 |
145 | def add_scalars(self, **kargs):
146 |         """Log multiple scalar variables."""
147 | if self.is_primary:
148 | if self.tb_writer is not None:
149 | self.tb_writer.add_scalars(**kargs)
150 |
151 | def add_image(self, **kargs):
152 |         """Log an image."""
153 | if self.is_primary:
154 | if self.tb_writer is not None:
155 | self.tb_writer.add_image(**kargs)
156 |
157 | def add_images(self, **kargs):
158 |         """Log multiple images."""
159 | if self.is_primary:
160 | if self.tb_writer is not None:
161 | self.tb_writer.add_images(**kargs)
162 |
163 | def close(self):
164 | if self.is_primary:
165 | self.text_writer.close()
166 |             if self.tb_writer is not None:
167 |                 self.tb_writer.close()
168 |
169 | def plot_spectrogram(spectrogram):
170 | fig, ax = plt.subplots(figsize=(10, 2))
171 | im = ax.imshow(
172 | spectrogram, aspect="auto", origin="lower", interpolation='none')
173 | plt.colorbar(im, ax=ax)
174 |
175 | fig.canvas.draw()
176 | plt.close()
177 |
178 | return fig
179 |
180 |
181 | def init_weights(m, mean=0.0, std=0.01):
182 | classname = m.__class__.__name__
183 | if classname.find("Conv") != -1:
184 | m.weight.data.normal_(mean, std)
185 |
186 |
187 | def apply_weight_norm(m):
188 | classname = m.__class__.__name__
189 | if classname.find("Conv") != -1:
190 | weight_norm(m)
191 |
192 |
193 | def get_padding(kernel_size, dilation=1):
194 | return int((kernel_size * dilation - dilation) / 2)
195 |
196 |
197 | def load_checkpoint(filepath, device):
198 | assert os.path.isfile(filepath)
199 | print("Loading '{}'".format(filepath))
200 | checkpoint_dict = torch.load(filepath, map_location=device)
201 | print("Complete.")
202 | return checkpoint_dict
203 |
204 |
205 | def save_checkpoint(filepath, obj, num_ckpt_keep=5):
206 | name = re.match(r'(do|g)_\d+', pathlib.Path(filepath).name).group(1)
207 | ckpts = sorted(pathlib.Path(filepath).parent.glob(f'{name}_*'))
208 | if len(ckpts) > num_ckpt_keep:
209 | [os.remove(c) for c in ckpts[:-num_ckpt_keep]]
210 | print("Saving checkpoint to {}".format(filepath))
211 | torch.save(obj, filepath)
212 | print("Complete.")
213 |
214 |
215 | def scan_checkpoint(cp_dir, prefix):
216 | pattern = os.path.join(cp_dir, prefix + '????????')
217 | cp_list = glob.glob(pattern)
218 | if len(cp_list) == 0:
219 | return None
220 | return sorted(cp_list)[-1]
221 |
--------------------------------------------------------------------------------
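For reference, a short sketch of the checkpoint naming convention that save_checkpoint and scan_checkpoint in utils.py above expect: files named like g_00001000 / do_00001000, where the 8-digit zero-padded step matches scan_checkpoint's '????????' glob and the g_/do_ prefix matches the (do|g)_\d+ pattern. The paths and objects below are hypothetical placeholders.

import os

ckpt_dir = "exp/hificodec"                           # assumed output directory
step = 1000
g_path = os.path.join(ckpt_dir, f"g_{step:08d}")     # generator checkpoint
do_path = os.path.join(ckpt_dir, f"do_{step:08d}")   # discriminator/optimizer checkpoint

# save_checkpoint(g_path, {"generator": generator.state_dict()})  # prunes all but the newest 5 g_* files
# latest = scan_checkpoint(ckpt_dir, "g_")                        # newest g_???????? file, or None if absent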
/matcha/hifigan/meldataset.py:
--------------------------------------------------------------------------------
1 | """ from https://github.com/jik876/hifi-gan """
2 |
3 | import math
4 | import os
5 | import random
6 |
7 | import numpy as np
8 | import torch
9 | import torch.utils.data
10 | from librosa.filters import mel as librosa_mel_fn
11 | from librosa.util import normalize
12 | from scipy.io.wavfile import read
13 |
14 | MAX_WAV_VALUE = 32768.0
15 |
16 |
17 | def load_wav(full_path):
18 | sampling_rate, data = read(full_path)
19 | return data, sampling_rate
20 |
21 |
22 | def dynamic_range_compression(x, C=1, clip_val=1e-5):
23 | return np.log(np.clip(x, a_min=clip_val, a_max=None) * C)
24 |
25 |
26 | def dynamic_range_decompression(x, C=1):
27 | return np.exp(x) / C
28 |
29 |
30 | def dynamic_range_compression_torch(x, C=1, clip_val=1e-5):
31 | return torch.log(torch.clamp(x, min=clip_val) * C)
32 |
33 |
34 | def dynamic_range_decompression_torch(x, C=1):
35 | return torch.exp(x) / C
36 |
37 |
38 | def spectral_normalize_torch(magnitudes):
39 | output = dynamic_range_compression_torch(magnitudes)
40 | return output
41 |
42 |
43 | def spectral_de_normalize_torch(magnitudes):
44 | output = dynamic_range_decompression_torch(magnitudes)
45 | return output
46 |
47 |
48 | mel_basis = {}
49 | hann_window = {}
50 |
51 |
52 | def mel_spectrogram(y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False):
53 | if torch.min(y) < -1.0:
54 | print("min value is ", torch.min(y))
55 | if torch.max(y) > 1.0:
56 | print("max value is ", torch.max(y))
57 |
58 | global mel_basis, hann_window # pylint: disable=global-statement
59 |     if str(fmax) + "_" + str(y.device) not in mel_basis:
60 |         mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax)
61 | mel_basis[str(fmax) + "_" + str(y.device)] = torch.from_numpy(mel).float().to(y.device)
62 | hann_window[str(y.device)] = torch.hann_window(win_size).to(y.device)
63 |
64 | y = torch.nn.functional.pad(
65 | y.unsqueeze(1), (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)), mode="reflect"
66 | )
67 | y = y.squeeze(1)
68 |
69 | spec = torch.view_as_real(
70 | torch.stft(
71 | y,
72 | n_fft,
73 | hop_length=hop_size,
74 | win_length=win_size,
75 | window=hann_window[str(y.device)],
76 | center=center,
77 | pad_mode="reflect",
78 | normalized=False,
79 | onesided=True,
80 | return_complex=True,
81 | )
82 | )
83 |
84 | spec = torch.sqrt(spec.pow(2).sum(-1) + (1e-9))
85 |
86 | spec = torch.matmul(mel_basis[str(fmax) + "_" + str(y.device)], spec)
87 | spec = spectral_normalize_torch(spec)
88 |
89 | return spec
90 |
91 |
92 | def get_dataset_filelist(a):
93 | with open(a.input_training_file, encoding="utf-8") as fi:
94 | training_files = [
95 | os.path.join(a.input_wavs_dir, x.split("|")[0] + ".wav") for x in fi.read().split("\n") if len(x) > 0
96 | ]
97 |
98 | with open(a.input_validation_file, encoding="utf-8") as fi:
99 | validation_files = [
100 | os.path.join(a.input_wavs_dir, x.split("|")[0] + ".wav") for x in fi.read().split("\n") if len(x) > 0
101 | ]
102 | return training_files, validation_files
103 |
104 |
105 | class MelDataset(torch.utils.data.Dataset):
106 | def __init__(
107 | self,
108 | training_files,
109 | segment_size,
110 | n_fft,
111 | num_mels,
112 | hop_size,
113 | win_size,
114 | sampling_rate,
115 | fmin,
116 | fmax,
117 | split=True,
118 | shuffle=True,
119 | n_cache_reuse=1,
120 | device=None,
121 | fmax_loss=None,
122 | fine_tuning=False,
123 | base_mels_path=None,
124 | ):
125 | self.audio_files = training_files
126 | random.seed(1234)
127 | if shuffle:
128 | random.shuffle(self.audio_files)
129 | self.segment_size = segment_size
130 | self.sampling_rate = sampling_rate
131 | self.split = split
132 | self.n_fft = n_fft
133 | self.num_mels = num_mels
134 | self.hop_size = hop_size
135 | self.win_size = win_size
136 | self.fmin = fmin
137 | self.fmax = fmax
138 | self.fmax_loss = fmax_loss
139 | self.cached_wav = None
140 | self.n_cache_reuse = n_cache_reuse
141 | self._cache_ref_count = 0
142 | self.device = device
143 | self.fine_tuning = fine_tuning
144 | self.base_mels_path = base_mels_path
145 |
146 | def __getitem__(self, index):
147 | filename = self.audio_files[index]
148 | if self._cache_ref_count == 0:
149 | audio, sampling_rate = load_wav(filename)
150 | audio = audio / MAX_WAV_VALUE
151 | if not self.fine_tuning:
152 | audio = normalize(audio) * 0.95
153 | self.cached_wav = audio
154 | if sampling_rate != self.sampling_rate:
155 | raise ValueError(f"{sampling_rate} SR doesn't match target {self.sampling_rate} SR")
156 | self._cache_ref_count = self.n_cache_reuse
157 | else:
158 | audio = self.cached_wav
159 | self._cache_ref_count -= 1
160 |
161 | audio = torch.FloatTensor(audio)
162 | audio = audio.unsqueeze(0)
163 |
164 | if not self.fine_tuning:
165 | if self.split:
166 | if audio.size(1) >= self.segment_size:
167 | max_audio_start = audio.size(1) - self.segment_size
168 | audio_start = random.randint(0, max_audio_start)
169 | audio = audio[:, audio_start : audio_start + self.segment_size]
170 | else:
171 | audio = torch.nn.functional.pad(audio, (0, self.segment_size - audio.size(1)), "constant")
172 |
173 | mel = mel_spectrogram(
174 | audio,
175 | self.n_fft,
176 | self.num_mels,
177 | self.sampling_rate,
178 | self.hop_size,
179 | self.win_size,
180 | self.fmin,
181 | self.fmax,
182 | center=False,
183 | )
184 | else:
185 | mel = np.load(os.path.join(self.base_mels_path, os.path.splitext(os.path.split(filename)[-1])[0] + ".npy"))
186 | mel = torch.from_numpy(mel)
187 |
188 | if len(mel.shape) < 3:
189 | mel = mel.unsqueeze(0)
190 |
191 | if self.split:
192 | frames_per_seg = math.ceil(self.segment_size / self.hop_size)
193 |
194 | if audio.size(1) >= self.segment_size:
195 | mel_start = random.randint(0, mel.size(2) - frames_per_seg - 1)
196 | mel = mel[:, :, mel_start : mel_start + frames_per_seg]
197 | audio = audio[:, mel_start * self.hop_size : (mel_start + frames_per_seg) * self.hop_size]
198 | else:
199 | mel = torch.nn.functional.pad(mel, (0, frames_per_seg - mel.size(2)), "constant")
200 | audio = torch.nn.functional.pad(audio, (0, self.segment_size - audio.size(1)), "constant")
201 |
202 | mel_loss = mel_spectrogram(
203 | audio,
204 | self.n_fft,
205 | self.num_mels,
206 | self.sampling_rate,
207 | self.hop_size,
208 | self.win_size,
209 | self.fmin,
210 | self.fmax_loss,
211 | center=False,
212 | )
213 |
214 | return (mel.squeeze(), audio.squeeze(0), filename, mel_loss.squeeze())
215 |
216 | def __len__(self):
217 | return len(self.audio_files)
218 |
--------------------------------------------------------------------------------
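A minimal sketch of calling mel_spectrogram from meldataset.py above directly. The feature settings are the ones listed in cosyvoice/flow/flow.py's mel_feat_conf (22050 Hz, n_fft 1024, hop 256, 80 mels, fmax 8000); the input waveform is a placeholder.

import torch
from matcha.hifigan.meldataset import mel_spectrogram

wav = torch.rand(1, 22050) * 2 - 1    # 1 second of placeholder audio in [-1, 1), shape (batch, samples)
mel = mel_spectrogram(wav, n_fft=1024, num_mels=80, sampling_rate=22050,
                      hop_size=256, win_size=1024, fmin=0, fmax=8000, center=False)
print(mel.shape)                      # roughly (1, 80, samples // hop_size), i.e. (1, 80, 86) here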
/matcha/models/baselightningmodule.py:
--------------------------------------------------------------------------------
1 | """
2 | This is a base lightning module that can be used to train a model.
3 | The benefit of this abstraction is that all the logic outside of model definition can be reused for different models.
4 | """
5 | import inspect
6 | from abc import ABC
7 | from typing import Any, Dict
8 |
9 | import torch
10 | from lightning import LightningModule
11 | from lightning.pytorch.utilities import grad_norm
12 |
13 | from matcha import utils
14 | from matcha.utils.utils import plot_tensor
15 |
16 | log = utils.get_pylogger(__name__)
17 |
18 |
19 | class BaseLightningClass(LightningModule, ABC):
20 | def update_data_statistics(self, data_statistics):
21 | if data_statistics is None:
22 | data_statistics = {
23 | "mel_mean": 0.0,
24 | "mel_std": 1.0,
25 | }
26 |
27 | self.register_buffer("mel_mean", torch.tensor(data_statistics["mel_mean"]))
28 | self.register_buffer("mel_std", torch.tensor(data_statistics["mel_std"]))
29 |
30 | def configure_optimizers(self) -> Any:
31 | optimizer = self.hparams.optimizer(params=self.parameters())
32 | if self.hparams.scheduler not in (None, {}):
33 | scheduler_args = {}
34 | # Manage last epoch for exponential schedulers
35 | if "last_epoch" in inspect.signature(self.hparams.scheduler.scheduler).parameters:
36 | if hasattr(self, "ckpt_loaded_epoch"):
37 | current_epoch = self.ckpt_loaded_epoch - 1
38 | else:
39 | current_epoch = -1
40 |
41 | scheduler_args.update({"optimizer": optimizer})
42 | scheduler = self.hparams.scheduler.scheduler(**scheduler_args)
43 | scheduler.last_epoch = current_epoch
44 | return {
45 | "optimizer": optimizer,
46 | "lr_scheduler": {
47 | "scheduler": scheduler,
48 | "interval": self.hparams.scheduler.lightning_args.interval,
49 | "frequency": self.hparams.scheduler.lightning_args.frequency,
50 | "name": "learning_rate",
51 | },
52 | }
53 |
54 | return {"optimizer": optimizer}
55 |
56 | def get_losses(self, batch):
57 | x, x_lengths = batch["x"], batch["x_lengths"]
58 | y, y_lengths = batch["y"], batch["y_lengths"]
59 | spks = batch["spks"]
60 |
61 | dur_loss, prior_loss, diff_loss = self(
62 | x=x,
63 | x_lengths=x_lengths,
64 | y=y,
65 | y_lengths=y_lengths,
66 | spks=spks,
67 | out_size=self.out_size,
68 | )
69 | return {
70 | "dur_loss": dur_loss,
71 | "prior_loss": prior_loss,
72 | "diff_loss": diff_loss,
73 | }
74 |
75 | def on_load_checkpoint(self, checkpoint: Dict[str, Any]) -> None:
76 | self.ckpt_loaded_epoch = checkpoint["epoch"] # pylint: disable=attribute-defined-outside-init
77 |
78 | def training_step(self, batch: Any, batch_idx: int):
79 | loss_dict = self.get_losses(batch)
80 | self.log(
81 | "step",
82 | float(self.global_step),
83 | on_step=True,
84 | prog_bar=True,
85 | logger=True,
86 | sync_dist=True,
87 | )
88 |
89 | self.log(
90 | "sub_loss/train_dur_loss",
91 | loss_dict["dur_loss"],
92 | on_step=True,
93 | on_epoch=True,
94 | logger=True,
95 | sync_dist=True,
96 | )
97 | self.log(
98 | "sub_loss/train_prior_loss",
99 | loss_dict["prior_loss"],
100 | on_step=True,
101 | on_epoch=True,
102 | logger=True,
103 | sync_dist=True,
104 | )
105 | self.log(
106 | "sub_loss/train_diff_loss",
107 | loss_dict["diff_loss"],
108 | on_step=True,
109 | on_epoch=True,
110 | logger=True,
111 | sync_dist=True,
112 | )
113 |
114 | total_loss = sum(loss_dict.values())
115 | self.log(
116 | "loss/train",
117 | total_loss,
118 | on_step=True,
119 | on_epoch=True,
120 | logger=True,
121 | prog_bar=True,
122 | sync_dist=True,
123 | )
124 |
125 | return {"loss": total_loss, "log": loss_dict}
126 |
127 | def validation_step(self, batch: Any, batch_idx: int):
128 | loss_dict = self.get_losses(batch)
129 | self.log(
130 | "sub_loss/val_dur_loss",
131 | loss_dict["dur_loss"],
132 | on_step=True,
133 | on_epoch=True,
134 | logger=True,
135 | sync_dist=True,
136 | )
137 | self.log(
138 | "sub_loss/val_prior_loss",
139 | loss_dict["prior_loss"],
140 | on_step=True,
141 | on_epoch=True,
142 | logger=True,
143 | sync_dist=True,
144 | )
145 | self.log(
146 | "sub_loss/val_diff_loss",
147 | loss_dict["diff_loss"],
148 | on_step=True,
149 | on_epoch=True,
150 | logger=True,
151 | sync_dist=True,
152 | )
153 |
154 | total_loss = sum(loss_dict.values())
155 | self.log(
156 | "loss/val",
157 | total_loss,
158 | on_step=True,
159 | on_epoch=True,
160 | logger=True,
161 | prog_bar=True,
162 | sync_dist=True,
163 | )
164 |
165 | return total_loss
166 |
167 | def on_validation_end(self) -> None:
168 | if self.trainer.is_global_zero:
169 | one_batch = next(iter(self.trainer.val_dataloaders))
170 | if self.current_epoch == 0:
171 | log.debug("Plotting original samples")
172 | for i in range(2):
173 | y = one_batch["y"][i].unsqueeze(0).to(self.device)
174 | self.logger.experiment.add_image(
175 | f"original/{i}",
176 | plot_tensor(y.squeeze().cpu()),
177 | self.current_epoch,
178 | dataformats="HWC",
179 | )
180 |
181 | log.debug("Synthesising...")
182 | for i in range(2):
183 | x = one_batch["x"][i].unsqueeze(0).to(self.device)
184 | x_lengths = one_batch["x_lengths"][i].unsqueeze(0).to(self.device)
185 | spks = one_batch["spks"][i].unsqueeze(0).to(self.device) if one_batch["spks"] is not None else None
186 | output = self.synthesise(x[:, :x_lengths], x_lengths, n_timesteps=10, spks=spks)
187 | y_enc, y_dec = output["encoder_outputs"], output["decoder_outputs"]
188 | attn = output["attn"]
189 | self.logger.experiment.add_image(
190 | f"generated_enc/{i}",
191 | plot_tensor(y_enc.squeeze().cpu()),
192 | self.current_epoch,
193 | dataformats="HWC",
194 | )
195 | self.logger.experiment.add_image(
196 | f"generated_dec/{i}",
197 | plot_tensor(y_dec.squeeze().cpu()),
198 | self.current_epoch,
199 | dataformats="HWC",
200 | )
201 | self.logger.experiment.add_image(
202 | f"alignment/{i}",
203 | plot_tensor(attn.squeeze().cpu()),
204 | self.current_epoch,
205 | dataformats="HWC",
206 | )
207 |
208 | def on_before_optimizer_step(self, optimizer):
209 | self.log_dict({f"grad_norm/{k}": v for k, v in grad_norm(self, norm_type=2).items()})
210 |
--------------------------------------------------------------------------------
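A hypothetical sketch of the interface a concrete model has to provide for the shared logic in baselightningmodule.py above: a forward() returning (dur_loss, prior_loss, diff_loss) for get_losses, an out_size attribute, and hparams.optimizer / hparams.scheduler callables for configure_optimizers. The synthesise() method used by on_validation_end is omitted for brevity; everything below is illustrative, not the actual MatchaTTS model.

from functools import partial

import torch

from matcha.models.baselightningmodule import BaseLightningClass


class ToyMatchaLikeModel(BaseLightningClass):
    def __init__(self, optimizer, scheduler=None, out_size=172):
        super().__init__()
        # save_hyperparameters exposes `optimizer` and `scheduler` as self.hparams.*,
        # which configure_optimizers in the base class reads.
        self.save_hyperparameters(logger=False)
        self.out_size = out_size
        self.proj = torch.nn.Linear(80, 80)  # placeholder parameters so the optimizer has something to update

    def forward(self, x, x_lengths, y, y_lengths, spks=None, out_size=None):
        dummy = self.proj(y.transpose(1, 2)).mean()
        return dummy, dummy, dummy  # (dur_loss, prior_loss, diff_loss) as expected by get_losses


# model = ToyMatchaLikeModel(optimizer=partial(torch.optim.Adam, lr=1e-4))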
/cosyvoice/flow/flow.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Zhihao Du)
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | import logging
15 | from typing import Dict, Optional
16 | import torch
17 | import torch.nn as nn
18 | from torch.nn import functional as F
19 | from omegaconf import DictConfig
20 | from cosyvoice.utils.mask import make_pad_mask
21 |
22 |
23 | class MaskedDiffWithXvec(torch.nn.Module):
24 | def __init__(self,
25 | input_size: int = 512,
26 | output_size: int = 80,
27 | spk_embed_dim: int = 192,
28 | output_type: str = "mel",
29 | vocab_size: int = 4096,
30 | input_frame_rate: int = 50,
31 | only_mask_loss: bool = True,
32 | encoder: torch.nn.Module = None,
33 | length_regulator: torch.nn.Module = None,
34 | decoder: torch.nn.Module = None,
35 | decoder_conf: Dict = {'in_channels': 240, 'out_channel': 80, 'spk_emb_dim': 80, 'n_spks': 1, 'cfm_params': DictConfig({'sigma_min': 1e-06, 'solver': 'euler', 't_scheduler': 'cosine', 'training_cfg_rate': 0.2, 'inference_cfg_rate': 0.7, 'reg_loss_type': 'l1'}), 'decoder_params': {'channels': [256, 256], 'dropout': 0.0, 'attention_head_dim': 64, 'n_blocks': 4, 'num_mid_blocks': 12, 'num_heads': 8, 'act_fn': 'gelu'}},
36 | mel_feat_conf: Dict = {'n_fft': 1024, 'num_mels': 80, 'sampling_rate': 22050, 'hop_size': 256, 'win_size': 1024, 'fmin': 0, 'fmax': 8000}):
37 | super().__init__()
38 | self.input_size = input_size
39 | self.output_size = output_size
40 | self.decoder_conf = decoder_conf
41 | self.mel_feat_conf = mel_feat_conf
42 | self.vocab_size = vocab_size
43 | self.output_type = output_type
44 | self.input_frame_rate = input_frame_rate
45 | logging.info(f"input frame rate={self.input_frame_rate}")
46 | self.input_embedding = nn.Embedding(vocab_size, input_size)
47 | self.spk_embed_affine_layer = torch.nn.Linear(spk_embed_dim, output_size)
48 | self.encoder = encoder
49 | self.encoder_proj = torch.nn.Linear(self.encoder.output_size(), output_size)
50 | self.decoder = decoder
51 | self.length_regulator = length_regulator
52 | self.only_mask_loss = only_mask_loss
53 |
54 | def forward(
55 | self,
56 | batch: dict,
57 | device: torch.device,
58 | ) -> Dict[str, Optional[torch.Tensor]]:
59 | token = batch['speech_token'].to(device)
60 | token_len = batch['speech_token_len'].to(device)
61 | feat = batch['speech_feat'].to(device)
62 | feat_len = batch['speech_feat_len'].to(device)
63 | embedding = batch['utt_embedding'].to(device)
64 |
65 | # xvec projection
66 | embedding = F.normalize(embedding, dim=1)
67 | embedding = self.spk_embed_affine_layer(embedding)
68 |
69 |         # mask out padding and embed the speech tokens
70 | mask = (~make_pad_mask(token_len)).float().unsqueeze(-1).to(device)
71 | token = self.input_embedding(torch.clamp(token, min=0)) * mask
72 |
73 | # text encode
74 | h, h_lengths = self.encoder(token, token_len)
75 | h = self.encoder_proj(h)
76 | h, h_lengths = self.length_regulator(h, feat_len)
77 |
78 | # get conditions
79 | conds = torch.zeros(feat.shape, device=token.device)
80 | conds = conds.transpose(1, 2)
81 |
82 | mask = (~make_pad_mask(feat_len)).to(h)
83 | feat = F.interpolate(feat.unsqueeze(dim=1), size=h.shape[1:], mode="nearest").squeeze(dim=1)
84 | loss, _ = self.decoder.compute_loss(
85 | feat.transpose(1, 2).contiguous(),
86 | mask.unsqueeze(1),
87 | h.transpose(1, 2).contiguous(),
88 | embedding,
89 | cond=conds
90 | )
91 | return {'loss': loss}
92 |
93 | @torch.inference_mode()
94 | def inference(self,
95 | token,
96 | token_len,
97 | prompt_token,
98 | prompt_token_len,
99 | prompt_feat,
100 | prompt_feat_len,
101 | embedding):
102 | assert token.shape[0] == 1
103 | # xvec projection
104 | embedding = F.normalize(embedding, dim=1)
105 | embedding = self.spk_embed_affine_layer(embedding)
106 |
107 | # concat text and prompt_text
108 | token, token_len = torch.concat([prompt_token, token], dim=1), prompt_token_len + token_len
109 | mask = (~make_pad_mask(token_len)).float().unsqueeze(-1).to(embedding)
110 | token = self.input_embedding(torch.clamp(token, min=0)) * mask
111 |
112 | # text encode
113 | h, h_lengths = self.encoder(token, token_len)
114 | h = self.encoder_proj(h)
115 |         feat_len = (token_len / 50 * 22050 / 256).int()  # 50 Hz tokens -> 22050 Hz / 256-hop mel frames
116 | h, h_lengths = self.length_regulator(h, feat_len)
117 |
118 | # get conditions
119 | conds = torch.zeros([1, feat_len.max().item(), self.output_size], device=token.device)
120 | if prompt_feat.shape[1] != 0:
121 | for i, j in enumerate(prompt_feat_len):
122 | conds[i, :j] = prompt_feat[i]
123 | conds = conds.transpose(1, 2)
124 |
125 | mask = (~make_pad_mask(feat_len)).to(h)
126 | feat = self.decoder(
127 | mu=h.transpose(1, 2).contiguous(),
128 | mask=mask.unsqueeze(1),
129 | spks=embedding,
130 | cond=conds,
131 | n_timesteps=10
132 | )
133 | if prompt_feat.shape[1] != 0:
134 | feat = feat[:, :, prompt_feat.shape[1]:]
135 | return feat
136 |
137 | @torch.inference_mode()
138 | def inference_stream(self,
139 | token,
140 | token_len,
141 | prompt_token,
142 | prompt_token_len,
143 | prompt_feat,
144 | prompt_feat_len,
145 | embedding):
146 | assert token.shape[0] == 1
147 | # xvec projection
148 | embedding = F.normalize(embedding, dim=1)
149 | embedding = self.spk_embed_affine_layer(embedding)
150 |
151 | # concat text and prompt_text
152 | token, token_len = torch.concat([prompt_token, token], dim=1), prompt_token_len + token_len
153 | mask = (~make_pad_mask(token_len)).float().unsqueeze(-1).to(embedding)
154 | token = self.input_embedding(torch.clamp(token, min=0)) * mask
155 |
156 | # text encode
157 | h, h_lengths = self.encoder(token, token_len)
158 | h = self.encoder_proj(h)
159 |         feat_len = (token_len / 50 * 22050 / 256).int()  # 50 Hz tokens -> 22050 Hz / 256-hop mel frames
160 | h, h_lengths = self.length_regulator(h, feat_len)
161 |
162 | # get conditions
163 | conds = torch.zeros([1, feat_len.max().item(), self.output_size], device=token.device)
164 | if prompt_feat.shape[1] != 0:
165 | for i, j in enumerate(prompt_feat_len):
166 | conds[i, :j] = prompt_feat[i]
167 | conds = conds.transpose(1, 2)
168 |
169 | mask = (~make_pad_mask(feat_len)).to(h)
170 | feat = self.decoder(
171 | mu=h.transpose(1, 2).contiguous(),
172 | mask=mask.unsqueeze(1),
173 | spks=embedding,
174 | cond=conds,
175 | n_timesteps=10
176 | )
177 | if prompt_feat.shape[1] != 0:
178 | feat = feat[:, :, prompt_feat.shape[1]:]
179 | yield feat
180 |
--------------------------------------------------------------------------------
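A small worked example of the length bookkeeping in MaskedDiffWithXvec.inference above, assuming this file's defaults: 50 speech tokens per second and 22050 Hz mel features with a 256-sample hop. The prompt length below is a hypothetical value.

import torch

token_len = torch.tensor([150])                  # e.g. 100 prompt tokens + 50 new tokens = 3 s of speech
feat_len = (token_len / 50 * 22050 / 256).int()  # same formula as in inference()
print(feat_len.item())                           # 258 mel frames for prompt + new speech

prompt_frames = 172                              # mel frames of the prompt audio (~2 s, hypothetical)
print(feat_len.item() - prompt_frames)           # 86 frames (~1 s) kept after feat[:, :, prompt_feat.shape[1]:]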
/academicodec/models/hificodec/meldataset.py:
--------------------------------------------------------------------------------
1 | # code based on https://github.com/b04901014/MQTTS
2 | import math
3 | import os
4 | import random
5 |
6 | import librosa
7 | import numpy as np
8 | import torch.utils.data
9 | from librosa.filters import mel as librosa_mel_fn
10 |
11 |
12 | def load_wav(full_path, sr):
13 | wav, sr = librosa.load(full_path, sr=sr)
14 | return wav, sr
15 |
16 |
17 | def dynamic_range_compression(x, C=1, clip_val=1e-5):
18 | return np.log(np.clip(x, a_min=clip_val, a_max=None) * C)
19 |
20 |
21 | def dynamic_range_decompression(x, C=1):
22 | return np.exp(x) / C
23 |
24 |
25 | def dynamic_range_compression_torch(x, C=1, clip_val=1e-5):
26 | return torch.log(torch.clamp(x, min=clip_val) * C)
27 |
28 |
29 | def dynamic_range_decompression_torch(x, C=1):
30 | return torch.exp(x) / C
31 |
32 |
33 | def spectral_normalize_torch(magnitudes):
34 | output = dynamic_range_compression_torch(magnitudes)
35 | return output
36 |
37 |
38 | def spectral_de_normalize_torch(magnitudes):
39 | output = dynamic_range_decompression_torch(magnitudes)
40 | return output
41 |
42 |
43 | mel_basis = {}
44 | hann_window = {}
45 |
46 |
47 | def mel_spectrogram(y,
48 | n_fft,
49 | num_mels,
50 | sampling_rate,
51 | hop_size,
52 | win_size,
53 | fmin,
54 | fmax,
55 | center=False):
56 | if torch.min(y) < -1.:
57 | print('min value is ', torch.min(y))
58 | if torch.max(y) > 1.:
59 | print('max value is ', torch.max(y))
60 |
61 | global mel_basis, hann_window
62 |     if str(fmax) + '_' + str(y.device) not in mel_basis:
63 |         mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax)
64 | mel_basis[str(fmax) + '_' +
65 | str(y.device)] = torch.from_numpy(mel).float().to(y.device)
66 | hann_window[str(y.device)] = torch.hann_window(win_size).to(y.device)
67 |
68 | y = torch.nn.functional.pad(
69 | y.unsqueeze(1), (int((n_fft - hop_size) / 2), int(
70 | (n_fft - hop_size) / 2)),
71 | mode='reflect')
72 | y = y.squeeze(1)
73 |
74 |     spec = torch.view_as_real(torch.stft(
75 |         y,
76 |         n_fft,
77 |         hop_length=hop_size,
78 |         win_length=win_size,
79 |         window=hann_window[str(y.device)],
80 |         center=center,
81 |         pad_mode='reflect',
82 |         normalized=False,
83 |         onesided=True, return_complex=True))  # return_complex is required on torch>=2.0
84 |
85 | spec = torch.sqrt(spec.pow(2).sum(-1) + (1e-9))
86 |
87 | spec = torch.matmul(mel_basis[str(fmax) + '_' + str(y.device)], spec)
88 | spec = spectral_normalize_torch(spec)
89 |
90 | return spec
91 |
92 |
93 | def get_dataset_filelist(a):
94 | with open(a.input_training_file, 'r') as f:
95 | training_files = [l.strip() for l in f]
96 | with open(a.input_validation_file, 'r') as f:
97 | validation_files = [l.strip() for l in f]
98 | return training_files, validation_files
99 |
100 |
101 | class MelDataset(torch.utils.data.Dataset):
102 | def __init__(self,
103 | training_files,
104 | segment_size,
105 | n_fft,
106 | num_mels,
107 | hop_size,
108 | win_size,
109 | sampling_rate,
110 | fmin,
111 | fmax,
112 | split=True,
113 | shuffle=True,
114 | n_cache_reuse=1,
115 | device=None,
116 | fmax_loss=None,
117 | fine_tuning=False,
118 | base_mels_path=None):
119 | self.audio_files = training_files
120 | random.seed(1234)
121 | if shuffle:
122 | random.shuffle(self.audio_files)
123 | self.segment_size = segment_size
124 | self.sampling_rate = sampling_rate
125 | self.split = split
126 | self.n_fft = n_fft
127 | self.num_mels = num_mels
128 | self.hop_size = hop_size
129 | self.win_size = win_size
130 | self.fmin = fmin
131 | self.fmax = fmax
132 | self.fmax_loss = fmax_loss
133 | self.cached_wav = None
134 | self.n_cache_reuse = n_cache_reuse
135 | self._cache_ref_count = 0
136 | self.device = device
137 | self.fine_tuning = fine_tuning
138 | self.base_mels_path = base_mels_path
139 |
140 | def __getitem__(self, index):
141 | filename = self.audio_files[index]
142 | if self._cache_ref_count == 0:
143 | try:
144 |                 # Note by yuantian: load at the sample rate given in the config
145 | audio, sampling_rate = load_wav(filename, sr=self.sampling_rate)
146 | except Exception as e:
147 |                 print(f"Error on audio: {filename}: {e}")
148 |                 audio = np.random.normal(size=(160000, )) * 0.05  # fall back to quiet noise so training continues
149 | sampling_rate = self.sampling_rate
150 | self.cached_wav = audio
151 | if sampling_rate != self.sampling_rate:
152 | raise ValueError("{} SR doesn't match target {} SR".format(
153 | sampling_rate, self.sampling_rate))
154 | self._cache_ref_count = self.n_cache_reuse
155 | else:
156 | audio = self.cached_wav
157 | self._cache_ref_count -= 1
158 |
159 | audio = torch.FloatTensor(audio)
160 | audio = audio.unsqueeze(0)
161 |
162 | if not self.fine_tuning:
163 | if self.split:
164 | if audio.size(1) >= self.segment_size:
165 | max_audio_start = audio.size(1) - self.segment_size
166 | audio_start = random.randint(0, max_audio_start)
167 | audio = audio[:, audio_start:audio_start +
168 | self.segment_size]
169 | else:
170 | audio = torch.nn.functional.pad(audio, (
171 | 0, self.segment_size - audio.size(1)), 'constant')
172 |
173 | mel = mel_spectrogram(
174 | audio,
175 | self.n_fft,
176 | self.num_mels,
177 | self.sampling_rate,
178 | self.hop_size,
179 | self.win_size,
180 | self.fmin,
181 | self.fmax,
182 | center=False)
183 | else:
184 | mel = np.load(
185 | os.path.join(self.base_mels_path,
186 | os.path.splitext(os.path.split(filename)[-1])[0] +
187 | '.npy'))
188 | mel = torch.from_numpy(mel)
189 |
190 | if len(mel.shape) < 3:
191 | mel = mel.unsqueeze(0)
192 |
193 | if self.split:
194 | frames_per_seg = math.ceil(self.segment_size / self.hop_size)
195 |
196 | if audio.size(1) >= self.segment_size:
197 | mel_start = random.randint(0,
198 | mel.size(2) - frames_per_seg - 1)
199 | mel = mel[:, :, mel_start:mel_start + frames_per_seg]
200 | audio = audio[:, mel_start * self.hop_size:(
201 | mel_start + frames_per_seg) * self.hop_size]
202 | else:
203 | mel = torch.nn.functional.pad(mel, (
204 | 0, frames_per_seg - mel.size(2)), 'constant')
205 | audio = torch.nn.functional.pad(audio, (
206 | 0, self.segment_size - audio.size(1)), 'constant')
207 |
208 | mel_loss = mel_spectrogram(
209 | audio,
210 | self.n_fft,
211 | self.num_mels,
212 | self.sampling_rate,
213 | self.hop_size,
214 | self.win_size,
215 | self.fmin,
216 | self.fmax_loss,
217 | center=False)
218 |
219 | return (mel.squeeze(), audio.squeeze(0), filename, mel_loss.squeeze())
220 |
221 | def __len__(self):
222 | return len(self.audio_files)
223 |
--------------------------------------------------------------------------------
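For reference, a hypothetical sketch of preparing the plain filelists that get_dataset_filelist in this meldataset.py expects: one wav path per line (unlike matcha/hifigan/meldataset.py above, which reads "name|text" entries and joins them with input_wavs_dir).

import glob

wavs = sorted(glob.glob("/data/train_wavs/*.wav"))   # assumed corpus location
with open("train_filelist.txt", "w", encoding="utf-8") as f:
    f.write("\n".join(wavs))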