├── matcha ├── data │ ├── __init__.py │ └── components │ │ └── __init__.py ├── hifigan │ ├── __init__.py │ ├── env.py │ ├── config.py │ ├── LICENSE │ ├── xutils.py │ ├── denoiser.py │ ├── README.md │ └── meldataset.py ├── models │ ├── __init__.py │ ├── components │ │ ├── __init__.py │ │ └── flow_matching.py │ └── baselightningmodule.py └── onnx │ └── export.py ├── academicodec ├── __init__.py ├── models │ ├── encodec │ │ ├── __init__.py │ │ ├── dataset.py │ │ ├── net3.py │ │ ├── distributed │ │ │ ├── launch.py │ │ │ └── distributed.py │ │ └── test.py │ ├── hificodec │ │ ├── __init__.py │ │ ├── env.py │ │ ├── vqvae_tester.py │ │ ├── vqvae.py │ │ ├── vqvae_copy_syn.py │ │ └── meldataset.py │ └── soundstream │ │ ├── __init__.py │ │ ├── dataset.py │ │ └── models.py ├── quantization │ ├── __init__.py │ ├── distrib.py │ └── vq.py ├── modules │ ├── __init__.py │ ├── lstm.py │ ├── norm.py │ └── transformer.py ├── binary.py └── utils.py ├── cosyvoice ├── cli │ ├── __init__.py │ ├── zh_normalization │ │ ├── __init__.py │ │ ├── README.md │ │ ├── quantifier.py │ │ ├── phonecode.py │ │ ├── constants.py │ │ ├── chronology.py │ │ ├── text_normlization.py │ │ └── num.py │ └── model.py ├── dataset │ ├── __init__.py │ └── dataset.py ├── transformer │ ├── __init__.py │ ├── activation.py │ ├── label_smoothing_loss.py │ ├── positionwise_feed_forward.py │ ├── decoder_layer.py │ └── convolution.py ├── flow │ ├── length_regulator.py │ ├── flow_matching.py │ └── flow.py ├── hifigan │ └── f0_predictor.py ├── utils │ ├── class_utils.py │ └── common.py └── bin │ ├── inference.py │ └── train.py ├── data ├── cache │ └── 这里为语音合成缓存文件夹.txt └── model │ └── 这里存放CosyVoice模型.txt ├── example参考音频文本.txt ├── requirements.txt ├── api.py ├── LICENSE ├── README_CN.md └── README.md /matcha/data/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /academicodec/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /cosyvoice/cli/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data/cache/这里为语音合成缓存文件夹.txt: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /matcha/hifigan/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /matcha/models/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /cosyvoice/dataset/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /cosyvoice/transformer/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data/model/这里存放CosyVoice模型.txt: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- 
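The `data/cache/` and `data/model/` placeholders above mark where runtime artifacts are expected: synthesized audio is cached under `data/cache/`, and `api.py` loads its checkpoint from `data/model/CosyVoice-300M`. A minimal, hypothetical setup sketch for fetching that checkpoint with ModelScope (already listed in `requirements.txt`) is shown below; the model id `iic/CosyVoice-300M` and the `local_dir` argument follow the upstream CosyVoice instructions and are assumptions, not something this repository pins down.

```python
# Hypothetical setup sketch: download the CosyVoice-300M checkpoint into data/model/
# so that api.py can load it from 'data/model/CosyVoice-300M'. The model id and the
# local_dir argument are assumptions based on the upstream CosyVoice instructions.
from modelscope import snapshot_download

snapshot_download('iic/CosyVoice-300M', local_dir='data/model/CosyVoice-300M')
```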
/matcha/data/components/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /matcha/models/components/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /academicodec/models/encodec/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /academicodec/models/hificodec/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /academicodec/models/soundstream/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /example参考音频文本.txt: -------------------------------------------------------------------------------- 1 | 把这些文字替换为你的example.wav的参考音频文本 -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | fastapi 2 | modelscope 3 | torch 4 | torchaudio 5 | uvicorn -------------------------------------------------------------------------------- /academicodec/quantization/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # flake8: noqa 7 | from .vq import QuantizedResult 8 | from .vq import ResidualVectorQuantizer 9 | -------------------------------------------------------------------------------- /academicodec/models/hificodec/env.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | 4 | 5 | class AttrDict(dict): 6 | def __init__(self, *args, **kwargs): 7 | super(AttrDict, self).__init__(*args, **kwargs) 8 | self.__dict__ = self 9 | 10 | 11 | def build_env(config, config_name, path): 12 | t_path = os.path.join(path, config_name) 13 | if config != t_path: 14 | os.makedirs(path, exist_ok=True) 15 | shutil.copyfile(config, os.path.join(path, config_name)) 16 | -------------------------------------------------------------------------------- /matcha/hifigan/env.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/jik876/hifi-gan """ 2 | 3 | import os 4 | import shutil 5 | 6 | 7 | class AttrDict(dict): 8 | def __init__(self, *args, **kwargs): 9 | super().__init__(*args, **kwargs) 10 | self.__dict__ = self 11 | 12 | 13 | def build_env(config, config_name, path): 14 | t_path = os.path.join(path, config_name) 15 | if config != t_path: 16 | os.makedirs(path, exist_ok=True) 17 | shutil.copyfile(config, os.path.join(path, config_name)) 18 | -------------------------------------------------------------------------------- /cosyvoice/cli/zh_normalization/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | from .text_normlization import * 15 | -------------------------------------------------------------------------------- /academicodec/modules/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | """Torch modules.""" 7 | # flake8: noqa 8 | from .conv import NormConv1d 9 | from .conv import NormConv2d 10 | from .conv import NormConvTranspose1d 11 | from .conv import NormConvTranspose2d 12 | from .conv import pad1d 13 | from .conv import SConv1d 14 | from .conv import SConvTranspose1d 15 | from .conv import unpad1d 16 | from .lstm import SLSTM 17 | from .seanet import SEANetDecoder 18 | from .seanet import SEANetEncoder 19 | from .transformer import StreamingTransformerEncoder 20 | -------------------------------------------------------------------------------- /cosyvoice/cli/zh_normalization/README.md: -------------------------------------------------------------------------------- 1 | ## Supported NSW (Non-Standard-Word) Normalization 2 | 3 | |NSW type|raw|normalized| 4 | |:--|:-|:-| 5 | |serial number|电影中梁朝伟扮演的陈永仁的编号27149|电影中梁朝伟扮演的陈永仁的编号二七一四九| 6 | |cardinal|这块黄金重达324.75克
我们班的最高总分为583分|这块黄金重达三百二十四点七五克<br>我们班的最高总分为五百八十三分| 7 | |numeric range |12\~23<br>-1.5\~2|十二到二十三<br>负一点五到二| 8 | |date|她出生于86年8月18日,她弟弟出生于1995年3月1日|她出生于八六年八月十八日, 她弟弟出生于一九九五年三月一日| 9 | |time|等会请在12:05请通知我|等会请在十二点零五分请通知我 10 | |temperature|今天的最低气温达到-10°C|今天的最低气温达到零下十度 11 | |fraction|现场有7/12的观众投出了赞成票|现场有十二分之七的观众投出了赞成票| 12 | |percentage|明天有62%的概率降雨|明天有百分之六十二的概率降雨| 13 | |money|随便来几个价格12块5,34.5元,20.1万|随便来几个价格十二块五,三十四点五元,二十点一万| 14 | |telephone|这是固话0421-33441122<br>这是手机+86 18544139121|这是固话零四二一三三四四一一二二
这是手机八六一八五四四一三九一二一| 15 | ## References 16 | [Pull requests #658 of DeepSpeech](https://github.com/PaddlePaddle/DeepSpeech/pull/658/files) 17 | -------------------------------------------------------------------------------- /academicodec/modules/lstm.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | """LSTM layers module.""" 7 | from torch import nn 8 | 9 | 10 | class SLSTM(nn.Module): 11 | """ 12 | LSTM without worrying about the hidden state, nor the layout of the data. 13 | Expects input as convolutional layout. 14 | """ 15 | 16 | def __init__(self, dimension: int, num_layers: int=2, skip: bool=True): 17 | super().__init__() 18 | self.skip = skip 19 | self.lstm = nn.LSTM(dimension, dimension, num_layers) 20 | 21 | def forward(self, x): 22 | x = x.permute(2, 0, 1) 23 | y, _ = self.lstm(x) 24 | if self.skip: 25 | y = y + x 26 | y = y.permute(1, 2, 0) 27 | return y 28 | -------------------------------------------------------------------------------- /matcha/hifigan/config.py: -------------------------------------------------------------------------------- 1 | v1 = { 2 | "resblock": "1", 3 | "num_gpus": 0, 4 | "batch_size": 16, 5 | "learning_rate": 0.0004, 6 | "adam_b1": 0.8, 7 | "adam_b2": 0.99, 8 | "lr_decay": 0.999, 9 | "seed": 1234, 10 | "upsample_rates": [8, 8, 2, 2], 11 | "upsample_kernel_sizes": [16, 16, 4, 4], 12 | "upsample_initial_channel": 512, 13 | "resblock_kernel_sizes": [3, 7, 11], 14 | "resblock_dilation_sizes": [[1, 3, 5], [1, 3, 5], [1, 3, 5]], 15 | "resblock_initial_channel": 256, 16 | "segment_size": 8192, 17 | "num_mels": 80, 18 | "num_freq": 1025, 19 | "n_fft": 1024, 20 | "hop_size": 256, 21 | "win_size": 1024, 22 | "sampling_rate": 22050, 23 | "fmin": 0, 24 | "fmax": 8000, 25 | "fmax_loss": None, 26 | "num_workers": 4, 27 | "dist_config": {"dist_backend": "nccl", "dist_url": "tcp://localhost:54321", "world_size": 1}, 28 | } 29 | -------------------------------------------------------------------------------- /api.py: -------------------------------------------------------------------------------- 1 | import torchaudio 2 | import uvicorn 3 | from fastapi import FastAPI 4 | from fastapi.responses import FileResponse 5 | from cosyvoice.cli.cosyvoice import CosyVoice 6 | from cosyvoice.utils.file_utils import load_wav 7 | 8 | app = FastAPI() 9 | print("正在加载CosyVoice模型,请稍后...") 10 | model = CosyVoice('data/model/CosyVoice-300M') 11 | prompt_speech = load_wav('example.wav', 16000) 12 | with open('example参考音频文本.txt', 'r', encoding='utf-8') as file: 13 | lines = file.readlines() 14 | prompt_text = lines[0].strip() 15 | output_path = 'data/cache/cache.wav' 16 | 17 | 18 | @app.get("/cosyvoice/") 19 | def run_cosyvoice(text: str): 20 | results = model.inference_zero_shot(text, prompt_text, prompt_speech) 21 | tts_speech = results['tts_speech'] 22 | torchaudio.save(output_path, tts_speech, 22050) 23 | return FileResponse(output_path) 24 | 25 | 26 | print("本地CosyVoice语音合成大模型API服务器启动成功!") 27 | uvicorn.run(app, host="0.0.0.0", port=9881) 28 | -------------------------------------------------------------------------------- /academicodec/modules/norm.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 
3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | """Normalization modules.""" 7 | import typing as tp 8 | 9 | import einops 10 | import torch 11 | from torch import nn 12 | 13 | 14 | class ConvLayerNorm(nn.LayerNorm): 15 | """ 16 | Convolution-friendly LayerNorm that moves channels to last dimensions 17 | before running the normalization and moves them back to original position right after. 18 | """ 19 | 20 | def __init__(self, 21 | normalized_shape: tp.Union[int, tp.List[int], torch.Size], 22 | **kwargs): 23 | super().__init__(normalized_shape, **kwargs) 24 | 25 | def forward(self, x): 26 | x = einops.rearrange(x, 'b ... t -> b t ...') 27 | x = super().forward(x) 28 | x = einops.rearrange(x, 'b t ... -> b ... t') 29 | return x 30 | -------------------------------------------------------------------------------- /academicodec/models/encodec/dataset.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import random 3 | 4 | import torch 5 | import torchaudio 6 | from torch.utils.data import Dataset 7 | 8 | 9 | class NSynthDataset(Dataset): 10 | """Dataset to load NSynth data.""" 11 | 12 | def __init__(self, audio_dir): 13 | super().__init__() 14 | self.filenames = [] 15 | self.filenames.extend(glob.glob(audio_dir + "/*.wav")) 16 | print(len(self.filenames)) 17 | _, self.sr = torchaudio.load(self.filenames[0]) 18 | self.max_len = 24000 # 24000 19 | 20 | def __len__(self): 21 | return len(self.filenames) 22 | 23 | def __getitem__(self, index): 24 | ans = torch.zeros(1, self.max_len) 25 | audio = torchaudio.load(self.filenames[index])[0] 26 | if audio.shape[1] > self.max_len: 27 | st = random.randint(0, audio.shape[1] - self.max_len - 1) 28 | ed = st + self.max_len 29 | return audio[:, st:ed] 30 | else: 31 | ans[:, :audio.shape[1]] = audio 32 | return ans 33 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 枫影剑 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE.
22 | -------------------------------------------------------------------------------- /matcha/hifigan/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Jungil Kong 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /academicodec/models/hificodec/vqvae_tester.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import librosa 4 | import torch 5 | import torch.nn as nn 6 | 7 | from academicodec.models.hificodec.vqvae import VQVAE 8 | 9 | 10 | class VqvaeTester(nn.Module): 11 | def __init__(self, config_path, model_path, sample_rate=24000): 12 | super().__init__() 13 | self.vqvae = VQVAE(config_path, model_path, with_encoder=True) 14 | self.sample_rate = sample_rate 15 | 16 | @torch.no_grad() 17 | def forward(self, wav_path): 18 | # 单声道 19 | # wav.shape (T, ), 按照模型的 sr 读取 20 | wav, sr = librosa.load(wav_path, sr=self.sample_rate) 21 | fid = os.path.basename(wav_path)[:-4] 22 | wav = torch.tensor(wav).unsqueeze(0) 23 | wav = wav.cuda() 24 | # vq_codes is acoustic token 25 | vq_codes = self.vqvae.encode(wav) 26 | syn = self.vqvae(vq_codes) 27 | return fid, syn 28 | 29 | @torch.no_grad() 30 | def vq(self, wav_path): 31 | wav, sr = librosa.load(wav_path, sr=self.sample_rate) 32 | fid = os.path.basename(wav_path)[:-4] 33 | wav = torch.tensor(wav).unsqueeze(0) 34 | wav = wav.cuda() 35 | # vq_codes is acoustic token 36 | vq_codes = self.vqvae.encode(wav) 37 | return fid, vq_codes 38 | -------------------------------------------------------------------------------- /matcha/hifigan/xutils.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/jik876/hifi-gan """ 2 | 3 | import glob 4 | import os 5 | 6 | import matplotlib 7 | import torch 8 | from torch.nn.utils import weight_norm 9 | 10 | matplotlib.use("Agg") 11 | import matplotlib.pylab as plt 12 | 13 | 14 | def plot_spectrogram(spectrogram): 15 | fig, ax = plt.subplots(figsize=(10, 2)) 16 | im = ax.imshow(spectrogram, aspect="auto", origin="lower", interpolation="none") 17 | plt.colorbar(im, ax=ax) 18 | 19 | fig.canvas.draw() 20 | plt.close() 21 | 22 | return fig 23 | 24 | 25 | def init_weights(m, mean=0.0, std=0.01): 26 | classname = m.__class__.__name__ 27 | if classname.find("Conv") != -1: 28 | m.weight.data.normal_(mean, std) 29 | 
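# Usage note (an added sketch, not part of the original HiFi-GAN utilities): init_weights
# above and apply_weight_norm below are written to be passed to nn.Module.apply so that
# they recurse over every submodule, e.g.
#     generator.apply(init_weights)       # re-initialise every Conv* layer's weights
#     generator.apply(apply_weight_norm)  # wrap every Conv* layer in weight_norm
# where `generator` is any torch.nn.Module built from convolutional blocks.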
30 | 31 | def apply_weight_norm(m): 32 | classname = m.__class__.__name__ 33 | if classname.find("Conv") != -1: 34 | weight_norm(m) 35 | 36 | 37 | def get_padding(kernel_size, dilation=1): 38 | return int((kernel_size * dilation - dilation) / 2) 39 | 40 | 41 | def load_checkpoint(filepath, device): 42 | assert os.path.isfile(filepath) 43 | print(f"Loading '{filepath}'") 44 | checkpoint_dict = torch.load(filepath, map_location=device) 45 | print("Complete.") 46 | return checkpoint_dict 47 | 48 | 49 | def save_checkpoint(filepath, obj): 50 | print(f"Saving checkpoint to {filepath}") 51 | torch.save(obj, filepath) 52 | print("Complete.") 53 | 54 | 55 | def scan_checkpoint(cp_dir, prefix): 56 | pattern = os.path.join(cp_dir, prefix + "????????") 57 | cp_list = glob.glob(pattern) 58 | if len(cp_list) == 0: 59 | return None 60 | return sorted(cp_list)[-1] 61 | -------------------------------------------------------------------------------- /academicodec/models/hificodec/vqvae.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | import torch 4 | import torch.nn as nn 5 | 6 | from academicodec.models.hificodec.env import AttrDict 7 | from academicodec.models.hificodec.models import Encoder 8 | from academicodec.models.hificodec.models import Generator 9 | from academicodec.models.hificodec.models import Quantizer 10 | 11 | 12 | class VQVAE(nn.Module): 13 | def __init__(self, 14 | config_path, 15 | ckpt_path, 16 | with_encoder=False): 17 | super(VQVAE, self).__init__() 18 | ckpt = torch.load(ckpt_path) 19 | with open(config_path) as f: 20 | data = f.read() 21 | json_config = json.loads(data) 22 | self.h = AttrDict(json_config) 23 | self.quantizer = Quantizer(self.h) 24 | self.generator = Generator(self.h) 25 | self.generator.load_state_dict(ckpt['generator']) 26 | self.quantizer.load_state_dict(ckpt['quantizer']) 27 | if with_encoder: 28 | self.encoder = Encoder(self.h) 29 | self.encoder.load_state_dict(ckpt['encoder']) 30 | 31 | def forward(self, x): 32 | # x is the codebook 33 | # x.shape (B, T, Nq) 34 | quant_emb = self.quantizer.embed(x) 35 | return self.generator(quant_emb) 36 | 37 | def encode(self, x): 38 | batch_size = x.size(0) 39 | if len(x.shape) == 3 and x.shape[-1] == 1: 40 | x = x.squeeze(-1) 41 | c = self.encoder(x.unsqueeze(1)) 42 | q, loss_q, c = self.quantizer(c) 43 | c = [code.reshape(batch_size, -1) for code in c] 44 | # shape: [N, T, 4] 45 | return torch.stack(c, -1) 46 | -------------------------------------------------------------------------------- /README_CN.md: -------------------------------------------------------------------------------- 1 | # cosyvoice_simple_api 2 | 3 | ## 项目概述 4 | 5 | `cosyvoice_simple_api` 是一个基于阿里的 CosyVoice 开发的简易的语音合成 API 服务器项目。它允许用户轻松地将文本转换为有情感的语音输出,适用于创建有声读物、自动语音回复系统以及其他语音合成应用。 6 | 7 | ### 项目地址 8 | 9 | - CosyVoice 源地址:[FunAudioLLM/CosyVoice](https://github.com/FunAudioLLM/CosyVoice) 10 | - CosyVoice Windows 适配版(特别鸣谢刘悦):[v3ucn/CosyVoice_For_Windows](https://github.com/v3ucn/CosyVoice_For_Windows) 11 | - 本项目地址:[swordswind/cosyvoice_simple_api](https://github.com/swordswind/cosyvoice_simple_api) 12 | 13 | ## 运行方式 14 | 15 | 1. 确保你的系统中已安装 Python 环境。 16 | 2. 通过 `git clone` 或下载 ZIP 文件的方式获取项目代码。 17 | 3. 在项目根目录下,运行以下命令安装依赖: 18 | 19 | ```bash 20 | pip install -r requirements.txt 21 | ``` 22 | 23 | 4. 
在命令行中运行以下命令启动服务器: 24 | 25 | ```bash 26 | python api.py 27 | ``` 28 | 29 | ## 服务器地址 30 | 31 | CosyVoice 语音合成 API 服务器地址为:`http://你的电脑IP:9881/` 32 | 33 | ## API 接口 34 | 35 | ### 接口地址 36 | 37 | ``` 38 | /cosyvoice/ 39 | ``` 40 | 41 | ### 请求方式 42 | 43 | ``` 44 | GET 45 | ``` 46 | 47 | ### 请求参数 48 | 49 | - `text`:必填,要合成的主体文本。 50 | 51 | ## 使用示例 52 | 53 | 1. 在浏览器地址栏输入以下地址: 54 | 55 | ``` 56 | http://127.0.0.1:9881/cosyvoice/?text=你好,很高兴遇见你 57 | ``` 58 | 59 | 2. 按下回车键,服务器将返回输出格式为 wav 音频文件。 60 | 61 | ## 更换参考音频和参考音频文本 62 | 63 | 1. 将 `example.wav` 替换为自定义的参考音频,文件名保持不变。 64 | 2. 用记事本打开 `example参考音频文本.txt`,修改成新的自定义参考音频文本。 65 | 3. 修改完成后保存文件,并重新运行 `CosyVoice语音合成API服务器.bat` 文件。 66 | 67 | ## 技术栈 68 | 69 | - FastAPI:用于构建 API 服务器。 70 | - ModelScope:模型相关的库。 71 | - Torch:PyTorch,用于深度学习模型。 72 | - TorchAudio:用于音频处理。 73 | - Uvicorn:ASGI 服务器,用于运行 FastAPI 应用。 74 | 75 | ## 贡献 76 | 77 | 欢迎对本项目进行贡献,包括但不限于修复 bug、增加新功能、改进文档等。在提交 Pull Request 之前,请确保你的代码通过了所有测试,并且遵循项目的代码风格。 78 | 79 | ## 许可证 80 | 81 | 本项目采用 [MIT 许可证](LICENSE)。 -------------------------------------------------------------------------------- /cosyvoice/cli/zh_normalization/quantifier.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | import re 15 | 16 | from .num import num2str 17 | 18 | # 温度表达式,温度会影响负号的读法 19 | # -3°C 零下三度 20 | RE_TEMPERATURE = re.compile(r'(-?)(\d+(\.\d+)?)(°C|℃|度|摄氏度)') 21 | measure_dict = { 22 | "cm2": "平方厘米", 23 | "cm²": "平方厘米", 24 | "cm3": "立方厘米", 25 | "cm³": "立方厘米", 26 | "cm": "厘米", 27 | "db": "分贝", 28 | "ds": "毫秒", 29 | "kg": "千克", 30 | "km": "千米", 31 | "m2": "平方米", 32 | "m²": "平方米", 33 | "m³": "立方米", 34 | "m3": "立方米", 35 | "ml": "毫升", 36 | "m": "米", 37 | "mm": "毫米", 38 | "s": "秒" 39 | } 40 | 41 | 42 | def replace_temperature(match) -> str: 43 | """ 44 | Args: 45 | match (re.Match) 46 | Returns: 47 | str 48 | """ 49 | sign = match.group(1) 50 | temperature = match.group(2) 51 | unit = match.group(3) 52 | sign: str = "零下" if sign else "" 53 | temperature: str = num2str(temperature) 54 | unit: str = "摄氏度" if unit == "摄氏度" else "度" 55 | result = f"{sign}{temperature}{unit}" 56 | return result 57 | 58 | 59 | def replace_measure(sentence) -> str: 60 | for q_notation in measure_dict: 61 | if q_notation in sentence: 62 | sentence = sentence.replace(q_notation, measure_dict[q_notation]) 63 | return sentence 64 | -------------------------------------------------------------------------------- /academicodec/models/hificodec/vqvae_copy_syn.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import glob 3 | import json 4 | import os 5 | from pathlib import Path 6 | 7 | import soundfile as sf 8 | from tqdm import tqdm 9 | 10 | from academicodec.models.hificodec.vqvae_tester import VqvaeTester 11 | 12 | parser = argparse.ArgumentParser() 13 | 14 | #Path 15 | parser.add_argument('--outputdir', type=str, required=True) 16 | parser.add_argument('--model_path', type=str, required=True) 17 | parser.add_argument('--input_wavdir', type=str, required=True) 18 | parser.add_argument('--config_path', type=str, required=True) 19 | parser.add_argument('--num_gens', type=int, default=1024) 20 | 21 | #Data 22 | parser.add_argument('--sample_rate', type=int, default=24000) 23 | 24 | args = parser.parse_args() 25 | 26 | with open(args.config_path, 'r') as f: 27 | argdict = json.load(f) 28 | assert argdict['sampling_rate'] == args.sample_rate, \ 29 | f"Sampling rate not consistent, stated {args.sample_rate}, but the model is trained on {argdict['sample_rate']}" 30 | argdict.update(args.__dict__) 31 | args.__dict__ = argdict 32 | 33 | if __name__ == '__main__': 34 | Path(args.outputdir).mkdir(parents=True, exist_ok=True) 35 | print("Init model and load weights") 36 | model = VqvaeTester(config_path=args.config_path, model_path=args.model_path,sample_rate=args.sample_rate) 37 | model.cuda() 38 | model.vqvae.generator.remove_weight_norm() 39 | model.vqvae.encoder.remove_weight_norm() 40 | model.eval() 41 | print("Model ready") 42 | 43 | wav_paths = glob.glob(f"{args.input_wavdir}/*.wav")[:args.num_gens] 44 | print(f"Globbed {len(wav_paths)} wav files.") 45 | 46 | for wav_path in wav_paths: 47 | fid, wav = model(wav_path) 48 | wav = wav.squeeze().cpu().numpy() 49 | sf.write( 50 | os.path.join(args.outputdir, f'{fid}.wav'), wav, args.sample_rate) 51 | -------------------------------------------------------------------------------- /cosyvoice/flow/length_regulator.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Zhihao Du) 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | from typing import Tuple 15 | import torch.nn as nn 16 | from torch.nn import functional as F 17 | from cosyvoice.utils.mask import make_pad_mask 18 | 19 | 20 | class InterpolateRegulator(nn.Module): 21 | def __init__( 22 | self, 23 | channels: int, 24 | sampling_ratios: Tuple, 25 | out_channels: int = None, 26 | groups: int = 1, 27 | ): 28 | super().__init__() 29 | self.sampling_ratios = sampling_ratios 30 | out_channels = out_channels or channels 31 | model = nn.ModuleList([]) 32 | if len(sampling_ratios) > 0: 33 | for _ in sampling_ratios: 34 | module = nn.Conv1d(channels, channels, 3, 1, 1) 35 | norm = nn.GroupNorm(groups, channels) 36 | act = nn.Mish() 37 | model.extend([module, norm, act]) 38 | model.append( 39 | nn.Conv1d(channels, out_channels, 1, 1) 40 | ) 41 | self.model = nn.Sequential(*model) 42 | 43 | def forward(self, x, ylens=None): 44 | # x in (B, T, D) 45 | mask = (~make_pad_mask(ylens)).to(x).unsqueeze(-1) 46 | x = F.interpolate(x.transpose(1, 2).contiguous(), size=ylens.max(), mode='nearest') 47 | out = self.model(x).transpose(1, 2).contiguous() 48 | olens = ylens 49 | return out * mask, olens 50 | -------------------------------------------------------------------------------- /cosyvoice/cli/zh_normalization/phonecode.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | import re 15 | 16 | from .num import verbalize_digit 17 | 18 | # 规范化固话/手机号码 19 | # 手机 20 | # http://www.jihaoba.com/news/show/13680 21 | # 移动:139、138、137、136、135、134、159、158、157、150、151、152、188、187、182、183、184、178、198 22 | # 联通:130、131、132、156、155、186、185、176 23 | # 电信:133、153、189、180、181、177 24 | RE_MOBILE_PHONE = re.compile( 25 | r"(? 
str: 34 | if mobile: 35 | sp_parts = phone_string.strip('+').split() 36 | result = ','.join( 37 | [verbalize_digit(part, alt_one=True) for part in sp_parts]) 38 | return result 39 | else: 40 | sil_parts = phone_string.split('-') 41 | result = ','.join( 42 | [verbalize_digit(part, alt_one=True) for part in sil_parts]) 43 | return result 44 | 45 | 46 | def replace_phone(match) -> str: 47 | """ 48 | Args: 49 | match (re.Match) 50 | Returns: 51 | str 52 | """ 53 | return phone2str(match.group(0), mobile=False) 54 | 55 | 56 | def replace_mobile(match) -> str: 57 | """ 58 | Args: 59 | match (re.Match) 60 | Returns: 61 | str 62 | """ 63 | return phone2str(match.group(0)) 64 | -------------------------------------------------------------------------------- /cosyvoice/hifigan/f0_predictor.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Kai Hu) 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | import torch 15 | import torch.nn as nn 16 | from torch.nn.utils import weight_norm 17 | 18 | 19 | class ConvRNNF0Predictor(nn.Module): 20 | def __init__(self, 21 | num_class: int = 1, 22 | in_channels: int = 80, 23 | cond_channels: int = 512 24 | ): 25 | super().__init__() 26 | 27 | self.num_class = num_class 28 | self.condnet = nn.Sequential( 29 | weight_norm( 30 | nn.Conv1d(in_channels, cond_channels, kernel_size=3, padding=1) 31 | ), 32 | nn.ELU(), 33 | weight_norm( 34 | nn.Conv1d(cond_channels, cond_channels, kernel_size=3, padding=1) 35 | ), 36 | nn.ELU(), 37 | weight_norm( 38 | nn.Conv1d(cond_channels, cond_channels, kernel_size=3, padding=1) 39 | ), 40 | nn.ELU(), 41 | weight_norm( 42 | nn.Conv1d(cond_channels, cond_channels, kernel_size=3, padding=1) 43 | ), 44 | nn.ELU(), 45 | weight_norm( 46 | nn.Conv1d(cond_channels, cond_channels, kernel_size=3, padding=1) 47 | ), 48 | nn.ELU(), 49 | ) 50 | self.classifier = nn.Linear(in_features=cond_channels, out_features=self.num_class) 51 | 52 | def forward(self, x: torch.Tensor) -> torch.Tensor: 53 | x = self.condnet(x) 54 | x = x.transpose(1, 2) 55 | return torch.abs(self.classifier(x).squeeze(-1)) 56 | -------------------------------------------------------------------------------- /academicodec/models/soundstream/dataset.py: -------------------------------------------------------------------------------- 1 | # 和 Encodec* 的 dataset.py 有点类似但是不完全一样 2 | # 主要是 prob > 0.7 的时候多了 ans2 3 | import glob 4 | import random 5 | 6 | import torch 7 | import torchaudio 8 | from torch.utils.data import Dataset 9 | 10 | 11 | class NSynthDataset(Dataset): 12 | """Dataset to load NSynth data.""" 13 | 14 | def __init__(self, audio_dir): 15 | super().__init__() 16 | self.filenames = [] 17 | self.filenames.extend(glob.glob(audio_dir + "/*.wav")) 18 | print(len(self.filenames)) 19 | _, self.sr = torchaudio.load(self.filenames[0]) 20 | self.max_len = 24000 # 24000 21 | 22 | def __len__(self): 23 | return len(self.filenames) 24 | 25 | def 
__getitem__(self, index): 26 | #print(self.filenames[index]) 27 | prob = random.random() # (0,1) 28 | if prob > 0.7: 29 | # data augmentation 30 | ans1 = torch.zeros(1, self.max_len) 31 | ans2 = torch.zeros(1, self.max_len) 32 | audio1 = torchaudio.load(self.filenames[index])[0] 33 | index2 = random.randint(0, len(self.filenames) - 1) 34 | audio2 = torchaudio.load(self.filenames[index2])[0] 35 | if audio1.shape[1] > self.max_len: 36 | st = random.randint(0, audio1.shape[1] - self.max_len - 1) 37 | ed = st + self.max_len 38 | ans1 = audio1[:, st:ed] 39 | else: 40 | ans1[:, :audio1.shape[1]] = audio1 41 | if audio2.shape[1] > self.max_len: 42 | st = random.randint(0, audio2.shape[1] - self.max_len - 1) 43 | ed = st + self.max_len 44 | ans2 = audio2[:, st:ed] 45 | else: 46 | ans2[:, :audio2.shape[1]] = audio2 47 | ans = ans1 + ans2 48 | return ans 49 | else: 50 | ans = torch.zeros(1, self.max_len) 51 | audio = torchaudio.load(self.filenames[index])[0] 52 | if audio.shape[1] > self.max_len: 53 | st = random.randint(0, audio.shape[1] - self.max_len - 1) 54 | ed = st + self.max_len 55 | return audio[:, st:ed] 56 | else: 57 | ans[:, :audio.shape[1]] = audio 58 | return ans 59 | -------------------------------------------------------------------------------- /academicodec/models/encodec/net3.py: -------------------------------------------------------------------------------- 1 | import math 2 | import random 3 | 4 | import numpy as np 5 | import torch.nn as nn 6 | from academicodec.modules.seanet import SEANetDecoder 7 | from academicodec.modules.seanet import SEANetEncoder 8 | from academicodec.quantization import ResidualVectorQuantizer 9 | 10 | 11 | # Generator 12 | class SoundStream(nn.Module): 13 | def __init__(self, 14 | n_filters, 15 | D, 16 | target_bandwidths=[7.5, 15], 17 | ratios=[8, 5, 4, 2], 18 | sample_rate=24000, 19 | bins=1024, 20 | normalize=False): 21 | super().__init__() 22 | self.hop_length = np.prod(ratios) # 计算乘积 23 | self.encoder = SEANetEncoder( 24 | n_filters=n_filters, dimension=D, ratios=ratios) 25 | n_q = int(1000 * target_bandwidths[-1] // 26 | (math.ceil(sample_rate / self.hop_length) * 10)) 27 | self.frame_rate = math.ceil(sample_rate / np.prod(ratios)) # 75 28 | self.bits_per_codebook = int(math.log2(bins)) 29 | self.target_bandwidths = target_bandwidths 30 | self.quantizer = ResidualVectorQuantizer( 31 | dimension=D, n_q=n_q, bins=bins) 32 | self.decoder = SEANetDecoder( 33 | n_filters=n_filters, dimension=D, ratios=ratios) 34 | 35 | def get_last_layer(self): 36 | return self.decoder.layers[-1].weight 37 | 38 | def forward(self, x): 39 | e = self.encoder(x) 40 | max_idx = len(self.target_bandwidths) - 1 41 | bw = self.target_bandwidths[random.randint(0, max_idx)] 42 | quantized, codes, bandwidth, commit_loss = self.quantizer( 43 | e, self.frame_rate, bw) 44 | o = self.decoder(quantized) 45 | return o, commit_loss, None 46 | 47 | def encode(self, x, target_bw=None, st=None): 48 | e = self.encoder(x) 49 | if target_bw is None: 50 | bw = self.target_bandwidths[-1] 51 | else: 52 | bw = target_bw 53 | if st is None: 54 | st = 0 55 | codes = self.quantizer.encode(e, self.frame_rate, bw, st) 56 | return codes 57 | 58 | def decode(self, codes): 59 | quantized = self.quantizer.decode(codes) 60 | o = self.decoder(quantized) 61 | return o 62 | -------------------------------------------------------------------------------- /cosyvoice/cli/zh_normalization/constants.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 
PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | import re 15 | import string 16 | 17 | from pypinyin.constants import SUPPORT_UCS4 18 | 19 | # 全角半角转换 20 | # 英文字符全角 -> 半角映射表 (num: 52) 21 | F2H_ASCII_LETTERS = { 22 | ord(char) + 65248: ord(char) 23 | for char in string.ascii_letters 24 | } 25 | 26 | # 英文字符半角 -> 全角映射表 27 | H2F_ASCII_LETTERS = {value: key for key, value in F2H_ASCII_LETTERS.items()} 28 | 29 | # 数字字符全角 -> 半角映射表 (num: 10) 30 | F2H_DIGITS = {ord(char) + 65248: ord(char) for char in string.digits} 31 | # 数字字符半角 -> 全角映射表 32 | H2F_DIGITS = {value: key for key, value in F2H_DIGITS.items()} 33 | 34 | # 标点符号全角 -> 半角映射表 (num: 32) 35 | F2H_PUNCTUATIONS = {ord(char) + 65248: ord(char) for char in string.punctuation} 36 | # 标点符号半角 -> 全角映射表 37 | H2F_PUNCTUATIONS = {value: key for key, value in F2H_PUNCTUATIONS.items()} 38 | 39 | # 空格 (num: 1) 40 | F2H_SPACE = {'\u3000': ' '} 41 | H2F_SPACE = {' ': '\u3000'} 42 | 43 | # 非"有拼音的汉字"的字符串,可用于NSW提取 44 | if SUPPORT_UCS4: 45 | RE_NSW = re.compile(r'(?:[^' 46 | r'\u3007' # 〇 47 | r'\u3400-\u4dbf' # CJK扩展A:[3400-4DBF] 48 | r'\u4e00-\u9fff' # CJK基本:[4E00-9FFF] 49 | r'\uf900-\ufaff' # CJK兼容:[F900-FAFF] 50 | r'\U00020000-\U0002A6DF' # CJK扩展B:[20000-2A6DF] 51 | r'\U0002A703-\U0002B73F' # CJK扩展C:[2A700-2B73F] 52 | r'\U0002B740-\U0002B81D' # CJK扩展D:[2B740-2B81D] 53 | r'\U0002F80A-\U0002FA1F' # CJK兼容扩展:[2F800-2FA1F] 54 | r'])+') 55 | else: 56 | RE_NSW = re.compile( # pragma: no cover 57 | r'(?:[^' 58 | r'\u3007' # 〇 59 | r'\u3400-\u4dbf' # CJK扩展A:[3400-4DBF] 60 | r'\u4e00-\u9fff' # CJK基本:[4E00-9FFF] 61 | r'\uf900-\ufaff' # CJK兼容:[F900-FAFF] 62 | r'])+') 63 | -------------------------------------------------------------------------------- /cosyvoice/utils/class_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright [2023-11-28] 2 | # 2024 Alibaba Inc (authors: Xiang Lyu) 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | import torch 16 | 17 | from cosyvoice.transformer.activation import Swish 18 | from cosyvoice.transformer.subsampling import ( 19 | LinearNoSubsampling, 20 | EmbedinigNoSubsampling, 21 | Conv1dSubsampling2, 22 | Conv2dSubsampling4, 23 | Conv2dSubsampling6, 24 | Conv2dSubsampling8, 25 | ) 26 | from cosyvoice.transformer.embedding import (PositionalEncoding, 27 | RelPositionalEncoding, 28 | WhisperPositionalEncoding, 29 | LearnablePositionalEncoding, 30 | NoPositionalEncoding) 31 | from cosyvoice.transformer.attention import (MultiHeadedAttention, 32 | RelPositionMultiHeadedAttention) 33 | from cosyvoice.transformer.embedding import EspnetRelPositionalEncoding 34 | from cosyvoice.transformer.subsampling import LegacyLinearNoSubsampling 35 | 36 | 37 | COSYVOICE_ACTIVATION_CLASSES = { 38 | "hardtanh": torch.nn.Hardtanh, 39 | "tanh": torch.nn.Tanh, 40 | "relu": torch.nn.ReLU, 41 | "selu": torch.nn.SELU, 42 | "swish": getattr(torch.nn, "SiLU", Swish), 43 | "gelu": torch.nn.GELU, 44 | } 45 | 46 | COSYVOICE_SUBSAMPLE_CLASSES = { 47 | "linear": LinearNoSubsampling, 48 | "linear_legacy": LegacyLinearNoSubsampling, 49 | "embed": EmbedinigNoSubsampling, 50 | "conv1d2": Conv1dSubsampling2, 51 | "conv2d": Conv2dSubsampling4, 52 | "conv2d6": Conv2dSubsampling6, 53 | "conv2d8": Conv2dSubsampling8, 54 | 'paraformer_dummy': torch.nn.Identity 55 | } 56 | 57 | COSYVOICE_EMB_CLASSES = { 58 | "embed": PositionalEncoding, 59 | "abs_pos": PositionalEncoding, 60 | "rel_pos": RelPositionalEncoding, 61 | "rel_pos_espnet": EspnetRelPositionalEncoding, 62 | "no_pos": NoPositionalEncoding, 63 | "abs_pos_whisper": WhisperPositionalEncoding, 64 | "embed_learnable_pe": LearnablePositionalEncoding, 65 | } 66 | 67 | COSYVOICE_ATTENTION_CLASSES = { 68 | "selfattn": MultiHeadedAttention, 69 | "rel_selfattn": RelPositionMultiHeadedAttention, 70 | } 71 | -------------------------------------------------------------------------------- /matcha/hifigan/denoiser.py: -------------------------------------------------------------------------------- 1 | # Code modified from Rafael Valle's implementation https://github.com/NVIDIA/waveglow/blob/5bc2a53e20b3b533362f974cfa1ea0267ae1c2b1/denoiser.py 2 | 3 | """Waveglow style denoiser can be used to remove the artifacts from the HiFiGAN generated audio.""" 4 | import torch 5 | 6 | 7 | class Denoiser(torch.nn.Module): 8 | """Removes model bias from audio produced with waveglow""" 9 | 10 | def __init__(self, vocoder, filter_length=1024, n_overlap=4, win_length=1024, mode="zeros"): 11 | super().__init__() 12 | self.filter_length = filter_length 13 | self.hop_length = int(filter_length / n_overlap) 14 | self.win_length = win_length 15 | 16 | dtype, device = next(vocoder.parameters()).dtype, next(vocoder.parameters()).device 17 | self.device = device 18 | if mode == "zeros": 19 | mel_input = torch.zeros((1, 80, 88), dtype=dtype, device=device) 20 | elif mode == "normal": 21 | mel_input = torch.randn((1, 80, 88), dtype=dtype, device=device) 22 | else: 23 | raise Exception(f"Mode {mode} if not supported") 24 | 25 | def stft_fn(audio, n_fft, hop_length, win_length, window): 26 | spec = torch.stft( 27 | audio, 28 | n_fft=n_fft, 29 | hop_length=hop_length, 30 | win_length=win_length, 31 | window=window, 32 | return_complex=True, 33 | ) 34 | spec = torch.view_as_real(spec) 35 | return torch.sqrt(spec.pow(2).sum(-1)), torch.atan2(spec[..., -1], spec[..., 0]) 36 | 37 | self.stft = lambda x: stft_fn( 38 | audio=x, 39 | n_fft=self.filter_length, 40 | hop_length=self.hop_length, 41 | 
win_length=self.win_length, 42 | window=torch.hann_window(self.win_length, device=device), 43 | ) 44 | self.istft = lambda x, y: torch.istft( 45 | torch.complex(x * torch.cos(y), x * torch.sin(y)), 46 | n_fft=self.filter_length, 47 | hop_length=self.hop_length, 48 | win_length=self.win_length, 49 | window=torch.hann_window(self.win_length, device=device), 50 | ) 51 | 52 | with torch.no_grad(): 53 | bias_audio = vocoder(mel_input).float().squeeze(0) 54 | bias_spec, _ = self.stft(bias_audio) 55 | 56 | self.register_buffer("bias_spec", bias_spec[:, :, 0][:, :, None]) 57 | 58 | @torch.inference_mode() 59 | def forward(self, audio, strength=0.0005): 60 | audio_spec, audio_angles = self.stft(audio) 61 | audio_spec_denoised = audio_spec - self.bias_spec.to(audio.device) * strength 62 | audio_spec_denoised = torch.clamp(audio_spec_denoised, 0.0) 63 | audio_denoised = self.istft(audio_spec_denoised, audio_angles) 64 | return audio_denoised 65 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # cosyvoice_simple_api 2 | 3 | ## Project Overview 4 | 5 | `cosyvoice_simple_api` is a simple text-to-speech API server project developed based on Alibaba's CosyVoice. It allows users to easily convert text into emotionally rich voice output, suitable for creating audiobooks, automated voice response systems, and other text-to-speech applications. 6 | 7 | ### Project Addresses 8 | 9 | - CosyVoice Source Address: [FunAudioLLM/CosyVoice](https://github.com/FunAudioLLM/CosyVoice) 10 | - CosyVoice Windows Adaptation (Special thanks to Liu Yue): [v3ucn/CosyVoice_For_Windows](https://github.com/v3ucn/CosyVoice_For_Windows) 11 | - This Project Address: [swordswind/cosyvoice_simple_api](https://github.com/swordswind/cosyvoice_simple_api) 12 | 13 | ## Running Method 14 | 15 | 1. Ensure that a Python environment is installed in your system. 16 | 2. Obtain the project code via `git clone` or by downloading the ZIP file. 17 | 3. In the project root directory, run the following command to install dependencies: 18 | 19 | ```bash 20 | pip install -r requirements.txt 21 | ``` 22 | 23 | 4. Run the following command in the command line to start the server: 24 | 25 | ```bash 26 | python api.py 27 | ``` 28 | 29 | ## Server Address 30 | 31 | The CosyVoice text-to-speech API server address is: `http://your-computer-IP:9881/` 32 | 33 | ## API Interface 34 | 35 | ### Interface Address 36 | 37 | ``` 38 | /cosyvoice/ 39 | ``` 40 | 41 | ### Request Method 42 | 43 | ``` 44 | GET 45 | ``` 46 | 47 | ### Request Parameters 48 | 49 | - `text`: Required, the main text to be synthesized. 50 | 51 | ## Usage Example 52 | 53 | 1. Enter the following address in the browser's address bar: 54 | 55 | ``` 56 | http://127.0.0.1:9881/cosyvoice/?text=Hello, nice to meet you 57 | ``` 58 | 59 | 2. Press Enter, and the server will return a response in the format of a wav audio file. 60 | 61 | ## Changing Reference Audio and Reference Audio Text 62 | 63 | 1. Replace `example.wav` with your custom reference audio, keeping the file name unchanged. 64 | 2. Open `example_reference_audio_text.txt` with Notepad and modify it to your new custom reference audio text. 65 | 3. After modification, save the file and rerun the `CosyVoice Text-to-Speech API Server.bat` file. 66 | 67 | ## Technology Stack 68 | 69 | - FastAPI: Used for building the API server. 70 | - ModelScope: A library related to models. 
71 | - Torch: PyTorch, used for deep learning models. 72 | - TorchAudio: Used for audio processing. 73 | - Uvicorn: ASGI server, used to run FastAPI applications. 74 | 75 | ## Contribution 76 | 77 | Contributions to this project are welcome, including but not limited to fixing bugs, adding new features, and improving documentation. Before submitting a Pull Request, please ensure that your code passes all tests and adheres to the project's coding style. 78 | 79 | ## License 80 | 81 | This project is licensed under the [MIT License](LICENSE). -------------------------------------------------------------------------------- /cosyvoice/transformer/activation.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 Johns Hopkins University (Shinji Watanabe) 2 | # 2020 Northwestern Polytechnical University (Pengcheng Guo) 3 | # 2020 Mobvoi Inc (Binbin Zhang) 4 | # 2024 Alibaba Inc (Xiang Lyu) 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | """Swish() activation function for Conformer.""" 18 | 19 | import torch 20 | from torch import nn, sin, pow 21 | from torch.nn import Parameter 22 | 23 | 24 | class Swish(torch.nn.Module): 25 | """Construct an Swish object.""" 26 | 27 | def forward(self, x: torch.Tensor) -> torch.Tensor: 28 | """Return Swish activation function.""" 29 | return x * torch.sigmoid(x) 30 | 31 | 32 | # Implementation adapted from https://github.com/EdwardDixon/snake under the MIT license. 33 | # LICENSE is in incl_licenses directory. 34 | class Snake(nn.Module): 35 | ''' 36 | Implementation of a sine-based periodic activation function 37 | Shape: 38 | - Input: (B, C, T) 39 | - Output: (B, C, T), same shape as the input 40 | Parameters: 41 | - alpha - trainable parameter 42 | References: 43 | - This activation function is from this paper by Liu Ziyin, Tilman Hartwig, Masahito Ueda: 44 | https://arxiv.org/abs/2006.08195 45 | Examples: 46 | >>> a1 = snake(256) 47 | >>> x = torch.randn(256) 48 | >>> x = a1(x) 49 | ''' 50 | def __init__(self, in_features, alpha=1.0, alpha_trainable=True, alpha_logscale=False): 51 | ''' 52 | Initialization. 53 | INPUT: 54 | - in_features: shape of the input 55 | - alpha: trainable parameter 56 | alpha is initialized to 1 by default, higher values = higher-frequency. 57 | alpha will be trained along with the rest of your model. 58 | ''' 59 | super(Snake, self).__init__() 60 | self.in_features = in_features 61 | 62 | # initialize alpha 63 | self.alpha_logscale = alpha_logscale 64 | if self.alpha_logscale: # log scale alphas initialized to zeros 65 | self.alpha = Parameter(torch.zeros(in_features) * alpha) 66 | else: # linear scale alphas initialized to ones 67 | self.alpha = Parameter(torch.ones(in_features) * alpha) 68 | 69 | self.alpha.requires_grad = alpha_trainable 70 | 71 | self.no_div_by_zero = 0.000000001 72 | 73 | def forward(self, x): 74 | ''' 75 | Forward pass of the function. 76 | Applies the function to the input elementwise. 
77 | Snake ∶= x + 1/a * sin^2 (xa) 78 | ''' 79 | alpha = self.alpha.unsqueeze(0).unsqueeze(-1) # line up with x to [B, C, T] 80 | if self.alpha_logscale: 81 | alpha = torch.exp(alpha) 82 | x = x + (1.0 / (alpha + self.no_div_by_zero)) * pow(sin(x * alpha), 2) 83 | 84 | return x 85 | -------------------------------------------------------------------------------- /cosyvoice/utils/common.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) 2 | # 2024 Alibaba Inc (authors: Xiang Lyu) 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # Modified from ESPnet(https://github.com/espnet/espnet) 16 | """Unility functions for Transformer.""" 17 | 18 | from typing import List 19 | 20 | import torch 21 | 22 | IGNORE_ID = -1 23 | 24 | 25 | def pad_list(xs: List[torch.Tensor], pad_value: int): 26 | """Perform padding for the list of tensors. 27 | 28 | Args: 29 | xs (List): List of Tensors [(T_1, `*`), (T_2, `*`), ..., (T_B, `*`)]. 30 | pad_value (float): Value for padding. 31 | 32 | Returns: 33 | Tensor: Padded tensor (B, Tmax, `*`). 34 | 35 | Examples: 36 | >>> x = [torch.ones(4), torch.ones(2), torch.ones(1)] 37 | >>> x 38 | [tensor([1., 1., 1., 1.]), tensor([1., 1.]), tensor([1.])] 39 | >>> pad_list(x, 0) 40 | tensor([[1., 1., 1., 1.], 41 | [1., 1., 0., 0.], 42 | [1., 0., 0., 0.]]) 43 | 44 | """ 45 | max_len = max([len(item) for item in xs]) 46 | batchs = len(xs) 47 | ndim = xs[0].ndim 48 | if ndim == 1: 49 | pad_res = torch.zeros(batchs, 50 | max_len, 51 | dtype=xs[0].dtype, 52 | device=xs[0].device) 53 | elif ndim == 2: 54 | pad_res = torch.zeros(batchs, 55 | max_len, 56 | xs[0].shape[1], 57 | dtype=xs[0].dtype, 58 | device=xs[0].device) 59 | elif ndim == 3: 60 | pad_res = torch.zeros(batchs, 61 | max_len, 62 | xs[0].shape[1], 63 | xs[0].shape[2], 64 | dtype=xs[0].dtype, 65 | device=xs[0].device) 66 | else: 67 | raise ValueError(f"Unsupported ndim: {ndim}") 68 | pad_res.fill_(pad_value) 69 | for i in range(batchs): 70 | pad_res[i, :len(xs[i])] = xs[i] 71 | return pad_res 72 | 73 | 74 | def th_accuracy(pad_outputs: torch.Tensor, pad_targets: torch.Tensor, 75 | ignore_label: int) -> torch.Tensor: 76 | """Calculate accuracy. 77 | 78 | Args: 79 | pad_outputs (Tensor): Prediction tensors (B * Lmax, D). 80 | pad_targets (LongTensor): Target label tensors (B, Lmax). 81 | ignore_label (int): Ignore label id. 82 | 83 | Returns: 84 | torch.Tensor: Accuracy value (0.0 - 1.0). 
85 | 86 | """ 87 | pad_pred = pad_outputs.view(pad_targets.size(0), pad_targets.size(1), 88 | pad_outputs.size(1)).argmax(2) 89 | mask = pad_targets != ignore_label 90 | numerator = torch.sum( 91 | pad_pred.masked_select(mask) == pad_targets.masked_select(mask)) 92 | denominator = torch.sum(mask) 93 | return (numerator / denominator).detach() 94 | -------------------------------------------------------------------------------- /academicodec/models/encodec/distributed/launch.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------ 2 | # Diffsound 3 | # code based https://github.com/cientgu/VQ-Diffusion 4 | # ------------------------------------------ 5 | import distributed.distributed as dist_fn 6 | import torch 7 | from torch import distributed as dist 8 | from torch import multiprocessing as mp 9 | 10 | # import distributed as dist_fn 11 | 12 | 13 | def find_free_port(): 14 | import socket 15 | 16 | sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) 17 | 18 | sock.bind(("", 0)) 19 | port = sock.getsockname()[1] 20 | sock.close() 21 | 22 | return port 23 | 24 | 25 | def launch(fn, 26 | n_gpu_per_machine, 27 | n_machine=1, 28 | machine_rank=0, 29 | dist_url=None, 30 | args=()): 31 | world_size = n_machine * n_gpu_per_machine 32 | 33 | if world_size > 1: 34 | # if "OMP_NUM_THREADS" not in os.environ: 35 | # os.environ["OMP_NUM_THREADS"] = "1" 36 | if dist_url == "auto": 37 | if n_machine != 1: 38 | raise ValueError( 39 | 'dist_url="auto" not supported in multi-machine jobs') 40 | port = find_free_port() 41 | dist_url = f"tcp://127.0.0.1:{port}" 42 | print('dist_url ', dist_url) 43 | print('n_machine ', n_machine) 44 | print('args ', args) 45 | print('world_size ', world_size) 46 | print('machine_rank ', machine_rank) 47 | if n_machine > 1 and dist_url.startswith("file://"): 48 | raise ValueError( 49 | "file:// is not a reliable init method in multi-machine jobs. Prefer tcp://" 50 | ) 51 | 52 | mp.spawn( 53 | distributed_worker, 54 | nprocs=n_gpu_per_machine, 55 | args=(fn, world_size, n_gpu_per_machine, machine_rank, dist_url, 56 | args), 57 | daemon=False, ) 58 | # n_machine ? world_size 59 | else: 60 | local_rank = 0 61 | fn(local_rank, *args) 62 | 63 | 64 | def distributed_worker(local_rank, fn, world_size, n_gpu_per_machine, 65 | machine_rank, dist_url, args): 66 | if not torch.cuda.is_available(): 67 | raise OSError("CUDA is not available. 
Please check your environments") 68 | 69 | global_rank = machine_rank * n_gpu_per_machine + local_rank 70 | print('local_rank ', local_rank) 71 | print('global_rank ', global_rank) 72 | try: 73 | dist.init_process_group( 74 | backend="NCCL", 75 | init_method=dist_url, 76 | world_size=world_size, 77 | rank=global_rank, ) 78 | 79 | except Exception: 80 | raise OSError("failed to initialize NCCL groups") 81 | 82 | # changed 83 | dist_fn.synchronize() 84 | 85 | if n_gpu_per_machine > torch.cuda.device_count(): 86 | raise ValueError( 87 | f"specified n_gpu_per_machine larger than available device ({torch.cuda.device_count()})" 88 | ) 89 | 90 | torch.cuda.set_device(local_rank) 91 | 92 | if dist_fn.LOCAL_PROCESS_GROUP is not None: 93 | raise ValueError("torch.distributed.LOCAL_PROCESS_GROUP is not None") 94 | 95 | # change paert 96 | 97 | n_machine = world_size // n_gpu_per_machine 98 | for i in range(n_machine): 99 | ranks_on_i = list( 100 | range(i * n_gpu_per_machine, (i + 1) * n_gpu_per_machine)) 101 | pg = dist.new_group(ranks_on_i) 102 | 103 | if i == machine_rank: 104 | dist_fn.LOCAL_PROCESS_GROUP = pg 105 | 106 | fn(local_rank, *args) 107 | -------------------------------------------------------------------------------- /cosyvoice/transformer/label_smoothing_loss.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019 Shigeki Karita 2 | # 2020 Mobvoi Inc (Binbin Zhang) 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Label smoothing module.""" 16 | 17 | import torch 18 | from torch import nn 19 | 20 | 21 | class LabelSmoothingLoss(nn.Module): 22 | """Label-smoothing loss. 23 | 24 | In a standard CE loss, the label's data distribution is: 25 | [0,1,2] -> 26 | [ 27 | [1.0, 0.0, 0.0], 28 | [0.0, 1.0, 0.0], 29 | [0.0, 0.0, 1.0], 30 | ] 31 | 32 | In the smoothing version CE Loss,some probabilities 33 | are taken from the true label prob (1.0) and are divided 34 | among other labels. 35 | 36 | e.g. 
37 | smoothing=0.1 38 | [0,1,2] -> 39 | [ 40 | [0.9, 0.05, 0.05], 41 | [0.05, 0.9, 0.05], 42 | [0.05, 0.05, 0.9], 43 | ] 44 | 45 | Args: 46 | size (int): the number of class 47 | padding_idx (int): padding class id which will be ignored for loss 48 | smoothing (float): smoothing rate (0.0 means the conventional CE) 49 | normalize_length (bool): 50 | normalize loss by sequence length if True 51 | normalize loss by batch size if False 52 | """ 53 | 54 | def __init__(self, 55 | size: int, 56 | padding_idx: int, 57 | smoothing: float, 58 | normalize_length: bool = False): 59 | """Construct an LabelSmoothingLoss object.""" 60 | super(LabelSmoothingLoss, self).__init__() 61 | self.criterion = nn.KLDivLoss(reduction="none") 62 | self.padding_idx = padding_idx 63 | self.confidence = 1.0 - smoothing 64 | self.smoothing = smoothing 65 | self.size = size 66 | self.normalize_length = normalize_length 67 | 68 | def forward(self, x: torch.Tensor, target: torch.Tensor) -> torch.Tensor: 69 | """Compute loss between x and target. 70 | 71 | The model outputs and data labels tensors are flatten to 72 | (batch*seqlen, class) shape and a mask is applied to the 73 | padding part which should not be calculated for loss. 74 | 75 | Args: 76 | x (torch.Tensor): prediction (batch, seqlen, class) 77 | target (torch.Tensor): 78 | target signal masked with self.padding_id (batch, seqlen) 79 | Returns: 80 | loss (torch.Tensor) : The KL loss, scalar float value 81 | """ 82 | assert x.size(2) == self.size 83 | batch_size = x.size(0) 84 | x = x.view(-1, self.size) 85 | target = target.view(-1) 86 | # use zeros_like instead of torch.no_grad() for true_dist, 87 | # since no_grad() can not be exported by JIT 88 | true_dist = torch.zeros_like(x) 89 | true_dist.fill_(self.smoothing / (self.size - 1)) 90 | ignore = target == self.padding_idx # (B,) 91 | total = len(target) - ignore.sum().item() 92 | target = target.masked_fill(ignore, 0) # avoid -1 index 93 | true_dist.scatter_(1, target.unsqueeze(1), self.confidence) 94 | kl = self.criterion(torch.log_softmax(x, dim=1), true_dist) 95 | denom = total if self.normalize_length else batch_size 96 | return kl.masked_fill(ignore.unsqueeze(1), 0).sum() / denom 97 | -------------------------------------------------------------------------------- /cosyvoice/cli/zh_normalization/chronology.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
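A minimal usage sketch for the chronology helpers defined just below (the pattern/handler pairs such as RE_TIME/replace_time and RE_DATE2/replace_date2). The calling text_normlization module is assumed to apply them via Pattern.sub; here they are exercised directly on a toy string, as an illustration rather than the module's actual pipeline:

    # Hedged sketch: apply the regex/handler pairs from chronology.py directly.
    from cosyvoice.cli.zh_normalization.chronology import (
        RE_DATE2, RE_TIME, replace_date2, replace_time)

    text = "会议定于2024-01-05的8:30开始"
    text = RE_DATE2.sub(replace_date2, text)  # 2024-01-05 -> 二零二四年一月五日
    text = RE_TIME.sub(replace_time, text)    # 8:30 -> 八点半
    print(text)
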
14 | import re 15 | 16 | from .num import DIGITS 17 | from .num import num2str 18 | from .num import verbalize_cardinal 19 | from .num import verbalize_digit 20 | 21 | 22 | def _time_num2str(num_string: str) -> str: 23 | """A special case for verbalizing number in time.""" 24 | result = num2str(num_string.lstrip('0')) 25 | if num_string.startswith('0'): 26 | result = DIGITS['0'] + result 27 | return result 28 | 29 | 30 | # 时刻表达式 31 | RE_TIME = re.compile(r'([0-1]?[0-9]|2[0-3])' 32 | r':([0-5][0-9])' 33 | r'(:([0-5][0-9]))?') 34 | 35 | # 时间范围,如8:30-12:30 36 | RE_TIME_RANGE = re.compile(r'([0-1]?[0-9]|2[0-3])' 37 | r':([0-5][0-9])' 38 | r'(:([0-5][0-9]))?' 39 | r'(~|-)' 40 | r'([0-1]?[0-9]|2[0-3])' 41 | r':([0-5][0-9])' 42 | r'(:([0-5][0-9]))?') 43 | 44 | 45 | def replace_time(match) -> str: 46 | """ 47 | Args: 48 | match (re.Match) 49 | Returns: 50 | str 51 | """ 52 | 53 | is_range = len(match.groups()) > 5 54 | 55 | hour = match.group(1) 56 | minute = match.group(2) 57 | second = match.group(4) 58 | 59 | if is_range: 60 | hour_2 = match.group(6) 61 | minute_2 = match.group(7) 62 | second_2 = match.group(9) 63 | 64 | result = f"{num2str(hour)}点" 65 | if minute.lstrip('0'): 66 | if int(minute) == 30: 67 | result += "半" 68 | else: 69 | result += f"{_time_num2str(minute)}分" 70 | if second and second.lstrip('0'): 71 | result += f"{_time_num2str(second)}秒" 72 | 73 | if is_range: 74 | result += "至" 75 | result += f"{num2str(hour_2)}点" 76 | if minute_2.lstrip('0'): 77 | if int(minute) == 30: 78 | result += "半" 79 | else: 80 | result += f"{_time_num2str(minute_2)}分" 81 | if second_2 and second_2.lstrip('0'): 82 | result += f"{_time_num2str(second_2)}秒" 83 | 84 | return result 85 | 86 | 87 | RE_DATE = re.compile(r'(\d{4}|\d{2})年' 88 | r'((0?[1-9]|1[0-2])月)?' 89 | r'(((0?[1-9])|((1|2)[0-9])|30|31)([日号]))?') 90 | 91 | 92 | def replace_date(match) -> str: 93 | """ 94 | Args: 95 | match (re.Match) 96 | Returns: 97 | str 98 | """ 99 | year = match.group(1) 100 | month = match.group(3) 101 | day = match.group(5) 102 | result = "" 103 | if year: 104 | result += f"{verbalize_digit(year)}年" 105 | if month: 106 | result += f"{verbalize_cardinal(month)}月" 107 | if day: 108 | result += f"{verbalize_cardinal(day)}{match.group(9)}" 109 | return result 110 | 111 | 112 | # 用 / 或者 - 分隔的 YY/MM/DD 或者 YY-MM-DD 日期 113 | RE_DATE2 = re.compile( 114 | r'(\d{4})([- /.])(0[1-9]|1[012])\2(0[1-9]|[12][0-9]|3[01])') 115 | 116 | 117 | def replace_date2(match) -> str: 118 | """ 119 | Args: 120 | match (re.Match) 121 | Returns: 122 | str 123 | """ 124 | year = match.group(1) 125 | month = match.group(3) 126 | day = match.group(4) 127 | result = "" 128 | if year: 129 | result += f"{verbalize_digit(year)}年" 130 | if month: 131 | result += f"{verbalize_cardinal(month)}月" 132 | if day: 133 | result += f"{verbalize_cardinal(day)}日" 134 | return result 135 | -------------------------------------------------------------------------------- /academicodec/models/encodec/distributed/distributed.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------ 2 | # Diffsound 3 | # code based https://github.com/cientgu/VQ-Diffusion 4 | # ------------------------------------------ 5 | import pickle 6 | 7 | import torch 8 | from torch import distributed as dist 9 | from torch.utils import data 10 | 11 | LOCAL_PROCESS_GROUP = None 12 | 13 | 14 | def is_primary(): 15 | return get_rank() == 0 16 | 17 | 18 | def get_rank(): 19 | if not dist.is_available(): 20 | return 0 21 | 22 | 
if not dist.is_initialized(): 23 | return 0 24 | 25 | return dist.get_rank() 26 | 27 | 28 | def get_local_rank(): 29 | if not dist.is_available(): 30 | return 0 31 | 32 | if not dist.is_initialized(): 33 | return 0 34 | 35 | if LOCAL_PROCESS_GROUP is None: 36 | raise ValueError("tensorfn.distributed.LOCAL_PROCESS_GROUP is None") 37 | 38 | return dist.get_rank(group=LOCAL_PROCESS_GROUP) 39 | 40 | 41 | def synchronize(): 42 | if not dist.is_available(): 43 | return 44 | 45 | if not dist.is_initialized(): 46 | return 47 | 48 | world_size = dist.get_world_size() 49 | 50 | if world_size == 1: 51 | return 52 | 53 | dist.barrier() 54 | 55 | 56 | def get_world_size(): 57 | if not dist.is_available(): 58 | return 1 59 | 60 | if not dist.is_initialized(): 61 | return 1 62 | 63 | return dist.get_world_size() 64 | 65 | 66 | def is_distributed(): 67 | raise RuntimeError('Please debug this function!') 68 | return get_world_size() > 1 69 | 70 | 71 | def all_reduce(tensor, op=dist.ReduceOp.SUM, async_op=False): 72 | world_size = get_world_size() 73 | 74 | if world_size == 1: 75 | return tensor 76 | dist.all_reduce(tensor, op=op, async_op=async_op) 77 | 78 | return tensor 79 | 80 | 81 | def all_gather(data): 82 | world_size = get_world_size() 83 | 84 | if world_size == 1: 85 | return [data] 86 | 87 | buffer = pickle.dumps(data) 88 | storage = torch.ByteStorage.from_buffer(buffer) 89 | tensor = torch.ByteTensor(storage).to("cuda") 90 | 91 | local_size = torch.IntTensor([tensor.numel()]).to("cuda") 92 | size_list = [torch.IntTensor([1]).to("cuda") for _ in range(world_size)] 93 | dist.all_gather(size_list, local_size) 94 | size_list = [int(size.item()) for size in size_list] 95 | max_size = max(size_list) 96 | 97 | tensor_list = [] 98 | for _ in size_list: 99 | tensor_list.append(torch.ByteTensor(size=(max_size, )).to("cuda")) 100 | 101 | if local_size != max_size: 102 | padding = torch.ByteTensor(size=(max_size - local_size, )).to("cuda") 103 | tensor = torch.cat((tensor, padding), 0) 104 | 105 | dist.all_gather(tensor_list, tensor) 106 | 107 | data_list = [] 108 | 109 | for size, tensor in zip(size_list, tensor_list): 110 | buffer = tensor.cpu().numpy().tobytes()[:size] 111 | data_list.append(pickle.loads(buffer)) 112 | 113 | return data_list 114 | 115 | 116 | def reduce_dict(input_dict, average=True): 117 | world_size = get_world_size() 118 | 119 | if world_size < 2: 120 | return input_dict 121 | 122 | with torch.no_grad(): 123 | keys = [] 124 | values = [] 125 | 126 | for k in sorted(input_dict.keys()): 127 | keys.append(k) 128 | values.append(input_dict[k]) 129 | 130 | values = torch.stack(values, 0) 131 | dist.reduce(values, dst=0) 132 | 133 | if dist.get_rank() == 0 and average: 134 | values /= world_size 135 | 136 | reduced_dict = {k: v for k, v in zip(keys, values)} 137 | 138 | return reduced_dict 139 | 140 | 141 | def data_sampler(dataset, shuffle, distributed): 142 | if distributed: 143 | return data.distributed.DistributedSampler(dataset, shuffle=shuffle) 144 | 145 | if shuffle: 146 | return data.RandomSampler(dataset) 147 | 148 | else: 149 | return data.SequentialSampler(dataset) 150 | -------------------------------------------------------------------------------- /cosyvoice/transformer/positionwise_feed_forward.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019 Shigeki Karita 2 | # 2020 Mobvoi Inc (Binbin Zhang) 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in 
compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Positionwise feed forward layer definition.""" 16 | 17 | import torch 18 | 19 | 20 | class PositionwiseFeedForward(torch.nn.Module): 21 | """Positionwise feed forward layer. 22 | 23 | FeedForward are appied on each position of the sequence. 24 | The output dim is same with the input dim. 25 | 26 | Args: 27 | idim (int): Input dimenstion. 28 | hidden_units (int): The number of hidden units. 29 | dropout_rate (float): Dropout rate. 30 | activation (torch.nn.Module): Activation function 31 | """ 32 | 33 | def __init__( 34 | self, 35 | idim: int, 36 | hidden_units: int, 37 | dropout_rate: float, 38 | activation: torch.nn.Module = torch.nn.ReLU(), 39 | ): 40 | """Construct a PositionwiseFeedForward object.""" 41 | super(PositionwiseFeedForward, self).__init__() 42 | self.w_1 = torch.nn.Linear(idim, hidden_units) 43 | self.activation = activation 44 | self.dropout = torch.nn.Dropout(dropout_rate) 45 | self.w_2 = torch.nn.Linear(hidden_units, idim) 46 | 47 | def forward(self, xs: torch.Tensor) -> torch.Tensor: 48 | """Forward function. 49 | 50 | Args: 51 | xs: input tensor (B, L, D) 52 | Returns: 53 | output tensor, (B, L, D) 54 | """ 55 | return self.w_2(self.dropout(self.activation(self.w_1(xs)))) 56 | 57 | 58 | class MoEFFNLayer(torch.nn.Module): 59 | """ 60 | Mixture of expert with Positionwise feed forward layer 61 | See also figure 1 in https://arxiv.org/pdf/2305.15663.pdf 62 | The output dim is same with the input dim. 63 | 64 | Modified from https://github.com/Lightning-AI/lit-gpt/pull/823 65 | https://github.com/mistralai/mistral-src/blob/b46d6/moe_one_file_ref.py#L203-L219 66 | Args: 67 | n_expert: number of expert. 68 | n_expert_per_token: The actual number of experts used for each frame 69 | idim (int): Input dimenstion. 70 | hidden_units (int): The number of hidden units. 71 | dropout_rate (float): Dropout rate. 72 | activation (torch.nn.Module): Activation function 73 | """ 74 | 75 | def __init__( 76 | self, 77 | n_expert: int, 78 | n_expert_per_token: int, 79 | idim: int, 80 | hidden_units: int, 81 | dropout_rate: float, 82 | activation: torch.nn.Module = torch.nn.ReLU(), 83 | ): 84 | super(MoEFFNLayer, self).__init__() 85 | self.gate = torch.nn.Linear(idim, n_expert, bias=False) 86 | self.experts = torch.nn.ModuleList( 87 | PositionwiseFeedForward(idim, hidden_units, dropout_rate, 88 | activation) for _ in range(n_expert)) 89 | self.n_expert_per_token = n_expert_per_token 90 | 91 | def forward(self, xs: torch.Tensor) -> torch.Tensor: 92 | """Foward function. 
93 | Args: 94 | xs: input tensor (B, L, D) 95 | Returns: 96 | output tensor, (B, L, D) 97 | 98 | """ 99 | B, L, D = xs.size( 100 | ) # batch size, sequence length, embedding dimension (idim) 101 | xs = xs.view(-1, D) # (B*L, D) 102 | router = self.gate(xs) # (B*L, n_expert) 103 | logits, indices = torch.topk( 104 | router, self.n_expert_per_token 105 | ) # probs:(B*L, n_expert), indices: (B*L, n_expert) 106 | weights = torch.nn.functional.softmax( 107 | logits, dim=1, 108 | dtype=torch.float).to(dtype=xs.dtype) # (B*L, n_expert_per_token) 109 | output = torch.zeros_like(xs) # (B*L, D) 110 | for i, expert in enumerate(self.experts): 111 | mask = indices == i 112 | batch_idx, ith_expert = torch.where(mask) 113 | output[batch_idx] += weights[batch_idx, ith_expert, None] * expert( 114 | xs[batch_idx]) 115 | return output.view(B, L, D) 116 | -------------------------------------------------------------------------------- /academicodec/quantization/distrib.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | """Torch distributed utilities.""" 7 | import typing as tp 8 | 9 | import torch 10 | 11 | 12 | def rank(): 13 | if torch.distributed.is_initialized(): 14 | return torch.distributed.get_rank() 15 | else: 16 | return 0 17 | 18 | 19 | def world_size(): 20 | if torch.distributed.is_initialized(): 21 | return torch.distributed.get_world_size() 22 | else: 23 | return 1 24 | 25 | 26 | def is_distributed(): 27 | return world_size() > 1 28 | 29 | 30 | def all_reduce(tensor: torch.Tensor, op=torch.distributed.ReduceOp.SUM): 31 | if is_distributed(): 32 | return torch.distributed.all_reduce(tensor, op) 33 | 34 | 35 | def _is_complex_or_float(tensor): 36 | return torch.is_floating_point(tensor) or torch.is_complex(tensor) 37 | 38 | 39 | def _check_number_of_params(params: tp.List[torch.Tensor]): 40 | # utility function to check that the number of params in all workers is the same, 41 | # and thus avoid a deadlock with distributed all reduce. 42 | if not is_distributed() or not params: 43 | return 44 | #print('params[0].device ', params[0].device) 45 | tensor = torch.tensor( 46 | [len(params)], device=params[0].device, dtype=torch.long) 47 | all_reduce(tensor) 48 | if tensor.item() != len(params) * world_size(): 49 | # If not all the workers have the same number, for at least one of them, 50 | # this inequality will be verified. 51 | raise RuntimeError( 52 | f"Mismatch in number of params: ours is {len(params)}, " 53 | "at least one worker has a different one.") 54 | 55 | 56 | def broadcast_tensors(tensors: tp.Iterable[torch.Tensor], src: int=0): 57 | """Broadcast the tensors from the given parameters to all workers. 58 | This can be used to ensure that all workers have the same model to start with. 59 | """ 60 | if not is_distributed(): 61 | return 62 | tensors = [tensor for tensor in tensors if _is_complex_or_float(tensor)] 63 | _check_number_of_params(tensors) 64 | handles = [] 65 | for tensor in tensors: 66 | # src = int(rank()) # added code 67 | handle = torch.distributed.broadcast( 68 | tensor.data, src=src, async_op=True) 69 | handles.append(handle) 70 | for handle in handles: 71 | handle.wait() 72 | 73 | 74 | def sync_buffer(buffers, average=True): 75 | """ 76 | Sync grad for buffers. If average is False, broadcast instead of averaging. 
77 | """ 78 | if not is_distributed(): 79 | return 80 | handles = [] 81 | for buffer in buffers: 82 | if torch.is_floating_point(buffer.data): 83 | if average: 84 | handle = torch.distributed.all_reduce( 85 | buffer.data, 86 | op=torch.distributed.ReduceOp.SUM, 87 | async_op=True) 88 | else: 89 | handle = torch.distributed.broadcast( 90 | buffer.data, src=0, async_op=True) 91 | handles.append((buffer, handle)) 92 | for buffer, handle in handles: 93 | handle.wait() 94 | if average: 95 | buffer.data /= world_size 96 | 97 | 98 | def sync_grad(params): 99 | """ 100 | Simpler alternative to DistributedDataParallel, that doesn't rely 101 | on any black magic. For simple models it can also be as fast. 102 | Just call this on your model parameters after the call to backward! 103 | """ 104 | if not is_distributed(): 105 | return 106 | handles = [] 107 | for p in params: 108 | if p.grad is not None: 109 | handle = torch.distributed.all_reduce( 110 | p.grad.data, op=torch.distributed.ReduceOp.SUM, async_op=True) 111 | handles.append((p, handle)) 112 | for p, handle in handles: 113 | handle.wait() 114 | p.grad.data /= world_size() 115 | 116 | 117 | def average_metrics(metrics: tp.Dict[str, float], count=1.): 118 | """Average a dictionary of metrics across all workers, using the optional 119 | `count` as unormalized weight. 120 | """ 121 | if not is_distributed(): 122 | return metrics 123 | keys, values = zip(*metrics.items()) 124 | device = 'cuda' if torch.cuda.is_available() else 'cpu' 125 | tensor = torch.tensor( 126 | list(values) + [1], device=device, dtype=torch.float32) 127 | tensor *= count 128 | all_reduce(tensor) 129 | averaged = (tensor[:-1] / tensor[-1]).cpu().tolist() 130 | return dict(zip(keys, averaged)) 131 | -------------------------------------------------------------------------------- /matcha/models/components/flow_matching.py: -------------------------------------------------------------------------------- 1 | from abc import ABC 2 | 3 | import torch 4 | import torch.nn.functional as F 5 | 6 | from matcha.models.components.decoder import Decoder 7 | from matcha.utils.pylogger import get_pylogger 8 | 9 | log = get_pylogger(__name__) 10 | 11 | 12 | class BASECFM(torch.nn.Module, ABC): 13 | def __init__( 14 | self, 15 | n_feats, 16 | cfm_params, 17 | n_spks=1, 18 | spk_emb_dim=128, 19 | ): 20 | super().__init__() 21 | self.n_feats = n_feats 22 | self.n_spks = n_spks 23 | self.spk_emb_dim = spk_emb_dim 24 | self.solver = cfm_params.solver 25 | if hasattr(cfm_params, "sigma_min"): 26 | self.sigma_min = cfm_params.sigma_min 27 | else: 28 | self.sigma_min = 1e-4 29 | 30 | self.estimator = None 31 | 32 | @torch.inference_mode() 33 | def forward(self, mu, mask, n_timesteps, temperature=1.0, spks=None, cond=None): 34 | """Forward diffusion 35 | 36 | Args: 37 | mu (torch.Tensor): output of encoder 38 | shape: (batch_size, n_feats, mel_timesteps) 39 | mask (torch.Tensor): output_mask 40 | shape: (batch_size, 1, mel_timesteps) 41 | n_timesteps (int): number of diffusion steps 42 | temperature (float, optional): temperature for scaling noise. Defaults to 1.0. 43 | spks (torch.Tensor, optional): speaker ids. Defaults to None. 
44 | shape: (batch_size, spk_emb_dim) 45 | cond: Not used but kept for future purposes 46 | 47 | Returns: 48 | sample: generated mel-spectrogram 49 | shape: (batch_size, n_feats, mel_timesteps) 50 | """ 51 | z = torch.randn_like(mu) * temperature 52 | t_span = torch.linspace(0, 1, n_timesteps + 1, device=mu.device) 53 | return self.solve_euler(z, t_span=t_span, mu=mu, mask=mask, spks=spks, cond=cond) 54 | 55 | def solve_euler(self, x, t_span, mu, mask, spks, cond): 56 | """ 57 | Fixed euler solver for ODEs. 58 | Args: 59 | x (torch.Tensor): random noise 60 | t_span (torch.Tensor): n_timesteps interpolated 61 | shape: (n_timesteps + 1,) 62 | mu (torch.Tensor): output of encoder 63 | shape: (batch_size, n_feats, mel_timesteps) 64 | mask (torch.Tensor): output_mask 65 | shape: (batch_size, 1, mel_timesteps) 66 | spks (torch.Tensor, optional): speaker ids. Defaults to None. 67 | shape: (batch_size, spk_emb_dim) 68 | cond: Not used but kept for future purposes 69 | """ 70 | t, _, dt = t_span[0], t_span[-1], t_span[1] - t_span[0] 71 | 72 | # I am storing this because I can later plot it by putting a debugger here and saving it to a file 73 | # Or in future might add like a return_all_steps flag 74 | sol = [] 75 | 76 | for step in range(1, len(t_span)): 77 | dphi_dt = self.estimator(x, mask, mu, t, spks, cond) 78 | 79 | x = x + dt * dphi_dt 80 | t = t + dt 81 | sol.append(x) 82 | if step < len(t_span) - 1: 83 | dt = t_span[step + 1] - t 84 | 85 | return sol[-1] 86 | 87 | def compute_loss(self, x1, mask, mu, spks=None, cond=None): 88 | """Computes diffusion loss 89 | 90 | Args: 91 | x1 (torch.Tensor): Target 92 | shape: (batch_size, n_feats, mel_timesteps) 93 | mask (torch.Tensor): target mask 94 | shape: (batch_size, 1, mel_timesteps) 95 | mu (torch.Tensor): output of encoder 96 | shape: (batch_size, n_feats, mel_timesteps) 97 | spks (torch.Tensor, optional): speaker embedding. Defaults to None. 98 | shape: (batch_size, spk_emb_dim) 99 | 100 | Returns: 101 | loss: conditional flow matching loss 102 | y: conditional flow 103 | shape: (batch_size, n_feats, mel_timesteps) 104 | """ 105 | b, _, t = mu.shape 106 | 107 | # random timestep 108 | t = torch.rand([b, 1, 1], device=mu.device, dtype=mu.dtype) 109 | # sample noise p(x_0) 110 | z = torch.randn_like(x1) 111 | 112 | y = (1 - (1 - self.sigma_min) * t) * z + t * x1 113 | u = x1 - (1 - self.sigma_min) * z 114 | 115 | loss = F.mse_loss(self.estimator(y, mask, mu, t.squeeze(), spks), u, reduction="sum") / ( 116 | torch.sum(mask) * u.shape[1] 117 | ) 118 | return loss, y 119 | 120 | 121 | class CFM(BASECFM): 122 | def __init__(self, in_channels, out_channel, cfm_params, decoder_params, n_spks=1, spk_emb_dim=64): 123 | super().__init__( 124 | n_feats=in_channels, 125 | cfm_params=cfm_params, 126 | n_spks=n_spks, 127 | spk_emb_dim=spk_emb_dim, 128 | ) 129 | 130 | in_channels = in_channels + (spk_emb_dim if n_spks > 1 else 0) 131 | # Just change the architecture of the estimator here 132 | self.estimator = Decoder(in_channels=in_channels, out_channels=out_channel, **decoder_params) 133 | -------------------------------------------------------------------------------- /academicodec/quantization/vq.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
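A minimal sketch of how the ResidualVectorQuantizer defined below is typically driven. The shapes, the 50 Hz frame rate, and the presence of the companion academicodec.quantization.core_vq module (imported by this file) are assumptions for illustration; with untrained codebooks this is only a smoke test:

    # Hedged sketch: encode latent frames to code indices and decode them back.
    import torch
    from academicodec.quantization.vq import ResidualVectorQuantizer

    rvq = ResidualVectorQuantizer(dimension=256, n_q=8, bins=1024)
    x = torch.randn(2, 256, 50)               # (B, dimension, T) latent frames
    codes = rvq.encode(x, sample_rate=50)     # (n_q, B, T) codebook indices
    x_hat = rvq.decode(codes)                 # (B, dimension, T) quantized latents
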
6 | """Residual vector quantizer implementation.""" 7 | import math 8 | import typing as tp 9 | from dataclasses import dataclass 10 | from dataclasses import field 11 | 12 | import torch 13 | from torch import nn 14 | 15 | from academicodec.quantization.core_vq import ResidualVectorQuantization 16 | 17 | 18 | @dataclass 19 | class QuantizedResult: 20 | quantized: torch.Tensor 21 | codes: torch.Tensor 22 | bandwidth: torch.Tensor # bandwidth in kb/s used, per batch item. 23 | penalty: tp.Optional[torch.Tensor] = None 24 | metrics: dict = field(default_factory=dict) 25 | 26 | 27 | class ResidualVectorQuantizer(nn.Module): 28 | """Residual Vector Quantizer. 29 | Args: 30 | dimension (int): Dimension of the codebooks. 31 | n_q (int): Number of residual vector quantizers used. 32 | bins (int): Codebook size. 33 | decay (float): Decay for exponential moving average over the codebooks. 34 | kmeans_init (bool): Whether to use kmeans to initialize the codebooks. 35 | kmeans_iters (int): Number of iterations used for kmeans initialization. 36 | threshold_ema_dead_code (int): Threshold for dead code expiration. Replace any codes 37 | that have an exponential moving average cluster size less than the specified threshold with 38 | randomly selected vector from the current batch. 39 | """ 40 | 41 | def __init__( 42 | self, 43 | dimension: int=256, 44 | n_q: int=8, 45 | bins: int=1024, 46 | decay: float=0.99, 47 | kmeans_init: bool=True, 48 | kmeans_iters: int=50, 49 | threshold_ema_dead_code: int=2, ): 50 | super().__init__() 51 | self.n_q = n_q 52 | self.dimension = dimension 53 | self.bins = bins 54 | self.decay = decay 55 | self.kmeans_init = kmeans_init 56 | self.kmeans_iters = kmeans_iters 57 | self.threshold_ema_dead_code = threshold_ema_dead_code 58 | self.vq = ResidualVectorQuantization( 59 | dim=self.dimension, 60 | codebook_size=self.bins, 61 | num_quantizers=self.n_q, 62 | decay=self.decay, 63 | kmeans_init=self.kmeans_init, 64 | kmeans_iters=self.kmeans_iters, 65 | threshold_ema_dead_code=self.threshold_ema_dead_code, ) 66 | 67 | def forward(self, 68 | x: torch.Tensor, 69 | sample_rate: int, 70 | bandwidth: tp.Optional[float]=None) -> QuantizedResult: 71 | """Residual vector quantization on the given input tensor. 72 | Args: 73 | x (torch.Tensor): Input tensor. 74 | sample_rate (int): Sample rate of the input tensor. 75 | bandwidth (float): Target bandwidth. 76 | Returns: 77 | QuantizedResult: 78 | The quantized (or approximately quantized) representation with 79 | the associated bandwidth and any penalty term for the loss. 80 | """ 81 | bw_per_q = self.get_bandwidth_per_quantizer(sample_rate) 82 | n_q = self.get_num_quantizers_for_bandwidth(sample_rate, bandwidth) 83 | quantized, codes, commit_loss = self.vq(x, n_q=n_q) 84 | bw = torch.tensor(n_q * bw_per_q).to(x) 85 | return quantized, codes, bw, torch.mean(commit_loss) 86 | #return QuantizedResult(quantized, codes, bw, penalty=torch.mean(commit_loss)) 87 | 88 | def get_num_quantizers_for_bandwidth( 89 | self, sample_rate: int, bandwidth: tp.Optional[float]=None) -> int: 90 | """Return n_q based on specified target bandwidth. 91 | """ 92 | bw_per_q = self.get_bandwidth_per_quantizer(sample_rate) 93 | n_q = self.n_q 94 | if bandwidth and bandwidth > 0.: 95 | n_q = int(max(1, math.floor(bandwidth / bw_per_q))) 96 | return n_q 97 | 98 | def get_bandwidth_per_quantizer(self, sample_rate: int): 99 | """Return bandwidth per quantizer for a given input sample rate. 
100 | """ 101 | return math.log2(self.bins) * sample_rate / 1000 102 | 103 | def encode(self, 104 | x: torch.Tensor, 105 | sample_rate: int, 106 | bandwidth: tp.Optional[float]=None, 107 | st: tp.Optional[int]=None) -> torch.Tensor: 108 | """Encode a given input tensor with the specified sample rate at the given bandwidth. 109 | The RVQ encode method sets the appropriate number of quantizer to use 110 | and returns indices for each quantizer. 111 | """ 112 | n_q = self.get_num_quantizers_for_bandwidth(sample_rate, bandwidth) 113 | st = st or 0 114 | codes = self.vq.encode(x, n_q=n_q, st=st) 115 | return codes 116 | 117 | def decode(self, codes: torch.Tensor) -> torch.Tensor: 118 | """Decode the given codes to the quantized representation. 119 | """ 120 | quantized = self.vq.decode(codes) 121 | return quantized 122 | -------------------------------------------------------------------------------- /cosyvoice/transformer/decoder_layer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019 Shigeki Karita 2 | # 2020 Mobvoi Inc (Binbin Zhang) 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Decoder self-attention layer definition.""" 16 | from typing import Optional, Tuple 17 | 18 | import torch 19 | from torch import nn 20 | 21 | 22 | class DecoderLayer(nn.Module): 23 | """Single decoder layer module. 24 | 25 | Args: 26 | size (int): Input dimension. 27 | self_attn (torch.nn.Module): Self-attention module instance. 28 | `MultiHeadedAttention` instance can be used as the argument. 29 | src_attn (torch.nn.Module): Inter-attention module instance. 30 | `MultiHeadedAttention` instance can be used as the argument. 31 | If `None` is passed, Inter-attention is not used, such as 32 | CIF, GPT, and other decoder only model. 33 | feed_forward (torch.nn.Module): Feed-forward module instance. 34 | `PositionwiseFeedForward` instance can be used as the argument. 35 | dropout_rate (float): Dropout rate. 36 | normalize_before (bool): 37 | True: use layer_norm before each sub-block. 38 | False: to use layer_norm after each sub-block. 
39 | """ 40 | 41 | def __init__( 42 | self, 43 | size: int, 44 | self_attn: nn.Module, 45 | src_attn: Optional[nn.Module], 46 | feed_forward: nn.Module, 47 | dropout_rate: float, 48 | normalize_before: bool = True, 49 | ): 50 | """Construct an DecoderLayer object.""" 51 | super().__init__() 52 | self.size = size 53 | self.self_attn = self_attn 54 | self.src_attn = src_attn 55 | self.feed_forward = feed_forward 56 | self.norm1 = nn.LayerNorm(size, eps=1e-5) 57 | self.norm2 = nn.LayerNorm(size, eps=1e-5) 58 | self.norm3 = nn.LayerNorm(size, eps=1e-5) 59 | self.dropout = nn.Dropout(dropout_rate) 60 | self.normalize_before = normalize_before 61 | 62 | def forward( 63 | self, 64 | tgt: torch.Tensor, 65 | tgt_mask: torch.Tensor, 66 | memory: torch.Tensor, 67 | memory_mask: torch.Tensor, 68 | cache: Optional[torch.Tensor] = None 69 | ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: 70 | """Compute decoded features. 71 | 72 | Args: 73 | tgt (torch.Tensor): Input tensor (#batch, maxlen_out, size). 74 | tgt_mask (torch.Tensor): Mask for input tensor 75 | (#batch, maxlen_out). 76 | memory (torch.Tensor): Encoded memory 77 | (#batch, maxlen_in, size). 78 | memory_mask (torch.Tensor): Encoded memory mask 79 | (#batch, maxlen_in). 80 | cache (torch.Tensor): cached tensors. 81 | (#batch, maxlen_out - 1, size). 82 | 83 | Returns: 84 | torch.Tensor: Output tensor (#batch, maxlen_out, size). 85 | torch.Tensor: Mask for output tensor (#batch, maxlen_out). 86 | torch.Tensor: Encoded memory (#batch, maxlen_in, size). 87 | torch.Tensor: Encoded memory mask (#batch, maxlen_in). 88 | 89 | """ 90 | residual = tgt 91 | if self.normalize_before: 92 | tgt = self.norm1(tgt) 93 | 94 | if cache is None: 95 | tgt_q = tgt 96 | tgt_q_mask = tgt_mask 97 | else: 98 | # compute only the last frame query keeping dim: max_time_out -> 1 99 | assert cache.shape == ( 100 | tgt.shape[0], 101 | tgt.shape[1] - 1, 102 | self.size, 103 | ), "{cache.shape} == {(tgt.shape[0], tgt.shape[1] - 1, self.size)}" 104 | tgt_q = tgt[:, -1:, :] 105 | residual = residual[:, -1:, :] 106 | tgt_q_mask = tgt_mask[:, -1:, :] 107 | 108 | x = residual + self.dropout( 109 | self.self_attn(tgt_q, tgt, tgt, tgt_q_mask)[0]) 110 | if not self.normalize_before: 111 | x = self.norm1(x) 112 | 113 | if self.src_attn is not None: 114 | residual = x 115 | if self.normalize_before: 116 | x = self.norm2(x) 117 | x = residual + self.dropout( 118 | self.src_attn(x, memory, memory, memory_mask)[0]) 119 | if not self.normalize_before: 120 | x = self.norm2(x) 121 | 122 | residual = x 123 | if self.normalize_before: 124 | x = self.norm3(x) 125 | x = residual + self.dropout(self.feed_forward(x)) 126 | if not self.normalize_before: 127 | x = self.norm3(x) 128 | 129 | if cache is not None: 130 | x = torch.cat([cache, x], dim=1) 131 | 132 | return x, tgt_mask, memory, memory_mask 133 | -------------------------------------------------------------------------------- /academicodec/modules/transformer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | """A streamable transformer.""" 7 | import typing as tp 8 | 9 | import torch 10 | import torch.nn as nn 11 | import torch.nn.functional as F 12 | 13 | 14 | def create_sin_embedding(positions: torch.Tensor, 15 | dim: int, 16 | max_period: float=10000): 17 | """Create time embedding for the given positions, target dimension `dim`. 18 | """ 19 | # We aim for BTC format 20 | assert dim % 2 == 0 21 | half_dim = dim // 2 22 | adim = torch.arange(half_dim, device=positions.device).view(1, 1, -1) 23 | phase = positions / (max_period**(adim / (half_dim - 1))) 24 | return torch.cat( 25 | [ 26 | torch.cos(phase), 27 | torch.sin(phase), 28 | ], dim=-1) 29 | 30 | 31 | class StreamingTransformerEncoderLayer(nn.TransformerEncoderLayer): 32 | def forward(self, x: torch.Tensor, x_past: torch.Tensor, 33 | past_context: int): # type: ignore 34 | if self.norm_first: 35 | sa_input = self.norm1(x) 36 | x = x + self._sa_block(sa_input, x_past, past_context) 37 | x = x + self._ff_block(self.norm2(x)) 38 | else: 39 | sa_input = x 40 | x = self.norm1(x + self._sa_block(sa_input, x_past, past_context)) 41 | x = self.norm2(x + self._ff_block(x)) 42 | 43 | return x, sa_input 44 | 45 | # self-attention block 46 | def _sa_block(self, 47 | x: torch.Tensor, 48 | x_past: torch.Tensor, 49 | past_context: int): # type: ignore 50 | _, T, _ = x.shape 51 | _, H, _ = x_past.shape 52 | 53 | queries = x 54 | keys = torch.cat([x_past, x], dim=1) 55 | values = keys 56 | 57 | queries_pos = torch.arange(H, T + H, device=x.device).view(-1, 1) 58 | keys_pos = torch.arange(T + H, device=x.device).view(1, -1) 59 | delta = queries_pos - keys_pos 60 | valid_access = (delta >= 0) & (delta <= past_context) 61 | x = self.self_attn( 62 | queries, keys, values, attn_mask=~valid_access, 63 | need_weights=False)[0] 64 | return self.dropout1(x) 65 | 66 | 67 | class StreamingTransformerEncoder(nn.Module): 68 | """TransformerEncoder with streaming support. 69 | 70 | Args: 71 | dim (int): dimension of the data. 72 | hidden_scale (int): intermediate dimension of FF module is this times the dimension. 73 | num_heads (int): number of heads. 74 | num_layers (int): number of layers. 75 | max_period (float): maxium period of cosines in the positional embedding. 76 | past_context (int or None): receptive field for the causal mask, infinite if None. 77 | gelu (bool): if true uses GeLUs, otherwise use ReLUs. 78 | norm_in (bool): normalize the input. 79 | dropout (float): dropout probability. 80 | **kwargs: See `nn.TransformerEncoderLayer`. 
81 | """ 82 | 83 | def __init__(self, 84 | dim, 85 | hidden_scale: float=4., 86 | num_heads: int=8, 87 | num_layers: int=5, 88 | max_period: float=10000, 89 | past_context: int=1000, 90 | gelu: bool=True, 91 | norm_in: bool=True, 92 | dropout: float=0., 93 | **kwargs): 94 | super().__init__() 95 | assert dim % num_heads == 0 96 | hidden_dim = int(dim * hidden_scale) 97 | 98 | self.max_period = max_period 99 | self.past_context = past_context 100 | activation: tp.Any = F.gelu if gelu else F.relu 101 | 102 | self.norm_in: nn.Module 103 | if norm_in: 104 | self.norm_in = nn.LayerNorm(dim) 105 | else: 106 | self.norm_in = nn.Identity() 107 | 108 | self.layers = nn.ModuleList() 109 | for idx in range(num_layers): 110 | self.layers.append( 111 | StreamingTransformerEncoderLayer( 112 | dim, 113 | num_heads, 114 | hidden_dim, 115 | activation=activation, 116 | batch_first=True, 117 | dropout=dropout, 118 | **kwargs)) 119 | 120 | def forward(self, 121 | x: torch.Tensor, 122 | states: tp.Optional[tp.List[torch.Tensor]]=None, 123 | offset: tp.Union[int, torch.Tensor]=0): 124 | B, T, C = x.shape 125 | if states is None: 126 | states = [ 127 | torch.zeros_like(x[:, :1]) for _ in range(1 + len(self.layers)) 128 | ] 129 | 130 | positions = torch.arange(T, device=x.device).view(1, -1, 1) + offset 131 | pos_emb = create_sin_embedding(positions, C, max_period=self.max_period) 132 | 133 | new_state: tp.List[torch.Tensor] = [] 134 | x = self.norm_in(x) 135 | x = x + pos_emb 136 | 137 | for layer_state, layer in zip(states, self.layers): 138 | x, new_layer_state = layer(x, layer_state, self.past_context) 139 | new_layer_state = torch.cat([layer_state, new_layer_state], dim=1) 140 | new_state.append(new_layer_state[:, -self.past_context:, :]) 141 | return x, new_state, offset + T 142 | -------------------------------------------------------------------------------- /academicodec/models/soundstream/models.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from academicodec.modules import NormConv1d 5 | from academicodec.modules import NormConv2d 6 | from academicodec.utils import get_padding 7 | from torch.nn import AvgPool1d 8 | from torch.nn.utils import spectral_norm 9 | from torch.nn.utils import weight_norm 10 | 11 | LRELU_SLOPE = 0.1 12 | 13 | 14 | class DiscriminatorP(torch.nn.Module): 15 | def __init__(self, 16 | period, 17 | kernel_size=5, 18 | stride=3, 19 | use_spectral_norm=False, 20 | activation: str='LeakyReLU', 21 | activation_params: dict={'negative_slope': 0.2}): 22 | super(DiscriminatorP, self).__init__() 23 | self.period = period 24 | norm_f = weight_norm if use_spectral_norm is False else spectral_norm 25 | self.activation = getattr(torch.nn, activation)(**activation_params) 26 | self.convs = nn.ModuleList([ 27 | NormConv2d( 28 | 1, 29 | 32, (kernel_size, 1), (stride, 1), 30 | padding=(get_padding(5, 1), 0)), 31 | NormConv2d( 32 | 32, 33 | 32, (kernel_size, 1), (stride, 1), 34 | padding=(get_padding(5, 1), 0)), 35 | NormConv2d( 36 | 32, 37 | 32, (kernel_size, 1), (stride, 1), 38 | padding=(get_padding(5, 1), 0)), 39 | NormConv2d( 40 | 32, 41 | 32, (kernel_size, 1), (stride, 1), 42 | padding=(get_padding(5, 1), 0)), 43 | NormConv2d(32, 32, (kernel_size, 1), 1, padding=(2, 0)), 44 | ]) 45 | self.conv_post = NormConv2d(32, 1, (3, 1), 1, padding=(1, 0)) 46 | 47 | def forward(self, x): 48 | fmap = [] 49 | # 1d to 2d 50 | b, c, t = x.shape 51 | if t % self.period != 0: # pad first 52 
| n_pad = self.period - (t % self.period) 53 | x = F.pad(x, (0, n_pad), "reflect") 54 | t = t + n_pad 55 | x = x.view(b, c, t // self.period, self.period) 56 | 57 | for l in self.convs: 58 | x = l(x) 59 | x = self.activation(x) 60 | fmap.append(x) 61 | x = self.conv_post(x) 62 | fmap.append(x) 63 | x = torch.flatten(x, 1, -1) 64 | 65 | return x, fmap 66 | 67 | 68 | class MultiPeriodDiscriminator(torch.nn.Module): 69 | def __init__(self): 70 | super(MultiPeriodDiscriminator, self).__init__() 71 | self.discriminators = nn.ModuleList([ 72 | DiscriminatorP(2), 73 | DiscriminatorP(3), 74 | DiscriminatorP(5), 75 | DiscriminatorP(7), 76 | DiscriminatorP(11), 77 | ]) 78 | 79 | def forward(self, y, y_hat): 80 | y_d_rs = [] 81 | y_d_gs = [] 82 | fmap_rs = [] 83 | fmap_gs = [] 84 | for i, d in enumerate(self.discriminators): 85 | y_d_r, fmap_r = d(y) 86 | y_d_g, fmap_g = d(y_hat) 87 | y_d_rs.append(y_d_r) 88 | fmap_rs.append(fmap_r) 89 | y_d_gs.append(y_d_g) 90 | fmap_gs.append(fmap_g) 91 | return y_d_rs, y_d_gs, fmap_rs, fmap_gs 92 | 93 | 94 | class DiscriminatorS(torch.nn.Module): 95 | def __init__(self, 96 | use_spectral_norm=False, 97 | activation: str='LeakyReLU', 98 | activation_params: dict={'negative_slope': 0.2}): 99 | super(DiscriminatorS, self).__init__() 100 | self.activation = getattr(torch.nn, activation)(**activation_params) 101 | self.convs = nn.ModuleList([ 102 | NormConv1d(1, 32, 15, 1, padding=7), 103 | NormConv1d(32, 32, 41, 2, groups=4, padding=20), 104 | NormConv1d(32, 32, 41, 2, groups=16, padding=20), 105 | NormConv1d(32, 32, 41, 4, groups=16, padding=20), 106 | NormConv1d(32, 32, 41, 4, groups=16, padding=20), 107 | NormConv1d(32, 32, 41, 1, groups=16, padding=20), 108 | NormConv1d(32, 32, 5, 1, padding=2), 109 | ]) 110 | self.conv_post = NormConv1d(32, 1, 3, 1, padding=1) 111 | 112 | def forward(self, x): 113 | fmap = [] 114 | for l in self.convs: 115 | x = l(x) 116 | x = self.activation(x) 117 | fmap.append(x) 118 | x = self.conv_post(x) 119 | fmap.append(x) 120 | x = torch.flatten(x, 1, -1) 121 | return x, fmap 122 | 123 | 124 | class MultiScaleDiscriminator(torch.nn.Module): 125 | def __init__(self): 126 | super(MultiScaleDiscriminator, self).__init__() 127 | self.discriminators = nn.ModuleList([ 128 | DiscriminatorS(), 129 | DiscriminatorS(), 130 | DiscriminatorS(), 131 | ]) 132 | self.meanpools = nn.ModuleList( 133 | [AvgPool1d(4, 2, padding=2), AvgPool1d(4, 2, padding=2)]) 134 | 135 | def forward(self, y, y_hat): 136 | y_d_rs = [] 137 | y_d_gs = [] 138 | fmap_rs = [] 139 | fmap_gs = [] 140 | for i, d in enumerate(self.discriminators): 141 | if i != 0: 142 | y = self.meanpools[i - 1](y) 143 | y_hat = self.meanpools[i - 1](y_hat) 144 | y_d_r, fmap_r = d(y) 145 | y_d_g, fmap_g = d(y_hat) 146 | y_d_rs.append(y_d_r) 147 | fmap_rs.append(fmap_r) 148 | y_d_gs.append(y_d_g) 149 | fmap_gs.append(fmap_g) 150 | 151 | return y_d_rs, y_d_gs, fmap_rs, fmap_gs 152 | -------------------------------------------------------------------------------- /cosyvoice/bin/inference.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu) 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
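A minimal sketch of the core calls the inference script below makes when loading the three sub-models; the YAML path and checkpoint names are placeholders, not values taken from this repository:

    # Hedged sketch: build and load CosyVoiceModel the way inference.py does.
    from hyperpyyaml import load_hyperpyyaml
    from cosyvoice.cli.model import CosyVoiceModel

    with open('conf/cosyvoice.yaml') as f:            # placeholder config path
        configs = load_hyperpyyaml(f)
    model = CosyVoiceModel(configs['llm'], configs['flow'], configs['hift'])
    model.load('llm.pt', 'flow.pt', 'hift.pt')        # placeholder checkpoints
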
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from __future__ import print_function 16 | 17 | import argparse 18 | import logging 19 | logging.getLogger('matplotlib').setLevel(logging.WARNING) 20 | import os 21 | 22 | import torch 23 | from torch.utils.data import DataLoader 24 | import torchaudio 25 | from hyperpyyaml import load_hyperpyyaml 26 | from tqdm import tqdm 27 | from cosyvoice.cli.model import CosyVoiceModel 28 | 29 | from cosyvoice.dataset.dataset import Dataset 30 | 31 | def get_args(): 32 | parser = argparse.ArgumentParser(description='inference with your model') 33 | parser.add_argument('--config', required=True, help='config file') 34 | parser.add_argument('--prompt_data', required=True, help='prompt data file') 35 | parser.add_argument('--prompt_utt2data', required=True, help='prompt data file') 36 | parser.add_argument('--tts_text', required=True, help='tts input file') 37 | parser.add_argument('--llm_model', required=True, help='llm model file') 38 | parser.add_argument('--flow_model', required=True, help='flow model file') 39 | parser.add_argument('--hifigan_model', required=True, help='hifigan model file') 40 | parser.add_argument('--gpu', 41 | type=int, 42 | default=-1, 43 | help='gpu id for this rank, -1 for cpu') 44 | parser.add_argument('--mode', 45 | default='sft', 46 | choices=['sft', 'zero_shot'], 47 | help='inference mode') 48 | parser.add_argument('--result_dir', required=True, help='asr result file') 49 | args = parser.parse_args() 50 | print(args) 51 | return args 52 | 53 | 54 | def main(): 55 | args = get_args() 56 | logging.basicConfig(level=logging.DEBUG, 57 | format='%(asctime)s %(levelname)s %(message)s') 58 | os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu) 59 | 60 | # Init cosyvoice models from configs 61 | use_cuda = args.gpu >= 0 and torch.cuda.is_available() 62 | device = torch.device('cuda' if use_cuda else 'cpu') 63 | with open(args.config, 'r') as f: 64 | configs = load_hyperpyyaml(f) 65 | 66 | model = CosyVoiceModel(configs['llm'], configs['flow'], configs['hift']) 67 | model.load(args.llm_model, args.flow_model, args.hifigan_model) 68 | 69 | test_dataset = Dataset(args.prompt_data, data_pipeline=configs['data_pipeline'], mode='inference', shuffle=False, partition=False, tts_file=args.tts_text, prompt_utt2data=args.prompt_utt2data) 70 | test_data_loader = DataLoader(test_dataset, batch_size=None, num_workers=0) 71 | 72 | del configs 73 | os.makedirs(args.result_dir, exist_ok=True) 74 | fn = os.path.join(args.result_dir, 'wav.scp') 75 | f = open(fn, 'w') 76 | with torch.no_grad(): 77 | for batch_idx, batch in tqdm(enumerate(test_data_loader)): 78 | utts = batch["utts"] 79 | assert len(utts) == 1, "inference mode only support batchsize 1" 80 | text = batch["text"] 81 | text_token = batch["text_token"].to(device) 82 | text_token_len = batch["text_token_len"].to(device) 83 | tts_text = batch["tts_text"] 84 | tts_index = batch["tts_index"] 85 | tts_text_token = batch["tts_text_token"].to(device) 86 | tts_text_token_len = batch["tts_text_token_len"].to(device) 87 | speech_token = batch["speech_token"].to(device) 88 | speech_token_len = 
batch["speech_token_len"].to(device) 89 | speech_feat = batch["speech_feat"].to(device) 90 | speech_feat_len = batch["speech_feat_len"].to(device) 91 | utt_embedding = batch["utt_embedding"].to(device) 92 | spk_embedding = batch["spk_embedding"].to(device) 93 | if args.mode == 'sft': 94 | model_input = {'text': tts_text_token, 'text_len': tts_text_token_len, 95 | 'llm_embedding': spk_embedding, 'flow_embedding': spk_embedding} 96 | else: 97 | model_input = {'text': tts_text_token, 'text_len': tts_text_token_len, 98 | 'prompt_text': text_token, 'prompt_text_len': text_token_len, 99 | 'llm_prompt_speech_token': speech_token, 'llm_prompt_speech_token_len': speech_token_len, 100 | 'flow_prompt_speech_token': speech_token, 'flow_prompt_speech_token_len': speech_token_len, 101 | 'prompt_speech_feat': speech_feat, 'prompt_speech_feat_len': speech_feat_len, 102 | 'llm_embedding': utt_embedding, 'flow_embedding': utt_embedding} 103 | model_output = model.inference(**model_input) 104 | tts_key = '{}_{}'.format(utts[0], tts_index[0]) 105 | tts_fn = os.path.join(args.result_dir, '{}.wav'.format(tts_key)) 106 | torchaudio.save(tts_fn, model_output['tts_speech'], sample_rate=22050) 107 | f.write('{} {}\n'.format(tts_key, tts_fn)) 108 | f.flush() 109 | f.close() 110 | logging.info('Result wav.scp saved in {}'.format(fn)) 111 | 112 | 113 | if __name__ == '__main__': 114 | main() 115 | -------------------------------------------------------------------------------- /cosyvoice/dataset/dataset.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang) 2 | # 2024 Alibaba Inc (authors: Xiang Lyu) 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | import random 17 | import json 18 | import math 19 | from functools import partial 20 | 21 | import torch 22 | import torch.distributed as dist 23 | from torch.utils.data import IterableDataset 24 | from cosyvoice.utils.file_utils import read_lists, read_json_lists 25 | 26 | 27 | class Processor(IterableDataset): 28 | 29 | def __init__(self, source, f, *args, **kw): 30 | assert callable(f) 31 | self.source = source 32 | self.f = f 33 | self.args = args 34 | self.kw = kw 35 | 36 | def set_epoch(self, epoch): 37 | self.source.set_epoch(epoch) 38 | 39 | def __iter__(self): 40 | """ Return an iterator over the source dataset processed by the 41 | given processor. 
42 | """ 43 | assert self.source is not None 44 | assert callable(self.f) 45 | return self.f(iter(self.source), *self.args, **self.kw) 46 | 47 | def apply(self, f): 48 | assert callable(f) 49 | return Processor(self, f, *self.args, **self.kw) 50 | 51 | 52 | class DistributedSampler: 53 | 54 | def __init__(self, shuffle=True, partition=True): 55 | self.epoch = -1 56 | self.update() 57 | self.shuffle = shuffle 58 | self.partition = partition 59 | 60 | def update(self): 61 | assert dist.is_available() 62 | if dist.is_initialized(): 63 | self.rank = dist.get_rank() 64 | self.world_size = dist.get_world_size() 65 | else: 66 | self.rank = 0 67 | self.world_size = 1 68 | worker_info = torch.utils.data.get_worker_info() 69 | if worker_info is None: 70 | self.worker_id = 0 71 | self.num_workers = 1 72 | else: 73 | self.worker_id = worker_info.id 74 | self.num_workers = worker_info.num_workers 75 | return dict(rank=self.rank, 76 | world_size=self.world_size, 77 | worker_id=self.worker_id, 78 | num_workers=self.num_workers) 79 | 80 | def set_epoch(self, epoch): 81 | self.epoch = epoch 82 | 83 | def sample(self, data): 84 | """ Sample data according to rank/world_size/num_workers 85 | 86 | Args: 87 | data(List): input data list 88 | 89 | Returns: 90 | List: data list after sample 91 | """ 92 | data = list(range(len(data))) 93 | # force datalist even 94 | if self.partition: 95 | if self.shuffle: 96 | random.Random(self.epoch).shuffle(data) 97 | if len(data) < self.world_size: 98 | data = data * math.ceil(self.world_size / len(data)) 99 | data = data[:self.world_size] 100 | data = data[self.rank::self.world_size] 101 | if len(data) < self.num_workers: 102 | data = data * math.ceil(self.num_workers / len(data)) 103 | data = data[:self.num_workers] 104 | data = data[self.worker_id::self.num_workers] 105 | return data 106 | 107 | 108 | class DataList(IterableDataset): 109 | 110 | def __init__(self, lists, shuffle=True, partition=True): 111 | self.lists = lists 112 | self.sampler = DistributedSampler(shuffle, partition) 113 | 114 | def set_epoch(self, epoch): 115 | self.sampler.set_epoch(epoch) 116 | 117 | def __iter__(self): 118 | sampler_info = self.sampler.update() 119 | indexes = self.sampler.sample(self.lists) 120 | for index in indexes: 121 | data = dict(src=self.lists[index]) 122 | data.update(sampler_info) 123 | yield data 124 | 125 | 126 | def Dataset(data_list_file, 127 | data_pipeline, 128 | mode='train', 129 | shuffle=True, 130 | partition=True, 131 | tts_file='', 132 | prompt_utt2data=''): 133 | """ Construct dataset from arguments 134 | 135 | We have two shuffle stage in the Dataset. The first is global 136 | shuffle at shards tar/raw file level. The second is global shuffle 137 | at training samples level. 
138 | 139 | Args: 140 | data_type(str): raw/shard 141 | tokenizer (BaseTokenizer): tokenizer to tokenize 142 | partition(bool): whether to do data partition in terms of rank 143 | """ 144 | assert mode in ['train', 'inference'] 145 | lists = read_lists(data_list_file) 146 | if mode == 'inference': 147 | with open(tts_file) as f: 148 | tts_data = json.load(f) 149 | utt2lists = read_json_lists(prompt_utt2data) 150 | # filter unnecessary file in inference mode 151 | lists = list(set([utt2lists[utt] for utt in tts_data.keys() if utt2lists[utt] in lists])) 152 | dataset = DataList(lists, 153 | shuffle=shuffle, 154 | partition=partition) 155 | if mode == 'inference': 156 | # map partial arg tts_data in inference mode 157 | data_pipeline[0] = partial(data_pipeline[0], tts_data=tts_data) 158 | for func in data_pipeline: 159 | dataset = Processor(dataset, func, mode=mode) 160 | return dataset 161 | -------------------------------------------------------------------------------- /cosyvoice/bin/train.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu) 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | from __future__ import print_function 15 | import os,sys 16 | os.environ["PL_TORCH_DISTRIBUTED_BACKEND"] = "gloo" 17 | import argparse 18 | import datetime 19 | import logging 20 | logging.getLogger('matplotlib').setLevel(logging.WARNING) 21 | from copy import deepcopy 22 | import torch 23 | import torch.distributed as dist 24 | import deepspeed 25 | 26 | now_dir = os.getcwd() 27 | sys.path.append(now_dir) 28 | sys.path.append("%s/cosyvoice" % (now_dir)) 29 | 30 | from hyperpyyaml import load_hyperpyyaml 31 | 32 | from torch.distributed.elastic.multiprocessing.errors import record 33 | 34 | from cosyvoice.utils.executor import Executor 35 | from cosyvoice.utils.train_utils import ( 36 | init_distributed, 37 | init_dataset_and_dataloader, 38 | init_optimizer_and_scheduler, 39 | init_summarywriter, save_model, 40 | wrap_cuda_model, check_modify_and_save_config) 41 | 42 | 43 | def get_args(): 44 | parser = argparse.ArgumentParser(description='training your network') 45 | parser.add_argument('--train_engine', 46 | default='torch_ddp', 47 | choices=['torch_ddp', 'deepspeed'], 48 | help='Engine for paralleled training') 49 | parser.add_argument('--model', required=True, help='model which will be trained') 50 | parser.add_argument('--config', required=True, help='config file') 51 | parser.add_argument('--train_data', required=True, help='train data file') 52 | parser.add_argument('--cv_data', required=True, help='cv data file') 53 | parser.add_argument('--checkpoint', help='checkpoint model') 54 | parser.add_argument('--model_dir', required=True, help='save model dir') 55 | parser.add_argument('--tensorboard_dir', 56 | default='tensorboard', 57 | help='tensorboard log dir') 58 | parser.add_argument('--ddp.dist_backend', 59 | dest='dist_backend', 60 | default='gloo', 61 | 
choices=['nccl', 'gloo'], 62 | help='distributed backend') 63 | parser.add_argument('--num_workers', 64 | default=0, 65 | type=int, 66 | help='num of subprocess workers for reading') 67 | parser.add_argument('--prefetch', 68 | default=100, 69 | type=int, 70 | help='prefetch number') 71 | parser.add_argument('--pin_memory', 72 | action='store_true', 73 | default=False, 74 | help='Use pinned memory buffers used for reading') 75 | parser.add_argument('--deepspeed.save_states', 76 | dest='save_states', 77 | default='model_only', 78 | choices=['model_only', 'model+optimizer'], 79 | help='save model/optimizer states') 80 | parser.add_argument('--timeout', 81 | default=30, 82 | type=int, 83 | help='timeout (in seconds) of cosyvoice_join.') 84 | parser = deepspeed.add_config_arguments(parser) 85 | args = parser.parse_args() 86 | return args 87 | 88 | 89 | @record 90 | def main(): 91 | args = get_args() 92 | logging.basicConfig(level=logging.DEBUG, 93 | format='%(asctime)s %(levelname)s %(message)s') 94 | 95 | override_dict = {k: None for k in ['llm', 'flow', 'hift'] if k != args.model} 96 | with open(args.config, 'r') as f: 97 | configs = load_hyperpyyaml(f, overrides=override_dict) 98 | configs['train_conf'].update(vars(args)) 99 | 100 | # Init env for ddp 101 | init_distributed(args) 102 | 103 | # Get dataset & dataloader 104 | train_dataset, cv_dataset, train_data_loader, cv_data_loader = \ 105 | init_dataset_and_dataloader(args, configs) 106 | 107 | # Do some sanity checks and save config to arsg.model_dir 108 | configs = check_modify_and_save_config(args, configs) 109 | 110 | # Tensorboard summary 111 | writer = init_summarywriter(args) 112 | 113 | # load checkpoint 114 | model = configs[args.model] 115 | if args.checkpoint is not None: 116 | model.load_state_dict(torch.load(args.checkpoint, map_location='cpu')) 117 | 118 | # Dispatch model from cpu to gpu 119 | model = wrap_cuda_model(args, model) 120 | 121 | # Get optimizer & scheduler 122 | model, optimizer, scheduler = init_optimizer_and_scheduler(args, configs, model) 123 | 124 | # Save init checkpoints 125 | info_dict = deepcopy(configs['train_conf']) 126 | save_model(model, 'init', info_dict) 127 | 128 | # Get executor 129 | executor = Executor() 130 | 131 | # Start training loop 132 | for epoch in range(info_dict['max_epoch']): 133 | executor.epoch = epoch 134 | train_dataset.set_epoch(epoch) 135 | dist.barrier() 136 | group_join = dist.new_group(backend="gloo", timeout=datetime.timedelta(seconds=args.timeout)) 137 | executor.train_one_epoc(model, optimizer, scheduler, train_data_loader, cv_data_loader, writer, info_dict, group_join) 138 | dist.destroy_process_group(group_join) 139 | 140 | if __name__ == '__main__': 141 | main() 142 | -------------------------------------------------------------------------------- /cosyvoice/transformer/convolution.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) 2 | # 2024 Alibaba Inc (Xiang Lyu) 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # Modified from ESPnet(https://github.com/espnet/espnet) 16 | """ConvolutionModule definition.""" 17 | 18 | from typing import Tuple 19 | 20 | import torch 21 | from torch import nn 22 | 23 | 24 | class ConvolutionModule(nn.Module): 25 | """ConvolutionModule in Conformer model.""" 26 | 27 | def __init__(self, 28 | channels: int, 29 | kernel_size: int = 15, 30 | activation: nn.Module = nn.ReLU(), 31 | norm: str = "batch_norm", 32 | causal: bool = False, 33 | bias: bool = True): 34 | """Construct an ConvolutionModule object. 35 | Args: 36 | channels (int): The number of channels of conv layers. 37 | kernel_size (int): Kernel size of conv layers. 38 | causal (int): Whether use causal convolution or not 39 | """ 40 | super().__init__() 41 | 42 | self.pointwise_conv1 = nn.Conv1d( 43 | channels, 44 | 2 * channels, 45 | kernel_size=1, 46 | stride=1, 47 | padding=0, 48 | bias=bias, 49 | ) 50 | # self.lorder is used to distinguish if it's a causal convolution, 51 | # if self.lorder > 0: it's a causal convolution, the input will be 52 | # padded with self.lorder frames on the left in forward. 53 | # else: it's a symmetrical convolution 54 | if causal: 55 | padding = 0 56 | self.lorder = kernel_size - 1 57 | else: 58 | # kernel_size should be an odd number for none causal convolution 59 | assert (kernel_size - 1) % 2 == 0 60 | padding = (kernel_size - 1) // 2 61 | self.lorder = 0 62 | self.depthwise_conv = nn.Conv1d( 63 | channels, 64 | channels, 65 | kernel_size, 66 | stride=1, 67 | padding=padding, 68 | groups=channels, 69 | bias=bias, 70 | ) 71 | 72 | assert norm in ['batch_norm', 'layer_norm'] 73 | if norm == "batch_norm": 74 | self.use_layer_norm = False 75 | self.norm = nn.BatchNorm1d(channels) 76 | else: 77 | self.use_layer_norm = True 78 | self.norm = nn.LayerNorm(channels) 79 | 80 | self.pointwise_conv2 = nn.Conv1d( 81 | channels, 82 | channels, 83 | kernel_size=1, 84 | stride=1, 85 | padding=0, 86 | bias=bias, 87 | ) 88 | self.activation = activation 89 | 90 | def forward( 91 | self, 92 | x: torch.Tensor, 93 | mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), 94 | cache: torch.Tensor = torch.zeros((0, 0, 0)), 95 | ) -> Tuple[torch.Tensor, torch.Tensor]: 96 | """Compute convolution module. 97 | Args: 98 | x (torch.Tensor): Input tensor (#batch, time, channels). 99 | mask_pad (torch.Tensor): used for batch padding (#batch, 1, time), 100 | (0, 0, 0) means fake mask. 101 | cache (torch.Tensor): left context cache, it is only 102 | used in causal convolution (#batch, channels, cache_t), 103 | (0, 0, 0) meas fake cache. 104 | Returns: 105 | torch.Tensor: Output tensor (#batch, time, channels). 
106 | """ 107 | # exchange the temporal dimension and the feature dimension 108 | x = x.transpose(1, 2) # (#batch, channels, time) 109 | 110 | # mask batch padding 111 | if mask_pad.size(2) > 0: # time > 0 112 | x.masked_fill_(~mask_pad, 0.0) 113 | 114 | if self.lorder > 0: 115 | if cache.size(2) == 0: # cache_t == 0 116 | x = nn.functional.pad(x, (self.lorder, 0), 'constant', 0.0) 117 | else: 118 | assert cache.size(0) == x.size(0) # equal batch 119 | assert cache.size(1) == x.size(1) # equal channel 120 | x = torch.cat((cache, x), dim=2) 121 | assert (x.size(2) > self.lorder) 122 | new_cache = x[:, :, -self.lorder:] 123 | else: 124 | # It's better we just return None if no cache is required, 125 | # However, for JIT export, here we just fake one tensor instead of 126 | # None. 127 | new_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device) 128 | 129 | # GLU mechanism 130 | x = self.pointwise_conv1(x) # (batch, 2*channel, dim) 131 | x = nn.functional.glu(x, dim=1) # (batch, channel, dim) 132 | 133 | # 1D Depthwise Conv 134 | x = self.depthwise_conv(x) 135 | if self.use_layer_norm: 136 | x = x.transpose(1, 2) 137 | x = self.activation(self.norm(x)) 138 | if self.use_layer_norm: 139 | x = x.transpose(1, 2) 140 | x = self.pointwise_conv2(x) 141 | # mask batch padding 142 | if mask_pad.size(2) > 0: # time > 0 143 | x.masked_fill_(~mask_pad, 0.0) 144 | 145 | return x.transpose(1, 2), new_cache 146 | -------------------------------------------------------------------------------- /matcha/hifigan/README.md: -------------------------------------------------------------------------------- 1 | # HiFi-GAN: Generative Adversarial Networks for Efficient and High Fidelity Speech Synthesis 2 | 3 | ### Jungil Kong, Jaehyeon Kim, Jaekyoung Bae 4 | 5 | In our [paper](https://arxiv.org/abs/2010.05646), 6 | we proposed HiFi-GAN: a GAN-based model capable of generating high fidelity speech efficiently.
7 | We provide our implementation and pretrained models as open source in this repository. 8 | 9 | **Abstract :** 10 | Several recent work on speech synthesis have employed generative adversarial networks (GANs) to produce raw waveforms. 11 | Although such methods improve the sampling efficiency and memory usage, 12 | their sample quality has not yet reached that of autoregressive and flow-based generative models. 13 | In this work, we propose HiFi-GAN, which achieves both efficient and high-fidelity speech synthesis. 14 | As speech audio consists of sinusoidal signals with various periods, 15 | we demonstrate that modeling periodic patterns of an audio is crucial for enhancing sample quality. 16 | A subjective human evaluation (mean opinion score, MOS) of a single speaker dataset indicates that our proposed method 17 | demonstrates similarity to human quality while generating 22.05 kHz high-fidelity audio 167.9 times faster than 18 | real-time on a single V100 GPU. We further show the generality of HiFi-GAN to the mel-spectrogram inversion of unseen 19 | speakers and end-to-end speech synthesis. Finally, a small footprint version of HiFi-GAN generates samples 13.4 times 20 | faster than real-time on CPU with comparable quality to an autoregressive counterpart. 21 | 22 | Visit our [demo website](https://jik876.github.io/hifi-gan-demo/) for audio samples. 23 | 24 | ## Pre-requisites 25 | 26 | 1. Python >= 3.6 27 | 2. Clone this repository. 28 | 3. Install python requirements. Please refer [requirements.txt](requirements.txt) 29 | 4. Download and extract the [LJ Speech dataset](https://keithito.com/LJ-Speech-Dataset/). 30 | And move all wav files to `LJSpeech-1.1/wavs` 31 | 32 | ## Training 33 | 34 | ``` 35 | python train.py --config config_v1.json 36 | ``` 37 | 38 | To train V2 or V3 Generator, replace `config_v1.json` with `config_v2.json` or `config_v3.json`.
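For example (using the V2 generator):

```
python train.py --config config_v2.json
```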
39 | Checkpoints and a copy of the configuration file are saved in the `cp_hifigan` directory by default.
40 | You can change the path by adding the `--checkpoint_path` option. 41 | 42 | Validation loss during training with the V1 generator.
43 | ![validation loss](./validation_loss.png) 44 | 45 | ## Pretrained Model 46 | 47 | You can also use the pretrained models we provide.
48 | [Download pretrained models](https://drive.google.com/drive/folders/1-eEYTB5Av9jNql0WGBlRoi-WH2J7bp5Y?usp=sharing)
49 | Details of each folder are as follows: 50 | 51 | | Folder Name | Generator | Dataset | Fine-Tuned | 52 | | ------------ | --------- | --------- | ------------------------------------------------------ | 53 | | LJ_V1 | V1 | LJSpeech | No | 54 | | LJ_V2 | V2 | LJSpeech | No | 55 | | LJ_V3 | V3 | LJSpeech | No | 56 | | LJ_FT_T2_V1 | V1 | LJSpeech | Yes ([Tacotron2](https://github.com/NVIDIA/tacotron2)) | 57 | | LJ_FT_T2_V2 | V2 | LJSpeech | Yes ([Tacotron2](https://github.com/NVIDIA/tacotron2)) | 58 | | LJ_FT_T2_V3 | V3 | LJSpeech | Yes ([Tacotron2](https://github.com/NVIDIA/tacotron2)) | 59 | | VCTK_V1 | V1 | VCTK | No | 60 | | VCTK_V2 | V2 | VCTK | No | 61 | | VCTK_V3 | V3 | VCTK | No | 62 | | UNIVERSAL_V1 | V1 | Universal | No | 63 | 64 | We provide the universal model with discriminator weights that can be used as a base for transfer learning to other datasets. 65 | 66 | ## Fine-Tuning 67 | 68 | 1. Generate mel-spectrograms in numpy format using [Tacotron2](https://github.com/NVIDIA/tacotron2) with teacher-forcing.
69 | The file name of the generated mel-spectrogram should match that of the audio file, and the extension should be `.npy`.
70 | Example: 71 | ` Audio File : LJ001-0001.wav 72 | Mel-Spectrogram File : LJ001-0001.npy` 73 | 2. Create an `ft_dataset` folder and copy the generated mel-spectrogram files into it.
74 | 3. Run the following command. 75 | ``` 76 | python train.py --fine_tuning True --config config_v1.json 77 | ``` 78 | For other command-line options, please refer to the training section. 79 | 80 | ## Inference from wav file 81 | 82 | 1. Make a `test_files` directory and copy wav files into it. 83 | 2. Run the following command. 84 | ` python inference.py --checkpoint_file [generator checkpoint file path]` 85 | Generated wav files are saved in `generated_files` by default.
86 | You can change the path by adding the `--output_dir` option. 87 | 88 | ## Inference for end-to-end speech synthesis 89 | 90 | 1. Make a `test_mel_files` directory and copy the generated mel-spectrogram files into it.
91 | You can generate mel-spectrograms using [Tacotron2](https://github.com/NVIDIA/tacotron2), 92 | [Glow-TTS](https://github.com/jaywalnut310/glow-tts) and so forth. 93 | 2. Run the following command. 94 | ` python inference_e2e.py --checkpoint_file [generator checkpoint file path]` 95 | Generated wav files are saved in `generated_files_from_mel` by default.
96 | You can change the path by adding `--output_dir` option. 97 | 98 | ## Acknowledgements 99 | 100 | We referred to [WaveGlow](https://github.com/NVIDIA/waveglow), [MelGAN](https://github.com/descriptinc/melgan-neurips) 101 | and [Tacotron2](https://github.com/NVIDIA/tacotron2) to implement this. 102 | -------------------------------------------------------------------------------- /cosyvoice/flow/flow_matching.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Zhihao Du) 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | import torch 15 | import torch.nn.functional as F 16 | from matcha.models.components.flow_matching import BASECFM 17 | 18 | class ConditionalCFM(BASECFM): 19 | def __init__(self, in_channels, cfm_params, n_spks=1, spk_emb_dim=64, estimator: torch.nn.Module = None): 20 | super().__init__( 21 | n_feats=in_channels, 22 | cfm_params=cfm_params, 23 | n_spks=n_spks, 24 | spk_emb_dim=spk_emb_dim, 25 | ) 26 | self.t_scheduler = cfm_params.t_scheduler 27 | self.training_cfg_rate = cfm_params.training_cfg_rate 28 | self.inference_cfg_rate = cfm_params.inference_cfg_rate 29 | in_channels = in_channels + (spk_emb_dim if n_spks > 0 else 0) 30 | # Just change the architecture of the estimator here 31 | self.estimator = estimator 32 | 33 | @torch.inference_mode() 34 | def forward(self, mu, mask, n_timesteps, temperature=1.0, spks=None, cond=None): 35 | """Forward diffusion 36 | 37 | Args: 38 | mu (torch.Tensor): output of encoder 39 | shape: (batch_size, n_feats, mel_timesteps) 40 | mask (torch.Tensor): output_mask 41 | shape: (batch_size, 1, mel_timesteps) 42 | n_timesteps (int): number of diffusion steps 43 | temperature (float, optional): temperature for scaling noise. Defaults to 1.0. 44 | spks (torch.Tensor, optional): speaker ids. Defaults to None. 45 | shape: (batch_size, spk_emb_dim) 46 | cond: Not used but kept for future purposes 47 | 48 | Returns: 49 | sample: generated mel-spectrogram 50 | shape: (batch_size, n_feats, mel_timesteps) 51 | """ 52 | z = torch.randn_like(mu) * temperature 53 | t_span = torch.linspace(0, 1, n_timesteps + 1, device=mu.device) 54 | if self.t_scheduler == 'cosine': 55 | t_span = 1 - torch.cos(t_span * 0.5 * torch.pi) 56 | return self.solve_euler(z, t_span=t_span, mu=mu, mask=mask, spks=spks, cond=cond) 57 | 58 | def solve_euler(self, x, t_span, mu, mask, spks, cond): 59 | """ 60 | Fixed euler solver for ODEs. 61 | Args: 62 | x (torch.Tensor): random noise 63 | t_span (torch.Tensor): n_timesteps interpolated 64 | shape: (n_timesteps + 1,) 65 | mu (torch.Tensor): output of encoder 66 | shape: (batch_size, n_feats, mel_timesteps) 67 | mask (torch.Tensor): output_mask 68 | shape: (batch_size, 1, mel_timesteps) 69 | spks (torch.Tensor, optional): speaker ids. Defaults to None. 
70 | shape: (batch_size, spk_emb_dim) 71 | cond: Not used but kept for future purposes 72 | """ 73 | t, _, dt = t_span[0], t_span[-1], t_span[1] - t_span[0] 74 | 75 | # I am storing this because I can later plot it by putting a debugger here and saving it to a file 76 | # Or in future might add like a return_all_steps flag 77 | sol = [] 78 | 79 | for step in range(1, len(t_span)): 80 | dphi_dt = self.estimator(x, mask, mu, t, spks, cond) 81 | # Classifier-Free Guidance inference introduced in VoiceBox 82 | if self.inference_cfg_rate > 0: 83 | cfg_dphi_dt = self.estimator( 84 | x, mask, 85 | torch.zeros_like(mu), t, 86 | torch.zeros_like(spks) if spks is not None else None, 87 | torch.zeros_like(cond) 88 | ) 89 | dphi_dt = ((1.0 + self.inference_cfg_rate) * dphi_dt - 90 | self.inference_cfg_rate * cfg_dphi_dt) 91 | x = x + dt * dphi_dt 92 | t = t + dt 93 | sol.append(x) 94 | if step < len(t_span) - 1: 95 | dt = t_span[step + 1] - t 96 | 97 | return sol[-1] 98 | 99 | def compute_loss(self, x1, mask, mu, spks=None, cond=None): 100 | """Computes diffusion loss 101 | 102 | Args: 103 | x1 (torch.Tensor): Target 104 | shape: (batch_size, n_feats, mel_timesteps) 105 | mask (torch.Tensor): target mask 106 | shape: (batch_size, 1, mel_timesteps) 107 | mu (torch.Tensor): output of encoder 108 | shape: (batch_size, n_feats, mel_timesteps) 109 | spks (torch.Tensor, optional): speaker embedding. Defaults to None. 110 | shape: (batch_size, spk_emb_dim) 111 | 112 | Returns: 113 | loss: conditional flow matching loss 114 | y: conditional flow 115 | shape: (batch_size, n_feats, mel_timesteps) 116 | """ 117 | b, _, t = mu.shape 118 | 119 | # random timestep 120 | t = torch.rand([b, 1, 1], device=mu.device, dtype=mu.dtype) 121 | if self.t_scheduler == 'cosine': 122 | t = 1 - torch.cos(t * 0.5 * torch.pi) 123 | # sample noise p(x_0) 124 | z = torch.randn_like(x1) 125 | 126 | y = (1 - (1 - self.sigma_min) * t) * z + t * x1 127 | u = x1 - (1 - self.sigma_min) * z 128 | 129 | pred = self.estimator(y, mask, mu, t.squeeze(), spks, cond) 130 | loss = F.mse_loss(pred * mask, u * mask, reduction="sum") / (torch.sum(mask) * u.shape[1]) 131 | return loss, y 132 | -------------------------------------------------------------------------------- /academicodec/binary.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | """Raw binary format for Encodec compressed audio. Actual compression API is in `encodec.compress`.""" 7 | import io 8 | import json 9 | import struct 10 | import typing as tp 11 | 12 | # format is `ECDC` magic code, followed by the header size as uint32. 13 | # Then an uint8 indicates the protocol version (0.) 14 | # The header is then provided as json and should contain all required 15 | # informations for decoding. A raw stream of bytes is then provided 16 | # and should be interpretable using the json header. 
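# A minimal sketch of the resulting byte layout, mirroring `write_ecdc_header` below
# (the metadata dict is illustrative only, not a format requirement; note that, as packed
# below, the version byte precedes the header-length field):
#
#     import io, json, struct
#     meta = json.dumps({"m": "encodec_16k"}).encode('utf-8')
#     fo = io.BytesIO()
#     fo.write(struct.Struct('!4sBI').pack(b'ECDC', 0, len(meta)))  # magic, version byte, header length
#     fo.write(meta)                                                 # json header
#     # ...followed by the raw packed token bytes (see BitPacker / BitUnpacker below).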
17 | _encodec_header_struct = struct.Struct('!4sBI') 18 | _ENCODEC_MAGIC = b'ECDC' 19 | 20 | 21 | def write_ecdc_header(fo: tp.IO[bytes], metadata: tp.Any): 22 | meta_dumped = json.dumps(metadata).encode('utf-8') 23 | version = 0 24 | header = _encodec_header_struct.pack(_ENCODEC_MAGIC, version, 25 | len(meta_dumped)) 26 | fo.write(header) 27 | fo.write(meta_dumped) 28 | fo.flush() 29 | 30 | 31 | def _read_exactly(fo: tp.IO[bytes], size: int) -> bytes: 32 | buf = b"" 33 | while len(buf) < size: 34 | new_buf = fo.read(size) 35 | if not new_buf: 36 | raise EOFError("Impossible to read enough data from the stream, " 37 | f"{size} bytes remaining.") 38 | buf += new_buf 39 | size -= len(new_buf) 40 | return buf 41 | 42 | 43 | def read_ecdc_header(fo: tp.IO[bytes]): 44 | header_bytes = _read_exactly(fo, _encodec_header_struct.size) 45 | magic, version, meta_size = _encodec_header_struct.unpack(header_bytes) 46 | if magic != _ENCODEC_MAGIC: 47 | raise ValueError("File is not in ECDC format.") 48 | if version != 0: 49 | raise ValueError("Version not supported.") 50 | meta_bytes = _read_exactly(fo, meta_size) 51 | return json.loads(meta_bytes.decode('utf-8')) 52 | 53 | 54 | class BitPacker: 55 | """Simple bit packer to handle ints with a non standard width, e.g. 10 bits. 56 | Note that for some bandwidth (1.5, 3), the codebook representation 57 | will not cover an integer number of bytes. 58 | 59 | Args: 60 | bits (int): number of bits per value that will be pushed. 61 | fo (IO[bytes]): file-object to push the bytes to. 62 | """ 63 | 64 | def __init__(self, bits: int, fo: tp.IO[bytes]): 65 | self._current_value = 0 66 | self._current_bits = 0 67 | self.bits = bits 68 | self.fo = fo 69 | 70 | def push(self, value: int): 71 | """Push a new value to the stream. This will immediately 72 | write as many uint8 as possible to the underlying file-object.""" 73 | self._current_value += (value << self._current_bits) 74 | self._current_bits += self.bits 75 | while self._current_bits >= 8: 76 | lower_8bits = self._current_value & 0xff 77 | self._current_bits -= 8 78 | self._current_value >>= 8 79 | self.fo.write(bytes([lower_8bits])) 80 | 81 | def flush(self): 82 | """Flushes the remaining partial uint8, call this at the end 83 | of the stream to encode.""" 84 | if self._current_bits: 85 | self.fo.write(bytes([self._current_value])) 86 | self._current_value = 0 87 | self._current_bits = 0 88 | self.fo.flush() 89 | 90 | 91 | class BitUnpacker: 92 | """BitUnpacker does the opposite of `BitPacker`. 93 | 94 | Args: 95 | bits (int): number of bits of the values to decode. 96 | fo (IO[bytes]): file-object to push the bytes to. 97 | """ 98 | 99 | def __init__(self, bits: int, fo: tp.IO[bytes]): 100 | self.bits = bits 101 | self.fo = fo 102 | self._mask = (1 << bits) - 1 103 | self._current_value = 0 104 | self._current_bits = 0 105 | 106 | def pull(self) -> tp.Optional[int]: 107 | """ 108 | Pull a single value from the stream, potentially reading some 109 | extra bytes from the underlying file-object. 110 | Returns `None` when reaching the end of the stream. 
111 | """ 112 | while self._current_bits < self.bits: 113 | buf = self.fo.read(1) 114 | if not buf: 115 | return None 116 | character = buf[0] 117 | self._current_value += character << self._current_bits 118 | self._current_bits += 8 119 | 120 | out = self._current_value & self._mask 121 | self._current_value >>= self.bits 122 | self._current_bits -= self.bits 123 | return out 124 | 125 | 126 | def test(): 127 | import torch 128 | torch.manual_seed(1234) 129 | for rep in range(4): 130 | length: int = torch.randint(10, 2_000, (1, )).item() 131 | bits: int = torch.randint(1, 16, (1, )).item() 132 | tokens: tp.List[int] = torch.randint(2**bits, (length, )).tolist() 133 | rebuilt: tp.List[int] = [] 134 | buf = io.BytesIO() 135 | packer = BitPacker(bits, buf) 136 | for token in tokens: 137 | packer.push(token) 138 | packer.flush() 139 | buf.seek(0) 140 | unpacker = BitUnpacker(bits, buf) 141 | while True: 142 | value = unpacker.pull() 143 | if value is None: 144 | break 145 | rebuilt.append(value) 146 | assert len(rebuilt) >= len(tokens), (len(rebuilt), len(tokens)) 147 | # The flushing mechanism might lead to "ghost" values at the end of the stream. 148 | assert len(rebuilt) <= len(tokens) + 8 // bits, (len(rebuilt), 149 | len(tokens), bits) 150 | for idx, (a, b) in enumerate(zip(tokens, rebuilt)): 151 | assert a == b, (idx, a, b) 152 | 153 | 154 | if __name__ == '__main__': 155 | test() 156 | -------------------------------------------------------------------------------- /matcha/onnx/export.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import random 3 | from pathlib import Path 4 | 5 | import numpy as np 6 | import torch 7 | from lightning import LightningModule 8 | 9 | from matcha.cli import VOCODER_URLS, load_matcha, load_vocoder 10 | 11 | DEFAULT_OPSET = 15 12 | 13 | SEED = 1234 14 | random.seed(SEED) 15 | np.random.seed(SEED) 16 | torch.manual_seed(SEED) 17 | torch.cuda.manual_seed(SEED) 18 | torch.backends.cudnn.deterministic = True 19 | torch.backends.cudnn.benchmark = False 20 | 21 | 22 | class MatchaWithVocoder(LightningModule): 23 | def __init__(self, matcha, vocoder): 24 | super().__init__() 25 | self.matcha = matcha 26 | self.vocoder = vocoder 27 | 28 | def forward(self, x, x_lengths, scales, spks=None): 29 | mel, mel_lengths = self.matcha(x, x_lengths, scales, spks) 30 | wavs = self.vocoder(mel).clamp(-1, 1) 31 | lengths = mel_lengths * 256 32 | return wavs.squeeze(1), lengths 33 | 34 | 35 | def get_exportable_module(matcha, vocoder, n_timesteps): 36 | """ 37 | Return an appropriate `LighteningModule` and output-node names 38 | based on whether the vocoder is embedded in the final graph 39 | """ 40 | 41 | def onnx_forward_func(x, x_lengths, scales, spks=None): 42 | """ 43 | Custom forward function for accepting 44 | scaler parameters as tensors 45 | """ 46 | # Extract scaler parameters from tensors 47 | temperature = scales[0] 48 | length_scale = scales[1] 49 | output = matcha.synthesise(x, x_lengths, n_timesteps, temperature, spks, length_scale) 50 | return output["mel"], output["mel_lengths"] 51 | 52 | # Monkey-patch Matcha's forward function 53 | matcha.forward = onnx_forward_func 54 | 55 | if vocoder is None: 56 | model, output_names = matcha, ["mel", "mel_lengths"] 57 | else: 58 | model = MatchaWithVocoder(matcha, vocoder) 59 | output_names = ["wav", "wav_lengths"] 60 | return model, output_names 61 | 62 | 63 | def get_inputs(is_multi_speaker): 64 | """ 65 | Create dummy inputs for tracing 66 | """ 67 | 
dummy_input_length = 50 68 | x = torch.randint(low=0, high=20, size=(1, dummy_input_length), dtype=torch.long) 69 | x_lengths = torch.LongTensor([dummy_input_length]) 70 | 71 | # Scales 72 | temperature = 0.667 73 | length_scale = 1.0 74 | scales = torch.Tensor([temperature, length_scale]) 75 | 76 | model_inputs = [x, x_lengths, scales] 77 | input_names = [ 78 | "x", 79 | "x_lengths", 80 | "scales", 81 | ] 82 | 83 | if is_multi_speaker: 84 | spks = torch.LongTensor([1]) 85 | model_inputs.append(spks) 86 | input_names.append("spks") 87 | 88 | return tuple(model_inputs), input_names 89 | 90 | 91 | def main(): 92 | parser = argparse.ArgumentParser(description="Export 🍵 Matcha-TTS to ONNX") 93 | 94 | parser.add_argument( 95 | "checkpoint_path", 96 | type=str, 97 | help="Path to the model checkpoint", 98 | ) 99 | parser.add_argument("output", type=str, help="Path to output `.onnx` file") 100 | parser.add_argument( 101 | "--n-timesteps", type=int, default=5, help="Number of steps to use for reverse diffusion in decoder (default 5)" 102 | ) 103 | parser.add_argument( 104 | "--vocoder-name", 105 | type=str, 106 | choices=list(VOCODER_URLS.keys()), 107 | default=None, 108 | help="Name of the vocoder to embed in the ONNX graph", 109 | ) 110 | parser.add_argument( 111 | "--vocoder-checkpoint-path", 112 | type=str, 113 | default=None, 114 | help="Vocoder checkpoint to embed in the ONNX graph for an `e2e` like experience", 115 | ) 116 | parser.add_argument("--opset", type=int, default=DEFAULT_OPSET, help="ONNX opset version to use (default 15") 117 | 118 | args = parser.parse_args() 119 | 120 | print(f"[🍵] Loading Matcha checkpoint from {args.checkpoint_path}") 121 | print(f"Setting n_timesteps to {args.n_timesteps}") 122 | 123 | checkpoint_path = Path(args.checkpoint_path) 124 | matcha = load_matcha(checkpoint_path.stem, checkpoint_path, "cpu") 125 | 126 | if args.vocoder_name or args.vocoder_checkpoint_path: 127 | assert ( 128 | args.vocoder_name and args.vocoder_checkpoint_path 129 | ), "Both vocoder_name and vocoder-checkpoint are required when embedding the vocoder in the ONNX graph." 
130 | vocoder, _ = load_vocoder(args.vocoder_name, args.vocoder_checkpoint_path, "cpu") 131 | else: 132 | vocoder = None 133 | 134 | is_multi_speaker = matcha.n_spks > 1 135 | 136 | dummy_input, input_names = get_inputs(is_multi_speaker) 137 | model, output_names = get_exportable_module(matcha, vocoder, args.n_timesteps) 138 | 139 | # Set dynamic shape for inputs/outputs 140 | dynamic_axes = { 141 | "x": {0: "batch_size", 1: "time"}, 142 | "x_lengths": {0: "batch_size"}, 143 | } 144 | 145 | if vocoder is None: 146 | dynamic_axes.update( 147 | { 148 | "mel": {0: "batch_size", 2: "time"}, 149 | "mel_lengths": {0: "batch_size"}, 150 | } 151 | ) 152 | else: 153 | print("Embedding the vocoder in the ONNX graph") 154 | dynamic_axes.update( 155 | { 156 | "wav": {0: "batch_size", 1: "time"}, 157 | "wav_lengths": {0: "batch_size"}, 158 | } 159 | ) 160 | 161 | if is_multi_speaker: 162 | dynamic_axes["spks"] = {0: "batch_size"} 163 | 164 | # Create the output directory (if not exists) 165 | Path(args.output).parent.mkdir(parents=True, exist_ok=True) 166 | 167 | model.to_onnx( 168 | args.output, 169 | dummy_input, 170 | input_names=input_names, 171 | output_names=output_names, 172 | dynamic_axes=dynamic_axes, 173 | opset_version=args.opset, 174 | export_params=True, 175 | do_constant_folding=True, 176 | ) 177 | print(f"[🍵] ONNX model exported to {args.output}") 178 | 179 | 180 | if __name__ == "__main__": 181 | main() 182 | -------------------------------------------------------------------------------- /cosyvoice/cli/model.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu) 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
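# A minimal usage sketch of the CosyVoiceModel wrapper defined below. The helper that
# builds the three sub-modules, the checkpoint filenames and the 22050 Hz save rate are
# assumptions for illustration only, not fixed by this file:
#
#     llm, flow, hift = build_modules_from_yaml('cosyvoice.yaml')   # hypothetical helper
#     model = CosyVoiceModel(llm, flow, hift)
#     model.load('llm.pt', 'flow.pt', 'hift.pt')
#     out = model.inference(text, text_len, flow_embedding=spk_embedding)
#     torchaudio.save('out.wav', out['tts_speech'], 22050)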
14 | import torch 15 | 16 | class CosyVoiceModel: 17 | 18 | def __init__(self, 19 | llm: torch.nn.Module, 20 | flow: torch.nn.Module, 21 | hift: torch.nn.Module): 22 | self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 23 | self.llm = llm 24 | self.flow = flow 25 | self.hift = hift 26 | 27 | def load(self, llm_model, flow_model, hift_model): 28 | self.llm.load_state_dict(torch.load(llm_model, map_location=self.device)) 29 | self.llm.to(self.device).eval() 30 | self.flow.load_state_dict(torch.load(flow_model, map_location=self.device)) 31 | self.flow.to(self.device).eval() 32 | self.hift.load_state_dict(torch.load(hift_model, map_location=self.device)) 33 | self.hift.to(self.device).eval() 34 | 35 | def inference(self, text, text_len, flow_embedding, llm_embedding=torch.zeros(0, 192), 36 | prompt_text=torch.zeros(1, 0, dtype=torch.int32), prompt_text_len=torch.zeros(1, dtype=torch.int32), 37 | llm_prompt_speech_token=torch.zeros(1, 0, dtype=torch.int32), llm_prompt_speech_token_len=torch.zeros(1, dtype=torch.int32), 38 | flow_prompt_speech_token=torch.zeros(1, 0, dtype=torch.int32), flow_prompt_speech_token_len=torch.zeros(1, dtype=torch.int32), 39 | prompt_speech_feat=torch.zeros(1, 0, 80), prompt_speech_feat_len=torch.zeros(1, dtype=torch.int32)): 40 | tts_speech_token = self.llm.inference(text=text.to(self.device), 41 | text_len=text_len.to(self.device), 42 | prompt_text=prompt_text.to(self.device), 43 | prompt_text_len=prompt_text_len.to(self.device), 44 | prompt_speech_token=llm_prompt_speech_token.to(self.device), 45 | prompt_speech_token_len=llm_prompt_speech_token_len.to(self.device), 46 | embedding=llm_embedding.to(self.device), 47 | beam_size=1, 48 | sampling=25, 49 | max_token_text_ratio=30, 50 | min_token_text_ratio=3) 51 | tts_mel = self.flow.inference(token=tts_speech_token, 52 | token_len=torch.tensor([tts_speech_token.size(1)], dtype=torch.int32).to(self.device), 53 | prompt_token=flow_prompt_speech_token.to(self.device), 54 | prompt_token_len=flow_prompt_speech_token_len.to(self.device), 55 | prompt_feat=prompt_speech_feat.to(self.device), 56 | prompt_feat_len=prompt_speech_feat_len.to(self.device), 57 | embedding=flow_embedding.to(self.device)) 58 | tts_speech = self.hift.inference(mel=tts_mel).cpu() 59 | torch.cuda.empty_cache() 60 | return {'tts_speech': tts_speech} 61 | 62 | def inference_stream(self, text, text_len, flow_embedding, llm_embedding=torch.zeros(0, 192), 63 | prompt_text=torch.zeros(1, 0, dtype=torch.int32), prompt_text_len=torch.zeros(1, dtype=torch.int32), 64 | llm_prompt_speech_token=torch.zeros(1, 0, dtype=torch.int32), llm_prompt_speech_token_len=torch.zeros(1, dtype=torch.int32), 65 | flow_prompt_speech_token=torch.zeros(1, 0, dtype=torch.int32), flow_prompt_speech_token_len=torch.zeros(1, dtype=torch.int32), 66 | prompt_speech_feat=torch.zeros(1, 0, 80), prompt_speech_feat_len=torch.zeros(1, dtype=torch.int32)): 67 | try: 68 | tts_speech_token = next(self.llm.inference_stream(text=text.to(self.device), 69 | text_len=text_len.to(self.device), 70 | prompt_text=prompt_text.to(self.device), 71 | prompt_text_len=prompt_text_len.to(self.device), 72 | prompt_speech_token=llm_prompt_speech_token.to(self.device), 73 | prompt_speech_token_len=llm_prompt_speech_token_len.to(self.device), 74 | embedding=llm_embedding.to(self.device), 75 | beam_size=1, 76 | sampling=25, 77 | max_token_text_ratio=30, 78 | min_token_text_ratio=3)) 79 | except StopIteration: 80 | print("LLM inference stream exhausted") 81 | return 82 | 83 | try: 84 | tts_mel 
= next(self.flow.inference_stream(token=tts_speech_token, 85 | token_len=torch.tensor([tts_speech_token.size(1)], dtype=torch.int32).to(self.device), 86 | prompt_token=flow_prompt_speech_token.to(self.device), 87 | prompt_token_len=flow_prompt_speech_token_len.to(self.device), 88 | prompt_feat=prompt_speech_feat.to(self.device), 89 | prompt_feat_len=prompt_speech_feat_len.to(self.device), 90 | embedding=flow_embedding.to(self.device))) 91 | except StopIteration: 92 | print("Flow inference stream exhausted") 93 | return 94 | 95 | try: 96 | tts_speech = next(self.hift.inference_stream(mel=tts_mel)) 97 | except StopIteration: 98 | print("HIFT inference stream exhausted") 99 | return 100 | 101 | tts_speech = tts_speech.cpu() 102 | torch.cuda.empty_cache() 103 | yield {'tts_speech': tts_speech} -------------------------------------------------------------------------------- /cosyvoice/cli/zh_normalization/text_normlization.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | import re 15 | from typing import List 16 | 17 | from .char_convert import tranditional_to_simplified 18 | from .chronology import RE_DATE 19 | from .chronology import RE_DATE2 20 | from .chronology import RE_TIME 21 | from .chronology import RE_TIME_RANGE 22 | from .chronology import replace_date 23 | from .chronology import replace_date2 24 | from .chronology import replace_time 25 | from .constants import F2H_ASCII_LETTERS 26 | from .constants import F2H_DIGITS 27 | from .constants import F2H_SPACE 28 | from .num import RE_DECIMAL_NUM 29 | from .num import RE_DEFAULT_NUM 30 | from .num import RE_FRAC 31 | from .num import RE_INTEGER 32 | from .num import RE_NUMBER 33 | from .num import RE_PERCENTAGE 34 | from .num import RE_POSITIVE_QUANTIFIERS 35 | from .num import RE_RANGE 36 | from .num import replace_default_num 37 | from .num import replace_frac 38 | from .num import replace_negative_num 39 | from .num import replace_number 40 | from .num import replace_percentage 41 | from .num import replace_positive_quantifier 42 | from .num import replace_range 43 | from .phonecode import RE_MOBILE_PHONE 44 | from .phonecode import RE_NATIONAL_UNIFORM_NUMBER 45 | from .phonecode import RE_TELEPHONE 46 | from .phonecode import replace_mobile 47 | from .phonecode import replace_phone 48 | from .quantifier import RE_TEMPERATURE 49 | from .quantifier import replace_measure 50 | from .quantifier import replace_temperature 51 | 52 | 53 | class TextNormalizer(): 54 | def __init__(self): 55 | self.SENTENCE_SPLITOR = re.compile(r'([:、,;。?!,;?!][”’]?)') 56 | 57 | def _split(self, text: str, lang="zh") -> List[str]: 58 | """Split long text into sentences with sentence-splitting punctuations. 59 | Args: 60 | text (str): The input text. 61 | Returns: 62 | List[str]: Sentences. 
63 | """ 64 | # Only for pure Chinese here 65 | if lang == "zh": 66 | text = text.replace(" ", "") 67 | # 过滤掉特殊字符 68 | text = re.sub(r'[——《》【】<=>{}()()#&@“”^_|…\\]', '', text) 69 | text = self.SENTENCE_SPLITOR.sub(r'\1\n', text) 70 | text = text.strip() 71 | sentences = [sentence.strip() for sentence in re.split(r'\n+', text)] 72 | return sentences 73 | 74 | def _post_replace(self, sentence: str) -> str: 75 | sentence = sentence.replace('/', '每') 76 | sentence = sentence.replace('~', '至') 77 | sentence = sentence.replace('~', '至') 78 | sentence = sentence.replace('①', '一') 79 | sentence = sentence.replace('②', '二') 80 | sentence = sentence.replace('③', '三') 81 | sentence = sentence.replace('④', '四') 82 | sentence = sentence.replace('⑤', '五') 83 | sentence = sentence.replace('⑥', '六') 84 | sentence = sentence.replace('⑦', '七') 85 | sentence = sentence.replace('⑧', '八') 86 | sentence = sentence.replace('⑨', '九') 87 | sentence = sentence.replace('⑩', '十') 88 | sentence = sentence.replace('α', '阿尔法') 89 | sentence = sentence.replace('β', '贝塔') 90 | sentence = sentence.replace('γ', '伽玛').replace('Γ', '伽玛') 91 | sentence = sentence.replace('δ', '德尔塔').replace('Δ', '德尔塔') 92 | sentence = sentence.replace('ε', '艾普西龙') 93 | sentence = sentence.replace('ζ', '捷塔') 94 | sentence = sentence.replace('η', '依塔') 95 | sentence = sentence.replace('θ', '西塔').replace('Θ', '西塔') 96 | sentence = sentence.replace('ι', '艾欧塔') 97 | sentence = sentence.replace('κ', '喀帕') 98 | sentence = sentence.replace('λ', '拉姆达').replace('Λ', '拉姆达') 99 | sentence = sentence.replace('μ', '缪') 100 | sentence = sentence.replace('ν', '拗') 101 | sentence = sentence.replace('ξ', '克西').replace('Ξ', '克西') 102 | sentence = sentence.replace('ο', '欧米克伦') 103 | sentence = sentence.replace('π', '派').replace('Π', '派') 104 | sentence = sentence.replace('ρ', '肉') 105 | sentence = sentence.replace('ς', '西格玛').replace('Σ', '西格玛').replace( 106 | 'σ', '西格玛') 107 | sentence = sentence.replace('τ', '套') 108 | sentence = sentence.replace('υ', '宇普西龙') 109 | sentence = sentence.replace('φ', '服艾').replace('Φ', '服艾') 110 | sentence = sentence.replace('χ', '器') 111 | sentence = sentence.replace('ψ', '普赛').replace('Ψ', '普赛') 112 | sentence = sentence.replace('ω', '欧米伽').replace('Ω', '欧米伽') 113 | # re filter special characters, have one more character "-" than line 68 114 | sentence = re.sub(r'[-——《》【】<=>{}()()#&@“”^_|…\\]', '', sentence) 115 | return sentence 116 | 117 | def normalize_sentence(self, sentence: str) -> str: 118 | # basic character conversions 119 | sentence = tranditional_to_simplified(sentence) 120 | sentence = sentence.translate(F2H_ASCII_LETTERS).translate( 121 | F2H_DIGITS).translate(F2H_SPACE) 122 | 123 | # number related NSW verbalization 124 | sentence = RE_DATE.sub(replace_date, sentence) 125 | sentence = RE_DATE2.sub(replace_date2, sentence) 126 | 127 | # range first 128 | sentence = RE_TIME_RANGE.sub(replace_time, sentence) 129 | sentence = RE_TIME.sub(replace_time, sentence) 130 | 131 | sentence = RE_TEMPERATURE.sub(replace_temperature, sentence) 132 | sentence = replace_measure(sentence) 133 | sentence = RE_FRAC.sub(replace_frac, sentence) 134 | sentence = RE_PERCENTAGE.sub(replace_percentage, sentence) 135 | sentence = RE_MOBILE_PHONE.sub(replace_mobile, sentence) 136 | 137 | sentence = RE_TELEPHONE.sub(replace_phone, sentence) 138 | sentence = RE_NATIONAL_UNIFORM_NUMBER.sub(replace_phone, sentence) 139 | 140 | sentence = RE_RANGE.sub(replace_range, sentence) 141 | sentence = RE_INTEGER.sub(replace_negative_num, sentence) 142 | 
sentence = RE_DECIMAL_NUM.sub(replace_number, sentence) 143 | sentence = RE_POSITIVE_QUANTIFIERS.sub(replace_positive_quantifier, 144 | sentence) 145 | sentence = RE_DEFAULT_NUM.sub(replace_default_num, sentence) 146 | sentence = RE_NUMBER.sub(replace_number, sentence) 147 | sentence = self._post_replace(sentence) 148 | 149 | return sentence 150 | 151 | def normalize(self, text: str) -> List[str]: 152 | sentences = self._split(text) 153 | sentences = [self.normalize_sentence(sent) for sent in sentences] 154 | return sentences 155 | -------------------------------------------------------------------------------- /academicodec/models/encodec/test.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | """Command-line for audio compression.""" 7 | import argparse 8 | import os 9 | import sys 10 | import typing as tp 11 | from collections import OrderedDict 12 | from pathlib import Path 13 | 14 | import librosa 15 | import soundfile as sf 16 | import torch 17 | from academicodec.models.encodec.net3 import SoundStream 18 | 19 | 20 | def save_audio(wav: torch.Tensor, 21 | path: tp.Union[Path, str], 22 | sample_rate: int, 23 | rescale: bool=False): 24 | limit = 0.99 25 | mx = wav.abs().max() 26 | if rescale: 27 | wav = wav * min(limit / mx, 1) 28 | else: 29 | wav = wav.clamp(-limit, limit) 30 | wav = wav.squeeze().cpu().numpy() 31 | sf.write(path, wav, sample_rate) 32 | 33 | 34 | def get_parser(): 35 | parser = argparse.ArgumentParser( 36 | 'encodec', 37 | description='High fidelity neural audio codec. ' 38 | 'If input is a .ecdc, decompresses it. ' 39 | 'If input is .wav, compresses it. If output is also wav, ' 40 | 'do a compression/decompression cycle.') 41 | parser.add_argument( 42 | '--input', 43 | type=Path, 44 | help='Input file, whatever is supported by torchaudio on your system.') 45 | parser.add_argument( 46 | '--output', 47 | type=Path, 48 | nargs='?', 49 | help='Output file, otherwise inferred from input file.') 50 | parser.add_argument( 51 | '--resume_path', type=str, default='resume_path', help='resume_path') 52 | parser.add_argument( 53 | '--sr', type=int, default=16000, help='sample rate of model') 54 | parser.add_argument( 55 | '-r', 56 | '--rescale', 57 | action='store_true', 58 | help='Automatically rescale the output to avoid clipping.') 59 | parser.add_argument( 60 | '--ratios', 61 | type=int, 62 | nargs='+', 63 | # probs(ratios) = hop_size 64 | default=[8, 5, 4, 2], 65 | help='ratios of SoundStream, shoud be set for different hop_size (32d, 320, 240d, ...)' 66 | ) 67 | parser.add_argument( 68 | '--target_bandwidths', 69 | type=float, 70 | nargs='+', 71 | # default for 16k_320d 72 | default=[1, 1.5, 2, 4, 6, 12], 73 | help='target_bandwidths of net3.py') 74 | parser.add_argument( 75 | '--target_bw', 76 | type=float, 77 | # default for 16k_320d 78 | default=12, 79 | help='target_bw of net3.py') 80 | 81 | return parser 82 | 83 | 84 | def fatal(*args): 85 | print(*args, file=sys.stderr) 86 | sys.exit(1) 87 | 88 | 89 | # 这只是打印了但是没有真的 clip 90 | def check_clipping(wav, rescale): 91 | if rescale: 92 | return 93 | mx = wav.abs().max() 94 | limit = 0.99 95 | if mx > limit: 96 | print( 97 | f"Clipping!! max scale {mx}, limit is {limit}. 
" 98 | "To avoid clipping, use the `-r` option to rescale the output.", 99 | file=sys.stderr) 100 | 101 | 102 | def test_one(args, wav_root, store_root, rescale, soundstream): 103 | # torchaudio.load 的采样率为原始音频的采样率,不会自动下采样 104 | # wav, sr = torchaudio.load(wav_root) 105 | # # 取单声道, output shape [1, T] 106 | # wav = wav[0].unsqueeze(0) 107 | # # 重采样为模型的采样率 108 | # wav = torchaudio.transforms.Resample(orig_freq=sr, new_freq=args.sr)(wav) 109 | 110 | # load wav with librosa 111 | wav, sr = librosa.load(wav_root, sr=args.sr) 112 | wav = torch.tensor(wav).unsqueeze(0) 113 | 114 | # add batch axis 115 | wav = wav.unsqueeze(1).cuda() 116 | 117 | # compressing 118 | compressed = soundstream.encode(wav, target_bw=args.target_bw) 119 | print('finish compressing') 120 | out = soundstream.decode(compressed) 121 | out = out.detach().cpu().squeeze(0) 122 | check_clipping(out, rescale) 123 | save_audio(wav=out, path=store_root, sample_rate=args.sr, rescale=rescale) 124 | print('finish decompressing') 125 | 126 | 127 | def remove_encodec_weight_norm(model): 128 | from academicodec.modules import SConv1d 129 | from academicodec.modules.seanet import SConvTranspose1d 130 | from academicodec.modules.seanet import SEANetResnetBlock 131 | from torch.nn.utils import remove_weight_norm 132 | 133 | encoder = model.encoder.model 134 | for key in encoder._modules: 135 | if isinstance(encoder._modules[key], SEANetResnetBlock): 136 | remove_weight_norm(encoder._modules[key].shortcut.conv.conv) 137 | block_modules = encoder._modules[key].block._modules 138 | for skey in block_modules: 139 | if isinstance(block_modules[skey], SConv1d): 140 | remove_weight_norm(block_modules[skey].conv.conv) 141 | elif isinstance(encoder._modules[key], SConv1d): 142 | remove_weight_norm(encoder._modules[key].conv.conv) 143 | 144 | decoder = model.decoder.model 145 | for key in decoder._modules: 146 | if isinstance(decoder._modules[key], SEANetResnetBlock): 147 | remove_weight_norm(decoder._modules[key].shortcut.conv.conv) 148 | block_modules = decoder._modules[key].block._modules 149 | for skey in block_modules: 150 | if isinstance(block_modules[skey], SConv1d): 151 | remove_weight_norm(block_modules[skey].conv.conv) 152 | elif isinstance(decoder._modules[key], SConvTranspose1d): 153 | remove_weight_norm(decoder._modules[key].convtr.convtr) 154 | elif isinstance(decoder._modules[key], SConv1d): 155 | remove_weight_norm(decoder._modules[key].conv.conv) 156 | 157 | 158 | def test_batch(): 159 | args = get_parser().parse_args() 160 | print("args.target_bandwidths:", args.target_bandwidths) 161 | if not args.input.exists(): 162 | fatal(f"Input file {args.input} does not exist.") 163 | input_lists = os.listdir(args.input) 164 | input_lists.sort() 165 | soundstream = SoundStream( 166 | n_filters=32, 167 | D=512, 168 | ratios=args.ratios, 169 | sample_rate=args.sr, 170 | target_bandwidths=args.target_bandwidths) 171 | parameter_dict = torch.load(args.resume_path) 172 | new_state_dict = OrderedDict() 173 | # k 为 module.xxx.weight, v 为权重 174 | for k, v in parameter_dict.items(): 175 | # 截取`module.`后面的xxx.weight 176 | name = k[7:] 177 | new_state_dict[name] = v 178 | soundstream.load_state_dict(new_state_dict) # load model 179 | remove_encodec_weight_norm(soundstream) 180 | soundstream.cuda() 181 | soundstream.eval() 182 | os.makedirs(args.output, exist_ok=True) 183 | for audio in input_lists: 184 | test_one( 185 | args=args, 186 | wav_root=os.path.join(args.input, audio), 187 | store_root=os.path.join(args.output, audio), 188 | 
rescale=args.rescale, 189 | soundstream=soundstream) 190 | 191 | 192 | if __name__ == '__main__': 193 | test_batch() 194 | -------------------------------------------------------------------------------- /cosyvoice/cli/zh_normalization/num.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """ 15 | Rules to verbalize numbers into Chinese characters. 16 | https://zh.wikipedia.org/wiki/中文数字#現代中文 17 | """ 18 | import re 19 | from collections import OrderedDict 20 | from typing import List 21 | 22 | DIGITS = {str(i): tran for i, tran in enumerate('零一二三四五六七八九')} 23 | UNITS = OrderedDict({ 24 | 1: '十', 25 | 2: '百', 26 | 3: '千', 27 | 4: '万', 28 | 8: '亿', 29 | }) 30 | 31 | COM_QUANTIFIERS = '(封|艘|把|目|套|段|人|所|朵|匹|张|座|回|场|尾|条|个|首|阙|阵|网|炮|顶|丘|棵|只|支|袭|辆|挑|担|颗|壳|窠|曲|墙|群|腔|砣|座|客|贯|扎|捆|刀|令|打|手|罗|坡|山|岭|江|溪|钟|队|单|双|对|出|口|头|脚|板|跳|枝|件|贴|针|线|管|名|位|身|堂|课|本|页|家|户|层|丝|毫|厘|分|钱|两|斤|担|铢|石|钧|锱|忽|(千|毫|微)克|毫|厘|(公)分|分|寸|尺|丈|里|寻|常|铺|程|(千|分|厘|毫|微)米|米|撮|勺|合|升|斗|石|盘|碗|碟|叠|桶|笼|盆|盒|杯|钟|斛|锅|簋|篮|盘|桶|罐|瓶|壶|卮|盏|箩|箱|煲|啖|袋|钵|年|月|日|季|刻|时|周|天|秒|分|小时|旬|纪|岁|世|更|夜|春|夏|秋|冬|代|伏|辈|丸|泡|粒|颗|幢|堆|条|根|支|道|面|片|张|颗|块|元|(亿|千万|百万|万|千|百)|(亿|千万|百万|万|千|百|美|)元|(亿|千万|百万|万|千|百|十|)吨|(亿|千万|百万|万|千|百|)块|角|毛|分)' 32 | 33 | # 分数表达式 34 | RE_FRAC = re.compile(r'(-?)(\d+)/(\d+)') 35 | 36 | 37 | def replace_frac(match) -> str: 38 | """ 39 | Args: 40 | match (re.Match) 41 | Returns: 42 | str 43 | """ 44 | sign = match.group(1) 45 | nominator = match.group(2) 46 | denominator = match.group(3) 47 | sign: str = "负" if sign else "" 48 | nominator: str = num2str(nominator) 49 | denominator: str = num2str(denominator) 50 | result = f"{sign}{denominator}分之{nominator}" 51 | return result 52 | 53 | 54 | # 百分数表达式 55 | RE_PERCENTAGE = re.compile(r'(-?)(\d+(\.\d+)?)%') 56 | 57 | 58 | def replace_percentage(match) -> str: 59 | """ 60 | Args: 61 | match (re.Match) 62 | Returns: 63 | str 64 | """ 65 | sign = match.group(1) 66 | percent = match.group(2) 67 | sign: str = "负" if sign else "" 68 | percent: str = num2str(percent) 69 | result = f"{sign}百分之{percent}" 70 | return result 71 | 72 | 73 | # 整数表达式 74 | # 带负号的整数 -10 75 | RE_INTEGER = re.compile(r'(-)' r'(\d+)') 76 | 77 | 78 | def replace_negative_num(match) -> str: 79 | """ 80 | Args: 81 | match (re.Match) 82 | Returns: 83 | str 84 | """ 85 | sign = match.group(1) 86 | number = match.group(2) 87 | sign: str = "负" if sign else "" 88 | number: str = num2str(number) 89 | result = f"{sign}{number}" 90 | return result 91 | 92 | 93 | # 编号-无符号整形 94 | # 00078 95 | RE_DEFAULT_NUM = re.compile(r'\d{3}\d*') 96 | 97 | 98 | def replace_default_num(match): 99 | """ 100 | Args: 101 | match (re.Match) 102 | Returns: 103 | str 104 | """ 105 | number = match.group(0) 106 | return verbalize_digit(number, alt_one=True) 107 | 108 | 109 | # 数字表达式 110 | # 纯小数 111 | RE_DECIMAL_NUM = re.compile(r'(-?)((\d+)(\.\d+))' r'|(\.(\d+))') 112 | # 正整数 + 量词 113 | 
RE_POSITIVE_QUANTIFIERS = re.compile(r"(\d+)([多余几\+])?" + COM_QUANTIFIERS) 114 | RE_NUMBER = re.compile(r'(-?)((\d+)(\.\d+)?)' r'|(\.(\d+))') 115 | 116 | 117 | def replace_positive_quantifier(match) -> str: 118 | """ 119 | Args: 120 | match (re.Match) 121 | Returns: 122 | str 123 | """ 124 | number = match.group(1) 125 | match_2 = match.group(2) 126 | if match_2 == "+": 127 | match_2 = "多" 128 | match_2: str = match_2 if match_2 else "" 129 | quantifiers: str = match.group(3) 130 | number: str = num2str(number) 131 | result = f"{number}{match_2}{quantifiers}" 132 | return result 133 | 134 | 135 | def replace_number(match) -> str: 136 | """ 137 | Args: 138 | match (re.Match) 139 | Returns: 140 | str 141 | """ 142 | sign = match.group(1) 143 | number = match.group(2) 144 | pure_decimal = match.group(5) 145 | if pure_decimal: 146 | result = num2str(pure_decimal) 147 | else: 148 | sign: str = "负" if sign else "" 149 | number: str = num2str(number) 150 | result = f"{sign}{number}" 151 | return result 152 | 153 | 154 | # 范围表达式 155 | # match.group(1) and match.group(8) are copy from RE_NUMBER 156 | 157 | RE_RANGE = re.compile( 158 | r'((-?)((\d+)(\.\d+)?)|(\.(\d+)))[-~]((-?)((\d+)(\.\d+)?)|(\.(\d+)))') 159 | 160 | 161 | def replace_range(match) -> str: 162 | """ 163 | Args: 164 | match (re.Match) 165 | Returns: 166 | str 167 | """ 168 | first, second = match.group(1), match.group(8) 169 | first = RE_NUMBER.sub(replace_number, first) 170 | second = RE_NUMBER.sub(replace_number, second) 171 | result = f"{first}到{second}" 172 | return result 173 | 174 | 175 | def _get_value(value_string: str, use_zero: bool=True) -> List[str]: 176 | stripped = value_string.lstrip('0') 177 | if len(stripped) == 0: 178 | return [] 179 | elif len(stripped) == 1: 180 | if use_zero and len(stripped) < len(value_string): 181 | return [DIGITS['0'], DIGITS[stripped]] 182 | else: 183 | return [DIGITS[stripped]] 184 | else: 185 | largest_unit = next( 186 | power for power in reversed(UNITS.keys()) if power < len(stripped)) 187 | first_part = value_string[:-largest_unit] 188 | second_part = value_string[-largest_unit:] 189 | return _get_value(first_part) + [UNITS[largest_unit]] + _get_value( 190 | second_part) 191 | 192 | 193 | def verbalize_cardinal(value_string: str) -> str: 194 | if not value_string: 195 | return '' 196 | 197 | # 000 -> '零' , 0 -> '零' 198 | value_string = value_string.lstrip('0') 199 | if len(value_string) == 0: 200 | return DIGITS['0'] 201 | 202 | result_symbols = _get_value(value_string) 203 | # verbalized number starting with '一十*' is abbreviated as `十*` 204 | if len(result_symbols) >= 2 and result_symbols[0] == DIGITS[ 205 | '1'] and result_symbols[1] == UNITS[1]: 206 | result_symbols = result_symbols[1:] 207 | return ''.join(result_symbols) 208 | 209 | 210 | def verbalize_digit(value_string: str, alt_one=False) -> str: 211 | result_symbols = [DIGITS[digit] for digit in value_string] 212 | result = ''.join(result_symbols) 213 | if alt_one: 214 | result = result.replace("一", "幺") 215 | return result 216 | 217 | 218 | def num2str(value_string: str) -> str: 219 | integer_decimal = value_string.split('.') 220 | if len(integer_decimal) == 1: 221 | integer = integer_decimal[0] 222 | decimal = '' 223 | elif len(integer_decimal) == 2: 224 | integer, decimal = integer_decimal 225 | else: 226 | raise ValueError( 227 | f"The value string: '${value_string}' has more than one point in it." 
228 | ) 229 | 230 | result = verbalize_cardinal(integer) 231 | 232 | decimal = decimal.rstrip('0') 233 | if decimal: 234 | # '.22' is verbalized as '零点二二' 235 | # '3.20' is verbalized as '三点二 236 | result = result if result else "零" 237 | result += '点' + verbalize_digit(decimal) 238 | return result 239 | -------------------------------------------------------------------------------- /academicodec/utils.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import json 3 | import os 4 | import random 5 | import sys 6 | import time 7 | import warnings 8 | 9 | import matplotlib 10 | import numpy as np 11 | import torch 12 | import yaml 13 | from torch import distributed as dist 14 | from torch.nn.utils import weight_norm 15 | matplotlib.use("Agg") 16 | import matplotlib.pylab as plt 17 | import re 18 | import pathlib 19 | 20 | 21 | def seed_everything(seed, cudnn_deterministic=False): 22 | """ 23 | Function that sets seed for pseudo-random number generators in: 24 | pytorch, numpy, python.random 25 | 26 | Args: 27 | seed: the integer value seed for global random state 28 | """ 29 | if seed is not None: 30 | # print(f"Global seed set to {seed}") 31 | random.seed(seed) 32 | np.random.seed(seed) 33 | torch.manual_seed(seed) 34 | torch.cuda.manual_seed_all(seed) 35 | 36 | # if cudnn_deterministic: 37 | # torch.backends.cudnn.deterministic = True 38 | # warnings.warn('You have chosen to seed training. ' 39 | # 'This will turn on the CUDNN deterministic setting, ' 40 | # 'which can slow down your training considerably! ' 41 | # 'You may see unexpected behavior when restarting ' 42 | # 'from checkpoints.') 43 | 44 | 45 | def is_primary(): 46 | return get_rank() == 0 47 | 48 | 49 | def get_rank(): 50 | if not dist.is_available(): 51 | return 0 52 | if not dist.is_initialized(): 53 | return 0 54 | 55 | return dist.get_rank() 56 | 57 | 58 | def load_yaml_config(path): 59 | with open(path) as f: 60 | config = yaml.full_load(f) 61 | return config 62 | 63 | 64 | def save_config_to_yaml(config, path): 65 | assert path.endswith('.yaml') 66 | with open(path, 'w') as f: 67 | f.write(yaml.dump(config)) 68 | f.close() 69 | 70 | 71 | def save_dict_to_json(d, path, indent=None): 72 | json.dump(d, open(path, 'w'), indent=indent) 73 | 74 | 75 | def load_dict_from_json(path): 76 | return json.load(open(path, 'r')) 77 | 78 | 79 | def write_args(args, path): 80 | args_dict = dict((name, getattr(args, name)) for name in dir(args) 81 | if not name.startswith('_')) 82 | with open(path, 'a') as args_file: 83 | args_file.write('==> torch version: {}\n'.format(torch.__version__)) 84 | args_file.write( 85 | '==> cudnn version: {}\n'.format(torch.backends.cudnn.version())) 86 | args_file.write('==> Cmd:\n') 87 | args_file.write(str(sys.argv)) 88 | args_file.write('\n==> args:\n') 89 | for k, v in sorted(args_dict.items()): 90 | args_file.write(' %s: %s\n' % (str(k), str(v))) 91 | args_file.close() 92 | 93 | 94 | class Logger(object): 95 | def __init__(self, args): 96 | self.args = args 97 | self.save_dir = args.save_dir 98 | self.is_primary = is_primary() 99 | 100 | if self.is_primary: 101 | os.makedirs(self.save_dir, exist_ok=True) 102 | 103 | # save the args and config 104 | self.config_dir = os.path.join(self.save_dir, 'configs') 105 | os.makedirs(self.config_dir, exist_ok=True) 106 | file_name = os.path.join(self.config_dir, 'args.txt') 107 | write_args(args, file_name) 108 | 109 | log_dir = os.path.join(self.save_dir, 'logs') 110 | if not os.path.exists(log_dir): 111 | 
os.makedirs(log_dir, exist_ok=True) 112 | self.text_writer = open(os.path.join(log_dir, 'log.txt'), 113 | 'a') # 'w') 114 | if args.tensorboard: 115 | self.log_info('using tensorboard') 116 | self.tb_writer = torch.utils.tensorboard.SummaryWriter( 117 | log_dir=log_dir 118 | ) # tensorboard.SummaryWriter(log_dir=log_dir) 119 | else: 120 | self.tb_writer = None 121 | 122 | def save_config(self, config): 123 | if self.is_primary: 124 | save_config_to_yaml(config, 125 | os.path.join(self.config_dir, 'config.yaml')) 126 | 127 | def log_info(self, info, check_primary=True): 128 | if self.is_primary or (not check_primary): 129 | print(info) 130 | if self.is_primary: 131 | info = str(info) 132 | time_str = time.strftime('%Y-%m-%d-%H-%M') 133 | info = '{}: {}'.format(time_str, info) 134 | if not info.endswith('\n'): 135 | info += '\n' 136 | self.text_writer.write(info) 137 | self.text_writer.flush() 138 | 139 | def add_scalar(self, **kargs): 140 | """Log a scalar variable.""" 141 | if self.is_primary: 142 | if self.tb_writer is not None: 143 | self.tb_writer.add_scalar(**kargs) 144 | 145 | def add_scalars(self, **kargs): 146 | """Log a scalar variable.""" 147 | if self.is_primary: 148 | if self.tb_writer is not None: 149 | self.tb_writer.add_scalars(**kargs) 150 | 151 | def add_image(self, **kargs): 152 | """Log a scalar variable.""" 153 | if self.is_primary: 154 | if self.tb_writer is not None: 155 | self.tb_writer.add_image(**kargs) 156 | 157 | def add_images(self, **kargs): 158 | """Log a scalar variable.""" 159 | if self.is_primary: 160 | if self.tb_writer is not None: 161 | self.tb_writer.add_images(**kargs) 162 | 163 | def close(self): 164 | if self.is_primary: 165 | self.text_writer.close() 166 | self.tb_writer.close() 167 | 168 | 169 | def plot_spectrogram(spectrogram): 170 | fig, ax = plt.subplots(figsize=(10, 2)) 171 | im = ax.imshow( 172 | spectrogram, aspect="auto", origin="lower", interpolation='none') 173 | plt.colorbar(im, ax=ax) 174 | 175 | fig.canvas.draw() 176 | plt.close() 177 | 178 | return fig 179 | 180 | 181 | def init_weights(m, mean=0.0, std=0.01): 182 | classname = m.__class__.__name__ 183 | if classname.find("Conv") != -1: 184 | m.weight.data.normal_(mean, std) 185 | 186 | 187 | def apply_weight_norm(m): 188 | classname = m.__class__.__name__ 189 | if classname.find("Conv") != -1: 190 | weight_norm(m) 191 | 192 | 193 | def get_padding(kernel_size, dilation=1): 194 | return int((kernel_size * dilation - dilation) / 2) 195 | 196 | 197 | def load_checkpoint(filepath, device): 198 | assert os.path.isfile(filepath) 199 | print("Loading '{}'".format(filepath)) 200 | checkpoint_dict = torch.load(filepath, map_location=device) 201 | print("Complete.") 202 | return checkpoint_dict 203 | 204 | 205 | def save_checkpoint(filepath, obj, num_ckpt_keep=5): 206 | name = re.match(r'(do|g)_\d+', pathlib.Path(filepath).name).group(1) 207 | ckpts = sorted(pathlib.Path(filepath).parent.glob(f'{name}_*')) 208 | if len(ckpts) > num_ckpt_keep: 209 | [os.remove(c) for c in ckpts[:-num_ckpt_keep]] 210 | print("Saving checkpoint to {}".format(filepath)) 211 | torch.save(obj, filepath) 212 | print("Complete.") 213 | 214 | 215 | def scan_checkpoint(cp_dir, prefix): 216 | pattern = os.path.join(cp_dir, prefix + '????????') 217 | cp_list = glob.glob(pattern) 218 | if len(cp_list) == 0: 219 | return None 220 | return sorted(cp_list)[-1] 221 | -------------------------------------------------------------------------------- /matcha/hifigan/meldataset.py: 
-------------------------------------------------------------------------------- 1 | """ from https://github.com/jik876/hifi-gan """ 2 | 3 | import math 4 | import os 5 | import random 6 | 7 | import numpy as np 8 | import torch 9 | import torch.utils.data 10 | from librosa.filters import mel as librosa_mel_fn 11 | from librosa.util import normalize 12 | from scipy.io.wavfile import read 13 | 14 | MAX_WAV_VALUE = 32768.0 15 | 16 | 17 | def load_wav(full_path): 18 | sampling_rate, data = read(full_path) 19 | return data, sampling_rate 20 | 21 | 22 | def dynamic_range_compression(x, C=1, clip_val=1e-5): 23 | return np.log(np.clip(x, a_min=clip_val, a_max=None) * C) 24 | 25 | 26 | def dynamic_range_decompression(x, C=1): 27 | return np.exp(x) / C 28 | 29 | 30 | def dynamic_range_compression_torch(x, C=1, clip_val=1e-5): 31 | return torch.log(torch.clamp(x, min=clip_val) * C) 32 | 33 | 34 | def dynamic_range_decompression_torch(x, C=1): 35 | return torch.exp(x) / C 36 | 37 | 38 | def spectral_normalize_torch(magnitudes): 39 | output = dynamic_range_compression_torch(magnitudes) 40 | return output 41 | 42 | 43 | def spectral_de_normalize_torch(magnitudes): 44 | output = dynamic_range_decompression_torch(magnitudes) 45 | return output 46 | 47 | 48 | mel_basis = {} 49 | hann_window = {} 50 | 51 | 52 | def mel_spectrogram(y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False): 53 | if torch.min(y) < -1.0: 54 | print("min value is ", torch.min(y)) 55 | if torch.max(y) > 1.0: 56 | print("max value is ", torch.max(y)) 57 | 58 | global mel_basis, hann_window # pylint: disable=global-statement 59 | if fmax not in mel_basis: 60 | mel = librosa_mel_fn(sampling_rate, n_fft, num_mels, fmin, fmax) 61 | mel_basis[str(fmax) + "_" + str(y.device)] = torch.from_numpy(mel).float().to(y.device) 62 | hann_window[str(y.device)] = torch.hann_window(win_size).to(y.device) 63 | 64 | y = torch.nn.functional.pad( 65 | y.unsqueeze(1), (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)), mode="reflect" 66 | ) 67 | y = y.squeeze(1) 68 | 69 | spec = torch.view_as_real( 70 | torch.stft( 71 | y, 72 | n_fft, 73 | hop_length=hop_size, 74 | win_length=win_size, 75 | window=hann_window[str(y.device)], 76 | center=center, 77 | pad_mode="reflect", 78 | normalized=False, 79 | onesided=True, 80 | return_complex=True, 81 | ) 82 | ) 83 | 84 | spec = torch.sqrt(spec.pow(2).sum(-1) + (1e-9)) 85 | 86 | spec = torch.matmul(mel_basis[str(fmax) + "_" + str(y.device)], spec) 87 | spec = spectral_normalize_torch(spec) 88 | 89 | return spec 90 | 91 | 92 | def get_dataset_filelist(a): 93 | with open(a.input_training_file, encoding="utf-8") as fi: 94 | training_files = [ 95 | os.path.join(a.input_wavs_dir, x.split("|")[0] + ".wav") for x in fi.read().split("\n") if len(x) > 0 96 | ] 97 | 98 | with open(a.input_validation_file, encoding="utf-8") as fi: 99 | validation_files = [ 100 | os.path.join(a.input_wavs_dir, x.split("|")[0] + ".wav") for x in fi.read().split("\n") if len(x) > 0 101 | ] 102 | return training_files, validation_files 103 | 104 | 105 | class MelDataset(torch.utils.data.Dataset): 106 | def __init__( 107 | self, 108 | training_files, 109 | segment_size, 110 | n_fft, 111 | num_mels, 112 | hop_size, 113 | win_size, 114 | sampling_rate, 115 | fmin, 116 | fmax, 117 | split=True, 118 | shuffle=True, 119 | n_cache_reuse=1, 120 | device=None, 121 | fmax_loss=None, 122 | fine_tuning=False, 123 | base_mels_path=None, 124 | ): 125 | self.audio_files = training_files 126 | random.seed(1234) 127 | if shuffle: 
128 | random.shuffle(self.audio_files) 129 | self.segment_size = segment_size 130 | self.sampling_rate = sampling_rate 131 | self.split = split 132 | self.n_fft = n_fft 133 | self.num_mels = num_mels 134 | self.hop_size = hop_size 135 | self.win_size = win_size 136 | self.fmin = fmin 137 | self.fmax = fmax 138 | self.fmax_loss = fmax_loss 139 | self.cached_wav = None 140 | self.n_cache_reuse = n_cache_reuse 141 | self._cache_ref_count = 0 142 | self.device = device 143 | self.fine_tuning = fine_tuning 144 | self.base_mels_path = base_mels_path 145 | 146 | def __getitem__(self, index): 147 | filename = self.audio_files[index] 148 | if self._cache_ref_count == 0: 149 | audio, sampling_rate = load_wav(filename) 150 | audio = audio / MAX_WAV_VALUE 151 | if not self.fine_tuning: 152 | audio = normalize(audio) * 0.95 153 | self.cached_wav = audio 154 | if sampling_rate != self.sampling_rate: 155 | raise ValueError(f"{sampling_rate} SR doesn't match target {self.sampling_rate} SR") 156 | self._cache_ref_count = self.n_cache_reuse 157 | else: 158 | audio = self.cached_wav 159 | self._cache_ref_count -= 1 160 | 161 | audio = torch.FloatTensor(audio) 162 | audio = audio.unsqueeze(0) 163 | 164 | if not self.fine_tuning: 165 | if self.split: 166 | if audio.size(1) >= self.segment_size: 167 | max_audio_start = audio.size(1) - self.segment_size 168 | audio_start = random.randint(0, max_audio_start) 169 | audio = audio[:, audio_start : audio_start + self.segment_size] 170 | else: 171 | audio = torch.nn.functional.pad(audio, (0, self.segment_size - audio.size(1)), "constant") 172 | 173 | mel = mel_spectrogram( 174 | audio, 175 | self.n_fft, 176 | self.num_mels, 177 | self.sampling_rate, 178 | self.hop_size, 179 | self.win_size, 180 | self.fmin, 181 | self.fmax, 182 | center=False, 183 | ) 184 | else: 185 | mel = np.load(os.path.join(self.base_mels_path, os.path.splitext(os.path.split(filename)[-1])[0] + ".npy")) 186 | mel = torch.from_numpy(mel) 187 | 188 | if len(mel.shape) < 3: 189 | mel = mel.unsqueeze(0) 190 | 191 | if self.split: 192 | frames_per_seg = math.ceil(self.segment_size / self.hop_size) 193 | 194 | if audio.size(1) >= self.segment_size: 195 | mel_start = random.randint(0, mel.size(2) - frames_per_seg - 1) 196 | mel = mel[:, :, mel_start : mel_start + frames_per_seg] 197 | audio = audio[:, mel_start * self.hop_size : (mel_start + frames_per_seg) * self.hop_size] 198 | else: 199 | mel = torch.nn.functional.pad(mel, (0, frames_per_seg - mel.size(2)), "constant") 200 | audio = torch.nn.functional.pad(audio, (0, self.segment_size - audio.size(1)), "constant") 201 | 202 | mel_loss = mel_spectrogram( 203 | audio, 204 | self.n_fft, 205 | self.num_mels, 206 | self.sampling_rate, 207 | self.hop_size, 208 | self.win_size, 209 | self.fmin, 210 | self.fmax_loss, 211 | center=False, 212 | ) 213 | 214 | return (mel.squeeze(), audio.squeeze(0), filename, mel_loss.squeeze()) 215 | 216 | def __len__(self): 217 | return len(self.audio_files) 218 | -------------------------------------------------------------------------------- /matcha/models/baselightningmodule.py: -------------------------------------------------------------------------------- 1 | """ 2 | This is a base lightning module that can be used to train a model. 3 | The benefit of this abstraction is that all the logic outside of model definition can be reused for different models. 
4 | """ 5 | import inspect 6 | from abc import ABC 7 | from typing import Any, Dict 8 | 9 | import torch 10 | from lightning import LightningModule 11 | from lightning.pytorch.utilities import grad_norm 12 | 13 | from matcha import utils 14 | from matcha.utils.utils import plot_tensor 15 | 16 | log = utils.get_pylogger(__name__) 17 | 18 | 19 | class BaseLightningClass(LightningModule, ABC): 20 | def update_data_statistics(self, data_statistics): 21 | if data_statistics is None: 22 | data_statistics = { 23 | "mel_mean": 0.0, 24 | "mel_std": 1.0, 25 | } 26 | 27 | self.register_buffer("mel_mean", torch.tensor(data_statistics["mel_mean"])) 28 | self.register_buffer("mel_std", torch.tensor(data_statistics["mel_std"])) 29 | 30 | def configure_optimizers(self) -> Any: 31 | optimizer = self.hparams.optimizer(params=self.parameters()) 32 | if self.hparams.scheduler not in (None, {}): 33 | scheduler_args = {} 34 | # Manage last epoch for exponential schedulers 35 | if "last_epoch" in inspect.signature(self.hparams.scheduler.scheduler).parameters: 36 | if hasattr(self, "ckpt_loaded_epoch"): 37 | current_epoch = self.ckpt_loaded_epoch - 1 38 | else: 39 | current_epoch = -1 40 | 41 | scheduler_args.update({"optimizer": optimizer}) 42 | scheduler = self.hparams.scheduler.scheduler(**scheduler_args) 43 | scheduler.last_epoch = current_epoch 44 | return { 45 | "optimizer": optimizer, 46 | "lr_scheduler": { 47 | "scheduler": scheduler, 48 | "interval": self.hparams.scheduler.lightning_args.interval, 49 | "frequency": self.hparams.scheduler.lightning_args.frequency, 50 | "name": "learning_rate", 51 | }, 52 | } 53 | 54 | return {"optimizer": optimizer} 55 | 56 | def get_losses(self, batch): 57 | x, x_lengths = batch["x"], batch["x_lengths"] 58 | y, y_lengths = batch["y"], batch["y_lengths"] 59 | spks = batch["spks"] 60 | 61 | dur_loss, prior_loss, diff_loss = self( 62 | x=x, 63 | x_lengths=x_lengths, 64 | y=y, 65 | y_lengths=y_lengths, 66 | spks=spks, 67 | out_size=self.out_size, 68 | ) 69 | return { 70 | "dur_loss": dur_loss, 71 | "prior_loss": prior_loss, 72 | "diff_loss": diff_loss, 73 | } 74 | 75 | def on_load_checkpoint(self, checkpoint: Dict[str, Any]) -> None: 76 | self.ckpt_loaded_epoch = checkpoint["epoch"] # pylint: disable=attribute-defined-outside-init 77 | 78 | def training_step(self, batch: Any, batch_idx: int): 79 | loss_dict = self.get_losses(batch) 80 | self.log( 81 | "step", 82 | float(self.global_step), 83 | on_step=True, 84 | prog_bar=True, 85 | logger=True, 86 | sync_dist=True, 87 | ) 88 | 89 | self.log( 90 | "sub_loss/train_dur_loss", 91 | loss_dict["dur_loss"], 92 | on_step=True, 93 | on_epoch=True, 94 | logger=True, 95 | sync_dist=True, 96 | ) 97 | self.log( 98 | "sub_loss/train_prior_loss", 99 | loss_dict["prior_loss"], 100 | on_step=True, 101 | on_epoch=True, 102 | logger=True, 103 | sync_dist=True, 104 | ) 105 | self.log( 106 | "sub_loss/train_diff_loss", 107 | loss_dict["diff_loss"], 108 | on_step=True, 109 | on_epoch=True, 110 | logger=True, 111 | sync_dist=True, 112 | ) 113 | 114 | total_loss = sum(loss_dict.values()) 115 | self.log( 116 | "loss/train", 117 | total_loss, 118 | on_step=True, 119 | on_epoch=True, 120 | logger=True, 121 | prog_bar=True, 122 | sync_dist=True, 123 | ) 124 | 125 | return {"loss": total_loss, "log": loss_dict} 126 | 127 | def validation_step(self, batch: Any, batch_idx: int): 128 | loss_dict = self.get_losses(batch) 129 | self.log( 130 | "sub_loss/val_dur_loss", 131 | loss_dict["dur_loss"], 132 | on_step=True, 133 | on_epoch=True, 134 | logger=True, 
135 | sync_dist=True, 136 | ) 137 | self.log( 138 | "sub_loss/val_prior_loss", 139 | loss_dict["prior_loss"], 140 | on_step=True, 141 | on_epoch=True, 142 | logger=True, 143 | sync_dist=True, 144 | ) 145 | self.log( 146 | "sub_loss/val_diff_loss", 147 | loss_dict["diff_loss"], 148 | on_step=True, 149 | on_epoch=True, 150 | logger=True, 151 | sync_dist=True, 152 | ) 153 | 154 | total_loss = sum(loss_dict.values()) 155 | self.log( 156 | "loss/val", 157 | total_loss, 158 | on_step=True, 159 | on_epoch=True, 160 | logger=True, 161 | prog_bar=True, 162 | sync_dist=True, 163 | ) 164 | 165 | return total_loss 166 | 167 | def on_validation_end(self) -> None: 168 | if self.trainer.is_global_zero: 169 | one_batch = next(iter(self.trainer.val_dataloaders)) 170 | if self.current_epoch == 0: 171 | log.debug("Plotting original samples") 172 | for i in range(2): 173 | y = one_batch["y"][i].unsqueeze(0).to(self.device) 174 | self.logger.experiment.add_image( 175 | f"original/{i}", 176 | plot_tensor(y.squeeze().cpu()), 177 | self.current_epoch, 178 | dataformats="HWC", 179 | ) 180 | 181 | log.debug("Synthesising...") 182 | for i in range(2): 183 | x = one_batch["x"][i].unsqueeze(0).to(self.device) 184 | x_lengths = one_batch["x_lengths"][i].unsqueeze(0).to(self.device) 185 | spks = one_batch["spks"][i].unsqueeze(0).to(self.device) if one_batch["spks"] is not None else None 186 | output = self.synthesise(x[:, :x_lengths], x_lengths, n_timesteps=10, spks=spks) 187 | y_enc, y_dec = output["encoder_outputs"], output["decoder_outputs"] 188 | attn = output["attn"] 189 | self.logger.experiment.add_image( 190 | f"generated_enc/{i}", 191 | plot_tensor(y_enc.squeeze().cpu()), 192 | self.current_epoch, 193 | dataformats="HWC", 194 | ) 195 | self.logger.experiment.add_image( 196 | f"generated_dec/{i}", 197 | plot_tensor(y_dec.squeeze().cpu()), 198 | self.current_epoch, 199 | dataformats="HWC", 200 | ) 201 | self.logger.experiment.add_image( 202 | f"alignment/{i}", 203 | plot_tensor(attn.squeeze().cpu()), 204 | self.current_epoch, 205 | dataformats="HWC", 206 | ) 207 | 208 | def on_before_optimizer_step(self, optimizer): 209 | self.log_dict({f"grad_norm/{k}": v for k, v in grad_norm(self, norm_type=2).items()}) 210 | -------------------------------------------------------------------------------- /cosyvoice/flow/flow.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Zhihao Du) 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
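# Usage sketch (illustrative only, not part of the original file): the MaskedDiffWithXvec
# module defined below turns discrete speech tokens plus a speaker x-vector into a mel
# spectrogram that is then handed to the vocoder. A minimal, hypothetical call (the names
# `encoder`, `length_regulator`, `cfm_decoder` and `xvector` stand in for the instances
# built from the CosyVoice config):
#
#   flow = MaskedDiffWithXvec(encoder=encoder,
#                             length_regulator=length_regulator,
#                             decoder=cfm_decoder)
#   mel = flow.inference(token, token_len,
#                        prompt_token, prompt_token_len,
#                        prompt_feat, prompt_feat_len,
#                        xvector)      # (1, 80, T) with the default output_size
#
# The target mel length is computed as (token_len / 50 * 22050 / 256): 50 Hz speech tokens
# are stretched to the roughly 86 frames per second implied by a 22050 Hz sample rate and
# a 256-sample hop.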
14 | import logging 15 | from typing import Dict, Optional 16 | import torch 17 | import torch.nn as nn 18 | from torch.nn import functional as F 19 | from omegaconf import DictConfig 20 | from cosyvoice.utils.mask import make_pad_mask 21 | 22 | 23 | class MaskedDiffWithXvec(torch.nn.Module): 24 | def __init__(self, 25 | input_size: int = 512, 26 | output_size: int = 80, 27 | spk_embed_dim: int = 192, 28 | output_type: str = "mel", 29 | vocab_size: int = 4096, 30 | input_frame_rate: int = 50, 31 | only_mask_loss: bool = True, 32 | encoder: torch.nn.Module = None, 33 | length_regulator: torch.nn.Module = None, 34 | decoder: torch.nn.Module = None, 35 | decoder_conf: Dict = {'in_channels': 240, 'out_channel': 80, 'spk_emb_dim': 80, 'n_spks': 1, 'cfm_params': DictConfig({'sigma_min': 1e-06, 'solver': 'euler', 't_scheduler': 'cosine', 'training_cfg_rate': 0.2, 'inference_cfg_rate': 0.7, 'reg_loss_type': 'l1'}), 'decoder_params': {'channels': [256, 256], 'dropout': 0.0, 'attention_head_dim': 64, 'n_blocks': 4, 'num_mid_blocks': 12, 'num_heads': 8, 'act_fn': 'gelu'}}, 36 | mel_feat_conf: Dict = {'n_fft': 1024, 'num_mels': 80, 'sampling_rate': 22050, 'hop_size': 256, 'win_size': 1024, 'fmin': 0, 'fmax': 8000}): 37 | super().__init__() 38 | self.input_size = input_size 39 | self.output_size = output_size 40 | self.decoder_conf = decoder_conf 41 | self.mel_feat_conf = mel_feat_conf 42 | self.vocab_size = vocab_size 43 | self.output_type = output_type 44 | self.input_frame_rate = input_frame_rate 45 | logging.info(f"input frame rate={self.input_frame_rate}") 46 | self.input_embedding = nn.Embedding(vocab_size, input_size) 47 | self.spk_embed_affine_layer = torch.nn.Linear(spk_embed_dim, output_size) 48 | self.encoder = encoder 49 | self.encoder_proj = torch.nn.Linear(self.encoder.output_size(), output_size) 50 | self.decoder = decoder 51 | self.length_regulator = length_regulator 52 | self.only_mask_loss = only_mask_loss 53 | 54 | def forward( 55 | self, 56 | batch: dict, 57 | device: torch.device, 58 | ) -> Dict[str, Optional[torch.Tensor]]: 59 | token = batch['speech_token'].to(device) 60 | token_len = batch['speech_token_len'].to(device) 61 | feat = batch['speech_feat'].to(device) 62 | feat_len = batch['speech_feat_len'].to(device) 63 | embedding = batch['utt_embedding'].to(device) 64 | 65 | # xvec projection 66 | embedding = F.normalize(embedding, dim=1) 67 | embedding = self.spk_embed_affine_layer(embedding) 68 | 69 | # concat text and prompt_text 70 | mask = (~make_pad_mask(token_len)).float().unsqueeze(-1).to(device) 71 | token = self.input_embedding(torch.clamp(token, min=0)) * mask 72 | 73 | # text encode 74 | h, h_lengths = self.encoder(token, token_len) 75 | h = self.encoder_proj(h) 76 | h, h_lengths = self.length_regulator(h, feat_len) 77 | 78 | # get conditions 79 | conds = torch.zeros(feat.shape, device=token.device) 80 | conds = conds.transpose(1, 2) 81 | 82 | mask = (~make_pad_mask(feat_len)).to(h) 83 | feat = F.interpolate(feat.unsqueeze(dim=1), size=h.shape[1:], mode="nearest").squeeze(dim=1) 84 | loss, _ = self.decoder.compute_loss( 85 | feat.transpose(1, 2).contiguous(), 86 | mask.unsqueeze(1), 87 | h.transpose(1, 2).contiguous(), 88 | embedding, 89 | cond=conds 90 | ) 91 | return {'loss': loss} 92 | 93 | @torch.inference_mode() 94 | def inference(self, 95 | token, 96 | token_len, 97 | prompt_token, 98 | prompt_token_len, 99 | prompt_feat, 100 | prompt_feat_len, 101 | embedding): 102 | assert token.shape[0] == 1 103 | # xvec projection 104 | embedding = F.normalize(embedding, 
dim=1) 105 | embedding = self.spk_embed_affine_layer(embedding) 106 | 107 | # concat text and prompt_text 108 | token, token_len = torch.concat([prompt_token, token], dim=1), prompt_token_len + token_len 109 | mask = (~make_pad_mask(token_len)).float().unsqueeze(-1).to(embedding) 110 | token = self.input_embedding(torch.clamp(token, min=0)) * mask 111 | 112 | # text encode 113 | h, h_lengths = self.encoder(token, token_len) 114 | h = self.encoder_proj(h) 115 | feat_len = (token_len / 50 * 22050 / 256).int() 116 | h, h_lengths = self.length_regulator(h, feat_len) 117 | 118 | # get conditions 119 | conds = torch.zeros([1, feat_len.max().item(), self.output_size], device=token.device) 120 | if prompt_feat.shape[1] != 0: 121 | for i, j in enumerate(prompt_feat_len): 122 | conds[i, :j] = prompt_feat[i] 123 | conds = conds.transpose(1, 2) 124 | 125 | mask = (~make_pad_mask(feat_len)).to(h) 126 | feat = self.decoder( 127 | mu=h.transpose(1, 2).contiguous(), 128 | mask=mask.unsqueeze(1), 129 | spks=embedding, 130 | cond=conds, 131 | n_timesteps=10 132 | ) 133 | if prompt_feat.shape[1] != 0: 134 | feat = feat[:, :, prompt_feat.shape[1]:] 135 | return feat 136 | 137 | @torch.inference_mode() 138 | def inference_stream(self, 139 | token, 140 | token_len, 141 | prompt_token, 142 | prompt_token_len, 143 | prompt_feat, 144 | prompt_feat_len, 145 | embedding): 146 | assert token.shape[0] == 1 147 | # xvec projection 148 | embedding = F.normalize(embedding, dim=1) 149 | embedding = self.spk_embed_affine_layer(embedding) 150 | 151 | # concat text and prompt_text 152 | token, token_len = torch.concat([prompt_token, token], dim=1), prompt_token_len + token_len 153 | mask = (~make_pad_mask(token_len)).float().unsqueeze(-1).to(embedding) 154 | token = self.input_embedding(torch.clamp(token, min=0)) * mask 155 | 156 | # text encode 157 | h, h_lengths = self.encoder(token, token_len) 158 | h = self.encoder_proj(h) 159 | feat_len = (token_len / 50 * 22050 / 256).int() 160 | h, h_lengths = self.length_regulator(h, feat_len) 161 | 162 | # get conditions 163 | conds = torch.zeros([1, feat_len.max().item(), self.output_size], device=token.device) 164 | if prompt_feat.shape[1] != 0: 165 | for i, j in enumerate(prompt_feat_len): 166 | conds[i, :j] = prompt_feat[i] 167 | conds = conds.transpose(1, 2) 168 | 169 | mask = (~make_pad_mask(feat_len)).to(h) 170 | feat = self.decoder( 171 | mu=h.transpose(1, 2).contiguous(), 172 | mask=mask.unsqueeze(1), 173 | spks=embedding, 174 | cond=conds, 175 | n_timesteps=10 176 | ) 177 | if prompt_feat.shape[1] != 0: 178 | feat = feat[:, :, prompt_feat.shape[1]:] 179 | yield feat 180 | -------------------------------------------------------------------------------- /academicodec/models/hificodec/meldataset.py: -------------------------------------------------------------------------------- 1 | # code based on https://github.com/b04901014/MQTTS 2 | import math 3 | import os 4 | import random 5 | 6 | import librosa 7 | import numpy as np 8 | import torch.utils.data 9 | from librosa.filters import mel as librosa_mel_fn 10 | 11 | 12 | def load_wav(full_path, sr): 13 | wav, sr = librosa.load(full_path, sr=sr) 14 | return wav, sr 15 | 16 | 17 | def dynamic_range_compression(x, C=1, clip_val=1e-5): 18 | return np.log(np.clip(x, a_min=clip_val, a_max=None) * C) 19 | 20 | 21 | def dynamic_range_decompression(x, C=1): 22 | return np.exp(x) / C 23 | 24 | 25 | def dynamic_range_compression_torch(x, C=1, clip_val=1e-5): 26 | return torch.log(torch.clamp(x, min=clip_val) * C) 27 | 28 | 29 | def 
dynamic_range_decompression_torch(x, C=1): 30 | return torch.exp(x) / C 31 | 32 | 33 | def spectral_normalize_torch(magnitudes): 34 | output = dynamic_range_compression_torch(magnitudes) 35 | return output 36 | 37 | 38 | def spectral_de_normalize_torch(magnitudes): 39 | output = dynamic_range_decompression_torch(magnitudes) 40 | return output 41 | 42 | 43 | mel_basis = {} 44 | hann_window = {} 45 | 46 | 47 | def mel_spectrogram(y, 48 | n_fft, 49 | num_mels, 50 | sampling_rate, 51 | hop_size, 52 | win_size, 53 | fmin, 54 | fmax, 55 | center=False): 56 | if torch.min(y) < -1.: 57 | print('min value is ', torch.min(y)) 58 | if torch.max(y) > 1.: 59 | print('max value is ', torch.max(y)) 60 | 61 | global mel_basis, hann_window 62 | if fmax not in mel_basis: 63 | mel = librosa_mel_fn(sampling_rate, n_fft, num_mels, fmin, fmax) 64 | mel_basis[str(fmax) + '_' + 65 | str(y.device)] = torch.from_numpy(mel).float().to(y.device) 66 | hann_window[str(y.device)] = torch.hann_window(win_size).to(y.device) 67 | 68 | y = torch.nn.functional.pad( 69 | y.unsqueeze(1), (int((n_fft - hop_size) / 2), int( 70 | (n_fft - hop_size) / 2)), 71 | mode='reflect') 72 | y = y.squeeze(1) 73 | 74 | spec = torch.stft( 75 | y, 76 | n_fft, 77 | hop_length=hop_size, 78 | win_length=win_size, 79 | window=hann_window[str(y.device)], 80 | center=center, 81 | pad_mode='reflect', 82 | normalized=False, 83 | onesided=True) 84 | 85 | spec = torch.sqrt(spec.pow(2).sum(-1) + (1e-9)) 86 | 87 | spec = torch.matmul(mel_basis[str(fmax) + '_' + str(y.device)], spec) 88 | spec = spectral_normalize_torch(spec) 89 | 90 | return spec 91 | 92 | 93 | def get_dataset_filelist(a): 94 | with open(a.input_training_file, 'r') as f: 95 | training_files = [l.strip() for l in f] 96 | with open(a.input_validation_file, 'r') as f: 97 | validation_files = [l.strip() for l in f] 98 | return training_files, validation_files 99 | 100 | 101 | class MelDataset(torch.utils.data.Dataset): 102 | def __init__(self, 103 | training_files, 104 | segment_size, 105 | n_fft, 106 | num_mels, 107 | hop_size, 108 | win_size, 109 | sampling_rate, 110 | fmin, 111 | fmax, 112 | split=True, 113 | shuffle=True, 114 | n_cache_reuse=1, 115 | device=None, 116 | fmax_loss=None, 117 | fine_tuning=False, 118 | base_mels_path=None): 119 | self.audio_files = training_files 120 | random.seed(1234) 121 | if shuffle: 122 | random.shuffle(self.audio_files) 123 | self.segment_size = segment_size 124 | self.sampling_rate = sampling_rate 125 | self.split = split 126 | self.n_fft = n_fft 127 | self.num_mels = num_mels 128 | self.hop_size = hop_size 129 | self.win_size = win_size 130 | self.fmin = fmin 131 | self.fmax = fmax 132 | self.fmax_loss = fmax_loss 133 | self.cached_wav = None 134 | self.n_cache_reuse = n_cache_reuse 135 | self._cache_ref_count = 0 136 | self.device = device 137 | self.fine_tuning = fine_tuning 138 | self.base_mels_path = base_mels_path 139 | 140 | def __getitem__(self, index): 141 | filename = self.audio_files[index] 142 | if self._cache_ref_count == 0: 143 | try: 144 | # Note by yuantian: load with the sample_rate of config 145 | audio, sampling_rate = load_wav(filename, sr=self.sampling_rate) 146 | except Exception as e: 147 | print(f"Error on audio: {filename}") 148 | audio = np.random.normal(size=(160000, )) * 0.05 149 | sampling_rate = self.sampling_rate 150 | self.cached_wav = audio 151 | if sampling_rate != self.sampling_rate: 152 | raise ValueError("{} SR doesn't match target {} SR".format( 153 | sampling_rate, self.sampling_rate)) 154 | self._cache_ref_count 
= self.n_cache_reuse 155 | else: 156 | audio = self.cached_wav 157 | self._cache_ref_count -= 1 158 | 159 | audio = torch.FloatTensor(audio) 160 | audio = audio.unsqueeze(0) 161 | 162 | if not self.fine_tuning: 163 | if self.split: 164 | if audio.size(1) >= self.segment_size: 165 | max_audio_start = audio.size(1) - self.segment_size 166 | audio_start = random.randint(0, max_audio_start) 167 | audio = audio[:, audio_start:audio_start + 168 | self.segment_size] 169 | else: 170 | audio = torch.nn.functional.pad(audio, ( 171 | 0, self.segment_size - audio.size(1)), 'constant') 172 | 173 | mel = mel_spectrogram( 174 | audio, 175 | self.n_fft, 176 | self.num_mels, 177 | self.sampling_rate, 178 | self.hop_size, 179 | self.win_size, 180 | self.fmin, 181 | self.fmax, 182 | center=False) 183 | else: 184 | mel = np.load( 185 | os.path.join(self.base_mels_path, 186 | os.path.splitext(os.path.split(filename)[-1])[0] + 187 | '.npy')) 188 | mel = torch.from_numpy(mel) 189 | 190 | if len(mel.shape) < 3: 191 | mel = mel.unsqueeze(0) 192 | 193 | if self.split: 194 | frames_per_seg = math.ceil(self.segment_size / self.hop_size) 195 | 196 | if audio.size(1) >= self.segment_size: 197 | mel_start = random.randint(0, 198 | mel.size(2) - frames_per_seg - 1) 199 | mel = mel[:, :, mel_start:mel_start + frames_per_seg] 200 | audio = audio[:, mel_start * self.hop_size:( 201 | mel_start + frames_per_seg) * self.hop_size] 202 | else: 203 | mel = torch.nn.functional.pad(mel, ( 204 | 0, frames_per_seg - mel.size(2)), 'constant') 205 | audio = torch.nn.functional.pad(audio, ( 206 | 0, self.segment_size - audio.size(1)), 'constant') 207 | 208 | mel_loss = mel_spectrogram( 209 | audio, 210 | self.n_fft, 211 | self.num_mels, 212 | self.sampling_rate, 213 | self.hop_size, 214 | self.win_size, 215 | self.fmin, 216 | self.fmax_loss, 217 | center=False) 218 | 219 | return (mel.squeeze(), audio.squeeze(0), filename, mel_loss.squeeze()) 220 | 221 | def __len__(self): 222 | return len(self.audio_files) 223 | --------------------------------------------------------------------------------
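Both meldataset.py copies above expose the same mel_spectrogram() helper. The sketch below shows a typical call with HiFi-GAN-style settings; the parameter values are illustrative rather than taken from a config in this repository, and the call assumes a librosa version older than 0.10, since both copies pass positional arguments to librosa.filters.mel. The academicodec/models/hificodec copy additionally calls torch.stft without return_complex, which recent PyTorch versions reject, whereas the matcha/hifigan copy already wraps it in torch.view_as_real(torch.stft(..., return_complex=True)).

import torch
from matcha.hifigan.meldataset import mel_spectrogram

# One batch of one second of dummy audio in [-1, 1]; shape (batch, samples).
wav = torch.rand(1, 22050) * 2 - 1

# Illustrative HiFi-GAN-style settings; real values come from the model config.
mel = mel_spectrogram(
    wav,
    n_fft=1024, num_mels=80, sampling_rate=22050,
    hop_size=256, win_size=1024, fmin=0, fmax=8000,
    center=False,
)
print(mel.shape)  # (1, 80, ~86): roughly samples / hop_size frames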
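Usage sketch for the number verbalization helpers in cosyvoice/cli/zh_normalization/num.py above. This is illustrative only and assumes the DIGITS and UNITS tables defined earlier in that file map digits to 零一二三四五六七八九 and powers of ten to 十/百/千/万/亿, as in the upstream PaddleSpeech normalizer.

from cosyvoice.cli.zh_normalization.num import (
    RE_NUMBER, RE_RANGE, num2str, replace_number, replace_range, verbalize_digit)

num2str("15")                          # '十五' (a leading 一十 is abbreviated to 十)
num2str("3.20")                        # '三点二' (trailing zeros of the decimal part are dropped)
RE_NUMBER.sub(replace_number, "-4")    # '负四'
RE_RANGE.sub(replace_range, "5-8")     # '五到八'
verbalize_digit("110", alt_one=True)   # '幺幺零' (phone-style digit reading)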