├── results └── .gitkeep ├── cosyvoice ├── __init__.py ├── cli │ ├── __init__.py │ ├── model.py │ └── cosyvoice.py ├── utils │ ├── __init__.py │ ├── file_utils.py │ ├── class_utils.py │ ├── common.py │ ├── frontend_utils.py │ └── executor.py ├── dataset │ ├── __init__.py │ └── dataset.py ├── transformer │ ├── __init__.py │ ├── activation.py │ ├── label_smoothing_loss.py │ ├── positionwise_feed_forward.py │ ├── decoder_layer.py │ └── convolution.py ├── flow │ ├── length_regulator.py │ ├── flow_matching.py │ └── flow.py ├── hifigan │ └── f0_predictor.py └── bin │ ├── train.py │ └── inference.py ├── .gitignore ├── third_party └── Matcha-TTS │ ├── matcha │ ├── __init__.py │ ├── data │ │ ├── __init__.py │ │ └── components │ │ │ └── __init__.py │ ├── models │ │ ├── __init__.py │ │ ├── components │ │ │ ├── __init__.py │ │ │ └── flow_matching.py │ │ └── baselightningmodule.py │ ├── onnx │ │ ├── __init__.py │ │ ├── export.py │ │ └── infer.py │ ├── VERSION │ ├── hifigan │ │ ├── __init__.py │ │ ├── env.py │ │ ├── config.py │ │ ├── LICENSE │ │ ├── xutils.py │ │ ├── denoiser.py │ │ ├── README.md │ │ └── meldataset.py │ ├── utils │ │ ├── monotonic_align │ │ │ ├── setup.py │ │ │ ├── __init__.py │ │ │ └── core.pyx │ │ ├── __init__.py │ │ ├── pylogger.py │ │ ├── logging_utils.py │ │ ├── instantiators.py │ │ ├── audio.py │ │ ├── model.py │ │ ├── rich_utils.py │ │ ├── generate_data_statistics.py │ │ └── utils.py │ ├── text │ │ ├── symbols.py │ │ ├── __init__.py │ │ ├── numbers.py │ │ └── cleaners.py │ └── train.py │ ├── notebooks │ └── .gitkeep │ ├── configs │ ├── local │ │ └── .gitkeep │ ├── callbacks │ │ ├── none.yaml │ │ ├── default.yaml │ │ ├── rich_progress_bar.yaml │ │ ├── model_summary.yaml │ │ └── model_checkpoint.yaml │ ├── model │ │ ├── cfm │ │ │ └── default.yaml │ │ ├── optimizer │ │ │ └── adam.yaml │ │ ├── decoder │ │ │ └── default.yaml │ │ ├── matcha.yaml │ │ └── encoder │ │ │ └── default.yaml │ ├── trainer │ │ ├── cpu.yaml │ │ ├── gpu.yaml │ │ ├── mps.yaml │ │ ├── ddp.yaml │ │ ├── ddp_sim.yaml │ │ └── default.yaml │ ├── __init__.py │ ├── debug │ │ ├── fdr.yaml │ │ ├── overfit.yaml │ │ ├── limit.yaml │ │ ├── profiler.yaml │ │ └── default.yaml │ ├── logger │ │ ├── many_loggers.yaml │ │ ├── csv.yaml │ │ ├── tensorboard.yaml │ │ ├── neptune.yaml │ │ ├── mlflow.yaml │ │ ├── comet.yaml │ │ ├── wandb.yaml │ │ └── aim.yaml │ ├── extras │ │ └── default.yaml │ ├── experiment │ │ ├── ljspeech.yaml │ │ ├── multispeaker.yaml │ │ ├── ljspeech_min_memory.yaml │ │ └── hifi_dataset_piper_phonemizer.yaml │ ├── eval.yaml │ ├── data │ │ ├── vctk.yaml │ │ ├── hi-fi_en-US_female.yaml │ │ └── ljspeech.yaml │ ├── hydra │ │ └── default.yaml │ ├── paths │ │ └── default.yaml │ ├── train.yaml │ └── hparams_search │ │ └── mnist_optuna.yaml │ ├── matcha_tts.egg-info │ ├── dependency_links.txt │ ├── top_level.txt │ ├── entry_points.txt │ ├── requires.txt │ └── SOURCES.txt │ ├── scripts │ └── schedule.sh │ ├── MANIFEST.in │ ├── requirements.txt │ ├── LICENSE │ ├── pyproject.toml │ ├── Makefile │ └── setup.py ├── data ├── example.wav └── batch_files.csv ├── README.md ├── run_single_inference.sh ├── run_batch_inference.sh ├── requirements.txt └── batch_inference.py /results/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /cosyvoice/__init__.py: -------------------------------------------------------------------------------- 1 | 
-------------------------------------------------------------------------------- /cosyvoice/cli/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /cosyvoice/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /cosyvoice/dataset/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /cosyvoice/transformer/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | env 2 | __pycache__ 3 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/matcha/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/notebooks/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/configs/local/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/matcha/data/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/matcha/models/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/matcha/onnx/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/configs/callbacks/none.yaml: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/matcha/VERSION: -------------------------------------------------------------------------------- 1 | 0.0.5.1 2 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/matcha/hifigan/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/matcha/data/components/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/matcha/models/components/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/matcha_tts.egg-info/dependency_links.txt: 
-------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/matcha_tts.egg-info/top_level.txt: -------------------------------------------------------------------------------- 1 | configs 2 | matcha 3 | -------------------------------------------------------------------------------- /data/example.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Splend1d/BreezyVoice/HEAD/data/example.wav -------------------------------------------------------------------------------- /third_party/Matcha-TTS/configs/model/cfm/default.yaml: -------------------------------------------------------------------------------- 1 | name: CFM 2 | solver: euler 3 | sigma_min: 1e-4 4 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # BreezyVoice 2 | 3 | Migrated to [MediaTek-Research](https://github.com/mtkresearch/BreezyVoice) 4 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/configs/trainer/cpu.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - default 3 | 4 | accelerator: cpu 5 | devices: 1 6 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/configs/trainer/gpu.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - default 3 | 4 | accelerator: gpu 5 | devices: 1 6 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/configs/trainer/mps.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - default 3 | 4 | accelerator: mps 5 | devices: 1 6 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/configs/__init__.py: -------------------------------------------------------------------------------- 1 | # this file is needed here to include configs when building project as a package 2 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/configs/model/optimizer/adam.yaml: -------------------------------------------------------------------------------- 1 | _target_: torch.optim.Adam 2 | _partial_: true 3 | lr: 1e-4 4 | weight_decay: 0.0 5 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/configs/callbacks/default.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - model_checkpoint.yaml 3 | - model_summary.yaml 4 | - rich_progress_bar.yaml 5 | - _self_ 6 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/configs/trainer/ddp.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - default 3 | 4 | strategy: ddp 5 | 6 | accelerator: gpu 7 | devices: [0,1] 8 | num_nodes: 1 9 | sync_batchnorm: True 10 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/configs/trainer/ddp_sim.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - default 3 | 4 | # simulate 
DDP on CPU, useful for debugging 5 | accelerator: cpu 6 | devices: 2 7 | strategy: ddp_spawn 8 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/configs/model/decoder/default.yaml: -------------------------------------------------------------------------------- 1 | channels: [256, 256] 2 | dropout: 0.05 3 | attention_head_dim: 64 4 | n_blocks: 1 5 | num_mid_blocks: 2 6 | num_heads: 2 7 | act_fn: snakebeta 8 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/configs/debug/fdr.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | # runs 1 train, 1 validation and 1 test step 4 | 5 | defaults: 6 | - default 7 | 8 | trainer: 9 | fast_dev_run: true 10 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/matcha_tts.egg-info/entry_points.txt: -------------------------------------------------------------------------------- 1 | [console_scripts] 2 | matcha-data-stats = matcha.utils.generate_data_statistics:main 3 | matcha-tts = matcha.cli:cli 4 | matcha-tts-app = matcha.app:main 5 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/configs/logger/many_loggers.yaml: -------------------------------------------------------------------------------- 1 | # train with many loggers at once 2 | 3 | defaults: 4 | # - comet 5 | - csv 6 | # - mlflow 7 | # - neptune 8 | - tensorboard 9 | - wandb 10 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/configs/logger/csv.yaml: -------------------------------------------------------------------------------- 1 | # csv logger built in lightning 2 | 3 | csv: 4 | _target_: lightning.pytorch.loggers.csv_logs.CSVLogger 5 | save_dir: "${paths.output_dir}" 6 | name: "csv/" 7 | prefix: "" 8 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/configs/callbacks/rich_progress_bar.yaml: -------------------------------------------------------------------------------- 1 | # https://lightning.ai/docs/pytorch/latest/api/lightning.pytorch.callbacks.RichProgressBar.html 2 | 3 | rich_progress_bar: 4 | _target_: lightning.pytorch.callbacks.RichProgressBar 5 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/scripts/schedule.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Schedule execution of many runs 3 | # Run from root folder with: bash scripts/schedule.sh 4 | 5 | python src/train.py trainer.max_epochs=5 logger=csv 6 | 7 | python src/train.py trainer.max_epochs=10 logger=csv 8 | -------------------------------------------------------------------------------- /run_single_inference.sh: -------------------------------------------------------------------------------- 1 | python3 single_inference.py --speaker_prompt_audio_path "data/example.wav" --speaker_prompt_text_transcription "在密碼學中,加密是將明文資訊改變為難以讀取的密文內容,使之不可讀的方法。只有擁有解密方法的對象,經由解密過程,才能將密文還原為正常可讀的內容。" --content_to_synthesize "歡迎使用聯發創新基地 BreezyVoice 模型。" --output_path results/out.wav 2 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/matcha/utils/monotonic_align/setup.py: -------------------------------------------------------------------------------- 1 | # from distutils.core 
import setup 2 | # from Cython.Build import cythonize 3 | # import numpy 4 | 5 | # setup(name='monotonic_align', 6 | # ext_modules=cythonize("core.pyx"), 7 | # include_dirs=[numpy.get_include()]) 8 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/configs/extras/default.yaml: -------------------------------------------------------------------------------- 1 | # disable python warnings if they annoy you 2 | ignore_warnings: False 3 | 4 | # ask user for tags if none are provided in the config 5 | enforce_tags: True 6 | 7 | # pretty print config tree at the start of the run using Rich library 8 | print_config: True 9 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/configs/debug/overfit.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | # overfits to 3 batches 4 | 5 | defaults: 6 | - default 7 | 8 | trainer: 9 | max_epochs: 20 10 | overfit_batches: 3 11 | 12 | # model ckpt and early stopping need to be disabled during overfitting 13 | callbacks: null 14 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/configs/debug/limit.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | # uses only 1% of the training data and 5% of validation/test data 4 | 5 | defaults: 6 | - default 7 | 8 | trainer: 9 | max_epochs: 3 10 | limit_train_batches: 0.01 11 | limit_val_batches: 0.05 12 | limit_test_batches: 0.05 13 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/configs/callbacks/model_summary.yaml: -------------------------------------------------------------------------------- 1 | # https://lightning.ai/docs/pytorch/stable/api/lightning.pytorch.callbacks.RichModelSummary.html 2 | 3 | model_summary: 4 | _target_: lightning.pytorch.callbacks.RichModelSummary 5 | max_depth: 3 # the maximum depth of layer nesting that the summary will include 6 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/configs/debug/profiler.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | # runs with execution time profiling 4 | 5 | defaults: 6 | - default 7 | 8 | trainer: 9 | max_epochs: 1 10 | # profiler: "simple" 11 | profiler: "advanced" 12 | # profiler: "pytorch" 13 | accelerator: gpu 14 | 15 | limit_train_batches: 0.02 16 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/configs/logger/tensorboard.yaml: -------------------------------------------------------------------------------- 1 | # https://www.tensorflow.org/tensorboard/ 2 | 3 | tensorboard: 4 | _target_: lightning.pytorch.loggers.tensorboard.TensorBoardLogger 5 | save_dir: "${paths.output_dir}/tensorboard/" 6 | name: null 7 | log_graph: False 8 | default_hp_metric: True 9 | prefix: "" 10 | # version: "" 11 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/configs/logger/neptune.yaml: -------------------------------------------------------------------------------- 1 | # https://neptune.ai 2 | 3 | neptune: 4 | _target_: lightning.pytorch.loggers.neptune.NeptuneLogger 5 | api_key: ${oc.env:NEPTUNE_API_TOKEN} # api key is loaded from environment variable 6 | project: 
username/lightning-hydra-template 7 | # name: "" 8 | log_model_checkpoints: True 9 | prefix: "" 10 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/matcha/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from matcha.utils.instantiators import instantiate_callbacks, instantiate_loggers 2 | from matcha.utils.logging_utils import log_hyperparameters 3 | from matcha.utils.pylogger import get_pylogger 4 | from matcha.utils.rich_utils import enforce_tags, print_config_tree 5 | from matcha.utils.utils import extras, get_metric_value, task_wrapper 6 | -------------------------------------------------------------------------------- /run_batch_inference.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Default parameters 4 | CSV_FILE="data/batch_files.csv" 5 | SPEAKER_PROMPT_AUDIO_FOLDER="data" 6 | OUTPUT_AUDIO_FOLDER="results" 7 | 8 | # Run the Python script with default parameters 9 | python batch_inference.py \ 10 | --csv_file "$CSV_FILE" \ 11 | --speaker_prompt_audio_folder "$SPEAKER_PROMPT_AUDIO_FOLDER" \ 12 | --output_audio_folder "$OUTPUT_AUDIO_FOLDER" -------------------------------------------------------------------------------- /third_party/Matcha-TTS/configs/logger/mlflow.yaml: -------------------------------------------------------------------------------- 1 | # https://mlflow.org 2 | 3 | mlflow: 4 | _target_: lightning.pytorch.loggers.mlflow.MLFlowLogger 5 | # experiment_name: "" 6 | # run_name: "" 7 | tracking_uri: ${paths.log_dir}/mlflow/mlruns # run `mlflow ui` command inside the `logs/mlflow/` dir to open the UI 8 | tags: null 9 | # save_dir: "./mlruns" 10 | prefix: "" 11 | artifact_location: null 12 | # run_id: "" 13 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/configs/model/matcha.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - _self_ 3 | - encoder: default.yaml 4 | - decoder: default.yaml 5 | - cfm: default.yaml 6 | - optimizer: adam.yaml 7 | 8 | _target_: matcha.models.matcha_tts.MatchaTTS 9 | n_vocab: 178 10 | n_spks: ${data.n_spks} 11 | spk_emb_dim: 64 12 | n_feats: 80 13 | data_statistics: ${data.data_statistics} 14 | out_size: null # Must be divisible by 4 15 | prior_loss: true 16 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.md 2 | include LICENSE.txt 3 | include requirements.*.txt 4 | include *.cff 5 | include requirements.txt 6 | include matcha/VERSION 7 | recursive-include matcha *.json 8 | recursive-include matcha *.html 9 | recursive-include matcha *.png 10 | recursive-include matcha *.md 11 | recursive-include matcha *.py 12 | recursive-include matcha *.pyx 13 | recursive-exclude tests * 14 | prune tests* 15 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/configs/experiment/ljspeech.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | # to execute this experiment run: 4 | # python train.py experiment=multispeaker 5 | 6 | defaults: 7 | - override /data: ljspeech.yaml 8 | 9 | # all parameters below will be merged with parameters from default configurations set above 10 | # this allows 
you to overwrite only specified parameters 11 | 12 | tags: ["ljspeech"] 13 | 14 | run_name: ljspeech 15 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/configs/eval.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | defaults: 4 | - _self_ 5 | - data: mnist # choose datamodule with `test_dataloader()` for evaluation 6 | - model: mnist 7 | - logger: null 8 | - trainer: default 9 | - paths: default 10 | - extras: default 11 | - hydra: default 12 | 13 | task_name: "eval" 14 | 15 | tags: ["dev"] 16 | 17 | # passing checkpoint path is necessary for evaluation 18 | ckpt_path: ??? 19 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/configs/experiment/multispeaker.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | # to execute this experiment run: 4 | # python train.py experiment=multispeaker 5 | 6 | defaults: 7 | - override /data: vctk.yaml 8 | 9 | # all parameters below will be merged with parameters from default configurations set above 10 | # this allows you to overwrite only specified parameters 11 | 12 | tags: ["multispeaker"] 13 | 14 | run_name: multispeaker 15 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/configs/logger/comet.yaml: -------------------------------------------------------------------------------- 1 | # https://www.comet.ml 2 | 3 | comet: 4 | _target_: lightning.pytorch.loggers.comet.CometLogger 5 | api_key: ${oc.env:COMET_API_TOKEN} # api key is loaded from environment variable 6 | save_dir: "${paths.output_dir}" 7 | project_name: "lightning-hydra-template" 8 | rest_api_key: null 9 | # experiment_name: "" 10 | experiment_key: null # set to resume experiment 11 | offline: False 12 | prefix: "" 13 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/configs/data/vctk.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - ljspeech 3 | - _self_ 4 | 5 | _target_: matcha.data.text_mel_datamodule.TextMelDataModule 6 | name: vctk 7 | train_filelist_path: data/filelists/vctk_audio_sid_text_train_filelist.txt 8 | valid_filelist_path: data/filelists/vctk_audio_sid_text_val_filelist.txt 9 | batch_size: 32 10 | add_blank: True 11 | n_spks: 109 12 | data_statistics: # Computed for vctk dataset 13 | mel_mean: -6.630575 14 | mel_std: 2.482914 15 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/configs/experiment/ljspeech_min_memory.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | # to execute this experiment run: 4 | # python train.py experiment=multispeaker 5 | 6 | defaults: 7 | - override /data: ljspeech.yaml 8 | 9 | # all parameters below will be merged with parameters from default configurations set above 10 | # this allows you to overwrite only specified parameters 11 | 12 | tags: ["ljspeech"] 13 | 14 | run_name: ljspeech_min 15 | 16 | 17 | model: 18 | out_size: 172 19 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/matcha/hifigan/env.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/jik876/hifi-gan """ 2 | 3 | import os 4 | 
import shutil 5 | 6 | 7 | class AttrDict(dict): 8 | def __init__(self, *args, **kwargs): 9 | super().__init__(*args, **kwargs) 10 | self.__dict__ = self 11 | 12 | 13 | def build_env(config, config_name, path): 14 | t_path = os.path.join(path, config_name) 15 | if config != t_path: 16 | os.makedirs(path, exist_ok=True) 17 | shutil.copyfile(config, os.path.join(path, config_name)) 18 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/configs/model/encoder/default.yaml: -------------------------------------------------------------------------------- 1 | encoder_type: RoPE Encoder 2 | encoder_params: 3 | n_feats: ${model.n_feats} 4 | n_channels: 192 5 | filter_channels: 768 6 | filter_channels_dp: 256 7 | n_heads: 2 8 | n_layers: 6 9 | kernel_size: 3 10 | p_dropout: 0.1 11 | spk_emb_dim: 64 12 | n_spks: 1 13 | prenet: true 14 | 15 | duration_predictor_params: 16 | filter_channels_dp: ${model.encoder.encoder_params.filter_channels_dp} 17 | kernel_size: 3 18 | p_dropout: ${model.encoder.encoder_params.p_dropout} 19 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/configs/experiment/hifi_dataset_piper_phonemizer.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | # to execute this experiment run: 4 | # python train.py experiment=multispeaker 5 | 6 | defaults: 7 | - override /data: hi-fi_en-US_female.yaml 8 | 9 | # all parameters below will be merged with parameters from default configurations set above 10 | # this allows you to overwrite only specified parameters 11 | 12 | tags: ["hi-fi", "single_speaker", "piper_phonemizer", "en_US", "female"] 13 | 14 | run_name: hi-fi_en-US_female_piper_phonemizer 15 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/configs/trainer/default.yaml: -------------------------------------------------------------------------------- 1 | _target_: lightning.pytorch.trainer.Trainer 2 | 3 | default_root_dir: ${paths.output_dir} 4 | 5 | max_epochs: -1 6 | 7 | accelerator: gpu 8 | devices: [0] 9 | 10 | # mixed precision for extra speed-up 11 | precision: 16-mixed 12 | 13 | # perform a validation loop every N training epochs 14 | check_val_every_n_epoch: 1 15 | 16 | # set True to ensure deterministic results 17 | # makes training slower but gives more reproducibility than just setting seeds 18 | deterministic: False 19 | 20 | gradient_clip_val: 5.0 21 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/matcha_tts.egg-info/requires.txt: -------------------------------------------------------------------------------- 1 | torch>=2.0.0 2 | torchvision>=0.15.0 3 | lightning>=2.0.0 4 | torchmetrics>=0.11.4 5 | hydra-core==1.3.2 6 | hydra-colorlog==1.2.0 7 | hydra-optuna-sweeper==1.2.0 8 | rootutils 9 | pre-commit 10 | rich 11 | pytest 12 | phonemizer 13 | tensorboard 14 | librosa 15 | Cython 16 | numpy 17 | einops 18 | inflect 19 | Unidecode 20 | scipy 21 | torchaudio 22 | matplotlib 23 | pandas 24 | conformer==0.3.2 25 | diffusers==0.25.0 26 | notebook 27 | ipywidgets 28 | gradio==3.43.2 29 | gdown 30 | wget 31 | seaborn 32 | piper_phonemize 33 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/configs/data/hi-fi_en-US_female.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - 
ljspeech 3 | - _self_ 4 | 5 | # Dataset URL: https://ast-astrec.nict.go.jp/en/release/hi-fi-captain/ 6 | _target_: matcha.data.text_mel_datamodule.TextMelDataModule 7 | name: hi-fi_en-US_female 8 | train_filelist_path: data/filelists/hi-fi-captain-en-us-female_train.txt 9 | valid_filelist_path: data/filelists/hi-fi-captain-en-us-female_val.txt 10 | batch_size: 32 11 | cleaners: [english_cleaners_piper] 12 | data_statistics: # Computed for this dataset 13 | mel_mean: -6.38385 14 | mel_std: 2.541796 15 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/matcha/text/symbols.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/keithito/tacotron 2 | 3 | Defines the set of symbols used in text input to the model. 4 | """ 5 | _pad = "_" 6 | _punctuation = ';:,.!?¡¿—…"«»“” ' 7 | _letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz" 8 | _letters_ipa = ( 9 | "ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ" 10 | ) 11 | 12 | 13 | # Export all symbols: 14 | symbols = [_pad] + list(_punctuation) + list(_letters) + list(_letters_ipa) 15 | 16 | # Special symbol ids 17 | SPACE_ID = symbols.index(" ") 18 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/configs/logger/wandb.yaml: -------------------------------------------------------------------------------- 1 | # https://wandb.ai 2 | 3 | wandb: 4 | _target_: lightning.pytorch.loggers.wandb.WandbLogger 5 | # name: "" # name of the run (normally generated by wandb) 6 | save_dir: "${paths.output_dir}" 7 | offline: False 8 | id: null # pass correct id to resume experiment! 
9 | anonymous: null # enable anonymous logging 10 | project: "lightning-hydra-template" 11 | log_model: False # upload lightning ckpts 12 | prefix: "" # a string to put at the beginning of metric keys 13 | # entity: "" # set to name of your wandb team 14 | group: "" 15 | tags: [] 16 | job_type: "" 17 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/configs/data/ljspeech.yaml: -------------------------------------------------------------------------------- 1 | _target_: matcha.data.text_mel_datamodule.TextMelDataModule 2 | name: ljspeech 3 | train_filelist_path: data/filelists/ljs_audio_text_train_filelist.txt 4 | valid_filelist_path: data/filelists/ljs_audio_text_val_filelist.txt 5 | batch_size: 32 6 | num_workers: 20 7 | pin_memory: True 8 | cleaners: [english_cleaners2] 9 | add_blank: True 10 | n_spks: 1 11 | n_fft: 1024 12 | n_feats: 80 13 | sample_rate: 22050 14 | hop_length: 256 15 | win_length: 1024 16 | f_min: 0 17 | f_max: 8000 18 | data_statistics: # Computed for ljspeech dataset 19 | mel_mean: -5.536622 20 | mel_std: 2.116101 21 | seed: ${seed} 22 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/configs/hydra/default.yaml: -------------------------------------------------------------------------------- 1 | # https://hydra.cc/docs/configure_hydra/intro/ 2 | 3 | # enable color logging 4 | defaults: 5 | - override hydra_logging: colorlog 6 | - override job_logging: colorlog 7 | 8 | # output directory, generated dynamically on each run 9 | run: 10 | dir: ${paths.log_dir}/${task_name}/${run_name}/runs/${now:%Y-%m-%d}_${now:%H-%M-%S} 11 | sweep: 12 | dir: ${paths.log_dir}/${task_name}/${run_name}/multiruns/${now:%Y-%m-%d}_${now:%H-%M-%S} 13 | subdir: ${hydra.job.num} 14 | 15 | job_logging: 16 | handlers: 17 | file: 18 | # Incorporates fix from https://github.com/facebookresearch/hydra/pull/2242 19 | filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log 20 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/configs/paths/default.yaml: -------------------------------------------------------------------------------- 1 | # path to root directory 2 | # this requires PROJECT_ROOT environment variable to exist 3 | # you can replace it with "." if you want the root to be the current working directory 4 | root_dir: ${oc.env:PROJECT_ROOT} 5 | 6 | # path to data directory 7 | data_dir: ${paths.root_dir}/data/ 8 | 9 | # path to logging directory 10 | log_dir: ${paths.root_dir}/logs/ 11 | 12 | # path to output directory, created dynamically by hydra 13 | # path generation pattern is specified in `configs/hydra/default.yaml` 14 | # use it to store all files generated during the run, like ckpts and metrics 15 | output_dir: ${hydra:runtime.output_dir} 16 | 17 | # path to working directory 18 | work_dir: ${hydra:runtime.cwd} 19 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/matcha/utils/monotonic_align/__init__.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | 4 | from matcha.utils.monotonic_align.core import maximum_path_c 5 | 6 | 7 | def maximum_path(value, mask): 8 | """Cython optimised version. 
9 | value: [b, t_x, t_y] 10 | mask: [b, t_x, t_y] 11 | """ 12 | value = value * mask 13 | device = value.device 14 | dtype = value.dtype 15 | value = value.data.cpu().numpy().astype(np.float32) 16 | path = np.zeros_like(value).astype(np.int32) 17 | mask = mask.data.cpu().numpy() 18 | 19 | t_x_max = mask.sum(1)[:, 0].astype(np.int32) 20 | t_y_max = mask.sum(2)[:, 0].astype(np.int32) 21 | maximum_path_c(path, value, t_x_max, t_y_max) 22 | return torch.from_numpy(path).to(device=device, dtype=dtype) 23 | -------------------------------------------------------------------------------- /data/batch_files.csv: -------------------------------------------------------------------------------- 1 | speaker_prompt_audio_filename,speaker,speaker_prompt_text_transcription,content_to_synthesize,output_audio_filename 2 | example,女,在密碼學中,加密是將明文資訊改變為難以讀取的密文內容,使之不可讀的方法。只有擁有解密方法的對象,經由解密過程,才能將密文還原為正常可讀的內容。,信義快速道路原名國道三號臺北聯絡線信義支線,最初是按照國道等級的標準規劃為高速公路支線。信義快速道路的開通有效改善了臺北市東南部與新北市之間的交通。由於山脈阻隔,過去臺北市與木柵、景美,以及新北市深坑、新店等地之間的交通需要繞道,增加了通勤時間,也加重其他地區的交通負荷。該道路不僅連接信義區與高速公路系統,也緩解了上述地區間的交通壓力,縮短通勤時間。道路開通後,多家大臺北地區的公車業者調整路線,利用信義快速道路縮短跨區通勤時間。由於原本設計採用國道標準,信義快速道路全線為雙向各三線道設計,但速限低於國道,設定為每小時40至70公里。內側車道原本規劃為信義輕軌的專用道,目前改為公車與計程車專用車道,一般小客車禁止行駛,是臺灣首見的道路規劃方式。2007年11月1日起,配合開放大型重型機車行駛部分高架橋與快速道路的政策,信義快速道路也允許大型重型機車行駛。,out-wiki 3 | example,女,在密碼學中,加密是將明文資訊改變為難以讀取的密文內容,使之不可讀的方法。只有擁有解密方法的對象,經由解密過程,才能將密文還原為正常可讀的內容。,歡迎使用聯發創新基地 BreezyVoice 模型。,out-BreezyVoice 4 | example,女,在密碼學中,加密是將明文資訊改變為難以讀取的密文內容,使之不可讀的方法。只有擁有解密方法的對象,經由解密過程,才能將密文還原為正常可讀的內容。,今天天氣真好,out-weather 5 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/matcha/utils/pylogger.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from lightning.pytorch.utilities import rank_zero_only 4 | 5 | 6 | def get_pylogger(name: str = __name__) -> logging.Logger: 7 | """Initializes a multi-GPU-friendly python command line logger. 8 | 9 | :param name: The name of the logger, defaults to ``__name__``. 10 | 11 | :return: A logger object. 
12 | """ 13 | logger = logging.getLogger(name) 14 | 15 | # this ensures all logging levels get marked with the rank zero decorator 16 | # otherwise logs would get multiplied for each GPU process in multi-GPU setup 17 | logging_levels = ("debug", "info", "warning", "error", "exception", "fatal", "critical") 18 | for level in logging_levels: 19 | setattr(logger, level, rank_zero_only(getattr(logger, level))) 20 | 21 | return logger 22 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/matcha/hifigan/config.py: -------------------------------------------------------------------------------- 1 | v1 = { 2 | "resblock": "1", 3 | "num_gpus": 0, 4 | "batch_size": 16, 5 | "learning_rate": 0.0004, 6 | "adam_b1": 0.8, 7 | "adam_b2": 0.99, 8 | "lr_decay": 0.999, 9 | "seed": 1234, 10 | "upsample_rates": [8, 8, 2, 2], 11 | "upsample_kernel_sizes": [16, 16, 4, 4], 12 | "upsample_initial_channel": 512, 13 | "resblock_kernel_sizes": [3, 7, 11], 14 | "resblock_dilation_sizes": [[1, 3, 5], [1, 3, 5], [1, 3, 5]], 15 | "resblock_initial_channel": 256, 16 | "segment_size": 8192, 17 | "num_mels": 80, 18 | "num_freq": 1025, 19 | "n_fft": 1024, 20 | "hop_size": 256, 21 | "win_size": 1024, 22 | "sampling_rate": 22050, 23 | "fmin": 0, 24 | "fmax": 8000, 25 | "fmax_loss": None, 26 | "num_workers": 4, 27 | "dist_config": {"dist_backend": "nccl", "dist_url": "tcp://localhost:54321", "world_size": 1}, 28 | } 29 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | --extra-index-url https://download.pytorch.org/whl/cu118 2 | conformer==0.3.2 3 | deepspeed==0.14.2; sys_platform == 'linux' 4 | diffusers==0.32.0 5 | gdown==5.1.0 6 | gradio==4.32.2 7 | grpcio==1.57.0 8 | grpcio-tools==1.57.0 9 | hydra-core==1.3.2 10 | HyperPyYAML==1.2.2 11 | inflect==7.3.1 12 | librosa==0.10.2 13 | lightning==2.2.4 14 | matplotlib==3.7.5 15 | networkx==3.1 16 | omegaconf==2.3.0 17 | onnxruntime-gpu==1.16.0; sys_platform == 'linux' 18 | openai-whisper==20231117 19 | protobuf==4.25 20 | pydantic==2.7.0 21 | rich==13.7.1 22 | soundfile==0.12.1 23 | tensorboard==2.14.0 24 | torch==2.3.1 25 | torchaudio==2.3.1 26 | wget==3.2 27 | fastapi==0.111.0 28 | fastapi-cli==0.0.4 29 | WeTextProcessing==1.0.3 30 | opencc-python-reimplemented 31 | g2pw 32 | pyarrow 33 | datasets 34 | 35 | https://www.modelscope.cn/models/speech_tts/speech_kantts_ttsfrd/resolve/master/ttsfrd_dependency-0.1-py3-none-any.whl 36 | https://www.modelscope.cn/models/speech_tts/speech_kantts_ttsfrd/resolve/master/ttsfrd-0.3.9-cp310-cp310-linux_x86_64.whl 37 | 38 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/configs/debug/default.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | # default debugging setup, runs 1 full epoch 4 | # other debugging configs can inherit from this one 5 | 6 | # overwrite task name so debugging logs are stored in separate folder 7 | task_name: "debug" 8 | 9 | # disable callbacks and loggers during debugging 10 | # callbacks: null 11 | # logger: null 12 | 13 | extras: 14 | ignore_warnings: False 15 | enforce_tags: False 16 | 17 | # sets level of all command line loggers to 'DEBUG' 18 | # https://hydra.cc/docs/tutorials/basic/running_your_app/logging/ 19 | hydra: 20 | job_logging: 21 | root: 22 | level: DEBUG 23 | 24 | # use this to also set hydra 
loggers to 'DEBUG' 25 | # verbose: True 26 | 27 | trainer: 28 | max_epochs: 1 29 | accelerator: cpu # debuggers don't like gpus 30 | devices: 1 # debuggers don't like multiprocessing 31 | detect_anomaly: true # raise exception if NaN or +/-inf is detected in any tensor 32 | 33 | data: 34 | num_workers: 0 # debuggers don't like multiprocessing 35 | pin_memory: False # disable gpu memory pin 36 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/requirements.txt: -------------------------------------------------------------------------------- 1 | # --------- pytorch --------- # 2 | torch>=2.0.0 3 | torchvision>=0.15.0 4 | lightning>=2.0.0 5 | torchmetrics>=0.11.4 6 | 7 | # --------- hydra --------- # 8 | hydra-core==1.3.2 9 | hydra-colorlog==1.2.0 10 | hydra-optuna-sweeper==1.2.0 11 | 12 | # --------- loggers --------- # 13 | # wandb 14 | # neptune-client 15 | # mlflow 16 | # comet-ml 17 | # aim>=3.16.2 # no lower than 3.16.2, see https://github.com/aimhubio/aim/issues/2550 18 | 19 | # --------- others --------- # 20 | rootutils # standardizing the project root setup 21 | pre-commit # hooks for applying linters on commit 22 | rich # beautiful text formatting in terminal 23 | pytest # tests 24 | # sh # for running bash commands in some tests (linux/macos only) 25 | phonemizer # phonemization of text 26 | tensorboard 27 | librosa 28 | Cython 29 | numpy 30 | einops 31 | inflect 32 | Unidecode 33 | scipy 34 | torchaudio 35 | matplotlib 36 | pandas 37 | conformer==0.3.2 38 | diffusers==0.25.0 39 | notebook 40 | ipywidgets 41 | gradio==3.43.2 42 | gdown 43 | wget 44 | seaborn 45 | piper_phonemize 46 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Shivam Mehta 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/matcha/hifigan/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Jungil Kong 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools", "wheel", "cython==0.29.35", "numpy==1.24.3", "packaging"] 3 | 4 | [tool.black] 5 | line-length = 120 6 | target-version = ['py310'] 7 | exclude = ''' 8 | 9 | ( 10 | /( 11 | \.eggs # exclude a few common directories in the 12 | | \.git # root of the project 13 | | \.hg 14 | | \.mypy_cache 15 | | \.tox 16 | | \.venv 17 | | _build 18 | | buck-out 19 | | build 20 | | dist 21 | )/ 22 | | foo.py # also separately exclude a file named foo.py in 23 | # the root of the project 24 | ) 25 | ''' 26 | 27 | [tool.pytest.ini_options] 28 | addopts = [ 29 | "--color=yes", 30 | "--durations=0", 31 | "--strict-markers", 32 | "--doctest-modules", 33 | ] 34 | filterwarnings = [ 35 | "ignore::DeprecationWarning", 36 | "ignore::UserWarning", 37 | ] 38 | log_cli = "True" 39 | markers = [ 40 | "slow: slow tests", 41 | ] 42 | minversion = "6.0" 43 | testpaths = "tests/" 44 | 45 | [tool.coverage.report] 46 | exclude_lines = [ 47 | "pragma: nocover", 48 | "raise NotImplementedError", 49 | "raise NotImplementedError()", 50 | "if __name__ == .__main__.:", 51 | ] 52 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/configs/callbacks/model_checkpoint.yaml: -------------------------------------------------------------------------------- 1 | # https://lightning.ai/docs/pytorch/stable/api/lightning.pytorch.callbacks.ModelCheckpoint.html 2 | 3 | model_checkpoint: 4 | _target_: lightning.pytorch.callbacks.ModelCheckpoint 5 | dirpath: ${paths.output_dir}/checkpoints # directory to save the model file 6 | filename: checkpoint_{epoch:03d} # checkpoint filename 7 | monitor: epoch # name of the logged metric which determines when model is improving 8 | verbose: False # verbosity mode 9 | save_last: true # additionally always save an exact copy of the last checkpoint to a file last.ckpt 10 | save_top_k: 10 # save k best models (determined by above metric) 11 | mode: "max" # 
"max" means higher metric value is better, can be also "min" 12 | auto_insert_metric_name: True # when True, the checkpoints filenames will contain the metric name 13 | save_weights_only: False # if True, then only the model’s weights will be saved 14 | every_n_train_steps: null # number of training steps between checkpoints 15 | train_time_interval: null # checkpoints are monitored at the specified time interval 16 | every_n_epochs: 100 # number of epochs between checkpoints 17 | save_on_train_epoch_end: null # whether to run checkpointing at the end of the training epoch or the end of validation 18 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/Makefile: -------------------------------------------------------------------------------- 1 | 2 | help: ## Show help 3 | @grep -E '^[.a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-30s\033[0m %s\n", $$1, $$2}' 4 | 5 | clean: ## Clean autogenerated files 6 | rm -rf dist 7 | find . -type f -name "*.DS_Store" -ls -delete 8 | find . | grep -E "(__pycache__|\.pyc|\.pyo)" | xargs rm -rf 9 | find . | grep -E ".pytest_cache" | xargs rm -rf 10 | find . | grep -E ".ipynb_checkpoints" | xargs rm -rf 11 | rm -f .coverage 12 | 13 | clean-logs: ## Clean logs 14 | rm -rf logs/** 15 | 16 | create-package: ## Create wheel and tar gz 17 | rm -rf dist/ 18 | python setup.py bdist_wheel --plat-name=manylinux1_x86_64 19 | python setup.py sdist 20 | python -m twine upload dist/* --verbose --skip-existing 21 | 22 | format: ## Run pre-commit hooks 23 | pre-commit run -a 24 | 25 | sync: ## Merge changes from main branch to your current branch 26 | git pull 27 | git pull origin main 28 | 29 | test: ## Run not slow tests 30 | pytest -k "not slow" 31 | 32 | test-full: ## Run all tests 33 | pytest 34 | 35 | train-ljspeech: ## Train the model 36 | python matcha/train.py experiment=ljspeech 37 | 38 | train-ljspeech-min: ## Train the model with minimum memory 39 | python matcha/train.py experiment=ljspeech_min_memory 40 | 41 | start_app: ## Start the app 42 | python matcha/app.py 43 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/configs/logger/aim.yaml: -------------------------------------------------------------------------------- 1 | # https://aimstack.io/ 2 | 3 | # example usage in lightning module: 4 | # https://github.com/aimhubio/aim/blob/main/examples/pytorch_lightning_track.py 5 | 6 | # open the Aim UI with the following command (run in the folder containing the `.aim` folder): 7 | # `aim up` 8 | 9 | aim: 10 | _target_: aim.pytorch_lightning.AimLogger 11 | repo: ${paths.root_dir} # .aim folder will be created here 12 | # repo: "aim://ip_address:port" # can instead provide IP address pointing to Aim remote tracking server which manages the repo, see https://aimstack.readthedocs.io/en/latest/using/remote_tracking.html# 13 | 14 | # aim allows to group runs under experiment name 15 | experiment: null # any string, set to "default" if not specified 16 | 17 | train_metric_prefix: "train/" 18 | val_metric_prefix: "val/" 19 | test_metric_prefix: "test/" 20 | 21 | # sets the tracking interval in seconds for system usage metrics (CPU, GPU, memory, etc.) 22 | system_tracking_interval: 10 # set to null to disable system metrics tracking 23 | 24 | # enable/disable logging of system params such as installed packages, git info, env vars, etc. 
25 | log_system_params: true 26 | 27 | # enable/disable tracking console logs (default value is true) 28 | capture_terminal_logs: false # set to false to avoid infinite console log loop issue https://github.com/aimhubio/aim/issues/2550 29 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/matcha/utils/monotonic_align/core.pyx: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | cimport cython 4 | cimport numpy as np 5 | 6 | from cython.parallel import prange 7 | 8 | 9 | @cython.boundscheck(False) 10 | @cython.wraparound(False) 11 | cdef void maximum_path_each(int[:,::1] path, float[:,::1] value, int t_x, int t_y, float max_neg_val) nogil: 12 | cdef int x 13 | cdef int y 14 | cdef float v_prev 15 | cdef float v_cur 16 | cdef float tmp 17 | cdef int index = t_x - 1 18 | 19 | for y in range(t_y): 20 | for x in range(max(0, t_x + y - t_y), min(t_x, y + 1)): 21 | if x == y: 22 | v_cur = max_neg_val 23 | else: 24 | v_cur = value[x, y-1] 25 | if x == 0: 26 | if y == 0: 27 | v_prev = 0. 28 | else: 29 | v_prev = max_neg_val 30 | else: 31 | v_prev = value[x-1, y-1] 32 | value[x, y] = max(v_cur, v_prev) + value[x, y] 33 | 34 | for y in range(t_y - 1, -1, -1): 35 | path[index, y] = 1 36 | if index != 0 and (index == y or value[index, y-1] < value[index-1, y-1]): 37 | index = index - 1 38 | 39 | 40 | @cython.boundscheck(False) 41 | @cython.wraparound(False) 42 | cpdef void maximum_path_c(int[:,:,::1] paths, float[:,:,::1] values, int[::1] t_xs, int[::1] t_ys, float max_neg_val=-1e9) nogil: 43 | cdef int b = values.shape[0] 44 | 45 | cdef int i 46 | for i in prange(b, nogil=True): 47 | maximum_path_each(paths[i], values[i], t_xs[i], t_ys[i], max_neg_val) 48 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/matcha/hifigan/xutils.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/jik876/hifi-gan """ 2 | 3 | import glob 4 | import os 5 | 6 | import matplotlib 7 | import torch 8 | from torch.nn.utils import weight_norm 9 | 10 | matplotlib.use("Agg") 11 | import matplotlib.pylab as plt 12 | 13 | 14 | def plot_spectrogram(spectrogram): 15 | fig, ax = plt.subplots(figsize=(10, 2)) 16 | im = ax.imshow(spectrogram, aspect="auto", origin="lower", interpolation="none") 17 | plt.colorbar(im, ax=ax) 18 | 19 | fig.canvas.draw() 20 | plt.close() 21 | 22 | return fig 23 | 24 | 25 | def init_weights(m, mean=0.0, std=0.01): 26 | classname = m.__class__.__name__ 27 | if classname.find("Conv") != -1: 28 | m.weight.data.normal_(mean, std) 29 | 30 | 31 | def apply_weight_norm(m): 32 | classname = m.__class__.__name__ 33 | if classname.find("Conv") != -1: 34 | weight_norm(m) 35 | 36 | 37 | def get_padding(kernel_size, dilation=1): 38 | return int((kernel_size * dilation - dilation) / 2) 39 | 40 | 41 | def load_checkpoint(filepath, device): 42 | assert os.path.isfile(filepath) 43 | print(f"Loading '{filepath}'") 44 | checkpoint_dict = torch.load(filepath, map_location=device) 45 | print("Complete.") 46 | return checkpoint_dict 47 | 48 | 49 | def save_checkpoint(filepath, obj): 50 | print(f"Saving checkpoint to {filepath}") 51 | torch.save(obj, filepath) 52 | print("Complete.") 53 | 54 | 55 | def scan_checkpoint(cp_dir, prefix): 56 | pattern = os.path.join(cp_dir, prefix + "????????") 57 | cp_list = glob.glob(pattern) 58 | if len(cp_list) == 0: 59 | return None 60 | return sorted(cp_list)[-1] 
61 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import os 3 | 4 | import numpy 5 | from Cython.Build import cythonize 6 | from setuptools import Extension, find_packages, setup 7 | 8 | exts = [ 9 | Extension( 10 | name="matcha.utils.monotonic_align.core", 11 | sources=["matcha/utils/monotonic_align/core.pyx"], 12 | ) 13 | ] 14 | 15 | with open("README.md", encoding="utf-8") as readme_file: 16 | README = readme_file.read() 17 | 18 | cwd = os.path.dirname(os.path.abspath(__file__)) 19 | with open(os.path.join(cwd, "matcha", "VERSION")) as fin: 20 | version = fin.read().strip() 21 | 22 | setup( 23 | name="matcha-tts", 24 | version=version, 25 | description="🍵 Matcha-TTS: A fast TTS architecture with conditional flow matching", 26 | long_description=README, 27 | long_description_content_type="text/markdown", 28 | author="Shivam Mehta", 29 | author_email="shivam.mehta25@gmail.com", 30 | url="https://shivammehta25.github.io/Matcha-TTS", 31 | install_requires=[str(r) for r in open(os.path.join(os.path.dirname(__file__), "requirements.txt"))], 32 | include_dirs=[numpy.get_include()], 33 | include_package_data=True, 34 | packages=find_packages(exclude=["tests", "tests/*", "examples", "examples/*"]), 35 | # use this to customize global commands available in the terminal after installing the package 36 | entry_points={ 37 | "console_scripts": [ 38 | "matcha-data-stats=matcha.utils.generate_data_statistics:main", 39 | "matcha-tts=matcha.cli:cli", 40 | "matcha-tts-app=matcha.app:main", 41 | ] 42 | }, 43 | ext_modules=cythonize(exts, language_level=3), 44 | python_requires=">=3.9.0", 45 | ) 46 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/matcha_tts.egg-info/SOURCES.txt: -------------------------------------------------------------------------------- 1 | LICENSE 2 | MANIFEST.in 3 | README.md 4 | pyproject.toml 5 | requirements.txt 6 | setup.py 7 | configs/__init__.py 8 | matcha/VERSION 9 | matcha/__init__.py 10 | matcha/app.py 11 | matcha/cli.py 12 | matcha/train.py 13 | matcha/data/__init__.py 14 | matcha/data/text_mel_datamodule.py 15 | matcha/data/components/__init__.py 16 | matcha/hifigan/README.md 17 | matcha/hifigan/__init__.py 18 | matcha/hifigan/config.py 19 | matcha/hifigan/denoiser.py 20 | matcha/hifigan/env.py 21 | matcha/hifigan/meldataset.py 22 | matcha/hifigan/models.py 23 | matcha/hifigan/xutils.py 24 | matcha/models/__init__.py 25 | matcha/models/baselightningmodule.py 26 | matcha/models/matcha_tts.py 27 | matcha/models/components/__init__.py 28 | matcha/models/components/decoder.py 29 | matcha/models/components/flow_matching.py 30 | matcha/models/components/text_encoder.py 31 | matcha/models/components/transformer.py 32 | matcha/onnx/__init__.py 33 | matcha/onnx/export.py 34 | matcha/onnx/infer.py 35 | matcha/text/__init__.py 36 | matcha/text/cleaners.py 37 | matcha/text/numbers.py 38 | matcha/text/symbols.py 39 | matcha/utils/__init__.py 40 | matcha/utils/audio.py 41 | matcha/utils/generate_data_statistics.py 42 | matcha/utils/instantiators.py 43 | matcha/utils/logging_utils.py 44 | matcha/utils/model.py 45 | matcha/utils/pylogger.py 46 | matcha/utils/rich_utils.py 47 | matcha/utils/utils.py 48 | matcha/utils/monotonic_align/__init__.py 49 | matcha/utils/monotonic_align/core.c 50 | matcha/utils/monotonic_align/core.pyx 51 | 
matcha/utils/monotonic_align/setup.py 52 | matcha_tts.egg-info/PKG-INFO 53 | matcha_tts.egg-info/SOURCES.txt 54 | matcha_tts.egg-info/dependency_links.txt 55 | matcha_tts.egg-info/entry_points.txt 56 | matcha_tts.egg-info/requires.txt 57 | matcha_tts.egg-info/top_level.txt -------------------------------------------------------------------------------- /third_party/Matcha-TTS/configs/train.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | # specify here default configuration 4 | # order of defaults determines the order in which configs override each other 5 | defaults: 6 | - _self_ 7 | - data: ljspeech 8 | - model: matcha 9 | - callbacks: default 10 | - logger: tensorboard # set logger here or use command line (e.g. `python train.py logger=tensorboard`) 11 | - trainer: default 12 | - paths: default 13 | - extras: default 14 | - hydra: default 15 | 16 | # experiment configs allow for version control of specific hyperparameters 17 | # e.g. best hyperparameters for given model and datamodule 18 | - experiment: null 19 | 20 | # config for hyperparameter optimization 21 | - hparams_search: null 22 | 23 | # optional local config for machine/user specific settings 24 | # it's optional since it doesn't need to exist and is excluded from version control 25 | - optional local: default 26 | 27 | # debugging config (enable through command line, e.g. `python train.py debug=default) 28 | - debug: null 29 | 30 | # task name, determines output directory path 31 | task_name: "train" 32 | 33 | run_name: ??? 34 | 35 | # tags to help you identify your experiments 36 | # you can overwrite this in experiment configs 37 | # overwrite from command line with `python train.py tags="[first_tag, second_tag]"` 38 | tags: ["dev"] 39 | 40 | # set False to skip model training 41 | train: True 42 | 43 | # evaluate on test set, using best model weights achieved during training 44 | # lightning chooses best weights based on the metric specified in checkpoint callback 45 | test: True 46 | 47 | # simply provide checkpoint path to resume training 48 | ckpt_path: null 49 | 50 | # seed for random number generators in pytorch, numpy and python.random 51 | seed: 1234 52 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/matcha/text/__init__.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/keithito/tacotron """ 2 | from matcha.text import cleaners 3 | from matcha.text.symbols import symbols 4 | 5 | # Mappings from symbol to numeric ID and vice versa: 6 | _symbol_to_id = {s: i for i, s in enumerate(symbols)} 7 | _id_to_symbol = {i: s for i, s in enumerate(symbols)} # pylint: disable=unnecessary-comprehension 8 | 9 | 10 | def text_to_sequence(text, cleaner_names): 11 | """Converts a string of text to a sequence of IDs corresponding to the symbols in the text. 12 | Args: 13 | text: string to convert to a sequence 14 | cleaner_names: names of the cleaner functions to run the text through 15 | Returns: 16 | List of integers corresponding to the symbols in the text 17 | """ 18 | sequence = [] 19 | 20 | clean_text = _clean_text(text, cleaner_names) 21 | for symbol in clean_text: 22 | symbol_id = _symbol_to_id[symbol] 23 | sequence += [symbol_id] 24 | return sequence 25 | 26 | 27 | def cleaned_text_to_sequence(cleaned_text): 28 | """Converts a string of text to a sequence of IDs corresponding to the symbols in the text. 
29 | Args: 30 | text: string to convert to a sequence 31 | Returns: 32 | List of integers corresponding to the symbols in the text 33 | """ 34 | sequence = [_symbol_to_id[symbol] for symbol in cleaned_text] 35 | return sequence 36 | 37 | 38 | def sequence_to_text(sequence): 39 | """Converts a sequence of IDs back to a string""" 40 | result = "" 41 | for symbol_id in sequence: 42 | s = _id_to_symbol[symbol_id] 43 | result += s 44 | return result 45 | 46 | 47 | def _clean_text(text, cleaner_names): 48 | for name in cleaner_names: 49 | cleaner = getattr(cleaners, name) 50 | if not cleaner: 51 | raise Exception("Unknown cleaner: %s" % name) 52 | text = cleaner(text) 53 | return text 54 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/matcha/utils/logging_utils.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Dict 2 | 3 | from lightning.pytorch.utilities import rank_zero_only 4 | from omegaconf import OmegaConf 5 | 6 | from matcha.utils import pylogger 7 | 8 | log = pylogger.get_pylogger(__name__) 9 | 10 | 11 | @rank_zero_only 12 | def log_hyperparameters(object_dict: Dict[str, Any]) -> None: 13 | """Controls which config parts are saved by Lightning loggers. 14 | 15 | Additionally saves: 16 | - Number of model parameters 17 | 18 | :param object_dict: A dictionary containing the following objects: 19 | - `"cfg"`: A DictConfig object containing the main config. 20 | - `"model"`: The Lightning model. 21 | - `"trainer"`: The Lightning trainer. 22 | """ 23 | hparams = {} 24 | 25 | cfg = OmegaConf.to_container(object_dict["cfg"]) 26 | model = object_dict["model"] 27 | trainer = object_dict["trainer"] 28 | 29 | if not trainer.logger: 30 | log.warning("Logger not found! Skipping hyperparameter logging...") 31 | return 32 | 33 | hparams["model"] = cfg["model"] 34 | 35 | # save number of model parameters 36 | hparams["model/params/total"] = sum(p.numel() for p in model.parameters()) 37 | hparams["model/params/trainable"] = sum(p.numel() for p in model.parameters() if p.requires_grad) 38 | hparams["model/params/non_trainable"] = sum(p.numel() for p in model.parameters() if not p.requires_grad) 39 | 40 | hparams["data"] = cfg["data"] 41 | hparams["trainer"] = cfg["trainer"] 42 | 43 | hparams["callbacks"] = cfg.get("callbacks") 44 | hparams["extras"] = cfg.get("extras") 45 | 46 | hparams["task_name"] = cfg.get("task_name") 47 | hparams["tags"] = cfg.get("tags") 48 | hparams["ckpt_path"] = cfg.get("ckpt_path") 49 | hparams["seed"] = cfg.get("seed") 50 | 51 | # send hparams to all loggers 52 | for logger in trainer.loggers: 53 | logger.log_hyperparams(hparams) 54 | -------------------------------------------------------------------------------- /cosyvoice/flow/length_regulator.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Zhihao Du) 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | from typing import Tuple 15 | import torch.nn as nn 16 | from torch.nn import functional as F 17 | from cosyvoice.utils.mask import make_pad_mask 18 | 19 | 20 | class InterpolateRegulator(nn.Module): 21 | def __init__( 22 | self, 23 | channels: int, 24 | sampling_ratios: Tuple, 25 | out_channels: int = None, 26 | groups: int = 1, 27 | ): 28 | super().__init__() 29 | self.sampling_ratios = sampling_ratios 30 | out_channels = out_channels or channels 31 | model = nn.ModuleList([]) 32 | if len(sampling_ratios) > 0: 33 | for _ in sampling_ratios: 34 | module = nn.Conv1d(channels, channels, 3, 1, 1) 35 | norm = nn.GroupNorm(groups, channels) 36 | act = nn.Mish() 37 | model.extend([module, norm, act]) 38 | model.append( 39 | nn.Conv1d(channels, out_channels, 1, 1) 40 | ) 41 | self.model = nn.Sequential(*model) 42 | 43 | def forward(self, x, ylens=None): 44 | # x in (B, T, D) 45 | mask = (~make_pad_mask(ylens)).to(x).unsqueeze(-1) 46 | x = F.interpolate(x.transpose(1, 2).contiguous(), size=ylens.max(), mode='nearest') 47 | out = self.model(x).transpose(1, 2).contiguous() 48 | olens = ylens 49 | return out * mask, olens 50 | -------------------------------------------------------------------------------- /cosyvoice/utils/file_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang) 2 | # 2024 Alibaba Inc (authors: Xiang Lyu) 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | import json 17 | import torchaudio 18 | 19 | 20 | def read_lists(list_file): 21 | lists = [] 22 | with open(list_file, 'r', encoding='utf8') as fin: 23 | for line in fin: 24 | lists.append(line.strip()) 25 | return lists 26 | 27 | def read_json_lists(list_file): 28 | lists = read_lists(list_file) 29 | results = {} 30 | for fn in lists: 31 | with open(fn, 'r', encoding='utf8') as fin: 32 | results.update(json.load(fin)) 33 | return results 34 | 35 | def load_wav(wav, target_sr): 36 | speech, sample_rate = torchaudio.load(wav) 37 | speech = speech.mean(dim=0, keepdim=True) 38 | if sample_rate != target_sr: 39 | assert sample_rate > target_sr, 'wav sample rate {} must be greater than {}'.format(sample_rate, target_sr) 40 | speech = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=target_sr)(speech) 41 | return speech 42 | 43 | def speed_change(waveform, sample_rate, speed_factor: str): 44 | effects = [ 45 | ["tempo", speed_factor], # speed_factor 46 | ["rate", f"{sample_rate}"] 47 | ] 48 | augmented_waveform, new_sample_rate = torchaudio.sox_effects.apply_effects_tensor( 49 | waveform, 50 | sample_rate, 51 | effects 52 | ) 53 | return augmented_waveform, new_sample_rate 54 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/configs/hparams_search/mnist_optuna.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | # example hyperparameter optimization of some experiment with Optuna: 4 | # python train.py -m hparams_search=mnist_optuna experiment=example 5 | 6 | defaults: 7 | - override /hydra/sweeper: optuna 8 | 9 | # choose metric which will be optimized by Optuna 10 | # make sure this is the correct name of some metric logged in lightning module! 
11 | optimized_metric: "val/acc_best" 12 | 13 | # here we define Optuna hyperparameter search 14 | # it optimizes for value returned from function with @hydra.main decorator 15 | # docs: https://hydra.cc/docs/next/plugins/optuna_sweeper 16 | hydra: 17 | mode: "MULTIRUN" # set hydra to multirun by default if this config is attached 18 | 19 | sweeper: 20 | _target_: hydra_plugins.hydra_optuna_sweeper.optuna_sweeper.OptunaSweeper 21 | 22 | # storage URL to persist optimization results 23 | # for example, you can use SQLite if you set 'sqlite:///example.db' 24 | storage: null 25 | 26 | # name of the study to persist optimization results 27 | study_name: null 28 | 29 | # number of parallel workers 30 | n_jobs: 1 31 | 32 | # 'minimize' or 'maximize' the objective 33 | direction: maximize 34 | 35 | # total number of runs that will be executed 36 | n_trials: 20 37 | 38 | # choose Optuna hyperparameter sampler 39 | # you can choose bayesian sampler (tpe), random search (without optimization), grid sampler, and others 40 | # docs: https://optuna.readthedocs.io/en/stable/reference/samplers.html 41 | sampler: 42 | _target_: optuna.samplers.TPESampler 43 | seed: 1234 44 | n_startup_trials: 10 # number of random sampling runs before optimization starts 45 | 46 | # define hyperparameter search space 47 | params: 48 | model.optimizer.lr: interval(0.0001, 0.1) 49 | data.batch_size: choice(32, 64, 128, 256) 50 | model.net.lin1_size: choice(64, 128, 256) 51 | model.net.lin2_size: choice(64, 128, 256) 52 | model.net.lin3_size: choice(32, 64, 128, 256) 53 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/matcha/utils/instantiators.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | import hydra 4 | from lightning import Callback 5 | from lightning.pytorch.loggers import Logger 6 | from omegaconf import DictConfig 7 | 8 | from matcha.utils import pylogger 9 | 10 | log = pylogger.get_pylogger(__name__) 11 | 12 | 13 | def instantiate_callbacks(callbacks_cfg: DictConfig) -> List[Callback]: 14 | """Instantiates callbacks from config. 15 | 16 | :param callbacks_cfg: A DictConfig object containing callback configurations. 17 | :return: A list of instantiated callbacks. 18 | """ 19 | callbacks: List[Callback] = [] 20 | 21 | if not callbacks_cfg: 22 | log.warning("No callback configs found! Skipping..") 23 | return callbacks 24 | 25 | if not isinstance(callbacks_cfg, DictConfig): 26 | raise TypeError("Callbacks config must be a DictConfig!") 27 | 28 | for _, cb_conf in callbacks_cfg.items(): 29 | if isinstance(cb_conf, DictConfig) and "_target_" in cb_conf: 30 | log.info(f"Instantiating callback <{cb_conf._target_}>") # pylint: disable=protected-access 31 | callbacks.append(hydra.utils.instantiate(cb_conf)) 32 | 33 | return callbacks 34 | 35 | 36 | def instantiate_loggers(logger_cfg: DictConfig) -> List[Logger]: 37 | """Instantiates loggers from config. 38 | 39 | :param logger_cfg: A DictConfig object containing logger configurations. 40 | :return: A list of instantiated loggers. 41 | """ 42 | logger: List[Logger] = [] 43 | 44 | if not logger_cfg: 45 | log.warning("No logger configs found! 
Skipping...") 46 | return logger 47 | 48 | if not isinstance(logger_cfg, DictConfig): 49 | raise TypeError("Logger config must be a DictConfig!") 50 | 51 | for _, lg_conf in logger_cfg.items(): 52 | if isinstance(lg_conf, DictConfig) and "_target_" in lg_conf: 53 | log.info(f"Instantiating logger <{lg_conf._target_}>") # pylint: disable=protected-access 54 | logger.append(hydra.utils.instantiate(lg_conf)) 55 | 56 | return logger 57 | -------------------------------------------------------------------------------- /cosyvoice/hifigan/f0_predictor.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Kai Hu) 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | import torch 15 | import torch.nn as nn 16 | from torch.nn.utils import weight_norm 17 | 18 | 19 | class ConvRNNF0Predictor(nn.Module): 20 | def __init__(self, 21 | num_class: int = 1, 22 | in_channels: int = 80, 23 | cond_channels: int = 512 24 | ): 25 | super().__init__() 26 | 27 | self.num_class = num_class 28 | self.condnet = nn.Sequential( 29 | weight_norm( 30 | nn.Conv1d(in_channels, cond_channels, kernel_size=3, padding=1) 31 | ), 32 | nn.ELU(), 33 | weight_norm( 34 | nn.Conv1d(cond_channels, cond_channels, kernel_size=3, padding=1) 35 | ), 36 | nn.ELU(), 37 | weight_norm( 38 | nn.Conv1d(cond_channels, cond_channels, kernel_size=3, padding=1) 39 | ), 40 | nn.ELU(), 41 | weight_norm( 42 | nn.Conv1d(cond_channels, cond_channels, kernel_size=3, padding=1) 43 | ), 44 | nn.ELU(), 45 | weight_norm( 46 | nn.Conv1d(cond_channels, cond_channels, kernel_size=3, padding=1) 47 | ), 48 | nn.ELU(), 49 | ) 50 | self.classifier = nn.Linear(in_features=cond_channels, out_features=self.num_class) 51 | 52 | def forward(self, x: torch.Tensor) -> torch.Tensor: 53 | x = self.condnet(x) 54 | x = x.transpose(1, 2) 55 | return torch.abs(self.classifier(x).squeeze(-1)) 56 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/matcha/text/numbers.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/keithito/tacotron """ 2 | 3 | import re 4 | 5 | import inflect 6 | 7 | _inflect = inflect.engine() 8 | _comma_number_re = re.compile(r"([0-9][0-9\,]+[0-9])") 9 | _decimal_number_re = re.compile(r"([0-9]+\.[0-9]+)") 10 | _pounds_re = re.compile(r"£([0-9\,]*[0-9]+)") 11 | _dollars_re = re.compile(r"\$([0-9\.\,]*[0-9]+)") 12 | _ordinal_re = re.compile(r"[0-9]+(st|nd|rd|th)") 13 | _number_re = re.compile(r"[0-9]+") 14 | 15 | 16 | def _remove_commas(m): 17 | return m.group(1).replace(",", "") 18 | 19 | 20 | def _expand_decimal_point(m): 21 | return m.group(1).replace(".", " point ") 22 | 23 | 24 | def _expand_dollars(m): 25 | match = m.group(1) 26 | parts = match.split(".") 27 | if len(parts) > 2: 28 | return match + " dollars" 29 | dollars = int(parts[0]) if parts[0] else 0 30 | cents = int(parts[1]) if len(parts) > 1 and 
parts[1] else 0 31 | if dollars and cents: 32 | dollar_unit = "dollar" if dollars == 1 else "dollars" 33 | cent_unit = "cent" if cents == 1 else "cents" 34 | return f"{dollars} {dollar_unit}, {cents} {cent_unit}" 35 | elif dollars: 36 | dollar_unit = "dollar" if dollars == 1 else "dollars" 37 | return f"{dollars} {dollar_unit}" 38 | elif cents: 39 | cent_unit = "cent" if cents == 1 else "cents" 40 | return f"{cents} {cent_unit}" 41 | else: 42 | return "zero dollars" 43 | 44 | 45 | def _expand_ordinal(m): 46 | return _inflect.number_to_words(m.group(0)) 47 | 48 | 49 | def _expand_number(m): 50 | num = int(m.group(0)) 51 | if num > 1000 and num < 3000: 52 | if num == 2000: 53 | return "two thousand" 54 | elif num > 2000 and num < 2010: 55 | return "two thousand " + _inflect.number_to_words(num % 100) 56 | elif num % 100 == 0: 57 | return _inflect.number_to_words(num // 100) + " hundred" 58 | else: 59 | return _inflect.number_to_words(num, andword="", zero="oh", group=2).replace(", ", " ") 60 | else: 61 | return _inflect.number_to_words(num, andword="") 62 | 63 | 64 | def normalize_numbers(text): 65 | text = re.sub(_comma_number_re, _remove_commas, text) 66 | text = re.sub(_pounds_re, r"\1 pounds", text) 67 | text = re.sub(_dollars_re, _expand_dollars, text) 68 | text = re.sub(_decimal_number_re, _expand_decimal_point, text) 69 | text = re.sub(_ordinal_re, _expand_ordinal, text) 70 | text = re.sub(_number_re, _expand_number, text) 71 | return text 72 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/matcha/utils/audio.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.utils.data 4 | from librosa.filters import mel as librosa_mel_fn 5 | from scipy.io.wavfile import read 6 | 7 | MAX_WAV_VALUE = 32768.0 8 | 9 | 10 | def load_wav(full_path): 11 | sampling_rate, data = read(full_path) 12 | return data, sampling_rate 13 | 14 | 15 | def dynamic_range_compression(x, C=1, clip_val=1e-5): 16 | return np.log(np.clip(x, a_min=clip_val, a_max=None) * C) 17 | 18 | 19 | def dynamic_range_decompression(x, C=1): 20 | return np.exp(x) / C 21 | 22 | 23 | def dynamic_range_compression_torch(x, C=1, clip_val=1e-5): 24 | return torch.log(torch.clamp(x, min=clip_val) * C) 25 | 26 | 27 | def dynamic_range_decompression_torch(x, C=1): 28 | return torch.exp(x) / C 29 | 30 | 31 | def spectral_normalize_torch(magnitudes): 32 | output = dynamic_range_compression_torch(magnitudes) 33 | return output 34 | 35 | 36 | def spectral_de_normalize_torch(magnitudes): 37 | output = dynamic_range_decompression_torch(magnitudes) 38 | return output 39 | 40 | 41 | mel_basis = {} 42 | hann_window = {} 43 | 44 | 45 | def mel_spectrogram(y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False): 46 | if torch.min(y) < -1.0: 47 | print("min value is ", torch.min(y)) 48 | if torch.max(y) > 1.0: 49 | print("max value is ", torch.max(y)) 50 | 51 | global mel_basis, hann_window # pylint: disable=global-statement 52 | if f"{str(fmax)}_{str(y.device)}" not in mel_basis: 53 | mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax) 54 | mel_basis[str(fmax) + "_" + str(y.device)] = torch.from_numpy(mel).float().to(y.device) 55 | hann_window[str(y.device)] = torch.hann_window(win_size).to(y.device) 56 | 57 | y = torch.nn.functional.pad( 58 | y.unsqueeze(1), (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)), mode="reflect" 59 | ) 60 | y = 
y.squeeze(1) 61 | 62 | spec = torch.view_as_real( 63 | torch.stft( 64 | y, 65 | n_fft, 66 | hop_length=hop_size, 67 | win_length=win_size, 68 | window=hann_window[str(y.device)], 69 | center=center, 70 | pad_mode="reflect", 71 | normalized=False, 72 | onesided=True, 73 | return_complex=True, 74 | ) 75 | ) 76 | 77 | spec = torch.sqrt(spec.pow(2).sum(-1) + (1e-9)) 78 | 79 | spec = torch.matmul(mel_basis[str(fmax) + "_" + str(y.device)], spec) 80 | spec = spectral_normalize_torch(spec) 81 | 82 | return spec 83 | -------------------------------------------------------------------------------- /batch_inference.py: -------------------------------------------------------------------------------- 1 | import os 2 | import subprocess 3 | import argparse 4 | import pandas as pd 5 | from datasets import Dataset 6 | 7 | def process_batch(csv_file, speaker_prompt_audio_folder, output_audio_folder): 8 | # Load CSV with pandas 9 | data = pd.read_csv(csv_file) 10 | 11 | # Transform pandas DataFrame to HuggingFace Dataset 12 | dataset = Dataset.from_pandas(data) 13 | 14 | def gen_audio(row): 15 | speaker_prompt_audio_path = os.path.join(speaker_prompt_audio_folder, f"{row['speaker_prompt_audio_filename']}.wav") 16 | speaker_prompt_text_transcription = row['speaker_prompt_text_transcription'] 17 | content_to_synthesize = row['content_to_synthesize'] 18 | output_audio_path = os.path.join(output_audio_folder, f"{row['output_audio_filename']}.wav") 19 | 20 | if not os.path.exists(speaker_prompt_audio_path): 21 | print(f"File {speaker_prompt_audio_path} does not exist") 22 | return row #{"status": "failed", "reason": "file not found"} 23 | 24 | command = [ 25 | "python", "single_inference.py", 26 | "--speaker_prompt_audio_path", speaker_prompt_audio_path, 27 | "--speaker_prompt_text_transcription", speaker_prompt_text_transcription, 28 | "--content_to_synthesize", content_to_synthesize, 29 | "--output_path", output_audio_path 30 | ] 31 | 32 | try: 33 | print(f"Processing: {speaker_prompt_audio_path}") 34 | subprocess.run(command, check=True) 35 | print(f"Generated: {output_audio_path}") 36 | return row #{"status": "success", "output": gen_voice_file_name} 37 | except subprocess.CalledProcessError as e: 38 | print(f"Failed to generate {speaker_prompt_audio_path}, error: {e}") 39 | return row #{"status": "failed", "reason": str(e)} 40 | 41 | dataset = dataset.map(gen_audio, num_proc = 1) 42 | 43 | def main(): 44 | parser = argparse.ArgumentParser(description="Batch process audio generation.") 45 | parser.add_argument("--csv_file", required=True, help="Path to the CSV file containing input data.") 46 | parser.add_argument("--speaker_prompt_audio_folder", required=True, help="Path to the folder containing speaker prompt audio files.") 47 | parser.add_argument("--output_audio_folder", required=True, help="Path to the folder where results will be stored.") 48 | 49 | args = parser.parse_args() 50 | 51 | os.makedirs(args.output_audio_folder, exist_ok=True) 52 | 53 | process_batch( 54 | csv_file=args.csv_file, 55 | speaker_prompt_audio_folder=args.speaker_prompt_audio_folder, 56 | output_audio_folder=args.output_audio_folder, 57 | ) 58 | 59 | if __name__ == "__main__": 60 | main() 61 | -------------------------------------------------------------------------------- /cosyvoice/utils/class_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright [2023-11-28] 2 | # 2024 Alibaba Inc (authors: Xiang Lyu) 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the 
"License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | import torch 16 | 17 | from cosyvoice.transformer.activation import Swish 18 | from cosyvoice.transformer.subsampling import ( 19 | LinearNoSubsampling, 20 | EmbedinigNoSubsampling, 21 | Conv1dSubsampling2, 22 | Conv2dSubsampling4, 23 | Conv2dSubsampling6, 24 | Conv2dSubsampling8, 25 | ) 26 | from cosyvoice.transformer.embedding import (PositionalEncoding, 27 | RelPositionalEncoding, 28 | WhisperPositionalEncoding, 29 | LearnablePositionalEncoding, 30 | NoPositionalEncoding) 31 | from cosyvoice.transformer.attention import (MultiHeadedAttention, 32 | RelPositionMultiHeadedAttention) 33 | from cosyvoice.transformer.embedding import EspnetRelPositionalEncoding 34 | from cosyvoice.transformer.subsampling import LegacyLinearNoSubsampling 35 | 36 | 37 | COSYVOICE_ACTIVATION_CLASSES = { 38 | "hardtanh": torch.nn.Hardtanh, 39 | "tanh": torch.nn.Tanh, 40 | "relu": torch.nn.ReLU, 41 | "selu": torch.nn.SELU, 42 | "swish": getattr(torch.nn, "SiLU", Swish), 43 | "gelu": torch.nn.GELU, 44 | } 45 | 46 | COSYVOICE_SUBSAMPLE_CLASSES = { 47 | "linear": LinearNoSubsampling, 48 | "linear_legacy": LegacyLinearNoSubsampling, 49 | "embed": EmbedinigNoSubsampling, 50 | "conv1d2": Conv1dSubsampling2, 51 | "conv2d": Conv2dSubsampling4, 52 | "conv2d6": Conv2dSubsampling6, 53 | "conv2d8": Conv2dSubsampling8, 54 | 'paraformer_dummy': torch.nn.Identity 55 | } 56 | 57 | COSYVOICE_EMB_CLASSES = { 58 | "embed": PositionalEncoding, 59 | "abs_pos": PositionalEncoding, 60 | "rel_pos": RelPositionalEncoding, 61 | "rel_pos_espnet": EspnetRelPositionalEncoding, 62 | "no_pos": NoPositionalEncoding, 63 | "abs_pos_whisper": WhisperPositionalEncoding, 64 | "embed_learnable_pe": LearnablePositionalEncoding, 65 | } 66 | 67 | COSYVOICE_ATTENTION_CLASSES = { 68 | "selfattn": MultiHeadedAttention, 69 | "rel_selfattn": RelPositionMultiHeadedAttention, 70 | } 71 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/matcha/hifigan/denoiser.py: -------------------------------------------------------------------------------- 1 | # Code modified from Rafael Valle's implementation https://github.com/NVIDIA/waveglow/blob/5bc2a53e20b3b533362f974cfa1ea0267ae1c2b1/denoiser.py 2 | 3 | """Waveglow style denoiser can be used to remove the artifacts from the HiFiGAN generated audio.""" 4 | import torch 5 | 6 | 7 | class Denoiser(torch.nn.Module): 8 | """Removes model bias from audio produced with waveglow""" 9 | 10 | def __init__(self, vocoder, filter_length=1024, n_overlap=4, win_length=1024, mode="zeros"): 11 | super().__init__() 12 | self.filter_length = filter_length 13 | self.hop_length = int(filter_length / n_overlap) 14 | self.win_length = win_length 15 | 16 | dtype, device = next(vocoder.parameters()).dtype, next(vocoder.parameters()).device 17 | self.device = device 18 | if mode == "zeros": 19 | mel_input = torch.zeros((1, 80, 88), dtype=dtype, device=device) 20 | elif mode == "normal": 21 | mel_input = torch.randn((1, 80, 88), dtype=dtype, device=device) 22 | 
else: 23 | raise Exception(f"Mode {mode} is not supported") 24 | 25 | def stft_fn(audio, n_fft, hop_length, win_length, window): 26 | spec = torch.stft( 27 | audio, 28 | n_fft=n_fft, 29 | hop_length=hop_length, 30 | win_length=win_length, 31 | window=window, 32 | return_complex=True, 33 | ) 34 | spec = torch.view_as_real(spec) 35 | return torch.sqrt(spec.pow(2).sum(-1)), torch.atan2(spec[..., -1], spec[..., 0]) 36 | 37 | self.stft = lambda x: stft_fn( 38 | audio=x, 39 | n_fft=self.filter_length, 40 | hop_length=self.hop_length, 41 | win_length=self.win_length, 42 | window=torch.hann_window(self.win_length, device=device), 43 | ) 44 | self.istft = lambda x, y: torch.istft( 45 | torch.complex(x * torch.cos(y), x * torch.sin(y)), 46 | n_fft=self.filter_length, 47 | hop_length=self.hop_length, 48 | win_length=self.win_length, 49 | window=torch.hann_window(self.win_length, device=device), 50 | ) 51 | 52 | with torch.no_grad(): 53 | bias_audio = vocoder(mel_input).float().squeeze(0) 54 | bias_spec, _ = self.stft(bias_audio) 55 | 56 | self.register_buffer("bias_spec", bias_spec[:, :, 0][:, :, None]) 57 | 58 | @torch.inference_mode() 59 | def forward(self, audio, strength=0.0005): 60 | audio_spec, audio_angles = self.stft(audio) 61 | audio_spec_denoised = audio_spec - self.bias_spec.to(audio.device) * strength 62 | audio_spec_denoised = torch.clamp(audio_spec_denoised, 0.0) 63 | audio_denoised = self.istft(audio_spec_denoised, audio_angles) 64 | return audio_denoised 65 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/matcha/utils/model.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/jaywalnut310/glow-tts """ 2 | 3 | import numpy as np 4 | import torch 5 | 6 | 7 | def sequence_mask(length, max_length=None): 8 | if max_length is None: 9 | max_length = length.max() 10 | x = torch.arange(max_length, dtype=length.dtype, device=length.device) 11 | return x.unsqueeze(0) < length.unsqueeze(1) 12 | 13 | 14 | def fix_len_compatibility(length, num_downsamplings_in_unet=2): 15 | factor = torch.scalar_tensor(2).pow(num_downsamplings_in_unet) 16 | length = (length / factor).ceil() * factor 17 | if not torch.onnx.is_in_onnx_export(): 18 | return length.int().item() 19 | else: 20 | return length 21 | 22 | 23 | def convert_pad_shape(pad_shape): 24 | inverted_shape = pad_shape[::-1] 25 | pad_shape = [item for sublist in inverted_shape for item in sublist] 26 | return pad_shape 27 | 28 | 29 | def generate_path(duration, mask): 30 | device = duration.device 31 | 32 | b, t_x, t_y = mask.shape 33 | cum_duration = torch.cumsum(duration, 1) 34 | path = torch.zeros(b, t_x, t_y, dtype=mask.dtype).to(device=device) 35 | 36 | cum_duration_flat = cum_duration.view(b * t_x) 37 | path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype) 38 | path = path.view(b, t_x, t_y) 39 | path = path - torch.nn.functional.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1] 40 | path = path * mask 41 | return path 42 | 43 | 44 | def duration_loss(logw, logw_, lengths): 45 | loss = torch.sum((logw - logw_) ** 2) / torch.sum(lengths) 46 | return loss 47 | 48 | 49 | def normalize(data, mu, std): 50 | if not isinstance(mu, (float, int)): 51 | if isinstance(mu, list): 52 | mu = torch.tensor(mu, dtype=data.dtype, device=data.device) 53 | elif isinstance(mu, torch.Tensor): 54 | mu = mu.to(data.device) 55 | elif isinstance(mu, np.ndarray): 56 | mu = torch.from_numpy(mu).to(data.device) 57 | mu = 
mu.unsqueeze(-1) 58 | 59 | if not isinstance(std, (float, int)): 60 | if isinstance(std, list): 61 | std = torch.tensor(std, dtype=data.dtype, device=data.device) 62 | elif isinstance(std, torch.Tensor): 63 | std = std.to(data.device) 64 | elif isinstance(std, np.ndarray): 65 | std = torch.from_numpy(std).to(data.device) 66 | std = std.unsqueeze(-1) 67 | 68 | return (data - mu) / std 69 | 70 | 71 | def denormalize(data, mu, std): 72 | if not isinstance(mu, float): 73 | if isinstance(mu, list): 74 | mu = torch.tensor(mu, dtype=data.dtype, device=data.device) 75 | elif isinstance(mu, torch.Tensor): 76 | mu = mu.to(data.device) 77 | elif isinstance(mu, np.ndarray): 78 | mu = torch.from_numpy(mu).to(data.device) 79 | mu = mu.unsqueeze(-1) 80 | 81 | if not isinstance(std, float): 82 | if isinstance(std, list): 83 | std = torch.tensor(std, dtype=data.dtype, device=data.device) 84 | elif isinstance(std, torch.Tensor): 85 | std = std.to(data.device) 86 | elif isinstance(std, np.ndarray): 87 | std = torch.from_numpy(std).to(data.device) 88 | std = std.unsqueeze(-1) 89 | 90 | return data * std + mu 91 | -------------------------------------------------------------------------------- /cosyvoice/transformer/activation.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 Johns Hopkins University (Shinji Watanabe) 2 | # 2020 Northwestern Polytechnical University (Pengcheng Guo) 3 | # 2020 Mobvoi Inc (Binbin Zhang) 4 | # 2024 Alibaba Inc (Xiang Lyu) 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | """Swish() activation function for Conformer.""" 18 | 19 | import torch 20 | from torch import nn, sin, pow 21 | from torch.nn import Parameter 22 | 23 | 24 | class Swish(torch.nn.Module): 25 | """Construct an Swish object.""" 26 | 27 | def forward(self, x: torch.Tensor) -> torch.Tensor: 28 | """Return Swish activation function.""" 29 | return x * torch.sigmoid(x) 30 | 31 | 32 | # Implementation adapted from https://github.com/EdwardDixon/snake under the MIT license. 33 | # LICENSE is in incl_licenses directory. 34 | class Snake(nn.Module): 35 | ''' 36 | Implementation of a sine-based periodic activation function 37 | Shape: 38 | - Input: (B, C, T) 39 | - Output: (B, C, T), same shape as the input 40 | Parameters: 41 | - alpha - trainable parameter 42 | References: 43 | - This activation function is from this paper by Liu Ziyin, Tilman Hartwig, Masahito Ueda: 44 | https://arxiv.org/abs/2006.08195 45 | Examples: 46 | >>> a1 = snake(256) 47 | >>> x = torch.randn(256) 48 | >>> x = a1(x) 49 | ''' 50 | def __init__(self, in_features, alpha=1.0, alpha_trainable=True, alpha_logscale=False): 51 | ''' 52 | Initialization. 53 | INPUT: 54 | - in_features: shape of the input 55 | - alpha: trainable parameter 56 | alpha is initialized to 1 by default, higher values = higher-frequency. 57 | alpha will be trained along with the rest of your model. 
58 | ''' 59 | super(Snake, self).__init__() 60 | self.in_features = in_features 61 | 62 | # initialize alpha 63 | self.alpha_logscale = alpha_logscale 64 | if self.alpha_logscale: # log scale alphas initialized to zeros 65 | self.alpha = Parameter(torch.zeros(in_features) * alpha) 66 | else: # linear scale alphas initialized to ones 67 | self.alpha = Parameter(torch.ones(in_features) * alpha) 68 | 69 | self.alpha.requires_grad = alpha_trainable 70 | 71 | self.no_div_by_zero = 0.000000001 72 | 73 | def forward(self, x): 74 | ''' 75 | Forward pass of the function. 76 | Applies the function to the input elementwise. 77 | Snake ∶= x + 1/a * sin^2 (xa) 78 | ''' 79 | alpha = self.alpha.unsqueeze(0).unsqueeze(-1) # line up with x to [B, C, T] 80 | if self.alpha_logscale: 81 | alpha = torch.exp(alpha) 82 | x = x + (1.0 / (alpha + self.no_div_by_zero)) * pow(sin(x * alpha), 2) 83 | 84 | return x 85 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/matcha/utils/rich_utils.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | from typing import Sequence 3 | 4 | import rich 5 | import rich.syntax 6 | import rich.tree 7 | from hydra.core.hydra_config import HydraConfig 8 | from lightning.pytorch.utilities import rank_zero_only 9 | from omegaconf import DictConfig, OmegaConf, open_dict 10 | from rich.prompt import Prompt 11 | 12 | from matcha.utils import pylogger 13 | 14 | log = pylogger.get_pylogger(__name__) 15 | 16 | 17 | @rank_zero_only 18 | def print_config_tree( 19 | cfg: DictConfig, 20 | print_order: Sequence[str] = ( 21 | "data", 22 | "model", 23 | "callbacks", 24 | "logger", 25 | "trainer", 26 | "paths", 27 | "extras", 28 | ), 29 | resolve: bool = False, 30 | save_to_file: bool = False, 31 | ) -> None: 32 | """Prints the contents of a DictConfig as a tree structure using the Rich library. 33 | 34 | :param cfg: A DictConfig composed by Hydra. 35 | :param print_order: Determines in what order config components are printed. Default is ``("data", "model", 36 | "callbacks", "logger", "trainer", "paths", "extras")``. 37 | :param resolve: Whether to resolve reference fields of DictConfig. Default is ``False``. 38 | :param save_to_file: Whether to export config to the hydra output folder. Default is ``False``. 39 | """ 40 | style = "dim" 41 | tree = rich.tree.Tree("CONFIG", style=style, guide_style=style) 42 | 43 | queue = [] 44 | 45 | # add fields from `print_order` to queue 46 | for field in print_order: 47 | _ = ( 48 | queue.append(field) 49 | if field in cfg 50 | else log.warning(f"Field '{field}' not found in config. 
Skipping '{field}' config printing...") 51 | ) 52 | 53 | # add all the other fields to queue (not specified in `print_order`) 54 | for field in cfg: 55 | if field not in queue: 56 | queue.append(field) 57 | 58 | # generate config tree from queue 59 | for field in queue: 60 | branch = tree.add(field, style=style, guide_style=style) 61 | 62 | config_group = cfg[field] 63 | if isinstance(config_group, DictConfig): 64 | branch_content = OmegaConf.to_yaml(config_group, resolve=resolve) 65 | else: 66 | branch_content = str(config_group) 67 | 68 | branch.add(rich.syntax.Syntax(branch_content, "yaml")) 69 | 70 | # print config tree 71 | rich.print(tree) 72 | 73 | # save config tree to file 74 | if save_to_file: 75 | with open(Path(cfg.paths.output_dir, "config_tree.log"), "w") as file: 76 | rich.print(tree, file=file) 77 | 78 | 79 | @rank_zero_only 80 | def enforce_tags(cfg: DictConfig, save_to_file: bool = False) -> None: 81 | """Prompts user to input tags from command line if no tags are provided in config. 82 | 83 | :param cfg: A DictConfig composed by Hydra. 84 | :param save_to_file: Whether to export tags to the hydra output folder. Default is ``False``. 85 | """ 86 | if not cfg.get("tags"): 87 | if "id" in HydraConfig().cfg.hydra.job: 88 | raise ValueError("Specify tags before launching a multirun!") 89 | 90 | log.warning("No tags provided in config. Prompting user to input tags...") 91 | tags = Prompt.ask("Enter a list of comma separated tags", default="dev") 92 | tags = [t.strip() for t in tags.split(",") if t != ""] 93 | 94 | with open_dict(cfg): 95 | cfg.tags = tags 96 | 97 | log.info(f"Tags: {cfg.tags}") 98 | 99 | if save_to_file: 100 | with open(Path(cfg.paths.output_dir, "tags.log"), "w") as file: 101 | rich.print(cfg.tags, file=file) 102 | -------------------------------------------------------------------------------- /cosyvoice/cli/model.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu) 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | import torch 15 | 16 | class CosyVoiceModel: 17 | 18 | def __init__(self, 19 | llm: torch.nn.Module, 20 | flow: torch.nn.Module, 21 | hift: torch.nn.Module): 22 | self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 23 | self.llm = llm 24 | self.flow = flow 25 | self.hift = hift 26 | 27 | def load(self, llm_model, flow_model, hift_model): 28 | self.llm.load_state_dict(torch.load(llm_model, map_location=self.device)) 29 | self.llm.to(self.device).eval() 30 | self.flow.load_state_dict(torch.load(flow_model, map_location=self.device)) 31 | self.flow.to(self.device).eval() 32 | self.hift.load_state_dict(torch.load(hift_model, map_location=self.device)) 33 | self.hift.to(self.device).eval() 34 | 35 | def inference(self, text, text_len, flow_embedding, llm_embedding=torch.zeros(0, 192), 36 | prompt_text=torch.zeros(1, 0, dtype=torch.int32), prompt_text_len=torch.zeros(1, dtype=torch.int32), 37 | llm_prompt_speech_token=torch.zeros(1, 0, dtype=torch.int32), llm_prompt_speech_token_len=torch.zeros(1, dtype=torch.int32), 38 | flow_prompt_speech_token=torch.zeros(1, 0, dtype=torch.int32), flow_prompt_speech_token_len=torch.zeros(1, dtype=torch.int32), 39 | prompt_speech_feat=torch.zeros(1, 0, 80), prompt_speech_feat_len=torch.zeros(1, dtype=torch.int32)): 40 | tts_speech_token = self.llm.inference(text=text.to(self.device), 41 | text_len=text_len.to(self.device), 42 | prompt_text=prompt_text.to(self.device), 43 | prompt_text_len=prompt_text_len.to(self.device), 44 | prompt_speech_token=llm_prompt_speech_token.to(self.device), 45 | prompt_speech_token_len=llm_prompt_speech_token_len.to(self.device), 46 | embedding=llm_embedding.to(self.device), 47 | beam_size=1, 48 | sampling=25, 49 | max_token_text_ratio=30, 50 | min_token_text_ratio=3) 51 | tts_mel = self.flow.inference(token=tts_speech_token, 52 | token_len=torch.tensor([tts_speech_token.size(1)], dtype=torch.int32).to(self.device), 53 | prompt_token=flow_prompt_speech_token.to(self.device), 54 | prompt_token_len=flow_prompt_speech_token_len.to(self.device), 55 | prompt_feat=prompt_speech_feat.to(self.device), 56 | prompt_feat_len=prompt_speech_feat_len.to(self.device), 57 | embedding=flow_embedding.to(self.device)) 58 | tts_speech = self.hift.inference(mel=tts_mel).cpu() 59 | torch.cuda.empty_cache() 60 | return {'tts_speech': tts_speech} 61 | -------------------------------------------------------------------------------- /cosyvoice/transformer/label_smoothing_loss.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019 Shigeki Karita 2 | # 2020 Mobvoi Inc (Binbin Zhang) 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Label smoothing module.""" 16 | 17 | import torch 18 | from torch import nn 19 | 20 | 21 | class LabelSmoothingLoss(nn.Module): 22 | """Label-smoothing loss. 
23 | 24 | In a standard CE loss, the label's data distribution is: 25 | [0,1,2] -> 26 | [ 27 | [1.0, 0.0, 0.0], 28 | [0.0, 1.0, 0.0], 29 | [0.0, 0.0, 1.0], 30 | ] 31 | 32 | In the smoothing version CE Loss,some probabilities 33 | are taken from the true label prob (1.0) and are divided 34 | among other labels. 35 | 36 | e.g. 37 | smoothing=0.1 38 | [0,1,2] -> 39 | [ 40 | [0.9, 0.05, 0.05], 41 | [0.05, 0.9, 0.05], 42 | [0.05, 0.05, 0.9], 43 | ] 44 | 45 | Args: 46 | size (int): the number of class 47 | padding_idx (int): padding class id which will be ignored for loss 48 | smoothing (float): smoothing rate (0.0 means the conventional CE) 49 | normalize_length (bool): 50 | normalize loss by sequence length if True 51 | normalize loss by batch size if False 52 | """ 53 | 54 | def __init__(self, 55 | size: int, 56 | padding_idx: int, 57 | smoothing: float, 58 | normalize_length: bool = False): 59 | """Construct an LabelSmoothingLoss object.""" 60 | super(LabelSmoothingLoss, self).__init__() 61 | self.criterion = nn.KLDivLoss(reduction="none") 62 | self.padding_idx = padding_idx 63 | self.confidence = 1.0 - smoothing 64 | self.smoothing = smoothing 65 | self.size = size 66 | self.normalize_length = normalize_length 67 | 68 | def forward(self, x: torch.Tensor, target: torch.Tensor) -> torch.Tensor: 69 | """Compute loss between x and target. 70 | 71 | The model outputs and data labels tensors are flatten to 72 | (batch*seqlen, class) shape and a mask is applied to the 73 | padding part which should not be calculated for loss. 74 | 75 | Args: 76 | x (torch.Tensor): prediction (batch, seqlen, class) 77 | target (torch.Tensor): 78 | target signal masked with self.padding_id (batch, seqlen) 79 | Returns: 80 | loss (torch.Tensor) : The KL loss, scalar float value 81 | """ 82 | assert x.size(2) == self.size 83 | batch_size = x.size(0) 84 | x = x.view(-1, self.size) 85 | target = target.view(-1) 86 | # use zeros_like instead of torch.no_grad() for true_dist, 87 | # since no_grad() can not be exported by JIT 88 | true_dist = torch.zeros_like(x) 89 | true_dist.fill_(self.smoothing / (self.size - 1)) 90 | ignore = target == self.padding_idx # (B,) 91 | total = len(target) - ignore.sum().item() 92 | target = target.masked_fill(ignore, 0) # avoid -1 index 93 | true_dist.scatter_(1, target.unsqueeze(1), self.confidence) 94 | kl = self.criterion(torch.log_softmax(x, dim=1), true_dist) 95 | denom = total if self.normalize_length else batch_size 96 | return kl.masked_fill(ignore.unsqueeze(1), 0).sum() / denom 97 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/matcha/utils/generate_data_statistics.py: -------------------------------------------------------------------------------- 1 | r""" 2 | The file creates a pickle file where the values needed for loading of dataset is stored and the model can load it 3 | when needed. 
4 | 5 | Parameters from hparam.py will be used 6 | """ 7 | import argparse 8 | import json 9 | import os 10 | import sys 11 | from pathlib import Path 12 | 13 | import rootutils 14 | import torch 15 | from hydra import compose, initialize 16 | from omegaconf import open_dict 17 | from tqdm.auto import tqdm 18 | 19 | from matcha.data.text_mel_datamodule import TextMelDataModule 20 | from matcha.utils.logging_utils import pylogger 21 | 22 | log = pylogger.get_pylogger(__name__) 23 | 24 | 25 | def compute_data_statistics(data_loader: torch.utils.data.DataLoader, out_channels: int): 26 | """Generate data mean and standard deviation helpful in data normalisation 27 | 28 | Args: 29 | data_loader (torch.utils.data.Dataloader): _description_ 30 | out_channels (int): mel spectrogram channels 31 | """ 32 | total_mel_sum = 0 33 | total_mel_sq_sum = 0 34 | total_mel_len = 0 35 | 36 | for batch in tqdm(data_loader, leave=False): 37 | mels = batch["y"] 38 | mel_lengths = batch["y_lengths"] 39 | 40 | total_mel_len += torch.sum(mel_lengths) 41 | total_mel_sum += torch.sum(mels) 42 | total_mel_sq_sum += torch.sum(torch.pow(mels, 2)) 43 | 44 | data_mean = total_mel_sum / (total_mel_len * out_channels) 45 | data_std = torch.sqrt((total_mel_sq_sum / (total_mel_len * out_channels)) - torch.pow(data_mean, 2)) 46 | 47 | return {"mel_mean": data_mean.item(), "mel_std": data_std.item()} 48 | 49 | 50 | def main(): 51 | parser = argparse.ArgumentParser() 52 | 53 | parser.add_argument( 54 | "-i", 55 | "--input-config", 56 | type=str, 57 | default="vctk.yaml", 58 | help="The name of the yaml config file under configs/data", 59 | ) 60 | 61 | parser.add_argument( 62 | "-b", 63 | "--batch-size", 64 | type=int, 65 | default="256", 66 | help="Can have increased batch size for faster computation", 67 | ) 68 | 69 | parser.add_argument( 70 | "-f", 71 | "--force", 72 | action="store_true", 73 | default=False, 74 | required=False, 75 | help="force overwrite the file", 76 | ) 77 | args = parser.parse_args() 78 | output_file = Path(args.input_config).with_suffix(".json") 79 | 80 | if os.path.exists(output_file) and not args.force: 81 | print("File already exists. Use -f to force overwrite") 82 | sys.exit(1) 83 | 84 | with initialize(version_base="1.3", config_path="../../configs/data"): 85 | cfg = compose(config_name=args.input_config, return_hydra_config=True, overrides=[]) 86 | 87 | root_path = rootutils.find_root(search_from=__file__, indicator=".project-root") 88 | 89 | with open_dict(cfg): 90 | del cfg["hydra"] 91 | del cfg["_target_"] 92 | cfg["data_statistics"] = None 93 | cfg["seed"] = 1234 94 | cfg["batch_size"] = args.batch_size 95 | cfg["train_filelist_path"] = str(os.path.join(root_path, cfg["train_filelist_path"])) 96 | cfg["valid_filelist_path"] = str(os.path.join(root_path, cfg["valid_filelist_path"])) 97 | 98 | text_mel_datamodule = TextMelDataModule(**cfg) 99 | text_mel_datamodule.setup() 100 | data_loader = text_mel_datamodule.train_dataloader() 101 | log.info("Dataloader loaded! 
Now computing stats...") 102 | params = compute_data_statistics(data_loader, cfg["n_feats"]) 103 | print(params) 104 | json.dump( 105 | params, 106 | open(output_file, "w"), 107 | ) 108 | 109 | 110 | if __name__ == "__main__": 111 | main() 112 | -------------------------------------------------------------------------------- /cosyvoice/utils/common.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) 2 | # 2024 Alibaba Inc (authors: Xiang Lyu) 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # Modified from ESPnet(https://github.com/espnet/espnet) 16 | """Unility functions for Transformer.""" 17 | 18 | from typing import List 19 | 20 | import torch 21 | 22 | IGNORE_ID = -1 23 | 24 | 25 | def pad_list(xs: List[torch.Tensor], pad_value: int): 26 | """Perform padding for the list of tensors. 27 | 28 | Args: 29 | xs (List): List of Tensors [(T_1, `*`), (T_2, `*`), ..., (T_B, `*`)]. 30 | pad_value (float): Value for padding. 31 | 32 | Returns: 33 | Tensor: Padded tensor (B, Tmax, `*`). 34 | 35 | Examples: 36 | >>> x = [torch.ones(4), torch.ones(2), torch.ones(1)] 37 | >>> x 38 | [tensor([1., 1., 1., 1.]), tensor([1., 1.]), tensor([1.])] 39 | >>> pad_list(x, 0) 40 | tensor([[1., 1., 1., 1.], 41 | [1., 1., 0., 0.], 42 | [1., 0., 0., 0.]]) 43 | 44 | """ 45 | max_len = max([len(item) for item in xs]) 46 | batchs = len(xs) 47 | ndim = xs[0].ndim 48 | if ndim == 1: 49 | pad_res = torch.zeros(batchs, 50 | max_len, 51 | dtype=xs[0].dtype, 52 | device=xs[0].device) 53 | elif ndim == 2: 54 | pad_res = torch.zeros(batchs, 55 | max_len, 56 | xs[0].shape[1], 57 | dtype=xs[0].dtype, 58 | device=xs[0].device) 59 | elif ndim == 3: 60 | pad_res = torch.zeros(batchs, 61 | max_len, 62 | xs[0].shape[1], 63 | xs[0].shape[2], 64 | dtype=xs[0].dtype, 65 | device=xs[0].device) 66 | else: 67 | raise ValueError(f"Unsupported ndim: {ndim}") 68 | pad_res.fill_(pad_value) 69 | for i in range(batchs): 70 | pad_res[i, :len(xs[i])] = xs[i] 71 | return pad_res 72 | 73 | 74 | def th_accuracy(pad_outputs: torch.Tensor, pad_targets: torch.Tensor, 75 | ignore_label: int) -> torch.Tensor: 76 | """Calculate accuracy. 77 | 78 | Args: 79 | pad_outputs (Tensor): Prediction tensors (B * Lmax, D). 80 | pad_targets (LongTensor): Target label tensors (B, Lmax). 81 | ignore_label (int): Ignore label id. 82 | 83 | Returns: 84 | torch.Tensor: Accuracy value (0.0 - 1.0). 
85 | 86 | """ 87 | pad_pred = pad_outputs.view(pad_targets.size(0), pad_targets.size(1), 88 | pad_outputs.size(1)).argmax(2) 89 | mask = pad_targets != ignore_label 90 | numerator = torch.sum( 91 | pad_pred.masked_select(mask) == pad_targets.masked_select(mask)) 92 | denominator = torch.sum(mask) 93 | return (numerator / denominator).detach() 94 | 95 | 96 | def get_padding(kernel_size, dilation=1): 97 | return int((kernel_size * dilation - dilation) / 2) 98 | 99 | 100 | def init_weights(m, mean=0.0, std=0.01): 101 | classname = m.__class__.__name__ 102 | if classname.find("Conv") != -1: 103 | m.weight.data.normal_(mean, std) 104 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/matcha/text/cleaners.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/keithito/tacotron 2 | 3 | Cleaners are transformations that run over the input text at both training and eval time. 4 | 5 | Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners" 6 | hyperparameter. Some cleaners are English-specific. You'll typically want to use: 7 | 1. "english_cleaners" for English text 8 | 2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using 9 | the Unidecode library (https://pypi.python.org/pypi/Unidecode) 10 | 3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update 11 | the symbols in symbols.py to match your data). 12 | """ 13 | 14 | import logging 15 | import re 16 | 17 | import phonemizer 18 | import piper_phonemize 19 | from unidecode import unidecode 20 | 21 | # To avoid excessive logging, we set the log level of the phonemizer package to CRITICAL 22 | critical_logger = logging.getLogger("phonemizer") 23 | critical_logger.setLevel(logging.CRITICAL) 24 | 25 | # Initializing the phonemizer globally avoids re-initializing it on every call, 26 | # which significantly speeds up repeated cleaning. 27 | # It might be less flexible, but it is much, much faster. 28 | global_phonemizer = phonemizer.backend.EspeakBackend( 29 | language="en-us", 30 | preserve_punctuation=True, 31 | with_stress=True, 32 | language_switch="remove-flags", 33 | logger=critical_logger, 34 | ) 35 | 36 | 37 | # Regular expression matching whitespace: 38 | _whitespace_re = re.compile(r"\s+") 39 | 40 | # List of (regular expression, replacement) pairs for abbreviations: 41 | _abbreviations = [ 42 | (re.compile("\\b%s\\.
% x[0], re.IGNORECASE), x[1]) 43 | for x in [ 44 | ("mrs", "misess"), 45 | ("mr", "mister"), 46 | ("dr", "doctor"), 47 | ("st", "saint"), 48 | ("co", "company"), 49 | ("jr", "junior"), 50 | ("maj", "major"), 51 | ("gen", "general"), 52 | ("drs", "doctors"), 53 | ("rev", "reverend"), 54 | ("lt", "lieutenant"), 55 | ("hon", "honorable"), 56 | ("sgt", "sergeant"), 57 | ("capt", "captain"), 58 | ("esq", "esquire"), 59 | ("ltd", "limited"), 60 | ("col", "colonel"), 61 | ("ft", "fort"), 62 | ] 63 | ] 64 | 65 | 66 | def expand_abbreviations(text): 67 | for regex, replacement in _abbreviations: 68 | text = re.sub(regex, replacement, text) 69 | return text 70 | 71 | 72 | def lowercase(text): 73 | return text.lower() 74 | 75 | 76 | def collapse_whitespace(text): 77 | return re.sub(_whitespace_re, " ", text) 78 | 79 | 80 | def convert_to_ascii(text): 81 | return unidecode(text) 82 | 83 | 84 | def basic_cleaners(text): 85 | """Basic pipeline that lowercases and collapses whitespace without transliteration.""" 86 | text = lowercase(text) 87 | text = collapse_whitespace(text) 88 | return text 89 | 90 | 91 | def transliteration_cleaners(text): 92 | """Pipeline for non-English text that transliterates to ASCII.""" 93 | text = convert_to_ascii(text) 94 | text = lowercase(text) 95 | text = collapse_whitespace(text) 96 | return text 97 | 98 | 99 | def english_cleaners2(text): 100 | """Pipeline for English text, including abbreviation expansion. + punctuation + stress""" 101 | text = convert_to_ascii(text) 102 | text = lowercase(text) 103 | text = expand_abbreviations(text) 104 | phonemes = global_phonemizer.phonemize([text], strip=True, njobs=1)[0] 105 | phonemes = collapse_whitespace(phonemes) 106 | return phonemes 107 | 108 | 109 | def english_cleaners_piper(text): 110 | """Pipeline for English text, including abbreviation expansion. + punctuation + stress""" 111 | text = convert_to_ascii(text) 112 | text = lowercase(text) 113 | text = expand_abbreviations(text) 114 | phonemes = "".join(piper_phonemize.phonemize_espeak(text=text, voice="en-US")[0]) 115 | phonemes = collapse_whitespace(phonemes) 116 | return phonemes 117 | -------------------------------------------------------------------------------- /cosyvoice/cli/cosyvoice.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu) 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | import os 15 | import torch 16 | from hyperpyyaml import load_hyperpyyaml 17 | from huggingface_hub import snapshot_download 18 | from cosyvoice.cli.frontend import CosyVoiceFrontEnd 19 | from cosyvoice.cli.model import CosyVoiceModel 20 | 21 | class CosyVoice: 22 | 23 | def __init__(self, model_dir): 24 | instruct = True if '-Instruct' in model_dir else False 25 | self.model_dir = model_dir 26 | if not os.path.exists(model_dir): 27 | model_dir = snapshot_download(model_dir) 28 | with open('{}/cosyvoice.yaml'.format(model_dir), 'r') as f: 29 | configs = load_hyperpyyaml(f) 30 | self.frontend = CosyVoiceFrontEnd(configs['get_tokenizer'], 31 | configs['feat_extractor'], 32 | '{}/campplus.onnx'.format(model_dir), 33 | '{}/speech_tokenizer_v1.onnx'.format(model_dir), 34 | '{}/spk2info.pt'.format(model_dir), 35 | instruct, 36 | configs['allowed_special']) 37 | self.model = CosyVoiceModel(configs['llm'], configs['flow'], configs['hift']) 38 | self.model.load('{}/llm.pt'.format(model_dir), 39 | '{}/flow.pt'.format(model_dir), 40 | '{}/hift.pt'.format(model_dir)) 41 | del configs 42 | 43 | def list_avaliable_spks(self): 44 | spks = list(self.frontend.spk2info.keys()) 45 | return spks 46 | 47 | def inference_sft(self, tts_text, spk_id): 48 | tts_speeches = [] 49 | for i in self.frontend.text_normalize(tts_text, split=True): 50 | model_input = self.frontend.frontend_sft(i, spk_id) 51 | model_output = self.model.inference(**model_input) 52 | tts_speeches.append(model_output['tts_speech']) 53 | return {'tts_speech': torch.concat(tts_speeches, dim=1)} 54 | 55 | def inference_zero_shot(self, tts_text, prompt_text, prompt_speech_16k): 56 | prompt_text = self.frontend.text_normalize(prompt_text, split=False) 57 | tts_speeches = [] 58 | for i in self.frontend.text_normalize(tts_text, split=True): 59 | model_input = self.frontend.frontend_zero_shot(i, prompt_text, prompt_speech_16k) 60 | model_output = self.model.inference(**model_input) 61 | tts_speeches.append(model_output['tts_speech']) 62 | return {'tts_speech': torch.concat(tts_speeches, dim=1)} 63 | 64 | def inference_cross_lingual(self, tts_text, prompt_speech_16k): 65 | if self.frontend.instruct is True: 66 | raise ValueError('{} do not support cross_lingual inference'.format(self.model_dir)) 67 | tts_speeches = [] 68 | for i in self.frontend.text_normalize(tts_text, split=True): 69 | model_input = self.frontend.frontend_cross_lingual(i, prompt_speech_16k) 70 | model_output = self.model.inference(**model_input) 71 | tts_speeches.append(model_output['tts_speech']) 72 | return {'tts_speech': torch.concat(tts_speeches, dim=1)} 73 | 74 | def inference_instruct(self, tts_text, spk_id, instruct_text): 75 | if self.frontend.instruct is False: 76 | raise ValueError('{} do not support instruct inference'.format(self.model_dir)) 77 | instruct_text = self.frontend.text_normalize(instruct_text, split=False) 78 | tts_speeches = [] 79 | for i in self.frontend.text_normalize(tts_text, split=True): 80 | model_input = self.frontend.frontend_instruct(i, spk_id, instruct_text) 81 | model_output = self.model.inference(**model_input) 82 | tts_speeches.append(model_output['tts_speech']) 83 | return {'tts_speech': torch.concat(tts_speeches, dim=1)} 84 | -------------------------------------------------------------------------------- /cosyvoice/utils/frontend_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Zhihao Du) 2 | # 3 | # Licensed under the Apache License, 
Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import re 16 | chinese_char_pattern = re.compile(r'[\u4e00-\u9fff]+') 17 | 18 | # whether contain chinese character 19 | def contains_chinese(text): 20 | return bool(chinese_char_pattern.search(text)) 21 | 22 | 23 | # replace special symbol 24 | def replace_corner_mark(text): 25 | text = text.replace('²', '平方') 26 | text = text.replace('³', '立方') 27 | return text 28 | 29 | 30 | # remove meaningless symbol 31 | def remove_bracket(text): 32 | text = text.replace('(', '').replace(')', '') 33 | text = text.replace('【', '').replace('】', '') 34 | text = text.replace('`', '').replace('`', '') 35 | text = text.replace("——", " ") 36 | return text 37 | 38 | 39 | # spell Arabic numerals 40 | def spell_out_number(text: str, inflect_parser): 41 | new_text = [] 42 | st = None 43 | for i, c in enumerate(text): 44 | if not c.isdigit(): 45 | if st is not None: 46 | num_str = inflect_parser.number_to_words(text[st: i]) 47 | new_text.append(num_str) 48 | st = None 49 | new_text.append(c) 50 | else: 51 | if st is None: 52 | st = i 53 | if st is not None and st < len(text): 54 | num_str = inflect_parser.number_to_words(text[st:]) 55 | new_text.append(num_str) 56 | return ''.join(new_text) 57 | 58 | 59 | # split paragrah logic: 60 | # 1. per sentence max len token_max_n, min len token_min_n, merge if last sentence len less than merge_len 61 | # 2. cal sentence len according to lang 62 | # 3. 
split sentence according to puncatation 63 | def split_paragraph(text: str, tokenize, lang="zh", token_max_n=80, token_min_n=60, merge_len=20, comma_split=False): 64 | def calc_utt_length(_text: str): 65 | if lang == "zh": 66 | return len(_text) 67 | else: 68 | return len(tokenize(_text)) 69 | 70 | def should_merge(_text: str): 71 | if lang == "zh": 72 | return len(_text) < merge_len 73 | else: 74 | return len(tokenize(_text)) < merge_len 75 | 76 | if lang == "zh": 77 | pounc = ['。', '?', '!', ';', ':', '、', '.', '?', '!', ';'] 78 | else: 79 | pounc = ['.', '?', '!', ';', ':'] 80 | if comma_split: 81 | pounc.extend([',', ',']) 82 | st = 0 83 | utts = [] 84 | for i, c in enumerate(text): 85 | if c in pounc: 86 | if len(text[st: i]) > 0: 87 | utts.append(text[st: i] + c) 88 | if i + 1 < len(text) and text[i + 1] in ['"', '”']: 89 | tmp = utts.pop(-1) 90 | utts.append(tmp + text[i + 1]) 91 | st = i + 2 92 | else: 93 | st = i + 1 94 | if len(utts) == 0: 95 | if lang == "zh": 96 | utts.append(text + '。') 97 | else: 98 | utts.append(text + '.') 99 | final_utts = [] 100 | cur_utt = "" 101 | for utt in utts: 102 | if calc_utt_length(cur_utt + utt) > token_max_n and calc_utt_length(cur_utt) > token_min_n: 103 | final_utts.append(cur_utt) 104 | cur_utt = "" 105 | cur_utt = cur_utt + utt 106 | if len(cur_utt) > 0: 107 | if should_merge(cur_utt) and len(final_utts) != 0: 108 | final_utts[-1] = final_utts[-1] + cur_utt 109 | else: 110 | final_utts.append(cur_utt) 111 | 112 | return final_utts 113 | 114 | 115 | # remove blank between chinese character 116 | def replace_blank(text: str): 117 | out_str = [] 118 | for i, c in enumerate(text): 119 | if c == " ": 120 | if ((text[i + 1].isascii() and text[i + 1] != " ") and 121 | (text[i - 1].isascii() and text[i - 1] != " ")): 122 | out_str.append(c) 123 | else: 124 | out_str.append(c) 125 | return "".join(out_str) 126 | -------------------------------------------------------------------------------- /cosyvoice/transformer/positionwise_feed_forward.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019 Shigeki Karita 2 | # 2020 Mobvoi Inc (Binbin Zhang) 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Positionwise feed forward layer definition.""" 16 | 17 | import torch 18 | 19 | 20 | class PositionwiseFeedForward(torch.nn.Module): 21 | """Positionwise feed forward layer. 22 | 23 | FeedForward are appied on each position of the sequence. 24 | The output dim is same with the input dim. 25 | 26 | Args: 27 | idim (int): Input dimenstion. 28 | hidden_units (int): The number of hidden units. 29 | dropout_rate (float): Dropout rate. 
30 | activation (torch.nn.Module): Activation function 31 | """ 32 | 33 | def __init__( 34 | self, 35 | idim: int, 36 | hidden_units: int, 37 | dropout_rate: float, 38 | activation: torch.nn.Module = torch.nn.ReLU(), 39 | ): 40 | """Construct a PositionwiseFeedForward object.""" 41 | super(PositionwiseFeedForward, self).__init__() 42 | self.w_1 = torch.nn.Linear(idim, hidden_units) 43 | self.activation = activation 44 | self.dropout = torch.nn.Dropout(dropout_rate) 45 | self.w_2 = torch.nn.Linear(hidden_units, idim) 46 | 47 | def forward(self, xs: torch.Tensor) -> torch.Tensor: 48 | """Forward function. 49 | 50 | Args: 51 | xs: input tensor (B, L, D) 52 | Returns: 53 | output tensor, (B, L, D) 54 | """ 55 | return self.w_2(self.dropout(self.activation(self.w_1(xs)))) 56 | 57 | 58 | class MoEFFNLayer(torch.nn.Module): 59 | """ 60 | Mixture-of-experts positionwise feed forward layer 61 | See also figure 1 in https://arxiv.org/pdf/2305.15663.pdf 62 | The output dim is the same as the input dim. 63 | 64 | Modified from https://github.com/Lightning-AI/lit-gpt/pull/823 65 | https://github.com/mistralai/mistral-src/blob/b46d6/moe_one_file_ref.py#L203-L219 66 | Args: 67 | n_expert: number of experts. 68 | n_expert_per_token: The actual number of experts used for each frame 69 | idim (int): Input dimension. 70 | hidden_units (int): The number of hidden units. 71 | dropout_rate (float): Dropout rate. 72 | activation (torch.nn.Module): Activation function 73 | """ 74 | 75 | def __init__( 76 | self, 77 | n_expert: int, 78 | n_expert_per_token: int, 79 | idim: int, 80 | hidden_units: int, 81 | dropout_rate: float, 82 | activation: torch.nn.Module = torch.nn.ReLU(), 83 | ): 84 | super(MoEFFNLayer, self).__init__() 85 | self.gate = torch.nn.Linear(idim, n_expert, bias=False) 86 | self.experts = torch.nn.ModuleList( 87 | PositionwiseFeedForward(idim, hidden_units, dropout_rate, 88 | activation) for _ in range(n_expert)) 89 | self.n_expert_per_token = n_expert_per_token 90 | 91 | def forward(self, xs: torch.Tensor) -> torch.Tensor: 92 | """Forward function.
93 | Args: 94 | xs: input tensor (B, L, D) 95 | Returns: 96 | output tensor, (B, L, D) 97 | 98 | """ 99 | B, L, D = xs.size( 100 | ) # batch size, sequence length, embedding dimension (idim) 101 | xs = xs.view(-1, D) # (B*L, D) 102 | router = self.gate(xs) # (B*L, n_expert) 103 | logits, indices = torch.topk( 104 | router, self.n_expert_per_token 105 | ) # probs:(B*L, n_expert), indices: (B*L, n_expert) 106 | weights = torch.nn.functional.softmax( 107 | logits, dim=1, 108 | dtype=torch.float).to(dtype=xs.dtype) # (B*L, n_expert_per_token) 109 | output = torch.zeros_like(xs) # (B*L, D) 110 | for i, expert in enumerate(self.experts): 111 | mask = indices == i 112 | batch_idx, ith_expert = torch.where(mask) 113 | output[batch_idx] += weights[batch_idx, ith_expert, None] * expert( 114 | xs[batch_idx]) 115 | return output.view(B, L, D) 116 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/matcha/train.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Dict, List, Optional, Tuple 2 | 3 | import hydra 4 | import lightning as L 5 | import rootutils 6 | from lightning import Callback, LightningDataModule, LightningModule, Trainer 7 | from lightning.pytorch.loggers import Logger 8 | from omegaconf import DictConfig 9 | 10 | from matcha import utils 11 | 12 | rootutils.setup_root(__file__, indicator=".project-root", pythonpath=True) 13 | # ------------------------------------------------------------------------------------ # 14 | # the setup_root above is equivalent to: 15 | # - adding project root dir to PYTHONPATH 16 | # (so you don't need to force user to install project as a package) 17 | # (necessary before importing any local modules e.g. `from src import utils`) 18 | # - setting up PROJECT_ROOT environment variable 19 | # (which is used as a base for paths in "configs/paths/default.yaml") 20 | # (this way all filepaths are the same no matter where you run the code) 21 | # - loading environment variables from ".env" in root dir 22 | # 23 | # you can remove it if you: 24 | # 1. either install project as a package or move entry files to project root dir 25 | # 2. set `root_dir` to "." in "configs/paths/default.yaml" 26 | # 27 | # more info: https://github.com/ashleve/rootutils 28 | # ------------------------------------------------------------------------------------ # 29 | 30 | 31 | log = utils.get_pylogger(__name__) 32 | 33 | 34 | @utils.task_wrapper 35 | def train(cfg: DictConfig) -> Tuple[Dict[str, Any], Dict[str, Any]]: 36 | """Trains the model. Can additionally evaluate on a testset, using best weights obtained during 37 | training. 38 | 39 | This method is wrapped in optional @task_wrapper decorator, that controls the behavior during 40 | failure. Useful for multiruns, saving info about the crash, etc. 41 | 42 | :param cfg: A DictConfig configuration composed by Hydra. 43 | :return: A tuple with metrics and dict with all instantiated objects. 
44 | """ 45 | # set seed for random number generators in pytorch, numpy and python.random 46 | if cfg.get("seed"): 47 | L.seed_everything(cfg.seed, workers=True) 48 | 49 | log.info(f"Instantiating datamodule <{cfg.data._target_}>") # pylint: disable=protected-access 50 | datamodule: LightningDataModule = hydra.utils.instantiate(cfg.data) 51 | 52 | log.info(f"Instantiating model <{cfg.model._target_}>") # pylint: disable=protected-access 53 | model: LightningModule = hydra.utils.instantiate(cfg.model) 54 | 55 | log.info("Instantiating callbacks...") 56 | callbacks: List[Callback] = utils.instantiate_callbacks(cfg.get("callbacks")) 57 | 58 | log.info("Instantiating loggers...") 59 | logger: List[Logger] = utils.instantiate_loggers(cfg.get("logger")) 60 | 61 | log.info(f"Instantiating trainer <{cfg.trainer._target_}>") # pylint: disable=protected-access 62 | trainer: Trainer = hydra.utils.instantiate(cfg.trainer, callbacks=callbacks, logger=logger) 63 | 64 | object_dict = { 65 | "cfg": cfg, 66 | "datamodule": datamodule, 67 | "model": model, 68 | "callbacks": callbacks, 69 | "logger": logger, 70 | "trainer": trainer, 71 | } 72 | 73 | if logger: 74 | log.info("Logging hyperparameters!") 75 | utils.log_hyperparameters(object_dict) 76 | 77 | if cfg.get("train"): 78 | log.info("Starting training!") 79 | trainer.fit(model=model, datamodule=datamodule, ckpt_path=cfg.get("ckpt_path")) 80 | 81 | train_metrics = trainer.callback_metrics 82 | 83 | if cfg.get("test"): 84 | log.info("Starting testing!") 85 | ckpt_path = trainer.checkpoint_callback.best_model_path 86 | if ckpt_path == "": 87 | log.warning("Best ckpt not found! Using current weights for testing...") 88 | ckpt_path = None 89 | trainer.test(model=model, datamodule=datamodule, ckpt_path=ckpt_path) 90 | log.info(f"Best ckpt path: {ckpt_path}") 91 | 92 | test_metrics = trainer.callback_metrics 93 | 94 | # merge train and test metrics 95 | metric_dict = {**train_metrics, **test_metrics} 96 | 97 | return metric_dict, object_dict 98 | 99 | 100 | @hydra.main(version_base="1.3", config_path="../configs", config_name="train.yaml") 101 | def main(cfg: DictConfig) -> Optional[float]: 102 | """Main entry point for training. 103 | 104 | :param cfg: DictConfig configuration composed by Hydra. 105 | :return: Optional[float] with optimized metric value. 106 | """ 107 | # apply extra utilities 108 | # (e.g. ask for tags if none are provided in cfg, print cfg tree, etc.) 
109 | utils.extras(cfg) 110 | 111 | # train the model 112 | metric_dict, _ = train(cfg) 113 | 114 | # safely retrieve metric value for hydra-based hyperparameter optimization 115 | metric_value = utils.get_metric_value(metric_dict=metric_dict, metric_name=cfg.get("optimized_metric")) 116 | 117 | # return optimized metric 118 | return metric_value 119 | 120 | 121 | if __name__ == "__main__": 122 | main() # pylint: disable=no-value-for-parameter 123 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/matcha/models/components/flow_matching.py: -------------------------------------------------------------------------------- 1 | from abc import ABC 2 | 3 | import torch 4 | import torch.nn.functional as F 5 | 6 | from matcha.models.components.decoder import Decoder 7 | from matcha.utils.pylogger import get_pylogger 8 | 9 | log = get_pylogger(__name__) 10 | 11 | 12 | class BASECFM(torch.nn.Module, ABC): 13 | def __init__( 14 | self, 15 | n_feats, 16 | cfm_params, 17 | n_spks=1, 18 | spk_emb_dim=128, 19 | ): 20 | super().__init__() 21 | self.n_feats = n_feats 22 | self.n_spks = n_spks 23 | self.spk_emb_dim = spk_emb_dim 24 | self.solver = cfm_params.solver 25 | if hasattr(cfm_params, "sigma_min"): 26 | self.sigma_min = cfm_params.sigma_min 27 | else: 28 | self.sigma_min = 1e-4 29 | 30 | self.estimator = None 31 | 32 | @torch.inference_mode() 33 | def forward(self, mu, mask, n_timesteps, temperature=1.0, spks=None, cond=None): 34 | """Forward diffusion 35 | 36 | Args: 37 | mu (torch.Tensor): output of encoder 38 | shape: (batch_size, n_feats, mel_timesteps) 39 | mask (torch.Tensor): output_mask 40 | shape: (batch_size, 1, mel_timesteps) 41 | n_timesteps (int): number of diffusion steps 42 | temperature (float, optional): temperature for scaling noise. Defaults to 1.0. 43 | spks (torch.Tensor, optional): speaker ids. Defaults to None. 44 | shape: (batch_size, spk_emb_dim) 45 | cond: Not used but kept for future purposes 46 | 47 | Returns: 48 | sample: generated mel-spectrogram 49 | shape: (batch_size, n_feats, mel_timesteps) 50 | """ 51 | z = torch.randn_like(mu) * temperature 52 | t_span = torch.linspace(0, 1, n_timesteps + 1, device=mu.device) 53 | return self.solve_euler(z, t_span=t_span, mu=mu, mask=mask, spks=spks, cond=cond) 54 | 55 | def solve_euler(self, x, t_span, mu, mask, spks, cond): 56 | """ 57 | Fixed euler solver for ODEs. 58 | Args: 59 | x (torch.Tensor): random noise 60 | t_span (torch.Tensor): n_timesteps interpolated 61 | shape: (n_timesteps + 1,) 62 | mu (torch.Tensor): output of encoder 63 | shape: (batch_size, n_feats, mel_timesteps) 64 | mask (torch.Tensor): output_mask 65 | shape: (batch_size, 1, mel_timesteps) 66 | spks (torch.Tensor, optional): speaker ids. Defaults to None. 
67 | shape: (batch_size, spk_emb_dim) 68 | cond: Not used but kept for future purposes 69 | """ 70 | t, _, dt = t_span[0], t_span[-1], t_span[1] - t_span[0] 71 | 72 | # I am storing this because I can later plot it by putting a debugger here and saving it to a file 73 | # Or in future might add like a return_all_steps flag 74 | sol = [] 75 | 76 | for step in range(1, len(t_span)): 77 | dphi_dt = self.estimator(x, mask, mu, t, spks, cond) 78 | 79 | x = x + dt * dphi_dt 80 | t = t + dt 81 | sol.append(x) 82 | if step < len(t_span) - 1: 83 | dt = t_span[step + 1] - t 84 | 85 | return sol[-1] 86 | 87 | def compute_loss(self, x1, mask, mu, spks=None, cond=None): 88 | """Computes diffusion loss 89 | 90 | Args: 91 | x1 (torch.Tensor): Target 92 | shape: (batch_size, n_feats, mel_timesteps) 93 | mask (torch.Tensor): target mask 94 | shape: (batch_size, 1, mel_timesteps) 95 | mu (torch.Tensor): output of encoder 96 | shape: (batch_size, n_feats, mel_timesteps) 97 | spks (torch.Tensor, optional): speaker embedding. Defaults to None. 98 | shape: (batch_size, spk_emb_dim) 99 | 100 | Returns: 101 | loss: conditional flow matching loss 102 | y: conditional flow 103 | shape: (batch_size, n_feats, mel_timesteps) 104 | """ 105 | b, _, t = mu.shape 106 | 107 | # random timestep 108 | t = torch.rand([b, 1, 1], device=mu.device, dtype=mu.dtype) 109 | # sample noise p(x_0) 110 | z = torch.randn_like(x1) 111 | 112 | y = (1 - (1 - self.sigma_min) * t) * z + t * x1 113 | u = x1 - (1 - self.sigma_min) * z 114 | 115 | loss = F.mse_loss(self.estimator(y, mask, mu, t.squeeze(), spks), u, reduction="sum") / ( 116 | torch.sum(mask) * u.shape[1] 117 | ) 118 | return loss, y 119 | 120 | 121 | class CFM(BASECFM): 122 | def __init__(self, in_channels, out_channel, cfm_params, decoder_params, n_spks=1, spk_emb_dim=64): 123 | super().__init__( 124 | n_feats=in_channels, 125 | cfm_params=cfm_params, 126 | n_spks=n_spks, 127 | spk_emb_dim=spk_emb_dim, 128 | ) 129 | 130 | in_channels = in_channels + (spk_emb_dim if n_spks > 1 else 0) 131 | # Just change the architecture of the estimator here 132 | self.estimator = Decoder(in_channels=in_channels, out_channels=out_channel, **decoder_params) 133 | -------------------------------------------------------------------------------- /cosyvoice/transformer/decoder_layer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019 Shigeki Karita 2 | # 2020 Mobvoi Inc (Binbin Zhang) 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Decoder self-attention layer definition.""" 16 | from typing import Optional, Tuple 17 | 18 | import torch 19 | from torch import nn 20 | 21 | 22 | class DecoderLayer(nn.Module): 23 | """Single decoder layer module. 24 | 25 | Args: 26 | size (int): Input dimension. 27 | self_attn (torch.nn.Module): Self-attention module instance. 28 | `MultiHeadedAttention` instance can be used as the argument. 29 | src_attn (torch.nn.Module): Inter-attention module instance. 
30 | `MultiHeadedAttention` instance can be used as the argument. 31 | If `None` is passed, Inter-attention is not used, such as 32 | CIF, GPT, and other decoder only model. 33 | feed_forward (torch.nn.Module): Feed-forward module instance. 34 | `PositionwiseFeedForward` instance can be used as the argument. 35 | dropout_rate (float): Dropout rate. 36 | normalize_before (bool): 37 | True: use layer_norm before each sub-block. 38 | False: to use layer_norm after each sub-block. 39 | """ 40 | 41 | def __init__( 42 | self, 43 | size: int, 44 | self_attn: nn.Module, 45 | src_attn: Optional[nn.Module], 46 | feed_forward: nn.Module, 47 | dropout_rate: float, 48 | normalize_before: bool = True, 49 | ): 50 | """Construct an DecoderLayer object.""" 51 | super().__init__() 52 | self.size = size 53 | self.self_attn = self_attn 54 | self.src_attn = src_attn 55 | self.feed_forward = feed_forward 56 | self.norm1 = nn.LayerNorm(size, eps=1e-5) 57 | self.norm2 = nn.LayerNorm(size, eps=1e-5) 58 | self.norm3 = nn.LayerNorm(size, eps=1e-5) 59 | self.dropout = nn.Dropout(dropout_rate) 60 | self.normalize_before = normalize_before 61 | 62 | def forward( 63 | self, 64 | tgt: torch.Tensor, 65 | tgt_mask: torch.Tensor, 66 | memory: torch.Tensor, 67 | memory_mask: torch.Tensor, 68 | cache: Optional[torch.Tensor] = None 69 | ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: 70 | """Compute decoded features. 71 | 72 | Args: 73 | tgt (torch.Tensor): Input tensor (#batch, maxlen_out, size). 74 | tgt_mask (torch.Tensor): Mask for input tensor 75 | (#batch, maxlen_out). 76 | memory (torch.Tensor): Encoded memory 77 | (#batch, maxlen_in, size). 78 | memory_mask (torch.Tensor): Encoded memory mask 79 | (#batch, maxlen_in). 80 | cache (torch.Tensor): cached tensors. 81 | (#batch, maxlen_out - 1, size). 82 | 83 | Returns: 84 | torch.Tensor: Output tensor (#batch, maxlen_out, size). 85 | torch.Tensor: Mask for output tensor (#batch, maxlen_out). 86 | torch.Tensor: Encoded memory (#batch, maxlen_in, size). 87 | torch.Tensor: Encoded memory mask (#batch, maxlen_in). 
88 | 89 | """ 90 | residual = tgt 91 | if self.normalize_before: 92 | tgt = self.norm1(tgt) 93 | 94 | if cache is None: 95 | tgt_q = tgt 96 | tgt_q_mask = tgt_mask 97 | else: 98 | # compute only the last frame query keeping dim: max_time_out -> 1 99 | assert cache.shape == ( 100 | tgt.shape[0], 101 | tgt.shape[1] - 1, 102 | self.size, 103 | ), "{cache.shape} == {(tgt.shape[0], tgt.shape[1] - 1, self.size)}" 104 | tgt_q = tgt[:, -1:, :] 105 | residual = residual[:, -1:, :] 106 | tgt_q_mask = tgt_mask[:, -1:, :] 107 | 108 | x = residual + self.dropout( 109 | self.self_attn(tgt_q, tgt, tgt, tgt_q_mask)[0]) 110 | if not self.normalize_before: 111 | x = self.norm1(x) 112 | 113 | if self.src_attn is not None: 114 | residual = x 115 | if self.normalize_before: 116 | x = self.norm2(x) 117 | x = residual + self.dropout( 118 | self.src_attn(x, memory, memory, memory_mask)[0]) 119 | if not self.normalize_before: 120 | x = self.norm2(x) 121 | 122 | residual = x 123 | if self.normalize_before: 124 | x = self.norm3(x) 125 | x = residual + self.dropout(self.feed_forward(x)) 126 | if not self.normalize_before: 127 | x = self.norm3(x) 128 | 129 | if cache is not None: 130 | x = torch.cat([cache, x], dim=1) 131 | 132 | return x, tgt_mask, memory, memory_mask 133 | -------------------------------------------------------------------------------- /cosyvoice/utils/executor.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) 2 | # 2024 Alibaba Inc (authors: Xiang Lyu) 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | import logging 17 | from contextlib import nullcontext 18 | import os 19 | 20 | import torch 21 | import torch.distributed as dist 22 | 23 | from cosyvoice.utils.train_utils import update_parameter_and_lr, log_per_step, log_per_save, batch_forward, batch_backward, save_model, cosyvoice_join 24 | 25 | 26 | class Executor: 27 | 28 | def __init__(self): 29 | self.step = 0 30 | self.epoch = 0 31 | self.rank = int(os.environ.get('RANK', 0)) 32 | self.device = torch.device('cuda:{}'.format(self.rank)) 33 | 34 | def train_one_epoc(self, model, optimizer, scheduler, train_data_loader, cv_data_loader, writer, info_dict, group_join): 35 | ''' Train one epoch 36 | ''' 37 | 38 | lr = optimizer.param_groups[0]['lr'] 39 | logging.info('Epoch {} TRAIN info lr {} rank {}'.format(self.epoch, lr, self.rank)) 40 | logging.info('using accumulate grad, new batch size is {} times' 41 | ' larger than before'.format(info_dict['accum_grad'])) 42 | # A context manager to be used in conjunction with an instance of 43 | # torch.nn.parallel.DistributedDataParallel to be able to train 44 | # with uneven inputs across participating processes. 
45 | model.train() 46 | model_context = model.join if info_dict['train_engine'] == 'torch_ddp' else nullcontext 47 | with model_context(): 48 | for batch_idx, batch_dict in enumerate(train_data_loader): 49 | info_dict["tag"] = "TRAIN" 50 | info_dict["step"] = self.step 51 | info_dict["epoch"] = self.epoch 52 | info_dict["batch_idx"] = batch_idx 53 | if cosyvoice_join(group_join, info_dict): 54 | break 55 | 56 | # Disable gradient synchronizations across DDP processes. 57 | # Within this context, gradients will be accumulated on module 58 | # variables, which will later be synchronized. 59 | if info_dict['train_engine'] == 'torch_ddp' and (batch_idx + 1) % info_dict["accum_grad"] != 0: 60 | context = model.no_sync 61 | # Used for single gpu training and DDP gradient synchronization 62 | # processes. 63 | else: 64 | context = nullcontext 65 | 66 | with context(): 67 | info_dict = batch_forward(model, batch_dict, info_dict) 68 | info_dict = batch_backward(model, info_dict) 69 | 70 | info_dict = update_parameter_and_lr(model, optimizer, scheduler, info_dict) 71 | log_per_step(writer, info_dict) 72 | # NOTE specify save_per_step in cosyvoice.yaml if you want to enable step save 73 | if info_dict['save_per_step'] > 0 and (self.step + 1) % info_dict['save_per_step'] == 0 and (batch_idx + 1) % info_dict["accum_grad"] == 0: 74 | dist.barrier() 75 | self.cv(model, cv_data_loader, writer, info_dict, on_batch_end=False) 76 | model.train() 77 | if (batch_idx + 1) % info_dict["accum_grad"] == 0: 78 | self.step += 1 79 | dist.barrier() 80 | self.cv(model, cv_data_loader, writer, info_dict, on_batch_end=True) 81 | 82 | @torch.inference_mode() 83 | def cv(self, model, cv_data_loader, writer, info_dict, on_batch_end=True): 84 | ''' Cross validation on 85 | ''' 86 | logging.info('Epoch {} Step {} on_batch_end {} CV rank {}'.format(self.epoch, self.step + 1, on_batch_end, self.rank)) 87 | model.eval() 88 | total_num_utts, total_loss_dict = 0, {} # avoid division by 0 89 | for batch_idx, batch_dict in enumerate(cv_data_loader): 90 | info_dict["tag"] = "CV" 91 | info_dict["step"] = self.step 92 | info_dict["epoch"] = self.epoch 93 | info_dict["batch_idx"] = batch_idx 94 | 95 | num_utts = len(batch_dict["utts"]) 96 | total_num_utts += num_utts 97 | 98 | info_dict = batch_forward(model, batch_dict, info_dict) 99 | 100 | for k, v in info_dict['loss_dict'].items(): 101 | if k not in total_loss_dict: 102 | total_loss_dict[k] = [] 103 | total_loss_dict[k].append(v.item() * num_utts) 104 | log_per_step(None, info_dict) 105 | for k, v in total_loss_dict.items(): 106 | total_loss_dict[k] = sum(v) / total_num_utts 107 | info_dict['loss_dict'] = total_loss_dict 108 | log_per_save(writer, info_dict) 109 | model_name = 'epoch_{}_whole'.format(self.epoch) if on_batch_end else 'epoch_{}_step_{}'.format(self.epoch, self.step + 1) 110 | save_model(model, model_name, info_dict) 111 | -------------------------------------------------------------------------------- /cosyvoice/bin/train.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu) 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from __future__ import print_function 16 | import argparse 17 | import datetime 18 | import logging 19 | logging.getLogger('matplotlib').setLevel(logging.WARNING) 20 | from copy import deepcopy 21 | import torch 22 | import torch.distributed as dist 23 | import deepspeed 24 | 25 | from hyperpyyaml import load_hyperpyyaml 26 | 27 | from torch.distributed.elastic.multiprocessing.errors import record 28 | 29 | from cosyvoice.utils.executor import Executor 30 | from cosyvoice.utils.train_utils import ( 31 | init_distributed, 32 | init_dataset_and_dataloader, 33 | init_optimizer_and_scheduler, 34 | init_summarywriter, save_model, 35 | wrap_cuda_model, check_modify_and_save_config) 36 | 37 | 38 | def get_args(): 39 | parser = argparse.ArgumentParser(description='training your network') 40 | parser.add_argument('--train_engine', 41 | default='torch_ddp', 42 | choices=['torch_ddp', 'deepspeed'], 43 | help='Engine for paralleled training') 44 | parser.add_argument('--model', required=True, help='model which will be trained') 45 | parser.add_argument('--config', required=True, help='config file') 46 | parser.add_argument('--train_data', required=True, help='train data file') 47 | parser.add_argument('--cv_data', required=True, help='cv data file') 48 | parser.add_argument('--checkpoint', help='checkpoint model') 49 | parser.add_argument('--model_dir', required=True, help='save model dir') 50 | parser.add_argument('--tensorboard_dir', 51 | default='tensorboard', 52 | help='tensorboard log dir') 53 | parser.add_argument('--ddp.dist_backend', 54 | dest='dist_backend', 55 | default='nccl', 56 | choices=['nccl', 'gloo'], 57 | help='distributed backend') 58 | parser.add_argument('--num_workers', 59 | default=0, 60 | type=int, 61 | help='num of subprocess workers for reading') 62 | parser.add_argument('--prefetch', 63 | default=100, 64 | type=int, 65 | help='prefetch number') 66 | parser.add_argument('--pin_memory', 67 | action='store_true', 68 | default=False, 69 | help='Use pinned memory buffers used for reading') 70 | parser.add_argument('--deepspeed.save_states', 71 | dest='save_states', 72 | default='model_only', 73 | choices=['model_only', 'model+optimizer'], 74 | help='save model/optimizer states') 75 | parser.add_argument('--timeout', 76 | default=30, 77 | type=int, 78 | help='timeout (in seconds) of cosyvoice_join.') 79 | parser = deepspeed.add_config_arguments(parser) 80 | args = parser.parse_args() 81 | return args 82 | 83 | 84 | @record 85 | def main(): 86 | args = get_args() 87 | logging.basicConfig(level=logging.DEBUG, 88 | format='%(asctime)s %(levelname)s %(message)s') 89 | 90 | override_dict = {k: None for k in ['llm', 'flow', 'hift'] if k != args.model} 91 | with open(args.config, 'r') as f: 92 | configs = load_hyperpyyaml(f, overrides=override_dict) 93 | configs['train_conf'].update(vars(args)) 94 | 95 | # Init env for ddp 96 | init_distributed(args) 97 | 98 | # Get dataset & dataloader 99 | train_dataset, cv_dataset, train_data_loader, cv_data_loader = \ 100 | init_dataset_and_dataloader(args, configs) 101 | 102 | # Do some sanity checks 
and save config to args.model_dir 103 | configs = check_modify_and_save_config(args, configs) 104 | 105 | # Tensorboard summary 106 | writer = init_summarywriter(args) 107 | 108 | # load checkpoint 109 | model = configs[args.model] 110 | if args.checkpoint is not None: 111 | model.load_state_dict(torch.load(args.checkpoint, map_location='cpu')) 112 | 113 | # Dispatch model from cpu to gpu 114 | model = wrap_cuda_model(args, model) 115 | 116 | # Get optimizer & scheduler 117 | model, optimizer, scheduler = init_optimizer_and_scheduler(args, configs, model) 118 | 119 | # Save init checkpoints 120 | info_dict = deepcopy(configs['train_conf']) 121 | save_model(model, 'init', info_dict) 122 | 123 | # Get executor 124 | executor = Executor() 125 | 126 | # Start training loop 127 | for epoch in range(info_dict['max_epoch']): 128 | executor.epoch = epoch 129 | train_dataset.set_epoch(epoch) 130 | dist.barrier() 131 | group_join = dist.new_group(backend="gloo", timeout=datetime.timedelta(seconds=args.timeout)) 132 | executor.train_one_epoc(model, optimizer, scheduler, train_data_loader, cv_data_loader, writer, info_dict, group_join) 133 | dist.destroy_process_group(group_join) 134 | 135 | if __name__ == '__main__': 136 | main() 137 | -------------------------------------------------------------------------------- /cosyvoice/bin/inference.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu) 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License.
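# Example invocation (a sketch only; every path below is a placeholder rather than a shipped
# default -- substitute your own config, parquet data lists, tts_text json and checkpoints):
#
#     python cosyvoice/bin/inference.py --mode sft --gpu 0 \
#         --config conf/cosyvoice.yaml \
#         --prompt_data data/test/parquet/data.list \
#         --prompt_utt2data data/test/parquet/utt2data.list \
#         --tts_text data/test/tts_text.json \
#         --llm_model exp/llm.pt --flow_model exp/flow.pt --hifigan_model exp/hift.pt \
#         --result_dir results/sft
#
# One wav per (utt, tts_index) pair is written to --result_dir, together with a Kaldi-style
# wav.scp that lists them.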
14 | 15 | from __future__ import print_function 16 | 17 | import argparse 18 | import logging 19 | logging.getLogger('matplotlib').setLevel(logging.WARNING) 20 | import os 21 | 22 | import torch 23 | from torch.utils.data import DataLoader 24 | import torchaudio 25 | from hyperpyyaml import load_hyperpyyaml 26 | from tqdm import tqdm 27 | from cosyvoice.cli.model import CosyVoiceModel 28 | 29 | from cosyvoice.dataset.dataset import Dataset 30 | 31 | def get_args(): 32 | parser = argparse.ArgumentParser(description='inference with your model') 33 | parser.add_argument('--config', required=True, help='config file') 34 | parser.add_argument('--prompt_data', required=True, help='prompt data file') 35 | parser.add_argument('--prompt_utt2data', required=True, help='prompt data file') 36 | parser.add_argument('--tts_text', required=True, help='tts input file') 37 | parser.add_argument('--llm_model', required=True, help='llm model file') 38 | parser.add_argument('--flow_model', required=True, help='flow model file') 39 | parser.add_argument('--hifigan_model', required=True, help='hifigan model file') 40 | parser.add_argument('--gpu', 41 | type=int, 42 | default=-1, 43 | help='gpu id for this rank, -1 for cpu') 44 | parser.add_argument('--mode', 45 | default='sft', 46 | choices=['sft', 'zero_shot'], 47 | help='inference mode') 48 | parser.add_argument('--result_dir', required=True, help='asr result file') 49 | args = parser.parse_args() 50 | print(args) 51 | return args 52 | 53 | 54 | def main(): 55 | args = get_args() 56 | logging.basicConfig(level=logging.DEBUG, 57 | format='%(asctime)s %(levelname)s %(message)s') 58 | os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu) 59 | 60 | # Init cosyvoice models from configs 61 | use_cuda = args.gpu >= 0 and torch.cuda.is_available() 62 | device = torch.device('cuda' if use_cuda else 'cpu') 63 | with open(args.config, 'r') as f: 64 | configs = load_hyperpyyaml(f) 65 | 66 | model = CosyVoiceModel(configs['llm'], configs['flow'], configs['hift']) 67 | model.load(args.llm_model, args.flow_model, args.hifigan_model) 68 | 69 | test_dataset = Dataset(args.prompt_data, data_pipeline=configs['data_pipeline'], mode='inference', shuffle=False, partition=False, tts_file=args.tts_text, prompt_utt2data=args.prompt_utt2data) 70 | test_data_loader = DataLoader(test_dataset, batch_size=None, num_workers=0) 71 | 72 | del configs 73 | os.makedirs(args.result_dir, exist_ok=True) 74 | fn = os.path.join(args.result_dir, 'wav.scp') 75 | f = open(fn, 'w') 76 | with torch.no_grad(): 77 | for batch_idx, batch in tqdm(enumerate(test_data_loader)): 78 | utts = batch["utts"] 79 | assert len(utts) == 1, "inference mode only support batchsize 1" 80 | text = batch["text"] 81 | text_token = batch["text_token"].to(device) 82 | text_token_len = batch["text_token_len"].to(device) 83 | tts_text = batch["tts_text"] 84 | tts_index = batch["tts_index"] 85 | tts_text_token = batch["tts_text_token"].to(device) 86 | tts_text_token_len = batch["tts_text_token_len"].to(device) 87 | speech_token = batch["speech_token"].to(device) 88 | speech_token_len = batch["speech_token_len"].to(device) 89 | speech_feat = batch["speech_feat"].to(device) 90 | speech_feat_len = batch["speech_feat_len"].to(device) 91 | utt_embedding = batch["utt_embedding"].to(device) 92 | spk_embedding = batch["spk_embedding"].to(device) 93 | if args.mode == 'sft': 94 | model_input = {'text': tts_text_token, 'text_len': tts_text_token_len, 95 | 'llm_embedding': spk_embedding, 'flow_embedding': spk_embedding} 96 | else: 97 | 
model_input = {'text': tts_text_token, 'text_len': tts_text_token_len, 98 | 'prompt_text': text_token, 'prompt_text_len': text_token_len, 99 | 'llm_prompt_speech_token': speech_token, 'llm_prompt_speech_token_len': speech_token_len, 100 | 'flow_prompt_speech_token': speech_token, 'flow_prompt_speech_token_len': speech_token_len, 101 | 'prompt_speech_feat': speech_feat, 'prompt_speech_feat_len': speech_feat_len, 102 | 'llm_embedding': utt_embedding, 'flow_embedding': utt_embedding} 103 | model_output = model.inference(**model_input) 104 | tts_key = '{}_{}'.format(utts[0], tts_index[0]) 105 | tts_fn = os.path.join(args.result_dir, '{}.wav'.format(tts_key)) 106 | torchaudio.save(tts_fn, model_output['tts_speech'], sample_rate=22050) 107 | f.write('{} {}\n'.format(tts_key, tts_fn)) 108 | f.flush() 109 | f.close() 110 | logging.info('Result wav.scp saved in {}'.format(fn)) 111 | 112 | 113 | if __name__ == '__main__': 114 | main() 115 | -------------------------------------------------------------------------------- /cosyvoice/transformer/convolution.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) 2 | # 2024 Alibaba Inc (Xiang Lyu) 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # Modified from ESPnet(https://github.com/espnet/espnet) 16 | """ConvolutionModule definition.""" 17 | 18 | from typing import Tuple 19 | 20 | import torch 21 | from torch import nn 22 | 23 | 24 | class ConvolutionModule(nn.Module): 25 | """ConvolutionModule in Conformer model.""" 26 | 27 | def __init__(self, 28 | channels: int, 29 | kernel_size: int = 15, 30 | activation: nn.Module = nn.ReLU(), 31 | norm: str = "batch_norm", 32 | causal: bool = False, 33 | bias: bool = True): 34 | """Construct an ConvolutionModule object. 35 | Args: 36 | channels (int): The number of channels of conv layers. 37 | kernel_size (int): Kernel size of conv layers. 38 | causal (int): Whether use causal convolution or not 39 | """ 40 | super().__init__() 41 | 42 | self.pointwise_conv1 = nn.Conv1d( 43 | channels, 44 | 2 * channels, 45 | kernel_size=1, 46 | stride=1, 47 | padding=0, 48 | bias=bias, 49 | ) 50 | # self.lorder is used to distinguish if it's a causal convolution, 51 | # if self.lorder > 0: it's a causal convolution, the input will be 52 | # padded with self.lorder frames on the left in forward. 
53 | # else: it's a symmetrical convolution 54 | if causal: 55 | padding = 0 56 | self.lorder = kernel_size - 1 57 | else: 58 | # kernel_size should be an odd number for none causal convolution 59 | assert (kernel_size - 1) % 2 == 0 60 | padding = (kernel_size - 1) // 2 61 | self.lorder = 0 62 | self.depthwise_conv = nn.Conv1d( 63 | channels, 64 | channels, 65 | kernel_size, 66 | stride=1, 67 | padding=padding, 68 | groups=channels, 69 | bias=bias, 70 | ) 71 | 72 | assert norm in ['batch_norm', 'layer_norm'] 73 | if norm == "batch_norm": 74 | self.use_layer_norm = False 75 | self.norm = nn.BatchNorm1d(channels) 76 | else: 77 | self.use_layer_norm = True 78 | self.norm = nn.LayerNorm(channels) 79 | 80 | self.pointwise_conv2 = nn.Conv1d( 81 | channels, 82 | channels, 83 | kernel_size=1, 84 | stride=1, 85 | padding=0, 86 | bias=bias, 87 | ) 88 | self.activation = activation 89 | 90 | def forward( 91 | self, 92 | x: torch.Tensor, 93 | mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), 94 | cache: torch.Tensor = torch.zeros((0, 0, 0)), 95 | ) -> Tuple[torch.Tensor, torch.Tensor]: 96 | """Compute convolution module. 97 | Args: 98 | x (torch.Tensor): Input tensor (#batch, time, channels). 99 | mask_pad (torch.Tensor): used for batch padding (#batch, 1, time), 100 | (0, 0, 0) means fake mask. 101 | cache (torch.Tensor): left context cache, it is only 102 | used in causal convolution (#batch, channels, cache_t), 103 | (0, 0, 0) meas fake cache. 104 | Returns: 105 | torch.Tensor: Output tensor (#batch, time, channels). 106 | """ 107 | # exchange the temporal dimension and the feature dimension 108 | x = x.transpose(1, 2) # (#batch, channels, time) 109 | 110 | # mask batch padding 111 | if mask_pad.size(2) > 0: # time > 0 112 | x.masked_fill_(~mask_pad, 0.0) 113 | 114 | if self.lorder > 0: 115 | if cache.size(2) == 0: # cache_t == 0 116 | x = nn.functional.pad(x, (self.lorder, 0), 'constant', 0.0) 117 | else: 118 | assert cache.size(0) == x.size(0) # equal batch 119 | assert cache.size(1) == x.size(1) # equal channel 120 | x = torch.cat((cache, x), dim=2) 121 | assert (x.size(2) > self.lorder) 122 | new_cache = x[:, :, -self.lorder:] 123 | else: 124 | # It's better we just return None if no cache is required, 125 | # However, for JIT export, here we just fake one tensor instead of 126 | # None. 127 | new_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device) 128 | 129 | # GLU mechanism 130 | x = self.pointwise_conv1(x) # (batch, 2*channel, dim) 131 | x = nn.functional.glu(x, dim=1) # (batch, channel, dim) 132 | 133 | # 1D Depthwise Conv 134 | x = self.depthwise_conv(x) 135 | if self.use_layer_norm: 136 | x = x.transpose(1, 2) 137 | x = self.activation(self.norm(x)) 138 | if self.use_layer_norm: 139 | x = x.transpose(1, 2) 140 | x = self.pointwise_conv2(x) 141 | # mask batch padding 142 | if mask_pad.size(2) > 0: # time > 0 143 | x.masked_fill_(~mask_pad, 0.0) 144 | 145 | return x.transpose(1, 2), new_cache 146 | -------------------------------------------------------------------------------- /cosyvoice/dataset/dataset.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang) 2 | # 2024 Alibaba Inc (authors: Xiang Lyu) 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | import random 17 | import json 18 | import math 19 | from functools import partial 20 | 21 | import torch 22 | import torch.distributed as dist 23 | from torch.utils.data import IterableDataset 24 | from cosyvoice.utils.file_utils import read_lists, read_json_lists 25 | 26 | 27 | class Processor(IterableDataset): 28 | 29 | def __init__(self, source, f, *args, **kw): 30 | assert callable(f) 31 | self.source = source 32 | self.f = f 33 | self.args = args 34 | self.kw = kw 35 | 36 | def set_epoch(self, epoch): 37 | self.source.set_epoch(epoch) 38 | 39 | def __iter__(self): 40 | """ Return an iterator over the source dataset processed by the 41 | given processor. 42 | """ 43 | assert self.source is not None 44 | assert callable(self.f) 45 | return self.f(iter(self.source), *self.args, **self.kw) 46 | 47 | def apply(self, f): 48 | assert callable(f) 49 | return Processor(self, f, *self.args, **self.kw) 50 | 51 | 52 | class DistributedSampler: 53 | 54 | def __init__(self, shuffle=True, partition=True): 55 | self.epoch = -1 56 | self.update() 57 | self.shuffle = shuffle 58 | self.partition = partition 59 | 60 | def update(self): 61 | assert dist.is_available() 62 | if dist.is_initialized(): 63 | self.rank = dist.get_rank() 64 | self.world_size = dist.get_world_size() 65 | else: 66 | self.rank = 0 67 | self.world_size = 1 68 | worker_info = torch.utils.data.get_worker_info() 69 | if worker_info is None: 70 | self.worker_id = 0 71 | self.num_workers = 1 72 | else: 73 | self.worker_id = worker_info.id 74 | self.num_workers = worker_info.num_workers 75 | return dict(rank=self.rank, 76 | world_size=self.world_size, 77 | worker_id=self.worker_id, 78 | num_workers=self.num_workers) 79 | 80 | def set_epoch(self, epoch): 81 | self.epoch = epoch 82 | 83 | def sample(self, data): 84 | """ Sample data according to rank/world_size/num_workers 85 | 86 | Args: 87 | data(List): input data list 88 | 89 | Returns: 90 | List: data list after sample 91 | """ 92 | data = list(range(len(data))) 93 | # force datalist even 94 | if self.partition: 95 | if self.shuffle: 96 | random.Random(self.epoch).shuffle(data) 97 | if len(data) < self.world_size: 98 | data = data * math.ceil(self.world_size / len(data)) 99 | data = data[:self.world_size] 100 | data = data[self.rank::self.world_size] 101 | if len(data) < self.num_workers: 102 | data = data * math.ceil(self.num_workers / len(data)) 103 | data = data[:self.num_workers] 104 | data = data[self.worker_id::self.num_workers] 105 | return data 106 | 107 | 108 | class DataList(IterableDataset): 109 | 110 | def __init__(self, lists, shuffle=True, partition=True): 111 | self.lists = lists 112 | self.sampler = DistributedSampler(shuffle, partition) 113 | 114 | def set_epoch(self, epoch): 115 | self.sampler.set_epoch(epoch) 116 | 117 | def __iter__(self): 118 | sampler_info = self.sampler.update() 119 | indexes = self.sampler.sample(self.lists) 120 | for index in indexes: 121 | data = dict(src=self.lists[index]) 122 | data.update(sampler_info) 123 | yield data 124 | 125 | 126 | def Dataset(data_list_file, 127 | data_pipeline, 128 | 
mode='train', 129 | shuffle=True, 130 | partition=True, 131 | tts_file='', 132 | prompt_utt2data=''): 133 | """ Construct dataset from arguments 134 | 135 | We have two shuffle stage in the Dataset. The first is global 136 | shuffle at shards tar/raw file level. The second is global shuffle 137 | at training samples level. 138 | 139 | Args: 140 | data_type(str): raw/shard 141 | tokenizer (BaseTokenizer): tokenizer to tokenize 142 | partition(bool): whether to do data partition in terms of rank 143 | """ 144 | assert mode in ['train', 'inference'] 145 | lists = read_lists(data_list_file) 146 | if mode == 'inference': 147 | with open(tts_file) as f: 148 | tts_data = json.load(f) 149 | utt2lists = read_json_lists(prompt_utt2data) 150 | # filter unnecessary file in inference mode 151 | lists = list(set([utt2lists[utt] for utt in tts_data.keys() if utt2lists[utt] in lists])) 152 | dataset = DataList(lists, 153 | shuffle=shuffle, 154 | partition=partition) 155 | if mode == 'inference': 156 | # map partial arg tts_data in inference mode 157 | data_pipeline[0] = partial(data_pipeline[0], tts_data=tts_data) 158 | for func in data_pipeline: 159 | dataset = Processor(dataset, func, mode=mode) 160 | return dataset 161 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/matcha/hifigan/README.md: -------------------------------------------------------------------------------- 1 | # HiFi-GAN: Generative Adversarial Networks for Efficient and High Fidelity Speech Synthesis 2 | 3 | ### Jungil Kong, Jaehyeon Kim, Jaekyoung Bae 4 | 5 | In our [paper](https://arxiv.org/abs/2010.05646), 6 | we proposed HiFi-GAN: a GAN-based model capable of generating high fidelity speech efficiently.
7 | We provide our implementation and pretrained models as open source in this repository. 8 | 9 | **Abstract :** 10 | Several recent work on speech synthesis have employed generative adversarial networks (GANs) to produce raw waveforms. 11 | Although such methods improve the sampling efficiency and memory usage, 12 | their sample quality has not yet reached that of autoregressive and flow-based generative models. 13 | In this work, we propose HiFi-GAN, which achieves both efficient and high-fidelity speech synthesis. 14 | As speech audio consists of sinusoidal signals with various periods, 15 | we demonstrate that modeling periodic patterns of an audio is crucial for enhancing sample quality. 16 | A subjective human evaluation (mean opinion score, MOS) of a single speaker dataset indicates that our proposed method 17 | demonstrates similarity to human quality while generating 22.05 kHz high-fidelity audio 167.9 times faster than 18 | real-time on a single V100 GPU. We further show the generality of HiFi-GAN to the mel-spectrogram inversion of unseen 19 | speakers and end-to-end speech synthesis. Finally, a small footprint version of HiFi-GAN generates samples 13.4 times 20 | faster than real-time on CPU with comparable quality to an autoregressive counterpart. 21 | 22 | Visit our [demo website](https://jik876.github.io/hifi-gan-demo/) for audio samples. 23 | 24 | ## Pre-requisites 25 | 26 | 1. Python >= 3.6 27 | 2. Clone this repository. 28 | 3. Install python requirements. Please refer [requirements.txt](requirements.txt) 29 | 4. Download and extract the [LJ Speech dataset](https://keithito.com/LJ-Speech-Dataset/). 30 | And move all wav files to `LJSpeech-1.1/wavs` 31 | 32 | ## Training 33 | 34 | ``` 35 | python train.py --config config_v1.json 36 | ``` 37 | 38 | To train V2 or V3 Generator, replace `config_v1.json` with `config_v2.json` or `config_v3.json`.
39 | Checkpoints and copy of the configuration file are saved in `cp_hifigan` directory by default.
40 | You can change the path by adding `--checkpoint_path` option. 41 | 42 | Validation loss during training with V1 generator.
43 | ![validation loss](./validation_loss.png) 44 | 45 | ## Pretrained Model 46 | 47 | You can also use pretrained models we provide.
48 | [Download pretrained models](https://drive.google.com/drive/folders/1-eEYTB5Av9jNql0WGBlRoi-WH2J7bp5Y?usp=sharing)
49 | Details of each folder are as follows: 50 | 51 | | Folder Name | Generator | Dataset | Fine-Tuned | 52 | | ------------ | --------- | --------- | ------------------------------------------------------ | 53 | | LJ_V1 | V1 | LJSpeech | No | 54 | | LJ_V2 | V2 | LJSpeech | No | 55 | | LJ_V3 | V3 | LJSpeech | No | 56 | | LJ_FT_T2_V1 | V1 | LJSpeech | Yes ([Tacotron2](https://github.com/NVIDIA/tacotron2)) | 57 | | LJ_FT_T2_V2 | V2 | LJSpeech | Yes ([Tacotron2](https://github.com/NVIDIA/tacotron2)) | 58 | | LJ_FT_T2_V3 | V3 | LJSpeech | Yes ([Tacotron2](https://github.com/NVIDIA/tacotron2)) | 59 | | VCTK_V1 | V1 | VCTK | No | 60 | | VCTK_V2 | V2 | VCTK | No | 61 | | VCTK_V3 | V3 | VCTK | No | 62 | | UNIVERSAL_V1 | V1 | Universal | No | 63 | 64 | We provide the universal model with discriminator weights that can be used as a base for transfer learning to other datasets. 65 | 66 | ## Fine-Tuning 67 | 68 | 1. Generate mel-spectrograms in numpy format using [Tacotron2](https://github.com/NVIDIA/tacotron2) with teacher-forcing.
69 | The file name of the generated mel-spectrogram should match the audio file and the extension should be `.npy`.
70 | Example: 71 | ` Audio File : LJ001-0001.wav 72 | Mel-Spectrogram File : LJ001-0001.npy` 73 | 2. Create `ft_dataset` folder and copy the generated mel-spectrogram files into it.
74 | 3. Run the following command. 75 | ``` 76 | python train.py --fine_tuning True --config config_v1.json 77 | ``` 78 | For other command line options, please refer to the training section. 79 | 80 | ## Inference from wav file 81 | 82 | 1. Make `test_files` directory and copy wav files into the directory. 83 | 2. Run the following command. 84 | ` python inference.py --checkpoint_file [generator checkpoint file path]` 85 | Generated wav files are saved in `generated_files` by default.
86 | You can change the path by adding `--output_dir` option. 87 | 88 | ## Inference for end-to-end speech synthesis 89 | 90 | 1. Make `test_mel_files` directory and copy generated mel-spectrogram files into the directory.
91 | You can generate mel-spectrograms using [Tacotron2](https://github.com/NVIDIA/tacotron2), 92 | [Glow-TTS](https://github.com/jaywalnut310/glow-tts) and so forth. 93 | 2. Run the following command. 94 | ` python inference_e2e.py --checkpoint_file [generator checkpoint file path]` 95 | Generated wav files are saved in `generated_files_from_mel` by default.
96 | You can change the path by adding `--output_dir` option. 97 | 98 | ## Acknowledgements 99 | 100 | We referred to [WaveGlow](https://github.com/NVIDIA/waveglow), [MelGAN](https://github.com/descriptinc/melgan-neurips) 101 | and [Tacotron2](https://github.com/NVIDIA/tacotron2) to implement this. 102 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/matcha/onnx/export.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import random 3 | from pathlib import Path 4 | 5 | import numpy as np 6 | import torch 7 | from lightning import LightningModule 8 | 9 | from matcha.cli import VOCODER_URLS, load_matcha, load_vocoder 10 | 11 | DEFAULT_OPSET = 15 12 | 13 | SEED = 1234 14 | random.seed(SEED) 15 | np.random.seed(SEED) 16 | torch.manual_seed(SEED) 17 | torch.cuda.manual_seed(SEED) 18 | torch.backends.cudnn.deterministic = True 19 | torch.backends.cudnn.benchmark = False 20 | 21 | 22 | class MatchaWithVocoder(LightningModule): 23 | def __init__(self, matcha, vocoder): 24 | super().__init__() 25 | self.matcha = matcha 26 | self.vocoder = vocoder 27 | 28 | def forward(self, x, x_lengths, scales, spks=None): 29 | mel, mel_lengths = self.matcha(x, x_lengths, scales, spks) 30 | wavs = self.vocoder(mel).clamp(-1, 1) 31 | lengths = mel_lengths * 256 32 | return wavs.squeeze(1), lengths 33 | 34 | 35 | def get_exportable_module(matcha, vocoder, n_timesteps): 36 | """ 37 | Return an appropriate `LighteningModule` and output-node names 38 | based on whether the vocoder is embedded in the final graph 39 | """ 40 | 41 | def onnx_forward_func(x, x_lengths, scales, spks=None): 42 | """ 43 | Custom forward function for accepting 44 | scaler parameters as tensors 45 | """ 46 | # Extract scaler parameters from tensors 47 | temperature = scales[0] 48 | length_scale = scales[1] 49 | output = matcha.synthesise(x, x_lengths, n_timesteps, temperature, spks, length_scale) 50 | return output["mel"], output["mel_lengths"] 51 | 52 | # Monkey-patch Matcha's forward function 53 | matcha.forward = onnx_forward_func 54 | 55 | if vocoder is None: 56 | model, output_names = matcha, ["mel", "mel_lengths"] 57 | else: 58 | model = MatchaWithVocoder(matcha, vocoder) 59 | output_names = ["wav", "wav_lengths"] 60 | return model, output_names 61 | 62 | 63 | def get_inputs(is_multi_speaker): 64 | """ 65 | Create dummy inputs for tracing 66 | """ 67 | dummy_input_length = 50 68 | x = torch.randint(low=0, high=20, size=(1, dummy_input_length), dtype=torch.long) 69 | x_lengths = torch.LongTensor([dummy_input_length]) 70 | 71 | # Scales 72 | temperature = 0.667 73 | length_scale = 1.0 74 | scales = torch.Tensor([temperature, length_scale]) 75 | 76 | model_inputs = [x, x_lengths, scales] 77 | input_names = [ 78 | "x", 79 | "x_lengths", 80 | "scales", 81 | ] 82 | 83 | if is_multi_speaker: 84 | spks = torch.LongTensor([1]) 85 | model_inputs.append(spks) 86 | input_names.append("spks") 87 | 88 | return tuple(model_inputs), input_names 89 | 90 | 91 | def main(): 92 | parser = argparse.ArgumentParser(description="Export 🍵 Matcha-TTS to ONNX") 93 | 94 | parser.add_argument( 95 | "checkpoint_path", 96 | type=str, 97 | help="Path to the model checkpoint", 98 | ) 99 | parser.add_argument("output", type=str, help="Path to output `.onnx` file") 100 | parser.add_argument( 101 | "--n-timesteps", type=int, default=5, help="Number of steps to use for reverse diffusion in decoder (default 5)" 102 | ) 103 | parser.add_argument( 104 | 
"--vocoder-name", 105 | type=str, 106 | choices=list(VOCODER_URLS.keys()), 107 | default=None, 108 | help="Name of the vocoder to embed in the ONNX graph", 109 | ) 110 | parser.add_argument( 111 | "--vocoder-checkpoint-path", 112 | type=str, 113 | default=None, 114 | help="Vocoder checkpoint to embed in the ONNX graph for an `e2e` like experience", 115 | ) 116 | parser.add_argument("--opset", type=int, default=DEFAULT_OPSET, help="ONNX opset version to use (default 15") 117 | 118 | args = parser.parse_args() 119 | 120 | print(f"[🍵] Loading Matcha checkpoint from {args.checkpoint_path}") 121 | print(f"Setting n_timesteps to {args.n_timesteps}") 122 | 123 | checkpoint_path = Path(args.checkpoint_path) 124 | matcha = load_matcha(checkpoint_path.stem, checkpoint_path, "cpu") 125 | 126 | if args.vocoder_name or args.vocoder_checkpoint_path: 127 | assert ( 128 | args.vocoder_name and args.vocoder_checkpoint_path 129 | ), "Both vocoder_name and vocoder-checkpoint are required when embedding the vocoder in the ONNX graph." 130 | vocoder, _ = load_vocoder(args.vocoder_name, args.vocoder_checkpoint_path, "cpu") 131 | else: 132 | vocoder = None 133 | 134 | is_multi_speaker = matcha.n_spks > 1 135 | 136 | dummy_input, input_names = get_inputs(is_multi_speaker) 137 | model, output_names = get_exportable_module(matcha, vocoder, args.n_timesteps) 138 | 139 | # Set dynamic shape for inputs/outputs 140 | dynamic_axes = { 141 | "x": {0: "batch_size", 1: "time"}, 142 | "x_lengths": {0: "batch_size"}, 143 | } 144 | 145 | if vocoder is None: 146 | dynamic_axes.update( 147 | { 148 | "mel": {0: "batch_size", 2: "time"}, 149 | "mel_lengths": {0: "batch_size"}, 150 | } 151 | ) 152 | else: 153 | print("Embedding the vocoder in the ONNX graph") 154 | dynamic_axes.update( 155 | { 156 | "wav": {0: "batch_size", 1: "time"}, 157 | "wav_lengths": {0: "batch_size"}, 158 | } 159 | ) 160 | 161 | if is_multi_speaker: 162 | dynamic_axes["spks"] = {0: "batch_size"} 163 | 164 | # Create the output directory (if not exists) 165 | Path(args.output).parent.mkdir(parents=True, exist_ok=True) 166 | 167 | model.to_onnx( 168 | args.output, 169 | dummy_input, 170 | input_names=input_names, 171 | output_names=output_names, 172 | dynamic_axes=dynamic_axes, 173 | opset_version=args.opset, 174 | export_params=True, 175 | do_constant_folding=True, 176 | ) 177 | print(f"[🍵] ONNX model exported to {args.output}") 178 | 179 | 180 | if __name__ == "__main__": 181 | main() 182 | -------------------------------------------------------------------------------- /cosyvoice/flow/flow_matching.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Zhihao Du) 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | import torch 15 | import torch.nn.functional as F 16 | from matcha.models.components.flow_matching import BASECFM 17 | 18 | class ConditionalCFM(BASECFM): 19 | def __init__(self, in_channels, cfm_params, n_spks=1, spk_emb_dim=64, estimator: torch.nn.Module = None): 20 | super().__init__( 21 | n_feats=in_channels, 22 | cfm_params=cfm_params, 23 | n_spks=n_spks, 24 | spk_emb_dim=spk_emb_dim, 25 | ) 26 | self.t_scheduler = cfm_params.t_scheduler 27 | self.training_cfg_rate = cfm_params.training_cfg_rate 28 | self.inference_cfg_rate = cfm_params.inference_cfg_rate 29 | in_channels = in_channels + (spk_emb_dim if n_spks > 0 else 0) 30 | # Just change the architecture of the estimator here 31 | self.estimator = estimator 32 | 33 | @torch.inference_mode() 34 | def forward(self, mu, mask, n_timesteps, temperature=1.0, spks=None, cond=None): 35 | """Forward diffusion 36 | 37 | Args: 38 | mu (torch.Tensor): output of encoder 39 | shape: (batch_size, n_feats, mel_timesteps) 40 | mask (torch.Tensor): output_mask 41 | shape: (batch_size, 1, mel_timesteps) 42 | n_timesteps (int): number of diffusion steps 43 | temperature (float, optional): temperature for scaling noise. Defaults to 1.0. 44 | spks (torch.Tensor, optional): speaker ids. Defaults to None. 45 | shape: (batch_size, spk_emb_dim) 46 | cond: Not used but kept for future purposes 47 | 48 | Returns: 49 | sample: generated mel-spectrogram 50 | shape: (batch_size, n_feats, mel_timesteps) 51 | """ 52 | z = torch.randn_like(mu) * temperature 53 | t_span = torch.linspace(0, 1, n_timesteps + 1, device=mu.device) 54 | if self.t_scheduler == 'cosine': 55 | t_span = 1 - torch.cos(t_span * 0.5 * torch.pi) 56 | return self.solve_euler(z, t_span=t_span, mu=mu, mask=mask, spks=spks, cond=cond) 57 | 58 | def solve_euler(self, x, t_span, mu, mask, spks, cond): 59 | """ 60 | Fixed euler solver for ODEs. 61 | Args: 62 | x (torch.Tensor): random noise 63 | t_span (torch.Tensor): n_timesteps interpolated 64 | shape: (n_timesteps + 1,) 65 | mu (torch.Tensor): output of encoder 66 | shape: (batch_size, n_feats, mel_timesteps) 67 | mask (torch.Tensor): output_mask 68 | shape: (batch_size, 1, mel_timesteps) 69 | spks (torch.Tensor, optional): speaker ids. Defaults to None. 
70 | shape: (batch_size, spk_emb_dim) 71 | cond: Not used but kept for future purposes 72 | """ 73 | t, _, dt = t_span[0], t_span[-1], t_span[1] - t_span[0] 74 | 75 | # I am storing this because I can later plot it by putting a debugger here and saving it to a file 76 | # Or in future might add like a return_all_steps flag 77 | sol = [] 78 | 79 | for step in range(1, len(t_span)): 80 | dphi_dt = self.estimator(x, mask, mu, t, spks, cond) 81 | # Classifier-Free Guidance inference introduced in VoiceBox 82 | if self.inference_cfg_rate > 0: 83 | cfg_dphi_dt = self.estimator( 84 | x, mask, 85 | torch.zeros_like(mu), t, 86 | torch.zeros_like(spks) if spks is not None else None, 87 | torch.zeros_like(cond) 88 | ) 89 | dphi_dt = ((1.0 + self.inference_cfg_rate) * dphi_dt - 90 | self.inference_cfg_rate * cfg_dphi_dt) 91 | x = x + dt * dphi_dt 92 | t = t + dt 93 | sol.append(x) 94 | if step < len(t_span) - 1: 95 | dt = t_span[step + 1] - t 96 | 97 | return sol[-1] 98 | 99 | def compute_loss(self, x1, mask, mu, spks=None, cond=None): 100 | """Computes diffusion loss 101 | 102 | Args: 103 | x1 (torch.Tensor): Target 104 | shape: (batch_size, n_feats, mel_timesteps) 105 | mask (torch.Tensor): target mask 106 | shape: (batch_size, 1, mel_timesteps) 107 | mu (torch.Tensor): output of encoder 108 | shape: (batch_size, n_feats, mel_timesteps) 109 | spks (torch.Tensor, optional): speaker embedding. Defaults to None. 110 | shape: (batch_size, spk_emb_dim) 111 | 112 | Returns: 113 | loss: conditional flow matching loss 114 | y: conditional flow 115 | shape: (batch_size, n_feats, mel_timesteps) 116 | """ 117 | b, _, t = mu.shape 118 | 119 | # random timestep 120 | t = torch.rand([b, 1, 1], device=mu.device, dtype=mu.dtype) 121 | if self.t_scheduler == 'cosine': 122 | t = 1 - torch.cos(t * 0.5 * torch.pi) 123 | # sample noise p(x_0) 124 | z = torch.randn_like(x1) 125 | 126 | y = (1 - (1 - self.sigma_min) * t) * z + t * x1 127 | u = x1 - (1 - self.sigma_min) * z 128 | 129 | # during training, we randomly drop condition to trade off mode coverage and sample fidelity 130 | if self.training_cfg_rate > 0: 131 | cfg_mask = torch.rand(b, device=x1.device) > self.training_cfg_rate 132 | mu = mu * cfg_mask.view(-1, 1, 1) 133 | spks = spks * cfg_mask.view(-1, 1) 134 | cond = cond * cfg_mask.view(-1, 1, 1) 135 | 136 | pred = self.estimator(y, mask, mu, t.squeeze(), spks, cond) 137 | loss = F.mse_loss(pred * mask, u * mask, reduction="sum") / (torch.sum(mask) * u.shape[1]) 138 | return loss, y 139 | -------------------------------------------------------------------------------- /cosyvoice/flow/flow.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Zhihao Du) 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
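
For reference, the `compute_loss` method above trains the estimator on the optimal-transport conditional flow: with target mel `x1`, noise `z`, and timestep `t`, it builds `y_t = (1 - (1 - sigma_min) * t) * z + t * x1` and regresses the prediction onto `u = x1 - (1 - sigma_min) * z`, which is exactly the time-derivative of `y_t`. A small self-contained check of that identity (tensor shapes are illustrative; `sigma_min` is taken from the `cfm_params` in the decoder config below):

import torch

sigma_min = 1e-06
t = torch.rand(2, 1, 1, dtype=torch.float64)        # one random timestep per sample
z = torch.randn(2, 80, 100, dtype=torch.float64)    # noise sample x_0
x1 = torch.randn(2, 80, 100, dtype=torch.float64)   # stand-in for the target mel

y_t = (1 - (1 - sigma_min) * t) * z + t * x1        # interpolant between noise and data
u = x1 - (1 - sigma_min) * z                        # regression target

# finite-difference check that u equals d(y_t)/dt
eps = 1e-6
y_t_eps = (1 - (1 - sigma_min) * (t + eps)) * z + (t + eps) * x1
assert torch.allclose((y_t_eps - y_t) / eps, u, atol=1e-6)

The masked mean-squared error between the estimator output and `u` is then what `compute_loss` returns.
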
14 | import logging 15 | import random 16 | from typing import Dict, Optional 17 | import torch 18 | import torch.nn as nn 19 | from torch.nn import functional as F 20 | from omegaconf import DictConfig 21 | from cosyvoice.utils.mask import make_pad_mask 22 | 23 | 24 | class MaskedDiffWithXvec(torch.nn.Module): 25 | def __init__(self, 26 | input_size: int = 512, 27 | output_size: int = 80, 28 | spk_embed_dim: int = 192, 29 | output_type: str = "mel", 30 | vocab_size: int = 4096, 31 | input_frame_rate: int = 50, 32 | only_mask_loss: bool = True, 33 | encoder: torch.nn.Module = None, 34 | length_regulator: torch.nn.Module = None, 35 | decoder: torch.nn.Module = None, 36 | decoder_conf: Dict = {'in_channels': 240, 'out_channel': 80, 'spk_emb_dim': 80, 'n_spks': 1, 'cfm_params': DictConfig({'sigma_min': 1e-06, 'solver': 'euler', 't_scheduler': 'cosine', 'training_cfg_rate': 0.2, 'inference_cfg_rate': 0.7, 'reg_loss_type': 'l1'}), 'decoder_params': {'channels': [256, 256], 'dropout': 0.0, 'attention_head_dim': 64, 'n_blocks': 4, 'num_mid_blocks': 12, 'num_heads': 8, 'act_fn': 'gelu'}}, 37 | mel_feat_conf: Dict = {'n_fft': 1024, 'num_mels': 80, 'sampling_rate': 22050, 'hop_size': 256, 'win_size': 1024, 'fmin': 0, 'fmax': 8000}): 38 | super().__init__() 39 | self.input_size = input_size 40 | self.output_size = output_size 41 | self.decoder_conf = decoder_conf 42 | self.mel_feat_conf = mel_feat_conf 43 | self.vocab_size = vocab_size 44 | self.output_type = output_type 45 | self.input_frame_rate = input_frame_rate 46 | logging.info(f"input frame rate={self.input_frame_rate}") 47 | self.input_embedding = nn.Embedding(vocab_size, input_size) 48 | self.spk_embed_affine_layer = torch.nn.Linear(spk_embed_dim, output_size) 49 | self.encoder = encoder 50 | self.encoder_proj = torch.nn.Linear(self.encoder.output_size(), output_size) 51 | self.decoder = decoder 52 | self.length_regulator = length_regulator 53 | self.only_mask_loss = only_mask_loss 54 | 55 | def forward( 56 | self, 57 | batch: dict, 58 | device: torch.device, 59 | ) -> Dict[str, Optional[torch.Tensor]]: 60 | token = batch['speech_token'].to(device) 61 | token_len = batch['speech_token_len'].to(device) 62 | feat = batch['speech_feat'].to(device) 63 | feat_len = batch['speech_feat_len'].to(device) 64 | embedding = batch['embedding'].to(device) 65 | 66 | # xvec projection 67 | embedding = F.normalize(embedding, dim=1) 68 | embedding = self.spk_embed_affine_layer(embedding) 69 | 70 | # concat text and prompt_text 71 | mask = (~make_pad_mask(token_len)).float().unsqueeze(-1).to(device) 72 | token = self.input_embedding(torch.clamp(token, min=0)) * mask 73 | 74 | # text encode 75 | h, h_lengths = self.encoder(token, token_len) 76 | h = self.encoder_proj(h) 77 | h, h_lengths = self.length_regulator(h, feat_len) 78 | 79 | # get conditions 80 | conds = torch.zeros(feat.shape, device=token.device) 81 | for i, j in enumerate(feat_len): 82 | if random.random() < 0.5: 83 | continue 84 | index = random.randint(0, int(0.3 * j)) 85 | conds[i, :index] = feat[i, :index] 86 | conds = conds.transpose(1, 2) 87 | 88 | mask = (~make_pad_mask(feat_len)).to(h) 89 | feat = F.interpolate(feat.unsqueeze(dim=1), size=h.shape[1:], mode="nearest").squeeze(dim=1) 90 | loss, _ = self.decoder.compute_loss( 91 | feat.transpose(1, 2).contiguous(), 92 | mask.unsqueeze(1), 93 | h.transpose(1, 2).contiguous(), 94 | embedding, 95 | cond=conds 96 | ) 97 | return {'loss': loss} 98 | 99 | @torch.inference_mode() 100 | def inference(self, 101 | token, 102 | token_len, 103 | 
prompt_token, 104 | prompt_token_len, 105 | prompt_feat, 106 | prompt_feat_len, 107 | embedding): 108 | assert token.shape[0] == 1 109 | # xvec projection 110 | embedding = F.normalize(embedding, dim=1) 111 | embedding = self.spk_embed_affine_layer(embedding) 112 | 113 | # concat text and prompt_text 114 | token, token_len = torch.concat([prompt_token, token], dim=1), prompt_token_len + token_len 115 | mask = (~make_pad_mask(token_len)).float().unsqueeze(-1).to(embedding) 116 | token = self.input_embedding(torch.clamp(token, min=0)) * mask 117 | 118 | # text encode 119 | h, h_lengths = self.encoder(token, token_len) 120 | h = self.encoder_proj(h) 121 | feat_len = (token_len / 50 * 22050 / 256).int() 122 | h, h_lengths = self.length_regulator(h, feat_len) 123 | 124 | # get conditions 125 | conds = torch.zeros([1, feat_len.max().item(), self.output_size], device=token.device) 126 | if prompt_feat.shape[1] != 0: 127 | for i, j in enumerate(prompt_feat_len): 128 | conds[i, :j] = prompt_feat[i] 129 | conds = conds.transpose(1, 2) 130 | 131 | mask = (~make_pad_mask(feat_len)).to(h) 132 | feat = self.decoder( 133 | mu=h.transpose(1, 2).contiguous(), 134 | mask=mask.unsqueeze(1), 135 | spks=embedding, 136 | cond=conds, 137 | n_timesteps=10 138 | ) 139 | if prompt_feat.shape[1] != 0: 140 | feat = feat[:, :, prompt_feat.shape[1]:] 141 | return feat 142 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/matcha/onnx/infer.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import warnings 4 | from pathlib import Path 5 | from time import perf_counter 6 | 7 | import numpy as np 8 | import onnxruntime as ort 9 | import soundfile as sf 10 | import torch 11 | 12 | from matcha.cli import plot_spectrogram_to_numpy, process_text 13 | 14 | 15 | def validate_args(args): 16 | assert ( 17 | args.text or args.file 18 | ), "Either text or file must be provided. Matcha-T(ea)TTS needs some text to whisk the waveforms."
19 | assert args.temperature >= 0, "Sampling temperature cannot be negative" 20 | assert args.speaking_rate >= 0, "Speaking rate must be greater than 0" 21 | return args 22 | 23 | 24 | def write_wavs(model, inputs, output_dir, external_vocoder=None): 25 | if external_vocoder is None: 26 | print("The provided model has the vocoder embedded in the graph.\nGenerating waveform directly") 27 | t0 = perf_counter() 28 | wavs, wav_lengths = model.run(None, inputs) 29 | infer_secs = perf_counter() - t0 30 | mel_infer_secs = vocoder_infer_secs = None 31 | else: 32 | print("[🍵] Generating mel using Matcha") 33 | mel_t0 = perf_counter() 34 | mels, mel_lengths = model.run(None, inputs) 35 | mel_infer_secs = perf_counter() - mel_t0 36 | print("Generating waveform from mel using external vocoder") 37 | vocoder_inputs = {external_vocoder.get_inputs()[0].name: mels} 38 | vocoder_t0 = perf_counter() 39 | wavs = external_vocoder.run(None, vocoder_inputs)[0] 40 | vocoder_infer_secs = perf_counter() - vocoder_t0 41 | wavs = wavs.squeeze(1) 42 | wav_lengths = mel_lengths * 256 43 | infer_secs = mel_infer_secs + vocoder_infer_secs 44 | 45 | output_dir = Path(output_dir) 46 | output_dir.mkdir(parents=True, exist_ok=True) 47 | for i, (wav, wav_length) in enumerate(zip(wavs, wav_lengths)): 48 | output_filename = output_dir.joinpath(f"output_{i + 1}.wav") 49 | audio = wav[:wav_length] 50 | print(f"Writing audio to {output_filename}") 51 | sf.write(output_filename, audio, 22050, "PCM_24") 52 | 53 | wav_secs = wav_lengths.sum() / 22050 54 | print(f"Inference seconds: {infer_secs}") 55 | print(f"Generated wav seconds: {wav_secs}") 56 | rtf = infer_secs / wav_secs 57 | if mel_infer_secs is not None: 58 | mel_rtf = mel_infer_secs / wav_secs 59 | print(f"Matcha RTF: {mel_rtf}") 60 | if vocoder_infer_secs is not None: 61 | vocoder_rtf = vocoder_infer_secs / wav_secs 62 | print(f"Vocoder RTF: {vocoder_rtf}") 63 | print(f"Overall RTF: {rtf}") 64 | 65 | 66 | def write_mels(model, inputs, output_dir): 67 | t0 = perf_counter() 68 | mels, mel_lengths = model.run(None, inputs) 69 | infer_secs = perf_counter() - t0 70 | 71 | output_dir = Path(output_dir) 72 | output_dir.mkdir(parents=True, exist_ok=True) 73 | for i, mel in enumerate(mels): 74 | output_stem = output_dir.joinpath(f"output_{i + 1}") 75 | plot_spectrogram_to_numpy(mel.squeeze(), output_stem.with_suffix(".png")) 76 | np.save(output_stem.with_suffix(".numpy"), mel) 77 | 78 | wav_secs = (mel_lengths * 256).sum() / 22050 79 | print(f"Inference seconds: {infer_secs}") 80 | print(f"Generated wav seconds: {wav_secs}") 81 | rtf = infer_secs / wav_secs 82 | print(f"RTF: {rtf}") 83 | 84 | 85 | def main(): 86 | parser = argparse.ArgumentParser( 87 | description=" 🍵 Matcha-TTS: A fast TTS architecture with conditional flow matching" 88 | ) 89 | parser.add_argument( 90 | "model", 91 | type=str, 92 | help="ONNX model to use", 93 | ) 94 | parser.add_argument("--vocoder", type=str, default=None, help="Vocoder to use (defaults to None)") 95 | parser.add_argument("--text", type=str, default=None, help="Text to synthesize") 96 | parser.add_argument("--file", type=str, default=None, help="Text file to synthesize") 97 | parser.add_argument("--spk", type=int, default=None, help="Speaker ID") 98 | parser.add_argument( 99 | "--temperature", 100 | type=float, 101 | default=0.667, 102 | help="Variance of the x0 noise (default: 0.667)", 103 | ) 104 | parser.add_argument( 105 | "--speaking-rate", 106 | type=float, 107 | default=1.0, 108 | help="change the speaking rate, a higher value means 
slower speaking rate (default: 1.0)", 109 | ) 110 | parser.add_argument("--gpu", action="store_true", help="Use GPU for inference (default: CPU)") 111 | parser.add_argument( 112 | "--output-dir", 113 | type=str, 114 | default=os.getcwd(), 115 | help="Output folder to save results (default: current dir)", 116 | ) 117 | 118 | args = parser.parse_args() 119 | args = validate_args(args) 120 | 121 | if args.gpu: 122 | providers = ["CUDAExecutionProvider"] 123 | else: 124 | providers = ["CPUExecutionProvider"] 125 | model = ort.InferenceSession(args.model, providers=providers) 126 | 127 | model_inputs = model.get_inputs() 128 | model_outputs = list(model.get_outputs()) 129 | 130 | if args.text: 131 | text_lines = args.text.splitlines() 132 | else: 133 | with open(args.file, encoding="utf-8") as file: 134 | text_lines = file.read().splitlines() 135 | 136 | processed_lines = [process_text(0, line, "cpu") for line in text_lines] 137 | x = [line["x"].squeeze() for line in processed_lines] 138 | # Pad 139 | x = torch.nn.utils.rnn.pad_sequence(x, batch_first=True) 140 | x = x.detach().cpu().numpy() 141 | x_lengths = np.array([line["x_lengths"].item() for line in processed_lines], dtype=np.int64) 142 | inputs = { 143 | "x": x, 144 | "x_lengths": x_lengths, 145 | "scales": np.array([args.temperature, args.speaking_rate], dtype=np.float32), 146 | } 147 | is_multi_speaker = len(model_inputs) == 4 148 | if is_multi_speaker: 149 | if args.spk is None: 150 | args.spk = 0 151 | warn = "[!] Speaker ID not provided! Using speaker ID 0" 152 | warnings.warn(warn, UserWarning) 153 | inputs["spks"] = np.repeat(args.spk, x.shape[0]).astype(np.int64) 154 | 155 | has_vocoder_embedded = model_outputs[0].name == "wav" 156 | if has_vocoder_embedded: 157 | write_wavs(model, inputs, args.output_dir) 158 | elif args.vocoder: 159 | external_vocoder = ort.InferenceSession(args.vocoder, providers=providers) 160 | write_wavs(model, inputs, args.output_dir, external_vocoder=external_vocoder) 161 | else: 162 | warn = "[!] No vocoder is embedded in the graph and no external vocoder was provided. 
The mel output will be written as numpy arrays to `*.npy` files in the output directory" 163 | warnings.warn(warn, UserWarning) 164 | write_mels(model, inputs, args.output_dir) 165 | 166 | 167 | if __name__ == "__main__": 168 | main() 169 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/matcha/hifigan/meldataset.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/jik876/hifi-gan """ 2 | 3 | import math 4 | import os 5 | import random 6 | 7 | import numpy as np 8 | import torch 9 | import torch.utils.data 10 | from librosa.filters import mel as librosa_mel_fn 11 | from librosa.util import normalize 12 | from scipy.io.wavfile import read 13 | 14 | MAX_WAV_VALUE = 32768.0 15 | 16 | 17 | def load_wav(full_path): 18 | sampling_rate, data = read(full_path) 19 | return data, sampling_rate 20 | 21 | 22 | def dynamic_range_compression(x, C=1, clip_val=1e-5): 23 | return np.log(np.clip(x, a_min=clip_val, a_max=None) * C) 24 | 25 | 26 | def dynamic_range_decompression(x, C=1): 27 | return np.exp(x) / C 28 | 29 | 30 | def dynamic_range_compression_torch(x, C=1, clip_val=1e-5): 31 | return torch.log(torch.clamp(x, min=clip_val) * C) 32 | 33 | 34 | def dynamic_range_decompression_torch(x, C=1): 35 | return torch.exp(x) / C 36 | 37 | 38 | def spectral_normalize_torch(magnitudes): 39 | output = dynamic_range_compression_torch(magnitudes) 40 | return output 41 | 42 | 43 | def spectral_de_normalize_torch(magnitudes): 44 | output = dynamic_range_decompression_torch(magnitudes) 45 | return output 46 | 47 | 48 | mel_basis = {} 49 | hann_window = {} 50 | 51 | 52 | def mel_spectrogram(y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False): 53 | if torch.min(y) < -1.0: 54 | print("min value is ", torch.min(y)) 55 | if torch.max(y) > 1.0: 56 | print("max value is ", torch.max(y)) 57 | 58 | global mel_basis, hann_window # pylint: disable=global-statement 59 | if fmax not in mel_basis: 60 | mel = librosa_mel_fn(sampling_rate, n_fft, num_mels, fmin, fmax) 61 | mel_basis[str(fmax) + "_" + str(y.device)] = torch.from_numpy(mel).float().to(y.device) 62 | hann_window[str(y.device)] = torch.hann_window(win_size).to(y.device) 63 | 64 | y = torch.nn.functional.pad( 65 | y.unsqueeze(1), (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)), mode="reflect" 66 | ) 67 | y = y.squeeze(1) 68 | 69 | spec = torch.view_as_real( 70 | torch.stft( 71 | y, 72 | n_fft, 73 | hop_length=hop_size, 74 | win_length=win_size, 75 | window=hann_window[str(y.device)], 76 | center=center, 77 | pad_mode="reflect", 78 | normalized=False, 79 | onesided=True, 80 | return_complex=True, 81 | ) 82 | ) 83 | 84 | spec = torch.sqrt(spec.pow(2).sum(-1) + (1e-9)) 85 | 86 | spec = torch.matmul(mel_basis[str(fmax) + "_" + str(y.device)], spec) 87 | spec = spectral_normalize_torch(spec) 88 | 89 | return spec 90 | 91 | 92 | def get_dataset_filelist(a): 93 | with open(a.input_training_file, encoding="utf-8") as fi: 94 | training_files = [ 95 | os.path.join(a.input_wavs_dir, x.split("|")[0] + ".wav") for x in fi.read().split("\n") if len(x) > 0 96 | ] 97 | 98 | with open(a.input_validation_file, encoding="utf-8") as fi: 99 | validation_files = [ 100 | os.path.join(a.input_wavs_dir, x.split("|")[0] + ".wav") for x in fi.read().split("\n") if len(x) > 0 101 | ] 102 | return training_files, validation_files 103 | 104 | 105 | class MelDataset(torch.utils.data.Dataset): 106 | def __init__( 107 | self, 108 | 
training_files, 109 | segment_size, 110 | n_fft, 111 | num_mels, 112 | hop_size, 113 | win_size, 114 | sampling_rate, 115 | fmin, 116 | fmax, 117 | split=True, 118 | shuffle=True, 119 | n_cache_reuse=1, 120 | device=None, 121 | fmax_loss=None, 122 | fine_tuning=False, 123 | base_mels_path=None, 124 | ): 125 | self.audio_files = training_files 126 | random.seed(1234) 127 | if shuffle: 128 | random.shuffle(self.audio_files) 129 | self.segment_size = segment_size 130 | self.sampling_rate = sampling_rate 131 | self.split = split 132 | self.n_fft = n_fft 133 | self.num_mels = num_mels 134 | self.hop_size = hop_size 135 | self.win_size = win_size 136 | self.fmin = fmin 137 | self.fmax = fmax 138 | self.fmax_loss = fmax_loss 139 | self.cached_wav = None 140 | self.n_cache_reuse = n_cache_reuse 141 | self._cache_ref_count = 0 142 | self.device = device 143 | self.fine_tuning = fine_tuning 144 | self.base_mels_path = base_mels_path 145 | 146 | def __getitem__(self, index): 147 | filename = self.audio_files[index] 148 | if self._cache_ref_count == 0: 149 | audio, sampling_rate = load_wav(filename) 150 | audio = audio / MAX_WAV_VALUE 151 | if not self.fine_tuning: 152 | audio = normalize(audio) * 0.95 153 | self.cached_wav = audio 154 | if sampling_rate != self.sampling_rate: 155 | raise ValueError(f"{sampling_rate} SR doesn't match target {self.sampling_rate} SR") 156 | self._cache_ref_count = self.n_cache_reuse 157 | else: 158 | audio = self.cached_wav 159 | self._cache_ref_count -= 1 160 | 161 | audio = torch.FloatTensor(audio) 162 | audio = audio.unsqueeze(0) 163 | 164 | if not self.fine_tuning: 165 | if self.split: 166 | if audio.size(1) >= self.segment_size: 167 | max_audio_start = audio.size(1) - self.segment_size 168 | audio_start = random.randint(0, max_audio_start) 169 | audio = audio[:, audio_start : audio_start + self.segment_size] 170 | else: 171 | audio = torch.nn.functional.pad(audio, (0, self.segment_size - audio.size(1)), "constant") 172 | 173 | mel = mel_spectrogram( 174 | audio, 175 | self.n_fft, 176 | self.num_mels, 177 | self.sampling_rate, 178 | self.hop_size, 179 | self.win_size, 180 | self.fmin, 181 | self.fmax, 182 | center=False, 183 | ) 184 | else: 185 | mel = np.load(os.path.join(self.base_mels_path, os.path.splitext(os.path.split(filename)[-1])[0] + ".npy")) 186 | mel = torch.from_numpy(mel) 187 | 188 | if len(mel.shape) < 3: 189 | mel = mel.unsqueeze(0) 190 | 191 | if self.split: 192 | frames_per_seg = math.ceil(self.segment_size / self.hop_size) 193 | 194 | if audio.size(1) >= self.segment_size: 195 | mel_start = random.randint(0, mel.size(2) - frames_per_seg - 1) 196 | mel = mel[:, :, mel_start : mel_start + frames_per_seg] 197 | audio = audio[:, mel_start * self.hop_size : (mel_start + frames_per_seg) * self.hop_size] 198 | else: 199 | mel = torch.nn.functional.pad(mel, (0, frames_per_seg - mel.size(2)), "constant") 200 | audio = torch.nn.functional.pad(audio, (0, self.segment_size - audio.size(1)), "constant") 201 | 202 | mel_loss = mel_spectrogram( 203 | audio, 204 | self.n_fft, 205 | self.num_mels, 206 | self.sampling_rate, 207 | self.hop_size, 208 | self.win_size, 209 | self.fmin, 210 | self.fmax_loss, 211 | center=False, 212 | ) 213 | 214 | return (mel.squeeze(), audio.squeeze(0), filename, mel_loss.squeeze()) 215 | 216 | def __len__(self): 217 | return len(self.audio_files) 218 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/matcha/models/baselightningmodule.py: 
-------------------------------------------------------------------------------- 1 | """ 2 | This is a base lightning module that can be used to train a model. 3 | The benefit of this abstraction is that all the logic outside of model definition can be reused for different models. 4 | """ 5 | import inspect 6 | from abc import ABC 7 | from typing import Any, Dict 8 | 9 | import torch 10 | from lightning import LightningModule 11 | from lightning.pytorch.utilities import grad_norm 12 | 13 | from matcha import utils 14 | from matcha.utils.utils import plot_tensor 15 | 16 | log = utils.get_pylogger(__name__) 17 | 18 | 19 | class BaseLightningClass(LightningModule, ABC): 20 | def update_data_statistics(self, data_statistics): 21 | if data_statistics is None: 22 | data_statistics = { 23 | "mel_mean": 0.0, 24 | "mel_std": 1.0, 25 | } 26 | 27 | self.register_buffer("mel_mean", torch.tensor(data_statistics["mel_mean"])) 28 | self.register_buffer("mel_std", torch.tensor(data_statistics["mel_std"])) 29 | 30 | def configure_optimizers(self) -> Any: 31 | optimizer = self.hparams.optimizer(params=self.parameters()) 32 | if self.hparams.scheduler not in (None, {}): 33 | scheduler_args = {} 34 | # Manage last epoch for exponential schedulers 35 | if "last_epoch" in inspect.signature(self.hparams.scheduler.scheduler).parameters: 36 | if hasattr(self, "ckpt_loaded_epoch"): 37 | current_epoch = self.ckpt_loaded_epoch - 1 38 | else: 39 | current_epoch = -1 40 | 41 | scheduler_args.update({"optimizer": optimizer}) 42 | scheduler = self.hparams.scheduler.scheduler(**scheduler_args) 43 | scheduler.last_epoch = current_epoch 44 | return { 45 | "optimizer": optimizer, 46 | "lr_scheduler": { 47 | "scheduler": scheduler, 48 | "interval": self.hparams.scheduler.lightning_args.interval, 49 | "frequency": self.hparams.scheduler.lightning_args.frequency, 50 | "name": "learning_rate", 51 | }, 52 | } 53 | 54 | return {"optimizer": optimizer} 55 | 56 | def get_losses(self, batch): 57 | x, x_lengths = batch["x"], batch["x_lengths"] 58 | y, y_lengths = batch["y"], batch["y_lengths"] 59 | spks = batch["spks"] 60 | 61 | dur_loss, prior_loss, diff_loss = self( 62 | x=x, 63 | x_lengths=x_lengths, 64 | y=y, 65 | y_lengths=y_lengths, 66 | spks=spks, 67 | out_size=self.out_size, 68 | ) 69 | return { 70 | "dur_loss": dur_loss, 71 | "prior_loss": prior_loss, 72 | "diff_loss": diff_loss, 73 | } 74 | 75 | def on_load_checkpoint(self, checkpoint: Dict[str, Any]) -> None: 76 | self.ckpt_loaded_epoch = checkpoint["epoch"] # pylint: disable=attribute-defined-outside-init 77 | 78 | def training_step(self, batch: Any, batch_idx: int): 79 | loss_dict = self.get_losses(batch) 80 | self.log( 81 | "step", 82 | float(self.global_step), 83 | on_step=True, 84 | prog_bar=True, 85 | logger=True, 86 | sync_dist=True, 87 | ) 88 | 89 | self.log( 90 | "sub_loss/train_dur_loss", 91 | loss_dict["dur_loss"], 92 | on_step=True, 93 | on_epoch=True, 94 | logger=True, 95 | sync_dist=True, 96 | ) 97 | self.log( 98 | "sub_loss/train_prior_loss", 99 | loss_dict["prior_loss"], 100 | on_step=True, 101 | on_epoch=True, 102 | logger=True, 103 | sync_dist=True, 104 | ) 105 | self.log( 106 | "sub_loss/train_diff_loss", 107 | loss_dict["diff_loss"], 108 | on_step=True, 109 | on_epoch=True, 110 | logger=True, 111 | sync_dist=True, 112 | ) 113 | 114 | total_loss = sum(loss_dict.values()) 115 | self.log( 116 | "loss/train", 117 | total_loss, 118 | on_step=True, 119 | on_epoch=True, 120 | logger=True, 121 | prog_bar=True, 122 | sync_dist=True, 123 | ) 124 | 125 | return 
{"loss": total_loss, "log": loss_dict} 126 | 127 | def validation_step(self, batch: Any, batch_idx: int): 128 | loss_dict = self.get_losses(batch) 129 | self.log( 130 | "sub_loss/val_dur_loss", 131 | loss_dict["dur_loss"], 132 | on_step=True, 133 | on_epoch=True, 134 | logger=True, 135 | sync_dist=True, 136 | ) 137 | self.log( 138 | "sub_loss/val_prior_loss", 139 | loss_dict["prior_loss"], 140 | on_step=True, 141 | on_epoch=True, 142 | logger=True, 143 | sync_dist=True, 144 | ) 145 | self.log( 146 | "sub_loss/val_diff_loss", 147 | loss_dict["diff_loss"], 148 | on_step=True, 149 | on_epoch=True, 150 | logger=True, 151 | sync_dist=True, 152 | ) 153 | 154 | total_loss = sum(loss_dict.values()) 155 | self.log( 156 | "loss/val", 157 | total_loss, 158 | on_step=True, 159 | on_epoch=True, 160 | logger=True, 161 | prog_bar=True, 162 | sync_dist=True, 163 | ) 164 | 165 | return total_loss 166 | 167 | def on_validation_end(self) -> None: 168 | if self.trainer.is_global_zero: 169 | one_batch = next(iter(self.trainer.val_dataloaders)) 170 | if self.current_epoch == 0: 171 | log.debug("Plotting original samples") 172 | for i in range(2): 173 | y = one_batch["y"][i].unsqueeze(0).to(self.device) 174 | self.logger.experiment.add_image( 175 | f"original/{i}", 176 | plot_tensor(y.squeeze().cpu()), 177 | self.current_epoch, 178 | dataformats="HWC", 179 | ) 180 | 181 | log.debug("Synthesising...") 182 | for i in range(2): 183 | x = one_batch["x"][i].unsqueeze(0).to(self.device) 184 | x_lengths = one_batch["x_lengths"][i].unsqueeze(0).to(self.device) 185 | spks = one_batch["spks"][i].unsqueeze(0).to(self.device) if one_batch["spks"] is not None else None 186 | output = self.synthesise(x[:, :x_lengths], x_lengths, n_timesteps=10, spks=spks) 187 | y_enc, y_dec = output["encoder_outputs"], output["decoder_outputs"] 188 | attn = output["attn"] 189 | self.logger.experiment.add_image( 190 | f"generated_enc/{i}", 191 | plot_tensor(y_enc.squeeze().cpu()), 192 | self.current_epoch, 193 | dataformats="HWC", 194 | ) 195 | self.logger.experiment.add_image( 196 | f"generated_dec/{i}", 197 | plot_tensor(y_dec.squeeze().cpu()), 198 | self.current_epoch, 199 | dataformats="HWC", 200 | ) 201 | self.logger.experiment.add_image( 202 | f"alignment/{i}", 203 | plot_tensor(attn.squeeze().cpu()), 204 | self.current_epoch, 205 | dataformats="HWC", 206 | ) 207 | 208 | def on_before_optimizer_step(self, optimizer): 209 | self.log_dict({f"grad_norm/{k}": v for k, v in grad_norm(self, norm_type=2).items()}) 210 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/matcha/utils/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import warnings 4 | from importlib.util import find_spec 5 | from pathlib import Path 6 | from typing import Any, Callable, Dict, Tuple 7 | 8 | import gdown 9 | import matplotlib.pyplot as plt 10 | import numpy as np 11 | import torch 12 | import wget 13 | from omegaconf import DictConfig 14 | 15 | from matcha.utils import pylogger, rich_utils 16 | 17 | log = pylogger.get_pylogger(__name__) 18 | 19 | 20 | def extras(cfg: DictConfig) -> None: 21 | """Applies optional utilities before the task is started. 22 | 23 | Utilities: 24 | - Ignoring python warnings 25 | - Setting tags from command line 26 | - Rich config printing 27 | 28 | :param cfg: A DictConfig object containing the config tree. 
29 | """ 30 | # return if no `extras` config 31 | if not cfg.get("extras"): 32 | log.warning("Extras config not found! ") 33 | return 34 | 35 | # disable python warnings 36 | if cfg.extras.get("ignore_warnings"): 37 | log.info("Disabling python warnings! ") 38 | warnings.filterwarnings("ignore") 39 | 40 | # prompt user to input tags from command line if none are provided in the config 41 | if cfg.extras.get("enforce_tags"): 42 | log.info("Enforcing tags! ") 43 | rich_utils.enforce_tags(cfg, save_to_file=True) 44 | 45 | # pretty print config tree using Rich library 46 | if cfg.extras.get("print_config"): 47 | log.info("Printing config tree with Rich! ") 48 | rich_utils.print_config_tree(cfg, resolve=True, save_to_file=True) 49 | 50 | 51 | def task_wrapper(task_func: Callable) -> Callable: 52 | """Optional decorator that controls the failure behavior when executing the task function. 53 | 54 | This wrapper can be used to: 55 | - make sure loggers are closed even if the task function raises an exception (prevents multirun failure) 56 | - save the exception to a `.log` file 57 | - mark the run as failed with a dedicated file in the `logs/` folder (so we can find and rerun it later) 58 | - etc. (adjust depending on your needs) 59 | 60 | Example: 61 | ``` 62 | @utils.task_wrapper 63 | def train(cfg: DictConfig) -> Tuple[Dict[str, Any], Dict[str, Any]]: 64 | ... 65 | return metric_dict, object_dict 66 | ``` 67 | 68 | :param task_func: The task function to be wrapped. 69 | 70 | :return: The wrapped task function. 71 | """ 72 | 73 | def wrap(cfg: DictConfig) -> Tuple[Dict[str, Any], Dict[str, Any]]: 74 | # execute the task 75 | try: 76 | metric_dict, object_dict = task_func(cfg=cfg) 77 | 78 | # things to do if exception occurs 79 | except Exception as ex: 80 | # save exception to `.log` file 81 | log.exception("") 82 | 83 | # some hyperparameter combinations might be invalid or cause out-of-memory errors 84 | # so when using hparam search plugins like Optuna, you might want to disable 85 | # raising the below exception to avoid multirun failure 86 | raise ex 87 | 88 | # things to always do after either success or exception 89 | finally: 90 | # display output dir path in terminal 91 | log.info(f"Output dir: {cfg.paths.output_dir}") 92 | 93 | # always close wandb run (even if exception occurs so multirun won't fail) 94 | if find_spec("wandb"): # check if wandb is installed 95 | import wandb 96 | 97 | if wandb.run: 98 | log.info("Closing wandb!") 99 | wandb.finish() 100 | 101 | return metric_dict, object_dict 102 | 103 | return wrap 104 | 105 | 106 | def get_metric_value(metric_dict: Dict[str, Any], metric_name: str) -> float: 107 | """Safely retrieves value of the metric logged in LightningModule. 108 | 109 | :param metric_dict: A dict containing metric values. 110 | :param metric_name: The name of the metric to retrieve. 111 | :return: The value of the metric. 112 | """ 113 | if not metric_name: 114 | log.info("Metric name is None! Skipping metric value retrieval...") 115 | return None 116 | 117 | if metric_name not in metric_dict: 118 | raise ValueError( 119 | f"Metric value not found! \n" 120 | "Make sure metric name logged in LightningModule is correct!\n" 121 | "Make sure `optimized_metric` name in `hparams_search` config is correct!" 122 | ) 123 | 124 | metric_value = metric_dict[metric_name].item() 125 | log.info(f"Retrieved metric value! 
<{metric_name}={metric_value}>") 126 | 127 | return metric_value 128 | 129 | 130 | def intersperse(lst, item): 131 | # Adds blank symbol 132 | result = [item] * (len(lst) * 2 + 1) 133 | result[1::2] = lst 134 | return result 135 | 136 | 137 | def save_figure_to_numpy(fig): 138 | data = np.fromstring(fig.canvas.tostring_rgb(), dtype=np.uint8, sep="") 139 | data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,)) 140 | return data 141 | 142 | 143 | def plot_tensor(tensor): 144 | plt.style.use("default") 145 | fig, ax = plt.subplots(figsize=(12, 3)) 146 | im = ax.imshow(tensor, aspect="auto", origin="lower", interpolation="none") 147 | plt.colorbar(im, ax=ax) 148 | plt.tight_layout() 149 | fig.canvas.draw() 150 | data = save_figure_to_numpy(fig) 151 | plt.close() 152 | return data 153 | 154 | 155 | def save_plot(tensor, savepath): 156 | plt.style.use("default") 157 | fig, ax = plt.subplots(figsize=(12, 3)) 158 | im = ax.imshow(tensor, aspect="auto", origin="lower", interpolation="none") 159 | plt.colorbar(im, ax=ax) 160 | plt.tight_layout() 161 | fig.canvas.draw() 162 | plt.savefig(savepath) 163 | plt.close() 164 | 165 | 166 | def to_numpy(tensor): 167 | if isinstance(tensor, np.ndarray): 168 | return tensor 169 | elif isinstance(tensor, torch.Tensor): 170 | return tensor.detach().cpu().numpy() 171 | elif isinstance(tensor, list): 172 | return np.array(tensor) 173 | else: 174 | raise TypeError("Unsupported type for conversion to numpy array") 175 | 176 | 177 | def get_user_data_dir(appname="matcha_tts"): 178 | """ 179 | Args: 180 | appname (str): Name of application 181 | 182 | Returns: 183 | Path: path to user data directory 184 | """ 185 | 186 | MATCHA_HOME = os.environ.get("MATCHA_HOME") 187 | if MATCHA_HOME is not None: 188 | ans = Path(MATCHA_HOME).expanduser().resolve(strict=False) 189 | elif sys.platform == "win32": 190 | import winreg # pylint: disable=import-outside-toplevel 191 | 192 | key = winreg.OpenKey( 193 | winreg.HKEY_CURRENT_USER, 194 | r"Software\Microsoft\Windows\CurrentVersion\Explorer\Shell Folders", 195 | ) 196 | dir_, _ = winreg.QueryValueEx(key, "Local AppData") 197 | ans = Path(dir_).resolve(strict=False) 198 | elif sys.platform == "darwin": 199 | ans = Path("~/Library/Application Support/").expanduser() 200 | else: 201 | ans = Path.home().joinpath(".local/share") 202 | 203 | final_path = ans.joinpath(appname) 204 | final_path.mkdir(parents=True, exist_ok=True) 205 | return final_path 206 | 207 | 208 | def assert_model_downloaded(checkpoint_path, url, use_wget=True): 209 | if Path(checkpoint_path).exists(): 210 | log.debug(f"[+] Model already present at {checkpoint_path}!") 211 | print(f"[+] Model already present at {checkpoint_path}!") 212 | return 213 | log.info(f"[-] Model not found at {checkpoint_path}! Will download it") 214 | print(f"[-] Model not found at {checkpoint_path}! Will download it") 215 | checkpoint_path = str(checkpoint_path) 216 | if not use_wget: 217 | gdown.download(url=url, output=checkpoint_path, quiet=False, fuzzy=True) 218 | else: 219 | wget.download(url=url, out=checkpoint_path) 220 | --------------------------------------------------------------------------------
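
As a quick illustration of the text-frontend helper above: `intersperse` inserts a blank symbol between and around every token id, a common way TTS frontends prepare phoneme sequences before synthesis. A minimal usage sketch, assuming the `matcha` package under `third_party/Matcha-TTS` is importable and using made-up ids:

import torch
from matcha.utils.utils import intersperse, to_numpy

ids = intersperse([5, 9, 13], item=0)
print(ids)                                # [0, 5, 0, 9, 0, 13, 0]
print(to_numpy(torch.tensor(ids)).shape)  # (7,)
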