├── results └── .gitkeep ├── cosyvoice ├── __init__.py ├── cli │ ├── __init__.py │ ├── model.py │ └── cosyvoice.py ├── utils │ ├── __init__.py │ ├── file_utils.py │ ├── class_utils.py │ ├── common.py │ ├── frontend_utils.py │ └── executor.py ├── dataset │ ├── __init__.py │ └── dataset.py ├── transformer │ ├── __init__.py │ ├── activation.py │ ├── label_smoothing_loss.py │ ├── positionwise_feed_forward.py │ ├── decoder_layer.py │ └── convolution.py ├── flow │ ├── length_regulator.py │ ├── flow_matching.py │ └── flow.py ├── hifigan │ └── f0_predictor.py └── bin │ ├── train.py │ └── inference.py ├── .gitignore ├── third_party └── Matcha-TTS │ ├── matcha │ ├── __init__.py │ ├── data │ │ ├── __init__.py │ │ └── components │ │ │ └── __init__.py │ ├── models │ │ ├── __init__.py │ │ ├── components │ │ │ ├── __init__.py │ │ │ └── flow_matching.py │ │ └── baselightningmodule.py │ ├── onnx │ │ ├── __init__.py │ │ ├── export.py │ │ └── infer.py │ ├── VERSION │ ├── hifigan │ │ ├── __init__.py │ │ ├── env.py │ │ ├── config.py │ │ ├── LICENSE │ │ ├── xutils.py │ │ ├── denoiser.py │ │ ├── README.md │ │ └── meldataset.py │ ├── utils │ │ ├── monotonic_align │ │ │ ├── setup.py │ │ │ ├── __init__.py │ │ │ └── core.pyx │ │ ├── __init__.py │ │ ├── pylogger.py │ │ ├── logging_utils.py │ │ ├── instantiators.py │ │ ├── audio.py │ │ ├── model.py │ │ ├── rich_utils.py │ │ ├── generate_data_statistics.py │ │ └── utils.py │ ├── text │ │ ├── symbols.py │ │ ├── __init__.py │ │ ├── numbers.py │ │ └── cleaners.py │ └── train.py │ ├── notebooks │ └── .gitkeep │ ├── configs │ ├── local │ │ └── .gitkeep │ ├── callbacks │ │ ├── none.yaml │ │ ├── default.yaml │ │ ├── rich_progress_bar.yaml │ │ ├── model_summary.yaml │ │ └── model_checkpoint.yaml │ ├── model │ │ ├── cfm │ │ │ └── default.yaml │ │ ├── optimizer │ │ │ └── adam.yaml │ │ ├── decoder │ │ │ └── default.yaml │ │ ├── matcha.yaml │ │ └── encoder │ │ │ └── default.yaml │ ├── trainer │ │ ├── cpu.yaml │ │ ├── gpu.yaml │ │ ├── mps.yaml │ │ ├── ddp.yaml │ │ ├── ddp_sim.yaml │ │ └── default.yaml │ ├── __init__.py │ ├── debug │ │ ├── fdr.yaml │ │ ├── overfit.yaml │ │ ├── limit.yaml │ │ ├── profiler.yaml │ │ └── default.yaml │ ├── logger │ │ ├── many_loggers.yaml │ │ ├── csv.yaml │ │ ├── tensorboard.yaml │ │ ├── neptune.yaml │ │ ├── mlflow.yaml │ │ ├── comet.yaml │ │ ├── wandb.yaml │ │ └── aim.yaml │ ├── extras │ │ └── default.yaml │ ├── experiment │ │ ├── ljspeech.yaml │ │ ├── multispeaker.yaml │ │ ├── ljspeech_min_memory.yaml │ │ └── hifi_dataset_piper_phonemizer.yaml │ ├── eval.yaml │ ├── data │ │ ├── vctk.yaml │ │ ├── hi-fi_en-US_female.yaml │ │ └── ljspeech.yaml │ ├── hydra │ │ └── default.yaml │ ├── paths │ │ └── default.yaml │ ├── train.yaml │ └── hparams_search │ │ └── mnist_optuna.yaml │ ├── matcha_tts.egg-info │ ├── dependency_links.txt │ ├── top_level.txt │ ├── entry_points.txt │ ├── requires.txt │ └── SOURCES.txt │ ├── scripts │ └── schedule.sh │ ├── MANIFEST.in │ ├── requirements.txt │ ├── LICENSE │ ├── pyproject.toml │ ├── Makefile │ └── setup.py ├── data ├── example.wav └── batch_files.csv ├── README.md ├── run_single_inference.sh ├── run_batch_inference.sh ├── requirements.txt └── batch_inference.py /results/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /cosyvoice/__init__.py: -------------------------------------------------------------------------------- 1 | 
-------------------------------------------------------------------------------- /cosyvoice/cli/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /cosyvoice/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /cosyvoice/dataset/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /cosyvoice/transformer/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | env 2 | __pycache__ 3 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/matcha/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/notebooks/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/configs/local/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/matcha/data/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/matcha/models/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/matcha/onnx/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/configs/callbacks/none.yaml: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/matcha/VERSION: -------------------------------------------------------------------------------- 1 | 0.0.5.1 2 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/matcha/hifigan/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/matcha/data/components/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/matcha/models/components/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/matcha_tts.egg-info/dependency_links.txt: 
-------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/matcha_tts.egg-info/top_level.txt: -------------------------------------------------------------------------------- 1 | configs 2 | matcha 3 | -------------------------------------------------------------------------------- /data/example.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Splend1d/BreezyVoice/HEAD/data/example.wav -------------------------------------------------------------------------------- /third_party/Matcha-TTS/configs/model/cfm/default.yaml: -------------------------------------------------------------------------------- 1 | name: CFM 2 | solver: euler 3 | sigma_min: 1e-4 4 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # BreezyVoice 2 | 3 | Migrated to [MediaTek-Research](https://github.com/mtkresearch/BreezyVoice) 4 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/configs/trainer/cpu.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - default 3 | 4 | accelerator: cpu 5 | devices: 1 6 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/configs/trainer/gpu.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - default 3 | 4 | accelerator: gpu 5 | devices: 1 6 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/configs/trainer/mps.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - default 3 | 4 | accelerator: mps 5 | devices: 1 6 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/configs/__init__.py: -------------------------------------------------------------------------------- 1 | # this file is needed here to include configs when building project as a package 2 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/configs/model/optimizer/adam.yaml: -------------------------------------------------------------------------------- 1 | _target_: torch.optim.Adam 2 | _partial_: true 3 | lr: 1e-4 4 | weight_decay: 0.0 5 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/configs/callbacks/default.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - model_checkpoint.yaml 3 | - model_summary.yaml 4 | - rich_progress_bar.yaml 5 | - _self_ 6 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/configs/trainer/ddp.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - default 3 | 4 | strategy: ddp 5 | 6 | accelerator: gpu 7 | devices: [0,1] 8 | num_nodes: 1 9 | sync_batchnorm: True 10 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/configs/trainer/ddp_sim.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - default 3 | 4 | # simulate 
DDP on CPU, useful for debugging 5 | accelerator: cpu 6 | devices: 2 7 | strategy: ddp_spawn 8 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/configs/model/decoder/default.yaml: -------------------------------------------------------------------------------- 1 | channels: [256, 256] 2 | dropout: 0.05 3 | attention_head_dim: 64 4 | n_blocks: 1 5 | num_mid_blocks: 2 6 | num_heads: 2 7 | act_fn: snakebeta 8 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/configs/debug/fdr.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | # runs 1 train, 1 validation and 1 test step 4 | 5 | defaults: 6 | - default 7 | 8 | trainer: 9 | fast_dev_run: true 10 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/matcha_tts.egg-info/entry_points.txt: -------------------------------------------------------------------------------- 1 | [console_scripts] 2 | matcha-data-stats = matcha.utils.generate_data_statistics:main 3 | matcha-tts = matcha.cli:cli 4 | matcha-tts-app = matcha.app:main 5 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/configs/logger/many_loggers.yaml: -------------------------------------------------------------------------------- 1 | # train with many loggers at once 2 | 3 | defaults: 4 | # - comet 5 | - csv 6 | # - mlflow 7 | # - neptune 8 | - tensorboard 9 | - wandb 10 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/configs/logger/csv.yaml: -------------------------------------------------------------------------------- 1 | # csv logger built in lightning 2 | 3 | csv: 4 | _target_: lightning.pytorch.loggers.csv_logs.CSVLogger 5 | save_dir: "${paths.output_dir}" 6 | name: "csv/" 7 | prefix: "" 8 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/configs/callbacks/rich_progress_bar.yaml: -------------------------------------------------------------------------------- 1 | # https://lightning.ai/docs/pytorch/latest/api/lightning.pytorch.callbacks.RichProgressBar.html 2 | 3 | rich_progress_bar: 4 | _target_: lightning.pytorch.callbacks.RichProgressBar 5 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/scripts/schedule.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Schedule execution of many runs 3 | # Run from root folder with: bash scripts/schedule.sh 4 | 5 | python src/train.py trainer.max_epochs=5 logger=csv 6 | 7 | python src/train.py trainer.max_epochs=10 logger=csv 8 | -------------------------------------------------------------------------------- /run_single_inference.sh: -------------------------------------------------------------------------------- 1 | python3 single_inference.py --speaker_prompt_audio_path "data/example.wav" --speaker_prompt_text_transcription "在密碼學中,加密是將明文資訊改變為難以讀取的密文內容,使之不可讀的方法。只有擁有解密方法的對象,經由解密過程,才能將密文還原為正常可讀的內容。" --content_to_synthesize "歡迎使用聯發創新基地 BreezyVoice 模型。" --output_path results/out.wav 2 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/matcha/utils/monotonic_align/setup.py: -------------------------------------------------------------------------------- 1 | # from distutils.core 
import setup 2 | # from Cython.Build import cythonize 3 | # import numpy 4 | 5 | # setup(name='monotonic_align', 6 | # ext_modules=cythonize("core.pyx"), 7 | # include_dirs=[numpy.get_include()]) 8 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/configs/extras/default.yaml: -------------------------------------------------------------------------------- 1 | # disable python warnings if they annoy you 2 | ignore_warnings: False 3 | 4 | # ask user for tags if none are provided in the config 5 | enforce_tags: True 6 | 7 | # pretty print config tree at the start of the run using Rich library 8 | print_config: True 9 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/configs/debug/overfit.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | # overfits to 3 batches 4 | 5 | defaults: 6 | - default 7 | 8 | trainer: 9 | max_epochs: 20 10 | overfit_batches: 3 11 | 12 | # model ckpt and early stopping need to be disabled during overfitting 13 | callbacks: null 14 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/configs/debug/limit.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | # uses only 1% of the training data and 5% of validation/test data 4 | 5 | defaults: 6 | - default 7 | 8 | trainer: 9 | max_epochs: 3 10 | limit_train_batches: 0.01 11 | limit_val_batches: 0.05 12 | limit_test_batches: 0.05 13 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/configs/callbacks/model_summary.yaml: -------------------------------------------------------------------------------- 1 | # https://lightning.ai/docs/pytorch/stable/api/lightning.pytorch.callbacks.RichModelSummary.html 2 | 3 | model_summary: 4 | _target_: lightning.pytorch.callbacks.RichModelSummary 5 | max_depth: 3 # the maximum depth of layer nesting that the summary will include 6 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/configs/debug/profiler.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | # runs with execution time profiling 4 | 5 | defaults: 6 | - default 7 | 8 | trainer: 9 | max_epochs: 1 10 | # profiler: "simple" 11 | profiler: "advanced" 12 | # profiler: "pytorch" 13 | accelerator: gpu 14 | 15 | limit_train_batches: 0.02 16 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/configs/logger/tensorboard.yaml: -------------------------------------------------------------------------------- 1 | # https://www.tensorflow.org/tensorboard/ 2 | 3 | tensorboard: 4 | _target_: lightning.pytorch.loggers.tensorboard.TensorBoardLogger 5 | save_dir: "${paths.output_dir}/tensorboard/" 6 | name: null 7 | log_graph: False 8 | default_hp_metric: True 9 | prefix: "" 10 | # version: "" 11 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/configs/logger/neptune.yaml: -------------------------------------------------------------------------------- 1 | # https://neptune.ai 2 | 3 | neptune: 4 | _target_: lightning.pytorch.loggers.neptune.NeptuneLogger 5 | api_key: ${oc.env:NEPTUNE_API_TOKEN} # api key is loaded from environment variable 6 | project: 
username/lightning-hydra-template 7 | # name: "" 8 | log_model_checkpoints: True 9 | prefix: "" 10 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/matcha/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from matcha.utils.instantiators import instantiate_callbacks, instantiate_loggers 2 | from matcha.utils.logging_utils import log_hyperparameters 3 | from matcha.utils.pylogger import get_pylogger 4 | from matcha.utils.rich_utils import enforce_tags, print_config_tree 5 | from matcha.utils.utils import extras, get_metric_value, task_wrapper 6 | -------------------------------------------------------------------------------- /run_batch_inference.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Default parameters 4 | CSV_FILE="data/batch_files.csv" 5 | SPEAKER_PROMPT_AUDIO_FOLDER="data" 6 | OUTPUT_AUDIO_FOLDER="results" 7 | 8 | # Run the Python script with default parameters 9 | python batch_inference.py \ 10 | --csv_file "$CSV_FILE" \ 11 | --speaker_prompt_audio_folder "$SPEAKER_PROMPT_AUDIO_FOLDER" \ 12 | --output_audio_folder "$OUTPUT_AUDIO_FOLDER" -------------------------------------------------------------------------------- /third_party/Matcha-TTS/configs/logger/mlflow.yaml: -------------------------------------------------------------------------------- 1 | # https://mlflow.org 2 | 3 | mlflow: 4 | _target_: lightning.pytorch.loggers.mlflow.MLFlowLogger 5 | # experiment_name: "" 6 | # run_name: "" 7 | tracking_uri: ${paths.log_dir}/mlflow/mlruns # run `mlflow ui` command inside the `logs/mlflow/` dir to open the UI 8 | tags: null 9 | # save_dir: "./mlruns" 10 | prefix: "" 11 | artifact_location: null 12 | # run_id: "" 13 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/configs/model/matcha.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - _self_ 3 | - encoder: default.yaml 4 | - decoder: default.yaml 5 | - cfm: default.yaml 6 | - optimizer: adam.yaml 7 | 8 | _target_: matcha.models.matcha_tts.MatchaTTS 9 | n_vocab: 178 10 | n_spks: ${data.n_spks} 11 | spk_emb_dim: 64 12 | n_feats: 80 13 | data_statistics: ${data.data_statistics} 14 | out_size: null # Must be divisible by 4 15 | prior_loss: true 16 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.md 2 | include LICENSE.txt 3 | include requirements.*.txt 4 | include *.cff 5 | include requirements.txt 6 | include matcha/VERSION 7 | recursive-include matcha *.json 8 | recursive-include matcha *.html 9 | recursive-include matcha *.png 10 | recursive-include matcha *.md 11 | recursive-include matcha *.py 12 | recursive-include matcha *.pyx 13 | recursive-exclude tests * 14 | prune tests* 15 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/configs/experiment/ljspeech.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | # to execute this experiment run: 4 | # python train.py experiment=multispeaker 5 | 6 | defaults: 7 | - override /data: ljspeech.yaml 8 | 9 | # all parameters below will be merged with parameters from default configurations set above 10 | # this allows 
you to overwrite only specified parameters 11 | 12 | tags: ["ljspeech"] 13 | 14 | run_name: ljspeech 15 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/configs/eval.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | defaults: 4 | - _self_ 5 | - data: mnist # choose datamodule with `test_dataloader()` for evaluation 6 | - model: mnist 7 | - logger: null 8 | - trainer: default 9 | - paths: default 10 | - extras: default 11 | - hydra: default 12 | 13 | task_name: "eval" 14 | 15 | tags: ["dev"] 16 | 17 | # passing checkpoint path is necessary for evaluation 18 | ckpt_path: ??? 19 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/configs/experiment/multispeaker.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | # to execute this experiment run: 4 | # python train.py experiment=multispeaker 5 | 6 | defaults: 7 | - override /data: vctk.yaml 8 | 9 | # all parameters below will be merged with parameters from default configurations set above 10 | # this allows you to overwrite only specified parameters 11 | 12 | tags: ["multispeaker"] 13 | 14 | run_name: multispeaker 15 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/configs/logger/comet.yaml: -------------------------------------------------------------------------------- 1 | # https://www.comet.ml 2 | 3 | comet: 4 | _target_: lightning.pytorch.loggers.comet.CometLogger 5 | api_key: ${oc.env:COMET_API_TOKEN} # api key is loaded from environment variable 6 | save_dir: "${paths.output_dir}" 7 | project_name: "lightning-hydra-template" 8 | rest_api_key: null 9 | # experiment_name: "" 10 | experiment_key: null # set to resume experiment 11 | offline: False 12 | prefix: "" 13 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/configs/data/vctk.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - ljspeech 3 | - _self_ 4 | 5 | _target_: matcha.data.text_mel_datamodule.TextMelDataModule 6 | name: vctk 7 | train_filelist_path: data/filelists/vctk_audio_sid_text_train_filelist.txt 8 | valid_filelist_path: data/filelists/vctk_audio_sid_text_val_filelist.txt 9 | batch_size: 32 10 | add_blank: True 11 | n_spks: 109 12 | data_statistics: # Computed for vctk dataset 13 | mel_mean: -6.630575 14 | mel_std: 2.482914 15 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/configs/experiment/ljspeech_min_memory.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | # to execute this experiment run: 4 | # python train.py experiment=multispeaker 5 | 6 | defaults: 7 | - override /data: ljspeech.yaml 8 | 9 | # all parameters below will be merged with parameters from default configurations set above 10 | # this allows you to overwrite only specified parameters 11 | 12 | tags: ["ljspeech"] 13 | 14 | run_name: ljspeech_min 15 | 16 | 17 | model: 18 | out_size: 172 19 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/matcha/hifigan/env.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/jik876/hifi-gan """ 2 | 3 | import os 4 | 
import shutil 5 | 6 | 7 | class AttrDict(dict): 8 | def __init__(self, *args, **kwargs): 9 | super().__init__(*args, **kwargs) 10 | self.__dict__ = self 11 | 12 | 13 | def build_env(config, config_name, path): 14 | t_path = os.path.join(path, config_name) 15 | if config != t_path: 16 | os.makedirs(path, exist_ok=True) 17 | shutil.copyfile(config, os.path.join(path, config_name)) 18 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/configs/model/encoder/default.yaml: -------------------------------------------------------------------------------- 1 | encoder_type: RoPE Encoder 2 | encoder_params: 3 | n_feats: ${model.n_feats} 4 | n_channels: 192 5 | filter_channels: 768 6 | filter_channels_dp: 256 7 | n_heads: 2 8 | n_layers: 6 9 | kernel_size: 3 10 | p_dropout: 0.1 11 | spk_emb_dim: 64 12 | n_spks: 1 13 | prenet: true 14 | 15 | duration_predictor_params: 16 | filter_channels_dp: ${model.encoder.encoder_params.filter_channels_dp} 17 | kernel_size: 3 18 | p_dropout: ${model.encoder.encoder_params.p_dropout} 19 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/configs/experiment/hifi_dataset_piper_phonemizer.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | # to execute this experiment run: 4 | # python train.py experiment=multispeaker 5 | 6 | defaults: 7 | - override /data: hi-fi_en-US_female.yaml 8 | 9 | # all parameters below will be merged with parameters from default configurations set above 10 | # this allows you to overwrite only specified parameters 11 | 12 | tags: ["hi-fi", "single_speaker", "piper_phonemizer", "en_US", "female"] 13 | 14 | run_name: hi-fi_en-US_female_piper_phonemizer 15 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/configs/trainer/default.yaml: -------------------------------------------------------------------------------- 1 | _target_: lightning.pytorch.trainer.Trainer 2 | 3 | default_root_dir: ${paths.output_dir} 4 | 5 | max_epochs: -1 6 | 7 | accelerator: gpu 8 | devices: [0] 9 | 10 | # mixed precision for extra speed-up 11 | precision: 16-mixed 12 | 13 | # perform a validation loop every N training epochs 14 | check_val_every_n_epoch: 1 15 | 16 | # set True to ensure deterministic results 17 | # makes training slower but gives more reproducibility than just setting seeds 18 | deterministic: False 19 | 20 | gradient_clip_val: 5.0 21 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/matcha_tts.egg-info/requires.txt: -------------------------------------------------------------------------------- 1 | torch>=2.0.0 2 | torchvision>=0.15.0 3 | lightning>=2.0.0 4 | torchmetrics>=0.11.4 5 | hydra-core==1.3.2 6 | hydra-colorlog==1.2.0 7 | hydra-optuna-sweeper==1.2.0 8 | rootutils 9 | pre-commit 10 | rich 11 | pytest 12 | phonemizer 13 | tensorboard 14 | librosa 15 | Cython 16 | numpy 17 | einops 18 | inflect 19 | Unidecode 20 | scipy 21 | torchaudio 22 | matplotlib 23 | pandas 24 | conformer==0.3.2 25 | diffusers==0.25.0 26 | notebook 27 | ipywidgets 28 | gradio==3.43.2 29 | gdown 30 | wget 31 | seaborn 32 | piper_phonemize 33 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/configs/data/hi-fi_en-US_female.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - 
ljspeech 3 | - _self_ 4 | 5 | # Dataset URL: https://ast-astrec.nict.go.jp/en/release/hi-fi-captain/ 6 | _target_: matcha.data.text_mel_datamodule.TextMelDataModule 7 | name: hi-fi_en-US_female 8 | train_filelist_path: data/filelists/hi-fi-captain-en-us-female_train.txt 9 | valid_filelist_path: data/filelists/hi-fi-captain-en-us-female_val.txt 10 | batch_size: 32 11 | cleaners: [english_cleaners_piper] 12 | data_statistics: # Computed for this dataset 13 | mel_mean: -6.38385 14 | mel_std: 2.541796 15 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/matcha/text/symbols.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/keithito/tacotron 2 | 3 | Defines the set of symbols used in text input to the model. 4 | """ 5 | _pad = "_" 6 | _punctuation = ';:,.!?¡¿—…"«»“” ' 7 | _letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz" 8 | _letters_ipa = ( 9 | "ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ" 10 | ) 11 | 12 | 13 | # Export all symbols: 14 | symbols = [_pad] + list(_punctuation) + list(_letters) + list(_letters_ipa) 15 | 16 | # Special symbol ids 17 | SPACE_ID = symbols.index(" ") 18 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/configs/logger/wandb.yaml: -------------------------------------------------------------------------------- 1 | # https://wandb.ai 2 | 3 | wandb: 4 | _target_: lightning.pytorch.loggers.wandb.WandbLogger 5 | # name: "" # name of the run (normally generated by wandb) 6 | save_dir: "${paths.output_dir}" 7 | offline: False 8 | id: null # pass correct id to resume experiment! 
9 | anonymous: null # enable anonymous logging 10 | project: "lightning-hydra-template" 11 | log_model: False # upload lightning ckpts 12 | prefix: "" # a string to put at the beginning of metric keys 13 | # entity: "" # set to name of your wandb team 14 | group: "" 15 | tags: [] 16 | job_type: "" 17 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/configs/data/ljspeech.yaml: -------------------------------------------------------------------------------- 1 | _target_: matcha.data.text_mel_datamodule.TextMelDataModule 2 | name: ljspeech 3 | train_filelist_path: data/filelists/ljs_audio_text_train_filelist.txt 4 | valid_filelist_path: data/filelists/ljs_audio_text_val_filelist.txt 5 | batch_size: 32 6 | num_workers: 20 7 | pin_memory: True 8 | cleaners: [english_cleaners2] 9 | add_blank: True 10 | n_spks: 1 11 | n_fft: 1024 12 | n_feats: 80 13 | sample_rate: 22050 14 | hop_length: 256 15 | win_length: 1024 16 | f_min: 0 17 | f_max: 8000 18 | data_statistics: # Computed for ljspeech dataset 19 | mel_mean: -5.536622 20 | mel_std: 2.116101 21 | seed: ${seed} 22 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/configs/hydra/default.yaml: -------------------------------------------------------------------------------- 1 | # https://hydra.cc/docs/configure_hydra/intro/ 2 | 3 | # enable color logging 4 | defaults: 5 | - override hydra_logging: colorlog 6 | - override job_logging: colorlog 7 | 8 | # output directory, generated dynamically on each run 9 | run: 10 | dir: ${paths.log_dir}/${task_name}/${run_name}/runs/${now:%Y-%m-%d}_${now:%H-%M-%S} 11 | sweep: 12 | dir: ${paths.log_dir}/${task_name}/${run_name}/multiruns/${now:%Y-%m-%d}_${now:%H-%M-%S} 13 | subdir: ${hydra.job.num} 14 | 15 | job_logging: 16 | handlers: 17 | file: 18 | # Incorporates fix from https://github.com/facebookresearch/hydra/pull/2242 19 | filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log 20 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/configs/paths/default.yaml: -------------------------------------------------------------------------------- 1 | # path to root directory 2 | # this requires PROJECT_ROOT environment variable to exist 3 | # you can replace it with "." if you want the root to be the current working directory 4 | root_dir: ${oc.env:PROJECT_ROOT} 5 | 6 | # path to data directory 7 | data_dir: ${paths.root_dir}/data/ 8 | 9 | # path to logging directory 10 | log_dir: ${paths.root_dir}/logs/ 11 | 12 | # path to output directory, created dynamically by hydra 13 | # path generation pattern is specified in `configs/hydra/default.yaml` 14 | # use it to store all files generated during the run, like ckpts and metrics 15 | output_dir: ${hydra:runtime.output_dir} 16 | 17 | # path to working directory 18 | work_dir: ${hydra:runtime.cwd} 19 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/matcha/utils/monotonic_align/__init__.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | 4 | from matcha.utils.monotonic_align.core import maximum_path_c 5 | 6 | 7 | def maximum_path(value, mask): 8 | """Cython optimised version. 
9 | value: [b, t_x, t_y] 10 | mask: [b, t_x, t_y] 11 | """ 12 | value = value * mask 13 | device = value.device 14 | dtype = value.dtype 15 | value = value.data.cpu().numpy().astype(np.float32) 16 | path = np.zeros_like(value).astype(np.int32) 17 | mask = mask.data.cpu().numpy() 18 | 19 | t_x_max = mask.sum(1)[:, 0].astype(np.int32) 20 | t_y_max = mask.sum(2)[:, 0].astype(np.int32) 21 | maximum_path_c(path, value, t_x_max, t_y_max) 22 | return torch.from_numpy(path).to(device=device, dtype=dtype) 23 | -------------------------------------------------------------------------------- /data/batch_files.csv: -------------------------------------------------------------------------------- 1 | speaker_prompt_audio_filename,speaker,speaker_prompt_text_transcription,content_to_synthesize,output_audio_filename 2 | example,女,在密碼學中,加密是將明文資訊改變為難以讀取的密文內容,使之不可讀的方法。只有擁有解密方法的對象,經由解密過程,才能將密文還原為正常可讀的內容。,信義快速道路原名國道三號臺北聯絡線信義支線,最初是按照國道等級的標準規劃為高速公路支線。信義快速道路的開通有效改善了臺北市東南部與新北市之間的交通。由於山脈阻隔,過去臺北市與木柵、景美,以及新北市深坑、新店等地之間的交通需要繞道,增加了通勤時間,也加重其他地區的交通負荷。該道路不僅連接信義區與高速公路系統,也緩解了上述地區間的交通壓力,縮短通勤時間。道路開通後,多家大臺北地區的公車業者調整路線,利用信義快速道路縮短跨區通勤時間。由於原本設計採用國道標準,信義快速道路全線為雙向各三線道設計,但速限低於國道,設定為每小時40至70公里。內側車道原本規劃為信義輕軌的專用道,目前改為公車與計程車專用車道,一般小客車禁止行駛,是臺灣首見的道路規劃方式。2007年11月1日起,配合開放大型重型機車行駛部分高架橋與快速道路的政策,信義快速道路也允許大型重型機車行駛。,out-wiki 3 | example,女,在密碼學中,加密是將明文資訊改變為難以讀取的密文內容,使之不可讀的方法。只有擁有解密方法的對象,經由解密過程,才能將密文還原為正常可讀的內容。,歡迎使用聯發創新基地 BreezyVoice 模型。,out-BreezyVoice 4 | example,女,在密碼學中,加密是將明文資訊改變為難以讀取的密文內容,使之不可讀的方法。只有擁有解密方法的對象,經由解密過程,才能將密文還原為正常可讀的內容。,今天天氣真好,out-weather 5 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/matcha/utils/pylogger.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from lightning.pytorch.utilities import rank_zero_only 4 | 5 | 6 | def get_pylogger(name: str = __name__) -> logging.Logger: 7 | """Initializes a multi-GPU-friendly python command line logger. 8 | 9 | :param name: The name of the logger, defaults to ``__name__``. 10 | 11 | :return: A logger object. 
12 | """ 13 | logger = logging.getLogger(name) 14 | 15 | # this ensures all logging levels get marked with the rank zero decorator 16 | # otherwise logs would get multiplied for each GPU process in multi-GPU setup 17 | logging_levels = ("debug", "info", "warning", "error", "exception", "fatal", "critical") 18 | for level in logging_levels: 19 | setattr(logger, level, rank_zero_only(getattr(logger, level))) 20 | 21 | return logger 22 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/matcha/hifigan/config.py: -------------------------------------------------------------------------------- 1 | v1 = { 2 | "resblock": "1", 3 | "num_gpus": 0, 4 | "batch_size": 16, 5 | "learning_rate": 0.0004, 6 | "adam_b1": 0.8, 7 | "adam_b2": 0.99, 8 | "lr_decay": 0.999, 9 | "seed": 1234, 10 | "upsample_rates": [8, 8, 2, 2], 11 | "upsample_kernel_sizes": [16, 16, 4, 4], 12 | "upsample_initial_channel": 512, 13 | "resblock_kernel_sizes": [3, 7, 11], 14 | "resblock_dilation_sizes": [[1, 3, 5], [1, 3, 5], [1, 3, 5]], 15 | "resblock_initial_channel": 256, 16 | "segment_size": 8192, 17 | "num_mels": 80, 18 | "num_freq": 1025, 19 | "n_fft": 1024, 20 | "hop_size": 256, 21 | "win_size": 1024, 22 | "sampling_rate": 22050, 23 | "fmin": 0, 24 | "fmax": 8000, 25 | "fmax_loss": None, 26 | "num_workers": 4, 27 | "dist_config": {"dist_backend": "nccl", "dist_url": "tcp://localhost:54321", "world_size": 1}, 28 | } 29 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | --extra-index-url https://download.pytorch.org/whl/cu118 2 | conformer==0.3.2 3 | deepspeed==0.14.2; sys_platform == 'linux' 4 | diffusers==0.32.0 5 | gdown==5.1.0 6 | gradio==4.32.2 7 | grpcio==1.57.0 8 | grpcio-tools==1.57.0 9 | hydra-core==1.3.2 10 | HyperPyYAML==1.2.2 11 | inflect==7.3.1 12 | librosa==0.10.2 13 | lightning==2.2.4 14 | matplotlib==3.7.5 15 | networkx==3.1 16 | omegaconf==2.3.0 17 | onnxruntime-gpu==1.16.0; sys_platform == 'linux' 18 | openai-whisper==20231117 19 | protobuf==4.25 20 | pydantic==2.7.0 21 | rich==13.7.1 22 | soundfile==0.12.1 23 | tensorboard==2.14.0 24 | torch==2.3.1 25 | torchaudio==2.3.1 26 | wget==3.2 27 | fastapi==0.111.0 28 | fastapi-cli==0.0.4 29 | WeTextProcessing==1.0.3 30 | opencc-python-reimplemented 31 | g2pw 32 | pyarrow 33 | datasets 34 | 35 | https://www.modelscope.cn/models/speech_tts/speech_kantts_ttsfrd/resolve/master/ttsfrd_dependency-0.1-py3-none-any.whl 36 | https://www.modelscope.cn/models/speech_tts/speech_kantts_ttsfrd/resolve/master/ttsfrd-0.3.9-cp310-cp310-linux_x86_64.whl 37 | 38 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/configs/debug/default.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | # default debugging setup, runs 1 full epoch 4 | # other debugging configs can inherit from this one 5 | 6 | # overwrite task name so debugging logs are stored in separate folder 7 | task_name: "debug" 8 | 9 | # disable callbacks and loggers during debugging 10 | # callbacks: null 11 | # logger: null 12 | 13 | extras: 14 | ignore_warnings: False 15 | enforce_tags: False 16 | 17 | # sets level of all command line loggers to 'DEBUG' 18 | # https://hydra.cc/docs/tutorials/basic/running_your_app/logging/ 19 | hydra: 20 | job_logging: 21 | root: 22 | level: DEBUG 23 | 24 | # use this to also set hydra 
loggers to 'DEBUG' 25 | # verbose: True 26 | 27 | trainer: 28 | max_epochs: 1 29 | accelerator: cpu # debuggers don't like gpus 30 | devices: 1 # debuggers don't like multiprocessing 31 | detect_anomaly: true # raise exception if NaN or +/-inf is detected in any tensor 32 | 33 | data: 34 | num_workers: 0 # debuggers don't like multiprocessing 35 | pin_memory: False # disable gpu memory pin 36 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/requirements.txt: -------------------------------------------------------------------------------- 1 | # --------- pytorch --------- # 2 | torch>=2.0.0 3 | torchvision>=0.15.0 4 | lightning>=2.0.0 5 | torchmetrics>=0.11.4 6 | 7 | # --------- hydra --------- # 8 | hydra-core==1.3.2 9 | hydra-colorlog==1.2.0 10 | hydra-optuna-sweeper==1.2.0 11 | 12 | # --------- loggers --------- # 13 | # wandb 14 | # neptune-client 15 | # mlflow 16 | # comet-ml 17 | # aim>=3.16.2 # no lower than 3.16.2, see https://github.com/aimhubio/aim/issues/2550 18 | 19 | # --------- others --------- # 20 | rootutils # standardizing the project root setup 21 | pre-commit # hooks for applying linters on commit 22 | rich # beautiful text formatting in terminal 23 | pytest # tests 24 | # sh # for running bash commands in some tests (linux/macos only) 25 | phonemizer # phonemization of text 26 | tensorboard 27 | librosa 28 | Cython 29 | numpy 30 | einops 31 | inflect 32 | Unidecode 33 | scipy 34 | torchaudio 35 | matplotlib 36 | pandas 37 | conformer==0.3.2 38 | diffusers==0.25.0 39 | notebook 40 | ipywidgets 41 | gradio==3.43.2 42 | gdown 43 | wget 44 | seaborn 45 | piper_phonemize 46 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Shivam Mehta 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/matcha/hifigan/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Jungil Kong 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools", "wheel", "cython==0.29.35", "numpy==1.24.3", "packaging"] 3 | 4 | [tool.black] 5 | line-length = 120 6 | target-version = ['py310'] 7 | exclude = ''' 8 | 9 | ( 10 | /( 11 | \.eggs # exclude a few common directories in the 12 | | \.git # root of the project 13 | | \.hg 14 | | \.mypy_cache 15 | | \.tox 16 | | \.venv 17 | | _build 18 | | buck-out 19 | | build 20 | | dist 21 | )/ 22 | | foo.py # also separately exclude a file named foo.py in 23 | # the root of the project 24 | ) 25 | ''' 26 | 27 | [tool.pytest.ini_options] 28 | addopts = [ 29 | "--color=yes", 30 | "--durations=0", 31 | "--strict-markers", 32 | "--doctest-modules", 33 | ] 34 | filterwarnings = [ 35 | "ignore::DeprecationWarning", 36 | "ignore::UserWarning", 37 | ] 38 | log_cli = "True" 39 | markers = [ 40 | "slow: slow tests", 41 | ] 42 | minversion = "6.0" 43 | testpaths = "tests/" 44 | 45 | [tool.coverage.report] 46 | exclude_lines = [ 47 | "pragma: nocover", 48 | "raise NotImplementedError", 49 | "raise NotImplementedError()", 50 | "if __name__ == .__main__.:", 51 | ] 52 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/configs/callbacks/model_checkpoint.yaml: -------------------------------------------------------------------------------- 1 | # https://lightning.ai/docs/pytorch/stable/api/lightning.pytorch.callbacks.ModelCheckpoint.html 2 | 3 | model_checkpoint: 4 | _target_: lightning.pytorch.callbacks.ModelCheckpoint 5 | dirpath: ${paths.output_dir}/checkpoints # directory to save the model file 6 | filename: checkpoint_{epoch:03d} # checkpoint filename 7 | monitor: epoch # name of the logged metric which determines when model is improving 8 | verbose: False # verbosity mode 9 | save_last: true # additionally always save an exact copy of the last checkpoint to a file last.ckpt 10 | save_top_k: 10 # save k best models (determined by above metric) 11 | mode: "max" # 
"max" means higher metric value is better, can be also "min" 12 | auto_insert_metric_name: True # when True, the checkpoints filenames will contain the metric name 13 | save_weights_only: False # if True, then only the model’s weights will be saved 14 | every_n_train_steps: null # number of training steps between checkpoints 15 | train_time_interval: null # checkpoints are monitored at the specified time interval 16 | every_n_epochs: 100 # number of epochs between checkpoints 17 | save_on_train_epoch_end: null # whether to run checkpointing at the end of the training epoch or the end of validation 18 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/Makefile: -------------------------------------------------------------------------------- 1 | 2 | help: ## Show help 3 | @grep -E '^[.a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-30s\033[0m %s\n", $$1, $$2}' 4 | 5 | clean: ## Clean autogenerated files 6 | rm -rf dist 7 | find . -type f -name "*.DS_Store" -ls -delete 8 | find . | grep -E "(__pycache__|\.pyc|\.pyo)" | xargs rm -rf 9 | find . | grep -E ".pytest_cache" | xargs rm -rf 10 | find . | grep -E ".ipynb_checkpoints" | xargs rm -rf 11 | rm -f .coverage 12 | 13 | clean-logs: ## Clean logs 14 | rm -rf logs/** 15 | 16 | create-package: ## Create wheel and tar gz 17 | rm -rf dist/ 18 | python setup.py bdist_wheel --plat-name=manylinux1_x86_64 19 | python setup.py sdist 20 | python -m twine upload dist/* --verbose --skip-existing 21 | 22 | format: ## Run pre-commit hooks 23 | pre-commit run -a 24 | 25 | sync: ## Merge changes from main branch to your current branch 26 | git pull 27 | git pull origin main 28 | 29 | test: ## Run not slow tests 30 | pytest -k "not slow" 31 | 32 | test-full: ## Run all tests 33 | pytest 34 | 35 | train-ljspeech: ## Train the model 36 | python matcha/train.py experiment=ljspeech 37 | 38 | train-ljspeech-min: ## Train the model with minimum memory 39 | python matcha/train.py experiment=ljspeech_min_memory 40 | 41 | start_app: ## Start the app 42 | python matcha/app.py 43 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/configs/logger/aim.yaml: -------------------------------------------------------------------------------- 1 | # https://aimstack.io/ 2 | 3 | # example usage in lightning module: 4 | # https://github.com/aimhubio/aim/blob/main/examples/pytorch_lightning_track.py 5 | 6 | # open the Aim UI with the following command (run in the folder containing the `.aim` folder): 7 | # `aim up` 8 | 9 | aim: 10 | _target_: aim.pytorch_lightning.AimLogger 11 | repo: ${paths.root_dir} # .aim folder will be created here 12 | # repo: "aim://ip_address:port" # can instead provide IP address pointing to Aim remote tracking server which manages the repo, see https://aimstack.readthedocs.io/en/latest/using/remote_tracking.html# 13 | 14 | # aim allows to group runs under experiment name 15 | experiment: null # any string, set to "default" if not specified 16 | 17 | train_metric_prefix: "train/" 18 | val_metric_prefix: "val/" 19 | test_metric_prefix: "test/" 20 | 21 | # sets the tracking interval in seconds for system usage metrics (CPU, GPU, memory, etc.) 22 | system_tracking_interval: 10 # set to null to disable system metrics tracking 23 | 24 | # enable/disable logging of system params such as installed packages, git info, env vars, etc. 
25 | log_system_params: true 26 | 27 | # enable/disable tracking console logs (default value is true) 28 | capture_terminal_logs: false # set to false to avoid infinite console log loop issue https://github.com/aimhubio/aim/issues/2550 29 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/matcha/utils/monotonic_align/core.pyx: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | cimport cython 4 | cimport numpy as np 5 | 6 | from cython.parallel import prange 7 | 8 | 9 | @cython.boundscheck(False) 10 | @cython.wraparound(False) 11 | cdef void maximum_path_each(int[:,::1] path, float[:,::1] value, int t_x, int t_y, float max_neg_val) nogil: 12 | cdef int x 13 | cdef int y 14 | cdef float v_prev 15 | cdef float v_cur 16 | cdef float tmp 17 | cdef int index = t_x - 1 18 | 19 | for y in range(t_y): 20 | for x in range(max(0, t_x + y - t_y), min(t_x, y + 1)): 21 | if x == y: 22 | v_cur = max_neg_val 23 | else: 24 | v_cur = value[x, y-1] 25 | if x == 0: 26 | if y == 0: 27 | v_prev = 0. 28 | else: 29 | v_prev = max_neg_val 30 | else: 31 | v_prev = value[x-1, y-1] 32 | value[x, y] = max(v_cur, v_prev) + value[x, y] 33 | 34 | for y in range(t_y - 1, -1, -1): 35 | path[index, y] = 1 36 | if index != 0 and (index == y or value[index, y-1] < value[index-1, y-1]): 37 | index = index - 1 38 | 39 | 40 | @cython.boundscheck(False) 41 | @cython.wraparound(False) 42 | cpdef void maximum_path_c(int[:,:,::1] paths, float[:,:,::1] values, int[::1] t_xs, int[::1] t_ys, float max_neg_val=-1e9) nogil: 43 | cdef int b = values.shape[0] 44 | 45 | cdef int i 46 | for i in prange(b, nogil=True): 47 | maximum_path_each(paths[i], values[i], t_xs[i], t_ys[i], max_neg_val) 48 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/matcha/hifigan/xutils.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/jik876/hifi-gan """ 2 | 3 | import glob 4 | import os 5 | 6 | import matplotlib 7 | import torch 8 | from torch.nn.utils import weight_norm 9 | 10 | matplotlib.use("Agg") 11 | import matplotlib.pylab as plt 12 | 13 | 14 | def plot_spectrogram(spectrogram): 15 | fig, ax = plt.subplots(figsize=(10, 2)) 16 | im = ax.imshow(spectrogram, aspect="auto", origin="lower", interpolation="none") 17 | plt.colorbar(im, ax=ax) 18 | 19 | fig.canvas.draw() 20 | plt.close() 21 | 22 | return fig 23 | 24 | 25 | def init_weights(m, mean=0.0, std=0.01): 26 | classname = m.__class__.__name__ 27 | if classname.find("Conv") != -1: 28 | m.weight.data.normal_(mean, std) 29 | 30 | 31 | def apply_weight_norm(m): 32 | classname = m.__class__.__name__ 33 | if classname.find("Conv") != -1: 34 | weight_norm(m) 35 | 36 | 37 | def get_padding(kernel_size, dilation=1): 38 | return int((kernel_size * dilation - dilation) / 2) 39 | 40 | 41 | def load_checkpoint(filepath, device): 42 | assert os.path.isfile(filepath) 43 | print(f"Loading '{filepath}'") 44 | checkpoint_dict = torch.load(filepath, map_location=device) 45 | print("Complete.") 46 | return checkpoint_dict 47 | 48 | 49 | def save_checkpoint(filepath, obj): 50 | print(f"Saving checkpoint to {filepath}") 51 | torch.save(obj, filepath) 52 | print("Complete.") 53 | 54 | 55 | def scan_checkpoint(cp_dir, prefix): 56 | pattern = os.path.join(cp_dir, prefix + "????????") 57 | cp_list = glob.glob(pattern) 58 | if len(cp_list) == 0: 59 | return None 60 | return sorted(cp_list)[-1] 
61 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import os 3 | 4 | import numpy 5 | from Cython.Build import cythonize 6 | from setuptools import Extension, find_packages, setup 7 | 8 | exts = [ 9 | Extension( 10 | name="matcha.utils.monotonic_align.core", 11 | sources=["matcha/utils/monotonic_align/core.pyx"], 12 | ) 13 | ] 14 | 15 | with open("README.md", encoding="utf-8") as readme_file: 16 | README = readme_file.read() 17 | 18 | cwd = os.path.dirname(os.path.abspath(__file__)) 19 | with open(os.path.join(cwd, "matcha", "VERSION")) as fin: 20 | version = fin.read().strip() 21 | 22 | setup( 23 | name="matcha-tts", 24 | version=version, 25 | description="🍵 Matcha-TTS: A fast TTS architecture with conditional flow matching", 26 | long_description=README, 27 | long_description_content_type="text/markdown", 28 | author="Shivam Mehta", 29 | author_email="shivam.mehta25@gmail.com", 30 | url="https://shivammehta25.github.io/Matcha-TTS", 31 | install_requires=[str(r) for r in open(os.path.join(os.path.dirname(__file__), "requirements.txt"))], 32 | include_dirs=[numpy.get_include()], 33 | include_package_data=True, 34 | packages=find_packages(exclude=["tests", "tests/*", "examples", "examples/*"]), 35 | # use this to customize global commands available in the terminal after installing the package 36 | entry_points={ 37 | "console_scripts": [ 38 | "matcha-data-stats=matcha.utils.generate_data_statistics:main", 39 | "matcha-tts=matcha.cli:cli", 40 | "matcha-tts-app=matcha.app:main", 41 | ] 42 | }, 43 | ext_modules=cythonize(exts, language_level=3), 44 | python_requires=">=3.9.0", 45 | ) 46 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/matcha_tts.egg-info/SOURCES.txt: -------------------------------------------------------------------------------- 1 | LICENSE 2 | MANIFEST.in 3 | README.md 4 | pyproject.toml 5 | requirements.txt 6 | setup.py 7 | configs/__init__.py 8 | matcha/VERSION 9 | matcha/__init__.py 10 | matcha/app.py 11 | matcha/cli.py 12 | matcha/train.py 13 | matcha/data/__init__.py 14 | matcha/data/text_mel_datamodule.py 15 | matcha/data/components/__init__.py 16 | matcha/hifigan/README.md 17 | matcha/hifigan/__init__.py 18 | matcha/hifigan/config.py 19 | matcha/hifigan/denoiser.py 20 | matcha/hifigan/env.py 21 | matcha/hifigan/meldataset.py 22 | matcha/hifigan/models.py 23 | matcha/hifigan/xutils.py 24 | matcha/models/__init__.py 25 | matcha/models/baselightningmodule.py 26 | matcha/models/matcha_tts.py 27 | matcha/models/components/__init__.py 28 | matcha/models/components/decoder.py 29 | matcha/models/components/flow_matching.py 30 | matcha/models/components/text_encoder.py 31 | matcha/models/components/transformer.py 32 | matcha/onnx/__init__.py 33 | matcha/onnx/export.py 34 | matcha/onnx/infer.py 35 | matcha/text/__init__.py 36 | matcha/text/cleaners.py 37 | matcha/text/numbers.py 38 | matcha/text/symbols.py 39 | matcha/utils/__init__.py 40 | matcha/utils/audio.py 41 | matcha/utils/generate_data_statistics.py 42 | matcha/utils/instantiators.py 43 | matcha/utils/logging_utils.py 44 | matcha/utils/model.py 45 | matcha/utils/pylogger.py 46 | matcha/utils/rich_utils.py 47 | matcha/utils/utils.py 48 | matcha/utils/monotonic_align/__init__.py 49 | matcha/utils/monotonic_align/core.c 50 | matcha/utils/monotonic_align/core.pyx 51 | 
matcha/utils/monotonic_align/setup.py 52 | matcha_tts.egg-info/PKG-INFO 53 | matcha_tts.egg-info/SOURCES.txt 54 | matcha_tts.egg-info/dependency_links.txt 55 | matcha_tts.egg-info/entry_points.txt 56 | matcha_tts.egg-info/requires.txt 57 | matcha_tts.egg-info/top_level.txt -------------------------------------------------------------------------------- /third_party/Matcha-TTS/configs/train.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | # specify here default configuration 4 | # order of defaults determines the order in which configs override each other 5 | defaults: 6 | - _self_ 7 | - data: ljspeech 8 | - model: matcha 9 | - callbacks: default 10 | - logger: tensorboard # set logger here or use command line (e.g. `python train.py logger=tensorboard`) 11 | - trainer: default 12 | - paths: default 13 | - extras: default 14 | - hydra: default 15 | 16 | # experiment configs allow for version control of specific hyperparameters 17 | # e.g. best hyperparameters for given model and datamodule 18 | - experiment: null 19 | 20 | # config for hyperparameter optimization 21 | - hparams_search: null 22 | 23 | # optional local config for machine/user specific settings 24 | # it's optional since it doesn't need to exist and is excluded from version control 25 | - optional local: default 26 | 27 | # debugging config (enable through command line, e.g. `python train.py debug=default) 28 | - debug: null 29 | 30 | # task name, determines output directory path 31 | task_name: "train" 32 | 33 | run_name: ??? 34 | 35 | # tags to help you identify your experiments 36 | # you can overwrite this in experiment configs 37 | # overwrite from command line with `python train.py tags="[first_tag, second_tag]"` 38 | tags: ["dev"] 39 | 40 | # set False to skip model training 41 | train: True 42 | 43 | # evaluate on test set, using best model weights achieved during training 44 | # lightning chooses best weights based on the metric specified in checkpoint callback 45 | test: True 46 | 47 | # simply provide checkpoint path to resume training 48 | ckpt_path: null 49 | 50 | # seed for random number generators in pytorch, numpy and python.random 51 | seed: 1234 52 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/matcha/text/__init__.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/keithito/tacotron """ 2 | from matcha.text import cleaners 3 | from matcha.text.symbols import symbols 4 | 5 | # Mappings from symbol to numeric ID and vice versa: 6 | _symbol_to_id = {s: i for i, s in enumerate(symbols)} 7 | _id_to_symbol = {i: s for i, s in enumerate(symbols)} # pylint: disable=unnecessary-comprehension 8 | 9 | 10 | def text_to_sequence(text, cleaner_names): 11 | """Converts a string of text to a sequence of IDs corresponding to the symbols in the text. 12 | Args: 13 | text: string to convert to a sequence 14 | cleaner_names: names of the cleaner functions to run the text through 15 | Returns: 16 | List of integers corresponding to the symbols in the text 17 | """ 18 | sequence = [] 19 | 20 | clean_text = _clean_text(text, cleaner_names) 21 | for symbol in clean_text: 22 | symbol_id = _symbol_to_id[symbol] 23 | sequence += [symbol_id] 24 | return sequence 25 | 26 | 27 | def cleaned_text_to_sequence(cleaned_text): 28 | """Converts a string of text to a sequence of IDs corresponding to the symbols in the text. 
29 | Args: 30 | text: string to convert to a sequence 31 | Returns: 32 | List of integers corresponding to the symbols in the text 33 | """ 34 | sequence = [_symbol_to_id[symbol] for symbol in cleaned_text] 35 | return sequence 36 | 37 | 38 | def sequence_to_text(sequence): 39 | """Converts a sequence of IDs back to a string""" 40 | result = "" 41 | for symbol_id in sequence: 42 | s = _id_to_symbol[symbol_id] 43 | result += s 44 | return result 45 | 46 | 47 | def _clean_text(text, cleaner_names): 48 | for name in cleaner_names: 49 | cleaner = getattr(cleaners, name) 50 | if not cleaner: 51 | raise Exception("Unknown cleaner: %s" % name) 52 | text = cleaner(text) 53 | return text 54 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/matcha/utils/logging_utils.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Dict 2 | 3 | from lightning.pytorch.utilities import rank_zero_only 4 | from omegaconf import OmegaConf 5 | 6 | from matcha.utils import pylogger 7 | 8 | log = pylogger.get_pylogger(__name__) 9 | 10 | 11 | @rank_zero_only 12 | def log_hyperparameters(object_dict: Dict[str, Any]) -> None: 13 | """Controls which config parts are saved by Lightning loggers. 14 | 15 | Additionally saves: 16 | - Number of model parameters 17 | 18 | :param object_dict: A dictionary containing the following objects: 19 | - `"cfg"`: A DictConfig object containing the main config. 20 | - `"model"`: The Lightning model. 21 | - `"trainer"`: The Lightning trainer. 22 | """ 23 | hparams = {} 24 | 25 | cfg = OmegaConf.to_container(object_dict["cfg"]) 26 | model = object_dict["model"] 27 | trainer = object_dict["trainer"] 28 | 29 | if not trainer.logger: 30 | log.warning("Logger not found! Skipping hyperparameter logging...") 31 | return 32 | 33 | hparams["model"] = cfg["model"] 34 | 35 | # save number of model parameters 36 | hparams["model/params/total"] = sum(p.numel() for p in model.parameters()) 37 | hparams["model/params/trainable"] = sum(p.numel() for p in model.parameters() if p.requires_grad) 38 | hparams["model/params/non_trainable"] = sum(p.numel() for p in model.parameters() if not p.requires_grad) 39 | 40 | hparams["data"] = cfg["data"] 41 | hparams["trainer"] = cfg["trainer"] 42 | 43 | hparams["callbacks"] = cfg.get("callbacks") 44 | hparams["extras"] = cfg.get("extras") 45 | 46 | hparams["task_name"] = cfg.get("task_name") 47 | hparams["tags"] = cfg.get("tags") 48 | hparams["ckpt_path"] = cfg.get("ckpt_path") 49 | hparams["seed"] = cfg.get("seed") 50 | 51 | # send hparams to all loggers 52 | for logger in trainer.loggers: 53 | logger.log_hyperparams(hparams) 54 | -------------------------------------------------------------------------------- /cosyvoice/flow/length_regulator.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Zhihao Du) 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | from typing import Tuple 15 | import torch.nn as nn 16 | from torch.nn import functional as F 17 | from cosyvoice.utils.mask import make_pad_mask 18 | 19 | 20 | class InterpolateRegulator(nn.Module): 21 | def __init__( 22 | self, 23 | channels: int, 24 | sampling_ratios: Tuple, 25 | out_channels: int = None, 26 | groups: int = 1, 27 | ): 28 | super().__init__() 29 | self.sampling_ratios = sampling_ratios 30 | out_channels = out_channels or channels 31 | model = nn.ModuleList([]) 32 | if len(sampling_ratios) > 0: 33 | for _ in sampling_ratios: 34 | module = nn.Conv1d(channels, channels, 3, 1, 1) 35 | norm = nn.GroupNorm(groups, channels) 36 | act = nn.Mish() 37 | model.extend([module, norm, act]) 38 | model.append( 39 | nn.Conv1d(channels, out_channels, 1, 1) 40 | ) 41 | self.model = nn.Sequential(*model) 42 | 43 | def forward(self, x, ylens=None): 44 | # x in (B, T, D) 45 | mask = (~make_pad_mask(ylens)).to(x).unsqueeze(-1) 46 | x = F.interpolate(x.transpose(1, 2).contiguous(), size=ylens.max(), mode='nearest') 47 | out = self.model(x).transpose(1, 2).contiguous() 48 | olens = ylens 49 | return out * mask, olens 50 | -------------------------------------------------------------------------------- /cosyvoice/utils/file_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang) 2 | # 2024 Alibaba Inc (authors: Xiang Lyu) 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | import json 17 | import torchaudio 18 | 19 | 20 | def read_lists(list_file): 21 | lists = [] 22 | with open(list_file, 'r', encoding='utf8') as fin: 23 | for line in fin: 24 | lists.append(line.strip()) 25 | return lists 26 | 27 | def read_json_lists(list_file): 28 | lists = read_lists(list_file) 29 | results = {} 30 | for fn in lists: 31 | with open(fn, 'r', encoding='utf8') as fin: 32 | results.update(json.load(fin)) 33 | return results 34 | 35 | def load_wav(wav, target_sr): 36 | speech, sample_rate = torchaudio.load(wav) 37 | speech = speech.mean(dim=0, keepdim=True) 38 | if sample_rate != target_sr: 39 | assert sample_rate > target_sr, 'wav sample rate {} must be greater than {}'.format(sample_rate, target_sr) 40 | speech = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=target_sr)(speech) 41 | return speech 42 | 43 | def speed_change(waveform, sample_rate, speed_factor: str): 44 | effects = [ 45 | ["tempo", speed_factor], # speed_factor 46 | ["rate", f"{sample_rate}"] 47 | ] 48 | augmented_waveform, new_sample_rate = torchaudio.sox_effects.apply_effects_tensor( 49 | waveform, 50 | sample_rate, 51 | effects 52 | ) 53 | return augmented_waveform, new_sample_rate 54 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/configs/hparams_search/mnist_optuna.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | # example hyperparameter optimization of some experiment with Optuna: 4 | # python train.py -m hparams_search=mnist_optuna experiment=example 5 | 6 | defaults: 7 | - override /hydra/sweeper: optuna 8 | 9 | # choose metric which will be optimized by Optuna 10 | # make sure this is the correct name of some metric logged in lightning module! 
11 | optimized_metric: "val/acc_best" 12 | 13 | # here we define Optuna hyperparameter search 14 | # it optimizes for value returned from function with @hydra.main decorator 15 | # docs: https://hydra.cc/docs/next/plugins/optuna_sweeper 16 | hydra: 17 | mode: "MULTIRUN" # set hydra to multirun by default if this config is attached 18 | 19 | sweeper: 20 | _target_: hydra_plugins.hydra_optuna_sweeper.optuna_sweeper.OptunaSweeper 21 | 22 | # storage URL to persist optimization results 23 | # for example, you can use SQLite if you set 'sqlite:///example.db' 24 | storage: null 25 | 26 | # name of the study to persist optimization results 27 | study_name: null 28 | 29 | # number of parallel workers 30 | n_jobs: 1 31 | 32 | # 'minimize' or 'maximize' the objective 33 | direction: maximize 34 | 35 | # total number of runs that will be executed 36 | n_trials: 20 37 | 38 | # choose Optuna hyperparameter sampler 39 | # you can choose bayesian sampler (tpe), random search (without optimization), grid sampler, and others 40 | # docs: https://optuna.readthedocs.io/en/stable/reference/samplers.html 41 | sampler: 42 | _target_: optuna.samplers.TPESampler 43 | seed: 1234 44 | n_startup_trials: 10 # number of random sampling runs before optimization starts 45 | 46 | # define hyperparameter search space 47 | params: 48 | model.optimizer.lr: interval(0.0001, 0.1) 49 | data.batch_size: choice(32, 64, 128, 256) 50 | model.net.lin1_size: choice(64, 128, 256) 51 | model.net.lin2_size: choice(64, 128, 256) 52 | model.net.lin3_size: choice(32, 64, 128, 256) 53 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/matcha/utils/instantiators.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | import hydra 4 | from lightning import Callback 5 | from lightning.pytorch.loggers import Logger 6 | from omegaconf import DictConfig 7 | 8 | from matcha.utils import pylogger 9 | 10 | log = pylogger.get_pylogger(__name__) 11 | 12 | 13 | def instantiate_callbacks(callbacks_cfg: DictConfig) -> List[Callback]: 14 | """Instantiates callbacks from config. 15 | 16 | :param callbacks_cfg: A DictConfig object containing callback configurations. 17 | :return: A list of instantiated callbacks. 18 | """ 19 | callbacks: List[Callback] = [] 20 | 21 | if not callbacks_cfg: 22 | log.warning("No callback configs found! Skipping..") 23 | return callbacks 24 | 25 | if not isinstance(callbacks_cfg, DictConfig): 26 | raise TypeError("Callbacks config must be a DictConfig!") 27 | 28 | for _, cb_conf in callbacks_cfg.items(): 29 | if isinstance(cb_conf, DictConfig) and "_target_" in cb_conf: 30 | log.info(f"Instantiating callback <{cb_conf._target_}>") # pylint: disable=protected-access 31 | callbacks.append(hydra.utils.instantiate(cb_conf)) 32 | 33 | return callbacks 34 | 35 | 36 | def instantiate_loggers(logger_cfg: DictConfig) -> List[Logger]: 37 | """Instantiates loggers from config. 38 | 39 | :param logger_cfg: A DictConfig object containing logger configurations. 40 | :return: A list of instantiated loggers. 41 | """ 42 | logger: List[Logger] = [] 43 | 44 | if not logger_cfg: 45 | log.warning("No logger configs found! 
Skipping...") 46 | return logger 47 | 48 | if not isinstance(logger_cfg, DictConfig): 49 | raise TypeError("Logger config must be a DictConfig!") 50 | 51 | for _, lg_conf in logger_cfg.items(): 52 | if isinstance(lg_conf, DictConfig) and "_target_" in lg_conf: 53 | log.info(f"Instantiating logger <{lg_conf._target_}>") # pylint: disable=protected-access 54 | logger.append(hydra.utils.instantiate(lg_conf)) 55 | 56 | return logger 57 | -------------------------------------------------------------------------------- /cosyvoice/hifigan/f0_predictor.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Kai Hu) 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | import torch 15 | import torch.nn as nn 16 | from torch.nn.utils import weight_norm 17 | 18 | 19 | class ConvRNNF0Predictor(nn.Module): 20 | def __init__(self, 21 | num_class: int = 1, 22 | in_channels: int = 80, 23 | cond_channels: int = 512 24 | ): 25 | super().__init__() 26 | 27 | self.num_class = num_class 28 | self.condnet = nn.Sequential( 29 | weight_norm( 30 | nn.Conv1d(in_channels, cond_channels, kernel_size=3, padding=1) 31 | ), 32 | nn.ELU(), 33 | weight_norm( 34 | nn.Conv1d(cond_channels, cond_channels, kernel_size=3, padding=1) 35 | ), 36 | nn.ELU(), 37 | weight_norm( 38 | nn.Conv1d(cond_channels, cond_channels, kernel_size=3, padding=1) 39 | ), 40 | nn.ELU(), 41 | weight_norm( 42 | nn.Conv1d(cond_channels, cond_channels, kernel_size=3, padding=1) 43 | ), 44 | nn.ELU(), 45 | weight_norm( 46 | nn.Conv1d(cond_channels, cond_channels, kernel_size=3, padding=1) 47 | ), 48 | nn.ELU(), 49 | ) 50 | self.classifier = nn.Linear(in_features=cond_channels, out_features=self.num_class) 51 | 52 | def forward(self, x: torch.Tensor) -> torch.Tensor: 53 | x = self.condnet(x) 54 | x = x.transpose(1, 2) 55 | return torch.abs(self.classifier(x).squeeze(-1)) 56 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/matcha/text/numbers.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/keithito/tacotron """ 2 | 3 | import re 4 | 5 | import inflect 6 | 7 | _inflect = inflect.engine() 8 | _comma_number_re = re.compile(r"([0-9][0-9\,]+[0-9])") 9 | _decimal_number_re = re.compile(r"([0-9]+\.[0-9]+)") 10 | _pounds_re = re.compile(r"£([0-9\,]*[0-9]+)") 11 | _dollars_re = re.compile(r"\$([0-9\.\,]*[0-9]+)") 12 | _ordinal_re = re.compile(r"[0-9]+(st|nd|rd|th)") 13 | _number_re = re.compile(r"[0-9]+") 14 | 15 | 16 | def _remove_commas(m): 17 | return m.group(1).replace(",", "") 18 | 19 | 20 | def _expand_decimal_point(m): 21 | return m.group(1).replace(".", " point ") 22 | 23 | 24 | def _expand_dollars(m): 25 | match = m.group(1) 26 | parts = match.split(".") 27 | if len(parts) > 2: 28 | return match + " dollars" 29 | dollars = int(parts[0]) if parts[0] else 0 30 | cents = int(parts[1]) if len(parts) > 1 and 
parts[1] else 0 31 | if dollars and cents: 32 | dollar_unit = "dollar" if dollars == 1 else "dollars" 33 | cent_unit = "cent" if cents == 1 else "cents" 34 | return f"{dollars} {dollar_unit}, {cents} {cent_unit}" 35 | elif dollars: 36 | dollar_unit = "dollar" if dollars == 1 else "dollars" 37 | return f"{dollars} {dollar_unit}" 38 | elif cents: 39 | cent_unit = "cent" if cents == 1 else "cents" 40 | return f"{cents} {cent_unit}" 41 | else: 42 | return "zero dollars" 43 | 44 | 45 | def _expand_ordinal(m): 46 | return _inflect.number_to_words(m.group(0)) 47 | 48 | 49 | def _expand_number(m): 50 | num = int(m.group(0)) 51 | if num > 1000 and num < 3000: 52 | if num == 2000: 53 | return "two thousand" 54 | elif num > 2000 and num < 2010: 55 | return "two thousand " + _inflect.number_to_words(num % 100) 56 | elif num % 100 == 0: 57 | return _inflect.number_to_words(num // 100) + " hundred" 58 | else: 59 | return _inflect.number_to_words(num, andword="", zero="oh", group=2).replace(", ", " ") 60 | else: 61 | return _inflect.number_to_words(num, andword="") 62 | 63 | 64 | def normalize_numbers(text): 65 | text = re.sub(_comma_number_re, _remove_commas, text) 66 | text = re.sub(_pounds_re, r"\1 pounds", text) 67 | text = re.sub(_dollars_re, _expand_dollars, text) 68 | text = re.sub(_decimal_number_re, _expand_decimal_point, text) 69 | text = re.sub(_ordinal_re, _expand_ordinal, text) 70 | text = re.sub(_number_re, _expand_number, text) 71 | return text 72 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/matcha/utils/audio.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.utils.data 4 | from librosa.filters import mel as librosa_mel_fn 5 | from scipy.io.wavfile import read 6 | 7 | MAX_WAV_VALUE = 32768.0 8 | 9 | 10 | def load_wav(full_path): 11 | sampling_rate, data = read(full_path) 12 | return data, sampling_rate 13 | 14 | 15 | def dynamic_range_compression(x, C=1, clip_val=1e-5): 16 | return np.log(np.clip(x, a_min=clip_val, a_max=None) * C) 17 | 18 | 19 | def dynamic_range_decompression(x, C=1): 20 | return np.exp(x) / C 21 | 22 | 23 | def dynamic_range_compression_torch(x, C=1, clip_val=1e-5): 24 | return torch.log(torch.clamp(x, min=clip_val) * C) 25 | 26 | 27 | def dynamic_range_decompression_torch(x, C=1): 28 | return torch.exp(x) / C 29 | 30 | 31 | def spectral_normalize_torch(magnitudes): 32 | output = dynamic_range_compression_torch(magnitudes) 33 | return output 34 | 35 | 36 | def spectral_de_normalize_torch(magnitudes): 37 | output = dynamic_range_decompression_torch(magnitudes) 38 | return output 39 | 40 | 41 | mel_basis = {} 42 | hann_window = {} 43 | 44 | 45 | def mel_spectrogram(y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False): 46 | if torch.min(y) < -1.0: 47 | print("min value is ", torch.min(y)) 48 | if torch.max(y) > 1.0: 49 | print("max value is ", torch.max(y)) 50 | 51 | global mel_basis, hann_window # pylint: disable=global-statement 52 | if f"{str(fmax)}_{str(y.device)}" not in mel_basis: 53 | mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax) 54 | mel_basis[str(fmax) + "_" + str(y.device)] = torch.from_numpy(mel).float().to(y.device) 55 | hann_window[str(y.device)] = torch.hann_window(win_size).to(y.device) 56 | 57 | y = torch.nn.functional.pad( 58 | y.unsqueeze(1), (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)), mode="reflect" 59 | ) 60 | y = 
y.squeeze(1) 61 | 62 | spec = torch.view_as_real( 63 | torch.stft( 64 | y, 65 | n_fft, 66 | hop_length=hop_size, 67 | win_length=win_size, 68 | window=hann_window[str(y.device)], 69 | center=center, 70 | pad_mode="reflect", 71 | normalized=False, 72 | onesided=True, 73 | return_complex=True, 74 | ) 75 | ) 76 | 77 | spec = torch.sqrt(spec.pow(2).sum(-1) + (1e-9)) 78 | 79 | spec = torch.matmul(mel_basis[str(fmax) + "_" + str(y.device)], spec) 80 | spec = spectral_normalize_torch(spec) 81 | 82 | return spec 83 | -------------------------------------------------------------------------------- /batch_inference.py: -------------------------------------------------------------------------------- 1 | import os 2 | import subprocess 3 | import argparse 4 | import pandas as pd 5 | from datasets import Dataset 6 | 7 | def process_batch(csv_file, speaker_prompt_audio_folder, output_audio_folder): 8 | # Load CSV with pandas 9 | data = pd.read_csv(csv_file) 10 | 11 | # Transform pandas DataFrame to HuggingFace Dataset 12 | dataset = Dataset.from_pandas(data) 13 | 14 | def gen_audio(row): 15 | speaker_prompt_audio_path = os.path.join(speaker_prompt_audio_folder, f"{row['speaker_prompt_audio_filename']}.wav") 16 | speaker_prompt_text_transcription = row['speaker_prompt_text_transcription'] 17 | content_to_synthesize = row['content_to_synthesize'] 18 | output_audio_path = os.path.join(output_audio_folder, f"{row['output_audio_filename']}.wav") 19 | 20 | if not os.path.exists(speaker_prompt_audio_path): 21 | print(f"File {speaker_prompt_audio_path} does not exist") 22 | return row #{"status": "failed", "reason": "file not found"} 23 | 24 | command = [ 25 | "python", "single_inference.py", 26 | "--speaker_prompt_audio_path", speaker_prompt_audio_path, 27 | "--speaker_prompt_text_transcription", speaker_prompt_text_transcription, 28 | "--content_to_synthesize", content_to_synthesize, 29 | "--output_path", output_audio_path 30 | ] 31 | 32 | try: 33 | print(f"Processing: {speaker_prompt_audio_path}") 34 | subprocess.run(command, check=True) 35 | print(f"Generated: {output_audio_path}") 36 | return row #{"status": "success", "output": gen_voice_file_name} 37 | except subprocess.CalledProcessError as e: 38 | print(f"Failed to generate {speaker_prompt_audio_path}, error: {e}") 39 | return row #{"status": "failed", "reason": str(e)} 40 | 41 | dataset = dataset.map(gen_audio, num_proc = 1) 42 | 43 | def main(): 44 | parser = argparse.ArgumentParser(description="Batch process audio generation.") 45 | parser.add_argument("--csv_file", required=True, help="Path to the CSV file containing input data.") 46 | parser.add_argument("--speaker_prompt_audio_folder", required=True, help="Path to the folder containing speaker prompt audio files.") 47 | parser.add_argument("--output_audio_folder", required=True, help="Path to the folder where results will be stored.") 48 | 49 | args = parser.parse_args() 50 | 51 | os.makedirs(args.output_audio_folder, exist_ok=True) 52 | 53 | process_batch( 54 | csv_file=args.csv_file, 55 | speaker_prompt_audio_folder=args.speaker_prompt_audio_folder, 56 | output_audio_folder=args.output_audio_folder, 57 | ) 58 | 59 | if __name__ == "__main__": 60 | main() 61 | -------------------------------------------------------------------------------- /cosyvoice/utils/class_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright [2023-11-28] 2 | # 2024 Alibaba Inc (authors: Xiang Lyu) 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the 
"License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | import torch 16 | 17 | from cosyvoice.transformer.activation import Swish 18 | from cosyvoice.transformer.subsampling import ( 19 | LinearNoSubsampling, 20 | EmbedinigNoSubsampling, 21 | Conv1dSubsampling2, 22 | Conv2dSubsampling4, 23 | Conv2dSubsampling6, 24 | Conv2dSubsampling8, 25 | ) 26 | from cosyvoice.transformer.embedding import (PositionalEncoding, 27 | RelPositionalEncoding, 28 | WhisperPositionalEncoding, 29 | LearnablePositionalEncoding, 30 | NoPositionalEncoding) 31 | from cosyvoice.transformer.attention import (MultiHeadedAttention, 32 | RelPositionMultiHeadedAttention) 33 | from cosyvoice.transformer.embedding import EspnetRelPositionalEncoding 34 | from cosyvoice.transformer.subsampling import LegacyLinearNoSubsampling 35 | 36 | 37 | COSYVOICE_ACTIVATION_CLASSES = { 38 | "hardtanh": torch.nn.Hardtanh, 39 | "tanh": torch.nn.Tanh, 40 | "relu": torch.nn.ReLU, 41 | "selu": torch.nn.SELU, 42 | "swish": getattr(torch.nn, "SiLU", Swish), 43 | "gelu": torch.nn.GELU, 44 | } 45 | 46 | COSYVOICE_SUBSAMPLE_CLASSES = { 47 | "linear": LinearNoSubsampling, 48 | "linear_legacy": LegacyLinearNoSubsampling, 49 | "embed": EmbedinigNoSubsampling, 50 | "conv1d2": Conv1dSubsampling2, 51 | "conv2d": Conv2dSubsampling4, 52 | "conv2d6": Conv2dSubsampling6, 53 | "conv2d8": Conv2dSubsampling8, 54 | 'paraformer_dummy': torch.nn.Identity 55 | } 56 | 57 | COSYVOICE_EMB_CLASSES = { 58 | "embed": PositionalEncoding, 59 | "abs_pos": PositionalEncoding, 60 | "rel_pos": RelPositionalEncoding, 61 | "rel_pos_espnet": EspnetRelPositionalEncoding, 62 | "no_pos": NoPositionalEncoding, 63 | "abs_pos_whisper": WhisperPositionalEncoding, 64 | "embed_learnable_pe": LearnablePositionalEncoding, 65 | } 66 | 67 | COSYVOICE_ATTENTION_CLASSES = { 68 | "selfattn": MultiHeadedAttention, 69 | "rel_selfattn": RelPositionMultiHeadedAttention, 70 | } 71 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/matcha/hifigan/denoiser.py: -------------------------------------------------------------------------------- 1 | # Code modified from Rafael Valle's implementation https://github.com/NVIDIA/waveglow/blob/5bc2a53e20b3b533362f974cfa1ea0267ae1c2b1/denoiser.py 2 | 3 | """Waveglow style denoiser can be used to remove the artifacts from the HiFiGAN generated audio.""" 4 | import torch 5 | 6 | 7 | class Denoiser(torch.nn.Module): 8 | """Removes model bias from audio produced with waveglow""" 9 | 10 | def __init__(self, vocoder, filter_length=1024, n_overlap=4, win_length=1024, mode="zeros"): 11 | super().__init__() 12 | self.filter_length = filter_length 13 | self.hop_length = int(filter_length / n_overlap) 14 | self.win_length = win_length 15 | 16 | dtype, device = next(vocoder.parameters()).dtype, next(vocoder.parameters()).device 17 | self.device = device 18 | if mode == "zeros": 19 | mel_input = torch.zeros((1, 80, 88), dtype=dtype, device=device) 20 | elif mode == "normal": 21 | mel_input = torch.randn((1, 80, 88), dtype=dtype, device=device) 22 | 
else: 23 | raise Exception(f"Mode {mode} is not supported") 24 | 25 | def stft_fn(audio, n_fft, hop_length, win_length, window): 26 | spec = torch.stft( 27 | audio, 28 | n_fft=n_fft, 29 | hop_length=hop_length, 30 | win_length=win_length, 31 | window=window, 32 | return_complex=True, 33 | ) 34 | spec = torch.view_as_real(spec) 35 | return torch.sqrt(spec.pow(2).sum(-1)), torch.atan2(spec[..., -1], spec[..., 0]) 36 | 37 | self.stft = lambda x: stft_fn( 38 | audio=x, 39 | n_fft=self.filter_length, 40 | hop_length=self.hop_length, 41 | win_length=self.win_length, 42 | window=torch.hann_window(self.win_length, device=device), 43 | ) 44 | self.istft = lambda x, y: torch.istft( 45 | torch.complex(x * torch.cos(y), x * torch.sin(y)), 46 | n_fft=self.filter_length, 47 | hop_length=self.hop_length, 48 | win_length=self.win_length, 49 | window=torch.hann_window(self.win_length, device=device), 50 | ) 51 | 52 | with torch.no_grad(): 53 | bias_audio = vocoder(mel_input).float().squeeze(0) 54 | bias_spec, _ = self.stft(bias_audio) 55 | 56 | self.register_buffer("bias_spec", bias_spec[:, :, 0][:, :, None]) 57 | 58 | @torch.inference_mode() 59 | def forward(self, audio, strength=0.0005): 60 | audio_spec, audio_angles = self.stft(audio) 61 | audio_spec_denoised = audio_spec - self.bias_spec.to(audio.device) * strength 62 | audio_spec_denoised = torch.clamp(audio_spec_denoised, 0.0) 63 | audio_denoised = self.istft(audio_spec_denoised, audio_angles) 64 | return audio_denoised 65 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/matcha/utils/model.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/jaywalnut310/glow-tts """ 2 | 3 | import numpy as np 4 | import torch 5 | 6 | 7 | def sequence_mask(length, max_length=None): 8 | if max_length is None: 9 | max_length = length.max() 10 | x = torch.arange(max_length, dtype=length.dtype, device=length.device) 11 | return x.unsqueeze(0) < length.unsqueeze(1) 12 | 13 | 14 | def fix_len_compatibility(length, num_downsamplings_in_unet=2): 15 | factor = torch.scalar_tensor(2).pow(num_downsamplings_in_unet) 16 | length = (length / factor).ceil() * factor 17 | if not torch.onnx.is_in_onnx_export(): 18 | return length.int().item() 19 | else: 20 | return length 21 | 22 | 23 | def convert_pad_shape(pad_shape): 24 | inverted_shape = pad_shape[::-1] 25 | pad_shape = [item for sublist in inverted_shape for item in sublist] 26 | return pad_shape 27 | 28 | 29 | def generate_path(duration, mask): 30 | device = duration.device 31 | 32 | b, t_x, t_y = mask.shape 33 | cum_duration = torch.cumsum(duration, 1) 34 | path = torch.zeros(b, t_x, t_y, dtype=mask.dtype).to(device=device) 35 | 36 | cum_duration_flat = cum_duration.view(b * t_x) 37 | path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype) 38 | path = path.view(b, t_x, t_y) 39 | path = path - torch.nn.functional.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1] 40 | path = path * mask 41 | return path 42 | 43 | 44 | def duration_loss(logw, logw_, lengths): 45 | loss = torch.sum((logw - logw_) ** 2) / torch.sum(lengths) 46 | return loss 47 | 48 | 49 | def normalize(data, mu, std): 50 | if not isinstance(mu, (float, int)): 51 | if isinstance(mu, list): 52 | mu = torch.tensor(mu, dtype=data.dtype, device=data.device) 53 | elif isinstance(mu, torch.Tensor): 54 | mu = mu.to(data.device) 55 | elif isinstance(mu, np.ndarray): 56 | mu = torch.from_numpy(mu).to(data.device) 57 | mu = 
mu.unsqueeze(-1) 58 | 59 | if not isinstance(std, (float, int)): 60 | if isinstance(std, list): 61 | std = torch.tensor(std, dtype=data.dtype, device=data.device) 62 | elif isinstance(std, torch.Tensor): 63 | std = std.to(data.device) 64 | elif isinstance(std, np.ndarray): 65 | std = torch.from_numpy(std).to(data.device) 66 | std = std.unsqueeze(-1) 67 | 68 | return (data - mu) / std 69 | 70 | 71 | def denormalize(data, mu, std): 72 | if not isinstance(mu, float): 73 | if isinstance(mu, list): 74 | mu = torch.tensor(mu, dtype=data.dtype, device=data.device) 75 | elif isinstance(mu, torch.Tensor): 76 | mu = mu.to(data.device) 77 | elif isinstance(mu, np.ndarray): 78 | mu = torch.from_numpy(mu).to(data.device) 79 | mu = mu.unsqueeze(-1) 80 | 81 | if not isinstance(std, float): 82 | if isinstance(std, list): 83 | std = torch.tensor(std, dtype=data.dtype, device=data.device) 84 | elif isinstance(std, torch.Tensor): 85 | std = std.to(data.device) 86 | elif isinstance(std, np.ndarray): 87 | std = torch.from_numpy(std).to(data.device) 88 | std = std.unsqueeze(-1) 89 | 90 | return data * std + mu 91 | -------------------------------------------------------------------------------- /cosyvoice/transformer/activation.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 Johns Hopkins University (Shinji Watanabe) 2 | # 2020 Northwestern Polytechnical University (Pengcheng Guo) 3 | # 2020 Mobvoi Inc (Binbin Zhang) 4 | # 2024 Alibaba Inc (Xiang Lyu) 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | """Swish() activation function for Conformer.""" 18 | 19 | import torch 20 | from torch import nn, sin, pow 21 | from torch.nn import Parameter 22 | 23 | 24 | class Swish(torch.nn.Module): 25 | """Construct an Swish object.""" 26 | 27 | def forward(self, x: torch.Tensor) -> torch.Tensor: 28 | """Return Swish activation function.""" 29 | return x * torch.sigmoid(x) 30 | 31 | 32 | # Implementation adapted from https://github.com/EdwardDixon/snake under the MIT license. 33 | # LICENSE is in incl_licenses directory. 34 | class Snake(nn.Module): 35 | ''' 36 | Implementation of a sine-based periodic activation function 37 | Shape: 38 | - Input: (B, C, T) 39 | - Output: (B, C, T), same shape as the input 40 | Parameters: 41 | - alpha - trainable parameter 42 | References: 43 | - This activation function is from this paper by Liu Ziyin, Tilman Hartwig, Masahito Ueda: 44 | https://arxiv.org/abs/2006.08195 45 | Examples: 46 | >>> a1 = snake(256) 47 | >>> x = torch.randn(256) 48 | >>> x = a1(x) 49 | ''' 50 | def __init__(self, in_features, alpha=1.0, alpha_trainable=True, alpha_logscale=False): 51 | ''' 52 | Initialization. 53 | INPUT: 54 | - in_features: shape of the input 55 | - alpha: trainable parameter 56 | alpha is initialized to 1 by default, higher values = higher-frequency. 57 | alpha will be trained along with the rest of your model. 
58 | ''' 59 | super(Snake, self).__init__() 60 | self.in_features = in_features 61 | 62 | # initialize alpha 63 | self.alpha_logscale = alpha_logscale 64 | if self.alpha_logscale: # log scale alphas initialized to zeros 65 | self.alpha = Parameter(torch.zeros(in_features) * alpha) 66 | else: # linear scale alphas initialized to ones 67 | self.alpha = Parameter(torch.ones(in_features) * alpha) 68 | 69 | self.alpha.requires_grad = alpha_trainable 70 | 71 | self.no_div_by_zero = 0.000000001 72 | 73 | def forward(self, x): 74 | ''' 75 | Forward pass of the function. 76 | Applies the function to the input elementwise. 77 | Snake ∶= x + 1/a * sin^2 (xa) 78 | ''' 79 | alpha = self.alpha.unsqueeze(0).unsqueeze(-1) # line up with x to [B, C, T] 80 | if self.alpha_logscale: 81 | alpha = torch.exp(alpha) 82 | x = x + (1.0 / (alpha + self.no_div_by_zero)) * pow(sin(x * alpha), 2) 83 | 84 | return x 85 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/matcha/utils/rich_utils.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | from typing import Sequence 3 | 4 | import rich 5 | import rich.syntax 6 | import rich.tree 7 | from hydra.core.hydra_config import HydraConfig 8 | from lightning.pytorch.utilities import rank_zero_only 9 | from omegaconf import DictConfig, OmegaConf, open_dict 10 | from rich.prompt import Prompt 11 | 12 | from matcha.utils import pylogger 13 | 14 | log = pylogger.get_pylogger(__name__) 15 | 16 | 17 | @rank_zero_only 18 | def print_config_tree( 19 | cfg: DictConfig, 20 | print_order: Sequence[str] = ( 21 | "data", 22 | "model", 23 | "callbacks", 24 | "logger", 25 | "trainer", 26 | "paths", 27 | "extras", 28 | ), 29 | resolve: bool = False, 30 | save_to_file: bool = False, 31 | ) -> None: 32 | """Prints the contents of a DictConfig as a tree structure using the Rich library. 33 | 34 | :param cfg: A DictConfig composed by Hydra. 35 | :param print_order: Determines in what order config components are printed. Default is ``("data", "model", 36 | "callbacks", "logger", "trainer", "paths", "extras")``. 37 | :param resolve: Whether to resolve reference fields of DictConfig. Default is ``False``. 38 | :param save_to_file: Whether to export config to the hydra output folder. Default is ``False``. 39 | """ 40 | style = "dim" 41 | tree = rich.tree.Tree("CONFIG", style=style, guide_style=style) 42 | 43 | queue = [] 44 | 45 | # add fields from `print_order` to queue 46 | for field in print_order: 47 | _ = ( 48 | queue.append(field) 49 | if field in cfg 50 | else log.warning(f"Field '{field}' not found in config. 
Skipping '{field}' config printing...") 51 | ) 52 | 53 | # add all the other fields to queue (not specified in `print_order`) 54 | for field in cfg: 55 | if field not in queue: 56 | queue.append(field) 57 | 58 | # generate config tree from queue 59 | for field in queue: 60 | branch = tree.add(field, style=style, guide_style=style) 61 | 62 | config_group = cfg[field] 63 | if isinstance(config_group, DictConfig): 64 | branch_content = OmegaConf.to_yaml(config_group, resolve=resolve) 65 | else: 66 | branch_content = str(config_group) 67 | 68 | branch.add(rich.syntax.Syntax(branch_content, "yaml")) 69 | 70 | # print config tree 71 | rich.print(tree) 72 | 73 | # save config tree to file 74 | if save_to_file: 75 | with open(Path(cfg.paths.output_dir, "config_tree.log"), "w") as file: 76 | rich.print(tree, file=file) 77 | 78 | 79 | @rank_zero_only 80 | def enforce_tags(cfg: DictConfig, save_to_file: bool = False) -> None: 81 | """Prompts user to input tags from command line if no tags are provided in config. 82 | 83 | :param cfg: A DictConfig composed by Hydra. 84 | :param save_to_file: Whether to export tags to the hydra output folder. Default is ``False``. 85 | """ 86 | if not cfg.get("tags"): 87 | if "id" in HydraConfig().cfg.hydra.job: 88 | raise ValueError("Specify tags before launching a multirun!") 89 | 90 | log.warning("No tags provided in config. Prompting user to input tags...") 91 | tags = Prompt.ask("Enter a list of comma separated tags", default="dev") 92 | tags = [t.strip() for t in tags.split(",") if t != ""] 93 | 94 | with open_dict(cfg): 95 | cfg.tags = tags 96 | 97 | log.info(f"Tags: {cfg.tags}") 98 | 99 | if save_to_file: 100 | with open(Path(cfg.paths.output_dir, "tags.log"), "w") as file: 101 | rich.print(cfg.tags, file=file) 102 | -------------------------------------------------------------------------------- /cosyvoice/cli/model.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu) 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | import torch 15 | 16 | class CosyVoiceModel: 17 | 18 | def __init__(self, 19 | llm: torch.nn.Module, 20 | flow: torch.nn.Module, 21 | hift: torch.nn.Module): 22 | self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 23 | self.llm = llm 24 | self.flow = flow 25 | self.hift = hift 26 | 27 | def load(self, llm_model, flow_model, hift_model): 28 | self.llm.load_state_dict(torch.load(llm_model, map_location=self.device)) 29 | self.llm.to(self.device).eval() 30 | self.flow.load_state_dict(torch.load(flow_model, map_location=self.device)) 31 | self.flow.to(self.device).eval() 32 | self.hift.load_state_dict(torch.load(hift_model, map_location=self.device)) 33 | self.hift.to(self.device).eval() 34 | 35 | def inference(self, text, text_len, flow_embedding, llm_embedding=torch.zeros(0, 192), 36 | prompt_text=torch.zeros(1, 0, dtype=torch.int32), prompt_text_len=torch.zeros(1, dtype=torch.int32), 37 | llm_prompt_speech_token=torch.zeros(1, 0, dtype=torch.int32), llm_prompt_speech_token_len=torch.zeros(1, dtype=torch.int32), 38 | flow_prompt_speech_token=torch.zeros(1, 0, dtype=torch.int32), flow_prompt_speech_token_len=torch.zeros(1, dtype=torch.int32), 39 | prompt_speech_feat=torch.zeros(1, 0, 80), prompt_speech_feat_len=torch.zeros(1, dtype=torch.int32)): 40 | tts_speech_token = self.llm.inference(text=text.to(self.device), 41 | text_len=text_len.to(self.device), 42 | prompt_text=prompt_text.to(self.device), 43 | prompt_text_len=prompt_text_len.to(self.device), 44 | prompt_speech_token=llm_prompt_speech_token.to(self.device), 45 | prompt_speech_token_len=llm_prompt_speech_token_len.to(self.device), 46 | embedding=llm_embedding.to(self.device), 47 | beam_size=1, 48 | sampling=25, 49 | max_token_text_ratio=30, 50 | min_token_text_ratio=3) 51 | tts_mel = self.flow.inference(token=tts_speech_token, 52 | token_len=torch.tensor([tts_speech_token.size(1)], dtype=torch.int32).to(self.device), 53 | prompt_token=flow_prompt_speech_token.to(self.device), 54 | prompt_token_len=flow_prompt_speech_token_len.to(self.device), 55 | prompt_feat=prompt_speech_feat.to(self.device), 56 | prompt_feat_len=prompt_speech_feat_len.to(self.device), 57 | embedding=flow_embedding.to(self.device)) 58 | tts_speech = self.hift.inference(mel=tts_mel).cpu() 59 | torch.cuda.empty_cache() 60 | return {'tts_speech': tts_speech} 61 | -------------------------------------------------------------------------------- /cosyvoice/transformer/label_smoothing_loss.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019 Shigeki Karita 2 | # 2020 Mobvoi Inc (Binbin Zhang) 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Label smoothing module.""" 16 | 17 | import torch 18 | from torch import nn 19 | 20 | 21 | class LabelSmoothingLoss(nn.Module): 22 | """Label-smoothing loss. 
23 | 24 | In a standard CE loss, the label's data distribution is: 25 | [0,1,2] -> 26 | [ 27 | [1.0, 0.0, 0.0], 28 | [0.0, 1.0, 0.0], 29 | [0.0, 0.0, 1.0], 30 | ] 31 | 32 | In the smoothing version CE Loss,some probabilities 33 | are taken from the true label prob (1.0) and are divided 34 | among other labels. 35 | 36 | e.g. 37 | smoothing=0.1 38 | [0,1,2] -> 39 | [ 40 | [0.9, 0.05, 0.05], 41 | [0.05, 0.9, 0.05], 42 | [0.05, 0.05, 0.9], 43 | ] 44 | 45 | Args: 46 | size (int): the number of class 47 | padding_idx (int): padding class id which will be ignored for loss 48 | smoothing (float): smoothing rate (0.0 means the conventional CE) 49 | normalize_length (bool): 50 | normalize loss by sequence length if True 51 | normalize loss by batch size if False 52 | """ 53 | 54 | def __init__(self, 55 | size: int, 56 | padding_idx: int, 57 | smoothing: float, 58 | normalize_length: bool = False): 59 | """Construct an LabelSmoothingLoss object.""" 60 | super(LabelSmoothingLoss, self).__init__() 61 | self.criterion = nn.KLDivLoss(reduction="none") 62 | self.padding_idx = padding_idx 63 | self.confidence = 1.0 - smoothing 64 | self.smoothing = smoothing 65 | self.size = size 66 | self.normalize_length = normalize_length 67 | 68 | def forward(self, x: torch.Tensor, target: torch.Tensor) -> torch.Tensor: 69 | """Compute loss between x and target. 70 | 71 | The model outputs and data labels tensors are flatten to 72 | (batch*seqlen, class) shape and a mask is applied to the 73 | padding part which should not be calculated for loss. 74 | 75 | Args: 76 | x (torch.Tensor): prediction (batch, seqlen, class) 77 | target (torch.Tensor): 78 | target signal masked with self.padding_id (batch, seqlen) 79 | Returns: 80 | loss (torch.Tensor) : The KL loss, scalar float value 81 | """ 82 | assert x.size(2) == self.size 83 | batch_size = x.size(0) 84 | x = x.view(-1, self.size) 85 | target = target.view(-1) 86 | # use zeros_like instead of torch.no_grad() for true_dist, 87 | # since no_grad() can not be exported by JIT 88 | true_dist = torch.zeros_like(x) 89 | true_dist.fill_(self.smoothing / (self.size - 1)) 90 | ignore = target == self.padding_idx # (B,) 91 | total = len(target) - ignore.sum().item() 92 | target = target.masked_fill(ignore, 0) # avoid -1 index 93 | true_dist.scatter_(1, target.unsqueeze(1), self.confidence) 94 | kl = self.criterion(torch.log_softmax(x, dim=1), true_dist) 95 | denom = total if self.normalize_length else batch_size 96 | return kl.masked_fill(ignore.unsqueeze(1), 0).sum() / denom 97 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/matcha/utils/generate_data_statistics.py: -------------------------------------------------------------------------------- 1 | r""" 2 | The file creates a pickle file where the values needed for loading of dataset is stored and the model can load it 3 | when needed. 
4 | 5 | Parameters from hparam.py will be used 6 | """ 7 | import argparse 8 | import json 9 | import os 10 | import sys 11 | from pathlib import Path 12 | 13 | import rootutils 14 | import torch 15 | from hydra import compose, initialize 16 | from omegaconf import open_dict 17 | from tqdm.auto import tqdm 18 | 19 | from matcha.data.text_mel_datamodule import TextMelDataModule 20 | from matcha.utils.logging_utils import pylogger 21 | 22 | log = pylogger.get_pylogger(__name__) 23 | 24 | 25 | def compute_data_statistics(data_loader: torch.utils.data.DataLoader, out_channels: int): 26 | """Generate data mean and standard deviation helpful in data normalisation 27 | 28 | Args: 29 | data_loader (torch.utils.data.Dataloader): _description_ 30 | out_channels (int): mel spectrogram channels 31 | """ 32 | total_mel_sum = 0 33 | total_mel_sq_sum = 0 34 | total_mel_len = 0 35 | 36 | for batch in tqdm(data_loader, leave=False): 37 | mels = batch["y"] 38 | mel_lengths = batch["y_lengths"] 39 | 40 | total_mel_len += torch.sum(mel_lengths) 41 | total_mel_sum += torch.sum(mels) 42 | total_mel_sq_sum += torch.sum(torch.pow(mels, 2)) 43 | 44 | data_mean = total_mel_sum / (total_mel_len * out_channels) 45 | data_std = torch.sqrt((total_mel_sq_sum / (total_mel_len * out_channels)) - torch.pow(data_mean, 2)) 46 | 47 | return {"mel_mean": data_mean.item(), "mel_std": data_std.item()} 48 | 49 | 50 | def main(): 51 | parser = argparse.ArgumentParser() 52 | 53 | parser.add_argument( 54 | "-i", 55 | "--input-config", 56 | type=str, 57 | default="vctk.yaml", 58 | help="The name of the yaml config file under configs/data", 59 | ) 60 | 61 | parser.add_argument( 62 | "-b", 63 | "--batch-size", 64 | type=int, 65 | default="256", 66 | help="Can have increased batch size for faster computation", 67 | ) 68 | 69 | parser.add_argument( 70 | "-f", 71 | "--force", 72 | action="store_true", 73 | default=False, 74 | required=False, 75 | help="force overwrite the file", 76 | ) 77 | args = parser.parse_args() 78 | output_file = Path(args.input_config).with_suffix(".json") 79 | 80 | if os.path.exists(output_file) and not args.force: 81 | print("File already exists. Use -f to force overwrite") 82 | sys.exit(1) 83 | 84 | with initialize(version_base="1.3", config_path="../../configs/data"): 85 | cfg = compose(config_name=args.input_config, return_hydra_config=True, overrides=[]) 86 | 87 | root_path = rootutils.find_root(search_from=__file__, indicator=".project-root") 88 | 89 | with open_dict(cfg): 90 | del cfg["hydra"] 91 | del cfg["_target_"] 92 | cfg["data_statistics"] = None 93 | cfg["seed"] = 1234 94 | cfg["batch_size"] = args.batch_size 95 | cfg["train_filelist_path"] = str(os.path.join(root_path, cfg["train_filelist_path"])) 96 | cfg["valid_filelist_path"] = str(os.path.join(root_path, cfg["valid_filelist_path"])) 97 | 98 | text_mel_datamodule = TextMelDataModule(**cfg) 99 | text_mel_datamodule.setup() 100 | data_loader = text_mel_datamodule.train_dataloader() 101 | log.info("Dataloader loaded! 
Now computing stats...") 102 | params = compute_data_statistics(data_loader, cfg["n_feats"]) 103 | print(params) 104 | json.dump( 105 | params, 106 | open(output_file, "w"), 107 | ) 108 | 109 | 110 | if __name__ == "__main__": 111 | main() 112 | -------------------------------------------------------------------------------- /cosyvoice/utils/common.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) 2 | # 2024 Alibaba Inc (authors: Xiang Lyu) 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # Modified from ESPnet(https://github.com/espnet/espnet) 16 | """Unility functions for Transformer.""" 17 | 18 | from typing import List 19 | 20 | import torch 21 | 22 | IGNORE_ID = -1 23 | 24 | 25 | def pad_list(xs: List[torch.Tensor], pad_value: int): 26 | """Perform padding for the list of tensors. 27 | 28 | Args: 29 | xs (List): List of Tensors [(T_1, `*`), (T_2, `*`), ..., (T_B, `*`)]. 30 | pad_value (float): Value for padding. 31 | 32 | Returns: 33 | Tensor: Padded tensor (B, Tmax, `*`). 34 | 35 | Examples: 36 | >>> x = [torch.ones(4), torch.ones(2), torch.ones(1)] 37 | >>> x 38 | [tensor([1., 1., 1., 1.]), tensor([1., 1.]), tensor([1.])] 39 | >>> pad_list(x, 0) 40 | tensor([[1., 1., 1., 1.], 41 | [1., 1., 0., 0.], 42 | [1., 0., 0., 0.]]) 43 | 44 | """ 45 | max_len = max([len(item) for item in xs]) 46 | batchs = len(xs) 47 | ndim = xs[0].ndim 48 | if ndim == 1: 49 | pad_res = torch.zeros(batchs, 50 | max_len, 51 | dtype=xs[0].dtype, 52 | device=xs[0].device) 53 | elif ndim == 2: 54 | pad_res = torch.zeros(batchs, 55 | max_len, 56 | xs[0].shape[1], 57 | dtype=xs[0].dtype, 58 | device=xs[0].device) 59 | elif ndim == 3: 60 | pad_res = torch.zeros(batchs, 61 | max_len, 62 | xs[0].shape[1], 63 | xs[0].shape[2], 64 | dtype=xs[0].dtype, 65 | device=xs[0].device) 66 | else: 67 | raise ValueError(f"Unsupported ndim: {ndim}") 68 | pad_res.fill_(pad_value) 69 | for i in range(batchs): 70 | pad_res[i, :len(xs[i])] = xs[i] 71 | return pad_res 72 | 73 | 74 | def th_accuracy(pad_outputs: torch.Tensor, pad_targets: torch.Tensor, 75 | ignore_label: int) -> torch.Tensor: 76 | """Calculate accuracy. 77 | 78 | Args: 79 | pad_outputs (Tensor): Prediction tensors (B * Lmax, D). 80 | pad_targets (LongTensor): Target label tensors (B, Lmax). 81 | ignore_label (int): Ignore label id. 82 | 83 | Returns: 84 | torch.Tensor: Accuracy value (0.0 - 1.0). 
85 | 86 | """ 87 | pad_pred = pad_outputs.view(pad_targets.size(0), pad_targets.size(1), 88 | pad_outputs.size(1)).argmax(2) 89 | mask = pad_targets != ignore_label 90 | numerator = torch.sum( 91 | pad_pred.masked_select(mask) == pad_targets.masked_select(mask)) 92 | denominator = torch.sum(mask) 93 | return (numerator / denominator).detach() 94 | 95 | 96 | def get_padding(kernel_size, dilation=1): 97 | return int((kernel_size * dilation - dilation) / 2) 98 | 99 | 100 | def init_weights(m, mean=0.0, std=0.01): 101 | classname = m.__class__.__name__ 102 | if classname.find("Conv") != -1: 103 | m.weight.data.normal_(mean, std) 104 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/matcha/text/cleaners.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/keithito/tacotron 2 | 3 | Cleaners are transformations that run over the input text at both training and eval time. 4 | 5 | Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners" 6 | hyperparameter. Some cleaners are English-specific. You'll typically want to use: 7 | 1. "english_cleaners" for English text 8 | 2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using 9 | the Unidecode library (https://pypi.python.org/pypi/Unidecode) 10 | 3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update 11 | the symbols in symbols.py to match your data). 12 | """ 13 | 14 | import logging 15 | import re 16 | 17 | import phonemizer 18 | import piper_phonemize 19 | from unidecode import unidecode 20 | 21 | # To avoid excessive logging, we set the log level of the phonemizer package to CRITICAL 22 | critical_logger = logging.getLogger("phonemizer") 23 | critical_logger.setLevel(logging.CRITICAL) 24 | 25 | # Initializing the phonemizer globally avoids re-initializing it on every call, 26 | # which significantly speeds up repeated cleaning. 27 | # It might be less flexible, but it is much, much faster. 28 | global_phonemizer = phonemizer.backend.EspeakBackend( 29 | language="en-us", 30 | preserve_punctuation=True, 31 | with_stress=True, 32 | language_switch="remove-flags", 33 | logger=critical_logger, 34 | ) 35 | 36 | 37 | # Regular expression matching whitespace: 38 | _whitespace_re = re.compile(r"\s+") 39 | 40 | # List of (regular expression, replacement) pairs for abbreviations: 41 | _abbreviations = [ 42 | (re.compile("\\b%s\\.
% x[0], re.IGNORECASE), x[1]) 43 | for x in [ 44 | ("mrs", "misess"), 45 | ("mr", "mister"), 46 | ("dr", "doctor"), 47 | ("st", "saint"), 48 | ("co", "company"), 49 | ("jr", "junior"), 50 | ("maj", "major"), 51 | ("gen", "general"), 52 | ("drs", "doctors"), 53 | ("rev", "reverend"), 54 | ("lt", "lieutenant"), 55 | ("hon", "honorable"), 56 | ("sgt", "sergeant"), 57 | ("capt", "captain"), 58 | ("esq", "esquire"), 59 | ("ltd", "limited"), 60 | ("col", "colonel"), 61 | ("ft", "fort"), 62 | ] 63 | ] 64 | 65 | 66 | def expand_abbreviations(text): 67 | for regex, replacement in _abbreviations: 68 | text = re.sub(regex, replacement, text) 69 | return text 70 | 71 | 72 | def lowercase(text): 73 | return text.lower() 74 | 75 | 76 | def collapse_whitespace(text): 77 | return re.sub(_whitespace_re, " ", text) 78 | 79 | 80 | def convert_to_ascii(text): 81 | return unidecode(text) 82 | 83 | 84 | def basic_cleaners(text): 85 | """Basic pipeline that lowercases and collapses whitespace without transliteration.""" 86 | text = lowercase(text) 87 | text = collapse_whitespace(text) 88 | return text 89 | 90 | 91 | def transliteration_cleaners(text): 92 | """Pipeline for non-English text that transliterates to ASCII.""" 93 | text = convert_to_ascii(text) 94 | text = lowercase(text) 95 | text = collapse_whitespace(text) 96 | return text 97 | 98 | 99 | def english_cleaners2(text): 100 | """Pipeline for English text, including abbreviation expansion. + punctuation + stress""" 101 | text = convert_to_ascii(text) 102 | text = lowercase(text) 103 | text = expand_abbreviations(text) 104 | phonemes = global_phonemizer.phonemize([text], strip=True, njobs=1)[0] 105 | phonemes = collapse_whitespace(phonemes) 106 | return phonemes 107 | 108 | 109 | def english_cleaners_piper(text): 110 | """Pipeline for English text, including abbreviation expansion. + punctuation + stress""" 111 | text = convert_to_ascii(text) 112 | text = lowercase(text) 113 | text = expand_abbreviations(text) 114 | phonemes = "".join(piper_phonemize.phonemize_espeak(text=text, voice="en-US")[0]) 115 | phonemes = collapse_whitespace(phonemes) 116 | return phonemes 117 | -------------------------------------------------------------------------------- /cosyvoice/cli/cosyvoice.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu) 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | import os 15 | import torch 16 | from hyperpyyaml import load_hyperpyyaml 17 | from huggingface_hub import snapshot_download 18 | from cosyvoice.cli.frontend import CosyVoiceFrontEnd 19 | from cosyvoice.cli.model import CosyVoiceModel 20 | 21 | class CosyVoice: 22 | 23 | def __init__(self, model_dir): 24 | instruct = True if '-Instruct' in model_dir else False 25 | self.model_dir = model_dir 26 | if not os.path.exists(model_dir): 27 | model_dir = snapshot_download(model_dir) 28 | with open('{}/cosyvoice.yaml'.format(model_dir), 'r') as f: 29 | configs = load_hyperpyyaml(f) 30 | self.frontend = CosyVoiceFrontEnd(configs['get_tokenizer'], 31 | configs['feat_extractor'], 32 | '{}/campplus.onnx'.format(model_dir), 33 | '{}/speech_tokenizer_v1.onnx'.format(model_dir), 34 | '{}/spk2info.pt'.format(model_dir), 35 | instruct, 36 | configs['allowed_special']) 37 | self.model = CosyVoiceModel(configs['llm'], configs['flow'], configs['hift']) 38 | self.model.load('{}/llm.pt'.format(model_dir), 39 | '{}/flow.pt'.format(model_dir), 40 | '{}/hift.pt'.format(model_dir)) 41 | del configs 42 | 43 | def list_avaliable_spks(self): 44 | spks = list(self.frontend.spk2info.keys()) 45 | return spks 46 | 47 | def inference_sft(self, tts_text, spk_id): 48 | tts_speeches = [] 49 | for i in self.frontend.text_normalize(tts_text, split=True): 50 | model_input = self.frontend.frontend_sft(i, spk_id) 51 | model_output = self.model.inference(**model_input) 52 | tts_speeches.append(model_output['tts_speech']) 53 | return {'tts_speech': torch.concat(tts_speeches, dim=1)} 54 | 55 | def inference_zero_shot(self, tts_text, prompt_text, prompt_speech_16k): 56 | prompt_text = self.frontend.text_normalize(prompt_text, split=False) 57 | tts_speeches = [] 58 | for i in self.frontend.text_normalize(tts_text, split=True): 59 | model_input = self.frontend.frontend_zero_shot(i, prompt_text, prompt_speech_16k) 60 | model_output = self.model.inference(**model_input) 61 | tts_speeches.append(model_output['tts_speech']) 62 | return {'tts_speech': torch.concat(tts_speeches, dim=1)} 63 | 64 | def inference_cross_lingual(self, tts_text, prompt_speech_16k): 65 | if self.frontend.instruct is True: 66 | raise ValueError('{} do not support cross_lingual inference'.format(self.model_dir)) 67 | tts_speeches = [] 68 | for i in self.frontend.text_normalize(tts_text, split=True): 69 | model_input = self.frontend.frontend_cross_lingual(i, prompt_speech_16k) 70 | model_output = self.model.inference(**model_input) 71 | tts_speeches.append(model_output['tts_speech']) 72 | return {'tts_speech': torch.concat(tts_speeches, dim=1)} 73 | 74 | def inference_instruct(self, tts_text, spk_id, instruct_text): 75 | if self.frontend.instruct is False: 76 | raise ValueError('{} do not support instruct inference'.format(self.model_dir)) 77 | instruct_text = self.frontend.text_normalize(instruct_text, split=False) 78 | tts_speeches = [] 79 | for i in self.frontend.text_normalize(tts_text, split=True): 80 | model_input = self.frontend.frontend_instruct(i, spk_id, instruct_text) 81 | model_output = self.model.inference(**model_input) 82 | tts_speeches.append(model_output['tts_speech']) 83 | return {'tts_speech': torch.concat(tts_speeches, dim=1)} 84 | -------------------------------------------------------------------------------- /cosyvoice/utils/frontend_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Zhihao Du) 2 | # 3 | # Licensed under the Apache License, 
Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import re 16 | chinese_char_pattern = re.compile(r'[\u4e00-\u9fff]+') 17 | 18 | # whether contain chinese character 19 | def contains_chinese(text): 20 | return bool(chinese_char_pattern.search(text)) 21 | 22 | 23 | # replace special symbol 24 | def replace_corner_mark(text): 25 | text = text.replace('²', '平方') 26 | text = text.replace('³', '立方') 27 | return text 28 | 29 | 30 | # remove meaningless symbol 31 | def remove_bracket(text): 32 | text = text.replace('(', '').replace(')', '') 33 | text = text.replace('【', '').replace('】', '') 34 | text = text.replace('`', '').replace('`', '') 35 | text = text.replace("——", " ") 36 | return text 37 | 38 | 39 | # spell Arabic numerals 40 | def spell_out_number(text: str, inflect_parser): 41 | new_text = [] 42 | st = None 43 | for i, c in enumerate(text): 44 | if not c.isdigit(): 45 | if st is not None: 46 | num_str = inflect_parser.number_to_words(text[st: i]) 47 | new_text.append(num_str) 48 | st = None 49 | new_text.append(c) 50 | else: 51 | if st is None: 52 | st = i 53 | if st is not None and st < len(text): 54 | num_str = inflect_parser.number_to_words(text[st:]) 55 | new_text.append(num_str) 56 | return ''.join(new_text) 57 | 58 | 59 | # split paragrah logic: 60 | # 1. per sentence max len token_max_n, min len token_min_n, merge if last sentence len less than merge_len 61 | # 2. cal sentence len according to lang 62 | # 3. 
split sentence according to puncatation 63 | def split_paragraph(text: str, tokenize, lang="zh", token_max_n=80, token_min_n=60, merge_len=20, comma_split=False): 64 | def calc_utt_length(_text: str): 65 | if lang == "zh": 66 | return len(_text) 67 | else: 68 | return len(tokenize(_text)) 69 | 70 | def should_merge(_text: str): 71 | if lang == "zh": 72 | return len(_text) < merge_len 73 | else: 74 | return len(tokenize(_text)) < merge_len 75 | 76 | if lang == "zh": 77 | pounc = ['。', '?', '!', ';', ':', '、', '.', '?', '!', ';'] 78 | else: 79 | pounc = ['.', '?', '!', ';', ':'] 80 | if comma_split: 81 | pounc.extend([',', ',']) 82 | st = 0 83 | utts = [] 84 | for i, c in enumerate(text): 85 | if c in pounc: 86 | if len(text[st: i]) > 0: 87 | utts.append(text[st: i] + c) 88 | if i + 1 < len(text) and text[i + 1] in ['"', '”']: 89 | tmp = utts.pop(-1) 90 | utts.append(tmp + text[i + 1]) 91 | st = i + 2 92 | else: 93 | st = i + 1 94 | if len(utts) == 0: 95 | if lang == "zh": 96 | utts.append(text + '。') 97 | else: 98 | utts.append(text + '.') 99 | final_utts = [] 100 | cur_utt = "" 101 | for utt in utts: 102 | if calc_utt_length(cur_utt + utt) > token_max_n and calc_utt_length(cur_utt) > token_min_n: 103 | final_utts.append(cur_utt) 104 | cur_utt = "" 105 | cur_utt = cur_utt + utt 106 | if len(cur_utt) > 0: 107 | if should_merge(cur_utt) and len(final_utts) != 0: 108 | final_utts[-1] = final_utts[-1] + cur_utt 109 | else: 110 | final_utts.append(cur_utt) 111 | 112 | return final_utts 113 | 114 | 115 | # remove blank between chinese character 116 | def replace_blank(text: str): 117 | out_str = [] 118 | for i, c in enumerate(text): 119 | if c == " ": 120 | if ((text[i + 1].isascii() and text[i + 1] != " ") and 121 | (text[i - 1].isascii() and text[i - 1] != " ")): 122 | out_str.append(c) 123 | else: 124 | out_str.append(c) 125 | return "".join(out_str) 126 | -------------------------------------------------------------------------------- /cosyvoice/transformer/positionwise_feed_forward.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019 Shigeki Karita 2 | # 2020 Mobvoi Inc (Binbin Zhang) 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Positionwise feed forward layer definition.""" 16 | 17 | import torch 18 | 19 | 20 | class PositionwiseFeedForward(torch.nn.Module): 21 | """Positionwise feed forward layer. 22 | 23 | FeedForward are appied on each position of the sequence. 24 | The output dim is same with the input dim. 25 | 26 | Args: 27 | idim (int): Input dimenstion. 28 | hidden_units (int): The number of hidden units. 29 | dropout_rate (float): Dropout rate. 
30 | activation (torch.nn.Module): Activation function 31 | """ 32 | 33 | def __init__( 34 | self, 35 | idim: int, 36 | hidden_units: int, 37 | dropout_rate: float, 38 | activation: torch.nn.Module = torch.nn.ReLU(), 39 | ): 40 | """Construct a PositionwiseFeedForward object.""" 41 | super(PositionwiseFeedForward, self).__init__() 42 | self.w_1 = torch.nn.Linear(idim, hidden_units) 43 | self.activation = activation 44 | self.dropout = torch.nn.Dropout(dropout_rate) 45 | self.w_2 = torch.nn.Linear(hidden_units, idim) 46 | 47 | def forward(self, xs: torch.Tensor) -> torch.Tensor: 48 | """Forward function. 49 | 50 | Args: 51 | xs: input tensor (B, L, D) 52 | Returns: 53 | output tensor, (B, L, D) 54 | """ 55 | return self.w_2(self.dropout(self.activation(self.w_1(xs)))) 56 | 57 | 58 | class MoEFFNLayer(torch.nn.Module): 59 | """ 60 | Mixture-of-experts positionwise feed forward layer 61 | See also figure 1 in https://arxiv.org/pdf/2305.15663.pdf 62 | The output dim is the same as the input dim. 63 | 64 | Modified from https://github.com/Lightning-AI/lit-gpt/pull/823 65 | https://github.com/mistralai/mistral-src/blob/b46d6/moe_one_file_ref.py#L203-L219 66 | Args: 67 | n_expert: number of experts. 68 | n_expert_per_token: The actual number of experts used for each frame 69 | idim (int): Input dimension. 70 | hidden_units (int): The number of hidden units. 71 | dropout_rate (float): Dropout rate. 72 | activation (torch.nn.Module): Activation function 73 | """ 74 | 75 | def __init__( 76 | self, 77 | n_expert: int, 78 | n_expert_per_token: int, 79 | idim: int, 80 | hidden_units: int, 81 | dropout_rate: float, 82 | activation: torch.nn.Module = torch.nn.ReLU(), 83 | ): 84 | super(MoEFFNLayer, self).__init__() 85 | self.gate = torch.nn.Linear(idim, n_expert, bias=False) 86 | self.experts = torch.nn.ModuleList( 87 | PositionwiseFeedForward(idim, hidden_units, dropout_rate, 88 | activation) for _ in range(n_expert)) 89 | self.n_expert_per_token = n_expert_per_token 90 | 91 | def forward(self, xs: torch.Tensor) -> torch.Tensor: 92 | """Forward function.
93 | Args: 94 | xs: input tensor (B, L, D) 95 | Returns: 96 | output tensor, (B, L, D) 97 | 98 | """ 99 | B, L, D = xs.size( 100 | ) # batch size, sequence length, embedding dimension (idim) 101 | xs = xs.view(-1, D) # (B*L, D) 102 | router = self.gate(xs) # (B*L, n_expert) 103 | logits, indices = torch.topk( 104 | router, self.n_expert_per_token 105 | ) # probs:(B*L, n_expert), indices: (B*L, n_expert) 106 | weights = torch.nn.functional.softmax( 107 | logits, dim=1, 108 | dtype=torch.float).to(dtype=xs.dtype) # (B*L, n_expert_per_token) 109 | output = torch.zeros_like(xs) # (B*L, D) 110 | for i, expert in enumerate(self.experts): 111 | mask = indices == i 112 | batch_idx, ith_expert = torch.where(mask) 113 | output[batch_idx] += weights[batch_idx, ith_expert, None] * expert( 114 | xs[batch_idx]) 115 | return output.view(B, L, D) 116 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/matcha/train.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Dict, List, Optional, Tuple 2 | 3 | import hydra 4 | import lightning as L 5 | import rootutils 6 | from lightning import Callback, LightningDataModule, LightningModule, Trainer 7 | from lightning.pytorch.loggers import Logger 8 | from omegaconf import DictConfig 9 | 10 | from matcha import utils 11 | 12 | rootutils.setup_root(__file__, indicator=".project-root", pythonpath=True) 13 | # ------------------------------------------------------------------------------------ # 14 | # the setup_root above is equivalent to: 15 | # - adding project root dir to PYTHONPATH 16 | # (so you don't need to force user to install project as a package) 17 | # (necessary before importing any local modules e.g. `from src import utils`) 18 | # - setting up PROJECT_ROOT environment variable 19 | # (which is used as a base for paths in "configs/paths/default.yaml") 20 | # (this way all filepaths are the same no matter where you run the code) 21 | # - loading environment variables from ".env" in root dir 22 | # 23 | # you can remove it if you: 24 | # 1. either install project as a package or move entry files to project root dir 25 | # 2. set `root_dir` to "." in "configs/paths/default.yaml" 26 | # 27 | # more info: https://github.com/ashleve/rootutils 28 | # ------------------------------------------------------------------------------------ # 29 | 30 | 31 | log = utils.get_pylogger(__name__) 32 | 33 | 34 | @utils.task_wrapper 35 | def train(cfg: DictConfig) -> Tuple[Dict[str, Any], Dict[str, Any]]: 36 | """Trains the model. Can additionally evaluate on a testset, using best weights obtained during 37 | training. 38 | 39 | This method is wrapped in optional @task_wrapper decorator, that controls the behavior during 40 | failure. Useful for multiruns, saving info about the crash, etc. 41 | 42 | :param cfg: A DictConfig configuration composed by Hydra. 43 | :return: A tuple with metrics and dict with all instantiated objects. 
44 | """ 45 | # set seed for random number generators in pytorch, numpy and python.random 46 | if cfg.get("seed"): 47 | L.seed_everything(cfg.seed, workers=True) 48 | 49 | log.info(f"Instantiating datamodule <{cfg.data._target_}>") # pylint: disable=protected-access 50 | datamodule: LightningDataModule = hydra.utils.instantiate(cfg.data) 51 | 52 | log.info(f"Instantiating model <{cfg.model._target_}>") # pylint: disable=protected-access 53 | model: LightningModule = hydra.utils.instantiate(cfg.model) 54 | 55 | log.info("Instantiating callbacks...") 56 | callbacks: List[Callback] = utils.instantiate_callbacks(cfg.get("callbacks")) 57 | 58 | log.info("Instantiating loggers...") 59 | logger: List[Logger] = utils.instantiate_loggers(cfg.get("logger")) 60 | 61 | log.info(f"Instantiating trainer <{cfg.trainer._target_}>") # pylint: disable=protected-access 62 | trainer: Trainer = hydra.utils.instantiate(cfg.trainer, callbacks=callbacks, logger=logger) 63 | 64 | object_dict = { 65 | "cfg": cfg, 66 | "datamodule": datamodule, 67 | "model": model, 68 | "callbacks": callbacks, 69 | "logger": logger, 70 | "trainer": trainer, 71 | } 72 | 73 | if logger: 74 | log.info("Logging hyperparameters!") 75 | utils.log_hyperparameters(object_dict) 76 | 77 | if cfg.get("train"): 78 | log.info("Starting training!") 79 | trainer.fit(model=model, datamodule=datamodule, ckpt_path=cfg.get("ckpt_path")) 80 | 81 | train_metrics = trainer.callback_metrics 82 | 83 | if cfg.get("test"): 84 | log.info("Starting testing!") 85 | ckpt_path = trainer.checkpoint_callback.best_model_path 86 | if ckpt_path == "": 87 | log.warning("Best ckpt not found! Using current weights for testing...") 88 | ckpt_path = None 89 | trainer.test(model=model, datamodule=datamodule, ckpt_path=ckpt_path) 90 | log.info(f"Best ckpt path: {ckpt_path}") 91 | 92 | test_metrics = trainer.callback_metrics 93 | 94 | # merge train and test metrics 95 | metric_dict = {**train_metrics, **test_metrics} 96 | 97 | return metric_dict, object_dict 98 | 99 | 100 | @hydra.main(version_base="1.3", config_path="../configs", config_name="train.yaml") 101 | def main(cfg: DictConfig) -> Optional[float]: 102 | """Main entry point for training. 103 | 104 | :param cfg: DictConfig configuration composed by Hydra. 105 | :return: Optional[float] with optimized metric value. 106 | """ 107 | # apply extra utilities 108 | # (e.g. ask for tags if none are provided in cfg, print cfg tree, etc.) 
109 | utils.extras(cfg) 110 | 111 | # train the model 112 | metric_dict, _ = train(cfg) 113 | 114 | # safely retrieve metric value for hydra-based hyperparameter optimization 115 | metric_value = utils.get_metric_value(metric_dict=metric_dict, metric_name=cfg.get("optimized_metric")) 116 | 117 | # return optimized metric 118 | return metric_value 119 | 120 | 121 | if __name__ == "__main__": 122 | main() # pylint: disable=no-value-for-parameter 123 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/matcha/models/components/flow_matching.py: -------------------------------------------------------------------------------- 1 | from abc import ABC 2 | 3 | import torch 4 | import torch.nn.functional as F 5 | 6 | from matcha.models.components.decoder import Decoder 7 | from matcha.utils.pylogger import get_pylogger 8 | 9 | log = get_pylogger(__name__) 10 | 11 | 12 | class BASECFM(torch.nn.Module, ABC): 13 | def __init__( 14 | self, 15 | n_feats, 16 | cfm_params, 17 | n_spks=1, 18 | spk_emb_dim=128, 19 | ): 20 | super().__init__() 21 | self.n_feats = n_feats 22 | self.n_spks = n_spks 23 | self.spk_emb_dim = spk_emb_dim 24 | self.solver = cfm_params.solver 25 | if hasattr(cfm_params, "sigma_min"): 26 | self.sigma_min = cfm_params.sigma_min 27 | else: 28 | self.sigma_min = 1e-4 29 | 30 | self.estimator = None 31 | 32 | @torch.inference_mode() 33 | def forward(self, mu, mask, n_timesteps, temperature=1.0, spks=None, cond=None): 34 | """Forward diffusion 35 | 36 | Args: 37 | mu (torch.Tensor): output of encoder 38 | shape: (batch_size, n_feats, mel_timesteps) 39 | mask (torch.Tensor): output_mask 40 | shape: (batch_size, 1, mel_timesteps) 41 | n_timesteps (int): number of diffusion steps 42 | temperature (float, optional): temperature for scaling noise. Defaults to 1.0. 43 | spks (torch.Tensor, optional): speaker ids. Defaults to None. 44 | shape: (batch_size, spk_emb_dim) 45 | cond: Not used but kept for future purposes 46 | 47 | Returns: 48 | sample: generated mel-spectrogram 49 | shape: (batch_size, n_feats, mel_timesteps) 50 | """ 51 | z = torch.randn_like(mu) * temperature 52 | t_span = torch.linspace(0, 1, n_timesteps + 1, device=mu.device) 53 | return self.solve_euler(z, t_span=t_span, mu=mu, mask=mask, spks=spks, cond=cond) 54 | 55 | def solve_euler(self, x, t_span, mu, mask, spks, cond): 56 | """ 57 | Fixed euler solver for ODEs. 58 | Args: 59 | x (torch.Tensor): random noise 60 | t_span (torch.Tensor): n_timesteps interpolated 61 | shape: (n_timesteps + 1,) 62 | mu (torch.Tensor): output of encoder 63 | shape: (batch_size, n_feats, mel_timesteps) 64 | mask (torch.Tensor): output_mask 65 | shape: (batch_size, 1, mel_timesteps) 66 | spks (torch.Tensor, optional): speaker ids. Defaults to None. 
67 | shape: (batch_size, spk_emb_dim) 68 | cond: Not used but kept for future purposes 69 | """ 70 | t, _, dt = t_span[0], t_span[-1], t_span[1] - t_span[0] 71 | 72 | # I am storing this because I can later plot it by putting a debugger here and saving it to a file 73 | # Or in future might add like a return_all_steps flag 74 | sol = [] 75 | 76 | for step in range(1, len(t_span)): 77 | dphi_dt = self.estimator(x, mask, mu, t, spks, cond) 78 | 79 | x = x + dt * dphi_dt 80 | t = t + dt 81 | sol.append(x) 82 | if step < len(t_span) - 1: 83 | dt = t_span[step + 1] - t 84 | 85 | return sol[-1] 86 | 87 | def compute_loss(self, x1, mask, mu, spks=None, cond=None): 88 | """Computes diffusion loss 89 | 90 | Args: 91 | x1 (torch.Tensor): Target 92 | shape: (batch_size, n_feats, mel_timesteps) 93 | mask (torch.Tensor): target mask 94 | shape: (batch_size, 1, mel_timesteps) 95 | mu (torch.Tensor): output of encoder 96 | shape: (batch_size, n_feats, mel_timesteps) 97 | spks (torch.Tensor, optional): speaker embedding. Defaults to None. 98 | shape: (batch_size, spk_emb_dim) 99 | 100 | Returns: 101 | loss: conditional flow matching loss 102 | y: conditional flow 103 | shape: (batch_size, n_feats, mel_timesteps) 104 | """ 105 | b, _, t = mu.shape 106 | 107 | # random timestep 108 | t = torch.rand([b, 1, 1], device=mu.device, dtype=mu.dtype) 109 | # sample noise p(x_0) 110 | z = torch.randn_like(x1) 111 | 112 | y = (1 - (1 - self.sigma_min) * t) * z + t * x1 113 | u = x1 - (1 - self.sigma_min) * z 114 | 115 | loss = F.mse_loss(self.estimator(y, mask, mu, t.squeeze(), spks), u, reduction="sum") / ( 116 | torch.sum(mask) * u.shape[1] 117 | ) 118 | return loss, y 119 | 120 | 121 | class CFM(BASECFM): 122 | def __init__(self, in_channels, out_channel, cfm_params, decoder_params, n_spks=1, spk_emb_dim=64): 123 | super().__init__( 124 | n_feats=in_channels, 125 | cfm_params=cfm_params, 126 | n_spks=n_spks, 127 | spk_emb_dim=spk_emb_dim, 128 | ) 129 | 130 | in_channels = in_channels + (spk_emb_dim if n_spks > 1 else 0) 131 | # Just change the architecture of the estimator here 132 | self.estimator = Decoder(in_channels=in_channels, out_channels=out_channel, **decoder_params) 133 | -------------------------------------------------------------------------------- /cosyvoice/transformer/decoder_layer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019 Shigeki Karita 2 | # 2020 Mobvoi Inc (Binbin Zhang) 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Decoder self-attention layer definition.""" 16 | from typing import Optional, Tuple 17 | 18 | import torch 19 | from torch import nn 20 | 21 | 22 | class DecoderLayer(nn.Module): 23 | """Single decoder layer module. 24 | 25 | Args: 26 | size (int): Input dimension. 27 | self_attn (torch.nn.Module): Self-attention module instance. 28 | `MultiHeadedAttention` instance can be used as the argument. 29 | src_attn (torch.nn.Module): Inter-attention module instance. 
30 | `MultiHeadedAttention` instance can be used as the argument. 31 | If `None` is passed, Inter-attention is not used, such as 32 | CIF, GPT, and other decoder only model. 33 | feed_forward (torch.nn.Module): Feed-forward module instance. 34 | `PositionwiseFeedForward` instance can be used as the argument. 35 | dropout_rate (float): Dropout rate. 36 | normalize_before (bool): 37 | True: use layer_norm before each sub-block. 38 | False: to use layer_norm after each sub-block. 39 | """ 40 | 41 | def __init__( 42 | self, 43 | size: int, 44 | self_attn: nn.Module, 45 | src_attn: Optional[nn.Module], 46 | feed_forward: nn.Module, 47 | dropout_rate: float, 48 | normalize_before: bool = True, 49 | ): 50 | """Construct an DecoderLayer object.""" 51 | super().__init__() 52 | self.size = size 53 | self.self_attn = self_attn 54 | self.src_attn = src_attn 55 | self.feed_forward = feed_forward 56 | self.norm1 = nn.LayerNorm(size, eps=1e-5) 57 | self.norm2 = nn.LayerNorm(size, eps=1e-5) 58 | self.norm3 = nn.LayerNorm(size, eps=1e-5) 59 | self.dropout = nn.Dropout(dropout_rate) 60 | self.normalize_before = normalize_before 61 | 62 | def forward( 63 | self, 64 | tgt: torch.Tensor, 65 | tgt_mask: torch.Tensor, 66 | memory: torch.Tensor, 67 | memory_mask: torch.Tensor, 68 | cache: Optional[torch.Tensor] = None 69 | ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: 70 | """Compute decoded features. 71 | 72 | Args: 73 | tgt (torch.Tensor): Input tensor (#batch, maxlen_out, size). 74 | tgt_mask (torch.Tensor): Mask for input tensor 75 | (#batch, maxlen_out). 76 | memory (torch.Tensor): Encoded memory 77 | (#batch, maxlen_in, size). 78 | memory_mask (torch.Tensor): Encoded memory mask 79 | (#batch, maxlen_in). 80 | cache (torch.Tensor): cached tensors. 81 | (#batch, maxlen_out - 1, size). 82 | 83 | Returns: 84 | torch.Tensor: Output tensor (#batch, maxlen_out, size). 85 | torch.Tensor: Mask for output tensor (#batch, maxlen_out). 86 | torch.Tensor: Encoded memory (#batch, maxlen_in, size). 87 | torch.Tensor: Encoded memory mask (#batch, maxlen_in). 
88 | 89 | """ 90 | residual = tgt 91 | if self.normalize_before: 92 | tgt = self.norm1(tgt) 93 | 94 | if cache is None: 95 | tgt_q = tgt 96 | tgt_q_mask = tgt_mask 97 | else: 98 | # compute only the last frame query keeping dim: max_time_out -> 1 99 | assert cache.shape == ( 100 | tgt.shape[0], 101 | tgt.shape[1] - 1, 102 | self.size, 103 | ), "{cache.shape} == {(tgt.shape[0], tgt.shape[1] - 1, self.size)}" 104 | tgt_q = tgt[:, -1:, :] 105 | residual = residual[:, -1:, :] 106 | tgt_q_mask = tgt_mask[:, -1:, :] 107 | 108 | x = residual + self.dropout( 109 | self.self_attn(tgt_q, tgt, tgt, tgt_q_mask)[0]) 110 | if not self.normalize_before: 111 | x = self.norm1(x) 112 | 113 | if self.src_attn is not None: 114 | residual = x 115 | if self.normalize_before: 116 | x = self.norm2(x) 117 | x = residual + self.dropout( 118 | self.src_attn(x, memory, memory, memory_mask)[0]) 119 | if not self.normalize_before: 120 | x = self.norm2(x) 121 | 122 | residual = x 123 | if self.normalize_before: 124 | x = self.norm3(x) 125 | x = residual + self.dropout(self.feed_forward(x)) 126 | if not self.normalize_before: 127 | x = self.norm3(x) 128 | 129 | if cache is not None: 130 | x = torch.cat([cache, x], dim=1) 131 | 132 | return x, tgt_mask, memory, memory_mask 133 | -------------------------------------------------------------------------------- /cosyvoice/utils/executor.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) 2 | # 2024 Alibaba Inc (authors: Xiang Lyu) 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | import logging 17 | from contextlib import nullcontext 18 | import os 19 | 20 | import torch 21 | import torch.distributed as dist 22 | 23 | from cosyvoice.utils.train_utils import update_parameter_and_lr, log_per_step, log_per_save, batch_forward, batch_backward, save_model, cosyvoice_join 24 | 25 | 26 | class Executor: 27 | 28 | def __init__(self): 29 | self.step = 0 30 | self.epoch = 0 31 | self.rank = int(os.environ.get('RANK', 0)) 32 | self.device = torch.device('cuda:{}'.format(self.rank)) 33 | 34 | def train_one_epoc(self, model, optimizer, scheduler, train_data_loader, cv_data_loader, writer, info_dict, group_join): 35 | ''' Train one epoch 36 | ''' 37 | 38 | lr = optimizer.param_groups[0]['lr'] 39 | logging.info('Epoch {} TRAIN info lr {} rank {}'.format(self.epoch, lr, self.rank)) 40 | logging.info('using accumulate grad, new batch size is {} times' 41 | ' larger than before'.format(info_dict['accum_grad'])) 42 | # A context manager to be used in conjunction with an instance of 43 | # torch.nn.parallel.DistributedDataParallel to be able to train 44 | # with uneven inputs across participating processes. 
45 | model.train() 46 | model_context = model.join if info_dict['train_engine'] == 'torch_ddp' else nullcontext 47 | with model_context(): 48 | for batch_idx, batch_dict in enumerate(train_data_loader): 49 | info_dict["tag"] = "TRAIN" 50 | info_dict["step"] = self.step 51 | info_dict["epoch"] = self.epoch 52 | info_dict["batch_idx"] = batch_idx 53 | if cosyvoice_join(group_join, info_dict): 54 | break 55 | 56 | # Disable gradient synchronizations across DDP processes. 57 | # Within this context, gradients will be accumulated on module 58 | # variables, which will later be synchronized. 59 | if info_dict['train_engine'] == 'torch_ddp' and (batch_idx + 1) % info_dict["accum_grad"] != 0: 60 | context = model.no_sync 61 | # Used for single gpu training and DDP gradient synchronization 62 | # processes. 63 | else: 64 | context = nullcontext 65 | 66 | with context(): 67 | info_dict = batch_forward(model, batch_dict, info_dict) 68 | info_dict = batch_backward(model, info_dict) 69 | 70 | info_dict = update_parameter_and_lr(model, optimizer, scheduler, info_dict) 71 | log_per_step(writer, info_dict) 72 | # NOTE specify save_per_step in cosyvoice.yaml if you want to enable step save 73 | if info_dict['save_per_step'] > 0 and (self.step + 1) % info_dict['save_per_step'] == 0 and (batch_idx + 1) % info_dict["accum_grad"] == 0: 74 | dist.barrier() 75 | self.cv(model, cv_data_loader, writer, info_dict, on_batch_end=False) 76 | model.train() 77 | if (batch_idx + 1) % info_dict["accum_grad"] == 0: 78 | self.step += 1 79 | dist.barrier() 80 | self.cv(model, cv_data_loader, writer, info_dict, on_batch_end=True) 81 | 82 | @torch.inference_mode() 83 | def cv(self, model, cv_data_loader, writer, info_dict, on_batch_end=True): 84 | ''' Cross validation on 85 | ''' 86 | logging.info('Epoch {} Step {} on_batch_end {} CV rank {}'.format(self.epoch, self.step + 1, on_batch_end, self.rank)) 87 | model.eval() 88 | total_num_utts, total_loss_dict = 0, {} # avoid division by 0 89 | for batch_idx, batch_dict in enumerate(cv_data_loader): 90 | info_dict["tag"] = "CV" 91 | info_dict["step"] = self.step 92 | info_dict["epoch"] = self.epoch 93 | info_dict["batch_idx"] = batch_idx 94 | 95 | num_utts = len(batch_dict["utts"]) 96 | total_num_utts += num_utts 97 | 98 | info_dict = batch_forward(model, batch_dict, info_dict) 99 | 100 | for k, v in info_dict['loss_dict'].items(): 101 | if k not in total_loss_dict: 102 | total_loss_dict[k] = [] 103 | total_loss_dict[k].append(v.item() * num_utts) 104 | log_per_step(None, info_dict) 105 | for k, v in total_loss_dict.items(): 106 | total_loss_dict[k] = sum(v) / total_num_utts 107 | info_dict['loss_dict'] = total_loss_dict 108 | log_per_save(writer, info_dict) 109 | model_name = 'epoch_{}_whole'.format(self.epoch) if on_batch_end else 'epoch_{}_step_{}'.format(self.epoch, self.step + 1) 110 | save_model(model, model_name, info_dict) 111 | -------------------------------------------------------------------------------- /cosyvoice/bin/train.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu) 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from __future__ import print_function 16 | import argparse 17 | import datetime 18 | import logging 19 | logging.getLogger('matplotlib').setLevel(logging.WARNING) 20 | from copy import deepcopy 21 | import torch 22 | import torch.distributed as dist 23 | import deepspeed 24 | 25 | from hyperpyyaml import load_hyperpyyaml 26 | 27 | from torch.distributed.elastic.multiprocessing.errors import record 28 | 29 | from cosyvoice.utils.executor import Executor 30 | from cosyvoice.utils.train_utils import ( 31 | init_distributed, 32 | init_dataset_and_dataloader, 33 | init_optimizer_and_scheduler, 34 | init_summarywriter, save_model, 35 | wrap_cuda_model, check_modify_and_save_config) 36 | 37 | 38 | def get_args(): 39 | parser = argparse.ArgumentParser(description='training your network') 40 | parser.add_argument('--train_engine', 41 | default='torch_ddp', 42 | choices=['torch_ddp', 'deepspeed'], 43 | help='Engine for paralleled training') 44 | parser.add_argument('--model', required=True, help='model which will be trained') 45 | parser.add_argument('--config', required=True, help='config file') 46 | parser.add_argument('--train_data', required=True, help='train data file') 47 | parser.add_argument('--cv_data', required=True, help='cv data file') 48 | parser.add_argument('--checkpoint', help='checkpoint model') 49 | parser.add_argument('--model_dir', required=True, help='save model dir') 50 | parser.add_argument('--tensorboard_dir', 51 | default='tensorboard', 52 | help='tensorboard log dir') 53 | parser.add_argument('--ddp.dist_backend', 54 | dest='dist_backend', 55 | default='nccl', 56 | choices=['nccl', 'gloo'], 57 | help='distributed backend') 58 | parser.add_argument('--num_workers', 59 | default=0, 60 | type=int, 61 | help='num of subprocess workers for reading') 62 | parser.add_argument('--prefetch', 63 | default=100, 64 | type=int, 65 | help='prefetch number') 66 | parser.add_argument('--pin_memory', 67 | action='store_true', 68 | default=False, 69 | help='Use pinned memory buffers used for reading') 70 | parser.add_argument('--deepspeed.save_states', 71 | dest='save_states', 72 | default='model_only', 73 | choices=['model_only', 'model+optimizer'], 74 | help='save model/optimizer states') 75 | parser.add_argument('--timeout', 76 | default=30, 77 | type=int, 78 | help='timeout (in seconds) of cosyvoice_join.') 79 | parser = deepspeed.add_config_arguments(parser) 80 | args = parser.parse_args() 81 | return args 82 | 83 | 84 | @record 85 | def main(): 86 | args = get_args() 87 | logging.basicConfig(level=logging.DEBUG, 88 | format='%(asctime)s %(levelname)s %(message)s') 89 | 90 | override_dict = {k: None for k in ['llm', 'flow', 'hift'] if k != args.model} 91 | with open(args.config, 'r') as f: 92 | configs = load_hyperpyyaml(f, overrides=override_dict) 93 | configs['train_conf'].update(vars(args)) 94 | 95 | # Init env for ddp 96 | init_distributed(args) 97 | 98 | # Get dataset & dataloader 99 | train_dataset, cv_dataset, train_data_loader, cv_data_loader = \ 100 | init_dataset_and_dataloader(args, configs) 101 | 102 | # Do some sanity checks 
and save config to args.model_dir 103 | configs = check_modify_and_save_config(args, configs) 104 | 105 | # Tensorboard summary 106 | writer = init_summarywriter(args) 107 | 108 | # load checkpoint 109 | model = configs[args.model] 110 | if args.checkpoint is not None: 111 | model.load_state_dict(torch.load(args.checkpoint, map_location='cpu')) 112 | 113 | # Dispatch model from cpu to gpu 114 | model = wrap_cuda_model(args, model) 115 | 116 | # Get optimizer & scheduler 117 | model, optimizer, scheduler = init_optimizer_and_scheduler(args, configs, model) 118 | 119 | # Save init checkpoints 120 | info_dict = deepcopy(configs['train_conf']) 121 | save_model(model, 'init', info_dict) 122 | 123 | # Get executor 124 | executor = Executor() 125 | 126 | # Start training loop 127 | for epoch in range(info_dict['max_epoch']): 128 | executor.epoch = epoch 129 | train_dataset.set_epoch(epoch) 130 | dist.barrier() 131 | group_join = dist.new_group(backend="gloo", timeout=datetime.timedelta(seconds=args.timeout)) 132 | executor.train_one_epoc(model, optimizer, scheduler, train_data_loader, cv_data_loader, writer, info_dict, group_join) 133 | dist.destroy_process_group(group_join) 134 | 135 | if __name__ == '__main__': 136 | main() 137 | -------------------------------------------------------------------------------- /cosyvoice/bin/inference.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu) 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License.
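# Example invocation (a sketch only; every path below is a placeholder rather than a shipped
# default -- substitute your own config, parquet data lists, tts_text json and checkpoints):
#
#     python cosyvoice/bin/inference.py --mode sft --gpu 0 \
#         --config conf/cosyvoice.yaml \
#         --prompt_data data/test/parquet/data.list \
#         --prompt_utt2data data/test/parquet/utt2data.list \
#         --tts_text data/test/tts_text.json \
#         --llm_model exp/llm.pt --flow_model exp/flow.pt --hifigan_model exp/hift.pt \
#         --result_dir results/sft
#
# One wav per (utt, tts_index) pair is written to --result_dir, together with a Kaldi-style
# wav.scp that lists them.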
14 | 15 | from __future__ import print_function 16 | 17 | import argparse 18 | import logging 19 | logging.getLogger('matplotlib').setLevel(logging.WARNING) 20 | import os 21 | 22 | import torch 23 | from torch.utils.data import DataLoader 24 | import torchaudio 25 | from hyperpyyaml import load_hyperpyyaml 26 | from tqdm import tqdm 27 | from cosyvoice.cli.model import CosyVoiceModel 28 | 29 | from cosyvoice.dataset.dataset import Dataset 30 | 31 | def get_args(): 32 | parser = argparse.ArgumentParser(description='inference with your model') 33 | parser.add_argument('--config', required=True, help='config file') 34 | parser.add_argument('--prompt_data', required=True, help='prompt data file') 35 | parser.add_argument('--prompt_utt2data', required=True, help='prompt data file') 36 | parser.add_argument('--tts_text', required=True, help='tts input file') 37 | parser.add_argument('--llm_model', required=True, help='llm model file') 38 | parser.add_argument('--flow_model', required=True, help='flow model file') 39 | parser.add_argument('--hifigan_model', required=True, help='hifigan model file') 40 | parser.add_argument('--gpu', 41 | type=int, 42 | default=-1, 43 | help='gpu id for this rank, -1 for cpu') 44 | parser.add_argument('--mode', 45 | default='sft', 46 | choices=['sft', 'zero_shot'], 47 | help='inference mode') 48 | parser.add_argument('--result_dir', required=True, help='asr result file') 49 | args = parser.parse_args() 50 | print(args) 51 | return args 52 | 53 | 54 | def main(): 55 | args = get_args() 56 | logging.basicConfig(level=logging.DEBUG, 57 | format='%(asctime)s %(levelname)s %(message)s') 58 | os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu) 59 | 60 | # Init cosyvoice models from configs 61 | use_cuda = args.gpu >= 0 and torch.cuda.is_available() 62 | device = torch.device('cuda' if use_cuda else 'cpu') 63 | with open(args.config, 'r') as f: 64 | configs = load_hyperpyyaml(f) 65 | 66 | model = CosyVoiceModel(configs['llm'], configs['flow'], configs['hift']) 67 | model.load(args.llm_model, args.flow_model, args.hifigan_model) 68 | 69 | test_dataset = Dataset(args.prompt_data, data_pipeline=configs['data_pipeline'], mode='inference', shuffle=False, partition=False, tts_file=args.tts_text, prompt_utt2data=args.prompt_utt2data) 70 | test_data_loader = DataLoader(test_dataset, batch_size=None, num_workers=0) 71 | 72 | del configs 73 | os.makedirs(args.result_dir, exist_ok=True) 74 | fn = os.path.join(args.result_dir, 'wav.scp') 75 | f = open(fn, 'w') 76 | with torch.no_grad(): 77 | for batch_idx, batch in tqdm(enumerate(test_data_loader)): 78 | utts = batch["utts"] 79 | assert len(utts) == 1, "inference mode only support batchsize 1" 80 | text = batch["text"] 81 | text_token = batch["text_token"].to(device) 82 | text_token_len = batch["text_token_len"].to(device) 83 | tts_text = batch["tts_text"] 84 | tts_index = batch["tts_index"] 85 | tts_text_token = batch["tts_text_token"].to(device) 86 | tts_text_token_len = batch["tts_text_token_len"].to(device) 87 | speech_token = batch["speech_token"].to(device) 88 | speech_token_len = batch["speech_token_len"].to(device) 89 | speech_feat = batch["speech_feat"].to(device) 90 | speech_feat_len = batch["speech_feat_len"].to(device) 91 | utt_embedding = batch["utt_embedding"].to(device) 92 | spk_embedding = batch["spk_embedding"].to(device) 93 | if args.mode == 'sft': 94 | model_input = {'text': tts_text_token, 'text_len': tts_text_token_len, 95 | 'llm_embedding': spk_embedding, 'flow_embedding': spk_embedding} 96 | else: 97 | 
model_input = {'text': tts_text_token, 'text_len': tts_text_token_len, 98 | 'prompt_text': text_token, 'prompt_text_len': text_token_len, 99 | 'llm_prompt_speech_token': speech_token, 'llm_prompt_speech_token_len': speech_token_len, 100 | 'flow_prompt_speech_token': speech_token, 'flow_prompt_speech_token_len': speech_token_len, 101 | 'prompt_speech_feat': speech_feat, 'prompt_speech_feat_len': speech_feat_len, 102 | 'llm_embedding': utt_embedding, 'flow_embedding': utt_embedding} 103 | model_output = model.inference(**model_input) 104 | tts_key = '{}_{}'.format(utts[0], tts_index[0]) 105 | tts_fn = os.path.join(args.result_dir, '{}.wav'.format(tts_key)) 106 | torchaudio.save(tts_fn, model_output['tts_speech'], sample_rate=22050) 107 | f.write('{} {}\n'.format(tts_key, tts_fn)) 108 | f.flush() 109 | f.close() 110 | logging.info('Result wav.scp saved in {}'.format(fn)) 111 | 112 | 113 | if __name__ == '__main__': 114 | main() 115 | -------------------------------------------------------------------------------- /cosyvoice/transformer/convolution.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) 2 | # 2024 Alibaba Inc (Xiang Lyu) 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # Modified from ESPnet(https://github.com/espnet/espnet) 16 | """ConvolutionModule definition.""" 17 | 18 | from typing import Tuple 19 | 20 | import torch 21 | from torch import nn 22 | 23 | 24 | class ConvolutionModule(nn.Module): 25 | """ConvolutionModule in Conformer model.""" 26 | 27 | def __init__(self, 28 | channels: int, 29 | kernel_size: int = 15, 30 | activation: nn.Module = nn.ReLU(), 31 | norm: str = "batch_norm", 32 | causal: bool = False, 33 | bias: bool = True): 34 | """Construct an ConvolutionModule object. 35 | Args: 36 | channels (int): The number of channels of conv layers. 37 | kernel_size (int): Kernel size of conv layers. 38 | causal (int): Whether use causal convolution or not 39 | """ 40 | super().__init__() 41 | 42 | self.pointwise_conv1 = nn.Conv1d( 43 | channels, 44 | 2 * channels, 45 | kernel_size=1, 46 | stride=1, 47 | padding=0, 48 | bias=bias, 49 | ) 50 | # self.lorder is used to distinguish if it's a causal convolution, 51 | # if self.lorder > 0: it's a causal convolution, the input will be 52 | # padded with self.lorder frames on the left in forward. 
53 | # else: it's a symmetrical convolution 54 | if causal: 55 | padding = 0 56 | self.lorder = kernel_size - 1 57 | else: 58 | # kernel_size should be an odd number for none causal convolution 59 | assert (kernel_size - 1) % 2 == 0 60 | padding = (kernel_size - 1) // 2 61 | self.lorder = 0 62 | self.depthwise_conv = nn.Conv1d( 63 | channels, 64 | channels, 65 | kernel_size, 66 | stride=1, 67 | padding=padding, 68 | groups=channels, 69 | bias=bias, 70 | ) 71 | 72 | assert norm in ['batch_norm', 'layer_norm'] 73 | if norm == "batch_norm": 74 | self.use_layer_norm = False 75 | self.norm = nn.BatchNorm1d(channels) 76 | else: 77 | self.use_layer_norm = True 78 | self.norm = nn.LayerNorm(channels) 79 | 80 | self.pointwise_conv2 = nn.Conv1d( 81 | channels, 82 | channels, 83 | kernel_size=1, 84 | stride=1, 85 | padding=0, 86 | bias=bias, 87 | ) 88 | self.activation = activation 89 | 90 | def forward( 91 | self, 92 | x: torch.Tensor, 93 | mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), 94 | cache: torch.Tensor = torch.zeros((0, 0, 0)), 95 | ) -> Tuple[torch.Tensor, torch.Tensor]: 96 | """Compute convolution module. 97 | Args: 98 | x (torch.Tensor): Input tensor (#batch, time, channels). 99 | mask_pad (torch.Tensor): used for batch padding (#batch, 1, time), 100 | (0, 0, 0) means fake mask. 101 | cache (torch.Tensor): left context cache, it is only 102 | used in causal convolution (#batch, channels, cache_t), 103 | (0, 0, 0) meas fake cache. 104 | Returns: 105 | torch.Tensor: Output tensor (#batch, time, channels). 106 | """ 107 | # exchange the temporal dimension and the feature dimension 108 | x = x.transpose(1, 2) # (#batch, channels, time) 109 | 110 | # mask batch padding 111 | if mask_pad.size(2) > 0: # time > 0 112 | x.masked_fill_(~mask_pad, 0.0) 113 | 114 | if self.lorder > 0: 115 | if cache.size(2) == 0: # cache_t == 0 116 | x = nn.functional.pad(x, (self.lorder, 0), 'constant', 0.0) 117 | else: 118 | assert cache.size(0) == x.size(0) # equal batch 119 | assert cache.size(1) == x.size(1) # equal channel 120 | x = torch.cat((cache, x), dim=2) 121 | assert (x.size(2) > self.lorder) 122 | new_cache = x[:, :, -self.lorder:] 123 | else: 124 | # It's better we just return None if no cache is required, 125 | # However, for JIT export, here we just fake one tensor instead of 126 | # None. 127 | new_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device) 128 | 129 | # GLU mechanism 130 | x = self.pointwise_conv1(x) # (batch, 2*channel, dim) 131 | x = nn.functional.glu(x, dim=1) # (batch, channel, dim) 132 | 133 | # 1D Depthwise Conv 134 | x = self.depthwise_conv(x) 135 | if self.use_layer_norm: 136 | x = x.transpose(1, 2) 137 | x = self.activation(self.norm(x)) 138 | if self.use_layer_norm: 139 | x = x.transpose(1, 2) 140 | x = self.pointwise_conv2(x) 141 | # mask batch padding 142 | if mask_pad.size(2) > 0: # time > 0 143 | x.masked_fill_(~mask_pad, 0.0) 144 | 145 | return x.transpose(1, 2), new_cache 146 | -------------------------------------------------------------------------------- /cosyvoice/dataset/dataset.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang) 2 | # 2024 Alibaba Inc (authors: Xiang Lyu) 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | import random 17 | import json 18 | import math 19 | from functools import partial 20 | 21 | import torch 22 | import torch.distributed as dist 23 | from torch.utils.data import IterableDataset 24 | from cosyvoice.utils.file_utils import read_lists, read_json_lists 25 | 26 | 27 | class Processor(IterableDataset): 28 | 29 | def __init__(self, source, f, *args, **kw): 30 | assert callable(f) 31 | self.source = source 32 | self.f = f 33 | self.args = args 34 | self.kw = kw 35 | 36 | def set_epoch(self, epoch): 37 | self.source.set_epoch(epoch) 38 | 39 | def __iter__(self): 40 | """ Return an iterator over the source dataset processed by the 41 | given processor. 42 | """ 43 | assert self.source is not None 44 | assert callable(self.f) 45 | return self.f(iter(self.source), *self.args, **self.kw) 46 | 47 | def apply(self, f): 48 | assert callable(f) 49 | return Processor(self, f, *self.args, **self.kw) 50 | 51 | 52 | class DistributedSampler: 53 | 54 | def __init__(self, shuffle=True, partition=True): 55 | self.epoch = -1 56 | self.update() 57 | self.shuffle = shuffle 58 | self.partition = partition 59 | 60 | def update(self): 61 | assert dist.is_available() 62 | if dist.is_initialized(): 63 | self.rank = dist.get_rank() 64 | self.world_size = dist.get_world_size() 65 | else: 66 | self.rank = 0 67 | self.world_size = 1 68 | worker_info = torch.utils.data.get_worker_info() 69 | if worker_info is None: 70 | self.worker_id = 0 71 | self.num_workers = 1 72 | else: 73 | self.worker_id = worker_info.id 74 | self.num_workers = worker_info.num_workers 75 | return dict(rank=self.rank, 76 | world_size=self.world_size, 77 | worker_id=self.worker_id, 78 | num_workers=self.num_workers) 79 | 80 | def set_epoch(self, epoch): 81 | self.epoch = epoch 82 | 83 | def sample(self, data): 84 | """ Sample data according to rank/world_size/num_workers 85 | 86 | Args: 87 | data(List): input data list 88 | 89 | Returns: 90 | List: data list after sample 91 | """ 92 | data = list(range(len(data))) 93 | # force datalist even 94 | if self.partition: 95 | if self.shuffle: 96 | random.Random(self.epoch).shuffle(data) 97 | if len(data) < self.world_size: 98 | data = data * math.ceil(self.world_size / len(data)) 99 | data = data[:self.world_size] 100 | data = data[self.rank::self.world_size] 101 | if len(data) < self.num_workers: 102 | data = data * math.ceil(self.num_workers / len(data)) 103 | data = data[:self.num_workers] 104 | data = data[self.worker_id::self.num_workers] 105 | return data 106 | 107 | 108 | class DataList(IterableDataset): 109 | 110 | def __init__(self, lists, shuffle=True, partition=True): 111 | self.lists = lists 112 | self.sampler = DistributedSampler(shuffle, partition) 113 | 114 | def set_epoch(self, epoch): 115 | self.sampler.set_epoch(epoch) 116 | 117 | def __iter__(self): 118 | sampler_info = self.sampler.update() 119 | indexes = self.sampler.sample(self.lists) 120 | for index in indexes: 121 | data = dict(src=self.lists[index]) 122 | data.update(sampler_info) 123 | yield data 124 | 125 | 126 | def Dataset(data_list_file, 127 | data_pipeline, 128 | 
mode='train', 129 | shuffle=True, 130 | partition=True, 131 | tts_file='', 132 | prompt_utt2data=''): 133 | """ Construct dataset from arguments 134 | 135 | We have two shuffle stage in the Dataset. The first is global 136 | shuffle at shards tar/raw file level. The second is global shuffle 137 | at training samples level. 138 | 139 | Args: 140 | data_type(str): raw/shard 141 | tokenizer (BaseTokenizer): tokenizer to tokenize 142 | partition(bool): whether to do data partition in terms of rank 143 | """ 144 | assert mode in ['train', 'inference'] 145 | lists = read_lists(data_list_file) 146 | if mode == 'inference': 147 | with open(tts_file) as f: 148 | tts_data = json.load(f) 149 | utt2lists = read_json_lists(prompt_utt2data) 150 | # filter unnecessary file in inference mode 151 | lists = list(set([utt2lists[utt] for utt in tts_data.keys() if utt2lists[utt] in lists])) 152 | dataset = DataList(lists, 153 | shuffle=shuffle, 154 | partition=partition) 155 | if mode == 'inference': 156 | # map partial arg tts_data in inference mode 157 | data_pipeline[0] = partial(data_pipeline[0], tts_data=tts_data) 158 | for func in data_pipeline: 159 | dataset = Processor(dataset, func, mode=mode) 160 | return dataset 161 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/matcha/hifigan/README.md: -------------------------------------------------------------------------------- 1 | # HiFi-GAN: Generative Adversarial Networks for Efficient and High Fidelity Speech Synthesis 2 | 3 | ### Jungil Kong, Jaehyeon Kim, Jaekyoung Bae 4 | 5 | In our [paper](https://arxiv.org/abs/2010.05646), 6 | we proposed HiFi-GAN: a GAN-based model capable of generating high fidelity speech efficiently.
7 | We provide our implementation and pretrained models as open source in this repository. 8 | 9 | **Abstract :** 10 | Several recent work on speech synthesis have employed generative adversarial networks (GANs) to produce raw waveforms. 11 | Although such methods improve the sampling efficiency and memory usage, 12 | their sample quality has not yet reached that of autoregressive and flow-based generative models. 13 | In this work, we propose HiFi-GAN, which achieves both efficient and high-fidelity speech synthesis. 14 | As speech audio consists of sinusoidal signals with various periods, 15 | we demonstrate that modeling periodic patterns of an audio is crucial for enhancing sample quality. 16 | A subjective human evaluation (mean opinion score, MOS) of a single speaker dataset indicates that our proposed method 17 | demonstrates similarity to human quality while generating 22.05 kHz high-fidelity audio 167.9 times faster than 18 | real-time on a single V100 GPU. We further show the generality of HiFi-GAN to the mel-spectrogram inversion of unseen 19 | speakers and end-to-end speech synthesis. Finally, a small footprint version of HiFi-GAN generates samples 13.4 times 20 | faster than real-time on CPU with comparable quality to an autoregressive counterpart. 21 | 22 | Visit our [demo website](https://jik876.github.io/hifi-gan-demo/) for audio samples. 23 | 24 | ## Pre-requisites 25 | 26 | 1. Python >= 3.6 27 | 2. Clone this repository. 28 | 3. Install python requirements. Please refer [requirements.txt](requirements.txt) 29 | 4. Download and extract the [LJ Speech dataset](https://keithito.com/LJ-Speech-Dataset/). 30 | And move all wav files to `LJSpeech-1.1/wavs` 31 | 32 | ## Training 33 | 34 | ``` 35 | python train.py --config config_v1.json 36 | ``` 37 | 38 | To train V2 or V3 Generator, replace `config_v1.json` with `config_v2.json` or `config_v3.json`.
39 | Checkpoints and copy of the configuration file are saved in `cp_hifigan` directory by default.
40 | You can change the path by adding `--checkpoint_path` option. 41 | 42 | Validation loss during training with V1 generator.
43 | ![validation loss](./validation_loss.png) 44 | 45 | ## Pretrained Model 46 | 47 | You can also use pretrained models we provide.
48 | [Download pretrained models](https://drive.google.com/drive/folders/1-eEYTB5Av9jNql0WGBlRoi-WH2J7bp5Y?usp=sharing)
49 | Details of each folder are as follows: 50 | 51 | | Folder Name | Generator | Dataset | Fine-Tuned | 52 | | ------------ | --------- | --------- | ------------------------------------------------------ | 53 | | LJ_V1 | V1 | LJSpeech | No | 54 | | LJ_V2 | V2 | LJSpeech | No | 55 | | LJ_V3 | V3 | LJSpeech | No | 56 | | LJ_FT_T2_V1 | V1 | LJSpeech | Yes ([Tacotron2](https://github.com/NVIDIA/tacotron2)) | 57 | | LJ_FT_T2_V2 | V2 | LJSpeech | Yes ([Tacotron2](https://github.com/NVIDIA/tacotron2)) | 58 | | LJ_FT_T2_V3 | V3 | LJSpeech | Yes ([Tacotron2](https://github.com/NVIDIA/tacotron2)) | 59 | | VCTK_V1 | V1 | VCTK | No | 60 | | VCTK_V2 | V2 | VCTK | No | 61 | | VCTK_V3 | V3 | VCTK | No | 62 | | UNIVERSAL_V1 | V1 | Universal | No | 63 | 64 | We provide the universal model with discriminator weights that can be used as a base for transfer learning to other datasets. 65 | 66 | ## Fine-Tuning 67 | 68 | 1. Generate mel-spectrograms in numpy format using [Tacotron2](https://github.com/NVIDIA/tacotron2) with teacher-forcing.
69 | The file name of the generated mel-spectrogram should match the audio file and the extension should be `.npy`.
70 | Example: 71 | ` Audio File : LJ001-0001.wav 72 | Mel-Spectrogram File : LJ001-0001.npy` 73 | 2. Create `ft_dataset` folder and copy the generated mel-spectrogram files into it.
74 | 3. Run the following command. 75 | ``` 76 | python train.py --fine_tuning True --config config_v1.json 77 | ``` 78 | For other command line options, please refer to the training section. 79 | 80 | ## Inference from wav file 81 | 82 | 1. Make `test_files` directory and copy wav files into the directory. 83 | 2. Run the following command. 84 | ` python inference.py --checkpoint_file [generator checkpoint file path]` 85 | Generated wav files are saved in `generated_files` by default.
86 | You can change the path by adding `--output_dir` option. 87 | 88 | ## Inference for end-to-end speech synthesis 89 | 90 | 1. Make `test_mel_files` directory and copy generated mel-spectrogram files into the directory.
91 | You can generate mel-spectrograms using [Tacotron2](https://github.com/NVIDIA/tacotron2), 92 | [Glow-TTS](https://github.com/jaywalnut310/glow-tts) and so forth. 93 | 2. Run the following command. 94 | ` python inference_e2e.py --checkpoint_file [generator checkpoint file path]` 95 | Generated wav files are saved in `generated_files_from_mel` by default.
96 | You can change the path by adding `--output_dir` option. 97 | 98 | ## Acknowledgements 99 | 100 | We referred to [WaveGlow](https://github.com/NVIDIA/waveglow), [MelGAN](https://github.com/descriptinc/melgan-neurips) 101 | and [Tacotron2](https://github.com/NVIDIA/tacotron2) to implement this. 102 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/matcha/onnx/export.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import random 3 | from pathlib import Path 4 | 5 | import numpy as np 6 | import torch 7 | from lightning import LightningModule 8 | 9 | from matcha.cli import VOCODER_URLS, load_matcha, load_vocoder 10 | 11 | DEFAULT_OPSET = 15 12 | 13 | SEED = 1234 14 | random.seed(SEED) 15 | np.random.seed(SEED) 16 | torch.manual_seed(SEED) 17 | torch.cuda.manual_seed(SEED) 18 | torch.backends.cudnn.deterministic = True 19 | torch.backends.cudnn.benchmark = False 20 | 21 | 22 | class MatchaWithVocoder(LightningModule): 23 | def __init__(self, matcha, vocoder): 24 | super().__init__() 25 | self.matcha = matcha 26 | self.vocoder = vocoder 27 | 28 | def forward(self, x, x_lengths, scales, spks=None): 29 | mel, mel_lengths = self.matcha(x, x_lengths, scales, spks) 30 | wavs = self.vocoder(mel).clamp(-1, 1) 31 | lengths = mel_lengths * 256 32 | return wavs.squeeze(1), lengths 33 | 34 | 35 | def get_exportable_module(matcha, vocoder, n_timesteps): 36 | """ 37 | Return an appropriate `LighteningModule` and output-node names 38 | based on whether the vocoder is embedded in the final graph 39 | """ 40 | 41 | def onnx_forward_func(x, x_lengths, scales, spks=None): 42 | """ 43 | Custom forward function for accepting 44 | scaler parameters as tensors 45 | """ 46 | # Extract scaler parameters from tensors 47 | temperature = scales[0] 48 | length_scale = scales[1] 49 | output = matcha.synthesise(x, x_lengths, n_timesteps, temperature, spks, length_scale) 50 | return output["mel"], output["mel_lengths"] 51 | 52 | # Monkey-patch Matcha's forward function 53 | matcha.forward = onnx_forward_func 54 | 55 | if vocoder is None: 56 | model, output_names = matcha, ["mel", "mel_lengths"] 57 | else: 58 | model = MatchaWithVocoder(matcha, vocoder) 59 | output_names = ["wav", "wav_lengths"] 60 | return model, output_names 61 | 62 | 63 | def get_inputs(is_multi_speaker): 64 | """ 65 | Create dummy inputs for tracing 66 | """ 67 | dummy_input_length = 50 68 | x = torch.randint(low=0, high=20, size=(1, dummy_input_length), dtype=torch.long) 69 | x_lengths = torch.LongTensor([dummy_input_length]) 70 | 71 | # Scales 72 | temperature = 0.667 73 | length_scale = 1.0 74 | scales = torch.Tensor([temperature, length_scale]) 75 | 76 | model_inputs = [x, x_lengths, scales] 77 | input_names = [ 78 | "x", 79 | "x_lengths", 80 | "scales", 81 | ] 82 | 83 | if is_multi_speaker: 84 | spks = torch.LongTensor([1]) 85 | model_inputs.append(spks) 86 | input_names.append("spks") 87 | 88 | return tuple(model_inputs), input_names 89 | 90 | 91 | def main(): 92 | parser = argparse.ArgumentParser(description="Export 🍵 Matcha-TTS to ONNX") 93 | 94 | parser.add_argument( 95 | "checkpoint_path", 96 | type=str, 97 | help="Path to the model checkpoint", 98 | ) 99 | parser.add_argument("output", type=str, help="Path to output `.onnx` file") 100 | parser.add_argument( 101 | "--n-timesteps", type=int, default=5, help="Number of steps to use for reverse diffusion in decoder (default 5)" 102 | ) 103 | parser.add_argument( 104 | 
"--vocoder-name", 105 | type=str, 106 | choices=list(VOCODER_URLS.keys()), 107 | default=None, 108 | help="Name of the vocoder to embed in the ONNX graph", 109 | ) 110 | parser.add_argument( 111 | "--vocoder-checkpoint-path", 112 | type=str, 113 | default=None, 114 | help="Vocoder checkpoint to embed in the ONNX graph for an `e2e` like experience", 115 | ) 116 | parser.add_argument("--opset", type=int, default=DEFAULT_OPSET, help="ONNX opset version to use (default 15") 117 | 118 | args = parser.parse_args() 119 | 120 | print(f"[🍵] Loading Matcha checkpoint from {args.checkpoint_path}") 121 | print(f"Setting n_timesteps to {args.n_timesteps}") 122 | 123 | checkpoint_path = Path(args.checkpoint_path) 124 | matcha = load_matcha(checkpoint_path.stem, checkpoint_path, "cpu") 125 | 126 | if args.vocoder_name or args.vocoder_checkpoint_path: 127 | assert ( 128 | args.vocoder_name and args.vocoder_checkpoint_path 129 | ), "Both vocoder_name and vocoder-checkpoint are required when embedding the vocoder in the ONNX graph." 130 | vocoder, _ = load_vocoder(args.vocoder_name, args.vocoder_checkpoint_path, "cpu") 131 | else: 132 | vocoder = None 133 | 134 | is_multi_speaker = matcha.n_spks > 1 135 | 136 | dummy_input, input_names = get_inputs(is_multi_speaker) 137 | model, output_names = get_exportable_module(matcha, vocoder, args.n_timesteps) 138 | 139 | # Set dynamic shape for inputs/outputs 140 | dynamic_axes = { 141 | "x": {0: "batch_size", 1: "time"}, 142 | "x_lengths": {0: "batch_size"}, 143 | } 144 | 145 | if vocoder is None: 146 | dynamic_axes.update( 147 | { 148 | "mel": {0: "batch_size", 2: "time"}, 149 | "mel_lengths": {0: "batch_size"}, 150 | } 151 | ) 152 | else: 153 | print("Embedding the vocoder in the ONNX graph") 154 | dynamic_axes.update( 155 | { 156 | "wav": {0: "batch_size", 1: "time"}, 157 | "wav_lengths": {0: "batch_size"}, 158 | } 159 | ) 160 | 161 | if is_multi_speaker: 162 | dynamic_axes["spks"] = {0: "batch_size"} 163 | 164 | # Create the output directory (if not exists) 165 | Path(args.output).parent.mkdir(parents=True, exist_ok=True) 166 | 167 | model.to_onnx( 168 | args.output, 169 | dummy_input, 170 | input_names=input_names, 171 | output_names=output_names, 172 | dynamic_axes=dynamic_axes, 173 | opset_version=args.opset, 174 | export_params=True, 175 | do_constant_folding=True, 176 | ) 177 | print(f"[🍵] ONNX model exported to {args.output}") 178 | 179 | 180 | if __name__ == "__main__": 181 | main() 182 | -------------------------------------------------------------------------------- /cosyvoice/flow/flow_matching.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Zhihao Du) 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | import torch 15 | import torch.nn.functional as F 16 | from matcha.models.components.flow_matching import BASECFM 17 | 18 | class ConditionalCFM(BASECFM): 19 | def __init__(self, in_channels, cfm_params, n_spks=1, spk_emb_dim=64, estimator: torch.nn.Module = None): 20 | super().__init__( 21 | n_feats=in_channels, 22 | cfm_params=cfm_params, 23 | n_spks=n_spks, 24 | spk_emb_dim=spk_emb_dim, 25 | ) 26 | self.t_scheduler = cfm_params.t_scheduler 27 | self.training_cfg_rate = cfm_params.training_cfg_rate 28 | self.inference_cfg_rate = cfm_params.inference_cfg_rate 29 | in_channels = in_channels + (spk_emb_dim if n_spks > 0 else 0) 30 | # Just change the architecture of the estimator here 31 | self.estimator = estimator 32 | 33 | @torch.inference_mode() 34 | def forward(self, mu, mask, n_timesteps, temperature=1.0, spks=None, cond=None): 35 | """Forward diffusion 36 | 37 | Args: 38 | mu (torch.Tensor): output of encoder 39 | shape: (batch_size, n_feats, mel_timesteps) 40 | mask (torch.Tensor): output_mask 41 | shape: (batch_size, 1, mel_timesteps) 42 | n_timesteps (int): number of diffusion steps 43 | temperature (float, optional): temperature for scaling noise. Defaults to 1.0. 44 | spks (torch.Tensor, optional): speaker ids. Defaults to None. 45 | shape: (batch_size, spk_emb_dim) 46 | cond: Not used but kept for future purposes 47 | 48 | Returns: 49 | sample: generated mel-spectrogram 50 | shape: (batch_size, n_feats, mel_timesteps) 51 | """ 52 | z = torch.randn_like(mu) * temperature 53 | t_span = torch.linspace(0, 1, n_timesteps + 1, device=mu.device) 54 | if self.t_scheduler == 'cosine': 55 | t_span = 1 - torch.cos(t_span * 0.5 * torch.pi) 56 | return self.solve_euler(z, t_span=t_span, mu=mu, mask=mask, spks=spks, cond=cond) 57 | 58 | def solve_euler(self, x, t_span, mu, mask, spks, cond): 59 | """ 60 | Fixed euler solver for ODEs. 61 | Args: 62 | x (torch.Tensor): random noise 63 | t_span (torch.Tensor): n_timesteps interpolated 64 | shape: (n_timesteps + 1,) 65 | mu (torch.Tensor): output of encoder 66 | shape: (batch_size, n_feats, mel_timesteps) 67 | mask (torch.Tensor): output_mask 68 | shape: (batch_size, 1, mel_timesteps) 69 | spks (torch.Tensor, optional): speaker ids. Defaults to None. 
70 | shape: (batch_size, spk_emb_dim) 71 | cond: Not used but kept for future purposes 72 | """ 73 | t, _, dt = t_span[0], t_span[-1], t_span[1] - t_span[0] 74 | 75 | # I am storing this because I can later plot it by putting a debugger here and saving it to a file 76 | # Or in future might add like a return_all_steps flag 77 | sol = [] 78 | 79 | for step in range(1, len(t_span)): 80 | dphi_dt = self.estimator(x, mask, mu, t, spks, cond) 81 | # Classifier-Free Guidance inference introduced in VoiceBox 82 | if self.inference_cfg_rate > 0: 83 | cfg_dphi_dt = self.estimator( 84 | x, mask, 85 | torch.zeros_like(mu), t, 86 | torch.zeros_like(spks) if spks is not None else None, 87 | torch.zeros_like(cond) 88 | ) 89 | dphi_dt = ((1.0 + self.inference_cfg_rate) * dphi_dt - 90 | self.inference_cfg_rate * cfg_dphi_dt) 91 | x = x + dt * dphi_dt 92 | t = t + dt 93 | sol.append(x) 94 | if step < len(t_span) - 1: 95 | dt = t_span[step + 1] - t 96 | 97 | return sol[-1] 98 | 99 | def compute_loss(self, x1, mask, mu, spks=None, cond=None): 100 | """Computes diffusion loss 101 | 102 | Args: 103 | x1 (torch.Tensor): Target 104 | shape: (batch_size, n_feats, mel_timesteps) 105 | mask (torch.Tensor): target mask 106 | shape: (batch_size, 1, mel_timesteps) 107 | mu (torch.Tensor): output of encoder 108 | shape: (batch_size, n_feats, mel_timesteps) 109 | spks (torch.Tensor, optional): speaker embedding. Defaults to None. 110 | shape: (batch_size, spk_emb_dim) 111 | 112 | Returns: 113 | loss: conditional flow matching loss 114 | y: conditional flow 115 | shape: (batch_size, n_feats, mel_timesteps) 116 | """ 117 | b, _, t = mu.shape 118 | 119 | # random timestep 120 | t = torch.rand([b, 1, 1], device=mu.device, dtype=mu.dtype) 121 | if self.t_scheduler == 'cosine': 122 | t = 1 - torch.cos(t * 0.5 * torch.pi) 123 | # sample noise p(x_0) 124 | z = torch.randn_like(x1) 125 | 126 | y = (1 - (1 - self.sigma_min) * t) * z + t * x1 127 | u = x1 - (1 - self.sigma_min) * z 128 | 129 | # during training, we randomly drop condition to trade off mode coverage and sample fidelity 130 | if self.training_cfg_rate > 0: 131 | cfg_mask = torch.rand(b, device=x1.device) > self.training_cfg_rate 132 | mu = mu * cfg_mask.view(-1, 1, 1) 133 | spks = spks * cfg_mask.view(-1, 1) 134 | cond = cond * cfg_mask.view(-1, 1, 1) 135 | 136 | pred = self.estimator(y, mask, mu, t.squeeze(), spks, cond) 137 | loss = F.mse_loss(pred * mask, u * mask, reduction="sum") / (torch.sum(mask) * u.shape[1]) 138 | return loss, y 139 | -------------------------------------------------------------------------------- /cosyvoice/flow/flow.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Zhihao Du) 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
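
For reference, the `compute_loss` method above trains the estimator on the optimal-transport conditional flow: with target mel `x1`, noise `z`, and timestep `t`, it builds `y_t = (1 - (1 - sigma_min) * t) * z + t * x1` and regresses the prediction onto `u = x1 - (1 - sigma_min) * z`, which is exactly the time-derivative of `y_t`. A small self-contained check of that identity (tensor shapes are illustrative; `sigma_min` is taken from the `cfm_params` in the decoder config below):

import torch

sigma_min = 1e-06
t = torch.rand(2, 1, 1, dtype=torch.float64)        # one random timestep per sample
z = torch.randn(2, 80, 100, dtype=torch.float64)    # noise sample x_0
x1 = torch.randn(2, 80, 100, dtype=torch.float64)   # stand-in for the target mel

y_t = (1 - (1 - sigma_min) * t) * z + t * x1        # interpolant between noise and data
u = x1 - (1 - sigma_min) * z                        # regression target

# finite-difference check that u equals d(y_t)/dt
eps = 1e-6
y_t_eps = (1 - (1 - sigma_min) * (t + eps)) * z + (t + eps) * x1
assert torch.allclose((y_t_eps - y_t) / eps, u, atol=1e-6)

The masked mean-squared error between the estimator output and `u` is then what `compute_loss` returns.
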
14 | import logging 15 | import random 16 | from typing import Dict, Optional 17 | import torch 18 | import torch.nn as nn 19 | from torch.nn import functional as F 20 | from omegaconf import DictConfig 21 | from cosyvoice.utils.mask import make_pad_mask 22 | 23 | 24 | class MaskedDiffWithXvec(torch.nn.Module): 25 | def __init__(self, 26 | input_size: int = 512, 27 | output_size: int = 80, 28 | spk_embed_dim: int = 192, 29 | output_type: str = "mel", 30 | vocab_size: int = 4096, 31 | input_frame_rate: int = 50, 32 | only_mask_loss: bool = True, 33 | encoder: torch.nn.Module = None, 34 | length_regulator: torch.nn.Module = None, 35 | decoder: torch.nn.Module = None, 36 | decoder_conf: Dict = {'in_channels': 240, 'out_channel': 80, 'spk_emb_dim': 80, 'n_spks': 1, 'cfm_params': DictConfig({'sigma_min': 1e-06, 'solver': 'euler', 't_scheduler': 'cosine', 'training_cfg_rate': 0.2, 'inference_cfg_rate': 0.7, 'reg_loss_type': 'l1'}), 'decoder_params': {'channels': [256, 256], 'dropout': 0.0, 'attention_head_dim': 64, 'n_blocks': 4, 'num_mid_blocks': 12, 'num_heads': 8, 'act_fn': 'gelu'}}, 37 | mel_feat_conf: Dict = {'n_fft': 1024, 'num_mels': 80, 'sampling_rate': 22050, 'hop_size': 256, 'win_size': 1024, 'fmin': 0, 'fmax': 8000}): 38 | super().__init__() 39 | self.input_size = input_size 40 | self.output_size = output_size 41 | self.decoder_conf = decoder_conf 42 | self.mel_feat_conf = mel_feat_conf 43 | self.vocab_size = vocab_size 44 | self.output_type = output_type 45 | self.input_frame_rate = input_frame_rate 46 | logging.info(f"input frame rate={self.input_frame_rate}") 47 | self.input_embedding = nn.Embedding(vocab_size, input_size) 48 | self.spk_embed_affine_layer = torch.nn.Linear(spk_embed_dim, output_size) 49 | self.encoder = encoder 50 | self.encoder_proj = torch.nn.Linear(self.encoder.output_size(), output_size) 51 | self.decoder = decoder 52 | self.length_regulator = length_regulator 53 | self.only_mask_loss = only_mask_loss 54 | 55 | def forward( 56 | self, 57 | batch: dict, 58 | device: torch.device, 59 | ) -> Dict[str, Optional[torch.Tensor]]: 60 | token = batch['speech_token'].to(device) 61 | token_len = batch['speech_token_len'].to(device) 62 | feat = batch['speech_feat'].to(device) 63 | feat_len = batch['speech_feat_len'].to(device) 64 | embedding = batch['embedding'].to(device) 65 | 66 | # xvec projection 67 | embedding = F.normalize(embedding, dim=1) 68 | embedding = self.spk_embed_affine_layer(embedding) 69 | 70 | # concat text and prompt_text 71 | mask = (~make_pad_mask(token_len)).float().unsqueeze(-1).to(device) 72 | token = self.input_embedding(torch.clamp(token, min=0)) * mask 73 | 74 | # text encode 75 | h, h_lengths = self.encoder(token, token_len) 76 | h = self.encoder_proj(h) 77 | h, h_lengths = self.length_regulator(h, feat_len) 78 | 79 | # get conditions 80 | conds = torch.zeros(feat.shape, device=token.device) 81 | for i, j in enumerate(feat_len): 82 | if random.random() < 0.5: 83 | continue 84 | index = random.randint(0, int(0.3 * j)) 85 | conds[i, :index] = feat[i, :index] 86 | conds = conds.transpose(1, 2) 87 | 88 | mask = (~make_pad_mask(feat_len)).to(h) 89 | feat = F.interpolate(feat.unsqueeze(dim=1), size=h.shape[1:], mode="nearest").squeeze(dim=1) 90 | loss, _ = self.decoder.compute_loss( 91 | feat.transpose(1, 2).contiguous(), 92 | mask.unsqueeze(1), 93 | h.transpose(1, 2).contiguous(), 94 | embedding, 95 | cond=conds 96 | ) 97 | return {'loss': loss} 98 | 99 | @torch.inference_mode() 100 | def inference(self, 101 | token, 102 | token_len, 103 | 
prompt_token, 104 | prompt_token_len, 105 | prompt_feat, 106 | prompt_feat_len, 107 | embedding): 108 | assert token.shape[0] == 1 109 | # xvec projection 110 | embedding = F.normalize(embedding, dim=1) 111 | embedding = self.spk_embed_affine_layer(embedding) 112 | 113 | # concat text and prompt_text 114 | token, token_len = torch.concat([prompt_token, token], dim=1), prompt_token_len + token_len 115 | mask = (~make_pad_mask(token_len)).float().unsqueeze(-1).to(embedding) 116 | token = self.input_embedding(torch.clamp(token, min=0)) * mask 117 | 118 | # text encode 119 | h, h_lengths = self.encoder(token, token_len) 120 | h = self.encoder_proj(h) 121 | feat_len = (token_len / 50 * 22050 / 256).int() 122 | h, h_lengths = self.length_regulator(h, feat_len) 123 | 124 | # get conditions 125 | conds = torch.zeros([1, feat_len.max().item(), self.output_size], device=token.device) 126 | if prompt_feat.shape[1] != 0: 127 | for i, j in enumerate(prompt_feat_len): 128 | conds[i, :j] = prompt_feat[i] 129 | conds = conds.transpose(1, 2) 130 | 131 | mask = (~make_pad_mask(feat_len)).to(h) 132 | feat = self.decoder( 133 | mu=h.transpose(1, 2).contiguous(), 134 | mask=mask.unsqueeze(1), 135 | spks=embedding, 136 | cond=conds, 137 | n_timesteps=10 138 | ) 139 | if prompt_feat.shape[1] != 0: 140 | feat = feat[:, :, prompt_feat.shape[1]:] 141 | return feat 142 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/matcha/onnx/infer.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import warnings 4 | from pathlib import Path 5 | from time import perf_counter 6 | 7 | import numpy as np 8 | import onnxruntime as ort 9 | import soundfile as sf 10 | import torch 11 | 12 | from matcha.cli import plot_spectrogram_to_numpy, process_text 13 | 14 | 15 | def validate_args(args): 16 | assert ( 17 | args.text or args.file 18 | ), "Either text or file must be provided. Matcha-T(ea)TTS needs some text to whisk the waveforms."
19 | assert args.temperature >= 0, "Sampling temperature cannot be negative" 20 | assert args.speaking_rate >= 0, "Speaking rate must be greater than 0" 21 | return args 22 | 23 | 24 | def write_wavs(model, inputs, output_dir, external_vocoder=None): 25 | if external_vocoder is None: 26 | print("The provided model has the vocoder embedded in the graph.\nGenerating waveform directly") 27 | t0 = perf_counter() 28 | wavs, wav_lengths = model.run(None, inputs) 29 | infer_secs = perf_counter() - t0 30 | mel_infer_secs = vocoder_infer_secs = None 31 | else: 32 | print("[🍵] Generating mel using Matcha") 33 | mel_t0 = perf_counter() 34 | mels, mel_lengths = model.run(None, inputs) 35 | mel_infer_secs = perf_counter() - mel_t0 36 | print("Generating waveform from mel using external vocoder") 37 | vocoder_inputs = {external_vocoder.get_inputs()[0].name: mels} 38 | vocoder_t0 = perf_counter() 39 | wavs = external_vocoder.run(None, vocoder_inputs)[0] 40 | vocoder_infer_secs = perf_counter() - vocoder_t0 41 | wavs = wavs.squeeze(1) 42 | wav_lengths = mel_lengths * 256 43 | infer_secs = mel_infer_secs + vocoder_infer_secs 44 | 45 | output_dir = Path(output_dir) 46 | output_dir.mkdir(parents=True, exist_ok=True) 47 | for i, (wav, wav_length) in enumerate(zip(wavs, wav_lengths)): 48 | output_filename = output_dir.joinpath(f"output_{i + 1}.wav") 49 | audio = wav[:wav_length] 50 | print(f"Writing audio to {output_filename}") 51 | sf.write(output_filename, audio, 22050, "PCM_24") 52 | 53 | wav_secs = wav_lengths.sum() / 22050 54 | print(f"Inference seconds: {infer_secs}") 55 | print(f"Generated wav seconds: {wav_secs}") 56 | rtf = infer_secs / wav_secs 57 | if mel_infer_secs is not None: 58 | mel_rtf = mel_infer_secs / wav_secs 59 | print(f"Matcha RTF: {mel_rtf}") 60 | if vocoder_infer_secs is not None: 61 | vocoder_rtf = vocoder_infer_secs / wav_secs 62 | print(f"Vocoder RTF: {vocoder_rtf}") 63 | print(f"Overall RTF: {rtf}") 64 | 65 | 66 | def write_mels(model, inputs, output_dir): 67 | t0 = perf_counter() 68 | mels, mel_lengths = model.run(None, inputs) 69 | infer_secs = perf_counter() - t0 70 | 71 | output_dir = Path(output_dir) 72 | output_dir.mkdir(parents=True, exist_ok=True) 73 | for i, mel in enumerate(mels): 74 | output_stem = output_dir.joinpath(f"output_{i + 1}") 75 | plot_spectrogram_to_numpy(mel.squeeze(), output_stem.with_suffix(".png")) 76 | np.save(output_stem.with_suffix(".numpy"), mel) 77 | 78 | wav_secs = (mel_lengths * 256).sum() / 22050 79 | print(f"Inference seconds: {infer_secs}") 80 | print(f"Generated wav seconds: {wav_secs}") 81 | rtf = infer_secs / wav_secs 82 | print(f"RTF: {rtf}") 83 | 84 | 85 | def main(): 86 | parser = argparse.ArgumentParser( 87 | description=" 🍵 Matcha-TTS: A fast TTS architecture with conditional flow matching" 88 | ) 89 | parser.add_argument( 90 | "model", 91 | type=str, 92 | help="ONNX model to use", 93 | ) 94 | parser.add_argument("--vocoder", type=str, default=None, help="Vocoder to use (defaults to None)") 95 | parser.add_argument("--text", type=str, default=None, help="Text to synthesize") 96 | parser.add_argument("--file", type=str, default=None, help="Text file to synthesize") 97 | parser.add_argument("--spk", type=int, default=None, help="Speaker ID") 98 | parser.add_argument( 99 | "--temperature", 100 | type=float, 101 | default=0.667, 102 | help="Variance of the x0 noise (default: 0.667)", 103 | ) 104 | parser.add_argument( 105 | "--speaking-rate", 106 | type=float, 107 | default=1.0, 108 | help="change the speaking rate, a higher value means 
slower speaking rate (default: 1.0)", 109 | ) 110 | parser.add_argument("--gpu", action="store_true", help="Use GPU for inference (default: CPU)") 111 | parser.add_argument( 112 | "--output-dir", 113 | type=str, 114 | default=os.getcwd(), 115 | help="Output folder to save results (default: current dir)", 116 | ) 117 | 118 | args = parser.parse_args() 119 | args = validate_args(args) 120 | 121 | if args.gpu: 122 | providers = ["CUDAExecutionProvider"] 123 | else: 124 | providers = ["CPUExecutionProvider"] 125 | model = ort.InferenceSession(args.model, providers=providers) 126 | 127 | model_inputs = model.get_inputs() 128 | model_outputs = list(model.get_outputs()) 129 | 130 | if args.text: 131 | text_lines = args.text.splitlines() 132 | else: 133 | with open(args.file, encoding="utf-8") as file: 134 | text_lines = file.read().splitlines() 135 | 136 | processed_lines = [process_text(0, line, "cpu") for line in text_lines] 137 | x = [line["x"].squeeze() for line in processed_lines] 138 | # Pad 139 | x = torch.nn.utils.rnn.pad_sequence(x, batch_first=True) 140 | x = x.detach().cpu().numpy() 141 | x_lengths = np.array([line["x_lengths"].item() for line in processed_lines], dtype=np.int64) 142 | inputs = { 143 | "x": x, 144 | "x_lengths": x_lengths, 145 | "scales": np.array([args.temperature, args.speaking_rate], dtype=np.float32), 146 | } 147 | is_multi_speaker = len(model_inputs) == 4 148 | if is_multi_speaker: 149 | if args.spk is None: 150 | args.spk = 0 151 | warn = "[!] Speaker ID not provided! Using speaker ID 0" 152 | warnings.warn(warn, UserWarning) 153 | inputs["spks"] = np.repeat(args.spk, x.shape[0]).astype(np.int64) 154 | 155 | has_vocoder_embedded = model_outputs[0].name == "wav" 156 | if has_vocoder_embedded: 157 | write_wavs(model, inputs, args.output_dir) 158 | elif args.vocoder: 159 | external_vocoder = ort.InferenceSession(args.vocoder, providers=providers) 160 | write_wavs(model, inputs, args.output_dir, external_vocoder=external_vocoder) 161 | else: 162 | warn = "[!] No vocoder is embedded in the graph and no external vocoder was provided. 
The mel output will be written as numpy arrays to `*.npy` files in the output directory" 163 | warnings.warn(warn, UserWarning) 164 | write_mels(model, inputs, args.output_dir) 165 | 166 | 167 | if __name__ == "__main__": 168 | main() 169 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/matcha/hifigan/meldataset.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/jik876/hifi-gan """ 2 | 3 | import math 4 | import os 5 | import random 6 | 7 | import numpy as np 8 | import torch 9 | import torch.utils.data 10 | from librosa.filters import mel as librosa_mel_fn 11 | from librosa.util import normalize 12 | from scipy.io.wavfile import read 13 | 14 | MAX_WAV_VALUE = 32768.0 15 | 16 | 17 | def load_wav(full_path): 18 | sampling_rate, data = read(full_path) 19 | return data, sampling_rate 20 | 21 | 22 | def dynamic_range_compression(x, C=1, clip_val=1e-5): 23 | return np.log(np.clip(x, a_min=clip_val, a_max=None) * C) 24 | 25 | 26 | def dynamic_range_decompression(x, C=1): 27 | return np.exp(x) / C 28 | 29 | 30 | def dynamic_range_compression_torch(x, C=1, clip_val=1e-5): 31 | return torch.log(torch.clamp(x, min=clip_val) * C) 32 | 33 | 34 | def dynamic_range_decompression_torch(x, C=1): 35 | return torch.exp(x) / C 36 | 37 | 38 | def spectral_normalize_torch(magnitudes): 39 | output = dynamic_range_compression_torch(magnitudes) 40 | return output 41 | 42 | 43 | def spectral_de_normalize_torch(magnitudes): 44 | output = dynamic_range_decompression_torch(magnitudes) 45 | return output 46 | 47 | 48 | mel_basis = {} 49 | hann_window = {} 50 | 51 | 52 | def mel_spectrogram(y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False): 53 | if torch.min(y) < -1.0: 54 | print("min value is ", torch.min(y)) 55 | if torch.max(y) > 1.0: 56 | print("max value is ", torch.max(y)) 57 | 58 | global mel_basis, hann_window # pylint: disable=global-statement 59 | if fmax not in mel_basis: 60 | mel = librosa_mel_fn(sampling_rate, n_fft, num_mels, fmin, fmax) 61 | mel_basis[str(fmax) + "_" + str(y.device)] = torch.from_numpy(mel).float().to(y.device) 62 | hann_window[str(y.device)] = torch.hann_window(win_size).to(y.device) 63 | 64 | y = torch.nn.functional.pad( 65 | y.unsqueeze(1), (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)), mode="reflect" 66 | ) 67 | y = y.squeeze(1) 68 | 69 | spec = torch.view_as_real( 70 | torch.stft( 71 | y, 72 | n_fft, 73 | hop_length=hop_size, 74 | win_length=win_size, 75 | window=hann_window[str(y.device)], 76 | center=center, 77 | pad_mode="reflect", 78 | normalized=False, 79 | onesided=True, 80 | return_complex=True, 81 | ) 82 | ) 83 | 84 | spec = torch.sqrt(spec.pow(2).sum(-1) + (1e-9)) 85 | 86 | spec = torch.matmul(mel_basis[str(fmax) + "_" + str(y.device)], spec) 87 | spec = spectral_normalize_torch(spec) 88 | 89 | return spec 90 | 91 | 92 | def get_dataset_filelist(a): 93 | with open(a.input_training_file, encoding="utf-8") as fi: 94 | training_files = [ 95 | os.path.join(a.input_wavs_dir, x.split("|")[0] + ".wav") for x in fi.read().split("\n") if len(x) > 0 96 | ] 97 | 98 | with open(a.input_validation_file, encoding="utf-8") as fi: 99 | validation_files = [ 100 | os.path.join(a.input_wavs_dir, x.split("|")[0] + ".wav") for x in fi.read().split("\n") if len(x) > 0 101 | ] 102 | return training_files, validation_files 103 | 104 | 105 | class MelDataset(torch.utils.data.Dataset): 106 | def __init__( 107 | self, 108 | 
training_files, 109 | segment_size, 110 | n_fft, 111 | num_mels, 112 | hop_size, 113 | win_size, 114 | sampling_rate, 115 | fmin, 116 | fmax, 117 | split=True, 118 | shuffle=True, 119 | n_cache_reuse=1, 120 | device=None, 121 | fmax_loss=None, 122 | fine_tuning=False, 123 | base_mels_path=None, 124 | ): 125 | self.audio_files = training_files 126 | random.seed(1234) 127 | if shuffle: 128 | random.shuffle(self.audio_files) 129 | self.segment_size = segment_size 130 | self.sampling_rate = sampling_rate 131 | self.split = split 132 | self.n_fft = n_fft 133 | self.num_mels = num_mels 134 | self.hop_size = hop_size 135 | self.win_size = win_size 136 | self.fmin = fmin 137 | self.fmax = fmax 138 | self.fmax_loss = fmax_loss 139 | self.cached_wav = None 140 | self.n_cache_reuse = n_cache_reuse 141 | self._cache_ref_count = 0 142 | self.device = device 143 | self.fine_tuning = fine_tuning 144 | self.base_mels_path = base_mels_path 145 | 146 | def __getitem__(self, index): 147 | filename = self.audio_files[index] 148 | if self._cache_ref_count == 0: 149 | audio, sampling_rate = load_wav(filename) 150 | audio = audio / MAX_WAV_VALUE 151 | if not self.fine_tuning: 152 | audio = normalize(audio) * 0.95 153 | self.cached_wav = audio 154 | if sampling_rate != self.sampling_rate: 155 | raise ValueError(f"{sampling_rate} SR doesn't match target {self.sampling_rate} SR") 156 | self._cache_ref_count = self.n_cache_reuse 157 | else: 158 | audio = self.cached_wav 159 | self._cache_ref_count -= 1 160 | 161 | audio = torch.FloatTensor(audio) 162 | audio = audio.unsqueeze(0) 163 | 164 | if not self.fine_tuning: 165 | if self.split: 166 | if audio.size(1) >= self.segment_size: 167 | max_audio_start = audio.size(1) - self.segment_size 168 | audio_start = random.randint(0, max_audio_start) 169 | audio = audio[:, audio_start : audio_start + self.segment_size] 170 | else: 171 | audio = torch.nn.functional.pad(audio, (0, self.segment_size - audio.size(1)), "constant") 172 | 173 | mel = mel_spectrogram( 174 | audio, 175 | self.n_fft, 176 | self.num_mels, 177 | self.sampling_rate, 178 | self.hop_size, 179 | self.win_size, 180 | self.fmin, 181 | self.fmax, 182 | center=False, 183 | ) 184 | else: 185 | mel = np.load(os.path.join(self.base_mels_path, os.path.splitext(os.path.split(filename)[-1])[0] + ".npy")) 186 | mel = torch.from_numpy(mel) 187 | 188 | if len(mel.shape) < 3: 189 | mel = mel.unsqueeze(0) 190 | 191 | if self.split: 192 | frames_per_seg = math.ceil(self.segment_size / self.hop_size) 193 | 194 | if audio.size(1) >= self.segment_size: 195 | mel_start = random.randint(0, mel.size(2) - frames_per_seg - 1) 196 | mel = mel[:, :, mel_start : mel_start + frames_per_seg] 197 | audio = audio[:, mel_start * self.hop_size : (mel_start + frames_per_seg) * self.hop_size] 198 | else: 199 | mel = torch.nn.functional.pad(mel, (0, frames_per_seg - mel.size(2)), "constant") 200 | audio = torch.nn.functional.pad(audio, (0, self.segment_size - audio.size(1)), "constant") 201 | 202 | mel_loss = mel_spectrogram( 203 | audio, 204 | self.n_fft, 205 | self.num_mels, 206 | self.sampling_rate, 207 | self.hop_size, 208 | self.win_size, 209 | self.fmin, 210 | self.fmax_loss, 211 | center=False, 212 | ) 213 | 214 | return (mel.squeeze(), audio.squeeze(0), filename, mel_loss.squeeze()) 215 | 216 | def __len__(self): 217 | return len(self.audio_files) 218 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/matcha/models/baselightningmodule.py: 
-------------------------------------------------------------------------------- 1 | """ 2 | This is a base lightning module that can be used to train a model. 3 | The benefit of this abstraction is that all the logic outside of model definition can be reused for different models. 4 | """ 5 | import inspect 6 | from abc import ABC 7 | from typing import Any, Dict 8 | 9 | import torch 10 | from lightning import LightningModule 11 | from lightning.pytorch.utilities import grad_norm 12 | 13 | from matcha import utils 14 | from matcha.utils.utils import plot_tensor 15 | 16 | log = utils.get_pylogger(__name__) 17 | 18 | 19 | class BaseLightningClass(LightningModule, ABC): 20 | def update_data_statistics(self, data_statistics): 21 | if data_statistics is None: 22 | data_statistics = { 23 | "mel_mean": 0.0, 24 | "mel_std": 1.0, 25 | } 26 | 27 | self.register_buffer("mel_mean", torch.tensor(data_statistics["mel_mean"])) 28 | self.register_buffer("mel_std", torch.tensor(data_statistics["mel_std"])) 29 | 30 | def configure_optimizers(self) -> Any: 31 | optimizer = self.hparams.optimizer(params=self.parameters()) 32 | if self.hparams.scheduler not in (None, {}): 33 | scheduler_args = {} 34 | # Manage last epoch for exponential schedulers 35 | if "last_epoch" in inspect.signature(self.hparams.scheduler.scheduler).parameters: 36 | if hasattr(self, "ckpt_loaded_epoch"): 37 | current_epoch = self.ckpt_loaded_epoch - 1 38 | else: 39 | current_epoch = -1 40 | 41 | scheduler_args.update({"optimizer": optimizer}) 42 | scheduler = self.hparams.scheduler.scheduler(**scheduler_args) 43 | scheduler.last_epoch = current_epoch 44 | return { 45 | "optimizer": optimizer, 46 | "lr_scheduler": { 47 | "scheduler": scheduler, 48 | "interval": self.hparams.scheduler.lightning_args.interval, 49 | "frequency": self.hparams.scheduler.lightning_args.frequency, 50 | "name": "learning_rate", 51 | }, 52 | } 53 | 54 | return {"optimizer": optimizer} 55 | 56 | def get_losses(self, batch): 57 | x, x_lengths = batch["x"], batch["x_lengths"] 58 | y, y_lengths = batch["y"], batch["y_lengths"] 59 | spks = batch["spks"] 60 | 61 | dur_loss, prior_loss, diff_loss = self( 62 | x=x, 63 | x_lengths=x_lengths, 64 | y=y, 65 | y_lengths=y_lengths, 66 | spks=spks, 67 | out_size=self.out_size, 68 | ) 69 | return { 70 | "dur_loss": dur_loss, 71 | "prior_loss": prior_loss, 72 | "diff_loss": diff_loss, 73 | } 74 | 75 | def on_load_checkpoint(self, checkpoint: Dict[str, Any]) -> None: 76 | self.ckpt_loaded_epoch = checkpoint["epoch"] # pylint: disable=attribute-defined-outside-init 77 | 78 | def training_step(self, batch: Any, batch_idx: int): 79 | loss_dict = self.get_losses(batch) 80 | self.log( 81 | "step", 82 | float(self.global_step), 83 | on_step=True, 84 | prog_bar=True, 85 | logger=True, 86 | sync_dist=True, 87 | ) 88 | 89 | self.log( 90 | "sub_loss/train_dur_loss", 91 | loss_dict["dur_loss"], 92 | on_step=True, 93 | on_epoch=True, 94 | logger=True, 95 | sync_dist=True, 96 | ) 97 | self.log( 98 | "sub_loss/train_prior_loss", 99 | loss_dict["prior_loss"], 100 | on_step=True, 101 | on_epoch=True, 102 | logger=True, 103 | sync_dist=True, 104 | ) 105 | self.log( 106 | "sub_loss/train_diff_loss", 107 | loss_dict["diff_loss"], 108 | on_step=True, 109 | on_epoch=True, 110 | logger=True, 111 | sync_dist=True, 112 | ) 113 | 114 | total_loss = sum(loss_dict.values()) 115 | self.log( 116 | "loss/train", 117 | total_loss, 118 | on_step=True, 119 | on_epoch=True, 120 | logger=True, 121 | prog_bar=True, 122 | sync_dist=True, 123 | ) 124 | 125 | return 
{"loss": total_loss, "log": loss_dict} 126 | 127 | def validation_step(self, batch: Any, batch_idx: int): 128 | loss_dict = self.get_losses(batch) 129 | self.log( 130 | "sub_loss/val_dur_loss", 131 | loss_dict["dur_loss"], 132 | on_step=True, 133 | on_epoch=True, 134 | logger=True, 135 | sync_dist=True, 136 | ) 137 | self.log( 138 | "sub_loss/val_prior_loss", 139 | loss_dict["prior_loss"], 140 | on_step=True, 141 | on_epoch=True, 142 | logger=True, 143 | sync_dist=True, 144 | ) 145 | self.log( 146 | "sub_loss/val_diff_loss", 147 | loss_dict["diff_loss"], 148 | on_step=True, 149 | on_epoch=True, 150 | logger=True, 151 | sync_dist=True, 152 | ) 153 | 154 | total_loss = sum(loss_dict.values()) 155 | self.log( 156 | "loss/val", 157 | total_loss, 158 | on_step=True, 159 | on_epoch=True, 160 | logger=True, 161 | prog_bar=True, 162 | sync_dist=True, 163 | ) 164 | 165 | return total_loss 166 | 167 | def on_validation_end(self) -> None: 168 | if self.trainer.is_global_zero: 169 | one_batch = next(iter(self.trainer.val_dataloaders)) 170 | if self.current_epoch == 0: 171 | log.debug("Plotting original samples") 172 | for i in range(2): 173 | y = one_batch["y"][i].unsqueeze(0).to(self.device) 174 | self.logger.experiment.add_image( 175 | f"original/{i}", 176 | plot_tensor(y.squeeze().cpu()), 177 | self.current_epoch, 178 | dataformats="HWC", 179 | ) 180 | 181 | log.debug("Synthesising...") 182 | for i in range(2): 183 | x = one_batch["x"][i].unsqueeze(0).to(self.device) 184 | x_lengths = one_batch["x_lengths"][i].unsqueeze(0).to(self.device) 185 | spks = one_batch["spks"][i].unsqueeze(0).to(self.device) if one_batch["spks"] is not None else None 186 | output = self.synthesise(x[:, :x_lengths], x_lengths, n_timesteps=10, spks=spks) 187 | y_enc, y_dec = output["encoder_outputs"], output["decoder_outputs"] 188 | attn = output["attn"] 189 | self.logger.experiment.add_image( 190 | f"generated_enc/{i}", 191 | plot_tensor(y_enc.squeeze().cpu()), 192 | self.current_epoch, 193 | dataformats="HWC", 194 | ) 195 | self.logger.experiment.add_image( 196 | f"generated_dec/{i}", 197 | plot_tensor(y_dec.squeeze().cpu()), 198 | self.current_epoch, 199 | dataformats="HWC", 200 | ) 201 | self.logger.experiment.add_image( 202 | f"alignment/{i}", 203 | plot_tensor(attn.squeeze().cpu()), 204 | self.current_epoch, 205 | dataformats="HWC", 206 | ) 207 | 208 | def on_before_optimizer_step(self, optimizer): 209 | self.log_dict({f"grad_norm/{k}": v for k, v in grad_norm(self, norm_type=2).items()}) 210 | -------------------------------------------------------------------------------- /third_party/Matcha-TTS/matcha/utils/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import warnings 4 | from importlib.util import find_spec 5 | from pathlib import Path 6 | from typing import Any, Callable, Dict, Tuple 7 | 8 | import gdown 9 | import matplotlib.pyplot as plt 10 | import numpy as np 11 | import torch 12 | import wget 13 | from omegaconf import DictConfig 14 | 15 | from matcha.utils import pylogger, rich_utils 16 | 17 | log = pylogger.get_pylogger(__name__) 18 | 19 | 20 | def extras(cfg: DictConfig) -> None: 21 | """Applies optional utilities before the task is started. 22 | 23 | Utilities: 24 | - Ignoring python warnings 25 | - Setting tags from command line 26 | - Rich config printing 27 | 28 | :param cfg: A DictConfig object containing the config tree. 
29 | """ 30 | # return if no `extras` config 31 | if not cfg.get("extras"): 32 | log.warning("Extras config not found! ") 33 | return 34 | 35 | # disable python warnings 36 | if cfg.extras.get("ignore_warnings"): 37 | log.info("Disabling python warnings! ") 38 | warnings.filterwarnings("ignore") 39 | 40 | # prompt user to input tags from command line if none are provided in the config 41 | if cfg.extras.get("enforce_tags"): 42 | log.info("Enforcing tags! ") 43 | rich_utils.enforce_tags(cfg, save_to_file=True) 44 | 45 | # pretty print config tree using Rich library 46 | if cfg.extras.get("print_config"): 47 | log.info("Printing config tree with Rich! ") 48 | rich_utils.print_config_tree(cfg, resolve=True, save_to_file=True) 49 | 50 | 51 | def task_wrapper(task_func: Callable) -> Callable: 52 | """Optional decorator that controls the failure behavior when executing the task function. 53 | 54 | This wrapper can be used to: 55 | - make sure loggers are closed even if the task function raises an exception (prevents multirun failure) 56 | - save the exception to a `.log` file 57 | - mark the run as failed with a dedicated file in the `logs/` folder (so we can find and rerun it later) 58 | - etc. (adjust depending on your needs) 59 | 60 | Example: 61 | ``` 62 | @utils.task_wrapper 63 | def train(cfg: DictConfig) -> Tuple[Dict[str, Any], Dict[str, Any]]: 64 | ... 65 | return metric_dict, object_dict 66 | ``` 67 | 68 | :param task_func: The task function to be wrapped. 69 | 70 | :return: The wrapped task function. 71 | """ 72 | 73 | def wrap(cfg: DictConfig) -> Tuple[Dict[str, Any], Dict[str, Any]]: 74 | # execute the task 75 | try: 76 | metric_dict, object_dict = task_func(cfg=cfg) 77 | 78 | # things to do if exception occurs 79 | except Exception as ex: 80 | # save exception to `.log` file 81 | log.exception("") 82 | 83 | # some hyperparameter combinations might be invalid or cause out-of-memory errors 84 | # so when using hparam search plugins like Optuna, you might want to disable 85 | # raising the below exception to avoid multirun failure 86 | raise ex 87 | 88 | # things to always do after either success or exception 89 | finally: 90 | # display output dir path in terminal 91 | log.info(f"Output dir: {cfg.paths.output_dir}") 92 | 93 | # always close wandb run (even if exception occurs so multirun won't fail) 94 | if find_spec("wandb"): # check if wandb is installed 95 | import wandb 96 | 97 | if wandb.run: 98 | log.info("Closing wandb!") 99 | wandb.finish() 100 | 101 | return metric_dict, object_dict 102 | 103 | return wrap 104 | 105 | 106 | def get_metric_value(metric_dict: Dict[str, Any], metric_name: str) -> float: 107 | """Safely retrieves value of the metric logged in LightningModule. 108 | 109 | :param metric_dict: A dict containing metric values. 110 | :param metric_name: The name of the metric to retrieve. 111 | :return: The value of the metric. 112 | """ 113 | if not metric_name: 114 | log.info("Metric name is None! Skipping metric value retrieval...") 115 | return None 116 | 117 | if metric_name not in metric_dict: 118 | raise ValueError( 119 | f"Metric value not found! \n" 120 | "Make sure metric name logged in LightningModule is correct!\n" 121 | "Make sure `optimized_metric` name in `hparams_search` config is correct!" 122 | ) 123 | 124 | metric_value = metric_dict[metric_name].item() 125 | log.info(f"Retrieved metric value! 
<{metric_name}={metric_value}>") 126 | 127 | return metric_value 128 | 129 | 130 | def intersperse(lst, item): 131 | # Adds blank symbol 132 | result = [item] * (len(lst) * 2 + 1) 133 | result[1::2] = lst 134 | return result 135 | 136 | 137 | def save_figure_to_numpy(fig): 138 | data = np.fromstring(fig.canvas.tostring_rgb(), dtype=np.uint8, sep="") 139 | data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,)) 140 | return data 141 | 142 | 143 | def plot_tensor(tensor): 144 | plt.style.use("default") 145 | fig, ax = plt.subplots(figsize=(12, 3)) 146 | im = ax.imshow(tensor, aspect="auto", origin="lower", interpolation="none") 147 | plt.colorbar(im, ax=ax) 148 | plt.tight_layout() 149 | fig.canvas.draw() 150 | data = save_figure_to_numpy(fig) 151 | plt.close() 152 | return data 153 | 154 | 155 | def save_plot(tensor, savepath): 156 | plt.style.use("default") 157 | fig, ax = plt.subplots(figsize=(12, 3)) 158 | im = ax.imshow(tensor, aspect="auto", origin="lower", interpolation="none") 159 | plt.colorbar(im, ax=ax) 160 | plt.tight_layout() 161 | fig.canvas.draw() 162 | plt.savefig(savepath) 163 | plt.close() 164 | 165 | 166 | def to_numpy(tensor): 167 | if isinstance(tensor, np.ndarray): 168 | return tensor 169 | elif isinstance(tensor, torch.Tensor): 170 | return tensor.detach().cpu().numpy() 171 | elif isinstance(tensor, list): 172 | return np.array(tensor) 173 | else: 174 | raise TypeError("Unsupported type for conversion to numpy array") 175 | 176 | 177 | def get_user_data_dir(appname="matcha_tts"): 178 | """ 179 | Args: 180 | appname (str): Name of application 181 | 182 | Returns: 183 | Path: path to user data directory 184 | """ 185 | 186 | MATCHA_HOME = os.environ.get("MATCHA_HOME") 187 | if MATCHA_HOME is not None: 188 | ans = Path(MATCHA_HOME).expanduser().resolve(strict=False) 189 | elif sys.platform == "win32": 190 | import winreg # pylint: disable=import-outside-toplevel 191 | 192 | key = winreg.OpenKey( 193 | winreg.HKEY_CURRENT_USER, 194 | r"Software\Microsoft\Windows\CurrentVersion\Explorer\Shell Folders", 195 | ) 196 | dir_, _ = winreg.QueryValueEx(key, "Local AppData") 197 | ans = Path(dir_).resolve(strict=False) 198 | elif sys.platform == "darwin": 199 | ans = Path("~/Library/Application Support/").expanduser() 200 | else: 201 | ans = Path.home().joinpath(".local/share") 202 | 203 | final_path = ans.joinpath(appname) 204 | final_path.mkdir(parents=True, exist_ok=True) 205 | return final_path 206 | 207 | 208 | def assert_model_downloaded(checkpoint_path, url, use_wget=True): 209 | if Path(checkpoint_path).exists(): 210 | log.debug(f"[+] Model already present at {checkpoint_path}!") 211 | print(f"[+] Model already present at {checkpoint_path}!") 212 | return 213 | log.info(f"[-] Model not found at {checkpoint_path}! Will download it") 214 | print(f"[-] Model not found at {checkpoint_path}! Will download it") 215 | checkpoint_path = str(checkpoint_path) 216 | if not use_wget: 217 | gdown.download(url=url, output=checkpoint_path, quiet=False, fuzzy=True) 218 | else: 219 | wget.download(url=url, out=checkpoint_path) 220 | --------------------------------------------------------------------------------
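
As a quick illustration of the text-frontend helper above: `intersperse` inserts a blank symbol between and around every token id, a common way TTS frontends prepare phoneme sequences before synthesis. A minimal usage sketch, assuming the `matcha` package under `third_party/Matcha-TTS` is importable and using made-up ids:

import torch
from matcha.utils.utils import intersperse, to_numpy

ids = intersperse([5, 9, 13], item=0)
print(ids)                                # [0, 5, 0, 9, 0, 13, 0]
print(to_numpy(torch.tensor(ids)).shape)  # (7,)
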