├── .gitignore ├── LICENSE ├── README.md ├── app.py ├── data ├── audio │ ├── warm_up.wav │ ├── 女性.wav │ ├── 少女.wav │ ├── 男性.wav │ └── 青年.wav ├── icon │ ├── qwen.png │ └── user.png └── video │ ├── Avatar1.mp4 │ ├── Avatar2.mp4 │ └── Avatar3.mp4 ├── docs └── README_en.md ├── image.png ├── requirements.txt ├── server.py └── src ├── GLM_4_Voice ├── LICENSE ├── README.md ├── README_en.md ├── __init__.py ├── cosyvoice │ ├── __init__.py │ ├── bin │ │ ├── inference.py │ │ └── train.py │ ├── cli │ │ ├── __init__.py │ │ ├── cosyvoice.py │ │ ├── frontend.py │ │ └── model.py │ ├── dataset │ │ ├── __init__.py │ │ ├── dataset.py │ │ └── processor.py │ ├── flow │ │ ├── decoder.py │ │ ├── flow.py │ │ ├── flow_gradtts.py │ │ ├── flow_matching.py │ │ ├── flow_matching_dit.py │ │ ├── length_regulator.py │ │ └── stable │ │ │ ├── adp.py │ │ │ ├── blocks.py │ │ │ ├── dit.py │ │ │ ├── dit_v2.py │ │ │ ├── sampling.py │ │ │ ├── stable_diffusion.py │ │ │ ├── stable_diffusion_test.py │ │ │ ├── transformer.py │ │ │ └── transformer_use_mask.py │ ├── hifigan │ │ ├── f0_predictor.py │ │ └── generator.py │ ├── llm │ │ └── llm.py │ ├── transformer │ │ ├── __init__.py │ │ ├── activation.py │ │ ├── attention.py │ │ ├── convolution.py │ │ ├── decoder.py │ │ ├── decoder_layer.py │ │ ├── embedding.py │ │ ├── encoder.py │ │ ├── encoder_layer.py │ │ ├── label_smoothing_loss.py │ │ ├── positionwise_feed_forward.py │ │ └── subsampling.py │ └── utils │ │ ├── __init__.py │ │ ├── block_mask_util.py │ │ ├── class_utils.py │ │ ├── common.py │ │ ├── executor.py │ │ ├── file_utils.py │ │ ├── frontend_utils.py │ │ ├── mask.py │ │ ├── scheduler.py │ │ └── train_utils.py ├── flow_inference.py ├── requirements.txt ├── resources │ ├── architecture.jpeg │ └── web_demo.png ├── speech_tokenizer │ ├── __init__.py │ ├── configuration_whisper.py │ ├── generation_whisper.py │ ├── modeling_whisper.py │ └── utils.py └── third_party │ └── Matcha-TTS │ ├── .env.example │ ├── .github │ ├── PULL_REQUEST_TEMPLATE.md │ ├── codecov.yml │ ├── dependabot.yml │ └── release-drafter.yml │ ├── .gitignore │ ├── .pre-commit-config.yaml │ ├── .project-root │ ├── .pylintrc │ ├── LICENSE │ ├── MANIFEST.in │ ├── Makefile │ ├── README.md │ ├── configs │ ├── __init__.py │ ├── callbacks │ │ ├── default.yaml │ │ ├── model_checkpoint.yaml │ │ ├── model_summary.yaml │ │ ├── none.yaml │ │ └── rich_progress_bar.yaml │ ├── data │ │ ├── hi-fi_en-US_female.yaml │ │ ├── ljspeech.yaml │ │ └── vctk.yaml │ ├── debug │ │ ├── default.yaml │ │ ├── fdr.yaml │ │ ├── limit.yaml │ │ ├── overfit.yaml │ │ └── profiler.yaml │ ├── eval.yaml │ ├── experiment │ │ ├── hifi_dataset_piper_phonemizer.yaml │ │ ├── ljspeech.yaml │ │ ├── ljspeech_min_memory.yaml │ │ └── multispeaker.yaml │ ├── extras │ │ └── default.yaml │ ├── hparams_search │ │ └── mnist_optuna.yaml │ ├── hydra │ │ └── default.yaml │ ├── local │ │ └── .gitkeep │ ├── logger │ │ ├── aim.yaml │ │ ├── comet.yaml │ │ ├── csv.yaml │ │ ├── many_loggers.yaml │ │ ├── mlflow.yaml │ │ ├── neptune.yaml │ │ ├── tensorboard.yaml │ │ └── wandb.yaml │ ├── model │ │ ├── cfm │ │ │ └── default.yaml │ │ ├── decoder │ │ │ └── default.yaml │ │ ├── encoder │ │ │ └── default.yaml │ │ ├── matcha.yaml │ │ └── optimizer │ │ │ └── adam.yaml │ ├── paths │ │ └── default.yaml │ ├── train.yaml │ └── trainer │ │ ├── cpu.yaml │ │ ├── ddp.yaml │ │ ├── ddp_sim.yaml │ │ ├── default.yaml │ │ ├── gpu.yaml │ │ └── mps.yaml │ ├── data │ ├── matcha │ ├── VERSION │ ├── __init__.py │ ├── app.py │ ├── cli.py │ ├── data │ │ ├── __init__.py │ │ ├── components │ │ │ └── __init__.py │ 
│ └── text_mel_datamodule.py │ ├── hifigan │ │ ├── LICENSE │ │ ├── README.md │ │ ├── __init__.py │ │ ├── config.py │ │ ├── denoiser.py │ │ ├── env.py │ │ ├── meldataset.py │ │ ├── models.py │ │ └── xutils.py │ ├── models │ │ ├── __init__.py │ │ ├── baselightningmodule.py │ │ ├── components │ │ │ ├── __init__.py │ │ │ ├── decoder.py │ │ │ ├── flow_matching.py │ │ │ ├── text_encoder.py │ │ │ └── transformer.py │ │ └── matcha_tts.py │ ├── onnx │ │ ├── __init__.py │ │ ├── export.py │ │ └── infer.py │ ├── text │ │ ├── __init__.py │ │ ├── cleaners.py │ │ ├── numbers.py │ │ └── symbols.py │ ├── train.py │ └── utils │ │ ├── __init__.py │ │ ├── audio.py │ │ ├── generate_data_statistics.py │ │ ├── instantiators.py │ │ ├── logging_utils.py │ │ ├── model.py │ │ ├── monotonic_align │ │ ├── __init__.py │ │ ├── core.pyx │ │ └── setup.py │ │ ├── pylogger.py │ │ ├── rich_utils.py │ │ └── utils.py │ ├── notebooks │ └── .gitkeep │ ├── pyproject.toml │ ├── requirements.txt │ ├── scripts │ └── schedule.sh │ ├── setup.py │ └── synthesis.ipynb ├── GPT_SoVITS ├── AR │ ├── __init__.py │ ├── data │ │ ├── __init__.py │ │ ├── bucket_sampler.py │ │ ├── data_module.py │ │ └── dataset.py │ ├── models │ │ ├── __init__.py │ │ ├── t2s_lightning_module.py │ │ ├── t2s_lightning_module_onnx.py │ │ ├── t2s_model.py │ │ ├── t2s_model_onnx.py │ │ └── utils.py │ ├── modules │ │ ├── __init__.py │ │ ├── activation.py │ │ ├── activation_onnx.py │ │ ├── embedding.py │ │ ├── embedding_onnx.py │ │ ├── lr_schedulers.py │ │ ├── optim.py │ │ ├── patched_mha_with_cache.py │ │ ├── patched_mha_with_cache_onnx.py │ │ ├── scaling.py │ │ ├── transformer.py │ │ └── transformer_onnx.py │ ├── text_processing │ │ ├── __init__.py │ │ ├── phonemizer.py │ │ └── symbols.py │ └── utils │ │ ├── __init__.py │ │ ├── initialize.py │ │ └── io.py ├── TTS_infer_pack │ ├── TTS.py │ ├── TextPreprocessor.py │ ├── __init__.py │ └── text_segmentation_method.py ├── __init__.py ├── configs │ ├── s1.yaml │ ├── s1big.yaml │ ├── s1big2.yaml │ ├── s1longer-v2.yaml │ ├── s1longer.yaml │ ├── s1mq.yaml │ ├── s2.json │ ├── train.yaml │ └── tts_infer.yaml ├── feature_extractor │ ├── __init__.py │ ├── cnhubert.py │ └── whisper_enc.py ├── module │ ├── __init__.py │ ├── attentions.py │ ├── attentions_onnx.py │ ├── commons.py │ ├── core_vq.py │ ├── data_utils.py │ ├── losses.py │ ├── mel_processing.py │ ├── models.py │ ├── models_onnx.py │ ├── modules.py │ ├── mrte_model.py │ ├── quantize.py │ └── transforms.py ├── text │ ├── __init__.py │ ├── cantonese.py │ ├── chinese.py │ ├── chinese2.py │ ├── cleaner.py │ ├── cmudict-fast.rep │ ├── cmudict.rep │ ├── engdict-hot.rep │ ├── engdict_cache.pickle │ ├── english.py │ ├── g2pw │ │ ├── __init__.py │ │ ├── dataset.py │ │ ├── g2pw.py │ │ ├── onnx_api.py │ │ ├── polyphonic-fix.rep │ │ ├── polyphonic.pickle │ │ ├── polyphonic.rep │ │ └── utils.py │ ├── ja_userdic │ │ └── userdict.csv │ ├── japanese.py │ ├── korean.py │ ├── namedict_cache.pickle │ ├── opencpop-strict.txt │ ├── symbols.py │ ├── symbols2.py │ ├── tone_sandhi.py │ └── zh_normalization │ │ ├── README.md │ │ ├── __init__.py │ │ ├── chronology.py │ │ ├── constants.py │ │ ├── num.py │ │ ├── phonecode.py │ │ ├── quantifier.py │ │ └── text_normlization.py ├── tools │ ├── __init__.py │ ├── asr │ │ ├── __init__.py │ │ ├── config.py │ │ ├── fasterwhisper_asr.py │ │ ├── funasr_asr.py │ │ └── models │ │ │ └── .gitignore │ ├── cmd-denoise.py │ ├── denoise-model │ │ └── .gitignore │ ├── i18n │ │ ├── __init__.py │ │ ├── i18n.py │ │ ├── locale │ │ │ ├── en_US.json │ │ │ ├── es_ES.json │ │ │ 
├── fr_FR.json │ │ │ ├── it_IT.json │ │ │ ├── ja_JP.json │ │ │ ├── ko_KR.json │ │ │ ├── pt_BR.json │ │ │ ├── ru_RU.json │ │ │ ├── tr_TR.json │ │ │ ├── zh_CN.json │ │ │ ├── zh_HK.json │ │ │ ├── zh_SG.json │ │ │ └── zh_TW.json │ │ └── scan_i18n.py │ ├── my_utils.py │ ├── slice_audio.py │ ├── slicer2.py │ ├── subfix_webui.py │ └── uvr5 │ │ ├── __init__.py │ │ ├── bs_roformer │ │ ├── __init__.py │ │ ├── attend.py │ │ └── bs_roformer.py │ │ ├── bsroformer.py │ │ ├── lib │ │ ├── lib_v5 │ │ │ ├── dataset.py │ │ │ ├── layers.py │ │ │ ├── layers_123812KB.py │ │ │ ├── layers_123821KB.py │ │ │ ├── layers_33966KB.py │ │ │ ├── layers_537227KB.py │ │ │ ├── layers_537238KB.py │ │ │ ├── layers_new.py │ │ │ ├── model_param_init.py │ │ │ ├── modelparams │ │ │ │ ├── 1band_sr16000_hl512.json │ │ │ │ ├── 1band_sr32000_hl512.json │ │ │ │ ├── 1band_sr33075_hl384.json │ │ │ │ ├── 1band_sr44100_hl1024.json │ │ │ │ ├── 1band_sr44100_hl256.json │ │ │ │ ├── 1band_sr44100_hl512.json │ │ │ │ ├── 1band_sr44100_hl512_cut.json │ │ │ │ ├── 2band_32000.json │ │ │ │ ├── 2band_44100_lofi.json │ │ │ │ ├── 2band_48000.json │ │ │ │ ├── 3band_44100.json │ │ │ │ ├── 3band_44100_mid.json │ │ │ │ ├── 3band_44100_msb2.json │ │ │ │ ├── 4band_44100.json │ │ │ │ ├── 4band_44100_mid.json │ │ │ │ ├── 4band_44100_msb.json │ │ │ │ ├── 4band_44100_msb2.json │ │ │ │ ├── 4band_44100_reverse.json │ │ │ │ ├── 4band_44100_sw.json │ │ │ │ ├── 4band_v2.json │ │ │ │ ├── 4band_v2_sn.json │ │ │ │ ├── 4band_v3.json │ │ │ │ └── ensemble.json │ │ │ ├── nets.py │ │ │ ├── nets_123812KB.py │ │ │ ├── nets_123821KB.py │ │ │ ├── nets_33966KB.py │ │ │ ├── nets_537227KB.py │ │ │ ├── nets_537238KB.py │ │ │ ├── nets_61968KB.py │ │ │ ├── nets_new.py │ │ │ └── spec_utils.py │ │ ├── name_params.json │ │ └── utils.py │ │ ├── mdxnet.py │ │ ├── uvr5_weights │ │ └── .gitignore │ │ ├── vr.py │ │ └── webui.py └── utils.py ├── __init__.py ├── asr.py ├── glm.py ├── llm.py ├── musetalk ├── __init__.py ├── models │ ├── unet.py │ └── vae.py ├── utils │ ├── __init__.py │ ├── blending.py │ ├── dwpose │ │ ├── __init__.py │ │ ├── default_runtime.py │ │ └── rtmpose-l_8xb32-270e_coco-ubody-wholebody-384x288.py │ ├── face_detection │ │ ├── README.md │ │ ├── __init__.py │ │ ├── api.py │ │ ├── detection │ │ │ ├── __init__.py │ │ │ ├── core.py │ │ │ └── sfd │ │ │ │ ├── __init__.py │ │ │ │ ├── bbox.py │ │ │ │ ├── detect.py │ │ │ │ ├── net_s3fd.py │ │ │ │ └── sfd_detector.py │ │ ├── models.py │ │ └── utils.py │ ├── face_parsing │ │ ├── __init__.py │ │ ├── model.py │ │ └── resnet.py │ ├── preprocessing.py │ └── utils.py └── whisper │ ├── audio2feature.py │ └── whisper │ ├── __init__.py │ ├── __main__.py │ ├── assets │ ├── gpt2 │ │ ├── merges.txt │ │ ├── special_tokens_map.json │ │ ├── tokenizer_config.json │ │ └── vocab.json │ ├── mel_filters.npz │ └── multilingual │ │ ├── added_tokens.json │ │ ├── merges.txt │ │ ├── special_tokens_map.json │ │ ├── tokenizer_config.json │ │ └── vocab.json │ ├── audio.py │ ├── decoding.py │ ├── model.py │ ├── normalizers │ ├── __init__.py │ ├── basic.py │ ├── english.json │ └── english.py │ ├── tokenizer.py │ ├── transcribe.py │ └── utils.py ├── pipeline_llm.py ├── pipeline_mllm.py ├── prompt.txt ├── thg.py ├── tts.py └── utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | */.DS_Store 3 | *.log 4 | .idea/ 5 | .vscode/ 6 | *.pyc 7 | workspaces/ 8 | */__pycache__/ 9 | */.ipynb_checkpoints/ 10 | */.pytest_cache/ 11 | */.mypy_cache/ 12 | */.coverage 13 | __pycache__/ 14 | weights/ZhipuAI/* 
-------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Henry 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /data/audio/warm_up.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Henry-23/VideoChat/3d6700c5f2c4e49220501898f96049579b39345e/data/audio/warm_up.wav -------------------------------------------------------------------------------- /data/audio/女性.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Henry-23/VideoChat/3d6700c5f2c4e49220501898f96049579b39345e/data/audio/女性.wav -------------------------------------------------------------------------------- /data/audio/少女.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Henry-23/VideoChat/3d6700c5f2c4e49220501898f96049579b39345e/data/audio/少女.wav -------------------------------------------------------------------------------- /data/audio/男性.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Henry-23/VideoChat/3d6700c5f2c4e49220501898f96049579b39345e/data/audio/男性.wav -------------------------------------------------------------------------------- /data/audio/青年.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Henry-23/VideoChat/3d6700c5f2c4e49220501898f96049579b39345e/data/audio/青年.wav -------------------------------------------------------------------------------- /data/icon/qwen.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Henry-23/VideoChat/3d6700c5f2c4e49220501898f96049579b39345e/data/icon/qwen.png -------------------------------------------------------------------------------- /data/icon/user.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Henry-23/VideoChat/3d6700c5f2c4e49220501898f96049579b39345e/data/icon/user.png -------------------------------------------------------------------------------- /data/video/Avatar1.mp4: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/Henry-23/VideoChat/3d6700c5f2c4e49220501898f96049579b39345e/data/video/Avatar1.mp4 -------------------------------------------------------------------------------- /data/video/Avatar2.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Henry-23/VideoChat/3d6700c5f2c4e49220501898f96049579b39345e/data/video/Avatar2.mp4 -------------------------------------------------------------------------------- /data/video/Avatar3.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Henry-23/VideoChat/3d6700c5f2c4e49220501898f96049579b39345e/data/video/Avatar3.mp4 -------------------------------------------------------------------------------- /image.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Henry-23/VideoChat/3d6700c5f2c4e49220501898f96049579b39345e/image.png -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | --extra-index-url https://download.pytorch.org/whl/cu121 2 | # https://gradio-builds.s3.amazonaws.com/bed454c3d22cfacedc047eb3b0ba987b485ac3fd/gradio-4.40.0-py3-none-any.whl 3 | gradio==5.4.0 4 | modelscope_studio==0.5.2 5 | # torch==2.1.2 6 | # torchvision==0.16.2 7 | # torchaudio==2.1.2 8 | torch==2.3.0 9 | torchvision==0.18.0 10 | torchaudio==2.3.0 11 | diffusers==0.27.2 12 | accelerate==0.28.0 13 | tensorflow==2.14.0 14 | tensorboard==2.14.0 15 | opencv-python==4.9.0.80 16 | soundfile==0.12.1 17 | gdown==5.2.0 18 | requests==2.32.3 19 | imageio==2.35.1 20 | imageio[ffmpeg] 21 | omegaconf==2.3.0 22 | ffmpeg-python==0.2.0 23 | spaces==0.30.0 24 | moviepy==1.0.3 25 | numpy==1.23.5 26 | scipy==1.13.1 27 | librosa==0.9.2 28 | numba==0.56.4 29 | pytorch-lightning==2.4.0 30 | onnxruntime==1.19.2; sys_platform == 'darwin' 31 | onnxruntime-gpu==1.19.2; sys_platform != 'darwin' 32 | tqdm==4.66.5 33 | funasr==1.1.6 34 | cn2an==0.5.22 35 | pypinyin==0.52.0 36 | pyopenjtalk==0.3.4 37 | g2p-en==2.1.0 38 | sentencepiece==0.2.0 39 | chardet==5.2.0 40 | PyYAML==6.0.2 41 | psutil==5.9.8 42 | jieba_fast==0.53 43 | jieba==0.42.1 44 | LangSegment==0.3.5 45 | Faster_Whisper==1.0.3 46 | wordsegment==1.3.1 47 | rotary-embedding-torch==0.7.0 48 | pyjyutping==1.0.0 49 | g2pk2==0.0.3 50 | ko-pron==1.3 51 | opencc; sys_platform != 'linux' 52 | opencc==1.1.1; sys_platform == 'linux' 53 | python_mecab_ko==1.3.7; sys_platform != 'win32' 54 | openmim==0.3.9 55 | openai==1.43.0 56 | fastapi[all] 57 | nltk 58 | modelscope==1.18.0 59 | pydub 60 | dashscope 61 | edge-tts 62 | # fastapi==0.112.2 63 | 64 | # GLM-4-Voice requirements 65 | transformers==4.44.1 66 | conformer==0.3.2 67 | deepspeed==0.14.2; sys_platform == 'linux' 68 | grpcio==1.57.0 69 | grpcio-tools==1.57.0 70 | huggingface_hub==0.25.2 71 | hydra-core==1.3.2 72 | HyperPyYAML==1.2.2 73 | inflect==7.3.1 74 | lightning==2.2.4 75 | networkx==3.1 76 | openai-whisper==20231117 77 | protobuf==4.25 78 | rich==13.7.1 79 | Requests==2.32.3 80 | safetensors==0.4.5 81 | soundfile==0.12.1 82 | tensorboard==2.14.0 83 | wget==3.2 84 | WeTextProcessing==1.0.3 -------------------------------------------------------------------------------- /src/GLM_4_Voice/__init__.py: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/Henry-23/VideoChat/3d6700c5f2c4e49220501898f96049579b39345e/src/GLM_4_Voice/__init__.py -------------------------------------------------------------------------------- /src/GLM_4_Voice/cosyvoice/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Henry-23/VideoChat/3d6700c5f2c4e49220501898f96049579b39345e/src/GLM_4_Voice/cosyvoice/__init__.py -------------------------------------------------------------------------------- /src/GLM_4_Voice/cosyvoice/cli/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Henry-23/VideoChat/3d6700c5f2c4e49220501898f96049579b39345e/src/GLM_4_Voice/cosyvoice/cli/__init__.py -------------------------------------------------------------------------------- /src/GLM_4_Voice/cosyvoice/dataset/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Henry-23/VideoChat/3d6700c5f2c4e49220501898f96049579b39345e/src/GLM_4_Voice/cosyvoice/dataset/__init__.py -------------------------------------------------------------------------------- /src/GLM_4_Voice/cosyvoice/flow/length_regulator.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Zhihao Du) 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
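# InterpolateRegulator (defined below) stretches or compresses a token-level feature
# sequence to a target number of frames: forward() takes x of shape (B, T, D) and a
# per-item length tensor ylens, nearest-neighbour interpolates along time to ylens.max(),
# refines the result with Conv1d + GroupNorm + Mish blocks, and zeroes padded positions.
# Rough usage sketch (shapes inferred from forward() below):
#   regulator = InterpolateRegulator(channels=80, sampling_ratios=(1,))
#   out, olens = regulator(x, ylens)   # x: (B, T, 80), ylens: (B,) long -> out: (B, max(ylens), 80)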
14 | from typing import Tuple 15 | import torch.nn as nn 16 | from torch.nn import functional as F 17 | from cosyvoice.utils.mask import make_pad_mask 18 | 19 | 20 | class InterpolateRegulator(nn.Module): 21 | def __init__( 22 | self, 23 | channels: int, 24 | sampling_ratios: Tuple, 25 | out_channels: int = None, 26 | groups: int = 1, 27 | ): 28 | super().__init__() 29 | self.sampling_ratios = sampling_ratios 30 | out_channels = out_channels or channels 31 | model = nn.ModuleList([]) 32 | if len(sampling_ratios) > 0: 33 | for _ in sampling_ratios: 34 | module = nn.Conv1d(channels, channels, 3, 1, 1) 35 | norm = nn.GroupNorm(groups, channels) 36 | act = nn.Mish() 37 | model.extend([module, norm, act]) 38 | model.append( 39 | nn.Conv1d(channels, out_channels, 1, 1) 40 | ) 41 | self.model = nn.Sequential(*model) 42 | 43 | def forward(self, x, ylens=None): 44 | # x in (B, T, D) 45 | mask = (~make_pad_mask(ylens)).to(x).unsqueeze(-1) 46 | x = F.interpolate(x.transpose(1, 2).contiguous(), size=ylens.max(), mode='nearest') 47 | out = self.model(x).transpose(1, 2).contiguous() 48 | olens = ylens 49 | return out * mask, olens 50 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/cosyvoice/hifigan/f0_predictor.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Kai Hu) 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
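# ConvRNNF0Predictor (defined below) is, despite its name, purely convolutional: five
# weight-normalised Conv1d(kernel_size=3) + ELU layers over mel features of shape
# (B, in_channels=80, T), followed by a per-frame Linear head and abs(), so the predicted
# F0 contour is non-negative. Rough usage sketch (shapes inferred from forward() below):
#   predictor = ConvRNNF0Predictor()
#   f0 = predictor(torch.randn(2, 80, 100))   # -> (2, 100)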
14 | import torch 15 | import torch.nn as nn 16 | from torch.nn.utils import weight_norm 17 | 18 | 19 | class ConvRNNF0Predictor(nn.Module): 20 | def __init__(self, 21 | num_class: int = 1, 22 | in_channels: int = 80, 23 | cond_channels: int = 512 24 | ): 25 | super().__init__() 26 | 27 | self.num_class = num_class 28 | self.condnet = nn.Sequential( 29 | weight_norm( 30 | nn.Conv1d(in_channels, cond_channels, kernel_size=3, padding=1) 31 | ), 32 | nn.ELU(), 33 | weight_norm( 34 | nn.Conv1d(cond_channels, cond_channels, kernel_size=3, padding=1) 35 | ), 36 | nn.ELU(), 37 | weight_norm( 38 | nn.Conv1d(cond_channels, cond_channels, kernel_size=3, padding=1) 39 | ), 40 | nn.ELU(), 41 | weight_norm( 42 | nn.Conv1d(cond_channels, cond_channels, kernel_size=3, padding=1) 43 | ), 44 | nn.ELU(), 45 | weight_norm( 46 | nn.Conv1d(cond_channels, cond_channels, kernel_size=3, padding=1) 47 | ), 48 | nn.ELU(), 49 | ) 50 | self.classifier = nn.Linear(in_features=cond_channels, out_features=self.num_class) 51 | 52 | def forward(self, x: torch.Tensor) -> torch.Tensor: 53 | x = self.condnet(x) 54 | x = x.transpose(1, 2) 55 | return torch.abs(self.classifier(x).squeeze(-1)) 56 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/cosyvoice/transformer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Henry-23/VideoChat/3d6700c5f2c4e49220501898f96049579b39345e/src/GLM_4_Voice/cosyvoice/transformer/__init__.py -------------------------------------------------------------------------------- /src/GLM_4_Voice/cosyvoice/transformer/activation.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 Johns Hopkins University (Shinji Watanabe) 2 | # 2020 Northwestern Polytechnical University (Pengcheng Guo) 3 | # 2020 Mobvoi Inc (Binbin Zhang) 4 | # 2024 Alibaba Inc (Xiang Lyu) 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | """Swish() activation function for Conformer.""" 18 | 19 | import torch 20 | from torch import nn, sin, pow 21 | from torch.nn import Parameter 22 | 23 | 24 | class Swish(torch.nn.Module): 25 | """Construct an Swish object.""" 26 | 27 | def forward(self, x: torch.Tensor) -> torch.Tensor: 28 | """Return Swish activation function.""" 29 | return x * torch.sigmoid(x) 30 | 31 | 32 | # Implementation adapted from https://github.com/EdwardDixon/snake under the MIT license. 33 | # LICENSE is in incl_licenses directory. 
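# Snake is applied channel-wise to (B, C, T) tensors, so in_features must equal C.
# Rough usage sketch (shapes inferred from forward() below):
#   act = Snake(in_features=256)
#   y = act(torch.randn(4, 256, 128))   # same shape; y = x + (1/alpha) * sin^2(alpha * x)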
34 | class Snake(nn.Module): 35 | ''' 36 | Implementation of a sine-based periodic activation function 37 | Shape: 38 | - Input: (B, C, T) 39 | - Output: (B, C, T), same shape as the input 40 | Parameters: 41 | - alpha - trainable parameter 42 | References: 43 | - This activation function is from this paper by Liu Ziyin, Tilman Hartwig, Masahito Ueda: 44 | https://arxiv.org/abs/2006.08195 45 | Examples: 46 | >>> a1 = Snake(256) 47 | >>> x = torch.randn(256) 48 | >>> x = a1(x) 49 | ''' 50 | def __init__(self, in_features, alpha=1.0, alpha_trainable=True, alpha_logscale=False): 51 | ''' 52 | Initialization. 53 | INPUT: 54 | - in_features: shape of the input 55 | - alpha: trainable parameter 56 | alpha is initialized to 1 by default, higher values = higher-frequency. 57 | alpha will be trained along with the rest of your model. 58 | ''' 59 | super(Snake, self).__init__() 60 | self.in_features = in_features 61 | 62 | # initialize alpha 63 | self.alpha_logscale = alpha_logscale 64 | if self.alpha_logscale: # log scale alphas initialized to zeros 65 | self.alpha = Parameter(torch.zeros(in_features) * alpha) 66 | else: # linear scale alphas initialized to ones 67 | self.alpha = Parameter(torch.ones(in_features) * alpha) 68 | 69 | self.alpha.requires_grad = alpha_trainable 70 | 71 | self.no_div_by_zero = 0.000000001 72 | 73 | def forward(self, x): 74 | ''' 75 | Forward pass of the function. 76 | Applies the function to the input elementwise. 77 | Snake ∶= x + 1/a * sin^2 (xa) 78 | ''' 79 | alpha = self.alpha.unsqueeze(0).unsqueeze(-1) # line up with x to [B, C, T] 80 | if self.alpha_logscale: 81 | alpha = torch.exp(alpha) 82 | x = x + (1.0 / (alpha + self.no_div_by_zero)) * pow(sin(x * alpha), 2) 83 | 84 | return x 85 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/cosyvoice/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Henry-23/VideoChat/3d6700c5f2c4e49220501898f96049579b39345e/src/GLM_4_Voice/cosyvoice/utils/__init__.py -------------------------------------------------------------------------------- /src/GLM_4_Voice/cosyvoice/utils/block_mask_util.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def create_grid_mask(seq_length, trunck_length, fill_triangle): 5 | assert seq_length > 0 6 | 7 | # First build a grid mask, ignoring seen_length for now: 8 | if fill_triangle: 9 | mask = 1 - torch.triu(torch.ones(seq_length, seq_length), diagonal=1) 10 | # the lower triangle and the main diagonal are all 1 11 | else: 12 | mask = torch.zeros(seq_length, seq_length) 13 | 14 | for i in range(seq_length): 15 | trunck_idx = i // trunck_length 16 | trunck_start = trunck_idx * trunck_length 17 | trunck_end = trunck_length + trunck_start 18 | mask[i][trunck_start:trunck_end] = 1 19 | 20 | return mask 21 | 22 | 23 | if __name__ == "__main__": 24 | mask = create_grid_mask(seq_length=8, trunck_length=3, fill_triangle=True).int() 25 | print(mask) 26 | # tensor([[1, 1, 1, 0, 0, 0, 0, 0], 27 | # [1, 1, 1, 0, 0, 0, 0, 0], 28 | # [1, 1, 1, 0, 0, 0, 0, 0], 29 | # [1, 1, 1, 1, 1, 1, 0, 0], 30 | # [1, 1, 1, 1, 1, 1, 0, 0], 31 | # [1, 1, 1, 1, 1, 1, 0, 0], 32 | # [1, 1, 1, 1, 1, 1, 1, 1], 33 | # [1, 1, 1, 1, 1, 1, 1, 1]]) 34 | 35 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/cosyvoice/utils/class_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright [2023-11-28] 2 | # 2024 Alibaba
Inc (authors: Xiang Lyu) 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | import torch 16 | 17 | from cosyvoice.transformer.activation import Swish 18 | from cosyvoice.transformer.subsampling import ( 19 | LinearNoSubsampling, 20 | EmbedinigNoSubsampling, 21 | Conv1dSubsampling2, 22 | Conv2dSubsampling4, 23 | Conv2dSubsampling6, 24 | Conv2dSubsampling8, 25 | ) 26 | from cosyvoice.transformer.embedding import (PositionalEncoding, 27 | RelPositionalEncoding, 28 | WhisperPositionalEncoding, 29 | LearnablePositionalEncoding, 30 | NoPositionalEncoding) 31 | from cosyvoice.transformer.attention import (MultiHeadedAttention, 32 | RelPositionMultiHeadedAttention, 33 | BlockRelPositionMultiHeadedAttention) 34 | from cosyvoice.transformer.embedding import EspnetRelPositionalEncoding 35 | from cosyvoice.transformer.subsampling import LegacyLinearNoSubsampling 36 | 37 | 38 | COSYVOICE_ACTIVATION_CLASSES = { 39 | "hardtanh": torch.nn.Hardtanh, 40 | "tanh": torch.nn.Tanh, 41 | "relu": torch.nn.ReLU, 42 | "selu": torch.nn.SELU, 43 | "swish": getattr(torch.nn, "SiLU", Swish), 44 | "gelu": torch.nn.GELU, 45 | } 46 | 47 | COSYVOICE_SUBSAMPLE_CLASSES = { 48 | "linear": LinearNoSubsampling, 49 | "linear_legacy": LegacyLinearNoSubsampling, 50 | "embed": EmbedinigNoSubsampling, 51 | "conv1d2": Conv1dSubsampling2, 52 | "conv2d": Conv2dSubsampling4, 53 | "conv2d6": Conv2dSubsampling6, 54 | "conv2d8": Conv2dSubsampling8, 55 | 'paraformer_dummy': torch.nn.Identity 56 | } 57 | 58 | COSYVOICE_EMB_CLASSES = { 59 | "embed": PositionalEncoding, 60 | "abs_pos": PositionalEncoding, 61 | "rel_pos": RelPositionalEncoding, 62 | "rel_pos_espnet": EspnetRelPositionalEncoding, 63 | "no_pos": NoPositionalEncoding, 64 | "abs_pos_whisper": WhisperPositionalEncoding, 65 | "embed_learnable_pe": LearnablePositionalEncoding, 66 | } 67 | 68 | COSYVOICE_ATTENTION_CLASSES = { 69 | "selfattn": MultiHeadedAttention, 70 | "rel_selfattn": RelPositionMultiHeadedAttention, 71 | "block_rel_selfattn": BlockRelPositionMultiHeadedAttention, 72 | } 73 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/cosyvoice/utils/file_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang) 2 | # 2024 Alibaba Inc (authors: Xiang Lyu) 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
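# Small I/O helpers used by the CosyVoice pipeline: read_lists/read_json_lists load list
# and JSON manifest files, load_wav reads a wav, downmixes it to mono and (asserting the
# source rate is higher when resampling is needed) resamples it to target_sr, and
# speed_change applies sox tempo/rate effects. Rough usage sketch (the path is a placeholder):
#   speech = load_wav('prompt.wav', target_sr=22050)   # -> (1, T) float tensor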
15 | 16 | import json 17 | import torchaudio 18 | 19 | 20 | def read_lists(list_file): 21 | lists = [] 22 | with open(list_file, 'r', encoding='utf8') as fin: 23 | for line in fin: 24 | lists.append(line.strip()) 25 | return lists 26 | 27 | def read_json_lists(list_file): 28 | lists = read_lists(list_file) 29 | results = {} 30 | for fn in lists: 31 | with open(fn, 'r', encoding='utf8') as fin: 32 | results.update(json.load(fin)) 33 | return results 34 | 35 | def load_wav(wav, target_sr): 36 | speech, sample_rate = torchaudio.load(wav) 37 | speech = speech.mean(dim=0, keepdim=True) 38 | if sample_rate != target_sr: 39 | assert sample_rate > target_sr, 'wav sample rate {} must be greater than {}'.format(sample_rate, target_sr) 40 | speech = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=target_sr)(speech) 41 | return speech 42 | 43 | def speed_change(waveform, sample_rate, speed_factor: str): 44 | effects = [ 45 | ["tempo", speed_factor], # speed_factor 46 | ["rate", f"{sample_rate}"] 47 | ] 48 | augmented_waveform, new_sample_rate = torchaudio.sox_effects.apply_effects_tensor( 49 | waveform, 50 | sample_rate, 51 | effects 52 | ) 53 | return augmented_waveform, new_sample_rate 54 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/requirements.txt: -------------------------------------------------------------------------------- 1 | conformer==0.3.2 2 | deepspeed==0.14.2; sys_platform == 'linux' 3 | diffusers==0.27.2 4 | fastapi==0.115.3 5 | fastapi-cli==0.0.4 6 | gdown==5.1.0 7 | gradio==5.3.0 8 | grpcio==1.57.0 9 | grpcio-tools==1.57.0 10 | huggingface_hub==0.25.2 11 | hydra-core==1.3.2 12 | HyperPyYAML==1.2.2 13 | inflect==7.3.1 14 | librosa==0.10.2 15 | lightning==2.2.4 16 | matplotlib==3.7.5 17 | modelscope==1.15.0 18 | 19 | networkx==3.1 20 | numpy==1.24.4 21 | omegaconf==2.3.0 22 | onnxruntime-gpu==1.16.0; sys_platform == 'linux' 23 | onnxruntime==1.16.0; sys_platform == 'darwin' or sys_platform == 'windows' 24 | openai-whisper==20231117 25 | protobuf==4.25 26 | pydantic==2.7.0 27 | rich==13.7.1 28 | Requests==2.32.3 29 | safetensors==0.4.5 30 | soundfile==0.12.1 31 | tensorboard==2.14.0 32 | transformers==4.44.1 33 | uvicorn==0.32.0 34 | wget==3.2 35 | WeTextProcessing==1.0.3 36 | torch==2.3.0 37 | torchaudio==2.3.0 38 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/resources/architecture.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Henry-23/VideoChat/3d6700c5f2c4e49220501898f96049579b39345e/src/GLM_4_Voice/resources/architecture.jpeg -------------------------------------------------------------------------------- /src/GLM_4_Voice/resources/web_demo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Henry-23/VideoChat/3d6700c5f2c4e49220501898f96049579b39345e/src/GLM_4_Voice/resources/web_demo.png -------------------------------------------------------------------------------- /src/GLM_4_Voice/speech_tokenizer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Henry-23/VideoChat/3d6700c5f2c4e49220501898f96049579b39345e/src/GLM_4_Voice/speech_tokenizer/__init__.py -------------------------------------------------------------------------------- /src/GLM_4_Voice/speech_tokenizer/configuration_whisper.py: 
-------------------------------------------------------------------------------- 1 | from transformers import WhisperConfig 2 | 3 | 4 | class WhisperVQConfig(WhisperConfig): 5 | def __init__(self, 6 | pooling_kernel_size=None, 7 | pooling_type="max", 8 | pooling_position=0, 9 | quantize_vocab_size=None, 10 | quantize_position=16, 11 | quantize_commit_coefficient=0.25, 12 | quantize_loss_scale=1.0, 13 | quantize_ema_decay=None, 14 | quantize_restart_interval=None, 15 | quantize_encoder_only=False, 16 | quantize_causal_encoder=False, 17 | quantize_causal_block_size=None, 18 | skip_language_detection=False, 19 | encoder_causal_attention=False, 20 | encoder_causal_convolution=False, 21 | **kwargs): 22 | self.pooling_kernel_size = pooling_kernel_size 23 | self.pooling_type = pooling_type 24 | self.pooling_position = pooling_position 25 | self.quantize_vocab_size = quantize_vocab_size 26 | self.quantize_position = quantize_position 27 | self.quantize_commit_coefficient = quantize_commit_coefficient 28 | self.quantize_loss_scale = quantize_loss_scale 29 | self.quantize_ema_decay = quantize_ema_decay 30 | self.quantize_restart_interval = quantize_restart_interval 31 | self.quantize_encoder_only = quantize_encoder_only 32 | self.quantize_causal_encoder = quantize_causal_encoder 33 | self.quantize_causal_block_size = quantize_causal_block_size 34 | self.skip_language_detection = skip_language_detection 35 | self.encoder_causal_attention = encoder_causal_attention 36 | self.encoder_causal_convolution = encoder_causal_convolution 37 | super().__init__(**kwargs) 38 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/.env.example: -------------------------------------------------------------------------------- 1 | # example of file for storing private and user specific environment variables, like keys or system paths 2 | # rename it to ".env" (excluded from version control by default) 3 | # .env is loaded by train.py automatically 4 | # hydra allows you to reference variables in .yaml configs with special syntax: ${oc.env:MY_VAR} 5 | 6 | MY_VAR="/home/user/my/system/path" 7 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | ## What does this PR do? 2 | 3 | 9 | 10 | Fixes #\ 11 | 12 | ## Before submitting 13 | 14 | - [ ] Did you make sure **title is self-explanatory** and **the description concisely explains the PR**? 15 | - [ ] Did you make sure your **PR does only one thing**, instead of bundling different changes together? 16 | - [ ] Did you list all the **breaking changes** introduced by this pull request? 17 | - [ ] Did you **test your PR locally** with `pytest` command? 18 | - [ ] Did you **run pre-commit hooks** with `pre-commit run -a` command? 19 | 20 | ## Did you have fun? 
21 | 22 | Make sure you had fun coding 🙃 23 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/.github/codecov.yml: -------------------------------------------------------------------------------- 1 | coverage: 2 | status: 3 | # measures overall project coverage 4 | project: 5 | default: 6 | threshold: 100% # how much decrease in coverage is needed to not consider success 7 | 8 | # measures PR or single commit coverage 9 | patch: 10 | default: 11 | threshold: 100% # how much decrease in coverage is needed to not consider success 12 | 13 | 14 | # project: off 15 | # patch: off 16 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | # To get started with Dependabot version updates, you'll need to specify which 2 | # package ecosystems to update and where the package manifests are located. 3 | # Please see the documentation for all configuration options: 4 | # https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates 5 | 6 | version: 2 7 | updates: 8 | - package-ecosystem: "pip" # See documentation for possible values 9 | directory: "/" # Location of package manifests 10 | target-branch: "dev" 11 | schedule: 12 | interval: "daily" 13 | ignore: 14 | - dependency-name: "pytorch-lightning" 15 | update-types: ["version-update:semver-patch"] 16 | - dependency-name: "torchmetrics" 17 | update-types: ["version-update:semver-patch"] 18 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/.github/release-drafter.yml: -------------------------------------------------------------------------------- 1 | name-template: "v$RESOLVED_VERSION" 2 | tag-template: "v$RESOLVED_VERSION" 3 | 4 | categories: 5 | - title: "🚀 Features" 6 | labels: 7 | - "feature" 8 | - "enhancement" 9 | - title: "🐛 Bug Fixes" 10 | labels: 11 | - "fix" 12 | - "bugfix" 13 | - "bug" 14 | - title: "🧹 Maintenance" 15 | labels: 16 | - "maintenance" 17 | - "dependencies" 18 | - "refactoring" 19 | - "cosmetic" 20 | - "chore" 21 | - title: "📝️ Documentation" 22 | labels: 23 | - "documentation" 24 | - "docs" 25 | 26 | change-template: "- $TITLE @$AUTHOR (#$NUMBER)" 27 | change-title-escapes: '\<*_&' # You can add # and @ to disable mentions 28 | 29 | version-resolver: 30 | major: 31 | labels: 32 | - "major" 33 | minor: 34 | labels: 35 | - "minor" 36 | patch: 37 | labels: 38 | - "patch" 39 | default: patch 40 | 41 | template: | 42 | ## Changes 43 | 44 | $CHANGES 45 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .venv 106 | env/ 107 | venv/ 108 | ENV/ 109 | env.bak/ 110 | venv.bak/ 111 | 112 | # Spyder project settings 113 | .spyderproject 114 | .spyproject 115 | 116 | # Rope project settings 117 | .ropeproject 118 | 119 | # mkdocs documentation 120 | /site 121 | 122 | # mypy 123 | .mypy_cache/ 124 | .dmypy.json 125 | dmypy.json 126 | 127 | # Pyre type checker 128 | .pyre/ 129 | 130 | ### VisualStudioCode 131 | .vscode/* 132 | !.vscode/settings.json 133 | !.vscode/tasks.json 134 | !.vscode/launch.json 135 | !.vscode/extensions.json 136 | *.code-workspace 137 | **/.vscode 138 | 139 | # JetBrains 140 | .idea/ 141 | 142 | # Data & Models 143 | *.h5 144 | *.tar 145 | *.tar.gz 146 | 147 | # Lightning-Hydra-Template 148 | configs/local/default.yaml 149 | /data/ 150 | /logs/ 151 | .env 152 | 153 | # Aim logging 154 | .aim 155 | 156 | # Cython complied files 157 | matcha/utils/monotonic_align/core.c 158 | 159 | # Ignoring hifigan checkpoint 160 | generator_v1 161 | g_02500000 162 | gradio_cached_examples/ 163 | synth_output/ 164 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | default_language_version: 2 | python: python3.10 3 | 4 | repos: 5 | - repo: https://github.com/pre-commit/pre-commit-hooks 6 | rev: v4.5.0 7 | hooks: 8 | # list of supported hooks: https://pre-commit.com/hooks.html 9 | - id: trailing-whitespace 10 | - id: end-of-file-fixer 11 | # - id: check-docstring-first 12 | - id: check-yaml 13 | - id: debug-statements 14 | - id: detect-private-key 15 | - id: check-toml 16 | - id: check-case-conflict 17 | - id: check-added-large-files 18 | 19 | # python code formatting 20 | - repo: https://github.com/psf/black 21 | rev: 23.12.1 22 | hooks: 23 | - id: black 24 | args: [--line-length, "120"] 25 | 26 | # python import sorting 27 | - repo: https://github.com/PyCQA/isort 28 | rev: 5.13.2 29 | hooks: 30 | - id: isort 31 | args: ["--profile", "black", "--filter-files"] 32 | 33 | # python upgrading syntax to newer 
version 34 | - repo: https://github.com/asottile/pyupgrade 35 | rev: v3.15.0 36 | hooks: 37 | - id: pyupgrade 38 | args: [--py38-plus] 39 | 40 | # python check (PEP8), programming errors and code complexity 41 | - repo: https://github.com/PyCQA/flake8 42 | rev: 7.0.0 43 | hooks: 44 | - id: flake8 45 | args: 46 | [ 47 | "--max-line-length", "120", 48 | "--extend-ignore", 49 | "E203,E402,E501,F401,F841,RST2,RST301", 50 | "--exclude", 51 | "logs/*,data/*,matcha/hifigan/*", 52 | ] 53 | additional_dependencies: [flake8-rst-docstrings==0.3.0] 54 | 55 | # pylint 56 | - repo: https://github.com/pycqa/pylint 57 | rev: v3.0.3 58 | hooks: 59 | - id: pylint 60 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/.project-root: -------------------------------------------------------------------------------- 1 | # this file is required for inferring the project root directory 2 | # do not delete 3 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Shivam Mehta 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.md 2 | include LICENSE.txt 3 | include requirements.*.txt 4 | include *.cff 5 | include requirements.txt 6 | include matcha/VERSION 7 | recursive-include matcha *.json 8 | recursive-include matcha *.html 9 | recursive-include matcha *.png 10 | recursive-include matcha *.md 11 | recursive-include matcha *.py 12 | recursive-include matcha *.pyx 13 | recursive-exclude tests * 14 | prune tests* 15 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/Makefile: -------------------------------------------------------------------------------- 1 | 2 | help: ## Show help 3 | @grep -E '^[.a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-30s\033[0m %s\n", $$1, $$2}' 4 | 5 | clean: ## Clean autogenerated files 6 | rm -rf dist 7 | find . -type f -name "*.DS_Store" -ls -delete 8 | find . | grep -E "(__pycache__|\.pyc|\.pyo)" | xargs rm -rf 9 | find . 
| grep -E ".pytest_cache" | xargs rm -rf 10 | find . | grep -E ".ipynb_checkpoints" | xargs rm -rf 11 | rm -f .coverage 12 | 13 | clean-logs: ## Clean logs 14 | rm -rf logs/** 15 | 16 | create-package: ## Create wheel and tar gz 17 | rm -rf dist/ 18 | python setup.py bdist_wheel --plat-name=manylinux1_x86_64 19 | python setup.py sdist 20 | python -m twine upload dist/* --verbose --skip-existing 21 | 22 | format: ## Run pre-commit hooks 23 | pre-commit run -a 24 | 25 | sync: ## Merge changes from main branch to your current branch 26 | git pull 27 | git pull origin main 28 | 29 | test: ## Run not slow tests 30 | pytest -k "not slow" 31 | 32 | test-full: ## Run all tests 33 | pytest 34 | 35 | train-ljspeech: ## Train the model 36 | python matcha/train.py experiment=ljspeech 37 | 38 | train-ljspeech-min: ## Train the model with minimum memory 39 | python matcha/train.py experiment=ljspeech_min_memory 40 | 41 | start_app: ## Start the app 42 | python matcha/app.py 43 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/configs/__init__.py: -------------------------------------------------------------------------------- 1 | # this file is needed here to include configs when building project as a package 2 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/configs/callbacks/default.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - model_checkpoint.yaml 3 | - model_summary.yaml 4 | - rich_progress_bar.yaml 5 | - _self_ 6 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/configs/callbacks/model_checkpoint.yaml: -------------------------------------------------------------------------------- 1 | # https://lightning.ai/docs/pytorch/stable/api/lightning.pytorch.callbacks.ModelCheckpoint.html 2 | 3 | model_checkpoint: 4 | _target_: lightning.pytorch.callbacks.ModelCheckpoint 5 | dirpath: ${paths.output_dir}/checkpoints # directory to save the model file 6 | filename: checkpoint_{epoch:03d} # checkpoint filename 7 | monitor: epoch # name of the logged metric which determines when model is improving 8 | verbose: False # verbosity mode 9 | save_last: true # additionally always save an exact copy of the last checkpoint to a file last.ckpt 10 | save_top_k: 10 # save k best models (determined by above metric) 11 | mode: "max" # "max" means higher metric value is better, can be also "min" 12 | auto_insert_metric_name: True # when True, the checkpoints filenames will contain the metric name 13 | save_weights_only: False # if True, then only the model’s weights will be saved 14 | every_n_train_steps: null # number of training steps between checkpoints 15 | train_time_interval: null # checkpoints are monitored at the specified time interval 16 | every_n_epochs: 100 # number of epochs between checkpoints 17 | save_on_train_epoch_end: null # whether to run checkpointing at the end of the training epoch or the end of validation 18 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/configs/callbacks/model_summary.yaml: -------------------------------------------------------------------------------- 1 | # https://lightning.ai/docs/pytorch/stable/api/lightning.pytorch.callbacks.RichModelSummary.html 2 | 3 | model_summary: 4 | _target_: lightning.pytorch.callbacks.RichModelSummary 
5 | max_depth: 3 # the maximum depth of layer nesting that the summary will include 6 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/configs/callbacks/none.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Henry-23/VideoChat/3d6700c5f2c4e49220501898f96049579b39345e/src/GLM_4_Voice/third_party/Matcha-TTS/configs/callbacks/none.yaml -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/configs/callbacks/rich_progress_bar.yaml: -------------------------------------------------------------------------------- 1 | # https://lightning.ai/docs/pytorch/latest/api/lightning.pytorch.callbacks.RichProgressBar.html 2 | 3 | rich_progress_bar: 4 | _target_: lightning.pytorch.callbacks.RichProgressBar 5 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/configs/data/hi-fi_en-US_female.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - ljspeech 3 | - _self_ 4 | 5 | # Dataset URL: https://ast-astrec.nict.go.jp/en/release/hi-fi-captain/ 6 | _target_: matcha.data.text_mel_datamodule.TextMelDataModule 7 | name: hi-fi_en-US_female 8 | train_filelist_path: data/filelists/hi-fi-captain-en-us-female_train.txt 9 | valid_filelist_path: data/filelists/hi-fi-captain-en-us-female_val.txt 10 | batch_size: 32 11 | cleaners: [english_cleaners_piper] 12 | data_statistics: # Computed for this dataset 13 | mel_mean: -6.38385 14 | mel_std: 2.541796 15 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/configs/data/ljspeech.yaml: -------------------------------------------------------------------------------- 1 | _target_: matcha.data.text_mel_datamodule.TextMelDataModule 2 | name: ljspeech 3 | train_filelist_path: data/filelists/ljs_audio_text_train_filelist.txt 4 | valid_filelist_path: data/filelists/ljs_audio_text_val_filelist.txt 5 | batch_size: 32 6 | num_workers: 20 7 | pin_memory: True 8 | cleaners: [english_cleaners2] 9 | add_blank: True 10 | n_spks: 1 11 | n_fft: 1024 12 | n_feats: 80 13 | sample_rate: 22050 14 | hop_length: 256 15 | win_length: 1024 16 | f_min: 0 17 | f_max: 8000 18 | data_statistics: # Computed for ljspeech dataset 19 | mel_mean: -5.536622 20 | mel_std: 2.116101 21 | seed: ${seed} 22 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/configs/data/vctk.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - ljspeech 3 | - _self_ 4 | 5 | _target_: matcha.data.text_mel_datamodule.TextMelDataModule 6 | name: vctk 7 | train_filelist_path: data/filelists/vctk_audio_sid_text_train_filelist.txt 8 | valid_filelist_path: data/filelists/vctk_audio_sid_text_val_filelist.txt 9 | batch_size: 32 10 | add_blank: True 11 | n_spks: 109 12 | data_statistics: # Computed for vctk dataset 13 | mel_mean: -6.630575 14 | mel_std: 2.482914 15 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/configs/debug/default.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | # default debugging setup, runs 1 full epoch 4 | # other debugging configs can inherit from this 
one 5 | 6 | # overwrite task name so debugging logs are stored in separate folder 7 | task_name: "debug" 8 | 9 | # disable callbacks and loggers during debugging 10 | # callbacks: null 11 | # logger: null 12 | 13 | extras: 14 | ignore_warnings: False 15 | enforce_tags: False 16 | 17 | # sets level of all command line loggers to 'DEBUG' 18 | # https://hydra.cc/docs/tutorials/basic/running_your_app/logging/ 19 | hydra: 20 | job_logging: 21 | root: 22 | level: DEBUG 23 | 24 | # use this to also set hydra loggers to 'DEBUG' 25 | # verbose: True 26 | 27 | trainer: 28 | max_epochs: 1 29 | accelerator: cpu # debuggers don't like gpus 30 | devices: 1 # debuggers don't like multiprocessing 31 | detect_anomaly: true # raise exception if NaN or +/-inf is detected in any tensor 32 | 33 | data: 34 | num_workers: 0 # debuggers don't like multiprocessing 35 | pin_memory: False # disable gpu memory pin 36 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/configs/debug/fdr.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | # runs 1 train, 1 validation and 1 test step 4 | 5 | defaults: 6 | - default 7 | 8 | trainer: 9 | fast_dev_run: true 10 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/configs/debug/limit.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | # uses only 1% of the training data and 5% of validation/test data 4 | 5 | defaults: 6 | - default 7 | 8 | trainer: 9 | max_epochs: 3 10 | limit_train_batches: 0.01 11 | limit_val_batches: 0.05 12 | limit_test_batches: 0.05 13 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/configs/debug/overfit.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | # overfits to 3 batches 4 | 5 | defaults: 6 | - default 7 | 8 | trainer: 9 | max_epochs: 20 10 | overfit_batches: 3 11 | 12 | # model ckpt and early stopping need to be disabled during overfitting 13 | callbacks: null 14 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/configs/debug/profiler.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | # runs with execution time profiling 4 | 5 | defaults: 6 | - default 7 | 8 | trainer: 9 | max_epochs: 1 10 | # profiler: "simple" 11 | profiler: "advanced" 12 | # profiler: "pytorch" 13 | accelerator: gpu 14 | 15 | limit_train_batches: 0.02 16 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/configs/eval.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | defaults: 4 | - _self_ 5 | - data: mnist # choose datamodule with `test_dataloader()` for evaluation 6 | - model: mnist 7 | - logger: null 8 | - trainer: default 9 | - paths: default 10 | - extras: default 11 | - hydra: default 12 | 13 | task_name: "eval" 14 | 15 | tags: ["dev"] 16 | 17 | # passing checkpoint path is necessary for evaluation 18 | ckpt_path: ??? 
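# `???` is Hydra's marker for a required value with no default: the run aborts unless a
# checkpoint is passed as an override, e.g. `ckpt_path=/path/to/checkpoints/last.ckpt`
# (hypothetical path) on the command line of whatever script loads this config.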
19 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/configs/experiment/hifi_dataset_piper_phonemizer.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | # to execute this experiment run: 4 | # python train.py experiment=hifi_dataset_piper_phonemizer 5 | 6 | defaults: 7 | - override /data: hi-fi_en-US_female.yaml 8 | 9 | # all parameters below will be merged with parameters from default configurations set above 10 | # this allows you to overwrite only specified parameters 11 | 12 | tags: ["hi-fi", "single_speaker", "piper_phonemizer", "en_US", "female"] 13 | 14 | run_name: hi-fi_en-US_female_piper_phonemizer 15 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/configs/experiment/ljspeech.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | # to execute this experiment run: 4 | # python train.py experiment=ljspeech 5 | 6 | defaults: 7 | - override /data: ljspeech.yaml 8 | 9 | # all parameters below will be merged with parameters from default configurations set above 10 | # this allows you to overwrite only specified parameters 11 | 12 | tags: ["ljspeech"] 13 | 14 | run_name: ljspeech 15 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/configs/experiment/ljspeech_min_memory.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | # to execute this experiment run: 4 | # python train.py experiment=ljspeech_min_memory 5 | 6 | defaults: 7 | - override /data: ljspeech.yaml 8 | 9 | # all parameters below will be merged with parameters from default configurations set above 10 | # this allows you to overwrite only specified parameters 11 | 12 | tags: ["ljspeech"] 13 | 14 | run_name: ljspeech_min 15 | 16 | 17 | model: 18 | out_size: 172 19 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/configs/experiment/multispeaker.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | # to execute this experiment run: 4 | # python train.py experiment=multispeaker 5 | 6 | defaults: 7 | - override /data: vctk.yaml 8 | 9 | # all parameters below will be merged with parameters from default configurations set above 10 | # this allows you to overwrite only specified parameters 11 | 12 | tags: ["multispeaker"] 13 | 14 | run_name: multispeaker 15 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/configs/extras/default.yaml: -------------------------------------------------------------------------------- 1 | # disable python warnings if they annoy you 2 | ignore_warnings: False 3 | 4 | # ask user for tags if none are provided in the config 5 | enforce_tags: True 6 | 7 | # pretty print config tree at the start of the run using Rich library 8 | print_config: True 9 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/configs/hparams_search/mnist_optuna.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | # example hyperparameter optimization of some experiment with Optuna: 4 | # python train.py
-m hparams_search=mnist_optuna experiment=example 5 | 6 | defaults: 7 | - override /hydra/sweeper: optuna 8 | 9 | # choose metric which will be optimized by Optuna 10 | # make sure this is the correct name of some metric logged in lightning module! 11 | optimized_metric: "val/acc_best" 12 | 13 | # here we define Optuna hyperparameter search 14 | # it optimizes for value returned from function with @hydra.main decorator 15 | # docs: https://hydra.cc/docs/next/plugins/optuna_sweeper 16 | hydra: 17 | mode: "MULTIRUN" # set hydra to multirun by default if this config is attached 18 | 19 | sweeper: 20 | _target_: hydra_plugins.hydra_optuna_sweeper.optuna_sweeper.OptunaSweeper 21 | 22 | # storage URL to persist optimization results 23 | # for example, you can use SQLite if you set 'sqlite:///example.db' 24 | storage: null 25 | 26 | # name of the study to persist optimization results 27 | study_name: null 28 | 29 | # number of parallel workers 30 | n_jobs: 1 31 | 32 | # 'minimize' or 'maximize' the objective 33 | direction: maximize 34 | 35 | # total number of runs that will be executed 36 | n_trials: 20 37 | 38 | # choose Optuna hyperparameter sampler 39 | # you can choose bayesian sampler (tpe), random search (without optimization), grid sampler, and others 40 | # docs: https://optuna.readthedocs.io/en/stable/reference/samplers.html 41 | sampler: 42 | _target_: optuna.samplers.TPESampler 43 | seed: 1234 44 | n_startup_trials: 10 # number of random sampling runs before optimization starts 45 | 46 | # define hyperparameter search space 47 | params: 48 | model.optimizer.lr: interval(0.0001, 0.1) 49 | data.batch_size: choice(32, 64, 128, 256) 50 | model.net.lin1_size: choice(64, 128, 256) 51 | model.net.lin2_size: choice(64, 128, 256) 52 | model.net.lin3_size: choice(32, 64, 128, 256) 53 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/configs/hydra/default.yaml: -------------------------------------------------------------------------------- 1 | # https://hydra.cc/docs/configure_hydra/intro/ 2 | 3 | # enable color logging 4 | defaults: 5 | - override hydra_logging: colorlog 6 | - override job_logging: colorlog 7 | 8 | # output directory, generated dynamically on each run 9 | run: 10 | dir: ${paths.log_dir}/${task_name}/${run_name}/runs/${now:%Y-%m-%d}_${now:%H-%M-%S} 11 | sweep: 12 | dir: ${paths.log_dir}/${task_name}/${run_name}/multiruns/${now:%Y-%m-%d}_${now:%H-%M-%S} 13 | subdir: ${hydra.job.num} 14 | 15 | job_logging: 16 | handlers: 17 | file: 18 | # Incorporates fix from https://github.com/facebookresearch/hydra/pull/2242 19 | filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log 20 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/configs/local/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Henry-23/VideoChat/3d6700c5f2c4e49220501898f96049579b39345e/src/GLM_4_Voice/third_party/Matcha-TTS/configs/local/.gitkeep -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/configs/logger/aim.yaml: -------------------------------------------------------------------------------- 1 | # https://aimstack.io/ 2 | 3 | # example usage in lightning module: 4 | # https://github.com/aimhubio/aim/blob/main/examples/pytorch_lightning_track.py 5 | 6 | # open the Aim UI with the following command (run in the folder 
containing the `.aim` folder): 7 | # `aim up` 8 | 9 | aim: 10 | _target_: aim.pytorch_lightning.AimLogger 11 | repo: ${paths.root_dir} # .aim folder will be created here 12 | # repo: "aim://ip_address:port" # can instead provide IP address pointing to Aim remote tracking server which manages the repo, see https://aimstack.readthedocs.io/en/latest/using/remote_tracking.html# 13 | 14 | # aim allows to group runs under experiment name 15 | experiment: null # any string, set to "default" if not specified 16 | 17 | train_metric_prefix: "train/" 18 | val_metric_prefix: "val/" 19 | test_metric_prefix: "test/" 20 | 21 | # sets the tracking interval in seconds for system usage metrics (CPU, GPU, memory, etc.) 22 | system_tracking_interval: 10 # set to null to disable system metrics tracking 23 | 24 | # enable/disable logging of system params such as installed packages, git info, env vars, etc. 25 | log_system_params: true 26 | 27 | # enable/disable tracking console logs (default value is true) 28 | capture_terminal_logs: false # set to false to avoid infinite console log loop issue https://github.com/aimhubio/aim/issues/2550 29 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/configs/logger/comet.yaml: -------------------------------------------------------------------------------- 1 | # https://www.comet.ml 2 | 3 | comet: 4 | _target_: lightning.pytorch.loggers.comet.CometLogger 5 | api_key: ${oc.env:COMET_API_TOKEN} # api key is loaded from environment variable 6 | save_dir: "${paths.output_dir}" 7 | project_name: "lightning-hydra-template" 8 | rest_api_key: null 9 | # experiment_name: "" 10 | experiment_key: null # set to resume experiment 11 | offline: False 12 | prefix: "" 13 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/configs/logger/csv.yaml: -------------------------------------------------------------------------------- 1 | # csv logger built in lightning 2 | 3 | csv: 4 | _target_: lightning.pytorch.loggers.csv_logs.CSVLogger 5 | save_dir: "${paths.output_dir}" 6 | name: "csv/" 7 | prefix: "" 8 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/configs/logger/many_loggers.yaml: -------------------------------------------------------------------------------- 1 | # train with many loggers at once 2 | 3 | defaults: 4 | # - comet 5 | - csv 6 | # - mlflow 7 | # - neptune 8 | - tensorboard 9 | - wandb 10 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/configs/logger/mlflow.yaml: -------------------------------------------------------------------------------- 1 | # https://mlflow.org 2 | 3 | mlflow: 4 | _target_: lightning.pytorch.loggers.mlflow.MLFlowLogger 5 | # experiment_name: "" 6 | # run_name: "" 7 | tracking_uri: ${paths.log_dir}/mlflow/mlruns # run `mlflow ui` command inside the `logs/mlflow/` dir to open the UI 8 | tags: null 9 | # save_dir: "./mlruns" 10 | prefix: "" 11 | artifact_location: null 12 | # run_id: "" 13 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/configs/logger/neptune.yaml: -------------------------------------------------------------------------------- 1 | # https://neptune.ai 2 | 3 | neptune: 4 | _target_: lightning.pytorch.loggers.neptune.NeptuneLogger 5 | api_key: ${oc.env:NEPTUNE_API_TOKEN} # api 
key is loaded from environment variable 6 | project: username/lightning-hydra-template 7 | # name: "" 8 | log_model_checkpoints: True 9 | prefix: "" 10 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/configs/logger/tensorboard.yaml: -------------------------------------------------------------------------------- 1 | # https://www.tensorflow.org/tensorboard/ 2 | 3 | tensorboard: 4 | _target_: lightning.pytorch.loggers.tensorboard.TensorBoardLogger 5 | save_dir: "${paths.output_dir}/tensorboard/" 6 | name: null 7 | log_graph: False 8 | default_hp_metric: True 9 | prefix: "" 10 | # version: "" 11 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/configs/logger/wandb.yaml: -------------------------------------------------------------------------------- 1 | # https://wandb.ai 2 | 3 | wandb: 4 | _target_: lightning.pytorch.loggers.wandb.WandbLogger 5 | # name: "" # name of the run (normally generated by wandb) 6 | save_dir: "${paths.output_dir}" 7 | offline: False 8 | id: null # pass correct id to resume experiment! 9 | anonymous: null # enable anonymous logging 10 | project: "lightning-hydra-template" 11 | log_model: False # upload lightning ckpts 12 | prefix: "" # a string to put at the beginning of metric keys 13 | # entity: "" # set to name of your wandb team 14 | group: "" 15 | tags: [] 16 | job_type: "" 17 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/configs/model/cfm/default.yaml: -------------------------------------------------------------------------------- 1 | name: CFM 2 | solver: euler 3 | sigma_min: 1e-4 4 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/configs/model/decoder/default.yaml: -------------------------------------------------------------------------------- 1 | channels: [256, 256] 2 | dropout: 0.05 3 | attention_head_dim: 64 4 | n_blocks: 1 5 | num_mid_blocks: 2 6 | num_heads: 2 7 | act_fn: snakebeta 8 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/configs/model/encoder/default.yaml: -------------------------------------------------------------------------------- 1 | encoder_type: RoPE Encoder 2 | encoder_params: 3 | n_feats: ${model.n_feats} 4 | n_channels: 192 5 | filter_channels: 768 6 | filter_channels_dp: 256 7 | n_heads: 2 8 | n_layers: 6 9 | kernel_size: 3 10 | p_dropout: 0.1 11 | spk_emb_dim: 64 12 | n_spks: 1 13 | prenet: true 14 | 15 | duration_predictor_params: 16 | filter_channels_dp: ${model.encoder.encoder_params.filter_channels_dp} 17 | kernel_size: 3 18 | p_dropout: ${model.encoder.encoder_params.p_dropout} 19 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/configs/model/matcha.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - _self_ 3 | - encoder: default.yaml 4 | - decoder: default.yaml 5 | - cfm: default.yaml 6 | - optimizer: adam.yaml 7 | 8 | _target_: matcha.models.matcha_tts.MatchaTTS 9 | n_vocab: 178 10 | n_spks: ${data.n_spks} 11 | spk_emb_dim: 64 12 | n_feats: 80 13 | data_statistics: ${data.data_statistics} 14 | out_size: null # Must be divisible by 4 15 | prior_loss: true 16 | 
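# The ${data.*} interpolations above mean n_spks and data_statistics are filled in from whichever
# data config is active, so they never need to be edited here. Illustrative command-line override:
# python train.py model.out_size=172 (the value configs/experiment/ljspeech_min_memory.yaml sets
# to lower memory usage).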
-------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/configs/model/optimizer/adam.yaml: -------------------------------------------------------------------------------- 1 | _target_: torch.optim.Adam 2 | _partial_: true 3 | lr: 1e-4 4 | weight_decay: 0.0 5 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/configs/paths/default.yaml: -------------------------------------------------------------------------------- 1 | # path to root directory 2 | # this requires PROJECT_ROOT environment variable to exist 3 | # you can replace it with "." if you want the root to be the current working directory 4 | root_dir: ${oc.env:PROJECT_ROOT} 5 | 6 | # path to data directory 7 | data_dir: ${paths.root_dir}/data/ 8 | 9 | # path to logging directory 10 | log_dir: ${paths.root_dir}/logs/ 11 | 12 | # path to output directory, created dynamically by hydra 13 | # path generation pattern is specified in `configs/hydra/default.yaml` 14 | # use it to store all files generated during the run, like ckpts and metrics 15 | output_dir: ${hydra:runtime.output_dir} 16 | 17 | # path to working directory 18 | work_dir: ${hydra:runtime.cwd} 19 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/configs/train.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | # specify here default configuration 4 | # order of defaults determines the order in which configs override each other 5 | defaults: 6 | - _self_ 7 | - data: ljspeech 8 | - model: matcha 9 | - callbacks: default 10 | - logger: tensorboard # set logger here or use command line (e.g. `python train.py logger=tensorboard`) 11 | - trainer: default 12 | - paths: default 13 | - extras: default 14 | - hydra: default 15 | 16 | # experiment configs allow for version control of specific hyperparameters 17 | # e.g. best hyperparameters for given model and datamodule 18 | - experiment: null 19 | 20 | # config for hyperparameter optimization 21 | - hparams_search: null 22 | 23 | # optional local config for machine/user specific settings 24 | # it's optional since it doesn't need to exist and is excluded from version control 25 | - optional local: default 26 | 27 | # debugging config (enable through command line, e.g. `python train.py debug=default) 28 | - debug: null 29 | 30 | # task name, determines output directory path 31 | task_name: "train" 32 | 33 | run_name: ??? 
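# run_name has no default (???), so every launch must supply one, e.g.
# python train.py experiment=ljspeech run_name=ljspeech_baseline (run name shown is illustrative);
# it also becomes part of the Hydra output directory defined in configs/hydra/default.yaml.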
34 | 35 | # tags to help you identify your experiments 36 | # you can overwrite this in experiment configs 37 | # overwrite from command line with `python train.py tags="[first_tag, second_tag]"` 38 | tags: ["dev"] 39 | 40 | # set False to skip model training 41 | train: True 42 | 43 | # evaluate on test set, using best model weights achieved during training 44 | # lightning chooses best weights based on the metric specified in checkpoint callback 45 | test: True 46 | 47 | # simply provide checkpoint path to resume training 48 | ckpt_path: null 49 | 50 | # seed for random number generators in pytorch, numpy and python.random 51 | seed: 1234 52 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/configs/trainer/cpu.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - default 3 | 4 | accelerator: cpu 5 | devices: 1 6 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/configs/trainer/ddp.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - default 3 | 4 | strategy: ddp 5 | 6 | accelerator: gpu 7 | devices: [0,1] 8 | num_nodes: 1 9 | sync_batchnorm: True 10 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/configs/trainer/ddp_sim.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - default 3 | 4 | # simulate DDP on CPU, useful for debugging 5 | accelerator: cpu 6 | devices: 2 7 | strategy: ddp_spawn 8 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/configs/trainer/default.yaml: -------------------------------------------------------------------------------- 1 | _target_: lightning.pytorch.trainer.Trainer 2 | 3 | default_root_dir: ${paths.output_dir} 4 | 5 | max_epochs: -1 6 | 7 | accelerator: gpu 8 | devices: [0] 9 | 10 | # mixed precision for extra speed-up 11 | precision: 16-mixed 12 | 13 | # perform a validation loop every N training epochs 14 | check_val_every_n_epoch: 1 15 | 16 | # set True to to ensure deterministic results 17 | # makes training slower but gives more reproducibility than just setting seeds 18 | deterministic: False 19 | 20 | gradient_clip_val: 5.0 21 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/configs/trainer/gpu.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - default 3 | 4 | accelerator: gpu 5 | devices: 1 6 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/configs/trainer/mps.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - default 3 | 4 | accelerator: mps 5 | devices: 1 6 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/data: -------------------------------------------------------------------------------- 1 | /home/smehta/Projects/Speech-Backbones/Grad-TTS/data -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/matcha/VERSION: -------------------------------------------------------------------------------- 1 | 0.0.5.1 
2 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/matcha/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Henry-23/VideoChat/3d6700c5f2c4e49220501898f96049579b39345e/src/GLM_4_Voice/third_party/Matcha-TTS/matcha/__init__.py -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/matcha/data/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Henry-23/VideoChat/3d6700c5f2c4e49220501898f96049579b39345e/src/GLM_4_Voice/third_party/Matcha-TTS/matcha/data/__init__.py -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/matcha/data/components/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Henry-23/VideoChat/3d6700c5f2c4e49220501898f96049579b39345e/src/GLM_4_Voice/third_party/Matcha-TTS/matcha/data/components/__init__.py -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/matcha/hifigan/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Jungil Kong 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/matcha/hifigan/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Henry-23/VideoChat/3d6700c5f2c4e49220501898f96049579b39345e/src/GLM_4_Voice/third_party/Matcha-TTS/matcha/hifigan/__init__.py -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/matcha/hifigan/config.py: -------------------------------------------------------------------------------- 1 | v1 = { 2 | "resblock": "1", 3 | "num_gpus": 0, 4 | "batch_size": 16, 5 | "learning_rate": 0.0004, 6 | "adam_b1": 0.8, 7 | "adam_b2": 0.99, 8 | "lr_decay": 0.999, 9 | "seed": 1234, 10 | "upsample_rates": [8, 8, 2, 2], 11 | "upsample_kernel_sizes": [16, 16, 4, 4], 12 | "upsample_initial_channel": 512, 13 | "resblock_kernel_sizes": [3, 7, 11], 14 | "resblock_dilation_sizes": [[1, 3, 5], [1, 3, 5], [1, 3, 5]], 15 | "resblock_initial_channel": 256, 16 | "segment_size": 8192, 17 | "num_mels": 80, 18 | "num_freq": 1025, 19 | "n_fft": 1024, 20 | "hop_size": 256, 21 | "win_size": 1024, 22 | "sampling_rate": 22050, 23 | "fmin": 0, 24 | "fmax": 8000, 25 | "fmax_loss": None, 26 | "num_workers": 4, 27 | "dist_config": {"dist_backend": "nccl", "dist_url": "tcp://localhost:54321", "world_size": 1}, 28 | } 29 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/matcha/hifigan/denoiser.py: -------------------------------------------------------------------------------- 1 | # Code modified from Rafael Valle's implementation https://github.com/NVIDIA/waveglow/blob/5bc2a53e20b3b533362f974cfa1ea0267ae1c2b1/denoiser.py 2 | 3 | """Waveglow style denoiser can be used to remove the artifacts from the HiFiGAN generated audio.""" 4 | import torch 5 | 6 | 7 | class Denoiser(torch.nn.Module): 8 | """Removes model bias from audio produced with waveglow""" 9 | 10 | def __init__(self, vocoder, filter_length=1024, n_overlap=4, win_length=1024, mode="zeros"): 11 | super().__init__() 12 | self.filter_length = filter_length 13 | self.hop_length = int(filter_length / n_overlap) 14 | self.win_length = win_length 15 | 16 | dtype, device = next(vocoder.parameters()).dtype, next(vocoder.parameters()).device 17 | self.device = device 18 | if mode == "zeros": 19 | mel_input = torch.zeros((1, 80, 88), dtype=dtype, device=device) 20 | elif mode == "normal": 21 | mel_input = torch.randn((1, 80, 88), dtype=dtype, device=device) 22 | else: 23 | raise Exception(f"Mode {mode} if not supported") 24 | 25 | def stft_fn(audio, n_fft, hop_length, win_length, window): 26 | spec = torch.stft( 27 | audio, 28 | n_fft=n_fft, 29 | hop_length=hop_length, 30 | win_length=win_length, 31 | window=window, 32 | return_complex=True, 33 | ) 34 | spec = torch.view_as_real(spec) 35 | return torch.sqrt(spec.pow(2).sum(-1)), torch.atan2(spec[..., -1], spec[..., 0]) 36 | 37 | self.stft = lambda x: stft_fn( 38 | audio=x, 39 | n_fft=self.filter_length, 40 | hop_length=self.hop_length, 41 | win_length=self.win_length, 42 | window=torch.hann_window(self.win_length, device=device), 43 | ) 44 | self.istft = lambda x, y: torch.istft( 45 | torch.complex(x * torch.cos(y), x * torch.sin(y)), 46 | n_fft=self.filter_length, 47 | hop_length=self.hop_length, 48 | win_length=self.win_length, 49 | window=torch.hann_window(self.win_length, device=device), 50 | ) 51 | 52 | with torch.no_grad(): 
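            # feeding the silent/noise mel through the vocoder below measures the bias the model adds to every output;
            # the first frame of its STFT magnitude is stored as bias_spec and subtracted (scaled by `strength`) in forward()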
53 | bias_audio = vocoder(mel_input).float().squeeze(0) 54 | bias_spec, _ = self.stft(bias_audio) 55 | 56 | self.register_buffer("bias_spec", bias_spec[:, :, 0][:, :, None]) 57 | 58 | @torch.inference_mode() 59 | def forward(self, audio, strength=0.0005): 60 | audio_spec, audio_angles = self.stft(audio) 61 | audio_spec_denoised = audio_spec - self.bias_spec.to(audio.device) * strength 62 | audio_spec_denoised = torch.clamp(audio_spec_denoised, 0.0) 63 | audio_denoised = self.istft(audio_spec_denoised, audio_angles) 64 | return audio_denoised 65 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/matcha/hifigan/env.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/jik876/hifi-gan """ 2 | 3 | import os 4 | import shutil 5 | 6 | 7 | class AttrDict(dict): 8 | def __init__(self, *args, **kwargs): 9 | super().__init__(*args, **kwargs) 10 | self.__dict__ = self 11 | 12 | 13 | def build_env(config, config_name, path): 14 | t_path = os.path.join(path, config_name) 15 | if config != t_path: 16 | os.makedirs(path, exist_ok=True) 17 | shutil.copyfile(config, os.path.join(path, config_name)) 18 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/matcha/hifigan/xutils.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/jik876/hifi-gan """ 2 | 3 | import glob 4 | import os 5 | 6 | import matplotlib 7 | import torch 8 | from torch.nn.utils import weight_norm 9 | 10 | matplotlib.use("Agg") 11 | import matplotlib.pylab as plt 12 | 13 | 14 | def plot_spectrogram(spectrogram): 15 | fig, ax = plt.subplots(figsize=(10, 2)) 16 | im = ax.imshow(spectrogram, aspect="auto", origin="lower", interpolation="none") 17 | plt.colorbar(im, ax=ax) 18 | 19 | fig.canvas.draw() 20 | plt.close() 21 | 22 | return fig 23 | 24 | 25 | def init_weights(m, mean=0.0, std=0.01): 26 | classname = m.__class__.__name__ 27 | if classname.find("Conv") != -1: 28 | m.weight.data.normal_(mean, std) 29 | 30 | 31 | def apply_weight_norm(m): 32 | classname = m.__class__.__name__ 33 | if classname.find("Conv") != -1: 34 | weight_norm(m) 35 | 36 | 37 | def get_padding(kernel_size, dilation=1): 38 | return int((kernel_size * dilation - dilation) / 2) 39 | 40 | 41 | def load_checkpoint(filepath, device): 42 | assert os.path.isfile(filepath) 43 | print(f"Loading '{filepath}'") 44 | checkpoint_dict = torch.load(filepath, map_location=device) 45 | print("Complete.") 46 | return checkpoint_dict 47 | 48 | 49 | def save_checkpoint(filepath, obj): 50 | print(f"Saving checkpoint to {filepath}") 51 | torch.save(obj, filepath) 52 | print("Complete.") 53 | 54 | 55 | def scan_checkpoint(cp_dir, prefix): 56 | pattern = os.path.join(cp_dir, prefix + "????????") 57 | cp_list = glob.glob(pattern) 58 | if len(cp_list) == 0: 59 | return None 60 | return sorted(cp_list)[-1] 61 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/matcha/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Henry-23/VideoChat/3d6700c5f2c4e49220501898f96049579b39345e/src/GLM_4_Voice/third_party/Matcha-TTS/matcha/models/__init__.py -------------------------------------------------------------------------------- 
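A minimal usage sketch for the Denoiser defined in matcha/hifigan/denoiser.py above, assuming a loaded HiFi-GAN generator `vocoder` that returns a [1, 1, T] waveform and an 80-bin mel tensor `mel` (both names are illustrative, not part of the repository):

import torch
from matcha.hifigan.denoiser import Denoiser

denoiser = Denoiser(vocoder, filter_length=1024, n_overlap=4, win_length=1024, mode="zeros")
with torch.no_grad():
    audio = vocoder(mel).clamp(-1.0, 1.0)                 # raw waveform, still carries the vocoder bias
    audio = denoiser(audio.squeeze(1), strength=0.00025)  # subtract the scaled bias magnitude spectrum

strength scales how much of the stored bias spectrum is removed; the class default is 0.0005.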
/src/GLM_4_Voice/third_party/Matcha-TTS/matcha/models/components/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Henry-23/VideoChat/3d6700c5f2c4e49220501898f96049579b39345e/src/GLM_4_Voice/third_party/Matcha-TTS/matcha/models/components/__init__.py -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/matcha/onnx/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Henry-23/VideoChat/3d6700c5f2c4e49220501898f96049579b39345e/src/GLM_4_Voice/third_party/Matcha-TTS/matcha/onnx/__init__.py -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/matcha/text/__init__.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/keithito/tacotron """ 2 | from matcha.text import cleaners 3 | from matcha.text.symbols import symbols 4 | 5 | # Mappings from symbol to numeric ID and vice versa: 6 | _symbol_to_id = {s: i for i, s in enumerate(symbols)} 7 | _id_to_symbol = {i: s for i, s in enumerate(symbols)} # pylint: disable=unnecessary-comprehension 8 | 9 | 10 | def text_to_sequence(text, cleaner_names): 11 | """Converts a string of text to a sequence of IDs corresponding to the symbols in the text. 12 | Args: 13 | text: string to convert to a sequence 14 | cleaner_names: names of the cleaner functions to run the text through 15 | Returns: 16 | List of integers corresponding to the symbols in the text 17 | """ 18 | sequence = [] 19 | 20 | clean_text = _clean_text(text, cleaner_names) 21 | for symbol in clean_text: 22 | symbol_id = _symbol_to_id[symbol] 23 | sequence += [symbol_id] 24 | return sequence 25 | 26 | 27 | def cleaned_text_to_sequence(cleaned_text): 28 | """Converts a string of text to a sequence of IDs corresponding to the symbols in the text. 
29 | Args: 30 | text: string to convert to a sequence 31 | Returns: 32 | List of integers corresponding to the symbols in the text 33 | """ 34 | sequence = [_symbol_to_id[symbol] for symbol in cleaned_text] 35 | return sequence 36 | 37 | 38 | def sequence_to_text(sequence): 39 | """Converts a sequence of IDs back to a string""" 40 | result = "" 41 | for symbol_id in sequence: 42 | s = _id_to_symbol[symbol_id] 43 | result += s 44 | return result 45 | 46 | 47 | def _clean_text(text, cleaner_names): 48 | for name in cleaner_names: 49 | cleaner = getattr(cleaners, name) 50 | if not cleaner: 51 | raise Exception("Unknown cleaner: %s" % name) 52 | text = cleaner(text) 53 | return text 54 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/matcha/text/numbers.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/keithito/tacotron """ 2 | 3 | import re 4 | 5 | import inflect 6 | 7 | _inflect = inflect.engine() 8 | _comma_number_re = re.compile(r"([0-9][0-9\,]+[0-9])") 9 | _decimal_number_re = re.compile(r"([0-9]+\.[0-9]+)") 10 | _pounds_re = re.compile(r"£([0-9\,]*[0-9]+)") 11 | _dollars_re = re.compile(r"\$([0-9\.\,]*[0-9]+)") 12 | _ordinal_re = re.compile(r"[0-9]+(st|nd|rd|th)") 13 | _number_re = re.compile(r"[0-9]+") 14 | 15 | 16 | def _remove_commas(m): 17 | return m.group(1).replace(",", "") 18 | 19 | 20 | def _expand_decimal_point(m): 21 | return m.group(1).replace(".", " point ") 22 | 23 | 24 | def _expand_dollars(m): 25 | match = m.group(1) 26 | parts = match.split(".") 27 | if len(parts) > 2: 28 | return match + " dollars" 29 | dollars = int(parts[0]) if parts[0] else 0 30 | cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0 31 | if dollars and cents: 32 | dollar_unit = "dollar" if dollars == 1 else "dollars" 33 | cent_unit = "cent" if cents == 1 else "cents" 34 | return f"{dollars} {dollar_unit}, {cents} {cent_unit}" 35 | elif dollars: 36 | dollar_unit = "dollar" if dollars == 1 else "dollars" 37 | return f"{dollars} {dollar_unit}" 38 | elif cents: 39 | cent_unit = "cent" if cents == 1 else "cents" 40 | return f"{cents} {cent_unit}" 41 | else: 42 | return "zero dollars" 43 | 44 | 45 | def _expand_ordinal(m): 46 | return _inflect.number_to_words(m.group(0)) 47 | 48 | 49 | def _expand_number(m): 50 | num = int(m.group(0)) 51 | if num > 1000 and num < 3000: 52 | if num == 2000: 53 | return "two thousand" 54 | elif num > 2000 and num < 2010: 55 | return "two thousand " + _inflect.number_to_words(num % 100) 56 | elif num % 100 == 0: 57 | return _inflect.number_to_words(num // 100) + " hundred" 58 | else: 59 | return _inflect.number_to_words(num, andword="", zero="oh", group=2).replace(", ", " ") 60 | else: 61 | return _inflect.number_to_words(num, andword="") 62 | 63 | 64 | def normalize_numbers(text): 65 | text = re.sub(_comma_number_re, _remove_commas, text) 66 | text = re.sub(_pounds_re, r"\1 pounds", text) 67 | text = re.sub(_dollars_re, _expand_dollars, text) 68 | text = re.sub(_decimal_number_re, _expand_decimal_point, text) 69 | text = re.sub(_ordinal_re, _expand_ordinal, text) 70 | text = re.sub(_number_re, _expand_number, text) 71 | return text 72 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/matcha/text/symbols.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/keithito/tacotron 2 | 3 | Defines the set 
of symbols used in text input to the model. 4 | """ 5 | _pad = "_" 6 | _punctuation = ';:,.!?¡¿—…"«»“” ' 7 | _letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz" 8 | _letters_ipa = ( 9 | "ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ" 10 | ) 11 | 12 | 13 | # Export all symbols: 14 | symbols = [_pad] + list(_punctuation) + list(_letters) + list(_letters_ipa) 15 | 16 | # Special symbol ids 17 | SPACE_ID = symbols.index(" ") 18 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/matcha/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from matcha.utils.instantiators import instantiate_callbacks, instantiate_loggers 2 | from matcha.utils.logging_utils import log_hyperparameters 3 | from matcha.utils.pylogger import get_pylogger 4 | from matcha.utils.rich_utils import enforce_tags, print_config_tree 5 | from matcha.utils.utils import extras, get_metric_value, task_wrapper 6 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/matcha/utils/audio.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.utils.data 4 | from librosa.filters import mel as librosa_mel_fn 5 | from scipy.io.wavfile import read 6 | 7 | MAX_WAV_VALUE = 32768.0 8 | 9 | 10 | def load_wav(full_path): 11 | sampling_rate, data = read(full_path) 12 | return data, sampling_rate 13 | 14 | 15 | def dynamic_range_compression(x, C=1, clip_val=1e-5): 16 | return np.log(np.clip(x, a_min=clip_val, a_max=None) * C) 17 | 18 | 19 | def dynamic_range_decompression(x, C=1): 20 | return np.exp(x) / C 21 | 22 | 23 | def dynamic_range_compression_torch(x, C=1, clip_val=1e-5): 24 | return torch.log(torch.clamp(x, min=clip_val) * C) 25 | 26 | 27 | def dynamic_range_decompression_torch(x, C=1): 28 | return torch.exp(x) / C 29 | 30 | 31 | def spectral_normalize_torch(magnitudes): 32 | output = dynamic_range_compression_torch(magnitudes) 33 | return output 34 | 35 | 36 | def spectral_de_normalize_torch(magnitudes): 37 | output = dynamic_range_decompression_torch(magnitudes) 38 | return output 39 | 40 | 41 | mel_basis = {} 42 | hann_window = {} 43 | 44 | 45 | def mel_spectrogram(y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False): 46 | if torch.min(y) < -1.0: 47 | print("min value is ", torch.min(y)) 48 | if torch.max(y) > 1.0: 49 | print("max value is ", torch.max(y)) 50 | 51 | global mel_basis, hann_window # pylint: disable=global-statement 52 | if f"{str(fmax)}_{str(y.device)}" not in mel_basis: 53 | mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax) 54 | mel_basis[str(fmax) + "_" + str(y.device)] = torch.from_numpy(mel).float().to(y.device) 55 | hann_window[str(y.device)] = torch.hann_window(win_size).to(y.device) 56 | 57 | y = torch.nn.functional.pad( 58 | y.unsqueeze(1), (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)), mode="reflect" 59 | ) 60 | y = y.squeeze(1) 61 | 62 | spec = torch.view_as_real( 63 | torch.stft( 64 | y, 65 | n_fft, 66 | hop_length=hop_size, 67 | win_length=win_size, 68 | window=hann_window[str(y.device)], 69 | center=center, 70 | pad_mode="reflect", 71 | normalized=False, 72 | onesided=True, 73 | return_complex=True, 74 | ) 75 | ) 76 | 77 | spec = torch.sqrt(spec.pow(2).sum(-1) + (1e-9)) 78 | 79 
| spec = torch.matmul(mel_basis[str(fmax) + "_" + str(y.device)], spec) 80 | spec = spectral_normalize_torch(spec) 81 | 82 | return spec 83 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/matcha/utils/instantiators.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | import hydra 4 | from lightning import Callback 5 | from lightning.pytorch.loggers import Logger 6 | from omegaconf import DictConfig 7 | 8 | from matcha.utils import pylogger 9 | 10 | log = pylogger.get_pylogger(__name__) 11 | 12 | 13 | def instantiate_callbacks(callbacks_cfg: DictConfig) -> List[Callback]: 14 | """Instantiates callbacks from config. 15 | 16 | :param callbacks_cfg: A DictConfig object containing callback configurations. 17 | :return: A list of instantiated callbacks. 18 | """ 19 | callbacks: List[Callback] = [] 20 | 21 | if not callbacks_cfg: 22 | log.warning("No callback configs found! Skipping..") 23 | return callbacks 24 | 25 | if not isinstance(callbacks_cfg, DictConfig): 26 | raise TypeError("Callbacks config must be a DictConfig!") 27 | 28 | for _, cb_conf in callbacks_cfg.items(): 29 | if isinstance(cb_conf, DictConfig) and "_target_" in cb_conf: 30 | log.info(f"Instantiating callback <{cb_conf._target_}>") # pylint: disable=protected-access 31 | callbacks.append(hydra.utils.instantiate(cb_conf)) 32 | 33 | return callbacks 34 | 35 | 36 | def instantiate_loggers(logger_cfg: DictConfig) -> List[Logger]: 37 | """Instantiates loggers from config. 38 | 39 | :param logger_cfg: A DictConfig object containing logger configurations. 40 | :return: A list of instantiated loggers. 41 | """ 42 | logger: List[Logger] = [] 43 | 44 | if not logger_cfg: 45 | log.warning("No logger configs found! Skipping...") 46 | return logger 47 | 48 | if not isinstance(logger_cfg, DictConfig): 49 | raise TypeError("Logger config must be a DictConfig!") 50 | 51 | for _, lg_conf in logger_cfg.items(): 52 | if isinstance(lg_conf, DictConfig) and "_target_" in lg_conf: 53 | log.info(f"Instantiating logger <{lg_conf._target_}>") # pylint: disable=protected-access 54 | logger.append(hydra.utils.instantiate(lg_conf)) 55 | 56 | return logger 57 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/matcha/utils/logging_utils.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Dict 2 | 3 | from lightning.pytorch.utilities import rank_zero_only 4 | from omegaconf import OmegaConf 5 | 6 | from matcha.utils import pylogger 7 | 8 | log = pylogger.get_pylogger(__name__) 9 | 10 | 11 | @rank_zero_only 12 | def log_hyperparameters(object_dict: Dict[str, Any]) -> None: 13 | """Controls which config parts are saved by Lightning loggers. 14 | 15 | Additionally saves: 16 | - Number of model parameters 17 | 18 | :param object_dict: A dictionary containing the following objects: 19 | - `"cfg"`: A DictConfig object containing the main config. 20 | - `"model"`: The Lightning model. 21 | - `"trainer"`: The Lightning trainer. 22 | """ 23 | hparams = {} 24 | 25 | cfg = OmegaConf.to_container(object_dict["cfg"]) 26 | model = object_dict["model"] 27 | trainer = object_dict["trainer"] 28 | 29 | if not trainer.logger: 30 | log.warning("Logger not found! 
Skipping hyperparameter logging...") 31 | return 32 | 33 | hparams["model"] = cfg["model"] 34 | 35 | # save number of model parameters 36 | hparams["model/params/total"] = sum(p.numel() for p in model.parameters()) 37 | hparams["model/params/trainable"] = sum(p.numel() for p in model.parameters() if p.requires_grad) 38 | hparams["model/params/non_trainable"] = sum(p.numel() for p in model.parameters() if not p.requires_grad) 39 | 40 | hparams["data"] = cfg["data"] 41 | hparams["trainer"] = cfg["trainer"] 42 | 43 | hparams["callbacks"] = cfg.get("callbacks") 44 | hparams["extras"] = cfg.get("extras") 45 | 46 | hparams["task_name"] = cfg.get("task_name") 47 | hparams["tags"] = cfg.get("tags") 48 | hparams["ckpt_path"] = cfg.get("ckpt_path") 49 | hparams["seed"] = cfg.get("seed") 50 | 51 | # send hparams to all loggers 52 | for logger in trainer.loggers: 53 | logger.log_hyperparams(hparams) 54 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/matcha/utils/model.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/jaywalnut310/glow-tts """ 2 | 3 | import numpy as np 4 | import torch 5 | 6 | 7 | def sequence_mask(length, max_length=None): 8 | if max_length is None: 9 | max_length = length.max() 10 | x = torch.arange(max_length, dtype=length.dtype, device=length.device) 11 | return x.unsqueeze(0) < length.unsqueeze(1) 12 | 13 | 14 | def fix_len_compatibility(length, num_downsamplings_in_unet=2): 15 | factor = torch.scalar_tensor(2).pow(num_downsamplings_in_unet) 16 | length = (length / factor).ceil() * factor 17 | if not torch.onnx.is_in_onnx_export(): 18 | return length.int().item() 19 | else: 20 | return length 21 | 22 | 23 | def convert_pad_shape(pad_shape): 24 | inverted_shape = pad_shape[::-1] 25 | pad_shape = [item for sublist in inverted_shape for item in sublist] 26 | return pad_shape 27 | 28 | 29 | def generate_path(duration, mask): 30 | device = duration.device 31 | 32 | b, t_x, t_y = mask.shape 33 | cum_duration = torch.cumsum(duration, 1) 34 | path = torch.zeros(b, t_x, t_y, dtype=mask.dtype).to(device=device) 35 | 36 | cum_duration_flat = cum_duration.view(b * t_x) 37 | path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype) 38 | path = path.view(b, t_x, t_y) 39 | path = path - torch.nn.functional.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1] 40 | path = path * mask 41 | return path 42 | 43 | 44 | def duration_loss(logw, logw_, lengths): 45 | loss = torch.sum((logw - logw_) ** 2) / torch.sum(lengths) 46 | return loss 47 | 48 | 49 | def normalize(data, mu, std): 50 | if not isinstance(mu, (float, int)): 51 | if isinstance(mu, list): 52 | mu = torch.tensor(mu, dtype=data.dtype, device=data.device) 53 | elif isinstance(mu, torch.Tensor): 54 | mu = mu.to(data.device) 55 | elif isinstance(mu, np.ndarray): 56 | mu = torch.from_numpy(mu).to(data.device) 57 | mu = mu.unsqueeze(-1) 58 | 59 | if not isinstance(std, (float, int)): 60 | if isinstance(std, list): 61 | std = torch.tensor(std, dtype=data.dtype, device=data.device) 62 | elif isinstance(std, torch.Tensor): 63 | std = std.to(data.device) 64 | elif isinstance(std, np.ndarray): 65 | std = torch.from_numpy(std).to(data.device) 66 | std = std.unsqueeze(-1) 67 | 68 | return (data - mu) / std 69 | 70 | 71 | def denormalize(data, mu, std): 72 | if not isinstance(mu, float): 73 | if isinstance(mu, list): 74 | mu = torch.tensor(mu, dtype=data.dtype, device=data.device) 75 | elif 
isinstance(mu, torch.Tensor): 76 | mu = mu.to(data.device) 77 | elif isinstance(mu, np.ndarray): 78 | mu = torch.from_numpy(mu).to(data.device) 79 | mu = mu.unsqueeze(-1) 80 | 81 | if not isinstance(std, float): 82 | if isinstance(std, list): 83 | std = torch.tensor(std, dtype=data.dtype, device=data.device) 84 | elif isinstance(std, torch.Tensor): 85 | std = std.to(data.device) 86 | elif isinstance(std, np.ndarray): 87 | std = torch.from_numpy(std).to(data.device) 88 | std = std.unsqueeze(-1) 89 | 90 | return data * std + mu 91 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/matcha/utils/monotonic_align/__init__.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | 4 | from matcha.utils.monotonic_align.core import maximum_path_c 5 | 6 | 7 | def maximum_path(value, mask): 8 | """Cython optimised version. 9 | value: [b, t_x, t_y] 10 | mask: [b, t_x, t_y] 11 | """ 12 | value = value * mask 13 | device = value.device 14 | dtype = value.dtype 15 | value = value.data.cpu().numpy().astype(np.float32) 16 | path = np.zeros_like(value).astype(np.int32) 17 | mask = mask.data.cpu().numpy() 18 | 19 | t_x_max = mask.sum(1)[:, 0].astype(np.int32) 20 | t_y_max = mask.sum(2)[:, 0].astype(np.int32) 21 | maximum_path_c(path, value, t_x_max, t_y_max) 22 | return torch.from_numpy(path).to(device=device, dtype=dtype) 23 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/matcha/utils/monotonic_align/core.pyx: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | cimport cython 4 | cimport numpy as np 5 | 6 | from cython.parallel import prange 7 | 8 | 9 | @cython.boundscheck(False) 10 | @cython.wraparound(False) 11 | cdef void maximum_path_each(int[:,::1] path, float[:,::1] value, int t_x, int t_y, float max_neg_val) nogil: 12 | cdef int x 13 | cdef int y 14 | cdef float v_prev 15 | cdef float v_cur 16 | cdef float tmp 17 | cdef int index = t_x - 1 18 | 19 | for y in range(t_y): 20 | for x in range(max(0, t_x + y - t_y), min(t_x, y + 1)): 21 | if x == y: 22 | v_cur = max_neg_val 23 | else: 24 | v_cur = value[x, y-1] 25 | if x == 0: 26 | if y == 0: 27 | v_prev = 0. 
28 | else: 29 | v_prev = max_neg_val 30 | else: 31 | v_prev = value[x-1, y-1] 32 | value[x, y] = max(v_cur, v_prev) + value[x, y] 33 | 34 | for y in range(t_y - 1, -1, -1): 35 | path[index, y] = 1 36 | if index != 0 and (index == y or value[index, y-1] < value[index-1, y-1]): 37 | index = index - 1 38 | 39 | 40 | @cython.boundscheck(False) 41 | @cython.wraparound(False) 42 | cpdef void maximum_path_c(int[:,:,::1] paths, float[:,:,::1] values, int[::1] t_xs, int[::1] t_ys, float max_neg_val=-1e9) nogil: 43 | cdef int b = values.shape[0] 44 | 45 | cdef int i 46 | for i in prange(b, nogil=True): 47 | maximum_path_each(paths[i], values[i], t_xs[i], t_ys[i], max_neg_val) 48 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/matcha/utils/monotonic_align/setup.py: -------------------------------------------------------------------------------- 1 | # from distutils.core import setup 2 | # from Cython.Build import cythonize 3 | # import numpy 4 | 5 | # setup(name='monotonic_align', 6 | # ext_modules=cythonize("core.pyx"), 7 | # include_dirs=[numpy.get_include()]) 8 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/matcha/utils/pylogger.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from lightning.pytorch.utilities import rank_zero_only 4 | 5 | 6 | def get_pylogger(name: str = __name__) -> logging.Logger: 7 | """Initializes a multi-GPU-friendly python command line logger. 8 | 9 | :param name: The name of the logger, defaults to ``__name__``. 10 | 11 | :return: A logger object. 12 | """ 13 | logger = logging.getLogger(name) 14 | 15 | # this ensures all logging levels get marked with the rank zero decorator 16 | # otherwise logs would get multiplied for each GPU process in multi-GPU setup 17 | logging_levels = ("debug", "info", "warning", "error", "exception", "fatal", "critical") 18 | for level in logging_levels: 19 | setattr(logger, level, rank_zero_only(getattr(logger, level))) 20 | 21 | return logger 22 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/matcha/utils/rich_utils.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | from typing import Sequence 3 | 4 | import rich 5 | import rich.syntax 6 | import rich.tree 7 | from hydra.core.hydra_config import HydraConfig 8 | from lightning.pytorch.utilities import rank_zero_only 9 | from omegaconf import DictConfig, OmegaConf, open_dict 10 | from rich.prompt import Prompt 11 | 12 | from matcha.utils import pylogger 13 | 14 | log = pylogger.get_pylogger(__name__) 15 | 16 | 17 | @rank_zero_only 18 | def print_config_tree( 19 | cfg: DictConfig, 20 | print_order: Sequence[str] = ( 21 | "data", 22 | "model", 23 | "callbacks", 24 | "logger", 25 | "trainer", 26 | "paths", 27 | "extras", 28 | ), 29 | resolve: bool = False, 30 | save_to_file: bool = False, 31 | ) -> None: 32 | """Prints the contents of a DictConfig as a tree structure using the Rich library. 33 | 34 | :param cfg: A DictConfig composed by Hydra. 35 | :param print_order: Determines in what order config components are printed. Default is ``("data", "model", 36 | "callbacks", "logger", "trainer", "paths", "extras")``. 37 | :param resolve: Whether to resolve reference fields of DictConfig. Default is ``False``. 
38 | :param save_to_file: Whether to export config to the hydra output folder. Default is ``False``. 39 | """ 40 | style = "dim" 41 | tree = rich.tree.Tree("CONFIG", style=style, guide_style=style) 42 | 43 | queue = [] 44 | 45 | # add fields from `print_order` to queue 46 | for field in print_order: 47 | _ = ( 48 | queue.append(field) 49 | if field in cfg 50 | else log.warning(f"Field '{field}' not found in config. Skipping '{field}' config printing...") 51 | ) 52 | 53 | # add all the other fields to queue (not specified in `print_order`) 54 | for field in cfg: 55 | if field not in queue: 56 | queue.append(field) 57 | 58 | # generate config tree from queue 59 | for field in queue: 60 | branch = tree.add(field, style=style, guide_style=style) 61 | 62 | config_group = cfg[field] 63 | if isinstance(config_group, DictConfig): 64 | branch_content = OmegaConf.to_yaml(config_group, resolve=resolve) 65 | else: 66 | branch_content = str(config_group) 67 | 68 | branch.add(rich.syntax.Syntax(branch_content, "yaml")) 69 | 70 | # print config tree 71 | rich.print(tree) 72 | 73 | # save config tree to file 74 | if save_to_file: 75 | with open(Path(cfg.paths.output_dir, "config_tree.log"), "w") as file: 76 | rich.print(tree, file=file) 77 | 78 | 79 | @rank_zero_only 80 | def enforce_tags(cfg: DictConfig, save_to_file: bool = False) -> None: 81 | """Prompts user to input tags from command line if no tags are provided in config. 82 | 83 | :param cfg: A DictConfig composed by Hydra. 84 | :param save_to_file: Whether to export tags to the hydra output folder. Default is ``False``. 85 | """ 86 | if not cfg.get("tags"): 87 | if "id" in HydraConfig().cfg.hydra.job: 88 | raise ValueError("Specify tags before launching a multirun!") 89 | 90 | log.warning("No tags provided in config. 
Prompting user to input tags...") 91 | tags = Prompt.ask("Enter a list of comma separated tags", default="dev") 92 | tags = [t.strip() for t in tags.split(",") if t != ""] 93 | 94 | with open_dict(cfg): 95 | cfg.tags = tags 96 | 97 | log.info(f"Tags: {cfg.tags}") 98 | 99 | if save_to_file: 100 | with open(Path(cfg.paths.output_dir, "tags.log"), "w") as file: 101 | rich.print(cfg.tags, file=file) 102 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/notebooks/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Henry-23/VideoChat/3d6700c5f2c4e49220501898f96049579b39345e/src/GLM_4_Voice/third_party/Matcha-TTS/notebooks/.gitkeep -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools", "wheel", "cython==0.29.35", "numpy==1.24.3", "packaging"] 3 | 4 | [tool.black] 5 | line-length = 120 6 | target-version = ['py310'] 7 | exclude = ''' 8 | 9 | ( 10 | /( 11 | \.eggs # exclude a few common directories in the 12 | | \.git # root of the project 13 | | \.hg 14 | | \.mypy_cache 15 | | \.tox 16 | | \.venv 17 | | _build 18 | | buck-out 19 | | build 20 | | dist 21 | )/ 22 | | foo.py # also separately exclude a file named foo.py in 23 | # the root of the project 24 | ) 25 | ''' 26 | 27 | [tool.pytest.ini_options] 28 | addopts = [ 29 | "--color=yes", 30 | "--durations=0", 31 | "--strict-markers", 32 | "--doctest-modules", 33 | ] 34 | filterwarnings = [ 35 | "ignore::DeprecationWarning", 36 | "ignore::UserWarning", 37 | ] 38 | log_cli = "True" 39 | markers = [ 40 | "slow: slow tests", 41 | ] 42 | minversion = "6.0" 43 | testpaths = "tests/" 44 | 45 | [tool.coverage.report] 46 | exclude_lines = [ 47 | "pragma: nocover", 48 | "raise NotImplementedError", 49 | "raise NotImplementedError()", 50 | "if __name__ == .__main__.:", 51 | ] 52 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/requirements.txt: -------------------------------------------------------------------------------- 1 | # --------- pytorch --------- # 2 | torch>=2.0.0 3 | torchvision>=0.15.0 4 | lightning>=2.0.0 5 | torchmetrics>=0.11.4 6 | 7 | # --------- hydra --------- # 8 | hydra-core==1.3.2 9 | hydra-colorlog==1.2.0 10 | hydra-optuna-sweeper==1.2.0 11 | 12 | # --------- loggers --------- # 13 | # wandb 14 | # neptune-client 15 | # mlflow 16 | # comet-ml 17 | # aim>=3.16.2 # no lower than 3.16.2, see https://github.com/aimhubio/aim/issues/2550 18 | 19 | # --------- others --------- # 20 | rootutils # standardizing the project root setup 21 | pre-commit # hooks for applying linters on commit 22 | rich # beautiful text formatting in terminal 23 | pytest # tests 24 | # sh # for running bash commands in some tests (linux/macos only) 25 | phonemizer # phonemization of text 26 | tensorboard 27 | librosa 28 | Cython 29 | numpy 30 | einops 31 | inflect 32 | Unidecode 33 | scipy 34 | torchaudio 35 | matplotlib 36 | pandas 37 | conformer==0.3.2 38 | diffusers==0.25.0 39 | notebook 40 | ipywidgets 41 | gradio==3.43.2 42 | gdown 43 | wget 44 | seaborn 45 | piper_phonemize 46 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/scripts/schedule.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Schedule execution of many runs 3 | # Run from root folder with: bash scripts/schedule.sh 4 | 5 | python src/train.py trainer.max_epochs=5 logger=csv 6 | 7 | python src/train.py trainer.max_epochs=10 logger=csv 8 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import os 3 | 4 | import numpy 5 | from Cython.Build import cythonize 6 | from setuptools import Extension, find_packages, setup 7 | 8 | exts = [ 9 | Extension( 10 | name="matcha.utils.monotonic_align.core", 11 | sources=["matcha/utils/monotonic_align/core.pyx"], 12 | ) 13 | ] 14 | 15 | with open("README.md", encoding="utf-8") as readme_file: 16 | README = readme_file.read() 17 | 18 | cwd = os.path.dirname(os.path.abspath(__file__)) 19 | with open(os.path.join(cwd, "matcha", "VERSION")) as fin: 20 | version = fin.read().strip() 21 | 22 | setup( 23 | name="matcha-tts", 24 | version=version, 25 | description="🍵 Matcha-TTS: A fast TTS architecture with conditional flow matching", 26 | long_description=README, 27 | long_description_content_type="text/markdown", 28 | author="Shivam Mehta", 29 | author_email="shivam.mehta25@gmail.com", 30 | url="https://shivammehta25.github.io/Matcha-TTS", 31 | install_requires=[str(r) for r in open(os.path.join(os.path.dirname(__file__), "requirements.txt"))], 32 | include_dirs=[numpy.get_include()], 33 | include_package_data=True, 34 | packages=find_packages(exclude=["tests", "tests/*", "examples", "examples/*"]), 35 | # use this to customize global commands available in the terminal after installing the package 36 | entry_points={ 37 | "console_scripts": [ 38 | "matcha-data-stats=matcha.utils.generate_data_statistics:main", 39 | "matcha-tts=matcha.cli:cli", 40 | "matcha-tts-app=matcha.app:main", 41 | ] 42 | }, 43 | ext_modules=cythonize(exts, language_level=3), 44 | python_requires=">=3.9.0", 45 | ) 46 | -------------------------------------------------------------------------------- /src/GPT_SoVITS/AR/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Henry-23/VideoChat/3d6700c5f2c4e49220501898f96049579b39345e/src/GPT_SoVITS/AR/__init__.py -------------------------------------------------------------------------------- /src/GPT_SoVITS/AR/data/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Henry-23/VideoChat/3d6700c5f2c4e49220501898f96049579b39345e/src/GPT_SoVITS/AR/data/__init__.py -------------------------------------------------------------------------------- /src/GPT_SoVITS/AR/data/data_module.py: -------------------------------------------------------------------------------- 1 | # modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/data/data_module.py 2 | # reference: https://github.com/lifeiteng/vall-e 3 | from pytorch_lightning import LightningDataModule 4 | from AR.data.bucket_sampler import DistributedBucketSampler 5 | from AR.data.dataset import Text2SemanticDataset 6 | from torch.utils.data import DataLoader 7 | 8 | 9 | class Text2SemanticDataModule(LightningDataModule): 10 | def __init__( 11 | self, 12 | config, 13 | train_semantic_path, 14 | train_phoneme_path, 15 | dev_semantic_path=None, 16 | 
dev_phoneme_path=None, 17 | ): 18 | super().__init__() 19 | self.config = config 20 | self.train_semantic_path = train_semantic_path 21 | self.train_phoneme_path = train_phoneme_path 22 | self.dev_semantic_path = dev_semantic_path 23 | self.dev_phoneme_path = dev_phoneme_path 24 | self.num_workers = self.config["data"]["num_workers"] 25 | 26 | def prepare_data(self): 27 | pass 28 | 29 | def setup(self, stage=None, output_logs=False): 30 | self._train_dataset = Text2SemanticDataset( 31 | phoneme_path=self.train_phoneme_path, 32 | semantic_path=self.train_semantic_path, 33 | max_sec=self.config["data"]["max_sec"], 34 | pad_val=self.config["data"]["pad_val"], 35 | ) 36 | self._dev_dataset = self._train_dataset 37 | # self._dev_dataset = Text2SemanticDataset( 38 | # phoneme_path=self.dev_phoneme_path, 39 | # semantic_path=self.dev_semantic_path, 40 | # max_sample=self.config['data']['max_eval_sample'], 41 | # max_sec=self.config['data']['max_sec'], 42 | # pad_val=self.config['data']['pad_val']) 43 | 44 | def train_dataloader(self): 45 | batch_size=self.config["train"]["batch_size"]//2 if self.config["train"].get("if_dpo",False)==True else self.config["train"]["batch_size"] 46 | batch_size = max(min(batch_size,len(self._train_dataset)//4),1)#防止不保存 47 | sampler = DistributedBucketSampler(self._train_dataset, batch_size=batch_size) 48 | return DataLoader( 49 | self._train_dataset, 50 | batch_size=batch_size, 51 | sampler=sampler, 52 | collate_fn=self._train_dataset.collate, 53 | num_workers=self.num_workers, 54 | persistent_workers=True, 55 | prefetch_factor=16, 56 | ) 57 | 58 | def val_dataloader(self): 59 | return DataLoader( 60 | self._dev_dataset, 61 | batch_size=1, 62 | shuffle=False, 63 | collate_fn=self._train_dataset.collate, 64 | num_workers=max(self.num_workers, 12), 65 | persistent_workers=True, 66 | prefetch_factor=16, 67 | ) 68 | 69 | # 这个会使用到嘛? 
70 | def test_dataloader(self): 71 | return DataLoader( 72 | self._dev_dataset, 73 | batch_size=1, 74 | shuffle=False, 75 | collate_fn=self._train_dataset.collate, 76 | ) 77 | -------------------------------------------------------------------------------- /src/GPT_SoVITS/AR/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Henry-23/VideoChat/3d6700c5f2c4e49220501898f96049579b39345e/src/GPT_SoVITS/AR/models/__init__.py -------------------------------------------------------------------------------- /src/GPT_SoVITS/AR/modules/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Henry-23/VideoChat/3d6700c5f2c4e49220501898f96049579b39345e/src/GPT_SoVITS/AR/modules/__init__.py -------------------------------------------------------------------------------- /src/GPT_SoVITS/AR/modules/embedding.py: -------------------------------------------------------------------------------- 1 | # modified from https://github.com/lifeiteng/vall-e/blob/main/valle/modules/embedding.py 2 | import math 3 | 4 | import torch 5 | from torch import nn 6 | 7 | 8 | class TokenEmbedding(nn.Module): 9 | def __init__( 10 | self, 11 | embedding_dim: int, 12 | vocab_size: int, 13 | dropout: float = 0.0, 14 | ): 15 | super().__init__() 16 | 17 | self.vocab_size = vocab_size 18 | self.embedding_dim = embedding_dim 19 | 20 | self.dropout = torch.nn.Dropout(p=dropout) 21 | self.word_embeddings = nn.Embedding(self.vocab_size, self.embedding_dim) 22 | 23 | @property 24 | def weight(self) -> torch.Tensor: 25 | return self.word_embeddings.weight 26 | 27 | def embedding(self, index: int) -> torch.Tensor: 28 | return self.word_embeddings.weight[index : index + 1] 29 | 30 | def forward(self, x: torch.Tensor): 31 | x = self.word_embeddings(x) 32 | x = self.dropout(x) 33 | return x 34 | 35 | 36 | class SinePositionalEmbedding(nn.Module): 37 | def __init__( 38 | self, 39 | embedding_dim: int, 40 | dropout: float = 0.0, 41 | scale: bool = False, 42 | alpha: bool = False, 43 | ): 44 | super().__init__() 45 | self.embedding_dim = embedding_dim 46 | self.x_scale = math.sqrt(embedding_dim) if scale else 1.0 47 | self.alpha = nn.Parameter(torch.ones(1), requires_grad=alpha) 48 | self.dropout = torch.nn.Dropout(p=dropout) 49 | 50 | self.reverse = False 51 | self.pe = None 52 | self.extend_pe(torch.tensor(0.0).expand(1, 4000)) 53 | 54 | def extend_pe(self, x): 55 | """Reset the positional encodings.""" 56 | if self.pe is not None: 57 | if self.pe.size(1) >= x.size(1): 58 | if self.pe.dtype != x.dtype or self.pe.device != x.device: 59 | self.pe = self.pe.to(dtype=x.dtype, device=x.device) 60 | return 61 | pe = torch.zeros(x.size(1), self.embedding_dim) 62 | if self.reverse: 63 | position = torch.arange( 64 | x.size(1) - 1, -1, -1.0, dtype=torch.float32 65 | ).unsqueeze(1) 66 | else: 67 | position = torch.arange(0, x.size(1), dtype=torch.float32).unsqueeze(1) 68 | div_term = torch.exp( 69 | torch.arange(0, self.embedding_dim, 2, dtype=torch.float32) 70 | * -(math.log(10000.0) / self.embedding_dim) 71 | ) 72 | pe[:, 0::2] = torch.sin(position * div_term) 73 | pe[:, 1::2] = torch.cos(position * div_term) 74 | pe = pe.unsqueeze(0) 75 | self.pe = pe.to(device=x.device, dtype=x.dtype).detach() 76 | 77 | def forward(self, x: torch.Tensor) -> torch.Tensor: 78 | self.extend_pe(x) 79 | output = x.unsqueeze(-1) if x.ndim == 2 else x 80 | output = output * self.x_scale + self.alpha * self.pe[:, : 
x.size(1)] 81 | return self.dropout(output) 82 | -------------------------------------------------------------------------------- /src/GPT_SoVITS/AR/modules/embedding_onnx.py: -------------------------------------------------------------------------------- 1 | # modified from https://github.com/lifeiteng/vall-e/blob/main/valle/modules/embedding.py 2 | import math 3 | 4 | import torch 5 | from torch import nn 6 | 7 | 8 | class TokenEmbedding(nn.Module): 9 | def __init__( 10 | self, 11 | embedding_dim: int, 12 | vocab_size: int, 13 | dropout: float = 0.0, 14 | ): 15 | super().__init__() 16 | 17 | self.vocab_size = vocab_size 18 | self.embedding_dim = embedding_dim 19 | 20 | self.dropout = torch.nn.Dropout(p=dropout) 21 | self.word_embeddings = nn.Embedding(self.vocab_size, self.embedding_dim) 22 | 23 | @property 24 | def weight(self) -> torch.Tensor: 25 | return self.word_embeddings.weight 26 | 27 | def embedding(self, index: int) -> torch.Tensor: 28 | return self.word_embeddings.weight[index : index + 1] 29 | 30 | def forward(self, x: torch.Tensor): 31 | x = self.word_embeddings(x) 32 | x = self.dropout(x) 33 | return x 34 | 35 | 36 | class SinePositionalEmbedding(nn.Module): 37 | def __init__( 38 | self, 39 | embedding_dim: int, 40 | dropout: float = 0.0, 41 | scale: bool = False, 42 | alpha: bool = False, 43 | ): 44 | super().__init__() 45 | self.embedding_dim = embedding_dim 46 | self.x_scale = math.sqrt(embedding_dim) if scale else 1.0 47 | self.alpha = nn.Parameter(torch.ones(1), requires_grad=alpha) 48 | self.dropout = torch.nn.Dropout(p=dropout) 49 | self.reverse = False 50 | self.div_term = torch.exp(torch.arange(0, self.embedding_dim, 2) * -(math.log(10000.0) / self.embedding_dim)) 51 | 52 | def extend_pe(self, x): 53 | position = torch.cumsum(torch.ones_like(x[:,:,0]), dim=1).transpose(0, 1) 54 | scpe = (position * self.div_term).unsqueeze(0) 55 | pe = torch.cat([torch.sin(scpe), torch.cos(scpe)]).permute(1, 2, 0) 56 | pe = pe.contiguous().view(1, -1, self.embedding_dim) 57 | return pe 58 | 59 | def forward(self, x: torch.Tensor) -> torch.Tensor: 60 | pe = self.extend_pe(x) 61 | output = x.unsqueeze(-1) if x.ndim == 2 else x 62 | output = output * self.x_scale + self.alpha * pe 63 | return self.dropout(output) 64 | -------------------------------------------------------------------------------- /src/GPT_SoVITS/AR/modules/lr_schedulers.py: -------------------------------------------------------------------------------- 1 | # modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/modules/lr_schedulers.py 2 | # reference: https://github.com/lifeiteng/vall-e 3 | import math 4 | 5 | import torch 6 | from matplotlib import pyplot as plt 7 | from torch import nn 8 | from torch.optim import Adam 9 | 10 | 11 | class WarmupCosineLRSchedule(torch.optim.lr_scheduler._LRScheduler): 12 | """ 13 | Implements Warmup learning rate schedule until 'warmup_steps', going from 'init_lr' to 'peak_lr' for multiple optimizers. 
14 | """ 15 | 16 | def __init__( 17 | self, 18 | optimizer, 19 | init_lr, 20 | peak_lr, 21 | end_lr, 22 | warmup_steps=10000, 23 | total_steps=400000, 24 | current_step=0, 25 | ): 26 | self.init_lr = init_lr 27 | self.peak_lr = peak_lr 28 | self.end_lr = end_lr 29 | self.optimizer = optimizer 30 | self._warmup_rate = (peak_lr - init_lr) / warmup_steps 31 | self._decay_rate = (end_lr - peak_lr) / (total_steps - warmup_steps) 32 | self._current_step = current_step 33 | self.lr = init_lr 34 | self.warmup_steps = warmup_steps 35 | self.total_steps = total_steps 36 | self._last_lr = [self.lr] 37 | 38 | def set_lr(self, lr): 39 | self._last_lr = [g["lr"] for g in self.optimizer.param_groups] 40 | for g in self.optimizer.param_groups: 41 | # g['lr'] = lr 42 | g["lr"] = self.end_lr ###锁定用线性 43 | 44 | def step(self): 45 | if self._current_step < self.warmup_steps: 46 | lr = self.init_lr + self._warmup_rate * self._current_step 47 | 48 | elif self._current_step > self.total_steps: 49 | lr = self.end_lr 50 | 51 | else: 52 | decay_ratio = (self._current_step - self.warmup_steps) / ( 53 | self.total_steps - self.warmup_steps 54 | ) 55 | if decay_ratio < 0.0 or decay_ratio > 1.0: 56 | raise RuntimeError( 57 | "Decay ratio must be in [0.0, 1.0]. Fix LR scheduler settings." 58 | ) 59 | coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio)) 60 | lr = self.end_lr + coeff * (self.peak_lr - self.end_lr) 61 | 62 | self.lr = lr = self.end_lr = 0.002 ###锁定用线性###不听话,直接锁定! 63 | self.set_lr(lr) 64 | self.lr = lr 65 | self._current_step += 1 66 | return self.lr 67 | 68 | 69 | if __name__ == "__main__": 70 | m = nn.Linear(10, 10) 71 | opt = Adam(m.parameters(), lr=1e-4) 72 | s = WarmupCosineLRSchedule( 73 | opt, 1e-6, 2e-4, 1e-6, warmup_steps=2000, total_steps=20000, current_step=0 74 | ) 75 | lrs = [] 76 | for i in range(25000): 77 | s.step() 78 | lrs.append(s.lr) 79 | print(s.lr) 80 | 81 | plt.plot(lrs) 82 | plt.plot(range(0, 25000), lrs) 83 | plt.show() 84 | -------------------------------------------------------------------------------- /src/GPT_SoVITS/AR/modules/patched_mha_with_cache_onnx.py: -------------------------------------------------------------------------------- 1 | from torch.nn.functional import * 2 | from torch.nn.functional import ( 3 | _mha_shape_check, 4 | _canonical_mask, 5 | _none_or_dtype, 6 | _in_projection_packed, 7 | ) 8 | 9 | def multi_head_attention_forward_patched( 10 | query, 11 | key, 12 | value, 13 | embed_dim_to_check: int, 14 | num_heads: int, 15 | in_proj_weight, 16 | in_proj_bias: Optional[Tensor], 17 | bias_k: Optional[Tensor], 18 | bias_v: Optional[Tensor], 19 | add_zero_attn: bool, 20 | dropout_p: float, 21 | out_proj_weight: Tensor, 22 | out_proj_bias: Optional[Tensor], 23 | training: bool = True, 24 | key_padding_mask: Optional[Tensor] = None, 25 | need_weights: bool = True, 26 | attn_mask: Optional[Tensor] = None, 27 | use_separate_proj_weight: bool = False, 28 | q_proj_weight: Optional[Tensor] = None, 29 | k_proj_weight: Optional[Tensor] = None, 30 | v_proj_weight: Optional[Tensor] = None, 31 | static_k: Optional[Tensor] = None, 32 | static_v: Optional[Tensor] = None, 33 | average_attn_weights: bool = True, 34 | is_causal: bool = False, 35 | cache=None, 36 | ) -> Tuple[Tensor, Optional[Tensor]]: 37 | 38 | # set up shape vars 39 | _, _, embed_dim = query.shape 40 | attn_mask = _canonical_mask( 41 | mask=attn_mask, 42 | mask_name="attn_mask", 43 | other_type=None, 44 | other_name="", 45 | target_type=query.dtype, 46 | check_other=False, 47 | ) 48 | head_dim = embed_dim // 
num_heads 49 | 50 | proj_qkv = linear(query, in_proj_weight, in_proj_bias) 51 | proj_qkv = proj_qkv.unflatten(-1, (3, query.size(-1))).unsqueeze(0).transpose(0, -2).squeeze(-2).contiguous() 52 | q, k, v = proj_qkv[0], proj_qkv[1], proj_qkv[2] 53 | 54 | if cache["first_infer"] == 1: 55 | cache["k"][cache["stage"]] = k 56 | cache["v"][cache["stage"]] = v 57 | else: 58 | cache["k"][cache["stage"]] = torch.cat([cache["k"][cache["stage"]][:-1], k], 0) 59 | cache["v"][cache["stage"]] = torch.cat([cache["v"][cache["stage"]][:-1], v], 0) 60 | k = cache["k"][cache["stage"]] 61 | v = cache["v"][cache["stage"]] 62 | cache["stage"] = (cache["stage"] + 1) % cache["all_stage"] 63 | 64 | attn_mask = _canonical_mask( 65 | mask=attn_mask, 66 | mask_name="attn_mask", 67 | other_type=None, 68 | other_name="", 69 | target_type=q.dtype, 70 | check_other=False, 71 | ) 72 | attn_mask = attn_mask.unsqueeze(0) 73 | 74 | q = q.view(-1, num_heads, head_dim).transpose(0, 1) 75 | k = k.view(-1, num_heads, head_dim).transpose(0, 1) 76 | v = v.view(-1, num_heads, head_dim).transpose(0, 1) 77 | 78 | dropout_p = 0.0 79 | attn_mask = attn_mask.unsqueeze(0) 80 | q = q.view(num_heads, -1, head_dim).unsqueeze(0) 81 | k = k.view(num_heads, -1, head_dim).unsqueeze(0) 82 | v = v.view(num_heads, -1, head_dim).unsqueeze(0) 83 | attn_output = scaled_dot_product_attention( 84 | q, k, v, attn_mask, dropout_p, is_causal 85 | ) 86 | attn_output = ( 87 | attn_output.permute(2, 0, 1, 3).contiguous().view(-1, embed_dim) 88 | ) 89 | attn_output = linear(attn_output, out_proj_weight, out_proj_bias) 90 | attn_output = attn_output.view(-1, 1, attn_output.size(1)) 91 | 92 | return attn_output 93 | -------------------------------------------------------------------------------- /src/GPT_SoVITS/AR/text_processing/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Henry-23/VideoChat/3d6700c5f2c4e49220501898f96049579b39345e/src/GPT_SoVITS/AR/text_processing/__init__.py -------------------------------------------------------------------------------- /src/GPT_SoVITS/AR/text_processing/phonemizer.py: -------------------------------------------------------------------------------- 1 | # modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/text_processing/phonemizer.py 2 | # reference: https://github.com/lifeiteng/vall-e 3 | import itertools 4 | import re 5 | from typing import Dict 6 | from typing import List 7 | 8 | import regex 9 | from gruut import sentences 10 | from gruut.const import Sentence 11 | from gruut.const import Word 12 | from src.GPT_SoVITS.AR.text_processing.symbols import SYMBOL_TO_ID 13 | 14 | 15 | class GruutPhonemizer: 16 | def __init__(self, language: str): 17 | self._phonemizer = sentences 18 | self.lang = language 19 | self.symbol_to_id = SYMBOL_TO_ID 20 | self._special_cases_dict: Dict[str] = { 21 | r"\.\.\.": "... ", 22 | ";": "; ", 23 | ":": ": ", 24 | ",": ", ", 25 | r"\.": ". ", 26 | "!": "! ", 27 | r"\?": "? 
", 28 | "—": "—", 29 | "…": "… ", 30 | "«": "«", 31 | "»": "»", 32 | } 33 | self._punctuation_regexp: str = ( 34 | rf"([{''.join(self._special_cases_dict.keys())}])" 35 | ) 36 | 37 | def _normalize_punctuation(self, text: str) -> str: 38 | text = regex.sub(rf"\pZ+{self._punctuation_regexp}", r"\1", text) 39 | text = regex.sub(rf"{self._punctuation_regexp}(\pL)", r"\1 \2", text) 40 | text = regex.sub(r"\pZ+", r" ", text) 41 | return text.strip() 42 | 43 | def _convert_punctuation(self, word: Word) -> str: 44 | if not word.phonemes: 45 | return "" 46 | if word.phonemes[0] in ["‖", "|"]: 47 | return word.text.strip() 48 | 49 | phonemes = "".join(word.phonemes) 50 | # remove modifier characters ˈˌː with regex 51 | phonemes = re.sub(r"[ˈˌː͡]", "", phonemes) 52 | return phonemes.strip() 53 | 54 | def phonemize(self, text: str, espeak: bool = False) -> str: 55 | text_to_phonemize: str = self._normalize_punctuation(text) 56 | sents: List[Sentence] = [ 57 | sent 58 | for sent in self._phonemizer(text_to_phonemize, lang="en-us", espeak=espeak) 59 | ] 60 | words: List[str] = [ 61 | self._convert_punctuation(word) for word in itertools.chain(*sents) 62 | ] 63 | return " ".join(words) 64 | 65 | def transform(self, phonemes): 66 | # convert phonemes to ids 67 | # dictionary is in symbols.py 68 | return [self.symbol_to_id[p] for p in phonemes if p in self.symbol_to_id.keys()] 69 | 70 | 71 | if __name__ == "__main__": 72 | phonemizer = GruutPhonemizer("en-us") 73 | # text -> IPA 74 | phonemes = phonemizer.phonemize("Hello, wor-ld ?") 75 | print("phonemes:", phonemes) 76 | print("len(phonemes):", len(phonemes)) 77 | phoneme_ids = phonemizer.transform(phonemes) 78 | print("phoneme_ids:", phoneme_ids) 79 | print("len(phoneme_ids):", len(phoneme_ids)) 80 | -------------------------------------------------------------------------------- /src/GPT_SoVITS/AR/text_processing/symbols.py: -------------------------------------------------------------------------------- 1 | # modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/text_processing/symbols.py 2 | # reference: https://github.com/lifeiteng/vall-e 3 | PAD = "_" 4 | PUNCTUATION = ';:,.!?¡¿—…"«»“” ' 5 | LETTERS = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz" 6 | IPA_LETTERS = "ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ" 7 | SYMBOLS = [PAD] + list(PUNCTUATION) + list(LETTERS) + list(IPA_LETTERS) 8 | SPACE_ID = SYMBOLS.index(" ") 9 | SYMBOL_TO_ID = {s: i for i, s in enumerate(SYMBOLS)} 10 | ID_TO_SYMBOL = {i: s for i, s in enumerate(SYMBOLS)} 11 | -------------------------------------------------------------------------------- /src/GPT_SoVITS/AR/utils/__init__.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | 4 | def str2bool(str): 5 | return True if str.lower() == 'true' else False 6 | 7 | 8 | def get_newest_ckpt(string_list): 9 | # 定义一个正则表达式模式,用于匹配字符串中的数字 10 | pattern = r'epoch=(\d+)-step=(\d+)\.ckpt' 11 | 12 | # 使用正则表达式提取每个字符串中的数字信息,并创建一个包含元组的列表 13 | extracted_info = [] 14 | for string in string_list: 15 | match = re.match(pattern, string) 16 | if match: 17 | epoch = int(match.group(1)) 18 | step = int(match.group(2)) 19 | extracted_info.append((epoch, step, string)) 20 | # 按照 epoch 后面的数字和 step 后面的数字进行排序 21 | sorted_info = sorted( 22 | extracted_info, key=lambda x: (x[0], x[1]), reverse=True) 23 | # 获取最新的 ckpt 文件名 24 | newest_ckpt = sorted_info[0][2] 25 | return newest_ckpt 26 | 27 | 28 | # 
文本存在且不为空时 return True 29 | def check_txt_file(file_path): 30 | try: 31 | with open(file_path, 'r') as file: 32 | text = file.readline().strip() 33 | assert text.strip() != '' 34 | return text 35 | except Exception: 36 | return False 37 | return False 38 | -------------------------------------------------------------------------------- /src/GPT_SoVITS/AR/utils/initialize.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """Initialize modules for espnet2 neural networks.""" 3 | import torch 4 | from typeguard import check_argument_types 5 | 6 | 7 | def initialize(model: torch.nn.Module, init: str): 8 | """Initialize weights of a neural network module. 9 | 10 | Parameters are initialized using the given method or distribution. 11 | 12 | Custom initialization routines can be implemented into submodules 13 | as function `espnet_initialization_fn` within the custom module. 14 | 15 | Args: 16 | model: Target. 17 | init: Method of initialization. 18 | """ 19 | assert check_argument_types() 20 | print("init with", init) 21 | 22 | # weight init 23 | for p in model.parameters(): 24 | if p.dim() > 1: 25 | if init == "xavier_uniform": 26 | torch.nn.init.xavier_uniform_(p.data) 27 | elif init == "xavier_normal": 28 | torch.nn.init.xavier_normal_(p.data) 29 | elif init == "kaiming_uniform": 30 | torch.nn.init.kaiming_uniform_(p.data, nonlinearity="relu") 31 | elif init == "kaiming_normal": 32 | torch.nn.init.kaiming_normal_(p.data, nonlinearity="relu") 33 | else: 34 | raise ValueError("Unknown initialization: " + init) 35 | # bias init 36 | for name, p in model.named_parameters(): 37 | if ".bias" in name and p.dim() == 1: 38 | p.data.zero_() 39 | -------------------------------------------------------------------------------- /src/GPT_SoVITS/AR/utils/io.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | import torch 4 | import yaml 5 | 6 | 7 | def load_yaml_config(path): 8 | with open(path) as f: 9 | config = yaml.full_load(f) 10 | return config 11 | 12 | 13 | def save_config_to_yaml(config, path): 14 | assert path.endswith(".yaml") 15 | with open(path, "w") as f: 16 | f.write(yaml.dump(config)) 17 | f.close() 18 | 19 | 20 | def write_args(args, path): 21 | args_dict = dict( 22 | (name, getattr(args, name)) for name in dir(args) if not name.startswith("_") 23 | ) 24 | with open(path, "a") as args_file: 25 | args_file.write("==> torch version: {}\n".format(torch.__version__)) 26 | args_file.write( 27 | "==> cudnn version: {}\n".format(torch.backends.cudnn.version()) 28 | ) 29 | args_file.write("==> Cmd:\n") 30 | args_file.write(str(sys.argv)) 31 | args_file.write("\n==> args:\n") 32 | for k, v in sorted(args_dict.items()): 33 | args_file.write(" %s: %s\n" % (str(k), str(v))) 34 | args_file.close() 35 | -------------------------------------------------------------------------------- /src/GPT_SoVITS/TTS_infer_pack/__init__.py: -------------------------------------------------------------------------------- 1 | from src.GPT_SoVITS.TTS_infer_pack import TTS, text_segmentation_method -------------------------------------------------------------------------------- /src/GPT_SoVITS/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Henry-23/VideoChat/3d6700c5f2c4e49220501898f96049579b39345e/src/GPT_SoVITS/__init__.py -------------------------------------------------------------------------------- 
/src/GPT_SoVITS/configs/s1.yaml: -------------------------------------------------------------------------------- 1 | train: 2 | seed: 1234 3 | epochs: 300 4 | batch_size: 8 5 | gradient_accumulation: 4 6 | save_every_n_epoch: 1 7 | precision: 16 8 | gradient_clip: 1.0 9 | optimizer: 10 | lr: 0.01 11 | lr_init: 0.00001 12 | lr_end: 0.0001 13 | warmup_steps: 2000 14 | decay_steps: 40000 15 | data: 16 | max_eval_sample: 8 17 | max_sec: 54 18 | num_workers: 1 19 | pad_val: 1024 # same with EOS in model 20 | model: 21 | vocab_size: 1025 22 | phoneme_vocab_size: 512 23 | embedding_dim: 512 24 | hidden_dim: 512 25 | head: 16 26 | linear_units: 2048 27 | n_layer: 12 28 | dropout: 0 29 | EOS: 1024 30 | inference: 31 | top_k: 5 32 | -------------------------------------------------------------------------------- /src/GPT_SoVITS/configs/s1big.yaml: -------------------------------------------------------------------------------- 1 | train: 2 | seed: 1234 3 | epochs: 300 4 | batch_size: 8 5 | gradient_accumulation: 4 6 | save_every_n_epoch: 1 7 | precision: 16-mixed 8 | gradient_clip: 1.0 9 | optimizer: 10 | lr: 0.01 11 | lr_init: 0.00001 12 | lr_end: 0.0001 13 | warmup_steps: 2000 14 | decay_steps: 40000 15 | data: 16 | max_eval_sample: 8 17 | max_sec: 54 18 | num_workers: 1 19 | pad_val: 1024 # same with EOS in model 20 | model: 21 | vocab_size: 1025 22 | phoneme_vocab_size: 512 23 | embedding_dim: 1024 24 | hidden_dim: 1024 25 | head: 16 26 | linear_units: 2048 27 | n_layer: 16 28 | dropout: 0 29 | EOS: 1024 30 | inference: 31 | top_k: 5 32 | -------------------------------------------------------------------------------- /src/GPT_SoVITS/configs/s1big2.yaml: -------------------------------------------------------------------------------- 1 | train: 2 | seed: 1234 3 | epochs: 300 4 | batch_size: 12 5 | gradient_accumulation: 4 6 | save_every_n_epoch: 1 7 | precision: 16-mixed 8 | gradient_clip: 1.0 9 | optimizer: 10 | lr: 0.01 11 | lr_init: 0.00001 12 | lr_end: 0.0001 13 | warmup_steps: 2000 14 | decay_steps: 40000 15 | data: 16 | max_eval_sample: 8 17 | max_sec: 54 18 | num_workers: 1 19 | pad_val: 1024 # same with EOS in model 20 | model: 21 | vocab_size: 1025 22 | phoneme_vocab_size: 512 23 | embedding_dim: 1024 24 | hidden_dim: 1024 25 | head: 16 26 | linear_units: 2048 27 | n_layer: 6 28 | dropout: 0 29 | EOS: 1024 30 | inference: 31 | top_k: 5 32 | -------------------------------------------------------------------------------- /src/GPT_SoVITS/configs/s1longer-v2.yaml: -------------------------------------------------------------------------------- 1 | train: 2 | seed: 1234 3 | epochs: 20 4 | batch_size: 8 5 | save_every_n_epoch: 1 6 | precision: 16-mixed 7 | gradient_clip: 1.0 8 | optimizer: 9 | lr: 0.01 10 | lr_init: 0.00001 11 | lr_end: 0.0001 12 | warmup_steps: 2000 13 | decay_steps: 40000 14 | data: 15 | max_eval_sample: 8 16 | max_sec: 54 17 | num_workers: 4 18 | pad_val: 1024 # same with EOS in model 19 | model: 20 | vocab_size: 1025 21 | phoneme_vocab_size: 732 22 | embedding_dim: 512 23 | hidden_dim: 512 24 | head: 16 25 | linear_units: 2048 26 | n_layer: 24 27 | dropout: 0 28 | EOS: 1024 29 | random_bert: 0 30 | inference: 31 | top_k: 15 32 | -------------------------------------------------------------------------------- /src/GPT_SoVITS/configs/s1longer.yaml: -------------------------------------------------------------------------------- 1 | train: 2 | seed: 1234 3 | epochs: 20 4 | batch_size: 8 5 | save_every_n_epoch: 1 6 | precision: 16-mixed 7 | gradient_clip: 1.0 8 | optimizer: 
9 | lr: 0.01 10 | lr_init: 0.00001 11 | lr_end: 0.0001 12 | warmup_steps: 2000 13 | decay_steps: 40000 14 | data: 15 | max_eval_sample: 8 16 | max_sec: 54 17 | num_workers: 4 18 | pad_val: 1024 # same with EOS in model 19 | model: 20 | vocab_size: 1025 21 | phoneme_vocab_size: 512 22 | embedding_dim: 512 23 | hidden_dim: 512 24 | head: 16 25 | linear_units: 2048 26 | n_layer: 24 27 | dropout: 0 28 | EOS: 1024 29 | random_bert: 0 30 | inference: 31 | top_k: 5 32 | -------------------------------------------------------------------------------- /src/GPT_SoVITS/configs/s1mq.yaml: -------------------------------------------------------------------------------- 1 | train: 2 | seed: 1234 3 | epochs: 100 4 | batch_size: 6 5 | gradient_accumulation: 4 6 | save_every_n_epoch: 1 7 | precision: 32 8 | gradient_clip: 1.0 9 | optimizer: 10 | lr: 0.01 11 | lr_init: 0.00001 12 | lr_end: 0.0001 13 | warmup_steps: 2000 14 | decay_steps: 40000 15 | data: 16 | max_eval_sample: 8 17 | max_sec: 40 18 | num_workers: 1 19 | pad_val: 1024 # same with EOS in model 20 | model: 21 | saving_path: "ckpt/" 22 | resume_checkpoint: null 23 | vocoder_config_path: "quantizer/new_ckpt/config.json" 24 | vocoder_ckpt_path: "quantizer/new_ckpt/g_00600000" 25 | datadir: "/home/liweiche/GigaSpeech/wavs" 26 | metapath: "/home/liweiche/GigaSpeech/train2.json" 27 | val_metapath: "/home/liweiche/GigaSpeech/dev2.json" 28 | sampledir: "logs/" 29 | pretrained_path: null 30 | lr: 0.0001 31 | batch_size: 200.0 32 | train_bucket_size: 8192 33 | training_step: 800000 34 | optim_flat_percent: 0.0 35 | warmup_step: 50 36 | adam_beta1: 0.9 37 | adam_beta2: 0.98 38 | ffd_size: 3072 39 | hidden_size: 768 40 | enc_nlayers: 6 41 | dec_nlayers: 6 42 | nheads: 12 43 | ar_layer: 4 44 | ar_ffd_size: 1024 45 | ar_hidden_size: 256 46 | ar_nheads: 4 47 | aligner_softmax_temp: 1.0 48 | layer_norm_eps: 0.00001 49 | speaker_embed_dropout: 0.05 50 | label_smoothing: 0.0 51 | val_check_interval: 5000 52 | check_val_every_n_epoch: 1 53 | precision: "fp16" 54 | nworkers: 16 55 | distributed: true 56 | accelerator: "ddp" 57 | version: null 58 | accumulate_grad_batches: 1 59 | use_repetition_token: true 60 | use_repetition_gating: false 61 | repetition_penalty: 1.0 62 | sampling_temperature: 1.0 63 | top_k: -1 64 | min_top_k: 3 65 | top_p: 0.8 66 | sample_num: 4 67 | length_penalty_max_length: 15000 68 | length_penalty_max_prob: 0.95 69 | max_input_length: 2048 70 | max_output_length: 2000 71 | sample_rate: 16000 72 | n_codes: 1024 73 | n_cluster_groups: 1 74 | phone_context_window: 4 75 | phoneset_size: 1000 76 | inference: 77 | top_k: 5 78 | -------------------------------------------------------------------------------- /src/GPT_SoVITS/configs/s2.json: -------------------------------------------------------------------------------- 1 | { 2 | "train": { 3 | "log_interval": 100, 4 | "eval_interval": 500, 5 | "seed": 1234, 6 | "epochs": 100, 7 | "learning_rate": 0.0001, 8 | "betas": [ 9 | 0.8, 10 | 0.99 11 | ], 12 | "eps": 1e-09, 13 | "batch_size": 32, 14 | "fp16_run": true, 15 | "lr_decay": 0.999875, 16 | "segment_size": 20480, 17 | "init_lr_ratio": 1, 18 | "warmup_epochs": 0, 19 | "c_mel": 45, 20 | "c_kl": 1.0, 21 | "text_low_lr_rate": 0.4 22 | }, 23 | "data": { 24 | "max_wav_value": 32768.0, 25 | "sampling_rate": 32000, 26 | "filter_length": 2048, 27 | "hop_length": 640, 28 | "win_length": 2048, 29 | "n_mel_channels": 128, 30 | "mel_fmin": 0.0, 31 | "mel_fmax": null, 32 | "add_blank": true, 33 | "n_speakers": 300, 34 | "cleaned_text": true 35 | }, 36 | 
"model": { 37 | "inter_channels": 192, 38 | "hidden_channels": 192, 39 | "filter_channels": 768, 40 | "n_heads": 2, 41 | "n_layers": 6, 42 | "kernel_size": 3, 43 | "p_dropout": 0.1, 44 | "resblock": "1", 45 | "resblock_kernel_sizes": [ 46 | 3, 47 | 7, 48 | 11 49 | ], 50 | "resblock_dilation_sizes": [ 51 | [ 52 | 1, 53 | 3, 54 | 5 55 | ], 56 | [ 57 | 1, 58 | 3, 59 | 5 60 | ], 61 | [ 62 | 1, 63 | 3, 64 | 5 65 | ] 66 | ], 67 | "upsample_rates": [ 68 | 10, 69 | 8, 70 | 2, 71 | 2, 72 | 2 73 | ], 74 | "upsample_initial_channel": 512, 75 | "upsample_kernel_sizes": [ 76 | 16, 77 | 16, 78 | 8, 79 | 2, 80 | 2 81 | ], 82 | "n_layers_q": 3, 83 | "use_spectral_norm": false, 84 | "gin_channels": 512, 85 | "semantic_frame_rate": "25hz", 86 | "freeze_quantizer": true 87 | }, 88 | "s2_ckpt_dir": "logs/s2/big2k1", 89 | "content_module": "cnhubert" 90 | } -------------------------------------------------------------------------------- /src/GPT_SoVITS/configs/train.yaml: -------------------------------------------------------------------------------- 1 | gpu: 2 | n_card: 1 3 | n_process_per_card: 2 4 | io: 5 | text_path: D:\RVC1006\GPT-SoVITS\GPT_SoVITS 6 | save_every_n_epoch: 1 7 | precision: 16-mixed 8 | gradient_clip: 1.0 9 | optimizer: 10 | lr: 0.01 11 | lr_init: 0.00001 12 | lr_end: 0.0001 13 | warmup_steps: 2000 14 | decay_steps: 40000 15 | data: 16 | max_eval_sample: 8 17 | max_sec: 54 18 | num_workers: 1 19 | pad_val: 1024 # same with EOS in model 20 | model: 21 | vocab_size: 1025 22 | phoneme_vocab_size: 512 23 | embedding_dim: 512 24 | hidden_dim: 512 25 | head: 16 26 | linear_units: 2048 27 | n_layer: 24 28 | dropout: 0 29 | EOS: 1024 30 | random_bert: 0 31 | inference: 32 | top_k: 5 33 | -------------------------------------------------------------------------------- /src/GPT_SoVITS/configs/tts_infer.yaml: -------------------------------------------------------------------------------- 1 | custom: 2 | bert_base_path: src/GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large 3 | cnhuhbert_base_path: src/GPT_SoVITS/pretrained_models/chinese-hubert-base 4 | device: cuda 5 | is_half: true 6 | t2s_weights_path: src/GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s1bert25hz-5kh-longer-epoch=12-step=369668.ckpt 7 | version: v2 8 | vits_weights_path: src/GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s2G2333k.pth 9 | default: 10 | bert_base_path: src/GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large 11 | cnhuhbert_base_path: src/GPT_SoVITS/pretrained_models/chinese-hubert-base 12 | device: cpu 13 | is_half: false 14 | t2s_weights_path: src/GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt 15 | version: v1 16 | vits_weights_path: src/GPT_SoVITS/pretrained_models/s2G488k.pth 17 | default_v2: 18 | bert_base_path: src/GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large 19 | cnhuhbert_base_path: src/GPT_SoVITS/pretrained_models/chinese-hubert-base 20 | device: cpu 21 | is_half: false 22 | t2s_weights_path: src/GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s1bert25hz-5kh-longer-epoch=12-step=369668.ckpt 23 | version: v2 24 | vits_weights_path: src/GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s2G2333k.pth 25 | -------------------------------------------------------------------------------- /src/GPT_SoVITS/feature_extractor/__init__.py: -------------------------------------------------------------------------------- 1 | from . 
import cnhubert, whisper_enc 2 | 3 | content_module_map = { 4 | 'cnhubert': cnhubert, 5 | 'whisper': whisper_enc 6 | } -------------------------------------------------------------------------------- /src/GPT_SoVITS/feature_extractor/whisper_enc.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def get_model(): 5 | import whisper 6 | 7 | model = whisper.load_model("small", device="cpu") 8 | 9 | return model.encoder 10 | 11 | 12 | def get_content(model=None, wav_16k_tensor=None): 13 | from whisper import log_mel_spectrogram, pad_or_trim 14 | 15 | dev = next(model.parameters()).device 16 | mel = log_mel_spectrogram(wav_16k_tensor).to(dev)[:, :3000] 17 | # if torch.cuda.is_available(): 18 | # mel = mel.to(torch.float16) 19 | feature_len = mel.shape[-1] // 2 20 | assert mel.shape[-1] < 3000, "输入音频过长,只允许输入30以内音频" 21 | with torch.no_grad(): 22 | feature = model(pad_or_trim(mel, 3000).unsqueeze(0))[ 23 | :1, :feature_len, : 24 | ].transpose(1, 2) 25 | return feature 26 | -------------------------------------------------------------------------------- /src/GPT_SoVITS/module/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Henry-23/VideoChat/3d6700c5f2c4e49220501898f96049579b39345e/src/GPT_SoVITS/module/__init__.py -------------------------------------------------------------------------------- /src/GPT_SoVITS/module/losses.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | import torch 4 | from torch.nn import functional as F 5 | 6 | 7 | def feature_loss(fmap_r, fmap_g): 8 | loss = 0 9 | for dr, dg in zip(fmap_r, fmap_g): 10 | for rl, gl in zip(dr, dg): 11 | rl = rl.float().detach() 12 | gl = gl.float() 13 | loss += torch.mean(torch.abs(rl - gl)) 14 | 15 | return loss * 2 16 | 17 | 18 | def discriminator_loss(disc_real_outputs, disc_generated_outputs): 19 | loss = 0 20 | r_losses = [] 21 | g_losses = [] 22 | for dr, dg in zip(disc_real_outputs, disc_generated_outputs): 23 | dr = dr.float() 24 | dg = dg.float() 25 | r_loss = torch.mean((1 - dr) ** 2) 26 | g_loss = torch.mean(dg**2) 27 | loss += r_loss + g_loss 28 | r_losses.append(r_loss.item()) 29 | g_losses.append(g_loss.item()) 30 | 31 | return loss, r_losses, g_losses 32 | 33 | 34 | def generator_loss(disc_outputs): 35 | loss = 0 36 | gen_losses = [] 37 | for dg in disc_outputs: 38 | dg = dg.float() 39 | l = torch.mean((1 - dg) ** 2) 40 | gen_losses.append(l) 41 | loss += l 42 | 43 | return loss, gen_losses 44 | 45 | 46 | def kl_loss(z_p, logs_q, m_p, logs_p, z_mask): 47 | """ 48 | z_p, logs_q: [b, h, t_t] 49 | m_p, logs_p: [b, h, t_t] 50 | """ 51 | z_p = z_p.float() 52 | logs_q = logs_q.float() 53 | m_p = m_p.float() 54 | logs_p = logs_p.float() 55 | z_mask = z_mask.float() 56 | 57 | kl = logs_p - logs_q - 0.5 58 | kl += 0.5 * ((z_p - m_p) ** 2) * torch.exp(-2.0 * logs_p) 59 | kl = torch.sum(kl * z_mask) 60 | l = kl / torch.sum(z_mask) 61 | return l 62 | 63 | 64 | def mle_loss(z, m, logs, logdet, mask): 65 | l = torch.sum(logs) + 0.5 * torch.sum( 66 | torch.exp(-2 * logs) * ((z - m) ** 2) 67 | ) # neg normal likelihood w/o the constant term 68 | l = l - torch.sum(logdet) # log jacobian determinant 69 | l = l / torch.sum( 70 | torch.ones_like(z) * mask 71 | ) # averaging across batch, channel and time axes 72 | l = l + 0.5 * math.log(2 * math.pi) # add the remaining constant term 73 | return l 74 | 
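# --------------------------------------------------------------------------
# Editor's note: the block below is an added illustrative sketch, NOT part of
# the original losses.py. feature_loss/discriminator_loss/generator_loss are
# the usual VITS-style GAN terms and kl_loss is the prior/posterior KL term;
# the sketch only exercises the documented [b, h, t_t] shapes of kl_loss with
# arbitrary, assumed sizes.
# --------------------------------------------------------------------------
if __name__ == "__main__":
    b, h, t = 2, 192, 50                        # hypothetical batch / channels / frames
    z_p, logs_q, m_p, logs_p = [torch.randn(b, h, t) for _ in range(4)]
    z_mask = torch.ones(b, 1, t)                # mask broadcasts over the channel axis
    print(kl_loss(z_p, logs_q, m_p, logs_p, z_mask))   # prints a scalar tensor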
-------------------------------------------------------------------------------- /src/GPT_SoVITS/text/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | # if os.environ.get("version","v1")=="v1": 3 | # from text.symbols import symbols 4 | # else: 5 | # from text.symbols2 import symbols 6 | from src.GPT_SoVITS.text import symbols as symbols_v1 7 | from src.GPT_SoVITS.text import symbols2 as symbols_v2 8 | 9 | _symbol_to_id_v1 = {s: i for i, s in enumerate(symbols_v1.symbols)} 10 | _symbol_to_id_v2 = {s: i for i, s in enumerate(symbols_v2.symbols)} 11 | 12 | def cleaned_text_to_sequence(cleaned_text, version=None): 13 | '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text. 14 | Args: 15 | text: string to convert to a sequence 16 | Returns: 17 | List of integers corresponding to the symbols in the text 18 | ''' 19 | if version is None:version=os.environ.get('version', 'v2') 20 | if version == "v1": 21 | phones = [_symbol_to_id_v1[symbol] for symbol in cleaned_text] 22 | else: 23 | phones = [_symbol_to_id_v2[symbol] for symbol in cleaned_text] 24 | 25 | return phones 26 | 27 | -------------------------------------------------------------------------------- /src/GPT_SoVITS/text/cleaner.py: -------------------------------------------------------------------------------- 1 | from src.GPT_SoVITS.text import cleaned_text_to_sequence 2 | import os 3 | # if os.environ.get("version","v1")=="v1": 4 | # from text import chinese 5 | # from text.symbols import symbols 6 | # else: 7 | # from text import chinese2 as chinese 8 | # from text.symbols2 import symbols 9 | 10 | from src.GPT_SoVITS.text import symbols as symbols_v1 11 | from src.GPT_SoVITS.text import symbols2 as symbols_v2 12 | 13 | special = [ 14 | # ("%", "zh", "SP"), 15 | ("¥", "zh", "SP2"), 16 | ("^", "zh", "SP3"), 17 | # ('@', 'zh', "SP4")#不搞鬼畜了,和第二版保持一致吧 18 | ] 19 | 20 | 21 | def clean_text(text, language, version=None): 22 | if version is None:version=os.environ.get('version', 'v2') 23 | if version == "v1": 24 | symbols = symbols_v1.symbols 25 | language_module_map = {"zh": "chinese", "ja": "japanese", "en": "english"} 26 | else: 27 | symbols = symbols_v2.symbols 28 | language_module_map = {"zh": "chinese2", "ja": "japanese", "en": "english", "ko": "korean","yue":"cantonese"} 29 | 30 | if(language not in language_module_map): 31 | language="en" 32 | text=" " 33 | for special_s, special_l, target_symbol in special: 34 | if special_s in text and language == special_l: 35 | return clean_special(text, language, special_s, target_symbol, version) 36 | language_module = __import__("src.GPT_SoVITS.text."+language_module_map[language],fromlist=[language_module_map[language]]) 37 | if hasattr(language_module,"text_normalize"): 38 | norm_text = language_module.text_normalize(text) 39 | else: 40 | norm_text=text 41 | if language == "zh" or language=="yue":########## 42 | phones, word2ph = language_module.g2p(norm_text) 43 | assert len(phones) == sum(word2ph) 44 | assert len(norm_text) == len(word2ph) 45 | elif language == "en": 46 | phones = language_module.g2p(norm_text) 47 | if len(phones) < 4: 48 | phones = [','] + phones 49 | word2ph = None 50 | else: 51 | phones = language_module.g2p(norm_text) 52 | word2ph = None 53 | phones = ['UNK' if ph not in symbols else ph for ph in phones] 54 | return phones, word2ph, norm_text 55 | 56 | 57 | def clean_special(text, language, special_s, target_symbol, version=None): 58 | if version is 
None:version=os.environ.get('version', 'v2') 59 | if version == "v1": 60 | symbols = symbols_v1.symbols 61 | language_module_map = {"zh": "chinese", "ja": "japanese", "en": "english"} 62 | else: 63 | symbols = symbols_v2.symbols 64 | language_module_map = {"zh": "chinese2", "ja": "japanese", "en": "english", "ko": "korean","yue":"cantonese"} 65 | 66 | """ 67 | 特殊静音段sp符号处理 68 | """ 69 | text = text.replace(special_s, ",") 70 | language_module = __import__("src.GPT_SoVITS.text."+language_module_map[language],fromlist=[language_module_map[language]]) 71 | norm_text = language_module.text_normalize(text) 72 | phones = language_module.g2p(norm_text) 73 | new_ph = [] 74 | for ph in phones[0]: 75 | assert ph in symbols 76 | if ph == ",": 77 | new_ph.append(target_symbol) 78 | else: 79 | new_ph.append(ph) 80 | return new_ph, phones[1], norm_text 81 | 82 | 83 | def text_to_sequence(text, language, version=None): 84 | version = os.environ.get('version',version) 85 | if version is None:version='v2' 86 | phones = clean_text(text) 87 | return cleaned_text_to_sequence(phones, version) 88 | 89 | 90 | if __name__ == "__main__": 91 | print(clean_text("你好%啊啊啊额、还是到付红四方。", "zh")) 92 | -------------------------------------------------------------------------------- /src/GPT_SoVITS/text/engdict-hot.rep: -------------------------------------------------------------------------------- 1 | CHATGPT CH AE1 T JH IY1 P IY1 T IY1 2 | JSON JH EY1 S AH0 N 3 | CONDA K AA1 N D AH0 -------------------------------------------------------------------------------- /src/GPT_SoVITS/text/engdict_cache.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Henry-23/VideoChat/3d6700c5f2c4e49220501898f96049579b39345e/src/GPT_SoVITS/text/engdict_cache.pickle -------------------------------------------------------------------------------- /src/GPT_SoVITS/text/g2pw/__init__.py: -------------------------------------------------------------------------------- 1 | from src.GPT_SoVITS.text.g2pw.g2pw import * -------------------------------------------------------------------------------- /src/GPT_SoVITS/text/g2pw/polyphonic.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Henry-23/VideoChat/3d6700c5f2c4e49220501898f96049579b39345e/src/GPT_SoVITS/text/g2pw/polyphonic.pickle -------------------------------------------------------------------------------- /src/GPT_SoVITS/text/g2pw/polyphonic.rep: -------------------------------------------------------------------------------- 1 | 湖泊: ['hu2','po1'] 2 | 地壳: ['di4','qiao4'] 3 | 柏树: ['bai3','shu4'] 4 | 曝光: ['bao4','guang1'] 5 | 弹力: ['tan2','li4'] 6 | 字帖: ['zi4','tie4'] 7 | 口吃: ['kou3','chi1'] 8 | 包扎: ['bao1','za1'] 9 | 哪吒: ['ne2','zha1'] 10 | 说服: ['shuo1','fu2'] 11 | 识字: ['shi2','zi4'] 12 | 骨头: ['gu3','tou5'] 13 | 对称: ['dui4','chen4'] 14 | 口供: ['kou3','gong4'] 15 | 抹布: ['ma1','bu4'] 16 | 露背: ['lu4','bei4'] 17 | 圈养: ['juan4', 'yang3'] 18 | 眼眶: ['yan3', 'kuang4'] 19 | 品行: ['pin3','xing2'] 20 | 颤抖: ['chan4','dou3'] 21 | 差不多: ['cha4','bu5','duo1'] 22 | 鸭绿江: ['ya1','lu4','jiang1'] 23 | 撒切尔: ['sa4','qie4','er3'] 24 | 比比皆是: ['bi3','bi3','jie1','shi4'] 25 | 身无长物: ['shen1','wu2','chang2','wu4'] 26 | 手里: ['shou2','li3'] 27 | 关卡: ['guan1','qia3'] 28 | 怀揣: ['huai2','chuai1'] 29 | 挑剔: ['tiao1','ti4'] 30 | 供称: ['gong4','cheng1'] 31 | 作坊: ['zuo1', 'fang5'] 32 | 中医: ['zhong1','yi1'] 33 | 嚷嚷: ['rang1','rang5'] 34 | 商厦: ['shang1','sha4'] 35 | 大厦: ['da4','sha4'] 36 | 刹车: 
['sha1','che1'] 37 | 嘚瑟: ['de4','se5'] 38 | 朝鲜: ['chao2','xian3'] 39 | 阿房宫: ['e1','pang2','gong1'] 40 | 阿胶: ['e1','jiao1'] 41 | 咖喱: ['ga1','li5'] 42 | 时分: ['shi2','fen1'] 43 | 蚌埠: ['beng4','bu4'] 44 | 驯服: ['xun4','fu2'] 45 | 幸免于难: ['xing4','mian3','yu2','nan4'] 46 | 恶行: ['e4','xing2'] 47 | 唉: ['ai4'] 48 | 扎实: ['zha1','shi2'] 49 | 干将: ['gan4','jiang4'] 50 | 陈威行: ['chen2', 'wei1', 'hang2'] 51 | 郭晟: ['guo1', 'sheng4'] 52 | 中标: ['zhong4', 'biao1'] 53 | 抗住: ['kang2', 'zhu4'] -------------------------------------------------------------------------------- /src/GPT_SoVITS/text/ja_userdic/userdict.csv: -------------------------------------------------------------------------------- 1 | 主殿,*,*,-32767,名詞,固有名詞,一般,*,*,*,アルジドノ,アルジドノ,アルジドノ,3/5,* -------------------------------------------------------------------------------- /src/GPT_SoVITS/text/namedict_cache.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Henry-23/VideoChat/3d6700c5f2c4e49220501898f96049579b39345e/src/GPT_SoVITS/text/namedict_cache.pickle -------------------------------------------------------------------------------- /src/GPT_SoVITS/text/zh_normalization/README.md: -------------------------------------------------------------------------------- 1 | ## Supported NSW (Non-Standard-Word) Normalization 2 | 3 | |NSW type|raw|normalized| 4 | |:--|:-|:-| 5 | |serial number|电影中梁朝伟扮演的陈永仁的编号27149|电影中梁朝伟扮演的陈永仁的编号二七一四九| 6 | |cardinal|这块黄金重达324.75克
我们班的最高总分为583分|这块黄金重达三百二十四点七五克<br>我们班的最高总分为五百八十三分| 7 | |numeric range |12\~23<br>-1.5\~2|十二到二十三<br>负一点五到二| 8 | |date|她出生于86年8月18日,她弟弟出生于1995年3月1日|她出生于八六年八月十八日, 她弟弟出生于一九九五年三月一日| 9 | |time|等会请在12:05请通知我|等会请在十二点零五分请通知我 10 | |temperature|今天的最低气温达到-10°C|今天的最低气温达到零下十度 11 | |fraction|现场有7/12的观众投出了赞成票|现场有十二分之七的观众投出了赞成票| 12 | |percentage|明天有62%的概率降雨|明天有百分之六十二的概率降雨| 13 | |money|随便来几个价格12块5,34.5元,20.1万|随便来几个价格十二块五,三十四点五元,二十点一万| 14 | |telephone|这是固话0421-33441122<br>这是手机+86 18544139121|这是固话零四二一三三四四一一二二<br>这是手机八六一八五四四一三九一二一| 15 | ## References 16 | [Pull requests #658 of DeepSpeech](https://github.com/PaddlePaddle/DeepSpeech/pull/658/files) 17 | 
-------------------------------------------------------------------------------- /src/GPT_SoVITS/text/zh_normalization/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | from src.GPT_SoVITS.text.zh_normalization.text_normlization import * 15 | -------------------------------------------------------------------------------- /src/GPT_SoVITS/text/zh_normalization/constants.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
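# Editor's note (added comment, not in the original file): this constants
# module provides full-width/half-width translation maps (F2H_* / H2F_* for
# ASCII letters, digits, punctuation and the space character) and RE_NSW, a
# regex that matches runs of characters without a pinyin reading; the text
# normalizer uses RE_NSW to locate non-standard words (numbers, dates, phone
# numbers, etc.) that need to be verbalized.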
14 | import re 15 | import string 16 | 17 | from pypinyin.constants import SUPPORT_UCS4 18 | 19 | # 全角半角转换 20 | # 英文字符全角 -> 半角映射表 (num: 52) 21 | F2H_ASCII_LETTERS = { 22 | ord(char) + 65248: ord(char) 23 | for char in string.ascii_letters 24 | } 25 | 26 | # 英文字符半角 -> 全角映射表 27 | H2F_ASCII_LETTERS = {value: key for key, value in F2H_ASCII_LETTERS.items()} 28 | 29 | # 数字字符全角 -> 半角映射表 (num: 10) 30 | F2H_DIGITS = {ord(char) + 65248: ord(char) for char in string.digits} 31 | # 数字字符半角 -> 全角映射表 32 | H2F_DIGITS = {value: key for key, value in F2H_DIGITS.items()} 33 | 34 | # 标点符号全角 -> 半角映射表 (num: 32) 35 | F2H_PUNCTUATIONS = {ord(char) + 65248: ord(char) for char in string.punctuation} 36 | # 标点符号半角 -> 全角映射表 37 | H2F_PUNCTUATIONS = {value: key for key, value in F2H_PUNCTUATIONS.items()} 38 | 39 | # 空格 (num: 1) 40 | F2H_SPACE = {'\u3000': ' '} 41 | H2F_SPACE = {' ': '\u3000'} 42 | 43 | # 非"有拼音的汉字"的字符串,可用于NSW提取 44 | if SUPPORT_UCS4: 45 | RE_NSW = re.compile(r'(?:[^' 46 | r'\u3007' # 〇 47 | r'\u3400-\u4dbf' # CJK扩展A:[3400-4DBF] 48 | r'\u4e00-\u9fff' # CJK基本:[4E00-9FFF] 49 | r'\uf900-\ufaff' # CJK兼容:[F900-FAFF] 50 | r'\U00020000-\U0002A6DF' # CJK扩展B:[20000-2A6DF] 51 | r'\U0002A703-\U0002B73F' # CJK扩展C:[2A700-2B73F] 52 | r'\U0002B740-\U0002B81D' # CJK扩展D:[2B740-2B81D] 53 | r'\U0002F80A-\U0002FA1F' # CJK兼容扩展:[2F800-2FA1F] 54 | r'])+') 55 | else: 56 | RE_NSW = re.compile( # pragma: no cover 57 | r'(?:[^' 58 | r'\u3007' # 〇 59 | r'\u3400-\u4dbf' # CJK扩展A:[3400-4DBF] 60 | r'\u4e00-\u9fff' # CJK基本:[4E00-9FFF] 61 | r'\uf900-\ufaff' # CJK兼容:[F900-FAFF] 62 | r'])+') 63 | -------------------------------------------------------------------------------- /src/GPT_SoVITS/text/zh_normalization/phonecode.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | import re 15 | 16 | from .num import verbalize_digit 17 | 18 | # 规范化固话/手机号码 19 | # 手机 20 | # http://www.jihaoba.com/news/show/13680 21 | # 移动:139、138、137、136、135、134、159、158、157、150、151、152、188、187、182、183、184、178、198 22 | # 联通:130、131、132、156、155、186、185、176 23 | # 电信:133、153、189、180、181、177 24 | RE_MOBILE_PHONE = re.compile( 25 | r"(? 
str: 34 | if mobile: 35 | sp_parts = phone_string.strip('+').split() 36 | result = ','.join( 37 | [verbalize_digit(part, alt_one=True) for part in sp_parts]) 38 | return result 39 | else: 40 | sil_parts = phone_string.split('-') 41 | result = ','.join( 42 | [verbalize_digit(part, alt_one=True) for part in sil_parts]) 43 | return result 44 | 45 | 46 | def replace_phone(match) -> str: 47 | """ 48 | Args: 49 | match (re.Match) 50 | Returns: 51 | str 52 | """ 53 | return phone2str(match.group(0), mobile=False) 54 | 55 | 56 | def replace_mobile(match) -> str: 57 | """ 58 | Args: 59 | match (re.Match) 60 | Returns: 61 | str 62 | """ 63 | return phone2str(match.group(0)) 64 | -------------------------------------------------------------------------------- /src/GPT_SoVITS/text/zh_normalization/quantifier.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | import re 15 | 16 | from .num import num2str 17 | 18 | # 温度表达式,温度会影响负号的读法 19 | # -3°C 零下三度 20 | RE_TEMPERATURE = re.compile(r'(-?)(\d+(\.\d+)?)(°C|℃|度|摄氏度)') 21 | measure_dict = { 22 | "cm2": "平方厘米", 23 | "cm²": "平方厘米", 24 | "cm3": "立方厘米", 25 | "cm³": "立方厘米", 26 | "cm": "厘米", 27 | "db": "分贝", 28 | "ds": "毫秒", 29 | "kg": "千克", 30 | "km": "千米", 31 | "m2": "平方米", 32 | "m²": "平方米", 33 | "m³": "立方米", 34 | "m3": "立方米", 35 | "ml": "毫升", 36 | "m": "米", 37 | "mm": "毫米", 38 | "s": "秒" 39 | } 40 | 41 | 42 | def replace_temperature(match) -> str: 43 | """ 44 | Args: 45 | match (re.Match) 46 | Returns: 47 | str 48 | """ 49 | sign = match.group(1) 50 | temperature = match.group(2) 51 | unit = match.group(3) 52 | sign: str = "零下" if sign else "" 53 | temperature: str = num2str(temperature) 54 | unit: str = "摄氏度" if unit == "摄氏度" else "度" 55 | result = f"{sign}{temperature}{unit}" 56 | return result 57 | 58 | 59 | def replace_measure(sentence) -> str: 60 | for q_notation in measure_dict: 61 | if q_notation in sentence: 62 | sentence = sentence.replace(q_notation, measure_dict[q_notation]) 63 | return sentence 64 | -------------------------------------------------------------------------------- /src/GPT_SoVITS/tools/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Henry-23/VideoChat/3d6700c5f2c4e49220501898f96049579b39345e/src/GPT_SoVITS/tools/__init__.py -------------------------------------------------------------------------------- /src/GPT_SoVITS/tools/asr/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Henry-23/VideoChat/3d6700c5f2c4e49220501898f96049579b39345e/src/GPT_SoVITS/tools/asr/__init__.py -------------------------------------------------------------------------------- /src/GPT_SoVITS/tools/asr/config.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | 
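# Editor's note (added comment, not in the original file): check_fw_local_models()
# below looks for already-downloaded Faster Whisper checkpoints under
# tools/asr/models/ and tags those sizes with a "-local" suffix so a caller can
# pick the local copy instead of downloading again. asr_dict then maps a
# human-readable tool name to its supported languages, model sizes, launcher
# script and precisions; a typical (assumed) lookup would be
# asr_dict["Faster Whisper (多语种)"]["path"], which yields "fasterwhisper_asr.py".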
def check_fw_local_models(): 4 | ''' 5 | 启动时检查本地是否有 Faster Whisper 模型. 6 | ''' 7 | model_size_list = [ 8 | "tiny", "tiny.en", 9 | "base", "base.en", 10 | "small", "small.en", 11 | "medium", "medium.en", 12 | "large", "large-v1", 13 | "large-v2", "large-v3"] 14 | for i, size in enumerate(model_size_list): 15 | if os.path.exists(f'tools/asr/models/faster-whisper-{size}'): 16 | model_size_list[i] = size + '-local' 17 | return model_size_list 18 | 19 | asr_dict = { 20 | "达摩 ASR (中文)": { 21 | 'lang': ['zh','yue'], 22 | 'size': ['large'], 23 | 'path': 'funasr_asr.py', 24 | 'precision': ['float32'] 25 | }, 26 | "Faster Whisper (多语种)": { 27 | 'lang': ['auto', 'zh', 'en', 'ja', 'ko', 'yue'], 28 | 'size': check_fw_local_models(), 29 | 'path': 'fasterwhisper_asr.py', 30 | 'precision': ['float32', 'float16', 'int8'] 31 | }, 32 | } 33 | 34 | -------------------------------------------------------------------------------- /src/GPT_SoVITS/tools/asr/models/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | !.gitignore -------------------------------------------------------------------------------- /src/GPT_SoVITS/tools/cmd-denoise.py: -------------------------------------------------------------------------------- 1 | import os,argparse 2 | import traceback 3 | 4 | from modelscope.pipelines import pipeline 5 | from modelscope.utils.constant import Tasks 6 | from tqdm import tqdm 7 | 8 | path_denoise = 'tools/denoise-model/speech_frcrn_ans_cirm_16k' 9 | path_denoise = path_denoise if os.path.exists(path_denoise) else "damo/speech_frcrn_ans_cirm_16k" 10 | ans = pipeline(Tasks.acoustic_noise_suppression,model=path_denoise) 11 | def execute_denoise(input_folder,output_folder): 12 | os.makedirs(output_folder,exist_ok=True) 13 | # print(input_folder) 14 | # print(list(os.listdir(input_folder).sort())) 15 | for name in tqdm(os.listdir(input_folder)): 16 | try: 17 | ans("%s/%s"%(input_folder,name),output_path='%s/%s'%(output_folder,name)) 18 | except: 19 | traceback.print_exc() 20 | 21 | if __name__ == '__main__': 22 | parser = argparse.ArgumentParser() 23 | parser.add_argument("-i", "--input_folder", type=str, required=True, 24 | help="Path to the folder containing WAV files.") 25 | parser.add_argument("-o", "--output_folder", type=str, required=True, 26 | help="Output folder to store transcriptions.") 27 | parser.add_argument("-p", "--precision", type=str, default='float16', choices=['float16','float32'], 28 | help="fp16 or fp32")#还没接入 29 | cmd = parser.parse_args() 30 | execute_denoise( 31 | input_folder = cmd.input_folder, 32 | output_folder = cmd.output_folder, 33 | ) -------------------------------------------------------------------------------- /src/GPT_SoVITS/tools/denoise-model/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | !.gitignore 3 | -------------------------------------------------------------------------------- /src/GPT_SoVITS/tools/i18n/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Henry-23/VideoChat/3d6700c5f2c4e49220501898f96049579b39345e/src/GPT_SoVITS/tools/i18n/__init__.py -------------------------------------------------------------------------------- /src/GPT_SoVITS/tools/i18n/i18n.py: -------------------------------------------------------------------------------- 1 | import json 2 | import locale 3 | import os 4 | 5 | I18N_JSON_DIR : os.PathLike = 
os.path.join(os.path.dirname(os.path.relpath(__file__)), 'locale') 6 | 7 | def load_language_list(language): 8 | with open(os.path.join(I18N_JSON_DIR, f"{language}.json"), "r", encoding="utf-8") as f: 9 | language_list = json.load(f) 10 | return language_list 11 | 12 | def scan_language_list(): 13 | language_list = [] 14 | for name in os.listdir(I18N_JSON_DIR): 15 | if name.endswith(".json"):language_list.append(name.split('.')[0]) 16 | return language_list 17 | 18 | class I18nAuto: 19 | def __init__(self, language=None): 20 | if language in ["Auto", None]: 21 | language = locale.getdefaultlocale()[0] 22 | # getlocale can't identify the system's language ((None, None)) 23 | if not os.path.exists(os.path.join(I18N_JSON_DIR, f"{language}.json")): 24 | language = "en_US" 25 | self.language = language 26 | self.language_map = load_language_list(language) 27 | 28 | def __call__(self, key): 29 | return self.language_map.get(key, key) 30 | 31 | def __repr__(self): 32 | return "Use Language: " + self.language 33 | 34 | if __name__ == "__main__": 35 | i18n = I18nAuto(language='en_US') 36 | print(i18n) -------------------------------------------------------------------------------- /src/GPT_SoVITS/tools/slice_audio.py: -------------------------------------------------------------------------------- 1 | import os,sys,numpy as np 2 | import traceback 3 | from scipy.io import wavfile 4 | # parent_directory = os.path.dirname(os.path.abspath(__file__)) 5 | # sys.path.append(parent_directory) 6 | from src.GPT_SoVITS.tools.my_utils import load_audio 7 | from slicer2 import Slicer 8 | 9 | def slice(inp,opt_root,threshold,min_length,min_interval,hop_size,max_sil_kept,_max,alpha,i_part,all_part): 10 | os.makedirs(opt_root,exist_ok=True) 11 | if os.path.isfile(inp): 12 | input=[inp] 13 | elif os.path.isdir(inp): 14 | input=[os.path.join(inp, name) for name in sorted(list(os.listdir(inp)))] 15 | else: 16 | return "输入路径存在但既不是文件也不是文件夹" 17 | slicer = Slicer( 18 | sr=32000, # 长音频采样率 19 | threshold= int(threshold), # 音量小于这个值视作静音的备选切割点 20 | min_length= int(min_length), # 每段最小多长,如果第一段太短一直和后面段连起来直到超过这个值 21 | min_interval= int(min_interval), # 最短切割间隔 22 | hop_size= int(hop_size), # 怎么算音量曲线,越小精度越大计算量越高(不是精度越大效果越好) 23 | max_sil_kept= int(max_sil_kept), # 切完后静音最多留多长 24 | ) 25 | _max=float(_max) 26 | alpha=float(alpha) 27 | for inp_path in input[int(i_part)::int(all_part)]: 28 | # print(inp_path) 29 | try: 30 | name = os.path.basename(inp_path) 31 | audio = load_audio(inp_path, 32000) 32 | # print(audio.shape) 33 | for chunk, start, end in slicer.slice(audio): # start和end是帧数 34 | tmp_max = np.abs(chunk).max() 35 | if(tmp_max>1):chunk/=tmp_max 36 | chunk = (chunk / tmp_max * (_max * alpha)) + (1 - alpha) * chunk 37 | wavfile.write( 38 | "%s/%s_%010d_%010d.wav" % (opt_root, name, start, end), 39 | 32000, 40 | # chunk.astype(np.float32), 41 | (chunk * 32767).astype(np.int16), 42 | ) 43 | except: 44 | print(inp_path,"->fail->",traceback.format_exc()) 45 | return "执行完毕,请检查输出文件" 46 | 47 | print(slice(*sys.argv[1:])) 48 | 49 | -------------------------------------------------------------------------------- /src/GPT_SoVITS/tools/uvr5/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Henry-23/VideoChat/3d6700c5f2c4e49220501898f96049579b39345e/src/GPT_SoVITS/tools/uvr5/__init__.py -------------------------------------------------------------------------------- /src/GPT_SoVITS/tools/uvr5/bs_roformer/__init__.py: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/Henry-23/VideoChat/3d6700c5f2c4e49220501898f96049579b39345e/src/GPT_SoVITS/tools/uvr5/bs_roformer/__init__.py -------------------------------------------------------------------------------- /src/GPT_SoVITS/tools/uvr5/lib/lib_v5/model_param_init.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import pathlib 4 | 5 | default_param = {} 6 | default_param["bins"] = 768 7 | default_param["unstable_bins"] = 9 # training only 8 | default_param["reduction_bins"] = 762 # training only 9 | default_param["sr"] = 44100 10 | default_param["pre_filter_start"] = 757 11 | default_param["pre_filter_stop"] = 768 12 | default_param["band"] = {} 13 | 14 | 15 | default_param["band"][1] = { 16 | "sr": 11025, 17 | "hl": 128, 18 | "n_fft": 960, 19 | "crop_start": 0, 20 | "crop_stop": 245, 21 | "lpf_start": 61, # inference only 22 | "res_type": "polyphase", 23 | } 24 | 25 | default_param["band"][2] = { 26 | "sr": 44100, 27 | "hl": 512, 28 | "n_fft": 1536, 29 | "crop_start": 24, 30 | "crop_stop": 547, 31 | "hpf_start": 81, # inference only 32 | "res_type": "sinc_best", 33 | } 34 | 35 | 36 | def int_keys(d): 37 | r = {} 38 | for k, v in d: 39 | if k.isdigit(): 40 | k = int(k) 41 | r[k] = v 42 | return r 43 | 44 | 45 | class ModelParameters(object): 46 | def __init__(self, config_path=""): 47 | if ".pth" == pathlib.Path(config_path).suffix: 48 | import zipfile 49 | 50 | with zipfile.ZipFile(config_path, "r") as zip: 51 | self.param = json.loads( 52 | zip.read("param.json"), object_pairs_hook=int_keys 53 | ) 54 | elif ".json" == pathlib.Path(config_path).suffix: 55 | with open(config_path, "r") as f: 56 | self.param = json.loads(f.read(), object_pairs_hook=int_keys) 57 | else: 58 | self.param = default_param 59 | 60 | for k in [ 61 | "mid_side", 62 | "mid_side_b", 63 | "mid_side_b2", 64 | "stereo_w", 65 | "stereo_n", 66 | "reverse", 67 | ]: 68 | if not k in self.param: 69 | self.param[k] = False 70 | -------------------------------------------------------------------------------- /src/GPT_SoVITS/tools/uvr5/lib/lib_v5/modelparams/1band_sr16000_hl512.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 1024, 3 | "unstable_bins": 0, 4 | "reduction_bins": 0, 5 | "band": { 6 | "1": { 7 | "sr": 16000, 8 | "hl": 512, 9 | "n_fft": 2048, 10 | "crop_start": 0, 11 | "crop_stop": 1024, 12 | "hpf_start": -1, 13 | "res_type": "sinc_best" 14 | } 15 | }, 16 | "sr": 16000, 17 | "pre_filter_start": 1023, 18 | "pre_filter_stop": 1024 19 | } -------------------------------------------------------------------------------- /src/GPT_SoVITS/tools/uvr5/lib/lib_v5/modelparams/1band_sr32000_hl512.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 1024, 3 | "unstable_bins": 0, 4 | "reduction_bins": 0, 5 | "band": { 6 | "1": { 7 | "sr": 32000, 8 | "hl": 512, 9 | "n_fft": 2048, 10 | "crop_start": 0, 11 | "crop_stop": 1024, 12 | "hpf_start": -1, 13 | "res_type": "kaiser_fast" 14 | } 15 | }, 16 | "sr": 32000, 17 | "pre_filter_start": 1000, 18 | "pre_filter_stop": 1021 19 | } -------------------------------------------------------------------------------- /src/GPT_SoVITS/tools/uvr5/lib/lib_v5/modelparams/1band_sr33075_hl384.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 1024, 3 | "unstable_bins": 0, 4 | 
"reduction_bins": 0, 5 | "band": { 6 | "1": { 7 | "sr": 33075, 8 | "hl": 384, 9 | "n_fft": 2048, 10 | "crop_start": 0, 11 | "crop_stop": 1024, 12 | "hpf_start": -1, 13 | "res_type": "sinc_best" 14 | } 15 | }, 16 | "sr": 33075, 17 | "pre_filter_start": 1000, 18 | "pre_filter_stop": 1021 19 | } -------------------------------------------------------------------------------- /src/GPT_SoVITS/tools/uvr5/lib/lib_v5/modelparams/1band_sr44100_hl1024.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 1024, 3 | "unstable_bins": 0, 4 | "reduction_bins": 0, 5 | "band": { 6 | "1": { 7 | "sr": 44100, 8 | "hl": 1024, 9 | "n_fft": 2048, 10 | "crop_start": 0, 11 | "crop_stop": 1024, 12 | "hpf_start": -1, 13 | "res_type": "sinc_best" 14 | } 15 | }, 16 | "sr": 44100, 17 | "pre_filter_start": 1023, 18 | "pre_filter_stop": 1024 19 | } -------------------------------------------------------------------------------- /src/GPT_SoVITS/tools/uvr5/lib/lib_v5/modelparams/1band_sr44100_hl256.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 256, 3 | "unstable_bins": 0, 4 | "reduction_bins": 0, 5 | "band": { 6 | "1": { 7 | "sr": 44100, 8 | "hl": 256, 9 | "n_fft": 512, 10 | "crop_start": 0, 11 | "crop_stop": 256, 12 | "hpf_start": -1, 13 | "res_type": "sinc_best" 14 | } 15 | }, 16 | "sr": 44100, 17 | "pre_filter_start": 256, 18 | "pre_filter_stop": 256 19 | } -------------------------------------------------------------------------------- /src/GPT_SoVITS/tools/uvr5/lib/lib_v5/modelparams/1band_sr44100_hl512.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 1024, 3 | "unstable_bins": 0, 4 | "reduction_bins": 0, 5 | "band": { 6 | "1": { 7 | "sr": 44100, 8 | "hl": 512, 9 | "n_fft": 2048, 10 | "crop_start": 0, 11 | "crop_stop": 1024, 12 | "hpf_start": -1, 13 | "res_type": "sinc_best" 14 | } 15 | }, 16 | "sr": 44100, 17 | "pre_filter_start": 1023, 18 | "pre_filter_stop": 1024 19 | } -------------------------------------------------------------------------------- /src/GPT_SoVITS/tools/uvr5/lib/lib_v5/modelparams/1band_sr44100_hl512_cut.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 1024, 3 | "unstable_bins": 0, 4 | "reduction_bins": 0, 5 | "band": { 6 | "1": { 7 | "sr": 44100, 8 | "hl": 512, 9 | "n_fft": 2048, 10 | "crop_start": 0, 11 | "crop_stop": 700, 12 | "hpf_start": -1, 13 | "res_type": "sinc_best" 14 | } 15 | }, 16 | "sr": 44100, 17 | "pre_filter_start": 1023, 18 | "pre_filter_stop": 700 19 | } -------------------------------------------------------------------------------- /src/GPT_SoVITS/tools/uvr5/lib/lib_v5/modelparams/2band_32000.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 768, 3 | "unstable_bins": 7, 4 | "reduction_bins": 705, 5 | "band": { 6 | "1": { 7 | "sr": 6000, 8 | "hl": 66, 9 | "n_fft": 512, 10 | "crop_start": 0, 11 | "crop_stop": 240, 12 | "lpf_start": 60, 13 | "lpf_stop": 118, 14 | "res_type": "sinc_fastest" 15 | }, 16 | "2": { 17 | "sr": 32000, 18 | "hl": 352, 19 | "n_fft": 1024, 20 | "crop_start": 22, 21 | "crop_stop": 505, 22 | "hpf_start": 44, 23 | "hpf_stop": 23, 24 | "res_type": "sinc_medium" 25 | } 26 | }, 27 | "sr": 32000, 28 | "pre_filter_start": 710, 29 | "pre_filter_stop": 731 30 | } 31 | -------------------------------------------------------------------------------- 
/src/GPT_SoVITS/tools/uvr5/lib/lib_v5/modelparams/2band_44100_lofi.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 512, 3 | "unstable_bins": 7, 4 | "reduction_bins": 510, 5 | "band": { 6 | "1": { 7 | "sr": 11025, 8 | "hl": 160, 9 | "n_fft": 768, 10 | "crop_start": 0, 11 | "crop_stop": 192, 12 | "lpf_start": 41, 13 | "lpf_stop": 139, 14 | "res_type": "sinc_fastest" 15 | }, 16 | "2": { 17 | "sr": 44100, 18 | "hl": 640, 19 | "n_fft": 1024, 20 | "crop_start": 10, 21 | "crop_stop": 320, 22 | "hpf_start": 47, 23 | "hpf_stop": 15, 24 | "res_type": "sinc_medium" 25 | } 26 | }, 27 | "sr": 44100, 28 | "pre_filter_start": 510, 29 | "pre_filter_stop": 512 30 | } 31 | -------------------------------------------------------------------------------- /src/GPT_SoVITS/tools/uvr5/lib/lib_v5/modelparams/2band_48000.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 768, 3 | "unstable_bins": 7, 4 | "reduction_bins": 705, 5 | "band": { 6 | "1": { 7 | "sr": 6000, 8 | "hl": 66, 9 | "n_fft": 512, 10 | "crop_start": 0, 11 | "crop_stop": 240, 12 | "lpf_start": 60, 13 | "lpf_stop": 240, 14 | "res_type": "sinc_fastest" 15 | }, 16 | "2": { 17 | "sr": 48000, 18 | "hl": 528, 19 | "n_fft": 1536, 20 | "crop_start": 22, 21 | "crop_stop": 505, 22 | "hpf_start": 82, 23 | "hpf_stop": 22, 24 | "res_type": "sinc_medium" 25 | } 26 | }, 27 | "sr": 48000, 28 | "pre_filter_start": 710, 29 | "pre_filter_stop": 731 30 | } -------------------------------------------------------------------------------- /src/GPT_SoVITS/tools/uvr5/lib/lib_v5/modelparams/3band_44100.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 768, 3 | "unstable_bins": 5, 4 | "reduction_bins": 733, 5 | "band": { 6 | "1": { 7 | "sr": 11025, 8 | "hl": 128, 9 | "n_fft": 768, 10 | "crop_start": 0, 11 | "crop_stop": 278, 12 | "lpf_start": 28, 13 | "lpf_stop": 140, 14 | "res_type": "polyphase" 15 | }, 16 | "2": { 17 | "sr": 22050, 18 | "hl": 256, 19 | "n_fft": 768, 20 | "crop_start": 14, 21 | "crop_stop": 322, 22 | "hpf_start": 70, 23 | "hpf_stop": 14, 24 | "lpf_start": 283, 25 | "lpf_stop": 314, 26 | "res_type": "polyphase" 27 | }, 28 | "3": { 29 | "sr": 44100, 30 | "hl": 512, 31 | "n_fft": 768, 32 | "crop_start": 131, 33 | "crop_stop": 313, 34 | "hpf_start": 154, 35 | "hpf_stop": 141, 36 | "res_type": "sinc_medium" 37 | } 38 | }, 39 | "sr": 44100, 40 | "pre_filter_start": 757, 41 | "pre_filter_stop": 768 42 | } 43 | -------------------------------------------------------------------------------- /src/GPT_SoVITS/tools/uvr5/lib/lib_v5/modelparams/3band_44100_mid.json: -------------------------------------------------------------------------------- 1 | { 2 | "mid_side": true, 3 | "bins": 768, 4 | "unstable_bins": 5, 5 | "reduction_bins": 733, 6 | "band": { 7 | "1": { 8 | "sr": 11025, 9 | "hl": 128, 10 | "n_fft": 768, 11 | "crop_start": 0, 12 | "crop_stop": 278, 13 | "lpf_start": 28, 14 | "lpf_stop": 140, 15 | "res_type": "polyphase" 16 | }, 17 | "2": { 18 | "sr": 22050, 19 | "hl": 256, 20 | "n_fft": 768, 21 | "crop_start": 14, 22 | "crop_stop": 322, 23 | "hpf_start": 70, 24 | "hpf_stop": 14, 25 | "lpf_start": 283, 26 | "lpf_stop": 314, 27 | "res_type": "polyphase" 28 | }, 29 | "3": { 30 | "sr": 44100, 31 | "hl": 512, 32 | "n_fft": 768, 33 | "crop_start": 131, 34 | "crop_stop": 313, 35 | "hpf_start": 154, 36 | "hpf_stop": 141, 37 | "res_type": "sinc_medium" 38 | } 39 | }, 40 | "sr": 44100, 41 | 
"pre_filter_start": 757, 42 | "pre_filter_stop": 768 43 | } 44 | -------------------------------------------------------------------------------- /src/GPT_SoVITS/tools/uvr5/lib/lib_v5/modelparams/3band_44100_msb2.json: -------------------------------------------------------------------------------- 1 | { 2 | "mid_side_b2": true, 3 | "bins": 640, 4 | "unstable_bins": 7, 5 | "reduction_bins": 565, 6 | "band": { 7 | "1": { 8 | "sr": 11025, 9 | "hl": 108, 10 | "n_fft": 1024, 11 | "crop_start": 0, 12 | "crop_stop": 187, 13 | "lpf_start": 92, 14 | "lpf_stop": 186, 15 | "res_type": "polyphase" 16 | }, 17 | "2": { 18 | "sr": 22050, 19 | "hl": 216, 20 | "n_fft": 768, 21 | "crop_start": 0, 22 | "crop_stop": 212, 23 | "hpf_start": 68, 24 | "hpf_stop": 34, 25 | "lpf_start": 174, 26 | "lpf_stop": 209, 27 | "res_type": "polyphase" 28 | }, 29 | "3": { 30 | "sr": 44100, 31 | "hl": 432, 32 | "n_fft": 640, 33 | "crop_start": 66, 34 | "crop_stop": 307, 35 | "hpf_start": 86, 36 | "hpf_stop": 72, 37 | "res_type": "kaiser_fast" 38 | } 39 | }, 40 | "sr": 44100, 41 | "pre_filter_start": 639, 42 | "pre_filter_stop": 640 43 | } 44 | -------------------------------------------------------------------------------- /src/GPT_SoVITS/tools/uvr5/lib/lib_v5/modelparams/4band_44100.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 768, 3 | "unstable_bins": 7, 4 | "reduction_bins": 668, 5 | "band": { 6 | "1": { 7 | "sr": 11025, 8 | "hl": 128, 9 | "n_fft": 1024, 10 | "crop_start": 0, 11 | "crop_stop": 186, 12 | "lpf_start": 37, 13 | "lpf_stop": 73, 14 | "res_type": "polyphase" 15 | }, 16 | "2": { 17 | "sr": 11025, 18 | "hl": 128, 19 | "n_fft": 512, 20 | "crop_start": 4, 21 | "crop_stop": 185, 22 | "hpf_start": 36, 23 | "hpf_stop": 18, 24 | "lpf_start": 93, 25 | "lpf_stop": 185, 26 | "res_type": "polyphase" 27 | }, 28 | "3": { 29 | "sr": 22050, 30 | "hl": 256, 31 | "n_fft": 512, 32 | "crop_start": 46, 33 | "crop_stop": 186, 34 | "hpf_start": 93, 35 | "hpf_stop": 46, 36 | "lpf_start": 164, 37 | "lpf_stop": 186, 38 | "res_type": "polyphase" 39 | }, 40 | "4": { 41 | "sr": 44100, 42 | "hl": 512, 43 | "n_fft": 768, 44 | "crop_start": 121, 45 | "crop_stop": 382, 46 | "hpf_start": 138, 47 | "hpf_stop": 123, 48 | "res_type": "sinc_medium" 49 | } 50 | }, 51 | "sr": 44100, 52 | "pre_filter_start": 740, 53 | "pre_filter_stop": 768 54 | } 55 | -------------------------------------------------------------------------------- /src/GPT_SoVITS/tools/uvr5/lib/lib_v5/modelparams/4band_44100_mid.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 768, 3 | "unstable_bins": 7, 4 | "mid_side": true, 5 | "reduction_bins": 668, 6 | "band": { 7 | "1": { 8 | "sr": 11025, 9 | "hl": 128, 10 | "n_fft": 1024, 11 | "crop_start": 0, 12 | "crop_stop": 186, 13 | "lpf_start": 37, 14 | "lpf_stop": 73, 15 | "res_type": "polyphase" 16 | }, 17 | "2": { 18 | "sr": 11025, 19 | "hl": 128, 20 | "n_fft": 512, 21 | "crop_start": 4, 22 | "crop_stop": 185, 23 | "hpf_start": 36, 24 | "hpf_stop": 18, 25 | "lpf_start": 93, 26 | "lpf_stop": 185, 27 | "res_type": "polyphase" 28 | }, 29 | "3": { 30 | "sr": 22050, 31 | "hl": 256, 32 | "n_fft": 512, 33 | "crop_start": 46, 34 | "crop_stop": 186, 35 | "hpf_start": 93, 36 | "hpf_stop": 46, 37 | "lpf_start": 164, 38 | "lpf_stop": 186, 39 | "res_type": "polyphase" 40 | }, 41 | "4": { 42 | "sr": 44100, 43 | "hl": 512, 44 | "n_fft": 768, 45 | "crop_start": 121, 46 | "crop_stop": 382, 47 | "hpf_start": 138, 48 | "hpf_stop": 123, 49 | 
"res_type": "sinc_medium" 50 | } 51 | }, 52 | "sr": 44100, 53 | "pre_filter_start": 740, 54 | "pre_filter_stop": 768 55 | } 56 | -------------------------------------------------------------------------------- /src/GPT_SoVITS/tools/uvr5/lib/lib_v5/modelparams/4band_44100_msb.json: -------------------------------------------------------------------------------- 1 | { 2 | "mid_side_b": true, 3 | "bins": 768, 4 | "unstable_bins": 7, 5 | "reduction_bins": 668, 6 | "band": { 7 | "1": { 8 | "sr": 11025, 9 | "hl": 128, 10 | "n_fft": 1024, 11 | "crop_start": 0, 12 | "crop_stop": 186, 13 | "lpf_start": 37, 14 | "lpf_stop": 73, 15 | "res_type": "polyphase" 16 | }, 17 | "2": { 18 | "sr": 11025, 19 | "hl": 128, 20 | "n_fft": 512, 21 | "crop_start": 4, 22 | "crop_stop": 185, 23 | "hpf_start": 36, 24 | "hpf_stop": 18, 25 | "lpf_start": 93, 26 | "lpf_stop": 185, 27 | "res_type": "polyphase" 28 | }, 29 | "3": { 30 | "sr": 22050, 31 | "hl": 256, 32 | "n_fft": 512, 33 | "crop_start": 46, 34 | "crop_stop": 186, 35 | "hpf_start": 93, 36 | "hpf_stop": 46, 37 | "lpf_start": 164, 38 | "lpf_stop": 186, 39 | "res_type": "polyphase" 40 | }, 41 | "4": { 42 | "sr": 44100, 43 | "hl": 512, 44 | "n_fft": 768, 45 | "crop_start": 121, 46 | "crop_stop": 382, 47 | "hpf_start": 138, 48 | "hpf_stop": 123, 49 | "res_type": "sinc_medium" 50 | } 51 | }, 52 | "sr": 44100, 53 | "pre_filter_start": 740, 54 | "pre_filter_stop": 768 55 | } -------------------------------------------------------------------------------- /src/GPT_SoVITS/tools/uvr5/lib/lib_v5/modelparams/4band_44100_msb2.json: -------------------------------------------------------------------------------- 1 | { 2 | "mid_side_b": true, 3 | "bins": 768, 4 | "unstable_bins": 7, 5 | "reduction_bins": 668, 6 | "band": { 7 | "1": { 8 | "sr": 11025, 9 | "hl": 128, 10 | "n_fft": 1024, 11 | "crop_start": 0, 12 | "crop_stop": 186, 13 | "lpf_start": 37, 14 | "lpf_stop": 73, 15 | "res_type": "polyphase" 16 | }, 17 | "2": { 18 | "sr": 11025, 19 | "hl": 128, 20 | "n_fft": 512, 21 | "crop_start": 4, 22 | "crop_stop": 185, 23 | "hpf_start": 36, 24 | "hpf_stop": 18, 25 | "lpf_start": 93, 26 | "lpf_stop": 185, 27 | "res_type": "polyphase" 28 | }, 29 | "3": { 30 | "sr": 22050, 31 | "hl": 256, 32 | "n_fft": 512, 33 | "crop_start": 46, 34 | "crop_stop": 186, 35 | "hpf_start": 93, 36 | "hpf_stop": 46, 37 | "lpf_start": 164, 38 | "lpf_stop": 186, 39 | "res_type": "polyphase" 40 | }, 41 | "4": { 42 | "sr": 44100, 43 | "hl": 512, 44 | "n_fft": 768, 45 | "crop_start": 121, 46 | "crop_stop": 382, 47 | "hpf_start": 138, 48 | "hpf_stop": 123, 49 | "res_type": "sinc_medium" 50 | } 51 | }, 52 | "sr": 44100, 53 | "pre_filter_start": 740, 54 | "pre_filter_stop": 768 55 | } -------------------------------------------------------------------------------- /src/GPT_SoVITS/tools/uvr5/lib/lib_v5/modelparams/4band_44100_reverse.json: -------------------------------------------------------------------------------- 1 | { 2 | "reverse": true, 3 | "bins": 768, 4 | "unstable_bins": 7, 5 | "reduction_bins": 668, 6 | "band": { 7 | "1": { 8 | "sr": 11025, 9 | "hl": 128, 10 | "n_fft": 1024, 11 | "crop_start": 0, 12 | "crop_stop": 186, 13 | "lpf_start": 37, 14 | "lpf_stop": 73, 15 | "res_type": "polyphase" 16 | }, 17 | "2": { 18 | "sr": 11025, 19 | "hl": 128, 20 | "n_fft": 512, 21 | "crop_start": 4, 22 | "crop_stop": 185, 23 | "hpf_start": 36, 24 | "hpf_stop": 18, 25 | "lpf_start": 93, 26 | "lpf_stop": 185, 27 | "res_type": "polyphase" 28 | }, 29 | "3": { 30 | "sr": 22050, 31 | "hl": 256, 32 | "n_fft": 512, 33 | 
"crop_start": 46, 34 | "crop_stop": 186, 35 | "hpf_start": 93, 36 | "hpf_stop": 46, 37 | "lpf_start": 164, 38 | "lpf_stop": 186, 39 | "res_type": "polyphase" 40 | }, 41 | "4": { 42 | "sr": 44100, 43 | "hl": 512, 44 | "n_fft": 768, 45 | "crop_start": 121, 46 | "crop_stop": 382, 47 | "hpf_start": 138, 48 | "hpf_stop": 123, 49 | "res_type": "sinc_medium" 50 | } 51 | }, 52 | "sr": 44100, 53 | "pre_filter_start": 740, 54 | "pre_filter_stop": 768 55 | } -------------------------------------------------------------------------------- /src/GPT_SoVITS/tools/uvr5/lib/lib_v5/modelparams/4band_44100_sw.json: -------------------------------------------------------------------------------- 1 | { 2 | "stereo_w": true, 3 | "bins": 768, 4 | "unstable_bins": 7, 5 | "reduction_bins": 668, 6 | "band": { 7 | "1": { 8 | "sr": 11025, 9 | "hl": 128, 10 | "n_fft": 1024, 11 | "crop_start": 0, 12 | "crop_stop": 186, 13 | "lpf_start": 37, 14 | "lpf_stop": 73, 15 | "res_type": "polyphase" 16 | }, 17 | "2": { 18 | "sr": 11025, 19 | "hl": 128, 20 | "n_fft": 512, 21 | "crop_start": 4, 22 | "crop_stop": 185, 23 | "hpf_start": 36, 24 | "hpf_stop": 18, 25 | "lpf_start": 93, 26 | "lpf_stop": 185, 27 | "res_type": "polyphase" 28 | }, 29 | "3": { 30 | "sr": 22050, 31 | "hl": 256, 32 | "n_fft": 512, 33 | "crop_start": 46, 34 | "crop_stop": 186, 35 | "hpf_start": 93, 36 | "hpf_stop": 46, 37 | "lpf_start": 164, 38 | "lpf_stop": 186, 39 | "res_type": "polyphase" 40 | }, 41 | "4": { 42 | "sr": 44100, 43 | "hl": 512, 44 | "n_fft": 768, 45 | "crop_start": 121, 46 | "crop_stop": 382, 47 | "hpf_start": 138, 48 | "hpf_stop": 123, 49 | "res_type": "sinc_medium" 50 | } 51 | }, 52 | "sr": 44100, 53 | "pre_filter_start": 740, 54 | "pre_filter_stop": 768 55 | } -------------------------------------------------------------------------------- /src/GPT_SoVITS/tools/uvr5/lib/lib_v5/modelparams/4band_v2.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 672, 3 | "unstable_bins": 8, 4 | "reduction_bins": 637, 5 | "band": { 6 | "1": { 7 | "sr": 7350, 8 | "hl": 80, 9 | "n_fft": 640, 10 | "crop_start": 0, 11 | "crop_stop": 85, 12 | "lpf_start": 25, 13 | "lpf_stop": 53, 14 | "res_type": "polyphase" 15 | }, 16 | "2": { 17 | "sr": 7350, 18 | "hl": 80, 19 | "n_fft": 320, 20 | "crop_start": 4, 21 | "crop_stop": 87, 22 | "hpf_start": 25, 23 | "hpf_stop": 12, 24 | "lpf_start": 31, 25 | "lpf_stop": 62, 26 | "res_type": "polyphase" 27 | }, 28 | "3": { 29 | "sr": 14700, 30 | "hl": 160, 31 | "n_fft": 512, 32 | "crop_start": 17, 33 | "crop_stop": 216, 34 | "hpf_start": 48, 35 | "hpf_stop": 24, 36 | "lpf_start": 139, 37 | "lpf_stop": 210, 38 | "res_type": "polyphase" 39 | }, 40 | "4": { 41 | "sr": 44100, 42 | "hl": 480, 43 | "n_fft": 960, 44 | "crop_start": 78, 45 | "crop_stop": 383, 46 | "hpf_start": 130, 47 | "hpf_stop": 86, 48 | "res_type": "kaiser_fast" 49 | } 50 | }, 51 | "sr": 44100, 52 | "pre_filter_start": 668, 53 | "pre_filter_stop": 672 54 | } -------------------------------------------------------------------------------- /src/GPT_SoVITS/tools/uvr5/lib/lib_v5/modelparams/4band_v2_sn.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 672, 3 | "unstable_bins": 8, 4 | "reduction_bins": 637, 5 | "band": { 6 | "1": { 7 | "sr": 7350, 8 | "hl": 80, 9 | "n_fft": 640, 10 | "crop_start": 0, 11 | "crop_stop": 85, 12 | "lpf_start": 25, 13 | "lpf_stop": 53, 14 | "res_type": "polyphase" 15 | }, 16 | "2": { 17 | "sr": 7350, 18 | "hl": 80, 19 | "n_fft": 320, 20 
| "crop_start": 4, 21 | "crop_stop": 87, 22 | "hpf_start": 25, 23 | "hpf_stop": 12, 24 | "lpf_start": 31, 25 | "lpf_stop": 62, 26 | "res_type": "polyphase" 27 | }, 28 | "3": { 29 | "sr": 14700, 30 | "hl": 160, 31 | "n_fft": 512, 32 | "crop_start": 17, 33 | "crop_stop": 216, 34 | "hpf_start": 48, 35 | "hpf_stop": 24, 36 | "lpf_start": 139, 37 | "lpf_stop": 210, 38 | "res_type": "polyphase" 39 | }, 40 | "4": { 41 | "sr": 44100, 42 | "hl": 480, 43 | "n_fft": 960, 44 | "crop_start": 78, 45 | "crop_stop": 383, 46 | "hpf_start": 130, 47 | "hpf_stop": 86, 48 | "convert_channels": "stereo_n", 49 | "res_type": "kaiser_fast" 50 | } 51 | }, 52 | "sr": 44100, 53 | "pre_filter_start": 668, 54 | "pre_filter_stop": 672 55 | } -------------------------------------------------------------------------------- /src/GPT_SoVITS/tools/uvr5/lib/lib_v5/modelparams/4band_v3.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 672, 3 | "unstable_bins": 8, 4 | "reduction_bins": 530, 5 | "band": { 6 | "1": { 7 | "sr": 7350, 8 | "hl": 80, 9 | "n_fft": 640, 10 | "crop_start": 0, 11 | "crop_stop": 85, 12 | "lpf_start": 25, 13 | "lpf_stop": 53, 14 | "res_type": "polyphase" 15 | }, 16 | "2": { 17 | "sr": 7350, 18 | "hl": 80, 19 | "n_fft": 320, 20 | "crop_start": 4, 21 | "crop_stop": 87, 22 | "hpf_start": 25, 23 | "hpf_stop": 12, 24 | "lpf_start": 31, 25 | "lpf_stop": 62, 26 | "res_type": "polyphase" 27 | }, 28 | "3": { 29 | "sr": 14700, 30 | "hl": 160, 31 | "n_fft": 512, 32 | "crop_start": 17, 33 | "crop_stop": 216, 34 | "hpf_start": 48, 35 | "hpf_stop": 24, 36 | "lpf_start": 139, 37 | "lpf_stop": 210, 38 | "res_type": "polyphase" 39 | }, 40 | "4": { 41 | "sr": 44100, 42 | "hl": 480, 43 | "n_fft": 960, 44 | "crop_start": 78, 45 | "crop_stop": 383, 46 | "hpf_start": 130, 47 | "hpf_stop": 86, 48 | "res_type": "kaiser_fast" 49 | } 50 | }, 51 | "sr": 44100, 52 | "pre_filter_start": 668, 53 | "pre_filter_stop": 672 54 | } -------------------------------------------------------------------------------- /src/GPT_SoVITS/tools/uvr5/lib/lib_v5/modelparams/ensemble.json: -------------------------------------------------------------------------------- 1 | { 2 | "mid_side_b2": true, 3 | "bins": 1280, 4 | "unstable_bins": 7, 5 | "reduction_bins": 565, 6 | "band": { 7 | "1": { 8 | "sr": 11025, 9 | "hl": 108, 10 | "n_fft": 2048, 11 | "crop_start": 0, 12 | "crop_stop": 374, 13 | "lpf_start": 92, 14 | "lpf_stop": 186, 15 | "res_type": "polyphase" 16 | }, 17 | "2": { 18 | "sr": 22050, 19 | "hl": 216, 20 | "n_fft": 1536, 21 | "crop_start": 0, 22 | "crop_stop": 424, 23 | "hpf_start": 68, 24 | "hpf_stop": 34, 25 | "lpf_start": 348, 26 | "lpf_stop": 418, 27 | "res_type": "polyphase" 28 | }, 29 | "3": { 30 | "sr": 44100, 31 | "hl": 432, 32 | "n_fft": 1280, 33 | "crop_start": 132, 34 | "crop_stop": 614, 35 | "hpf_start": 172, 36 | "hpf_stop": 144, 37 | "res_type": "polyphase" 38 | } 39 | }, 40 | "sr": 44100, 41 | "pre_filter_start": 1280, 42 | "pre_filter_stop": 1280 43 | } -------------------------------------------------------------------------------- /src/GPT_SoVITS/tools/uvr5/uvr5_weights/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | !.gitignore 3 | -------------------------------------------------------------------------------- /src/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Henry-23/VideoChat/3d6700c5f2c4e49220501898f96049579b39345e/src/__init__.py 
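The band-split JSON presets above are consumed by the ModelParameters class defined earlier in model_param_init.py: a .json path is parsed with object_pairs_hook=int_keys so the band indices come back as integers, any missing stereo/mid-side flags are filled in as False, and an empty path falls back to the built-in default_param layout. A minimal sketch of both paths, not part of the repository; the import and file paths are assumptions based on the directory layout:

from src.GPT_SoVITS.tools.uvr5.lib.lib_v5.model_param_init import ModelParameters

# Load one of the presets shown above (path assumed from the repo layout).
mp = ModelParameters("src/GPT_SoVITS/tools/uvr5/lib/lib_v5/modelparams/2band_32000.json")
print(mp.param["sr"])                            # 32000, the preset's target sample rate
for band_idx, band in sorted(mp.param["band"].items()):
    # int_keys() converted the JSON keys "1", "2", ... into real ints
    print(band_idx, band["sr"], band["n_fft"], band["res_type"])
print(mp.param["mid_side"])                      # False, filled in by the constructor

# With no config path, the built-in default_param two-band layout is used instead.
default_mp = ModelParameters()
print(default_mp.param["bins"], sorted(default_mp.param["band"]))   # 768 [1, 2]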
-------------------------------------------------------------------------------- /src/asr.py: -------------------------------------------------------------------------------- 1 | from funasr import AutoModel 2 | from funasr.utils.postprocess_utils import rich_transcription_postprocess 3 | 4 | 5 | class Fun_ASR: 6 | def __init__(self, model = "iic/SenseVoiceSmall", vad_model = "fsmn-vad", vad_kwargs = {"max_single_segment_time": 30000}, device = "cuda", disable_update = True): 7 | self.model = AutoModel( 8 | model = model, 9 | # vad_model = vad_model, 10 | # vad_kwargs=vad_kwargs, 11 | device = device, 12 | disable_update = disable_update, 13 | ) 14 | 15 | def infer(self, audio_file): 16 | res = self.model.generate( 17 | input = audio_file, 18 | cache = {}, 19 | language = "auto", 20 | use_itn = True, 21 | batch_size_s = 60, 22 | merge_vad = True, 23 | merge_length_s = 15, 24 | ) 25 | text = rich_transcription_postprocess(res[0]["text"]) 26 | 27 | return text 28 | -------------------------------------------------------------------------------- /src/musetalk/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Henry-23/VideoChat/3d6700c5f2c4e49220501898f96049579b39345e/src/musetalk/__init__.py -------------------------------------------------------------------------------- /src/musetalk/models/unet.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import math 4 | import json 5 | 6 | from diffusers import UNet2DConditionModel 7 | import sys 8 | import time 9 | import numpy as np 10 | import os 11 | 12 | class PositionalEncoding(nn.Module): 13 | def __init__(self, d_model=384, max_len=5000): 14 | super(PositionalEncoding, self).__init__() 15 | pe = torch.zeros(max_len, d_model) 16 | position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1) 17 | div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)) 18 | pe[:, 0::2] = torch.sin(position * div_term) 19 | pe[:, 1::2] = torch.cos(position * div_term) 20 | pe = pe.unsqueeze(0) 21 | self.register_buffer('pe', pe) 22 | 23 | def forward(self, x): 24 | b, seq_len, d_model = x.size() 25 | pe = self.pe[:, :seq_len, :] 26 | x = x + pe.to(x.device) 27 | return x 28 | 29 | class UNet(): 30 | def __init__(self, 31 | unet_config, 32 | model_path, 33 | use_float16=False, 34 | device = "cuda" 35 | ): 36 | with open(unet_config, 'r') as f: 37 | unet_config = json.load(f) 38 | self.model = UNet2DConditionModel(**unet_config) 39 | self.pe = PositionalEncoding(d_model=384) 40 | self.device = torch.device(device if torch.cuda.is_available() else "cpu") 41 | weights = torch.load(model_path) if torch.cuda.is_available() else torch.load(model_path, map_location=self.device) 42 | self.model.load_state_dict(weights) 43 | if use_float16: 44 | self.model = self.model.half() 45 | self.model.to(self.device) 46 | 47 | if __name__ == "__main__": 48 | unet = UNet() 49 | -------------------------------------------------------------------------------- /src/musetalk/utils/__init__.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from os.path import abspath, dirname 3 | current_dir = dirname(abspath(__file__)) 4 | parent_dir = dirname(current_dir) 5 | sys.path.append(parent_dir+'/utils') 6 | -------------------------------------------------------------------------------- /src/musetalk/utils/blending.py: 
-------------------------------------------------------------------------------- 1 | from PIL import Image 2 | import numpy as np 3 | import cv2 4 | from face_parsing import FaceParsing 5 | 6 | fp = FaceParsing() 7 | 8 | def get_crop_box(box, expand): 9 | x, y, x1, y1 = box 10 | x_c, y_c = (x+x1)//2, (y+y1)//2 11 | w, h = x1-x, y1-y 12 | s = int(max(w, h)//2*expand) 13 | crop_box = [x_c-s, y_c-s, x_c+s, y_c+s] 14 | return crop_box, s 15 | 16 | def face_seg(image): 17 | seg_image = fp(image) 18 | if seg_image is None: 19 | print("error, no person_segment") 20 | return None 21 | 22 | seg_image = seg_image.resize(image.size) 23 | return seg_image 24 | 25 | def get_image(image,face,face_box,upper_boundary_ratio = 0.5,expand=1.2): 26 | body = Image.fromarray(image[:,:,::-1]) 27 | face = Image.fromarray(face[:,:,::-1]) 28 | 29 | x, y, x1, y1 = face_box 30 | crop_box, s = get_crop_box(face_box, expand) 31 | x_s, y_s, x_e, y_e = crop_box 32 | face_position = (x, y) 33 | 34 | face_large = body.crop(crop_box) 35 | ori_shape = face_large.size 36 | 37 | mask_image = face_seg(face_large) 38 | mask_small = mask_image.crop((x-x_s, y-y_s, x1-x_s, y1-y_s)) 39 | mask_image = Image.new('L', ori_shape, 0) 40 | mask_image.paste(mask_small, (x-x_s, y-y_s, x1-x_s, y1-y_s)) 41 | 42 | # keep upper_boundary_ratio of talking area 43 | width, height = mask_image.size 44 | top_boundary = int(height * upper_boundary_ratio) 45 | modified_mask_image = Image.new('L', ori_shape, 0) 46 | modified_mask_image.paste(mask_image.crop((0, top_boundary, width, height)), (0, top_boundary)) 47 | 48 | blur_kernel_size = int(0.1 * ori_shape[0] // 2 * 2) + 1 49 | mask_array = cv2.GaussianBlur(np.array(modified_mask_image), (blur_kernel_size, blur_kernel_size), 0) 50 | mask_image = Image.fromarray(mask_array) 51 | 52 | face_large.paste(face, (x-x_s, y-y_s, x1-x_s, y1-y_s)) 53 | body.paste(face_large, crop_box[:2], mask_image) 54 | body = np.array(body) 55 | return body[:,:,::-1] 56 | 57 | def get_image_prepare_material(image,face_box,upper_boundary_ratio = 0.5,expand=1.2): 58 | body = Image.fromarray(image[:,:,::-1]) 59 | 60 | x, y, x1, y1 = face_box 61 | crop_box, s = get_crop_box(face_box, expand) 62 | x_s, y_s, x_e, y_e = crop_box 63 | 64 | face_large = body.crop(crop_box) 65 | ori_shape = face_large.size 66 | 67 | mask_image = face_seg(face_large) 68 | mask_small = mask_image.crop((x-x_s, y-y_s, x1-x_s, y1-y_s)) 69 | mask_image = Image.new('L', ori_shape, 0) 70 | mask_image.paste(mask_small, (x-x_s, y-y_s, x1-x_s, y1-y_s)) 71 | 72 | # keep upper_boundary_ratio of talking area 73 | width, height = mask_image.size 74 | top_boundary = int(height * upper_boundary_ratio) 75 | modified_mask_image = Image.new('L', ori_shape, 0) 76 | modified_mask_image.paste(mask_image.crop((0, top_boundary, width, height)), (0, top_boundary)) 77 | 78 | blur_kernel_size = int(0.1 * ori_shape[0] // 2 * 2) + 1 79 | mask_array = cv2.GaussianBlur(np.array(modified_mask_image), (blur_kernel_size, blur_kernel_size), 0) 80 | return mask_array,crop_box 81 | 82 | def get_image_blending(image, face, face_box, mask_array, crop_box): 83 | body = Image.fromarray(image[:,:,::-1]) 84 | face = Image.fromarray(face[:,:,::-1]) 85 | 86 | x, y, x1, y1 = face_box 87 | x_s, y_s, x_e, y_e = crop_box 88 | face_large = body.crop(crop_box) 89 | 90 | mask_image = Image.fromarray(mask_array) 91 | mask_image = mask_image.convert("L") 92 | face_large.paste(face, (x-x_s, y-y_s, x1-x_s, y1-y_s)) 93 | body.paste(face_large, crop_box[:2], mask_image) 94 | body = np.array(body) 95 | return 
body[:,:,::-1] -------------------------------------------------------------------------------- /src/musetalk/utils/dwpose/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Henry-23/VideoChat/3d6700c5f2c4e49220501898f96049579b39345e/src/musetalk/utils/dwpose/__init__.py -------------------------------------------------------------------------------- /src/musetalk/utils/dwpose/default_runtime.py: -------------------------------------------------------------------------------- 1 | default_scope = 'mmpose' 2 | 3 | # hooks 4 | default_hooks = dict( 5 | timer=dict(type='IterTimerHook'), 6 | logger=dict(type='LoggerHook', interval=50), 7 | param_scheduler=dict(type='ParamSchedulerHook'), 8 | checkpoint=dict(type='CheckpointHook', interval=10), 9 | sampler_seed=dict(type='DistSamplerSeedHook'), 10 | visualization=dict(type='PoseVisualizationHook', enable=False), 11 | badcase=dict( 12 | type='BadCaseAnalysisHook', 13 | enable=False, 14 | out_dir='badcase', 15 | metric_type='loss', 16 | badcase_thr=5)) 17 | 18 | # custom hooks 19 | custom_hooks = [ 20 | # Synchronize model buffers such as running_mean and running_var in BN 21 | # at the end of each epoch 22 | dict(type='SyncBuffersHook') 23 | ] 24 | 25 | # multi-processing backend 26 | env_cfg = dict( 27 | cudnn_benchmark=False, 28 | mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), 29 | dist_cfg=dict(backend='nccl'), 30 | ) 31 | 32 | # visualizer 33 | vis_backends = [ 34 | dict(type='LocalVisBackend'), 35 | # dict(type='TensorboardVisBackend'), 36 | # dict(type='WandbVisBackend'), 37 | ] 38 | visualizer = dict( 39 | type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer') 40 | 41 | # logger 42 | log_processor = dict( 43 | type='LogProcessor', window_size=50, by_epoch=True, num_digits=6) 44 | log_level = 'INFO' 45 | load_from = None 46 | resume = False 47 | 48 | # file I/O backend 49 | backend_args = dict(backend='local') 50 | 51 | # training/validation/testing progress 52 | train_cfg = dict(by_epoch=True) 53 | val_cfg = dict() 54 | test_cfg = dict() 55 | -------------------------------------------------------------------------------- /src/musetalk/utils/face_detection/README.md: -------------------------------------------------------------------------------- 1 | The code for Face Detection in this folder has been taken from the wonderful [face_alignment](https://github.com/1adrianb/face-alignment) repository. This has been modified to take batches of faces at a time. 
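The blending helpers above are built for a prepare-once, blend-per-frame loop: get_image_prepare_material computes a blurred face-parsing mask and an expanded crop box for a source frame, and get_image_blending then pastes each generated face crop back into it. A rough sketch of that loop, assuming the face-parse-bisent weights referenced in face_parsing/__init__.py are downloaded and the script runs from the repository root; the image paths and box coordinates below are placeholders, and the generated face is resized to the face_box size before pasting:

import cv2
from src.musetalk.utils.blending import get_image_prepare_material, get_image_blending

frame = cv2.imread("avatar_frame.png")      # full BGR source frame (placeholder path)
face_box = (110, 80, 310, 330)              # x, y, x1, y1 from the landmark step (dummy values)

# Once per source frame: blurred segmentation mask + expanded crop box around the face
mask_array, crop_box = get_image_prepare_material(frame, face_box)

# Per generated frame: paste the lip-synced face crop back into the full frame
face = cv2.resize(cv2.imread("generated_face.png"),
                  (face_box[2] - face_box[0], face_box[3] - face_box[1]))
blended = get_image_blending(frame, face, face_box, mask_array, crop_box)
cv2.imwrite("blended_frame.png", blended)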
-------------------------------------------------------------------------------- /src/musetalk/utils/face_detection/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | __author__ = """Adrian Bulat""" 4 | __email__ = 'adrian.bulat@nottingham.ac.uk' 5 | __version__ = '1.0.1' 6 | 7 | from .api import FaceAlignment, LandmarksType, NetworkSize, YOLOv8_face 8 | -------------------------------------------------------------------------------- /src/musetalk/utils/face_detection/detection/__init__.py: -------------------------------------------------------------------------------- 1 | from .core import FaceDetector -------------------------------------------------------------------------------- /src/musetalk/utils/face_detection/detection/sfd/__init__.py: -------------------------------------------------------------------------------- 1 | from .sfd_detector import SFDDetector as FaceDetector -------------------------------------------------------------------------------- /src/musetalk/utils/face_detection/detection/sfd/sfd_detector.py: -------------------------------------------------------------------------------- 1 | import os 2 | import cv2 3 | from torch.utils.model_zoo import load_url 4 | 5 | from ..core import FaceDetector 6 | 7 | from .net_s3fd import s3fd 8 | from .bbox import * 9 | from .detect import * 10 | 11 | models_urls = { 12 | 's3fd': 'https://www.adrianbulat.com/downloads/python-fan/s3fd-619a316812.pth', 13 | } 14 | 15 | 16 | class SFDDetector(FaceDetector): 17 | def __init__(self, device, path_to_detector=os.path.join(os.path.dirname(os.path.abspath(__file__)), 's3fd.pth'), verbose=False): 18 | super(SFDDetector, self).__init__(device, verbose) 19 | 20 | # Initialise the face detector 21 | if not os.path.isfile(path_to_detector): 22 | model_weights = load_url(models_urls['s3fd']) 23 | else: 24 | model_weights = torch.load(path_to_detector) 25 | 26 | self.face_detector = s3fd() 27 | self.face_detector.load_state_dict(model_weights) 28 | self.face_detector.to(device) 29 | self.face_detector.eval() 30 | 31 | def detect_from_image(self, tensor_or_path): 32 | image = self.tensor_or_path_to_ndarray(tensor_or_path) 33 | 34 | bboxlist = detect(self.face_detector, image, device=self.device) 35 | keep = nms(bboxlist, 0.3) 36 | bboxlist = bboxlist[keep, :] 37 | bboxlist = [x for x in bboxlist if x[-1] > 0.5] 38 | 39 | return bboxlist 40 | 41 | def detect_from_batch(self, images): 42 | bboxlists = batch_detect(self.face_detector, images, device=self.device) 43 | keeps = [nms(bboxlists[:, i, :], 0.3) for i in range(bboxlists.shape[1])] 44 | bboxlists = [bboxlists[keep, i, :] for i, keep in enumerate(keeps)] 45 | bboxlists = [[x for x in bboxlist if x[-1] > 0.5] for bboxlist in bboxlists] 46 | 47 | return bboxlists 48 | 49 | @property 50 | def reference_scale(self): 51 | return 195 52 | 53 | @property 54 | def reference_x_shift(self): 55 | return 0 56 | 57 | @property 58 | def reference_y_shift(self): 59 | return 0 60 | -------------------------------------------------------------------------------- /src/musetalk/utils/face_parsing/__init__.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import time 3 | import os 4 | import cv2 5 | import numpy as np 6 | from PIL import Image 7 | from .model import BiSeNet 8 | import torchvision.transforms as transforms 9 | 10 | class FaceParsing(): 11 | def __init__(self): 12 | self.net = self.model_init() 13 | self.preprocess 
= self.image_preprocess() 14 | 15 | def model_init(self, 16 | resnet_path='./weights/face-parse-bisent/resnet18-5c106cde.pth', 17 | model_pth='./weights/face-parse-bisent/79999_iter.pth'): 18 | net = BiSeNet(resnet_path) 19 | if torch.cuda.is_available(): 20 | net.cuda() 21 | net.load_state_dict(torch.load(model_pth)) 22 | else: 23 | net.load_state_dict(torch.load(model_pth, map_location=torch.device('cpu'))) 24 | net.eval() 25 | return net 26 | 27 | def image_preprocess(self): 28 | return transforms.Compose([ 29 | transforms.ToTensor(), 30 | transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)), 31 | ]) 32 | 33 | def __call__(self, image, size=(512, 512)): 34 | if isinstance(image, str): 35 | image = Image.open(image) 36 | 37 | width, height = image.size 38 | with torch.no_grad(): 39 | image = image.resize(size, Image.BILINEAR) 40 | img = self.preprocess(image) 41 | if torch.cuda.is_available(): 42 | img = torch.unsqueeze(img, 0).cuda() 43 | else: 44 | img = torch.unsqueeze(img, 0) 45 | out = self.net(img)[0] 46 | parsing = out.squeeze(0).cpu().numpy().argmax(0) 47 | parsing[np.where(parsing>13)] = 0 48 | parsing[np.where(parsing>=1)] = 255 49 | parsing = Image.fromarray(parsing.astype(np.uint8)) 50 | return parsing 51 | 52 | 53 | 54 | -------------------------------------------------------------------------------- /src/musetalk/utils/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import cv2 3 | import numpy as np 4 | import torch 5 | 6 | ffmpeg_path = os.getenv('FFMPEG_PATH') 7 | if ffmpeg_path is None: 8 | print("please download ffmpeg-static and export to FFMPEG_PATH. \nFor example: export FFMPEG_PATH=/musetalk/ffmpeg-4.4-amd64-static") 9 | elif ffmpeg_path not in os.getenv('PATH'): 10 | print("add ffmpeg to path") 11 | os.environ["PATH"] = f"{ffmpeg_path}:{os.environ['PATH']}" 12 | 13 | 14 | from src.musetalk.whisper.audio2feature import Audio2Feature 15 | from src.musetalk.models.vae import VAE 16 | from src.musetalk.models.unet import UNet,PositionalEncoding 17 | 18 | def load_all_model(): 19 | audio_processor = Audio2Feature(model_path="./weights/whisper/tiny.pt") 20 | vae = VAE(model_path = "./weights/sd-vae-ft-mse/") 21 | unet = UNet(unet_config="./weights/musetalk/musetalk.json", 22 | model_path ="./weights/musetalk/pytorch_model.bin") 23 | pe = PositionalEncoding(d_model=384) 24 | return audio_processor,vae,unet,pe 25 | 26 | def get_file_type(video_path): 27 | _, ext = os.path.splitext(video_path) 28 | 29 | if ext.lower() in ['.jpg', '.jpeg', '.png', '.bmp', '.tif', '.tiff']: 30 | return 'image' 31 | elif ext.lower() in ['.avi', '.mp4', '.mov', '.flv', '.mkv']: 32 | return 'video' 33 | else: 34 | return 'unsupported' 35 | 36 | def get_video_fps(video_path): 37 | video = cv2.VideoCapture(video_path) 38 | fps = video.get(cv2.CAP_PROP_FPS) 39 | video.release() 40 | return fps 41 | 42 | def datagen(whisper_chunks, 43 | vae_encode_latents, 44 | batch_size=8, 45 | delay_frame=0): 46 | whisper_batch, latent_batch = [], [] 47 | for i, w in enumerate(whisper_chunks): 48 | idx = (i+delay_frame)%len(vae_encode_latents) 49 | latent = vae_encode_latents[idx] 50 | whisper_batch.append(w) 51 | latent_batch.append(latent) 52 | 53 | if len(latent_batch) >= batch_size: 54 | whisper_batch = np.stack(whisper_batch) 55 | latent_batch = torch.cat(latent_batch, dim=0) 56 | yield whisper_batch, latent_batch 57 | whisper_batch, latent_batch = [], [] 58 | 59 | # the last batch may smaller than batch size 60 | if len(latent_batch) 
> 0: 61 | whisper_batch = np.stack(whisper_batch) 62 | latent_batch = torch.cat(latent_batch, dim=0) 63 | 64 | yield whisper_batch, latent_batch 65 | 66 | def video2imgs(vid_path, save_path,cut_frame = 10000000): 67 | cap = cv2.VideoCapture(vid_path) 68 | count = 0 69 | while True: 70 | if count > cut_frame: 71 | break 72 | ret, frame = cap.read() 73 | if ret: 74 | cv2.imwrite(f"{save_path}/{count:08d}.png", frame) 75 | count += 1 76 | else: 77 | break 78 | 79 | def osmakedirs(path_list): 80 | for path in path_list: 81 | os.makedirs(path) if not os.path.exists(path) else None -------------------------------------------------------------------------------- /src/musetalk/whisper/whisper/__main__.py: -------------------------------------------------------------------------------- 1 | from .transcribe import cli 2 | 3 | 4 | cli() 5 | -------------------------------------------------------------------------------- /src/musetalk/whisper/whisper/assets/gpt2/special_tokens_map.json: -------------------------------------------------------------------------------- 1 | {"bos_token": "<|endoftext|>", "eos_token": "<|endoftext|>", "unk_token": "<|endoftext|>"} -------------------------------------------------------------------------------- /src/musetalk/whisper/whisper/assets/gpt2/tokenizer_config.json: -------------------------------------------------------------------------------- 1 | {"unk_token": "<|endoftext|>", "bos_token": "<|endoftext|>", "eos_token": "<|endoftext|>", "add_prefix_space": false, "model_max_length": 1024, "special_tokens_map_file": null, "name_or_path": "gpt2", "tokenizer_class": "GPT2Tokenizer"} -------------------------------------------------------------------------------- /src/musetalk/whisper/whisper/assets/mel_filters.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Henry-23/VideoChat/3d6700c5f2c4e49220501898f96049579b39345e/src/musetalk/whisper/whisper/assets/mel_filters.npz -------------------------------------------------------------------------------- /src/musetalk/whisper/whisper/assets/multilingual/added_tokens.json: -------------------------------------------------------------------------------- 1 | {"<|endoftext|>": 50257} 2 | -------------------------------------------------------------------------------- /src/musetalk/whisper/whisper/assets/multilingual/special_tokens_map.json: -------------------------------------------------------------------------------- 1 | {"bos_token": "<|endoftext|>", "eos_token": "<|endoftext|>", "unk_token": "<|endoftext|>"} -------------------------------------------------------------------------------- /src/musetalk/whisper/whisper/assets/multilingual/tokenizer_config.json: -------------------------------------------------------------------------------- 1 | {"unk_token": {"content": "<|endoftext|>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "bos_token": {"content": "<|endoftext|>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "eos_token": {"content": "<|endoftext|>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "add_prefix_space": false, "model_max_length": 1024, "special_tokens_map_file": null, "name_or_path": "multilingual", "errors": "replace", "tokenizer_class": "GPT2Tokenizer"} -------------------------------------------------------------------------------- 
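datagen in musetalk/utils/utils.py above pairs whisper feature chunks with cached VAE latents, cycling through the latents modulo their count and yielding fixed-size batches (the last one may be smaller). A small illustration with dummy inputs, assuming the repository's dependencies are installed; the array shapes are placeholders rather than the real whisper/VAE feature dimensions:

import numpy as np
import torch
from src.musetalk.utils.utils import datagen

whisper_chunks = [np.zeros((50, 384), dtype=np.float32) for _ in range(20)]   # dummy audio features
vae_encode_latents = [torch.zeros(1, 8, 32, 32) for _ in range(10)]           # dummy per-frame latents

for whisper_batch, latent_batch in datagen(whisper_chunks, vae_encode_latents, batch_size=8):
    # Latents are reused cyclically, so the audio may run longer than the frame loop;
    # batches come out as 8, 8, then the 4 left over.
    print(whisper_batch.shape, latent_batch.shape)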
/src/musetalk/whisper/whisper/normalizers/__init__.py: -------------------------------------------------------------------------------- 1 | from .basic import BasicTextNormalizer 2 | from .english import EnglishTextNormalizer 3 | -------------------------------------------------------------------------------- /src/musetalk/whisper/whisper/normalizers/basic.py: -------------------------------------------------------------------------------- 1 | import re 2 | import unicodedata 3 | 4 | import regex 5 | 6 | # non-ASCII letters that are not separated by "NFKD" normalization 7 | ADDITIONAL_DIACRITICS = { 8 | "œ": "oe", 9 | "Œ": "OE", 10 | "ø": "o", 11 | "Ø": "O", 12 | "æ": "ae", 13 | "Æ": "AE", 14 | "ß": "ss", 15 | "ẞ": "SS", 16 | "đ": "d", 17 | "Đ": "D", 18 | "ð": "d", 19 | "Ð": "D", 20 | "þ": "th", 21 | "Þ": "th", 22 | "ł": "l", 23 | "Ł": "L", 24 | } 25 | 26 | 27 | def remove_symbols_and_diacritics(s: str, keep=""): 28 | """ 29 | Replace any other markers, symbols, and punctuations with a space, 30 | and drop any diacritics (category 'Mn' and some manual mappings) 31 | """ 32 | return "".join( 33 | c 34 | if c in keep 35 | else ADDITIONAL_DIACRITICS[c] 36 | if c in ADDITIONAL_DIACRITICS 37 | else "" 38 | if unicodedata.category(c) == "Mn" 39 | else " " 40 | if unicodedata.category(c)[0] in "MSP" 41 | else c 42 | for c in unicodedata.normalize("NFKD", s) 43 | ) 44 | 45 | 46 | def remove_symbols(s: str): 47 | """ 48 | Replace any other markers, symbols, punctuations with a space, keeping diacritics 49 | """ 50 | return "".join( 51 | " " if unicodedata.category(c)[0] in "MSP" else c for c in unicodedata.normalize("NFKC", s) 52 | ) 53 | 54 | 55 | class BasicTextNormalizer: 56 | def __init__(self, remove_diacritics: bool = False, split_letters: bool = False): 57 | self.clean = remove_symbols_and_diacritics if remove_diacritics else remove_symbols 58 | self.split_letters = split_letters 59 | 60 | def __call__(self, s: str): 61 | s = s.lower() 62 | s = re.sub(r"[<\[][^>\]]*[>\]]", "", s) # remove words between brackets 63 | s = re.sub(r"\(([^)]+?)\)", "", s) # remove words between parenthesis 64 | s = self.clean(s).lower() 65 | 66 | if self.split_letters: 67 | s = " ".join(regex.findall(r"\X", s, regex.U)) 68 | 69 | s = re.sub(r"\s+", " ", s) # replace any successive whitespace characters with a space 70 | 71 | return s 72 | -------------------------------------------------------------------------------- /src/musetalk/whisper/whisper/utils.py: -------------------------------------------------------------------------------- 1 | import zlib 2 | from typing import Iterator, TextIO 3 | 4 | 5 | def exact_div(x, y): 6 | assert x % y == 0 7 | return x // y 8 | 9 | 10 | def str2bool(string): 11 | str2val = {"True": True, "False": False} 12 | if string in str2val: 13 | return str2val[string] 14 | else: 15 | raise ValueError(f"Expected one of {set(str2val.keys())}, got {string}") 16 | 17 | 18 | def optional_int(string): 19 | return None if string == "None" else int(string) 20 | 21 | 22 | def optional_float(string): 23 | return None if string == "None" else float(string) 24 | 25 | 26 | def compression_ratio(text) -> float: 27 | return len(text) / len(zlib.compress(text.encode("utf-8"))) 28 | 29 | 30 | def format_timestamp(seconds: float, always_include_hours: bool = False, decimal_marker: str = '.'): 31 | assert seconds >= 0, "non-negative timestamp expected" 32 | milliseconds = round(seconds * 1000.0) 33 | 34 | hours = milliseconds // 3_600_000 35 | milliseconds -= hours * 3_600_000 36 | 37 | minutes = 
milliseconds // 60_000 38 | milliseconds -= minutes * 60_000 39 | 40 | seconds = milliseconds // 1_000 41 | milliseconds -= seconds * 1_000 42 | 43 | hours_marker = f"{hours:02d}:" if always_include_hours or hours > 0 else "" 44 | return f"{hours_marker}{minutes:02d}:{seconds:02d}{decimal_marker}{milliseconds:03d}" 45 | 46 | 47 | def write_txt(transcript: Iterator[dict], file: TextIO): 48 | for segment in transcript: 49 | print(segment['text'].strip(), file=file, flush=True) 50 | 51 | 52 | def write_vtt(transcript: Iterator[dict], file: TextIO): 53 | print("WEBVTT\n", file=file) 54 | for segment in transcript: 55 | print( 56 | f"{format_timestamp(segment['start'])} --> {format_timestamp(segment['end'])}\n" 57 | f"{segment['text'].strip().replace('-->', '->')}\n", 58 | file=file, 59 | flush=True, 60 | ) 61 | 62 | 63 | def write_srt(transcript: Iterator[dict], file: TextIO): 64 | """ 65 | Write a transcript to a file in SRT format. 66 | 67 | Example usage: 68 | from pathlib import Path 69 | from whisper.utils import write_srt 70 | 71 | result = transcribe(model, audio_path, temperature=temperature, **args) 72 | 73 | # save SRT 74 | audio_basename = Path(audio_path).stem 75 | with open(Path(output_dir) / (audio_basename + ".srt"), "w", encoding="utf-8") as srt: 76 | write_srt(result["segments"], file=srt) 77 | """ 78 | for i, segment in enumerate(transcript, start=1): 79 | # write srt lines 80 | print( 81 | f"{i}\n" 82 | f"{format_timestamp(segment['start'], always_include_hours=True, decimal_marker=',')} --> " 83 | f"{format_timestamp(segment['end'], always_include_hours=True, decimal_marker=',')}\n" 84 | f"{segment['text'].strip().replace('-->', '->')}\n", 85 | file=file, 86 | flush=True, 87 | ) 88 | -------------------------------------------------------------------------------- /src/prompt.txt: -------------------------------------------------------------------------------- 1 | 你负责为一个语音聊天系统生成对话文本输出,确保语气情感丰富、友好,并且响应迅速以保持用户的参与感。请你遵循以下规则: 2 | 1. 回复应该简短、对话性强,并保持互动式的交流风格,每个句子保持简短且长度接近(5-10个字)。 3 | 2. 理解用户的意图并提供简洁、相关的回复,避免不必要的说明或平淡的陈述。 4 | 3. 在整个对话中保持友好和情感丰富的语气。 5 | 4. 快速回应,以免让用户等待,以“好的”、“没问题”、“明白了”等短句作为回复的开头 6 | 5. 
确保示例是多轮的,而不仅仅是一个问题一个回答。 7 | 8 | 接下来,我会给你一系列用户输入,请你遵循上述要求输出内容。 9 | 用户输入: -------------------------------------------------------------------------------- /src/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | import subprocess 4 | import cv2 5 | import time 6 | from pathlib import Path 7 | from datetime import datetime 8 | import wave 9 | from dashscope.audio.tts_v2 import * 10 | 11 | def merge_frames_with_audio(audio_path, fps = 25): 12 | video_idx = audio_path.split("/")[-1].split("_")[-1].split(".")[0] 13 | print(f"[Real-time Inference] Merging frames with audio on {video_idx}") 14 | 15 | video_path = str(Path(audio_path).parent.parent / "videos" / f"{video_idx}.ts") 16 | frame_path = str(Path(audio_path).parent.parent / "frames" / f"{video_idx}") 17 | start_time = time.time() 18 | 19 | ffmpeg_command = [ 20 | 'ffmpeg', 21 | '-framerate', str(fps), 22 | '-i', f"{frame_path}/%08d.jpg", 23 | '-i', audio_path, 24 | '-c:v', 'libx264', 25 | '-shortest', 26 | '-f', 'mpegts', 27 | '-y', 28 | video_path 29 | ] 30 | subprocess.run(ffmpeg_command, check=True) 31 | print(f"[Real-time Inference] Merging frames with audio costs {time.time()-start_time}s") 32 | return video_path 33 | 34 | def get_video_duration(video_path): 35 | cap = cv2.VideoCapture(video_path) 36 | fps = cap.get(cv2.CAP_PROP_FPS) 37 | frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) 38 | duration = frame_count / fps 39 | return round(duration, 2) 40 | 41 | def split_into_sentences(text, sentence_split_option): 42 | text = ''.join(text.splitlines()) 43 | sentence_endings = re.compile(r'[。!?.!?]') 44 | sentences = sentence_endings.split(text) 45 | sentences = [s.strip() for s in sentences if s.strip()] 46 | split_count = int(sentence_split_option) 47 | return ['。'.join(sentences[i:i+split_count]) for i in range(0, len(sentences), split_count)] 48 | 49 | def get_timestamp_str(): 50 | fmt = "%Y%m%d_%H%M%S" 51 | current_time = datetime.now() 52 | folder_name = current_time.strftime(fmt) 53 | return folder_name 54 | 55 | def merge_videos(video_folder_path, suffix = '.mp4'): 56 | output_path = os.path.join(video_folder_path, f'merged_video{suffix}') 57 | file_list_path = os.path.join(video_folder_path, 'video_list.txt') 58 | 59 | def extract_index(filename): 60 | index = filename.split('.')[0].split('_')[-1] 61 | return int(index) 62 | 63 | with open(file_list_path, 'w') as file_list: 64 | ts_files = [f for f in os.listdir(video_folder_path) if f.endswith('.ts')] 65 | ts_files.sort(key=extract_index) 66 | 67 | for filename in ts_files: 68 | file_list.write(f"file '{filename}'\n") 69 | 70 | ffmpeg_command = [ 71 | 'ffmpeg', 72 | '-f', 'concat', 73 | '-safe', '0', 74 | '-i', file_list_path, 75 | '-c', 'copy', 76 | '-c:v', 'libx264', 77 | '-c:a', 'aac', 78 | '-y', 79 | output_path 80 | ] 81 | 82 | subprocess.run(ffmpeg_command, check=True) 83 | return output_path 84 | --------------------------------------------------------------------------------
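Finally, a quick illustration (not from the repository) of split_into_sentences in src/utils.py above, which cuts text on Chinese and English sentence terminators and regroups it sentence_split_option sentences at a time for streaming TTS. It assumes the repository's requirements are installed (dashscope is imported at the top of src/utils.py) and that the snippet runs from the repository root:

from src.utils import split_into_sentences

text = "好的!我明白了。今天天气不错。我们出发吧!"
print(split_into_sentences(text, sentence_split_option="2"))
# -> ['好的。我明白了', '今天天气不错。我们出发吧']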