├── .gitignore ├── LICENSE ├── README.md ├── app.py ├── data ├── audio │ ├── warm_up.wav │ ├── 女性.wav │ ├── 少女.wav │ ├── 男性.wav │ └── 青年.wav ├── icon │ ├── qwen.png │ └── user.png └── video │ ├── Avatar1.mp4 │ ├── Avatar2.mp4 │ └── Avatar3.mp4 ├── docs └── README_en.md ├── image.png ├── requirements.txt ├── server.py └── src ├── GLM_4_Voice ├── LICENSE ├── README.md ├── README_en.md ├── __init__.py ├── cosyvoice │ ├── __init__.py │ ├── bin │ │ ├── inference.py │ │ └── train.py │ ├── cli │ │ ├── __init__.py │ │ ├── cosyvoice.py │ │ ├── frontend.py │ │ └── model.py │ ├── dataset │ │ ├── __init__.py │ │ ├── dataset.py │ │ └── processor.py │ ├── flow │ │ ├── decoder.py │ │ ├── flow.py │ │ ├── flow_gradtts.py │ │ ├── flow_matching.py │ │ ├── flow_matching_dit.py │ │ ├── length_regulator.py │ │ └── stable │ │ │ ├── adp.py │ │ │ ├── blocks.py │ │ │ ├── dit.py │ │ │ ├── dit_v2.py │ │ │ ├── sampling.py │ │ │ ├── stable_diffusion.py │ │ │ ├── stable_diffusion_test.py │ │ │ ├── transformer.py │ │ │ └── transformer_use_mask.py │ ├── hifigan │ │ ├── f0_predictor.py │ │ └── generator.py │ ├── llm │ │ └── llm.py │ ├── transformer │ │ ├── __init__.py │ │ ├── activation.py │ │ ├── attention.py │ │ ├── convolution.py │ │ ├── decoder.py │ │ ├── decoder_layer.py │ │ ├── embedding.py │ │ ├── encoder.py │ │ ├── encoder_layer.py │ │ ├── label_smoothing_loss.py │ │ ├── positionwise_feed_forward.py │ │ └── subsampling.py │ └── utils │ │ ├── __init__.py │ │ ├── block_mask_util.py │ │ ├── class_utils.py │ │ ├── common.py │ │ ├── executor.py │ │ ├── file_utils.py │ │ ├── frontend_utils.py │ │ ├── mask.py │ │ ├── scheduler.py │ │ └── train_utils.py ├── flow_inference.py ├── requirements.txt ├── resources │ ├── architecture.jpeg │ └── web_demo.png ├── speech_tokenizer │ ├── __init__.py │ ├── configuration_whisper.py │ ├── generation_whisper.py │ ├── modeling_whisper.py │ └── utils.py └── third_party │ └── Matcha-TTS │ ├── .env.example │ ├── .github │ ├── PULL_REQUEST_TEMPLATE.md │ ├── codecov.yml │ ├── dependabot.yml │ └── release-drafter.yml │ ├── .gitignore │ ├── .pre-commit-config.yaml │ ├── .project-root │ ├── .pylintrc │ ├── LICENSE │ ├── MANIFEST.in │ ├── Makefile │ ├── README.md │ ├── configs │ ├── __init__.py │ ├── callbacks │ │ ├── default.yaml │ │ ├── model_checkpoint.yaml │ │ ├── model_summary.yaml │ │ ├── none.yaml │ │ └── rich_progress_bar.yaml │ ├── data │ │ ├── hi-fi_en-US_female.yaml │ │ ├── ljspeech.yaml │ │ └── vctk.yaml │ ├── debug │ │ ├── default.yaml │ │ ├── fdr.yaml │ │ ├── limit.yaml │ │ ├── overfit.yaml │ │ └── profiler.yaml │ ├── eval.yaml │ ├── experiment │ │ ├── hifi_dataset_piper_phonemizer.yaml │ │ ├── ljspeech.yaml │ │ ├── ljspeech_min_memory.yaml │ │ └── multispeaker.yaml │ ├── extras │ │ └── default.yaml │ ├── hparams_search │ │ └── mnist_optuna.yaml │ ├── hydra │ │ └── default.yaml │ ├── local │ │ └── .gitkeep │ ├── logger │ │ ├── aim.yaml │ │ ├── comet.yaml │ │ ├── csv.yaml │ │ ├── many_loggers.yaml │ │ ├── mlflow.yaml │ │ ├── neptune.yaml │ │ ├── tensorboard.yaml │ │ └── wandb.yaml │ ├── model │ │ ├── cfm │ │ │ └── default.yaml │ │ ├── decoder │ │ │ └── default.yaml │ │ ├── encoder │ │ │ └── default.yaml │ │ ├── matcha.yaml │ │ └── optimizer │ │ │ └── adam.yaml │ ├── paths │ │ └── default.yaml │ ├── train.yaml │ └── trainer │ │ ├── cpu.yaml │ │ ├── ddp.yaml │ │ ├── ddp_sim.yaml │ │ ├── default.yaml │ │ ├── gpu.yaml │ │ └── mps.yaml │ ├── data │ ├── matcha │ ├── VERSION │ ├── __init__.py │ ├── app.py │ ├── cli.py │ ├── data │ │ ├── __init__.py │ │ ├── components │ │ │ └── __init__.py │ 
│ └── text_mel_datamodule.py │ ├── hifigan │ │ ├── LICENSE │ │ ├── README.md │ │ ├── __init__.py │ │ ├── config.py │ │ ├── denoiser.py │ │ ├── env.py │ │ ├── meldataset.py │ │ ├── models.py │ │ └── xutils.py │ ├── models │ │ ├── __init__.py │ │ ├── baselightningmodule.py │ │ ├── components │ │ │ ├── __init__.py │ │ │ ├── decoder.py │ │ │ ├── flow_matching.py │ │ │ ├── text_encoder.py │ │ │ └── transformer.py │ │ └── matcha_tts.py │ ├── onnx │ │ ├── __init__.py │ │ ├── export.py │ │ └── infer.py │ ├── text │ │ ├── __init__.py │ │ ├── cleaners.py │ │ ├── numbers.py │ │ └── symbols.py │ ├── train.py │ └── utils │ │ ├── __init__.py │ │ ├── audio.py │ │ ├── generate_data_statistics.py │ │ ├── instantiators.py │ │ ├── logging_utils.py │ │ ├── model.py │ │ ├── monotonic_align │ │ ├── __init__.py │ │ ├── core.pyx │ │ └── setup.py │ │ ├── pylogger.py │ │ ├── rich_utils.py │ │ └── utils.py │ ├── notebooks │ └── .gitkeep │ ├── pyproject.toml │ ├── requirements.txt │ ├── scripts │ └── schedule.sh │ ├── setup.py │ └── synthesis.ipynb ├── GPT_SoVITS ├── AR │ ├── __init__.py │ ├── data │ │ ├── __init__.py │ │ ├── bucket_sampler.py │ │ ├── data_module.py │ │ └── dataset.py │ ├── models │ │ ├── __init__.py │ │ ├── t2s_lightning_module.py │ │ ├── t2s_lightning_module_onnx.py │ │ ├── t2s_model.py │ │ ├── t2s_model_onnx.py │ │ └── utils.py │ ├── modules │ │ ├── __init__.py │ │ ├── activation.py │ │ ├── activation_onnx.py │ │ ├── embedding.py │ │ ├── embedding_onnx.py │ │ ├── lr_schedulers.py │ │ ├── optim.py │ │ ├── patched_mha_with_cache.py │ │ ├── patched_mha_with_cache_onnx.py │ │ ├── scaling.py │ │ ├── transformer.py │ │ └── transformer_onnx.py │ ├── text_processing │ │ ├── __init__.py │ │ ├── phonemizer.py │ │ └── symbols.py │ └── utils │ │ ├── __init__.py │ │ ├── initialize.py │ │ └── io.py ├── TTS_infer_pack │ ├── TTS.py │ ├── TextPreprocessor.py │ ├── __init__.py │ └── text_segmentation_method.py ├── __init__.py ├── configs │ ├── s1.yaml │ ├── s1big.yaml │ ├── s1big2.yaml │ ├── s1longer-v2.yaml │ ├── s1longer.yaml │ ├── s1mq.yaml │ ├── s2.json │ ├── train.yaml │ └── tts_infer.yaml ├── feature_extractor │ ├── __init__.py │ ├── cnhubert.py │ └── whisper_enc.py ├── module │ ├── __init__.py │ ├── attentions.py │ ├── attentions_onnx.py │ ├── commons.py │ ├── core_vq.py │ ├── data_utils.py │ ├── losses.py │ ├── mel_processing.py │ ├── models.py │ ├── models_onnx.py │ ├── modules.py │ ├── mrte_model.py │ ├── quantize.py │ └── transforms.py ├── text │ ├── __init__.py │ ├── cantonese.py │ ├── chinese.py │ ├── chinese2.py │ ├── cleaner.py │ ├── cmudict-fast.rep │ ├── cmudict.rep │ ├── engdict-hot.rep │ ├── engdict_cache.pickle │ ├── english.py │ ├── g2pw │ │ ├── __init__.py │ │ ├── dataset.py │ │ ├── g2pw.py │ │ ├── onnx_api.py │ │ ├── polyphonic-fix.rep │ │ ├── polyphonic.pickle │ │ ├── polyphonic.rep │ │ └── utils.py │ ├── ja_userdic │ │ └── userdict.csv │ ├── japanese.py │ ├── korean.py │ ├── namedict_cache.pickle │ ├── opencpop-strict.txt │ ├── symbols.py │ ├── symbols2.py │ ├── tone_sandhi.py │ └── zh_normalization │ │ ├── README.md │ │ ├── __init__.py │ │ ├── chronology.py │ │ ├── constants.py │ │ ├── num.py │ │ ├── phonecode.py │ │ ├── quantifier.py │ │ └── text_normlization.py ├── tools │ ├── __init__.py │ ├── asr │ │ ├── __init__.py │ │ ├── config.py │ │ ├── fasterwhisper_asr.py │ │ ├── funasr_asr.py │ │ └── models │ │ │ └── .gitignore │ ├── cmd-denoise.py │ ├── denoise-model │ │ └── .gitignore │ ├── i18n │ │ ├── __init__.py │ │ ├── i18n.py │ │ ├── locale │ │ │ ├── en_US.json │ │ │ ├── es_ES.json │ │ │ 
├── fr_FR.json │ │ │ ├── it_IT.json │ │ │ ├── ja_JP.json │ │ │ ├── ko_KR.json │ │ │ ├── pt_BR.json │ │ │ ├── ru_RU.json │ │ │ ├── tr_TR.json │ │ │ ├── zh_CN.json │ │ │ ├── zh_HK.json │ │ │ ├── zh_SG.json │ │ │ └── zh_TW.json │ │ └── scan_i18n.py │ ├── my_utils.py │ ├── slice_audio.py │ ├── slicer2.py │ ├── subfix_webui.py │ └── uvr5 │ │ ├── __init__.py │ │ ├── bs_roformer │ │ ├── __init__.py │ │ ├── attend.py │ │ └── bs_roformer.py │ │ ├── bsroformer.py │ │ ├── lib │ │ ├── lib_v5 │ │ │ ├── dataset.py │ │ │ ├── layers.py │ │ │ ├── layers_123812KB.py │ │ │ ├── layers_123821KB.py │ │ │ ├── layers_33966KB.py │ │ │ ├── layers_537227KB.py │ │ │ ├── layers_537238KB.py │ │ │ ├── layers_new.py │ │ │ ├── model_param_init.py │ │ │ ├── modelparams │ │ │ │ ├── 1band_sr16000_hl512.json │ │ │ │ ├── 1band_sr32000_hl512.json │ │ │ │ ├── 1band_sr33075_hl384.json │ │ │ │ ├── 1band_sr44100_hl1024.json │ │ │ │ ├── 1band_sr44100_hl256.json │ │ │ │ ├── 1band_sr44100_hl512.json │ │ │ │ ├── 1band_sr44100_hl512_cut.json │ │ │ │ ├── 2band_32000.json │ │ │ │ ├── 2band_44100_lofi.json │ │ │ │ ├── 2band_48000.json │ │ │ │ ├── 3band_44100.json │ │ │ │ ├── 3band_44100_mid.json │ │ │ │ ├── 3band_44100_msb2.json │ │ │ │ ├── 4band_44100.json │ │ │ │ ├── 4band_44100_mid.json │ │ │ │ ├── 4band_44100_msb.json │ │ │ │ ├── 4band_44100_msb2.json │ │ │ │ ├── 4band_44100_reverse.json │ │ │ │ ├── 4band_44100_sw.json │ │ │ │ ├── 4band_v2.json │ │ │ │ ├── 4band_v2_sn.json │ │ │ │ ├── 4band_v3.json │ │ │ │ └── ensemble.json │ │ │ ├── nets.py │ │ │ ├── nets_123812KB.py │ │ │ ├── nets_123821KB.py │ │ │ ├── nets_33966KB.py │ │ │ ├── nets_537227KB.py │ │ │ ├── nets_537238KB.py │ │ │ ├── nets_61968KB.py │ │ │ ├── nets_new.py │ │ │ └── spec_utils.py │ │ ├── name_params.json │ │ └── utils.py │ │ ├── mdxnet.py │ │ ├── uvr5_weights │ │ └── .gitignore │ │ ├── vr.py │ │ └── webui.py └── utils.py ├── __init__.py ├── asr.py ├── glm.py ├── llm.py ├── musetalk ├── __init__.py ├── models │ ├── unet.py │ └── vae.py ├── utils │ ├── __init__.py │ ├── blending.py │ ├── dwpose │ │ ├── __init__.py │ │ ├── default_runtime.py │ │ └── rtmpose-l_8xb32-270e_coco-ubody-wholebody-384x288.py │ ├── face_detection │ │ ├── README.md │ │ ├── __init__.py │ │ ├── api.py │ │ ├── detection │ │ │ ├── __init__.py │ │ │ ├── core.py │ │ │ └── sfd │ │ │ │ ├── __init__.py │ │ │ │ ├── bbox.py │ │ │ │ ├── detect.py │ │ │ │ ├── net_s3fd.py │ │ │ │ └── sfd_detector.py │ │ ├── models.py │ │ └── utils.py │ ├── face_parsing │ │ ├── __init__.py │ │ ├── model.py │ │ └── resnet.py │ ├── preprocessing.py │ └── utils.py └── whisper │ ├── audio2feature.py │ └── whisper │ ├── __init__.py │ ├── __main__.py │ ├── assets │ ├── gpt2 │ │ ├── merges.txt │ │ ├── special_tokens_map.json │ │ ├── tokenizer_config.json │ │ └── vocab.json │ ├── mel_filters.npz │ └── multilingual │ │ ├── added_tokens.json │ │ ├── merges.txt │ │ ├── special_tokens_map.json │ │ ├── tokenizer_config.json │ │ └── vocab.json │ ├── audio.py │ ├── decoding.py │ ├── model.py │ ├── normalizers │ ├── __init__.py │ ├── basic.py │ ├── english.json │ └── english.py │ ├── tokenizer.py │ ├── transcribe.py │ └── utils.py ├── pipeline_llm.py ├── pipeline_mllm.py ├── prompt.txt ├── thg.py ├── tts.py └── utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | */.DS_Store 3 | *.log 4 | .idea/ 5 | .vscode/ 6 | *.pyc 7 | workspaces/ 8 | */__pycache__/ 9 | */.ipynb_checkpoints/ 10 | */.pytest_cache/ 11 | */.mypy_cache/ 12 | */.coverage 13 | __pycache__/ 14 | weights/ZhipuAI/* 
-------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Henry 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /data/audio/warm_up.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Henry-23/VideoChat/3d6700c5f2c4e49220501898f96049579b39345e/data/audio/warm_up.wav -------------------------------------------------------------------------------- /data/audio/女性.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Henry-23/VideoChat/3d6700c5f2c4e49220501898f96049579b39345e/data/audio/女性.wav -------------------------------------------------------------------------------- /data/audio/少女.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Henry-23/VideoChat/3d6700c5f2c4e49220501898f96049579b39345e/data/audio/少女.wav -------------------------------------------------------------------------------- /data/audio/男性.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Henry-23/VideoChat/3d6700c5f2c4e49220501898f96049579b39345e/data/audio/男性.wav -------------------------------------------------------------------------------- /data/audio/青年.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Henry-23/VideoChat/3d6700c5f2c4e49220501898f96049579b39345e/data/audio/青年.wav -------------------------------------------------------------------------------- /data/icon/qwen.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Henry-23/VideoChat/3d6700c5f2c4e49220501898f96049579b39345e/data/icon/qwen.png -------------------------------------------------------------------------------- /data/icon/user.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Henry-23/VideoChat/3d6700c5f2c4e49220501898f96049579b39345e/data/icon/user.png -------------------------------------------------------------------------------- /data/video/Avatar1.mp4: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/Henry-23/VideoChat/3d6700c5f2c4e49220501898f96049579b39345e/data/video/Avatar1.mp4 -------------------------------------------------------------------------------- /data/video/Avatar2.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Henry-23/VideoChat/3d6700c5f2c4e49220501898f96049579b39345e/data/video/Avatar2.mp4 -------------------------------------------------------------------------------- /data/video/Avatar3.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Henry-23/VideoChat/3d6700c5f2c4e49220501898f96049579b39345e/data/video/Avatar3.mp4 -------------------------------------------------------------------------------- /image.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Henry-23/VideoChat/3d6700c5f2c4e49220501898f96049579b39345e/image.png -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | --extra-index-url https://download.pytorch.org/whl/cu121 2 | # https://gradio-builds.s3.amazonaws.com/bed454c3d22cfacedc047eb3b0ba987b485ac3fd/gradio-4.40.0-py3-none-any.whl 3 | gradio==5.4.0 4 | modelscope_studio==0.5.2 5 | # torch==2.1.2 6 | # torchvision==0.16.2 7 | # torchaudio==2.1.2 8 | torch==2.3.0 9 | torchvision==0.18.0 10 | torchaudio==2.3.0 11 | diffusers==0.27.2 12 | accelerate==0.28.0 13 | tensorflow==2.14.0 14 | tensorboard==2.14.0 15 | opencv-python==4.9.0.80 16 | soundfile==0.12.1 17 | gdown==5.2.0 18 | requests==2.32.3 19 | imageio==2.35.1 20 | imageio[ffmpeg] 21 | omegaconf==2.3.0 22 | ffmpeg-python==0.2.0 23 | spaces==0.30.0 24 | moviepy==1.0.3 25 | numpy==1.23.5 26 | scipy==1.13.1 27 | librosa==0.9.2 28 | numba==0.56.4 29 | pytorch-lightning==2.4.0 30 | onnxruntime==1.19.2; sys_platform == 'darwin' 31 | onnxruntime-gpu==1.19.2; sys_platform != 'darwin' 32 | tqdm==4.66.5 33 | funasr==1.1.6 34 | cn2an==0.5.22 35 | pypinyin==0.52.0 36 | pyopenjtalk==0.3.4 37 | g2p-en==2.1.0 38 | sentencepiece==0.2.0 39 | chardet==5.2.0 40 | PyYAML==6.0.2 41 | psutil==5.9.8 42 | jieba_fast==0.53 43 | jieba==0.42.1 44 | LangSegment==0.3.5 45 | Faster_Whisper==1.0.3 46 | wordsegment==1.3.1 47 | rotary-embedding-torch==0.7.0 48 | pyjyutping==1.0.0 49 | g2pk2==0.0.3 50 | ko-pron==1.3 51 | opencc; sys_platform != 'linux' 52 | opencc==1.1.1; sys_platform == 'linux' 53 | python_mecab_ko==1.3.7; sys_platform != 'win32' 54 | openmim==0.3.9 55 | openai==1.43.0 56 | fastapi[all] 57 | nltk 58 | modelscope==1.18.0 59 | pydub 60 | dashscope 61 | edge-tts 62 | # fastapi==0.112.2 63 | 64 | # GLM-4-Voice requirements 65 | transformers==4.44.1 66 | conformer==0.3.2 67 | deepspeed==0.14.2; sys_platform == 'linux' 68 | grpcio==1.57.0 69 | grpcio-tools==1.57.0 70 | huggingface_hub==0.25.2 71 | hydra-core==1.3.2 72 | HyperPyYAML==1.2.2 73 | inflect==7.3.1 74 | lightning==2.2.4 75 | networkx==3.1 76 | openai-whisper==20231117 77 | protobuf==4.25 78 | rich==13.7.1 79 | Requests==2.32.3 80 | safetensors==0.4.5 81 | soundfile==0.12.1 82 | tensorboard==2.14.0 83 | wget==3.2 84 | WeTextProcessing==1.0.3 -------------------------------------------------------------------------------- /src/GLM_4_Voice/__init__.py: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/Henry-23/VideoChat/3d6700c5f2c4e49220501898f96049579b39345e/src/GLM_4_Voice/__init__.py -------------------------------------------------------------------------------- /src/GLM_4_Voice/cosyvoice/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Henry-23/VideoChat/3d6700c5f2c4e49220501898f96049579b39345e/src/GLM_4_Voice/cosyvoice/__init__.py -------------------------------------------------------------------------------- /src/GLM_4_Voice/cosyvoice/cli/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Henry-23/VideoChat/3d6700c5f2c4e49220501898f96049579b39345e/src/GLM_4_Voice/cosyvoice/cli/__init__.py -------------------------------------------------------------------------------- /src/GLM_4_Voice/cosyvoice/dataset/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Henry-23/VideoChat/3d6700c5f2c4e49220501898f96049579b39345e/src/GLM_4_Voice/cosyvoice/dataset/__init__.py -------------------------------------------------------------------------------- /src/GLM_4_Voice/cosyvoice/flow/length_regulator.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Zhihao Du) 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
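# InterpolateRegulator (defined below) stretches or compresses a token-level feature
# sequence to a target number of frames: forward() takes x of shape (B, T, D) and a
# per-item length tensor ylens, nearest-neighbour interpolates along time to ylens.max(),
# refines the result with Conv1d + GroupNorm + Mish blocks, and zeroes padded positions.
# Rough usage sketch (shapes inferred from forward() below):
#   regulator = InterpolateRegulator(channels=80, sampling_ratios=(1,))
#   out, olens = regulator(x, ylens)   # x: (B, T, 80), ylens: (B,) long -> out: (B, max(ylens), 80)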
14 | from typing import Tuple 15 | import torch.nn as nn 16 | from torch.nn import functional as F 17 | from cosyvoice.utils.mask import make_pad_mask 18 | 19 | 20 | class InterpolateRegulator(nn.Module): 21 | def __init__( 22 | self, 23 | channels: int, 24 | sampling_ratios: Tuple, 25 | out_channels: int = None, 26 | groups: int = 1, 27 | ): 28 | super().__init__() 29 | self.sampling_ratios = sampling_ratios 30 | out_channels = out_channels or channels 31 | model = nn.ModuleList([]) 32 | if len(sampling_ratios) > 0: 33 | for _ in sampling_ratios: 34 | module = nn.Conv1d(channels, channels, 3, 1, 1) 35 | norm = nn.GroupNorm(groups, channels) 36 | act = nn.Mish() 37 | model.extend([module, norm, act]) 38 | model.append( 39 | nn.Conv1d(channels, out_channels, 1, 1) 40 | ) 41 | self.model = nn.Sequential(*model) 42 | 43 | def forward(self, x, ylens=None): 44 | # x in (B, T, D) 45 | mask = (~make_pad_mask(ylens)).to(x).unsqueeze(-1) 46 | x = F.interpolate(x.transpose(1, 2).contiguous(), size=ylens.max(), mode='nearest') 47 | out = self.model(x).transpose(1, 2).contiguous() 48 | olens = ylens 49 | return out * mask, olens 50 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/cosyvoice/hifigan/f0_predictor.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Kai Hu) 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
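# ConvRNNF0Predictor (defined below) is, despite its name, purely convolutional: five
# weight-normalised Conv1d(kernel_size=3) + ELU layers over mel features of shape
# (B, in_channels=80, T), followed by a per-frame Linear head and abs(), so the predicted
# F0 contour is non-negative. Rough usage sketch (shapes inferred from forward() below):
#   predictor = ConvRNNF0Predictor()
#   f0 = predictor(torch.randn(2, 80, 100))   # -> (2, 100)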
14 | import torch 15 | import torch.nn as nn 16 | from torch.nn.utils import weight_norm 17 | 18 | 19 | class ConvRNNF0Predictor(nn.Module): 20 | def __init__(self, 21 | num_class: int = 1, 22 | in_channels: int = 80, 23 | cond_channels: int = 512 24 | ): 25 | super().__init__() 26 | 27 | self.num_class = num_class 28 | self.condnet = nn.Sequential( 29 | weight_norm( 30 | nn.Conv1d(in_channels, cond_channels, kernel_size=3, padding=1) 31 | ), 32 | nn.ELU(), 33 | weight_norm( 34 | nn.Conv1d(cond_channels, cond_channels, kernel_size=3, padding=1) 35 | ), 36 | nn.ELU(), 37 | weight_norm( 38 | nn.Conv1d(cond_channels, cond_channels, kernel_size=3, padding=1) 39 | ), 40 | nn.ELU(), 41 | weight_norm( 42 | nn.Conv1d(cond_channels, cond_channels, kernel_size=3, padding=1) 43 | ), 44 | nn.ELU(), 45 | weight_norm( 46 | nn.Conv1d(cond_channels, cond_channels, kernel_size=3, padding=1) 47 | ), 48 | nn.ELU(), 49 | ) 50 | self.classifier = nn.Linear(in_features=cond_channels, out_features=self.num_class) 51 | 52 | def forward(self, x: torch.Tensor) -> torch.Tensor: 53 | x = self.condnet(x) 54 | x = x.transpose(1, 2) 55 | return torch.abs(self.classifier(x).squeeze(-1)) 56 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/cosyvoice/transformer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Henry-23/VideoChat/3d6700c5f2c4e49220501898f96049579b39345e/src/GLM_4_Voice/cosyvoice/transformer/__init__.py -------------------------------------------------------------------------------- /src/GLM_4_Voice/cosyvoice/transformer/activation.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 Johns Hopkins University (Shinji Watanabe) 2 | # 2020 Northwestern Polytechnical University (Pengcheng Guo) 3 | # 2020 Mobvoi Inc (Binbin Zhang) 4 | # 2024 Alibaba Inc (Xiang Lyu) 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | """Swish() activation function for Conformer.""" 18 | 19 | import torch 20 | from torch import nn, sin, pow 21 | from torch.nn import Parameter 22 | 23 | 24 | class Swish(torch.nn.Module): 25 | """Construct an Swish object.""" 26 | 27 | def forward(self, x: torch.Tensor) -> torch.Tensor: 28 | """Return Swish activation function.""" 29 | return x * torch.sigmoid(x) 30 | 31 | 32 | # Implementation adapted from https://github.com/EdwardDixon/snake under the MIT license. 33 | # LICENSE is in incl_licenses directory. 
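# Snake is applied channel-wise to (B, C, T) tensors, so in_features must equal C.
# Rough usage sketch (shapes inferred from forward() below):
#   act = Snake(in_features=256)
#   y = act(torch.randn(4, 256, 128))   # same shape; y = x + (1/alpha) * sin^2(alpha * x)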
34 | class Snake(nn.Module): 35 | ''' 36 | Implementation of a sine-based periodic activation function 37 | Shape: 38 | - Input: (B, C, T) 39 | - Output: (B, C, T), same shape as the input 40 | Parameters: 41 | - alpha - trainable parameter 42 | References: 43 | - This activation function is from this paper by Liu Ziyin, Tilman Hartwig, Masahito Ueda: 44 | https://arxiv.org/abs/2006.08195 45 | Examples: 46 | >>> a1 = Snake(256) 47 | >>> x = torch.randn(256) 48 | >>> x = a1(x) 49 | ''' 50 | def __init__(self, in_features, alpha=1.0, alpha_trainable=True, alpha_logscale=False): 51 | ''' 52 | Initialization. 53 | INPUT: 54 | - in_features: shape of the input 55 | - alpha: trainable parameter 56 | alpha is initialized to 1 by default, higher values = higher-frequency. 57 | alpha will be trained along with the rest of your model. 58 | ''' 59 | super(Snake, self).__init__() 60 | self.in_features = in_features 61 | 62 | # initialize alpha 63 | self.alpha_logscale = alpha_logscale 64 | if self.alpha_logscale: # log scale alphas initialized to zeros 65 | self.alpha = Parameter(torch.zeros(in_features) * alpha) 66 | else: # linear scale alphas initialized to ones 67 | self.alpha = Parameter(torch.ones(in_features) * alpha) 68 | 69 | self.alpha.requires_grad = alpha_trainable 70 | 71 | self.no_div_by_zero = 0.000000001 72 | 73 | def forward(self, x): 74 | ''' 75 | Forward pass of the function. 76 | Applies the function to the input elementwise. 77 | Snake ∶= x + 1/a * sin^2 (xa) 78 | ''' 79 | alpha = self.alpha.unsqueeze(0).unsqueeze(-1) # line up with x to [B, C, T] 80 | if self.alpha_logscale: 81 | alpha = torch.exp(alpha) 82 | x = x + (1.0 / (alpha + self.no_div_by_zero)) * pow(sin(x * alpha), 2) 83 | 84 | return x 85 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/cosyvoice/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Henry-23/VideoChat/3d6700c5f2c4e49220501898f96049579b39345e/src/GLM_4_Voice/cosyvoice/utils/__init__.py -------------------------------------------------------------------------------- /src/GLM_4_Voice/cosyvoice/utils/block_mask_util.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def create_grid_mask(seq_length, trunck_length, fill_triangle): 5 | assert seq_length > 0 6 | 7 | # First build a grid mask, ignoring seen_length for now: 8 | if fill_triangle: 9 | mask = 1 - torch.triu(torch.ones(seq_length, seq_length), diagonal=1) 10 | # the lower triangle and the main diagonal are all 1 11 | else: 12 | mask = torch.zeros(seq_length, seq_length) 13 | 14 | for i in range(seq_length): 15 | trunck_idx = i // trunck_length 16 | trunck_start = trunck_idx * trunck_length 17 | trunck_end = trunck_length + trunck_start 18 | mask[i][trunck_start:trunck_end] = 1 19 | 20 | return mask 21 | 22 | 23 | if __name__ == "__main__": 24 | mask = create_grid_mask(seq_length=8, trunck_length=3, fill_triangle=True).int() 25 | print(mask) 26 | # tensor([[1, 1, 1, 0, 0, 0, 0, 0], 27 | # [1, 1, 1, 0, 0, 0, 0, 0], 28 | # [1, 1, 1, 0, 0, 0, 0, 0], 29 | # [1, 1, 1, 1, 1, 1, 0, 0], 30 | # [1, 1, 1, 1, 1, 1, 0, 0], 31 | # [1, 1, 1, 1, 1, 1, 0, 0], 32 | # [1, 1, 1, 1, 1, 1, 1, 1], 33 | # [1, 1, 1, 1, 1, 1, 1, 1]]) 34 | 35 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/cosyvoice/utils/class_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright [2023-11-28] 2 | # 2024 Alibaba
Inc (authors: Xiang Lyu) 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | import torch 16 | 17 | from cosyvoice.transformer.activation import Swish 18 | from cosyvoice.transformer.subsampling import ( 19 | LinearNoSubsampling, 20 | EmbedinigNoSubsampling, 21 | Conv1dSubsampling2, 22 | Conv2dSubsampling4, 23 | Conv2dSubsampling6, 24 | Conv2dSubsampling8, 25 | ) 26 | from cosyvoice.transformer.embedding import (PositionalEncoding, 27 | RelPositionalEncoding, 28 | WhisperPositionalEncoding, 29 | LearnablePositionalEncoding, 30 | NoPositionalEncoding) 31 | from cosyvoice.transformer.attention import (MultiHeadedAttention, 32 | RelPositionMultiHeadedAttention, 33 | BlockRelPositionMultiHeadedAttention) 34 | from cosyvoice.transformer.embedding import EspnetRelPositionalEncoding 35 | from cosyvoice.transformer.subsampling import LegacyLinearNoSubsampling 36 | 37 | 38 | COSYVOICE_ACTIVATION_CLASSES = { 39 | "hardtanh": torch.nn.Hardtanh, 40 | "tanh": torch.nn.Tanh, 41 | "relu": torch.nn.ReLU, 42 | "selu": torch.nn.SELU, 43 | "swish": getattr(torch.nn, "SiLU", Swish), 44 | "gelu": torch.nn.GELU, 45 | } 46 | 47 | COSYVOICE_SUBSAMPLE_CLASSES = { 48 | "linear": LinearNoSubsampling, 49 | "linear_legacy": LegacyLinearNoSubsampling, 50 | "embed": EmbedinigNoSubsampling, 51 | "conv1d2": Conv1dSubsampling2, 52 | "conv2d": Conv2dSubsampling4, 53 | "conv2d6": Conv2dSubsampling6, 54 | "conv2d8": Conv2dSubsampling8, 55 | 'paraformer_dummy': torch.nn.Identity 56 | } 57 | 58 | COSYVOICE_EMB_CLASSES = { 59 | "embed": PositionalEncoding, 60 | "abs_pos": PositionalEncoding, 61 | "rel_pos": RelPositionalEncoding, 62 | "rel_pos_espnet": EspnetRelPositionalEncoding, 63 | "no_pos": NoPositionalEncoding, 64 | "abs_pos_whisper": WhisperPositionalEncoding, 65 | "embed_learnable_pe": LearnablePositionalEncoding, 66 | } 67 | 68 | COSYVOICE_ATTENTION_CLASSES = { 69 | "selfattn": MultiHeadedAttention, 70 | "rel_selfattn": RelPositionMultiHeadedAttention, 71 | "block_rel_selfattn": BlockRelPositionMultiHeadedAttention, 72 | } 73 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/cosyvoice/utils/file_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang) 2 | # 2024 Alibaba Inc (authors: Xiang Lyu) 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
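# Small I/O helpers used by the CosyVoice pipeline: read_lists/read_json_lists load list
# and JSON manifest files, load_wav reads a wav, downmixes it to mono and (asserting the
# source rate is higher when resampling is needed) resamples it to target_sr, and
# speed_change applies sox tempo/rate effects. Rough usage sketch (the path is a placeholder):
#   speech = load_wav('prompt.wav', target_sr=22050)   # -> (1, T) float tensor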
15 | 16 | import json 17 | import torchaudio 18 | 19 | 20 | def read_lists(list_file): 21 | lists = [] 22 | with open(list_file, 'r', encoding='utf8') as fin: 23 | for line in fin: 24 | lists.append(line.strip()) 25 | return lists 26 | 27 | def read_json_lists(list_file): 28 | lists = read_lists(list_file) 29 | results = {} 30 | for fn in lists: 31 | with open(fn, 'r', encoding='utf8') as fin: 32 | results.update(json.load(fin)) 33 | return results 34 | 35 | def load_wav(wav, target_sr): 36 | speech, sample_rate = torchaudio.load(wav) 37 | speech = speech.mean(dim=0, keepdim=True) 38 | if sample_rate != target_sr: 39 | assert sample_rate > target_sr, 'wav sample rate {} must be greater than {}'.format(sample_rate, target_sr) 40 | speech = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=target_sr)(speech) 41 | return speech 42 | 43 | def speed_change(waveform, sample_rate, speed_factor: str): 44 | effects = [ 45 | ["tempo", speed_factor], # speed_factor 46 | ["rate", f"{sample_rate}"] 47 | ] 48 | augmented_waveform, new_sample_rate = torchaudio.sox_effects.apply_effects_tensor( 49 | waveform, 50 | sample_rate, 51 | effects 52 | ) 53 | return augmented_waveform, new_sample_rate 54 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/requirements.txt: -------------------------------------------------------------------------------- 1 | conformer==0.3.2 2 | deepspeed==0.14.2; sys_platform == 'linux' 3 | diffusers==0.27.2 4 | fastapi==0.115.3 5 | fastapi-cli==0.0.4 6 | gdown==5.1.0 7 | gradio==5.3.0 8 | grpcio==1.57.0 9 | grpcio-tools==1.57.0 10 | huggingface_hub==0.25.2 11 | hydra-core==1.3.2 12 | HyperPyYAML==1.2.2 13 | inflect==7.3.1 14 | librosa==0.10.2 15 | lightning==2.2.4 16 | matplotlib==3.7.5 17 | modelscope==1.15.0 18 | 19 | networkx==3.1 20 | numpy==1.24.4 21 | omegaconf==2.3.0 22 | onnxruntime-gpu==1.16.0; sys_platform == 'linux' 23 | onnxruntime==1.16.0; sys_platform == 'darwin' or sys_platform == 'windows' 24 | openai-whisper==20231117 25 | protobuf==4.25 26 | pydantic==2.7.0 27 | rich==13.7.1 28 | Requests==2.32.3 29 | safetensors==0.4.5 30 | soundfile==0.12.1 31 | tensorboard==2.14.0 32 | transformers==4.44.1 33 | uvicorn==0.32.0 34 | wget==3.2 35 | WeTextProcessing==1.0.3 36 | torch==2.3.0 37 | torchaudio==2.3.0 38 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/resources/architecture.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Henry-23/VideoChat/3d6700c5f2c4e49220501898f96049579b39345e/src/GLM_4_Voice/resources/architecture.jpeg -------------------------------------------------------------------------------- /src/GLM_4_Voice/resources/web_demo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Henry-23/VideoChat/3d6700c5f2c4e49220501898f96049579b39345e/src/GLM_4_Voice/resources/web_demo.png -------------------------------------------------------------------------------- /src/GLM_4_Voice/speech_tokenizer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Henry-23/VideoChat/3d6700c5f2c4e49220501898f96049579b39345e/src/GLM_4_Voice/speech_tokenizer/__init__.py -------------------------------------------------------------------------------- /src/GLM_4_Voice/speech_tokenizer/configuration_whisper.py: 
-------------------------------------------------------------------------------- 1 | from transformers import WhisperConfig 2 | 3 | 4 | class WhisperVQConfig(WhisperConfig): 5 | def __init__(self, 6 | pooling_kernel_size=None, 7 | pooling_type="max", 8 | pooling_position=0, 9 | quantize_vocab_size=None, 10 | quantize_position=16, 11 | quantize_commit_coefficient=0.25, 12 | quantize_loss_scale=1.0, 13 | quantize_ema_decay=None, 14 | quantize_restart_interval=None, 15 | quantize_encoder_only=False, 16 | quantize_causal_encoder=False, 17 | quantize_causal_block_size=None, 18 | skip_language_detection=False, 19 | encoder_causal_attention=False, 20 | encoder_causal_convolution=False, 21 | **kwargs): 22 | self.pooling_kernel_size = pooling_kernel_size 23 | self.pooling_type = pooling_type 24 | self.pooling_position = pooling_position 25 | self.quantize_vocab_size = quantize_vocab_size 26 | self.quantize_position = quantize_position 27 | self.quantize_commit_coefficient = quantize_commit_coefficient 28 | self.quantize_loss_scale = quantize_loss_scale 29 | self.quantize_ema_decay = quantize_ema_decay 30 | self.quantize_restart_interval = quantize_restart_interval 31 | self.quantize_encoder_only = quantize_encoder_only 32 | self.quantize_causal_encoder = quantize_causal_encoder 33 | self.quantize_causal_block_size = quantize_causal_block_size 34 | self.skip_language_detection = skip_language_detection 35 | self.encoder_causal_attention = encoder_causal_attention 36 | self.encoder_causal_convolution = encoder_causal_convolution 37 | super().__init__(**kwargs) 38 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/.env.example: -------------------------------------------------------------------------------- 1 | # example of file for storing private and user specific environment variables, like keys or system paths 2 | # rename it to ".env" (excluded from version control by default) 3 | # .env is loaded by train.py automatically 4 | # hydra allows you to reference variables in .yaml configs with special syntax: ${oc.env:MY_VAR} 5 | 6 | MY_VAR="/home/user/my/system/path" 7 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | ## What does this PR do? 2 | 3 | 9 | 10 | Fixes #\ 11 | 12 | ## Before submitting 13 | 14 | - [ ] Did you make sure **title is self-explanatory** and **the description concisely explains the PR**? 15 | - [ ] Did you make sure your **PR does only one thing**, instead of bundling different changes together? 16 | - [ ] Did you list all the **breaking changes** introduced by this pull request? 17 | - [ ] Did you **test your PR locally** with `pytest` command? 18 | - [ ] Did you **run pre-commit hooks** with `pre-commit run -a` command? 19 | 20 | ## Did you have fun? 
21 | 22 | Make sure you had fun coding 🙃 23 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/.github/codecov.yml: -------------------------------------------------------------------------------- 1 | coverage: 2 | status: 3 | # measures overall project coverage 4 | project: 5 | default: 6 | threshold: 100% # how much decrease in coverage is needed to not consider success 7 | 8 | # measures PR or single commit coverage 9 | patch: 10 | default: 11 | threshold: 100% # how much decrease in coverage is needed to not consider success 12 | 13 | 14 | # project: off 15 | # patch: off 16 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | # To get started with Dependabot version updates, you'll need to specify which 2 | # package ecosystems to update and where the package manifests are located. 3 | # Please see the documentation for all configuration options: 4 | # https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates 5 | 6 | version: 2 7 | updates: 8 | - package-ecosystem: "pip" # See documentation for possible values 9 | directory: "/" # Location of package manifests 10 | target-branch: "dev" 11 | schedule: 12 | interval: "daily" 13 | ignore: 14 | - dependency-name: "pytorch-lightning" 15 | update-types: ["version-update:semver-patch"] 16 | - dependency-name: "torchmetrics" 17 | update-types: ["version-update:semver-patch"] 18 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/.github/release-drafter.yml: -------------------------------------------------------------------------------- 1 | name-template: "v$RESOLVED_VERSION" 2 | tag-template: "v$RESOLVED_VERSION" 3 | 4 | categories: 5 | - title: "🚀 Features" 6 | labels: 7 | - "feature" 8 | - "enhancement" 9 | - title: "🐛 Bug Fixes" 10 | labels: 11 | - "fix" 12 | - "bugfix" 13 | - "bug" 14 | - title: "🧹 Maintenance" 15 | labels: 16 | - "maintenance" 17 | - "dependencies" 18 | - "refactoring" 19 | - "cosmetic" 20 | - "chore" 21 | - title: "📝️ Documentation" 22 | labels: 23 | - "documentation" 24 | - "docs" 25 | 26 | change-template: "- $TITLE @$AUTHOR (#$NUMBER)" 27 | change-title-escapes: '\<*_&' # You can add # and @ to disable mentions 28 | 29 | version-resolver: 30 | major: 31 | labels: 32 | - "major" 33 | minor: 34 | labels: 35 | - "minor" 36 | patch: 37 | labels: 38 | - "patch" 39 | default: patch 40 | 41 | template: | 42 | ## Changes 43 | 44 | $CHANGES 45 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .venv 106 | env/ 107 | venv/ 108 | ENV/ 109 | env.bak/ 110 | venv.bak/ 111 | 112 | # Spyder project settings 113 | .spyderproject 114 | .spyproject 115 | 116 | # Rope project settings 117 | .ropeproject 118 | 119 | # mkdocs documentation 120 | /site 121 | 122 | # mypy 123 | .mypy_cache/ 124 | .dmypy.json 125 | dmypy.json 126 | 127 | # Pyre type checker 128 | .pyre/ 129 | 130 | ### VisualStudioCode 131 | .vscode/* 132 | !.vscode/settings.json 133 | !.vscode/tasks.json 134 | !.vscode/launch.json 135 | !.vscode/extensions.json 136 | *.code-workspace 137 | **/.vscode 138 | 139 | # JetBrains 140 | .idea/ 141 | 142 | # Data & Models 143 | *.h5 144 | *.tar 145 | *.tar.gz 146 | 147 | # Lightning-Hydra-Template 148 | configs/local/default.yaml 149 | /data/ 150 | /logs/ 151 | .env 152 | 153 | # Aim logging 154 | .aim 155 | 156 | # Cython complied files 157 | matcha/utils/monotonic_align/core.c 158 | 159 | # Ignoring hifigan checkpoint 160 | generator_v1 161 | g_02500000 162 | gradio_cached_examples/ 163 | synth_output/ 164 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | default_language_version: 2 | python: python3.10 3 | 4 | repos: 5 | - repo: https://github.com/pre-commit/pre-commit-hooks 6 | rev: v4.5.0 7 | hooks: 8 | # list of supported hooks: https://pre-commit.com/hooks.html 9 | - id: trailing-whitespace 10 | - id: end-of-file-fixer 11 | # - id: check-docstring-first 12 | - id: check-yaml 13 | - id: debug-statements 14 | - id: detect-private-key 15 | - id: check-toml 16 | - id: check-case-conflict 17 | - id: check-added-large-files 18 | 19 | # python code formatting 20 | - repo: https://github.com/psf/black 21 | rev: 23.12.1 22 | hooks: 23 | - id: black 24 | args: [--line-length, "120"] 25 | 26 | # python import sorting 27 | - repo: https://github.com/PyCQA/isort 28 | rev: 5.13.2 29 | hooks: 30 | - id: isort 31 | args: ["--profile", "black", "--filter-files"] 32 | 33 | # python upgrading syntax to newer 
version 34 | - repo: https://github.com/asottile/pyupgrade 35 | rev: v3.15.0 36 | hooks: 37 | - id: pyupgrade 38 | args: [--py38-plus] 39 | 40 | # python check (PEP8), programming errors and code complexity 41 | - repo: https://github.com/PyCQA/flake8 42 | rev: 7.0.0 43 | hooks: 44 | - id: flake8 45 | args: 46 | [ 47 | "--max-line-length", "120", 48 | "--extend-ignore", 49 | "E203,E402,E501,F401,F841,RST2,RST301", 50 | "--exclude", 51 | "logs/*,data/*,matcha/hifigan/*", 52 | ] 53 | additional_dependencies: [flake8-rst-docstrings==0.3.0] 54 | 55 | # pylint 56 | - repo: https://github.com/pycqa/pylint 57 | rev: v3.0.3 58 | hooks: 59 | - id: pylint 60 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/.project-root: -------------------------------------------------------------------------------- 1 | # this file is required for inferring the project root directory 2 | # do not delete 3 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Shivam Mehta 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.md 2 | include LICENSE.txt 3 | include requirements.*.txt 4 | include *.cff 5 | include requirements.txt 6 | include matcha/VERSION 7 | recursive-include matcha *.json 8 | recursive-include matcha *.html 9 | recursive-include matcha *.png 10 | recursive-include matcha *.md 11 | recursive-include matcha *.py 12 | recursive-include matcha *.pyx 13 | recursive-exclude tests * 14 | prune tests* 15 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/Makefile: -------------------------------------------------------------------------------- 1 | 2 | help: ## Show help 3 | @grep -E '^[.a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-30s\033[0m %s\n", $$1, $$2}' 4 | 5 | clean: ## Clean autogenerated files 6 | rm -rf dist 7 | find . -type f -name "*.DS_Store" -ls -delete 8 | find . | grep -E "(__pycache__|\.pyc|\.pyo)" | xargs rm -rf 9 | find . 
| grep -E ".pytest_cache" | xargs rm -rf 10 | find . | grep -E ".ipynb_checkpoints" | xargs rm -rf 11 | rm -f .coverage 12 | 13 | clean-logs: ## Clean logs 14 | rm -rf logs/** 15 | 16 | create-package: ## Create wheel and tar gz 17 | rm -rf dist/ 18 | python setup.py bdist_wheel --plat-name=manylinux1_x86_64 19 | python setup.py sdist 20 | python -m twine upload dist/* --verbose --skip-existing 21 | 22 | format: ## Run pre-commit hooks 23 | pre-commit run -a 24 | 25 | sync: ## Merge changes from main branch to your current branch 26 | git pull 27 | git pull origin main 28 | 29 | test: ## Run not slow tests 30 | pytest -k "not slow" 31 | 32 | test-full: ## Run all tests 33 | pytest 34 | 35 | train-ljspeech: ## Train the model 36 | python matcha/train.py experiment=ljspeech 37 | 38 | train-ljspeech-min: ## Train the model with minimum memory 39 | python matcha/train.py experiment=ljspeech_min_memory 40 | 41 | start_app: ## Start the app 42 | python matcha/app.py 43 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/configs/__init__.py: -------------------------------------------------------------------------------- 1 | # this file is needed here to include configs when building project as a package 2 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/configs/callbacks/default.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - model_checkpoint.yaml 3 | - model_summary.yaml 4 | - rich_progress_bar.yaml 5 | - _self_ 6 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/configs/callbacks/model_checkpoint.yaml: -------------------------------------------------------------------------------- 1 | # https://lightning.ai/docs/pytorch/stable/api/lightning.pytorch.callbacks.ModelCheckpoint.html 2 | 3 | model_checkpoint: 4 | _target_: lightning.pytorch.callbacks.ModelCheckpoint 5 | dirpath: ${paths.output_dir}/checkpoints # directory to save the model file 6 | filename: checkpoint_{epoch:03d} # checkpoint filename 7 | monitor: epoch # name of the logged metric which determines when model is improving 8 | verbose: False # verbosity mode 9 | save_last: true # additionally always save an exact copy of the last checkpoint to a file last.ckpt 10 | save_top_k: 10 # save k best models (determined by above metric) 11 | mode: "max" # "max" means higher metric value is better, can be also "min" 12 | auto_insert_metric_name: True # when True, the checkpoints filenames will contain the metric name 13 | save_weights_only: False # if True, then only the model’s weights will be saved 14 | every_n_train_steps: null # number of training steps between checkpoints 15 | train_time_interval: null # checkpoints are monitored at the specified time interval 16 | every_n_epochs: 100 # number of epochs between checkpoints 17 | save_on_train_epoch_end: null # whether to run checkpointing at the end of the training epoch or the end of validation 18 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/configs/callbacks/model_summary.yaml: -------------------------------------------------------------------------------- 1 | # https://lightning.ai/docs/pytorch/stable/api/lightning.pytorch.callbacks.RichModelSummary.html 2 | 3 | model_summary: 4 | _target_: lightning.pytorch.callbacks.RichModelSummary 
5 | max_depth: 3 # the maximum depth of layer nesting that the summary will include 6 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/configs/callbacks/none.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Henry-23/VideoChat/3d6700c5f2c4e49220501898f96049579b39345e/src/GLM_4_Voice/third_party/Matcha-TTS/configs/callbacks/none.yaml -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/configs/callbacks/rich_progress_bar.yaml: -------------------------------------------------------------------------------- 1 | # https://lightning.ai/docs/pytorch/latest/api/lightning.pytorch.callbacks.RichProgressBar.html 2 | 3 | rich_progress_bar: 4 | _target_: lightning.pytorch.callbacks.RichProgressBar 5 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/configs/data/hi-fi_en-US_female.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - ljspeech 3 | - _self_ 4 | 5 | # Dataset URL: https://ast-astrec.nict.go.jp/en/release/hi-fi-captain/ 6 | _target_: matcha.data.text_mel_datamodule.TextMelDataModule 7 | name: hi-fi_en-US_female 8 | train_filelist_path: data/filelists/hi-fi-captain-en-us-female_train.txt 9 | valid_filelist_path: data/filelists/hi-fi-captain-en-us-female_val.txt 10 | batch_size: 32 11 | cleaners: [english_cleaners_piper] 12 | data_statistics: # Computed for this dataset 13 | mel_mean: -6.38385 14 | mel_std: 2.541796 15 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/configs/data/ljspeech.yaml: -------------------------------------------------------------------------------- 1 | _target_: matcha.data.text_mel_datamodule.TextMelDataModule 2 | name: ljspeech 3 | train_filelist_path: data/filelists/ljs_audio_text_train_filelist.txt 4 | valid_filelist_path: data/filelists/ljs_audio_text_val_filelist.txt 5 | batch_size: 32 6 | num_workers: 20 7 | pin_memory: True 8 | cleaners: [english_cleaners2] 9 | add_blank: True 10 | n_spks: 1 11 | n_fft: 1024 12 | n_feats: 80 13 | sample_rate: 22050 14 | hop_length: 256 15 | win_length: 1024 16 | f_min: 0 17 | f_max: 8000 18 | data_statistics: # Computed for ljspeech dataset 19 | mel_mean: -5.536622 20 | mel_std: 2.116101 21 | seed: ${seed} 22 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/configs/data/vctk.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - ljspeech 3 | - _self_ 4 | 5 | _target_: matcha.data.text_mel_datamodule.TextMelDataModule 6 | name: vctk 7 | train_filelist_path: data/filelists/vctk_audio_sid_text_train_filelist.txt 8 | valid_filelist_path: data/filelists/vctk_audio_sid_text_val_filelist.txt 9 | batch_size: 32 10 | add_blank: True 11 | n_spks: 109 12 | data_statistics: # Computed for vctk dataset 13 | mel_mean: -6.630575 14 | mel_std: 2.482914 15 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/configs/debug/default.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | # default debugging setup, runs 1 full epoch 4 | # other debugging configs can inherit from this 
one 5 | 6 | # overwrite task name so debugging logs are stored in separate folder 7 | task_name: "debug" 8 | 9 | # disable callbacks and loggers during debugging 10 | # callbacks: null 11 | # logger: null 12 | 13 | extras: 14 | ignore_warnings: False 15 | enforce_tags: False 16 | 17 | # sets level of all command line loggers to 'DEBUG' 18 | # https://hydra.cc/docs/tutorials/basic/running_your_app/logging/ 19 | hydra: 20 | job_logging: 21 | root: 22 | level: DEBUG 23 | 24 | # use this to also set hydra loggers to 'DEBUG' 25 | # verbose: True 26 | 27 | trainer: 28 | max_epochs: 1 29 | accelerator: cpu # debuggers don't like gpus 30 | devices: 1 # debuggers don't like multiprocessing 31 | detect_anomaly: true # raise exception if NaN or +/-inf is detected in any tensor 32 | 33 | data: 34 | num_workers: 0 # debuggers don't like multiprocessing 35 | pin_memory: False # disable gpu memory pin 36 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/configs/debug/fdr.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | # runs 1 train, 1 validation and 1 test step 4 | 5 | defaults: 6 | - default 7 | 8 | trainer: 9 | fast_dev_run: true 10 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/configs/debug/limit.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | # uses only 1% of the training data and 5% of validation/test data 4 | 5 | defaults: 6 | - default 7 | 8 | trainer: 9 | max_epochs: 3 10 | limit_train_batches: 0.01 11 | limit_val_batches: 0.05 12 | limit_test_batches: 0.05 13 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/configs/debug/overfit.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | # overfits to 3 batches 4 | 5 | defaults: 6 | - default 7 | 8 | trainer: 9 | max_epochs: 20 10 | overfit_batches: 3 11 | 12 | # model ckpt and early stopping need to be disabled during overfitting 13 | callbacks: null 14 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/configs/debug/profiler.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | # runs with execution time profiling 4 | 5 | defaults: 6 | - default 7 | 8 | trainer: 9 | max_epochs: 1 10 | # profiler: "simple" 11 | profiler: "advanced" 12 | # profiler: "pytorch" 13 | accelerator: gpu 14 | 15 | limit_train_batches: 0.02 16 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/configs/eval.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | defaults: 4 | - _self_ 5 | - data: mnist # choose datamodule with `test_dataloader()` for evaluation 6 | - model: mnist 7 | - logger: null 8 | - trainer: default 9 | - paths: default 10 | - extras: default 11 | - hydra: default 12 | 13 | task_name: "eval" 14 | 15 | tags: ["dev"] 16 | 17 | # passing checkpoint path is necessary for evaluation 18 | ckpt_path: ??? 
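# `???` is Hydra's marker for a required value with no default: the run aborts unless a
# checkpoint is passed as an override, e.g. `ckpt_path=/path/to/checkpoints/last.ckpt`
# (hypothetical path) on the command line of whatever script loads this config.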
19 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/configs/experiment/hifi_dataset_piper_phonemizer.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | # to execute this experiment run: 4 | # python train.py experiment=hifi_dataset_piper_phonemizer 5 | 6 | defaults: 7 | - override /data: hi-fi_en-US_female.yaml 8 | 9 | # all parameters below will be merged with parameters from default configurations set above 10 | # this allows you to overwrite only specified parameters 11 | 12 | tags: ["hi-fi", "single_speaker", "piper_phonemizer", "en_US", "female"] 13 | 14 | run_name: hi-fi_en-US_female_piper_phonemizer 15 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/configs/experiment/ljspeech.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | # to execute this experiment run: 4 | # python train.py experiment=ljspeech 5 | 6 | defaults: 7 | - override /data: ljspeech.yaml 8 | 9 | # all parameters below will be merged with parameters from default configurations set above 10 | # this allows you to overwrite only specified parameters 11 | 12 | tags: ["ljspeech"] 13 | 14 | run_name: ljspeech 15 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/configs/experiment/ljspeech_min_memory.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | # to execute this experiment run: 4 | # python train.py experiment=ljspeech_min_memory 5 | 6 | defaults: 7 | - override /data: ljspeech.yaml 8 | 9 | # all parameters below will be merged with parameters from default configurations set above 10 | # this allows you to overwrite only specified parameters 11 | 12 | tags: ["ljspeech"] 13 | 14 | run_name: ljspeech_min 15 | 16 | 17 | model: 18 | out_size: 172 19 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/configs/experiment/multispeaker.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | # to execute this experiment run: 4 | # python train.py experiment=multispeaker 5 | 6 | defaults: 7 | - override /data: vctk.yaml 8 | 9 | # all parameters below will be merged with parameters from default configurations set above 10 | # this allows you to overwrite only specified parameters 11 | 12 | tags: ["multispeaker"] 13 | 14 | run_name: multispeaker 15 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/configs/extras/default.yaml: -------------------------------------------------------------------------------- 1 | # disable python warnings if they annoy you 2 | ignore_warnings: False 3 | 4 | # ask user for tags if none are provided in the config 5 | enforce_tags: True 6 | 7 | # pretty print config tree at the start of the run using Rich library 8 | print_config: True 9 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/configs/hparams_search/mnist_optuna.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | # example hyperparameter optimization of some experiment with Optuna: 4 | # python train.py
-m hparams_search=mnist_optuna experiment=example 5 | 6 | defaults: 7 | - override /hydra/sweeper: optuna 8 | 9 | # choose metric which will be optimized by Optuna 10 | # make sure this is the correct name of some metric logged in lightning module! 11 | optimized_metric: "val/acc_best" 12 | 13 | # here we define Optuna hyperparameter search 14 | # it optimizes for value returned from function with @hydra.main decorator 15 | # docs: https://hydra.cc/docs/next/plugins/optuna_sweeper 16 | hydra: 17 | mode: "MULTIRUN" # set hydra to multirun by default if this config is attached 18 | 19 | sweeper: 20 | _target_: hydra_plugins.hydra_optuna_sweeper.optuna_sweeper.OptunaSweeper 21 | 22 | # storage URL to persist optimization results 23 | # for example, you can use SQLite if you set 'sqlite:///example.db' 24 | storage: null 25 | 26 | # name of the study to persist optimization results 27 | study_name: null 28 | 29 | # number of parallel workers 30 | n_jobs: 1 31 | 32 | # 'minimize' or 'maximize' the objective 33 | direction: maximize 34 | 35 | # total number of runs that will be executed 36 | n_trials: 20 37 | 38 | # choose Optuna hyperparameter sampler 39 | # you can choose bayesian sampler (tpe), random search (without optimization), grid sampler, and others 40 | # docs: https://optuna.readthedocs.io/en/stable/reference/samplers.html 41 | sampler: 42 | _target_: optuna.samplers.TPESampler 43 | seed: 1234 44 | n_startup_trials: 10 # number of random sampling runs before optimization starts 45 | 46 | # define hyperparameter search space 47 | params: 48 | model.optimizer.lr: interval(0.0001, 0.1) 49 | data.batch_size: choice(32, 64, 128, 256) 50 | model.net.lin1_size: choice(64, 128, 256) 51 | model.net.lin2_size: choice(64, 128, 256) 52 | model.net.lin3_size: choice(32, 64, 128, 256) 53 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/configs/hydra/default.yaml: -------------------------------------------------------------------------------- 1 | # https://hydra.cc/docs/configure_hydra/intro/ 2 | 3 | # enable color logging 4 | defaults: 5 | - override hydra_logging: colorlog 6 | - override job_logging: colorlog 7 | 8 | # output directory, generated dynamically on each run 9 | run: 10 | dir: ${paths.log_dir}/${task_name}/${run_name}/runs/${now:%Y-%m-%d}_${now:%H-%M-%S} 11 | sweep: 12 | dir: ${paths.log_dir}/${task_name}/${run_name}/multiruns/${now:%Y-%m-%d}_${now:%H-%M-%S} 13 | subdir: ${hydra.job.num} 14 | 15 | job_logging: 16 | handlers: 17 | file: 18 | # Incorporates fix from https://github.com/facebookresearch/hydra/pull/2242 19 | filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log 20 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/configs/local/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Henry-23/VideoChat/3d6700c5f2c4e49220501898f96049579b39345e/src/GLM_4_Voice/third_party/Matcha-TTS/configs/local/.gitkeep -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/configs/logger/aim.yaml: -------------------------------------------------------------------------------- 1 | # https://aimstack.io/ 2 | 3 | # example usage in lightning module: 4 | # https://github.com/aimhubio/aim/blob/main/examples/pytorch_lightning_track.py 5 | 6 | # open the Aim UI with the following command (run in the folder 
containing the `.aim` folder): 7 | # `aim up` 8 | 9 | aim: 10 | _target_: aim.pytorch_lightning.AimLogger 11 | repo: ${paths.root_dir} # .aim folder will be created here 12 | # repo: "aim://ip_address:port" # can instead provide IP address pointing to Aim remote tracking server which manages the repo, see https://aimstack.readthedocs.io/en/latest/using/remote_tracking.html# 13 | 14 | # aim allows to group runs under experiment name 15 | experiment: null # any string, set to "default" if not specified 16 | 17 | train_metric_prefix: "train/" 18 | val_metric_prefix: "val/" 19 | test_metric_prefix: "test/" 20 | 21 | # sets the tracking interval in seconds for system usage metrics (CPU, GPU, memory, etc.) 22 | system_tracking_interval: 10 # set to null to disable system metrics tracking 23 | 24 | # enable/disable logging of system params such as installed packages, git info, env vars, etc. 25 | log_system_params: true 26 | 27 | # enable/disable tracking console logs (default value is true) 28 | capture_terminal_logs: false # set to false to avoid infinite console log loop issue https://github.com/aimhubio/aim/issues/2550 29 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/configs/logger/comet.yaml: -------------------------------------------------------------------------------- 1 | # https://www.comet.ml 2 | 3 | comet: 4 | _target_: lightning.pytorch.loggers.comet.CometLogger 5 | api_key: ${oc.env:COMET_API_TOKEN} # api key is loaded from environment variable 6 | save_dir: "${paths.output_dir}" 7 | project_name: "lightning-hydra-template" 8 | rest_api_key: null 9 | # experiment_name: "" 10 | experiment_key: null # set to resume experiment 11 | offline: False 12 | prefix: "" 13 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/configs/logger/csv.yaml: -------------------------------------------------------------------------------- 1 | # csv logger built in lightning 2 | 3 | csv: 4 | _target_: lightning.pytorch.loggers.csv_logs.CSVLogger 5 | save_dir: "${paths.output_dir}" 6 | name: "csv/" 7 | prefix: "" 8 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/configs/logger/many_loggers.yaml: -------------------------------------------------------------------------------- 1 | # train with many loggers at once 2 | 3 | defaults: 4 | # - comet 5 | - csv 6 | # - mlflow 7 | # - neptune 8 | - tensorboard 9 | - wandb 10 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/configs/logger/mlflow.yaml: -------------------------------------------------------------------------------- 1 | # https://mlflow.org 2 | 3 | mlflow: 4 | _target_: lightning.pytorch.loggers.mlflow.MLFlowLogger 5 | # experiment_name: "" 6 | # run_name: "" 7 | tracking_uri: ${paths.log_dir}/mlflow/mlruns # run `mlflow ui` command inside the `logs/mlflow/` dir to open the UI 8 | tags: null 9 | # save_dir: "./mlruns" 10 | prefix: "" 11 | artifact_location: null 12 | # run_id: "" 13 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/configs/logger/neptune.yaml: -------------------------------------------------------------------------------- 1 | # https://neptune.ai 2 | 3 | neptune: 4 | _target_: lightning.pytorch.loggers.neptune.NeptuneLogger 5 | api_key: ${oc.env:NEPTUNE_API_TOKEN} # api 
key is loaded from environment variable 6 | project: username/lightning-hydra-template 7 | # name: "" 8 | log_model_checkpoints: True 9 | prefix: "" 10 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/configs/logger/tensorboard.yaml: -------------------------------------------------------------------------------- 1 | # https://www.tensorflow.org/tensorboard/ 2 | 3 | tensorboard: 4 | _target_: lightning.pytorch.loggers.tensorboard.TensorBoardLogger 5 | save_dir: "${paths.output_dir}/tensorboard/" 6 | name: null 7 | log_graph: False 8 | default_hp_metric: True 9 | prefix: "" 10 | # version: "" 11 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/configs/logger/wandb.yaml: -------------------------------------------------------------------------------- 1 | # https://wandb.ai 2 | 3 | wandb: 4 | _target_: lightning.pytorch.loggers.wandb.WandbLogger 5 | # name: "" # name of the run (normally generated by wandb) 6 | save_dir: "${paths.output_dir}" 7 | offline: False 8 | id: null # pass correct id to resume experiment! 9 | anonymous: null # enable anonymous logging 10 | project: "lightning-hydra-template" 11 | log_model: False # upload lightning ckpts 12 | prefix: "" # a string to put at the beginning of metric keys 13 | # entity: "" # set to name of your wandb team 14 | group: "" 15 | tags: [] 16 | job_type: "" 17 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/configs/model/cfm/default.yaml: -------------------------------------------------------------------------------- 1 | name: CFM 2 | solver: euler 3 | sigma_min: 1e-4 4 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/configs/model/decoder/default.yaml: -------------------------------------------------------------------------------- 1 | channels: [256, 256] 2 | dropout: 0.05 3 | attention_head_dim: 64 4 | n_blocks: 1 5 | num_mid_blocks: 2 6 | num_heads: 2 7 | act_fn: snakebeta 8 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/configs/model/encoder/default.yaml: -------------------------------------------------------------------------------- 1 | encoder_type: RoPE Encoder 2 | encoder_params: 3 | n_feats: ${model.n_feats} 4 | n_channels: 192 5 | filter_channels: 768 6 | filter_channels_dp: 256 7 | n_heads: 2 8 | n_layers: 6 9 | kernel_size: 3 10 | p_dropout: 0.1 11 | spk_emb_dim: 64 12 | n_spks: 1 13 | prenet: true 14 | 15 | duration_predictor_params: 16 | filter_channels_dp: ${model.encoder.encoder_params.filter_channels_dp} 17 | kernel_size: 3 18 | p_dropout: ${model.encoder.encoder_params.p_dropout} 19 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/configs/model/matcha.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - _self_ 3 | - encoder: default.yaml 4 | - decoder: default.yaml 5 | - cfm: default.yaml 6 | - optimizer: adam.yaml 7 | 8 | _target_: matcha.models.matcha_tts.MatchaTTS 9 | n_vocab: 178 10 | n_spks: ${data.n_spks} 11 | spk_emb_dim: 64 12 | n_feats: 80 13 | data_statistics: ${data.data_statistics} 14 | out_size: null # Must be divisible by 4 15 | prior_loss: true 16 | 
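# The ${data.*} interpolations above mean n_spks and data_statistics are filled in from whichever
# data config is active, so they never need to be edited here. Illustrative command-line override:
# python train.py model.out_size=172 (the value configs/experiment/ljspeech_min_memory.yaml sets
# to lower memory usage).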
-------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/configs/model/optimizer/adam.yaml: -------------------------------------------------------------------------------- 1 | _target_: torch.optim.Adam 2 | _partial_: true 3 | lr: 1e-4 4 | weight_decay: 0.0 5 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/configs/paths/default.yaml: -------------------------------------------------------------------------------- 1 | # path to root directory 2 | # this requires PROJECT_ROOT environment variable to exist 3 | # you can replace it with "." if you want the root to be the current working directory 4 | root_dir: ${oc.env:PROJECT_ROOT} 5 | 6 | # path to data directory 7 | data_dir: ${paths.root_dir}/data/ 8 | 9 | # path to logging directory 10 | log_dir: ${paths.root_dir}/logs/ 11 | 12 | # path to output directory, created dynamically by hydra 13 | # path generation pattern is specified in `configs/hydra/default.yaml` 14 | # use it to store all files generated during the run, like ckpts and metrics 15 | output_dir: ${hydra:runtime.output_dir} 16 | 17 | # path to working directory 18 | work_dir: ${hydra:runtime.cwd} 19 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/configs/train.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | # specify here default configuration 4 | # order of defaults determines the order in which configs override each other 5 | defaults: 6 | - _self_ 7 | - data: ljspeech 8 | - model: matcha 9 | - callbacks: default 10 | - logger: tensorboard # set logger here or use command line (e.g. `python train.py logger=tensorboard`) 11 | - trainer: default 12 | - paths: default 13 | - extras: default 14 | - hydra: default 15 | 16 | # experiment configs allow for version control of specific hyperparameters 17 | # e.g. best hyperparameters for given model and datamodule 18 | - experiment: null 19 | 20 | # config for hyperparameter optimization 21 | - hparams_search: null 22 | 23 | # optional local config for machine/user specific settings 24 | # it's optional since it doesn't need to exist and is excluded from version control 25 | - optional local: default 26 | 27 | # debugging config (enable through command line, e.g. `python train.py debug=default) 28 | - debug: null 29 | 30 | # task name, determines output directory path 31 | task_name: "train" 32 | 33 | run_name: ??? 
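# run_name has no default (???), so every launch must supply one, e.g.
# python train.py experiment=ljspeech run_name=ljspeech_baseline (run name shown is illustrative);
# it also becomes part of the Hydra output directory defined in configs/hydra/default.yaml.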
34 | 35 | # tags to help you identify your experiments 36 | # you can overwrite this in experiment configs 37 | # overwrite from command line with `python train.py tags="[first_tag, second_tag]"` 38 | tags: ["dev"] 39 | 40 | # set False to skip model training 41 | train: True 42 | 43 | # evaluate on test set, using best model weights achieved during training 44 | # lightning chooses best weights based on the metric specified in checkpoint callback 45 | test: True 46 | 47 | # simply provide checkpoint path to resume training 48 | ckpt_path: null 49 | 50 | # seed for random number generators in pytorch, numpy and python.random 51 | seed: 1234 52 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/configs/trainer/cpu.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - default 3 | 4 | accelerator: cpu 5 | devices: 1 6 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/configs/trainer/ddp.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - default 3 | 4 | strategy: ddp 5 | 6 | accelerator: gpu 7 | devices: [0,1] 8 | num_nodes: 1 9 | sync_batchnorm: True 10 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/configs/trainer/ddp_sim.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - default 3 | 4 | # simulate DDP on CPU, useful for debugging 5 | accelerator: cpu 6 | devices: 2 7 | strategy: ddp_spawn 8 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/configs/trainer/default.yaml: -------------------------------------------------------------------------------- 1 | _target_: lightning.pytorch.trainer.Trainer 2 | 3 | default_root_dir: ${paths.output_dir} 4 | 5 | max_epochs: -1 6 | 7 | accelerator: gpu 8 | devices: [0] 9 | 10 | # mixed precision for extra speed-up 11 | precision: 16-mixed 12 | 13 | # perform a validation loop every N training epochs 14 | check_val_every_n_epoch: 1 15 | 16 | # set True to to ensure deterministic results 17 | # makes training slower but gives more reproducibility than just setting seeds 18 | deterministic: False 19 | 20 | gradient_clip_val: 5.0 21 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/configs/trainer/gpu.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - default 3 | 4 | accelerator: gpu 5 | devices: 1 6 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/configs/trainer/mps.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - default 3 | 4 | accelerator: mps 5 | devices: 1 6 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/data: -------------------------------------------------------------------------------- 1 | /home/smehta/Projects/Speech-Backbones/Grad-TTS/data -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/matcha/VERSION: -------------------------------------------------------------------------------- 1 | 0.0.5.1 
2 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/matcha/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Henry-23/VideoChat/3d6700c5f2c4e49220501898f96049579b39345e/src/GLM_4_Voice/third_party/Matcha-TTS/matcha/__init__.py -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/matcha/data/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Henry-23/VideoChat/3d6700c5f2c4e49220501898f96049579b39345e/src/GLM_4_Voice/third_party/Matcha-TTS/matcha/data/__init__.py -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/matcha/data/components/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Henry-23/VideoChat/3d6700c5f2c4e49220501898f96049579b39345e/src/GLM_4_Voice/third_party/Matcha-TTS/matcha/data/components/__init__.py -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/matcha/hifigan/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Jungil Kong 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/matcha/hifigan/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Henry-23/VideoChat/3d6700c5f2c4e49220501898f96049579b39345e/src/GLM_4_Voice/third_party/Matcha-TTS/matcha/hifigan/__init__.py -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/matcha/hifigan/config.py: -------------------------------------------------------------------------------- 1 | v1 = { 2 | "resblock": "1", 3 | "num_gpus": 0, 4 | "batch_size": 16, 5 | "learning_rate": 0.0004, 6 | "adam_b1": 0.8, 7 | "adam_b2": 0.99, 8 | "lr_decay": 0.999, 9 | "seed": 1234, 10 | "upsample_rates": [8, 8, 2, 2], 11 | "upsample_kernel_sizes": [16, 16, 4, 4], 12 | "upsample_initial_channel": 512, 13 | "resblock_kernel_sizes": [3, 7, 11], 14 | "resblock_dilation_sizes": [[1, 3, 5], [1, 3, 5], [1, 3, 5]], 15 | "resblock_initial_channel": 256, 16 | "segment_size": 8192, 17 | "num_mels": 80, 18 | "num_freq": 1025, 19 | "n_fft": 1024, 20 | "hop_size": 256, 21 | "win_size": 1024, 22 | "sampling_rate": 22050, 23 | "fmin": 0, 24 | "fmax": 8000, 25 | "fmax_loss": None, 26 | "num_workers": 4, 27 | "dist_config": {"dist_backend": "nccl", "dist_url": "tcp://localhost:54321", "world_size": 1}, 28 | } 29 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/matcha/hifigan/denoiser.py: -------------------------------------------------------------------------------- 1 | # Code modified from Rafael Valle's implementation https://github.com/NVIDIA/waveglow/blob/5bc2a53e20b3b533362f974cfa1ea0267ae1c2b1/denoiser.py 2 | 3 | """Waveglow style denoiser can be used to remove the artifacts from the HiFiGAN generated audio.""" 4 | import torch 5 | 6 | 7 | class Denoiser(torch.nn.Module): 8 | """Removes model bias from audio produced with waveglow""" 9 | 10 | def __init__(self, vocoder, filter_length=1024, n_overlap=4, win_length=1024, mode="zeros"): 11 | super().__init__() 12 | self.filter_length = filter_length 13 | self.hop_length = int(filter_length / n_overlap) 14 | self.win_length = win_length 15 | 16 | dtype, device = next(vocoder.parameters()).dtype, next(vocoder.parameters()).device 17 | self.device = device 18 | if mode == "zeros": 19 | mel_input = torch.zeros((1, 80, 88), dtype=dtype, device=device) 20 | elif mode == "normal": 21 | mel_input = torch.randn((1, 80, 88), dtype=dtype, device=device) 22 | else: 23 | raise Exception(f"Mode {mode} if not supported") 24 | 25 | def stft_fn(audio, n_fft, hop_length, win_length, window): 26 | spec = torch.stft( 27 | audio, 28 | n_fft=n_fft, 29 | hop_length=hop_length, 30 | win_length=win_length, 31 | window=window, 32 | return_complex=True, 33 | ) 34 | spec = torch.view_as_real(spec) 35 | return torch.sqrt(spec.pow(2).sum(-1)), torch.atan2(spec[..., -1], spec[..., 0]) 36 | 37 | self.stft = lambda x: stft_fn( 38 | audio=x, 39 | n_fft=self.filter_length, 40 | hop_length=self.hop_length, 41 | win_length=self.win_length, 42 | window=torch.hann_window(self.win_length, device=device), 43 | ) 44 | self.istft = lambda x, y: torch.istft( 45 | torch.complex(x * torch.cos(y), x * torch.sin(y)), 46 | n_fft=self.filter_length, 47 | hop_length=self.hop_length, 48 | win_length=self.win_length, 49 | window=torch.hann_window(self.win_length, device=device), 50 | ) 51 | 52 | with torch.no_grad(): 
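            # feeding the silent/noise mel through the vocoder below measures the bias the model adds to every output;
            # the first frame of its STFT magnitude is stored as bias_spec and subtracted (scaled by `strength`) in forward()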
53 | bias_audio = vocoder(mel_input).float().squeeze(0) 54 | bias_spec, _ = self.stft(bias_audio) 55 | 56 | self.register_buffer("bias_spec", bias_spec[:, :, 0][:, :, None]) 57 | 58 | @torch.inference_mode() 59 | def forward(self, audio, strength=0.0005): 60 | audio_spec, audio_angles = self.stft(audio) 61 | audio_spec_denoised = audio_spec - self.bias_spec.to(audio.device) * strength 62 | audio_spec_denoised = torch.clamp(audio_spec_denoised, 0.0) 63 | audio_denoised = self.istft(audio_spec_denoised, audio_angles) 64 | return audio_denoised 65 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/matcha/hifigan/env.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/jik876/hifi-gan """ 2 | 3 | import os 4 | import shutil 5 | 6 | 7 | class AttrDict(dict): 8 | def __init__(self, *args, **kwargs): 9 | super().__init__(*args, **kwargs) 10 | self.__dict__ = self 11 | 12 | 13 | def build_env(config, config_name, path): 14 | t_path = os.path.join(path, config_name) 15 | if config != t_path: 16 | os.makedirs(path, exist_ok=True) 17 | shutil.copyfile(config, os.path.join(path, config_name)) 18 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/matcha/hifigan/xutils.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/jik876/hifi-gan """ 2 | 3 | import glob 4 | import os 5 | 6 | import matplotlib 7 | import torch 8 | from torch.nn.utils import weight_norm 9 | 10 | matplotlib.use("Agg") 11 | import matplotlib.pylab as plt 12 | 13 | 14 | def plot_spectrogram(spectrogram): 15 | fig, ax = plt.subplots(figsize=(10, 2)) 16 | im = ax.imshow(spectrogram, aspect="auto", origin="lower", interpolation="none") 17 | plt.colorbar(im, ax=ax) 18 | 19 | fig.canvas.draw() 20 | plt.close() 21 | 22 | return fig 23 | 24 | 25 | def init_weights(m, mean=0.0, std=0.01): 26 | classname = m.__class__.__name__ 27 | if classname.find("Conv") != -1: 28 | m.weight.data.normal_(mean, std) 29 | 30 | 31 | def apply_weight_norm(m): 32 | classname = m.__class__.__name__ 33 | if classname.find("Conv") != -1: 34 | weight_norm(m) 35 | 36 | 37 | def get_padding(kernel_size, dilation=1): 38 | return int((kernel_size * dilation - dilation) / 2) 39 | 40 | 41 | def load_checkpoint(filepath, device): 42 | assert os.path.isfile(filepath) 43 | print(f"Loading '{filepath}'") 44 | checkpoint_dict = torch.load(filepath, map_location=device) 45 | print("Complete.") 46 | return checkpoint_dict 47 | 48 | 49 | def save_checkpoint(filepath, obj): 50 | print(f"Saving checkpoint to {filepath}") 51 | torch.save(obj, filepath) 52 | print("Complete.") 53 | 54 | 55 | def scan_checkpoint(cp_dir, prefix): 56 | pattern = os.path.join(cp_dir, prefix + "????????") 57 | cp_list = glob.glob(pattern) 58 | if len(cp_list) == 0: 59 | return None 60 | return sorted(cp_list)[-1] 61 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/matcha/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Henry-23/VideoChat/3d6700c5f2c4e49220501898f96049579b39345e/src/GLM_4_Voice/third_party/Matcha-TTS/matcha/models/__init__.py -------------------------------------------------------------------------------- 
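A minimal usage sketch for the Denoiser defined in matcha/hifigan/denoiser.py above, assuming a loaded HiFi-GAN generator `vocoder` that returns a [1, 1, T] waveform and an 80-bin mel tensor `mel` (both names are illustrative, not part of the repository):

import torch
from matcha.hifigan.denoiser import Denoiser

denoiser = Denoiser(vocoder, filter_length=1024, n_overlap=4, win_length=1024, mode="zeros")
with torch.no_grad():
    audio = vocoder(mel).clamp(-1.0, 1.0)                 # raw waveform, still carries the vocoder bias
    audio = denoiser(audio.squeeze(1), strength=0.00025)  # subtract the scaled bias magnitude spectrum

strength scales how much of the stored bias spectrum is removed; the class default is 0.0005.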
/src/GLM_4_Voice/third_party/Matcha-TTS/matcha/models/components/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Henry-23/VideoChat/3d6700c5f2c4e49220501898f96049579b39345e/src/GLM_4_Voice/third_party/Matcha-TTS/matcha/models/components/__init__.py -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/matcha/onnx/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Henry-23/VideoChat/3d6700c5f2c4e49220501898f96049579b39345e/src/GLM_4_Voice/third_party/Matcha-TTS/matcha/onnx/__init__.py -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/matcha/text/__init__.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/keithito/tacotron """ 2 | from matcha.text import cleaners 3 | from matcha.text.symbols import symbols 4 | 5 | # Mappings from symbol to numeric ID and vice versa: 6 | _symbol_to_id = {s: i for i, s in enumerate(symbols)} 7 | _id_to_symbol = {i: s for i, s in enumerate(symbols)} # pylint: disable=unnecessary-comprehension 8 | 9 | 10 | def text_to_sequence(text, cleaner_names): 11 | """Converts a string of text to a sequence of IDs corresponding to the symbols in the text. 12 | Args: 13 | text: string to convert to a sequence 14 | cleaner_names: names of the cleaner functions to run the text through 15 | Returns: 16 | List of integers corresponding to the symbols in the text 17 | """ 18 | sequence = [] 19 | 20 | clean_text = _clean_text(text, cleaner_names) 21 | for symbol in clean_text: 22 | symbol_id = _symbol_to_id[symbol] 23 | sequence += [symbol_id] 24 | return sequence 25 | 26 | 27 | def cleaned_text_to_sequence(cleaned_text): 28 | """Converts a string of text to a sequence of IDs corresponding to the symbols in the text. 
29 | Args: 30 | text: string to convert to a sequence 31 | Returns: 32 | List of integers corresponding to the symbols in the text 33 | """ 34 | sequence = [_symbol_to_id[symbol] for symbol in cleaned_text] 35 | return sequence 36 | 37 | 38 | def sequence_to_text(sequence): 39 | """Converts a sequence of IDs back to a string""" 40 | result = "" 41 | for symbol_id in sequence: 42 | s = _id_to_symbol[symbol_id] 43 | result += s 44 | return result 45 | 46 | 47 | def _clean_text(text, cleaner_names): 48 | for name in cleaner_names: 49 | cleaner = getattr(cleaners, name) 50 | if not cleaner: 51 | raise Exception("Unknown cleaner: %s" % name) 52 | text = cleaner(text) 53 | return text 54 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/matcha/text/numbers.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/keithito/tacotron """ 2 | 3 | import re 4 | 5 | import inflect 6 | 7 | _inflect = inflect.engine() 8 | _comma_number_re = re.compile(r"([0-9][0-9\,]+[0-9])") 9 | _decimal_number_re = re.compile(r"([0-9]+\.[0-9]+)") 10 | _pounds_re = re.compile(r"£([0-9\,]*[0-9]+)") 11 | _dollars_re = re.compile(r"\$([0-9\.\,]*[0-9]+)") 12 | _ordinal_re = re.compile(r"[0-9]+(st|nd|rd|th)") 13 | _number_re = re.compile(r"[0-9]+") 14 | 15 | 16 | def _remove_commas(m): 17 | return m.group(1).replace(",", "") 18 | 19 | 20 | def _expand_decimal_point(m): 21 | return m.group(1).replace(".", " point ") 22 | 23 | 24 | def _expand_dollars(m): 25 | match = m.group(1) 26 | parts = match.split(".") 27 | if len(parts) > 2: 28 | return match + " dollars" 29 | dollars = int(parts[0]) if parts[0] else 0 30 | cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0 31 | if dollars and cents: 32 | dollar_unit = "dollar" if dollars == 1 else "dollars" 33 | cent_unit = "cent" if cents == 1 else "cents" 34 | return f"{dollars} {dollar_unit}, {cents} {cent_unit}" 35 | elif dollars: 36 | dollar_unit = "dollar" if dollars == 1 else "dollars" 37 | return f"{dollars} {dollar_unit}" 38 | elif cents: 39 | cent_unit = "cent" if cents == 1 else "cents" 40 | return f"{cents} {cent_unit}" 41 | else: 42 | return "zero dollars" 43 | 44 | 45 | def _expand_ordinal(m): 46 | return _inflect.number_to_words(m.group(0)) 47 | 48 | 49 | def _expand_number(m): 50 | num = int(m.group(0)) 51 | if num > 1000 and num < 3000: 52 | if num == 2000: 53 | return "two thousand" 54 | elif num > 2000 and num < 2010: 55 | return "two thousand " + _inflect.number_to_words(num % 100) 56 | elif num % 100 == 0: 57 | return _inflect.number_to_words(num // 100) + " hundred" 58 | else: 59 | return _inflect.number_to_words(num, andword="", zero="oh", group=2).replace(", ", " ") 60 | else: 61 | return _inflect.number_to_words(num, andword="") 62 | 63 | 64 | def normalize_numbers(text): 65 | text = re.sub(_comma_number_re, _remove_commas, text) 66 | text = re.sub(_pounds_re, r"\1 pounds", text) 67 | text = re.sub(_dollars_re, _expand_dollars, text) 68 | text = re.sub(_decimal_number_re, _expand_decimal_point, text) 69 | text = re.sub(_ordinal_re, _expand_ordinal, text) 70 | text = re.sub(_number_re, _expand_number, text) 71 | return text 72 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/matcha/text/symbols.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/keithito/tacotron 2 | 3 | Defines the set 
of symbols used in text input to the model. 4 | """ 5 | _pad = "_" 6 | _punctuation = ';:,.!?¡¿—…"«»“” ' 7 | _letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz" 8 | _letters_ipa = ( 9 | "ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ" 10 | ) 11 | 12 | 13 | # Export all symbols: 14 | symbols = [_pad] + list(_punctuation) + list(_letters) + list(_letters_ipa) 15 | 16 | # Special symbol ids 17 | SPACE_ID = symbols.index(" ") 18 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/matcha/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from matcha.utils.instantiators import instantiate_callbacks, instantiate_loggers 2 | from matcha.utils.logging_utils import log_hyperparameters 3 | from matcha.utils.pylogger import get_pylogger 4 | from matcha.utils.rich_utils import enforce_tags, print_config_tree 5 | from matcha.utils.utils import extras, get_metric_value, task_wrapper 6 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/matcha/utils/audio.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.utils.data 4 | from librosa.filters import mel as librosa_mel_fn 5 | from scipy.io.wavfile import read 6 | 7 | MAX_WAV_VALUE = 32768.0 8 | 9 | 10 | def load_wav(full_path): 11 | sampling_rate, data = read(full_path) 12 | return data, sampling_rate 13 | 14 | 15 | def dynamic_range_compression(x, C=1, clip_val=1e-5): 16 | return np.log(np.clip(x, a_min=clip_val, a_max=None) * C) 17 | 18 | 19 | def dynamic_range_decompression(x, C=1): 20 | return np.exp(x) / C 21 | 22 | 23 | def dynamic_range_compression_torch(x, C=1, clip_val=1e-5): 24 | return torch.log(torch.clamp(x, min=clip_val) * C) 25 | 26 | 27 | def dynamic_range_decompression_torch(x, C=1): 28 | return torch.exp(x) / C 29 | 30 | 31 | def spectral_normalize_torch(magnitudes): 32 | output = dynamic_range_compression_torch(magnitudes) 33 | return output 34 | 35 | 36 | def spectral_de_normalize_torch(magnitudes): 37 | output = dynamic_range_decompression_torch(magnitudes) 38 | return output 39 | 40 | 41 | mel_basis = {} 42 | hann_window = {} 43 | 44 | 45 | def mel_spectrogram(y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False): 46 | if torch.min(y) < -1.0: 47 | print("min value is ", torch.min(y)) 48 | if torch.max(y) > 1.0: 49 | print("max value is ", torch.max(y)) 50 | 51 | global mel_basis, hann_window # pylint: disable=global-statement 52 | if f"{str(fmax)}_{str(y.device)}" not in mel_basis: 53 | mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax) 54 | mel_basis[str(fmax) + "_" + str(y.device)] = torch.from_numpy(mel).float().to(y.device) 55 | hann_window[str(y.device)] = torch.hann_window(win_size).to(y.device) 56 | 57 | y = torch.nn.functional.pad( 58 | y.unsqueeze(1), (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)), mode="reflect" 59 | ) 60 | y = y.squeeze(1) 61 | 62 | spec = torch.view_as_real( 63 | torch.stft( 64 | y, 65 | n_fft, 66 | hop_length=hop_size, 67 | win_length=win_size, 68 | window=hann_window[str(y.device)], 69 | center=center, 70 | pad_mode="reflect", 71 | normalized=False, 72 | onesided=True, 73 | return_complex=True, 74 | ) 75 | ) 76 | 77 | spec = torch.sqrt(spec.pow(2).sum(-1) + (1e-9)) 78 | 79 
| spec = torch.matmul(mel_basis[str(fmax) + "_" + str(y.device)], spec) 80 | spec = spectral_normalize_torch(spec) 81 | 82 | return spec 83 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/matcha/utils/instantiators.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | import hydra 4 | from lightning import Callback 5 | from lightning.pytorch.loggers import Logger 6 | from omegaconf import DictConfig 7 | 8 | from matcha.utils import pylogger 9 | 10 | log = pylogger.get_pylogger(__name__) 11 | 12 | 13 | def instantiate_callbacks(callbacks_cfg: DictConfig) -> List[Callback]: 14 | """Instantiates callbacks from config. 15 | 16 | :param callbacks_cfg: A DictConfig object containing callback configurations. 17 | :return: A list of instantiated callbacks. 18 | """ 19 | callbacks: List[Callback] = [] 20 | 21 | if not callbacks_cfg: 22 | log.warning("No callback configs found! Skipping..") 23 | return callbacks 24 | 25 | if not isinstance(callbacks_cfg, DictConfig): 26 | raise TypeError("Callbacks config must be a DictConfig!") 27 | 28 | for _, cb_conf in callbacks_cfg.items(): 29 | if isinstance(cb_conf, DictConfig) and "_target_" in cb_conf: 30 | log.info(f"Instantiating callback <{cb_conf._target_}>") # pylint: disable=protected-access 31 | callbacks.append(hydra.utils.instantiate(cb_conf)) 32 | 33 | return callbacks 34 | 35 | 36 | def instantiate_loggers(logger_cfg: DictConfig) -> List[Logger]: 37 | """Instantiates loggers from config. 38 | 39 | :param logger_cfg: A DictConfig object containing logger configurations. 40 | :return: A list of instantiated loggers. 41 | """ 42 | logger: List[Logger] = [] 43 | 44 | if not logger_cfg: 45 | log.warning("No logger configs found! Skipping...") 46 | return logger 47 | 48 | if not isinstance(logger_cfg, DictConfig): 49 | raise TypeError("Logger config must be a DictConfig!") 50 | 51 | for _, lg_conf in logger_cfg.items(): 52 | if isinstance(lg_conf, DictConfig) and "_target_" in lg_conf: 53 | log.info(f"Instantiating logger <{lg_conf._target_}>") # pylint: disable=protected-access 54 | logger.append(hydra.utils.instantiate(lg_conf)) 55 | 56 | return logger 57 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/matcha/utils/logging_utils.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Dict 2 | 3 | from lightning.pytorch.utilities import rank_zero_only 4 | from omegaconf import OmegaConf 5 | 6 | from matcha.utils import pylogger 7 | 8 | log = pylogger.get_pylogger(__name__) 9 | 10 | 11 | @rank_zero_only 12 | def log_hyperparameters(object_dict: Dict[str, Any]) -> None: 13 | """Controls which config parts are saved by Lightning loggers. 14 | 15 | Additionally saves: 16 | - Number of model parameters 17 | 18 | :param object_dict: A dictionary containing the following objects: 19 | - `"cfg"`: A DictConfig object containing the main config. 20 | - `"model"`: The Lightning model. 21 | - `"trainer"`: The Lightning trainer. 22 | """ 23 | hparams = {} 24 | 25 | cfg = OmegaConf.to_container(object_dict["cfg"]) 26 | model = object_dict["model"] 27 | trainer = object_dict["trainer"] 28 | 29 | if not trainer.logger: 30 | log.warning("Logger not found! 
Skipping hyperparameter logging...") 31 | return 32 | 33 | hparams["model"] = cfg["model"] 34 | 35 | # save number of model parameters 36 | hparams["model/params/total"] = sum(p.numel() for p in model.parameters()) 37 | hparams["model/params/trainable"] = sum(p.numel() for p in model.parameters() if p.requires_grad) 38 | hparams["model/params/non_trainable"] = sum(p.numel() for p in model.parameters() if not p.requires_grad) 39 | 40 | hparams["data"] = cfg["data"] 41 | hparams["trainer"] = cfg["trainer"] 42 | 43 | hparams["callbacks"] = cfg.get("callbacks") 44 | hparams["extras"] = cfg.get("extras") 45 | 46 | hparams["task_name"] = cfg.get("task_name") 47 | hparams["tags"] = cfg.get("tags") 48 | hparams["ckpt_path"] = cfg.get("ckpt_path") 49 | hparams["seed"] = cfg.get("seed") 50 | 51 | # send hparams to all loggers 52 | for logger in trainer.loggers: 53 | logger.log_hyperparams(hparams) 54 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/matcha/utils/model.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/jaywalnut310/glow-tts """ 2 | 3 | import numpy as np 4 | import torch 5 | 6 | 7 | def sequence_mask(length, max_length=None): 8 | if max_length is None: 9 | max_length = length.max() 10 | x = torch.arange(max_length, dtype=length.dtype, device=length.device) 11 | return x.unsqueeze(0) < length.unsqueeze(1) 12 | 13 | 14 | def fix_len_compatibility(length, num_downsamplings_in_unet=2): 15 | factor = torch.scalar_tensor(2).pow(num_downsamplings_in_unet) 16 | length = (length / factor).ceil() * factor 17 | if not torch.onnx.is_in_onnx_export(): 18 | return length.int().item() 19 | else: 20 | return length 21 | 22 | 23 | def convert_pad_shape(pad_shape): 24 | inverted_shape = pad_shape[::-1] 25 | pad_shape = [item for sublist in inverted_shape for item in sublist] 26 | return pad_shape 27 | 28 | 29 | def generate_path(duration, mask): 30 | device = duration.device 31 | 32 | b, t_x, t_y = mask.shape 33 | cum_duration = torch.cumsum(duration, 1) 34 | path = torch.zeros(b, t_x, t_y, dtype=mask.dtype).to(device=device) 35 | 36 | cum_duration_flat = cum_duration.view(b * t_x) 37 | path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype) 38 | path = path.view(b, t_x, t_y) 39 | path = path - torch.nn.functional.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1] 40 | path = path * mask 41 | return path 42 | 43 | 44 | def duration_loss(logw, logw_, lengths): 45 | loss = torch.sum((logw - logw_) ** 2) / torch.sum(lengths) 46 | return loss 47 | 48 | 49 | def normalize(data, mu, std): 50 | if not isinstance(mu, (float, int)): 51 | if isinstance(mu, list): 52 | mu = torch.tensor(mu, dtype=data.dtype, device=data.device) 53 | elif isinstance(mu, torch.Tensor): 54 | mu = mu.to(data.device) 55 | elif isinstance(mu, np.ndarray): 56 | mu = torch.from_numpy(mu).to(data.device) 57 | mu = mu.unsqueeze(-1) 58 | 59 | if not isinstance(std, (float, int)): 60 | if isinstance(std, list): 61 | std = torch.tensor(std, dtype=data.dtype, device=data.device) 62 | elif isinstance(std, torch.Tensor): 63 | std = std.to(data.device) 64 | elif isinstance(std, np.ndarray): 65 | std = torch.from_numpy(std).to(data.device) 66 | std = std.unsqueeze(-1) 67 | 68 | return (data - mu) / std 69 | 70 | 71 | def denormalize(data, mu, std): 72 | if not isinstance(mu, float): 73 | if isinstance(mu, list): 74 | mu = torch.tensor(mu, dtype=data.dtype, device=data.device) 75 | elif 
isinstance(mu, torch.Tensor): 76 | mu = mu.to(data.device) 77 | elif isinstance(mu, np.ndarray): 78 | mu = torch.from_numpy(mu).to(data.device) 79 | mu = mu.unsqueeze(-1) 80 | 81 | if not isinstance(std, float): 82 | if isinstance(std, list): 83 | std = torch.tensor(std, dtype=data.dtype, device=data.device) 84 | elif isinstance(std, torch.Tensor): 85 | std = std.to(data.device) 86 | elif isinstance(std, np.ndarray): 87 | std = torch.from_numpy(std).to(data.device) 88 | std = std.unsqueeze(-1) 89 | 90 | return data * std + mu 91 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/matcha/utils/monotonic_align/__init__.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | 4 | from matcha.utils.monotonic_align.core import maximum_path_c 5 | 6 | 7 | def maximum_path(value, mask): 8 | """Cython optimised version. 9 | value: [b, t_x, t_y] 10 | mask: [b, t_x, t_y] 11 | """ 12 | value = value * mask 13 | device = value.device 14 | dtype = value.dtype 15 | value = value.data.cpu().numpy().astype(np.float32) 16 | path = np.zeros_like(value).astype(np.int32) 17 | mask = mask.data.cpu().numpy() 18 | 19 | t_x_max = mask.sum(1)[:, 0].astype(np.int32) 20 | t_y_max = mask.sum(2)[:, 0].astype(np.int32) 21 | maximum_path_c(path, value, t_x_max, t_y_max) 22 | return torch.from_numpy(path).to(device=device, dtype=dtype) 23 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/matcha/utils/monotonic_align/core.pyx: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | cimport cython 4 | cimport numpy as np 5 | 6 | from cython.parallel import prange 7 | 8 | 9 | @cython.boundscheck(False) 10 | @cython.wraparound(False) 11 | cdef void maximum_path_each(int[:,::1] path, float[:,::1] value, int t_x, int t_y, float max_neg_val) nogil: 12 | cdef int x 13 | cdef int y 14 | cdef float v_prev 15 | cdef float v_cur 16 | cdef float tmp 17 | cdef int index = t_x - 1 18 | 19 | for y in range(t_y): 20 | for x in range(max(0, t_x + y - t_y), min(t_x, y + 1)): 21 | if x == y: 22 | v_cur = max_neg_val 23 | else: 24 | v_cur = value[x, y-1] 25 | if x == 0: 26 | if y == 0: 27 | v_prev = 0. 
28 | else: 29 | v_prev = max_neg_val 30 | else: 31 | v_prev = value[x-1, y-1] 32 | value[x, y] = max(v_cur, v_prev) + value[x, y] 33 | 34 | for y in range(t_y - 1, -1, -1): 35 | path[index, y] = 1 36 | if index != 0 and (index == y or value[index, y-1] < value[index-1, y-1]): 37 | index = index - 1 38 | 39 | 40 | @cython.boundscheck(False) 41 | @cython.wraparound(False) 42 | cpdef void maximum_path_c(int[:,:,::1] paths, float[:,:,::1] values, int[::1] t_xs, int[::1] t_ys, float max_neg_val=-1e9) nogil: 43 | cdef int b = values.shape[0] 44 | 45 | cdef int i 46 | for i in prange(b, nogil=True): 47 | maximum_path_each(paths[i], values[i], t_xs[i], t_ys[i], max_neg_val) 48 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/matcha/utils/monotonic_align/setup.py: -------------------------------------------------------------------------------- 1 | # from distutils.core import setup 2 | # from Cython.Build import cythonize 3 | # import numpy 4 | 5 | # setup(name='monotonic_align', 6 | # ext_modules=cythonize("core.pyx"), 7 | # include_dirs=[numpy.get_include()]) 8 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/matcha/utils/pylogger.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from lightning.pytorch.utilities import rank_zero_only 4 | 5 | 6 | def get_pylogger(name: str = __name__) -> logging.Logger: 7 | """Initializes a multi-GPU-friendly python command line logger. 8 | 9 | :param name: The name of the logger, defaults to ``__name__``. 10 | 11 | :return: A logger object. 12 | """ 13 | logger = logging.getLogger(name) 14 | 15 | # this ensures all logging levels get marked with the rank zero decorator 16 | # otherwise logs would get multiplied for each GPU process in multi-GPU setup 17 | logging_levels = ("debug", "info", "warning", "error", "exception", "fatal", "critical") 18 | for level in logging_levels: 19 | setattr(logger, level, rank_zero_only(getattr(logger, level))) 20 | 21 | return logger 22 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/matcha/utils/rich_utils.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | from typing import Sequence 3 | 4 | import rich 5 | import rich.syntax 6 | import rich.tree 7 | from hydra.core.hydra_config import HydraConfig 8 | from lightning.pytorch.utilities import rank_zero_only 9 | from omegaconf import DictConfig, OmegaConf, open_dict 10 | from rich.prompt import Prompt 11 | 12 | from matcha.utils import pylogger 13 | 14 | log = pylogger.get_pylogger(__name__) 15 | 16 | 17 | @rank_zero_only 18 | def print_config_tree( 19 | cfg: DictConfig, 20 | print_order: Sequence[str] = ( 21 | "data", 22 | "model", 23 | "callbacks", 24 | "logger", 25 | "trainer", 26 | "paths", 27 | "extras", 28 | ), 29 | resolve: bool = False, 30 | save_to_file: bool = False, 31 | ) -> None: 32 | """Prints the contents of a DictConfig as a tree structure using the Rich library. 33 | 34 | :param cfg: A DictConfig composed by Hydra. 35 | :param print_order: Determines in what order config components are printed. Default is ``("data", "model", 36 | "callbacks", "logger", "trainer", "paths", "extras")``. 37 | :param resolve: Whether to resolve reference fields of DictConfig. Default is ``False``. 
38 | :param save_to_file: Whether to export config to the hydra output folder. Default is ``False``. 39 | """ 40 | style = "dim" 41 | tree = rich.tree.Tree("CONFIG", style=style, guide_style=style) 42 | 43 | queue = [] 44 | 45 | # add fields from `print_order` to queue 46 | for field in print_order: 47 | _ = ( 48 | queue.append(field) 49 | if field in cfg 50 | else log.warning(f"Field '{field}' not found in config. Skipping '{field}' config printing...") 51 | ) 52 | 53 | # add all the other fields to queue (not specified in `print_order`) 54 | for field in cfg: 55 | if field not in queue: 56 | queue.append(field) 57 | 58 | # generate config tree from queue 59 | for field in queue: 60 | branch = tree.add(field, style=style, guide_style=style) 61 | 62 | config_group = cfg[field] 63 | if isinstance(config_group, DictConfig): 64 | branch_content = OmegaConf.to_yaml(config_group, resolve=resolve) 65 | else: 66 | branch_content = str(config_group) 67 | 68 | branch.add(rich.syntax.Syntax(branch_content, "yaml")) 69 | 70 | # print config tree 71 | rich.print(tree) 72 | 73 | # save config tree to file 74 | if save_to_file: 75 | with open(Path(cfg.paths.output_dir, "config_tree.log"), "w") as file: 76 | rich.print(tree, file=file) 77 | 78 | 79 | @rank_zero_only 80 | def enforce_tags(cfg: DictConfig, save_to_file: bool = False) -> None: 81 | """Prompts user to input tags from command line if no tags are provided in config. 82 | 83 | :param cfg: A DictConfig composed by Hydra. 84 | :param save_to_file: Whether to export tags to the hydra output folder. Default is ``False``. 85 | """ 86 | if not cfg.get("tags"): 87 | if "id" in HydraConfig().cfg.hydra.job: 88 | raise ValueError("Specify tags before launching a multirun!") 89 | 90 | log.warning("No tags provided in config. 
Prompting user to input tags...") 91 | tags = Prompt.ask("Enter a list of comma separated tags", default="dev") 92 | tags = [t.strip() for t in tags.split(",") if t != ""] 93 | 94 | with open_dict(cfg): 95 | cfg.tags = tags 96 | 97 | log.info(f"Tags: {cfg.tags}") 98 | 99 | if save_to_file: 100 | with open(Path(cfg.paths.output_dir, "tags.log"), "w") as file: 101 | rich.print(cfg.tags, file=file) 102 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/notebooks/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Henry-23/VideoChat/3d6700c5f2c4e49220501898f96049579b39345e/src/GLM_4_Voice/third_party/Matcha-TTS/notebooks/.gitkeep -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools", "wheel", "cython==0.29.35", "numpy==1.24.3", "packaging"] 3 | 4 | [tool.black] 5 | line-length = 120 6 | target-version = ['py310'] 7 | exclude = ''' 8 | 9 | ( 10 | /( 11 | \.eggs # exclude a few common directories in the 12 | | \.git # root of the project 13 | | \.hg 14 | | \.mypy_cache 15 | | \.tox 16 | | \.venv 17 | | _build 18 | | buck-out 19 | | build 20 | | dist 21 | )/ 22 | | foo.py # also separately exclude a file named foo.py in 23 | # the root of the project 24 | ) 25 | ''' 26 | 27 | [tool.pytest.ini_options] 28 | addopts = [ 29 | "--color=yes", 30 | "--durations=0", 31 | "--strict-markers", 32 | "--doctest-modules", 33 | ] 34 | filterwarnings = [ 35 | "ignore::DeprecationWarning", 36 | "ignore::UserWarning", 37 | ] 38 | log_cli = "True" 39 | markers = [ 40 | "slow: slow tests", 41 | ] 42 | minversion = "6.0" 43 | testpaths = "tests/" 44 | 45 | [tool.coverage.report] 46 | exclude_lines = [ 47 | "pragma: nocover", 48 | "raise NotImplementedError", 49 | "raise NotImplementedError()", 50 | "if __name__ == .__main__.:", 51 | ] 52 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/requirements.txt: -------------------------------------------------------------------------------- 1 | # --------- pytorch --------- # 2 | torch>=2.0.0 3 | torchvision>=0.15.0 4 | lightning>=2.0.0 5 | torchmetrics>=0.11.4 6 | 7 | # --------- hydra --------- # 8 | hydra-core==1.3.2 9 | hydra-colorlog==1.2.0 10 | hydra-optuna-sweeper==1.2.0 11 | 12 | # --------- loggers --------- # 13 | # wandb 14 | # neptune-client 15 | # mlflow 16 | # comet-ml 17 | # aim>=3.16.2 # no lower than 3.16.2, see https://github.com/aimhubio/aim/issues/2550 18 | 19 | # --------- others --------- # 20 | rootutils # standardizing the project root setup 21 | pre-commit # hooks for applying linters on commit 22 | rich # beautiful text formatting in terminal 23 | pytest # tests 24 | # sh # for running bash commands in some tests (linux/macos only) 25 | phonemizer # phonemization of text 26 | tensorboard 27 | librosa 28 | Cython 29 | numpy 30 | einops 31 | inflect 32 | Unidecode 33 | scipy 34 | torchaudio 35 | matplotlib 36 | pandas 37 | conformer==0.3.2 38 | diffusers==0.25.0 39 | notebook 40 | ipywidgets 41 | gradio==3.43.2 42 | gdown 43 | wget 44 | seaborn 45 | piper_phonemize 46 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/scripts/schedule.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Schedule execution of many runs 3 | # Run from root folder with: bash scripts/schedule.sh 4 | 5 | python src/train.py trainer.max_epochs=5 logger=csv 6 | 7 | python src/train.py trainer.max_epochs=10 logger=csv 8 | -------------------------------------------------------------------------------- /src/GLM_4_Voice/third_party/Matcha-TTS/setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import os 3 | 4 | import numpy 5 | from Cython.Build import cythonize 6 | from setuptools import Extension, find_packages, setup 7 | 8 | exts = [ 9 | Extension( 10 | name="matcha.utils.monotonic_align.core", 11 | sources=["matcha/utils/monotonic_align/core.pyx"], 12 | ) 13 | ] 14 | 15 | with open("README.md", encoding="utf-8") as readme_file: 16 | README = readme_file.read() 17 | 18 | cwd = os.path.dirname(os.path.abspath(__file__)) 19 | with open(os.path.join(cwd, "matcha", "VERSION")) as fin: 20 | version = fin.read().strip() 21 | 22 | setup( 23 | name="matcha-tts", 24 | version=version, 25 | description="🍵 Matcha-TTS: A fast TTS architecture with conditional flow matching", 26 | long_description=README, 27 | long_description_content_type="text/markdown", 28 | author="Shivam Mehta", 29 | author_email="shivam.mehta25@gmail.com", 30 | url="https://shivammehta25.github.io/Matcha-TTS", 31 | install_requires=[str(r) for r in open(os.path.join(os.path.dirname(__file__), "requirements.txt"))], 32 | include_dirs=[numpy.get_include()], 33 | include_package_data=True, 34 | packages=find_packages(exclude=["tests", "tests/*", "examples", "examples/*"]), 35 | # use this to customize global commands available in the terminal after installing the package 36 | entry_points={ 37 | "console_scripts": [ 38 | "matcha-data-stats=matcha.utils.generate_data_statistics:main", 39 | "matcha-tts=matcha.cli:cli", 40 | "matcha-tts-app=matcha.app:main", 41 | ] 42 | }, 43 | ext_modules=cythonize(exts, language_level=3), 44 | python_requires=">=3.9.0", 45 | ) 46 | -------------------------------------------------------------------------------- /src/GPT_SoVITS/AR/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Henry-23/VideoChat/3d6700c5f2c4e49220501898f96049579b39345e/src/GPT_SoVITS/AR/__init__.py -------------------------------------------------------------------------------- /src/GPT_SoVITS/AR/data/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Henry-23/VideoChat/3d6700c5f2c4e49220501898f96049579b39345e/src/GPT_SoVITS/AR/data/__init__.py -------------------------------------------------------------------------------- /src/GPT_SoVITS/AR/data/data_module.py: -------------------------------------------------------------------------------- 1 | # modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/data/data_module.py 2 | # reference: https://github.com/lifeiteng/vall-e 3 | from pytorch_lightning import LightningDataModule 4 | from AR.data.bucket_sampler import DistributedBucketSampler 5 | from AR.data.dataset import Text2SemanticDataset 6 | from torch.utils.data import DataLoader 7 | 8 | 9 | class Text2SemanticDataModule(LightningDataModule): 10 | def __init__( 11 | self, 12 | config, 13 | train_semantic_path, 14 | train_phoneme_path, 15 | dev_semantic_path=None, 16 | 
dev_phoneme_path=None, 17 | ): 18 | super().__init__() 19 | self.config = config 20 | self.train_semantic_path = train_semantic_path 21 | self.train_phoneme_path = train_phoneme_path 22 | self.dev_semantic_path = dev_semantic_path 23 | self.dev_phoneme_path = dev_phoneme_path 24 | self.num_workers = self.config["data"]["num_workers"] 25 | 26 | def prepare_data(self): 27 | pass 28 | 29 | def setup(self, stage=None, output_logs=False): 30 | self._train_dataset = Text2SemanticDataset( 31 | phoneme_path=self.train_phoneme_path, 32 | semantic_path=self.train_semantic_path, 33 | max_sec=self.config["data"]["max_sec"], 34 | pad_val=self.config["data"]["pad_val"], 35 | ) 36 | self._dev_dataset = self._train_dataset 37 | # self._dev_dataset = Text2SemanticDataset( 38 | # phoneme_path=self.dev_phoneme_path, 39 | # semantic_path=self.dev_semantic_path, 40 | # max_sample=self.config['data']['max_eval_sample'], 41 | # max_sec=self.config['data']['max_sec'], 42 | # pad_val=self.config['data']['pad_val']) 43 | 44 | def train_dataloader(self): 45 | batch_size=self.config["train"]["batch_size"]//2 if self.config["train"].get("if_dpo",False)==True else self.config["train"]["batch_size"] 46 | batch_size = max(min(batch_size,len(self._train_dataset)//4),1)#防止不保存 47 | sampler = DistributedBucketSampler(self._train_dataset, batch_size=batch_size) 48 | return DataLoader( 49 | self._train_dataset, 50 | batch_size=batch_size, 51 | sampler=sampler, 52 | collate_fn=self._train_dataset.collate, 53 | num_workers=self.num_workers, 54 | persistent_workers=True, 55 | prefetch_factor=16, 56 | ) 57 | 58 | def val_dataloader(self): 59 | return DataLoader( 60 | self._dev_dataset, 61 | batch_size=1, 62 | shuffle=False, 63 | collate_fn=self._train_dataset.collate, 64 | num_workers=max(self.num_workers, 12), 65 | persistent_workers=True, 66 | prefetch_factor=16, 67 | ) 68 | 69 | # 这个会使用到嘛? 
70 | def test_dataloader(self): 71 | return DataLoader( 72 | self._dev_dataset, 73 | batch_size=1, 74 | shuffle=False, 75 | collate_fn=self._train_dataset.collate, 76 | ) 77 | -------------------------------------------------------------------------------- /src/GPT_SoVITS/AR/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Henry-23/VideoChat/3d6700c5f2c4e49220501898f96049579b39345e/src/GPT_SoVITS/AR/models/__init__.py -------------------------------------------------------------------------------- /src/GPT_SoVITS/AR/modules/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Henry-23/VideoChat/3d6700c5f2c4e49220501898f96049579b39345e/src/GPT_SoVITS/AR/modules/__init__.py -------------------------------------------------------------------------------- /src/GPT_SoVITS/AR/modules/embedding.py: -------------------------------------------------------------------------------- 1 | # modified from https://github.com/lifeiteng/vall-e/blob/main/valle/modules/embedding.py 2 | import math 3 | 4 | import torch 5 | from torch import nn 6 | 7 | 8 | class TokenEmbedding(nn.Module): 9 | def __init__( 10 | self, 11 | embedding_dim: int, 12 | vocab_size: int, 13 | dropout: float = 0.0, 14 | ): 15 | super().__init__() 16 | 17 | self.vocab_size = vocab_size 18 | self.embedding_dim = embedding_dim 19 | 20 | self.dropout = torch.nn.Dropout(p=dropout) 21 | self.word_embeddings = nn.Embedding(self.vocab_size, self.embedding_dim) 22 | 23 | @property 24 | def weight(self) -> torch.Tensor: 25 | return self.word_embeddings.weight 26 | 27 | def embedding(self, index: int) -> torch.Tensor: 28 | return self.word_embeddings.weight[index : index + 1] 29 | 30 | def forward(self, x: torch.Tensor): 31 | x = self.word_embeddings(x) 32 | x = self.dropout(x) 33 | return x 34 | 35 | 36 | class SinePositionalEmbedding(nn.Module): 37 | def __init__( 38 | self, 39 | embedding_dim: int, 40 | dropout: float = 0.0, 41 | scale: bool = False, 42 | alpha: bool = False, 43 | ): 44 | super().__init__() 45 | self.embedding_dim = embedding_dim 46 | self.x_scale = math.sqrt(embedding_dim) if scale else 1.0 47 | self.alpha = nn.Parameter(torch.ones(1), requires_grad=alpha) 48 | self.dropout = torch.nn.Dropout(p=dropout) 49 | 50 | self.reverse = False 51 | self.pe = None 52 | self.extend_pe(torch.tensor(0.0).expand(1, 4000)) 53 | 54 | def extend_pe(self, x): 55 | """Reset the positional encodings.""" 56 | if self.pe is not None: 57 | if self.pe.size(1) >= x.size(1): 58 | if self.pe.dtype != x.dtype or self.pe.device != x.device: 59 | self.pe = self.pe.to(dtype=x.dtype, device=x.device) 60 | return 61 | pe = torch.zeros(x.size(1), self.embedding_dim) 62 | if self.reverse: 63 | position = torch.arange( 64 | x.size(1) - 1, -1, -1.0, dtype=torch.float32 65 | ).unsqueeze(1) 66 | else: 67 | position = torch.arange(0, x.size(1), dtype=torch.float32).unsqueeze(1) 68 | div_term = torch.exp( 69 | torch.arange(0, self.embedding_dim, 2, dtype=torch.float32) 70 | * -(math.log(10000.0) / self.embedding_dim) 71 | ) 72 | pe[:, 0::2] = torch.sin(position * div_term) 73 | pe[:, 1::2] = torch.cos(position * div_term) 74 | pe = pe.unsqueeze(0) 75 | self.pe = pe.to(device=x.device, dtype=x.dtype).detach() 76 | 77 | def forward(self, x: torch.Tensor) -> torch.Tensor: 78 | self.extend_pe(x) 79 | output = x.unsqueeze(-1) if x.ndim == 2 else x 80 | output = output * self.x_scale + self.alpha * self.pe[:, : 
x.size(1)] 81 | return self.dropout(output) 82 | -------------------------------------------------------------------------------- /src/GPT_SoVITS/AR/modules/embedding_onnx.py: -------------------------------------------------------------------------------- 1 | # modified from https://github.com/lifeiteng/vall-e/blob/main/valle/modules/embedding.py 2 | import math 3 | 4 | import torch 5 | from torch import nn 6 | 7 | 8 | class TokenEmbedding(nn.Module): 9 | def __init__( 10 | self, 11 | embedding_dim: int, 12 | vocab_size: int, 13 | dropout: float = 0.0, 14 | ): 15 | super().__init__() 16 | 17 | self.vocab_size = vocab_size 18 | self.embedding_dim = embedding_dim 19 | 20 | self.dropout = torch.nn.Dropout(p=dropout) 21 | self.word_embeddings = nn.Embedding(self.vocab_size, self.embedding_dim) 22 | 23 | @property 24 | def weight(self) -> torch.Tensor: 25 | return self.word_embeddings.weight 26 | 27 | def embedding(self, index: int) -> torch.Tensor: 28 | return self.word_embeddings.weight[index : index + 1] 29 | 30 | def forward(self, x: torch.Tensor): 31 | x = self.word_embeddings(x) 32 | x = self.dropout(x) 33 | return x 34 | 35 | 36 | class SinePositionalEmbedding(nn.Module): 37 | def __init__( 38 | self, 39 | embedding_dim: int, 40 | dropout: float = 0.0, 41 | scale: bool = False, 42 | alpha: bool = False, 43 | ): 44 | super().__init__() 45 | self.embedding_dim = embedding_dim 46 | self.x_scale = math.sqrt(embedding_dim) if scale else 1.0 47 | self.alpha = nn.Parameter(torch.ones(1), requires_grad=alpha) 48 | self.dropout = torch.nn.Dropout(p=dropout) 49 | self.reverse = False 50 | self.div_term = torch.exp(torch.arange(0, self.embedding_dim, 2) * -(math.log(10000.0) / self.embedding_dim)) 51 | 52 | def extend_pe(self, x): 53 | position = torch.cumsum(torch.ones_like(x[:,:,0]), dim=1).transpose(0, 1) 54 | scpe = (position * self.div_term).unsqueeze(0) 55 | pe = torch.cat([torch.sin(scpe), torch.cos(scpe)]).permute(1, 2, 0) 56 | pe = pe.contiguous().view(1, -1, self.embedding_dim) 57 | return pe 58 | 59 | def forward(self, x: torch.Tensor) -> torch.Tensor: 60 | pe = self.extend_pe(x) 61 | output = x.unsqueeze(-1) if x.ndim == 2 else x 62 | output = output * self.x_scale + self.alpha * pe 63 | return self.dropout(output) 64 | -------------------------------------------------------------------------------- /src/GPT_SoVITS/AR/modules/lr_schedulers.py: -------------------------------------------------------------------------------- 1 | # modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/modules/lr_schedulers.py 2 | # reference: https://github.com/lifeiteng/vall-e 3 | import math 4 | 5 | import torch 6 | from matplotlib import pyplot as plt 7 | from torch import nn 8 | from torch.optim import Adam 9 | 10 | 11 | class WarmupCosineLRSchedule(torch.optim.lr_scheduler._LRScheduler): 12 | """ 13 | Implements Warmup learning rate schedule until 'warmup_steps', going from 'init_lr' to 'peak_lr' for multiple optimizers. 
14 | """ 15 | 16 | def __init__( 17 | self, 18 | optimizer, 19 | init_lr, 20 | peak_lr, 21 | end_lr, 22 | warmup_steps=10000, 23 | total_steps=400000, 24 | current_step=0, 25 | ): 26 | self.init_lr = init_lr 27 | self.peak_lr = peak_lr 28 | self.end_lr = end_lr 29 | self.optimizer = optimizer 30 | self._warmup_rate = (peak_lr - init_lr) / warmup_steps 31 | self._decay_rate = (end_lr - peak_lr) / (total_steps - warmup_steps) 32 | self._current_step = current_step 33 | self.lr = init_lr 34 | self.warmup_steps = warmup_steps 35 | self.total_steps = total_steps 36 | self._last_lr = [self.lr] 37 | 38 | def set_lr(self, lr): 39 | self._last_lr = [g["lr"] for g in self.optimizer.param_groups] 40 | for g in self.optimizer.param_groups: 41 | # g['lr'] = lr 42 | g["lr"] = self.end_lr ###锁定用线性 43 | 44 | def step(self): 45 | if self._current_step < self.warmup_steps: 46 | lr = self.init_lr + self._warmup_rate * self._current_step 47 | 48 | elif self._current_step > self.total_steps: 49 | lr = self.end_lr 50 | 51 | else: 52 | decay_ratio = (self._current_step - self.warmup_steps) / ( 53 | self.total_steps - self.warmup_steps 54 | ) 55 | if decay_ratio < 0.0 or decay_ratio > 1.0: 56 | raise RuntimeError( 57 | "Decay ratio must be in [0.0, 1.0]. Fix LR scheduler settings." 58 | ) 59 | coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio)) 60 | lr = self.end_lr + coeff * (self.peak_lr - self.end_lr) 61 | 62 | self.lr = lr = self.end_lr = 0.002 ###锁定用线性###不听话,直接锁定! 63 | self.set_lr(lr) 64 | self.lr = lr 65 | self._current_step += 1 66 | return self.lr 67 | 68 | 69 | if __name__ == "__main__": 70 | m = nn.Linear(10, 10) 71 | opt = Adam(m.parameters(), lr=1e-4) 72 | s = WarmupCosineLRSchedule( 73 | opt, 1e-6, 2e-4, 1e-6, warmup_steps=2000, total_steps=20000, current_step=0 74 | ) 75 | lrs = [] 76 | for i in range(25000): 77 | s.step() 78 | lrs.append(s.lr) 79 | print(s.lr) 80 | 81 | plt.plot(lrs) 82 | plt.plot(range(0, 25000), lrs) 83 | plt.show() 84 | -------------------------------------------------------------------------------- /src/GPT_SoVITS/AR/modules/patched_mha_with_cache_onnx.py: -------------------------------------------------------------------------------- 1 | from torch.nn.functional import * 2 | from torch.nn.functional import ( 3 | _mha_shape_check, 4 | _canonical_mask, 5 | _none_or_dtype, 6 | _in_projection_packed, 7 | ) 8 | 9 | def multi_head_attention_forward_patched( 10 | query, 11 | key, 12 | value, 13 | embed_dim_to_check: int, 14 | num_heads: int, 15 | in_proj_weight, 16 | in_proj_bias: Optional[Tensor], 17 | bias_k: Optional[Tensor], 18 | bias_v: Optional[Tensor], 19 | add_zero_attn: bool, 20 | dropout_p: float, 21 | out_proj_weight: Tensor, 22 | out_proj_bias: Optional[Tensor], 23 | training: bool = True, 24 | key_padding_mask: Optional[Tensor] = None, 25 | need_weights: bool = True, 26 | attn_mask: Optional[Tensor] = None, 27 | use_separate_proj_weight: bool = False, 28 | q_proj_weight: Optional[Tensor] = None, 29 | k_proj_weight: Optional[Tensor] = None, 30 | v_proj_weight: Optional[Tensor] = None, 31 | static_k: Optional[Tensor] = None, 32 | static_v: Optional[Tensor] = None, 33 | average_attn_weights: bool = True, 34 | is_causal: bool = False, 35 | cache=None, 36 | ) -> Tuple[Tensor, Optional[Tensor]]: 37 | 38 | # set up shape vars 39 | _, _, embed_dim = query.shape 40 | attn_mask = _canonical_mask( 41 | mask=attn_mask, 42 | mask_name="attn_mask", 43 | other_type=None, 44 | other_name="", 45 | target_type=query.dtype, 46 | check_other=False, 47 | ) 48 | head_dim = embed_dim // 
num_heads 49 | 50 | proj_qkv = linear(query, in_proj_weight, in_proj_bias) 51 | proj_qkv = proj_qkv.unflatten(-1, (3, query.size(-1))).unsqueeze(0).transpose(0, -2).squeeze(-2).contiguous() 52 | q, k, v = proj_qkv[0], proj_qkv[1], proj_qkv[2] 53 | 54 | if cache["first_infer"] == 1: 55 | cache["k"][cache["stage"]] = k 56 | cache["v"][cache["stage"]] = v 57 | else: 58 | cache["k"][cache["stage"]] = torch.cat([cache["k"][cache["stage"]][:-1], k], 0) 59 | cache["v"][cache["stage"]] = torch.cat([cache["v"][cache["stage"]][:-1], v], 0) 60 | k = cache["k"][cache["stage"]] 61 | v = cache["v"][cache["stage"]] 62 | cache["stage"] = (cache["stage"] + 1) % cache["all_stage"] 63 | 64 | attn_mask = _canonical_mask( 65 | mask=attn_mask, 66 | mask_name="attn_mask", 67 | other_type=None, 68 | other_name="", 69 | target_type=q.dtype, 70 | check_other=False, 71 | ) 72 | attn_mask = attn_mask.unsqueeze(0) 73 | 74 | q = q.view(-1, num_heads, head_dim).transpose(0, 1) 75 | k = k.view(-1, num_heads, head_dim).transpose(0, 1) 76 | v = v.view(-1, num_heads, head_dim).transpose(0, 1) 77 | 78 | dropout_p = 0.0 79 | attn_mask = attn_mask.unsqueeze(0) 80 | q = q.view(num_heads, -1, head_dim).unsqueeze(0) 81 | k = k.view(num_heads, -1, head_dim).unsqueeze(0) 82 | v = v.view(num_heads, -1, head_dim).unsqueeze(0) 83 | attn_output = scaled_dot_product_attention( 84 | q, k, v, attn_mask, dropout_p, is_causal 85 | ) 86 | attn_output = ( 87 | attn_output.permute(2, 0, 1, 3).contiguous().view(-1, embed_dim) 88 | ) 89 | attn_output = linear(attn_output, out_proj_weight, out_proj_bias) 90 | attn_output = attn_output.view(-1, 1, attn_output.size(1)) 91 | 92 | return attn_output 93 | -------------------------------------------------------------------------------- /src/GPT_SoVITS/AR/text_processing/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Henry-23/VideoChat/3d6700c5f2c4e49220501898f96049579b39345e/src/GPT_SoVITS/AR/text_processing/__init__.py -------------------------------------------------------------------------------- /src/GPT_SoVITS/AR/text_processing/phonemizer.py: -------------------------------------------------------------------------------- 1 | # modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/text_processing/phonemizer.py 2 | # reference: https://github.com/lifeiteng/vall-e 3 | import itertools 4 | import re 5 | from typing import Dict 6 | from typing import List 7 | 8 | import regex 9 | from gruut import sentences 10 | from gruut.const import Sentence 11 | from gruut.const import Word 12 | from src.GPT_SoVITS.AR.text_processing.symbols import SYMBOL_TO_ID 13 | 14 | 15 | class GruutPhonemizer: 16 | def __init__(self, language: str): 17 | self._phonemizer = sentences 18 | self.lang = language 19 | self.symbol_to_id = SYMBOL_TO_ID 20 | self._special_cases_dict: Dict[str] = { 21 | r"\.\.\.": "... ", 22 | ";": "; ", 23 | ":": ": ", 24 | ",": ", ", 25 | r"\.": ". ", 26 | "!": "! ", 27 | r"\?": "? 
", 28 | "—": "—", 29 | "…": "… ", 30 | "«": "«", 31 | "»": "»", 32 | } 33 | self._punctuation_regexp: str = ( 34 | rf"([{''.join(self._special_cases_dict.keys())}])" 35 | ) 36 | 37 | def _normalize_punctuation(self, text: str) -> str: 38 | text = regex.sub(rf"\pZ+{self._punctuation_regexp}", r"\1", text) 39 | text = regex.sub(rf"{self._punctuation_regexp}(\pL)", r"\1 \2", text) 40 | text = regex.sub(r"\pZ+", r" ", text) 41 | return text.strip() 42 | 43 | def _convert_punctuation(self, word: Word) -> str: 44 | if not word.phonemes: 45 | return "" 46 | if word.phonemes[0] in ["‖", "|"]: 47 | return word.text.strip() 48 | 49 | phonemes = "".join(word.phonemes) 50 | # remove modifier characters ˈˌː with regex 51 | phonemes = re.sub(r"[ˈˌː͡]", "", phonemes) 52 | return phonemes.strip() 53 | 54 | def phonemize(self, text: str, espeak: bool = False) -> str: 55 | text_to_phonemize: str = self._normalize_punctuation(text) 56 | sents: List[Sentence] = [ 57 | sent 58 | for sent in self._phonemizer(text_to_phonemize, lang="en-us", espeak=espeak) 59 | ] 60 | words: List[str] = [ 61 | self._convert_punctuation(word) for word in itertools.chain(*sents) 62 | ] 63 | return " ".join(words) 64 | 65 | def transform(self, phonemes): 66 | # convert phonemes to ids 67 | # dictionary is in symbols.py 68 | return [self.symbol_to_id[p] for p in phonemes if p in self.symbol_to_id.keys()] 69 | 70 | 71 | if __name__ == "__main__": 72 | phonemizer = GruutPhonemizer("en-us") 73 | # text -> IPA 74 | phonemes = phonemizer.phonemize("Hello, wor-ld ?") 75 | print("phonemes:", phonemes) 76 | print("len(phonemes):", len(phonemes)) 77 | phoneme_ids = phonemizer.transform(phonemes) 78 | print("phoneme_ids:", phoneme_ids) 79 | print("len(phoneme_ids):", len(phoneme_ids)) 80 | -------------------------------------------------------------------------------- /src/GPT_SoVITS/AR/text_processing/symbols.py: -------------------------------------------------------------------------------- 1 | # modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/text_processing/symbols.py 2 | # reference: https://github.com/lifeiteng/vall-e 3 | PAD = "_" 4 | PUNCTUATION = ';:,.!?¡¿—…"«»“” ' 5 | LETTERS = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz" 6 | IPA_LETTERS = "ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ" 7 | SYMBOLS = [PAD] + list(PUNCTUATION) + list(LETTERS) + list(IPA_LETTERS) 8 | SPACE_ID = SYMBOLS.index(" ") 9 | SYMBOL_TO_ID = {s: i for i, s in enumerate(SYMBOLS)} 10 | ID_TO_SYMBOL = {i: s for i, s in enumerate(SYMBOLS)} 11 | -------------------------------------------------------------------------------- /src/GPT_SoVITS/AR/utils/__init__.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | 4 | def str2bool(str): 5 | return True if str.lower() == 'true' else False 6 | 7 | 8 | def get_newest_ckpt(string_list): 9 | # 定义一个正则表达式模式,用于匹配字符串中的数字 10 | pattern = r'epoch=(\d+)-step=(\d+)\.ckpt' 11 | 12 | # 使用正则表达式提取每个字符串中的数字信息,并创建一个包含元组的列表 13 | extracted_info = [] 14 | for string in string_list: 15 | match = re.match(pattern, string) 16 | if match: 17 | epoch = int(match.group(1)) 18 | step = int(match.group(2)) 19 | extracted_info.append((epoch, step, string)) 20 | # 按照 epoch 后面的数字和 step 后面的数字进行排序 21 | sorted_info = sorted( 22 | extracted_info, key=lambda x: (x[0], x[1]), reverse=True) 23 | # 获取最新的 ckpt 文件名 24 | newest_ckpt = sorted_info[0][2] 25 | return newest_ckpt 26 | 27 | 28 | # 
文本存在且不为空时 return True 29 | def check_txt_file(file_path): 30 | try: 31 | with open(file_path, 'r') as file: 32 | text = file.readline().strip() 33 | assert text.strip() != '' 34 | return text 35 | except Exception: 36 | return False 37 | return False 38 | -------------------------------------------------------------------------------- /src/GPT_SoVITS/AR/utils/initialize.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """Initialize modules for espnet2 neural networks.""" 3 | import torch 4 | from typeguard import check_argument_types 5 | 6 | 7 | def initialize(model: torch.nn.Module, init: str): 8 | """Initialize weights of a neural network module. 9 | 10 | Parameters are initialized using the given method or distribution. 11 | 12 | Custom initialization routines can be implemented into submodules 13 | as function `espnet_initialization_fn` within the custom module. 14 | 15 | Args: 16 | model: Target. 17 | init: Method of initialization. 18 | """ 19 | assert check_argument_types() 20 | print("init with", init) 21 | 22 | # weight init 23 | for p in model.parameters(): 24 | if p.dim() > 1: 25 | if init == "xavier_uniform": 26 | torch.nn.init.xavier_uniform_(p.data) 27 | elif init == "xavier_normal": 28 | torch.nn.init.xavier_normal_(p.data) 29 | elif init == "kaiming_uniform": 30 | torch.nn.init.kaiming_uniform_(p.data, nonlinearity="relu") 31 | elif init == "kaiming_normal": 32 | torch.nn.init.kaiming_normal_(p.data, nonlinearity="relu") 33 | else: 34 | raise ValueError("Unknown initialization: " + init) 35 | # bias init 36 | for name, p in model.named_parameters(): 37 | if ".bias" in name and p.dim() == 1: 38 | p.data.zero_() 39 | -------------------------------------------------------------------------------- /src/GPT_SoVITS/AR/utils/io.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | import torch 4 | import yaml 5 | 6 | 7 | def load_yaml_config(path): 8 | with open(path) as f: 9 | config = yaml.full_load(f) 10 | return config 11 | 12 | 13 | def save_config_to_yaml(config, path): 14 | assert path.endswith(".yaml") 15 | with open(path, "w") as f: 16 | f.write(yaml.dump(config)) 17 | f.close() 18 | 19 | 20 | def write_args(args, path): 21 | args_dict = dict( 22 | (name, getattr(args, name)) for name in dir(args) if not name.startswith("_") 23 | ) 24 | with open(path, "a") as args_file: 25 | args_file.write("==> torch version: {}\n".format(torch.__version__)) 26 | args_file.write( 27 | "==> cudnn version: {}\n".format(torch.backends.cudnn.version()) 28 | ) 29 | args_file.write("==> Cmd:\n") 30 | args_file.write(str(sys.argv)) 31 | args_file.write("\n==> args:\n") 32 | for k, v in sorted(args_dict.items()): 33 | args_file.write(" %s: %s\n" % (str(k), str(v))) 34 | args_file.close() 35 | -------------------------------------------------------------------------------- /src/GPT_SoVITS/TTS_infer_pack/__init__.py: -------------------------------------------------------------------------------- 1 | from src.GPT_SoVITS.TTS_infer_pack import TTS, text_segmentation_method -------------------------------------------------------------------------------- /src/GPT_SoVITS/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Henry-23/VideoChat/3d6700c5f2c4e49220501898f96049579b39345e/src/GPT_SoVITS/__init__.py -------------------------------------------------------------------------------- 
/src/GPT_SoVITS/configs/s1.yaml: -------------------------------------------------------------------------------- 1 | train: 2 | seed: 1234 3 | epochs: 300 4 | batch_size: 8 5 | gradient_accumulation: 4 6 | save_every_n_epoch: 1 7 | precision: 16 8 | gradient_clip: 1.0 9 | optimizer: 10 | lr: 0.01 11 | lr_init: 0.00001 12 | lr_end: 0.0001 13 | warmup_steps: 2000 14 | decay_steps: 40000 15 | data: 16 | max_eval_sample: 8 17 | max_sec: 54 18 | num_workers: 1 19 | pad_val: 1024 # same with EOS in model 20 | model: 21 | vocab_size: 1025 22 | phoneme_vocab_size: 512 23 | embedding_dim: 512 24 | hidden_dim: 512 25 | head: 16 26 | linear_units: 2048 27 | n_layer: 12 28 | dropout: 0 29 | EOS: 1024 30 | inference: 31 | top_k: 5 32 | -------------------------------------------------------------------------------- /src/GPT_SoVITS/configs/s1big.yaml: -------------------------------------------------------------------------------- 1 | train: 2 | seed: 1234 3 | epochs: 300 4 | batch_size: 8 5 | gradient_accumulation: 4 6 | save_every_n_epoch: 1 7 | precision: 16-mixed 8 | gradient_clip: 1.0 9 | optimizer: 10 | lr: 0.01 11 | lr_init: 0.00001 12 | lr_end: 0.0001 13 | warmup_steps: 2000 14 | decay_steps: 40000 15 | data: 16 | max_eval_sample: 8 17 | max_sec: 54 18 | num_workers: 1 19 | pad_val: 1024 # same with EOS in model 20 | model: 21 | vocab_size: 1025 22 | phoneme_vocab_size: 512 23 | embedding_dim: 1024 24 | hidden_dim: 1024 25 | head: 16 26 | linear_units: 2048 27 | n_layer: 16 28 | dropout: 0 29 | EOS: 1024 30 | inference: 31 | top_k: 5 32 | -------------------------------------------------------------------------------- /src/GPT_SoVITS/configs/s1big2.yaml: -------------------------------------------------------------------------------- 1 | train: 2 | seed: 1234 3 | epochs: 300 4 | batch_size: 12 5 | gradient_accumulation: 4 6 | save_every_n_epoch: 1 7 | precision: 16-mixed 8 | gradient_clip: 1.0 9 | optimizer: 10 | lr: 0.01 11 | lr_init: 0.00001 12 | lr_end: 0.0001 13 | warmup_steps: 2000 14 | decay_steps: 40000 15 | data: 16 | max_eval_sample: 8 17 | max_sec: 54 18 | num_workers: 1 19 | pad_val: 1024 # same with EOS in model 20 | model: 21 | vocab_size: 1025 22 | phoneme_vocab_size: 512 23 | embedding_dim: 1024 24 | hidden_dim: 1024 25 | head: 16 26 | linear_units: 2048 27 | n_layer: 6 28 | dropout: 0 29 | EOS: 1024 30 | inference: 31 | top_k: 5 32 | -------------------------------------------------------------------------------- /src/GPT_SoVITS/configs/s1longer-v2.yaml: -------------------------------------------------------------------------------- 1 | train: 2 | seed: 1234 3 | epochs: 20 4 | batch_size: 8 5 | save_every_n_epoch: 1 6 | precision: 16-mixed 7 | gradient_clip: 1.0 8 | optimizer: 9 | lr: 0.01 10 | lr_init: 0.00001 11 | lr_end: 0.0001 12 | warmup_steps: 2000 13 | decay_steps: 40000 14 | data: 15 | max_eval_sample: 8 16 | max_sec: 54 17 | num_workers: 4 18 | pad_val: 1024 # same with EOS in model 19 | model: 20 | vocab_size: 1025 21 | phoneme_vocab_size: 732 22 | embedding_dim: 512 23 | hidden_dim: 512 24 | head: 16 25 | linear_units: 2048 26 | n_layer: 24 27 | dropout: 0 28 | EOS: 1024 29 | random_bert: 0 30 | inference: 31 | top_k: 15 32 | -------------------------------------------------------------------------------- /src/GPT_SoVITS/configs/s1longer.yaml: -------------------------------------------------------------------------------- 1 | train: 2 | seed: 1234 3 | epochs: 20 4 | batch_size: 8 5 | save_every_n_epoch: 1 6 | precision: 16-mixed 7 | gradient_clip: 1.0 8 | optimizer: 
9 | lr: 0.01 10 | lr_init: 0.00001 11 | lr_end: 0.0001 12 | warmup_steps: 2000 13 | decay_steps: 40000 14 | data: 15 | max_eval_sample: 8 16 | max_sec: 54 17 | num_workers: 4 18 | pad_val: 1024 # same with EOS in model 19 | model: 20 | vocab_size: 1025 21 | phoneme_vocab_size: 512 22 | embedding_dim: 512 23 | hidden_dim: 512 24 | head: 16 25 | linear_units: 2048 26 | n_layer: 24 27 | dropout: 0 28 | EOS: 1024 29 | random_bert: 0 30 | inference: 31 | top_k: 5 32 | -------------------------------------------------------------------------------- /src/GPT_SoVITS/configs/s1mq.yaml: -------------------------------------------------------------------------------- 1 | train: 2 | seed: 1234 3 | epochs: 100 4 | batch_size: 6 5 | gradient_accumulation: 4 6 | save_every_n_epoch: 1 7 | precision: 32 8 | gradient_clip: 1.0 9 | optimizer: 10 | lr: 0.01 11 | lr_init: 0.00001 12 | lr_end: 0.0001 13 | warmup_steps: 2000 14 | decay_steps: 40000 15 | data: 16 | max_eval_sample: 8 17 | max_sec: 40 18 | num_workers: 1 19 | pad_val: 1024 # same with EOS in model 20 | model: 21 | saving_path: "ckpt/" 22 | resume_checkpoint: null 23 | vocoder_config_path: "quantizer/new_ckpt/config.json" 24 | vocoder_ckpt_path: "quantizer/new_ckpt/g_00600000" 25 | datadir: "/home/liweiche/GigaSpeech/wavs" 26 | metapath: "/home/liweiche/GigaSpeech/train2.json" 27 | val_metapath: "/home/liweiche/GigaSpeech/dev2.json" 28 | sampledir: "logs/" 29 | pretrained_path: null 30 | lr: 0.0001 31 | batch_size: 200.0 32 | train_bucket_size: 8192 33 | training_step: 800000 34 | optim_flat_percent: 0.0 35 | warmup_step: 50 36 | adam_beta1: 0.9 37 | adam_beta2: 0.98 38 | ffd_size: 3072 39 | hidden_size: 768 40 | enc_nlayers: 6 41 | dec_nlayers: 6 42 | nheads: 12 43 | ar_layer: 4 44 | ar_ffd_size: 1024 45 | ar_hidden_size: 256 46 | ar_nheads: 4 47 | aligner_softmax_temp: 1.0 48 | layer_norm_eps: 0.00001 49 | speaker_embed_dropout: 0.05 50 | label_smoothing: 0.0 51 | val_check_interval: 5000 52 | check_val_every_n_epoch: 1 53 | precision: "fp16" 54 | nworkers: 16 55 | distributed: true 56 | accelerator: "ddp" 57 | version: null 58 | accumulate_grad_batches: 1 59 | use_repetition_token: true 60 | use_repetition_gating: false 61 | repetition_penalty: 1.0 62 | sampling_temperature: 1.0 63 | top_k: -1 64 | min_top_k: 3 65 | top_p: 0.8 66 | sample_num: 4 67 | length_penalty_max_length: 15000 68 | length_penalty_max_prob: 0.95 69 | max_input_length: 2048 70 | max_output_length: 2000 71 | sample_rate: 16000 72 | n_codes: 1024 73 | n_cluster_groups: 1 74 | phone_context_window: 4 75 | phoneset_size: 1000 76 | inference: 77 | top_k: 5 78 | -------------------------------------------------------------------------------- /src/GPT_SoVITS/configs/s2.json: -------------------------------------------------------------------------------- 1 | { 2 | "train": { 3 | "log_interval": 100, 4 | "eval_interval": 500, 5 | "seed": 1234, 6 | "epochs": 100, 7 | "learning_rate": 0.0001, 8 | "betas": [ 9 | 0.8, 10 | 0.99 11 | ], 12 | "eps": 1e-09, 13 | "batch_size": 32, 14 | "fp16_run": true, 15 | "lr_decay": 0.999875, 16 | "segment_size": 20480, 17 | "init_lr_ratio": 1, 18 | "warmup_epochs": 0, 19 | "c_mel": 45, 20 | "c_kl": 1.0, 21 | "text_low_lr_rate": 0.4 22 | }, 23 | "data": { 24 | "max_wav_value": 32768.0, 25 | "sampling_rate": 32000, 26 | "filter_length": 2048, 27 | "hop_length": 640, 28 | "win_length": 2048, 29 | "n_mel_channels": 128, 30 | "mel_fmin": 0.0, 31 | "mel_fmax": null, 32 | "add_blank": true, 33 | "n_speakers": 300, 34 | "cleaned_text": true 35 | }, 36 | 
"model": { 37 | "inter_channels": 192, 38 | "hidden_channels": 192, 39 | "filter_channels": 768, 40 | "n_heads": 2, 41 | "n_layers": 6, 42 | "kernel_size": 3, 43 | "p_dropout": 0.1, 44 | "resblock": "1", 45 | "resblock_kernel_sizes": [ 46 | 3, 47 | 7, 48 | 11 49 | ], 50 | "resblock_dilation_sizes": [ 51 | [ 52 | 1, 53 | 3, 54 | 5 55 | ], 56 | [ 57 | 1, 58 | 3, 59 | 5 60 | ], 61 | [ 62 | 1, 63 | 3, 64 | 5 65 | ] 66 | ], 67 | "upsample_rates": [ 68 | 10, 69 | 8, 70 | 2, 71 | 2, 72 | 2 73 | ], 74 | "upsample_initial_channel": 512, 75 | "upsample_kernel_sizes": [ 76 | 16, 77 | 16, 78 | 8, 79 | 2, 80 | 2 81 | ], 82 | "n_layers_q": 3, 83 | "use_spectral_norm": false, 84 | "gin_channels": 512, 85 | "semantic_frame_rate": "25hz", 86 | "freeze_quantizer": true 87 | }, 88 | "s2_ckpt_dir": "logs/s2/big2k1", 89 | "content_module": "cnhubert" 90 | } -------------------------------------------------------------------------------- /src/GPT_SoVITS/configs/train.yaml: -------------------------------------------------------------------------------- 1 | gpu: 2 | n_card: 1 3 | n_process_per_card: 2 4 | io: 5 | text_path: D:\RVC1006\GPT-SoVITS\GPT_SoVITS 6 | save_every_n_epoch: 1 7 | precision: 16-mixed 8 | gradient_clip: 1.0 9 | optimizer: 10 | lr: 0.01 11 | lr_init: 0.00001 12 | lr_end: 0.0001 13 | warmup_steps: 2000 14 | decay_steps: 40000 15 | data: 16 | max_eval_sample: 8 17 | max_sec: 54 18 | num_workers: 1 19 | pad_val: 1024 # same with EOS in model 20 | model: 21 | vocab_size: 1025 22 | phoneme_vocab_size: 512 23 | embedding_dim: 512 24 | hidden_dim: 512 25 | head: 16 26 | linear_units: 2048 27 | n_layer: 24 28 | dropout: 0 29 | EOS: 1024 30 | random_bert: 0 31 | inference: 32 | top_k: 5 33 | -------------------------------------------------------------------------------- /src/GPT_SoVITS/configs/tts_infer.yaml: -------------------------------------------------------------------------------- 1 | custom: 2 | bert_base_path: src/GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large 3 | cnhuhbert_base_path: src/GPT_SoVITS/pretrained_models/chinese-hubert-base 4 | device: cuda 5 | is_half: true 6 | t2s_weights_path: src/GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s1bert25hz-5kh-longer-epoch=12-step=369668.ckpt 7 | version: v2 8 | vits_weights_path: src/GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s2G2333k.pth 9 | default: 10 | bert_base_path: src/GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large 11 | cnhuhbert_base_path: src/GPT_SoVITS/pretrained_models/chinese-hubert-base 12 | device: cpu 13 | is_half: false 14 | t2s_weights_path: src/GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt 15 | version: v1 16 | vits_weights_path: src/GPT_SoVITS/pretrained_models/s2G488k.pth 17 | default_v2: 18 | bert_base_path: src/GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large 19 | cnhuhbert_base_path: src/GPT_SoVITS/pretrained_models/chinese-hubert-base 20 | device: cpu 21 | is_half: false 22 | t2s_weights_path: src/GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s1bert25hz-5kh-longer-epoch=12-step=369668.ckpt 23 | version: v2 24 | vits_weights_path: src/GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s2G2333k.pth 25 | -------------------------------------------------------------------------------- /src/GPT_SoVITS/feature_extractor/__init__.py: -------------------------------------------------------------------------------- 1 | from . 
import cnhubert, whisper_enc 2 | 3 | content_module_map = { 4 | 'cnhubert': cnhubert, 5 | 'whisper': whisper_enc 6 | } -------------------------------------------------------------------------------- /src/GPT_SoVITS/feature_extractor/whisper_enc.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def get_model(): 5 | import whisper 6 | 7 | model = whisper.load_model("small", device="cpu") 8 | 9 | return model.encoder 10 | 11 | 12 | def get_content(model=None, wav_16k_tensor=None): 13 | from whisper import log_mel_spectrogram, pad_or_trim 14 | 15 | dev = next(model.parameters()).device 16 | mel = log_mel_spectrogram(wav_16k_tensor).to(dev)[:, :3000] 17 | # if torch.cuda.is_available(): 18 | # mel = mel.to(torch.float16) 19 | feature_len = mel.shape[-1] // 2 20 | assert mel.shape[-1] < 3000, "输入音频过长,只允许输入30以内音频" 21 | with torch.no_grad(): 22 | feature = model(pad_or_trim(mel, 3000).unsqueeze(0))[ 23 | :1, :feature_len, : 24 | ].transpose(1, 2) 25 | return feature 26 | -------------------------------------------------------------------------------- /src/GPT_SoVITS/module/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Henry-23/VideoChat/3d6700c5f2c4e49220501898f96049579b39345e/src/GPT_SoVITS/module/__init__.py -------------------------------------------------------------------------------- /src/GPT_SoVITS/module/losses.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | import torch 4 | from torch.nn import functional as F 5 | 6 | 7 | def feature_loss(fmap_r, fmap_g): 8 | loss = 0 9 | for dr, dg in zip(fmap_r, fmap_g): 10 | for rl, gl in zip(dr, dg): 11 | rl = rl.float().detach() 12 | gl = gl.float() 13 | loss += torch.mean(torch.abs(rl - gl)) 14 | 15 | return loss * 2 16 | 17 | 18 | def discriminator_loss(disc_real_outputs, disc_generated_outputs): 19 | loss = 0 20 | r_losses = [] 21 | g_losses = [] 22 | for dr, dg in zip(disc_real_outputs, disc_generated_outputs): 23 | dr = dr.float() 24 | dg = dg.float() 25 | r_loss = torch.mean((1 - dr) ** 2) 26 | g_loss = torch.mean(dg**2) 27 | loss += r_loss + g_loss 28 | r_losses.append(r_loss.item()) 29 | g_losses.append(g_loss.item()) 30 | 31 | return loss, r_losses, g_losses 32 | 33 | 34 | def generator_loss(disc_outputs): 35 | loss = 0 36 | gen_losses = [] 37 | for dg in disc_outputs: 38 | dg = dg.float() 39 | l = torch.mean((1 - dg) ** 2) 40 | gen_losses.append(l) 41 | loss += l 42 | 43 | return loss, gen_losses 44 | 45 | 46 | def kl_loss(z_p, logs_q, m_p, logs_p, z_mask): 47 | """ 48 | z_p, logs_q: [b, h, t_t] 49 | m_p, logs_p: [b, h, t_t] 50 | """ 51 | z_p = z_p.float() 52 | logs_q = logs_q.float() 53 | m_p = m_p.float() 54 | logs_p = logs_p.float() 55 | z_mask = z_mask.float() 56 | 57 | kl = logs_p - logs_q - 0.5 58 | kl += 0.5 * ((z_p - m_p) ** 2) * torch.exp(-2.0 * logs_p) 59 | kl = torch.sum(kl * z_mask) 60 | l = kl / torch.sum(z_mask) 61 | return l 62 | 63 | 64 | def mle_loss(z, m, logs, logdet, mask): 65 | l = torch.sum(logs) + 0.5 * torch.sum( 66 | torch.exp(-2 * logs) * ((z - m) ** 2) 67 | ) # neg normal likelihood w/o the constant term 68 | l = l - torch.sum(logdet) # log jacobian determinant 69 | l = l / torch.sum( 70 | torch.ones_like(z) * mask 71 | ) # averaging across batch, channel and time axes 72 | l = l + 0.5 * math.log(2 * math.pi) # add the remaining constant term 73 | return l 74 | 
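# --------------------------------------------------------------------------
# Editor's note: the block below is an added illustrative sketch, NOT part of
# the original losses.py. feature_loss/discriminator_loss/generator_loss are
# the usual VITS-style GAN terms and kl_loss is the prior/posterior KL term;
# the sketch only exercises the documented [b, h, t_t] shapes of kl_loss with
# arbitrary, assumed sizes.
# --------------------------------------------------------------------------
if __name__ == "__main__":
    b, h, t = 2, 192, 50                        # hypothetical batch / channels / frames
    z_p, logs_q, m_p, logs_p = [torch.randn(b, h, t) for _ in range(4)]
    z_mask = torch.ones(b, 1, t)                # mask broadcasts over the channel axis
    print(kl_loss(z_p, logs_q, m_p, logs_p, z_mask))   # prints a scalar tensor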
-------------------------------------------------------------------------------- /src/GPT_SoVITS/text/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | # if os.environ.get("version","v1")=="v1": 3 | # from text.symbols import symbols 4 | # else: 5 | # from text.symbols2 import symbols 6 | from src.GPT_SoVITS.text import symbols as symbols_v1 7 | from src.GPT_SoVITS.text import symbols2 as symbols_v2 8 | 9 | _symbol_to_id_v1 = {s: i for i, s in enumerate(symbols_v1.symbols)} 10 | _symbol_to_id_v2 = {s: i for i, s in enumerate(symbols_v2.symbols)} 11 | 12 | def cleaned_text_to_sequence(cleaned_text, version=None): 13 | '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text. 14 | Args: 15 | text: string to convert to a sequence 16 | Returns: 17 | List of integers corresponding to the symbols in the text 18 | ''' 19 | if version is None:version=os.environ.get('version', 'v2') 20 | if version == "v1": 21 | phones = [_symbol_to_id_v1[symbol] for symbol in cleaned_text] 22 | else: 23 | phones = [_symbol_to_id_v2[symbol] for symbol in cleaned_text] 24 | 25 | return phones 26 | 27 | -------------------------------------------------------------------------------- /src/GPT_SoVITS/text/cleaner.py: -------------------------------------------------------------------------------- 1 | from src.GPT_SoVITS.text import cleaned_text_to_sequence 2 | import os 3 | # if os.environ.get("version","v1")=="v1": 4 | # from text import chinese 5 | # from text.symbols import symbols 6 | # else: 7 | # from text import chinese2 as chinese 8 | # from text.symbols2 import symbols 9 | 10 | from src.GPT_SoVITS.text import symbols as symbols_v1 11 | from src.GPT_SoVITS.text import symbols2 as symbols_v2 12 | 13 | special = [ 14 | # ("%", "zh", "SP"), 15 | ("¥", "zh", "SP2"), 16 | ("^", "zh", "SP3"), 17 | # ('@', 'zh', "SP4")#不搞鬼畜了,和第二版保持一致吧 18 | ] 19 | 20 | 21 | def clean_text(text, language, version=None): 22 | if version is None:version=os.environ.get('version', 'v2') 23 | if version == "v1": 24 | symbols = symbols_v1.symbols 25 | language_module_map = {"zh": "chinese", "ja": "japanese", "en": "english"} 26 | else: 27 | symbols = symbols_v2.symbols 28 | language_module_map = {"zh": "chinese2", "ja": "japanese", "en": "english", "ko": "korean","yue":"cantonese"} 29 | 30 | if(language not in language_module_map): 31 | language="en" 32 | text=" " 33 | for special_s, special_l, target_symbol in special: 34 | if special_s in text and language == special_l: 35 | return clean_special(text, language, special_s, target_symbol, version) 36 | language_module = __import__("src.GPT_SoVITS.text."+language_module_map[language],fromlist=[language_module_map[language]]) 37 | if hasattr(language_module,"text_normalize"): 38 | norm_text = language_module.text_normalize(text) 39 | else: 40 | norm_text=text 41 | if language == "zh" or language=="yue":########## 42 | phones, word2ph = language_module.g2p(norm_text) 43 | assert len(phones) == sum(word2ph) 44 | assert len(norm_text) == len(word2ph) 45 | elif language == "en": 46 | phones = language_module.g2p(norm_text) 47 | if len(phones) < 4: 48 | phones = [','] + phones 49 | word2ph = None 50 | else: 51 | phones = language_module.g2p(norm_text) 52 | word2ph = None 53 | phones = ['UNK' if ph not in symbols else ph for ph in phones] 54 | return phones, word2ph, norm_text 55 | 56 | 57 | def clean_special(text, language, special_s, target_symbol, version=None): 58 | if version is 
None:version=os.environ.get('version', 'v2') 59 | if version == "v1": 60 | symbols = symbols_v1.symbols 61 | language_module_map = {"zh": "chinese", "ja": "japanese", "en": "english"} 62 | else: 63 | symbols = symbols_v2.symbols 64 | language_module_map = {"zh": "chinese2", "ja": "japanese", "en": "english", "ko": "korean","yue":"cantonese"} 65 | 66 | """ 67 | 特殊静音段sp符号处理 68 | """ 69 | text = text.replace(special_s, ",") 70 | language_module = __import__("src.GPT_SoVITS.text."+language_module_map[language],fromlist=[language_module_map[language]]) 71 | norm_text = language_module.text_normalize(text) 72 | phones = language_module.g2p(norm_text) 73 | new_ph = [] 74 | for ph in phones[0]: 75 | assert ph in symbols 76 | if ph == ",": 77 | new_ph.append(target_symbol) 78 | else: 79 | new_ph.append(ph) 80 | return new_ph, phones[1], norm_text 81 | 82 | 83 | def text_to_sequence(text, language, version=None): 84 | version = os.environ.get('version',version) 85 | if version is None:version='v2' 86 | phones = clean_text(text) 87 | return cleaned_text_to_sequence(phones, version) 88 | 89 | 90 | if __name__ == "__main__": 91 | print(clean_text("你好%啊啊啊额、还是到付红四方。", "zh")) 92 | -------------------------------------------------------------------------------- /src/GPT_SoVITS/text/engdict-hot.rep: -------------------------------------------------------------------------------- 1 | CHATGPT CH AE1 T JH IY1 P IY1 T IY1 2 | JSON JH EY1 S AH0 N 3 | CONDA K AA1 N D AH0 -------------------------------------------------------------------------------- /src/GPT_SoVITS/text/engdict_cache.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Henry-23/VideoChat/3d6700c5f2c4e49220501898f96049579b39345e/src/GPT_SoVITS/text/engdict_cache.pickle -------------------------------------------------------------------------------- /src/GPT_SoVITS/text/g2pw/__init__.py: -------------------------------------------------------------------------------- 1 | from src.GPT_SoVITS.text.g2pw.g2pw import * -------------------------------------------------------------------------------- /src/GPT_SoVITS/text/g2pw/polyphonic.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Henry-23/VideoChat/3d6700c5f2c4e49220501898f96049579b39345e/src/GPT_SoVITS/text/g2pw/polyphonic.pickle -------------------------------------------------------------------------------- /src/GPT_SoVITS/text/g2pw/polyphonic.rep: -------------------------------------------------------------------------------- 1 | 湖泊: ['hu2','po1'] 2 | 地壳: ['di4','qiao4'] 3 | 柏树: ['bai3','shu4'] 4 | 曝光: ['bao4','guang1'] 5 | 弹力: ['tan2','li4'] 6 | 字帖: ['zi4','tie4'] 7 | 口吃: ['kou3','chi1'] 8 | 包扎: ['bao1','za1'] 9 | 哪吒: ['ne2','zha1'] 10 | 说服: ['shuo1','fu2'] 11 | 识字: ['shi2','zi4'] 12 | 骨头: ['gu3','tou5'] 13 | 对称: ['dui4','chen4'] 14 | 口供: ['kou3','gong4'] 15 | 抹布: ['ma1','bu4'] 16 | 露背: ['lu4','bei4'] 17 | 圈养: ['juan4', 'yang3'] 18 | 眼眶: ['yan3', 'kuang4'] 19 | 品行: ['pin3','xing2'] 20 | 颤抖: ['chan4','dou3'] 21 | 差不多: ['cha4','bu5','duo1'] 22 | 鸭绿江: ['ya1','lu4','jiang1'] 23 | 撒切尔: ['sa4','qie4','er3'] 24 | 比比皆是: ['bi3','bi3','jie1','shi4'] 25 | 身无长物: ['shen1','wu2','chang2','wu4'] 26 | 手里: ['shou2','li3'] 27 | 关卡: ['guan1','qia3'] 28 | 怀揣: ['huai2','chuai1'] 29 | 挑剔: ['tiao1','ti4'] 30 | 供称: ['gong4','cheng1'] 31 | 作坊: ['zuo1', 'fang5'] 32 | 中医: ['zhong1','yi1'] 33 | 嚷嚷: ['rang1','rang5'] 34 | 商厦: ['shang1','sha4'] 35 | 大厦: ['da4','sha4'] 36 | 刹车: 
['sha1','che1'] 37 | 嘚瑟: ['de4','se5'] 38 | 朝鲜: ['chao2','xian3'] 39 | 阿房宫: ['e1','pang2','gong1'] 40 | 阿胶: ['e1','jiao1'] 41 | 咖喱: ['ga1','li5'] 42 | 时分: ['shi2','fen1'] 43 | 蚌埠: ['beng4','bu4'] 44 | 驯服: ['xun4','fu2'] 45 | 幸免于难: ['xing4','mian3','yu2','nan4'] 46 | 恶行: ['e4','xing2'] 47 | 唉: ['ai4'] 48 | 扎实: ['zha1','shi2'] 49 | 干将: ['gan4','jiang4'] 50 | 陈威行: ['chen2', 'wei1', 'hang2'] 51 | 郭晟: ['guo1', 'sheng4'] 52 | 中标: ['zhong4', 'biao1'] 53 | 抗住: ['kang2', 'zhu4'] -------------------------------------------------------------------------------- /src/GPT_SoVITS/text/ja_userdic/userdict.csv: -------------------------------------------------------------------------------- 1 | 主殿,*,*,-32767,名詞,固有名詞,一般,*,*,*,アルジドノ,アルジドノ,アルジドノ,3/5,* -------------------------------------------------------------------------------- /src/GPT_SoVITS/text/namedict_cache.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Henry-23/VideoChat/3d6700c5f2c4e49220501898f96049579b39345e/src/GPT_SoVITS/text/namedict_cache.pickle -------------------------------------------------------------------------------- /src/GPT_SoVITS/text/zh_normalization/README.md: -------------------------------------------------------------------------------- 1 | ## Supported NSW (Non-Standard-Word) Normalization 2 | 3 | |NSW type|raw|normalized| 4 | |:--|:-|:-| 5 | |serial number|电影中梁朝伟扮演的陈永仁的编号27149|电影中梁朝伟扮演的陈永仁的编号二七一四九| 6 | |cardinal|这块黄金重达324.75克
我们班的最高总分为583分|这块黄金重达三百二十四点七五克<br>我们班的最高总分为五百八十三分| 7 | |numeric range |12\~23<br>-1.5\~2|十二到二十三<br>负一点五到二| 8 | |date|她出生于86年8月18日,她弟弟出生于1995年3月1日|她出生于八六年八月十八日, 她弟弟出生于一九九五年三月一日| 9 | |time|等会请在12:05请通知我|等会请在十二点零五分请通知我 10 | |temperature|今天的最低气温达到-10°C|今天的最低气温达到零下十度 11 | |fraction|现场有7/12的观众投出了赞成票|现场有十二分之七的观众投出了赞成票| 12 | |percentage|明天有62%的概率降雨|明天有百分之六十二的概率降雨| 13 | |money|随便来几个价格12块5,34.5元,20.1万|随便来几个价格十二块五,三十四点五元,二十点一万| 14 | |telephone|这是固话0421-33441122<br>这是手机+86 18544139121|这是固话零四二一三三四四一一二二<br>这是手机八六一八五四四一三九一二一| 15 | ## References 16 | [Pull requests #658 of DeepSpeech](https://github.com/PaddlePaddle/DeepSpeech/pull/658/files) 17 | 
-------------------------------------------------------------------------------- /src/GPT_SoVITS/text/zh_normalization/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | from src.GPT_SoVITS.text.zh_normalization.text_normlization import * 15 | -------------------------------------------------------------------------------- /src/GPT_SoVITS/text/zh_normalization/constants.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
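# Editor's note (added comment, not in the original file): this constants
# module provides full-width/half-width translation maps (F2H_* / H2F_* for
# ASCII letters, digits, punctuation and the space character) and RE_NSW, a
# regex that matches runs of characters without a pinyin reading; the text
# normalizer uses RE_NSW to locate non-standard words (numbers, dates, phone
# numbers, etc.) that need to be verbalized.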
14 | import re 15 | import string 16 | 17 | from pypinyin.constants import SUPPORT_UCS4 18 | 19 | # 全角半角转换 20 | # 英文字符全角 -> 半角映射表 (num: 52) 21 | F2H_ASCII_LETTERS = { 22 | ord(char) + 65248: ord(char) 23 | for char in string.ascii_letters 24 | } 25 | 26 | # 英文字符半角 -> 全角映射表 27 | H2F_ASCII_LETTERS = {value: key for key, value in F2H_ASCII_LETTERS.items()} 28 | 29 | # 数字字符全角 -> 半角映射表 (num: 10) 30 | F2H_DIGITS = {ord(char) + 65248: ord(char) for char in string.digits} 31 | # 数字字符半角 -> 全角映射表 32 | H2F_DIGITS = {value: key for key, value in F2H_DIGITS.items()} 33 | 34 | # 标点符号全角 -> 半角映射表 (num: 32) 35 | F2H_PUNCTUATIONS = {ord(char) + 65248: ord(char) for char in string.punctuation} 36 | # 标点符号半角 -> 全角映射表 37 | H2F_PUNCTUATIONS = {value: key for key, value in F2H_PUNCTUATIONS.items()} 38 | 39 | # 空格 (num: 1) 40 | F2H_SPACE = {'\u3000': ' '} 41 | H2F_SPACE = {' ': '\u3000'} 42 | 43 | # 非"有拼音的汉字"的字符串,可用于NSW提取 44 | if SUPPORT_UCS4: 45 | RE_NSW = re.compile(r'(?:[^' 46 | r'\u3007' # 〇 47 | r'\u3400-\u4dbf' # CJK扩展A:[3400-4DBF] 48 | r'\u4e00-\u9fff' # CJK基本:[4E00-9FFF] 49 | r'\uf900-\ufaff' # CJK兼容:[F900-FAFF] 50 | r'\U00020000-\U0002A6DF' # CJK扩展B:[20000-2A6DF] 51 | r'\U0002A703-\U0002B73F' # CJK扩展C:[2A700-2B73F] 52 | r'\U0002B740-\U0002B81D' # CJK扩展D:[2B740-2B81D] 53 | r'\U0002F80A-\U0002FA1F' # CJK兼容扩展:[2F800-2FA1F] 54 | r'])+') 55 | else: 56 | RE_NSW = re.compile( # pragma: no cover 57 | r'(?:[^' 58 | r'\u3007' # 〇 59 | r'\u3400-\u4dbf' # CJK扩展A:[3400-4DBF] 60 | r'\u4e00-\u9fff' # CJK基本:[4E00-9FFF] 61 | r'\uf900-\ufaff' # CJK兼容:[F900-FAFF] 62 | r'])+') 63 | -------------------------------------------------------------------------------- /src/GPT_SoVITS/text/zh_normalization/phonecode.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | import re 15 | 16 | from .num import verbalize_digit 17 | 18 | # 规范化固话/手机号码 19 | # 手机 20 | # http://www.jihaoba.com/news/show/13680 21 | # 移动:139、138、137、136、135、134、159、158、157、150、151、152、188、187、182、183、184、178、198 22 | # 联通:130、131、132、156、155、186、185、176 23 | # 电信:133、153、189、180、181、177 24 | RE_MOBILE_PHONE = re.compile( 25 | r"(? 
str: 34 | if mobile: 35 | sp_parts = phone_string.strip('+').split() 36 | result = ','.join( 37 | [verbalize_digit(part, alt_one=True) for part in sp_parts]) 38 | return result 39 | else: 40 | sil_parts = phone_string.split('-') 41 | result = ','.join( 42 | [verbalize_digit(part, alt_one=True) for part in sil_parts]) 43 | return result 44 | 45 | 46 | def replace_phone(match) -> str: 47 | """ 48 | Args: 49 | match (re.Match) 50 | Returns: 51 | str 52 | """ 53 | return phone2str(match.group(0), mobile=False) 54 | 55 | 56 | def replace_mobile(match) -> str: 57 | """ 58 | Args: 59 | match (re.Match) 60 | Returns: 61 | str 62 | """ 63 | return phone2str(match.group(0)) 64 | -------------------------------------------------------------------------------- /src/GPT_SoVITS/text/zh_normalization/quantifier.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | import re 15 | 16 | from .num import num2str 17 | 18 | # 温度表达式,温度会影响负号的读法 19 | # -3°C 零下三度 20 | RE_TEMPERATURE = re.compile(r'(-?)(\d+(\.\d+)?)(°C|℃|度|摄氏度)') 21 | measure_dict = { 22 | "cm2": "平方厘米", 23 | "cm²": "平方厘米", 24 | "cm3": "立方厘米", 25 | "cm³": "立方厘米", 26 | "cm": "厘米", 27 | "db": "分贝", 28 | "ds": "毫秒", 29 | "kg": "千克", 30 | "km": "千米", 31 | "m2": "平方米", 32 | "m²": "平方米", 33 | "m³": "立方米", 34 | "m3": "立方米", 35 | "ml": "毫升", 36 | "m": "米", 37 | "mm": "毫米", 38 | "s": "秒" 39 | } 40 | 41 | 42 | def replace_temperature(match) -> str: 43 | """ 44 | Args: 45 | match (re.Match) 46 | Returns: 47 | str 48 | """ 49 | sign = match.group(1) 50 | temperature = match.group(2) 51 | unit = match.group(3) 52 | sign: str = "零下" if sign else "" 53 | temperature: str = num2str(temperature) 54 | unit: str = "摄氏度" if unit == "摄氏度" else "度" 55 | result = f"{sign}{temperature}{unit}" 56 | return result 57 | 58 | 59 | def replace_measure(sentence) -> str: 60 | for q_notation in measure_dict: 61 | if q_notation in sentence: 62 | sentence = sentence.replace(q_notation, measure_dict[q_notation]) 63 | return sentence 64 | -------------------------------------------------------------------------------- /src/GPT_SoVITS/tools/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Henry-23/VideoChat/3d6700c5f2c4e49220501898f96049579b39345e/src/GPT_SoVITS/tools/__init__.py -------------------------------------------------------------------------------- /src/GPT_SoVITS/tools/asr/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Henry-23/VideoChat/3d6700c5f2c4e49220501898f96049579b39345e/src/GPT_SoVITS/tools/asr/__init__.py -------------------------------------------------------------------------------- /src/GPT_SoVITS/tools/asr/config.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | 
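# Editor's note (added comment, not in the original file): check_fw_local_models()
# below looks for already-downloaded Faster Whisper checkpoints under
# tools/asr/models/ and tags those sizes with a "-local" suffix so a caller can
# pick the local copy instead of downloading again. asr_dict then maps a
# human-readable tool name to its supported languages, model sizes, launcher
# script and precisions; a typical (assumed) lookup would be
# asr_dict["Faster Whisper (多语种)"]["path"], which yields "fasterwhisper_asr.py".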
def check_fw_local_models(): 4 | ''' 5 | 启动时检查本地是否有 Faster Whisper 模型. 6 | ''' 7 | model_size_list = [ 8 | "tiny", "tiny.en", 9 | "base", "base.en", 10 | "small", "small.en", 11 | "medium", "medium.en", 12 | "large", "large-v1", 13 | "large-v2", "large-v3"] 14 | for i, size in enumerate(model_size_list): 15 | if os.path.exists(f'tools/asr/models/faster-whisper-{size}'): 16 | model_size_list[i] = size + '-local' 17 | return model_size_list 18 | 19 | asr_dict = { 20 | "达摩 ASR (中文)": { 21 | 'lang': ['zh','yue'], 22 | 'size': ['large'], 23 | 'path': 'funasr_asr.py', 24 | 'precision': ['float32'] 25 | }, 26 | "Faster Whisper (多语种)": { 27 | 'lang': ['auto', 'zh', 'en', 'ja', 'ko', 'yue'], 28 | 'size': check_fw_local_models(), 29 | 'path': 'fasterwhisper_asr.py', 30 | 'precision': ['float32', 'float16', 'int8'] 31 | }, 32 | } 33 | 34 | -------------------------------------------------------------------------------- /src/GPT_SoVITS/tools/asr/models/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | !.gitignore -------------------------------------------------------------------------------- /src/GPT_SoVITS/tools/cmd-denoise.py: -------------------------------------------------------------------------------- 1 | import os,argparse 2 | import traceback 3 | 4 | from modelscope.pipelines import pipeline 5 | from modelscope.utils.constant import Tasks 6 | from tqdm import tqdm 7 | 8 | path_denoise = 'tools/denoise-model/speech_frcrn_ans_cirm_16k' 9 | path_denoise = path_denoise if os.path.exists(path_denoise) else "damo/speech_frcrn_ans_cirm_16k" 10 | ans = pipeline(Tasks.acoustic_noise_suppression,model=path_denoise) 11 | def execute_denoise(input_folder,output_folder): 12 | os.makedirs(output_folder,exist_ok=True) 13 | # print(input_folder) 14 | # print(list(os.listdir(input_folder).sort())) 15 | for name in tqdm(os.listdir(input_folder)): 16 | try: 17 | ans("%s/%s"%(input_folder,name),output_path='%s/%s'%(output_folder,name)) 18 | except: 19 | traceback.print_exc() 20 | 21 | if __name__ == '__main__': 22 | parser = argparse.ArgumentParser() 23 | parser.add_argument("-i", "--input_folder", type=str, required=True, 24 | help="Path to the folder containing WAV files.") 25 | parser.add_argument("-o", "--output_folder", type=str, required=True, 26 | help="Output folder to store transcriptions.") 27 | parser.add_argument("-p", "--precision", type=str, default='float16', choices=['float16','float32'], 28 | help="fp16 or fp32")#还没接入 29 | cmd = parser.parse_args() 30 | execute_denoise( 31 | input_folder = cmd.input_folder, 32 | output_folder = cmd.output_folder, 33 | ) -------------------------------------------------------------------------------- /src/GPT_SoVITS/tools/denoise-model/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | !.gitignore 3 | -------------------------------------------------------------------------------- /src/GPT_SoVITS/tools/i18n/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Henry-23/VideoChat/3d6700c5f2c4e49220501898f96049579b39345e/src/GPT_SoVITS/tools/i18n/__init__.py -------------------------------------------------------------------------------- /src/GPT_SoVITS/tools/i18n/i18n.py: -------------------------------------------------------------------------------- 1 | import json 2 | import locale 3 | import os 4 | 5 | I18N_JSON_DIR : os.PathLike = 
os.path.join(os.path.dirname(os.path.relpath(__file__)), 'locale') 6 | 7 | def load_language_list(language): 8 | with open(os.path.join(I18N_JSON_DIR, f"{language}.json"), "r", encoding="utf-8") as f: 9 | language_list = json.load(f) 10 | return language_list 11 | 12 | def scan_language_list(): 13 | language_list = [] 14 | for name in os.listdir(I18N_JSON_DIR): 15 | if name.endswith(".json"):language_list.append(name.split('.')[0]) 16 | return language_list 17 | 18 | class I18nAuto: 19 | def __init__(self, language=None): 20 | if language in ["Auto", None]: 21 | language = locale.getdefaultlocale()[0] 22 | # getlocale can't identify the system's language ((None, None)) 23 | if not os.path.exists(os.path.join(I18N_JSON_DIR, f"{language}.json")): 24 | language = "en_US" 25 | self.language = language 26 | self.language_map = load_language_list(language) 27 | 28 | def __call__(self, key): 29 | return self.language_map.get(key, key) 30 | 31 | def __repr__(self): 32 | return "Use Language: " + self.language 33 | 34 | if __name__ == "__main__": 35 | i18n = I18nAuto(language='en_US') 36 | print(i18n) -------------------------------------------------------------------------------- /src/GPT_SoVITS/tools/slice_audio.py: -------------------------------------------------------------------------------- 1 | import os,sys,numpy as np 2 | import traceback 3 | from scipy.io import wavfile 4 | # parent_directory = os.path.dirname(os.path.abspath(__file__)) 5 | # sys.path.append(parent_directory) 6 | from src.GPT_SoVITS.tools.my_utils import load_audio 7 | from slicer2 import Slicer 8 | 9 | def slice(inp,opt_root,threshold,min_length,min_interval,hop_size,max_sil_kept,_max,alpha,i_part,all_part): 10 | os.makedirs(opt_root,exist_ok=True) 11 | if os.path.isfile(inp): 12 | input=[inp] 13 | elif os.path.isdir(inp): 14 | input=[os.path.join(inp, name) for name in sorted(list(os.listdir(inp)))] 15 | else: 16 | return "输入路径存在但既不是文件也不是文件夹" 17 | slicer = Slicer( 18 | sr=32000, # 长音频采样率 19 | threshold= int(threshold), # 音量小于这个值视作静音的备选切割点 20 | min_length= int(min_length), # 每段最小多长,如果第一段太短一直和后面段连起来直到超过这个值 21 | min_interval= int(min_interval), # 最短切割间隔 22 | hop_size= int(hop_size), # 怎么算音量曲线,越小精度越大计算量越高(不是精度越大效果越好) 23 | max_sil_kept= int(max_sil_kept), # 切完后静音最多留多长 24 | ) 25 | _max=float(_max) 26 | alpha=float(alpha) 27 | for inp_path in input[int(i_part)::int(all_part)]: 28 | # print(inp_path) 29 | try: 30 | name = os.path.basename(inp_path) 31 | audio = load_audio(inp_path, 32000) 32 | # print(audio.shape) 33 | for chunk, start, end in slicer.slice(audio): # start和end是帧数 34 | tmp_max = np.abs(chunk).max() 35 | if(tmp_max>1):chunk/=tmp_max 36 | chunk = (chunk / tmp_max * (_max * alpha)) + (1 - alpha) * chunk 37 | wavfile.write( 38 | "%s/%s_%010d_%010d.wav" % (opt_root, name, start, end), 39 | 32000, 40 | # chunk.astype(np.float32), 41 | (chunk * 32767).astype(np.int16), 42 | ) 43 | except: 44 | print(inp_path,"->fail->",traceback.format_exc()) 45 | return "执行完毕,请检查输出文件" 46 | 47 | print(slice(*sys.argv[1:])) 48 | 49 | -------------------------------------------------------------------------------- /src/GPT_SoVITS/tools/uvr5/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Henry-23/VideoChat/3d6700c5f2c4e49220501898f96049579b39345e/src/GPT_SoVITS/tools/uvr5/__init__.py -------------------------------------------------------------------------------- /src/GPT_SoVITS/tools/uvr5/bs_roformer/__init__.py: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/Henry-23/VideoChat/3d6700c5f2c4e49220501898f96049579b39345e/src/GPT_SoVITS/tools/uvr5/bs_roformer/__init__.py -------------------------------------------------------------------------------- /src/GPT_SoVITS/tools/uvr5/lib/lib_v5/model_param_init.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import pathlib 4 | 5 | default_param = {} 6 | default_param["bins"] = 768 7 | default_param["unstable_bins"] = 9 # training only 8 | default_param["reduction_bins"] = 762 # training only 9 | default_param["sr"] = 44100 10 | default_param["pre_filter_start"] = 757 11 | default_param["pre_filter_stop"] = 768 12 | default_param["band"] = {} 13 | 14 | 15 | default_param["band"][1] = { 16 | "sr": 11025, 17 | "hl": 128, 18 | "n_fft": 960, 19 | "crop_start": 0, 20 | "crop_stop": 245, 21 | "lpf_start": 61, # inference only 22 | "res_type": "polyphase", 23 | } 24 | 25 | default_param["band"][2] = { 26 | "sr": 44100, 27 | "hl": 512, 28 | "n_fft": 1536, 29 | "crop_start": 24, 30 | "crop_stop": 547, 31 | "hpf_start": 81, # inference only 32 | "res_type": "sinc_best", 33 | } 34 | 35 | 36 | def int_keys(d): 37 | r = {} 38 | for k, v in d: 39 | if k.isdigit(): 40 | k = int(k) 41 | r[k] = v 42 | return r 43 | 44 | 45 | class ModelParameters(object): 46 | def __init__(self, config_path=""): 47 | if ".pth" == pathlib.Path(config_path).suffix: 48 | import zipfile 49 | 50 | with zipfile.ZipFile(config_path, "r") as zip: 51 | self.param = json.loads( 52 | zip.read("param.json"), object_pairs_hook=int_keys 53 | ) 54 | elif ".json" == pathlib.Path(config_path).suffix: 55 | with open(config_path, "r") as f: 56 | self.param = json.loads(f.read(), object_pairs_hook=int_keys) 57 | else: 58 | self.param = default_param 59 | 60 | for k in [ 61 | "mid_side", 62 | "mid_side_b", 63 | "mid_side_b2", 64 | "stereo_w", 65 | "stereo_n", 66 | "reverse", 67 | ]: 68 | if not k in self.param: 69 | self.param[k] = False 70 | -------------------------------------------------------------------------------- /src/GPT_SoVITS/tools/uvr5/lib/lib_v5/modelparams/1band_sr16000_hl512.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 1024, 3 | "unstable_bins": 0, 4 | "reduction_bins": 0, 5 | "band": { 6 | "1": { 7 | "sr": 16000, 8 | "hl": 512, 9 | "n_fft": 2048, 10 | "crop_start": 0, 11 | "crop_stop": 1024, 12 | "hpf_start": -1, 13 | "res_type": "sinc_best" 14 | } 15 | }, 16 | "sr": 16000, 17 | "pre_filter_start": 1023, 18 | "pre_filter_stop": 1024 19 | } -------------------------------------------------------------------------------- /src/GPT_SoVITS/tools/uvr5/lib/lib_v5/modelparams/1band_sr32000_hl512.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 1024, 3 | "unstable_bins": 0, 4 | "reduction_bins": 0, 5 | "band": { 6 | "1": { 7 | "sr": 32000, 8 | "hl": 512, 9 | "n_fft": 2048, 10 | "crop_start": 0, 11 | "crop_stop": 1024, 12 | "hpf_start": -1, 13 | "res_type": "kaiser_fast" 14 | } 15 | }, 16 | "sr": 32000, 17 | "pre_filter_start": 1000, 18 | "pre_filter_stop": 1021 19 | } -------------------------------------------------------------------------------- /src/GPT_SoVITS/tools/uvr5/lib/lib_v5/modelparams/1band_sr33075_hl384.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 1024, 3 | "unstable_bins": 0, 4 | 
"reduction_bins": 0, 5 | "band": { 6 | "1": { 7 | "sr": 33075, 8 | "hl": 384, 9 | "n_fft": 2048, 10 | "crop_start": 0, 11 | "crop_stop": 1024, 12 | "hpf_start": -1, 13 | "res_type": "sinc_best" 14 | } 15 | }, 16 | "sr": 33075, 17 | "pre_filter_start": 1000, 18 | "pre_filter_stop": 1021 19 | } -------------------------------------------------------------------------------- /src/GPT_SoVITS/tools/uvr5/lib/lib_v5/modelparams/1band_sr44100_hl1024.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 1024, 3 | "unstable_bins": 0, 4 | "reduction_bins": 0, 5 | "band": { 6 | "1": { 7 | "sr": 44100, 8 | "hl": 1024, 9 | "n_fft": 2048, 10 | "crop_start": 0, 11 | "crop_stop": 1024, 12 | "hpf_start": -1, 13 | "res_type": "sinc_best" 14 | } 15 | }, 16 | "sr": 44100, 17 | "pre_filter_start": 1023, 18 | "pre_filter_stop": 1024 19 | } -------------------------------------------------------------------------------- /src/GPT_SoVITS/tools/uvr5/lib/lib_v5/modelparams/1band_sr44100_hl256.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 256, 3 | "unstable_bins": 0, 4 | "reduction_bins": 0, 5 | "band": { 6 | "1": { 7 | "sr": 44100, 8 | "hl": 256, 9 | "n_fft": 512, 10 | "crop_start": 0, 11 | "crop_stop": 256, 12 | "hpf_start": -1, 13 | "res_type": "sinc_best" 14 | } 15 | }, 16 | "sr": 44100, 17 | "pre_filter_start": 256, 18 | "pre_filter_stop": 256 19 | } -------------------------------------------------------------------------------- /src/GPT_SoVITS/tools/uvr5/lib/lib_v5/modelparams/1band_sr44100_hl512.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 1024, 3 | "unstable_bins": 0, 4 | "reduction_bins": 0, 5 | "band": { 6 | "1": { 7 | "sr": 44100, 8 | "hl": 512, 9 | "n_fft": 2048, 10 | "crop_start": 0, 11 | "crop_stop": 1024, 12 | "hpf_start": -1, 13 | "res_type": "sinc_best" 14 | } 15 | }, 16 | "sr": 44100, 17 | "pre_filter_start": 1023, 18 | "pre_filter_stop": 1024 19 | } -------------------------------------------------------------------------------- /src/GPT_SoVITS/tools/uvr5/lib/lib_v5/modelparams/1band_sr44100_hl512_cut.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 1024, 3 | "unstable_bins": 0, 4 | "reduction_bins": 0, 5 | "band": { 6 | "1": { 7 | "sr": 44100, 8 | "hl": 512, 9 | "n_fft": 2048, 10 | "crop_start": 0, 11 | "crop_stop": 700, 12 | "hpf_start": -1, 13 | "res_type": "sinc_best" 14 | } 15 | }, 16 | "sr": 44100, 17 | "pre_filter_start": 1023, 18 | "pre_filter_stop": 700 19 | } -------------------------------------------------------------------------------- /src/GPT_SoVITS/tools/uvr5/lib/lib_v5/modelparams/2band_32000.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 768, 3 | "unstable_bins": 7, 4 | "reduction_bins": 705, 5 | "band": { 6 | "1": { 7 | "sr": 6000, 8 | "hl": 66, 9 | "n_fft": 512, 10 | "crop_start": 0, 11 | "crop_stop": 240, 12 | "lpf_start": 60, 13 | "lpf_stop": 118, 14 | "res_type": "sinc_fastest" 15 | }, 16 | "2": { 17 | "sr": 32000, 18 | "hl": 352, 19 | "n_fft": 1024, 20 | "crop_start": 22, 21 | "crop_stop": 505, 22 | "hpf_start": 44, 23 | "hpf_stop": 23, 24 | "res_type": "sinc_medium" 25 | } 26 | }, 27 | "sr": 32000, 28 | "pre_filter_start": 710, 29 | "pre_filter_stop": 731 30 | } 31 | -------------------------------------------------------------------------------- 
/src/GPT_SoVITS/tools/uvr5/lib/lib_v5/modelparams/2band_44100_lofi.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 512, 3 | "unstable_bins": 7, 4 | "reduction_bins": 510, 5 | "band": { 6 | "1": { 7 | "sr": 11025, 8 | "hl": 160, 9 | "n_fft": 768, 10 | "crop_start": 0, 11 | "crop_stop": 192, 12 | "lpf_start": 41, 13 | "lpf_stop": 139, 14 | "res_type": "sinc_fastest" 15 | }, 16 | "2": { 17 | "sr": 44100, 18 | "hl": 640, 19 | "n_fft": 1024, 20 | "crop_start": 10, 21 | "crop_stop": 320, 22 | "hpf_start": 47, 23 | "hpf_stop": 15, 24 | "res_type": "sinc_medium" 25 | } 26 | }, 27 | "sr": 44100, 28 | "pre_filter_start": 510, 29 | "pre_filter_stop": 512 30 | } 31 | -------------------------------------------------------------------------------- /src/GPT_SoVITS/tools/uvr5/lib/lib_v5/modelparams/2band_48000.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 768, 3 | "unstable_bins": 7, 4 | "reduction_bins": 705, 5 | "band": { 6 | "1": { 7 | "sr": 6000, 8 | "hl": 66, 9 | "n_fft": 512, 10 | "crop_start": 0, 11 | "crop_stop": 240, 12 | "lpf_start": 60, 13 | "lpf_stop": 240, 14 | "res_type": "sinc_fastest" 15 | }, 16 | "2": { 17 | "sr": 48000, 18 | "hl": 528, 19 | "n_fft": 1536, 20 | "crop_start": 22, 21 | "crop_stop": 505, 22 | "hpf_start": 82, 23 | "hpf_stop": 22, 24 | "res_type": "sinc_medium" 25 | } 26 | }, 27 | "sr": 48000, 28 | "pre_filter_start": 710, 29 | "pre_filter_stop": 731 30 | } -------------------------------------------------------------------------------- /src/GPT_SoVITS/tools/uvr5/lib/lib_v5/modelparams/3band_44100.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 768, 3 | "unstable_bins": 5, 4 | "reduction_bins": 733, 5 | "band": { 6 | "1": { 7 | "sr": 11025, 8 | "hl": 128, 9 | "n_fft": 768, 10 | "crop_start": 0, 11 | "crop_stop": 278, 12 | "lpf_start": 28, 13 | "lpf_stop": 140, 14 | "res_type": "polyphase" 15 | }, 16 | "2": { 17 | "sr": 22050, 18 | "hl": 256, 19 | "n_fft": 768, 20 | "crop_start": 14, 21 | "crop_stop": 322, 22 | "hpf_start": 70, 23 | "hpf_stop": 14, 24 | "lpf_start": 283, 25 | "lpf_stop": 314, 26 | "res_type": "polyphase" 27 | }, 28 | "3": { 29 | "sr": 44100, 30 | "hl": 512, 31 | "n_fft": 768, 32 | "crop_start": 131, 33 | "crop_stop": 313, 34 | "hpf_start": 154, 35 | "hpf_stop": 141, 36 | "res_type": "sinc_medium" 37 | } 38 | }, 39 | "sr": 44100, 40 | "pre_filter_start": 757, 41 | "pre_filter_stop": 768 42 | } 43 | -------------------------------------------------------------------------------- /src/GPT_SoVITS/tools/uvr5/lib/lib_v5/modelparams/3band_44100_mid.json: -------------------------------------------------------------------------------- 1 | { 2 | "mid_side": true, 3 | "bins": 768, 4 | "unstable_bins": 5, 5 | "reduction_bins": 733, 6 | "band": { 7 | "1": { 8 | "sr": 11025, 9 | "hl": 128, 10 | "n_fft": 768, 11 | "crop_start": 0, 12 | "crop_stop": 278, 13 | "lpf_start": 28, 14 | "lpf_stop": 140, 15 | "res_type": "polyphase" 16 | }, 17 | "2": { 18 | "sr": 22050, 19 | "hl": 256, 20 | "n_fft": 768, 21 | "crop_start": 14, 22 | "crop_stop": 322, 23 | "hpf_start": 70, 24 | "hpf_stop": 14, 25 | "lpf_start": 283, 26 | "lpf_stop": 314, 27 | "res_type": "polyphase" 28 | }, 29 | "3": { 30 | "sr": 44100, 31 | "hl": 512, 32 | "n_fft": 768, 33 | "crop_start": 131, 34 | "crop_stop": 313, 35 | "hpf_start": 154, 36 | "hpf_stop": 141, 37 | "res_type": "sinc_medium" 38 | } 39 | }, 40 | "sr": 44100, 41 | 
"pre_filter_start": 757, 42 | "pre_filter_stop": 768 43 | } 44 | -------------------------------------------------------------------------------- /src/GPT_SoVITS/tools/uvr5/lib/lib_v5/modelparams/3band_44100_msb2.json: -------------------------------------------------------------------------------- 1 | { 2 | "mid_side_b2": true, 3 | "bins": 640, 4 | "unstable_bins": 7, 5 | "reduction_bins": 565, 6 | "band": { 7 | "1": { 8 | "sr": 11025, 9 | "hl": 108, 10 | "n_fft": 1024, 11 | "crop_start": 0, 12 | "crop_stop": 187, 13 | "lpf_start": 92, 14 | "lpf_stop": 186, 15 | "res_type": "polyphase" 16 | }, 17 | "2": { 18 | "sr": 22050, 19 | "hl": 216, 20 | "n_fft": 768, 21 | "crop_start": 0, 22 | "crop_stop": 212, 23 | "hpf_start": 68, 24 | "hpf_stop": 34, 25 | "lpf_start": 174, 26 | "lpf_stop": 209, 27 | "res_type": "polyphase" 28 | }, 29 | "3": { 30 | "sr": 44100, 31 | "hl": 432, 32 | "n_fft": 640, 33 | "crop_start": 66, 34 | "crop_stop": 307, 35 | "hpf_start": 86, 36 | "hpf_stop": 72, 37 | "res_type": "kaiser_fast" 38 | } 39 | }, 40 | "sr": 44100, 41 | "pre_filter_start": 639, 42 | "pre_filter_stop": 640 43 | } 44 | -------------------------------------------------------------------------------- /src/GPT_SoVITS/tools/uvr5/lib/lib_v5/modelparams/4band_44100.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 768, 3 | "unstable_bins": 7, 4 | "reduction_bins": 668, 5 | "band": { 6 | "1": { 7 | "sr": 11025, 8 | "hl": 128, 9 | "n_fft": 1024, 10 | "crop_start": 0, 11 | "crop_stop": 186, 12 | "lpf_start": 37, 13 | "lpf_stop": 73, 14 | "res_type": "polyphase" 15 | }, 16 | "2": { 17 | "sr": 11025, 18 | "hl": 128, 19 | "n_fft": 512, 20 | "crop_start": 4, 21 | "crop_stop": 185, 22 | "hpf_start": 36, 23 | "hpf_stop": 18, 24 | "lpf_start": 93, 25 | "lpf_stop": 185, 26 | "res_type": "polyphase" 27 | }, 28 | "3": { 29 | "sr": 22050, 30 | "hl": 256, 31 | "n_fft": 512, 32 | "crop_start": 46, 33 | "crop_stop": 186, 34 | "hpf_start": 93, 35 | "hpf_stop": 46, 36 | "lpf_start": 164, 37 | "lpf_stop": 186, 38 | "res_type": "polyphase" 39 | }, 40 | "4": { 41 | "sr": 44100, 42 | "hl": 512, 43 | "n_fft": 768, 44 | "crop_start": 121, 45 | "crop_stop": 382, 46 | "hpf_start": 138, 47 | "hpf_stop": 123, 48 | "res_type": "sinc_medium" 49 | } 50 | }, 51 | "sr": 44100, 52 | "pre_filter_start": 740, 53 | "pre_filter_stop": 768 54 | } 55 | -------------------------------------------------------------------------------- /src/GPT_SoVITS/tools/uvr5/lib/lib_v5/modelparams/4band_44100_mid.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 768, 3 | "unstable_bins": 7, 4 | "mid_side": true, 5 | "reduction_bins": 668, 6 | "band": { 7 | "1": { 8 | "sr": 11025, 9 | "hl": 128, 10 | "n_fft": 1024, 11 | "crop_start": 0, 12 | "crop_stop": 186, 13 | "lpf_start": 37, 14 | "lpf_stop": 73, 15 | "res_type": "polyphase" 16 | }, 17 | "2": { 18 | "sr": 11025, 19 | "hl": 128, 20 | "n_fft": 512, 21 | "crop_start": 4, 22 | "crop_stop": 185, 23 | "hpf_start": 36, 24 | "hpf_stop": 18, 25 | "lpf_start": 93, 26 | "lpf_stop": 185, 27 | "res_type": "polyphase" 28 | }, 29 | "3": { 30 | "sr": 22050, 31 | "hl": 256, 32 | "n_fft": 512, 33 | "crop_start": 46, 34 | "crop_stop": 186, 35 | "hpf_start": 93, 36 | "hpf_stop": 46, 37 | "lpf_start": 164, 38 | "lpf_stop": 186, 39 | "res_type": "polyphase" 40 | }, 41 | "4": { 42 | "sr": 44100, 43 | "hl": 512, 44 | "n_fft": 768, 45 | "crop_start": 121, 46 | "crop_stop": 382, 47 | "hpf_start": 138, 48 | "hpf_stop": 123, 49 | 
"res_type": "sinc_medium" 50 | } 51 | }, 52 | "sr": 44100, 53 | "pre_filter_start": 740, 54 | "pre_filter_stop": 768 55 | } 56 | -------------------------------------------------------------------------------- /src/GPT_SoVITS/tools/uvr5/lib/lib_v5/modelparams/4band_44100_msb.json: -------------------------------------------------------------------------------- 1 | { 2 | "mid_side_b": true, 3 | "bins": 768, 4 | "unstable_bins": 7, 5 | "reduction_bins": 668, 6 | "band": { 7 | "1": { 8 | "sr": 11025, 9 | "hl": 128, 10 | "n_fft": 1024, 11 | "crop_start": 0, 12 | "crop_stop": 186, 13 | "lpf_start": 37, 14 | "lpf_stop": 73, 15 | "res_type": "polyphase" 16 | }, 17 | "2": { 18 | "sr": 11025, 19 | "hl": 128, 20 | "n_fft": 512, 21 | "crop_start": 4, 22 | "crop_stop": 185, 23 | "hpf_start": 36, 24 | "hpf_stop": 18, 25 | "lpf_start": 93, 26 | "lpf_stop": 185, 27 | "res_type": "polyphase" 28 | }, 29 | "3": { 30 | "sr": 22050, 31 | "hl": 256, 32 | "n_fft": 512, 33 | "crop_start": 46, 34 | "crop_stop": 186, 35 | "hpf_start": 93, 36 | "hpf_stop": 46, 37 | "lpf_start": 164, 38 | "lpf_stop": 186, 39 | "res_type": "polyphase" 40 | }, 41 | "4": { 42 | "sr": 44100, 43 | "hl": 512, 44 | "n_fft": 768, 45 | "crop_start": 121, 46 | "crop_stop": 382, 47 | "hpf_start": 138, 48 | "hpf_stop": 123, 49 | "res_type": "sinc_medium" 50 | } 51 | }, 52 | "sr": 44100, 53 | "pre_filter_start": 740, 54 | "pre_filter_stop": 768 55 | } -------------------------------------------------------------------------------- /src/GPT_SoVITS/tools/uvr5/lib/lib_v5/modelparams/4band_44100_msb2.json: -------------------------------------------------------------------------------- 1 | { 2 | "mid_side_b": true, 3 | "bins": 768, 4 | "unstable_bins": 7, 5 | "reduction_bins": 668, 6 | "band": { 7 | "1": { 8 | "sr": 11025, 9 | "hl": 128, 10 | "n_fft": 1024, 11 | "crop_start": 0, 12 | "crop_stop": 186, 13 | "lpf_start": 37, 14 | "lpf_stop": 73, 15 | "res_type": "polyphase" 16 | }, 17 | "2": { 18 | "sr": 11025, 19 | "hl": 128, 20 | "n_fft": 512, 21 | "crop_start": 4, 22 | "crop_stop": 185, 23 | "hpf_start": 36, 24 | "hpf_stop": 18, 25 | "lpf_start": 93, 26 | "lpf_stop": 185, 27 | "res_type": "polyphase" 28 | }, 29 | "3": { 30 | "sr": 22050, 31 | "hl": 256, 32 | "n_fft": 512, 33 | "crop_start": 46, 34 | "crop_stop": 186, 35 | "hpf_start": 93, 36 | "hpf_stop": 46, 37 | "lpf_start": 164, 38 | "lpf_stop": 186, 39 | "res_type": "polyphase" 40 | }, 41 | "4": { 42 | "sr": 44100, 43 | "hl": 512, 44 | "n_fft": 768, 45 | "crop_start": 121, 46 | "crop_stop": 382, 47 | "hpf_start": 138, 48 | "hpf_stop": 123, 49 | "res_type": "sinc_medium" 50 | } 51 | }, 52 | "sr": 44100, 53 | "pre_filter_start": 740, 54 | "pre_filter_stop": 768 55 | } -------------------------------------------------------------------------------- /src/GPT_SoVITS/tools/uvr5/lib/lib_v5/modelparams/4band_44100_reverse.json: -------------------------------------------------------------------------------- 1 | { 2 | "reverse": true, 3 | "bins": 768, 4 | "unstable_bins": 7, 5 | "reduction_bins": 668, 6 | "band": { 7 | "1": { 8 | "sr": 11025, 9 | "hl": 128, 10 | "n_fft": 1024, 11 | "crop_start": 0, 12 | "crop_stop": 186, 13 | "lpf_start": 37, 14 | "lpf_stop": 73, 15 | "res_type": "polyphase" 16 | }, 17 | "2": { 18 | "sr": 11025, 19 | "hl": 128, 20 | "n_fft": 512, 21 | "crop_start": 4, 22 | "crop_stop": 185, 23 | "hpf_start": 36, 24 | "hpf_stop": 18, 25 | "lpf_start": 93, 26 | "lpf_stop": 185, 27 | "res_type": "polyphase" 28 | }, 29 | "3": { 30 | "sr": 22050, 31 | "hl": 256, 32 | "n_fft": 512, 33 | 
"crop_start": 46, 34 | "crop_stop": 186, 35 | "hpf_start": 93, 36 | "hpf_stop": 46, 37 | "lpf_start": 164, 38 | "lpf_stop": 186, 39 | "res_type": "polyphase" 40 | }, 41 | "4": { 42 | "sr": 44100, 43 | "hl": 512, 44 | "n_fft": 768, 45 | "crop_start": 121, 46 | "crop_stop": 382, 47 | "hpf_start": 138, 48 | "hpf_stop": 123, 49 | "res_type": "sinc_medium" 50 | } 51 | }, 52 | "sr": 44100, 53 | "pre_filter_start": 740, 54 | "pre_filter_stop": 768 55 | } -------------------------------------------------------------------------------- /src/GPT_SoVITS/tools/uvr5/lib/lib_v5/modelparams/4band_44100_sw.json: -------------------------------------------------------------------------------- 1 | { 2 | "stereo_w": true, 3 | "bins": 768, 4 | "unstable_bins": 7, 5 | "reduction_bins": 668, 6 | "band": { 7 | "1": { 8 | "sr": 11025, 9 | "hl": 128, 10 | "n_fft": 1024, 11 | "crop_start": 0, 12 | "crop_stop": 186, 13 | "lpf_start": 37, 14 | "lpf_stop": 73, 15 | "res_type": "polyphase" 16 | }, 17 | "2": { 18 | "sr": 11025, 19 | "hl": 128, 20 | "n_fft": 512, 21 | "crop_start": 4, 22 | "crop_stop": 185, 23 | "hpf_start": 36, 24 | "hpf_stop": 18, 25 | "lpf_start": 93, 26 | "lpf_stop": 185, 27 | "res_type": "polyphase" 28 | }, 29 | "3": { 30 | "sr": 22050, 31 | "hl": 256, 32 | "n_fft": 512, 33 | "crop_start": 46, 34 | "crop_stop": 186, 35 | "hpf_start": 93, 36 | "hpf_stop": 46, 37 | "lpf_start": 164, 38 | "lpf_stop": 186, 39 | "res_type": "polyphase" 40 | }, 41 | "4": { 42 | "sr": 44100, 43 | "hl": 512, 44 | "n_fft": 768, 45 | "crop_start": 121, 46 | "crop_stop": 382, 47 | "hpf_start": 138, 48 | "hpf_stop": 123, 49 | "res_type": "sinc_medium" 50 | } 51 | }, 52 | "sr": 44100, 53 | "pre_filter_start": 740, 54 | "pre_filter_stop": 768 55 | } -------------------------------------------------------------------------------- /src/GPT_SoVITS/tools/uvr5/lib/lib_v5/modelparams/4band_v2.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 672, 3 | "unstable_bins": 8, 4 | "reduction_bins": 637, 5 | "band": { 6 | "1": { 7 | "sr": 7350, 8 | "hl": 80, 9 | "n_fft": 640, 10 | "crop_start": 0, 11 | "crop_stop": 85, 12 | "lpf_start": 25, 13 | "lpf_stop": 53, 14 | "res_type": "polyphase" 15 | }, 16 | "2": { 17 | "sr": 7350, 18 | "hl": 80, 19 | "n_fft": 320, 20 | "crop_start": 4, 21 | "crop_stop": 87, 22 | "hpf_start": 25, 23 | "hpf_stop": 12, 24 | "lpf_start": 31, 25 | "lpf_stop": 62, 26 | "res_type": "polyphase" 27 | }, 28 | "3": { 29 | "sr": 14700, 30 | "hl": 160, 31 | "n_fft": 512, 32 | "crop_start": 17, 33 | "crop_stop": 216, 34 | "hpf_start": 48, 35 | "hpf_stop": 24, 36 | "lpf_start": 139, 37 | "lpf_stop": 210, 38 | "res_type": "polyphase" 39 | }, 40 | "4": { 41 | "sr": 44100, 42 | "hl": 480, 43 | "n_fft": 960, 44 | "crop_start": 78, 45 | "crop_stop": 383, 46 | "hpf_start": 130, 47 | "hpf_stop": 86, 48 | "res_type": "kaiser_fast" 49 | } 50 | }, 51 | "sr": 44100, 52 | "pre_filter_start": 668, 53 | "pre_filter_stop": 672 54 | } -------------------------------------------------------------------------------- /src/GPT_SoVITS/tools/uvr5/lib/lib_v5/modelparams/4band_v2_sn.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 672, 3 | "unstable_bins": 8, 4 | "reduction_bins": 637, 5 | "band": { 6 | "1": { 7 | "sr": 7350, 8 | "hl": 80, 9 | "n_fft": 640, 10 | "crop_start": 0, 11 | "crop_stop": 85, 12 | "lpf_start": 25, 13 | "lpf_stop": 53, 14 | "res_type": "polyphase" 15 | }, 16 | "2": { 17 | "sr": 7350, 18 | "hl": 80, 19 | "n_fft": 320, 20 
| "crop_start": 4, 21 | "crop_stop": 87, 22 | "hpf_start": 25, 23 | "hpf_stop": 12, 24 | "lpf_start": 31, 25 | "lpf_stop": 62, 26 | "res_type": "polyphase" 27 | }, 28 | "3": { 29 | "sr": 14700, 30 | "hl": 160, 31 | "n_fft": 512, 32 | "crop_start": 17, 33 | "crop_stop": 216, 34 | "hpf_start": 48, 35 | "hpf_stop": 24, 36 | "lpf_start": 139, 37 | "lpf_stop": 210, 38 | "res_type": "polyphase" 39 | }, 40 | "4": { 41 | "sr": 44100, 42 | "hl": 480, 43 | "n_fft": 960, 44 | "crop_start": 78, 45 | "crop_stop": 383, 46 | "hpf_start": 130, 47 | "hpf_stop": 86, 48 | "convert_channels": "stereo_n", 49 | "res_type": "kaiser_fast" 50 | } 51 | }, 52 | "sr": 44100, 53 | "pre_filter_start": 668, 54 | "pre_filter_stop": 672 55 | } -------------------------------------------------------------------------------- /src/GPT_SoVITS/tools/uvr5/lib/lib_v5/modelparams/4band_v3.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 672, 3 | "unstable_bins": 8, 4 | "reduction_bins": 530, 5 | "band": { 6 | "1": { 7 | "sr": 7350, 8 | "hl": 80, 9 | "n_fft": 640, 10 | "crop_start": 0, 11 | "crop_stop": 85, 12 | "lpf_start": 25, 13 | "lpf_stop": 53, 14 | "res_type": "polyphase" 15 | }, 16 | "2": { 17 | "sr": 7350, 18 | "hl": 80, 19 | "n_fft": 320, 20 | "crop_start": 4, 21 | "crop_stop": 87, 22 | "hpf_start": 25, 23 | "hpf_stop": 12, 24 | "lpf_start": 31, 25 | "lpf_stop": 62, 26 | "res_type": "polyphase" 27 | }, 28 | "3": { 29 | "sr": 14700, 30 | "hl": 160, 31 | "n_fft": 512, 32 | "crop_start": 17, 33 | "crop_stop": 216, 34 | "hpf_start": 48, 35 | "hpf_stop": 24, 36 | "lpf_start": 139, 37 | "lpf_stop": 210, 38 | "res_type": "polyphase" 39 | }, 40 | "4": { 41 | "sr": 44100, 42 | "hl": 480, 43 | "n_fft": 960, 44 | "crop_start": 78, 45 | "crop_stop": 383, 46 | "hpf_start": 130, 47 | "hpf_stop": 86, 48 | "res_type": "kaiser_fast" 49 | } 50 | }, 51 | "sr": 44100, 52 | "pre_filter_start": 668, 53 | "pre_filter_stop": 672 54 | } -------------------------------------------------------------------------------- /src/GPT_SoVITS/tools/uvr5/lib/lib_v5/modelparams/ensemble.json: -------------------------------------------------------------------------------- 1 | { 2 | "mid_side_b2": true, 3 | "bins": 1280, 4 | "unstable_bins": 7, 5 | "reduction_bins": 565, 6 | "band": { 7 | "1": { 8 | "sr": 11025, 9 | "hl": 108, 10 | "n_fft": 2048, 11 | "crop_start": 0, 12 | "crop_stop": 374, 13 | "lpf_start": 92, 14 | "lpf_stop": 186, 15 | "res_type": "polyphase" 16 | }, 17 | "2": { 18 | "sr": 22050, 19 | "hl": 216, 20 | "n_fft": 1536, 21 | "crop_start": 0, 22 | "crop_stop": 424, 23 | "hpf_start": 68, 24 | "hpf_stop": 34, 25 | "lpf_start": 348, 26 | "lpf_stop": 418, 27 | "res_type": "polyphase" 28 | }, 29 | "3": { 30 | "sr": 44100, 31 | "hl": 432, 32 | "n_fft": 1280, 33 | "crop_start": 132, 34 | "crop_stop": 614, 35 | "hpf_start": 172, 36 | "hpf_stop": 144, 37 | "res_type": "polyphase" 38 | } 39 | }, 40 | "sr": 44100, 41 | "pre_filter_start": 1280, 42 | "pre_filter_stop": 1280 43 | } -------------------------------------------------------------------------------- /src/GPT_SoVITS/tools/uvr5/uvr5_weights/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | !.gitignore 3 | -------------------------------------------------------------------------------- /src/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Henry-23/VideoChat/3d6700c5f2c4e49220501898f96049579b39345e/src/__init__.py 
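The band-split JSON presets above are consumed by the ModelParameters class defined earlier in model_param_init.py: a .json path is parsed with object_pairs_hook=int_keys so the band indices come back as integers, any missing stereo/mid-side flags are filled in as False, and an empty path falls back to the built-in default_param layout. A minimal sketch of both paths, not part of the repository; the import and file paths are assumptions based on the directory layout:

from src.GPT_SoVITS.tools.uvr5.lib.lib_v5.model_param_init import ModelParameters

# Load one of the presets shown above (path assumed from the repo layout).
mp = ModelParameters("src/GPT_SoVITS/tools/uvr5/lib/lib_v5/modelparams/2band_32000.json")
print(mp.param["sr"])                            # 32000, the preset's target sample rate
for band_idx, band in sorted(mp.param["band"].items()):
    # int_keys() converted the JSON keys "1", "2", ... into real ints
    print(band_idx, band["sr"], band["n_fft"], band["res_type"])
print(mp.param["mid_side"])                      # False, filled in by the constructor

# With no config path, the built-in default_param two-band layout is used instead.
default_mp = ModelParameters()
print(default_mp.param["bins"], sorted(default_mp.param["band"]))   # 768 [1, 2]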
-------------------------------------------------------------------------------- /src/asr.py: -------------------------------------------------------------------------------- 1 | from funasr import AutoModel 2 | from funasr.utils.postprocess_utils import rich_transcription_postprocess 3 | 4 | 5 | class Fun_ASR: 6 | def __init__(self, model = "iic/SenseVoiceSmall", vad_model = "fsmn-vad", vad_kwargs = {"max_single_segment_time": 30000}, device = "cuda", disable_update = True): 7 | self.model = AutoModel( 8 | model = model, 9 | # vad_model = vad_model, 10 | # vad_kwargs=vad_kwargs, 11 | device = device, 12 | disable_update = disable_update, 13 | ) 14 | 15 | def infer(self, audio_file): 16 | res = self.model.generate( 17 | input = audio_file, 18 | cache = {}, 19 | language = "auto", 20 | use_itn = True, 21 | batch_size_s = 60, 22 | merge_vad = True, 23 | merge_length_s = 15, 24 | ) 25 | text = rich_transcription_postprocess(res[0]["text"]) 26 | 27 | return text 28 | -------------------------------------------------------------------------------- /src/musetalk/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Henry-23/VideoChat/3d6700c5f2c4e49220501898f96049579b39345e/src/musetalk/__init__.py -------------------------------------------------------------------------------- /src/musetalk/models/unet.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import math 4 | import json 5 | 6 | from diffusers import UNet2DConditionModel 7 | import sys 8 | import time 9 | import numpy as np 10 | import os 11 | 12 | class PositionalEncoding(nn.Module): 13 | def __init__(self, d_model=384, max_len=5000): 14 | super(PositionalEncoding, self).__init__() 15 | pe = torch.zeros(max_len, d_model) 16 | position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1) 17 | div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)) 18 | pe[:, 0::2] = torch.sin(position * div_term) 19 | pe[:, 1::2] = torch.cos(position * div_term) 20 | pe = pe.unsqueeze(0) 21 | self.register_buffer('pe', pe) 22 | 23 | def forward(self, x): 24 | b, seq_len, d_model = x.size() 25 | pe = self.pe[:, :seq_len, :] 26 | x = x + pe.to(x.device) 27 | return x 28 | 29 | class UNet(): 30 | def __init__(self, 31 | unet_config, 32 | model_path, 33 | use_float16=False, 34 | device = "cuda" 35 | ): 36 | with open(unet_config, 'r') as f: 37 | unet_config = json.load(f) 38 | self.model = UNet2DConditionModel(**unet_config) 39 | self.pe = PositionalEncoding(d_model=384) 40 | self.device = torch.device(device if torch.cuda.is_available() else "cpu") 41 | weights = torch.load(model_path) if torch.cuda.is_available() else torch.load(model_path, map_location=self.device) 42 | self.model.load_state_dict(weights) 43 | if use_float16: 44 | self.model = self.model.half() 45 | self.model.to(self.device) 46 | 47 | if __name__ == "__main__": 48 | unet = UNet() 49 | -------------------------------------------------------------------------------- /src/musetalk/utils/__init__.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from os.path import abspath, dirname 3 | current_dir = dirname(abspath(__file__)) 4 | parent_dir = dirname(current_dir) 5 | sys.path.append(parent_dir+'/utils') 6 | -------------------------------------------------------------------------------- /src/musetalk/utils/blending.py: 
-------------------------------------------------------------------------------- 1 | from PIL import Image 2 | import numpy as np 3 | import cv2 4 | from face_parsing import FaceParsing 5 | 6 | fp = FaceParsing() 7 | 8 | def get_crop_box(box, expand): 9 | x, y, x1, y1 = box 10 | x_c, y_c = (x+x1)//2, (y+y1)//2 11 | w, h = x1-x, y1-y 12 | s = int(max(w, h)//2*expand) 13 | crop_box = [x_c-s, y_c-s, x_c+s, y_c+s] 14 | return crop_box, s 15 | 16 | def face_seg(image): 17 | seg_image = fp(image) 18 | if seg_image is None: 19 | print("error, no person_segment") 20 | return None 21 | 22 | seg_image = seg_image.resize(image.size) 23 | return seg_image 24 | 25 | def get_image(image,face,face_box,upper_boundary_ratio = 0.5,expand=1.2): 26 | body = Image.fromarray(image[:,:,::-1]) 27 | face = Image.fromarray(face[:,:,::-1]) 28 | 29 | x, y, x1, y1 = face_box 30 | crop_box, s = get_crop_box(face_box, expand) 31 | x_s, y_s, x_e, y_e = crop_box 32 | face_position = (x, y) 33 | 34 | face_large = body.crop(crop_box) 35 | ori_shape = face_large.size 36 | 37 | mask_image = face_seg(face_large) 38 | mask_small = mask_image.crop((x-x_s, y-y_s, x1-x_s, y1-y_s)) 39 | mask_image = Image.new('L', ori_shape, 0) 40 | mask_image.paste(mask_small, (x-x_s, y-y_s, x1-x_s, y1-y_s)) 41 | 42 | # keep upper_boundary_ratio of talking area 43 | width, height = mask_image.size 44 | top_boundary = int(height * upper_boundary_ratio) 45 | modified_mask_image = Image.new('L', ori_shape, 0) 46 | modified_mask_image.paste(mask_image.crop((0, top_boundary, width, height)), (0, top_boundary)) 47 | 48 | blur_kernel_size = int(0.1 * ori_shape[0] // 2 * 2) + 1 49 | mask_array = cv2.GaussianBlur(np.array(modified_mask_image), (blur_kernel_size, blur_kernel_size), 0) 50 | mask_image = Image.fromarray(mask_array) 51 | 52 | face_large.paste(face, (x-x_s, y-y_s, x1-x_s, y1-y_s)) 53 | body.paste(face_large, crop_box[:2], mask_image) 54 | body = np.array(body) 55 | return body[:,:,::-1] 56 | 57 | def get_image_prepare_material(image,face_box,upper_boundary_ratio = 0.5,expand=1.2): 58 | body = Image.fromarray(image[:,:,::-1]) 59 | 60 | x, y, x1, y1 = face_box 61 | crop_box, s = get_crop_box(face_box, expand) 62 | x_s, y_s, x_e, y_e = crop_box 63 | 64 | face_large = body.crop(crop_box) 65 | ori_shape = face_large.size 66 | 67 | mask_image = face_seg(face_large) 68 | mask_small = mask_image.crop((x-x_s, y-y_s, x1-x_s, y1-y_s)) 69 | mask_image = Image.new('L', ori_shape, 0) 70 | mask_image.paste(mask_small, (x-x_s, y-y_s, x1-x_s, y1-y_s)) 71 | 72 | # keep upper_boundary_ratio of talking area 73 | width, height = mask_image.size 74 | top_boundary = int(height * upper_boundary_ratio) 75 | modified_mask_image = Image.new('L', ori_shape, 0) 76 | modified_mask_image.paste(mask_image.crop((0, top_boundary, width, height)), (0, top_boundary)) 77 | 78 | blur_kernel_size = int(0.1 * ori_shape[0] // 2 * 2) + 1 79 | mask_array = cv2.GaussianBlur(np.array(modified_mask_image), (blur_kernel_size, blur_kernel_size), 0) 80 | return mask_array,crop_box 81 | 82 | def get_image_blending(image, face, face_box, mask_array, crop_box): 83 | body = Image.fromarray(image[:,:,::-1]) 84 | face = Image.fromarray(face[:,:,::-1]) 85 | 86 | x, y, x1, y1 = face_box 87 | x_s, y_s, x_e, y_e = crop_box 88 | face_large = body.crop(crop_box) 89 | 90 | mask_image = Image.fromarray(mask_array) 91 | mask_image = mask_image.convert("L") 92 | face_large.paste(face, (x-x_s, y-y_s, x1-x_s, y1-y_s)) 93 | body.paste(face_large, crop_box[:2], mask_image) 94 | body = np.array(body) 95 | return 
body[:,:,::-1] -------------------------------------------------------------------------------- /src/musetalk/utils/dwpose/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Henry-23/VideoChat/3d6700c5f2c4e49220501898f96049579b39345e/src/musetalk/utils/dwpose/__init__.py -------------------------------------------------------------------------------- /src/musetalk/utils/dwpose/default_runtime.py: -------------------------------------------------------------------------------- 1 | default_scope = 'mmpose' 2 | 3 | # hooks 4 | default_hooks = dict( 5 | timer=dict(type='IterTimerHook'), 6 | logger=dict(type='LoggerHook', interval=50), 7 | param_scheduler=dict(type='ParamSchedulerHook'), 8 | checkpoint=dict(type='CheckpointHook', interval=10), 9 | sampler_seed=dict(type='DistSamplerSeedHook'), 10 | visualization=dict(type='PoseVisualizationHook', enable=False), 11 | badcase=dict( 12 | type='BadCaseAnalysisHook', 13 | enable=False, 14 | out_dir='badcase', 15 | metric_type='loss', 16 | badcase_thr=5)) 17 | 18 | # custom hooks 19 | custom_hooks = [ 20 | # Synchronize model buffers such as running_mean and running_var in BN 21 | # at the end of each epoch 22 | dict(type='SyncBuffersHook') 23 | ] 24 | 25 | # multi-processing backend 26 | env_cfg = dict( 27 | cudnn_benchmark=False, 28 | mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), 29 | dist_cfg=dict(backend='nccl'), 30 | ) 31 | 32 | # visualizer 33 | vis_backends = [ 34 | dict(type='LocalVisBackend'), 35 | # dict(type='TensorboardVisBackend'), 36 | # dict(type='WandbVisBackend'), 37 | ] 38 | visualizer = dict( 39 | type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer') 40 | 41 | # logger 42 | log_processor = dict( 43 | type='LogProcessor', window_size=50, by_epoch=True, num_digits=6) 44 | log_level = 'INFO' 45 | load_from = None 46 | resume = False 47 | 48 | # file I/O backend 49 | backend_args = dict(backend='local') 50 | 51 | # training/validation/testing progress 52 | train_cfg = dict(by_epoch=True) 53 | val_cfg = dict() 54 | test_cfg = dict() 55 | -------------------------------------------------------------------------------- /src/musetalk/utils/face_detection/README.md: -------------------------------------------------------------------------------- 1 | The code for Face Detection in this folder has been taken from the wonderful [face_alignment](https://github.com/1adrianb/face-alignment) repository. This has been modified to take batches of faces at a time. 
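The blending helpers above are built for a prepare-once, blend-per-frame loop: get_image_prepare_material computes a blurred face-parsing mask and an expanded crop box for a source frame, and get_image_blending then pastes each generated face crop back into it. A rough sketch of that loop, assuming the face-parse-bisent weights referenced in face_parsing/__init__.py are downloaded and the script runs from the repository root; the image paths and box coordinates below are placeholders, and the generated face is resized to the face_box size before pasting:

import cv2
from src.musetalk.utils.blending import get_image_prepare_material, get_image_blending

frame = cv2.imread("avatar_frame.png")      # full BGR source frame (placeholder path)
face_box = (110, 80, 310, 330)              # x, y, x1, y1 from the landmark step (dummy values)

# Once per source frame: blurred segmentation mask + expanded crop box around the face
mask_array, crop_box = get_image_prepare_material(frame, face_box)

# Per generated frame: paste the lip-synced face crop back into the full frame
face = cv2.resize(cv2.imread("generated_face.png"),
                  (face_box[2] - face_box[0], face_box[3] - face_box[1]))
blended = get_image_blending(frame, face, face_box, mask_array, crop_box)
cv2.imwrite("blended_frame.png", blended)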
-------------------------------------------------------------------------------- /src/musetalk/utils/face_detection/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | __author__ = """Adrian Bulat""" 4 | __email__ = 'adrian.bulat@nottingham.ac.uk' 5 | __version__ = '1.0.1' 6 | 7 | from .api import FaceAlignment, LandmarksType, NetworkSize, YOLOv8_face 8 | -------------------------------------------------------------------------------- /src/musetalk/utils/face_detection/detection/__init__.py: -------------------------------------------------------------------------------- 1 | from .core import FaceDetector -------------------------------------------------------------------------------- /src/musetalk/utils/face_detection/detection/sfd/__init__.py: -------------------------------------------------------------------------------- 1 | from .sfd_detector import SFDDetector as FaceDetector -------------------------------------------------------------------------------- /src/musetalk/utils/face_detection/detection/sfd/sfd_detector.py: -------------------------------------------------------------------------------- 1 | import os 2 | import cv2 3 | from torch.utils.model_zoo import load_url 4 | 5 | from ..core import FaceDetector 6 | 7 | from .net_s3fd import s3fd 8 | from .bbox import * 9 | from .detect import * 10 | 11 | models_urls = { 12 | 's3fd': 'https://www.adrianbulat.com/downloads/python-fan/s3fd-619a316812.pth', 13 | } 14 | 15 | 16 | class SFDDetector(FaceDetector): 17 | def __init__(self, device, path_to_detector=os.path.join(os.path.dirname(os.path.abspath(__file__)), 's3fd.pth'), verbose=False): 18 | super(SFDDetector, self).__init__(device, verbose) 19 | 20 | # Initialise the face detector 21 | if not os.path.isfile(path_to_detector): 22 | model_weights = load_url(models_urls['s3fd']) 23 | else: 24 | model_weights = torch.load(path_to_detector) 25 | 26 | self.face_detector = s3fd() 27 | self.face_detector.load_state_dict(model_weights) 28 | self.face_detector.to(device) 29 | self.face_detector.eval() 30 | 31 | def detect_from_image(self, tensor_or_path): 32 | image = self.tensor_or_path_to_ndarray(tensor_or_path) 33 | 34 | bboxlist = detect(self.face_detector, image, device=self.device) 35 | keep = nms(bboxlist, 0.3) 36 | bboxlist = bboxlist[keep, :] 37 | bboxlist = [x for x in bboxlist if x[-1] > 0.5] 38 | 39 | return bboxlist 40 | 41 | def detect_from_batch(self, images): 42 | bboxlists = batch_detect(self.face_detector, images, device=self.device) 43 | keeps = [nms(bboxlists[:, i, :], 0.3) for i in range(bboxlists.shape[1])] 44 | bboxlists = [bboxlists[keep, i, :] for i, keep in enumerate(keeps)] 45 | bboxlists = [[x for x in bboxlist if x[-1] > 0.5] for bboxlist in bboxlists] 46 | 47 | return bboxlists 48 | 49 | @property 50 | def reference_scale(self): 51 | return 195 52 | 53 | @property 54 | def reference_x_shift(self): 55 | return 0 56 | 57 | @property 58 | def reference_y_shift(self): 59 | return 0 60 | -------------------------------------------------------------------------------- /src/musetalk/utils/face_parsing/__init__.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import time 3 | import os 4 | import cv2 5 | import numpy as np 6 | from PIL import Image 7 | from .model import BiSeNet 8 | import torchvision.transforms as transforms 9 | 10 | class FaceParsing(): 11 | def __init__(self): 12 | self.net = self.model_init() 13 | self.preprocess 
= self.image_preprocess() 14 | 15 | def model_init(self, 16 | resnet_path='./weights/face-parse-bisent/resnet18-5c106cde.pth', 17 | model_pth='./weights/face-parse-bisent/79999_iter.pth'): 18 | net = BiSeNet(resnet_path) 19 | if torch.cuda.is_available(): 20 | net.cuda() 21 | net.load_state_dict(torch.load(model_pth)) 22 | else: 23 | net.load_state_dict(torch.load(model_pth, map_location=torch.device('cpu'))) 24 | net.eval() 25 | return net 26 | 27 | def image_preprocess(self): 28 | return transforms.Compose([ 29 | transforms.ToTensor(), 30 | transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)), 31 | ]) 32 | 33 | def __call__(self, image, size=(512, 512)): 34 | if isinstance(image, str): 35 | image = Image.open(image) 36 | 37 | width, height = image.size 38 | with torch.no_grad(): 39 | image = image.resize(size, Image.BILINEAR) 40 | img = self.preprocess(image) 41 | if torch.cuda.is_available(): 42 | img = torch.unsqueeze(img, 0).cuda() 43 | else: 44 | img = torch.unsqueeze(img, 0) 45 | out = self.net(img)[0] 46 | parsing = out.squeeze(0).cpu().numpy().argmax(0) 47 | parsing[np.where(parsing>13)] = 0 48 | parsing[np.where(parsing>=1)] = 255 49 | parsing = Image.fromarray(parsing.astype(np.uint8)) 50 | return parsing 51 | 52 | 53 | 54 | -------------------------------------------------------------------------------- /src/musetalk/utils/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import cv2 3 | import numpy as np 4 | import torch 5 | 6 | ffmpeg_path = os.getenv('FFMPEG_PATH') 7 | if ffmpeg_path is None: 8 | print("please download ffmpeg-static and export to FFMPEG_PATH. \nFor example: export FFMPEG_PATH=/musetalk/ffmpeg-4.4-amd64-static") 9 | elif ffmpeg_path not in os.getenv('PATH'): 10 | print("add ffmpeg to path") 11 | os.environ["PATH"] = f"{ffmpeg_path}:{os.environ['PATH']}" 12 | 13 | 14 | from src.musetalk.whisper.audio2feature import Audio2Feature 15 | from src.musetalk.models.vae import VAE 16 | from src.musetalk.models.unet import UNet,PositionalEncoding 17 | 18 | def load_all_model(): 19 | audio_processor = Audio2Feature(model_path="./weights/whisper/tiny.pt") 20 | vae = VAE(model_path = "./weights/sd-vae-ft-mse/") 21 | unet = UNet(unet_config="./weights/musetalk/musetalk.json", 22 | model_path ="./weights/musetalk/pytorch_model.bin") 23 | pe = PositionalEncoding(d_model=384) 24 | return audio_processor,vae,unet,pe 25 | 26 | def get_file_type(video_path): 27 | _, ext = os.path.splitext(video_path) 28 | 29 | if ext.lower() in ['.jpg', '.jpeg', '.png', '.bmp', '.tif', '.tiff']: 30 | return 'image' 31 | elif ext.lower() in ['.avi', '.mp4', '.mov', '.flv', '.mkv']: 32 | return 'video' 33 | else: 34 | return 'unsupported' 35 | 36 | def get_video_fps(video_path): 37 | video = cv2.VideoCapture(video_path) 38 | fps = video.get(cv2.CAP_PROP_FPS) 39 | video.release() 40 | return fps 41 | 42 | def datagen(whisper_chunks, 43 | vae_encode_latents, 44 | batch_size=8, 45 | delay_frame=0): 46 | whisper_batch, latent_batch = [], [] 47 | for i, w in enumerate(whisper_chunks): 48 | idx = (i+delay_frame)%len(vae_encode_latents) 49 | latent = vae_encode_latents[idx] 50 | whisper_batch.append(w) 51 | latent_batch.append(latent) 52 | 53 | if len(latent_batch) >= batch_size: 54 | whisper_batch = np.stack(whisper_batch) 55 | latent_batch = torch.cat(latent_batch, dim=0) 56 | yield whisper_batch, latent_batch 57 | whisper_batch, latent_batch = [], [] 58 | 59 | # the last batch may smaller than batch size 60 | if len(latent_batch) 
> 0: 61 | whisper_batch = np.stack(whisper_batch) 62 | latent_batch = torch.cat(latent_batch, dim=0) 63 | 64 | yield whisper_batch, latent_batch 65 | 66 | def video2imgs(vid_path, save_path,cut_frame = 10000000): 67 | cap = cv2.VideoCapture(vid_path) 68 | count = 0 69 | while True: 70 | if count > cut_frame: 71 | break 72 | ret, frame = cap.read() 73 | if ret: 74 | cv2.imwrite(f"{save_path}/{count:08d}.png", frame) 75 | count += 1 76 | else: 77 | break 78 | 79 | def osmakedirs(path_list): 80 | for path in path_list: 81 | os.makedirs(path) if not os.path.exists(path) else None -------------------------------------------------------------------------------- /src/musetalk/whisper/whisper/__main__.py: -------------------------------------------------------------------------------- 1 | from .transcribe import cli 2 | 3 | 4 | cli() 5 | -------------------------------------------------------------------------------- /src/musetalk/whisper/whisper/assets/gpt2/special_tokens_map.json: -------------------------------------------------------------------------------- 1 | {"bos_token": "<|endoftext|>", "eos_token": "<|endoftext|>", "unk_token": "<|endoftext|>"} -------------------------------------------------------------------------------- /src/musetalk/whisper/whisper/assets/gpt2/tokenizer_config.json: -------------------------------------------------------------------------------- 1 | {"unk_token": "<|endoftext|>", "bos_token": "<|endoftext|>", "eos_token": "<|endoftext|>", "add_prefix_space": false, "model_max_length": 1024, "special_tokens_map_file": null, "name_or_path": "gpt2", "tokenizer_class": "GPT2Tokenizer"} -------------------------------------------------------------------------------- /src/musetalk/whisper/whisper/assets/mel_filters.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Henry-23/VideoChat/3d6700c5f2c4e49220501898f96049579b39345e/src/musetalk/whisper/whisper/assets/mel_filters.npz -------------------------------------------------------------------------------- /src/musetalk/whisper/whisper/assets/multilingual/added_tokens.json: -------------------------------------------------------------------------------- 1 | {"<|endoftext|>": 50257} 2 | -------------------------------------------------------------------------------- /src/musetalk/whisper/whisper/assets/multilingual/special_tokens_map.json: -------------------------------------------------------------------------------- 1 | {"bos_token": "<|endoftext|>", "eos_token": "<|endoftext|>", "unk_token": "<|endoftext|>"} -------------------------------------------------------------------------------- /src/musetalk/whisper/whisper/assets/multilingual/tokenizer_config.json: -------------------------------------------------------------------------------- 1 | {"unk_token": {"content": "<|endoftext|>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "bos_token": {"content": "<|endoftext|>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "eos_token": {"content": "<|endoftext|>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "add_prefix_space": false, "model_max_length": 1024, "special_tokens_map_file": null, "name_or_path": "multilingual", "errors": "replace", "tokenizer_class": "GPT2Tokenizer"} -------------------------------------------------------------------------------- 
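datagen in musetalk/utils/utils.py above pairs whisper feature chunks with cached VAE latents, cycling through the latents modulo their count and yielding fixed-size batches (the last one may be smaller). A small illustration with dummy inputs, assuming the repository's dependencies are installed; the array shapes are placeholders rather than the real whisper/VAE feature dimensions:

import numpy as np
import torch
from src.musetalk.utils.utils import datagen

whisper_chunks = [np.zeros((50, 384), dtype=np.float32) for _ in range(20)]   # dummy audio features
vae_encode_latents = [torch.zeros(1, 8, 32, 32) for _ in range(10)]           # dummy per-frame latents

for whisper_batch, latent_batch in datagen(whisper_chunks, vae_encode_latents, batch_size=8):
    # Latents are reused cyclically, so the audio may run longer than the frame loop;
    # batches come out as 8, 8, then the 4 left over.
    print(whisper_batch.shape, latent_batch.shape)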
/src/musetalk/whisper/whisper/normalizers/__init__.py: -------------------------------------------------------------------------------- 1 | from .basic import BasicTextNormalizer 2 | from .english import EnglishTextNormalizer 3 | -------------------------------------------------------------------------------- /src/musetalk/whisper/whisper/normalizers/basic.py: -------------------------------------------------------------------------------- 1 | import re 2 | import unicodedata 3 | 4 | import regex 5 | 6 | # non-ASCII letters that are not separated by "NFKD" normalization 7 | ADDITIONAL_DIACRITICS = { 8 | "œ": "oe", 9 | "Œ": "OE", 10 | "ø": "o", 11 | "Ø": "O", 12 | "æ": "ae", 13 | "Æ": "AE", 14 | "ß": "ss", 15 | "ẞ": "SS", 16 | "đ": "d", 17 | "Đ": "D", 18 | "ð": "d", 19 | "Ð": "D", 20 | "þ": "th", 21 | "Þ": "th", 22 | "ł": "l", 23 | "Ł": "L", 24 | } 25 | 26 | 27 | def remove_symbols_and_diacritics(s: str, keep=""): 28 | """ 29 | Replace any other markers, symbols, and punctuations with a space, 30 | and drop any diacritics (category 'Mn' and some manual mappings) 31 | """ 32 | return "".join( 33 | c 34 | if c in keep 35 | else ADDITIONAL_DIACRITICS[c] 36 | if c in ADDITIONAL_DIACRITICS 37 | else "" 38 | if unicodedata.category(c) == "Mn" 39 | else " " 40 | if unicodedata.category(c)[0] in "MSP" 41 | else c 42 | for c in unicodedata.normalize("NFKD", s) 43 | ) 44 | 45 | 46 | def remove_symbols(s: str): 47 | """ 48 | Replace any other markers, symbols, punctuations with a space, keeping diacritics 49 | """ 50 | return "".join( 51 | " " if unicodedata.category(c)[0] in "MSP" else c for c in unicodedata.normalize("NFKC", s) 52 | ) 53 | 54 | 55 | class BasicTextNormalizer: 56 | def __init__(self, remove_diacritics: bool = False, split_letters: bool = False): 57 | self.clean = remove_symbols_and_diacritics if remove_diacritics else remove_symbols 58 | self.split_letters = split_letters 59 | 60 | def __call__(self, s: str): 61 | s = s.lower() 62 | s = re.sub(r"[<\[][^>\]]*[>\]]", "", s) # remove words between brackets 63 | s = re.sub(r"\(([^)]+?)\)", "", s) # remove words between parenthesis 64 | s = self.clean(s).lower() 65 | 66 | if self.split_letters: 67 | s = " ".join(regex.findall(r"\X", s, regex.U)) 68 | 69 | s = re.sub(r"\s+", " ", s) # replace any successive whitespace characters with a space 70 | 71 | return s 72 | -------------------------------------------------------------------------------- /src/musetalk/whisper/whisper/utils.py: -------------------------------------------------------------------------------- 1 | import zlib 2 | from typing import Iterator, TextIO 3 | 4 | 5 | def exact_div(x, y): 6 | assert x % y == 0 7 | return x // y 8 | 9 | 10 | def str2bool(string): 11 | str2val = {"True": True, "False": False} 12 | if string in str2val: 13 | return str2val[string] 14 | else: 15 | raise ValueError(f"Expected one of {set(str2val.keys())}, got {string}") 16 | 17 | 18 | def optional_int(string): 19 | return None if string == "None" else int(string) 20 | 21 | 22 | def optional_float(string): 23 | return None if string == "None" else float(string) 24 | 25 | 26 | def compression_ratio(text) -> float: 27 | return len(text) / len(zlib.compress(text.encode("utf-8"))) 28 | 29 | 30 | def format_timestamp(seconds: float, always_include_hours: bool = False, decimal_marker: str = '.'): 31 | assert seconds >= 0, "non-negative timestamp expected" 32 | milliseconds = round(seconds * 1000.0) 33 | 34 | hours = milliseconds // 3_600_000 35 | milliseconds -= hours * 3_600_000 36 | 37 | minutes = 
milliseconds // 60_000 38 | milliseconds -= minutes * 60_000 39 | 40 | seconds = milliseconds // 1_000 41 | milliseconds -= seconds * 1_000 42 | 43 | hours_marker = f"{hours:02d}:" if always_include_hours or hours > 0 else "" 44 | return f"{hours_marker}{minutes:02d}:{seconds:02d}{decimal_marker}{milliseconds:03d}" 45 | 46 | 47 | def write_txt(transcript: Iterator[dict], file: TextIO): 48 | for segment in transcript: 49 | print(segment['text'].strip(), file=file, flush=True) 50 | 51 | 52 | def write_vtt(transcript: Iterator[dict], file: TextIO): 53 | print("WEBVTT\n", file=file) 54 | for segment in transcript: 55 | print( 56 | f"{format_timestamp(segment['start'])} --> {format_timestamp(segment['end'])}\n" 57 | f"{segment['text'].strip().replace('-->', '->')}\n", 58 | file=file, 59 | flush=True, 60 | ) 61 | 62 | 63 | def write_srt(transcript: Iterator[dict], file: TextIO): 64 | """ 65 | Write a transcript to a file in SRT format. 66 | 67 | Example usage: 68 | from pathlib import Path 69 | from whisper.utils import write_srt 70 | 71 | result = transcribe(model, audio_path, temperature=temperature, **args) 72 | 73 | # save SRT 74 | audio_basename = Path(audio_path).stem 75 | with open(Path(output_dir) / (audio_basename + ".srt"), "w", encoding="utf-8") as srt: 76 | write_srt(result["segments"], file=srt) 77 | """ 78 | for i, segment in enumerate(transcript, start=1): 79 | # write srt lines 80 | print( 81 | f"{i}\n" 82 | f"{format_timestamp(segment['start'], always_include_hours=True, decimal_marker=',')} --> " 83 | f"{format_timestamp(segment['end'], always_include_hours=True, decimal_marker=',')}\n" 84 | f"{segment['text'].strip().replace('-->', '->')}\n", 85 | file=file, 86 | flush=True, 87 | ) 88 | -------------------------------------------------------------------------------- /src/prompt.txt: -------------------------------------------------------------------------------- 1 | 你负责为一个语音聊天系统生成对话文本输出,确保语气情感丰富、友好,并且响应迅速以保持用户的参与感。请你遵循以下规则: 2 | 1. 回复应该简短、对话性强,并保持互动式的交流风格,每个句子保持简短且长度接近(5-10个字)。 3 | 2. 理解用户的意图并提供简洁、相关的回复,避免不必要的说明或平淡的陈述。 4 | 3. 在整个对话中保持友好和情感丰富的语气。 5 | 4. 快速回应,以免让用户等待,以“好的”、“没问题”、“明白了”等短句作为回复的开头 6 | 5. 
确保示例是多轮的,而不仅仅是一个问题一个回答。 7 | 8 | 接下来,我会给你一系列用户输入,请你遵循上述要求输出内容。 9 | 用户输入: -------------------------------------------------------------------------------- /src/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | import subprocess 4 | import cv2 5 | import time 6 | from pathlib import Path 7 | from datetime import datetime 8 | import wave 9 | from dashscope.audio.tts_v2 import * 10 | 11 | def merge_frames_with_audio(audio_path, fps = 25): 12 | video_idx = audio_path.split("/")[-1].split("_")[-1].split(".")[0] 13 | print(f"[Real-time Inference] Merging frames with audio on {video_idx}") 14 | 15 | video_path = str(Path(audio_path).parent.parent / "videos" / f"{video_idx}.ts") 16 | frame_path = str(Path(audio_path).parent.parent / "frames" / f"{video_idx}") 17 | start_time = time.time() 18 | 19 | ffmpeg_command = [ 20 | 'ffmpeg', 21 | '-framerate', str(fps), 22 | '-i', f"{frame_path}/%08d.jpg", 23 | '-i', audio_path, 24 | '-c:v', 'libx264', 25 | '-shortest', 26 | '-f', 'mpegts', 27 | '-y', 28 | video_path 29 | ] 30 | subprocess.run(ffmpeg_command, check=True) 31 | print(f"[Real-time Inference] Merging frames with audio costs {time.time()-start_time}s") 32 | return video_path 33 | 34 | def get_video_duration(video_path): 35 | cap = cv2.VideoCapture(video_path) 36 | fps = cap.get(cv2.CAP_PROP_FPS) 37 | frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) 38 | duration = frame_count / fps 39 | return round(duration, 2) 40 | 41 | def split_into_sentences(text, sentence_split_option): 42 | text = ''.join(text.splitlines()) 43 | sentence_endings = re.compile(r'[。!?.!?]') 44 | sentences = sentence_endings.split(text) 45 | sentences = [s.strip() for s in sentences if s.strip()] 46 | split_count = int(sentence_split_option) 47 | return ['。'.join(sentences[i:i+split_count]) for i in range(0, len(sentences), split_count)] 48 | 49 | def get_timestamp_str(): 50 | fmt = "%Y%m%d_%H%M%S" 51 | current_time = datetime.now() 52 | folder_name = current_time.strftime(fmt) 53 | return folder_name 54 | 55 | def merge_videos(video_folder_path, suffix = '.mp4'): 56 | output_path = os.path.join(video_folder_path, f'merged_video{suffix}') 57 | file_list_path = os.path.join(video_folder_path, 'video_list.txt') 58 | 59 | def extract_index(filename): 60 | index = filename.split('.')[0].split('_')[-1] 61 | return int(index) 62 | 63 | with open(file_list_path, 'w') as file_list: 64 | ts_files = [f for f in os.listdir(video_folder_path) if f.endswith('.ts')] 65 | ts_files.sort(key=extract_index) 66 | 67 | for filename in ts_files: 68 | file_list.write(f"file '{filename}'\n") 69 | 70 | ffmpeg_command = [ 71 | 'ffmpeg', 72 | '-f', 'concat', 73 | '-safe', '0', 74 | '-i', file_list_path, 75 | '-c', 'copy', 76 | '-c:v', 'libx264', 77 | '-c:a', 'aac', 78 | '-y', 79 | output_path 80 | ] 81 | 82 | subprocess.run(ffmpeg_command, check=True) 83 | return output_path 84 | --------------------------------------------------------------------------------
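Finally, a quick illustration (not from the repository) of split_into_sentences in src/utils.py above, which cuts text on Chinese and English sentence terminators and regroups it sentence_split_option sentences at a time for streaming TTS. It assumes the repository's requirements are installed (dashscope is imported at the top of src/utils.py) and that the snippet runs from the repository root:

from src.utils import split_into_sentences

text = "好的!我明白了。今天天气不错。我们出发吧!"
print(split_into_sentences(text, sentence_split_option="2"))
# -> ['好的。我明白了', '今天天气不错。我们出发吧']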