├── .gitignore ├── LICENSE ├── NeuralSeq ├── LICENSE ├── README.md ├── configs │ ├── config_base.yaml │ ├── singing │ │ ├── base.yaml │ │ └── fs2.yaml │ └── tts │ │ ├── base.yaml │ │ ├── base_zh.yaml │ │ ├── emotion │ │ ├── base_text2mel.yaml │ │ └── pre_align.py │ │ ├── fs2.yaml │ │ ├── hifigan.yaml │ │ ├── libritts │ │ ├── base_text2mel.yaml │ │ ├── fs2.yaml │ │ ├── pre_align.py │ │ └── pwg.yaml │ │ ├── lj │ │ ├── base_mel2wav.yaml │ │ ├── base_text2mel.yaml │ │ ├── fs2.yaml │ │ ├── hifigan.yaml │ │ └── pwg.yaml │ │ └── pwg.yaml ├── data_gen │ └── tts │ │ ├── base_binarizer.py │ │ ├── base_binarizer_emotion.py │ │ ├── base_preprocess.py │ │ ├── binarizer_zh.py │ │ ├── data_gen_utils.py │ │ ├── emotion │ │ ├── audio.py │ │ ├── inference.py │ │ ├── model.py │ │ ├── params_data.py │ │ ├── params_model.py │ │ └── test_emotion.py │ │ ├── txt_processors │ │ ├── __init__.py │ │ ├── base_text_processor.py │ │ ├── en.py │ │ ├── zh.py │ │ └── zh_g2pM.py │ │ └── wav_processors │ │ ├── __init__.py │ │ ├── base_processor.py │ │ └── common_processors.py ├── egs │ ├── datasets │ │ └── audio │ │ │ ├── emotion │ │ │ ├── base_text2mel.yaml │ │ │ └── pre_align.py │ │ │ ├── libritts │ │ │ ├── base_text2mel.yaml │ │ │ ├── fs2.yaml │ │ │ ├── pre_align.py │ │ │ └── pwg.yaml │ │ │ ├── lj │ │ │ ├── base_mel2wav.yaml │ │ │ ├── preprocess.py │ │ │ └── pwg.yaml │ │ │ └── vctk │ │ │ ├── base_mel2wav.yaml │ │ │ ├── fs2.yaml │ │ │ ├── pre_align.py │ │ │ └── pwg.yaml │ └── egs_bases │ │ ├── config_base.yaml │ │ ├── svs │ │ ├── base.yaml │ │ ├── lj_ds_beta6.yaml │ │ ├── midi │ │ │ ├── cascade │ │ │ │ └── opencs │ │ │ │ │ ├── aux_rel.yaml │ │ │ │ │ ├── ds60_rel.yaml │ │ │ │ │ └── opencpop_statis.yaml │ │ │ ├── e2e │ │ │ │ ├── opencpop │ │ │ │ │ ├── ds1000-10dil.yaml │ │ │ │ │ ├── ds1000.yaml │ │ │ │ │ └── ds100_adj_rel.yaml │ │ │ │ └── popcs │ │ │ │ │ └── ds100_adj_rel.yaml │ │ │ └── pe.yaml │ │ ├── popcs_ds_beta6.yaml │ │ ├── popcs_ds_beta6_offline.yaml │ │ └── popcs_fs2.yaml │ │ └── tts │ │ ├── base.yaml │ │ ├── base_zh.yaml │ │ ├── fs2.yaml │ │ ├── fs2_adv.yaml │ │ ├── ps.yaml │ │ ├── ps_flow.yaml │ │ ├── ps_flow_small.yaml │ │ └── vocoder │ │ ├── base.yaml │ │ ├── hifigan.yaml │ │ └── pwg.yaml ├── gitattributes ├── inference │ ├── svs │ │ ├── base_svs_infer.py │ │ ├── ds_cascade.py │ │ ├── ds_e2e.py │ │ └── opencpop │ │ │ ├── cpop_pinyin2ph.txt │ │ │ └── map.py │ └── tts │ │ ├── GenerSpeech.py │ │ ├── PortaSpeech.py │ │ └── base_tts_infer.py ├── modules │ ├── GenerSpeech │ │ ├── config │ │ │ └── generspeech.yaml │ │ ├── model │ │ │ ├── generspeech.py │ │ │ ├── glow_modules.py │ │ │ ├── mixstyle.py │ │ │ ├── prosody_util.py │ │ │ └── wavenet.py │ │ └── task │ │ │ ├── dataset.py │ │ │ └── generspeech.py │ ├── __init__.py │ ├── commons │ │ ├── align_ops.py │ │ ├── common_layers.py │ │ ├── conv.py │ │ ├── espnet_positional_embedding.py │ │ ├── normalizing_flow │ │ │ ├── glow_modules.py │ │ │ ├── res_flow.py │ │ │ └── utils.py │ │ ├── rel_transformer.py │ │ ├── ssim.py │ │ ├── transformer.py │ │ └── wavenet.py │ ├── diff │ │ ├── candidate_decoder.py │ │ ├── diffusion.py │ │ ├── net.py │ │ └── shallow_diffusion_tts.py │ ├── diffsinger_midi │ │ └── fs2.py │ ├── fastspeech │ │ ├── fs2.py │ │ ├── pe.py │ │ └── tts_modules.py │ ├── hifigan │ │ ├── hifigan.py │ │ └── mel_utils.py │ ├── parallel_wavegan │ │ ├── __init__.py │ │ ├── layers │ │ │ ├── __init__.py │ │ │ ├── causal_conv.py │ │ │ ├── pqmf.py │ │ │ ├── residual_block.py │ │ │ ├── residual_stack.py │ │ │ ├── tf_layers.py │ │ │ └── upsample.py │ │ ├── losses │ │ │ ├── 
__init__.py │ │ │ └── stft_loss.py │ │ ├── models │ │ │ ├── __init__.py │ │ │ ├── melgan.py │ │ │ ├── parallel_wavegan.py │ │ │ └── source.py │ │ ├── optimizers │ │ │ ├── __init__.py │ │ │ └── radam.py │ │ ├── stft_loss.py │ │ └── utils │ │ │ ├── __init__.py │ │ │ └── utils.py │ └── syntaspeech │ │ ├── multi_window_disc.py │ │ ├── syntactic_graph_buider.py │ │ ├── syntactic_graph_encoder.py │ │ └── syntaspeech.py ├── tasks │ ├── base_task.py │ ├── run.py │ ├── svs │ │ ├── __init__.py │ │ ├── diffsinger_task.py │ │ ├── diffspeech_task.py │ │ └── task.py │ ├── tts │ │ ├── dataset_utils.py │ │ ├── fs2.py │ │ ├── fs2_adv.py │ │ ├── fs2_utils.py │ │ ├── pe.py │ │ ├── ps.py │ │ ├── ps_adv.py │ │ ├── ps_flow.py │ │ ├── synta.py │ │ ├── tts.py │ │ ├── tts_base.py │ │ └── tts_utils.py │ └── vocoder │ │ ├── dataset_utils.py │ │ └── vocoder_base.py ├── utils │ ├── __init__.py │ ├── audio.py │ ├── ckpt_utils.py │ ├── cwt.py │ ├── dtw.py │ ├── hparams.py │ ├── indexed_datasets.py │ ├── multiprocess_utils.py │ ├── os_utils.py │ ├── pitch_utils.py │ ├── pl_utils.py │ ├── plot.py │ ├── text_encoder.py │ ├── text_norm.py │ ├── training_utils.py │ └── tts_utils.py └── vocoders │ ├── __init__.py │ ├── base_vocoder.py │ ├── hifigan.py │ ├── pwg.py │ └── vocoder_utils.py ├── README.md ├── assets ├── 2bf90e35.wav ├── 5d67d1b9.wav ├── 7cb0d24f.wav ├── 7ef0ec0b.wav ├── README.md ├── Track 4.wav ├── a-group-of-sheep-are-baaing.wav ├── a2i.png ├── asr.png ├── b973e878.wav ├── detection.png ├── drums-and-music-playing-with-a-man-speaking.wav ├── fd5cf55e.wav ├── i2a-1.png ├── i2a-2.png ├── inpaint-1.png ├── inpaint-2.png ├── m2b.png ├── mix1.wav ├── sound_extraction.png ├── style_transfer_tts.png ├── t2a.png ├── t2i.png ├── t2s.png ├── tsd.png └── tts.png ├── audio-chatgpt.py ├── audio_detection ├── __init__.py ├── audio_infer │ ├── __init__.py │ ├── metadata │ │ ├── black_list │ │ │ ├── groundtruth_weak_label_evaluation_set.csv │ │ │ └── groundtruth_weak_label_testing_set.csv │ │ └── class_labels_indices.csv │ ├── pytorch │ │ ├── evaluate.py │ │ ├── finetune_template.py │ │ ├── inference.py │ │ ├── losses.py │ │ ├── main.py │ │ ├── models.py │ │ └── pytorch_utils.py │ ├── results │ │ └── YDlWd7Wmdi1E.png │ └── utils │ │ ├── config.py │ │ ├── crash.py │ │ ├── create_black_list.py │ │ ├── create_indexes.py │ │ ├── data_generator.py │ │ ├── dataset.py │ │ ├── plot_for_paper.py │ │ ├── plot_statistics.py │ │ └── utilities.py └── target_sound_detection │ └── src │ ├── models.py │ └── utils.py ├── audio_to_text ├── __init__.py ├── captioning │ ├── __init__.py │ ├── models │ │ ├── __init__.py │ │ ├── base_model.py │ │ ├── decoder.py │ │ ├── encoder.py │ │ ├── transformer_model.py │ │ └── utils.py │ └── utils │ │ ├── README.md │ │ ├── __init__.py │ │ ├── bert │ │ ├── create_sent_embedding.py │ │ └── create_word_embedding.py │ │ ├── build_vocab.py │ │ ├── build_vocab_ltp.py │ │ ├── build_vocab_spacy.py │ │ ├── eval_round_robin.py │ │ ├── fasttext │ │ └── create_word_embedding.py │ │ ├── lr_scheduler.py │ │ ├── model_eval_diff.py │ │ ├── predict_nn.py │ │ ├── remove_optimizer.py │ │ ├── report_results.py │ │ ├── tokenize_caption.py │ │ ├── train_util.py │ │ └── word2vec │ │ └── create_word_embedding.py └── inference_waveform.py ├── download.sh ├── mono2binaural └── src │ ├── models.py │ ├── utils.py │ └── warping.py ├── requirements.txt ├── run.md ├── sound_extraction ├── model │ ├── LASSNet.py │ ├── film.py │ ├── modules.py │ ├── resunet_film.py │ └── text_encoder.py └── utils │ ├── create_mixtures.py │ ├── stft.py │ └── 
wav_io.py └── text_to_audio └── Make_An_Audio ├── configs ├── img_to_audio │ └── img2audio_args.yaml ├── inpaint │ └── txt2audio_args.yaml └── text_to_audio │ ├── clap_args.yaml │ ├── hifigan_args.yaml │ └── txt2audio_args.yaml ├── ldm ├── data │ └── extract_mel_spectrogram.py ├── lr_scheduler.py ├── models │ ├── autoencoder.py │ ├── autoencoder_multi.py │ └── diffusion │ │ ├── __init__.py │ │ ├── classifier.py │ │ ├── ddim.py │ │ ├── ddpm.py │ │ ├── ddpm_audio.py │ │ ├── ddpm_audio_inpaint.py │ │ └── plms.py ├── modules │ ├── attention.py │ ├── diffusionmodules │ │ ├── __init__.py │ │ ├── custom_openaimodel.py │ │ ├── model.py │ │ ├── openaimodel.py │ │ └── util.py │ ├── discriminator │ │ ├── model.py │ │ └── multi_window_disc.py │ ├── distributions │ │ ├── __init__.py │ │ └── distributions.py │ ├── ema.py │ ├── encoders │ │ ├── CLAP │ │ │ ├── CLAPWrapper.py │ │ │ ├── __init__.py │ │ │ ├── audio.py │ │ │ ├── clap.py │ │ │ ├── config.yml │ │ │ └── utils.py │ │ ├── __init__.py │ │ ├── modules.py │ │ └── open_clap │ │ │ ├── __init__.py │ │ │ ├── bert.py │ │ │ ├── bpe_simple_vocab_16e6.txt.gz │ │ │ ├── factory.py │ │ │ ├── feature_fusion.py │ │ │ ├── htsat.py │ │ │ ├── linear_probe.py │ │ │ ├── loss.py │ │ │ ├── model.py │ │ │ ├── model_configs │ │ │ ├── HTSAT-base.json │ │ │ ├── HTSAT-large.json │ │ │ ├── HTSAT-tiny-win-1536.json │ │ │ ├── HTSAT-tiny.json │ │ │ ├── PANN-10.json │ │ │ ├── PANN-14-fmax-18k.json │ │ │ ├── PANN-14-fmax-8k-20s.json │ │ │ ├── PANN-14-tiny-transformer.json │ │ │ ├── PANN-14-win-1536.json │ │ │ ├── PANN-14.json │ │ │ ├── PANN-6.json │ │ │ ├── RN101-quickgelu.json │ │ │ ├── RN101.json │ │ │ ├── RN50-quickgelu.json │ │ │ ├── RN50.json │ │ │ ├── RN50x16.json │ │ │ ├── RN50x4.json │ │ │ ├── ViT-B-16.json │ │ │ ├── ViT-B-32-quickgelu.json │ │ │ ├── ViT-B-32.json │ │ │ └── ViT-L-14.json │ │ │ ├── openai.py │ │ │ ├── pann_model.py │ │ │ ├── pretrained.py │ │ │ ├── timm_model.py │ │ │ ├── tokenizer.py │ │ │ ├── transform.py │ │ │ ├── utils.py │ │ │ └── version.py │ ├── image_degradation │ │ ├── __init__.py │ │ ├── bsrgan.py │ │ ├── bsrgan_light.py │ │ ├── utils │ │ │ └── test.png │ │ └── utils_image.py │ ├── losses_audio │ │ ├── __init__.py │ │ ├── contperceptual.py │ │ ├── contperceptual_dis.py │ │ ├── lpaps.py │ │ ├── vggishish │ │ │ ├── config │ │ │ │ ├── melception.yaml │ │ │ │ └── vggish.yaml │ │ │ ├── data │ │ │ │ ├── train_means_stds_melspec_10s_22050hz.txt │ │ │ │ ├── vggsound.csv │ │ │ │ ├── vggsound_test.txt │ │ │ │ ├── vggsound_train.txt │ │ │ │ └── vggsound_valid.txt │ │ │ ├── dataset.py │ │ │ ├── logger.py │ │ │ ├── loss.py │ │ │ ├── metrics.py │ │ │ ├── model.py │ │ │ ├── predict.py │ │ │ ├── train_melception.py │ │ │ ├── train_vggishish.py │ │ │ └── transforms.py │ │ └── vqperceptual.py │ └── x_transformer.py └── util.py ├── useful_ckpts └── CLAP │ └── config.yml ├── vocoder ├── bigvgan │ ├── __init__.py │ ├── activations.py │ ├── alias_free_torch │ │ ├── __init__.py │ │ ├── act.py │ │ ├── filter.py │ │ └── resample.py │ └── models.py ├── hifigan │ └── modules.py └── logs │ └── hifi_0127 │ └── args.yml └── wav_evaluation └── models ├── CLAPWrapper.py ├── __init__.py ├── audio.py ├── clap.py └── utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | # JetBrains PyCharm IDE 2 | .idea/ 3 | .github/ 4 | .circleci/ 5 | 6 | # Byte-compiled / optimized / DLL files 7 | *__pycache__/ 8 | __pycache__/ 9 | *.py[cod] 10 | *$py.class 11 | 12 | # C extensions 13 | *.so 14 | 15 | # macOS dir files 16 | .DS_Store 17 | 18 | 
# Distribution / packaging 19 | .Python 20 | env/ 21 | build/ 22 | develop-eggs/ 23 | dist/ 24 | downloads/ 25 | eggs/ 26 | .eggs/ 27 | lib/ 28 | lib64/ 29 | parts/ 30 | sdist/ 31 | var/ 32 | wheels/ 33 | *.egg-info/ 34 | .installed.cfg 35 | *.egg 36 | 37 | # Checkpoints 38 | checkpoints 39 | 40 | # PyInstaller 41 | # Usually these files are written by a python script from a template 42 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 43 | *.manifest 44 | *.spec 45 | 46 | # Installer logs 47 | pip-log.txt 48 | pip-delete-this-directory.txt 49 | 50 | # Unit test / coverage reports 51 | htmlcov/ 52 | .tox/ 53 | .coverage 54 | .coverage.* 55 | .cache 56 | nosetests.xml 57 | coverage.xml 58 | *.cover 59 | .hypothesis/ 60 | 61 | # Translations 62 | *.mo 63 | *.pot 64 | 65 | # Django stuff: 66 | *.log 67 | local_settings.py 68 | 69 | # Flask stuff: 70 | instance/ 71 | .webassets-cache 72 | 73 | # Scrapy stuff: 74 | .scrapy 75 | 76 | # Sphinx documentation 77 | docs/_build/ 78 | 79 | # PyBuilder 80 | target/ 81 | 82 | # Jupyter Notebook 83 | .ipynb_checkpoints 84 | 85 | # pyenv 86 | .python-version 87 | 88 | # celery beat schedule file 89 | celerybeat-schedule 90 | 91 | # SageMath parsed files 92 | *.sage.py 93 | 94 | # dotenv 95 | .env 96 | 97 | # virtualenv 98 | .venv 99 | venv/ 100 | ENV/ 101 | 102 | # Spyder project settings 103 | .spyderproject 104 | .spyproject 105 | 106 | # Rope project settings 107 | .ropeproject 108 | 109 | # mkdocs documentation 110 | /site 111 | 112 | # mypy 113 | .mypy_cache/ 114 | 115 | # Generated files 116 | /fairseq/temporal_convolution_tbc 117 | /fairseq/modules/*_layer/*_forward.cu 118 | /fairseq/modules/*_layer/*_backward.cu 119 | /fairseq/version.py 120 | 121 | # data 122 | data-bin/ 123 | 124 | # reranking 125 | /examples/reranking/rerank_data 126 | 127 | # Cython-generated C++ source files 128 | /fairseq/data/data_utils_fast.cpp 129 | /fairseq/data/token_block_utils_fast.cpp 130 | 131 | # VSCODE 132 | .vscode/ftp-sync.json 133 | .vscode/settings.json 134 | 135 | # Experimental Folder 136 | experimental/* 137 | 138 | # Weights and Biases logs 139 | wandb/ 140 | 141 | # Hydra artifacts 142 | nohup.out 143 | multirun 144 | outputs 145 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIGC-Audio/AudioGPT/a674543c537bb3a60b8f521df76b97a627c1c379/LICENSE -------------------------------------------------------------------------------- /NeuralSeq/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Jinglin Liu 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /NeuralSeq/README.md: -------------------------------------------------------------------------------- 1 | In this directory, we support FastSpeech, GenerSpeech, SyntaSpeech, DiffSinger -------------------------------------------------------------------------------- /NeuralSeq/configs/config_base.yaml: -------------------------------------------------------------------------------- 1 | # task 2 | binary_data_dir: '' 3 | work_dir: '' # experiment directory. 4 | infer: false # infer 5 | seed: 1234 6 | debug: false 7 | save_codes: 8 | - configs 9 | - modules 10 | - tasks 11 | - utils 12 | - usr 13 | 14 | ############# 15 | # dataset 16 | ############# 17 | ds_workers: 1 18 | test_num: 100 19 | valid_num: 100 20 | endless_ds: false 21 | sort_by_len: true 22 | 23 | ######### 24 | # train and eval 25 | ######### 26 | load_ckpt: '' 27 | save_ckpt: true 28 | save_best: false 29 | num_ckpt_keep: 3 30 | clip_grad_norm: 0 31 | accumulate_grad_batches: 1 32 | log_interval: 100 33 | num_sanity_val_steps: 5 # steps of validation at the beginning 34 | check_val_every_n_epoch: 10 35 | val_check_interval: 2000 36 | max_epochs: 1000 37 | max_updates: 160000 38 | max_tokens: 31250 39 | max_sentences: 100000 40 | max_eval_tokens: -1 41 | max_eval_sentences: -1 42 | test_input_dir: '' 43 | -------------------------------------------------------------------------------- /NeuralSeq/configs/singing/base.yaml: -------------------------------------------------------------------------------- 1 | base_config: 2 | - configs/tts/base.yaml 3 | - configs/tts/base_zh.yaml 4 | 5 | 6 | datasets: [] 7 | test_prefixes: [] 8 | test_num: 0 9 | valid_num: 0 10 | 11 | pre_align_cls: data_gen.singing.pre_align.SingingPreAlign 12 | binarizer_cls: data_gen.singing.binarize.SingingBinarizer 13 | pre_align_args: 14 | use_tone: false # for ZH 15 | forced_align: mfa 16 | use_sox: true 17 | hop_size: 128 # Hop size. 18 | fft_size: 512 # FFT size. 19 | win_size: 512 # FFT size. 20 | max_frames: 8000 21 | fmin: 50 # Minimum freq in mel basis calculation. 22 | fmax: 11025 # Maximum frequency in mel basis calculation. 
23 | pitch_type: frame 24 | 25 | hidden_size: 256 26 | mel_loss: "ssim:0.5|l1:0.5" 27 | lambda_f0: 0.0 28 | lambda_uv: 0.0 29 | lambda_energy: 0.0 30 | lambda_ph_dur: 0.0 31 | lambda_sent_dur: 0.0 32 | lambda_word_dur: 0.0 33 | predictor_grad: 0.0 34 | use_spk_embed: true 35 | use_spk_id: false 36 | 37 | max_tokens: 20000 38 | max_updates: 400000 39 | num_spk: 100 40 | save_f0: true 41 | use_gt_dur: true 42 | use_gt_f0: true 43 | -------------------------------------------------------------------------------- /NeuralSeq/configs/singing/fs2.yaml: -------------------------------------------------------------------------------- 1 | base_config: 2 | - configs/tts/fs2.yaml 3 | - configs/singing/base.yaml 4 | -------------------------------------------------------------------------------- /NeuralSeq/configs/tts/base.yaml: -------------------------------------------------------------------------------- 1 | # task 2 | base_config: configs/config_base.yaml 3 | task_cls: '' 4 | ############# 5 | # dataset 6 | ############# 7 | raw_data_dir: '' 8 | processed_data_dir: '' 9 | binary_data_dir: '' 10 | dict_dir: '' 11 | pre_align_cls: '' 12 | binarizer_cls: data_gen.tts.base_binarizer.BaseBinarizer 13 | pre_align_args: 14 | use_tone: true # for ZH 15 | forced_align: mfa 16 | use_sox: false 17 | txt_processor: en 18 | allow_no_txt: false 19 | denoise: false 20 | binarization_args: 21 | shuffle: false 22 | with_txt: true 23 | with_wav: false 24 | with_align: true 25 | with_spk_embed: true 26 | with_f0: true 27 | with_f0cwt: true 28 | 29 | loud_norm: false 30 | endless_ds: true 31 | reset_phone_dict: true 32 | 33 | test_num: 100 34 | valid_num: 100 35 | max_frames: 1550 36 | max_input_tokens: 1550 37 | audio_num_mel_bins: 80 38 | audio_sample_rate: 22050 39 | hop_size: 256 # For 22050Hz, 275 ~= 12.5 ms (0.0125 * sample_rate) 40 | win_size: 1024 # For 22050Hz, 1100 ~= 50 ms (If None, win_size: fft_size) (0.05 * sample_rate) 41 | fmin: 80 # Set this to 55 if your speaker is male! if female, 95 should help taking off noise. (To test depending on dataset. Pitch info: male~[65, 260], female~[100, 525]) 42 | fmax: 7600 # To be increased/reduced depending on data. 
43 | fft_size: 1024 # Extra window size is filled with 0 paddings to match this parameter 44 | min_level_db: -100 45 | num_spk: 1 46 | mel_vmin: -6 47 | mel_vmax: 1.5 48 | ds_workers: 4 49 | 50 | ######### 51 | # model 52 | ######### 53 | dropout: 0.1 54 | enc_layers: 4 55 | dec_layers: 4 56 | hidden_size: 384 57 | num_heads: 2 58 | prenet_dropout: 0.5 59 | prenet_hidden_size: 256 60 | stop_token_weight: 5.0 61 | enc_ffn_kernel_size: 9 62 | dec_ffn_kernel_size: 9 63 | ffn_act: gelu 64 | ffn_padding: 'SAME' 65 | 66 | 67 | ########### 68 | # optimization 69 | ########### 70 | lr: 2.0 71 | warmup_updates: 8000 72 | optimizer_adam_beta1: 0.9 73 | optimizer_adam_beta2: 0.98 74 | weight_decay: 0 75 | clip_grad_norm: 1 76 | 77 | 78 | ########### 79 | # train and eval 80 | ########### 81 | max_tokens: 30000 82 | max_sentences: 100000 83 | max_eval_sentences: 1 84 | max_eval_tokens: 60000 85 | train_set_name: 'train' 86 | valid_set_name: 'valid' 87 | test_set_name: 'test' 88 | vocoder: pwg 89 | vocoder_ckpt: '' 90 | profile_infer: false 91 | out_wav_norm: false 92 | save_gt: false 93 | save_f0: false 94 | gen_dir_name: '' 95 | use_denoise: false 96 | -------------------------------------------------------------------------------- /NeuralSeq/configs/tts/base_zh.yaml: -------------------------------------------------------------------------------- 1 | pre_align_args: 2 | txt_processor: zh_g2pM 3 | binarizer_cls: data_gen.tts.binarizer_zh.ZhBinarizer -------------------------------------------------------------------------------- /NeuralSeq/configs/tts/emotion/base_text2mel.yaml: -------------------------------------------------------------------------------- 1 | raw_data_dir: 'data/raw/ESD' 2 | processed_data_dir: 'data/processed/emotion' 3 | binary_data_dir: 'data/binary/emotion' 4 | pre_align_cls: configs.tts.emotion.pre_align.EmoPreAlign 5 | audio_sample_rate: 16000 6 | binarization_args: 7 | shuffle: true 8 | binarizer_cls: data_gen.tts.base_binarizer_emotion.EmotionBinarizer 9 | use_spk_id: true 10 | test_num: 200 11 | num_spk: 10 12 | pitch_type: frame 13 | min_frames: 128 14 | num_test_samples: 30 15 | mel_loss: "ssim:0.5|l1:0.5" 16 | vocoder_ckpt: '' 17 | use_emotion: true -------------------------------------------------------------------------------- /NeuralSeq/configs/tts/emotion/pre_align.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from data_gen.tts.base_preprocess import BasePreprocessor 4 | import glob 5 | import re 6 | 7 | class EmoPreAlign(BasePreprocessor): 8 | 9 | def meta_data(self): 10 | spks = ['0012', '0011', '0013', '0014', '0015', '0016', '0017', '0018', '0019', '0020'] 11 | pattern = re.compile('[\t\n ]+') 12 | for spk in spks: 13 | for line in open(f"{self.raw_data_dir}/{spk}/{spk}.txt", 'r'): # 打开文件 14 | line = re.sub(pattern, ' ', line) 15 | if line == ' ': continue 16 | split_ = line.split(' ') 17 | txt = ' '.join(split_[1: -2]) 18 | item_name = split_[0] 19 | emotion = split_[-2] 20 | wav_fn = f'{self.raw_data_dir}/{spk}/{emotion}/{item_name}.wav' 21 | yield item_name, wav_fn, txt, spk, emotion 22 | 23 | 24 | if __name__ == "__main__": 25 | EmoPreAlign().process() 26 | -------------------------------------------------------------------------------- /NeuralSeq/configs/tts/fs2.yaml: -------------------------------------------------------------------------------- 1 | base_config: configs/tts/base.yaml 2 | task_cls: tasks.tts.fs2.FastSpeech2Task 3 | 4 | # model 5 | hidden_size: 256 6 | dropout: 0.1 7 | 
encoder_type: fft # fft|tacotron|tacotron2|conformer 8 | encoder_K: 8 # for tacotron encoder 9 | decoder_type: fft # fft|rnn|conv|conformer 10 | use_pos_embed: true 11 | 12 | # duration 13 | predictor_hidden: -1 14 | predictor_kernel: 5 15 | predictor_layers: 2 16 | dur_predictor_kernel: 3 17 | dur_predictor_layers: 2 18 | predictor_dropout: 0.5 19 | 20 | # pitch and energy 21 | use_pitch_embed: true 22 | pitch_type: ph # frame|ph|cwt 23 | use_uv: true 24 | cwt_hidden_size: 128 25 | cwt_layers: 2 26 | cwt_loss: l1 27 | cwt_add_f0_loss: false 28 | cwt_std_scale: 0.8 29 | 30 | pitch_ar: false 31 | #pitch_embed_type: 0q 32 | pitch_loss: 'l1' # l1|l2|ssim 33 | pitch_norm: log 34 | use_energy_embed: false 35 | 36 | # reference encoder and speaker embedding 37 | use_spk_id: false 38 | use_split_spk_id: false 39 | use_spk_embed: false 40 | use_var_enc: false 41 | lambda_commit: 0.25 42 | ref_norm_layer: bn 43 | pitch_enc_hidden_stride_kernel: 44 | - 0,2,5 # conv_hidden_size, conv_stride, conv_kernel_size. conv_hidden_size=0: use hidden_size 45 | - 0,2,5 46 | - 0,2,5 47 | dur_enc_hidden_stride_kernel: 48 | - 0,2,3 # conv_hidden_size, conv_stride, conv_kernel_size. conv_hidden_size=0: use hidden_size 49 | - 0,2,3 50 | - 0,1,3 51 | 52 | 53 | # mel 54 | mel_loss: l1:0.5|ssim:0.5 # l1|l2|gdl|ssim or l1:0.5|ssim:0.5 55 | 56 | # loss lambda 57 | lambda_f0: 1.0 58 | lambda_uv: 1.0 59 | lambda_energy: 0.1 60 | lambda_ph_dur: 1.0 61 | lambda_sent_dur: 1.0 62 | lambda_word_dur: 1.0 63 | predictor_grad: 0.1 64 | 65 | # train and eval 66 | pretrain_fs_ckpt: '' 67 | warmup_updates: 2000 68 | max_tokens: 32000 69 | max_sentences: 100000 70 | max_eval_sentences: 1 71 | max_updates: 120000 72 | num_valid_plots: 5 73 | num_test_samples: 0 74 | test_ids: [] 75 | use_gt_dur: false 76 | use_gt_f0: false 77 | 78 | # exp 79 | dur_loss: mse # huber|mol 80 | norm_type: gn -------------------------------------------------------------------------------- /NeuralSeq/configs/tts/hifigan.yaml: -------------------------------------------------------------------------------- 1 | base_config: configs/tts/pwg.yaml 2 | task_cls: tasks.vocoder.hifigan.HifiGanTask 3 | resblock: "1" 4 | adam_b1: 0.8 5 | adam_b2: 0.99 6 | upsample_rates: [ 8,8,2,2 ] 7 | upsample_kernel_sizes: [ 16,16,4,4 ] 8 | upsample_initial_channel: 128 9 | resblock_kernel_sizes: [ 3,7,11 ] 10 | resblock_dilation_sizes: [ [ 1,3,5 ], [ 1,3,5 ], [ 1,3,5 ] ] 11 | 12 | lambda_mel: 45.0 13 | 14 | max_samples: 8192 15 | max_sentences: 16 16 | 17 | generator_params: 18 | lr: 0.0002 # Generator's learning rate. 19 | aux_context_window: 0 # Context window size for auxiliary feature. 20 | discriminator_optimizer_params: 21 | lr: 0.0002 # Discriminator's learning rate. 
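[Note] The YAML files above are layered through their base_config entries (for example configs/tts/fs2.yaml builds on configs/tts/base.yaml, which in turn builds on configs/config_base.yaml). As a rough illustration of how such a chain can be flattened into one hyperparameter dict — this is a sketch only, not the repository's actual loader, which lives in NeuralSeq/utils/hparams.py and also handles CLI overrides — a recursive merge could look like this:

# Illustrative sketch: resolve a `base_config` chain into a flat dict.
# Assumption: later parents and the child file override earlier values,
# and paths starting with './' are relative to the current YAML file.
import os
import yaml  # PyYAML


def _deep_merge(base, new):
    # Values from `new` override `base`; nested dicts are merged recursively.
    for k, v in new.items():
        if isinstance(v, dict) and isinstance(base.get(k), dict):
            _deep_merge(base[k], v)
        else:
            base[k] = v
    return base


def load_config(path):
    with open(path) as f:
        cfg = yaml.safe_load(f) or {}
    parents = cfg.pop('base_config', [])
    if isinstance(parents, str):
        parents = [parents]
    merged = {}
    for parent in parents:
        parent_path = os.path.join(os.path.dirname(path), parent) if parent.startswith('.') else parent
        _deep_merge(merged, load_config(parent_path))
    return _deep_merge(merged, cfg)  # the child file wins over its parents

# e.g. load_config('configs/tts/lj/fs2.yaml')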
-------------------------------------------------------------------------------- /NeuralSeq/configs/tts/libritts/base_text2mel.yaml: -------------------------------------------------------------------------------- 1 | raw_data_dir: 'data/raw/LibriTTS' 2 | processed_data_dir: 'data/processed/libritts' 3 | binary_data_dir: 'data/binary/libritts' 4 | pre_align_cls: configs.tts.libritts.pre_align.LibrittsPreAlign 5 | binarization_args: 6 | shuffle: true 7 | use_spk_id: true 8 | test_num: 200 9 | num_spk: 2320 10 | pitch_type: frame 11 | min_frames: 128 12 | num_test_samples: 30 13 | mel_loss: "ssim:0.5|l1:0.5" 14 | vocoder_ckpt: '' -------------------------------------------------------------------------------- /NeuralSeq/configs/tts/libritts/fs2.yaml: -------------------------------------------------------------------------------- 1 | base_config: 2 | - configs/tts/fs2.yaml 3 | - ./base_text2mel.yaml 4 | -------------------------------------------------------------------------------- /NeuralSeq/configs/tts/libritts/pre_align.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from data_gen.tts.base_preprocess import BasePreprocessor 4 | import glob 5 | 6 | 7 | class LibrittsPreAlign(BasePreprocessor): 8 | def meta_data(self): 9 | wav_fns = sorted(glob.glob(f'{self.raw_data_dir}/*/*/*.wav')) 10 | for wav_fn in wav_fns: 11 | item_name = os.path.basename(wav_fn)[:-4] 12 | txt_fn = f'{wav_fn[:-4]}.normalized.txt' 13 | with open(txt_fn, 'r') as f: 14 | txt = f.readlines() 15 | f.close() 16 | spk = item_name.split("_")[0] 17 | # Example: 18 | # 19 | # 'item_name': '103_1241_000000_000001' 20 | # 'wav_fn': 'LibriTTS/train-clean-100/103/1241/103_1241_000000_000001.wav' 21 | # 'txt': 'matthew Cuthbert is surprised' 22 | # 'spk_name': '103' 23 | yield {'item_name': item_name, 'wav_fn': wav_fn, 'txt': txt[0], 'spk_name': spk} 24 | 25 | 26 | if __name__ == "__main__": 27 | LibrittsPreAlign().process() 28 | -------------------------------------------------------------------------------- /NeuralSeq/configs/tts/libritts/pwg.yaml: -------------------------------------------------------------------------------- 1 | base_config: egs/egs_bases/tts/vocoder/pwg.yaml 2 | raw_data_dir: 'data/raw/LibriTTS' 3 | processed_data_dir: 'data/processed/libritts' 4 | binary_data_dir: 'data/binary/libritts_wav' 5 | generator_params: 6 | kernel_size: 5 7 | num_spk: 400 8 | max_samples: 20480 9 | -------------------------------------------------------------------------------- /NeuralSeq/configs/tts/lj/base_mel2wav.yaml: -------------------------------------------------------------------------------- 1 | raw_data_dir: 'data/raw/LJSpeech-1.1' 2 | processed_data_dir: 'data/processed/ljspeech' 3 | binary_data_dir: 'data/binary/ljspeech_wav' 4 | -------------------------------------------------------------------------------- /NeuralSeq/configs/tts/lj/base_text2mel.yaml: -------------------------------------------------------------------------------- 1 | raw_data_dir: 'data/raw/LJSpeech-1.1' 2 | processed_data_dir: 'data/processed/ljspeech' 3 | binary_data_dir: 'data/binary/ljspeech' 4 | pre_align_cls: data_gen.tts.lj.pre_align.LJPreAlign 5 | 6 | pitch_type: cwt 7 | mel_loss: l1 8 | num_test_samples: 20 9 | test_ids: [ 68, 70, 74, 87, 110, 172, 190, 215, 231, 294, 10 | 316, 324, 402, 422, 485, 500, 505, 508, 509, 519 ] 11 | use_energy_embed: false 12 | test_num: 523 13 | valid_num: 348 -------------------------------------------------------------------------------- 
/NeuralSeq/configs/tts/lj/fs2.yaml: -------------------------------------------------------------------------------- 1 | base_config: 2 | - configs/tts/fs2.yaml 3 | - configs/tts/lj/base_text2mel.yaml -------------------------------------------------------------------------------- /NeuralSeq/configs/tts/lj/hifigan.yaml: -------------------------------------------------------------------------------- 1 | base_config: 2 | - configs/tts/hifigan.yaml 3 | - configs/tts/lj/base_mel2wav.yaml -------------------------------------------------------------------------------- /NeuralSeq/configs/tts/lj/pwg.yaml: -------------------------------------------------------------------------------- 1 | base_config: 2 | - configs/tts/pwg.yaml 3 | - configs/tts/lj/base_mel2wav.yaml -------------------------------------------------------------------------------- /NeuralSeq/data_gen/tts/binarizer_zh.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | os.environ["OMP_NUM_THREADS"] = "1" 4 | 5 | from data_gen.tts.txt_processors.zh_g2pM import ALL_SHENMU 6 | from data_gen.tts.base_binarizer import BaseBinarizer, BinarizationError 7 | from data_gen.tts.data_gen_utils import get_mel2ph 8 | from utils.hparams import set_hparams, hparams 9 | import numpy as np 10 | 11 | 12 | class ZhBinarizer(BaseBinarizer): 13 | @staticmethod 14 | def get_align(tg_fn, ph, mel, phone_encoded, res): 15 | if tg_fn is not None and os.path.exists(tg_fn): 16 | _, dur = get_mel2ph(tg_fn, ph, mel, hparams) 17 | else: 18 | raise BinarizationError(f"Align not found") 19 | ph_list = ph.split(" ") 20 | assert len(dur) == len(ph_list) 21 | mel2ph = [] 22 | # 分隔符的时长分配给韵母 23 | dur_cumsum = np.pad(np.cumsum(dur), [1, 0], mode='constant', constant_values=0) 24 | for i in range(len(dur)): 25 | p = ph_list[i] 26 | if p[0] != '<' and not p[0].isalpha(): 27 | uv_ = res['f0'][dur_cumsum[i]:dur_cumsum[i + 1]] == 0 28 | j = 0 29 | while j < len(uv_) and not uv_[j]: 30 | j += 1 31 | dur[i - 1] += j 32 | dur[i] -= j 33 | if dur[i] < 100: 34 | dur[i - 1] += dur[i] 35 | dur[i] = 0 36 | # 声母和韵母等长 37 | for i in range(len(dur)): 38 | p = ph_list[i] 39 | if p in ALL_SHENMU: 40 | p_next = ph_list[i + 1] 41 | if not (dur[i] > 0 and p_next[0].isalpha() and p_next not in ALL_SHENMU): 42 | print(f"assert dur[i] > 0 and p_next[0].isalpha() and p_next not in ALL_SHENMU, " 43 | f"dur[i]: {dur[i]}, p: {p}, p_next: {p_next}.") 44 | continue 45 | total = dur[i + 1] + dur[i] 46 | dur[i] = total // 2 47 | dur[i + 1] = total - dur[i] 48 | for i in range(len(dur)): 49 | mel2ph += [i + 1] * dur[i] 50 | mel2ph = np.array(mel2ph) 51 | if mel2ph.max() - 1 >= len(phone_encoded): 52 | raise BinarizationError(f"| Align does not match: {(mel2ph.max() - 1, len(phone_encoded))}") 53 | res['mel2ph'] = mel2ph 54 | res['dur'] = dur 55 | 56 | 57 | if __name__ == "__main__": 58 | set_hparams() 59 | ZhBinarizer().process() 60 | -------------------------------------------------------------------------------- /NeuralSeq/data_gen/tts/emotion/model.py: -------------------------------------------------------------------------------- 1 | 2 | from data_gen.tts.emotion.params_model import * 3 | from data_gen.tts.emotion.params_data import * 4 | from torch.nn.utils import clip_grad_norm_ 5 | from scipy.optimize import brentq 6 | from torch import nn 7 | import numpy as np 8 | import torch 9 | 10 | 11 | class EmotionEncoder(nn.Module): 12 | def __init__(self, device, loss_device): 13 | super().__init__() 14 | self.loss_device = loss_device 15 | 16 | # 
Network defition 17 | self.lstm = nn.LSTM(input_size=mel_n_channels, 18 | hidden_size=model_hidden_size, 19 | num_layers=model_num_layers, 20 | batch_first=True).to(device) 21 | self.linear = nn.Linear(in_features=model_hidden_size, 22 | out_features=model_embedding_size).to(device) 23 | self.relu = torch.nn.ReLU().to(device) 24 | 25 | 26 | # Cosine similarity scaling (with fixed initial parameter values) 27 | self.similarity_weight = nn.Parameter(torch.tensor([10.])).to(loss_device) 28 | self.similarity_bias = nn.Parameter(torch.tensor([-5.])).to(loss_device) 29 | 30 | # Loss 31 | self.loss_fn = nn.CrossEntropyLoss().to(loss_device) 32 | 33 | def do_gradient_ops(self): 34 | # Gradient scale 35 | self.similarity_weight.grad *= 0.01 36 | self.similarity_bias.grad *= 0.01 37 | 38 | # Gradient clipping 39 | clip_grad_norm_(self.parameters(), 3, norm_type=2) 40 | 41 | def forward(self, utterances, hidden_init=None): 42 | """ 43 | Computes the embeddings of a batch of utterance spectrograms. 44 | 45 | :param utterances: batch of mel-scale filterbanks of same duration as a tensor of shape 46 | (batch_size, n_frames, n_channels) 47 | :param hidden_init: initial hidden state of the LSTM as a tensor of shape (num_layers, 48 | batch_size, hidden_size). Will default to a tensor of zeros if None. 49 | :return: the embeddings as a tensor of shape (batch_size, embedding_size) 50 | """ 51 | # Pass the input through the LSTM layers and retrieve all outputs, the final hidden state 52 | # and the final cell state. 53 | out, (hidden, cell) = self.lstm(utterances, hidden_init) 54 | 55 | # We take only the hidden state of the last layer 56 | embeds_raw = self.relu(self.linear(hidden[-1])) 57 | 58 | # L2-normalize it 59 | embeds = embeds_raw / torch.norm(embeds_raw, dim=1, keepdim=True) 60 | 61 | return embeds 62 | 63 | def inference(self, utterances, hidden_init=None): 64 | """ 65 | Computes the embeddings of a batch of utterance spectrograms. 66 | 67 | :param utterances: batch of mel-scale filterbanks of same duration as a tensor of shape 68 | (batch_size, n_frames, n_channels) 69 | :param hidden_init: initial hidden state of the LSTM as a tensor of shape (num_layers, 70 | batch_size, hidden_size). Will default to a tensor of zeros if None. 71 | :return: the embeddings as a tensor of shape (batch_size, embedding_size) 72 | """ 73 | # Pass the input through the LSTM layers and retrieve all outputs, the final hidden state 74 | # and the final cell state. 75 | 76 | out, (hidden, cell) = self.lstm(utterances, hidden_init) 77 | 78 | return hidden[-1] -------------------------------------------------------------------------------- /NeuralSeq/data_gen/tts/emotion/params_data.py: -------------------------------------------------------------------------------- 1 | 2 | ## Mel-filterbank 3 | mel_window_length = 25 # In milliseconds 4 | mel_window_step = 10 # In milliseconds 5 | mel_n_channels = 40 6 | 7 | 8 | ## Audio 9 | sampling_rate = 16000 10 | # Number of spectrogram frames in a partial utterance 11 | partials_n_frames = 160 # 1600 ms 12 | # Number of spectrogram frames at inference 13 | inference_n_frames = 80 # 800 ms 14 | 15 | 16 | ## Voice Activation Detection 17 | # Window size of the VAD. Must be either 10, 20 or 30 milliseconds. 18 | # This sets the granularity of the VAD. Should not need to be changed. 19 | vad_window_length = 30 # In milliseconds 20 | # Number of frames to average together when performing the moving average smoothing. 
21 | # The larger this value, the larger the VAD variations must be to not get smoothed out. 22 | vad_moving_average_width = 8 23 | # Maximum number of consecutive silent frames a segment can have. 24 | vad_max_silence_length = 6 25 | 26 | 27 | ## Audio volume normalization 28 | audio_norm_target_dBFS = -30 29 | 30 | -------------------------------------------------------------------------------- /NeuralSeq/data_gen/tts/emotion/params_model.py: -------------------------------------------------------------------------------- 1 | 2 | ## Model parameters 3 | model_hidden_size = 256 4 | model_embedding_size = 256 5 | model_num_layers = 3 6 | 7 | 8 | ## Training parameters 9 | learning_rate_init = 1e-4 10 | speakers_per_batch = 6 11 | utterances_per_speaker = 20 12 | -------------------------------------------------------------------------------- /NeuralSeq/data_gen/tts/txt_processors/__init__.py: -------------------------------------------------------------------------------- 1 | from . import en -------------------------------------------------------------------------------- /NeuralSeq/data_gen/tts/txt_processors/base_text_processor.py: -------------------------------------------------------------------------------- 1 | from data_gen.tts.data_gen_utils import is_sil_phoneme 2 | 3 | REGISTERED_TEXT_PROCESSORS = {} 4 | 5 | def register_txt_processors(name): 6 | def _f(cls): 7 | REGISTERED_TEXT_PROCESSORS[name] = cls 8 | return cls 9 | 10 | return _f 11 | 12 | 13 | def get_txt_processor_cls(name): 14 | return REGISTERED_TEXT_PROCESSORS.get(name, None) 15 | 16 | 17 | class BaseTxtProcessor: 18 | @staticmethod 19 | def sp_phonemes(): 20 | return ['|'] 21 | 22 | @classmethod 23 | def process(cls, txt, preprocess_args): 24 | raise NotImplementedError 25 | 26 | @classmethod 27 | def postprocess(cls, txt_struct, preprocess_args): 28 | # remove sil phoneme in head and tail 29 | while len(txt_struct) > 0 and is_sil_phoneme(txt_struct[0][0]): 30 | txt_struct = txt_struct[1:] 31 | while len(txt_struct) > 0 and is_sil_phoneme(txt_struct[-1][0]): 32 | txt_struct = txt_struct[:-1] 33 | if preprocess_args['with_phsep']: 34 | txt_struct = cls.add_bdr(txt_struct) 35 | if preprocess_args['add_eos_bos']: 36 | txt_struct = [["", [""]]] + txt_struct + [["", [""]]] 37 | return txt_struct 38 | 39 | @classmethod 40 | def add_bdr(cls, txt_struct): 41 | txt_struct_ = [] 42 | for i, ts in enumerate(txt_struct): 43 | txt_struct_.append(ts) 44 | if i != len(txt_struct) - 1 and \ 45 | not is_sil_phoneme(txt_struct[i][0]) and not is_sil_phoneme(txt_struct[i + 1][0]): 46 | txt_struct_.append(['|', ['|']]) 47 | return txt_struct_ -------------------------------------------------------------------------------- /NeuralSeq/data_gen/tts/txt_processors/en.py: -------------------------------------------------------------------------------- 1 | import re 2 | import unicodedata 3 | 4 | from g2p_en import G2p 5 | from g2p_en.expand import normalize_numbers 6 | from nltk import pos_tag 7 | from nltk.tokenize import TweetTokenizer 8 | 9 | from data_gen.tts.txt_processors.base_text_processor import BaseTxtProcessor, register_txt_processors 10 | from data_gen.tts.data_gen_utils import is_sil_phoneme, PUNCS 11 | 12 | class EnG2p(G2p): 13 | word_tokenize = TweetTokenizer().tokenize 14 | 15 | def __call__(self, text): 16 | # preprocessing 17 | words = EnG2p.word_tokenize(text) 18 | tokens = pos_tag(words) # tuples of (word, tag) 19 | 20 | # steps 21 | prons = [] 22 | for word, pos in tokens: 23 | if re.search("[a-z]", word) is None: 24 | pron 
= [word] 25 | 26 | elif word in self.homograph2features: # Check homograph 27 | pron1, pron2, pos1 = self.homograph2features[word] 28 | if pos.startswith(pos1): 29 | pron = pron1 30 | else: 31 | pron = pron2 32 | elif word in self.cmu: # lookup CMU dict 33 | pron = self.cmu[word][0] 34 | else: # predict for oov 35 | pron = self.predict(word) 36 | 37 | prons.extend(pron) 38 | prons.extend([" "]) 39 | 40 | return prons[:-1] 41 | 42 | 43 | @register_txt_processors('en') 44 | class TxtProcessor(BaseTxtProcessor): 45 | g2p = EnG2p() 46 | 47 | @staticmethod 48 | def preprocess_text(text): 49 | text = normalize_numbers(text) 50 | text = ''.join(char for char in unicodedata.normalize('NFD', text) 51 | if unicodedata.category(char) != 'Mn') # Strip accents 52 | text = text.lower() 53 | text = re.sub("[\'\"()]+", "", text) 54 | text = re.sub("[-]+", " ", text) 55 | text = re.sub(f"[^ a-z{PUNCS}]", "", text) 56 | text = re.sub(f" ?([{PUNCS}]) ?", r"\1", text) # !! -> ! 57 | text = re.sub(f"([{PUNCS}])+", r"\1", text) # !! -> ! 58 | text = text.replace("i.e.", "that is") 59 | text = text.replace("i.e.", "that is") 60 | text = text.replace("etc.", "etc") 61 | text = re.sub(f"([{PUNCS}])", r" \1 ", text) 62 | text = re.sub(rf"\s+", r" ", text) 63 | return text 64 | 65 | @classmethod 66 | def process(cls, txt, preprocess_args): 67 | txt = cls.preprocess_text(txt).strip() 68 | phs = cls.g2p(txt) 69 | txt_struct = [[w, []] for w in txt.split(" ")] 70 | i_word = 0 71 | for p in phs: 72 | if p == ' ': 73 | i_word += 1 74 | else: 75 | txt_struct[i_word][1].append(p) 76 | txt_struct = cls.postprocess(txt_struct, preprocess_args) 77 | return txt_struct, txt -------------------------------------------------------------------------------- /NeuralSeq/data_gen/tts/txt_processors/zh.py: -------------------------------------------------------------------------------- 1 | import re 2 | import jieba 3 | from pypinyin import pinyin, Style 4 | from data_gen.tts.data_gen_utils import PUNCS 5 | from data_gen.tts.txt_processors.base_text_processor import BaseTxtProcessor 6 | from utils.text_norm import NSWNormalizer 7 | 8 | 9 | class TxtProcessor(BaseTxtProcessor): 10 | table = {ord(f): ord(t) for f, t in zip( 11 | u':,。!?【】()%#@&1234567890', 12 | u':,.!?[]()%#@&1234567890')} 13 | 14 | @staticmethod 15 | def preprocess_text(text): 16 | text = text.translate(TxtProcessor.table) 17 | text = NSWNormalizer(text).normalize(remove_punc=False) 18 | text = re.sub("[\'\"()]+", "", text) 19 | text = re.sub("[-]+", " ", text) 20 | text = re.sub(f"[^ A-Za-z\u4e00-\u9fff{PUNCS}]", "", text) 21 | text = re.sub(f"([{PUNCS}])+", r"\1", text) # !! -> ! 
22 | text = re.sub(f"([{PUNCS}])", r" \1 ", text) 23 | text = re.sub(rf"\s+", r"", text) 24 | text = re.sub(rf"[A-Za-z]+", r"$", text) 25 | return text 26 | 27 | @classmethod 28 | def process(cls, txt, pre_align_args): 29 | txt = cls.preprocess_text(txt) 30 | shengmu = pinyin(txt, style=Style.INITIALS) # https://blog.csdn.net/zhoulei124/article/details/89055403 31 | yunmu_finals = pinyin(txt, style=Style.FINALS) 32 | yunmu_tone3 = pinyin(txt, style=Style.FINALS_TONE3) 33 | yunmu = [[t[0] + '5'] if t[0] == f[0] else t for f, t in zip(yunmu_finals, yunmu_tone3)] \ 34 | if pre_align_args['use_tone'] else yunmu_finals 35 | 36 | assert len(shengmu) == len(yunmu) 37 | phs = ["|"] 38 | for a, b, c in zip(shengmu, yunmu, yunmu_finals): 39 | if a[0] == c[0]: 40 | phs += [a[0], "|"] 41 | else: 42 | phs += [a[0], b[0], "|"] 43 | return phs, txt 44 | -------------------------------------------------------------------------------- /NeuralSeq/data_gen/tts/txt_processors/zh_g2pM.py: -------------------------------------------------------------------------------- 1 | import re 2 | import jieba 3 | from pypinyin import pinyin, Style 4 | from data_gen.tts.data_gen_utils import PUNCS 5 | from data_gen.tts.txt_processors import zh 6 | from g2pM import G2pM 7 | 8 | ALL_SHENMU = ['zh', 'ch', 'sh', 'b', 'p', 'm', 'f', 'd', 't', 'n', 'l', 'g', 'k', 'h', 'j', 9 | 'q', 'x', 'r', 'z', 'c', 's', 'y', 'w'] 10 | ALL_YUNMU = ['a', 'ai', 'an', 'ang', 'ao', 'e', 'ei', 'en', 'eng', 'er', 'i', 'ia', 'ian', 11 | 'iang', 'iao', 'ie', 'in', 'ing', 'iong', 'iu', 'ng', 'o', 'ong', 'ou', 12 | 'u', 'ua', 'uai', 'uan', 'uang', 'ui', 'un', 'uo', 'v', 'van', 've', 'vn'] 13 | 14 | 15 | class TxtProcessor(zh.TxtProcessor): 16 | model = G2pM() 17 | 18 | @staticmethod 19 | def sp_phonemes(): 20 | return ['|', '#'] 21 | 22 | @classmethod 23 | def process(cls, txt, pre_align_args): 24 | txt = cls.preprocess_text(txt) 25 | ph_list = cls.model(txt, tone=pre_align_args['use_tone'], char_split=True) 26 | seg_list = '#'.join(jieba.cut(txt)) 27 | assert len(ph_list) == len([s for s in seg_list if s != '#']), (ph_list, seg_list) 28 | 29 | # 加入词边界'#' 30 | ph_list_ = [] 31 | seg_idx = 0 32 | for p in ph_list: 33 | p = p.replace("u:", "v") 34 | if seg_list[seg_idx] == '#': 35 | ph_list_.append('#') 36 | seg_idx += 1 37 | else: 38 | ph_list_.append("|") 39 | seg_idx += 1 40 | if re.findall('[\u4e00-\u9fff]', p): 41 | if pre_align_args['use_tone']: 42 | p = pinyin(p, style=Style.TONE3, strict=True)[0][0] 43 | if p[-1] not in ['1', '2', '3', '4', '5']: 44 | p = p + '5' 45 | else: 46 | p = pinyin(p, style=Style.NORMAL, strict=True)[0][0] 47 | 48 | finished = False 49 | if len([c.isalpha() for c in p]) > 1: 50 | for shenmu in ALL_SHENMU: 51 | if p.startswith(shenmu) and not p.lstrip(shenmu).isnumeric(): 52 | ph_list_ += [shenmu, p.lstrip(shenmu)] 53 | finished = True 54 | break 55 | if not finished: 56 | ph_list_.append(p) 57 | 58 | ph_list = ph_list_ 59 | 60 | # 去除静音符号周围的词边界标记 [..., '#', ',', '#', ...] 
61 | sil_phonemes = list(PUNCS) + TxtProcessor.sp_phonemes() 62 | ph_list_ = [] 63 | for i in range(0, len(ph_list), 1): 64 | if ph_list[i] != '#' or (ph_list[i - 1] not in sil_phonemes and ph_list[i + 1] not in sil_phonemes): 65 | ph_list_.append(ph_list[i]) 66 | ph_list = ph_list_ 67 | return ph_list, txt 68 | 69 | 70 | if __name__ == '__main__': 71 | phs, txt = TxtProcessor.process('他来到了,网易杭研大厦', {'use_tone': True}) 72 | print(phs) 73 | -------------------------------------------------------------------------------- /NeuralSeq/data_gen/tts/wav_processors/__init__.py: -------------------------------------------------------------------------------- 1 | from . import base_processor 2 | from . import common_processors 3 | -------------------------------------------------------------------------------- /NeuralSeq/data_gen/tts/wav_processors/base_processor.py: -------------------------------------------------------------------------------- 1 | REGISTERED_WAV_PROCESSORS = {} 2 | 3 | 4 | def register_wav_processors(name): 5 | def _f(cls): 6 | REGISTERED_WAV_PROCESSORS[name] = cls 7 | return cls 8 | 9 | return _f 10 | 11 | 12 | def get_wav_processor_cls(name): 13 | return REGISTERED_WAV_PROCESSORS.get(name, None) 14 | 15 | 16 | class BaseWavProcessor: 17 | @property 18 | def name(self): 19 | raise NotImplementedError 20 | 21 | def output_fn(self, input_fn): 22 | return f'{input_fn[:-4]}_{self.name}.wav' 23 | 24 | def process(self, input_fn, sr, tmp_dir, processed_dir, item_name, preprocess_args): 25 | raise NotImplementedError 26 | -------------------------------------------------------------------------------- /NeuralSeq/data_gen/tts/wav_processors/common_processors.py: -------------------------------------------------------------------------------- 1 | import os 2 | import subprocess 3 | import librosa 4 | import numpy as np 5 | from data_gen.tts.wav_processors.base_processor import BaseWavProcessor, register_wav_processors 6 | from data_gen.tts.data_gen_utils import trim_long_silences 7 | from utils.audio import save_wav, rnnoise 8 | from utils.hparams import hparams 9 | 10 | 11 | @register_wav_processors(name='sox_to_wav') 12 | class ConvertToWavProcessor(BaseWavProcessor): 13 | @property 14 | def name(self): 15 | return 'ToWav' 16 | 17 | def process(self, input_fn, sr, tmp_dir, processed_dir, item_name, preprocess_args): 18 | if input_fn[-4:] == '.wav': 19 | return input_fn, sr 20 | else: 21 | output_fn = self.output_fn(input_fn) 22 | subprocess.check_call(f'sox -v 0.95 "{input_fn}" -t wav "{output_fn}"', shell=True) 23 | return output_fn, sr 24 | 25 | 26 | @register_wav_processors(name='sox_resample') 27 | class ResampleProcessor(BaseWavProcessor): 28 | @property 29 | def name(self): 30 | return 'Resample' 31 | 32 | def process(self, input_fn, sr, tmp_dir, processed_dir, item_name, preprocess_args): 33 | output_fn = self.output_fn(input_fn) 34 | sr_file = librosa.core.get_samplerate(input_fn) 35 | if sr != sr_file: 36 | subprocess.check_call(f'sox -v 0.95 "{input_fn}" -r{sr} "{output_fn}"', shell=True) 37 | y, _ = librosa.core.load(input_fn, sr=sr) 38 | y, _ = librosa.effects.trim(y) 39 | save_wav(y, output_fn, sr) 40 | return output_fn, sr 41 | else: 42 | return input_fn, sr 43 | 44 | 45 | @register_wav_processors(name='trim_sil') 46 | class TrimSILProcessor(BaseWavProcessor): 47 | @property 48 | def name(self): 49 | return 'TrimSIL' 50 | 51 | def process(self, input_fn, sr, tmp_dir, processed_dir, item_name, preprocess_args): 52 | output_fn = self.output_fn(input_fn) 53 | y, _ = 
librosa.core.load(input_fn, sr=sr) 54 | y, _ = librosa.effects.trim(y) 55 | save_wav(y, output_fn, sr) 56 | return output_fn 57 | 58 | 59 | @register_wav_processors(name='trim_all_sil') 60 | class TrimAllSILProcessor(BaseWavProcessor): 61 | @property 62 | def name(self): 63 | return 'TrimSIL' 64 | 65 | def process(self, input_fn, sr, tmp_dir, processed_dir, item_name, preprocess_args): 66 | output_fn = self.output_fn(input_fn) 67 | y, audio_mask, _ = trim_long_silences( 68 | input_fn, vad_max_silence_length=preprocess_args.get('vad_max_silence_length', 12)) 69 | save_wav(y, output_fn, sr) 70 | if preprocess_args['save_sil_mask']: 71 | os.makedirs(f'{processed_dir}/sil_mask', exist_ok=True) 72 | np.save(f'{processed_dir}/sil_mask/{item_name}.npy', audio_mask) 73 | return output_fn, sr 74 | 75 | 76 | @register_wav_processors(name='denoise') 77 | class DenoiseProcessor(BaseWavProcessor): 78 | @property 79 | def name(self): 80 | return 'Denoise' 81 | 82 | def process(self, input_fn, sr, tmp_dir, processed_dir, item_name, preprocess_args): 83 | output_fn = self.output_fn(input_fn) 84 | rnnoise(input_fn, output_fn, out_sample_rate=sr) 85 | return output_fn, sr 86 | -------------------------------------------------------------------------------- /NeuralSeq/egs/datasets/audio/emotion/base_text2mel.yaml: -------------------------------------------------------------------------------- 1 | raw_data_dir: 'data/raw/ESD' 2 | processed_data_dir: 'data/processed/emotion' 3 | binary_data_dir: 'data/binary/emotion' 4 | pre_align_cls: egs.datasets.audio.emotion.pre_align.EmoPreAlign 5 | audio_sample_rate: 16000 6 | binarization_args: 7 | shuffle: true 8 | binarizer_cls: data_gen.tts.base_binarizer_emotion.EmotionBinarizer 9 | use_spk_id: true 10 | test_num: 200 11 | num_spk: 10 12 | pitch_type: frame 13 | min_frames: 128 14 | num_test_samples: 30 15 | mel_loss: "ssim:0.5|l1:0.5" 16 | vocoder_ckpt: '' 17 | use_emotion: true -------------------------------------------------------------------------------- /NeuralSeq/egs/datasets/audio/emotion/pre_align.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from data_gen.tts.base_preprocess import BasePreprocessor 4 | import glob 5 | import re 6 | 7 | class EmoPreAlign(BasePreprocessor): 8 | 9 | def meta_data(self): 10 | spks = ['0012', '0011', '0013', '0014', '0015', '0016', '0017', '0018', '0019', '0020'] 11 | pattern = re.compile('[\t\n ]+') 12 | for spk in spks: 13 | for line in open(f"{self.raw_data_dir}/{spk}/{spk}.txt", 'r'): # 打开文件 14 | line = re.sub(pattern, ' ', line) 15 | if line == ' ': continue 16 | split_ = line.split(' ') 17 | txt = ' '.join(split_[1: -2]) 18 | item_name = split_[0] 19 | emotion = split_[-2] 20 | wav_fn = f'{self.raw_data_dir}/{spk}/{emotion}/{item_name}.wav' 21 | yield item_name, wav_fn, txt, spk, emotion 22 | 23 | 24 | if __name__ == "__main__": 25 | EmoPreAlign().process() 26 | -------------------------------------------------------------------------------- /NeuralSeq/egs/datasets/audio/libritts/base_text2mel.yaml: -------------------------------------------------------------------------------- 1 | raw_data_dir: 'data/raw/LibriTTS' 2 | processed_data_dir: 'data/processed/libritts' 3 | binary_data_dir: 'data/binary/libritts' 4 | pre_align_cls: egs.datasets.audio.libritts.pre_align.LibrittsPreAlign 5 | binarization_args: 6 | shuffle: true 7 | use_spk_id: true 8 | test_num: 200 9 | num_spk: 2320 10 | pitch_type: frame 11 | min_frames: 128 12 | num_test_samples: 30 13 | mel_loss: 
"ssim:0.5|l1:0.5" 14 | vocoder_ckpt: '' -------------------------------------------------------------------------------- /NeuralSeq/egs/datasets/audio/libritts/fs2.yaml: -------------------------------------------------------------------------------- 1 | base_config: 2 | - egs/egs_bases/tts/fs2.yaml 3 | - ./base_text2mel.yaml 4 | -------------------------------------------------------------------------------- /NeuralSeq/egs/datasets/audio/libritts/pre_align.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from data_gen.tts.base_preprocess import BasePreprocessor 4 | import glob 5 | 6 | 7 | class LibrittsPreAlign(BasePreprocessor): 8 | def meta_data(self): 9 | wav_fns = sorted(glob.glob(f'{self.raw_data_dir}/*/*/*.wav')) 10 | for wav_fn in wav_fns: 11 | item_name = os.path.basename(wav_fn)[:-4] 12 | txt_fn = f'{wav_fn[:-4]}.normalized.txt' 13 | with open(txt_fn, 'r') as f: 14 | txt = f.readlines() 15 | f.close() 16 | spk = item_name.split("_")[0] 17 | # Example: 18 | # 19 | # 'item_name': '103_1241_000000_000001' 20 | # 'wav_fn': 'LibriTTS/train-clean-100/103/1241/103_1241_000000_000001.wav' 21 | # 'txt': 'matthew Cuthbert is surprised' 22 | # 'spk_name': '103' 23 | yield {'item_name': item_name, 'wav_fn': wav_fn, 'txt': txt[0], 'spk_name': spk} 24 | 25 | 26 | if __name__ == "__main__": 27 | LibrittsPreAlign().process() 28 | -------------------------------------------------------------------------------- /NeuralSeq/egs/datasets/audio/libritts/pwg.yaml: -------------------------------------------------------------------------------- 1 | base_config: egs/egs_bases/tts/vocoder/pwg.yaml 2 | raw_data_dir: 'data/raw/LibriTTS' 3 | processed_data_dir: 'data/processed/libritts' 4 | binary_data_dir: 'data/binary/libritts_wav' 5 | generator_params: 6 | kernel_size: 5 7 | num_spk: 400 8 | max_samples: 20480 9 | -------------------------------------------------------------------------------- /NeuralSeq/egs/datasets/audio/lj/base_mel2wav.yaml: -------------------------------------------------------------------------------- 1 | raw_data_dir: 'data/raw/LJSpeech-1.1' 2 | processed_data_dir: 'data/processed/ljspeech' 3 | binary_data_dir: 'data/binary/ljspeech_wav' 4 | binarization_args: 5 | with_spk_embed: false -------------------------------------------------------------------------------- /NeuralSeq/egs/datasets/audio/lj/preprocess.py: -------------------------------------------------------------------------------- 1 | from data_gen.tts.base_preprocess import BasePreprocessor 2 | 3 | 4 | class LJPreprocess(BasePreprocessor): 5 | def meta_data(self): 6 | for l in open(f'{self.raw_data_dir}/metadata.csv').readlines(): 7 | item_name, _, txt = l.strip().split("|") 8 | wav_fn = f"{self.raw_data_dir}/wavs/{item_name}.wav" 9 | yield {'item_name': item_name, 'wav_fn': wav_fn, 'txt': txt} 10 | -------------------------------------------------------------------------------- /NeuralSeq/egs/datasets/audio/lj/pwg.yaml: -------------------------------------------------------------------------------- 1 | base_config: 2 | - egs/egs_bases/tts/vocoder/pwg.yaml 3 | - ./base_mel2wav.yaml -------------------------------------------------------------------------------- /NeuralSeq/egs/datasets/audio/vctk/base_mel2wav.yaml: -------------------------------------------------------------------------------- 1 | raw_data_dir: 'data/raw/VCTK-Corpus' 2 | processed_data_dir: 'data/processed/vctk' 3 | binary_data_dir: 'data/binary/vctk_wav' 4 | 
-------------------------------------------------------------------------------- /NeuralSeq/egs/datasets/audio/vctk/fs2.yaml: -------------------------------------------------------------------------------- 1 | base_config: 2 | - egs/egs_bases/tts/fs2.yaml 3 | raw_data_dir: 'data/raw/VCTK-Corpus' 4 | processed_data_dir: 'data/processed/vctk' 5 | binary_data_dir: 'data/binary/vctk' 6 | pre_align_cls: egs.datasets.audio.vctk.pre_align.VCTKPreAlign 7 | use_spk_id: true 8 | test_num: 200 9 | num_spk: 400 10 | binarization_args: 11 | shuffle: true 12 | trim_eos_bos: true -------------------------------------------------------------------------------- /NeuralSeq/egs/datasets/audio/vctk/pre_align.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from data_gen.tts.base_pre_align import BasePreAlign 4 | import glob 5 | 6 | 7 | class VCTKPreAlign(BasePreAlign): 8 | def meta_data(self): 9 | wav_fns = glob.glob(f'{self.raw_data_dir}/wav48/*/*.wav') 10 | for wav_fn in wav_fns: 11 | item_name = os.path.basename(wav_fn)[:-4] 12 | spk = item_name.split("_")[0] 13 | txt_fn = wav_fn.split("/") 14 | txt_fn[-1] = f'{item_name}.txt' 15 | txt_fn[-3] = f'txt' 16 | txt_fn = "/".join(txt_fn) 17 | if os.path.exists(txt_fn) and os.path.exists(wav_fn): 18 | yield item_name, wav_fn, (self.load_txt, txt_fn), spk 19 | 20 | 21 | if __name__ == "__main__": 22 | VCTKPreAlign().process() 23 | -------------------------------------------------------------------------------- /NeuralSeq/egs/datasets/audio/vctk/pwg.yaml: -------------------------------------------------------------------------------- 1 | base_config: 2 | - egs/egs_bases/tts/vocoder/pwg.yaml 3 | - ./base_mel2wav.yaml 4 | 5 | num_spk: 400 6 | max_samples: 20480 7 | -------------------------------------------------------------------------------- /NeuralSeq/egs/egs_bases/config_base.yaml: -------------------------------------------------------------------------------- 1 | # task 2 | binary_data_dir: '' 3 | work_dir: '' # experiment directory. 
4 | infer: false # inference 5 | amp: false 6 | seed: 1234 7 | debug: false 8 | save_codes: [] 9 | # - configs 10 | # - modules 11 | # - tasks 12 | # - utils 13 | # - usr 14 | 15 | ############# 16 | # dataset 17 | ############# 18 | ds_workers: 1 19 | test_num: 100 20 | endless_ds: false 21 | sort_by_len: true 22 | 23 | ######### 24 | # train and eval 25 | ######### 26 | print_nan_grads: false 27 | load_ckpt: '' 28 | save_best: true 29 | num_ckpt_keep: 3 30 | clip_grad_norm: 0 31 | accumulate_grad_batches: 1 32 | tb_log_interval: 100 33 | num_sanity_val_steps: 5 # steps of validation at the beginning 34 | check_val_every_n_epoch: 10 35 | val_check_interval: 2000 36 | valid_monitor_key: 'val_loss' 37 | valid_monitor_mode: 'min' 38 | max_epochs: 1000 39 | max_updates: 1000000 40 | max_tokens: 31250 41 | max_sentences: 100000 42 | max_valid_tokens: -1 43 | max_valid_sentences: -1 44 | test_input_dir: '' 45 | resume_from_checkpoint: 0 46 | rename_tmux: true -------------------------------------------------------------------------------- /NeuralSeq/egs/egs_bases/svs/base.yaml: -------------------------------------------------------------------------------- 1 | task_cls: tasks.svs.task.DiffFsTask 2 | pitch_type: frame 3 | timesteps: 100 4 | dilation_cycle_length: 1 5 | residual_layers: 20 6 | residual_channels: 256 7 | lr: 0.001 8 | decay_steps: 50000 9 | keep_bins: 80 10 | spec_min: [ ] 11 | spec_max: [ ] 12 | 13 | content_cond_steps: [ ] # [ 0, 10000 ] 14 | spk_cond_steps: [ ] # [ 0, 10000 ] 15 | # train and eval 16 | fs2_ckpt: '' 17 | max_updates: 400000 18 | # max_updates: 200000 19 | use_gt_dur: true 20 | use_gt_f0: true 21 | gen_tgt_spk_id: -1 22 | max_sentences: 48 23 | num_sanity_val_steps: 1 24 | num_valid_plots: 1 25 | -------------------------------------------------------------------------------- /NeuralSeq/egs/egs_bases/svs/lj_ds_beta6.yaml: -------------------------------------------------------------------------------- 1 | base_config: 2 | - configs/tts/lj/fs2.yaml 3 | - ./base.yaml 4 | # spec_min and spec_max are calculated on the training set. 
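# (In DiffSinger-style diffusion decoders these per-bin statistics are typically used to squash the
# mel-spectrogram into [-1, 1] before denoising, roughly x_norm = (x - spec_min) / (spec_max - spec_min) * 2 - 1,
# with the inverse mapping applied after sampling.)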
5 | spec_min: [ -4.7574, -4.6783, -4.6431, -4.5832, -4.5390, -4.6771, -4.8089, -4.7672, 6 | -4.5784, -4.7755, -4.7150, -4.8919, -4.8271, -4.7389, -4.6047, -4.7759, 7 | -4.6799, -4.8201, -4.7823, -4.8262, -4.7857, -4.7545, -4.9358, -4.9733, 8 | -5.1134, -5.1395, -4.9016, -4.8434, -5.0189, -4.8460, -5.0529, -4.9510, 9 | -5.0217, -5.0049, -5.1831, -5.1445, -5.1015, -5.0281, -4.9887, -4.9916, 10 | -4.9785, -4.9071, -4.9488, -5.0342, -4.9332, -5.0650, -4.8924, -5.0875, 11 | -5.0483, -5.0848, -5.1809, -5.0677, -5.0015, -5.0792, -5.0636, -5.2413, 12 | -5.1421, -5.1710, -5.3256, -5.0511, -5.1186, -5.0057, -5.0446, -5.1173, 13 | -5.0325, -5.1085, -5.0053, -5.0755, -5.1176, -5.1004, -5.2153, -5.2757, 14 | -5.3025, -5.2867, -5.2918, -5.3328, -5.2731, -5.2985, -5.2400, -5.2211 ] 15 | spec_max: [ -0.5982, -0.0778, 0.1205, 0.2747, 0.4657, 0.5123, 0.5684, 0.7093, 16 | 0.6461, 0.6420, 0.7316, 0.7715, 0.7681, 0.8349, 0.7815, 0.7591, 17 | 0.7910, 0.7433, 0.7352, 0.6869, 0.6854, 0.6623, 0.5353, 0.6492, 18 | 0.6909, 0.6106, 0.5761, 0.5936, 0.5638, 0.4054, 0.4545, 0.3589, 19 | 0.3037, 0.3380, 0.1599, 0.2433, 0.2741, 0.2130, 0.1569, 0.1911, 20 | 0.2324, 0.1586, 0.1221, 0.0341, -0.0558, 0.0553, -0.1153, -0.0933, 21 | -0.1171, -0.0050, -0.1519, -0.1629, -0.0522, -0.0739, -0.2069, -0.2405, 22 | -0.1244, -0.2116, -0.1361, -0.1575, -0.1442, 0.0513, -0.1567, -0.2000, 23 | 0.0086, -0.0698, 0.1385, 0.0941, 0.1864, 0.1225, 0.2176, 0.2566, 24 | 0.1670, 0.1007, 0.1444, 0.0888, 0.1998, 0.2414, 0.2932, 0.3047 ] 25 | 26 | task_cls: tasks.svs.diffspeech_task.DiffSpeechTask 27 | vocoder: vocoders.hifigan.HifiGAN 28 | vocoder_ckpt: checkpoints/0414_hifi_lj_1 29 | num_valid_plots: 10 30 | use_gt_dur: false 31 | use_gt_f0: false 32 | pitch_type: cwt 33 | pitch_extractor: 'parselmouth' 34 | max_updates: 160000 35 | lr: 0.001 36 | timesteps: 100 37 | K_step: 71 38 | diff_loss_type: l1 39 | diff_decoder_type: 'wavenet' 40 | schedule_type: 'linear' 41 | max_beta: 0.06 42 | fs2_ckpt: checkpoints/fs2_lj_1/model_ckpt_steps_150000.ckpt 43 | save_gt: true -------------------------------------------------------------------------------- /NeuralSeq/egs/egs_bases/svs/midi/cascade/opencs/aux_rel.yaml: -------------------------------------------------------------------------------- 1 | base_config: 2 | - configs/singing/fs2.yaml 3 | - egs/egs_bases/svs/midi/cascade/opencs/opencpop_statis.yaml 4 | 5 | audio_sample_rate: 24000 6 | hop_size: 128 # Hop size. 7 | fft_size: 512 # FFT size. 8 | win_size: 512 # FFT size. 
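# (note: win_size is the STFT analysis window length in samples; it simply happens to equal fft_size (512)
# in this recipe, which is why it reuses the "FFT size" comment.)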
9 | fmin: 30 10 | fmax: 12000 11 | min_level_db: -120 12 | 13 | binarization_args: 14 | with_wav: true 15 | with_spk_embed: false 16 | with_align: true 17 | raw_data_dir: 'data/raw/opencpop/segments' 18 | processed_data_dir: 'xxx' 19 | binarizer_cls: data_gen.singing.binarize.OpencpopBinarizer 20 | 21 | 22 | binary_data_dir: 'data/binary/opencpop-midi-dp' 23 | use_midi: true # for midi exp 24 | use_gt_f0: false # for midi exp 25 | use_gt_dur: false # for further midi exp 26 | lambda_f0: 1.0 27 | lambda_uv: 1.0 28 | #lambda_energy: 0.1 29 | lambda_ph_dur: 1.0 30 | lambda_sent_dur: 1.0 31 | lambda_word_dur: 1.0 32 | predictor_grad: 0.1 33 | pe_enable: false 34 | pe_ckpt: '' 35 | 36 | num_spk: 1 37 | test_prefixes: [ 38 | '2044', 39 | '2086', 40 | '2092', 41 | '2093', 42 | '2100', 43 | ] 44 | 45 | task_cls: tasks.svs.diffsinger_task.AuxDecoderMIDITask 46 | #vocoder: tasks.svs.singingvocoder.highgan.HighGAN 47 | #vocoder_ckpt: checkpoints/h_2_model/checkpoint-530000steps.pkl 48 | vocoder: vocoders.hifigan.HifiGAN 49 | vocoder_ckpt: checkpoints/0109_hifigan_bigpopcs_hop128 50 | 51 | use_nsf: true 52 | 53 | # config for experiments 54 | max_frames: 5000 55 | max_tokens: 40000 56 | predictor_layers: 5 57 | rel_pos: true 58 | dur_predictor_layers: 5 # * 59 | 60 | use_spk_embed: false 61 | num_valid_plots: 10 62 | max_updates: 160000 63 | save_gt: true -------------------------------------------------------------------------------- /NeuralSeq/egs/egs_bases/svs/midi/cascade/opencs/ds60_rel.yaml: -------------------------------------------------------------------------------- 1 | base_config: 2 | - egs/egs_bases/svs/popcs_ds_beta6.yaml 3 | - egs/egs_bases/svs/midi/cascade/opencs/opencpop_statis.yaml 4 | 5 | binarizer_cls: data_gen.singing.binarize.OpencpopBinarizer 6 | binary_data_dir: 'data/binary/opencpop-midi-dp' 7 | 8 | #switch_midi2f0_step: 174000 9 | use_midi: true # for midi exp 10 | use_gt_f0: false # for midi exp 11 | use_gt_dur: false # for further midi exp 12 | lambda_f0: 1.0 13 | lambda_uv: 1.0 14 | #lambda_energy: 0.1 15 | lambda_ph_dur: 1.0 16 | lambda_sent_dur: 1.0 17 | lambda_word_dur: 1.0 18 | predictor_grad: 0.1 19 | pe_enable: false 20 | pe_ckpt: '' 21 | 22 | fs2_ckpt: 'checkpoints/0302_opencpop_fs_midi/model_ckpt_steps_160000.ckpt' # 23 | #num_valid_plots: 0 24 | task_cls: tasks.svs.diffsinger_task.DiffSingerMIDITask 25 | 26 | K_step: 60 27 | max_tokens: 36000 28 | predictor_layers: 5 29 | dilation_cycle_length: 4 # * 30 | rel_pos: true 31 | dur_predictor_layers: 5 # * 32 | max_updates: 160000 33 | gaussian_start: false 34 | mask_uv_prob: 0.15 -------------------------------------------------------------------------------- /NeuralSeq/egs/egs_bases/svs/midi/cascade/opencs/opencpop_statis.yaml: -------------------------------------------------------------------------------- 1 | spec_min: [-6., -6., -6., -6., -6., -6., -6., -6., -6., -6., -6., -6., 2 | -6., -6., -6., -6., -6., -6., -6., -6., -6., -6., -6., -6., 3 | -6., -6., -6., -6., -6., -6., -6., -6., -6., -6., -6., -6., 4 | -6., -6., -6., -6., -6., -6., -6., -6., -6., -6., -6., -6., 5 | -6., -6., -6., -6., -6., -6., -6., -6., -6., -6., -6., -6., 6 | -6., -6., -6., -6., -6., -6., -6., -6., -6., -6., -6., -6., 7 | -6., -6., -6., -6., -6., -6., -6., -6.] 
8 | spec_max: [-7.9453e-01, -8.1116e-01, -6.1631e-01, -3.0679e-01, -1.3863e-01, 9 | -5.0652e-02, -1.1563e-01, -1.0679e-01, -9.1068e-02, -6.2174e-02, 10 | -7.5302e-02, -7.2217e-02, -6.3815e-02, -7.3299e-02, 7.3610e-03, 11 | -7.2508e-02, -5.0234e-02, -1.6534e-01, -2.6928e-01, -2.0782e-01, 12 | -2.0823e-01, -1.1702e-01, -7.0128e-02, -6.5868e-02, -1.2675e-02, 13 | 1.5121e-03, -8.9902e-02, -2.1392e-01, -2.3789e-01, -2.8922e-01, 14 | -3.0405e-01, -2.3029e-01, -2.2088e-01, -2.1542e-01, -2.9367e-01, 15 | -3.0137e-01, -3.8281e-01, -4.3590e-01, -2.8681e-01, -4.6855e-01, 16 | -5.7485e-01, -4.7022e-01, -5.4266e-01, -4.4848e-01, -6.4120e-01, 17 | -6.8700e-01, -6.4860e-01, -7.6436e-01, -4.9971e-01, -7.1068e-01, 18 | -6.9724e-01, -6.1487e-01, -5.5843e-01, -6.9773e-01, -5.7502e-01, 19 | -7.0919e-01, -8.2431e-01, -8.4213e-01, -9.0431e-01, -8.2840e-01, 20 | -7.7945e-01, -8.2758e-01, -8.7699e-01, -1.0532e+00, -1.0766e+00, 21 | -1.1198e+00, -1.0185e+00, -9.8983e-01, -1.0001e+00, -1.0756e+00, 22 | -1.0024e+00, -1.0304e+00, -1.0579e+00, -1.0188e+00, -1.0500e+00, 23 | -1.0842e+00, -1.0923e+00, -1.1223e+00, -1.2381e+00, -1.6467e+00] 24 | 25 | mel_vmin: -6. #-6. 26 | mel_vmax: 1.5 27 | wav2spec_eps: 1e-6 28 | 29 | raw_data_dir: 'data/raw/opencpop/segments' 30 | processed_data_dir: 'xxx' 31 | binary_data_dir: 'data/binary/opencpop-midi-dp' 32 | datasets: [ 33 | 'opencpop', 34 | ] 35 | test_prefixes: [ 36 | '2044', 37 | '2086', 38 | '2092', 39 | '2093', 40 | '2100', 41 | ] 42 | -------------------------------------------------------------------------------- /NeuralSeq/egs/egs_bases/svs/midi/e2e/opencpop/ds1000-10dil.yaml: -------------------------------------------------------------------------------- 1 | base_config: 2 | - egs/egs_bases/svs/popcs_ds_beta6.yaml 3 | - egs/egs_bases/svs/midi/cascade/opencs/opencpop_statis.yaml 4 | 5 | binarizer_cls: data_gen.singing.binarize.OpencpopBinarizer 6 | binary_data_dir: 'data/binary/opencpop-midi-dp' 7 | 8 | #switch_midi2f0_step: 174000 9 | use_midi: true # for midi exp 10 | use_gt_dur: false # for further midi exp 11 | lambda_ph_dur: 1.0 12 | lambda_sent_dur: 1.0 13 | lambda_word_dur: 1.0 14 | predictor_grad: 0.1 15 | dur_predictor_layers: 5 # * 16 | 17 | 18 | fs2_ckpt: '' # 19 | #num_valid_plots: 0 20 | task_cls: tasks.svs.diffsinger_task.DiffSingerMIDITask 21 | 22 | timesteps: 1000 23 | K_step: 1000 24 | max_beta: 0.02 25 | max_tokens: 36000 26 | max_updates: 320000 27 | gaussian_start: True 28 | 29 | use_pitch_embed: false 30 | use_gt_f0: false # for midi exp 31 | 32 | lambda_f0: 0. 33 | lambda_uv: 0. 
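# (note: with use_pitch_embed disabled above, the f0/uv losses are zeroed out; F0 for vocoding is instead
# expected to come from the pretrained pitch extractor configured below via pe_enable / pe_ckpt.)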
34 | dilation_cycle_length: 10 # * 35 | rel_pos: true 36 | predictor_layers: 5 37 | pe_enable: true 38 | pe_ckpt: 'checkpoints/0102_xiaoma_pe' 39 | -------------------------------------------------------------------------------- /NeuralSeq/egs/egs_bases/svs/midi/e2e/opencpop/ds1000.yaml: -------------------------------------------------------------------------------- 1 | base_config: 2 | - egs/egs_bases/svs/popcs_ds_beta6.yaml 3 | - egs/egs_bases/svs/midi/cascade/opencs/opencpop_statis.yaml 4 | 5 | binarizer_cls: data_gen.singing.binarize.OpencpopBinarizer 6 | binary_data_dir: 'data/binary/opencpop-midi-dp' 7 | 8 | #switch_midi2f0_step: 174000 9 | use_midi: true # for midi exp 10 | use_gt_dur: false # for further midi exp 11 | lambda_ph_dur: 1.0 12 | lambda_sent_dur: 1.0 13 | lambda_word_dur: 1.0 14 | predictor_grad: 0.1 15 | dur_predictor_layers: 5 # * 16 | 17 | 18 | fs2_ckpt: '' # 19 | #num_valid_plots: 0 20 | task_cls: tasks.svs.diffsinger_task.DiffSingerMIDITask 21 | 22 | # for diffusion schedule 23 | timesteps: 1000 24 | K_step: 1000 25 | max_beta: 0.02 26 | max_tokens: 36000 27 | max_updates: 320000 28 | gaussian_start: True 29 | pndm_speedup: 10 30 | 31 | use_pitch_embed: false 32 | use_gt_f0: false # for midi exp 33 | 34 | lambda_f0: 0. 35 | lambda_uv: 0. 36 | dilation_cycle_length: 4 # * 37 | rel_pos: true 38 | predictor_layers: 5 39 | pe_enable: true 40 | pe_ckpt: 'checkpoints/0102_xiaoma_pe' 41 | 42 | 43 | -------------------------------------------------------------------------------- /NeuralSeq/egs/egs_bases/svs/midi/e2e/opencpop/ds100_adj_rel.yaml: -------------------------------------------------------------------------------- 1 | base_config: 2 | - egs/egs_bases/svs/popcs_ds_beta6.yaml 3 | - egs/egs_bases/svs/midi/cascade/opencs/opencpop_statis.yaml 4 | 5 | binarizer_cls: data_gen.singing.binarize.OpencpopBinarizer 6 | binary_data_dir: 'data/binary/opencpop-midi-dp' 7 | 8 | #switch_midi2f0_step: 174000 9 | use_midi: true # for midi exp 10 | use_gt_dur: false # for further midi exp 11 | lambda_ph_dur: 1.0 12 | lambda_sent_dur: 1.0 13 | lambda_word_dur: 1.0 14 | predictor_grad: 0.1 15 | dur_predictor_layers: 5 # * 16 | 17 | 18 | fs2_ckpt: '' # 19 | #num_valid_plots: 0 20 | task_cls: tasks.svs.diffsinger_task.DiffSingerMIDITask 21 | 22 | K_step: 100 23 | max_tokens: 36000 24 | max_updates: 160000 25 | gaussian_start: True 26 | 27 | use_pitch_embed: false 28 | use_gt_f0: false # for midi exp 29 | 30 | lambda_f0: 0. 31 | lambda_uv: 0. 
32 | dilation_cycle_length: 4 # * 33 | rel_pos: true 34 | predictor_layers: 5 35 | pe_enable: true 36 | pe_ckpt: 'checkpoints/0102_xiaoma_pe' 37 | -------------------------------------------------------------------------------- /NeuralSeq/egs/egs_bases/svs/midi/e2e/popcs/ds100_adj_rel.yaml: -------------------------------------------------------------------------------- 1 | base_config: 2 | - egs/egs_bases/svs/popcs_ds_beta6.yaml 3 | - egs/egs_bases/svs/midi/cascade/popcs/popcs_statis.yaml 4 | 5 | binarizer_cls: data_gen.singing.binarize.MidiSingingBinarizer 6 | binary_data_dir: 'data/binary/popcs-midi-dp' 7 | 8 | #switch_midi2f0_step: 174000 9 | use_midi: true # for midi exp 10 | use_gt_dur: false # for further midi exp 11 | lambda_ph_dur: 1.0 12 | lambda_sent_dur: 1.0 13 | lambda_word_dur: 1.0 14 | predictor_grad: 0.1 15 | dur_predictor_layers: 5 # * 16 | 17 | 18 | fs2_ckpt: '' # 19 | #num_valid_plots: 0 20 | task_cls: tasks.svs.diffsinger_task.DiffSingerMIDITask 21 | 22 | K_step: 100 23 | max_tokens: 40000 24 | max_updates: 160000 25 | gaussian_start: True 26 | 27 | use_pitch_embed: false 28 | use_gt_f0: false # for midi exp 29 | 30 | lambda_f0: 0. 31 | lambda_uv: 0. 32 | dilation_cycle_length: 4 # * 33 | rel_pos: true 34 | predictor_layers: 5 35 | pe_enable: true 36 | pe_ckpt: 'checkpoints/0102_xiaoma_pe' 37 | -------------------------------------------------------------------------------- /NeuralSeq/egs/egs_bases/svs/midi/pe.yaml: -------------------------------------------------------------------------------- 1 | base_config: 2 | - configs/tts/lj/fs2.yaml 3 | 4 | max_frames: 8000 5 | audio_sample_rate: 24000 6 | hop_size: 128 # Hop size. 7 | fft_size: 512 # FFT size. 8 | win_size: 512 # FFT size. 9 | fmin: 30 10 | fmax: 12000 11 | min_level_db: -120 12 | 13 | binary_data_dir: 'xxx' 14 | 15 | pitch_type: frame 16 | task_cls: tasks.tts.pe.PitchExtractionTask 17 | pitch_extractor_conv_layers: 2 18 | 19 | 20 | # config for experiments 21 | max_tokens: 20000 22 | use_spk_embed: false 23 | num_valid_plots: 10 24 | max_updates: 60000 -------------------------------------------------------------------------------- /NeuralSeq/egs/egs_bases/svs/popcs_ds_beta6.yaml: -------------------------------------------------------------------------------- 1 | base_config: 2 | - configs/tts/fs2.yaml 3 | - configs/singing/base.yaml 4 | - ./base.yaml 5 | 6 | audio_sample_rate: 24000 7 | hop_size: 128 # Hop size. 8 | fft_size: 512 # FFT size. 9 | win_size: 512 # FFT size. 
10 | fmin: 30 11 | fmax: 12000 12 | min_level_db: -120 13 | 14 | binarization_args: 15 | with_wav: true 16 | with_spk_embed: false 17 | with_align: true 18 | raw_data_dir: 'data/raw/popcs' 19 | processed_data_dir: 'data/processed/popcs' 20 | binary_data_dir: 'data/binary/popcs-pmf0' 21 | num_spk: 1 22 | datasets: [ 23 | 'popcs', 24 | ] 25 | test_prefixes: [ 26 | 'popcs-说散就散', 27 | 'popcs-隐形的翅膀', 28 | ] 29 | 30 | spec_min: [-6.8276, -7.0270, -6.8142, -7.1429, -7.6669, -7.6000, -7.1148, -6.9640, 31 | -6.8414, -6.6596, -6.6880, -6.7439, -6.7986, -7.4940, -7.7845, -7.6586, 32 | -6.9288, -6.7639, -6.9118, -6.8246, -6.7183, -7.1769, -6.9794, -7.4513, 33 | -7.3422, -7.5623, -6.9610, -6.8158, -6.9595, -6.8403, -6.5688, -6.6356, 34 | -7.0209, -6.5002, -6.7819, -6.5232, -6.6927, -6.5701, -6.5531, -6.7069, 35 | -6.6462, -6.4523, -6.5954, -6.4264, -6.4487, -6.7070, -6.4025, -6.3042, 36 | -6.4008, -6.3857, -6.3903, -6.3094, -6.2491, -6.3518, -6.3566, -6.4168, 37 | -6.2481, -6.3624, -6.2858, -6.2575, -6.3638, -6.4520, -6.1835, -6.2754, 38 | -6.1253, -6.1645, -6.0638, -6.1262, -6.0710, -6.1039, -6.4428, -6.1363, 39 | -6.1054, -6.1252, -6.1797, -6.0235, -6.0758, -5.9453, -6.0213, -6.0446] 40 | spec_max: [ 0.2645, 0.0583, -0.2344, -0.0184, 0.1227, 0.1533, 0.1103, 0.1212, 41 | 0.2421, 0.1809, 0.2134, 0.3161, 0.3301, 0.3289, 0.2667, 0.2421, 42 | 0.2581, 0.2600, 0.1394, 0.1907, 0.1082, 0.1474, 0.1680, 0.2550, 43 | 0.1057, 0.0826, 0.0423, 0.1203, -0.0701, -0.0056, 0.0477, -0.0639, 44 | -0.0272, -0.0728, -0.1648, -0.0855, -0.2652, -0.1998, -0.1547, -0.2167, 45 | -0.4181, -0.5463, -0.4161, -0.4733, -0.6518, -0.5387, -0.4290, -0.4191, 46 | -0.4151, -0.3042, -0.3810, -0.4160, -0.4496, -0.2847, -0.4676, -0.4658, 47 | -0.4931, -0.4885, -0.5547, -0.5481, -0.6948, -0.7968, -0.8455, -0.8392, 48 | -0.8770, -0.9520, -0.8749, -0.7297, -0.8374, -0.8667, -0.7157, -0.9035, 49 | -0.9219, -0.8801, -0.9298, -0.9009, -0.9604, -1.0537, -1.0781, -1.3766] 50 | 51 | task_cls: tasks.svs.diffsinger_task.DiffSingerTask 52 | #vocoder: tasks.svs.singingvocoder.highgan.HighGAN 53 | #vocoder_ckpt: checkpoints/h_2_model/checkpoint-530000steps.pkl 54 | vocoder: vocoders.hifigan.HifiGAN 55 | vocoder_ckpt: checkpoints/0109_hifigan_bigpopcs_hop128 56 | 57 | pitch_extractor: 'parselmouth' 58 | # config for experiments 59 | use_spk_embed: false 60 | num_valid_plots: 10 61 | max_updates: 160000 62 | lr: 0.001 63 | timesteps: 100 64 | K_step: 51 65 | diff_loss_type: l1 66 | diff_decoder_type: 'wavenet' 67 | schedule_type: 'linear' 68 | max_beta: 0.06 69 | fs2_ckpt: '' 70 | use_nsf: true -------------------------------------------------------------------------------- /NeuralSeq/egs/egs_bases/svs/popcs_ds_beta6_offline.yaml: -------------------------------------------------------------------------------- 1 | base_config: 2 | - ./popcs_ds_beta6.yaml 3 | 4 | fs2_ckpt: checkpoints/popcs_fs2_pmf0_1230/model_ckpt_steps_160000.ckpt # to be infer 5 | num_valid_plots: 0 6 | task_cls: tasks.svs.diffsinger_task.DiffSingerOfflineTask 7 | 8 | # tmp: 9 | #pe_enable: true 10 | #pe_ckpt: '' 11 | vocoder: vocoders.hifigan.HifiGAN 12 | vocoder_ckpt: checkpoints/0109_hifigan_bigpopcs_hop128 -------------------------------------------------------------------------------- /NeuralSeq/egs/egs_bases/svs/popcs_fs2.yaml: -------------------------------------------------------------------------------- 1 | base_config: 2 | - configs/singing/fs2.yaml 3 | 4 | audio_sample_rate: 24000 5 | hop_size: 128 # Hop size. 6 | fft_size: 512 # FFT size. 7 | win_size: 512 # FFT size. 
8 | fmin: 30 9 | fmax: 12000 10 | min_level_db: -120 11 | 12 | binarization_args: 13 | with_wav: true 14 | with_spk_embed: false 15 | with_align: true 16 | raw_data_dir: 'data/raw/popcs' 17 | processed_data_dir: 'data/processed/popcs' 18 | binary_data_dir: 'data/binary/popcs-pmf0' 19 | num_spk: 1 20 | datasets: [ 21 | 'popcs', 22 | ] 23 | test_prefixes: [ 24 | 'popcs-说散就散', 25 | 'popcs-隐形的翅膀', 26 | ] 27 | 28 | task_cls: tasks.tts.fs2.FastSpeech2Task 29 | #vocoder: tasks.svs.singingvocoder.highgan.HighGAN 30 | #vocoder_ckpt: checkpoints/h_2_model/checkpoint-530000steps.pkl 31 | vocoder: vocoders.hifigan.HifiGAN 32 | vocoder_ckpt: checkpoints/0109_hifigan_bigpopcs_hop128 33 | use_nsf: true 34 | 35 | # config for experiments 36 | max_tokens: 18000 37 | use_spk_embed: false 38 | num_valid_plots: 10 39 | max_updates: 160000 40 | save_gt: true 41 | 42 | # tmp: 43 | #pe_enable: true 44 | #pe_ckpt: '' -------------------------------------------------------------------------------- /NeuralSeq/egs/egs_bases/tts/base.yaml: -------------------------------------------------------------------------------- 1 | # task 2 | base_config: ../config_base.yaml 3 | task_cls: '' 4 | ############# 5 | # dataset 6 | ############# 7 | raw_data_dir: '' 8 | processed_data_dir: '' 9 | binary_data_dir: '' 10 | dict_dir: '' 11 | pre_align_cls: '' 12 | binarizer_cls: data_gen.tts.base_binarizer.BaseBinarizer 13 | pre_align_args: 14 | txt_processor: en 15 | use_tone: true # for ZH 16 | sox_resample: false 17 | sox_to_wav: false 18 | allow_no_txt: false 19 | trim_sil: false 20 | denoise: false 21 | binarization_args: 22 | shuffle: false 23 | with_txt: true 24 | with_wav: false 25 | with_align: true 26 | with_spk_embed: false 27 | with_spk_id: true 28 | with_f0: true 29 | with_f0cwt: false 30 | with_linear: false 31 | with_word: true 32 | trim_sil: false 33 | trim_eos_bos: false 34 | reset_phone_dict: true 35 | reset_word_dict: true 36 | word_size: 30000 37 | pitch_extractor: parselmouth 38 | 39 | loud_norm: false 40 | endless_ds: true 41 | 42 | test_num: 100 43 | min_frames: 0 44 | max_frames: 1548 45 | frames_multiple: 1 46 | max_input_tokens: 1550 47 | audio_num_mel_bins: 80 48 | audio_sample_rate: 22050 49 | hop_size: 256 # For 22050Hz, 275 ~= 12.5 ms (0.0125 * sample_rate) 50 | win_size: 1024 # For 22050Hz, 1100 ~= 50 ms (If None, win_size: fft_size) (0.05 * sample_rate) 51 | fmin: 80 # Set this to 55 if your speaker is male! if female, 95 should help taking off noise. (To test depending on dataset. Pitch info: male~[65, 260], female~[100, 525]) 52 | fmax: 7600 # To be increased/reduced depending on data. 
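# (note: at audio_sample_rate 22050, the actual hop_size 256 is about 11.6 ms and win_size 1024 about 46.4 ms;
# the 275- and 1100-sample figures quoted in the comments above are the exact 12.5 ms / 50 ms equivalents,
# not the values used here.)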
53 | fft_size: 1024 # Extra window size is filled with 0 paddings to match this parameter 54 | min_level_db: -100 55 | ref_level_db: 20 56 | griffin_lim_iters: 60 57 | num_spk: 1 58 | mel_vmin: -6 59 | mel_vmax: 1.5 60 | ds_workers: 1 61 | 62 | ######### 63 | # model 64 | ######### 65 | dropout: 0.1 66 | enc_layers: 4 67 | dec_layers: 4 68 | hidden_size: 256 69 | num_heads: 2 70 | enc_ffn_kernel_size: 9 71 | dec_ffn_kernel_size: 9 72 | ffn_act: gelu 73 | ffn_padding: 'SAME' 74 | use_spk_id: false 75 | use_split_spk_id: false 76 | use_spk_embed: false 77 | 78 | 79 | ########### 80 | # optimization 81 | ########### 82 | lr: 2.0 83 | scheduler: rsqrt # rsqrt|none 84 | warmup_updates: 8000 85 | optimizer_adam_beta1: 0.9 86 | optimizer_adam_beta2: 0.98 87 | weight_decay: 0 88 | clip_grad_norm: 1 89 | clip_grad_value: 0 90 | 91 | 92 | ########### 93 | # train and eval 94 | ########### 95 | max_tokens: 30000 96 | max_sentences: 100000 97 | max_valid_sentences: 1 98 | max_valid_tokens: 60000 99 | valid_infer_interval: 10000 100 | train_set_name: 'train' 101 | train_sets: '' 102 | valid_set_name: 'valid' 103 | test_set_name: 'test' 104 | num_test_samples: 0 105 | num_valid_plots: 10 106 | test_ids: [ ] 107 | vocoder_denoise_c: 0.0 108 | profile_infer: false 109 | out_wav_norm: false 110 | save_gt: true 111 | save_f0: false 112 | gen_dir_name: '' -------------------------------------------------------------------------------- /NeuralSeq/egs/egs_bases/tts/base_zh.yaml: -------------------------------------------------------------------------------- 1 | base_config: ./base.yaml 2 | preprocess_args: 3 | txt_processor: zh 4 | use_tone: true 5 | 6 | word_size: 3000 -------------------------------------------------------------------------------- /NeuralSeq/egs/egs_bases/tts/fs2.yaml: -------------------------------------------------------------------------------- 1 | base_config: ./base.yaml 2 | task_cls: tasks.tts.fs2.FastSpeech2Task 3 | 4 | # model 5 | hidden_size: 256 6 | dropout: 0.1 7 | encoder_type: fft # rel_fft|fft|tacotron|tacotron2|conformer 8 | decoder_type: fft # fft|rnn|conv|conformer|wn 9 | 10 | # rnn enc/dec 11 | encoder_K: 8 12 | decoder_rnn_dim: 0 # for rnn decoder, 0 -> hidden_size * 2 13 | 14 | # fft enc/dec 15 | use_pos_embed: true 16 | dec_num_heads: 2 17 | dec_layers: 4 18 | ffn_hidden_size: 1024 19 | enc_ffn_kernel_size: 9 20 | dec_ffn_kernel_size: 9 21 | 22 | # conv enc/dec 23 | enc_dec_norm: ln 24 | conv_use_pos: false 25 | layers_in_block: 2 26 | enc_dilations: [ 1, 1, 1, 1 ] 27 | enc_kernel_size: 5 28 | dec_dilations: [ 1, 1, 1, 1 ] # for conv decoder 29 | dec_kernel_size: 5 30 | dur_loss: mse # huber|mol 31 | 32 | # duration 33 | predictor_hidden: -1 34 | predictor_kernel: 5 35 | predictor_layers: 2 36 | dur_predictor_kernel: 3 37 | dur_predictor_layers: 2 38 | predictor_dropout: 0.5 39 | 40 | # pitch and energy 41 | pitch_norm: standard # standard|log 42 | use_pitch_embed: true 43 | pitch_type: frame # frame|ph|cwt 44 | use_uv: true 45 | cwt_hidden_size: 128 46 | cwt_layers: 2 47 | cwt_loss: l1 48 | cwt_add_f0_loss: false 49 | cwt_std_scale: 0.8 50 | 51 | pitch_ar: false 52 | pitch_embed_type: 0 53 | pitch_loss: 'l1' # l1|l2|ssim 54 | pitch_ssim_win: 11 55 | use_energy_embed: false 56 | 57 | # reference encoder and speaker embedding 58 | use_ref_enc: false 59 | use_var_enc: false 60 | lambda_commit: 0.25 61 | var_enc_vq_codes: 64 62 | ref_norm_layer: bn 63 | dec_inp_add_noise: false 64 | sil_add_noise: false 65 | ref_hidden_stride_kernel: 66 | - 0,3,5 # conv_hidden_size, 
conv_stride, conv_kernel_size. conv_hidden_size=0: use hidden_size 67 | - 0,3,5 68 | - 0,2,5 69 | - 0,2,5 70 | - 0,2,5 71 | pitch_enc_hidden_stride_kernel: 72 | - 0,2,5 # conv_hidden_size, conv_stride, conv_kernel_size. conv_hidden_size=0: use hidden_size 73 | - 0,2,5 74 | - 0,2,5 75 | dur_enc_hidden_stride_kernel: 76 | - 0,2,3 # conv_hidden_size, conv_stride, conv_kernel_size. conv_hidden_size=0: use hidden_size 77 | - 0,2,3 78 | - 0,1,3 79 | 80 | # mel 81 | mel_loss: l1:0.5|ssim:0.5 # l1|l2|gdl|ssim or l1:0.5|ssim:0.5 82 | 83 | # loss lambda 84 | lambda_f0: 1.0 85 | lambda_uv: 1.0 86 | lambda_energy: 0.1 87 | lambda_ph_dur: 0.1 88 | lambda_sent_dur: 1.0 89 | lambda_word_dur: 1.0 90 | predictor_grad: 0.1 91 | 92 | # train and eval 93 | pretrain_fs_ckpt: '' 94 | warmup_updates: 2000 95 | max_tokens: 32000 96 | max_sentences: 100000 97 | max_valid_sentences: 1 98 | max_updates: 120000 99 | use_gt_dur: false 100 | use_gt_f0: false 101 | ds_workers: 2 102 | lr: 1.0 103 | -------------------------------------------------------------------------------- /NeuralSeq/egs/egs_bases/tts/fs2_adv.yaml: -------------------------------------------------------------------------------- 1 | base_config: ./fs2.yaml 2 | task_cls: tasks.tts.fs2_adv.FastSpeech2AdvTask 3 | 4 | disc_win_num: 3 5 | disc_interval: 1 6 | disc_reduction: stack # stack|sum|none 7 | disc_start_steps: 0 8 | rerun_gen: false 9 | 10 | disc_norm: in 11 | mel_disc_hidden_size: 128 12 | 13 | # mel decoder 14 | mel_gan: true 15 | lambda_mel_adv: 0.1 16 | mel_hidden_size: 256 17 | 18 | # others 19 | dropout: 0.05 20 | pitch_embed_type: 0 21 | enc_ffn_kernel_size: 9 22 | dec_ffn_kernel_size: 9 23 | use_cond_disc: false 24 | 25 | optimizer_adam_beta1: 0.5 26 | optimizer_adam_beta2: 0.999 27 | generator_grad_norm: 5.0 # Generator's gradient norm. 28 | disc_hidden_size: 128 29 | disc_lr: 0.0001 # Discriminator's learning rate. 30 | discriminator_optimizer_params: 31 | eps: 1.0e-6 # Discriminator's epsilon. 32 | weight_decay: 0.0 # Discriminator's weight decay coefficient. 33 | discriminator_scheduler_params: 34 | step_size: 60000 # Discriminator's scheduler step size. 35 | gamma: 0.5 # D5iscriminator's scheduler gamma. 36 | # At each step size, lr will be multiplied by this parameter. 37 | discriminator_grad_norm: 1 # Discriminator's gradient norm. 
38 | 39 | max_updates: 400000 40 | max_tokens: 30000 41 | max_sentences: 80 42 | val_check_interval: 2000 43 | 44 | gen_dir_name: '' 45 | num_ckpt_keep: 2 46 | save_best: false 47 | 48 | -------------------------------------------------------------------------------- /NeuralSeq/egs/egs_bases/tts/ps.yaml: -------------------------------------------------------------------------------- 1 | base_config: ./fs2.yaml 2 | 3 | ########################### 4 | # models 5 | ########################### 6 | # encoders 7 | hidden_size: 192 8 | ffn_hidden_size: 768 9 | enc_ffn_kernel_size: 5 10 | enc_layers: 4 11 | dur_level: word 12 | encoder_type: rel_fft 13 | use_word_encoder: true 14 | 15 | # mix ling encoder 16 | word_enc_layers: 4 17 | word_encoder_type: rel_fft 18 | use_pitch_embed: false 19 | enc_prenet: true 20 | enc_pre_ln: true 21 | text_encoder_postnet: true 22 | dropout: 0.0 23 | add_word_pos: true 24 | 25 | # dur predictor 26 | dur_predictor_layers: 3 27 | dur_predictor_kernel: 5 28 | predictor_dropout: 0.2 29 | 30 | ## fvae 31 | use_fvae: true 32 | latent_size: 16 33 | fvae_encoder_type: conv 34 | fvae_decoder_type: conv 35 | fvae_enc_dec_hidden: 192 36 | fvae_kernel_size: 5 37 | fvae_enc_n_layers: 8 38 | fvae_dec_n_layers: 4 39 | fvae_strides: 4 40 | fvae_noise_scale: 1.0 41 | 42 | # prior flow 43 | use_prior_flow: true 44 | prior_flow_hidden: 64 45 | prior_flow_kernel_size: 3 46 | prior_flow_n_blocks: 4 47 | 48 | ########################### 49 | # training and inference 50 | ########################### 51 | lambda_kl: 1.0 52 | kl_min: 0.0 53 | lambda_sent_dur: 0.0 54 | kl_start_steps: 10000 55 | posterior_start_steps: 0 56 | frames_multiple: 4 57 | num_valid_plots: 10 58 | lr: 0.0002 59 | warmup_updates: 8000 60 | max_tokens: 40000 61 | valid_infer_interval: 10000 62 | max_sentences: 80 63 | max_updates: 480000 -------------------------------------------------------------------------------- /NeuralSeq/egs/egs_bases/tts/ps_flow.yaml: -------------------------------------------------------------------------------- 1 | base_config: ./ps2.yaml 2 | task_cls: tasks.tts.ps_flow.PortaSpeechFlowTask 3 | 4 | use_post_flow: true 5 | detach_postflow_input: true 6 | post_flow_lr: 0.001 7 | post_glow_hidden: 192 8 | post_glow_kernel_size: 3 9 | post_glow_n_blocks: 12 10 | post_glow_n_block_layers: 3 11 | post_share_cond_layers: false 12 | share_wn_layers: 4 13 | use_cond_proj: false 14 | use_latent_cond: false 15 | use_txt_cond: true 16 | sigmoid_scale: false 17 | post_glow_training_start: 160000 18 | noise_scale: 0.8 19 | infer_post_glow: true 20 | two_stage: true -------------------------------------------------------------------------------- /NeuralSeq/egs/egs_bases/tts/ps_flow_small.yaml: -------------------------------------------------------------------------------- 1 | base_config: ./ps_flow.yaml 2 | 3 | ########################### 4 | # models 5 | ########################### 6 | # encoders 7 | hidden_size: 128 8 | ffn_hidden_size: 512 9 | enc_ffn_kernel_size: 3 10 | enc_layers: 3 11 | word_enc_layers: 3 12 | 13 | # dur predictor 14 | dur_predictor_layers: 3 15 | dur_predictor_kernel: 5 16 | predictor_dropout: 0.2 17 | 18 | ## fvae 19 | use_fvae: true 20 | latent_size: 16 21 | fvae_encoder_type: wn 22 | fvae_decoder_type: wn 23 | fvae_enc_dec_hidden: 128 24 | fvae_kernel_size: 3 25 | fvae_enc_n_layers: 8 26 | fvae_dec_n_layers: 3 27 | fvae_strides: 4 28 | fvae_noise_scale: 1.0 29 | 30 | 31 | # prior flow 32 | use_prior_flow: true 33 | prior_flow_hidden: 32 34 | prior_flow_kernel_size: 3 35 | 
prior_flow_n_blocks: 3 36 | # post flow 37 | post_glow_hidden: 128 38 | post_glow_kernel_size: 3 39 | post_glow_n_blocks: 8 40 | post_glow_n_block_layers: 3 41 | share_wn_layers: 4 42 | noise_scale: 0.6 -------------------------------------------------------------------------------- /NeuralSeq/egs/egs_bases/tts/vocoder/base.yaml: -------------------------------------------------------------------------------- 1 | base_config: ../base.yaml 2 | binarization_args: 3 | with_wav: true 4 | with_spk_embed: false 5 | with_align: false 6 | with_word: false 7 | with_txt: false 8 | 9 | ########### 10 | # train and eval 11 | ########### 12 | max_samples: 25600 13 | max_sentences: 5 14 | max_valid_sentences: 1 15 | max_updates: 1000000 16 | val_check_interval: 2000 17 | 18 | ########################################################### 19 | # FEATURE EXTRACTION SETTING # 20 | ########################################################### 21 | fft_size: 1024 # FFT size. 22 | hop_size: 256 # Hop size. 23 | win_length: null # Window length. 24 | # If set to null, it will be the same as fft_size. 25 | window: "hann" # Window function. 26 | num_mels: 80 # Number of mel basis. 27 | fmin: 80 # Minimum freq in mel basis calculation. 28 | fmax: 7600 # Maximum frequency in mel basis calculation. 29 | aux_context_window: 0 # Context window size for auxiliary feature. 30 | use_pitch_embed: false 31 | 32 | generator_grad_norm: 10 # Generator's gradient norm. 33 | discriminator_grad_norm: 1 # Discriminator's gradient norm. 34 | disc_start_steps: 40000 # Number of steps to start to train discriminator. 35 | -------------------------------------------------------------------------------- /NeuralSeq/egs/egs_bases/tts/vocoder/hifigan.yaml: -------------------------------------------------------------------------------- 1 | base_config: ./base.yaml 2 | task_cls: tasks.vocoder.hifigan.HifiGanTask 3 | resblock: "1" 4 | adam_b1: 0.8 5 | adam_b2: 0.99 6 | upsample_rates: [ 8,8,2,2 ] 7 | upsample_kernel_sizes: [ 16,16,4,4 ] 8 | upsample_initial_channel: 512 9 | resblock_kernel_sizes: [ 3,7,11 ] 10 | resblock_dilation_sizes: [ [ 1,3,5 ], [ 1,3,5 ], [ 1,3,5 ] ] 11 | 12 | use_pitch_embed: false 13 | use_fm_loss: false 14 | use_ms_stft: false 15 | 16 | lambda_mel: 5.0 17 | lambda_mel_adv: 1.0 18 | lambda_cdisc: 4.0 19 | lambda_adv: 1.0 20 | 21 | lr: 0.0002 # Generator's learning rate. 
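# (note: the step_size / gamma pairs below are StepLR-style settings: the learning rate is multiplied by
# gamma once every step_size training steps, i.e. roughly lr * gamma ** (step // step_size).)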
22 | generator_scheduler_params: 23 | step_size: 600 24 | gamma: 0.999 25 | discriminator_scheduler_params: 26 | step_size: 600 27 | gamma: 0.999 28 | max_updates: 3000000 -------------------------------------------------------------------------------- /NeuralSeq/gitattributes: -------------------------------------------------------------------------------- 1 | *.7z filter=lfs diff=lfs merge=lfs -text 2 | *.arrow filter=lfs diff=lfs merge=lfs -text 3 | *.bin filter=lfs diff=lfs merge=lfs -text 4 | *.bz2 filter=lfs diff=lfs merge=lfs -text 5 | *.ftz filter=lfs diff=lfs merge=lfs -text 6 | *.gz filter=lfs diff=lfs merge=lfs -text 7 | *.h5 filter=lfs diff=lfs merge=lfs -text 8 | *.joblib filter=lfs diff=lfs merge=lfs -text 9 | *.lfs.* filter=lfs diff=lfs merge=lfs -text 10 | *.model filter=lfs diff=lfs merge=lfs -text 11 | *.msgpack filter=lfs diff=lfs merge=lfs -text 12 | *.npy filter=lfs diff=lfs merge=lfs -text 13 | *.npz filter=lfs diff=lfs merge=lfs -text 14 | *.onnx filter=lfs diff=lfs merge=lfs -text 15 | *.ot filter=lfs diff=lfs merge=lfs -text 16 | *.parquet filter=lfs diff=lfs merge=lfs -text 17 | *.pickle filter=lfs diff=lfs merge=lfs -text 18 | *.pkl filter=lfs diff=lfs merge=lfs -text 19 | *.pb filter=lfs diff=lfs merge=lfs -text 20 | *.pt filter=lfs diff=lfs merge=lfs -text 21 | *.pth filter=lfs diff=lfs merge=lfs -text 22 | *.rar filter=lfs diff=lfs merge=lfs -text 23 | saved_model/**/* filter=lfs diff=lfs merge=lfs -text 24 | *.tar.* filter=lfs diff=lfs merge=lfs -text 25 | *.tflite filter=lfs diff=lfs merge=lfs -text 26 | *.tgz filter=lfs diff=lfs merge=lfs -text 27 | *.wasm filter=lfs diff=lfs merge=lfs -text 28 | *.xz filter=lfs diff=lfs merge=lfs -text 29 | *.zip filter=lfs diff=lfs merge=lfs -text 30 | *.zstandard filter=lfs diff=lfs merge=lfs -text 31 | *tfevents* filter=lfs diff=lfs merge=lfs -text 32 | model_ckpt_steps* filter=lfs diff=lfs merge=lfs -text 33 | checkpoints/0831_opencpop_ds1000 filter=lfs diff=lfs merge=lfs -text 34 | -------------------------------------------------------------------------------- /NeuralSeq/inference/svs/ds_cascade.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from inference.svs.base_svs_infer import BaseSVSInfer 3 | from utils import load_ckpt 4 | from utils.hparams import hparams 5 | from modules.diff.shallow_diffusion_tts import GaussianDiffusion 6 | from tasks.svs.diffsinger_task import DIFF_DECODERS 7 | 8 | class DiffSingerCascadeInfer(BaseSVSInfer): 9 | def build_model(self): 10 | model = GaussianDiffusion( 11 | phone_encoder=self.ph_encoder, 12 | out_dims=hparams['audio_num_mel_bins'], denoise_fn=DIFF_DECODERS[hparams['diff_decoder_type']](hparams), 13 | timesteps=hparams['timesteps'], 14 | K_step=hparams['K_step'], 15 | loss_type=hparams['diff_loss_type'], 16 | spec_min=hparams['spec_min'], spec_max=hparams['spec_max'], 17 | ) 18 | model.eval() 19 | load_ckpt(model, hparams['work_dir'], 'model') 20 | return model 21 | 22 | def forward_model(self, inp): 23 | sample = self.input_to_batch(inp) 24 | txt_tokens = sample['txt_tokens'] # [B, T_t] 25 | spk_id = sample.get('spk_ids') 26 | with torch.no_grad(): 27 | output = self.model(txt_tokens, spk_id=spk_id, ref_mels=None, infer=True, 28 | pitch_midi=sample['pitch_midi'], midi_dur=sample['midi_dur'], 29 | is_slur=sample['is_slur']) 30 | mel_out = output['mel_out'] # [B, T, 80] 31 | f0_pred = output['f0_denorm'] 32 | wav_out = self.run_vocoder(mel_out, f0=f0_pred) 33 | wav_out = wav_out.cpu().numpy() 34 | return
wav_out[0] 35 | 36 | 37 | if __name__ == '__main__': 38 | inp = { 39 | 'text': '小酒窝长睫毛AP是你最美的记号', 40 | 'notes': 'C#4/Db4 | F#4/Gb4 | G#4/Ab4 | A#4/Bb4 F#4/Gb4 | F#4/Gb4 C#4/Db4 | C#4/Db4 | rest | C#4/Db4 | A#4/Bb4 | G#4/Ab4 | A#4/Bb4 | G#4/Ab4 | F4 | C#4/Db4', 41 | 'notes_duration': '0.407140 | 0.376190 | 0.242180 | 0.509550 0.183420 | 0.315400 0.235020 | 0.361660 | 0.223070 | 0.377270 | 0.340550 | 0.299620 | 0.344510 | 0.283770 | 0.323390 | 0.360340', 42 | 'input_type': 'word' 43 | } # user input: Chinese characters 44 | c = { 45 | 'text': '小酒窝长睫毛AP是你最美的记号', 46 | 'ph_seq': 'x iao j iu w o ch ang ang j ie ie m ao AP sh i n i z ui m ei d e j i h ao', 47 | 'note_seq': 'C#4/Db4 C#4/Db4 F#4/Gb4 F#4/Gb4 G#4/Ab4 G#4/Ab4 A#4/Bb4 A#4/Bb4 F#4/Gb4 F#4/Gb4 F#4/Gb4 C#4/Db4 C#4/Db4 C#4/Db4 rest C#4/Db4 C#4/Db4 A#4/Bb4 A#4/Bb4 G#4/Ab4 G#4/Ab4 A#4/Bb4 A#4/Bb4 G#4/Ab4 G#4/Ab4 F4 F4 C#4/Db4 C#4/Db4', 48 | 'note_dur_seq': '0.407140 0.407140 0.376190 0.376190 0.242180 0.242180 0.509550 0.509550 0.183420 0.315400 0.315400 0.235020 0.361660 0.361660 0.223070 0.377270 0.377270 0.340550 0.340550 0.299620 0.299620 0.344510 0.344510 0.283770 0.283770 0.323390 0.323390 0.360340 0.360340', 49 | 'is_slur_seq': '0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0', 50 | 'input_type': 'phoneme' 51 | } # input like Opencpop dataset. 52 | DiffSingerCascadeInfer.example_run(inp) 53 | 54 | # # CUDA_VISIBLE_DEVICES=1 python inference/svs/ds_cascade.py --config egs/egs_bases/svs/midi/cascade/opencs/ds60_rel.yaml --exp_name 0303_opencpop_ds58_midi -------------------------------------------------------------------------------- /NeuralSeq/inference/svs/opencpop/map.py: -------------------------------------------------------------------------------- 1 | def cpop_pinyin2ph_func(): 2 | # In the README file of opencpop dataset, they defined a "pinyin to phoneme mapping table" 3 | pinyin2phs = {'AP': 'AP', 'SP': 'SP'} 4 | with open('NeuralSeq/inference/svs/opencpop/cpop_pinyin2ph.txt') as rf: 5 | for line in rf.readlines(): 6 | elements = [x.strip() for x in line.split('|') if x.strip() != ''] 7 | pinyin2phs[elements[0]] = elements[1] 8 | return pinyin2phs -------------------------------------------------------------------------------- /NeuralSeq/modules/GenerSpeech/config/generspeech.yaml: -------------------------------------------------------------------------------- 1 | base_config: 2 | - egs/egs_bases/tts/fs2.yaml 3 | - egs/datasets/audio/emotion/base_text2mel.yaml 4 | 5 | task_cls: modules.GenerSpeech.task.generspeech.GenerSpeechTask 6 | 7 | # emotion encoder 8 | emotion_encoder_path: checkpoints/Emotion_encoder.pt # set the emotion encoder path 9 | 10 | # vocoder 11 | vocoder: hifigan 12 | vocoder_ckpt: checkpoints/trainset_hifigan 13 | 14 | # dataset 15 | raw_data_dir: 'data/raw/training_set' 16 | processed_data_dir: 'data/processed/training_set' 17 | binary_data_dir: 'data/binary/training_set' 18 | test_input_dir: '' 19 | 20 | # process 21 | binarizer_cls: data_gen.tts.base_binarizer_emotion.EmotionBinarizer 22 | audio_sample_rate: 16000 23 | hop_size: 256 # For 22050Hz, 275 ~= 12.5 ms (0.0125 * sample_rate) 24 | win_size: 1024 # For 22050Hz, 1100 ~= 50 ms (If None, win_size: fft_size) (0.05 * sample_rate) 25 | fmin: 80 # Set this to 55 if your speaker is male! if female, 95 should help taking off noise. (To test depending on dataset. Pitch info: male~[65, 260], female~[100, 525]) 26 | fmax: 7600 # To be increased/reduced depending on data. 
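# (note: audio_sample_rate is 16000 in this config, so hop_size 256 corresponds to 16 ms and win_size 1024
# to 64 ms; the 22050 Hz timings quoted in the inherited comments above do not apply here.)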
27 | fft_size: 1024 # Extra window size is filled with 0 paddings to match this parameter 28 | min_level_db: -100 29 | ref_level_db: 20 30 | 31 | binarization_args: 32 | reset_phone_dict: true 33 | reset_word_dict: true 34 | shuffle: true 35 | trim_eos_bos: false 36 | trim_sil: false 37 | with_align: true 38 | with_f0: true 39 | with_f0cwt: false 40 | with_linear: false 41 | with_spk_embed: true 42 | with_spk_id: true 43 | with_txt: true 44 | with_wav: true 45 | with_word: true 46 | 47 | preprocess_cls: egs.datasets.audio.libritts.pre_align.LibrittsPreAlign 48 | preprocess_args: 49 | nsample_per_mfa_group: 1000 50 | # text process 51 | txt_processor: en 52 | use_mfa: true 53 | with_phsep: true 54 | reset_phone_dict: true 55 | reset_word_dict: true 56 | add_eos_bos: true 57 | # mfa 58 | mfa_group_shuffle: false 59 | mfa_offset: 0.02 60 | # wav processors 61 | wav_processors: [] 62 | save_sil_mask: true 63 | vad_max_silence_length: 12 64 | 65 | # data 66 | word_dict_size: 10000 67 | num_spk: 500 68 | use_spk_embed: true 69 | use_spk_id: false 70 | use_word: true 71 | use_emotion: true 72 | use_gt_dur: false 73 | ref_audio: '' 74 | text: '' 75 | 76 | # training 77 | num_sanity_val_steps: -1 78 | max_updates: 300000 79 | max_sentences: 100000 80 | num_test_samples: 72 81 | 82 | ## glow 83 | post_glow_hidden: 128 84 | post_glow_kernel_size: 3 85 | post_glow_n_blocks: 8 86 | post_glow_n_block_layers: 3 87 | share_wn_layers: 4 88 | sigmoid_scale: false 89 | post_share_cond_layers: false 90 | use_txt_cond: true 91 | use_latent_cond: true 92 | noise_scale: 0.8 93 | 94 | # prosody extractor 95 | lambda_commit: 0.25 96 | vq_start: 20500 97 | vae_dropout: 0.0 98 | nVQ: 128 99 | forcing: 20000 100 | crop: false 101 | predictor_grad: 1.0 -------------------------------------------------------------------------------- /NeuralSeq/modules/GenerSpeech/model/mixstyle.py: -------------------------------------------------------------------------------- 1 | from modules.commons.common_layers import * 2 | import random 3 | 4 | 5 | class MixStyle(nn.Module): 6 | """MixStyle. 7 | Reference: 8 | Zhou et al. Domain Generalization with MixStyle. ICLR 2021. 9 | """ 10 | 11 | def __init__(self, p=0.5, alpha=0.1, eps=1e-6, hidden_size=256): 12 | """ 13 | Args: 14 | p (float): probability of using MixStyle. 15 | alpha (float): parameter of the Beta distribution. 16 | eps (float): scaling parameter to avoid numerical issues. 17 | mix (str): how to mix. 
18 | """ 19 | super().__init__() 20 | self.p = p 21 | self.beta = torch.distributions.Beta(alpha, alpha) 22 | self.eps = eps 23 | self.alpha = alpha 24 | self._activated = True 25 | self.hidden_size = hidden_size 26 | self.affine_layer = LinearNorm( 27 | hidden_size, 28 | 2 * hidden_size, # For both b (bias) g (gain) 29 | ) 30 | 31 | def __repr__(self): 32 | return f'MixStyle(p={self.p}, alpha={self.alpha}, eps={self.eps})' 33 | 34 | def set_activation_status(self, status=True): 35 | self._activated = status 36 | 37 | def forward(self, x, spk_embed): 38 | if not self.training or not self._activated: 39 | return x 40 | 41 | if random.random() > self.p: 42 | return x 43 | 44 | B = x.size(0) 45 | 46 | mu, sig = torch.mean(x, dim=-1, keepdim=True), torch.std(x, dim=-1, keepdim=True) 47 | x_normed = (x - mu) / (sig + 1e-6) # [B, T, H_m] 48 | 49 | lmda = self.beta.sample((B, 1, 1)) 50 | lmda = lmda.to(x.device) 51 | 52 | # Get Bias and Gain 53 | mu1, sig1 = torch.split(self.affine_layer(spk_embed), self.hidden_size, dim=-1) # [B, 1, 2 * H_m] --> 2 * [B, 1, H_m] 54 | 55 | # MixStyle 56 | perm = torch.randperm(B) 57 | mu2, sig2 = mu1[perm], sig1[perm] 58 | 59 | mu_mix = mu1*lmda + mu2 * (1-lmda) 60 | sig_mix = sig1*lmda + sig2 * (1-lmda) 61 | 62 | # Perform Scailing and Shifting 63 | return sig_mix * x_normed + mu_mix # [B, T, H_m] 64 | -------------------------------------------------------------------------------- /NeuralSeq/modules/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIGC-Audio/AudioGPT/a674543c537bb3a60b8f521df76b97a627c1c379/NeuralSeq/modules/__init__.py -------------------------------------------------------------------------------- /NeuralSeq/modules/commons/align_ops.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | 4 | 5 | def build_word_mask(x2word, y2word): 6 | return (x2word[:, :, None] == y2word[:, None, :]).long() 7 | 8 | 9 | def mel2ph_to_mel2word(mel2ph, ph2word): 10 | mel2word = (ph2word - 1).gather(1, (mel2ph - 1).clamp(min=0)) + 1 11 | mel2word = mel2word * (mel2ph > 0).long() 12 | return mel2word 13 | 14 | 15 | def clip_mel2token_to_multiple(mel2token, frames_multiple): 16 | max_frames = mel2token.shape[1] // frames_multiple * frames_multiple 17 | mel2token = mel2token[:, :max_frames] 18 | return mel2token 19 | 20 | 21 | def expand_states(h, mel2token): 22 | h = F.pad(h, [0, 0, 1, 0]) 23 | mel2token_ = mel2token[..., None].repeat([1, 1, h.shape[-1]]) 24 | h = torch.gather(h, 1, mel2token_) # [B, T, H] 25 | return h 26 | -------------------------------------------------------------------------------- /NeuralSeq/modules/commons/normalizing_flow/res_flow.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | from modules.commons.conv import ConditionalConvBlocks 4 | from modules.commons.wavenet import WN 5 | 6 | 7 | class FlipLayer(nn.Module): 8 | def forward(self, x, nonpadding, cond=None, reverse=False): 9 | x = torch.flip(x, [1]) 10 | return x 11 | 12 | 13 | class CouplingLayer(nn.Module): 14 | def __init__(self, c_in, hidden_size, kernel_size, n_layers, p_dropout=0, c_in_g=0, nn_type='wn'): 15 | super().__init__() 16 | self.channels = c_in 17 | self.hidden_size = hidden_size 18 | self.kernel_size = kernel_size 19 | self.n_layers = n_layers 20 | self.c_half = c_in // 2 21 | 22 | self.pre = nn.Conv1d(self.c_half, 
hidden_size, 1) 23 | if nn_type == 'wn': 24 | self.enc = WN(hidden_size, kernel_size, 1, n_layers, p_dropout=p_dropout, 25 | c_cond=c_in_g) 26 | elif nn_type == 'conv': 27 | self.enc = ConditionalConvBlocks( 28 | hidden_size, c_in_g, hidden_size, None, kernel_size, 29 | layers_in_block=1, is_BTC=False, num_layers=n_layers) 30 | self.post = nn.Conv1d(hidden_size, self.c_half, 1) 31 | 32 | def forward(self, x, nonpadding, cond=None, reverse=False): 33 | x0, x1 = x[:, :self.c_half], x[:, self.c_half:] 34 | x_ = self.pre(x0) * nonpadding 35 | x_ = self.enc(x_, nonpadding=nonpadding, cond=cond) 36 | m = self.post(x_) 37 | x1 = m + x1 if not reverse else x1 - m 38 | x = torch.cat([x0, x1], 1) 39 | return x * nonpadding 40 | 41 | 42 | class ResFlow(nn.Module): 43 | def __init__(self, 44 | c_in, 45 | hidden_size, 46 | kernel_size, 47 | n_flow_layers, 48 | n_flow_steps=4, 49 | c_cond=0, 50 | nn_type='wn'): 51 | super().__init__() 52 | self.flows = nn.ModuleList() 53 | for i in range(n_flow_steps): 54 | self.flows.append( 55 | CouplingLayer(c_in, hidden_size, kernel_size, n_flow_layers, c_in_g=c_cond, nn_type=nn_type)) 56 | self.flows.append(FlipLayer()) 57 | 58 | def forward(self, x, nonpadding, cond=None, reverse=False): 59 | for flow in (self.flows if not reverse else reversed(self.flows)): 60 | x = flow(x, nonpadding, cond=cond, reverse=reverse) 61 | return x 62 | -------------------------------------------------------------------------------- /NeuralSeq/modules/commons/normalizing_flow/utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def squeeze(x, x_mask=None, n_sqz=2): 5 | b, c, t = x.size() 6 | 7 | t = (t // n_sqz) * n_sqz 8 | x = x[:, :, :t] 9 | x_sqz = x.view(b, c, t // n_sqz, n_sqz) 10 | x_sqz = x_sqz.permute(0, 3, 1, 2).contiguous().view(b, c * n_sqz, t // n_sqz) 11 | 12 | if x_mask is not None: 13 | x_mask = x_mask[:, :, n_sqz - 1::n_sqz] 14 | else: 15 | x_mask = torch.ones(b, 1, t // n_sqz).to(device=x.device, dtype=x.dtype) 16 | return x_sqz * x_mask, x_mask 17 | 18 | 19 | def unsqueeze(x, x_mask=None, n_sqz=2): 20 | b, c, t = x.size() 21 | 22 | x_unsqz = x.view(b, n_sqz, c // n_sqz, t) 23 | x_unsqz = x_unsqz.permute(0, 2, 3, 1).contiguous().view(b, c // n_sqz, t * n_sqz) 24 | 25 | if x_mask is not None: 26 | x_mask = x_mask.unsqueeze(-1).repeat(1, 1, 1, n_sqz).view(b, 1, t * n_sqz) 27 | else: 28 | x_mask = torch.ones(b, 1, t * n_sqz).to(device=x.device, dtype=x.dtype) 29 | return x_unsqz * x_mask, x_mask 30 | -------------------------------------------------------------------------------- /NeuralSeq/modules/hifigan/mel_utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.utils.data 4 | from librosa.filters import mel as librosa_mel_fn 5 | from scipy.io.wavfile import read 6 | 7 | MAX_WAV_VALUE = 32768.0 8 | 9 | 10 | def load_wav(full_path): 11 | sampling_rate, data = read(full_path) 12 | return data, sampling_rate 13 | 14 | 15 | def dynamic_range_compression(x, C=1, clip_val=1e-5): 16 | return np.log(np.clip(x, a_min=clip_val, a_max=None) * C) 17 | 18 | 19 | def dynamic_range_decompression(x, C=1): 20 | return np.exp(x) / C 21 | 22 | 23 | def dynamic_range_compression_torch(x, C=1, clip_val=1e-5): 24 | return torch.log(torch.clamp(x, min=clip_val) * C) 25 | 26 | 27 | def dynamic_range_decompression_torch(x, C=1): 28 | return torch.exp(x) / C 29 | 30 | 31 | def spectral_normalize_torch(magnitudes): 32 | output = 
dynamic_range_compression_torch(magnitudes) 33 | return output 34 | 35 | 36 | def spectral_de_normalize_torch(magnitudes): 37 | output = dynamic_range_decompression_torch(magnitudes) 38 | return output 39 | 40 | 41 | mel_basis = {} 42 | hann_window = {} 43 | 44 | 45 | def mel_spectrogram(y, hparams, center=False, complex=False): 46 | # hop_size: 512 # For 22050Hz, 275 ~= 12.5 ms (0.0125 * sample_rate) 47 | # win_size: 2048 # For 22050Hz, 1100 ~= 50 ms (If None, win_size: fft_size) (0.05 * sample_rate) 48 | # fmin: 55 # Set this to 55 if your speaker is male! if female, 95 should help taking off noise. (To test depending on dataset. Pitch info: male~[65, 260], female~[100, 525]) 49 | # fmax: 10000 # To be increased/reduced depending on data. 50 | # fft_size: 2048 # Extra window size is filled with 0 paddings to match this parameter 51 | # n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, 52 | n_fft = hparams['fft_size'] 53 | num_mels = hparams['audio_num_mel_bins'] 54 | sampling_rate = hparams['audio_sample_rate'] 55 | hop_size = hparams['hop_size'] 56 | win_size = hparams['win_size'] 57 | fmin = hparams['fmin'] 58 | fmax = hparams['fmax'] 59 | y = y.clamp(min=-1., max=1.) 60 | global mel_basis, hann_window 61 | if fmax not in mel_basis: 62 | mel = librosa_mel_fn(sampling_rate, n_fft, num_mels, fmin, fmax) 63 | mel_basis[str(fmax) + '_' + str(y.device)] = torch.from_numpy(mel).float().to(y.device) 64 | hann_window[str(y.device)] = torch.hann_window(win_size).to(y.device) 65 | 66 | y = torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)), 67 | mode='reflect') 68 | y = y.squeeze(1) 69 | 70 | spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[str(y.device)], 71 | center=center, pad_mode='reflect', normalized=False, onesided=True) 72 | 73 | if not complex: 74 | spec = torch.sqrt(spec.pow(2).sum(-1) + (1e-9)) 75 | spec = torch.matmul(mel_basis[str(fmax) + '_' + str(y.device)], spec) 76 | spec = spectral_normalize_torch(spec) 77 | else: 78 | B, C, T, _ = spec.shape 79 | spec = spec.transpose(1, 2) # [B, T, n_fft, 2] 80 | return spec 81 | -------------------------------------------------------------------------------- /NeuralSeq/modules/parallel_wavegan/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIGC-Audio/AudioGPT/a674543c537bb3a60b8f521df76b97a627c1c379/NeuralSeq/modules/parallel_wavegan/__init__.py -------------------------------------------------------------------------------- /NeuralSeq/modules/parallel_wavegan/layers/__init__.py: -------------------------------------------------------------------------------- 1 | from .causal_conv import * # NOQA 2 | from .pqmf import * # NOQA 3 | from .residual_block import * # NOQA 4 | from modules.parallel_wavegan.layers.residual_stack import * # NOQA 5 | from .upsample import * # NOQA 6 | -------------------------------------------------------------------------------- /NeuralSeq/modules/parallel_wavegan/layers/causal_conv.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Copyright 2020 Tomoki Hayashi 4 | # MIT License (https://opensource.org/licenses/MIT) 5 | 6 | """Causal convolusion layer modules.""" 7 | 8 | 9 | import torch 10 | 11 | 12 | class CausalConv1d(torch.nn.Module): 13 | """CausalConv1d module with customized initialization.""" 14 | 15 | def __init__(self, in_channels, out_channels, 
kernel_size, 16 | dilation=1, bias=True, pad="ConstantPad1d", pad_params={"value": 0.0}): 17 | """Initialize CausalConv1d module.""" 18 | super(CausalConv1d, self).__init__() 19 | self.pad = getattr(torch.nn, pad)((kernel_size - 1) * dilation, **pad_params) 20 | self.conv = torch.nn.Conv1d(in_channels, out_channels, kernel_size, 21 | dilation=dilation, bias=bias) 22 | 23 | def forward(self, x): 24 | """Calculate forward propagation. 25 | 26 | Args: 27 | x (Tensor): Input tensor (B, in_channels, T). 28 | 29 | Returns: 30 | Tensor: Output tensor (B, out_channels, T). 31 | 32 | """ 33 | return self.conv(self.pad(x))[:, :, :x.size(2)] 34 | 35 | 36 | class CausalConvTranspose1d(torch.nn.Module): 37 | """CausalConvTranspose1d module with customized initialization.""" 38 | 39 | def __init__(self, in_channels, out_channels, kernel_size, stride, bias=True): 40 | """Initialize CausalConvTranspose1d module.""" 41 | super(CausalConvTranspose1d, self).__init__() 42 | self.deconv = torch.nn.ConvTranspose1d( 43 | in_channels, out_channels, kernel_size, stride, bias=bias) 44 | self.stride = stride 45 | 46 | def forward(self, x): 47 | """Calculate forward propagation. 48 | 49 | Args: 50 | x (Tensor): Input tensor (B, in_channels, T_in). 51 | 52 | Returns: 53 | Tensor: Output tensor (B, out_channels, T_out). 54 | 55 | """ 56 | return self.deconv(x)[:, :, :-self.stride] 57 | -------------------------------------------------------------------------------- /NeuralSeq/modules/parallel_wavegan/layers/residual_stack.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Copyright 2020 Tomoki Hayashi 4 | # MIT License (https://opensource.org/licenses/MIT) 5 | 6 | """Residual stack module in MelGAN.""" 7 | 8 | import torch 9 | 10 | from . import CausalConv1d 11 | 12 | 13 | class ResidualStack(torch.nn.Module): 14 | """Residual stack module introduced in MelGAN.""" 15 | 16 | def __init__(self, 17 | kernel_size=3, 18 | channels=32, 19 | dilation=1, 20 | bias=True, 21 | nonlinear_activation="LeakyReLU", 22 | nonlinear_activation_params={"negative_slope": 0.2}, 23 | pad="ReflectionPad1d", 24 | pad_params={}, 25 | use_causal_conv=False, 26 | ): 27 | """Initialize ResidualStack module. 28 | 29 | Args: 30 | kernel_size (int): Kernel size of dilation convolution layer. 31 | channels (int): Number of channels of convolution layers. 32 | dilation (int): Dilation factor. 33 | bias (bool): Whether to add bias parameter in convolution layers. 34 | nonlinear_activation (str): Activation function module name. 35 | nonlinear_activation_params (dict): Hyperparameters for activation function. 36 | pad (str): Padding function module name before dilated convolution layer. 37 | pad_params (dict): Hyperparameters for padding function. 38 | use_causal_conv (bool): Whether to use causal convolution. 39 | 40 | """ 41 | super(ResidualStack, self).__init__() 42 | 43 | # defile residual stack part 44 | if not use_causal_conv: 45 | assert (kernel_size - 1) % 2 == 0, "Not support even number kernel size." 
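# With an odd kernel_size, padding (kernel_size - 1) // 2 * dilation on each side (the pad layer below)
# keeps the sequence length unchanged, so self.stack(c) and self.skip_layer(c) line up in forward().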
46 | self.stack = torch.nn.Sequential( 47 | getattr(torch.nn, nonlinear_activation)(**nonlinear_activation_params), 48 | getattr(torch.nn, pad)((kernel_size - 1) // 2 * dilation, **pad_params), 49 | torch.nn.Conv1d(channels, channels, kernel_size, dilation=dilation, bias=bias), 50 | getattr(torch.nn, nonlinear_activation)(**nonlinear_activation_params), 51 | torch.nn.Conv1d(channels, channels, 1, bias=bias), 52 | ) 53 | else: 54 | self.stack = torch.nn.Sequential( 55 | getattr(torch.nn, nonlinear_activation)(**nonlinear_activation_params), 56 | CausalConv1d(channels, channels, kernel_size, dilation=dilation, 57 | bias=bias, pad=pad, pad_params=pad_params), 58 | getattr(torch.nn, nonlinear_activation)(**nonlinear_activation_params), 59 | torch.nn.Conv1d(channels, channels, 1, bias=bias), 60 | ) 61 | 62 | # defile extra layer for skip connection 63 | self.skip_layer = torch.nn.Conv1d(channels, channels, 1, bias=bias) 64 | 65 | def forward(self, c): 66 | """Calculate forward propagation. 67 | 68 | Args: 69 | c (Tensor): Input tensor (B, channels, T). 70 | 71 | Returns: 72 | Tensor: Output tensor (B, chennels, T). 73 | 74 | """ 75 | return self.stack(c) + self.skip_layer(c) 76 | -------------------------------------------------------------------------------- /NeuralSeq/modules/parallel_wavegan/losses/__init__.py: -------------------------------------------------------------------------------- 1 | from .stft_loss import * # NOQA 2 | -------------------------------------------------------------------------------- /NeuralSeq/modules/parallel_wavegan/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .melgan import * # NOQA 2 | from .parallel_wavegan import * # NOQA 3 | -------------------------------------------------------------------------------- /NeuralSeq/modules/parallel_wavegan/optimizers/__init__.py: -------------------------------------------------------------------------------- 1 | from torch.optim import * # NOQA 2 | from .radam import * # NOQA 3 | -------------------------------------------------------------------------------- /NeuralSeq/modules/parallel_wavegan/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .utils import * # NOQA 2 | -------------------------------------------------------------------------------- /NeuralSeq/tasks/run.py: -------------------------------------------------------------------------------- 1 | import importlib 2 | from utils.hparams import set_hparams, hparams 3 | 4 | 5 | def run_task(): 6 | assert hparams['task_cls'] != '' 7 | pkg = ".".join(hparams["task_cls"].split(".")[:-1]) 8 | cls_name = hparams["task_cls"].split(".")[-1] 9 | task_cls = getattr(importlib.import_module(pkg), cls_name) 10 | task_cls.start() 11 | 12 | 13 | if __name__ == '__main__': 14 | set_hparams() 15 | run_task() 16 | -------------------------------------------------------------------------------- /NeuralSeq/tasks/svs/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIGC-Audio/AudioGPT/a674543c537bb3a60b8f521df76b97a627c1c379/NeuralSeq/tasks/svs/__init__.py -------------------------------------------------------------------------------- /NeuralSeq/tasks/tts/synta.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import torch.nn.functional as F 4 | from torch import nn 5 | 6 | from modules.tts.syntaspeech.syntaspeech 
import SyntaSpeech 7 | from tasks.tts.ps_adv import PortaSpeechAdvTask 8 | from utils.hparams import hparams 9 | 10 | 11 | class SyntaSpeechTask(PortaSpeechAdvTask): 12 | def build_tts_model(self): 13 | ph_dict_size = len(self.token_encoder) 14 | word_dict_size = len(self.word_encoder) 15 | self.model = SyntaSpeech(ph_dict_size, word_dict_size, hparams) 16 | 17 | self.gen_params = [p for p in self.model.parameters() if p.requires_grad] 18 | self.dp_params = [p for k, p in self.model.named_parameters() if (('dur_predictor' in k) and p.requires_grad)] 19 | self.gen_params_except_dp = [p for k, p in self.model.named_parameters() if (('dur_predictor' not in k) and p.requires_grad)] 20 | self.bert_params = [p for k, p in self.model.named_parameters() if (('bert' in k) and p.requires_grad)] 21 | self.gen_params_except_bert_and_dp = [p for k, p in self.model.named_parameters() if ('dur_predictor' not in k) and ('bert' not in k) and p.requires_grad ] 22 | 23 | self.use_bert = True if len(self.bert_params) > 0 else False 24 | 25 | -------------------------------------------------------------------------------- /NeuralSeq/tasks/tts/tts_utils.py: -------------------------------------------------------------------------------- 1 | import importlib 2 | 3 | from data_gen.tts.base_binarizer import BaseBinarizer 4 | from data_gen.tts.base_preprocess import BasePreprocessor 5 | from data_gen.tts.txt_processors.base_text_processor import get_txt_processor_cls 6 | from utils.hparams import hparams 7 | 8 | 9 | def parse_dataset_configs(): 10 | max_tokens = hparams['max_tokens'] 11 | max_sentences = hparams['max_sentences'] 12 | max_valid_tokens = hparams['max_valid_tokens'] 13 | if max_valid_tokens == -1: 14 | hparams['max_valid_tokens'] = max_valid_tokens = max_tokens 15 | max_valid_sentences = hparams['max_valid_sentences'] 16 | if max_valid_sentences == -1: 17 | hparams['max_valid_sentences'] = max_valid_sentences = max_sentences 18 | return max_tokens, max_sentences, max_valid_tokens, max_valid_sentences 19 | 20 | 21 | def parse_mel_losses(): 22 | mel_losses = hparams['mel_losses'].split("|") 23 | loss_and_lambda = {} 24 | for i, l in enumerate(mel_losses): 25 | if l == '': 26 | continue 27 | if ':' in l: 28 | l, lbd = l.split(":") 29 | lbd = float(lbd) 30 | else: 31 | lbd = 1.0 32 | loss_and_lambda[l] = lbd 33 | print("| Mel losses:", loss_and_lambda) 34 | return loss_and_lambda 35 | 36 | 37 | def load_data_preprocessor(): 38 | preprocess_cls = hparams["preprocess_cls"] 39 | pkg = ".".join(preprocess_cls.split(".")[:-1]) 40 | cls_name = preprocess_cls.split(".")[-1] 41 | preprocessor: BasePreprocessor = getattr(importlib.import_module(pkg), cls_name)() 42 | preprocess_args = {} 43 | preprocess_args.update(hparams['preprocess_args']) 44 | return preprocessor, preprocess_args 45 | 46 | 47 | def load_data_binarizer(): 48 | binarizer_cls = hparams['binarizer_cls'] 49 | pkg = ".".join(binarizer_cls.split(".")[:-1]) 50 | cls_name = binarizer_cls.split(".")[-1] 51 | binarizer: BaseBinarizer = getattr(importlib.import_module(pkg), cls_name)() 52 | binarization_args = {} 53 | binarization_args.update(hparams['binarization_args']) 54 | return binarizer, binarization_args -------------------------------------------------------------------------------- /NeuralSeq/tasks/vocoder/vocoder_base.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import torch 4 | import torch.distributed as dist 5 | from torch.utils.data import DistributedSampler 6 | 7 | from 
tasks.base_task import BaseTask 8 | from tasks.base_task import data_loader 9 | from tasks.vocoder.dataset_utils import VocoderDataset, EndlessDistributedSampler 10 | from utils.hparams import hparams 11 | 12 | 13 | class VocoderBaseTask(BaseTask): 14 | def __init__(self): 15 | super(VocoderBaseTask, self).__init__() 16 | self.max_sentences = hparams['max_sentences'] 17 | self.max_valid_sentences = hparams['max_valid_sentences'] 18 | if self.max_valid_sentences == -1: 19 | hparams['max_valid_sentences'] = self.max_valid_sentences = self.max_sentences 20 | self.dataset_cls = VocoderDataset 21 | 22 | @data_loader 23 | def train_dataloader(self): 24 | train_dataset = self.dataset_cls('train', shuffle=True) 25 | return self.build_dataloader(train_dataset, True, self.max_sentences, hparams['endless_ds']) 26 | 27 | @data_loader 28 | def val_dataloader(self): 29 | valid_dataset = self.dataset_cls('valid', shuffle=False) 30 | return self.build_dataloader(valid_dataset, False, self.max_valid_sentences) 31 | 32 | @data_loader 33 | def test_dataloader(self): 34 | test_dataset = self.dataset_cls('test', shuffle=False) 35 | return self.build_dataloader(test_dataset, False, self.max_valid_sentences) 36 | 37 | def build_dataloader(self, dataset, shuffle, max_sentences, endless=False): 38 | world_size = 1 39 | rank = 0 40 | if dist.is_initialized(): 41 | world_size = dist.get_world_size() 42 | rank = dist.get_rank() 43 | sampler_cls = DistributedSampler if not endless else EndlessDistributedSampler 44 | train_sampler = sampler_cls( 45 | dataset=dataset, 46 | num_replicas=world_size, 47 | rank=rank, 48 | shuffle=shuffle, 49 | ) 50 | return torch.utils.data.DataLoader( 51 | dataset=dataset, 52 | shuffle=False, 53 | collate_fn=dataset.collater, 54 | batch_size=max_sentences, 55 | num_workers=dataset.num_workers, 56 | sampler=train_sampler, 57 | pin_memory=True, 58 | ) 59 | 60 | def test_start(self): 61 | self.gen_dir = os.path.join(hparams['work_dir'], 62 | f'generated_{self.trainer.global_step}_{hparams["gen_dir_name"]}') 63 | os.makedirs(self.gen_dir, exist_ok=True) 64 | 65 | def test_end(self, outputs): 66 | return {} 67 | -------------------------------------------------------------------------------- /NeuralSeq/utils/ckpt_utils.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import logging 3 | import os 4 | import re 5 | import torch 6 | 7 | 8 | def get_last_checkpoint(work_dir, steps=None): 9 | checkpoint = None 10 | last_ckpt_path = None 11 | ckpt_paths = get_all_ckpts(work_dir, steps) 12 | if len(ckpt_paths) > 0: 13 | last_ckpt_path = ckpt_paths[0] 14 | checkpoint = torch.load(last_ckpt_path, map_location='cpu') 15 | logging.info(f'load module from checkpoint: {last_ckpt_path}') 16 | return checkpoint, last_ckpt_path 17 | 18 | 19 | def get_all_ckpts(work_dir, steps=None): 20 | if steps is None: 21 | ckpt_path_pattern = f'{work_dir}/model_ckpt_steps_*.ckpt' 22 | else: 23 | ckpt_path_pattern = f'{work_dir}/model_ckpt_steps_{steps}.ckpt' 24 | return sorted(glob.glob(ckpt_path_pattern), 25 | key=lambda x: -int(re.findall('.*steps\_(\d+)\.ckpt', x)[0])) 26 | 27 | 28 | def load_ckpt(cur_model, ckpt_base_dir, model_name='model', force=True, strict=True): 29 | if os.path.isfile(ckpt_base_dir): 30 | base_dir = os.path.dirname(ckpt_base_dir) 31 | ckpt_path = ckpt_base_dir 32 | checkpoint = torch.load(ckpt_base_dir, map_location='cpu') 33 | else: 34 | base_dir = ckpt_base_dir 35 | checkpoint, ckpt_path = get_last_checkpoint(ckpt_base_dir) 36 | if 
checkpoint is not None: 37 | state_dict = checkpoint["state_dict"] 38 | if len([k for k in state_dict.keys() if '.' in k]) > 0: 39 | state_dict = {k[len(model_name) + 1:]: v for k, v in state_dict.items() 40 | if k.startswith(f'{model_name}.')} 41 | else: 42 | if '.' not in model_name: 43 | state_dict = state_dict[model_name] 44 | else: 45 | base_model_name = model_name.split('.')[0] 46 | rest_model_name = model_name[len(base_model_name) + 1:] 47 | state_dict = { 48 | k[len(rest_model_name) + 1:]: v for k, v in state_dict[base_model_name].items() 49 | if k.startswith(f'{rest_model_name}.')} 50 | if not strict: 51 | cur_model_state_dict = cur_model.state_dict() 52 | unmatched_keys = [] 53 | for key, param in state_dict.items(): 54 | if key in cur_model_state_dict: 55 | new_param = cur_model_state_dict[key] 56 | if new_param.shape != param.shape: 57 | unmatched_keys.append(key) 58 | print("| Unmatched keys: ", key, new_param.shape, param.shape) 59 | for key in unmatched_keys: 60 | del state_dict[key] 61 | cur_model.load_state_dict(state_dict, strict=strict) 62 | print(f"| load '{model_name}' from '{ckpt_path}'.") 63 | else: 64 | e_msg = f"| ckpt not found in {base_dir}." 65 | if force: 66 | assert False, e_msg 67 | else: 68 | print(e_msg) 69 | -------------------------------------------------------------------------------- /NeuralSeq/utils/indexed_datasets.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | from copy import deepcopy 3 | 4 | import numpy as np 5 | 6 | 7 | class IndexedDataset: 8 | def __init__(self, path, num_cache=1): 9 | super().__init__() 10 | self.path = path 11 | self.data_file = None 12 | self.data_offsets = np.load(f"{path}.idx", allow_pickle=True).item()['offsets'] 13 | self.data_file = open(f"{path}.data", 'rb', buffering=-1) 14 | self.cache = [] 15 | self.num_cache = num_cache 16 | 17 | def check_index(self, i): 18 | if i < 0 or i >= len(self.data_offsets) - 1: 19 | raise IndexError('index out of range') 20 | 21 | def __del__(self): 22 | if self.data_file: 23 | self.data_file.close() 24 | 25 | def __getitem__(self, i): 26 | self.check_index(i) 27 | if self.num_cache > 0: 28 | for c in self.cache: 29 | if c[0] == i: 30 | return c[1] 31 | self.data_file.seek(self.data_offsets[i]) 32 | b = self.data_file.read(self.data_offsets[i + 1] - self.data_offsets[i]) 33 | item = pickle.loads(b) 34 | if self.num_cache > 0: 35 | self.cache = [(i, deepcopy(item))] + self.cache[:-1] 36 | return item 37 | 38 | def __len__(self): 39 | return len(self.data_offsets) - 1 40 | 41 | class IndexedDatasetBuilder: 42 | def __init__(self, path): 43 | self.path = path 44 | self.out_file = open(f"{path}.data", 'wb') 45 | self.byte_offsets = [0] 46 | 47 | def add_item(self, item): 48 | s = pickle.dumps(item) 49 | bytes = self.out_file.write(s) 50 | self.byte_offsets.append(self.byte_offsets[-1] + bytes) 51 | 52 | def finalize(self): 53 | self.out_file.close() 54 | np.save(open(f"{self.path}.idx", 'wb'), {'offsets': self.byte_offsets}) 55 | 56 | 57 | if __name__ == "__main__": 58 | import random 59 | from tqdm import tqdm 60 | ds_path = '/tmp/indexed_ds_example' 61 | size = 100 62 | items = [{"a": np.random.normal(size=[10000, 10]), 63 | "b": np.random.normal(size=[10000, 10])} for i in range(size)] 64 | builder = IndexedDatasetBuilder(ds_path) 65 | for i in tqdm(range(size)): 66 | builder.add_item(items[i]) 67 | builder.finalize() 68 | ds = IndexedDataset(ds_path) 69 | for i in tqdm(range(10000)): 70 | idx = random.randint(0, size - 1) 71 | 
assert (ds[idx]['a'] == items[idx]['a']).all() 72 | -------------------------------------------------------------------------------- /NeuralSeq/utils/multiprocess_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import traceback 3 | from multiprocessing import Queue, Process 4 | 5 | 6 | def chunked_worker(worker_id, map_func, args, results_queue=None, init_ctx_func=None): 7 | ctx = init_ctx_func(worker_id) if init_ctx_func is not None else None 8 | for job_idx, arg in args: 9 | try: 10 | if ctx is not None: 11 | res = map_func(*arg, ctx=ctx) 12 | else: 13 | res = map_func(*arg) 14 | results_queue.put((job_idx, res)) 15 | except: 16 | traceback.print_exc() 17 | results_queue.put((job_idx, None)) 18 | 19 | def chunked_multiprocess_run(map_func, args, num_workers=None, ordered=True, init_ctx_func=None, q_max_size=1000): 20 | args = zip(range(len(args)), args) 21 | args = list(args) 22 | n_jobs = len(args) 23 | if num_workers is None: 24 | num_workers = int(os.getenv('N_PROC', os.cpu_count())) 25 | results_queues = [] 26 | if ordered: 27 | for i in range(num_workers): 28 | results_queues.append(Queue(maxsize=q_max_size // num_workers)) 29 | else: 30 | results_queue = Queue(maxsize=q_max_size) 31 | for i in range(num_workers): 32 | results_queues.append(results_queue) 33 | workers = [] 34 | for i in range(num_workers): 35 | args_worker = args[i::num_workers] 36 | p = Process(target=chunked_worker, args=( 37 | i, map_func, args_worker, results_queues[i], init_ctx_func), daemon=True) 38 | workers.append(p) 39 | p.start() 40 | for n_finished in range(n_jobs): 41 | results_queue = results_queues[n_finished % num_workers] 42 | job_idx, res = results_queue.get() 43 | assert job_idx == n_finished or not ordered, (job_idx, n_finished) 44 | yield res 45 | for w in workers: 46 | w.join() 47 | w.close() 48 | 49 | def multiprocess_run_tqdm(map_func, args, num_workers=None, ordered=True, init_ctx_func=None, 50 | multithread=False, desc=None): 51 | for i, res in tqdm(enumerate( 52 | multiprocess_run(map_func, args, num_workers, ordered, init_ctx_func, multithread)), 53 | total=len(args), desc=desc): 54 | yield i, res -------------------------------------------------------------------------------- /NeuralSeq/utils/os_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import subprocess 3 | 4 | 5 | def link_file(from_file, to_file): 6 | subprocess.check_call( 7 | f'ln -s "`realpath --relative-to="{os.path.dirname(to_file)}" "{from_file}"`" "{to_file}"', shell=True) 8 | 9 | 10 | def move_file(from_file, to_file): 11 | subprocess.check_call(f'mv "{from_file}" "{to_file}"', shell=True) 12 | 13 | 14 | def copy_file(from_file, to_file): 15 | subprocess.check_call(f'cp -r "{from_file}" "{to_file}"', shell=True) 16 | 17 | 18 | def remove_file(*fns): 19 | for f in fns: 20 | subprocess.check_call(f'rm -rf "{f}"', shell=True) -------------------------------------------------------------------------------- /NeuralSeq/utils/pitch_utils.py: -------------------------------------------------------------------------------- 1 | ######### 2 | # world 3 | ########## 4 | import librosa 5 | import numpy as np 6 | import torch 7 | 8 | gamma = 0 9 | mcepInput = 3 # 0 for dB, 3 for magnitude 10 | alpha = 0.45 11 | en_floor = 10 ** (-80 / 20) 12 | FFT_SIZE = 2048 13 | 14 | 15 | f0_bin = 256 16 | f0_max = 1100.0 17 | f0_min = 50.0 18 | f0_mel_min = 1127 * np.log(1 + f0_min / 700) 19 | f0_mel_max = 1127 * np.log(1 + 
f0_max / 700) 20 | 21 | 22 | def f0_to_coarse(f0): 23 | is_torch = isinstance(f0, torch.Tensor) 24 | f0_mel = 1127 * (1 + f0 / 700).log() if is_torch else 1127 * np.log(1 + f0 / 700) 25 | f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * (f0_bin - 2) / (f0_mel_max - f0_mel_min) + 1 26 | 27 | f0_mel[f0_mel <= 1] = 1 28 | f0_mel[f0_mel > f0_bin - 1] = f0_bin - 1 29 | f0_coarse = (f0_mel + 0.5).long() if is_torch else np.rint(f0_mel).astype(np.int) 30 | assert f0_coarse.max() <= 255 and f0_coarse.min() >= 1, (f0_coarse.max(), f0_coarse.min()) 31 | return f0_coarse 32 | 33 | 34 | def norm_f0(f0, uv, hparams): 35 | is_torch = isinstance(f0, torch.Tensor) 36 | if hparams['pitch_norm'] == 'standard': 37 | f0 = (f0 - hparams['f0_mean']) / hparams['f0_std'] 38 | if hparams['pitch_norm'] == 'log': 39 | f0 = torch.log2(f0) if is_torch else np.log2(f0) 40 | if uv is not None and hparams['use_uv']: 41 | f0[uv > 0] = 0 42 | return f0 43 | 44 | 45 | def norm_interp_f0(f0, hparams): 46 | is_torch = isinstance(f0, torch.Tensor) 47 | if is_torch: 48 | device = f0.device 49 | f0 = f0.data.cpu().numpy() 50 | uv = f0 == 0 51 | f0 = norm_f0(f0, uv, hparams) 52 | if sum(uv) == len(f0): 53 | f0[uv] = 0 54 | elif sum(uv) > 0: 55 | f0[uv] = np.interp(np.where(uv)[0], np.where(~uv)[0], f0[~uv]) 56 | uv = torch.FloatTensor(uv) 57 | f0 = torch.FloatTensor(f0) 58 | if is_torch: 59 | f0 = f0.to(device) 60 | return f0, uv 61 | 62 | 63 | def denorm_f0(f0, uv, hparams, pitch_padding=None, min=None, max=None): 64 | if hparams['pitch_norm'] == 'standard': 65 | f0 = f0 * hparams['f0_std'] + hparams['f0_mean'] 66 | if hparams['pitch_norm'] == 'log': 67 | f0 = 2 ** f0 68 | if min is not None: 69 | f0 = f0.clamp(min=min) 70 | if max is not None: 71 | f0 = f0.clamp(max=max) 72 | if uv is not None and hparams['use_uv']: 73 | f0[uv > 0] = 0 74 | if pitch_padding is not None: 75 | f0[pitch_padding] = 0 76 | return f0 77 | -------------------------------------------------------------------------------- /NeuralSeq/utils/plot.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import numpy as np 3 | import torch 4 | 5 | LINE_COLORS = ['w', 'r', 'y', 'cyan', 'm', 'b', 'lime'] 6 | 7 | 8 | def spec_to_figure(spec, vmin=None, vmax=None): 9 | if isinstance(spec, torch.Tensor): 10 | spec = spec.cpu().numpy() 11 | fig = plt.figure(figsize=(12, 6)) 12 | plt.pcolor(spec.T, vmin=vmin, vmax=vmax) 13 | return fig 14 | 15 | 16 | def spec_f0_to_figure(spec, f0s, figsize=None): 17 | max_y = spec.shape[1] 18 | if isinstance(spec, torch.Tensor): 19 | spec = spec.detach().cpu().numpy() 20 | f0s = {k: f0.detach().cpu().numpy() for k, f0 in f0s.items()} 21 | f0s = {k: f0 / 10 for k, f0 in f0s.items()} 22 | fig = plt.figure(figsize=(12, 6) if figsize is None else figsize) 23 | plt.pcolor(spec.T) 24 | for i, (k, f0) in enumerate(f0s.items()): 25 | plt.plot(f0.clip(0, max_y), label=k, c=LINE_COLORS[i], linewidth=1, alpha=0.8) 26 | plt.legend() 27 | return fig 28 | 29 | 30 | def dur_to_figure(dur_gt, dur_pred, txt): 31 | dur_gt = dur_gt.long().cpu().numpy() 32 | dur_pred = dur_pred.long().cpu().numpy() 33 | dur_gt = np.cumsum(dur_gt) 34 | dur_pred = np.cumsum(dur_pred) 35 | fig = plt.figure(figsize=(12, 6)) 36 | for i in range(len(dur_gt)): 37 | shift = (i % 8) + 1 38 | plt.text(dur_gt[i], shift, txt[i]) 39 | plt.text(dur_pred[i], 10 + shift, txt[i]) 40 | plt.vlines(dur_gt[i], 0, 10, colors='b') # blue is gt 41 | plt.vlines(dur_pred[i], 10, 20, colors='r') # red is pred 42 | 
return fig 43 | 44 | 45 | def f0_to_figure(f0_gt, f0_cwt=None, f0_pred=None): 46 | fig = plt.figure() 47 | f0_gt = f0_gt.cpu().numpy() 48 | plt.plot(f0_gt, color='r', label='gt') 49 | if f0_cwt is not None: 50 | f0_cwt = f0_cwt.cpu().numpy() 51 | plt.plot(f0_cwt, color='b', label='cwt') 52 | if f0_pred is not None: 53 | f0_pred = f0_pred.cpu().numpy() 54 | plt.plot(f0_pred, color='green', label='pred') 55 | plt.legend() 56 | return fig 57 | -------------------------------------------------------------------------------- /NeuralSeq/utils/training_utils.py: -------------------------------------------------------------------------------- 1 | from utils.hparams import hparams 2 | 3 | 4 | class RSQRTSchedule(object): 5 | def __init__(self, optimizer): 6 | super().__init__() 7 | self.optimizer = optimizer 8 | self.constant_lr = hparams['lr'] 9 | self.warmup_updates = hparams['warmup_updates'] 10 | self.hidden_size = hparams['hidden_size'] 11 | self.lr = hparams['lr'] 12 | for param_group in optimizer.param_groups: 13 | param_group['lr'] = self.lr 14 | self.step(0) 15 | 16 | def step(self, num_updates): 17 | constant_lr = self.constant_lr 18 | warmup = min(num_updates / self.warmup_updates, 1.0) 19 | rsqrt_decay = max(self.warmup_updates, num_updates) ** -0.5 20 | rsqrt_hidden = self.hidden_size ** -0.5 21 | self.lr = max(constant_lr * warmup * rsqrt_decay * rsqrt_hidden, 1e-7) 22 | for param_group in self.optimizer.param_groups: 23 | param_group['lr'] = self.lr 24 | return self.lr 25 | 26 | def get_lr(self): 27 | return self.optimizer.param_groups[0]['lr'] 28 | -------------------------------------------------------------------------------- /NeuralSeq/vocoders/__init__.py: -------------------------------------------------------------------------------- 1 | from vocoders import hifigan 2 | -------------------------------------------------------------------------------- /NeuralSeq/vocoders/base_vocoder.py: -------------------------------------------------------------------------------- 1 | import importlib 2 | VOCODERS = {} 3 | 4 | 5 | def register_vocoder(cls): 6 | VOCODERS[cls.__name__.lower()] = cls 7 | VOCODERS[cls.__name__] = cls 8 | return cls 9 | 10 | 11 | def get_vocoder_cls(hparams): 12 | if hparams['vocoder'] in VOCODERS: 13 | return VOCODERS[hparams['vocoder']] 14 | else: 15 | vocoder_cls = hparams['vocoder'] 16 | pkg = ".".join(vocoder_cls.split(".")[:-1]) 17 | cls_name = vocoder_cls.split(".")[-1] 18 | vocoder_cls = getattr(importlib.import_module(pkg), cls_name) 19 | return vocoder_cls 20 | 21 | 22 | class BaseVocoder: 23 | def spec2wav(self, mel): 24 | """ 25 | 26 | :param mel: [T, 80] 27 | :return: wav: [T'] 28 | """ 29 | 30 | raise NotImplementedError 31 | 32 | @staticmethod 33 | def wav2spec(wav_fn): 34 | """ 35 | 36 | :param wav_fn: str 37 | :return: wav, mel: [T, 80] 38 | """ 39 | raise NotImplementedError 40 | -------------------------------------------------------------------------------- /NeuralSeq/vocoders/hifigan.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import json 3 | import os 4 | import re 5 | 6 | import librosa 7 | import torch 8 | 9 | import utils 10 | from modules.hifigan.hifigan import HifiGanGenerator 11 | from utils.hparams import hparams, set_hparams 12 | from vocoders.base_vocoder import register_vocoder 13 | from vocoders.pwg import PWG 14 | from vocoders.vocoder_utils import denoise 15 | 16 | 17 | def load_model(config_path, checkpoint_path): 18 | device = torch.device("cuda" if 
torch.cuda.is_available() else "cpu") 19 | ckpt_dict = torch.load(checkpoint_path, map_location="cpu") 20 | if '.yaml' in config_path: 21 | config = set_hparams(config_path, global_hparams=False) 22 | state = ckpt_dict["state_dict"]["model_gen"] 23 | elif '.json' in config_path: 24 | config = json.load(open(config_path, 'r')) 25 | state = ckpt_dict["generator"] 26 | 27 | model = HifiGanGenerator(config) 28 | model.load_state_dict(state, strict=True) 29 | model.remove_weight_norm() 30 | model = model.eval().to(device) 31 | print(f"| Loaded model parameters from {checkpoint_path}.") 32 | print(f"| HifiGAN device: {device}.") 33 | return model, config, device 34 | 35 | 36 | total_time = 0 37 | 38 | 39 | @register_vocoder 40 | class HifiGAN(PWG): 41 | def __init__(self): 42 | base_dir = hparams['vocoder_ckpt'] 43 | config_path = f'{base_dir}/config.yaml' 44 | if os.path.exists(config_path): 45 | ckpt = sorted(glob.glob(f'{base_dir}/model_ckpt_steps_*.ckpt'), key= 46 | lambda x: int(re.findall(f'{base_dir}/model_ckpt_steps_(\d+).ckpt', x)[0]))[-1] 47 | print('| load HifiGAN: ', ckpt) 48 | self.model, self.config, self.device = load_model(config_path=config_path, checkpoint_path=ckpt) 49 | else: 50 | config_path = f'{base_dir}/config.json' 51 | ckpt = f'{base_dir}/generator_v1' 52 | if os.path.exists(config_path): 53 | self.model, self.config, self.device = load_model(config_path=config_path, checkpoint_path=ckpt) 54 | 55 | def spec2wav(self, mel, **kwargs): 56 | device = self.device 57 | with torch.no_grad(): 58 | c = torch.FloatTensor(mel).unsqueeze(0).transpose(2, 1).to(device) 59 | with utils.Timer('hifigan', print_time=hparams['profile_infer']): 60 | f0 = kwargs.get('f0') 61 | if f0 is not None and hparams.get('use_nsf'): 62 | f0 = torch.FloatTensor(f0[None, :]).to(device) 63 | y = self.model(c, f0).view(-1) 64 | else: 65 | y = self.model(c).view(-1) 66 | wav_out = y.cpu().numpy() 67 | if hparams.get('vocoder_denoise_c', 0.0) > 0: 68 | wav_out = denoise(wav_out, v=hparams['vocoder_denoise_c']) 69 | return wav_out 70 | 71 | # @staticmethod 72 | # def wav2spec(wav_fn, **kwargs): 73 | # wav, _ = librosa.core.load(wav_fn, sr=hparams['audio_sample_rate']) 74 | # wav_torch = torch.FloatTensor(wav)[None, :] 75 | # mel = mel_spectrogram(wav_torch, hparams).numpy()[0] 76 | # return wav, mel.T 77 | -------------------------------------------------------------------------------- /NeuralSeq/vocoders/vocoder_utils.py: -------------------------------------------------------------------------------- 1 | import librosa 2 | 3 | from utils.hparams import hparams 4 | import numpy as np 5 | 6 | 7 | def denoise(wav, v=0.1): 8 | spec = librosa.stft(y=wav, n_fft=hparams['fft_size'], hop_length=hparams['hop_size'], 9 | win_length=hparams['win_size'], pad_mode='constant') 10 | spec_m = np.abs(spec) 11 | spec_m = np.clip(spec_m - v, a_min=0, a_max=None) 12 | spec_a = np.angle(spec) 13 | 14 | return librosa.istft(spec_m * np.exp(1j * spec_a), hop_length=hparams['hop_size'], 15 | win_length=hparams['win_size']) 16 | -------------------------------------------------------------------------------- /assets/2bf90e35.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIGC-Audio/AudioGPT/a674543c537bb3a60b8f521df76b97a627c1c379/assets/2bf90e35.wav -------------------------------------------------------------------------------- /assets/5d67d1b9.wav: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/AIGC-Audio/AudioGPT/a674543c537bb3a60b8f521df76b97a627c1c379/assets/5d67d1b9.wav -------------------------------------------------------------------------------- /assets/7cb0d24f.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIGC-Audio/AudioGPT/a674543c537bb3a60b8f521df76b97a627c1c379/assets/7cb0d24f.wav -------------------------------------------------------------------------------- /assets/7ef0ec0b.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIGC-Audio/AudioGPT/a674543c537bb3a60b8f521df76b97a627c1c379/assets/7ef0ec0b.wav -------------------------------------------------------------------------------- /assets/Track 4.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIGC-Audio/AudioGPT/a674543c537bb3a60b8f521df76b97a627c1c379/assets/Track 4.wav -------------------------------------------------------------------------------- /assets/a-group-of-sheep-are-baaing.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIGC-Audio/AudioGPT/a674543c537bb3a60b8f521df76b97a627c1c379/assets/a-group-of-sheep-are-baaing.wav -------------------------------------------------------------------------------- /assets/a2i.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIGC-Audio/AudioGPT/a674543c537bb3a60b8f521df76b97a627c1c379/assets/a2i.png -------------------------------------------------------------------------------- /assets/asr.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIGC-Audio/AudioGPT/a674543c537bb3a60b8f521df76b97a627c1c379/assets/asr.png -------------------------------------------------------------------------------- /assets/b973e878.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIGC-Audio/AudioGPT/a674543c537bb3a60b8f521df76b97a627c1c379/assets/b973e878.wav -------------------------------------------------------------------------------- /assets/detection.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIGC-Audio/AudioGPT/a674543c537bb3a60b8f521df76b97a627c1c379/assets/detection.png -------------------------------------------------------------------------------- /assets/drums-and-music-playing-with-a-man-speaking.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIGC-Audio/AudioGPT/a674543c537bb3a60b8f521df76b97a627c1c379/assets/drums-and-music-playing-with-a-man-speaking.wav -------------------------------------------------------------------------------- /assets/fd5cf55e.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIGC-Audio/AudioGPT/a674543c537bb3a60b8f521df76b97a627c1c379/assets/fd5cf55e.wav -------------------------------------------------------------------------------- /assets/i2a-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIGC-Audio/AudioGPT/a674543c537bb3a60b8f521df76b97a627c1c379/assets/i2a-1.png 
-------------------------------------------------------------------------------- /assets/i2a-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIGC-Audio/AudioGPT/a674543c537bb3a60b8f521df76b97a627c1c379/assets/i2a-2.png -------------------------------------------------------------------------------- /assets/inpaint-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIGC-Audio/AudioGPT/a674543c537bb3a60b8f521df76b97a627c1c379/assets/inpaint-1.png -------------------------------------------------------------------------------- /assets/inpaint-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIGC-Audio/AudioGPT/a674543c537bb3a60b8f521df76b97a627c1c379/assets/inpaint-2.png -------------------------------------------------------------------------------- /assets/m2b.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIGC-Audio/AudioGPT/a674543c537bb3a60b8f521df76b97a627c1c379/assets/m2b.png -------------------------------------------------------------------------------- /assets/mix1.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIGC-Audio/AudioGPT/a674543c537bb3a60b8f521df76b97a627c1c379/assets/mix1.wav -------------------------------------------------------------------------------- /assets/sound_extraction.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIGC-Audio/AudioGPT/a674543c537bb3a60b8f521df76b97a627c1c379/assets/sound_extraction.png -------------------------------------------------------------------------------- /assets/style_transfer_tts.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIGC-Audio/AudioGPT/a674543c537bb3a60b8f521df76b97a627c1c379/assets/style_transfer_tts.png -------------------------------------------------------------------------------- /assets/t2a.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIGC-Audio/AudioGPT/a674543c537bb3a60b8f521df76b97a627c1c379/assets/t2a.png -------------------------------------------------------------------------------- /assets/t2i.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIGC-Audio/AudioGPT/a674543c537bb3a60b8f521df76b97a627c1c379/assets/t2i.png -------------------------------------------------------------------------------- /assets/t2s.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIGC-Audio/AudioGPT/a674543c537bb3a60b8f521df76b97a627c1c379/assets/t2s.png -------------------------------------------------------------------------------- /assets/tsd.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIGC-Audio/AudioGPT/a674543c537bb3a60b8f521df76b97a627c1c379/assets/tsd.png -------------------------------------------------------------------------------- /assets/tts.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIGC-Audio/AudioGPT/a674543c537bb3a60b8f521df76b97a627c1c379/assets/tts.png 
-------------------------------------------------------------------------------- /audio_detection/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIGC-Audio/AudioGPT/a674543c537bb3a60b8f521df76b97a627c1c379/audio_detection/__init__.py -------------------------------------------------------------------------------- /audio_detection/audio_infer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIGC-Audio/AudioGPT/a674543c537bb3a60b8f521df76b97a627c1c379/audio_detection/audio_infer/__init__.py -------------------------------------------------------------------------------- /audio_detection/audio_infer/pytorch/evaluate.py: -------------------------------------------------------------------------------- 1 | from sklearn import metrics 2 | 3 | from pytorch_utils import forward 4 | 5 | 6 | class Evaluator(object): 7 | def __init__(self, model): 8 | """Evaluator. 9 | 10 | Args: 11 | model: object 12 | """ 13 | self.model = model 14 | 15 | def evaluate(self, data_loader): 16 | """Forward evaluation data and calculate statistics. 17 | 18 | Args: 19 | data_loader: object 20 | 21 | Returns: 22 | statistics: dict, 23 | {'average_precision': (classes_num,), 'auc': (classes_num,)} 24 | """ 25 | 26 | # Forward 27 | output_dict = forward( 28 | model=self.model, 29 | generator=data_loader, 30 | return_target=True) 31 | 32 | clipwise_output = output_dict['clipwise_output'] # (audios_num, classes_num) 33 | target = output_dict['target'] # (audios_num, classes_num) 34 | 35 | average_precision = metrics.average_precision_score( 36 | target, clipwise_output, average=None) 37 | 38 | auc = metrics.roc_auc_score(target, clipwise_output, average=None) 39 | 40 | statistics = {'average_precision': average_precision, 'auc': auc} 41 | 42 | return statistics -------------------------------------------------------------------------------- /audio_detection/audio_infer/pytorch/losses.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | 4 | 5 | def clip_bce(output_dict, target_dict): 6 | """Binary crossentropy loss. 
7 | """ 8 | return F.binary_cross_entropy( 9 | output_dict['clipwise_output'], target_dict['target']) 10 | 11 | 12 | def get_loss_func(loss_type): 13 | if loss_type == 'clip_bce': 14 | return clip_bce -------------------------------------------------------------------------------- /audio_detection/audio_infer/results/YDlWd7Wmdi1E.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIGC-Audio/AudioGPT/a674543c537bb3a60b8f521df76b97a627c1c379/audio_detection/audio_infer/results/YDlWd7Wmdi1E.png -------------------------------------------------------------------------------- /audio_detection/audio_infer/utils/crash.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | class ExceptionHook: 4 | instance = None 5 | def __call__(self, *args, **kwargs): 6 | if self.instance is None: 7 | from IPython.core import ultratb 8 | self.instance = ultratb.FormattedTB(mode='Plain', 9 | color_scheme='Linux', call_pdb=1) 10 | return self.instance(*args, **kwargs) 11 | 12 | sys.excepthook = ExceptionHook() 13 | -------------------------------------------------------------------------------- /audio_detection/audio_infer/utils/create_black_list.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import csv 3 | import os 4 | 5 | from utilities import create_folder 6 | 7 | 8 | def dcase2017task4(args): 9 | """Create black list. Black list is a list of audio ids that will be 10 | skipped in training. 11 | """ 12 | 13 | # Augments & parameters 14 | workspace = args.workspace 15 | 16 | # Black list from DCASE 2017 Task 4 17 | test_weak_csv = 'metadata/black_list/groundtruth_weak_label_testing_set.csv' 18 | evaluation_weak_csv = 'metadata/black_list/groundtruth_weak_label_evaluation_set.csv' 19 | 20 | black_list_csv = os.path.join(workspace, 'black_list', 'dcase2017task4.csv') 21 | create_folder(os.path.dirname(black_list_csv)) 22 | 23 | def get_id_sets(csv_path): 24 | with open(csv_path, 'r') as fr: 25 | reader = csv.reader(fr, delimiter='\t') 26 | lines = list(reader) 27 | 28 | ids_set = [] 29 | 30 | for line in lines: 31 | """line: ['-5QrBL6MzLg_60.000_70.000.wav', '60.000', '70.000', 'Train horn']""" 32 | ids_set.append(line[0][0 : 11]) 33 | 34 | ids_set = list(set(ids_set)) 35 | return ids_set 36 | 37 | test_ids_set = get_id_sets(test_weak_csv) 38 | evaluation_ids_set = get_id_sets(evaluation_weak_csv) 39 | 40 | full_ids_set = test_ids_set + evaluation_ids_set 41 | 42 | # Write black list 43 | fw = open(black_list_csv, 'w') 44 | 45 | for id in full_ids_set: 46 | fw.write('{}\n'.format(id)) 47 | 48 | print('Write black list to {}'.format(black_list_csv)) 49 | 50 | 51 | if __name__ == '__main__': 52 | parser = argparse.ArgumentParser(description='') 53 | subparsers = parser.add_subparsers(dest='mode') 54 | 55 | parser_dcase2017task4 = subparsers.add_parser('dcase2017task4') 56 | parser_dcase2017task4.add_argument('--workspace', type=str, required=True) 57 | 58 | args = parser.parse_args() 59 | 60 | if args.mode == 'dcase2017task4': 61 | dcase2017task4(args) 62 | 63 | else: 64 | raise Exception('Error argument!') -------------------------------------------------------------------------------- /audio_to_text/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIGC-Audio/AudioGPT/a674543c537bb3a60b8f521df76b97a627c1c379/audio_to_text/__init__.py 
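A hedged sketch of how a black-list CSV written by `create_black_list.py` above is typically consumed (the path below is a placeholder, and the actual training loader in this repository may differ):
# Each line of the black-list CSV holds an 11-character YouTube ID (cf. line[0][0 : 11] above).
black_list_csv = "workspaces/audioset/black_list/dcase2017task4.csv"  # placeholder path
with open(black_list_csv) as f:
    black_list_ids = {line.strip() for line in f if line.strip()}

def keep_for_training(audio_id):
    """Return False for clips whose YouTube ID appears in the black list."""
    return audio_id[:11] not in black_list_ids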
-------------------------------------------------------------------------------- /audio_to_text/captioning/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIGC-Audio/AudioGPT/a674543c537bb3a60b8f521df76b97a627c1c379/audio_to_text/captioning/__init__.py -------------------------------------------------------------------------------- /audio_to_text/captioning/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .base_model import * 2 | from .transformer_model import * 3 | 4 | -------------------------------------------------------------------------------- /audio_to_text/captioning/utils/README.md: -------------------------------------------------------------------------------- 1 | # Utils 2 | 3 | Scripts in this directory provide utility functions. 4 | 5 | ## BERT Pretrained Embeddings 6 | 7 | You can load pretrained word embeddings from Google [BERT](https://github.com/google-research/bert#pre-trained-models) instead of training word embeddings from scratch. The scripts in `utils/bert` require a BERT server running in the background; we use the server from [bert-as-service](https://github.com/hanxiao/bert-as-service). 8 | 9 | To use bert-as-service, first install the repository. It is recommended to create a new environment with TensorFlow 1.3 to run the BERT server, since bert-as-service is incompatible with TensorFlow 2.x. 10 | 11 | After installing [bert-as-service](https://github.com/hanxiao/bert-as-service), download and start the BERT server by running: 12 | 13 | ```bash 14 | bash scripts/prepare_bert_server.sh zh 15 | ``` 16 | 17 | By default, a server based on the BERT base Chinese model runs in the background. You can switch to other models by changing the corresponding model name and path in `scripts/prepare_bert_server.sh`. 18 | 19 | To extract BERT word embeddings, run `utils/bert/create_word_embedding.py`.
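The resulting embedding matrix (one row per vocabulary index) can then initialize a word-embedding layer. A minimal sketch, assuming the script's `output` argument was `bert_embeddings.npy` (the filename and the `freeze` choice are assumptions, not repository defaults):

```python
import numpy as np
import torch
import torch.nn as nn

weights = np.load("bert_embeddings.npy")  # (vocab_size, embed_size), rows follow vocabulary.idx2word
embedding = nn.Embedding.from_pretrained(torch.from_numpy(weights).float(), freeze=False)
word_ids = torch.tensor([[2, 15, 7]])     # toy batch of word indices from the same vocabulary
word_vectors = embedding(word_ids)        # (1, 3, embed_size)
```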
20 | -------------------------------------------------------------------------------- /audio_to_text/captioning/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIGC-Audio/AudioGPT/a674543c537bb3a60b8f521df76b97a627c1c379/audio_to_text/captioning/utils/__init__.py -------------------------------------------------------------------------------- /audio_to_text/captioning/utils/bert/create_word_embedding.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import sys 4 | import os 5 | 6 | from bert_serving.client import BertClient 7 | import numpy as np 8 | from tqdm import tqdm 9 | import fire 10 | import torch 11 | 12 | sys.path.append(os.getcwd()) 13 | from utils.build_vocab import Vocabulary 14 | 15 | def main(vocab_file: str, output: str, server_hostname: str): 16 | client = BertClient(ip=server_hostname) 17 | vocabulary = torch.load(vocab_file) 18 | vocab_size = len(vocabulary) 19 | 20 | fake_embedding = client.encode(["test"]).reshape(-1) 21 | embed_size = fake_embedding.shape[0] 22 | 23 | print("Encoding words into embeddings with size: ", embed_size) 24 | 25 | embeddings = np.empty((vocab_size, embed_size)) 26 | for i in tqdm(range(len(embeddings)), ascii=True): 27 | embeddings[i] = client.encode([vocabulary.idx2word[i]]) 28 | np.save(output, embeddings) 29 | 30 | 31 | if __name__ == '__main__': 32 | fire.Fire(main) 33 | 34 | 35 | -------------------------------------------------------------------------------- /audio_to_text/captioning/utils/fasttext/create_word_embedding.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | #!/usr/bin/env python3 3 | 4 | import numpy as np 5 | import pandas as pd 6 | import torch 7 | from gensim.models import FastText 8 | from tqdm import tqdm 9 | import fire 10 | 11 | import sys 12 | import os 13 | sys.path.append(os.getcwd()) 14 | from utils.build_vocab import Vocabulary 15 | 16 | def create_embedding(caption_file: str, 17 | vocab_file: str, 18 | embed_size: int, 19 | output: str, 20 | **fasttext_kwargs): 21 | caption_df = pd.read_json(caption_file) 22 | caption_df["tokens"] = caption_df["tokens"].apply(lambda x: [""] + [token for token in x] + [""]) 23 | 24 | sentences = list(caption_df["tokens"].values) 25 | vocabulary = torch.load(vocab_file, map_location="cpu") 26 | 27 | epochs = fasttext_kwargs.get("epochs", 10) 28 | model = FastText(size=embed_size, min_count=1, **fasttext_kwargs) 29 | model.build_vocab(sentences=sentences) 30 | model.train(sentences=sentences, total_examples=len(sentences), epochs=epochs) 31 | 32 | word_embeddings = np.zeros((len(vocabulary), embed_size)) 33 | 34 | with tqdm(total=len(vocabulary), ascii=True) as pbar: 35 | for word, idx in vocabulary.word2idx.items(): 36 | if word == "" or word == "": 37 | continue 38 | word_embeddings[idx] = model.wv[word] 39 | pbar.update() 40 | 41 | np.save(output, word_embeddings) 42 | 43 | print("Finish writing fasttext embeddings to " + output) 44 | 45 | 46 | if __name__ == "__main__": 47 | fire.Fire(create_embedding) 48 | 49 | 50 | 51 | -------------------------------------------------------------------------------- /audio_to_text/captioning/utils/predict_nn.py: -------------------------------------------------------------------------------- 1 | import json 2 | import random 3 | import argparse 4 | import numpy as np 5 | from tqdm import tqdm 6 | from h5py import File 7 | import 
sklearn.metrics 8 | 9 | random.seed(1) 10 | 11 | parser = argparse.ArgumentParser() 12 | parser.add_argument("train_feature", type=str) 13 | parser.add_argument("train_corpus", type=str) 14 | parser.add_argument("pred_feature", type=str) 15 | parser.add_argument("output_json", type=str) 16 | 17 | args = parser.parse_args() 18 | train_embs = [] 19 | train_idx_to_audioid = [] 20 | with File(args.train_feature, "r") as store: 21 | for audio_id, embedding in tqdm(store.items(), ascii=True): 22 | train_embs.append(embedding[()]) 23 | train_idx_to_audioid.append(audio_id) 24 | 25 | train_annotation = json.load(open(args.train_corpus, "r"))["audios"] 26 | train_audioid_to_tokens = {} 27 | for item in train_annotation: 28 | audio_id = item["audio_id"] 29 | train_audioid_to_tokens[audio_id] = [cap_item["tokens"] for cap_item in item["captions"]] 30 | train_embs = np.stack(train_embs) 31 | 32 | 33 | pred_data = [] 34 | pred_embs = [] 35 | pred_idx_to_audioids = [] 36 | with File(args.pred_feature, "r") as store: 37 | for audio_id, embedding in tqdm(store.items(), ascii=True): 38 | pred_embs.append(embedding[()]) 39 | pred_idx_to_audioids.append(audio_id) 40 | pred_embs = np.stack(pred_embs) 41 | 42 | similarity = sklearn.metrics.pairwise.cosine_similarity(pred_embs, train_embs) 43 | for idx, audio_id in enumerate(pred_idx_to_audioids): 44 | train_idx = similarity[idx].argmax() 45 | pred_data.append({ 46 | "filename": audio_id, 47 | "tokens": random.choice(train_audioid_to_tokens[train_idx_to_audioid[train_idx]]) 48 | }) 49 | json.dump({"predictions": pred_data}, open(args.output_json, "w"), ensure_ascii=False, indent=4) 50 | -------------------------------------------------------------------------------- /audio_to_text/captioning/utils/remove_optimizer.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import torch 3 | 4 | 5 | def main(checkpoint): 6 | state_dict = torch.load(checkpoint, map_location="cpu") 7 | if "optimizer" in state_dict: 8 | del state_dict["optimizer"] 9 | if "lr_scheduler" in state_dict: 10 | del state_dict["lr_scheduler"] 11 | torch.save(state_dict, checkpoint) 12 | 13 | 14 | if __name__ == "__main__": 15 | parser = argparse.ArgumentParser() 16 | parser.add_argument("checkpoint", type=str) 17 | args = parser.parse_args() 18 | main(args.checkpoint) 19 | -------------------------------------------------------------------------------- /audio_to_text/captioning/utils/report_results.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | import argparse 3 | import numpy as np 4 | 5 | parser = argparse.ArgumentParser() 6 | parser.add_argument("--input", help="input filename", type=str, nargs="+") 7 | parser.add_argument("--output", help="output result file", default=None) 8 | 9 | args = parser.parse_args() 10 | 11 | 12 | scores = {} 13 | for path in args.input: 14 | with open(path, "r") as reader: 15 | for line in reader.readlines(): 16 | metric, score = line.strip().split(": ") 17 | score = float(score) 18 | if metric not in scores: 19 | scores[metric] = [] 20 | scores[metric].append(score) 21 | 22 | if len(scores) == 0: 23 | print("No experiment directory found, wrong path?") 24 | exit(1) 25 | 26 | with open(args.output, "w") as writer: 27 | print("Average results: ", file=writer) 28 | for metric, score in scores.items(): 29 | score = np.array(score) 30 | mean = np.mean(score) 31 | std = np.std(score) 32 | print(f"{metric}: {mean:.3f} (±{std:.3f})", file=writer) 
33 | print("", file=writer) 34 | print("Best results: ", file=writer) 35 | for metric, score in scores.items(): 36 | score = np.max(score) 37 | print(f"{metric}: {score:.3f}", file=writer) 38 | -------------------------------------------------------------------------------- /audio_to_text/captioning/utils/word2vec/create_word_embedding.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | #!/usr/bin/env python3 3 | 4 | import numpy as np 5 | import pandas as pd 6 | import torch 7 | import gensim 8 | from gensim.models import Word2Vec 9 | from tqdm import tqdm 10 | import fire 11 | 12 | import sys 13 | import os 14 | sys.path.append(os.getcwd()) 15 | from utils.build_vocab import Vocabulary 16 | 17 | def create_embedding(vocab_file: str, 18 | embed_size: int, 19 | output: str, 20 | caption_file: str = None, 21 | pretrained_weights_path: str = None, 22 | **word2vec_kwargs): 23 | vocabulary = torch.load(vocab_file, map_location="cpu") 24 | 25 | if pretrained_weights_path: 26 | model = gensim.models.KeyedVectors.load_word2vec_format( 27 | fname=pretrained_weights_path, 28 | binary=True, 29 | ) 30 | if model.vector_size != embed_size: 31 | assert embed_size < model.vector_size, f"only reduce dimension, cannot add dimesion {model.vector_size} to {embed_size}" 32 | from sklearn.decomposition import PCA 33 | pca = PCA(n_components=embed_size) 34 | model.vectors = pca.fit_transform(model.vectors) 35 | else: 36 | caption_df = pd.read_json(caption_file) 37 | caption_df["tokens"] = caption_df["tokens"].apply(lambda x: [""] + [token for token in x] + [""]) 38 | sentences = list(caption_df["tokens"].values) 39 | epochs = word2vec_kwargs.get("epochs", 10) 40 | if "epochs" in word2vec_kwargs: 41 | del word2vec_kwargs["epochs"] 42 | model = Word2Vec(size=embed_size, min_count=1, **word2vec_kwargs) 43 | model.build_vocab(sentences=sentences) 44 | model.train(sentences=sentences, total_examples=len(sentences), epochs=epochs) 45 | 46 | word_embeddings = np.random.randn(len(vocabulary), embed_size) 47 | 48 | if isinstance(model, gensim.models.word2vec.Word2Vec): 49 | model = model.wv 50 | with tqdm(total=len(vocabulary), ascii=True) as pbar: 51 | for word, idx in vocabulary.word2idx.items(): 52 | try: 53 | word_embeddings[idx] = model.get_vector(word) 54 | except KeyError: 55 | print(f"word {word} not found in word2vec model, it is random initialized!") 56 | pbar.update() 57 | 58 | np.save(output, word_embeddings) 59 | 60 | print("Finish writing word2vec embeddings to " + output) 61 | 62 | 63 | if __name__ == "__main__": 64 | fire.Fire(create_embedding) 65 | 66 | 67 | 68 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | --extra-index-url https://download.pytorch.org/whl/cu113 2 | accelerate 3 | addict==2.4.0 4 | aiofiles 5 | albumentations==1.3.0 6 | appdirs==1.4.4 7 | basicsr==1.4.2 8 | beautifulsoup4==4.10.0 9 | Cython==0.29.24 10 | diffusers 11 | einops==0.3.0 12 | espnet 13 | espnet_model_zoo 14 | ffmpeg-python 15 | g2p-en==2.1.0 16 | google==3.0.0 17 | gradio 18 | h5py 19 | imageio==2.9.0 20 | imageio-ffmpeg==0.4.2 21 | invisible-watermark>=0.1.5 22 | jieba 23 | kornia==0.6 24 | langchain==0.0.101 25 | librosa 26 | loguru 27 | miditoolkit==0.1.7 28 | mmcv==1.5.0 29 | mmdet==2.23.0 30 | mmengine==0.7.2 31 | moviepy==1.0.3 32 | numpy==1.23.1 33 | omegaconf==2.1.1 34 | open_clip_torch==2.0.2 35 | openai 36 | 
openai-whisper 37 | opencv-contrib-python==4.3.0.36 38 | praat-parselmouth==0.3.3 39 | prettytable==3.6.0 40 | proglog==0.1.9 41 | pycwt==0.3.0a22 42 | pyloudnorm==0.1.0 43 | pypinyin==0.43.0 44 | pytorch-lightning==1.5.0 45 | pytorch-ssim==0.1 46 | pyworld==0.3.0 47 | resampy==0.2.2 48 | Resemblyzer==0.1.1.dev0 49 | safetensors==0.2.7 50 | sklearn==0.0 51 | soundfile 52 | soupsieve==2.3 53 | streamlit==1.12.1 54 | streamlit-drawable-canvas==0.8.0 55 | tensorboardX==2.4 56 | test-tube>=0.7.5 57 | TextGrid==1.5 58 | timm==0.6.12 59 | torch==1.12.1 60 | torchaudio==0.12.1 61 | torch-fidelity==0.3.0 62 | torchlibrosa 63 | torchmetrics==0.6.0 64 | torchvision==0.13.1 65 | transformers==4.26.1 66 | typing-extensions==4.0.0 67 | uuid==1.30 68 | webdataset==0.2.5 69 | webrtcvad==2.0.10 70 | yapf==0.32.0 71 | git+https://github.com/openai/CLIP.git -------------------------------------------------------------------------------- /run.md: -------------------------------------------------------------------------------- 1 | # Run AudioGPT 2 | ``` 3 | # create a new environment 4 | conda create -n audiogpt python=3.8 5 | 6 | # prepare the basic environments 7 | pip install -r requirements.txt 8 | 9 | # download the foundation models you need 10 | bash download.sh 11 | 12 | # prepare your private openAI private key 13 | export OPENAI_API_KEY={Your_Private_Openai_Key} 14 | 15 | # Start AudioGPT ! 16 | python audio-chatgpt.py 17 | ``` 18 | 19 | 20 | -------------------------------------------------------------------------------- /sound_extraction/model/LASSNet.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from .text_encoder import Text_Encoder 5 | from .resunet_film import UNetRes_FiLM 6 | 7 | class LASSNet(nn.Module): 8 | def __init__(self, device='cuda'): 9 | super(LASSNet, self).__init__() 10 | self.text_embedder = Text_Encoder(device) 11 | self.UNet = UNetRes_FiLM(channels=1, cond_embedding_dim=256) 12 | 13 | def forward(self, x, caption): 14 | # x: (Batch, 1, T, 128)) 15 | input_ids, attns_mask = self.text_embedder.tokenize(caption) 16 | 17 | cond_vec = self.text_embedder(input_ids, attns_mask)[0] 18 | dec_cond_vec = cond_vec 19 | 20 | mask = self.UNet(x, cond_vec, dec_cond_vec) 21 | mask = torch.sigmoid(mask) 22 | return mask 23 | 24 | def get_tokenizer(self): 25 | return self.text_embedder.tokenizer 26 | -------------------------------------------------------------------------------- /sound_extraction/model/film.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | class Film(nn.Module): 5 | def __init__(self, channels, cond_embedding_dim): 6 | super(Film, self).__init__() 7 | self.linear = nn.Sequential( 8 | nn.Linear(cond_embedding_dim, channels * 2), 9 | nn.ReLU(inplace=True), 10 | nn.Linear(channels * 2, channels), 11 | nn.ReLU(inplace=True) 12 | ) 13 | 14 | def forward(self, data, cond_vec): 15 | """ 16 | :param data: [batchsize, channels, samples] or [batchsize, channels, T, F] or [batchsize, channels, F, T] 17 | :param cond_vec: [batchsize, cond_embedding_dim] 18 | :return: 19 | """ 20 | bias = self.linear(cond_vec) # [batchsize, channels] 21 | if len(list(data.size())) == 3: 22 | data = data + bias[..., None] 23 | elif len(list(data.size())) == 4: 24 | data = data + bias[..., None, None] 25 | else: 26 | print("Warning: The size of input tensor,", data.size(), "is not correct. 
Film is not working.") 27 | return data -------------------------------------------------------------------------------- /sound_extraction/model/text_encoder.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from transformers import * 4 | import warnings 5 | warnings.filterwarnings('ignore') 6 | # pretrained model name: (model class, model tokenizer) 7 | MODELS = { 8 | 'prajjwal1/bert-mini': (BertModel, BertTokenizer), 9 | } 10 | 11 | class Text_Encoder(nn.Module): 12 | def __init__(self, device): 13 | super(Text_Encoder, self).__init__() 14 | self.base_model = 'prajjwal1/bert-mini' 15 | self.dropout = 0.1 16 | 17 | self.tokenizer = MODELS[self.base_model][1].from_pretrained(self.base_model) 18 | 19 | self.bert_layer = MODELS[self.base_model][0].from_pretrained(self.base_model, 20 | add_pooling_layer=False, 21 | hidden_dropout_prob=self.dropout, 22 | attention_probs_dropout_prob=self.dropout, 23 | output_hidden_states=True) 24 | 25 | self.linear_layer = nn.Sequential(nn.Linear(256, 256), nn.ReLU(inplace=True)) 26 | 27 | self.device = device 28 | 29 | def tokenize(self, caption): 30 | # device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 31 | tokenized = self.tokenizer(caption, add_special_tokens=False, padding=True, return_tensors='pt') 32 | input_ids = tokenized['input_ids'] 33 | attns_mask = tokenized['attention_mask'] 34 | 35 | input_ids = input_ids.to(self.device) 36 | attns_mask = attns_mask.to(self.device) 37 | return input_ids, attns_mask 38 | 39 | def forward(self, input_ids, attns_mask): 40 | # input_ids, attns_mask = self.tokenize(caption) 41 | output = self.bert_layer(input_ids=input_ids, attention_mask=attns_mask)[0] 42 | cls_embed = output[:, 0, :] 43 | text_embed = self.linear_layer(cls_embed) 44 | 45 | return text_embed, output # text_embed: (batch, hidden_size) -------------------------------------------------------------------------------- /sound_extraction/utils/wav_io.py: -------------------------------------------------------------------------------- 1 | import librosa 2 | import librosa.filters 3 | import math 4 | import numpy as np 5 | import scipy.io.wavfile 6 | 7 | def load_wav(path): 8 | max_length = 32000 * 10 9 | wav = librosa.core.load(path, sr=32000)[0] 10 | if len(wav) > max_length: 11 | wav = wav[0:max_length]  # truncate to max length 12 | 13 | # pad audio to max length, 10s for AudioCaps 14 | if len(wav) < max_length: 15 | # audio = torch.nn.functional.pad(audio, (0, self.max_length - audio.size(1)), 'constant') 16 | wav = np.pad(wav, (0, max_length - len(wav)), 'constant') 17 | wav = wav[...,None] 18 | return wav 19 | 20 | 21 | def save_wav(wav, path): 22 | wav *= 32767 / max(0.01, np.max(np.abs(wav))) 23 | scipy.io.wavfile.write(path, 32000, wav.astype(np.int16)) -------------------------------------------------------------------------------- /text_to_audio/Make_An_Audio/configs/img_to_audio/img2audio_args.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | base_learning_rate: 1.0e-05 3 | target: ldm.models.diffusion.ddpm_audio.LatentDiffusion_audio 4 | params: 5 | linear_start: 0.00085 6 | linear_end: 0.0120 7 | num_timesteps_cond: 1 8 | log_every_t: 200 9 | timesteps: 1000 10 | first_stage_key: image 11 | cond_stage_key: caption 12 | image_size: 32 # unused 13 | mel_dim: 10 # 80 // 2^3 14 | mel_length: 78 # 624 // 2^3 15 | channels: 4 16 | cond_stage_trainable: false 17 | conditioning_key: crossattn
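# Note: with conditioning_key "crossattn", the embedding produced by cond_stage_config (the frozen OpenCLIP encoder below) is fed to the UNet's cross-attention layers as context (context_dim: 1024) rather than being concatenated to the latent input.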
18 | monitor: val/loss_simple_ema 19 | scale_by_std: True 20 | use_ema: False 21 | 22 | scheduler_config: # 10000 warmup steps 23 | target: ldm.lr_scheduler.LambdaLinearScheduler 24 | params: 25 | warm_up_steps: [10000] 26 | cycle_lengths: [10000000000000] 27 | f_start: [1.e-6] 28 | f_max: [1.] 29 | f_min: [ 1.] 30 | 31 | unet_config: 32 | target: ldm.modules.diffusionmodules.custom_openaimodel.UNetModel 33 | params: 34 | image_size: 32 # ununsed 35 | in_channels: 4 36 | out_channels: 4 37 | model_channels: 256 38 | attention_resolutions: 39 | - 1 40 | - 2 41 | num_res_blocks: 2 42 | channel_mult: # num_down = len(ch_mult)-1 43 | - 1 44 | - 2 45 | num_head_channels: 32 46 | use_spatial_transformer: true 47 | transformer_depth: 1 48 | context_dim: 1024 49 | use_context_project: false 50 | 51 | 52 | first_stage_config: 53 | target: ldm.models.autoencoder.AutoencoderKL 54 | params: 55 | embed_dim: 4 56 | monitor: val/rec_loss 57 | ddconfig: 58 | double_z: true 59 | z_channels: 4 60 | resolution: 848 61 | in_channels: 1 62 | out_ch: 1 63 | ch: 128 64 | ch_mult: [ 1, 2, 2, 4 ] # num_down = len(ch_mult)-1 65 | num_res_blocks: 2 66 | attn_resolutions: [106, 212] 67 | dropout: 0.0 68 | lossconfig: 69 | target: torch.nn.Identity 70 | 71 | cond_stage_config: 72 | target: ldm.modules.encoders.modules.FrozenGlobalNormOpenCLIPEmbedder 73 | params: 74 | freeze: True 75 | delvisual: False 76 | 77 | 78 | -------------------------------------------------------------------------------- /text_to_audio/Make_An_Audio/configs/inpaint/txt2audio_args.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | base_learning_rate: 1.0e-05 3 | target: ldm.models.diffusion.ddpm_audio.LatentDiffusion_audio 4 | params: 5 | linear_start: 0.0015 6 | linear_end: 0.0205 7 | log_every_t: 100 8 | timesteps: 1000 9 | loss_type: l1 10 | first_stage_key: image 11 | cond_stage_key: masked_image 12 | image_size: 32 # unused 13 | mel_dim: 10 # 80 // 2^3 14 | mel_length: 106 # 848 // 2^3 15 | channels: 4 16 | concat_mode: true 17 | monitor: val/loss 18 | use_ema: False 19 | 20 | scheduler_config: 21 | target: ldm.lr_scheduler.LambdaWarmUpCosineScheduler 22 | params: 23 | verbosity_interval: 0 24 | warm_up_steps: 1000 25 | max_decay_steps: 50000 26 | lr_start: 0.001 27 | lr_max: 0.1 28 | lr_min: 0.0001 29 | 30 | unet_config: 31 | target: ldm.modules.diffusionmodules.openaimodel.UNetModel 32 | params: 33 | image_size: 32 # ununsed 34 | in_channels: 9 # 4 + 1 + 4 35 | out_channels: 4 36 | model_channels: 320 37 | attention_resolutions: 38 | - 1 39 | - 2 40 | num_res_blocks: 2 41 | channel_mult: # num_down = len(ch_mult)-1 42 | - 1 43 | - 2 44 | num_heads: 8 45 | resblock_updown: true 46 | 47 | first_stage_config: 48 | target: ldm.models.autoencoder.AutoencoderKL 49 | params: 50 | embed_dim: 4 51 | monitor: val/rec_loss 52 | ckpt_path: # /apdcephfs/share_1316500/nlphuang/results/Text_to_audio/ae15/2022-12-15T22-24-00_mixdata_kl_4_tile/epoch=000009-v2.ckpt 53 | ddconfig: 54 | double_z: true 55 | z_channels: 4 56 | resolution: 848 57 | in_channels: 1 58 | out_ch: 1 59 | ch: 128 60 | ch_mult: [ 1, 2, 2, 4 ] # num_down = len(ch_mult)-1 61 | num_res_blocks: 2 62 | attn_resolutions: [106, 212] 63 | dropout: 0.0 64 | lossconfig: 65 | target: torch.nn.Identity 66 | 67 | cond_stage_config: __is_first_stage__ 68 | 69 | -------------------------------------------------------------------------------- /text_to_audio/Make_An_Audio/configs/text_to_audio/clap_args.yaml: 
-------------------------------------------------------------------------------- 1 | # TEXT ENCODER CONFIG 2 | text_model: 'bert-base-uncased' 3 | text_len: 100 4 | transformer_embed_dim: 768 5 | freeze_text_encoder_weights: True 6 | 7 | # AUDIO ENCODER CONFIG 8 | audioenc_name: 'Cnn14' 9 | out_emb: 2048 10 | sampling_rate: 44100 11 | duration: 9 12 | fmin: 50 13 | fmax: 14000 14 | n_fft: 1028 15 | hop_size: 320 16 | mel_bins: 64 17 | window_size: 1024 18 | 19 | # PROJECTION SPACE CONFIG 20 | d_proj: 1024 21 | temperature: 0.003 22 | 23 | # TRAINING AND EVALUATION CONFIG 24 | num_classes: 527 25 | batch_size: 1024 26 | demo: False 27 | -------------------------------------------------------------------------------- /text_to_audio/Make_An_Audio/configs/text_to_audio/hifigan_args.yaml: -------------------------------------------------------------------------------- 1 | adam_b1: 0.8 2 | adam_b2: 0.99 3 | batch_size: 24 4 | dist_config: 5 | dist_backend: nccl 6 | dist_url: tcp://localhost:54321 7 | world_size: 1 8 | fmax: 8000 9 | fmax_for_loss: null 10 | fmin: 0 11 | hop_size: 256 12 | learning_rate: 0.0002 13 | lr_decay: 0.999 14 | n_fft: 1024 15 | num_gpus: 0 16 | num_mels: 80 17 | num_workers: 4 18 | resblock: '1' 19 | resblock_dilation_sizes: 20 | - - 1 21 | - 3 22 | - 5 23 | - - 1 24 | - 3 25 | - 5 26 | - - 1 27 | - 3 28 | - 5 29 | resblock_kernel_sizes: 30 | - 3 31 | - 7 32 | - 11 33 | sampling_rate: 16000 34 | seed: 1234 35 | segment_size: 8192 36 | upsample_initial_channel: 512 37 | upsample_kernel_sizes: 38 | - 16 39 | - 16 40 | - 4 41 | - 4 42 | upsample_rates: 43 | - 8 44 | - 8 45 | - 2 46 | - 2 47 | win_size: 1024 48 | -------------------------------------------------------------------------------- /text_to_audio/Make_An_Audio/configs/text_to_audio/txt2audio_args.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | base_learning_rate: 1.0e-05 3 | target: ldm.models.diffusion.ddpm_audio.LatentDiffusion_audio 4 | params: 5 | linear_start: 0.00085 6 | linear_end: 0.0120 7 | num_timesteps_cond: 1 8 | log_every_t: 200 9 | timesteps: 1000 10 | first_stage_key: image 11 | cond_stage_key: caption 12 | image_size: 32 # unused 13 | mel_dim: 10 # 80 // 2^3 14 | mel_length: 78 # 624 // 2^3 15 | channels: 4 16 | cond_stage_trainable: false 17 | conditioning_key: crossattn 18 | monitor: val/loss_simple_ema 19 | scale_by_std: True 20 | use_ema: False 21 | 22 | scheduler_config: # 10000 warmup steps 23 | target: ldm.lr_scheduler.LambdaLinearScheduler 24 | params: 25 | warm_up_steps: [10000] 26 | cycle_lengths: [10000000000000] 27 | f_start: [1.e-6] 28 | f_max: [1.] 29 | f_min: [ 1.] 
30 | 31 | unet_config: 32 | target: ldm.modules.diffusionmodules.openaimodel.UNetModel 33 | params: 34 | image_size: 32 # ununsed 35 | in_channels: 4 36 | out_channels: 4 37 | model_channels: 320 38 | attention_resolutions: 39 | - 1 40 | - 2 41 | num_res_blocks: 2 42 | channel_mult: # num_down = len(ch_mult)-1 43 | - 1 44 | - 2 45 | num_heads: 8 46 | use_spatial_transformer: true 47 | transformer_depth: 1 48 | context_dim: 1024 49 | use_checkpoint: true 50 | legacy: False 51 | 52 | first_stage_config: 53 | target: ldm.models.autoencoder.AutoencoderKL 54 | params: 55 | embed_dim: 4 56 | monitor: val/rec_loss 57 | ckpt_path: 58 | ddconfig: 59 | double_z: true 60 | z_channels: 4 61 | resolution: 848 62 | in_channels: 1 63 | out_ch: 1 64 | ch: 128 65 | ch_mult: [ 1, 2, 2, 4 ] # num_down = len(ch_mult)-1 66 | num_res_blocks: 2 67 | attn_resolutions: [106, 212] 68 | dropout: 0.0 69 | lossconfig: 70 | target: torch.nn.Identity 71 | 72 | cond_stage_config: 73 | target: ldm.modules.encoders.modules.FrozenCLAPEmbedder 74 | params: 75 | weights_path: useful_ckpts/CLAP/CLAP_weights_2022.pth 76 | 77 | ckpt_path: useful_ckpts/ta40multi_epoch=000085.ckpt 78 | 79 | -------------------------------------------------------------------------------- /text_to_audio/Make_An_Audio/ldm/models/diffusion/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIGC-Audio/AudioGPT/a674543c537bb3a60b8f521df76b97a627c1c379/text_to_audio/Make_An_Audio/ldm/models/diffusion/__init__.py -------------------------------------------------------------------------------- /text_to_audio/Make_An_Audio/ldm/modules/diffusionmodules/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIGC-Audio/AudioGPT/a674543c537bb3a60b8f521df76b97a627c1c379/text_to_audio/Make_An_Audio/ldm/modules/diffusionmodules/__init__.py -------------------------------------------------------------------------------- /text_to_audio/Make_An_Audio/ldm/modules/distributions/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIGC-Audio/AudioGPT/a674543c537bb3a60b8f521df76b97a627c1c379/text_to_audio/Make_An_Audio/ldm/modules/distributions/__init__.py -------------------------------------------------------------------------------- /text_to_audio/Make_An_Audio/ldm/modules/distributions/distributions.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | 4 | 5 | class AbstractDistribution: 6 | def sample(self): 7 | raise NotImplementedError() 8 | 9 | def mode(self): 10 | raise NotImplementedError() 11 | 12 | 13 | class DiracDistribution(AbstractDistribution): 14 | def __init__(self, value): 15 | self.value = value 16 | 17 | def sample(self): 18 | return self.value 19 | 20 | def mode(self): 21 | return self.value 22 | 23 | 24 | class DiagonalGaussianDistribution(object): 25 | def __init__(self, parameters, deterministic=False): 26 | self.parameters = parameters 27 | self.mean, self.logvar = torch.chunk(parameters, 2, dim=1) 28 | self.logvar = torch.clamp(self.logvar, -30.0, 20.0) 29 | self.deterministic = deterministic 30 | self.std = torch.exp(0.5 * self.logvar) 31 | self.var = torch.exp(self.logvar) 32 | if self.deterministic: 33 | self.var = self.std = torch.zeros_like(self.mean).to(device=self.parameters.device) 34 | 35 | def sample(self): 36 | x = self.mean + self.std * 
torch.randn(self.mean.shape).to(device=self.parameters.device) 37 | return x 38 | 39 | def kl(self, other=None): 40 | if self.deterministic: 41 | return torch.Tensor([0.]) 42 | else: 43 | if other is None: 44 | return 0.5 * torch.sum(torch.pow(self.mean, 2) 45 | + self.var - 1.0 - self.logvar, 46 | dim=[1, 2, 3]) 47 | else: 48 | return 0.5 * torch.sum( 49 | torch.pow(self.mean - other.mean, 2) / other.var 50 | + self.var / other.var - 1.0 - self.logvar + other.logvar, 51 | dim=[1, 2, 3]) 52 | 53 | def nll(self, sample, dims=[1,2,3]): 54 | if self.deterministic: 55 | return torch.Tensor([0.]) 56 | logtwopi = np.log(2.0 * np.pi) 57 | return 0.5 * torch.sum( 58 | logtwopi + self.logvar + torch.pow(sample - self.mean, 2) / self.var, 59 | dim=dims) 60 | 61 | def mode(self): 62 | return self.mean 63 | 64 | 65 | def normal_kl(mean1, logvar1, mean2, logvar2): 66 | """ 67 | source: https://github.com/openai/guided-diffusion/blob/27c20a8fab9cb472df5d6bdd6c8d11c8f430b924/guided_diffusion/losses.py#L12 68 | Compute the KL divergence between two gaussians. 69 | Shapes are automatically broadcasted, so batches can be compared to 70 | scalars, among other use cases. 71 | """ 72 | tensor = None 73 | for obj in (mean1, logvar1, mean2, logvar2): 74 | if isinstance(obj, torch.Tensor): 75 | tensor = obj 76 | break 77 | assert tensor is not None, "at least one argument must be a Tensor" 78 | 79 | # Force variances to be Tensors. Broadcasting helps convert scalars to 80 | # Tensors, but it does not work for torch.exp(). 81 | logvar1, logvar2 = [ 82 | x if isinstance(x, torch.Tensor) else torch.tensor(x).to(tensor) 83 | for x in (logvar1, logvar2) 84 | ] 85 | 86 | return 0.5 * ( 87 | -1.0 88 | + logvar2 89 | - logvar1 90 | + torch.exp(logvar1 - logvar2) 91 | + ((mean1 - mean2) ** 2) * torch.exp(-logvar2) 92 | ) 93 | -------------------------------------------------------------------------------- /text_to_audio/Make_An_Audio/ldm/modules/ema.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | 4 | 5 | class LitEma(nn.Module): 6 | def __init__(self, model, decay=0.9999, use_num_upates=True): 7 | super().__init__() 8 | if decay < 0.0 or decay > 1.0: 9 | raise ValueError('Decay must be between 0 and 1') 10 | 11 | self.m_name2s_name = {} 12 | self.register_buffer('decay', torch.tensor(decay, dtype=torch.float32)) 13 | self.register_buffer('num_updates', torch.tensor(0,dtype=torch.int) if use_num_upates 14 | else torch.tensor(-1,dtype=torch.int)) 15 | 16 | for name, p in model.named_parameters(): 17 | if p.requires_grad: 18 | #remove as '.'-character is not allowed in buffers 19 | s_name = name.replace('.','') 20 | self.m_name2s_name.update({name:s_name}) 21 | self.register_buffer(s_name,p.clone().detach().data) 22 | 23 | self.collected_params = [] 24 | 25 | def forward(self,model): 26 | decay = self.decay 27 | 28 | if self.num_updates >= 0: 29 | self.num_updates += 1 30 | decay = min(self.decay,(1 + self.num_updates) / (10 + self.num_updates)) 31 | 32 | one_minus_decay = 1.0 - decay 33 | 34 | with torch.no_grad(): 35 | m_param = dict(model.named_parameters()) 36 | shadow_params = dict(self.named_buffers()) 37 | 38 | for key in m_param: 39 | if m_param[key].requires_grad: 40 | sname = self.m_name2s_name[key] 41 | shadow_params[sname] = shadow_params[sname].type_as(m_param[key]) 42 | shadow_params[sname].sub_(one_minus_decay * (shadow_params[sname] - m_param[key])) 43 | else: 44 | assert not key in self.m_name2s_name 45 | 46 | def 
copy_to(self, model): 47 | m_param = dict(model.named_parameters()) 48 | shadow_params = dict(self.named_buffers()) 49 | for key in m_param: 50 | if m_param[key].requires_grad: 51 | m_param[key].data.copy_(shadow_params[self.m_name2s_name[key]].data) 52 | else: 53 | assert not key in self.m_name2s_name 54 | 55 | def store(self, parameters): 56 | """ 57 | Save the current parameters for restoring later. 58 | Args: 59 | parameters: Iterable of `torch.nn.Parameter`; the parameters to be 60 | temporarily stored. 61 | """ 62 | self.collected_params = [param.clone() for param in parameters] 63 | 64 | def restore(self, parameters): 65 | """ 66 | Restore the parameters stored with the `store` method. 67 | Useful to validate the model with EMA parameters without affecting the 68 | original optimization process. Store the parameters before the 69 | `copy_to` method. After validation (or model saving), use this to 70 | restore the former parameters. 71 | Args: 72 | parameters: Iterable of `torch.nn.Parameter`; the parameters to be 73 | updated with the stored parameters. 74 | """ 75 | for c_param, param in zip(self.collected_params, parameters): 76 | param.data.copy_(c_param.data) 77 | -------------------------------------------------------------------------------- /text_to_audio/Make_An_Audio/ldm/modules/encoders/CLAP/__init__.py: -------------------------------------------------------------------------------- 1 | from . import clap 2 | from . import audio 3 | from . import utils -------------------------------------------------------------------------------- /text_to_audio/Make_An_Audio/ldm/modules/encoders/CLAP/config.yml: -------------------------------------------------------------------------------- 1 | # TEXT ENCODER CONFIG 2 | text_model: 'bert-base-uncased' 3 | text_len: 100 4 | transformer_embed_dim: 768 5 | freeze_text_encoder_weights: True 6 | 7 | # AUDIO ENCODER CONFIG 8 | audioenc_name: 'Cnn14' 9 | out_emb: 2048 10 | sampling_rate: 44100 11 | duration: 5 12 | fmin: 50 13 | fmax: 14000 14 | n_fft: 1028 15 | hop_size: 320 16 | mel_bins: 64 17 | window_size: 1024 18 | 19 | # PROJECTION SPACE CONFIG 20 | d_proj: 1024 21 | temperature: 0.003 22 | 23 | # TRAINING AND EVALUATION CONFIG 24 | num_classes: 527 25 | batch_size: 1024 26 | demo: False 27 | -------------------------------------------------------------------------------- /text_to_audio/Make_An_Audio/ldm/modules/encoders/CLAP/utils.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import yaml 3 | import sys 4 | 5 | def read_config_as_args(config_path,args=None,is_config_str=False): 6 | return_dict = {} 7 | 8 | if config_path is not None: 9 | if is_config_str: 10 | yml_config = yaml.load(config_path, Loader=yaml.FullLoader) 11 | else: 12 | with open(config_path, "r") as f: 13 | yml_config = yaml.load(f, Loader=yaml.FullLoader) 14 | 15 | if args != None: 16 | for k, v in yml_config.items(): 17 | if k in args.__dict__: 18 | args.__dict__[k] = v 19 | else: 20 | sys.stderr.write("Ignored unknown parameter {} in yaml.\n".format(k)) 21 | else: 22 | for k, v in yml_config.items(): 23 | return_dict[k] = v 24 | 25 | args = args if args != None else return_dict 26 | return argparse.Namespace(**args) 27 | -------------------------------------------------------------------------------- /text_to_audio/Make_An_Audio/ldm/modules/encoders/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/AIGC-Audio/AudioGPT/a674543c537bb3a60b8f521df76b97a627c1c379/text_to_audio/Make_An_Audio/ldm/modules/encoders/__init__.py -------------------------------------------------------------------------------- /text_to_audio/Make_An_Audio/ldm/modules/encoders/open_clap/__init__.py: -------------------------------------------------------------------------------- 1 | from .factory import list_models, create_model, create_model_and_transforms, add_model_config 2 | from .loss import ClipLoss, gather_features, LPLoss, lp_gather_features, LPMetrics 3 | from .model import CLAP, CLAPTextCfg, CLAPVisionCfg, CLAPAudioCfp, convert_weights_to_fp16, trace_model 4 | from .openai import load_openai_model, list_openai_models 5 | from .pretrained import list_pretrained, list_pretrained_tag_models, list_pretrained_model_tags,\ 6 | get_pretrained_url, download_pretrained 7 | from .tokenizer import SimpleTokenizer, tokenize 8 | from .transform import image_transform 9 | -------------------------------------------------------------------------------- /text_to_audio/Make_An_Audio/ldm/modules/encoders/open_clap/bert.py: -------------------------------------------------------------------------------- 1 | from transformers import BertTokenizer, BertModel 2 | tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') 3 | model = BertModel.from_pretrained("bert-base-uncased") 4 | text = "Replace me by any text you'd like." 5 | 6 | def bert_embeddings(text): 7 | # text = "Replace me by any text you'd like." 8 | encoded_input = tokenizer(text, return_tensors='pt') 9 | output = model(**encoded_input) 10 | return output 11 | 12 | from transformers import RobertaTokenizer, RobertaModel 13 | 14 | tokenizer = RobertaTokenizer.from_pretrained('roberta-base') 15 | model = RobertaModel.from_pretrained('roberta-base') 16 | text = "Replace me by any text you'd like." 17 | def Roberta_embeddings(text): 18 | # text = "Replace me by any text you'd like." 19 | encoded_input = tokenizer(text, return_tensors='pt') 20 | output = model(**encoded_input) 21 | return output 22 | 23 | from transformers import BartTokenizer, BartModel 24 | 25 | tokenizer = BartTokenizer.from_pretrained('facebook/bart-base') 26 | model = BartModel.from_pretrained('facebook/bart-base') 27 | text = "Replace me by any text you'd like." 28 | def bart_embeddings(text): 29 | # text = "Replace me by any text you'd like." 
30 | encoded_input = tokenizer(text, return_tensors='pt') 31 | output = model(**encoded_input) 32 | return output -------------------------------------------------------------------------------- /text_to_audio/Make_An_Audio/ldm/modules/encoders/open_clap/bpe_simple_vocab_16e6.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIGC-Audio/AudioGPT/a674543c537bb3a60b8f521df76b97a627c1c379/text_to_audio/Make_An_Audio/ldm/modules/encoders/open_clap/bpe_simple_vocab_16e6.txt.gz -------------------------------------------------------------------------------- /text_to_audio/Make_An_Audio/ldm/modules/encoders/open_clap/linear_probe.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch.nn.functional as F 3 | from torch import nn 4 | from .model import MLPLayers 5 | 6 | 7 | class LinearProbe(nn.Module): 8 | def __init__(self, model, mlp, freeze, in_ch, out_ch, act=None): 9 | """ 10 | Args: 11 | model: nn.Module 12 | mlp: bool, if True, then use the MLP layer as the linear probe module 13 | freeze: bool, if True, then freeze all the CLAP model's layers when training the linear probe 14 | in_ch: int, the output channel from CLAP model 15 | out_ch: int, the output channel from linear probe (class_num) 16 | act: torch.nn.functional, the activation function before the loss function 17 | """ 18 | super().__init__() 19 | in_ch = 512 20 | self.clap_model = model 21 | self.clap_model.text_branch = None # to save memory 22 | self.freeze = freeze 23 | if mlp: 24 | self.lp_layer = MLPLayers(units=[in_ch, in_ch * 2, out_ch]) 25 | else: 26 | self.lp_layer = nn.Linear(in_ch, out_ch) 27 | 28 | if self.freeze: 29 | for param in self.clap_model.parameters(): 30 | param.requires_grad = False 31 | 32 | if act is None or act == 'None': 33 | self.act = None 34 | elif act == 'relu': 35 | self.act = nn.ReLU() 36 | elif act == 'elu': 37 | self.act = nn.ELU() 38 | elif act == 'prelu': 39 | self.act = nn.PReLU(num_parameters=in_ch) 40 | elif act == 'softmax': 41 | self.act = nn.Softmax(dim=-1) 42 | elif act == 'sigmoid': 43 | self.act = nn.Sigmoid() 44 | 45 | def forward(self, x, mix_lambda=None, device=None): 46 | """ 47 | Args: 48 | x: waveform, torch.tensor [batch, t_samples] / batch of mel_spec and longer list 49 | mix_lambda: torch.tensor [batch], the mixup lambda 50 | Returns: 51 | class_prob: torch.tensor [batch, class_num] 52 | 53 | """ 54 | # keep the frozen CLAP model in eval mode so batchnorm/dropout statistics are not updated 55 | if self.freeze: 56 | self.clap_model.eval() 57 | 58 | x = self.clap_model.audio_projection( 59 | self.clap_model.audio_branch(x, mixup_lambda=mix_lambda, device=device)["embedding"]) 60 | out = self.lp_layer(x) 61 | if self.act is not None: 62 | out = self.act(out) 63 | return out 64 | -------------------------------------------------------------------------------- /text_to_audio/Make_An_Audio/ldm/modules/encoders/open_clap/model_configs/HTSAT-base.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "audio_cfg": { 4 | "audio_length": 1024, 5 | "clip_samples": 480000, 6 | "mel_bins": 64, 7 | "sample_rate": 48000, 8 | "window_size": 1024, 9 | "hop_size": 480, 10 | "fmin": 50, 11 | "fmax": 14000, 12 | "class_num": 527, 13 | "model_type": "HTSAT", 14 | "model_name": "base" 15 | }, 16 | "text_cfg": { 17 | "context_length": 77, 18 | "vocab_size": 49408, 19 | "width": 512, 20 | "heads": 8, 21 | "layers": 12 22 | } 23 | }
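As an illustrative aside (not a file in the repository): each of these open_clap model_configs JSON files pairs an "audio_cfg" (PANN or HTSAT backbone plus its mel front-end settings) with a CLIP-style "text_cfg", joined through a shared "embed_dim". Below is a minimal Python sketch of inspecting one such config with only the standard library; the relative path is an assumption about where the checkout lives.

import json

# Assumed relative path into the checkout; adjust to your layout.
cfg_path = "text_to_audio/Make_An_Audio/ldm/modules/encoders/open_clap/model_configs/HTSAT-base.json"
with open(cfg_path) as f:
    cfg = json.load(f)

# Joint embedding size shared by the audio and text branches.
print("embed_dim:", cfg["embed_dim"])
# Audio branch: backbone type/name and the mel front-end it expects.
audio = cfg["audio_cfg"]
print(audio["model_type"], audio["model_name"], audio["sample_rate"], audio["mel_bins"])
# Text branch: transformer width/heads/layers over a 49408-token BPE vocabulary.
text = cfg["text_cfg"]
print(text["width"], text["heads"], text["layers"])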
-------------------------------------------------------------------------------- /text_to_audio/Make_An_Audio/ldm/modules/encoders/open_clap/model_configs/HTSAT-large.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 2048, 3 | "audio_cfg": { 4 | "audio_length": 1024, 5 | "clip_samples": 480000, 6 | "mel_bins": 64, 7 | "sample_rate": 48000, 8 | "window_size": 1024, 9 | "hop_size": 480, 10 | "fmin": 50, 11 | "fmax": 14000, 12 | "class_num": 527, 13 | "model_type": "HTSAT", 14 | "model_name": "large" 15 | }, 16 | "text_cfg": { 17 | "context_length": 77, 18 | "vocab_size": 49408, 19 | "width": 512, 20 | "heads": 8, 21 | "layers": 12 22 | } 23 | } -------------------------------------------------------------------------------- /text_to_audio/Make_An_Audio/ldm/modules/encoders/open_clap/model_configs/HTSAT-tiny-win-1536.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "audio_cfg": { 4 | "audio_length": 1024, 5 | "clip_samples": 480000, 6 | "mel_bins": 64, 7 | "sample_rate": 48000, 8 | "window_size": 1536, 9 | "hop_size": 480, 10 | "fmin": 50, 11 | "fmax": 14000, 12 | "class_num": 527, 13 | "model_type": "HTSAT", 14 | "model_name": "tiny" 15 | }, 16 | "text_cfg": { 17 | "context_length": 77, 18 | "vocab_size": 49408, 19 | "width": 512, 20 | "heads": 8, 21 | "layers": 12 22 | } 23 | } -------------------------------------------------------------------------------- /text_to_audio/Make_An_Audio/ldm/modules/encoders/open_clap/model_configs/HTSAT-tiny.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "audio_cfg": { 4 | "audio_length": 1024, 5 | "clip_samples": 480000, 6 | "mel_bins": 64, 7 | "sample_rate": 48000, 8 | "window_size": 1024, 9 | "hop_size": 480, 10 | "fmin": 50, 11 | "fmax": 14000, 12 | "class_num": 527, 13 | "model_type": "HTSAT", 14 | "model_name": "tiny" 15 | }, 16 | "text_cfg": { 17 | "context_length": 77, 18 | "vocab_size": 49408, 19 | "width": 512, 20 | "heads": 8, 21 | "layers": 12 22 | } 23 | } -------------------------------------------------------------------------------- /text_to_audio/Make_An_Audio/ldm/modules/encoders/open_clap/model_configs/PANN-10.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "audio_cfg": { 4 | "audio_length": 1024, 5 | "clip_samples": 480000, 6 | "mel_bins": 64, 7 | "sample_rate": 48000, 8 | "window_size": 1024, 9 | "hop_size": 480, 10 | "fmin": 50, 11 | "fmax": 14000, 12 | "class_num": 527, 13 | "model_type": "PANN", 14 | "model_name": "Cnn10" 15 | }, 16 | "text_cfg": { 17 | "context_length": 77, 18 | "vocab_size": 49408, 19 | "width": 512, 20 | "heads": 8, 21 | "layers": 12 22 | } 23 | } -------------------------------------------------------------------------------- /text_to_audio/Make_An_Audio/ldm/modules/encoders/open_clap/model_configs/PANN-14-fmax-18k.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 2048, 3 | "audio_cfg": { 4 | "audio_length": 1024, 5 | "clip_samples": 480000, 6 | "mel_bins": 64, 7 | "sample_rate": 48000, 8 | "window_size": 1024, 9 | "hop_size": 480, 10 | "fmin": 50, 11 | "fmax": 18000, 12 | "class_num": 527, 13 | "model_type": "PANN", 14 | "model_name": "Cnn14" 15 | }, 16 | "text_cfg": { 17 | "context_length": 77, 18 | "vocab_size": 49408, 19 | "width": 512, 20 | "heads": 8, 21 | "layers": 12 22 | } 23 | 
} -------------------------------------------------------------------------------- /text_to_audio/Make_An_Audio/ldm/modules/encoders/open_clap/model_configs/PANN-14-fmax-8k-20s.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 2048, 3 | "audio_cfg": { 4 | "audio_length": 1024, 5 | "clip_samples": 960000, 6 | "mel_bins": 64, 7 | "sample_rate": 48000, 8 | "window_size": 1024, 9 | "hop_size": 360, 10 | "fmin": 50, 11 | "fmax": 8000, 12 | "class_num": 527, 13 | "model_type": "PANN", 14 | "model_name": "Cnn14" 15 | }, 16 | "text_cfg": { 17 | "context_length": 77, 18 | "vocab_size": 49408, 19 | "width": 512, 20 | "heads": 8, 21 | "layers": 12 22 | } 23 | } -------------------------------------------------------------------------------- /text_to_audio/Make_An_Audio/ldm/modules/encoders/open_clap/model_configs/PANN-14-tiny-transformer.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 2048, 3 | "audio_cfg": { 4 | "audio_length": 1024, 5 | "clip_samples": 480000, 6 | "mel_bins": 64, 7 | "sample_rate": 48000, 8 | "window_size": 1024, 9 | "hop_size": 480, 10 | "fmin": 50, 11 | "fmax": 14000, 12 | "class_num": 527, 13 | "model_type": "PANN", 14 | "model_name": "Cnn14" 15 | }, 16 | "text_cfg": { 17 | "context_length": 77, 18 | "vocab_size": 49408, 19 | "width": 512, 20 | "heads": 8, 21 | "layers": 4 22 | } 23 | } -------------------------------------------------------------------------------- /text_to_audio/Make_An_Audio/ldm/modules/encoders/open_clap/model_configs/PANN-14-win-1536.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 2048, 3 | "audio_cfg": { 4 | "audio_length": 1024, 5 | "clip_samples": 480000, 6 | "mel_bins": 64, 7 | "sample_rate": 48000, 8 | "window_size": 1536, 9 | "hop_size": 480, 10 | "fmin": 50, 11 | "fmax": 14000, 12 | "class_num": 527, 13 | "model_type": "PANN", 14 | "model_name": "Cnn14" 15 | }, 16 | "text_cfg": { 17 | "context_length": 77, 18 | "vocab_size": 49408, 19 | "width": 512, 20 | "heads": 8, 21 | "layers": 12 22 | } 23 | } -------------------------------------------------------------------------------- /text_to_audio/Make_An_Audio/ldm/modules/encoders/open_clap/model_configs/PANN-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 2048, 3 | "audio_cfg": { 4 | "audio_length": 1024, 5 | "clip_samples": 480000, 6 | "mel_bins": 64, 7 | "sample_rate": 48000, 8 | "window_size": 1024, 9 | "hop_size": 480, 10 | "fmin": 50, 11 | "fmax": 14000, 12 | "class_num": 527, 13 | "model_type": "PANN", 14 | "model_name": "Cnn14" 15 | }, 16 | "text_cfg": { 17 | "context_length": 77, 18 | "vocab_size": 49408, 19 | "width": 512, 20 | "heads": 8, 21 | "layers": 12 22 | } 23 | } -------------------------------------------------------------------------------- /text_to_audio/Make_An_Audio/ldm/modules/encoders/open_clap/model_configs/PANN-6.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "audio_cfg": { 4 | "audio_length": 1024, 5 | "clip_samples": 480000, 6 | "mel_bins": 64, 7 | "sample_rate": 48000, 8 | "window_size": 1024, 9 | "hop_size": 480, 10 | "fmin": 50, 11 | "fmax": 14000, 12 | "class_num": 527, 13 | "model_type": "PANN", 14 | "model_name": "Cnn6" 15 | }, 16 | "text_cfg": { 17 | "context_length": 77, 18 | "vocab_size": 49408, 19 | "width": 512, 20 | "heads": 8, 21 | "layers": 12 22 
| } 23 | } -------------------------------------------------------------------------------- /text_to_audio/Make_An_Audio/ldm/modules/encoders/open_clap/model_configs/RN101-quickgelu.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "quick_gelu": true, 4 | "vision_cfg": { 5 | "image_size": 224, 6 | "layers": [ 7 | 3, 8 | 4, 9 | 23, 10 | 3 11 | ], 12 | "width": 64, 13 | "patch_size": null 14 | }, 15 | "text_cfg": { 16 | "context_length": 77, 17 | "vocab_size": 49408, 18 | "width": 512, 19 | "heads": 8, 20 | "layers": 12 21 | } 22 | } -------------------------------------------------------------------------------- /text_to_audio/Make_An_Audio/ldm/modules/encoders/open_clap/model_configs/RN101.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": [ 6 | 3, 7 | 4, 8 | 23, 9 | 3 10 | ], 11 | "width": 64, 12 | "patch_size": null 13 | }, 14 | "text_cfg": { 15 | "context_length": 77, 16 | "vocab_size": 49408, 17 | "width": 512, 18 | "heads": 8, 19 | "layers": 12 20 | } 21 | } -------------------------------------------------------------------------------- /text_to_audio/Make_An_Audio/ldm/modules/encoders/open_clap/model_configs/RN50-quickgelu.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "quick_gelu": true, 4 | "vision_cfg": { 5 | "image_size": 224, 6 | "layers": [ 7 | 3, 8 | 4, 9 | 6, 10 | 3 11 | ], 12 | "width": 64, 13 | "patch_size": null 14 | }, 15 | "text_cfg": { 16 | "context_length": 77, 17 | "vocab_size": 49408, 18 | "width": 512, 19 | "heads": 8, 20 | "layers": 12 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /text_to_audio/Make_An_Audio/ldm/modules/encoders/open_clap/model_configs/RN50.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": [ 6 | 3, 7 | 4, 8 | 6, 9 | 3 10 | ], 11 | "width": 64, 12 | "patch_size": null 13 | }, 14 | "text_cfg": { 15 | "context_length": 77, 16 | "vocab_size": 49408, 17 | "width": 512, 18 | "heads": 8, 19 | "layers": 12 20 | } 21 | } -------------------------------------------------------------------------------- /text_to_audio/Make_An_Audio/ldm/modules/encoders/open_clap/model_configs/RN50x16.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 384, 5 | "layers": [ 6 | 6, 7 | 8, 8 | 18, 9 | 8 10 | ], 11 | "width": 96, 12 | "patch_size": null 13 | }, 14 | "text_cfg": { 15 | "context_length": 77, 16 | "vocab_size": 49408, 17 | "width": 768, 18 | "heads": 12, 19 | "layers": 12 20 | } 21 | } -------------------------------------------------------------------------------- /text_to_audio/Make_An_Audio/ldm/modules/encoders/open_clap/model_configs/RN50x4.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 640, 3 | "vision_cfg": { 4 | "image_size": 288, 5 | "layers": [ 6 | 4, 7 | 6, 8 | 10, 9 | 6 10 | ], 11 | "width": 80, 12 | "patch_size": null 13 | }, 14 | "text_cfg": { 15 | "context_length": 77, 16 | "vocab_size": 49408, 17 | "width": 640, 18 | "heads": 10, 19 | "layers": 12 20 | } 21 | } -------------------------------------------------------------------------------- 
/text_to_audio/Make_An_Audio/ldm/modules/encoders/open_clap/model_configs/ViT-B-16.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 768, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 512, 13 | "heads": 8, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /text_to_audio/Make_An_Audio/ldm/modules/encoders/open_clap/model_configs/ViT-B-32-quickgelu.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "quick_gelu": true, 4 | "vision_cfg": { 5 | "image_size": 224, 6 | "layers": 12, 7 | "width": 768, 8 | "patch_size": 32 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } -------------------------------------------------------------------------------- /text_to_audio/Make_An_Audio/ldm/modules/encoders/open_clap/model_configs/ViT-B-32.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 768, 7 | "patch_size": 32 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 512, 13 | "heads": 8, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /text_to_audio/Make_An_Audio/ldm/modules/encoders/open_clap/model_configs/ViT-L-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 24, 6 | "width": 1024, 7 | "patch_size": 14 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 768, 13 | "heads": 12, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /text_to_audio/Make_An_Audio/ldm/modules/encoders/open_clap/transform.py: -------------------------------------------------------------------------------- 1 | from torchvision.transforms import Normalize, Compose, RandomResizedCrop, InterpolationMode, ToTensor, Resize, \ 2 | CenterCrop 3 | 4 | 5 | def _convert_to_rgb(image): 6 | return image.convert('RGB') 7 | 8 | 9 | def image_transform( 10 | image_size: int, 11 | is_train: bool, 12 | mean=(0.48145466, 0.4578275, 0.40821073), 13 | std=(0.26862954, 0.26130258, 0.27577711) 14 | ): 15 | normalize = Normalize(mean=mean, std=std) 16 | if is_train: 17 | return Compose([ 18 | RandomResizedCrop(image_size, scale=(0.9, 1.0), interpolation=InterpolationMode.BICUBIC), 19 | _convert_to_rgb, 20 | ToTensor(), 21 | normalize, 22 | ]) 23 | else: 24 | return Compose([ 25 | Resize(image_size, interpolation=InterpolationMode.BICUBIC), 26 | CenterCrop(image_size), 27 | _convert_to_rgb, 28 | ToTensor(), 29 | normalize, 30 | ]) 31 | -------------------------------------------------------------------------------- /text_to_audio/Make_An_Audio/ldm/modules/encoders/open_clap/version.py: -------------------------------------------------------------------------------- 1 | __version__ = '0.2.1' 2 | -------------------------------------------------------------------------------- /text_to_audio/Make_An_Audio/ldm/modules/image_degradation/__init__.py: 
-------------------------------------------------------------------------------- 1 | from ldm.modules.image_degradation.bsrgan import degradation_bsrgan_variant as degradation_fn_bsr 2 | from ldm.modules.image_degradation.bsrgan_light import degradation_bsrgan_variant as degradation_fn_bsr_light 3 | -------------------------------------------------------------------------------- /text_to_audio/Make_An_Audio/ldm/modules/image_degradation/utils/test.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIGC-Audio/AudioGPT/a674543c537bb3a60b8f521df76b97a627c1c379/text_to_audio/Make_An_Audio/ldm/modules/image_degradation/utils/test.png -------------------------------------------------------------------------------- /text_to_audio/Make_An_Audio/ldm/modules/losses_audio/__init__.py: -------------------------------------------------------------------------------- 1 | from ldm.modules.losses_audio.vqperceptual import DummyLoss 2 | 3 | # relative imports pain 4 | import os 5 | import sys 6 | path = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'vggishish') 7 | sys.path.append(path) 8 | -------------------------------------------------------------------------------- /text_to_audio/Make_An_Audio/ldm/modules/losses_audio/vggishish/config/melception.yaml: -------------------------------------------------------------------------------- 1 | seed: 1337 2 | log_code_state: True 3 | # patterns to ignore when backing up the code folder 4 | patterns_to_ignore: ['logs', '.git', '__pycache__', 'data', 'checkpoints', '*.pt'] 5 | 6 | # data: 7 | mels_path: '/home/nvme/data/vggsound/features/melspec_10s_22050hz/' 8 | spec_shape: [80, 860] 9 | cropped_size: [80, 848] 10 | random_crop: False 11 | 12 | # train: 13 | device: 'cuda:0' 14 | batch_size: 8 15 | num_workers: 0 16 | optimizer: adam 17 | betas: [0.9, 0.999] 18 | momentum: 0.9 19 | learning_rate: 3e-4 20 | weight_decay: 0 21 | num_epochs: 100 22 | patience: 3 23 | logdir: './logs' 24 | cls_weights_in_loss: False 25 | -------------------------------------------------------------------------------- /text_to_audio/Make_An_Audio/ldm/modules/losses_audio/vggishish/config/vggish.yaml: -------------------------------------------------------------------------------- 1 | seed: 1337 2 | log_code_state: True 3 | # patterns to ignore when backing up the code folder 4 | patterns_to_ignore: ['logs', '.git', '__pycache__'] 5 | 6 | # data: 7 | mels_path: '/home/nvme/data/vggsound/features/melspec_10s_22050hz/' 8 | spec_shape: [80, 860] 9 | cropped_size: [80, 848] 10 | random_crop: False 11 | 12 | # model: 13 | # original vgg family except for MP is missing at the end 14 | # 'vggish': [64, 'MP', 128, 'MP', 256, 256, 'MP', 512, 512] 15 | # 'vgg11': [64, 'MP', 128, 'MP', 256, 256, 'MP', 512, 512, 'MP', 512, 512], 16 | # 'vgg13': [64, 64, 'MP', 128, 128, 'MP', 256, 256, 'MP', 512, 512, 'MP', 512, 512], 17 | # 'vgg16': [64, 64, 'MP', 128, 128, 'MP', 256, 256, 256, 'MP', 512, 512, 512, 'MP', 512, 512, 512], 18 | # 'vgg19': [64, 64, 'MP', 128, 128, 'MP', 256, 256, 256, 256, 'MP', 512, 512, 512, 512, 'MP', 512, 512, 512, 512], 19 | conv_layers: [64, 64, 'MP', 128, 128, 'MP', 256, 256, 256, 'MP', 512, 512, 512, 'MP', 512, 512, 512] 20 | use_bn: False 21 | 22 | # train: 23 | device: 'cuda:0' 24 | batch_size: 32 25 | num_workers: 0 26 | optimizer: adam 27 | betas: [0.9, 0.999] 28 | momentum: 0.9 29 | learning_rate: 3e-4 30 | weight_decay: 0.0001 31 | num_epochs: 100 32 | patience: 3 33 | logdir: './logs' 34 | 
cls_weights_in_loss: False 35 | -------------------------------------------------------------------------------- /text_to_audio/Make_An_Audio/ldm/modules/losses_audio/vggishish/data/train_means_stds_melspec_10s_22050hz.txt: -------------------------------------------------------------------------------- 1 | 0.51234712 0.08187601 2 | 0.52630101 0.08393201 3 | 0.52002938 0.08533191 4 | 0.51831866 0.08651366 5 | 0.52457265 0.08795700 6 | 0.51129235 0.08924046 7 | 0.51403755 0.09011565 8 | 0.51189406 0.09138965 9 | 0.50142221 0.09215379 10 | 0.50632402 0.09251092 11 | 0.49724399 0.09356590 12 | 0.49062130 0.09333057 13 | 0.49971113 0.09446411 14 | 0.48442903 0.09400197 15 | 0.48598301 0.09477853 16 | 0.48681630 0.09487848 17 | 0.47436119 0.09424447 18 | 0.48031359 0.09417475 19 | 0.47422810 0.09498061 20 | 0.46369397 0.09362415 21 | 0.47413055 0.09453825 22 | 0.46062686 0.09481226 23 | 0.45677793 0.09359464 24 | 0.46135474 0.09437913 25 | 0.45246800 0.09384014 26 | 0.45232766 0.09438247 27 | 0.45208386 0.09419720 28 | 0.44351671 0.09340880 29 | 0.44667316 0.09423184 30 | 0.44447692 0.09344461 31 | 0.43676363 0.09265266 32 | 0.44002381 0.09307925 33 | 0.43772414 0.09299363 34 | 0.43061019 0.09236166 35 | 0.43120828 0.09156491 36 | 0.42603271 0.09161403 37 | 0.42863234 0.09150530 38 | 0.42296206 0.09102536 39 | 0.41733331 0.09048366 40 | 0.41804121 0.09025013 41 | 0.41605068 0.09078869 42 | 0.40875265 0.08985338 43 | 0.40666997 0.08877566 44 | 0.40407463 0.08961667 45 | 0.40353311 0.08859275 46 | 0.39708031 0.08827818 47 | 0.39375066 0.08833999 48 | 0.39301091 0.08760654 49 | 0.39047117 0.08812327 50 | 0.38461680 0.08782288 51 | 0.38145284 0.08645484 52 | 0.37985209 0.08718211 53 | 0.37419526 0.08644421 54 | 0.37080597 0.08532454 55 | 0.36786535 0.08592822 56 | 0.36569049 0.08452069 57 | 0.36336079 0.08474272 58 | 0.35775191 0.08476392 59 | 0.35504801 0.08334654 60 | 0.35284816 0.08412110 61 | 0.34594865 0.08367254 62 | 0.34112312 0.08252251 63 | 0.33784886 0.08320975 64 | 0.33095703 0.08257768 65 | 0.32559461 0.08171253 66 | 0.32003106 0.08204872 67 | 0.31506222 0.08098545 68 | 0.31138077 0.08152917 69 | 0.30403516 0.08209135 70 | 0.29969540 0.08073266 71 | 0.29578024 0.08225822 72 | 0.28861871 0.08324076 73 | 0.28581686 0.08058489 74 | 0.27922253 0.08515350 75 | 0.27444035 0.08355056 76 | 0.27339468 0.08067638 77 | 0.26571759 0.08536921 78 | 0.26280864 0.08107620 79 | 0.25664202 0.08357468 80 | 0.24853513 0.08556041 81 | -------------------------------------------------------------------------------- /text_to_audio/Make_An_Audio/ldm/modules/losses_audio/vggishish/loss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import torch.optim as optim 5 | 6 | class WeightedCrossEntropy(nn.CrossEntropyLoss): 7 | 8 | def __init__(self, weights, **pytorch_ce_loss_args) -> None: 9 | super().__init__(reduction='none', **pytorch_ce_loss_args) 10 | self.weights = weights 11 | 12 | def __call__(self, outputs, targets, to_weight=True): 13 | loss = super().__call__(outputs, targets) 14 | if to_weight: 15 | return (loss * self.weights[targets]).sum() / self.weights[targets].sum() 16 | else: 17 | return loss.mean() 18 | 19 | 20 | if __name__ == '__main__': 21 | x = torch.randn(10, 5) 22 | target = torch.randint(0, 5, (10,)) 23 | weights = torch.tensor([1., 2., 3., 4., 5.]) 24 | 25 | # criterion_weighted = nn.CrossEntropyLoss(weight=weights) 26 | # loss_weighted = criterion_weighted(x, target) 27 
| 28 | # criterion_weighted_manual = nn.CrossEntropyLoss(reduction='none') 29 | # loss_weighted_manual = criterion_weighted_manual(x, target) 30 | # print(loss_weighted, loss_weighted_manual.mean()) 31 | # loss_weighted_manual = (loss_weighted_manual * weights[target]).sum() / weights[target].sum() 32 | # print(loss_weighted, loss_weighted_manual) 33 | # print(torch.allclose(loss_weighted, loss_weighted_manual)) 34 | 35 | pytorch_weighted = nn.CrossEntropyLoss(weight=weights) 36 | pytorch_unweighted = nn.CrossEntropyLoss() 37 | custom = WeightedCrossEntropy(weights) 38 | 39 | assert torch.allclose(pytorch_weighted(x, target), custom(x, target, to_weight=True)) 40 | assert torch.allclose(pytorch_unweighted(x, target), custom(x, target, to_weight=False)) 41 | print(custom(x, target, to_weight=True), custom(x, target, to_weight=False)) 42 | -------------------------------------------------------------------------------- /text_to_audio/Make_An_Audio/ldm/modules/losses_audio/vggishish/metrics.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | import numpy as np 4 | import scipy 5 | import torch 6 | from sklearn.metrics import average_precision_score, roc_auc_score 7 | 8 | logger = logging.getLogger(f'main.{__name__}') 9 | 10 | def metrics(targets, outputs, topk=(1, 5)): 11 | """ 12 | Adapted from https://github.com/hche11/VGGSound/blob/master/utils.py 13 | 14 | Calculate statistics including mAP, AUC, and d-prime. 15 | Args: 16 | output: 2d tensors, (dataset_size, classes_num) - before softmax 17 | target: 1d tensors, (dataset_size, ) 18 | topk: tuple 19 | Returns: 20 | metric_dict: a dict of metrics 21 | """ 22 | metrics_dict = dict() 23 | 24 | num_cls = outputs.shape[-1] 25 | 26 | # accuracy@k 27 | _, preds = torch.topk(outputs, k=max(topk), dim=1) 28 | correct_for_maxtopk = preds == targets.view(-1, 1).expand_as(preds) 29 | for k in topk: 30 | metrics_dict[f'accuracy_{k}'] = float(correct_for_maxtopk[:, :k].sum() / correct_for_maxtopk.shape[0]) 31 | 32 | # avg precision, average roc_auc, and dprime 33 | targets = torch.nn.functional.one_hot(targets, num_classes=num_cls) 34 | 35 | # ids of the predicted classes (same as softmax) 36 | targets_pred = torch.softmax(outputs, dim=1) 37 | 38 | targets = targets.numpy() 39 | targets_pred = targets_pred.numpy() 40 | 41 | # one-vs-rest 42 | avg_p = [average_precision_score(targets[:, c], targets_pred[:, c], average=None) for c in range(num_cls)] 43 | try: 44 | roc_aucs = [roc_auc_score(targets[:, c], targets_pred[:, c], average=None) for c in range(num_cls)] 45 | except ValueError: 46 | logger.warning('Weird... Some classes never occured in targets. Do not trust the metrics.') 47 | roc_aucs = np.array([0.5]) 48 | avg_p = np.array([0]) 49 | 50 | metrics_dict['mAP'] = np.mean(avg_p) 51 | metrics_dict['mROCAUC'] = np.mean(roc_aucs) 52 | # Percent point function (ppf) (inverse of cdf — percentiles). 
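# d-prime here is derived from the mean ROC-AUC: dprime = sqrt(2) * ppf(mROCAUC).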
53 | metrics_dict['dprime'] = scipy.stats.norm().ppf(metrics_dict['mROCAUC']) * np.sqrt(2) 54 | 55 | return metrics_dict 56 | 57 | 58 | if __name__ == '__main__': 59 | targets = torch.tensor([3, 3, 1, 2, 1, 0]) 60 | outputs = torch.tensor([ 61 | [1.2, 1.3, 1.1, 1.5], 62 | [1.3, 1.4, 1.0, 1.1], 63 | [1.5, 1.1, 1.4, 1.3], 64 | [1.0, 1.2, 1.4, 1.5], 65 | [1.2, 1.3, 1.1, 1.1], 66 | [1.2, 1.1, 1.1, 1.1], 67 | ]).float() 68 | metrics_dict = metrics(targets, outputs, topk=(1, 3)) 69 | print(metrics_dict) 70 | -------------------------------------------------------------------------------- /text_to_audio/Make_An_Audio/ldm/modules/losses_audio/vggishish/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | 5 | class VGGishish(nn.Module): 6 | 7 | def __init__(self, conv_layers, use_bn, num_classes): 8 | ''' 9 | Mostly from 10 | https://pytorch.org/vision/0.8/_modules/torchvision/models/vgg.html 11 | ''' 12 | super().__init__() 13 | layers = [] 14 | in_channels = 1 15 | 16 | # a list of channels with 'MP' (maxpool) from config 17 | for v in conv_layers: 18 | if v == 'MP': 19 | layers += [nn.MaxPool2d(kernel_size=2, stride=2)] 20 | else: 21 | conv2d = nn.Conv2d(in_channels, v, kernel_size=3, padding=1, stride=1) 22 | if use_bn: 23 | layers += [conv2d, nn.BatchNorm2d(v), nn.ReLU(inplace=True)] 24 | else: 25 | layers += [conv2d, nn.ReLU(inplace=True)] 26 | in_channels = v 27 | self.features = nn.Sequential(*layers) 28 | 29 | self.avgpool = nn.AdaptiveAvgPool2d((5, 10)) 30 | 31 | self.flatten = nn.Flatten() 32 | self.classifier = nn.Sequential( 33 | nn.Linear(512 * 5 * 10, 4096), 34 | nn.ReLU(True), 35 | nn.Linear(4096, 4096), 36 | nn.ReLU(True), 37 | nn.Linear(4096, num_classes) 38 | ) 39 | 40 | # weight init 41 | self.reset_parameters() 42 | 43 | def forward(self, x): 44 | # adding channel dim for conv2d (B, 1, F, T) <- 45 | x = x.unsqueeze(1) 46 | # backbone (B, 1, 5, 53) <- (B, 1, 80, 860) 47 | x = self.features(x) 48 | # adaptive avg pooling (B, 1, 5, 10) <- (B, 1, 5, 53) – if no MP is used as the end of VGG 49 | x = self.avgpool(x) 50 | # flatten 51 | x = self.flatten(x) 52 | # classify 53 | x = self.classifier(x) 54 | return x 55 | 56 | def reset_parameters(self): 57 | for m in self.modules(): 58 | if isinstance(m, nn.Conv2d): 59 | nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') 60 | if m.bias is not None: 61 | nn.init.constant_(m.bias, 0) 62 | elif isinstance(m, nn.BatchNorm2d): 63 | nn.init.constant_(m.weight, 1) 64 | nn.init.constant_(m.bias, 0) 65 | elif isinstance(m, nn.Linear): 66 | nn.init.normal_(m.weight, 0, 0.01) 67 | nn.init.constant_(m.bias, 0) 68 | 69 | 70 | if __name__ == '__main__': 71 | num_classes = 309 72 | inputs = torch.rand(3, 80, 848) 73 | conv_layers = [64, 64, 'MP', 128, 128, 'MP', 256, 256, 256, 'MP', 512, 512, 512, 'MP', 512, 512, 512] 74 | # conv_layers = [64, 'MP', 128, 'MP', 256, 256, 'MP', 512, 512, 'MP'] 75 | model = VGGishish(conv_layers, use_bn=False, num_classes=num_classes) 76 | outputs = model(inputs) 77 | print(outputs.shape) 78 | -------------------------------------------------------------------------------- /text_to_audio/Make_An_Audio/useful_ckpts/CLAP/config.yml: -------------------------------------------------------------------------------- 1 | # TEXT ENCODER CONFIG 2 | text_model: 'bert-base-uncased' 3 | text_len: 100 4 | transformer_embed_dim: 768 5 | freeze_text_encoder_weights: True 6 | 7 | # AUDIO ENCODER CONFIG 8 | audioenc_name: 'Cnn14' 9 
| out_emb: 2048 10 | sampling_rate: 44100 11 | duration: 9 12 | fmin: 50 13 | fmax: 14000 14 | n_fft: 1028 15 | hop_size: 320 16 | mel_bins: 64 17 | window_size: 1024 18 | 19 | # PROJECTION SPACE CONFIG 20 | d_proj: 1024 21 | temperature: 0.003 22 | 23 | # TRAINING AND EVALUATION CONFIG 24 | num_classes: 527 25 | batch_size: 1024 26 | demo: False 27 | -------------------------------------------------------------------------------- /text_to_audio/Make_An_Audio/vocoder/bigvgan/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIGC-Audio/AudioGPT/a674543c537bb3a60b8f521df76b97a627c1c379/text_to_audio/Make_An_Audio/vocoder/bigvgan/__init__.py -------------------------------------------------------------------------------- /text_to_audio/Make_An_Audio/vocoder/bigvgan/alias_free_torch/__init__.py: -------------------------------------------------------------------------------- 1 | # Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0 2 | # LICENSE is in incl_licenses directory. 3 | 4 | from .filter import * 5 | from .resample import * 6 | from .act import * -------------------------------------------------------------------------------- /text_to_audio/Make_An_Audio/vocoder/bigvgan/alias_free_torch/act.py: -------------------------------------------------------------------------------- 1 | # Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0 2 | # LICENSE is in incl_licenses directory. 3 | 4 | import torch.nn as nn 5 | from .resample import UpSample1d, DownSample1d 6 | 7 | 8 | class Activation1d(nn.Module): 9 | def __init__(self, 10 | activation, 11 | up_ratio: int = 2, 12 | down_ratio: int = 2, 13 | up_kernel_size: int = 12, 14 | down_kernel_size: int = 12): 15 | super().__init__() 16 | self.up_ratio = up_ratio 17 | self.down_ratio = down_ratio 18 | self.act = activation 19 | self.upsample = UpSample1d(up_ratio, up_kernel_size) 20 | self.downsample = DownSample1d(down_ratio, down_kernel_size) 21 | 22 | # x: [B,C,T] 23 | def forward(self, x): 24 | x = self.upsample(x) 25 | x = self.act(x) 26 | x = self.downsample(x) 27 | 28 | return x -------------------------------------------------------------------------------- /text_to_audio/Make_An_Audio/vocoder/bigvgan/alias_free_torch/resample.py: -------------------------------------------------------------------------------- 1 | # Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0 2 | # LICENSE is in incl_licenses directory. 
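# The classes below implement anti-aliased resampling: UpSample1d interpolates with a
# Kaiser-windowed sinc filter via a grouped transposed convolution, and DownSample1d
# applies a matching low-pass filter while striding by the ratio.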
3 | 4 | import torch.nn as nn 5 | from torch.nn import functional as F 6 | from .filter import LowPassFilter1d 7 | from .filter import kaiser_sinc_filter1d 8 | 9 | 10 | class UpSample1d(nn.Module): 11 | def __init__(self, ratio=2, kernel_size=None): 12 | super().__init__() 13 | self.ratio = ratio 14 | self.kernel_size = int(6 * ratio // 2) * 2 if kernel_size is None else kernel_size 15 | self.stride = ratio 16 | self.pad = self.kernel_size // ratio - 1 17 | self.pad_left = self.pad * self.stride + (self.kernel_size - self.stride) // 2 18 | self.pad_right = self.pad * self.stride + (self.kernel_size - self.stride + 1) // 2 19 | filter = kaiser_sinc_filter1d(cutoff=0.5 / ratio, 20 | half_width=0.6 / ratio, 21 | kernel_size=self.kernel_size) 22 | self.register_buffer("filter", filter) 23 | 24 | # x: [B, C, T] 25 | def forward(self, x): 26 | _, C, _ = x.shape 27 | 28 | x = F.pad(x, (self.pad, self.pad), mode='replicate') 29 | x = self.ratio * F.conv_transpose1d( 30 | x, self.filter.expand(C, -1, -1), stride=self.stride, groups=C) 31 | x = x[..., self.pad_left:-self.pad_right] 32 | 33 | return x 34 | 35 | 36 | class DownSample1d(nn.Module): 37 | def __init__(self, ratio=2, kernel_size=None): 38 | super().__init__() 39 | self.ratio = ratio 40 | self.kernel_size = int(6 * ratio // 2) * 2 if kernel_size is None else kernel_size 41 | self.lowpass = LowPassFilter1d(cutoff=0.5 / ratio, 42 | half_width=0.6 / ratio, 43 | stride=ratio, 44 | kernel_size=self.kernel_size) 45 | 46 | def forward(self, x): 47 | xx = self.lowpass(x) 48 | 49 | return xx -------------------------------------------------------------------------------- /text_to_audio/Make_An_Audio/vocoder/logs/hifi_0127/args.yml: -------------------------------------------------------------------------------- 1 | adam_b1: 0.8 2 | adam_b2: 0.99 3 | batch_size: 24 4 | dist_config: 5 | dist_backend: nccl 6 | dist_url: tcp://localhost:54321 7 | world_size: 1 8 | fmax: 8000 9 | fmax_for_loss: null 10 | fmin: 0 11 | hop_size: 256 12 | learning_rate: 0.0002 13 | lr_decay: 0.999 14 | n_fft: 1024 15 | num_gpus: 0 16 | num_mels: 80 17 | num_workers: 4 18 | resblock: '1' 19 | resblock_dilation_sizes: 20 | - - 1 21 | - 3 22 | - 5 23 | - - 1 24 | - 3 25 | - 5 26 | - - 1 27 | - 3 28 | - 5 29 | resblock_kernel_sizes: 30 | - 3 31 | - 7 32 | - 11 33 | sampling_rate: 16000 34 | seed: 1234 35 | segment_size: 8192 36 | upsample_initial_channel: 512 37 | upsample_kernel_sizes: 38 | - 16 39 | - 16 40 | - 4 41 | - 4 42 | upsample_rates: 43 | - 8 44 | - 8 45 | - 2 46 | - 2 47 | win_size: 1024 48 | -------------------------------------------------------------------------------- /text_to_audio/Make_An_Audio/wav_evaluation/models/__init__.py: -------------------------------------------------------------------------------- 1 | from . import clap 2 | from . import audio 3 | from . 
import utils -------------------------------------------------------------------------------- /text_to_audio/Make_An_Audio/wav_evaluation/models/utils.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import yaml 3 | import sys 4 | 5 | def read_config_as_args(config_path,args=None,is_config_str=False): 6 | return_dict = {} 7 | 8 | if config_path is not None: 9 | if is_config_str: 10 | yml_config = yaml.load(config_path, Loader=yaml.FullLoader) 11 | else: 12 | with open(config_path, "r") as f: 13 | yml_config = yaml.load(f, Loader=yaml.FullLoader) 14 | 15 | if args != None: 16 | for k, v in yml_config.items(): 17 | if k in args.__dict__: 18 | args.__dict__[k] = v 19 | else: 20 | sys.stderr.write("Ignored unknown parameter {} in yaml.\n".format(k)) 21 | else: 22 | for k, v in yml_config.items(): 23 | return_dict[k] = v 24 | 25 | args = args if args != None else return_dict 26 | return argparse.Namespace(**args) 27 | --------------------------------------------------------------------------------
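As a closing illustration (not part of the repository), here is a minimal usage sketch for read_config_as_args as defined above; the inline YAML string and the import path are assumptions made for the example, and a real run would instead point it at a config such as useful_ckpts/CLAP/config.yml.

from wav_evaluation.models.utils import read_config_as_args

# Tiny illustrative config; any YAML mapping is handled the same way.
yaml_str = """
text_model: 'bert-base-uncased'
sampling_rate: 44100
d_proj: 1024
"""

# With is_config_str=True the string itself is parsed as YAML; with args=None every
# top-level key becomes an attribute of the returned argparse.Namespace.
args = read_config_as_args(yaml_str, is_config_str=True)
print(args.text_model, args.sampling_rate, args.d_proj)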