├── .gitignore ├── LICENSE ├── NeuralSeq ├── LICENSE ├── README.md ├── configs │ ├── config_base.yaml │ ├── singing │ │ ├── base.yaml │ │ └── fs2.yaml │ └── tts │ │ ├── base.yaml │ │ ├── base_zh.yaml │ │ ├── emotion │ │ ├── base_text2mel.yaml │ │ └── pre_align.py │ │ ├── fs2.yaml │ │ ├── hifigan.yaml │ │ ├── libritts │ │ ├── base_text2mel.yaml │ │ ├── fs2.yaml │ │ ├── pre_align.py │ │ └── pwg.yaml │ │ ├── lj │ │ ├── base_mel2wav.yaml │ │ ├── base_text2mel.yaml │ │ ├── fs2.yaml │ │ ├── hifigan.yaml │ │ └── pwg.yaml │ │ └── pwg.yaml ├── data_gen │ └── tts │ │ ├── base_binarizer.py │ │ ├── base_binarizer_emotion.py │ │ ├── base_preprocess.py │ │ ├── binarizer_zh.py │ │ ├── data_gen_utils.py │ │ ├── emotion │ │ ├── audio.py │ │ ├── inference.py │ │ ├── model.py │ │ ├── params_data.py │ │ ├── params_model.py │ │ └── test_emotion.py │ │ ├── txt_processors │ │ ├── __init__.py │ │ ├── base_text_processor.py │ │ ├── en.py │ │ ├── zh.py │ │ └── zh_g2pM.py │ │ └── wav_processors │ │ ├── __init__.py │ │ ├── base_processor.py │ │ └── common_processors.py ├── egs │ ├── datasets │ │ └── audio │ │ │ ├── emotion │ │ │ ├── base_text2mel.yaml │ │ │ └── pre_align.py │ │ │ ├── libritts │ │ │ ├── base_text2mel.yaml │ │ │ ├── fs2.yaml │ │ │ ├── pre_align.py │ │ │ └── pwg.yaml │ │ │ ├── lj │ │ │ ├── base_mel2wav.yaml │ │ │ ├── preprocess.py │ │ │ └── pwg.yaml │ │ │ └── vctk │ │ │ ├── base_mel2wav.yaml │ │ │ ├── fs2.yaml │ │ │ ├── pre_align.py │ │ │ └── pwg.yaml │ └── egs_bases │ │ ├── config_base.yaml │ │ ├── svs │ │ ├── base.yaml │ │ ├── lj_ds_beta6.yaml │ │ ├── midi │ │ │ ├── cascade │ │ │ │ └── opencs │ │ │ │ │ ├── aux_rel.yaml │ │ │ │ │ ├── ds60_rel.yaml │ │ │ │ │ └── opencpop_statis.yaml │ │ │ ├── e2e │ │ │ │ ├── opencpop │ │ │ │ │ ├── ds1000-10dil.yaml │ │ │ │ │ ├── ds1000.yaml │ │ │ │ │ └── ds100_adj_rel.yaml │ │ │ │ └── popcs │ │ │ │ │ └── ds100_adj_rel.yaml │ │ │ └── pe.yaml │ │ ├── popcs_ds_beta6.yaml │ │ ├── popcs_ds_beta6_offline.yaml │ │ └── popcs_fs2.yaml │ │ └── tts │ │ ├── base.yaml │ │ ├── base_zh.yaml │ │ ├── fs2.yaml │ │ ├── fs2_adv.yaml │ │ ├── ps.yaml │ │ ├── ps_flow.yaml │ │ ├── ps_flow_small.yaml │ │ └── vocoder │ │ ├── base.yaml │ │ ├── hifigan.yaml │ │ └── pwg.yaml ├── gitattributes ├── inference │ ├── svs │ │ ├── base_svs_infer.py │ │ ├── ds_cascade.py │ │ ├── ds_e2e.py │ │ └── opencpop │ │ │ ├── cpop_pinyin2ph.txt │ │ │ └── map.py │ └── tts │ │ ├── GenerSpeech.py │ │ ├── PortaSpeech.py │ │ └── base_tts_infer.py ├── modules │ ├── GenerSpeech │ │ ├── config │ │ │ └── generspeech.yaml │ │ ├── model │ │ │ ├── generspeech.py │ │ │ ├── glow_modules.py │ │ │ ├── mixstyle.py │ │ │ ├── prosody_util.py │ │ │ └── wavenet.py │ │ └── task │ │ │ ├── dataset.py │ │ │ └── generspeech.py │ ├── __init__.py │ ├── commons │ │ ├── align_ops.py │ │ ├── common_layers.py │ │ ├── conv.py │ │ ├── espnet_positional_embedding.py │ │ ├── normalizing_flow │ │ │ ├── glow_modules.py │ │ │ ├── res_flow.py │ │ │ └── utils.py │ │ ├── rel_transformer.py │ │ ├── ssim.py │ │ ├── transformer.py │ │ └── wavenet.py │ ├── diff │ │ ├── candidate_decoder.py │ │ ├── diffusion.py │ │ ├── net.py │ │ └── shallow_diffusion_tts.py │ ├── diffsinger_midi │ │ └── fs2.py │ ├── fastspeech │ │ ├── fs2.py │ │ ├── pe.py │ │ └── tts_modules.py │ ├── hifigan │ │ ├── hifigan.py │ │ └── mel_utils.py │ ├── parallel_wavegan │ │ ├── __init__.py │ │ ├── layers │ │ │ ├── __init__.py │ │ │ ├── causal_conv.py │ │ │ ├── pqmf.py │ │ │ ├── residual_block.py │ │ │ ├── residual_stack.py │ │ │ ├── tf_layers.py │ │ │ └── upsample.py │ │ ├── losses │ │ │ ├── 
__init__.py │ │ │ └── stft_loss.py │ │ ├── models │ │ │ ├── __init__.py │ │ │ ├── melgan.py │ │ │ ├── parallel_wavegan.py │ │ │ └── source.py │ │ ├── optimizers │ │ │ ├── __init__.py │ │ │ └── radam.py │ │ ├── stft_loss.py │ │ └── utils │ │ │ ├── __init__.py │ │ │ └── utils.py │ └── syntaspeech │ │ ├── multi_window_disc.py │ │ ├── syntactic_graph_buider.py │ │ ├── syntactic_graph_encoder.py │ │ └── syntaspeech.py ├── tasks │ ├── base_task.py │ ├── run.py │ ├── svs │ │ ├── __init__.py │ │ ├── diffsinger_task.py │ │ ├── diffspeech_task.py │ │ └── task.py │ ├── tts │ │ ├── dataset_utils.py │ │ ├── fs2.py │ │ ├── fs2_adv.py │ │ ├── fs2_utils.py │ │ ├── pe.py │ │ ├── ps.py │ │ ├── ps_adv.py │ │ ├── ps_flow.py │ │ ├── synta.py │ │ ├── tts.py │ │ ├── tts_base.py │ │ └── tts_utils.py │ └── vocoder │ │ ├── dataset_utils.py │ │ └── vocoder_base.py ├── utils │ ├── __init__.py │ ├── audio.py │ ├── ckpt_utils.py │ ├── cwt.py │ ├── dtw.py │ ├── hparams.py │ ├── indexed_datasets.py │ ├── multiprocess_utils.py │ ├── os_utils.py │ ├── pitch_utils.py │ ├── pl_utils.py │ ├── plot.py │ ├── text_encoder.py │ ├── text_norm.py │ ├── training_utils.py │ └── tts_utils.py └── vocoders │ ├── __init__.py │ ├── base_vocoder.py │ ├── hifigan.py │ ├── pwg.py │ └── vocoder_utils.py ├── README.md ├── assets ├── 2bf90e35.wav ├── 5d67d1b9.wav ├── 7cb0d24f.wav ├── 7ef0ec0b.wav ├── README.md ├── Track 4.wav ├── a-group-of-sheep-are-baaing.wav ├── a2i.png ├── asr.png ├── b973e878.wav ├── detection.png ├── drums-and-music-playing-with-a-man-speaking.wav ├── fd5cf55e.wav ├── i2a-1.png ├── i2a-2.png ├── inpaint-1.png ├── inpaint-2.png ├── m2b.png ├── mix1.wav ├── sound_extraction.png ├── style_transfer_tts.png ├── t2a.png ├── t2i.png ├── t2s.png ├── tsd.png └── tts.png ├── audio-chatgpt.py ├── audio_detection ├── __init__.py ├── audio_infer │ ├── __init__.py │ ├── metadata │ │ ├── black_list │ │ │ ├── groundtruth_weak_label_evaluation_set.csv │ │ │ └── groundtruth_weak_label_testing_set.csv │ │ └── class_labels_indices.csv │ ├── pytorch │ │ ├── evaluate.py │ │ ├── finetune_template.py │ │ ├── inference.py │ │ ├── losses.py │ │ ├── main.py │ │ ├── models.py │ │ └── pytorch_utils.py │ ├── results │ │ └── YDlWd7Wmdi1E.png │ └── utils │ │ ├── config.py │ │ ├── crash.py │ │ ├── create_black_list.py │ │ ├── create_indexes.py │ │ ├── data_generator.py │ │ ├── dataset.py │ │ ├── plot_for_paper.py │ │ ├── plot_statistics.py │ │ └── utilities.py └── target_sound_detection │ └── src │ ├── models.py │ └── utils.py ├── audio_to_text ├── __init__.py ├── captioning │ ├── __init__.py │ ├── models │ │ ├── __init__.py │ │ ├── base_model.py │ │ ├── decoder.py │ │ ├── encoder.py │ │ ├── transformer_model.py │ │ └── utils.py │ └── utils │ │ ├── README.md │ │ ├── __init__.py │ │ ├── bert │ │ ├── create_sent_embedding.py │ │ └── create_word_embedding.py │ │ ├── build_vocab.py │ │ ├── build_vocab_ltp.py │ │ ├── build_vocab_spacy.py │ │ ├── eval_round_robin.py │ │ ├── fasttext │ │ └── create_word_embedding.py │ │ ├── lr_scheduler.py │ │ ├── model_eval_diff.py │ │ ├── predict_nn.py │ │ ├── remove_optimizer.py │ │ ├── report_results.py │ │ ├── tokenize_caption.py │ │ ├── train_util.py │ │ └── word2vec │ │ └── create_word_embedding.py └── inference_waveform.py ├── download.sh ├── mono2binaural └── src │ ├── models.py │ ├── utils.py │ └── warping.py ├── requirements.txt ├── run.md ├── sound_extraction ├── model │ ├── LASSNet.py │ ├── film.py │ ├── modules.py │ ├── resunet_film.py │ └── text_encoder.py └── utils │ ├── create_mixtures.py │ ├── stft.py │ └── 
wav_io.py └── text_to_audio └── Make_An_Audio ├── configs ├── img_to_audio │ └── img2audio_args.yaml ├── inpaint │ └── txt2audio_args.yaml └── text_to_audio │ ├── clap_args.yaml │ ├── hifigan_args.yaml │ └── txt2audio_args.yaml ├── ldm ├── data │ └── extract_mel_spectrogram.py ├── lr_scheduler.py ├── models │ ├── autoencoder.py │ ├── autoencoder_multi.py │ └── diffusion │ │ ├── __init__.py │ │ ├── classifier.py │ │ ├── ddim.py │ │ ├── ddpm.py │ │ ├── ddpm_audio.py │ │ ├── ddpm_audio_inpaint.py │ │ └── plms.py ├── modules │ ├── attention.py │ ├── diffusionmodules │ │ ├── __init__.py │ │ ├── custom_openaimodel.py │ │ ├── model.py │ │ ├── openaimodel.py │ │ └── util.py │ ├── discriminator │ │ ├── model.py │ │ └── multi_window_disc.py │ ├── distributions │ │ ├── __init__.py │ │ └── distributions.py │ ├── ema.py │ ├── encoders │ │ ├── CLAP │ │ │ ├── CLAPWrapper.py │ │ │ ├── __init__.py │ │ │ ├── audio.py │ │ │ ├── clap.py │ │ │ ├── config.yml │ │ │ └── utils.py │ │ ├── __init__.py │ │ ├── modules.py │ │ └── open_clap │ │ │ ├── __init__.py │ │ │ ├── bert.py │ │ │ ├── bpe_simple_vocab_16e6.txt.gz │ │ │ ├── factory.py │ │ │ ├── feature_fusion.py │ │ │ ├── htsat.py │ │ │ ├── linear_probe.py │ │ │ ├── loss.py │ │ │ ├── model.py │ │ │ ├── model_configs │ │ │ ├── HTSAT-base.json │ │ │ ├── HTSAT-large.json │ │ │ ├── HTSAT-tiny-win-1536.json │ │ │ ├── HTSAT-tiny.json │ │ │ ├── PANN-10.json │ │ │ ├── PANN-14-fmax-18k.json │ │ │ ├── PANN-14-fmax-8k-20s.json │ │ │ ├── PANN-14-tiny-transformer.json │ │ │ ├── PANN-14-win-1536.json │ │ │ ├── PANN-14.json │ │ │ ├── PANN-6.json │ │ │ ├── RN101-quickgelu.json │ │ │ ├── RN101.json │ │ │ ├── RN50-quickgelu.json │ │ │ ├── RN50.json │ │ │ ├── RN50x16.json │ │ │ ├── RN50x4.json │ │ │ ├── ViT-B-16.json │ │ │ ├── ViT-B-32-quickgelu.json │ │ │ ├── ViT-B-32.json │ │ │ └── ViT-L-14.json │ │ │ ├── openai.py │ │ │ ├── pann_model.py │ │ │ ├── pretrained.py │ │ │ ├── timm_model.py │ │ │ ├── tokenizer.py │ │ │ ├── transform.py │ │ │ ├── utils.py │ │ │ └── version.py │ ├── image_degradation │ │ ├── __init__.py │ │ ├── bsrgan.py │ │ ├── bsrgan_light.py │ │ ├── utils │ │ │ └── test.png │ │ └── utils_image.py │ ├── losses_audio │ │ ├── __init__.py │ │ ├── contperceptual.py │ │ ├── contperceptual_dis.py │ │ ├── lpaps.py │ │ ├── vggishish │ │ │ ├── config │ │ │ │ ├── melception.yaml │ │ │ │ └── vggish.yaml │ │ │ ├── data │ │ │ │ ├── train_means_stds_melspec_10s_22050hz.txt │ │ │ │ ├── vggsound.csv │ │ │ │ ├── vggsound_test.txt │ │ │ │ ├── vggsound_train.txt │ │ │ │ └── vggsound_valid.txt │ │ │ ├── dataset.py │ │ │ ├── logger.py │ │ │ ├── loss.py │ │ │ ├── metrics.py │ │ │ ├── model.py │ │ │ ├── predict.py │ │ │ ├── train_melception.py │ │ │ ├── train_vggishish.py │ │ │ └── transforms.py │ │ └── vqperceptual.py │ └── x_transformer.py └── util.py ├── useful_ckpts └── CLAP │ └── config.yml ├── vocoder ├── bigvgan │ ├── __init__.py │ ├── activations.py │ ├── alias_free_torch │ │ ├── __init__.py │ │ ├── act.py │ │ ├── filter.py │ │ └── resample.py │ └── models.py ├── hifigan │ └── modules.py └── logs │ └── hifi_0127 │ └── args.yml └── wav_evaluation └── models ├── CLAPWrapper.py ├── __init__.py ├── audio.py ├── clap.py └── utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | # JetBrains PyCharm IDE 2 | .idea/ 3 | .github/ 4 | .circleci/ 5 | 6 | # Byte-compiled / optimized / DLL files 7 | *__pycache__/ 8 | __pycache__/ 9 | *.py[cod] 10 | *$py.class 11 | 12 | # C extensions 13 | *.so 14 | 15 | # macOS dir files 16 | .DS_Store 17 | 18 | 
# Distribution / packaging 19 | .Python 20 | env/ 21 | build/ 22 | develop-eggs/ 23 | dist/ 24 | downloads/ 25 | eggs/ 26 | .eggs/ 27 | lib/ 28 | lib64/ 29 | parts/ 30 | sdist/ 31 | var/ 32 | wheels/ 33 | *.egg-info/ 34 | .installed.cfg 35 | *.egg 36 | 37 | # Checkpoints 38 | checkpoints 39 | 40 | # PyInstaller 41 | # Usually these files are written by a python script from a template 42 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 43 | *.manifest 44 | *.spec 45 | 46 | # Installer logs 47 | pip-log.txt 48 | pip-delete-this-directory.txt 49 | 50 | # Unit test / coverage reports 51 | htmlcov/ 52 | .tox/ 53 | .coverage 54 | .coverage.* 55 | .cache 56 | nosetests.xml 57 | coverage.xml 58 | *.cover 59 | .hypothesis/ 60 | 61 | # Translations 62 | *.mo 63 | *.pot 64 | 65 | # Django stuff: 66 | *.log 67 | local_settings.py 68 | 69 | # Flask stuff: 70 | instance/ 71 | .webassets-cache 72 | 73 | # Scrapy stuff: 74 | .scrapy 75 | 76 | # Sphinx documentation 77 | docs/_build/ 78 | 79 | # PyBuilder 80 | target/ 81 | 82 | # Jupyter Notebook 83 | .ipynb_checkpoints 84 | 85 | # pyenv 86 | .python-version 87 | 88 | # celery beat schedule file 89 | celerybeat-schedule 90 | 91 | # SageMath parsed files 92 | *.sage.py 93 | 94 | # dotenv 95 | .env 96 | 97 | # virtualenv 98 | .venv 99 | venv/ 100 | ENV/ 101 | 102 | # Spyder project settings 103 | .spyderproject 104 | .spyproject 105 | 106 | # Rope project settings 107 | .ropeproject 108 | 109 | # mkdocs documentation 110 | /site 111 | 112 | # mypy 113 | .mypy_cache/ 114 | 115 | # Generated files 116 | /fairseq/temporal_convolution_tbc 117 | /fairseq/modules/*_layer/*_forward.cu 118 | /fairseq/modules/*_layer/*_backward.cu 119 | /fairseq/version.py 120 | 121 | # data 122 | data-bin/ 123 | 124 | # reranking 125 | /examples/reranking/rerank_data 126 | 127 | # Cython-generated C++ source files 128 | /fairseq/data/data_utils_fast.cpp 129 | /fairseq/data/token_block_utils_fast.cpp 130 | 131 | # VSCODE 132 | .vscode/ftp-sync.json 133 | .vscode/settings.json 134 | 135 | # Experimental Folder 136 | experimental/* 137 | 138 | # Weights and Biases logs 139 | wandb/ 140 | 141 | # Hydra artifacts 142 | nohup.out 143 | multirun 144 | outputs 145 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIGC-Audio/AudioGPT/a674543c537bb3a60b8f521df76b97a627c1c379/LICENSE -------------------------------------------------------------------------------- /NeuralSeq/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Jinglin Liu 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /NeuralSeq/README.md: -------------------------------------------------------------------------------- 1 | In this directory, we support FastSpeech, GenerSpeech, SyntaSpeech, DiffSinger -------------------------------------------------------------------------------- /NeuralSeq/configs/config_base.yaml: -------------------------------------------------------------------------------- 1 | # task 2 | binary_data_dir: '' 3 | work_dir: '' # experiment directory. 4 | infer: false # infer 5 | seed: 1234 6 | debug: false 7 | save_codes: 8 | - configs 9 | - modules 10 | - tasks 11 | - utils 12 | - usr 13 | 14 | ############# 15 | # dataset 16 | ############# 17 | ds_workers: 1 18 | test_num: 100 19 | valid_num: 100 20 | endless_ds: false 21 | sort_by_len: true 22 | 23 | ######### 24 | # train and eval 25 | ######### 26 | load_ckpt: '' 27 | save_ckpt: true 28 | save_best: false 29 | num_ckpt_keep: 3 30 | clip_grad_norm: 0 31 | accumulate_grad_batches: 1 32 | log_interval: 100 33 | num_sanity_val_steps: 5 # steps of validation at the beginning 34 | check_val_every_n_epoch: 10 35 | val_check_interval: 2000 36 | max_epochs: 1000 37 | max_updates: 160000 38 | max_tokens: 31250 39 | max_sentences: 100000 40 | max_eval_tokens: -1 41 | max_eval_sentences: -1 42 | test_input_dir: '' 43 | -------------------------------------------------------------------------------- /NeuralSeq/configs/singing/base.yaml: -------------------------------------------------------------------------------- 1 | base_config: 2 | - configs/tts/base.yaml 3 | - configs/tts/base_zh.yaml 4 | 5 | 6 | datasets: [] 7 | test_prefixes: [] 8 | test_num: 0 9 | valid_num: 0 10 | 11 | pre_align_cls: data_gen.singing.pre_align.SingingPreAlign 12 | binarizer_cls: data_gen.singing.binarize.SingingBinarizer 13 | pre_align_args: 14 | use_tone: false # for ZH 15 | forced_align: mfa 16 | use_sox: true 17 | hop_size: 128 # Hop size. 18 | fft_size: 512 # FFT size. 19 | win_size: 512 # FFT size. 20 | max_frames: 8000 21 | fmin: 50 # Minimum freq in mel basis calculation. 22 | fmax: 11025 # Maximum frequency in mel basis calculation. 
23 | pitch_type: frame 24 | 25 | hidden_size: 256 26 | mel_loss: "ssim:0.5|l1:0.5" 27 | lambda_f0: 0.0 28 | lambda_uv: 0.0 29 | lambda_energy: 0.0 30 | lambda_ph_dur: 0.0 31 | lambda_sent_dur: 0.0 32 | lambda_word_dur: 0.0 33 | predictor_grad: 0.0 34 | use_spk_embed: true 35 | use_spk_id: false 36 | 37 | max_tokens: 20000 38 | max_updates: 400000 39 | num_spk: 100 40 | save_f0: true 41 | use_gt_dur: true 42 | use_gt_f0: true 43 | -------------------------------------------------------------------------------- /NeuralSeq/configs/singing/fs2.yaml: -------------------------------------------------------------------------------- 1 | base_config: 2 | - configs/tts/fs2.yaml 3 | - configs/singing/base.yaml 4 | -------------------------------------------------------------------------------- /NeuralSeq/configs/tts/base.yaml: -------------------------------------------------------------------------------- 1 | # task 2 | base_config: configs/config_base.yaml 3 | task_cls: '' 4 | ############# 5 | # dataset 6 | ############# 7 | raw_data_dir: '' 8 | processed_data_dir: '' 9 | binary_data_dir: '' 10 | dict_dir: '' 11 | pre_align_cls: '' 12 | binarizer_cls: data_gen.tts.base_binarizer.BaseBinarizer 13 | pre_align_args: 14 | use_tone: true # for ZH 15 | forced_align: mfa 16 | use_sox: false 17 | txt_processor: en 18 | allow_no_txt: false 19 | denoise: false 20 | binarization_args: 21 | shuffle: false 22 | with_txt: true 23 | with_wav: false 24 | with_align: true 25 | with_spk_embed: true 26 | with_f0: true 27 | with_f0cwt: true 28 | 29 | loud_norm: false 30 | endless_ds: true 31 | reset_phone_dict: true 32 | 33 | test_num: 100 34 | valid_num: 100 35 | max_frames: 1550 36 | max_input_tokens: 1550 37 | audio_num_mel_bins: 80 38 | audio_sample_rate: 22050 39 | hop_size: 256 # For 22050Hz, 275 ~= 12.5 ms (0.0125 * sample_rate) 40 | win_size: 1024 # For 22050Hz, 1100 ~= 50 ms (If None, win_size: fft_size) (0.05 * sample_rate) 41 | fmin: 80 # Set this to 55 if your speaker is male! if female, 95 should help taking off noise. (To test depending on dataset. Pitch info: male~[65, 260], female~[100, 525]) 42 | fmax: 7600 # To be increased/reduced depending on data. 
43 | fft_size: 1024 # Extra window size is filled with 0 paddings to match this parameter 44 | min_level_db: -100 45 | num_spk: 1 46 | mel_vmin: -6 47 | mel_vmax: 1.5 48 | ds_workers: 4 49 | 50 | ######### 51 | # model 52 | ######### 53 | dropout: 0.1 54 | enc_layers: 4 55 | dec_layers: 4 56 | hidden_size: 384 57 | num_heads: 2 58 | prenet_dropout: 0.5 59 | prenet_hidden_size: 256 60 | stop_token_weight: 5.0 61 | enc_ffn_kernel_size: 9 62 | dec_ffn_kernel_size: 9 63 | ffn_act: gelu 64 | ffn_padding: 'SAME' 65 | 66 | 67 | ########### 68 | # optimization 69 | ########### 70 | lr: 2.0 71 | warmup_updates: 8000 72 | optimizer_adam_beta1: 0.9 73 | optimizer_adam_beta2: 0.98 74 | weight_decay: 0 75 | clip_grad_norm: 1 76 | 77 | 78 | ########### 79 | # train and eval 80 | ########### 81 | max_tokens: 30000 82 | max_sentences: 100000 83 | max_eval_sentences: 1 84 | max_eval_tokens: 60000 85 | train_set_name: 'train' 86 | valid_set_name: 'valid' 87 | test_set_name: 'test' 88 | vocoder: pwg 89 | vocoder_ckpt: '' 90 | profile_infer: false 91 | out_wav_norm: false 92 | save_gt: false 93 | save_f0: false 94 | gen_dir_name: '' 95 | use_denoise: false 96 | -------------------------------------------------------------------------------- /NeuralSeq/configs/tts/base_zh.yaml: -------------------------------------------------------------------------------- 1 | pre_align_args: 2 | txt_processor: zh_g2pM 3 | binarizer_cls: data_gen.tts.binarizer_zh.ZhBinarizer -------------------------------------------------------------------------------- /NeuralSeq/configs/tts/emotion/base_text2mel.yaml: -------------------------------------------------------------------------------- 1 | raw_data_dir: 'data/raw/ESD' 2 | processed_data_dir: 'data/processed/emotion' 3 | binary_data_dir: 'data/binary/emotion' 4 | pre_align_cls: configs.tts.emotion.pre_align.EmoPreAlign 5 | audio_sample_rate: 16000 6 | binarization_args: 7 | shuffle: true 8 | binarizer_cls: data_gen.tts.base_binarizer_emotion.EmotionBinarizer 9 | use_spk_id: true 10 | test_num: 200 11 | num_spk: 10 12 | pitch_type: frame 13 | min_frames: 128 14 | num_test_samples: 30 15 | mel_loss: "ssim:0.5|l1:0.5" 16 | vocoder_ckpt: '' 17 | use_emotion: true -------------------------------------------------------------------------------- /NeuralSeq/configs/tts/emotion/pre_align.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from data_gen.tts.base_preprocess import BasePreprocessor 4 | import glob 5 | import re 6 | 7 | class EmoPreAlign(BasePreprocessor): 8 | 9 | def meta_data(self): 10 | spks = ['0012', '0011', '0013', '0014', '0015', '0016', '0017', '0018', '0019', '0020'] 11 | pattern = re.compile('[\t\n ]+') 12 | for spk in spks: 13 | for line in open(f"{self.raw_data_dir}/{spk}/{spk}.txt", 'r'): # 打开文件 14 | line = re.sub(pattern, ' ', line) 15 | if line == ' ': continue 16 | split_ = line.split(' ') 17 | txt = ' '.join(split_[1: -2]) 18 | item_name = split_[0] 19 | emotion = split_[-2] 20 | wav_fn = f'{self.raw_data_dir}/{spk}/{emotion}/{item_name}.wav' 21 | yield item_name, wav_fn, txt, spk, emotion 22 | 23 | 24 | if __name__ == "__main__": 25 | EmoPreAlign().process() 26 | -------------------------------------------------------------------------------- /NeuralSeq/configs/tts/fs2.yaml: -------------------------------------------------------------------------------- 1 | base_config: configs/tts/base.yaml 2 | task_cls: tasks.tts.fs2.FastSpeech2Task 3 | 4 | # model 5 | hidden_size: 256 6 | dropout: 0.1 7 | 
encoder_type: fft # fft|tacotron|tacotron2|conformer 8 | encoder_K: 8 # for tacotron encoder 9 | decoder_type: fft # fft|rnn|conv|conformer 10 | use_pos_embed: true 11 | 12 | # duration 13 | predictor_hidden: -1 14 | predictor_kernel: 5 15 | predictor_layers: 2 16 | dur_predictor_kernel: 3 17 | dur_predictor_layers: 2 18 | predictor_dropout: 0.5 19 | 20 | # pitch and energy 21 | use_pitch_embed: true 22 | pitch_type: ph # frame|ph|cwt 23 | use_uv: true 24 | cwt_hidden_size: 128 25 | cwt_layers: 2 26 | cwt_loss: l1 27 | cwt_add_f0_loss: false 28 | cwt_std_scale: 0.8 29 | 30 | pitch_ar: false 31 | #pitch_embed_type: 0q 32 | pitch_loss: 'l1' # l1|l2|ssim 33 | pitch_norm: log 34 | use_energy_embed: false 35 | 36 | # reference encoder and speaker embedding 37 | use_spk_id: false 38 | use_split_spk_id: false 39 | use_spk_embed: false 40 | use_var_enc: false 41 | lambda_commit: 0.25 42 | ref_norm_layer: bn 43 | pitch_enc_hidden_stride_kernel: 44 | - 0,2,5 # conv_hidden_size, conv_stride, conv_kernel_size. conv_hidden_size=0: use hidden_size 45 | - 0,2,5 46 | - 0,2,5 47 | dur_enc_hidden_stride_kernel: 48 | - 0,2,3 # conv_hidden_size, conv_stride, conv_kernel_size. conv_hidden_size=0: use hidden_size 49 | - 0,2,3 50 | - 0,1,3 51 | 52 | 53 | # mel 54 | mel_loss: l1:0.5|ssim:0.5 # l1|l2|gdl|ssim or l1:0.5|ssim:0.5 55 | 56 | # loss lambda 57 | lambda_f0: 1.0 58 | lambda_uv: 1.0 59 | lambda_energy: 0.1 60 | lambda_ph_dur: 1.0 61 | lambda_sent_dur: 1.0 62 | lambda_word_dur: 1.0 63 | predictor_grad: 0.1 64 | 65 | # train and eval 66 | pretrain_fs_ckpt: '' 67 | warmup_updates: 2000 68 | max_tokens: 32000 69 | max_sentences: 100000 70 | max_eval_sentences: 1 71 | max_updates: 120000 72 | num_valid_plots: 5 73 | num_test_samples: 0 74 | test_ids: [] 75 | use_gt_dur: false 76 | use_gt_f0: false 77 | 78 | # exp 79 | dur_loss: mse # huber|mol 80 | norm_type: gn -------------------------------------------------------------------------------- /NeuralSeq/configs/tts/hifigan.yaml: -------------------------------------------------------------------------------- 1 | base_config: configs/tts/pwg.yaml 2 | task_cls: tasks.vocoder.hifigan.HifiGanTask 3 | resblock: "1" 4 | adam_b1: 0.8 5 | adam_b2: 0.99 6 | upsample_rates: [ 8,8,2,2 ] 7 | upsample_kernel_sizes: [ 16,16,4,4 ] 8 | upsample_initial_channel: 128 9 | resblock_kernel_sizes: [ 3,7,11 ] 10 | resblock_dilation_sizes: [ [ 1,3,5 ], [ 1,3,5 ], [ 1,3,5 ] ] 11 | 12 | lambda_mel: 45.0 13 | 14 | max_samples: 8192 15 | max_sentences: 16 16 | 17 | generator_params: 18 | lr: 0.0002 # Generator's learning rate. 19 | aux_context_window: 0 # Context window size for auxiliary feature. 20 | discriminator_optimizer_params: 21 | lr: 0.0002 # Discriminator's learning rate. 
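[Note] The YAML files above are layered through their base_config entries (for example configs/tts/fs2.yaml builds on configs/tts/base.yaml, which in turn builds on configs/config_base.yaml). As a rough illustration of how such a chain can be flattened into one hyperparameter dict — this is a sketch only, not the repository's actual loader, which lives in NeuralSeq/utils/hparams.py and also handles CLI overrides — a recursive merge could look like this:

# Illustrative sketch: resolve a `base_config` chain into a flat dict.
# Assumption: later parents and the child file override earlier values,
# and paths starting with './' are relative to the current YAML file.
import os
import yaml  # PyYAML


def _deep_merge(base, new):
    # Values from `new` override `base`; nested dicts are merged recursively.
    for k, v in new.items():
        if isinstance(v, dict) and isinstance(base.get(k), dict):
            _deep_merge(base[k], v)
        else:
            base[k] = v
    return base


def load_config(path):
    with open(path) as f:
        cfg = yaml.safe_load(f) or {}
    parents = cfg.pop('base_config', [])
    if isinstance(parents, str):
        parents = [parents]
    merged = {}
    for parent in parents:
        parent_path = os.path.join(os.path.dirname(path), parent) if parent.startswith('.') else parent
        _deep_merge(merged, load_config(parent_path))
    return _deep_merge(merged, cfg)  # the child file wins over its parents

# e.g. load_config('configs/tts/lj/fs2.yaml')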
-------------------------------------------------------------------------------- /NeuralSeq/configs/tts/libritts/base_text2mel.yaml: -------------------------------------------------------------------------------- 1 | raw_data_dir: 'data/raw/LibriTTS' 2 | processed_data_dir: 'data/processed/libritts' 3 | binary_data_dir: 'data/binary/libritts' 4 | pre_align_cls: configs.tts.libritts.pre_align.LibrittsPreAlign 5 | binarization_args: 6 | shuffle: true 7 | use_spk_id: true 8 | test_num: 200 9 | num_spk: 2320 10 | pitch_type: frame 11 | min_frames: 128 12 | num_test_samples: 30 13 | mel_loss: "ssim:0.5|l1:0.5" 14 | vocoder_ckpt: '' -------------------------------------------------------------------------------- /NeuralSeq/configs/tts/libritts/fs2.yaml: -------------------------------------------------------------------------------- 1 | base_config: 2 | - configs/tts/fs2.yaml 3 | - ./base_text2mel.yaml 4 | -------------------------------------------------------------------------------- /NeuralSeq/configs/tts/libritts/pre_align.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from data_gen.tts.base_preprocess import BasePreprocessor 4 | import glob 5 | 6 | 7 | class LibrittsPreAlign(BasePreprocessor): 8 | def meta_data(self): 9 | wav_fns = sorted(glob.glob(f'{self.raw_data_dir}/*/*/*.wav')) 10 | for wav_fn in wav_fns: 11 | item_name = os.path.basename(wav_fn)[:-4] 12 | txt_fn = f'{wav_fn[:-4]}.normalized.txt' 13 | with open(txt_fn, 'r') as f: 14 | txt = f.readlines() 15 | f.close() 16 | spk = item_name.split("_")[0] 17 | # Example: 18 | # 19 | # 'item_name': '103_1241_000000_000001' 20 | # 'wav_fn': 'LibriTTS/train-clean-100/103/1241/103_1241_000000_000001.wav' 21 | # 'txt': 'matthew Cuthbert is surprised' 22 | # 'spk_name': '103' 23 | yield {'item_name': item_name, 'wav_fn': wav_fn, 'txt': txt[0], 'spk_name': spk} 24 | 25 | 26 | if __name__ == "__main__": 27 | LibrittsPreAlign().process() 28 | -------------------------------------------------------------------------------- /NeuralSeq/configs/tts/libritts/pwg.yaml: -------------------------------------------------------------------------------- 1 | base_config: egs/egs_bases/tts/vocoder/pwg.yaml 2 | raw_data_dir: 'data/raw/LibriTTS' 3 | processed_data_dir: 'data/processed/libritts' 4 | binary_data_dir: 'data/binary/libritts_wav' 5 | generator_params: 6 | kernel_size: 5 7 | num_spk: 400 8 | max_samples: 20480 9 | -------------------------------------------------------------------------------- /NeuralSeq/configs/tts/lj/base_mel2wav.yaml: -------------------------------------------------------------------------------- 1 | raw_data_dir: 'data/raw/LJSpeech-1.1' 2 | processed_data_dir: 'data/processed/ljspeech' 3 | binary_data_dir: 'data/binary/ljspeech_wav' 4 | -------------------------------------------------------------------------------- /NeuralSeq/configs/tts/lj/base_text2mel.yaml: -------------------------------------------------------------------------------- 1 | raw_data_dir: 'data/raw/LJSpeech-1.1' 2 | processed_data_dir: 'data/processed/ljspeech' 3 | binary_data_dir: 'data/binary/ljspeech' 4 | pre_align_cls: data_gen.tts.lj.pre_align.LJPreAlign 5 | 6 | pitch_type: cwt 7 | mel_loss: l1 8 | num_test_samples: 20 9 | test_ids: [ 68, 70, 74, 87, 110, 172, 190, 215, 231, 294, 10 | 316, 324, 402, 422, 485, 500, 505, 508, 509, 519 ] 11 | use_energy_embed: false 12 | test_num: 523 13 | valid_num: 348 -------------------------------------------------------------------------------- 
/NeuralSeq/configs/tts/lj/fs2.yaml: -------------------------------------------------------------------------------- 1 | base_config: 2 | - configs/tts/fs2.yaml 3 | - configs/tts/lj/base_text2mel.yaml -------------------------------------------------------------------------------- /NeuralSeq/configs/tts/lj/hifigan.yaml: -------------------------------------------------------------------------------- 1 | base_config: 2 | - configs/tts/hifigan.yaml 3 | - configs/tts/lj/base_mel2wav.yaml -------------------------------------------------------------------------------- /NeuralSeq/configs/tts/lj/pwg.yaml: -------------------------------------------------------------------------------- 1 | base_config: 2 | - configs/tts/pwg.yaml 3 | - configs/tts/lj/base_mel2wav.yaml -------------------------------------------------------------------------------- /NeuralSeq/data_gen/tts/binarizer_zh.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | os.environ["OMP_NUM_THREADS"] = "1" 4 | 5 | from data_gen.tts.txt_processors.zh_g2pM import ALL_SHENMU 6 | from data_gen.tts.base_binarizer import BaseBinarizer, BinarizationError 7 | from data_gen.tts.data_gen_utils import get_mel2ph 8 | from utils.hparams import set_hparams, hparams 9 | import numpy as np 10 | 11 | 12 | class ZhBinarizer(BaseBinarizer): 13 | @staticmethod 14 | def get_align(tg_fn, ph, mel, phone_encoded, res): 15 | if tg_fn is not None and os.path.exists(tg_fn): 16 | _, dur = get_mel2ph(tg_fn, ph, mel, hparams) 17 | else: 18 | raise BinarizationError(f"Align not found") 19 | ph_list = ph.split(" ") 20 | assert len(dur) == len(ph_list) 21 | mel2ph = [] 22 | # 分隔符的时长分配给韵母 23 | dur_cumsum = np.pad(np.cumsum(dur), [1, 0], mode='constant', constant_values=0) 24 | for i in range(len(dur)): 25 | p = ph_list[i] 26 | if p[0] != '<' and not p[0].isalpha(): 27 | uv_ = res['f0'][dur_cumsum[i]:dur_cumsum[i + 1]] == 0 28 | j = 0 29 | while j < len(uv_) and not uv_[j]: 30 | j += 1 31 | dur[i - 1] += j 32 | dur[i] -= j 33 | if dur[i] < 100: 34 | dur[i - 1] += dur[i] 35 | dur[i] = 0 36 | # 声母和韵母等长 37 | for i in range(len(dur)): 38 | p = ph_list[i] 39 | if p in ALL_SHENMU: 40 | p_next = ph_list[i + 1] 41 | if not (dur[i] > 0 and p_next[0].isalpha() and p_next not in ALL_SHENMU): 42 | print(f"assert dur[i] > 0 and p_next[0].isalpha() and p_next not in ALL_SHENMU, " 43 | f"dur[i]: {dur[i]}, p: {p}, p_next: {p_next}.") 44 | continue 45 | total = dur[i + 1] + dur[i] 46 | dur[i] = total // 2 47 | dur[i + 1] = total - dur[i] 48 | for i in range(len(dur)): 49 | mel2ph += [i + 1] * dur[i] 50 | mel2ph = np.array(mel2ph) 51 | if mel2ph.max() - 1 >= len(phone_encoded): 52 | raise BinarizationError(f"| Align does not match: {(mel2ph.max() - 1, len(phone_encoded))}") 53 | res['mel2ph'] = mel2ph 54 | res['dur'] = dur 55 | 56 | 57 | if __name__ == "__main__": 58 | set_hparams() 59 | ZhBinarizer().process() 60 | -------------------------------------------------------------------------------- /NeuralSeq/data_gen/tts/emotion/model.py: -------------------------------------------------------------------------------- 1 | 2 | from data_gen.tts.emotion.params_model import * 3 | from data_gen.tts.emotion.params_data import * 4 | from torch.nn.utils import clip_grad_norm_ 5 | from scipy.optimize import brentq 6 | from torch import nn 7 | import numpy as np 8 | import torch 9 | 10 | 11 | class EmotionEncoder(nn.Module): 12 | def __init__(self, device, loss_device): 13 | super().__init__() 14 | self.loss_device = loss_device 15 | 16 | # 
Network defition 17 | self.lstm = nn.LSTM(input_size=mel_n_channels, 18 | hidden_size=model_hidden_size, 19 | num_layers=model_num_layers, 20 | batch_first=True).to(device) 21 | self.linear = nn.Linear(in_features=model_hidden_size, 22 | out_features=model_embedding_size).to(device) 23 | self.relu = torch.nn.ReLU().to(device) 24 | 25 | 26 | # Cosine similarity scaling (with fixed initial parameter values) 27 | self.similarity_weight = nn.Parameter(torch.tensor([10.])).to(loss_device) 28 | self.similarity_bias = nn.Parameter(torch.tensor([-5.])).to(loss_device) 29 | 30 | # Loss 31 | self.loss_fn = nn.CrossEntropyLoss().to(loss_device) 32 | 33 | def do_gradient_ops(self): 34 | # Gradient scale 35 | self.similarity_weight.grad *= 0.01 36 | self.similarity_bias.grad *= 0.01 37 | 38 | # Gradient clipping 39 | clip_grad_norm_(self.parameters(), 3, norm_type=2) 40 | 41 | def forward(self, utterances, hidden_init=None): 42 | """ 43 | Computes the embeddings of a batch of utterance spectrograms. 44 | 45 | :param utterances: batch of mel-scale filterbanks of same duration as a tensor of shape 46 | (batch_size, n_frames, n_channels) 47 | :param hidden_init: initial hidden state of the LSTM as a tensor of shape (num_layers, 48 | batch_size, hidden_size). Will default to a tensor of zeros if None. 49 | :return: the embeddings as a tensor of shape (batch_size, embedding_size) 50 | """ 51 | # Pass the input through the LSTM layers and retrieve all outputs, the final hidden state 52 | # and the final cell state. 53 | out, (hidden, cell) = self.lstm(utterances, hidden_init) 54 | 55 | # We take only the hidden state of the last layer 56 | embeds_raw = self.relu(self.linear(hidden[-1])) 57 | 58 | # L2-normalize it 59 | embeds = embeds_raw / torch.norm(embeds_raw, dim=1, keepdim=True) 60 | 61 | return embeds 62 | 63 | def inference(self, utterances, hidden_init=None): 64 | """ 65 | Computes the embeddings of a batch of utterance spectrograms. 66 | 67 | :param utterances: batch of mel-scale filterbanks of same duration as a tensor of shape 68 | (batch_size, n_frames, n_channels) 69 | :param hidden_init: initial hidden state of the LSTM as a tensor of shape (num_layers, 70 | batch_size, hidden_size). Will default to a tensor of zeros if None. 71 | :return: the embeddings as a tensor of shape (batch_size, embedding_size) 72 | """ 73 | # Pass the input through the LSTM layers and retrieve all outputs, the final hidden state 74 | # and the final cell state. 75 | 76 | out, (hidden, cell) = self.lstm(utterances, hidden_init) 77 | 78 | return hidden[-1] -------------------------------------------------------------------------------- /NeuralSeq/data_gen/tts/emotion/params_data.py: -------------------------------------------------------------------------------- 1 | 2 | ## Mel-filterbank 3 | mel_window_length = 25 # In milliseconds 4 | mel_window_step = 10 # In milliseconds 5 | mel_n_channels = 40 6 | 7 | 8 | ## Audio 9 | sampling_rate = 16000 10 | # Number of spectrogram frames in a partial utterance 11 | partials_n_frames = 160 # 1600 ms 12 | # Number of spectrogram frames at inference 13 | inference_n_frames = 80 # 800 ms 14 | 15 | 16 | ## Voice Activation Detection 17 | # Window size of the VAD. Must be either 10, 20 or 30 milliseconds. 18 | # This sets the granularity of the VAD. Should not need to be changed. 19 | vad_window_length = 30 # In milliseconds 20 | # Number of frames to average together when performing the moving average smoothing. 
21 | # The larger this value, the larger the VAD variations must be to not get smoothed out. 22 | vad_moving_average_width = 8 23 | # Maximum number of consecutive silent frames a segment can have. 24 | vad_max_silence_length = 6 25 | 26 | 27 | ## Audio volume normalization 28 | audio_norm_target_dBFS = -30 29 | 30 | -------------------------------------------------------------------------------- /NeuralSeq/data_gen/tts/emotion/params_model.py: -------------------------------------------------------------------------------- 1 | 2 | ## Model parameters 3 | model_hidden_size = 256 4 | model_embedding_size = 256 5 | model_num_layers = 3 6 | 7 | 8 | ## Training parameters 9 | learning_rate_init = 1e-4 10 | speakers_per_batch = 6 11 | utterances_per_speaker = 20 12 | -------------------------------------------------------------------------------- /NeuralSeq/data_gen/tts/txt_processors/__init__.py: -------------------------------------------------------------------------------- 1 | from . import en -------------------------------------------------------------------------------- /NeuralSeq/data_gen/tts/txt_processors/base_text_processor.py: -------------------------------------------------------------------------------- 1 | from data_gen.tts.data_gen_utils import is_sil_phoneme 2 | 3 | REGISTERED_TEXT_PROCESSORS = {} 4 | 5 | def register_txt_processors(name): 6 | def _f(cls): 7 | REGISTERED_TEXT_PROCESSORS[name] = cls 8 | return cls 9 | 10 | return _f 11 | 12 | 13 | def get_txt_processor_cls(name): 14 | return REGISTERED_TEXT_PROCESSORS.get(name, None) 15 | 16 | 17 | class BaseTxtProcessor: 18 | @staticmethod 19 | def sp_phonemes(): 20 | return ['|'] 21 | 22 | @classmethod 23 | def process(cls, txt, preprocess_args): 24 | raise NotImplementedError 25 | 26 | @classmethod 27 | def postprocess(cls, txt_struct, preprocess_args): 28 | # remove sil phoneme in head and tail 29 | while len(txt_struct) > 0 and is_sil_phoneme(txt_struct[0][0]): 30 | txt_struct = txt_struct[1:] 31 | while len(txt_struct) > 0 and is_sil_phoneme(txt_struct[-1][0]): 32 | txt_struct = txt_struct[:-1] 33 | if preprocess_args['with_phsep']: 34 | txt_struct = cls.add_bdr(txt_struct) 35 | if preprocess_args['add_eos_bos']: 36 | txt_struct = [["", [""]]] + txt_struct + [["", [""]]] 37 | return txt_struct 38 | 39 | @classmethod 40 | def add_bdr(cls, txt_struct): 41 | txt_struct_ = [] 42 | for i, ts in enumerate(txt_struct): 43 | txt_struct_.append(ts) 44 | if i != len(txt_struct) - 1 and \ 45 | not is_sil_phoneme(txt_struct[i][0]) and not is_sil_phoneme(txt_struct[i + 1][0]): 46 | txt_struct_.append(['|', ['|']]) 47 | return txt_struct_ -------------------------------------------------------------------------------- /NeuralSeq/data_gen/tts/txt_processors/en.py: -------------------------------------------------------------------------------- 1 | import re 2 | import unicodedata 3 | 4 | from g2p_en import G2p 5 | from g2p_en.expand import normalize_numbers 6 | from nltk import pos_tag 7 | from nltk.tokenize import TweetTokenizer 8 | 9 | from data_gen.tts.txt_processors.base_text_processor import BaseTxtProcessor, register_txt_processors 10 | from data_gen.tts.data_gen_utils import is_sil_phoneme, PUNCS 11 | 12 | class EnG2p(G2p): 13 | word_tokenize = TweetTokenizer().tokenize 14 | 15 | def __call__(self, text): 16 | # preprocessing 17 | words = EnG2p.word_tokenize(text) 18 | tokens = pos_tag(words) # tuples of (word, tag) 19 | 20 | # steps 21 | prons = [] 22 | for word, pos in tokens: 23 | if re.search("[a-z]", word) is None: 24 | pron 
= [word] 25 | 26 | elif word in self.homograph2features: # Check homograph 27 | pron1, pron2, pos1 = self.homograph2features[word] 28 | if pos.startswith(pos1): 29 | pron = pron1 30 | else: 31 | pron = pron2 32 | elif word in self.cmu: # lookup CMU dict 33 | pron = self.cmu[word][0] 34 | else: # predict for oov 35 | pron = self.predict(word) 36 | 37 | prons.extend(pron) 38 | prons.extend([" "]) 39 | 40 | return prons[:-1] 41 | 42 | 43 | @register_txt_processors('en') 44 | class TxtProcessor(BaseTxtProcessor): 45 | g2p = EnG2p() 46 | 47 | @staticmethod 48 | def preprocess_text(text): 49 | text = normalize_numbers(text) 50 | text = ''.join(char for char in unicodedata.normalize('NFD', text) 51 | if unicodedata.category(char) != 'Mn') # Strip accents 52 | text = text.lower() 53 | text = re.sub("[\'\"()]+", "", text) 54 | text = re.sub("[-]+", " ", text) 55 | text = re.sub(f"[^ a-z{PUNCS}]", "", text) 56 | text = re.sub(f" ?([{PUNCS}]) ?", r"\1", text) # !! -> ! 57 | text = re.sub(f"([{PUNCS}])+", r"\1", text) # !! -> ! 58 | text = text.replace("i.e.", "that is") 59 | text = text.replace("i.e.", "that is") 60 | text = text.replace("etc.", "etc") 61 | text = re.sub(f"([{PUNCS}])", r" \1 ", text) 62 | text = re.sub(rf"\s+", r" ", text) 63 | return text 64 | 65 | @classmethod 66 | def process(cls, txt, preprocess_args): 67 | txt = cls.preprocess_text(txt).strip() 68 | phs = cls.g2p(txt) 69 | txt_struct = [[w, []] for w in txt.split(" ")] 70 | i_word = 0 71 | for p in phs: 72 | if p == ' ': 73 | i_word += 1 74 | else: 75 | txt_struct[i_word][1].append(p) 76 | txt_struct = cls.postprocess(txt_struct, preprocess_args) 77 | return txt_struct, txt -------------------------------------------------------------------------------- /NeuralSeq/data_gen/tts/txt_processors/zh.py: -------------------------------------------------------------------------------- 1 | import re 2 | import jieba 3 | from pypinyin import pinyin, Style 4 | from data_gen.tts.data_gen_utils import PUNCS 5 | from data_gen.tts.txt_processors.base_text_processor import BaseTxtProcessor 6 | from utils.text_norm import NSWNormalizer 7 | 8 | 9 | class TxtProcessor(BaseTxtProcessor): 10 | table = {ord(f): ord(t) for f, t in zip( 11 | u':,。!?【】()%#@&1234567890', 12 | u':,.!?[]()%#@&1234567890')} 13 | 14 | @staticmethod 15 | def preprocess_text(text): 16 | text = text.translate(TxtProcessor.table) 17 | text = NSWNormalizer(text).normalize(remove_punc=False) 18 | text = re.sub("[\'\"()]+", "", text) 19 | text = re.sub("[-]+", " ", text) 20 | text = re.sub(f"[^ A-Za-z\u4e00-\u9fff{PUNCS}]", "", text) 21 | text = re.sub(f"([{PUNCS}])+", r"\1", text) # !! -> ! 
22 | text = re.sub(f"([{PUNCS}])", r" \1 ", text) 23 | text = re.sub(rf"\s+", r"", text) 24 | text = re.sub(rf"[A-Za-z]+", r"$", text) 25 | return text 26 | 27 | @classmethod 28 | def process(cls, txt, pre_align_args): 29 | txt = cls.preprocess_text(txt) 30 | shengmu = pinyin(txt, style=Style.INITIALS) # https://blog.csdn.net/zhoulei124/article/details/89055403 31 | yunmu_finals = pinyin(txt, style=Style.FINALS) 32 | yunmu_tone3 = pinyin(txt, style=Style.FINALS_TONE3) 33 | yunmu = [[t[0] + '5'] if t[0] == f[0] else t for f, t in zip(yunmu_finals, yunmu_tone3)] \ 34 | if pre_align_args['use_tone'] else yunmu_finals 35 | 36 | assert len(shengmu) == len(yunmu) 37 | phs = ["|"] 38 | for a, b, c in zip(shengmu, yunmu, yunmu_finals): 39 | if a[0] == c[0]: 40 | phs += [a[0], "|"] 41 | else: 42 | phs += [a[0], b[0], "|"] 43 | return phs, txt 44 | -------------------------------------------------------------------------------- /NeuralSeq/data_gen/tts/txt_processors/zh_g2pM.py: -------------------------------------------------------------------------------- 1 | import re 2 | import jieba 3 | from pypinyin import pinyin, Style 4 | from data_gen.tts.data_gen_utils import PUNCS 5 | from data_gen.tts.txt_processors import zh 6 | from g2pM import G2pM 7 | 8 | ALL_SHENMU = ['zh', 'ch', 'sh', 'b', 'p', 'm', 'f', 'd', 't', 'n', 'l', 'g', 'k', 'h', 'j', 9 | 'q', 'x', 'r', 'z', 'c', 's', 'y', 'w'] 10 | ALL_YUNMU = ['a', 'ai', 'an', 'ang', 'ao', 'e', 'ei', 'en', 'eng', 'er', 'i', 'ia', 'ian', 11 | 'iang', 'iao', 'ie', 'in', 'ing', 'iong', 'iu', 'ng', 'o', 'ong', 'ou', 12 | 'u', 'ua', 'uai', 'uan', 'uang', 'ui', 'un', 'uo', 'v', 'van', 've', 'vn'] 13 | 14 | 15 | class TxtProcessor(zh.TxtProcessor): 16 | model = G2pM() 17 | 18 | @staticmethod 19 | def sp_phonemes(): 20 | return ['|', '#'] 21 | 22 | @classmethod 23 | def process(cls, txt, pre_align_args): 24 | txt = cls.preprocess_text(txt) 25 | ph_list = cls.model(txt, tone=pre_align_args['use_tone'], char_split=True) 26 | seg_list = '#'.join(jieba.cut(txt)) 27 | assert len(ph_list) == len([s for s in seg_list if s != '#']), (ph_list, seg_list) 28 | 29 | # 加入词边界'#' 30 | ph_list_ = [] 31 | seg_idx = 0 32 | for p in ph_list: 33 | p = p.replace("u:", "v") 34 | if seg_list[seg_idx] == '#': 35 | ph_list_.append('#') 36 | seg_idx += 1 37 | else: 38 | ph_list_.append("|") 39 | seg_idx += 1 40 | if re.findall('[\u4e00-\u9fff]', p): 41 | if pre_align_args['use_tone']: 42 | p = pinyin(p, style=Style.TONE3, strict=True)[0][0] 43 | if p[-1] not in ['1', '2', '3', '4', '5']: 44 | p = p + '5' 45 | else: 46 | p = pinyin(p, style=Style.NORMAL, strict=True)[0][0] 47 | 48 | finished = False 49 | if len([c.isalpha() for c in p]) > 1: 50 | for shenmu in ALL_SHENMU: 51 | if p.startswith(shenmu) and not p.lstrip(shenmu).isnumeric(): 52 | ph_list_ += [shenmu, p.lstrip(shenmu)] 53 | finished = True 54 | break 55 | if not finished: 56 | ph_list_.append(p) 57 | 58 | ph_list = ph_list_ 59 | 60 | # 去除静音符号周围的词边界标记 [..., '#', ',', '#', ...] 
61 | sil_phonemes = list(PUNCS) + TxtProcessor.sp_phonemes() 62 | ph_list_ = [] 63 | for i in range(0, len(ph_list), 1): 64 | if ph_list[i] != '#' or (ph_list[i - 1] not in sil_phonemes and ph_list[i + 1] not in sil_phonemes): 65 | ph_list_.append(ph_list[i]) 66 | ph_list = ph_list_ 67 | return ph_list, txt 68 | 69 | 70 | if __name__ == '__main__': 71 | phs, txt = TxtProcessor.process('他来到了,网易杭研大厦', {'use_tone': True}) 72 | print(phs) 73 | -------------------------------------------------------------------------------- /NeuralSeq/data_gen/tts/wav_processors/__init__.py: -------------------------------------------------------------------------------- 1 | from . import base_processor 2 | from . import common_processors 3 | -------------------------------------------------------------------------------- /NeuralSeq/data_gen/tts/wav_processors/base_processor.py: -------------------------------------------------------------------------------- 1 | REGISTERED_WAV_PROCESSORS = {} 2 | 3 | 4 | def register_wav_processors(name): 5 | def _f(cls): 6 | REGISTERED_WAV_PROCESSORS[name] = cls 7 | return cls 8 | 9 | return _f 10 | 11 | 12 | def get_wav_processor_cls(name): 13 | return REGISTERED_WAV_PROCESSORS.get(name, None) 14 | 15 | 16 | class BaseWavProcessor: 17 | @property 18 | def name(self): 19 | raise NotImplementedError 20 | 21 | def output_fn(self, input_fn): 22 | return f'{input_fn[:-4]}_{self.name}.wav' 23 | 24 | def process(self, input_fn, sr, tmp_dir, processed_dir, item_name, preprocess_args): 25 | raise NotImplementedError 26 | -------------------------------------------------------------------------------- /NeuralSeq/data_gen/tts/wav_processors/common_processors.py: -------------------------------------------------------------------------------- 1 | import os 2 | import subprocess 3 | import librosa 4 | import numpy as np 5 | from data_gen.tts.wav_processors.base_processor import BaseWavProcessor, register_wav_processors 6 | from data_gen.tts.data_gen_utils import trim_long_silences 7 | from utils.audio import save_wav, rnnoise 8 | from utils.hparams import hparams 9 | 10 | 11 | @register_wav_processors(name='sox_to_wav') 12 | class ConvertToWavProcessor(BaseWavProcessor): 13 | @property 14 | def name(self): 15 | return 'ToWav' 16 | 17 | def process(self, input_fn, sr, tmp_dir, processed_dir, item_name, preprocess_args): 18 | if input_fn[-4:] == '.wav': 19 | return input_fn, sr 20 | else: 21 | output_fn = self.output_fn(input_fn) 22 | subprocess.check_call(f'sox -v 0.95 "{input_fn}" -t wav "{output_fn}"', shell=True) 23 | return output_fn, sr 24 | 25 | 26 | @register_wav_processors(name='sox_resample') 27 | class ResampleProcessor(BaseWavProcessor): 28 | @property 29 | def name(self): 30 | return 'Resample' 31 | 32 | def process(self, input_fn, sr, tmp_dir, processed_dir, item_name, preprocess_args): 33 | output_fn = self.output_fn(input_fn) 34 | sr_file = librosa.core.get_samplerate(input_fn) 35 | if sr != sr_file: 36 | subprocess.check_call(f'sox -v 0.95 "{input_fn}" -r{sr} "{output_fn}"', shell=True) 37 | y, _ = librosa.core.load(input_fn, sr=sr) 38 | y, _ = librosa.effects.trim(y) 39 | save_wav(y, output_fn, sr) 40 | return output_fn, sr 41 | else: 42 | return input_fn, sr 43 | 44 | 45 | @register_wav_processors(name='trim_sil') 46 | class TrimSILProcessor(BaseWavProcessor): 47 | @property 48 | def name(self): 49 | return 'TrimSIL' 50 | 51 | def process(self, input_fn, sr, tmp_dir, processed_dir, item_name, preprocess_args): 52 | output_fn = self.output_fn(input_fn) 53 | y, _ = 
librosa.core.load(input_fn, sr=sr) 54 | y, _ = librosa.effects.trim(y) 55 | save_wav(y, output_fn, sr) 56 | return output_fn 57 | 58 | 59 | @register_wav_processors(name='trim_all_sil') 60 | class TrimAllSILProcessor(BaseWavProcessor): 61 | @property 62 | def name(self): 63 | return 'TrimSIL' 64 | 65 | def process(self, input_fn, sr, tmp_dir, processed_dir, item_name, preprocess_args): 66 | output_fn = self.output_fn(input_fn) 67 | y, audio_mask, _ = trim_long_silences( 68 | input_fn, vad_max_silence_length=preprocess_args.get('vad_max_silence_length', 12)) 69 | save_wav(y, output_fn, sr) 70 | if preprocess_args['save_sil_mask']: 71 | os.makedirs(f'{processed_dir}/sil_mask', exist_ok=True) 72 | np.save(f'{processed_dir}/sil_mask/{item_name}.npy', audio_mask) 73 | return output_fn, sr 74 | 75 | 76 | @register_wav_processors(name='denoise') 77 | class DenoiseProcessor(BaseWavProcessor): 78 | @property 79 | def name(self): 80 | return 'Denoise' 81 | 82 | def process(self, input_fn, sr, tmp_dir, processed_dir, item_name, preprocess_args): 83 | output_fn = self.output_fn(input_fn) 84 | rnnoise(input_fn, output_fn, out_sample_rate=sr) 85 | return output_fn, sr 86 | -------------------------------------------------------------------------------- /NeuralSeq/egs/datasets/audio/emotion/base_text2mel.yaml: -------------------------------------------------------------------------------- 1 | raw_data_dir: 'data/raw/ESD' 2 | processed_data_dir: 'data/processed/emotion' 3 | binary_data_dir: 'data/binary/emotion' 4 | pre_align_cls: egs.datasets.audio.emotion.pre_align.EmoPreAlign 5 | audio_sample_rate: 16000 6 | binarization_args: 7 | shuffle: true 8 | binarizer_cls: data_gen.tts.base_binarizer_emotion.EmotionBinarizer 9 | use_spk_id: true 10 | test_num: 200 11 | num_spk: 10 12 | pitch_type: frame 13 | min_frames: 128 14 | num_test_samples: 30 15 | mel_loss: "ssim:0.5|l1:0.5" 16 | vocoder_ckpt: '' 17 | use_emotion: true -------------------------------------------------------------------------------- /NeuralSeq/egs/datasets/audio/emotion/pre_align.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from data_gen.tts.base_preprocess import BasePreprocessor 4 | import glob 5 | import re 6 | 7 | class EmoPreAlign(BasePreprocessor): 8 | 9 | def meta_data(self): 10 | spks = ['0012', '0011', '0013', '0014', '0015', '0016', '0017', '0018', '0019', '0020'] 11 | pattern = re.compile('[\t\n ]+') 12 | for spk in spks: 13 | for line in open(f"{self.raw_data_dir}/{spk}/{spk}.txt", 'r'): # 打开文件 14 | line = re.sub(pattern, ' ', line) 15 | if line == ' ': continue 16 | split_ = line.split(' ') 17 | txt = ' '.join(split_[1: -2]) 18 | item_name = split_[0] 19 | emotion = split_[-2] 20 | wav_fn = f'{self.raw_data_dir}/{spk}/{emotion}/{item_name}.wav' 21 | yield item_name, wav_fn, txt, spk, emotion 22 | 23 | 24 | if __name__ == "__main__": 25 | EmoPreAlign().process() 26 | -------------------------------------------------------------------------------- /NeuralSeq/egs/datasets/audio/libritts/base_text2mel.yaml: -------------------------------------------------------------------------------- 1 | raw_data_dir: 'data/raw/LibriTTS' 2 | processed_data_dir: 'data/processed/libritts' 3 | binary_data_dir: 'data/binary/libritts' 4 | pre_align_cls: egs.datasets.audio.libritts.pre_align.LibrittsPreAlign 5 | binarization_args: 6 | shuffle: true 7 | use_spk_id: true 8 | test_num: 200 9 | num_spk: 2320 10 | pitch_type: frame 11 | min_frames: 128 12 | num_test_samples: 30 13 | mel_loss: 
"ssim:0.5|l1:0.5" 14 | vocoder_ckpt: '' -------------------------------------------------------------------------------- /NeuralSeq/egs/datasets/audio/libritts/fs2.yaml: -------------------------------------------------------------------------------- 1 | base_config: 2 | - egs/egs_bases/tts/fs2.yaml 3 | - ./base_text2mel.yaml 4 | -------------------------------------------------------------------------------- /NeuralSeq/egs/datasets/audio/libritts/pre_align.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from data_gen.tts.base_preprocess import BasePreprocessor 4 | import glob 5 | 6 | 7 | class LibrittsPreAlign(BasePreprocessor): 8 | def meta_data(self): 9 | wav_fns = sorted(glob.glob(f'{self.raw_data_dir}/*/*/*.wav')) 10 | for wav_fn in wav_fns: 11 | item_name = os.path.basename(wav_fn)[:-4] 12 | txt_fn = f'{wav_fn[:-4]}.normalized.txt' 13 | with open(txt_fn, 'r') as f: 14 | txt = f.readlines() 15 | f.close() 16 | spk = item_name.split("_")[0] 17 | # Example: 18 | # 19 | # 'item_name': '103_1241_000000_000001' 20 | # 'wav_fn': 'LibriTTS/train-clean-100/103/1241/103_1241_000000_000001.wav' 21 | # 'txt': 'matthew Cuthbert is surprised' 22 | # 'spk_name': '103' 23 | yield {'item_name': item_name, 'wav_fn': wav_fn, 'txt': txt[0], 'spk_name': spk} 24 | 25 | 26 | if __name__ == "__main__": 27 | LibrittsPreAlign().process() 28 | -------------------------------------------------------------------------------- /NeuralSeq/egs/datasets/audio/libritts/pwg.yaml: -------------------------------------------------------------------------------- 1 | base_config: egs/egs_bases/tts/vocoder/pwg.yaml 2 | raw_data_dir: 'data/raw/LibriTTS' 3 | processed_data_dir: 'data/processed/libritts' 4 | binary_data_dir: 'data/binary/libritts_wav' 5 | generator_params: 6 | kernel_size: 5 7 | num_spk: 400 8 | max_samples: 20480 9 | -------------------------------------------------------------------------------- /NeuralSeq/egs/datasets/audio/lj/base_mel2wav.yaml: -------------------------------------------------------------------------------- 1 | raw_data_dir: 'data/raw/LJSpeech-1.1' 2 | processed_data_dir: 'data/processed/ljspeech' 3 | binary_data_dir: 'data/binary/ljspeech_wav' 4 | binarization_args: 5 | with_spk_embed: false -------------------------------------------------------------------------------- /NeuralSeq/egs/datasets/audio/lj/preprocess.py: -------------------------------------------------------------------------------- 1 | from data_gen.tts.base_preprocess import BasePreprocessor 2 | 3 | 4 | class LJPreprocess(BasePreprocessor): 5 | def meta_data(self): 6 | for l in open(f'{self.raw_data_dir}/metadata.csv').readlines(): 7 | item_name, _, txt = l.strip().split("|") 8 | wav_fn = f"{self.raw_data_dir}/wavs/{item_name}.wav" 9 | yield {'item_name': item_name, 'wav_fn': wav_fn, 'txt': txt} 10 | -------------------------------------------------------------------------------- /NeuralSeq/egs/datasets/audio/lj/pwg.yaml: -------------------------------------------------------------------------------- 1 | base_config: 2 | - egs/egs_bases/tts/vocoder/pwg.yaml 3 | - ./base_mel2wav.yaml -------------------------------------------------------------------------------- /NeuralSeq/egs/datasets/audio/vctk/base_mel2wav.yaml: -------------------------------------------------------------------------------- 1 | raw_data_dir: 'data/raw/VCTK-Corpus' 2 | processed_data_dir: 'data/processed/vctk' 3 | binary_data_dir: 'data/binary/vctk_wav' 4 | 
-------------------------------------------------------------------------------- /NeuralSeq/egs/datasets/audio/vctk/fs2.yaml: -------------------------------------------------------------------------------- 1 | base_config: 2 | - egs/egs_bases/tts/fs2.yaml 3 | raw_data_dir: 'data/raw/VCTK-Corpus' 4 | processed_data_dir: 'data/processed/vctk' 5 | binary_data_dir: 'data/binary/vctk' 6 | pre_align_cls: egs.datasets.audio.vctk.pre_align.VCTKPreAlign 7 | use_spk_id: true 8 | test_num: 200 9 | num_spk: 400 10 | binarization_args: 11 | shuffle: true 12 | trim_eos_bos: true -------------------------------------------------------------------------------- /NeuralSeq/egs/datasets/audio/vctk/pre_align.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from data_gen.tts.base_pre_align import BasePreAlign 4 | import glob 5 | 6 | 7 | class VCTKPreAlign(BasePreAlign): 8 | def meta_data(self): 9 | wav_fns = glob.glob(f'{self.raw_data_dir}/wav48/*/*.wav') 10 | for wav_fn in wav_fns: 11 | item_name = os.path.basename(wav_fn)[:-4] 12 | spk = item_name.split("_")[0] 13 | txt_fn = wav_fn.split("/") 14 | txt_fn[-1] = f'{item_name}.txt' 15 | txt_fn[-3] = f'txt' 16 | txt_fn = "/".join(txt_fn) 17 | if os.path.exists(txt_fn) and os.path.exists(wav_fn): 18 | yield item_name, wav_fn, (self.load_txt, txt_fn), spk 19 | 20 | 21 | if __name__ == "__main__": 22 | VCTKPreAlign().process() 23 | -------------------------------------------------------------------------------- /NeuralSeq/egs/datasets/audio/vctk/pwg.yaml: -------------------------------------------------------------------------------- 1 | base_config: 2 | - egs/egs_bases/tts/vocoder/pwg.yaml 3 | - ./base_mel2wav.yaml 4 | 5 | num_spk: 400 6 | max_samples: 20480 7 | -------------------------------------------------------------------------------- /NeuralSeq/egs/egs_bases/config_base.yaml: -------------------------------------------------------------------------------- 1 | # task 2 | binary_data_dir: '' 3 | work_dir: '' # experiment directory. 
4 | infer: false # inference 5 | amp: false 6 | seed: 1234 7 | debug: false 8 | save_codes: [] 9 | # - configs 10 | # - modules 11 | # - tasks 12 | # - utils 13 | # - usr 14 | 15 | ############# 16 | # dataset 17 | ############# 18 | ds_workers: 1 19 | test_num: 100 20 | endless_ds: false 21 | sort_by_len: true 22 | 23 | ######### 24 | # train and eval 25 | ######### 26 | print_nan_grads: false 27 | load_ckpt: '' 28 | save_best: true 29 | num_ckpt_keep: 3 30 | clip_grad_norm: 0 31 | accumulate_grad_batches: 1 32 | tb_log_interval: 100 33 | num_sanity_val_steps: 5 # steps of validation at the beginning 34 | check_val_every_n_epoch: 10 35 | val_check_interval: 2000 36 | valid_monitor_key: 'val_loss' 37 | valid_monitor_mode: 'min' 38 | max_epochs: 1000 39 | max_updates: 1000000 40 | max_tokens: 31250 41 | max_sentences: 100000 42 | max_valid_tokens: -1 43 | max_valid_sentences: -1 44 | test_input_dir: '' 45 | resume_from_checkpoint: 0 46 | rename_tmux: true -------------------------------------------------------------------------------- /NeuralSeq/egs/egs_bases/svs/base.yaml: -------------------------------------------------------------------------------- 1 | task_cls: tasks.svs.task.DiffFsTask 2 | pitch_type: frame 3 | timesteps: 100 4 | dilation_cycle_length: 1 5 | residual_layers: 20 6 | residual_channels: 256 7 | lr: 0.001 8 | decay_steps: 50000 9 | keep_bins: 80 10 | spec_min: [ ] 11 | spec_max: [ ] 12 | 13 | content_cond_steps: [ ] # [ 0, 10000 ] 14 | spk_cond_steps: [ ] # [ 0, 10000 ] 15 | # train and eval 16 | fs2_ckpt: '' 17 | max_updates: 400000 18 | # max_updates: 200000 19 | use_gt_dur: true 20 | use_gt_f0: true 21 | gen_tgt_spk_id: -1 22 | max_sentences: 48 23 | num_sanity_val_steps: 1 24 | num_valid_plots: 1 25 | -------------------------------------------------------------------------------- /NeuralSeq/egs/egs_bases/svs/lj_ds_beta6.yaml: -------------------------------------------------------------------------------- 1 | base_config: 2 | - configs/tts/lj/fs2.yaml 3 | - ./base.yaml 4 | # spec_min and spec_max are calculated on the training set. 
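# (In DiffSinger-style diffusion decoders these per-bin statistics are typically used to squash the
# mel-spectrogram into [-1, 1] before denoising, roughly x_norm = (x - spec_min) / (spec_max - spec_min) * 2 - 1,
# with the inverse mapping applied after sampling.)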
5 | spec_min: [ -4.7574, -4.6783, -4.6431, -4.5832, -4.5390, -4.6771, -4.8089, -4.7672, 6 | -4.5784, -4.7755, -4.7150, -4.8919, -4.8271, -4.7389, -4.6047, -4.7759, 7 | -4.6799, -4.8201, -4.7823, -4.8262, -4.7857, -4.7545, -4.9358, -4.9733, 8 | -5.1134, -5.1395, -4.9016, -4.8434, -5.0189, -4.8460, -5.0529, -4.9510, 9 | -5.0217, -5.0049, -5.1831, -5.1445, -5.1015, -5.0281, -4.9887, -4.9916, 10 | -4.9785, -4.9071, -4.9488, -5.0342, -4.9332, -5.0650, -4.8924, -5.0875, 11 | -5.0483, -5.0848, -5.1809, -5.0677, -5.0015, -5.0792, -5.0636, -5.2413, 12 | -5.1421, -5.1710, -5.3256, -5.0511, -5.1186, -5.0057, -5.0446, -5.1173, 13 | -5.0325, -5.1085, -5.0053, -5.0755, -5.1176, -5.1004, -5.2153, -5.2757, 14 | -5.3025, -5.2867, -5.2918, -5.3328, -5.2731, -5.2985, -5.2400, -5.2211 ] 15 | spec_max: [ -0.5982, -0.0778, 0.1205, 0.2747, 0.4657, 0.5123, 0.5684, 0.7093, 16 | 0.6461, 0.6420, 0.7316, 0.7715, 0.7681, 0.8349, 0.7815, 0.7591, 17 | 0.7910, 0.7433, 0.7352, 0.6869, 0.6854, 0.6623, 0.5353, 0.6492, 18 | 0.6909, 0.6106, 0.5761, 0.5936, 0.5638, 0.4054, 0.4545, 0.3589, 19 | 0.3037, 0.3380, 0.1599, 0.2433, 0.2741, 0.2130, 0.1569, 0.1911, 20 | 0.2324, 0.1586, 0.1221, 0.0341, -0.0558, 0.0553, -0.1153, -0.0933, 21 | -0.1171, -0.0050, -0.1519, -0.1629, -0.0522, -0.0739, -0.2069, -0.2405, 22 | -0.1244, -0.2116, -0.1361, -0.1575, -0.1442, 0.0513, -0.1567, -0.2000, 23 | 0.0086, -0.0698, 0.1385, 0.0941, 0.1864, 0.1225, 0.2176, 0.2566, 24 | 0.1670, 0.1007, 0.1444, 0.0888, 0.1998, 0.2414, 0.2932, 0.3047 ] 25 | 26 | task_cls: tasks.svs.diffspeech_task.DiffSpeechTask 27 | vocoder: vocoders.hifigan.HifiGAN 28 | vocoder_ckpt: checkpoints/0414_hifi_lj_1 29 | num_valid_plots: 10 30 | use_gt_dur: false 31 | use_gt_f0: false 32 | pitch_type: cwt 33 | pitch_extractor: 'parselmouth' 34 | max_updates: 160000 35 | lr: 0.001 36 | timesteps: 100 37 | K_step: 71 38 | diff_loss_type: l1 39 | diff_decoder_type: 'wavenet' 40 | schedule_type: 'linear' 41 | max_beta: 0.06 42 | fs2_ckpt: checkpoints/fs2_lj_1/model_ckpt_steps_150000.ckpt 43 | save_gt: true -------------------------------------------------------------------------------- /NeuralSeq/egs/egs_bases/svs/midi/cascade/opencs/aux_rel.yaml: -------------------------------------------------------------------------------- 1 | base_config: 2 | - configs/singing/fs2.yaml 3 | - egs/egs_bases/svs/midi/cascade/opencs/opencpop_statis.yaml 4 | 5 | audio_sample_rate: 24000 6 | hop_size: 128 # Hop size. 7 | fft_size: 512 # FFT size. 8 | win_size: 512 # FFT size. 
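# (note: win_size is the STFT analysis window length in samples; it simply happens to equal fft_size (512)
# in this recipe, which is why it reuses the "FFT size" comment.)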
9 | fmin: 30 10 | fmax: 12000 11 | min_level_db: -120 12 | 13 | binarization_args: 14 | with_wav: true 15 | with_spk_embed: false 16 | with_align: true 17 | raw_data_dir: 'data/raw/opencpop/segments' 18 | processed_data_dir: 'xxx' 19 | binarizer_cls: data_gen.singing.binarize.OpencpopBinarizer 20 | 21 | 22 | binary_data_dir: 'data/binary/opencpop-midi-dp' 23 | use_midi: true # for midi exp 24 | use_gt_f0: false # for midi exp 25 | use_gt_dur: false # for further midi exp 26 | lambda_f0: 1.0 27 | lambda_uv: 1.0 28 | #lambda_energy: 0.1 29 | lambda_ph_dur: 1.0 30 | lambda_sent_dur: 1.0 31 | lambda_word_dur: 1.0 32 | predictor_grad: 0.1 33 | pe_enable: false 34 | pe_ckpt: '' 35 | 36 | num_spk: 1 37 | test_prefixes: [ 38 | '2044', 39 | '2086', 40 | '2092', 41 | '2093', 42 | '2100', 43 | ] 44 | 45 | task_cls: tasks.svs.diffsinger_task.AuxDecoderMIDITask 46 | #vocoder: tasks.svs.singingvocoder.highgan.HighGAN 47 | #vocoder_ckpt: checkpoints/h_2_model/checkpoint-530000steps.pkl 48 | vocoder: vocoders.hifigan.HifiGAN 49 | vocoder_ckpt: checkpoints/0109_hifigan_bigpopcs_hop128 50 | 51 | use_nsf: true 52 | 53 | # config for experiments 54 | max_frames: 5000 55 | max_tokens: 40000 56 | predictor_layers: 5 57 | rel_pos: true 58 | dur_predictor_layers: 5 # * 59 | 60 | use_spk_embed: false 61 | num_valid_plots: 10 62 | max_updates: 160000 63 | save_gt: true -------------------------------------------------------------------------------- /NeuralSeq/egs/egs_bases/svs/midi/cascade/opencs/ds60_rel.yaml: -------------------------------------------------------------------------------- 1 | base_config: 2 | - egs/egs_bases/svs/popcs_ds_beta6.yaml 3 | - egs/egs_bases/svs/midi/cascade/opencs/opencpop_statis.yaml 4 | 5 | binarizer_cls: data_gen.singing.binarize.OpencpopBinarizer 6 | binary_data_dir: 'data/binary/opencpop-midi-dp' 7 | 8 | #switch_midi2f0_step: 174000 9 | use_midi: true # for midi exp 10 | use_gt_f0: false # for midi exp 11 | use_gt_dur: false # for further midi exp 12 | lambda_f0: 1.0 13 | lambda_uv: 1.0 14 | #lambda_energy: 0.1 15 | lambda_ph_dur: 1.0 16 | lambda_sent_dur: 1.0 17 | lambda_word_dur: 1.0 18 | predictor_grad: 0.1 19 | pe_enable: false 20 | pe_ckpt: '' 21 | 22 | fs2_ckpt: 'checkpoints/0302_opencpop_fs_midi/model_ckpt_steps_160000.ckpt' # 23 | #num_valid_plots: 0 24 | task_cls: tasks.svs.diffsinger_task.DiffSingerMIDITask 25 | 26 | K_step: 60 27 | max_tokens: 36000 28 | predictor_layers: 5 29 | dilation_cycle_length: 4 # * 30 | rel_pos: true 31 | dur_predictor_layers: 5 # * 32 | max_updates: 160000 33 | gaussian_start: false 34 | mask_uv_prob: 0.15 -------------------------------------------------------------------------------- /NeuralSeq/egs/egs_bases/svs/midi/cascade/opencs/opencpop_statis.yaml: -------------------------------------------------------------------------------- 1 | spec_min: [-6., -6., -6., -6., -6., -6., -6., -6., -6., -6., -6., -6., 2 | -6., -6., -6., -6., -6., -6., -6., -6., -6., -6., -6., -6., 3 | -6., -6., -6., -6., -6., -6., -6., -6., -6., -6., -6., -6., 4 | -6., -6., -6., -6., -6., -6., -6., -6., -6., -6., -6., -6., 5 | -6., -6., -6., -6., -6., -6., -6., -6., -6., -6., -6., -6., 6 | -6., -6., -6., -6., -6., -6., -6., -6., -6., -6., -6., -6., 7 | -6., -6., -6., -6., -6., -6., -6., -6.] 
8 | spec_max: [-7.9453e-01, -8.1116e-01, -6.1631e-01, -3.0679e-01, -1.3863e-01, 9 | -5.0652e-02, -1.1563e-01, -1.0679e-01, -9.1068e-02, -6.2174e-02, 10 | -7.5302e-02, -7.2217e-02, -6.3815e-02, -7.3299e-02, 7.3610e-03, 11 | -7.2508e-02, -5.0234e-02, -1.6534e-01, -2.6928e-01, -2.0782e-01, 12 | -2.0823e-01, -1.1702e-01, -7.0128e-02, -6.5868e-02, -1.2675e-02, 13 | 1.5121e-03, -8.9902e-02, -2.1392e-01, -2.3789e-01, -2.8922e-01, 14 | -3.0405e-01, -2.3029e-01, -2.2088e-01, -2.1542e-01, -2.9367e-01, 15 | -3.0137e-01, -3.8281e-01, -4.3590e-01, -2.8681e-01, -4.6855e-01, 16 | -5.7485e-01, -4.7022e-01, -5.4266e-01, -4.4848e-01, -6.4120e-01, 17 | -6.8700e-01, -6.4860e-01, -7.6436e-01, -4.9971e-01, -7.1068e-01, 18 | -6.9724e-01, -6.1487e-01, -5.5843e-01, -6.9773e-01, -5.7502e-01, 19 | -7.0919e-01, -8.2431e-01, -8.4213e-01, -9.0431e-01, -8.2840e-01, 20 | -7.7945e-01, -8.2758e-01, -8.7699e-01, -1.0532e+00, -1.0766e+00, 21 | -1.1198e+00, -1.0185e+00, -9.8983e-01, -1.0001e+00, -1.0756e+00, 22 | -1.0024e+00, -1.0304e+00, -1.0579e+00, -1.0188e+00, -1.0500e+00, 23 | -1.0842e+00, -1.0923e+00, -1.1223e+00, -1.2381e+00, -1.6467e+00] 24 | 25 | mel_vmin: -6. #-6. 26 | mel_vmax: 1.5 27 | wav2spec_eps: 1e-6 28 | 29 | raw_data_dir: 'data/raw/opencpop/segments' 30 | processed_data_dir: 'xxx' 31 | binary_data_dir: 'data/binary/opencpop-midi-dp' 32 | datasets: [ 33 | 'opencpop', 34 | ] 35 | test_prefixes: [ 36 | '2044', 37 | '2086', 38 | '2092', 39 | '2093', 40 | '2100', 41 | ] 42 | -------------------------------------------------------------------------------- /NeuralSeq/egs/egs_bases/svs/midi/e2e/opencpop/ds1000-10dil.yaml: -------------------------------------------------------------------------------- 1 | base_config: 2 | - egs/egs_bases/svs/popcs_ds_beta6.yaml 3 | - egs/egs_bases/svs/midi/cascade/opencs/opencpop_statis.yaml 4 | 5 | binarizer_cls: data_gen.singing.binarize.OpencpopBinarizer 6 | binary_data_dir: 'data/binary/opencpop-midi-dp' 7 | 8 | #switch_midi2f0_step: 174000 9 | use_midi: true # for midi exp 10 | use_gt_dur: false # for further midi exp 11 | lambda_ph_dur: 1.0 12 | lambda_sent_dur: 1.0 13 | lambda_word_dur: 1.0 14 | predictor_grad: 0.1 15 | dur_predictor_layers: 5 # * 16 | 17 | 18 | fs2_ckpt: '' # 19 | #num_valid_plots: 0 20 | task_cls: tasks.svs.diffsinger_task.DiffSingerMIDITask 21 | 22 | timesteps: 1000 23 | K_step: 1000 24 | max_beta: 0.02 25 | max_tokens: 36000 26 | max_updates: 320000 27 | gaussian_start: True 28 | 29 | use_pitch_embed: false 30 | use_gt_f0: false # for midi exp 31 | 32 | lambda_f0: 0. 33 | lambda_uv: 0. 
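# (note: with use_pitch_embed disabled above, the f0/uv losses are zeroed out; F0 for vocoding is instead
# expected to come from the pretrained pitch extractor configured below via pe_enable / pe_ckpt.)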
34 | dilation_cycle_length: 10 # * 35 | rel_pos: true 36 | predictor_layers: 5 37 | pe_enable: true 38 | pe_ckpt: 'checkpoints/0102_xiaoma_pe' 39 | -------------------------------------------------------------------------------- /NeuralSeq/egs/egs_bases/svs/midi/e2e/opencpop/ds1000.yaml: -------------------------------------------------------------------------------- 1 | base_config: 2 | - egs/egs_bases/svs/popcs_ds_beta6.yaml 3 | - egs/egs_bases/svs/midi/cascade/opencs/opencpop_statis.yaml 4 | 5 | binarizer_cls: data_gen.singing.binarize.OpencpopBinarizer 6 | binary_data_dir: 'data/binary/opencpop-midi-dp' 7 | 8 | #switch_midi2f0_step: 174000 9 | use_midi: true # for midi exp 10 | use_gt_dur: false # for further midi exp 11 | lambda_ph_dur: 1.0 12 | lambda_sent_dur: 1.0 13 | lambda_word_dur: 1.0 14 | predictor_grad: 0.1 15 | dur_predictor_layers: 5 # * 16 | 17 | 18 | fs2_ckpt: '' # 19 | #num_valid_plots: 0 20 | task_cls: tasks.svs.diffsinger_task.DiffSingerMIDITask 21 | 22 | # for diffusion schedule 23 | timesteps: 1000 24 | K_step: 1000 25 | max_beta: 0.02 26 | max_tokens: 36000 27 | max_updates: 320000 28 | gaussian_start: True 29 | pndm_speedup: 10 30 | 31 | use_pitch_embed: false 32 | use_gt_f0: false # for midi exp 33 | 34 | lambda_f0: 0. 35 | lambda_uv: 0. 36 | dilation_cycle_length: 4 # * 37 | rel_pos: true 38 | predictor_layers: 5 39 | pe_enable: true 40 | pe_ckpt: 'checkpoints/0102_xiaoma_pe' 41 | 42 | 43 | -------------------------------------------------------------------------------- /NeuralSeq/egs/egs_bases/svs/midi/e2e/opencpop/ds100_adj_rel.yaml: -------------------------------------------------------------------------------- 1 | base_config: 2 | - egs/egs_bases/svs/popcs_ds_beta6.yaml 3 | - egs/egs_bases/svs/midi/cascade/opencs/opencpop_statis.yaml 4 | 5 | binarizer_cls: data_gen.singing.binarize.OpencpopBinarizer 6 | binary_data_dir: 'data/binary/opencpop-midi-dp' 7 | 8 | #switch_midi2f0_step: 174000 9 | use_midi: true # for midi exp 10 | use_gt_dur: false # for further midi exp 11 | lambda_ph_dur: 1.0 12 | lambda_sent_dur: 1.0 13 | lambda_word_dur: 1.0 14 | predictor_grad: 0.1 15 | dur_predictor_layers: 5 # * 16 | 17 | 18 | fs2_ckpt: '' # 19 | #num_valid_plots: 0 20 | task_cls: tasks.svs.diffsinger_task.DiffSingerMIDITask 21 | 22 | K_step: 100 23 | max_tokens: 36000 24 | max_updates: 160000 25 | gaussian_start: True 26 | 27 | use_pitch_embed: false 28 | use_gt_f0: false # for midi exp 29 | 30 | lambda_f0: 0. 31 | lambda_uv: 0. 
32 | dilation_cycle_length: 4 # * 33 | rel_pos: true 34 | predictor_layers: 5 35 | pe_enable: true 36 | pe_ckpt: 'checkpoints/0102_xiaoma_pe' 37 | -------------------------------------------------------------------------------- /NeuralSeq/egs/egs_bases/svs/midi/e2e/popcs/ds100_adj_rel.yaml: -------------------------------------------------------------------------------- 1 | base_config: 2 | - egs/egs_bases/svs/popcs_ds_beta6.yaml 3 | - egs/egs_bases/svs/midi/cascade/popcs/popcs_statis.yaml 4 | 5 | binarizer_cls: data_gen.singing.binarize.MidiSingingBinarizer 6 | binary_data_dir: 'data/binary/popcs-midi-dp' 7 | 8 | #switch_midi2f0_step: 174000 9 | use_midi: true # for midi exp 10 | use_gt_dur: false # for further midi exp 11 | lambda_ph_dur: 1.0 12 | lambda_sent_dur: 1.0 13 | lambda_word_dur: 1.0 14 | predictor_grad: 0.1 15 | dur_predictor_layers: 5 # * 16 | 17 | 18 | fs2_ckpt: '' # 19 | #num_valid_plots: 0 20 | task_cls: tasks.svs.diffsinger_task.DiffSingerMIDITask 21 | 22 | K_step: 100 23 | max_tokens: 40000 24 | max_updates: 160000 25 | gaussian_start: True 26 | 27 | use_pitch_embed: false 28 | use_gt_f0: false # for midi exp 29 | 30 | lambda_f0: 0. 31 | lambda_uv: 0. 32 | dilation_cycle_length: 4 # * 33 | rel_pos: true 34 | predictor_layers: 5 35 | pe_enable: true 36 | pe_ckpt: 'checkpoints/0102_xiaoma_pe' 37 | -------------------------------------------------------------------------------- /NeuralSeq/egs/egs_bases/svs/midi/pe.yaml: -------------------------------------------------------------------------------- 1 | base_config: 2 | - configs/tts/lj/fs2.yaml 3 | 4 | max_frames: 8000 5 | audio_sample_rate: 24000 6 | hop_size: 128 # Hop size. 7 | fft_size: 512 # FFT size. 8 | win_size: 512 # FFT size. 9 | fmin: 30 10 | fmax: 12000 11 | min_level_db: -120 12 | 13 | binary_data_dir: 'xxx' 14 | 15 | pitch_type: frame 16 | task_cls: tasks.tts.pe.PitchExtractionTask 17 | pitch_extractor_conv_layers: 2 18 | 19 | 20 | # config for experiments 21 | max_tokens: 20000 22 | use_spk_embed: false 23 | num_valid_plots: 10 24 | max_updates: 60000 -------------------------------------------------------------------------------- /NeuralSeq/egs/egs_bases/svs/popcs_ds_beta6.yaml: -------------------------------------------------------------------------------- 1 | base_config: 2 | - configs/tts/fs2.yaml 3 | - configs/singing/base.yaml 4 | - ./base.yaml 5 | 6 | audio_sample_rate: 24000 7 | hop_size: 128 # Hop size. 8 | fft_size: 512 # FFT size. 9 | win_size: 512 # FFT size. 
10 | fmin: 30 11 | fmax: 12000 12 | min_level_db: -120 13 | 14 | binarization_args: 15 | with_wav: true 16 | with_spk_embed: false 17 | with_align: true 18 | raw_data_dir: 'data/raw/popcs' 19 | processed_data_dir: 'data/processed/popcs' 20 | binary_data_dir: 'data/binary/popcs-pmf0' 21 | num_spk: 1 22 | datasets: [ 23 | 'popcs', 24 | ] 25 | test_prefixes: [ 26 | 'popcs-说散就散', 27 | 'popcs-隐形的翅膀', 28 | ] 29 | 30 | spec_min: [-6.8276, -7.0270, -6.8142, -7.1429, -7.6669, -7.6000, -7.1148, -6.9640, 31 | -6.8414, -6.6596, -6.6880, -6.7439, -6.7986, -7.4940, -7.7845, -7.6586, 32 | -6.9288, -6.7639, -6.9118, -6.8246, -6.7183, -7.1769, -6.9794, -7.4513, 33 | -7.3422, -7.5623, -6.9610, -6.8158, -6.9595, -6.8403, -6.5688, -6.6356, 34 | -7.0209, -6.5002, -6.7819, -6.5232, -6.6927, -6.5701, -6.5531, -6.7069, 35 | -6.6462, -6.4523, -6.5954, -6.4264, -6.4487, -6.7070, -6.4025, -6.3042, 36 | -6.4008, -6.3857, -6.3903, -6.3094, -6.2491, -6.3518, -6.3566, -6.4168, 37 | -6.2481, -6.3624, -6.2858, -6.2575, -6.3638, -6.4520, -6.1835, -6.2754, 38 | -6.1253, -6.1645, -6.0638, -6.1262, -6.0710, -6.1039, -6.4428, -6.1363, 39 | -6.1054, -6.1252, -6.1797, -6.0235, -6.0758, -5.9453, -6.0213, -6.0446] 40 | spec_max: [ 0.2645, 0.0583, -0.2344, -0.0184, 0.1227, 0.1533, 0.1103, 0.1212, 41 | 0.2421, 0.1809, 0.2134, 0.3161, 0.3301, 0.3289, 0.2667, 0.2421, 42 | 0.2581, 0.2600, 0.1394, 0.1907, 0.1082, 0.1474, 0.1680, 0.2550, 43 | 0.1057, 0.0826, 0.0423, 0.1203, -0.0701, -0.0056, 0.0477, -0.0639, 44 | -0.0272, -0.0728, -0.1648, -0.0855, -0.2652, -0.1998, -0.1547, -0.2167, 45 | -0.4181, -0.5463, -0.4161, -0.4733, -0.6518, -0.5387, -0.4290, -0.4191, 46 | -0.4151, -0.3042, -0.3810, -0.4160, -0.4496, -0.2847, -0.4676, -0.4658, 47 | -0.4931, -0.4885, -0.5547, -0.5481, -0.6948, -0.7968, -0.8455, -0.8392, 48 | -0.8770, -0.9520, -0.8749, -0.7297, -0.8374, -0.8667, -0.7157, -0.9035, 49 | -0.9219, -0.8801, -0.9298, -0.9009, -0.9604, -1.0537, -1.0781, -1.3766] 50 | 51 | task_cls: tasks.svs.diffsinger_task.DiffSingerTask 52 | #vocoder: tasks.svs.singingvocoder.highgan.HighGAN 53 | #vocoder_ckpt: checkpoints/h_2_model/checkpoint-530000steps.pkl 54 | vocoder: vocoders.hifigan.HifiGAN 55 | vocoder_ckpt: checkpoints/0109_hifigan_bigpopcs_hop128 56 | 57 | pitch_extractor: 'parselmouth' 58 | # config for experiments 59 | use_spk_embed: false 60 | num_valid_plots: 10 61 | max_updates: 160000 62 | lr: 0.001 63 | timesteps: 100 64 | K_step: 51 65 | diff_loss_type: l1 66 | diff_decoder_type: 'wavenet' 67 | schedule_type: 'linear' 68 | max_beta: 0.06 69 | fs2_ckpt: '' 70 | use_nsf: true -------------------------------------------------------------------------------- /NeuralSeq/egs/egs_bases/svs/popcs_ds_beta6_offline.yaml: -------------------------------------------------------------------------------- 1 | base_config: 2 | - ./popcs_ds_beta6.yaml 3 | 4 | fs2_ckpt: checkpoints/popcs_fs2_pmf0_1230/model_ckpt_steps_160000.ckpt # to be infer 5 | num_valid_plots: 0 6 | task_cls: tasks.svs.diffsinger_task.DiffSingerOfflineTask 7 | 8 | # tmp: 9 | #pe_enable: true 10 | #pe_ckpt: '' 11 | vocoder: vocoders.hifigan.HifiGAN 12 | vocoder_ckpt: checkpoints/0109_hifigan_bigpopcs_hop128 -------------------------------------------------------------------------------- /NeuralSeq/egs/egs_bases/svs/popcs_fs2.yaml: -------------------------------------------------------------------------------- 1 | base_config: 2 | - configs/singing/fs2.yaml 3 | 4 | audio_sample_rate: 24000 5 | hop_size: 128 # Hop size. 6 | fft_size: 512 # FFT size. 7 | win_size: 512 # FFT size. 
8 | fmin: 30 9 | fmax: 12000 10 | min_level_db: -120 11 | 12 | binarization_args: 13 | with_wav: true 14 | with_spk_embed: false 15 | with_align: true 16 | raw_data_dir: 'data/raw/popcs' 17 | processed_data_dir: 'data/processed/popcs' 18 | binary_data_dir: 'data/binary/popcs-pmf0' 19 | num_spk: 1 20 | datasets: [ 21 | 'popcs', 22 | ] 23 | test_prefixes: [ 24 | 'popcs-说散就散', 25 | 'popcs-隐形的翅膀', 26 | ] 27 | 28 | task_cls: tasks.tts.fs2.FastSpeech2Task 29 | #vocoder: tasks.svs.singingvocoder.highgan.HighGAN 30 | #vocoder_ckpt: checkpoints/h_2_model/checkpoint-530000steps.pkl 31 | vocoder: vocoders.hifigan.HifiGAN 32 | vocoder_ckpt: checkpoints/0109_hifigan_bigpopcs_hop128 33 | use_nsf: true 34 | 35 | # config for experiments 36 | max_tokens: 18000 37 | use_spk_embed: false 38 | num_valid_plots: 10 39 | max_updates: 160000 40 | save_gt: true 41 | 42 | # tmp: 43 | #pe_enable: true 44 | #pe_ckpt: '' -------------------------------------------------------------------------------- /NeuralSeq/egs/egs_bases/tts/base.yaml: -------------------------------------------------------------------------------- 1 | # task 2 | base_config: ../config_base.yaml 3 | task_cls: '' 4 | ############# 5 | # dataset 6 | ############# 7 | raw_data_dir: '' 8 | processed_data_dir: '' 9 | binary_data_dir: '' 10 | dict_dir: '' 11 | pre_align_cls: '' 12 | binarizer_cls: data_gen.tts.base_binarizer.BaseBinarizer 13 | pre_align_args: 14 | txt_processor: en 15 | use_tone: true # for ZH 16 | sox_resample: false 17 | sox_to_wav: false 18 | allow_no_txt: false 19 | trim_sil: false 20 | denoise: false 21 | binarization_args: 22 | shuffle: false 23 | with_txt: true 24 | with_wav: false 25 | with_align: true 26 | with_spk_embed: false 27 | with_spk_id: true 28 | with_f0: true 29 | with_f0cwt: false 30 | with_linear: false 31 | with_word: true 32 | trim_sil: false 33 | trim_eos_bos: false 34 | reset_phone_dict: true 35 | reset_word_dict: true 36 | word_size: 30000 37 | pitch_extractor: parselmouth 38 | 39 | loud_norm: false 40 | endless_ds: true 41 | 42 | test_num: 100 43 | min_frames: 0 44 | max_frames: 1548 45 | frames_multiple: 1 46 | max_input_tokens: 1550 47 | audio_num_mel_bins: 80 48 | audio_sample_rate: 22050 49 | hop_size: 256 # For 22050Hz, 275 ~= 12.5 ms (0.0125 * sample_rate) 50 | win_size: 1024 # For 22050Hz, 1100 ~= 50 ms (If None, win_size: fft_size) (0.05 * sample_rate) 51 | fmin: 80 # Set this to 55 if your speaker is male! if female, 95 should help taking off noise. (To test depending on dataset. Pitch info: male~[65, 260], female~[100, 525]) 52 | fmax: 7600 # To be increased/reduced depending on data. 
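# (note: at audio_sample_rate 22050, the actual hop_size 256 is about 11.6 ms and win_size 1024 about 46.4 ms;
# the 275- and 1100-sample figures quoted in the comments above are the exact 12.5 ms / 50 ms equivalents,
# not the values used here.)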
53 | fft_size: 1024 # Extra window size is filled with 0 paddings to match this parameter 54 | min_level_db: -100 55 | ref_level_db: 20 56 | griffin_lim_iters: 60 57 | num_spk: 1 58 | mel_vmin: -6 59 | mel_vmax: 1.5 60 | ds_workers: 1 61 | 62 | ######### 63 | # model 64 | ######### 65 | dropout: 0.1 66 | enc_layers: 4 67 | dec_layers: 4 68 | hidden_size: 256 69 | num_heads: 2 70 | enc_ffn_kernel_size: 9 71 | dec_ffn_kernel_size: 9 72 | ffn_act: gelu 73 | ffn_padding: 'SAME' 74 | use_spk_id: false 75 | use_split_spk_id: false 76 | use_spk_embed: false 77 | 78 | 79 | ########### 80 | # optimization 81 | ########### 82 | lr: 2.0 83 | scheduler: rsqrt # rsqrt|none 84 | warmup_updates: 8000 85 | optimizer_adam_beta1: 0.9 86 | optimizer_adam_beta2: 0.98 87 | weight_decay: 0 88 | clip_grad_norm: 1 89 | clip_grad_value: 0 90 | 91 | 92 | ########### 93 | # train and eval 94 | ########### 95 | max_tokens: 30000 96 | max_sentences: 100000 97 | max_valid_sentences: 1 98 | max_valid_tokens: 60000 99 | valid_infer_interval: 10000 100 | train_set_name: 'train' 101 | train_sets: '' 102 | valid_set_name: 'valid' 103 | test_set_name: 'test' 104 | num_test_samples: 0 105 | num_valid_plots: 10 106 | test_ids: [ ] 107 | vocoder_denoise_c: 0.0 108 | profile_infer: false 109 | out_wav_norm: false 110 | save_gt: true 111 | save_f0: false 112 | gen_dir_name: '' -------------------------------------------------------------------------------- /NeuralSeq/egs/egs_bases/tts/base_zh.yaml: -------------------------------------------------------------------------------- 1 | base_config: ./base.yaml 2 | preprocess_args: 3 | txt_processor: zh 4 | use_tone: true 5 | 6 | word_size: 3000 -------------------------------------------------------------------------------- /NeuralSeq/egs/egs_bases/tts/fs2.yaml: -------------------------------------------------------------------------------- 1 | base_config: ./base.yaml 2 | task_cls: tasks.tts.fs2.FastSpeech2Task 3 | 4 | # model 5 | hidden_size: 256 6 | dropout: 0.1 7 | encoder_type: fft # rel_fft|fft|tacotron|tacotron2|conformer 8 | decoder_type: fft # fft|rnn|conv|conformer|wn 9 | 10 | # rnn enc/dec 11 | encoder_K: 8 12 | decoder_rnn_dim: 0 # for rnn decoder, 0 -> hidden_size * 2 13 | 14 | # fft enc/dec 15 | use_pos_embed: true 16 | dec_num_heads: 2 17 | dec_layers: 4 18 | ffn_hidden_size: 1024 19 | enc_ffn_kernel_size: 9 20 | dec_ffn_kernel_size: 9 21 | 22 | # conv enc/dec 23 | enc_dec_norm: ln 24 | conv_use_pos: false 25 | layers_in_block: 2 26 | enc_dilations: [ 1, 1, 1, 1 ] 27 | enc_kernel_size: 5 28 | dec_dilations: [ 1, 1, 1, 1 ] # for conv decoder 29 | dec_kernel_size: 5 30 | dur_loss: mse # huber|mol 31 | 32 | # duration 33 | predictor_hidden: -1 34 | predictor_kernel: 5 35 | predictor_layers: 2 36 | dur_predictor_kernel: 3 37 | dur_predictor_layers: 2 38 | predictor_dropout: 0.5 39 | 40 | # pitch and energy 41 | pitch_norm: standard # standard|log 42 | use_pitch_embed: true 43 | pitch_type: frame # frame|ph|cwt 44 | use_uv: true 45 | cwt_hidden_size: 128 46 | cwt_layers: 2 47 | cwt_loss: l1 48 | cwt_add_f0_loss: false 49 | cwt_std_scale: 0.8 50 | 51 | pitch_ar: false 52 | pitch_embed_type: 0 53 | pitch_loss: 'l1' # l1|l2|ssim 54 | pitch_ssim_win: 11 55 | use_energy_embed: false 56 | 57 | # reference encoder and speaker embedding 58 | use_ref_enc: false 59 | use_var_enc: false 60 | lambda_commit: 0.25 61 | var_enc_vq_codes: 64 62 | ref_norm_layer: bn 63 | dec_inp_add_noise: false 64 | sil_add_noise: false 65 | ref_hidden_stride_kernel: 66 | - 0,3,5 # conv_hidden_size, 
conv_stride, conv_kernel_size. conv_hidden_size=0: use hidden_size 67 | - 0,3,5 68 | - 0,2,5 69 | - 0,2,5 70 | - 0,2,5 71 | pitch_enc_hidden_stride_kernel: 72 | - 0,2,5 # conv_hidden_size, conv_stride, conv_kernel_size. conv_hidden_size=0: use hidden_size 73 | - 0,2,5 74 | - 0,2,5 75 | dur_enc_hidden_stride_kernel: 76 | - 0,2,3 # conv_hidden_size, conv_stride, conv_kernel_size. conv_hidden_size=0: use hidden_size 77 | - 0,2,3 78 | - 0,1,3 79 | 80 | # mel 81 | mel_loss: l1:0.5|ssim:0.5 # l1|l2|gdl|ssim or l1:0.5|ssim:0.5 82 | 83 | # loss lambda 84 | lambda_f0: 1.0 85 | lambda_uv: 1.0 86 | lambda_energy: 0.1 87 | lambda_ph_dur: 0.1 88 | lambda_sent_dur: 1.0 89 | lambda_word_dur: 1.0 90 | predictor_grad: 0.1 91 | 92 | # train and eval 93 | pretrain_fs_ckpt: '' 94 | warmup_updates: 2000 95 | max_tokens: 32000 96 | max_sentences: 100000 97 | max_valid_sentences: 1 98 | max_updates: 120000 99 | use_gt_dur: false 100 | use_gt_f0: false 101 | ds_workers: 2 102 | lr: 1.0 103 | -------------------------------------------------------------------------------- /NeuralSeq/egs/egs_bases/tts/fs2_adv.yaml: -------------------------------------------------------------------------------- 1 | base_config: ./fs2.yaml 2 | task_cls: tasks.tts.fs2_adv.FastSpeech2AdvTask 3 | 4 | disc_win_num: 3 5 | disc_interval: 1 6 | disc_reduction: stack # stack|sum|none 7 | disc_start_steps: 0 8 | rerun_gen: false 9 | 10 | disc_norm: in 11 | mel_disc_hidden_size: 128 12 | 13 | # mel decoder 14 | mel_gan: true 15 | lambda_mel_adv: 0.1 16 | mel_hidden_size: 256 17 | 18 | # others 19 | dropout: 0.05 20 | pitch_embed_type: 0 21 | enc_ffn_kernel_size: 9 22 | dec_ffn_kernel_size: 9 23 | use_cond_disc: false 24 | 25 | optimizer_adam_beta1: 0.5 26 | optimizer_adam_beta2: 0.999 27 | generator_grad_norm: 5.0 # Generator's gradient norm. 28 | disc_hidden_size: 128 29 | disc_lr: 0.0001 # Discriminator's learning rate. 30 | discriminator_optimizer_params: 31 | eps: 1.0e-6 # Discriminator's epsilon. 32 | weight_decay: 0.0 # Discriminator's weight decay coefficient. 33 | discriminator_scheduler_params: 34 | step_size: 60000 # Discriminator's scheduler step size. 35 | gamma: 0.5 # D5iscriminator's scheduler gamma. 36 | # At each step size, lr will be multiplied by this parameter. 37 | discriminator_grad_norm: 1 # Discriminator's gradient norm. 
38 | 39 | max_updates: 400000 40 | max_tokens: 30000 41 | max_sentences: 80 42 | val_check_interval: 2000 43 | 44 | gen_dir_name: '' 45 | num_ckpt_keep: 2 46 | save_best: false 47 | 48 | -------------------------------------------------------------------------------- /NeuralSeq/egs/egs_bases/tts/ps.yaml: -------------------------------------------------------------------------------- 1 | base_config: ./fs2.yaml 2 | 3 | ########################### 4 | # models 5 | ########################### 6 | # encoders 7 | hidden_size: 192 8 | ffn_hidden_size: 768 9 | enc_ffn_kernel_size: 5 10 | enc_layers: 4 11 | dur_level: word 12 | encoder_type: rel_fft 13 | use_word_encoder: true 14 | 15 | # mix ling encoder 16 | word_enc_layers: 4 17 | word_encoder_type: rel_fft 18 | use_pitch_embed: false 19 | enc_prenet: true 20 | enc_pre_ln: true 21 | text_encoder_postnet: true 22 | dropout: 0.0 23 | add_word_pos: true 24 | 25 | # dur predictor 26 | dur_predictor_layers: 3 27 | dur_predictor_kernel: 5 28 | predictor_dropout: 0.2 29 | 30 | ## fvae 31 | use_fvae: true 32 | latent_size: 16 33 | fvae_encoder_type: conv 34 | fvae_decoder_type: conv 35 | fvae_enc_dec_hidden: 192 36 | fvae_kernel_size: 5 37 | fvae_enc_n_layers: 8 38 | fvae_dec_n_layers: 4 39 | fvae_strides: 4 40 | fvae_noise_scale: 1.0 41 | 42 | # prior flow 43 | use_prior_flow: true 44 | prior_flow_hidden: 64 45 | prior_flow_kernel_size: 3 46 | prior_flow_n_blocks: 4 47 | 48 | ########################### 49 | # training and inference 50 | ########################### 51 | lambda_kl: 1.0 52 | kl_min: 0.0 53 | lambda_sent_dur: 0.0 54 | kl_start_steps: 10000 55 | posterior_start_steps: 0 56 | frames_multiple: 4 57 | num_valid_plots: 10 58 | lr: 0.0002 59 | warmup_updates: 8000 60 | max_tokens: 40000 61 | valid_infer_interval: 10000 62 | max_sentences: 80 63 | max_updates: 480000 -------------------------------------------------------------------------------- /NeuralSeq/egs/egs_bases/tts/ps_flow.yaml: -------------------------------------------------------------------------------- 1 | base_config: ./ps2.yaml 2 | task_cls: tasks.tts.ps_flow.PortaSpeechFlowTask 3 | 4 | use_post_flow: true 5 | detach_postflow_input: true 6 | post_flow_lr: 0.001 7 | post_glow_hidden: 192 8 | post_glow_kernel_size: 3 9 | post_glow_n_blocks: 12 10 | post_glow_n_block_layers: 3 11 | post_share_cond_layers: false 12 | share_wn_layers: 4 13 | use_cond_proj: false 14 | use_latent_cond: false 15 | use_txt_cond: true 16 | sigmoid_scale: false 17 | post_glow_training_start: 160000 18 | noise_scale: 0.8 19 | infer_post_glow: true 20 | two_stage: true -------------------------------------------------------------------------------- /NeuralSeq/egs/egs_bases/tts/ps_flow_small.yaml: -------------------------------------------------------------------------------- 1 | base_config: ./ps_flow.yaml 2 | 3 | ########################### 4 | # models 5 | ########################### 6 | # encoders 7 | hidden_size: 128 8 | ffn_hidden_size: 512 9 | enc_ffn_kernel_size: 3 10 | enc_layers: 3 11 | word_enc_layers: 3 12 | 13 | # dur predictor 14 | dur_predictor_layers: 3 15 | dur_predictor_kernel: 5 16 | predictor_dropout: 0.2 17 | 18 | ## fvae 19 | use_fvae: true 20 | latent_size: 16 21 | fvae_encoder_type: wn 22 | fvae_decoder_type: wn 23 | fvae_enc_dec_hidden: 128 24 | fvae_kernel_size: 3 25 | fvae_enc_n_layers: 8 26 | fvae_dec_n_layers: 3 27 | fvae_strides: 4 28 | fvae_noise_scale: 1.0 29 | 30 | 31 | # prior flow 32 | use_prior_flow: true 33 | prior_flow_hidden: 32 34 | prior_flow_kernel_size: 3 35 | 
prior_flow_n_blocks: 3 36 | # post flow 37 | post_glow_hidden: 128 38 | post_glow_kernel_size: 3 39 | post_glow_n_blocks: 8 40 | post_glow_n_block_layers: 3 41 | share_wn_layers: 4 42 | noise_scale: 0.6 -------------------------------------------------------------------------------- /NeuralSeq/egs/egs_bases/tts/vocoder/base.yaml: -------------------------------------------------------------------------------- 1 | base_config: ../base.yaml 2 | binarization_args: 3 | with_wav: true 4 | with_spk_embed: false 5 | with_align: false 6 | with_word: false 7 | with_txt: false 8 | 9 | ########### 10 | # train and eval 11 | ########### 12 | max_samples: 25600 13 | max_sentences: 5 14 | max_valid_sentences: 1 15 | max_updates: 1000000 16 | val_check_interval: 2000 17 | 18 | ########################################################### 19 | # FEATURE EXTRACTION SETTING # 20 | ########################################################### 21 | fft_size: 1024 # FFT size. 22 | hop_size: 256 # Hop size. 23 | win_length: null # Window length. 24 | # If set to null, it will be the same as fft_size. 25 | window: "hann" # Window function. 26 | num_mels: 80 # Number of mel basis. 27 | fmin: 80 # Minimum freq in mel basis calculation. 28 | fmax: 7600 # Maximum frequency in mel basis calculation. 29 | aux_context_window: 0 # Context window size for auxiliary feature. 30 | use_pitch_embed: false 31 | 32 | generator_grad_norm: 10 # Generator's gradient norm. 33 | discriminator_grad_norm: 1 # Discriminator's gradient norm. 34 | disc_start_steps: 40000 # Number of steps to start to train discriminator. 35 | -------------------------------------------------------------------------------- /NeuralSeq/egs/egs_bases/tts/vocoder/hifigan.yaml: -------------------------------------------------------------------------------- 1 | base_config: ./base.yaml 2 | task_cls: tasks.vocoder.hifigan.HifiGanTask 3 | resblock: "1" 4 | adam_b1: 0.8 5 | adam_b2: 0.99 6 | upsample_rates: [ 8,8,2,2 ] 7 | upsample_kernel_sizes: [ 16,16,4,4 ] 8 | upsample_initial_channel: 512 9 | resblock_kernel_sizes: [ 3,7,11 ] 10 | resblock_dilation_sizes: [ [ 1,3,5 ], [ 1,3,5 ], [ 1,3,5 ] ] 11 | 12 | use_pitch_embed: false 13 | use_fm_loss: false 14 | use_ms_stft: false 15 | 16 | lambda_mel: 5.0 17 | lambda_mel_adv: 1.0 18 | lambda_cdisc: 4.0 19 | lambda_adv: 1.0 20 | 21 | lr: 0.0002 # Generator's learning rate. 
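# (note: the step_size / gamma pairs below are StepLR-style settings: the learning rate is multiplied by
# gamma once every step_size training steps, i.e. roughly lr * gamma ** (step // step_size).)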
22 | generator_scheduler_params: 23 | step_size: 600 24 | gamma: 0.999 25 | discriminator_scheduler_params: 26 | step_size: 600 27 | gamma: 0.999 28 | max_updates: 3000000 -------------------------------------------------------------------------------- /NeuralSeq/gitattributes: -------------------------------------------------------------------------------- 1 | *.7z filter=lfs diff=lfs merge=lfs -text 2 | *.arrow filter=lfs diff=lfs merge=lfs -text 3 | *.bin filter=lfs diff=lfs merge=lfs -text 4 | *.bz2 filter=lfs diff=lfs merge=lfs -text 5 | *.ftz filter=lfs diff=lfs merge=lfs -text 6 | *.gz filter=lfs diff=lfs merge=lfs -text 7 | *.h5 filter=lfs diff=lfs merge=lfs -text 8 | *.joblib filter=lfs diff=lfs merge=lfs -text 9 | *.lfs.* filter=lfs diff=lfs merge=lfs -text 10 | *.model filter=lfs diff=lfs merge=lfs -text 11 | *.msgpack filter=lfs diff=lfs merge=lfs -text 12 | *.npy filter=lfs diff=lfs merge=lfs -text 13 | *.npz filter=lfs diff=lfs merge=lfs -text 14 | *.onnx filter=lfs diff=lfs merge=lfs -text 15 | *.ot filter=lfs diff=lfs merge=lfs -text 16 | *.parquet filter=lfs diff=lfs merge=lfs -text 17 | *.pickle filter=lfs diff=lfs merge=lfs -text 18 | *.pkl filter=lfs diff=lfs merge=lfs -text 19 | *.pb filter=lfs diff=lfs merge=lfs -text 20 | *.pt filter=lfs diff=lfs merge=lfs -text 21 | *.pth filter=lfs diff=lfs merge=lfs -text 22 | *.rar filter=lfs diff=lfs merge=lfs -text 23 | saved_model/**/* filter=lfs diff=lfs merge=lfs -text 24 | *.tar.* filter=lfs diff=lfs merge=lfs -text 25 | *.tflite filter=lfs diff=lfs merge=lfs -text 26 | *.tgz filter=lfs diff=lfs merge=lfs -text 27 | *.wasm filter=lfs diff=lfs merge=lfs -text 28 | *.xz filter=lfs diff=lfs merge=lfs -text 29 | *.zip filter=lfs diff=lfs merge=lfs -text 30 | *.zstandard filter=lfs diff=lfs merge=lfs -text 31 | *tfevents* filter=lfs diff=lfs merge=lfs -text 32 | model_ckpt_steps* filter=lfs diff=lfs merge=lfs -text 33 | checkpoints/0831_opencpop_ds1000 filter=lfs diff=lfs merge=lfs -text 34 | -------------------------------------------------------------------------------- /NeuralSeq/inference/svs/ds_cascade.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from inference.svs.base_svs_infer import BaseSVSInfer 3 | from utils import load_ckpt 4 | from utils.hparams import hparams 5 | from modules.diff.shallow_diffusion_tts import GaussianDiffusion 6 | from tasks.svs.diffsinger_task import DIFF_DECODERS 7 | 8 | class DiffSingerCascadeInfer(BaseSVSInfer): 9 | def build_model(self): 10 | model = GaussianDiffusion( 11 | phone_encoder=self.ph_encoder, 12 | out_dims=hparams['audio_num_mel_bins'], denoise_fn=DIFF_DECODERS[hparams['diff_decoder_type']](hparams), 13 | timesteps=hparams['timesteps'], 14 | K_step=hparams['K_step'], 15 | loss_type=hparams['diff_loss_type'], 16 | spec_min=hparams['spec_min'], spec_max=hparams['spec_max'], 17 | ) 18 | model.eval() 19 | load_ckpt(model, hparams['work_dir'], 'model') 20 | return model 21 | 22 | def forward_model(self, inp): 23 | sample = self.input_to_batch(inp) 24 | txt_tokens = sample['txt_tokens'] # [B, T_t] 25 | spk_id = sample.get('spk_ids') 26 | with torch.no_grad(): 27 | output = self.model(txt_tokens, spk_id=spk_id, ref_mels=None, infer=True, 28 | pitch_midi=sample['pitch_midi'], midi_dur=sample['midi_dur'], 29 | is_slur=sample['is_slur']) 30 | mel_out = output['mel_out'] # [B, T, 80] 31 | f0_pred = output['f0_denorm'] 32 | wav_out = self.run_vocoder(mel_out, f0=f0_pred) 33 | wav_out = wav_out.cpu().numpy() 34 | return
wav_out[0] 35 | 36 | 37 | if __name__ == '__main__': 38 | inp = { 39 | 'text': '小酒窝长睫毛AP是你最美的记号', 40 | 'notes': 'C#4/Db4 | F#4/Gb4 | G#4/Ab4 | A#4/Bb4 F#4/Gb4 | F#4/Gb4 C#4/Db4 | C#4/Db4 | rest | C#4/Db4 | A#4/Bb4 | G#4/Ab4 | A#4/Bb4 | G#4/Ab4 | F4 | C#4/Db4', 41 | 'notes_duration': '0.407140 | 0.376190 | 0.242180 | 0.509550 0.183420 | 0.315400 0.235020 | 0.361660 | 0.223070 | 0.377270 | 0.340550 | 0.299620 | 0.344510 | 0.283770 | 0.323390 | 0.360340', 42 | 'input_type': 'word' 43 | } # user input: Chinese characters 44 | c = { 45 | 'text': '小酒窝长睫毛AP是你最美的记号', 46 | 'ph_seq': 'x iao j iu w o ch ang ang j ie ie m ao AP sh i n i z ui m ei d e j i h ao', 47 | 'note_seq': 'C#4/Db4 C#4/Db4 F#4/Gb4 F#4/Gb4 G#4/Ab4 G#4/Ab4 A#4/Bb4 A#4/Bb4 F#4/Gb4 F#4/Gb4 F#4/Gb4 C#4/Db4 C#4/Db4 C#4/Db4 rest C#4/Db4 C#4/Db4 A#4/Bb4 A#4/Bb4 G#4/Ab4 G#4/Ab4 A#4/Bb4 A#4/Bb4 G#4/Ab4 G#4/Ab4 F4 F4 C#4/Db4 C#4/Db4', 48 | 'note_dur_seq': '0.407140 0.407140 0.376190 0.376190 0.242180 0.242180 0.509550 0.509550 0.183420 0.315400 0.315400 0.235020 0.361660 0.361660 0.223070 0.377270 0.377270 0.340550 0.340550 0.299620 0.299620 0.344510 0.344510 0.283770 0.283770 0.323390 0.323390 0.360340 0.360340', 49 | 'is_slur_seq': '0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0', 50 | 'input_type': 'phoneme' 51 | } # input like Opencpop dataset. 52 | DiffSingerCascadeInfer.example_run(inp) 53 | 54 | # # CUDA_VISIBLE_DEVICES=1 python inference/svs/ds_cascade.py --config egs/egs_bases/svs/midi/cascade/opencs/ds60_rel.yaml --exp_name 0303_opencpop_ds58_midi -------------------------------------------------------------------------------- /NeuralSeq/inference/svs/opencpop/map.py: -------------------------------------------------------------------------------- 1 | def cpop_pinyin2ph_func(): 2 | # In the README file of opencpop dataset, they defined a "pinyin to phoneme mapping table" 3 | pinyin2phs = {'AP': 'AP', 'SP': 'SP'} 4 | with open('NeuralSeq/inference/svs/opencpop/cpop_pinyin2ph.txt') as rf: 5 | for line in rf.readlines(): 6 | elements = [x.strip() for x in line.split('|') if x.strip() != ''] 7 | pinyin2phs[elements[0]] = elements[1] 8 | return pinyin2phs -------------------------------------------------------------------------------- /NeuralSeq/modules/GenerSpeech/config/generspeech.yaml: -------------------------------------------------------------------------------- 1 | base_config: 2 | - egs/egs_bases/tts/fs2.yaml 3 | - egs/datasets/audio/emotion/base_text2mel.yaml 4 | 5 | task_cls: modules.GenerSpeech.task.generspeech.GenerSpeechTask 6 | 7 | # emotion encoder 8 | emotion_encoder_path: checkpoints/Emotion_encoder.pt # set the emotion encoder path 9 | 10 | # vocoder 11 | vocoder: hifigan 12 | vocoder_ckpt: checkpoints/trainset_hifigan 13 | 14 | # dataset 15 | raw_data_dir: 'data/raw/training_set' 16 | processed_data_dir: 'data/processed/training_set' 17 | binary_data_dir: 'data/binary/training_set' 18 | test_input_dir: '' 19 | 20 | # process 21 | binarizer_cls: data_gen.tts.base_binarizer_emotion.EmotionBinarizer 22 | audio_sample_rate: 16000 23 | hop_size: 256 # For 22050Hz, 275 ~= 12.5 ms (0.0125 * sample_rate) 24 | win_size: 1024 # For 22050Hz, 1100 ~= 50 ms (If None, win_size: fft_size) (0.05 * sample_rate) 25 | fmin: 80 # Set this to 55 if your speaker is male! if female, 95 should help taking off noise. (To test depending on dataset. Pitch info: male~[65, 260], female~[100, 525]) 26 | fmax: 7600 # To be increased/reduced depending on data. 
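# (note: audio_sample_rate is 16000 in this config, so hop_size 256 corresponds to 16 ms and win_size 1024
# to 64 ms; the 22050 Hz timings quoted in the inherited comments above do not apply here.)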
27 | fft_size: 1024 # Extra window size is filled with 0 paddings to match this parameter 28 | min_level_db: -100 29 | ref_level_db: 20 30 | 31 | binarization_args: 32 | reset_phone_dict: true 33 | reset_word_dict: true 34 | shuffle: true 35 | trim_eos_bos: false 36 | trim_sil: false 37 | with_align: true 38 | with_f0: true 39 | with_f0cwt: false 40 | with_linear: false 41 | with_spk_embed: true 42 | with_spk_id: true 43 | with_txt: true 44 | with_wav: true 45 | with_word: true 46 | 47 | preprocess_cls: egs.datasets.audio.libritts.pre_align.LibrittsPreAlign 48 | preprocess_args: 49 | nsample_per_mfa_group: 1000 50 | # text process 51 | txt_processor: en 52 | use_mfa: true 53 | with_phsep: true 54 | reset_phone_dict: true 55 | reset_word_dict: true 56 | add_eos_bos: true 57 | # mfa 58 | mfa_group_shuffle: false 59 | mfa_offset: 0.02 60 | # wav processors 61 | wav_processors: [] 62 | save_sil_mask: true 63 | vad_max_silence_length: 12 64 | 65 | # data 66 | word_dict_size: 10000 67 | num_spk: 500 68 | use_spk_embed: true 69 | use_spk_id: false 70 | use_word: true 71 | use_emotion: true 72 | use_gt_dur: false 73 | ref_audio: '' 74 | text: '' 75 | 76 | # training 77 | num_sanity_val_steps: -1 78 | max_updates: 300000 79 | max_sentences: 100000 80 | num_test_samples: 72 81 | 82 | ## glow 83 | post_glow_hidden: 128 84 | post_glow_kernel_size: 3 85 | post_glow_n_blocks: 8 86 | post_glow_n_block_layers: 3 87 | share_wn_layers: 4 88 | sigmoid_scale: false 89 | post_share_cond_layers: false 90 | use_txt_cond: true 91 | use_latent_cond: true 92 | noise_scale: 0.8 93 | 94 | # prosody extractor 95 | lambda_commit: 0.25 96 | vq_start: 20500 97 | vae_dropout: 0.0 98 | nVQ: 128 99 | forcing: 20000 100 | crop: false 101 | predictor_grad: 1.0 -------------------------------------------------------------------------------- /NeuralSeq/modules/GenerSpeech/model/mixstyle.py: -------------------------------------------------------------------------------- 1 | from modules.commons.common_layers import * 2 | import random 3 | 4 | 5 | class MixStyle(nn.Module): 6 | """MixStyle. 7 | Reference: 8 | Zhou et al. Domain Generalization with MixStyle. ICLR 2021. 9 | """ 10 | 11 | def __init__(self, p=0.5, alpha=0.1, eps=1e-6, hidden_size=256): 12 | """ 13 | Args: 14 | p (float): probability of using MixStyle. 15 | alpha (float): parameter of the Beta distribution. 16 | eps (float): scaling parameter to avoid numerical issues. 17 | mix (str): how to mix. 
18 | """ 19 | super().__init__() 20 | self.p = p 21 | self.beta = torch.distributions.Beta(alpha, alpha) 22 | self.eps = eps 23 | self.alpha = alpha 24 | self._activated = True 25 | self.hidden_size = hidden_size 26 | self.affine_layer = LinearNorm( 27 | hidden_size, 28 | 2 * hidden_size, # For both b (bias) g (gain) 29 | ) 30 | 31 | def __repr__(self): 32 | return f'MixStyle(p={self.p}, alpha={self.alpha}, eps={self.eps})' 33 | 34 | def set_activation_status(self, status=True): 35 | self._activated = status 36 | 37 | def forward(self, x, spk_embed): 38 | if not self.training or not self._activated: 39 | return x 40 | 41 | if random.random() > self.p: 42 | return x 43 | 44 | B = x.size(0) 45 | 46 | mu, sig = torch.mean(x, dim=-1, keepdim=True), torch.std(x, dim=-1, keepdim=True) 47 | x_normed = (x - mu) / (sig + 1e-6) # [B, T, H_m] 48 | 49 | lmda = self.beta.sample((B, 1, 1)) 50 | lmda = lmda.to(x.device) 51 | 52 | # Get Bias and Gain 53 | mu1, sig1 = torch.split(self.affine_layer(spk_embed), self.hidden_size, dim=-1) # [B, 1, 2 * H_m] --> 2 * [B, 1, H_m] 54 | 55 | # MixStyle 56 | perm = torch.randperm(B) 57 | mu2, sig2 = mu1[perm], sig1[perm] 58 | 59 | mu_mix = mu1*lmda + mu2 * (1-lmda) 60 | sig_mix = sig1*lmda + sig2 * (1-lmda) 61 | 62 | # Perform Scailing and Shifting 63 | return sig_mix * x_normed + mu_mix # [B, T, H_m] 64 | -------------------------------------------------------------------------------- /NeuralSeq/modules/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIGC-Audio/AudioGPT/a674543c537bb3a60b8f521df76b97a627c1c379/NeuralSeq/modules/__init__.py -------------------------------------------------------------------------------- /NeuralSeq/modules/commons/align_ops.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | 4 | 5 | def build_word_mask(x2word, y2word): 6 | return (x2word[:, :, None] == y2word[:, None, :]).long() 7 | 8 | 9 | def mel2ph_to_mel2word(mel2ph, ph2word): 10 | mel2word = (ph2word - 1).gather(1, (mel2ph - 1).clamp(min=0)) + 1 11 | mel2word = mel2word * (mel2ph > 0).long() 12 | return mel2word 13 | 14 | 15 | def clip_mel2token_to_multiple(mel2token, frames_multiple): 16 | max_frames = mel2token.shape[1] // frames_multiple * frames_multiple 17 | mel2token = mel2token[:, :max_frames] 18 | return mel2token 19 | 20 | 21 | def expand_states(h, mel2token): 22 | h = F.pad(h, [0, 0, 1, 0]) 23 | mel2token_ = mel2token[..., None].repeat([1, 1, h.shape[-1]]) 24 | h = torch.gather(h, 1, mel2token_) # [B, T, H] 25 | return h 26 | -------------------------------------------------------------------------------- /NeuralSeq/modules/commons/normalizing_flow/res_flow.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | from modules.commons.conv import ConditionalConvBlocks 4 | from modules.commons.wavenet import WN 5 | 6 | 7 | class FlipLayer(nn.Module): 8 | def forward(self, x, nonpadding, cond=None, reverse=False): 9 | x = torch.flip(x, [1]) 10 | return x 11 | 12 | 13 | class CouplingLayer(nn.Module): 14 | def __init__(self, c_in, hidden_size, kernel_size, n_layers, p_dropout=0, c_in_g=0, nn_type='wn'): 15 | super().__init__() 16 | self.channels = c_in 17 | self.hidden_size = hidden_size 18 | self.kernel_size = kernel_size 19 | self.n_layers = n_layers 20 | self.c_half = c_in // 2 21 | 22 | self.pre = nn.Conv1d(self.c_half, 
hidden_size, 1) 23 | if nn_type == 'wn': 24 | self.enc = WN(hidden_size, kernel_size, 1, n_layers, p_dropout=p_dropout, 25 | c_cond=c_in_g) 26 | elif nn_type == 'conv': 27 | self.enc = ConditionalConvBlocks( 28 | hidden_size, c_in_g, hidden_size, None, kernel_size, 29 | layers_in_block=1, is_BTC=False, num_layers=n_layers) 30 | self.post = nn.Conv1d(hidden_size, self.c_half, 1) 31 | 32 | def forward(self, x, nonpadding, cond=None, reverse=False): 33 | x0, x1 = x[:, :self.c_half], x[:, self.c_half:] 34 | x_ = self.pre(x0) * nonpadding 35 | x_ = self.enc(x_, nonpadding=nonpadding, cond=cond) 36 | m = self.post(x_) 37 | x1 = m + x1 if not reverse else x1 - m 38 | x = torch.cat([x0, x1], 1) 39 | return x * nonpadding 40 | 41 | 42 | class ResFlow(nn.Module): 43 | def __init__(self, 44 | c_in, 45 | hidden_size, 46 | kernel_size, 47 | n_flow_layers, 48 | n_flow_steps=4, 49 | c_cond=0, 50 | nn_type='wn'): 51 | super().__init__() 52 | self.flows = nn.ModuleList() 53 | for i in range(n_flow_steps): 54 | self.flows.append( 55 | CouplingLayer(c_in, hidden_size, kernel_size, n_flow_layers, c_in_g=c_cond, nn_type=nn_type)) 56 | self.flows.append(FlipLayer()) 57 | 58 | def forward(self, x, nonpadding, cond=None, reverse=False): 59 | for flow in (self.flows if not reverse else reversed(self.flows)): 60 | x = flow(x, nonpadding, cond=cond, reverse=reverse) 61 | return x 62 | -------------------------------------------------------------------------------- /NeuralSeq/modules/commons/normalizing_flow/utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def squeeze(x, x_mask=None, n_sqz=2): 5 | b, c, t = x.size() 6 | 7 | t = (t // n_sqz) * n_sqz 8 | x = x[:, :, :t] 9 | x_sqz = x.view(b, c, t // n_sqz, n_sqz) 10 | x_sqz = x_sqz.permute(0, 3, 1, 2).contiguous().view(b, c * n_sqz, t // n_sqz) 11 | 12 | if x_mask is not None: 13 | x_mask = x_mask[:, :, n_sqz - 1::n_sqz] 14 | else: 15 | x_mask = torch.ones(b, 1, t // n_sqz).to(device=x.device, dtype=x.dtype) 16 | return x_sqz * x_mask, x_mask 17 | 18 | 19 | def unsqueeze(x, x_mask=None, n_sqz=2): 20 | b, c, t = x.size() 21 | 22 | x_unsqz = x.view(b, n_sqz, c // n_sqz, t) 23 | x_unsqz = x_unsqz.permute(0, 2, 3, 1).contiguous().view(b, c // n_sqz, t * n_sqz) 24 | 25 | if x_mask is not None: 26 | x_mask = x_mask.unsqueeze(-1).repeat(1, 1, 1, n_sqz).view(b, 1, t * n_sqz) 27 | else: 28 | x_mask = torch.ones(b, 1, t * n_sqz).to(device=x.device, dtype=x.dtype) 29 | return x_unsqz * x_mask, x_mask 30 | -------------------------------------------------------------------------------- /NeuralSeq/modules/hifigan/mel_utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.utils.data 4 | from librosa.filters import mel as librosa_mel_fn 5 | from scipy.io.wavfile import read 6 | 7 | MAX_WAV_VALUE = 32768.0 8 | 9 | 10 | def load_wav(full_path): 11 | sampling_rate, data = read(full_path) 12 | return data, sampling_rate 13 | 14 | 15 | def dynamic_range_compression(x, C=1, clip_val=1e-5): 16 | return np.log(np.clip(x, a_min=clip_val, a_max=None) * C) 17 | 18 | 19 | def dynamic_range_decompression(x, C=1): 20 | return np.exp(x) / C 21 | 22 | 23 | def dynamic_range_compression_torch(x, C=1, clip_val=1e-5): 24 | return torch.log(torch.clamp(x, min=clip_val) * C) 25 | 26 | 27 | def dynamic_range_decompression_torch(x, C=1): 28 | return torch.exp(x) / C 29 | 30 | 31 | def spectral_normalize_torch(magnitudes): 32 | output = 
dynamic_range_compression_torch(magnitudes) 33 | return output 34 | 35 | 36 | def spectral_de_normalize_torch(magnitudes): 37 | output = dynamic_range_decompression_torch(magnitudes) 38 | return output 39 | 40 | 41 | mel_basis = {} 42 | hann_window = {} 43 | 44 | 45 | def mel_spectrogram(y, hparams, center=False, complex=False): 46 | # hop_size: 512 # For 22050Hz, 275 ~= 12.5 ms (0.0125 * sample_rate) 47 | # win_size: 2048 # For 22050Hz, 1100 ~= 50 ms (If None, win_size: fft_size) (0.05 * sample_rate) 48 | # fmin: 55 # Set this to 55 if your speaker is male! if female, 95 should help taking off noise. (To test depending on dataset. Pitch info: male~[65, 260], female~[100, 525]) 49 | # fmax: 10000 # To be increased/reduced depending on data. 50 | # fft_size: 2048 # Extra window size is filled with 0 paddings to match this parameter 51 | # n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, 52 | n_fft = hparams['fft_size'] 53 | num_mels = hparams['audio_num_mel_bins'] 54 | sampling_rate = hparams['audio_sample_rate'] 55 | hop_size = hparams['hop_size'] 56 | win_size = hparams['win_size'] 57 | fmin = hparams['fmin'] 58 | fmax = hparams['fmax'] 59 | y = y.clamp(min=-1., max=1.) 60 | global mel_basis, hann_window 61 | if fmax not in mel_basis: 62 | mel = librosa_mel_fn(sampling_rate, n_fft, num_mels, fmin, fmax) 63 | mel_basis[str(fmax) + '_' + str(y.device)] = torch.from_numpy(mel).float().to(y.device) 64 | hann_window[str(y.device)] = torch.hann_window(win_size).to(y.device) 65 | 66 | y = torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)), 67 | mode='reflect') 68 | y = y.squeeze(1) 69 | 70 | spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[str(y.device)], 71 | center=center, pad_mode='reflect', normalized=False, onesided=True) 72 | 73 | if not complex: 74 | spec = torch.sqrt(spec.pow(2).sum(-1) + (1e-9)) 75 | spec = torch.matmul(mel_basis[str(fmax) + '_' + str(y.device)], spec) 76 | spec = spectral_normalize_torch(spec) 77 | else: 78 | B, C, T, _ = spec.shape 79 | spec = spec.transpose(1, 2) # [B, T, n_fft, 2] 80 | return spec 81 | -------------------------------------------------------------------------------- /NeuralSeq/modules/parallel_wavegan/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIGC-Audio/AudioGPT/a674543c537bb3a60b8f521df76b97a627c1c379/NeuralSeq/modules/parallel_wavegan/__init__.py -------------------------------------------------------------------------------- /NeuralSeq/modules/parallel_wavegan/layers/__init__.py: -------------------------------------------------------------------------------- 1 | from .causal_conv import * # NOQA 2 | from .pqmf import * # NOQA 3 | from .residual_block import * # NOQA 4 | from modules.parallel_wavegan.layers.residual_stack import * # NOQA 5 | from .upsample import * # NOQA 6 | -------------------------------------------------------------------------------- /NeuralSeq/modules/parallel_wavegan/layers/causal_conv.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Copyright 2020 Tomoki Hayashi 4 | # MIT License (https://opensource.org/licenses/MIT) 5 | 6 | """Causal convolusion layer modules.""" 7 | 8 | 9 | import torch 10 | 11 | 12 | class CausalConv1d(torch.nn.Module): 13 | """CausalConv1d module with customized initialization.""" 14 | 15 | def __init__(self, in_channels, out_channels, 
kernel_size, 16 | dilation=1, bias=True, pad="ConstantPad1d", pad_params={"value": 0.0}): 17 | """Initialize CausalConv1d module.""" 18 | super(CausalConv1d, self).__init__() 19 | self.pad = getattr(torch.nn, pad)((kernel_size - 1) * dilation, **pad_params) 20 | self.conv = torch.nn.Conv1d(in_channels, out_channels, kernel_size, 21 | dilation=dilation, bias=bias) 22 | 23 | def forward(self, x): 24 | """Calculate forward propagation. 25 | 26 | Args: 27 | x (Tensor): Input tensor (B, in_channels, T). 28 | 29 | Returns: 30 | Tensor: Output tensor (B, out_channels, T). 31 | 32 | """ 33 | return self.conv(self.pad(x))[:, :, :x.size(2)] 34 | 35 | 36 | class CausalConvTranspose1d(torch.nn.Module): 37 | """CausalConvTranspose1d module with customized initialization.""" 38 | 39 | def __init__(self, in_channels, out_channels, kernel_size, stride, bias=True): 40 | """Initialize CausalConvTranspose1d module.""" 41 | super(CausalConvTranspose1d, self).__init__() 42 | self.deconv = torch.nn.ConvTranspose1d( 43 | in_channels, out_channels, kernel_size, stride, bias=bias) 44 | self.stride = stride 45 | 46 | def forward(self, x): 47 | """Calculate forward propagation. 48 | 49 | Args: 50 | x (Tensor): Input tensor (B, in_channels, T_in). 51 | 52 | Returns: 53 | Tensor: Output tensor (B, out_channels, T_out). 54 | 55 | """ 56 | return self.deconv(x)[:, :, :-self.stride] 57 | -------------------------------------------------------------------------------- /NeuralSeq/modules/parallel_wavegan/layers/residual_stack.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Copyright 2020 Tomoki Hayashi 4 | # MIT License (https://opensource.org/licenses/MIT) 5 | 6 | """Residual stack module in MelGAN.""" 7 | 8 | import torch 9 | 10 | from . import CausalConv1d 11 | 12 | 13 | class ResidualStack(torch.nn.Module): 14 | """Residual stack module introduced in MelGAN.""" 15 | 16 | def __init__(self, 17 | kernel_size=3, 18 | channels=32, 19 | dilation=1, 20 | bias=True, 21 | nonlinear_activation="LeakyReLU", 22 | nonlinear_activation_params={"negative_slope": 0.2}, 23 | pad="ReflectionPad1d", 24 | pad_params={}, 25 | use_causal_conv=False, 26 | ): 27 | """Initialize ResidualStack module. 28 | 29 | Args: 30 | kernel_size (int): Kernel size of dilation convolution layer. 31 | channels (int): Number of channels of convolution layers. 32 | dilation (int): Dilation factor. 33 | bias (bool): Whether to add bias parameter in convolution layers. 34 | nonlinear_activation (str): Activation function module name. 35 | nonlinear_activation_params (dict): Hyperparameters for activation function. 36 | pad (str): Padding function module name before dilated convolution layer. 37 | pad_params (dict): Hyperparameters for padding function. 38 | use_causal_conv (bool): Whether to use causal convolution. 39 | 40 | """ 41 | super(ResidualStack, self).__init__() 42 | 43 | # defile residual stack part 44 | if not use_causal_conv: 45 | assert (kernel_size - 1) % 2 == 0, "Not support even number kernel size." 
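# With an odd kernel_size, padding (kernel_size - 1) // 2 * dilation on each side (the pad layer below)
# keeps the sequence length unchanged, so self.stack(c) and self.skip_layer(c) line up in forward().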
46 | self.stack = torch.nn.Sequential( 47 | getattr(torch.nn, nonlinear_activation)(**nonlinear_activation_params), 48 | getattr(torch.nn, pad)((kernel_size - 1) // 2 * dilation, **pad_params), 49 | torch.nn.Conv1d(channels, channels, kernel_size, dilation=dilation, bias=bias), 50 | getattr(torch.nn, nonlinear_activation)(**nonlinear_activation_params), 51 | torch.nn.Conv1d(channels, channels, 1, bias=bias), 52 | ) 53 | else: 54 | self.stack = torch.nn.Sequential( 55 | getattr(torch.nn, nonlinear_activation)(**nonlinear_activation_params), 56 | CausalConv1d(channels, channels, kernel_size, dilation=dilation, 57 | bias=bias, pad=pad, pad_params=pad_params), 58 | getattr(torch.nn, nonlinear_activation)(**nonlinear_activation_params), 59 | torch.nn.Conv1d(channels, channels, 1, bias=bias), 60 | ) 61 | 62 | # defile extra layer for skip connection 63 | self.skip_layer = torch.nn.Conv1d(channels, channels, 1, bias=bias) 64 | 65 | def forward(self, c): 66 | """Calculate forward propagation. 67 | 68 | Args: 69 | c (Tensor): Input tensor (B, channels, T). 70 | 71 | Returns: 72 | Tensor: Output tensor (B, chennels, T). 73 | 74 | """ 75 | return self.stack(c) + self.skip_layer(c) 76 | -------------------------------------------------------------------------------- /NeuralSeq/modules/parallel_wavegan/losses/__init__.py: -------------------------------------------------------------------------------- 1 | from .stft_loss import * # NOQA 2 | -------------------------------------------------------------------------------- /NeuralSeq/modules/parallel_wavegan/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .melgan import * # NOQA 2 | from .parallel_wavegan import * # NOQA 3 | -------------------------------------------------------------------------------- /NeuralSeq/modules/parallel_wavegan/optimizers/__init__.py: -------------------------------------------------------------------------------- 1 | from torch.optim import * # NOQA 2 | from .radam import * # NOQA 3 | -------------------------------------------------------------------------------- /NeuralSeq/modules/parallel_wavegan/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .utils import * # NOQA 2 | -------------------------------------------------------------------------------- /NeuralSeq/tasks/run.py: -------------------------------------------------------------------------------- 1 | import importlib 2 | from utils.hparams import set_hparams, hparams 3 | 4 | 5 | def run_task(): 6 | assert hparams['task_cls'] != '' 7 | pkg = ".".join(hparams["task_cls"].split(".")[:-1]) 8 | cls_name = hparams["task_cls"].split(".")[-1] 9 | task_cls = getattr(importlib.import_module(pkg), cls_name) 10 | task_cls.start() 11 | 12 | 13 | if __name__ == '__main__': 14 | set_hparams() 15 | run_task() 16 | -------------------------------------------------------------------------------- /NeuralSeq/tasks/svs/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIGC-Audio/AudioGPT/a674543c537bb3a60b8f521df76b97a627c1c379/NeuralSeq/tasks/svs/__init__.py -------------------------------------------------------------------------------- /NeuralSeq/tasks/tts/synta.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import torch.nn.functional as F 4 | from torch import nn 5 | 6 | from modules.tts.syntaspeech.syntaspeech 
import SyntaSpeech 7 | from tasks.tts.ps_adv import PortaSpeechAdvTask 8 | from utils.hparams import hparams 9 | 10 | 11 | class SyntaSpeechTask(PortaSpeechAdvTask): 12 | def build_tts_model(self): 13 | ph_dict_size = len(self.token_encoder) 14 | word_dict_size = len(self.word_encoder) 15 | self.model = SyntaSpeech(ph_dict_size, word_dict_size, hparams) 16 | 17 | self.gen_params = [p for p in self.model.parameters() if p.requires_grad] 18 | self.dp_params = [p for k, p in self.model.named_parameters() if (('dur_predictor' in k) and p.requires_grad)] 19 | self.gen_params_except_dp = [p for k, p in self.model.named_parameters() if (('dur_predictor' not in k) and p.requires_grad)] 20 | self.bert_params = [p for k, p in self.model.named_parameters() if (('bert' in k) and p.requires_grad)] 21 | self.gen_params_except_bert_and_dp = [p for k, p in self.model.named_parameters() if ('dur_predictor' not in k) and ('bert' not in k) and p.requires_grad ] 22 | 23 | self.use_bert = True if len(self.bert_params) > 0 else False 24 | 25 | -------------------------------------------------------------------------------- /NeuralSeq/tasks/tts/tts_utils.py: -------------------------------------------------------------------------------- 1 | import importlib 2 | 3 | from data_gen.tts.base_binarizer import BaseBinarizer 4 | from data_gen.tts.base_preprocess import BasePreprocessor 5 | from data_gen.tts.txt_processors.base_text_processor import get_txt_processor_cls 6 | from utils.hparams import hparams 7 | 8 | 9 | def parse_dataset_configs(): 10 | max_tokens = hparams['max_tokens'] 11 | max_sentences = hparams['max_sentences'] 12 | max_valid_tokens = hparams['max_valid_tokens'] 13 | if max_valid_tokens == -1: 14 | hparams['max_valid_tokens'] = max_valid_tokens = max_tokens 15 | max_valid_sentences = hparams['max_valid_sentences'] 16 | if max_valid_sentences == -1: 17 | hparams['max_valid_sentences'] = max_valid_sentences = max_sentences 18 | return max_tokens, max_sentences, max_valid_tokens, max_valid_sentences 19 | 20 | 21 | def parse_mel_losses(): 22 | mel_losses = hparams['mel_losses'].split("|") 23 | loss_and_lambda = {} 24 | for i, l in enumerate(mel_losses): 25 | if l == '': 26 | continue 27 | if ':' in l: 28 | l, lbd = l.split(":") 29 | lbd = float(lbd) 30 | else: 31 | lbd = 1.0 32 | loss_and_lambda[l] = lbd 33 | print("| Mel losses:", loss_and_lambda) 34 | return loss_and_lambda 35 | 36 | 37 | def load_data_preprocessor(): 38 | preprocess_cls = hparams["preprocess_cls"] 39 | pkg = ".".join(preprocess_cls.split(".")[:-1]) 40 | cls_name = preprocess_cls.split(".")[-1] 41 | preprocessor: BasePreprocessor = getattr(importlib.import_module(pkg), cls_name)() 42 | preprocess_args = {} 43 | preprocess_args.update(hparams['preprocess_args']) 44 | return preprocessor, preprocess_args 45 | 46 | 47 | def load_data_binarizer(): 48 | binarizer_cls = hparams['binarizer_cls'] 49 | pkg = ".".join(binarizer_cls.split(".")[:-1]) 50 | cls_name = binarizer_cls.split(".")[-1] 51 | binarizer: BaseBinarizer = getattr(importlib.import_module(pkg), cls_name)() 52 | binarization_args = {} 53 | binarization_args.update(hparams['binarization_args']) 54 | return binarizer, binarization_args -------------------------------------------------------------------------------- /NeuralSeq/tasks/vocoder/vocoder_base.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import torch 4 | import torch.distributed as dist 5 | from torch.utils.data import DistributedSampler 6 | 7 | from 
tasks.base_task import BaseTask 8 | from tasks.base_task import data_loader 9 | from tasks.vocoder.dataset_utils import VocoderDataset, EndlessDistributedSampler 10 | from utils.hparams import hparams 11 | 12 | 13 | class VocoderBaseTask(BaseTask): 14 | def __init__(self): 15 | super(VocoderBaseTask, self).__init__() 16 | self.max_sentences = hparams['max_sentences'] 17 | self.max_valid_sentences = hparams['max_valid_sentences'] 18 | if self.max_valid_sentences == -1: 19 | hparams['max_valid_sentences'] = self.max_valid_sentences = self.max_sentences 20 | self.dataset_cls = VocoderDataset 21 | 22 | @data_loader 23 | def train_dataloader(self): 24 | train_dataset = self.dataset_cls('train', shuffle=True) 25 | return self.build_dataloader(train_dataset, True, self.max_sentences, hparams['endless_ds']) 26 | 27 | @data_loader 28 | def val_dataloader(self): 29 | valid_dataset = self.dataset_cls('valid', shuffle=False) 30 | return self.build_dataloader(valid_dataset, False, self.max_valid_sentences) 31 | 32 | @data_loader 33 | def test_dataloader(self): 34 | test_dataset = self.dataset_cls('test', shuffle=False) 35 | return self.build_dataloader(test_dataset, False, self.max_valid_sentences) 36 | 37 | def build_dataloader(self, dataset, shuffle, max_sentences, endless=False): 38 | world_size = 1 39 | rank = 0 40 | if dist.is_initialized(): 41 | world_size = dist.get_world_size() 42 | rank = dist.get_rank() 43 | sampler_cls = DistributedSampler if not endless else EndlessDistributedSampler 44 | train_sampler = sampler_cls( 45 | dataset=dataset, 46 | num_replicas=world_size, 47 | rank=rank, 48 | shuffle=shuffle, 49 | ) 50 | return torch.utils.data.DataLoader( 51 | dataset=dataset, 52 | shuffle=False, 53 | collate_fn=dataset.collater, 54 | batch_size=max_sentences, 55 | num_workers=dataset.num_workers, 56 | sampler=train_sampler, 57 | pin_memory=True, 58 | ) 59 | 60 | def test_start(self): 61 | self.gen_dir = os.path.join(hparams['work_dir'], 62 | f'generated_{self.trainer.global_step}_{hparams["gen_dir_name"]}') 63 | os.makedirs(self.gen_dir, exist_ok=True) 64 | 65 | def test_end(self, outputs): 66 | return {} 67 | -------------------------------------------------------------------------------- /NeuralSeq/utils/ckpt_utils.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import logging 3 | import os 4 | import re 5 | import torch 6 | 7 | 8 | def get_last_checkpoint(work_dir, steps=None): 9 | checkpoint = None 10 | last_ckpt_path = None 11 | ckpt_paths = get_all_ckpts(work_dir, steps) 12 | if len(ckpt_paths) > 0: 13 | last_ckpt_path = ckpt_paths[0] 14 | checkpoint = torch.load(last_ckpt_path, map_location='cpu') 15 | logging.info(f'load module from checkpoint: {last_ckpt_path}') 16 | return checkpoint, last_ckpt_path 17 | 18 | 19 | def get_all_ckpts(work_dir, steps=None): 20 | if steps is None: 21 | ckpt_path_pattern = f'{work_dir}/model_ckpt_steps_*.ckpt' 22 | else: 23 | ckpt_path_pattern = f'{work_dir}/model_ckpt_steps_{steps}.ckpt' 24 | return sorted(glob.glob(ckpt_path_pattern), 25 | key=lambda x: -int(re.findall('.*steps\_(\d+)\.ckpt', x)[0])) 26 | 27 | 28 | def load_ckpt(cur_model, ckpt_base_dir, model_name='model', force=True, strict=True): 29 | if os.path.isfile(ckpt_base_dir): 30 | base_dir = os.path.dirname(ckpt_base_dir) 31 | ckpt_path = ckpt_base_dir 32 | checkpoint = torch.load(ckpt_base_dir, map_location='cpu') 33 | else: 34 | base_dir = ckpt_base_dir 35 | checkpoint, ckpt_path = get_last_checkpoint(ckpt_base_dir) 36 | if 
checkpoint is not None: 37 | state_dict = checkpoint["state_dict"] 38 | if len([k for k in state_dict.keys() if '.' in k]) > 0: 39 | state_dict = {k[len(model_name) + 1:]: v for k, v in state_dict.items() 40 | if k.startswith(f'{model_name}.')} 41 | else: 42 | if '.' not in model_name: 43 | state_dict = state_dict[model_name] 44 | else: 45 | base_model_name = model_name.split('.')[0] 46 | rest_model_name = model_name[len(base_model_name) + 1:] 47 | state_dict = { 48 | k[len(rest_model_name) + 1:]: v for k, v in state_dict[base_model_name].items() 49 | if k.startswith(f'{rest_model_name}.')} 50 | if not strict: 51 | cur_model_state_dict = cur_model.state_dict() 52 | unmatched_keys = [] 53 | for key, param in state_dict.items(): 54 | if key in cur_model_state_dict: 55 | new_param = cur_model_state_dict[key] 56 | if new_param.shape != param.shape: 57 | unmatched_keys.append(key) 58 | print("| Unmatched keys: ", key, new_param.shape, param.shape) 59 | for key in unmatched_keys: 60 | del state_dict[key] 61 | cur_model.load_state_dict(state_dict, strict=strict) 62 | print(f"| load '{model_name}' from '{ckpt_path}'.") 63 | else: 64 | e_msg = f"| ckpt not found in {base_dir}." 65 | if force: 66 | assert False, e_msg 67 | else: 68 | print(e_msg) 69 | -------------------------------------------------------------------------------- /NeuralSeq/utils/indexed_datasets.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | from copy import deepcopy 3 | 4 | import numpy as np 5 | 6 | 7 | class IndexedDataset: 8 | def __init__(self, path, num_cache=1): 9 | super().__init__() 10 | self.path = path 11 | self.data_file = None 12 | self.data_offsets = np.load(f"{path}.idx", allow_pickle=True).item()['offsets'] 13 | self.data_file = open(f"{path}.data", 'rb', buffering=-1) 14 | self.cache = [] 15 | self.num_cache = num_cache 16 | 17 | def check_index(self, i): 18 | if i < 0 or i >= len(self.data_offsets) - 1: 19 | raise IndexError('index out of range') 20 | 21 | def __del__(self): 22 | if self.data_file: 23 | self.data_file.close() 24 | 25 | def __getitem__(self, i): 26 | self.check_index(i) 27 | if self.num_cache > 0: 28 | for c in self.cache: 29 | if c[0] == i: 30 | return c[1] 31 | self.data_file.seek(self.data_offsets[i]) 32 | b = self.data_file.read(self.data_offsets[i + 1] - self.data_offsets[i]) 33 | item = pickle.loads(b) 34 | if self.num_cache > 0: 35 | self.cache = [(i, deepcopy(item))] + self.cache[:-1] 36 | return item 37 | 38 | def __len__(self): 39 | return len(self.data_offsets) - 1 40 | 41 | class IndexedDatasetBuilder: 42 | def __init__(self, path): 43 | self.path = path 44 | self.out_file = open(f"{path}.data", 'wb') 45 | self.byte_offsets = [0] 46 | 47 | def add_item(self, item): 48 | s = pickle.dumps(item) 49 | bytes = self.out_file.write(s) 50 | self.byte_offsets.append(self.byte_offsets[-1] + bytes) 51 | 52 | def finalize(self): 53 | self.out_file.close() 54 | np.save(open(f"{self.path}.idx", 'wb'), {'offsets': self.byte_offsets}) 55 | 56 | 57 | if __name__ == "__main__": 58 | import random 59 | from tqdm import tqdm 60 | ds_path = '/tmp/indexed_ds_example' 61 | size = 100 62 | items = [{"a": np.random.normal(size=[10000, 10]), 63 | "b": np.random.normal(size=[10000, 10])} for i in range(size)] 64 | builder = IndexedDatasetBuilder(ds_path) 65 | for i in tqdm(range(size)): 66 | builder.add_item(items[i]) 67 | builder.finalize() 68 | ds = IndexedDataset(ds_path) 69 | for i in tqdm(range(10000)): 70 | idx = random.randint(0, size - 1) 71 | 
assert (ds[idx]['a'] == items[idx]['a']).all() 72 | -------------------------------------------------------------------------------- /NeuralSeq/utils/multiprocess_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import traceback 3 | from multiprocessing import Queue, Process 4 | 5 | 6 | def chunked_worker(worker_id, map_func, args, results_queue=None, init_ctx_func=None): 7 | ctx = init_ctx_func(worker_id) if init_ctx_func is not None else None 8 | for job_idx, arg in args: 9 | try: 10 | if ctx is not None: 11 | res = map_func(*arg, ctx=ctx) 12 | else: 13 | res = map_func(*arg) 14 | results_queue.put((job_idx, res)) 15 | except: 16 | traceback.print_exc() 17 | results_queue.put((job_idx, None)) 18 | 19 | def chunked_multiprocess_run(map_func, args, num_workers=None, ordered=True, init_ctx_func=None, q_max_size=1000): 20 | args = zip(range(len(args)), args) 21 | args = list(args) 22 | n_jobs = len(args) 23 | if num_workers is None: 24 | num_workers = int(os.getenv('N_PROC', os.cpu_count())) 25 | results_queues = [] 26 | if ordered: 27 | for i in range(num_workers): 28 | results_queues.append(Queue(maxsize=q_max_size // num_workers)) 29 | else: 30 | results_queue = Queue(maxsize=q_max_size) 31 | for i in range(num_workers): 32 | results_queues.append(results_queue) 33 | workers = [] 34 | for i in range(num_workers): 35 | args_worker = args[i::num_workers] 36 | p = Process(target=chunked_worker, args=( 37 | i, map_func, args_worker, results_queues[i], init_ctx_func), daemon=True) 38 | workers.append(p) 39 | p.start() 40 | for n_finished in range(n_jobs): 41 | results_queue = results_queues[n_finished % num_workers] 42 | job_idx, res = results_queue.get() 43 | assert job_idx == n_finished or not ordered, (job_idx, n_finished) 44 | yield res 45 | for w in workers: 46 | w.join() 47 | w.close() 48 | 49 | def multiprocess_run_tqdm(map_func, args, num_workers=None, ordered=True, init_ctx_func=None, 50 | multithread=False, desc=None): 51 | for i, res in tqdm(enumerate( 52 | multiprocess_run(map_func, args, num_workers, ordered, init_ctx_func, multithread)), 53 | total=len(args), desc=desc): 54 | yield i, res -------------------------------------------------------------------------------- /NeuralSeq/utils/os_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import subprocess 3 | 4 | 5 | def link_file(from_file, to_file): 6 | subprocess.check_call( 7 | f'ln -s "`realpath --relative-to="{os.path.dirname(to_file)}" "{from_file}"`" "{to_file}"', shell=True) 8 | 9 | 10 | def move_file(from_file, to_file): 11 | subprocess.check_call(f'mv "{from_file}" "{to_file}"', shell=True) 12 | 13 | 14 | def copy_file(from_file, to_file): 15 | subprocess.check_call(f'cp -r "{from_file}" "{to_file}"', shell=True) 16 | 17 | 18 | def remove_file(*fns): 19 | for f in fns: 20 | subprocess.check_call(f'rm -rf "{f}"', shell=True) -------------------------------------------------------------------------------- /NeuralSeq/utils/pitch_utils.py: -------------------------------------------------------------------------------- 1 | ######### 2 | # world 3 | ########## 4 | import librosa 5 | import numpy as np 6 | import torch 7 | 8 | gamma = 0 9 | mcepInput = 3 # 0 for dB, 3 for magnitude 10 | alpha = 0.45 11 | en_floor = 10 ** (-80 / 20) 12 | FFT_SIZE = 2048 13 | 14 | 15 | f0_bin = 256 16 | f0_max = 1100.0 17 | f0_min = 50.0 18 | f0_mel_min = 1127 * np.log(1 + f0_min / 700) 19 | f0_mel_max = 1127 * np.log(1 + 
f0_max / 700) 20 | 21 | 22 | def f0_to_coarse(f0): 23 | is_torch = isinstance(f0, torch.Tensor) 24 | f0_mel = 1127 * (1 + f0 / 700).log() if is_torch else 1127 * np.log(1 + f0 / 700) 25 | f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * (f0_bin - 2) / (f0_mel_max - f0_mel_min) + 1 26 | 27 | f0_mel[f0_mel <= 1] = 1 28 | f0_mel[f0_mel > f0_bin - 1] = f0_bin - 1 29 | f0_coarse = (f0_mel + 0.5).long() if is_torch else np.rint(f0_mel).astype(np.int) 30 | assert f0_coarse.max() <= 255 and f0_coarse.min() >= 1, (f0_coarse.max(), f0_coarse.min()) 31 | return f0_coarse 32 | 33 | 34 | def norm_f0(f0, uv, hparams): 35 | is_torch = isinstance(f0, torch.Tensor) 36 | if hparams['pitch_norm'] == 'standard': 37 | f0 = (f0 - hparams['f0_mean']) / hparams['f0_std'] 38 | if hparams['pitch_norm'] == 'log': 39 | f0 = torch.log2(f0) if is_torch else np.log2(f0) 40 | if uv is not None and hparams['use_uv']: 41 | f0[uv > 0] = 0 42 | return f0 43 | 44 | 45 | def norm_interp_f0(f0, hparams): 46 | is_torch = isinstance(f0, torch.Tensor) 47 | if is_torch: 48 | device = f0.device 49 | f0 = f0.data.cpu().numpy() 50 | uv = f0 == 0 51 | f0 = norm_f0(f0, uv, hparams) 52 | if sum(uv) == len(f0): 53 | f0[uv] = 0 54 | elif sum(uv) > 0: 55 | f0[uv] = np.interp(np.where(uv)[0], np.where(~uv)[0], f0[~uv]) 56 | uv = torch.FloatTensor(uv) 57 | f0 = torch.FloatTensor(f0) 58 | if is_torch: 59 | f0 = f0.to(device) 60 | return f0, uv 61 | 62 | 63 | def denorm_f0(f0, uv, hparams, pitch_padding=None, min=None, max=None): 64 | if hparams['pitch_norm'] == 'standard': 65 | f0 = f0 * hparams['f0_std'] + hparams['f0_mean'] 66 | if hparams['pitch_norm'] == 'log': 67 | f0 = 2 ** f0 68 | if min is not None: 69 | f0 = f0.clamp(min=min) 70 | if max is not None: 71 | f0 = f0.clamp(max=max) 72 | if uv is not None and hparams['use_uv']: 73 | f0[uv > 0] = 0 74 | if pitch_padding is not None: 75 | f0[pitch_padding] = 0 76 | return f0 77 | -------------------------------------------------------------------------------- /NeuralSeq/utils/plot.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import numpy as np 3 | import torch 4 | 5 | LINE_COLORS = ['w', 'r', 'y', 'cyan', 'm', 'b', 'lime'] 6 | 7 | 8 | def spec_to_figure(spec, vmin=None, vmax=None): 9 | if isinstance(spec, torch.Tensor): 10 | spec = spec.cpu().numpy() 11 | fig = plt.figure(figsize=(12, 6)) 12 | plt.pcolor(spec.T, vmin=vmin, vmax=vmax) 13 | return fig 14 | 15 | 16 | def spec_f0_to_figure(spec, f0s, figsize=None): 17 | max_y = spec.shape[1] 18 | if isinstance(spec, torch.Tensor): 19 | spec = spec.detach().cpu().numpy() 20 | f0s = {k: f0.detach().cpu().numpy() for k, f0 in f0s.items()} 21 | f0s = {k: f0 / 10 for k, f0 in f0s.items()} 22 | fig = plt.figure(figsize=(12, 6) if figsize is None else figsize) 23 | plt.pcolor(spec.T) 24 | for i, (k, f0) in enumerate(f0s.items()): 25 | plt.plot(f0.clip(0, max_y), label=k, c=LINE_COLORS[i], linewidth=1, alpha=0.8) 26 | plt.legend() 27 | return fig 28 | 29 | 30 | def dur_to_figure(dur_gt, dur_pred, txt): 31 | dur_gt = dur_gt.long().cpu().numpy() 32 | dur_pred = dur_pred.long().cpu().numpy() 33 | dur_gt = np.cumsum(dur_gt) 34 | dur_pred = np.cumsum(dur_pred) 35 | fig = plt.figure(figsize=(12, 6)) 36 | for i in range(len(dur_gt)): 37 | shift = (i % 8) + 1 38 | plt.text(dur_gt[i], shift, txt[i]) 39 | plt.text(dur_pred[i], 10 + shift, txt[i]) 40 | plt.vlines(dur_gt[i], 0, 10, colors='b') # blue is gt 41 | plt.vlines(dur_pred[i], 10, 20, colors='r') # red is pred 42 | 
return fig 43 | 44 | 45 | def f0_to_figure(f0_gt, f0_cwt=None, f0_pred=None): 46 | fig = plt.figure() 47 | f0_gt = f0_gt.cpu().numpy() 48 | plt.plot(f0_gt, color='r', label='gt') 49 | if f0_cwt is not None: 50 | f0_cwt = f0_cwt.cpu().numpy() 51 | plt.plot(f0_cwt, color='b', label='cwt') 52 | if f0_pred is not None: 53 | f0_pred = f0_pred.cpu().numpy() 54 | plt.plot(f0_pred, color='green', label='pred') 55 | plt.legend() 56 | return fig 57 | -------------------------------------------------------------------------------- /NeuralSeq/utils/training_utils.py: -------------------------------------------------------------------------------- 1 | from utils.hparams import hparams 2 | 3 | 4 | class RSQRTSchedule(object): 5 | def __init__(self, optimizer): 6 | super().__init__() 7 | self.optimizer = optimizer 8 | self.constant_lr = hparams['lr'] 9 | self.warmup_updates = hparams['warmup_updates'] 10 | self.hidden_size = hparams['hidden_size'] 11 | self.lr = hparams['lr'] 12 | for param_group in optimizer.param_groups: 13 | param_group['lr'] = self.lr 14 | self.step(0) 15 | 16 | def step(self, num_updates): 17 | constant_lr = self.constant_lr 18 | warmup = min(num_updates / self.warmup_updates, 1.0) 19 | rsqrt_decay = max(self.warmup_updates, num_updates) ** -0.5 20 | rsqrt_hidden = self.hidden_size ** -0.5 21 | self.lr = max(constant_lr * warmup * rsqrt_decay * rsqrt_hidden, 1e-7) 22 | for param_group in self.optimizer.param_groups: 23 | param_group['lr'] = self.lr 24 | return self.lr 25 | 26 | def get_lr(self): 27 | return self.optimizer.param_groups[0]['lr'] 28 | -------------------------------------------------------------------------------- /NeuralSeq/vocoders/__init__.py: -------------------------------------------------------------------------------- 1 | from vocoders import hifigan 2 | -------------------------------------------------------------------------------- /NeuralSeq/vocoders/base_vocoder.py: -------------------------------------------------------------------------------- 1 | import importlib 2 | VOCODERS = {} 3 | 4 | 5 | def register_vocoder(cls): 6 | VOCODERS[cls.__name__.lower()] = cls 7 | VOCODERS[cls.__name__] = cls 8 | return cls 9 | 10 | 11 | def get_vocoder_cls(hparams): 12 | if hparams['vocoder'] in VOCODERS: 13 | return VOCODERS[hparams['vocoder']] 14 | else: 15 | vocoder_cls = hparams['vocoder'] 16 | pkg = ".".join(vocoder_cls.split(".")[:-1]) 17 | cls_name = vocoder_cls.split(".")[-1] 18 | vocoder_cls = getattr(importlib.import_module(pkg), cls_name) 19 | return vocoder_cls 20 | 21 | 22 | class BaseVocoder: 23 | def spec2wav(self, mel): 24 | """ 25 | 26 | :param mel: [T, 80] 27 | :return: wav: [T'] 28 | """ 29 | 30 | raise NotImplementedError 31 | 32 | @staticmethod 33 | def wav2spec(wav_fn): 34 | """ 35 | 36 | :param wav_fn: str 37 | :return: wav, mel: [T, 80] 38 | """ 39 | raise NotImplementedError 40 | -------------------------------------------------------------------------------- /NeuralSeq/vocoders/hifigan.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import json 3 | import os 4 | import re 5 | 6 | import librosa 7 | import torch 8 | 9 | import utils 10 | from modules.hifigan.hifigan import HifiGanGenerator 11 | from utils.hparams import hparams, set_hparams 12 | from vocoders.base_vocoder import register_vocoder 13 | from vocoders.pwg import PWG 14 | from vocoders.vocoder_utils import denoise 15 | 16 | 17 | def load_model(config_path, checkpoint_path): 18 | device = torch.device("cuda" if 
torch.cuda.is_available() else "cpu") 19 | ckpt_dict = torch.load(checkpoint_path, map_location="cpu") 20 | if '.yaml' in config_path: 21 | config = set_hparams(config_path, global_hparams=False) 22 | state = ckpt_dict["state_dict"]["model_gen"] 23 | elif '.json' in config_path: 24 | config = json.load(open(config_path, 'r')) 25 | state = ckpt_dict["generator"] 26 | 27 | model = HifiGanGenerator(config) 28 | model.load_state_dict(state, strict=True) 29 | model.remove_weight_norm() 30 | model = model.eval().to(device) 31 | print(f"| Loaded model parameters from {checkpoint_path}.") 32 | print(f"| HifiGAN device: {device}.") 33 | return model, config, device 34 | 35 | 36 | total_time = 0 37 | 38 | 39 | @register_vocoder 40 | class HifiGAN(PWG): 41 | def __init__(self): 42 | base_dir = hparams['vocoder_ckpt'] 43 | config_path = f'{base_dir}/config.yaml' 44 | if os.path.exists(config_path): 45 | ckpt = sorted(glob.glob(f'{base_dir}/model_ckpt_steps_*.ckpt'), key= 46 | lambda x: int(re.findall(f'{base_dir}/model_ckpt_steps_(\d+).ckpt', x)[0]))[-1] 47 | print('| load HifiGAN: ', ckpt) 48 | self.model, self.config, self.device = load_model(config_path=config_path, checkpoint_path=ckpt) 49 | else: 50 | config_path = f'{base_dir}/config.json' 51 | ckpt = f'{base_dir}/generator_v1' 52 | if os.path.exists(config_path): 53 | self.model, self.config, self.device = load_model(config_path=config_path, checkpoint_path=ckpt) 54 | 55 | def spec2wav(self, mel, **kwargs): 56 | device = self.device 57 | with torch.no_grad(): 58 | c = torch.FloatTensor(mel).unsqueeze(0).transpose(2, 1).to(device) 59 | with utils.Timer('hifigan', print_time=hparams['profile_infer']): 60 | f0 = kwargs.get('f0') 61 | if f0 is not None and hparams.get('use_nsf'): 62 | f0 = torch.FloatTensor(f0[None, :]).to(device) 63 | y = self.model(c, f0).view(-1) 64 | else: 65 | y = self.model(c).view(-1) 66 | wav_out = y.cpu().numpy() 67 | if hparams.get('vocoder_denoise_c', 0.0) > 0: 68 | wav_out = denoise(wav_out, v=hparams['vocoder_denoise_c']) 69 | return wav_out 70 | 71 | # @staticmethod 72 | # def wav2spec(wav_fn, **kwargs): 73 | # wav, _ = librosa.core.load(wav_fn, sr=hparams['audio_sample_rate']) 74 | # wav_torch = torch.FloatTensor(wav)[None, :] 75 | # mel = mel_spectrogram(wav_torch, hparams).numpy()[0] 76 | # return wav, mel.T 77 | -------------------------------------------------------------------------------- /NeuralSeq/vocoders/vocoder_utils.py: -------------------------------------------------------------------------------- 1 | import librosa 2 | 3 | from utils.hparams import hparams 4 | import numpy as np 5 | 6 | 7 | def denoise(wav, v=0.1): 8 | spec = librosa.stft(y=wav, n_fft=hparams['fft_size'], hop_length=hparams['hop_size'], 9 | win_length=hparams['win_size'], pad_mode='constant') 10 | spec_m = np.abs(spec) 11 | spec_m = np.clip(spec_m - v, a_min=0, a_max=None) 12 | spec_a = np.angle(spec) 13 | 14 | return librosa.istft(spec_m * np.exp(1j * spec_a), hop_length=hparams['hop_size'], 15 | win_length=hparams['win_size']) 16 | -------------------------------------------------------------------------------- /assets/2bf90e35.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIGC-Audio/AudioGPT/a674543c537bb3a60b8f521df76b97a627c1c379/assets/2bf90e35.wav -------------------------------------------------------------------------------- /assets/5d67d1b9.wav: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/AIGC-Audio/AudioGPT/a674543c537bb3a60b8f521df76b97a627c1c379/assets/5d67d1b9.wav -------------------------------------------------------------------------------- /assets/7cb0d24f.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIGC-Audio/AudioGPT/a674543c537bb3a60b8f521df76b97a627c1c379/assets/7cb0d24f.wav -------------------------------------------------------------------------------- /assets/7ef0ec0b.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIGC-Audio/AudioGPT/a674543c537bb3a60b8f521df76b97a627c1c379/assets/7ef0ec0b.wav -------------------------------------------------------------------------------- /assets/Track 4.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIGC-Audio/AudioGPT/a674543c537bb3a60b8f521df76b97a627c1c379/assets/Track 4.wav -------------------------------------------------------------------------------- /assets/a-group-of-sheep-are-baaing.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIGC-Audio/AudioGPT/a674543c537bb3a60b8f521df76b97a627c1c379/assets/a-group-of-sheep-are-baaing.wav -------------------------------------------------------------------------------- /assets/a2i.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIGC-Audio/AudioGPT/a674543c537bb3a60b8f521df76b97a627c1c379/assets/a2i.png -------------------------------------------------------------------------------- /assets/asr.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIGC-Audio/AudioGPT/a674543c537bb3a60b8f521df76b97a627c1c379/assets/asr.png -------------------------------------------------------------------------------- /assets/b973e878.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIGC-Audio/AudioGPT/a674543c537bb3a60b8f521df76b97a627c1c379/assets/b973e878.wav -------------------------------------------------------------------------------- /assets/detection.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIGC-Audio/AudioGPT/a674543c537bb3a60b8f521df76b97a627c1c379/assets/detection.png -------------------------------------------------------------------------------- /assets/drums-and-music-playing-with-a-man-speaking.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIGC-Audio/AudioGPT/a674543c537bb3a60b8f521df76b97a627c1c379/assets/drums-and-music-playing-with-a-man-speaking.wav -------------------------------------------------------------------------------- /assets/fd5cf55e.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIGC-Audio/AudioGPT/a674543c537bb3a60b8f521df76b97a627c1c379/assets/fd5cf55e.wav -------------------------------------------------------------------------------- /assets/i2a-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIGC-Audio/AudioGPT/a674543c537bb3a60b8f521df76b97a627c1c379/assets/i2a-1.png 
-------------------------------------------------------------------------------- /assets/i2a-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIGC-Audio/AudioGPT/a674543c537bb3a60b8f521df76b97a627c1c379/assets/i2a-2.png -------------------------------------------------------------------------------- /assets/inpaint-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIGC-Audio/AudioGPT/a674543c537bb3a60b8f521df76b97a627c1c379/assets/inpaint-1.png -------------------------------------------------------------------------------- /assets/inpaint-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIGC-Audio/AudioGPT/a674543c537bb3a60b8f521df76b97a627c1c379/assets/inpaint-2.png -------------------------------------------------------------------------------- /assets/m2b.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIGC-Audio/AudioGPT/a674543c537bb3a60b8f521df76b97a627c1c379/assets/m2b.png -------------------------------------------------------------------------------- /assets/mix1.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIGC-Audio/AudioGPT/a674543c537bb3a60b8f521df76b97a627c1c379/assets/mix1.wav -------------------------------------------------------------------------------- /assets/sound_extraction.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIGC-Audio/AudioGPT/a674543c537bb3a60b8f521df76b97a627c1c379/assets/sound_extraction.png -------------------------------------------------------------------------------- /assets/style_transfer_tts.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIGC-Audio/AudioGPT/a674543c537bb3a60b8f521df76b97a627c1c379/assets/style_transfer_tts.png -------------------------------------------------------------------------------- /assets/t2a.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIGC-Audio/AudioGPT/a674543c537bb3a60b8f521df76b97a627c1c379/assets/t2a.png -------------------------------------------------------------------------------- /assets/t2i.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIGC-Audio/AudioGPT/a674543c537bb3a60b8f521df76b97a627c1c379/assets/t2i.png -------------------------------------------------------------------------------- /assets/t2s.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIGC-Audio/AudioGPT/a674543c537bb3a60b8f521df76b97a627c1c379/assets/t2s.png -------------------------------------------------------------------------------- /assets/tsd.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIGC-Audio/AudioGPT/a674543c537bb3a60b8f521df76b97a627c1c379/assets/tsd.png -------------------------------------------------------------------------------- /assets/tts.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIGC-Audio/AudioGPT/a674543c537bb3a60b8f521df76b97a627c1c379/assets/tts.png 
-------------------------------------------------------------------------------- /audio_detection/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIGC-Audio/AudioGPT/a674543c537bb3a60b8f521df76b97a627c1c379/audio_detection/__init__.py -------------------------------------------------------------------------------- /audio_detection/audio_infer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIGC-Audio/AudioGPT/a674543c537bb3a60b8f521df76b97a627c1c379/audio_detection/audio_infer/__init__.py -------------------------------------------------------------------------------- /audio_detection/audio_infer/pytorch/evaluate.py: -------------------------------------------------------------------------------- 1 | from sklearn import metrics 2 | 3 | from pytorch_utils import forward 4 | 5 | 6 | class Evaluator(object): 7 | def __init__(self, model): 8 | """Evaluator. 9 | 10 | Args: 11 | model: object 12 | """ 13 | self.model = model 14 | 15 | def evaluate(self, data_loader): 16 | """Forward evaluation data and calculate statistics. 17 | 18 | Args: 19 | data_loader: object 20 | 21 | Returns: 22 | statistics: dict, 23 | {'average_precision': (classes_num,), 'auc': (classes_num,)} 24 | """ 25 | 26 | # Forward 27 | output_dict = forward( 28 | model=self.model, 29 | generator=data_loader, 30 | return_target=True) 31 | 32 | clipwise_output = output_dict['clipwise_output'] # (audios_num, classes_num) 33 | target = output_dict['target'] # (audios_num, classes_num) 34 | 35 | average_precision = metrics.average_precision_score( 36 | target, clipwise_output, average=None) 37 | 38 | auc = metrics.roc_auc_score(target, clipwise_output, average=None) 39 | 40 | statistics = {'average_precision': average_precision, 'auc': auc} 41 | 42 | return statistics -------------------------------------------------------------------------------- /audio_detection/audio_infer/pytorch/losses.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | 4 | 5 | def clip_bce(output_dict, target_dict): 6 | """Binary crossentropy loss. 
7 | """ 8 | return F.binary_cross_entropy( 9 | output_dict['clipwise_output'], target_dict['target']) 10 | 11 | 12 | def get_loss_func(loss_type): 13 | if loss_type == 'clip_bce': 14 | return clip_bce -------------------------------------------------------------------------------- /audio_detection/audio_infer/results/YDlWd7Wmdi1E.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIGC-Audio/AudioGPT/a674543c537bb3a60b8f521df76b97a627c1c379/audio_detection/audio_infer/results/YDlWd7Wmdi1E.png -------------------------------------------------------------------------------- /audio_detection/audio_infer/utils/crash.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | class ExceptionHook: 4 | instance = None 5 | def __call__(self, *args, **kwargs): 6 | if self.instance is None: 7 | from IPython.core import ultratb 8 | self.instance = ultratb.FormattedTB(mode='Plain', 9 | color_scheme='Linux', call_pdb=1) 10 | return self.instance(*args, **kwargs) 11 | 12 | sys.excepthook = ExceptionHook() 13 | -------------------------------------------------------------------------------- /audio_detection/audio_infer/utils/create_black_list.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import csv 3 | import os 4 | 5 | from utilities import create_folder 6 | 7 | 8 | def dcase2017task4(args): 9 | """Create black list. Black list is a list of audio ids that will be 10 | skipped in training. 11 | """ 12 | 13 | # Augments & parameters 14 | workspace = args.workspace 15 | 16 | # Black list from DCASE 2017 Task 4 17 | test_weak_csv = 'metadata/black_list/groundtruth_weak_label_testing_set.csv' 18 | evaluation_weak_csv = 'metadata/black_list/groundtruth_weak_label_evaluation_set.csv' 19 | 20 | black_list_csv = os.path.join(workspace, 'black_list', 'dcase2017task4.csv') 21 | create_folder(os.path.dirname(black_list_csv)) 22 | 23 | def get_id_sets(csv_path): 24 | with open(csv_path, 'r') as fr: 25 | reader = csv.reader(fr, delimiter='\t') 26 | lines = list(reader) 27 | 28 | ids_set = [] 29 | 30 | for line in lines: 31 | """line: ['-5QrBL6MzLg_60.000_70.000.wav', '60.000', '70.000', 'Train horn']""" 32 | ids_set.append(line[0][0 : 11]) 33 | 34 | ids_set = list(set(ids_set)) 35 | return ids_set 36 | 37 | test_ids_set = get_id_sets(test_weak_csv) 38 | evaluation_ids_set = get_id_sets(evaluation_weak_csv) 39 | 40 | full_ids_set = test_ids_set + evaluation_ids_set 41 | 42 | # Write black list 43 | fw = open(black_list_csv, 'w') 44 | 45 | for id in full_ids_set: 46 | fw.write('{}\n'.format(id)) 47 | 48 | print('Write black list to {}'.format(black_list_csv)) 49 | 50 | 51 | if __name__ == '__main__': 52 | parser = argparse.ArgumentParser(description='') 53 | subparsers = parser.add_subparsers(dest='mode') 54 | 55 | parser_dcase2017task4 = subparsers.add_parser('dcase2017task4') 56 | parser_dcase2017task4.add_argument('--workspace', type=str, required=True) 57 | 58 | args = parser.parse_args() 59 | 60 | if args.mode == 'dcase2017task4': 61 | dcase2017task4(args) 62 | 63 | else: 64 | raise Exception('Error argument!') -------------------------------------------------------------------------------- /audio_to_text/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIGC-Audio/AudioGPT/a674543c537bb3a60b8f521df76b97a627c1c379/audio_to_text/__init__.py 
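A hedged sketch of how a black-list CSV written by `create_black_list.py` above is typically consumed (the path below is a placeholder, and the actual training loader in this repository may differ):
# Each line of the black-list CSV holds an 11-character YouTube ID (cf. line[0][0 : 11] above).
black_list_csv = "workspaces/audioset/black_list/dcase2017task4.csv"  # placeholder path
with open(black_list_csv) as f:
    black_list_ids = {line.strip() for line in f if line.strip()}

def keep_for_training(audio_id):
    """Return False for clips whose YouTube ID appears in the black list."""
    return audio_id[:11] not in black_list_ids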
-------------------------------------------------------------------------------- /audio_to_text/captioning/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIGC-Audio/AudioGPT/a674543c537bb3a60b8f521df76b97a627c1c379/audio_to_text/captioning/__init__.py -------------------------------------------------------------------------------- /audio_to_text/captioning/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .base_model import * 2 | from .transformer_model import * 3 | 4 | -------------------------------------------------------------------------------- /audio_to_text/captioning/utils/README.md: -------------------------------------------------------------------------------- 1 | # Utils 2 | 3 | Scripts in this directory provide utility functions. 4 | 5 | ## BERT Pretrained Embeddings 6 | 7 | You can load pretrained word embeddings from Google [BERT](https://github.com/google-research/bert#pre-trained-models) instead of training word embeddings from scratch. The scripts in `utils/bert` require a BERT server running in the background; we use the server from [bert-as-service](https://github.com/hanxiao/bert-as-service). 8 | 9 | To use bert-as-service, first install the repository. It is recommended to create a new environment with TensorFlow 1.3 to run the BERT server, since bert-as-service is incompatible with TensorFlow 2.x. 10 | 11 | After installing [bert-as-service](https://github.com/hanxiao/bert-as-service), download and start the BERT server by running: 12 | 13 | ```bash 14 | bash scripts/prepare_bert_server.sh zh 15 | ``` 16 | 17 | By default, a server based on the BERT base Chinese model runs in the background. You can switch to other models by changing the corresponding model name and path in `scripts/prepare_bert_server.sh`. 18 | 19 | To extract BERT word embeddings, run `utils/bert/create_word_embedding.py`.
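The resulting embedding matrix (one row per vocabulary index) can then initialize a word-embedding layer. A minimal sketch, assuming the script's `output` argument was `bert_embeddings.npy` (the filename and the `freeze` choice are assumptions, not repository defaults):

```python
import numpy as np
import torch
import torch.nn as nn

weights = np.load("bert_embeddings.npy")  # (vocab_size, embed_size), rows follow vocabulary.idx2word
embedding = nn.Embedding.from_pretrained(torch.from_numpy(weights).float(), freeze=False)
word_ids = torch.tensor([[2, 15, 7]])     # toy batch of word indices from the same vocabulary
word_vectors = embedding(word_ids)        # (1, 3, embed_size)
```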
20 | -------------------------------------------------------------------------------- /audio_to_text/captioning/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIGC-Audio/AudioGPT/a674543c537bb3a60b8f521df76b97a627c1c379/audio_to_text/captioning/utils/__init__.py -------------------------------------------------------------------------------- /audio_to_text/captioning/utils/bert/create_word_embedding.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import sys 4 | import os 5 | 6 | from bert_serving.client import BertClient 7 | import numpy as np 8 | from tqdm import tqdm 9 | import fire 10 | import torch 11 | 12 | sys.path.append(os.getcwd()) 13 | from utils.build_vocab import Vocabulary 14 | 15 | def main(vocab_file: str, output: str, server_hostname: str): 16 | client = BertClient(ip=server_hostname) 17 | vocabulary = torch.load(vocab_file) 18 | vocab_size = len(vocabulary) 19 | 20 | fake_embedding = client.encode(["test"]).reshape(-1) 21 | embed_size = fake_embedding.shape[0] 22 | 23 | print("Encoding words into embeddings with size: ", embed_size) 24 | 25 | embeddings = np.empty((vocab_size, embed_size)) 26 | for i in tqdm(range(len(embeddings)), ascii=True): 27 | embeddings[i] = client.encode([vocabulary.idx2word[i]]) 28 | np.save(output, embeddings) 29 | 30 | 31 | if __name__ == '__main__': 32 | fire.Fire(main) 33 | 34 | 35 | -------------------------------------------------------------------------------- /audio_to_text/captioning/utils/fasttext/create_word_embedding.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | #!/usr/bin/env python3 3 | 4 | import numpy as np 5 | import pandas as pd 6 | import torch 7 | from gensim.models import FastText 8 | from tqdm import tqdm 9 | import fire 10 | 11 | import sys 12 | import os 13 | sys.path.append(os.getcwd()) 14 | from utils.build_vocab import Vocabulary 15 | 16 | def create_embedding(caption_file: str, 17 | vocab_file: str, 18 | embed_size: int, 19 | output: str, 20 | **fasttext_kwargs): 21 | caption_df = pd.read_json(caption_file) 22 | caption_df["tokens"] = caption_df["tokens"].apply(lambda x: [""] + [token for token in x] + [""]) 23 | 24 | sentences = list(caption_df["tokens"].values) 25 | vocabulary = torch.load(vocab_file, map_location="cpu") 26 | 27 | epochs = fasttext_kwargs.get("epochs", 10) 28 | model = FastText(size=embed_size, min_count=1, **fasttext_kwargs) 29 | model.build_vocab(sentences=sentences) 30 | model.train(sentences=sentences, total_examples=len(sentences), epochs=epochs) 31 | 32 | word_embeddings = np.zeros((len(vocabulary), embed_size)) 33 | 34 | with tqdm(total=len(vocabulary), ascii=True) as pbar: 35 | for word, idx in vocabulary.word2idx.items(): 36 | if word == "" or word == "": 37 | continue 38 | word_embeddings[idx] = model.wv[word] 39 | pbar.update() 40 | 41 | np.save(output, word_embeddings) 42 | 43 | print("Finish writing fasttext embeddings to " + output) 44 | 45 | 46 | if __name__ == "__main__": 47 | fire.Fire(create_embedding) 48 | 49 | 50 | 51 | -------------------------------------------------------------------------------- /audio_to_text/captioning/utils/predict_nn.py: -------------------------------------------------------------------------------- 1 | import json 2 | import random 3 | import argparse 4 | import numpy as np 5 | from tqdm import tqdm 6 | from h5py import File 7 | import 
sklearn.metrics 8 | 9 | random.seed(1) 10 | 11 | parser = argparse.ArgumentParser() 12 | parser.add_argument("train_feature", type=str) 13 | parser.add_argument("train_corpus", type=str) 14 | parser.add_argument("pred_feature", type=str) 15 | parser.add_argument("output_json", type=str) 16 | 17 | args = parser.parse_args() 18 | train_embs = [] 19 | train_idx_to_audioid = [] 20 | with File(args.train_feature, "r") as store: 21 | for audio_id, embedding in tqdm(store.items(), ascii=True): 22 | train_embs.append(embedding[()]) 23 | train_idx_to_audioid.append(audio_id) 24 | 25 | train_annotation = json.load(open(args.train_corpus, "r"))["audios"] 26 | train_audioid_to_tokens = {} 27 | for item in train_annotation: 28 | audio_id = item["audio_id"] 29 | train_audioid_to_tokens[audio_id] = [cap_item["tokens"] for cap_item in item["captions"]] 30 | train_embs = np.stack(train_embs) 31 | 32 | 33 | pred_data = [] 34 | pred_embs = [] 35 | pred_idx_to_audioids = [] 36 | with File(args.pred_feature, "r") as store: 37 | for audio_id, embedding in tqdm(store.items(), ascii=True): 38 | pred_embs.append(embedding[()]) 39 | pred_idx_to_audioids.append(audio_id) 40 | pred_embs = np.stack(pred_embs) 41 | 42 | similarity = sklearn.metrics.pairwise.cosine_similarity(pred_embs, train_embs) 43 | for idx, audio_id in enumerate(pred_idx_to_audioids): 44 | train_idx = similarity[idx].argmax() 45 | pred_data.append({ 46 | "filename": audio_id, 47 | "tokens": random.choice(train_audioid_to_tokens[train_idx_to_audioid[train_idx]]) 48 | }) 49 | json.dump({"predictions": pred_data}, open(args.output_json, "w"), ensure_ascii=False, indent=4) 50 | -------------------------------------------------------------------------------- /audio_to_text/captioning/utils/remove_optimizer.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import torch 3 | 4 | 5 | def main(checkpoint): 6 | state_dict = torch.load(checkpoint, map_location="cpu") 7 | if "optimizer" in state_dict: 8 | del state_dict["optimizer"] 9 | if "lr_scheduler" in state_dict: 10 | del state_dict["lr_scheduler"] 11 | torch.save(state_dict, checkpoint) 12 | 13 | 14 | if __name__ == "__main__": 15 | parser = argparse.ArgumentParser() 16 | parser.add_argument("checkpoint", type=str) 17 | args = parser.parse_args() 18 | main(args.checkpoint) 19 | -------------------------------------------------------------------------------- /audio_to_text/captioning/utils/report_results.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | import argparse 3 | import numpy as np 4 | 5 | parser = argparse.ArgumentParser() 6 | parser.add_argument("--input", help="input filename", type=str, nargs="+") 7 | parser.add_argument("--output", help="output result file", default=None) 8 | 9 | args = parser.parse_args() 10 | 11 | 12 | scores = {} 13 | for path in args.input: 14 | with open(path, "r") as reader: 15 | for line in reader.readlines(): 16 | metric, score = line.strip().split(": ") 17 | score = float(score) 18 | if metric not in scores: 19 | scores[metric] = [] 20 | scores[metric].append(score) 21 | 22 | if len(scores) == 0: 23 | print("No experiment directory found, wrong path?") 24 | exit(1) 25 | 26 | with open(args.output, "w") as writer: 27 | print("Average results: ", file=writer) 28 | for metric, score in scores.items(): 29 | score = np.array(score) 30 | mean = np.mean(score) 31 | std = np.std(score) 32 | print(f"{metric}: {mean:.3f} (±{std:.3f})", file=writer) 
33 | print("", file=writer) 34 | print("Best results: ", file=writer) 35 | for metric, score in scores.items(): 36 | score = np.max(score) 37 | print(f"{metric}: {score:.3f}", file=writer) 38 | -------------------------------------------------------------------------------- /audio_to_text/captioning/utils/word2vec/create_word_embedding.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | #!/usr/bin/env python3 3 | 4 | import numpy as np 5 | import pandas as pd 6 | import torch 7 | import gensim 8 | from gensim.models import Word2Vec 9 | from tqdm import tqdm 10 | import fire 11 | 12 | import sys 13 | import os 14 | sys.path.append(os.getcwd()) 15 | from utils.build_vocab import Vocabulary 16 | 17 | def create_embedding(vocab_file: str, 18 | embed_size: int, 19 | output: str, 20 | caption_file: str = None, 21 | pretrained_weights_path: str = None, 22 | **word2vec_kwargs): 23 | vocabulary = torch.load(vocab_file, map_location="cpu") 24 | 25 | if pretrained_weights_path: 26 | model = gensim.models.KeyedVectors.load_word2vec_format( 27 | fname=pretrained_weights_path, 28 | binary=True, 29 | ) 30 | if model.vector_size != embed_size: 31 | assert embed_size < model.vector_size, f"only reduce dimension, cannot add dimesion {model.vector_size} to {embed_size}" 32 | from sklearn.decomposition import PCA 33 | pca = PCA(n_components=embed_size) 34 | model.vectors = pca.fit_transform(model.vectors) 35 | else: 36 | caption_df = pd.read_json(caption_file) 37 | caption_df["tokens"] = caption_df["tokens"].apply(lambda x: [""] + [token for token in x] + [""]) 38 | sentences = list(caption_df["tokens"].values) 39 | epochs = word2vec_kwargs.get("epochs", 10) 40 | if "epochs" in word2vec_kwargs: 41 | del word2vec_kwargs["epochs"] 42 | model = Word2Vec(size=embed_size, min_count=1, **word2vec_kwargs) 43 | model.build_vocab(sentences=sentences) 44 | model.train(sentences=sentences, total_examples=len(sentences), epochs=epochs) 45 | 46 | word_embeddings = np.random.randn(len(vocabulary), embed_size) 47 | 48 | if isinstance(model, gensim.models.word2vec.Word2Vec): 49 | model = model.wv 50 | with tqdm(total=len(vocabulary), ascii=True) as pbar: 51 | for word, idx in vocabulary.word2idx.items(): 52 | try: 53 | word_embeddings[idx] = model.get_vector(word) 54 | except KeyError: 55 | print(f"word {word} not found in word2vec model, it is random initialized!") 56 | pbar.update() 57 | 58 | np.save(output, word_embeddings) 59 | 60 | print("Finish writing word2vec embeddings to " + output) 61 | 62 | 63 | if __name__ == "__main__": 64 | fire.Fire(create_embedding) 65 | 66 | 67 | 68 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | --extra-index-url https://download.pytorch.org/whl/cu113 2 | accelerate 3 | addict==2.4.0 4 | aiofiles 5 | albumentations==1.3.0 6 | appdirs==1.4.4 7 | basicsr==1.4.2 8 | beautifulsoup4==4.10.0 9 | Cython==0.29.24 10 | diffusers 11 | einops==0.3.0 12 | espnet 13 | espnet_model_zoo 14 | ffmpeg-python 15 | g2p-en==2.1.0 16 | google==3.0.0 17 | gradio 18 | h5py 19 | imageio==2.9.0 20 | imageio-ffmpeg==0.4.2 21 | invisible-watermark>=0.1.5 22 | jieba 23 | kornia==0.6 24 | langchain==0.0.101 25 | librosa 26 | loguru 27 | miditoolkit==0.1.7 28 | mmcv==1.5.0 29 | mmdet==2.23.0 30 | mmengine==0.7.2 31 | moviepy==1.0.3 32 | numpy==1.23.1 33 | omegaconf==2.1.1 34 | open_clip_torch==2.0.2 35 | openai 36 | 
openai-whisper 37 | opencv-contrib-python==4.3.0.36 38 | praat-parselmouth==0.3.3 39 | prettytable==3.6.0 40 | proglog==0.1.9 41 | pycwt==0.3.0a22 42 | pyloudnorm==0.1.0 43 | pypinyin==0.43.0 44 | pytorch-lightning==1.5.0 45 | pytorch-ssim==0.1 46 | pyworld==0.3.0 47 | resampy==0.2.2 48 | Resemblyzer==0.1.1.dev0 49 | safetensors==0.2.7 50 | sklearn==0.0 51 | soundfile 52 | soupsieve==2.3 53 | streamlit==1.12.1 54 | streamlit-drawable-canvas==0.8.0 55 | tensorboardX==2.4 56 | test-tube>=0.7.5 57 | TextGrid==1.5 58 | timm==0.6.12 59 | torch==1.12.1 60 | torchaudio==0.12.1 61 | torch-fidelity==0.3.0 62 | torchlibrosa 63 | torchmetrics==0.6.0 64 | torchvision==0.13.1 65 | transformers==4.26.1 66 | typing-extensions==4.0.0 67 | uuid==1.30 68 | webdataset==0.2.5 69 | webrtcvad==2.0.10 70 | yapf==0.32.0 71 | git+https://github.com/openai/CLIP.git -------------------------------------------------------------------------------- /run.md: -------------------------------------------------------------------------------- 1 | # Run AudioGPT 2 | ``` 3 | # create a new environment 4 | conda create -n audiogpt python=3.8 5 | 6 | # prepare the basic environments 7 | pip install -r requirements.txt 8 | 9 | # download the foundation models you need 10 | bash download.sh 11 | 12 | # prepare your private openAI private key 13 | export OPENAI_API_KEY={Your_Private_Openai_Key} 14 | 15 | # Start AudioGPT ! 16 | python audio-chatgpt.py 17 | ``` 18 | 19 | 20 | -------------------------------------------------------------------------------- /sound_extraction/model/LASSNet.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from .text_encoder import Text_Encoder 5 | from .resunet_film import UNetRes_FiLM 6 | 7 | class LASSNet(nn.Module): 8 | def __init__(self, device='cuda'): 9 | super(LASSNet, self).__init__() 10 | self.text_embedder = Text_Encoder(device) 11 | self.UNet = UNetRes_FiLM(channels=1, cond_embedding_dim=256) 12 | 13 | def forward(self, x, caption): 14 | # x: (Batch, 1, T, 128)) 15 | input_ids, attns_mask = self.text_embedder.tokenize(caption) 16 | 17 | cond_vec = self.text_embedder(input_ids, attns_mask)[0] 18 | dec_cond_vec = cond_vec 19 | 20 | mask = self.UNet(x, cond_vec, dec_cond_vec) 21 | mask = torch.sigmoid(mask) 22 | return mask 23 | 24 | def get_tokenizer(self): 25 | return self.text_embedder.tokenizer 26 | -------------------------------------------------------------------------------- /sound_extraction/model/film.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | class Film(nn.Module): 5 | def __init__(self, channels, cond_embedding_dim): 6 | super(Film, self).__init__() 7 | self.linear = nn.Sequential( 8 | nn.Linear(cond_embedding_dim, channels * 2), 9 | nn.ReLU(inplace=True), 10 | nn.Linear(channels * 2, channels), 11 | nn.ReLU(inplace=True) 12 | ) 13 | 14 | def forward(self, data, cond_vec): 15 | """ 16 | :param data: [batchsize, channels, samples] or [batchsize, channels, T, F] or [batchsize, channels, F, T] 17 | :param cond_vec: [batchsize, cond_embedding_dim] 18 | :return: 19 | """ 20 | bias = self.linear(cond_vec) # [batchsize, channels] 21 | if len(list(data.size())) == 3: 22 | data = data + bias[..., None] 23 | elif len(list(data.size())) == 4: 24 | data = data + bias[..., None, None] 25 | else: 26 | print("Warning: The size of input tensor,", data.size(), "is not correct. 
Film is not working.") 27 | return data -------------------------------------------------------------------------------- /sound_extraction/model/text_encoder.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from transformers import * 4 | import warnings 5 | warnings.filterwarnings('ignore') 6 | # pretrained model name: (model class, model tokenizer) 7 | MODELS = { 8 | 'prajjwal1/bert-mini': (BertModel, BertTokenizer), 9 | } 10 | 11 | class Text_Encoder(nn.Module): 12 | def __init__(self, device): 13 | super(Text_Encoder, self).__init__() 14 | self.base_model = 'prajjwal1/bert-mini' 15 | self.dropout = 0.1 16 | 17 | self.tokenizer = MODELS[self.base_model][1].from_pretrained(self.base_model) 18 | 19 | self.bert_layer = MODELS[self.base_model][0].from_pretrained(self.base_model, 20 | add_pooling_layer=False, 21 | hidden_dropout_prob=self.dropout, 22 | attention_probs_dropout_prob=self.dropout, 23 | output_hidden_states=True) 24 | 25 | self.linear_layer = nn.Sequential(nn.Linear(256, 256), nn.ReLU(inplace=True)) 26 | 27 | self.device = device 28 | 29 | def tokenize(self, caption): 30 | # device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 31 | tokenized = self.tokenizer(caption, add_special_tokens=False, padding=True, return_tensors='pt') 32 | input_ids = tokenized['input_ids'] 33 | attns_mask = tokenized['attention_mask'] 34 | 35 | input_ids = input_ids.to(self.device) 36 | attns_mask = attns_mask.to(self.device) 37 | return input_ids, attns_mask 38 | 39 | def forward(self, input_ids, attns_mask): 40 | # input_ids, attns_mask = self.tokenize(caption) 41 | output = self.bert_layer(input_ids=input_ids, attention_mask=attns_mask)[0] 42 | cls_embed = output[:, 0, :] 43 | text_embed = self.linear_layer(cls_embed) 44 | 45 | return text_embed, output # text_embed: (batch, hidden_size) -------------------------------------------------------------------------------- /sound_extraction/utils/wav_io.py: -------------------------------------------------------------------------------- 1 | import librosa 2 | import librosa.filters 3 | import math 4 | import numpy as np 5 | import scipy.io.wavfile 6 | 7 | def load_wav(path): 8 | max_length = 32000 * 10 9 | wav = librosa.core.load(path, sr=32000)[0] 10 | if len(wav) > max_length: 11 | wav = wav[0:max_length]  # truncate to max length 12 | 13 | # pad audio to max length, 10s for AudioCaps 14 | if len(wav) < max_length: 15 | # audio = torch.nn.functional.pad(audio, (0, self.max_length - audio.size(1)), 'constant') 16 | wav = np.pad(wav, (0, max_length - len(wav)), 'constant') 17 | wav = wav[...,None] 18 | return wav 19 | 20 | 21 | def save_wav(wav, path): 22 | wav *= 32767 / max(0.01, np.max(np.abs(wav))) 23 | scipy.io.wavfile.write(path, 32000, wav.astype(np.int16)) -------------------------------------------------------------------------------- /text_to_audio/Make_An_Audio/configs/img_to_audio/img2audio_args.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | base_learning_rate: 1.0e-05 3 | target: ldm.models.diffusion.ddpm_audio.LatentDiffusion_audio 4 | params: 5 | linear_start: 0.00085 6 | linear_end: 0.0120 7 | num_timesteps_cond: 1 8 | log_every_t: 200 9 | timesteps: 1000 10 | first_stage_key: image 11 | cond_stage_key: caption 12 | image_size: 32 # unused 13 | mel_dim: 10 # 80 // 2^3 14 | mel_length: 78 # 624 // 2^3 15 | channels: 4 16 | cond_stage_trainable: false 17 | conditioning_key: crossattn
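# Note: with conditioning_key "crossattn", the embedding produced by cond_stage_config (the frozen OpenCLIP encoder below) is fed to the UNet's cross-attention layers as context (context_dim: 1024) rather than being concatenated to the latent input.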
18 | monitor: val/loss_simple_ema 19 | scale_by_std: True 20 | use_ema: False 21 | 22 | scheduler_config: # 10000 warmup steps 23 | target: ldm.lr_scheduler.LambdaLinearScheduler 24 | params: 25 | warm_up_steps: [10000] 26 | cycle_lengths: [10000000000000] 27 | f_start: [1.e-6] 28 | f_max: [1.] 29 | f_min: [ 1.] 30 | 31 | unet_config: 32 | target: ldm.modules.diffusionmodules.custom_openaimodel.UNetModel 33 | params: 34 | image_size: 32 # ununsed 35 | in_channels: 4 36 | out_channels: 4 37 | model_channels: 256 38 | attention_resolutions: 39 | - 1 40 | - 2 41 | num_res_blocks: 2 42 | channel_mult: # num_down = len(ch_mult)-1 43 | - 1 44 | - 2 45 | num_head_channels: 32 46 | use_spatial_transformer: true 47 | transformer_depth: 1 48 | context_dim: 1024 49 | use_context_project: false 50 | 51 | 52 | first_stage_config: 53 | target: ldm.models.autoencoder.AutoencoderKL 54 | params: 55 | embed_dim: 4 56 | monitor: val/rec_loss 57 | ddconfig: 58 | double_z: true 59 | z_channels: 4 60 | resolution: 848 61 | in_channels: 1 62 | out_ch: 1 63 | ch: 128 64 | ch_mult: [ 1, 2, 2, 4 ] # num_down = len(ch_mult)-1 65 | num_res_blocks: 2 66 | attn_resolutions: [106, 212] 67 | dropout: 0.0 68 | lossconfig: 69 | target: torch.nn.Identity 70 | 71 | cond_stage_config: 72 | target: ldm.modules.encoders.modules.FrozenGlobalNormOpenCLIPEmbedder 73 | params: 74 | freeze: True 75 | delvisual: False 76 | 77 | 78 | -------------------------------------------------------------------------------- /text_to_audio/Make_An_Audio/configs/inpaint/txt2audio_args.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | base_learning_rate: 1.0e-05 3 | target: ldm.models.diffusion.ddpm_audio.LatentDiffusion_audio 4 | params: 5 | linear_start: 0.0015 6 | linear_end: 0.0205 7 | log_every_t: 100 8 | timesteps: 1000 9 | loss_type: l1 10 | first_stage_key: image 11 | cond_stage_key: masked_image 12 | image_size: 32 # unused 13 | mel_dim: 10 # 80 // 2^3 14 | mel_length: 106 # 848 // 2^3 15 | channels: 4 16 | concat_mode: true 17 | monitor: val/loss 18 | use_ema: False 19 | 20 | scheduler_config: 21 | target: ldm.lr_scheduler.LambdaWarmUpCosineScheduler 22 | params: 23 | verbosity_interval: 0 24 | warm_up_steps: 1000 25 | max_decay_steps: 50000 26 | lr_start: 0.001 27 | lr_max: 0.1 28 | lr_min: 0.0001 29 | 30 | unet_config: 31 | target: ldm.modules.diffusionmodules.openaimodel.UNetModel 32 | params: 33 | image_size: 32 # ununsed 34 | in_channels: 9 # 4 + 1 + 4 35 | out_channels: 4 36 | model_channels: 320 37 | attention_resolutions: 38 | - 1 39 | - 2 40 | num_res_blocks: 2 41 | channel_mult: # num_down = len(ch_mult)-1 42 | - 1 43 | - 2 44 | num_heads: 8 45 | resblock_updown: true 46 | 47 | first_stage_config: 48 | target: ldm.models.autoencoder.AutoencoderKL 49 | params: 50 | embed_dim: 4 51 | monitor: val/rec_loss 52 | ckpt_path: # /apdcephfs/share_1316500/nlphuang/results/Text_to_audio/ae15/2022-12-15T22-24-00_mixdata_kl_4_tile/epoch=000009-v2.ckpt 53 | ddconfig: 54 | double_z: true 55 | z_channels: 4 56 | resolution: 848 57 | in_channels: 1 58 | out_ch: 1 59 | ch: 128 60 | ch_mult: [ 1, 2, 2, 4 ] # num_down = len(ch_mult)-1 61 | num_res_blocks: 2 62 | attn_resolutions: [106, 212] 63 | dropout: 0.0 64 | lossconfig: 65 | target: torch.nn.Identity 66 | 67 | cond_stage_config: __is_first_stage__ 68 | 69 | -------------------------------------------------------------------------------- /text_to_audio/Make_An_Audio/configs/text_to_audio/clap_args.yaml: 
-------------------------------------------------------------------------------- 1 | # TEXT ENCODER CONFIG 2 | text_model: 'bert-base-uncased' 3 | text_len: 100 4 | transformer_embed_dim: 768 5 | freeze_text_encoder_weights: True 6 | 7 | # AUDIO ENCODER CONFIG 8 | audioenc_name: 'Cnn14' 9 | out_emb: 2048 10 | sampling_rate: 44100 11 | duration: 9 12 | fmin: 50 13 | fmax: 14000 14 | n_fft: 1028 15 | hop_size: 320 16 | mel_bins: 64 17 | window_size: 1024 18 | 19 | # PROJECTION SPACE CONFIG 20 | d_proj: 1024 21 | temperature: 0.003 22 | 23 | # TRAINING AND EVALUATION CONFIG 24 | num_classes: 527 25 | batch_size: 1024 26 | demo: False 27 | -------------------------------------------------------------------------------- /text_to_audio/Make_An_Audio/configs/text_to_audio/hifigan_args.yaml: -------------------------------------------------------------------------------- 1 | adam_b1: 0.8 2 | adam_b2: 0.99 3 | batch_size: 24 4 | dist_config: 5 | dist_backend: nccl 6 | dist_url: tcp://localhost:54321 7 | world_size: 1 8 | fmax: 8000 9 | fmax_for_loss: null 10 | fmin: 0 11 | hop_size: 256 12 | learning_rate: 0.0002 13 | lr_decay: 0.999 14 | n_fft: 1024 15 | num_gpus: 0 16 | num_mels: 80 17 | num_workers: 4 18 | resblock: '1' 19 | resblock_dilation_sizes: 20 | - - 1 21 | - 3 22 | - 5 23 | - - 1 24 | - 3 25 | - 5 26 | - - 1 27 | - 3 28 | - 5 29 | resblock_kernel_sizes: 30 | - 3 31 | - 7 32 | - 11 33 | sampling_rate: 16000 34 | seed: 1234 35 | segment_size: 8192 36 | upsample_initial_channel: 512 37 | upsample_kernel_sizes: 38 | - 16 39 | - 16 40 | - 4 41 | - 4 42 | upsample_rates: 43 | - 8 44 | - 8 45 | - 2 46 | - 2 47 | win_size: 1024 48 | -------------------------------------------------------------------------------- /text_to_audio/Make_An_Audio/configs/text_to_audio/txt2audio_args.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | base_learning_rate: 1.0e-05 3 | target: ldm.models.diffusion.ddpm_audio.LatentDiffusion_audio 4 | params: 5 | linear_start: 0.00085 6 | linear_end: 0.0120 7 | num_timesteps_cond: 1 8 | log_every_t: 200 9 | timesteps: 1000 10 | first_stage_key: image 11 | cond_stage_key: caption 12 | image_size: 32 # unused 13 | mel_dim: 10 # 80 // 2^3 14 | mel_length: 78 # 624 // 2^3 15 | channels: 4 16 | cond_stage_trainable: false 17 | conditioning_key: crossattn 18 | monitor: val/loss_simple_ema 19 | scale_by_std: True 20 | use_ema: False 21 | 22 | scheduler_config: # 10000 warmup steps 23 | target: ldm.lr_scheduler.LambdaLinearScheduler 24 | params: 25 | warm_up_steps: [10000] 26 | cycle_lengths: [10000000000000] 27 | f_start: [1.e-6] 28 | f_max: [1.] 29 | f_min: [ 1.] 
30 | 31 | unet_config: 32 | target: ldm.modules.diffusionmodules.openaimodel.UNetModel 33 | params: 34 | image_size: 32 # ununsed 35 | in_channels: 4 36 | out_channels: 4 37 | model_channels: 320 38 | attention_resolutions: 39 | - 1 40 | - 2 41 | num_res_blocks: 2 42 | channel_mult: # num_down = len(ch_mult)-1 43 | - 1 44 | - 2 45 | num_heads: 8 46 | use_spatial_transformer: true 47 | transformer_depth: 1 48 | context_dim: 1024 49 | use_checkpoint: true 50 | legacy: False 51 | 52 | first_stage_config: 53 | target: ldm.models.autoencoder.AutoencoderKL 54 | params: 55 | embed_dim: 4 56 | monitor: val/rec_loss 57 | ckpt_path: 58 | ddconfig: 59 | double_z: true 60 | z_channels: 4 61 | resolution: 848 62 | in_channels: 1 63 | out_ch: 1 64 | ch: 128 65 | ch_mult: [ 1, 2, 2, 4 ] # num_down = len(ch_mult)-1 66 | num_res_blocks: 2 67 | attn_resolutions: [106, 212] 68 | dropout: 0.0 69 | lossconfig: 70 | target: torch.nn.Identity 71 | 72 | cond_stage_config: 73 | target: ldm.modules.encoders.modules.FrozenCLAPEmbedder 74 | params: 75 | weights_path: useful_ckpts/CLAP/CLAP_weights_2022.pth 76 | 77 | ckpt_path: useful_ckpts/ta40multi_epoch=000085.ckpt 78 | 79 | -------------------------------------------------------------------------------- /text_to_audio/Make_An_Audio/ldm/models/diffusion/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIGC-Audio/AudioGPT/a674543c537bb3a60b8f521df76b97a627c1c379/text_to_audio/Make_An_Audio/ldm/models/diffusion/__init__.py -------------------------------------------------------------------------------- /text_to_audio/Make_An_Audio/ldm/modules/diffusionmodules/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIGC-Audio/AudioGPT/a674543c537bb3a60b8f521df76b97a627c1c379/text_to_audio/Make_An_Audio/ldm/modules/diffusionmodules/__init__.py -------------------------------------------------------------------------------- /text_to_audio/Make_An_Audio/ldm/modules/distributions/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIGC-Audio/AudioGPT/a674543c537bb3a60b8f521df76b97a627c1c379/text_to_audio/Make_An_Audio/ldm/modules/distributions/__init__.py -------------------------------------------------------------------------------- /text_to_audio/Make_An_Audio/ldm/modules/distributions/distributions.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | 4 | 5 | class AbstractDistribution: 6 | def sample(self): 7 | raise NotImplementedError() 8 | 9 | def mode(self): 10 | raise NotImplementedError() 11 | 12 | 13 | class DiracDistribution(AbstractDistribution): 14 | def __init__(self, value): 15 | self.value = value 16 | 17 | def sample(self): 18 | return self.value 19 | 20 | def mode(self): 21 | return self.value 22 | 23 | 24 | class DiagonalGaussianDistribution(object): 25 | def __init__(self, parameters, deterministic=False): 26 | self.parameters = parameters 27 | self.mean, self.logvar = torch.chunk(parameters, 2, dim=1) 28 | self.logvar = torch.clamp(self.logvar, -30.0, 20.0) 29 | self.deterministic = deterministic 30 | self.std = torch.exp(0.5 * self.logvar) 31 | self.var = torch.exp(self.logvar) 32 | if self.deterministic: 33 | self.var = self.std = torch.zeros_like(self.mean).to(device=self.parameters.device) 34 | 35 | def sample(self): 36 | x = self.mean + self.std * 
torch.randn(self.mean.shape).to(device=self.parameters.device) 37 | return x 38 | 39 | def kl(self, other=None): 40 | if self.deterministic: 41 | return torch.Tensor([0.]) 42 | else: 43 | if other is None: 44 | return 0.5 * torch.sum(torch.pow(self.mean, 2) 45 | + self.var - 1.0 - self.logvar, 46 | dim=[1, 2, 3]) 47 | else: 48 | return 0.5 * torch.sum( 49 | torch.pow(self.mean - other.mean, 2) / other.var 50 | + self.var / other.var - 1.0 - self.logvar + other.logvar, 51 | dim=[1, 2, 3]) 52 | 53 | def nll(self, sample, dims=[1,2,3]): 54 | if self.deterministic: 55 | return torch.Tensor([0.]) 56 | logtwopi = np.log(2.0 * np.pi) 57 | return 0.5 * torch.sum( 58 | logtwopi + self.logvar + torch.pow(sample - self.mean, 2) / self.var, 59 | dim=dims) 60 | 61 | def mode(self): 62 | return self.mean 63 | 64 | 65 | def normal_kl(mean1, logvar1, mean2, logvar2): 66 | """ 67 | source: https://github.com/openai/guided-diffusion/blob/27c20a8fab9cb472df5d6bdd6c8d11c8f430b924/guided_diffusion/losses.py#L12 68 | Compute the KL divergence between two gaussians. 69 | Shapes are automatically broadcasted, so batches can be compared to 70 | scalars, among other use cases. 71 | """ 72 | tensor = None 73 | for obj in (mean1, logvar1, mean2, logvar2): 74 | if isinstance(obj, torch.Tensor): 75 | tensor = obj 76 | break 77 | assert tensor is not None, "at least one argument must be a Tensor" 78 | 79 | # Force variances to be Tensors. Broadcasting helps convert scalars to 80 | # Tensors, but it does not work for torch.exp(). 81 | logvar1, logvar2 = [ 82 | x if isinstance(x, torch.Tensor) else torch.tensor(x).to(tensor) 83 | for x in (logvar1, logvar2) 84 | ] 85 | 86 | return 0.5 * ( 87 | -1.0 88 | + logvar2 89 | - logvar1 90 | + torch.exp(logvar1 - logvar2) 91 | + ((mean1 - mean2) ** 2) * torch.exp(-logvar2) 92 | ) 93 | -------------------------------------------------------------------------------- /text_to_audio/Make_An_Audio/ldm/modules/ema.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | 4 | 5 | class LitEma(nn.Module): 6 | def __init__(self, model, decay=0.9999, use_num_upates=True): 7 | super().__init__() 8 | if decay < 0.0 or decay > 1.0: 9 | raise ValueError('Decay must be between 0 and 1') 10 | 11 | self.m_name2s_name = {} 12 | self.register_buffer('decay', torch.tensor(decay, dtype=torch.float32)) 13 | self.register_buffer('num_updates', torch.tensor(0,dtype=torch.int) if use_num_upates 14 | else torch.tensor(-1,dtype=torch.int)) 15 | 16 | for name, p in model.named_parameters(): 17 | if p.requires_grad: 18 | #remove as '.'-character is not allowed in buffers 19 | s_name = name.replace('.','') 20 | self.m_name2s_name.update({name:s_name}) 21 | self.register_buffer(s_name,p.clone().detach().data) 22 | 23 | self.collected_params = [] 24 | 25 | def forward(self,model): 26 | decay = self.decay 27 | 28 | if self.num_updates >= 0: 29 | self.num_updates += 1 30 | decay = min(self.decay,(1 + self.num_updates) / (10 + self.num_updates)) 31 | 32 | one_minus_decay = 1.0 - decay 33 | 34 | with torch.no_grad(): 35 | m_param = dict(model.named_parameters()) 36 | shadow_params = dict(self.named_buffers()) 37 | 38 | for key in m_param: 39 | if m_param[key].requires_grad: 40 | sname = self.m_name2s_name[key] 41 | shadow_params[sname] = shadow_params[sname].type_as(m_param[key]) 42 | shadow_params[sname].sub_(one_minus_decay * (shadow_params[sname] - m_param[key])) 43 | else: 44 | assert not key in self.m_name2s_name 45 | 46 | def 
copy_to(self, model): 47 | m_param = dict(model.named_parameters()) 48 | shadow_params = dict(self.named_buffers()) 49 | for key in m_param: 50 | if m_param[key].requires_grad: 51 | m_param[key].data.copy_(shadow_params[self.m_name2s_name[key]].data) 52 | else: 53 | assert not key in self.m_name2s_name 54 | 55 | def store(self, parameters): 56 | """ 57 | Save the current parameters for restoring later. 58 | Args: 59 | parameters: Iterable of `torch.nn.Parameter`; the parameters to be 60 | temporarily stored. 61 | """ 62 | self.collected_params = [param.clone() for param in parameters] 63 | 64 | def restore(self, parameters): 65 | """ 66 | Restore the parameters stored with the `store` method. 67 | Useful to validate the model with EMA parameters without affecting the 68 | original optimization process. Store the parameters before the 69 | `copy_to` method. After validation (or model saving), use this to 70 | restore the former parameters. 71 | Args: 72 | parameters: Iterable of `torch.nn.Parameter`; the parameters to be 73 | updated with the stored parameters. 74 | """ 75 | for c_param, param in zip(self.collected_params, parameters): 76 | param.data.copy_(c_param.data) 77 | -------------------------------------------------------------------------------- /text_to_audio/Make_An_Audio/ldm/modules/encoders/CLAP/__init__.py: -------------------------------------------------------------------------------- 1 | from . import clap 2 | from . import audio 3 | from . import utils -------------------------------------------------------------------------------- /text_to_audio/Make_An_Audio/ldm/modules/encoders/CLAP/config.yml: -------------------------------------------------------------------------------- 1 | # TEXT ENCODER CONFIG 2 | text_model: 'bert-base-uncased' 3 | text_len: 100 4 | transformer_embed_dim: 768 5 | freeze_text_encoder_weights: True 6 | 7 | # AUDIO ENCODER CONFIG 8 | audioenc_name: 'Cnn14' 9 | out_emb: 2048 10 | sampling_rate: 44100 11 | duration: 5 12 | fmin: 50 13 | fmax: 14000 14 | n_fft: 1028 15 | hop_size: 320 16 | mel_bins: 64 17 | window_size: 1024 18 | 19 | # PROJECTION SPACE CONFIG 20 | d_proj: 1024 21 | temperature: 0.003 22 | 23 | # TRAINING AND EVALUATION CONFIG 24 | num_classes: 527 25 | batch_size: 1024 26 | demo: False 27 | -------------------------------------------------------------------------------- /text_to_audio/Make_An_Audio/ldm/modules/encoders/CLAP/utils.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import yaml 3 | import sys 4 | 5 | def read_config_as_args(config_path,args=None,is_config_str=False): 6 | return_dict = {} 7 | 8 | if config_path is not None: 9 | if is_config_str: 10 | yml_config = yaml.load(config_path, Loader=yaml.FullLoader) 11 | else: 12 | with open(config_path, "r") as f: 13 | yml_config = yaml.load(f, Loader=yaml.FullLoader) 14 | 15 | if args != None: 16 | for k, v in yml_config.items(): 17 | if k in args.__dict__: 18 | args.__dict__[k] = v 19 | else: 20 | sys.stderr.write("Ignored unknown parameter {} in yaml.\n".format(k)) 21 | else: 22 | for k, v in yml_config.items(): 23 | return_dict[k] = v 24 | 25 | args = args if args != None else return_dict 26 | return argparse.Namespace(**args) 27 | -------------------------------------------------------------------------------- /text_to_audio/Make_An_Audio/ldm/modules/encoders/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/AIGC-Audio/AudioGPT/a674543c537bb3a60b8f521df76b97a627c1c379/text_to_audio/Make_An_Audio/ldm/modules/encoders/__init__.py -------------------------------------------------------------------------------- /text_to_audio/Make_An_Audio/ldm/modules/encoders/open_clap/__init__.py: -------------------------------------------------------------------------------- 1 | from .factory import list_models, create_model, create_model_and_transforms, add_model_config 2 | from .loss import ClipLoss, gather_features, LPLoss, lp_gather_features, LPMetrics 3 | from .model import CLAP, CLAPTextCfg, CLAPVisionCfg, CLAPAudioCfp, convert_weights_to_fp16, trace_model 4 | from .openai import load_openai_model, list_openai_models 5 | from .pretrained import list_pretrained, list_pretrained_tag_models, list_pretrained_model_tags,\ 6 | get_pretrained_url, download_pretrained 7 | from .tokenizer import SimpleTokenizer, tokenize 8 | from .transform import image_transform 9 | -------------------------------------------------------------------------------- /text_to_audio/Make_An_Audio/ldm/modules/encoders/open_clap/bert.py: -------------------------------------------------------------------------------- 1 | from transformers import BertTokenizer, BertModel 2 | tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') 3 | model = BertModel.from_pretrained("bert-base-uncased") 4 | text = "Replace me by any text you'd like." 5 | 6 | def bert_embeddings(text): 7 | # text = "Replace me by any text you'd like." 8 | encoded_input = tokenizer(text, return_tensors='pt') 9 | output = model(**encoded_input) 10 | return output 11 | 12 | from transformers import RobertaTokenizer, RobertaModel 13 | 14 | tokenizer = RobertaTokenizer.from_pretrained('roberta-base') 15 | model = RobertaModel.from_pretrained('roberta-base') 16 | text = "Replace me by any text you'd like." 17 | def Roberta_embeddings(text): 18 | # text = "Replace me by any text you'd like." 19 | encoded_input = tokenizer(text, return_tensors='pt') 20 | output = model(**encoded_input) 21 | return output 22 | 23 | from transformers import BartTokenizer, BartModel 24 | 25 | tokenizer = BartTokenizer.from_pretrained('facebook/bart-base') 26 | model = BartModel.from_pretrained('facebook/bart-base') 27 | text = "Replace me by any text you'd like." 28 | def bart_embeddings(text): 29 | # text = "Replace me by any text you'd like." 
30 | encoded_input = tokenizer(text, return_tensors='pt') 31 | output = model(**encoded_input) 32 | return output -------------------------------------------------------------------------------- /text_to_audio/Make_An_Audio/ldm/modules/encoders/open_clap/bpe_simple_vocab_16e6.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIGC-Audio/AudioGPT/a674543c537bb3a60b8f521df76b97a627c1c379/text_to_audio/Make_An_Audio/ldm/modules/encoders/open_clap/bpe_simple_vocab_16e6.txt.gz -------------------------------------------------------------------------------- /text_to_audio/Make_An_Audio/ldm/modules/encoders/open_clap/linear_probe.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch.nn.functional as F 3 | from torch import nn 4 | from .model import MLPLayers 5 | 6 | 7 | class LinearProbe(nn.Module): 8 | def __init__(self, model, mlp, freeze, in_ch, out_ch, act=None): 9 | """ 10 | Args: 11 | model: nn.Module 12 | mlp: bool, if True, then use the MLP layer as the linear probe module 13 | freeze: bool, if True, then freeze all the CLAP model's layers when training the linear probe 14 | in_ch: int, the output channel from CLAP model 15 | out_ch: int, the output channel from linear probe (class_num) 16 | act: torch.nn.functional, the activation function before the loss function 17 | """ 18 | super().__init__() 19 | in_ch = 512 20 | self.clap_model = model 21 | self.clap_model.text_branch = None # to save memory 22 | self.freeze = freeze 23 | if mlp: 24 | self.lp_layer = MLPLayers(units=[in_ch, in_ch * 2, out_ch]) 25 | else: 26 | self.lp_layer = nn.Linear(in_ch, out_ch) 27 | 28 | if self.freeze: 29 | for param in self.clap_model.parameters(): 30 | param.requires_grad = False 31 | 32 | if act is None or act == 'None': 33 | self.act = None 34 | elif act == 'relu': 35 | self.act = nn.ReLU() 36 | elif act == 'elu': 37 | self.act = nn.ELU() 38 | elif act == 'prelu': 39 | self.act = nn.PReLU(num_parameters=in_ch) 40 | elif act == 'softmax': 41 | self.act = nn.Softmax(dim=-1) 42 | elif act == 'sigmoid': 43 | self.act = nn.Sigmoid() 44 | 45 | def forward(self, x, mix_lambda=None, device=None): 46 | """ 47 | Args: 48 | x: waveform, torch.tensor [batch, t_samples] / batch of mel_spec and longer list 49 | mix_lambda: torch.tensor [batch], the mixup lambda 50 | Returns: 51 | class_prob: torch.tensor [batch, class_num] 52 | 53 | """ 54 | # keep the frozen CLAP model in eval mode so batchnorm/dropout statistics are not updated 55 | if self.freeze: 56 | self.clap_model.eval() 57 | 58 | x = self.clap_model.audio_projection( 59 | self.clap_model.audio_branch(x, mixup_lambda=mix_lambda, device=device)["embedding"]) 60 | out = self.lp_layer(x) 61 | if self.act is not None: 62 | out = self.act(out) 63 | return out 64 | -------------------------------------------------------------------------------- /text_to_audio/Make_An_Audio/ldm/modules/encoders/open_clap/model_configs/HTSAT-base.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "audio_cfg": { 4 | "audio_length": 1024, 5 | "clip_samples": 480000, 6 | "mel_bins": 64, 7 | "sample_rate": 48000, 8 | "window_size": 1024, 9 | "hop_size": 480, 10 | "fmin": 50, 11 | "fmax": 14000, 12 | "class_num": 527, 13 | "model_type": "HTSAT", 14 | "model_name": "base" 15 | }, 16 | "text_cfg": { 17 | "context_length": 77, 18 | "vocab_size": 49408, 19 | "width": 512, 20 | "heads": 8, 21 | "layers": 12 22 | } 23 | }
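As an illustrative aside (not a file in the repository): each of these open_clap model_configs JSON files pairs an "audio_cfg" (PANN or HTSAT backbone plus its mel front-end settings) with a CLIP-style "text_cfg", joined through a shared "embed_dim". Below is a minimal Python sketch of inspecting one such config with only the standard library; the relative path is an assumption about where the checkout lives.

import json

# Assumed relative path into the checkout; adjust to your layout.
cfg_path = "text_to_audio/Make_An_Audio/ldm/modules/encoders/open_clap/model_configs/HTSAT-base.json"
with open(cfg_path) as f:
    cfg = json.load(f)

# Joint embedding size shared by the audio and text branches.
print("embed_dim:", cfg["embed_dim"])
# Audio branch: backbone type/name and the mel front-end it expects.
audio = cfg["audio_cfg"]
print(audio["model_type"], audio["model_name"], audio["sample_rate"], audio["mel_bins"])
# Text branch: transformer width/heads/layers over a 49408-token BPE vocabulary.
text = cfg["text_cfg"]
print(text["width"], text["heads"], text["layers"])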
-------------------------------------------------------------------------------- /text_to_audio/Make_An_Audio/ldm/modules/encoders/open_clap/model_configs/HTSAT-large.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 2048, 3 | "audio_cfg": { 4 | "audio_length": 1024, 5 | "clip_samples": 480000, 6 | "mel_bins": 64, 7 | "sample_rate": 48000, 8 | "window_size": 1024, 9 | "hop_size": 480, 10 | "fmin": 50, 11 | "fmax": 14000, 12 | "class_num": 527, 13 | "model_type": "HTSAT", 14 | "model_name": "large" 15 | }, 16 | "text_cfg": { 17 | "context_length": 77, 18 | "vocab_size": 49408, 19 | "width": 512, 20 | "heads": 8, 21 | "layers": 12 22 | } 23 | } -------------------------------------------------------------------------------- /text_to_audio/Make_An_Audio/ldm/modules/encoders/open_clap/model_configs/HTSAT-tiny-win-1536.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "audio_cfg": { 4 | "audio_length": 1024, 5 | "clip_samples": 480000, 6 | "mel_bins": 64, 7 | "sample_rate": 48000, 8 | "window_size": 1536, 9 | "hop_size": 480, 10 | "fmin": 50, 11 | "fmax": 14000, 12 | "class_num": 527, 13 | "model_type": "HTSAT", 14 | "model_name": "tiny" 15 | }, 16 | "text_cfg": { 17 | "context_length": 77, 18 | "vocab_size": 49408, 19 | "width": 512, 20 | "heads": 8, 21 | "layers": 12 22 | } 23 | } -------------------------------------------------------------------------------- /text_to_audio/Make_An_Audio/ldm/modules/encoders/open_clap/model_configs/HTSAT-tiny.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "audio_cfg": { 4 | "audio_length": 1024, 5 | "clip_samples": 480000, 6 | "mel_bins": 64, 7 | "sample_rate": 48000, 8 | "window_size": 1024, 9 | "hop_size": 480, 10 | "fmin": 50, 11 | "fmax": 14000, 12 | "class_num": 527, 13 | "model_type": "HTSAT", 14 | "model_name": "tiny" 15 | }, 16 | "text_cfg": { 17 | "context_length": 77, 18 | "vocab_size": 49408, 19 | "width": 512, 20 | "heads": 8, 21 | "layers": 12 22 | } 23 | } -------------------------------------------------------------------------------- /text_to_audio/Make_An_Audio/ldm/modules/encoders/open_clap/model_configs/PANN-10.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "audio_cfg": { 4 | "audio_length": 1024, 5 | "clip_samples": 480000, 6 | "mel_bins": 64, 7 | "sample_rate": 48000, 8 | "window_size": 1024, 9 | "hop_size": 480, 10 | "fmin": 50, 11 | "fmax": 14000, 12 | "class_num": 527, 13 | "model_type": "PANN", 14 | "model_name": "Cnn10" 15 | }, 16 | "text_cfg": { 17 | "context_length": 77, 18 | "vocab_size": 49408, 19 | "width": 512, 20 | "heads": 8, 21 | "layers": 12 22 | } 23 | } -------------------------------------------------------------------------------- /text_to_audio/Make_An_Audio/ldm/modules/encoders/open_clap/model_configs/PANN-14-fmax-18k.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 2048, 3 | "audio_cfg": { 4 | "audio_length": 1024, 5 | "clip_samples": 480000, 6 | "mel_bins": 64, 7 | "sample_rate": 48000, 8 | "window_size": 1024, 9 | "hop_size": 480, 10 | "fmin": 50, 11 | "fmax": 18000, 12 | "class_num": 527, 13 | "model_type": "PANN", 14 | "model_name": "Cnn14" 15 | }, 16 | "text_cfg": { 17 | "context_length": 77, 18 | "vocab_size": 49408, 19 | "width": 512, 20 | "heads": 8, 21 | "layers": 12 22 | } 23 | 
} -------------------------------------------------------------------------------- /text_to_audio/Make_An_Audio/ldm/modules/encoders/open_clap/model_configs/PANN-14-fmax-8k-20s.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 2048, 3 | "audio_cfg": { 4 | "audio_length": 1024, 5 | "clip_samples": 960000, 6 | "mel_bins": 64, 7 | "sample_rate": 48000, 8 | "window_size": 1024, 9 | "hop_size": 360, 10 | "fmin": 50, 11 | "fmax": 8000, 12 | "class_num": 527, 13 | "model_type": "PANN", 14 | "model_name": "Cnn14" 15 | }, 16 | "text_cfg": { 17 | "context_length": 77, 18 | "vocab_size": 49408, 19 | "width": 512, 20 | "heads": 8, 21 | "layers": 12 22 | } 23 | } -------------------------------------------------------------------------------- /text_to_audio/Make_An_Audio/ldm/modules/encoders/open_clap/model_configs/PANN-14-tiny-transformer.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 2048, 3 | "audio_cfg": { 4 | "audio_length": 1024, 5 | "clip_samples": 480000, 6 | "mel_bins": 64, 7 | "sample_rate": 48000, 8 | "window_size": 1024, 9 | "hop_size": 480, 10 | "fmin": 50, 11 | "fmax": 14000, 12 | "class_num": 527, 13 | "model_type": "PANN", 14 | "model_name": "Cnn14" 15 | }, 16 | "text_cfg": { 17 | "context_length": 77, 18 | "vocab_size": 49408, 19 | "width": 512, 20 | "heads": 8, 21 | "layers": 4 22 | } 23 | } -------------------------------------------------------------------------------- /text_to_audio/Make_An_Audio/ldm/modules/encoders/open_clap/model_configs/PANN-14-win-1536.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 2048, 3 | "audio_cfg": { 4 | "audio_length": 1024, 5 | "clip_samples": 480000, 6 | "mel_bins": 64, 7 | "sample_rate": 48000, 8 | "window_size": 1536, 9 | "hop_size": 480, 10 | "fmin": 50, 11 | "fmax": 14000, 12 | "class_num": 527, 13 | "model_type": "PANN", 14 | "model_name": "Cnn14" 15 | }, 16 | "text_cfg": { 17 | "context_length": 77, 18 | "vocab_size": 49408, 19 | "width": 512, 20 | "heads": 8, 21 | "layers": 12 22 | } 23 | } -------------------------------------------------------------------------------- /text_to_audio/Make_An_Audio/ldm/modules/encoders/open_clap/model_configs/PANN-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 2048, 3 | "audio_cfg": { 4 | "audio_length": 1024, 5 | "clip_samples": 480000, 6 | "mel_bins": 64, 7 | "sample_rate": 48000, 8 | "window_size": 1024, 9 | "hop_size": 480, 10 | "fmin": 50, 11 | "fmax": 14000, 12 | "class_num": 527, 13 | "model_type": "PANN", 14 | "model_name": "Cnn14" 15 | }, 16 | "text_cfg": { 17 | "context_length": 77, 18 | "vocab_size": 49408, 19 | "width": 512, 20 | "heads": 8, 21 | "layers": 12 22 | } 23 | } -------------------------------------------------------------------------------- /text_to_audio/Make_An_Audio/ldm/modules/encoders/open_clap/model_configs/PANN-6.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "audio_cfg": { 4 | "audio_length": 1024, 5 | "clip_samples": 480000, 6 | "mel_bins": 64, 7 | "sample_rate": 48000, 8 | "window_size": 1024, 9 | "hop_size": 480, 10 | "fmin": 50, 11 | "fmax": 14000, 12 | "class_num": 527, 13 | "model_type": "PANN", 14 | "model_name": "Cnn6" 15 | }, 16 | "text_cfg": { 17 | "context_length": 77, 18 | "vocab_size": 49408, 19 | "width": 512, 20 | "heads": 8, 21 | "layers": 12 22 
| } 23 | } -------------------------------------------------------------------------------- /text_to_audio/Make_An_Audio/ldm/modules/encoders/open_clap/model_configs/RN101-quickgelu.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "quick_gelu": true, 4 | "vision_cfg": { 5 | "image_size": 224, 6 | "layers": [ 7 | 3, 8 | 4, 9 | 23, 10 | 3 11 | ], 12 | "width": 64, 13 | "patch_size": null 14 | }, 15 | "text_cfg": { 16 | "context_length": 77, 17 | "vocab_size": 49408, 18 | "width": 512, 19 | "heads": 8, 20 | "layers": 12 21 | } 22 | } -------------------------------------------------------------------------------- /text_to_audio/Make_An_Audio/ldm/modules/encoders/open_clap/model_configs/RN101.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": [ 6 | 3, 7 | 4, 8 | 23, 9 | 3 10 | ], 11 | "width": 64, 12 | "patch_size": null 13 | }, 14 | "text_cfg": { 15 | "context_length": 77, 16 | "vocab_size": 49408, 17 | "width": 512, 18 | "heads": 8, 19 | "layers": 12 20 | } 21 | } -------------------------------------------------------------------------------- /text_to_audio/Make_An_Audio/ldm/modules/encoders/open_clap/model_configs/RN50-quickgelu.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "quick_gelu": true, 4 | "vision_cfg": { 5 | "image_size": 224, 6 | "layers": [ 7 | 3, 8 | 4, 9 | 6, 10 | 3 11 | ], 12 | "width": 64, 13 | "patch_size": null 14 | }, 15 | "text_cfg": { 16 | "context_length": 77, 17 | "vocab_size": 49408, 18 | "width": 512, 19 | "heads": 8, 20 | "layers": 12 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /text_to_audio/Make_An_Audio/ldm/modules/encoders/open_clap/model_configs/RN50.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": [ 6 | 3, 7 | 4, 8 | 6, 9 | 3 10 | ], 11 | "width": 64, 12 | "patch_size": null 13 | }, 14 | "text_cfg": { 15 | "context_length": 77, 16 | "vocab_size": 49408, 17 | "width": 512, 18 | "heads": 8, 19 | "layers": 12 20 | } 21 | } -------------------------------------------------------------------------------- /text_to_audio/Make_An_Audio/ldm/modules/encoders/open_clap/model_configs/RN50x16.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 384, 5 | "layers": [ 6 | 6, 7 | 8, 8 | 18, 9 | 8 10 | ], 11 | "width": 96, 12 | "patch_size": null 13 | }, 14 | "text_cfg": { 15 | "context_length": 77, 16 | "vocab_size": 49408, 17 | "width": 768, 18 | "heads": 12, 19 | "layers": 12 20 | } 21 | } -------------------------------------------------------------------------------- /text_to_audio/Make_An_Audio/ldm/modules/encoders/open_clap/model_configs/RN50x4.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 640, 3 | "vision_cfg": { 4 | "image_size": 288, 5 | "layers": [ 6 | 4, 7 | 6, 8 | 10, 9 | 6 10 | ], 11 | "width": 80, 12 | "patch_size": null 13 | }, 14 | "text_cfg": { 15 | "context_length": 77, 16 | "vocab_size": 49408, 17 | "width": 640, 18 | "heads": 10, 19 | "layers": 12 20 | } 21 | } -------------------------------------------------------------------------------- 
/text_to_audio/Make_An_Audio/ldm/modules/encoders/open_clap/model_configs/ViT-B-16.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 768, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 512, 13 | "heads": 8, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /text_to_audio/Make_An_Audio/ldm/modules/encoders/open_clap/model_configs/ViT-B-32-quickgelu.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "quick_gelu": true, 4 | "vision_cfg": { 5 | "image_size": 224, 6 | "layers": 12, 7 | "width": 768, 8 | "patch_size": 32 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } -------------------------------------------------------------------------------- /text_to_audio/Make_An_Audio/ldm/modules/encoders/open_clap/model_configs/ViT-B-32.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 768, 7 | "patch_size": 32 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 512, 13 | "heads": 8, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /text_to_audio/Make_An_Audio/ldm/modules/encoders/open_clap/model_configs/ViT-L-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 24, 6 | "width": 1024, 7 | "patch_size": 14 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 768, 13 | "heads": 12, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /text_to_audio/Make_An_Audio/ldm/modules/encoders/open_clap/transform.py: -------------------------------------------------------------------------------- 1 | from torchvision.transforms import Normalize, Compose, RandomResizedCrop, InterpolationMode, ToTensor, Resize, \ 2 | CenterCrop 3 | 4 | 5 | def _convert_to_rgb(image): 6 | return image.convert('RGB') 7 | 8 | 9 | def image_transform( 10 | image_size: int, 11 | is_train: bool, 12 | mean=(0.48145466, 0.4578275, 0.40821073), 13 | std=(0.26862954, 0.26130258, 0.27577711) 14 | ): 15 | normalize = Normalize(mean=mean, std=std) 16 | if is_train: 17 | return Compose([ 18 | RandomResizedCrop(image_size, scale=(0.9, 1.0), interpolation=InterpolationMode.BICUBIC), 19 | _convert_to_rgb, 20 | ToTensor(), 21 | normalize, 22 | ]) 23 | else: 24 | return Compose([ 25 | Resize(image_size, interpolation=InterpolationMode.BICUBIC), 26 | CenterCrop(image_size), 27 | _convert_to_rgb, 28 | ToTensor(), 29 | normalize, 30 | ]) 31 | -------------------------------------------------------------------------------- /text_to_audio/Make_An_Audio/ldm/modules/encoders/open_clap/version.py: -------------------------------------------------------------------------------- 1 | __version__ = '0.2.1' 2 | -------------------------------------------------------------------------------- /text_to_audio/Make_An_Audio/ldm/modules/image_degradation/__init__.py: 
-------------------------------------------------------------------------------- 1 | from ldm.modules.image_degradation.bsrgan import degradation_bsrgan_variant as degradation_fn_bsr 2 | from ldm.modules.image_degradation.bsrgan_light import degradation_bsrgan_variant as degradation_fn_bsr_light 3 | -------------------------------------------------------------------------------- /text_to_audio/Make_An_Audio/ldm/modules/image_degradation/utils/test.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIGC-Audio/AudioGPT/a674543c537bb3a60b8f521df76b97a627c1c379/text_to_audio/Make_An_Audio/ldm/modules/image_degradation/utils/test.png -------------------------------------------------------------------------------- /text_to_audio/Make_An_Audio/ldm/modules/losses_audio/__init__.py: -------------------------------------------------------------------------------- 1 | from ldm.modules.losses_audio.vqperceptual import DummyLoss 2 | 3 | # relative imports pain 4 | import os 5 | import sys 6 | path = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'vggishish') 7 | sys.path.append(path) 8 | -------------------------------------------------------------------------------- /text_to_audio/Make_An_Audio/ldm/modules/losses_audio/vggishish/config/melception.yaml: -------------------------------------------------------------------------------- 1 | seed: 1337 2 | log_code_state: True 3 | # patterns to ignore when backing up the code folder 4 | patterns_to_ignore: ['logs', '.git', '__pycache__', 'data', 'checkpoints', '*.pt'] 5 | 6 | # data: 7 | mels_path: '/home/nvme/data/vggsound/features/melspec_10s_22050hz/' 8 | spec_shape: [80, 860] 9 | cropped_size: [80, 848] 10 | random_crop: False 11 | 12 | # train: 13 | device: 'cuda:0' 14 | batch_size: 8 15 | num_workers: 0 16 | optimizer: adam 17 | betas: [0.9, 0.999] 18 | momentum: 0.9 19 | learning_rate: 3e-4 20 | weight_decay: 0 21 | num_epochs: 100 22 | patience: 3 23 | logdir: './logs' 24 | cls_weights_in_loss: False 25 | -------------------------------------------------------------------------------- /text_to_audio/Make_An_Audio/ldm/modules/losses_audio/vggishish/config/vggish.yaml: -------------------------------------------------------------------------------- 1 | seed: 1337 2 | log_code_state: True 3 | # patterns to ignore when backing up the code folder 4 | patterns_to_ignore: ['logs', '.git', '__pycache__'] 5 | 6 | # data: 7 | mels_path: '/home/nvme/data/vggsound/features/melspec_10s_22050hz/' 8 | spec_shape: [80, 860] 9 | cropped_size: [80, 848] 10 | random_crop: False 11 | 12 | # model: 13 | # original vgg family except for MP is missing at the end 14 | # 'vggish': [64, 'MP', 128, 'MP', 256, 256, 'MP', 512, 512] 15 | # 'vgg11': [64, 'MP', 128, 'MP', 256, 256, 'MP', 512, 512, 'MP', 512, 512], 16 | # 'vgg13': [64, 64, 'MP', 128, 128, 'MP', 256, 256, 'MP', 512, 512, 'MP', 512, 512], 17 | # 'vgg16': [64, 64, 'MP', 128, 128, 'MP', 256, 256, 256, 'MP', 512, 512, 512, 'MP', 512, 512, 512], 18 | # 'vgg19': [64, 64, 'MP', 128, 128, 'MP', 256, 256, 256, 256, 'MP', 512, 512, 512, 512, 'MP', 512, 512, 512, 512], 19 | conv_layers: [64, 64, 'MP', 128, 128, 'MP', 256, 256, 256, 'MP', 512, 512, 512, 'MP', 512, 512, 512] 20 | use_bn: False 21 | 22 | # train: 23 | device: 'cuda:0' 24 | batch_size: 32 25 | num_workers: 0 26 | optimizer: adam 27 | betas: [0.9, 0.999] 28 | momentum: 0.9 29 | learning_rate: 3e-4 30 | weight_decay: 0.0001 31 | num_epochs: 100 32 | patience: 3 33 | logdir: './logs' 34 | 
cls_weights_in_loss: False 35 | -------------------------------------------------------------------------------- /text_to_audio/Make_An_Audio/ldm/modules/losses_audio/vggishish/data/train_means_stds_melspec_10s_22050hz.txt: -------------------------------------------------------------------------------- 1 | 0.51234712 0.08187601 2 | 0.52630101 0.08393201 3 | 0.52002938 0.08533191 4 | 0.51831866 0.08651366 5 | 0.52457265 0.08795700 6 | 0.51129235 0.08924046 7 | 0.51403755 0.09011565 8 | 0.51189406 0.09138965 9 | 0.50142221 0.09215379 10 | 0.50632402 0.09251092 11 | 0.49724399 0.09356590 12 | 0.49062130 0.09333057 13 | 0.49971113 0.09446411 14 | 0.48442903 0.09400197 15 | 0.48598301 0.09477853 16 | 0.48681630 0.09487848 17 | 0.47436119 0.09424447 18 | 0.48031359 0.09417475 19 | 0.47422810 0.09498061 20 | 0.46369397 0.09362415 21 | 0.47413055 0.09453825 22 | 0.46062686 0.09481226 23 | 0.45677793 0.09359464 24 | 0.46135474 0.09437913 25 | 0.45246800 0.09384014 26 | 0.45232766 0.09438247 27 | 0.45208386 0.09419720 28 | 0.44351671 0.09340880 29 | 0.44667316 0.09423184 30 | 0.44447692 0.09344461 31 | 0.43676363 0.09265266 32 | 0.44002381 0.09307925 33 | 0.43772414 0.09299363 34 | 0.43061019 0.09236166 35 | 0.43120828 0.09156491 36 | 0.42603271 0.09161403 37 | 0.42863234 0.09150530 38 | 0.42296206 0.09102536 39 | 0.41733331 0.09048366 40 | 0.41804121 0.09025013 41 | 0.41605068 0.09078869 42 | 0.40875265 0.08985338 43 | 0.40666997 0.08877566 44 | 0.40407463 0.08961667 45 | 0.40353311 0.08859275 46 | 0.39708031 0.08827818 47 | 0.39375066 0.08833999 48 | 0.39301091 0.08760654 49 | 0.39047117 0.08812327 50 | 0.38461680 0.08782288 51 | 0.38145284 0.08645484 52 | 0.37985209 0.08718211 53 | 0.37419526 0.08644421 54 | 0.37080597 0.08532454 55 | 0.36786535 0.08592822 56 | 0.36569049 0.08452069 57 | 0.36336079 0.08474272 58 | 0.35775191 0.08476392 59 | 0.35504801 0.08334654 60 | 0.35284816 0.08412110 61 | 0.34594865 0.08367254 62 | 0.34112312 0.08252251 63 | 0.33784886 0.08320975 64 | 0.33095703 0.08257768 65 | 0.32559461 0.08171253 66 | 0.32003106 0.08204872 67 | 0.31506222 0.08098545 68 | 0.31138077 0.08152917 69 | 0.30403516 0.08209135 70 | 0.29969540 0.08073266 71 | 0.29578024 0.08225822 72 | 0.28861871 0.08324076 73 | 0.28581686 0.08058489 74 | 0.27922253 0.08515350 75 | 0.27444035 0.08355056 76 | 0.27339468 0.08067638 77 | 0.26571759 0.08536921 78 | 0.26280864 0.08107620 79 | 0.25664202 0.08357468 80 | 0.24853513 0.08556041 81 | -------------------------------------------------------------------------------- /text_to_audio/Make_An_Audio/ldm/modules/losses_audio/vggishish/loss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import torch.optim as optim 5 | 6 | class WeightedCrossEntropy(nn.CrossEntropyLoss): 7 | 8 | def __init__(self, weights, **pytorch_ce_loss_args) -> None: 9 | super().__init__(reduction='none', **pytorch_ce_loss_args) 10 | self.weights = weights 11 | 12 | def __call__(self, outputs, targets, to_weight=True): 13 | loss = super().__call__(outputs, targets) 14 | if to_weight: 15 | return (loss * self.weights[targets]).sum() / self.weights[targets].sum() 16 | else: 17 | return loss.mean() 18 | 19 | 20 | if __name__ == '__main__': 21 | x = torch.randn(10, 5) 22 | target = torch.randint(0, 5, (10,)) 23 | weights = torch.tensor([1., 2., 3., 4., 5.]) 24 | 25 | # criterion_weighted = nn.CrossEntropyLoss(weight=weights) 26 | # loss_weighted = criterion_weighted(x, target) 27 
| 28 | # criterion_weighted_manual = nn.CrossEntropyLoss(reduction='none') 29 | # loss_weighted_manual = criterion_weighted_manual(x, target) 30 | # print(loss_weighted, loss_weighted_manual.mean()) 31 | # loss_weighted_manual = (loss_weighted_manual * weights[target]).sum() / weights[target].sum() 32 | # print(loss_weighted, loss_weighted_manual) 33 | # print(torch.allclose(loss_weighted, loss_weighted_manual)) 34 | 35 | pytorch_weighted = nn.CrossEntropyLoss(weight=weights) 36 | pytorch_unweighted = nn.CrossEntropyLoss() 37 | custom = WeightedCrossEntropy(weights) 38 | 39 | assert torch.allclose(pytorch_weighted(x, target), custom(x, target, to_weight=True)) 40 | assert torch.allclose(pytorch_unweighted(x, target), custom(x, target, to_weight=False)) 41 | print(custom(x, target, to_weight=True), custom(x, target, to_weight=False)) 42 | -------------------------------------------------------------------------------- /text_to_audio/Make_An_Audio/ldm/modules/losses_audio/vggishish/metrics.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | import numpy as np 4 | import scipy 5 | import torch 6 | from sklearn.metrics import average_precision_score, roc_auc_score 7 | 8 | logger = logging.getLogger(f'main.{__name__}') 9 | 10 | def metrics(targets, outputs, topk=(1, 5)): 11 | """ 12 | Adapted from https://github.com/hche11/VGGSound/blob/master/utils.py 13 | 14 | Calculate statistics including mAP, AUC, and d-prime. 15 | Args: 16 | output: 2d tensors, (dataset_size, classes_num) - before softmax 17 | target: 1d tensors, (dataset_size, ) 18 | topk: tuple 19 | Returns: 20 | metric_dict: a dict of metrics 21 | """ 22 | metrics_dict = dict() 23 | 24 | num_cls = outputs.shape[-1] 25 | 26 | # accuracy@k 27 | _, preds = torch.topk(outputs, k=max(topk), dim=1) 28 | correct_for_maxtopk = preds == targets.view(-1, 1).expand_as(preds) 29 | for k in topk: 30 | metrics_dict[f'accuracy_{k}'] = float(correct_for_maxtopk[:, :k].sum() / correct_for_maxtopk.shape[0]) 31 | 32 | # avg precision, average roc_auc, and dprime 33 | targets = torch.nn.functional.one_hot(targets, num_classes=num_cls) 34 | 35 | # ids of the predicted classes (same as softmax) 36 | targets_pred = torch.softmax(outputs, dim=1) 37 | 38 | targets = targets.numpy() 39 | targets_pred = targets_pred.numpy() 40 | 41 | # one-vs-rest 42 | avg_p = [average_precision_score(targets[:, c], targets_pred[:, c], average=None) for c in range(num_cls)] 43 | try: 44 | roc_aucs = [roc_auc_score(targets[:, c], targets_pred[:, c], average=None) for c in range(num_cls)] 45 | except ValueError: 46 | logger.warning('Weird... Some classes never occured in targets. Do not trust the metrics.') 47 | roc_aucs = np.array([0.5]) 48 | avg_p = np.array([0]) 49 | 50 | metrics_dict['mAP'] = np.mean(avg_p) 51 | metrics_dict['mROCAUC'] = np.mean(roc_aucs) 52 | # Percent point function (ppf) (inverse of cdf — percentiles). 
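# d-prime here is derived from the mean ROC-AUC: dprime = sqrt(2) * ppf(mROCAUC).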
53 | metrics_dict['dprime'] = scipy.stats.norm().ppf(metrics_dict['mROCAUC']) * np.sqrt(2) 54 | 55 | return metrics_dict 56 | 57 | 58 | if __name__ == '__main__': 59 | targets = torch.tensor([3, 3, 1, 2, 1, 0]) 60 | outputs = torch.tensor([ 61 | [1.2, 1.3, 1.1, 1.5], 62 | [1.3, 1.4, 1.0, 1.1], 63 | [1.5, 1.1, 1.4, 1.3], 64 | [1.0, 1.2, 1.4, 1.5], 65 | [1.2, 1.3, 1.1, 1.1], 66 | [1.2, 1.1, 1.1, 1.1], 67 | ]).float() 68 | metrics_dict = metrics(targets, outputs, topk=(1, 3)) 69 | print(metrics_dict) 70 | -------------------------------------------------------------------------------- /text_to_audio/Make_An_Audio/ldm/modules/losses_audio/vggishish/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | 5 | class VGGishish(nn.Module): 6 | 7 | def __init__(self, conv_layers, use_bn, num_classes): 8 | ''' 9 | Mostly from 10 | https://pytorch.org/vision/0.8/_modules/torchvision/models/vgg.html 11 | ''' 12 | super().__init__() 13 | layers = [] 14 | in_channels = 1 15 | 16 | # a list of channels with 'MP' (maxpool) from config 17 | for v in conv_layers: 18 | if v == 'MP': 19 | layers += [nn.MaxPool2d(kernel_size=2, stride=2)] 20 | else: 21 | conv2d = nn.Conv2d(in_channels, v, kernel_size=3, padding=1, stride=1) 22 | if use_bn: 23 | layers += [conv2d, nn.BatchNorm2d(v), nn.ReLU(inplace=True)] 24 | else: 25 | layers += [conv2d, nn.ReLU(inplace=True)] 26 | in_channels = v 27 | self.features = nn.Sequential(*layers) 28 | 29 | self.avgpool = nn.AdaptiveAvgPool2d((5, 10)) 30 | 31 | self.flatten = nn.Flatten() 32 | self.classifier = nn.Sequential( 33 | nn.Linear(512 * 5 * 10, 4096), 34 | nn.ReLU(True), 35 | nn.Linear(4096, 4096), 36 | nn.ReLU(True), 37 | nn.Linear(4096, num_classes) 38 | ) 39 | 40 | # weight init 41 | self.reset_parameters() 42 | 43 | def forward(self, x): 44 | # adding channel dim for conv2d (B, 1, F, T) <- 45 | x = x.unsqueeze(1) 46 | # backbone (B, 1, 5, 53) <- (B, 1, 80, 860) 47 | x = self.features(x) 48 | # adaptive avg pooling (B, 1, 5, 10) <- (B, 1, 5, 53) – if no MP is used as the end of VGG 49 | x = self.avgpool(x) 50 | # flatten 51 | x = self.flatten(x) 52 | # classify 53 | x = self.classifier(x) 54 | return x 55 | 56 | def reset_parameters(self): 57 | for m in self.modules(): 58 | if isinstance(m, nn.Conv2d): 59 | nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') 60 | if m.bias is not None: 61 | nn.init.constant_(m.bias, 0) 62 | elif isinstance(m, nn.BatchNorm2d): 63 | nn.init.constant_(m.weight, 1) 64 | nn.init.constant_(m.bias, 0) 65 | elif isinstance(m, nn.Linear): 66 | nn.init.normal_(m.weight, 0, 0.01) 67 | nn.init.constant_(m.bias, 0) 68 | 69 | 70 | if __name__ == '__main__': 71 | num_classes = 309 72 | inputs = torch.rand(3, 80, 848) 73 | conv_layers = [64, 64, 'MP', 128, 128, 'MP', 256, 256, 256, 'MP', 512, 512, 512, 'MP', 512, 512, 512] 74 | # conv_layers = [64, 'MP', 128, 'MP', 256, 256, 'MP', 512, 512, 'MP'] 75 | model = VGGishish(conv_layers, use_bn=False, num_classes=num_classes) 76 | outputs = model(inputs) 77 | print(outputs.shape) 78 | -------------------------------------------------------------------------------- /text_to_audio/Make_An_Audio/useful_ckpts/CLAP/config.yml: -------------------------------------------------------------------------------- 1 | # TEXT ENCODER CONFIG 2 | text_model: 'bert-base-uncased' 3 | text_len: 100 4 | transformer_embed_dim: 768 5 | freeze_text_encoder_weights: True 6 | 7 | # AUDIO ENCODER CONFIG 8 | audioenc_name: 'Cnn14' 9 
| out_emb: 2048 10 | sampling_rate: 44100 11 | duration: 9 12 | fmin: 50 13 | fmax: 14000 14 | n_fft: 1028 15 | hop_size: 320 16 | mel_bins: 64 17 | window_size: 1024 18 | 19 | # PROJECTION SPACE CONFIG 20 | d_proj: 1024 21 | temperature: 0.003 22 | 23 | # TRAINING AND EVALUATION CONFIG 24 | num_classes: 527 25 | batch_size: 1024 26 | demo: False 27 | -------------------------------------------------------------------------------- /text_to_audio/Make_An_Audio/vocoder/bigvgan/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIGC-Audio/AudioGPT/a674543c537bb3a60b8f521df76b97a627c1c379/text_to_audio/Make_An_Audio/vocoder/bigvgan/__init__.py -------------------------------------------------------------------------------- /text_to_audio/Make_An_Audio/vocoder/bigvgan/alias_free_torch/__init__.py: -------------------------------------------------------------------------------- 1 | # Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0 2 | # LICENSE is in incl_licenses directory. 3 | 4 | from .filter import * 5 | from .resample import * 6 | from .act import * -------------------------------------------------------------------------------- /text_to_audio/Make_An_Audio/vocoder/bigvgan/alias_free_torch/act.py: -------------------------------------------------------------------------------- 1 | # Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0 2 | # LICENSE is in incl_licenses directory. 3 | 4 | import torch.nn as nn 5 | from .resample import UpSample1d, DownSample1d 6 | 7 | 8 | class Activation1d(nn.Module): 9 | def __init__(self, 10 | activation, 11 | up_ratio: int = 2, 12 | down_ratio: int = 2, 13 | up_kernel_size: int = 12, 14 | down_kernel_size: int = 12): 15 | super().__init__() 16 | self.up_ratio = up_ratio 17 | self.down_ratio = down_ratio 18 | self.act = activation 19 | self.upsample = UpSample1d(up_ratio, up_kernel_size) 20 | self.downsample = DownSample1d(down_ratio, down_kernel_size) 21 | 22 | # x: [B,C,T] 23 | def forward(self, x): 24 | x = self.upsample(x) 25 | x = self.act(x) 26 | x = self.downsample(x) 27 | 28 | return x -------------------------------------------------------------------------------- /text_to_audio/Make_An_Audio/vocoder/bigvgan/alias_free_torch/resample.py: -------------------------------------------------------------------------------- 1 | # Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0 2 | # LICENSE is in incl_licenses directory. 
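# The classes below implement anti-aliased resampling: UpSample1d interpolates with a
# Kaiser-windowed sinc filter via a grouped transposed convolution, and DownSample1d
# applies a matching low-pass filter while striding by the ratio.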
3 | 4 | import torch.nn as nn 5 | from torch.nn import functional as F 6 | from .filter import LowPassFilter1d 7 | from .filter import kaiser_sinc_filter1d 8 | 9 | 10 | class UpSample1d(nn.Module): 11 | def __init__(self, ratio=2, kernel_size=None): 12 | super().__init__() 13 | self.ratio = ratio 14 | self.kernel_size = int(6 * ratio // 2) * 2 if kernel_size is None else kernel_size 15 | self.stride = ratio 16 | self.pad = self.kernel_size // ratio - 1 17 | self.pad_left = self.pad * self.stride + (self.kernel_size - self.stride) // 2 18 | self.pad_right = self.pad * self.stride + (self.kernel_size - self.stride + 1) // 2 19 | filter = kaiser_sinc_filter1d(cutoff=0.5 / ratio, 20 | half_width=0.6 / ratio, 21 | kernel_size=self.kernel_size) 22 | self.register_buffer("filter", filter) 23 | 24 | # x: [B, C, T] 25 | def forward(self, x): 26 | _, C, _ = x.shape 27 | 28 | x = F.pad(x, (self.pad, self.pad), mode='replicate') 29 | x = self.ratio * F.conv_transpose1d( 30 | x, self.filter.expand(C, -1, -1), stride=self.stride, groups=C) 31 | x = x[..., self.pad_left:-self.pad_right] 32 | 33 | return x 34 | 35 | 36 | class DownSample1d(nn.Module): 37 | def __init__(self, ratio=2, kernel_size=None): 38 | super().__init__() 39 | self.ratio = ratio 40 | self.kernel_size = int(6 * ratio // 2) * 2 if kernel_size is None else kernel_size 41 | self.lowpass = LowPassFilter1d(cutoff=0.5 / ratio, 42 | half_width=0.6 / ratio, 43 | stride=ratio, 44 | kernel_size=self.kernel_size) 45 | 46 | def forward(self, x): 47 | xx = self.lowpass(x) 48 | 49 | return xx -------------------------------------------------------------------------------- /text_to_audio/Make_An_Audio/vocoder/logs/hifi_0127/args.yml: -------------------------------------------------------------------------------- 1 | adam_b1: 0.8 2 | adam_b2: 0.99 3 | batch_size: 24 4 | dist_config: 5 | dist_backend: nccl 6 | dist_url: tcp://localhost:54321 7 | world_size: 1 8 | fmax: 8000 9 | fmax_for_loss: null 10 | fmin: 0 11 | hop_size: 256 12 | learning_rate: 0.0002 13 | lr_decay: 0.999 14 | n_fft: 1024 15 | num_gpus: 0 16 | num_mels: 80 17 | num_workers: 4 18 | resblock: '1' 19 | resblock_dilation_sizes: 20 | - - 1 21 | - 3 22 | - 5 23 | - - 1 24 | - 3 25 | - 5 26 | - - 1 27 | - 3 28 | - 5 29 | resblock_kernel_sizes: 30 | - 3 31 | - 7 32 | - 11 33 | sampling_rate: 16000 34 | seed: 1234 35 | segment_size: 8192 36 | upsample_initial_channel: 512 37 | upsample_kernel_sizes: 38 | - 16 39 | - 16 40 | - 4 41 | - 4 42 | upsample_rates: 43 | - 8 44 | - 8 45 | - 2 46 | - 2 47 | win_size: 1024 48 | -------------------------------------------------------------------------------- /text_to_audio/Make_An_Audio/wav_evaluation/models/__init__.py: -------------------------------------------------------------------------------- 1 | from . import clap 2 | from . import audio 3 | from . 
import utils -------------------------------------------------------------------------------- /text_to_audio/Make_An_Audio/wav_evaluation/models/utils.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import yaml 3 | import sys 4 | 5 | def read_config_as_args(config_path,args=None,is_config_str=False): 6 | return_dict = {} 7 | 8 | if config_path is not None: 9 | if is_config_str: 10 | yml_config = yaml.load(config_path, Loader=yaml.FullLoader) 11 | else: 12 | with open(config_path, "r") as f: 13 | yml_config = yaml.load(f, Loader=yaml.FullLoader) 14 | 15 | if args != None: 16 | for k, v in yml_config.items(): 17 | if k in args.__dict__: 18 | args.__dict__[k] = v 19 | else: 20 | sys.stderr.write("Ignored unknown parameter {} in yaml.\n".format(k)) 21 | else: 22 | for k, v in yml_config.items(): 23 | return_dict[k] = v 24 | 25 | args = args if args != None else return_dict 26 | return argparse.Namespace(**args) 27 | --------------------------------------------------------------------------------
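As a closing illustration (not part of the repository), here is a minimal usage sketch for read_config_as_args as defined above; the inline YAML string and the import path are assumptions made for the example, and a real run would instead point it at a config such as useful_ckpts/CLAP/config.yml.

from wav_evaluation.models.utils import read_config_as_args

# Tiny illustrative config; any YAML mapping is handled the same way.
yaml_str = """
text_model: 'bert-base-uncased'
sampling_rate: 44100
d_proj: 1024
"""

# With is_config_str=True the string itself is parsed as YAML; with args=None every
# top-level key becomes an attribute of the returned argparse.Namespace.
args = read_config_as_args(yaml_str, is_config_str=True)
print(args.text_model, args.sampling_rate, args.d_proj)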