├── .gitignore ├── LICENSE ├── README.md ├── configs ├── lvcgan.v1.yaml ├── parallel_wavegan.v1.yaml └── pwg.v1.yaml ├── samples ├── 0001_lvc.wav ├── 0001_pwg.wav ├── 0001_real.wav ├── 0002_lvc.wav ├── 0002_pwg.wav ├── 0002_real.wav ├── 0003_lvc.wav ├── 0003_pwg.wav ├── 0003_real.wav ├── 0004_lvc.wav ├── 0004_pwg.wav ├── 0004_real.wav ├── 0005_gpwg.wav ├── 0005_lvc.wav ├── 0005_real.wav ├── 0006_lvc.wav ├── 0006_pwg.wav ├── 0006_real.wav ├── 0007_lvc.wav ├── 0007_pwg.wav ├── 0007_real.wav ├── 0008_lvc.wav ├── 0008_pwg.wav ├── 0008_real.wav ├── 0009_lvc.wav ├── 0009_pwg.wav ├── 0009_real.wav ├── 0010_lvc.wav ├── 0010_pwg.wav ├── 0010_real.wav ├── evaluate-loss.png └── train-loss.png ├── scripts └── preprocess.sh ├── test ├── __init__.py ├── test_dataset.py ├── test_log.py └── test_others.ipynb └── vocoder ├── __init__.py ├── audio ├── __init__.py ├── mel.py ├── stft.py └── util.py ├── datasets ├── __init__.py ├── audio_mel.py └── utils.py ├── hparams.py ├── inference.py ├── layers ├── __init__.py ├── causal_conv.py ├── location_variable_conv.py ├── pqmf.py ├── residual_block.py ├── residual_stack.py └── upsample.py ├── losses ├── __init__.py ├── pwg_loss.py └── stft_loss.py ├── models ├── __init__.py ├── lvcgan.py ├── lvcnet.py ├── melgan.py └── parallel_wavegan.py ├── optimizers ├── __init__.py ├── pwg_opt.py └── radam.py ├── preprocess.py ├── strategy ├── __init__.py ├── base.py └── pwg_strategy.py ├── test.py ├── train.py └── utils ├── __init__.py └── log.py /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | exps/ 3 | temp/ 4 | 5 | # Byte-compiled / optimized / DLL files 6 | __pycache__/ 7 | *.py[cod] 8 | *$py.class 9 | 10 | # C extensions 11 | *.so 12 | 13 | # Distribution / packaging 14 | .Python 15 | build/ 16 | develop-eggs/ 17 | dist/ 18 | downloads/ 19 | eggs/ 20 | .eggs/ 21 | lib/ 22 | lib64/ 23 | parts/ 24 | sdist/ 25 | var/ 26 | wheels/ 27 | pip-wheel-metadata/ 28 | share/python-wheels/ 29 | *.egg-info/ 30 | .installed.cfg 31 | *.egg 32 | MANIFEST 33 | 34 | # PyInstaller 35 | # Usually these files are written by a python script from a template 36 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 37 | *.manifest 38 | *.spec 39 | 40 | # Installer logs 41 | pip-log.txt 42 | pip-delete-this-directory.txt 43 | 44 | # Unit test / coverage reports 45 | htmlcov/ 46 | .tox/ 47 | .nox/ 48 | .coverage 49 | .coverage.* 50 | .cache 51 | nosetests.xml 52 | coverage.xml 53 | *.cover 54 | *.py,cover 55 | .hypothesis/ 56 | .pytest_cache/ 57 | 58 | # Translations 59 | *.mo 60 | *.pot 61 | 62 | # Django stuff: 63 | *.log 64 | local_settings.py 65 | db.sqlite3 66 | db.sqlite3-journal 67 | 68 | # Flask stuff: 69 | instance/ 70 | .webassets-cache 71 | 72 | # Scrapy stuff: 73 | .scrapy 74 | 75 | # Sphinx documentation 76 | docs/_build/ 77 | 78 | # PyBuilder 79 | target/ 80 | 81 | # Jupyter Notebook 82 | .ipynb_checkpoints 83 | 84 | # IPython 85 | profile_default/ 86 | ipython_config.py 87 | 88 | # pyenv 89 | .python-version 90 | 91 | # pipenv 92 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 93 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 94 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 95 | # install all needed dependencies. 96 | #Pipfile.lock 97 | 98 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 99 | __pypackages__/ 100 | 101 | # Celery stuff 102 | celerybeat-schedule 103 | celerybeat.pid 104 | 105 | # SageMath parsed files 106 | *.sage.py 107 | 108 | # Environments 109 | .env 110 | .venv 111 | env/ 112 | venv/ 113 | ENV/ 114 | env.bak/ 115 | venv.bak/ 116 | 117 | # Spyder project settings 118 | .spyderproject 119 | .spyproject 120 | 121 | # Rope project settings 122 | .ropeproject 123 | 124 | # mkdocs documentation 125 | /site 126 | 127 | # mypy 128 | .mypy_cache/ 129 | .dmypy.json 130 | dmypy.json 131 | 132 | # Pyre type checker 133 | .pyre/ -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | # LVCNet: Efficient Condition-Dependent Modeling Network for Waveform Generation 3 | 4 | Using LVCNet to design the generator of Parallel WaveGAN and the *same strategy* to train it, 5 | the inference speed of the new vocoder is **more than 5x faster** than the original vocoder 6 | *without any degradation in audio quality*. 
7 | 8 | Our current work [[Paper](https://arxiv.org/abs/2102.10815)] has been accepted by ICASSP 2021, and our previous work is described in [MelGlow](https://arxiv.org/abs/2012.01684). 9 | 10 | ## Training and Test 11 | 12 | 1. Prepare the data: download the `LJSpeech` dataset from https://keithito.com/LJ-Speech-Dataset/, 13 | and save it in `data/LJSpeech-1.1`. Then run 14 | ```python 15 | python -m vocoder.preprocess --data-dir ./data/LJSpeech-1.1 --config configs/lvcgan.v1.yaml 16 | ``` 17 | The mel-spectrums are calculated and saved in the folder `temp/`. 18 | 19 | 2. Train LVCNet 20 | ```python 21 | python -m vocoder.train --config configs/lvcgan.v1.yaml --exp-dir exps/exp.lvcgan.v1 22 | ``` 23 | 24 | 3. Test LVCNet 25 | ```python 26 | python -m vocoder.test --config configs/lvcgan.v1.yaml --exp-dir exps/exp.lvcgan.v1 27 | ``` 28 | 29 | 4. The experimental results, including training logs, model checkpoints, and synthesized audio, are stored in the folder `exps/exp.lvcgan.v1/`. 30 | Similarly, you can also use the config file `configs/pwg.v1.yaml` to train a Parallel WaveGAN model. 31 | ```python 32 | # training 33 | python -m vocoder.train --config configs/pwg.v1.yaml --exp-dir exps/exp.pwg.v1 34 | # test 35 | python -m vocoder.test --config configs/pwg.v1.yaml --exp-dir exps/exp.pwg.v1 36 | ``` 37 | 38 | ## Results 39 | 40 | ### TensorBoard 41 | 42 | Use TensorBoard to view the training process: 43 | 44 | ``` 45 | tensorboard --logdir exps 46 | ``` 47 | 48 | ### Training Loss 49 | ![image](samples/train-loss.png) 50 | 51 | ### Evaluation Loss 52 | ![image](samples/evaluate-loss.png) 53 | 54 | 55 | ### Audio Samples 56 | 57 | Audio samples are saved in `samples/`, where 58 | - `samples/*_lvc.wav` are generated by LVCNet, 59 | - `samples/*_pwg.wav` are generated by Parallel WaveGAN, 60 | - `samples/*_real.wav` are the real recordings. 61 | 62 | 63 | ## Reference 64 | > LVCNet: Efficient Condition-Dependent Modeling Network for Waveform Generation, https://arxiv.org/abs/2102.10815 65 | > MelGlow: Efficient Waveform Generative Network Based on Location-Variable Convolution, https://arxiv.org/abs/2012.01684 66 | > https://github.com/kan-bayashi/ParallelWaveGAN 67 | > https://github.com/lmnt-com/diffwave 68 | -------------------------------------------------------------------------------- /configs/lvcgan.v1.yaml: -------------------------------------------------------------------------------- 1 | ########################################################### 2 | # AUDIO & MEL-SPECTRUM # 3 | ########################################################### 4 | sample_rate: 22050 5 | hop_length: 256 6 | win_length: 1024 7 | n_fft: 1024 8 | n_mels: 80 9 | mel_fmin: 70 10 | mel_fmax: 8000 11 | 12 | 13 | ########################################################### 14 | # MODEL SETTING # 15 | ########################################################### 16 | model_name: "LVCNetWaveGAN" 17 | model_params: 18 | generator_params: 19 | in_channels: 1 # Number of input channels. 20 | out_channels: 1 # Number of output channels. 21 | inner_channels: 8 22 | cond_channels: 80 23 | cond_hop_length: 256 24 | lvc_block_nums: 3 25 | lvc_layers_each_block: 10 26 | lvc_kernel_size: 3 27 | kpnet_hidden_channels: 64 28 | kpnet_conv_size: 1 29 | dropout: 0.0 30 | use_weight_norm: true # Whether to use weight norm. 31 | # If set to true, it will be applied to all of the conv layers. 32 | 33 | discriminator_params: 34 | in_channels: 1 # Number of input channels. 35 | out_channels: 1 # Number of output channels.
36 | kernel_size: 3 # Number of output channels. 37 | layers: 10 # Number of conv layers. 38 | conv_channels: 64 # Number of chnn layers. 39 | bias: true # Whether to use bias parameter in conv. 40 | use_weight_norm: true # Whether to use weight norm. 41 | # If set to true, it will be applied to all of the conv layers. 42 | nonlinear_activation: "LeakyReLU" # Nonlinear function after each conv. 43 | nonlinear_activation_params: # Nonlinear function parameters 44 | negative_slope: 0.2 # Alpha in LeakyReLU. 45 | 46 | 47 | ########################################################### 48 | # LOSS SETTING # 49 | ########################################################### 50 | loss_name: "PWGLoss" 51 | loss_params: 52 | stft_loss_params: 53 | fft_sizes: [1024, 2048, 512] # List of FFT size for STFT-based loss. 54 | hop_sizes: [120, 240, 50] # List of hop size for STFT-based loss 55 | win_lengths: [600, 1200, 240] # List of window length for STFT-based loss. 56 | window: "hann_window" # Window function for STFT-based loss 57 | 58 | 59 | ########################################################### 60 | # OPTIMIZER & SCHEDULER SETTING # 61 | ########################################################### 62 | opt_name: "PWGOptimizer" 63 | opt_params: 64 | generator_optimizer_params: 65 | lr: 0.0001 # Generator's learning rate. 66 | eps: 1.0e-6 # Generator's epsilon. 67 | weight_decay: 0.0 # Generator's weight decay coefficient. 68 | generator_scheduler_params: 69 | step_size: 200000 # Generator's scheduler step size. 70 | gamma: 0.5 # Generator's scheduler gamma. 71 | # At each step size, lr will be multiplied by this parameter. 72 | discriminator_optimizer_params: 73 | lr: 0.00005 # Discriminator's learning rate. 74 | eps: 1.0e-6 # Discriminator's epsilon. 75 | weight_decay: 0.0 # Discriminator's weight decay coefficient. 76 | discriminator_scheduler_params: 77 | step_size: 200000 # Discriminator's scheduler step size. 78 | gamma: 0.5 # Discriminator's scheduler gamma. 79 | # At each step size, lr will be multiplied by this parameter. 80 | 81 | 82 | ########################################################### 83 | # STRATEGY SETTING # 84 | ########################################################### 85 | strategy_name: "PWGStrategy" 86 | strategy_params: 87 | lambda_adv: 4.0 88 | discriminator_start_steps: 100000 89 | generator_grad_norm: 10 # Generator's gradient norm. 90 | discriminator_grad_norm: 1 # Discriminator's gradient norm. 91 | 92 | 93 | ########################################################### 94 | # TRANINING SETTING # 95 | ########################################################### 96 | dataset_classname: "PWGAudioMelNoiseDataset" 97 | dataset_num_workers: 5 98 | batch_mel_length: 48 99 | train_batch_size: 8 100 | max_train_steps: 800000 101 | log_interval_steps: 100 102 | 103 | 104 | ########################################################### 105 | # EVALUATE & TEST # 106 | ########################################################### 107 | eval_interval_steps: 1000 108 | 109 | 110 | -------------------------------------------------------------------------------- /configs/parallel_wavegan.v1.yaml: -------------------------------------------------------------------------------- 1 | # This is the hyperparameter configuration file for Parallel WaveGAN. 2 | # Please make sure this is adjusted for the LJSpeech dataset. If you want to 3 | # apply to the other dataset, you might need to carefully change some parameters. 
4 | # This configuration requires 12 GB GPU memory and takes ~3 days on TITAN V. 5 | 6 | ########################################################### 7 | # FEATURE EXTRACTION SETTING # 8 | ########################################################### 9 | sampling_rate: 22050 # Sampling rate. 10 | fft_size: 1024 # FFT size. 11 | hop_size: 256 # Hop size. 12 | win_length: null # Window length. 13 | # If set to null, it will be the same as fft_size. 14 | window: "hann" # Window function. 15 | num_mels: 80 # Number of mel basis. 16 | fmin: 80 # Minimum freq in mel basis calculation. 17 | fmax: 7600 # Maximum frequency in mel basis calculation. 18 | global_gain_scale: 1.0 # Will be multiplied to all of waveform. 19 | trim_silence: true # Whether to trim the start and end of silence. 20 | trim_threshold_in_db: 60 # Need to tune carefully if the recording is not good. 21 | trim_frame_size: 2048 # Frame size in trimming. 22 | trim_hop_size: 512 # Hop size in trimming. 23 | format: "hdf5" # Feature file format. "npy" or "hdf5" is supported. 24 | 25 | ########################################################### 26 | # GENERATOR NETWORK ARCHITECTURE SETTING # 27 | ########################################################### 28 | generator_params: 29 | in_channels: 1 # Number of input channels. 30 | out_channels: 1 # Number of output channels. 31 | kernel_size: 3 # Kernel size of dilated convolution. 32 | layers: 30 # Number of residual block layers. 33 | stacks: 3 # Number of stacks i.e., dilation cycles. 34 | residual_channels: 64 # Number of channels in residual conv. 35 | gate_channels: 128 # Number of channels in gated conv. 36 | skip_channels: 64 # Number of channels in skip conv. 37 | aux_channels: 80 # Number of channels for auxiliary feature conv. 38 | # Must be the same as num_mels. 39 | aux_context_window: 2 # Context window size for auxiliary feature. 40 | # If set to 2, previous 2 and future 2 frames will be considered. 41 | dropout: 0.0 # Dropout rate. 0.0 means no dropout applied. 42 | use_weight_norm: true # Whether to use weight norm. 43 | # If set to true, it will be applied to all of the conv layers. 44 | upsample_net: "ConvInUpsampleNetwork" # Upsampling network architecture. 45 | upsample_params: # Upsampling network parameters. 46 | upsample_scales: [4, 4, 4, 4] # Upsampling scales. Prodcut of these must be the same as hop size. 47 | 48 | ########################################################### 49 | # DISCRIMINATOR NETWORK ARCHITECTURE SETTING # 50 | ########################################################### 51 | discriminator_params: 52 | in_channels: 1 # Number of input channels. 53 | out_channels: 1 # Number of output channels. 54 | kernel_size: 3 # Number of output channels. 55 | layers: 10 # Number of conv layers. 56 | conv_channels: 64 # Number of chnn layers. 57 | bias: true # Whether to use bias parameter in conv. 58 | use_weight_norm: true # Whether to use weight norm. 59 | # If set to true, it will be applied to all of the conv layers. 60 | nonlinear_activation: "LeakyReLU" # Nonlinear function after each conv. 61 | nonlinear_activation_params: # Nonlinear function parameters 62 | negative_slope: 0.2 # Alpha in LeakyReLU. 63 | 64 | ########################################################### 65 | # STFT LOSS SETTING # 66 | ########################################################### 67 | stft_loss_params: 68 | fft_sizes: [1024, 2048, 512] # List of FFT size for STFT-based loss. 
69 | hop_sizes: [120, 240, 50] # List of hop size for STFT-based loss 70 | win_lengths: [600, 1200, 240] # List of window length for STFT-based loss. 71 | window: "hann_window" # Window function for STFT-based loss 72 | 73 | ########################################################### 74 | # ADVERSARIAL LOSS SETTING # 75 | ########################################################### 76 | lambda_adv: 4.0 # Loss balancing coefficient. 77 | 78 | ########################################################### 79 | # DATA LOADER SETTING # 80 | ########################################################### 81 | batch_size: 6 # Batch size. 82 | batch_max_steps: 25600 # Length of each audio in batch. Make sure dividable by hop_size. 83 | pin_memory: true # Whether to pin memory in Pytorch DataLoader. 84 | num_workers: 2 # Number of workers in Pytorch DataLoader. 85 | remove_short_samples: true # Whether to remove samples the length of which are less than batch_max_steps. 86 | allow_cache: true # Whether to allow cache in dataset. If true, it requires cpu memory. 87 | 88 | ########################################################### 89 | # OPTIMIZER & SCHEDULER SETTING # 90 | ########################################################### 91 | generator_optimizer_params: 92 | lr: 0.0001 # Generator's learning rate. 93 | eps: 1.0e-6 # Generator's epsilon. 94 | weight_decay: 0.0 # Generator's weight decay coefficient. 95 | generator_scheduler_params: 96 | step_size: 200000 # Generator's scheduler step size. 97 | gamma: 0.5 # Generator's scheduler gamma. 98 | # At each step size, lr will be multiplied by this parameter. 99 | generator_grad_norm: 10 # Generator's gradient norm. 100 | discriminator_optimizer_params: 101 | lr: 0.00005 # Discriminator's learning rate. 102 | eps: 1.0e-6 # Discriminator's epsilon. 103 | weight_decay: 0.0 # Discriminator's weight decay coefficient. 104 | discriminator_scheduler_params: 105 | step_size: 200000 # Discriminator's scheduler step size. 106 | gamma: 0.5 # Discriminator's scheduler gamma. 107 | # At each step size, lr will be multiplied by this parameter. 108 | discriminator_grad_norm: 1 # Discriminator's gradient norm. 109 | 110 | ########################################################### 111 | # INTERVAL SETTING # 112 | ########################################################### 113 | discriminator_train_start_steps: 100000 # Number of steps to start to train discriminator. 114 | train_max_steps: 400000 # Number of training steps. 115 | save_interval_steps: 5000 # Interval steps to save checkpoint. 116 | eval_interval_steps: 1000 # Interval steps to evaluate the network. 117 | log_interval_steps: 100 # Interval steps to record the training log. 118 | 119 | ########################################################### 120 | # OTHER SETTING # 121 | ########################################################### 122 | num_save_intermediate_results: 4 # Number of results to be saved as intermediate results. 
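The configuration above is plain YAML, so it can be inspected or reused directly from Python. Below is a minimal sketch of loading it; `yaml.safe_load` is standard PyYAML, while the commented-out generator import and constructor are only an assumption about how `generator_params` might be consumed, not code taken from this repository.

```python
# Minimal sketch: load a vocoder config and read a few settings.
# Assumes PyYAML is installed; the generator class at the end is hypothetical
# and only illustrates how `generator_params` could be passed to a model.
import yaml

with open("configs/parallel_wavegan.v1.yaml") as f:
    config = yaml.safe_load(f)  # plain dict of the settings above

print(config["sampling_rate"], config["hop_size"], config["num_mels"])

# Hypothetical use of the generator parameters:
# from vocoder.models.parallel_wavegan import ParallelWaveGANGenerator
# generator = ParallelWaveGANGenerator(**config["generator_params"])
```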
123 | -------------------------------------------------------------------------------- /configs/pwg.v1.yaml: -------------------------------------------------------------------------------- 1 | ########################################################### 2 | # AUDIO & MEL-SPECTRUM # 3 | ########################################################### 4 | sample_rate: 22050 5 | hop_length: 256 6 | win_length: 1024 7 | n_fft: 1024 8 | n_mels: 80 9 | mel_fmin: 70 10 | mel_fmax: 8000 11 | 12 | 13 | ########################################################### 14 | # MODEL SETTING # 15 | ########################################################### 16 | model_name: "ParallelWaveGAN" 17 | model_params: 18 | generator_params: 19 | in_channels: 1 # Number of input channels. 20 | out_channels: 1 # Number of output channels. 21 | kernel_size: 3 # Kernel size of dilated convolution. 22 | layers: 30 # Number of residual block layers. 23 | stacks: 3 # Number of stacks i.e., dilation cycles. 24 | residual_channels: 64 # Number of channels in residual conv. 25 | gate_channels: 128 # Number of channels in gated conv. 26 | skip_channels: 64 # Number of channels in skip conv. 27 | aux_channels: 80 # Number of channels for auxiliary feature conv. 28 | # Must be the same as num_mels. 29 | aux_context_window: 2 # Context window size for auxiliary feature. 30 | # If set to 2, previous 2 and future 2 frames will be considered. 31 | dropout: 0.0 # Dropout rate. 0.0 means no dropout applied. 32 | use_weight_norm: true # Whether to use weight norm. 33 | # If set to true, it will be applied to all of the conv layers. 34 | upsample_net: "ConvInUpsampleNetwork" # Upsampling network architecture. 35 | upsample_params: # Upsampling network parameters. 36 | upsample_scales: [4, 4, 4, 4] # Upsampling scales. Product of these must be the same as hop size. 37 | 38 | discriminator_params: 39 | in_channels: 1 # Number of input channels. 40 | out_channels: 1 # Number of output channels. 41 | kernel_size: 3 # Kernel size of conv layers. 42 | layers: 10 # Number of conv layers. 43 | conv_channels: 64 # Number of channels in conv layers. 44 | bias: true # Whether to use bias parameter in conv. 45 | use_weight_norm: true # Whether to use weight norm. 46 | # If set to true, it will be applied to all of the conv layers. 47 | nonlinear_activation: "LeakyReLU" # Nonlinear function after each conv. 48 | nonlinear_activation_params: # Nonlinear function parameters 49 | negative_slope: 0.2 # Alpha in LeakyReLU. 50 | 51 | 52 | ########################################################### 53 | # LOSS SETTING # 54 | ########################################################### 55 | loss_name: "PWGLoss" 56 | loss_params: 57 | stft_loss_params: 58 | fft_sizes: [1024, 2048, 512] # List of FFT sizes for STFT-based loss. 59 | hop_sizes: [120, 240, 50] # List of hop sizes for STFT-based loss. 60 | win_lengths: [600, 1200, 240] # List of window lengths for STFT-based loss. 61 | window: "hann_window" # Window function for STFT-based loss. 62 | 63 | 64 | ########################################################### 65 | # OPTIMIZER & SCHEDULER SETTING # 66 | ########################################################### 67 | opt_name: "PWGOptimizer" 68 | opt_params: 69 | generator_optimizer_params: 70 | lr: 0.0001 # Generator's learning rate. 71 | eps: 1.0e-6 # Generator's epsilon. 72 | weight_decay: 0.0 # Generator's weight decay coefficient. 73 | generator_scheduler_params: 74 | step_size: 200000 # Generator's scheduler step size. 75 | gamma: 0.5 # Generator's scheduler gamma.
76 | # At each step size, lr will be multiplied by this parameter. 77 | discriminator_optimizer_params: 78 | lr: 0.00005 # Discriminator's learning rate. 79 | eps: 1.0e-6 # Discriminator's epsilon. 80 | weight_decay: 0.0 # Discriminator's weight decay coefficient. 81 | discriminator_scheduler_params: 82 | step_size: 200000 # Discriminator's scheduler step size. 83 | gamma: 0.5 # Discriminator's scheduler gamma. 84 | # At each step size, lr will be multiplied by this parameter. 85 | 86 | 87 | ########################################################### 88 | # STRATEGY SETTING # 89 | ########################################################### 90 | strategy_name: "PWGStrategy" 91 | strategy_params: 92 | lambda_adv: 4.0 93 | discriminator_start_steps: 100000 94 | generator_grad_norm: 10 # Generator's gradient norm. 95 | discriminator_grad_norm: 1 # Discriminator's gradient norm. 96 | 97 | 98 | ########################################################### 99 | # TRANINING SETTING # 100 | ########################################################### 101 | dataset_classname: "PWGAudioMelNoiseDataset" 102 | dataset_num_workers: 5 103 | batch_mel_length: 48 104 | train_batch_size: 8 105 | max_train_steps: 400000 106 | log_interval_steps: 100 107 | 108 | 109 | ########################################################### 110 | # EVALUATE & TEST # 111 | ########################################################### 112 | eval_interval_steps: 1000 113 | 114 | 115 | -------------------------------------------------------------------------------- /samples/0001_lvc.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zceng/LVCNet/e81e13f3479a4d85f498a02e42338ebe823a8b3d/samples/0001_lvc.wav -------------------------------------------------------------------------------- /samples/0001_pwg.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zceng/LVCNet/e81e13f3479a4d85f498a02e42338ebe823a8b3d/samples/0001_pwg.wav -------------------------------------------------------------------------------- /samples/0001_real.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zceng/LVCNet/e81e13f3479a4d85f498a02e42338ebe823a8b3d/samples/0001_real.wav -------------------------------------------------------------------------------- /samples/0002_lvc.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zceng/LVCNet/e81e13f3479a4d85f498a02e42338ebe823a8b3d/samples/0002_lvc.wav -------------------------------------------------------------------------------- /samples/0002_pwg.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zceng/LVCNet/e81e13f3479a4d85f498a02e42338ebe823a8b3d/samples/0002_pwg.wav -------------------------------------------------------------------------------- /samples/0002_real.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zceng/LVCNet/e81e13f3479a4d85f498a02e42338ebe823a8b3d/samples/0002_real.wav -------------------------------------------------------------------------------- /samples/0003_lvc.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zceng/LVCNet/e81e13f3479a4d85f498a02e42338ebe823a8b3d/samples/0003_lvc.wav 
-------------------------------------------------------------------------------- /samples/0003_pwg.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zceng/LVCNet/e81e13f3479a4d85f498a02e42338ebe823a8b3d/samples/0003_pwg.wav -------------------------------------------------------------------------------- /samples/0003_real.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zceng/LVCNet/e81e13f3479a4d85f498a02e42338ebe823a8b3d/samples/0003_real.wav -------------------------------------------------------------------------------- /samples/0004_lvc.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zceng/LVCNet/e81e13f3479a4d85f498a02e42338ebe823a8b3d/samples/0004_lvc.wav -------------------------------------------------------------------------------- /samples/0004_pwg.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zceng/LVCNet/e81e13f3479a4d85f498a02e42338ebe823a8b3d/samples/0004_pwg.wav -------------------------------------------------------------------------------- /samples/0004_real.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zceng/LVCNet/e81e13f3479a4d85f498a02e42338ebe823a8b3d/samples/0004_real.wav -------------------------------------------------------------------------------- /samples/0005_gpwg.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zceng/LVCNet/e81e13f3479a4d85f498a02e42338ebe823a8b3d/samples/0005_gpwg.wav -------------------------------------------------------------------------------- /samples/0005_lvc.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zceng/LVCNet/e81e13f3479a4d85f498a02e42338ebe823a8b3d/samples/0005_lvc.wav -------------------------------------------------------------------------------- /samples/0005_real.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zceng/LVCNet/e81e13f3479a4d85f498a02e42338ebe823a8b3d/samples/0005_real.wav -------------------------------------------------------------------------------- /samples/0006_lvc.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zceng/LVCNet/e81e13f3479a4d85f498a02e42338ebe823a8b3d/samples/0006_lvc.wav -------------------------------------------------------------------------------- /samples/0006_pwg.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zceng/LVCNet/e81e13f3479a4d85f498a02e42338ebe823a8b3d/samples/0006_pwg.wav -------------------------------------------------------------------------------- /samples/0006_real.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zceng/LVCNet/e81e13f3479a4d85f498a02e42338ebe823a8b3d/samples/0006_real.wav -------------------------------------------------------------------------------- /samples/0007_lvc.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zceng/LVCNet/e81e13f3479a4d85f498a02e42338ebe823a8b3d/samples/0007_lvc.wav 
-------------------------------------------------------------------------------- /samples/0007_pwg.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zceng/LVCNet/e81e13f3479a4d85f498a02e42338ebe823a8b3d/samples/0007_pwg.wav -------------------------------------------------------------------------------- /samples/0007_real.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zceng/LVCNet/e81e13f3479a4d85f498a02e42338ebe823a8b3d/samples/0007_real.wav -------------------------------------------------------------------------------- /samples/0008_lvc.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zceng/LVCNet/e81e13f3479a4d85f498a02e42338ebe823a8b3d/samples/0008_lvc.wav -------------------------------------------------------------------------------- /samples/0008_pwg.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zceng/LVCNet/e81e13f3479a4d85f498a02e42338ebe823a8b3d/samples/0008_pwg.wav -------------------------------------------------------------------------------- /samples/0008_real.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zceng/LVCNet/e81e13f3479a4d85f498a02e42338ebe823a8b3d/samples/0008_real.wav -------------------------------------------------------------------------------- /samples/0009_lvc.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zceng/LVCNet/e81e13f3479a4d85f498a02e42338ebe823a8b3d/samples/0009_lvc.wav -------------------------------------------------------------------------------- /samples/0009_pwg.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zceng/LVCNet/e81e13f3479a4d85f498a02e42338ebe823a8b3d/samples/0009_pwg.wav -------------------------------------------------------------------------------- /samples/0009_real.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zceng/LVCNet/e81e13f3479a4d85f498a02e42338ebe823a8b3d/samples/0009_real.wav -------------------------------------------------------------------------------- /samples/0010_lvc.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zceng/LVCNet/e81e13f3479a4d85f498a02e42338ebe823a8b3d/samples/0010_lvc.wav -------------------------------------------------------------------------------- /samples/0010_pwg.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zceng/LVCNet/e81e13f3479a4d85f498a02e42338ebe823a8b3d/samples/0010_pwg.wav -------------------------------------------------------------------------------- /samples/0010_real.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zceng/LVCNet/e81e13f3479a4d85f498a02e42338ebe823a8b3d/samples/0010_real.wav -------------------------------------------------------------------------------- /samples/evaluate-loss.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zceng/LVCNet/e81e13f3479a4d85f498a02e42338ebe823a8b3d/samples/evaluate-loss.png 
-------------------------------------------------------------------------------- /samples/train-loss.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zceng/LVCNet/e81e13f3479a4d85f498a02e42338ebe823a8b3d/samples/train-loss.png -------------------------------------------------------------------------------- /scripts/preprocess.sh: -------------------------------------------------------------------------------- 1 | 2 | python -m vocoder.preprocess \ 3 | --data-dir ../data/LJSpeech-1.1 \ 4 | --config configs/lvcgan.v1.yaml 5 | 6 | -------------------------------------------------------------------------------- /test/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zceng/LVCNet/e81e13f3479a4d85f498a02e42338ebe823a8b3d/test/__init__.py -------------------------------------------------------------------------------- /test/test_dataset.py: -------------------------------------------------------------------------------- 1 | 2 | import tqdm 3 | from vocoder.datasets import create_dataloader 4 | 5 | def test_mel_audio_dataset(): 6 | dataset_config = { 7 | 'metadata_file': 'temp/metadata.txt', 8 | 'hop_length': 256, 9 | 'batch_mel_length': 64 10 | } 11 | 12 | dataloader = create_dataloader( "AudioMelNoiseDataset", 13 | dataset_config=dataset_config, 14 | batch_size=4, 15 | shuffle=True, 16 | num_workers=4, 17 | drop_last=False ) 18 | for batch in tqdm.tqdm(dataloader): 19 | wavs, mels, noises = batch 20 | 21 | 22 | 23 | if __name__ == "__main__": 24 | test_mel_audio_dataset() 25 | -------------------------------------------------------------------------------- /test/test_log.py: -------------------------------------------------------------------------------- 1 | 2 | import os, logging, time 3 | from vocoder.utils.log import Logger 4 | 5 | def test_logger(): 6 | exp_dir = 'exps/exp-test' 7 | os.makedirs( exp_dir, exist_ok=True ) 8 | 9 | log = Logger( exp_dir ) 10 | log.info('log test finish.') 11 | 12 | while True: 13 | time.sleep(1) 14 | log.add_scalars('train', {'test': 0.2}, 1) 15 | 16 | if __name__ == "__main__": 17 | test_logger() 18 | 19 | 20 | -------------------------------------------------------------------------------- /test/test_others.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import sys\n", 10 | "sys.path.append('..')" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 2, 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "import torch \n", 20 | "import numpy as np" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 3, 26 | "metadata": {}, 27 | "outputs": [], 28 | "source": [ 29 | "mel_file = '../temp/mels/LJ001-0001.wav.mel.npy'\n", 30 | "\n", 31 | "a = np.load(mel_file)" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 4, 37 | "metadata": {}, 38 | "outputs": [ 39 | { 40 | "data": { 41 | "text/plain": [ 42 | "array([[0.13142769, 0.16869499, 0.21409377, ..., 0.14441383, 0.17254333,\n", 43 | " 0.14861533],\n", 44 | " [0.19932327, 0.26880988, 0.2584364 , ..., 0.5645433 , 0.59844124,\n", 45 | " 0.5185001 ],\n", 46 | " [0.30107734, 0.33759803, 0.33762237, ..., 0.6241323 , 0.66163576,\n", 47 | " 0.5661982 ],\n", 48 | " ...,\n", 49 | " [0.06722992, 0.22493614, 0.30000472, ..., 0.22025412, 
0.23424743,\n", 50 | " 0.17675872],\n", 51 | " [0.1122142 , 0.23285972, 0.28549805, ..., 0.22142944, 0.24440841,\n", 52 | " 0.19024192],\n", 53 | " [0.21358238, 0.2089363 , 0.18085976, ..., 0.25496688, 0.219793 ,\n", 54 | " 0.19626419]], dtype=float32)" 55 | ] 56 | }, 57 | "execution_count": 4, 58 | "metadata": {}, 59 | "output_type": "execute_result" 60 | } 61 | ], 62 | "source": [ 63 | "a" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": 5, 69 | "metadata": {}, 70 | "outputs": [ 71 | { 72 | "data": { 73 | "text/plain": [ 74 | "0.0" 75 | ] 76 | }, 77 | "execution_count": 5, 78 | "metadata": {}, 79 | "output_type": "execute_result" 80 | } 81 | ], 82 | "source": [ 83 | "a.min()" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": 7, 89 | "metadata": {}, 90 | "outputs": [ 91 | { 92 | "ename": "FileNotFoundError", 93 | "evalue": "[Errno 2] No such file or directory: '../exps/exp-20201018-032250/checkpoint.pt'", 94 | "output_type": "error", 95 | "traceback": [ 96 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 97 | "\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)", 98 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mtorch\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mload\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'../exps/exp-20201018-032250/checkpoint.pt'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", 99 | "\u001b[0;32m~/miniconda3/lib/python3.8/site-packages/torch/serialization.py\u001b[0m in \u001b[0;36mload\u001b[0;34m(f, map_location, pickle_module, **pickle_load_args)\u001b[0m\n\u001b[1;32m 579\u001b[0m \u001b[0mpickle_load_args\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'encoding'\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m'utf-8'\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 580\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 581\u001b[0;31m \u001b[0;32mwith\u001b[0m \u001b[0m_open_file_like\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mf\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'rb'\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mopened_file\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 582\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0m_is_zipfile\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mopened_file\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 583\u001b[0m \u001b[0;31m# The zipfile reader is going to advance the current file position.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 100 | "\u001b[0;32m~/miniconda3/lib/python3.8/site-packages/torch/serialization.py\u001b[0m in \u001b[0;36m_open_file_like\u001b[0;34m(name_or_buffer, mode)\u001b[0m\n\u001b[1;32m 228\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_open_file_like\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mname_or_buffer\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmode\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 229\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0m_is_path\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mname_or_buffer\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 230\u001b[0;31m \u001b[0;32mreturn\u001b[0m 
\u001b[0m_open_file\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mname_or_buffer\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmode\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 231\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 232\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;34m'w'\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mmode\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 101 | "\u001b[0;32m~/miniconda3/lib/python3.8/site-packages/torch/serialization.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, name, mode)\u001b[0m\n\u001b[1;32m 209\u001b[0m \u001b[0;32mclass\u001b[0m \u001b[0m_open_file\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0m_opener\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 210\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m__init__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mname\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmode\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 211\u001b[0;31m \u001b[0msuper\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0m_open_file\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__init__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmode\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 212\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 213\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m__exit__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 102 | "\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: '../exps/exp-20201018-032250/checkpoint.pt'" 103 | ] 104 | } 105 | ], 106 | "source": [ 107 | "torch.load('../exps/exp-20201018-032250/checkpoint.pt')" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": 10, 113 | "metadata": {}, 114 | "outputs": [ 115 | { 116 | "ename": "FileNotFoundError", 117 | "evalue": "[Errno 2] No such file or directory: '../exps/exp-20201018-032250/checkpoint'", 118 | "output_type": "error", 119 | "traceback": [ 120 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 121 | "\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)", 122 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mos\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mos\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreadlink\u001b[0m\u001b[0;34m(\u001b[0m \u001b[0;34m'../exps/exp-20201018-032250/checkpoint'\u001b[0m \u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", 123 | "\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: '../exps/exp-20201018-032250/checkpoint'" 124 | ] 125 | } 126 | ], 127 | "source": [ 128 | "import os \n", 129 | "os.readlink( '../exps/exp-20201018-032250/checkpoint' )" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | 
"execution_count": 9, 135 | "metadata": {}, 136 | "outputs": [ 137 | { 138 | "data": { 139 | "text/plain": [ 140 | "False" 141 | ] 142 | }, 143 | "execution_count": 9, 144 | "metadata": {}, 145 | "output_type": "execute_result" 146 | } 147 | ], 148 | "source": [ 149 | "os.path.isfile( '../exps/exp-20201018-032250/checkpoint.pt' )" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": null, 155 | "metadata": {}, 156 | "outputs": [], 157 | "source": [ 158 | "os.path.islink()" 159 | ] 160 | } 161 | ], 162 | "metadata": { 163 | "kernelspec": { 164 | "display_name": "Python 3", 165 | "language": "python", 166 | "name": "python3" 167 | }, 168 | "language_info": { 169 | "codemirror_mode": { 170 | "name": "ipython", 171 | "version": 3 172 | }, 173 | "file_extension": ".py", 174 | "mimetype": "text/x-python", 175 | "name": "python", 176 | "nbconvert_exporter": "python", 177 | "pygments_lexer": "ipython3", 178 | "version": "3.8.3" 179 | } 180 | }, 181 | "nbformat": 4, 182 | "nbformat_minor": 4 183 | } 184 | -------------------------------------------------------------------------------- /vocoder/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zceng/LVCNet/e81e13f3479a4d85f498a02e42338ebe823a8b3d/vocoder/__init__.py -------------------------------------------------------------------------------- /vocoder/audio/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | from .util import load_wav_to_torch 3 | from .stft import TacotronSTFT 4 | from .mel import griffin_lim_inverse_mel 5 | -------------------------------------------------------------------------------- /vocoder/audio/mel.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | 4 | from .stft import TacotronSTFT 5 | from .util import griffin_lim 6 | 7 | 8 | def griffin_lim_inverse_mel(mel, tacotron_stft, griffin_iters=60): 9 | ''' generate waveform using griffin_lim according to the mel-spectrums. 10 | 11 | Args: 12 | mel (Tensor): shape (B, L, C) 13 | tacotron_stft (TacotronSTFT): A transformation class to calcuate the mel-spectrum 14 | griffin_iters (int): the iters for griffin_lim 15 | 16 | Returns: 17 | audio (Tensor): the generated waveform. 
18 | ''' 19 | # mel = torch.stack([torch.from_numpy(_denormalize(mel.numpy()))]) 20 | mel_decompress = tacotron_stft.spectral_de_normalize(mel) 21 | mel_decompress = mel_decompress.transpose(1, 2).data.cpu() 22 | spec_from_mel_scaling = 1000 23 | spec_from_mel = torch.mm(mel_decompress[0], _stft.mel_basis) 24 | spec_from_mel = spec_from_mel.transpose(0, 1).unsqueeze(0) 25 | spec_from_mel = spec_from_mel * spec_from_mel_scaling 26 | 27 | audio = griffin_lim(torch.autograd.Variable( 28 | spec_from_mel[:, :, :-1]), tacotron_stft.stft_fn, griffin_iters) 29 | 30 | return audio 31 | -------------------------------------------------------------------------------- /vocoder/audio/stft.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from torch.autograd import Variable 4 | import numpy as np 5 | 6 | from scipy.signal import get_window 7 | from librosa.util import pad_center, tiny 8 | from librosa.filters import mel as librosa_mel_fn 9 | 10 | from .util import window_sumsquare 11 | from .util import dynamic_range_compression 12 | from .util import dynamic_range_decompression 13 | 14 | 15 | class STFT(torch.nn.Module): 16 | """adapted from Prem Seetharaman's https://github.com/pseeth/pytorch-stft""" 17 | 18 | def __init__(self, filter_length, hop_length, win_length, 19 | window='hann'): 20 | super(STFT, self).__init__() 21 | self.filter_length = filter_length 22 | self.hop_length = hop_length 23 | self.win_length = win_length 24 | self.window = window 25 | self.forward_transform = None 26 | scale = self.filter_length / self.hop_length 27 | fourier_basis = np.fft.fft(np.eye(self.filter_length)) 28 | 29 | cutoff = int((self.filter_length / 2 + 1)) 30 | fourier_basis = np.vstack([np.real(fourier_basis[:cutoff, :]), 31 | np.imag(fourier_basis[:cutoff, :])]) 32 | 33 | forward_basis = torch.FloatTensor(fourier_basis[:, None, :]) 34 | inverse_basis = torch.FloatTensor( 35 | np.linalg.pinv(scale * fourier_basis).T[:, None, :]) 36 | 37 | if window is not None: 38 | assert(filter_length >= win_length) 39 | # get window and zero center pad it to filter_length 40 | fft_window = get_window(window, win_length, fftbins=True) 41 | fft_window = pad_center(fft_window, filter_length) 42 | fft_window = torch.from_numpy(fft_window).float() 43 | 44 | # window the bases 45 | forward_basis *= fft_window 46 | inverse_basis *= fft_window 47 | 48 | self.register_buffer('forward_basis', forward_basis.float()) 49 | self.register_buffer('inverse_basis', inverse_basis.float()) 50 | 51 | def transform(self, input_data): 52 | num_batches = input_data.size(0) 53 | num_samples = input_data.size(1) 54 | 55 | self.num_samples = num_samples 56 | 57 | # similar to librosa, reflect-pad the input 58 | input_data = input_data.view(num_batches, 1, num_samples) 59 | input_data = F.pad( 60 | input_data.unsqueeze(1), 61 | (int(self.filter_length / 2), int(self.filter_length / 2), 0, 0), 62 | mode='reflect') 63 | input_data = input_data.squeeze(1) 64 | 65 | forward_transform = F.conv1d( 66 | input_data.cuda(), 67 | Variable(self.forward_basis, requires_grad=False).cuda(), 68 | stride=self.hop_length, 69 | padding=0).cpu() 70 | 71 | cutoff = int((self.filter_length / 2) + 1) 72 | real_part = forward_transform[:, :cutoff, :] 73 | imag_part = forward_transform[:, cutoff:, :] 74 | 75 | magnitude = torch.sqrt(real_part**2 + imag_part**2) 76 | phase = torch.autograd.Variable( 77 | torch.atan2(imag_part.data, real_part.data)) 78 | 79 | return magnitude, phase 80 
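    # Note on transform(): the STFT is computed as a strided 1-D convolution with a
    # fixed Fourier basis; the first half of the output channels hold the real parts
    # and the second half the imaginary parts, from which the magnitude
    # (sqrt(re**2 + im**2)) and phase (atan2(im, re)) are derived. As written,
    # transform() moves the input and basis to the GPU (`.cuda()`), so it requires
    # a CUDA-capable device.
    #
    # Illustrative usage (shapes assume filter_length=1024):
    #   stft = STFT(filter_length=1024, hop_length=256, win_length=1024)
    #   mag, phase = stft.transform(audio)   # audio: (B, L) -> mag, phase: (B, 513, T)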
| 81 | def inverse(self, magnitude, phase): 82 | recombine_magnitude_phase = torch.cat( 83 | [magnitude*torch.cos(phase), magnitude*torch.sin(phase)], dim=1) 84 | 85 | inverse_transform = F.conv_transpose1d( 86 | recombine_magnitude_phase, 87 | Variable(self.inverse_basis, requires_grad=False), 88 | stride=self.hop_length, 89 | padding=0) 90 | 91 | if self.window is not None: 92 | window_sum = window_sumsquare( 93 | self.window, magnitude.size(-1), hop_length=self.hop_length, 94 | win_length=self.win_length, n_fft=self.filter_length, 95 | dtype=np.float32) 96 | # remove modulation effects 97 | approx_nonzero_indices = torch.from_numpy( 98 | np.where(window_sum > tiny(window_sum))[0]) 99 | window_sum = torch.autograd.Variable( 100 | torch.from_numpy(window_sum), requires_grad=False) 101 | window_sum = window_sum.cuda() if magnitude.is_cuda else window_sum 102 | inverse_transform[:, :, 103 | approx_nonzero_indices] /= window_sum[approx_nonzero_indices] 104 | 105 | # scale by hop ratio 106 | inverse_transform *= float(self.filter_length) / self.hop_length 107 | 108 | inverse_transform = inverse_transform[:, :, int(self.filter_length/2):] 109 | inverse_transform = inverse_transform[:, 110 | :, :-int(self.filter_length/2):] 111 | 112 | return inverse_transform 113 | 114 | def forward(self, input_data): 115 | self.magnitude, self.phase = self.transform(input_data) 116 | reconstruction = self.inverse(self.magnitude, self.phase) 117 | return reconstruction 118 | 119 | 120 | class TacotronSTFT(torch.nn.Module): 121 | ''' A mel-spectrum Transfomation class 122 | 123 | Example: 124 | n_fft = 1024 125 | hop_length = 256 126 | win_length = 1024 127 | n_mels = 80 128 | mel_fmin = 70 129 | mel_fmax = 8000 130 | sample_rate = 22050 131 | 132 | taco_stft = TacotronSTFT( filter_length=n_fft, 133 | hop_length=ho_length, 134 | win_length=win_length, 135 | n_mel_channels=n_mels, 136 | sampling_rate=sample_rate, 137 | mel_fmin=mel_fmin, 138 | mel_fmax=mel_fmax) 139 | 140 | audio, sr = load_wav_to_torch('path/to/00001.wav') 141 | mel, _ = taco_stft.mel_spectrogram(audio) 142 | ''' 143 | def __init__(self, filter_length, hop_length, win_length, 144 | n_mel_channels, sampling_rate, mel_fmin=0.0, 145 | mel_fmax=8000.0): 146 | super(TacotronSTFT, self).__init__() 147 | self.n_mel_channels = n_mel_channels 148 | self.sampling_rate = sampling_rate 149 | self.stft_fn = STFT(filter_length, hop_length, win_length) 150 | mel_basis = librosa_mel_fn( 151 | sampling_rate, filter_length, n_mel_channels, mel_fmin, mel_fmax) 152 | mel_basis = torch.from_numpy(mel_basis).float() 153 | self.register_buffer('mel_basis', mel_basis) 154 | 155 | def spectral_normalize(self, magnitudes): 156 | output = dynamic_range_compression(magnitudes) 157 | return output 158 | 159 | def spectral_de_normalize(self, magnitudes): 160 | output = dynamic_range_decompression(magnitudes) 161 | return output 162 | 163 | def mel_spectrogram(self, audio): 164 | """Computes mel-spectrograms from a batch of wav 165 | 166 | Args: 167 | audio: Variable(torch.FloatTensor) with shape (B, L) in range [-1, 1] 168 | 169 | Returns: 170 | mel (Tensor): torch.FloatTensor of shape (B, n_mel_channels, T) 171 | energy (Tensor): shape (B, n_fft, T) 172 | """ 173 | y = torch.autograd.Variable(audio, requires_grad=False) 174 | 175 | assert(torch.min(y.data) >= -1) 176 | assert(torch.max(y.data) <= 1) 177 | 178 | magnitudes, phases = self.stft_fn.transform(y) 179 | magnitudes = magnitudes.data 180 | mel_output = torch.matmul(self.mel_basis, magnitudes) 181 | mel_output = 
self.spectral_normalize(mel_output) 182 | energy = torch.norm(magnitudes, dim=1) 183 | 184 | return mel_output, energy 185 | -------------------------------------------------------------------------------- /vocoder/audio/util.py: -------------------------------------------------------------------------------- 1 | 2 | import torch 3 | import numpy as np 4 | from scipy.signal import get_window 5 | import librosa 6 | 7 | 8 | def window_sumsquare(window, n_frames, hop_length, win_length, 9 | n_fft, dtype=np.float32, norm=None): 10 | """ 11 | # from librosa 0.6 12 | Compute the sum-square envelope of a window function at a given hop length. 13 | 14 | This is used to estimate modulation effects induced by windowing 15 | observations in short-time fourier transforms. 16 | 17 | Parameters 18 | ---------- 19 | window : string, tuple, number, callable, or list-like 20 | Window specification, as in `get_window` 21 | 22 | n_frames : int > 0 23 | The number of analysis frames 24 | 25 | hop_length : int > 0 26 | The number of samples to advance between frames 27 | 28 | win_length : [optional] 29 | The length of the window function. By default, this matches `n_fft`. 30 | 31 | n_fft : int > 0 32 | The length of each analysis frame. 33 | 34 | dtype : np.dtype 35 | The data type of the output 36 | 37 | Returns 38 | ------- 39 | wss : np.ndarray, shape=`(n_fft + hop_length * (n_frames - 1))` 40 | The sum-squared envelope of the window function 41 | """ 42 | if win_length is None: 43 | win_length = n_fft 44 | 45 | n = n_fft + hop_length * (n_frames - 1) 46 | x = np.zeros(n, dtype=dtype) 47 | 48 | # Compute the squared window at the desired length 49 | win_sq = get_window(window, win_length, fftbins=True) 50 | win_sq = librosa.util.normalize(win_sq, norm=norm)**2 51 | win_sq = librosa.util.pad_center(win_sq, n_fft) 52 | 53 | # Fill the envelope 54 | for i in range(n_frames): 55 | sample = i * hop_length 56 | x[sample:min(n, sample + n_fft) 57 | ] += win_sq[:max(0, min(n_fft, n - sample))] 58 | return x 59 | 60 | 61 | def griffin_lim(magnitudes, stft_fn, n_iters=30): 62 | """ 63 | PARAMS 64 | ------ 65 | magnitudes: spectrogram magnitudes 66 | stft_fn: STFT class with transform (STFT) and inverse (ISTFT) methods 67 | """ 68 | 69 | angles = np.angle(np.exp(2j * np.pi * np.random.rand(*magnitudes.size()))) 70 | angles = angles.astype(np.float32) 71 | angles = torch.autograd.Variable(torch.from_numpy(angles)) 72 | signal = stft_fn.inverse(magnitudes, angles).squeeze(1) 73 | 74 | for i in range(n_iters): 75 | _, angles = stft_fn.transform(signal) 76 | signal = stft_fn.inverse(magnitudes, angles).squeeze(1) 77 | return signal 78 | 79 | 80 | def dynamic_range_compression(x, C=1, clip_val=1e-5): 81 | """ 82 | PARAMS 83 | ------ 84 | C: compression factor 85 | """ 86 | x = 20 * torch.log10(torch.clamp(x, min=clip_val)) - 20 87 | x = torch.clamp((x + 100) / 100, 0.0, 1.0) 88 | return x 89 | 90 | 91 | def dynamic_range_decompression(x, C=1): 92 | """ 93 | PARAMS 94 | ------ 95 | C: compression factor used to compress 96 | """ 97 | x = x * 100 - 100 98 | x = torch.pow(10, x/20 + 1) 99 | return x 100 | 101 | 102 | def load_wav_to_torch(wav_file, sample_rate): 103 | ''' load wav and convert into Tensor. 104 | 105 | Args: 106 | wav_file (str): the path of wav file. 
107 | sample_rate (int): sample_rate 108 | 109 | Returns: 110 | audio (Tensor): shape (1, L) 111 | sample_rate (int) 112 | ''' 113 | data, sr = librosa.load(wav_file, sample_rate) 114 | if len(data.shape) != 1: 115 | raise ValueError( f"the audio ({wav_file}) is not single channel." ) 116 | return torch.FloatTensor(data).unsqueeze(0), sr 117 | 118 | -------------------------------------------------------------------------------- /vocoder/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | from .audio_mel import PWGAudioMelNoiseDataset, DataLoader 3 | 4 | dataset_class_dict = { 5 | "PWGAudioMelNoiseDataset": PWGAudioMelNoiseDataset 6 | } 7 | 8 | def create_dataloader(dataset_classname, 9 | dataset_config, 10 | batch_size=1, 11 | collate_fn=None, 12 | shuffle=False, 13 | num_workers=0, 14 | drop_last=False, 15 | ) -> DataLoader: 16 | ''' create dataloader 17 | Args: 18 | dataset_classname (str) : the classname of dataset. 19 | dataset_config (dict): the config for dataset. 20 | ... 21 | Returns: 22 | Dataloader. 23 | ''' 24 | dataset = dataset_class_dict[ dataset_classname ]( **dataset_config ) 25 | dataloader = DataLoader( dataset, 26 | batch_size=batch_size, 27 | collate_fn=collate_fn, 28 | shuffle=shuffle, 29 | num_workers=num_workers, 30 | drop_last=drop_last) 31 | return dataloader 32 | 33 | -------------------------------------------------------------------------------- /vocoder/datasets/audio_mel.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | from scipy.io import wavfile 4 | 5 | import torch 6 | from torch.utils.data import DataLoader, Dataset 7 | 8 | from vocoder.audio import load_wav_to_torch 9 | from .utils import read_metadata 10 | 11 | class PWGAudioMelNoiseDataset(Dataset): 12 | ''' the Pytorch Dataset for loading audio(.wav) and mel(.npy) ''' 13 | 14 | def __init__(self, metadata_file, batch_mel_length, sample_rate, hop_length, cut=True): 15 | '''Initialize 16 | Args: 17 | metadata_file (str): the file including paths of audio and mel. 18 | batch_mel_length (int): the length of mel-spectrum for batch. 19 | hop_length (int): the hop length used when calculating mel-spectrum. 20 | 21 | Description: 22 | Example of metadata_file: 23 | ./data/wavs/001.wav|./temp/mels/001.npy 24 | ./data/wavs/002.wav|./temp/mels/002.npy 25 | ./data/wavs/003.wav|./temp/mels/003.npy 26 | ''' 27 | super().__init__() 28 | self.batch_mel_length = batch_mel_length 29 | self.hop_length = hop_length 30 | self.sample_rate = sample_rate 31 | self.cut = cut 32 | 33 | self.metadata = read_metadata( metadata_file ) 34 | # metadata: contains paths of entire wav files and mel-spectrum files. 35 | # Examples: [ ('./data/wavs/001.wav', './dump/mels/001.npy'), ... 
] 36 | 37 | def __len__(self): 38 | return len(self.metadata) 39 | 40 | def __getitem__(self, idx): 41 | ''' 42 | Returns: 43 | Tensor (float): audio, shape (L,) 44 | Tensor (float): mel-spectrum, shape ( ML, MC) 45 | Tensor (float): guassian noise with the same shape as audio, shape (L,) 46 | 47 | Note: 48 | the length of mel-spectrum (ML) is equal to `batch_mel_length` 49 | the equation relationship between the length of audio and mel-spectrum: 50 | L = ML * hop_length 51 | ''' 52 | wav_path, mel_path = self.metadata[ idx ] 53 | 54 | audio, sr = load_wav_to_torch( wav_path, self.sample_rate ) 55 | assert sr == self.sample_rate 56 | 57 | mel = np.load( mel_path ) 58 | 59 | if self.cut: 60 | assert mel.shape[0] > self.batch_mel_length + 1, f"the length of audio is too short: {wav_path}" 61 | mel_start = np.random.randint( 0, mel.shape[0] - self.batch_mel_length - 1 ) 62 | audio_start = (mel_start + 2) * self.hop_length 63 | 64 | mel = mel[ mel_start : mel_start + self.batch_mel_length ] 65 | audio = audio[ :, audio_start : audio_start + (self.batch_mel_length - 4) * self.hop_length ] 66 | else: 67 | audio = audio[ :, 2*self.hop_length:(mel.shape[0] - 2) * self.hop_length ] 68 | 69 | mel = torch.from_numpy( mel.T ) 70 | noise = torch.randn_like( audio ) 71 | return audio, mel, noise 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | -------------------------------------------------------------------------------- /vocoder/datasets/utils.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | def read_metadata(metadata_path, split='|'): 6 | ''' read data from metadata file. 7 | 8 | Args: 9 | metadata_path (str): the path of metadata file. 10 | split (str): the char to split each line in metadata file. 11 | default: '|' 12 | Returns: 13 | list: data from metadata file. 14 | ''' 15 | with open(metadata_path, 'r', encoding='utf-8') as f: 16 | data = f.readlines() 17 | data = [ d.strip().split('|') for d in data ] 18 | return data 19 | 20 | 21 | def save_metadata(data, metadata_path, split='|'): 22 | '''save data to file as metadata. 23 | 24 | Args: 25 | data (list): data 26 | metadata_path (str): path for saving file. 27 | split (str): the char to join each element of data. 28 | Returns: 29 | None 30 | ''' 31 | with open(metadata_path, 'w', encoding='utf-8') as f: 32 | for d in data: 33 | line = split.join( d ) 34 | line.replace('\n',' ') 35 | f.write(line + '\n') -------------------------------------------------------------------------------- /vocoder/hparams.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | import yaml 4 | 5 | 6 | class Hyperparameter: 7 | ''' hyperparameter manager ''' 8 | 9 | def __init__(self, config_file: str): 10 | ''' Hyperparameter 11 | Args: 12 | config_file (str): the config file. 
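        Example (an illustrative YAML sketch; the keys simply override the
        defaults assigned below, and any attribute name is accepted):
            sample_rate: 22050
            hop_length: 256
            model_name: LVCNetWaveGAN
            model_params: {}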
13 | ''' 14 | # Audio 15 | self.sample_rate = 22050 16 | self.hop_length = 256 17 | self.win_length = 1024 18 | self.n_fft = 1024 19 | self.n_mels = 80 20 | self.mel_fmax = 8000 21 | self.mel_fmin = 70 22 | 23 | # Moel 24 | self.model_name = 'ParallelWaveGAN' 25 | self.model_params = dict() 26 | 27 | # Loss 28 | self.loss_name = 'PWGLoss' 29 | self.loss_params = dict() 30 | 31 | # Optimizer 32 | self.opt_name = 'PWGOptimizer' 33 | self.opt_params = dict() 34 | 35 | # Strategy 36 | self.strategy_name = "PWGStrategy" 37 | self.strategy_params = dict() 38 | 39 | # Training 40 | self.dataset_classname = 'PWGAudioMelNoiseDataset' 41 | self.dataset_num_workers = 5 42 | self.train_metadata_file = 'temp/metadata.train.txt' 43 | self.batch_mel_length = 52 44 | self.train_batch_size = 8 45 | self.max_train_steps = 40000 46 | self.log_interval_steps = 100 47 | self.save_interval_steps = 10000 48 | 49 | # Evaluate 50 | self.eval_sample_num = 500 51 | self.eval_metadata_file = 'temp/metadata.eval.txt' 52 | self.eval_interval_steps = 1000 53 | 54 | # Test 55 | self.test_sample_num = 100 56 | self.test_metadata_file = 'temp/metadata.test.txt' 57 | 58 | 59 | with open(config_file, 'r', encoding='utf-8') as f: 60 | config = yaml.safe_load( f ) 61 | for k in config: 62 | self.__setattr__(k, config[k]) 63 | 64 | def save_config(self, file): 65 | with open(file, 'w', encoding='utf-8') as f: 66 | yaml.safe_dump(self.__dict__, f) 67 | 68 | def __str__(self): 69 | return yaml.safe_dump(self.__dict__) 70 | 71 | 72 | -------------------------------------------------------------------------------- /vocoder/inference.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | -------------------------------------------------------------------------------- /vocoder/layers/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | from .causal_conv import * # NOQA 3 | from .pqmf import * # NOQA 4 | from .residual_block import * # NOQA 5 | from .residual_stack import * # NOQA 6 | from .upsample import * # NOQA 7 | 8 | -------------------------------------------------------------------------------- /vocoder/layers/causal_conv.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Copyright 2020 Tomoki Hayashi 4 | # MIT License (https://opensource.org/licenses/MIT) 5 | 6 | """Causal convolusion layer modules.""" 7 | 8 | 9 | import torch 10 | 11 | 12 | class CausalConv1d(torch.nn.Module): 13 | """CausalConv1d module with customized initialization.""" 14 | 15 | def __init__(self, in_channels, out_channels, kernel_size, 16 | dilation=1, bias=True, pad="ConstantPad1d", pad_params={"value": 0.0}): 17 | """Initialize CausalConv1d module.""" 18 | super(CausalConv1d, self).__init__() 19 | self.pad = getattr(torch.nn, pad)((kernel_size - 1) * dilation, **pad_params) 20 | self.conv = torch.nn.Conv1d(in_channels, out_channels, kernel_size, 21 | dilation=dilation, bias=bias) 22 | 23 | def forward(self, x): 24 | """Calculate forward propagation. 25 | 26 | Args: 27 | x (Tensor): Input tensor (B, in_channels, T). 28 | 29 | Returns: 30 | Tensor: Output tensor (B, out_channels, T). 
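        Example (illustrative sketch; the sizes are arbitrary):
            conv = CausalConv1d(in_channels=1, out_channels=4, kernel_size=3, dilation=2)
            y = conv(torch.randn(2, 1, 100))   # (2, 4, 100), same length as the input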
31 | 32 | """ 33 | return self.conv(self.pad(x))[:, :, :x.size(2)] 34 | 35 | 36 | class CausalConvTranspose1d(torch.nn.Module): 37 | """CausalConvTranspose1d module with customized initialization.""" 38 | 39 | def __init__(self, in_channels, out_channels, kernel_size, stride, bias=True): 40 | """Initialize CausalConvTranspose1d module.""" 41 | super(CausalConvTranspose1d, self).__init__() 42 | self.deconv = torch.nn.ConvTranspose1d( 43 | in_channels, out_channels, kernel_size, stride, bias=bias) 44 | self.stride = stride 45 | 46 | def forward(self, x): 47 | """Calculate forward propagation. 48 | 49 | Args: 50 | x (Tensor): Input tensor (B, in_channels, T_in). 51 | 52 | Returns: 53 | Tensor: Output tensor (B, out_channels, T_out). 54 | 55 | """ 56 | return self.deconv(x)[:, :, :-self.stride] 57 | -------------------------------------------------------------------------------- /vocoder/layers/location_variable_conv.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zceng/LVCNet/e81e13f3479a4d85f498a02e42338ebe823a8b3d/vocoder/layers/location_variable_conv.py -------------------------------------------------------------------------------- /vocoder/layers/pqmf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Copyright 2020 Tomoki Hayashi 4 | # MIT License (https://opensource.org/licenses/MIT) 5 | 6 | """Pseudo QMF modules.""" 7 | 8 | import numpy as np 9 | import torch 10 | import torch.nn.functional as F 11 | 12 | from scipy.signal import kaiser 13 | 14 | 15 | def design_prototype_filter(taps=62, cutoff_ratio=0.142, beta=9.0): 16 | """Design prototype filter for PQMF. 17 | 18 | This method is based on `A Kaiser window approach for the design of prototype 19 | filters of cosine modulated filterbanks`_. 20 | 21 | Args: 22 | taps (int): The number of filter taps. 23 | cutoff_ratio (float): Cut-off frequency ratio. 24 | beta (float): Beta coefficient for kaiser window. 25 | 26 | Returns: 27 | ndarray: Impluse response of prototype filter (taps + 1,). 28 | 29 | .. _`A Kaiser window approach for the design of prototype filters of cosine modulated filterbanks`: 30 | https://ieeexplore.ieee.org/abstract/document/681427 31 | 32 | """ 33 | # check the arguments are valid 34 | assert taps % 2 == 0, "The number of taps mush be even number." 35 | assert 0.0 < cutoff_ratio < 1.0, "Cutoff ratio must be > 0.0 and < 1.0." 36 | 37 | # make initial filter 38 | omega_c = np.pi * cutoff_ratio 39 | with np.errstate(invalid='ignore'): 40 | h_i = np.sin(omega_c * (np.arange(taps + 1) - 0.5 * taps)) \ 41 | / (np.pi * (np.arange(taps + 1) - 0.5 * taps)) 42 | h_i[taps // 2] = np.cos(0) * cutoff_ratio # fix nan due to indeterminate form 43 | 44 | # apply kaiser window 45 | w = kaiser(taps + 1, beta) 46 | h = h_i * w 47 | 48 | return h 49 | 50 | 51 | class PQMF(torch.nn.Module): 52 | """PQMF module. 53 | 54 | This module is based on `Near-perfect-reconstruction pseudo-QMF banks`_. 55 | 56 | .. _`Near-perfect-reconstruction pseudo-QMF banks`: 57 | https://ieeexplore.ieee.org/document/258122 58 | 59 | """ 60 | 61 | def __init__(self, subbands=4, taps=62, cutoff_ratio=0.142, beta=9.0): 62 | """Initilize PQMF module. 63 | 64 | The cutoff_ratio and beta parameters are optimized for #subbands = 4. 65 | See dicussion in https://github.com/kan-bayashi/ParallelWaveGAN/issues/195. 66 | 67 | Args: 68 | subbands (int): The number of subbands. 69 | taps (int): The number of filter taps. 
70 | cutoff_ratio (float): Cut-off frequency ratio. 71 | beta (float): Beta coefficient for kaiser window. 72 | 73 | """ 74 | super(PQMF, self).__init__() 75 | 76 | # build analysis & synthesis filter coefficients 77 | h_proto = design_prototype_filter(taps, cutoff_ratio, beta) 78 | h_analysis = np.zeros((subbands, len(h_proto))) 79 | h_synthesis = np.zeros((subbands, len(h_proto))) 80 | for k in range(subbands): 81 | h_analysis[k] = 2 * h_proto * np.cos( 82 | (2 * k + 1) * (np.pi / (2 * subbands)) * 83 | (np.arange(taps + 1) - (taps / 2)) + 84 | (-1) ** k * np.pi / 4) 85 | h_synthesis[k] = 2 * h_proto * np.cos( 86 | (2 * k + 1) * (np.pi / (2 * subbands)) * 87 | (np.arange(taps + 1) - (taps / 2)) - 88 | (-1) ** k * np.pi / 4) 89 | 90 | # convert to tensor 91 | analysis_filter = torch.from_numpy(h_analysis).float().unsqueeze(1) 92 | synthesis_filter = torch.from_numpy(h_synthesis).float().unsqueeze(0) 93 | 94 | # register coefficients as beffer 95 | self.register_buffer("analysis_filter", analysis_filter) 96 | self.register_buffer("synthesis_filter", synthesis_filter) 97 | 98 | # filter for downsampling & upsampling 99 | updown_filter = torch.zeros((subbands, subbands, subbands)).float() 100 | for k in range(subbands): 101 | updown_filter[k, k, 0] = 1.0 102 | self.register_buffer("updown_filter", updown_filter) 103 | self.subbands = subbands 104 | 105 | # keep padding info 106 | self.pad_fn = torch.nn.ConstantPad1d(taps // 2, 0.0) 107 | 108 | def analysis(self, x): 109 | """Analysis with PQMF. 110 | 111 | Args: 112 | x (Tensor): Input tensor (B, 1, T). 113 | 114 | Returns: 115 | Tensor: Output tensor (B, subbands, T // subbands). 116 | 117 | """ 118 | x = F.conv1d(self.pad_fn(x), self.analysis_filter) 119 | return F.conv1d(x, self.updown_filter, stride=self.subbands) 120 | 121 | def synthesis(self, x): 122 | """Synthesis with PQMF. 123 | 124 | Args: 125 | x (Tensor): Input tensor (B, subbands, T // subbands). 126 | 127 | Returns: 128 | Tensor: Output tensor (B, 1, T). 129 | 130 | """ 131 | # NOTE(kan-bayashi): Power will be dreased so here multipy by # subbands. 132 | # Not sure this is the correct way, it is better to check again. 133 | # TODO(kan-bayashi): Understand the reconstruction procedure 134 | x = F.conv_transpose1d(x, self.updown_filter * self.subbands, stride=self.subbands) 135 | return F.conv1d(self.pad_fn(x), self.synthesis_filter) 136 | -------------------------------------------------------------------------------- /vocoder/layers/residual_block.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """Residual block module in WaveNet. 4 | 5 | This code is modified from https://github.com/r9y9/wavenet_vocoder. 
6 | 7 | """ 8 | 9 | import math 10 | 11 | import torch 12 | import torch.nn.functional as F 13 | 14 | 15 | class Conv1d(torch.nn.Conv1d): 16 | """Conv1d module with customized initialization.""" 17 | 18 | def __init__(self, *args, **kwargs): 19 | """Initialize Conv1d module.""" 20 | super(Conv1d, self).__init__(*args, **kwargs) 21 | 22 | def reset_parameters(self): 23 | """Reset parameters.""" 24 | torch.nn.init.kaiming_normal_(self.weight, nonlinearity="relu") 25 | if self.bias is not None: 26 | torch.nn.init.constant_(self.bias, 0.0) 27 | 28 | 29 | class Conv1d1x1(Conv1d): 30 | """1x1 Conv1d with customized initialization.""" 31 | 32 | def __init__(self, in_channels, out_channels, bias): 33 | """Initialize 1x1 Conv1d module.""" 34 | super(Conv1d1x1, self).__init__(in_channels, out_channels, 35 | kernel_size=1, padding=0, 36 | dilation=1, bias=bias) 37 | 38 | 39 | class ResidualBlock(torch.nn.Module): 40 | """Residual block module in WaveNet.""" 41 | 42 | def __init__(self, 43 | kernel_size=3, 44 | residual_channels=64, 45 | gate_channels=128, 46 | skip_channels=64, 47 | aux_channels=80, 48 | dropout=0.0, 49 | dilation=1, 50 | bias=True, 51 | use_causal_conv=False 52 | ): 53 | """Initialize ResidualBlock module. 54 | 55 | Args: 56 | kernel_size (int): Kernel size of dilation convolution layer. 57 | residual_channels (int): Number of channels for residual connection. 58 | skip_channels (int): Number of channels for skip connection. 59 | aux_channels (int): Local conditioning channels i.e. auxiliary input dimension. 60 | dropout (float): Dropout probability. 61 | dilation (int): Dilation factor. 62 | bias (bool): Whether to add bias parameter in convolution layers. 63 | use_causal_conv (bool): Whether to use use_causal_conv or non-use_causal_conv convolution. 64 | 65 | """ 66 | super(ResidualBlock, self).__init__() 67 | self.dropout = dropout 68 | # no future time stamps available 69 | if use_causal_conv: 70 | padding = (kernel_size - 1) * dilation 71 | else: 72 | assert (kernel_size - 1) % 2 == 0, "Not support even number kernel size." 73 | padding = (kernel_size - 1) // 2 * dilation 74 | self.use_causal_conv = use_causal_conv 75 | 76 | # dilation conv 77 | self.conv = Conv1d(residual_channels, gate_channels, kernel_size, 78 | padding=padding, dilation=dilation, bias=bias) 79 | 80 | # local conditioning 81 | if aux_channels > 0: 82 | self.conv1x1_aux = Conv1d1x1(aux_channels, gate_channels, bias=False) 83 | else: 84 | self.conv1x1_aux = None 85 | 86 | # conv output is split into two groups 87 | gate_out_channels = gate_channels // 2 88 | self.conv1x1_out = Conv1d1x1(gate_out_channels, residual_channels, bias=bias) 89 | self.conv1x1_skip = Conv1d1x1(gate_out_channels, skip_channels, bias=bias) 90 | 91 | def forward(self, x, c): 92 | """Calculate forward propagation. 93 | 94 | Args: 95 | x (Tensor): Input tensor (B, residual_channels, T). 96 | c (Tensor): Local conditioning auxiliary tensor (B, aux_channels, T). 97 | 98 | Returns: 99 | Tensor: Output tensor for residual connection (B, residual_channels, T). 100 | Tensor: Output tensor for skip connection (B, skip_channels, T). 
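        Example (illustrative sketch using the default channel sizes):
            block = ResidualBlock()          # residual/skip channels = 64, aux = 80
            x = torch.randn(2, 64, 100)      # residual input
            c = torch.randn(2, 80, 100)      # local conditioning features
            x_out, s = block(x, c)           # both (2, 64, 100)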
101 | 102 | """ 103 | residual = x 104 | x = F.dropout(x, p=self.dropout, training=self.training) 105 | x = self.conv(x) 106 | 107 | # remove future time steps if use_causal_conv conv 108 | x = x[:, :, :residual.size(-1)] if self.use_causal_conv else x 109 | 110 | # split into two part for gated activation 111 | splitdim = 1 112 | xa, xb = x.split(x.size(splitdim) // 2, dim=splitdim) 113 | 114 | # local conditioning 115 | if c is not None: 116 | assert self.conv1x1_aux is not None 117 | c = self.conv1x1_aux(c) 118 | ca, cb = c.split(c.size(splitdim) // 2, dim=splitdim) 119 | xa, xb = xa + ca, xb + cb 120 | 121 | x = torch.tanh(xa) * torch.sigmoid(xb) 122 | 123 | # for skip connection 124 | s = self.conv1x1_skip(x) 125 | 126 | # for residual connection 127 | x = (self.conv1x1_out(x) + residual) * math.sqrt(0.5) 128 | 129 | return x, s 130 | -------------------------------------------------------------------------------- /vocoder/layers/residual_stack.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Copyright 2020 Tomoki Hayashi 4 | # MIT License (https://opensource.org/licenses/MIT) 5 | 6 | """Residual stack module in MelGAN.""" 7 | 8 | import torch 9 | 10 | from vocoder.layers import CausalConv1d 11 | 12 | 13 | class ResidualStack(torch.nn.Module): 14 | """Residual stack module introduced in MelGAN.""" 15 | 16 | def __init__(self, 17 | kernel_size=3, 18 | channels=32, 19 | dilation=1, 20 | bias=True, 21 | nonlinear_activation="LeakyReLU", 22 | nonlinear_activation_params={"negative_slope": 0.2}, 23 | pad="ReflectionPad1d", 24 | pad_params={}, 25 | use_causal_conv=False, 26 | ): 27 | """Initialize ResidualStack module. 28 | 29 | Args: 30 | kernel_size (int): Kernel size of dilation convolution layer. 31 | channels (int): Number of channels of convolution layers. 32 | dilation (int): Dilation factor. 33 | bias (bool): Whether to add bias parameter in convolution layers. 34 | nonlinear_activation (str): Activation function module name. 35 | nonlinear_activation_params (dict): Hyperparameters for activation function. 36 | pad (str): Padding function module name before dilated convolution layer. 37 | pad_params (dict): Hyperparameters for padding function. 38 | use_causal_conv (bool): Whether to use causal convolution. 39 | 40 | """ 41 | super(ResidualStack, self).__init__() 42 | 43 | # defile residual stack part 44 | if not use_causal_conv: 45 | assert (kernel_size - 1) % 2 == 0, "Not support even number kernel size." 46 | self.stack = torch.nn.Sequential( 47 | getattr(torch.nn, nonlinear_activation)(**nonlinear_activation_params), 48 | getattr(torch.nn, pad)((kernel_size - 1) // 2 * dilation, **pad_params), 49 | torch.nn.Conv1d(channels, channels, kernel_size, dilation=dilation, bias=bias), 50 | getattr(torch.nn, nonlinear_activation)(**nonlinear_activation_params), 51 | torch.nn.Conv1d(channels, channels, 1, bias=bias), 52 | ) 53 | else: 54 | self.stack = torch.nn.Sequential( 55 | getattr(torch.nn, nonlinear_activation)(**nonlinear_activation_params), 56 | CausalConv1d(channels, channels, kernel_size, dilation=dilation, 57 | bias=bias, pad=pad, pad_params=pad_params), 58 | getattr(torch.nn, nonlinear_activation)(**nonlinear_activation_params), 59 | torch.nn.Conv1d(channels, channels, 1, bias=bias), 60 | ) 61 | 62 | # defile extra layer for skip connection 63 | self.skip_layer = torch.nn.Conv1d(channels, channels, 1, bias=bias) 64 | 65 | def forward(self, c): 66 | """Calculate forward propagation. 
67 | 68 | Args: 69 | c (Tensor): Input tensor (B, channels, T). 70 | 71 | Returns: 72 | Tensor: Output tensor (B, chennels, T). 73 | 74 | """ 75 | return self.stack(c) + self.skip_layer(c) 76 | -------------------------------------------------------------------------------- /vocoder/layers/upsample.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """Upsampling module. 4 | 5 | This code is modified from https://github.com/r9y9/wavenet_vocoder. 6 | 7 | """ 8 | 9 | import numpy as np 10 | import torch 11 | import torch.nn.functional as F 12 | 13 | from vocoder.layers import Conv1d 14 | 15 | 16 | class Stretch2d(torch.nn.Module): 17 | """Stretch2d module.""" 18 | 19 | def __init__(self, x_scale, y_scale, mode="nearest"): 20 | """Initialize Stretch2d module. 21 | 22 | Args: 23 | x_scale (int): X scaling factor (Time axis in spectrogram). 24 | y_scale (int): Y scaling factor (Frequency axis in spectrogram). 25 | mode (str): Interpolation mode. 26 | 27 | """ 28 | super(Stretch2d, self).__init__() 29 | self.x_scale = x_scale 30 | self.y_scale = y_scale 31 | self.mode = mode 32 | 33 | def forward(self, x): 34 | """Calculate forward propagation. 35 | 36 | Args: 37 | x (Tensor): Input tensor (B, C, F, T). 38 | 39 | Returns: 40 | Tensor: Interpolated tensor (B, C, F * y_scale, T * x_scale), 41 | 42 | """ 43 | return F.interpolate( 44 | x, scale_factor=(self.y_scale, self.x_scale), mode=self.mode) 45 | 46 | 47 | class Conv2d(torch.nn.Conv2d): 48 | """Conv2d module with customized initialization.""" 49 | 50 | def __init__(self, *args, **kwargs): 51 | """Initialize Conv2d module.""" 52 | super(Conv2d, self).__init__(*args, **kwargs) 53 | 54 | def reset_parameters(self): 55 | """Reset parameters.""" 56 | self.weight.data.fill_(1. / np.prod(self.kernel_size)) 57 | if self.bias is not None: 58 | torch.nn.init.constant_(self.bias, 0.0) 59 | 60 | 61 | class UpsampleNetwork(torch.nn.Module): 62 | """Upsampling network module.""" 63 | 64 | def __init__(self, 65 | upsample_scales, 66 | nonlinear_activation=None, 67 | nonlinear_activation_params={}, 68 | interpolate_mode="nearest", 69 | freq_axis_kernel_size=1, 70 | use_causal_conv=False, 71 | ): 72 | """Initialize upsampling network module. 73 | 74 | Args: 75 | upsample_scales (list): List of upsampling scales. 76 | nonlinear_activation (str): Activation function name. 77 | nonlinear_activation_params (dict): Arguments for specified activation function. 78 | interpolate_mode (str): Interpolation mode. 79 | freq_axis_kernel_size (int): Kernel size in the direction of frequency axis. 80 | 81 | """ 82 | super(UpsampleNetwork, self).__init__() 83 | self.use_causal_conv = use_causal_conv 84 | self.up_layers = torch.nn.ModuleList() 85 | for scale in upsample_scales: 86 | # interpolation layer 87 | stretch = Stretch2d(scale, 1, interpolate_mode) 88 | self.up_layers += [stretch] 89 | 90 | # conv layer 91 | assert (freq_axis_kernel_size - 1) % 2 == 0, "Not support even number freq axis kernel size." 
92 | freq_axis_padding = (freq_axis_kernel_size - 1) // 2 93 | kernel_size = (freq_axis_kernel_size, scale * 2 + 1) 94 | if use_causal_conv: 95 | padding = (freq_axis_padding, scale * 2) 96 | else: 97 | padding = (freq_axis_padding, scale) 98 | conv = Conv2d(1, 1, kernel_size=kernel_size, padding=padding, bias=False) 99 | self.up_layers += [conv] 100 | 101 | # nonlinear 102 | if nonlinear_activation is not None: 103 | nonlinear = getattr(torch.nn, nonlinear_activation)(**nonlinear_activation_params) 104 | self.up_layers += [nonlinear] 105 | 106 | def forward(self, c): 107 | """Calculate forward propagation. 108 | 109 | Args: 110 | c : Input tensor (B, C, T). 111 | 112 | Returns: 113 | Tensor: Upsampled tensor (B, C, T'), where T' = T * prod(upsample_scales). 114 | 115 | """ 116 | c = c.unsqueeze(1) # (B, 1, C, T) 117 | for f in self.up_layers: 118 | if self.use_causal_conv and isinstance(f, Conv2d): 119 | c = f(c)[..., :c.size(-1)] 120 | else: 121 | c = f(c) 122 | return c.squeeze(1) # (B, C, T') 123 | 124 | 125 | class ConvInUpsampleNetwork(torch.nn.Module): 126 | """Convolution + upsampling network module.""" 127 | 128 | def __init__(self, 129 | upsample_scales, 130 | nonlinear_activation=None, 131 | nonlinear_activation_params={}, 132 | interpolate_mode="nearest", 133 | freq_axis_kernel_size=1, 134 | aux_channels=80, 135 | aux_context_window=0, 136 | use_causal_conv=False 137 | ): 138 | """Initialize convolution + upsampling network module. 139 | 140 | Args: 141 | upsample_scales (list): List of upsampling scales. 142 | nonlinear_activation (str): Activation function name. 143 | nonlinear_activation_params (dict): Arguments for specified activation function. 144 | mode (str): Interpolation mode. 145 | freq_axis_kernel_size (int): Kernel size in the direction of frequency axis. 146 | aux_channels (int): Number of channels of pre-convolutional layer. 147 | aux_context_window (int): Context window size of the pre-convolutional layer. 148 | use_causal_conv (bool): Whether to use causal structure. 149 | 150 | """ 151 | super(ConvInUpsampleNetwork, self).__init__() 152 | self.aux_context_window = aux_context_window 153 | self.use_causal_conv = use_causal_conv and aux_context_window > 0 154 | # To capture wide-context information in conditional features 155 | kernel_size = aux_context_window + 1 if use_causal_conv else 2 * aux_context_window + 1 156 | # NOTE(kan-bayashi): Here do not use padding because the input is already padded 157 | self.conv_in = Conv1d(aux_channels, aux_channels, kernel_size=kernel_size, bias=False) 158 | self.upsample = UpsampleNetwork( 159 | upsample_scales=upsample_scales, 160 | nonlinear_activation=nonlinear_activation, 161 | nonlinear_activation_params=nonlinear_activation_params, 162 | interpolate_mode=interpolate_mode, 163 | freq_axis_kernel_size=freq_axis_kernel_size, 164 | use_causal_conv=use_causal_conv, 165 | ) 166 | 167 | def forward(self, c): 168 | """Calculate forward propagation. 169 | 170 | Args: 171 | c : Input tensor (B, C, T'). 172 | 173 | Returns: 174 | Tensor: Upsampled tensor (B, C, T), 175 | where T = (T' - aux_context_window * 2) * prod(upsample_scales). 176 | 177 | Note: 178 | The length of inputs considers the context window size. 
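        Example (illustrative sketch; aux_context_window=2 and the resulting hop of 256 are assumptions):
            net = ConvInUpsampleNetwork(upsample_scales=[4, 4, 4, 4],
                                        aux_channels=80, aux_context_window=2)
            c = torch.randn(1, 80, 36)       # already padded by the context window
            y = net(c)                       # (1, 80, (36 - 2 * 2) * 256) = (1, 80, 8192)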
179 | 180 | """ 181 | c_ = self.conv_in(c) 182 | c = c_[:, :, :-self.aux_context_window] if self.use_causal_conv else c_ 183 | return self.upsample(c) 184 | -------------------------------------------------------------------------------- /vocoder/losses/__init__.py: -------------------------------------------------------------------------------- 1 | from typing import Union 2 | 3 | from .pwg_loss import PWGLoss 4 | 5 | loss_modules = { 6 | "PWGLoss": PWGLoss 7 | } 8 | 9 | def create_loss(name, params, device) -> Union[PWGLoss]: 10 | return loss_modules[ name ]( **params ).to(device) -------------------------------------------------------------------------------- /vocoder/losses/pwg_loss.py: -------------------------------------------------------------------------------- 1 | 2 | import torch 3 | 4 | from .stft_loss import * 5 | 6 | class PWGLoss(torch.nn.Module): 7 | 8 | def __init__(self, stft_loss_params={}): 9 | super(PWGLoss, self).__init__() 10 | self.stft_criterion = MultiResolutionSTFTLoss( **stft_loss_params ) 11 | self.mse_criterion = torch.nn.MSELoss() 12 | 13 | 14 | def stft_loss(self, audio, audio_): 15 | sc_loss, mag_loss = self.stft_criterion( audio_.squeeze(1), audio.squeeze(1) ) 16 | return sc_loss, mag_loss 17 | 18 | def adversarial_loss(self, prob_ ): 19 | return self.mse_criterion( prob_, torch.ones_like( prob_ ) ) 20 | 21 | def discriminator_loss(self, prob, prob_ ): 22 | real_loss = self.mse_criterion( prob, torch.ones_like( prob ) ) 23 | fake_loss = self.mse_criterion( prob_, torch.zeros_like(prob_) ) 24 | return real_loss, fake_loss 25 | -------------------------------------------------------------------------------- /vocoder/losses/stft_loss.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Copyright 2019 Tomoki Hayashi 4 | # MIT License (https://opensource.org/licenses/MIT) 5 | 6 | """STFT-based Loss modules.""" 7 | 8 | import torch 9 | import torch.nn.functional as F 10 | 11 | 12 | def stft(x, fft_size, hop_size, win_length, window): 13 | """Perform STFT and convert to magnitude spectrogram. 14 | 15 | Args: 16 | x (Tensor): Input signal tensor (B, T). 17 | fft_size (int): FFT size. 18 | hop_size (int): Hop size. 19 | win_length (int): Window length. 20 | window (str): Window function type. 21 | 22 | Returns: 23 | Tensor: Magnitude spectrogram (B, #frames, fft_size // 2 + 1). 24 | 25 | """ 26 | x_stft = torch.stft(x, fft_size, hop_size, win_length, window) 27 | real = x_stft[..., 0] 28 | imag = x_stft[..., 1] 29 | 30 | # NOTE(kan-bayashi): clamp is needed to avoid nan or inf 31 | return torch.sqrt(torch.clamp(real ** 2 + imag ** 2, min=1e-7)).transpose(2, 1) 32 | 33 | 34 | class SpectralConvergenceLoss(torch.nn.Module): 35 | """Spectral convergence loss module.""" 36 | 37 | def __init__(self): 38 | """Initilize spectral convergence loss module.""" 39 | super(SpectralConvergenceLoss, self).__init__() 40 | 41 | def forward(self, x_mag, y_mag): 42 | """Calculate forward propagation. 43 | 44 | Args: 45 | x_mag (Tensor): Magnitude spectrogram of predicted signal (B, #frames, #freq_bins). 46 | y_mag (Tensor): Magnitude spectrogram of groundtruth signal (B, #frames, #freq_bins). 47 | 48 | Returns: 49 | Tensor: Spectral convergence loss value. 
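                (the Frobenius-norm ratio ||y_mag - x_mag||_F / ||y_mag||_F)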
50 | 51 | """ 52 | return torch.norm(y_mag - x_mag, p="fro") / torch.norm(y_mag, p="fro") 53 | 54 | 55 | class LogSTFTMagnitudeLoss(torch.nn.Module): 56 | """Log STFT magnitude loss module.""" 57 | 58 | def __init__(self): 59 | """Initilize los STFT magnitude loss module.""" 60 | super(LogSTFTMagnitudeLoss, self).__init__() 61 | 62 | def forward(self, x_mag, y_mag): 63 | """Calculate forward propagation. 64 | 65 | Args: 66 | x_mag (Tensor): Magnitude spectrogram of predicted signal (B, #frames, #freq_bins). 67 | y_mag (Tensor): Magnitude spectrogram of groundtruth signal (B, #frames, #freq_bins). 68 | 69 | Returns: 70 | Tensor: Log STFT magnitude loss value. 71 | 72 | """ 73 | return F.l1_loss(torch.log(y_mag), torch.log(x_mag)) 74 | 75 | 76 | class STFTLoss(torch.nn.Module): 77 | """STFT loss module.""" 78 | 79 | def __init__(self, fft_size=1024, shift_size=120, win_length=600, window="hann_window"): 80 | """Initialize STFT loss module.""" 81 | super(STFTLoss, self).__init__() 82 | self.fft_size = fft_size 83 | self.shift_size = shift_size 84 | self.win_length = win_length 85 | window = getattr(torch, window)(win_length) 86 | self.register_buffer('window', window) 87 | self.spectral_convergence_loss = SpectralConvergenceLoss() 88 | self.log_stft_magnitude_loss = LogSTFTMagnitudeLoss() 89 | 90 | def forward(self, x, y): 91 | """Calculate forward propagation. 92 | 93 | Args: 94 | x (Tensor): Predicted signal (B, T). 95 | y (Tensor): Groundtruth signal (B, T). 96 | 97 | Returns: 98 | Tensor: Spectral convergence loss value. 99 | Tensor: Log STFT magnitude loss value. 100 | 101 | """ 102 | x_mag = stft(x, self.fft_size, self.shift_size, self.win_length, self.window) 103 | y_mag = stft(y, self.fft_size, self.shift_size, self.win_length, self.window) 104 | sc_loss = self.spectral_convergence_loss(x_mag, y_mag) 105 | mag_loss = self.log_stft_magnitude_loss(x_mag, y_mag) 106 | 107 | return sc_loss, mag_loss 108 | 109 | 110 | class MultiResolutionSTFTLoss(torch.nn.Module): 111 | """Multi resolution STFT loss module.""" 112 | 113 | def __init__(self, 114 | fft_sizes=[1024, 2048, 512], 115 | hop_sizes=[120, 240, 50], 116 | win_lengths=[600, 1200, 240], 117 | window="hann_window"): 118 | """Initialize Multi resolution STFT loss module. 119 | 120 | Args: 121 | fft_sizes (list): List of FFT sizes. 122 | hop_sizes (list): List of hop sizes. 123 | win_lengths (list): List of window lengths. 124 | window (str): Window function type. 125 | 126 | """ 127 | super(MultiResolutionSTFTLoss, self).__init__() 128 | assert len(fft_sizes) == len(hop_sizes) == len(win_lengths) 129 | self.stft_losses = torch.nn.ModuleList() 130 | for fs, ss, wl in zip(fft_sizes, hop_sizes, win_lengths): 131 | self.stft_losses += [STFTLoss(fs, ss, wl, window)] 132 | 133 | def forward(self, x, y): 134 | """Calculate forward propagation. 135 | 136 | Args: 137 | x (Tensor): Predicted signal (B, T). 138 | y (Tensor): Groundtruth signal (B, T). 139 | 140 | Returns: 141 | Tensor: Multi resolution spectral convergence loss value. 142 | Tensor: Multi resolution log STFT magnitude loss value. 
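        Example (illustrative sketch with the default three resolutions):
            criterion = MultiResolutionSTFTLoss()
            x = torch.randn(2, 16000)             # predicted waveform
            y = torch.randn(2, 16000)             # ground-truth waveform
            sc_loss, mag_loss = criterion(x, y)   # scalar losses averaged over resolutions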
143 | 144 | """ 145 | sc_loss = 0.0 146 | mag_loss = 0.0 147 | for f in self.stft_losses: 148 | # print( 'stft parameter device:', next(f.parameters()).device ) 149 | sc_l, mag_l = f(x, y) 150 | sc_loss += sc_l 151 | mag_loss += mag_l 152 | sc_loss /= len(self.stft_losses) 153 | mag_loss /= len(self.stft_losses) 154 | 155 | return sc_loss, mag_loss 156 | -------------------------------------------------------------------------------- /vocoder/models/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | from typing import Union 3 | 4 | from .parallel_wavegan import ParallelWaveGAN 5 | from .lvcgan import LVCNetWaveGAN 6 | 7 | model_list = { 8 | "ParallelWaveGAN": ParallelWaveGAN, 9 | "LVCNetWaveGAN": LVCNetWaveGAN 10 | } 11 | 12 | 13 | def create_model(name, params, device) -> Union[ParallelWaveGAN]: 14 | ''' Create model according to the model classname 15 | Args: 16 | name (str): model classname. 17 | params (dict): the parameter for create model. 18 | Return: 19 | torch.nn.Module : Model. 20 | ''' 21 | return model_list[ name ](**params).to(device) 22 | 23 | -------------------------------------------------------------------------------- /vocoder/models/lvcgan.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | import logging 4 | import math 5 | 6 | import numpy as np 7 | import torch 8 | 9 | from vocoder.layers import Conv1d 10 | from vocoder.layers import Conv1d1x1 11 | from vocoder.layers import ResidualBlock 12 | from vocoder.layers import upsample 13 | from vocoder import models 14 | 15 | from .parallel_wavegan import ParallelWaveGANDiscriminator 16 | from .lvcnet import LVCBlock 17 | 18 | 19 | class LVCNetWaveGAN(torch.nn.Module): 20 | """Parallel WaveGAN module""" 21 | 22 | def __init__(self, generator_params={}, discriminator_params={}): 23 | super().__init__() 24 | 25 | self.generator = LVCNetGenerator(**generator_params) 26 | self.discriminator = ParallelWaveGANDiscriminator(**discriminator_params) 27 | 28 | def generator_forward(self, x, c): 29 | return self.generator(x, c) 30 | 31 | def discriminator_forward(self, x): 32 | return self.discriminator(x) 33 | 34 | 35 | class LVCNetGenerator(torch.nn.Module): 36 | """Parallel WaveGAN Generator module.""" 37 | 38 | def __init__(self, 39 | in_channels=1, 40 | out_channels=1, 41 | inner_channels=8, 42 | cond_channels=80, 43 | cond_hop_length=256, 44 | lvc_block_nums=3, 45 | lvc_layers_each_block=10, 46 | lvc_kernel_size=3, 47 | kpnet_hidden_channels=64, 48 | kpnet_conv_size=1, 49 | dropout=0.0, 50 | use_weight_norm=True, 51 | ): 52 | """Initialize Parallel WaveGAN Generator module. 53 | 54 | Args: 55 | in_channels (int): Number of input channels. 56 | out_channels (int): Number of output channels. 57 | kernel_size (int): Kernel size of dilated convolution. 58 | layers (int): Number of residual block layers. 59 | stacks (int): Number of stacks i.e., dilation cycles. 60 | residual_channels (int): Number of channels in residual conv. 61 | gate_channels (int): Number of channels in gated conv. 62 | skip_channels (int): Number of channels in skip conv. 63 | aux_channels (int): Number of channels for auxiliary feature conv. 64 | aux_context_window (int): Context window size for auxiliary feature. 65 | dropout (float): Dropout rate. 0.0 means no dropout applied. 66 | bias (bool): Whether to use bias parameter in conv layer. 67 | use_weight_norm (bool): Whether to use weight norm. 68 | If set to true, it will be applied to all of the conv layers. 
69 | use_causal_conv (bool): Whether to use causal structure. 70 | upsample_conditional_features (bool): Whether to use upsampling network. 71 | upsample_net (str): Upsampling network architecture. 72 | upsample_params (dict): Upsampling network parameters. 73 | 74 | """ 75 | super().__init__() 76 | self.in_channels = in_channels 77 | self.out_channels = out_channels 78 | self.cond_channels = cond_channels 79 | self.lvc_block_nums = lvc_block_nums 80 | 81 | # define first convolution 82 | self.first_conv = Conv1d1x1(in_channels, inner_channels, bias=True) 83 | 84 | # define residual blocks 85 | self.lvc_blocks = torch.nn.ModuleList() 86 | for n in range(lvc_block_nums): 87 | lvcb = LVCBlock( 88 | in_channels=inner_channels, 89 | cond_channels=cond_channels, 90 | conv_layers=lvc_layers_each_block, 91 | conv_kernel_size=lvc_kernel_size, 92 | cond_hop_length=cond_hop_length, 93 | kpnet_hidden_channels=kpnet_hidden_channels, 94 | kpnet_conv_size=kpnet_conv_size, 95 | kpnet_dropout=dropout, 96 | ) 97 | self.lvc_blocks += [lvcb] 98 | 99 | # define output layers 100 | self.last_conv_layers = torch.nn.ModuleList([ 101 | torch.nn.ReLU(inplace=True), 102 | Conv1d1x1(inner_channels, inner_channels, bias=True), 103 | torch.nn.ReLU(inplace=True), 104 | Conv1d1x1(inner_channels, out_channels, bias=True), 105 | ]) 106 | 107 | # apply weight norm 108 | if use_weight_norm: 109 | self.apply_weight_norm() 110 | 111 | def forward(self, x, c): 112 | """Calculate forward propagation. 113 | 114 | Args: 115 | x (Tensor): Input noise signal (B, 1, T). 116 | c (Tensor): Local conditioning auxiliary features (B, C ,T'). 117 | 118 | Returns: 119 | Tensor: Output tensor (B, out_channels, T) 120 | 121 | """ 122 | 123 | x = self.first_conv(x) 124 | x = self.lvc_blocks[0]( x, c ) 125 | for n in range(1, self.lvc_block_nums): 126 | x = x + self.lvc_blocks[n]( x, c ) 127 | 128 | # apply final layers 129 | for f in self.last_conv_layers: 130 | x = f(x) 131 | 132 | return x 133 | 134 | def remove_weight_norm(self): 135 | """Remove weight normalization module from all of the layers.""" 136 | def _remove_weight_norm(m): 137 | try: 138 | logging.debug(f"Weight norm is removed from {m}.") 139 | torch.nn.utils.remove_weight_norm(m) 140 | except ValueError: # this module didn't have weight norm 141 | return 142 | 143 | self.apply(_remove_weight_norm) 144 | 145 | def apply_weight_norm(self): 146 | """Apply weight normalization module from all of the layers.""" 147 | def _apply_weight_norm(m): 148 | if isinstance(m, torch.nn.Conv1d) or isinstance(m, torch.nn.Conv2d): 149 | torch.nn.utils.weight_norm(m) 150 | logging.debug(f"Weight norm is applied to {m}.") 151 | 152 | self.apply(_apply_weight_norm) 153 | 154 | @staticmethod 155 | def _get_receptive_field_size(layers, stacks, kernel_size, 156 | dilation=lambda x: 2 ** x): 157 | assert layers % stacks == 0 158 | layers_per_cycle = layers // stacks 159 | dilations = [dilation(i % layers_per_cycle) for i in range(layers)] 160 | return (kernel_size - 1) * sum(dilations) + 1 161 | 162 | @property 163 | def receptive_field_size(self): 164 | """Return receptive field size.""" 165 | return self._get_receptive_field_size(self.layers, self.stacks, self.kernel_size) 166 | 167 | def inference(self, c=None, x=None): 168 | """Perform inference. 169 | 170 | Args: 171 | c (Union[Tensor, ndarray]): Local conditioning auxiliary features (T' ,C). 172 | x (Union[Tensor, ndarray]): Input noise signal (T, 1). 
173 | 174 | Returns: 175 | Tensor: Output tensor (T, out_channels) 176 | 177 | """ 178 | if x is not None: 179 | if not isinstance(x, torch.Tensor): 180 | x = torch.tensor(x, dtype=torch.float).to(next(self.parameters()).device) 181 | x = x.transpose(1, 0).unsqueeze(0) 182 | else: 183 | assert c is not None 184 | x = torch.randn(1, 1, len(c) * self.upsample_factor).to(next(self.parameters()).device) 185 | if c is not None: 186 | if not isinstance(c, torch.Tensor): 187 | c = torch.tensor(c, dtype=torch.float).to(next(self.parameters()).device) 188 | c = c.transpose(1, 0).unsqueeze(0) 189 | c = torch.nn.ReplicationPad1d(self.aux_context_window)(c) 190 | return self.forward(x, c).squeeze(0).transpose(1, 0) 191 | -------------------------------------------------------------------------------- /vocoder/models/lvcnet.py: -------------------------------------------------------------------------------- 1 | 2 | import math 3 | import torch 4 | import torch.nn.functional as F 5 | 6 | from vocoder.layers import Conv1d 7 | from vocoder.layers import Conv1d1x1 8 | 9 | class KernelPredictor(torch.nn.Module): 10 | ''' Kernel predictor for the location-variable convolutions 11 | ''' 12 | 13 | def __init__(self, 14 | cond_channels, 15 | conv_in_channels, 16 | conv_out_channels, 17 | conv_layers, 18 | conv_kernel_size=3, 19 | kpnet_hidden_channels=64, 20 | kpnet_conv_size=1, 21 | kpnet_dropout=0.0, 22 | kpnet_nonlinear_activation="LeakyReLU", 23 | kpnet_nonlinear_activation_params={"negative_slope":0.1} 24 | ): 25 | ''' 26 | Args: 27 | cond_channels (int): number of channel for the conditioning sequence, 28 | conv_in_channels (int): number of channel for the input sequence, 29 | conv_out_channels (int): number of channel for the output sequence, 30 | conv_layers (int): 31 | kpnet_ 32 | ''' 33 | super().__init__() 34 | 35 | self.conv_in_channels = conv_in_channels 36 | self.conv_out_channels = conv_out_channels 37 | self.conv_kernel_size = conv_kernel_size 38 | self.conv_layers = conv_layers 39 | 40 | kpnet_kernel_channels = conv_in_channels * conv_out_channels * conv_kernel_size * conv_layers 41 | kpnet_bias_channels = conv_out_channels * conv_layers 42 | 43 | padding = (kpnet_conv_size - 1)//2 44 | self.input_conv = torch.nn.Sequential( 45 | torch.nn.Conv1d(cond_channels, kpnet_hidden_channels, 5, padding=0, bias=True), 46 | getattr(torch.nn, kpnet_nonlinear_activation)(**kpnet_nonlinear_activation_params), 47 | ) 48 | 49 | self.residual_conv = torch.nn.Sequential( 50 | torch.nn.Dropout(kpnet_dropout), 51 | torch.nn.Conv1d(kpnet_hidden_channels, kpnet_hidden_channels, kpnet_conv_size, padding=padding, bias=True), 52 | getattr(torch.nn, kpnet_nonlinear_activation)(**kpnet_nonlinear_activation_params), 53 | torch.nn.Conv1d(kpnet_hidden_channels, kpnet_hidden_channels, kpnet_conv_size, padding=padding, bias=True), 54 | ) 55 | 56 | self.kernel_conv = torch.nn.Conv1d(kpnet_hidden_channels, kpnet_kernel_channels, kpnet_conv_size, padding=padding, bias=True) 57 | self.bias_conv = torch.nn.Conv1d(kpnet_hidden_channels, kpnet_bias_channels, kpnet_conv_size, padding=padding, bias=True) 58 | 59 | def forward(self, c): 60 | ''' 61 | Args: 62 | c (Tensor): the conditioning sequence (batch, cond_channels, cond_length) 63 | Returns: 64 | ''' 65 | batch, cond_channels, cond_length = c.shape 66 | 67 | c = self.input_conv( c ) 68 | c = c + self.residual_conv( c ) 69 | k = self.kernel_conv( c ) 70 | b = self.bias_conv( c ) 71 | kernels = k.contiguous().view( batch, 72 | self.conv_layers, 73 | self.conv_in_channels, 74 | 
self.conv_out_channels, 75 | self.conv_kernel_size, 76 | cond_length - 4 ) 77 | bias = b.contiguous().view( batch, 78 | self.conv_layers, 79 | self.conv_out_channels, 80 | cond_length - 4 ) 81 | return kernels, bias 82 | 83 | 84 | 85 | class LVCBlock(torch.nn.Module): 86 | ''' the location-variable convolutions 87 | ''' 88 | 89 | def __init__(self, 90 | in_channels, 91 | cond_channels, 92 | conv_layers=10, 93 | conv_kernel_size=3, 94 | cond_hop_length=256, 95 | kpnet_hidden_channels=64, 96 | kpnet_conv_size=1, 97 | kpnet_dropout=0.0 98 | ): 99 | super().__init__() 100 | 101 | self.cond_hop_length = cond_hop_length 102 | self.conv_layers = conv_layers 103 | self.conv_kernel_size = conv_kernel_size 104 | 105 | self.kernel_predictor = KernelPredictor( 106 | cond_channels=cond_channels, 107 | conv_in_channels=in_channels, 108 | conv_out_channels=2*in_channels, 109 | conv_layers=conv_layers, 110 | conv_kernel_size=conv_kernel_size, 111 | kpnet_hidden_channels=kpnet_hidden_channels, 112 | kpnet_conv_size=kpnet_conv_size, 113 | kpnet_dropout=kpnet_dropout 114 | ) 115 | 116 | def forward(self, x, c): 117 | ''' forward propagation of the location-variable convolutions. 118 | Args: 119 | x (Tensor): the input sequence (batch, in_channels, in_length) 120 | c (Tensor): the conditioning sequence (batch, cond_channels, cond_length) 121 | 122 | Returns: 123 | Tensor: the output sequence (batch, in_channels, in_length) 124 | ''' 125 | batch, in_channels, in_length = x.shape 126 | batch, cond_channels, cond_length = c.shape 127 | assert in_length == ( (cond_length - 4) * self.cond_hop_length ), ( 128 | f"the length of input ({in_length}, {cond_length}) is not match in LVCNet" ) 129 | 130 | kernels, bias = self.kernel_predictor( c ) 131 | 132 | for i in range(self.conv_layers): 133 | dilation = 2**i 134 | k = kernels[ :, i, :, :, :, : ] 135 | b = bias[ :, i, :, : ] 136 | x = self.location_variable_convolution( x, k, b, dilation, self.cond_hop_length ) 137 | x = torch.sigmoid( x[ :, :in_channels, : ] ) * torch.tanh( x[ :, in_channels:, : ] ) 138 | return x 139 | 140 | 141 | def location_variable_convolution(self, x, kernel, bias, dilation, hop_size): 142 | ''' perform location-variable convolution operation on the input sequence (x) using the local convolution kernl. 143 | Time: 414 μs ± 309 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each), test on NVIDIA V100. 144 | 145 | Args: 146 | x (Tensor): the input sequence (batch, in_channels, in_length). 147 | kernel (Tensor): the local convolution kernel (batch, in_channel, out_channels, kernel_size, kernel_length) 148 | bias (Tensor): the bias for the local convolution (batch, out_channels, kernel_length) 149 | dilation (int): the dilation of convolution. 150 | hop_size (int): the hop_size of the conditioning sequence. 151 | 152 | Returns: 153 | (Tensor): the output sequence after performing local convolution. (batch, out_channels, in_length). 
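        Example (illustrative shapes; out_channels is 2 * in_channels, as used in LVCBlock):
            x      = torch.randn(1, 8, 48 * 256)     # in_length = kernel_length * hop_size
            kernel = torch.randn(1, 8, 16, 3, 48)    # per-frame local convolution kernels
            bias   = torch.randn(1, 16, 48)
            y = self.location_variable_convolution(x, kernel, bias, dilation=1, hop_size=256)
            # y: (1, 16, 48 * 256)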
154 | ''' 155 | batch, in_channels, in_length = x.shape 156 | batch, in_channels, out_channels, kernel_size, kernel_length = kernel.shape 157 | 158 | assert in_length == (kernel_length*hop_size), "length of (x, kernel) is not matched" 159 | 160 | padding = dilation * int( (kernel_size - 1) / 2 ) 161 | x = F.pad( x, (padding, padding), 'constant', 0 ) # (batch, in_channels, in_length + 2*padding) 162 | x = x.unfold( 2, hop_size + 2 * padding, hop_size ) # (batch, in_channels, kernel_length, hop_size + 2*padding) 163 | 164 | if hop_size < dilation: 165 | x = F.pad( x, (0, dilation), 'constant', 0 ) 166 | x = x.unfold(3, dilation, dilation) # (batch, in_channels, kernel_length, (hop_size + 2*padding)/dilation, dilation) 167 | x = x[ :, :, :, :, :hop_size ] 168 | x = x.transpose( 3, 4 ) # (batch, in_channels, kernel_length, dilation, (hop_size + 2*padding)/dilation) 169 | x = x.unfold( 4, kernel_size, 1 ) # (batch, in_channels, kernel_length, dilation, _, kernel_size) 170 | 171 | o = torch.einsum( 'bildsk,biokl->bolsd', x, kernel ) 172 | o = o + bias.unsqueeze(-1).unsqueeze(-1) 173 | o = o.contiguous().view(batch, out_channels, -1) 174 | return o 175 | 176 | 177 | 178 | 179 | -------------------------------------------------------------------------------- /vocoder/models/melgan.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Copyright 2020 Tomoki Hayashi 4 | # MIT License (https://opensource.org/licenses/MIT) 5 | 6 | """MelGAN Modules.""" 7 | 8 | import logging 9 | 10 | import numpy as np 11 | import torch 12 | 13 | from vocoder.layers import CausalConv1d 14 | from vocoder.layers import CausalConvTranspose1d 15 | from vocoder.layers import ResidualStack 16 | 17 | 18 | class MelGANGenerator(torch.nn.Module): 19 | """MelGAN generator module.""" 20 | 21 | def __init__(self, 22 | in_channels=80, 23 | out_channels=1, 24 | kernel_size=7, 25 | channels=512, 26 | bias=True, 27 | upsample_scales=[8, 8, 2, 2], 28 | stack_kernel_size=3, 29 | stacks=3, 30 | nonlinear_activation="LeakyReLU", 31 | nonlinear_activation_params={"negative_slope": 0.2}, 32 | pad="ReflectionPad1d", 33 | pad_params={}, 34 | use_final_nonlinear_activation=True, 35 | use_weight_norm=True, 36 | use_causal_conv=False, 37 | ): 38 | """Initialize MelGANGenerator module. 39 | 40 | Args: 41 | in_channels (int): Number of input channels. 42 | out_channels (int): Number of output channels. 43 | kernel_size (int): Kernel size of initial and final conv layer. 44 | channels (int): Initial number of channels for conv layer. 45 | bias (bool): Whether to add bias parameter in convolution layers. 46 | upsample_scales (list): List of upsampling scales. 47 | stack_kernel_size (int): Kernel size of dilated conv layers in residual stack. 48 | stacks (int): Number of stacks in a single residual stack. 49 | nonlinear_activation (str): Activation function module name. 50 | nonlinear_activation_params (dict): Hyperparameters for activation function. 51 | pad (str): Padding function module name before dilated convolution layer. 52 | pad_params (dict): Hyperparameters for padding function. 53 | use_final_nonlinear_activation (torch.nn.Module): Activation function for the final layer. 54 | use_weight_norm (bool): Whether to use weight norm. 55 | If set to true, it will be applied to all of the conv layers. 56 | use_causal_conv (bool): Whether to use causal convolution. 
57 | 58 | """ 59 | super(MelGANGenerator, self).__init__() 60 | 61 | # check hyper parameters is valid 62 | assert channels >= np.prod(upsample_scales) 63 | assert channels % (2 ** len(upsample_scales)) == 0 64 | if not use_causal_conv: 65 | assert (kernel_size - 1) % 2 == 0, "Not support even number kernel size." 66 | 67 | # add initial layer 68 | layers = [] 69 | if not use_causal_conv: 70 | layers += [ 71 | getattr(torch.nn, pad)((kernel_size - 1) // 2, **pad_params), 72 | torch.nn.Conv1d(in_channels, channels, kernel_size, bias=bias), 73 | ] 74 | else: 75 | layers += [ 76 | CausalConv1d(in_channels, channels, kernel_size, 77 | bias=bias, pad=pad, pad_params=pad_params), 78 | ] 79 | 80 | for i, upsample_scale in enumerate(upsample_scales): 81 | # add upsampling layer 82 | layers += [getattr(torch.nn, nonlinear_activation)(**nonlinear_activation_params)] 83 | if not use_causal_conv: 84 | layers += [ 85 | torch.nn.ConvTranspose1d( 86 | channels // (2 ** i), 87 | channels // (2 ** (i + 1)), 88 | upsample_scale * 2, 89 | stride=upsample_scale, 90 | padding=upsample_scale // 2 + upsample_scale % 2, 91 | output_padding=upsample_scale % 2, 92 | bias=bias, 93 | ) 94 | ] 95 | else: 96 | layers += [ 97 | CausalConvTranspose1d( 98 | channels // (2 ** i), 99 | channels // (2 ** (i + 1)), 100 | upsample_scale * 2, 101 | stride=upsample_scale, 102 | bias=bias, 103 | ) 104 | ] 105 | 106 | # add residual stack 107 | for j in range(stacks): 108 | layers += [ 109 | ResidualStack( 110 | kernel_size=stack_kernel_size, 111 | channels=channels // (2 ** (i + 1)), 112 | dilation=stack_kernel_size ** j, 113 | bias=bias, 114 | nonlinear_activation=nonlinear_activation, 115 | nonlinear_activation_params=nonlinear_activation_params, 116 | pad=pad, 117 | pad_params=pad_params, 118 | use_causal_conv=use_causal_conv, 119 | ) 120 | ] 121 | 122 | # add final layer 123 | layers += [getattr(torch.nn, nonlinear_activation)(**nonlinear_activation_params)] 124 | if not use_causal_conv: 125 | layers += [ 126 | getattr(torch.nn, pad)((kernel_size - 1) // 2, **pad_params), 127 | torch.nn.Conv1d(channels // (2 ** (i + 1)), out_channels, kernel_size, bias=bias), 128 | ] 129 | else: 130 | layers += [ 131 | CausalConv1d(channels // (2 ** (i + 1)), out_channels, kernel_size, 132 | bias=bias, pad=pad, pad_params=pad_params), 133 | ] 134 | if use_final_nonlinear_activation: 135 | layers += [torch.nn.Tanh()] 136 | 137 | # define the model as a single function 138 | self.melgan = torch.nn.Sequential(*layers) 139 | 140 | # apply weight norm 141 | if use_weight_norm: 142 | self.apply_weight_norm() 143 | 144 | # reset parameters 145 | self.reset_parameters() 146 | 147 | # initialize pqmf for inference 148 | self.pqmf = None 149 | 150 | def forward(self, c): 151 | """Calculate forward propagation. 152 | 153 | Args: 154 | c (Tensor): Input tensor (B, channels, T). 155 | 156 | Returns: 157 | Tensor: Output tensor (B, 1, T ** prod(upsample_scales)). 
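            Example (editor's note and sketch; the output length is T multiplied by
            prod(upsample_scales), so with the default scales [8, 8, 2, 2] every input
            frame yields 256 output samples):
                >>> g = MelGANGenerator()
                >>> g(torch.randn(2, 80, 40)).shape
                torch.Size([2, 1, 10240])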
158 | 159 | """ 160 | return self.melgan(c) 161 | 162 | def remove_weight_norm(self): 163 | """Remove weight normalization module from all of the layers.""" 164 | def _remove_weight_norm(m): 165 | try: 166 | logging.debug(f"Weight norm is removed from {m}.") 167 | torch.nn.utils.remove_weight_norm(m) 168 | except ValueError: # this module didn't have weight norm 169 | return 170 | 171 | self.apply(_remove_weight_norm) 172 | 173 | def apply_weight_norm(self): 174 | """Apply weight normalization module from all of the layers.""" 175 | def _apply_weight_norm(m): 176 | if isinstance(m, torch.nn.Conv1d) or isinstance(m, torch.nn.ConvTranspose1d): 177 | torch.nn.utils.weight_norm(m) 178 | logging.debug(f"Weight norm is applied to {m}.") 179 | 180 | self.apply(_apply_weight_norm) 181 | 182 | def reset_parameters(self): 183 | """Reset parameters. 184 | 185 | This initialization follows official implementation manner. 186 | https://github.com/descriptinc/melgan-neurips/blob/master/mel2wav/modules.py 187 | 188 | """ 189 | def _reset_parameters(m): 190 | if isinstance(m, torch.nn.Conv1d) or isinstance(m, torch.nn.ConvTranspose1d): 191 | m.weight.data.normal_(0.0, 0.02) 192 | logging.debug(f"Reset parameters in {m}.") 193 | 194 | self.apply(_reset_parameters) 195 | 196 | def inference(self, c): 197 | """Perform inference. 198 | 199 | Args: 200 | c (Union[Tensor, ndarray]): Input tensor (T, in_channels). 201 | 202 | Returns: 203 | Tensor: Output tensor (T ** prod(upsample_scales), out_channels). 204 | 205 | """ 206 | if not isinstance(c, torch.Tensor): 207 | c = torch.tensor(c, dtype=torch.float).to(next(self.parameters()).device) 208 | c = self.melgan(c.transpose(1, 0).unsqueeze(0)) 209 | if self.pqmf is not None: 210 | c = self.pqmf.synthesis(c) 211 | return c.squeeze(0).transpose(1, 0) 212 | 213 | 214 | class MelGANDiscriminator(torch.nn.Module): 215 | """MelGAN discriminator module.""" 216 | 217 | def __init__(self, 218 | in_channels=1, 219 | out_channels=1, 220 | kernel_sizes=[5, 3], 221 | channels=16, 222 | max_downsample_channels=1024, 223 | bias=True, 224 | downsample_scales=[4, 4, 4, 4], 225 | nonlinear_activation="LeakyReLU", 226 | nonlinear_activation_params={"negative_slope": 0.2}, 227 | pad="ReflectionPad1d", 228 | pad_params={}, 229 | ): 230 | """Initilize MelGAN discriminator module. 231 | 232 | Args: 233 | in_channels (int): Number of input channels. 234 | out_channels (int): Number of output channels. 235 | kernel_sizes (list): List of two kernel sizes. The prod will be used for the first conv layer, 236 | and the first and the second kernel sizes will be used for the last two layers. 237 | For example if kernel_sizes = [5, 3], the first layer kernel size will be 5 * 3 = 15, 238 | the last two layers' kernel size will be 5 and 3, respectively. 239 | channels (int): Initial number of channels for conv layer. 240 | max_downsample_channels (int): Maximum number of channels for downsampling layers. 241 | bias (bool): Whether to add bias parameter in convolution layers. 242 | downsample_scales (list): List of downsampling scales. 243 | nonlinear_activation (str): Activation function module name. 244 | nonlinear_activation_params (dict): Hyperparameters for activation function. 245 | pad (str): Padding function module name before dilated convolution layer. 246 | pad_params (dict): Hyperparameters for padding function. 
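            Example (editor's sketch; with the default kernel_sizes=[5, 3] the first conv
            uses kernel size 5 * 3 = 15 and the last two convs use 5 and 3, and forward()
            returns the intermediate output of every layer as a list, 7 tensors with the
            default configuration):
                >>> d = MelGANDiscriminator()
                >>> feats = d(torch.randn(1, 1, 4096))
                >>> len(feats)                         # 1 input layer + 4 downsample + 2 output layers
                7
                >>> feats[-1].shape                    # 4096 / (4 * 4 * 4 * 4) = 16 scores
                torch.Size([1, 1, 16])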
247 | 248 | """ 249 | super(MelGANDiscriminator, self).__init__() 250 | self.layers = torch.nn.ModuleList() 251 | 252 | # check kernel size is valid 253 | assert len(kernel_sizes) == 2 254 | assert kernel_sizes[0] % 2 == 1 255 | assert kernel_sizes[1] % 2 == 1 256 | 257 | # add first layer 258 | self.layers += [ 259 | torch.nn.Sequential( 260 | getattr(torch.nn, pad)((np.prod(kernel_sizes) - 1) // 2, **pad_params), 261 | torch.nn.Conv1d(in_channels, channels, np.prod(kernel_sizes), bias=bias), 262 | getattr(torch.nn, nonlinear_activation)(**nonlinear_activation_params), 263 | ) 264 | ] 265 | 266 | # add downsample layers 267 | in_chs = channels 268 | for downsample_scale in downsample_scales: 269 | out_chs = min(in_chs * downsample_scale, max_downsample_channels) 270 | self.layers += [ 271 | torch.nn.Sequential( 272 | torch.nn.Conv1d( 273 | in_chs, out_chs, 274 | kernel_size=downsample_scale * 10 + 1, 275 | stride=downsample_scale, 276 | padding=downsample_scale * 5, 277 | groups=in_chs // 4, 278 | bias=bias, 279 | ), 280 | getattr(torch.nn, nonlinear_activation)(**nonlinear_activation_params), 281 | ) 282 | ] 283 | in_chs = out_chs 284 | 285 | # add final layers 286 | out_chs = min(in_chs * 2, max_downsample_channels) 287 | self.layers += [ 288 | torch.nn.Sequential( 289 | torch.nn.Conv1d( 290 | in_chs, out_chs, kernel_sizes[0], 291 | padding=(kernel_sizes[0] - 1) // 2, 292 | bias=bias, 293 | ), 294 | getattr(torch.nn, nonlinear_activation)(**nonlinear_activation_params), 295 | ) 296 | ] 297 | self.layers += [ 298 | torch.nn.Conv1d( 299 | out_chs, out_channels, kernel_sizes[1], 300 | padding=(kernel_sizes[1] - 1) // 2, 301 | bias=bias, 302 | ), 303 | ] 304 | 305 | def forward(self, x): 306 | """Calculate forward propagation. 307 | 308 | Args: 309 | x (Tensor): Input noise signal (B, 1, T). 310 | 311 | Returns: 312 | List: List of output tensors of each layer. 313 | 314 | """ 315 | outs = [] 316 | for f in self.layers: 317 | x = f(x) 318 | outs += [x] 319 | 320 | return outs 321 | 322 | 323 | class MelGANMultiScaleDiscriminator(torch.nn.Module): 324 | """MelGAN multi-scale discriminator module.""" 325 | 326 | def __init__(self, 327 | in_channels=1, 328 | out_channels=1, 329 | scales=3, 330 | downsample_pooling="AvgPool1d", 331 | # follow the official implementation setting 332 | downsample_pooling_params={ 333 | "kernel_size": 4, 334 | "stride": 2, 335 | "padding": 1, 336 | "count_include_pad": False, 337 | }, 338 | kernel_sizes=[5, 3], 339 | channels=16, 340 | max_downsample_channels=1024, 341 | bias=True, 342 | downsample_scales=[4, 4, 4, 4], 343 | nonlinear_activation="LeakyReLU", 344 | nonlinear_activation_params={"negative_slope": 0.2}, 345 | pad="ReflectionPad1d", 346 | pad_params={}, 347 | use_weight_norm=True, 348 | ): 349 | """Initilize MelGAN multi-scale discriminator module. 350 | 351 | Args: 352 | in_channels (int): Number of input channels. 353 | out_channels (int): Number of output channels. 354 | downsample_pooling (str): Pooling module name for downsampling of the inputs. 355 | downsample_pooling_params (dict): Parameters for the above pooling module. 356 | kernel_sizes (list): List of two kernel sizes. The sum will be used for the first conv layer, 357 | and the first and the second kernel sizes will be used for the last two layers. 358 | channels (int): Initial number of channels for conv layer. 359 | max_downsample_channels (int): Maximum number of channels for downsampling layers. 360 | bias (bool): Whether to add bias parameter in convolution layers. 
361 | downsample_scales (list): List of downsampling scales. 362 | nonlinear_activation (str): Activation function module name. 363 | nonlinear_activation_params (dict): Hyperparameters for activation function. 364 | pad (str): Padding function module name before dilated convolution layer. 365 | pad_params (dict): Hyperparameters for padding function. 366 | use_causal_conv (bool): Whether to use causal convolution. 367 | 368 | """ 369 | super(MelGANMultiScaleDiscriminator, self).__init__() 370 | self.discriminators = torch.nn.ModuleList() 371 | 372 | # add discriminators 373 | for _ in range(scales): 374 | self.discriminators += [ 375 | MelGANDiscriminator( 376 | in_channels=in_channels, 377 | out_channels=out_channels, 378 | kernel_sizes=kernel_sizes, 379 | channels=channels, 380 | max_downsample_channels=max_downsample_channels, 381 | bias=bias, 382 | downsample_scales=downsample_scales, 383 | nonlinear_activation=nonlinear_activation, 384 | nonlinear_activation_params=nonlinear_activation_params, 385 | pad=pad, 386 | pad_params=pad_params, 387 | ) 388 | ] 389 | self.pooling = getattr(torch.nn, downsample_pooling)(**downsample_pooling_params) 390 | 391 | # apply weight norm 392 | if use_weight_norm: 393 | self.apply_weight_norm() 394 | 395 | # reset parameters 396 | self.reset_parameters() 397 | 398 | def forward(self, x): 399 | """Calculate forward propagation. 400 | 401 | Args: 402 | x (Tensor): Input noise signal (B, 1, T). 403 | 404 | Returns: 405 | List: List of list of each discriminator outputs, which consists of each layer output tensors. 406 | 407 | """ 408 | outs = [] 409 | for f in self.discriminators: 410 | outs += [f(x)] 411 | x = self.pooling(x) 412 | 413 | return outs 414 | 415 | def remove_weight_norm(self): 416 | """Remove weight normalization module from all of the layers.""" 417 | def _remove_weight_norm(m): 418 | try: 419 | logging.debug(f"Weight norm is removed from {m}.") 420 | torch.nn.utils.remove_weight_norm(m) 421 | except ValueError: # this module didn't have weight norm 422 | return 423 | 424 | self.apply(_remove_weight_norm) 425 | 426 | def apply_weight_norm(self): 427 | """Apply weight normalization module from all of the layers.""" 428 | def _apply_weight_norm(m): 429 | if isinstance(m, torch.nn.Conv1d) or isinstance(m, torch.nn.ConvTranspose1d): 430 | torch.nn.utils.weight_norm(m) 431 | logging.debug(f"Weight norm is applied to {m}.") 432 | 433 | self.apply(_apply_weight_norm) 434 | 435 | def reset_parameters(self): 436 | """Reset parameters. 437 | 438 | This initialization follows official implementation manner. 
439 | https://github.com/descriptinc/melgan-neurips/blob/master/mel2wav/modules.py 440 | 441 | """ 442 | def _reset_parameters(m): 443 | if isinstance(m, torch.nn.Conv1d) or isinstance(m, torch.nn.ConvTranspose1d): 444 | m.weight.data.normal_(0.0, 0.02) 445 | logging.debug(f"Reset parameters in {m}.") 446 | 447 | self.apply(_reset_parameters) 448 | -------------------------------------------------------------------------------- /vocoder/models/parallel_wavegan.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Copyright 2019 Tomoki Hayashi 4 | # MIT License (https://opensource.org/licenses/MIT) 5 | 6 | """Parallel WaveGAN Modules.""" 7 | 8 | import logging 9 | import math 10 | 11 | import numpy as np 12 | import torch 13 | 14 | from vocoder.layers import Conv1d 15 | from vocoder.layers import Conv1d1x1 16 | from vocoder.layers import ResidualBlock 17 | from vocoder.layers import upsample 18 | from vocoder import models 19 | 20 | class ParallelWaveGAN(torch.nn.Module): 21 | """Parallel WaveGAN module""" 22 | 23 | def __init__(self, generator_params={}, discriminator_params={}): 24 | super().__init__() 25 | 26 | self.generator = ParallelWaveGANGenerator(**generator_params) 27 | self.discriminator = ParallelWaveGANDiscriminator(**discriminator_params) 28 | 29 | def generator_forward(self, x, c): 30 | return self.generator(x, c) 31 | 32 | def discriminator_forward(self, x): 33 | return self.discriminator(x) 34 | 35 | 36 | class ParallelWaveGANGenerator(torch.nn.Module): 37 | """Parallel WaveGAN Generator module.""" 38 | 39 | def __init__(self, 40 | in_channels=1, 41 | out_channels=1, 42 | kernel_size=3, 43 | layers=30, 44 | stacks=3, 45 | residual_channels=64, 46 | gate_channels=128, 47 | skip_channels=64, 48 | aux_channels=80, 49 | aux_context_window=2, 50 | dropout=0.0, 51 | bias=True, 52 | use_weight_norm=True, 53 | use_causal_conv=False, 54 | upsample_conditional_features=True, 55 | upsample_net="ConvInUpsampleNetwork", 56 | upsample_params={"upsample_scales": [4, 4, 4, 4]}, 57 | ): 58 | """Initialize Parallel WaveGAN Generator module. 59 | 60 | Args: 61 | in_channels (int): Number of input channels. 62 | out_channels (int): Number of output channels. 63 | kernel_size (int): Kernel size of dilated convolution. 64 | layers (int): Number of residual block layers. 65 | stacks (int): Number of stacks i.e., dilation cycles. 66 | residual_channels (int): Number of channels in residual conv. 67 | gate_channels (int): Number of channels in gated conv. 68 | skip_channels (int): Number of channels in skip conv. 69 | aux_channels (int): Number of channels for auxiliary feature conv. 70 | aux_context_window (int): Context window size for auxiliary feature. 71 | dropout (float): Dropout rate. 0.0 means no dropout applied. 72 | bias (bool): Whether to use bias parameter in conv layer. 73 | use_weight_norm (bool): Whether to use weight norm. 74 | If set to true, it will be applied to all of the conv layers. 75 | use_causal_conv (bool): Whether to use causal structure. 76 | upsample_conditional_features (bool): Whether to use upsampling network. 77 | upsample_net (str): Upsampling network architecture. 78 | upsample_params (dict): Upsampling network parameters. 
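            Example (editor's sketch; shapes assume the default upsample_scales
            [4, 4, 4, 4], i.e. an upsampling factor of 256, and assume the stock
            ConvInUpsampleNetwork behaviour of consuming aux_context_window=2 extra
            frames on each side of the conditioning input):
                >>> g = ParallelWaveGANGenerator()
                >>> c = torch.randn(1, 80, 20 + 2 * 2)  # (B, aux_channels, T' + 2 * aux_context_window)
                >>> x = torch.randn(1, 1, 20 * 256)     # (B, 1, T) noise, T = T' * upsample_factor
                >>> g(x, c).shape
                torch.Size([1, 1, 5120])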
79 | 80 | """ 81 | super(ParallelWaveGANGenerator, self).__init__() 82 | self.in_channels = in_channels 83 | self.out_channels = out_channels 84 | self.aux_channels = aux_channels 85 | self.aux_context_window = aux_context_window 86 | self.layers = layers 87 | self.stacks = stacks 88 | self.kernel_size = kernel_size 89 | 90 | # check the number of layers and stacks 91 | assert layers % stacks == 0 92 | layers_per_stack = layers // stacks 93 | 94 | # define first convolution 95 | self.first_conv = Conv1d1x1(in_channels, residual_channels, bias=True) 96 | 97 | # define conv + upsampling network 98 | if upsample_conditional_features: 99 | upsample_params.update({ 100 | "use_causal_conv": use_causal_conv, 101 | }) 102 | if upsample_net == "MelGANGenerator": 103 | assert aux_context_window == 0 104 | upsample_params.update({ 105 | "use_weight_norm": False, # not to apply twice 106 | "use_final_nonlinear_activation": False, 107 | }) 108 | self.upsample_net = getattr(models, upsample_net)(**upsample_params) 109 | else: 110 | if upsample_net == "ConvInUpsampleNetwork": 111 | upsample_params.update({ 112 | "aux_channels": aux_channels, 113 | "aux_context_window": aux_context_window, 114 | }) 115 | self.upsample_net = getattr(upsample, upsample_net)(**upsample_params) 116 | self.upsample_factor = np.prod(upsample_params["upsample_scales"]) 117 | else: 118 | self.upsample_net = None 119 | self.upsample_factor = 1 120 | 121 | # define residual blocks 122 | self.conv_layers = torch.nn.ModuleList() 123 | for layer in range(layers): 124 | dilation = 2 ** (layer % layers_per_stack) 125 | conv = ResidualBlock( 126 | kernel_size=kernel_size, 127 | residual_channels=residual_channels, 128 | gate_channels=gate_channels, 129 | skip_channels=skip_channels, 130 | aux_channels=aux_channels, 131 | dilation=dilation, 132 | dropout=dropout, 133 | bias=bias, 134 | use_causal_conv=use_causal_conv, 135 | ) 136 | self.conv_layers += [conv] 137 | 138 | # define output layers 139 | self.last_conv_layers = torch.nn.ModuleList([ 140 | torch.nn.ReLU(inplace=True), 141 | Conv1d1x1(skip_channels, skip_channels, bias=True), 142 | torch.nn.ReLU(inplace=True), 143 | Conv1d1x1(skip_channels, out_channels, bias=True), 144 | ]) 145 | 146 | # apply weight norm 147 | if use_weight_norm: 148 | self.apply_weight_norm() 149 | 150 | def forward(self, x, c): 151 | """Calculate forward propagation. 152 | 153 | Args: 154 | x (Tensor): Input noise signal (B, 1, T). 155 | c (Tensor): Local conditioning auxiliary features (B, C ,T'). 
156 | 157 | Returns: 158 | Tensor: Output tensor (B, out_channels, T) 159 | 160 | """ 161 | # perform upsampling 162 | if c is not None and self.upsample_net is not None: 163 | c = self.upsample_net(c) 164 | assert c.size(-1) == x.size(-1), f"c {c.shape}, x {x.shape}" 165 | 166 | # encode to hidden representation 167 | x = self.first_conv(x) 168 | skips = 0 169 | for f in self.conv_layers: 170 | x, h = f(x, c) 171 | skips += h 172 | skips *= math.sqrt(1.0 / len(self.conv_layers)) 173 | 174 | # apply final layers 175 | x = skips 176 | for f in self.last_conv_layers: 177 | x = f(x) 178 | 179 | return x 180 | 181 | def remove_weight_norm(self): 182 | """Remove weight normalization module from all of the layers.""" 183 | def _remove_weight_norm(m): 184 | try: 185 | logging.debug(f"Weight norm is removed from {m}.") 186 | torch.nn.utils.remove_weight_norm(m) 187 | except ValueError: # this module didn't have weight norm 188 | return 189 | 190 | self.apply(_remove_weight_norm) 191 | 192 | def apply_weight_norm(self): 193 | """Apply weight normalization module from all of the layers.""" 194 | def _apply_weight_norm(m): 195 | if isinstance(m, torch.nn.Conv1d) or isinstance(m, torch.nn.Conv2d): 196 | torch.nn.utils.weight_norm(m) 197 | logging.debug(f"Weight norm is applied to {m}.") 198 | 199 | self.apply(_apply_weight_norm) 200 | 201 | @staticmethod 202 | def _get_receptive_field_size(layers, stacks, kernel_size, 203 | dilation=lambda x: 2 ** x): 204 | assert layers % stacks == 0 205 | layers_per_cycle = layers // stacks 206 | dilations = [dilation(i % layers_per_cycle) for i in range(layers)] 207 | return (kernel_size - 1) * sum(dilations) + 1 208 | 209 | @property 210 | def receptive_field_size(self): 211 | """Return receptive field size.""" 212 | return self._get_receptive_field_size(self.layers, self.stacks, self.kernel_size) 213 | 214 | def inference(self, c=None, x=None): 215 | """Perform inference. 216 | 217 | Args: 218 | c (Union[Tensor, ndarray]): Local conditioning auxiliary features (T' ,C). 219 | x (Union[Tensor, ndarray]): Input noise signal (T, 1). 220 | 221 | Returns: 222 | Tensor: Output tensor (T, out_channels) 223 | 224 | """ 225 | if x is not None: 226 | if not isinstance(x, torch.Tensor): 227 | x = torch.tensor(x, dtype=torch.float).to(next(self.parameters()).device) 228 | x = x.transpose(1, 0).unsqueeze(0) 229 | else: 230 | assert c is not None 231 | x = torch.randn(1, 1, len(c) * self.upsample_factor).to(next(self.parameters()).device) 232 | if c is not None: 233 | if not isinstance(c, torch.Tensor): 234 | c = torch.tensor(c, dtype=torch.float).to(next(self.parameters()).device) 235 | c = c.transpose(1, 0).unsqueeze(0) 236 | c = torch.nn.ReplicationPad1d(self.aux_context_window)(c) 237 | return self.forward(x, c).squeeze(0).transpose(1, 0) 238 | 239 | 240 | class ParallelWaveGANDiscriminator(torch.nn.Module): 241 | """Parallel WaveGAN Discriminator module.""" 242 | 243 | def __init__(self, 244 | in_channels=1, 245 | out_channels=1, 246 | kernel_size=3, 247 | layers=10, 248 | conv_channels=64, 249 | dilation_factor=1, 250 | nonlinear_activation="LeakyReLU", 251 | nonlinear_activation_params={"negative_slope": 0.2}, 252 | bias=True, 253 | use_weight_norm=True, 254 | ): 255 | """Initialize Parallel WaveGAN Discriminator module. 256 | 257 | Args: 258 | in_channels (int): Number of input channels. 259 | out_channels (int): Number of output channels. 260 | kernel_size (int): Number of output channels. 261 | layers (int): Number of conv layers. 
262 | conv_channels (int): Number of chnn layers. 263 | dilation_factor (int): Dilation factor. For example, if dilation_factor = 2, 264 | the dilation will be 2, 4, 8, ..., and so on. 265 | nonlinear_activation (str): Nonlinear function after each conv. 266 | nonlinear_activation_params (dict): Nonlinear function parameters 267 | bias (bool): Whether to use bias parameter in conv. 268 | use_weight_norm (bool) Whether to use weight norm. 269 | If set to true, it will be applied to all of the conv layers. 270 | 271 | """ 272 | super(ParallelWaveGANDiscriminator, self).__init__() 273 | assert (kernel_size - 1) % 2 == 0, "Not support even number kernel size." 274 | assert dilation_factor > 0, "Dilation factor must be > 0." 275 | self.conv_layers = torch.nn.ModuleList() 276 | conv_in_channels = in_channels 277 | for i in range(layers - 1): 278 | if i == 0: 279 | dilation = 1 280 | else: 281 | dilation = i if dilation_factor == 1 else dilation_factor ** i 282 | conv_in_channels = conv_channels 283 | padding = (kernel_size - 1) // 2 * dilation 284 | conv_layer = [ 285 | Conv1d(conv_in_channels, conv_channels, 286 | kernel_size=kernel_size, padding=padding, 287 | dilation=dilation, bias=bias), 288 | getattr(torch.nn, nonlinear_activation)(inplace=True, **nonlinear_activation_params) 289 | ] 290 | self.conv_layers += conv_layer 291 | padding = (kernel_size - 1) // 2 292 | last_conv_layer = Conv1d( 293 | conv_in_channels, out_channels, 294 | kernel_size=kernel_size, padding=padding, bias=bias) 295 | self.conv_layers += [last_conv_layer] 296 | 297 | # apply weight norm 298 | if use_weight_norm: 299 | self.apply_weight_norm() 300 | 301 | def forward(self, x): 302 | """Calculate forward propagation. 303 | 304 | Args: 305 | x (Tensor): Input noise signal (B, 1, T). 306 | 307 | Returns: 308 | Tensor: Output tensor (B, 1, T) 309 | 310 | """ 311 | for f in self.conv_layers: 312 | x = f(x) 313 | return x 314 | 315 | def apply_weight_norm(self): 316 | """Apply weight normalization module from all of the layers.""" 317 | def _apply_weight_norm(m): 318 | if isinstance(m, torch.nn.Conv1d) or isinstance(m, torch.nn.Conv2d): 319 | torch.nn.utils.weight_norm(m) 320 | logging.debug(f"Weight norm is applied to {m}.") 321 | 322 | self.apply(_apply_weight_norm) 323 | 324 | def remove_weight_norm(self): 325 | """Remove weight normalization module from all of the layers.""" 326 | def _remove_weight_norm(m): 327 | try: 328 | logging.debug(f"Weight norm is removed from {m}.") 329 | torch.nn.utils.remove_weight_norm(m) 330 | except ValueError: # this module didn't have weight norm 331 | return 332 | 333 | self.apply(_remove_weight_norm) 334 | 335 | 336 | class ResidualParallelWaveGANDiscriminator(torch.nn.Module): 337 | """Parallel WaveGAN Discriminator module.""" 338 | 339 | def __init__(self, 340 | in_channels=1, 341 | out_channels=1, 342 | kernel_size=3, 343 | layers=30, 344 | stacks=3, 345 | residual_channels=64, 346 | gate_channels=128, 347 | skip_channels=64, 348 | dropout=0.0, 349 | bias=True, 350 | use_weight_norm=True, 351 | use_causal_conv=False, 352 | nonlinear_activation="LeakyReLU", 353 | nonlinear_activation_params={"negative_slope": 0.2}, 354 | ): 355 | """Initialize Parallel WaveGAN Discriminator module. 356 | 357 | Args: 358 | in_channels (int): Number of input channels. 359 | out_channels (int): Number of output channels. 360 | kernel_size (int): Kernel size of dilated convolution. 361 | layers (int): Number of residual block layers. 362 | stacks (int): Number of stacks i.e., dilation cycles. 
363 | residual_channels (int): Number of channels in residual conv. 364 | gate_channels (int): Number of channels in gated conv. 365 | skip_channels (int): Number of channels in skip conv. 366 | dropout (float): Dropout rate. 0.0 means no dropout applied. 367 | bias (bool): Whether to use bias parameter in conv. 368 | use_weight_norm (bool): Whether to use weight norm. 369 | If set to true, it will be applied to all of the conv layers. 370 | use_causal_conv (bool): Whether to use causal structure. 371 | nonlinear_activation_params (dict): Nonlinear function parameters 372 | 373 | """ 374 | super(ResidualParallelWaveGANDiscriminator, self).__init__() 375 | assert (kernel_size - 1) % 2 == 0, "Not support even number kernel size." 376 | 377 | self.in_channels = in_channels 378 | self.out_channels = out_channels 379 | self.layers = layers 380 | self.stacks = stacks 381 | self.kernel_size = kernel_size 382 | 383 | # check the number of layers and stacks 384 | assert layers % stacks == 0 385 | layers_per_stack = layers // stacks 386 | 387 | # define first convolution 388 | self.first_conv = torch.nn.Sequential( 389 | Conv1d1x1(in_channels, residual_channels, bias=True), 390 | getattr(torch.nn, nonlinear_activation)( 391 | inplace=True, **nonlinear_activation_params), 392 | ) 393 | 394 | # define residual blocks 395 | self.conv_layers = torch.nn.ModuleList() 396 | for layer in range(layers): 397 | dilation = 2 ** (layer % layers_per_stack) 398 | conv = ResidualBlock( 399 | kernel_size=kernel_size, 400 | residual_channels=residual_channels, 401 | gate_channels=gate_channels, 402 | skip_channels=skip_channels, 403 | aux_channels=-1, 404 | dilation=dilation, 405 | dropout=dropout, 406 | bias=bias, 407 | use_causal_conv=use_causal_conv, 408 | ) 409 | self.conv_layers += [conv] 410 | 411 | # define output layers 412 | self.last_conv_layers = torch.nn.ModuleList([ 413 | getattr(torch.nn, nonlinear_activation)( 414 | inplace=True, **nonlinear_activation_params), 415 | Conv1d1x1(skip_channels, skip_channels, bias=True), 416 | getattr(torch.nn, nonlinear_activation)( 417 | inplace=True, **nonlinear_activation_params), 418 | Conv1d1x1(skip_channels, out_channels, bias=True), 419 | ]) 420 | 421 | # apply weight norm 422 | if use_weight_norm: 423 | self.apply_weight_norm() 424 | 425 | def forward(self, x): 426 | """Calculate forward propagation. 427 | 428 | Args: 429 | x (Tensor): Input noise signal (B, 1, T). 
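            Example (editor's sketch; the discriminator is fully convolutional with
            length-preserving padding, so it emits one score per input sample):
                >>> d = ResidualParallelWaveGANDiscriminator()
                >>> d(torch.randn(2, 1, 8192)).shape
                torch.Size([2, 1, 8192])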
430 | 431 | Returns: 432 | Tensor: Output tensor (B, 1, T) 433 | 434 | """ 435 | x = self.first_conv(x) 436 | 437 | skips = 0 438 | for f in self.conv_layers: 439 | x, h = f(x, None) 440 | skips += h 441 | skips *= math.sqrt(1.0 / len(self.conv_layers)) 442 | 443 | # apply final layers 444 | x = skips 445 | for f in self.last_conv_layers: 446 | x = f(x) 447 | return x 448 | 449 | def apply_weight_norm(self): 450 | """Apply weight normalization module from all of the layers.""" 451 | def _apply_weight_norm(m): 452 | if isinstance(m, torch.nn.Conv1d) or isinstance(m, torch.nn.Conv2d): 453 | torch.nn.utils.weight_norm(m) 454 | logging.debug(f"Weight norm is applied to {m}.") 455 | 456 | self.apply(_apply_weight_norm) 457 | 458 | def remove_weight_norm(self): 459 | """Remove weight normalization module from all of the layers.""" 460 | def _remove_weight_norm(m): 461 | try: 462 | logging.debug(f"Weight norm is removed from {m}.") 463 | torch.nn.utils.remove_weight_norm(m) 464 | except ValueError: # this module didn't have weight norm 465 | return 466 | 467 | self.apply(_remove_weight_norm) 468 | -------------------------------------------------------------------------------- /vocoder/optimizers/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | from typing import Union 4 | 5 | from .pwg_opt import PWGOptimizer 6 | 7 | optimizer_list = { 8 | "PWGOptimizer": PWGOptimizer 9 | } 10 | 11 | def create_optimizer(name, model, params) -> Union[PWGOptimizer]: 12 | return optimizer_list[ name ]( model, **params ) 13 | -------------------------------------------------------------------------------- /vocoder/optimizers/pwg_opt.py: -------------------------------------------------------------------------------- 1 | 2 | import torch 3 | 4 | from vocoder.models import ParallelWaveGAN 5 | from .radam import RAdam 6 | 7 | 8 | class PWGOptimizer: 9 | 10 | def __init__(self, model: ParallelWaveGAN, 11 | generator_optimizer_params={"lr": 1e-4, "eps": 1e-6}, 12 | generator_scheduler_params={"step_size": 200000, "gamma": 0.5}, 13 | discriminator_optimizer_params={"lr": 5e-5, "eps": 1e-6}, 14 | discriminator_scheduler_params={"step_size": 200000, "gamma": 0.5}): 15 | self.generator_optimizer = RAdam( 16 | model.generator.parameters(), **generator_optimizer_params ) 17 | self.generator_scheduler = torch.optim.lr_scheduler.StepLR( 18 | optimizer=self.generator_optimizer, **generator_scheduler_params) 19 | 20 | self.discriminator_optimizer = RAdam( 21 | model.discriminator.parameters(), **discriminator_optimizer_params ) 22 | self.discriminator_scheduler = torch.optim.lr_scheduler.StepLR( 23 | optimizer=self.discriminator_optimizer, **discriminator_scheduler_params) 24 | 25 | def state_dict(self): 26 | return { 27 | "generator_optimizer": self.generator_optimizer.state_dict(), 28 | "generator_scheduler": self.generator_scheduler.state_dict(), 29 | "discriminator_optimizer": self.discriminator_optimizer.state_dict(), 30 | "discriminator_scheduler": self.discriminator_scheduler.state_dict() 31 | } 32 | 33 | def load_state_dict(self, state_dict): 34 | self.generator_optimizer.load_state_dict( state_dict["generator_optimizer"] ) 35 | self.generator_scheduler.load_state_dict( state_dict["generator_scheduler"] ) 36 | self.discriminator_optimizer.load_state_dict( state_dict["discriminator_optimizer"] ) 37 | self.discriminator_scheduler.load_state_dict( state_dict["discriminator_scheduler"] ) 38 | 39 | 40 | 41 | 
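# Editor's usage sketch (not part of the original file): PWGOptimizer bundles an
# RAdam optimizer and a StepLR scheduler for the generator and another pair for the
# discriminator; the training strategy steps each pair separately.  `model` below is
# a hypothetical ParallelWaveGAN instance and `gen_loss` a computed generator loss.
#
#   model = ParallelWaveGAN()
#   opt = PWGOptimizer(model,
#                      generator_optimizer_params={"lr": 1e-4, "eps": 1e-6},
#                      discriminator_optimizer_params={"lr": 5e-5, "eps": 1e-6})
#   opt.generator_optimizer.zero_grad()
#   gen_loss.backward()
#   opt.generator_optimizer.step()
#   opt.generator_scheduler.step()
#   torch.save({"optimizer": opt.state_dict()}, "checkpoint.pt")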
-------------------------------------------------------------------------------- /vocoder/optimizers/radam.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """RAdam optimizer. 4 | 5 | This code is drived from https://github.com/LiyuanLucasLiu/RAdam. 6 | """ 7 | 8 | import math 9 | import torch 10 | 11 | from torch.optim.optimizer import Optimizer 12 | 13 | 14 | class RAdam(Optimizer): 15 | """Rectified Adam optimizer.""" 16 | 17 | def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0): 18 | """Initilize RAdam optimizer.""" 19 | defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay) 20 | self.buffer = [[None, None, None] for ind in range(10)] 21 | super(RAdam, self).__init__(params, defaults) 22 | 23 | def __setstate__(self, state): 24 | """Set state.""" 25 | super(RAdam, self).__setstate__(state) 26 | 27 | def step(self, closure=None): 28 | """Run one step.""" 29 | loss = None 30 | if closure is not None: 31 | loss = closure() 32 | 33 | for group in self.param_groups: 34 | 35 | for p in group['params']: 36 | if p.grad is None: 37 | continue 38 | grad = p.grad.data.float() 39 | if grad.is_sparse: 40 | raise RuntimeError('RAdam does not support sparse gradients') 41 | 42 | p_data_fp32 = p.data.float() 43 | 44 | state = self.state[p] 45 | 46 | if len(state) == 0: 47 | state['step'] = 0 48 | state['exp_avg'] = torch.zeros_like(p_data_fp32) 49 | state['exp_avg_sq'] = torch.zeros_like(p_data_fp32) 50 | else: 51 | state['exp_avg'] = state['exp_avg'].type_as(p_data_fp32) 52 | state['exp_avg_sq'] = state['exp_avg_sq'].type_as(p_data_fp32) 53 | 54 | exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] 55 | beta1, beta2 = group['betas'] 56 | 57 | exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value = 1 - beta2) 58 | exp_avg.mul_(beta1).add_( grad, alpha = 1 - beta1) 59 | 60 | state['step'] += 1 61 | buffered = self.buffer[int(state['step'] % 10)] 62 | if state['step'] == buffered[0]: 63 | N_sma, step_size = buffered[1], buffered[2] 64 | else: 65 | buffered[0] = state['step'] 66 | beta2_t = beta2 ** state['step'] 67 | N_sma_max = 2 / (1 - beta2) - 1 68 | N_sma = N_sma_max - 2 * state['step'] * beta2_t / (1 - beta2_t) 69 | buffered[1] = N_sma 70 | 71 | # more conservative since it's an approximated value 72 | if N_sma >= 5: 73 | step_size = math.sqrt( 74 | (1 - beta2_t) * (N_sma - 4) / (N_sma_max - 4) * (N_sma - 2) / N_sma * N_sma_max / (N_sma_max - 2)) / (1 - beta1 ** state['step']) # NOQA 75 | else: 76 | step_size = 1.0 / (1 - beta1 ** state['step']) 77 | buffered[2] = step_size 78 | 79 | if group['weight_decay'] != 0: 80 | p_data_fp32.add_(-group['weight_decay'] * group['lr'], p_data_fp32) 81 | 82 | # more conservative since it's an approximated value 83 | if N_sma >= 5: 84 | denom = exp_avg_sq.sqrt().add_(group['eps']) 85 | p_data_fp32.addcdiv_(-step_size * group['lr'], exp_avg, denom) 86 | else: 87 | p_data_fp32.add_(-step_size * group['lr'], exp_avg) 88 | 89 | p.data.copy_(p_data_fp32) 90 | 91 | return loss 92 | -------------------------------------------------------------------------------- /vocoder/preprocess.py: -------------------------------------------------------------------------------- 1 | 2 | import argparse, glob, tqdm, os, random 3 | from functools import partial 4 | from concurrent.futures import ProcessPoolExecutor 5 | import numpy as np 6 | import torch 7 | 8 | from .audio import TacotronSTFT, load_wav_to_torch 9 | 10 | from vocoder.datasets.utils import 
save_metadata 11 | from vocoder.hparams import Hyperparameter 12 | 13 | 14 | 15 | def mel_transform(wav_files, mel_dir, mel_config, device, min_wav_length): 16 | # device = torch.device( device ) 17 | # transfomer = MelSpectrogram( **mel_config ).to( device ) 18 | taco_stft = TacotronSTFT( **mel_config ) 19 | files = [] 20 | with torch.no_grad(): 21 | for fn in wav_files: 22 | audio, sr = load_wav_to_torch( fn, mel_config['sampling_rate'] ) 23 | if audio.shape[1] < min_wav_length: 24 | print( 'skip {}, sr: {}, length: {}'.format(fn, sr, audio.shape[1]) ) 25 | continue 26 | # audio = audio.to( device ) 27 | mel, _ = taco_stft.mel_spectrogram( audio ) 28 | mel_fn = os.path.join( mel_dir, os.path.basename(fn) + '.mel.npy' ) 29 | np.save( mel_fn, mel[0].cpu().numpy().T ) 30 | files.append( (fn, mel_fn) ) 31 | return files 32 | 33 | 34 | 35 | def preprocess( data_dir, 36 | hparams: Hyperparameter, 37 | temp_dir='temp', 38 | device='cuda:0', 39 | max_workers=4 ): 40 | '''Preprocess for LVC-WaveGAN. 41 | Args: 42 | data_dir (str): the directory containing .wav files. 43 | hparams (Hyperparameter): including parameter for calculating mel-spectrogram. 44 | temp_dir (str): the directory for saving preprocessing results. 45 | device (str): the cuda device for runing preprocessing. 46 | max_workers (int): the number of process worker. 47 | ''' 48 | data_dir = os.path.abspath(data_dir) 49 | temp_dir = os.path.abspath(temp_dir) 50 | mel_dir = os.path.join( temp_dir, 'mels' ) 51 | os.makedirs(mel_dir, exist_ok=True) 52 | mel_config = { 53 | 'sampling_rate': hparams.sample_rate, 54 | 'win_length': hparams.win_length, 55 | 'hop_length': hparams.hop_length, 56 | 'filter_length': hparams.n_fft, 57 | 'mel_fmin': hparams.mel_fmin, 58 | 'mel_fmax': hparams.mel_fmax, 59 | 'n_mel_channels': hparams.n_mels, 60 | } 61 | min_wav_length = hparams.batch_mel_length * hparams.hop_length 62 | 63 | wav_files = glob.glob(f'{data_dir}/**/*.wav', recursive=True) 64 | print('num of wavs:', len(wav_files)) 65 | 66 | batch_size = 100 67 | batch_num = int(np.ceil( len(wav_files) / batch_size )) 68 | batches = [ wav_files[ i*batch_size : (i+1)*batch_size ] for i in range( batch_num ) ] 69 | results = [] 70 | with ProcessPoolExecutor(max_workers=max_workers) as executor: 71 | futures = [ executor.submit( mel_transform, batch, mel_dir, mel_config, 72 | device, min_wav_length ) for batch in batches ] 73 | for f in tqdm.tqdm( futures, desc='Preprocessing', total=batch_num ): 74 | results.extend( f.result() ) 75 | 76 | save_metadata(results, os.path.join(temp_dir, 'metadata.txt')) 77 | 78 | # 产生训练、验证、测试训练集 79 | random.shuffle(results) 80 | save_metadata(results[ : hparams.eval_sample_num ], hparams.eval_metadata_file ) 81 | save_metadata(results[ -hparams.test_sample_num : ], hparams.test_metadata_file ) 82 | save_metadata(results[ hparams.eval_sample_num : -hparams.test_sample_num ], 83 | hparams.train_metadata_file ) 84 | 85 | 86 | 87 | def main(): 88 | parser = argparse.ArgumentParser( 89 | description="Preprocess for LVC-WaveGAN (See detail in vocoder/preprocess.py).") 90 | parser.add_argument("--data-dir", type=str, required=True, 91 | help="the directory containing .wav files") 92 | parser.add_argument("--config", type=str, required=True, 93 | help="yaml format configuration file.") 94 | parser.add_argument("--temp-dir", type=str, default='temp', 95 | help="the directory to save preprocessing results") 96 | parser.add_argument("--max-workers", type=int, default=4, 97 | help="yaml format configuration file.") 98 | 
parser.add_argument("--device", default='cuda:0', type=str, 99 | help="the device for training. (default: cuda:0)") 100 | args = parser.parse_args() 101 | hparams = Hyperparameter( args.config ) 102 | 103 | preprocess( args.data_dir, 104 | hparams, 105 | temp_dir=args.temp_dir, 106 | device=args.device, 107 | max_workers=args.max_workers) 108 | 109 | 110 | if __name__ == '__main__': 111 | main() 112 | 113 | -------------------------------------------------------------------------------- /vocoder/strategy/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | from .base import TrainStrategy 5 | from .pwg_strategy import PWGStrategy 6 | 7 | strategy_classes = { 8 | "PWGStrategy": PWGStrategy 9 | } 10 | 11 | 12 | def create_strategy(name, params) -> TrainStrategy: 13 | return strategy_classes[ name ](**params) -------------------------------------------------------------------------------- /vocoder/strategy/base.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | class TrainStrategy: 4 | 5 | def __init__(self): 6 | pass 7 | 8 | def train_step(self, batch, cur_step, model, loss, optimizer): 9 | return {'train_loss': 0} 10 | 11 | def eval_step(self, batch, model, loss): 12 | return {'eval_loss': 0} 13 | 14 | def test_step(self, batch, model): 15 | return {'audio': 0} -------------------------------------------------------------------------------- /vocoder/strategy/pwg_strategy.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | import torch 4 | 5 | from vocoder.models import ParallelWaveGAN 6 | from vocoder.losses import PWGLoss 7 | from vocoder.optimizers import PWGOptimizer 8 | from .base import TrainStrategy 9 | 10 | 11 | 12 | 13 | class PWGStrategy(TrainStrategy): 14 | 15 | def __init__(self, 16 | lambda_adv=4.0, 17 | discriminator_start_steps=100000, 18 | generator_grad_norm=10, 19 | discriminator_grad_norm=1): 20 | super().__init__() 21 | 22 | self.lambda_adv = lambda_adv 23 | self.discriminator_start_steps = discriminator_start_steps 24 | self.generator_grad_norm = generator_grad_norm 25 | self.discriminator_grad_norm = discriminator_grad_norm 26 | 27 | def train_step(self, batch, step, 28 | model: ParallelWaveGAN, 29 | loss: PWGLoss, 30 | optimizer: PWGOptimizer): 31 | '''Train strategy for Parallel WaveGAN. 32 | Args: 33 | batch (list): the batch data for training model. 34 | [ audio(B,L), mel(B,ML,MC), noise(B,L) ] 35 | step (int): current global step in training process. 36 | model (ParallelWaveGAN): the parallel wavegan model. 37 | loss (PWGLoss): the loss module for parallel wavegan 38 | optimizer (PWGOptimizer): customized optimizer. 39 | Returns: 40 | dict: the loss value dict. 
41 | ''' 42 | device = next(model.parameters()).device 43 | audio, mel, noise = [ x.to(device) for x in batch ] 44 | 45 | ####################### 46 | # Generator # 47 | ####################### 48 | audio_ = model.generator(noise, mel) 49 | 50 | sc_loss, mag_loss = loss.stft_loss( audio, audio_ ) 51 | gen_loss = sc_loss + mag_loss 52 | 53 | adv_loss = torch.zeros(1) 54 | if step > self.discriminator_start_steps: 55 | prob_ = model.discriminator( audio_ ) 56 | adv_loss = loss.adversarial_loss( prob_ ) 57 | gen_loss += self.lambda_adv * adv_loss 58 | 59 | optimizer.generator_optimizer.zero_grad() 60 | gen_loss.backward() 61 | if self.generator_grad_norm > 0: 62 | torch.nn.utils.clip_grad_norm_( 63 | model.generator.parameters(), 64 | self.generator_grad_norm) 65 | optimizer.generator_optimizer.step() 66 | optimizer.generator_scheduler.step() 67 | 68 | ####################### 69 | # Discriminator # 70 | ####################### 71 | real_loss, fake_loss, disc_loss = torch.zeros(1), torch.zeros(1), torch.zeros(1) 72 | if step > self.discriminator_start_steps: 73 | with torch.no_grad(): 74 | audio_ = model.generator( noise, mel ) 75 | prob = model.discriminator( audio ) 76 | prob_ = model.discriminator( audio_.detach() ) 77 | 78 | real_loss, fake_loss = loss.discriminator_loss( prob, prob_ ) 79 | disc_loss = real_loss + fake_loss 80 | 81 | optimizer.discriminator_optimizer.zero_grad() 82 | disc_loss.backward() 83 | if self.discriminator_grad_norm > 0: 84 | torch.nn.utils.clip_grad_norm_( 85 | model.discriminator.parameters(), 86 | self.discriminator_grad_norm) 87 | optimizer.discriminator_optimizer.step() 88 | optimizer.discriminator_scheduler.step() 89 | 90 | return { 91 | "generator_loss": gen_loss.item(), 92 | "spectral_convergence_loss": sc_loss.item(), 93 | "log_stft_magnitude_loss": mag_loss.item(), 94 | "adversarial_loss": adv_loss.item(), 95 | "discriminator_loss": disc_loss.item(), 96 | "real_loss": real_loss.item(), 97 | "fake_loss": fake_loss.item() 98 | } 99 | 100 | @torch.no_grad() 101 | def eval_step(self, batch, 102 | model: ParallelWaveGAN, 103 | loss: PWGLoss): 104 | device = next(model.parameters()).device 105 | audio, mel, noise = [ x.to(device) for x in batch ] 106 | 107 | audio_ = model.generator( noise, mel ) 108 | prob_ = model.discriminator( audio_ ) 109 | prob = model.discriminator( audio ) 110 | 111 | sc_loss, mag_loss = loss.stft_loss( audio, audio_ ) 112 | adv_loss = loss.adversarial_loss( prob_ ) 113 | gen_loss = sc_loss + mag_loss + self.lambda_adv * adv_loss 114 | 115 | real_loss, fake_loss = loss.discriminator_loss( prob, prob_ ) 116 | disc_loss = real_loss + fake_loss 117 | 118 | return { 119 | "generator_loss": gen_loss.item(), 120 | "spectral_convergence_loss": sc_loss.item(), 121 | "log_stft_magnitude_loss": mag_loss.item(), 122 | "adversarial_loss": adv_loss.item(), 123 | "discriminator_loss": disc_loss.item(), 124 | "real_loss": real_loss.item(), 125 | "fake_loss": fake_loss.item() 126 | } 127 | 128 | @torch.no_grad() 129 | def test_step(self, batch, model: ParallelWaveGAN): 130 | device = next(model.parameters()).device 131 | audio, mel, noise = [ x.to(device) for x in batch ] 132 | 133 | audio_ = model.generator( noise, mel ) 134 | return { 'audio' : audio_ } 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | -------------------------------------------------------------------------------- /vocoder/test.py: -------------------------------------------------------------------------------- 1 | 2 | import argparse, yaml, datetime, os, time 3 | 
import yaml, tqdm 4 | from collections import defaultdict 5 | import soundfile 6 | 7 | import torch 8 | from vocoder.datasets import create_dataloader 9 | from vocoder.models import create_model 10 | from vocoder.strategy import create_strategy 11 | from vocoder.utils.log import Logger 12 | from vocoder.hparams import Hyperparameter 13 | 14 | 15 | 16 | class Tester: 17 | 18 | def __init__(self, args, hparams: Hyperparameter): 19 | self.log = Logger(args.exp_dir, tensorboard=False) 20 | 21 | self.exp_dir = args.exp_dir 22 | self.device = torch.device( args.device ) 23 | self.hparams = hparams 24 | 25 | self.model = create_model( hparams.model_name, hparams.model_params, device=self.device ) 26 | self.strategy = create_strategy( hparams.strategy_name, hparams.strategy_params ) 27 | self.restore_checkpoint() 28 | 29 | self.test_result_dir = os.path.join( self.exp_dir, f'test-{self.step}-step' ) 30 | os.makedirs( self.test_result_dir, exist_ok=True ) 31 | 32 | self.train_results = defaultdict(float) 33 | 34 | def restore_checkpoint(self, checkpoint=None): 35 | pt = os.path.join( self.exp_dir, 'checkpoint.pt') 36 | if checkpoint is None and os.path.islink(pt): 37 | checkpoint = os.path.join( self.exp_dir, os.readlink(pt) ) 38 | state_dict = torch.load( checkpoint, map_location='cpu') 39 | self.step = state_dict['step'] 40 | self.model.load_state_dict( state_dict['model'] ) 41 | self.log.info( f"Restore model from {checkpoint}" ) 42 | 43 | def init_dataloader(self): 44 | ''' initialize dataloader for training and evaluate ''' 45 | dataset_config = { 46 | 'metadata_file': self.hparams.test_metadata_file, 47 | 'hop_length': self.hparams.hop_length, 48 | 'sample_rate': self.hparams.sample_rate, 49 | 'batch_mel_length': self.hparams.batch_mel_length, 50 | 'cut': False 51 | } 52 | self.dataloader = create_dataloader( 53 | dataset_classname=self.hparams.dataset_classname, 54 | dataset_config=dataset_config, 55 | batch_size=1, 56 | num_workers=self.hparams.dataset_num_workers, 57 | shuffle=False, 58 | drop_last=False ) 59 | 60 | def run(self): 61 | # 初始化 dataloader 62 | self.init_dataloader() 63 | total_rtf = 0.0 64 | with tqdm.tqdm( self.dataloader, desc= "Test" ) as phbar: 65 | for idx, batch in enumerate( phbar, start=1 ): 66 | st = time.time() 67 | result = self.strategy.test_step( batch, self.model) 68 | tc = time.time() - st 69 | 70 | audio = result['audio'].squeeze(0).squeeze(0).cpu().numpy() 71 | soundfile.write(os.path.join( self.test_result_dir, f"{idx:04d}_gene.wav"), 72 | audio, self.hparams.sample_rate, "PCM_16") 73 | real_audio = batch[0].squeeze(0).squeeze(0).numpy() 74 | soundfile.write(os.path.join( self.test_result_dir, f"{idx:04d}_real.wav"), 75 | real_audio, self.hparams.sample_rate, "PCM_16") 76 | 77 | rtf = tc*self.hparams.sample_rate/len(audio) 78 | total_rtf += rtf 79 | phbar.set_postfix({"RTF": rtf}) 80 | 81 | self.log.info('Average RTF: {}'.format( total_rtf/idx ) ) 82 | self.log.info( f'Test result saving into {self.test_result_dir}' ) 83 | 84 | 85 | def main(): 86 | parser = argparse.ArgumentParser( 87 | description="Train LVC-WaveGAN (See detail in vocoder/train.py).") 88 | parser.add_argument("--config", type=str, required=True, 89 | help="yaml format configuration file.") 90 | parser.add_argument("--exp-dir", type=str, required=True, 91 | help="the directory saving expriment data, " 92 | "including model checkpoints, log, results. 
") 93 | parser.add_argument("--checkpoint", default=None, type=str, 94 | help="checkpoint file path to load saving model") 95 | parser.add_argument("--device", default='cuda', type=str, 96 | help="the device for training. (default: cuda:0)") 97 | args = parser.parse_args() 98 | hparams = Hyperparameter( args.config ) 99 | 100 | tester = Tester(args, hparams) 101 | 102 | try: 103 | tester.run() 104 | except KeyboardInterrupt: 105 | pass 106 | 107 | 108 | if __name__ == "__main__": 109 | main() -------------------------------------------------------------------------------- /vocoder/train.py: -------------------------------------------------------------------------------- 1 | 2 | import argparse, yaml, datetime, os 3 | import yaml, tqdm 4 | from collections import defaultdict 5 | 6 | import torch 7 | from vocoder.datasets import create_dataloader 8 | from vocoder.models import create_model 9 | from vocoder.losses import create_loss 10 | from vocoder.optimizers import create_optimizer 11 | from vocoder.strategy import create_strategy 12 | from vocoder.utils.log import Logger 13 | from vocoder.hparams import Hyperparameter 14 | 15 | 16 | 17 | class Trainer: 18 | 19 | def __init__(self, args, hparams: Hyperparameter): 20 | self.log = Logger(args.exp_dir) 21 | 22 | self.exp_dir = args.exp_dir 23 | os.makedirs( self.exp_dir, exist_ok=True ) 24 | 25 | self.device = torch.device( args.device ) 26 | self.hparams = hparams 27 | 28 | self.step = 1 29 | self.epoch = 1 30 | 31 | self.model = create_model( hparams.model_name, hparams.model_params, device=self.device ) 32 | self.loss = create_loss( hparams.loss_name, hparams.loss_params, device=self.device ) 33 | self.optimizer = create_optimizer( hparams.opt_name, self.model, hparams.opt_params ) 34 | self.strategy = create_strategy( hparams.strategy_name, hparams.strategy_params ) 35 | 36 | self.restore_checkpoint(args.restart, args.checkpoint) 37 | 38 | self.train_results = defaultdict(float) 39 | self.num_train_reuslts = 0 40 | 41 | def restore_checkpoint(self, restart=False, checkpoint=None): 42 | if not restart: 43 | try: 44 | pt = os.path.join( self.exp_dir, 'checkpoint.pt') 45 | if checkpoint is None and os.path.islink(pt): 46 | checkpoint = os.path.join( self.exp_dir, os.readlink(pt) ) 47 | if not os.path.isfile( checkpoint ): 48 | print('start new training.') 49 | return 50 | state_dict = torch.load( checkpoint, map_location='cpu') 51 | self.step = state_dict['step'] 52 | self.epoch = state_dict['epoch'] 53 | self.model.load_state_dict( state_dict['model'] ) 54 | self.optimizer.load_state_dict( state_dict['optimizer'] ) 55 | self.log.info( f"Restore model from {checkpoint}") 56 | except: 57 | print('Error in restore model. 
Start New training') 58 | 59 | def save_checkpoint(self): 60 | state_dict = { 61 | "step": self.step, 62 | "epoch": self.epoch, 63 | "optimizer": self.optimizer.state_dict(), 64 | "model": self.model.state_dict() 65 | } 66 | save_path = os.path.join( self.exp_dir, f'checkpoint-{self.step}.pt') 67 | link_path = os.path.join( self.exp_dir, 'checkpoint.pt') 68 | torch.save( state_dict, save_path ) 69 | if os.path.islink(link_path): 70 | os.unlink(link_path) 71 | os.symlink(f'checkpoint-{self.step}.pt', link_path) 72 | self.log.info( f'Save chechpoint as {save_path}' ) 73 | 74 | def init_dataloader(self): 75 | ''' initialize dataloader for training and evaluate ''' 76 | train_dataset_config = { 77 | 'metadata_file': self.hparams.train_metadata_file, 78 | 'hop_length': self.hparams.hop_length, 79 | 'sample_rate': self.hparams.sample_rate, 80 | 'batch_mel_length': self.hparams.batch_mel_length 81 | } 82 | eval_dataset_config = { 83 | 'metadata_file': self.hparams.eval_metadata_file, 84 | 'hop_length': self.hparams.hop_length, 85 | 'sample_rate': self.hparams.sample_rate, 86 | 'batch_mel_length': self.hparams.batch_mel_length 87 | } 88 | self.dataloader = { 89 | "train": create_dataloader( 90 | dataset_classname=self.hparams.dataset_classname, 91 | dataset_config=train_dataset_config, 92 | batch_size=self.hparams.train_batch_size, 93 | num_workers=self.hparams.dataset_num_workers, 94 | shuffle=True, 95 | drop_last=True ), 96 | "eval": create_dataloader( 97 | dataset_classname=self.hparams.dataset_classname, 98 | dataset_config=eval_dataset_config, 99 | batch_size=self.hparams.train_batch_size, 100 | num_workers=1, 101 | shuffle=False, 102 | drop_last=False ) 103 | } 104 | 105 | def train(self): 106 | # 初始化 dataloader 107 | self.init_dataloader() 108 | while True: 109 | with tqdm.tqdm( self.dataloader["train"], desc= f"Train, Epoch: {self.epoch}" ) as tqbar: 110 | for batch in tqbar: 111 | if self.step > self.hparams.max_train_steps: 112 | return 113 | tqbar.set_postfix({"Step": self.step}) 114 | 115 | result = self.strategy.train_step( batch, self.step, self.model, self.loss, self.optimizer ) 116 | 117 | self._check_log(result) 118 | self._check_evaluate() 119 | 120 | self.step += 1 121 | self.epoch += 1 122 | 123 | def evaluate(self): 124 | eval_results = defaultdict(float) 125 | for batch in tqdm.tqdm(self.dataloader["eval"], desc= f"Evaluate"): 126 | result = self.strategy.eval_step( batch, self.model, self.loss ) 127 | for k in result: 128 | eval_results[k] += result[k] 129 | 130 | self.log.info( f'Step {self.step}, Evaluate results:') 131 | for k in eval_results: 132 | v = eval_results[k] / len( self.dataloader["eval"] ) 133 | self.log.add_scalar( f'evaluate/{k}', v, self.step) 134 | self.log.info( f' {k}: {v:.4f}' ) 135 | 136 | self.log.flush() 137 | 138 | def _check_evaluate(self): 139 | if self.step % self.hparams.eval_interval_steps == 0: 140 | self.model.eval() 141 | self.evaluate() 142 | self.model.train() 143 | 144 | def _check_log(self, train_result): 145 | for k in train_result: 146 | self.train_results[k] += train_result[k] 147 | self.num_train_reuslts += 1 148 | 149 | if self.step % self.hparams.log_interval_steps == 0: 150 | for k in self.train_results: 151 | v = self.train_results[k] / self.num_train_reuslts 152 | self.log.add_scalar( f'train/{k}', v, self.step) 153 | 154 | self.train_results = defaultdict(float) 155 | self.num_train_reuslts = 0 156 | 157 | if self.step % self.hparams.save_interval_steps == 0: 158 | self.save_checkpoint() 159 | 160 | 161 | def check_args(args, 
hparams: Hyperparameter): 162 | if args.exp_dir is None: 163 | args.exp_dir = os.path.join('exps', datetime.datetime.now().strftime('exp-%Y%m%d-%H%M%S') ) 164 | 165 | # 保存配置文件 166 | hparams.save_config( os.path.join( args.exp_dir, 'config.yaml' ) ) 167 | 168 | # 是否需要进行数据预处理 169 | if args.preprocess or not os.path.isfile( hparams.train_metadata_file ): 170 | if args.data_dir is None: 171 | raise RuntimeError('Must provide data directory for training.') 172 | from vocoder.preprocess import preprocess 173 | preprocess(args.data_dir, hparams, args.temp_dir, args.device) 174 | 175 | 176 | 177 | def main(): 178 | parser = argparse.ArgumentParser( 179 | description="Train LVC-WaveGAN (See detail in vocoder/train.py).") 180 | parser.add_argument("--config", type=str, required=True, 181 | help="yaml format configuration file.") 182 | parser.add_argument("--exp-dir", default=None, type=str, 183 | help="the directory for saving expriment data, " 184 | "including model checkpoints, log, results. ") 185 | parser.add_argument("--data-dir", default=None, type=str, 186 | help="the directory containing .wav files for training") 187 | parser.add_argument("--temp-dir", default='temp', type=str, 188 | help="the directory containing preprocess results") 189 | parser.add_argument("--restart", action="store_true", default=False, 190 | help="Whether to restart a new training") 191 | parser.add_argument("--preprocess", action="store_true", default=False, 192 | help="Whether force to preprocess data") 193 | parser.add_argument("--checkpoint", default=None, type=str, 194 | help="checkpoint file path to load saving model") 195 | parser.add_argument("--device", default='cuda', type=str, 196 | help="the device for training. (default: cuda:0)") 197 | args = parser.parse_args() 198 | hparams = Hyperparameter( args.config ) 199 | 200 | check_args( args, hparams ) 201 | 202 | trainer = Trainer(args, hparams) 203 | 204 | try: 205 | trainer.train() 206 | except KeyboardInterrupt: 207 | trainer.save_checkpoint() 208 | trainer.log.flush() 209 | 210 | 211 | if __name__ == "__main__": 212 | main() -------------------------------------------------------------------------------- /vocoder/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zceng/LVCNet/e81e13f3479a4d85f498a02e42338ebe823a8b3d/vocoder/utils/__init__.py -------------------------------------------------------------------------------- /vocoder/utils/log.py: -------------------------------------------------------------------------------- 1 | 2 | from functools import wraps 3 | import logging, os, time, sys 4 | from logging import DEBUG, INFO, WARN, ERROR 5 | 6 | from torch.utils.tensorboard import SummaryWriter 7 | 8 | logging.basicConfig( 9 | stream=sys.stdout, 10 | format='[ %(levelname)s ] %(message)s', 11 | level=DEBUG) 12 | 13 | class Logger: 14 | 15 | def __init__(self, log_dir, level=DEBUG, tensorboard=True): 16 | os.makedirs( log_dir, exist_ok=True ) 17 | 18 | self.logger = logging.getLogger('log') 19 | self.logger.setLevel(level) 20 | handler = logging.FileHandler( 21 | os.path.join( log_dir, time.strftime('%Y%m%d-%H%M%S.log') ), 22 | mode='w', 23 | encoding='utf-8') 24 | handler.setFormatter( logging.Formatter('[ %(levelname)s, %(asctime)s ] %(message)s') ) 25 | self.logger.addHandler( handler ) 26 | self.handler = handler 27 | 28 | if tensorboard: 29 | self.tbwriter = SummaryWriter(log_dir) 30 | 31 | def add_scalar(self, tag, value, step): 32 | self.tbwriter.add_scalar(tag, 
value, step) 33 | 34 | def info(self, *msg, **kwargs): 35 | self.logger.info( *msg, **kwargs ) 36 | 37 | def warn(self, *msg, **kwargs): 38 | self.logger.warn( *msg, **kwargs ) 39 | 40 | def error(self, *msg, **kwargs): 41 | self.logger.error( *msg, **kwargs ) 42 | 43 | def debug(self, *msg, **kwargs): 44 | self.logger.debug( *msg, **kwargs ) 45 | 46 | def flush(self): 47 | self.tbwriter.flush() 48 | self.handler.flush() 49 | 50 | 51 | 52 | --------------------------------------------------------------------------------
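# Editor's usage sketch for vocoder/utils/log.py (not part of the original file):
# Logger writes messages to a timestamped log file under log_dir and, when
# tensorboard=True, mirrors scalars to a TensorBoard SummaryWriter in the same
# directory.  The experiment directory below is hypothetical.
#
#   log = Logger("exps/exp-demo")
#   log.info("start training")
#   log.add_scalar("train/generator_loss", 1.23, 100)
#   log.flush()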