├── wavenet_vocoder ├── __init__.py ├── bin │ ├── __init__.py │ ├── calc_stats.py │ ├── noise_shaping.py │ ├── decode.py │ └── feature_extract.py ├── nets │ └── __init__.py └── utils │ ├── __init__.py │ ├── download_from_google_drive.sh │ ├── parse_options.sh │ ├── utils.py │ └── run.pl ├── egs ├── arctic │ ├── sd │ │ ├── conf │ │ │ ├── awb.f0 │ │ │ ├── bdl.f0 │ │ │ ├── clb.f0 │ │ │ ├── jmk.f0 │ │ │ ├── ksp.f0 │ │ │ ├── rms.f0 │ │ │ ├── slt.f0 │ │ │ └── slurm.conf │ │ ├── path.sh │ │ ├── cmd.sh │ │ └── run.sh │ ├── sd-mini │ │ ├── conf │ │ │ ├── awb.f0 │ │ │ ├── bdl.f0 │ │ │ ├── clb.f0 │ │ │ ├── jmk.f0 │ │ │ ├── ksp.f0 │ │ │ ├── rms.f0 │ │ │ └── slt.f0 │ │ ├── path.sh │ │ └── run.sh │ ├── si-close │ │ ├── conf │ │ │ ├── awb.f0 │ │ │ ├── bdl.f0 │ │ │ ├── clb.f0 │ │ │ ├── jmk.f0 │ │ │ ├── ksp.f0 │ │ │ ├── rms.f0 │ │ │ ├── slt.f0 │ │ │ └── slurm.conf │ │ ├── path.sh │ │ └── cmd.sh │ ├── si-open │ │ ├── conf │ │ │ ├── awb.f0 │ │ │ ├── bdl.f0 │ │ │ ├── clb.f0 │ │ │ ├── jmk.f0 │ │ │ ├── ksp.f0 │ │ │ ├── rms.f0 │ │ │ ├── slt.f0 │ │ │ └── slurm.conf │ │ ├── path.sh │ │ └── cmd.sh │ ├── sd-melspc │ │ ├── path.sh │ │ ├── conf │ │ │ └── slurm.conf │ │ ├── cmd.sh │ │ └── run.sh │ ├── si-close-melspc │ │ ├── path.sh │ │ ├── conf │ │ │ └── slurm.conf │ │ └── cmd.sh │ └── si-open-melspc │ │ ├── path.sh │ │ ├── conf │ │ └── slurm.conf │ │ └── cmd.sh ├── ljspeech │ ├── sd │ │ ├── path.sh │ │ ├── conf │ │ │ └── slurm.conf │ │ ├── cmd.sh │ │ └── run.sh │ └── sd-melspc │ │ ├── path.sh │ │ ├── conf │ │ └── slurm.conf │ │ ├── cmd.sh │ │ └── run.sh ├── m-ailabs-speech │ ├── sd │ │ ├── path.sh │ │ ├── conf │ │ │ └── slurm.conf │ │ ├── cmd.sh │ │ └── run.sh │ └── sd-melspc │ │ ├── path.sh │ │ ├── conf │ │ └── slurm.conf │ │ └── cmd.sh └── README.md ├── setup.cfg ├── .gitignore ├── tools └── Makefile ├── test ├── test_upsampling.py ├── test_preprocessing.py ├── test_generator.py └── test_wavenet.py ├── .travis.yml ├── setup.py ├── README.md └── LICENSE /wavenet_vocoder/__init__.py: 
-------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /wavenet_vocoder/bin/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /egs/arctic/sd/conf/awb.f0: -------------------------------------------------------------------------------- 1 | 65 210 2 | -------------------------------------------------------------------------------- /egs/arctic/sd/conf/bdl.f0: -------------------------------------------------------------------------------- 1 | 70 210 2 | -------------------------------------------------------------------------------- /egs/arctic/sd/conf/clb.f0: -------------------------------------------------------------------------------- 1 | 110 270 2 | -------------------------------------------------------------------------------- /egs/arctic/sd/conf/jmk.f0: -------------------------------------------------------------------------------- 1 | 60 210 2 | -------------------------------------------------------------------------------- /egs/arctic/sd/conf/ksp.f0: -------------------------------------------------------------------------------- 1 | 60 210 2 | -------------------------------------------------------------------------------- /egs/arctic/sd/conf/rms.f0: -------------------------------------------------------------------------------- 1 | 55 200 2 | -------------------------------------------------------------------------------- /egs/arctic/sd/conf/slt.f0: -------------------------------------------------------------------------------- 1 | 120 275 2 | -------------------------------------------------------------------------------- /egs/arctic/sd-mini/conf/awb.f0: -------------------------------------------------------------------------------- 1 | 65 210 2 | 
-------------------------------------------------------------------------------- /egs/arctic/sd-mini/conf/bdl.f0: -------------------------------------------------------------------------------- 1 | 70 210 2 | -------------------------------------------------------------------------------- /egs/arctic/sd-mini/conf/clb.f0: -------------------------------------------------------------------------------- 1 | 110 270 2 | -------------------------------------------------------------------------------- /egs/arctic/sd-mini/conf/jmk.f0: -------------------------------------------------------------------------------- 1 | 60 210 2 | -------------------------------------------------------------------------------- /egs/arctic/sd-mini/conf/ksp.f0: -------------------------------------------------------------------------------- 1 | 60 210 2 | -------------------------------------------------------------------------------- /egs/arctic/sd-mini/conf/rms.f0: -------------------------------------------------------------------------------- 1 | 55 200 2 | -------------------------------------------------------------------------------- /egs/arctic/sd-mini/conf/slt.f0: -------------------------------------------------------------------------------- 1 | 120 275 2 | -------------------------------------------------------------------------------- /egs/arctic/si-close/conf/awb.f0: -------------------------------------------------------------------------------- 1 | 65 210 2 | -------------------------------------------------------------------------------- /egs/arctic/si-close/conf/bdl.f0: -------------------------------------------------------------------------------- 1 | 70 210 2 | -------------------------------------------------------------------------------- /egs/arctic/si-close/conf/clb.f0: -------------------------------------------------------------------------------- 1 | 110 270 2 | -------------------------------------------------------------------------------- 
/egs/arctic/si-close/conf/jmk.f0: -------------------------------------------------------------------------------- 1 | 60 210 2 | -------------------------------------------------------------------------------- /egs/arctic/si-close/conf/ksp.f0: -------------------------------------------------------------------------------- 1 | 60 210 2 | -------------------------------------------------------------------------------- /egs/arctic/si-close/conf/rms.f0: -------------------------------------------------------------------------------- 1 | 55 200 2 | -------------------------------------------------------------------------------- /egs/arctic/si-close/conf/slt.f0: -------------------------------------------------------------------------------- 1 | 120 275 2 | -------------------------------------------------------------------------------- /egs/arctic/si-open/conf/awb.f0: -------------------------------------------------------------------------------- 1 | 65 210 2 | -------------------------------------------------------------------------------- /egs/arctic/si-open/conf/bdl.f0: -------------------------------------------------------------------------------- 1 | 70 210 2 | -------------------------------------------------------------------------------- /egs/arctic/si-open/conf/clb.f0: -------------------------------------------------------------------------------- 1 | 110 270 2 | -------------------------------------------------------------------------------- /egs/arctic/si-open/conf/jmk.f0: -------------------------------------------------------------------------------- 1 | 60 210 2 | -------------------------------------------------------------------------------- /egs/arctic/si-open/conf/ksp.f0: -------------------------------------------------------------------------------- 1 | 60 210 2 | -------------------------------------------------------------------------------- /egs/arctic/si-open/conf/rms.f0: 
-------------------------------------------------------------------------------- 1 | 55 200 2 | -------------------------------------------------------------------------------- /egs/arctic/si-open/conf/slt.f0: -------------------------------------------------------------------------------- 1 | 120 275 2 | -------------------------------------------------------------------------------- /wavenet_vocoder/nets/__init__.py: -------------------------------------------------------------------------------- 1 | from .wavenet import * # NOQA 2 | -------------------------------------------------------------------------------- /wavenet_vocoder/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .utils import * # NOQA 2 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [tool:pytest] 2 | addopts = --verbose 3 | testpaths = test 4 | 5 | [flake8] 6 | ignore = H102,D100,D105,D107 7 | # 100 is a workaround, 79 is good 8 | max-line-length = 100 9 | exclude = wavenet_vocoder/utils 10 | -------------------------------------------------------------------------------- /egs/arctic/sd/path.sh: -------------------------------------------------------------------------------- 1 | export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH 2 | export CUDA_HOME=/usr/local/cuda 3 | export PRJ_ROOT=../../.. 4 | source $PRJ_ROOT/tools/venv/bin/activate 5 | export PATH=$PATH:$PRJ_ROOT/wavenet_vocoder/bin:$PRJ_ROOT/wavenet_vocoder/utils 6 | -------------------------------------------------------------------------------- /egs/arctic/sd-mini/path.sh: -------------------------------------------------------------------------------- 1 | export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH 2 | export CUDA_HOME=/usr/local/cuda 3 | export PRJ_ROOT=../../.. 
4 | source $PRJ_ROOT/tools/venv/bin/activate 5 | export PATH=$PATH:$PRJ_ROOT/wavenet_vocoder/bin:$PRJ_ROOT/wavenet_vocoder/utils 6 | -------------------------------------------------------------------------------- /egs/arctic/si-close/path.sh: -------------------------------------------------------------------------------- 1 | export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH 2 | export CUDA_HOME=/usr/local/cuda 3 | export PRJ_ROOT=../../.. 4 | source $PRJ_ROOT/tools/venv/bin/activate 5 | export PATH=$PATH:$PRJ_ROOT/wavenet_vocoder/bin:$PRJ_ROOT/wavenet_vocoder/utils 6 | -------------------------------------------------------------------------------- /egs/arctic/si-open/path.sh: -------------------------------------------------------------------------------- 1 | export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH 2 | export CUDA_HOME=/usr/local/cuda 3 | export PRJ_ROOT=../../.. 4 | source $PRJ_ROOT/tools/venv/bin/activate 5 | export PATH=$PATH:$PRJ_ROOT/wavenet_vocoder/bin:$PRJ_ROOT/wavenet_vocoder/utils 6 | -------------------------------------------------------------------------------- /egs/ljspeech/sd/path.sh: -------------------------------------------------------------------------------- 1 | export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH 2 | export CUDA_HOME=/usr/local/cuda 3 | export PRJ_ROOT=../../.. 4 | source $PRJ_ROOT/tools/venv/bin/activate 5 | export PATH=$PATH:$PRJ_ROOT/wavenet_vocoder/bin:$PRJ_ROOT/wavenet_vocoder/utils 6 | -------------------------------------------------------------------------------- /egs/arctic/sd-melspc/path.sh: -------------------------------------------------------------------------------- 1 | export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH 2 | export CUDA_HOME=/usr/local/cuda 3 | export PRJ_ROOT=../../.. 
4 | source $PRJ_ROOT/tools/venv/bin/activate 5 | export PATH=$PATH:$PRJ_ROOT/wavenet_vocoder/bin:$PRJ_ROOT/wavenet_vocoder/utils 6 | -------------------------------------------------------------------------------- /egs/ljspeech/sd-melspc/path.sh: -------------------------------------------------------------------------------- 1 | export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH 2 | export CUDA_HOME=/usr/local/cuda 3 | export PRJ_ROOT=../../.. 4 | source $PRJ_ROOT/tools/venv/bin/activate 5 | export PATH=$PATH:$PRJ_ROOT/wavenet_vocoder/bin:$PRJ_ROOT/wavenet_vocoder/utils 6 | -------------------------------------------------------------------------------- /egs/m-ailabs-speech/sd/path.sh: -------------------------------------------------------------------------------- 1 | export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH 2 | export CUDA_HOME=/usr/local/cuda 3 | export PRJ_ROOT=../../.. 4 | source $PRJ_ROOT/tools/venv/bin/activate 5 | export PATH=$PATH:$PRJ_ROOT/wavenet_vocoder/bin:$PRJ_ROOT/wavenet_vocoder/utils 6 | -------------------------------------------------------------------------------- /egs/arctic/si-close-melspc/path.sh: -------------------------------------------------------------------------------- 1 | export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH 2 | export CUDA_HOME=/usr/local/cuda 3 | export PRJ_ROOT=../../.. 4 | source $PRJ_ROOT/tools/venv/bin/activate 5 | export PATH=$PATH:$PRJ_ROOT/wavenet_vocoder/bin:$PRJ_ROOT/wavenet_vocoder/utils 6 | -------------------------------------------------------------------------------- /egs/arctic/si-open-melspc/path.sh: -------------------------------------------------------------------------------- 1 | export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH 2 | export CUDA_HOME=/usr/local/cuda 3 | export PRJ_ROOT=../../.. 
4 | source $PRJ_ROOT/tools/venv/bin/activate 5 | export PATH=$PATH:$PRJ_ROOT/wavenet_vocoder/bin:$PRJ_ROOT/wavenet_vocoder/utils 6 | -------------------------------------------------------------------------------- /egs/m-ailabs-speech/sd-melspc/path.sh: -------------------------------------------------------------------------------- 1 | export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH 2 | export CUDA_HOME=/usr/local/cuda 3 | export PRJ_ROOT=../../.. 4 | source $PRJ_ROOT/tools/venv/bin/activate 5 | export PATH=$PATH:$PRJ_ROOT/wavenet_vocoder/bin:$PRJ_ROOT/wavenet_vocoder/utils 6 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | slurm-*.out 3 | .DS_Store 4 | exp/ 5 | snippet.sh 6 | tools/venv 7 | tools/sprocket 8 | egs/*/*/data/ 9 | egs/*/*/exp/ 10 | egs/*/*/wav/ 11 | egs/*/*/wav_ns/ 12 | egs/*/*/wav_hpf/ 13 | egs/*/*/wav_nwf/ 14 | egs/*/*/hdf5/ 15 | egs/*/*/downloads/ 16 | egs/*/*/downloads 17 | .pytest_cache/ 18 | .eggs/ 19 | *.egg-info/ 20 | -------------------------------------------------------------------------------- /egs/arctic/sd/conf/slurm.conf: -------------------------------------------------------------------------------- 1 | command sbatch --export=PATH --ntasks-per-node=1 2 | option time=* --time $0 3 | option mem=* --mem-per-cpu $0 4 | option mem=0 5 | option num_threads=* --cpus-per-task $0 --ntasks-per-node=1 6 | option num_threads=1 --cpus-per-task 1 --ntasks-per-node=1 7 | default gpu=0 8 | option gpu=0 -p all,hpc 9 | option gpu=* -p hpc --gres=gpu:$0 --time 10-00:00:00 10 | -------------------------------------------------------------------------------- /egs/arctic/si-open/conf/slurm.conf: -------------------------------------------------------------------------------- 1 | command sbatch --export=PATH --ntasks-per-node=1 2 | option time=* --time $0 3 | option mem=* --mem-per-cpu $0 4 | option 
mem=0 5 | option num_threads=* --cpus-per-task $0 --ntasks-per-node=1 6 | option num_threads=1 --cpus-per-task 1 --ntasks-per-node=1 7 | default gpu=0 8 | option gpu=0 -p all,hpc 9 | option gpu=* -p hpc --gres=gpu:$0 --time 10-00:00:00 10 | -------------------------------------------------------------------------------- /egs/ljspeech/sd/conf/slurm.conf: -------------------------------------------------------------------------------- 1 | command sbatch --export=PATH --ntasks-per-node=1 2 | option time=* --time $0 3 | option mem=* --mem-per-cpu $0 4 | option mem=0 5 | option num_threads=* --cpus-per-task $0 --ntasks-per-node=1 6 | option num_threads=1 --cpus-per-task 1 --ntasks-per-node=1 7 | default gpu=0 8 | option gpu=0 -p all,hpc 9 | option gpu=* -p hpc --gres=gpu:$0 --time 10-00:00:00 10 | -------------------------------------------------------------------------------- /egs/arctic/sd-melspc/conf/slurm.conf: -------------------------------------------------------------------------------- 1 | command sbatch --export=PATH --ntasks-per-node=1 2 | option time=* --time $0 3 | option mem=* --mem-per-cpu $0 4 | option mem=0 5 | option num_threads=* --cpus-per-task $0 --ntasks-per-node=1 6 | option num_threads=1 --cpus-per-task 1 --ntasks-per-node=1 7 | default gpu=0 8 | option gpu=0 -p all,hpc 9 | option gpu=* -p hpc --gres=gpu:$0 --time 10-00:00:00 10 | -------------------------------------------------------------------------------- /egs/arctic/si-close/conf/slurm.conf: -------------------------------------------------------------------------------- 1 | command sbatch --export=PATH --ntasks-per-node=1 2 | option time=* --time $0 3 | option mem=* --mem-per-cpu $0 4 | option mem=0 5 | option num_threads=* --cpus-per-task $0 --ntasks-per-node=1 6 | option num_threads=1 --cpus-per-task 1 --ntasks-per-node=1 7 | default gpu=0 8 | option gpu=0 -p all,hpc 9 | option gpu=* -p hpc --gres=gpu:$0 --time 10-00:00:00 10 | 
-------------------------------------------------------------------------------- /egs/ljspeech/sd-melspc/conf/slurm.conf: -------------------------------------------------------------------------------- 1 | command sbatch --export=PATH --ntasks-per-node=1 2 | option time=* --time $0 3 | option mem=* --mem-per-cpu $0 4 | option mem=0 5 | option num_threads=* --cpus-per-task $0 --ntasks-per-node=1 6 | option num_threads=1 --cpus-per-task 1 --ntasks-per-node=1 7 | default gpu=0 8 | option gpu=0 -p all,hpc 9 | option gpu=* -p hpc --gres=gpu:$0 --time 10-00:00:00 10 | -------------------------------------------------------------------------------- /egs/m-ailabs-speech/sd/conf/slurm.conf: -------------------------------------------------------------------------------- 1 | command sbatch --export=PATH --ntasks-per-node=1 2 | option time=* --time $0 3 | option mem=* --mem-per-cpu $0 4 | option mem=0 5 | option num_threads=* --cpus-per-task $0 --ntasks-per-node=1 6 | option num_threads=1 --cpus-per-task 1 --ntasks-per-node=1 7 | default gpu=0 8 | option gpu=0 -p all,hpc 9 | option gpu=* -p hpc --gres=gpu:$0 --time 10-00:00:00 10 | -------------------------------------------------------------------------------- /egs/arctic/si-close-melspc/conf/slurm.conf: -------------------------------------------------------------------------------- 1 | command sbatch --export=PATH --ntasks-per-node=1 2 | option time=* --time $0 3 | option mem=* --mem-per-cpu $0 4 | option mem=0 5 | option num_threads=* --cpus-per-task $0 --ntasks-per-node=1 6 | option num_threads=1 --cpus-per-task 1 --ntasks-per-node=1 7 | default gpu=0 8 | option gpu=0 -p all,hpc 9 | option gpu=* -p hpc --gres=gpu:$0 --time 10-00:00:00 10 | -------------------------------------------------------------------------------- /egs/arctic/si-open-melspc/conf/slurm.conf: -------------------------------------------------------------------------------- 1 | command sbatch --export=PATH --ntasks-per-node=1 2 | option time=* --time 
$0 3 | option mem=* --mem-per-cpu $0 4 | option mem=0 5 | option num_threads=* --cpus-per-task $0 --ntasks-per-node=1 6 | option num_threads=1 --cpus-per-task 1 --ntasks-per-node=1 7 | default gpu=0 8 | option gpu=0 -p all,hpc 9 | option gpu=* -p hpc --gres=gpu:$0 --time 10-00:00:00 10 | -------------------------------------------------------------------------------- /egs/m-ailabs-speech/sd-melspc/conf/slurm.conf: -------------------------------------------------------------------------------- 1 | command sbatch --export=PATH --ntasks-per-node=1 2 | option time=* --time $0 3 | option mem=* --mem-per-cpu $0 4 | option mem=0 5 | option num_threads=* --cpus-per-task $0 --ntasks-per-node=1 6 | option num_threads=1 --cpus-per-task 1 --ntasks-per-node=1 7 | default gpu=0 8 | option gpu=0 -p all,hpc 9 | option gpu=* -p hpc --gres=gpu:$0 --time 10-00:00:00 10 | -------------------------------------------------------------------------------- /tools/Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: all clean 2 | 3 | all: venv/bin/activate 4 | 5 | venv/bin/activate: 6 | test -d venv || virtualenv -p python3.6 venv 7 | . venv/bin/activate && pip install --upgrade pip 8 | . venv/bin/activate && cd ../ && pip install torch==1.0.1 torchvision==0.2.2 9 | . venv/bin/activate && cd ../ && pip install -e . 10 | . 
venv/bin/activate && cd ../ && pip install -e .[test] 11 | 12 | clean: 13 | rm -fr venv 14 | find -iname "*.pyc" -delete 15 | -------------------------------------------------------------------------------- /test/test_upsampling.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Copyright 2017 Tomoki Hayashi (Nagoya University) 4 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 5 | 6 | import numpy as np 7 | import torch 8 | 9 | from wavenet_vocoder.nets import initialize 10 | from wavenet_vocoder.nets import UpSampling 11 | 12 | 13 | def test_upsampling(): 14 | aux = np.random.randn(1, 28, 1000) 15 | conv = UpSampling(10) 16 | conv.apply(initialize) 17 | batch = torch.from_numpy(aux).float() 18 | out = conv(batch) 19 | out = out.detach().numpy() 20 | assert out.shape[-1] == aux.shape[-1] * 10 21 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | 3 | cache: 4 | - pip 5 | - ccache 6 | 7 | matrix: 8 | include: 9 | - os: linux 10 | python: "3.6" 11 | 12 | install: 13 | - pip3 install -U pip wheel 14 | - pip3 install numpy 15 | # NOTE: use 1.0.1 for travis check because 1.1.0 > argmax behavior is strange 16 | - pip3 install https://download.pytorch.org/whl/cpu/torch-1.0.1.post2-cp36-cp36m-linux_x86_64.whl 17 | - pip3 install torchvision==0.2.2 18 | - pip3 install -e . 19 | - pip3 install -e .[test] 20 | 21 | script: 22 | - flake8 wavenet_vocoder test 23 | - autopep8 -r wavenet_vocoder test --exclude wavenet_vocoder/utils --global-config .pep8 --diff --max-line-length 120 | tee check_autopep8 24 | - test ! 
-s check_autopep8 25 | - pytest 26 | 27 | sudo: false 28 | 29 | addons: 30 | apt: 31 | packages: 32 | - cmake 33 | - python3-dev 34 | -------------------------------------------------------------------------------- /egs/arctic/sd/cmd.sh: -------------------------------------------------------------------------------- 1 | # you can change cmd.sh depending on what type of queue you are using. 2 | # If you have no queueing system and want to run on a local machine, you 3 | # can change all instances 'queue.pl' to run.pl (but be careful and run 4 | # commands one by one: most recipes will exhaust the memory on your 5 | # machine). queue.pl works with GridEngine (qsub). slurm.pl works 6 | # with slurm. Different queues are configured differently, with different 7 | # queue names and different ways of specifying things like memory; 8 | # to account for these differences you can create and edit the file 9 | # conf/queue.conf to match your queue's configuration. Search for 10 | # conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, 11 | # or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. 12 | 13 | # for local 14 | export train_cmd="run.pl" 15 | export cuda_cmd="run.pl --gpu 1" 16 | 17 | # for slurm (you can change configuration file "conf/slurm.conf") 18 | # export train_cmd="slurm.pl --config conf/slurm.conf" 19 | # export cuda_cmd="slurm.pl --gpu 1 --config conf/slurm.conf" 20 | -------------------------------------------------------------------------------- /egs/arctic/sd-melspc/cmd.sh: -------------------------------------------------------------------------------- 1 | # you can change cmd.sh depending on what type of queue you are using. 2 | # If you have no queueing system and want to run on a local machine, you 3 | # can change all instances 'queue.pl' to run.pl (but be careful and run 4 | # commands one by one: most recipes will exhaust the memory on your 5 | # machine). queue.pl works with GridEngine (qsub). 
slurm.pl works 6 | # with slurm. Different queues are configured differently, with different 7 | # queue names and different ways of specifying things like memory; 8 | # to account for these differences you can create and edit the file 9 | # conf/queue.conf to match your queue's configuration. Search for 10 | # conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, 11 | # or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. 12 | 13 | # for local 14 | export train_cmd="run.pl" 15 | export cuda_cmd="run.pl --gpu 1" 16 | 17 | # for slurm (you can change configuration file "conf/slurm.conf") 18 | # export train_cmd="slurm.pl --config conf/slurm.conf" 19 | # export cuda_cmd="slurm.pl --gpu 1 --config conf/slurm.conf" 20 | -------------------------------------------------------------------------------- /egs/ljspeech/sd/cmd.sh: -------------------------------------------------------------------------------- 1 | # you can change cmd.sh depending on what type of queue you are using. 2 | # If you have no queueing system and want to run on a local machine, you 3 | # can change all instances 'queue.pl' to run.pl (but be careful and run 4 | # commands one by one: most recipes will exhaust the memory on your 5 | # machine). queue.pl works with GridEngine (qsub). slurm.pl works 6 | # with slurm. Different queues are configured differently, with different 7 | # queue names and different ways of specifying things like memory; 8 | # to account for these differences you can create and edit the file 9 | # conf/queue.conf to match your queue's configuration. Search for 10 | # conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, 11 | # or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. 
12 | 13 | # for local 14 | export train_cmd="run.pl" 15 | export cuda_cmd="run.pl --gpu 1" 16 | export max_jobs=1 17 | 18 | # for slurm (you can change configuration file "conf/slurm.conf") 19 | # export train_cmd="slurm.pl --config conf/slurm.conf" 20 | # export cuda_cmd="slurm.pl --gpu 1 --config conf/slurm.conf" 21 | # export max_jobs=-1 22 | -------------------------------------------------------------------------------- /egs/arctic/si-close/cmd.sh: -------------------------------------------------------------------------------- 1 | # you can change cmd.sh depending on what type of queue you are using. 2 | # If you have no queueing system and want to run on a local machine, you 3 | # can change all instances 'queue.pl' to run.pl (but be careful and run 4 | # commands one by one: most recipes will exhaust the memory on your 5 | # machine). queue.pl works with GridEngine (qsub). slurm.pl works 6 | # with slurm. Different queues are configured differently, with different 7 | # queue names and different ways of specifying things like memory; 8 | # to account for these differences you can create and edit the file 9 | # conf/queue.conf to match your queue's configuration. Search for 10 | # conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, 11 | # or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. 12 | 13 | # for local 14 | export train_cmd="run.pl" 15 | export cuda_cmd="run.pl --gpu 1" 16 | export max_jobs=1 17 | 18 | # for slurm (you can change configuration file "conf/slurm.conf") 19 | # export train_cmd="slurm.pl --config conf/slurm.conf" 20 | # export cuda_cmd="slurm.pl --gpu 1 --config conf/slurm.conf" 21 | # export max_jobs=-1 22 | -------------------------------------------------------------------------------- /egs/arctic/si-open/cmd.sh: -------------------------------------------------------------------------------- 1 | # you can change cmd.sh depending on what type of queue you are using. 
2 | # If you have no queueing system and want to run on a local machine, you 3 | # can change all instances 'queue.pl' to run.pl (but be careful and run 4 | # commands one by one: most recipes will exhaust the memory on your 5 | # machine). queue.pl works with GridEngine (qsub). slurm.pl works 6 | # with slurm. Different queues are configured differently, with different 7 | # queue names and different ways of specifying things like memory; 8 | # to account for these differences you can create and edit the file 9 | # conf/queue.conf to match your queue's configuration. Search for 10 | # conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, 11 | # or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. 12 | 13 | # for local 14 | export train_cmd="run.pl" 15 | export cuda_cmd="run.pl --gpu 1" 16 | export max_jobs=1 17 | 18 | # for slurm (you can change configuration file "conf/slurm.conf") 19 | # export train_cmd="slurm.pl --config conf/slurm.conf" 20 | # export cuda_cmd="slurm.pl --gpu 1 --config conf/slurm.conf" 21 | # export max_jobs=-1 22 | -------------------------------------------------------------------------------- /egs/arctic/si-close-melspc/cmd.sh: -------------------------------------------------------------------------------- 1 | # you can change cmd.sh depending on what type of queue you are using. 2 | # If you have no queueing system and want to run on a local machine, you 3 | # can change all instances 'queue.pl' to run.pl (but be careful and run 4 | # commands one by one: most recipes will exhaust the memory on your 5 | # machine). queue.pl works with GridEngine (qsub). slurm.pl works 6 | # with slurm. Different queues are configured differently, with different 7 | # queue names and different ways of specifying things like memory; 8 | # to account for these differences you can create and edit the file 9 | # conf/queue.conf to match your queue's configuration. 
Search for 10 | # conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, 11 | # or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. 12 | 13 | # for local 14 | export train_cmd="run.pl" 15 | export cuda_cmd="run.pl --gpu 1" 16 | export max_jobs=1 17 | 18 | # for slurm (you can change configuration file "conf/slurm.conf") 19 | # export train_cmd="slurm.pl --config conf/slurm.conf" 20 | # export cuda_cmd="slurm.pl --gpu 1 --config conf/slurm.conf" 21 | # export max_jobs=-1 22 | -------------------------------------------------------------------------------- /egs/arctic/si-open-melspc/cmd.sh: -------------------------------------------------------------------------------- 1 | # you can change cmd.sh depending on what type of queue you are using. 2 | # If you have no queueing system and want to run on a local machine, you 3 | # can change all instances 'queue.pl' to run.pl (but be careful and run 4 | # commands one by one: most recipes will exhaust the memory on your 5 | # machine). queue.pl works with GridEngine (qsub). slurm.pl works 6 | # with slurm. Different queues are configured differently, with different 7 | # queue names and different ways of specifying things like memory; 8 | # to account for these differences you can create and edit the file 9 | # conf/queue.conf to match your queue's configuration. Search for 10 | # conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, 11 | # or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. 
12 | 13 | # for local 14 | export train_cmd="run.pl" 15 | export cuda_cmd="run.pl --gpu 1" 16 | export max_jobs=1 17 | 18 | # for slurm (you can change configuration file "conf/slurm.conf") 19 | # export train_cmd="slurm.pl --config conf/slurm.conf" 20 | # export cuda_cmd="slurm.pl --gpu 1 --config conf/slurm.conf" 21 | # export max_jobs=-1 22 | -------------------------------------------------------------------------------- /egs/ljspeech/sd-melspc/cmd.sh: -------------------------------------------------------------------------------- 1 | # you can change cmd.sh depending on what type of queue you are using. 2 | # If you have no queueing system and want to run on a local machine, you 3 | # can change all instances 'queue.pl' to run.pl (but be careful and run 4 | # commands one by one: most recipes will exhaust the memory on your 5 | # machine). queue.pl works with GridEngine (qsub). slurm.pl works 6 | # with slurm. Different queues are configured differently, with different 7 | # queue names and different ways of specifying things like memory; 8 | # to account for these differences you can create and edit the file 9 | # conf/queue.conf to match your queue's configuration. Search for 10 | # conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, 11 | # or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. 12 | 13 | # for local 14 | export train_cmd="run.pl" 15 | export cuda_cmd="run.pl --gpu 1" 16 | export max_jobs=1 17 | 18 | # for slurm (you can change configuration file "conf/slurm.conf") 19 | # export train_cmd="slurm.pl --config conf/slurm.conf" 20 | # export cuda_cmd="slurm.pl --gpu 1 --config conf/slurm.conf" 21 | # export max_jobs=-1 22 | -------------------------------------------------------------------------------- /egs/m-ailabs-speech/sd/cmd.sh: -------------------------------------------------------------------------------- 1 | # you can change cmd.sh depending on what type of queue you are using. 
2 | # If you have no queueing system and want to run on a local machine, you 3 | # can change all instances 'queue.pl' to run.pl (but be careful and run 4 | # commands one by one: most recipes will exhaust the memory on your 5 | # machine). queue.pl works with GridEngine (qsub). slurm.pl works 6 | # with slurm. Different queues are configured differently, with different 7 | # queue names and different ways of specifying things like memory; 8 | # to account for these differences you can create and edit the file 9 | # conf/queue.conf to match your queue's configuration. Search for 10 | # conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, 11 | # or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. 12 | 13 | # for local 14 | export train_cmd="run.pl" 15 | export cuda_cmd="run.pl --gpu 1" 16 | export max_jobs=1 17 | 18 | # for slurm (you can change configuration file "conf/slurm.conf") 19 | # export train_cmd="slurm.pl --config conf/slurm.conf" 20 | # export cuda_cmd="slurm.pl --gpu 1 --config conf/slurm.conf" 21 | # export max_jobs=-1 22 | -------------------------------------------------------------------------------- /egs/m-ailabs-speech/sd-melspc/cmd.sh: -------------------------------------------------------------------------------- 1 | # you can change cmd.sh depending on what type of queue you are using. 2 | # If you have no queueing system and want to run on a local machine, you 3 | # can change all instances 'queue.pl' to run.pl (but be careful and run 4 | # commands one by one: most recipes will exhaust the memory on your 5 | # machine). queue.pl works with GridEngine (qsub). slurm.pl works 6 | # with slurm. Different queues are configured differently, with different 7 | # queue names and different ways of specifying things like memory; 8 | # to account for these differences you can create and edit the file 9 | # conf/queue.conf to match your queue's configuration. 
#!/bin/bash

# Download a compressed file from Google Drive and extract it.
#
# The archive is fetched into a temporary file inside <download_dir>,
# extracted into <download_dir>, and every temporary file is removed when the
# script exits (on success and on failure).

# Copyright 2019 Tomoki Hayashi
# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)

share_url=$1
download_dir=${2:-"downloads"}
file_ext=${3:-"tar.gz"}

if [ "$1" = "--help" ] || [ $# -lt 1 ] || [ $# -gt 3 ]; then
    echo "Usage: $0 <share_url> [<download_dir> <file_ext>]";
    echo "e.g.: $0 https://drive.google.com/open?id=<file_id> downloads tar.gz"
    echo "Options:"
    echo "    <download_dir>: directory to save downloaded file. (Default=downloads)"
    echo "    <file_ext>: file extension of the file to be downloaded. (Default=tar.gz)"
    if [ "$1" = "--help" ]; then
        exit 0;
    fi
    exit 1;
fi

# reject unsupported extensions before anything is downloaded
case "${file_ext}" in
    zip|tar|tar.*|tgz) ;;
    *) echo "Unsupported file extension." >&2; exit 1 ;;
esac

mkdir -p "${download_dir}"
tmp=$(mktemp "${download_dir}/XXXXXX.${file_ext}")
# per-run scratch files (instead of fixed names under /tmp) so that
# concurrent downloads do not overwrite each other's cookies / html
cookies=$(mktemp)
html=$(mktemp)
trap 'rm -f "${tmp}" "${cookies}" "${html}"' EXIT

# the file id is the value of the "id" query parameter of the sharing link
# ref: https://qiita.com/namakemono/items/c963e75e0af3f7eed732
file_id=$(echo "${share_url}" | sed -n -E 's/^.*[?&]id=([^&#]+).*$/\1/p')
if [ -z "${file_id}" ]; then
    echo "Could not find a file id (id=...) in ${share_url}" >&2
    exit 1
fi

# decompress <archive> <target_dir>: pick the extractor from the file suffix
decompress () {
    filename=$1
    decompress_dir=$2
    case "${filename}" in
        *.zip)
            unzip "${filename}" -d "${decompress_dir}" ;;
        *.tar|*.tar.*|*.tgz)
            # no "z" flag: tar detects the compression itself when reading,
            # so plain .tar archives work as well as gzipped ones
            tar xvf "${filename}" -C "${decompress_dir}" ;;
        *)
            echo "Unsupported file extension." >&2
            return 1 ;;
    esac
}

# Try-catch like processing: first try a direct download, then fall back
(
    wget "https://drive.google.com/uc?export=download&id=${file_id}" -O "${tmp}" && \
        decompress "${tmp}" "${download_dir}"
) || {
    # Do not allow error from here
    set -e
    # sometimes, wget from google drive fails due to the virus check confirmation page
    # to get around it, fetch the confirmation page and follow its download link
    # see https://stackoverflow.com/questions/20665881/direct-download-from-google-drive-using-google-drive-api
    curl -c "${cookies}" "https://drive.google.com/uc?export=download&id=${file_id}" > "${html}"
    # the href is HTML-escaped, so "&amp;" has to be turned back into "&"
    postfix=$(grep -Po 'uc-download-link" [^>]* href="\K[^"]*' "${html}" | sed 's/&amp;/\&/g')
    curl -L -b "${cookies}" "https://drive.google.com${postfix}" > "${tmp}"
    decompress "${tmp}" "${download_dir}"
}

# temporary files are removed by the EXIT trap
echo "Successfully downloaded and extracted file from ${share_url}"
find_packages 10 | from setuptools import setup 11 | 12 | 13 | if LooseVersion(sys.version) < LooseVersion("3.6"): 14 | raise RuntimeError( 15 | "Python>=3.6 is required, " 16 | "but your Python is {}".format(sys.version)) 17 | if LooseVersion(pip.__version__) < LooseVersion("19"): 18 | raise RuntimeError( 19 | "pip>=19.0.0 is required, but your pip is {}. " 20 | "Try again after \"pip install -U pip\"".format(pip.__version__)) 21 | 22 | requirements = { 23 | "install": [ 24 | "h5py>=2.8.0", 25 | "scikit-learn==0.22.2", 26 | "librosa>=0.6.2", 27 | "soundfile>=0.10.2", 28 | "torch>=1.0.1", 29 | "torchvision>=0.2.2", 30 | "sprocket-vc>=0.18.2", 31 | "matplotlib>=3.0.3", 32 | ], 33 | "setup": [ 34 | "numpy", 35 | "pytest-runner" 36 | ], 37 | "test": [ 38 | "pytest>=3.3.0", 39 | "hacking==1.1.0", 40 | "autopep8==1.2.4", 41 | ]} 42 | install_requires = requirements["install"] 43 | setup_requires = requirements["setup"] 44 | tests_require = requirements["test"] 45 | extras_require = {k: v for k, v in requirements.items() 46 | if k not in ["install", "setup"]} 47 | 48 | dirname = os.path.dirname(__file__) 49 | setup(name="wavenet_vocoder", 50 | version="0.1.1", 51 | url="http://github.com/kan-bayashi/PytorchWaveNetVocoder", 52 | author="Tomoki Hayashi", 53 | author_email="hayashi.tomoki@g.sp.m.is.nagoya-u.ac.jp", 54 | description="Pytorch WaveNet Vocoder", 55 | long_description=open(os.path.join(dirname, "README.md"), 56 | encoding="utf-8").read(), 57 | license="Apache Software License", 58 | packages=find_packages(include="wavenet_vocoder*"), 59 | install_requires=install_requires, 60 | setup_requires=setup_requires, 61 | tests_require=tests_require, 62 | extras_require=extras_require, 63 | classifiers=[ 64 | "Programming Language :: Python", 65 | "Programming Language :: Python :: 3.6", 66 | "Intended Audience :: Science/Research", 67 | "Operating System :: POSIX :: Linux", 68 | "License :: OSI Approved :: Apache Software License", 69 | "Topic :: Software Development :: 
def calc_stats(file_list, args):
    """Accumulate feature statistics over all files and store them in hdf5.

    Args:
        file_list (list): Paths of the hdf5 feature files to read.
        args (Namespace): Provides ``feature_type`` (name of the hdf5 dataset
            to read, also used as prefix of the written datasets) and
            ``stats`` (path of the hdf5 file to write).

    """
    feat_key = "/" + args.feature_type
    num_files = len(file_list)
    scaler = StandardScaler()

    # incrementally update the statistics, one file at a time
    for idx, path in enumerate(file_list, 1):
        logging.info("now processing %s (%d/%d)" % (path, idx, num_files))
        scaler.partial_fit(read_hdf5(path, feat_key))

    mean, scale = scaler.mean_, scaler.scale_
    if args.feature_type == "world":
        # dimension 0 is left unnormalized (mean 0, scale 1);
        # presumably it is the U/V flag of the world features -- TODO confirm
        mean[0], scale[0] = 0.0, 1.0

    # write to hdf5
    for name, value in (("mean", mean), ("scale", scale)):
        write_hdf5(args.stats, feat_key + "/" + name, np.float32(value))
57 | 58 | args = parser.parse_args() 59 | 60 | # set log level 61 | if args.verbose == 1: 62 | logging.basicConfig(level=logging.INFO, 63 | format='%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s', 64 | datefmt='%m/%d/%Y %I:%M:%S') 65 | elif args.verbose > 1: 66 | logging.basicConfig(level=logging.DEBUG, 67 | format='%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s', 68 | datefmt='%m/%d/%Y %I:%M:%S') 69 | else: 70 | logging.basicConfig(level=logging.WARNING, 71 | format='%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s', 72 | datefmt='%m/%d/%Y %I:%M:%S') 73 | logging.warning("logging is disabled.") 74 | 75 | # show arguments 76 | for key, value in vars(args).items(): 77 | logging.info("%s = %s" % (key, str(value))) 78 | 79 | # read file list 80 | file_list = read_txt(args.feats) 81 | logging.info("number of utterances = %d" % len(file_list)) 82 | 83 | # calculate statistics 84 | calc_stats(file_list, args) 85 | 86 | 87 | if __name__ == "__main__": 88 | main() 89 | -------------------------------------------------------------------------------- /test/test_preprocessing.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Copyright 2017 Tomoki Hayashi (Nagoya University) 4 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 5 | 6 | import argparse 7 | import os 8 | import shutil 9 | 10 | import numpy as np 11 | import pytest 12 | 13 | from scipy.io import wavfile 14 | 15 | from wavenet_vocoder.bin.calc_stats import calc_stats 16 | from wavenet_vocoder.bin.feature_extract import melcepstrum_extract 17 | from wavenet_vocoder.bin.feature_extract import melspectrogram_extract 18 | from wavenet_vocoder.bin.feature_extract import world_feature_extract 19 | from wavenet_vocoder.bin.noise_shaping import convert_mcep_to_mlsa_coef 20 | from wavenet_vocoder.bin.noise_shaping import noise_shaping 21 | from wavenet_vocoder.utils import check_hdf5 22 | from 
wavenet_vocoder.utils import find_files 23 | from wavenet_vocoder.utils import read_hdf5 24 | from wavenet_vocoder.utils import write_hdf5 25 | 26 | 27 | def make_dummy_wav(name, maxlen=32000, fs=16000): 28 | length = np.random.randint(maxlen // 2, maxlen) 29 | x = np.random.randn(length) 30 | x = x / np.abs(x).max() 31 | x = np.int16(x * (np.iinfo(np.int16).max + 1)) 32 | wavfile.write(name, fs, x) 33 | 34 | 35 | def make_args(**kwargs): 36 | defaults = dict( 37 | hdf5dir="tmp/hdf5", 38 | wavdir="tmp/wav_filtered", 39 | outdir="tmp/wav_nwf", 40 | stats="tmp/stats.h5", 41 | feature_type="world", 42 | fs=16000, 43 | shiftms=5, 44 | minf0=40, 45 | maxf0=400, 46 | mspc_dim=80, 47 | mcep_dim=24, 48 | mcep_alpha=0.41, 49 | fftl=1024, 50 | highpass_cutoff=70, 51 | mcep_dim_start=2, 52 | mcep_dim_end=25, 53 | fmin=None, 54 | fmax=None, 55 | mag=0.5, 56 | save_wav=True, 57 | inv=False, 58 | ) 59 | defaults.update(kwargs) 60 | return argparse.Namespace(**defaults) 61 | 62 | 63 | @pytest.mark.parametrize("feature_type", [ 64 | ("melspc"), ("world"), ("mcep"), 65 | ]) 66 | def test_preprocessing(feature_type): 67 | # make arguments 68 | args = make_args(feature_type=feature_type) 69 | 70 | # prepare dummy wav files 71 | wavdir = "tmp/wav" 72 | if not os.path.exists(wavdir): 73 | os.makedirs(wavdir) 74 | for i in range(5): 75 | make_dummy_wav(wavdir + "/%d.wav" % i, 8000, args.fs) 76 | 77 | # feature extract 78 | wav_list = find_files(wavdir, "*.wav") 79 | if not os.path.exists(args.wavdir): 80 | os.makedirs(args.wavdir) 81 | if args.feature_type == "world": 82 | world_feature_extract(wav_list, args) 83 | elif args.feature_type == "melspc": 84 | melspectrogram_extract(wav_list, args) 85 | else: 86 | melcepstrum_extract(wav_list, args) 87 | 88 | # calc_stats 89 | file_list = find_files(args.hdf5dir, "*.h5") 90 | calc_stats(file_list, args) 91 | 92 | # noise shaping 93 | if feature_type != "melspc": 94 | wav_list = find_files(args.wavdir, "*.wav") 95 | if not 
#!/bin/bash

# Copyright 2012  Johns Hopkins University (Author: Daniel Povey);
#                 Arnab Ghoshal, Karel Vesely

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.


# Parse command-line options.
# To be sourced by another script (as in ". parse_options.sh").
# Option format is: --option-name arg
# and shell variable "option_name" gets set to value "arg."
# The exception is --help, which takes no arguments, but prints the
# $help_message variable (if defined).
#
# Only variables that the sourcing script has already defined are accepted as
# options; parsing stops at the first argument that does not start with "--",
# and the consumed options are shifted away from the positional parameters.


###
### The --config file options have lower priority to command line
### options, so we need to import them first...
###

# Now import all the configs specified by command-line, in left-to-right order
for ((argpos=1; argpos<$#; argpos++)); do
  if [ "${!argpos}" == "--config" ]; then
    argpos_plus1=$((argpos+1))
    config=${!argpos_plus1}
    [ ! -r $config ] && echo "$0: missing config '$config'" && exit 1
    . $config  # source the config file.
  fi
done


###
### Now we process the command line options
###
while true; do
  [ -z "${1:-}" ] && break;  # break if there are no arguments
  case "$1" in
    # If the enclosing script is called with --help option, print the help
    # message and exit.  Scripts should put help messages in $help_message
    --help|-h) if [ -z "$help_message" ]; then echo "No help found." 1>&2;
      else printf "$help_message\n" 1>&2 ; fi;
      exit 0 ;;
    # The "--name=value" form is rejected explicitly; only "--name value" is supported.
    --*=*) echo "$0: options to scripts must be of the form --name value, got '$1'"
      exit 1 ;;
    # If the first command-line argument begins with "--" (e.g. --foo-bar),
    # then work out the variable name as $name, which will equal "foo_bar".
    --*) name=`echo "$1" | sed s/^--// | sed s/-/_/g`;
      # Next we test whether the variable in question is undefined-- if so it's
      # an invalid option and we die.  Note: $0 evaluates to the name of the
      # enclosing script.
      # The test [ -z ${foo_bar+xxx} ] will return true if the variable foo_bar
      # is undefined.  We then have to wrap this test inside "eval" because
      # foo_bar is itself inside a variable ($name).
      eval '[ -z "${'$name'+xxx}" ]' && echo "$0: invalid option $1" 1>&2 && exit 1;

      # Remember the current (default) value so its type can be checked below.
      oldval="`eval echo \\$$name`";
      # Work out whether we seem to be expecting a Boolean argument.
      if [ "$oldval" == "true" ] || [ "$oldval" == "false" ]; then
        was_bool=true;
      else
        was_bool=false;
      fi

      # Set the variable to the right value-- the escaped quotes make it work if
      # the option had spaces, like --cmd "queue.pl -sync y"
      eval $name=\"$2\";

      # Check that Boolean-valued arguments are really Boolean.
      if $was_bool && [[ "$2" != "true" && "$2" != "false" ]]; then
        echo "$0: expected \"true\" or \"false\": $1 $2" 1>&2
        exit 1;
      fi
      shift 2;
      ;;
  # First argument that is not an option: stop parsing and leave it in place.
  *) break;
  esac
done


# Check for an empty argument to the --cmd option, which can easily occur as a
# result of scripting errors.
[ ! -z "${cmd+xxx}" ] && [ -z "$cmd" ] && echo "$0: empty argument to --cmd option" 1>&2 && exit 1;


true; # so this script returns exit code 0.
14 | 15 | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/kan-bayashi/INTERSPEECH19_TUTORIAL/blob/master/notebooks/wavenet_vocoder/wavenet_vocoder.ipynb) 16 | 17 | ## Key features 18 | 19 | - Support kaldi-like recipe, easy to reproduce the results 20 | - Support multi-gpu training / decoding 21 | - Support world features / mel-spectrogram as auxiliary features 22 | - Support recipes of three public databases 23 | 24 | - [CMU Arctic database](http://www.festvox.org/cmu_arctic/): `egs/arctic` 25 | - [LJ Speech database](https://keithito.com/LJ-Speech-Dataset/): `egs/ljspeech` 26 | - [M-AILABS speech database](http://www.m-ailabs.bayern/en/the-mailabs-speech-dataset/): `egs/m-ailabs-speech` 27 | 28 | ## Requirements 29 | 30 | - python 3.6+ 31 | - virtualenv 32 | - cuda 9.0+ 33 | - cudnn 7.1+ 34 | - nccl 2.0+ (for the use of multi-gpus) 35 | 36 | We recommend using a GPU with more than 10 GB of memory. 37 | 38 | ## Setup 39 | 40 | ### A. Make virtualenv 41 | 42 | ```bash 43 | $ git clone https://github.com/kan-bayashi/PytorchWaveNetVocoder.git 44 | $ cd PytorchWaveNetVocoder/tools 45 | $ make 46 | ``` 47 | 48 | ### B. Install with pip 49 | 50 | ``` 51 | $ git clone https://github.com/kan-bayashi/PytorchWaveNetVocoder.git 52 | $ cd PytorchWaveNetVocoder 53 | 54 | # pytorch 1.0.1 is recommended because it is the only tested version 55 | $ pip install torch==1.0.1 torchvision==0.2.2 56 | $ pip install -e . 57 | 58 | # please make dummy activate file to suppress warning in the recipe 59 | $ mkdir -p tools/venv/bin && touch tools/venv/bin/activate 60 | ``` 61 | 62 | ## How-to-run 63 | 64 | ```bash 65 | $ cd egs/arctic/sd 66 | $ ./run.sh 67 | ``` 68 | 69 | See more details of the recipes in [egs/README.md](egs/README.md). 70 | 71 | ## Results 72 | 73 | You can listen to samples from [kan-bayashi/WaveNetVocoderSamples](https://kan-bayashi.github.io/WaveNetVocoderSamples/). 
74 | 75 | This is the subjective evaluation results using `arctic` recipe. 76 | 77 | **Comparison between model type** 78 | ![](https://kan-bayashi.github.io/WaveNetVocoderSamples/images/mos.bmp) 79 | 80 | **Effect of the amount of training data** 81 | ![](https://kan-bayashi.github.io/WaveNetVocoderSamples/images/mos_num_train.bmp) 82 | 83 | If you want to listen more samples, please access our google drive from [here](https://drive.google.com/drive/folders/1zC1WDiMu4SOdc7UeOayoEe_79PdnPBu6?usp=sharing). 84 | 85 | Here is the list of samples: 86 | - `arctic_raw_16k`: original in arctic database 87 | - `arctic_sd_16k_world`: sd model with world aux feats + noise shaping with world mcep 88 | - `arctic_si-open_16k_world`: si-open model with world aux feats + noise shaping with world mcep 89 | - `arctic_si-close_16k_world`: si-close model with world aux feats + noise shaping with world mcep 90 | - `arctic_si-close_16k_melspc`: si-close model with mel-spectrogram aux feats 91 | - `arctic_si-close_16k_melspc_ns`: si-close model with mel-spectrogram aux feats + noise shaping with stft mcep 92 | - `ljspeech_raw_22.05k`: original in ljspeech database 93 | - `ljspeech_sd_22.05k_world`: sd model with world aux feats + noise shaping with world mcep 94 | - `ljspeech_sd_22.05k_melspc`: sd model with mel-spectrogram aux feats 95 | - `ljspeech_sd_22.05k_melspc_ns`: sd model with mel-spectrogram aux feats + noise shaping with stft mcep 96 | - `m-ailabs_raw_16k`: original in m-ailabs speech database 97 | - `m-ailabs_sd_16k_melspc`: sd model with mel-spectrogram aux feats 98 | 99 | ## References 100 | 101 | Please cite the following articles. 
102 | 103 | ``` 104 | @inproceedings{tamamori2017speaker, 105 | title={Speaker-dependent WaveNet vocoder}, 106 | author={Tamamori, Akira and Hayashi, Tomoki and Kobayashi, Kazuhiro and Takeda, Kazuya and Toda, Tomoki}, 107 | booktitle={Proceedings of Interspeech}, 108 | pages={1118--1122}, 109 | year={2017} 110 | } 111 | @inproceedings{hayashi2017multi, 112 | title={An Investigation of Multi-Speaker Training for WaveNet Vocoder}, 113 | author={Hayashi, Tomoki and Tamamori, Akira and Kobayashi, Kazuhiro and Takeda, Kazuya and Toda, Tomoki}, 114 | booktitle={Proc. ASRU 2017}, 115 | year={2017} 116 | } 117 | @article{hayashi2018sp, 118 | title={複数話者WaveNetボコーダに関する調査}, 119 | author={林知樹 and 小林和弘 and 玉森聡 and 武田一哉 and 戸田智基}, 120 | journal={電子情報通信学会技術研究報告}, 121 | year={2018} 122 | } 123 | ``` 124 | 125 | ## Author 126 | 127 | Tomoki Hayashi @ Nagoya University 128 | e-mail:hayashi.tomoki@g.sp.m.is.nagoya-u.ac.jp 129 | -------------------------------------------------------------------------------- /egs/README.md: -------------------------------------------------------------------------------- 1 | # Outline of recipes 2 | 3 | Here we introduce the outline of recipes. 4 | 5 | If you want to learn step-by-step, you can try the demo recipe in Google colab! 
6 | 7 | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/kan-bayashi/INTERSPEECH19_TUTORIAL/blob/master/notebooks/wavenet_vocoder/wavenet_vocoder.ipynb) 8 | 9 | ## Supported database 10 | 11 | - [CMU Arctic database](http://www.festvox.org/cmu_arctic/): `egs/arctic` 12 | - [LJ Speech database](https://keithito.com/LJ-Speech-Dataset/): `egs/ljspeech` 13 | - [M-AILABS speech database](http://www.m-ailabs.bayern/en/the-mailabs-speech-dataset/): `egs/m-ailabs-speech` 14 | 15 | ## Type of recipe 16 | 17 | `sd`: speaker-dependent model 18 | 19 | - build speaker dependent model 20 | - the speaker of training data is the same as that of evaluation data 21 | - auxiliary features are based on World analysis 22 | - noise shaping with world mel-cepstrum is applied 23 | 24 | `si-open`: speaker-independent model in speaker-open condition 25 | 26 | - build speaker independent model in speaker-open condition 27 | - the speakers of evaluation data are not included in those of training data 28 | - auxiliary features are based on World analysis 29 | - noise shaping with world mel-cepstrum is applied 30 | 31 | `si-close`: speaker-independent model in speaker-closed condition 32 | 33 | - build speaker independent model in speaker-closed condition 34 | - the speakers of evaluation data are included in those of training data 35 | - auxiliary features are based on World analysis 36 | - noise shaping with world mel-cepstrum is applied 37 | 38 | `*-melspc`: model with mel-spectrogram 39 | 40 | - build the model with mel-spectrogram 41 | - auxiliary features are mel-spectrogram 42 | - noise shaping with stft mel-cepstrum is applied 43 | 44 | ## Flow of recipe 45 | 46 | 0. data preparation (`stage 0`) 47 | 1. auxiliary feature extraction (`stage 1`) 48 | 2. statistics calculation (`stage 2`) 49 | 3. noise weighting (`stage 3`) 50 | 4. WaveNet training (`stage 4`) 51 | 5. WaveNet decoding (`stage 5`) 52 | 6. 
noise shaping (`stage 6`) 53 | 54 | ## How-to-run 55 | 56 | ```bash 57 | # change directory to one of the recipe 58 | $ cd arctic/sd 59 | 60 | # run the recipe 61 | $ ./run.sh 62 | 63 | # you can skip some stages (in this case only stage 4,5,6 will be conducted) 64 | $ ./run.sh --stage 456 65 | 66 | # you can also change hyperparameters via command line 67 | $ ./run.sh --lr 1e-3 --batch_length 10000 68 | 69 | # multi-gpu training / decoding are supported (batch size should be greater than #gpus) 70 | $ ./run.sh --n_gpus 3 --batch_size 3 71 | ``` 72 | 73 | ## Run recipe with slurm 74 | 75 | If slurm is installed in your servers, you can run recipes with slurm. 76 | 77 | ```bash 78 | $ cd egs/arctic/sd 79 | 80 | # edit configuration 81 | $ vim cmd.sh 82 | # please edit as follows 83 | -- cmd.sh -- 84 | # for local 85 | # export train_cmd="run.pl" 86 | # export cuda_cmd="run.pl --gpu 1" 87 | 88 | # for slurm (you can change configuration file "conf/slurm.conf") 89 | export train_cmd="slurm.pl --config conf/slurm.conf" 90 | export cuda_cmd="slurm.pl --gpu 1 --config conf/slurm.conf" 91 | 92 | $ vim conf/slurm.conf 93 | # edit 94 | -- slurm.conf -- 95 | command sbatch --export=PATH --ntasks-per-node=1 96 | option time=* --time $0 97 | option mem=* --mem-per-cpu $0 98 | option mem=0 99 | option num_threads=* --cpus-per-task $0 --ntasks-per-node=1 100 | option num_threads=1 --cpus-per-task 1 --ntasks-per-node=1 101 | default gpu=0 102 | option gpu=0 -p 103 | option gpu=* -p --gres=gpu:$0 --time 10-00:00:00 104 | 105 | # run the recipe 106 | $ ./run.sh 107 | ``` 108 | 109 | If you want to know more info about `run.pl` and `slurm.pl`, see [https://kaldi-asr.org/doc/queue.html](https://kaldi-asr.org/doc/queue.html). 
110 | 111 | ## Use pre-trained model to decode your own data 112 | 113 | To synthesize your own data, things what you need are as follows: 114 | 115 | ``` 116 | - checkpoint-final.pkl (model parameter file) 117 | - model.conf (model configuration file) 118 | - stats.h5 (feature statistics file) 119 | - *.wav (your own wav file, should be 16000 Hz) 120 | ``` 121 | 122 | The procedure is as follows: 123 | 124 | ```bash 125 | $ cd egs/arctic/si-close 126 | 127 | # download pre-trained model which trained with 6 arctic speakers and world features 128 | $ wget "https://www.dropbox.com/s/xt7qqmfgamwpqqg/si-close_lr1e-4_wd0_bs20k_ns_up.zip?dl=0" -O si-close_lr1e-4_wd0_bs20k_ns_up.zip 129 | 130 | # unzip 131 | $ unzip si-close_lr1e-4_wd0_bs20k_ns_up.zip 132 | 133 | # make filelist of your own wav files 134 | $ find -name "*.wav" > wav.scp 135 | 136 | # feature extraction 137 | $ . ./path.sh 138 | $ feature_extract.py \ 139 | --waveforms wav.scp \ 140 | --wavdir wav/test \ 141 | --hdf5dir hdf5/test \ 142 | --feature_type world \ 143 | --fs 16000 \ 144 | --shiftms 5 \ 145 | --minf0 \ 146 | --maxf0 \ 147 | --mcep_dim 24 \ 148 | --mcep_alpha 0.41 \ 149 | --highpass_cutoff 70 \ 150 | --fftl 1024 \ 151 | --n_jobs 1 152 | 153 | # make filelist of feature file 154 | $ find hdf5/test -name "*.h5" > feats.scp 155 | 156 | # decode with pre-trained model 157 | $ decode.py \ 158 | --feats feats.scp \ 159 | --stats si-close_lr1e-4_wd0_bs20k_ns_up/stats.h5 \ 160 | --outdir si-close_lr1e-4_wd0_bs20k_ns_up/wav \ 161 | --checkpoint si-close_lr1e-4_wd0_bs20k_ns_up/checkpoint-final.pkl \ 162 | --config si-close_lr1e-4_wd0_bs20k_ns_up/model.conf \ 163 | --fs 16000 \ 164 | --batch_size 32 \ 165 | --n_gpus 1 166 | 167 | # make filelist of generated wav file 168 | $ find si-close_lr1e-4_wd0_bs20k_ns_up/wav -name "*.wav" > wav_generated.scp 169 | 170 | # perform noise shaping 171 | $ noise_shaping.py \ 172 | --waveforms wav_generated.scp \ 173 | --stats si-close_lr1e-4_wd0_bs20k_ns_up/stats.h5 \ 
def convert_mcep_to_mlsa_coef(avg_mcep, mag, alpha):
    """CONVERT AVERAGE MEL-CEPTSRUM TO MLSA FILTER COEFFICIENT.

    The input array is not modified; scaling is performed on a copy.

    Args:
        avg_mcep (ndarray): Averaged Mel-cepstrum (D,).
        mag (float): Magnification of noise shaping.
        alpha (float): All pass constant value.

    Return:
        ndarray: MLSA filter coefficient (D,).

    """
    # the multiplication allocates a new array, so the caller's statistics
    # (possibly a view into a larger array) are not scaled in place
    scaled_mcep = np.asarray(avg_mcep) * mag
    scaled_mcep[0] = 0.0
    coef = pysptk.mc2b(scaled_mcep.astype(np.float64), alpha)
    assert np.isfinite(coef).all()
    return coef


def noise_shaping(wav_list, args):
    """APPLY NOISE SHAPING BASED ON MLSA FILTER.

    Every wav file is filtered with the time-invariant MLSA filter stored in
    the statistics file and written to ``args.outdir`` under the same file name.

    Args:
        wav_list (list): List of wav file paths to be processed.
        args (Namespace): Provides ``stats`` (hdf5 file holding "/mlsa/coef"
            and "/mlsa/alpha"), ``inv`` (negate the coefficients, i.e. apply
            the inverse filter), ``fs``, ``shiftms`` and ``outdir``.

    Raises:
        KeyError: If "/mlsa/coef" is not stored in ``args.stats``.

    """
    # load coefficient of filter
    if not check_hdf5(args.stats, "/mlsa/coef"):
        raise KeyError("\"/mlsa/coef\" is not found in %s." % (args.stats))
    mlsa_coef = read_hdf5(args.stats, "/mlsa/coef")
    alpha = read_hdf5(args.stats, "/mlsa/alpha")
    if args.inv:
        mlsa_coef *= -1.0

    # define synthesizer
    shiftl = int(args.fs / 1000 * args.shiftms)
    synthesizer = pysptk.synthesis.Synthesizer(
        pysptk.synthesis.MLSADF(
            order=mlsa_coef.shape[0] - 1,
            alpha=alpha),
        hopsize=shiftl
    )

    int16_info = np.iinfo(np.int16)
    for i, wav_name in enumerate(wav_list):
        logging.info("now processing %s (%d/%d)" % (wav_name, i + 1, len(wav_list)))

        # load wavfile
        fs, x = wavfile.read(wav_name)
        if x.dtype != np.int16:
            logging.warning("wav file format is not 16 bit PCM.")
        x = np.float64(x)

        # check sampling frequency
        if not fs == args.fs:
            logging.error("sampling frequency is not matched.")
            sys.exit(1)

        # replicate coef for time-invariant filtering
        num_frames = len(x) // shiftl + 1
        mlsa_coefs = np.float64(np.tile(mlsa_coef, [num_frames, 1]))

        # synthesis and write
        x_ns = synthesizer.synthesis(x, mlsa_coefs)
        # filtering can push samples beyond the 16 bit range; saturate them
        # instead of letting the integer cast wrap around
        x_ns = np.clip(x_ns, int16_info.min, int16_info.max)
        write_name = os.path.join(args.outdir, os.path.basename(wav_name))
        wavfile.write(write_name, args.fs, np.int16(x_ns))
100 | help="filename of hdf5 format") 101 | parser.add_argument( 102 | "--outdir", default=None, 103 | help="directory to save preprocessed wav file") 104 | parser.add_argument( 105 | "--fs", default=16000, 106 | type=int, help="Sampling frequency") 107 | parser.add_argument( 108 | "--shiftms", default=5, 109 | type=float, help="Frame shift in msec") 110 | parser.add_argument( 111 | "--feature_type", default="world", choices=["world", "mcep", "melspc"], 112 | type=str, help="feature type") 113 | parser.add_argument( 114 | "--mcep_dim_start", default=2, 115 | type=int, help="Start index of mel cepstrum") 116 | parser.add_argument( 117 | "--mcep_dim_end", default=27, 118 | type=int, help="End index of mel cepstrum") 119 | parser.add_argument( 120 | "--mcep_alpha", default=0.41, 121 | type=float, help="Alpha of mel cepstrum") 122 | parser.add_argument( 123 | "--mag", default=0.5, 124 | type=float, help="magnification of noise shaping") 125 | parser.add_argument( 126 | "--verbose", default=1, 127 | type=int, help="log message level") 128 | parser.add_argument( 129 | '--n_jobs', default=10, 130 | type=int, help="number of parallel jobs") 131 | parser.add_argument( 132 | '--inv', default=False, type=strtobool, 133 | help="if True, inverse filtering will be performed") 134 | 135 | args = parser.parse_args() 136 | 137 | # set log level 138 | if args.verbose == 1: 139 | logging.basicConfig(level=logging.INFO, 140 | format='%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s', 141 | datefmt='%m/%d/%Y %I:%M:%S') 142 | elif args.verbose > 1: 143 | logging.basicConfig(level=logging.DEBUG, 144 | format='%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s', 145 | datefmt='%m/%d/%Y %I:%M:%S') 146 | else: 147 | logging.basicConfig(level=logging.WARNING, 148 | format='%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s', 149 | datefmt='%m/%d/%Y %I:%M:%S') 150 | logging.warning("logging is disabled.") 151 | 152 | # show arguments 153 | for key, value 
in vars(args).items(): 154 | logging.info("%s = %s" % (key, str(value))) 155 | 156 | # read list 157 | if os.path.isdir(args.waveforms): 158 | file_list = sorted(find_files(args.waveforms, "*.wav")) 159 | else: 160 | file_list = read_txt(args.waveforms) 161 | logging.info("number of utterances = %d" % len(file_list)) 162 | 163 | # check directory existence 164 | if not os.path.exists(args.outdir): 165 | os.makedirs(args.outdir) 166 | 167 | # divide list 168 | file_lists = np.array_split(file_list, args.n_jobs) 169 | file_lists = [f_list.tolist() for f_list in file_lists] 170 | 171 | # calculate MLSA coef ans save it 172 | if not check_hdf5(args.stats, "/mlsa/coef"): 173 | avg_mcep = read_hdf5(args.stats, args.feature_type + "/mean") 174 | if args.feature_type == "world": 175 | avg_mcep = avg_mcep[args.mcep_dim_start:args.mcep_dim_end] 176 | mlsa_coef = convert_mcep_to_mlsa_coef(avg_mcep, args.mag, args.mcep_alpha) 177 | write_hdf5(args.stats, "/mlsa/coef", mlsa_coef) 178 | write_hdf5(args.stats, "/mlsa/alpha", args.mcep_alpha) 179 | 180 | # multi processing 181 | processes = [] 182 | if args.feature_type == "melspc": 183 | # TODO(kan-bayashi): implement noise shaping using melspectrogram 184 | raise NotImplementedError("currently, support only world and mcep.") 185 | for f in file_lists: 186 | p = mp.Process(target=noise_shaping, args=(f, args,)) 187 | p.start() 188 | processes.append(p) 189 | 190 | # wait for all process 191 | for p in processes: 192 | p.join() 193 | 194 | 195 | if __name__ == "__main__": 196 | main() 197 | -------------------------------------------------------------------------------- /wavenet_vocoder/utils/utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Copyright 2017 Tomoki Hayashi (Nagoya University) 4 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 5 | 6 | import fnmatch 7 | import logging 8 | import os 9 | import sys 10 | import threading 11 | 12 | 
def check_hdf5(hdf5_name, hdf5_path):
    """CHECK HDF5 EXISTENCE.

    Args:
        hdf5_name (str): Filename of hdf5 file.
        hdf5_path (str): Dataset name in hdf5 file.

    Returns:
        bool: Dataset exists then return True.

    """
    if not os.path.exists(hdf5_name):
        return False
    with h5py.File(hdf5_name, "r") as f:
        return hdf5_path in f


def read_hdf5(hdf5_name, hdf5_path):
    """READ HDF5 DATASET.

    The process is terminated with exit status 1 when the file or the
    dataset does not exist.

    Args:
        hdf5_name (str): Filename of hdf5 file.
        hdf5_path (str): Dataset name in hdf5 file.

    Return:
        any: Dataset values.

    """
    if not os.path.exists(hdf5_name):
        logging.error("there is no such a hdf5 file (%s)." % hdf5_name)
        sys.exit(1)

    # the context manager closes the file on the error path as well
    with h5py.File(hdf5_name, "r") as hdf5_file:
        if hdf5_path not in hdf5_file:
            logging.error("there is no such a data in hdf5 file. (%s)" % hdf5_path)
            sys.exit(1)
        hdf5_data = hdf5_file[hdf5_path][()]

    return hdf5_data


def shape_hdf5(hdf5_name, hdf5_path):
    """GET HDF5 DATASET SHAPE.

    The process is terminated with exit status 1 when the file or the
    dataset does not exist.

    Args:
        hdf5_name (str): Filename of hdf5 file.
        hdf5_path (str): Dataset name in hdf5 file.

    Returns:
        (tuple): Shape of dataset.

    """
    if not check_hdf5(hdf5_name, hdf5_path):
        logging.error("there is no such a file or dataset")
        sys.exit(1)
    with h5py.File(hdf5_name, "r") as f:
        return f[hdf5_path].shape


def write_hdf5(hdf5_name, hdf5_path, write_data, is_overwrite=True):
    """WRITE DATASET TO HDF5.

    Missing parent directories are created. When the dataset already exists
    it is recreated if ``is_overwrite`` is True, otherwise the process is
    terminated with exit status 1.

    Args:
        hdf5_name (str): Hdf5 dataset filename.
        hdf5_path (str): Dataset path in hdf5.
        write_data (ndarray): Data to write.
        is_overwrite (bool): Whether to overwrite dataset.

    """
    # convert to numpy array
    write_data = np.array(write_data)

    # check folder existence
    folder_name, _ = os.path.split(hdf5_name)
    if len(folder_name) != 0 and not os.path.exists(folder_name):
        os.makedirs(folder_name)

    # existing file is opened with r+ mode to keep the other datasets
    mode = "r+" if os.path.exists(hdf5_name) else "w"

    # the context manager guarantees that the file is closed even when
    # dataset creation fails or the process exits below
    with h5py.File(hdf5_name, mode) as hdf5_file:
        # check dataset existence
        if hdf5_path in hdf5_file:
            if not is_overwrite:
                logging.error("dataset in hdf5 file already exists.")
                logging.error("if you want to overwrite, please set is_overwrite = True.")
                sys.exit(1)
            logging.warning("dataset in hdf5 file already exists.")
            logging.warning("recreate dataset in hdf5.")
            del hdf5_file[hdf5_path]

        # write data to hdf5
        hdf5_file.create_dataset(hdf5_path, data=write_data)
        hdf5_file.flush()


def find_files(directory, pattern="*.wav", use_dir_name=True):
    """FIND FILES RECURSIVELY.

    Args:
        directory (str): Root directory to find.
        pattern (str): Query to find.
        use_dir_name (bool): If False, directory name is not included.

    Returns:
        list: List of found filenames.

    """
    files = []
    for root, _, filenames in os.walk(directory, followlinks=True):
        for filename in fnmatch.filter(filenames, pattern):
            files.append(os.path.join(root, filename))
    if not use_dir_name:
        # relpath removes only the leading root directory; a plain string
        # replace also removed later occurrences of the same text and did
        # nothing when directory was given with a trailing slash
        files = [os.path.relpath(file_, directory) for file_ in files]
    return files


def read_txt(file_list):
    """READ TXT FILE.

    Args:
        file_list (str): TXT file filename.

    Returns:
        list: List of read lines.

    """
    with open(file_list, "r") as f:
        return [line.replace("\n", "") for line in f]


class BackgroundGenerator(threading.Thread):
    """BACKGROUND GENERATOR.

    Items of the wrapped generator are produced in a daemon thread and
    handed over through a bounded queue. An exception raised by the wrapped
    generator is re-raised in the consumer at the point where the stream
    ends.

    Args:
        generator (object): Generator instance.
        max_prefetch (int): Max number of prefetch.

    References:
        https://stackoverflow.com/questions/7323664/python-generator-pre-fetch

    """

    def __init__(self, generator, max_prefetch=1):
        threading.Thread.__init__(self)
        if sys.version_info.major == 2:
            from Queue import Queue
        else:
            from queue import Queue
        self.queue = Queue(max_prefetch)
        self.generator = generator
        # unique end marker so that None is a legal item of the generator
        self._sentinel = object()
        self._error = None
        self._finished = False
        self.daemon = True
        self.start()

    def run(self):
        """STORE ITEMS IN QUEUE."""
        try:
            for item in self.generator:
                self.queue.put(item)
        except Exception as e:
            # keep the error for the consumer instead of dying silently
            self._error = e
        finally:
            # always signal the end, otherwise the consumer blocks forever
            self.queue.put(self._sentinel)

    def next(self):
        """GET ITEM IN THE QUEUE."""
        if self._finished:
            # the queue stays empty after the end marker; do not block
            raise StopIteration
        next_item = self.queue.get()
        if next_item is self._sentinel:
            self._finished = True
            if self._error is not None:
                error, self._error = self._error, None
                raise error
            raise StopIteration
        return next_item

    def __next__(self):
        return self.next()

    def __iter__(self):
        return self


class background(object):
    """BACKGROUND GENERATOR DECORATOR.

    Args:
        max_prefetch (int): Max number of prefetch passed to
            BackgroundGenerator.

    """

    def __init__(self, max_prefetch=1):
        self.max_prefetch = max_prefetch

    def __call__(self, gen):
        def bg_generator(*args, **kwargs):
            # forward the configured queue size (it used to be dropped)
            return BackgroundGenerator(gen(*args, **kwargs), self.max_prefetch)
        return bg_generator


def extend_time(feats, upsampling_factor):
    """EXTEND TIME RESOLUTION.

    Args:
        feats (ndarray): Feature vector with the shape (T, D).
        upsampling_factor (int): Upsampling_factor.

    Returns:
        (ndarray): Extended feats with the shape (upsampling_factor * T, D).
            The result is always float64.

    """
    # each frame is repeated upsampling_factor times along the time axis
    feats = np.asarray(feats).astype(np.float64)
    return np.repeat(feats, upsampling_factor, axis=0)
def make_train_generator_args(**kwargs):
    """Return the keyword set of ``train_generator`` with overrides applied."""
    options = {
        "wav_list": None,
        "feat_list": None,
        "receptive_field": 1000,
        "batch_length": 3000,
        "batch_size": 5,
        "feature_type": "world",
        "wav_transform": None,
        "feat_transform": None,
        "shuffle": False,
        "upsampling_factor": 80,
        "use_upsampling_layer": True,
        "use_speaker_code": False,
    }
    options.update(kwargs)
    return argparse.Namespace(**options)


def make_decode_generator_args(**kwargs):
    """Return the keyword set of ``decode_generator`` with overrides applied."""
    options = {
        "feat_list": None,
        "batch_size": 5,
        "feature_type": "world",
        "wav_transform": None,
        "feat_transform": None,
        "upsampling_factor": 80,
        "use_upsampling_layer": True,
        "use_speaker_code": False,
    }
    options.update(kwargs)
    return argparse.Namespace(**options)


def _make_dummy_dataset():
    """Create five dummy wav files and extract both feature types from them.

    Returns:
        tuple: List of wav filenames and list of extracted hdf5 filenames.

    """
    # make dummy wavfiles
    wavdir = "data/wav"
    if not os.path.exists(wavdir):
        os.makedirs(wavdir)
    for i in range(5):
        make_dummy_wav(wavdir + "/%d.wav" % i)

    # make features (melspc first, then world, into the same hdf5 files)
    feat_args = make_feature_args()
    wav_list = find_files(wavdir, "*.wav")
    if not os.path.exists(feat_args.wavdir):
        os.makedirs(feat_args.wavdir)
    feat_args.feature_type = "melspc"
    melspectrogram_extract(wav_list, feat_args)
    feat_args.feature_type = "world"
    world_feature_extract(wav_list, feat_args)
    feat_list = find_files(feat_args.hdf5dir, "*.h5")

    return wav_list, feat_list


def test_train_generator():
    """Check batch shapes of train_generator for every batching mode."""
    wav_list, feat_list = _make_dummy_dataset()

    for ft in ["world", "melspc"]:
        # order: minibatch / utterance batch without upsampling layer,
        # then minibatch / utterance batch with upsampling layer
        for use_upsampling_layer in (False, True):
            for batch_length in (10000, None):
                generator_args = make_train_generator_args(
                    wav_list=wav_list,
                    feat_list=feat_list,
                    feature_type=ft,
                    use_upsampling_layer=use_upsampling_layer,
                    batch_length=batch_length,
                    batch_size=5
                )
                generator = train_generator(**vars(generator_args))
                (x, h), t = next(generator)

                # with the upsampling layer the features stay at frame rate
                scale = generator_args.upsampling_factor if use_upsampling_layer else 1

                assert x.size(0) == t.size(0) == h.size(0)
                if batch_length is None:
                    # utterance batch always holds a single utterance
                    assert h.size(0) == 1
                assert x.size(1) == t.size(1) == h.size(2) * scale


def test_decode_generator():
    """Check batch shapes of decode_generator for every batching mode."""
    _, feat_list = _make_dummy_dataset()

    for ft in ["world", "melspc"]:
        # order: non-batch without / with upsampling layer,
        # then minibatch without / with upsampling layer
        for batch_size in (1, 5):
            for use_upsampling_layer in (False, True):
                generator_args = make_decode_generator_args(
                    feat_list=feat_list,
                    feature_type=ft,
                    use_upsampling_layer=use_upsampling_layer,
                    batch_size=batch_size
                )
                generator = decode_generator(**vars(generator_args))
                scale = generator_args.upsampling_factor if use_upsampling_layer else 1

                if batch_size == 1:
                    _, (x, h, n_samples) = next(generator)
                    assert x.size(0) == h.size(0) == 1
                    assert h.size(2) * scale == n_samples + 1
                else:
                    _, (batch_x, batch_h, n_samples_list) = next(generator)
                    assert batch_x.size(0) == batch_h.size(0) == len(n_samples_list)
                    assert batch_h.size(2) * scale == max(n_samples_list) + 1
# set log level
logging.basicConfig(level=logging.DEBUG,
                    format='%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s',
                    datefmt='%m/%d/%Y %I:%M:%S')


def sine_generator(seq_size=100, mu=256):
    """Yield the first ``seq_size`` mu-law encoded samples of a sine mixture.

    Args:
        seq_size (int): Number of samples per yielded tensor.
        mu (int): Second argument passed to ``encode_mu_law``.

    Yields:
        Tensor: Encoded samples (the same content on every iteration).

    """
    t = np.linspace(0, 1, 16000)
    data = np.sin(2 * np.pi * 220 * t) + np.sin(2 * np.pi * 224 * t)
    data = data / 2
    # the signal never changes, so it is encoded only once
    ys = encode_mu_law(data, mu)
    while True:
        # copy so that yielded tensors do not share memory with each other
        yield torch.from_numpy(ys[:seq_size].copy())


def _build_net(*wavenet_args):
    """Create an initialized WaveNet in evaluation mode."""
    net = WaveNet(*wavenet_args)
    net.apply(initialize)
    net.eval()
    return net


def _assert_generation_consistency(net, x, h, length):
    """Assert that generate, fast_generate and batch_fast_generate agree."""
    # sample-by-sample generation
    gen1_list = []
    gen2_list = []
    for x_, h_ in zip(x, h):
        batch_x = torch.from_numpy(np.expand_dims(x_, 0)).long()
        batch_h = torch.from_numpy(np.expand_dims(h_, 0)).float()
        gen1 = net.generate(batch_x, batch_h, length, 1, "argmax")
        gen2 = net.fast_generate(batch_x, batch_h, length, 1, "argmax")
        np.testing.assert_array_equal(gen1, gen2)
        gen1_list += [gen1]
        gen2_list += [gen2]
    gen1 = np.stack(gen1_list)
    gen2 = np.stack(gen2_list)
    np.testing.assert_array_equal(gen1, gen2)

    # batch generation
    batch_x = torch.from_numpy(x).long()
    batch_h = torch.from_numpy(h).float()
    gen3_list = net.batch_fast_generate(batch_x, batch_h, [length] * len(x), 1, "argmax")
    gen3 = np.stack(gen3_list)
    np.testing.assert_array_equal(gen3, gen2)


def test_forward():
    """Check the output shape of the forward pass for kernel size 2 and 3."""
    # get batch
    generator = sine_generator(100)
    batch = next(generator)
    batch_input = batch.view(1, -1)

    # models without upsampling: one auxiliary frame per sample
    batch_aux = torch.rand(1, 28, batch_input.size(1)).float()
    for kernel_size in (2, 3):
        # the kernel size = 3 case used to build a kernel size = 2 model
        net = _build_net(256, 28, 32, 128, 10, 1, kernel_size)
        y = net(batch_input, batch_aux)[0]
        assert y.size(0) == batch_input.size(1)
        assert y.size(1) == 256

    # models with upsampling factor 10: auxiliary features are 10x shorter
    batch_aux = torch.rand(1, 28, batch_input.size(1) // 10).float()
    for kernel_size in (2, 3):
        net = _build_net(256, 28, 32, 128, 10, 1, kernel_size, 10)
        y = net(batch_input, batch_aux)[0]
        assert y.size(0) == batch_input.size(1)
        assert y.size(1) == 256


def test_generate():
    """Smoke test of the three generation methods in sampling mode."""
    batch = 2
    x = np.random.randint(0, 256, size=(batch, 1))
    h = np.random.randn(batch, 28, 10)
    length = h.shape[-1] - 1
    with torch.no_grad():
        net = _build_net(256, 28, 4, 4, 10, 3, 2)
        for x_, h_ in zip(x, h):
            batch_x = torch.from_numpy(np.expand_dims(x_, 0)).long()
            batch_h = torch.from_numpy(np.expand_dims(h_, 0)).float()
            net.generate(batch_x, batch_h, length, 1, "sampling")
            net.fast_generate(batch_x, batch_h, length, 1, "sampling")
        batch_x = torch.from_numpy(x).long()
        batch_h = torch.from_numpy(h).float()
        net.batch_fast_generate(batch_x, batch_h, [length] * batch, 1, "sampling")


def test_assert_fast_generation():
    """Check that all generation methods give identical argmax outputs."""
    batch = 2

    with torch.no_grad():
        # ------------------------------------------------
        # models without upsampling, kernel size = 2 and 3
        # ------------------------------------------------
        x = np.random.randint(0, 256, size=(batch, 1))
        h = np.random.randn(batch, 28, 32)
        length = h.shape[-1] - 1
        for kernel_size in (2, 3):
            net = _build_net(256, 28, 4, 4, 10, 3, kernel_size)
            _assert_generation_consistency(net, x, h, length)

        # ---------------------------------------------
        # models with upsampling, kernel size = 2 and 3
        # ---------------------------------------------
        upsampling_factor = 10
        x = np.random.randint(0, 256, size=(batch, 1))
        h = np.random.randn(batch, 28, 3)
        length = h.shape[-1] * upsampling_factor - 1
        for kernel_size in (2, 3):
            # the kernel size = 3 case used to build a kernel size = 2 model
            net = _build_net(256, 28, 4, 4, 10, 3, kernel_size, upsampling_factor)
            _assert_generation_consistency(net, x, h, length)


def test_assert_different_length_batch_generation():
    """Check batch generation against single generation for mixed lengths."""
    # prepare batch
    batch = 4
    max_length = 32
    x = np.random.randint(0, 256, size=(batch, 1))
    h = np.random.randn(batch, 28, max_length)
    length_list = sorted(list(np.random.randint(max_length // 2, max_length - 1, batch)))

    with torch.no_grad():
        net = _build_net(256, 28, 4, 4, 10, 3, 2)

        # sample-by-sample generation
        gen1_list = []
        for x_, h_, n_samples in zip(x, h, length_list):
            batch_x = torch.from_numpy(np.expand_dims(x_, 0)).long()
            batch_h = torch.from_numpy(np.expand_dims(h_, 0)).float()
            gen1_list += [net.fast_generate(batch_x, batch_h, n_samples, 1, "argmax")]

        # batch generation
        batch_x = torch.from_numpy(x).long()
        batch_h = torch.from_numpy(h).float()
        gen2_list = net.batch_fast_generate(batch_x, batch_h, length_list, 1, "argmax")

        # assertion
        for gen1, gen2 in zip(gen1_list, gen2_list):
            np.testing.assert_array_equal(gen1, gen2)
10 | # If any of the jobs fails, this script will fail. 11 | 12 | # A typical example is: 13 | # run.pl some.log my-prog "--opt=foo bar" foo \| other-prog baz 14 | # and run.pl will run something like: 15 | # ( my-prog '--opt=foo bar' foo | other-prog baz ) >& some.log 16 | # 17 | # Basically it takes the command-line arguments, quotes them 18 | # as necessary to preserve spaces, and evaluates them with bash. 19 | # In addition it puts the command line at the top of the log, and 20 | # the start and end times of the command at the beginning and end. 21 | # The reason why this is useful is so that we can create a different 22 | # version of this program that uses a queueing system instead. 23 | 24 | # use Data::Dumper; 25 | 26 | @ARGV < 2 && die "usage: run.pl log-file command-line arguments..."; 27 | 28 | 29 | $max_jobs_run = -1; 30 | $jobstart = 1; 31 | $jobend = 1; 32 | $ignored_opts = ""; # These will be ignored. 33 | 34 | # First parse an option like JOB=1:4, and any 35 | # options that would normally be given to 36 | # queue.pl, which we will just discard. 37 | 38 | for (my $x = 1; $x <= 2; $x++) { # This for-loop is to 39 | # allow the JOB=1:n option to be interleaved with the 40 | # options to qsub. 41 | while (@ARGV >= 2 && $ARGV[0] =~ m:^-:) { 42 | # parse any options that would normally go to qsub, but which will be ignored here. 43 | my $switch = shift @ARGV; 44 | if ($switch eq "-V") { 45 | $ignored_opts .= "-V "; 46 | } elsif ($switch eq "--max-jobs-run" || $switch eq "-tc") { 47 | # we do support the option --max-jobs-run n, and its GridEngine form -tc n. 48 | $max_jobs_run = shift @ARGV; 49 | if (! 
($max_jobs_run > 0)) { 50 | die "run.pl: invalid option --max-jobs-run $max_jobs_run"; 51 | } 52 | } else { 53 | my $argument = shift @ARGV; 54 | if ($argument =~ m/^--/) { 55 | print STDERR "run.pl: WARNING: suspicious argument '$argument' to $switch; starts with '-'\n"; 56 | } 57 | if ($switch eq "-sync" && $argument =~ m/^[yY]/) { 58 | $ignored_opts .= "-sync "; # Note: in the 59 | # corresponding code in queue.pl it says instead, just "$sync = 1;". 60 | } elsif ($switch eq "-pe") { # e.g. -pe smp 5 61 | my $argument2 = shift @ARGV; 62 | $ignored_opts .= "$switch $argument $argument2 "; 63 | } elsif ($switch eq "--gpu") { 64 | $using_gpu = $argument; 65 | } else { 66 | # Ignore option. 67 | $ignored_opts .= "$switch $argument "; 68 | } 69 | } 70 | } 71 | if ($ARGV[0] =~ m/^([\w_][\w\d_]*)+=(\d+):(\d+)$/) { # e.g. JOB=1:20 72 | $jobname = $1; 73 | $jobstart = $2; 74 | $jobend = $3; 75 | shift; 76 | if ($jobstart > $jobend) { 77 | die "run.pl: invalid job range $ARGV[0]"; 78 | } 79 | if ($jobstart <= 0) { 80 | die "run.pl: invalid job range $ARGV[0], start must be strictly positive (this is required for GridEngine compatibility)."; 81 | } 82 | } elsif ($ARGV[0] =~ m/^([\w_][\w\d_]*)+=(\d+)$/) { # e.g. JOB=1. 83 | $jobname = $1; 84 | $jobstart = $2; 85 | $jobend = $2; 86 | shift; 87 | } elsif ($ARGV[0] =~ m/.+\=.*\:.*$/) { 88 | print STDERR "run.pl: Warning: suspicious first argument to run.pl: $ARGV[0]\n"; 89 | } 90 | } 91 | 92 | # Users found this message confusing so we are removing it. 93 | # if ($ignored_opts ne "") { 94 | # print STDERR "run.pl: Warning: ignoring options \"$ignored_opts\"\n"; 95 | # } 96 | 97 | if ($max_jobs_run == -1) { # If --max-jobs-run option not set, 98 | # then work out the number of processors if possible, 99 | # and set it based on that. 100 | $max_jobs_run = 0; 101 | if ($using_gpu) { 102 | if (open(P, "nvidia-smi -L |")) { 103 | $max_jobs_run++ while (

); 104 | close(P); 105 | } 106 | if ($max_jobs_run == 0) { 107 | $max_jobs_run = 1; 108 | print STDERR "run.pl: Warning: failed to detect number of GPUs from nvidia-smi, using ${max_jobs_run}\n"; 109 | } 110 | } elsif (open(P, ") { if (m/^processor/) { $max_jobs_run++; } } 112 | if ($max_jobs_run == 0) { 113 | print STDERR "run.pl: Warning: failed to detect any processors from /proc/cpuinfo\n"; 114 | $max_jobs_run = 10; # reasonable default. 115 | } 116 | close(P); 117 | } elsif (open(P, "sysctl -a |")) { # BSD/Darwin 118 | while (

) { 119 | if (m/hw\.ncpu\s*[:=]\s*(\d+)/) { # hw.ncpu = 4, or hw.ncpu: 4 120 | $max_jobs_run = $1; 121 | last; 122 | } 123 | } 124 | close(P); 125 | if ($max_jobs_run == 0) { 126 | print STDERR "run.pl: Warning: failed to detect any processors from sysctl -a\n"; 127 | $max_jobs_run = 10; # reasonable default. 128 | } 129 | } else { 130 | # allow at most 32 jobs at once, on non-UNIX systems; change this code 131 | # if you need to change this default. 132 | $max_jobs_run = 32; 133 | } 134 | # The just-computed value of $max_jobs_run is just the number of processors 135 | # (or our best guess); and if it happens that the number of jobs we need to 136 | # run is just slightly above $max_jobs_run, it will make sense to increase 137 | # $max_jobs_run to equal the number of jobs, so we don't have a small number 138 | # of leftover jobs. 139 | $num_jobs = $jobend - $jobstart + 1; 140 | if (!$using_gpu && 141 | $num_jobs > $max_jobs_run && $num_jobs < 1.4 * $max_jobs_run) { 142 | $max_jobs_run = $num_jobs; 143 | } 144 | } 145 | 146 | $logfile = shift @ARGV; 147 | 148 | if (defined $jobname && $logfile !~ m/$jobname/ && 149 | $jobend > $jobstart) { 150 | print STDERR "run.pl: you are trying to run a parallel job but " 151 | . "you are putting the output into just one log file ($logfile)\n"; 152 | exit(1); 153 | } 154 | 155 | $cmd = ""; 156 | 157 | foreach $x (@ARGV) { 158 | if ($x =~ m/^\S+$/) { $cmd .= $x . 
" "; } 159 | elsif ($x =~ m:\":) { $cmd .= "'$x' "; } 160 | else { $cmd .= "\"$x\" "; } 161 | } 162 | 163 | #$Data::Dumper::Indent=0; 164 | $ret = 0; 165 | $numfail = 0; 166 | %active_pids=(); 167 | 168 | use POSIX ":sys_wait_h"; 169 | for ($jobid = $jobstart; $jobid <= $jobend; $jobid++) { 170 | if (scalar(keys %active_pids) >= $max_jobs_run) { 171 | 172 | # Lets wait for a change in any child's status 173 | # Then we have to work out which child finished 174 | $r = waitpid(-1, 0); 175 | $code = $?; 176 | if ($r < 0 ) { die "run.pl: Error waiting for child process"; } # should never happen. 177 | if ( defined $active_pids{$r} ) { 178 | $jid=$active_pids{$r}; 179 | $fail[$jid]=$code; 180 | if ($code !=0) { $numfail++;} 181 | delete $active_pids{$r}; 182 | # print STDERR "Finished: $r/$jid " . Dumper(\%active_pids) . "\n"; 183 | } else { 184 | die "run.pl: Cannot find the PID of the chold process that just finished."; 185 | } 186 | 187 | # In theory we could do a non-blocking waitpid over all jobs running just 188 | # to find out if only one or more jobs finished during the previous waitpid() 189 | # However, we just omit this and will reap the next one in the next pass 190 | # through the for(;;) cycle 191 | } 192 | $childpid = fork(); 193 | if (!defined $childpid) { die "run.pl: Error forking in run.pl (writing to $logfile)"; } 194 | if ($childpid == 0) { # We're in the child... this branch 195 | # executes the job and returns (possibly with an error status). 196 | if (defined $jobname) { 197 | $cmd =~ s/$jobname/$jobid/g; 198 | $logfile =~ s/$jobname/$jobid/g; 199 | } 200 | system("mkdir -p `dirname $logfile` 2>/dev/null"); 201 | open(F, ">$logfile") || die "run.pl: Error opening log file $logfile"; 202 | print F "# " . $cmd . "\n"; 203 | print F "# Started at " . `date`; 204 | $starttime = `date +'%s'`; 205 | print F "#\n"; 206 | close(F); 207 | 208 | # Pipe into bash.. make sure we're not using any other shell. 
209 | open(B, "|bash") || die "run.pl: Error opening shell command"; 210 | print B "( " . $cmd . ") 2>>$logfile >> $logfile"; 211 | close(B); # If there was an error, exit status is in $? 212 | $ret = $?; 213 | 214 | $lowbits = $ret & 127; 215 | $highbits = $ret >> 8; 216 | if ($lowbits != 0) { $return_str = "code $highbits; signal $lowbits" } 217 | else { $return_str = "code $highbits"; } 218 | 219 | $endtime = `date +'%s'`; 220 | open(F, ">>$logfile") || die "run.pl: Error opening log file $logfile (again)"; 221 | $enddate = `date`; 222 | chop $enddate; 223 | print F "# Accounting: time=" . ($endtime - $starttime) . " threads=1\n"; 224 | print F "# Ended ($return_str) at " . $enddate . ", elapsed time " . ($endtime-$starttime) . " seconds\n"; 225 | close(F); 226 | exit($ret == 0 ? 0 : 1); 227 | } else { 228 | $pid[$jobid] = $childpid; 229 | $active_pids{$childpid} = $jobid; 230 | # print STDERR "Queued: " . Dumper(\%active_pids) . "\n"; 231 | } 232 | } 233 | 234 | # Now we have submitted all the jobs, lets wait until all the jobs finish 235 | foreach $child (keys %active_pids) { 236 | $jobid=$active_pids{$child}; 237 | $r = waitpid($pid[$jobid], 0); 238 | $code = $?; 239 | if ($r == -1) { die "run.pl: Error waiting for child process"; } # should never happen. 240 | if ($r != 0) { $fail[$jobid]=$code; $numfail++ if $code!=0; } # Completed successfully 241 | } 242 | 243 | # Some sanity checks: 244 | # The $fail array should not contain undefined codes 245 | # The number of non-zeros in that array should be equal to $numfail 246 | # We cannot do foreach() here, as the JOB ids do not necessarily start by zero 247 | $failed_jids=0; 248 | for ($jobid = $jobstart; $jobid <= $jobend; $jobid++) { 249 | $job_return = $fail[$jobid]; 250 | if (not defined $job_return ) { 251 | # print Dumper(\@fail); 252 | 253 | die "run.pl: Sanity check failed: we have indication that some jobs are running " . 
254 | "even after we waited for all jobs to finish" ; 255 | } 256 | if ($job_return != 0 ){ $failed_jids++;} 257 | } 258 | if ($failed_jids != $numfail) { 259 | die "run.pl: Sanity check failed: cannot find out how many jobs failed ($failed_jids x $numfail)." 260 | } 261 | if ($numfail > 0) { $ret = 1; } 262 | 263 | if ($ret != 0) { 264 | $njobs = $jobend - $jobstart + 1; 265 | if ($njobs == 1) { 266 | if (defined $jobname) { 267 | $logfile =~ s/$jobname/$jobstart/; # only one numbered job, so replace name with 268 | # that job. 269 | } 270 | print STDERR "run.pl: job failed, log is in $logfile\n"; 271 | if ($logfile =~ m/JOB/) { 272 | print STDERR "run.pl: probably you forgot to put JOB=1:\$nj in your script."; 273 | } 274 | } 275 | else { 276 | $logfile =~ s/$jobname/*/g; 277 | print STDERR "run.pl: $numfail / $njobs failed, log is in $logfile\n"; 278 | } 279 | } 280 | 281 | 282 | exit ($ret); 283 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. 
For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 
48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. 
Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 
123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. 
In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. 
We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright 2017 Tomoki Hayashi (Nagoya University) 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 203 | -------------------------------------------------------------------------------- /egs/ljspeech/sd/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | ############################################################ 3 | # SCRIPT TO BUILD SD WAVENET VOCODER # 4 | ############################################################ 5 | 6 | # Copyright 2017 Tomoki Hayashi (Nagoya University) 7 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 8 | 9 | . ./path.sh || exit 1; 10 | . 
./cmd.sh || exit 1; 11 | 12 | # USER SETTINGS {{{ 13 | ####################################### 14 | # STAGE SETTING # 15 | ####################################### 16 | stage=0123456 17 | # 0: data preparation step 18 | # 1: feature extraction step 19 | # 2: statistics calculation step 20 | # 3: noise weighting step 21 | # 4: training step 22 | # 5: decoding step 23 | # 6: noise shaping step 24 | 25 | ####################################### 26 | # FEATURE SETTING # 27 | ####################################### 28 | feature_type=world # world or melspc (in this recipe fixed to "world") 29 | minf0=40 # minimum f0 30 | maxf0=400 # maximum f0 31 | shiftms=5 # shift length in msec 32 | fftl=1024 # fft length 33 | highpass_cutoff=70 # highpass filter cutoff frequency (if 0, will not apply) 34 | fs=22050 # sampling rate 35 | mcep_dim=34 # dimension of mel-cepstrum 36 | mcep_alpha=0.455 # alpha value of mel-cepstrum 37 | use_noise_shaping=true # whether to use noise shaping 38 | mag=0.5 # strength of noise shaping (0.0 < mag <= 1.0) 39 | n_jobs=10 # number of parallel jobs 40 | 41 | ####################################### 42 | # TRAINING SETTING # 43 | ####################################### 44 | n_gpus=1 # number of gpus 45 | n_quantize=256 # number of quantization of waveform 46 | n_aux=39 # number of auxiliary features 47 | n_resch=512 # number of residual channels 48 | n_skipch=256 # number of skip channels 49 | dilation_depth=10 # dilation depth (e.g. 
if set 10, max dilation = 2^(10-1)) 50 | dilation_repeat=3 # number of dilation repeats 51 | kernel_size=3 # kernel size of dilated convolution 52 | lr=1e-4 # learning rate 53 | weight_decay=0.0 # weight decay coef 54 | iters=200000 # number of iterations 55 | batch_length=15000 # batch length 56 | batch_size=1 # batch size 57 | checkpoint_interval=10000 # save model per this number 58 | use_upsampling=true # whether to use upsampling layer 59 | resume="" # checkpoint path to resume (Optional) 60 | 61 | ####################################### 62 | # DECODING SETTING # 63 | ####################################### 64 | outdir="" # directory to save decoded wav dir (Optional) 65 | checkpoint="" # checkpoint path to be used for decoding (Optional) 66 | config="" # model configuration path (Optional) 67 | stats="" # statistics path (Optional) 68 | feats="" # list or directory of feature files (Optional) 69 | decode_batch_size=16 # batch size in decoding 70 | 71 | ####################################### 72 | # OTHER SETTING # 73 | ####################################### 74 | LJSPEECH_DB_ROOT=downloads # directory including DB (if DB not exists, it will be downloaded) 75 | tag="" # tag for network directory naming (Optional) 76 | 77 | # parse options 78 | . parse_options.sh || exit 1; 79 | 80 | # check feature type 81 | if [ ${feature_type} != "world" ]; then 82 | echo "This recipe does not support feature_type=\"melspc\"." 2>&1 83 | echo "Please try the egs/ljspeech/sd-melspc." 2>&1 84 | exit 1; 85 | fi 86 | 87 | # set directory names 88 | train=tr 89 | eval=ev 90 | 91 | # stop when error occurred 92 | set -euo pipefail 93 | # }}} 94 | 95 | 96 | # STAGE 0 {{{ 97 | if echo ${stage} | grep -q 0; then 98 | echo "###########################################################" 99 | echo "# DATA PREPARATION STEP #" 100 | echo "###########################################################" 101 | if [ ! 
-e ${LJSPEECH_DB_ROOT}/.done ];then 102 | mkdir -p ${LJSPEECH_DB_ROOT} 103 | cd ${LJSPEECH_DB_ROOT} 104 | wget http://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2 105 | tar -vxf ./*.tar.bz2 106 | rm ./*.tar.bz2 107 | cd ../ 108 | touch ${LJSPEECH_DB_ROOT}/.done 109 | echo "database is successfully downloaded." 110 | fi 111 | [ ! -e data/local ] && mkdir -p data/local 112 | [ ! -e data/${train} ] && mkdir -p data/${train} 113 | [ ! -e data/${eval} ] && mkdir -p data/${eval} 114 | find ${LJSPEECH_DB_ROOT}/LJSpeech-1.1/wavs -name "*.wav" \ 115 | | sort > data/local/wav.scp 116 | grep -v LJ050 data/local/wav.scp > data/${train}/wav.scp 117 | grep LJ050 data/local/wav.scp > data/${eval}/wav.scp 118 | echo "making wav list for training is successfully done. (#training = $(wc -l < data/${train}/wav.scp))" 119 | echo "making wav list for evaluation is successfully done. (#evaluation = $(wc -l < data/${eval}/wav.scp))" 120 | fi 121 | # }}} 122 | 123 | 124 | # STAGE 1 {{{ 125 | if echo ${stage} | grep -q 1; then 126 | echo "###########################################################" 127 | echo "# FEATURE EXTRACTION STEP #" 128 | echo "###########################################################" 129 | for set in ${train} ${eval};do 130 | # training data feature extraction 131 | ${train_cmd} --num-threads ${n_jobs} exp/feature_extract/feature_extract_${set}.log \ 132 | feature_extract.py \ 133 | --waveforms data/${set}/wav.scp \ 134 | --wavdir wav_hpf/${set} \ 135 | --hdf5dir hdf5/${set} \ 136 | --feature_type ${feature_type} \ 137 | --fs ${fs} \ 138 | --shiftms ${shiftms} \ 139 | --minf0 ${minf0} \ 140 | --maxf0 ${maxf0} \ 141 | --mcep_dim ${mcep_dim} \ 142 | --mcep_alpha ${mcep_alpha} \ 143 | --highpass_cutoff ${highpass_cutoff} \ 144 | --fftl ${fftl} \ 145 | --n_jobs ${n_jobs} 146 | 147 | # check the number of feature files 148 | n_wavs=$(wc -l data/${set}/wav.scp) 149 | n_feats=$(find hdf5/${set} -name "*.h5" | wc -l) 150 | echo "${n_feats}/${n_wavs} files are 
successfully processed." 151 | 152 | # make scp files 153 | if [ ${highpass_cutoff} -eq 0 ];then 154 | cp data/${set}/wav.scp data/${set}/wav_hpf.scp 155 | else 156 | find wav_hpf/${set} -name "*.wav" | sort > data/${set}/wav_hpf.scp 157 | fi 158 | find hdf5/${set} -name "*.h5" | sort > data/${set}/feats.scp 159 | done 160 | fi 161 | # }}} 162 | 163 | 164 | # STAGE 2 {{{ 165 | if echo ${stage} | grep -q 2; then 166 | echo "###########################################################" 167 | echo "# CALCULATE STATISTICS STEP #" 168 | echo "###########################################################" 169 | ${train_cmd} exp/calculate_statistics/calc_stats_${train}.log \ 170 | calc_stats.py \ 171 | --feats data/${train}/feats.scp \ 172 | --stats data/${train}/stats.h5 \ 173 | --feature_type ${feature_type} 174 | echo "statistics are successfully calculated." 175 | fi 176 | # }}} 177 | 178 | 179 | # STAGE 3 {{{ 180 | if echo ${stage} | grep -q 3 && ${use_noise_shaping}; then 181 | echo "###########################################################" 182 | echo "# NOISE WEIGHTING STEP #" 183 | echo "###########################################################" 184 | ${train_cmd} --num-threads ${n_jobs} exp/noise_shaping/noise_shaping_apply_${train}.log \ 185 | noise_shaping.py \ 186 | --waveforms data/${train}/wav_hpf.scp \ 187 | --stats data/${train}/stats.h5 \ 188 | --outdir wav_nwf/${train} \ 189 | --feature_type ${feature_type} \ 190 | --fs ${fs} \ 191 | --shiftms ${shiftms} \ 192 | --mcep_dim_start 2 \ 193 | --mcep_dim_end $(( 2 + mcep_dim + 1 )) \ 194 | --mcep_alpha ${mcep_alpha} \ 195 | --mag ${mag} \ 196 | --inv true \ 197 | --n_jobs ${n_jobs} 198 | 199 | # check the number of feature files 200 | n_wavs=$(wc -l data/${train}/wav_hpf.scp) 201 | n_ns=$(find wav_nwf/${train} -name "*.wav" | wc -l) 202 | echo "${n_ns}/${n_wavs} files are successfully processed." 
203 | 204 | # make scp files 205 | find wav_nwf/${train} -name "*.wav" | sort > data/${train}/wav_nwf.scp 206 | fi # }}} 207 | 208 | 209 | # STAGE 4 {{{ 210 | # set variables 211 | if [ ! -n "${tag}" ];then 212 | expdir=exp/tr_ljspeech_22k_sd_${feature_type}_nq${n_quantize}_na${n_aux}_nrc${n_resch}_nsc${n_skipch}_ks${kernel_size}_dp${dilation_depth}_dr${dilation_repeat}_lr${lr}_wd${weight_decay}_bl${batch_length}_bs${batch_size} 213 | if ${use_noise_shaping};then 214 | expdir=${expdir}_ns 215 | fi 216 | if ${use_upsampling};then 217 | expdir=${expdir}_up 218 | fi 219 | else 220 | expdir=exp/tr_ljspeech_22k_${tag} 221 | fi 222 | if echo ${stage} | grep -q 4; then 223 | echo "###########################################################" 224 | echo "# WAVENET TRAINING STEP #" 225 | echo "###########################################################" 226 | if ${use_noise_shaping};then 227 | waveforms=data/${train}/wav_nwf.scp 228 | else 229 | waveforms=data/${train}/wav_hpf.scp 230 | fi 231 | upsampling_factor=$(echo "${shiftms} * ${fs} / 1000" | bc) 232 | [ ! -e ${expdir}/log ] && mkdir -p ${expdir}/log 233 | [ ! 
-e ${expdir}/stats.h5 ] && cp -v data/${train}/stats.h5 ${expdir} 234 | ${cuda_cmd} --gpu ${n_gpus} "${expdir}/log/${train}.log" \ 235 | train.py \ 236 | --n_gpus ${n_gpus} \ 237 | --waveforms ${waveforms} \ 238 | --feats data/${train}/feats.scp \ 239 | --stats data/${train}/stats.h5 \ 240 | --expdir "${expdir}" \ 241 | --feature_type ${feature_type} \ 242 | --n_quantize ${n_quantize} \ 243 | --n_aux ${n_aux} \ 244 | --n_resch ${n_resch} \ 245 | --n_skipch ${n_skipch} \ 246 | --dilation_depth ${dilation_depth} \ 247 | --dilation_repeat ${dilation_repeat} \ 248 | --kernel_size ${kernel_size} \ 249 | --lr ${lr} \ 250 | --weight_decay ${weight_decay} \ 251 | --iters ${iters} \ 252 | --batch_length ${batch_length} \ 253 | --batch_size ${batch_size} \ 254 | --checkpoint_interval ${checkpoint_interval} \ 255 | --upsampling_factor "${upsampling_factor}" \ 256 | --use_upsampling_layer ${use_upsampling} \ 257 | --resume "${resume}" 258 | fi 259 | # }}} 260 | 261 | 262 | # STAGE 5 {{{ 263 | [ ! -n "${outdir}" ] && outdir=${expdir}/wav 264 | [ ! -n "${checkpoint}" ] && checkpoint=${expdir}/checkpoint-final.pkl 265 | [ ! -n "${config}" ] && config=$(dirname ${checkpoint})/model.conf 266 | [ ! -n "${stats}" ] && stats=$(dirname ${checkpoint})/stats.h5 267 | [ ! 
-n "${feats}" ] && feats=data/${eval}/feats.scp 268 | if echo ${stage} | grep -q 5; then 269 | echo "###########################################################" 270 | echo "# WAVENET DECODING STEP #" 271 | echo "###########################################################" 272 | ${cuda_cmd} --gpu ${n_gpus} "${outdir}"/log/decode.log \ 273 | decode.py \ 274 | --n_gpus ${n_gpus} \ 275 | --feats ${feats} \ 276 | --stats ${stats} \ 277 | --outdir "${outdir}" \ 278 | --checkpoint "${checkpoint}" \ 279 | --config "${config}" \ 280 | --fs ${fs} \ 281 | --batch_size ${decode_batch_size} 282 | fi 283 | # }}} 284 | 285 | 286 | # STAGE 6 {{{ 287 | if echo ${stage} | grep -q 6 && ${use_noise_shaping}; then 288 | echo "###########################################################" 289 | echo "# NOISE SHAPING STEP #" 290 | echo "###########################################################" 291 | find "${outdir}" -name "*.wav" | sort > ${outdir}/wav.scp 292 | ${train_cmd} --num-threads ${n_jobs} exp/noise_shaping/noise_shaping_restore_${eval}.log \ 293 | noise_shaping.py \ 294 | --waveforms ${outdir}/wav.scp \ 295 | --stats ${stats} \ 296 | --outdir "${outdir}_nsf" \ 297 | --feature_type ${feature_type} \ 298 | --fs ${fs} \ 299 | --shiftms ${shiftms} \ 300 | --mcep_dim_start 2 \ 301 | --mcep_dim_end $(( 2 + mcep_dim + 1 )) \ 302 | --mcep_alpha ${mcep_alpha} \ 303 | --mag ${mag} \ 304 | --n_jobs ${n_jobs} \ 305 | --inv false 306 | fi 307 | # }}} 308 | -------------------------------------------------------------------------------- /egs/arctic/sd-mini/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | ############################################################ 3 | # DEMO SCRIPT TO BUILD SD WAVENET VOCODER # 4 | ############################################################ 5 | 6 | # Copyright 2017 Tomoki Hayashi (Nagoya University) 7 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 8 | 9 | . 
./path.sh || exit 1; 10 | 11 | # USER SETTINGS {{{ 12 | ####################################### 13 | # STAGE SETTING # 14 | ####################################### 15 | stage=0123456 16 | # 0: data preparation step 17 | # 1: feature extraction step 18 | # 2: statistics calculation step 19 | # 3: noise weighting step 20 | # 4: training step 21 | # 5: decoding step 22 | # 6: noise shaping step 23 | 24 | ####################################### 25 | # FEATURE SETTING # 26 | ####################################### 27 | feature_type=world # world or melspc (in this recipe fixed to "world") 28 | spk=slt # target speaker in arctic 29 | minf0="" # minimum f0 (if not set, conf/*.f0 will be used) 30 | maxf0="" # maximum f0 (if not set, conf/*.f0 will be used) 31 | shiftms=5 # shift length in msec 32 | fftl=1024 # fft length 33 | highpass_cutoff=70 # highpass filter cutoff frequency (if 0, will not apply) 34 | fs=16000 # sampling rate 35 | mcep_dim=24 # dimension of mel-cepstrum 36 | mcep_alpha=0.410 # alpha value of mel-cepstrum 37 | use_noise_shaping=true # whether to use noise shaping 38 | mag=0.5 # strength of noise shaping (0.0 < mag <= 1.0) 39 | n_jobs=10 # number of parallel jobs 40 | 41 | ####################################### 42 | # TRAINING SETTING # 43 | ####################################### 44 | n_gpus=1 # number of gpus 45 | n_quantize=256 # number of quantization of waveform 46 | n_aux=28 # number of auxiliary features 47 | n_resch=32 # number of residual channels 48 | n_skipch=16 # number of skip channels 49 | dilation_depth=5 # dilation depth (e.g. 
if set 10, max dilation = 2^(10-1)) 50 | dilation_repeat=1 # number of dilation repeats 51 | kernel_size=2 # kernel size of dilated convolution 52 | lr=1e-4 # learning rate 53 | weight_decay=0.0 # weight decay coef 54 | iters=1000 # number of iterations 55 | batch_length=10000 # batch length 56 | batch_size=1 # batch size 57 | checkpoint_interval=100 # save model per this number 58 | use_upsampling=true # whether to use upsampling layer 59 | resume="" # checkpoint path to resume (Optional) 60 | 61 | ####################################### 62 | # DECODING SETTING # 63 | ####################################### 64 | outdir="" # directory to save decoded wav dir (Optional) 65 | checkpoint="" # checkpoint path to be used for decoding (Optional) 66 | config="" # model configuration path (Optional) 67 | stats="" # statistics path (Optional) 68 | feats="" # list or directory of feature files (Optional) 69 | decode_batch_size=4 # batch size in decoding 70 | 71 | ####################################### 72 | # OTHER SETTING # 73 | ####################################### 74 | download_dir=downloads # download directory to save corpus 75 | download_url="https://drive.google.com/open?id=1NIia89CL2qqqDzNNc718wycRmI_jkLxR" # download URL of Google Drive 76 | tag="" # tag for network directory naming (Optional) 77 | 78 | # This enables argparse-like parsing of the above variables e.g. ./run.sh --stage 0 79 | . parse_options.sh || exit 1; 80 | 81 | # check feature type 82 | if [ ${feature_type} != "world" ]; then 83 | echo "This recipe does not support feature_type=\"melspc\"." 2>&1 84 | echo "Please try the egs/*/*-melspc." 
2>&1 85 | exit 1; 86 | fi 87 | 88 | # set directory names 89 | train=tr_${spk} 90 | eval=ev_${spk} 91 | 92 | # stop when error occurred 93 | set -euo pipefail 94 | # }}} 95 | 96 | 97 | # STAGE 0 {{{ 98 | if echo ${stage} | grep -q 0; then 99 | echo "###########################################################" 100 | echo "# DATA PREPARATION STEP #" 101 | echo "###########################################################" 102 | # download dataset 103 | if [ ! -e ${download_dir}/.done ];then 104 | download_from_google_drive.sh "${download_url}" ${download_dir} tar.gz 105 | touch ${download_dir}/.done 106 | echo "database is successfully downloaded." 107 | fi 108 | 109 | # directory check 110 | [ ! -e data/local ] && mkdir -p data/local 111 | [ ! -e data/${train} ] && mkdir -p data/${train} 112 | [ ! -e data/${eval} ] && mkdir -p data/${eval} 113 | 114 | # make list of all of the utterances 115 | find "${download_dir}/cmu_us_${spk}_arctic_mini/wav" -name "*.wav" \ 116 | | sort > "data/local/wav.${spk}.scp" 117 | 118 | # use first 32 utterances as training data 119 | head -n 32 "data/local/wav.${spk}.scp" > data/${train}/wav.scp 120 | echo "making wav list for training is successfully done. (#training = $(wc -l < data/${train}/wav.scp))" 121 | 122 | # use next 4 utterances as evaluation data 123 | tail -n 4 "data/local/wav.${spk}.scp" > data/${eval}/wav.scp 124 | echo "making wav list for evaluation is successfully done. (#evaluation = $(wc -l < data/${eval}/wav.scp))" 125 | fi 126 | # }}} 127 | 128 | 129 | # STAGE 1 {{{ 130 | if echo ${stage} | grep -q 1; then 131 | echo "###########################################################" 132 | echo "# FEATURE EXTRACTION STEP #" 133 | echo "###########################################################" 134 | [ ! -n "${minf0}" ] && minf0=$(awk '{print $1}' conf/${spk}.f0) 135 | [ ! -n "${maxf0}" ] && maxf0=$(awk '{print $2}' conf/${spk}.f0) 136 | [ ! 
-e exp/feature_extract ] && mkdir -p exp/feature_extract 137 | for set in ${train} ${eval};do 138 | [ "${set}" = "${train}" ] && save_wav=true || save_wav=false 139 | feature_extract.py \ 140 | --waveforms data/${set}/wav.scp \ 141 | --wavdir wav_hpf/${set} \ 142 | --hdf5dir hdf5/${set} \ 143 | --feature_type ${feature_type} \ 144 | --fs ${fs} \ 145 | --shiftms ${shiftms} \ 146 | --minf0 "${minf0}" \ 147 | --maxf0 "${maxf0}" \ 148 | --mcep_dim ${mcep_dim} \ 149 | --mcep_alpha ${mcep_alpha} \ 150 | --highpass_cutoff ${highpass_cutoff} \ 151 | --fftl ${fftl} \ 152 | --save_wav ${save_wav} \ 153 | --n_jobs ${n_jobs} 2>&1 | tee exp/feature_extract/feature_extract_${set}.log 154 | 155 | # check the number of feature files 156 | n_wavs=$(wc -l data/${set}/wav.scp) 157 | n_feats=$(find hdf5/${set} -name "*.h5" | wc -l) 158 | echo "${n_feats}/${n_wavs} files are successfully processed." 159 | 160 | # make scp files 161 | if [ ${highpass_cutoff} -eq 0 ];then 162 | cp data/${set}/wav.scp data/${set}/wav_hpf.scp 163 | elif ${save_wav}; then 164 | find wav_hpf/${set} -name "*.wav" | sort > data/${set}/wav_hpf.scp 165 | fi 166 | find hdf5/${set} -name "*.h5" | sort > data/${set}/feats.scp 167 | done 168 | fi 169 | # }}} 170 | 171 | 172 | # STAGE 2 {{{ 173 | if echo ${stage} | grep -q 2; then 174 | echo "###########################################################" 175 | echo "# CALCULATE STATISTICS STEP #" 176 | echo "###########################################################" 177 | [ ! -e exp/calculate_statistics ] && mkdir -p exp/calculate_statistics 178 | calc_stats.py \ 179 | --feats data/${train}/feats.scp \ 180 | --stats data/${train}/stats.h5 \ 181 | --feature_type ${feature_type} | tee exp/calculate_statistics/calc_stats_${train}.log 182 | echo "statistics are successfully calculated." 
183 | fi 184 | # }}} 185 | 186 | 187 | # STAGE 3 {{{ 188 | if echo ${stage} | grep -q 3 && ${use_noise_shaping}; then 189 | echo "###########################################################" 190 | echo "# NOISE WEIGHTING STEP #" 191 | echo "###########################################################" 192 | [ ! -e exp/noise_shaping ] && mkdir -p exp/noise_shaping 193 | noise_shaping.py \ 194 | --waveforms data/${train}/wav_hpf.scp \ 195 | --stats data/${train}/stats.h5 \ 196 | --outdir wav_nwf/${train} \ 197 | --feature_type ${feature_type} \ 198 | --fs ${fs} \ 199 | --shiftms ${shiftms} \ 200 | --mcep_dim_start 2 \ 201 | --mcep_dim_end $(( 2 + mcep_dim +1 )) \ 202 | --mcep_alpha ${mcep_alpha} \ 203 | --mag ${mag} \ 204 | --inv true \ 205 | --n_jobs ${n_jobs} 2>&1 | tee exp/noise_shaping/noise_shaping_apply_${train}.log 206 | 207 | # check the number of feature files 208 | n_wavs=$(wc -l data/${train}/wav_hpf.scp) 209 | n_ns=$(find wav_nwf/${train} -name "*.wav" | wc -l) 210 | echo "${n_ns}/${n_wavs} files are successfully processed." 211 | 212 | # make scp files 213 | find wav_nwf/${train} -name "*.wav" | sort > data/${train}/wav_nwf.scp 214 | fi # }}} 215 | 216 | 217 | # STAGE 4 {{{ 218 | # set variables 219 | if [ ! 
-n "${tag}" ];then 220 | expdir=exp/tr_arctic_16k_sd_${feature_type}_${spk}_nq${n_quantize}_na${n_aux}_nrc${n_resch}_nsc${n_skipch}_ks${kernel_size}_dp${dilation_depth}_dr${dilation_repeat}_lr${lr}_wd${weight_decay}_bl${batch_length}_bs${batch_size} 221 | if ${use_noise_shaping};then 222 | expdir=${expdir}_ns 223 | fi 224 | if ${use_upsampling};then 225 | expdir=${expdir}_up 226 | fi 227 | else 228 | expdir=exp/tr_arctic_${tag} 229 | fi 230 | if echo ${stage} | grep -q 4; then 231 | echo "###########################################################" 232 | echo "# WAVENET TRAINING STEP #" 233 | echo "###########################################################" 234 | if ${use_noise_shaping};then 235 | waveforms=data/${train}/wav_nwf.scp 236 | else 237 | waveforms=data/${train}/wav_hpf.scp 238 | fi 239 | upsampling_factor=$(echo "${shiftms} * ${fs} / 1000" | bc) 240 | [ ! -e ${expdir}/log ] && mkdir -p ${expdir}/log 241 | [ ! -e ${expdir}/stats.h5 ] && cp -v data/${train}/stats.h5 ${expdir} 242 | train.py \ 243 | --n_gpus ${n_gpus} \ 244 | --waveforms ${waveforms} \ 245 | --feats data/${train}/feats.scp \ 246 | --stats data/${train}/stats.h5 \ 247 | --expdir "${expdir}" \ 248 | --feature_type ${feature_type} \ 249 | --n_quantize ${n_quantize} \ 250 | --n_aux ${n_aux} \ 251 | --n_resch ${n_resch} \ 252 | --n_skipch ${n_skipch} \ 253 | --dilation_depth ${dilation_depth} \ 254 | --dilation_repeat ${dilation_repeat} \ 255 | --kernel_size ${kernel_size} \ 256 | --lr ${lr} \ 257 | --weight_decay ${weight_decay} \ 258 | --iters ${iters} \ 259 | --batch_length ${batch_length} \ 260 | --batch_size ${batch_size} \ 261 | --checkpoint_interval ${checkpoint_interval} \ 262 | --upsampling_factor "${upsampling_factor}" \ 263 | --use_upsampling_layer ${use_upsampling} \ 264 | --resume "${resume}" 2>&1 | tee -a ${expdir}/log/${train}.log 265 | fi 266 | # }}} 267 | 268 | 269 | # STAGE 5 {{{ 270 | [ ! -n "${outdir}" ] && outdir=${expdir}/wav 271 | [ ! 
-n "${checkpoint}" ] && checkpoint=${expdir}/checkpoint-final.pkl 272 | [ ! -n "${config}" ] && config=$(dirname ${checkpoint})/model.conf 273 | [ ! -n "${stats}" ] && stats=$(dirname ${checkpoint})/stats.h5 274 | [ ! -n "${feats}" ] && feats=data/${eval}/feats.scp 275 | if echo ${stage} | grep -q 5; then 276 | echo "###########################################################" 277 | echo "# WAVENET DECODING STEP #" 278 | echo "###########################################################" 279 | [ ! -e ${outdir}/log ] && mkdir -p ${outdir}/log 280 | decode.py \ 281 | --n_gpus ${n_gpus} \ 282 | --feats ${feats} \ 283 | --stats "${stats}" \ 284 | --outdir "${outdir}" \ 285 | --checkpoint "${checkpoint}" \ 286 | --config "${config}" \ 287 | --fs ${fs} \ 288 | --batch_size ${decode_batch_size} 2>&1 | tee ${outdir}/log/decode.log 289 | fi 290 | # }}} 291 | 292 | 293 | # STAGE 6 {{{ 294 | if echo ${stage} | grep -q 6 && ${use_noise_shaping}; then 295 | echo "###########################################################" 296 | echo "# NOISE SHAPING STEP #" 297 | echo "###########################################################" 298 | find "${outdir}" -name "*.wav" | sort > ${outdir}/wav.scp 299 | [ ! 
#!/bin/bash
############################################################
#           SCRIPT TO BUILD SD WAVENET VOCODER             #
############################################################

# Copyright 2017 Tomoki Hayashi (Nagoya University)
#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)

# Recipe overview:
#   Builds a speaker-dependent WaveNet vocoder on one CMU ARCTIC speaker using
#   WORLD features. Every stage is selected by its digit appearing in ${stage},
#   so e.g. "--stage 45" runs only training and decoding.

. ./path.sh || exit 1;
. ./cmd.sh || exit 1;

# USER SETTINGS {{{
#######################################
#           STAGE SETTING             #
#######################################
stage=0123456
# 0: data preparation step
# 1: feature extraction step
# 2: statistics calculation step
# 3: noise weighting step
# 4: training step
# 5: decoding step
# 6: noise shaping step

#######################################
#          FEATURE SETTING            #
#######################################
feature_type=world     # world or melspc (in this recipe fixed to "world")
spk=slt                # target speaker in arctic
minf0=""               # minimum f0 (if not set, conf/*.f0 will be used)
maxf0=""               # maximum f0 (if not set, conf/*.f0 will be used)
shiftms=5              # shift length in msec
fftl=1024              # fft length
highpass_cutoff=70     # highpass filter cutoff frequency (if 0, will not apply)
fs=16000               # sampling rate
mcep_dim=24            # dimension of mel-cepstrum
mcep_alpha=0.410       # alpha value of mel-cepstrum
use_noise_shaping=true # whether to use noise shaping
mag=0.5                # strength of noise shaping (0.0 < mag <= 1.0)
n_jobs=10              # number of parallel jobs

#######################################
#          TRAINING SETTING           #
#######################################
n_gpus=1                  # number of gpus
n_quantize=256            # number of quantization of waveform
n_aux=28                  # number of auxiliary features
n_resch=512               # number of residual channels
n_skipch=256              # number of skip channels
dilation_depth=10         # dilation depth (e.g. if set 10, max dilation = 2^(10-1))
dilation_repeat=3         # number of dilation repeats
kernel_size=2             # kernel size of dilated convolution
lr=1e-4                   # learning rate
weight_decay=0.0          # weight decay coef
iters=200000              # number of iterations
batch_length=20000        # batch length
batch_size=1              # batch size
checkpoint_interval=10000 # save model per this number
use_upsampling=true       # whether to use upsampling layer
resume=""                 # checkpoint path to resume (Optional)

#######################################
#          DECODING SETTING           #
#######################################
outdir=""            # directory to save decoded wav dir (Optional)
checkpoint=""        # checkpoint path to be used for decoding (Optional)
config=""            # model configuration path (Optional)
stats=""             # statistics path (Optional)
feats=""             # list or directory of feature files (Optional)
decode_batch_size=32 # batch size in decoding

#######################################
#            OTHER SETTING            #
#######################################
ARCTIC_DB_ROOT=downloads # directory including DB (if DB not exists, will be downloaded)
tag=""                   # tag for network directory naming (Optional)

# parse options (every variable defined above can be overridden as --name value)
. parse_options.sh || exit 1;

# check feature type
# (quoted so that an empty value is reported instead of breaking the test,
#  and the messages go to stderr instead of being merged into stdout)
if [ "${feature_type}" != "world" ]; then
    echo "This recipe does not support feature_type=\"melspc\"." >&2
    echo "Please try the egs/arctic/sd-melspc." >&2
    exit 1;
fi

# set directory names
train=tr_${spk}
eval=ev_${spk}

# stop when error occurred
set -euo pipefail
# }}}


# STAGE 0 {{{
if echo ${stage} | grep -q 0; then
    echo "###########################################################"
    echo "#                 DATA PREPARATION STEP                   #"
    echo "###########################################################"
    if [ ! -e "${ARCTIC_DB_ROOT}/.done" ];then
        mkdir -p "${ARCTIC_DB_ROOT}"
        # download inside a subshell so that the working directory of this
        # script is preserved even if ARCTIC_DB_ROOT is a nested or absolute
        # path (a plain "cd ../" would not return to the recipe directory)
        (
            cd "${ARCTIC_DB_ROOT}"
            for id in bdl slt rms clb jmk ksp awb;do
                wget http://festvox.org/cmu_arctic/cmu_arctic/packed/cmu_us_${id}_arctic-0.95-release.tar.bz2
                tar xf cmu_us_${id}*.tar.bz2
            done
            rm ./*.tar.bz2
        )
        touch "${ARCTIC_DB_ROOT}/.done"
        echo "database is successfully downloaded."
    fi
    [ ! -e data/local ] && mkdir -p data/local
    [ ! -e data/${train} ] && mkdir -p data/${train}
    [ ! -e data/${eval} ] && mkdir -p data/${eval}
    find "${ARCTIC_DB_ROOT}/cmu_us_${spk}_arctic/wav" -name "*.wav" \
        | sort > "data/local/wav.${spk}.scp"
    # overwrite (not append) so that rerunning this stage does not duplicate
    # every utterance in the lists
    head -n 1028 "data/local/wav.${spk}.scp" > "data/${train}/wav.scp"
    tail -n 104 "data/local/wav.${spk}.scp" > "data/${eval}/wav.scp"
    echo "making wav list for training is successfully done. (#training = $(wc -l < data/${train}/wav.scp))"
    echo "making wav list for evaluation is successfully done. (#evaluation = $(wc -l < data/${eval}/wav.scp))"
fi
# }}}


# STAGE 1 {{{
if echo ${stage} | grep -q 1; then
    echo "###########################################################"
    echo "#               FEATURE EXTRACTION STEP                   #"
    echo "###########################################################"
    # fall back to the per-speaker f0 range when it is not given explicitly
    [ ! -n "${minf0}" ] && minf0=$(awk '{print $1}' conf/${spk}.f0)
    [ ! -n "${maxf0}" ] && maxf0=$(awk '{print $2}' conf/${spk}.f0)
    for set in ${train} ${eval};do
        # feature extraction of the current set (training or evaluation)
        ${train_cmd} --num-threads ${n_jobs} exp/feature_extract/feature_extract_${set}.log \
            feature_extract.py \
                --waveforms data/${set}/wav.scp \
                --wavdir wav_hpf/${set} \
                --hdf5dir hdf5/${set} \
                --feature_type ${feature_type} \
                --fs ${fs} \
                --shiftms ${shiftms} \
                --minf0 "${minf0}" \
                --maxf0 "${maxf0}" \
                --mcep_dim ${mcep_dim} \
                --mcep_alpha ${mcep_alpha} \
                --highpass_cutoff ${highpass_cutoff} \
                --fftl ${fftl} \
                --n_jobs ${n_jobs}

        # check the number of feature files
        # (read from stdin so that wc prints only the count, not "N path")
        n_wavs=$(wc -l < data/${set}/wav.scp)
        n_feats=$(find hdf5/${set} -name "*.h5" | wc -l)
        echo "${n_feats}/${n_wavs} files are successfully processed."

        # make scp files
        if [ ${highpass_cutoff} -eq 0 ];then
            cp data/${set}/wav.scp data/${set}/wav_hpf.scp
        else
            find wav_hpf/${set} -name "*.wav" | sort > data/${set}/wav_hpf.scp
        fi
        find hdf5/${set} -name "*.h5" | sort > data/${set}/feats.scp
    done
fi
# }}}


# STAGE 2 {{{
if echo ${stage} | grep -q 2; then
    echo "###########################################################"
    echo "#              CALCULATE STATISTICS STEP                  #"
    echo "###########################################################"
    ${train_cmd} exp/calculate_statistics/calc_stats_${train}.log \
        calc_stats.py \
            --feats data/${train}/feats.scp \
            --stats data/${train}/stats.h5 \
            --feature_type ${feature_type}
    echo "statistics are successfully calculated."
fi
# }}}


# STAGE 3 {{{
if echo ${stage} | grep -q 3 && ${use_noise_shaping}; then
    echo "###########################################################"
    echo "#                   NOISE WEIGHTING STEP                  #"
    echo "###########################################################"
    ${train_cmd} --num-threads ${n_jobs} exp/noise_shaping/noise_shaping_apply_${train}.log \
        noise_shaping.py \
            --waveforms data/${train}/wav_hpf.scp \
            --stats data/${train}/stats.h5 \
            --outdir wav_nwf/${train} \
            --feature_type ${feature_type} \
            --fs ${fs} \
            --shiftms ${shiftms} \
            --mcep_dim_start 2 \
            --mcep_dim_end $(( 2 + mcep_dim +1 )) \
            --mcep_alpha ${mcep_alpha} \
            --mag ${mag} \
            --inv true \
            --n_jobs ${n_jobs}

    # check the number of noise-weighted files
    n_wavs=$(wc -l < data/${train}/wav_hpf.scp)
    n_ns=$(find wav_nwf/${train} -name "*.wav" | wc -l)
    echo "${n_ns}/${n_wavs} files are successfully processed."

    # make scp files
    find wav_nwf/${train} -name "*.wav" | sort > data/${train}/wav_nwf.scp
fi # }}}


# STAGE 4 {{{
# set variables
# (expdir is needed by stages 5 and 6 too, so it is always computed)
if [ ! -n "${tag}" ];then
    expdir=exp/tr_arctic_16k_sd_${feature_type}_${spk}_nq${n_quantize}_na${n_aux}_nrc${n_resch}_nsc${n_skipch}_ks${kernel_size}_dp${dilation_depth}_dr${dilation_repeat}_lr${lr}_wd${weight_decay}_bl${batch_length}_bs${batch_size}
    if ${use_noise_shaping};then
        expdir=${expdir}_ns
    fi
    if ${use_upsampling};then
        expdir=${expdir}_up
    fi
else
    expdir=exp/tr_arctic_${tag}
fi
if echo ${stage} | grep -q 4; then
    echo "###########################################################"
    echo "#               WAVENET TRAINING STEP                     #"
    echo "###########################################################"
    if ${use_noise_shaping};then
        waveforms=data/${train}/wav_nwf.scp
    else
        waveforms=data/${train}/wav_hpf.scp
    fi
    # number of waveform samples per feature frame
    upsampling_factor=$(echo "${shiftms} * ${fs} / 1000" | bc)
    [ ! -e ${expdir}/log ] && mkdir -p ${expdir}/log
    [ ! -e ${expdir}/stats.h5 ] && cp -v data/${train}/stats.h5 ${expdir}
    ${cuda_cmd} --gpu ${n_gpus} "${expdir}/log/${train}.log" \
        train.py \
            --n_gpus ${n_gpus} \
            --waveforms ${waveforms} \
            --feats data/${train}/feats.scp \
            --stats data/${train}/stats.h5 \
            --expdir "${expdir}" \
            --feature_type ${feature_type} \
            --n_quantize ${n_quantize} \
            --n_aux ${n_aux} \
            --n_resch ${n_resch} \
            --n_skipch ${n_skipch} \
            --dilation_depth ${dilation_depth} \
            --dilation_repeat ${dilation_repeat} \
            --kernel_size ${kernel_size} \
            --lr ${lr} \
            --weight_decay ${weight_decay} \
            --iters ${iters} \
            --batch_length ${batch_length} \
            --batch_size ${batch_size} \
            --checkpoint_interval ${checkpoint_interval} \
            --upsampling_factor "${upsampling_factor}" \
            --use_upsampling_layer ${use_upsampling} \
            --resume "${resume}"
fi
# }}}


# STAGE 5 {{{
# defaults of the decoding options are derived from the experiment directory
[ ! -n "${outdir}" ] && outdir=${expdir}/wav
[ ! -n "${checkpoint}" ] && checkpoint=${expdir}/checkpoint-final.pkl
[ ! -n "${config}" ] && config=$(dirname ${checkpoint})/model.conf
[ ! -n "${stats}" ] && stats=$(dirname ${checkpoint})/stats.h5
[ ! -n "${feats}" ] && feats=data/${eval}/feats.scp
if echo ${stage} | grep -q 5; then
    echo "###########################################################"
    echo "#               WAVENET DECODING STEP                     #"
    echo "###########################################################"
    ${cuda_cmd} --gpu ${n_gpus} "${outdir}/log/decode.log" \
        decode.py \
            --n_gpus ${n_gpus} \
            --feats ${feats} \
            --stats ${stats} \
            --outdir "${outdir}" \
            --checkpoint "${checkpoint}" \
            --config "${config}" \
            --fs ${fs} \
            --batch_size ${decode_batch_size}
fi
# }}}


# STAGE 6 {{{
if echo ${stage} | grep -q 6 && ${use_noise_shaping}; then
    echo "###########################################################"
    echo "#                   NOISE SHAPING STEP                    #"
    echo "###########################################################"
    find "${outdir}" -name "*.wav" | sort > ${outdir}/wav.scp
    ${train_cmd} --num-threads ${n_jobs} exp/noise_shaping/noise_shaping_restore_${eval}.log \
        noise_shaping.py \
            --waveforms ${outdir}/wav.scp \
            --stats ${stats} \
            --outdir "${outdir}"_nsf \
            --feature_type ${feature_type} \
            --fs ${fs} \
            --shiftms ${shiftms} \
            --mcep_dim_start 2 \
            --mcep_dim_end $(( 2 + mcep_dim +1 )) \
            --mcep_alpha ${mcep_alpha} \
            --mag ${mag} \
            --n_jobs ${n_jobs} \
            --inv false
fi
# }}}
def pad_list(batch_list, pad_value=0.0):
    """Pad variable-length arrays to a common length and stack them.

    Args:
        batch_list (list): List of arrays, where the shape of i-th array is (T_i, C).
        pad_value (float): Value used to fill the padded frames.

    Returns:
        ndarray: Padded batch with the shape (B, T_max, C).

    """
    batch_size = len(batch_list)
    maxlen = max(batch.shape[0] for batch in batch_list)
    n_feats = batch_list[0].shape[-1]
    # NOTE: the buffer used to be allocated with np.zeros, which silently
    # ignored ``pad_value``; the float64 dtype of that buffer is kept.
    batch_pad = np.full((batch_size, maxlen, n_feats), pad_value, dtype=np.float64)
    for idx, batch in enumerate(batch_list):
        batch_pad[idx, :batch.shape[0]] = batch

    return batch_pad


def _load_decoding_inputs(featfile,
                          feature_type,
                          wav_transform,
                          feat_transform,
                          upsampling_factor,
                          use_upsampling_layer,
                          use_speaker_code):
    """Load the seed waveform and the auxiliary features of one utterance.

    Args:
        featfile (str): Path of the hdf5 feature file.
        feature_type (str): Feature type (name of the hdf5 dataset to read).
        wav_transform (func): Preprocessing function for waveform (or None).
        feat_transform (func): Preprocessing function for aux feats (or None).
        upsampling_factor (int): Upsampling factor.
        use_upsampling_layer (bool): Whether to use upsampling layer.
        use_speaker_code (bool): Whether to use speaker code.

    Returns:
        tuple: Seed waveform ``x`` and auxiliary features ``h`` (T x C) after
            the optional transforms.

    """
    # a single zero sample is used as the seed of the generation
    x = np.zeros((1))
    h = read_hdf5(featfile, "/" + feature_type)
    if not use_upsampling_layer:
        # without the upsampling layer the features are extended in time here
        h = extend_time(h, upsampling_factor)
    if use_speaker_code:
        # repeat the speaker code for every frame and append it to the features
        sc = read_hdf5(featfile, "/speaker_code")
        sc = np.tile(sc, [h.shape[0], 1])
        h = np.concatenate([h, sc], axis=1)

    # perform pre-processing
    if wav_transform is not None:
        x = wav_transform(x)
    if feat_transform is not None:
        h = feat_transform(h)

    return x, h


def decode_generator(feat_list,
                     batch_size=32,
                     feature_type="world",
                     wav_transform=None,
                     feat_transform=None,
                     upsampling_factor=80,
                     use_upsampling_layer=True,
                     use_speaker_code=False):
    """GENERATE DECODING BATCH.

    Args:
        feat_list (list): List of feature files.
        batch_size (int): Batch size in decoding.
        feature_type (str): Feature type.
        wav_transform (func): Preprocessing function for waveform.
        feat_transform (func): Preprocessing function for aux feats.
        upsampling_factor (int): Upsampling factor.
        use_upsampling_layer (bool): Whether to use upsampling layer.
        use_speaker_code (bool): Whether to use speaker code.

    Returns:
        generator: Generator instance. With ``batch_size == 1`` it yields
            ``feat_id, (x, h, n_samples)`` per file, otherwise it yields
            ``feat_ids, (batch_x, batch_h, n_samples_list)`` per batch.

    """
    # nothing to decode (e.g. more gpus than feature files); without this
    # guard the batch mode fails in np.array_split with zero sections
    if len(feat_list) == 0:
        return

    # ---------------------------
    # sample-by-sample generation
    # ---------------------------
    if batch_size == 1:
        for featfile in feat_list:
            x, h = _load_decoding_inputs(
                featfile, feature_type, wav_transform, feat_transform,
                upsampling_factor, use_upsampling_layer, use_speaker_code)

            # convert to torch variable
            x = torch.from_numpy(x).long()
            h = torch.from_numpy(h).float()
            x = x.unsqueeze(0)  # 1 => 1 x 1
            h = h.transpose(0, 1).unsqueeze(0)  # T x C => 1 x C x T

            # send to cuda
            if torch.cuda.is_available():
                x = x.cuda()
                h = h.cuda()

            # get target length and file id
            # (one sample is already given as the seed, hence the "- 1")
            if not use_upsampling_layer:
                n_samples = h.size(2) - 1
            else:
                n_samples = h.size(2) * upsampling_factor - 1
            feat_id = os.path.basename(featfile).replace(".h5", "")

            yield feat_id, (x, h, n_samples)

    # ----------------
    # batch generation
    # ----------------
    else:
        # sort with the feature length
        shape_list = [shape_hdf5(f, "/" + feature_type)[0] for f in feat_list]
        idx = np.argsort(shape_list)
        feat_list = [feat_list[i] for i in idx]

        # divide into batch list
        n_batch = math.ceil(len(feat_list) / batch_size)
        batch_lists = np.array_split(feat_list, n_batch)
        batch_lists = [f.tolist() for f in batch_lists]

        for batch_list in batch_lists:
            batch_x = []
            batch_h = []
            n_samples_list = []
            feat_ids = []
            for featfile in batch_list:
                # make seed waveform and load aux feature
                x, h = _load_decoding_inputs(
                    featfile, feature_type, wav_transform, feat_transform,
                    upsampling_factor, use_upsampling_layer, use_speaker_code)

                # append to list
                batch_x += [x]
                batch_h += [h]
                if not use_upsampling_layer:
                    n_samples_list += [h.shape[0] - 1]
                else:
                    n_samples_list += [h.shape[0] * upsampling_factor - 1]
                feat_ids += [os.path.basename(featfile).replace(".h5", "")]

            # convert list to ndarray
            batch_x = np.stack(batch_x, axis=0)
            batch_h = pad_list(batch_h)

            # convert to torch variable
            batch_x = torch.from_numpy(batch_x).long()
            batch_h = torch.from_numpy(batch_h).float().transpose(1, 2)

            # send to cuda
            if torch.cuda.is_available():
                batch_x = batch_x.cuda()
                batch_h = batch_h.cuda()

            yield feat_ids, (batch_x, batch_h, n_samples_list)


def main():
    """RUN DECODING.

    Parses the command line options, splits the feature files over the gpus
    and runs one decoding process per gpu. Exits with status 1 when --feats
    is neither a directory nor a file, or when any decoding process fails.

    """
    parser = argparse.ArgumentParser()
    # decode setting
    parser.add_argument("--feats", required=True,
                        type=str, help="list or directory of aux feat files")
    parser.add_argument("--checkpoint", required=True,
                        type=str, help="model file")
    parser.add_argument("--outdir", required=True,
                        type=str, help="directory to save generated samples")
    parser.add_argument("--stats", default=None,
                        type=str, help="hdf5 file including statistics")
    parser.add_argument("--config", default=None,
                        type=str, help="configure file")
    parser.add_argument("--fs", default=16000,
                        type=int, help="sampling rate")
    parser.add_argument("--batch_size", default=32,
                        type=int, help="number of batch size in decoding")
    parser.add_argument("--n_gpus", default=1,
                        type=int, help="number of gpus")
    # other setting
    parser.add_argument("--intervals", default=1000,
                        type=int, help="log interval")
    parser.add_argument("--seed", default=1,
                        type=int, help="seed number")
    parser.add_argument("--verbose", default=1,
                        type=int, help="log level")
    args = parser.parse_args()

    # set log level
    # (the larger threshold must be tested first; testing "> 0" first made
    #  the DEBUG branch unreachable)
    if args.verbose > 1:
        log_level = logging.DEBUG
    elif args.verbose > 0:
        log_level = logging.INFO
    else:
        log_level = logging.WARNING
    logging.basicConfig(level=log_level,
                        format='%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s',
                        datefmt='%m/%d/%Y %I:%M:%S')
    if args.verbose <= 0:
        logging.warning("logging is disabled.")

    # show arguments
    for key, value in vars(args).items():
        logging.info("%s = %s", key, str(value))

    # check arguments
    if args.stats is None:
        args.stats = os.path.dirname(args.checkpoint) + "/stats.h5"
    if args.config is None:
        args.config = os.path.dirname(args.checkpoint) + "/model.conf"
    if not os.path.exists(args.stats):
        raise FileNotFoundError("statistics file is missing (%s)." % (args.stats))
    if not os.path.exists(args.config):
        raise FileNotFoundError("config file is missing (%s)." % (args.config))

    # check directory existence
    os.makedirs(args.outdir, exist_ok=True)

    # fix seed
    os.environ['PYTHONHASHSEED'] = str(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

    # fix slow computation of dilated conv
    # https://github.com/pytorch/pytorch/issues/15054#issuecomment-450191923
    torch.backends.cudnn.benchmark = True

    # load config
    # NOTE(review): torch.load unpickles the file, so only configs written by
    # a trusted training run should be passed here.
    config = torch.load(args.config)

    # get file list
    if os.path.isdir(args.feats):
        feat_list = sorted(find_files(args.feats, "*.h5"))
    elif os.path.isfile(args.feats):
        feat_list = read_txt(args.feats)
    else:
        logging.error("--feats should be directory or list.")
        sys.exit(1)

    # prepare the file list for parallel decoding
    feat_lists = np.array_split(feat_list, args.n_gpus)
    feat_lists = [f_list.tolist() for f_list in feat_lists]

    # define transform
    scaler = StandardScaler()
    scaler.mean_ = read_hdf5(args.stats, "/" + config.feature_type + "/mean")
    scaler.scale_ = read_hdf5(args.stats, "/" + config.feature_type + "/scale")
    wav_transform = transforms.Compose([
        lambda x: encode_mu_law(x, config.n_quantize)])
    feat_transform = transforms.Compose([
        lambda x: scaler.transform(x)])

    # define gpu decode function
    # NOTE(review): a nested function is used as the process target, which
    # presumably relies on the "fork" start method - confirm before porting.
    def gpu_decode(feat_list, gpu):
        # set default gpu and do not track gradient
        torch.cuda.set_device(gpu)
        torch.set_grad_enabled(False)

        # define model and load parameters
        if config.use_upsampling_layer:
            upsampling_factor = config.upsampling_factor
        else:
            upsampling_factor = 0
        model = WaveNet(
            n_quantize=config.n_quantize,
            n_aux=config.n_aux,
            n_resch=config.n_resch,
            n_skipch=config.n_skipch,
            dilation_depth=config.dilation_depth,
            dilation_repeat=config.dilation_repeat,
            kernel_size=config.kernel_size,
            upsampling_factor=upsampling_factor)
        model.load_state_dict(torch.load(
            args.checkpoint,
            map_location=lambda storage,
            loc: storage)["model"])
        model.eval()
        model.cuda()

        # define generator
        generator = decode_generator(
            feat_list,
            batch_size=args.batch_size,
            feature_type=config.feature_type,
            wav_transform=wav_transform,
            feat_transform=feat_transform,
            upsampling_factor=config.upsampling_factor,
            use_upsampling_layer=config.use_upsampling_layer,
            use_speaker_code=config.use_speaker_code)

        # decode
        if args.batch_size > 1:
            for feat_ids, (batch_x, batch_h, n_samples_list) in generator:
                logging.info("decoding start")
                samples_list = model.batch_fast_generate(
                    batch_x, batch_h, n_samples_list, args.intervals)
                for feat_id, samples in zip(feat_ids, samples_list):
                    wav = decode_mu_law(samples, config.n_quantize)
                    sf.write(args.outdir + "/" + feat_id + ".wav", wav, args.fs, "PCM_16")
                    logging.info("wrote %s.wav in %s." % (feat_id, args.outdir))
        else:
            for feat_id, (x, h, n_samples) in generator:
                logging.info("decoding %s (length = %d)" % (feat_id, n_samples))
                samples = model.fast_generate(x, h, n_samples, args.intervals)
                wav = decode_mu_law(samples, config.n_quantize)
                sf.write(args.outdir + "/" + feat_id + ".wav", wav, args.fs, "PCM_16")
                logging.info("wrote %s.wav in %s." % (feat_id, args.outdir))

    # parallel decode
    processes = []
    for gpu, feat_list in enumerate(feat_lists):
        p = mp.Process(target=gpu_decode, args=(feat_list, gpu,))
        p.start()
        processes.append(p)

    # wait for all process
    for p in processes:
        p.join()

    # propagate worker failures so that the calling recipe can stop
    n_failed = sum(1 for p in processes if p.exitcode != 0)
    if n_failed > 0:
        logging.error("%d decoding process(es) failed.", n_failed)
        sys.exit(1)


if __name__ == "__main__":
    main()
use_noise_shaping=true # whether to use noise shaping
mag=0.5                # strength of noise shaping (0.0 < mag <= 1.0)
n_jobs=10              # number of parallel jobs

#######################################
# TRAINING SETTING #
#######################################
n_gpus=1                  # number of gpus
n_quantize=256            # number of quantization of waveform
n_aux=80                  # number of auxiliary features
n_resch=512               # number of residual channels
n_skipch=256              # number of skip channels
dilation_depth=10         # dilation depth (e.g. if set 10, max dilation = 2^(10-1))
dilation_repeat=3         # number of dilation repeats
kernel_size=3             # kernel size of dilated convolution
lr=1e-4                   # learning rate
weight_decay=0.0          # weight decay coef
iters=200000              # number of iterations
batch_length=15000        # batch length
batch_size=1              # batch size
checkpoint_interval=10000 # save model per this number
use_upsampling=true       # whether to use upsampling layer
resume=""                 # checkpoint path to resume (Optional)

#######################################
# DECODING SETTING #
#######################################
outdir=""            # directory to save decoded wav dir (Optional)
checkpoint=""        # checkpoint path to be used for decoding (Optional)
config=""            # model configuration path (Optional)
stats=""             # statistics path (Optional)
feats=""             # list or directory of feature files (Optional)
decode_batch_size=16 # batch size in decoding

#######################################
# OTHER SETTING #
#######################################
LJSPEECH_DB_ROOT=downloads # directory including DB (if DB not exists, will be downloaded)
tag=""                     # tag for network directory naming (Optional)

# parse options
. parse_options.sh || exit 1;

# check feature type (this recipe only supports mel-spectrogram features)
if [ "${feature_type}" != "melspc" ]; then
    # diagnostics belong on stderr, so redirect stdout to stderr (>&2)
    echo "This recipe does not support feature_type=\"world\"." >&2
    echo "Please try the egs/ljspeech/sd." >&2
    exit 1;
fi

# set directory names
train=tr
eval=ev

# stop when error occurred
set -euo pipefail
# }}}


# STAGE 0 {{{
if echo ${stage} | grep -q 0; then
    echo "###########################################################"
    echo "# DATA PREPARATION STEP #"
    echo "###########################################################"
    if [ ! -e "${LJSPEECH_DB_ROOT}/.done" ];then
        mkdir -p "${LJSPEECH_DB_ROOT}"
        # download inside a subshell so that the working directory is restored
        # even when LJSPEECH_DB_ROOT is a nested or an absolute path
        (
            cd "${LJSPEECH_DB_ROOT}"
            wget http://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2
            tar -vxf ./*.tar.bz2
            rm ./*.tar.bz2
        )
        touch "${LJSPEECH_DB_ROOT}/.done"
        echo "database is successfully downloaded."
    fi
    mkdir -p data/local "data/${train}" "data/${eval}"
    find "${LJSPEECH_DB_ROOT}/LJSpeech-1.1/wavs" -name "*.wav" \
        | sort > data/local/wav.scp
    # utterances of the LJ050 chapter are held out for evaluation
    grep -v LJ050 data/local/wav.scp > "data/${train}/wav.scp"
    grep LJ050 data/local/wav.scp > "data/${eval}/wav.scp"
    echo "making wav list for training is successfully done. (#training = $(wc -l < data/${train}/wav.scp))"
    echo "making wav list for evaluation is successfully done. (#evaluation = $(wc -l < data/${eval}/wav.scp))"
fi
# }}}


# STAGE 1 {{{
if echo ${stage} | grep -q 1; then
    echo "###########################################################"
    echo "# FEATURE EXTRACTION STEP #"
    echo "###########################################################"
    for set in ${train} ${eval};do
        # mel-spectrogram extraction (also writes the highpass filtered wav files)
        ${train_cmd} --num-threads ${n_jobs} exp/feature_extract/feature_extract_${feature_type}_${set}.log \
            feature_extract.py \
                --waveforms data/${set}/wav.scp \
                --wavdir wav_hpf/${set} \
                --hdf5dir hdf5/${set} \
                --feature_type ${feature_type} \
                --fs ${fs} \
                --shiftms ${shiftms} \
                --mspc_dim ${mspc_dim} \
                --highpass_cutoff ${highpass_cutoff} \
                --fftl ${fftl} \
                --fmin "${fmin}" \
                --fmax "${fmax}" \
                --n_jobs ${n_jobs}

        # extract stft-based mel-cepstrum for noise shaping (training set only)
        if [ ${set} = ${train} ] && ${use_noise_shaping};then
            ${train_cmd} --num-threads ${n_jobs} exp/feature_extract/feature_extract_mcep_${set}.log \
                feature_extract.py \
                    --waveforms data/${set}/wav.scp \
                    --wavdir wav_hpf/${set} \
                    --hdf5dir hdf5/${set} \
                    --feature_type mcep \
                    --fs ${fs} \
                    --shiftms ${shiftms} \
                    --mcep_dim ${mcep_dim} \
                    --mcep_alpha ${mcep_alpha} \
                    --highpass_cutoff ${highpass_cutoff} \
                    --save_wav false \
                    --fftl ${fftl} \
                    --n_jobs ${n_jobs}
        fi

        # check the number of feature files
        # (read via stdin so that wc prints only the count, not the file name)
        n_wavs=$(wc -l < data/${set}/wav.scp)
        n_feats=$(find hdf5/${set} -name "*.h5" | wc -l)
        echo "${n_feats}/${n_wavs} files are successfully processed."

        # make scp files
        if [ ${highpass_cutoff} -eq 0 ];then
            cp data/${set}/wav.scp data/${set}/wav_hpf.scp
        else
            find wav_hpf/${set} -name "*.wav" | sort > data/${set}/wav_hpf.scp
        fi
        find hdf5/${set} -name "*.h5" | sort > data/${set}/feats.scp
    done

fi
# }}}


# STAGE 2 {{{
if echo ${stage} | grep -q 2; then
    echo "###########################################################"
    echo "# CALCULATE STATISTICS STEP #"
    echo "###########################################################"
    ${train_cmd} exp/calculate_statistics/calc_stats_${feature_type}_${train}.log \
        calc_stats.py \
            --feats data/${train}/feats.scp \
            --stats data/${train}/stats.h5 \
            --feature_type ${feature_type}
    if ${use_noise_shaping};then
        ${train_cmd} exp/calculate_statistics/calc_stats_mcep_${train}.log \
            calc_stats.py \
                --feats data/${train}/feats.scp \
                --stats data/${train}/stats.h5 \
                --feature_type mcep
    fi
    echo "statistics are successfully calculated."
fi
# }}}


# STAGE 3 {{{
if echo ${stage} | grep -q 3 && ${use_noise_shaping}; then
    echo "###########################################################"
    echo "# NOISE WEIGHTING STEP #"
    echo "###########################################################"
    ${train_cmd} --num-threads ${n_jobs} exp/noise_shaping/noise_shaping_apply_mcep_${train}.log \
        noise_shaping.py \
            --waveforms data/${train}/wav_hpf.scp \
            --stats data/${train}/stats.h5 \
            --outdir wav_nwf/${train} \
            --feature_type mcep \
            --fs ${fs} \
            --shiftms ${shiftms} \
            --mcep_alpha ${mcep_alpha} \
            --mag ${mag} \
            --inv true \
            --n_jobs ${n_jobs}

    # check the number of noise weighted files
    n_wavs=$(wc -l < data/${train}/wav_hpf.scp)
    n_ns=$(find wav_nwf/${train} -name "*.wav" | wc -l)
    echo "${n_ns}/${n_wavs} files are successfully processed."

    # make scp files
    find wav_nwf/${train} -name "*.wav" | sort > data/${train}/wav_nwf.scp
fi
# }}}


# STAGE 4 {{{
# set variables
if [ -z "${tag}" ];then
    expdir=exp/tr_ljspeech_22k_sd_${feature_type}_nq${n_quantize}_na${n_aux}_nrc${n_resch}_nsc${n_skipch}_ks${kernel_size}_dp${dilation_depth}_dr${dilation_repeat}_lr${lr}_wd${weight_decay}_bl${batch_length}_bs${batch_size}
    if ${use_noise_shaping};then
        expdir=${expdir}_ns
    fi
    if ${use_upsampling};then
        expdir=${expdir}_up
    fi
else
    expdir=exp/tr_ljspeech_22k_${tag}
fi
if echo ${stage} | grep -q 4; then
    echo "###########################################################"
    echo "# WAVENET TRAINING STEP #"
    echo "###########################################################"
    if ${use_noise_shaping};then
        waveforms=data/${train}/wav_nwf.scp
    else
        waveforms=data/${train}/wav_hpf.scp
    fi
    # frame shift in samples; bc truncates the quotient to an integer
    upsampling_factor=$(echo "${shiftms} * ${fs} / 1000" | bc)
    [ ! -e "${expdir}/log" ] && mkdir -p "${expdir}/log"
    [ ! -e "${expdir}/stats.h5" ] && cp -v data/${train}/stats.h5 "${expdir}"
    ${cuda_cmd} --gpu ${n_gpus} "${expdir}/log/${train}.log" \
        train.py \
            --n_gpus ${n_gpus} \
            --waveforms ${waveforms} \
            --feats data/${train}/feats.scp \
            --stats data/${train}/stats.h5 \
            --expdir "${expdir}" \
            --feature_type ${feature_type} \
            --n_quantize ${n_quantize} \
            --n_aux ${n_aux} \
            --n_resch ${n_resch} \
            --n_skipch ${n_skipch} \
            --dilation_depth ${dilation_depth} \
            --dilation_repeat ${dilation_repeat} \
            --kernel_size ${kernel_size} \
            --lr ${lr} \
            --weight_decay ${weight_decay} \
            --iters ${iters} \
            --batch_length ${batch_length} \
            --batch_size ${batch_size} \
            --checkpoint_interval ${checkpoint_interval} \
            --upsampling_factor "${upsampling_factor}" \
            --use_upsampling_layer ${use_upsampling} \
            --resume "${resume}"
fi
# }}}


# STAGE 5 {{{
# fill in the decoding defaults that were not given on the command line
[ -z "${outdir}" ] && outdir=${expdir}/wav
[ -z "${checkpoint}" ] && checkpoint=${expdir}/checkpoint-final.pkl
[ -z "${config}" ] && config=$(dirname "${checkpoint}")/model.conf
[ -z "${stats}" ] && stats=$(dirname "${checkpoint}")/stats.h5
[ -z "${feats}" ] && feats=data/${eval}/feats.scp
if echo ${stage} | grep -q 5; then
    echo "###########################################################"
    echo "# WAVENET DECODING STEP #"
    echo "###########################################################"
    ${cuda_cmd} --gpu ${n_gpus} "${outdir}/log/decode.log" \
        decode.py \
            --n_gpus ${n_gpus} \
            --feats ${feats} \
            --stats ${stats} \
            --outdir "${outdir}" \
            --checkpoint "${checkpoint}" \
            --config "${config}" \
            --fs ${fs} \
            --batch_size ${decode_batch_size}
fi
# }}}


# STAGE 6 {{{
if echo ${stage} | grep -q 6 && ${use_noise_shaping}; then
    echo "###########################################################"
    echo "# NOISE SHAPING STEP #"
    echo "###########################################################"
    find "${outdir}" -name "*.wav" | sort > "${outdir}/wav.scp"
    ${train_cmd} --num-threads ${n_jobs} exp/noise_shaping/noise_shaping_mcep_${eval}.log \
        noise_shaping.py \
            --waveforms "${outdir}/wav.scp" \
            --stats ${stats} \
            --outdir "${outdir}_nsf" \
            --feature_type mcep \
            --fs ${fs} \
            --shiftms ${shiftms} \
            --mcep_alpha ${mcep_alpha} \
            --mag ${mag} \
            --n_jobs ${n_jobs} \
            --inv false
fi
# }}}
#!/usr/bin/env python
# -*- coding: utf-8 -*-

# Copyright 2017 Tomoki Hayashi (Nagoya University)
# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)

"""Extract acoustic features (WORLD, mel-spectrogram or mel-cepstrum) in parallel.

The optional backends (librosa, pysptk, sprocket) and the project utilities
are imported inside the functions that use them, so that extracting one
feature type does not require the packages of the other feature types.
"""

import argparse
import logging
import multiprocessing as mp
import os
import sys

import numpy as np

from scipy.interpolate import interp1d
from scipy.io import wavfile
from scipy.signal import firwin
from scipy.signal import get_window
from scipy.signal import lfilter

EPS = 1e-10


def strtobool(val):
    """CONVERT A TRUTH STRING TO 1 OR 0.

    Local replacement of ``distutils.util.strtobool`` (distutils is removed
    from the standard library in Python 3.12).

    Args:
        val (str): One of y/yes/t/true/on/1 or n/no/f/false/off/0 (case insensitive).

    Returns:
        int: 1 for a true value, 0 for a false value.

    Raises:
        ValueError: If ``val`` is not a recognized truth string.

    """
    val = val.lower()
    if val in ("y", "yes", "t", "true", "on", "1"):
        return 1
    if val in ("n", "no", "f", "false", "off", "0"):
        return 0
    raise ValueError("invalid truth value %r" % (val,))


def _int_or_none(value):
    """Parse an integer option, mapping an empty string (or None) to None."""
    if value is None or value == "":
        return None
    return int(value)


def low_cut_filter(x, fs, cutoff=70):
    """APPLY LOW CUT FILTER.

    Args:
        x (ndarray): Waveform sequence.
        fs (int): Sampling frequency.
        cutoff (float): Cutoff frequency of low cut filter.

    Return:
        ndarray: Low cut filtered waveform sequence.

    """
    # use a float Nyquist frequency; integer division would truncate it for
    # odd sampling rates (and yield a zero cutoff under Python 2 division)
    nyquist = fs / 2.0
    norm_cutoff = cutoff / nyquist

    # 255-tap FIR highpass filter (pass_zero=False rejects the DC side)
    fil = firwin(255, norm_cutoff, pass_zero=False)
    lcf_x = lfilter(fil, 1, x)

    return lcf_x


def low_pass_filter(x, fs, cutoff=70, padding=True):
    """APPLY LOW PASS FILTER.

    The input is edge-padded before filtering and the filter delay is removed
    afterwards, so the output has the same length as the input.

    Args:
        x (ndarray): Waveform sequence.
        fs (int): Sampling frequency.
        cutoff (float): Cutoff frequency of low pass filter.
        padding (bool): Currently ignored (the input is always padded);
            kept for backward compatibility of the signature.

    Returns:
        ndarray: Low pass filtered waveform sequence

    """
    nyquist = fs / 2.0
    norm_cutoff = cutoff / nyquist

    # low pass filter
    numtaps = 255
    fil = firwin(numtaps, norm_cutoff)
    x_pad = np.pad(x, (numtaps, numtaps), 'edge')
    lpf_x = lfilter(fil, 1, x_pad)
    # drop the leading padding plus the group delay (numtaps // 2) and the
    # remaining trailing padding so that len(lpf_x) == len(x)
    lpf_x = lpf_x[numtaps + numtaps // 2: -numtaps // 2]

    return lpf_x


def convert_to_continuos_f0(f0):
    """CONVERT F0 TO CONTINUOUS F0.

    Unvoiced (zero) frames are filled by linear interpolation between the
    neighboring voiced frames; leading and trailing unvoiced frames take the
    first and the last voiced value. The input array is not modified.

    Args:
        f0 (ndarray): original f0 sequence with the shape (T,).

    Returns:
        ndarray: voiced/unvoiced flags (float32, 1 = voiced) with the shape (T,).
        ndarray: continuous f0 with the shape (T,). When all of the input
            values are 0, the input itself is returned unchanged.

    """
    # get uv information as binary
    uv = np.float32(f0 != 0)

    # nothing to interpolate when there is no voiced frame
    if (f0 == 0).all():
        logging.warning("all of the f0 values are 0.")
        return uv, f0

    # work on a copy so that the caller's array is left untouched
    f0 = np.array(f0, copy=True)

    # padding start and end of f0 sequence with the first / last voiced value
    voiced = np.flatnonzero(f0)
    start_idx, end_idx = voiced[0], voiced[-1]
    f0[:start_idx] = f0[start_idx]
    f0[end_idx:] = f0[end_idx]

    # get non-zero frame index
    nz_frames = np.where(f0 != 0)[0]

    # perform linear interpolation
    f = interp1d(nz_frames, f0[nz_frames])
    cont_f0 = f(np.arange(0, f0.shape[0]))

    return uv, cont_f0


def stft_mcep(x, fftl=512, shiftl=256, dim=25, alpha=0.41, window="hamming", is_padding=False):
    """EXTRACT STFT-BASED MEL-CEPSTRUM.

    Args:
        x (ndarray): Numpy double array with the size (T,).
        fftl (int): FFT length in point (default=512).
        shiftl (int): Shift length in point (default=256).
        dim (int): Dimension of mel-cepstrum (default=25).
        alpha (float): All pass filter coefficient (default=0.41).
        window (str): Analysis window type (default="hamming").
        is_padding (bool): Whether to pad the end of signal (default=False).

    Returns:
        ndarray: Mel-cepstrum with the size (N, dim + 1).

    Raises:
        ValueError: If the signal is too short to form a single frame.

    """
    import pysptk  # imported lazily: only needed for mel-cepstrum analysis

    # perform padding
    if is_padding:
        n_pad = fftl - (len(x) - fftl) % shiftl
        x = np.pad(x, (0, n_pad), 'reflect')

    # get number of frames
    n_frame = (len(x) - fftl) // shiftl + 1
    if n_frame < 1:
        raise ValueError(
            "signal length (%d) is shorter than fft length (%d)." % (len(x), fftl))

    # get window function
    win = get_window(window, fftl)

    # calculate mel-cepstrum of each windowed frame
    mcep = [pysptk.mcep(x[shiftl * i: shiftl * i + fftl] * win,
                        dim, alpha, eps=EPS, etype=1)
            for i in range(n_frame)]

    return np.stack(mcep)


def _load_wav(wav_name, args):
    """Read a wav file as float64, verify its sampling rate and apply the low cut filter."""
    fs, x = wavfile.read(wav_name)
    if x.dtype != np.int16:
        logging.warning("wav file format is not 16 bit PCM.")

    # check sampling frequency before spending time on filtering
    if not fs == args.fs:
        logging.error("sampling frequency is not matched.")
        sys.exit(1)

    x = np.array(x, dtype=np.float64)
    if args.highpass_cutoff != 0:
        x = low_cut_filter(x, fs, cutoff=args.highpass_cutoff)

    return fs, x


def _hdf5_path(hdf5dir, wav_name):
    """Return the hdf5 path in hdf5dir that corresponds to a wav file name."""
    utt_id = os.path.splitext(os.path.basename(wav_name))[0]
    return os.path.join(hdf5dir, utt_id + ".h5")


def _save_filtered_wav(x, fs, wav_name, args):
    """Write the filtered waveform as 16 bit PCM when filtering and saving are enabled."""
    if args.highpass_cutoff != 0 and args.save_wav:
        # clip first: casting out-of-range floats to int16 would wrap around
        int16 = np.iinfo(np.int16)
        wavfile.write(os.path.join(args.wavdir, os.path.basename(wav_name)),
                      fs, np.int16(np.clip(x, int16.min, int16.max)))


def world_feature_extract(wav_list, args):
    """EXTRACT WORLD FEATURE VECTOR.

    Args:
        wav_list (list): List of wav file names.
        args (Namespace): Parsed command line arguments (see ``get_parser``).

    """
    from sprocket.speech.feature_extractor import FeatureExtractor
    from wavenet_vocoder.utils import write_hdf5

    # define feature extractor
    feature_extractor = FeatureExtractor(
        analyzer="world",
        fs=args.fs,
        shiftms=args.shiftms,
        minf0=args.minf0,
        maxf0=args.maxf0,
        fftl=args.fftl)

    for i, wav_name in enumerate(wav_list):
        logging.info("now processing %s (%d/%d)" % (wav_name, i + 1, len(wav_list)))

        # load wavfile and apply low cut filter
        fs, x = _load_wav(wav_name, args)

        # extract features
        f0, _, _ = feature_extractor.analyze(x)
        uv, cont_f0 = convert_to_continuos_f0(f0)
        # smooth the f0 contour; its sampling rate is the frame rate
        cont_f0_lpf = low_pass_filter(cont_f0, int(1.0 / (args.shiftms * 0.001)), cutoff=20)
        codeap = feature_extractor.codeap()
        mcep = feature_extractor.mcep(dim=args.mcep_dim, alpha=args.mcep_alpha)

        # concatenate
        cont_f0_lpf = np.expand_dims(cont_f0_lpf, axis=-1)
        uv = np.expand_dims(uv, axis=-1)
        feats = np.concatenate([uv, cont_f0_lpf, mcep, codeap], axis=1)

        # save to hdf5
        write_hdf5(_hdf5_path(args.hdf5dir, wav_name), "/world", feats)

        # overwrite wav file
        _save_filtered_wav(x, fs, wav_name, args)


def melspectrogram_extract(wav_list, args):
    """EXTRACT MEL SPECTROGRAM.

    Args:
        wav_list (list): List of wav file names.
        args (Namespace): Parsed command line arguments (see ``get_parser``).

    """
    import librosa
    from wavenet_vocoder.utils import write_hdf5

    for i, wav_name in enumerate(wav_list):
        logging.info("now processing %s (%d/%d)" % (wav_name, i + 1, len(wav_list)))

        # load wavfile and apply low cut filter
        fs, x = _load_wav(wav_name, args)

        # extract features (the waveform is scaled by the int16 full scale)
        x_norm = x / (np.iinfo(np.int16).max + 1)
        shiftl = int(args.shiftms * fs * 0.001)
        # audio and sampling rate are passed by keyword, which is accepted by
        # both old and new librosa releases
        mspc = librosa.feature.melspectrogram(
            y=x_norm, sr=fs,
            n_fft=args.fftl,
            hop_length=shiftl,
            n_mels=args.mspc_dim,
            fmin=args.fmin if args.fmin is not None else 0,
            fmax=args.fmax if args.fmax is not None else fs // 2,
            power=1.0)
        # log-amplitude with a floor to avoid log of zero
        mspc = np.log10(np.maximum(EPS, mspc.T))

        # save to hdf5
        write_hdf5(_hdf5_path(args.hdf5dir, wav_name), "/melspc", np.float32(mspc))

        # overwrite wav file
        _save_filtered_wav(x, fs, wav_name, args)


def melcepstrum_extract(wav_list, args):
    """EXTRACT MEL CEPSTRUM.

    Args:
        wav_list (list): List of wav file names.
        args (Namespace): Parsed command line arguments (see ``get_parser``).

    """
    from wavenet_vocoder.utils import write_hdf5

    for i, wav_name in enumerate(wav_list):
        logging.info("now processing %s (%d/%d)" % (wav_name, i + 1, len(wav_list)))

        # load wavfile and apply low cut filter
        fs, x = _load_wav(wav_name, args)

        # extract features
        shiftl = int(args.shiftms * fs * 0.001)
        mcep = stft_mcep(x, args.fftl, shiftl, args.mcep_dim, args.mcep_alpha)

        # save to hdf5
        write_hdf5(_hdf5_path(args.hdf5dir, wav_name), "/mcep", np.float32(mcep))

        # overwrite wav file
        _save_filtered_wav(x, fs, wav_name, args)


def get_parser():
    """BUILD THE COMMAND LINE PARSER.

    Returns:
        ArgumentParser: Parser of the feature extraction options.

    """
    parser = argparse.ArgumentParser(
        description="making feature file configurations.")

    parser.add_argument(
        "--waveforms", default=None,
        help="directory or list of filename of input wavfile")
    parser.add_argument(
        "--hdf5dir", default=None,
        help="directory to save hdf5")
    parser.add_argument(
        "--wavdir", default=None,
        help="directory to save of preprocessed wav file")
    parser.add_argument(
        "--fs", default=16000,
        type=int, help="Sampling frequency")
    parser.add_argument(
        "--shiftms", default=5,
        type=float, help="Frame shift in msec")
    parser.add_argument(
        "--feature_type", default="world", choices=["world", "melspc", "mcep"],
        type=str, help="feature type")
    parser.add_argument(
        "--mspc_dim", default=80,
        type=int, help="Dimension of mel spectrogram")
    parser.add_argument(
        "--minf0", default=40,
        type=int, help="minimum f0 for world analysis")
    parser.add_argument(
        "--maxf0", default=400,
        type=int, help="maximum f0 for world analysis")
    # the recipes pass --fmin "" / --fmax "" to request the defaults, so an
    # empty string has to be accepted and mapped to None
    parser.add_argument(
        "--fmin", default=None, nargs="?",
        type=_int_or_none, help="minimum frequency for melspc")
    parser.add_argument(
        "--fmax", default=None, nargs="?",
        type=_int_or_none, help="maximum frequency for melspc")
    parser.add_argument(
        "--mcep_dim", default=24,
        type=int, help="Dimension of mel cepstrum")
    parser.add_argument(
        "--mcep_alpha", default=0.41,
        type=float, help="Alpha of mel cepstrum")
    parser.add_argument(
        "--fftl", default=1024,
        type=int, help="FFT length")
    parser.add_argument(
        "--highpass_cutoff", default=70,
        type=int, help="Cut off frequency in highpass filter (0 disables the filter)")
    parser.add_argument(
        "--save_wav", default=True,
        type=strtobool, help="Whether to save filtered wav file")
    parser.add_argument(
        "--n_jobs", default=10,
        type=int, help="number of parallel jobs")
    parser.add_argument(
        "--verbose", default=1,
        type=int, help="log message level")

    return parser


def main():
    """RUN FEATURE EXTRACTION IN PARALLEL."""
    from wavenet_vocoder.utils import find_files
    from wavenet_vocoder.utils import read_txt

    args = get_parser().parse_args()

    # set log level
    if args.verbose > 1:
        level = logging.DEBUG
    elif args.verbose == 1:
        level = logging.INFO
    else:
        level = logging.WARNING
    logging.basicConfig(level=level,
                        format='%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s',
                        datefmt='%m/%d/%Y %I:%M:%S')
    if args.verbose < 1:
        logging.warning("logging is disabled.")

    # show arguments
    for key, value in vars(args).items():
        logging.info("%s = %s" % (key, str(value)))

    # read list
    if os.path.isdir(args.waveforms):
        file_list = sorted(find_files(args.waveforms, "*.wav"))
    else:
        file_list = read_txt(args.waveforms)
    logging.info("number of utterances = %d" % len(file_list))

    # check directory existence (wavdir is only needed when wavs are saved)
    if args.highpass_cutoff != 0 and args.save_wav and not os.path.exists(args.wavdir):
        os.makedirs(args.wavdir)
    if not os.path.exists(args.hdf5dir):
        os.makedirs(args.hdf5dir)

    # divide list (empty chunks appear when there are fewer files than jobs)
    file_lists = [f_list.tolist()
                  for f_list in np.array_split(file_list, args.n_jobs)
                  if len(f_list) > 0]

    # multi processing
    target_fn = {
        "world": world_feature_extract,
        "melspc": melspectrogram_extract,
        "mcep": melcepstrum_extract,
    }[args.feature_type]
    processes = []
    for f in file_lists:
        p = mp.Process(target=target_fn, args=(f, args,))
        p.start()
        processes.append(p)

    # wait for all process
    for p in processes:
        p.join()

    # propagate worker failures so that the calling recipe stops
    n_failed = sum(p.exitcode != 0 for p in processes)
    if n_failed:
        logging.error("%d of %d worker processes failed." % (n_failed, len(processes)))
        sys.exit(1)


if __name__ == "__main__":
    main()
#!/bin/bash
############################################################
# SCRIPT TO BUILD SD WAVENET VOCODER #
############################################################

# Copyright 2017 Tomoki Hayashi (Nagoya University)
# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)

. ./path.sh || exit 1;
. ./cmd.sh || exit 1;

# USER SETTINGS {{{
#######################################
# STAGE SETTING #
#######################################
stage=0123456
# 0: data preparation step
# 1: feature extraction step
# 2: statistics calculation step
# 3: noise weighting step
# 4: training step
# 5: decoding step
# 6: noise shaping step

#######################################
# FEATURE SETTING #
#######################################
feature_type=melspc    # world or melspc (in this recipe fixed to "melspc")
spk=slt                # target speaker in arctic
shiftms=5              # shift length in msec
fftl=1024              # fft length
highpass_cutoff=70     # highpass filter cutoff frequency (if 0, will not apply)
fs=16000               # sampling rate
mspc_dim=80            # dimension of mel-spectrogram
mcep_dim=25            # dimension of mel-cepstrum
mcep_alpha=0.410       # alpha value of mel-cepstrum
fmin=""                # minimum frequency in melspc calculation
fmax=""                # maximum frequency in melspc calculation
use_noise_shaping=true # whether to use noise shaping
mag=0.5                # strength of noise shaping (0.0 < mag <= 1.0)
n_jobs=10              # number of parallel jobs

#######################################
# TRAINING SETTING #
#######################################
n_gpus=1                  # number of gpus
n_quantize=256            # number of quantization of waveform
n_aux=80                  # number of auxiliary features
n_resch=512               # number of residual channels
n_skipch=256              # number of skip channels
dilation_depth=10         # dilation depth (e.g. if set 10, max dilation = 2^(10-1))
dilation_repeat=3         # number of dilation repeats
kernel_size=2             # kernel size of dilated convolution
lr=1e-4                   # learning rate
weight_decay=0.0          # weight decay coef
iters=200000              # number of iterations
batch_length=20000        # batch length
batch_size=1              # batch size
checkpoint_interval=10000 # save model per this number
use_upsampling=true       # whether to use upsampling layer
resume=""                 # checkpoint path to resume (Optional)

#######################################
# DECODING SETTING #
#######################################
outdir=""            # directory to save decoded wav dir (Optional)
checkpoint=""        # checkpoint path to be used for decoding (Optional)
config=""            # model configuration path (Optional)
stats=""             # statistics path (Optional)
feats=""             # list or directory of feature files (Optional)
decode_batch_size=32 # batch size in decoding

#######################################
# OTHER SETTING #
#######################################
ARCTIC_DB_ROOT=downloads # directory including DB (if DB not exists, will be downloaded)
tag=""                   # tag for network directory naming (Optional)

# parse options
. parse_options.sh || exit 1;

# check feature type (this recipe only supports mel-spectrogram features)
if [ "${feature_type}" != "melspc" ]; then
    # diagnostics belong on stderr, so redirect stdout to stderr (>&2)
    echo "This recipe does not support feature_type=\"world\"." >&2
    echo "Please try the egs/arctic/sd." >&2
    exit 1;
fi

# set directory names
train=tr_${spk}
eval=ev_${spk}

# stop when error occurred
set -euo pipefail
# }}}


# STAGE 0 {{{
if echo ${stage} | grep -q 0; then
    echo "###########################################################"
    echo "# DATA PREPARATION STEP #"
    echo "###########################################################"
    if [ ! -e "${ARCTIC_DB_ROOT}/.done" ];then
        mkdir -p "${ARCTIC_DB_ROOT}"
        # download inside a subshell so that the working directory is restored
        # even when ARCTIC_DB_ROOT is a nested or an absolute path
        (
            cd "${ARCTIC_DB_ROOT}"
            for id in bdl slt rms clb jmk ksp awb;do
                wget http://festvox.org/cmu_arctic/cmu_arctic/packed/cmu_us_${id}_arctic-0.95-release.tar.bz2
                tar xf cmu_us_${id}*.tar.bz2
            done
            rm ./*.tar.bz2
        )
        touch "${ARCTIC_DB_ROOT}/.done"
        echo "database is successfully downloaded."
    fi
    mkdir -p data/local "data/${train}" "data/${eval}"
    find "${ARCTIC_DB_ROOT}/cmu_us_${spk}_arctic/wav" -name "*.wav" \
        | sort > "data/local/wav.${spk}.scp"
    # overwrite (not append) so that re-running this stage does not duplicate
    # the utterances of this single-speaker recipe
    head -n 1028 "data/local/wav.${spk}.scp" > "data/${train}/wav.scp"
    tail -n 104 "data/local/wav.${spk}.scp" > "data/${eval}/wav.scp"
    echo "making wav list for training is successfully done. (#training = $(wc -l < data/${train}/wav.scp))"
    echo "making wav list for evaluation is successfully done. (#evaluation = $(wc -l < data/${eval}/wav.scp))"
fi
# }}}


# STAGE 1 {{{
if echo ${stage} | grep -q 1; then
    echo "###########################################################"
    echo "# FEATURE EXTRACTION STEP #"
    echo "###########################################################"
    for set in ${train} ${eval};do
        # mel-spectrogram extraction (also writes the highpass filtered wav files)
        ${train_cmd} --num-threads ${n_jobs} exp/feature_extract/feature_extract_${feature_type}_${set}.log \
            feature_extract.py \
                --waveforms data/${set}/wav.scp \
                --wavdir wav_hpf/${set} \
                --hdf5dir hdf5/${set} \
                --feature_type ${feature_type} \
                --fs ${fs} \
                --shiftms ${shiftms} \
                --mspc_dim ${mspc_dim} \
                --highpass_cutoff ${highpass_cutoff} \
                --fftl ${fftl} \
                --fmin "${fmin}" \
                --fmax "${fmax}" \
                --n_jobs ${n_jobs}

        # extract stft-based mel-cepstrum for noise shaping (training set only)
        if [ ${set} = ${train} ] && ${use_noise_shaping};then
            ${train_cmd} --num-threads ${n_jobs} exp/feature_extract/feature_extract_mcep_${set}.log \
                feature_extract.py \
                    --waveforms data/${set}/wav.scp \
                    --wavdir wav_hpf/${set} \
                    --hdf5dir hdf5/${set} \
                    --feature_type mcep \
                    --fs ${fs} \
                    --shiftms ${shiftms} \
                    --mcep_dim ${mcep_dim} \
                    --mcep_alpha ${mcep_alpha} \
                    --highpass_cutoff ${highpass_cutoff} \
                    --save_wav false \
                    --fftl ${fftl} \
                    --n_jobs ${n_jobs}
        fi

        # check the number of feature files
        # (read via stdin so that wc prints only the count, not the file name)
        n_wavs=$(wc -l < data/${set}/wav.scp)
        n_feats=$(find hdf5/${set} -name "*.h5" | wc -l)
        echo "${n_feats}/${n_wavs} files are successfully processed."

        # make scp files
        if [ ${highpass_cutoff} -eq 0 ];then
            cp data/${set}/wav.scp data/${set}/wav_hpf.scp
        else
            find wav_hpf/${set} -name "*.wav" | sort > data/${set}/wav_hpf.scp
        fi
        find hdf5/${set} -name "*.h5" | sort > data/${set}/feats.scp
    done
fi
# }}}


# STAGE 2 {{{
if echo ${stage} | grep -q 2; then
    echo "###########################################################"
    echo "# CALCULATE STATISTICS STEP #"
    echo "###########################################################"
    ${train_cmd} exp/calculate_statistics/calc_stats_${feature_type}_${train}.log \
        calc_stats.py \
            --feats data/${train}/feats.scp \
            --stats data/${train}/stats.h5 \
            --feature_type ${feature_type}
    if ${use_noise_shaping};then
        ${train_cmd} exp/calculate_statistics/calc_stats_mcep_${train}.log \
            calc_stats.py \
                --feats data/${train}/feats.scp \
                --stats data/${train}/stats.h5 \
                --feature_type mcep
    fi
    echo "statistics are successfully calculated."
fi
# }}}


# STAGE 3 {{{
if echo ${stage} | grep -q 3 && ${use_noise_shaping}; then
    echo "###########################################################"
    echo "# NOISE WEIGHTING STEP #"
    echo "###########################################################"
    ${train_cmd} --num-threads ${n_jobs} exp/noise_shaping/noise_shaping_apply_mcep_${train}.log \
        noise_shaping.py \
            --waveforms data/${train}/wav_hpf.scp \
            --stats data/${train}/stats.h5 \
            --outdir wav_nwf/${train} \
            --feature_type mcep \
            --fs ${fs} \
            --shiftms ${shiftms} \
            --mcep_alpha ${mcep_alpha} \
            --mag ${mag} \
            --inv true \
            --n_jobs ${n_jobs}

    # check the number of noise weighted files
    n_wavs=$(wc -l < data/${train}/wav_hpf.scp)
    n_ns=$(find wav_nwf/${train} -name "*.wav" | wc -l)
    echo "${n_ns}/${n_wavs} files are successfully processed."

    # make scp files
    find wav_nwf/${train} -name "*.wav" | sort > data/${train}/wav_nwf.scp
fi
# }}}


# STAGE 4 {{{
# set variables
if [ -z "${tag}" ];then
    expdir=exp/tr_arctic_16k_sd_melspc_${spk}_nq${n_quantize}_na${n_aux}_nrc${n_resch}_nsc${n_skipch}_ks${kernel_size}_dp${dilation_depth}_dr${dilation_repeat}_lr${lr}_wd${weight_decay}_bl${batch_length}_bs${batch_size}
    if ${use_noise_shaping};then
        expdir=${expdir}_ns
    fi
    if ${use_upsampling};then
        expdir=${expdir}_up
    fi
else
    expdir=exp/tr_arctic_${tag}
fi
if echo ${stage} | grep -q 4; then
    echo "###########################################################"
    echo "# WAVENET TRAINING STEP #"
    echo "###########################################################"
    if ${use_noise_shaping};then
        waveforms=data/${train}/wav_nwf.scp
    else
        waveforms=data/${train}/wav_hpf.scp
    fi
    # frame shift in samples; bc truncates the quotient to an integer
    upsampling_factor=$(echo "${shiftms} * ${fs} / 1000" | bc)
    [ ! -e "${expdir}/log" ] && mkdir -p "${expdir}/log"
    [ ! -e "${expdir}/stats.h5" ] && cp -v data/${train}/stats.h5 "${expdir}"
    ${cuda_cmd} --gpu ${n_gpus} "${expdir}/log/${train}.log" \
        train.py \
            --n_gpus ${n_gpus} \
            --waveforms ${waveforms} \
            --feats data/${train}/feats.scp \
            --stats data/${train}/stats.h5 \
            --expdir "${expdir}" \
            --feature_type ${feature_type} \
            --n_quantize ${n_quantize} \
            --n_aux ${n_aux} \
            --n_resch ${n_resch} \
            --n_skipch ${n_skipch} \
            --dilation_depth ${dilation_depth} \
            --dilation_repeat ${dilation_repeat} \
            --kernel_size ${kernel_size} \
            --lr ${lr} \
            --weight_decay ${weight_decay} \
            --iters ${iters} \
            --batch_length ${batch_length} \
            --batch_size ${batch_size} \
            --checkpoint_interval ${checkpoint_interval} \
            --upsampling_factor "${upsampling_factor}" \
            --use_upsampling_layer ${use_upsampling} \
            --resume "${resume}"
fi
# }}}


# STAGE 5 {{{
# fill in the decoding defaults that were not given on the command line
[ -z "${outdir}" ] && outdir=${expdir}/wav
[ -z "${checkpoint}" ] && checkpoint=${expdir}/checkpoint-final.pkl
[ -z "${config}" ] && config=$(dirname "${checkpoint}")/model.conf
[ -z "${stats}" ] && stats=$(dirname "${checkpoint}")/stats.h5
[ -z "${feats}" ] && feats=data/${eval}/feats.scp
if echo ${stage} | grep -q 5; then
    echo "###########################################################"
    echo "# WAVENET DECODING STEP #"
    echo "###########################################################"
    ${cuda_cmd} --gpu ${n_gpus} "${outdir}/log/decode.log" \
        decode.py \
            --n_gpus ${n_gpus} \
            --feats ${feats} \
            --stats ${stats} \
            --outdir "${outdir}" \
            --checkpoint "${checkpoint}" \
            --config "${config}" \
            --fs ${fs} \
            --batch_size ${decode_batch_size}
fi
# }}}


# STAGE 6 {{{
if echo ${stage} | grep -q 6 && ${use_noise_shaping}; then
    echo "###########################################################"
    echo "# NOISE SHAPING STEP #"
    echo "###########################################################"
    find "${outdir}" -name "*.wav" | sort > "${outdir}/wav.scp"
    ${train_cmd} --num-threads ${n_jobs} exp/noise_shaping/noise_shaping_restore_mcep_${eval}.log \
        noise_shaping.py \
            --waveforms "${outdir}/wav.scp" \
            --stats ${stats} \
            --outdir "${outdir}_nsf" \
            --feature_type mcep \
            --fs ${fs} \
            --shiftms ${shiftms} \
            --mcep_alpha ${mcep_alpha} \
            --mag ${mag} \
            --n_jobs ${n_jobs} \
            --inv false
fi
# }}}
# 2017 Tomoki Hayashi (Nagoya University)
# Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)

# Recipe: speaker-dependent WaveNet vocoder on the M-AILABS speech corpus.
#
# The recipe is a sequence of numbered stages; a stage runs when its digit
# appears anywhere in ${stage} (e.g. stage=45 runs training and decoding only).
# Every variable in "USER SETTINGS" is assigned before parse_options.sh is
# sourced; presumably that script lets `--name value` command-line options
# override them -- confirm in utils/parse_options.sh.

. ./path.sh || exit 1;
. ./cmd.sh || exit 1;

# USER SETTINGS {{{
#######################################
#           STAGE SETTING             #
#######################################
stage=0123456
# 0: data preparation step
# 1: feature extraction step
# 2: statistics calculation step
# 3: noise weighting step
# 4: training step
# 5: decoding step
# 6: noise shaping step

#######################################
#          FEATURE SETTING            #
#######################################
feature_type=world     # world or melspc (in this recipe fixed to "world")
spk=elizabeth          # judy (F) or mary (F) or elliot (M) or elizabeth (F)
minf0=40               # minimum f0
maxf0=400              # maximum f0
shiftms=5              # shift length in msec
fftl=1024              # fft length
highpass_cutoff=70     # highpass filter cutoff frequency (if 0, will not apply)
fs=16000               # sampling rate
mcep_dim=24            # dimension of mel-cepstrum
mcep_alpha=0.410       # alpha value of mel-cepstrum
use_noise_shaping=true # whether to use noise shaping
mag=0.5                # strength of noise shaping (0.0 < mag <= 1.0)
n_jobs=10              # number of parallel jobs

#######################################
#          TRAINING SETTING           #
#######################################
n_gpus=1                  # number of gpus
n_quantize=256            # number of quantization of waveform
n_aux=28                  # number of auxiliary features
n_resch=512               # number of residual channels
n_skipch=256              # number of skip channels
dilation_depth=10         # dilation depth (e.g. if set 10, max dilation = 2^(10-1))
dilation_repeat=3         # number of dilation repeats
kernel_size=2             # kernel size of dilated convolution
lr=1e-4                   # learning rate
weight_decay=0.0          # weight decay coef
iters=200000              # number of iterations
batch_length=20000        # batch length
batch_size=1              # batch size
checkpoint_interval=10000 # save model per this number
use_upsampling=true       # whether to use upsampling layer
resume=""                 # checkpoint path to resume (Optional)

#######################################
#          DECODING SETTING           #
#######################################
outdir=""            # directory to save decoded wav dir (Optional)
checkpoint=""        # checkpoint path to be used for decoding (Optional)
config=""            # model configuration path (Optional)
stats=""             # statistics path (Optional)
feats=""             # list or directory of feature files (Optional)
decode_batch_size=32 # batch size in decoding

#######################################
#            OTHER SETTING            #
#######################################
DB_ROOT=downloads # directory including DB (if DB not exists, will be downloaded)
tag=""            # tag for network directory naming (Optional)

# parse options
. parse_options.sh || exit 1;

# check feature type
# (diagnostics go to stderr; the former `2>&1` left them on stdout)
if [ "${feature_type}" != "world" ]; then
    echo "This recipe does not support feature_type=\"melspc\"." >&2
    echo "Please try the egs/m-ailabs-speech/sd-melspc." >&2
    exit 1;
fi

# set directory names
train=tr_${spk}
eval=ev_${spk}

# stop when error occurred
set -euo pipefail
# }}}


# STAGE 0 {{{
if echo "${stage}" | grep -q 0; then
    echo "###########################################################"
    echo "# DATA PREPARATION STEP #"
    echo "###########################################################"
    if [ ! -e "${DB_ROOT}/.done" ]; then
        mkdir -p "${DB_ROOT}"
        # Download and unpack inside a subshell so the recipe's working
        # directory is untouched. A plain `cd ${DB_ROOT}` ... `cd ../` only
        # returns to the recipe directory when DB_ROOT is a single-level
        # relative path.
        (
            cd "${DB_ROOT}"
            wget http://www.caito.de/data/Training/stt_tts/en_US.tgz
            wget http://www.caito.de/data/Training/stt_tts/en_UK.tgz
            tar xzvf en_US.tgz
            tar xzvf en_UK.tgz
            rm ./*.tgz
        )
        # marker file: skips the download on later runs
        touch "${DB_ROOT}/.done"
        echo "database is successfully downloaded."
    fi
    mkdir -p data/local "data/${train}" "data/${eval}"

    # Per-speaker corpus directory and the filename pattern whose matching
    # utterances are held out as the evaluation set.
    case "${spk}" in
        elizabeth)
            spk_dir=${DB_ROOT}/en_UK/by_book/female/elizabeth_klett
            eval_pattern="wives_and_daughters_60_"
            ;;
        judy)
            spk_dir=${DB_ROOT}/en_US/by_book/female/judy_bieber
            # NOTE(review): "faries" spelling is kept as found -- confirm it
            # matches the corpus filenames; if nothing matches, grep exits 1
            # and `set -e` aborts the recipe.
            eval_pattern="the_sea_faries_22_"
            ;;
        mary)
            spk_dir=${DB_ROOT}/en_US/by_book/female/mary_ann
            eval_pattern="northandsouth_52_"
            ;;
        elliot)
            spk_dir=${DB_ROOT}/en_US/by_book/male/elliot_miller
            eval_pattern="silent_bullet_13_"
            ;;
        *)
            echo "ERROR: spk should be selected from elizabeth, judy, mary, and elliot" >&2
            exit 1
            ;;
    esac
    find "${spk_dir}" -name "*.wav" \
        | sort > "data/local/wav.${spk}.scp"
    grep -v "${eval_pattern}" "data/local/wav.${spk}.scp" > "data/${train}/wav.scp"
    grep "${eval_pattern}" "data/local/wav.${spk}.scp" > "data/${eval}/wav.scp"
    echo "making wav list for training is successfully done. (#training = $(wc -l < data/${train}/wav.scp))"
    echo "making wav list for evaluation is successfully done. (#evaluation = $(wc -l < data/${eval}/wav.scp))"
fi
# }}}


# STAGE 1 {{{
if echo "${stage}" | grep -q 1; then
    echo "###########################################################"
    echo "# FEATURE EXTRACTION STEP #"
    echo "###########################################################"
    for subset in ${train} ${eval}; do
        # feature extraction (run for both the training and evaluation sets)
        ${train_cmd} --num-threads ${n_jobs} exp/feature_extract/feature_extract_${subset}.log \
            feature_extract.py \
                --waveforms data/${subset}/wav.scp \
                --wavdir wav_hpf/${subset} \
                --hdf5dir hdf5/${subset} \
                --feature_type ${feature_type} \
                --fs ${fs} \
                --shiftms ${shiftms} \
                --minf0 ${minf0} \
                --maxf0 ${maxf0} \
                --mcep_dim ${mcep_dim} \
                --mcep_alpha ${mcep_alpha} \
                --highpass_cutoff ${highpass_cutoff} \
                --fftl ${fftl} \
                --n_jobs ${n_jobs}

        # check the number of feature files
        # (read via stdin so wc prints only the count, not "<count> <file>")
        n_wavs=$(wc -l < "data/${subset}/wav.scp")
        n_feats=$(find "hdf5/${subset}" -name "*.h5" | wc -l)
        echo "${n_feats}/${n_wavs} files are successfully processed."

        # make scp files
        if [ "${highpass_cutoff}" -eq 0 ]; then
            # no high-pass filtering requested: reuse the original wav list
            cp "data/${subset}/wav.scp" "data/${subset}/wav_hpf.scp"
        else
            find "wav_hpf/${subset}" -name "*.wav" | sort > "data/${subset}/wav_hpf.scp"
        fi
        find "hdf5/${subset}" -name "*.h5" | sort > "data/${subset}/feats.scp"
    done
fi
# }}}


# STAGE 2 {{{
if echo "${stage}" | grep -q 2; then
    echo "###########################################################"
    echo "# CALCULATE STATISTICS STEP #"
    echo "###########################################################"
    ${train_cmd} exp/calculate_statistics/calc_stats_${train}.log \
        calc_stats.py \
            --feats data/${train}/feats.scp \
            --stats data/${train}/stats.h5 \
            --feature_type ${feature_type}
    echo "statistics are successfully calculated."
fi
# }}}


# STAGE 3 {{{
if echo "${stage}" | grep -q 3 && ${use_noise_shaping}; then
    echo "###########################################################"
    echo "# NOISE WEIGHTING STEP #"
    echo "###########################################################"
    # NOTE(review): --mcep_dim_start/--mcep_dim_end presumably select the
    # mel-cepstrum columns of the world feature vector -- confirm against
    # noise_shaping.py / feature_extract.py.
    ${train_cmd} --num-threads ${n_jobs} exp/noise_shaping/noise_shaping_apply_${train}.log \
        noise_shaping.py \
            --waveforms data/${train}/wav_hpf.scp \
            --stats data/${train}/stats.h5 \
            --outdir wav_nwf/${train} \
            --feature_type ${feature_type} \
            --fs ${fs} \
            --shiftms ${shiftms} \
            --mcep_dim_start 2 \
            --mcep_dim_end $(( 2 + mcep_dim + 1 )) \
            --mcep_alpha ${mcep_alpha} \
            --mag ${mag} \
            --inv true \
            --n_jobs ${n_jobs}

    # check the number of noise-weighted files
    # (read via stdin so wc prints only the count, not "<count> <file>")
    n_wavs=$(wc -l < "data/${train}/wav_hpf.scp")
    n_ns=$(find "wav_nwf/${train}" -name "*.wav" | wc -l)
    echo "${n_ns}/${n_wavs} files are successfully processed."

    # make scp files
    find "wav_nwf/${train}" -name "*.wav" | sort > "data/${train}/wav_nwf.scp"
fi
# }}}


# STAGE 4 {{{
# set variables
# (expdir is computed outside the stage guard because the decoding defaults
#  of stage 5 are derived from it even when training is skipped)
if [ -z "${tag}" ]; then
    expdir=exp/tr_mai_16k_sd_${feature_type}_${spk}_nq${n_quantize}_na${n_aux}_nrc${n_resch}_nsc${n_skipch}_ks${kernel_size}_dp${dilation_depth}_dr${dilation_repeat}_lr${lr}_wd${weight_decay}_bl${batch_length}_bs${batch_size}
    if ${use_noise_shaping}; then
        expdir=${expdir}_ns
    fi
    if ${use_upsampling}; then
        expdir=${expdir}_up
    fi
else
    expdir=exp/tr_mai_16k_${tag}
fi
if echo "${stage}" | grep -q 4; then
    echo "###########################################################"
    echo "# WAVENET TRAINING STEP #"
    echo "###########################################################"
    # train on the noise-weighted waveforms when noise shaping is enabled
    if ${use_noise_shaping}; then
        waveforms=data/${train}/wav_nwf.scp
    else
        waveforms=data/${train}/wav_hpf.scp
    fi
    # number of waveform samples per feature frame
    upsampling_factor=$(echo "${shiftms} * ${fs} / 1000" | bc)
    mkdir -p "${expdir}/log"
    # keep a copy of the statistics next to the checkpoints; stage 5 looks
    # for stats.h5 in the checkpoint directory by default
    if [ ! -e "${expdir}/stats.h5" ]; then
        cp -v "data/${train}/stats.h5" "${expdir}"
    fi
    ${cuda_cmd} --gpu ${n_gpus} "${expdir}/log/${train}.log" \
        train.py \
            --n_gpus ${n_gpus} \
            --waveforms ${waveforms} \
            --feats data/${train}/feats.scp \
            --stats data/${train}/stats.h5 \
            --expdir "${expdir}" \
            --feature_type ${feature_type} \
            --n_quantize ${n_quantize} \
            --n_aux ${n_aux} \
            --n_resch ${n_resch} \
            --n_skipch ${n_skipch} \
            --dilation_depth ${dilation_depth} \
            --dilation_repeat ${dilation_repeat} \
            --kernel_size ${kernel_size} \
            --lr ${lr} \
            --weight_decay ${weight_decay} \
            --iters ${iters} \
            --batch_length ${batch_length} \
            --batch_size ${batch_size} \
            --checkpoint_interval ${checkpoint_interval} \
            --upsampling_factor "${upsampling_factor}" \
            --use_upsampling_layer ${use_upsampling} \
            --resume "${resume}"
fi
# }}}


# STAGE 5 {{{
# fill in decoding defaults for every option the user left empty
outdir=${outdir:-${expdir}/wav}
checkpoint=${checkpoint:-${expdir}/checkpoint-final.pkl}
config=${config:-$(dirname "${checkpoint}")/model.conf}
stats=${stats:-$(dirname "${checkpoint}")/stats.h5}
feats=${feats:-data/${eval}/feats.scp}
if echo "${stage}" | grep -q 5; then
    echo "###########################################################"
    echo "# WAVENET DECODING STEP #"
    echo "###########################################################"
    ${cuda_cmd} --gpu ${n_gpus} "${outdir}/log/decode.log" \
        decode.py \
            --n_gpus ${n_gpus} \
            --feats ${feats} \
            --stats ${stats} \
            --outdir "${outdir}" \
            --checkpoint "${checkpoint}" \
            --config "${config}" \
            --fs ${fs} \
            --batch_size ${decode_batch_size}
fi
# }}}


# STAGE 6 {{{
if echo "${stage}" | grep -q 6 && ${use_noise_shaping}; then
    echo "###########################################################"
    echo "# NOISE SHAPING STEP #"
    echo "###########################################################"
    find "${outdir}" -name "*.wav" | sort > "${outdir}/wav.scp"
    # same tool as stage 3 but with --inv false, applied to the decoded
    # waveforms; results are written to "${outdir}_nsf"
    ${train_cmd} --num-threads ${n_jobs} exp/noise_shaping/noise_shaping_restore_${eval}.log \
        noise_shaping.py \
            --waveforms "${outdir}/wav.scp" \
            --stats ${stats} \
            --outdir "${outdir}_nsf" \
            --feature_type ${feature_type} \
            --fs ${fs} \
            --shiftms ${shiftms} \
            --mcep_dim_start 2 \
            --mcep_dim_end $(( 2 + mcep_dim + 1 )) \
            --mcep_alpha ${mcep_alpha} \
            --mag ${mag} \
            --n_jobs ${n_jobs} \
            --inv false
fi
# }}}

--------------------------------------------------------------------------------