├── wavenet_vocoder
    ├── __init__.py
    ├── bin
    │   ├── __init__.py
    │   ├── calc_stats.py
    │   ├── noise_shaping.py
    │   ├── decode.py
    │   └── feature_extract.py
    ├── nets
    │   └── __init__.py
    └── utils
    │   ├── __init__.py
    │   ├── download_from_google_drive.sh
    │   ├── parse_options.sh
    │   ├── utils.py
    │   └── run.pl
├── egs
    ├── arctic
    │   ├── sd
    │   │   ├── conf
    │   │   │   ├── awb.f0
    │   │   │   ├── bdl.f0
    │   │   │   ├── clb.f0
    │   │   │   ├── jmk.f0
    │   │   │   ├── ksp.f0
    │   │   │   ├── rms.f0
    │   │   │   ├── slt.f0
    │   │   │   └── slurm.conf
    │   │   ├── path.sh
    │   │   ├── cmd.sh
    │   │   └── run.sh
    │   ├── sd-mini
    │   │   ├── conf
    │   │   │   ├── awb.f0
    │   │   │   ├── bdl.f0
    │   │   │   ├── clb.f0
    │   │   │   ├── jmk.f0
    │   │   │   ├── ksp.f0
    │   │   │   ├── rms.f0
    │   │   │   └── slt.f0
    │   │   ├── path.sh
    │   │   └── run.sh
    │   ├── si-close
    │   │   ├── conf
    │   │   │   ├── awb.f0
    │   │   │   ├── bdl.f0
    │   │   │   ├── clb.f0
    │   │   │   ├── jmk.f0
    │   │   │   ├── ksp.f0
    │   │   │   ├── rms.f0
    │   │   │   ├── slt.f0
    │   │   │   └── slurm.conf
    │   │   ├── path.sh
    │   │   └── cmd.sh
    │   ├── si-open
    │   │   ├── conf
    │   │   │   ├── awb.f0
    │   │   │   ├── bdl.f0
    │   │   │   ├── clb.f0
    │   │   │   ├── jmk.f0
    │   │   │   ├── ksp.f0
    │   │   │   ├── rms.f0
    │   │   │   ├── slt.f0
    │   │   │   └── slurm.conf
    │   │   ├── path.sh
    │   │   └── cmd.sh
    │   ├── sd-melspc
    │   │   ├── path.sh
    │   │   ├── conf
    │   │   │   └── slurm.conf
    │   │   ├── cmd.sh
    │   │   └── run.sh
    │   ├── si-close-melspc
    │   │   ├── path.sh
    │   │   ├── conf
    │   │   │   └── slurm.conf
    │   │   └── cmd.sh
    │   └── si-open-melspc
    │   │   ├── path.sh
    │   │   ├── conf
    │   │       └── slurm.conf
    │   │   └── cmd.sh
    ├── ljspeech
    │   ├── sd
    │   │   ├── path.sh
    │   │   ├── conf
    │   │   │   └── slurm.conf
    │   │   ├── cmd.sh
    │   │   └── run.sh
    │   └── sd-melspc
    │   │   ├── path.sh
    │   │   ├── conf
    │   │       └── slurm.conf
    │   │   ├── cmd.sh
    │   │   └── run.sh
    ├── m-ailabs-speech
    │   ├── sd
    │   │   ├── path.sh
    │   │   ├── conf
    │   │   │   └── slurm.conf
    │   │   ├── cmd.sh
    │   │   └── run.sh
    │   └── sd-melspc
    │   │   ├── path.sh
    │   │   ├── conf
    │   │       └── slurm.conf
    │   │   └── cmd.sh
    └── README.md
├── setup.cfg
├── .gitignore
├── tools
    └── Makefile
├── test
    ├── test_upsampling.py
    ├── test_preprocessing.py
    ├── test_generator.py
    └── test_wavenet.py
├── .travis.yml
├── setup.py
├── README.md
└── LICENSE


/wavenet_vocoder/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/wavenet_vocoder/bin/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/egs/arctic/sd/conf/awb.f0:
--------------------------------------------------------------------------------
1 | 65 210
2 | 


--------------------------------------------------------------------------------
/egs/arctic/sd/conf/bdl.f0:
--------------------------------------------------------------------------------
1 | 70 210
2 | 


--------------------------------------------------------------------------------
/egs/arctic/sd/conf/clb.f0:
--------------------------------------------------------------------------------
1 | 110 270
2 | 


--------------------------------------------------------------------------------
/egs/arctic/sd/conf/jmk.f0:
--------------------------------------------------------------------------------
1 | 60 210
2 | 


--------------------------------------------------------------------------------
/egs/arctic/sd/conf/ksp.f0:
--------------------------------------------------------------------------------
1 | 60 210
2 | 


--------------------------------------------------------------------------------
/egs/arctic/sd/conf/rms.f0:
--------------------------------------------------------------------------------
1 | 55 200
2 | 


--------------------------------------------------------------------------------
/egs/arctic/sd/conf/slt.f0:
--------------------------------------------------------------------------------
1 | 120 275
2 | 


--------------------------------------------------------------------------------
/egs/arctic/sd-mini/conf/awb.f0:
--------------------------------------------------------------------------------
1 | 65 210
2 | 


--------------------------------------------------------------------------------
/egs/arctic/sd-mini/conf/bdl.f0:
--------------------------------------------------------------------------------
1 | 70 210
2 | 


--------------------------------------------------------------------------------
/egs/arctic/sd-mini/conf/clb.f0:
--------------------------------------------------------------------------------
1 | 110 270
2 | 


--------------------------------------------------------------------------------
/egs/arctic/sd-mini/conf/jmk.f0:
--------------------------------------------------------------------------------
1 | 60 210
2 | 


--------------------------------------------------------------------------------
/egs/arctic/sd-mini/conf/ksp.f0:
--------------------------------------------------------------------------------
1 | 60 210
2 | 


--------------------------------------------------------------------------------
/egs/arctic/sd-mini/conf/rms.f0:
--------------------------------------------------------------------------------
1 | 55 200
2 | 


--------------------------------------------------------------------------------
/egs/arctic/sd-mini/conf/slt.f0:
--------------------------------------------------------------------------------
1 | 120 275
2 | 


--------------------------------------------------------------------------------
/egs/arctic/si-close/conf/awb.f0:
--------------------------------------------------------------------------------
1 | 65 210
2 | 


--------------------------------------------------------------------------------
/egs/arctic/si-close/conf/bdl.f0:
--------------------------------------------------------------------------------
1 | 70 210
2 | 


--------------------------------------------------------------------------------
/egs/arctic/si-close/conf/clb.f0:
--------------------------------------------------------------------------------
1 | 110 270
2 | 


--------------------------------------------------------------------------------
/egs/arctic/si-close/conf/jmk.f0:
--------------------------------------------------------------------------------
1 | 60 210
2 | 


--------------------------------------------------------------------------------
/egs/arctic/si-close/conf/ksp.f0:
--------------------------------------------------------------------------------
1 | 60 210
2 | 


--------------------------------------------------------------------------------
/egs/arctic/si-close/conf/rms.f0:
--------------------------------------------------------------------------------
1 | 55 200
2 | 


--------------------------------------------------------------------------------
/egs/arctic/si-close/conf/slt.f0:
--------------------------------------------------------------------------------
1 | 120 275
2 | 


--------------------------------------------------------------------------------
/egs/arctic/si-open/conf/awb.f0:
--------------------------------------------------------------------------------
1 | 65 210
2 | 


--------------------------------------------------------------------------------
/egs/arctic/si-open/conf/bdl.f0:
--------------------------------------------------------------------------------
1 | 70 210
2 | 


--------------------------------------------------------------------------------
/egs/arctic/si-open/conf/clb.f0:
--------------------------------------------------------------------------------
1 | 110 270
2 | 


--------------------------------------------------------------------------------
/egs/arctic/si-open/conf/jmk.f0:
--------------------------------------------------------------------------------
1 | 60 210
2 | 


--------------------------------------------------------------------------------
/egs/arctic/si-open/conf/ksp.f0:
--------------------------------------------------------------------------------
1 | 60 210
2 | 


--------------------------------------------------------------------------------
/egs/arctic/si-open/conf/rms.f0:
--------------------------------------------------------------------------------
1 | 55 200
2 | 


--------------------------------------------------------------------------------
/egs/arctic/si-open/conf/slt.f0:
--------------------------------------------------------------------------------
1 | 120 275
2 | 


--------------------------------------------------------------------------------
/wavenet_vocoder/nets/__init__.py:
--------------------------------------------------------------------------------
1 | from .wavenet import *  # NOQA
2 | 


--------------------------------------------------------------------------------
/wavenet_vocoder/utils/__init__.py:
--------------------------------------------------------------------------------
1 | from .utils import *  # NOQA
2 | 


--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
 1 | [tool:pytest]
 2 | addopts = --verbose
 3 | testpaths = test
 4 | 
 5 | [flake8]
 6 | ignore = H102,D100,D105,D107
 7 | # 100 is a workaround, 79 is good
 8 | max-line-length = 100
 9 | exclude = wavenet_vocoder/utils
10 | 


--------------------------------------------------------------------------------
/egs/arctic/sd/path.sh:
--------------------------------------------------------------------------------
1 | # Environment setup for this recipe. PRJ_ROOT is relative, so this presumably
2 | # has to be sourced from the recipe directory -- TODO confirm against run.sh.
3 | # The old LD_LIBRARY_PATH is appended only when non-empty: a trailing ':' would
4 | # add an empty entry, which the dynamic linker treats as the current directory.
5 | export LD_LIBRARY_PATH=/usr/local/cuda/lib64${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}
6 | export CUDA_HOME=/usr/local/cuda
7 | export PRJ_ROOT=../../..
8 | source "$PRJ_ROOT/tools/venv/bin/activate"
9 | export PATH=$PATH:$PRJ_ROOT/wavenet_vocoder/bin:$PRJ_ROOT/wavenet_vocoder/utils
10 | 


--------------------------------------------------------------------------------
/egs/arctic/sd-mini/path.sh:
--------------------------------------------------------------------------------
1 | # Environment setup for this recipe. PRJ_ROOT is relative, so this presumably
2 | # has to be sourced from the recipe directory -- TODO confirm against run.sh.
3 | # The old LD_LIBRARY_PATH is appended only when non-empty: a trailing ':' would
4 | # add an empty entry, which the dynamic linker treats as the current directory.
5 | export LD_LIBRARY_PATH=/usr/local/cuda/lib64${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}
6 | export CUDA_HOME=/usr/local/cuda
7 | export PRJ_ROOT=../../..
8 | source "$PRJ_ROOT/tools/venv/bin/activate"
9 | export PATH=$PATH:$PRJ_ROOT/wavenet_vocoder/bin:$PRJ_ROOT/wavenet_vocoder/utils
10 | 


--------------------------------------------------------------------------------
/egs/arctic/si-close/path.sh:
--------------------------------------------------------------------------------
1 | # Environment setup for this recipe. PRJ_ROOT is relative, so this presumably
2 | # has to be sourced from the recipe directory -- TODO confirm against run.sh.
3 | # The old LD_LIBRARY_PATH is appended only when non-empty: a trailing ':' would
4 | # add an empty entry, which the dynamic linker treats as the current directory.
5 | export LD_LIBRARY_PATH=/usr/local/cuda/lib64${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}
6 | export CUDA_HOME=/usr/local/cuda
7 | export PRJ_ROOT=../../..
8 | source "$PRJ_ROOT/tools/venv/bin/activate"
9 | export PATH=$PATH:$PRJ_ROOT/wavenet_vocoder/bin:$PRJ_ROOT/wavenet_vocoder/utils
10 | 


--------------------------------------------------------------------------------
/egs/arctic/si-open/path.sh:
--------------------------------------------------------------------------------
1 | # Environment setup for this recipe. PRJ_ROOT is relative, so this presumably
2 | # has to be sourced from the recipe directory -- TODO confirm against run.sh.
3 | # The old LD_LIBRARY_PATH is appended only when non-empty: a trailing ':' would
4 | # add an empty entry, which the dynamic linker treats as the current directory.
5 | export LD_LIBRARY_PATH=/usr/local/cuda/lib64${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}
6 | export CUDA_HOME=/usr/local/cuda
7 | export PRJ_ROOT=../../..
8 | source "$PRJ_ROOT/tools/venv/bin/activate"
9 | export PATH=$PATH:$PRJ_ROOT/wavenet_vocoder/bin:$PRJ_ROOT/wavenet_vocoder/utils
10 | 


--------------------------------------------------------------------------------
/egs/ljspeech/sd/path.sh:
--------------------------------------------------------------------------------
1 | # Environment setup for this recipe. PRJ_ROOT is relative, so this presumably
2 | # has to be sourced from the recipe directory -- TODO confirm against run.sh.
3 | # The old LD_LIBRARY_PATH is appended only when non-empty: a trailing ':' would
4 | # add an empty entry, which the dynamic linker treats as the current directory.
5 | export LD_LIBRARY_PATH=/usr/local/cuda/lib64${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}
6 | export CUDA_HOME=/usr/local/cuda
7 | export PRJ_ROOT=../../..
8 | source "$PRJ_ROOT/tools/venv/bin/activate"
9 | export PATH=$PATH:$PRJ_ROOT/wavenet_vocoder/bin:$PRJ_ROOT/wavenet_vocoder/utils
10 | 


--------------------------------------------------------------------------------
/egs/arctic/sd-melspc/path.sh:
--------------------------------------------------------------------------------
1 | # Environment setup for this recipe. PRJ_ROOT is relative, so this presumably
2 | # has to be sourced from the recipe directory -- TODO confirm against run.sh.
3 | # The old LD_LIBRARY_PATH is appended only when non-empty: a trailing ':' would
4 | # add an empty entry, which the dynamic linker treats as the current directory.
5 | export LD_LIBRARY_PATH=/usr/local/cuda/lib64${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}
6 | export CUDA_HOME=/usr/local/cuda
7 | export PRJ_ROOT=../../..
8 | source "$PRJ_ROOT/tools/venv/bin/activate"
9 | export PATH=$PATH:$PRJ_ROOT/wavenet_vocoder/bin:$PRJ_ROOT/wavenet_vocoder/utils
10 | 


--------------------------------------------------------------------------------
/egs/ljspeech/sd-melspc/path.sh:
--------------------------------------------------------------------------------
1 | # Environment setup for this recipe. PRJ_ROOT is relative, so this presumably
2 | # has to be sourced from the recipe directory -- TODO confirm against run.sh.
3 | # The old LD_LIBRARY_PATH is appended only when non-empty: a trailing ':' would
4 | # add an empty entry, which the dynamic linker treats as the current directory.
5 | export LD_LIBRARY_PATH=/usr/local/cuda/lib64${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}
6 | export CUDA_HOME=/usr/local/cuda
7 | export PRJ_ROOT=../../..
8 | source "$PRJ_ROOT/tools/venv/bin/activate"
9 | export PATH=$PATH:$PRJ_ROOT/wavenet_vocoder/bin:$PRJ_ROOT/wavenet_vocoder/utils
10 | 


--------------------------------------------------------------------------------
/egs/m-ailabs-speech/sd/path.sh:
--------------------------------------------------------------------------------
1 | # Environment setup for this recipe. PRJ_ROOT is relative, so this presumably
2 | # has to be sourced from the recipe directory -- TODO confirm against run.sh.
3 | # The old LD_LIBRARY_PATH is appended only when non-empty: a trailing ':' would
4 | # add an empty entry, which the dynamic linker treats as the current directory.
5 | export LD_LIBRARY_PATH=/usr/local/cuda/lib64${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}
6 | export CUDA_HOME=/usr/local/cuda
7 | export PRJ_ROOT=../../..
8 | source "$PRJ_ROOT/tools/venv/bin/activate"
9 | export PATH=$PATH:$PRJ_ROOT/wavenet_vocoder/bin:$PRJ_ROOT/wavenet_vocoder/utils
10 | 


--------------------------------------------------------------------------------
/egs/arctic/si-close-melspc/path.sh:
--------------------------------------------------------------------------------
1 | # Environment setup for this recipe. PRJ_ROOT is relative, so this presumably
2 | # has to be sourced from the recipe directory -- TODO confirm against run.sh.
3 | # The old LD_LIBRARY_PATH is appended only when non-empty: a trailing ':' would
4 | # add an empty entry, which the dynamic linker treats as the current directory.
5 | export LD_LIBRARY_PATH=/usr/local/cuda/lib64${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}
6 | export CUDA_HOME=/usr/local/cuda
7 | export PRJ_ROOT=../../..
8 | source "$PRJ_ROOT/tools/venv/bin/activate"
9 | export PATH=$PATH:$PRJ_ROOT/wavenet_vocoder/bin:$PRJ_ROOT/wavenet_vocoder/utils
10 | 


--------------------------------------------------------------------------------
/egs/arctic/si-open-melspc/path.sh:
--------------------------------------------------------------------------------
1 | # Environment setup for this recipe. PRJ_ROOT is relative, so this presumably
2 | # has to be sourced from the recipe directory -- TODO confirm against run.sh.
3 | # The old LD_LIBRARY_PATH is appended only when non-empty: a trailing ':' would
4 | # add an empty entry, which the dynamic linker treats as the current directory.
5 | export LD_LIBRARY_PATH=/usr/local/cuda/lib64${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}
6 | export CUDA_HOME=/usr/local/cuda
7 | export PRJ_ROOT=../../..
8 | source "$PRJ_ROOT/tools/venv/bin/activate"
9 | export PATH=$PATH:$PRJ_ROOT/wavenet_vocoder/bin:$PRJ_ROOT/wavenet_vocoder/utils
10 | 


--------------------------------------------------------------------------------
/egs/m-ailabs-speech/sd-melspc/path.sh:
--------------------------------------------------------------------------------
1 | # Environment setup for this recipe. PRJ_ROOT is relative, so this presumably
2 | # has to be sourced from the recipe directory -- TODO confirm against run.sh.
3 | # The old LD_LIBRARY_PATH is appended only when non-empty: a trailing ':' would
4 | # add an empty entry, which the dynamic linker treats as the current directory.
5 | export LD_LIBRARY_PATH=/usr/local/cuda/lib64${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}
6 | export CUDA_HOME=/usr/local/cuda
7 | export PRJ_ROOT=../../..
8 | source "$PRJ_ROOT/tools/venv/bin/activate"
9 | export PATH=$PATH:$PRJ_ROOT/wavenet_vocoder/bin:$PRJ_ROOT/wavenet_vocoder/utils
10 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | *.pyc
 2 | slurm-*.out
 3 | .DS_Store
 4 | exp/
 5 | snippet.sh
 6 | tools/venv
 7 | tools/sprocket
 8 | egs/*/*/data/
 9 | egs/*/*/exp/
10 | egs/*/*/wav/
11 | egs/*/*/wav_ns/
12 | egs/*/*/wav_hpf/
13 | egs/*/*/wav_nwf/
14 | egs/*/*/hdf5/
15 | egs/*/*/downloads/
16 | egs/*/*/downloads
17 | .pytest_cache/
18 | .eggs/
19 | *.egg-info/
20 | 


--------------------------------------------------------------------------------
/egs/arctic/sd/conf/slurm.conf:
--------------------------------------------------------------------------------
 1 | command sbatch --export=PATH  --ntasks-per-node=1
 2 | option time=* --time $0
 3 | option mem=* --mem-per-cpu $0
 4 | option mem=0
 5 | option num_threads=* --cpus-per-task $0 --ntasks-per-node=1
 6 | option num_threads=1 --cpus-per-task 1  --ntasks-per-node=1
 7 | default gpu=0
 8 | option gpu=0 -p all,hpc
 9 | option gpu=* -p hpc --gres=gpu:$0 --time 10-00:00:00
10 | 


--------------------------------------------------------------------------------
/egs/arctic/si-open/conf/slurm.conf:
--------------------------------------------------------------------------------
 1 | command sbatch --export=PATH  --ntasks-per-node=1
 2 | option time=* --time $0
 3 | option mem=* --mem-per-cpu $0
 4 | option mem=0
 5 | option num_threads=* --cpus-per-task $0 --ntasks-per-node=1
 6 | option num_threads=1 --cpus-per-task 1  --ntasks-per-node=1
 7 | default gpu=0
 8 | option gpu=0 -p all,hpc
 9 | option gpu=* -p hpc --gres=gpu:$0 --time 10-00:00:00
10 | 


--------------------------------------------------------------------------------
/egs/ljspeech/sd/conf/slurm.conf:
--------------------------------------------------------------------------------
 1 | command sbatch --export=PATH  --ntasks-per-node=1
 2 | option time=* --time $0
 3 | option mem=* --mem-per-cpu $0
 4 | option mem=0
 5 | option num_threads=* --cpus-per-task $0 --ntasks-per-node=1
 6 | option num_threads=1 --cpus-per-task 1  --ntasks-per-node=1
 7 | default gpu=0
 8 | option gpu=0 -p all,hpc
 9 | option gpu=* -p hpc --gres=gpu:$0 --time 10-00:00:00
10 | 


--------------------------------------------------------------------------------
/egs/arctic/sd-melspc/conf/slurm.conf:
--------------------------------------------------------------------------------
 1 | command sbatch --export=PATH  --ntasks-per-node=1
 2 | option time=* --time $0
 3 | option mem=* --mem-per-cpu $0
 4 | option mem=0
 5 | option num_threads=* --cpus-per-task $0 --ntasks-per-node=1
 6 | option num_threads=1 --cpus-per-task 1  --ntasks-per-node=1
 7 | default gpu=0
 8 | option gpu=0 -p all,hpc
 9 | option gpu=* -p hpc --gres=gpu:$0 --time 10-00:00:00
10 | 


--------------------------------------------------------------------------------
/egs/arctic/si-close/conf/slurm.conf:
--------------------------------------------------------------------------------
 1 | command sbatch --export=PATH  --ntasks-per-node=1
 2 | option time=* --time $0
 3 | option mem=* --mem-per-cpu $0
 4 | option mem=0
 5 | option num_threads=* --cpus-per-task $0 --ntasks-per-node=1
 6 | option num_threads=1 --cpus-per-task 1  --ntasks-per-node=1
 7 | default gpu=0
 8 | option gpu=0 -p all,hpc
 9 | option gpu=* -p hpc --gres=gpu:$0 --time 10-00:00:00
10 | 


--------------------------------------------------------------------------------
/egs/ljspeech/sd-melspc/conf/slurm.conf:
--------------------------------------------------------------------------------
 1 | command sbatch --export=PATH  --ntasks-per-node=1
 2 | option time=* --time $0
 3 | option mem=* --mem-per-cpu $0
 4 | option mem=0
 5 | option num_threads=* --cpus-per-task $0 --ntasks-per-node=1
 6 | option num_threads=1 --cpus-per-task 1  --ntasks-per-node=1
 7 | default gpu=0
 8 | option gpu=0 -p all,hpc
 9 | option gpu=* -p hpc --gres=gpu:$0 --time 10-00:00:00
10 | 


--------------------------------------------------------------------------------
/egs/m-ailabs-speech/sd/conf/slurm.conf:
--------------------------------------------------------------------------------
 1 | command sbatch --export=PATH  --ntasks-per-node=1
 2 | option time=* --time $0
 3 | option mem=* --mem-per-cpu $0
 4 | option mem=0
 5 | option num_threads=* --cpus-per-task $0 --ntasks-per-node=1
 6 | option num_threads=1 --cpus-per-task 1  --ntasks-per-node=1
 7 | default gpu=0
 8 | option gpu=0 -p all,hpc
 9 | option gpu=* -p hpc --gres=gpu:$0 --time 10-00:00:00
10 | 


--------------------------------------------------------------------------------
/egs/arctic/si-close-melspc/conf/slurm.conf:
--------------------------------------------------------------------------------
 1 | command sbatch --export=PATH  --ntasks-per-node=1
 2 | option time=* --time $0
 3 | option mem=* --mem-per-cpu $0
 4 | option mem=0
 5 | option num_threads=* --cpus-per-task $0 --ntasks-per-node=1
 6 | option num_threads=1 --cpus-per-task 1  --ntasks-per-node=1
 7 | default gpu=0
 8 | option gpu=0 -p all,hpc
 9 | option gpu=* -p hpc --gres=gpu:$0 --time 10-00:00:00
10 | 


--------------------------------------------------------------------------------
/egs/arctic/si-open-melspc/conf/slurm.conf:
--------------------------------------------------------------------------------
 1 | command sbatch --export=PATH  --ntasks-per-node=1
 2 | option time=* --time $0
 3 | option mem=* --mem-per-cpu $0
 4 | option mem=0
 5 | option num_threads=* --cpus-per-task $0 --ntasks-per-node=1
 6 | option num_threads=1 --cpus-per-task 1  --ntasks-per-node=1
 7 | default gpu=0
 8 | option gpu=0 -p all,hpc
 9 | option gpu=* -p hpc --gres=gpu:$0 --time 10-00:00:00
10 | 


--------------------------------------------------------------------------------
/egs/m-ailabs-speech/sd-melspc/conf/slurm.conf:
--------------------------------------------------------------------------------
 1 | command sbatch --export=PATH  --ntasks-per-node=1
 2 | option time=* --time $0
 3 | option mem=* --mem-per-cpu $0
 4 | option mem=0
 5 | option num_threads=* --cpus-per-task $0 --ntasks-per-node=1
 6 | option num_threads=1 --cpus-per-task 1  --ntasks-per-node=1
 7 | default gpu=0
 8 | option gpu=0 -p all,hpc
 9 | option gpu=* -p hpc --gres=gpu:$0 --time 10-00:00:00
10 | 


--------------------------------------------------------------------------------
/tools/Makefile:
--------------------------------------------------------------------------------
 1 | .PHONY: all clean
 2 | 
 3 | all: venv/bin/activate
 4 | # Create the python3.6 virtualenv, then install torch, this package and its test extras.
 5 | venv/bin/activate:
 6 | 	test -d venv || virtualenv -p python3.6 venv
 7 | 	. venv/bin/activate && pip install --upgrade pip
 8 | 	. venv/bin/activate && cd ../ &&  pip install torch==1.0.1 torchvision==0.2.2
 9 | 	. venv/bin/activate && cd ../ &&  pip install -e .
10 | 	. venv/bin/activate && cd ../ &&  pip install -e ".[test]"
11 | # Remove the virtualenv and compiled bytecode; find gets an explicit '.' for non-GNU find.
12 | clean:
13 | 	rm -fr venv
14 | 	find . -iname "*.pyc" -delete
15 | 


--------------------------------------------------------------------------------
/test/test_upsampling.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Copyright 2017 Tomoki Hayashi (Nagoya University)
 4 | #  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)
 5 | 
 6 | import numpy as np
 7 | import torch
 8 | 
 9 | from wavenet_vocoder.nets import initialize
10 | from wavenet_vocoder.nets import UpSampling
11 | 
12 | 
13 | def test_upsampling():
14 |     aux = np.random.randn(1, 28, 1000)
15 |     conv = UpSampling(10)
16 |     conv.apply(initialize)
17 |     batch = torch.from_numpy(aux).float()
18 |     out = conv(batch)
19 |     out = out.detach().numpy()
20 |     assert out.shape[-1] == aux.shape[-1] * 10
21 | 


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
 1 | language: python
 2 | 
 3 | cache:
 4 |   - pip
 5 |   - ccache
 6 | 
 7 | matrix:
 8 |   include:
 9 |     - os: linux
10 |       python: "3.6"
11 | 
12 | install:
13 |   - pip3 install -U pip wheel
14 |   - pip3 install numpy
15 |     # NOTE: use 1.0.1 for travis check because 1.1.0 > argmax behavior is strange
16 |   - pip3 install https://download.pytorch.org/whl/cpu/torch-1.0.1.post2-cp36-cp36m-linux_x86_64.whl
17 |   - pip3 install torchvision==0.2.2
18 |   - pip3 install -e .
19 |   - pip3 install -e .[test]
20 | 
21 | script:
22 |   - flake8 wavenet_vocoder test
23 |   - autopep8 -r wavenet_vocoder test --exclude wavenet_vocoder/utils --global-config .pep8 --diff --max-line-length 120 | tee check_autopep8
24 |   - test ! -s check_autopep8
25 |   - pytest
26 | 
27 | sudo: false
28 | 
29 | addons:
30 |   apt:
31 |     packages:
32 |       - cmake
33 |       - python3-dev
34 | 


--------------------------------------------------------------------------------
/egs/arctic/sd/cmd.sh:
--------------------------------------------------------------------------------
 1 | # you can change cmd.sh depending on what type of queue you are using.
 2 | # If you have no queueing system and want to run on a local machine, you
 3 | # can change all instances of 'queue.pl' to run.pl (but be careful and run
 4 | # commands one by one: most recipes will exhaust the memory on your
 5 | # machine).  queue.pl works with GridEngine (qsub).  slurm.pl works
 6 | # with slurm.  Different queues are configured differently, with different
 7 | # queue names and different ways of specifying things like memory;
 8 | # to account for these differences you can create and edit the file
 9 | # conf/queue.conf to match your queue's configuration.  Search for
10 | # conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information,
11 | # or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl.
12 | 
13 | # for local
14 | export train_cmd="run.pl"
15 | export cuda_cmd="run.pl --gpu 1"
16 | 
17 | # for slurm (you can change configuration file "conf/slurm.conf")
18 | # export train_cmd="slurm.pl --config conf/slurm.conf"
19 | # export cuda_cmd="slurm.pl --gpu 1 --config conf/slurm.conf"
20 | 


--------------------------------------------------------------------------------
/egs/arctic/sd-melspc/cmd.sh:
--------------------------------------------------------------------------------
 1 | # you can change cmd.sh depending on what type of queue you are using.
 2 | # If you have no queueing system and want to run on a local machine, you
 3 | # can change all instances of 'queue.pl' to run.pl (but be careful and run
 4 | # commands one by one: most recipes will exhaust the memory on your
 5 | # machine).  queue.pl works with GridEngine (qsub).  slurm.pl works
 6 | # with slurm.  Different queues are configured differently, with different
 7 | # queue names and different ways of specifying things like memory;
 8 | # to account for these differences you can create and edit the file
 9 | # conf/queue.conf to match your queue's configuration.  Search for
10 | # conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information,
11 | # or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl.
12 | 
13 | # for local
14 | export train_cmd="run.pl"
15 | export cuda_cmd="run.pl --gpu 1"
16 | 
17 | # for slurm (you can change configuration file "conf/slurm.conf")
18 | # export train_cmd="slurm.pl --config conf/slurm.conf"
19 | # export cuda_cmd="slurm.pl --gpu 1 --config conf/slurm.conf"
20 | 


--------------------------------------------------------------------------------
/egs/ljspeech/sd/cmd.sh:
--------------------------------------------------------------------------------
 1 | # you can change cmd.sh depending on what type of queue you are using.
 2 | # If you have no queueing system and want to run on a local machine, you
 3 | # can change all instances of 'queue.pl' to run.pl (but be careful and run
 4 | # commands one by one: most recipes will exhaust the memory on your
 5 | # machine).  queue.pl works with GridEngine (qsub).  slurm.pl works
 6 | # with slurm.  Different queues are configured differently, with different
 7 | # queue names and different ways of specifying things like memory;
 8 | # to account for these differences you can create and edit the file
 9 | # conf/queue.conf to match your queue's configuration.  Search for
10 | # conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information,
11 | # or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl.
12 | 
13 | # for local
14 | export train_cmd="run.pl"
15 | export cuda_cmd="run.pl --gpu 1"
16 | export max_jobs=1
17 | 
18 | # for slurm (you can change configuration file "conf/slurm.conf")
19 | # export train_cmd="slurm.pl --config conf/slurm.conf"
20 | # export cuda_cmd="slurm.pl --gpu 1 --config conf/slurm.conf"
21 | # export max_jobs=-1
22 | 


--------------------------------------------------------------------------------
/egs/arctic/si-close/cmd.sh:
--------------------------------------------------------------------------------
 1 | # you can change cmd.sh depending on what type of queue you are using.
 2 | # If you have no queueing system and want to run on a local machine, you
 3 | # can change all instances of 'queue.pl' to run.pl (but be careful and run
 4 | # commands one by one: most recipes will exhaust the memory on your
 5 | # machine).  queue.pl works with GridEngine (qsub).  slurm.pl works
 6 | # with slurm.  Different queues are configured differently, with different
 7 | # queue names and different ways of specifying things like memory;
 8 | # to account for these differences you can create and edit the file
 9 | # conf/queue.conf to match your queue's configuration.  Search for
10 | # conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information,
11 | # or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl.
12 | 
13 | # for local
14 | export train_cmd="run.pl"
15 | export cuda_cmd="run.pl --gpu 1"
16 | export max_jobs=1
17 | 
18 | # for slurm (you can change configuration file "conf/slurm.conf")
19 | # export train_cmd="slurm.pl --config conf/slurm.conf"
20 | # export cuda_cmd="slurm.pl --gpu 1 --config conf/slurm.conf"
21 | # export max_jobs=-1
22 | 


--------------------------------------------------------------------------------
/egs/arctic/si-open/cmd.sh:
--------------------------------------------------------------------------------
 1 | # you can change cmd.sh depending on what type of queue you are using.
 2 | # If you have no queueing system and want to run on a local machine, you
 3 | # can change all instances of 'queue.pl' to run.pl (but be careful and run
 4 | # commands one by one: most recipes will exhaust the memory on your
 5 | # machine).  queue.pl works with GridEngine (qsub).  slurm.pl works
 6 | # with slurm.  Different queues are configured differently, with different
 7 | # queue names and different ways of specifying things like memory;
 8 | # to account for these differences you can create and edit the file
 9 | # conf/queue.conf to match your queue's configuration.  Search for
10 | # conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information,
11 | # or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl.
12 | 
13 | # for local
14 | export train_cmd="run.pl"
15 | export cuda_cmd="run.pl --gpu 1"
16 | export max_jobs=1
17 | 
18 | # for slurm (you can change configuration file "conf/slurm.conf")
19 | # export train_cmd="slurm.pl --config conf/slurm.conf"
20 | # export cuda_cmd="slurm.pl --gpu 1 --config conf/slurm.conf"
21 | # export max_jobs=-1
22 | 


--------------------------------------------------------------------------------
/egs/arctic/si-close-melspc/cmd.sh:
--------------------------------------------------------------------------------
 1 | # you can change cmd.sh depending on what type of queue you are using.
 2 | # If you have no queueing system and want to run on a local machine, you
 3 | # can change all instances of 'queue.pl' to run.pl (but be careful and run
 4 | # commands one by one: most recipes will exhaust the memory on your
 5 | # machine).  queue.pl works with GridEngine (qsub).  slurm.pl works
 6 | # with slurm.  Different queues are configured differently, with different
 7 | # queue names and different ways of specifying things like memory;
 8 | # to account for these differences you can create and edit the file
 9 | # conf/queue.conf to match your queue's configuration.  Search for
10 | # conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information,
11 | # or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl.
12 | 
13 | # for local
14 | export train_cmd="run.pl"
15 | export cuda_cmd="run.pl --gpu 1"
16 | export max_jobs=1
17 | 
18 | # for slurm (you can change configuration file "conf/slurm.conf")
19 | # export train_cmd="slurm.pl --config conf/slurm.conf"
20 | # export cuda_cmd="slurm.pl --gpu 1 --config conf/slurm.conf"
21 | # export max_jobs=-1
22 | 


--------------------------------------------------------------------------------
/egs/arctic/si-open-melspc/cmd.sh:
--------------------------------------------------------------------------------
 1 | # you can change cmd.sh depending on what type of queue you are using.
 2 | # If you have no queueing system and want to run on a local machine, you
 3 | # can change all instances 'queue.pl' to run.pl (but be careful and run
 4 | # commands one by one: most recipes will exhaust the memory on your
 5 | # machine).  queue.pl works with GridEngine (qsub).  slurm.pl works
 6 | # with slurm.  Different queues are configured differently, with different
 7 | # queue names and different ways of specifying things like memory;
 8 | # to account for these differences you can create and edit the file
 9 | # conf/queue.conf to match your queue's configuration.  Search for
10 | # conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information,
11 | # or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl.
12 | 
13 | # for local (run.pl executes the jobs on the local machine, see the note at the top)
14 | export train_cmd="run.pl"
15 | export cuda_cmd="run.pl --gpu 1"  # same launcher, invoked with "--gpu 1"
16 | export max_jobs=1  # NOTE(review): presumably the limit on concurrent jobs; confirm where the recipe reads it
17 | 
18 | # for slurm (you can change configuration file "conf/slurm.conf")
19 | # export train_cmd="slurm.pl --config conf/slurm.conf"
20 | # export cuda_cmd="slurm.pl --gpu 1 --config conf/slurm.conf"
21 | # export max_jobs=-1
22 | 


--------------------------------------------------------------------------------
/egs/ljspeech/sd-melspc/cmd.sh:
--------------------------------------------------------------------------------
 1 | # you can change cmd.sh depending on what type of queue you are using.
 2 | # If you have no queueing system and want to run on a local machine, you
 3 | # can change all instances 'queue.pl' to run.pl (but be careful and run
 4 | # commands one by one: most recipes will exhaust the memory on your
 5 | # machine).  queue.pl works with GridEngine (qsub).  slurm.pl works
 6 | # with slurm.  Different queues are configured differently, with different
 7 | # queue names and different ways of specifying things like memory;
 8 | # to account for these differences you can create and edit the file
 9 | # conf/queue.conf to match your queue's configuration.  Search for
10 | # conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information,
11 | # or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl.
12 | 
13 | # for local (run.pl executes the jobs on the local machine, see the note at the top)
14 | export train_cmd="run.pl"
15 | export cuda_cmd="run.pl --gpu 1"  # same launcher, invoked with "--gpu 1"
16 | export max_jobs=1  # NOTE(review): presumably the limit on concurrent jobs; confirm where the recipe reads it
17 | 
18 | # for slurm (you can change configuration file "conf/slurm.conf")
19 | # export train_cmd="slurm.pl --config conf/slurm.conf"
20 | # export cuda_cmd="slurm.pl --gpu 1 --config conf/slurm.conf"
21 | # export max_jobs=-1
22 | 


--------------------------------------------------------------------------------
/egs/m-ailabs-speech/sd/cmd.sh:
--------------------------------------------------------------------------------
 1 | # you can change cmd.sh depending on what type of queue you are using.
 2 | # If you have no queueing system and want to run on a local machine, you
 3 | # can change all instances 'queue.pl' to run.pl (but be careful and run
 4 | # commands one by one: most recipes will exhaust the memory on your
 5 | # machine).  queue.pl works with GridEngine (qsub).  slurm.pl works
 6 | # with slurm.  Different queues are configured differently, with different
 7 | # queue names and different ways of specifying things like memory;
 8 | # to account for these differences you can create and edit the file
 9 | # conf/queue.conf to match your queue's configuration.  Search for
10 | # conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information,
11 | # or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl.
12 | 
13 | # for local (run.pl executes the jobs on the local machine, see the note at the top)
14 | export train_cmd="run.pl"
15 | export cuda_cmd="run.pl --gpu 1"  # same launcher, invoked with "--gpu 1"
16 | export max_jobs=1  # NOTE(review): presumably the limit on concurrent jobs; confirm where the recipe reads it
17 | 
18 | # for slurm (you can change configuration file "conf/slurm.conf")
19 | # export train_cmd="slurm.pl --config conf/slurm.conf"
20 | # export cuda_cmd="slurm.pl --gpu 1 --config conf/slurm.conf"
21 | # export max_jobs=-1
22 | 


--------------------------------------------------------------------------------
/egs/m-ailabs-speech/sd-melspc/cmd.sh:
--------------------------------------------------------------------------------
 1 | # you can change cmd.sh depending on what type of queue you are using.
 2 | # If you have no queueing system and want to run on a local machine, you
 3 | # can change all instances 'queue.pl' to run.pl (but be careful and run
 4 | # commands one by one: most recipes will exhaust the memory on your
 5 | # machine).  queue.pl works with GridEngine (qsub).  slurm.pl works
 6 | # with slurm.  Different queues are configured differently, with different
 7 | # queue names and different ways of specifying things like memory;
 8 | # to account for these differences you can create and edit the file
 9 | # conf/queue.conf to match your queue's configuration.  Search for
10 | # conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information,
11 | # or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl.
12 | 
13 | # for local (run.pl executes the jobs on the local machine, see the note at the top)
14 | export train_cmd="run.pl"
15 | export cuda_cmd="run.pl --gpu 1"  # same launcher, invoked with "--gpu 1"
16 | export max_jobs=1  # NOTE(review): presumably the limit on concurrent jobs; confirm where the recipe reads it
17 | 
18 | # for slurm (you can change configuration file "conf/slurm.conf")
19 | # export train_cmd="slurm.pl --config conf/slurm.conf"
20 | # export cuda_cmd="slurm.pl --gpu 1 --config conf/slurm.conf"
21 | # export max_jobs=-1
22 | 


--------------------------------------------------------------------------------
/wavenet_vocoder/utils/download_from_google_drive.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # Download compressed file from google drive
 4 | 
 5 | # Copyright 2019 Tomoki Hayashi
 6 | #  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)
 7 | 
 8 | share_url=$1  # sharing link of the file on google drive
 9 | download_dir=${2:-"downloads"}  # directory that receives the archive and its extracted contents
10 | file_ext=${3:-"tar.gz"}  # suffix given to the temporary archive; decompress() picks the extractor from the file name
11 | 
12 | if [ "$1" = "--help" ] || [ $# -lt 1 ] || [ $# -gt 3 ]; then
13 |    echo "Usage: $0 <share-url> [<download_dir> <file_ext>]";
14 |    echo "e.g.: $0 https://drive.google.com/open?id=<file_id> downloads tar.gz"
15 |    echo "Options:"
16 |    echo "    <download_dir>: directory to save downloaded file. (Default=downloads)"
17 |    echo "    <file_ext>: file extension of the file to be downloaded. (Default=tar.gz)"
18 |    if [ "$1" = "--help" ]; then
19 |        exit 0;
20 |    fi
21 |    exit 1;
22 | fi
23 | 
24 | [ ! -e "${download_dir}" ] && mkdir -p "${download_dir}"
25 | tmp=$(mktemp "${download_dir}/XXXXXX.${file_ext}")  # temporary archive, removed at the end of the script
26 | 
27 | # the file id in google drive can be obtained from the sharing link
28 | # ref: https://qiita.com/namakemono/items/c963e75e0af3f7eed732
29 | file_id=$(echo "${share_url}" | cut -d"=" -f 2)  # second "="-separated field of the link
30 | 
31 | # define decompressor
32 | decompress () {
33 |     filename=$1
34 |     decompress_dir=$2
35 |     if echo "${filename}" | grep -q ".zip"; then
36 |         unzip "${filename}" -d "${decompress_dir}"
37 |     elif echo "${filename}" | grep -q -e ".tar" -e ".tar.gz" -e ".tgz"; then
38 |         tar xvzf "${filename}" -C "${decompress_dir}"
39 |     else
40 |         echo "Unsupported file extension." >&2 && exit 1
41 |     fi
42 | }
43 | 
44 | # Try-catch like processing: extraction is attempted only when wget itself succeeded
45 | (
46 |     wget "https://drive.google.com/uc?export=download&id=${file_id}" -O "${tmp}" && decompress "${tmp}" "${download_dir}"
47 | ) || {
48 |     # Do not allow error from here
49 |     set -e
50 |     # wget from google drive sometimes fails because of the virus check confirmation;
51 |     # fetch the confirmation link with curl instead, see https://stackoverflow.com/questions/20665881/direct-download-from-google-drive-using-google-drive-api
52 |     cookies=$(mktemp) && page=$(mktemp)  # private temp files instead of fixed names in /tmp
53 |     curl -c "${cookies}" "https://drive.google.com/uc?export=download&id=${file_id}" > "${page}"
54 |     postfix=$(grep -Po 'uc-download-link" [^>]* href="\K[^"]*' "${page}" | sed 's/\&amp;/\&/g')
55 |     curl -L -b "${cookies}" "https://drive.google.com${postfix}" > "${tmp}"
56 |     rm -f "${cookies}" "${page}"
57 |     decompress "${tmp}" "${download_dir}"
58 | }
59 | 
60 | # remove tmpfiles
61 | rm "${tmp}"
62 | echo "Successfully downloaded zip file from ${share_url}"
63 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | # -*- coding: utf-8 -*-
 3 | 
 4 | import os
 5 | import pip
 6 | import sys
 7 | 
 8 | from distutils.version import LooseVersion
 9 | from setuptools import find_packages
10 | from setuptools import setup
11 | 
12 | 
13 | if LooseVersion(sys.version) < LooseVersion("3.6"):  # abort on interpreters older than 3.6
14 |     raise RuntimeError(
15 |         "Python>=3.6 is required, "
16 |         "but your Python is {}".format(sys.version))
17 | if LooseVersion(pip.__version__) < LooseVersion("19"):  # abort when the installed pip is older than 19
18 |     raise RuntimeError(
19 |         "pip>=19.0.0 is required, but your pip is {}. "
20 |         "Try again after \"pip install -U pip\"".format(pip.__version__))
21 | 
22 | requirements = {  # dependency groups, split below into the matching setup() arguments
23 |     "install": [
24 |         "h5py>=2.8.0",
25 |         "scikit-learn==0.22.2",
26 |         "librosa>=0.6.2",
27 |         "soundfile>=0.10.2",
28 |         "torch>=1.0.1",
29 |         "torchvision>=0.2.2",
30 |         "sprocket-vc>=0.18.2",
31 |         "matplotlib>=3.0.3",
32 |     ],
33 |     "setup": [
34 |         "numpy",
35 |         "pytest-runner"
36 |     ],
37 |     "test": [
38 |         "pytest>=3.3.0",
39 |         "hacking==1.1.0",
40 |         "autopep8==1.2.4",
41 |     ]}
42 | install_requires = requirements["install"]
43 | setup_requires = requirements["setup"]
44 | tests_require = requirements["test"]
45 | extras_require = {k: v for k, v in requirements.items()  # every group except "install" and "setup" (here: "test")
46 |                   if k not in ["install", "setup"]}
47 | 
48 | dirname = os.path.dirname(__file__)
49 | setup(name="wavenet_vocoder",
50 |       version="0.1.1",
51 |       url="http://github.com/kan-bayashi/PytorchWaveNetVocoder",
52 |       author="Tomoki Hayashi",
53 |       author_email="hayashi.tomoki@g.sp.m.is.nagoya-u.ac.jp",
54 |       description="Pytorch WaveNet Vocoder",
55 |       long_description=open(os.path.join(dirname, "README.md"),
56 |                             encoding="utf-8").read(),
57 |       license="Apache Software License",
58 |       packages=find_packages(include="wavenet_vocoder*"),
59 |       install_requires=install_requires,
60 |       setup_requires=setup_requires,
61 |       tests_require=tests_require,
62 |       extras_require=extras_require,
63 |       classifiers=[
64 |           "Programming Language :: Python",
65 |           "Programming Language :: Python :: 3.6",
66 |           "Intended Audience :: Science/Research",
67 |           "Operating System :: POSIX :: Linux",
68 |           "License :: OSI Approved :: Apache Software License",
69 |           "Topic :: Software Development :: Libraries :: Python Modules"],
70 |       )
71 | 


--------------------------------------------------------------------------------
/wavenet_vocoder/bin/calc_stats.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # -*- coding: utf-8 -*-
 3 | 
 4 | # Copyright 2017 Tomoki Hayashi (Nagoya University)
 5 | #  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)
 6 | 
 7 | import argparse
 8 | import logging
 9 | 
10 | import numpy as np
11 | 
12 | from sklearn.preprocessing import StandardScaler
13 | 
14 | from wavenet_vocoder.utils import read_hdf5
15 | from wavenet_vocoder.utils import read_txt
16 | from wavenet_vocoder.utils import write_hdf5
17 | 
18 | 
19 | def calc_stats(file_list, args):
20 |     """CALCULATE STATISTICS."""
21 |     scaler = StandardScaler()
22 | 
23 |     # process over all of data
24 |     for i, filename in enumerate(file_list):
25 |         logging.info("now processing %s (%d/%d)" % (filename, i + 1, len(file_list)))
26 |         feat = read_hdf5(filename, "/" + args.feature_type)
27 |         scaler.partial_fit(feat)
28 | 
29 |     # add uv term
30 |     mean = scaler.mean_
31 |     scale = scaler.scale_
32 |     if args.feature_type == "world":
33 |         mean[0] = 0.0
34 |         scale[0] = 1.0
35 | 
36 |     # write to hdf5
37 |     write_hdf5(args.stats, "/" + args.feature_type + "/mean", np.float32(mean))
38 |     write_hdf5(args.stats, "/" + args.feature_type + "/scale", np.float32(scale))
39 | 
40 | 
41 | def main():
42 |     """RUN CALCULATION OF STATISTICS."""
43 |     parser = argparse.ArgumentParser()
44 | 
45 |     parser.add_argument(
46 |         "--feats", default=None, required=True,
47 |         type=str, help="name of the list of hdf5 files")
48 |     parser.add_argument(
49 |         "--stats", default=None, required=True,
50 |         type=str, help="filename of hdf5 format")
51 |     parser.add_argument(
52 |         "--feature_type", default="world", choices=["world", "melspc", "mcep"],
53 |         type=str, help="feature type")
54 |     parser.add_argument(
55 |         "--verbose", default=1,
56 |         type=int, help="log message level")
57 | 
58 |     args = parser.parse_args()
59 | 
60 |     # set log level
61 |     if args.verbose == 1:
62 |         logging.basicConfig(level=logging.INFO,
63 |                             format='%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s',
64 |                             datefmt='%m/%d/%Y %I:%M:%S')
65 |     elif args.verbose > 1:
66 |         logging.basicConfig(level=logging.DEBUG,
67 |                             format='%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s',
68 |                             datefmt='%m/%d/%Y %I:%M:%S')
69 |     else:
70 |         logging.basicConfig(level=logging.WARNING,
71 |                             format='%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s',
72 |                             datefmt='%m/%d/%Y %I:%M:%S')
73 |         logging.warning("logging is disabled.")
74 | 
75 |     # show arguments
76 |     for key, value in vars(args).items():
77 |         logging.info("%s = %s" % (key, str(value)))
78 | 
79 |     # read file list
80 |     file_list = read_txt(args.feats)
81 |     logging.info("number of utterances = %d" % len(file_list))
82 | 
83 |     # calculate statistics
84 |     calc_stats(file_list, args)
85 | 
86 | 
87 | if __name__ == "__main__":
88 |     main()
89 | 


--------------------------------------------------------------------------------
/test/test_preprocessing.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | 
  3 | # Copyright 2017 Tomoki Hayashi (Nagoya University)
  4 | #  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)
  5 | 
  6 | import argparse
  7 | import os
  8 | import shutil
  9 | 
 10 | import numpy as np
 11 | import pytest
 12 | 
 13 | from scipy.io import wavfile
 14 | 
 15 | from wavenet_vocoder.bin.calc_stats import calc_stats
 16 | from wavenet_vocoder.bin.feature_extract import melcepstrum_extract
 17 | from wavenet_vocoder.bin.feature_extract import melspectrogram_extract
 18 | from wavenet_vocoder.bin.feature_extract import world_feature_extract
 19 | from wavenet_vocoder.bin.noise_shaping import convert_mcep_to_mlsa_coef
 20 | from wavenet_vocoder.bin.noise_shaping import noise_shaping
 21 | from wavenet_vocoder.utils import check_hdf5
 22 | from wavenet_vocoder.utils import find_files
 23 | from wavenet_vocoder.utils import read_hdf5
 24 | from wavenet_vocoder.utils import write_hdf5
 25 | 
 26 | 
 27 | def make_dummy_wav(name, maxlen=32000, fs=16000):
 28 |     length = np.random.randint(maxlen // 2, maxlen)
 29 |     x = np.random.randn(length)
 30 |     x = x / np.abs(x).max()
 31 |     x = np.int16(x * (np.iinfo(np.int16).max + 1))
 32 |     wavfile.write(name, fs, x)
 33 | 
 34 | 
 35 | def make_args(**kwargs):
 36 |     defaults = dict(
 37 |         hdf5dir="tmp/hdf5",
 38 |         wavdir="tmp/wav_filtered",
 39 |         outdir="tmp/wav_nwf",
 40 |         stats="tmp/stats.h5",
 41 |         feature_type="world",
 42 |         fs=16000,
 43 |         shiftms=5,
 44 |         minf0=40,
 45 |         maxf0=400,
 46 |         mspc_dim=80,
 47 |         mcep_dim=24,
 48 |         mcep_alpha=0.41,
 49 |         fftl=1024,
 50 |         highpass_cutoff=70,
 51 |         mcep_dim_start=2,
 52 |         mcep_dim_end=25,
 53 |         fmin=None,
 54 |         fmax=None,
 55 |         mag=0.5,
 56 |         save_wav=True,
 57 |         inv=False,
 58 |     )
 59 |     defaults.update(kwargs)
 60 |     return argparse.Namespace(**defaults)
 61 | 
 62 | 
 63 | @pytest.mark.parametrize("feature_type", [
 64 |     ("melspc"), ("world"), ("mcep"),
 65 | ])
 66 | def test_preprocessing(feature_type):
 67 |     # make arguments
 68 |     args = make_args(feature_type=feature_type)
 69 | 
 70 |     # prepare dummy wav files
 71 |     wavdir = "tmp/wav"
 72 |     if not os.path.exists(wavdir):
 73 |         os.makedirs(wavdir)
 74 |     for i in range(5):
 75 |         make_dummy_wav(wavdir + "/%d.wav" % i, 8000, args.fs)
 76 | 
 77 |     # feature extract
 78 |     wav_list = find_files(wavdir, "*.wav")
 79 |     if not os.path.exists(args.wavdir):
 80 |         os.makedirs(args.wavdir)
 81 |     if args.feature_type == "world":
 82 |         world_feature_extract(wav_list, args)
 83 |     elif args.feature_type == "melspc":
 84 |         melspectrogram_extract(wav_list, args)
 85 |     else:
 86 |         melcepstrum_extract(wav_list, args)
 87 | 
 88 |     # calc_stats
 89 |     file_list = find_files(args.hdf5dir, "*.h5")
 90 |     calc_stats(file_list, args)
 91 | 
 92 |     # noise shaping
 93 |     if feature_type != "melspc":
 94 |         wav_list = find_files(args.wavdir, "*.wav")
 95 |         if not os.path.exists(args.outdir):
 96 |             os.makedirs(args.outdir)
 97 |         if not check_hdf5(args.stats, "/mlsa/coef"):
 98 |             avg_mcep = read_hdf5(args.stats, args.feature_type + "/mean")
 99 |             if args.feature_type == "world":
100 |                 avg_mcep = avg_mcep[args.mcep_dim_start:args.mcep_dim_end]
101 |             mlsa_coef = convert_mcep_to_mlsa_coef(avg_mcep, args.mag, args.mcep_alpha)
102 |             write_hdf5(args.stats, "/mlsa/coef", mlsa_coef)
103 |             write_hdf5(args.stats, "/mlsa/alpha", args.mcep_alpha)
104 |         noise_shaping(wav_list, args)
105 | 
106 |     # remove
107 |     shutil.rmtree("tmp")
108 | 


--------------------------------------------------------------------------------
/wavenet_vocoder/utils/parse_options.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # Copyright 2012  Johns Hopkins University (Author: Daniel Povey);
 4 | #                 Arnab Ghoshal, Karel Vesely
 5 | 
 6 | # Licensed under the Apache License, Version 2.0 (the "License");
 7 | # you may not use this file except in compliance with the License.
 8 | # You may obtain a copy of the License at
 9 | #
10 | #  http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
13 | # KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
14 | # WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
15 | # MERCHANTABLITY OR NON-INFRINGEMENT.
16 | # See the Apache 2 License for the specific language governing permissions and
17 | # limitations under the License.
18 | 
19 | 
20 | # Parse command-line options.
21 | # To be sourced by another script (as in ". parse_options.sh").
22 | # Option format is: --option-name arg
23 | # and shell variable "option_name" gets set to value "arg."
24 | # The exception is --help, which takes no arguments, but prints the 
25 | # $help_message variable (if defined).
26 | 
27 | 
28 | ###
29 | ### The --config file options have lower priority to command line 
30 | ### options, so we need to import them first...
31 | ###
32 | 
33 | # Now import all the configs specified by command-line, in left-to-right order (a trailing --config without a value is not looked at)
34 | for ((argpos=1; argpos<$#; argpos++)); do
35 |   if [ "${!argpos}" == "--config" ]; then
36 |     argpos_plus1=$((argpos+1))
37 |     config=${!argpos_plus1}
38 |     [ ! -r "$config" ] && echo "$0: missing config '$config'" && exit 1  # quoted: the path may contain spaces
39 |     . "$config"  # source the config file.
40 |   fi
41 | done
42 | 
43 | 
44 | ###
45 | ### Now we process the command line options
46 | ###
47 | while true; do
48 |   [ -z "${1:-}" ] && break;  # break if there are no arguments
49 |   case "$1" in
50 |     # If the enclosing script is called with --help option, print the help 
51 |     # message and exit.  Scripts should put help messages in $help_message
52 |   --help|-h) if [ -z "$help_message" ]; then echo "No help found." 1>&2;
53 | 	  else printf "$help_message\n" 1>&2 ; fi; 
54 | 	  exit 0 ;; 
55 |   --*=*) echo "$0: options to scripts must be of the form --name value, got '$1'"
56 |        exit 1 ;;
57 |     # If the first command-line argument begins with "--" (e.g. --foo-bar), 
58 |     # then work out the variable name as $name, which will equal "foo_bar".
59 |   --*) name=`echo "$1" | sed s/^--// | sed s/-/_/g`; 
60 |     # Next we test whether the variable in question is undefined-- if so it's 
61 |     # an invalid option and we die.  Note: $0 evaluates to the name of the 
62 |     # enclosing script.
63 |     # The test [ -z ${foo_bar+xxx} ] will return true if the variable foo_bar
64 |     # is undefined.  We then have to wrap this test inside "eval" because 
65 |     # foo_bar is itself inside a variable ($name).
66 |       eval '[ -z "${'$name'+xxx}" ]' && echo "$0: invalid option $1" 1>&2 && exit 1;
67 |       
68 |       oldval="`eval echo \\$$name`";
69 |     # Work out whether we seem to be expecting a Boolean argument: this is
70 |       if [ "$oldval" == "true" ] || [ "$oldval" == "false" ]; then 
71 | 	was_bool=true;
72 |       else 
73 | 	was_bool=false;
74 |       fi
75 | 
76 |     # Set the variable to the right value-- the escaped quotes make it work if
77 |     # the option had spaces, like --cmd "queue.pl -sync y"
78 |       eval $name=\"$2\"; 
79 |         
80 |     # Check that Boolean-valued arguments are really Boolean.
81 |       if $was_bool && [[ "$2" != "true" && "$2" != "false" ]]; then
82 |         echo "$0: expected \"true\" or \"false\": $1 $2" 1>&2
83 |         exit 1;
84 |       fi
85 |       shift 2;
86 |       ;;
87 |   *) break;
88 |   esac
89 | done
90 | 
91 | 
92 | # Check for an empty argument to the --cmd option, which can easily occur as a 
93 | # result of scripting errors.
94 | [ ! -z "${cmd+xxx}" ] && [ -z "$cmd" ] && echo "$0: empty argument to --cmd option" 1>&2 && exit 1;
95 | 
96 | 
97 | true; # so this script returns exit code 0.
98 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 |  
  2 | 
  3 | ### I released a new implementation, [kan-bayashi/ParallelWaveGAN](https://github.com/kan-bayashi/ParallelWaveGAN). Please enjoy your hacking!
  4 | 
  5 | # PYTORCH-WAVENET-VOCODER
  6 | 
  7 | [![Build Status](https://travis-ci.org/kan-bayashi/PytorchWaveNetVocoder.svg?branch=master)](https://travis-ci.org/kan-bayashi/PytorchWaveNetVocoder)
  8 | 
  9 | This repository is a WaveNet vocoder implementation in PyTorch.
 10 | 
 11 | ![](https://kan-bayashi.github.io/WaveNetVocoderSamples/images/overview.bmp)
 12 | 
 13 | You can now try the demo recipe in Google Colab!
 14 | 
 15 | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/kan-bayashi/INTERSPEECH19_TUTORIAL/blob/master/notebooks/wavenet_vocoder/wavenet_vocoder.ipynb)
 16 | 
 17 | ## Key features
 18 | 
 19 | - Support kaldi-like recipe, easy to reproduce the results
 20 | - Support multi-gpu training / decoding
 21 | - Support world features / mel-spectrogram as auxiliary features
 22 | - Support recipes of three public databases
 23 | 
 24 |     - [CMU Arctic database](http://www.festvox.org/cmu_arctic/): `egs/arctic`
 25 |     - [LJ Speech database](https://keithito.com/LJ-Speech-Dataset/): `egs/ljspeech`
 26 |     - [M-AILABS speech database](http://www.m-ailabs.bayern/en/the-mailabs-speech-dataset/): `egs/m-ailabs-speech`
 27 | 
 28 | ## Requirements
 29 | 
 30 | - python 3.6+
 31 | - virtualenv
 32 | - cuda 9.0+
 33 | - cudnn 7.1+
 34 | - nccl 2.0+ (for the use of multi-gpus)
 35 | 
 36 | We recommend using a GPU with more than 10 GB of memory.
 37 | 
 38 | ## Setup
 39 | 
 40 | ### A. Make virtualenv
 41 | 
 42 | ```bash
 43 | $ git clone https://github.com/kan-bayashi/PytorchWaveNetVocoder.git
 44 | $ cd PytorchWaveNetVocoder/tools
 45 | $ make
 46 | ```
 47 | 
 48 | ### B. Install with pip
 49 | 
 50 | ```
 51 | $ git clone https://github.com/kan-bayashi/PytorchWaveNetVocoder.git
 52 | $ cd PytorchWaveNetVocoder
 53 | 
 54 | # pytorch 1.0.1 is recommended because it is the only version this code has been tested with
 55 | $ pip install torch==1.0.1 torchvision==0.2.2
 56 | $ pip install -e .
 57 | 
 58 | # please make dummy activate file to suppress warning in the recipe
 59 | $ mkdir -p tools/venv/bin && touch tools/venv/bin/activate
 60 | ```
 61 | 
 62 | ## How-to-run
 63 | 
 64 | ```bash
 65 | $ cd egs/arctic/sd
 66 | $ ./run.sh
 67 | ```
 68 | 
 69 | See [egs/README.md](egs/README.md) for more details on the recipes.
 70 | 
 71 | ## Results
 72 | 
 73 | You can listen to samples from [kan-bayashi/WaveNetVocoderSamples](https://kan-bayashi.github.io/WaveNetVocoderSamples/).
 74 | 
 75 | These are the subjective evaluation results obtained with the `arctic` recipe.
 76 | 
 77 | **Comparison between model types**
 78 | ![](https://kan-bayashi.github.io/WaveNetVocoderSamples/images/mos.bmp)
 79 | 
 80 | **Effect of the amount of training data**
 81 | ![](https://kan-bayashi.github.io/WaveNetVocoderSamples/images/mos_num_train.bmp)
 82 | 
 83 | If you want to listen to more samples, please access our Google Drive [here](https://drive.google.com/drive/folders/1zC1WDiMu4SOdc7UeOayoEe_79PdnPBu6?usp=sharing).
 84 | 
 85 | Here is the list of samples:
 86 | - `arctic_raw_16k`: original in arctic database
 87 | - `arctic_sd_16k_world`: sd model with world aux feats + noise shaping with world mcep
 88 | - `arctic_si-open_16k_world`: si-open model with world aux feats + noise shaping with world mcep
 89 | - `arctic_si-close_16k_world`: si-close model with world aux feats + noise shaping with world mcep
 90 | - `arctic_si-close_16k_melspc`: si-close model with mel-spectrogram aux feats
 91 | - `arctic_si-close_16k_melspc_ns`: si-close model with mel-spectrogram aux feats + noise shaping with stft mcep
 92 | - `ljspeech_raw_22.05k`: original in ljspeech database
 93 | - `ljspeech_sd_22.05k_world`: sd model with world aux feats + noise shaping with world mcep
 94 | - `ljspeech_sd_22.05k_melspc`: sd model with mel-spectrogram aux feats
 95 | - `ljspeech_sd_22.05k_melspc_ns`: sd model with mel-spectrogram aux feats + noise shaping with stft mcep
 96 | - `m-ailabs_raw_16k`: original in m-ailabs speech database
 97 | - `m-ailabs_sd_16k_melspc`: sd model with mel-spectrogram aux feats
 98 | 
 99 | ## References
100 | 
101 | Please cite the following articles.
102 | 
103 | ```
104 | @inproceedings{tamamori2017speaker,
105 |   title={Speaker-dependent WaveNet vocoder},
106 |   author={Tamamori, Akira and Hayashi, Tomoki and Kobayashi, Kazuhiro and Takeda, Kazuya and Toda, Tomoki},
107 |   booktitle={Proceedings of Interspeech},
108 |   pages={1118--1122},
109 |   year={2017}
110 | }
111 | @inproceedings{hayashi2017multi,
112 |   title={An Investigation of Multi-Speaker Training for WaveNet Vocoder},
113 |   author={Hayashi, Tomoki and Tamamori, Akira and Kobayashi, Kazuhiro and Takeda, Kazuya and Toda, Tomoki},
114 |   booktitle={Proc. ASRU 2017},
115 |   year={2017}
116 | }
117 | @article{hayashi2018sp,
118 |   title={複数話者WaveNetボコーダに関する調査},
119 |   author={林知樹 and 小林和弘 and 玉森聡 and 武田一哉 and 戸田智基},
120 |   journal={電子情報通信学会技術研究報告},
121 |   year={2018}
122 | }
123 | ```
124 | 
125 | ## Author
126 | 
127 | Tomoki Hayashi @ Nagoya University  
128 | e-mail:hayashi.tomoki@g.sp.m.is.nagoya-u.ac.jp
129 | 


--------------------------------------------------------------------------------
/egs/README.md:
--------------------------------------------------------------------------------
  1 | # Outline of recipes
  2 | 
  3 | Here we introduce the outline of the recipes.
  4 | 
  5 | If you want to learn step-by-step, you can try the demo recipe in Google colab!
  6 | 
  7 | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/kan-bayashi/INTERSPEECH19_TUTORIAL/blob/master/notebooks/wavenet_vocoder/wavenet_vocoder.ipynb)
  8 | 
  9 | ## Supported database
 10 | 
 11 | - [CMU Arctic database](http://www.festvox.org/cmu_arctic/): `egs/arctic`
 12 | - [LJ Speech database](https://keithito.com/LJ-Speech-Dataset/): `egs/ljspeech`
 13 | - [M-AILABS speech database](http://www.m-ailabs.bayern/en/the-mailabs-speech-dataset/): `egs/m-ailabs-speech`
 14 | 
 15 | ## Type of recipe
 16 | 
 17 | `sd`: speaker-dependent model
 18 | 
 19 | - build speaker dependent model
 20 | - the speaker of training data is the same as that of evaluation data
 21 | - auxiliary features are based on World analysis
 22 | - noise shaping with world mel-cepstrum is applied
 23 | 
 24 | `si-open`: speaker-independent model in open condition
 25 | 
 26 | - build speaker independent model in speaker-open condition
 27 | - the speakers of evaluation data do not include those of training data
 28 | - auxiliary features are based on World analysis
 29 | - noise shaping with world mel-cepstrum is applied
 30 | 
 31 | `si-close`: speaker-independent model in speaker-closed condition
 32 | 
 33 | - build speaker independent model in speaker-closed condition
 34 | - the speakers of evaluation data include those of training data
 35 | - auxiliary features are based on World analysis
 36 | - noise shaping with world mel-cepstrum is applied
 37 | 
 38 | `*-melspc`: model with mel-spectrogram
 39 | 
 40 | - build the model with mel-spectrogram
 41 | - auxiliary features are mel-spectrogram
 42 | - noise shaping with stft mel-cepstrum is applied
 43 | 
 44 | ## Flow of recipe
 45 | 
 46 | 0. data preparation (`stage 0`)
 47 | 1. auxiliary feature extraction (`stage 1`)
 48 | 2. statistics calculation (`stage 2`)
 49 | 3. noise weighting (`stage 3`)
 50 | 4. WaveNet training (`stage 4`)
 51 | 5. WaveNet decoding (`stage 5`)
 52 | 6. noise shaping (`stage 6`)
 53 | 
 54 | ## How-to-run
 55 | 
 56 | ```bash
 57 | # change directory to one of the recipes
 58 | $ cd arctic/sd
 59 | 
 60 | # run the recipe
 61 | $ ./run.sh
 62 | 
 63 | # you can skip some stages (in this case only stage 4,5,6 will be conducted)
 64 | $ ./run.sh --stage 456
 65 | 
 66 | # you can also change hyperparameters via command line
 67 | $ ./run.sh --lr 1e-3 --batch_length 10000
 68 | 
 69 | # multi-gpu training / decoding are supported (batch size should be greater than #gpus)
 70 | $ ./run.sh --n_gpus 3 --batch_size 3
 71 | ```
 72 | 
 73 | ## Run recipe with slurm
 74 | 
 75 | If slurm is installed in your servers, you can run recipes with slurm.
 76 | 
 77 | ```bash
 78 | $ cd egs/arctic/sd
 79 | 
 80 | # edit configuration
 81 | $ vim cmd.sh
 82 | # please edit as follows
 83 | -- cmd.sh --
 84 | # for local
 85 | # export train_cmd="run.pl"
 86 | # export cuda_cmd="run.pl --gpu 1"
 87 | 
 88 | # for slurm (you can change configuration file "conf/slurm.conf")
 89 | export train_cmd="slurm.pl --config conf/slurm.conf"
 90 | export cuda_cmd="slurm.pl --gpu 1 --config conf/slurm.conf"
 91 | 
 92 | $ vim conf/slurm.conf
 93 | # edit <your_partition_name>
 94 | -- slurm.conf --
 95 | command sbatch --export=PATH  --ntasks-per-node=1
 96 | option time=* --time $0
 97 | option mem=* --mem-per-cpu $0
 98 | option mem=0
 99 | option num_threads=* --cpus-per-task $0 --ntasks-per-node=1
100 | option num_threads=1 --cpus-per-task 1  --ntasks-per-node=1
101 | default gpu=0
102 | option gpu=0 -p <your_partition_name>
103 | option gpu=* -p <your_partition_name> --gres=gpu:$0 --time 10-00:00:00
104 | 
105 | # run the recipe
106 | $ ./run.sh
107 | ```
108 | 
109 | If you want to know more info about `run.pl` and `slurm.pl`, see [https://kaldi-asr.org/doc/queue.html](https://kaldi-asr.org/doc/queue.html).
110 | 
111 | ## Use pre-trained model to decode your own data
112 | 
113 | To synthesize your own data, you need the following:
114 | 
115 | ```
116 | - checkpoint-final.pkl (model parameter file)
117 | - model.conf (model configuration file)
118 | - stats.h5 (feature statistics file)
119 | - *.wav (your own wav file, should be 16000 Hz)
120 | ```
121 | 
122 | The procedure is as follows:
123 | 
124 | ```bash
125 | $ cd egs/arctic/si-close
126 | 
127 | # download pre-trained model which was trained with 6 arctic speakers and world features
128 | $ wget "https://www.dropbox.com/s/xt7qqmfgamwpqqg/si-close_lr1e-4_wd0_bs20k_ns_up.zip?dl=0" -O si-close_lr1e-4_wd0_bs20k_ns_up.zip
129 | 
130 | # unzip
131 | $ unzip si-close_lr1e-4_wd0_bs20k_ns_up.zip
132 | 
133 | # make filelist of your own wav files
134 | $ find <your_wav_dir> -name "*.wav" > wav.scp
135 | 
136 | # feature extraction
137 | $ . ./path.sh
138 | $ feature_extract.py \
139 |     --waveforms wav.scp \
140 |     --wavdir wav/test \
141 |     --hdf5dir hdf5/test \
142 |     --feature_type world \
143 |     --fs 16000 \
144 |     --shiftms 5 \
145 |     --minf0 <set_appropriate_value> \
146 |     --maxf0 <set_appropriate_value> \
147 |     --mcep_dim 24 \
148 |     --mcep_alpha 0.41 \
149 |     --highpass_cutoff 70 \
150 |     --fftl 1024 \
151 |     --n_jobs 1
152 | 
153 | # make filelist of feature file
154 | $ find hdf5/test -name "*.h5" > feats.scp
155 | 
156 | # decode with pre-trained model
157 | $ decode.py \
158 |     --feats feats.scp \
159 |     --stats si-close_lr1e-4_wd0_bs20k_ns_up/stats.h5 \
160 |     --outdir si-close_lr1e-4_wd0_bs20k_ns_up/wav \
161 |     --checkpoint si-close_lr1e-4_wd0_bs20k_ns_up/checkpoint-final.pkl \
162 |     --config si-close_lr1e-4_wd0_bs20k_ns_up/model.conf \
163 |     --fs 16000 \
164 |     --batch_size 32 \
165 |     --n_gpus 1
166 | 
167 | # make filelist of generated wav file
168 | $ find si-close_lr1e-4_wd0_bs20k_ns_up/wav -name "*.wav" > wav_generated.scp
169 | 
170 | # perform noise shaping
171 | $ noise_shaping.py \
172 |     --waveforms wav_generated.scp \
173 |     --stats si-close_lr1e-4_wd0_bs20k_ns_up/stats.h5 \
174 |     --outdir si-close_lr1e-4_wd0_bs20k_ns_up/wav_nsf \
175 |     --feature_type world \
176 |     --fs 16000 \
177 |     --shiftms 5 \
178 |     --mcep_dim_start 2 \
179 |     --mcep_dim_end 27 \
180 |     --mcep_alpha 0.41 \
181 |     --mag 0.5 \
182 |     --inv false \
183 |     --n_jobs 1
184 | ```
185 | 
186 | Finally, you can hear the generated wav files in `si-close_lr1e-4_wd0_bs20k_ns_up/wav_nsf`.
187 | 
188 | ## Author
189 | 
190 | Tomoki Hayashi @ Nagoya University  
191 | e-mail:hayashi.tomoki@g.sp.m.is.nagoya-u.ac.jp
192 | 


--------------------------------------------------------------------------------
/wavenet_vocoder/bin/noise_shaping.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # -*- coding: utf-8 -*-
  3 | 
  4 | # Copyright 2017 Tomoki Hayashi (Nagoya University)
  5 | #  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)
  6 | 
  7 | import argparse
  8 | import logging
  9 | import multiprocessing as mp
 10 | import os
 11 | import sys
 12 | 
 13 | from distutils.util import strtobool
 14 | 
 15 | import numpy as np
 16 | import pysptk
 17 | 
 18 | from scipy.io import wavfile
 19 | 
 20 | from wavenet_vocoder.utils import check_hdf5
 21 | from wavenet_vocoder.utils import find_files
 22 | from wavenet_vocoder.utils import read_hdf5
 23 | from wavenet_vocoder.utils import read_txt
 24 | from wavenet_vocoder.utils import write_hdf5
 25 | 
 26 | 
 27 | def convert_mcep_to_mlsa_coef(avg_mcep, mag, alpha):
 28 |     """CONVERT AVERAGE MEL-CEPTSRUM TO MLSA FILTER COEFFICIENT.
 29 | 
 30 |     Args:
 31 |         avg_mcep (ndarray): Averaged Mel-cepstrum (D,).
 32 |         mag (float): Magnification of noise shaping.
 33 |         alpha (float): All pass constant value.
 34 | 
 35 |     Return:
 36 |         ndarray: MLSA filter coefficient (D,).
 37 | 
 38 |     """
 39 |     avg_mcep *= mag
 40 |     avg_mcep[0] = 0.0
 41 |     coef = pysptk.mc2b(avg_mcep.astype(np.float64), alpha)
 42 |     assert np.isfinite(coef).all()
 43 |     return coef
 44 | 
 45 | 
 46 | def noise_shaping(wav_list, args):
 47 |     """APPLY NOISE SHAPING BASED ON MLSA FILTER."""
 48 |     # load coefficient of filter
 49 |     if check_hdf5(args.stats, "/mlsa/coef"):
 50 |         mlsa_coef = read_hdf5(args.stats, "/mlsa/coef")
 51 |         alpha = read_hdf5(args.stats, "/mlsa/alpha")
 52 |     else:
 53 |         raise KeyError("\"/mlsa/coef\" is not found in %s." % (args.stats))
 54 |     if args.inv:
 55 |         mlsa_coef *= -1.0
 56 | 
 57 |     # define synthesizer
 58 |     shiftl = int(args.fs / 1000 * args.shiftms)
 59 |     synthesizer = pysptk.synthesis.Synthesizer(
 60 |         pysptk.synthesis.MLSADF(
 61 |             order=mlsa_coef.shape[0] - 1,
 62 |             alpha=alpha),
 63 |         hopsize=shiftl
 64 |     )
 65 | 
 66 |     for i, wav_name in enumerate(wav_list):
 67 |         logging.info("now processing %s (%d/%d)" % (wav_name, i + 1, len(wav_list)))
 68 | 
 69 |         # load wavfile and apply low cut filter
 70 |         fs, x = wavfile.read(wav_name)
 71 |         if x.dtype != np.int16:
 72 |             logging.warning("wav file format is not 16 bit PCM.")
 73 |         x = np.float64(x)
 74 | 
 75 |         # check sampling frequency
 76 |         if not fs == args.fs:
 77 |             logging.error("sampling frequency is not matched.")
 78 |             sys.exit(1)
 79 | 
 80 |         # replicate coef for time-invariant filtering
 81 |         num_frames = int(len(x) / shiftl) + 1
 82 |         mlsa_coefs = np.float64(np.tile(mlsa_coef, [num_frames, 1]))
 83 | 
 84 |         # synthesis and write
 85 |         x_ns = synthesizer.synthesis(x, mlsa_coefs)
 86 |         write_name = args.outdir + "/" + os.path.basename(wav_name)
 87 |         wavfile.write(write_name, args.fs, np.int16(x_ns))
 88 | 
 89 | 
 90 | def main():
 91 |     """RUN NOISE SHAPING IN PARALLEL."""
 92 |     parser = argparse.ArgumentParser(
 93 |         description="making feature file argsurations.")
 94 | 
 95 |     parser.add_argument(
 96 |         "--waveforms", default=None,
 97 |         help="directory or list of filename of input wavfile")
 98 |     parser.add_argument(
 99 |         "--stats", default=None,
100 |         help="filename of hdf5 format")
101 |     parser.add_argument(
102 |         "--outdir", default=None,
103 |         help="directory to save preprocessed wav file")
104 |     parser.add_argument(
105 |         "--fs", default=16000,
106 |         type=int, help="Sampling frequency")
107 |     parser.add_argument(
108 |         "--shiftms", default=5,
109 |         type=float, help="Frame shift in msec")
110 |     parser.add_argument(
111 |         "--feature_type", default="world", choices=["world", "mcep", "melspc"],
112 |         type=str, help="feature type")
113 |     parser.add_argument(
114 |         "--mcep_dim_start", default=2,
115 |         type=int, help="Start index of mel cepstrum")
116 |     parser.add_argument(
117 |         "--mcep_dim_end", default=27,
118 |         type=int, help="End index of mel cepstrum")
119 |     parser.add_argument(
120 |         "--mcep_alpha", default=0.41,
121 |         type=float, help="Alpha of mel cepstrum")
122 |     parser.add_argument(
123 |         "--mag", default=0.5,
124 |         type=float, help="magnification of noise shaping")
125 |     parser.add_argument(
126 |         "--verbose", default=1,
127 |         type=int, help="log message level")
128 |     parser.add_argument(
129 |         '--n_jobs', default=10,
130 |         type=int, help="number of parallel jobs")
131 |     parser.add_argument(
132 |         '--inv', default=False, type=strtobool,
133 |         help="if True, inverse filtering will be performed")
134 | 
135 |     args = parser.parse_args()
136 | 
137 |     # set log level
138 |     if args.verbose == 1:
139 |         logging.basicConfig(level=logging.INFO,
140 |                             format='%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s',
141 |                             datefmt='%m/%d/%Y %I:%M:%S')
142 |     elif args.verbose > 1:
143 |         logging.basicConfig(level=logging.DEBUG,
144 |                             format='%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s',
145 |                             datefmt='%m/%d/%Y %I:%M:%S')
146 |     else:
147 |         logging.basicConfig(level=logging.WARNING,
148 |                             format='%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s',
149 |                             datefmt='%m/%d/%Y %I:%M:%S')
150 |         logging.warning("logging is disabled.")
151 | 
152 |     # show arguments
153 |     for key, value in vars(args).items():
154 |         logging.info("%s = %s" % (key, str(value)))
155 | 
156 |     # read list
157 |     if os.path.isdir(args.waveforms):
158 |         file_list = sorted(find_files(args.waveforms, "*.wav"))
159 |     else:
160 |         file_list = read_txt(args.waveforms)
161 |     logging.info("number of utterances = %d" % len(file_list))
162 | 
163 |     # check directory existence
164 |     if not os.path.exists(args.outdir):
165 |         os.makedirs(args.outdir)
166 | 
167 |     # divide list
168 |     file_lists = np.array_split(file_list, args.n_jobs)
169 |     file_lists = [f_list.tolist() for f_list in file_lists]
170 | 
171 |     # calculate MLSA coef ans save it
172 |     if not check_hdf5(args.stats, "/mlsa/coef"):
173 |         avg_mcep = read_hdf5(args.stats, args.feature_type + "/mean")
174 |         if args.feature_type == "world":
175 |             avg_mcep = avg_mcep[args.mcep_dim_start:args.mcep_dim_end]
176 |         mlsa_coef = convert_mcep_to_mlsa_coef(avg_mcep, args.mag, args.mcep_alpha)
177 |         write_hdf5(args.stats, "/mlsa/coef", mlsa_coef)
178 |         write_hdf5(args.stats, "/mlsa/alpha", args.mcep_alpha)
179 | 
180 |     # multi processing
181 |     processes = []
182 |     if args.feature_type == "melspc":
183 |         # TODO(kan-bayashi): implement noise shaping using melspectrogram
184 |         raise NotImplementedError("currently, support only world and mcep.")
185 |     for f in file_lists:
186 |         p = mp.Process(target=noise_shaping, args=(f, args,))
187 |         p.start()
188 |         processes.append(p)
189 | 
190 |     # wait for all process
191 |     for p in processes:
192 |         p.join()
193 | 
194 | 
195 | if __name__ == "__main__":
196 |     main()
197 | 


--------------------------------------------------------------------------------
/wavenet_vocoder/utils/utils.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | 
  3 | # Copyright 2017 Tomoki Hayashi (Nagoya University)
  4 | #  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)
  5 | 
  6 | import fnmatch
  7 | import logging
  8 | import os
  9 | import sys
 10 | import threading
 11 | 
 12 | import h5py
 13 | import numpy as np
 14 | 
 15 | from numpy.matlib import repmat
 16 | 
 17 | 
 18 | def check_hdf5(hdf5_name, hdf5_path):
 19 |     """CHECK HDF5 EXISTENCE.
 20 | 
 21 |     Args:
 22 |         hdf5_name (str): Filename of hdf5 file.
 23 |         hdf5_path (str): Dataset name in hdf5 file.
 24 | 
 25 |     Returns:
 26 |         bool: Dataset exists then return True.
 27 | 
 28 |     """
 29 |     if not os.path.exists(hdf5_name):
 30 |         return False
 31 |     else:
 32 |         with h5py.File(hdf5_name, "r") as f:
 33 |             if hdf5_path in f:
 34 |                 return True
 35 |             else:
 36 |                 return False
 37 | 
 38 | 
 39 | def read_hdf5(hdf5_name, hdf5_path):
 40 |     """READ HDF5 DATASET.
 41 | 
 42 |     Args:
 43 |         hdf5_name (str): Filename of hdf5 file.
 44 |         hdf5_path (str): Dataset name in hdf5 file.
 45 | 
 46 |     Return:
 47 |         any: Dataset values.
 48 | 
 49 |     """
 50 |     if not os.path.exists(hdf5_name):
 51 |         logging.error("there is no such a hdf5 file (%s)." % hdf5_name)
 52 |         sys.exit(1)
 53 | 
 54 |     hdf5_file = h5py.File(hdf5_name, "r")
 55 | 
 56 |     if hdf5_path not in hdf5_file:
 57 |         logging.error("there is no such a data in hdf5 file. (%s)" % hdf5_path)
 58 |         sys.exit(1)
 59 | 
 60 |     hdf5_data = hdf5_file[hdf5_path][()]
 61 |     hdf5_file.close()
 62 | 
 63 |     return hdf5_data
 64 | 
 65 | 
 66 | def shape_hdf5(hdf5_name, hdf5_path):
 67 |     """GET HDF5 DATASET SHAPE.
 68 | 
 69 |     Args:
 70 |         hdf5_name (str): Filename of hdf5 file.
 71 |         hdf5_path (str): Dataset name in hdf5 file.
 72 | 
 73 |     Returns:
 74 |         (tuple): Shape of dataset.
 75 | 
 76 |     """
 77 |     if check_hdf5(hdf5_name, hdf5_path):
 78 |         with h5py.File(hdf5_name, "r") as f:
 79 |             hdf5_shape = f[hdf5_path].shape
 80 |         return hdf5_shape
 81 |     else:
 82 |         logging.error("there is no such a file or dataset")
 83 |         sys.exit(1)
 84 | 
 85 | 
 86 | def write_hdf5(hdf5_name, hdf5_path, write_data, is_overwrite=True):
 87 |     """WRITE DATASET TO HDF5.
 88 | 
 89 |     Args:
 90 |         hdf5_name (str): Hdf5 dataset filename.
 91 |         hdf5_path (str): Dataset path in hdf5.
 92 |         write_data (ndarray): Data to write.
 93 |         is_overwrite (bool): Whether to overwrite dataset.
 94 | 
 95 |     """
 96 |     # convert to numpy array
 97 |     write_data = np.array(write_data)
 98 | 
 99 |     # check folder existence
100 |     folder_name, _ = os.path.split(hdf5_name)
101 |     if not os.path.exists(folder_name) and len(folder_name) != 0:
102 |         os.makedirs(folder_name)
103 | 
104 |     # check hdf5 existence
105 |     if os.path.exists(hdf5_name):
106 |         # if already exists, open with r+ mode
107 |         hdf5_file = h5py.File(hdf5_name, "r+")
108 |         # check dataset existence
109 |         if hdf5_path in hdf5_file:
110 |             if is_overwrite:
111 |                 logging.warning("dataset in hdf5 file already exists.")
112 |                 logging.warning("recreate dataset in hdf5.")
113 |                 hdf5_file.__delitem__(hdf5_path)
114 |             else:
115 |                 logging.error("dataset in hdf5 file already exists.")
116 |                 logging.error("if you want to overwrite, please set is_overwrite = True.")
117 |                 hdf5_file.close()
118 |                 sys.exit(1)
119 |     else:
120 |         # if not exists, open with w mode
121 |         hdf5_file = h5py.File(hdf5_name, "w")
122 | 
123 |     # write data to hdf5
124 |     hdf5_file.create_dataset(hdf5_path, data=write_data)
125 |     hdf5_file.flush()
126 |     hdf5_file.close()
127 | 
128 | 
def find_files(directory, pattern="*.wav", use_dir_name=True):
    """FIND FILES RECURSIVELY.

    Args:
        directory (str): Root directory to find.
        pattern (str): Query to find.
        use_dir_name (bool): If False, directory name is not included,
            i.e., paths relative to ``directory`` are returned.

    Returns:
        list: List of found filenames.

    """
    files = []
    for root, dirnames, filenames in os.walk(directory, followlinks=True):
        for filename in fnmatch.filter(filenames, pattern):
            files.append(os.path.join(root, filename))
    if not use_dir_name:
        # strip only the leading root directory; a plain string replacement
        # would also remove the same text appearing deeper in the path and
        # would miss the prefix when ``directory`` ends with a separator
        files = [os.path.relpath(file_, directory) for file_ in files]
    return files
148 | 
149 | 
def read_txt(file_list):
    """READ LINES OF TXT FILE.

    Args:
        file_list (str): TXT file filename.

    Returns:
        list: List of read lines without newline characters.

    """
    with open(file_list, "r") as txt_file:
        # iterate over the file line by line and drop the newline characters
        return [line.replace("\n", "") for line in txt_file]
163 | 
164 | 
class BackgroundGenerator(threading.Thread):
    """BACKGROUND GENERATOR.

    Items of the wrapped generator are fetched by a daemon thread and passed
    to the consumer through a bounded queue. An exception raised by the
    wrapped generator is re-raised in the consumer instead of leaving the
    consumer blocked on the queue.

    Args:
        generator (object): Generator instance.
        max_prefetch (int): Max number of prefetch.

    References:
        https://stackoverflow.com/questions/7323664/python-generator-pre-fetch

    """

    # unique end-of-stream marker, so that ``None`` items yielded by the
    # wrapped generator are not mistaken for the end of the stream
    _SENTINEL = object()

    def __init__(self, generator, max_prefetch=1):
        threading.Thread.__init__(self)
        if sys.version_info.major == 2:
            from Queue import Queue
        else:
            from queue import Queue
        self.queue = Queue(max_prefetch)
        self.generator = generator
        # exception raised by the generator in the worker thread (if any)
        self._error = None
        self.daemon = True
        self.start()

    def run(self):
        """STORE ITEMS IN QUEUE."""
        try:
            for item in self.generator:
                self.queue.put(item)
        except Exception as e:
            # keep the error to re-raise it on the consumer side
            self._error = e
        finally:
            # always signal the end, otherwise the consumer would wait forever
            self.queue.put(self._SENTINEL)

    def next(self):
        """GET ITEM IN THE QUEUE."""
        next_item = self.queue.get()
        if next_item is self._SENTINEL:
            # put the marker back so that further calls also finish
            # immediately instead of blocking on the empty queue
            self.queue.put(self._SENTINEL)
            if self._error is not None:
                error, self._error = self._error, None
                raise error
            raise StopIteration
        return next_item

    def __next__(self):
        return self.next()

    def __iter__(self):
        return self
206 | 
207 | 
class background(object):
    """BACKGROUND GENERATOR DECORATOR.

    Wraps a generator function so that calling it returns a
    ``BackgroundGenerator`` running the original generator.

    Args:
        max_prefetch (int): Max number of prefetch passed to ``BackgroundGenerator``.

    """

    def __init__(self, max_prefetch=1):
        self.max_prefetch = max_prefetch

    def __call__(self, gen):
        def bg_generator(*args, **kwargs):
            # hand over the configured prefetch size; it was stored but never used before
            return BackgroundGenerator(gen(*args, **kwargs), max_prefetch=self.max_prefetch)
        return bg_generator
218 | 
219 | 
def extend_time(feats, upsampling_factor):
    """EXTEND TIME RESOLUTION.

    Each frame is repeated ``upsampling_factor`` times along the time axis.

    Args:
        feats (ndarray): Feature vector with the shape (T, D).
        upsampling_factor (int): Upsampling_factor.

    Returns:
        (ndarray): Extended feats with the shape (upsampling_factor * T, D).
            The dtype is float64.

    Raises:
        ValueError: If ``feats`` is not 2-dimensional.

    """
    feats = np.asarray(feats)
    if feats.ndim != 2:
        raise ValueError(
            "feats must be 2-dimensional (T, D), but got %d dimension(s)." % feats.ndim)

    # repeat every frame in one vectorized call instead of a per-frame loop;
    # the result is cast to float64 to keep the dtype of the previous implementation
    return np.repeat(feats, upsampling_factor, axis=0).astype(np.float64)
243 | 


--------------------------------------------------------------------------------
/test/test_generator.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | 
  3 | # Copyright 2017 Tomoki Hayashi (Nagoya University)
  4 | #  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)
  5 | 
  6 | import argparse
  7 | import os
  8 | 
  9 | from wavenet_vocoder.bin.decode import decode_generator
 10 | from wavenet_vocoder.bin.feature_extract import melspectrogram_extract
 11 | from wavenet_vocoder.bin.feature_extract import world_feature_extract
 12 | from wavenet_vocoder.bin.train import train_generator
 13 | from wavenet_vocoder.utils import find_files
 14 | 
 15 | from test_preprocessing import make_args as make_feature_args
 16 | from test_preprocessing import make_dummy_wav
 17 | 
 18 | 
 19 | def make_train_generator_args(**kwargs):
 20 |     defaults = dict(
 21 |         wav_list=None,
 22 |         feat_list=None,
 23 |         receptive_field=1000,
 24 |         batch_length=3000,
 25 |         batch_size=5,
 26 |         feature_type="world",
 27 |         wav_transform=None,
 28 |         feat_transform=None,
 29 |         shuffle=False,
 30 |         upsampling_factor=80,
 31 |         use_upsampling_layer=True,
 32 |         use_speaker_code=False
 33 |     )
 34 |     defaults.update(kwargs)
 35 |     return argparse.Namespace(**defaults)
 36 | 
 37 | 
 38 | def make_decode_generator_args(**kwargs):
 39 |     defaults = dict(
 40 |         feat_list=None,
 41 |         batch_size=5,
 42 |         feature_type="world",
 43 |         wav_transform=None,
 44 |         feat_transform=None,
 45 |         upsampling_factor=80,
 46 |         use_upsampling_layer=True,
 47 |         use_speaker_code=False
 48 |     )
 49 |     defaults.update(kwargs)
 50 |     return argparse.Namespace(**defaults)
 51 | 
 52 | 
 53 | def test_train_generator():
 54 |     # make dummy wavfiles
 55 |     wavdir = "data/wav"
 56 |     if not os.path.exists(wavdir):
 57 |         os.makedirs(wavdir)
 58 |     for i in range(5):
 59 |         make_dummy_wav(wavdir + "/%d.wav" % i)
 60 | 
 61 |     # make features
 62 |     feat_args = make_feature_args()
 63 |     wav_list = find_files(wavdir, "*.wav")
 64 |     if not os.path.exists(feat_args.wavdir):
 65 |         os.makedirs(feat_args.wavdir)
 66 |     feat_args.feature_type = "melspc"
 67 |     melspectrogram_extract(wav_list, feat_args)
 68 |     feat_args.feature_type = "world"
 69 |     world_feature_extract(wav_list, feat_args)
 70 |     feat_list = find_files(feat_args.hdf5dir, "*.h5")
 71 | 
 72 |     for ft in ["world", "melspc"]:
 73 |         # ----------------------------------
 74 |         # minibatch without upsampling layer
 75 |         # ----------------------------------
 76 |         generator_args = make_train_generator_args(
 77 |             wav_list=wav_list,
 78 |             feat_list=feat_list,
 79 |             feature_type=ft,
 80 |             use_upsampling_layer=False,
 81 |             batch_length=10000,
 82 |             batch_size=5
 83 |         )
 84 |         generator = train_generator(**vars(generator_args))
 85 |         (x, h), t = next(generator)
 86 |         assert x.size(0) == t.size(0) == h.size(0)
 87 |         assert x.size(1) == t.size(1) == h.size(2)
 88 | 
 89 |         # ----------------------------------------
 90 |         # utterance batch without upsampling layer
 91 |         # ----------------------------------------
 92 |         generator_args = make_train_generator_args(
 93 |             wav_list=wav_list,
 94 |             feat_list=feat_list,
 95 |             feature_type=ft,
 96 |             use_upsampling_layer=False,
 97 |             batch_length=None,
 98 |             batch_size=5
 99 |         )
100 |         generator = train_generator(**vars(generator_args))
101 |         (x, h), t = next(generator)
102 |         assert x.size(0) == t.size(0) == h.size(0) == 1
103 |         assert x.size(1) == t.size(1) == h.size(2)
104 | 
105 |         # -------------------------------
106 |         # minibatch with upsampling layer
107 |         # -------------------------------
108 |         generator_args = make_train_generator_args(
109 |             wav_list=wav_list,
110 |             feat_list=feat_list,
111 |             feature_type=ft,
112 |             use_upsampling_layer=True,
113 |             batch_length=10000,
114 |             batch_size=5
115 |         )
116 |         generator = train_generator(**vars(generator_args))
117 |         (x, h), t = next(generator)
118 |         assert x.size(0) == t.size(0) == h.size(0)
119 |         assert x.size(1) == t.size(1) == h.size(2) * generator_args.upsampling_factor
120 | 
121 |         # -------------------------------------
122 |         # utterance batch with upsampling layer
123 |         # -------------------------------------
124 |         generator_args = make_train_generator_args(
125 |             wav_list=wav_list,
126 |             feat_list=feat_list,
127 |             feature_type=ft,
128 |             use_upsampling_layer=True,
129 |             batch_length=None,
130 |             batch_size=5
131 |         )
132 |         generator = train_generator(**vars(generator_args))
133 |         (x, h), t = next(generator)
134 |         assert x.size(0) == t.size(0) == h.size(0) == 1
135 |         assert x.size(1) == t.size(1) == h.size(2) * generator_args.upsampling_factor
136 | 
137 | 
def test_decode_generator():
    """Check the batch shapes yielded by ``decode_generator``.

    For both feature types, four settings are checked in this order:
    non-batch without / with upsampling layer, then minibatch
    without / with upsampling layer.
    """
    # make dummy wavfiles
    wavdir = "data/wav"
    if not os.path.exists(wavdir):
        os.makedirs(wavdir)
    for idx in range(5):
        make_dummy_wav(wavdir + "/%d.wav" % idx)

    # make features
    feat_args = make_feature_args()
    wav_list = find_files(wavdir, "*.wav")
    if not os.path.exists(feat_args.wavdir):
        os.makedirs(feat_args.wavdir)
    feat_args.feature_type = "melspc"
    melspectrogram_extract(wav_list, feat_args)
    feat_args.feature_type = "world"
    world_feature_extract(wav_list, feat_args)
    feat_list = find_files(feat_args.hdf5dir, "*.h5")

    # (use_upsampling_layer, batch_size); batch_size=1 means non-batch decoding
    settings = [
        (False, 1),
        (True, 1),
        (False, 5),
        (True, 5),
    ]

    for ft in ["world", "melspc"]:
        for use_upsampling_layer, batch_size in settings:
            generator_args = make_decode_generator_args(
                feat_list=feat_list,
                feature_type=ft,
                use_upsampling_layer=use_upsampling_layer,
                batch_size=batch_size
            )
            generator = decode_generator(**vars(generator_args))

            if batch_size == 1:
                _, (x, h, n_samples) = next(generator)
                assert x.size(0) == h.size(0) == 1
                if use_upsampling_layer:
                    assert h.size(2) * generator_args.upsampling_factor == n_samples + 1
                else:
                    assert h.size(2) == n_samples + 1
            else:
                _, (batch_x, batch_h, n_samples_list) = next(generator)
                assert batch_x.size(0) == batch_h.size(0) == len(n_samples_list)
                if use_upsampling_layer:
                    assert batch_h.size(2) * generator_args.upsampling_factor == max(n_samples_list) + 1
                else:
                    assert batch_h.size(2) == max(n_samples_list) + 1
213 | 


--------------------------------------------------------------------------------
/test/test_wavenet.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | 
  3 | # Copyright 2017 Tomoki Hayashi (Nagoya University)
  4 | #  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)
  5 | 
  6 | import logging
  7 | 
  8 | import numpy as np
  9 | import torch
 10 | 
 11 | from wavenet_vocoder.nets import encode_mu_law
 12 | from wavenet_vocoder.nets import initialize
 13 | from wavenet_vocoder.nets import WaveNet
 14 | 
# set log level
# (configured once at import time; DEBUG is the level passed to basicConfig)
logging.basicConfig(level=logging.DEBUG,
                    format='%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s',
                    datefmt='%m/%d/%Y %I:%M:%S')
 19 | 
 20 | 
 21 | def sine_generator(seq_size=100, mu=256):
 22 |     t = np.linspace(0, 1, 16000)
 23 |     data = np.sin(2 * np.pi * 220 * t) + np.sin(2 * np.pi * 224 * t)
 24 |     data = data / 2
 25 |     while True:
 26 |         ys = data[:seq_size]
 27 |         ys = encode_mu_law(data, mu)
 28 |         yield torch.from_numpy(ys[:seq_size])
 29 | 
 30 | 
 31 | def test_forward():
 32 |     # get batch
 33 |     generator = sine_generator(100)
 34 |     batch = next(generator)
 35 |     batch_input = batch.view(1, -1)
 36 |     batch_aux = torch.rand(1, 28, batch_input.size(1)).float()
 37 | 
 38 |     # define model without upsampling with kernel size = 2
 39 |     net = WaveNet(256, 28, 32, 128, 10, 1, 2)
 40 |     net.apply(initialize)
 41 |     net.eval()
 42 |     y = net(batch_input, batch_aux)[0]
 43 |     assert y.size(0) == batch_input.size(1)
 44 |     assert y.size(1) == 256
 45 | 
 46 |     # define model without upsampling with kernel size = 3
 47 |     net = WaveNet(256, 28, 32, 128, 10, 1, 2)
 48 |     net.apply(initialize)
 49 |     net.eval()
 50 |     y = net(batch_input, batch_aux)[0]
 51 |     assert y.size(0) == batch_input.size(1)
 52 |     assert y.size(1) == 256
 53 | 
 54 |     batch_input = batch.view(1, -1)
 55 |     batch_aux = torch.rand(1, 28, batch_input.size(1) // 10).float()
 56 | 
 57 |     # define model with upsampling and kernel size = 2
 58 |     net = WaveNet(256, 28, 32, 128, 10, 1, 2, 10)
 59 |     net.apply(initialize)
 60 |     net.eval()
 61 |     y = net(batch_input, batch_aux)[0]
 62 |     assert y.size(0) == batch_input.size(1)
 63 |     assert y.size(1) == 256
 64 | 
 65 |     # define model with upsampling and kernel size = 3
 66 |     net = WaveNet(256, 28, 32, 128, 10, 1, 3, 10)
 67 |     net.apply(initialize)
 68 |     net.eval()
 69 |     y = net(batch_input, batch_aux)[0]
 70 |     assert y.size(0) == batch_input.size(1)
 71 |     assert y.size(1) == 256
 72 | 
 73 | 
 74 | def test_generate():
 75 |     batch = 2
 76 |     x = np.random.randint(0, 256, size=(batch, 1))
 77 |     h = np.random.randn(batch, 28, 10)
 78 |     length = h.shape[-1] - 1
 79 |     with torch.no_grad():
 80 |         net = WaveNet(256, 28, 4, 4, 10, 3, 2)
 81 |         net.apply(initialize)
 82 |         net.eval()
 83 |         for x_, h_ in zip(x, h):
 84 |             batch_x = torch.from_numpy(np.expand_dims(x_, 0)).long()
 85 |             batch_h = torch.from_numpy(np.expand_dims(h_, 0)).float()
 86 |             net.generate(batch_x, batch_h, length, 1, "sampling")
 87 |             net.fast_generate(batch_x, batch_h, length, 1, "sampling")
 88 |         batch_x = torch.from_numpy(x).long()
 89 |         batch_h = torch.from_numpy(h).float()
 90 |         net.batch_fast_generate(batch_x, batch_h, [length] * batch, 1, "sampling")
 91 | 
 92 | 
 93 | def test_assert_fast_generation():
 94 |     # get batch
 95 |     batch = 2
 96 |     x = np.random.randint(0, 256, size=(batch, 1))
 97 |     h = np.random.randn(batch, 28, 32)
 98 |     length = h.shape[-1] - 1
 99 | 
100 |     with torch.no_grad():
101 |         # --------------------------------------------------------
102 |         # define model without upsampling and with kernel size = 2
103 |         # --------------------------------------------------------
104 |         net = WaveNet(256, 28, 4, 4, 10, 3, 2)
105 |         net.apply(initialize)
106 |         net.eval()
107 | 
108 |         # sample-by-sample generation
109 |         gen1_list = []
110 |         gen2_list = []
111 |         for x_, h_ in zip(x, h):
112 |             batch_x = torch.from_numpy(np.expand_dims(x_, 0)).long()
113 |             batch_h = torch.from_numpy(np.expand_dims(h_, 0)).float()
114 |             gen1 = net.generate(batch_x, batch_h, length, 1, "argmax")
115 |             gen2 = net.fast_generate(batch_x, batch_h, length, 1, "argmax")
116 |             np.testing.assert_array_equal(gen1, gen2)
117 |             gen1_list += [gen1]
118 |             gen2_list += [gen2]
119 |         gen1 = np.stack(gen1_list)
120 |         gen2 = np.stack(gen2_list)
121 |         np.testing.assert_array_equal(gen1, gen2)
122 | 
123 |         # batch generation
124 |         batch_x = torch.from_numpy(x).long()
125 |         batch_h = torch.from_numpy(h).float()
126 |         gen3_list = net.batch_fast_generate(batch_x, batch_h, [length] * batch, 1, "argmax")
127 |         gen3 = np.stack(gen3_list)
128 |         np.testing.assert_array_equal(gen3, gen2)
129 | 
130 |         # --------------------------------------------------------
131 |         # define model without upsampling and with kernel size = 3
132 |         # --------------------------------------------------------
133 |         net = WaveNet(256, 28, 4, 4, 10, 3, 3)
134 |         net.apply(initialize)
135 |         net.eval()
136 | 
137 |         # sample-by-sample generation
138 |         gen1_list = []
139 |         gen2_list = []
140 |         for x_, h_ in zip(x, h):
141 |             batch_x = torch.from_numpy(np.expand_dims(x_, 0)).long()
142 |             batch_h = torch.from_numpy(np.expand_dims(h_, 0)).float()
143 |             gen1 = net.generate(batch_x, batch_h, length, 1, "argmax")
144 |             gen2 = net.fast_generate(batch_x, batch_h, length, 1, "argmax")
145 |             np.testing.assert_array_equal(gen1, gen2)
146 |             gen1_list += [gen1]
147 |             gen2_list += [gen2]
148 |         gen1 = np.stack(gen1_list)
149 |         gen2 = np.stack(gen2_list)
150 |         np.testing.assert_array_equal(gen1, gen2)
151 | 
152 |         # batch generation
153 |         batch_x = torch.from_numpy(x).long()
154 |         batch_h = torch.from_numpy(h).float()
155 |         gen3_list = net.batch_fast_generate(batch_x, batch_h, [length] * batch, 1, "argmax")
156 |         gen3 = np.stack(gen3_list)
157 |         np.testing.assert_array_equal(gen3, gen2)
158 | 
159 |         # get batch
160 |         batch = 2
161 |         upsampling_factor = 10
162 |         x = np.random.randint(0, 256, size=(batch, 1))
163 |         h = np.random.randn(batch, 28, 3)
164 |         length = h.shape[-1] * upsampling_factor - 1
165 | 
166 |         # -----------------------------------------------------
167 |         # define model with upsampling and with kernel size = 2
168 |         # -----------------------------------------------------
169 |         net = WaveNet(256, 28, 4, 4, 10, 3, 2, upsampling_factor)
170 |         net.apply(initialize)
171 |         net.eval()
172 | 
173 |         # sample-by-sample generation
174 |         gen1_list = []
175 |         gen2_list = []
176 |         for x_, h_ in zip(x, h):
177 |             batch_x = torch.from_numpy(np.expand_dims(x_, 0)).long()
178 |             batch_h = torch.from_numpy(np.expand_dims(h_, 0)).float()
179 |             gen1 = net.generate(batch_x, batch_h, length, 1, "argmax")
180 |             gen2 = net.fast_generate(batch_x, batch_h, length, 1, "argmax")
181 |             np.testing.assert_array_equal(gen1, gen2)
182 |             gen1_list += [gen1]
183 |             gen2_list += [gen2]
184 |         gen1 = np.stack(gen1_list)
185 |         gen2 = np.stack(gen2_list)
186 |         np.testing.assert_array_equal(gen1, gen2)
187 | 
188 |         # batch generation
189 |         batch_x = torch.from_numpy(x).long()
190 |         batch_h = torch.from_numpy(h).float()
191 |         gen3_list = net.batch_fast_generate(batch_x, batch_h, [length] * batch, 1, "argmax")
192 |         gen3 = np.stack(gen3_list)
193 |         np.testing.assert_array_equal(gen3, gen2)
194 | 
195 |         # -----------------------------------------------------
196 |         # define model with upsampling and with kernel size = 3
197 |         # -----------------------------------------------------
198 |         net = WaveNet(256, 28, 4, 4, 10, 3, 2, upsampling_factor)
199 |         net.apply(initialize)
200 |         net.eval()
201 | 
202 |         # sample-by-sample generation
203 |         gen1_list = []
204 |         gen2_list = []
205 |         for x_, h_ in zip(x, h):
206 |             batch_x = torch.from_numpy(np.expand_dims(x_, 0)).long()
207 |             batch_h = torch.from_numpy(np.expand_dims(h_, 0)).float()
208 |             gen1 = net.generate(batch_x, batch_h, length, 1, "argmax")
209 |             gen2 = net.fast_generate(batch_x, batch_h, length, 1, "argmax")
210 |             np.testing.assert_array_equal(gen1, gen2)
211 |             gen1_list += [gen1]
212 |             gen2_list += [gen2]
213 |         gen1 = np.stack(gen1_list)
214 |         gen2 = np.stack(gen2_list)
215 |         np.testing.assert_array_equal(gen1, gen2)
216 | 
217 |         # batch generation
218 |         batch_x = torch.from_numpy(x).long()
219 |         batch_h = torch.from_numpy(h).float()
220 |         gen3_list = net.batch_fast_generate(batch_x, batch_h, [length] * batch, 1, "argmax")
221 |         gen3 = np.stack(gen3_list)
222 |         np.testing.assert_array_equal(gen3, gen2)
223 | 
224 | 
225 | def test_assert_different_length_batch_generation():
226 |     # prepare batch
227 |     batch = 4
228 |     length = 32
229 |     x = np.random.randint(0, 256, size=(batch, 1))
230 |     h = np.random.randn(batch, 28, length)
231 |     length_list = sorted(list(np.random.randint(length // 2, length - 1, batch)))
232 | 
233 |     with torch.no_grad():
234 |         net = WaveNet(256, 28, 4, 4, 10, 3, 2)
235 |         net.apply(initialize)
236 |         net.eval()
237 | 
238 |         # sample-by-sample generation
239 |         gen1_list = []
240 |         for x_, h_, length in zip(x, h, length_list):
241 |             batch_x = torch.from_numpy(np.expand_dims(x_, 0)).long()
242 |             batch_h = torch.from_numpy(np.expand_dims(h_, 0)).float()
243 |             gen1 = net.fast_generate(batch_x, batch_h, length, 1, "argmax")
244 |             gen1_list += [gen1]
245 | 
246 |         # batch generation
247 |         batch_x = torch.from_numpy(x).long()
248 |         batch_h = torch.from_numpy(h).float()
249 |         gen2_list = net.batch_fast_generate(batch_x, batch_h, length_list, 1, "argmax")
250 | 
251 |         # assertion
252 |         for gen1, gen2 in zip(gen1_list, gen2_list):
253 |             np.testing.assert_array_equal(gen1, gen2)
254 | 


--------------------------------------------------------------------------------
/wavenet_vocoder/utils/run.pl:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env perl
  2 | use warnings; #sed replacement for -w perl parameter
  3 | 
  4 | # In general, doing
  5 | #  run.pl some.log a b c is like running the command a b c in
  6 | # the bash shell, and putting the standard error and output into some.log.
  7 | # To run parallel jobs (backgrounded on the host machine), you can do (e.g.)
  8 | #  run.pl JOB=1:4 some.JOB.log a b c JOB is like running the command a b c JOB
  9 | # and putting it in some.JOB.log, for each one. [Note: JOB can be any identifier].
 10 | # If any of the jobs fails, this script will fail.
 11 | 
 12 | # A typical example is:
 13 | #  run.pl some.log my-prog "--opt=foo bar" foo \|  other-prog baz
 14 | # and run.pl will run something like:
 15 | # ( my-prog '--opt=foo bar' foo |  other-prog baz ) >& some.log
 16 | #
 17 | # Basically it takes the command-line arguments, quotes them
 18 | # as necessary to preserve spaces, and evaluates them with bash.
 19 | # In addition it puts the command line at the top of the log, and
 20 | # the start and end times of the command at the beginning and end.
 21 | # The reason why this is useful is so that we can create a different
 22 | # version of this program that uses a queueing system instead.
 23 | 
 24 | # use Data::Dumper;
 25 | 
 26 | @ARGV < 2 && die "usage: run.pl log-file command-line arguments...";
 27 | 
 28 | 
 29 | $max_jobs_run = -1;
 30 | $jobstart = 1;
 31 | $jobend = 1;
 32 | $ignored_opts = ""; # These will be ignored.
 33 | 
 34 | # First parse an option like JOB=1:4, and any
 35 | # options that would normally be given to
 36 | # queue.pl, which we will just discard.
 37 | 
 38 | for (my $x = 1; $x <= 2; $x++) { # This for-loop is to
 39 |   # allow the JOB=1:n option to be interleaved with the
 40 |   # options to qsub.
 41 |   while (@ARGV >= 2 && $ARGV[0] =~ m:^-:) {
 42 |     # parse any options that would normally go to qsub, but which will be ignored here.
 43 |     my $switch = shift @ARGV;
 44 |     if ($switch eq "-V") {
 45 |       $ignored_opts .= "-V ";
 46 |     } elsif ($switch eq "--max-jobs-run" || $switch eq "-tc") {
 47 |       # we do support the option --max-jobs-run n, and its GridEngine form -tc n.
 48 |       $max_jobs_run = shift @ARGV;
 49 |       if (! ($max_jobs_run > 0)) {
 50 |         die "run.pl: invalid option --max-jobs-run $max_jobs_run";
 51 |       }
 52 |     } else {
 53 |       my $argument = shift @ARGV;
 54 |       if ($argument =~ m/^--/) {
 55 |         print STDERR "run.pl: WARNING: suspicious argument '$argument' to $switch; starts with '-'\n";
 56 |       }
 57 |       if ($switch eq "-sync" && $argument =~ m/^[yY]/) {
 58 |         $ignored_opts .= "-sync "; # Note: in the
 59 |         # corresponding code in queue.pl it says instead, just "$sync = 1;".
 60 |       } elsif ($switch eq "-pe") { # e.g. -pe smp 5
 61 |         my $argument2 = shift @ARGV;
 62 |         $ignored_opts .= "$switch $argument $argument2 ";
 63 |       } elsif ($switch eq "--gpu") {
 64 |         $using_gpu = $argument;
 65 |       } else {
 66 |         # Ignore option.
 67 |         $ignored_opts .= "$switch $argument ";
 68 |       }
 69 |     }
 70 |   }
 71 |   if ($ARGV[0] =~ m/^([\w_][\w\d_]*)+=(\d+):(\d+)$/) { # e.g. JOB=1:20
 72 |     $jobname = $1;
 73 |     $jobstart = $2;
 74 |     $jobend = $3;
 75 |     shift;
 76 |     if ($jobstart > $jobend) {
 77 |       die "run.pl: invalid job range $ARGV[0]";
 78 |     }
 79 |     if ($jobstart <= 0) {
 80 |       die "run.pl: invalid job range $ARGV[0], start must be strictly positive (this is required for GridEngine compatibility).";
 81 |     }
 82 |   } elsif ($ARGV[0] =~ m/^([\w_][\w\d_]*)+=(\d+)$/) { # e.g. JOB=1.
 83 |     $jobname = $1;
 84 |     $jobstart = $2;
 85 |     $jobend = $2;
 86 |     shift;
 87 |   } elsif ($ARGV[0] =~ m/.+\=.*\:.*$/) {
 88 |     print STDERR "run.pl: Warning: suspicious first argument to run.pl: $ARGV[0]\n";
 89 |   }
 90 | }
 91 | 
 92 | # Users found this message confusing so we are removing it.
 93 | # if ($ignored_opts ne "") {
 94 | #   print STDERR "run.pl: Warning: ignoring options \"$ignored_opts\"\n";
 95 | # }
 96 | 
 97 | if ($max_jobs_run == -1) { # If --max-jobs-run option not set,
 98 |                            # then work out the number of processors if possible,
 99 |                            # and set it based on that.
100 |   $max_jobs_run = 0;
101 |   if ($using_gpu) {
102 |     if (open(P, "nvidia-smi -L |")) {
103 |       $max_jobs_run++ while (<P>);
104 |       close(P);
105 |     }
106 |     if ($max_jobs_run == 0) {
107 |       $max_jobs_run = 1;
108 |       print STDERR "run.pl: Warning: failed to detect number of GPUs from nvidia-smi, using ${max_jobs_run}\n";
109 |     }
110 |   } elsif (open(P, "</proc/cpuinfo")) {  # Linux
111 |     while (<P>) { if (m/^processor/) { $max_jobs_run++; } }
112 |     if ($max_jobs_run == 0) {
113 |       print STDERR "run.pl: Warning: failed to detect any processors from /proc/cpuinfo\n";
114 |       $max_jobs_run = 10;  # reasonable default.
115 |     }
116 |     close(P);
117 |   } elsif (open(P, "sysctl -a |")) {  # BSD/Darwin
118 |     while (<P>) {
119 |       if (m/hw\.ncpu\s*[:=]\s*(\d+)/) { # hw.ncpu = 4, or hw.ncpu: 4
120 |         $max_jobs_run = $1;
121 |         last;
122 |       }
123 |     }
124 |     close(P);
125 |     if ($max_jobs_run == 0) {
126 |       print STDERR "run.pl: Warning: failed to detect any processors from sysctl -a\n";
127 |       $max_jobs_run = 10;  # reasonable default.
128 |     }
129 |   } else {
130 |     # allow at most 32 jobs at once, on non-UNIX systems; change this code
131 |     # if you need to change this default.
132 |     $max_jobs_run = 32;
133 |   }
134 |   # The just-computed value of $max_jobs_run is just the number of processors
135 |   # (or our best guess); and if it happens that the number of jobs we need to
136 |   # run is just slightly above $max_jobs_run, it will make sense to increase
137 |   # $max_jobs_run to equal the number of jobs, so we don't have a small number
138 |   # of leftover jobs.
139 |   $num_jobs = $jobend - $jobstart + 1;
140 |   if (!$using_gpu &&
141 |       $num_jobs > $max_jobs_run && $num_jobs < 1.4 * $max_jobs_run) {
142 |     $max_jobs_run = $num_jobs;
143 |   }
144 | }
145 | 
146 | $logfile = shift @ARGV;
147 | 
148 | if (defined $jobname && $logfile !~ m/$jobname/ &&
149 |     $jobend > $jobstart) {
150 |   print STDERR "run.pl: you are trying to run a parallel job but "
151 |     . "you are putting the output into just one log file ($logfile)\n";
152 |   exit(1);
153 | }
154 | 
155 | $cmd = "";
156 | 
157 | foreach $x (@ARGV) {
158 |     if ($x =~ m/^\S+$/) { $cmd .=  $x . " "; }
159 |     elsif ($x =~ m:\":) { $cmd .= "'$x' "; }
160 |     else { $cmd .= "\"$x\" "; }
161 | }
162 | 
163 | #$Data::Dumper::Indent=0;
164 | $ret = 0;
165 | $numfail = 0;
166 | %active_pids=();
167 | 
168 | use POSIX ":sys_wait_h";
169 | for ($jobid = $jobstart; $jobid <= $jobend; $jobid++) {
170 |   if (scalar(keys %active_pids) >= $max_jobs_run) {
171 | 
172 |     # Lets wait for a change in any child's status
173 |     # Then we have to work out which child finished
174 |     $r = waitpid(-1, 0);
175 |     $code = $?;
176 |     if ($r < 0 ) { die "run.pl: Error waiting for child process"; } # should never happen.
177 |     if ( defined $active_pids{$r} ) {
178 |         $jid=$active_pids{$r};
179 |         $fail[$jid]=$code;
180 |         if ($code !=0) { $numfail++;}
181 |         delete $active_pids{$r};
182 |         # print STDERR "Finished: $r/$jid " .  Dumper(\%active_pids) . "\n";
183 |     } else {
184 |         die "run.pl: Cannot find the PID of the chold process that just finished.";
185 |     }
186 | 
187 |     # In theory we could do a non-blocking waitpid over all jobs running just
188 |     # to find out if only one or more jobs finished during the previous waitpid()
189 |     # However, we just omit this and will reap the next one in the next pass
190 |     # through the for(;;) cycle
191 |   }
192 |   $childpid = fork();
193 |   if (!defined $childpid) { die "run.pl: Error forking in run.pl (writing to $logfile)"; }
194 |   if ($childpid == 0) { # We're in the child... this branch
195 |     # executes the job and returns (possibly with an error status).
196 |     if (defined $jobname) {
197 |       $cmd =~ s/$jobname/$jobid/g;
198 |       $logfile =~ s/$jobname/$jobid/g;
199 |     }
200 |     system("mkdir -p `dirname $logfile` 2>/dev/null");
201 |     open(F, ">$logfile") || die "run.pl: Error opening log file $logfile";
202 |     print F "# " . $cmd . "\n";
203 |     print F "# Started at " . `date`;
204 |     $starttime = `date +'%s'`;
205 |     print F "#\n";
206 |     close(F);
207 | 
208 |     # Pipe into bash.. make sure we're not using any other shell.
209 |     open(B, "|bash") || die "run.pl: Error opening shell command";
210 |     print B "( " . $cmd . ") 2>>$logfile >> $logfile";
211 |     close(B);                   # If there was an error, exit status is in $?
212 |     $ret = $?;
213 | 
214 |     $lowbits = $ret & 127;
215 |     $highbits = $ret >> 8;
216 |     if ($lowbits != 0) { $return_str = "code $highbits; signal $lowbits" }
217 |     else { $return_str = "code $highbits"; }
218 | 
219 |     $endtime = `date +'%s'`;
220 |     open(F, ">>$logfile") || die "run.pl: Error opening log file $logfile (again)";
221 |     $enddate = `date`;
222 |     chop $enddate;
223 |     print F "# Accounting: time=" . ($endtime - $starttime) . " threads=1\n";
224 |     print F "# Ended ($return_str) at " . $enddate . ", elapsed time " . ($endtime-$starttime) . " seconds\n";
225 |     close(F);
226 |     exit($ret == 0 ? 0 : 1);
227 |   } else {
228 |     $pid[$jobid] = $childpid;
229 |     $active_pids{$childpid} = $jobid;
230 |     # print STDERR "Queued: " .  Dumper(\%active_pids) . "\n";
231 |   }
232 | }
233 | 
 234 | # All jobs have been started; now wait for every child that is still active.
 235 | foreach $child (keys %active_pids) {
 236 |     $jobid=$active_pids{$child};
 237 |     $r = waitpid($pid[$jobid], 0);
 238 |     $code = $?;
 239 |     if ($r == -1) { die "run.pl: Error waiting for child process"; } # should never happen.
 240 |     if ($r != 0) { $fail[$jobid]=$code; $numfail++ if $code!=0; } # child reaped: record its status, count it if non-zero
 241 | }
 242 | 
 243 | # Some sanity checks:
 244 | # The $fail array should not contain undefined codes
 245 | # The number of non-zeros in that array  should be equal to $numfail
 246 | # We cannot do foreach() here, as the JOB ids do not necessarily start by zero
 247 | $failed_jids=0;
 248 | for ($jobid = $jobstart; $jobid <= $jobend; $jobid++) {
 249 |   $job_return = $fail[$jobid];
 250 |   if (not defined $job_return ) {
 251 |     # print Dumper(\@fail);
 252 | 
 253 |     die "run.pl: Sanity check failed: we have indication that some jobs are running " .
 254 |       "even after we waited for all jobs to finish" ;
 255 |   }
 256 |   if ($job_return != 0 ){ $failed_jids++;}
 257 | }
 258 | if ($failed_jids != $numfail) {
 259 |   die "run.pl: Sanity check failed: cannot find out how many jobs failed ($failed_jids x $numfail)."
 260 | }
 261 | if ($numfail > 0) { $ret = 1; } # any failed job makes the overall status non-zero
262 | 
263 | if ($ret != 0) {
264 |   $njobs = $jobend - $jobstart + 1;
265 |   if ($njobs == 1) {
266 |     if (defined $jobname) {
267 |       $logfile =~ s/$jobname/$jobstart/; # only one numbered job, so replace name with
268 |                                          # that job.
269 |     }
270 |     print STDERR "run.pl: job failed, log is in $logfile\n";
271 |     if ($logfile =~ m/JOB/) {
272 |       print STDERR "run.pl: probably you forgot to put JOB=1:\$nj in your script.";
273 |     }
274 |   }
275 |   else {
276 |     $logfile =~ s/$jobname/*/g;
277 |     print STDERR "run.pl: $numfail / $njobs failed, log is in $logfile\n";
278 |   }
279 | }
280 | 
281 | 
282 | exit ($ret);
283 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 | 
  2 |                                  Apache License
  3 |                            Version 2.0, January 2004
  4 |                         http://www.apache.org/licenses/
  5 | 
  6 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  7 | 
  8 |    1. Definitions.
  9 | 
 10 |       "License" shall mean the terms and conditions for use, reproduction,
 11 |       and distribution as defined by Sections 1 through 9 of this document.
 12 | 
 13 |       "Licensor" shall mean the copyright owner or entity authorized by
 14 |       the copyright owner that is granting the License.
 15 | 
 16 |       "Legal Entity" shall mean the union of the acting entity and all
 17 |       other entities that control, are controlled by, or are under common
 18 |       control with that entity. For the purposes of this definition,
 19 |       "control" means (i) the power, direct or indirect, to cause the
 20 |       direction or management of such entity, whether by contract or
 21 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 22 |       outstanding shares, or (iii) beneficial ownership of such entity.
 23 | 
 24 |       "You" (or "Your") shall mean an individual or Legal Entity
 25 |       exercising permissions granted by this License.
 26 | 
 27 |       "Source" form shall mean the preferred form for making modifications,
 28 |       including but not limited to software source code, documentation
 29 |       source, and configuration files.
 30 | 
 31 |       "Object" form shall mean any form resulting from mechanical
 32 |       transformation or translation of a Source form, including but
 33 |       not limited to compiled object code, generated documentation,
 34 |       and conversions to other media types.
 35 | 
 36 |       "Work" shall mean the work of authorship, whether in Source or
 37 |       Object form, made available under the License, as indicated by a
 38 |       copyright notice that is included in or attached to the work
 39 |       (an example is provided in the Appendix below).
 40 | 
 41 |       "Derivative Works" shall mean any work, whether in Source or Object
 42 |       form, that is based on (or derived from) the Work and for which the
 43 |       editorial revisions, annotations, elaborations, or other modifications
 44 |       represent, as a whole, an original work of authorship. For the purposes
 45 |       of this License, Derivative Works shall not include works that remain
 46 |       separable from, or merely link (or bind by name) to the interfaces of,
 47 |       the Work and Derivative Works thereof.
 48 | 
 49 |       "Contribution" shall mean any work of authorship, including
 50 |       the original version of the Work and any modifications or additions
 51 |       to that Work or Derivative Works thereof, that is intentionally
 52 |       submitted to Licensor for inclusion in the Work by the copyright owner
 53 |       or by an individual or Legal Entity authorized to submit on behalf of
 54 |       the copyright owner. For the purposes of this definition, "submitted"
 55 |       means any form of electronic, verbal, or written communication sent
 56 |       to the Licensor or its representatives, including but not limited to
 57 |       communication on electronic mailing lists, source code control systems,
 58 |       and issue tracking systems that are managed by, or on behalf of, the
 59 |       Licensor for the purpose of discussing and improving the Work, but
 60 |       excluding communication that is conspicuously marked or otherwise
 61 |       designated in writing by the copyright owner as "Not a Contribution."
 62 | 
 63 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 64 |       on behalf of whom a Contribution has been received by Licensor and
 65 |       subsequently incorporated within the Work.
 66 | 
 67 |    2. Grant of Copyright License. Subject to the terms and conditions of
 68 |       this License, each Contributor hereby grants to You a perpetual,
 69 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 70 |       copyright license to reproduce, prepare Derivative Works of,
 71 |       publicly display, publicly perform, sublicense, and distribute the
 72 |       Work and such Derivative Works in Source or Object form.
 73 | 
 74 |    3. Grant of Patent License. Subject to the terms and conditions of
 75 |       this License, each Contributor hereby grants to You a perpetual,
 76 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 77 |       (except as stated in this section) patent license to make, have made,
 78 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 79 |       where such license applies only to those patent claims licensable
 80 |       by such Contributor that are necessarily infringed by their
 81 |       Contribution(s) alone or by combination of their Contribution(s)
 82 |       with the Work to which such Contribution(s) was submitted. If You
 83 |       institute patent litigation against any entity (including a
 84 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 85 |       or a Contribution incorporated within the Work constitutes direct
 86 |       or contributory patent infringement, then any patent licenses
 87 |       granted to You under this License for that Work shall terminate
 88 |       as of the date such litigation is filed.
 89 | 
 90 |    4. Redistribution. You may reproduce and distribute copies of the
 91 |       Work or Derivative Works thereof in any medium, with or without
 92 |       modifications, and in Source or Object form, provided that You
 93 |       meet the following conditions:
 94 | 
 95 |       (a) You must give any other recipients of the Work or
 96 |           Derivative Works a copy of this License; and
 97 | 
 98 |       (b) You must cause any modified files to carry prominent notices
 99 |           stating that You changed the files; and
100 | 
101 |       (c) You must retain, in the Source form of any Derivative Works
102 |           that You distribute, all copyright, patent, trademark, and
103 |           attribution notices from the Source form of the Work,
104 |           excluding those notices that do not pertain to any part of
105 |           the Derivative Works; and
106 | 
107 |       (d) If the Work includes a "NOTICE" text file as part of its
108 |           distribution, then any Derivative Works that You distribute must
109 |           include a readable copy of the attribution notices contained
110 |           within such NOTICE file, excluding those notices that do not
111 |           pertain to any part of the Derivative Works, in at least one
112 |           of the following places: within a NOTICE text file distributed
113 |           as part of the Derivative Works; within the Source form or
114 |           documentation, if provided along with the Derivative Works; or,
115 |           within a display generated by the Derivative Works, if and
116 |           wherever such third-party notices normally appear. The contents
117 |           of the NOTICE file are for informational purposes only and
118 |           do not modify the License. You may add Your own attribution
119 |           notices within Derivative Works that You distribute, alongside
120 |           or as an addendum to the NOTICE text from the Work, provided
121 |           that such additional attribution notices cannot be construed
122 |           as modifying the License.
123 | 
124 |       You may add Your own copyright statement to Your modifications and
125 |       may provide additional or different license terms and conditions
126 |       for use, reproduction, or distribution of Your modifications, or
127 |       for any such Derivative Works as a whole, provided Your use,
128 |       reproduction, and distribution of the Work otherwise complies with
129 |       the conditions stated in this License.
130 | 
131 |    5. Submission of Contributions. Unless You explicitly state otherwise,
132 |       any Contribution intentionally submitted for inclusion in the Work
133 |       by You to the Licensor shall be under the terms and conditions of
134 |       this License, without any additional terms or conditions.
135 |       Notwithstanding the above, nothing herein shall supersede or modify
136 |       the terms of any separate license agreement you may have executed
137 |       with Licensor regarding such Contributions.
138 | 
139 |    6. Trademarks. This License does not grant permission to use the trade
140 |       names, trademarks, service marks, or product names of the Licensor,
141 |       except as required for reasonable and customary use in describing the
142 |       origin of the Work and reproducing the content of the NOTICE file.
143 | 
144 |    7. Disclaimer of Warranty. Unless required by applicable law or
145 |       agreed to in writing, Licensor provides the Work (and each
146 |       Contributor provides its Contributions) on an "AS IS" BASIS,
147 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
148 |       implied, including, without limitation, any warranties or conditions
149 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
150 |       PARTICULAR PURPOSE. You are solely responsible for determining the
151 |       appropriateness of using or redistributing the Work and assume any
152 |       risks associated with Your exercise of permissions under this License.
153 | 
154 |    8. Limitation of Liability. In no event and under no legal theory,
155 |       whether in tort (including negligence), contract, or otherwise,
156 |       unless required by applicable law (such as deliberate and grossly
157 |       negligent acts) or agreed to in writing, shall any Contributor be
158 |       liable to You for damages, including any direct, indirect, special,
159 |       incidental, or consequential damages of any character arising as a
160 |       result of this License or out of the use or inability to use the
161 |       Work (including but not limited to damages for loss of goodwill,
162 |       work stoppage, computer failure or malfunction, or any and all
163 |       other commercial damages or losses), even if such Contributor
164 |       has been advised of the possibility of such damages.
165 | 
166 |    9. Accepting Warranty or Additional Liability. While redistributing
167 |       the Work or Derivative Works thereof, You may choose to offer,
168 |       and charge a fee for, acceptance of support, warranty, indemnity,
169 |       or other liability obligations and/or rights consistent with this
170 |       License. However, in accepting such obligations, You may act only
171 |       on Your own behalf and on Your sole responsibility, not on behalf
172 |       of any other Contributor, and only if You agree to indemnify,
173 |       defend, and hold each Contributor harmless for any liability
174 |       incurred by, or claims asserted against, such Contributor by reason
175 |       of your accepting any such warranty or additional liability.
176 | 
177 |    END OF TERMS AND CONDITIONS
178 | 
179 |    APPENDIX: How to apply the Apache License to your work.
180 | 
181 |       To apply the Apache License to your work, attach the following
182 |       boilerplate notice, with the fields enclosed by brackets "[]"
183 |       replaced with your own identifying information. (Don't include
184 |       the brackets!)  The text should be enclosed in the appropriate
185 |       comment syntax for the file format. We also recommend that a
186 |       file or class name and description of purpose be included on the
187 |       same "printed page" as the copyright notice for easier
188 |       identification within third-party archives.
189 | 
190 |    Copyright 2017 Tomoki Hayashi (Nagoya University)
191 | 
192 |    Licensed under the Apache License, Version 2.0 (the "License");
193 |    you may not use this file except in compliance with the License.
194 |    You may obtain a copy of the License at
195 | 
196 |        http://www.apache.org/licenses/LICENSE-2.0
197 | 
198 |    Unless required by applicable law or agreed to in writing, software
199 |    distributed under the License is distributed on an "AS IS" BASIS,
200 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
201 |    See the License for the specific language governing permissions and
202 |    limitations under the License.
203 | 


--------------------------------------------------------------------------------
/egs/ljspeech/sd/run.sh:
--------------------------------------------------------------------------------
  1 | #!/bin/bash
  2 | ############################################################
  3 | #           SCRIPT TO BUILD SD WAVENET VOCODER             #
  4 | ############################################################
  5 | 
  6 | # Copyright 2017 Tomoki Hayashi (Nagoya University)
  7 | #  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)
  8 | 
  9 | . ./path.sh || exit 1;
 10 | . ./cmd.sh || exit 1;
 11 | 
 12 | # USER SETTINGS {{{
 13 | #######################################
 14 | #           STAGE SETTING             #
 15 | #######################################
 16 | stage=0123456
 17 | # 0: data preparation step
 18 | # 1: feature extraction step
 19 | # 2: statistics calculation step
 20 | # 3: noise weighting step
 21 | # 4: training step
 22 | # 5: decoding step
 23 | # 6: noise shaping step
 24 | 
 25 | #######################################
 26 | #          FEATURE SETTING            #
 27 | #######################################
 28 | feature_type=world     # world or melspc (in this recipe fixed to "world")
 29 | minf0=40               # minimum f0
 30 | maxf0=400              # maximum f0
 31 | shiftms=5              # shift length in msec
 32 | fftl=1024              # fft length
 33 | highpass_cutoff=70     # highpass filter cutoff frequency (if 0, will not apply)
 34 | fs=22050               # sampling rate
 35 | mcep_dim=34            # dimension of mel-cepstrum
 36 | mcep_alpha=0.455       # alpha value of mel-cepstrum
 37 | use_noise_shaping=true # whether to use noise shaping
 38 | mag=0.5                # strength of noise shaping (0.0 < mag <= 1.0)
 39 | n_jobs=10              # number of parallel jobs
 40 | 
 41 | #######################################
 42 | #          TRAINING SETTING           #
 43 | #######################################
 44 | n_gpus=1                  # number of gpus
 45 | n_quantize=256            # number of quantization of waveform
 46 | n_aux=39                  # number of auxiliary features
 47 | n_resch=512               # number of residual channels
 48 | n_skipch=256              # number of skip channels
 49 | dilation_depth=10         # dilation depth (e.g. if set 10, max dilation = 2^(10-1))
 50 | dilation_repeat=3         # number of dilation repeats
 51 | kernel_size=3             # kernel size of dilated convolution
 52 | lr=1e-4                   # learning rate
 53 | weight_decay=0.0          # weight decay coef
 54 | iters=200000              # number of iterations
 55 | batch_length=15000        # batch length
 56 | batch_size=1              # batch size
 57 | checkpoint_interval=10000 # save model per this number
 58 | use_upsampling=true       # whether to use upsampling layer
 59 | resume=""                 # checkpoint path to resume (Optional)
 60 | 
 61 | #######################################
 62 | #          DECODING SETTING           #
 63 | #######################################
 64 | outdir=""            # directory to save decoded wav dir (Optional)
 65 | checkpoint=""        # checkpoint path to be used for decoding (Optional)
 66 | config=""            # model configuration path (Optional)
 67 | stats=""             # statistics path (Optional)
 68 | feats=""             # list or directory of feature files (Optional)
 69 | decode_batch_size=16 # batch size in decoding
 70 | 
 71 | #######################################
 72 | #            OTHER SETTING            #
 73 | #######################################
 74 | LJSPEECH_DB_ROOT=downloads # directory including DB (if DB not exists, it will be downloaded)
 75 | tag=""                     # tag for network directory naming (Optional)
 76 | 
 77 | # parse options
 78 | . parse_options.sh || exit 1;
 79 | 
 80 | # check feature type (only "world" is accepted here; diagnostics go to stderr)
 81 | if [ "${feature_type}" != "world" ]; then
 82 |     echo "This recipe does not support feature_type=\"melspc\"." >&2
 83 |     echo "Please try the egs/ljspeech/sd-melspc." >&2
 84 |     exit 1;
 85 | fi
 86 | 
 87 | # set directory names
 88 | train=tr
 89 | eval=ev
 90 | 
 91 | # stop when error occurred
 92 | set -euo pipefail
 93 | # }}}
 94 | 
 95 | 
 96 | # STAGE 0 {{{
 97 | if echo ${stage} | grep -q 0; then
 98 |     echo "###########################################################"
 99 |     echo "#                 DATA PREPARATION STEP                   #"
100 |     echo "###########################################################"
101 |     if [ ! -e ${LJSPEECH_DB_ROOT}/.done ];then
102 |         mkdir -p ${LJSPEECH_DB_ROOT}
103 |         cd ${LJSPEECH_DB_ROOT}
104 |         wget http://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2
105 |         tar -vxf ./*.tar.bz2
106 |         rm ./*.tar.bz2
107 |         cd ../
108 |         touch ${LJSPEECH_DB_ROOT}/.done
109 |         echo "database is successfully downloaded."
110 |     fi
111 |     [ ! -e data/local ] && mkdir -p data/local
112 |     [ ! -e data/${train} ] && mkdir -p data/${train}
113 |     [ ! -e data/${eval} ] && mkdir -p data/${eval}
114 |     find ${LJSPEECH_DB_ROOT}/LJSpeech-1.1/wavs -name "*.wav" \
115 |         | sort > data/local/wav.scp
116 |     grep -v LJ050 data/local/wav.scp > data/${train}/wav.scp
117 |     grep LJ050 data/local/wav.scp > data/${eval}/wav.scp
118 |     echo "making wav list for training is successfully done. (#training = $(wc -l < data/${train}/wav.scp))"
119 |     echo "making wav list for evaluation is successfully done. (#evaluation = $(wc -l < data/${eval}/wav.scp))"
120 | fi
121 | # }}}
122 | 
123 | 
124 | # STAGE 1: extract features for the train and eval sets {{{
125 | if echo ${stage} | grep -q 1; then
126 |     echo "###########################################################"
127 |     echo "#               FEATURE EXTRACTION STEP                   #"
128 |     echo "###########################################################"
129 |     for set in ${train} ${eval};do
130 |         # feature extraction for the current set
131 |         ${train_cmd} --num-threads ${n_jobs} exp/feature_extract/feature_extract_${set}.log \
132 |             feature_extract.py \
133 |                 --waveforms data/${set}/wav.scp \
134 |                 --wavdir wav_hpf/${set} \
135 |                 --hdf5dir hdf5/${set} \
136 |                 --feature_type ${feature_type} \
137 |                 --fs ${fs} \
138 |                 --shiftms ${shiftms} \
139 |                 --minf0 ${minf0} \
140 |                 --maxf0 ${maxf0} \
141 |                 --mcep_dim ${mcep_dim} \
142 |                 --mcep_alpha ${mcep_alpha} \
143 |                 --highpass_cutoff ${highpass_cutoff} \
144 |                 --fftl ${fftl} \
145 |                 --n_jobs ${n_jobs}
146 | 
147 |         # check the number of feature files (stdin redirect: wc prints the count only)
148 |         n_wavs=$(wc -l < data/${set}/wav.scp)
149 |         n_feats=$(find hdf5/${set} -name "*.h5" | wc -l)
150 |         echo "${n_feats}/${n_wavs} files are successfully processed."
151 | 
152 |         # make scp files
153 |         if [ ${highpass_cutoff} -eq 0 ];then
154 |             cp data/${set}/wav.scp data/${set}/wav_hpf.scp
155 |         else
156 |             find wav_hpf/${set} -name "*.wav" | sort > data/${set}/wav_hpf.scp
157 |         fi
158 |         find hdf5/${set} -name "*.h5" | sort > data/${set}/feats.scp
159 |     done
160 | fi
161 | # }}}
162 | 
163 | 
164 | # STAGE 2: run calc_stats.py on the training feature list, writing data/${train}/stats.h5 {{{
165 | if echo ${stage} | grep -q 2; then
166 |     echo "###########################################################"
167 |     echo "#              CALCULATE STATISTICS STEP                  #"
168 |     echo "###########################################################"
169 |     ${train_cmd} exp/calculate_statistics/calc_stats_${train}.log \
170 |         calc_stats.py \
171 |             --feats data/${train}/feats.scp \
172 |             --stats data/${train}/stats.h5 \
173 |             --feature_type ${feature_type}
174 |     echo "statistics are successfully calculated."
175 | fi
176 | # }}}
177 | 
178 | 
179 | # STAGE 3: noise weighting of training wavs (skipped unless use_noise_shaping is true) {{{
180 | if echo ${stage} | grep -q 3 && ${use_noise_shaping}; then
181 |     echo "###########################################################"
182 |     echo "#                  NOISE WEIGHTING STEP                   #"
183 |     echo "###########################################################"
184 |     ${train_cmd} --num-threads ${n_jobs} exp/noise_shaping/noise_shaping_apply_${train}.log \
185 |         noise_shaping.py \
186 |             --waveforms data/${train}/wav_hpf.scp \
187 |             --stats data/${train}/stats.h5 \
188 |             --outdir wav_nwf/${train} \
189 |             --feature_type ${feature_type} \
190 |             --fs ${fs} \
191 |             --shiftms ${shiftms} \
192 |             --mcep_dim_start 2 \
193 |             --mcep_dim_end $(( 2 + mcep_dim + 1 )) \
194 |             --mcep_alpha ${mcep_alpha} \
195 |             --mag ${mag} \
196 |             --inv true \
197 |             --n_jobs ${n_jobs}
198 | 
199 |     # check the number of noise-weighted wav files (stdin redirect: wc prints the count only)
200 |     n_wavs=$(wc -l < data/${train}/wav_hpf.scp)
201 |     n_ns=$(find wav_nwf/${train} -name "*.wav" | wc -l)
202 |     echo "${n_ns}/${n_wavs} files are successfully processed."
203 | 
204 |     # make scp files
205 |     find wav_nwf/${train} -name "*.wav" | sort > data/${train}/wav_nwf.scp
206 | fi # }}}
207 | 
208 | 
209 | # STAGE 4: WaveNet training (upsampling_factor = shiftms * fs / 1000, integer-truncated by bc) {{{
210 | # set variables (expdir is built outside the stage guard because STAGE 5 also reads it)
211 | if [ ! -n "${tag}" ];then
212 |     expdir=exp/tr_ljspeech_22k_sd_${feature_type}_nq${n_quantize}_na${n_aux}_nrc${n_resch}_nsc${n_skipch}_ks${kernel_size}_dp${dilation_depth}_dr${dilation_repeat}_lr${lr}_wd${weight_decay}_bl${batch_length}_bs${batch_size}
213 |     if ${use_noise_shaping};then
214 |         expdir=${expdir}_ns
215 |     fi
216 |     if ${use_upsampling};then
217 |         expdir=${expdir}_up
218 |     fi
219 | else
220 |     expdir=exp/tr_ljspeech_22k_${tag}
221 | fi
222 | if echo ${stage} | grep -q 4; then
223 |     echo "###########################################################"
224 |     echo "#               WAVENET TRAINING STEP                     #"
225 |     echo "###########################################################"
226 |     if ${use_noise_shaping};then
227 |         waveforms=data/${train}/wav_nwf.scp
228 |     else
229 |         waveforms=data/${train}/wav_hpf.scp
230 |     fi
231 |     upsampling_factor=$(echo "${shiftms} * ${fs} / 1000" | bc)
232 |     [ ! -e ${expdir}/log ] && mkdir -p ${expdir}/log
233 |     [ ! -e ${expdir}/stats.h5 ] && cp -v data/${train}/stats.h5 ${expdir}
234 |     ${cuda_cmd} --gpu ${n_gpus} "${expdir}/log/${train}.log" \
235 |         train.py \
236 |             --n_gpus ${n_gpus} \
237 |             --waveforms ${waveforms} \
238 |             --feats data/${train}/feats.scp \
239 |             --stats data/${train}/stats.h5 \
240 |             --expdir "${expdir}" \
241 |             --feature_type ${feature_type} \
242 |             --n_quantize ${n_quantize} \
243 |             --n_aux ${n_aux} \
244 |             --n_resch ${n_resch} \
245 |             --n_skipch ${n_skipch} \
246 |             --dilation_depth ${dilation_depth} \
247 |             --dilation_repeat ${dilation_repeat} \
248 |             --kernel_size ${kernel_size} \
249 |             --lr ${lr} \
250 |             --weight_decay ${weight_decay} \
251 |             --iters ${iters} \
252 |             --batch_length ${batch_length} \
253 |             --batch_size ${batch_size} \
254 |             --checkpoint_interval ${checkpoint_interval} \
255 |             --upsampling_factor "${upsampling_factor}" \
256 |             --use_upsampling_layer ${use_upsampling} \
257 |             --resume "${resume}"
258 | fi
259 | # }}}
260 | 
261 | 
262 | # STAGE 5: decoding (empty outdir/checkpoint/config/stats/feats get the defaults set below) {{{
263 | [ ! -n "${outdir}" ] && outdir=${expdir}/wav
264 | [ ! -n "${checkpoint}" ] && checkpoint=${expdir}/checkpoint-final.pkl
265 | [ ! -n "${config}" ] && config=$(dirname ${checkpoint})/model.conf
266 | [ ! -n "${stats}" ] && stats=$(dirname ${checkpoint})/stats.h5
267 | [ ! -n "${feats}" ] && feats=data/${eval}/feats.scp
268 | if echo ${stage} | grep -q 5; then
269 |     echo "###########################################################"
270 |     echo "#               WAVENET DECODING STEP                     #"
271 |     echo "###########################################################"
272 |     ${cuda_cmd} --gpu ${n_gpus} "${outdir}"/log/decode.log \
273 |         decode.py \
274 |             --n_gpus ${n_gpus} \
275 |             --feats ${feats} \
276 |             --stats ${stats} \
277 |             --outdir "${outdir}" \
278 |             --checkpoint "${checkpoint}" \
279 |             --config "${config}" \
280 |             --fs ${fs} \
281 |             --batch_size ${decode_batch_size}
282 | fi
283 | # }}}
284 | 
285 | 
286 | # STAGE 6: run noise_shaping.py with --inv false on the decoded wavs, writing to ${outdir}_nsf {{{
287 | if echo ${stage} | grep -q 6 && ${use_noise_shaping}; then
288 |     echo "###########################################################"
289 |     echo "#                  NOISE SHAPING STEP                     #"
290 |     echo "###########################################################"
291 |     find "${outdir}" -name "*.wav" | sort > ${outdir}/wav.scp
292 |     ${train_cmd} --num-threads ${n_jobs} exp/noise_shaping/noise_shaping_restore_${eval}.log \
293 |         noise_shaping.py \
294 |             --waveforms ${outdir}/wav.scp \
295 |             --stats ${stats} \
296 |             --outdir "${outdir}_nsf" \
297 |             --feature_type ${feature_type} \
298 |             --fs ${fs} \
299 |             --shiftms ${shiftms} \
300 |             --mcep_dim_start 2 \
301 |             --mcep_dim_end $(( 2 + mcep_dim + 1 )) \
302 |             --mcep_alpha ${mcep_alpha} \
303 |             --mag ${mag} \
304 |             --n_jobs ${n_jobs} \
305 |             --inv false
306 | fi
307 | # }}}
308 | 


--------------------------------------------------------------------------------
/egs/arctic/sd-mini/run.sh:
--------------------------------------------------------------------------------
  1 | #!/bin/bash
  2 | ############################################################
  3 | #         DEMO SCRIPT TO BUILD SD WAVENET VOCODER          #
  4 | ############################################################
  5 | 
  6 | # Copyright 2017 Tomoki Hayashi (Nagoya University)
  7 | #  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)
  8 | 
  9 | . ./path.sh || exit 1;
 10 | 
 11 | # USER SETTINGS {{{
 12 | #######################################
 13 | #           STAGE SETTING             #
 14 | #######################################
 15 | stage=0123456
 16 | # 0: data preparation step
 17 | # 1: feature extraction step
 18 | # 2: statistics calculation step
 19 | # 3: noise weighting step
 20 | # 4: training step
 21 | # 5: decoding step
 22 | # 6: noise shaping step
 23 | 
 24 | #######################################
 25 | #          FEATURE SETTING            #
 26 | #######################################
 27 | feature_type=world     # world or melspc (in this recipe fixed to "world")
 28 | spk=slt                # target speaker in arctic
 29 | minf0=""               # minimum f0 (if not set, conf/*.f0 will be used)
 30 | maxf0=""               # maximum f0 (if not set, conf/*.f0 will be used)
 31 | shiftms=5              # shift length in msec
 32 | fftl=1024              # fft length
 33 | highpass_cutoff=70     # highpass filter cutoff frequency (if 0, will not apply)
 34 | fs=16000               # sampling rate
 35 | mcep_dim=24            # dimension of mel-cepstrum
 36 | mcep_alpha=0.410       # alpha value of mel-cepstrum
 37 | use_noise_shaping=true # whether to use noise shaping
 38 | mag=0.5                # strength of noise shaping (0.0 < mag <= 1.0)
 39 | n_jobs=10              # number of parallel jobs
 40 | 
 41 | #######################################
 42 | #          TRAINING SETTING           #
 43 | #######################################
 44 | n_gpus=1                # number of gpus
 45 | n_quantize=256          # number of quantization of waveform
 46 | n_aux=28                # number of auxiliary features
 47 | n_resch=32              # number of residual channels
 48 | n_skipch=16             # number of skip channels
 49 | dilation_depth=5        # dilation depth (e.g. if set 10, max dilation = 2^(10-1))
 50 | dilation_repeat=1       # number of dilation repeats
 51 | kernel_size=2           # kernel size of dilated convolution
 52 | lr=1e-4                 # learning rate
 53 | weight_decay=0.0        # weight decay coef
 54 | iters=1000              # number of iterations
 55 | batch_length=10000      # batch length
 56 | batch_size=1            # batch size
 57 | checkpoint_interval=100 # save model per this number
 58 | use_upsampling=true     # whether to use upsampling layer
 59 | resume=""               # checkpoint path to resume (Optional)
 60 | 
 61 | #######################################
 62 | #          DECODING SETTING           #
 63 | #######################################
 64 | outdir=""           # directory to save decoded wav dir (Optional)
 65 | checkpoint=""       # checkpoint path to be used for decoding (Optional)
 66 | config=""           # model configuration path (Optional)
 67 | stats=""            # statistics path (Optional)
 68 | feats=""            # list or directory of feature files (Optional)
 69 | decode_batch_size=4 # batch size in decoding
 70 | 
 71 | #######################################
 72 | #            OTHER SETTING            #
 73 | #######################################
 74 | download_dir=downloads # download directory to save corpus
 75 | download_url="https://drive.google.com/open?id=1NIia89CL2qqqDzNNc718wycRmI_jkLxR" # download URL on Google Drive
 76 | tag="" # tag for network directory naming (Optional)
 77 | 
 78 | # This enables argparse-like parsing of the above variables, e.g. ./run.sh --stage 0
 79 | . parse_options.sh || exit 1;
 80 | 
 81 | # check feature type (only "world" is accepted here; diagnostics go to stderr)
 82 | if [ "${feature_type}" != "world" ]; then
 83 |     echo "This recipe does not support feature_type=\"melspc\"." >&2
 84 |     echo "Please try the egs/*/*-melspc." >&2
 85 |     exit 1;
 86 | fi
 87 | 
 88 | # set directory names
 89 | train=tr_${spk}
 90 | eval=ev_${spk}
 91 | 
 92 | # stop when error occurred
 93 | set -euo pipefail
 94 | # }}}
 95 | 
 96 | 
 97 | # STAGE 0: download the corpus once and build the train/eval wav lists {{{
 98 | if echo ${stage} | grep -q 0; then
 99 |     echo "###########################################################"
100 |     echo "#                 DATA PREPARATION STEP                   #"
101 |     echo "###########################################################"
102 |     # download dataset (skipped when the ${download_dir}/.done marker exists)
103 |     if [ ! -e ${download_dir}/.done ];then
104 |         download_from_google_drive.sh "${download_url}" ${download_dir} tar.gz
105 |         touch ${download_dir}/.done
106 |         echo "database is successfully downloaded."
107 |     fi
108 | 
109 |     # directory check
110 |     [ ! -e data/local ] && mkdir -p data/local
111 |     [ ! -e data/${train} ] && mkdir -p data/${train}
112 |     [ ! -e data/${eval} ] && mkdir -p data/${eval}
113 | 
114 |     # make a sorted list of all of the utterances
115 |     find "${download_dir}/cmu_us_${spk}_arctic_mini/wav" -name "*.wav" \
116 |         | sort > "data/local/wav.${spk}.scp"
117 | 
118 |     # use the first 32 utterances of the sorted list as training data
119 |     head -n 32 "data/local/wav.${spk}.scp" > data/${train}/wav.scp
120 |     echo "making wav list for training is successfully done. (#training = $(wc -l < data/${train}/wav.scp))"
121 | 
122 |     # use the last 4 utterances of the sorted list as evaluation data
123 |     tail -n 4 "data/local/wav.${spk}.scp" > data/${eval}/wav.scp
124 |     echo "making wav list for evaluation is successfully done. (#evaluation = $(wc -l < data/${eval}/wav.scp))"
125 | fi
126 | # }}}
127 | 
128 | 
129 | # STAGE 1: extract features for the train and eval sets {{{
130 | if echo ${stage} | grep -q 1; then
131 |     echo "###########################################################"
132 |     echo "#               FEATURE EXTRACTION STEP                   #"
133 |     echo "###########################################################"
134 |     [ ! -n "${minf0}" ] && minf0=$(awk '{print $1}' conf/${spk}.f0)
135 |     [ ! -n "${maxf0}" ] && maxf0=$(awk '{print $2}' conf/${spk}.f0)
136 |     [ ! -e exp/feature_extract ] && mkdir -p exp/feature_extract
137 |     for set in ${train} ${eval};do
138 |         [ "${set}" = "${train}" ] && save_wav=true || save_wav=false
139 |         feature_extract.py \
140 |             --waveforms data/${set}/wav.scp \
141 |             --wavdir wav_hpf/${set} \
142 |             --hdf5dir hdf5/${set} \
143 |             --feature_type ${feature_type} \
144 |             --fs ${fs} \
145 |             --shiftms ${shiftms} \
146 |             --minf0 "${minf0}" \
147 |             --maxf0 "${maxf0}" \
148 |             --mcep_dim ${mcep_dim} \
149 |             --mcep_alpha ${mcep_alpha} \
150 |             --highpass_cutoff ${highpass_cutoff} \
151 |             --fftl ${fftl} \
152 |             --save_wav ${save_wav} \
153 |             --n_jobs ${n_jobs} 2>&1 | tee exp/feature_extract/feature_extract_${set}.log
154 | 
155 |         # check the number of feature files (stdin redirect: wc prints the count only)
156 |         n_wavs=$(wc -l < data/${set}/wav.scp)
157 |         n_feats=$(find hdf5/${set} -name "*.h5" | wc -l)
158 |         echo "${n_feats}/${n_wavs} files are successfully processed."
159 | 
160 |         # make scp files
161 |         if [ ${highpass_cutoff} -eq 0 ];then
162 |             cp data/${set}/wav.scp data/${set}/wav_hpf.scp
163 |         elif ${save_wav}; then
164 |             find wav_hpf/${set} -name "*.wav" | sort > data/${set}/wav_hpf.scp
165 |         fi
166 |         find hdf5/${set} -name "*.h5" | sort > data/${set}/feats.scp
167 |     done
168 | fi
169 | # }}}
170 | 
171 | 
172 | # STAGE 2: training-set statistics (stderr is captured in the log too, as in the other stages) {{{
173 | if echo ${stage} | grep -q 2; then
174 |     echo "###########################################################"
175 |     echo "#              CALCULATE STATISTICS STEP                  #"
176 |     echo "###########################################################"
177 |     [ ! -e exp/calculate_statistics ] && mkdir -p exp/calculate_statistics
178 |     calc_stats.py \
179 |         --feats data/${train}/feats.scp \
180 |         --stats data/${train}/stats.h5 \
181 |         --feature_type ${feature_type} 2>&1 | tee exp/calculate_statistics/calc_stats_${train}.log
182 |     echo "statistics are successfully calculated."
183 | fi
184 | # }}}
185 | 
186 | 
187 | # STAGE 3: noise weighting of training wavs (skipped unless use_noise_shaping is true) {{{
188 | if echo ${stage} | grep -q 3 && ${use_noise_shaping}; then
189 |     echo "###########################################################"
190 |     echo "#                  NOISE WEIGHTING STEP                   #"
191 |     echo "###########################################################"
192 |     [ ! -e exp/noise_shaping ] && mkdir -p exp/noise_shaping
193 |     noise_shaping.py \
194 |         --waveforms data/${train}/wav_hpf.scp \
195 |         --stats data/${train}/stats.h5 \
196 |         --outdir wav_nwf/${train} \
197 |         --feature_type ${feature_type} \
198 |         --fs ${fs} \
199 |         --shiftms ${shiftms} \
200 |         --mcep_dim_start 2 \
201 |         --mcep_dim_end $(( 2 + mcep_dim + 1 )) \
202 |         --mcep_alpha ${mcep_alpha} \
203 |         --mag ${mag} \
204 |         --inv true \
205 |         --n_jobs ${n_jobs} 2>&1 | tee exp/noise_shaping/noise_shaping_apply_${train}.log
206 | 
207 |     # check the number of noise-weighted wav files (stdin redirect: wc prints the count only)
208 |     n_wavs=$(wc -l < data/${train}/wav_hpf.scp)
209 |     n_ns=$(find wav_nwf/${train} -name "*.wav" | wc -l)
210 |     echo "${n_ns}/${n_wavs} files are successfully processed."
211 | 
212 |     # make scp files
213 |     find wav_nwf/${train} -name "*.wav" | sort > data/${train}/wav_nwf.scp
214 | fi # }}}
215 | 
216 | 
217 | # STAGE 4: WaveNet training (upsampling_factor = shiftms * fs / 1000, computed with bc) {{{
218 | # set variables (expdir is built outside the stage guard because STAGE 5 also reads it)
219 | if [ ! -n "${tag}" ];then
220 |     expdir=exp/tr_arctic_16k_sd_${feature_type}_${spk}_nq${n_quantize}_na${n_aux}_nrc${n_resch}_nsc${n_skipch}_ks${kernel_size}_dp${dilation_depth}_dr${dilation_repeat}_lr${lr}_wd${weight_decay}_bl${batch_length}_bs${batch_size}
221 |     if ${use_noise_shaping};then
222 |         expdir=${expdir}_ns
223 |     fi
224 |     if ${use_upsampling};then
225 |         expdir=${expdir}_up
226 |     fi
227 | else
228 |     expdir=exp/tr_arctic_${tag}
229 | fi
230 | if echo ${stage} | grep -q 4; then
231 |     echo "###########################################################"
232 |     echo "#               WAVENET TRAINING STEP                     #"
233 |     echo "###########################################################"
234 |     if ${use_noise_shaping};then
235 |         waveforms=data/${train}/wav_nwf.scp
236 |     else
237 |         waveforms=data/${train}/wav_hpf.scp
238 |     fi
239 |     upsampling_factor=$(echo "${shiftms} * ${fs} / 1000" | bc)
240 |     [ ! -e ${expdir}/log ] && mkdir -p ${expdir}/log
241 |     [ ! -e ${expdir}/stats.h5 ] && cp -v data/${train}/stats.h5 ${expdir}
242 |     train.py \
243 |         --n_gpus ${n_gpus} \
244 |         --waveforms ${waveforms} \
245 |         --feats data/${train}/feats.scp \
246 |         --stats data/${train}/stats.h5 \
247 |         --expdir "${expdir}" \
248 |         --feature_type ${feature_type} \
249 |         --n_quantize ${n_quantize} \
250 |         --n_aux ${n_aux} \
251 |         --n_resch ${n_resch} \
252 |         --n_skipch ${n_skipch} \
253 |         --dilation_depth ${dilation_depth} \
254 |         --dilation_repeat ${dilation_repeat} \
255 |         --kernel_size ${kernel_size} \
256 |         --lr ${lr} \
257 |         --weight_decay ${weight_decay} \
258 |         --iters ${iters} \
259 |         --batch_length ${batch_length} \
260 |         --batch_size ${batch_size} \
261 |         --checkpoint_interval ${checkpoint_interval} \
262 |         --upsampling_factor "${upsampling_factor}" \
263 |         --use_upsampling_layer ${use_upsampling} \
264 |         --resume "${resume}" 2>&1 | tee -a ${expdir}/log/${train}.log
265 | fi
266 | # }}}
267 | 
268 | 
269 | # STAGE 5: decoding (empty outdir/checkpoint/config/stats/feats get the defaults set below) {{{
270 | [ ! -n "${outdir}" ] && outdir=${expdir}/wav
271 | [ ! -n "${checkpoint}" ] && checkpoint=${expdir}/checkpoint-final.pkl
272 | [ ! -n "${config}" ] && config=$(dirname ${checkpoint})/model.conf
273 | [ ! -n "${stats}" ] && stats=$(dirname ${checkpoint})/stats.h5
274 | [ ! -n "${feats}" ] && feats=data/${eval}/feats.scp
275 | if echo ${stage} | grep -q 5; then
276 |     echo "###########################################################"
277 |     echo "#               WAVENET DECODING STEP                     #"
278 |     echo "###########################################################"
279 |     [ ! -e ${outdir}/log ] && mkdir -p ${outdir}/log
280 |     decode.py \
281 |         --n_gpus ${n_gpus} \
282 |         --feats ${feats} \
283 |         --stats "${stats}" \
284 |         --outdir "${outdir}" \
285 |         --checkpoint "${checkpoint}" \
286 |         --config "${config}" \
287 |         --fs ${fs} \
288 |         --batch_size ${decode_batch_size} 2>&1 | tee ${outdir}/log/decode.log
289 | fi
290 | # }}}
291 | 
292 | 
293 | # STAGE 6: noise shaping of decoded wavs (--inv false), written to ${outdir}_nsf {{{
294 | if echo ${stage} | grep -q 6 && ${use_noise_shaping}; then
295 |     echo "###########################################################"
296 |     echo "#                  NOISE SHAPING STEP                     #"
297 |     echo "###########################################################"
298 |     find "${outdir}" -name "*.wav" | sort > ${outdir}/wav.scp
299 |     [ ! -e exp/noise_shaping ] && mkdir -p exp/noise_shaping
300 |     # pass the same feature/mcep/mag options as the noise weighting step so
301 |     # that user overrides (e.g. --mag) apply to both directions
302 |     noise_shaping.py --waveforms ${outdir}/wav.scp --stats "${stats}" \
303 |         --outdir ${outdir}_nsf --feature_type ${feature_type} \
304 |         --fs ${fs} --shiftms ${shiftms} --mcep_dim_start 2 \
305 |         --mcep_dim_end $(( 2 + mcep_dim + 1 )) --mcep_alpha ${mcep_alpha} \
306 |         --mag ${mag} --n_jobs ${n_jobs} \
307 |         --inv false 2>&1 | tee exp/noise_shaping/noise_shaping_restore_${eval}.log
308 | fi
309 | # }}}
310 | 


--------------------------------------------------------------------------------
/egs/arctic/sd/run.sh:
--------------------------------------------------------------------------------
  1 | #!/bin/bash
  2 | ############################################################
  3 | #           SCRIPT TO BUILD SD WAVENET VOCODER             #
  4 | ############################################################
  5 | 
  6 | # Copyright 2017 Tomoki Hayashi (Nagoya University)
  7 | #  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)
  8 | 
  9 | . ./path.sh || exit 1;
 10 | . ./cmd.sh || exit 1;
 11 | 
 12 | # USER SETTINGS {{{
 13 | #######################################
 14 | #           STAGE SETTING             #
 15 | #######################################
 16 | stage=0123456
 17 | # 0: data preparation step
 18 | # 1: feature extraction step
 19 | # 2: statistics calculation step
 20 | # 3: noise weighting step
 21 | # 4: training step
 22 | # 5: decoding step
 23 | # 6: noise shaping step
 24 | 
 25 | #######################################
 26 | #          FEATURE SETTING            #
 27 | #######################################
 28 | feature_type=world     # world or melspc (in this recipe fixed to "world")
 29 | spk=slt                # target speaker in arctic
 30 | minf0=""               # minimum f0 (if not set, conf/*.f0 will be used)
 31 | maxf0=""               # maximum f0 (if not set, conf/*.f0 will be used)
 32 | shiftms=5              # shift length in msec
 33 | fftl=1024              # fft length
 34 | highpass_cutoff=70     # highpass filter cutoff frequency (if 0, will not apply)
 35 | fs=16000               # sampling rate
 36 | mcep_dim=24            # dimension of mel-cepstrum
 37 | mcep_alpha=0.410       # alpha value of mel-cepstrum
 38 | use_noise_shaping=true # whether to use noise shaping
 39 | mag=0.5                # strength of noise shaping (0.0 < mag <= 1.0)
 40 | n_jobs=10              # number of parallel jobs
 41 | 
 42 | #######################################
 43 | #          TRAINING SETTING           #
 44 | #######################################
 45 | n_gpus=1                  # number of gpus
 46 | n_quantize=256            # number of quantization of waveform
 47 | n_aux=28                  # number of auxiliary features
 48 | n_resch=512               # number of residual channels
 49 | n_skipch=256              # number of skip channels
 50 | dilation_depth=10         # dilation depth (e.g. if set 10, max dilation = 2^(10-1))
 51 | dilation_repeat=3         # number of dilation repeats
 52 | kernel_size=2             # kernel size of dilated convolution
 53 | lr=1e-4                   # learning rate
 54 | weight_decay=0.0          # weight decay coef
 55 | iters=200000              # number of iterations
 56 | batch_length=20000        # batch length
 57 | batch_size=1              # batch size
 58 | checkpoint_interval=10000 # save model per this number
 59 | use_upsampling=true       # whether to use upsampling layer
 60 | resume=""                 # checkpoint path to resume (Optional)
 61 | 
 62 | #######################################
 63 | #          DECODING SETTING           #
 64 | #######################################
 65 | outdir=""            # directory to save decoded wav dir (Optional)
 66 | checkpoint=""        # checkpoint path to be used for decoding (Optional)
 67 | config=""            # model configuration path (Optional)
 68 | stats=""             # statistics path (Optional)
 69 | feats=""             # list or directory of feature files (Optional)
 70 | decode_batch_size=32 # batch size in decoding
 71 | 
 72 | #######################################
 73 | #            OTHER SETTING            #
 74 | #######################################
 75 | ARCTIC_DB_ROOT=downloads # directory including DB (if DB not exists, will be downloaded)
 76 | tag=""                   # tag for network directory naming (Optional)
 77 | 
 78 | # parse options
 79 | . parse_options.sh || exit 1;
 80 | 
 81 | # check feature type
 82 | if [ ${feature_type} != "world" ]; then
 83 |     echo "This recipe does not support feature_type=\"melspc\"." 2>&1
 84 |     echo "Please try the egs/arctic/sd-melspc." 2>&1
 85 |     exit 1;
 86 | fi
 87 | 
 88 | # set directory names
 89 | train=tr_${spk}
 90 | eval=ev_${spk}
 91 | 
 92 | # stop when error occurred
 93 | set -euo pipefail
 94 | # }}}
 95 | 
 96 | 
 97 | # STAGE 0 {{{
 98 | if echo ${stage} | grep -q 0; then
 99 |     echo "###########################################################"
100 |     echo "#                 DATA PREPARATION STEP                   #"
101 |     echo "###########################################################"
102 |     if [ ! -e ${ARCTIC_DB_ROOT}/.done ];then
103 |         mkdir -p ${ARCTIC_DB_ROOT}
104 |         cd ${ARCTIC_DB_ROOT}
105 |         for id in bdl slt rms clb jmk ksp awb;do
106 |             wget http://festvox.org/cmu_arctic/cmu_arctic/packed/cmu_us_${id}_arctic-0.95-release.tar.bz2
107 |             tar xf cmu_us_${id}*.tar.bz2
108 |         done
109 |         rm ./*.tar.bz2
110 |         cd ../
111 |         touch ${ARCTIC_DB_ROOT}/.done
112 |         echo "database is successfully downloaded."
113 |     fi
114 |     [ ! -e data/local ] && mkdir -p data/local
115 |     [ ! -e data/${train} ] && mkdir -p data/${train}
116 |     [ ! -e data/${eval} ] && mkdir -p data/${eval}
117 |     find "${ARCTIC_DB_ROOT}/cmu_us_${spk}_arctic/wav" -name "*.wav" \
118 |         | sort > "data/local/wav.${spk}.scp"
119 |     head -n 1028 "data/local/wav.${spk}.scp" >> "data/${train}/wav.scp"
120 |     tail -n 104 "data/local/wav.${spk}.scp" >> "data/${eval}/wav.scp"
121 |     echo "making wav list for training is successfully done. (#training = $(wc -l < data/${train}/wav.scp))"
122 |     echo "making wav list for evaluation is successfully done. (#evaluation = $(wc -l < data/${eval}/wav.scp))"
123 | fi
124 | # }}}
125 | 
126 | 
# STAGE 1 {{{
# Run feature_extract.py on the training and evaluation sets and write the
# wav_hpf.scp / feats.scp lists for each of them.
if echo ${stage} | grep -q 1; then
    echo "###########################################################"
    echo "#               FEATURE EXTRACTION STEP                   #"
    echo "###########################################################"
    # when the F0 range is not given, take it from conf/${spk}.f0
    # (first column: minf0, second column: maxf0)
    [ ! -n "${minf0}" ] && minf0=$(awk '{print $1}' conf/${spk}.f0)
    [ ! -n "${maxf0}" ] && maxf0=$(awk '{print $2}' conf/${spk}.f0)
    for set in ${train} ${eval};do
        # feature extraction for the current set (training, then evaluation)
        ${train_cmd} --num-threads ${n_jobs} exp/feature_extract/feature_extract_${set}.log \
            feature_extract.py \
                --waveforms data/${set}/wav.scp \
                --wavdir wav_hpf/${set} \
                --hdf5dir hdf5/${set} \
                --feature_type ${feature_type} \
                --fs ${fs} \
                --shiftms ${shiftms} \
                --minf0 "${minf0}" \
                --maxf0 "${maxf0}" \
                --mcep_dim ${mcep_dim} \
                --mcep_alpha ${mcep_alpha} \
                --highpass_cutoff ${highpass_cutoff} \
                --fftl ${fftl} \
                --n_jobs ${n_jobs}

        # check the number of feature files
        # (wc reads stdin so that it prints the bare count without the file name)
        n_wavs=$(wc -l < data/${set}/wav.scp)
        n_feats=$(find hdf5/${set} -name "*.h5" | wc -l)
        echo "${n_feats}/${n_wavs} files are successfully processed."

        # make scp files
        # (with highpass_cutoff=0 the original wav list is reused as wav_hpf.scp)
        if [ ${highpass_cutoff} -eq 0 ];then
            cp data/${set}/wav.scp data/${set}/wav_hpf.scp
        else
            find wav_hpf/${set} -name "*.wav" | sort > data/${set}/wav_hpf.scp
        fi
        find hdf5/${set} -name "*.h5" | sort > data/${set}/feats.scp
    done
fi
# }}}
167 | 
168 | 
# STAGE 2 {{{
# Run calc_stats.py on the training feature list and store the result in
# data/${train}/stats.h5.
if echo ${stage} | grep -q 2; then
    cat << 'EOF'
###########################################################
#              CALCULATE STATISTICS STEP                  #
###########################################################
EOF
    stats_log=exp/calculate_statistics/calc_stats_${train}.log
    ${train_cmd} ${stats_log} \
        calc_stats.py \
            --feats data/${train}/feats.scp \
            --stats data/${train}/stats.h5 \
            --feature_type ${feature_type}
    echo "statistics are successfully calculated."
fi
# }}}
182 | 
183 | 
# STAGE 3 {{{
# Run noise_shaping.py (with --inv true) on the training waveforms listed in
# wav_hpf.scp and list the results in wav_nwf.scp.
# Skipped entirely when use_noise_shaping is false.
if echo ${stage} | grep -q 3 && ${use_noise_shaping}; then
    echo "###########################################################"
    echo "#                  NOISE WEIGHTING STEP                   #"
    echo "###########################################################"
    # NOTE(review): mcep_dim_start/end presumably select the mel-cepstrum
    # columns inside the feature vector -- confirm against noise_shaping.py
    ${train_cmd} --num-threads ${n_jobs} exp/noise_shaping/noise_shaping_apply_${train}.log \
        noise_shaping.py \
            --waveforms data/${train}/wav_hpf.scp \
            --stats data/${train}/stats.h5 \
            --outdir wav_nwf/${train} \
            --feature_type ${feature_type} \
            --fs ${fs} \
            --shiftms ${shiftms} \
            --mcep_dim_start 2 \
            --mcep_dim_end $(( 2 + mcep_dim +1 )) \
            --mcep_alpha ${mcep_alpha} \
            --mag ${mag} \
            --inv true \
            --n_jobs ${n_jobs}

    # check the number of processed waveform files
    # (wc reads stdin so that it prints the bare count without the file name)
    n_wavs=$(wc -l < data/${train}/wav_hpf.scp)
    n_ns=$(find wav_nwf/${train} -name "*.wav" | wc -l)
    echo "${n_ns}/${n_wavs} files are successfully processed."

    # make scp files
    find wav_nwf/${train} -name "*.wav" | sort > data/${train}/wav_nwf.scp
fi # }}}
212 | 
213 | 
# STAGE 4 {{{
# Resolve the experiment directory first: the decoding stage reads ${expdir}
# as well, so it is set even when stage 4 itself is not selected.
if [ -n "${tag}" ];then
    # an explicit tag replaces the hyper-parameter based name
    expdir=exp/tr_arctic_${tag}
else
    # the directory name encodes the training configuration
    expdir=exp/tr_arctic_16k_sd_${feature_type}_${spk}
    expdir=${expdir}_nq${n_quantize}_na${n_aux}_nrc${n_resch}_nsc${n_skipch}
    expdir=${expdir}_ks${kernel_size}_dp${dilation_depth}_dr${dilation_repeat}
    expdir=${expdir}_lr${lr}_wd${weight_decay}_bl${batch_length}_bs${batch_size}
    if ${use_noise_shaping};then
        expdir=${expdir}_ns
    fi
    if ${use_upsampling};then
        expdir=${expdir}_up
    fi
fi
if echo ${stage} | grep -q 4; then
    cat << 'EOF'
###########################################################
#               WAVENET TRAINING STEP                     #
###########################################################
EOF
    # train on the noise-weighted waveforms when noise shaping is enabled,
    # otherwise on the high-pass-filtered ones
    waveforms=data/${train}/wav_hpf.scp
    if ${use_noise_shaping};then
        waveforms=data/${train}/wav_nwf.scp
    fi
    # frame shift expressed in samples: shiftms * fs / 1000
    upsampling_factor=$(echo "${shiftms} * ${fs} / 1000" | bc)
    if [ ! -e ${expdir}/log ];then
        mkdir -p ${expdir}/log
    fi
    # keep a copy of the statistics next to the model
    if [ ! -e ${expdir}/stats.h5 ];then
        cp -v data/${train}/stats.h5 ${expdir}
    fi
    ${cuda_cmd} --gpu ${n_gpus} "${expdir}/log/${train}.log" \
        train.py \
            --n_gpus ${n_gpus} \
            --waveforms ${waveforms} \
            --feats data/${train}/feats.scp \
            --stats data/${train}/stats.h5 \
            --expdir "${expdir}" \
            --feature_type ${feature_type} \
            --n_quantize ${n_quantize} \
            --n_aux ${n_aux} \
            --n_resch ${n_resch} \
            --n_skipch ${n_skipch} \
            --dilation_depth ${dilation_depth} \
            --dilation_repeat ${dilation_repeat} \
            --kernel_size ${kernel_size} \
            --lr ${lr} \
            --weight_decay ${weight_decay} \
            --iters ${iters} \
            --batch_length ${batch_length} \
            --batch_size ${batch_size} \
            --checkpoint_interval ${checkpoint_interval} \
            --upsampling_factor "${upsampling_factor}" \
            --use_upsampling_layer ${use_upsampling} \
            --resume "${resume}"
fi
# }}}
265 | 
266 | 
# STAGE 5 {{{
# Fill in every decoding option that was left empty: outputs and the final
# checkpoint live under ${expdir}, config and statistics are looked up next
# to the checkpoint, and the evaluation feature list is decoded.
if [ -z "${outdir}" ];then
    outdir=${expdir}/wav
fi
if [ -z "${checkpoint}" ];then
    checkpoint=${expdir}/checkpoint-final.pkl
fi
if [ -z "${config}" ];then
    config=$(dirname ${checkpoint})/model.conf
fi
if [ -z "${stats}" ];then
    stats=$(dirname ${checkpoint})/stats.h5
fi
if [ -z "${feats}" ];then
    feats=data/${eval}/feats.scp
fi
if echo ${stage} | grep -q 5; then
    cat << 'EOF'
###########################################################
#               WAVENET DECODING STEP                     #
###########################################################
EOF
    ${cuda_cmd} --gpu ${n_gpus} "${outdir}/log/decode.log" \
        decode.py \
            --n_gpus ${n_gpus} \
            --feats ${feats} \
            --stats ${stats} \
            --outdir "${outdir}" \
            --checkpoint "${checkpoint}" \
            --config "${config}" \
            --fs ${fs} \
            --batch_size ${decode_batch_size}
fi
# }}}
289 | 
290 | 
# STAGE 6 {{{
# Run noise_shaping.py (with --inv false) on the decoded waveforms found in
# ${outdir}; results are written to ${outdir}_nsf.
# Skipped entirely when use_noise_shaping is false.
if echo ${stage} | grep -q 6 && ${use_noise_shaping}; then
    cat << 'EOF'
###########################################################
#                  NOISE SHAPING STEP                     #
###########################################################
EOF
    # list the decoded waveforms
    decoded_scp=${outdir}/wav.scp
    find "${outdir}" -name "*.wav" | sort > ${decoded_scp}
    ${train_cmd} --num-threads ${n_jobs} exp/noise_shaping/noise_shaping_restore_${eval}.log \
        noise_shaping.py \
            --waveforms ${decoded_scp} \
            --stats ${stats} \
            --outdir "${outdir}_nsf" \
            --feature_type ${feature_type} \
            --fs ${fs} \
            --shiftms ${shiftms} \
            --mcep_dim_start 2 \
            --mcep_dim_end $(( 2 + mcep_dim +1 )) \
            --mcep_alpha ${mcep_alpha} \
            --mag ${mag} \
            --n_jobs ${n_jobs} \
            --inv false
fi
# }}}
313 | 


--------------------------------------------------------------------------------
/wavenet_vocoder/bin/decode.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # -*- coding: utf-8 -*-
  3 | 
  4 | # Copyright 2017 Tomoki Hayashi (Nagoya University)
  5 | #  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)
  6 | 
  7 | import argparse
  8 | import logging
  9 | import math
 10 | import os
 11 | import sys
 12 | 
 13 | import numpy as np
 14 | import soundfile as sf
 15 | import torch
 16 | import torch.multiprocessing as mp
 17 | 
 18 | from sklearn.preprocessing import StandardScaler
 19 | from torchvision import transforms
 20 | 
 21 | from wavenet_vocoder.nets import decode_mu_law
 22 | from wavenet_vocoder.nets import encode_mu_law
 23 | from wavenet_vocoder.nets import WaveNet
 24 | from wavenet_vocoder.utils import extend_time
 25 | from wavenet_vocoder.utils import find_files
 26 | from wavenet_vocoder.utils import read_hdf5
 27 | from wavenet_vocoder.utils import read_txt
 28 | from wavenet_vocoder.utils import shape_hdf5
 29 | 
 30 | 
 31 | def pad_list(batch_list, pad_value=0.0):
 32 |     """PAD VALUE.
 33 | 
 34 |     Args:
 35 |         batch_list (list): List of batch, where the shape of i-th batch (T_i, C).
 36 |         pad_value (float): Value to pad.
 37 | 
 38 |     Returns:
 39 |         ndarray: Padded batch with the shape (B, T_max, C).
 40 | 
 41 |     """
 42 |     batch_size = len(batch_list)
 43 |     maxlen = max([batch.shape[0] for batch in batch_list])
 44 |     n_feats = batch_list[0].shape[-1]
 45 |     batch_pad = np.zeros((batch_size, maxlen, n_feats))
 46 |     for idx, batch in enumerate(batch_list):
 47 |         batch_pad[idx, :batch.shape[0]] = batch
 48 | 
 49 |     return batch_pad
 50 | 
 51 | 
 52 | def decode_generator(feat_list,
 53 |                      batch_size=32,
 54 |                      feature_type="world",
 55 |                      wav_transform=None,
 56 |                      feat_transform=None,
 57 |                      upsampling_factor=80,
 58 |                      use_upsampling_layer=True,
 59 |                      use_speaker_code=False):
 60 |     """GENERATE DECODING BATCH.
 61 | 
 62 |     Args:
 63 |         feat_list (list): List of feature files.
 64 |         batch_size (int): Batch size in decoding.
 65 |         feature_type (str): Feature type.
 66 |         wav_transform (func): Preprocessing function for waveform.
 67 |         feat_transform (func): Preprocessing function for aux feats.
 68 |         upsampling_factor (int): Upsampling factor.
 69 |         use_upsampling_layer (bool): Whether to use upsampling layer.
 70 |         use_speaker_code (bool): Whether to use speaker code>
 71 | 
 72 |     Returns:
 73 |         generator: Generator instance.
 74 | 
 75 |     """
 76 |     # ---------------------------
 77 |     # sample-by-sample generation
 78 |     # ---------------------------
 79 |     if batch_size == 1:
 80 |         for featfile in feat_list:
 81 |             x = np.zeros((1))
 82 |             h = read_hdf5(featfile, "/" + feature_type)
 83 |             if not use_upsampling_layer:
 84 |                 h = extend_time(h, upsampling_factor)
 85 |             if use_speaker_code:
 86 |                 sc = read_hdf5(featfile, "/speaker_code")
 87 |                 sc = np.tile(sc, [h.shape[0], 1])
 88 |                 h = np.concatenate([h, sc], axis=1)
 89 | 
 90 |             # perform pre-processing
 91 |             if wav_transform is not None:
 92 |                 x = wav_transform(x)
 93 |             if feat_transform is not None:
 94 |                 h = feat_transform(h)
 95 | 
 96 |             # convert to torch variable
 97 |             x = torch.from_numpy(x).long()
 98 |             h = torch.from_numpy(h).float()
 99 |             x = x.unsqueeze(0)  # 1 => 1 x 1
100 |             h = h.transpose(0, 1).unsqueeze(0)  # T x C => 1 x C x T
101 | 
102 |             # send to cuda
103 |             if torch.cuda.is_available():
104 |                 x = x.cuda()
105 |                 h = h.cuda()
106 | 
107 |             # get target length and file id
108 |             if not use_upsampling_layer:
109 |                 n_samples = h.size(2) - 1
110 |             else:
111 |                 n_samples = h.size(2) * upsampling_factor - 1
112 |             feat_id = os.path.basename(featfile).replace(".h5", "")
113 | 
114 |             yield feat_id, (x, h, n_samples)
115 | 
116 |     # ----------------
117 |     # batch generation
118 |     # ----------------
119 |     else:
120 |         # sort with the feature length
121 |         shape_list = [shape_hdf5(f, "/" + feature_type)[0] for f in feat_list]
122 |         idx = np.argsort(shape_list)
123 |         feat_list = [feat_list[i] for i in idx]
124 | 
125 |         # divide into batch list
126 |         n_batch = math.ceil(len(feat_list) / batch_size)
127 |         batch_lists = np.array_split(feat_list, n_batch)
128 |         batch_lists = [f.tolist() for f in batch_lists]
129 | 
130 |         for batch_list in batch_lists:
131 |             batch_x = []
132 |             batch_h = []
133 |             n_samples_list = []
134 |             feat_ids = []
135 |             for featfile in batch_list:
136 |                 # make seed waveform and load aux feature
137 |                 x = np.zeros((1))
138 |                 h = read_hdf5(featfile, "/" + feature_type)
139 |                 if not use_upsampling_layer:
140 |                     h = extend_time(h, upsampling_factor)
141 |                 if use_speaker_code:
142 |                     sc = read_hdf5(featfile, "/speaker_code")
143 |                     sc = np.tile(sc, [h.shape[0], 1])
144 |                     h = np.concatenate([h, sc], axis=1)
145 | 
146 |                 # perform pre-processing
147 |                 if wav_transform is not None:
148 |                     x = wav_transform(x)
149 |                 if feat_transform is not None:
150 |                     h = feat_transform(h)
151 | 
152 |                 # append to list
153 |                 batch_x += [x]
154 |                 batch_h += [h]
155 |                 if not use_upsampling_layer:
156 |                     n_samples_list += [h.shape[0] - 1]
157 |                 else:
158 |                     n_samples_list += [h.shape[0] * upsampling_factor - 1]
159 |                 feat_ids += [os.path.basename(featfile).replace(".h5", "")]
160 | 
161 |             # convert list to ndarray
162 |             batch_x = np.stack(batch_x, axis=0)
163 |             batch_h = pad_list(batch_h)
164 | 
165 |             # convert to torch variable
166 |             batch_x = torch.from_numpy(batch_x).long()
167 |             batch_h = torch.from_numpy(batch_h).float().transpose(1, 2)
168 | 
169 |             # send to cuda
170 |             if torch.cuda.is_available():
171 |                 batch_x = batch_x.cuda()
172 |                 batch_h = batch_h.cuda()
173 | 
174 |             yield feat_ids, (batch_x, batch_h, n_samples_list)
175 | 
176 | 
def main():
    """RUN DECODING.

    Parse the command line, load the model configuration and the feature
    statistics, split the feature files over ``--n_gpus`` worker processes
    and write one 16-bit PCM wav file per feature file into ``--outdir``.

    The process exits with status 1 when ``--feats`` is neither a directory
    nor a file, or when at least one worker process fails.

    Raises:
        FileNotFoundError: If the statistics file or the config file is missing.

    """
    parser = argparse.ArgumentParser()
    # decode setting
    parser.add_argument("--feats", required=True,
                        type=str, help="list or directory of aux feat files")
    parser.add_argument("--checkpoint", required=True,
                        type=str, help="model file")
    parser.add_argument("--outdir", required=True,
                        type=str, help="directory to save generated samples")
    parser.add_argument("--stats", default=None,
                        type=str, help="hdf5 file including statistics")
    parser.add_argument("--config", default=None,
                        type=str, help="configure file")
    parser.add_argument("--fs", default=16000,
                        type=int, help="sampling rate")
    parser.add_argument("--batch_size", default=32,
                        type=int, help="number of batch size in decoding")
    parser.add_argument("--n_gpus", default=1,
                        type=int, help="number of gpus")
    # other setting
    parser.add_argument("--intervals", default=1000,
                        type=int, help="log interval")
    parser.add_argument("--seed", default=1,
                        type=int, help="seed number")
    parser.add_argument("--verbose", default=1,
                        type=int, help="log level")
    args = parser.parse_args()

    # set log level
    # (the higher verbosity is tested first; testing "> 0" first made the
    # DEBUG branch unreachable)
    if args.verbose > 1:
        log_level = logging.DEBUG
    elif args.verbose > 0:
        log_level = logging.INFO
    else:
        log_level = logging.WARNING
    logging.basicConfig(level=log_level,
                        format='%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s',
                        datefmt='%m/%d/%Y %I:%M:%S')
    if log_level == logging.WARNING:
        logging.warning("logging is disabled.")

    # show arguments
    for key, value in vars(args).items():
        logging.info("%s = %s" % (key, str(value)))

    # check arguments
    # (statistics and config default to the files next to the checkpoint)
    if args.stats is None:
        args.stats = os.path.dirname(args.checkpoint) + "/stats.h5"
    if args.config is None:
        args.config = os.path.dirname(args.checkpoint) + "/model.conf"
    if not os.path.exists(args.stats):
        raise FileNotFoundError("statistics file is missing (%s)." % (args.stats))
    if not os.path.exists(args.config):
        raise FileNotFoundError("config file is missing (%s)." % (args.config))

    # check directory existence
    os.makedirs(args.outdir, exist_ok=True)

    # fix seed
    os.environ['PYTHONHASHSEED'] = str(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

    # fix slow computation of dilated conv
    # https://github.com/pytorch/pytorch/issues/15054#issuecomment-450191923
    torch.backends.cudnn.benchmark = True

    # load config
    # NOTE(review): torch.load unpickles the file, so config and checkpoint
    # files must come from a trusted source
    config = torch.load(args.config)

    # get file list
    if os.path.isdir(args.feats):
        feat_list = sorted(find_files(args.feats, "*.h5"))
    elif os.path.isfile(args.feats):
        feat_list = read_txt(args.feats)
    else:
        logging.error("--feats should be directory or list.")
        sys.exit(1)

    # prepare the file list for parallel decoding
    feat_lists = np.array_split(feat_list, args.n_gpus)
    feat_lists = [f_list.tolist() for f_list in feat_lists]

    # define transform
    # (the scaler is filled with the stored mean / scale instead of being fitted)
    scaler = StandardScaler()
    scaler.mean_ = read_hdf5(args.stats, "/" + config.feature_type + "/mean")
    scaler.scale_ = read_hdf5(args.stats, "/" + config.feature_type + "/scale")
    wav_transform = transforms.Compose([
        lambda x: encode_mu_law(x, config.n_quantize)])
    feat_transform = transforms.Compose([
        lambda x: scaler.transform(x)])

    # define gpu decode function
    def gpu_decode(feat_list, gpu):
        # set default gpu and do not track gradient
        torch.cuda.set_device(gpu)
        torch.set_grad_enabled(False)

        # define model and load parameters
        if config.use_upsampling_layer:
            upsampling_factor = config.upsampling_factor
        else:
            upsampling_factor = 0
        model = WaveNet(
            n_quantize=config.n_quantize,
            n_aux=config.n_aux,
            n_resch=config.n_resch,
            n_skipch=config.n_skipch,
            dilation_depth=config.dilation_depth,
            dilation_repeat=config.dilation_repeat,
            kernel_size=config.kernel_size,
            upsampling_factor=upsampling_factor)
        # load the parameters onto the CPU first, then move the model to the GPU
        model.load_state_dict(torch.load(
            args.checkpoint,
            map_location=lambda storage,
            loc: storage)["model"])
        model.eval()
        model.cuda()

        # define generator
        generator = decode_generator(
            feat_list,
            batch_size=args.batch_size,
            feature_type=config.feature_type,
            wav_transform=wav_transform,
            feat_transform=feat_transform,
            upsampling_factor=config.upsampling_factor,
            use_upsampling_layer=config.use_upsampling_layer,
            use_speaker_code=config.use_speaker_code)

        # decode
        if args.batch_size > 1:
            for feat_ids, (batch_x, batch_h, n_samples_list) in generator:
                logging.info("decoding start")
                samples_list = model.batch_fast_generate(
                    batch_x, batch_h, n_samples_list, args.intervals)
                for feat_id, samples in zip(feat_ids, samples_list):
                    wav = decode_mu_law(samples, config.n_quantize)
                    sf.write(args.outdir + "/" + feat_id + ".wav", wav, args.fs, "PCM_16")
                    logging.info("wrote %s.wav in %s." % (feat_id, args.outdir))
        else:
            for feat_id, (x, h, n_samples) in generator:
                logging.info("decoding %s (length = %d)" % (feat_id, n_samples))
                samples = model.fast_generate(x, h, n_samples, args.intervals)
                wav = decode_mu_law(samples, config.n_quantize)
                sf.write(args.outdir + "/" + feat_id + ".wav", wav, args.fs, "PCM_16")
                logging.info("wrote %s.wav in %s." % (feat_id, args.outdir))

    # parallel decode
    processes = []
    for gpu, feat_list in enumerate(feat_lists):
        # with more GPUs than feature files some splits are empty;
        # there is nothing to decode for them
        if len(feat_list) == 0:
            continue
        p = mp.Process(target=gpu_decode, args=(feat_list, gpu,))
        p.start()
        processes.append(p)

    # wait for all process
    for p in processes:
        p.join()

    # report worker failures through the exit status so that calling scripts
    # do not continue with missing wav files
    n_failed = sum(p.exitcode != 0 for p in processes)
    if n_failed > 0:
        logging.error("%d decoding process(es) failed." % n_failed)
        sys.exit(1)


if __name__ == "__main__":
    main()
343 | 


--------------------------------------------------------------------------------
/egs/ljspeech/sd-melspc/run.sh:
--------------------------------------------------------------------------------
  1 | #!/bin/bash
  2 | ############################################################
  3 | #           SCRIPT TO BUILD SD WAVENET VOCODER             #
  4 | ############################################################
  5 | 
  6 | # Copyright 2017 Tomoki Hayashi (Nagoya University)
  7 | #  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)
  8 | 
  9 | . ./path.sh || exit 1;
 10 | . ./cmd.sh || exit 1;
 11 | 
 12 | # USER SETTINGS {{{
 13 | #######################################
 14 | #           STAGE SETTING             #
 15 | #######################################
 16 | stage=0123456
 17 | # 0: data preparation step
 18 | # 1: feature extraction step
 19 | # 2: statistics calculation step
 20 | # 3: noise weighting step
 21 | # 4: training step
 22 | # 5: decoding step
 23 | # 6: noise shaping step
 24 | 
 25 | #######################################
 26 | #          FEATURE SETTING            #
 27 | #######################################
 28 | feature_type=melspc    # world or melspc (in this recipe fixed to "melspc")
 29 | shiftms=11.61          # shift length in msec (in point: shiftms * fs / 1000)
 30 | fftl=1024              # fft length
 31 | highpass_cutoff=70     # highpass filter cutoff frequency (if 0, will not apply)
 32 | fs=22050               # sampling rate
 33 | mspc_dim=80            # dimension of mel-spectrogram
 34 | mcep_dim=35            # dimension of mel-cepstrum
 35 | mcep_alpha=0.455       # alpha value of mel-cepstrum
 36 | fmin=""                # minimum frequency in melspc calculation
 37 | fmax=""                # maximum frequency in melspc calculation
 38 | use_noise_shaping=true # whether to use noise shaping
 39 | mag=0.5                # strength of noise shaping (0.0 < mag <= 1.0)
 40 | n_jobs=10              # number of parallel jobs
 41 | 
 42 | #######################################
 43 | #          TRAINING SETTING           #
 44 | #######################################
 45 | n_gpus=1                  # number of gpus
 46 | n_quantize=256            # number of quantization of waveform
 47 | n_aux=80                  # number of auxiliary features
 48 | n_resch=512               # number of residual channels
 49 | n_skipch=256              # number of skip channels
 50 | dilation_depth=10         # dilation depth (e.g. if set 10, max dilation = 2^(10-1))
 51 | dilation_repeat=3         # number of dilation repeats
 52 | kernel_size=3             # kernel size of dilated convolution
 53 | lr=1e-4                   # learning rate
 54 | weight_decay=0.0          # weight decay coef
 55 | iters=200000              # number of iterations
 56 | batch_length=15000        # batch length
 57 | batch_size=1              # batch size
 58 | checkpoint_interval=10000 # save model per this number
 59 | use_upsampling=true       # whether to use upsampling layer
 60 | resume=""                 # checkpoint path to resume (Optional)
 61 | 
 62 | #######################################
 63 | #          DECODING SETTING           #
 64 | #######################################
 65 | outdir=""            # directory to save decoded wav dir (Optional)
 66 | checkpoint=""        # checkpoint path to be used for decoding (Optional)
 67 | config=""            # model configuration path (Optional)
 68 | stats=""             # statistics path (Optional)
 69 | feats=""             # list or directory of feature files (Optional)
 70 | decode_batch_size=16 # batch size in decoding
 71 | 
 72 | #######################################
 73 | #            OTHER SETTING            #
 74 | #######################################
 75 | LJSPEECH_DB_ROOT=downloads # directory including DB (if DB not exists, will be downloaded)
 76 | tag=""                   # tag for network directory naming (Optional)
 77 | 
 78 | # parse options
 79 | . parse_options.sh || exit 1;
 80 | 
 81 | # check feature type
 82 | if [ ${feature_type} != "melspc" ]; then
 83 |     echo "This recipe does not support feature_type=\"world\"." 2>&1
 84 |     echo "Please try the egs/ljspeech/sd." 2>&1
 85 |     exit 1;
 86 | fi
 87 | 
 88 | # set directory names
 89 | train=tr
 90 | eval=ev
 91 | 
 92 | # stop when error occurred
 93 | set -euo pipefail
 94 | # }}}
 95 | 
 96 | 
 97 | # STAGE 0 {{{
 98 | if echo ${stage} | grep -q 0; then
 99 |     echo "###########################################################"
100 |     echo "#                 DATA PREPARATION STEP                   #"
101 |     echo "###########################################################"
102 |     if [ ! -e ${LJSPEECH_DB_ROOT}/.done ];then
103 |         mkdir -p ${LJSPEECH_DB_ROOT}
104 |         cd ${LJSPEECH_DB_ROOT}
105 |         wget http://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2
106 |         tar -vxf ./*.tar.bz2
107 |         rm ./*.tar.bz2
108 |         cd ../
109 |         touch ${LJSPEECH_DB_ROOT}/.done
110 |         echo "database is successfully downloaded."
111 |     fi
112 |     [ ! -e data/local ] && mkdir -p data/local
113 |     [ ! -e data/${train} ] && mkdir -p data/${train}
114 |     [ ! -e data/${eval} ] && mkdir -p data/${eval}
115 |     find ${LJSPEECH_DB_ROOT}/LJSpeech-1.1/wavs -name "*.wav" \
116 |         | sort > data/local/wav.scp
117 |     grep -v LJ050 data/local/wav.scp > data/${train}/wav.scp
118 |     grep LJ050 data/local/wav.scp > data/${eval}/wav.scp
119 |     echo "making wav list for training is successfully done. (#training = $(wc -l < data/${train}/wav.scp))"
120 |     echo "making wav list for evaluation is successfully done. (#evaluation = $(wc -l < data/${eval}/wav.scp))"
121 | fi
122 | # }}}
123 | 
124 | 
125 | # STAGE 1 {{{
if echo ${stage} | grep -q 1; then
    echo "###########################################################"
    echo "#               FEATURE EXTRACTION STEP                   #"
    echo "###########################################################"
    for set in ${train} ${eval};do
        # extract the auxiliary features of this set (also writes the
        # high-pass filtered wav files into wav_hpf/${set})
        ${train_cmd} --num-threads ${n_jobs} exp/feature_extract/feature_extract_${feature_type}_${set}.log \
            feature_extract.py \
                --waveforms data/${set}/wav.scp \
                --wavdir wav_hpf/${set} \
                --hdf5dir hdf5/${set} \
                --feature_type ${feature_type} \
                --fs ${fs} \
                --shiftms ${shiftms} \
                --mspc_dim ${mspc_dim} \
                --highpass_cutoff ${highpass_cutoff} \
                --fftl ${fftl} \
                --fmin "${fmin}" \
                --fmax "${fmax}" \
                --n_jobs ${n_jobs}

        # extract stft-based mel-cepstrum for noise shaping (training set only);
        # the filtered wav files were already saved above, hence --save_wav false
        if [ ${set} = ${train} ] && ${use_noise_shaping};then
            ${train_cmd} --num-threads ${n_jobs} exp/feature_extract/feature_extract_mcep_${set}.log \
                feature_extract.py \
                    --waveforms data/${set}/wav.scp \
                    --wavdir wav_hpf/${set} \
                    --hdf5dir hdf5/${set} \
                    --feature_type mcep \
                    --fs ${fs} \
                    --shiftms ${shiftms} \
                    --mcep_dim ${mcep_dim} \
                    --mcep_alpha ${mcep_alpha} \
                    --highpass_cutoff ${highpass_cutoff} \
                    --save_wav false \
                    --fftl ${fftl} \
                    --n_jobs ${n_jobs}
        fi

        # check the number of feature files; wav.scp is fed through stdin so
        # that wc prints the bare count (with a file argument it appends the
        # file name, which garbled the message below)
        n_wavs=$(wc -l < data/${set}/wav.scp)
        n_feats=$(find hdf5/${set} -name "*.h5" | wc -l)
        echo "${n_feats}/${n_wavs} files are successfully processed."

        # make scp files
        if [ ${highpass_cutoff} -eq 0 ];then
            # no filtering was applied, so the original wavs are used as-is
            cp data/${set}/wav.scp data/${set}/wav_hpf.scp
        else
            find wav_hpf/${set} -name "*.wav" | sort > data/${set}/wav_hpf.scp
        fi
        find hdf5/${set} -name "*.h5" | sort > data/${set}/feats.scp
    done
fi
180 | # }}}
181 | 
182 | 
183 | # STAGE 2 {{{
if printf '%s\n' "${stage}" | grep -q 2; then
    printf '%s\n' \
        "###########################################################" \
        "#              CALCULATE STATISTICS STEP                  #" \
        "###########################################################"
    # Statistics are always computed for the main feature type; the
    # mel-cepstrum statistics are added only when noise shaping is enabled.
    # Both runs read the same feats.scp and write to the same stats.h5.
    stats_targets="${feature_type}"
    if ${use_noise_shaping}; then
        stats_targets="${stats_targets} mcep"
    fi
    for ftype in ${stats_targets}; do
        ${train_cmd} exp/calculate_statistics/calc_stats_${ftype}_${train}.log \
            calc_stats.py \
                --feats data/${train}/feats.scp \
                --stats data/${train}/stats.h5 \
                --feature_type ${ftype}
    done
    echo "statistics are successfully calculated."
fi
202 | # }}}
203 | 
204 | 
205 | # STAGE 3 {{{
if echo ${stage} | grep -q 3 && ${use_noise_shaping}; then
    echo "###########################################################"
    echo "#                  NOISE WEIGHTING STEP                   #"
    echo "###########################################################"
    # run noise_shaping.py with --inv true on the filtered training waveforms,
    # using the mel-cepstrum statistics stored in stats.h5
    ${train_cmd} --num-threads ${n_jobs} exp/noise_shaping/noise_shaping_apply_mcep_${train}.log \
        noise_shaping.py \
            --waveforms data/${train}/wav_hpf.scp \
            --stats data/${train}/stats.h5 \
            --outdir wav_nwf/${train} \
            --feature_type mcep \
            --fs ${fs} \
            --shiftms ${shiftms} \
            --mcep_alpha ${mcep_alpha} \
            --mag ${mag} \
            --inv true \
            --n_jobs ${n_jobs}

    # check the number of processed files; the list is fed through stdin so
    # that wc prints the bare count instead of "<count> <file name>"
    n_wavs=$(wc -l < data/${train}/wav_hpf.scp)
    n_ns=$(find wav_nwf/${train} -name "*.wav" | wc -l)
    echo "${n_ns}/${n_wavs} files are successfully processed."

    # make scp files
    find wav_nwf/${train} -name "*.wav" | sort > data/${train}/wav_nwf.scp
fi
231 | # }}}
232 | 
233 | 
234 | # STAGE 4 {{{
# Resolve the experiment directory. An explicit tag wins; otherwise the name
# encodes the feature type and the network/training hyper-parameters, with
# optional suffixes for noise shaping (_ns) and the upsampling layer (_up).
if [ -n "${tag}" ]; then
    expdir=exp/tr_ljspeech_22k_${tag}
else
    expdir=exp/tr_ljspeech_22k_sd_${feature_type}_nq${n_quantize}_na${n_aux}_nrc${n_resch}_nsc${n_skipch}_ks${kernel_size}_dp${dilation_depth}_dr${dilation_repeat}_lr${lr}_wd${weight_decay}_bl${batch_length}_bs${batch_size}
    if ${use_noise_shaping}; then
        expdir=${expdir}_ns
    fi
    if ${use_upsampling}; then
        expdir=${expdir}_up
    fi
fi
if printf '%s\n' "${stage}" | grep -q 4; then
    printf '%s\n' \
        "###########################################################" \
        "#               WAVENET TRAINING STEP                     #" \
        "###########################################################"
    # train on the noise-weighted waveforms when noise shaping is enabled,
    # otherwise on the high-pass filtered ones
    waveforms=data/${train}/wav_hpf.scp
    if ${use_noise_shaping}; then
        waveforms=data/${train}/wav_nwf.scp
    fi
    # number of waveform samples per feature frame (bc integer arithmetic)
    upsampling_factor=$(echo "${shiftms} * ${fs} / 1000" | bc)
    if [ ! -e ${expdir}/log ]; then
        mkdir -p ${expdir}/log
    fi
    # keep a copy of the statistics next to the model for later decoding
    if [ ! -e ${expdir}/stats.h5 ]; then
        cp -v data/${train}/stats.h5 ${expdir}
    fi
    ${cuda_cmd} --gpu ${n_gpus} "${expdir}/log/${train}.log" \
        train.py \
            --n_gpus ${n_gpus} \
            --waveforms ${waveforms} \
            --feats data/${train}/feats.scp \
            --stats data/${train}/stats.h5 \
            --expdir "${expdir}" \
            --feature_type ${feature_type} \
            --n_quantize ${n_quantize} \
            --n_aux ${n_aux} \
            --n_resch ${n_resch} \
            --n_skipch ${n_skipch} \
            --dilation_depth ${dilation_depth} \
            --dilation_repeat ${dilation_repeat} \
            --kernel_size ${kernel_size} \
            --lr ${lr} \
            --weight_decay ${weight_decay} \
            --iters ${iters} \
            --batch_length ${batch_length} \
            --batch_size ${batch_size} \
            --checkpoint_interval ${checkpoint_interval} \
            --upsampling_factor "${upsampling_factor}" \
            --use_upsampling_layer ${use_upsampling} \
            --resume "${resume}"
fi
284 | # }}}
285 | 
286 | 
287 | # STAGE 5 {{{
# Fill in every decoding option the user left empty. The config and stats
# defaults are looked up next to the checkpoint that will be decoded.
if [ -z "${outdir}" ]; then
    outdir=${expdir}/wav
fi
if [ -z "${checkpoint}" ]; then
    checkpoint=${expdir}/checkpoint-final.pkl
fi
if [ -z "${config}" ]; then
    config=$(dirname ${checkpoint})/model.conf
fi
if [ -z "${stats}" ]; then
    stats=$(dirname ${checkpoint})/stats.h5
fi
if [ -z "${feats}" ]; then
    feats=data/${eval}/feats.scp
fi
if printf '%s\n' "${stage}" | grep -q 5; then
    printf '%s\n' \
        "###########################################################" \
        "#               WAVENET DECODING STEP                     #" \
        "###########################################################"
    ${cuda_cmd} --gpu ${n_gpus} "${outdir}"/log/decode.log \
        decode.py \
            --n_gpus ${n_gpus} \
            --feats ${feats} \
            --stats ${stats} \
            --outdir "${outdir}" \
            --checkpoint "${checkpoint}" \
            --config "${config}" \
            --fs ${fs} \
            --batch_size ${decode_batch_size}
fi
308 | # }}}
309 | 
310 | 
311 | # STAGE 6 {{{
if printf '%s\n' "${stage}" | grep -q 6 && ${use_noise_shaping}; then
    printf '%s\n' \
        "###########################################################" \
        "#                  NOISE SHAPING STEP                     #" \
        "###########################################################"
    # list the decoded waveforms, then run noise_shaping.py on them with
    # --inv false; results are written to "${outdir}_nsf"
    decoded_scp=${outdir}/wav.scp
    find "${outdir}" -name "*.wav" | sort > ${decoded_scp}
    ${train_cmd} --num-threads ${n_jobs} exp/noise_shaping/noise_shaping_mcep_${eval}.log \
        noise_shaping.py \
            --waveforms ${decoded_scp} \
            --stats ${stats} \
            --outdir "${outdir}_nsf" \
            --feature_type mcep \
            --fs ${fs} \
            --shiftms ${shiftms} \
            --mcep_alpha ${mcep_alpha} \
            --mag ${mag} \
            --n_jobs ${n_jobs} \
            --inv false
fi
330 | # }}}
331 | 


--------------------------------------------------------------------------------
/wavenet_vocoder/bin/feature_extract.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # -*- coding: utf-8 -*-
  3 | 
  4 | # Copyright 2017 Tomoki Hayashi (Nagoya University)
  5 | #  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)
  6 | 
  7 | import argparse
  8 | import logging
  9 | import multiprocessing as mp
 10 | import os
 11 | import sys
 12 | 
 13 | from distutils.util import strtobool
 14 | 
 15 | import librosa
 16 | import numpy as np
 17 | import pysptk
 18 | 
 19 | from scipy.interpolate import interp1d
 20 | from scipy.io import wavfile
 21 | from scipy.signal import firwin
 22 | from scipy.signal import get_window
 23 | from scipy.signal import lfilter
 24 | from sprocket.speech.feature_extractor import FeatureExtractor
 25 | 
 26 | from wavenet_vocoder.utils import find_files
 27 | from wavenet_vocoder.utils import read_txt
 28 | from wavenet_vocoder.utils import write_hdf5
 29 | 
 30 | EPS = 1e-10
 31 | 
 32 | 
 33 | def low_cut_filter(x, fs, cutoff=70):
 34 |     """APPLY LOW CUT FILTER.
 35 | 
 36 |     Args:
 37 |         x (ndarray): Waveform sequence.
 38 |         fs (int): Sampling frequency.
 39 |         cutoff (float): Cutoff frequency of low cut filter.
 40 | 
 41 |     Return:
 42 |         ndarray: Low cut filtered waveform sequence.
 43 | 
 44 |     """
 45 |     nyquist = fs // 2
 46 |     norm_cutoff = cutoff / nyquist
 47 | 
 48 |     # low cut filter
 49 |     fil = firwin(255, norm_cutoff, pass_zero=False)
 50 |     lcf_x = lfilter(fil, 1, x)
 51 | 
 52 |     return lcf_x
 53 | 
 54 | 
 55 | def low_pass_filter(x, fs, cutoff=70, padding=True):
 56 |     """APPLY LOW PASS FILTER.
 57 | 
 58 |     Args:
 59 |         x (ndarray): Waveform sequence.
 60 |         fs (int): Sampling frequency.
 61 |         cutoff (float): Cutoff frequency of low pass filter.
 62 | 
 63 |     Returns:
 64 |         ndarray: Low pass filtered waveform sequence
 65 | 
 66 |     """
 67 |     nyquist = fs // 2
 68 |     norm_cutoff = cutoff / nyquist
 69 | 
 70 |     # low cut filter
 71 |     numtaps = 255
 72 |     fil = firwin(numtaps, norm_cutoff)
 73 |     x_pad = np.pad(x, (numtaps, numtaps), 'edge')
 74 |     lpf_x = lfilter(fil, 1, x_pad)
 75 |     lpf_x = lpf_x[numtaps + numtaps // 2: -numtaps // 2]
 76 | 
 77 |     return lpf_x
 78 | 
 79 | 
 80 | def convert_to_continuos_f0(f0):
 81 |     """CONVERT F0 TO CONTINUOUS F0.
 82 | 
 83 |     Args:
 84 |         f0 (ndarray): original f0 sequence with the shape (T,).
 85 | 
 86 |     Returns:
 87 |         ndarray: continuous f0 with the shape (T,).
 88 | 
 89 |     """
 90 |     # get uv information as binary
 91 |     uv = np.float32(f0 != 0)
 92 | 
 93 |     # get start and end of f0
 94 |     if (f0 == 0).all():
 95 |         logging.warning("all of the f0 values are 0.")
 96 |         return uv, f0
 97 |     start_f0 = f0[f0 != 0][0]
 98 |     end_f0 = f0[f0 != 0][-1]
 99 | 
100 |     # padding start and end of f0 sequence
101 |     start_idx = np.where(f0 == start_f0)[0][0]
102 |     end_idx = np.where(f0 == end_f0)[0][-1]
103 |     f0[:start_idx] = start_f0
104 |     f0[end_idx:] = end_f0
105 | 
106 |     # get non-zero frame index
107 |     nz_frames = np.where(f0 != 0)[0]
108 | 
109 |     # perform linear interpolation
110 |     f = interp1d(nz_frames, f0[nz_frames])
111 |     cont_f0 = f(np.arange(0, f0.shape[0]))
112 | 
113 |     return uv, cont_f0
114 | 
115 | 
def stft_mcep(x, fftl=512, shiftl=256, dim=25, alpha=0.41, window="hamming", is_padding=False):
    """EXTRACT STFT-BASED MEL-CEPSTRUM.

    The signal is cut into frames of ``fftl`` samples taken every ``shiftl``
    samples; each frame is windowed and passed to ``pysptk.mcep``.

    Args:
        x (ndarray): Numpy double array with the size (T,).
        fftl (int): FFT length in point (default=512).
        shiftl (int): Shift length in point (default=256).
        dim (int): Dimension of mel-cepstrum (default=25).
        alpha (float): All pass filter coefficient (default=0.41).
        window (str): Analysis window type (default="hamming").
        is_padding (bool): Whether to pad the end of signal (default=False).

    Returns:
        ndarray: Per-frame results of ``pysptk.mcep`` stacked along the
            first axis (one row per frame).

    """
    # optionally extend the end of the signal by reflection
    if is_padding:
        remainder = (len(x) - fftl) % shiftl
        x = np.pad(x, (0, fftl - remainder), 'reflect')

    # analysis window shared by all frames
    win = get_window(window, fftl)

    # analyze frame by frame
    frames = []
    n_frame = (len(x) - fftl) // shiftl + 1
    for idx in range(n_frame):
        begin = shiftl * idx
        segment = x[begin: begin + fftl] * win
        frames.append(pysptk.mcep(segment, dim, alpha, eps=EPS, etype=1))

    return np.stack(frames)
149 | 
150 | 
def world_feature_extract(wav_list, args):
    """EXTRACT WORLD FEATURE VECTOR.

    For every wav file, the concatenation [uv, low-pass filtered continuous
    f0, mcep, coded aperiodicity] is written to ``<hdf5dir>/<name>.h5`` under
    the key "/world". The high-pass filtered waveform is additionally saved
    to ``wavdir`` when filtering is enabled and ``save_wav`` is set.

    Args:
        wav_list (list): List of wav file paths.
        args (Namespace): Parsed command-line options; reads fs, shiftms,
            minf0, maxf0, fftl, highpass_cutoff, mcep_dim, mcep_alpha,
            hdf5dir, wavdir and save_wav.

    """
    # define feature extractor
    feature_extractor = FeatureExtractor(
        analyzer="world",
        fs=args.fs,
        shiftms=args.shiftms,
        minf0=args.minf0,
        maxf0=args.maxf0,
        fftl=args.fftl)

    int16_info = np.iinfo(np.int16)
    for i, wav_name in enumerate(wav_list):
        logging.info("now processing %s (%d/%d)" % (wav_name, i + 1, len(wav_list)))

        # load wavfile
        fs, x = wavfile.read(wav_name)
        if x.dtype != np.int16:
            logging.warning("wav file format is not 16 bit PCM.")

        # check sampling frequency before any processing depends on it
        if not fs == args.fs:
            logging.error("sampling frequency is not matched.")
            sys.exit(1)

        # apply low cut filter
        x = np.array(x, dtype=np.float64)
        if args.highpass_cutoff != 0:
            x = low_cut_filter(x, fs, cutoff=args.highpass_cutoff)

        # extract features
        f0, _, _ = feature_extractor.analyze(x)
        uv, cont_f0 = convert_to_continuos_f0(f0)
        # the f0 contour has one value per frame, so its "sampling rate" is
        # the frame rate 1 / shift
        cont_f0_lpf = low_pass_filter(cont_f0, int(1.0 / (args.shiftms * 0.001)), cutoff=20)
        codeap = feature_extractor.codeap()
        mcep = feature_extractor.mcep(dim=args.mcep_dim, alpha=args.mcep_alpha)

        # concatenate
        cont_f0_lpf = np.expand_dims(cont_f0_lpf, axis=-1)
        uv = np.expand_dims(uv, axis=-1)
        feats = np.concatenate([uv, cont_f0_lpf, mcep, codeap], axis=1)

        # save to hdf5
        hdf5name = os.path.join(args.hdf5dir, os.path.basename(wav_name).replace(".wav", ".h5"))
        write_hdf5(hdf5name, "/world", feats)

        # save the filtered waveform; clip first because the filter output can
        # slightly exceed the int16 range and a plain cast would wrap around
        if args.highpass_cutoff != 0 and args.save_wav:
            x_int16 = np.int16(np.clip(x, int16_info.min, int16_info.max))
            wavfile.write(os.path.join(args.wavdir, os.path.basename(wav_name)), fs, x_int16)
197 | 
198 | 
def melspectrogram_extract(wav_list, args):
    """EXTRACT MEL SPECTROGRAM.

    For every wav file, the log10 magnitude mel-spectrogram (frames along
    the first axis) is written as float32 to ``<hdf5dir>/<name>.h5`` under
    the key "/melspc". The high-pass filtered waveform is additionally saved
    to ``wavdir`` when filtering is enabled and ``save_wav`` is set.

    Args:
        wav_list (list): List of wav file paths.
        args (Namespace): Parsed command-line options; reads fs, shiftms,
            fftl, mspc_dim, fmin, fmax, highpass_cutoff, hdf5dir, wavdir
            and save_wav.

    """
    int16_info = np.iinfo(np.int16)
    for i, wav_name in enumerate(wav_list):
        logging.info("now processing %s (%d/%d)" % (wav_name, i + 1, len(wav_list)))

        # load wavfile
        fs, x = wavfile.read(wav_name)
        if x.dtype != np.int16:
            logging.warning("wav file format is not 16 bit PCM.")

        # check sampling frequency before any processing depends on it
        if not fs == args.fs:
            logging.error("sampling frequency is not matched.")
            sys.exit(1)

        # apply low cut filter
        x = np.array(x, dtype=np.float64)
        if args.highpass_cutoff != 0:
            x = low_cut_filter(x, fs, cutoff=args.highpass_cutoff)

        # extract features
        x_norm = x / (int16_info.max + 1)  # scale 16 bit samples to [-1, 1)
        shiftl = int(args.shiftms * fs * 0.001)
        # y and sr are passed by keyword: newer librosa releases no longer
        # accept them positionally
        mspc = librosa.feature.melspectrogram(
            y=x_norm, sr=fs,
            n_fft=args.fftl,
            hop_length=shiftl,
            n_mels=args.mspc_dim,
            fmin=args.fmin if args.fmin is not None else 0,
            fmax=args.fmax if args.fmax is not None else fs // 2,
            power=1.0)
        # floor at EPS so that log10 never receives zero
        mspc = np.log10(np.maximum(EPS, mspc.T))

        # save to hdf5
        hdf5name = os.path.join(args.hdf5dir, os.path.basename(wav_name).replace(".wav", ".h5"))
        write_hdf5(hdf5name, "/melspc", np.float32(mspc))

        # save the filtered waveform; clip first because the filter output can
        # slightly exceed the int16 range and a plain cast would wrap around
        if args.highpass_cutoff != 0 and args.save_wav:
            x_int16 = np.int16(np.clip(x, int16_info.min, int16_info.max))
            wavfile.write(os.path.join(args.wavdir, os.path.basename(wav_name)), fs, x_int16)
238 | 
239 | 
def melcepstrum_extract(wav_list, args):
    """EXTRACT MEL CEPSTRUM.

    For every wav file, the output of ``stft_mcep`` is written as float32 to
    ``<hdf5dir>/<name>.h5`` under the key "/mcep". The high-pass filtered
    waveform is additionally saved to ``wavdir`` when filtering is enabled
    and ``save_wav`` is set.

    Args:
        wav_list (list): List of wav file paths.
        args (Namespace): Parsed command-line options; reads fs, shiftms,
            fftl, mcep_dim, mcep_alpha, highpass_cutoff, hdf5dir, wavdir
            and save_wav.

    """
    int16_info = np.iinfo(np.int16)
    for i, wav_name in enumerate(wav_list):
        logging.info("now processing %s (%d/%d)" % (wav_name, i + 1, len(wav_list)))

        # load wavfile
        fs, x = wavfile.read(wav_name)
        if x.dtype != np.int16:
            logging.warning("wav file format is not 16 bit PCM.")

        # check sampling frequency before any processing depends on it
        if not fs == args.fs:
            logging.error("sampling frequency is not matched.")
            sys.exit(1)

        # apply low cut filter
        x = np.array(x, dtype=np.float64)
        if args.highpass_cutoff != 0:
            x = low_cut_filter(x, fs, cutoff=args.highpass_cutoff)

        # extract features
        shiftl = int(args.shiftms * fs * 0.001)
        mcep = stft_mcep(x, args.fftl, shiftl, args.mcep_dim, args.mcep_alpha)

        # save to hdf5
        hdf5name = os.path.join(args.hdf5dir, os.path.basename(wav_name).replace(".wav", ".h5"))
        write_hdf5(hdf5name, "/mcep", np.float32(mcep))

        # save the filtered waveform; clip first because the filter output can
        # slightly exceed the int16 range and a plain cast would wrap around
        if args.highpass_cutoff != 0 and args.save_wav:
            x_int16 = np.int16(np.clip(x, int16_info.min, int16_info.max))
            wavfile.write(os.path.join(args.wavdir, os.path.basename(wav_name)), fs, x_int16)
270 | 
271 | 
272 | def main():
273 |     """RUN FEATURE EXTRACTION IN PARALLEL."""
274 |     parser = argparse.ArgumentParser(
275 |         description="making feature file argsurations.")
276 | 
277 |     parser.add_argument(
278 |         "--waveforms", default=None,
279 |         help="directory or list of filename of input wavfile")
280 |     parser.add_argument(
281 |         "--hdf5dir", default=None,
282 |         help="directory to save hdf5")
283 |     parser.add_argument(
284 |         "--wavdir", default=None,
285 |         help="directory to save of preprocessed wav file")
286 |     parser.add_argument(
287 |         "--fs", default=16000,
288 |         type=int, help="Sampling frequency")
289 |     parser.add_argument(
290 |         "--shiftms", default=5,
291 |         type=float, help="Frame shift in msec")
292 |     parser.add_argument(
293 |         "--feature_type", default="world", choices=["world", "melspc", "mcep"],
294 |         type=str, help="feature type")
295 |     parser.add_argument(
296 |         "--mspc_dim", default=80,
297 |         type=int, help="Dimension of mel spectrogram")
298 |     parser.add_argument(
299 |         "--minf0", default=40,
300 |         type=int, help="minimum f0 for world analysis")
301 |     parser.add_argument(
302 |         "--maxf0", default=400,
303 |         type=int, help="maximum f0 for world analysis")
304 |     parser.add_argument(
305 |         "--fmin", default=None, nargs="?",
306 |         type=int, help="minimum frequency for melspc")
307 |     parser.add_argument(
308 |         "--fmax", default=None, nargs="?",
309 |         type=int, help="maximum frequency for melspc")
310 |     parser.add_argument(
311 |         "--mcep_dim", default=24,
312 |         type=int, help="Dimension of mel cepstrum")
313 |     parser.add_argument(
314 |         "--mcep_alpha", default=0.41,
315 |         type=float, help="Alpha of mel cepstrum")
316 |     parser.add_argument(
317 |         "--fftl", default=1024,
318 |         type=int, help="FFT length")
319 |     parser.add_argument(
320 |         "--highpass_cutoff", default=70,
321 |         type=int, help="Cut off frequency in lowpass filter")
322 |     parser.add_argument(
323 |         "--save_wav", default=True,
324 |         type=strtobool, help="Whether to save filtered wav file")
325 |     parser.add_argument(
326 |         "--n_jobs", default=10,
327 |         type=int, help="number of parallel jobs")
328 |     parser.add_argument(
329 |         "--verbose", default=1,
330 |         type=int, help="log message level")
331 | 
332 |     args = parser.parse_args()
333 | 
334 |     # set log level
335 |     if args.verbose == 1:
336 |         logging.basicConfig(level=logging.INFO,
337 |                             format='%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s',
338 |                             datefmt='%m/%d/%Y %I:%M:%S')
339 |     elif args.verbose > 1:
340 |         logging.basicConfig(level=logging.DEBUG,
341 |                             format='%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s',
342 |                             datefmt='%m/%d/%Y %I:%M:%S')
343 |     else:
344 |         logging.basicConfig(level=logging.WARNING,
345 |                             format='%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s',
346 |                             datefmt='%m/%d/%Y %I:%M:%S')
347 |         logging.warning("logging is disabled.")
348 | 
349 |     # show arguments
350 |     for key, value in vars(args).items():
351 |         logging.info("%s = %s" % (key, str(value)))
352 | 
353 |     # read list
354 |     if os.path.isdir(args.waveforms):
355 |         file_list = sorted(find_files(args.waveforms, "*.wav"))
356 |     else:
357 |         file_list = read_txt(args.waveforms)
358 |     logging.info("number of utterances = %d" % len(file_list))
359 | 
360 |     # check directory existence
361 |     if not os.path.exists(args.wavdir) and args.highpass_cutoff != 0 and args.save_wav:
362 |         os.makedirs(args.wavdir)
363 |     if not os.path.exists(args.hdf5dir):
364 |         os.makedirs(args.hdf5dir)
365 | 
366 |     # divide list
367 |     file_lists = np.array_split(file_list, args.n_jobs)
368 |     file_lists = [f_list.tolist() for f_list in file_lists]
369 | 
370 |     # multi processing
371 |     processes = []
372 |     if args.feature_type == "world":
373 |         target_fn = world_feature_extract
374 |     elif args.feature_type == "melspc":
375 |         target_fn = melspectrogram_extract
376 |     else:
377 |         target_fn = melcepstrum_extract
378 |     for f in file_lists:
379 |         p = mp.Process(target=target_fn, args=(f, args,))
380 |         p.start()
381 |         processes.append(p)
382 | 
383 |     # wait for all process
384 |     for p in processes:
385 |         p.join()
386 | 
387 | 
388 | if __name__ == "__main__":
389 |     main()
390 | 


--------------------------------------------------------------------------------
/egs/arctic/sd-melspc/run.sh:
--------------------------------------------------------------------------------
  1 | #!/bin/bash
  2 | ############################################################
  3 | #           SCRIPT TO BUILD SD WAVENET VOCODER             #
  4 | ############################################################
  5 | 
  6 | # Copyright 2017 Tomoki Hayashi (Nagoya University)
  7 | #  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)
  8 | 
  9 | . ./path.sh || exit 1;
 10 | . ./cmd.sh || exit 1;
 11 | 
 12 | # USER SETTINGS {{{
 13 | #######################################
 14 | #           STAGE SETTING             #
 15 | #######################################
 16 | stage=0123456
 17 | # 0: data preparation step
 18 | # 1: feature extraction step
 19 | # 2: statistics calculation step
 20 | # 3: noise weighting step
 21 | # 4: training step
 22 | # 5: decoding step
 23 | # 6: noise shaping step
 24 | 
 25 | #######################################
 26 | #          FEATURE SETTING            #
 27 | #######################################
 28 | feature_type=melspc    # world or melspc (in this recipe fixed to "melspc")
 29 | spk=slt                # target spekaer in arctic
 30 | shiftms=5              # shift length in msec
 31 | fftl=1024              # fft length
 32 | highpass_cutoff=70     # highpass filter cutoff frequency (if 0, will not apply)
 33 | fs=16000               # sampling rate
 34 | mspc_dim=80            # dimension of mel-spectrogram
 35 | mcep_dim=25            # dimension of mel-cepstrum
 36 | mcep_alpha=0.410       # alpha value of mel-cepstrum
 37 | fmin=""                # minimum frequency in melspc calculation
 38 | fmax=""                # maximum frequency in melspc calculation
 39 | use_noise_shaping=true # whether to use noise shaping
 40 | mag=0.5                # strength of noise shaping (0.0 < mag <= 1.0)
 41 | n_jobs=10              # number of parallel jobs
 42 | 
 43 | #######################################
 44 | #          TRAINING SETTING           #
 45 | #######################################
 46 | n_gpus=1                  # number of gpus
 47 | n_quantize=256            # number of quantization of waveform
 48 | n_aux=80                  # number of auxiliary features
 49 | n_resch=512               # number of residual channels
 50 | n_skipch=256              # number of skip channels
 51 | dilation_depth=10         # dilation depth (e.g. if set 10, max dilation = 2^(10-1))
 52 | dilation_repeat=3         # number of dilation repeats
 53 | kernel_size=2             # kernel size of dilated convolution
 54 | lr=1e-4                   # learning rate
 55 | weight_decay=0.0          # weight decay coef
 56 | iters=200000              # number of iterations
 57 | batch_length=20000        # batch length
 58 | batch_size=1              # batch size
 59 | checkpoint_interval=10000 # save model per this number
 60 | use_upsampling=true       # whether to use upsampling layer
 61 | resume=""                 # checkpoint path to resume (Optional)
 62 | 
 63 | #######################################
 64 | #          DECODING SETTING           #
 65 | #######################################
 66 | outdir=""            # directory to save decoded wav dir (Optional)
 67 | checkpoint=""        # checkpoint path to be used for decoding (Optional)
 68 | config=""            # model configuration path (Optional)
 69 | stats=""             # statistics path (Optional)
 70 | feats=""             # list or directory of feature files (Optional)
 71 | decode_batch_size=32 # batch size in decoding
 72 | 
 73 | #######################################
 74 | #            OTHER SETTING            #
 75 | #######################################
 76 | ARCTIC_DB_ROOT=downloads # directory including DB (if DB not exists, will be downloaded)
 77 | tag=""                   # tag for network directory naming (Optional)
 78 | 
 79 | # parse options
 80 | . parse_options.sh || exit 1;
 81 | 
 82 | # check feature type
 83 | if [ ${feature_type} != "melspc" ]; then
 84 |     echo "This recipe does not support feature_type=\"world\"." 2>&1
 85 |     echo "Please try the egs/arctic/sd." 2>&1
 86 |     exit 1;
 87 | fi
 88 | 
 89 | # set directory names
 90 | train=tr_${spk}
 91 | eval=ev_${spk}
 92 | 
 93 | # stop when error occurred
 94 | set -euo pipefail
 95 | # }}}
 96 | 
 97 | 
 98 | # STAGE 0 {{{
 99 | if echo ${stage} | grep -q 0; then
100 |     echo "###########################################################"
101 |     echo "#                 DATA PREPARATION STEP                   #"
102 |     echo "###########################################################"
103 |     if [ ! -e ${ARCTIC_DB_ROOT}/.done ];then
104 |         mkdir -p ${ARCTIC_DB_ROOT}
105 |         cd ${ARCTIC_DB_ROOT}
106 |         for id in bdl slt rms clb jmk ksp awb;do
107 |             wget http://festvox.org/cmu_arctic/cmu_arctic/packed/cmu_us_${id}_arctic-0.95-release.tar.bz2
108 |             tar xf cmu_us_${id}*.tar.bz2
109 |         done
110 |         rm ./*.tar.bz2
111 |         cd ../
112 |         touch ${ARCTIC_DB_ROOT}/.done
113 |         echo "database is successfully downloaded."
114 |     fi
115 |     [ ! -e data/local ] && mkdir -p data/local
116 |     [ ! -e data/${train} ] && mkdir -p data/${train}
117 |     [ ! -e data/${eval} ] && mkdir -p data/${eval}
118 |     find "${ARCTIC_DB_ROOT}/cmu_us_${spk}_arctic/wav" -name "*.wav" \
119 |         | sort > "data/local/wav.${spk}.scp"
120 |     head -n 1028 "data/local/wav.${spk}.scp" >> "data/${train}/wav.scp"
121 |     tail -n 104 "data/local/wav.${spk}.scp" >> "data/${eval}/wav.scp"
122 |     echo "making wav list for training is successfully done. (#training = $(wc -l < data/${train}/wav.scp))"
123 |     echo "making wav list for evaluation is successfully done. (#evaluation = $(wc -l < data/${eval}/wav.scp))"
124 | fi
125 | # }}}
126 | 
127 | 
128 | # STAGE 1 {{{
if echo ${stage} | grep -q 1; then
    echo "###########################################################"
    echo "#               FEATURE EXTRACTION STEP                   #"
    echo "###########################################################"
    for set in ${train} ${eval};do
        # extract the auxiliary features of this set (also writes the
        # high-pass filtered wav files into wav_hpf/${set})
        ${train_cmd} --num-threads ${n_jobs} exp/feature_extract/feature_extract_${feature_type}_${set}.log \
            feature_extract.py \
                --waveforms data/${set}/wav.scp \
                --wavdir wav_hpf/${set} \
                --hdf5dir hdf5/${set} \
                --feature_type ${feature_type} \
                --fs ${fs} \
                --shiftms ${shiftms} \
                --mspc_dim ${mspc_dim} \
                --highpass_cutoff ${highpass_cutoff} \
                --fftl ${fftl} \
                --fmin "${fmin}" \
                --fmax "${fmax}" \
                --n_jobs ${n_jobs}

        # extract stft-based mel-cepstrum for noise shaping (training set only);
        # the filtered wav files were already saved above, hence --save_wav false
        if [ ${set} = ${train} ] && ${use_noise_shaping};then
            ${train_cmd} --num-threads ${n_jobs} exp/feature_extract/feature_extract_mcep_${set}.log \
                feature_extract.py \
                    --waveforms data/${set}/wav.scp \
                    --wavdir wav_hpf/${set} \
                    --hdf5dir hdf5/${set} \
                    --feature_type mcep \
                    --fs ${fs} \
                    --shiftms ${shiftms} \
                    --mcep_dim ${mcep_dim} \
                    --mcep_alpha ${mcep_alpha} \
                    --highpass_cutoff ${highpass_cutoff} \
                    --save_wav false \
                    --fftl ${fftl} \
                    --n_jobs ${n_jobs}
        fi

        # check the number of feature files; wav.scp is fed through stdin so
        # that wc prints the bare count (with a file argument it appends the
        # file name, which garbled the message below)
        n_wavs=$(wc -l < data/${set}/wav.scp)
        n_feats=$(find hdf5/${set} -name "*.h5" | wc -l)
        echo "${n_feats}/${n_wavs} files are successfully processed."

        # make scp files
        if [ ${highpass_cutoff} -eq 0 ];then
            # no filtering was applied, so the original wavs are used as-is
            cp data/${set}/wav.scp data/${set}/wav_hpf.scp
        else
            find wav_hpf/${set} -name "*.wav" | sort > data/${set}/wav_hpf.scp
        fi
        find hdf5/${set} -name "*.h5" | sort > data/${set}/feats.scp
    done
fi
182 | # }}}
183 | 
184 | 
185 | # STAGE 2 {{{
186 | # statistics go to data/${train}/stats.h5: always for ${feature_type}, plus mcep when noise shaping is on
187 | if echo ${stage} | grep -q 2; then
188 |     echo "###########################################################"
189 |     echo "#              CALCULATE STATISTICS STEP                  #"
190 |     echo "###########################################################"
191 |     stats_types=${feature_type}
192 |     if ${use_noise_shaping}; then
193 |         stats_types="${stats_types} mcep"
194 |     fi
195 |     for stats_type in ${stats_types}; do
196 |         ${train_cmd} exp/calculate_statistics/calc_stats_${stats_type}_${train}.log \
197 |             calc_stats.py \
198 |                 --feats data/${train}/feats.scp \
199 |                 --stats data/${train}/stats.h5 \
200 |                 --feature_type ${stats_type}
201 |     done
202 |     echo "statistics are successfully calculated."
203 | fi
204 | # }}}
205 | 
206 | 
207 | # STAGE 3 {{{
208 | # apply noise weighting (--inv true) to the training waveforms; runs only when use_noise_shaping is true
209 | if echo ${stage} | grep -q 3 && ${use_noise_shaping}; then
210 |     echo "###########################################################"
211 |     echo "#                  NOISE WEIGHTING STEP                   #"
212 |     echo "###########################################################"
213 |     ${train_cmd} --num-threads ${n_jobs} exp/noise_shaping/noise_shaping_apply_mcep_${train}.log \
214 |         noise_shaping.py \
215 |             --waveforms data/${train}/wav_hpf.scp \
216 |             --stats data/${train}/stats.h5 \
217 |             --outdir wav_nwf/${train} \
218 |             --feature_type mcep \
219 |             --fs ${fs} --shiftms ${shiftms} \
220 |             --mcep_alpha ${mcep_alpha} \
221 |             --mag ${mag} \
222 |             --inv true \
223 |             --n_jobs ${n_jobs}
224 | 
225 |     # compare the number of written wav files with the number of listed inputs
226 |     n_wavs=$(wc -l < data/${train}/wav_hpf.scp)  # "<" keeps the file name out of the count
227 |     n_ns=$(find wav_nwf/${train} -name "*.wav" | wc -l)
228 |     echo "${n_ns}/${n_wavs} files are successfully processed."
229 | 
230 |     # make scp files
231 |     find wav_nwf/${train} -name "*.wav" | sort > data/${train}/wav_nwf.scp
232 | fi
233 | # }}}
234 | 
235 | 
236 | # STAGE 4 {{{
237 | # set variables
238 | if [ ! -n "${tag}" ];then
239 |     expdir=exp/tr_arctic_16k_sd_melspc_${spk}_nq${n_quantize}_na${n_aux}_nrc${n_resch}_nsc${n_skipch}_ks${kernel_size}_dp${dilation_depth}_dr${dilation_repeat}_lr${lr}_wd${weight_decay}_bl${batch_length}_bs${batch_size}
240 |     if ${use_noise_shaping};then
241 |         expdir=${expdir}_ns
242 |     fi
243 |     if ${use_upsampling};then
244 |         expdir=${expdir}_up
245 |     fi
246 | else
247 |     expdir=exp/tr_arctic_${tag}
248 | fi
249 | if echo ${stage} | grep -q 4; then
250 |     echo "###########################################################"
251 |     echo "#               WAVENET TRAINING STEP                     #"
252 |     echo "###########################################################"
253 |     if ${use_noise_shaping};then
254 |         waveforms=data/${train}/wav_nwf.scp
255 |     else
256 |         waveforms=data/${train}/wav_hpf.scp
257 |     fi
258 |     upsampling_factor=$(echo "${shiftms} * ${fs} / 1000" | bc)
259 |     [ ! -e ${expdir}/log ] && mkdir -p ${expdir}/log
260 |     [ ! -e ${expdir}/stats.h5 ] && cp -v data/${train}/stats.h5 ${expdir}
261 |     ${cuda_cmd} --gpu ${n_gpus} "${expdir}/log/${train}.log" \
262 |         train.py \
263 |             --n_gpus ${n_gpus} \
264 |             --waveforms ${waveforms} \
265 |             --feats data/${train}/feats.scp \
266 |             --stats data/${train}/stats.h5 \
267 |             --expdir "${expdir}" \
268 |             --feature_type ${feature_type} \
269 |             --n_quantize ${n_quantize} \
270 |             --n_aux ${n_aux} \
271 |             --n_resch ${n_resch} \
272 |             --n_skipch ${n_skipch} \
273 |             --dilation_depth ${dilation_depth} \
274 |             --dilation_repeat ${dilation_repeat} \
275 |             --kernel_size ${kernel_size} \
276 |             --lr ${lr} \
277 |             --weight_decay ${weight_decay} \
278 |             --iters ${iters} \
279 |             --batch_length ${batch_length} \
280 |             --batch_size ${batch_size} \
281 |             --checkpoint_interval ${checkpoint_interval} \
282 |             --upsampling_factor "${upsampling_factor}" \
283 |             --use_upsampling_layer ${use_upsampling} \
284 |             --resume "${resume}"
285 | fi
286 | # }}}
287 | 
288 | 
289 | # STAGE 5 {{{
290 | # fill in every decoding option that was left empty; config and stats default to the checkpoint's directory
291 | [ -n "${outdir}" ] || outdir=${expdir}/wav
292 | [ -n "${checkpoint}" ] || checkpoint=${expdir}/checkpoint-final.pkl
293 | [ -n "${config}" ] || config=$(dirname ${checkpoint})/model.conf
294 | [ -n "${stats}" ] || stats=$(dirname ${checkpoint})/stats.h5
295 | [ -n "${feats}" ] || feats=data/${eval}/feats.scp
296 | if echo ${stage} | grep -q 5; then
297 |     echo "###########################################################"
298 |     echo "#               WAVENET DECODING STEP                     #"
299 |     echo "###########################################################"
300 |     ${cuda_cmd} --gpu ${n_gpus} "${outdir}/log/decode.log" \
301 |         decode.py \
302 |             --n_gpus ${n_gpus} \
303 |             --feats ${feats} \
304 |             --stats ${stats} \
305 |             --outdir "${outdir}" \
306 |             --checkpoint "${checkpoint}" \
307 |             --config "${config}" \
308 |             --fs ${fs} --batch_size ${decode_batch_size}
309 | fi
310 | # }}}
311 | 
312 | 
313 | # STAGE 6 {{{
314 | # run noise_shaping.py with --inv false on the decoded waveforms; skipped unless use_noise_shaping is true
315 | if echo ${stage} | grep -q 6 && ${use_noise_shaping}; then
316 |     echo "###########################################################"
317 |     echo "#                  NOISE SHAPING STEP                     #"
318 |     echo "###########################################################"
319 |     # list the wav files found under ${outdir}
320 |     find "${outdir}" -name "*.wav" | sort > ${outdir}/wav.scp
321 |     ${train_cmd} --num-threads ${n_jobs} exp/noise_shaping/noise_shaping_restore_mcep_${eval}.log \
322 |         noise_shaping.py \
323 |             --waveforms ${outdir}/wav.scp \
324 |             --stats ${stats} \
325 |             --outdir "${outdir}_nsf" \
326 |             --feature_type mcep \
327 |             --fs ${fs} --shiftms ${shiftms} \
328 |             --mcep_alpha ${mcep_alpha} --mag ${mag} \
329 |             --n_jobs ${n_jobs} \
330 |             --inv false
331 | fi
332 | # }}}
333 | 


--------------------------------------------------------------------------------
/egs/m-ailabs-speech/sd/run.sh:
--------------------------------------------------------------------------------
  1 | #!/bin/bash
  2 | ############################################################
  3 | #           SCRIPT TO BUILD SD WAVENET VOCODER             #
  4 | ############################################################
  5 | 
  6 | # Copyright 2017 Tomoki Hayashi (Nagoya University)
  7 | #  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)
  8 | 
  9 | . ./path.sh || exit 1;
 10 | . ./cmd.sh || exit 1;
 11 | 
 12 | # USER SETTINGS {{{
 13 | #######################################
 14 | #           STAGE SETTING             #
 15 | #######################################
 16 | stage=0123456 # every digit listed here enables the matching stage below
 17 | # 0: data preparation step
 18 | # 1: feature extraction step
 19 | # 2: statistics calculation step
 20 | # 3: noise weighting step (runs only when use_noise_shaping=true)
 21 | # 4: training step
 22 | # 5: decoding step
 23 | # 6: noise shaping step (runs only when use_noise_shaping=true)
 24 | 
 25 | #######################################
 26 | #          FEATURE SETTING            #
 27 | #######################################
 28 | feature_type=world     # world or melspc (in this recipe fixed to "world")
 29 | spk=elizabeth          # judy (F) or mary (F) or elliot (M) or elizabeth (F)
 30 | minf0=40               # minimum f0
 31 | maxf0=400              # maximum f0
 32 | shiftms=5              # shift length in msec
 33 | fftl=1024              # fft length
 34 | highpass_cutoff=70     # highpass filter cutoff frequency (if 0, the filter is not applied)
 35 | fs=16000               # sampling rate
 36 | mcep_dim=24            # dimension of mel-cepstrum
 37 | mcep_alpha=0.410       # alpha value of mel-cepstrum
 38 | use_noise_shaping=true # whether to use noise shaping (enables stages 3 and 6)
 39 | mag=0.5                # strength of noise shaping (0.0 < mag <= 1.0)
 40 | n_jobs=10              # number of parallel jobs
 41 | 
 42 | #######################################
 43 | #          TRAINING SETTING           #
 44 | #######################################
 45 | n_gpus=1                  # number of gpus
 46 | n_quantize=256            # number of quantization of waveform
 47 | n_aux=28                  # number of auxiliary features
 48 | n_resch=512               # number of residual channels
 49 | n_skipch=256              # number of skip channels
 50 | dilation_depth=10         # dilation depth (e.g. if set 10, max dilation = 2^(10-1))
 51 | dilation_repeat=3         # number of dilation repeats
 52 | kernel_size=2             # kernel size of dilated convolution
 53 | lr=1e-4                   # learning rate
 54 | weight_decay=0.0          # weight decay coef
 55 | iters=200000              # number of iterations
 56 | batch_length=20000        # batch length
 57 | batch_size=1              # batch size
 58 | checkpoint_interval=10000 # save model per this number
 59 | use_upsampling=true       # whether to use upsampling layer
 60 | resume=""                 # checkpoint path to resume (Optional)
 61 | 
 62 | #######################################
 63 | #          DECODING SETTING           #
 64 | #######################################
 65 | outdir=""            # directory to save decoded wavs (Optional; default: ${expdir}/wav)
 66 | checkpoint=""        # checkpoint path to be used for decoding (Optional; default: ${expdir}/checkpoint-final.pkl)
 67 | config=""            # model configuration path (Optional; default: model.conf next to the checkpoint)
 68 | stats=""             # statistics path (Optional; default: stats.h5 next to the checkpoint)
 69 | feats=""             # list or directory of feature files (Optional; default: data/${eval}/feats.scp)
 70 | decode_batch_size=32 # batch size in decoding
 71 | 
 72 | #######################################
 73 | #            OTHER SETTING            #
 74 | #######################################
 75 | DB_ROOT=downloads # directory containing the DB (downloaded here unless ${DB_ROOT}/.done exists)
 76 | tag=""            # tag for experiment directory naming (Optional; if set, expdir=exp/tr_mai_16k_${tag})
 77 | 
 78 | # parse options (command-line --name value pairs override the defaults above)
 79 | . parse_options.sh || exit 1;
 80 | 
 81 | # check feature type; the diagnostics go to stderr and report the rejected value
 82 | if [ "${feature_type}" != "world" ]; then
 83 |     echo "This recipe does not support feature_type=\"${feature_type}\"." >&2
 84 |     echo "Please try the egs/m-ailabs-speech/sd-melspc." >&2
 85 |     exit 1;
 86 | fi
 87 | 
 88 | # set directory names (training / evaluation subsets of the selected speaker)
 89 | train=tr_${spk}
 90 | eval=ev_${spk}
 91 | 
 92 | # from here on: abort on any failing command, unset variable, or failing pipeline stage
 93 | set -euo pipefail
 94 | # }}}
 95 | 
 96 | 
 97 | # STAGE 0 {{{
 98 | if echo ${stage} | grep -q 0; then
 99 |     echo "###########################################################"
100 |     echo "#                 DATA PREPARATION STEP                   #"
101 |     echo "###########################################################"
102 |     if [ ! -e ${DB_ROOT}/.done ];then
103 |         mkdir -p ${DB_ROOT}
104 |         cd ${DB_ROOT}
105 |         wget http://www.caito.de/data/Training/stt_tts/en_US.tgz
106 |         wget http://www.caito.de/data/Training/stt_tts/en_UK.tgz
107 |         tar xzvf en_US.tgz
108 |         tar xzvf en_UK.tgz
109 |         rm ./*.tgz
110 |         cd ../
111 |         touch ${DB_ROOT}/.done
112 |         echo "database is successfully downloaded."
113 |     fi
114 |     [ ! -e data/local ] && mkdir -p data/local
115 |     [ ! -e data/${train} ] && mkdir -p data/${train}
116 |     [ ! -e data/${eval} ] && mkdir -p data/${eval}
117 |     if [ ${spk} = "elizabeth" ]; then
118 |         find ${DB_ROOT}/en_UK/by_book/female/elizabeth_klett -name "*.wav" \
119 |            | sort > data/local/wav.${spk}.scp
120 |         grep -v "wives_and_daughters_60_" data/local/wav.${spk}.scp > data/${train}/wav.scp
121 |         grep "wives_and_daughters_60_" data/local/wav.${spk}.scp > data/${eval}/wav.scp
122 |     elif [ ${spk} = "judy" ]; then
123 |         find ${DB_ROOT}/en_US/by_book/female/judy_bieber -name "*.wav" \
124 |            | sort > data/local/wav.${spk}.scp
125 |         grep -v "the_sea_faries_22_" data/local/wav.${spk}.scp > data/${train}/wav.scp
126 |         grep "the_sea_faries_22_" data/local/wav.${spk}.scp > data/${eval}/wav.scp
127 |     elif [ ${spk} = "mary" ]; then
128 |         find ${DB_ROOT}/en_US/by_book/female/mary_ann -name "*.wav" \
129 |            | sort > data/local/wav.${spk}.scp
130 |         grep -v "northandsouth_52_" data/local/wav.${spk}.scp > data/${train}/wav.scp
131 |         grep "northandsouth_52_" data/local/wav.${spk}.scp > data/${eval}/wav.scp
132 |     elif [ ${spk} = "elliot" ]; then
133 |         find ${DB_ROOT}/en_US/by_book/male/elliot_miller -name "*.wav" \
134 |            | sort > data/local/wav.${spk}.scp
135 |         grep -v "silent_bullet_13_" data/local/wav.${spk}.scp > data/${train}/wav.scp
136 |         grep "silent_bullet_13_" data/local/wav.${spk}.scp > data/${eval}/wav.scp
137 |     else
138 |         echo "ERROR: spk should be selected from elizabeth, judy, mary, and elliot"
139 |         exit 1
140 |     fi
141 |     echo "making wav list for training is successfully done. (#training = $(wc -l < data/${train}/wav.scp))"
142 |     echo "making wav list for evaluation is successfully done. (#evaluation = $(wc -l < data/${eval}/wav.scp))"
143 | fi
144 | # }}}
145 | 
146 | 
147 | # STAGE 1 {{{
148 | # extract features for the training and evaluation sets, then write the wav_hpf.scp / feats.scp lists
149 | if [[ "${stage}" == *1* ]]; then
150 |     echo "###########################################################"
151 |     echo "#               FEATURE EXTRACTION STEP                   #"
152 |     echo "###########################################################"
153 |     for set in ${train} ${eval};do
154 |         # feature extraction for the current set
155 |         ${train_cmd} --num-threads ${n_jobs} exp/feature_extract/feature_extract_${set}.log \
156 |             feature_extract.py \
157 |                 --waveforms data/${set}/wav.scp \
158 |                 --wavdir wav_hpf/${set} \
159 |                 --hdf5dir hdf5/${set} \
160 |                 --feature_type ${feature_type} \
161 |                 --fs ${fs} \
162 |                 --shiftms ${shiftms} \
163 |                 --minf0 ${minf0} --maxf0 ${maxf0} \
164 |                 --mcep_dim ${mcep_dim} \
165 |                 --mcep_alpha ${mcep_alpha} \
166 |                 --highpass_cutoff ${highpass_cutoff} \
167 |                 --fftl ${fftl} \
168 |                 --n_jobs ${n_jobs}
169 | 
170 |         # compare the number of feature files with the number of listed wavs
171 |         n_wavs=$(wc -l < data/${set}/wav.scp)  # "<" keeps the file name out of the count
172 |         n_feats=$(find hdf5/${set} -name "*.h5" | wc -l)
173 |         echo "${n_feats}/${n_wavs} files are successfully processed."
174 | 
175 |         # make scp files (with highpass_cutoff=0 the original wav list is reused)
176 |         if [ "${highpass_cutoff}" -eq 0 ];then
177 |             cp data/${set}/wav.scp data/${set}/wav_hpf.scp
178 |         else
179 |             find wav_hpf/${set} -name "*.wav" | sort > data/${set}/wav_hpf.scp
180 |         fi
181 |         find hdf5/${set} -name "*.h5" | sort > data/${set}/feats.scp
182 |     done
183 | fi
184 | # }}}
185 | 
186 | 
187 | # STAGE 2 {{{
188 | # compute statistics of the training features into data/${train}/stats.h5
189 | if [[ ${stage} == *2* ]]; then
190 |     echo "###########################################################"
191 |     echo "#              CALCULATE STATISTICS STEP                  #"
192 |     echo "###########################################################"
193 |     ${train_cmd} exp/calculate_statistics/calc_stats_${train}.log calc_stats.py \
194 |         --feats data/${train}/feats.scp \
195 |         --stats data/${train}/stats.h5 \
196 |         --feature_type ${feature_type}
197 |     echo "statistics are successfully calculated."
198 | fi
199 | # }}}
200 | 
201 | 
202 | # STAGE 3 {{{
203 | # apply noise weighting (--inv true) to the training waveforms; runs only when use_noise_shaping is true
204 | if [[ "${stage}" == *3* ]] && ${use_noise_shaping}; then
205 |     echo "###########################################################"
206 |     echo "#                  NOISE WEIGHTING STEP                   #"
207 |     echo "###########################################################"
208 |     ${train_cmd} --num-threads ${n_jobs} exp/noise_shaping/noise_shaping_apply_${train}.log \
209 |         noise_shaping.py \
210 |             --waveforms data/${train}/wav_hpf.scp \
211 |             --stats data/${train}/stats.h5 \
212 |             --outdir wav_nwf/${train} \
213 |             --feature_type ${feature_type} \
214 |             --fs ${fs} \
215 |             --shiftms ${shiftms} \
216 |             --mcep_dim_start 2 --mcep_dim_end $(( 2 + mcep_dim + 1 )) \
217 |             --mcep_alpha ${mcep_alpha} \
218 |             --mag ${mag} \
219 |             --inv true \
220 |             --n_jobs ${n_jobs}
221 | 
222 |     # compare the number of written wav files with the number of listed inputs
223 |     n_wavs=$(wc -l < data/${train}/wav_hpf.scp)  # "<" keeps the file name out of the count
224 |     n_ns=$(find wav_nwf/${train} -name "*.wav" | wc -l)
225 |     echo "${n_ns}/${n_wavs} files are successfully processed."
226 | 
227 |     # make scp files
228 |     find wav_nwf/${train} -name "*.wav" | sort > data/${train}/wav_nwf.scp
229 | fi # }}}
230 | 
231 | 
232 | # STAGE 4 {{{
233 | # set variables
234 | if [ ! -n "${tag}" ];then
235 |     expdir=exp/tr_mai_16k_sd_${feature_type}_${spk}_nq${n_quantize}_na${n_aux}_nrc${n_resch}_nsc${n_skipch}_ks${kernel_size}_dp${dilation_depth}_dr${dilation_repeat}_lr${lr}_wd${weight_decay}_bl${batch_length}_bs${batch_size}
236 |     if ${use_noise_shaping};then
237 |         expdir=${expdir}_ns
238 |     fi
239 |     if ${use_upsampling};then
240 |         expdir=${expdir}_up
241 |     fi
242 | else
243 |     expdir=exp/tr_mai_16k_${tag}
244 | fi
245 | if echo ${stage} | grep -q 4; then
246 |     echo "###########################################################"
247 |     echo "#               WAVENET TRAINING STEP                     #"
248 |     echo "###########################################################"
249 |     if ${use_noise_shaping};then
250 |         waveforms=data/${train}/wav_nwf.scp
251 |     else
252 |         waveforms=data/${train}/wav_hpf.scp
253 |     fi
254 |     upsampling_factor=$(echo "${shiftms} * ${fs} / 1000" | bc)
255 |     [ ! -e ${expdir}/log ] && mkdir -p ${expdir}/log
256 |     [ ! -e ${expdir}/stats.h5 ] && cp -v data/${train}/stats.h5 ${expdir}
257 |     ${cuda_cmd} --gpu ${n_gpus} "${expdir}/log/${train}.log" \
258 |         train.py \
259 |             --n_gpus ${n_gpus} \
260 |             --waveforms ${waveforms} \
261 |             --feats data/${train}/feats.scp \
262 |             --stats data/${train}/stats.h5 \
263 |             --expdir "${expdir}" \
264 |             --feature_type ${feature_type} \
265 |             --n_quantize ${n_quantize} \
266 |             --n_aux ${n_aux} \
267 |             --n_resch ${n_resch} \
268 |             --n_skipch ${n_skipch} \
269 |             --dilation_depth ${dilation_depth} \
270 |             --dilation_repeat ${dilation_repeat} \
271 |             --kernel_size ${kernel_size} \
272 |             --lr ${lr} \
273 |             --weight_decay ${weight_decay} \
274 |             --iters ${iters} \
275 |             --batch_length ${batch_length} \
276 |             --batch_size ${batch_size} \
277 |             --checkpoint_interval ${checkpoint_interval} \
278 |             --upsampling_factor "${upsampling_factor}" \
279 |             --use_upsampling_layer ${use_upsampling} \
280 |             --resume "${resume}"
281 | fi
282 | # }}}
283 | 
284 | 
285 | # STAGE 5 {{{
286 | # fill in every decoding option that was left empty; config and stats default to the checkpoint's directory
287 | [ -n "${outdir}" ] || outdir=${expdir}/wav
288 | [ -n "${checkpoint}" ] || checkpoint=${expdir}/checkpoint-final.pkl
289 | [ -n "${config}" ] || config=$(dirname ${checkpoint})/model.conf
290 | [ -n "${stats}" ] || stats=$(dirname ${checkpoint})/stats.h5
291 | [ -n "${feats}" ] || feats=data/${eval}/feats.scp
292 | if [[ ${stage} == *5* ]]; then
293 |     echo "###########################################################"
294 |     echo "#               WAVENET DECODING STEP                     #"
295 |     echo "###########################################################"
296 |     ${cuda_cmd} --gpu ${n_gpus} "${outdir}/log/decode.log" \
297 |         decode.py \
298 |             --n_gpus ${n_gpus} \
299 |             --feats ${feats} \
300 |             --stats ${stats} \
301 |             --outdir "${outdir}" \
302 |             --checkpoint "${checkpoint}" \
303 |             --config "${config}" \
304 |             --fs ${fs} --batch_size ${decode_batch_size}
305 | fi
306 | # }}}
307 | 
308 | 
309 | # STAGE 6 {{{
310 | # run noise_shaping.py with --inv false on the decoded waveforms; skipped unless use_noise_shaping is true
311 | if [[ ${stage} == *6* ]] && ${use_noise_shaping}; then
312 |     echo "###########################################################"
313 |     echo "#                  NOISE SHAPING STEP                     #"
314 |     echo "###########################################################"
315 |     # list the wav files found under ${outdir}
316 |     find "${outdir}" -name "*.wav" | sort > ${outdir}/wav.scp
317 |     ${train_cmd} --num-threads ${n_jobs} exp/noise_shaping/noise_shaping_restore_${eval}.log \
318 |         noise_shaping.py \
319 |             --waveforms ${outdir}/wav.scp \
320 |             --stats ${stats} \
321 |             --outdir "${outdir}_nsf" \
322 |             --feature_type ${feature_type} \
323 |             --fs ${fs} --shiftms ${shiftms} \
324 |             --mcep_dim_start 2 --mcep_dim_end $(( 2 + mcep_dim + 1 )) \
325 |             --mcep_alpha ${mcep_alpha} \
326 |             --mag ${mag} \
327 |             --n_jobs ${n_jobs} \
328 |             --inv false
329 | fi
330 | # }}}
331 | 


--------------------------------------------------------------------------------