├── tests ├── __init__.py ├── test-deep-learning-yuduki-yukari.wav └── test_dataset.py ├── become_yukarin ├── model │ ├── __init__.py │ ├── sr_model.py │ ├── model.py │ └── cbhg_model.py ├── config │ ├── __init__.py │ ├── old_config.py │ ├── sr_config.py │ └── config.py ├── updater │ ├── __init__.py │ ├── sr_updater.py │ └── updater.py ├── dataset │ ├── __init__.py │ ├── utility.py │ └── dataset.py ├── __init__.py ├── param.py ├── voice_changer.py ├── vocoder.py ├── data_struct.py ├── super_resolution.py └── acoustic_converter.py ├── requirements.txt ├── recipe ├── recipe.json ├── config_sr.json └── config.json ├── scripts ├── ln_atr503_to_subset.py ├── ln_jnas_subset.py ├── ln_apply_subset.py ├── voice_conversion_test.py ├── extract_spectrogram_pair.py ├── super_resolution_test.py ├── launch.py └── extract_acoustic_feature.py ├── setup.py ├── LICENSE ├── README_jp.md ├── README.md ├── train.py └── train_sr.py /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /become_yukarin/model/__init__.py: -------------------------------------------------------------------------------- 1 | from . import model 2 | from . import sr_model 3 | -------------------------------------------------------------------------------- /become_yukarin/config/__init__.py: -------------------------------------------------------------------------------- 1 | from . import config 2 | from . import sr_config 3 | -------------------------------------------------------------------------------- /become_yukarin/updater/__init__.py: -------------------------------------------------------------------------------- 1 | from . import sr_updater 2 | from . import updater 3 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy 2 | cupy<6.0.0 3 | chainer<6.0.0 4 | librosa<0.7.0 5 | pysptk 6 | pyworld 7 | fastdtw 8 | matplotlib 9 | tqdm 10 | -------------------------------------------------------------------------------- /tests/test-deep-learning-yuduki-yukari.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Hiroshiba/become-yukarin/HEAD/tests/test-deep-learning-yuduki-yukari.wav -------------------------------------------------------------------------------- /become_yukarin/dataset/__init__.py: -------------------------------------------------------------------------------- 1 | from . import dataset 2 | from . import utility 3 | from .dataset import create 4 | from .dataset import create_sr 5 | -------------------------------------------------------------------------------- /become_yukarin/__init__.py: -------------------------------------------------------------------------------- 1 | from . import config 2 | from . import dataset 3 | from . 
import param 4 | from .acoustic_converter import AcousticConverter 5 | from .super_resolution import SuperResolution 6 | from .vocoder import RealtimeVocoder 7 | from .vocoder import Vocoder 8 | from .voice_changer import VoiceChanger 9 | -------------------------------------------------------------------------------- /become_yukarin/param.py: -------------------------------------------------------------------------------- 1 | from typing import NamedTuple 2 | 3 | 4 | class VoiceParam(NamedTuple): 5 | sample_rate: int = 24000 6 | top_db: float = None 7 | pad_second: float = 0.0 8 | 9 | 10 | class AcousticFeatureParam(NamedTuple): 11 | frame_period: int = 5 12 | order: int = 8 13 | alpha: float = 0.466 14 | f0_estimating_method: str = 'harvest' # dio / harvest 15 | 16 | 17 | class Param(NamedTuple): 18 | voice_param: VoiceParam = VoiceParam() 19 | acoustic_feature_param: AcousticFeatureParam = AcousticFeatureParam() 20 | -------------------------------------------------------------------------------- /recipe/recipe.json: -------------------------------------------------------------------------------- 1 | { 2 | "each": { 3 | "model/generator_extensive_layers": [ 4 | 8, 5 | 0, 6 | 8, 7 | 0 8 | ], 9 | "model/discriminator_extensive_layers": [ 10 | 5, 11 | 0, 12 | 5, 13 | 0 14 | ], 15 | "model/weak_discriminator": [ 16 | true, 17 | true, 18 | false, 19 | false 20 | ], 21 | "train/gpu": [ 22 | 0, 23 | 1, 24 | 2, 25 | 3 26 | ], 27 | "project/name": [ 28 | "pp-weakD-el8", 29 | "pp-weakD-el0", 30 | "pp-el8", 31 | "pp-el0" 32 | ] 33 | }, 34 | "all": { 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /recipe/config_sr.json: -------------------------------------------------------------------------------- 1 | { 2 | "dataset": { 3 | "input_glob": "./feature/*.npy", 4 | "num_test": 5, 5 | "input_global_noise": 3, 6 | "input_local_noise": 3, 7 | "blur_size_factor": 0, 8 | "seed": 0, 9 | "train_crop_size": 512, 10 | "generator_base_channels": 64, 11 | "generator_extensive_layers": 8, 12 | "discriminator_base_channels": 32, 13 | "discriminator_extensive_layers": 5 14 | }, 15 | "loss": { 16 | "mse": 100, 17 | "adversarial": 1 18 | }, 19 | "model": { 20 | }, 21 | "project": { 22 | "name": "", 23 | "tags": [] 24 | }, 25 | "train": { 26 | "batchsize": 8, 27 | "gpu": 0, 28 | "log_iteration": 250, 29 | "snapshot_iteration": 5000 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /scripts/ln_atr503_to_subset.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from pathlib import Path 3 | 4 | parser = argparse.ArgumentParser() 5 | parser.add_argument('input', type=Path) 6 | parser.add_argument('output', type=Path) 7 | parser.add_argument('--prefix', default='') 8 | argument = parser.parse_args() 9 | 10 | input = argument.input # type: Path 11 | output = argument.output # type: Path 12 | 13 | paths = list(sorted(input.glob('*'), key=lambda p: int(''.join(filter(str.isdigit, p.name))))) 14 | assert len(paths) == 503 15 | 16 | output.mkdir(exist_ok=True) 17 | 18 | names = ['{}{:02d}'.format(s, n + 1) for s in 'ABCDEFGHIJ' for n in range(50)] 19 | names += ['J51', 'J52', 'J53'] 20 | 21 | for p, n in zip(paths, names): 22 | out = output / (argument.prefix + n + p.suffix) 23 | out.symlink_to(p) 24 | -------------------------------------------------------------------------------- /setup.py: 
-------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | setup( 4 | name='become_yukarin', 5 | version='1.0.0', 6 | packages=find_packages(), 7 | url='https://github.com/Hiroshiba/become-yukarin', 8 | author='Kazuyuki Hiroshiba', 9 | author_email='hihokaruta@gmail.com', 10 | description='become Yuduki Yukari with DeepLearning power.', 11 | license='MIT License', 12 | install_requires=[ 13 | 'numpy', 14 | 'chainer', 15 | 'librosa', 16 | 'pysptk', 17 | 'pyworld', 18 | 'fastdtw', 19 | 'chainerui', 20 | ], 21 | classifiers=[ 22 | 'Programming Language :: Python :: 3.5', 23 | 'Programming Language :: Python :: 3.6', 24 | 'License :: OSI Approved :: MIT License', 25 | ] 26 | ) 27 | -------------------------------------------------------------------------------- /become_yukarin/config/old_config.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | from typing import NamedTuple 3 | from typing import Optional 4 | 5 | 6 | class CBHGDiscriminatorModelConfig(NamedTuple): 7 | in_channels: int 8 | hidden_channels_list: List[int] 9 | 10 | 11 | class CBHGModelConfig(NamedTuple): 12 | in_channels: int 13 | conv_bank_out_channels: int 14 | conv_bank_k: int 15 | max_pooling_k: int 16 | conv_projections_hidden_channels: int 17 | highway_layers: int 18 | out_channels: int 19 | out_size: int 20 | aligner_out_time_length: int 21 | disable_last_rnn: bool 22 | enable_aligner: bool 23 | discriminator: Optional[CBHGDiscriminatorModelConfig] 24 | 25 | 26 | class CBHGLossConfig(NamedTuple): 27 | l1: float 28 | predictor_fake: float 29 | discriminator_true: float 30 | discriminator_fake: float 31 | discriminator_grad: float 32 | -------------------------------------------------------------------------------- /scripts/ln_jnas_subset.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import multiprocessing 3 | from pathlib import Path 4 | 5 | from jnas_metadata_loader import load_from_directory 6 | from jnas_metadata_loader.jnas_metadata import JnasMetadata 7 | 8 | parser = argparse.ArgumentParser() 9 | parser.add_argument('jnas', type=Path) 10 | parser.add_argument('output', type=Path) 11 | parser.add_argument('--format', default='{sex}{text_id}_{mic}_atr_{subset}{sen_id}.wav') 12 | argument = parser.parse_args() 13 | 14 | jnas = argument.jnas # type: Path 15 | output = argument.output # type: Path 16 | 17 | jnas_list = load_from_directory(str(jnas)) 18 | atr_list = jnas_list.subset_news_or_atr('B') 19 | 20 | output.mkdir(exist_ok=True) 21 | 22 | 23 | def process(d: JnasMetadata): 24 | p = d.path 25 | out = output / argument.format.format(**d._asdict()) 26 | out.symlink_to(p) 27 | 28 | 29 | pool = multiprocessing.Pool() 30 | pool.map(process, atr_list) 31 | -------------------------------------------------------------------------------- /become_yukarin/voice_changer.py: -------------------------------------------------------------------------------- 1 | import numpy 2 | 3 | from .acoustic_converter import AcousticConverter 4 | from .data_struct import AcousticFeature 5 | from .super_resolution import SuperResolution 6 | 7 | 8 | class VoiceChanger(object): 9 | def __init__( 10 | self, 11 | acoustic_converter: AcousticConverter, 12 | super_resolution: SuperResolution, 13 | output_sampling_rate: int = None, 14 | ) -> None: 15 | if output_sampling_rate is None: 16 | output_sampling_rate = 
super_resolution.config.dataset.param.voice_param.sample_rate 17 | 18 | self.acoustic_converter = acoustic_converter 19 | self.super_resolution = super_resolution 20 | self.output_sampling_rate = output_sampling_rate 21 | 22 | def convert_from_acoustic_feature(self, f_in: AcousticFeature): 23 | f_low = self.acoustic_converter.convert_to_feature(f_in) 24 | s_high = self.super_resolution.convert(f_low.spectrogram.astype(numpy.float32)) 25 | f_high = self.super_resolution.convert_to_feature(s_high, f_low) 26 | return f_high 27 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2018 Kazuyuki Hiroshiba. 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in all 11 | copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 19 | SOFTWARE. 
20 | -------------------------------------------------------------------------------- /recipe/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "dataset": { 3 | "features": [ 4 | "f0", 5 | "mfcc" 6 | ], 7 | "input_glob": "/hiho-pause/hiho-pause-atr-*.npy", 8 | "input_global_noise": 0.01, 9 | "input_local_noise": 0.01, 10 | "input_mean_path": "/hiho-pause/mean.npy", 11 | "input_var_path": "/hiho-pause/var.npy", 12 | "num_test": 1, 13 | "seed": 0, 14 | "target_glob": "/yukari-pause/yukari-pause-atr-*.npy", 15 | "target_global_noise": 0.01, 16 | "target_local_noise": 0.01, 17 | "target_mean_path": "/yukari-pause/mean.npy", 18 | "target_var_path": "/yukari-pause/var.npy", 19 | "train_crop_size": 512 20 | }, 21 | "loss": { 22 | "adversarial": 1, 23 | "mse": 100 24 | }, 25 | "model": { 26 | "in_channels": 10, 27 | "out_channels": 10, 28 | "generator_base_channels": 64, 29 | "generator_extensive_layers": 8, 30 | "discriminator_base_channels": 32, 31 | "discriminator_extensive_layers": 5, 32 | "weak_discriminator": false 33 | }, 34 | "project": { 35 | "name": "", 36 | "tags": [] 37 | }, 38 | "train": { 39 | "batchsize": 8, 40 | "gpu": 0, 41 | "log_iteration": 250, 42 | "snapshot_iteration": 5000 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /scripts/ln_apply_subset.py: -------------------------------------------------------------------------------- 1 | """ 2 | ある話者のATR503サブセットを、他の話者に対応するようにコピーする。 3 | targetは、拡張子前3文字がATR503サブセットでないといけない。 4 | """ 5 | 6 | import argparse 7 | from pathlib import Path 8 | import re 9 | from itertools import chain, groupby 10 | 11 | parser = argparse.ArgumentParser() 12 | parser.add_argument('source', type=Path) 13 | parser.add_argument('target', type=Path) 14 | parser.add_argument('output', type=Path) 15 | parser.add_argument('--prefix', default='') 16 | argument = parser.parse_args() 17 | 18 | source = argument.source # type: Path 19 | target = argument.target # type: Path 20 | output = argument.output # type: Path 21 | 22 | # source 23 | sources = list(sorted(source.glob('*'))) 24 | assert len(sources) == 503 25 | 26 | names = ['{}{:02d}'.format(s, n + 1) for s in 'ABCDEFGHIJ' for n in range(50)] 27 | names += ['J51', 'J52', 'J53'] 28 | 29 | assert all(n in s.name for s, n in zip(sources, names)) 30 | 31 | map_source = {n: s for s, n in zip(sources, names)} 32 | 33 | # target 34 | keyfunc = lambda t: t.stem[-3:] 35 | targets = list(target.glob('*')) 36 | map_targets = {n: list(vs) for n, vs in groupby(sorted(targets, key=keyfunc), key=keyfunc)} 37 | 38 | assert all(n in names for n in map_targets.keys()) 39 | assert len(list(chain.from_iterable(map_targets.values()))) == len(targets) 40 | 41 | # output 42 | output.mkdir(exist_ok=True) 43 | 44 | for n in names: 45 | s = map_source[n] 46 | for t in map_targets[n]: 47 | out = output / (argument.prefix + t.stem + s.suffix) 48 | out.symlink_to(s) 49 | -------------------------------------------------------------------------------- /README_jp.md: -------------------------------------------------------------------------------- 1 | # Become Yukarin: 誰でも好きなキャラの声に 2 | Become Yukarinは、機械学習(ディープラーニング)で声質変換を実現するリポジトリです。 3 | 元の声と好きな声の音声データを大量に用いて機械学習することで、 4 | 元の声を好きな声に変換することができるようになります。 5 | 6 | [English README](./README.md) 7 | 8 | ## 推奨環境 9 | * Linux OS 10 | * Python 3.6 11 | 12 | ## 準備 13 | ```bash 14 | # 必要なライブラリをインストール 15 | pip install -r requirements.txt 16 | ``` 17 | 18 | ## 学習させる 19 | 
学習用のPythonスクリプトを実行するには、`become_yukarin`ライブラリをパス(PYTHONPATH)に通す必要があります。 20 | 例えば`scripts/extract_acoustic_feature.py`を以下のように書いて、パスを通しつつ実行します。 21 | 22 | ```bash 23 | PYTHONPATH=`pwd` python scripts/extract_acoustic_feature.py --- 24 | ``` 25 | 26 | ### 第1段階の学習 27 | * 音声データを用意する 28 | * 2つのディレクトリに、入出力の音声データを置く(ファイル名を揃える) 29 | * 音響特徴量を作成する 30 | * `scripts/extract_acoustic_feature.py` 31 | * 学習を回す 32 | * `train.py` 33 | * テストする 34 | * `scripts/voice_conversion_test.py` 35 | 36 | ### 第2段階の学習 37 | * 音声データを用意する 38 | * 1つのディレクトリに音声データを置く 39 | * 音響特徴量を作成する 40 | * `scripts/extract_spectrogram_pair.py` 41 | * 学習を回す 42 | * `train_sr.py` 43 | * テストする 44 | * `scripts/super_resolution_test.py` 45 | * 別の音声データを変換する 46 | * SuperResolutionクラスとAcousticConverterクラスを使うことで変換できます 47 | * [サンプルコード](https://github.com/Hiroshiba/become-yukarin/blob/ipynb/show%20vc%20and%20sr.ipynb) 48 | 49 | ## 参考 50 | * [ipynbブランチ](https://github.com/Hiroshiba/become-yukarin/tree/ipynb)に大量にサンプルが置いてあります 51 | * [解説ブログ](https://hiroshiba.github.io/blog/became-yuduki-yukari-with-deep-learning-power/) 52 | * [Realtime Yukarin](https://github.com/Hiroshiba/realtime-yukarin)を使うことで、リアルタイムに声質変換することができます 53 | 54 | ## License 55 | [MIT License](./LICENSE) 56 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Become Yukarin: Convert your voice to favorite voice 2 | Become Yukarin is a repository for voice conversion with a Deep Learning model. 3 | By training on a large amount of voice data from both the original voice and the favorite voice, 4 | the Deep Learning model can convert the original voice into the favorite voice. 5 | 6 | [Japanese README](./README_jp.md) 7 | 8 | ## Supported environment 9 | * Linux OS 10 | * Python 3.6 11 | 12 | ## Preparation 13 | ```bash 14 | # install required libraries 15 | pip install -r requirements.txt 16 | ``` 17 | 18 | ## Training 19 | To run a Python script for training, 20 | you should set the environment variable `PYTHONPATH` to find the `become_yukarin` library. 
21 | For example, you can execute `scripts/extract_acoustic_feature.py` with the following command: 22 | 23 | ```bash 24 | PYTHONPATH=`pwd` python scripts/extract_acoustic_feature.py --- 25 | ``` 26 | 27 | ## First Stage Model 28 | * Prepare voice data 29 | * Put input/target voice data in two directories (with same file names) 30 | * Create acoustic feature 31 | * `scripts/extract_acoustic_feature.py` 32 | * Train 33 | * `train.py` 34 | * Test 35 | * `scripts/voice_conversion_test.py` 36 | 37 | ## Second Stage Model 38 | * Prepare voice data 39 | * Put voice data in one directory 40 | * Create acoustic feature 41 | * `scripts/extract_spectrogram_pair.py` 42 | * Train 43 | * `train_sr.py` 44 | * Test 45 | * `scripts/super_resolution_test.py` 46 | * Convert other voice data 47 | * Use SuperResolution class and AcousticConverter class 48 | * [sample code](https://github.com/Hiroshiba/become-yukarin/blob/ipynb/show%20vc%20and%20sr.ipynb) 49 | 50 | ## Reference 51 | * [ipynb branch](https://github.com/Hiroshiba/become-yukarin/tree/ipynb): Other sample code 52 | * [Commentary Blog (Japanese)](https://hiroshiba.github.io/blog/became-yuduki-yukari-with-deep-learning-power/) 53 | * [Realtime Yukarin](https://github.com/Hiroshiba/realtime-yukarin): Real-time voice conversion system 54 | 55 | ## License 56 | [MIT License](./LICENSE) 57 | -------------------------------------------------------------------------------- /tests/test_dataset.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import numpy 4 | from become_yukarin.dataset import dataset 5 | 6 | 7 | class TestDataset(unittest.TestCase): 8 | def setUp(self): 9 | self.sample_rate = 24000 10 | self.len_time = len_time = 100 11 | self.fft_size = fft_size = 1024 12 | self.order = order = 59 13 | self.dummy_feature = dataset.AcousticFeature( 14 | f0=numpy.arange(len_time).reshape((len_time, -1)), 15 | spectrogram=numpy.arange(len_time * (fft_size // 2 + 1)).reshape((len_time, -1)), 16 | aperiodicity=numpy.arange(len_time * (fft_size // 2 + 1)).reshape((len_time, -1)), 17 | mfcc=numpy.arange(len_time * (order + 1)).reshape((len_time, -1)), 18 | voiced=(numpy.arange(len_time) % 2 == 1).reshape((len_time, -1)), 19 | ) 20 | self.feature_sizes = dataset.AcousticFeature.get_sizes( 21 | sampling_rate=self.sample_rate, 22 | order=self.order, 23 | ) 24 | 25 | def test_encode_decode_feature(self): 26 | encode_feature = dataset.EncodeFeatureProcess(['mfcc']) 27 | decode_feature = dataset.DecodeFeatureProcess(['mfcc'], self.feature_sizes) 28 | e = encode_feature(self.dummy_feature, test=True) 29 | d = decode_feature(e, test=True) 30 | self.assertTrue(numpy.all(self.dummy_feature.mfcc == d.mfcc)) 31 | 32 | def test_encode_decode_feature2(self): 33 | encode_feature = dataset.EncodeFeatureProcess(['mfcc', 'f0']) 34 | decode_feature = dataset.DecodeFeatureProcess(['mfcc', 'f0'], self.feature_sizes) 35 | e = encode_feature(self.dummy_feature, test=True) 36 | d = decode_feature(e, test=True) 37 | self.assertTrue(numpy.all(self.dummy_feature.mfcc == d.mfcc)) 38 | self.assertTrue(numpy.all(self.dummy_feature.f0 == d.f0)) 39 | 40 | def test_encode_decode_feature3(self): 41 | encode_feature = dataset.EncodeFeatureProcess(['mfcc', 'f0']) 42 | decode_feature = dataset.DecodeFeatureProcess(['mfcc', 'f0'], self.feature_sizes) 43 | e = encode_feature(self.dummy_feature, test=True) 44 | e[0] = numpy.nan 45 | d = decode_feature(e, test=True) 46 | self.assertFalse(numpy.all(self.dummy_feature.mfcc == 
d.mfcc)) 47 | self.assertTrue(numpy.all(self.dummy_feature.f0 == d.f0)) 48 | 49 | 50 | if __name__ == '__main__': 51 | unittest.main() 52 | -------------------------------------------------------------------------------- /scripts/voice_conversion_test.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import glob 3 | import multiprocessing 4 | import re 5 | from functools import partial 6 | from pathlib import Path 7 | 8 | import librosa 9 | import numpy 10 | 11 | from become_yukarin import AcousticConverter 12 | from become_yukarin.config.config import create_from_json as create_config 13 | 14 | parser = argparse.ArgumentParser() 15 | parser.add_argument('model_names', nargs='+') 16 | parser.add_argument('-md', '--model_directory', type=Path, default=Path('/mnt/dwango/hiroshiba/become-yukarin/')) 17 | parser.add_argument('-iwd', '--input_wave_directory', type=Path, 18 | default=Path('/mnt/dwango/hiroshiba/become-yukarin/dataset/hiho-wave/hiho-pause-atr503-subset/')) 19 | parser.add_argument('-it', '--iteration', type=int) 20 | parser.add_argument('-g', '--gpu', type=int) 21 | args = parser.parse_args() 22 | 23 | model_directory = args.model_directory # type: Path 24 | input_wave_directory = args.input_wave_directory # type: Path 25 | it = args.iteration 26 | gpu = args.gpu 27 | 28 | paths_test = list(Path('./test_data/').glob('*.wav')) 29 | 30 | 31 | def extract_number(f): 32 | s = re.findall("\d+", str(f)) 33 | return int(s[-1]) if s else -1 34 | 35 | 36 | def process(p: Path, acoustic_converter: AcousticConverter): 37 | try: 38 | if p.suffix in ['.npy', '.npz']: 39 | fn = glob.glob(str(input_wave_directory / p.stem) + '.*')[0] 40 | p = Path(fn) 41 | wave = acoustic_converter(p) 42 | librosa.output.write_wav(str(output / p.stem) + '.wav', wave.wave, wave.sampling_rate, norm=True) 43 | except: 44 | import traceback 45 | print('error!', str(p)) 46 | print(traceback.format_exc()) 47 | 48 | 49 | for model_name in args.model_names: 50 | base_model = model_directory / model_name 51 | config = create_config(base_model / 'config.json') 52 | 53 | input_paths = list(sorted([Path(p) for p in glob.glob(str(config.dataset.input_glob))])) 54 | numpy.random.RandomState(config.dataset.seed).shuffle(input_paths) 55 | path_train = input_paths[0] 56 | path_test = input_paths[-1] 57 | 58 | if it is not None: 59 | model_path = base_model / 'predictor_{}.npz'.format(it) 60 | else: 61 | model_paths = base_model.glob('predictor_*.npz') 62 | model_path = list(sorted(model_paths, key=extract_number))[-1] 63 | print(model_path) 64 | acoustic_converter = AcousticConverter(config, model_path, gpu=gpu) 65 | 66 | output = Path('./output').absolute() / base_model.name 67 | output.mkdir(exist_ok=True) 68 | 69 | paths = [path_train, path_test] + paths_test 70 | 71 | process_partial = partial(process, acoustic_converter=acoustic_converter) 72 | if gpu is None: 73 | pool = multiprocessing.Pool() 74 | pool.map(process_partial, paths) 75 | else: 76 | list(map(process_partial, paths)) 77 | -------------------------------------------------------------------------------- /scripts/extract_spectrogram_pair.py: -------------------------------------------------------------------------------- 1 | """ 2 | extract low and high quality spectrogram data. 
3 | """ 4 | 5 | import argparse 6 | import multiprocessing 7 | from pathlib import Path 8 | from pprint import pprint 9 | 10 | import numpy 11 | import pysptk 12 | import pyworld 13 | from tqdm import tqdm 14 | 15 | from become_yukarin.dataset.dataset import AcousticFeatureProcess 16 | from become_yukarin.dataset.dataset import WaveFileLoadProcess 17 | from become_yukarin.param import AcousticFeatureParam 18 | from become_yukarin.param import VoiceParam 19 | 20 | base_voice_param = VoiceParam() 21 | base_acoustic_feature_param = AcousticFeatureParam() 22 | 23 | parser = argparse.ArgumentParser() 24 | parser.add_argument('--input_directory', '-i', type=Path) 25 | parser.add_argument('--output_directory', '-o', type=Path) 26 | parser.add_argument('--sample_rate', type=int, default=base_voice_param.sample_rate) 27 | parser.add_argument('--top_db', type=float, default=base_voice_param.top_db) 28 | parser.add_argument('--pad_second', type=float, default=base_voice_param.pad_second) 29 | parser.add_argument('--frame_period', type=int, default=base_acoustic_feature_param.frame_period) 30 | parser.add_argument('--order', type=int, default=base_acoustic_feature_param.order) 31 | parser.add_argument('--alpha', type=float, default=base_acoustic_feature_param.alpha) 32 | parser.add_argument('--f0_estimating_method', default=base_acoustic_feature_param.f0_estimating_method) 33 | parser.add_argument('--enable_overwrite', action='store_true') 34 | arguments = parser.parse_args() 35 | 36 | 37 | def generate_file(path): 38 | out = Path(arguments.output_directory, path.stem + '.npy') 39 | if out.exists() and not arguments.enable_overwrite: 40 | return 41 | 42 | # load wave and padding 43 | wave_file_load_process = WaveFileLoadProcess( 44 | sample_rate=arguments.sample_rate, 45 | top_db=arguments.top_db, 46 | pad_second=arguments.pad_second, 47 | ) 48 | wave = wave_file_load_process(path, test=True) 49 | 50 | # make acoustic feature 51 | acoustic_feature_process = AcousticFeatureProcess( 52 | frame_period=arguments.frame_period, 53 | order=arguments.order, 54 | alpha=arguments.alpha, 55 | f0_estimating_method=arguments.f0_estimating_method, 56 | ) 57 | feature = acoustic_feature_process(wave, test=True).astype_only_float(numpy.float32) 58 | high_spectrogram = feature.spectrogram 59 | 60 | fftlen = pyworld.get_cheaptrick_fft_size(arguments.sample_rate) 61 | low_spectrogram = pysptk.mc2sp( 62 | feature.mfcc, 63 | alpha=arguments.alpha, 64 | fftlen=fftlen, 65 | ) 66 | 67 | # save 68 | numpy.save(out.absolute(), { 69 | 'low': low_spectrogram, 70 | 'high': high_spectrogram, 71 | }) 72 | 73 | 74 | def main(): 75 | pprint(vars(arguments)) 76 | 77 | paths = list(sorted(arguments.input_directory.glob('*'))) 78 | arguments.output_directory.mkdir(exist_ok=True) 79 | 80 | pool = multiprocessing.Pool() 81 | list(tqdm(pool.imap(generate_file, paths), total=len(paths))) 82 | 83 | 84 | if __name__ == '__main__': 85 | main() 86 | -------------------------------------------------------------------------------- /become_yukarin/updater/sr_updater.py: -------------------------------------------------------------------------------- 1 | import chainer 2 | import chainer.functions as F 3 | from become_yukarin.config.sr_config import SRLossConfig 4 | 5 | from become_yukarin.model.sr_model import SRDiscriminator 6 | from become_yukarin.model.sr_model import SRPredictor 7 | 8 | 9 | class SRUpdater(chainer.training.StandardUpdater): 10 | def __init__( 11 | self, 12 | loss_config: SRLossConfig, 13 | predictor: SRPredictor, 14 | 
discriminator: SRDiscriminator, 15 | *args, 16 | **kwargs, 17 | ) -> None: 18 | super().__init__(*args, **kwargs) 19 | self.loss_config = loss_config 20 | self.predictor = predictor 21 | self.discriminator = discriminator 22 | 23 | def _loss_predictor(self, predictor, output, target, d_fake): 24 | b, _, w, h = d_fake.data.shape 25 | 26 | loss_mse = (F.mean_absolute_error(output, target)) 27 | chainer.report({'mse': loss_mse}, predictor) 28 | 29 | loss_adv = F.sum(F.softplus(-d_fake)) / (b * w * h) 30 | chainer.report({'adversarial': loss_adv}, predictor) 31 | 32 | loss = self.loss_config.mse * loss_mse + self.loss_config.adversarial * loss_adv 33 | chainer.report({'loss': loss}, predictor) 34 | return loss 35 | 36 | def _loss_discriminator(self, discriminator, d_real, d_fake): 37 | b, _, w, h = d_real.data.shape 38 | 39 | loss_real = F.sum(F.softplus(-d_real)) / (b * w * h) 40 | chainer.report({'real': loss_real}, discriminator) 41 | 42 | loss_fake = F.sum(F.softplus(d_fake)) / (b * w * h) 43 | chainer.report({'fake': loss_fake}, discriminator) 44 | 45 | loss = loss_real + loss_fake 46 | chainer.report({'loss': loss}, discriminator) 47 | 48 | tp = (d_real.data > 0.5).sum() 49 | fp = (d_fake.data > 0.5).sum() 50 | fn = (d_real.data <= 0.5).sum() 51 | tn = (d_fake.data <= 0.5).sum() 52 | accuracy = (tp + tn) / (tp + fp + fn + tn) 53 | precision = tp / (tp + fp) 54 | recall = tp / (tp + fn) 55 | chainer.report({'accuracy': accuracy}, self.discriminator) 56 | chainer.report({'precision': precision}, self.discriminator) 57 | chainer.report({'recall': recall}, self.discriminator) 58 | return loss 59 | 60 | def forward(self, input, target): 61 | output = self.predictor(input) 62 | d_fake = self.discriminator(input, output) 63 | d_real = self.discriminator(input, target) 64 | 65 | loss = { 66 | 'predictor': self._loss_predictor(self.predictor, output, target, d_fake), 67 | 'discriminator': self._loss_discriminator(self.discriminator, d_real, d_fake), 68 | } 69 | return loss 70 | 71 | def update_core(self): 72 | opt_predictor = self.get_optimizer('predictor') 73 | opt_discriminator = self.get_optimizer('discriminator') 74 | 75 | batch = self.get_iterator('main').next() 76 | batch = self.converter(batch, self.device) 77 | loss = self.forward(**batch) 78 | 79 | opt_predictor.update(loss.get, 'predictor') 80 | opt_discriminator.update(loss.get, 'discriminator') 81 | -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from functools import partial 3 | from pathlib import Path 4 | 5 | from chainer import cuda 6 | from chainer import optimizers 7 | from chainer import training 8 | from chainer.dataset import convert 9 | from chainer.iterators import MultiprocessIterator 10 | from chainer.training import extensions 11 | 12 | from become_yukarin.config.config import create_from_json 13 | from become_yukarin.dataset import create as create_dataset 14 | from become_yukarin.model.model import create 15 | from become_yukarin.updater.updater import Updater 16 | 17 | if __name__ == '__main__': 18 | parser = argparse.ArgumentParser() 19 | parser.add_argument('config_json_path', type=Path) 20 | parser.add_argument('output', type=Path) 21 | arguments = parser.parse_args() 22 | 23 | config = create_from_json(arguments.config_json_path) 24 | arguments.output.mkdir(exist_ok=True) 25 | config.save_as_json((arguments.output / 'config.json').absolute()) 26 | 27 | # model 28 | if 
config.train.gpu >= 0: 29 | cuda.get_device_from_id(config.train.gpu).use() 30 | predictor, discriminator = create(config.model) 31 | models = { 32 | 'predictor': predictor, 33 | 'discriminator': discriminator, 34 | } 35 | 36 | # dataset 37 | dataset = create_dataset(config.dataset) 38 | train_iter = MultiprocessIterator(dataset['train'], config.train.batchsize) 39 | test_iter = MultiprocessIterator(dataset['test'], config.train.batchsize, repeat=False, shuffle=False) 40 | train_eval_iter = MultiprocessIterator(dataset['train_eval'], config.train.batchsize, repeat=False, shuffle=False) 41 | 42 | 43 | # optimizer 44 | def create_optimizer(model): 45 | optimizer = optimizers.Adam(alpha=0.0002, beta1=0.5, beta2=0.999) 46 | optimizer.setup(model) 47 | return optimizer 48 | 49 | 50 | opts = {key: create_optimizer(model) for key, model in models.items()} 51 | 52 | # updater 53 | converter = partial(convert.concat_examples, padding=0) 54 | updater = Updater( 55 | loss_config=config.loss, 56 | predictor=predictor, 57 | discriminator=discriminator, 58 | device=config.train.gpu, 59 | iterator=train_iter, 60 | optimizer=opts, 61 | converter=converter, 62 | ) 63 | 64 | # trainer 65 | trigger_log = (config.train.log_iteration, 'iteration') 66 | trigger_snapshot = (config.train.snapshot_iteration, 'iteration') 67 | 68 | trainer = training.Trainer(updater, out=arguments.output) 69 | 70 | ext = extensions.Evaluator(test_iter, models, converter, device=config.train.gpu, eval_func=updater.forward) 71 | trainer.extend(ext, name='test', trigger=trigger_log) 72 | ext = extensions.Evaluator(train_eval_iter, models, converter, device=config.train.gpu, eval_func=updater.forward) 73 | trainer.extend(ext, name='train', trigger=trigger_log) 74 | 75 | trainer.extend(extensions.dump_graph('predictor/loss')) 76 | 77 | ext = extensions.snapshot_object(predictor, filename='predictor_{.updater.iteration}.npz') 78 | trainer.extend(ext, trigger=trigger_snapshot) 79 | 80 | trainer.extend(extensions.LogReport(trigger=trigger_log)) 81 | 82 | trainer.run() 83 | -------------------------------------------------------------------------------- /train_sr.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from functools import partial 3 | from pathlib import Path 4 | 5 | from chainer import cuda 6 | from chainer import optimizers 7 | from chainer import training 8 | from chainer.dataset import convert 9 | from chainer.iterators import MultiprocessIterator 10 | from chainer.training import extensions 11 | 12 | from become_yukarin.config.sr_config import create_from_json 13 | from become_yukarin.dataset import create_sr as create_sr_dataset 14 | from become_yukarin.model.sr_model import create_sr as create_sr_model 15 | from become_yukarin.updater.sr_updater import SRUpdater 16 | 17 | if __name__ == '__main__': 18 | parser = argparse.ArgumentParser() 19 | parser.add_argument('config_json_path', type=Path) 20 | parser.add_argument('output', type=Path) 21 | arguments = parser.parse_args() 22 | 23 | config = create_from_json(arguments.config_json_path) 24 | arguments.output.mkdir(exist_ok=True) 25 | config.save_as_json((arguments.output / 'config.json').absolute()) 26 | 27 | # model 28 | if config.train.gpu >= 0: 29 | cuda.get_device_from_id(config.train.gpu).use() 30 | predictor, discriminator = create_sr_model(config.model) 31 | models = { 32 | 'predictor': predictor, 33 | 'discriminator': discriminator, 34 | } 35 | 36 | # dataset 37 | dataset = create_sr_dataset(config.dataset) 
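# note: create_sr_dataset returns a dict with 'train', 'test' and 'train_eval' splits; each split is wrapped in a MultiprocessIterator below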
38 | train_iter = MultiprocessIterator(dataset['train'], config.train.batchsize) 39 | test_iter = MultiprocessIterator(dataset['test'], config.train.batchsize, repeat=False, shuffle=False) 40 | train_eval_iter = MultiprocessIterator(dataset['train_eval'], config.train.batchsize, repeat=False, shuffle=False) 41 | 42 | 43 | # optimizer 44 | def create_optimizer(model): 45 | optimizer = optimizers.Adam(alpha=0.0002, beta1=0.5, beta2=0.999) 46 | optimizer.setup(model) 47 | return optimizer 48 | 49 | 50 | opts = {key: create_optimizer(model) for key, model in models.items()} 51 | 52 | # updater 53 | converter = partial(convert.concat_examples, padding=0) 54 | updater = SRUpdater( 55 | loss_config=config.loss, 56 | predictor=predictor, 57 | discriminator=discriminator, 58 | device=config.train.gpu, 59 | iterator=train_iter, 60 | optimizer=opts, 61 | converter=converter, 62 | ) 63 | 64 | # trainer 65 | trigger_log = (config.train.log_iteration, 'iteration') 66 | trigger_snapshot = (config.train.snapshot_iteration, 'iteration') 67 | 68 | trainer = training.Trainer(updater, out=arguments.output) 69 | 70 | ext = extensions.Evaluator(test_iter, models, converter, device=config.train.gpu, eval_func=updater.forward) 71 | trainer.extend(ext, name='test', trigger=trigger_log) 72 | ext = extensions.Evaluator(train_eval_iter, models, converter, device=config.train.gpu, eval_func=updater.forward) 73 | trainer.extend(ext, name='train', trigger=trigger_log) 74 | 75 | trainer.extend(extensions.dump_graph('predictor/loss')) 76 | 77 | ext = extensions.snapshot_object(predictor, filename='predictor_{.updater.iteration}.npz') 78 | trainer.extend(ext, trigger=trigger_snapshot) 79 | 80 | trainer.extend(extensions.LogReport(trigger=trigger_log)) 81 | trainer.extend(extensions.PrintReport(['predictor/loss'])) 82 | 83 | trainer.run() 84 | -------------------------------------------------------------------------------- /become_yukarin/updater/updater.py: -------------------------------------------------------------------------------- 1 | import chainer 2 | import chainer.functions as F 3 | 4 | from become_yukarin.config.config import LossConfig 5 | from become_yukarin.model.model import Discriminator 6 | from become_yukarin.model.model import Predictor 7 | 8 | 9 | class Updater(chainer.training.StandardUpdater): 10 | def __init__( 11 | self, 12 | loss_config: LossConfig, 13 | predictor: Predictor, 14 | discriminator: Discriminator, 15 | *args, 16 | **kwargs, 17 | ) -> None: 18 | super().__init__(*args, **kwargs) 19 | self.loss_config = loss_config 20 | self.predictor = predictor 21 | self.discriminator = discriminator 22 | 23 | def _loss_predictor(self, predictor, output, target, d_fake): 24 | b, _, t = d_fake.data.shape 25 | 26 | loss_mse = (F.mean_absolute_error(output, target)) 27 | chainer.report({'mse': loss_mse}, predictor) 28 | 29 | loss_adv = F.sum(F.softplus(-d_fake)) / (b * t) 30 | chainer.report({'adversarial': loss_adv}, predictor) 31 | 32 | loss = self.loss_config.mse * loss_mse + self.loss_config.adversarial * loss_adv 33 | chainer.report({'loss': loss}, predictor) 34 | return loss 35 | 36 | def _loss_discriminator(self, discriminator, d_real, d_fake): 37 | b, _, t = d_real.data.shape 38 | 39 | loss_real = F.sum(F.softplus(-d_real)) / (b * t) 40 | chainer.report({'real': loss_real}, discriminator) 41 | 42 | loss_fake = F.sum(F.softplus(d_fake)) / (b * t) 43 | chainer.report({'fake': loss_fake}, discriminator) 44 | 45 | loss = loss_real + loss_fake 46 | chainer.report({'loss': loss}, discriminator) 47 | 48 
| tp = (d_real.data > 0.5).sum() 49 | fp = (d_fake.data > 0.5).sum() 50 | fn = (d_real.data <= 0.5).sum() 51 | tn = (d_fake.data <= 0.5).sum() 52 | accuracy = (tp + tn) / (tp + fp + fn + tn) 53 | precision = tp / (tp + fp) 54 | recall = tp / (tp + fn) 55 | chainer.report({'accuracy': accuracy}, self.discriminator) 56 | chainer.report({'precision': precision}, self.discriminator) 57 | chainer.report({'recall': recall}, self.discriminator) 58 | return loss 59 | 60 | def forward(self, input, target, mask): 61 | input = chainer.as_variable(input) 62 | target = chainer.as_variable(target) 63 | mask = chainer.as_variable(mask) 64 | 65 | output = self.predictor(input) 66 | output = output * mask 67 | target = target * mask 68 | 69 | d_fake = self.discriminator(input, output) 70 | d_real = self.discriminator(input, target) 71 | 72 | loss = { 73 | 'predictor': self._loss_predictor(self.predictor, output, target, d_fake), 74 | 'discriminator': self._loss_discriminator(self.discriminator, d_real, d_fake), 75 | } 76 | return loss 77 | 78 | def update_core(self): 79 | opt_predictor = self.get_optimizer('predictor') 80 | opt_discriminator = self.get_optimizer('discriminator') 81 | 82 | batch = self.get_iterator('main').next() 83 | batch = self.converter(batch, self.device) 84 | loss = self.forward(**batch) 85 | 86 | opt_predictor.update(loss.get, 'predictor') 87 | opt_discriminator.update(loss.get, 'discriminator') 88 | -------------------------------------------------------------------------------- /scripts/super_resolution_test.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import glob 3 | import multiprocessing 4 | import re 5 | from functools import partial 6 | from pathlib import Path 7 | 8 | import librosa 9 | import numpy 10 | 11 | from become_yukarin import SuperResolution 12 | from become_yukarin.config.sr_config import create_from_json as create_config 13 | from become_yukarin.dataset.dataset import AcousticFeatureProcess 14 | from become_yukarin.dataset.dataset import WaveFileLoadProcess 15 | 16 | parser = argparse.ArgumentParser() 17 | parser.add_argument('model_names', nargs='+') 18 | parser.add_argument('-md', '--model_directory', type=Path, default=Path('/mnt/dwango/hiroshiba/become-yukarin/')) 19 | parser.add_argument('-iwd', '--input_wave_directory', type=Path, 20 | default=Path('/mnt/dwango/hiroshiba/become-yukarin/dataset/yukari-wave/yukari-news/')) 21 | parser.add_argument('-g', '--gpu', type=int) 22 | args = parser.parse_args() 23 | 24 | model_directory = args.model_directory # type: Path 25 | input_wave_directory = args.input_wave_directory # type: Path 26 | gpu = args.gpu 27 | 28 | paths_test = list(Path('./test_data_sr/').glob('*.wav')) 29 | 30 | 31 | def extract_number(f): 32 | s = re.findall("\d+", str(f)) 33 | return int(s[-1]) if s else -1 34 | 35 | 36 | def process(p: Path, super_resolution: SuperResolution): 37 | param = config.dataset.param 38 | wave_process = WaveFileLoadProcess( 39 | sample_rate=param.voice_param.sample_rate, 40 | top_db=None, 41 | ) 42 | acoustic_feature_process = AcousticFeatureProcess( 43 | frame_period=param.acoustic_feature_param.frame_period, 44 | order=param.acoustic_feature_param.order, 45 | alpha=param.acoustic_feature_param.alpha, 46 | f0_estimating_method=param.acoustic_feature_param.f0_estimating_method, 47 | ) 48 | 49 | try: 50 | if p.suffix in ['.npy', '.npz']: 51 | p = glob.glob(str(input_wave_directory / p.stem) + '.*')[0] 52 | p = Path(p) 53 | input = 
acoustic_feature_process(wave_process(str(p))) 54 | wave = super_resolution(input.spectrogram, acoustic_feature=input, sampling_rate=param.voice_param.sample_rate) 55 | librosa.output.write_wav(str(output / p.stem) + '.wav', wave.wave, wave.sampling_rate, norm=True) 56 | except: 57 | import traceback 58 | print('error!', str(p)) 59 | print(traceback.format_exc()) 60 | 61 | 62 | for model_name in args.model_names: 63 | base_model = model_directory / model_name 64 | config = create_config(base_model / 'config.json') 65 | 66 | input_paths = list(sorted([Path(p) for p in glob.glob(str(config.dataset.input_glob))])) 67 | numpy.random.RandomState(config.dataset.seed).shuffle(input_paths) 68 | path_train = input_paths[0] 69 | path_test = input_paths[-1] 70 | 71 | model_paths = base_model.glob('predictor*.npz') 72 | model_path = list(sorted(model_paths, key=extract_number))[-1] 73 | print(model_path) 74 | super_resolution = SuperResolution(config, model_path, gpu=gpu) 75 | 76 | output = Path('./output').absolute() / base_model.name 77 | output.mkdir(exist_ok=True) 78 | 79 | paths = [path_train, path_test] + paths_test 80 | 81 | process_partial = partial(process, super_resolution=super_resolution) 82 | if gpu is None: 83 | pool = multiprocessing.Pool() 84 | pool.map(process_partial, paths) 85 | else: 86 | list(map(process_partial, paths)) 87 | -------------------------------------------------------------------------------- /scripts/launch.py: -------------------------------------------------------------------------------- 1 | """ 2 | launcher for some task that have diff params 3 | """ 4 | 5 | import argparse 6 | import copy 7 | import datetime 8 | import hashlib 9 | import json 10 | import subprocess 11 | import time 12 | from pathlib import Path 13 | 14 | base_command_default = \ 15 | "screen -d -m -S {project/name}_gpu{train/gpu} ;" + \ 16 | "screen -S {project/name}_gpu{train/gpu} -X stuff 'python3 {python_file_path} {recipe_path} {output}\n'" 17 | 18 | parser = argparse.ArgumentParser() 19 | parser.add_argument('output_dir', type=Path) 20 | parser.add_argument('--python_file_path', default='train.py') 21 | parser.add_argument('--recipe_json_path', default='recipe/recipe.json') 22 | parser.add_argument('--base_config_json_path', default='recipe/config.json') 23 | parser.add_argument('--base_command', default=base_command_default) 24 | args = parser.parse_args() 25 | 26 | recipe = json.load(open(args.recipe_json_path, encoding='utf-8')) 27 | recipe_each = recipe['each'] 28 | recipe_all = recipe['all'] 29 | base_config = json.load(open(args.base_config_json_path, encoding='utf-8')) 30 | 31 | 32 | def put_config_value(config, recipe_key, value): 33 | key_tree = recipe_key.split('/') 34 | target = config 35 | for key in key_tree[:-1]: 36 | target = target[key] 37 | 38 | target[key_tree[-1]] = value 39 | 40 | 41 | def _replace_name(dist): 42 | _format = {} 43 | now = datetime.datetime.now() 44 | 45 | if '{date}' in dist['project']['name']: 46 | _format['date'] = now.strftime('%Y%m%d%H%M%S') 47 | if '{hash}' in dist['project']['name']: 48 | _format['hash'] = hashlib.md5(bytes(str(now), 'utf')).hexdigest()[:6] 49 | 50 | if len(_format) > 0: 51 | dist['project']['name'] = dist['project']['name'].format(**_format) 52 | 53 | 54 | num_task = min(len(list(value)) for value in recipe_each.values()) 55 | command_list = [] 56 | 57 | for i in range(num_task): 58 | config = copy.deepcopy(base_config) 59 | 60 | for recipe_key in recipe_all.keys(): 61 | put_config_value(config, recipe_key, recipe_all[recipe_key]) 
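# values under recipe['all'] are shared by every generated task; values under recipe['each'] (applied next) are indexed by the task number i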
62 | 63 | for recipe_key in recipe_each.keys(): 64 | put_config_value(config, recipe_key, recipe_each[recipe_key][i]) 65 | 66 | _replace_name(config) 67 | 68 | # add git branch name 69 | git_branch = subprocess.check_output('git rev-parse --abbrev-ref HEAD', shell=True).decode("utf-8").strip() 70 | config['project']['tags'].append('git branch name:' + git_branch) 71 | 72 | made_recipe_path = "{}.{}.json".format(datetime.datetime.now().strftime('%Y%m%d%H%M%S'), i) 73 | with open(made_recipe_path, 'w', encoding='utf') as f: 74 | json.dump(config, f, indent=2, sort_keys=True, ensure_ascii=False) 75 | 76 | 77 | def make_key_chain(key_chain, value, dist): 78 | if not isinstance(value, dict): 79 | dist['/'.join(key_chain)] = value 80 | else: 81 | for key in value.keys(): 82 | make_key_chain(key_chain + [key], value[key], dist) 83 | 84 | 85 | dist = {} 86 | make_key_chain([], config, dist) 87 | 88 | dist['output'] = args.output_dir / config['project']['name'] 89 | dist['python_file_path'] = args.python_file_path 90 | dist['recipe_path'] = made_recipe_path 91 | 92 | command = args.base_command.format(**dist) 93 | command_list += [command] 94 | 95 | print(config['project']['name']) 96 | 97 | for command in command_list: 98 | time.sleep(1) 99 | subprocess.check_output(command, shell=True) 100 | -------------------------------------------------------------------------------- /become_yukarin/config/sr_config.py: -------------------------------------------------------------------------------- 1 | import json 2 | from pathlib import Path 3 | from typing import Dict 4 | from typing import List 5 | from typing import NamedTuple 6 | from typing import Union 7 | 8 | from become_yukarin.param import Param 9 | 10 | 11 | class SRDatasetConfig(NamedTuple): 12 | param: Param 13 | input_glob: Path 14 | train_crop_size: int 15 | input_global_noise: float 16 | input_local_noise: float 17 | blur_size_factor: float 18 | seed: int 19 | num_test: int 20 | 21 | 22 | class SRModelConfig(NamedTuple): 23 | generator_base_channels: int 24 | generator_extensive_layers: int 25 | discriminator_base_channels: int 26 | discriminator_extensive_layers: int 27 | 28 | 29 | class SRLossConfig(NamedTuple): 30 | mse: float 31 | adversarial: float 32 | 33 | 34 | class SRTrainConfig(NamedTuple): 35 | batchsize: int 36 | gpu: int 37 | log_iteration: int 38 | snapshot_iteration: int 39 | 40 | 41 | class SRProjectConfig(NamedTuple): 42 | name: str 43 | tags: List[str] 44 | 45 | 46 | class SRConfig(NamedTuple): 47 | dataset: SRDatasetConfig 48 | model: SRModelConfig 49 | loss: SRLossConfig 50 | train: SRTrainConfig 51 | project: SRProjectConfig 52 | 53 | def save_as_json(self, path): 54 | d = _namedtuple_to_dict(self) 55 | json.dump(d, open(path, 'w'), indent=2, sort_keys=True, default=_default_path) 56 | 57 | 58 | def _default_path(o): 59 | if isinstance(o, Path): 60 | return str(o) 61 | raise TypeError(repr(o) + " is not JSON serializable") 62 | 63 | 64 | def _namedtuple_to_dict(o: NamedTuple): 65 | return { 66 | k: v if not hasattr(v, '_asdict') else _namedtuple_to_dict(v) 67 | for k, v in o._asdict().items() 68 | } 69 | 70 | 71 | def create_from_json(s: Union[str, Path]): 72 | try: 73 | d = json.loads(s) 74 | except TypeError: 75 | d = json.load(open(s)) 76 | 77 | backward_compatible(d) 78 | 79 | return SRConfig( 80 | dataset=SRDatasetConfig( 81 | param=Param(), 82 | input_glob=Path(d['dataset']['input_glob']), 83 | train_crop_size=d['dataset']['train_crop_size'], 84 | input_global_noise=d['dataset']['input_global_noise'], 85 | 
input_local_noise=d['dataset']['input_local_noise'], 86 | blur_size_factor=d['dataset']['blur_size_factor'], 87 | seed=d['dataset']['seed'], 88 | num_test=d['dataset']['num_test'], 89 | ), 90 | model=SRModelConfig( 91 | generator_base_channels=d['model']['generator_base_channels'], 92 | generator_extensive_layers=d['model']['generator_extensive_layers'], 93 | discriminator_base_channels=d['model']['discriminator_base_channels'], 94 | discriminator_extensive_layers=d['model']['discriminator_extensive_layers'], 95 | ), 96 | loss=SRLossConfig( 97 | mse=d['loss']['mse'], 98 | adversarial=d['loss']['adversarial'], 99 | ), 100 | train=SRTrainConfig( 101 | batchsize=d['train']['batchsize'], 102 | gpu=d['train']['gpu'], 103 | log_iteration=d['train']['log_iteration'], 104 | snapshot_iteration=d['train']['snapshot_iteration'], 105 | ), 106 | project=SRProjectConfig( 107 | name=d['project']['name'], 108 | tags=d['project']['tags'], 109 | ) 110 | ) 111 | 112 | 113 | def backward_compatible(d: Dict): 114 | if 'blur_size_factor' not in d['dataset']: 115 | d['dataset']['blur_size_factor'] = 0 116 | 117 | if 'generator_base_channels' not in d['model']: 118 | d['model']['generator_base_channels'] = 64 119 | d['model']['generator_extensive_layers'] = 8 120 | d['model']['discriminator_base_channels'] = 32 121 | d['model']['discriminator_extensive_layers'] = 5 122 | -------------------------------------------------------------------------------- /become_yukarin/dataset/utility.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | import fastdtw 4 | import numpy 5 | 6 | _logdb_const = 10.0 / numpy.log(10.0) * numpy.sqrt(2.0) 7 | 8 | 9 | # should work on torch and numpy arrays 10 | def _sqrt(x): 11 | isnumpy = isinstance(x, numpy.ndarray) 12 | isscalar = numpy.isscalar(x) 13 | return numpy.sqrt(x) if isnumpy else math.sqrt(x) if isscalar else x.sqrt() 14 | 15 | 16 | def _exp(x): 17 | isnumpy = isinstance(x, numpy.ndarray) 18 | isscalar = numpy.isscalar(x) 19 | return numpy.exp(x) if isnumpy else math.exp(x) if isscalar else x.exp() 20 | 21 | 22 | def _sum(x): 23 | if isinstance(x, list) or isinstance(x, numpy.ndarray): 24 | return numpy.sum(x) 25 | return float(x.sum()) 26 | 27 | 28 | def melcd(X, Y, lengths=None): 29 | """Mel-cepstrum distortion (MCD). 30 | 31 | The function computes MCD for time-aligned mel-cepstrum sequences. 32 | 33 | Args: 34 | X (ndarray): Input mel-cepstrum, shape can be either of 35 | (``D``,), (``T x D``) or (``B x T x D``). Both Numpy and torch arrays 36 | are supported. 37 | Y (ndarray): Target mel-cepstrum, shape can be either of 38 | (``D``,), (``T x D``) or (``B x T x D``). Both Numpy and torch arrays 39 | are supported. 40 | lengths (list): Lengths of padded inputs. This should only be specified 41 | if you give mini-batch inputs. 42 | 43 | Returns: 44 | float: Mean mel-cepstrum distortion in dB. 45 | 46 | .. note:: 47 | 48 | The function doesn't check if inputs are actually mel-cepstrum. 49 | """ 50 | # summing against feature axis, and then take mean against time axis 51 | # Eq. (1a) 52 | # https://www.cs.cmu.edu/~awb/papers/sltu2008/kominek_black.sltu_2008.pdf 53 | if lengths is None: 54 | z = X - Y 55 | r = _sqrt((z * z).sum(-1)) 56 | if not numpy.isscalar(r): 57 | r = r.mean() 58 | return _logdb_const * r 59 | 60 | # Case for 1-dim features. 
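# (with lengths given, a 2-dim X/Y is (batch, time) with scalar frames; the reshape below adds a trailing feature axis so the per-frame norm in the loop works)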
61 | if len(X.shape) == 2: 62 | # Add feature axis 63 | X, Y = X[:, :, None], Y[:, :, None] 64 | 65 | s = 0.0 66 | T = _sum(lengths) 67 | for x, y, length in zip(X, Y, lengths): 68 | x, y = x[:length], y[:length] 69 | z = x - y 70 | s += _sqrt((z * z).sum(-1)).sum() 71 | 72 | return _logdb_const * s / T 73 | 74 | 75 | class DTWAligner(object): 76 | """ 77 | from https://github.com/r9y9/nnmnkwii/blob/4cade86b5c35b4e35615a2a8162ddc638018af0e/nnmnkwii/preprocessing/alignment.py#L14 78 | """ 79 | 80 | def __init__(self, x, y, dist=lambda x, y: numpy.linalg.norm(x - y), radius=1) -> None: 81 | assert x.ndim == 2 and y.ndim == 2 82 | 83 | _, path = fastdtw.fastdtw(x, y, radius=radius, dist=dist) 84 | path = numpy.array(path) 85 | self.normed_path_x = path[:, 0] / len(x) 86 | self.normed_path_y = path[:, 1] / len(y) 87 | 88 | def align_x(self, x): 89 | path = self._interp_path(self.normed_path_x, len(x)) 90 | return x[path] 91 | 92 | def align_y(self, y): 93 | path = self._interp_path(self.normed_path_y, len(y)) 94 | return y[path] 95 | 96 | def align(self, x, y): 97 | return self.align_x(x), self.align_y(y) 98 | 99 | @staticmethod 100 | def align_and_transform(x, y, *args, **kwargs): 101 | aligner = DTWAligner(*args, x=x, y=y, **kwargs) 102 | return aligner.align(x, y) 103 | 104 | @staticmethod 105 | def _interp_path(normed_path: numpy.ndarray, target_length: int): 106 | path = numpy.floor(normed_path * target_length).astype(numpy.int) 107 | return path 108 | 109 | 110 | class MelCepstrumAligner(DTWAligner): 111 | def __init__(self, x, y, *args, **kwargs) -> None: 112 | x = self._calc_aligner_feature(x) 113 | y = self._calc_aligner_feature(y) 114 | kwargs.update(dist=melcd) 115 | super().__init__(x, y, *args, **kwargs) 116 | 117 | @classmethod 118 | def _calc_delta(cls, x): 119 | d = numpy.zeros_like(x, x.dtype)  # use a separate output array; overwriting x here would make the delta all zeros 120 | d[:-1] = x[1:] - x[:-1] 121 | d[-1] = 0 122 | return d 123 | 124 | @classmethod 125 | def _calc_aligner_feature(cls, x): 126 | d = cls._calc_delta(x) 127 | feature = numpy.concatenate((x, d), axis=1)[:, 1:] 128 | return feature 129 | -------------------------------------------------------------------------------- /become_yukarin/vocoder.py: -------------------------------------------------------------------------------- 1 | import numpy 2 | import pyworld 3 | 4 | from become_yukarin.data_struct import AcousticFeature 5 | from become_yukarin.data_struct import Wave 6 | from become_yukarin.dataset.dataset import AcousticFeatureProcess 7 | from become_yukarin.param import AcousticFeatureParam 8 | 9 | 10 | class Vocoder(object): 11 | def __init__( 12 | self, 13 | acoustic_feature_param: AcousticFeatureParam, 14 | out_sampling_rate: int, 15 | ): 16 | self.acoustic_feature_param = acoustic_feature_param 17 | self.out_sampling_rate = out_sampling_rate 18 | self._encoder = AcousticFeatureProcess( 19 | frame_period=acoustic_feature_param.frame_period, 20 | order=acoustic_feature_param.order, 21 | alpha=acoustic_feature_param.alpha, 22 | f0_estimating_method=acoustic_feature_param.f0_estimating_method, 23 | ) 24 | 25 | def encode(self, wave: Wave): 26 | return self._encoder(wave) 27 | 28 | def decode( 29 | self, 30 | acoustic_feature: AcousticFeature, 31 | ): 32 | acoustic_feature = acoustic_feature.astype_only_float(numpy.float64) 33 | out = pyworld.synthesize( 34 | f0=acoustic_feature.f0.ravel(), 35 | spectrogram=acoustic_feature.spectrogram, 36 | aperiodicity=acoustic_feature.aperiodicity, 37 | fs=self.out_sampling_rate, 38 | frame_period=self.acoustic_feature_param.frame_period 39 | ) 40 
| return Wave(out, sampling_rate=self.out_sampling_rate) 41 | 42 | 43 | class RealtimeVocoder(Vocoder): 44 | def __init__( 45 | self, 46 | acoustic_feature_param: AcousticFeatureParam, 47 | out_sampling_rate: int, 48 | buffer_size: int, 49 | number_of_pointers: int, 50 | ): 51 | from world4py.native import structures, apidefinitions 52 | super().__init__( 53 | acoustic_feature_param=acoustic_feature_param, 54 | out_sampling_rate=out_sampling_rate, 55 | ) 56 | 57 | self.buffer_size = buffer_size 58 | 59 | self._synthesizer = structures.WorldSynthesizer() 60 | apidefinitions._InitializeSynthesizer( 61 | self.out_sampling_rate, # sampling rate 62 | self.acoustic_feature_param.frame_period, # frame period 63 | pyworld.get_cheaptrick_fft_size(out_sampling_rate), # fft size 64 | buffer_size, # buffer size 65 | number_of_pointers, # number of pointers 66 | self._synthesizer, 67 | ) 68 | self._before_buffer = [] # for holding memory 69 | 70 | def decode( 71 | self, 72 | acoustic_feature: AcousticFeature, 73 | ): 74 | from world4py.native import apidefinitions, utils 75 | length = len(acoustic_feature.f0) 76 | f0_buffer = utils.cast_1d_list_to_1d_pointer(acoustic_feature.f0.flatten().tolist()) 77 | sp_buffer = utils.cast_2d_list_to_2d_pointer(acoustic_feature.spectrogram.tolist()) 78 | ap_buffer = utils.cast_2d_list_to_2d_pointer(acoustic_feature.aperiodicity.tolist()) 79 | apidefinitions._AddParameters(f0_buffer, length, sp_buffer, ap_buffer, self._synthesizer) 80 | 81 | ys = [] 82 | while apidefinitions._Synthesis2(self._synthesizer) != 0: 83 | y = numpy.array([self._synthesizer.buffer[i] for i in range(self.buffer_size)]) 84 | ys.append(y) 85 | 86 | if len(ys) > 0: 87 | out_wave = Wave( 88 | wave=numpy.concatenate(ys), 89 | sampling_rate=self.out_sampling_rate, 90 | ) 91 | else: 92 | out_wave = Wave( 93 | wave=numpy.empty(0), 94 | sampling_rate=self.out_sampling_rate, 95 | ) 96 | 97 | self._before_buffer.append((f0_buffer, sp_buffer, ap_buffer)) # for holding memory 98 | if len(self._before_buffer) > 16: 99 | self._before_buffer.pop(0) 100 | return out_wave 101 | 102 | def warm_up(self, time_length: float): 103 | y = numpy.zeros(int(time_length * self.out_sampling_rate)) 104 | w = Wave(wave=y, sampling_rate=self.out_sampling_rate) 105 | f = self.encode(w) 106 | self.decode(f) 107 | 108 | def __del__(self): 109 | from world4py.native import apidefinitions 110 | if hasattr(self, '_synthesizer'): 111 | apidefinitions._DestroySynthesizer(self._synthesizer) 112 | -------------------------------------------------------------------------------- /become_yukarin/data_struct.py: -------------------------------------------------------------------------------- 1 | from typing import NamedTuple, Dict, List 2 | 3 | import numpy 4 | import pyworld 5 | 6 | _min_mc = -18.3 7 | 8 | 9 | class Wave(NamedTuple): 10 | wave: numpy.ndarray 11 | sampling_rate: int 12 | 13 | 14 | class AcousticFeature(NamedTuple): 15 | f0: numpy.ndarray = numpy.nan 16 | spectrogram: numpy.ndarray = numpy.nan 17 | aperiodicity: numpy.ndarray = numpy.nan 18 | mfcc: numpy.ndarray = numpy.nan 19 | voiced: numpy.ndarray = numpy.nan 20 | 21 | @staticmethod 22 | def dtypes(): 23 | return dict( 24 | f0=numpy.float32, 25 | spectrogram=numpy.float32, 26 | aperiodicity=numpy.float32, 27 | mfcc=numpy.float32, 28 | voiced=numpy.bool, 29 | ) 30 | 31 | def astype(self, dtype): 32 | return AcousticFeature( 33 | f0=self.f0.astype(dtype), 34 | spectrogram=self.spectrogram.astype(dtype), 35 | aperiodicity=self.aperiodicity.astype(dtype), 36 | 
mfcc=self.mfcc.astype(dtype), 37 | voiced=self.voiced.astype(dtype), 38 | ) 39 | 40 | def astype_only_float(self, dtype): 41 | return AcousticFeature( 42 | f0=self.f0.astype(dtype), 43 | spectrogram=self.spectrogram.astype(dtype), 44 | aperiodicity=self.aperiodicity.astype(dtype), 45 | mfcc=self.mfcc.astype(dtype), 46 | voiced=self.voiced, 47 | ) 48 | 49 | def validate(self): 50 | assert self.f0.ndim == 2 51 | assert self.spectrogram.ndim == 2 52 | assert self.aperiodicity.ndim == 2 53 | assert self.mfcc.ndim == 2 54 | assert self.voiced.ndim == 2 55 | 56 | len_time = len(self.f0) 57 | assert len(self.spectrogram) == len_time 58 | assert len(self.aperiodicity) == len_time 59 | assert len(self.mfcc) == len_time 60 | assert len(self.voiced) == len_time 61 | 62 | assert self.voiced.dtype == numpy.bool 63 | 64 | @staticmethod 65 | def silent(length: int, sizes: Dict[str, int], keys: List[str]): 66 | d = {} 67 | if 'f0' in keys: 68 | d['f0'] = numpy.zeros((length, sizes['f0']), dtype=AcousticFeature.dtypes()['f0']) 69 | if 'spectrogram' in keys: 70 | d['spectrogram'] = numpy.zeros((length, sizes['spectrogram']), 71 | dtype=AcousticFeature.dtypes()['spectrogram']) 72 | if 'aperiodicity' in keys: 73 | d['aperiodicity'] = numpy.zeros((length, sizes['aperiodicity']), 74 | dtype=AcousticFeature.dtypes()['aperiodicity']) 75 | if 'mfcc' in keys: 76 | d['mfcc'] = numpy.hstack(( 77 | numpy.ones((length, 1), dtype=AcousticFeature.dtypes()['mfcc']) * _min_mc, 78 | numpy.zeros((length, sizes['mfcc'] - 1), dtype=AcousticFeature.dtypes()['mfcc']) 79 | )) 80 | if 'voiced' in keys: 81 | d['voiced'] = numpy.zeros((length, sizes['voiced']), dtype=AcousticFeature.dtypes()['voiced']) 82 | feature = AcousticFeature(**d) 83 | return feature 84 | 85 | @staticmethod 86 | def concatenate(fs: List['AcousticFeature'], keys: List[str]): 87 | is_target = lambda a: not numpy.any(numpy.isnan(a)) 88 | return AcousticFeature(**{ 89 | key: numpy.concatenate([getattr(f, key) for f in fs]) if is_target(getattr(fs[0], key)) else numpy.nan 90 | for key in keys 91 | }) 92 | 93 | def pick(self, first: int, last: int): 94 | is_target = lambda a: not numpy.any(numpy.isnan(a)) 95 | return AcousticFeature( 96 | f0=self.f0[first:last] if is_target(self.f0) else numpy.nan, 97 | spectrogram=self.spectrogram[first:last] if is_target(self.spectrogram) else numpy.nan, 98 | aperiodicity=self.aperiodicity[first:last] if is_target(self.aperiodicity) else numpy.nan, 99 | mfcc=self.mfcc[first:last] if is_target(self.mfcc) else numpy.nan, 100 | voiced=self.voiced[first:last] if is_target(self.voiced) else numpy.nan, 101 | ) 102 | 103 | @staticmethod 104 | def get_sizes(sampling_rate: int, order: int): 105 | fft_size = pyworld.get_cheaptrick_fft_size(fs=sampling_rate) 106 | return dict( 107 | f0=1, 108 | spectrogram=fft_size // 2 + 1, 109 | aperiodicity=fft_size // 2 + 1, 110 | mfcc=order + 1, 111 | voiced=1, 112 | ) 113 | 114 | 115 | class LowHighSpectrogramFeature(NamedTuple): 116 | low: numpy.ndarray 117 | high: numpy.ndarray 118 | 119 | def validate(self): 120 | assert self.low.ndim == 2 121 | assert self.high.ndim == 2 122 | assert self.low.shape == self.high.shape 123 | -------------------------------------------------------------------------------- /become_yukarin/config/config.py: -------------------------------------------------------------------------------- 1 | import json 2 | from pathlib import Path 3 | from typing import Dict 4 | from typing import List 5 | from typing import NamedTuple 6 | from typing import Optional 7 | from typing 
import Union 8 | 9 | from become_yukarin.param import Param 10 | 11 | 12 | class DatasetConfig(NamedTuple): 13 | param: Param 14 | input_glob: Path 15 | target_glob: Path 16 | input_mean_path: Path 17 | input_var_path: Path 18 | target_mean_path: Path 19 | target_var_path: Path 20 | features: List[str] 21 | train_crop_size: int 22 | input_global_noise: float 23 | input_local_noise: float 24 | target_global_noise: float 25 | target_local_noise: float 26 | seed: int 27 | num_test: int 28 | 29 | 30 | class ModelConfig(NamedTuple): 31 | in_channels: int 32 | out_channels: int 33 | generator_base_channels: int 34 | generator_extensive_layers: int 35 | discriminator_base_channels: int 36 | discriminator_extensive_layers: int 37 | weak_discriminator: bool 38 | 39 | 40 | class LossConfig(NamedTuple): 41 | mse: float 42 | adversarial: float 43 | 44 | 45 | class TrainConfig(NamedTuple): 46 | batchsize: int 47 | gpu: int 48 | log_iteration: int 49 | snapshot_iteration: int 50 | 51 | 52 | class ProjectConfig(NamedTuple): 53 | name: str 54 | tags: List[str] 55 | 56 | 57 | class Config(NamedTuple): 58 | dataset: DatasetConfig 59 | model: ModelConfig 60 | loss: LossConfig 61 | train: TrainConfig 62 | project: ProjectConfig 63 | 64 | def save_as_json(self, path): 65 | d = _namedtuple_to_dict(self) 66 | json.dump(d, open(path, 'w'), indent=2, sort_keys=True, default=_default_path) 67 | 68 | 69 | def _default_path(o): 70 | if isinstance(o, Path): 71 | return str(o) 72 | raise TypeError(repr(o) + " is not JSON serializable") 73 | 74 | 75 | def _namedtuple_to_dict(o: NamedTuple): 76 | return { 77 | k: v if not hasattr(v, '_asdict') else _namedtuple_to_dict(v) 78 | for k, v in o._asdict().items() 79 | } 80 | 81 | 82 | def create_from_json(s: Union[str, Path]): 83 | try: 84 | d = json.loads(s) 85 | except TypeError: 86 | d = json.load(open(s)) 87 | 88 | backward_compatible(d) 89 | 90 | return Config( 91 | dataset=DatasetConfig( 92 | param=Param(), 93 | input_glob=Path(d['dataset']['input_glob']), 94 | target_glob=Path(d['dataset']['target_glob']), 95 | input_mean_path=Path(d['dataset']['input_mean_path']), 96 | input_var_path=Path(d['dataset']['input_var_path']), 97 | target_mean_path=Path(d['dataset']['target_mean_path']), 98 | target_var_path=Path(d['dataset']['target_var_path']), 99 | features=d['dataset']['features'], 100 | train_crop_size=d['dataset']['train_crop_size'], 101 | input_global_noise=d['dataset']['input_global_noise'], 102 | input_local_noise=d['dataset']['input_local_noise'], 103 | target_global_noise=d['dataset']['target_global_noise'], 104 | target_local_noise=d['dataset']['target_local_noise'], 105 | seed=d['dataset']['seed'], 106 | num_test=d['dataset']['num_test'], 107 | ), 108 | model=ModelConfig( 109 | in_channels=d['model']['in_channels'], 110 | out_channels=d['model']['out_channels'], 111 | generator_base_channels=d['model']['generator_base_channels'], 112 | generator_extensive_layers=d['model']['generator_extensive_layers'], 113 | discriminator_base_channels=d['model']['discriminator_base_channels'], 114 | discriminator_extensive_layers=d['model']['discriminator_extensive_layers'], 115 | weak_discriminator=d['model']['weak_discriminator'], 116 | ), 117 | loss=LossConfig( 118 | mse=d['loss']['mse'], 119 | adversarial=d['loss']['adversarial'], 120 | ), 121 | train=TrainConfig( 122 | batchsize=d['train']['batchsize'], 123 | gpu=d['train']['gpu'], 124 | log_iteration=d['train']['log_iteration'], 125 | snapshot_iteration=d['train']['snapshot_iteration'], 126 | ), 127 | 
project=ProjectConfig( 128 | name=d['project']['name'], 129 | tags=d['project']['tags'], 130 | ) 131 | ) 132 | 133 | 134 | def backward_compatible(d: Dict): 135 | if 'input_global_noise' not in d['dataset']: 136 | d['dataset']['input_global_noise'] = d['dataset']['global_noise'] 137 | d['dataset']['input_local_noise'] = d['dataset']['local_noise'] 138 | 139 | if 'target_global_noise' not in d['dataset']: 140 | d['dataset']['target_global_noise'] = d['dataset']['global_noise'] 141 | d['dataset']['target_local_noise'] = d['dataset']['local_noise'] 142 | 143 | if 'generator_base_channels' not in d['model']: 144 | d['model']['generator_base_channels'] = 64 145 | d['model']['generator_extensive_layers'] = 8 146 | d['model']['discriminator_base_channels'] = 32 147 | d['model']['discriminator_extensive_layers'] = 5 148 | 149 | if 'weak_discriminator' not in d['model']: 150 | d['model']['weak_discriminator'] = False 151 | -------------------------------------------------------------------------------- /become_yukarin/super_resolution.py: -------------------------------------------------------------------------------- 1 | from functools import partial 2 | from pathlib import Path 3 | from typing import List 4 | 5 | import chainer 6 | import numpy 7 | import pyworld 8 | 9 | from become_yukarin.config.sr_config import SRConfig 10 | from become_yukarin.data_struct import AcousticFeature 11 | from become_yukarin.data_struct import Wave 12 | from become_yukarin.dataset.dataset import LowHighSpectrogramFeatureLoadProcess 13 | from become_yukarin.dataset.dataset import LowHighSpectrogramFeatureProcess 14 | from become_yukarin.dataset.dataset import WaveFileLoadProcess 15 | from become_yukarin.model.sr_model import create_predictor_sr 16 | 17 | 18 | class SuperResolution(object): 19 | def __init__(self, config: SRConfig, model_path: Path, gpu: int = None) -> None: 20 | self.config = config 21 | self.model_path = model_path 22 | self.gpu = gpu 23 | 24 | self.model = model = create_predictor_sr(config.model) 25 | chainer.serializers.load_npz(str(model_path), model) 26 | if self.gpu is not None: 27 | model.to_gpu(self.gpu) 28 | 29 | self._param = param = config.dataset.param 30 | self._wave_process = WaveFileLoadProcess( 31 | sample_rate=param.voice_param.sample_rate, 32 | top_db=None, 33 | ) 34 | self._low_high_spectrogram_process = LowHighSpectrogramFeatureProcess( 35 | frame_period=param.acoustic_feature_param.frame_period, 36 | order=param.acoustic_feature_param.order, 37 | alpha=param.acoustic_feature_param.alpha, 38 | f0_estimating_method=param.acoustic_feature_param.f0_estimating_method, 39 | ) 40 | self._low_high_spectrogram_load_process = LowHighSpectrogramFeatureLoadProcess( 41 | validate=True, 42 | ) 43 | 44 | def convert(self, input: numpy.ndarray) -> numpy.ndarray: 45 | converter = partial(chainer.dataset.convert.concat_examples, device=self.gpu, padding=0) 46 | pad = 128 - len(input) % 128 47 | input = numpy.pad(input, [(0, pad), (0, 0)], mode='minimum') 48 | input = numpy.log(input)[:, :-1] 49 | input = input[numpy.newaxis] 50 | inputs = converter([input]) 51 | 52 | with chainer.using_config('train', False): 53 | out = self.model(inputs).data[0] 54 | 55 | if self.gpu is not None: 56 | out = chainer.cuda.to_cpu(out) 57 | 58 | out = out[0] 59 | out = numpy.pad(out, [(0, 0), (0, 1)], mode='edge') 60 | out = numpy.exp(out) 61 | out = out[:-pad] 62 | return out 63 | 64 | def convert_loop(self, input: numpy.ndarray, n_len: int = 512, n_wrap: int = 128): 65 | out_feature_list: List[AcousticFeature] = 
[] 66 | N = len(input) 67 | for i in numpy.arange(0, int(numpy.ceil(N / n_len))): 68 | # convert with overwrapped 69 | start = i * n_len 70 | mi = max(start - n_wrap, 0) 71 | ma = min(start + n_len + n_wrap, N) 72 | f = input[numpy.arange(mi, ma)] 73 | o_warp = self.convert(f) 74 | 75 | # eliminate overwrap 76 | ex_mi = start - mi 77 | ex_len = min(ma - start, n_len) 78 | o = o_warp[numpy.arange(ex_mi, ex_mi + ex_len)] 79 | out_feature_list.append(o) 80 | return numpy.concatenate(out_feature_list) 81 | 82 | def convert_to_feature( 83 | self, 84 | spectrogram: numpy.ndarray, 85 | acoustic_feature: AcousticFeature, 86 | ): 87 | acoustic_feature = acoustic_feature.astype_only_float(numpy.float64) 88 | f_out = AcousticFeature( 89 | f0=acoustic_feature.f0, 90 | spectrogram=spectrogram.astype(numpy.float64), 91 | aperiodicity=acoustic_feature.aperiodicity, 92 | mfcc=acoustic_feature.mfcc, 93 | voiced=acoustic_feature.voiced, 94 | ) 95 | return f_out 96 | 97 | def convert_to_audio( 98 | self, 99 | input: numpy.ndarray, 100 | acoustic_feature: AcousticFeature, 101 | sampling_rate: int, 102 | ): 103 | acoustic_feature = acoustic_feature.astype_only_float(numpy.float64) 104 | out = pyworld.synthesize( 105 | f0=acoustic_feature.f0.ravel(), 106 | spectrogram=input.astype(numpy.float64), 107 | aperiodicity=acoustic_feature.aperiodicity, 108 | fs=sampling_rate, 109 | frame_period=self._param.acoustic_feature_param.frame_period, 110 | ) 111 | return Wave(out, sampling_rate=sampling_rate) 112 | 113 | def convert_from_audio_path(self, input: Path): 114 | wave = self._wave_process(str(input), test=True) 115 | feature = self._low_high_spectrogram_process(wave, test=True) 116 | return self.convert(feature.low) 117 | 118 | def convert_from_feature_path(self, input: Path): 119 | feature = self._low_high_spectrogram_load_process(input, test=True) 120 | return self.convert(feature.low) 121 | 122 | def __call__( 123 | self, 124 | input: numpy.ndarray, 125 | acoustic_feature: AcousticFeature, 126 | sampling_rate: int, 127 | ): 128 | high = self.convert(input) 129 | return self.convert_to_audio(high, acoustic_feature=acoustic_feature, sampling_rate=sampling_rate) 130 | -------------------------------------------------------------------------------- /become_yukarin/acoustic_converter.py: -------------------------------------------------------------------------------- 1 | from functools import partial 2 | from pathlib import Path 3 | from typing import Optional 4 | 5 | import chainer 6 | import numpy 7 | import pysptk 8 | import pyworld 9 | 10 | from become_yukarin.config.config import Config 11 | from become_yukarin.data_struct import AcousticFeature 12 | from become_yukarin.data_struct import Wave 13 | from become_yukarin.dataset.dataset import AcousticFeatureDenormalizeProcess 14 | from become_yukarin.dataset.dataset import AcousticFeatureLoadProcess 15 | from become_yukarin.dataset.dataset import AcousticFeatureNormalizeProcess 16 | from become_yukarin.dataset.dataset import AcousticFeatureProcess 17 | from become_yukarin.dataset.dataset import DecodeFeatureProcess 18 | from become_yukarin.dataset.dataset import EncodeFeatureProcess 19 | from become_yukarin.dataset.dataset import WaveFileLoadProcess 20 | from become_yukarin.model.model import create_predictor 21 | 22 | 23 | class AcousticConverter(object): 24 | def __init__(self, config: Config, model_path: Path, gpu: int = None) -> None: 25 | self.config = config 26 | self.model_path = model_path 27 | self.gpu = gpu 28 | 29 | self.model = model = 
create_predictor(config.model) 30 | chainer.serializers.load_npz(str(model_path), model) 31 | if self.gpu is not None: 32 | model.to_gpu(self.gpu) 33 | 34 | self._param = param = config.dataset.param 35 | self._wave_process = WaveFileLoadProcess( 36 | sample_rate=param.voice_param.sample_rate, 37 | top_db=None, 38 | ) 39 | self._feature_process = AcousticFeatureProcess( 40 | frame_period=param.acoustic_feature_param.frame_period, 41 | order=param.acoustic_feature_param.order, 42 | alpha=param.acoustic_feature_param.alpha, 43 | f0_estimating_method=param.acoustic_feature_param.f0_estimating_method, 44 | ) 45 | 46 | self._acoustic_feature_load_process = acoustic_feature_load_process = AcousticFeatureLoadProcess() 47 | 48 | input_mean = acoustic_feature_load_process(config.dataset.input_mean_path, test=True) 49 | input_var = acoustic_feature_load_process(config.dataset.input_var_path, test=True) 50 | target_mean = acoustic_feature_load_process(config.dataset.target_mean_path, test=True) 51 | target_var = acoustic_feature_load_process(config.dataset.target_var_path, test=True) 52 | self._feature_normalize = AcousticFeatureNormalizeProcess( 53 | mean=input_mean, 54 | var=input_var, 55 | ) 56 | self._feature_denormalize = AcousticFeatureDenormalizeProcess( 57 | mean=target_mean, 58 | var=target_var, 59 | ) 60 | 61 | feature_sizes = AcousticFeature.get_sizes( 62 | sampling_rate=param.voice_param.sample_rate, 63 | order=param.acoustic_feature_param.order, 64 | ) 65 | self._encode_feature = EncodeFeatureProcess(config.dataset.features) 66 | self._decode_feature = DecodeFeatureProcess(config.dataset.features, feature_sizes) 67 | 68 | def convert_to_feature(self, input: AcousticFeature, out_sampling_rate: Optional[int] = None): 69 | if out_sampling_rate is None: 70 | out_sampling_rate = self.config.dataset.param.voice_param.sample_rate 71 | 72 | input_feature = input 73 | input = self._feature_normalize(input, test=True) 74 | input = self._encode_feature(input, test=True) 75 | 76 | pad = 128 - input.shape[1] % 128 77 | input = numpy.pad(input, [(0, 0), (0, pad)], mode='minimum') 78 | 79 | converter = partial(chainer.dataset.convert.concat_examples, device=self.gpu, padding=0) 80 | inputs = converter([input]) 81 | 82 | with chainer.using_config('train', False): 83 | out = self.model(inputs).data[0] 84 | 85 | if self.gpu is not None: 86 | out = chainer.cuda.to_cpu(out) 87 | out = out[:, :-pad] 88 | 89 | out = self._decode_feature(out, test=True) 90 | out = AcousticFeature( 91 | f0=out.f0, 92 | spectrogram=out.spectrogram, 93 | aperiodicity=out.aperiodicity, 94 | mfcc=out.mfcc, 95 | voiced=input_feature.voiced, 96 | ) 97 | out = self._feature_denormalize(out, test=True) 98 | out = AcousticFeature( 99 | f0=out.f0, 100 | spectrogram=out.spectrogram, 101 | aperiodicity=input_feature.aperiodicity, 102 | mfcc=out.mfcc, 103 | voiced=out.voiced, 104 | ) 105 | 106 | fftlen = pyworld.get_cheaptrick_fft_size(out_sampling_rate) 107 | spectrogram = pysptk.mc2sp( 108 | out.mfcc, 109 | alpha=self._param.acoustic_feature_param.alpha, 110 | fftlen=fftlen, 111 | ) 112 | 113 | out = AcousticFeature( 114 | f0=out.f0, 115 | spectrogram=spectrogram, 116 | aperiodicity=out.aperiodicity, 117 | mfcc=out.mfcc, 118 | voiced=out.voiced, 119 | ).astype(numpy.float64) 120 | return out 121 | 122 | def convert_from_audio_path(self, path: Path, out_sampling_rate: Optional[int] = None): 123 | wave = self._wave_process(str(path), test=True) 124 | feature = self._feature_process(wave, test=True) 125 | return 
self.convert_from_feature(feature, out_sampling_rate) 126 | 127 | def convert_from_feature_path(self, path: Path, out_sampling_rate: Optional[int] = None): 128 | feature = self._acoustic_feature_load_process(path, test=True) 129 | return self.convert_from_feature(feature, out_sampling_rate) 130 | 131 | def convert_from_feature(self, input: AcousticFeature, out_sampling_rate: Optional[int] = None): 132 | if out_sampling_rate is None: 133 | out_sampling_rate = self.config.dataset.param.voice_param.sample_rate 134 | 135 | out = self.convert_to_feature(input=input, out_sampling_rate=out_sampling_rate) 136 | out = pyworld.synthesize( 137 | f0=out.f0.ravel(), 138 | spectrogram=out.spectrogram, 139 | aperiodicity=out.aperiodicity, 140 | fs=out_sampling_rate, 141 | frame_period=self._param.acoustic_feature_param.frame_period, 142 | ) 143 | return Wave(out, sampling_rate=out_sampling_rate) 144 | 145 | def __call__(self, voice_path: Path, out_sampling_rate: Optional[int] = None): 146 | return self.convert_from_audio_path(voice_path, out_sampling_rate) 147 | -------------------------------------------------------------------------------- /become_yukarin/model/sr_model.py: -------------------------------------------------------------------------------- 1 | import chainer 2 | import chainer.functions as F 3 | import chainer.links as L 4 | 5 | from become_yukarin.config.sr_config import SRModelConfig 6 | 7 | 8 | class CBR(chainer.Chain): 9 | def __init__(self, ch0, ch1, bn=True, sample='down', activation=F.relu, dropout=False) -> None: 10 | super().__init__() 11 | self.bn = bn 12 | self.activation = activation 13 | self.dropout = dropout 14 | 15 | w = chainer.initializers.Normal(0.02) 16 | with self.init_scope(): 17 | if sample == 'down': 18 | self.c = L.Convolution2D(ch0, ch1, 4, 2, 1, initialW=w) 19 | elif sample == 'up': 20 | self.c = L.Deconvolution2D(ch0, ch1, 4, 2, 1, initialW=w) 21 | else: 22 | self.c = L.Convolution2D(ch0, ch1, 1, 1, 0, initialW=w) 23 | if bn: 24 | self.batchnorm = L.BatchNormalization(ch1) 25 | 26 | def __call__(self, x): 27 | h = self.c(x) 28 | if self.bn: 29 | h = self.batchnorm(h) 30 | if self.dropout: 31 | h = F.dropout(h) 32 | if self.activation is not None: 33 | h = self.activation(h) 34 | return h 35 | 36 | 37 | class SREncoder(chainer.Chain): 38 | def __init__(self, in_ch, base=64, extensive_layers=8) -> None: 39 | super().__init__() 40 | w = chainer.initializers.Normal(0.02) 41 | with self.init_scope(): 42 | if extensive_layers > 0: 43 | self.c0 = L.Convolution2D(in_ch, base * 1, 3, 1, 1, initialW=w) 44 | else: 45 | self.c0 = L.Convolution2D(in_ch, base * 1, 1, 1, 0, initialW=w) 46 | 47 | _choose = lambda i: 'down' if i < extensive_layers else 'same' 48 | self.c1 = CBR(base * 1, base * 2, bn=True, sample=_choose(1), activation=F.leaky_relu, dropout=False) 49 | self.c2 = CBR(base * 2, base * 4, bn=True, sample=_choose(2), activation=F.leaky_relu, dropout=False) 50 | self.c3 = CBR(base * 4, base * 8, bn=True, sample=_choose(3), activation=F.leaky_relu, dropout=False) 51 | self.c4 = CBR(base * 8, base * 8, bn=True, sample=_choose(4), activation=F.leaky_relu, dropout=False) 52 | self.c5 = CBR(base * 8, base * 8, bn=True, sample=_choose(5), activation=F.leaky_relu, dropout=False) 53 | self.c6 = CBR(base * 8, base * 8, bn=True, sample=_choose(6), activation=F.leaky_relu, dropout=False) 54 | self.c7 = CBR(base * 8, base * 8, bn=True, sample=_choose(7), activation=F.leaky_relu, dropout=False) 55 | 56 | def __call__(self, x): 57 | hs = [F.leaky_relu(self.c0(x))] 58 | for i in 
range(1, 8): 59 | hs.append(self['c%d' % i](hs[i - 1])) 60 | return hs 61 | 62 | 63 | class SRDecoder(chainer.Chain): 64 | def __init__(self, out_ch, base=64, extensive_layers=8) -> None: 65 | super().__init__() 66 | w = chainer.initializers.Normal(0.02) 67 | with self.init_scope(): 68 | _choose = lambda i: 'up' if i >= 8 - extensive_layers else 'same' 69 | self.c0 = CBR(base * 8, base * 8, bn=True, sample=_choose(0), activation=F.relu, dropout=True) 70 | self.c1 = CBR(base * 16, base * 8, bn=True, sample=_choose(1), activation=F.relu, dropout=True) 71 | self.c2 = CBR(base * 16, base * 8, bn=True, sample=_choose(2), activation=F.relu, dropout=True) 72 | self.c3 = CBR(base * 16, base * 8, bn=True, sample=_choose(3), activation=F.relu, dropout=False) 73 | self.c4 = CBR(base * 16, base * 4, bn=True, sample=_choose(4), activation=F.relu, dropout=False) 74 | self.c5 = CBR(base * 8, base * 2, bn=True, sample=_choose(5), activation=F.relu, dropout=False) 75 | self.c6 = CBR(base * 4, base * 1, bn=True, sample=_choose(6), activation=F.relu, dropout=False) 76 | 77 | if extensive_layers > 0: 78 | self.c7 = L.Convolution2D(base * 2, out_ch, 3, 1, 1, initialW=w) 79 | else: 80 | self.c7 = L.Convolution2D(base * 2, out_ch, 1, 1, 0, initialW=w) 81 | 82 | def __call__(self, hs): 83 | h = self.c0(hs[-1]) 84 | for i in range(1, 8): 85 | h = F.concat([h, hs[-i - 1]]) 86 | if i < 7: 87 | h = self['c%d' % i](h) 88 | else: 89 | h = self.c7(h) 90 | return h 91 | 92 | 93 | class SRPredictor(chainer.Chain): 94 | def __init__(self, in_ch, out_ch, base, extensive_layers) -> None: 95 | super().__init__() 96 | with self.init_scope(): 97 | self.encoder = SREncoder(in_ch, base=base, extensive_layers=extensive_layers) 98 | self.decoder = SRDecoder(out_ch, base=base, extensive_layers=extensive_layers) 99 | 100 | def __call__(self, x): 101 | return self.decoder(self.encoder(x)) 102 | 103 | 104 | class SRDiscriminator(chainer.Chain): 105 | def __init__(self, in_ch, out_ch, base=32, extensive_layers=5) -> None: 106 | super().__init__() 107 | w = chainer.initializers.Normal(0.02) 108 | with self.init_scope(): 109 | _choose = lambda i: 'down' if i < extensive_layers else 'same' 110 | self.c0_0 = CBR(in_ch, base * 1, bn=False, sample=_choose(0), activation=F.leaky_relu, dropout=False) 111 | self.c0_1 = CBR(out_ch, base * 1, bn=False, sample=_choose(0), activation=F.leaky_relu, dropout=False) 112 | self.c1 = CBR(base * 2, base * 4, bn=True, sample=_choose(1), activation=F.leaky_relu, dropout=False) 113 | self.c2 = CBR(base * 4, base * 8, bn=True, sample=_choose(2), activation=F.leaky_relu, dropout=False) 114 | self.c3 = CBR(base * 8, base * 16, bn=True, sample=_choose(3), activation=F.leaky_relu, dropout=False) 115 | 116 | if extensive_layers > 4: 117 | self.c4 = L.Convolution2D(base * 16, 1, 3, 1, 1, initialW=w) 118 | else: 119 | self.c4 = L.Convolution2D(base * 16, 1, 1, 1, 0, initialW=w) 120 | 121 | def __call__(self, x_0, x_1): 122 | h = F.concat([self.c0_0(x_0), self.c0_1(x_1)]) 123 | h = self.c1(h) 124 | h = self.c2(h) 125 | h = self.c3(h) 126 | h = self.c4(h) 127 | # h = F.average_pooling_2d(h, h.data.shape[2], 1, 0) 128 | return h 129 | 130 | 131 | def create_predictor_sr(config: SRModelConfig): 132 | return SRPredictor( 133 | in_ch=1, 134 | out_ch=1, 135 | base=config.generator_base_channels, 136 | extensive_layers=config.generator_extensive_layers, 137 | ) 138 | 139 | 140 | def create_discriminator_sr(config: SRModelConfig): 141 | return SRDiscriminator( 142 | in_ch=1, 143 | out_ch=1, 144 | 
base=config.discriminator_base_channels, 145 | extensive_layers=config.discriminator_extensive_layers, 146 | ) 147 | 148 | 149 | def create_sr(config: SRModelConfig): 150 | predictor = create_predictor_sr(config) 151 | discriminator = create_discriminator_sr(config) 152 | return predictor, discriminator 153 | -------------------------------------------------------------------------------- /become_yukarin/model/model.py: -------------------------------------------------------------------------------- 1 | import chainer 2 | import chainer.functions as F 3 | import chainer.links as L 4 | 5 | from become_yukarin.config.config import ModelConfig 6 | 7 | 8 | class Convolution1D(chainer.links.ConvolutionND): 9 | def __init__(self, in_channels, out_channels, ksize, stride=1, pad=0, 10 | nobias=False, initialW=None, initial_bias=None, 11 | cover_all=False) -> None: 12 | super().__init__( 13 | ndim=1, 14 | in_channels=in_channels, 15 | out_channels=out_channels, 16 | ksize=ksize, 17 | stride=stride, 18 | pad=pad, 19 | nobias=nobias, 20 | initialW=initialW, 21 | initial_bias=initial_bias, 22 | cover_all=cover_all, 23 | ) 24 | 25 | 26 | class Deconvolution1D(chainer.links.DeconvolutionND): 27 | def __init__(self, in_channels, out_channels, ksize, stride=1, pad=0, 28 | nobias=False, outsize=None, 29 | initialW=None, initial_bias=None) -> None: 30 | super().__init__( 31 | ndim=1, 32 | in_channels=in_channels, 33 | out_channels=out_channels, 34 | ksize=ksize, 35 | stride=stride, 36 | pad=pad, 37 | nobias=nobias, 38 | outsize=outsize, 39 | initialW=initialW, 40 | initial_bias=initial_bias, 41 | ) 42 | 43 | 44 | class CBR(chainer.Chain): 45 | def __init__(self, ch0, ch1, bn=True, sample='down', activation=F.relu, dropout=False) -> None: 46 | super().__init__() 47 | self.bn = bn 48 | self.activation = activation 49 | self.dropout = dropout 50 | 51 | w = chainer.initializers.Normal(0.02) 52 | with self.init_scope(): 53 | if sample == 'down': 54 | self.c = Convolution1D(ch0, ch1, 4, 2, 1, initialW=w) 55 | elif sample == 'up': 56 | self.c = Deconvolution1D(ch0, ch1, 4, 2, 1, initialW=w) 57 | else: 58 | self.c = Convolution1D(ch0, ch1, 1, 1, 0, initialW=w) 59 | if bn: 60 | self.batchnorm = L.BatchNormalization(ch1) 61 | 62 | def __call__(self, x): 63 | h = self.c(x) 64 | if self.bn: 65 | h = self.batchnorm(h) 66 | if self.dropout: 67 | h = F.dropout(h) 68 | if self.activation is not None: 69 | h = self.activation(h) 70 | return h 71 | 72 | 73 | class Encoder(chainer.Chain): 74 | def __init__(self, in_ch, base=64, extensive_layers=8) -> None: 75 | super().__init__() 76 | w = chainer.initializers.Normal(0.02) 77 | with self.init_scope(): 78 | if extensive_layers > 0: 79 | self.c0 = Convolution1D(in_ch, base * 1, 3, 1, 1, initialW=w) 80 | else: 81 | self.c0 = Convolution1D(in_ch, base * 1, 1, 1, 0, initialW=w) 82 | 83 | _choose = lambda i: 'down' if i < extensive_layers else 'same' 84 | self.c1 = CBR(base * 1, base * 2, bn=True, sample=_choose(1), activation=F.leaky_relu, dropout=False) 85 | self.c2 = CBR(base * 2, base * 4, bn=True, sample=_choose(2), activation=F.leaky_relu, dropout=False) 86 | self.c3 = CBR(base * 4, base * 8, bn=True, sample=_choose(3), activation=F.leaky_relu, dropout=False) 87 | self.c4 = CBR(base * 8, base * 8, bn=True, sample=_choose(4), activation=F.leaky_relu, dropout=False) 88 | self.c5 = CBR(base * 8, base * 8, bn=True, sample=_choose(5), activation=F.leaky_relu, dropout=False) 89 | self.c6 = CBR(base * 8, base * 8, bn=True, sample=_choose(6), activation=F.leaky_relu, dropout=False) 90 
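# Each 'down' CBR halves the time axis, so with extensive_layers=8 the seven
# downsampling blocks compress the input by a factor of 128 (which is why the
# converters pad inputs to a multiple of 128); __call__ keeps every intermediate
# activation so the Decoder can concatenate them back in reverse order as U-Net
# style skip connections, hence the doubled input channels on its CBR blocks.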
| self.c7 = CBR(base * 8, base * 8, bn=True, sample=_choose(7), activation=F.leaky_relu, dropout=False) 91 | 92 | def __call__(self, x): 93 | hs = [F.leaky_relu(self.c0(x))] 94 | for i in range(1, 8): 95 | hs.append(self['c%d' % i](hs[i - 1])) 96 | return hs 97 | 98 | 99 | class Decoder(chainer.Chain): 100 | def __init__(self, out_ch, base=64, extensive_layers=8) -> None: 101 | super().__init__() 102 | w = chainer.initializers.Normal(0.02) 103 | with self.init_scope(): 104 | _choose = lambda i: 'up' if i >= 8 - extensive_layers else 'same' 105 | self.c0 = CBR(base * 8, base * 8, bn=True, sample=_choose(0), activation=F.relu, dropout=True) 106 | self.c1 = CBR(base * 16, base * 8, bn=True, sample=_choose(1), activation=F.relu, dropout=True) 107 | self.c2 = CBR(base * 16, base * 8, bn=True, sample=_choose(2), activation=F.relu, dropout=True) 108 | self.c3 = CBR(base * 16, base * 8, bn=True, sample=_choose(3), activation=F.relu, dropout=False) 109 | self.c4 = CBR(base * 16, base * 4, bn=True, sample=_choose(4), activation=F.relu, dropout=False) 110 | self.c5 = CBR(base * 8, base * 2, bn=True, sample=_choose(5), activation=F.relu, dropout=False) 111 | self.c6 = CBR(base * 4, base * 1, bn=True, sample=_choose(6), activation=F.relu, dropout=False) 112 | 113 | if extensive_layers > 0: 114 | self.c7 = Convolution1D(base * 2, out_ch, 3, 1, 1, initialW=w) 115 | else: 116 | self.c7 = Convolution1D(base * 2, out_ch, 1, 1, 0, initialW=w) 117 | 118 | def __call__(self, hs): 119 | h = self.c0(hs[-1]) 120 | for i in range(1, 8): 121 | h = F.concat([h, hs[-i - 1]]) 122 | if i < 7: 123 | h = self['c%d' % i](h) 124 | else: 125 | h = self.c7(h) 126 | return h 127 | 128 | 129 | class Predictor(chainer.Chain): 130 | def __init__(self, in_ch, out_ch, base=64, extensive_layers=8) -> None: 131 | super().__init__() 132 | with self.init_scope(): 133 | self.encoder = Encoder(in_ch, base=base, extensive_layers=extensive_layers) 134 | self.decoder = Decoder(out_ch, base=base, extensive_layers=extensive_layers) 135 | 136 | def __call__(self, x): 137 | return self.decoder(self.encoder(x)) 138 | 139 | 140 | class Discriminator(chainer.Chain): 141 | def __init__(self, in_ch, out_ch, base=32, extensive_layers=5, is_weak=False) -> None: 142 | super().__init__() 143 | w = chainer.initializers.Normal(0.02) 144 | with self.init_scope(): 145 | _choose = lambda i: 'down' if i < extensive_layers else 'same' 146 | self.c0_0 = CBR(in_ch, base * 1, bn=False, sample=_choose(0), activation=F.leaky_relu, dropout=is_weak) 147 | self.c0_1 = CBR(out_ch, base * 1, bn=False, sample=_choose(0), activation=F.leaky_relu, dropout=is_weak) 148 | self.c1 = CBR(base * 2, base * 4, bn=True, sample=_choose(1), activation=F.leaky_relu, dropout=is_weak) 149 | self.c2 = CBR(base * 4, base * 8, bn=True, sample=_choose(2), activation=F.leaky_relu, dropout=is_weak) 150 | self.c3 = CBR(base * 8, base * 16, bn=True, sample=_choose(3), activation=F.leaky_relu, dropout=is_weak) 151 | 152 | if extensive_layers > 4: 153 | self.c4 = Convolution1D(base * 16, 1, 3, 1, 1, initialW=w) 154 | else: 155 | self.c4 = Convolution1D(base * 16, 1, 1, 1, 0, initialW=w) 156 | 157 | def __call__(self, x_0, x_1): 158 | h = F.concat([self.c0_0(x_0), self.c0_1(x_1)]) 159 | h = self.c1(h) 160 | h = self.c2(h) 161 | h = self.c3(h) 162 | h = self.c4(h) 163 | # h = F.average_pooling_2d(h, h.data.shape[2], 1, 0) 164 | return h 165 | 166 | 167 | def create_predictor(config: ModelConfig): 168 | return Predictor( 169 | in_ch=config.in_channels, 170 | out_ch=config.out_channels, 171 | 
base=config.generator_base_channels, 172 | extensive_layers=config.generator_extensive_layers, 173 | ) 174 | 175 | 176 | def create_discriminator(config: ModelConfig): 177 | return Discriminator( 178 | in_ch=config.in_channels, 179 | out_ch=config.out_channels, 180 | base=config.discriminator_base_channels, 181 | extensive_layers=config.discriminator_extensive_layers, 182 | is_weak=config.weak_discriminator, 183 | ) 184 | 185 | 186 | def create(config: ModelConfig): 187 | predictor = create_predictor(config) 188 | discriminator = create_discriminator(config) 189 | return predictor, discriminator 190 | -------------------------------------------------------------------------------- /scripts/extract_acoustic_feature.py: -------------------------------------------------------------------------------- 1 | """ 2 | extract alignments voices. 3 | """ 4 | 5 | import argparse 6 | import multiprocessing 7 | from pathlib import Path 8 | from pprint import pprint 9 | 10 | import numpy 11 | 12 | from become_yukarin.acoustic_converter import AcousticConverter 13 | from become_yukarin.config.config import create_from_json as create_config 14 | from become_yukarin.data_struct import AcousticFeature 15 | from become_yukarin.dataset.dataset import AcousticFeatureLoadProcess 16 | from become_yukarin.dataset.dataset import AcousticFeatureProcess 17 | from become_yukarin.dataset.dataset import AcousticFeatureSaveProcess 18 | from become_yukarin.dataset.dataset import WaveFileLoadProcess 19 | from become_yukarin.dataset.utility import MelCepstrumAligner 20 | from become_yukarin.param import AcousticFeatureParam 21 | from become_yukarin.param import VoiceParam 22 | 23 | base_voice_param = VoiceParam() 24 | base_acoustic_feature_param = AcousticFeatureParam() 25 | 26 | parser = argparse.ArgumentParser() 27 | parser.add_argument('--input1_directory', '-i1', type=Path) 28 | parser.add_argument('--input2_directory', '-i2', type=Path) 29 | parser.add_argument('--output1_directory', '-o1', type=Path) 30 | parser.add_argument('--output2_directory', '-o2', type=Path) 31 | parser.add_argument('--pre_converter1_config', type=Path) 32 | parser.add_argument('--pre_converter1_model', type=Path) 33 | parser.add_argument('--sample_rate', type=int, default=base_voice_param.sample_rate) 34 | parser.add_argument('--top_db', type=float, default=base_voice_param.top_db) 35 | parser.add_argument('--pad_second', type=float, default=base_voice_param.pad_second) 36 | parser.add_argument('--frame_period', type=int, default=base_acoustic_feature_param.frame_period) 37 | parser.add_argument('--order', type=int, default=base_acoustic_feature_param.order) 38 | parser.add_argument('--alpha', type=float, default=base_acoustic_feature_param.alpha) 39 | parser.add_argument('--f0_estimating_method', type=str, default=base_acoustic_feature_param.f0_estimating_method) 40 | parser.add_argument('--f0_floor1', type=float, default=71) 41 | parser.add_argument('--f0_ceil1', type=float, default=800) 42 | parser.add_argument('--f0_floor2', type=float, default=71) 43 | parser.add_argument('--f0_ceil2', type=float, default=800) 44 | parser.add_argument('--ignore_feature', nargs='+', default=['spectrogram', 'aperiodicity']) 45 | parser.add_argument('--disable_alignment', action='store_true') 46 | parser.add_argument('--enable_overwrite', action='store_true') 47 | arguments = parser.parse_args() 48 | 49 | pre_convert = arguments.pre_converter1_config is not None 50 | if pre_convert: 51 | config = create_config(arguments.pre_converter1_config) 52 | 
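# A minimal invocation sketch for this script (the directory names below are
# placeholders; only the flags defined above are real):
#
#     python scripts/extract_acoustic_feature.py \
#         -i1 ./wav_input_speaker -i2 ./wav_target_speaker \
#         -o1 ./feature_input_speaker -o2 ./feature_target_speaker \
#         --pad_second 0.2 --top_db 60
#
# Besides one .npy per utterance, each output directory also receives mean.npy and
# var.npy computed by generate_mean_var() below.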
pre_converter1 = AcousticConverter(config, arguments.pre_converter1_model) 53 | else: 54 | pre_converter1 = None 55 | 56 | 57 | def generate_feature(path1, path2): 58 | out1 = Path(arguments.output1_directory, path1.stem + '.npy') 59 | out2 = Path(arguments.output2_directory, path2.stem + '.npy') 60 | if out1.exists() and out2.exists() and not arguments.enable_overwrite: 61 | return 62 | 63 | # load wave and padding 64 | wave_file_load_process = WaveFileLoadProcess( 65 | sample_rate=arguments.sample_rate, 66 | top_db=arguments.top_db, 67 | pad_second=arguments.pad_second, 68 | ) 69 | wave1 = wave_file_load_process(path1, test=True) 70 | wave2 = wave_file_load_process(path2, test=True) 71 | 72 | # make acoustic feature 73 | acoustic_feature_process1 = AcousticFeatureProcess( 74 | frame_period=arguments.frame_period, 75 | order=arguments.order, 76 | alpha=arguments.alpha, 77 | f0_estimating_method=arguments.f0_estimating_method, 78 | f0_floor=arguments.f0_floor1, 79 | f0_ceil=arguments.f0_ceil1, 80 | ) 81 | acoustic_feature_process2 = AcousticFeatureProcess( 82 | frame_period=arguments.frame_period, 83 | order=arguments.order, 84 | alpha=arguments.alpha, 85 | f0_estimating_method=arguments.f0_estimating_method, 86 | f0_floor=arguments.f0_floor2, 87 | f0_ceil=arguments.f0_ceil2, 88 | ) 89 | f1 = acoustic_feature_process1(wave1, test=True).astype_only_float(numpy.float32) 90 | f2 = acoustic_feature_process2(wave2, test=True).astype_only_float(numpy.float32) 91 | 92 | # pre convert 93 | if pre_convert: 94 | f1_ref = pre_converter1.convert_to_feature(f1) 95 | else: 96 | f1_ref = f1 97 | 98 | # alignment 99 | if not arguments.disable_alignment: 100 | aligner = MelCepstrumAligner(f1_ref.mfcc, f2.mfcc) 101 | 102 | f0_1, f0_2 = aligner.align(f1.f0, f2.f0) 103 | spectrogram_1, spectrogram_2 = aligner.align(f1.spectrogram, f2.spectrogram) 104 | aperiodicity_1, aperiodicity_2 = aligner.align(f1.aperiodicity, f2.aperiodicity) 105 | mfcc_1, mfcc_2 = aligner.align(f1.mfcc, f2.mfcc) 106 | voiced_1, voiced_2 = aligner.align(f1.voiced, f2.voiced) 107 | 108 | f1 = AcousticFeature( 109 | f0=f0_1, 110 | spectrogram=spectrogram_1, 111 | aperiodicity=aperiodicity_1, 112 | mfcc=mfcc_1, 113 | voiced=voiced_1, 114 | ) 115 | f2 = AcousticFeature( 116 | f0=f0_2, 117 | spectrogram=spectrogram_2, 118 | aperiodicity=aperiodicity_2, 119 | mfcc=mfcc_2, 120 | voiced=voiced_2, 121 | ) 122 | 123 | f1.validate() 124 | f2.validate() 125 | 126 | # save 127 | acoustic_feature_save_process = AcousticFeatureSaveProcess(validate=True, ignore=arguments.ignore_feature) 128 | acoustic_feature_save_process({'path': out1, 'feature': f1}) 129 | print('saved!', out1) 130 | 131 | acoustic_feature_save_process({'path': out2, 'feature': f2}) 132 | print('saved!', out2) 133 | 134 | 135 | def generate_mean_var(path_directory: Path): 136 | path_mean = Path(path_directory, 'mean.npy') 137 | path_var = Path(path_directory, 'var.npy') 138 | if path_mean.exists(): 139 | path_mean.unlink() 140 | if path_var.exists(): 141 | path_var.unlink() 142 | 143 | acoustic_feature_load_process = AcousticFeatureLoadProcess(validate=False) 144 | acoustic_feature_save_process = AcousticFeatureSaveProcess(validate=False) 145 | 146 | f0_list = [] 147 | spectrogram_list = [] 148 | aperiodicity_list = [] 149 | mfcc_list = [] 150 | for path in path_directory.glob('*'): 151 | feature = acoustic_feature_load_process(path) 152 | f0_list.append(feature.f0[feature.voiced]) # remove unvoiced 153 | spectrogram_list.append(feature.spectrogram) 154 | 
aperiodicity_list.append(feature.aperiodicity) 155 | mfcc_list.append(feature.mfcc) 156 | 157 | def concatenate(arr_list): 158 | try: 159 | arr_list = numpy.concatenate(arr_list) 160 | except: 161 | pass 162 | return arr_list 163 | 164 | f0_list = concatenate(f0_list) 165 | spectrogram_list = concatenate(spectrogram_list) 166 | aperiodicity_list = concatenate(aperiodicity_list) 167 | mfcc_list = concatenate(mfcc_list) 168 | 169 | mean = AcousticFeature( 170 | f0=numpy.mean(f0_list, axis=0, keepdims=True), 171 | spectrogram=numpy.mean(spectrogram_list, axis=0, keepdims=True), 172 | aperiodicity=numpy.mean(aperiodicity_list, axis=0, keepdims=True), 173 | mfcc=numpy.mean(mfcc_list, axis=0, keepdims=True), 174 | voiced=numpy.nan, 175 | ) 176 | var = AcousticFeature( 177 | f0=numpy.var(f0_list, axis=0, keepdims=True), 178 | spectrogram=numpy.var(spectrogram_list, axis=0, keepdims=True), 179 | aperiodicity=numpy.var(aperiodicity_list, axis=0, keepdims=True), 180 | mfcc=numpy.var(mfcc_list, axis=0, keepdims=True), 181 | voiced=numpy.nan, 182 | ) 183 | 184 | acoustic_feature_save_process({'path': path_mean, 'feature': mean}) 185 | acoustic_feature_save_process({'path': path_var, 'feature': var}) 186 | 187 | 188 | def main(): 189 | pprint(vars(arguments)) 190 | 191 | paths1 = list(sorted(arguments.input1_directory.glob('*'))) 192 | paths2 = list(sorted(arguments.input2_directory.glob('*'))) 193 | assert len(paths1) == len(paths2) 194 | 195 | arguments.output1_directory.mkdir(exist_ok=True) 196 | arguments.output2_directory.mkdir(exist_ok=True) 197 | 198 | pool = multiprocessing.Pool() 199 | pool.starmap(generate_feature, zip(paths1, paths2), chunksize=16) 200 | 201 | generate_mean_var(arguments.output1_directory) 202 | generate_mean_var(arguments.output2_directory) 203 | 204 | 205 | if __name__ == '__main__': 206 | main() 207 | -------------------------------------------------------------------------------- /become_yukarin/model/cbhg_model.py: -------------------------------------------------------------------------------- 1 | from functools import partial 2 | from typing import List 3 | 4 | import chainer 5 | 6 | from become_yukarin.config.old_config import CBHGDiscriminatorModelConfig 7 | from become_yukarin.config.old_config import CBHGModelConfig 8 | 9 | 10 | class Convolution1D(chainer.links.ConvolutionND): 11 | def __init__(self, in_channels, out_channels, ksize, stride=1, pad=0, 12 | nobias=False, initialW=None, initial_bias=None, 13 | cover_all=False): 14 | super().__init__( 15 | ndim=1, 16 | in_channels=in_channels, 17 | out_channels=out_channels, 18 | ksize=ksize, 19 | stride=stride, 20 | pad=pad, 21 | nobias=nobias, 22 | initialW=initialW, 23 | initial_bias=initial_bias, 24 | cover_all=cover_all, 25 | ) 26 | 27 | 28 | class LegacyConvolution1D(chainer.links.Convolution2D): 29 | def __init__(self, in_channels, out_channels, ksize=None, stride=1, pad=0, 30 | nobias=False, initialW=None, initial_bias=None, **kwargs): 31 | assert ksize is None or isinstance(ksize, int) 32 | assert isinstance(stride, int) 33 | assert isinstance(pad, int) 34 | super().__init__( 35 | in_channels=in_channels, 36 | out_channels=out_channels, 37 | ksize=(ksize, 1), 38 | stride=(stride, 1), 39 | pad=(pad, 0), 40 | nobias=nobias, 41 | initialW=initialW, 42 | initial_bias=initial_bias, 43 | **kwargs, 44 | ) 45 | 46 | def __call__(self, x): 47 | assert x.shape[-1] == 1 48 | return super().__call__(x) 49 | 50 | 51 | class ConvHighway(chainer.link.Chain): 52 | def __init__(self, in_out_size, nobias=False, 
activate=chainer.functions.relu, 53 | init_Wh=None, init_Wt=None, init_bh=None, init_bt=-1): 54 | super().__init__() 55 | self.activate = activate 56 | 57 | with self.init_scope(): 58 | self.plain = Convolution1D( 59 | in_out_size, in_out_size, 1, nobias=nobias, 60 | initialW=init_Wh, initial_bias=init_bh) 61 | self.transform = Convolution1D( 62 | in_out_size, in_out_size, 1, nobias=nobias, 63 | initialW=init_Wt, initial_bias=init_bt) 64 | 65 | def __call__(self, x): 66 | out_plain = self.activate(self.plain(x)) 67 | out_transform = chainer.functions.sigmoid(self.transform(x)) 68 | y = out_plain * out_transform + x * (1 - out_transform) 69 | return y 70 | 71 | 72 | class PreNet(chainer.link.Chain): 73 | def __init__(self, in_channels: int, hidden_channels: int, out_channels: int) -> None: 74 | super().__init__() 75 | with self.init_scope(): 76 | self.conv1 = Convolution1D(in_channels, hidden_channels, 1) 77 | self.conv2 = Convolution1D(hidden_channels, out_channels, 1) 78 | 79 | def __call__(self, x): 80 | h = x 81 | h = chainer.functions.dropout(chainer.functions.relu(self.conv1(h)), 0.5) 82 | h = chainer.functions.dropout(chainer.functions.relu(self.conv2(h)), 0.5) 83 | return h 84 | 85 | 86 | class Conv1DBank(chainer.link.Chain): 87 | def __init__(self, in_channels: int, out_channels: int, k: int) -> None: 88 | super().__init__() 89 | self.stacked_channels = out_channels * k 90 | self.pads = [ 91 | partial(chainer.functions.pad, pad_width=((0, 0), (0, 0), (i // 2, (i + 1) // 2)), mode='constant') 92 | for i in range(k) 93 | ] 94 | 95 | with self.init_scope(): 96 | self.convs = chainer.link.ChainList( 97 | *(Convolution1D(in_channels, out_channels, i + 1, nobias=True) for i in range(k)) 98 | ) 99 | self.bn = chainer.links.BatchNormalization(out_channels * k) 100 | 101 | def __call__(self, x): 102 | h = x 103 | h = chainer.functions.concat([conv(pad(h)) for pad, conv in zip(self.pads, self.convs)]) 104 | h = chainer.functions.relu(self.bn(h)) 105 | return h 106 | 107 | 108 | class Conv1DProjections(chainer.link.Chain): 109 | def __init__(self, in_channels: int, hidden_channels: int, out_channels: int) -> None: 110 | super().__init__() 111 | 112 | with self.init_scope(): 113 | self.conv1 = Convolution1D(in_channels, hidden_channels, 3, pad=1, nobias=True) 114 | self.bn1 = chainer.links.BatchNormalization(hidden_channels) 115 | self.conv2 = Convolution1D(hidden_channels, out_channels, 3, pad=1, nobias=True) 116 | self.bn2 = chainer.links.BatchNormalization(out_channels) 117 | 118 | def __call__(self, x): 119 | h = x 120 | h = chainer.functions.relu(self.bn1(self.conv1(h))) 121 | h = chainer.functions.relu(self.bn2(self.conv2(h))) 122 | return h 123 | 124 | 125 | class CBHG(chainer.link.Chain): 126 | def __init__( 127 | self, 128 | in_channels: int, 129 | conv_bank_out_channels: int, 130 | conv_bank_k: int, 131 | max_pooling_k: int, 132 | conv_projections_hidden_channels: int, 133 | highway_layers: int, 134 | out_channels: int, 135 | disable_last_rnn: bool, 136 | ) -> None: 137 | super().__init__() 138 | self.max_pooling_padding = partial( 139 | chainer.functions.pad, 140 | pad_width=((0, 0), (0, 0), ((max_pooling_k - 1) // 2, max_pooling_k // 2)), 141 | mode='constant', 142 | ) 143 | self.max_pooling = chainer.functions.MaxPoolingND(1, max_pooling_k, 1, cover_all=False) 144 | self.out_size = out_channels * (1 if disable_last_rnn else 2) 145 | 146 | with self.init_scope(): 147 | self.conv_bank = Conv1DBank( 148 | in_channels=in_channels, 149 | out_channels=conv_bank_out_channels, 150 |
k=conv_bank_k, 151 | ) 152 | self.conv_projectoins = Conv1DProjections( 153 | in_channels=self.conv_bank.stacked_channels, 154 | hidden_channels=conv_projections_hidden_channels, 155 | out_channels=out_channels, 156 | ) 157 | self.highways = chainer.link.ChainList( 158 | *([ConvHighway(out_channels) for _ in range(highway_layers)]) 159 | ) 160 | if not disable_last_rnn: 161 | self.gru = chainer.links.NStepBiGRU( 162 | n_layers=1, 163 | in_size=out_channels, 164 | out_size=out_channels, 165 | dropout=0.0, 166 | ) 167 | 168 | def __call__(self, x): 169 | h = x 170 | h = self.conv_bank(h) 171 | h = self.max_pooling(self.max_pooling_padding(h)) 172 | h = self.conv_projectoins(h) 173 | h = h + x 174 | for highway in self.highways: 175 | h = highway(h) 176 | 177 | if hasattr(self, 'gru'): 178 | h = chainer.functions.separate(chainer.functions.transpose(h, axes=(0, 2, 1))) 179 | _, h = self.gru(None, h) 180 | h = chainer.functions.transpose(chainer.functions.stack(h), axes=(0, 2, 1)) 181 | return h 182 | 183 | 184 | class Predictor(chainer.link.Chain): 185 | def __init__(self, network, out_size: int) -> None: 186 | super().__init__() 187 | with self.init_scope(): 188 | self.network = network 189 | self.last = Convolution1D(network.out_size, out_size, 1) 190 | 191 | def __call__(self, x): 192 | h = x 193 | h = self.network(h) 194 | h = self.last(h) 195 | return h 196 | 197 | 198 | class Aligner(chainer.link.Chain): 199 | def __init__(self, in_size: int, out_time_length: int) -> None: 200 | super().__init__() 201 | with self.init_scope(): 202 | self.gru = chainer.links.NStepBiGRU( 203 | n_layers=1, 204 | in_size=in_size, 205 | out_size=in_size // 2, 206 | dropout=0.0, 207 | ) 208 | self.last = Convolution1D(in_size // 2 * 2, out_time_length, 1) 209 | 210 | def __call__(self, x): 211 | """ 212 | :param x: (batch, channel, timeA) 213 | """ 214 | h = x 215 | h = chainer.functions.separate(chainer.functions.transpose(h, axes=(0, 2, 1))) # h: batch * (timeA, channel) 216 | _, h = self.gru(None, h) # h: batch * (timeA, ?) 
217 | h = chainer.functions.transpose(chainer.functions.stack(h), axes=(0, 2, 1)) # h: (batch, ?, timeA) 218 | h = chainer.functions.softmax(self.last(h), axis=1) # h: (batch, timeB, timeA) 219 | 220 | h = chainer.functions.matmul(x, h) # h: (batch, channel, time) 221 | return h 222 | 223 | 224 | class Discriminator(chainer.link.Chain): 225 | def __init__(self, in_channels: int, hidden_channels_list: List[int]) -> None: 226 | super().__init__() 227 | with self.init_scope(): 228 | self.convs = chainer.link.ChainList(*( 229 | LegacyConvolution1D(i_c, o_c, ksize=2, stride=2) 230 | for i_c, o_c in zip([in_channels] + hidden_channels_list[:-1], hidden_channels_list) 231 | )) 232 | self.last_conv = LegacyConvolution1D(hidden_channels_list[-1], 1, ksize=1) 233 | 234 | def __call__(self, x): 235 | """ 236 | :param x: (batch, channel, time) 237 | """ 238 | h = x 239 | h = chainer.functions.reshape(h, h.shape + (1,)) 240 | for conv in self.convs.children(): 241 | h = chainer.functions.relu(conv(h)) 242 | h = self.last_conv(h) 243 | h = chainer.functions.reshape(h, h.shape[:-1]) 244 | return h 245 | 246 | 247 | def create_predictor(config: CBHGModelConfig): 248 | network = CBHG( 249 | in_channels=config.in_channels, 250 | conv_bank_out_channels=config.conv_bank_out_channels, 251 | conv_bank_k=config.conv_bank_k, 252 | max_pooling_k=config.max_pooling_k, 253 | conv_projections_hidden_channels=config.conv_projections_hidden_channels, 254 | highway_layers=config.highway_layers, 255 | out_channels=config.out_channels, 256 | disable_last_rnn=config.disable_last_rnn, 257 | ) 258 | predictor = Predictor( 259 | network=network, 260 | out_size=config.out_size, 261 | ) 262 | return predictor 263 | 264 | 265 | def create_aligner(config: CBHGModelConfig): 266 | assert config.enable_aligner 267 | aligner = Aligner( 268 | in_size=config.in_channels, 269 | out_time_length=config.aligner_out_time_length, 270 | ) 271 | return aligner 272 | 273 | 274 | def create_discriminator(config: CBHGDiscriminatorModelConfig): 275 | discriminator = Discriminator( 276 | in_channels=config.in_channels, 277 | hidden_channels_list=config.hidden_channels_list, 278 | ) 279 | return discriminator 280 | 281 | 282 | def create(config: CBHGModelConfig): 283 | predictor = create_predictor(config) 284 | if config.enable_aligner: 285 | aligner = create_aligner(config) 286 | else: 287 | aligner = None 288 | if config.discriminator is not None: 289 | discriminator = create_discriminator(config.discriminator) 290 | else: 291 | discriminator = None 292 | return predictor, aligner, discriminator 293 | -------------------------------------------------------------------------------- /become_yukarin/dataset/dataset.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import glob 3 | import typing 4 | from abc import ABCMeta, abstractmethod 5 | from collections import defaultdict 6 | from pathlib import Path 7 | from typing import Any 8 | from typing import Callable 9 | from typing import Dict 10 | from typing import List 11 | 12 | import chainer 13 | import librosa 14 | import numpy 15 | import pysptk 16 | import pyworld 17 | import scipy.ndimage 18 | 19 | from ..config.config import DatasetConfig 20 | from ..config.sr_config import SRDatasetConfig 21 | from ..data_struct import AcousticFeature 22 | from ..data_struct import LowHighSpectrogramFeature 23 | from ..data_struct import Wave 24 | 25 | 26 | class BaseDataProcess(metaclass=ABCMeta): 27 | @abstractmethod 28 | def __call__(self, data, test): 
29 | pass 30 | 31 | 32 | class LambdaProcess(BaseDataProcess): 33 | def __init__(self, process: Callable[[Any, bool], Any]) -> None: 34 | self._process = process 35 | 36 | def __call__(self, data, test): 37 | return self._process(data, test) 38 | 39 | 40 | class DictKeyReplaceProcess(BaseDataProcess): 41 | def __init__(self, key_map: Dict[str, str]) -> None: 42 | self._key_map = key_map 43 | 44 | def __call__(self, data: Dict[str, Any], test): 45 | return {key_after: data[key_before] for key_after, key_before in self._key_map} 46 | 47 | 48 | class ChainProcess(BaseDataProcess): 49 | def __init__(self, process: typing.Iterable[BaseDataProcess]) -> None: 50 | self._process = list(process) 51 | 52 | def __call__(self, data, test): 53 | for p in self._process: 54 | data = p(data, test) 55 | return data 56 | 57 | def append(self, process: BaseDataProcess): 58 | self._process.append(process) 59 | 60 | 61 | class SplitProcess(BaseDataProcess): 62 | def __init__(self, process: typing.Dict[str, typing.Optional[BaseDataProcess]]) -> None: 63 | self._process = process 64 | 65 | def __call__(self, data, test): 66 | data = { 67 | k: p(data, test) if p is not None else data 68 | for k, p in self._process.items() 69 | } 70 | return data 71 | 72 | 73 | class WaveFileLoadProcess(BaseDataProcess): 74 | def __init__(self, sample_rate: int, top_db: float = None, pad_second: float = 0, dtype=numpy.float32) -> None: 75 | self._sample_rate = sample_rate 76 | self._top_db = top_db 77 | self._pad_second = pad_second 78 | self._dtype = dtype 79 | 80 | def __call__(self, data: str, test=None): 81 | wave = librosa.core.load(data, sr=self._sample_rate, dtype=self._dtype)[0] 82 | if self._top_db is not None: 83 | wave = librosa.effects.remix(wave, intervals=librosa.effects.split(wave, top_db=self._top_db)) 84 | if self._pad_second > 0.0: 85 | p = int(self._sample_rate * self._pad_second) 86 | wave = numpy.pad(wave, pad_width=(p, p), mode='constant') 87 | return Wave(wave, self._sample_rate) 88 | 89 | 90 | class AcousticFeatureProcess(BaseDataProcess): 91 | def __init__( 92 | self, 93 | frame_period, 94 | order, 95 | alpha, 96 | f0_estimating_method, 97 | f0_floor=71, 98 | f0_ceil=800, 99 | dtype=numpy.float32, 100 | ) -> None: 101 | self._frame_period = frame_period 102 | self._order = order 103 | self._alpha = alpha 104 | self._f0_estimating_method = f0_estimating_method 105 | self._f0_floor = f0_floor 106 | self._f0_ceil = f0_ceil 107 | self._dtype = dtype 108 | 109 | def __call__(self, data: Wave, test=None): 110 | x = data.wave.astype(numpy.float64) 111 | fs = data.sampling_rate 112 | 113 | if self._f0_estimating_method == 'dio': 114 | _f0, t = pyworld.dio( 115 | x, 116 | fs, 117 | frame_period=self._frame_period, 118 | f0_floor=self._f0_floor, 119 | f0_ceil=self._f0_ceil, 120 | ) 121 | else: 122 | from world4py.np import apis 123 | _f0, t = apis.harvest( 124 | x, 125 | fs, 126 | frame_period=self._frame_period, 127 | f0_floor=self._f0_floor, 128 | f0_ceil=self._f0_ceil, 129 | ) 130 | f0 = pyworld.stonemask(x, _f0, t, fs) 131 | spectrogram = pyworld.cheaptrick(x, f0, t, fs) 132 | aperiodicity = pyworld.d4c(x, f0, t, fs) 133 | 134 | mfcc = pysptk.sp2mc(spectrogram, order=self._order, alpha=self._alpha) 135 | voiced = ~(f0 == 0) # type: numpy.ndarray 136 | 137 | feature = AcousticFeature( 138 | f0=f0[:, None].astype(self._dtype), 139 | spectrogram=spectrogram.astype(self._dtype), 140 | aperiodicity=aperiodicity.astype(self._dtype), 141 | mfcc=mfcc.astype(self._dtype), 142 | voiced=voiced[:, None], 143 | ) 144 | 
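# Summary of the extraction above: f0 comes from DIO or Harvest and is refined with
# StoneMask, CheapTrick estimates the spectral envelope, D4C the aperiodicity, and
# pysptk.sp2mc reduces the envelope to an (order + 1)-dimensional mel-cepstrum
# (stored in the 'mfcc' field); 'voiced' simply marks frames whose f0 is non-zero.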
feature.validate() 145 | return feature 146 | 147 | 148 | class LowHighSpectrogramFeatureProcess(BaseDataProcess): 149 | def __init__(self, frame_period, order, alpha, f0_estimating_method, dtype=numpy.float32) -> None: 150 | self._acoustic_feature_process = AcousticFeatureProcess( 151 | frame_period=frame_period, 152 | order=order, 153 | alpha=alpha, 154 | f0_estimating_method=f0_estimating_method, 155 | ) 156 | self._dtype = dtype 157 | self._alpha = alpha 158 | 159 | def __call__(self, data: Wave, test): 160 | acoustic_feature = self._acoustic_feature_process(data, test=True).astype_only_float(self._dtype) 161 | high_spectrogram = acoustic_feature.spectrogram 162 | 163 | fftlen = pyworld.get_cheaptrick_fft_size(data.sampling_rate) 164 | low_spectrogram = pysptk.mc2sp( 165 | acoustic_feature.mfcc, 166 | alpha=self._alpha, 167 | fftlen=fftlen, 168 | ) 169 | 170 | feature = LowHighSpectrogramFeature( 171 | low=low_spectrogram, 172 | high=high_spectrogram, 173 | ) 174 | feature.validate() 175 | return feature 176 | 177 | 178 | class AcousticFeatureLoadProcess(BaseDataProcess): 179 | def __init__(self, validate=False) -> None: 180 | self._validate = validate 181 | 182 | def __call__(self, path: Path, test=None): 183 | d: Dict[str, Any] = numpy.load(path.expanduser(), allow_pickle=True).item() 184 | feature = AcousticFeature( 185 | f0=d['f0'], 186 | spectrogram=d['spectrogram'], 187 | aperiodicity=d['aperiodicity'], 188 | mfcc=d['mfcc'], 189 | voiced=d['voiced'], 190 | ) 191 | if self._validate: 192 | feature.validate() 193 | return feature 194 | 195 | 196 | class LowHighSpectrogramFeatureLoadProcess(BaseDataProcess): 197 | def __init__(self, validate=False) -> None: 198 | self._validate = validate 199 | 200 | def __call__(self, path: Path, test=None): 201 | d: Dict[str, Any] = numpy.load(path.expanduser(), allow_pickle=True).item() 202 | feature = LowHighSpectrogramFeature( 203 | low=d['low'], 204 | high=d['high'], 205 | ) 206 | if self._validate: 207 | feature.validate() 208 | return feature 209 | 210 | 211 | class AcousticFeatureSaveProcess(BaseDataProcess): 212 | def __init__(self, validate=False, ignore: List[str] = None) -> None: 213 | self._validate = validate 214 | self._ignore = ignore if ignore is not None else [] 215 | 216 | def __call__(self, data: Dict[str, Any], test=None): 217 | path = data['path'] # type: Path 218 | feature = data['feature'] # type: AcousticFeature 219 | if self._validate: 220 | feature.validate() 221 | 222 | d = dict( 223 | f0=feature.f0, 224 | spectrogram=feature.spectrogram, 225 | aperiodicity=feature.aperiodicity, 226 | mfcc=feature.mfcc, 227 | voiced=feature.voiced, 228 | ) 229 | for k in self._ignore: 230 | assert k in d 231 | d[k] = numpy.nan 232 | 233 | numpy.save(path.absolute(), d) 234 | 235 | 236 | class DistillateUsingFeatureProcess(BaseDataProcess): 237 | def __init__(self, targets: List[str]) -> None: 238 | self._targets = targets 239 | 240 | def __call__(self, feature: AcousticFeature, test=None): 241 | d = defaultdict(lambda: numpy.nan, **{t: getattr(feature, t) for t in self._targets}) 242 | return AcousticFeature( 243 | f0=d['f0'], 244 | spectrogram=d['spectrogram'], 245 | aperiodicity=d['aperiodicity'], 246 | mfcc=d['mfcc'], 247 | voiced=d['voiced'], 248 | ) 249 | 250 | 251 | class MakeMaskProcess(BaseDataProcess): 252 | def __init__(self) -> None: 253 | pass 254 | 255 | def __call__(self, feature: AcousticFeature, test=None): 256 | return AcousticFeature( 257 | f0=feature.voiced, 258 | spectrogram=numpy.ones_like(feature.spectrogram, 
dtype=numpy.bool), 259 | aperiodicity=numpy.ones_like(feature.aperiodicity, dtype=numpy.bool), 260 | mfcc=numpy.ones_like(feature.mfcc, dtype=numpy.bool), 261 | voiced=numpy.ones_like(feature.voiced, dtype=numpy.bool), 262 | ).astype(numpy.float32) 263 | 264 | 265 | class AcousticFeatureNormalizeProcess(BaseDataProcess): 266 | def __init__(self, mean: AcousticFeature, var: AcousticFeature) -> None: 267 | self._mean = mean 268 | self._var = var 269 | 270 | def __call__(self, data: AcousticFeature, test=None): 271 | f0 = (data.f0 - self._mean.f0) / numpy.sqrt(self._var.f0) 272 | f0[~data.voiced] = 0 273 | return AcousticFeature( 274 | f0=f0, 275 | spectrogram=(data.spectrogram - self._mean.spectrogram) / numpy.sqrt(self._var.spectrogram), 276 | aperiodicity=(data.aperiodicity - self._mean.aperiodicity) / numpy.sqrt(self._var.aperiodicity), 277 | mfcc=(data.mfcc - self._mean.mfcc) / numpy.sqrt(self._var.mfcc), 278 | voiced=data.voiced, 279 | ) 280 | 281 | 282 | class AcousticFeatureDenormalizeProcess(BaseDataProcess): 283 | def __init__(self, mean: AcousticFeature, var: AcousticFeature) -> None: 284 | self._mean = mean 285 | self._var = var 286 | 287 | def __call__(self, data: AcousticFeature, test=None): 288 | f0 = data.f0 * numpy.sqrt(self._var.f0) + self._mean.f0 289 | f0[~data.voiced] = 0 290 | return AcousticFeature( 291 | f0=f0, 292 | spectrogram=data.spectrogram * numpy.sqrt(self._var.spectrogram) + self._mean.spectrogram, 293 | aperiodicity=data.aperiodicity * numpy.sqrt(self._var.aperiodicity) + self._mean.aperiodicity, 294 | mfcc=data.mfcc * numpy.sqrt(self._var.mfcc) + self._mean.mfcc, 295 | voiced=data.voiced, 296 | ) 297 | 298 | 299 | class EncodeFeatureProcess(BaseDataProcess): 300 | def __init__(self, targets: List[str]) -> None: 301 | self._targets = targets 302 | 303 | def __call__(self, data: AcousticFeature, test): 304 | feature = numpy.concatenate([getattr(data, t) for t in self._targets], axis=1) 305 | feature = feature.T 306 | return feature 307 | 308 | 309 | class DecodeFeatureProcess(BaseDataProcess): 310 | def __init__(self, targets: List[str], sizes: Dict[str, int]) -> None: 311 | assert all(t in sizes for t in targets) 312 | self._targets = targets 313 | self._sizes = sizes 314 | 315 | def __call__(self, data: numpy.ndarray, test): 316 | data = data.T 317 | 318 | lasts = numpy.cumsum([self._sizes[t] for t in self._targets]).tolist() 319 | assert data.shape[1] == lasts[-1] 320 | 321 | d = defaultdict(lambda: numpy.nan, **{ 322 | t: data[:, bef:aft] 323 | for t, bef, aft in zip(self._targets, [0] + lasts[:-1], lasts) 324 | }) 325 | return AcousticFeature( 326 | f0=d['f0'], 327 | spectrogram=d['spectrogram'], 328 | aperiodicity=d['aperiodicity'], 329 | mfcc=d['mfcc'], 330 | voiced=d['voiced'], 331 | ) 332 | 333 | 334 | class ShapeAlignProcess(BaseDataProcess): 335 | def __call__(self, data, test): 336 | data1, data2, data3 = data['input'], data['target'], data['mask'] 337 | m = max(data1.shape[1], data2.shape[1], data3.shape[1]) 338 | data1 = numpy.pad(data1, ((0, 0), (0, m - data1.shape[1])), mode='constant') 339 | data2 = numpy.pad(data2, ((0, 0), (0, m - data2.shape[1])), mode='constant') 340 | data3 = numpy.pad(data3, ((0, 0), (0, m - data3.shape[1])), mode='constant') 341 | data['input'], data['target'], data['mask'] = data1, data2, data3 342 | return data 343 | 344 | 345 | class RandomPaddingProcess(BaseDataProcess): 346 | def __init__(self, min_size: int, time_axis: int = 1) -> None: 347 | self._min_size = min_size 348 | self._time_axis = time_axis 349 | 350 | 
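# Randomly distributes padding before/after the data along the time axis so the result is at least min_size frames long; create() and create_sr() feed the same per-sample seed to each stream (input/target/mask), which all share the same length at this point, so every stream receives the same split and they stay aligned.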
def __call__(self, datas: Dict[str, Any], test=True): 351 | assert not test 352 | 353 | data, seed = datas['data'], datas['seed'] 354 | random = numpy.random.RandomState(seed) 355 | 356 | if data.shape[self._time_axis] >= self._min_size: 357 | return data 358 | 359 | pre = random.randint(self._min_size - data.shape[self._time_axis] + 1) 360 | post = self._min_size - pre 361 | pad = [(0, 0)] * data.ndim 362 | pad[self._time_axis] = (pre, post) 363 | return numpy.pad(data, pad, mode='constant') 364 | 365 | 366 | class LastPaddingProcess(BaseDataProcess): 367 | def __init__(self, min_size: int, time_axis: int = 1) -> None: 368 | assert time_axis == 1 369 | self._min_size = min_size 370 | self._time_axis = time_axis 371 | 372 | def __call__(self, data: numpy.ndarray, test=None): 373 | if data.shape[self._time_axis] >= self._min_size: 374 | return data 375 | 376 | pre = self._min_size - data.shape[self._time_axis] 377 | return numpy.pad(data, ((0, 0), (pre, 0)), mode='constant') 378 | 379 | 380 | class RandomCropProcess(BaseDataProcess): 381 | def __init__(self, crop_size: int, time_axis: int = 1) -> None: 382 | self._crop_size = crop_size 383 | self._time_axis = time_axis 384 | 385 | def __call__(self, datas: Dict[str, Any], test=True): 386 | assert not test 387 | 388 | data, seed = datas['data'], datas['seed'] 389 | random = numpy.random.RandomState(seed) 390 | 391 | len_time = data.shape[self._time_axis] 392 | assert len_time >= self._crop_size 393 | 394 | start = random.randint(len_time - self._crop_size + 1) 395 | return numpy.split(data, [start, start + self._crop_size], axis=self._time_axis)[1] 396 | 397 | 398 | class FirstCropProcess(BaseDataProcess): 399 | def __init__(self, crop_size: int, time_axis: int = 1) -> None: 400 | self._crop_size = crop_size 401 | self._time_axis = time_axis 402 | 403 | def __call__(self, data: numpy.ndarray, test=None): 404 | return numpy.split(data, [0, self._crop_size], axis=self._time_axis)[1] 405 | 406 | 407 | class AddNoiseProcess(BaseDataProcess): 408 | def __init__(self, p_global: float = None, p_local: float = None) -> None: 409 | assert p_global is None or 0 <= p_global 410 | assert p_local is None or 0 <= p_local 411 | self._p_global = p_global 412 | self._p_local = p_local 413 | 414 | def __call__(self, data: numpy.ndarray, test): 415 | assert not test 416 | 417 | g = numpy.random.randn() * self._p_global 418 | l = numpy.random.randn(*data.shape).astype(data.dtype) * self._p_local 419 | return data + g + l 420 | 421 | 422 | class RandomBlurProcess(BaseDataProcess): 423 | def __init__(self, blur_size_factor: float, time_axis: int = 1) -> None: 424 | assert time_axis == 1 425 | self._blur_size_factor = blur_size_factor 426 | self._time_axis = time_axis 427 | 428 | def __call__(self, data: numpy.ndarray, test=None): 429 | assert not test 430 | 431 | blur_size = numpy.abs(numpy.random.randn()) * self._blur_size_factor 432 | return scipy.ndimage.gaussian_filter(data, (0, blur_size)) 433 | 434 | 435 | class DataProcessDataset(chainer.dataset.DatasetMixin): 436 | def __init__(self, data: typing.List, data_process: BaseDataProcess) -> None: 437 | self._data = data 438 | self._data_process = data_process 439 | 440 | def __len__(self): 441 | return len(self._data) 442 | 443 | def get_example(self, i): 444 | return self._data_process(data=self._data[i], test=not chainer.config.train) 445 | 446 | 447 | def create(config: DatasetConfig): 448 | acoustic_feature_load_process = AcousticFeatureLoadProcess() 449 | input_mean = 
acoustic_feature_load_process(config.input_mean_path, test=True) 450 | input_var = acoustic_feature_load_process(config.input_var_path, test=True) 451 | target_mean = acoustic_feature_load_process(config.target_mean_path, test=True) 452 | target_var = acoustic_feature_load_process(config.target_var_path, test=True) 453 | 454 | # {input_path, target_path} 455 | data_process_base = ChainProcess([ 456 | SplitProcess(dict( 457 | input=ChainProcess([ 458 | LambdaProcess(lambda d, test: d['input_path']), 459 | acoustic_feature_load_process, 460 | DistillateUsingFeatureProcess(config.features + ['voiced']), 461 | AcousticFeatureNormalizeProcess(mean=input_mean, var=input_var), 462 | EncodeFeatureProcess(config.features), 463 | ]), 464 | target=ChainProcess([ 465 | LambdaProcess(lambda d, test: d['target_path']), 466 | acoustic_feature_load_process, 467 | DistillateUsingFeatureProcess(config.features + ['voiced']), 468 | AcousticFeatureNormalizeProcess(mean=target_mean, var=target_var), 469 | SplitProcess(dict( 470 | feature=EncodeFeatureProcess(config.features), 471 | mask=ChainProcess([ 472 | MakeMaskProcess(), 473 | EncodeFeatureProcess(config.features), 474 | ]) 475 | )), 476 | ]), 477 | )), 478 | LambdaProcess( 479 | lambda d, test: dict(input=d['input'], target=d['target']['feature'], mask=d['target']['mask'])), 480 | ShapeAlignProcess(), 481 | ]) 482 | 483 | data_process_train = copy.deepcopy(data_process_base) 484 | 485 | # cropping 486 | if config.train_crop_size is not None: 487 | def add_seed(): 488 | return LambdaProcess(lambda d, test: dict(seed=numpy.random.randint(2 ** 31), **d)) 489 | 490 | def padding(s): 491 | return ChainProcess([ 492 | LambdaProcess(lambda d, test: dict(data=d[s], seed=d['seed'])), 493 | RandomPaddingProcess(min_size=config.train_crop_size), 494 | ]) 495 | 496 | def crop(s): 497 | return ChainProcess([ 498 | LambdaProcess(lambda d, test: dict(data=d[s], seed=d['seed'])), 499 | RandomCropProcess(crop_size=config.train_crop_size), 500 | ]) 501 | 502 | data_process_train.append(ChainProcess([ 503 | add_seed(), 504 | SplitProcess(dict(input=padding('input'), target=padding('target'), mask=padding('mask'))), 505 | add_seed(), 506 | SplitProcess(dict(input=crop('input'), target=crop('target'), mask=crop('mask'))), 507 | ])) 508 | 509 | # add noise 510 | data_process_train.append(SplitProcess(dict( 511 | input=ChainProcess([ 512 | LambdaProcess(lambda d, test: d['input']), 513 | AddNoiseProcess(p_global=config.input_global_noise, p_local=config.input_local_noise), 514 | ]), 515 | target=ChainProcess([ 516 | LambdaProcess(lambda d, test: d['target']), 517 | AddNoiseProcess(p_global=config.target_global_noise, p_local=config.target_local_noise), 518 | ]), 519 | mask=ChainProcess([ 520 | LambdaProcess(lambda d, test: d['mask']), 521 | ]), 522 | ))) 523 | 524 | data_process_test = copy.deepcopy(data_process_base) 525 | if config.train_crop_size is not None: 526 | data_process_test.append(SplitProcess(dict( 527 | input=ChainProcess([ 528 | LambdaProcess(lambda d, test: d['input']), 529 | LastPaddingProcess(min_size=config.train_crop_size), 530 | FirstCropProcess(crop_size=config.train_crop_size), 531 | ]), 532 | target=ChainProcess([ 533 | LambdaProcess(lambda d, test: d['target']), 534 | LastPaddingProcess(min_size=config.train_crop_size), 535 | FirstCropProcess(crop_size=config.train_crop_size), 536 | ]), 537 | mask=ChainProcess([ 538 | LambdaProcess(lambda d, test: d['mask']), 539 | LastPaddingProcess(min_size=config.train_crop_size), 540 | 
FirstCropProcess(crop_size=config.train_crop_size), 541 | ]), 542 | ))) 543 | 544 | input_paths = list(sorted([Path(p) for p in glob.glob(str(config.input_glob))])) 545 | target_paths = list(sorted([Path(p) for p in glob.glob(str(config.target_glob))])) 546 | assert len(input_paths) == len(target_paths) 547 | 548 | num_test = config.num_test 549 | pairs = [ 550 | dict(input_path=input_path, target_path=target_path) 551 | for input_path, target_path in zip(input_paths, target_paths) 552 | ] 553 | numpy.random.RandomState(config.seed).shuffle(pairs) 554 | train_paths = pairs[num_test:] 555 | test_paths = pairs[:num_test] 556 | train_for_evaluate_paths = train_paths[:num_test] 557 | 558 | return { 559 | 'train': DataProcessDataset(train_paths, data_process_train), 560 | 'test': DataProcessDataset(test_paths, data_process_test), 561 | 'train_eval': DataProcessDataset(train_for_evaluate_paths, data_process_test), 562 | } 563 | 564 | 565 | def create_sr(config: SRDatasetConfig): 566 | data_process_base = ChainProcess([ 567 | LowHighSpectrogramFeatureLoadProcess(validate=True), 568 | SplitProcess(dict( 569 | input=LambdaProcess(lambda d, test: numpy.log(d.low[:, :-1])), 570 | target=LambdaProcess(lambda d, test: numpy.log(d.high[:, :-1])), 571 | )), 572 | ]) 573 | 574 | data_process_train = copy.deepcopy(data_process_base) 575 | 576 | # blur 577 | data_process_train.append(SplitProcess(dict( 578 | input=ChainProcess([ 579 | LambdaProcess(lambda d, test: d['input']), 580 | RandomBlurProcess(blur_size_factor=config.blur_size_factor), 581 | ]), 582 | target=ChainProcess([ 583 | LambdaProcess(lambda d, test: d['target']), 584 | ]), 585 | ))) 586 | 587 | # cropping 588 | if config.train_crop_size is not None: 589 | def add_seed(): 590 | return LambdaProcess(lambda d, test: dict(seed=numpy.random.randint(2 ** 31), **d)) 591 | 592 | def padding(s): 593 | return ChainProcess([ 594 | LambdaProcess(lambda d, test: dict(data=d[s], seed=d['seed'])), 595 | RandomPaddingProcess(min_size=config.train_crop_size, time_axis=0), 596 | ]) 597 | 598 | def crop(s): 599 | return ChainProcess([ 600 | LambdaProcess(lambda d, test: dict(data=d[s], seed=d['seed'])), 601 | RandomCropProcess(crop_size=config.train_crop_size, time_axis=0), 602 | ]) 603 | 604 | data_process_train.append(ChainProcess([ 605 | add_seed(), 606 | SplitProcess(dict(input=padding('input'), target=padding('target'))), 607 | add_seed(), 608 | SplitProcess(dict(input=crop('input'), target=crop('target'))), 609 | ])) 610 | 611 | # add noise 612 | data_process_train.append(SplitProcess(dict( 613 | input=ChainProcess([ 614 | LambdaProcess(lambda d, test: d['input']), 615 | AddNoiseProcess(p_global=config.input_global_noise, p_local=config.input_local_noise), 616 | ]), 617 | target=ChainProcess([ 618 | LambdaProcess(lambda d, test: d['target']), 619 | ]), 620 | ))) 621 | 622 | data_process_train.append(LambdaProcess(lambda d, test: { 623 | 'input': d['input'][numpy.newaxis], 624 | 'target': d['target'][numpy.newaxis], 625 | })) 626 | 627 | data_process_test = copy.deepcopy(data_process_base) 628 | if config.train_crop_size is not None: 629 | data_process_test.append(SplitProcess(dict( 630 | input=ChainProcess([ 631 | LambdaProcess(lambda d, test: d['input']), 632 | LastPaddingProcess(min_size=config.train_crop_size), 633 | FirstCropProcess(crop_size=config.train_crop_size, time_axis=0), 634 | ]), 635 | target=ChainProcess([ 636 | LambdaProcess(lambda d, test: d['target']), 637 | LastPaddingProcess(min_size=config.train_crop_size), 638 | 
FirstCropProcess(crop_size=config.train_crop_size, time_axis=0), 639 | ]), 640 | ))) 641 | 642 | data_process_test.append(LambdaProcess(lambda d, test: { 643 | 'input': d['input'][numpy.newaxis], 644 | 'target': d['target'][numpy.newaxis], 645 | })) 646 | 647 | input_paths = list(sorted([Path(p) for p in glob.glob(str(config.input_glob))])) 648 | 649 | num_test = config.num_test 650 | numpy.random.RandomState(config.seed).shuffle(input_paths) 651 | train_paths = input_paths[num_test:] 652 | test_paths = input_paths[:num_test] 653 | train_for_evaluate_paths = train_paths[:num_test] 654 | 655 | return { 656 | 'train': DataProcessDataset(train_paths, data_process_train), 657 | 'test': DataProcessDataset(test_paths, data_process_test), 658 | 'train_eval': DataProcessDataset(train_for_evaluate_paths, data_process_test), 659 | } 660 | --------------------------------------------------------------------------------
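For orientation, a minimal sketch of how these datasets are typically consumed in training with Chainer's standard iterators. The loader name create_from_json, the config path, and the config.dataset / config.train.batchsize fields are assumptions for illustration and are not guaranteed by the code shown here; create() and DataProcessDataset are the real entry points defined above.

    import chainer
    from become_yukarin.config.config import create_from_json  # assumed loader name
    from become_yukarin.dataset import create

    config = create_from_json('recipe/config.json')
    datasets = create(config.dataset)  # keys: 'train', 'test', 'train_eval'
    train_iter = chainer.iterators.SerialIterator(datasets['train'], batch_size=config.train.batchsize)
    batch = chainer.dataset.convert.concat_examples(next(train_iter))
    # each value is stacked to (batch, features, time): batch['input'], batch['target'], batch['mask']

The repository's train.py presumably wires up the equivalent; the sketch only illustrates the dataset side.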