├── tests ├── __init__.py ├── test-deep-learning-yuduki-yukari.wav └── test_dataset.py ├── become_yukarin ├── model │ ├── __init__.py │ ├── sr_model.py │ ├── model.py │ └── cbhg_model.py ├── config │ ├── __init__.py │ ├── old_config.py │ ├── sr_config.py │ └── config.py ├── updater │ ├── __init__.py │ ├── sr_updater.py │ └── updater.py ├── dataset │ ├── __init__.py │ ├── utility.py │ └── dataset.py ├── __init__.py ├── param.py ├── voice_changer.py ├── vocoder.py ├── data_struct.py ├── super_resolution.py └── acoustic_converter.py ├── requirements.txt ├── recipe ├── recipe.json ├── config_sr.json └── config.json ├── scripts ├── ln_atr503_to_subset.py ├── ln_jnas_subset.py ├── ln_apply_subset.py ├── voice_conversion_test.py ├── extract_spectrogram_pair.py ├── super_resolution_test.py ├── launch.py └── extract_acoustic_feature.py ├── setup.py ├── LICENSE ├── README_jp.md ├── README.md ├── train.py └── train_sr.py /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /become_yukarin/model/__init__.py: -------------------------------------------------------------------------------- 1 | from . import model 2 | from . import sr_model 3 | -------------------------------------------------------------------------------- /become_yukarin/config/__init__.py: -------------------------------------------------------------------------------- 1 | from . import config 2 | from . import sr_config 3 | -------------------------------------------------------------------------------- /become_yukarin/updater/__init__.py: -------------------------------------------------------------------------------- 1 | from . import sr_updater 2 | from . import updater 3 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy 2 | cupy<6.0.0 3 | chainer<6.0.0 4 | librosa<0.7.0 5 | pysptk 6 | pyworld 7 | fastdtw 8 | matplotlib 9 | tqdm 10 | -------------------------------------------------------------------------------- /tests/test-deep-learning-yuduki-yukari.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Hiroshiba/become-yukarin/HEAD/tests/test-deep-learning-yuduki-yukari.wav -------------------------------------------------------------------------------- /become_yukarin/dataset/__init__.py: -------------------------------------------------------------------------------- 1 | from . import dataset 2 | from . import utility 3 | from .dataset import create 4 | from .dataset import create_sr 5 | -------------------------------------------------------------------------------- /become_yukarin/__init__.py: -------------------------------------------------------------------------------- 1 | from . import config 2 | from . import dataset 3 | from . 
import param 4 | from .acoustic_converter import AcousticConverter 5 | from .super_resolution import SuperResolution 6 | from .vocoder import RealtimeVocoder 7 | from .vocoder import Vocoder 8 | from .voice_changer import VoiceChanger 9 | -------------------------------------------------------------------------------- /become_yukarin/param.py: -------------------------------------------------------------------------------- 1 | from typing import NamedTuple 2 | 3 | 4 | class VoiceParam(NamedTuple): 5 | sample_rate: int = 24000 6 | top_db: float = None 7 | pad_second: float = 0.0 8 | 9 | 10 | class AcousticFeatureParam(NamedTuple): 11 | frame_period: int = 5 12 | order: int = 8 13 | alpha: float = 0.466 14 | f0_estimating_method: str = 'harvest' # dio / harvest 15 | 16 | 17 | class Param(NamedTuple): 18 | voice_param: VoiceParam = VoiceParam() 19 | acoustic_feature_param: AcousticFeatureParam = AcousticFeatureParam() 20 | -------------------------------------------------------------------------------- /recipe/recipe.json: -------------------------------------------------------------------------------- 1 | { 2 | "each": { 3 | "model/generator_extensive_layers": [ 4 | 8, 5 | 0, 6 | 8, 7 | 0 8 | ], 9 | "model/discriminator_extensive_layers": [ 10 | 5, 11 | 0, 12 | 5, 13 | 0 14 | ], 15 | "model/weak_discriminator": [ 16 | true, 17 | true, 18 | false, 19 | false 20 | ], 21 | "train/gpu": [ 22 | 0, 23 | 1, 24 | 2, 25 | 3 26 | ], 27 | "project/name": [ 28 | "pp-weakD-el8", 29 | "pp-weakD-el0", 30 | "pp-el8", 31 | "pp-el0" 32 | ] 33 | }, 34 | "all": { 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /recipe/config_sr.json: -------------------------------------------------------------------------------- 1 | { 2 | "dataset": { 3 | "input_glob": "./feature/*.npy", 4 | "num_test": 5, 5 | "input_global_noise": 3, 6 | "input_local_noise": 3, 7 | "blur_size_factor": 0, 8 | "seed": 0, 9 | "train_crop_size": 512, 10 | "generator_base_channels": 64, 11 | "generator_extensive_layers": 8, 12 | "discriminator_base_channels": 32, 13 | "discriminator_extensive_layers": 5 14 | }, 15 | "loss": { 16 | "mse": 100, 17 | "adversarial": 1 18 | }, 19 | "model": { 20 | }, 21 | "project": { 22 | "name": "", 23 | "tags": [] 24 | }, 25 | "train": { 26 | "batchsize": 8, 27 | "gpu": 0, 28 | "log_iteration": 250, 29 | "snapshot_iteration": 5000 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /scripts/ln_atr503_to_subset.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from pathlib import Path 3 | 4 | parser = argparse.ArgumentParser() 5 | parser.add_argument('input', type=Path) 6 | parser.add_argument('output', type=Path) 7 | parser.add_argument('--prefix', default='') 8 | argument = parser.parse_args() 9 | 10 | input = argument.input # type: Path 11 | output = argument.output # type: Path 12 | 13 | paths = list(sorted(input.glob('*'), key=lambda p: int(''.join(filter(str.isdigit, p.name))))) 14 | assert len(paths) == 503 15 | 16 | output.mkdir(exist_ok=True) 17 | 18 | names = ['{}{:02d}'.format(s, n + 1) for s in 'ABCDEFGHIJ' for n in range(50)] 19 | names += ['J51', 'J52', 'J53'] 20 | 21 | for p, n in zip(paths, names): 22 | out = output / (argument.prefix + n + p.suffix) 23 | out.symlink_to(p) 24 | -------------------------------------------------------------------------------- /setup.py: 
-------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | setup( 4 | name='become_yukarin', 5 | version='1.0.0', 6 | packages=find_packages(), 7 | url='https://github.com/Hiroshiba/become-yukarin', 8 | author='Kazuyuki Hiroshiba', 9 | author_email='hihokaruta@gmail.com', 10 | description='become Yuduki Yukari with DeepLearning power.', 11 | license='MIT License', 12 | install_requires=[ 13 | 'numpy', 14 | 'chainer', 15 | 'librosa', 16 | 'pysptk', 17 | 'pyworld', 18 | 'fastdtw', 19 | 'chainerui', 20 | ], 21 | classifiers=[ 22 | 'Programming Language :: Python :: 3.5', 23 | 'Programming Language :: Python :: 3.6', 24 | 'License :: OSI Approved :: MIT License', 25 | ] 26 | ) 27 | -------------------------------------------------------------------------------- /become_yukarin/config/old_config.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | from typing import NamedTuple 3 | from typing import Optional 4 | 5 | 6 | class CBHGDiscriminatorModelConfig(NamedTuple): 7 | in_channels: int 8 | hidden_channels_list: List[int] 9 | 10 | 11 | class CBHGModelConfig(NamedTuple): 12 | in_channels: int 13 | conv_bank_out_channels: int 14 | conv_bank_k: int 15 | max_pooling_k: int 16 | conv_projections_hidden_channels: int 17 | highway_layers: int 18 | out_channels: int 19 | out_size: int 20 | aligner_out_time_length: int 21 | disable_last_rnn: bool 22 | enable_aligner: bool 23 | discriminator: Optional[CBHGDiscriminatorModelConfig] 24 | 25 | 26 | class CBHGLossConfig(NamedTuple): 27 | l1: float 28 | predictor_fake: float 29 | discriminator_true: float 30 | discriminator_fake: float 31 | discriminator_grad: float 32 | -------------------------------------------------------------------------------- /scripts/ln_jnas_subset.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import multiprocessing 3 | from pathlib import Path 4 | 5 | from jnas_metadata_loader import load_from_directory 6 | from jnas_metadata_loader.jnas_metadata import JnasMetadata 7 | 8 | parser = argparse.ArgumentParser() 9 | parser.add_argument('jnas', type=Path) 10 | parser.add_argument('output', type=Path) 11 | parser.add_argument('--format', default='{sex}{text_id}_{mic}_atr_{subset}{sen_id}.wav') 12 | argument = parser.parse_args() 13 | 14 | jnas = argument.jnas # type: Path 15 | output = argument.output # type: Path 16 | 17 | jnas_list = load_from_directory(str(jnas)) 18 | atr_list = jnas_list.subset_news_or_atr('B') 19 | 20 | output.mkdir(exist_ok=True) 21 | 22 | 23 | def process(d: JnasMetadata): 24 | p = d.path 25 | out = output / argument.format.format(**d._asdict()) 26 | out.symlink_to(p) 27 | 28 | 29 | pool = multiprocessing.Pool() 30 | pool.map(process, atr_list) 31 | -------------------------------------------------------------------------------- /become_yukarin/voice_changer.py: -------------------------------------------------------------------------------- 1 | import numpy 2 | 3 | from .acoustic_converter import AcousticConverter 4 | from .data_struct import AcousticFeature 5 | from .super_resolution import SuperResolution 6 | 7 | 8 | class VoiceChanger(object): 9 | def __init__( 10 | self, 11 | acoustic_converter: AcousticConverter, 12 | super_resolution: SuperResolution, 13 | output_sampling_rate: int = None, 14 | ) -> None: 15 | if output_sampling_rate is None: 16 | output_sampling_rate = 
super_resolution.config.dataset.param.voice_param.sample_rate 17 | 18 | self.acoustic_converter = acoustic_converter 19 | self.super_resolution = super_resolution 20 | self.output_sampling_rate = output_sampling_rate 21 | 22 | def convert_from_acoustic_feature(self, f_in: AcousticFeature): 23 | f_low = self.acoustic_converter.convert_to_feature(f_in) 24 | s_high = self.super_resolution.convert(f_low.spectrogram.astype(numpy.float32)) 25 | f_high = self.super_resolution.convert_to_feature(s_high, f_low) 26 | return f_high 27 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2018 Kazuyuki Hiroshiba. 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in all 11 | copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 19 | SOFTWARE. 
20 | -------------------------------------------------------------------------------- /recipe/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "dataset": { 3 | "features": [ 4 | "f0", 5 | "mfcc" 6 | ], 7 | "input_glob": "/hiho-pause/hiho-pause-atr-*.npy", 8 | "input_global_noise": 0.01, 9 | "input_local_noise": 0.01, 10 | "input_mean_path": "/hiho-pause/mean.npy", 11 | "input_var_path": "/hiho-pause/var.npy", 12 | "num_test": 1, 13 | "seed": 0, 14 | "target_glob": "/yukari-pause/yukari-pause-atr-*.npy", 15 | "target_global_noise": 0.01, 16 | "target_local_noise": 0.01, 17 | "target_mean_path": "/yukari-pause/mean.npy", 18 | "target_var_path": "/yukari-pause/var.npy", 19 | "train_crop_size": 512 20 | }, 21 | "loss": { 22 | "adversarial": 1, 23 | "mse": 100 24 | }, 25 | "model": { 26 | "in_channels": 10, 27 | "out_channels": 10, 28 | "generator_base_channels": 64, 29 | "generator_extensive_layers": 8, 30 | "discriminator_base_channels": 32, 31 | "discriminator_extensive_layers": 5, 32 | "weak_discriminator": false 33 | }, 34 | "project": { 35 | "name": "", 36 | "tags": [] 37 | }, 38 | "train": { 39 | "batchsize": 8, 40 | "gpu": 0, 41 | "log_iteration": 250, 42 | "snapshot_iteration": 5000 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /scripts/ln_apply_subset.py: -------------------------------------------------------------------------------- 1 | """ 2 | ある話者のATR503サブセットを、他の話者に対応するようにコピーする。 3 | targetは、拡張子前3文字がATR503サブセットでないといけない。 4 | """ 5 | 6 | import argparse 7 | from pathlib import Path 8 | import re 9 | from itertools import chain, groupby 10 | 11 | parser = argparse.ArgumentParser() 12 | parser.add_argument('source', type=Path) 13 | parser.add_argument('target', type=Path) 14 | parser.add_argument('output', type=Path) 15 | parser.add_argument('--prefix', default='') 16 | argument = parser.parse_args() 17 | 18 | source = argument.source # type: Path 19 | target = argument.target # type: Path 20 | output = argument.output # type: Path 21 | 22 | # source 23 | sources = list(sorted(source.glob('*'))) 24 | assert len(sources) == 503 25 | 26 | names = ['{}{:02d}'.format(s, n + 1) for s in 'ABCDEFGHIJ' for n in range(50)] 27 | names += ['J51', 'J52', 'J53'] 28 | 29 | assert all(n in s.name for s, n in zip(sources, names)) 30 | 31 | map_source = {n: s for s, n in zip(sources, names)} 32 | 33 | # target 34 | keyfunc = lambda t: t.stem[-3:] 35 | targets = list(target.glob('*')) 36 | map_targets = {n: list(vs) for n, vs in groupby(sorted(targets, key=keyfunc), key=keyfunc)} 37 | 38 | assert all(n in names for n in map_targets.keys()) 39 | assert len(list(chain.from_iterable(map_targets.values()))) == len(targets) 40 | 41 | # output 42 | output.mkdir(exist_ok=True) 43 | 44 | for n in names: 45 | s = map_source[n] 46 | for t in map_targets[n]: 47 | out = output / (argument.prefix + t.stem + s.suffix) 48 | out.symlink_to(s) 49 | -------------------------------------------------------------------------------- /README_jp.md: -------------------------------------------------------------------------------- 1 | # Become Yukarin: 誰でも好きなキャラの声に 2 | Become Yukarinは、機械学習(ディープラーニング)で声質変換を実現するリポジトリです。 3 | 元の声と好きな声の音声データを大量に用いて機械学習することで、 4 | 元の声を好きな声に変換することができるようになります。 5 | 6 | [English README](./README.md) 7 | 8 | ## 推奨環境 9 | * Linux OS 10 | * Python 3.6 11 | 12 | ## 準備 13 | ```bash 14 | # 必要なライブラリをインストール 15 | pip install -r requirements.txt 16 | ``` 17 | 18 | ## 学習させる 19 | 
学習用のPythonスクリプトを実行するには、`become_yukarin`ライブラリをパス(PYTHONPATH)に通す必要があります。 20 | 例えば`scripts/extract_acoustic_feature.py`を以下のように書いて、パスを通しつつ実行します。 21 | 22 | ```bash 23 | PYTHONPATH=`pwd` python scripts/extract_acoustic_feature.py --- 24 | ``` 25 | 26 | ### 第1段階の学習 27 | * 音声データを用意する 28 | * 2つのディレクトリに、入出力の音声データを置く(ファイル名を揃える) 29 | * 音響特徴量を作成する 30 | * `scripts/extract_acoustic_feature.py` 31 | * 学習を回す 32 | * `train.py` 33 | * テストする 34 | * `scripts/voice_conversion_test.py` 35 | 36 | ### 第2段階の学習 37 | * 音声データを用意する 38 | * 1つのディレクトリに音声データを置く 39 | * 音響特徴量を作成する 40 | * `scripts/extract_spectrogram_pair.py` 41 | * 学習を回す 42 | * `train_sr.py` 43 | * テストする 44 | * `scripts/super_resolution_test.py` 45 | * 別の音声データを変換する 46 | * SuperResolutionクラスとAcousticConverterクラスを使うことで変換できます 47 | * [サンプルコード](https://github.com/Hiroshiba/become-yukarin/blob/ipynb/show%20vc%20and%20sr.ipynb) 48 | 49 | ## 参考 50 | * [ipynbブランチ](https://github.com/Hiroshiba/become-yukarin/tree/ipynb)に大量にサンプルが置いてあります 51 | * [解説ブログ](https://hiroshiba.github.io/blog/became-yuduki-yukari-with-deep-learning-power/) 52 | * [Realtime Yukarin](https://github.com/Hiroshiba/realtime-yukarin)を使うことで、リアルタイムに声質変換することができます 53 | 54 | ## License 55 | [MIT License](./LICENSE) 56 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Become Yukarin: Convert your voice to favorite voice 2 | Become Yukarin is a repository for voice conversion with a Deep Learning model. 3 | By training on a large amount of voice data from both the original voice and the favorite voice, 4 | the Deep Learning model can convert the original voice into the favorite voice. 5 | 6 | [Japanese README](./README_jp.md) 7 | 8 | ## Supported environment 9 | * Linux OS 10 | * Python 3.6 11 | 12 | ## Preparation 13 | ```bash 14 | # install required libraries 15 | pip install -r requirements.txt 16 | ``` 17 | 18 | ## Training 19 | To run a Python script for training, 20 | you should set the environment variable `PYTHONPATH` to find the `become_yukarin` library. 
21 | For example, you can execute `scripts/extract_acoustic_feature.py` with the following command: 22 | 23 | ```bash 24 | PYTHONPATH=`pwd` python scripts/extract_acoustic_feature.py --- 25 | ``` 26 | 27 | ## First Stage Model 28 | * Prepare voice data 29 | * Put input/target voice data in two directories (with same file names) 30 | * Create acoustic feature 31 | * `scripts/extract_acoustic_feature.py` 32 | * Train 33 | * `train.py` 34 | * Test 35 | * `scripts/voice_conversion_test.py` 36 | 37 | ## Second Stage Model 38 | * Prepare voice data 39 | * Put voice data in one directory 40 | * Create acoustic feature 41 | * `scripts/extract_spectrogram_pair.py` 42 | * Train 43 | * `train_sr.py` 44 | * Test 45 | * `scripts/super_resolution_test.py` 46 | * Convert other voice data 47 | * Use SuperResolution class and AcousticConverter class 48 | * [sample code](https://github.com/Hiroshiba/become-yukarin/blob/ipynb/show%20vc%20and%20sr.ipynb) 49 | 50 | ## Reference 51 | * [ipynb branch](https://github.com/Hiroshiba/become-yukarin/tree/ipynb): Other sample code 52 | * [Commentary Blog (Japanese)](https://hiroshiba.github.io/blog/became-yuduki-yukari-with-deep-learning-power/) 53 | * [Realtime Yukarin](https://github.com/Hiroshiba/realtime-yukarin): Real-time voice conversion system 54 | 55 | ## License 56 | [MIT License](./LICENSE) 57 | -------------------------------------------------------------------------------- /tests/test_dataset.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import numpy 4 | from become_yukarin.dataset import dataset 5 | 6 | 7 | class TestDataset(unittest.TestCase): 8 | def setUp(self): 9 | self.sample_rate = 24000 10 | self.len_time = len_time = 100 11 | self.fft_size = fft_size = 1024 12 | self.order = order = 59 13 | self.dummy_feature = dataset.AcousticFeature( 14 | f0=numpy.arange(len_time).reshape((len_time, -1)), 15 | spectrogram=numpy.arange(len_time * (fft_size // 2 + 1)).reshape((len_time, -1)), 16 | aperiodicity=numpy.arange(len_time * (fft_size // 2 + 1)).reshape((len_time, -1)), 17 | mfcc=numpy.arange(len_time * (order + 1)).reshape((len_time, -1)), 18 | voiced=(numpy.arange(len_time) % 2 == 1).reshape((len_time, -1)), 19 | ) 20 | self.feature_sizes = dataset.AcousticFeature.get_sizes( 21 | sampling_rate=self.sample_rate, 22 | order=self.order, 23 | ) 24 | 25 | def test_encode_decode_feature(self): 26 | encode_feature = dataset.EncodeFeatureProcess(['mfcc']) 27 | decode_feature = dataset.DecodeFeatureProcess(['mfcc'], self.feature_sizes) 28 | e = encode_feature(self.dummy_feature, test=True) 29 | d = decode_feature(e, test=True) 30 | self.assertTrue(numpy.all(self.dummy_feature.mfcc == d.mfcc)) 31 | 32 | def test_encode_decode_feature2(self): 33 | encode_feature = dataset.EncodeFeatureProcess(['mfcc', 'f0']) 34 | decode_feature = dataset.DecodeFeatureProcess(['mfcc', 'f0'], self.feature_sizes) 35 | e = encode_feature(self.dummy_feature, test=True) 36 | d = decode_feature(e, test=True) 37 | self.assertTrue(numpy.all(self.dummy_feature.mfcc == d.mfcc)) 38 | self.assertTrue(numpy.all(self.dummy_feature.f0 == d.f0)) 39 | 40 | def test_encode_decode_feature3(self): 41 | encode_feature = dataset.EncodeFeatureProcess(['mfcc', 'f0']) 42 | decode_feature = dataset.DecodeFeatureProcess(['mfcc', 'f0'], self.feature_sizes) 43 | e = encode_feature(self.dummy_feature, test=True) 44 | e[0] = numpy.nan 45 | d = decode_feature(e, test=True) 46 | self.assertFalse(numpy.all(self.dummy_feature.mfcc == 
d.mfcc)) 47 | self.assertTrue(numpy.all(self.dummy_feature.f0 == d.f0)) 48 | 49 | 50 | if __name__ == '__main__': 51 | unittest.main() 52 | -------------------------------------------------------------------------------- /scripts/voice_conversion_test.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import glob 3 | import multiprocessing 4 | import re 5 | from functools import partial 6 | from pathlib import Path 7 | 8 | import librosa 9 | import numpy 10 | 11 | from become_yukarin import AcousticConverter 12 | from become_yukarin.config.config import create_from_json as create_config 13 | 14 | parser = argparse.ArgumentParser() 15 | parser.add_argument('model_names', nargs='+') 16 | parser.add_argument('-md', '--model_directory', type=Path, default=Path('/mnt/dwango/hiroshiba/become-yukarin/')) 17 | parser.add_argument('-iwd', '--input_wave_directory', type=Path, 18 | default=Path('/mnt/dwango/hiroshiba/become-yukarin/dataset/hiho-wave/hiho-pause-atr503-subset/')) 19 | parser.add_argument('-it', '--iteration', type=int) 20 | parser.add_argument('-g', '--gpu', type=int) 21 | args = parser.parse_args() 22 | 23 | model_directory = args.model_directory # type: Path 24 | input_wave_directory = args.input_wave_directory # type: Path 25 | it = args.iteration 26 | gpu = args.gpu 27 | 28 | paths_test = list(Path('./test_data/').glob('*.wav')) 29 | 30 | 31 | def extract_number(f): 32 | s = re.findall("\d+", str(f)) 33 | return int(s[-1]) if s else -1 34 | 35 | 36 | def process(p: Path, acoustic_converter: AcousticConverter): 37 | try: 38 | if p.suffix in ['.npy', '.npz']: 39 | fn = glob.glob(str(input_wave_directory / p.stem) + '.*')[0] 40 | p = Path(fn) 41 | wave = acoustic_converter(p) 42 | librosa.output.write_wav(str(output / p.stem) + '.wav', wave.wave, wave.sampling_rate, norm=True) 43 | except: 44 | import traceback 45 | print('error!', str(p)) 46 | print(traceback.format_exc()) 47 | 48 | 49 | for model_name in args.model_names: 50 | base_model = model_directory / model_name 51 | config = create_config(base_model / 'config.json') 52 | 53 | input_paths = list(sorted([Path(p) for p in glob.glob(str(config.dataset.input_glob))])) 54 | numpy.random.RandomState(config.dataset.seed).shuffle(input_paths) 55 | path_train = input_paths[0] 56 | path_test = input_paths[-1] 57 | 58 | if it is not None: 59 | model_path = base_model / 'predictor_{}.npz'.format(it) 60 | else: 61 | model_paths = base_model.glob('predictor_*.npz') 62 | model_path = list(sorted(model_paths, key=extract_number))[-1] 63 | print(model_path) 64 | acoustic_converter = AcousticConverter(config, model_path, gpu=gpu) 65 | 66 | output = Path('./output').absolute() / base_model.name 67 | output.mkdir(exist_ok=True) 68 | 69 | paths = [path_train, path_test] + paths_test 70 | 71 | process_partial = partial(process, acoustic_converter=acoustic_converter) 72 | if gpu is None: 73 | pool = multiprocessing.Pool() 74 | pool.map(process_partial, paths) 75 | else: 76 | list(map(process_partial, paths)) 77 | -------------------------------------------------------------------------------- /scripts/extract_spectrogram_pair.py: -------------------------------------------------------------------------------- 1 | """ 2 | extract low and high quality spectrogram data. 
3 | """ 4 | 5 | import argparse 6 | import multiprocessing 7 | from pathlib import Path 8 | from pprint import pprint 9 | 10 | import numpy 11 | import pysptk 12 | import pyworld 13 | from tqdm import tqdm 14 | 15 | from become_yukarin.dataset.dataset import AcousticFeatureProcess 16 | from become_yukarin.dataset.dataset import WaveFileLoadProcess 17 | from become_yukarin.param import AcousticFeatureParam 18 | from become_yukarin.param import VoiceParam 19 | 20 | base_voice_param = VoiceParam() 21 | base_acoustic_feature_param = AcousticFeatureParam() 22 | 23 | parser = argparse.ArgumentParser() 24 | parser.add_argument('--input_directory', '-i', type=Path) 25 | parser.add_argument('--output_directory', '-o', type=Path) 26 | parser.add_argument('--sample_rate', type=int, default=base_voice_param.sample_rate) 27 | parser.add_argument('--top_db', type=float, default=base_voice_param.top_db) 28 | parser.add_argument('--pad_second', type=float, default=base_voice_param.pad_second) 29 | parser.add_argument('--frame_period', type=int, default=base_acoustic_feature_param.frame_period) 30 | parser.add_argument('--order', type=int, default=base_acoustic_feature_param.order) 31 | parser.add_argument('--alpha', type=float, default=base_acoustic_feature_param.alpha) 32 | parser.add_argument('--f0_estimating_method', default=base_acoustic_feature_param.f0_estimating_method) 33 | parser.add_argument('--enable_overwrite', action='store_true') 34 | arguments = parser.parse_args() 35 | 36 | 37 | def generate_file(path): 38 | out = Path(arguments.output_directory, path.stem + '.npy') 39 | if out.exists() and not arguments.enable_overwrite: 40 | return 41 | 42 | # load wave and padding 43 | wave_file_load_process = WaveFileLoadProcess( 44 | sample_rate=arguments.sample_rate, 45 | top_db=arguments.top_db, 46 | pad_second=arguments.pad_second, 47 | ) 48 | wave = wave_file_load_process(path, test=True) 49 | 50 | # make acoustic feature 51 | acoustic_feature_process = AcousticFeatureProcess( 52 | frame_period=arguments.frame_period, 53 | order=arguments.order, 54 | alpha=arguments.alpha, 55 | f0_estimating_method=arguments.f0_estimating_method, 56 | ) 57 | feature = acoustic_feature_process(wave, test=True).astype_only_float(numpy.float32) 58 | high_spectrogram = feature.spectrogram 59 | 60 | fftlen = pyworld.get_cheaptrick_fft_size(arguments.sample_rate) 61 | low_spectrogram = pysptk.mc2sp( 62 | feature.mfcc, 63 | alpha=arguments.alpha, 64 | fftlen=fftlen, 65 | ) 66 | 67 | # save 68 | numpy.save(out.absolute(), { 69 | 'low': low_spectrogram, 70 | 'high': high_spectrogram, 71 | }) 72 | 73 | 74 | def main(): 75 | pprint(vars(arguments)) 76 | 77 | paths = list(sorted(arguments.input_directory.glob('*'))) 78 | arguments.output_directory.mkdir(exist_ok=True) 79 | 80 | pool = multiprocessing.Pool() 81 | list(tqdm(pool.imap(generate_file, paths), total=len(paths))) 82 | 83 | 84 | if __name__ == '__main__': 85 | main() 86 | -------------------------------------------------------------------------------- /become_yukarin/updater/sr_updater.py: -------------------------------------------------------------------------------- 1 | import chainer 2 | import chainer.functions as F 3 | from become_yukarin.config.sr_config import SRLossConfig 4 | 5 | from become_yukarin.model.sr_model import SRDiscriminator 6 | from become_yukarin.model.sr_model import SRPredictor 7 | 8 | 9 | class SRUpdater(chainer.training.StandardUpdater): 10 | def __init__( 11 | self, 12 | loss_config: SRLossConfig, 13 | predictor: SRPredictor, 14 | 
discriminator: SRDiscriminator, 15 | *args, 16 | **kwargs, 17 | ) -> None: 18 | super().__init__(*args, **kwargs) 19 | self.loss_config = loss_config 20 | self.predictor = predictor 21 | self.discriminator = discriminator 22 | 23 | def _loss_predictor(self, predictor, output, target, d_fake): 24 | b, _, w, h = d_fake.data.shape 25 | 26 | loss_mse = (F.mean_absolute_error(output, target)) 27 | chainer.report({'mse': loss_mse}, predictor) 28 | 29 | loss_adv = F.sum(F.softplus(-d_fake)) / (b * w * h) 30 | chainer.report({'adversarial': loss_adv}, predictor) 31 | 32 | loss = self.loss_config.mse * loss_mse + self.loss_config.adversarial * loss_adv 33 | chainer.report({'loss': loss}, predictor) 34 | return loss 35 | 36 | def _loss_discriminator(self, discriminator, d_real, d_fake): 37 | b, _, w, h = d_real.data.shape 38 | 39 | loss_real = F.sum(F.softplus(-d_real)) / (b * w * h) 40 | chainer.report({'real': loss_real}, discriminator) 41 | 42 | loss_fake = F.sum(F.softplus(d_fake)) / (b * w * h) 43 | chainer.report({'fake': loss_fake}, discriminator) 44 | 45 | loss = loss_real + loss_fake 46 | chainer.report({'loss': loss}, discriminator) 47 | 48 | tp = (d_real.data > 0.5).sum() 49 | fp = (d_fake.data > 0.5).sum() 50 | fn = (d_real.data <= 0.5).sum() 51 | tn = (d_fake.data <= 0.5).sum() 52 | accuracy = (tp + tn) / (tp + fp + fn + tn) 53 | precision = tp / (tp + fp) 54 | recall = tp / (tp + fn) 55 | chainer.report({'accuracy': accuracy}, self.discriminator) 56 | chainer.report({'precision': precision}, self.discriminator) 57 | chainer.report({'recall': recall}, self.discriminator) 58 | return loss 59 | 60 | def forward(self, input, target): 61 | output = self.predictor(input) 62 | d_fake = self.discriminator(input, output) 63 | d_real = self.discriminator(input, target) 64 | 65 | loss = { 66 | 'predictor': self._loss_predictor(self.predictor, output, target, d_fake), 67 | 'discriminator': self._loss_discriminator(self.discriminator, d_real, d_fake), 68 | } 69 | return loss 70 | 71 | def update_core(self): 72 | opt_predictor = self.get_optimizer('predictor') 73 | opt_discriminator = self.get_optimizer('discriminator') 74 | 75 | batch = self.get_iterator('main').next() 76 | batch = self.converter(batch, self.device) 77 | loss = self.forward(**batch) 78 | 79 | opt_predictor.update(loss.get, 'predictor') 80 | opt_discriminator.update(loss.get, 'discriminator') 81 | -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from functools import partial 3 | from pathlib import Path 4 | 5 | from chainer import cuda 6 | from chainer import optimizers 7 | from chainer import training 8 | from chainer.dataset import convert 9 | from chainer.iterators import MultiprocessIterator 10 | from chainer.training import extensions 11 | 12 | from become_yukarin.config.config import create_from_json 13 | from become_yukarin.dataset import create as create_dataset 14 | from become_yukarin.model.model import create 15 | from become_yukarin.updater.updater import Updater 16 | 17 | if __name__ == '__main__': 18 | parser = argparse.ArgumentParser() 19 | parser.add_argument('config_json_path', type=Path) 20 | parser.add_argument('output', type=Path) 21 | arguments = parser.parse_args() 22 | 23 | config = create_from_json(arguments.config_json_path) 24 | arguments.output.mkdir(exist_ok=True) 25 | config.save_as_json((arguments.output / 'config.json').absolute()) 26 | 27 | # model 28 | if 
config.train.gpu >= 0: 29 | cuda.get_device_from_id(config.train.gpu).use() 30 | predictor, discriminator = create(config.model) 31 | models = { 32 | 'predictor': predictor, 33 | 'discriminator': discriminator, 34 | } 35 | 36 | # dataset 37 | dataset = create_dataset(config.dataset) 38 | train_iter = MultiprocessIterator(dataset['train'], config.train.batchsize) 39 | test_iter = MultiprocessIterator(dataset['test'], config.train.batchsize, repeat=False, shuffle=False) 40 | train_eval_iter = MultiprocessIterator(dataset['train_eval'], config.train.batchsize, repeat=False, shuffle=False) 41 | 42 | 43 | # optimizer 44 | def create_optimizer(model): 45 | optimizer = optimizers.Adam(alpha=0.0002, beta1=0.5, beta2=0.999) 46 | optimizer.setup(model) 47 | return optimizer 48 | 49 | 50 | opts = {key: create_optimizer(model) for key, model in models.items()} 51 | 52 | # updater 53 | converter = partial(convert.concat_examples, padding=0) 54 | updater = Updater( 55 | loss_config=config.loss, 56 | predictor=predictor, 57 | discriminator=discriminator, 58 | device=config.train.gpu, 59 | iterator=train_iter, 60 | optimizer=opts, 61 | converter=converter, 62 | ) 63 | 64 | # trainer 65 | trigger_log = (config.train.log_iteration, 'iteration') 66 | trigger_snapshot = (config.train.snapshot_iteration, 'iteration') 67 | 68 | trainer = training.Trainer(updater, out=arguments.output) 69 | 70 | ext = extensions.Evaluator(test_iter, models, converter, device=config.train.gpu, eval_func=updater.forward) 71 | trainer.extend(ext, name='test', trigger=trigger_log) 72 | ext = extensions.Evaluator(train_eval_iter, models, converter, device=config.train.gpu, eval_func=updater.forward) 73 | trainer.extend(ext, name='train', trigger=trigger_log) 74 | 75 | trainer.extend(extensions.dump_graph('predictor/loss')) 76 | 77 | ext = extensions.snapshot_object(predictor, filename='predictor_{.updater.iteration}.npz') 78 | trainer.extend(ext, trigger=trigger_snapshot) 79 | 80 | trainer.extend(extensions.LogReport(trigger=trigger_log)) 81 | 82 | trainer.run() 83 | -------------------------------------------------------------------------------- /train_sr.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from functools import partial 3 | from pathlib import Path 4 | 5 | from chainer import cuda 6 | from chainer import optimizers 7 | from chainer import training 8 | from chainer.dataset import convert 9 | from chainer.iterators import MultiprocessIterator 10 | from chainer.training import extensions 11 | 12 | from become_yukarin.config.sr_config import create_from_json 13 | from become_yukarin.dataset import create_sr as create_sr_dataset 14 | from become_yukarin.model.sr_model import create_sr as create_sr_model 15 | from become_yukarin.updater.sr_updater import SRUpdater 16 | 17 | if __name__ == '__main__': 18 | parser = argparse.ArgumentParser() 19 | parser.add_argument('config_json_path', type=Path) 20 | parser.add_argument('output', type=Path) 21 | arguments = parser.parse_args() 22 | 23 | config = create_from_json(arguments.config_json_path) 24 | arguments.output.mkdir(exist_ok=True) 25 | config.save_as_json((arguments.output / 'config.json').absolute()) 26 | 27 | # model 28 | if config.train.gpu >= 0: 29 | cuda.get_device_from_id(config.train.gpu).use() 30 | predictor, discriminator = create_sr_model(config.model) 31 | models = { 32 | 'predictor': predictor, 33 | 'discriminator': discriminator, 34 | } 35 | 36 | # dataset 37 | dataset = create_sr_dataset(config.dataset) 
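# note: create_sr_dataset returns a dict with 'train', 'test' and 'train_eval' splits; each split is wrapped in a MultiprocessIterator below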
38 | train_iter = MultiprocessIterator(dataset['train'], config.train.batchsize) 39 | test_iter = MultiprocessIterator(dataset['test'], config.train.batchsize, repeat=False, shuffle=False) 40 | train_eval_iter = MultiprocessIterator(dataset['train_eval'], config.train.batchsize, repeat=False, shuffle=False) 41 | 42 | 43 | # optimizer 44 | def create_optimizer(model): 45 | optimizer = optimizers.Adam(alpha=0.0002, beta1=0.5, beta2=0.999) 46 | optimizer.setup(model) 47 | return optimizer 48 | 49 | 50 | opts = {key: create_optimizer(model) for key, model in models.items()} 51 | 52 | # updater 53 | converter = partial(convert.concat_examples, padding=0) 54 | updater = SRUpdater( 55 | loss_config=config.loss, 56 | predictor=predictor, 57 | discriminator=discriminator, 58 | device=config.train.gpu, 59 | iterator=train_iter, 60 | optimizer=opts, 61 | converter=converter, 62 | ) 63 | 64 | # trainer 65 | trigger_log = (config.train.log_iteration, 'iteration') 66 | trigger_snapshot = (config.train.snapshot_iteration, 'iteration') 67 | 68 | trainer = training.Trainer(updater, out=arguments.output) 69 | 70 | ext = extensions.Evaluator(test_iter, models, converter, device=config.train.gpu, eval_func=updater.forward) 71 | trainer.extend(ext, name='test', trigger=trigger_log) 72 | ext = extensions.Evaluator(train_eval_iter, models, converter, device=config.train.gpu, eval_func=updater.forward) 73 | trainer.extend(ext, name='train', trigger=trigger_log) 74 | 75 | trainer.extend(extensions.dump_graph('predictor/loss')) 76 | 77 | ext = extensions.snapshot_object(predictor, filename='predictor_{.updater.iteration}.npz') 78 | trainer.extend(ext, trigger=trigger_snapshot) 79 | 80 | trainer.extend(extensions.LogReport(trigger=trigger_log)) 81 | trainer.extend(extensions.PrintReport(['predictor/loss'])) 82 | 83 | trainer.run() 84 | -------------------------------------------------------------------------------- /become_yukarin/updater/updater.py: -------------------------------------------------------------------------------- 1 | import chainer 2 | import chainer.functions as F 3 | 4 | from become_yukarin.config.config import LossConfig 5 | from become_yukarin.model.model import Discriminator 6 | from become_yukarin.model.model import Predictor 7 | 8 | 9 | class Updater(chainer.training.StandardUpdater): 10 | def __init__( 11 | self, 12 | loss_config: LossConfig, 13 | predictor: Predictor, 14 | discriminator: Discriminator, 15 | *args, 16 | **kwargs, 17 | ) -> None: 18 | super().__init__(*args, **kwargs) 19 | self.loss_config = loss_config 20 | self.predictor = predictor 21 | self.discriminator = discriminator 22 | 23 | def _loss_predictor(self, predictor, output, target, d_fake): 24 | b, _, t = d_fake.data.shape 25 | 26 | loss_mse = (F.mean_absolute_error(output, target)) 27 | chainer.report({'mse': loss_mse}, predictor) 28 | 29 | loss_adv = F.sum(F.softplus(-d_fake)) / (b * t) 30 | chainer.report({'adversarial': loss_adv}, predictor) 31 | 32 | loss = self.loss_config.mse * loss_mse + self.loss_config.adversarial * loss_adv 33 | chainer.report({'loss': loss}, predictor) 34 | return loss 35 | 36 | def _loss_discriminator(self, discriminator, d_real, d_fake): 37 | b, _, t = d_real.data.shape 38 | 39 | loss_real = F.sum(F.softplus(-d_real)) / (b * t) 40 | chainer.report({'real': loss_real}, discriminator) 41 | 42 | loss_fake = F.sum(F.softplus(d_fake)) / (b * t) 43 | chainer.report({'fake': loss_fake}, discriminator) 44 | 45 | loss = loss_real + loss_fake 46 | chainer.report({'loss': loss}, discriminator) 47 | 48 
| tp = (d_real.data > 0.5).sum() 49 | fp = (d_fake.data > 0.5).sum() 50 | fn = (d_real.data <= 0.5).sum() 51 | tn = (d_fake.data <= 0.5).sum() 52 | accuracy = (tp + tn) / (tp + fp + fn + tn) 53 | precision = tp / (tp + fp) 54 | recall = tp / (tp + fn) 55 | chainer.report({'accuracy': accuracy}, self.discriminator) 56 | chainer.report({'precision': precision}, self.discriminator) 57 | chainer.report({'recall': recall}, self.discriminator) 58 | return loss 59 | 60 | def forward(self, input, target, mask): 61 | input = chainer.as_variable(input) 62 | target = chainer.as_variable(target) 63 | mask = chainer.as_variable(mask) 64 | 65 | output = self.predictor(input) 66 | output = output * mask 67 | target = target * mask 68 | 69 | d_fake = self.discriminator(input, output) 70 | d_real = self.discriminator(input, target) 71 | 72 | loss = { 73 | 'predictor': self._loss_predictor(self.predictor, output, target, d_fake), 74 | 'discriminator': self._loss_discriminator(self.discriminator, d_real, d_fake), 75 | } 76 | return loss 77 | 78 | def update_core(self): 79 | opt_predictor = self.get_optimizer('predictor') 80 | opt_discriminator = self.get_optimizer('discriminator') 81 | 82 | batch = self.get_iterator('main').next() 83 | batch = self.converter(batch, self.device) 84 | loss = self.forward(**batch) 85 | 86 | opt_predictor.update(loss.get, 'predictor') 87 | opt_discriminator.update(loss.get, 'discriminator') 88 | -------------------------------------------------------------------------------- /scripts/super_resolution_test.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import glob 3 | import multiprocessing 4 | import re 5 | from functools import partial 6 | from pathlib import Path 7 | 8 | import librosa 9 | import numpy 10 | 11 | from become_yukarin import SuperResolution 12 | from become_yukarin.config.sr_config import create_from_json as create_config 13 | from become_yukarin.dataset.dataset import AcousticFeatureProcess 14 | from become_yukarin.dataset.dataset import WaveFileLoadProcess 15 | 16 | parser = argparse.ArgumentParser() 17 | parser.add_argument('model_names', nargs='+') 18 | parser.add_argument('-md', '--model_directory', type=Path, default=Path('/mnt/dwango/hiroshiba/become-yukarin/')) 19 | parser.add_argument('-iwd', '--input_wave_directory', type=Path, 20 | default=Path('/mnt/dwango/hiroshiba/become-yukarin/dataset/yukari-wave/yukari-news/')) 21 | parser.add_argument('-g', '--gpu', type=int) 22 | args = parser.parse_args() 23 | 24 | model_directory = args.model_directory # type: Path 25 | input_wave_directory = args.input_wave_directory # type: Path 26 | gpu = args.gpu 27 | 28 | paths_test = list(Path('./test_data_sr/').glob('*.wav')) 29 | 30 | 31 | def extract_number(f): 32 | s = re.findall("\d+", str(f)) 33 | return int(s[-1]) if s else -1 34 | 35 | 36 | def process(p: Path, super_resolution: SuperResolution): 37 | param = config.dataset.param 38 | wave_process = WaveFileLoadProcess( 39 | sample_rate=param.voice_param.sample_rate, 40 | top_db=None, 41 | ) 42 | acoustic_feature_process = AcousticFeatureProcess( 43 | frame_period=param.acoustic_feature_param.frame_period, 44 | order=param.acoustic_feature_param.order, 45 | alpha=param.acoustic_feature_param.alpha, 46 | f0_estimating_method=param.acoustic_feature_param.f0_estimating_method, 47 | ) 48 | 49 | try: 50 | if p.suffix in ['.npy', '.npz']: 51 | p = glob.glob(str(input_wave_directory / p.stem) + '.*')[0] 52 | p = Path(p) 53 | input = 
acoustic_feature_process(wave_process(str(p))) 54 | wave = super_resolution(input.spectrogram, acoustic_feature=input, sampling_rate=param.voice_param.sample_rate) 55 | librosa.output.write_wav(str(output / p.stem) + '.wav', wave.wave, wave.sampling_rate, norm=True) 56 | except: 57 | import traceback 58 | print('error!', str(p)) 59 | print(traceback.format_exc()) 60 | 61 | 62 | for model_name in args.model_names: 63 | base_model = model_directory / model_name 64 | config = create_config(base_model / 'config.json') 65 | 66 | input_paths = list(sorted([Path(p) for p in glob.glob(str(config.dataset.input_glob))])) 67 | numpy.random.RandomState(config.dataset.seed).shuffle(input_paths) 68 | path_train = input_paths[0] 69 | path_test = input_paths[-1] 70 | 71 | model_paths = base_model.glob('predictor*.npz') 72 | model_path = list(sorted(model_paths, key=extract_number))[-1] 73 | print(model_path) 74 | super_resolution = SuperResolution(config, model_path, gpu=gpu) 75 | 76 | output = Path('./output').absolute() / base_model.name 77 | output.mkdir(exist_ok=True) 78 | 79 | paths = [path_train, path_test] + paths_test 80 | 81 | process_partial = partial(process, super_resolution=super_resolution) 82 | if gpu is None: 83 | pool = multiprocessing.Pool() 84 | pool.map(process_partial, paths) 85 | else: 86 | list(map(process_partial, paths)) 87 | -------------------------------------------------------------------------------- /scripts/launch.py: -------------------------------------------------------------------------------- 1 | """ 2 | launcher for some task that have diff params 3 | """ 4 | 5 | import argparse 6 | import copy 7 | import datetime 8 | import hashlib 9 | import json 10 | import subprocess 11 | import time 12 | from pathlib import Path 13 | 14 | base_command_default = \ 15 | "screen -d -m -S {project/name}_gpu{train/gpu} ;" + \ 16 | "screen -S {project/name}_gpu{train/gpu} -X stuff 'python3 {python_file_path} {recipe_path} {output}\n'" 17 | 18 | parser = argparse.ArgumentParser() 19 | parser.add_argument('output_dir', type=Path) 20 | parser.add_argument('--python_file_path', default='train.py') 21 | parser.add_argument('--recipe_json_path', default='recipe/recipe.json') 22 | parser.add_argument('--base_config_json_path', default='recipe/config.json') 23 | parser.add_argument('--base_command', default=base_command_default) 24 | args = parser.parse_args() 25 | 26 | recipe = json.load(open(args.recipe_json_path, encoding='utf-8')) 27 | recipe_each = recipe['each'] 28 | recipe_all = recipe['all'] 29 | base_config = json.load(open(args.base_config_json_path, encoding='utf-8')) 30 | 31 | 32 | def put_config_value(config, recipe_key, value): 33 | key_tree = recipe_key.split('/') 34 | target = config 35 | for key in key_tree[:-1]: 36 | target = target[key] 37 | 38 | target[key_tree[-1]] = value 39 | 40 | 41 | def _replace_name(dist): 42 | _format = {} 43 | now = datetime.datetime.now() 44 | 45 | if '{date}' in dist['project']['name']: 46 | _format['date'] = now.strftime('%Y%m%d%H%M%S') 47 | if '{hash}' in dist['project']['name']: 48 | _format['hash'] = hashlib.md5(bytes(str(now), 'utf')).hexdigest()[:6] 49 | 50 | if len(_format) > 0: 51 | dist['project']['name'] = dist['project']['name'].format(**_format) 52 | 53 | 54 | num_task = min(len(list(value)) for value in recipe_each.values()) 55 | command_list = [] 56 | 57 | for i in range(num_task): 58 | config = copy.deepcopy(base_config) 59 | 60 | for recipe_key in recipe_all.keys(): 61 | put_config_value(config, recipe_key, recipe_all[recipe_key]) 
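# values under recipe['all'] are shared by every generated task; values under recipe['each'] (applied next) are indexed by the task number i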
62 | 63 | for recipe_key in recipe_each.keys(): 64 | put_config_value(config, recipe_key, recipe_each[recipe_key][i]) 65 | 66 | _replace_name(config) 67 | 68 | # add git branch name 69 | git_branch = subprocess.check_output('git rev-parse --abbrev-ref HEAD', shell=True).decode("utf-8").strip() 70 | config['project']['tags'].append('git branch name:' + git_branch) 71 | 72 | made_recipe_path = "{}.{}.json".format(datetime.datetime.now().strftime('%Y%m%d%H%M%S'), i) 73 | with open(made_recipe_path, 'w', encoding='utf') as f: 74 | json.dump(config, f, indent=2, sort_keys=True, ensure_ascii=False) 75 | 76 | 77 | def make_key_chain(key_chain, value, dist): 78 | if not isinstance(value, dict): 79 | dist['/'.join(key_chain)] = value 80 | else: 81 | for key in value.keys(): 82 | make_key_chain(key_chain + [key], value[key], dist) 83 | 84 | 85 | dist = {} 86 | make_key_chain([], config, dist) 87 | 88 | dist['output'] = args.output_dir / config['project']['name'] 89 | dist['python_file_path'] = args.python_file_path 90 | dist['recipe_path'] = made_recipe_path 91 | 92 | command = args.base_command.format(**dist) 93 | command_list += [command] 94 | 95 | print(config['project']['name']) 96 | 97 | for command in command_list: 98 | time.sleep(1) 99 | subprocess.check_output(command, shell=True) 100 | -------------------------------------------------------------------------------- /become_yukarin/config/sr_config.py: -------------------------------------------------------------------------------- 1 | import json 2 | from pathlib import Path 3 | from typing import Dict 4 | from typing import List 5 | from typing import NamedTuple 6 | from typing import Union 7 | 8 | from become_yukarin.param import Param 9 | 10 | 11 | class SRDatasetConfig(NamedTuple): 12 | param: Param 13 | input_glob: Path 14 | train_crop_size: int 15 | input_global_noise: float 16 | input_local_noise: float 17 | blur_size_factor: float 18 | seed: int 19 | num_test: int 20 | 21 | 22 | class SRModelConfig(NamedTuple): 23 | generator_base_channels: int 24 | generator_extensive_layers: int 25 | discriminator_base_channels: int 26 | discriminator_extensive_layers: int 27 | 28 | 29 | class SRLossConfig(NamedTuple): 30 | mse: float 31 | adversarial: float 32 | 33 | 34 | class SRTrainConfig(NamedTuple): 35 | batchsize: int 36 | gpu: int 37 | log_iteration: int 38 | snapshot_iteration: int 39 | 40 | 41 | class SRProjectConfig(NamedTuple): 42 | name: str 43 | tags: List[str] 44 | 45 | 46 | class SRConfig(NamedTuple): 47 | dataset: SRDatasetConfig 48 | model: SRModelConfig 49 | loss: SRLossConfig 50 | train: SRTrainConfig 51 | project: SRProjectConfig 52 | 53 | def save_as_json(self, path): 54 | d = _namedtuple_to_dict(self) 55 | json.dump(d, open(path, 'w'), indent=2, sort_keys=True, default=_default_path) 56 | 57 | 58 | def _default_path(o): 59 | if isinstance(o, Path): 60 | return str(o) 61 | raise TypeError(repr(o) + " is not JSON serializable") 62 | 63 | 64 | def _namedtuple_to_dict(o: NamedTuple): 65 | return { 66 | k: v if not hasattr(v, '_asdict') else _namedtuple_to_dict(v) 67 | for k, v in o._asdict().items() 68 | } 69 | 70 | 71 | def create_from_json(s: Union[str, Path]): 72 | try: 73 | d = json.loads(s) 74 | except TypeError: 75 | d = json.load(open(s)) 76 | 77 | backward_compatible(d) 78 | 79 | return SRConfig( 80 | dataset=SRDatasetConfig( 81 | param=Param(), 82 | input_glob=Path(d['dataset']['input_glob']), 83 | train_crop_size=d['dataset']['train_crop_size'], 84 | input_global_noise=d['dataset']['input_global_noise'], 85 | 
input_local_noise=d['dataset']['input_local_noise'], 86 | blur_size_factor=d['dataset']['blur_size_factor'], 87 | seed=d['dataset']['seed'], 88 | num_test=d['dataset']['num_test'], 89 | ), 90 | model=SRModelConfig( 91 | generator_base_channels=d['model']['generator_base_channels'], 92 | generator_extensive_layers=d['model']['generator_extensive_layers'], 93 | discriminator_base_channels=d['model']['discriminator_base_channels'], 94 | discriminator_extensive_layers=d['model']['discriminator_extensive_layers'], 95 | ), 96 | loss=SRLossConfig( 97 | mse=d['loss']['mse'], 98 | adversarial=d['loss']['adversarial'], 99 | ), 100 | train=SRTrainConfig( 101 | batchsize=d['train']['batchsize'], 102 | gpu=d['train']['gpu'], 103 | log_iteration=d['train']['log_iteration'], 104 | snapshot_iteration=d['train']['snapshot_iteration'], 105 | ), 106 | project=SRProjectConfig( 107 | name=d['project']['name'], 108 | tags=d['project']['tags'], 109 | ) 110 | ) 111 | 112 | 113 | def backward_compatible(d: Dict): 114 | if 'blur_size_factor' not in d['dataset']: 115 | d['dataset']['blur_size_factor'] = 0 116 | 117 | if 'generator_base_channels' not in d['model']: 118 | d['model']['generator_base_channels'] = 64 119 | d['model']['generator_extensive_layers'] = 8 120 | d['model']['discriminator_base_channels'] = 32 121 | d['model']['discriminator_extensive_layers'] = 5 122 | -------------------------------------------------------------------------------- /become_yukarin/dataset/utility.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | import fastdtw 4 | import numpy 5 | 6 | _logdb_const = 10.0 / numpy.log(10.0) * numpy.sqrt(2.0) 7 | 8 | 9 | # should work on torch and numpy arrays 10 | def _sqrt(x): 11 | isnumpy = isinstance(x, numpy.ndarray) 12 | isscalar = numpy.isscalar(x) 13 | return numpy.sqrt(x) if isnumpy else math.sqrt(x) if isscalar else x.sqrt() 14 | 15 | 16 | def _exp(x): 17 | isnumpy = isinstance(x, numpy.ndarray) 18 | isscalar = numpy.isscalar(x) 19 | return numpy.exp(x) if isnumpy else math.exp(x) if isscalar else x.exp() 20 | 21 | 22 | def _sum(x): 23 | if isinstance(x, list) or isinstance(x, numpy.ndarray): 24 | return numpy.sum(x) 25 | return float(x.sum()) 26 | 27 | 28 | def melcd(X, Y, lengths=None): 29 | """Mel-cepstrum distortion (MCD). 30 | 31 | The function computes MCD for time-aligned mel-cepstrum sequences. 32 | 33 | Args: 34 | X (ndarray): Input mel-cepstrum, shape can be either of 35 | (``D``,), (``T x D``) or (``B x T x D``). Both Numpy and torch arrays 36 | are supported. 37 | Y (ndarray): Target mel-cepstrum, shape can be either of 38 | (``D``,), (``T x D``) or (``B x T x D``). Both Numpy and torch arrays 39 | are supported. 40 | lengths (list): Lengths of padded inputs. This should only be specified 41 | if you give mini-batch inputs. 42 | 43 | Returns: 44 | float: Mean mel-cepstrum distortion in dB. 45 | 46 | .. note:: 47 | 48 | The function doesn't check if inputs are actually mel-cepstrum. 49 | """ 50 | # summing against feature axis, and then take mean against time axis 51 | # Eq. (1a) 52 | # https://www.cs.cmu.edu/~awb/papers/sltu2008/kominek_black.sltu_2008.pdf 53 | if lengths is None: 54 | z = X - Y 55 | r = _sqrt((z * z).sum(-1)) 56 | if not numpy.isscalar(r): 57 | r = r.mean() 58 | return _logdb_const * r 59 | 60 | # Case for 1-dim features. 
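# (with lengths given, a 2-dim X/Y is (batch, time) with scalar frames; the reshape below adds a trailing feature axis so the per-frame norm in the loop works)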
61 | if len(X.shape) == 2: 62 | # Add feature axis 63 | X, Y = X[:, :, None], Y[:, :, None] 64 | 65 | s = 0.0 66 | T = _sum(lengths) 67 | for x, y, length in zip(X, Y, lengths): 68 | x, y = x[:length], y[:length] 69 | z = x - y 70 | s += _sqrt((z * z).sum(-1)).sum() 71 | 72 | return _logdb_const * s / T 73 | 74 | 75 | class DTWAligner(object): 76 | """ 77 | from https://github.com/r9y9/nnmnkwii/blob/4cade86b5c35b4e35615a2a8162ddc638018af0e/nnmnkwii/preprocessing/alignment.py#L14 78 | """ 79 | 80 | def __init__(self, x, y, dist=lambda x, y: numpy.linalg.norm(x - y), radius=1) -> None: 81 | assert x.ndim == 2 and y.ndim == 2 82 | 83 | _, path = fastdtw.fastdtw(x, y, radius=radius, dist=dist) 84 | path = numpy.array(path) 85 | self.normed_path_x = path[:, 0] / len(x) 86 | self.normed_path_y = path[:, 1] / len(y) 87 | 88 | def align_x(self, x): 89 | path = self._interp_path(self.normed_path_x, len(x)) 90 | return x[path] 91 | 92 | def align_y(self, y): 93 | path = self._interp_path(self.normed_path_y, len(y)) 94 | return y[path] 95 | 96 | def align(self, x, y): 97 | return self.align_x(x), self.align_y(y) 98 | 99 | @staticmethod 100 | def align_and_transform(x, y, *args, **kwargs): 101 | aligner = DTWAligner(*args, x=x, y=y, **kwargs) 102 | return aligner.align(x, y) 103 | 104 | @staticmethod 105 | def _interp_path(normed_path: numpy.ndarray, target_length: int): 106 | path = numpy.floor(normed_path * target_length).astype(numpy.int) 107 | return path 108 | 109 | 110 | class MelCepstrumAligner(DTWAligner): 111 | def __init__(self, x, y, *args, **kwargs) -> None: 112 | x = self._calc_aligner_feature(x) 113 | y = self._calc_aligner_feature(y) 114 | kwargs.update(dist=melcd) 115 | super().__init__(x, y, *args, **kwargs) 116 | 117 | @classmethod 118 | def _calc_delta(cls, x): 119 | d = numpy.zeros_like(x, x.dtype)  # use a separate output array; overwriting x here would make the delta all zeros 120 | d[:-1] = x[1:] - x[:-1] 121 | d[-1] = 0 122 | return d 123 | 124 | @classmethod 125 | def _calc_aligner_feature(cls, x): 126 | d = cls._calc_delta(x) 127 | feature = numpy.concatenate((x, d), axis=1)[:, 1:] 128 | return feature 129 | -------------------------------------------------------------------------------- /become_yukarin/vocoder.py: -------------------------------------------------------------------------------- 1 | import numpy 2 | import pyworld 3 | 4 | from become_yukarin.data_struct import AcousticFeature 5 | from become_yukarin.data_struct import Wave 6 | from become_yukarin.dataset.dataset import AcousticFeatureProcess 7 | from become_yukarin.param import AcousticFeatureParam 8 | 9 | 10 | class Vocoder(object): 11 | def __init__( 12 | self, 13 | acoustic_feature_param: AcousticFeatureParam, 14 | out_sampling_rate: int, 15 | ): 16 | self.acoustic_feature_param = acoustic_feature_param 17 | self.out_sampling_rate = out_sampling_rate 18 | self._encoder = AcousticFeatureProcess( 19 | frame_period=acoustic_feature_param.frame_period, 20 | order=acoustic_feature_param.order, 21 | alpha=acoustic_feature_param.alpha, 22 | f0_estimating_method=acoustic_feature_param.f0_estimating_method, 23 | ) 24 | 25 | def encode(self, wave: Wave): 26 | return self._encoder(wave) 27 | 28 | def decode( 29 | self, 30 | acoustic_feature: AcousticFeature, 31 | ): 32 | acoustic_feature = acoustic_feature.astype_only_float(numpy.float64) 33 | out = pyworld.synthesize( 34 | f0=acoustic_feature.f0.ravel(), 35 | spectrogram=acoustic_feature.spectrogram, 36 | aperiodicity=acoustic_feature.aperiodicity, 37 | fs=self.out_sampling_rate, 38 | frame_period=self.acoustic_feature_param.frame_period 39 | ) 40 
| return Wave(out, sampling_rate=self.out_sampling_rate) 41 | 42 | 43 | class RealtimeVocoder(Vocoder): 44 | def __init__( 45 | self, 46 | acoustic_feature_param: AcousticFeatureParam, 47 | out_sampling_rate: int, 48 | buffer_size: int, 49 | number_of_pointers: int, 50 | ): 51 | from world4py.native import structures, apidefinitions 52 | super().__init__( 53 | acoustic_feature_param=acoustic_feature_param, 54 | out_sampling_rate=out_sampling_rate, 55 | ) 56 | 57 | self.buffer_size = buffer_size 58 | 59 | self._synthesizer = structures.WorldSynthesizer() 60 | apidefinitions._InitializeSynthesizer( 61 | self.out_sampling_rate, # sampling rate 62 | self.acoustic_feature_param.frame_period, # frame period 63 | pyworld.get_cheaptrick_fft_size(out_sampling_rate), # fft size 64 | buffer_size, # buffer size 65 | number_of_pointers, # number of pointers 66 | self._synthesizer, 67 | ) 68 | self._before_buffer = [] # for holding memory 69 | 70 | def decode( 71 | self, 72 | acoustic_feature: AcousticFeature, 73 | ): 74 | from world4py.native import apidefinitions, utils 75 | length = len(acoustic_feature.f0) 76 | f0_buffer = utils.cast_1d_list_to_1d_pointer(acoustic_feature.f0.flatten().tolist()) 77 | sp_buffer = utils.cast_2d_list_to_2d_pointer(acoustic_feature.spectrogram.tolist()) 78 | ap_buffer = utils.cast_2d_list_to_2d_pointer(acoustic_feature.aperiodicity.tolist()) 79 | apidefinitions._AddParameters(f0_buffer, length, sp_buffer, ap_buffer, self._synthesizer) 80 | 81 | ys = [] 82 | while apidefinitions._Synthesis2(self._synthesizer) != 0: 83 | y = numpy.array([self._synthesizer.buffer[i] for i in range(self.buffer_size)]) 84 | ys.append(y) 85 | 86 | if len(ys) > 0: 87 | out_wave = Wave( 88 | wave=numpy.concatenate(ys), 89 | sampling_rate=self.out_sampling_rate, 90 | ) 91 | else: 92 | out_wave = Wave( 93 | wave=numpy.empty(0), 94 | sampling_rate=self.out_sampling_rate, 95 | ) 96 | 97 | self._before_buffer.append((f0_buffer, sp_buffer, ap_buffer)) # for holding memory 98 | if len(self._before_buffer) > 16: 99 | self._before_buffer.pop(0) 100 | return out_wave 101 | 102 | def warm_up(self, time_length: float): 103 | y = numpy.zeros(int(time_length * self.out_sampling_rate)) 104 | w = Wave(wave=y, sampling_rate=self.out_sampling_rate) 105 | f = self.encode(w) 106 | self.decode(f) 107 | 108 | def __del__(self): 109 | from world4py.native import apidefinitions 110 | if hasattr(self, '_synthesizer'): 111 | apidefinitions._DestroySynthesizer(self._synthesizer) 112 | -------------------------------------------------------------------------------- /become_yukarin/data_struct.py: -------------------------------------------------------------------------------- 1 | from typing import NamedTuple, Dict, List 2 | 3 | import numpy 4 | import pyworld 5 | 6 | _min_mc = -18.3 7 | 8 | 9 | class Wave(NamedTuple): 10 | wave: numpy.ndarray 11 | sampling_rate: int 12 | 13 | 14 | class AcousticFeature(NamedTuple): 15 | f0: numpy.ndarray = numpy.nan 16 | spectrogram: numpy.ndarray = numpy.nan 17 | aperiodicity: numpy.ndarray = numpy.nan 18 | mfcc: numpy.ndarray = numpy.nan 19 | voiced: numpy.ndarray = numpy.nan 20 | 21 | @staticmethod 22 | def dtypes(): 23 | return dict( 24 | f0=numpy.float32, 25 | spectrogram=numpy.float32, 26 | aperiodicity=numpy.float32, 27 | mfcc=numpy.float32, 28 | voiced=numpy.bool, 29 | ) 30 | 31 | def astype(self, dtype): 32 | return AcousticFeature( 33 | f0=self.f0.astype(dtype), 34 | spectrogram=self.spectrogram.astype(dtype), 35 | aperiodicity=self.aperiodicity.astype(dtype), 36 | 
mfcc=self.mfcc.astype(dtype), 37 | voiced=self.voiced.astype(dtype), 38 | ) 39 | 40 | def astype_only_float(self, dtype): 41 | return AcousticFeature( 42 | f0=self.f0.astype(dtype), 43 | spectrogram=self.spectrogram.astype(dtype), 44 | aperiodicity=self.aperiodicity.astype(dtype), 45 | mfcc=self.mfcc.astype(dtype), 46 | voiced=self.voiced, 47 | ) 48 | 49 | def validate(self): 50 | assert self.f0.ndim == 2 51 | assert self.spectrogram.ndim == 2 52 | assert self.aperiodicity.ndim == 2 53 | assert self.mfcc.ndim == 2 54 | assert self.voiced.ndim == 2 55 | 56 | len_time = len(self.f0) 57 | assert len(self.spectrogram) == len_time 58 | assert len(self.aperiodicity) == len_time 59 | assert len(self.mfcc) == len_time 60 | assert len(self.voiced) == len_time 61 | 62 | assert self.voiced.dtype == numpy.bool 63 | 64 | @staticmethod 65 | def silent(length: int, sizes: Dict[str, int], keys: List[str]): 66 | d = {} 67 | if 'f0' in keys: 68 | d['f0'] = numpy.zeros((length, sizes['f0']), dtype=AcousticFeature.dtypes()['f0']) 69 | if 'spectrogram' in keys: 70 | d['spectrogram'] = numpy.zeros((length, sizes['spectrogram']), 71 | dtype=AcousticFeature.dtypes()['spectrogram']) 72 | if 'aperiodicity' in keys: 73 | d['aperiodicity'] = numpy.zeros((length, sizes['aperiodicity']), 74 | dtype=AcousticFeature.dtypes()['aperiodicity']) 75 | if 'mfcc' in keys: 76 | d['mfcc'] = numpy.hstack(( 77 | numpy.ones((length, 1), dtype=AcousticFeature.dtypes()['mfcc']) * _min_mc, 78 | numpy.zeros((length, sizes['mfcc'] - 1), dtype=AcousticFeature.dtypes()['mfcc']) 79 | )) 80 | if 'voiced' in keys: 81 | d['voiced'] = numpy.zeros((length, sizes['voiced']), dtype=AcousticFeature.dtypes()['voiced']) 82 | feature = AcousticFeature(**d) 83 | return feature 84 | 85 | @staticmethod 86 | def concatenate(fs: List['AcousticFeature'], keys: List[str]): 87 | is_target = lambda a: not numpy.any(numpy.isnan(a)) 88 | return AcousticFeature(**{ 89 | key: numpy.concatenate([getattr(f, key) for f in fs]) if is_target(getattr(fs[0], key)) else numpy.nan 90 | for key in keys 91 | }) 92 | 93 | def pick(self, first: int, last: int): 94 | is_target = lambda a: not numpy.any(numpy.isnan(a)) 95 | return AcousticFeature( 96 | f0=self.f0[first:last] if is_target(self.f0) else numpy.nan, 97 | spectrogram=self.spectrogram[first:last] if is_target(self.spectrogram) else numpy.nan, 98 | aperiodicity=self.aperiodicity[first:last] if is_target(self.aperiodicity) else numpy.nan, 99 | mfcc=self.mfcc[first:last] if is_target(self.mfcc) else numpy.nan, 100 | voiced=self.voiced[first:last] if is_target(self.voiced) else numpy.nan, 101 | ) 102 | 103 | @staticmethod 104 | def get_sizes(sampling_rate: int, order: int): 105 | fft_size = pyworld.get_cheaptrick_fft_size(fs=sampling_rate) 106 | return dict( 107 | f0=1, 108 | spectrogram=fft_size // 2 + 1, 109 | aperiodicity=fft_size // 2 + 1, 110 | mfcc=order + 1, 111 | voiced=1, 112 | ) 113 | 114 | 115 | class LowHighSpectrogramFeature(NamedTuple): 116 | low: numpy.ndarray 117 | high: numpy.ndarray 118 | 119 | def validate(self): 120 | assert self.low.ndim == 2 121 | assert self.high.ndim == 2 122 | assert self.low.shape == self.high.shape 123 | -------------------------------------------------------------------------------- /become_yukarin/config/config.py: -------------------------------------------------------------------------------- 1 | import json 2 | from pathlib import Path 3 | from typing import Dict 4 | from typing import List 5 | from typing import NamedTuple 6 | from typing import Optional 7 | from typing 
import Union 8 | 9 | from become_yukarin.param import Param 10 | 11 | 12 | class DatasetConfig(NamedTuple): 13 | param: Param 14 | input_glob: Path 15 | target_glob: Path 16 | input_mean_path: Path 17 | input_var_path: Path 18 | target_mean_path: Path 19 | target_var_path: Path 20 | features: List[str] 21 | train_crop_size: int 22 | input_global_noise: float 23 | input_local_noise: float 24 | target_global_noise: float 25 | target_local_noise: float 26 | seed: int 27 | num_test: int 28 | 29 | 30 | class ModelConfig(NamedTuple): 31 | in_channels: int 32 | out_channels: int 33 | generator_base_channels: int 34 | generator_extensive_layers: int 35 | discriminator_base_channels: int 36 | discriminator_extensive_layers: int 37 | weak_discriminator: bool 38 | 39 | 40 | class LossConfig(NamedTuple): 41 | mse: float 42 | adversarial: float 43 | 44 | 45 | class TrainConfig(NamedTuple): 46 | batchsize: int 47 | gpu: int 48 | log_iteration: int 49 | snapshot_iteration: int 50 | 51 | 52 | class ProjectConfig(NamedTuple): 53 | name: str 54 | tags: List[str] 55 | 56 | 57 | class Config(NamedTuple): 58 | dataset: DatasetConfig 59 | model: ModelConfig 60 | loss: LossConfig 61 | train: TrainConfig 62 | project: ProjectConfig 63 | 64 | def save_as_json(self, path): 65 | d = _namedtuple_to_dict(self) 66 | json.dump(d, open(path, 'w'), indent=2, sort_keys=True, default=_default_path) 67 | 68 | 69 | def _default_path(o): 70 | if isinstance(o, Path): 71 | return str(o) 72 | raise TypeError(repr(o) + " is not JSON serializable") 73 | 74 | 75 | def _namedtuple_to_dict(o: NamedTuple): 76 | return { 77 | k: v if not hasattr(v, '_asdict') else _namedtuple_to_dict(v) 78 | for k, v in o._asdict().items() 79 | } 80 | 81 | 82 | def create_from_json(s: Union[str, Path]): 83 | try: 84 | d = json.loads(s) 85 | except TypeError: 86 | d = json.load(open(s)) 87 | 88 | backward_compatible(d) 89 | 90 | return Config( 91 | dataset=DatasetConfig( 92 | param=Param(), 93 | input_glob=Path(d['dataset']['input_glob']), 94 | target_glob=Path(d['dataset']['target_glob']), 95 | input_mean_path=Path(d['dataset']['input_mean_path']), 96 | input_var_path=Path(d['dataset']['input_var_path']), 97 | target_mean_path=Path(d['dataset']['target_mean_path']), 98 | target_var_path=Path(d['dataset']['target_var_path']), 99 | features=d['dataset']['features'], 100 | train_crop_size=d['dataset']['train_crop_size'], 101 | input_global_noise=d['dataset']['input_global_noise'], 102 | input_local_noise=d['dataset']['input_local_noise'], 103 | target_global_noise=d['dataset']['target_global_noise'], 104 | target_local_noise=d['dataset']['target_local_noise'], 105 | seed=d['dataset']['seed'], 106 | num_test=d['dataset']['num_test'], 107 | ), 108 | model=ModelConfig( 109 | in_channels=d['model']['in_channels'], 110 | out_channels=d['model']['out_channels'], 111 | generator_base_channels=d['model']['generator_base_channels'], 112 | generator_extensive_layers=d['model']['generator_extensive_layers'], 113 | discriminator_base_channels=d['model']['discriminator_base_channels'], 114 | discriminator_extensive_layers=d['model']['discriminator_extensive_layers'], 115 | weak_discriminator=d['model']['weak_discriminator'], 116 | ), 117 | loss=LossConfig( 118 | mse=d['loss']['mse'], 119 | adversarial=d['loss']['adversarial'], 120 | ), 121 | train=TrainConfig( 122 | batchsize=d['train']['batchsize'], 123 | gpu=d['train']['gpu'], 124 | log_iteration=d['train']['log_iteration'], 125 | snapshot_iteration=d['train']['snapshot_iteration'], 126 | ), 127 | 
project=ProjectConfig( 128 | name=d['project']['name'], 129 | tags=d['project']['tags'], 130 | ) 131 | ) 132 | 133 | 134 | def backward_compatible(d: Dict): 135 | if 'input_global_noise' not in d['dataset']: 136 | d['dataset']['input_global_noise'] = d['dataset']['global_noise'] 137 | d['dataset']['input_local_noise'] = d['dataset']['local_noise'] 138 | 139 | if 'target_global_noise' not in d['dataset']: 140 | d['dataset']['target_global_noise'] = d['dataset']['global_noise'] 141 | d['dataset']['target_local_noise'] = d['dataset']['local_noise'] 142 | 143 | if 'generator_base_channels' not in d['model']: 144 | d['model']['generator_base_channels'] = 64 145 | d['model']['generator_extensive_layers'] = 8 146 | d['model']['discriminator_base_channels'] = 32 147 | d['model']['discriminator_extensive_layers'] = 5 148 | 149 | if 'weak_discriminator' not in d['model']: 150 | d['model']['weak_discriminator'] = False 151 | -------------------------------------------------------------------------------- /become_yukarin/super_resolution.py: -------------------------------------------------------------------------------- 1 | from functools import partial 2 | from pathlib import Path 3 | from typing import List 4 | 5 | import chainer 6 | import numpy 7 | import pyworld 8 | 9 | from become_yukarin.config.sr_config import SRConfig 10 | from become_yukarin.data_struct import AcousticFeature 11 | from become_yukarin.data_struct import Wave 12 | from become_yukarin.dataset.dataset import LowHighSpectrogramFeatureLoadProcess 13 | from become_yukarin.dataset.dataset import LowHighSpectrogramFeatureProcess 14 | from become_yukarin.dataset.dataset import WaveFileLoadProcess 15 | from become_yukarin.model.sr_model import create_predictor_sr 16 | 17 | 18 | class SuperResolution(object): 19 | def __init__(self, config: SRConfig, model_path: Path, gpu: int = None) -> None: 20 | self.config = config 21 | self.model_path = model_path 22 | self.gpu = gpu 23 | 24 | self.model = model = create_predictor_sr(config.model) 25 | chainer.serializers.load_npz(str(model_path), model) 26 | if self.gpu is not None: 27 | model.to_gpu(self.gpu) 28 | 29 | self._param = param = config.dataset.param 30 | self._wave_process = WaveFileLoadProcess( 31 | sample_rate=param.voice_param.sample_rate, 32 | top_db=None, 33 | ) 34 | self._low_high_spectrogram_process = LowHighSpectrogramFeatureProcess( 35 | frame_period=param.acoustic_feature_param.frame_period, 36 | order=param.acoustic_feature_param.order, 37 | alpha=param.acoustic_feature_param.alpha, 38 | f0_estimating_method=param.acoustic_feature_param.f0_estimating_method, 39 | ) 40 | self._low_high_spectrogram_load_process = LowHighSpectrogramFeatureLoadProcess( 41 | validate=True, 42 | ) 43 | 44 | def convert(self, input: numpy.ndarray) -> numpy.ndarray: 45 | converter = partial(chainer.dataset.convert.concat_examples, device=self.gpu, padding=0) 46 | pad = 128 - len(input) % 128 47 | input = numpy.pad(input, [(0, pad), (0, 0)], mode='minimum') 48 | input = numpy.log(input)[:, :-1] 49 | input = input[numpy.newaxis] 50 | inputs = converter([input]) 51 | 52 | with chainer.using_config('train', False): 53 | out = self.model(inputs).data[0] 54 | 55 | if self.gpu is not None: 56 | out = chainer.cuda.to_cpu(out) 57 | 58 | out = out[0] 59 | out = numpy.pad(out, [(0, 0), (0, 1)], mode='edge') 60 | out = numpy.exp(out) 61 | out = out[:-pad] 62 | return out 63 | 64 | def convert_loop(self, input: numpy.ndarray, n_len: int = 512, n_wrap: int = 128): 65 | out_feature_list: List[AcousticFeature] = 
[] 66 | N = len(input) 67 | for i in numpy.arange(0, int(numpy.ceil(N / n_len))): 68 | # convert with overwrapped 69 | start = i * n_len 70 | mi = max(start - n_wrap, 0) 71 | ma = min(start + n_len + n_wrap, N) 72 | f = input[numpy.arange(mi, ma)] 73 | o_warp = self.convert(f) 74 | 75 | # eliminate overwrap 76 | ex_mi = start - mi 77 | ex_len = min(ma - start, n_len) 78 | o = o_warp[numpy.arange(ex_mi, ex_mi + ex_len)] 79 | out_feature_list.append(o) 80 | return numpy.concatenate(out_feature_list) 81 | 82 | def convert_to_feature( 83 | self, 84 | spectrogram: numpy.ndarray, 85 | acoustic_feature: AcousticFeature, 86 | ): 87 | acoustic_feature = acoustic_feature.astype_only_float(numpy.float64) 88 | f_out = AcousticFeature( 89 | f0=acoustic_feature.f0, 90 | spectrogram=spectrogram.astype(numpy.float64), 91 | aperiodicity=acoustic_feature.aperiodicity, 92 | mfcc=acoustic_feature.mfcc, 93 | voiced=acoustic_feature.voiced, 94 | ) 95 | return f_out 96 | 97 | def convert_to_audio( 98 | self, 99 | input: numpy.ndarray, 100 | acoustic_feature: AcousticFeature, 101 | sampling_rate: int, 102 | ): 103 | acoustic_feature = acoustic_feature.astype_only_float(numpy.float64) 104 | out = pyworld.synthesize( 105 | f0=acoustic_feature.f0.ravel(), 106 | spectrogram=input.astype(numpy.float64), 107 | aperiodicity=acoustic_feature.aperiodicity, 108 | fs=sampling_rate, 109 | frame_period=self._param.acoustic_feature_param.frame_period, 110 | ) 111 | return Wave(out, sampling_rate=sampling_rate) 112 | 113 | def convert_from_audio_path(self, input: Path): 114 | wave = self._wave_process(str(input), test=True) 115 | feature = self._low_high_spectrogram_process(wave, test=True) 116 | return self.convert(feature.low) 117 | 118 | def convert_from_feature_path(self, input: Path): 119 | feature = self._low_high_spectrogram_load_process(input, test=True) 120 | return self.convert(feature.low) 121 | 122 | def __call__( 123 | self, 124 | input: numpy.ndarray, 125 | acoustic_feature: AcousticFeature, 126 | sampling_rate: int, 127 | ): 128 | high = self.convert(input) 129 | return self.convert_to_audio(high, acoustic_feature=acoustic_feature, sampling_rate=sampling_rate) 130 | -------------------------------------------------------------------------------- /become_yukarin/acoustic_converter.py: -------------------------------------------------------------------------------- 1 | from functools import partial 2 | from pathlib import Path 3 | from typing import Optional 4 | 5 | import chainer 6 | import numpy 7 | import pysptk 8 | import pyworld 9 | 10 | from become_yukarin.config.config import Config 11 | from become_yukarin.data_struct import AcousticFeature 12 | from become_yukarin.data_struct import Wave 13 | from become_yukarin.dataset.dataset import AcousticFeatureDenormalizeProcess 14 | from become_yukarin.dataset.dataset import AcousticFeatureLoadProcess 15 | from become_yukarin.dataset.dataset import AcousticFeatureNormalizeProcess 16 | from become_yukarin.dataset.dataset import AcousticFeatureProcess 17 | from become_yukarin.dataset.dataset import DecodeFeatureProcess 18 | from become_yukarin.dataset.dataset import EncodeFeatureProcess 19 | from become_yukarin.dataset.dataset import WaveFileLoadProcess 20 | from become_yukarin.model.model import create_predictor 21 | 22 | 23 | class AcousticConverter(object): 24 | def __init__(self, config: Config, model_path: Path, gpu: int = None) -> None: 25 | self.config = config 26 | self.model_path = model_path 27 | self.gpu = gpu 28 | 29 | self.model = model = 
create_predictor(config.model) 30 | chainer.serializers.load_npz(str(model_path), model) 31 | if self.gpu is not None: 32 | model.to_gpu(self.gpu) 33 | 34 | self._param = param = config.dataset.param 35 | self._wave_process = WaveFileLoadProcess( 36 | sample_rate=param.voice_param.sample_rate, 37 | top_db=None, 38 | ) 39 | self._feature_process = AcousticFeatureProcess( 40 | frame_period=param.acoustic_feature_param.frame_period, 41 | order=param.acoustic_feature_param.order, 42 | alpha=param.acoustic_feature_param.alpha, 43 | f0_estimating_method=param.acoustic_feature_param.f0_estimating_method, 44 | ) 45 | 46 | self._acoustic_feature_load_process = acoustic_feature_load_process = AcousticFeatureLoadProcess() 47 | 48 | input_mean = acoustic_feature_load_process(config.dataset.input_mean_path, test=True) 49 | input_var = acoustic_feature_load_process(config.dataset.input_var_path, test=True) 50 | target_mean = acoustic_feature_load_process(config.dataset.target_mean_path, test=True) 51 | target_var = acoustic_feature_load_process(config.dataset.target_var_path, test=True) 52 | self._feature_normalize = AcousticFeatureNormalizeProcess( 53 | mean=input_mean, 54 | var=input_var, 55 | ) 56 | self._feature_denormalize = AcousticFeatureDenormalizeProcess( 57 | mean=target_mean, 58 | var=target_var, 59 | ) 60 | 61 | feature_sizes = AcousticFeature.get_sizes( 62 | sampling_rate=param.voice_param.sample_rate, 63 | order=param.acoustic_feature_param.order, 64 | ) 65 | self._encode_feature = EncodeFeatureProcess(config.dataset.features) 66 | self._decode_feature = DecodeFeatureProcess(config.dataset.features, feature_sizes) 67 | 68 | def convert_to_feature(self, input: AcousticFeature, out_sampling_rate: Optional[int] = None): 69 | if out_sampling_rate is None: 70 | out_sampling_rate = self.config.dataset.param.voice_param.sample_rate 71 | 72 | input_feature = input 73 | input = self._feature_normalize(input, test=True) 74 | input = self._encode_feature(input, test=True) 75 | 76 | pad = 128 - input.shape[1] % 128 77 | input = numpy.pad(input, [(0, 0), (0, pad)], mode='minimum') 78 | 79 | converter = partial(chainer.dataset.convert.concat_examples, device=self.gpu, padding=0) 80 | inputs = converter([input]) 81 | 82 | with chainer.using_config('train', False): 83 | out = self.model(inputs).data[0] 84 | 85 | if self.gpu is not None: 86 | out = chainer.cuda.to_cpu(out) 87 | out = out[:, :-pad] 88 | 89 | out = self._decode_feature(out, test=True) 90 | out = AcousticFeature( 91 | f0=out.f0, 92 | spectrogram=out.spectrogram, 93 | aperiodicity=out.aperiodicity, 94 | mfcc=out.mfcc, 95 | voiced=input_feature.voiced, 96 | ) 97 | out = self._feature_denormalize(out, test=True) 98 | out = AcousticFeature( 99 | f0=out.f0, 100 | spectrogram=out.spectrogram, 101 | aperiodicity=input_feature.aperiodicity, 102 | mfcc=out.mfcc, 103 | voiced=out.voiced, 104 | ) 105 | 106 | fftlen = pyworld.get_cheaptrick_fft_size(out_sampling_rate) 107 | spectrogram = pysptk.mc2sp( 108 | out.mfcc, 109 | alpha=self._param.acoustic_feature_param.alpha, 110 | fftlen=fftlen, 111 | ) 112 | 113 | out = AcousticFeature( 114 | f0=out.f0, 115 | spectrogram=spectrogram, 116 | aperiodicity=out.aperiodicity, 117 | mfcc=out.mfcc, 118 | voiced=out.voiced, 119 | ).astype(numpy.float64) 120 | return out 121 | 122 | def convert_from_audio_path(self, path: Path, out_sampling_rate: Optional[int] = None): 123 | wave = self._wave_process(str(path), test=True) 124 | feature = self._feature_process(wave, test=True) 125 | return 
self.convert_from_feature(feature, out_sampling_rate) 126 | 127 | def convert_from_feature_path(self, path: Path, out_sampling_rate: Optional[int] = None): 128 | feature = self._acoustic_feature_load_process(path, test=True) 129 | return self.convert_from_feature(feature, out_sampling_rate) 130 | 131 | def convert_from_feature(self, input: AcousticFeature, out_sampling_rate: Optional[int] = None): 132 | if out_sampling_rate is None: 133 | out_sampling_rate = self.config.dataset.param.voice_param.sample_rate 134 | 135 | out = self.convert_to_feature(input=input, out_sampling_rate=out_sampling_rate) 136 | out = pyworld.synthesize( 137 | f0=out.f0.ravel(), 138 | spectrogram=out.spectrogram, 139 | aperiodicity=out.aperiodicity, 140 | fs=out_sampling_rate, 141 | frame_period=self._param.acoustic_feature_param.frame_period, 142 | ) 143 | return Wave(out, sampling_rate=out_sampling_rate) 144 | 145 | def __call__(self, voice_path: Path, out_sampling_rate: Optional[int] = None): 146 | return self.convert_from_audio_path(voice_path, out_sampling_rate) 147 | -------------------------------------------------------------------------------- /become_yukarin/model/sr_model.py: -------------------------------------------------------------------------------- 1 | import chainer 2 | import chainer.functions as F 3 | import chainer.links as L 4 | 5 | from become_yukarin.config.sr_config import SRModelConfig 6 | 7 | 8 | class CBR(chainer.Chain): 9 | def __init__(self, ch0, ch1, bn=True, sample='down', activation=F.relu, dropout=False) -> None: 10 | super().__init__() 11 | self.bn = bn 12 | self.activation = activation 13 | self.dropout = dropout 14 | 15 | w = chainer.initializers.Normal(0.02) 16 | with self.init_scope(): 17 | if sample == 'down': 18 | self.c = L.Convolution2D(ch0, ch1, 4, 2, 1, initialW=w) 19 | elif sample == 'up': 20 | self.c = L.Deconvolution2D(ch0, ch1, 4, 2, 1, initialW=w) 21 | else: 22 | self.c = L.Convolution2D(ch0, ch1, 1, 1, 0, initialW=w) 23 | if bn: 24 | self.batchnorm = L.BatchNormalization(ch1) 25 | 26 | def __call__(self, x): 27 | h = self.c(x) 28 | if self.bn: 29 | h = self.batchnorm(h) 30 | if self.dropout: 31 | h = F.dropout(h) 32 | if self.activation is not None: 33 | h = self.activation(h) 34 | return h 35 | 36 | 37 | class SREncoder(chainer.Chain): 38 | def __init__(self, in_ch, base=64, extensive_layers=8) -> None: 39 | super().__init__() 40 | w = chainer.initializers.Normal(0.02) 41 | with self.init_scope(): 42 | if extensive_layers > 0: 43 | self.c0 = L.Convolution2D(in_ch, base * 1, 3, 1, 1, initialW=w) 44 | else: 45 | self.c0 = L.Convolution2D(in_ch, base * 1, 1, 1, 0, initialW=w) 46 | 47 | _choose = lambda i: 'down' if i < extensive_layers else 'same' 48 | self.c1 = CBR(base * 1, base * 2, bn=True, sample=_choose(1), activation=F.leaky_relu, dropout=False) 49 | self.c2 = CBR(base * 2, base * 4, bn=True, sample=_choose(2), activation=F.leaky_relu, dropout=False) 50 | self.c3 = CBR(base * 4, base * 8, bn=True, sample=_choose(3), activation=F.leaky_relu, dropout=False) 51 | self.c4 = CBR(base * 8, base * 8, bn=True, sample=_choose(4), activation=F.leaky_relu, dropout=False) 52 | self.c5 = CBR(base * 8, base * 8, bn=True, sample=_choose(5), activation=F.leaky_relu, dropout=False) 53 | self.c6 = CBR(base * 8, base * 8, bn=True, sample=_choose(6), activation=F.leaky_relu, dropout=False) 54 | self.c7 = CBR(base * 8, base * 8, bn=True, sample=_choose(7), activation=F.leaky_relu, dropout=False) 55 | 56 | def __call__(self, x): 57 | hs = [F.leaky_relu(self.c0(x))] 58 | for i in 
range(1, 8): 59 | hs.append(self['c%d' % i](hs[i - 1])) 60 | return hs 61 | 62 | 63 | class SRDecoder(chainer.Chain): 64 | def __init__(self, out_ch, base=64, extensive_layers=8) -> None: 65 | super().__init__() 66 | w = chainer.initializers.Normal(0.02) 67 | with self.init_scope(): 68 | _choose = lambda i: 'up' if i >= 8 - extensive_layers else 'same' 69 | self.c0 = CBR(base * 8, base * 8, bn=True, sample=_choose(0), activation=F.relu, dropout=True) 70 | self.c1 = CBR(base * 16, base * 8, bn=True, sample=_choose(1), activation=F.relu, dropout=True) 71 | self.c2 = CBR(base * 16, base * 8, bn=True, sample=_choose(2), activation=F.relu, dropout=True) 72 | self.c3 = CBR(base * 16, base * 8, bn=True, sample=_choose(3), activation=F.relu, dropout=False) 73 | self.c4 = CBR(base * 16, base * 4, bn=True, sample=_choose(4), activation=F.relu, dropout=False) 74 | self.c5 = CBR(base * 8, base * 2, bn=True, sample=_choose(5), activation=F.relu, dropout=False) 75 | self.c6 = CBR(base * 4, base * 1, bn=True, sample=_choose(6), activation=F.relu, dropout=False) 76 | 77 | if extensive_layers > 0: 78 | self.c7 = L.Convolution2D(base * 2, out_ch, 3, 1, 1, initialW=w) 79 | else: 80 | self.c7 = L.Convolution2D(base * 2, out_ch, 1, 1, 0, initialW=w) 81 | 82 | def __call__(self, hs): 83 | h = self.c0(hs[-1]) 84 | for i in range(1, 8): 85 | h = F.concat([h, hs[-i - 1]]) 86 | if i < 7: 87 | h = self['c%d' % i](h) 88 | else: 89 | h = self.c7(h) 90 | return h 91 | 92 | 93 | class SRPredictor(chainer.Chain): 94 | def __init__(self, in_ch, out_ch, base, extensive_layers) -> None: 95 | super().__init__() 96 | with self.init_scope(): 97 | self.encoder = SREncoder(in_ch, base=base, extensive_layers=extensive_layers) 98 | self.decoder = SRDecoder(out_ch, base=base, extensive_layers=extensive_layers) 99 | 100 | def __call__(self, x): 101 | return self.decoder(self.encoder(x)) 102 | 103 | 104 | class SRDiscriminator(chainer.Chain): 105 | def __init__(self, in_ch, out_ch, base=32, extensive_layers=5) -> None: 106 | super().__init__() 107 | w = chainer.initializers.Normal(0.02) 108 | with self.init_scope(): 109 | _choose = lambda i: 'down' if i < extensive_layers else 'same' 110 | self.c0_0 = CBR(in_ch, base * 1, bn=False, sample=_choose(0), activation=F.leaky_relu, dropout=False) 111 | self.c0_1 = CBR(out_ch, base * 1, bn=False, sample=_choose(0), activation=F.leaky_relu, dropout=False) 112 | self.c1 = CBR(base * 2, base * 4, bn=True, sample=_choose(1), activation=F.leaky_relu, dropout=False) 113 | self.c2 = CBR(base * 4, base * 8, bn=True, sample=_choose(2), activation=F.leaky_relu, dropout=False) 114 | self.c3 = CBR(base * 8, base * 16, bn=True, sample=_choose(3), activation=F.leaky_relu, dropout=False) 115 | 116 | if extensive_layers > 4: 117 | self.c4 = L.Convolution2D(base * 16, 1, 3, 1, 1, initialW=w) 118 | else: 119 | self.c4 = L.Convolution2D(base * 16, 1, 1, 1, 0, initialW=w) 120 | 121 | def __call__(self, x_0, x_1): 122 | h = F.concat([self.c0_0(x_0), self.c0_1(x_1)]) 123 | h = self.c1(h) 124 | h = self.c2(h) 125 | h = self.c3(h) 126 | h = self.c4(h) 127 | # h = F.average_pooling_2d(h, h.data.shape[2], 1, 0) 128 | return h 129 | 130 | 131 | def create_predictor_sr(config: SRModelConfig): 132 | return SRPredictor( 133 | in_ch=1, 134 | out_ch=1, 135 | base=config.generator_base_channels, 136 | extensive_layers=config.generator_extensive_layers, 137 | ) 138 | 139 | 140 | def create_discriminator_sr(config: SRModelConfig): 141 | return SRDiscriminator( 142 | in_ch=1, 143 | out_ch=1, 144 | 
base=config.discriminator_base_channels, 145 | extensive_layers=config.discriminator_extensive_layers, 146 | ) 147 | 148 | 149 | def create_sr(config: SRModelConfig): 150 | predictor = create_predictor_sr(config) 151 | discriminator = create_discriminator_sr(config) 152 | return predictor, discriminator 153 | -------------------------------------------------------------------------------- /become_yukarin/model/model.py: -------------------------------------------------------------------------------- 1 | import chainer 2 | import chainer.functions as F 3 | import chainer.links as L 4 | 5 | from become_yukarin.config.config import ModelConfig 6 | 7 | 8 | class Convolution1D(chainer.links.ConvolutionND): 9 | def __init__(self, in_channels, out_channels, ksize, stride=1, pad=0, 10 | nobias=False, initialW=None, initial_bias=None, 11 | cover_all=False) -> None: 12 | super().__init__( 13 | ndim=1, 14 | in_channels=in_channels, 15 | out_channels=out_channels, 16 | ksize=ksize, 17 | stride=stride, 18 | pad=pad, 19 | nobias=nobias, 20 | initialW=initialW, 21 | initial_bias=initial_bias, 22 | cover_all=cover_all, 23 | ) 24 | 25 | 26 | class Deconvolution1D(chainer.links.DeconvolutionND): 27 | def __init__(self, in_channels, out_channels, ksize, stride=1, pad=0, 28 | nobias=False, outsize=None, 29 | initialW=None, initial_bias=None) -> None: 30 | super().__init__( 31 | ndim=1, 32 | in_channels=in_channels, 33 | out_channels=out_channels, 34 | ksize=ksize, 35 | stride=stride, 36 | pad=pad, 37 | nobias=nobias, 38 | outsize=outsize, 39 | initialW=initialW, 40 | initial_bias=initial_bias, 41 | ) 42 | 43 | 44 | class CBR(chainer.Chain): 45 | def __init__(self, ch0, ch1, bn=True, sample='down', activation=F.relu, dropout=False) -> None: 46 | super().__init__() 47 | self.bn = bn 48 | self.activation = activation 49 | self.dropout = dropout 50 | 51 | w = chainer.initializers.Normal(0.02) 52 | with self.init_scope(): 53 | if sample == 'down': 54 | self.c = Convolution1D(ch0, ch1, 4, 2, 1, initialW=w) 55 | elif sample == 'up': 56 | self.c = Deconvolution1D(ch0, ch1, 4, 2, 1, initialW=w) 57 | else: 58 | self.c = Convolution1D(ch0, ch1, 1, 1, 0, initialW=w) 59 | if bn: 60 | self.batchnorm = L.BatchNormalization(ch1) 61 | 62 | def __call__(self, x): 63 | h = self.c(x) 64 | if self.bn: 65 | h = self.batchnorm(h) 66 | if self.dropout: 67 | h = F.dropout(h) 68 | if self.activation is not None: 69 | h = self.activation(h) 70 | return h 71 | 72 | 73 | class Encoder(chainer.Chain): 74 | def __init__(self, in_ch, base=64, extensive_layers=8) -> None: 75 | super().__init__() 76 | w = chainer.initializers.Normal(0.02) 77 | with self.init_scope(): 78 | if extensive_layers > 0: 79 | self.c0 = Convolution1D(in_ch, base * 1, 3, 1, 1, initialW=w) 80 | else: 81 | self.c0 = Convolution1D(in_ch, base * 1, 1, 1, 0, initialW=w) 82 | 83 | _choose = lambda i: 'down' if i < extensive_layers else 'same' 84 | self.c1 = CBR(base * 1, base * 2, bn=True, sample=_choose(1), activation=F.leaky_relu, dropout=False) 85 | self.c2 = CBR(base * 2, base * 4, bn=True, sample=_choose(2), activation=F.leaky_relu, dropout=False) 86 | self.c3 = CBR(base * 4, base * 8, bn=True, sample=_choose(3), activation=F.leaky_relu, dropout=False) 87 | self.c4 = CBR(base * 8, base * 8, bn=True, sample=_choose(4), activation=F.leaky_relu, dropout=False) 88 | self.c5 = CBR(base * 8, base * 8, bn=True, sample=_choose(5), activation=F.leaky_relu, dropout=False) 89 | self.c6 = CBR(base * 8, base * 8, bn=True, sample=_choose(6), activation=F.leaky_relu, dropout=False) 90 
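# Each 'down' CBR halves the time axis, so with extensive_layers=8 the seven
# downsampling blocks compress the input by a factor of 128 (which is why the
# converters pad inputs to a multiple of 128); __call__ keeps every intermediate
# activation so the Decoder can concatenate them back in reverse order as U-Net
# style skip connections, hence the doubled input channels on its CBR blocks.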
| self.c7 = CBR(base * 8, base * 8, bn=True, sample=_choose(7), activation=F.leaky_relu, dropout=False) 91 | 92 | def __call__(self, x): 93 | hs = [F.leaky_relu(self.c0(x))] 94 | for i in range(1, 8): 95 | hs.append(self['c%d' % i](hs[i - 1])) 96 | return hs 97 | 98 | 99 | class Decoder(chainer.Chain): 100 | def __init__(self, out_ch, base=64, extensive_layers=8) -> None: 101 | super().__init__() 102 | w = chainer.initializers.Normal(0.02) 103 | with self.init_scope(): 104 | _choose = lambda i: 'up' if i >= 8 - extensive_layers else 'same' 105 | self.c0 = CBR(base * 8, base * 8, bn=True, sample=_choose(0), activation=F.relu, dropout=True) 106 | self.c1 = CBR(base * 16, base * 8, bn=True, sample=_choose(1), activation=F.relu, dropout=True) 107 | self.c2 = CBR(base * 16, base * 8, bn=True, sample=_choose(2), activation=F.relu, dropout=True) 108 | self.c3 = CBR(base * 16, base * 8, bn=True, sample=_choose(3), activation=F.relu, dropout=False) 109 | self.c4 = CBR(base * 16, base * 4, bn=True, sample=_choose(4), activation=F.relu, dropout=False) 110 | self.c5 = CBR(base * 8, base * 2, bn=True, sample=_choose(5), activation=F.relu, dropout=False) 111 | self.c6 = CBR(base * 4, base * 1, bn=True, sample=_choose(6), activation=F.relu, dropout=False) 112 | 113 | if extensive_layers > 0: 114 | self.c7 = Convolution1D(base * 2, out_ch, 3, 1, 1, initialW=w) 115 | else: 116 | self.c7 = Convolution1D(base * 2, out_ch, 1, 1, 0, initialW=w) 117 | 118 | def __call__(self, hs): 119 | h = self.c0(hs[-1]) 120 | for i in range(1, 8): 121 | h = F.concat([h, hs[-i - 1]]) 122 | if i < 7: 123 | h = self['c%d' % i](h) 124 | else: 125 | h = self.c7(h) 126 | return h 127 | 128 | 129 | class Predictor(chainer.Chain): 130 | def __init__(self, in_ch, out_ch, base=64, extensive_layers=8) -> None: 131 | super().__init__() 132 | with self.init_scope(): 133 | self.encoder = Encoder(in_ch, base=base, extensive_layers=extensive_layers) 134 | self.decoder = Decoder(out_ch, base=base, extensive_layers=extensive_layers) 135 | 136 | def __call__(self, x): 137 | return self.decoder(self.encoder(x)) 138 | 139 | 140 | class Discriminator(chainer.Chain): 141 | def __init__(self, in_ch, out_ch, base=32, extensive_layers=5, is_weak=False) -> None: 142 | super().__init__() 143 | w = chainer.initializers.Normal(0.02) 144 | with self.init_scope(): 145 | _choose = lambda i: 'down' if i < extensive_layers else 'same' 146 | self.c0_0 = CBR(in_ch, base * 1, bn=False, sample=_choose(0), activation=F.leaky_relu, dropout=is_weak) 147 | self.c0_1 = CBR(out_ch, base * 1, bn=False, sample=_choose(0), activation=F.leaky_relu, dropout=is_weak) 148 | self.c1 = CBR(base * 2, base * 4, bn=True, sample=_choose(1), activation=F.leaky_relu, dropout=is_weak) 149 | self.c2 = CBR(base * 4, base * 8, bn=True, sample=_choose(2), activation=F.leaky_relu, dropout=is_weak) 150 | self.c3 = CBR(base * 8, base * 16, bn=True, sample=_choose(3), activation=F.leaky_relu, dropout=is_weak) 151 | 152 | if extensive_layers > 4: 153 | self.c4 = Convolution1D(base * 16, 1, 3, 1, 1, initialW=w) 154 | else: 155 | self.c4 = Convolution1D(base * 16, 1, 1, 1, 0, initialW=w) 156 | 157 | def __call__(self, x_0, x_1): 158 | h = F.concat([self.c0_0(x_0), self.c0_1(x_1)]) 159 | h = self.c1(h) 160 | h = self.c2(h) 161 | h = self.c3(h) 162 | h = self.c4(h) 163 | # h = F.average_pooling_2d(h, h.data.shape[2], 1, 0) 164 | return h 165 | 166 | 167 | def create_predictor(config: ModelConfig): 168 | return Predictor( 169 | in_ch=config.in_channels, 170 | out_ch=config.out_channels, 171 | 
base=config.generator_base_channels, 172 | extensive_layers=config.generator_extensive_layers, 173 | ) 174 | 175 | 176 | def create_discriminator(config: ModelConfig): 177 | return Discriminator( 178 | in_ch=config.in_channels, 179 | out_ch=config.out_channels, 180 | base=config.discriminator_base_channels, 181 | extensive_layers=config.discriminator_extensive_layers, 182 | is_weak=config.weak_discriminator, 183 | ) 184 | 185 | 186 | def create(config: ModelConfig): 187 | predictor = create_predictor(config) 188 | discriminator = create_discriminator(config) 189 | return predictor, discriminator 190 | -------------------------------------------------------------------------------- /scripts/extract_acoustic_feature.py: -------------------------------------------------------------------------------- 1 | """ 2 | extract alignments voices. 3 | """ 4 | 5 | import argparse 6 | import multiprocessing 7 | from pathlib import Path 8 | from pprint import pprint 9 | 10 | import numpy 11 | 12 | from become_yukarin.acoustic_converter import AcousticConverter 13 | from become_yukarin.config.config import create_from_json as create_config 14 | from become_yukarin.data_struct import AcousticFeature 15 | from become_yukarin.dataset.dataset import AcousticFeatureLoadProcess 16 | from become_yukarin.dataset.dataset import AcousticFeatureProcess 17 | from become_yukarin.dataset.dataset import AcousticFeatureSaveProcess 18 | from become_yukarin.dataset.dataset import WaveFileLoadProcess 19 | from become_yukarin.dataset.utility import MelCepstrumAligner 20 | from become_yukarin.param import AcousticFeatureParam 21 | from become_yukarin.param import VoiceParam 22 | 23 | base_voice_param = VoiceParam() 24 | base_acoustic_feature_param = AcousticFeatureParam() 25 | 26 | parser = argparse.ArgumentParser() 27 | parser.add_argument('--input1_directory', '-i1', type=Path) 28 | parser.add_argument('--input2_directory', '-i2', type=Path) 29 | parser.add_argument('--output1_directory', '-o1', type=Path) 30 | parser.add_argument('--output2_directory', '-o2', type=Path) 31 | parser.add_argument('--pre_converter1_config', type=Path) 32 | parser.add_argument('--pre_converter1_model', type=Path) 33 | parser.add_argument('--sample_rate', type=int, default=base_voice_param.sample_rate) 34 | parser.add_argument('--top_db', type=float, default=base_voice_param.top_db) 35 | parser.add_argument('--pad_second', type=float, default=base_voice_param.pad_second) 36 | parser.add_argument('--frame_period', type=int, default=base_acoustic_feature_param.frame_period) 37 | parser.add_argument('--order', type=int, default=base_acoustic_feature_param.order) 38 | parser.add_argument('--alpha', type=float, default=base_acoustic_feature_param.alpha) 39 | parser.add_argument('--f0_estimating_method', type=str, default=base_acoustic_feature_param.f0_estimating_method) 40 | parser.add_argument('--f0_floor1', type=float, default=71) 41 | parser.add_argument('--f0_ceil1', type=float, default=800) 42 | parser.add_argument('--f0_floor2', type=float, default=71) 43 | parser.add_argument('--f0_ceil2', type=float, default=800) 44 | parser.add_argument('--ignore_feature', nargs='+', default=['spectrogram', 'aperiodicity']) 45 | parser.add_argument('--disable_alignment', action='store_true') 46 | parser.add_argument('--enable_overwrite', action='store_true') 47 | arguments = parser.parse_args() 48 | 49 | pre_convert = arguments.pre_converter1_config is not None 50 | if pre_convert: 51 | config = create_config(arguments.pre_converter1_config) 52 | 
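# A minimal invocation sketch for this script (the directory names below are
# placeholders; only the flags defined above are real):
#
#     python scripts/extract_acoustic_feature.py \
#         -i1 ./wav_input_speaker -i2 ./wav_target_speaker \
#         -o1 ./feature_input_speaker -o2 ./feature_target_speaker \
#         --pad_second 0.2 --top_db 60
#
# Besides one .npy per utterance, each output directory also receives mean.npy and
# var.npy computed by generate_mean_var() below.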
pre_converter1 = AcousticConverter(config, arguments.pre_converter1_model) 53 | else: 54 | pre_converter1 = None 55 | 56 | 57 | def generate_feature(path1, path2): 58 | out1 = Path(arguments.output1_directory, path1.stem + '.npy') 59 | out2 = Path(arguments.output2_directory, path2.stem + '.npy') 60 | if out1.exists() and out2.exists() and not arguments.enable_overwrite: 61 | return 62 | 63 | # load wave and padding 64 | wave_file_load_process = WaveFileLoadProcess( 65 | sample_rate=arguments.sample_rate, 66 | top_db=arguments.top_db, 67 | pad_second=arguments.pad_second, 68 | ) 69 | wave1 = wave_file_load_process(path1, test=True) 70 | wave2 = wave_file_load_process(path2, test=True) 71 | 72 | # make acoustic feature 73 | acoustic_feature_process1 = AcousticFeatureProcess( 74 | frame_period=arguments.frame_period, 75 | order=arguments.order, 76 | alpha=arguments.alpha, 77 | f0_estimating_method=arguments.f0_estimating_method, 78 | f0_floor=arguments.f0_floor1, 79 | f0_ceil=arguments.f0_ceil1, 80 | ) 81 | acoustic_feature_process2 = AcousticFeatureProcess( 82 | frame_period=arguments.frame_period, 83 | order=arguments.order, 84 | alpha=arguments.alpha, 85 | f0_estimating_method=arguments.f0_estimating_method, 86 | f0_floor=arguments.f0_floor2, 87 | f0_ceil=arguments.f0_ceil2, 88 | ) 89 | f1 = acoustic_feature_process1(wave1, test=True).astype_only_float(numpy.float32) 90 | f2 = acoustic_feature_process2(wave2, test=True).astype_only_float(numpy.float32) 91 | 92 | # pre convert 93 | if pre_convert: 94 | f1_ref = pre_converter1.convert_to_feature(f1) 95 | else: 96 | f1_ref = f1 97 | 98 | # alignment 99 | if not arguments.disable_alignment: 100 | aligner = MelCepstrumAligner(f1_ref.mfcc, f2.mfcc) 101 | 102 | f0_1, f0_2 = aligner.align(f1.f0, f2.f0) 103 | spectrogram_1, spectrogram_2 = aligner.align(f1.spectrogram, f2.spectrogram) 104 | aperiodicity_1, aperiodicity_2 = aligner.align(f1.aperiodicity, f2.aperiodicity) 105 | mfcc_1, mfcc_2 = aligner.align(f1.mfcc, f2.mfcc) 106 | voiced_1, voiced_2 = aligner.align(f1.voiced, f2.voiced) 107 | 108 | f1 = AcousticFeature( 109 | f0=f0_1, 110 | spectrogram=spectrogram_1, 111 | aperiodicity=aperiodicity_1, 112 | mfcc=mfcc_1, 113 | voiced=voiced_1, 114 | ) 115 | f2 = AcousticFeature( 116 | f0=f0_2, 117 | spectrogram=spectrogram_2, 118 | aperiodicity=aperiodicity_2, 119 | mfcc=mfcc_2, 120 | voiced=voiced_2, 121 | ) 122 | 123 | f1.validate() 124 | f2.validate() 125 | 126 | # save 127 | acoustic_feature_save_process = AcousticFeatureSaveProcess(validate=True, ignore=arguments.ignore_feature) 128 | acoustic_feature_save_process({'path': out1, 'feature': f1}) 129 | print('saved!', out1) 130 | 131 | acoustic_feature_save_process({'path': out2, 'feature': f2}) 132 | print('saved!', out2) 133 | 134 | 135 | def generate_mean_var(path_directory: Path): 136 | path_mean = Path(path_directory, 'mean.npy') 137 | path_var = Path(path_directory, 'var.npy') 138 | if path_mean.exists(): 139 | path_mean.unlink() 140 | if path_var.exists(): 141 | path_var.unlink() 142 | 143 | acoustic_feature_load_process = AcousticFeatureLoadProcess(validate=False) 144 | acoustic_feature_save_process = AcousticFeatureSaveProcess(validate=False) 145 | 146 | f0_list = [] 147 | spectrogram_list = [] 148 | aperiodicity_list = [] 149 | mfcc_list = [] 150 | for path in path_directory.glob('*'): 151 | feature = acoustic_feature_load_process(path) 152 | f0_list.append(feature.f0[feature.voiced]) # remove unvoiced 153 | spectrogram_list.append(feature.spectrogram) 154 | 
aperiodicity_list.append(feature.aperiodicity) 155 | mfcc_list.append(feature.mfcc) 156 | 157 | def concatenate(arr_list): 158 | try: 159 | arr_list = numpy.concatenate(arr_list) 160 | except: 161 | pass 162 | return arr_list 163 | 164 | f0_list = concatenate(f0_list) 165 | spectrogram_list = concatenate(spectrogram_list) 166 | aperiodicity_list = concatenate(aperiodicity_list) 167 | mfcc_list = concatenate(mfcc_list) 168 | 169 | mean = AcousticFeature( 170 | f0=numpy.mean(f0_list, axis=0, keepdims=True), 171 | spectrogram=numpy.mean(spectrogram_list, axis=0, keepdims=True), 172 | aperiodicity=numpy.mean(aperiodicity_list, axis=0, keepdims=True), 173 | mfcc=numpy.mean(mfcc_list, axis=0, keepdims=True), 174 | voiced=numpy.nan, 175 | ) 176 | var = AcousticFeature( 177 | f0=numpy.var(f0_list, axis=0, keepdims=True), 178 | spectrogram=numpy.var(spectrogram_list, axis=0, keepdims=True), 179 | aperiodicity=numpy.var(aperiodicity_list, axis=0, keepdims=True), 180 | mfcc=numpy.var(mfcc_list, axis=0, keepdims=True), 181 | voiced=numpy.nan, 182 | ) 183 | 184 | acoustic_feature_save_process({'path': path_mean, 'feature': mean}) 185 | acoustic_feature_save_process({'path': path_var, 'feature': var}) 186 | 187 | 188 | def main(): 189 | pprint(vars(arguments)) 190 | 191 | paths1 = list(sorted(arguments.input1_directory.glob('*'))) 192 | paths2 = list(sorted(arguments.input2_directory.glob('*'))) 193 | assert len(paths1) == len(paths2) 194 | 195 | arguments.output1_directory.mkdir(exist_ok=True) 196 | arguments.output2_directory.mkdir(exist_ok=True) 197 | 198 | pool = multiprocessing.Pool() 199 | pool.starmap(generate_feature, zip(paths1, paths2), chunksize=16) 200 | 201 | generate_mean_var(arguments.output1_directory) 202 | generate_mean_var(arguments.output2_directory) 203 | 204 | 205 | if __name__ == '__main__': 206 | main() 207 | -------------------------------------------------------------------------------- /become_yukarin/model/cbhg_model.py: -------------------------------------------------------------------------------- 1 | from functools import partial 2 | from typing import List 3 | 4 | import chainer 5 | 6 | from become_yukarin.config.old_config import CBHGDiscriminatorModelConfig 7 | from become_yukarin.config.old_config import CBHGModelConfig 8 | 9 | 10 | class Convolution1D(chainer.links.ConvolutionND): 11 | def __init__(self, in_channels, out_channels, ksize, stride=1, pad=0, 12 | nobias=False, initialW=None, initial_bias=None, 13 | cover_all=False): 14 | super().__init__( 15 | ndim=1, 16 | in_channels=in_channels, 17 | out_channels=out_channels, 18 | ksize=ksize, 19 | stride=stride, 20 | pad=pad, 21 | nobias=nobias, 22 | initialW=initialW, 23 | initial_bias=initial_bias, 24 | cover_all=cover_all, 25 | ) 26 | 27 | 28 | class LegacyConvolution1D(chainer.links.Convolution2D): 29 | def __init__(self, in_channels, out_channels, ksize=None, stride=1, pad=0, 30 | nobias=False, initialW=None, initial_bias=None, **kwargs): 31 | assert ksize is None or isinstance(ksize, int) 32 | assert isinstance(stride, int) 33 | assert isinstance(pad, int) 34 | super().__init__( 35 | in_channels=in_channels, 36 | out_channels=out_channels, 37 | ksize=(ksize, 1), 38 | stride=(stride, 1), 39 | pad=(pad, 0), 40 | nobias=nobias, 41 | initialW=initialW, 42 | initial_bias=initial_bias, 43 | **kwargs, 44 | ) 45 | 46 | def __call__(self, x): 47 | assert x.shape[-1] == 1 48 | return super().__call__(x) 49 | 50 | 51 | class ConvHighway(chainer.link.Chain): 52 | def __init__(self, in_out_size, nobias=False, 
activate=chainer.functions.relu, 53 | init_Wh=None, init_Wt=None, init_bh=None, init_bt=-1): 54 | super().__init__() 55 | self.activate = activate 56 | 57 | with self.init_scope(): 58 | self.plain = Convolution1D( 59 | in_out_size, in_out_size, 1, nobias=nobias, 60 | initialW=init_Wh, initial_bias=init_bh) 61 | self.transform = Convolution1D( 62 | in_out_size, in_out_size, 1, nobias=nobias, 63 | initialW=init_Wt, initial_bias=init_bt) 64 | 65 | def __call__(self, x): 66 | out_plain = self.activate(self.plain(x)) 67 | out_transform = chainer.functions.sigmoid(self.transform(x)) 68 | y = out_plain * out_transform + x * (1 - out_transform) 69 | return y 70 | 71 | 72 | class PreNet(chainer.link.Chain): 73 | def __init__(self, in_channels: int, hidden_channels: int, out_channels: int) -> None: 74 | super().__init__() 75 | with self.init_scope(): 76 | self.conv1 = Convolution1D(in_channels, hidden_channels, 1) 77 | self.conv2 = Convolution1D(hidden_channels, out_channels, 1) 78 | 79 | def __call__(self, x): 80 | h = x 81 | h = chainer.functions.dropout(chainer.functions.relu(self.conv1(h)), 0.5) 82 | h = chainer.functions.dropout(chainer.functions.relu(self.conv2(h)), 0.5) 83 | return h 84 | 85 | 86 | class Conv1DBank(chainer.link.Chain): 87 | def __init__(self, in_channels: int, out_channels: int, k: int) -> None: 88 | super().__init__() 89 | self.stacked_channels = out_channels * k 90 | self.pads = [ 91 | partial(chainer.functions.pad, pad_width=((0, 0), (0, 0), (i // 2, (i + 1) // 2)), mode='constant') 92 | for i in range(k) 93 | ] 94 | 95 | with self.init_scope(): 96 | self.convs = chainer.link.ChainList( 97 | *(Convolution1D(in_channels, out_channels, i + 1, nobias=True) for i in range(k)) 98 | ) 99 | self.bn = chainer.links.BatchNormalization(out_channels * k) 100 | 101 | def __call__(self, x): 102 | h = x 103 | h = chainer.functions.concat([conv(pad(h)) for pad, conv in zip(self.pads, self.convs)]) 104 | h = chainer.functions.relu(self.bn(h)) 105 | return h 106 | 107 | 108 | class Conv1DProjections(chainer.link.Chain): 109 | def __init__(self, in_channels: int, hidden_channels: int, out_channels: int) -> None: 110 | super().__init__() 111 | 112 | with self.init_scope(): 113 | self.conv1 = Convolution1D(in_channels, hidden_channels, 3, pad=1, nobias=True) 114 | self.bn1 = chainer.links.BatchNormalization(hidden_channels) 115 | self.conv2 = Convolution1D(hidden_channels, out_channels, 3, pad=1, nobias=True) 116 | self.bn2 = chainer.links.BatchNormalization(out_channels) 117 | 118 | def __call__(self, x): 119 | h = x 120 | h = chainer.functions.relu(self.bn1(self.conv1(h))) 121 | h = chainer.functions.relu(self.bn2(self.conv2(h))) 122 | return h 123 | 124 | 125 | class CBHG(chainer.link.Chain): 126 | def __init__( 127 | self, 128 | in_channels: int, 129 | conv_bank_out_channels: int, 130 | conv_bank_k: int, 131 | max_pooling_k: int, 132 | conv_projections_hidden_channels: int, 133 | highway_layers: int, 134 | out_channels: int, 135 | disable_last_rnn: bool, 136 | ) -> None: 137 | super().__init__() 138 | self.max_pooling_padding = partial( 139 | chainer.functions.pad, 140 | pad_width=((0, 0), (0, 0), ((max_pooling_k - 1) // 2, max_pooling_k // 2)), 141 | mode='constant', 142 | ) 143 | self.max_pooling = chainer.functions.MaxPoolingND(1, max_pooling_k, 1, cover_all=False) 144 | self.out_size = out_channels * (1 if disable_last_rnn else 2) 145 | 146 | with self.init_scope(): 147 | self.conv_bank = Conv1DBank( 148 | in_channels=in_channels, 149 | out_channels=conv_bank_out_channels, 150 |
k=conv_bank_k, 151 | ) 152 | self.conv_projectoins = Conv1DProjections( 153 | in_channels=self.conv_bank.stacked_channels, 154 | hidden_channels=conv_projections_hidden_channels, 155 | out_channels=out_channels, 156 | ) 157 | self.highways = chainer.link.ChainList( 158 | *([ConvHighway(out_channels) for _ in range(highway_layers)]) 159 | ) 160 | if not disable_last_rnn: 161 | self.gru = chainer.links.NStepBiGRU( 162 | n_layers=1, 163 | in_size=out_channels, 164 | out_size=out_channels, 165 | dropout=0.0, 166 | ) 167 | 168 | def __call__(self, x): 169 | h = x 170 | h = self.conv_bank(h) 171 | h = self.max_pooling(self.max_pooling_padding(h)) 172 | h = self.conv_projectoins(h) 173 | h = h + x 174 | for highway in self.highways: 175 | h = highway(h) 176 | 177 | if hasattr(self, 'gru'): 178 | h = chainer.functions.separate(chainer.functions.transpose(h, axes=(0, 2, 1))) 179 | _, h = self.gru(None, h) 180 | h = chainer.functions.transpose(chainer.functions.stack(h), axes=(0, 2, 1)) 181 | return h 182 | 183 | 184 | class Predictor(chainer.link.Chain): 185 | def __init__(self, network, out_size: int) -> None: 186 | super().__init__() 187 | with self.init_scope(): 188 | self.network = network 189 | self.last = Convolution1D(network.out_size, out_size, 1) 190 | 191 | def __call__(self, x): 192 | h = x 193 | h = self.network(h) 194 | h = self.last(h) 195 | return h 196 | 197 | 198 | class Aligner(chainer.link.Chain): 199 | def __init__(self, in_size: int, out_time_length: int) -> None: 200 | super().__init__() 201 | with self.init_scope(): 202 | self.gru = chainer.links.NStepBiGRU( 203 | n_layers=1, 204 | in_size=in_size, 205 | out_size=in_size // 2, 206 | dropout=0.0, 207 | ) 208 | self.last = Convolution1D(in_size // 2 * 2, out_time_length, 1) 209 | 210 | def __call__(self, x): 211 | """ 212 | :param x: (batch, channel, timeA) 213 | """ 214 | h = x 215 | h = chainer.functions.separate(chainer.functions.transpose(h, axes=(0, 2, 1))) # h: batch * (timeA, channel) 216 | _, h = self.gru(None, h) # h: batch * (timeA, ?) 
217 | h = chainer.functions.transpose(chainer.functions.stack(h), axes=(0, 2, 1)) # h: (batch, ?, timeA) 218 | h = chainer.functions.softmax(self.last(h), axis=1) # h: (batch, timeB, timeA) 219 | 220 | h = chainer.functions.matmul(x, h) # h: (batch, channel, time) 221 | return h 222 | 223 | 224 | class Discriminator(chainer.link.Chain): 225 | def __init__(self, in_channels: int, hidden_channels_list: List[int]) -> None: 226 | super().__init__() 227 | with self.init_scope(): 228 | self.convs = chainer.link.ChainList(*( 229 | LegacyConvolution1D(i_c, o_c, ksize=2, stride=2) 230 | for i_c, o_c in zip([in_channels] + hidden_channels_list[:-1], hidden_channels_list) 231 | )) 232 | self.last_conv = LegacyConvolution1D(hidden_channels_list[-1], 1, ksize=1) 233 | 234 | def __call__(self, x): 235 | """ 236 | :param x: (batch, channel, time) 237 | """ 238 | h = x 239 | h = chainer.functions.reshape(h, h.shape + (1,)) 240 | for conv in self.convs.children(): 241 | h = chainer.functions.relu(conv(h)) 242 | h = self.last_conv(h) 243 | h = chainer.functions.reshape(h, h.shape[:-1]) 244 | return h 245 | 246 | 247 | def create_predictor(config: CBHGModelConfig): 248 | network = CBHG( 249 | in_channels=config.in_channels, 250 | conv_bank_out_channels=config.conv_bank_out_channels, 251 | conv_bank_k=config.conv_bank_k, 252 | max_pooling_k=config.max_pooling_k, 253 | conv_projections_hidden_channels=config.conv_projections_hidden_channels, 254 | highway_layers=config.highway_layers, 255 | out_channels=config.out_channels, 256 | disable_last_rnn=config.disable_last_rnn, 257 | ) 258 | predictor = Predictor( 259 | network=network, 260 | out_size=config.out_size, 261 | ) 262 | return predictor 263 | 264 | 265 | def create_aligner(config: CBHGModelConfig): 266 | assert config.enable_aligner 267 | aligner = Aligner( 268 | in_size=config.in_channels, 269 | out_time_length=config.aligner_out_time_length, 270 | ) 271 | return aligner 272 | 273 | 274 | def create_discriminator(config: CBHGDiscriminatorModelConfig): 275 | discriminator = Discriminator( 276 | in_channels=config.in_channels, 277 | hidden_channels_list=config.hidden_channels_list, 278 | ) 279 | return discriminator 280 | 281 | 282 | def create(config: CBHGModelConfig): 283 | predictor = create_predictor(config) 284 | if config.enable_aligner: 285 | aligner = create_aligner(config) 286 | else: 287 | aligner = None 288 | if config.discriminator is not None: 289 | discriminator = create_discriminator(config.discriminator) 290 | else: 291 | discriminator = None 292 | return predictor, aligner, discriminator 293 | -------------------------------------------------------------------------------- /become_yukarin/dataset/dataset.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import glob 3 | import typing 4 | from abc import ABCMeta, abstractmethod 5 | from collections import defaultdict 6 | from pathlib import Path 7 | from typing import Any 8 | from typing import Callable 9 | from typing import Dict 10 | from typing import List 11 | 12 | import chainer 13 | import librosa 14 | import numpy 15 | import pysptk 16 | import pyworld 17 | import scipy.ndimage 18 | 19 | from ..config.config import DatasetConfig 20 | from ..config.sr_config import SRDatasetConfig 21 | from ..data_struct import AcousticFeature 22 | from ..data_struct import LowHighSpectrogramFeature 23 | from ..data_struct import Wave 24 | 25 | 26 | class BaseDataProcess(metaclass=ABCMeta): 27 | @abstractmethod 28 | def __call__(self, data, test): 
29 | pass 30 | 31 | 32 | class LambdaProcess(BaseDataProcess): 33 | def __init__(self, process: Callable[[Any, bool], Any]) -> None: 34 | self._process = process 35 | 36 | def __call__(self, data, test): 37 | return self._process(data, test) 38 | 39 | 40 | class DictKeyReplaceProcess(BaseDataProcess): 41 | def __init__(self, key_map: Dict[str, str]) -> None: 42 | self._key_map = key_map 43 | 44 | def __call__(self, data: Dict[str, Any], test): 45 | return {key_after: data[key_before] for key_after, key_before in self._key_map} 46 | 47 | 48 | class ChainProcess(BaseDataProcess): 49 | def __init__(self, process: typing.Iterable[BaseDataProcess]) -> None: 50 | self._process = list(process) 51 | 52 | def __call__(self, data, test): 53 | for p in self._process: 54 | data = p(data, test) 55 | return data 56 | 57 | def append(self, process: BaseDataProcess): 58 | self._process.append(process) 59 | 60 | 61 | class SplitProcess(BaseDataProcess): 62 | def __init__(self, process: typing.Dict[str, typing.Optional[BaseDataProcess]]) -> None: 63 | self._process = process 64 | 65 | def __call__(self, data, test): 66 | data = { 67 | k: p(data, test) if p is not None else data 68 | for k, p in self._process.items() 69 | } 70 | return data 71 | 72 | 73 | class WaveFileLoadProcess(BaseDataProcess): 74 | def __init__(self, sample_rate: int, top_db: float = None, pad_second: float = 0, dtype=numpy.float32) -> None: 75 | self._sample_rate = sample_rate 76 | self._top_db = top_db 77 | self._pad_second = pad_second 78 | self._dtype = dtype 79 | 80 | def __call__(self, data: str, test=None): 81 | wave = librosa.core.load(data, sr=self._sample_rate, dtype=self._dtype)[0] 82 | if self._top_db is not None: 83 | wave = librosa.effects.remix(wave, intervals=librosa.effects.split(wave, top_db=self._top_db)) 84 | if self._pad_second > 0.0: 85 | p = int(self._sample_rate * self._pad_second) 86 | wave = numpy.pad(wave, pad_width=(p, p), mode='constant') 87 | return Wave(wave, self._sample_rate) 88 | 89 | 90 | class AcousticFeatureProcess(BaseDataProcess): 91 | def __init__( 92 | self, 93 | frame_period, 94 | order, 95 | alpha, 96 | f0_estimating_method, 97 | f0_floor=71, 98 | f0_ceil=800, 99 | dtype=numpy.float32, 100 | ) -> None: 101 | self._frame_period = frame_period 102 | self._order = order 103 | self._alpha = alpha 104 | self._f0_estimating_method = f0_estimating_method 105 | self._f0_floor = f0_floor 106 | self._f0_ceil = f0_ceil 107 | self._dtype = dtype 108 | 109 | def __call__(self, data: Wave, test=None): 110 | x = data.wave.astype(numpy.float64) 111 | fs = data.sampling_rate 112 | 113 | if self._f0_estimating_method == 'dio': 114 | _f0, t = pyworld.dio( 115 | x, 116 | fs, 117 | frame_period=self._frame_period, 118 | f0_floor=self._f0_floor, 119 | f0_ceil=self._f0_ceil, 120 | ) 121 | else: 122 | from world4py.np import apis 123 | _f0, t = apis.harvest( 124 | x, 125 | fs, 126 | frame_period=self._frame_period, 127 | f0_floor=self._f0_floor, 128 | f0_ceil=self._f0_ceil, 129 | ) 130 | f0 = pyworld.stonemask(x, _f0, t, fs) 131 | spectrogram = pyworld.cheaptrick(x, f0, t, fs) 132 | aperiodicity = pyworld.d4c(x, f0, t, fs) 133 | 134 | mfcc = pysptk.sp2mc(spectrogram, order=self._order, alpha=self._alpha) 135 | voiced = ~(f0 == 0) # type: numpy.ndarray 136 | 137 | feature = AcousticFeature( 138 | f0=f0[:, None].astype(self._dtype), 139 | spectrogram=spectrogram.astype(self._dtype), 140 | aperiodicity=aperiodicity.astype(self._dtype), 141 | mfcc=mfcc.astype(self._dtype), 142 | voiced=voiced[:, None], 143 | ) 144 | 
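# Summary of the extraction above: f0 comes from DIO or Harvest and is refined with
# StoneMask, CheapTrick estimates the spectral envelope, D4C the aperiodicity, and
# pysptk.sp2mc reduces the envelope to an (order + 1)-dimensional mel-cepstrum
# (stored in the 'mfcc' field); 'voiced' simply marks frames whose f0 is non-zero.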
feature.validate() 145 | return feature 146 | 147 | 148 | class LowHighSpectrogramFeatureProcess(BaseDataProcess): 149 | def __init__(self, frame_period, order, alpha, f0_estimating_method, dtype=numpy.float32) -> None: 150 | self._acoustic_feature_process = AcousticFeatureProcess( 151 | frame_period=frame_period, 152 | order=order, 153 | alpha=alpha, 154 | f0_estimating_method=f0_estimating_method, 155 | ) 156 | self._dtype = dtype 157 | self._alpha = alpha 158 | 159 | def __call__(self, data: Wave, test): 160 | acoustic_feature = self._acoustic_feature_process(data, test=True).astype_only_float(self._dtype) 161 | high_spectrogram = acoustic_feature.spectrogram 162 | 163 | fftlen = pyworld.get_cheaptrick_fft_size(data.sampling_rate) 164 | low_spectrogram = pysptk.mc2sp( 165 | acoustic_feature.mfcc, 166 | alpha=self._alpha, 167 | fftlen=fftlen, 168 | ) 169 | 170 | feature = LowHighSpectrogramFeature( 171 | low=low_spectrogram, 172 | high=high_spectrogram, 173 | ) 174 | feature.validate() 175 | return feature 176 | 177 | 178 | class AcousticFeatureLoadProcess(BaseDataProcess): 179 | def __init__(self, validate=False) -> None: 180 | self._validate = validate 181 | 182 | def __call__(self, path: Path, test=None): 183 | d: Dict[str, Any] = numpy.load(path.expanduser(), allow_pickle=True).item() 184 | feature = AcousticFeature( 185 | f0=d['f0'], 186 | spectrogram=d['spectrogram'], 187 | aperiodicity=d['aperiodicity'], 188 | mfcc=d['mfcc'], 189 | voiced=d['voiced'], 190 | ) 191 | if self._validate: 192 | feature.validate() 193 | return feature 194 | 195 | 196 | class LowHighSpectrogramFeatureLoadProcess(BaseDataProcess): 197 | def __init__(self, validate=False) -> None: 198 | self._validate = validate 199 | 200 | def __call__(self, path: Path, test=None): 201 | d: Dict[str, Any] = numpy.load(path.expanduser(), allow_pickle=True).item() 202 | feature = LowHighSpectrogramFeature( 203 | low=d['low'], 204 | high=d['high'], 205 | ) 206 | if self._validate: 207 | feature.validate() 208 | return feature 209 | 210 | 211 | class AcousticFeatureSaveProcess(BaseDataProcess): 212 | def __init__(self, validate=False, ignore: List[str] = None) -> None: 213 | self._validate = validate 214 | self._ignore = ignore if ignore is not None else [] 215 | 216 | def __call__(self, data: Dict[str, Any], test=None): 217 | path = data['path'] # type: Path 218 | feature = data['feature'] # type: AcousticFeature 219 | if self._validate: 220 | feature.validate() 221 | 222 | d = dict( 223 | f0=feature.f0, 224 | spectrogram=feature.spectrogram, 225 | aperiodicity=feature.aperiodicity, 226 | mfcc=feature.mfcc, 227 | voiced=feature.voiced, 228 | ) 229 | for k in self._ignore: 230 | assert k in d 231 | d[k] = numpy.nan 232 | 233 | numpy.save(path.absolute(), d) 234 | 235 | 236 | class DistillateUsingFeatureProcess(BaseDataProcess): 237 | def __init__(self, targets: List[str]) -> None: 238 | self._targets = targets 239 | 240 | def __call__(self, feature: AcousticFeature, test=None): 241 | d = defaultdict(lambda: numpy.nan, **{t: getattr(feature, t) for t in self._targets}) 242 | return AcousticFeature( 243 | f0=d['f0'], 244 | spectrogram=d['spectrogram'], 245 | aperiodicity=d['aperiodicity'], 246 | mfcc=d['mfcc'], 247 | voiced=d['voiced'], 248 | ) 249 | 250 | 251 | class MakeMaskProcess(BaseDataProcess): 252 | def __init__(self) -> None: 253 | pass 254 | 255 | def __call__(self, feature: AcousticFeature, test=None): 256 | return AcousticFeature( 257 | f0=feature.voiced, 258 | spectrogram=numpy.ones_like(feature.spectrogram, 
dtype=numpy.bool), 259 | aperiodicity=numpy.ones_like(feature.aperiodicity, dtype=numpy.bool), 260 | mfcc=numpy.ones_like(feature.mfcc, dtype=numpy.bool), 261 | voiced=numpy.ones_like(feature.voiced, dtype=numpy.bool), 262 | ).astype(numpy.float32) 263 | 264 | 265 | class AcousticFeatureNormalizeProcess(BaseDataProcess): 266 | def __init__(self, mean: AcousticFeature, var: AcousticFeature) -> None: 267 | self._mean = mean 268 | self._var = var 269 | 270 | def __call__(self, data: AcousticFeature, test=None): 271 | f0 = (data.f0 - self._mean.f0) / numpy.sqrt(self._var.f0) 272 | f0[~data.voiced] = 0 273 | return AcousticFeature( 274 | f0=f0, 275 | spectrogram=(data.spectrogram - self._mean.spectrogram) / numpy.sqrt(self._var.spectrogram), 276 | aperiodicity=(data.aperiodicity - self._mean.aperiodicity) / numpy.sqrt(self._var.aperiodicity), 277 | mfcc=(data.mfcc - self._mean.mfcc) / numpy.sqrt(self._var.mfcc), 278 | voiced=data.voiced, 279 | ) 280 | 281 | 282 | class AcousticFeatureDenormalizeProcess(BaseDataProcess): 283 | def __init__(self, mean: AcousticFeature, var: AcousticFeature) -> None: 284 | self._mean = mean 285 | self._var = var 286 | 287 | def __call__(self, data: AcousticFeature, test=None): 288 | f0 = data.f0 * numpy.sqrt(self._var.f0) + self._mean.f0 289 | f0[~data.voiced] = 0 290 | return AcousticFeature( 291 | f0=f0, 292 | spectrogram=data.spectrogram * numpy.sqrt(self._var.spectrogram) + self._mean.spectrogram, 293 | aperiodicity=data.aperiodicity * numpy.sqrt(self._var.aperiodicity) + self._mean.aperiodicity, 294 | mfcc=data.mfcc * numpy.sqrt(self._var.mfcc) + self._mean.mfcc, 295 | voiced=data.voiced, 296 | ) 297 | 298 | 299 | class EncodeFeatureProcess(BaseDataProcess): 300 | def __init__(self, targets: List[str]) -> None: 301 | self._targets = targets 302 | 303 | def __call__(self, data: AcousticFeature, test): 304 | feature = numpy.concatenate([getattr(data, t) for t in self._targets], axis=1) 305 | feature = feature.T 306 | return feature 307 | 308 | 309 | class DecodeFeatureProcess(BaseDataProcess): 310 | def __init__(self, targets: List[str], sizes: Dict[str, int]) -> None: 311 | assert all(t in sizes for t in targets) 312 | self._targets = targets 313 | self._sizes = sizes 314 | 315 | def __call__(self, data: numpy.ndarray, test): 316 | data = data.T 317 | 318 | lasts = numpy.cumsum([self._sizes[t] for t in self._targets]).tolist() 319 | assert data.shape[1] == lasts[-1] 320 | 321 | d = defaultdict(lambda: numpy.nan, **{ 322 | t: data[:, bef:aft] 323 | for t, bef, aft in zip(self._targets, [0] + lasts[:-1], lasts) 324 | }) 325 | return AcousticFeature( 326 | f0=d['f0'], 327 | spectrogram=d['spectrogram'], 328 | aperiodicity=d['aperiodicity'], 329 | mfcc=d['mfcc'], 330 | voiced=d['voiced'], 331 | ) 332 | 333 | 334 | class ShapeAlignProcess(BaseDataProcess): 335 | def __call__(self, data, test): 336 | data1, data2, data3 = data['input'], data['target'], data['mask'] 337 | m = max(data1.shape[1], data2.shape[1], data3.shape[1]) 338 | data1 = numpy.pad(data1, ((0, 0), (0, m - data1.shape[1])), mode='constant') 339 | data2 = numpy.pad(data2, ((0, 0), (0, m - data2.shape[1])), mode='constant') 340 | data3 = numpy.pad(data3, ((0, 0), (0, m - data3.shape[1])), mode='constant') 341 | data['input'], data['target'], data['mask'] = data1, data2, data3 342 | return data 343 | 344 | 345 | class RandomPaddingProcess(BaseDataProcess): 346 | def __init__(self, min_size: int, time_axis: int = 1) -> None: 347 | self._min_size = min_size 348 | self._time_axis = time_axis 349 | 350 | 
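# Randomly distributes padding before/after the data along the time axis so the result is at least min_size frames long; create() and create_sr() feed the same per-sample seed to each stream (input/target/mask), which all share the same length at this point, so every stream receives the same split and they stay aligned.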
def __call__(self, datas: Dict[str, Any], test=True): 351 | assert not test 352 | 353 | data, seed = datas['data'], datas['seed'] 354 | random = numpy.random.RandomState(seed) 355 | 356 | if data.shape[self._time_axis] >= self._min_size: 357 | return data 358 | 359 | pre = random.randint(self._min_size - data.shape[self._time_axis] + 1) 360 | post = self._min_size - pre 361 | pad = [(0, 0)] * data.ndim 362 | pad[self._time_axis] = (pre, post) 363 | return numpy.pad(data, pad, mode='constant') 364 | 365 | 366 | class LastPaddingProcess(BaseDataProcess): 367 | def __init__(self, min_size: int, time_axis: int = 1) -> None: 368 | assert time_axis == 1 369 | self._min_size = min_size 370 | self._time_axis = time_axis 371 | 372 | def __call__(self, data: numpy.ndarray, test=None): 373 | if data.shape[self._time_axis] >= self._min_size: 374 | return data 375 | 376 | pre = self._min_size - data.shape[self._time_axis] 377 | return numpy.pad(data, ((0, 0), (pre, 0)), mode='constant') 378 | 379 | 380 | class RandomCropProcess(BaseDataProcess): 381 | def __init__(self, crop_size: int, time_axis: int = 1) -> None: 382 | self._crop_size = crop_size 383 | self._time_axis = time_axis 384 | 385 | def __call__(self, datas: Dict[str, Any], test=True): 386 | assert not test 387 | 388 | data, seed = datas['data'], datas['seed'] 389 | random = numpy.random.RandomState(seed) 390 | 391 | len_time = data.shape[self._time_axis] 392 | assert len_time >= self._crop_size 393 | 394 | start = random.randint(len_time - self._crop_size + 1) 395 | return numpy.split(data, [start, start + self._crop_size], axis=self._time_axis)[1] 396 | 397 | 398 | class FirstCropProcess(BaseDataProcess): 399 | def __init__(self, crop_size: int, time_axis: int = 1) -> None: 400 | self._crop_size = crop_size 401 | self._time_axis = time_axis 402 | 403 | def __call__(self, data: numpy.ndarray, test=None): 404 | return numpy.split(data, [0, self._crop_size], axis=self._time_axis)[1] 405 | 406 | 407 | class AddNoiseProcess(BaseDataProcess): 408 | def __init__(self, p_global: float = None, p_local: float = None) -> None: 409 | assert p_global is None or 0 <= p_global 410 | assert p_local is None or 0 <= p_local 411 | self._p_global = p_global 412 | self._p_local = p_local 413 | 414 | def __call__(self, data: numpy.ndarray, test): 415 | assert not test 416 | 417 | g = numpy.random.randn() * self._p_global 418 | l = numpy.random.randn(*data.shape).astype(data.dtype) * self._p_local 419 | return data + g + l 420 | 421 | 422 | class RandomBlurProcess(BaseDataProcess): 423 | def __init__(self, blur_size_factor: float, time_axis: int = 1) -> None: 424 | assert time_axis == 1 425 | self._blur_size_factor = blur_size_factor 426 | self._time_axis = time_axis 427 | 428 | def __call__(self, data: numpy.ndarray, test=None): 429 | assert not test 430 | 431 | blur_size = numpy.abs(numpy.random.randn()) * self._blur_size_factor 432 | return scipy.ndimage.gaussian_filter(data, (0, blur_size)) 433 | 434 | 435 | class DataProcessDataset(chainer.dataset.DatasetMixin): 436 | def __init__(self, data: typing.List, data_process: BaseDataProcess) -> None: 437 | self._data = data 438 | self._data_process = data_process 439 | 440 | def __len__(self): 441 | return len(self._data) 442 | 443 | def get_example(self, i): 444 | return self._data_process(data=self._data[i], test=not chainer.config.train) 445 | 446 | 447 | def create(config: DatasetConfig): 448 | acoustic_feature_load_process = AcousticFeatureLoadProcess() 449 | input_mean = 
acoustic_feature_load_process(config.input_mean_path, test=True) 450 | input_var = acoustic_feature_load_process(config.input_var_path, test=True) 451 | target_mean = acoustic_feature_load_process(config.target_mean_path, test=True) 452 | target_var = acoustic_feature_load_process(config.target_var_path, test=True) 453 | 454 | # {input_path, target_path} 455 | data_process_base = ChainProcess([ 456 | SplitProcess(dict( 457 | input=ChainProcess([ 458 | LambdaProcess(lambda d, test: d['input_path']), 459 | acoustic_feature_load_process, 460 | DistillateUsingFeatureProcess(config.features + ['voiced']), 461 | AcousticFeatureNormalizeProcess(mean=input_mean, var=input_var), 462 | EncodeFeatureProcess(config.features), 463 | ]), 464 | target=ChainProcess([ 465 | LambdaProcess(lambda d, test: d['target_path']), 466 | acoustic_feature_load_process, 467 | DistillateUsingFeatureProcess(config.features + ['voiced']), 468 | AcousticFeatureNormalizeProcess(mean=target_mean, var=target_var), 469 | SplitProcess(dict( 470 | feature=EncodeFeatureProcess(config.features), 471 | mask=ChainProcess([ 472 | MakeMaskProcess(), 473 | EncodeFeatureProcess(config.features), 474 | ]) 475 | )), 476 | ]), 477 | )), 478 | LambdaProcess( 479 | lambda d, test: dict(input=d['input'], target=d['target']['feature'], mask=d['target']['mask'])), 480 | ShapeAlignProcess(), 481 | ]) 482 | 483 | data_process_train = copy.deepcopy(data_process_base) 484 | 485 | # cropping 486 | if config.train_crop_size is not None: 487 | def add_seed(): 488 | return LambdaProcess(lambda d, test: dict(seed=numpy.random.randint(2 ** 31), **d)) 489 | 490 | def padding(s): 491 | return ChainProcess([ 492 | LambdaProcess(lambda d, test: dict(data=d[s], seed=d['seed'])), 493 | RandomPaddingProcess(min_size=config.train_crop_size), 494 | ]) 495 | 496 | def crop(s): 497 | return ChainProcess([ 498 | LambdaProcess(lambda d, test: dict(data=d[s], seed=d['seed'])), 499 | RandomCropProcess(crop_size=config.train_crop_size), 500 | ]) 501 | 502 | data_process_train.append(ChainProcess([ 503 | add_seed(), 504 | SplitProcess(dict(input=padding('input'), target=padding('target'), mask=padding('mask'))), 505 | add_seed(), 506 | SplitProcess(dict(input=crop('input'), target=crop('target'), mask=crop('mask'))), 507 | ])) 508 | 509 | # add noise 510 | data_process_train.append(SplitProcess(dict( 511 | input=ChainProcess([ 512 | LambdaProcess(lambda d, test: d['input']), 513 | AddNoiseProcess(p_global=config.input_global_noise, p_local=config.input_local_noise), 514 | ]), 515 | target=ChainProcess([ 516 | LambdaProcess(lambda d, test: d['target']), 517 | AddNoiseProcess(p_global=config.target_global_noise, p_local=config.target_local_noise), 518 | ]), 519 | mask=ChainProcess([ 520 | LambdaProcess(lambda d, test: d['mask']), 521 | ]), 522 | ))) 523 | 524 | data_process_test = copy.deepcopy(data_process_base) 525 | if config.train_crop_size is not None: 526 | data_process_test.append(SplitProcess(dict( 527 | input=ChainProcess([ 528 | LambdaProcess(lambda d, test: d['input']), 529 | LastPaddingProcess(min_size=config.train_crop_size), 530 | FirstCropProcess(crop_size=config.train_crop_size), 531 | ]), 532 | target=ChainProcess([ 533 | LambdaProcess(lambda d, test: d['target']), 534 | LastPaddingProcess(min_size=config.train_crop_size), 535 | FirstCropProcess(crop_size=config.train_crop_size), 536 | ]), 537 | mask=ChainProcess([ 538 | LambdaProcess(lambda d, test: d['mask']), 539 | LastPaddingProcess(min_size=config.train_crop_size), 540 | 
FirstCropProcess(crop_size=config.train_crop_size), 541 | ]), 542 | ))) 543 | 544 | input_paths = list(sorted([Path(p) for p in glob.glob(str(config.input_glob))])) 545 | target_paths = list(sorted([Path(p) for p in glob.glob(str(config.target_glob))])) 546 | assert len(input_paths) == len(target_paths) 547 | 548 | num_test = config.num_test 549 | pairs = [ 550 | dict(input_path=input_path, target_path=target_path) 551 | for input_path, target_path in zip(input_paths, target_paths) 552 | ] 553 | numpy.random.RandomState(config.seed).shuffle(pairs) 554 | train_paths = pairs[num_test:] 555 | test_paths = pairs[:num_test] 556 | train_for_evaluate_paths = train_paths[:num_test] 557 | 558 | return { 559 | 'train': DataProcessDataset(train_paths, data_process_train), 560 | 'test': DataProcessDataset(test_paths, data_process_test), 561 | 'train_eval': DataProcessDataset(train_for_evaluate_paths, data_process_test), 562 | } 563 | 564 | 565 | def create_sr(config: SRDatasetConfig): 566 | data_process_base = ChainProcess([ 567 | LowHighSpectrogramFeatureLoadProcess(validate=True), 568 | SplitProcess(dict( 569 | input=LambdaProcess(lambda d, test: numpy.log(d.low[:, :-1])), 570 | target=LambdaProcess(lambda d, test: numpy.log(d.high[:, :-1])), 571 | )), 572 | ]) 573 | 574 | data_process_train = copy.deepcopy(data_process_base) 575 | 576 | # blur 577 | data_process_train.append(SplitProcess(dict( 578 | input=ChainProcess([ 579 | LambdaProcess(lambda d, test: d['input']), 580 | RandomBlurProcess(blur_size_factor=config.blur_size_factor), 581 | ]), 582 | target=ChainProcess([ 583 | LambdaProcess(lambda d, test: d['target']), 584 | ]), 585 | ))) 586 | 587 | # cropping 588 | if config.train_crop_size is not None: 589 | def add_seed(): 590 | return LambdaProcess(lambda d, test: dict(seed=numpy.random.randint(2 ** 31), **d)) 591 | 592 | def padding(s): 593 | return ChainProcess([ 594 | LambdaProcess(lambda d, test: dict(data=d[s], seed=d['seed'])), 595 | RandomPaddingProcess(min_size=config.train_crop_size, time_axis=0), 596 | ]) 597 | 598 | def crop(s): 599 | return ChainProcess([ 600 | LambdaProcess(lambda d, test: dict(data=d[s], seed=d['seed'])), 601 | RandomCropProcess(crop_size=config.train_crop_size, time_axis=0), 602 | ]) 603 | 604 | data_process_train.append(ChainProcess([ 605 | add_seed(), 606 | SplitProcess(dict(input=padding('input'), target=padding('target'))), 607 | add_seed(), 608 | SplitProcess(dict(input=crop('input'), target=crop('target'))), 609 | ])) 610 | 611 | # add noise 612 | data_process_train.append(SplitProcess(dict( 613 | input=ChainProcess([ 614 | LambdaProcess(lambda d, test: d['input']), 615 | AddNoiseProcess(p_global=config.input_global_noise, p_local=config.input_local_noise), 616 | ]), 617 | target=ChainProcess([ 618 | LambdaProcess(lambda d, test: d['target']), 619 | ]), 620 | ))) 621 | 622 | data_process_train.append(LambdaProcess(lambda d, test: { 623 | 'input': d['input'][numpy.newaxis], 624 | 'target': d['target'][numpy.newaxis], 625 | })) 626 | 627 | data_process_test = copy.deepcopy(data_process_base) 628 | if config.train_crop_size is not None: 629 | data_process_test.append(SplitProcess(dict( 630 | input=ChainProcess([ 631 | LambdaProcess(lambda d, test: d['input']), 632 | LastPaddingProcess(min_size=config.train_crop_size), 633 | FirstCropProcess(crop_size=config.train_crop_size, time_axis=0), 634 | ]), 635 | target=ChainProcess([ 636 | LambdaProcess(lambda d, test: d['target']), 637 | LastPaddingProcess(min_size=config.train_crop_size), 638 | 
FirstCropProcess(crop_size=config.train_crop_size, time_axis=0), 639 | ]), 640 | ))) 641 | 642 | data_process_test.append(LambdaProcess(lambda d, test: { 643 | 'input': d['input'][numpy.newaxis], 644 | 'target': d['target'][numpy.newaxis], 645 | })) 646 | 647 | input_paths = list(sorted([Path(p) for p in glob.glob(str(config.input_glob))])) 648 | 649 | num_test = config.num_test 650 | numpy.random.RandomState(config.seed).shuffle(input_paths) 651 | train_paths = input_paths[num_test:] 652 | test_paths = input_paths[:num_test] 653 | train_for_evaluate_paths = train_paths[:num_test] 654 | 655 | return { 656 | 'train': DataProcessDataset(train_paths, data_process_train), 657 | 'test': DataProcessDataset(test_paths, data_process_test), 658 | 'train_eval': DataProcessDataset(train_for_evaluate_paths, data_process_test), 659 | } 660 | --------------------------------------------------------------------------------
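For orientation, a minimal sketch of how these datasets are typically consumed in training with Chainer's standard iterators. The loader name create_from_json, the config path, and the config.dataset / config.train.batchsize fields are assumptions for illustration and are not guaranteed by the code shown here; create() and DataProcessDataset are the real entry points defined above.

    import chainer
    from become_yukarin.config.config import create_from_json  # assumed loader name
    from become_yukarin.dataset import create

    config = create_from_json('recipe/config.json')
    datasets = create(config.dataset)  # keys: 'train', 'test', 'train_eval'
    train_iter = chainer.iterators.SerialIterator(datasets['train'], batch_size=config.train.batchsize)
    batch = chainer.dataset.convert.concat_examples(next(train_iter))
    # each value is stacked to (batch, features, time): batch['input'], batch['target'], batch['mask']

The repository's train.py presumably wires up the equivalent; the sketch only illustrates the dataset side.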