├── .gitignore ├── LICENSE ├── README.md ├── assets ├── japros-infer.jpg └── japros-train.jpg ├── conf ├── config.yaml ├── finetune.yaml ├── tokens.txt └── train_args.py ├── data ├── README.txt ├── transcript_utf8_sample.txt └── wavs │ └── .gitkeep ├── docs └── CLI.md ├── espnet ├── __init__.py ├── asr │ ├── __init__.py │ ├── asr_mix_utils.py │ ├── asr_utils.py │ ├── chainer_backend │ │ ├── __init__.py │ │ └── asr.py │ └── pytorch_backend │ │ ├── __init__.py │ │ ├── asr.py │ │ ├── asr_init.py │ │ ├── asr_mix.py │ │ └── recog.py ├── bin │ ├── __init__.py │ ├── asr_align.py │ ├── asr_enhance.py │ ├── asr_recog.py │ ├── asr_train.py │ ├── lm_train.py │ ├── mt_train.py │ ├── mt_trans.py │ ├── st_train.py │ ├── st_trans.py │ ├── tts_decode.py │ ├── tts_train.py │ ├── vc_decode.py │ └── vc_train.py ├── distributed │ ├── __init__.py │ └── pytorch_backend │ │ └── launch.py ├── lm │ ├── __init__.py │ ├── chainer_backend │ │ ├── __init__.py │ │ ├── extlm.py │ │ └── lm.py │ ├── lm_utils.py │ └── pytorch_backend │ │ ├── __init__.py │ │ ├── extlm.py │ │ └── lm.py ├── mt │ ├── __init__.py │ ├── mt_utils.py │ └── pytorch_backend │ │ ├── __init__.py │ │ └── mt.py ├── nets │ ├── __init__.py │ ├── asr_interface.py │ ├── batch_beam_search.py │ ├── batch_beam_search_online.py │ ├── batch_beam_search_online_sim.py │ ├── beam_search.py │ ├── beam_search_timesync.py │ ├── beam_search_timesync_streaming.py │ ├── beam_search_transducer.py │ ├── chainer_backend │ │ ├── __init__.py │ │ ├── asr_interface.py │ │ ├── ctc.py │ │ ├── deterministic_embed_id.py │ │ ├── e2e_asr.py │ │ ├── e2e_asr_transformer.py │ │ ├── nets_utils.py │ │ ├── rnn │ │ │ ├── __init__.py │ │ │ ├── attentions.py │ │ │ ├── decoders.py │ │ │ ├── encoders.py │ │ │ └── training.py │ │ └── transformer │ │ │ ├── __init__.py │ │ │ ├── attention.py │ │ │ ├── ctc.py │ │ │ ├── decoder.py │ │ │ ├── decoder_layer.py │ │ │ ├── embedding.py │ │ │ ├── encoder.py │ │ │ ├── encoder_layer.py │ │ │ ├── label_smoothing_loss.py │ │ │ ├── layer_norm.py │ │ │ ├── mask.py │ │ │ ├── positionwise_feed_forward.py │ │ │ ├── subsampling.py │ │ │ └── training.py │ ├── ctc_prefix_score.py │ ├── e2e_asr_common.py │ ├── e2e_mt_common.py │ ├── lm_interface.py │ ├── mt_interface.py │ ├── pytorch_backend │ │ ├── __init__.py │ │ ├── conformer │ │ │ ├── __init__.py │ │ │ ├── argument.py │ │ │ ├── contextual_block_encoder_layer.py │ │ │ ├── convolution.py │ │ │ ├── encoder.py │ │ │ ├── encoder_layer.py │ │ │ └── swish.py │ │ ├── ctc.py │ │ ├── e2e_asr.py │ │ ├── e2e_asr_conformer.py │ │ ├── e2e_asr_maskctc.py │ │ ├── e2e_asr_mix.py │ │ ├── e2e_asr_mix_transformer.py │ │ ├── e2e_asr_mulenc.py │ │ ├── e2e_asr_transducer.py │ │ ├── e2e_asr_transformer.py │ │ ├── e2e_mt.py │ │ ├── e2e_mt_transformer.py │ │ ├── e2e_st.py │ │ ├── e2e_st_conformer.py │ │ ├── e2e_st_transformer.py │ │ ├── e2e_tts_fastspeech.py │ │ ├── e2e_tts_tacotron2.py │ │ ├── e2e_tts_transformer.py │ │ ├── e2e_vc_tacotron2.py │ │ ├── e2e_vc_transformer.py │ │ ├── fastspeech │ │ │ ├── __init__.py │ │ │ ├── duration_calculator.py │ │ │ ├── duration_predictor.py │ │ │ └── length_regulator.py │ │ ├── frontends │ │ │ ├── __init__.py │ │ │ ├── beamformer.py │ │ │ ├── dnn_beamformer.py │ │ │ ├── dnn_wpe.py │ │ │ ├── feature_transform.py │ │ │ ├── frontend.py │ │ │ └── mask_estimator.py │ │ ├── gtn_ctc.py │ │ ├── initialization.py │ │ ├── lm │ │ │ ├── __init__.py │ │ │ ├── default.py │ │ │ ├── seq_rnn.py │ │ │ └── transformer.py │ │ ├── maskctc │ │ │ ├── __init__.py │ │ │ ├── add_mask_token.py │ │ │ └── mask.py │ │ ├── 
nets_utils.py │ │ ├── rnn │ │ │ ├── __init__.py │ │ │ ├── argument.py │ │ │ ├── attentions.py │ │ │ ├── decoders.py │ │ │ └── encoders.py │ │ ├── streaming │ │ │ ├── __init__.py │ │ │ ├── segment.py │ │ │ └── window.py │ │ ├── tacotron2 │ │ │ ├── __init__.py │ │ │ ├── cbhg.py │ │ │ ├── decoder.py │ │ │ └── encoder.py │ │ ├── transducer │ │ │ ├── __init__.py │ │ │ ├── arguments.py │ │ │ ├── blocks.py │ │ │ ├── conv1d_nets.py │ │ │ ├── custom_decoder.py │ │ │ ├── custom_encoder.py │ │ │ ├── error_calculator.py │ │ │ ├── initializer.py │ │ │ ├── joint_network.py │ │ │ ├── rnn_decoder.py │ │ │ ├── rnn_encoder.py │ │ │ ├── transducer_tasks.py │ │ │ ├── transformer_decoder_layer.py │ │ │ ├── utils.py │ │ │ └── vgg2l.py │ │ ├── transformer │ │ │ ├── __init__.py │ │ │ ├── add_sos_eos.py │ │ │ ├── argument.py │ │ │ ├── attention.py │ │ │ ├── contextual_block_encoder_layer.py │ │ │ ├── decoder.py │ │ │ ├── decoder_layer.py │ │ │ ├── dynamic_conv.py │ │ │ ├── dynamic_conv2d.py │ │ │ ├── embedding.py │ │ │ ├── encoder.py │ │ │ ├── encoder_layer.py │ │ │ ├── encoder_mix.py │ │ │ ├── initializer.py │ │ │ ├── label_smoothing_loss.py │ │ │ ├── layer_norm.py │ │ │ ├── lightconv.py │ │ │ ├── lightconv2d.py │ │ │ ├── longformer_attention.py │ │ │ ├── mask.py │ │ │ ├── multi_layer_conv.py │ │ │ ├── optimizer.py │ │ │ ├── plot.py │ │ │ ├── positionwise_feed_forward.py │ │ │ ├── repeat.py │ │ │ ├── subsampling.py │ │ │ └── subsampling_without_posenc.py │ │ └── wavenet.py │ ├── scorer_interface.py │ ├── scorers │ │ ├── __init__.py │ │ ├── ctc.py │ │ ├── length_bonus.py │ │ ├── ngram.py │ │ └── uasr.py │ ├── st_interface.py │ ├── transducer_decoder_interface.py │ └── tts_interface.py ├── optimizer │ ├── __init__.py │ ├── chainer.py │ ├── factory.py │ ├── parser.py │ └── pytorch.py ├── scheduler │ ├── __init__.py │ ├── chainer.py │ ├── pytorch.py │ └── scheduler.py ├── st │ ├── __init__.py │ └── pytorch_backend │ │ ├── __init__.py │ │ └── st.py ├── transform │ ├── __init__.py │ ├── add_deltas.py │ ├── channel_selector.py │ ├── cmvn.py │ ├── functional.py │ ├── perturb.py │ ├── spec_augment.py │ ├── spectrogram.py │ ├── transform_interface.py │ ├── transformation.py │ └── wpe.py ├── tts │ ├── __init__.py │ └── pytorch_backend │ │ ├── __init__.py │ │ └── tts.py ├── utils │ ├── __init__.py │ ├── check_kwargs.py │ ├── cli_readers.py │ ├── cli_utils.py │ ├── cli_writers.py │ ├── dataset.py │ ├── deterministic_utils.py │ ├── dynamic_import.py │ ├── fill_missing_args.py │ ├── io_utils.py │ ├── spec_augment.py │ └── training │ │ ├── __init__.py │ │ ├── batchfy.py │ │ ├── evaluator.py │ │ ├── iterators.py │ │ ├── tensorboard_logger.py │ │ └── train_utils.py ├── vc │ └── pytorch_backend │ │ └── vc.py └── version.txt ├── espnet2 ├── __init__.py ├── asr │ ├── __init__.py │ ├── ctc.py │ ├── decoder │ │ ├── __init__.py │ │ ├── abs_decoder.py │ │ ├── hugging_face_transformers_decoder.py │ │ ├── mlm_decoder.py │ │ ├── rnn_decoder.py │ │ ├── s4_decoder.py │ │ ├── transducer_decoder.py │ │ ├── transformer_decoder.py │ │ └── whisper_decoder.py │ ├── discrete_asr_espnet_model.py │ ├── encoder │ │ ├── __init__.py │ │ ├── abs_encoder.py │ │ ├── branchformer_encoder.py │ │ ├── conformer_encoder.py │ │ ├── contextual_block_conformer_encoder.py │ │ ├── contextual_block_transformer_encoder.py │ │ ├── e_branchformer_encoder.py │ │ ├── hubert_encoder.py │ │ ├── hugging_face_transformers_encoder.py │ │ ├── longformer_encoder.py │ │ ├── rnn_encoder.py │ │ ├── transformer_encoder.py │ │ ├── transformer_encoder_multispkr.py │ │ ├── 
vgg_rnn_encoder.py │ │ ├── wav2vec2_encoder.py │ │ └── whisper_encoder.py │ ├── espnet_model.py │ ├── frontend │ │ ├── __init__.py │ │ ├── abs_frontend.py │ │ ├── default.py │ │ ├── fused.py │ │ ├── s3prl.py │ │ ├── whisper.py │ │ └── windowing.py │ ├── layers │ │ ├── __init__.py │ │ ├── cgmlp.py │ │ └── fastformer.py │ ├── maskctc_model.py │ ├── pit_espnet_model.py │ ├── postencoder │ │ ├── __init__.py │ │ ├── abs_postencoder.py │ │ ├── hugging_face_transformers_postencoder.py │ │ └── length_adaptor_postencoder.py │ ├── preencoder │ │ ├── __init__.py │ │ ├── abs_preencoder.py │ │ ├── linear.py │ │ └── sinc.py │ ├── specaug │ │ ├── __init__.py │ │ ├── abs_specaug.py │ │ └── specaug.py │ ├── state_spaces │ │ ├── __init__.py │ │ ├── attention.py │ │ ├── base.py │ │ ├── block.py │ │ ├── cauchy.py │ │ ├── components.py │ │ ├── ff.py │ │ ├── model.py │ │ ├── pool.py │ │ ├── registry.py │ │ ├── residual.py │ │ ├── s4.py │ │ └── utils.py │ └── transducer │ │ ├── __init__.py │ │ ├── beam_search_transducer.py │ │ ├── beam_search_transducer_streaming.py │ │ ├── error_calculator.py │ │ └── rnnt_multi_blank │ │ ├── __init__.py │ │ ├── rnnt.py │ │ ├── rnnt_multi_blank.py │ │ └── utils │ │ ├── __init__.py │ │ ├── cpu_utils │ │ ├── __init__.py │ │ └── cpu_rnnt.py │ │ ├── cuda_utils │ │ ├── __init__.py │ │ ├── gpu_rnnt.py │ │ ├── gpu_rnnt_kernel.py │ │ └── reduce.py │ │ ├── global_constants.py │ │ └── rnnt_helper.py ├── asr_transducer │ ├── __init__.py │ ├── activation.py │ ├── beam_search_transducer.py │ ├── decoder │ │ ├── __init__.py │ │ ├── abs_decoder.py │ │ ├── blocks │ │ │ ├── __init__.py │ │ │ ├── mega.py │ │ │ └── rwkv.py │ │ ├── mega_decoder.py │ │ ├── modules │ │ │ ├── __init__.py │ │ │ ├── mega │ │ │ │ ├── __init__.py │ │ │ │ ├── feed_forward.py │ │ │ │ ├── multi_head_damped_ema.py │ │ │ │ └── positional_bias.py │ │ │ └── rwkv │ │ │ │ ├── __init__.py │ │ │ │ ├── attention.py │ │ │ │ ├── cuda │ │ │ │ ├── wkv_cuda.cu │ │ │ │ └── wkv_op.cpp │ │ │ │ └── feed_forward.py │ │ ├── rnn_decoder.py │ │ ├── rwkv_decoder.py │ │ └── stateless_decoder.py │ ├── encoder │ │ ├── __init__.py │ │ ├── blocks │ │ │ ├── __init__.py │ │ │ ├── branchformer.py │ │ │ ├── conformer.py │ │ │ ├── conv1d.py │ │ │ ├── conv_input.py │ │ │ └── ebranchformer.py │ │ ├── building.py │ │ ├── encoder.py │ │ ├── modules │ │ │ ├── __init__.py │ │ │ ├── attention.py │ │ │ ├── convolution.py │ │ │ ├── multi_blocks.py │ │ │ └── positional_encoding.py │ │ └── validation.py │ ├── error_calculator.py │ ├── espnet_transducer_model.py │ ├── frontend │ │ ├── __init__.py │ │ └── online_audio_processor.py │ ├── joint_network.py │ ├── normalization.py │ └── utils.py ├── asvspoof │ ├── __init__.py │ ├── decoder │ │ ├── __init__.py │ │ ├── abs_decoder.py │ │ └── linear_decoder.py │ ├── espnet_model.py │ └── loss │ │ ├── __init__.py │ │ ├── abs_loss.py │ │ ├── am_softmax_loss.py │ │ ├── binary_loss.py │ │ └── oc_softmax_loss.py ├── bin │ ├── __init__.py │ ├── aggregate_stats_dirs.py │ ├── asr_align.py │ ├── asr_inference.py │ ├── asr_inference_k2.py │ ├── asr_inference_maskctc.py │ ├── asr_inference_streaming.py │ ├── asr_train.py │ ├── asr_transducer_inference.py │ ├── asr_transducer_train.py │ ├── asvspoof_inference.py │ ├── asvspoof_train.py │ ├── diar_inference.py │ ├── diar_train.py │ ├── enh_inference.py │ ├── enh_inference_streaming.py │ ├── enh_s2t_train.py │ ├── enh_scoring.py │ ├── enh_train.py │ ├── enh_tse_inference.py │ ├── enh_tse_train.py │ ├── gan_svs_train.py │ ├── gan_tts_train.py │ ├── hubert_train.py │ ├── 
hugging_face_export_vocabulary.py │ ├── launch.py │ ├── lm_calc_perplexity.py │ ├── lm_inference.py │ ├── lm_train.py │ ├── mt_inference.py │ ├── mt_train.py │ ├── pack.py │ ├── s2t_inference.py │ ├── s2t_inference_language.py │ ├── s2t_train.py │ ├── slu_inference.py │ ├── slu_train.py │ ├── spk_train.py │ ├── split_scps.py │ ├── st_inference.py │ ├── st_inference_streaming.py │ ├── st_train.py │ ├── svs_inference.py │ ├── svs_train.py │ ├── tokenize_text.py │ ├── tts_inference.py │ ├── tts_train.py │ ├── uasr_extract_feature.py │ ├── uasr_inference.py │ ├── uasr_inference_k2.py │ ├── uasr_train.py │ └── whisper_export_vocabulary.py ├── diar │ ├── __init__.py │ ├── abs_diar.py │ ├── attractor │ │ ├── __init__.py │ │ ├── abs_attractor.py │ │ └── rnn_attractor.py │ ├── decoder │ │ ├── __init__.py │ │ ├── abs_decoder.py │ │ └── linear_decoder.py │ ├── espnet_model.py │ ├── label_processor.py │ ├── layers │ │ ├── __init__.py │ │ ├── abs_mask.py │ │ ├── multi_mask.py │ │ └── tcn_nomask.py │ └── separator │ │ ├── __init__.py │ │ └── tcn_separator_nomask.py ├── enh │ ├── __init__.py │ ├── abs_enh.py │ ├── decoder │ │ ├── __init__.py │ │ ├── abs_decoder.py │ │ ├── conv_decoder.py │ │ ├── null_decoder.py │ │ └── stft_decoder.py │ ├── encoder │ │ ├── __init__.py │ │ ├── abs_encoder.py │ │ ├── conv_encoder.py │ │ ├── null_encoder.py │ │ └── stft_encoder.py │ ├── espnet_enh_s2t_model.py │ ├── espnet_model.py │ ├── espnet_model_tse.py │ ├── extractor │ │ ├── __init__.py │ │ ├── abs_extractor.py │ │ └── td_speakerbeam_extractor.py │ ├── layers │ │ ├── __init__.py │ │ ├── adapt_layers.py │ │ ├── beamformer.py │ │ ├── beamformer_th.py │ │ ├── complex_utils.py │ │ ├── complexnn.py │ │ ├── conv_utils.py │ │ ├── dc_crn.py │ │ ├── dnn_beamformer.py │ │ ├── dnn_wpe.py │ │ ├── dnsmos.py │ │ ├── dpmulcat.py │ │ ├── dprnn.py │ │ ├── dptnet.py │ │ ├── fasnet.py │ │ ├── ifasnet.py │ │ ├── mask_estimator.py │ │ ├── skim.py │ │ ├── tcn.py │ │ ├── tcndenseunet.py │ │ └── wpe.py │ ├── loss │ │ ├── __init__.py │ │ ├── criterions │ │ │ ├── __init__.py │ │ │ ├── abs_loss.py │ │ │ ├── tf_domain.py │ │ │ └── time_domain.py │ │ └── wrappers │ │ │ ├── __init__.py │ │ │ ├── abs_wrapper.py │ │ │ ├── dpcl_solver.py │ │ │ ├── fixed_order.py │ │ │ ├── mixit_solver.py │ │ │ ├── multilayer_pit_solver.py │ │ │ └── pit_solver.py │ └── separator │ │ ├── __init__.py │ │ ├── abs_separator.py │ │ ├── asteroid_models.py │ │ ├── conformer_separator.py │ │ ├── dan_separator.py │ │ ├── dc_crn_separator.py │ │ ├── dccrn_separator.py │ │ ├── dpcl_e2e_separator.py │ │ ├── dpcl_separator.py │ │ ├── dprnn_separator.py │ │ ├── dptnet_separator.py │ │ ├── fasnet_separator.py │ │ ├── ineube_separator.py │ │ ├── neural_beamformer.py │ │ ├── rnn_separator.py │ │ ├── skim_separator.py │ │ ├── svoice_separator.py │ │ ├── tcn_separator.py │ │ ├── tfgridnet_separator.py │ │ └── transformer_separator.py ├── fileio │ ├── __init__.py │ ├── datadir_writer.py │ ├── npy_scp.py │ ├── rand_gen_dataset.py │ ├── read_text.py │ ├── rttm.py │ ├── score_scp.py │ ├── sound_scp.py │ └── vad_scp.py ├── fst │ ├── __init__.py │ └── lm_rescore.py ├── gan_svs │ ├── __init__.py │ ├── abs_gan_svs.py │ ├── avocodo │ │ ├── __init__.py │ │ └── avocodo.py │ ├── espnet_model.py │ ├── joint │ │ ├── __init__.py │ │ └── joint_score2wav.py │ ├── pits │ │ ├── modules.py │ │ └── ying_decoder.py │ ├── uhifigan │ │ ├── __init__.py │ │ ├── sine_generator.py │ │ └── uhifigan.py │ ├── utils │ │ ├── __init__.py │ │ └── expand_f0.py │ ├── visinger2 │ │ ├── __init__.py │ │ ├── ddsp.py │ │ └── 
visinger2_vocoder.py │ └── vits │ │ ├── __init__.py │ │ ├── duration_predictor.py │ │ ├── generator.py │ │ ├── length_regulator.py │ │ ├── modules.py │ │ ├── phoneme_predictor.py │ │ ├── pitch_predictor.py │ │ ├── prior_decoder.py │ │ ├── text_encoder.py │ │ └── vits.py ├── gan_tts │ ├── __init__.py │ ├── abs_gan_tts.py │ ├── espnet_model.py │ ├── hifigan │ │ ├── __init__.py │ │ ├── hifigan.py │ │ ├── loss.py │ │ └── residual_block.py │ ├── jets │ │ ├── __init__.py │ │ ├── alignments.py │ │ ├── generator.py │ │ ├── jets.py │ │ ├── length_regulator.py │ │ └── loss.py │ ├── joint │ │ ├── __init__.py │ │ └── joint_text2wav.py │ ├── melgan │ │ ├── __init__.py │ │ ├── melgan.py │ │ ├── pqmf.py │ │ └── residual_stack.py │ ├── parallel_wavegan │ │ ├── __init__.py │ │ ├── parallel_wavegan.py │ │ └── upsample.py │ ├── style_melgan │ │ ├── __init__.py │ │ ├── style_melgan.py │ │ └── tade_res_block.py │ ├── utils │ │ ├── __init__.py │ │ └── get_random_segments.py │ ├── vits │ │ ├── __init__.py │ │ ├── duration_predictor.py │ │ ├── flow.py │ │ ├── generator.py │ │ ├── loss.py │ │ ├── monotonic_align │ │ │ ├── __init__.py │ │ │ ├── core.pyx │ │ │ └── setup.py │ │ ├── posterior_encoder.py │ │ ├── residual_coupling.py │ │ ├── text_encoder.py │ │ ├── transform.py │ │ └── vits.py │ └── wavenet │ │ ├── __init__.py │ │ ├── residual_block.py │ │ └── wavenet.py ├── hubert │ ├── __init__.py │ ├── espnet_model.py │ └── hubert_loss.py ├── iterators │ ├── __init__.py │ ├── abs_iter_factory.py │ ├── category_iter_factory.py │ ├── chunk_iter_factory.py │ ├── multiple_iter_factory.py │ └── sequence_iter_factory.py ├── layers │ ├── __init__.py │ ├── abs_normalize.py │ ├── augmentation.py │ ├── global_mvn.py │ ├── inversible_interface.py │ ├── label_aggregation.py │ ├── log_mel.py │ ├── mask_along_axis.py │ ├── sinc_conv.py │ ├── stft.py │ ├── time_warp.py │ └── utterance_mvn.py ├── lm │ ├── __init__.py │ ├── abs_model.py │ ├── espnet_model.py │ ├── seq_rnn_lm.py │ └── transformer_lm.py ├── main_funcs │ ├── __init__.py │ ├── average_nbest_models.py │ ├── calculate_all_attentions.py │ ├── collect_stats.py │ └── pack_funcs.py ├── mt │ ├── __init__.py │ ├── espnet_model.py │ └── frontend │ │ ├── __init__.py │ │ └── embedding.py ├── optimizers │ ├── __init__.py │ ├── optim_groups.py │ └── sgd.py ├── s2t │ ├── __init__.py │ └── espnet_model.py ├── samplers │ ├── __init__.py │ ├── abs_sampler.py │ ├── build_batch_sampler.py │ ├── category_balanced_sampler.py │ ├── folded_batch_sampler.py │ ├── length_batch_sampler.py │ ├── num_elements_batch_sampler.py │ ├── sorted_batch_sampler.py │ └── unsorted_batch_sampler.py ├── schedulers │ ├── __init__.py │ ├── abs_scheduler.py │ ├── cosine_anneal_warmup_restart.py │ ├── noam_lr.py │ ├── warmup_lr.py │ ├── warmup_reducelronplateau.py │ └── warmup_step_lr.py ├── slu │ ├── __init__.py │ ├── espnet_model.py │ ├── postdecoder │ │ ├── __init__.py │ │ ├── abs_postdecoder.py │ │ └── hugging_face_transformers_postdecoder.py │ └── postencoder │ │ ├── __init__.py │ │ ├── conformer_postencoder.py │ │ └── transformer_postencoder.py ├── spk │ ├── __init__.py │ ├── encoder │ │ ├── __init__.py │ │ └── rawnet3_encoder.py │ ├── espnet_model.py │ ├── layers │ │ ├── RawNetBasicBlock.py │ │ └── __init__.py │ ├── loss │ │ ├── __init__.py │ │ ├── aamsoftmax.py │ │ └── abs_loss.py │ ├── pooling │ │ ├── __init__.py │ │ ├── abs_pooling.py │ │ └── chn_attn_stat_pooling.py │ └── projector │ │ ├── __init__.py │ │ ├── abs_projector.py │ │ └── rawnet3_projector.py ├── st │ ├── __init__.py │ └── espnet_model.py ├── 
svs │ ├── __init__.py │ ├── abs_svs.py │ ├── espnet_model.py │ ├── feats_extract │ │ ├── __init__.py │ │ └── score_feats_extract.py │ ├── naive_rnn │ │ ├── __init__.py │ │ ├── naive_rnn.py │ │ └── naive_rnn_dp.py │ ├── singing_tacotron │ │ ├── __init__.py │ │ ├── decoder.py │ │ ├── encoder.py │ │ └── singing_tacotron.py │ └── xiaoice │ │ ├── XiaoiceSing.py │ │ ├── __init__.py │ │ └── loss.py ├── tasks │ ├── __init__.py │ ├── abs_task.py │ ├── asr.py │ ├── asr_transducer.py │ ├── asvspoof.py │ ├── diar.py │ ├── enh.py │ ├── enh_s2t.py │ ├── enh_tse.py │ ├── gan_svs.py │ ├── gan_tts.py │ ├── hubert.py │ ├── lm.py │ ├── mt.py │ ├── s2t.py │ ├── slu.py │ ├── spk.py │ ├── st.py │ ├── svs.py │ ├── tts.py │ └── uasr.py ├── text │ ├── __init__.py │ ├── abs_tokenizer.py │ ├── build_tokenizer.py │ ├── char_tokenizer.py │ ├── cleaner.py │ ├── hugging_face_token_id_converter.py │ ├── hugging_face_tokenizer.py │ ├── korean_cleaner.py │ ├── phoneme_tokenizer.py │ ├── sentencepiece_tokenizer.py │ ├── token_id_converter.py │ ├── whisper_token_id_converter.py │ ├── whisper_tokenizer.py │ └── word_tokenizer.py ├── torch_utils │ ├── __init__.py │ ├── add_gradient_noise.py │ ├── device_funcs.py │ ├── forward_adaptor.py │ ├── get_layer_from_string.py │ ├── initialize.py │ ├── load_pretrained_model.py │ ├── model_summary.py │ ├── pytorch_version.py │ ├── recursive_op.py │ └── set_all_random_seed.py ├── train │ ├── __init__.py │ ├── abs_espnet_model.py │ ├── abs_gan_espnet_model.py │ ├── class_choices.py │ ├── collate_fn.py │ ├── dataset.py │ ├── distributed_utils.py │ ├── gan_trainer.py │ ├── iterable_dataset.py │ ├── preprocessor.py │ ├── reporter.py │ ├── spk_trainer.py │ ├── trainer.py │ └── uasr_trainer.py ├── tts │ ├── __init__.py │ ├── abs_tts.py │ ├── espnet_model.py │ ├── fastspeech │ │ ├── __init__.py │ │ └── fastspeech.py │ ├── fastspeech2 │ │ ├── __init__.py │ │ ├── fastspeech2.py │ │ ├── loss.py │ │ └── variance_predictor.py │ ├── feats_extract │ │ ├── __init__.py │ │ ├── abs_feats_extract.py │ │ ├── dio.py │ │ ├── energy.py │ │ ├── linear_spectrogram.py │ │ ├── log_mel_fbank.py │ │ ├── log_spectrogram.py │ │ ├── yin.py │ │ └── ying.py │ ├── gst │ │ ├── __init__.py │ │ └── style_encoder.py │ ├── prodiff │ │ ├── __init__.py │ │ ├── denoiser.py │ │ ├── loss.py │ │ └── prodiff.py │ ├── tacotron2 │ │ ├── __init__.py │ │ └── tacotron2.py │ ├── transformer │ │ ├── __init__.py │ │ └── transformer.py │ └── utils │ │ ├── __init__.py │ │ ├── duration_calculator.py │ │ └── parallel_wavegan_pretrained_vocoder.py ├── uasr │ ├── __init__.py │ ├── discriminator │ │ ├── __init__.py │ │ ├── abs_discriminator.py │ │ └── conv_discriminator.py │ ├── espnet_model.py │ ├── generator │ │ ├── __init__.py │ │ ├── abs_generator.py │ │ └── conv_generator.py │ ├── loss │ │ ├── __init__.py │ │ ├── abs_loss.py │ │ ├── discriminator_loss.py │ │ ├── gradient_penalty.py │ │ ├── phoneme_diversity_loss.py │ │ ├── pseudo_label_loss.py │ │ └── smoothness_penalty.py │ └── segmenter │ │ ├── __init__.py │ │ ├── abs_segmenter.py │ │ ├── join_segmenter.py │ │ └── random_segmenter.py └── utils │ ├── __init__.py │ ├── build_dataclass.py │ ├── config_argparse.py │ ├── eer.py │ ├── get_default_kwargs.py │ ├── griffin_lim.py │ ├── kwargs2args.py │ ├── nested_dict_action.py │ ├── sized_dict.py │ ├── types.py │ └── yaml_no_alias_safe_dump.py ├── model.py ├── preprocess.py ├── pretrained └── .gitkeep ├── requirements.txt ├── setup.bat ├── text.py ├── train.py ├── transcribe.py ├── transcribe_split.py ├── update.bat ├── vits_japros_train_colab.ipynb 
├── webui_infer.bat
├── webui_infer.py
├── webui_train.bat
├── webui_train.py
└── weights
    └── .gitignore

/.gitignore:
--------------------------------------------------------------------------------
1 | venv/
2 | __pycache__/
3 | outputs/
4 | *.wav
5 | *.pth
6 | *.bak
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2023 litagin02
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/assets/japros-infer.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/assets/japros-infer.jpg
--------------------------------------------------------------------------------
/assets/japros-train.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/assets/japros-train.jpg
--------------------------------------------------------------------------------
/conf/tokens.txt:
--------------------------------------------------------------------------------
1 | <blank>
2 | <unk>
3 | a
4 | o
5 | i
6 | [
7 | #
8 | u
9 | ]
10 | e
11 | k
12 | n
13 | t
14 | r
15 | s
16 | N
17 | m
18 | _
19 | sh
20 | d
21 | g
22 | ^
23 | $
24 | w
25 | cl
26 | h
27 | y
28 | b
29 | j
30 | ts
31 | ch
32 | z
33 | p
34 | f
35 | ky
36 | ry
37 | gy
38 | hy
39 | ny
40 | by
41 | my
42 | py
43 | v
44 | dy
45 | ?
46 | ty
47 | <sos/eos>
48 |
--------------------------------------------------------------------------------
/conf/train_args.py:
--------------------------------------------------------------------------------
1 | train_args = [
2 |     "--use_preprocessor",
3 |     "true",
4 |     "--token_type",
5 |     "phn",
6 |     "--token_list",
7 |     "conf/tokens.txt",
8 |     "--non_linguistic_symbols",
9 |     "none",
10 |     "--cleaner",
11 |     "jaconv",
12 |     "--g2p",
13 |     "pyopenjtalk_prosody",
14 |     "--normalize",
15 |     "none",
16 |     "--resume",
17 |     "true",
18 |     "--fold_length",
19 |     "150",
20 |     "--fold_length",
21 |     "409600",
22 |     "--output_dir",
23 |     "{output_dir}/{model_name}/checkpoints",
24 |     "--config",
25 |     "conf/finetune.yaml",
26 |     "--feats_extract",
27 |     "linear_spectrogram",
28 |     "--feats_extract_conf",
29 |     "n_fft=2048",
30 |     "--feats_extract_conf",
31 |     "hop_length=512",
32 |     "--feats_extract_conf",
33 |     "win_length=null",
34 |     "--train_data_path_and_name_and_type",
35 |     "{output_dir}/{model_name}/dump/train/text,text,text",
36 |     "--train_data_path_and_name_and_type",
37 |     "{output_dir}/{model_name}/dump/train/wav.scp,speech,sound",
38 |     "--train_shape_file",
39 |     "{output_dir}/{model_name}/stats/train/text_shape.phn",
40 |     "--train_shape_file",
41 |     "{output_dir}/{model_name}/stats/train/speech_shape",
42 |     "--valid_data_path_and_name_and_type",
43 |     "{output_dir}/{model_name}/dump/valid/text,text,text",
44 |     "--valid_data_path_and_name_and_type",
45 |     "{output_dir}/{model_name}/dump/valid/wav.scp,speech,sound",
46 |     "--valid_shape_file",
47 |     "{output_dir}/{model_name}/stats/valid/text_shape.phn",
48 |     "--valid_shape_file",
49 |     "{output_dir}/{model_name}/stats/valid/speech_shape",
50 |     "--init_param",
51 |     "pretrained/pretrained.pth:tts:tts",
52 |     "--ngpu",
53 |     "1",
54 | ]
55 |
--------------------------------------------------------------------------------
/data/README.txt:
--------------------------------------------------------------------------------
1 | Prepare the files in this folder as follows.
2 |
3 | - Audio files (wav format) in the wavs folder
4 | - transcript_utf8.txt
5 |
6 | The contents of transcript_utf8.txt look like the following (it is generated automatically when using whisper):
7 |
8 | wav_filename1:Write the utterance text here.
9 | wav2:The left side of the colon is the file name only; no extension is needed.
10 | …
11 |
12 | Also, put the wav files into the wavs folder like this:
13 | wavs
14 | ├── wav_filename1.wav
15 | ├── wav2.wav
16 | └── …
17 |
--------------------------------------------------------------------------------
/data/transcript_utf8_sample.txt:
--------------------------------------------------------------------------------
1 | sample_wav_file_name:これはサンプルです。
2 | wav_filename1:ここに発言内容を書きます。
3 | wav2:コロンの左側はファイル名のみで、拡張子は不要です。
4 |
--------------------------------------------------------------------------------
/data/wavs/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/data/wavs/.gitkeep
--------------------------------------------------------------------------------
/espnet/__init__.py:
--------------------------------------------------------------------------------
1 | """Initialize espnet package."""
2 |
3 | import os
4 |
5 | dirname = os.path.dirname(__file__)
6 | version_file = os.path.join(dirname, "version.txt")
7 | with open(version_file, "r") as f:
8 |     __version__ = f.read().strip()
9 |
--------------------------------------------------------------------------------
/espnet/asr/__init__.py:
--------------------------------------------------------------------------------
1 | """Initialize sub package."""
2 |
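
The `{output_dir}` and `{model_name}` placeholders in conf/train_args.py above are presumably expanded with `str.format()` before the list reaches the espnet2 trainer; a minimal sketch (the values and the final hand-off are illustrative assumptions, not repository code):

from conf.train_args import train_args

# Sketch: fill in the placeholders; the two values are made up for illustration.
resolved = [a.format(output_dir="outputs", model_name="my_model") for a in train_args]
# `resolved` could then be passed to the espnet2 task CLI (assumption; the
# repository's train.py wiring is not shown in this excerpt).
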
--------------------------------------------------------------------------------
/espnet/asr/chainer_backend/__init__.py:
--------------------------------------------------------------------------------
1 | """Initialize sub package."""
2 |
--------------------------------------------------------------------------------
/espnet/asr/pytorch_backend/__init__.py:
--------------------------------------------------------------------------------
1 | """Initialize sub package."""
2 |
--------------------------------------------------------------------------------
/espnet/bin/__init__.py:
--------------------------------------------------------------------------------
1 | """Initialize sub package."""
2 |
--------------------------------------------------------------------------------
/espnet/distributed/__init__.py:
--------------------------------------------------------------------------------
1 | #
2 | # SPDX-FileCopyrightText:
3 | #   Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
4 | # SPDX-License-Identifier: Apache-2.0
5 | #
6 |
7 | """Initialize sub package."""
8 |
--------------------------------------------------------------------------------
/espnet/lm/__init__.py:
--------------------------------------------------------------------------------
1 | """Initialize sub package."""
2 |
--------------------------------------------------------------------------------
/espnet/lm/chainer_backend/__init__.py:
--------------------------------------------------------------------------------
1 | """Initialize sub package."""
2 |
--------------------------------------------------------------------------------
/espnet/lm/pytorch_backend/__init__.py:
--------------------------------------------------------------------------------
1 | """Initialize sub package."""
2 |
--------------------------------------------------------------------------------
/espnet/mt/__init__.py:
--------------------------------------------------------------------------------
1 | """Initialize sub package."""
2 |
--------------------------------------------------------------------------------
/espnet/mt/pytorch_backend/__init__.py:
--------------------------------------------------------------------------------
1 | """Initialize sub package."""
2 |
--------------------------------------------------------------------------------
/espnet/nets/__init__.py:
--------------------------------------------------------------------------------
1 | """Initialize sub package."""
2 |
--------------------------------------------------------------------------------
/espnet/nets/chainer_backend/__init__.py:
--------------------------------------------------------------------------------
1 | """Initialize sub package."""
2 |
--------------------------------------------------------------------------------
/espnet/nets/chainer_backend/asr_interface.py:
--------------------------------------------------------------------------------
1 | """ASR Interface module."""
2 | import chainer
3 |
4 | from espnet.nets.asr_interface import ASRInterface
5 |
6 |
7 | class ChainerASRInterface(ASRInterface, chainer.Chain):
8 |     """ASR Interface for ESPnet model implementation."""
9 |
10 |     @staticmethod
11 |     def custom_converter(*args, **kw):
12 |         """Get custom_converter of the model (Chainer only)."""
13 |         raise NotImplementedError("custom converter method is not implemented")
14 |
15 |     @staticmethod
16 |     def custom_updater(*args, **kw):
17 |         """Get custom_updater of the model (Chainer only)."""
18 |         raise NotImplementedError("custom updater method is not implemented")
19 |
20 |     @staticmethod
21 |     def custom_parallel_updater(*args, **kw):
22 |         """Get custom_parallel_updater of the model (Chainer only)."""
23 |         raise NotImplementedError("custom parallel updater method is not implemented")
24 |
25 |     def get_total_subsampling_factor(self):
26 |         """Get total subsampling factor."""
27 |         raise NotImplementedError(
28 |             "get_total_subsampling_factor method is not implemented"
29 |         )
30 |
method is not implemented") 19 | 20 | @staticmethod 21 | def custom_parallel_updater(*args, **kw): 22 | """Get custom_parallel_updater of the model (Chainer only).""" 23 | raise NotImplementedError("custom parallel updater method is not implemented") 24 | 25 | def get_total_subsampling_factor(self): 26 | """Get total subsampling factor.""" 27 | raise NotImplementedError( 28 | "get_total_subsampling_factor method is not implemented" 29 | ) 30 | -------------------------------------------------------------------------------- /espnet/nets/chainer_backend/nets_utils.py: -------------------------------------------------------------------------------- 1 | import chainer.functions as F 2 | 3 | 4 | def _subsamplex(x, n): 5 | x = [F.get_item(xx, (slice(None, None, n), slice(None))) for xx in x] 6 | ilens = [xx.shape[0] for xx in x] 7 | return x, ilens 8 | -------------------------------------------------------------------------------- /espnet/nets/chainer_backend/rnn/__init__.py: -------------------------------------------------------------------------------- 1 | """Initialize sub package.""" 2 | -------------------------------------------------------------------------------- /espnet/nets/chainer_backend/transformer/__init__.py: -------------------------------------------------------------------------------- 1 | """Initialize sub package.""" 2 | -------------------------------------------------------------------------------- /espnet/nets/chainer_backend/transformer/embedding.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | """Class Declaration of Transformer's Positional Encoding.""" 3 | 4 | import chainer 5 | import chainer.functions as F 6 | import numpy as np 7 | 8 | 9 | class PositionalEncoding(chainer.Chain): 10 | """Positional encoding module. 11 | 12 | :param int n_units: embedding dim 13 | :param float dropout: dropout rate 14 | :param int length: maximum input length 15 | 16 | """ 17 | 18 | def __init__(self, n_units, dropout=0.1, length=5000): 19 | """Initialize Positional Encoding.""" 20 | # Implementation described in the paper 21 | super(PositionalEncoding, self).__init__() 22 | self.dropout = dropout 23 | posi_block = np.arange(0, length, dtype=np.float32)[:, None] 24 | unit_block = np.exp( 25 | np.arange(0, n_units, 2, dtype=np.float32) * -(np.log(10000.0) / n_units) 26 | ) 27 | self.pe = np.zeros((length, n_units), dtype=np.float32) 28 | self.pe[:, ::2] = np.sin(posi_block * unit_block) 29 | self.pe[:, 1::2] = np.cos(posi_block * unit_block) 30 | self.scale = np.sqrt(n_units) 31 | 32 | def forward(self, e): 33 | """Forward Positional Encoding.""" 34 | length = e.shape[1] 35 | e = e * self.scale + self.xp.array(self.pe[:length]) 36 | return F.dropout(e, self.dropout) 37 | -------------------------------------------------------------------------------- /espnet/nets/chainer_backend/transformer/encoder_layer.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | """Class Declaration of Transformer's Encoder Block.""" 3 | 4 | import chainer 5 | import chainer.functions as F 6 | 7 | from espnet.nets.chainer_backend.transformer.attention import MultiHeadAttention 8 | from espnet.nets.chainer_backend.transformer.layer_norm import LayerNorm 9 | from espnet.nets.chainer_backend.transformer.positionwise_feed_forward import ( 10 | PositionwiseFeedForward, 11 | ) 12 | 13 | 14 | class EncoderLayer(chainer.Chain): 15 | """Single encoder layer module. 
--------------------------------------------------------------------------------
/espnet/nets/chainer_backend/transformer/encoder_layer.py:
--------------------------------------------------------------------------------
1 | # encoding: utf-8
2 | """Class Declaration of Transformer's Encoder Block."""
3 |
4 | import chainer
5 | import chainer.functions as F
6 |
7 | from espnet.nets.chainer_backend.transformer.attention import MultiHeadAttention
8 | from espnet.nets.chainer_backend.transformer.layer_norm import LayerNorm
9 | from espnet.nets.chainer_backend.transformer.positionwise_feed_forward import (
10 |     PositionwiseFeedForward,
11 | )
12 |
13 |
14 | class EncoderLayer(chainer.Chain):
15 |     """Single encoder layer module.
16 |
17 |     Args:
18 |         n_units (int): Number of input/output dimensions of a FeedForward layer.
19 |         d_units (int): Number of units of hidden layer in a FeedForward layer.
20 |         h (int): Number of attention heads.
21 |         dropout (float): Dropout rate
22 |
23 |     """
24 |
25 |     def __init__(
26 |         self, n_units, d_units=0, h=8, dropout=0.1, initialW=None, initial_bias=None
27 |     ):
28 |         """Initialize EncoderLayer."""
29 |         super(EncoderLayer, self).__init__()
30 |         with self.init_scope():
31 |             self.self_attn = MultiHeadAttention(
32 |                 n_units,
33 |                 h,
34 |                 dropout=dropout,
35 |                 initialW=initialW,
36 |                 initial_bias=initial_bias,
37 |             )
38 |             self.feed_forward = PositionwiseFeedForward(
39 |                 n_units,
40 |                 d_units=d_units,
41 |                 dropout=dropout,
42 |                 initialW=initialW,
43 |                 initial_bias=initial_bias,
44 |             )
45 |             self.norm1 = LayerNorm(n_units)
46 |             self.norm2 = LayerNorm(n_units)
47 |         self.dropout = dropout
48 |         self.n_units = n_units
49 |
50 |     def forward(self, e, xx_mask, batch):
51 |         """Forward EncoderLayer."""
52 |         n_e = self.norm1(e)
53 |         n_e = self.self_attn(n_e, mask=xx_mask, batch=batch)
54 |         e = e + F.dropout(n_e, self.dropout)
55 |
56 |         n_e = self.norm2(e)
57 |         n_e = self.feed_forward(n_e)
58 |         e = e + F.dropout(n_e, self.dropout)
59 |         return e
60 |
--------------------------------------------------------------------------------
/espnet/nets/chainer_backend/transformer/layer_norm.py:
--------------------------------------------------------------------------------
1 | # encoding: utf-8
2 | """Class Declaration of Transformer's Layer Normalization."""
3 |
4 | import chainer.links as L
5 |
6 |
7 | class LayerNorm(L.LayerNormalization):
8 |     """Redirect to L.LayerNormalization."""
9 |
10 |     def __init__(self, dims, eps=1e-12):
11 |         """Initialize LayerNorm."""
12 |         super(LayerNorm, self).__init__(size=dims, eps=eps)
13 |
14 |     def __call__(self, e):
15 |         """Forward LayerNorm."""
16 |         return super(LayerNorm, self).__call__(e)
17 |
--------------------------------------------------------------------------------
/espnet/nets/chainer_backend/transformer/mask.py:
--------------------------------------------------------------------------------
1 | """Create mask for subsequent steps."""
2 |
3 |
4 | def make_history_mask(xp, block):
5 |     """Prepare the history mask.
6 |
7 |     Args:
8 |         block (ndarray): Block with dimensions: (B x S).
9 |     Returns:
10 |         ndarray: History mask with dimensions (B, S, S).
11 |
12 |     """
13 |     batch, length = block.shape
14 |     arange = xp.arange(length)
15 |     history_mask = (arange[None] <= arange[:, None])[None,]
16 |     history_mask = xp.broadcast_to(history_mask, (batch, length, length))
17 |     return history_mask
18 |
--------------------------------------------------------------------------------
/espnet/nets/chainer_backend/transformer/positionwise_feed_forward.py:
--------------------------------------------------------------------------------
1 | # encoding: utf-8
2 | """Class Declaration of Transformer's Positionwise Feedforward."""
3 |
4 | import chainer
5 | import chainer.functions as F
6 | import chainer.links as L
7 | import numpy as np
8 |
9 |
10 | class PositionwiseFeedForward(chainer.Chain):
11 |     """Positionwise feed forward.
12 |
13 |     Args:
14 |         :param int idim: input dimension
15 |         :param int hidden_units: number of hidden units
16 |         :param float dropout_rate: dropout rate
17 |
18 |     """
19 |
20 |     def __init__(
21 |         self, n_units, d_units=0, dropout=0.1, initialW=None, initial_bias=None
22 |     ):
23 |         """Initialize PositionwiseFeedForward.
24 |
25 |         Args:
26 |             n_units (int): Input dimension.
27 |             d_units (int, optional): Output dimension of hidden layer.
28 |             dropout (float, optional): Dropout ratio.
29 |             initialW (callable, optional): Initializer to initialize the weight.
30 |             initial_bias (callable, optional): Initializer to initialize the bias.
31 |
32 |         """
33 |         super(PositionwiseFeedForward, self).__init__()
34 |         n_inner_units = d_units if d_units > 0 else n_units * 4
35 |         with self.init_scope():
36 |             stvd = 1.0 / np.sqrt(n_units)
37 |             self.w_1 = L.Linear(
38 |                 n_units,
39 |                 n_inner_units,
40 |                 initialW=initialW(scale=stvd),
41 |                 initial_bias=initial_bias(scale=stvd),
42 |             )
43 |             stvd = 1.0 / np.sqrt(n_inner_units)
44 |             self.w_2 = L.Linear(
45 |                 n_inner_units,
46 |                 n_units,
47 |                 initialW=initialW(scale=stvd),
48 |                 initial_bias=initial_bias(scale=stvd),
49 |             )
50 |         self.act = F.relu
51 |         self.dropout = dropout
52 |
53 |     def __call__(self, e):
54 |         """Forward PositionwiseFeedForward.
55 |
56 |         Args:
57 |             e (chainer.Variable): Input variable.
58 |
59 |         Returns:
60 |             chainer.Variable: Output variable.
61 |
62 |         """
63 |         e = F.dropout(self.act(self.w_1(e)), self.dropout)
64 |         return self.w_2(e)
65 |
--------------------------------------------------------------------------------
/espnet/nets/pytorch_backend/__init__.py:
--------------------------------------------------------------------------------
1 | """Initialize sub package."""
2 |
--------------------------------------------------------------------------------
/espnet/nets/pytorch_backend/conformer/__init__.py:
--------------------------------------------------------------------------------
1 | """Initialize sub package."""
2 |
--------------------------------------------------------------------------------
/espnet/nets/pytorch_backend/conformer/swish.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 |
4 | # Copyright 2020 Johns Hopkins University (Shinji Watanabe)
5 | #                Northwestern Polytechnical University (Pengcheng Guo)
6 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
7 |
8 | """Swish() activation function for Conformer."""
9 |
10 | import torch
11 |
12 |
13 | class Swish(torch.nn.Module):
14 |     """Construct a Swish object."""
15 |
16 |     def forward(self, x):
17 |         """Return Swish activation function."""
18 |         return x * torch.sigmoid(x)
19 |
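
Swish(x) = x · sigmoid(x) is the same function PyTorch ships as torch.nn.SiLU, so the two are interchangeable; a quick equivalence check (a sketch for illustration, not repository code):

import torch

# Sketch: Swish coincides with torch.nn.SiLU (x * sigmoid(x)).
x = torch.randn(4)
assert torch.allclose(x * torch.sigmoid(x), torch.nn.SiLU()(x))
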
--------------------------------------------------------------------------------
/espnet/nets/pytorch_backend/fastspeech/__init__.py:
--------------------------------------------------------------------------------
1 | """Initialize sub package."""
2 |
--------------------------------------------------------------------------------
/espnet/nets/pytorch_backend/frontends/__init__.py:
--------------------------------------------------------------------------------
1 | """Initialize sub package."""
2 |
--------------------------------------------------------------------------------
/espnet/nets/pytorch_backend/initialization.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | # Copyright 2019 Kyoto University (Hirofumi Inaguma)
4 | #  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)
5 |
6 | """Initialization functions for RNN sequence-to-sequence models."""
7 |
8 | import math
9 |
10 |
11 | def lecun_normal_init_parameters(module):
12 |     """Initialize parameters in the LeCun's manner."""
13 |     for p in module.parameters():
14 |         data = p.data
15 |         if data.dim() == 1:
16 |             # bias
17 |             data.zero_()
18 |         elif data.dim() == 2:
19 |             # linear weight
20 |             n = data.size(1)
21 |             stdv = 1.0 / math.sqrt(n)
22 |             data.normal_(0, stdv)
23 |         elif data.dim() in (3, 4):
24 |             # conv weight
25 |             n = data.size(1)
26 |             for k in data.size()[2:]:
27 |                 n *= k
28 |             stdv = 1.0 / math.sqrt(n)
29 |             data.normal_(0, stdv)
30 |         else:
31 |             raise NotImplementedError
32 |
33 |
34 | def uniform_init_parameters(module):
35 |     """Initialize parameters with a uniform distribution."""
36 |     for p in module.parameters():
37 |         data = p.data
38 |         if data.dim() == 1:
39 |             # bias
40 |             data.uniform_(-0.1, 0.1)
41 |         elif data.dim() == 2:
42 |             # linear weight
43 |             data.uniform_(-0.1, 0.1)
44 |         elif data.dim() in (3, 4):
45 |             # conv weight
46 |             pass  # use the pytorch default
47 |         else:
48 |             raise NotImplementedError
49 |
50 |
51 | def set_forget_bias_to_one(bias):
52 |     """Initialize a bias vector in the forget gate with one."""
53 |     n = bias.size(0)
54 |     start, end = n // 4, n // 2
55 |     bias.data[start:end].fill_(1.0)
56 |
--------------------------------------------------------------------------------
/espnet/nets/pytorch_backend/lm/__init__.py:
--------------------------------------------------------------------------------
1 | """Initialize sub package."""
2 |
--------------------------------------------------------------------------------
/espnet/nets/pytorch_backend/maskctc/__init__.py:
--------------------------------------------------------------------------------
1 | """Initialize sub package."""
2 |
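
For `lecun_normal_init_parameters` in initialization.py above, a Linear weight ends up with std ≈ 1/sqrt(fan_in) and a zeroed bias; a quick check (a sketch for illustration, not repository code; the layer sizes are arbitrary):

import math
import torch
from espnet.nets.pytorch_backend.initialization import lecun_normal_init_parameters

# Sketch: fan_in = 256, so the sampled weight std should be close to 1/16.
layer = torch.nn.Linear(256, 128)
lecun_normal_init_parameters(layer)
assert abs(layer.weight.std().item() - 1.0 / math.sqrt(256)) < 0.01
assert layer.bias.abs().sum().item() == 0.0
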
--------------------------------------------------------------------------------
/espnet/nets/pytorch_backend/maskctc/add_mask_token.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 |
4 | # Copyright 2020 Johns Hopkins University (Shinji Watanabe)
5 | #                Waseda University (Yosuke Higuchi)
6 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
7 |
8 | """Token masking module for Masked LM."""
9 |
10 | import numpy
11 |
12 |
13 | def mask_uniform(ys_pad, mask_token, eos, ignore_id):
14 |     """Replace random tokens with <mask> label and add <eos> label.
15 |
16 |     The number of <mask> is chosen from a uniform distribution
17 |     between one and the target sequence's length.
18 |     :param torch.Tensor ys_pad: batch of padded target sequences (B, Lmax)
19 |     :param int mask_token: index of <mask>
20 |     :param int eos: index of <eos>
21 |     :param int ignore_id: index of padding
22 |     :return: padded tensor (B, Lmax)
23 |     :rtype: torch.Tensor
24 |     :return: padded tensor (B, Lmax)
25 |     :rtype: torch.Tensor
26 |     """
27 |     from espnet.nets.pytorch_backend.nets_utils import pad_list
28 |
29 |     ys = [y[y != ignore_id] for y in ys_pad]  # parse padded ys
30 |     ys_out = [y.new(y.size()).fill_(ignore_id) for y in ys]
31 |     ys_in = [y.clone() for y in ys]
32 |     for i in range(len(ys)):
33 |         num_samples = numpy.random.randint(1, len(ys[i]) + 1)
34 |         idx = numpy.random.choice(len(ys[i]), num_samples)
35 |
36 |         ys_in[i][idx] = mask_token
37 |         ys_out[i][idx] = ys[i][idx]
38 |
39 |     return pad_list(ys_in, eos), pad_list(ys_out, ignore_id)
40 |
--------------------------------------------------------------------------------
/espnet/nets/pytorch_backend/maskctc/mask.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 |
4 | # Copyright 2020 Johns Hopkins University (Shinji Watanabe)
5 | #                Waseda University (Yosuke Higuchi)
6 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
7 |
8 | """Attention masking module for Masked LM."""
9 |
10 |
11 | def square_mask(ys_in_pad, ignore_id):
12 |     """Create attention mask to avoid attending on padding tokens.
13 |
14 |     :param torch.Tensor ys_in_pad: batch of padded target sequences (B, Lmax)
15 |     :param int ignore_id: index of padding
16 |     :rtype: torch.Tensor (B, Lmax, Lmax)
17 |     """
18 |     ys_mask = (ys_in_pad != ignore_id).unsqueeze(-2)
19 |     ymax = ys_mask.size(-1)
20 |     ys_mask_tmp = ys_mask.transpose(1, 2).repeat(1, 1, ymax)
21 |     ys_mask = ys_mask.repeat(1, ymax, 1) & ys_mask_tmp
22 |
23 |     return ys_mask
24 |
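
square_mask blanks out both the row and the column of every padded position; a toy run (a sketch for illustration, not repository code; ignore_id = -1, token ids arbitrary):

import torch
from espnet.nets.pytorch_backend.maskctc.mask import square_mask

# Sketch: the last position is padding, so row 2 and column 2 go to 0.
ys_in_pad = torch.tensor([[5, 6, -1]])
print(square_mask(ys_in_pad, ignore_id=-1)[0].int())
# tensor([[1, 1, 0],
#         [1, 1, 0],
#         [0, 0, 0]], dtype=torch.int32)
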
--------------------------------------------------------------------------------
/espnet/nets/pytorch_backend/rnn/__init__.py:
--------------------------------------------------------------------------------
1 | """Initialize sub package."""
2 |
--------------------------------------------------------------------------------
/espnet/nets/pytorch_backend/streaming/__init__.py:
--------------------------------------------------------------------------------
1 | """Initialize sub package."""
2 |
--------------------------------------------------------------------------------
/espnet/nets/pytorch_backend/tacotron2/__init__.py:
--------------------------------------------------------------------------------
1 | """Initialize sub package."""
2 |
--------------------------------------------------------------------------------
/espnet/nets/pytorch_backend/transducer/__init__.py:
--------------------------------------------------------------------------------
1 | """Initialize sub package."""
2 |
--------------------------------------------------------------------------------
/espnet/nets/pytorch_backend/transducer/initializer.py:
--------------------------------------------------------------------------------
1 | """Parameter initialization for Transducer model."""
2 |
3 | import math
4 | from argparse import Namespace
5 |
6 | import torch
7 |
8 | from espnet.nets.pytorch_backend.initialization import set_forget_bias_to_one
9 |
10 |
11 | def initializer(model: torch.nn.Module, args: Namespace):
12 |     """Initialize Transducer model.
13 |
14 |     Args:
15 |         model: Transducer model.
16 |         args: Namespace containing model options.
17 |
18 |     """
19 |     for name, p in model.named_parameters():
20 |         if any(x in name for x in ["enc.", "dec.", "transducer_tasks."]):
21 |             if p.dim() == 1:
22 |                 # bias
23 |                 p.data.zero_()
24 |             elif p.dim() == 2:
25 |                 # linear weight
26 |                 n = p.size(1)
27 |                 stdv = 1.0 / math.sqrt(n)
28 |                 p.data.normal_(0, stdv)
29 |             elif p.dim() in (3, 4):
30 |                 # conv weight
31 |                 n = p.size(1)
32 |                 for k in p.size()[2:]:
33 |                     n *= k
34 |                 stdv = 1.0 / math.sqrt(n)
35 |                 p.data.normal_(0, stdv)
36 |
37 |     if args.dtype != "custom":
38 |         model.dec.embed.weight.data.normal_(0, 1)
39 |
40 |         for i in range(model.dec.dlayers):
41 |             set_forget_bias_to_one(getattr(model.dec.decoder[i], "bias_ih_l0"))
42 |             set_forget_bias_to_one(getattr(model.dec.decoder[i], "bias_hh_l0"))
43 |
--------------------------------------------------------------------------------
/espnet/nets/pytorch_backend/transformer/__init__.py:
--------------------------------------------------------------------------------
1 | """Initialize sub package."""
2 |
--------------------------------------------------------------------------------
/espnet/nets/pytorch_backend/transformer/add_sos_eos.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 |
4 | # Copyright 2019 Shigeki Karita
5 | #  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)
6 |
7 | """Utility functions for Transformer."""
8 |
9 | import torch
10 |
11 |
12 | def add_sos_eos(ys_pad, sos, eos, ignore_id):
13 |     """Add <sos> and <eos> labels.
14 |
15 |     :param torch.Tensor ys_pad: batch of padded target sequences (B, Lmax)
16 |     :param int sos: index of <sos>
17 |     :param int eos: index of <eos>
18 |     :param int ignore_id: index of padding
19 |     :return: padded tensor (B, Lmax)
20 |     :rtype: torch.Tensor
21 |     :return: padded tensor (B, Lmax)
22 |     :rtype: torch.Tensor
23 |     """
24 |     from espnet.nets.pytorch_backend.nets_utils import pad_list
25 |
26 |     _sos = ys_pad.new([sos])
27 |     _eos = ys_pad.new([eos])
28 |     ys = [y[y != ignore_id] for y in ys_pad]  # parse padded ys
29 |     ys_in = [torch.cat([_sos, y], dim=0) for y in ys]
30 |     ys_out = [torch.cat([y, _eos], dim=0) for y in ys]
31 |     return pad_list(ys_in, eos), pad_list(ys_out, ignore_id)
32 |
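
A toy call shows the decoder input gaining <sos> and the target gaining <eos> (a sketch for illustration, not repository code; token ids are arbitrary):

import torch
from espnet.nets.pytorch_backend.transformer.add_sos_eos import add_sos_eos

# Sketch: sos=9, eos=10, single sequence with no padding.
ys_in, ys_out = add_sos_eos(torch.tensor([[1, 2, 3]]), sos=9, eos=10, ignore_id=-1)
print(ys_in)   # tensor([[9, 1, 2, 3]])
print(ys_out)  # tensor([[ 1,  2,  3, 10]])
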
--------------------------------------------------------------------------------
/espnet/nets/pytorch_backend/transformer/initializer.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 |
4 | # Copyright 2019 Shigeki Karita
5 | #  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)
6 |
7 | """Parameter initialization."""
8 |
9 | import torch
10 |
11 | from espnet.nets.pytorch_backend.transformer.layer_norm import LayerNorm
12 |
13 |
14 | def initialize(model, init_type="pytorch"):
15 |     """Initialize Transformer module.
16 |
17 |     :param torch.nn.Module model: transformer instance
18 |     :param str init_type: initialization type
19 |     """
20 |     if init_type == "pytorch":
21 |         return
22 |
23 |     # weight init
24 |     for p in model.parameters():
25 |         if p.dim() > 1:
26 |             if init_type == "xavier_uniform":
27 |                 torch.nn.init.xavier_uniform_(p.data)
28 |             elif init_type == "xavier_normal":
29 |                 torch.nn.init.xavier_normal_(p.data)
30 |             elif init_type == "kaiming_uniform":
31 |                 torch.nn.init.kaiming_uniform_(p.data, nonlinearity="relu")
32 |             elif init_type == "kaiming_normal":
33 |                 torch.nn.init.kaiming_normal_(p.data, nonlinearity="relu")
34 |             else:
35 |                 raise ValueError("Unknown initialization: " + init_type)
36 |     # bias init
37 |     for p in model.parameters():
38 |         if p.dim() == 1:
39 |             p.data.zero_()
40 |
41 |     # reset some modules with default init
42 |     for m in model.modules():
43 |         if isinstance(m, (torch.nn.Embedding, LayerNorm)):
44 |             m.reset_parameters()
45 |
--------------------------------------------------------------------------------
/espnet/nets/pytorch_backend/transformer/layer_norm.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 |
4 | # Copyright 2019 Shigeki Karita
5 | #  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)
6 |
7 | """Layer normalization module."""
8 |
9 | import torch
10 |
11 |
12 | class LayerNorm(torch.nn.LayerNorm):
13 |     """Layer normalization module.
14 |
15 |     Args:
16 |         nout (int): Output dim size.
17 |         dim (int): Dimension to be normalized.
18 |
19 |     """
20 |
21 |     def __init__(self, nout, dim=-1):
22 |         """Construct a LayerNorm object."""
23 |         super(LayerNorm, self).__init__(nout, eps=1e-12)
24 |         self.dim = dim
25 |
26 |     def forward(self, x):
27 |         """Apply layer normalization.
28 |
29 |         Args:
30 |             x (torch.Tensor): Input tensor.
31 |
32 |         Returns:
33 |             torch.Tensor: Normalized tensor.
34 |
35 |         """
36 |         if self.dim == -1:
37 |             return super(LayerNorm, self).forward(x)
38 |         return (
39 |             super(LayerNorm, self)
40 |             .forward(x.transpose(self.dim, -1))
41 |             .transpose(self.dim, -1)
42 |         )
43 |
--------------------------------------------------------------------------------
/espnet/nets/pytorch_backend/transformer/mask.py:
--------------------------------------------------------------------------------
1 | # Copyright 2019 Shigeki Karita
2 | #  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)
3 |
4 | """Mask module."""
5 |
6 | import torch
7 |
8 |
9 | def subsequent_mask(size, device="cpu", dtype=torch.bool):
10 |     """Create mask for subsequent steps (size, size).
11 |
12 |     :param int size: size of mask
13 |     :param str device: "cpu" or "cuda" or torch.Tensor.device
14 |     :param torch.dtype dtype: result dtype
15 |     :rtype: torch.Tensor
16 |     >>> subsequent_mask(3)
17 |     [[1, 0, 0],
18 |      [1, 1, 0],
19 |      [1, 1, 1]]
20 |     """
21 |     ret = torch.ones(size, size, device=device, dtype=dtype)
22 |     return torch.tril(ret, out=ret)
23 |
24 |
25 | def target_mask(ys_in_pad, ignore_id):
26 |     """Create mask for decoder self-attention.
27 |
28 |     :param torch.Tensor ys_in_pad: batch of padded target sequences (B, Lmax)
29 |     :param int ignore_id: index of padding
30 |     :rtype: torch.Tensor (B, Lmax, Lmax)
31 |     """
32 |     ys_mask = ys_in_pad != ignore_id
33 |     m = subsequent_mask(ys_mask.size(-1), device=ys_mask.device).unsqueeze(0)
34 |     return ys_mask.unsqueeze(-2) & m
35 |
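
target_mask combines the causal triangle with the key-side padding mask; a toy run (a sketch for illustration, not repository code; ignore_id = -1, token ids arbitrary):

import torch
from espnet.nets.pytorch_backend.transformer.mask import target_mask

# Sketch: 3 real tokens followed by one padded position; the padded
# key column is zeroed everywhere, on top of the lower-triangular mask.
print(target_mask(torch.tensor([[9, 1, 2, -1]]), ignore_id=-1)[0].int())
# tensor([[1, 0, 0, 0],
#         [1, 1, 0, 0],
#         [1, 1, 1, 0],
#         [1, 1, 1, 0]], dtype=torch.int32)
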
--------------------------------------------------------------------------------
/espnet/nets/pytorch_backend/transformer/positionwise_feed_forward.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 |
4 | # Copyright 2019 Shigeki Karita
5 | #  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)
6 |
7 | """Positionwise feed forward layer definition."""
8 |
9 | import torch
10 |
11 |
12 | class PositionwiseFeedForward(torch.nn.Module):
13 |     """Positionwise feed forward layer.
14 |
15 |     Args:
16 |         idim (int): Input dimension.
17 |         hidden_units (int): The number of hidden units.
18 |         dropout_rate (float): Dropout rate.
19 |
20 |     """
21 |
22 |     def __init__(self, idim, hidden_units, dropout_rate, activation=torch.nn.ReLU()):
23 |         """Construct a PositionwiseFeedForward object."""
24 |         super(PositionwiseFeedForward, self).__init__()
25 |         self.w_1 = torch.nn.Linear(idim, hidden_units)
26 |         self.w_2 = torch.nn.Linear(hidden_units, idim)
27 |         self.dropout = torch.nn.Dropout(dropout_rate)
28 |         self.activation = activation
29 |
30 |     def forward(self, x):
31 |         """Forward function."""
32 |         return self.w_2(self.dropout(self.activation(self.w_1(x))))
33 |
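
The layer expands to hidden_units and projects back to idim, so input and output shapes match; a quick check (a sketch for illustration, not repository code; the sizes are arbitrary):

import torch
from espnet.nets.pytorch_backend.transformer.positionwise_feed_forward import (
    PositionwiseFeedForward,
)

# Sketch: (batch, time, idim) goes in, the same shape comes out.
ff = PositionwiseFeedForward(idim=8, hidden_units=32, dropout_rate=0.1)
assert ff(torch.randn(2, 5, 8)).shape == (2, 5, 8)
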
--------------------------------------------------------------------------------
/espnet/nets/pytorch_backend/transformer/repeat.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 |
4 | # Copyright 2019 Shigeki Karita
5 | #  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)
6 |
7 | """Repeat the same layer definition."""
8 |
9 | import torch
10 |
11 |
12 | class MultiSequential(torch.nn.Sequential):
13 |     """Multi-input multi-output torch.nn.Sequential."""
14 |
15 |     def __init__(self, *args, layer_drop_rate=0.0):
16 |         """Initialize MultiSequential with layer_drop.
17 |
18 |         Args:
19 |             layer_drop_rate (float): Probability of dropping out each fn (layer).
20 |
21 |         """
22 |         super(MultiSequential, self).__init__(*args)
23 |         self.layer_drop_rate = layer_drop_rate
24 |
25 |     def forward(self, *args):
26 |         """Repeat."""
27 |         _probs = torch.empty(len(self)).uniform_()
28 |         for idx, m in enumerate(self):
29 |             if not self.training or (_probs[idx] >= self.layer_drop_rate):
30 |                 args = m(*args)
31 |         return args
32 |
33 |
34 | def repeat(N, fn, layer_drop_rate=0.0):
35 |     """Repeat module N times.
36 |
37 |     Args:
38 |         N (int): Number of times to repeat.
39 |         fn (Callable): Function to generate module.
40 |         layer_drop_rate (float): Probability of dropping out each fn (layer).
41 |
42 |     Returns:
43 |         MultiSequential: Repeated model instance.
44 |
45 |     """
46 |     return MultiSequential(*[fn(n) for n in range(N)], layer_drop_rate=layer_drop_rate)
47 |
--------------------------------------------------------------------------------
/espnet/nets/pytorch_backend/transformer/subsampling_without_posenc.py:
--------------------------------------------------------------------------------
1 | # Copyright 2020 Emiru Tsunoo
2 | #  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)
3 |
4 | """Subsampling layer definition."""
5 |
6 | import math
7 |
8 | import torch
9 |
10 |
11 | class Conv2dSubsamplingWOPosEnc(torch.nn.Module):
12 |     """Convolutional 2D subsampling.
13 |
14 |     Args:
15 |         idim (int): Input dimension.
16 |         odim (int): Output dimension.
17 |         dropout_rate (float): Dropout rate.
18 |         kernels (list): kernel sizes
19 |         strides (list): stride sizes
20 |
21 |     """
22 |
23 |     def __init__(self, idim, odim, dropout_rate, kernels, strides):
24 |         """Construct a Conv2dSubsamplingWOPosEnc object."""
25 |         assert len(kernels) == len(strides)
26 |         super().__init__()
27 |         conv = []
28 |         olen = idim
29 |         for i, (k, s) in enumerate(zip(kernels, strides)):
30 |             conv += [
31 |                 torch.nn.Conv2d(1 if i == 0 else odim, odim, k, s),
32 |                 torch.nn.ReLU(),
33 |             ]
34 |             olen = math.floor((olen - k) / s + 1)
35 |         self.conv = torch.nn.Sequential(*conv)
36 |         self.out = torch.nn.Linear(odim * olen, odim)
37 |         self.strides = strides
38 |         self.kernels = kernels
39 |
40 |     def forward(self, x, x_mask):
41 |         """Subsample x.
42 |
43 |         Args:
44 |             x (torch.Tensor): Input tensor (#batch, time, idim).
45 |             x_mask (torch.Tensor): Input mask (#batch, 1, time).
46 |
47 |         Returns:
48 |             torch.Tensor: Subsampled tensor (#batch, time', odim),
49 |                 where time' = time // 4.
50 |             torch.Tensor: Subsampled mask (#batch, 1, time'),
51 |                 where time' = time // 4.
52 |
53 |         """
54 |         x = x.unsqueeze(1)  # (b, c, t, f)
55 |         x = self.conv(x)
56 |         b, c, t, f = x.size()
57 |         x = self.out(x.transpose(1, 2).contiguous().view(b, t, c * f))
58 |         if x_mask is None:
59 |             return x, None
60 |         for k, s in zip(self.kernels, self.strides):
61 |             x_mask = x_mask[:, :, : -k + 1 : s]
62 |         return x, x_mask
63 |
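
Each conv layer shrinks an axis by olen = floor((olen - k) / s + 1), as in __init__ above; with the typical two kernel-3, stride-2 layers, 100 frames become 49 and then 24 (a sketch for illustration, not repository code):

import math

# Sketch: time-axis lengths through two (kernel=3, stride=2) layers.
olen = 100
for k, s in [(3, 2), (3, 2)]:
    olen = math.floor((olen - k) / s + 1)
print(olen)  # 24, roughly time // 4 as the forward() docstring says
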
28 | 29 | Returns: 30 | tuple[torch.Tensor, Any]: Tuple of 31 | torch.float32 scores for next token (n_vocab) 32 | and None 33 | 34 | """ 35 | return torch.tensor([1.0], device=x.device, dtype=x.dtype).expand(self.n), None 36 | 37 | def batch_score( 38 | self, ys: torch.Tensor, states: List[Any], xs: torch.Tensor 39 | ) -> Tuple[torch.Tensor, List[Any]]: 40 | """Score new token batch. 41 | 42 | Args: 43 | ys (torch.Tensor): torch.int64 prefix tokens (n_batch, ylen). 44 | states (List[Any]): Scorer states for prefix tokens. 45 | xs (torch.Tensor): 46 | The encoder feature that generates ys (n_batch, xlen, n_feat). 47 | 48 | Returns: 49 | tuple[torch.Tensor, List[Any]]: Tuple of 50 | batchfied scores for next token with shape of `(n_batch, n_vocab)` 51 | and next state list for ys. 52 | 53 | """ 54 | return ( 55 | torch.tensor([1.0], device=xs.device, dtype=xs.dtype).expand( 56 | ys.shape[0], self.n 57 | ), 58 | None, 59 | ) 60 | -------------------------------------------------------------------------------- /espnet/nets/scorers/uasr.py: -------------------------------------------------------------------------------- 1 | """ScorerInterface implementation for UASR.""" 2 | 3 | import numpy as np 4 | import torch 5 | 6 | from espnet.nets.ctc_prefix_score import CTCPrefixScore, CTCPrefixScoreTH 7 | from espnet.nets.scorers.ctc import CTCPrefixScorer 8 | 9 | 10 | class UASRPrefixScorer(CTCPrefixScorer): 11 | """Decoder interface wrapper for CTCPrefixScore.""" 12 | 13 | def __init__(self, eos: int): 14 | """Initialize class.""" 15 | self.eos = eos 16 | 17 | def init_state(self, x: torch.Tensor): 18 | """Get an initial state for decoding. 19 | 20 | Args: 21 | x (torch.Tensor): The encoded feature tensor 22 | 23 | Returns: initial state 24 | 25 | """ 26 | x[:, 0] = x[:, 0] - 100000000000 # simulate a no-blank CTC 27 | self.logp = ( 28 | torch.nn.functional.log_softmax(x, dim=1).detach().squeeze(0).cpu().numpy() 29 | ) 30 | # TODO(karita): use CTCPrefixScoreTH 31 | self.impl = CTCPrefixScore(self.logp, 0, self.eos, np) 32 | return 0, self.impl.initial_state() 33 | 34 | def batch_init_state(self, x: torch.Tensor): 35 | """Get an initial state for decoding. 
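
        Batched counterpart of ``init_state``: it prepares a
        ``CTCPrefixScoreTH`` over the log-softmax outputs, assuming a batch
        size of one (as noted inline below).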
36 | 37 | Args: 38 | x (torch.Tensor): The encoded feature tensor 39 | 40 | Returns: initial state 41 | 42 | """ 43 | x[:, 0] = x[:, 0] - 100000000000 # simulate a no-blank CTC 44 | logp = torch.nn.functional.log_softmax(x, dim=1).unsqueeze( 45 | 0 46 | ) # assuming batch_size = 1 47 | xlen = torch.tensor([logp.size(1)]) 48 | self.impl = CTCPrefixScoreTH(logp, xlen, 0, self.eos) 49 | return None 50 | -------------------------------------------------------------------------------- /espnet/optimizer/__init__.py: -------------------------------------------------------------------------------- 1 | """Initialize sub package.""" 2 | -------------------------------------------------------------------------------- /espnet/optimizer/parser.py: -------------------------------------------------------------------------------- 1 | """Common optimizer default config for multiple backends.""" 2 | 3 | 4 | def sgd(parser): 5 | """Add arguments.""" 6 | parser.add_argument("--lr", type=float, default=1.0, help="Learning rate") 7 | parser.add_argument("--weight-decay", type=float, default=0.0, help="Weight decay") 8 | return parser 9 | 10 | 11 | def adam(parser): 12 | """Add arguments.""" 13 | parser.add_argument("--lr", type=float, default=1e-3, help="Learning rate") 14 | parser.add_argument("--beta1", type=float, default=0.9, help="Beta1") 15 | parser.add_argument("--beta2", type=float, default=0.999, help="Beta2") 16 | parser.add_argument("--weight-decay", type=float, default=0.0, help="Weight decay") 17 | return parser 18 | 19 | 20 | def adadelta(parser): 21 | """Add arguments.""" 22 | parser.add_argument("--rho", type=float, default=0.95, help="Rho") 23 | parser.add_argument("--eps", type=float, default=1e-8, help="Eps") 24 | parser.add_argument("--weight-decay", type=float, default=0.0, help="Weight decay") 25 | return parser 26 | -------------------------------------------------------------------------------- /espnet/scheduler/__init__.py: -------------------------------------------------------------------------------- 1 | """Initialize sub package.""" 2 | -------------------------------------------------------------------------------- /espnet/scheduler/chainer.py: -------------------------------------------------------------------------------- 1 | """Chainer optimizer schedulers.""" 2 | 3 | from typing import List 4 | 5 | from chainer.optimizer import Optimizer 6 | 7 | from espnet.scheduler.scheduler import SchedulerInterface 8 | 9 | 10 | class ChainerScheduler: 11 | """Chainer optimizer scheduler.""" 12 | 13 | def __init__(self, schedulers: List[SchedulerInterface], optimizer: Optimizer): 14 | """Initialize class.""" 15 | self.schedulers = schedulers 16 | self.optimizer = optimizer 17 | self.init_values = dict() 18 | for s in self.schedulers: 19 | self.init_values[s.key] = getattr(self.optimizer, s.key) 20 | 21 | def step(self, n_iter: int): 22 | """Update optimizer by scheduling.""" 23 | for s in self.schedulers: 24 | new_val = self.init_values[s.key] * s.scale(n_iter) 25 | setattr(self.optimizer, s.key, new_val) 26 | -------------------------------------------------------------------------------- /espnet/scheduler/pytorch.py: -------------------------------------------------------------------------------- 1 | """PyTorch optimizer schedulers.""" 2 | 3 | from typing import List 4 | 5 | from torch.optim import Optimizer 6 | 7 | from espnet.scheduler.scheduler import SchedulerInterface 8 | 9 | 10 | class PyTorchScheduler: 11 | """PyTorch optimizer scheduler.""" 12 | 13 | def __init__(self, schedulers: 
List[SchedulerInterface], optimizer: Optimizer): 14 | """Initialize class.""" 15 | self.schedulers = schedulers 16 | self.optimizer = optimizer 17 | for s in self.schedulers: 18 | for group in optimizer.param_groups: 19 | group.setdefault("initial_" + s.key, group[s.key]) 20 | 21 | def step(self, n_iter: int): 22 | """Update optimizer by scheduling.""" 23 | for s in self.schedulers: 24 | for group in self.optimizer.param_groups: 25 | group[s.key] = group["initial_" + s.key] * s.scale(n_iter) 26 | -------------------------------------------------------------------------------- /espnet/st/__init__.py: -------------------------------------------------------------------------------- 1 | """Initialize sub package.""" 2 | -------------------------------------------------------------------------------- /espnet/st/pytorch_backend/__init__.py: -------------------------------------------------------------------------------- 1 | """Initialize sub package.""" 2 | -------------------------------------------------------------------------------- /espnet/transform/__init__.py: -------------------------------------------------------------------------------- 1 | """Initialize main package.""" 2 | -------------------------------------------------------------------------------- /espnet/transform/add_deltas.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def delta(feat, window): 5 | assert window > 0 6 | delta_feat = np.zeros_like(feat) 7 | for i in range(1, window + 1): 8 | delta_feat[:-i] += i * feat[i:] 9 | delta_feat[i:] += -i * feat[:-i] 10 | delta_feat[-i:] += i * feat[-1] 11 | delta_feat[:i] += -i * feat[0] 12 | delta_feat /= 2 * sum(i**2 for i in range(1, window + 1)) 13 | return delta_feat 14 | 15 | 16 | def add_deltas(x, window=2, order=2): 17 | feats = [x] 18 | for _ in range(order): 19 | feats.append(delta(feats[-1], window)) 20 | return np.concatenate(feats, axis=1) 21 | 22 | 23 | class AddDeltas(object): 24 | def __init__(self, window=2, order=2): 25 | self.window = window 26 | self.order = order 27 | 28 | def __repr__(self): 29 | return "{name}(window={window}, order={order})".format( 30 | name=self.__class__.__name__, window=self.window, order=self.order 31 | ) 32 | 33 | def __call__(self, x): 34 | return add_deltas(x, window=self.window, order=self.order) 35 | -------------------------------------------------------------------------------- /espnet/transform/channel_selector.py: -------------------------------------------------------------------------------- 1 | import numpy 2 | 3 | 4 | class ChannelSelector(object): 5 | """Select 1ch from multi-channel signal""" 6 | 7 | def __init__(self, train_channel="random", eval_channel=0, axis=1): 8 | self.train_channel = train_channel 9 | self.eval_channel = eval_channel 10 | self.axis = axis 11 | 12 | def __repr__(self): 13 | return ( 14 | "{name}(train_channel={train_channel}, " 15 | "eval_channel={eval_channel}, axis={axis})".format( 16 | name=self.__class__.__name__, 17 | train_channel=self.train_channel, 18 | eval_channel=self.eval_channel, 19 | axis=self.axis, 20 | ) 21 | ) 22 | 23 | def __call__(self, x, train=True): 24 | # Assuming x: [Time, Channel] by default 25 | 26 | if x.ndim <= self.axis: 27 | # If the dimension is insufficient, then unsqueeze 28 | # (e.g [Time] -> [Time, 1]) 29 | ind = tuple( 30 | slice(None) if i < x.ndim else None for i in range(self.axis + 1) 31 | ) 32 | x = x[ind] 33 | 34 | if train: 35 | channel = self.train_channel 36 | else: 37 | channel = 
self.eval_channel 38 | 39 | if channel == "random": 40 | ch = numpy.random.randint(0, x.shape[self.axis]) 41 | else: 42 | ch = channel 43 | 44 | ind = tuple(slice(None) if i != self.axis else ch for i in range(x.ndim)) 45 | return x[ind] 46 | -------------------------------------------------------------------------------- /espnet/transform/functional.py: -------------------------------------------------------------------------------- 1 | import inspect 2 | 3 | from espnet.transform.transform_interface import TransformInterface 4 | from espnet.utils.check_kwargs import check_kwargs 5 | 6 | 7 | class FuncTrans(TransformInterface): 8 | """Functional Transformation 9 | 10 | WARNING: 11 | Builtin or C/C++ functions may not work properly 12 | because this class heavily depends on the `inspect` module. 13 | 14 | Usage: 15 | 16 | >>> def foo_bar(x, a=1, b=2): 17 | ... '''Foo bar 18 | ... :param x: input 19 | ... :param int a: default 1 20 | ... :param int b: default 2 21 | ... ''' 22 | ... return x + a - b 23 | 24 | 25 | >>> class FooBar(FuncTrans): 26 | ... _func = foo_bar 27 | ... __doc__ = foo_bar.__doc__ 28 | """ 29 | 30 | _func = None 31 | 32 | def __init__(self, **kwargs): 33 | self.kwargs = kwargs 34 | check_kwargs(self.func, kwargs) 35 | 36 | def __call__(self, x): 37 | return self.func(x, **self.kwargs) 38 | 39 | @classmethod 40 | def add_arguments(cls, parser): 41 | fname = cls._func.__name__.replace("_", "-") 42 | group = parser.add_argument_group(fname + " transformation setting") 43 | for k, v in cls.default_params().items(): 44 | # TODO(karita): get help and choices from docstring? 45 | attr = k.replace("_", "-") 46 | group.add_argument(f"--{fname}-{attr}", default=v, type=type(v)) 47 | return parser 48 | 49 | @property 50 | def func(self): 51 | return type(self)._func 52 | 53 | @classmethod 54 | def default_params(cls): 55 | try: 56 | d = dict(inspect.signature(cls._func).parameters) 57 | except ValueError: 58 | d = dict() 59 | return { 60 | k: v.default for k, v in d.items() if v.default != inspect.Parameter.empty 61 | } 62 | 63 | def __repr__(self): 64 | params = self.default_params() 65 | params.update(**self.kwargs) 66 | ret = self.__class__.__name__ + "(" 67 | if len(params) == 0: 68 | return ret + ")" 69 | for k, v in params.items(): 70 | ret += "{}={}, ".format(k, v) 71 | return ret[:-2] + ")" 72 | -------------------------------------------------------------------------------- /espnet/transform/transform_interface.py: -------------------------------------------------------------------------------- 1 | # TODO(karita): add this to all the transform impl. 
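# A transform here is just a callable mapping one array to another;
# espnet.transform.transformation.Transformation composes a configured list of
# such transforms and applies them in sequence.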
2 | class TransformInterface: 3 | """Transform Interface""" 4 | 5 | def __call__(self, x): 6 | raise NotImplementedError("__call__ method is not implemented") 7 | 8 | @classmethod 9 | def add_arguments(cls, parser): 10 | return parser 11 | 12 | def __repr__(self): 13 | return self.__class__.__name__ + "()" 14 | 15 | 16 | class Identity(TransformInterface): 17 | """Identity Function""" 18 | 19 | def __call__(self, x): 20 | return x 21 | -------------------------------------------------------------------------------- /espnet/transform/wpe.py: -------------------------------------------------------------------------------- 1 | class WPE(object): 2 | def __init__( 3 | self, taps=10, delay=3, iterations=3, psd_context=0, statistics_mode="full" 4 | ): 5 | self.taps = taps 6 | self.delay = delay 7 | self.iterations = iterations 8 | self.psd_context = psd_context 9 | self.statistics_mode = statistics_mode 10 | 11 | def __repr__(self): 12 | return ( 13 | "{name}(taps={taps}, delay={delay}, " 14 | "iterations={iterations}, psd_context={psd_context}, " 15 | "statistics_mode={statistics_mode})".format( 16 | name=self.__class__.__name__, 17 | taps=self.taps, 18 | delay=self.delay, 19 | iterations=self.iterations, 20 | psd_context=self.psd_context, 21 | statistics_mode=self.statistics_mode, 22 | ) 23 | ) 24 | 25 | def __call__(self, xs): 26 | """Return enhanced signal 27 | 28 | :param np.ndarray xs: (Time, Channel, Frequency) 29 | :return: enhanced_xs 30 | :rtype: np.ndarray 31 | 32 | """ 33 | from nara_wpe.wpe import wpe 34 | 35 | # nara_wpe.wpe: (F, C, T) 36 | xs = wpe( 37 | xs.transpose((2, 1, 0)), 38 | taps=self.taps, 39 | delay=self.delay, 40 | iterations=self.iterations, 41 | psd_context=self.psd_context, 42 | statistics_mode=self.statistics_mode, 43 | ) 44 | return xs.transpose(2, 1, 0) 45 | -------------------------------------------------------------------------------- /espnet/tts/__init__.py: -------------------------------------------------------------------------------- 1 | """Initialize sub package.""" 2 | -------------------------------------------------------------------------------- /espnet/tts/pytorch_backend/__init__.py: -------------------------------------------------------------------------------- 1 | """Initialize sub package.""" 2 | -------------------------------------------------------------------------------- /espnet/utils/__init__.py: -------------------------------------------------------------------------------- 1 | """Initialize sub package.""" 2 | -------------------------------------------------------------------------------- /espnet/utils/check_kwargs.py: -------------------------------------------------------------------------------- 1 | import inspect 2 | 3 | 4 | def check_kwargs(func, kwargs, name=None): 5 | """Check that kwargs are valid for func 6 | 7 | If kwargs are invalid, raise TypeError, just as Python does by default 8 | :param function func: function to be validated 9 | :param dict kwargs: keyword arguments for func 10 | :param str name: name used in TypeError (default is func name) 11 | """ 12 | try: 13 | params = inspect.signature(func).parameters 14 | except ValueError: 15 | return 16 | if name is None: 17 | name = func.__name__ 18 | for k in kwargs.keys(): 19 | if k not in params: 20 | raise TypeError(f"{name}() got an unexpected keyword argument '{k}'") 21 | -------------------------------------------------------------------------------- /espnet/utils/cli_utils.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from 
collections.abc import Sequence 3 | from distutils.util import strtobool as dist_strtobool 4 | 5 | import numpy 6 | 7 | 8 | def strtobool(x): 9 | # distutils.util.strtobool returns an integer, which is confusing, so cast it to bool 10 | return bool(dist_strtobool(x)) 11 | 12 | 13 | def get_commandline_args(): 14 | extra_chars = [ 15 | " ", 16 | ";", 17 | "&", 18 | "(", 19 | ")", 20 | "|", 21 | "^", 22 | "<", 23 | ">", 24 | "?", 25 | "*", 26 | "[", 27 | "]", 28 | "$", 29 | "`", 30 | '"', 31 | "\\", 32 | "!", 33 | "{", 34 | "}", 35 | ] 36 | 37 | # Escape the extra characters for shell 38 | argv = [ 39 | arg.replace("'", "'\\''") 40 | if all(char not in arg for char in extra_chars) 41 | else "'" + arg.replace("'", "'\\''") + "'" 42 | for arg in sys.argv 43 | ] 44 | 45 | return sys.executable + " " + " ".join(argv) 46 | 47 | 48 | def is_scipy_wav_style(value): 49 | # Check whether value is a Tuple[int, numpy.ndarray] 50 | return ( 51 | isinstance(value, Sequence) 52 | and len(value) == 2 53 | and isinstance(value[0], int) 54 | and isinstance(value[1], numpy.ndarray) 55 | ) 56 | 57 | 58 | def assert_scipy_wav_style(value): 59 | assert is_scipy_wav_style( 60 | value 61 | ), "Must be Tuple[int, numpy.ndarray], but got {}".format( 62 | type(value) 63 | if not isinstance(value, Sequence) 64 | else "{}[{}]".format(type(value), ", ".join(str(type(v)) for v in value)) 65 | ) 66 | -------------------------------------------------------------------------------- /espnet/utils/deterministic_utils.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | 4 | import chainer 5 | import torch 6 | 7 | 8 | def set_deterministic_pytorch(args): 9 | """Ensures pytorch produces deterministic results depending on the program arguments 10 | 11 | :param Namespace args: The program arguments 12 | """ 13 | # seed setting 14 | torch.manual_seed(args.seed) 15 | 16 | # debug mode setting 17 | # 0 would be fastest, but 1 seems to be reasonable 18 | # considering reproducibility 19 | # remove type check 20 | torch.backends.cudnn.deterministic = True 21 | torch.backends.cudnn.benchmark = ( 22 | False # https://github.com/pytorch/pytorch/issues/6351 23 | ) 24 | if args.debugmode < 2: 25 | chainer.config.type_check = False 26 | logging.info("torch type check is disabled") 27 | # use deterministic computation or not 28 | if args.debugmode < 1: 29 | torch.backends.cudnn.deterministic = False 30 | torch.backends.cudnn.benchmark = True 31 | logging.info("torch cudnn deterministic is disabled") 32 | 33 | 34 | def set_deterministic_chainer(args): 35 | """Ensures chainer produces deterministic results depending on the program arguments 36 | 37 | :param Namespace args: The program arguments 38 | """ 39 | # seed setting (chainer seed may not need it) 40 | os.environ["CHAINER_SEED"] = str(args.seed) 41 | logging.info("chainer seed = " + os.environ["CHAINER_SEED"]) 42 | 43 | # debug mode setting 44 | # 0 would be fastest, but 1 seems to be reasonable 45 | # considering reproducibility 46 | # remove type check 47 | if args.debugmode < 2: 48 | chainer.config.type_check = False 49 | logging.info("chainer type check is disabled") 50 | # use deterministic computation or not 51 | if args.debugmode < 1: 52 | chainer.config.cudnn_deterministic = False 53 | logging.info("chainer cudnn deterministic is disabled") 54 | else: 55 | chainer.config.cudnn_deterministic = True 56 | -------------------------------------------------------------------------------- /espnet/utils/dynamic_import.py: 
-------------------------------------------------------------------------------- 1 | import importlib 2 | 3 | 4 | def dynamic_import(import_path, alias=dict()): 5 | """Dynamically import a module and class 6 | 7 | :param str import_path: syntax 'module_name:class_name' 8 | e.g., 'espnet.transform.add_deltas:AddDeltas' 9 | :param dict alias: shortcut for registered class 10 | :return: imported class 11 | """ 12 | if import_path not in alias and ":" not in import_path: 13 | raise ValueError( 14 | "import_path should be one of {} or " 15 | 'include ":", e.g. "espnet.transform.add_deltas:AddDeltas" : ' 16 | "{}".format(set(alias), import_path) 17 | ) 18 | if ":" not in import_path: 19 | import_path = alias[import_path] 20 | 21 | module_name, objname = import_path.split(":") 22 | m = importlib.import_module(module_name) 23 | return getattr(m, objname) 24 | -------------------------------------------------------------------------------- /espnet/utils/fill_missing_args.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Copyright 2018 Nagoya University (Tomoki Hayashi) 4 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 5 | 6 | import argparse 7 | import logging 8 | 9 | 10 | def fill_missing_args(args, add_arguments): 11 | """Fill missing arguments in args. 12 | 13 | Args: 14 | args (Namespace or None): Namespace containing hyperparameters. 15 | add_arguments (function): Function to add arguments. 16 | 17 | Returns: 18 | Namespace: Arguments with missing values filled in with defaults. 19 | 20 | Examples: 21 | >>> from argparse import Namespace 22 | >>> from espnet.nets.pytorch_backend.e2e_tts_tacotron2 import Tacotron2 23 | >>> args = Namespace() 24 | >>> fill_missing_args(args, Tacotron2.add_arguments_fn) 25 | Namespace(aconv_chans=32, aconv_filts=15, adim=512, atype='location', ...) 26 | 27 | """ 28 | # check argument type 29 | assert isinstance(args, argparse.Namespace) or args is None 30 | assert callable(add_arguments) 31 | 32 | # get default arguments 33 | default_args, _ = add_arguments(argparse.ArgumentParser()).parse_known_args() 34 | 35 | # convert to dict 36 | args = {} if args is None else vars(args) 37 | default_args = vars(default_args) 38 | 39 | for key, value in default_args.items(): 40 | if key not in args: 41 | logging.info( 42 | 'attribute "%s" does not exist. use default %s.' 
% (key, str(value)) 43 | ) 44 | args[key] = value 45 | 46 | return argparse.Namespace(**args) 47 | -------------------------------------------------------------------------------- /espnet/utils/training/__init__.py: -------------------------------------------------------------------------------- 1 | """Initialize sub package.""" 2 | -------------------------------------------------------------------------------- /espnet/utils/training/evaluator.py: -------------------------------------------------------------------------------- 1 | from chainer.training.extensions import Evaluator 2 | 3 | from espnet.utils.training.tensorboard_logger import TensorboardLogger 4 | 5 | 6 | class BaseEvaluator(Evaluator): 7 | """Base Evaluator in ESPnet""" 8 | 9 | def __call__(self, trainer=None): 10 | ret = super().__call__(trainer) 11 | try: 12 | if trainer is not None: 13 | # force tensorboard to report evaluation log 14 | tb_logger = trainer.get_extension(TensorboardLogger.default_name) 15 | tb_logger(trainer) 16 | except ValueError: 17 | pass 18 | return ret 19 | -------------------------------------------------------------------------------- /espnet/utils/training/tensorboard_logger.py: -------------------------------------------------------------------------------- 1 | from chainer.training.extension import Extension 2 | 3 | 4 | class TensorboardLogger(Extension): 5 | """A tensorboard logger extension""" 6 | 7 | default_name = "espnet_tensorboard_logger" 8 | 9 | def __init__( 10 | self, logger, att_reporter=None, ctc_reporter=None, entries=None, epoch=0 11 | ): 12 | """Init the extension 13 | 14 | :param SummaryWriter logger: The logger to use 15 | :param PlotAttentionReporter att_reporter: The (optional) PlotAttentionReporter 16 | :param entries: The entries to watch 17 | :param int epoch: The starting epoch 18 | """ 19 | self._entries = entries 20 | self._att_reporter = att_reporter 21 | self._ctc_reporter = ctc_reporter 22 | self._logger = logger 23 | self._epoch = epoch 24 | 25 | def __call__(self, trainer): 26 | """Updates the events file with the new values 27 | 28 | :param trainer: The trainer 29 | """ 30 | observation = trainer.observation 31 | for k, v in observation.items(): 32 | if (self._entries is not None) and (k not in self._entries): 33 | continue 34 | if k is not None and v is not None: 35 | if "cupy" in str(type(v)): 36 | v = v.get() 37 | if "cupy" in str(type(k)): 38 | k = k.get() 39 | self._logger.add_scalar(k, v, trainer.updater.iteration) 40 | if ( 41 | self._att_reporter is not None 42 | and trainer.updater.get_iterator("main").epoch > self._epoch 43 | ): 44 | self._epoch = trainer.updater.get_iterator("main").epoch 45 | self._att_reporter.log_attentions(self._logger, trainer.updater.iteration) 46 | if ( 47 | self._ctc_reporter is not None 48 | and trainer.updater.get_iterator("main").epoch > self._epoch 49 | ): 50 | self._epoch = trainer.updater.get_iterator("main").epoch 51 | self._ctc_reporter.log_ctc_probs(self._logger, trainer.updater.iteration) 52 | -------------------------------------------------------------------------------- /espnet/utils/training/train_utils.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | import chainer 4 | 5 | 6 | def check_early_stop(trainer, epochs): 7 | """Checks an early stopping trigger and warns the user if it's the case 8 | 9 | :param trainer: The trainer used for training 10 | :param epochs: The maximum number of epochs 11 | """ 12 | end_epoch = 
trainer.updater.get_iterator("main").epoch 13 | if end_epoch < (epochs - 1): 14 | logging.warning( 15 | "Hit early stop at epoch " 16 | + str(end_epoch) 17 | + "\nYou can change the patience or set it to 0 to run all epochs" 18 | ) 19 | 20 | 21 | def set_early_stop(trainer, args, is_lm=False): 22 | """Sets the early stop trigger given the program arguments 23 | 24 | :param trainer: The trainer used for training 25 | :param args: The program arguments 26 | :param is_lm: If the trainer is for a LM (epoch instead of epochs) 27 | """ 28 | patience = args.patience 29 | criterion = args.early_stop_criterion 30 | epochs = args.epoch if is_lm else args.epochs 31 | mode = "max" if "acc" in criterion else "min" 32 | if patience > 0: 33 | trainer.stop_trigger = chainer.training.triggers.EarlyStoppingTrigger( 34 | monitor=criterion, 35 | mode=mode, 36 | patients=patience, 37 | max_trigger=(epochs, "epoch"), 38 | ) 39 | -------------------------------------------------------------------------------- /espnet/version.txt: -------------------------------------------------------------------------------- 1 | 202308 2 | -------------------------------------------------------------------------------- /espnet2/__init__.py: -------------------------------------------------------------------------------- 1 | """Initialize espnet2 package.""" 2 | 3 | # from espnet import __version__ # NOQA 4 | -------------------------------------------------------------------------------- /espnet2/asr/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/espnet2/asr/__init__.py -------------------------------------------------------------------------------- /espnet2/asr/decoder/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/espnet2/asr/decoder/__init__.py -------------------------------------------------------------------------------- /espnet2/asr/decoder/abs_decoder.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import Tuple 3 | 4 | import torch 5 | 6 | from espnet.nets.scorer_interface import ScorerInterface 7 | 8 | 9 | class AbsDecoder(torch.nn.Module, ScorerInterface, ABC): 10 | @abstractmethod 11 | def forward( 12 | self, 13 | hs_pad: torch.Tensor, 14 | hlens: torch.Tensor, 15 | ys_in_pad: torch.Tensor, 16 | ys_in_lens: torch.Tensor, 17 | ) -> Tuple[torch.Tensor, torch.Tensor]: 18 | raise NotImplementedError 19 | -------------------------------------------------------------------------------- /espnet2/asr/encoder/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/espnet2/asr/encoder/__init__.py -------------------------------------------------------------------------------- /espnet2/asr/encoder/abs_encoder.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import Optional, Tuple 3 | 4 | import torch 5 | 6 | 7 | class AbsEncoder(torch.nn.Module, ABC): 8 | @abstractmethod 9 | def output_size(self) -> int: 10 | raise NotImplementedError 11 | 12 | @abstractmethod 13 | def forward( 14 | self, 15 | xs_pad: 
torch.Tensor, 16 | ilens: torch.Tensor, 17 | prev_states: torch.Tensor = None, 18 | ) -> Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]: 19 | raise NotImplementedError 20 | -------------------------------------------------------------------------------- /espnet2/asr/frontend/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/espnet2/asr/frontend/__init__.py -------------------------------------------------------------------------------- /espnet2/asr/frontend/abs_frontend.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import Tuple 3 | 4 | import torch 5 | 6 | 7 | class AbsFrontend(torch.nn.Module, ABC): 8 | @abstractmethod 9 | def output_size(self) -> int: 10 | raise NotImplementedError 11 | 12 | @abstractmethod 13 | def forward( 14 | self, input: torch.Tensor, input_lengths: torch.Tensor 15 | ) -> Tuple[torch.Tensor, torch.Tensor]: 16 | raise NotImplementedError 17 | -------------------------------------------------------------------------------- /espnet2/asr/layers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/espnet2/asr/layers/__init__.py -------------------------------------------------------------------------------- /espnet2/asr/postencoder/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/espnet2/asr/postencoder/__init__.py -------------------------------------------------------------------------------- /espnet2/asr/postencoder/abs_postencoder.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import Tuple 3 | 4 | import torch 5 | 6 | 7 | class AbsPostEncoder(torch.nn.Module, ABC): 8 | @abstractmethod 9 | def output_size(self) -> int: 10 | raise NotImplementedError 11 | 12 | @abstractmethod 13 | def forward( 14 | self, input: torch.Tensor, input_lengths: torch.Tensor 15 | ) -> Tuple[torch.Tensor, torch.Tensor]: 16 | raise NotImplementedError 17 | -------------------------------------------------------------------------------- /espnet2/asr/preencoder/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/espnet2/asr/preencoder/__init__.py -------------------------------------------------------------------------------- /espnet2/asr/preencoder/abs_preencoder.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import Tuple 3 | 4 | import torch 5 | 6 | 7 | class AbsPreEncoder(torch.nn.Module, ABC): 8 | @abstractmethod 9 | def output_size(self) -> int: 10 | raise NotImplementedError 11 | 12 | @abstractmethod 13 | def forward( 14 | self, input: torch.Tensor, input_lengths: torch.Tensor 15 | ) -> Tuple[torch.Tensor, torch.Tensor]: 16 | raise NotImplementedError 17 | -------------------------------------------------------------------------------- /espnet2/asr/preencoder/linear.py: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # 2021, Carnegie Mellon University; Xuankai Chang 3 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 4 | 5 | """Linear Projection.""" 6 | 7 | from typing import Tuple 8 | 9 | import torch 10 | from typeguard import check_argument_types 11 | 12 | from espnet2.asr.preencoder.abs_preencoder import AbsPreEncoder 13 | 14 | 15 | class LinearProjection(AbsPreEncoder): 16 | """Linear Projection Preencoder.""" 17 | 18 | def __init__(self, input_size: int, output_size: int, dropout: float = 0.0): 19 | """Initialize the module.""" 20 | assert check_argument_types() 21 | super().__init__() 22 | 23 | self.output_dim = output_size 24 | self.linear_out = torch.nn.Linear(input_size, output_size) 25 | self.dropout = torch.nn.Dropout(dropout) 26 | 27 | def forward( 28 | self, input: torch.Tensor, input_lengths: torch.Tensor 29 | ) -> Tuple[torch.Tensor, torch.Tensor]: 30 | """Forward.""" 31 | output = self.linear_out(self.dropout(input)) 32 | return output, input_lengths # no state in this layer 33 | 34 | def output_size(self) -> int: 35 | """Get the output size.""" 36 | return self.output_dim 37 | -------------------------------------------------------------------------------- /espnet2/asr/specaug/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/espnet2/asr/specaug/__init__.py -------------------------------------------------------------------------------- /espnet2/asr/specaug/abs_specaug.py: -------------------------------------------------------------------------------- 1 | from typing import Optional, Tuple 2 | 3 | import torch 4 | 5 | 6 | class AbsSpecAug(torch.nn.Module): 7 | """Abstract class for the augmentation of spectrogram 8 | 9 | The process-flow: 10 | 11 | Frontend -> SpecAug -> Normalization -> Encoder -> Decoder 12 | """ 13 | 14 | def forward( 15 | self, x: torch.Tensor, x_lengths: torch.Tensor = None 16 | ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: 17 | raise NotImplementedError 18 | -------------------------------------------------------------------------------- /espnet2/asr/state_spaces/__init__.py: -------------------------------------------------------------------------------- 1 | """Initialize sub package.""" 2 | -------------------------------------------------------------------------------- /espnet2/asr/state_spaces/ff.py: -------------------------------------------------------------------------------- 1 | # This code is derived from https://github.com/HazyResearch/state-spaces 2 | 3 | """Implementation of FFN block in the style of Transformers.""" 4 | 5 | from functools import partial 6 | 7 | from torch import nn 8 | 9 | from espnet2.asr.state_spaces.base import SequenceModule 10 | from espnet2.asr.state_spaces.components import DropoutNd, LinearActivation 11 | 12 | 13 | class FF(SequenceModule): 14 | def __init__( 15 | self, 16 | d_input, 17 | expand=2, 18 | d_output=None, 19 | transposed=False, 20 | activation="gelu", 21 | initializer=None, 22 | dropout=0.0, 23 | tie_dropout=False, 24 | ): 25 | super().__init__() 26 | self.d_output = d_input if d_output is None else d_output 27 | self.transposed = transposed 28 | d_inner = expand * d_input 29 | 30 | linear1 = LinearActivation( 31 | d_input, 32 | d_inner, 33 | transposed=transposed, 34 | activation=activation, 35 | initializer=initializer, 36 | activate=True, 
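            # first projection widens d_input to d_inner (= expand * d_input);
            # the activation is applied inside LinearActivation (activate=True)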
37 | ) 38 | dropout_cls = ( 39 | partial(DropoutNd, transposed=self.transposed) 40 | if tie_dropout 41 | else nn.Dropout 42 | ) 43 | # dropout_cls = nn.Dropout2d if self.transposed else nn.Dropout 44 | drop = dropout_cls(dropout) if dropout > 0.0 else nn.Identity() 45 | 46 | linear2 = LinearActivation( 47 | d_inner, 48 | self.d_output, 49 | transposed=transposed, 50 | activation=None, 51 | initializer=initializer, 52 | activate=False, 53 | ) 54 | 55 | self.ff = nn.Sequential( 56 | linear1, 57 | drop, 58 | linear2, 59 | ) 60 | 61 | def forward(self, x, *args, **kwargs): 62 | return self.ff(x), None 63 | 64 | def step(self, x, state, **kwargs): 65 | # x: [batch, d_input] 66 | if self.transposed: 67 | # expects: [batch, d_input, seq_len] 68 | return self.ff(x.unsqueeze(-1)).squeeze(-1), state 69 | else: 70 | return self.ff(x), state 71 | -------------------------------------------------------------------------------- /espnet2/asr/state_spaces/registry.py: -------------------------------------------------------------------------------- 1 | layer = { 2 | "s4": "espnet2.asr.state_spaces.s4.S4", 3 | "ff": "espnet2.asr.state_spaces.ff.FF", 4 | "mha": "espnet2.asr.state_spaces.attention.MultiHeadedAttention", 5 | } 6 | -------------------------------------------------------------------------------- /espnet2/asr/transducer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/espnet2/asr/transducer/__init__.py -------------------------------------------------------------------------------- /espnet2/asr/transducer/rnnt_multi_blank/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from espnet2.asr.transducer.rnnt_multi_blank.rnnt_multi_blank import ( 16 | MultiblankRNNTLossNumba, 17 | ) 18 | 19 | __all__ = ["MultiblankRNNTLossNumba"] 20 | -------------------------------------------------------------------------------- /espnet2/asr/transducer/rnnt_multi_blank/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | -------------------------------------------------------------------------------- /espnet2/asr/transducer/rnnt_multi_blank/utils/cpu_utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # 15 | # Copyright 2018-2019, Mingkun Huang 16 | # 17 | # Licensed under the Apache License, Version 2.0 (the "License"); 18 | # you may not use this file except in compliance with the License. 19 | # You may obtain a copy of the License at 20 | # 21 | # http://www.apache.org/licenses/LICENSE-2.0 22 | # 23 | # Unless required by applicable law or agreed to in writing, software 24 | # distributed under the License is distributed on an "AS IS" BASIS, 25 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 26 | # See the License for the specific language governing permissions and 27 | # limitations under the License. 28 | -------------------------------------------------------------------------------- /espnet2/asr/transducer/rnnt_multi_blank/utils/cuda_utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # 15 | # Copyright 2018-2019, Mingkun Huang 16 | # 17 | # Licensed under the Apache License, Version 2.0 (the "License"); 18 | # you may not use this file except in compliance with the License. 19 | # You may obtain a copy of the License at 20 | # 21 | # http://www.apache.org/licenses/LICENSE-2.0 22 | # 23 | # Unless required by applicable law or agreed to in writing, software 24 | # distributed under the License is distributed on an "AS IS" BASIS, 25 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 26 | # See the License for the specific language governing permissions and 27 | # limitations under the License. 28 | -------------------------------------------------------------------------------- /espnet2/asr/transducer/rnnt_multi_blank/utils/global_constants.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # 15 | # Copyright 2018-2019, Mingkun Huang 16 | # 17 | # Licensed under the Apache License, Version 2.0 (the "License"); 18 | # you may not use this file except in compliance with the License. 19 | # You may obtain a copy of the License at 20 | # 21 | # http://www.apache.org/licenses/LICENSE-2.0 22 | # 23 | # Unless required by applicable law or agreed to in writing, software 24 | # distributed under the License is distributed on an "AS IS" BASIS, 25 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 26 | # See the License for the specific language governing permissions and 27 | # limitations under the License. 28 | 29 | 30 | import enum 31 | 32 | import numpy as np 33 | from numba import float32 34 | 35 | # Internal globals 36 | _THREADS_PER_BLOCK = 32 37 | _WARP_SIZE = 32 38 | _DTYPE = float32 39 | 40 | # Constants 41 | FP32_INF = np.inf 42 | FP32_NEG_INF = -np.inf 43 | THRESHOLD = 1e-1 44 | 45 | """ 46 | Getters 47 | """ 48 | 49 | 50 | def threads_per_block(): 51 | global _THREADS_PER_BLOCK 52 | return _THREADS_PER_BLOCK 53 | 54 | 55 | def warp_size(): 56 | global _WARP_SIZE 57 | return _WARP_SIZE 58 | 59 | 60 | def dtype(): 61 | global _DTYPE 62 | return _DTYPE 63 | 64 | 65 | # RNNT STATUS 66 | class RNNTStatus(enum.Enum): 67 | RNNT_STATUS_SUCCESS = 0 68 | RNNT_STATUS_INVALID_VALUE = 1 69 | -------------------------------------------------------------------------------- /espnet2/asr_transducer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/espnet2/asr_transducer/__init__.py -------------------------------------------------------------------------------- /espnet2/asr_transducer/decoder/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/espnet2/asr_transducer/decoder/__init__.py -------------------------------------------------------------------------------- /espnet2/asr_transducer/decoder/blocks/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/espnet2/asr_transducer/decoder/blocks/__init__.py -------------------------------------------------------------------------------- /espnet2/asr_transducer/decoder/modules/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/espnet2/asr_transducer/decoder/modules/__init__.py -------------------------------------------------------------------------------- /espnet2/asr_transducer/decoder/modules/mega/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/espnet2/asr_transducer/decoder/modules/mega/__init__.py -------------------------------------------------------------------------------- /espnet2/asr_transducer/decoder/modules/rwkv/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/espnet2/asr_transducer/decoder/modules/rwkv/__init__.py -------------------------------------------------------------------------------- /espnet2/asr_transducer/decoder/modules/rwkv/cuda/wkv_op.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | Based on https://github.com/BlinkDL/RWKV-LM/blob/main/RWKV-v4/cuda/wkv_op.cpp 3 | Function signatures were modified based on https://github.com/huggingface/transformers/blob/main/src/transformers/kernels/rwkv/wkv_op.cpp 4 | 5 | */ 6 | 7 | #include <torch/extension.h> 8 | 9 | void cuda_forward(int B, int T, int C, float *w, float *u, float *k, float *v, float *y); 10 | 11 | void cuda_backward(int B, int T, int C, float *w, float *u, float *k, float *v, float *y, float *gy, float *gw, float *gu, float *gk, float *gv); 12 | 13 | void forward(torch::Tensor &w, torch::Tensor &u, torch::Tensor &k, torch::Tensor &v, torch::Tensor &y) { 14 | const int B = k.size(0); 15 | const int T = k.size(1); 16 | const int C = k.size(2); 17 | 18 | cuda_forward(B, T, C, w.data_ptr<float>(), u.data_ptr<float>(), k.data_ptr<float>(), v.data_ptr<float>(), y.data_ptr<float>()); 19 | } 20 | 21 | void backward(torch::Tensor &w, torch::Tensor &u, torch::Tensor &k, torch::Tensor &v, torch::Tensor &y, torch::Tensor &gy, torch::Tensor &gw, torch::Tensor &gu, torch::Tensor &gk, torch::Tensor &gv) { 22 | const int B = k.size(0); 23 | const int T = k.size(1); 24 | const int C = k.size(2); 25 | 26 | cuda_backward(B, T, C, w.data_ptr<float>(), u.data_ptr<float>(), k.data_ptr<float>(), v.data_ptr<float>(), y.data_ptr<float>(), gy.data_ptr<float>(), gw.data_ptr<float>(), gu.data_ptr<float>(), gk.data_ptr<float>(), gv.data_ptr<float>()); 27 | } 28 | 29 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 30 | m.def("forward", &forward, "wkv forward"); 31 | m.def("backward", &backward, "wkv backward"); 32 | } 33 | 34 | TORCH_LIBRARY(wkv, m) { 35 | m.def("forward", forward); 36 | m.def("backward", backward); 37 | } 38 | -------------------------------------------------------------------------------- /espnet2/asr_transducer/encoder/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/espnet2/asr_transducer/encoder/__init__.py -------------------------------------------------------------------------------- /espnet2/asr_transducer/encoder/blocks/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/espnet2/asr_transducer/encoder/blocks/__init__.py -------------------------------------------------------------------------------- /espnet2/asr_transducer/encoder/modules/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/espnet2/asr_transducer/encoder/modules/__init__.py -------------------------------------------------------------------------------- 
/espnet2/asr_transducer/frontend/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/espnet2/asr_transducer/frontend/__init__.py -------------------------------------------------------------------------------- /espnet2/asvspoof/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/espnet2/asvspoof/__init__.py -------------------------------------------------------------------------------- /espnet2/asvspoof/decoder/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/espnet2/asvspoof/decoder/__init__.py -------------------------------------------------------------------------------- /espnet2/asvspoof/decoder/abs_decoder.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import Tuple 3 | 4 | import torch 5 | 6 | 7 | class AbsDecoder(torch.nn.Module, ABC): 8 | @abstractmethod 9 | def forward( 10 | self, 11 | input: torch.Tensor, 12 | ilens: torch.Tensor, 13 | ) -> Tuple[torch.Tensor, torch.Tensor]: 14 | raise NotImplementedError 15 | -------------------------------------------------------------------------------- /espnet2/asvspoof/decoder/linear_decoder.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | import torch 4 | 5 | from espnet2.asvspoof.decoder.abs_decoder import AbsDecoder 6 | 7 | 8 | class LinearDecoder(AbsDecoder): 9 | """Linear decoder for ASV spoofing detection""" 10 | 11 | def __init__( 12 | self, 13 | encoder_output_size: int, 14 | ): 15 | super().__init__() 16 | # TODO1 (checkpoint3): initialize a linear projection layer 17 | 18 | def forward(self, input: torch.Tensor, ilens: Optional[torch.Tensor]): 19 | """Forward.
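
        The TODO steps below sketch the intended flow: mean-pool the hidden
        states over time, apply the linear projection, and return the result.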
20 | Args: 21 | input (torch.Tensor): hidden_space [Batch, T, F] 22 | ilens (torch.Tensor): input lengths [Batch] 23 | """ 24 | # TODO2 (checkpoint3): compute mean over time-domain (dimension 1) 25 | 26 | # TODO3 (checkpoint3): apply the projection layer 27 | 28 | # TODO4 (checkpoint3): change the return value 29 | return None 30 | -------------------------------------------------------------------------------- /espnet2/asvspoof/loss/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/espnet2/asvspoof/loss/__init__.py -------------------------------------------------------------------------------- /espnet2/asvspoof/loss/abs_loss.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | 3 | import torch 4 | 5 | EPS = torch.finfo(torch.get_default_dtype()).eps 6 | 7 | 8 | class AbsASVSpoofLoss(torch.nn.Module, ABC): 9 | """Base class for all ASV Spoofing loss modules.""" 10 | 11 | # the name will be the key that appears in the reporter 12 | @property 13 | def name(self) -> str: 14 | raise NotImplementedError 15 | 16 | @abstractmethod 17 | def forward( 18 | self, 19 | ref, 20 | inf, 21 | ) -> torch.Tensor: 22 | # the return tensor should have shape (batch,) 23 | raise NotImplementedError 24 | 25 | @abstractmethod 26 | def score( 27 | self, 28 | pred, 29 | ) -> torch.Tensor: 30 | raise NotImplementedError 31 | -------------------------------------------------------------------------------- /espnet2/asvspoof/loss/binary_loss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from espnet2.asvspoof.loss.abs_loss import AbsASVSpoofLoss 4 | from espnet.nets.pytorch_backend.nets_utils import to_device 5 | 6 | 7 | class ASVSpoofBinaryLoss(AbsASVSpoofLoss): 8 | """Binary loss for ASV Spoofing.""" 9 | 10 | def __init__( 11 | self, 12 | weight: float = 1.0, 13 | ): 14 | super().__init__() 15 | self.weight = weight 16 | self.sigmoid = torch.nn.Sigmoid() 17 | self.loss = torch.nn.BCELoss(reduction="mean") 18 | 19 | def forward(self, pred: torch.Tensor, label: torch.Tensor, **kwargs): 20 | """Forward.
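
        Example (illustrative shapes, matching the Args below):

        >>> loss_fn = ASVSpoofBinaryLoss()
        >>> pred = torch.randn(8, 2)             # raw model outputs
        >>> label = torch.randint(0, 2, (8, 2))  # 0/1 ground truth
        >>> loss = loss_fn(pred, label)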
21 | Args: 22 | pred (torch.Tensor): prediction probability [Batch, 2] 23 | label (torch.Tensor): ground truth label [Batch, 2] 24 | """ 25 | loss = self.loss(self.sigmoid(pred.view(-1)), label.view(-1).float()) 26 | return loss 27 | 28 | def score(self, pred: torch.Tensor): 29 | return pred 30 | -------------------------------------------------------------------------------- /espnet2/asvspoof/loss/oc_softmax_loss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from espnet2.asvspoof.loss.abs_loss import AbsASVSpoofLoss 4 | from espnet.nets.pytorch_backend.nets_utils import to_device 5 | 6 | 7 | class ASVSpoofOCSoftmaxLoss(AbsASVSpoofLoss): 8 | """One-class softmax loss for ASV Spoofing.""" 9 | 10 | def __init__( 11 | self, 12 | weight: float = 1.0, 13 | enc_dim: int = 128, 14 | m_real: float = 0.5, 15 | m_fake: float = 0.2, 16 | alpha: float = 20.0, 17 | ): 18 | super().__init__() 19 | self.weight = weight 20 | self.feat_dim = enc_dim 21 | self.m_real = m_real 22 | self.m_fake = m_fake 23 | self.alpha = alpha 24 | self.center = torch.nn.Parameter(torch.randn(1, self.feat_dim)) 25 | torch.nn.init.kaiming_uniform_(self.center, 0.25) 26 | self.softplus = torch.nn.Softplus() 27 | 28 | def forward(self, label: torch.Tensor, emb: torch.Tensor, **kwargs): 29 | """Forward. 30 | Args: 31 | label (torch.Tensor): ground truth label [Batch, 1] 32 | emb (torch.Tensor): encoder embedding output [Batch, T, enc_dim] 33 | """ 34 | emb = torch.mean(emb, dim=1) 35 | w = torch.nn.functional.normalize(self.center, p=2, dim=1) 36 | x = torch.nn.functional.normalize(emb, p=2, dim=1) 37 | 38 | # TODO1 (exercise 2): compute scores based on w and x 39 | 40 | # TODO2 (exercise 2): calculate the score bias based on m_real and m_fake 41 | 42 | # TODO3 (exercise 2): apply alpha and softplus 43 | 44 | # TODO4 (exercise 2): return the final loss 45 | return None 46 | 47 | def score(self, emb: torch.Tensor): 48 | """Prediction. 49 | Args: 50 | emb (torch.Tensor): encoder embedding output [Batch, T, enc_dim] 51 | """ 52 | emb = torch.mean(emb, dim=1) 53 | w = torch.nn.functional.normalize(self.center, p=2, dim=1) 54 | x = torch.nn.functional.normalize(emb, p=2, dim=1) 55 | 56 | # TODO5 (exercise 2): compute scores 57 | -------------------------------------------------------------------------------- /espnet2/bin/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/espnet2/bin/__init__.py -------------------------------------------------------------------------------- /espnet2/bin/asr_train.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | from espnet2.tasks.asr import ASRTask 3 | 4 | 5 | def get_parser(): 6 | parser = ASRTask.get_parser() 7 | return parser 8 | 9 | 10 | def main(cmd=None): 11 | r"""ASR training.
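
    Thin command-line entry point: every option is defined by ``ASRTask``'s
    argument parser, so ``--print_config`` dumps the full configuration.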
12 | 13 | Example: 14 | 15 | % python asr_train.py asr --print_config --optim adadelta \ 16 | > conf/train_asr.yaml 17 | % python asr_train.py --config conf/train_asr.yaml 18 | """ 19 | ASRTask.main(cmd=cmd) 20 | 21 | 22 | if __name__ == "__main__": 23 | main() 24 | -------------------------------------------------------------------------------- /espnet2/bin/asr_transducer_train.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | from espnet2.tasks.asr_transducer import ASRTransducerTask 4 | 5 | 6 | def get_parser(): 7 | """Get parser for ASR Transducer task.""" 8 | parser = ASRTransducerTask.get_parser() 9 | return parser 10 | 11 | 12 | def main(cmd=None): 13 | r"""ASR Transducer training. 14 | 15 | Example: 16 | 17 | % python asr_transducer_train.py asr --print_config \ 18 | --optim adadelta > conf/train_asr.yaml 19 | % python asr_transducer_train.py \ 20 | --config conf/tuning/transducer/train_rnn_transducer.yaml 21 | """ 22 | ASRTransducerTask.main(cmd=cmd) 23 | 24 | 25 | if __name__ == "__main__": 26 | main() 27 | -------------------------------------------------------------------------------- /espnet2/bin/asvspoof_train.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | from espnet2.tasks.asvspoof import ASVSpoofTask 3 | 4 | 5 | def get_parser(): 6 | parser = ASVSpoofTask.get_parser() 7 | return parser 8 | 9 | 10 | def main(cmd=None): 11 | r"""ASVSpoof training. 12 | Example: 13 | % python asvspoof_train.py asr --print_config --optim adadelta \ 14 | > conf/train_asvspoof.yaml 15 | % python asvspoof_train.py --config conf/train_asvspoof.yaml 16 | """ 17 | ASVSpoofTask.main(cmd=cmd) 18 | 19 | 20 | if __name__ == "__main__": 21 | main() 22 | -------------------------------------------------------------------------------- /espnet2/bin/diar_train.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | from espnet2.tasks.diar import DiarizationTask 4 | 5 | 6 | def get_parser(): 7 | parser = DiarizationTask.get_parser() 8 | return parser 9 | 10 | 11 | def main(cmd=None): 12 | r"""Speaker diarization training. 13 | 14 | Example: 15 | % python diar_train.py diar --print_config --optim adadelta \ 16 | > conf/train_diar.yaml 17 | % python diar_train.py --config conf/train_diar.yaml 18 | """ 19 | DiarizationTask.main(cmd=cmd) 20 | 21 | 22 | if __name__ == "__main__": 23 | main() 24 | -------------------------------------------------------------------------------- /espnet2/bin/enh_s2t_train.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | from espnet2.tasks.enh_s2t import EnhS2TTask 3 | 4 | 5 | def get_parser(): 6 | parser = EnhS2TTask.get_parser() 7 | return parser 8 | 9 | 10 | def main(cmd=None): 11 | r"""EnhS2T training. 
12 | 
13 |     Example:
14 | 
15 |         % python enh_s2t_train.py enh_s2t --print_config --optim adadelta \
16 |                 > conf/train_enh_s2t.yaml
17 |         % python enh_s2t_train.py --config conf/train_enh_s2t.yaml
18 |     """
19 |     EnhS2TTask.main(cmd=cmd)
20 | 
21 | 
22 | if __name__ == "__main__":
23 |     main()
24 | 
--------------------------------------------------------------------------------
/espnet2/bin/enh_train.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | from espnet2.tasks.enh import EnhancementTask
3 | 
4 | 
5 | def get_parser():
6 |     parser = EnhancementTask.get_parser()
7 |     return parser
8 | 
9 | 
10 | def main(cmd=None):
11 |     r"""Enhancement frontend training.
12 | 
13 |     Example:
14 | 
15 |         % python enh_train.py enh --print_config --optim adadelta \
16 |                 > conf/train_enh.yaml
17 |         % python enh_train.py --config conf/train_enh.yaml
18 |     """
19 |     EnhancementTask.main(cmd=cmd)
20 | 
21 | 
22 | if __name__ == "__main__":
23 |     main()
24 | 
--------------------------------------------------------------------------------
/espnet2/bin/enh_tse_train.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | from espnet2.tasks.enh_tse import TargetSpeakerExtractionTask
3 | 
4 | 
5 | def get_parser():
6 |     parser = TargetSpeakerExtractionTask.get_parser()
7 |     return parser
8 | 
9 | 
10 | def main(cmd=None):
11 |     r"""Target Speaker Extraction model training.
12 | 
13 |     Example:
14 | 
15 |         % python enh_tse_train.py asr --print_config --optim adadelta \
16 |                 > conf/train_enh.yaml
17 |         % python enh_tse_train.py --config conf/train_enh.yaml
18 |     """
19 |     TargetSpeakerExtractionTask.main(cmd=cmd)
20 | 
21 | 
22 | if __name__ == "__main__":
23 |     main()
24 | 
--------------------------------------------------------------------------------
/espnet2/bin/gan_svs_train.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | from espnet2.tasks.gan_svs import GANSVSTask
3 | 
4 | 
5 | def get_parser():
6 |     parser = GANSVSTask.get_parser()
7 |     return parser
8 | 
9 | 
10 | def main(cmd=None):
11 |     """GAN-based SVS training.
12 | 
13 |     Example:
14 | 
15 |         % python gan_svs_train.py --print_config --optim1 adadelta
16 |         % python gan_svs_train.py --config conf/train.yaml
17 |     """
18 |     GANSVSTask.main(cmd=cmd)
19 | 
20 | 
21 | if __name__ == "__main__":
22 |     main()
23 | 
--------------------------------------------------------------------------------
/espnet2/bin/gan_tts_train.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | from espnet2.tasks.gan_tts import GANTTSTask
3 | 
4 | 
5 | def get_parser():
6 |     parser = GANTTSTask.get_parser()
7 |     return parser
8 | 
9 | 
10 | def main(cmd=None):
11 |     """GAN-based TTS training.
12 | 
13 |     Example:
14 | 
15 |         % python gan_tts_train.py --print_config --optim1 adadelta
16 |         % python gan_tts_train.py --config conf/train.yaml
17 |     """
18 |     GANTTSTask.main(cmd=cmd)
19 | 
20 | 
21 | if __name__ == "__main__":
22 |     main()
23 | 
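All of these bin entry points follow the same pattern: get_parser() exposes the task's argparse parser, and main() simply delegates to the task's main(). The same training run can therefore be launched programmatically; a sketch (the config path is illustrative only):

    # equivalent to: python enh_train.py --config conf/train_enh.yaml
    from espnet2.tasks.enh import EnhancementTask

    EnhancementTask.main(cmd=["--config", "conf/train_enh.yaml"])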
--------------------------------------------------------------------------------
/espnet2/bin/hubert_train.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | from espnet2.tasks.hubert import HubertTask
3 | 
4 | 
5 | def get_parser():
6 |     parser = HubertTask.get_parser()
7 |     return parser
8 | 
9 | 
10 | def main(cmd=None):
11 |     """Hubert pretraining.
12 | 
13 |     Example:
14 |         % python hubert_train.py asr --print_config --optim adadelta \
15 |                 > conf/hubert_asr.yaml
16 |         % python hubert_train.py --config conf/train_asr.yaml
17 |     """
18 |     HubertTask.main(cmd=cmd)
19 | 
20 | 
21 | if __name__ == "__main__":
22 |     main()
23 | 
--------------------------------------------------------------------------------
/espnet2/bin/lm_train.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | from espnet2.tasks.lm import LMTask
3 | 
4 | 
5 | def get_parser():
6 |     parser = LMTask.get_parser()
7 |     return parser
8 | 
9 | 
10 | def main(cmd=None):
11 |     """LM training.
12 | 
13 |     Example:
14 | 
15 |         % python lm_train.py asr --print_config --optim adadelta
16 |         % python lm_train.py --config conf/train_asr.yaml
17 |     """
18 |     LMTask.main(cmd=cmd)
19 | 
20 | 
21 | if __name__ == "__main__":
22 |     main()
23 | 
--------------------------------------------------------------------------------
/espnet2/bin/mt_train.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | from espnet2.tasks.mt import MTTask
3 | 
4 | 
5 | def get_parser():
6 |     parser = MTTask.get_parser()
7 |     return parser
8 | 
9 | 
10 | def main(cmd=None):
11 |     r"""MT training.
12 | 
13 |     Example:
14 | 
15 |         % python mt_train.py st --print_config --optim adadelta \
16 |                 > conf/train_mt.yaml
17 |         % python mt_train.py --config conf/train_mt.yaml
18 |     """
19 |     MTTask.main(cmd=cmd)
20 | 
21 | 
22 | if __name__ == "__main__":
23 |     main()
24 | 
--------------------------------------------------------------------------------
/espnet2/bin/s2t_train.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | from espnet2.tasks.s2t import S2TTask
3 | 
4 | 
5 | def get_parser():
6 |     parser = S2TTask.get_parser()
7 |     return parser
8 | 
9 | 
10 | def main(cmd=None):
11 |     r"""S2T training.
12 | 
13 |     Example:
14 | 
15 |         % python s2t_train.py s2t --print_config --optim adadelta \
16 |                 > conf/train_s2t.yaml
17 |         % python s2t_train.py --config conf/train_s2t.yaml
18 |     """
19 |     S2TTask.main(cmd=cmd)
20 | 
21 | 
22 | if __name__ == "__main__":
23 |     main()
24 | 
--------------------------------------------------------------------------------
/espnet2/bin/slu_train.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | from espnet2.tasks.slu import SLUTask
3 | 
4 | 
5 | def get_parser():
6 |     parser = SLUTask.get_parser()
7 |     return parser
8 | 
9 | 
10 | def main(cmd=None):
11 |     r"""SLU training.
12 | 
13 |     Example:
14 | 
15 |         % python slu_train.py slu --print_config --optim adadelta \
16 |                 > conf/train_slu.yaml
17 |         % python slu_train.py --config conf/train_slu.yaml
18 |     """
19 |     SLUTask.main(cmd=cmd)
20 | 
21 | 
22 | if __name__ == "__main__":
23 |     main()
24 | 
--------------------------------------------------------------------------------
/espnet2/bin/spk_train.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | 
3 | from espnet2.tasks.spk import SpeakerTask
4 | 
5 | 
6 | def get_parser():
7 |     parser = SpeakerTask.get_parser()
8 |     return parser
9 | 
10 | 
11 | def main(cmd=None):
12 |     r"""Speaker embedding extractor training. The trained model can be used for
13 |     speaker verification, open-set speaker identification, and as
14 |     embeddings for various other tasks, including speaker diarization.
15 | 
16 |     Example:
17 |         % python spk_train.py --print_config --optim adadelta \
18 |                 > conf/train_spk.yaml
19 |         % python spk_train.py --config conf/train_spk.yaml
20 |     """
21 |     SpeakerTask.main(cmd=cmd)
22 | 
23 | 
24 | if __name__ == "__main__":
25 |     main()
26 | 
--------------------------------------------------------------------------------
/espnet2/bin/st_train.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | from espnet2.tasks.st import STTask
3 | 
4 | 
5 | def get_parser():
6 |     parser = STTask.get_parser()
7 |     return parser
8 | 
9 | 
10 | def main(cmd=None):
11 |     r"""ST training.
12 | 
13 |     Example:
14 | 
15 |         % python st_train.py st --print_config --optim adadelta \
16 |                 > conf/train_st.yaml
17 |         % python st_train.py --config conf/train_st.yaml
18 |     """
19 |     STTask.main(cmd=cmd)
20 | 
21 | 
22 | if __name__ == "__main__":
23 |     main()
24 | 
--------------------------------------------------------------------------------
/espnet2/bin/svs_train.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | from espnet2.tasks.svs import SVSTask
3 | 
4 | 
5 | def get_parser():
6 |     parser = SVSTask.get_parser()
7 |     return parser
8 | 
9 | 
10 | def main(cmd=None):
11 |     """SVS training.
12 | 
13 |     Example:
14 | 
15 |         % python svs_train.py svs --print_config --optim adadelta
16 |         % python svs_train.py --config conf/train_svs.yaml
17 |     """
18 |     SVSTask.main(cmd=cmd)
19 | 
20 | 
21 | if __name__ == "__main__":
22 |     main()
23 | 
--------------------------------------------------------------------------------
/espnet2/bin/tts_train.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | from espnet2.tasks.tts import TTSTask
3 | 
4 | 
5 | def get_parser():
6 |     parser = TTSTask.get_parser()
7 |     return parser
8 | 
9 | 
10 | def main(cmd=None):
11 |     """TTS training.
12 | 
13 |     Example:
14 | 
15 |         % python tts_train.py asr --print_config --optim adadelta
16 |         % python tts_train.py --config conf/train_asr.yaml
17 |     """
18 |     TTSTask.main(cmd=cmd)
19 | 
20 | 
21 | if __name__ == "__main__":
22 |     main()
23 | 
--------------------------------------------------------------------------------
/espnet2/bin/uasr_train.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | from espnet2.tasks.uasr import UASRTask
3 | 
4 | 
5 | def get_parser():
6 |     parser = UASRTask.get_parser()
7 |     return parser
8 | 
9 | 
10 | def main(cmd=None):
11 |     r"""UASR training.
12 | 13 | Example: 14 | 15 | % python uasr_train.py uasr --print_config --optim adadelta \ 16 | > conf/train_uasr.yaml 17 | % python uasr_train.py --config conf/train_uasr.yaml 18 | """ 19 | UASRTask.main(cmd=cmd) 20 | 21 | 22 | if __name__ == "__main__": 23 | main() 24 | -------------------------------------------------------------------------------- /espnet2/diar/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/espnet2/diar/__init__.py -------------------------------------------------------------------------------- /espnet2/diar/abs_diar.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from collections import OrderedDict 3 | from typing import Tuple 4 | 5 | import torch 6 | 7 | 8 | class AbsDiarization(torch.nn.Module, ABC): 9 | # @abstractmethod 10 | # def output_size(self) -> int: 11 | # raise NotImplementedError 12 | 13 | @abstractmethod 14 | def forward( 15 | self, 16 | input: torch.Tensor, 17 | ilens: torch.Tensor, 18 | ) -> Tuple[torch.Tensor, torch.Tensor, OrderedDict]: 19 | raise NotImplementedError 20 | 21 | @abstractmethod 22 | def forward_rawwav( 23 | self, input: torch.Tensor, ilens: torch.Tensor 24 | ) -> Tuple[torch.Tensor, torch.Tensor, OrderedDict]: 25 | raise NotImplementedError 26 | -------------------------------------------------------------------------------- /espnet2/diar/attractor/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/espnet2/diar/attractor/__init__.py -------------------------------------------------------------------------------- /espnet2/diar/attractor/abs_attractor.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import Tuple 3 | 4 | import torch 5 | 6 | 7 | class AbsAttractor(torch.nn.Module, ABC): 8 | @abstractmethod 9 | def forward( 10 | self, 11 | enc_input: torch.Tensor, 12 | ilens: torch.Tensor, 13 | dec_input: torch.Tensor, 14 | ) -> Tuple[torch.Tensor, torch.Tensor]: 15 | raise NotImplementedError 16 | -------------------------------------------------------------------------------- /espnet2/diar/decoder/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/espnet2/diar/decoder/__init__.py -------------------------------------------------------------------------------- /espnet2/diar/decoder/abs_decoder.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import Tuple 3 | 4 | import torch 5 | 6 | 7 | class AbsDecoder(torch.nn.Module, ABC): 8 | @abstractmethod 9 | def forward( 10 | self, 11 | input: torch.Tensor, 12 | ilens: torch.Tensor, 13 | ) -> Tuple[torch.Tensor, torch.Tensor]: 14 | raise NotImplementedError 15 | 16 | @property 17 | @abstractmethod 18 | def num_spk(self): 19 | raise NotImplementedError 20 | -------------------------------------------------------------------------------- /espnet2/diar/decoder/linear_decoder.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from 
espnet2.diar.decoder.abs_decoder import AbsDecoder 4 | 5 | 6 | class LinearDecoder(AbsDecoder): 7 | """Linear decoder for speaker diarization""" 8 | 9 | def __init__( 10 | self, 11 | encoder_output_size: int, 12 | num_spk: int = 2, 13 | ): 14 | super().__init__() 15 | self._num_spk = num_spk 16 | self.linear_decoder = torch.nn.Linear(encoder_output_size, num_spk) 17 | 18 | def forward(self, input: torch.Tensor, ilens: torch.Tensor): 19 | """Forward. 20 | 21 | Args: 22 | input (torch.Tensor): hidden_space [Batch, T, F] 23 | ilens (torch.Tensor): input lengths [Batch] 24 | """ 25 | 26 | output = self.linear_decoder(input) 27 | 28 | return output 29 | 30 | @property 31 | def num_spk(self): 32 | return self._num_spk 33 | -------------------------------------------------------------------------------- /espnet2/diar/label_processor.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from espnet2.layers.label_aggregation import LabelAggregate 4 | 5 | 6 | class LabelProcessor(torch.nn.Module): 7 | """Label aggregator for speaker diarization""" 8 | 9 | def __init__( 10 | self, win_length: int = 512, hop_length: int = 128, center: bool = True 11 | ): 12 | super().__init__() 13 | self.label_aggregator = LabelAggregate(win_length, hop_length, center) 14 | 15 | def forward(self, input: torch.Tensor, ilens: torch.Tensor): 16 | """Forward. 17 | 18 | Args: 19 | input: (Batch, Nsamples, Label_dim) 20 | ilens: (Batch) 21 | Returns: 22 | output: (Batch, Frames, Label_dim) 23 | olens: (Batch) 24 | 25 | """ 26 | 27 | output, olens = self.label_aggregator(input, ilens) 28 | 29 | return output, olens 30 | -------------------------------------------------------------------------------- /espnet2/diar/layers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/espnet2/diar/layers/__init__.py -------------------------------------------------------------------------------- /espnet2/diar/layers/abs_mask.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from collections import OrderedDict 3 | from typing import Tuple 4 | 5 | import torch 6 | 7 | 8 | class AbsMask(torch.nn.Module, ABC): 9 | @property 10 | @abstractmethod 11 | def max_num_spk(self) -> int: 12 | raise NotImplementedError 13 | 14 | @abstractmethod 15 | def forward( 16 | self, 17 | input, 18 | ilens, 19 | bottleneck_feat, 20 | num_spk, 21 | ) -> Tuple[Tuple[torch.Tensor], torch.Tensor, OrderedDict]: 22 | raise NotImplementedError 23 | -------------------------------------------------------------------------------- /espnet2/diar/separator/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/espnet2/diar/separator/__init__.py -------------------------------------------------------------------------------- /espnet2/enh/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/espnet2/enh/__init__.py -------------------------------------------------------------------------------- /espnet2/enh/abs_enh.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, 
abstractmethod 2 | from collections import OrderedDict 3 | from typing import Tuple 4 | 5 | import torch 6 | 7 | 8 | class AbsEnhancement(torch.nn.Module, ABC): 9 | # @abstractmethod 10 | # def output_size(self) -> int: 11 | # raise NotImplementedError 12 | 13 | @abstractmethod 14 | def forward( 15 | self, 16 | input: torch.Tensor, 17 | ilens: torch.Tensor, 18 | ) -> Tuple[torch.Tensor, torch.Tensor, OrderedDict]: 19 | raise NotImplementedError 20 | 21 | @abstractmethod 22 | def forward_rawwav( 23 | self, input: torch.Tensor, ilens: torch.Tensor 24 | ) -> Tuple[torch.Tensor, torch.Tensor, OrderedDict]: 25 | raise NotImplementedError 26 | -------------------------------------------------------------------------------- /espnet2/enh/decoder/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/espnet2/enh/decoder/__init__.py -------------------------------------------------------------------------------- /espnet2/enh/decoder/abs_decoder.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import Tuple 3 | 4 | import torch 5 | 6 | 7 | class AbsDecoder(torch.nn.Module, ABC): 8 | @abstractmethod 9 | def forward( 10 | self, 11 | input: torch.Tensor, 12 | ilens: torch.Tensor, 13 | ) -> Tuple[torch.Tensor, torch.Tensor]: 14 | raise NotImplementedError 15 | 16 | def forward_streaming(self, input_frame: torch.Tensor): 17 | raise NotImplementedError 18 | 19 | def streaming_merge(self, chunks: torch.Tensor, ilens: torch.tensor = None): 20 | """streaming_merge. It merges the frame-level processed audio chunks 21 | in the streaming *simulation*. It is noted that, in real applications, 22 | the processed audio should be sent to the output channel frame by frame. 23 | You may refer to this function to manage your streaming output buffer. 24 | 25 | Args: 26 | chunks: List [(B, frame_size),] 27 | ilens: [B] 28 | Returns: 29 | merge_audio: [B, T] 30 | """ 31 | 32 | raise NotImplementedError 33 | -------------------------------------------------------------------------------- /espnet2/enh/decoder/null_decoder.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from espnet2.enh.decoder.abs_decoder import AbsDecoder 4 | 5 | 6 | class NullDecoder(AbsDecoder): 7 | """Null decoder, return the same args.""" 8 | 9 | def __init__(self): 10 | super().__init__() 11 | 12 | def forward(self, input: torch.Tensor, ilens: torch.Tensor): 13 | """Forward. The input should be the waveform already. 
14 | 
15 |         Args:
16 |             input (torch.Tensor): wav [Batch, sample]
17 |             ilens (torch.Tensor): input lengths [Batch]
18 |         """
19 |         return input, ilens
20 | 
--------------------------------------------------------------------------------
/espnet2/enh/encoder/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/espnet2/enh/encoder/__init__.py
--------------------------------------------------------------------------------
/espnet2/enh/encoder/abs_encoder.py:
--------------------------------------------------------------------------------
1 | from abc import ABC, abstractmethod
2 | from typing import Tuple
3 | 
4 | import torch
5 | 
6 | 
7 | class AbsEncoder(torch.nn.Module, ABC):
8 |     @abstractmethod
9 |     def forward(
10 |         self,
11 |         input: torch.Tensor,
12 |         ilens: torch.Tensor,
13 |     ) -> Tuple[torch.Tensor, torch.Tensor]:
14 |         raise NotImplementedError
15 | 
16 |     @property
17 |     @abstractmethod
18 |     def output_dim(self) -> int:
19 |         raise NotImplementedError
20 | 
21 |     def forward_streaming(self, input: torch.Tensor):
22 |         raise NotImplementedError
23 | 
24 |     def streaming_frame(self, audio: torch.Tensor):
25 |         """streaming_frame. It splits the continuous audio into frame-level
26 |         audio chunks in the streaming *simulation*. It is noted that this
27 |         function takes the entire long audio as input for a streaming simulation.
28 |         You may refer to this function to manage your streaming input
29 |         buffer in a real streaming application.
30 | 
31 |         Args:
32 |             audio: (B, T)
33 |         Returns:
34 |             chunked: List [(B, frame_size),]
35 |         """
36 |         raise NotImplementedError
37 | 
--------------------------------------------------------------------------------
/espnet2/enh/encoder/null_encoder.py:
--------------------------------------------------------------------------------
1 | import torch
2 | 
3 | from espnet2.enh.encoder.abs_encoder import AbsEncoder
4 | 
5 | 
6 | class NullEncoder(AbsEncoder):
7 |     """Null encoder."""
8 | 
9 |     def __init__(self):
10 |         super().__init__()
11 | 
12 |     @property
13 |     def output_dim(self) -> int:
14 |         return 1
15 | 
16 |     def forward(self, input: torch.Tensor, ilens: torch.Tensor):
17 |         """Forward.
18 | 19 | Args: 20 | input (torch.Tensor): mixed speech [Batch, sample] 21 | ilens (torch.Tensor): input lengths [Batch] 22 | """ 23 | return input, ilens 24 | -------------------------------------------------------------------------------- /espnet2/enh/extractor/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/espnet2/enh/extractor/__init__.py -------------------------------------------------------------------------------- /espnet2/enh/extractor/abs_extractor.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from collections import OrderedDict 3 | from typing import Tuple 4 | 5 | import torch 6 | 7 | 8 | class AbsExtractor(torch.nn.Module, ABC): 9 | @abstractmethod 10 | def forward( 11 | self, 12 | input: torch.Tensor, 13 | ilens: torch.Tensor, 14 | input_aux: torch.Tensor, 15 | ilens_aux: torch.Tensor, 16 | suffix_tag: str = "", 17 | ) -> Tuple[Tuple[torch.Tensor], torch.Tensor, OrderedDict]: 18 | raise NotImplementedError 19 | -------------------------------------------------------------------------------- /espnet2/enh/layers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/espnet2/enh/layers/__init__.py -------------------------------------------------------------------------------- /espnet2/enh/layers/conv_utils.py: -------------------------------------------------------------------------------- 1 | # noqa: E501 ported from https://discuss.pytorch.org/t/utility-function-for-calculating-the-shape-of-a-conv-output/11173/7 2 | import math 3 | 4 | 5 | def num2tuple(num): 6 | return num if isinstance(num, tuple) else (num, num) 7 | 8 | 9 | def conv2d_output_shape(h_w, kernel_size=1, stride=1, pad=0, dilation=1): 10 | h_w, kernel_size, stride, pad, dilation = ( 11 | num2tuple(h_w), 12 | num2tuple(kernel_size), 13 | num2tuple(stride), 14 | num2tuple(pad), 15 | num2tuple(dilation), 16 | ) 17 | pad = num2tuple(pad[0]), num2tuple(pad[1]) 18 | 19 | h = math.floor( 20 | (h_w[0] + sum(pad[0]) - dilation[0] * (kernel_size[0] - 1) - 1) / stride[0] + 1 21 | ) 22 | w = math.floor( 23 | (h_w[1] + sum(pad[1]) - dilation[1] * (kernel_size[1] - 1) - 1) / stride[1] + 1 24 | ) 25 | 26 | return h, w 27 | 28 | 29 | def convtransp2d_output_shape( 30 | h_w, kernel_size=1, stride=1, pad=0, dilation=1, out_pad=0 31 | ): 32 | h_w, kernel_size, stride, pad, dilation, out_pad = ( 33 | num2tuple(h_w), 34 | num2tuple(kernel_size), 35 | num2tuple(stride), 36 | num2tuple(pad), 37 | num2tuple(dilation), 38 | num2tuple(out_pad), 39 | ) 40 | pad = num2tuple(pad[0]), num2tuple(pad[1]) 41 | 42 | h = ( 43 | (h_w[0] - 1) * stride[0] 44 | - sum(pad[0]) 45 | + dilation[0] * (kernel_size[0] - 1) 46 | + out_pad[0] 47 | + 1 48 | ) 49 | w = ( 50 | (h_w[1] - 1) * stride[1] 51 | - sum(pad[1]) 52 | + dilation[1] * (kernel_size[1] - 1) 53 | + out_pad[1] 54 | + 1 55 | ) 56 | 57 | return h, w 58 | -------------------------------------------------------------------------------- /espnet2/enh/loss/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/espnet2/enh/loss/__init__.py 
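As a quick sanity check of the output-shape formulas in conv_utils.py above, the helper agrees with torch.nn.Conv2d; the sizes below are arbitrary:

    import torch
    from espnet2.enh.layers.conv_utils import conv2d_output_shape

    # h = floor((64 + 2*1 - 1*(3 - 1) - 1) / 2 + 1) = 32, and likewise w = 50
    print(conv2d_output_shape((64, 100), kernel_size=3, stride=2, pad=1))  # (32, 50)

    conv = torch.nn.Conv2d(1, 1, kernel_size=3, stride=2, padding=1)
    print(conv(torch.zeros(1, 1, 64, 100)).shape[-2:])  # torch.Size([32, 50])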
--------------------------------------------------------------------------------
/espnet2/enh/loss/criterions/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/espnet2/enh/loss/criterions/__init__.py
--------------------------------------------------------------------------------
/espnet2/enh/loss/criterions/abs_loss.py:
--------------------------------------------------------------------------------
1 | from abc import ABC, abstractmethod
2 | 
3 | import torch
4 | 
5 | EPS = torch.finfo(torch.get_default_dtype()).eps
6 | 
7 | 
8 | class AbsEnhLoss(torch.nn.Module, ABC):
9 |     """Base class for all Enhancement loss modules."""
10 | 
11 |     # the name will be the key that appears in the reporter
12 |     @property
13 |     def name(self) -> str:
14 |         raise NotImplementedError
15 | 
16 |     # This property specifies whether the criterion will only
17 |     # be evaluated during the inference stage
18 |     @property
19 |     def only_for_test(self) -> bool:
20 |         return False
21 | 
22 |     @abstractmethod
23 |     def forward(
24 |         self,
25 |         ref,
26 |         inf,
27 |     ) -> torch.Tensor:
28 |         # the returned tensor should have shape (batch,)
29 |         raise NotImplementedError
30 | 
--------------------------------------------------------------------------------
/espnet2/enh/loss/wrappers/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/espnet2/enh/loss/wrappers/__init__.py
--------------------------------------------------------------------------------
/espnet2/enh/loss/wrappers/abs_wrapper.py:
--------------------------------------------------------------------------------
1 | from abc import ABC, abstractmethod
2 | from typing import Dict, List, Tuple
3 | 
4 | import torch
5 | 
6 | 
7 | class AbsLossWrapper(torch.nn.Module, ABC):
8 |     """Base class for all Enhancement loss wrapper modules."""
9 | 
10 |     # The weight for the current loss in the multi-task learning.
11 |     # The overall training target will be combined as:
12 |     # loss = weight_1 * loss_1 + ... + weight_N * loss_N
13 |     weight = 1.0
14 | 
15 |     @abstractmethod
16 |     def forward(
17 |         self,
18 |         ref: List,
19 |         inf: List,
20 |         others: Dict,
21 |     ) -> Tuple[torch.Tensor, Dict, Dict]:
22 |         raise NotImplementedError
23 | 
--------------------------------------------------------------------------------
/espnet2/enh/loss/wrappers/dpcl_solver.py:
--------------------------------------------------------------------------------
1 | from espnet2.enh.loss.criterions.abs_loss import AbsEnhLoss
2 | from espnet2.enh.loss.wrappers.abs_wrapper import AbsLossWrapper
3 | 
4 | 
5 | class DPCLSolver(AbsLossWrapper):
6 |     def __init__(self, criterion: AbsEnhLoss, weight=1.0):
7 |         super().__init__()
8 |         self.criterion = criterion
9 |         self.weight = weight
10 | 
11 |     def forward(self, ref, inf, others={}):
12 |         """A naive DPCL solver.
13 | 
14 |         Args:
15 |             ref (List[torch.Tensor]): [(batch, ...), ...] x n_spk
16 |             inf (List[torch.Tensor]): [(batch, ...), ...]
17 |             others (Dict): other data included in this solver,
18 |                 e.g. "tf_embedding": learned embedding of all T-F bins (B, T * F, D)
19 | 
20 |         Returns:
21 |             loss (torch.Tensor): the DPCL loss averaged over the batch
22 |             stats (dict): for collecting training status
23 |             others: reserved
24 |         """
25 |         assert "tf_embedding" in others
26 | 
27 |         loss = self.criterion(ref, others["tf_embedding"]).mean()
28 | 
29 |         stats = dict()
30 |         stats[self.criterion.name] = loss.detach()
31 | 
32 |         return loss.mean(), stats, {}
33 | 
--------------------------------------------------------------------------------
/espnet2/enh/loss/wrappers/fixed_order.py:
--------------------------------------------------------------------------------
1 | from collections import defaultdict
2 | 
3 | import torch
4 | 
5 | from espnet2.enh.loss.criterions.abs_loss import AbsEnhLoss
6 | from espnet2.enh.loss.wrappers.abs_wrapper import AbsLossWrapper
7 | 
8 | 
9 | class FixedOrderSolver(AbsLossWrapper):
10 |     def __init__(self, criterion: AbsEnhLoss, weight=1.0):
11 |         super().__init__()
12 |         self.criterion = criterion
13 |         self.weight = weight
14 | 
15 |     def forward(self, ref, inf, others={}):
16 |         """A naive fixed-order solver.
17 | 
18 |         Args:
19 |             ref (List[torch.Tensor]): [(batch, ...), ...] x n_spk
20 |             inf (List[torch.Tensor]): [(batch, ...), ...]
21 | 
22 |         Returns:
23 |             loss (torch.Tensor): loss under the fixed (identity) permutation
24 |             stats (dict): for collecting training status
25 |             others: reserved
26 |         """
27 |         assert len(ref) == len(inf), (len(ref), len(inf))
28 |         num_spk = len(ref)
29 | 
30 |         loss = 0.0
31 |         stats = defaultdict(list)
32 |         for r, i in zip(ref, inf):
33 |             loss += torch.mean(self.criterion(r, i)) / num_spk
34 |             for k, v in getattr(self.criterion, "stats", {}).items():
35 |                 stats[k].append(v)
36 | 
37 |         for k, v in stats.items():
38 |             stats[k] = torch.stack(v, dim=1).mean()
39 |         stats[self.criterion.name] = loss.detach()
40 | 
41 |         perm = torch.arange(num_spk).unsqueeze(0).repeat(ref[0].size(0), 1)
42 |         return loss.mean(), dict(stats), {"perm": perm}
43 | 
--------------------------------------------------------------------------------
/espnet2/enh/separator/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/espnet2/enh/separator/__init__.py
--------------------------------------------------------------------------------
/espnet2/enh/separator/abs_separator.py:
--------------------------------------------------------------------------------
1 | from abc import ABC, abstractmethod
2 | from collections import OrderedDict
3 | from typing import Dict, Optional, Tuple
4 | 
5 | import torch
6 | 
7 | 
8 | class AbsSeparator(torch.nn.Module, ABC):
9 |     @abstractmethod
10 |     def forward(
11 |         self,
12 |         input: torch.Tensor,
13 |         ilens: torch.Tensor,
14 |         additional: Optional[Dict] = None,
15 |     ) -> Tuple[Tuple[torch.Tensor], torch.Tensor, OrderedDict]:
16 |         raise NotImplementedError
17 | 
18 |     def forward_streaming(
19 |         self,
20 |         input_frame: torch.Tensor,
21 |         buffer=None,
22 |     ):
23 |         raise NotImplementedError
24 | 
25 |     @property
26 |     @abstractmethod
27 |     def num_spk(self):
28 |         raise NotImplementedError
29 | 
--------------------------------------------------------------------------------
/espnet2/fileio/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/espnet2/fileio/__init__.py
--------------------------------------------------------------------------------
/espnet2/fst/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/espnet2/fst/__init__.py
--------------------------------------------------------------------------------
/espnet2/gan_svs/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/espnet2/gan_svs/__init__.py
--------------------------------------------------------------------------------
/espnet2/gan_svs/abs_gan_svs.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 Tomoki Hayashi
2 | # Copyright 2022 Yifeng Yu
3 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
4 | 
5 | """GAN-based SVS abstract class."""
6 | 
7 | from abc import ABC, abstractmethod
8 | from typing import Dict, Union
9 | 
10 | import torch
11 | 
12 | from espnet2.svs.abs_svs import AbsSVS
13 | 
14 | 
15 | class AbsGANSVS(AbsSVS, ABC):
16 |     """GAN-based SVS model abstract class."""
17 | 
18 |     @abstractmethod
19 |     def forward(
20 |         self,
21 |         forward_generator,
22 |         *args,
23 |         **kwargs,
24 |     ) -> Dict[str, Union[torch.Tensor, Dict[str, torch.Tensor], int]]:
25 |         """Return generator or discriminator loss."""
26 |         raise NotImplementedError
27 | 
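The `forward_generator` flag means a single model object serves both optimization phases of GAN training. Schematically, a trainer alternates the two calls as below; this is a sketch only, with hypothetical optimizer names, and it assumes the returned dict carries its total under a "loss" key (espnet2's actual trainer is more involved):

    for batch in loader:
        # generator phase: returns the generator's loss dict
        g_out = model(forward_generator=True, **batch)
        g_out["loss"].backward()
        opt_g.step()
        opt_g.zero_grad()

        # discriminator phase: returns the discriminator's loss dict
        d_out = model(forward_generator=False, **batch)
        d_out["loss"].backward()
        opt_d.step()
        opt_d.zero_grad()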
--------------------------------------------------------------------------------
/espnet2/gan_svs/avocodo/__init__.py:
--------------------------------------------------------------------------------
1 | from espnet2.gan_svs.avocodo.avocodo import (
2 |     MDC,
3 |     SBD,
4 |     AvocodoDiscriminator,
5 |     AvocodoDiscriminatorPlus,
6 |     AvocodoGenerator,
7 |     CoMBD,
8 |     CoMBDBlock,
9 |     SBDBlock,
10 | )
11 | 
12 | __all__ = [
13 |     "MDC",
14 |     "SBD",
15 |     "AvocodoDiscriminator",
16 |     "AvocodoDiscriminatorPlus",
17 |     "AvocodoGenerator",
18 |     "CoMBD",
19 |     "CoMBDBlock",
20 |     "SBDBlock",
21 | ]
22 | 
--------------------------------------------------------------------------------
/espnet2/gan_svs/joint/__init__.py:
--------------------------------------------------------------------------------
1 | from espnet2.gan_svs.joint.joint_score2wav import JointScore2Wav  # NOQA
2 | 
--------------------------------------------------------------------------------
/espnet2/gan_svs/uhifigan/__init__.py:
--------------------------------------------------------------------------------
1 | from espnet2.gan_svs.uhifigan.sine_generator import SineGen
2 | from espnet2.gan_svs.uhifigan.uhifigan import UHiFiGANGenerator
3 | 
4 | __all__ = [
5 |     "UHiFiGANGenerator",
6 |     "SineGen",
7 | ]
8 | 
--------------------------------------------------------------------------------
/espnet2/gan_svs/utils/__init__.py:
--------------------------------------------------------------------------------
1 | from espnet2.gan_svs.utils.expand_f0 import expand_f0
2 | 
--------------------------------------------------------------------------------
/espnet2/gan_svs/utils/expand_f0.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Yifeng Yu
2 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
3 | 
4 | """Function to expand frame-level F0 to the waveform sample level."""
5 | 
6 | from typing import Optional, Tuple
7 | 
8 | import torch
9 | import torch.nn.functional as F
10 | 
11 | 
12 | def expand_f0(f0_frame, hop_length, method="interpolation"):
13 |     """Expand f0 to the output wave length.
14 | 
15 |     Args:
16 |         f0_frame (Tensor): Input tensor (B, 1, frame_len).
17 |         hop_length (int): Hop length.
18 |         method (str): Method to expand f0. Choose either 'interpolation' or 'repeat'.
19 | 
20 |     Returns:
21 |         Tensor: Output tensor (B, wav_len).
22 | 
23 |     """
24 |     frame_length = f0_frame.size(2)
25 |     signal_length = frame_length * hop_length
26 |     if method == "interpolation":
27 |         f0_sample = F.interpolate(
28 |             f0_frame, size=signal_length, mode="linear", align_corners=False
29 |         )
30 |     elif method == "repeat":
31 |         f0_sample = f0_frame.repeat_interleave(hop_length, dim=2)[:, :, :signal_length]
32 |     else:
33 |         raise ValueError("Invalid method. Choose either 'interpolation' or 'repeat'.")
34 |     f0_sample = f0_sample.squeeze(1)[
35 |         :, :signal_length
36 |     ]  # drop the channel dimension and trim to signal_length
37 |     return f0_sample
38 | 
--------------------------------------------------------------------------------
/espnet2/gan_svs/visinger2/__init__.py:
--------------------------------------------------------------------------------
1 | from espnet2.gan_svs.visinger2.visinger2_vocoder import (
2 |     Generator_Harm,
3 |     Generator_Noise,
4 |     VISinger2Discriminator,
5 |     VISinger2VocoderGenerator,
6 | )
7 | 
8 | __all__ = [
9 |     "Generator_Harm",
10 |     "Generator_Noise",
11 |     "VISinger2Discriminator",
12 |     "VISinger2VocoderGenerator",
13 | ]
14 | 
--------------------------------------------------------------------------------
/espnet2/gan_svs/vits/__init__.py:
--------------------------------------------------------------------------------
1 | from espnet2.gan_svs.vits.vits import VITS  # NOQA
2 | 
--------------------------------------------------------------------------------
/espnet2/gan_svs/vits/modules.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | 
4 | # Copyright 2022 Yifeng Yu
5 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
6 | 
7 | import torch
8 | 
9 | 
10 | class Projection(torch.nn.Module):
11 |     def __init__(self, hidden_channels, out_channels):
12 |         super().__init__()
13 |         self.hidden_channels = hidden_channels
14 |         self.out_channels = out_channels
15 |         self.proj = torch.nn.Conv1d(hidden_channels, out_channels * 2, 1)
16 | 
17 |     def forward(self, x, x_mask):
18 |         # x shape: (B, attention_dim, T_text)
19 |         stats = self.proj(x) * x_mask
20 |         m_p, logs_p = torch.split(stats, self.out_channels, dim=1)
21 |         return m_p, logs_p
22 | 
23 | 
24 | def sequence_mask(length, max_length=None):
25 |     if max_length is None:
26 |         max_length = length.max()
27 |     x = torch.arange(max_length, dtype=length.dtype, device=length.device)
28 |     return x.unsqueeze(0) < length.unsqueeze(1)
29 | 
--------------------------------------------------------------------------------
/espnet2/gan_tts/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/espnet2/gan_tts/__init__.py
--------------------------------------------------------------------------------
/espnet2/gan_tts/abs_gan_tts.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 Tomoki Hayashi
2 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
3 | 
4 | """GAN-based TTS abstract class."""
5 | 
6 | from abc import ABC, abstractmethod
7 | from typing import Dict, Union
8 | 
9 | import torch
10 | 
11 | 
from espnet2.tts.abs_tts import AbsTTS 12 | 13 | 14 | class AbsGANTTS(AbsTTS, ABC): 15 | """GAN-based TTS model abstract class.""" 16 | 17 | @abstractmethod 18 | def forward( 19 | self, 20 | forward_generator, 21 | *args, 22 | **kwargs, 23 | ) -> Dict[str, Union[torch.Tensor, Dict[str, torch.Tensor], int]]: 24 | """Return generator or discriminator loss.""" 25 | raise NotImplementedError 26 | -------------------------------------------------------------------------------- /espnet2/gan_tts/hifigan/__init__.py: -------------------------------------------------------------------------------- 1 | from espnet2.gan_tts.hifigan.hifigan import ( 2 | HiFiGANGenerator, 3 | HiFiGANMultiPeriodDiscriminator, 4 | HiFiGANMultiScaleDiscriminator, 5 | HiFiGANMultiScaleMultiPeriodDiscriminator, 6 | HiFiGANPeriodDiscriminator, 7 | HiFiGANScaleDiscriminator, 8 | ) 9 | 10 | __all__ = [ 11 | "HiFiGANGenerator", 12 | "HiFiGANMultiPeriodDiscriminator", 13 | "HiFiGANMultiScaleDiscriminator", 14 | "HiFiGANMultiScaleMultiPeriodDiscriminator", 15 | "HiFiGANPeriodDiscriminator", 16 | "HiFiGANScaleDiscriminator", 17 | ] 18 | -------------------------------------------------------------------------------- /espnet2/gan_tts/jets/__init__.py: -------------------------------------------------------------------------------- 1 | from espnet2.gan_tts.jets.jets import JETS # NOQA 2 | -------------------------------------------------------------------------------- /espnet2/gan_tts/joint/__init__.py: -------------------------------------------------------------------------------- 1 | from espnet2.gan_tts.joint.joint_text2wav import JointText2Wav # NOQA 2 | -------------------------------------------------------------------------------- /espnet2/gan_tts/melgan/__init__.py: -------------------------------------------------------------------------------- 1 | from espnet2.gan_tts.melgan.melgan import MelGANDiscriminator # NOQA 2 | from espnet2.gan_tts.melgan.melgan import MelGANGenerator # NOQA 3 | from espnet2.gan_tts.melgan.melgan import MelGANMultiScaleDiscriminator # NOQA 4 | -------------------------------------------------------------------------------- /espnet2/gan_tts/parallel_wavegan/__init__.py: -------------------------------------------------------------------------------- 1 | from espnet2.gan_tts.parallel_wavegan.parallel_wavegan import ( 2 | ParallelWaveGANDiscriminator, 3 | ParallelWaveGANGenerator, 4 | ) 5 | 6 | __all__ = ["ParallelWaveGANDiscriminator", "ParallelWaveGANGenerator"] 7 | -------------------------------------------------------------------------------- /espnet2/gan_tts/style_melgan/__init__.py: -------------------------------------------------------------------------------- 1 | from espnet2.gan_tts.style_melgan.style_melgan import StyleMelGANDiscriminator # NOQA 2 | from espnet2.gan_tts.style_melgan.style_melgan import StyleMelGANGenerator # NOQA 3 | -------------------------------------------------------------------------------- /espnet2/gan_tts/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from espnet2.gan_tts.utils.get_random_segments import get_random_segments # NOQA 2 | from espnet2.gan_tts.utils.get_random_segments import get_segments # NOQA 3 | -------------------------------------------------------------------------------- /espnet2/gan_tts/utils/get_random_segments.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Tomoki Hayashi 2 | # Apache 2.0 
(http://www.apache.org/licenses/LICENSE-2.0)
3 | 
4 | """Function to get random segments."""
5 | 
6 | from typing import Optional, Tuple
7 | 
8 | import torch
9 | 
10 | 
11 | def get_random_segments(
12 |     x: torch.Tensor,
13 |     x_lengths: torch.Tensor,
14 |     segment_size: int,
15 | ) -> Tuple[torch.Tensor, torch.Tensor]:
16 |     """Get random segments.
17 | 
18 |     Args:
19 |         x (Tensor): Input tensor (B, C, T).
20 |         x_lengths (Tensor): Length tensor (B,).
21 |         segment_size (int): Segment size.
22 | 
23 |     Returns:
24 |         Tensor: Segmented tensor (B, C, segment_size).
25 |         Tensor: Start index tensor (B,).
26 | 
27 |     """
28 |     b, c, t = x.size()
29 |     max_start_idx = x_lengths - segment_size
30 |     max_start_idx[max_start_idx < 0] = 0
31 |     start_idxs = (torch.rand([b]).to(x.device) * max_start_idx).to(
32 |         dtype=torch.long,
33 |     )
34 |     segments = get_segments(x, start_idxs, segment_size)
35 | 
36 |     return segments, start_idxs
37 | 
38 | 
39 | def get_segments(
40 |     x: torch.Tensor,
41 |     start_idxs: torch.Tensor,
42 |     segment_size: int,
43 | ) -> torch.Tensor:
44 |     """Get segments.
45 | 
46 |     Args:
47 |         x (Tensor): Input tensor (B, C, T).
48 |         start_idxs (Tensor): Start index tensor (B,).
49 |         segment_size (int): Segment size.
50 | 
51 |     Returns:
52 |         Tensor: Segmented tensor (B, C, segment_size).
53 | 
54 |     """
55 |     b, c, t = x.size()
56 |     segments = x.new_zeros(b, c, segment_size)
57 |     for i, start_idx in enumerate(start_idxs):
58 |         segments[i] = x[i, :, start_idx : start_idx + segment_size]
59 |     return segments
60 | 
--------------------------------------------------------------------------------
/espnet2/gan_tts/vits/__init__.py:
--------------------------------------------------------------------------------
1 | from espnet2.gan_tts.vits.vits import VITS  # NOQA
2 | 
--------------------------------------------------------------------------------
/espnet2/gan_tts/vits/monotonic_align/core.pyx:
--------------------------------------------------------------------------------
1 | """Maximum path calculation module with cython optimization.
2 | 
3 | This code is copied from https://github.com/jaywalnut310/vits with the code format modified.
4 | 5 | """ 6 | 7 | cimport cython 8 | 9 | from cython.parallel import prange 10 | 11 | 12 | @cython.boundscheck(False) 13 | @cython.wraparound(False) 14 | cdef void maximum_path_each(int[:, ::1] path, float[:, ::1] value, int t_y, int t_x, float max_neg_val=-1e9) nogil: 15 | cdef int x 16 | cdef int y 17 | cdef float v_prev 18 | cdef float v_cur 19 | cdef float tmp 20 | cdef int index = t_x - 1 21 | 22 | for y in range(t_y): 23 | for x in range(max(0, t_x + y - t_y), min(t_x, y + 1)): 24 | if x == y: 25 | v_cur = max_neg_val 26 | else: 27 | v_cur = value[y - 1, x] 28 | if x == 0: 29 | if y == 0: 30 | v_prev = 0.0 31 | else: 32 | v_prev = max_neg_val 33 | else: 34 | v_prev = value[y - 1, x - 1] 35 | value[y, x] += max(v_prev, v_cur) 36 | 37 | for y in range(t_y - 1, -1, -1): 38 | path[y, index] = 1 39 | if index != 0 and (index == y or value[y - 1, index] < value[y - 1, index - 1]): 40 | index = index - 1 41 | 42 | 43 | @cython.boundscheck(False) 44 | @cython.wraparound(False) 45 | cpdef void maximum_path_c(int[:, :, ::1] paths, float[:, :, ::1] values, int[::1] t_ys, int[::1] t_xs) nogil: 46 | cdef int b = paths.shape[0] 47 | cdef int i 48 | for i in prange(b, nogil=True): 49 | maximum_path_each(paths[i], values[i], t_ys[i], t_xs[i]) 50 | -------------------------------------------------------------------------------- /espnet2/gan_tts/vits/monotonic_align/setup.py: -------------------------------------------------------------------------------- 1 | """Setup cython code.""" 2 | 3 | from Cython.Build import cythonize 4 | from setuptools import Extension, setup 5 | from setuptools.command.build_ext import build_ext as _build_ext 6 | 7 | 8 | class build_ext(_build_ext): 9 | """Overwrite build_ext.""" 10 | 11 | def finalize_options(self): 12 | """Prevent numpy from thinking it is still in its setup process.""" 13 | _build_ext.finalize_options(self) 14 | __builtins__.__NUMPY_SETUP__ = False 15 | import numpy 16 | 17 | self.include_dirs.append(numpy.get_include()) 18 | 19 | 20 | exts = [ 21 | Extension( 22 | name="core", 23 | sources=["core.pyx"], 24 | ) 25 | ] 26 | setup( 27 | name="monotonic_align", 28 | ext_modules=cythonize(exts, language_level=3), 29 | cmdclass={"build_ext": build_ext}, 30 | ) 31 | -------------------------------------------------------------------------------- /espnet2/gan_tts/wavenet/__init__.py: -------------------------------------------------------------------------------- 1 | from espnet2.gan_tts.wavenet.wavenet import WaveNet # NOQA 2 | -------------------------------------------------------------------------------- /espnet2/hubert/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/espnet2/hubert/__init__.py -------------------------------------------------------------------------------- /espnet2/iterators/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/espnet2/iterators/__init__.py -------------------------------------------------------------------------------- /espnet2/iterators/abs_iter_factory.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import Iterator 3 | 4 | 5 | class AbsIterFactory(ABC): 6 | @abstractmethod 7 | def build_iter(self, epoch: int, shuffle: bool = None) -> 
Iterator: 8 | raise NotImplementedError 9 | -------------------------------------------------------------------------------- /espnet2/iterators/multiple_iter_factory.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from typing import Callable, Collection, Iterator 3 | 4 | import numpy as np 5 | from typeguard import check_argument_types 6 | 7 | from espnet2.iterators.abs_iter_factory import AbsIterFactory 8 | 9 | 10 | class MultipleIterFactory(AbsIterFactory): 11 | def __init__( 12 | self, 13 | build_funcs: Collection[Callable[[], AbsIterFactory]], 14 | seed: int = 0, 15 | shuffle: bool = False, 16 | ): 17 | assert check_argument_types() 18 | self.build_funcs = list(build_funcs) 19 | self.seed = seed 20 | self.shuffle = shuffle 21 | 22 | def build_iter(self, epoch: int, shuffle: bool = None) -> Iterator: 23 | if shuffle is None: 24 | shuffle = self.shuffle 25 | 26 | build_funcs = list(self.build_funcs) 27 | 28 | if shuffle: 29 | np.random.RandomState(epoch + self.seed).shuffle(build_funcs) 30 | 31 | for i, build_func in enumerate(build_funcs): 32 | logging.info(f"Building {i}th iter-factory...") 33 | iter_factory = build_func() 34 | assert isinstance(iter_factory, AbsIterFactory), type(iter_factory) 35 | yield from iter_factory.build_iter(epoch, shuffle) 36 | -------------------------------------------------------------------------------- /espnet2/layers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/espnet2/layers/__init__.py -------------------------------------------------------------------------------- /espnet2/layers/abs_normalize.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import Tuple 3 | 4 | import torch 5 | 6 | 7 | class AbsNormalize(torch.nn.Module, ABC): 8 | @abstractmethod 9 | def forward( 10 | self, input: torch.Tensor, input_lengths: torch.Tensor = None 11 | ) -> Tuple[torch.Tensor, torch.Tensor]: 12 | # return output, output_lengths 13 | raise NotImplementedError 14 | -------------------------------------------------------------------------------- /espnet2/layers/inversible_interface.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import Tuple 3 | 4 | import torch 5 | 6 | 7 | class InversibleInterface(ABC): 8 | @abstractmethod 9 | def inverse( 10 | self, input: torch.Tensor, input_lengths: torch.Tensor = None 11 | ) -> Tuple[torch.Tensor, torch.Tensor]: 12 | # return output, output_lengths 13 | raise NotImplementedError 14 | -------------------------------------------------------------------------------- /espnet2/lm/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/espnet2/lm/__init__.py -------------------------------------------------------------------------------- /espnet2/lm/abs_model.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import Tuple 3 | 4 | import torch 5 | 6 | from espnet.nets.scorer_interface import BatchScorerInterface 7 | 8 | 9 | class AbsLM(torch.nn.Module, BatchScorerInterface, ABC): 10 | """The abstract LM class 11 | 12 | To 
share the loss calculation among different models,
13 |     we use the delegate pattern here:
14 |     the instance of this class should be passed to "LanguageModel".
15 | 
16 |     >>> from espnet2.lm.abs_model import AbsLM
17 |     >>> lm = AbsLM()
18 |     >>> model = ESPnetLanguageModel(lm=lm)
19 | 
20 |     This "model" is one of the mediator objects for the "Task" class.
21 | 
22 |     """
23 | 
24 |     @abstractmethod
25 |     def forward(
26 |         self, input: torch.Tensor, hidden: torch.Tensor
27 |     ) -> Tuple[torch.Tensor, torch.Tensor]:
28 |         raise NotImplementedError
29 | 
--------------------------------------------------------------------------------
/espnet2/main_funcs/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/espnet2/main_funcs/__init__.py
--------------------------------------------------------------------------------
/espnet2/mt/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/espnet2/mt/__init__.py
--------------------------------------------------------------------------------
/espnet2/mt/frontend/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/espnet2/mt/frontend/__init__.py
--------------------------------------------------------------------------------
/espnet2/mt/frontend/embedding.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # 2020, Technische Universität München; Ludwig Kürzinger
3 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
4 | 
5 | """Embedding Frontend for text based inputs."""
6 | 
7 | from typing import Tuple
8 | 
9 | import torch
10 | from typeguard import check_argument_types
11 | 
12 | from espnet2.asr.frontend.abs_frontend import AbsFrontend
13 | from espnet.nets.pytorch_backend.transformer.embedding import PositionalEncoding
14 | 
15 | 
16 | class Embedding(AbsFrontend):
17 |     """Embedding Frontend for text based inputs."""
18 | 
19 |     def __init__(
20 |         self,
21 |         input_size: int = 400,
22 |         embed_dim: int = 400,
23 |         pos_enc_class=PositionalEncoding,
24 |         positional_dropout_rate: float = 0.1,
25 |     ):
26 |         """Initialize.
27 | 
28 |         Args:
29 |             input_size: Number of input tokens.
30 |             embed_dim: Embedding size.
31 |             pos_enc_class: PositionalEncoding or ScaledPositionalEncoding
32 |             positional_dropout_rate: dropout rate after adding positional encoding
33 |         """
34 |         assert check_argument_types()
35 |         super().__init__()
36 |         self.embed_dim = embed_dim
37 |         # TODO(sdalmia): check for padding idx
38 |         self.embed = torch.nn.Sequential(
39 |             torch.nn.Embedding(input_size, embed_dim),
40 |             pos_enc_class(embed_dim, positional_dropout_rate),
41 |         )
42 | 
43 |     def forward(
44 |         self, input: torch.Tensor, input_lengths: torch.Tensor
45 |     ) -> Tuple[torch.Tensor, torch.Tensor]:
46 |         """Embed the input tokens and add positional encodings.
47 | 
48 |         Args:
49 |             input: Input token ids (B, T).
50 |             input_lengths: Input lengths within batch.
51 | 
52 |         Returns:
53 |             Tensor: Output with dimensions (B, T, D).
54 |             Tensor: Output lengths within batch.
55 |         """
56 |         x = self.embed(input)
57 | 
58 |         return x, input_lengths
59 | 
60 |     def output_size(self) -> int:
61 |         """Return output length of feature dimension D, i.e. the embedding dim."""
62 |         return self.embed_dim
63 | 
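A quick shape check for the Embedding frontend above; the sizes are arbitrary, and the snippet assumes the espnet imports resolve:

    import torch

    from espnet2.mt.frontend.embedding import Embedding

    frontend = Embedding(input_size=400, embed_dim=256)
    tokens = torch.randint(0, 400, (2, 50))  # (B, T) token ids
    lengths = torch.tensor([50, 42])
    feats, feat_lengths = frontend(tokens, lengths)
    print(feats.shape)             # torch.Size([2, 50, 256])
    print(frontend.output_size())  # 256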
the embedding dim.""" 62 | return self.embed_dim 63 | -------------------------------------------------------------------------------- /espnet2/optimizers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/espnet2/optimizers/__init__.py -------------------------------------------------------------------------------- /espnet2/optimizers/sgd.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from typeguard import check_argument_types 3 | 4 | 5 | class SGD(torch.optim.SGD): 6 | """Thin subclass of torch.optim.SGD that binds the required argument 'lr' to a default 7 | 8 | Note that 9 | the arguments of the optimizer invoked by AbsTask.main() 10 | must have default values, except for 'params'. 11 | 12 | torch.optim.SGD is the only stock optimizer whose 'lr' has no default value. 13 | """ 14 | 15 | def __init__( 16 | self, 17 | params, 18 | lr: float = 0.1, 19 | momentum: float = 0.0, 20 | dampening: float = 0.0, 21 | weight_decay: float = 0.0, 22 | nesterov: bool = False, 23 | ): 24 | assert check_argument_types() 25 | super().__init__( 26 | params, 27 | lr=lr, 28 | momentum=momentum, 29 | dampening=dampening, 30 | weight_decay=weight_decay, 31 | nesterov=nesterov, 32 | ) 33 | -------------------------------------------------------------------------------- /espnet2/s2t/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/espnet2/s2t/__init__.py -------------------------------------------------------------------------------- /espnet2/samplers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/espnet2/samplers/__init__.py -------------------------------------------------------------------------------- /espnet2/samplers/abs_sampler.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import Iterator, Tuple 3 | 4 | from torch.utils.data import Sampler 5 | 6 | 7 | class AbsSampler(Sampler, ABC): 8 | @abstractmethod 9 | def __len__(self) -> int: 10 | raise NotImplementedError 11 | 12 | @abstractmethod 13 | def __iter__(self) -> Iterator[Tuple[str, ...]]: 14 | raise NotImplementedError 15 | 16 | def generate(self, seed): 17 | return list(self) 18 | -------------------------------------------------------------------------------- /espnet2/schedulers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/espnet2/schedulers/__init__.py -------------------------------------------------------------------------------- /espnet2/schedulers/abs_scheduler.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | 3 | import torch.optim.lr_scheduler as L 4 | 5 | 6 | class AbsScheduler(ABC): 7 | @abstractmethod 8 | def step(self, epoch: int = None): 9 | pass 10 | 11 | @abstractmethod 12 | def state_dict(self): 13 | pass 14 | 15 | @abstractmethod 16 | def load_state_dict(self, state): 17 | pass 18 | 19 | 20 | # If you need to define a custom scheduler, please
inherit from these classes 21 | class AbsBatchStepScheduler(AbsScheduler): 22 | @abstractmethod 23 | def step(self, epoch: int = None): 24 | pass 25 | 26 | @abstractmethod 27 | def state_dict(self): 28 | pass 29 | 30 | @abstractmethod 31 | def load_state_dict(self, state): 32 | pass 33 | 34 | 35 | class AbsEpochStepScheduler(AbsScheduler): 36 | @abstractmethod 37 | def step(self, epoch: int = None): 38 | pass 39 | 40 | @abstractmethod 41 | def state_dict(self): 42 | pass 43 | 44 | @abstractmethod 45 | def load_state_dict(self, state): 46 | pass 47 | 48 | 49 | class AbsValEpochStepScheduler(AbsEpochStepScheduler): 50 | @abstractmethod 51 | def step(self, val, epoch: int = None): 52 | pass 53 | 54 | @abstractmethod 55 | def state_dict(self): 56 | pass 57 | 58 | @abstractmethod 59 | def load_state_dict(self, state): 60 | pass 61 | 62 | 63 | # Create alias types for isinstance checks 64 | # Note(kamo): Currently PyTorch doesn't provide base classes 65 | # that distinguish these scheduler kinds. 66 | AbsValEpochStepScheduler.register(L.ReduceLROnPlateau) 67 | for s in [ 68 | L.ReduceLROnPlateau, 69 | L.LambdaLR, 70 | L.StepLR, 71 | L.MultiStepLR, 72 | L.ExponentialLR, 73 | L.CosineAnnealingLR, 74 | ]: 75 | AbsEpochStepScheduler.register(s) 76 | 77 | AbsBatchStepScheduler.register(L.CyclicLR) 78 | for s in [ 79 | L.OneCycleLR, 80 | L.CosineAnnealingWarmRestarts, 81 | ]: 82 | AbsBatchStepScheduler.register(s) 83 | -------------------------------------------------------------------------------- /espnet2/schedulers/warmup_lr.py: -------------------------------------------------------------------------------- 1 | """Warm up learning rate scheduler module.""" 2 | from typing import Union 3 | 4 | import torch 5 | from torch.optim.lr_scheduler import _LRScheduler 6 | from typeguard import check_argument_types 7 | 8 | from espnet2.schedulers.abs_scheduler import AbsBatchStepScheduler 9 | 10 | 11 | class WarmupLR(_LRScheduler, AbsBatchStepScheduler): 12 | """The WarmupLR scheduler 13 | 14 | This scheduler is almost the same as the NoamLR scheduler except for the following difference: 15 | 16 | NoamLR: 17 | lr = optimizer.lr * model_size ** -0.5 18 | * min(step ** -0.5, step * warmup_step ** -1.5) 19 | WarmupLR: 20 | lr = optimizer.lr * warmup_step ** 0.5 21 | * min(step ** -0.5, step * warmup_step ** -1.5) 22 | 23 | Note that the maximum lr equals optimizer.lr in this scheduler.
24 | 25 | """ 26 | 27 | def __init__( 28 | self, 29 | optimizer: torch.optim.Optimizer, 30 | warmup_steps: Union[int, float] = 25000, 31 | last_epoch: int = -1, 32 | ): 33 | assert check_argument_types() 34 | self.warmup_steps = warmup_steps 35 | 36 | # __init__() must be invoked before setting field 37 | # because step() is also invoked in __init__() 38 | super().__init__(optimizer, last_epoch) 39 | 40 | def __repr__(self): 41 | return f"{self.__class__.__name__}(warmup_steps={self.warmup_steps})" 42 | 43 | def get_lr(self): 44 | step_num = self.last_epoch + 1 45 | return [ 46 | lr 47 | * self.warmup_steps**0.5 48 | * min(step_num**-0.5, step_num * self.warmup_steps**-1.5) 49 | for lr in self.base_lrs 50 | ] 51 | -------------------------------------------------------------------------------- /espnet2/slu/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/espnet2/slu/__init__.py -------------------------------------------------------------------------------- /espnet2/slu/postdecoder/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/espnet2/slu/postdecoder/__init__.py -------------------------------------------------------------------------------- /espnet2/slu/postdecoder/abs_postdecoder.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | 3 | import torch 4 | 5 | 6 | class AbsPostDecoder(torch.nn.Module, ABC): 7 | @abstractmethod 8 | def output_size(self) -> int: 9 | raise NotImplementedError 10 | 11 | @abstractmethod 12 | def forward( 13 | self, 14 | transcript_input_ids: torch.LongTensor, 15 | transcript_attention_mask: torch.LongTensor, 16 | transcript_token_type_ids: torch.LongTensor, 17 | transcript_position_ids: torch.LongTensor, 18 | ) -> torch.Tensor: 19 | raise NotImplementedError 20 | 21 | @abstractmethod 22 | def convert_examples_to_features( 23 | self, data: list, max_seq_length: int, output_size: int 24 | ): 25 | raise NotImplementedError 26 | -------------------------------------------------------------------------------- /espnet2/slu/postencoder/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/espnet2/slu/postencoder/__init__.py -------------------------------------------------------------------------------- /espnet2/spk/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/espnet2/spk/__init__.py -------------------------------------------------------------------------------- /espnet2/spk/encoder/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/espnet2/spk/encoder/__init__.py -------------------------------------------------------------------------------- /espnet2/spk/layers/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/espnet2/spk/layers/__init__.py -------------------------------------------------------------------------------- /espnet2/spk/loss/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/espnet2/spk/loss/__init__.py -------------------------------------------------------------------------------- /espnet2/spk/loss/abs_loss.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/python 2 | # -*- encoding: utf-8 -*- 3 | # code from https://github.com/clovaai/voxceleb_trainer/blob/master/loss/aamsoftmax.py 4 | # Adapted from https://github.com/wujiyang/Face_Pytorch (Apache License) 5 | from abc import ABC, abstractmethod 6 | 7 | import torch 8 | import torch.nn as nn 9 | 10 | 11 | class AbsLoss(nn.Module): 12 | def __init__(self, nout: int, **kwargs): 13 | super().__init__() 14 | 15 | @abstractmethod 16 | def forward(self, x: torch.Tensor, label=None) -> torch.Tensor: 17 | raise NotImplementedError 18 | -------------------------------------------------------------------------------- /espnet2/spk/pooling/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/espnet2/spk/pooling/__init__.py -------------------------------------------------------------------------------- /espnet2/spk/pooling/abs_pooling.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | 3 | import torch 4 | 5 | 6 | class AbsPooling(torch.nn.Module, ABC): 7 | @abstractmethod 8 | def forward(self, input: torch.Tensor) -> torch.Tensor: 9 | raise NotImplementedError 10 | -------------------------------------------------------------------------------- /espnet2/spk/pooling/chn_attn_stat_pooling.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | from espnet2.spk.pooling.abs_pooling import AbsPooling 5 | 6 | 7 | class ChnAttnStatPooling(AbsPooling): 8 | """ 9 | Aggregates frame-level features to single utterance-level feature. 10 | Proposed in B.Desplanques et al., "ECAPA-TDNN: Emphasized Channel 11 | Attention, Propagation and Aggregation in TDNN Based Speaker Verification" 12 | 13 | args: 14 | input_size: dimensionality of the input frame-level embeddings. 15 | Determined by encoder hyperparameter. 
16 | For this pooling layer, the output dimensionality will be double 17 | the input_size 18 | """ 19 | 20 | def __init__(self, input_size: int = 1536): 21 | super().__init__() 22 | self.attention = nn.Sequential( 23 | nn.Conv1d(input_size * 3, 128, kernel_size=1), 24 | nn.ReLU(), 25 | nn.BatchNorm1d(128), 26 | nn.Conv1d(128, input_size, kernel_size=1), 27 | nn.Softmax(dim=2), 28 | ) 29 | 30 | def forward(self, x): 31 | t = x.size()[-1] 32 | global_x = torch.cat( 33 | ( 34 | x, 35 | torch.mean(x, dim=2, keepdim=True).repeat(1, 1, t), 36 | torch.sqrt( 37 | torch.var(x, dim=2, keepdim=True).clamp(min=1e-4, max=1e4) 38 | ).repeat(1, 1, t), 39 | ), 40 | dim=1, 41 | ) 42 | 43 | w = self.attention(global_x) 44 | 45 | mu = torch.sum(x * w, dim=2) 46 | sg = torch.sqrt( 47 | (torch.sum((x**2) * w, dim=2) - mu**2).clamp(min=1e-4, max=1e4) 48 | ) 49 | 50 | x = torch.cat((mu, sg), dim=1) 51 | 52 | return x 53 | -------------------------------------------------------------------------------- /espnet2/spk/projector/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/espnet2/spk/projector/__init__.py -------------------------------------------------------------------------------- /espnet2/spk/projector/abs_projector.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | 3 | import torch 4 | 5 | 6 | class AbsProjector(torch.nn.Module, ABC): 7 | @abstractmethod 8 | def output_size(self) -> int: 9 | raise NotImplementedError 10 | 11 | @abstractmethod 12 | def forward(self, utt_embd: torch.Tensor) -> torch.Tensor: 13 | raise NotImplementedError 14 | -------------------------------------------------------------------------------- /espnet2/spk/projector/rawnet3_projector.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from espnet2.spk.projector.abs_projector import AbsProjector 4 | 5 | 6 | class RawNet3Projector(AbsProjector): 7 | def __init__(self, input_size, output_size): 8 | super().__init__() 9 | self._output_size = output_size 10 | 11 | self.bn = torch.nn.BatchNorm1d(input_size) 12 | self.fc = torch.nn.Linear(input_size, output_size) 13 | 14 | def output_size(self): 15 | return self._output_size 16 | 17 | def forward(self, x): 18 | return self.fc(self.bn(x)) 19 | -------------------------------------------------------------------------------- /espnet2/st/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/espnet2/st/__init__.py -------------------------------------------------------------------------------- /espnet2/svs/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/espnet2/svs/__init__.py -------------------------------------------------------------------------------- /espnet2/svs/abs_svs.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Tomoki Hayashi 2 | # Copyright 2021 Carnegie Mellon University (Jiatong Shi) 3 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 4 | 5 | """Singing-voice-synthesis abstract class.""" 6 | 7 | from abc import ABC, abstractmethod 8 | from
typing import Dict, Tuple 9 | 10 | import torch 11 | 12 | 13 | class AbsSVS(torch.nn.Module, ABC): 14 | """SVS abstract class.""" 15 | 16 | @abstractmethod 17 | def forward( 18 | self, 19 | text: torch.Tensor, 20 | text_lengths: torch.Tensor, 21 | feats: torch.Tensor, 22 | feats_lengths: torch.Tensor, 23 | **kwargs, 24 | ) -> Tuple[torch.Tensor, Dict[str, torch.Tensor], torch.Tensor]: 25 | """Calculate outputs and return the loss tensor.""" 26 | raise NotImplementedError 27 | 28 | @abstractmethod 29 | def inference( 30 | self, 31 | text: torch.Tensor, 32 | **kwargs, 33 | ) -> Dict[str, torch.Tensor]: 34 | """Return predicted output as a dict.""" 35 | raise NotImplementedError 36 | 37 | @property 38 | def require_raw_singing(self): 39 | """Return whether or not raw_singing is required.""" 40 | return False 41 | 42 | @property 43 | def require_vocoder(self): 44 | """Return whether or not vocoder is required.""" 45 | return True 46 | -------------------------------------------------------------------------------- /espnet2/svs/feats_extract/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/espnet2/svs/feats_extract/__init__.py -------------------------------------------------------------------------------- /espnet2/svs/naive_rnn/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/espnet2/svs/naive_rnn/__init__.py -------------------------------------------------------------------------------- /espnet2/svs/singing_tacotron/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/espnet2/svs/singing_tacotron/__init__.py -------------------------------------------------------------------------------- /espnet2/svs/xiaoice/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/espnet2/svs/xiaoice/__init__.py -------------------------------------------------------------------------------- /espnet2/tasks/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/espnet2/tasks/__init__.py -------------------------------------------------------------------------------- /espnet2/text/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/espnet2/text/__init__.py -------------------------------------------------------------------------------- /espnet2/text/abs_tokenizer.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import Iterable, List 3 | 4 | 5 | class AbsTokenizer(ABC): 6 | @abstractmethod 7 | def text2tokens(self, line: str) -> List[str]: 8 | raise NotImplementedError 9 | 10 | @abstractmethod 11 | def tokens2text(self, tokens: Iterable[str]) -> str: 12 | raise NotImplementedError 13 | -------------------------------------------------------------------------------- 
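The AbsTokenizer interface above is the contract that every tokenizer in this tree implements: text2tokens() splits a string into tokens, and tokens2text() inverts it. As a minimal sketch (not a file in this repo; the class name is hypothetical), a character-level tokenizer satisfying the interface could look like this:

from typing import Iterable, List

from espnet2.text.abs_tokenizer import AbsTokenizer


class SimpleCharTokenizer(AbsTokenizer):
    """Toy tokenizer: every character is one token."""

    def text2tokens(self, line: str) -> List[str]:
        return list(line)

    def tokens2text(self, tokens: Iterable[str]) -> str:
        return "".join(tokens)

For any string s, tokens2text(text2tokens(s)) round-trips exactly; the concrete tokenizers that follow (Hugging Face, SentencePiece, word-level) aim to preserve the same property up to normalization.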
/espnet2/text/hugging_face_token_id_converter.py: -------------------------------------------------------------------------------- 1 | from typing import Iterable, List, Union 2 | 3 | import numpy as np 4 | from typeguard import check_argument_types 5 | 6 | try: 7 | from transformers import AutoTokenizer 8 | 9 | is_transformers_available = True 10 | except ImportError: 11 | is_transformers_available = False 12 | 13 | 14 | class HuggingFaceTokenIDConverter: 15 | def __init__( 16 | self, 17 | model_name_or_path: str, 18 | ): 19 | assert check_argument_types() 20 | 21 | if not is_transformers_available: 22 | raise ImportError( 23 | "`transformers` is not available. Please install it via `pip install" 24 | " transformers` or `cd /path/to/espnet/tools && . ./activate_python.sh" 25 | " && ./installers/install_transformers.sh`." 26 | ) 27 | 28 | self.tokenizer = AutoTokenizer.from_pretrained(model_name_or_path) 29 | 30 | def get_num_vocabulary_size(self) -> int: 31 | return self.tokenizer.vocab_size 32 | 33 | def ids2tokens(self, integers: Union[np.ndarray, Iterable[int]]) -> List[str]: 34 | return self.tokenizer.convert_ids_to_tokens(integers) 35 | 36 | def tokens2ids(self, tokens: Iterable[str]) -> List[int]: 37 | return self.tokenizer.convert_tokens_to_ids(tokens) 38 | -------------------------------------------------------------------------------- /espnet2/text/hugging_face_tokenizer.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | from typing import Iterable, List, Union 3 | 4 | from typeguard import check_argument_types 5 | 6 | from espnet2.text.abs_tokenizer import AbsTokenizer 7 | 8 | try: 9 | from transformers import AutoTokenizer 10 | 11 | is_transformers_available = True 12 | except ImportError: 13 | is_transformers_available = False 14 | 15 | 16 | class HuggingFaceTokenizer(AbsTokenizer): 17 | def __init__(self, model: Union[Path, str]): 18 | assert check_argument_types() 19 | 20 | if not is_transformers_available: 21 | raise ImportError( 22 | "`transformers` is not available. Please install it via `pip install" 23 | " transformers` or `cd /path/to/espnet/tools && . ./activate_python.sh" 24 | " && ./installers/install_transformers.sh`." 25 | ) 26 | 27 | self.model = str(model) 28 | # NOTE(kamo): 29 | # Don't build tokenizer in __init__() 30 | # because it's not picklable and it may cause following error, 31 | # "TypeError: can't pickle SwigPyObject objects", 32 | # when giving it as argument of "multiprocessing.Process()". 33 | self.tokenizer = None 34 | 35 | def __repr__(self): 36 | return f'{self.__class__.__name__}(model="{self.model}")' 37 | 38 | def _build_tokenizer(self): 39 | # Build Hugging Face tokenizer lazily. 
40 | if self.tokenizer is None: 41 | self.tokenizer = AutoTokenizer.from_pretrained(self.model) 42 | 43 | def text2tokens(self, line: str) -> List[str]: 44 | self._build_tokenizer() 45 | return self.tokenizer.tokenize(line) 46 | 47 | def tokens2text(self, tokens: Iterable[str]) -> str: 48 | self._build_tokenizer() 49 | return ( 50 | self.tokenizer.batch_decode( 51 | [self.tokenizer.convert_tokens_to_ids(tokens)], skip_special_tokens=True 52 | )[0] 53 | .replace("\n", " ") 54 | .strip() 55 | ) 56 | -------------------------------------------------------------------------------- /espnet2/text/korean_cleaner.py: -------------------------------------------------------------------------------- 1 | # Referenced from https://github.com/hccho2/Tacotron-Wavenet-Vocoder-Korean 2 | 3 | import re 4 | 5 | 6 | class KoreanCleaner: 7 | @classmethod 8 | def _normalize_numbers(cls, text): 9 | number_to_kor = { 10 | "0": "영", 11 | "1": "일", 12 | "2": "이", 13 | "3": "삼", 14 | "4": "사", 15 | "5": "오", 16 | "6": "육", 17 | "7": "칠", 18 | "8": "팔", 19 | "9": "구", 20 | } 21 | new_text = "".join( 22 | number_to_kor[char] if char in number_to_kor.keys() else char 23 | for char in text 24 | ) 25 | return new_text 26 | 27 | @classmethod 28 | def _normalize_english_text(cls, text): 29 | upper_alphabet_to_kor = { 30 | "A": "에이", 31 | "B": "비", 32 | "C": "씨", 33 | "D": "디", 34 | "E": "이", 35 | "F": "에프", 36 | "G": "지", 37 | "H": "에이치", 38 | "I": "아이", 39 | "J": "제이", 40 | "K": "케이", 41 | "L": "엘", 42 | "M": "엠", 43 | "N": "엔", 44 | "O": "오", 45 | "P": "피", 46 | "Q": "큐", 47 | "R": "알", 48 | "S": "에스", 49 | "T": "티", 50 | "U": "유", 51 | "V": "브이", 52 | "W": "더블유", 53 | "X": "엑스", 54 | "Y": "와이", 55 | "Z": "지", 56 | } 57 | new_text = re.sub("[a-z]+", lambda x: str.upper(x.group()), text) 58 | new_text = "".join( 59 | upper_alphabet_to_kor[char] 60 | if char in upper_alphabet_to_kor.keys() 61 | else char 62 | for char in new_text 63 | ) 64 | 65 | return new_text 66 | 67 | @classmethod 68 | def normalize_text(cls, text): 69 | # stage 0 : text strip 70 | text = text.strip() 71 | 72 | # stage 1 : normalize numbers 73 | text = cls._normalize_numbers(text) 74 | 75 | # stage 2 : normalize english text 76 | text = cls._normalize_english_text(text) 77 | return text 78 | -------------------------------------------------------------------------------- /espnet2/text/sentencepiece_tokenizer.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | from typing import Dict, Iterable, List, Union 3 | 4 | import sentencepiece as spm 5 | from typeguard import check_argument_types 6 | 7 | from espnet2.text.abs_tokenizer import AbsTokenizer 8 | 9 | 10 | class SentencepiecesTokenizer(AbsTokenizer): 11 | def __init__(self, model: Union[Path, str], encode_kwargs: Dict = dict()): 12 | assert check_argument_types() 13 | self.model = str(model) 14 | # NOTE(kamo): 15 | # Don't build SentencePieceProcessor in __init__() 16 | # because it's not picklable and it may cause following error, 17 | # "TypeError: can't pickle SwigPyObject objects", 18 | # when giving it as argument of "multiprocessing.Process()". 19 | self.sp = None 20 | self.encode_kwargs = encode_kwargs 21 | 22 | def __repr__(self): 23 | return f'{self.__class__.__name__}(model="{self.model}")' 24 | 25 | def _build_sentence_piece_processor(self): 26 | # Build SentencePieceProcessor lazily. 
27 | if self.sp is None: 28 | self.sp = spm.SentencePieceProcessor() 29 | self.sp.load(self.model) 30 | 31 | def text2tokens(self, line: str) -> List[str]: 32 | self._build_sentence_piece_processor() 33 | return self.sp.EncodeAsPieces(line, **self.encode_kwargs) 34 | 35 | def tokens2text(self, tokens: Iterable[str]) -> str: 36 | self._build_sentence_piece_processor() 37 | return self.sp.DecodePieces(list(tokens)) 38 | -------------------------------------------------------------------------------- /espnet2/text/word_tokenizer.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | from pathlib import Path 3 | from typing import Iterable, List, Union 4 | 5 | from typeguard import check_argument_types 6 | 7 | from espnet2.text.abs_tokenizer import AbsTokenizer 8 | 9 | 10 | class WordTokenizer(AbsTokenizer): 11 | def __init__( 12 | self, 13 | delimiter: str = None, 14 | non_linguistic_symbols: Union[Path, str, Iterable[str]] = None, 15 | remove_non_linguistic_symbols: bool = False, 16 | ): 17 | assert check_argument_types() 18 | self.delimiter = delimiter 19 | 20 | if not remove_non_linguistic_symbols and non_linguistic_symbols is not None: 21 | warnings.warn( 22 | "non_linguistic_symbols is only used " 23 | "when remove_non_linguistic_symbols = True" 24 | ) 25 | 26 | if non_linguistic_symbols is None: 27 | self.non_linguistic_symbols = set() 28 | elif isinstance(non_linguistic_symbols, (Path, str)): 29 | non_linguistic_symbols = Path(non_linguistic_symbols) 30 | try: 31 | with non_linguistic_symbols.open("r", encoding="utf-8") as f: 32 | self.non_linguistic_symbols = set(line.rstrip() for line in f) 33 | except FileNotFoundError: 34 | warnings.warn(f"{non_linguistic_symbols} doesn't exist.") 35 | self.non_linguistic_symbols = set() 36 | else: 37 | self.non_linguistic_symbols = set(non_linguistic_symbols) 38 | self.remove_non_linguistic_symbols = remove_non_linguistic_symbols 39 | 40 | def __repr__(self): 41 | return f'{self.__class__.__name__}(delimiter="{self.delimiter}")' 42 | 43 | def text2tokens(self, line: str) -> List[str]: 44 | tokens = [] 45 | for t in line.split(self.delimiter): 46 | if self.remove_non_linguistic_symbols and t in self.non_linguistic_symbols: 47 | continue 48 | tokens.append(t) 49 | return tokens 50 | 51 | def tokens2text(self, tokens: Iterable[str]) -> str: 52 | if self.delimiter is None: 53 | delimiter = " " 54 | else: 55 | delimiter = self.delimiter 56 | return delimiter.join(tokens) 57 | -------------------------------------------------------------------------------- /espnet2/torch_utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/espnet2/torch_utils/__init__.py -------------------------------------------------------------------------------- /espnet2/torch_utils/add_gradient_noise.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def add_gradient_noise( 5 | model: torch.nn.Module, 6 | iteration: int, 7 | duration: float = 100, 8 | eta: float = 1.0, 9 | scale_factor: float = 0.55, 10 | ): 11 | """Adds noise from a standard normal distribution to the gradients. 12 | 13 | The standard deviation (`sigma`) is controlled 14 | by the three hyper-parameters below. 15 | `sigma` goes to zero (no noise) with more iterations. 16 | 17 | Args: 18 | model: Model. 19 | iteration: Number of iterations. 
20 | duration: {100, 1000}: Number of iterations between 21 | each decay of `sigma`. 22 | eta: {0.01, 0.3, 1.0}: The magnitude of `sigma`. 23 | scale_factor: {0.55}: The decay exponent of `sigma`. 24 | """ 25 | interval = (iteration // duration) + 1 26 | sigma = eta / interval**scale_factor 27 | for param in model.parameters(): 28 | if param.grad is not None: 29 | _shape = param.grad.size() 30 | noise = sigma * torch.randn(_shape).to(param.device) 31 | param.grad += noise 32 | -------------------------------------------------------------------------------- /espnet2/torch_utils/forward_adaptor.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from typeguard import check_argument_types 3 | 4 | 5 | class ForwardAdaptor(torch.nn.Module): 6 | """Wrapper module to parallelize a specified method 7 | 8 | torch.nn.DataParallel parallelizes only "forward()", 9 | so a method with any other name cannot be parallelized 10 | without wrapping the module as this class does. 11 | 12 | Examples: 13 | >>> class A(torch.nn.Module): 14 | ... def foo(self, x): 15 | ... ... 16 | >>> model = A() 17 | >>> model = ForwardAdaptor(model, "foo") 18 | >>> model = torch.nn.DataParallel(model, device_ids=[0, 1]) 19 | >>> x = torch.randn(2, 10) 20 | >>> model(x) 21 | """ 22 | 23 | def __init__(self, module: torch.nn.Module, name: str): 24 | assert check_argument_types() 25 | super().__init__() 26 | self.module = module 27 | self.name = name 28 | if not hasattr(module, name): 29 | raise ValueError(f"{module} doesn't have {name}") 30 | 31 | def forward(self, *args, **kwargs): 32 | func = getattr(self.module, self.name) 33 | return func(*args, **kwargs) 34 | -------------------------------------------------------------------------------- /espnet2/torch_utils/get_layer_from_string.py: -------------------------------------------------------------------------------- 1 | import difflib 2 | 3 | import torch 4 | 5 | 6 | def get_layer(l_name, library=torch.nn): 7 | """Return layer object handler from library e.g. from torch.nn 8 | 9 | E.g. if l_name=="elu", returns torch.nn.ELU. 10 | 11 | Args: 12 | l_name (string): Case insensitive name for layer in library (e.g. 'elu'). 13 | library (module): Library/module in which to search for the object handler 14 | named l_name, e.g. torch.nn. 15 | 16 | Returns: 17 | layer_handler (object): handler for the requested layer e.g.
(torch.nn.ELU) 18 | 19 | """ 20 | 21 | all_torch_layers = [x for x in dir(library)] 22 | match = [x for x in all_torch_layers if l_name.lower() == x.lower()] 23 | if len(match) == 0: 24 | close_matches = difflib.get_close_matches( 25 | l_name, [x.lower() for x in all_torch_layers] 26 | ) 27 | raise NotImplementedError( 28 | "Layer with name {} not found in {}.\n Closest matches: {}".format( 29 | l_name, str(library), close_matches 30 | ) 31 | ) 32 | elif len(match) > 1: 33 | close_matches = difflib.get_close_matches( 34 | l_name, [x.lower() for x in all_torch_layers] 35 | ) 36 | raise NotImplementedError( 37 | "Multiple matches for layer with name {} found in {}.\n " 38 | "All matches: {}".format(l_name, str(library), close_matches) 39 | ) 40 | else: 41 | # valid 42 | layer_handler = getattr(library, match[0]) 43 | return layer_handler 44 | -------------------------------------------------------------------------------- /espnet2/torch_utils/pytorch_version.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def pytorch_cudnn_version() -> str: 5 | message = ( 6 | f"pytorch.version={torch.__version__}, " 7 | f"cuda.available={torch.cuda.is_available()}, " 8 | ) 9 | 10 | if torch.backends.cudnn.enabled: 11 | message += ( 12 | f"cudnn.version={torch.backends.cudnn.version()}, " 13 | f"cudnn.benchmark={torch.backends.cudnn.benchmark}, " 14 | f"cudnn.deterministic={torch.backends.cudnn.deterministic}" 15 | ) 16 | return message 17 | -------------------------------------------------------------------------------- /espnet2/torch_utils/recursive_op.py: -------------------------------------------------------------------------------- 1 | """Torch utility module.""" 2 | import torch 3 | 4 | if torch.distributed.is_available(): 5 | from torch.distributed import ReduceOp 6 | 7 | 8 | def recursive_sum(obj, weight: torch.Tensor, distributed: bool = False): 9 | assert weight.dim() == 1, weight.size() 10 | if isinstance(obj, (tuple, list)): 11 | return type(obj)(recursive_sum(v, weight, distributed) for v in obj) 12 | elif isinstance(obj, dict): 13 | return {k: recursive_sum(v, weight, distributed) for k, v in obj.items()} 14 | elif isinstance(obj, torch.Tensor): 15 | assert obj.size() == weight.size(), (obj.size(), weight.size()) 16 | obj = (obj * weight.type(obj.dtype)).sum() 17 | if distributed: 18 | torch.distributed.all_reduce(obj, op=ReduceOp.SUM) 19 | return obj 20 | elif obj is None: 21 | return None 22 | else: 23 | raise ValueError(type(obj)) 24 | 25 | 26 | def recursive_divide(a, b: torch.Tensor): 27 | if isinstance(a, (tuple, list)): 28 | return type(a)(recursive_divide(v, b) for v in a) 29 | elif isinstance(a, dict): 30 | return {k: recursive_divide(v, b) for k, v in a.items()} 31 | elif isinstance(a, torch.Tensor): 32 | assert a.size() == b.size(), (a.size(), b.size()) 33 | return a / b.type(a.dtype) 34 | elif a is None: 35 | return None 36 | else: 37 | raise ValueError(type(a)) 38 | 39 | 40 | def recursive_average(obj, weight: torch.Tensor, distributed: bool = False): 41 | obj = recursive_sum(obj, weight, distributed) 42 | weight = weight.sum() 43 | if distributed: 44 | torch.distributed.all_reduce(weight, op=ReduceOp.SUM) 45 | # Normalize weight to sum to 1 46 | obj = recursive_divide(obj, weight) 47 | return obj, weight 48 | -------------------------------------------------------------------------------- /espnet2/torch_utils/set_all_random_seed.py:
-------------------------------------------------------------------------------- 1 | import random 2 | 3 | import numpy as np 4 | import torch 5 | 6 | 7 | def set_all_random_seed(seed: int): 8 | random.seed(seed) 9 | np.random.seed(seed) 10 | torch.random.manual_seed(seed) 11 | -------------------------------------------------------------------------------- /espnet2/train/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/espnet2/train/__init__.py -------------------------------------------------------------------------------- /espnet2/train/abs_espnet_model.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import Dict, Tuple 3 | 4 | import torch 5 | 6 | 7 | class AbsESPnetModel(torch.nn.Module, ABC): 8 | """The common abstract class among all tasks 9 | 10 | "ESPnetModel" refers to a class that inherits torch.nn.Module, 11 | holds the DNN models it forwards to as member fields 12 | (a.k.a. the delegate pattern), 13 | and defines "loss", "stats", and "weight" for the task. 14 | 15 | If you intend to implement a new task in ESPnet, 16 | the model must inherit this class. 17 | In other words, the "mediator" objects between 18 | our training system and your task class are 19 | just these three values: loss, stats, and weight. 20 | 21 | Example: 22 | >>> from espnet2.tasks.abs_task import AbsTask 23 | >>> class YourESPnetModel(AbsESPnetModel): 24 | ...     def forward(self, input, input_lengths): 25 | ...         ... 26 | ...         return loss, stats, weight 27 | >>> class YourTask(AbsTask): 28 | ...     @classmethod 29 | ...     def build_model(cls, args: argparse.Namespace) -> YourESPnetModel: 30 | """ 31 | 32 | @abstractmethod 33 | def forward( 34 | self, **batch: torch.Tensor 35 | ) -> Tuple[torch.Tensor, Dict[str, torch.Tensor], torch.Tensor]: 36 | raise NotImplementedError 37 | 38 | @abstractmethod 39 | def collect_feats(self, **batch: torch.Tensor) -> Dict[str, torch.Tensor]: 40 | raise NotImplementedError 41 | -------------------------------------------------------------------------------- /espnet2/tts/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/espnet2/tts/__init__.py -------------------------------------------------------------------------------- /espnet2/tts/abs_tts.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Tomoki Hayashi 2 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 3 | 4 | """Text-to-speech abstract class.""" 5 | 6 | from abc import ABC, abstractmethod 7 | from typing import Dict, Tuple 8 | 9 | import torch 10 | 11 | 12 | class AbsTTS(torch.nn.Module, ABC): 13 | """TTS abstract class.""" 14 | 15 | @abstractmethod 16 | def forward( 17 | self, 18 | text: torch.Tensor, 19 | text_lengths: torch.Tensor, 20 | feats: torch.Tensor, 21 | feats_lengths: torch.Tensor, 22 | **kwargs, 23 | ) -> Tuple[torch.Tensor, Dict[str, torch.Tensor], torch.Tensor]: 24 | """Calculate outputs and return the loss tensor.""" 25 | raise NotImplementedError 26 | 27 | @abstractmethod 28 | def inference( 29 | self, 30 | text: torch.Tensor, 31 | **kwargs, 32 | ) -> Dict[str, torch.Tensor]: 33 | """Return predicted output as a dict.""" 34 | raise NotImplementedError 35 |
36 | @property 37 | def require_raw_speech(self): 38 | """Return whether or not raw_speech is required.""" 39 | return False 40 | 41 | @property 42 | def require_vocoder(self): 43 | """Return whether or not vocoder is required.""" 44 | return True 45 | -------------------------------------------------------------------------------- /espnet2/tts/fastspeech/__init__.py: -------------------------------------------------------------------------------- 1 | from espnet2.tts.fastspeech.fastspeech import FastSpeech # NOQA 2 | -------------------------------------------------------------------------------- /espnet2/tts/fastspeech2/__init__.py: -------------------------------------------------------------------------------- 1 | from espnet2.tts.fastspeech2.fastspeech2 import FastSpeech2 # NOQA 2 | -------------------------------------------------------------------------------- /espnet2/tts/feats_extract/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/espnet2/tts/feats_extract/__init__.py -------------------------------------------------------------------------------- /espnet2/tts/feats_extract/abs_feats_extract.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import Any, Dict, Tuple 3 | 4 | import torch 5 | 6 | 7 | class AbsFeatsExtract(torch.nn.Module, ABC): 8 | @abstractmethod 9 | def output_size(self) -> int: 10 | raise NotImplementedError 11 | 12 | @abstractmethod 13 | def get_parameters(self) -> Dict[str, Any]: 14 | raise NotImplementedError 15 | 16 | @abstractmethod 17 | def forward( 18 | self, input: torch.Tensor, input_lengths: torch.Tensor 19 | ) -> Tuple[torch.Tensor, torch.Tensor]: 20 | raise NotImplementedError 21 | -------------------------------------------------------------------------------- /espnet2/tts/gst/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/espnet2/tts/gst/__init__.py -------------------------------------------------------------------------------- /espnet2/tts/prodiff/__init__.py: -------------------------------------------------------------------------------- 1 | from espnet2.tts.prodiff.prodiff import ProDiff # NOQA 2 | -------------------------------------------------------------------------------- /espnet2/tts/tacotron2/__init__.py: -------------------------------------------------------------------------------- 1 | from espnet2.tts.tacotron2.tacotron2 import Tacotron2 # NOQA 2 | -------------------------------------------------------------------------------- /espnet2/tts/transformer/__init__.py: -------------------------------------------------------------------------------- 1 | from espnet2.tts.transformer.transformer import Transformer # NOQA 2 | -------------------------------------------------------------------------------- /espnet2/tts/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from espnet2.tts.utils.duration_calculator import DurationCalculator 2 | from espnet2.tts.utils.parallel_wavegan_pretrained_vocoder import ( 3 | ParallelWaveGANPretrainedVocoder, 4 | ) 5 | 6 | __all__ = ["DurationCalculator", "ParallelWaveGANPretrainedVocoder"] 7 | 
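Tying the abstract classes above together: a concrete task model implements the (loss, stats, weight) contract of abs_espnet_model.py. A hedged sketch (the class, the 80-dim feature size, and the use of batch size as the weight are illustrative assumptions, not code from this repo):

from typing import Dict, Tuple

import torch

from espnet2.train.abs_espnet_model import AbsESPnetModel


class ToyESPnetModel(AbsESPnetModel):
    """Hypothetical model: reconstructs 80-dim features with an MSE loss."""

    def __init__(self):
        super().__init__()
        self.proj = torch.nn.Linear(80, 80)

    def forward(
        self, feats: torch.Tensor, feats_lengths: torch.Tensor
    ) -> Tuple[torch.Tensor, Dict[str, torch.Tensor], torch.Tensor]:
        loss = torch.nn.functional.mse_loss(self.proj(feats), feats)
        stats = {"loss": loss.detach()}
        # The trainer averages stats weighted by this value; batch size
        # is the usual choice.
        weight = torch.tensor(float(feats.size(0)))
        return loss, stats, weight

    def collect_feats(
        self, feats: torch.Tensor, feats_lengths: torch.Tensor
    ) -> Dict[str, torch.Tensor]:
        return {"feats": feats, "feats_lengths": feats_lengths}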
-------------------------------------------------------------------------------- /espnet2/tts/utils/parallel_wavegan_pretrained_vocoder.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Tomoki Hayashi 2 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 3 | 4 | """Wrapper class for the vocoder model trained with parallel_wavegan repo.""" 5 | 6 | import logging 7 | import os 8 | from pathlib import Path 9 | from typing import Optional, Union 10 | 11 | import torch 12 | import yaml 13 | 14 | 15 | class ParallelWaveGANPretrainedVocoder(torch.nn.Module): 16 | """Wrapper class to load the vocoder trained with parallel_wavegan repo.""" 17 | 18 | def __init__( 19 | self, 20 | model_file: Union[Path, str], 21 | config_file: Optional[Union[Path, str]] = None, 22 | ): 23 | """Initialize ParallelWaveGANPretrainedVocoder module.""" 24 | super().__init__() 25 | try: 26 | from parallel_wavegan.utils import load_model 27 | except ImportError: 28 | logging.error( 29 | "`parallel_wavegan` is not installed. " 30 | "Please install via `pip install -U parallel_wavegan`." 31 | ) 32 | raise 33 | if config_file is None: 34 | dirname = os.path.dirname(str(model_file)) 35 | config_file = os.path.join(dirname, "config.yml") 36 | with open(config_file) as f: 37 | config = yaml.load(f, Loader=yaml.Loader) 38 | self.fs = config["sampling_rate"] 39 | self.vocoder = load_model(model_file, config) 40 | if hasattr(self.vocoder, "remove_weight_norm"): 41 | self.vocoder.remove_weight_norm() 42 | self.normalize_before = False 43 | if hasattr(self.vocoder, "mean"): 44 | self.normalize_before = True 45 | 46 | @torch.no_grad() 47 | def forward(self, feats: torch.Tensor) -> torch.Tensor: 48 | """Generate waveform with pretrained vocoder. 49 | 50 | Args: 51 | feats (Tensor): Feature tensor (T_feats, #mels). 52 | 53 | Returns: 54 | Tensor: Generated waveform tensor (T_wav). 
55 | 56 | """ 57 | return self.vocoder.inference( 58 | feats, 59 | normalize_before=self.normalize_before, 60 | ).view(-1) 61 | -------------------------------------------------------------------------------- /espnet2/uasr/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/espnet2/uasr/__init__.py -------------------------------------------------------------------------------- /espnet2/uasr/discriminator/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/espnet2/uasr/discriminator/__init__.py -------------------------------------------------------------------------------- /espnet2/uasr/discriminator/abs_discriminator.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | 3 | import torch 4 | 5 | 6 | class AbsDiscriminator(torch.nn.Module, ABC): 7 | @abstractmethod 8 | def forward( 9 | self, 10 | xs_pad: torch.Tensor, 11 | padding_mask: torch.Tensor, 12 | ) -> torch.Tensor: 13 | raise NotImplementedError 14 | -------------------------------------------------------------------------------- /espnet2/uasr/generator/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/espnet2/uasr/generator/__init__.py -------------------------------------------------------------------------------- /espnet2/uasr/generator/abs_generator.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import Optional, Tuple 3 | 4 | import torch 5 | 6 | 7 | class AbsGenerator(torch.nn.Module, ABC): 8 | @abstractmethod 9 | def output_size(self) -> int: 10 | raise NotImplementedError 11 | 12 | @abstractmethod 13 | def forward( 14 | self, 15 | xs_pad: torch.Tensor, 16 | ilens: torch.Tensor, 17 | ) -> Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]: 18 | raise NotImplementedError 19 | -------------------------------------------------------------------------------- /espnet2/uasr/loss/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/espnet2/uasr/loss/__init__.py -------------------------------------------------------------------------------- /espnet2/uasr/loss/abs_loss.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | 3 | import torch 4 | 5 | EPS = torch.finfo(torch.get_default_dtype()).eps 6 | 7 | 8 | class AbsUASRLoss(torch.nn.Module, ABC): 9 | """Base class for all UASR loss modules.""" 10 | 11 | # the name will be the key that appears in the reporter 12 | @property 13 | def name(self) -> str: 14 | raise NotImplementedError 15 | 16 | @abstractmethod 17 | def forward( 18 | self, 19 | ) -> torch.Tensor: 20 | # the returned tensor should have shape (batch,) 21 | raise NotImplementedError 22 | -------------------------------------------------------------------------------- /espnet2/uasr/loss/discriminator_loss.py: -------------------------------------------------------------------------------- 1 | import
torch 2 | import torch.nn.functional as F 3 | from typeguard import check_argument_types 4 | 5 | from espnet2.uasr.loss.abs_loss import AbsUASRLoss 6 | from espnet2.utils.types import str2bool 7 | 8 | 9 | class UASRDiscriminatorLoss(AbsUASRLoss): 10 | """discriminator loss for UASR.""" 11 | 12 | def __init__( 13 | self, 14 | weight: float = 1.0, 15 | smoothing: float = 0.0, 16 | smoothing_one_side: str2bool = False, 17 | reduction: str = "sum", 18 | ): 19 | super().__init__() 20 | assert check_argument_types() 21 | self.weight = weight 22 | self.smoothing = smoothing 23 | self.smoothing_one_sided = smoothing_one_side 24 | self.reduction = reduction 25 | 26 | def forward( 27 | self, 28 | dense_y: torch.Tensor, 29 | token_y: torch.Tensor, 30 | is_discriminative_step: str2bool, 31 | ): 32 | """Forward. 33 | 34 | Args: 35 | dense_y: predicted logits of generated samples 36 | token_y: predicted logits of real samples 37 | """ 38 | if self.weight > 0: 39 | fake_smooth = self.smoothing 40 | real_smooth = self.smoothing 41 | if self.smoothing_one_sided: 42 | fake_smooth = 0 43 | 44 | if is_discriminative_step: 45 | loss_dense = F.binary_cross_entropy_with_logits( 46 | dense_y, 47 | dense_y.new_ones(dense_y.shape) - fake_smooth, 48 | reduction=self.reduction, 49 | ) 50 | loss_token = F.binary_cross_entropy_with_logits( 51 | token_y, 52 | token_y.new_zeros(token_y.shape) + real_smooth, 53 | reduction=self.reduction, 54 | ) 55 | else: 56 | loss_dense = F.binary_cross_entropy_with_logits( 57 | dense_y, 58 | dense_y.new_zeros(dense_y.shape) + fake_smooth, 59 | reduction=self.reduction, 60 | ) 61 | loss_token = None 62 | 63 | return loss_dense, loss_token 64 | else: 65 | return 0 66 | -------------------------------------------------------------------------------- /espnet2/uasr/loss/phoneme_diversity_loss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from typeguard import check_argument_types 3 | 4 | from espnet2.uasr.loss.abs_loss import AbsUASRLoss 5 | from espnet2.utils.types import str2bool 6 | 7 | 8 | class UASRPhonemeDiversityLoss(AbsUASRLoss): 9 | """phoneme diversity loss for UASR.""" 10 | 11 | def __init__( 12 | self, 13 | weight: float = 1.0, 14 | ): 15 | super().__init__() 16 | assert check_argument_types() 17 | 18 | self.weight = weight 19 | 20 | def forward( 21 | self, dense_x: torch.Tensor, sample_size: int, is_discriminative_step: str2bool 22 | ): 23 | """Forward. 
24 | 25 | Args: 26 | dense_x: predicted logits of generated samples 27 | sample_size: batch size 28 | is_discriminative_step: whether the current step trains the discriminator 29 | """ 30 | if self.weight > 0 and not is_discriminative_step: 31 | batch_size, time_length, channel_size = dense_x.shape 32 | 33 | avg_probs = torch.softmax( 34 | dense_x.reshape(-1, channel_size).float(), dim=-1 35 | ).mean(dim=0) 36 | phoneme_ppl = torch.exp( 37 | -torch.sum(avg_probs * torch.log(avg_probs + 1e-7), dim=-1) 38 | ) 39 | phoneme_diversity_loss = ( 40 | (channel_size - phoneme_ppl) / channel_size 41 | ) * sample_size 42 | 43 | return phoneme_diversity_loss 44 | else: 45 | return 0 46 | -------------------------------------------------------------------------------- /espnet2/uasr/loss/pseudo_label_loss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from typeguard import check_argument_types 4 | 5 | from espnet2.uasr.loss.abs_loss import AbsUASRLoss 6 | from espnet2.utils.types import str2bool 7 | 8 | 9 | class UASRPseudoLabelLoss(AbsUASRLoss): 10 | """auxiliary pseudo label loss for UASR.""" 11 | 12 | def __init__( 13 | self, 14 | weight: float = 1.0, 15 | input_dim: int = 128, 16 | output_dim: int = 64, 17 | downsample_rate: int = 2, 18 | ignore_index: int = -1, 19 | reduction: str = "none", 20 | ): 21 | super().__init__() 22 | assert check_argument_types() 23 | 24 | self.weight = weight 25 | self.input_dim = input_dim 26 | self.output_dim = output_dim 27 | self.downsample_rate = downsample_rate 28 | self.ignore_index = ignore_index 29 | self.reduction = reduction 30 | 31 | if self.weight > 0: 32 | self.decoder = torch.nn.Linear(self.input_dim, self.output_dim) 33 | 34 | def forward( 35 | self, 36 | inter_x: torch.Tensor, 37 | pseudo_labels: torch.Tensor, 38 | is_discriminative_step: str2bool, 39 | ): 40 | """Forward. 41 | 42 | Args: 43 | inter_x: intermediate output of the generator 44 | pseudo_labels: frame-level pseudo labels for auxiliary supervision 45 | is_discriminative_step: whether the current step trains the discriminator 46 | """ 47 | if self.weight > 0 and not is_discriminative_step and pseudo_labels is not None: 48 | inter_x = self.decoder(inter_x) 49 | 50 | if self.downsample_rate > 1: 51 | pseudo_labels = pseudo_labels[:, :: self.downsample_rate] 52 | valid_time_length = min(pseudo_labels.shape[1], inter_x.shape[1]) 53 | pseudo_label_loss = F.cross_entropy( 54 | inter_x[:, :valid_time_length].transpose(1, 2), 55 | pseudo_labels[:, :valid_time_length], 56 | ignore_index=self.ignore_index, 57 | reduction=self.reduction, 58 | ) 59 | pseudo_label_loss = pseudo_label_loss.mean() * pseudo_label_loss.shape[0] 60 | 61 | return pseudo_label_loss 62 | else: 63 | return 0 64 | -------------------------------------------------------------------------------- /espnet2/uasr/loss/smoothness_penalty.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from typeguard import check_argument_types 4 | 5 | from espnet2.uasr.loss.abs_loss import AbsUASRLoss 6 | 7 | 8 | class UASRSmoothnessPenalty(AbsUASRLoss): 9 | """smoothness penalty for UASR.""" 10 | 11 | def __init__( 12 | self, 13 | weight: float = 1.0, 14 | reduction: str = "none", 15 | ): 16 | super().__init__() 17 | assert check_argument_types() 18 | 19 | self.weight = weight 20 | self.reduction = reduction 21 | 22 | def forward( 23 | self, 24 | dense_logits: torch.Tensor, 25 | dense_padding_mask: torch.Tensor, 26 | sample_size: int, 27 | is_discriminative_step: bool, 28 | ): 29 | """Forward.
30 | 31 | Args: 32 | dense_logits: output logits of the generator 33 | dense_padding_mask: padding mask of the logits 34 | sample_size: batch size 35 | is_discriminative_step: whether the current step trains the discriminator 36 | """ 37 | if self.weight > 0 and not is_discriminative_step: 38 | smoothness_penalty = F.mse_loss( 39 | dense_logits[:, :-1], dense_logits[:, 1:], reduction=self.reduction 40 | ) 41 | smoothness_penalty[dense_padding_mask[:, 1:]] = 0 42 | smoothness_penalty = smoothness_penalty.mean() * sample_size 43 | 44 | return smoothness_penalty 45 | else: 46 | return 0 47 | -------------------------------------------------------------------------------- /espnet2/uasr/segmenter/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/espnet2/uasr/segmenter/__init__.py -------------------------------------------------------------------------------- /espnet2/uasr/segmenter/abs_segmenter.py: -------------------------------------------------------------------------------- 1 | """ 2 | Segmenter definition for the UASR task 3 | 4 | Practically, the generator output (at frame level) may 5 | predict the same phoneme for consecutive frames, which makes 6 | it too easy for the discriminator. So the segmenter here 7 | merges consecutive frames that get similar predictions from the generator. 8 | """ 9 | 10 | from abc import ABC, abstractmethod 11 | 12 | import torch 13 | 14 | 15 | class AbsSegmenter(torch.nn.Module, ABC): 16 | @abstractmethod 17 | def pre_segment( 18 | self, 19 | xs_pad: torch.Tensor, 20 | ilens: torch.Tensor, 21 | ) -> torch.Tensor: 22 | raise NotImplementedError 23 | 24 | @abstractmethod 25 | def logit_segment( 26 | self, 27 | xs_pad: torch.Tensor, 28 | ilens: torch.Tensor, 29 | ) -> torch.Tensor: 30 | raise NotImplementedError 31 | -------------------------------------------------------------------------------- /espnet2/uasr/segmenter/random_segmenter.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | import torch 4 | from typeguard import check_argument_types 5 | 6 | from espnet2.uasr.segmenter.abs_segmenter import AbsSegmenter 7 | from espnet2.utils.types import str2bool 8 | 9 | 10 | class RandomSegmenter(AbsSegmenter): 11 | def __init__( 12 | self, 13 | subsample_rate: float = 0.25, 14 | mean_pool: str2bool = True, 15 | mean_join_pool: str2bool = False, 16 | remove_zeros: str2bool = False, 17 | ): 18 | super().__init__() 19 | assert check_argument_types() 20 | self.subsample_rate = subsample_rate 21 | 22 | def pre_segment( 23 | self, 24 | xs_pad: torch.Tensor, 25 | padding_mask: torch.Tensor, 26 | ) -> torch.Tensor: 27 | target_num = math.ceil(xs_pad.size(1) * self.subsample_rate) 28 | ones = torch.ones(xs_pad.shape[:-1], device=xs_pad.device) 29 | indices, _ = ones.multinomial(target_num).sort(dim=-1) 30 | indices_ld = indices.unsqueeze(-1).expand(-1, -1, xs_pad.size(-1)) 31 | xs_pad = xs_pad.gather(1, indices_ld) 32 | padding_mask = padding_mask.gather(1, index=indices) 33 | return xs_pad, padding_mask 34 | 35 | def logit_segment( 36 | self, 37 | xs_pad: torch.Tensor, 38 | padding_mask: torch.Tensor, 39 | ) -> torch.Tensor: 40 | return xs_pad, padding_mask 41 | -------------------------------------------------------------------------------- /espnet2/utils/__init__.py: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/espnet2/utils/__init__.py -------------------------------------------------------------------------------- /espnet2/utils/build_dataclass.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import dataclasses 3 | 4 | from typeguard import check_type 5 | 6 | 7 | def build_dataclass(dataclass, args: argparse.Namespace): 8 | """Helper function to build dataclass from 'args'.""" 9 | kwargs = {} 10 | for field in dataclasses.fields(dataclass): 11 | if not hasattr(args, field.name): 12 | raise ValueError( 13 | f"args doesn't have {field.name}. You need to add it to the ArgumentParser." 14 | ) 15 | check_type(field.name, getattr(args, field.name), field.type) 16 | kwargs[field.name] = getattr(args, field.name) 17 | return dataclass(**kwargs) 18 | -------------------------------------------------------------------------------- /espnet2/utils/config_argparse.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from pathlib import Path 3 | 4 | import yaml 5 | 6 | 7 | class ArgumentParser(argparse.ArgumentParser): 8 | """Simple implementation of ArgumentParser supporting a config file 9 | 10 | This class originated from https://github.com/bw2/ConfigArgParse, 11 | but it lacks some of the features that package has, and differs as follows: 12 | 13 | - Multiple config files are not supported 14 | - "--config" is added automatically as an option. 15 | - No formats other than yaml are supported 16 | - Argument types are not checked 17 | 18 | """ 19 | 20 | def __init__(self, *args, **kwargs): 21 | super().__init__(*args, **kwargs) 22 | self.add_argument("--config", help="Give config file in yaml format") 23 | 24 | def parse_known_args(self, args=None, namespace=None): 25 | # Parse once beforehand to pick up "--config" 26 | _args, _ = super().parse_known_args(args, namespace) 27 | if _args.config is not None: 28 | if not Path(_args.config).exists(): 29 | self.error(f"No such file: {_args.config}") 30 | 31 | with open(_args.config, "r", encoding="utf-8") as f: 32 | d = yaml.safe_load(f) 33 | if not isinstance(d, dict): 34 | self.error(f"Config file has non-dict value: {_args.config}") 35 | 36 | for key in d: 37 | for action in self._actions: 38 | if key == action.dest: 39 | break 40 | else: 41 | self.error(f"unrecognized arguments: {key} (from {_args.config})") 42 | 43 | # NOTE(kamo): Ignore "--config" from a config file 44 | # NOTE(kamo): Unlike "configargparse", this module doesn't check types, 45 | # i.e. any value can be set regardless of the declared argument type. 46 | self.set_defaults(**d) 47 | return super().parse_known_args(args, namespace) 48 | -------------------------------------------------------------------------------- /espnet2/utils/get_default_kwargs.py: -------------------------------------------------------------------------------- 1 | import inspect 2 | 3 | 4 | class Invalid: 5 | """Marker object for a non-serializable object""" 6 | 7 | 8 | def get_default_kwargs(func): 9 | """Get the default values of the input function.
--------------------------------------------------------------------------------
/espnet2/utils/get_default_kwargs.py:
--------------------------------------------------------------------------------
 1 | import inspect
 2 | 
 3 | 
 4 | class Invalid:
 5 |     """Marker object for a non-YAML-serializable value"""
 6 | 
 7 | 
 8 | def get_default_kwargs(func):
 9 |     """Get the default values of the input function.
10 | 
11 |     Examples:
12 |         >>> def func(a, b=3):  pass
13 |         >>> get_default_kwargs(func)
14 |         {'b': 3}
15 | 
16 |     """
17 | 
18 |     def yaml_serializable(value):
19 |         # isinstance(x, tuple) includes namedtuple, so type is used here
20 |         if type(value) is tuple:
21 |             return yaml_serializable(list(value))
22 |         elif isinstance(value, set):
23 |             return yaml_serializable(list(value))
24 |         elif isinstance(value, dict):
25 |             if not all(isinstance(k, str) for k in value):
26 |                 return Invalid
27 |             retval = {}
28 |             for k, v in value.items():
29 |                 v2 = yaml_serializable(v)
30 |                 # Register only valid objects
31 |                 if v2 not in (Invalid, inspect.Parameter.empty):
32 |                     retval[k] = v2
33 |             return retval
34 |         elif isinstance(value, list):
35 |             retval = []
36 |             for v in value:
37 |                 v2 = yaml_serializable(v)
38 |                 # If any element in the list is invalid,
39 |                 # the whole list becomes invalid
40 |                 if v2 is Invalid:
41 |                     return Invalid
42 |                 else:
43 |                     retval.append(v2)
44 |             return retval
45 |         elif value in (inspect.Parameter.empty, None):
46 |             return value
47 |         elif isinstance(value, (float, int, complex, bool, str, bytes)):
48 |             return value
49 |         else:
50 |             return Invalid
51 | 
52 |     # params: An ordered mapping of inspect.Parameter
53 |     params = inspect.signature(func).parameters
54 |     data = {p.name: p.default for p in params.values()}
55 |     # Remove objects that are not YAML-serializable
56 |     data = yaml_serializable(data)
57 |     return data
--------------------------------------------------------------------------------
/espnet2/utils/kwargs2args.py:
--------------------------------------------------------------------------------
 1 | import inspect
 2 | 
 3 | 
 4 | def func(a: int, b, *, c, **kwargs):
 5 |     # NOTE: dummy function; not used by kwargs2args below.
 6 |     pass
 7 | 
 8 | 
 9 | def kwargs2args(func, kwargs):
10 |     """Convert a kwargs dict to a positional-argument tuple for 'func'.
11 | 
12 |     The tuple is ordered by the signature of 'func' and truncated at the
13 |     first parameter that 'kwargs' does not provide.
14 |     """
15 |     parameters = inspect.signature(func).parameters
16 |     d = {k: i for i, k in enumerate(parameters)}
17 |     args = [None for i in range(len(parameters))]
18 |     for k, v in kwargs.items():
19 |         if k in d:
20 |             args[d[k]] = v
21 | 
22 |     for i, v in enumerate(args):
23 |         if v is None:
24 |             break
25 |     else:
26 |         # No gap found: every parameter was provided, so keep them all.
27 |         i = len(args)
28 | 
29 |     return tuple(args[:i])
--------------------------------------------------------------------------------
/espnet2/utils/yaml_no_alias_safe_dump.py:
--------------------------------------------------------------------------------
 1 | import yaml
 2 | 
 3 | 
 4 | class NoAliasSafeDumper(yaml.SafeDumper):
 5 |     # Disable anchors/aliases in the YAML output because they are hard to read
 6 |     def ignore_aliases(self, data):
 7 |         return True
 8 | 
 9 | 
10 | def yaml_no_alias_safe_dump(data, stream=None, **kwargs):
11 |     """Safe-dump in yaml with no anchor/alias"""
12 |     return yaml.dump(
13 |         data, stream, allow_unicode=True, Dumper=NoAliasSafeDumper, **kwargs
14 |     )
15 | 
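A small sanity check tying the three helpers above together (a sketch; the function f and its default values are ours):

    from espnet2.utils.get_default_kwargs import get_default_kwargs
    from espnet2.utils.kwargs2args import kwargs2args
    from espnet2.utils.yaml_no_alias_safe_dump import yaml_no_alias_safe_dump

    def f(a, b=3, c=(1, 2), d=print):
        pass

    defaults = get_default_kwargs(f)
    # 'a' has no default and d=print is not YAML-serializable, so both are
    # dropped; the tuple default is converted to a list.
    print(defaults)                            # {'b': 3, 'c': [1, 2]}
    print(yaml_no_alias_safe_dump(defaults))   # plain YAML without anchors/aliases

    # kwargs2args stops at the first parameter that the kwargs do not cover.
    print(kwargs2args(f, {"a": 1, "b": 2}))    # (1, 2)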
--------------------------------------------------------------------------------
/pretrained/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/pretrained/.gitkeep
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | # espnet==202308
 2 | espnet_tts_frontend
 3 | faster-whisper==0.9.0
 4 | gradio==3.44.4
 5 | h5py
 6 | jaconv==0.3.4
 7 | jamo==0.4.1
 8 | kaldiio==2.18.0
 9 | librosa==0.10.1
10 | # pyopenjtalk==0.3.2
11 | pyopenjtalk-prebuilt==0.3.0
12 | pyworld==0.3.4
13 | sentencepiece==0.1.99
14 | tensorboard==2.14
15 | torch_complex
16 | typeguard==2.13.3
--------------------------------------------------------------------------------
/setup.bat:
--------------------------------------------------------------------------------
 1 | @echo off
 2 | 
 3 | echo Creating virtual environment...
 4 | python -m venv venv
 5 | 
 6 | if errorlevel 1 (
 7 |     echo Error: Failed to create virtual environment.
 8 |     exit /b
 9 | )
10 | 
11 | echo Installing torch...
12 | venv\Scripts\pip3 install torch torchaudio --index-url https://download.pytorch.org/whl/cu118
13 | 
14 | if errorlevel 1 (
15 |     echo Error: Failed to install torch and torchaudio.
16 |     exit /b
17 | )
18 | 
19 | echo Installing packages from requirements.txt...
20 | venv\Scripts\pip install -r requirements.txt
21 | 
22 | if errorlevel 1 (
23 |     echo Error: Failed to install packages.
24 |     exit /b
25 | )
26 | 
27 | echo Downloading pretrained model...
28 | if not exist pretrained mkdir pretrained
29 | curl -L "https://huggingface.co/litagin/vits-japros-pretrained/resolve/main/pretrained.pth" -o "pretrained\pretrained.pth"
30 | 
31 | if errorlevel 1 (
32 |     echo Error: Failed to download pretrained model.
33 |     exit /b
34 | )
35 | 
36 | if not exist "weights\pretrained\" mkdir "weights\pretrained\"
37 | 
38 | if not exist "weights\pretrained\pretrained.pth" (
39 |     echo Copying pretrained model to weights/pretrained/...
40 |     copy "pretrained\pretrained.pth" "weights\pretrained\pretrained.pth"
41 | )
42 | 
43 | echo Setup complete.
44 | 
45 | pause
--------------------------------------------------------------------------------
/train.py:
--------------------------------------------------------------------------------
 1 | import argparse
 2 | import subprocess
 3 | import sys
 4 | 
 5 | python = sys.executable
 6 | 
 7 | 
 8 | def run_train(
 9 |     model_name: str,
10 |     max_epoch: int = 200,
11 |     batch_bins: int = 1000000,
12 |     output_dir: str = "outputs",
13 | ) -> str:
14 |     cmd = [python, "-m", "espnet2.bin.gan_tts_train"]
15 |     from conf.train_args import train_args
16 | 
17 |     # Format a copy rather than mutating the imported list in place, so that
18 |     # repeated calls (e.g. from the webui) always start from the templates.
19 |     formatted_args = [
20 |         arg.format(model_name=model_name, output_dir=output_dir) for arg in train_args
21 |     ]
22 | 
23 |     cmd.extend(formatted_args)
24 |     cmd.extend(["--batch_bins", str(batch_bins)])
25 |     cmd.extend(["--max_epoch", str(max_epoch)])
26 | 
27 |     print(" ".join(cmd))
28 |     print("Submitted to subprocess.")
29 |     subprocess.Popen(cmd, stdout=sys.stdout, stderr=sys.stdout)
30 | 
31 |     return "Training has started. See the terminal and TensorBoard for details."
32 | 
33 | 
34 | if __name__ == "__main__":
35 |     parser = argparse.ArgumentParser()
36 |     parser.add_argument("--model-name", type=str, required=True)
37 |     parser.add_argument("--max-epoch", type=int, default=200)
38 |     parser.add_argument("--batch-bins", type=int, default=1000000)
39 |     parser.add_argument("--output-dir", type=str, default="outputs")
40 |     args = parser.parse_args()
41 | 
42 |     run_train(args.model_name, args.max_epoch, args.batch_bins, args.output_dir)
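For reference, train.py can also be invoked directly from a terminal; the model name below is a placeholder and the remaining values simply restate the defaults:

    python train.py --model-name my_model --max-epoch 200 --batch-bins 1000000 --output-dir outputs

Note that run_train() only submits the training subprocess and returns immediately; progress appears in the terminal and in TensorBoard.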
--------------------------------------------------------------------------------
/update.bat:
--------------------------------------------------------------------------------
 1 | @echo off
 2 | echo Updating the repository...
 3 | 
 4 | cd /d %~dp0
 5 | 
 6 | git pull
 7 | 
 8 | venv\Scripts\pip install -r requirements.txt
 9 | 
10 | echo Update complete.
11 | pause
12 | 
--------------------------------------------------------------------------------
/webui_infer.bat:
--------------------------------------------------------------------------------
 1 | @echo off
 2 | 
 3 | echo Running webui_infer.py...
 4 | venv\Scripts\python webui_infer.py
 5 | 
 6 | if errorlevel 1 (
 7 |     echo Error: Failed to run webui_infer.py.
 8 |     pause
 9 |     exit /b
10 | )
11 | 
12 | pause
13 | 
--------------------------------------------------------------------------------
/webui_train.bat:
--------------------------------------------------------------------------------
 1 | @echo off
 2 | 
 3 | echo Running webui_train.py...
 4 | venv\Scripts\python webui_train.py
 5 | 
 6 | if errorlevel 1 (
 7 |     echo Error: Failed to run webui_train.py.
 8 |     pause
 9 |     exit /b
10 | )
11 | 
12 | pause
13 | 
--------------------------------------------------------------------------------
/weights/.gitignore:
--------------------------------------------------------------------------------
1 | *
2 | !.gitignore
--------------------------------------------------------------------------------