├── .gitignore ├── LICENSE ├── README.md ├── assets ├── japros-infer.jpg └── japros-train.jpg ├── conf ├── config.yaml ├── finetune.yaml ├── tokens.txt └── train_args.py ├── data ├── README.txt ├── transcript_utf8_sample.txt └── wavs │ └── .gitkeep ├── docs └── CLI.md ├── espnet ├── __init__.py ├── asr │ ├── __init__.py │ ├── asr_mix_utils.py │ ├── asr_utils.py │ ├── chainer_backend │ │ ├── __init__.py │ │ └── asr.py │ └── pytorch_backend │ │ ├── __init__.py │ │ ├── asr.py │ │ ├── asr_init.py │ │ ├── asr_mix.py │ │ └── recog.py ├── bin │ ├── __init__.py │ ├── asr_align.py │ ├── asr_enhance.py │ ├── asr_recog.py │ ├── asr_train.py │ ├── lm_train.py │ ├── mt_train.py │ ├── mt_trans.py │ ├── st_train.py │ ├── st_trans.py │ ├── tts_decode.py │ ├── tts_train.py │ ├── vc_decode.py │ └── vc_train.py ├── distributed │ ├── __init__.py │ └── pytorch_backend │ │ └── launch.py ├── lm │ ├── __init__.py │ ├── chainer_backend │ │ ├── __init__.py │ │ ├── extlm.py │ │ └── lm.py │ ├── lm_utils.py │ └── pytorch_backend │ │ ├── __init__.py │ │ ├── extlm.py │ │ └── lm.py ├── mt │ ├── __init__.py │ ├── mt_utils.py │ └── pytorch_backend │ │ ├── __init__.py │ │ └── mt.py ├── nets │ ├── __init__.py │ ├── asr_interface.py │ ├── batch_beam_search.py │ ├── batch_beam_search_online.py │ ├── batch_beam_search_online_sim.py │ ├── beam_search.py │ ├── beam_search_timesync.py │ ├── beam_search_timesync_streaming.py │ ├── beam_search_transducer.py │ ├── chainer_backend │ │ ├── __init__.py │ │ ├── asr_interface.py │ │ ├── ctc.py │ │ ├── deterministic_embed_id.py │ │ ├── e2e_asr.py │ │ ├── e2e_asr_transformer.py │ │ ├── nets_utils.py │ │ ├── rnn │ │ │ ├── __init__.py │ │ │ ├── attentions.py │ │ │ ├── decoders.py │ │ │ ├── encoders.py │ │ │ └── training.py │ │ └── transformer │ │ │ ├── __init__.py │ │ │ ├── attention.py │ │ │ ├── ctc.py │ │ │ ├── decoder.py │ │ │ ├── decoder_layer.py │ │ │ ├── embedding.py │ │ │ ├── encoder.py │ │ │ ├── encoder_layer.py │ │ │ ├── label_smoothing_loss.py │ │ │ ├── layer_norm.py │ │ │ ├── mask.py │ │ │ ├── positionwise_feed_forward.py │ │ │ ├── subsampling.py │ │ │ └── training.py │ ├── ctc_prefix_score.py │ ├── e2e_asr_common.py │ ├── e2e_mt_common.py │ ├── lm_interface.py │ ├── mt_interface.py │ ├── pytorch_backend │ │ ├── __init__.py │ │ ├── conformer │ │ │ ├── __init__.py │ │ │ ├── argument.py │ │ │ ├── contextual_block_encoder_layer.py │ │ │ ├── convolution.py │ │ │ ├── encoder.py │ │ │ ├── encoder_layer.py │ │ │ └── swish.py │ │ ├── ctc.py │ │ ├── e2e_asr.py │ │ ├── e2e_asr_conformer.py │ │ ├── e2e_asr_maskctc.py │ │ ├── e2e_asr_mix.py │ │ ├── e2e_asr_mix_transformer.py │ │ ├── e2e_asr_mulenc.py │ │ ├── e2e_asr_transducer.py │ │ ├── e2e_asr_transformer.py │ │ ├── e2e_mt.py │ │ ├── e2e_mt_transformer.py │ │ ├── e2e_st.py │ │ ├── e2e_st_conformer.py │ │ ├── e2e_st_transformer.py │ │ ├── e2e_tts_fastspeech.py │ │ ├── e2e_tts_tacotron2.py │ │ ├── e2e_tts_transformer.py │ │ ├── e2e_vc_tacotron2.py │ │ ├── e2e_vc_transformer.py │ │ ├── fastspeech │ │ │ ├── __init__.py │ │ │ ├── duration_calculator.py │ │ │ ├── duration_predictor.py │ │ │ └── length_regulator.py │ │ ├── frontends │ │ │ ├── __init__.py │ │ │ ├── beamformer.py │ │ │ ├── dnn_beamformer.py │ │ │ ├── dnn_wpe.py │ │ │ ├── feature_transform.py │ │ │ ├── frontend.py │ │ │ └── mask_estimator.py │ │ ├── gtn_ctc.py │ │ ├── initialization.py │ │ ├── lm │ │ │ ├── __init__.py │ │ │ ├── default.py │ │ │ ├── seq_rnn.py │ │ │ └── transformer.py │ │ ├── maskctc │ │ │ ├── __init__.py │ │ │ ├── add_mask_token.py │ │ │ └── mask.py │ │ ├── 
nets_utils.py │ │ ├── rnn │ │ │ ├── __init__.py │ │ │ ├── argument.py │ │ │ ├── attentions.py │ │ │ ├── decoders.py │ │ │ └── encoders.py │ │ ├── streaming │ │ │ ├── __init__.py │ │ │ ├── segment.py │ │ │ └── window.py │ │ ├── tacotron2 │ │ │ ├── __init__.py │ │ │ ├── cbhg.py │ │ │ ├── decoder.py │ │ │ └── encoder.py │ │ ├── transducer │ │ │ ├── __init__.py │ │ │ ├── arguments.py │ │ │ ├── blocks.py │ │ │ ├── conv1d_nets.py │ │ │ ├── custom_decoder.py │ │ │ ├── custom_encoder.py │ │ │ ├── error_calculator.py │ │ │ ├── initializer.py │ │ │ ├── joint_network.py │ │ │ ├── rnn_decoder.py │ │ │ ├── rnn_encoder.py │ │ │ ├── transducer_tasks.py │ │ │ ├── transformer_decoder_layer.py │ │ │ ├── utils.py │ │ │ └── vgg2l.py │ │ ├── transformer │ │ │ ├── __init__.py │ │ │ ├── add_sos_eos.py │ │ │ ├── argument.py │ │ │ ├── attention.py │ │ │ ├── contextual_block_encoder_layer.py │ │ │ ├── decoder.py │ │ │ ├── decoder_layer.py │ │ │ ├── dynamic_conv.py │ │ │ ├── dynamic_conv2d.py │ │ │ ├── embedding.py │ │ │ ├── encoder.py │ │ │ ├── encoder_layer.py │ │ │ ├── encoder_mix.py │ │ │ ├── initializer.py │ │ │ ├── label_smoothing_loss.py │ │ │ ├── layer_norm.py │ │ │ ├── lightconv.py │ │ │ ├── lightconv2d.py │ │ │ ├── longformer_attention.py │ │ │ ├── mask.py │ │ │ ├── multi_layer_conv.py │ │ │ ├── optimizer.py │ │ │ ├── plot.py │ │ │ ├── positionwise_feed_forward.py │ │ │ ├── repeat.py │ │ │ ├── subsampling.py │ │ │ └── subsampling_without_posenc.py │ │ └── wavenet.py │ ├── scorer_interface.py │ ├── scorers │ │ ├── __init__.py │ │ ├── ctc.py │ │ ├── length_bonus.py │ │ ├── ngram.py │ │ └── uasr.py │ ├── st_interface.py │ ├── transducer_decoder_interface.py │ └── tts_interface.py ├── optimizer │ ├── __init__.py │ ├── chainer.py │ ├── factory.py │ ├── parser.py │ └── pytorch.py ├── scheduler │ ├── __init__.py │ ├── chainer.py │ ├── pytorch.py │ └── scheduler.py ├── st │ ├── __init__.py │ └── pytorch_backend │ │ ├── __init__.py │ │ └── st.py ├── transform │ ├── __init__.py │ ├── add_deltas.py │ ├── channel_selector.py │ ├── cmvn.py │ ├── functional.py │ ├── perturb.py │ ├── spec_augment.py │ ├── spectrogram.py │ ├── transform_interface.py │ ├── transformation.py │ └── wpe.py ├── tts │ ├── __init__.py │ └── pytorch_backend │ │ ├── __init__.py │ │ └── tts.py ├── utils │ ├── __init__.py │ ├── check_kwargs.py │ ├── cli_readers.py │ ├── cli_utils.py │ ├── cli_writers.py │ ├── dataset.py │ ├── deterministic_utils.py │ ├── dynamic_import.py │ ├── fill_missing_args.py │ ├── io_utils.py │ ├── spec_augment.py │ └── training │ │ ├── __init__.py │ │ ├── batchfy.py │ │ ├── evaluator.py │ │ ├── iterators.py │ │ ├── tensorboard_logger.py │ │ └── train_utils.py ├── vc │ └── pytorch_backend │ │ └── vc.py └── version.txt ├── espnet2 ├── __init__.py ├── asr │ ├── __init__.py │ ├── ctc.py │ ├── decoder │ │ ├── __init__.py │ │ ├── abs_decoder.py │ │ ├── hugging_face_transformers_decoder.py │ │ ├── mlm_decoder.py │ │ ├── rnn_decoder.py │ │ ├── s4_decoder.py │ │ ├── transducer_decoder.py │ │ ├── transformer_decoder.py │ │ └── whisper_decoder.py │ ├── discrete_asr_espnet_model.py │ ├── encoder │ │ ├── __init__.py │ │ ├── abs_encoder.py │ │ ├── branchformer_encoder.py │ │ ├── conformer_encoder.py │ │ ├── contextual_block_conformer_encoder.py │ │ ├── contextual_block_transformer_encoder.py │ │ ├── e_branchformer_encoder.py │ │ ├── hubert_encoder.py │ │ ├── hugging_face_transformers_encoder.py │ │ ├── longformer_encoder.py │ │ ├── rnn_encoder.py │ │ ├── transformer_encoder.py │ │ ├── transformer_encoder_multispkr.py │ │ ├── 
vgg_rnn_encoder.py │ │ ├── wav2vec2_encoder.py │ │ └── whisper_encoder.py │ ├── espnet_model.py │ ├── frontend │ │ ├── __init__.py │ │ ├── abs_frontend.py │ │ ├── default.py │ │ ├── fused.py │ │ ├── s3prl.py │ │ ├── whisper.py │ │ └── windowing.py │ ├── layers │ │ ├── __init__.py │ │ ├── cgmlp.py │ │ └── fastformer.py │ ├── maskctc_model.py │ ├── pit_espnet_model.py │ ├── postencoder │ │ ├── __init__.py │ │ ├── abs_postencoder.py │ │ ├── hugging_face_transformers_postencoder.py │ │ └── length_adaptor_postencoder.py │ ├── preencoder │ │ ├── __init__.py │ │ ├── abs_preencoder.py │ │ ├── linear.py │ │ └── sinc.py │ ├── specaug │ │ ├── __init__.py │ │ ├── abs_specaug.py │ │ └── specaug.py │ ├── state_spaces │ │ ├── __init__.py │ │ ├── attention.py │ │ ├── base.py │ │ ├── block.py │ │ ├── cauchy.py │ │ ├── components.py │ │ ├── ff.py │ │ ├── model.py │ │ ├── pool.py │ │ ├── registry.py │ │ ├── residual.py │ │ ├── s4.py │ │ └── utils.py │ └── transducer │ │ ├── __init__.py │ │ ├── beam_search_transducer.py │ │ ├── beam_search_transducer_streaming.py │ │ ├── error_calculator.py │ │ └── rnnt_multi_blank │ │ ├── __init__.py │ │ ├── rnnt.py │ │ ├── rnnt_multi_blank.py │ │ └── utils │ │ ├── __init__.py │ │ ├── cpu_utils │ │ ├── __init__.py │ │ └── cpu_rnnt.py │ │ ├── cuda_utils │ │ ├── __init__.py │ │ ├── gpu_rnnt.py │ │ ├── gpu_rnnt_kernel.py │ │ └── reduce.py │ │ ├── global_constants.py │ │ └── rnnt_helper.py ├── asr_transducer │ ├── __init__.py │ ├── activation.py │ ├── beam_search_transducer.py │ ├── decoder │ │ ├── __init__.py │ │ ├── abs_decoder.py │ │ ├── blocks │ │ │ ├── __init__.py │ │ │ ├── mega.py │ │ │ └── rwkv.py │ │ ├── mega_decoder.py │ │ ├── modules │ │ │ ├── __init__.py │ │ │ ├── mega │ │ │ │ ├── __init__.py │ │ │ │ ├── feed_forward.py │ │ │ │ ├── multi_head_damped_ema.py │ │ │ │ └── positional_bias.py │ │ │ └── rwkv │ │ │ │ ├── __init__.py │ │ │ │ ├── attention.py │ │ │ │ ├── cuda │ │ │ │ ├── wkv_cuda.cu │ │ │ │ └── wkv_op.cpp │ │ │ │ └── feed_forward.py │ │ ├── rnn_decoder.py │ │ ├── rwkv_decoder.py │ │ └── stateless_decoder.py │ ├── encoder │ │ ├── __init__.py │ │ ├── blocks │ │ │ ├── __init__.py │ │ │ ├── branchformer.py │ │ │ ├── conformer.py │ │ │ ├── conv1d.py │ │ │ ├── conv_input.py │ │ │ └── ebranchformer.py │ │ ├── building.py │ │ ├── encoder.py │ │ ├── modules │ │ │ ├── __init__.py │ │ │ ├── attention.py │ │ │ ├── convolution.py │ │ │ ├── multi_blocks.py │ │ │ └── positional_encoding.py │ │ └── validation.py │ ├── error_calculator.py │ ├── espnet_transducer_model.py │ ├── frontend │ │ ├── __init__.py │ │ └── online_audio_processor.py │ ├── joint_network.py │ ├── normalization.py │ └── utils.py ├── asvspoof │ ├── __init__.py │ ├── decoder │ │ ├── __init__.py │ │ ├── abs_decoder.py │ │ └── linear_decoder.py │ ├── espnet_model.py │ └── loss │ │ ├── __init__.py │ │ ├── abs_loss.py │ │ ├── am_softmax_loss.py │ │ ├── binary_loss.py │ │ └── oc_softmax_loss.py ├── bin │ ├── __init__.py │ ├── aggregate_stats_dirs.py │ ├── asr_align.py │ ├── asr_inference.py │ ├── asr_inference_k2.py │ ├── asr_inference_maskctc.py │ ├── asr_inference_streaming.py │ ├── asr_train.py │ ├── asr_transducer_inference.py │ ├── asr_transducer_train.py │ ├── asvspoof_inference.py │ ├── asvspoof_train.py │ ├── diar_inference.py │ ├── diar_train.py │ ├── enh_inference.py │ ├── enh_inference_streaming.py │ ├── enh_s2t_train.py │ ├── enh_scoring.py │ ├── enh_train.py │ ├── enh_tse_inference.py │ ├── enh_tse_train.py │ ├── gan_svs_train.py │ ├── gan_tts_train.py │ ├── hubert_train.py │ ├── 
hugging_face_export_vocabulary.py │ ├── launch.py │ ├── lm_calc_perplexity.py │ ├── lm_inference.py │ ├── lm_train.py │ ├── mt_inference.py │ ├── mt_train.py │ ├── pack.py │ ├── s2t_inference.py │ ├── s2t_inference_language.py │ ├── s2t_train.py │ ├── slu_inference.py │ ├── slu_train.py │ ├── spk_train.py │ ├── split_scps.py │ ├── st_inference.py │ ├── st_inference_streaming.py │ ├── st_train.py │ ├── svs_inference.py │ ├── svs_train.py │ ├── tokenize_text.py │ ├── tts_inference.py │ ├── tts_train.py │ ├── uasr_extract_feature.py │ ├── uasr_inference.py │ ├── uasr_inference_k2.py │ ├── uasr_train.py │ └── whisper_export_vocabulary.py ├── diar │ ├── __init__.py │ ├── abs_diar.py │ ├── attractor │ │ ├── __init__.py │ │ ├── abs_attractor.py │ │ └── rnn_attractor.py │ ├── decoder │ │ ├── __init__.py │ │ ├── abs_decoder.py │ │ └── linear_decoder.py │ ├── espnet_model.py │ ├── label_processor.py │ ├── layers │ │ ├── __init__.py │ │ ├── abs_mask.py │ │ ├── multi_mask.py │ │ └── tcn_nomask.py │ └── separator │ │ ├── __init__.py │ │ └── tcn_separator_nomask.py ├── enh │ ├── __init__.py │ ├── abs_enh.py │ ├── decoder │ │ ├── __init__.py │ │ ├── abs_decoder.py │ │ ├── conv_decoder.py │ │ ├── null_decoder.py │ │ └── stft_decoder.py │ ├── encoder │ │ ├── __init__.py │ │ ├── abs_encoder.py │ │ ├── conv_encoder.py │ │ ├── null_encoder.py │ │ └── stft_encoder.py │ ├── espnet_enh_s2t_model.py │ ├── espnet_model.py │ ├── espnet_model_tse.py │ ├── extractor │ │ ├── __init__.py │ │ ├── abs_extractor.py │ │ └── td_speakerbeam_extractor.py │ ├── layers │ │ ├── __init__.py │ │ ├── adapt_layers.py │ │ ├── beamformer.py │ │ ├── beamformer_th.py │ │ ├── complex_utils.py │ │ ├── complexnn.py │ │ ├── conv_utils.py │ │ ├── dc_crn.py │ │ ├── dnn_beamformer.py │ │ ├── dnn_wpe.py │ │ ├── dnsmos.py │ │ ├── dpmulcat.py │ │ ├── dprnn.py │ │ ├── dptnet.py │ │ ├── fasnet.py │ │ ├── ifasnet.py │ │ ├── mask_estimator.py │ │ ├── skim.py │ │ ├── tcn.py │ │ ├── tcndenseunet.py │ │ └── wpe.py │ ├── loss │ │ ├── __init__.py │ │ ├── criterions │ │ │ ├── __init__.py │ │ │ ├── abs_loss.py │ │ │ ├── tf_domain.py │ │ │ └── time_domain.py │ │ └── wrappers │ │ │ ├── __init__.py │ │ │ ├── abs_wrapper.py │ │ │ ├── dpcl_solver.py │ │ │ ├── fixed_order.py │ │ │ ├── mixit_solver.py │ │ │ ├── multilayer_pit_solver.py │ │ │ └── pit_solver.py │ └── separator │ │ ├── __init__.py │ │ ├── abs_separator.py │ │ ├── asteroid_models.py │ │ ├── conformer_separator.py │ │ ├── dan_separator.py │ │ ├── dc_crn_separator.py │ │ ├── dccrn_separator.py │ │ ├── dpcl_e2e_separator.py │ │ ├── dpcl_separator.py │ │ ├── dprnn_separator.py │ │ ├── dptnet_separator.py │ │ ├── fasnet_separator.py │ │ ├── ineube_separator.py │ │ ├── neural_beamformer.py │ │ ├── rnn_separator.py │ │ ├── skim_separator.py │ │ ├── svoice_separator.py │ │ ├── tcn_separator.py │ │ ├── tfgridnet_separator.py │ │ └── transformer_separator.py ├── fileio │ ├── __init__.py │ ├── datadir_writer.py │ ├── npy_scp.py │ ├── rand_gen_dataset.py │ ├── read_text.py │ ├── rttm.py │ ├── score_scp.py │ ├── sound_scp.py │ └── vad_scp.py ├── fst │ ├── __init__.py │ └── lm_rescore.py ├── gan_svs │ ├── __init__.py │ ├── abs_gan_svs.py │ ├── avocodo │ │ ├── __init__.py │ │ └── avocodo.py │ ├── espnet_model.py │ ├── joint │ │ ├── __init__.py │ │ └── joint_score2wav.py │ ├── pits │ │ ├── modules.py │ │ └── ying_decoder.py │ ├── uhifigan │ │ ├── __init__.py │ │ ├── sine_generator.py │ │ └── uhifigan.py │ ├── utils │ │ ├── __init__.py │ │ └── expand_f0.py │ ├── visinger2 │ │ ├── __init__.py │ │ ├── ddsp.py │ │ └── 
visinger2_vocoder.py │ └── vits │ │ ├── __init__.py │ │ ├── duration_predictor.py │ │ ├── generator.py │ │ ├── length_regulator.py │ │ ├── modules.py │ │ ├── phoneme_predictor.py │ │ ├── pitch_predictor.py │ │ ├── prior_decoder.py │ │ ├── text_encoder.py │ │ └── vits.py ├── gan_tts │ ├── __init__.py │ ├── abs_gan_tts.py │ ├── espnet_model.py │ ├── hifigan │ │ ├── __init__.py │ │ ├── hifigan.py │ │ ├── loss.py │ │ └── residual_block.py │ ├── jets │ │ ├── __init__.py │ │ ├── alignments.py │ │ ├── generator.py │ │ ├── jets.py │ │ ├── length_regulator.py │ │ └── loss.py │ ├── joint │ │ ├── __init__.py │ │ └── joint_text2wav.py │ ├── melgan │ │ ├── __init__.py │ │ ├── melgan.py │ │ ├── pqmf.py │ │ └── residual_stack.py │ ├── parallel_wavegan │ │ ├── __init__.py │ │ ├── parallel_wavegan.py │ │ └── upsample.py │ ├── style_melgan │ │ ├── __init__.py │ │ ├── style_melgan.py │ │ └── tade_res_block.py │ ├── utils │ │ ├── __init__.py │ │ └── get_random_segments.py │ ├── vits │ │ ├── __init__.py │ │ ├── duration_predictor.py │ │ ├── flow.py │ │ ├── generator.py │ │ ├── loss.py │ │ ├── monotonic_align │ │ │ ├── __init__.py │ │ │ ├── core.pyx │ │ │ └── setup.py │ │ ├── posterior_encoder.py │ │ ├── residual_coupling.py │ │ ├── text_encoder.py │ │ ├── transform.py │ │ └── vits.py │ └── wavenet │ │ ├── __init__.py │ │ ├── residual_block.py │ │ └── wavenet.py ├── hubert │ ├── __init__.py │ ├── espnet_model.py │ └── hubert_loss.py ├── iterators │ ├── __init__.py │ ├── abs_iter_factory.py │ ├── category_iter_factory.py │ ├── chunk_iter_factory.py │ ├── multiple_iter_factory.py │ └── sequence_iter_factory.py ├── layers │ ├── __init__.py │ ├── abs_normalize.py │ ├── augmentation.py │ ├── global_mvn.py │ ├── inversible_interface.py │ ├── label_aggregation.py │ ├── log_mel.py │ ├── mask_along_axis.py │ ├── sinc_conv.py │ ├── stft.py │ ├── time_warp.py │ └── utterance_mvn.py ├── lm │ ├── __init__.py │ ├── abs_model.py │ ├── espnet_model.py │ ├── seq_rnn_lm.py │ └── transformer_lm.py ├── main_funcs │ ├── __init__.py │ ├── average_nbest_models.py │ ├── calculate_all_attentions.py │ ├── collect_stats.py │ └── pack_funcs.py ├── mt │ ├── __init__.py │ ├── espnet_model.py │ └── frontend │ │ ├── __init__.py │ │ └── embedding.py ├── optimizers │ ├── __init__.py │ ├── optim_groups.py │ └── sgd.py ├── s2t │ ├── __init__.py │ └── espnet_model.py ├── samplers │ ├── __init__.py │ ├── abs_sampler.py │ ├── build_batch_sampler.py │ ├── category_balanced_sampler.py │ ├── folded_batch_sampler.py │ ├── length_batch_sampler.py │ ├── num_elements_batch_sampler.py │ ├── sorted_batch_sampler.py │ └── unsorted_batch_sampler.py ├── schedulers │ ├── __init__.py │ ├── abs_scheduler.py │ ├── cosine_anneal_warmup_restart.py │ ├── noam_lr.py │ ├── warmup_lr.py │ ├── warmup_reducelronplateau.py │ └── warmup_step_lr.py ├── slu │ ├── __init__.py │ ├── espnet_model.py │ ├── postdecoder │ │ ├── __init__.py │ │ ├── abs_postdecoder.py │ │ └── hugging_face_transformers_postdecoder.py │ └── postencoder │ │ ├── __init__.py │ │ ├── conformer_postencoder.py │ │ └── transformer_postencoder.py ├── spk │ ├── __init__.py │ ├── encoder │ │ ├── __init__.py │ │ └── rawnet3_encoder.py │ ├── espnet_model.py │ ├── layers │ │ ├── RawNetBasicBlock.py │ │ └── __init__.py │ ├── loss │ │ ├── __init__.py │ │ ├── aamsoftmax.py │ │ └── abs_loss.py │ ├── pooling │ │ ├── __init__.py │ │ ├── abs_pooling.py │ │ └── chn_attn_stat_pooling.py │ └── projector │ │ ├── __init__.py │ │ ├── abs_projector.py │ │ └── rawnet3_projector.py ├── st │ ├── __init__.py │ └── espnet_model.py ├── 
svs │ ├── __init__.py │ ├── abs_svs.py │ ├── espnet_model.py │ ├── feats_extract │ │ ├── __init__.py │ │ └── score_feats_extract.py │ ├── naive_rnn │ │ ├── __init__.py │ │ ├── naive_rnn.py │ │ └── naive_rnn_dp.py │ ├── singing_tacotron │ │ ├── __init__.py │ │ ├── decoder.py │ │ ├── encoder.py │ │ └── singing_tacotron.py │ └── xiaoice │ │ ├── XiaoiceSing.py │ │ ├── __init__.py │ │ └── loss.py ├── tasks │ ├── __init__.py │ ├── abs_task.py │ ├── asr.py │ ├── asr_transducer.py │ ├── asvspoof.py │ ├── diar.py │ ├── enh.py │ ├── enh_s2t.py │ ├── enh_tse.py │ ├── gan_svs.py │ ├── gan_tts.py │ ├── hubert.py │ ├── lm.py │ ├── mt.py │ ├── s2t.py │ ├── slu.py │ ├── spk.py │ ├── st.py │ ├── svs.py │ ├── tts.py │ └── uasr.py ├── text │ ├── __init__.py │ ├── abs_tokenizer.py │ ├── build_tokenizer.py │ ├── char_tokenizer.py │ ├── cleaner.py │ ├── hugging_face_token_id_converter.py │ ├── hugging_face_tokenizer.py │ ├── korean_cleaner.py │ ├── phoneme_tokenizer.py │ ├── sentencepiece_tokenizer.py │ ├── token_id_converter.py │ ├── whisper_token_id_converter.py │ ├── whisper_tokenizer.py │ └── word_tokenizer.py ├── torch_utils │ ├── __init__.py │ ├── add_gradient_noise.py │ ├── device_funcs.py │ ├── forward_adaptor.py │ ├── get_layer_from_string.py │ ├── initialize.py │ ├── load_pretrained_model.py │ ├── model_summary.py │ ├── pytorch_version.py │ ├── recursive_op.py │ └── set_all_random_seed.py ├── train │ ├── __init__.py │ ├── abs_espnet_model.py │ ├── abs_gan_espnet_model.py │ ├── class_choices.py │ ├── collate_fn.py │ ├── dataset.py │ ├── distributed_utils.py │ ├── gan_trainer.py │ ├── iterable_dataset.py │ ├── preprocessor.py │ ├── reporter.py │ ├── spk_trainer.py │ ├── trainer.py │ └── uasr_trainer.py ├── tts │ ├── __init__.py │ ├── abs_tts.py │ ├── espnet_model.py │ ├── fastspeech │ │ ├── __init__.py │ │ └── fastspeech.py │ ├── fastspeech2 │ │ ├── __init__.py │ │ ├── fastspeech2.py │ │ ├── loss.py │ │ └── variance_predictor.py │ ├── feats_extract │ │ ├── __init__.py │ │ ├── abs_feats_extract.py │ │ ├── dio.py │ │ ├── energy.py │ │ ├── linear_spectrogram.py │ │ ├── log_mel_fbank.py │ │ ├── log_spectrogram.py │ │ ├── yin.py │ │ └── ying.py │ ├── gst │ │ ├── __init__.py │ │ └── style_encoder.py │ ├── prodiff │ │ ├── __init__.py │ │ ├── denoiser.py │ │ ├── loss.py │ │ └── prodiff.py │ ├── tacotron2 │ │ ├── __init__.py │ │ └── tacotron2.py │ ├── transformer │ │ ├── __init__.py │ │ └── transformer.py │ └── utils │ │ ├── __init__.py │ │ ├── duration_calculator.py │ │ └── parallel_wavegan_pretrained_vocoder.py ├── uasr │ ├── __init__.py │ ├── discriminator │ │ ├── __init__.py │ │ ├── abs_discriminator.py │ │ └── conv_discriminator.py │ ├── espnet_model.py │ ├── generator │ │ ├── __init__.py │ │ ├── abs_generator.py │ │ └── conv_generator.py │ ├── loss │ │ ├── __init__.py │ │ ├── abs_loss.py │ │ ├── discriminator_loss.py │ │ ├── gradient_penalty.py │ │ ├── phoneme_diversity_loss.py │ │ ├── pseudo_label_loss.py │ │ └── smoothness_penalty.py │ └── segmenter │ │ ├── __init__.py │ │ ├── abs_segmenter.py │ │ ├── join_segmenter.py │ │ └── random_segmenter.py └── utils │ ├── __init__.py │ ├── build_dataclass.py │ ├── config_argparse.py │ ├── eer.py │ ├── get_default_kwargs.py │ ├── griffin_lim.py │ ├── kwargs2args.py │ ├── nested_dict_action.py │ ├── sized_dict.py │ ├── types.py │ └── yaml_no_alias_safe_dump.py ├── model.py ├── preprocess.py ├── pretrained └── .gitkeep ├── requirements.txt ├── setup.bat ├── text.py ├── train.py ├── transcribe.py ├── transcribe_split.py ├── update.bat ├── vits_japros_train_colab.ipynb 
├── webui_infer.bat
├── webui_infer.py
├── webui_train.bat
├── webui_train.py
└── weights
    └── .gitignore

/.gitignore:
--------------------------------------------------------------------------------
1 | venv/
2 | __pycache__/
3 | outputs/
4 | *.wav
5 | *.pth
6 | *.bak
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2023 litagin02
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/assets/japros-infer.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/assets/japros-infer.jpg
--------------------------------------------------------------------------------
/assets/japros-train.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/assets/japros-train.jpg
--------------------------------------------------------------------------------
/conf/tokens.txt:
--------------------------------------------------------------------------------
1 | <blank>
2 | <unk>
3 | a
4 | o
5 | i
6 | [
7 | #
8 | u
9 | ]
10 | e
11 | k
12 | n
13 | t
14 | r
15 | s
16 | N
17 | m
18 | _
19 | sh
20 | d
21 | g
22 | ^
23 | $
24 | w
25 | cl
26 | h
27 | y
28 | b
29 | j
30 | ts
31 | ch
32 | z
33 | p
34 | f
35 | ky
36 | ry
37 | gy
38 | hy
39 | ny
40 | by
41 | my
42 | py
43 | v
44 | dy
45 | ?
46 | ty
47 | <sos/eos>
48 |
--------------------------------------------------------------------------------
/conf/train_args.py:
--------------------------------------------------------------------------------
1 | train_args = [
2 |     "--use_preprocessor",
3 |     "true",
4 |     "--token_type",
5 |     "phn",
6 |     "--token_list",
7 |     "conf/tokens.txt",
8 |     "--non_linguistic_symbols",
9 |     "none",
10 |     "--cleaner",
11 |     "jaconv",
12 |     "--g2p",
13 |     "pyopenjtalk_prosody",
14 |     "--normalize",
15 |     "none",
16 |     "--resume",
17 |     "true",
18 |     "--fold_length",
19 |     "150",
20 |     "--fold_length",
21 |     "409600",
22 |     "--output_dir",
23 |     "{output_dir}/{model_name}/checkpoints",
24 |     "--config",
25 |     "conf/finetune.yaml",
26 |     "--feats_extract",
27 |     "linear_spectrogram",
28 |     "--feats_extract_conf",
29 |     "n_fft=2048",
30 |     "--feats_extract_conf",
31 |     "hop_length=512",
32 |     "--feats_extract_conf",
33 |     "win_length=null",
34 |     "--train_data_path_and_name_and_type",
35 |     "{output_dir}/{model_name}/dump/train/text,text,text",
36 |     "--train_data_path_and_name_and_type",
37 |     "{output_dir}/{model_name}/dump/train/wav.scp,speech,sound",
38 |     "--train_shape_file",
39 |     "{output_dir}/{model_name}/stats/train/text_shape.phn",
40 |     "--train_shape_file",
41 |     "{output_dir}/{model_name}/stats/train/speech_shape",
42 |     "--valid_data_path_and_name_and_type",
43 |     "{output_dir}/{model_name}/dump/valid/text,text,text",
44 |     "--valid_data_path_and_name_and_type",
45 |     "{output_dir}/{model_name}/dump/valid/wav.scp,speech,sound",
46 |     "--valid_shape_file",
47 |     "{output_dir}/{model_name}/stats/valid/text_shape.phn",
48 |     "--valid_shape_file",
49 |     "{output_dir}/{model_name}/stats/valid/speech_shape",
50 |     "--init_param",
51 |     "pretrained/pretrained.pth:tts:tts",
52 |     "--ngpu",
53 |     "1",
54 | ]
55 |
--------------------------------------------------------------------------------
/data/README.txt:
--------------------------------------------------------------------------------
1 | Prepare the files in this folder as follows.
2 |
3 | - Audio files (wav format) in the wavs folder
4 | - transcript_utf8.txt
5 |
6 | The contents of transcript_utf8.txt look like the following (it is generated automatically when using whisper):
7 |
8 | wav_filename1:Write the utterance text here.
9 | wav2:The left side of the colon is the file name only; no extension is needed.
10 | …
11 |
12 | Also, put the wav files into the wavs folder like this:
13 | wavs
14 | ├── wav_filename1.wav
15 | ├── wav2.wav
16 | └── …
17 |
--------------------------------------------------------------------------------
/data/transcript_utf8_sample.txt:
--------------------------------------------------------------------------------
1 | sample_wav_file_name:これはサンプルです。
2 | wav_filename1:ここに発言内容を書きます。
3 | wav2:コロンの左側はファイル名のみで、拡張子は不要です。
4 |
--------------------------------------------------------------------------------
/data/wavs/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/data/wavs/.gitkeep
--------------------------------------------------------------------------------
/espnet/__init__.py:
--------------------------------------------------------------------------------
1 | """Initialize espnet package."""
2 |
3 | import os
4 |
5 | dirname = os.path.dirname(__file__)
6 | version_file = os.path.join(dirname, "version.txt")
7 | with open(version_file, "r") as f:
8 |     __version__ = f.read().strip()
9 |
--------------------------------------------------------------------------------
/espnet/asr/__init__.py:
--------------------------------------------------------------------------------
1 | """Initialize sub package."""
2 |
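
The `{output_dir}` and `{model_name}` placeholders in conf/train_args.py above are presumably expanded with `str.format()` before the list reaches the espnet2 trainer; a minimal sketch (the values and the final hand-off are illustrative assumptions, not repository code):

from conf.train_args import train_args

# Sketch: fill in the placeholders; the two values are made up for illustration.
resolved = [a.format(output_dir="outputs", model_name="my_model") for a in train_args]
# `resolved` could then be passed to the espnet2 task CLI (assumption; the
# repository's train.py wiring is not shown in this excerpt).
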
--------------------------------------------------------------------------------
/espnet/asr/chainer_backend/__init__.py:
--------------------------------------------------------------------------------
1 | """Initialize sub package."""
2 |
--------------------------------------------------------------------------------
/espnet/asr/pytorch_backend/__init__.py:
--------------------------------------------------------------------------------
1 | """Initialize sub package."""
2 |
--------------------------------------------------------------------------------
/espnet/bin/__init__.py:
--------------------------------------------------------------------------------
1 | """Initialize sub package."""
2 |
--------------------------------------------------------------------------------
/espnet/distributed/__init__.py:
--------------------------------------------------------------------------------
1 | #
2 | # SPDX-FileCopyrightText:
3 | #   Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
4 | # SPDX-License-Identifier: Apache-2.0
5 | #
6 |
7 | """Initialize sub package."""
8 |
--------------------------------------------------------------------------------
/espnet/lm/__init__.py:
--------------------------------------------------------------------------------
1 | """Initialize sub package."""
2 |
--------------------------------------------------------------------------------
/espnet/lm/chainer_backend/__init__.py:
--------------------------------------------------------------------------------
1 | """Initialize sub package."""
2 |
--------------------------------------------------------------------------------
/espnet/lm/pytorch_backend/__init__.py:
--------------------------------------------------------------------------------
1 | """Initialize sub package."""
2 |
--------------------------------------------------------------------------------
/espnet/mt/__init__.py:
--------------------------------------------------------------------------------
1 | """Initialize sub package."""
2 |
--------------------------------------------------------------------------------
/espnet/mt/pytorch_backend/__init__.py:
--------------------------------------------------------------------------------
1 | """Initialize sub package."""
2 |
--------------------------------------------------------------------------------
/espnet/nets/__init__.py:
--------------------------------------------------------------------------------
1 | """Initialize sub package."""
2 |
--------------------------------------------------------------------------------
/espnet/nets/chainer_backend/__init__.py:
--------------------------------------------------------------------------------
1 | """Initialize sub package."""
2 |
--------------------------------------------------------------------------------
/espnet/nets/chainer_backend/asr_interface.py:
--------------------------------------------------------------------------------
1 | """ASR Interface module."""
2 | import chainer
3 |
4 | from espnet.nets.asr_interface import ASRInterface
5 |
6 |
7 | class ChainerASRInterface(ASRInterface, chainer.Chain):
8 |     """ASR Interface for ESPnet model implementation."""
9 |
10 |     @staticmethod
11 |     def custom_converter(*args, **kw):
12 |         """Get custom_converter of the model (Chainer only)."""
13 |         raise NotImplementedError("custom converter method is not implemented")
14 |
15 |     @staticmethod
16 |     def custom_updater(*args, **kw):
17 |         """Get custom_updater of the model (Chainer only)."""
18 |         raise NotImplementedError("custom updater method is not implemented")
19 |
20 |     @staticmethod
21 |     def custom_parallel_updater(*args, **kw):
22 |         """Get custom_parallel_updater of the model (Chainer only)."""
23 |         raise NotImplementedError("custom parallel updater method is not implemented")
24 |
25 |     def get_total_subsampling_factor(self):
26 |         """Get total subsampling factor."""
27 |         raise NotImplementedError(
28 |             "get_total_subsampling_factor method is not implemented"
29 |         )
30 |
method is not implemented") 19 | 20 | @staticmethod 21 | def custom_parallel_updater(*args, **kw): 22 | """Get custom_parallel_updater of the model (Chainer only).""" 23 | raise NotImplementedError("custom parallel updater method is not implemented") 24 | 25 | def get_total_subsampling_factor(self): 26 | """Get total subsampling factor.""" 27 | raise NotImplementedError( 28 | "get_total_subsampling_factor method is not implemented" 29 | ) 30 | -------------------------------------------------------------------------------- /espnet/nets/chainer_backend/nets_utils.py: -------------------------------------------------------------------------------- 1 | import chainer.functions as F 2 | 3 | 4 | def _subsamplex(x, n): 5 | x = [F.get_item(xx, (slice(None, None, n), slice(None))) for xx in x] 6 | ilens = [xx.shape[0] for xx in x] 7 | return x, ilens 8 | -------------------------------------------------------------------------------- /espnet/nets/chainer_backend/rnn/__init__.py: -------------------------------------------------------------------------------- 1 | """Initialize sub package.""" 2 | -------------------------------------------------------------------------------- /espnet/nets/chainer_backend/transformer/__init__.py: -------------------------------------------------------------------------------- 1 | """Initialize sub package.""" 2 | -------------------------------------------------------------------------------- /espnet/nets/chainer_backend/transformer/embedding.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | """Class Declaration of Transformer's Positional Encoding.""" 3 | 4 | import chainer 5 | import chainer.functions as F 6 | import numpy as np 7 | 8 | 9 | class PositionalEncoding(chainer.Chain): 10 | """Positional encoding module. 11 | 12 | :param int n_units: embedding dim 13 | :param float dropout: dropout rate 14 | :param int length: maximum input length 15 | 16 | """ 17 | 18 | def __init__(self, n_units, dropout=0.1, length=5000): 19 | """Initialize Positional Encoding.""" 20 | # Implementation described in the paper 21 | super(PositionalEncoding, self).__init__() 22 | self.dropout = dropout 23 | posi_block = np.arange(0, length, dtype=np.float32)[:, None] 24 | unit_block = np.exp( 25 | np.arange(0, n_units, 2, dtype=np.float32) * -(np.log(10000.0) / n_units) 26 | ) 27 | self.pe = np.zeros((length, n_units), dtype=np.float32) 28 | self.pe[:, ::2] = np.sin(posi_block * unit_block) 29 | self.pe[:, 1::2] = np.cos(posi_block * unit_block) 30 | self.scale = np.sqrt(n_units) 31 | 32 | def forward(self, e): 33 | """Forward Positional Encoding.""" 34 | length = e.shape[1] 35 | e = e * self.scale + self.xp.array(self.pe[:length]) 36 | return F.dropout(e, self.dropout) 37 | -------------------------------------------------------------------------------- /espnet/nets/chainer_backend/transformer/encoder_layer.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | """Class Declaration of Transformer's Encoder Block.""" 3 | 4 | import chainer 5 | import chainer.functions as F 6 | 7 | from espnet.nets.chainer_backend.transformer.attention import MultiHeadAttention 8 | from espnet.nets.chainer_backend.transformer.layer_norm import LayerNorm 9 | from espnet.nets.chainer_backend.transformer.positionwise_feed_forward import ( 10 | PositionwiseFeedForward, 11 | ) 12 | 13 | 14 | class EncoderLayer(chainer.Chain): 15 | """Single encoder layer module. 
--------------------------------------------------------------------------------
/espnet/nets/chainer_backend/transformer/encoder_layer.py:
--------------------------------------------------------------------------------
1 | # encoding: utf-8
2 | """Class Declaration of Transformer's Encoder Block."""
3 |
4 | import chainer
5 | import chainer.functions as F
6 |
7 | from espnet.nets.chainer_backend.transformer.attention import MultiHeadAttention
8 | from espnet.nets.chainer_backend.transformer.layer_norm import LayerNorm
9 | from espnet.nets.chainer_backend.transformer.positionwise_feed_forward import (
10 |     PositionwiseFeedForward,
11 | )
12 |
13 |
14 | class EncoderLayer(chainer.Chain):
15 |     """Single encoder layer module.
16 |
17 |     Args:
18 |         n_units (int): Number of input/output dimensions of a FeedForward layer.
19 |         d_units (int): Number of units of hidden layer in a FeedForward layer.
20 |         h (int): Number of attention heads.
21 |         dropout (float): Dropout rate
22 |
23 |     """
24 |
25 |     def __init__(
26 |         self, n_units, d_units=0, h=8, dropout=0.1, initialW=None, initial_bias=None
27 |     ):
28 |         """Initialize EncoderLayer."""
29 |         super(EncoderLayer, self).__init__()
30 |         with self.init_scope():
31 |             self.self_attn = MultiHeadAttention(
32 |                 n_units,
33 |                 h,
34 |                 dropout=dropout,
35 |                 initialW=initialW,
36 |                 initial_bias=initial_bias,
37 |             )
38 |             self.feed_forward = PositionwiseFeedForward(
39 |                 n_units,
40 |                 d_units=d_units,
41 |                 dropout=dropout,
42 |                 initialW=initialW,
43 |                 initial_bias=initial_bias,
44 |             )
45 |             self.norm1 = LayerNorm(n_units)
46 |             self.norm2 = LayerNorm(n_units)
47 |         self.dropout = dropout
48 |         self.n_units = n_units
49 |
50 |     def forward(self, e, xx_mask, batch):
51 |         """Forward EncoderLayer."""
52 |         n_e = self.norm1(e)
53 |         n_e = self.self_attn(n_e, mask=xx_mask, batch=batch)
54 |         e = e + F.dropout(n_e, self.dropout)
55 |
56 |         n_e = self.norm2(e)
57 |         n_e = self.feed_forward(n_e)
58 |         e = e + F.dropout(n_e, self.dropout)
59 |         return e
60 |
--------------------------------------------------------------------------------
/espnet/nets/chainer_backend/transformer/layer_norm.py:
--------------------------------------------------------------------------------
1 | # encoding: utf-8
2 | """Class Declaration of Transformer's Layer Normalization."""
3 |
4 | import chainer.links as L
5 |
6 |
7 | class LayerNorm(L.LayerNormalization):
8 |     """Redirect to L.LayerNormalization."""
9 |
10 |     def __init__(self, dims, eps=1e-12):
11 |         """Initialize LayerNorm."""
12 |         super(LayerNorm, self).__init__(size=dims, eps=eps)
13 |
14 |     def __call__(self, e):
15 |         """Forward LayerNorm."""
16 |         return super(LayerNorm, self).__call__(e)
17 |
--------------------------------------------------------------------------------
/espnet/nets/chainer_backend/transformer/mask.py:
--------------------------------------------------------------------------------
1 | """Create mask for subsequent steps."""
2 |
3 |
4 | def make_history_mask(xp, block):
5 |     """Prepare the history mask.
6 |
7 |     Args:
8 |         block (ndarray): Block with dimensions: (B x S).
9 |     Returns:
10 |         ndarray: History mask with dimensions (B, S, S).
11 |
12 |     """
13 |     batch, length = block.shape
14 |     arange = xp.arange(length)
15 |     history_mask = (arange[None] <= arange[:, None])[None,]
16 |     history_mask = xp.broadcast_to(history_mask, (batch, length, length))
17 |     return history_mask
18 |
--------------------------------------------------------------------------------
/espnet/nets/chainer_backend/transformer/positionwise_feed_forward.py:
--------------------------------------------------------------------------------
1 | # encoding: utf-8
2 | """Class Declaration of Transformer's Positionwise Feedforward."""
3 |
4 | import chainer
5 | import chainer.functions as F
6 | import chainer.links as L
7 | import numpy as np
8 |
9 |
10 | class PositionwiseFeedForward(chainer.Chain):
11 |     """Positionwise feed forward.
12 |
13 |     Args:
14 |         :param int idim: input dimension
15 |         :param int hidden_units: number of hidden units
16 |         :param float dropout_rate: dropout rate
17 |
18 |     """
19 |
20 |     def __init__(
21 |         self, n_units, d_units=0, dropout=0.1, initialW=None, initial_bias=None
22 |     ):
23 |         """Initialize PositionwiseFeedForward.
24 |
25 |         Args:
26 |             n_units (int): Input dimension.
27 |             d_units (int, optional): Output dimension of hidden layer.
28 |             dropout (float, optional): Dropout ratio.
29 |             initialW (callable, optional): Initializer to initialize the weight.
30 |             initial_bias (callable, optional): Initializer to initialize the bias.
31 |
32 |         """
33 |         super(PositionwiseFeedForward, self).__init__()
34 |         n_inner_units = d_units if d_units > 0 else n_units * 4
35 |         with self.init_scope():
36 |             stvd = 1.0 / np.sqrt(n_units)
37 |             self.w_1 = L.Linear(
38 |                 n_units,
39 |                 n_inner_units,
40 |                 initialW=initialW(scale=stvd),
41 |                 initial_bias=initial_bias(scale=stvd),
42 |             )
43 |             stvd = 1.0 / np.sqrt(n_inner_units)
44 |             self.w_2 = L.Linear(
45 |                 n_inner_units,
46 |                 n_units,
47 |                 initialW=initialW(scale=stvd),
48 |                 initial_bias=initial_bias(scale=stvd),
49 |             )
50 |         self.act = F.relu
51 |         self.dropout = dropout
52 |
53 |     def __call__(self, e):
54 |         """Forward PositionwiseFeedForward.
55 |
56 |         Args:
57 |             e (chainer.Variable): Input variable.
58 |
59 |         Returns:
60 |             chainer.Variable: Output variable.
61 |
62 |         """
63 |         e = F.dropout(self.act(self.w_1(e)), self.dropout)
64 |         return self.w_2(e)
65 |
--------------------------------------------------------------------------------
/espnet/nets/pytorch_backend/__init__.py:
--------------------------------------------------------------------------------
1 | """Initialize sub package."""
2 |
--------------------------------------------------------------------------------
/espnet/nets/pytorch_backend/conformer/__init__.py:
--------------------------------------------------------------------------------
1 | """Initialize sub package."""
2 |
--------------------------------------------------------------------------------
/espnet/nets/pytorch_backend/conformer/swish.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 |
4 | # Copyright 2020 Johns Hopkins University (Shinji Watanabe)
5 | #                Northwestern Polytechnical University (Pengcheng Guo)
6 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
7 |
8 | """Swish() activation function for Conformer."""
9 |
10 | import torch
11 |
12 |
13 | class Swish(torch.nn.Module):
14 |     """Construct a Swish object."""
15 |
16 |     def forward(self, x):
17 |         """Return Swish activation function."""
18 |         return x * torch.sigmoid(x)
19 |
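
Swish(x) = x · sigmoid(x) is the same function PyTorch ships as torch.nn.SiLU, so the two are interchangeable; a quick equivalence check (a sketch for illustration, not repository code):

import torch

# Sketch: Swish coincides with torch.nn.SiLU (x * sigmoid(x)).
x = torch.randn(4)
assert torch.allclose(x * torch.sigmoid(x), torch.nn.SiLU()(x))
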
--------------------------------------------------------------------------------
/espnet/nets/pytorch_backend/fastspeech/__init__.py:
--------------------------------------------------------------------------------
1 | """Initialize sub package."""
2 |
--------------------------------------------------------------------------------
/espnet/nets/pytorch_backend/frontends/__init__.py:
--------------------------------------------------------------------------------
1 | """Initialize sub package."""
2 |
--------------------------------------------------------------------------------
/espnet/nets/pytorch_backend/initialization.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | # Copyright 2019 Kyoto University (Hirofumi Inaguma)
4 | #  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)
5 |
6 | """Initialization functions for RNN sequence-to-sequence models."""
7 |
8 | import math
9 |
10 |
11 | def lecun_normal_init_parameters(module):
12 |     """Initialize parameters in the LeCun's manner."""
13 |     for p in module.parameters():
14 |         data = p.data
15 |         if data.dim() == 1:
16 |             # bias
17 |             data.zero_()
18 |         elif data.dim() == 2:
19 |             # linear weight
20 |             n = data.size(1)
21 |             stdv = 1.0 / math.sqrt(n)
22 |             data.normal_(0, stdv)
23 |         elif data.dim() in (3, 4):
24 |             # conv weight
25 |             n = data.size(1)
26 |             for k in data.size()[2:]:
27 |                 n *= k
28 |             stdv = 1.0 / math.sqrt(n)
29 |             data.normal_(0, stdv)
30 |         else:
31 |             raise NotImplementedError
32 |
33 |
34 | def uniform_init_parameters(module):
35 |     """Initialize parameters with a uniform distribution."""
36 |     for p in module.parameters():
37 |         data = p.data
38 |         if data.dim() == 1:
39 |             # bias
40 |             data.uniform_(-0.1, 0.1)
41 |         elif data.dim() == 2:
42 |             # linear weight
43 |             data.uniform_(-0.1, 0.1)
44 |         elif data.dim() in (3, 4):
45 |             # conv weight
46 |             pass  # use the pytorch default
47 |         else:
48 |             raise NotImplementedError
49 |
50 |
51 | def set_forget_bias_to_one(bias):
52 |     """Initialize a bias vector in the forget gate with one."""
53 |     n = bias.size(0)
54 |     start, end = n // 4, n // 2
55 |     bias.data[start:end].fill_(1.0)
56 |
--------------------------------------------------------------------------------
/espnet/nets/pytorch_backend/lm/__init__.py:
--------------------------------------------------------------------------------
1 | """Initialize sub package."""
2 |
--------------------------------------------------------------------------------
/espnet/nets/pytorch_backend/maskctc/__init__.py:
--------------------------------------------------------------------------------
1 | """Initialize sub package."""
2 |
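
For `lecun_normal_init_parameters` in initialization.py above, a Linear weight ends up with std ≈ 1/sqrt(fan_in) and a zeroed bias; a quick check (a sketch for illustration, not repository code; the layer sizes are arbitrary):

import math
import torch
from espnet.nets.pytorch_backend.initialization import lecun_normal_init_parameters

# Sketch: fan_in = 256, so the sampled weight std should be close to 1/16.
layer = torch.nn.Linear(256, 128)
lecun_normal_init_parameters(layer)
assert abs(layer.weight.std().item() - 1.0 / math.sqrt(256)) < 0.01
assert layer.bias.abs().sum().item() == 0.0
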
--------------------------------------------------------------------------------
/espnet/nets/pytorch_backend/maskctc/add_mask_token.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 |
4 | # Copyright 2020 Johns Hopkins University (Shinji Watanabe)
5 | #                Waseda University (Yosuke Higuchi)
6 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
7 |
8 | """Token masking module for Masked LM."""
9 |
10 | import numpy
11 |
12 |
13 | def mask_uniform(ys_pad, mask_token, eos, ignore_id):
14 |     """Replace random tokens with <mask> label and add <eos> label.
15 |
16 |     The number of <mask> is chosen from a uniform distribution
17 |     between one and the target sequence's length.
18 |     :param torch.Tensor ys_pad: batch of padded target sequences (B, Lmax)
19 |     :param int mask_token: index of <mask>
20 |     :param int eos: index of <eos>
21 |     :param int ignore_id: index of padding
22 |     :return: padded tensor (B, Lmax)
23 |     :rtype: torch.Tensor
24 |     :return: padded tensor (B, Lmax)
25 |     :rtype: torch.Tensor
26 |     """
27 |     from espnet.nets.pytorch_backend.nets_utils import pad_list
28 |
29 |     ys = [y[y != ignore_id] for y in ys_pad]  # parse padded ys
30 |     ys_out = [y.new(y.size()).fill_(ignore_id) for y in ys]
31 |     ys_in = [y.clone() for y in ys]
32 |     for i in range(len(ys)):
33 |         num_samples = numpy.random.randint(1, len(ys[i]) + 1)
34 |         idx = numpy.random.choice(len(ys[i]), num_samples)
35 |
36 |         ys_in[i][idx] = mask_token
37 |         ys_out[i][idx] = ys[i][idx]
38 |
39 |     return pad_list(ys_in, eos), pad_list(ys_out, ignore_id)
40 |
--------------------------------------------------------------------------------
/espnet/nets/pytorch_backend/maskctc/mask.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 |
4 | # Copyright 2020 Johns Hopkins University (Shinji Watanabe)
5 | #                Waseda University (Yosuke Higuchi)
6 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
7 |
8 | """Attention masking module for Masked LM."""
9 |
10 |
11 | def square_mask(ys_in_pad, ignore_id):
12 |     """Create attention mask to avoid attending on padding tokens.
13 |
14 |     :param torch.Tensor ys_in_pad: batch of padded target sequences (B, Lmax)
15 |     :param int ignore_id: index of padding
16 |     :rtype: torch.Tensor (B, Lmax, Lmax)
17 |     """
18 |     ys_mask = (ys_in_pad != ignore_id).unsqueeze(-2)
19 |     ymax = ys_mask.size(-1)
20 |     ys_mask_tmp = ys_mask.transpose(1, 2).repeat(1, 1, ymax)
21 |     ys_mask = ys_mask.repeat(1, ymax, 1) & ys_mask_tmp
22 |
23 |     return ys_mask
24 |
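
square_mask blanks out both the row and the column of every padded position; a toy run (a sketch for illustration, not repository code; ignore_id = -1, token ids arbitrary):

import torch
from espnet.nets.pytorch_backend.maskctc.mask import square_mask

# Sketch: the last position is padding, so row 2 and column 2 go to 0.
ys_in_pad = torch.tensor([[5, 6, -1]])
print(square_mask(ys_in_pad, ignore_id=-1)[0].int())
# tensor([[1, 1, 0],
#         [1, 1, 0],
#         [0, 0, 0]], dtype=torch.int32)
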
--------------------------------------------------------------------------------
/espnet/nets/pytorch_backend/rnn/__init__.py:
--------------------------------------------------------------------------------
1 | """Initialize sub package."""
2 |
--------------------------------------------------------------------------------
/espnet/nets/pytorch_backend/streaming/__init__.py:
--------------------------------------------------------------------------------
1 | """Initialize sub package."""
2 |
--------------------------------------------------------------------------------
/espnet/nets/pytorch_backend/tacotron2/__init__.py:
--------------------------------------------------------------------------------
1 | """Initialize sub package."""
2 |
--------------------------------------------------------------------------------
/espnet/nets/pytorch_backend/transducer/__init__.py:
--------------------------------------------------------------------------------
1 | """Initialize sub package."""
2 |
--------------------------------------------------------------------------------
/espnet/nets/pytorch_backend/transducer/initializer.py:
--------------------------------------------------------------------------------
1 | """Parameter initialization for Transducer model."""
2 |
3 | import math
4 | from argparse import Namespace
5 |
6 | import torch
7 |
8 | from espnet.nets.pytorch_backend.initialization import set_forget_bias_to_one
9 |
10 |
11 | def initializer(model: torch.nn.Module, args: Namespace):
12 |     """Initialize Transducer model.
13 |
14 |     Args:
15 |         model: Transducer model.
16 |         args: Namespace containing model options.
17 |
18 |     """
19 |     for name, p in model.named_parameters():
20 |         if any(x in name for x in ["enc.", "dec.", "transducer_tasks."]):
21 |             if p.dim() == 1:
22 |                 # bias
23 |                 p.data.zero_()
24 |             elif p.dim() == 2:
25 |                 # linear weight
26 |                 n = p.size(1)
27 |                 stdv = 1.0 / math.sqrt(n)
28 |                 p.data.normal_(0, stdv)
29 |             elif p.dim() in (3, 4):
30 |                 # conv weight
31 |                 n = p.size(1)
32 |                 for k in p.size()[2:]:
33 |                     n *= k
34 |                 stdv = 1.0 / math.sqrt(n)
35 |                 p.data.normal_(0, stdv)
36 |
37 |     if args.dtype != "custom":
38 |         model.dec.embed.weight.data.normal_(0, 1)
39 |
40 |         for i in range(model.dec.dlayers):
41 |             set_forget_bias_to_one(getattr(model.dec.decoder[i], "bias_ih_l0"))
42 |             set_forget_bias_to_one(getattr(model.dec.decoder[i], "bias_hh_l0"))
43 |
--------------------------------------------------------------------------------
/espnet/nets/pytorch_backend/transformer/__init__.py:
--------------------------------------------------------------------------------
1 | """Initialize sub package."""
2 |
--------------------------------------------------------------------------------
/espnet/nets/pytorch_backend/transformer/add_sos_eos.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 |
4 | # Copyright 2019 Shigeki Karita
5 | #  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)
6 |
7 | """Utility functions for Transformer."""
8 |
9 | import torch
10 |
11 |
12 | def add_sos_eos(ys_pad, sos, eos, ignore_id):
13 |     """Add <sos> and <eos> labels.
14 |
15 |     :param torch.Tensor ys_pad: batch of padded target sequences (B, Lmax)
16 |     :param int sos: index of <sos>
17 |     :param int eos: index of <eos>
18 |     :param int ignore_id: index of padding
19 |     :return: padded tensor (B, Lmax)
20 |     :rtype: torch.Tensor
21 |     :return: padded tensor (B, Lmax)
22 |     :rtype: torch.Tensor
23 |     """
24 |     from espnet.nets.pytorch_backend.nets_utils import pad_list
25 |
26 |     _sos = ys_pad.new([sos])
27 |     _eos = ys_pad.new([eos])
28 |     ys = [y[y != ignore_id] for y in ys_pad]  # parse padded ys
29 |     ys_in = [torch.cat([_sos, y], dim=0) for y in ys]
30 |     ys_out = [torch.cat([y, _eos], dim=0) for y in ys]
31 |     return pad_list(ys_in, eos), pad_list(ys_out, ignore_id)
32 |
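
A toy call shows the decoder input gaining <sos> and the target gaining <eos> (a sketch for illustration, not repository code; token ids are arbitrary):

import torch
from espnet.nets.pytorch_backend.transformer.add_sos_eos import add_sos_eos

# Sketch: sos=9, eos=10, single sequence with no padding.
ys_in, ys_out = add_sos_eos(torch.tensor([[1, 2, 3]]), sos=9, eos=10, ignore_id=-1)
print(ys_in)   # tensor([[9, 1, 2, 3]])
print(ys_out)  # tensor([[ 1,  2,  3, 10]])
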
--------------------------------------------------------------------------------
/espnet/nets/pytorch_backend/transformer/initializer.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 |
4 | # Copyright 2019 Shigeki Karita
5 | #  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)
6 |
7 | """Parameter initialization."""
8 |
9 | import torch
10 |
11 | from espnet.nets.pytorch_backend.transformer.layer_norm import LayerNorm
12 |
13 |
14 | def initialize(model, init_type="pytorch"):
15 |     """Initialize Transformer module.
16 |
17 |     :param torch.nn.Module model: transformer instance
18 |     :param str init_type: initialization type
19 |     """
20 |     if init_type == "pytorch":
21 |         return
22 |
23 |     # weight init
24 |     for p in model.parameters():
25 |         if p.dim() > 1:
26 |             if init_type == "xavier_uniform":
27 |                 torch.nn.init.xavier_uniform_(p.data)
28 |             elif init_type == "xavier_normal":
29 |                 torch.nn.init.xavier_normal_(p.data)
30 |             elif init_type == "kaiming_uniform":
31 |                 torch.nn.init.kaiming_uniform_(p.data, nonlinearity="relu")
32 |             elif init_type == "kaiming_normal":
33 |                 torch.nn.init.kaiming_normal_(p.data, nonlinearity="relu")
34 |             else:
35 |                 raise ValueError("Unknown initialization: " + init_type)
36 |     # bias init
37 |     for p in model.parameters():
38 |         if p.dim() == 1:
39 |             p.data.zero_()
40 |
41 |     # reset some modules with default init
42 |     for m in model.modules():
43 |         if isinstance(m, (torch.nn.Embedding, LayerNorm)):
44 |             m.reset_parameters()
45 |
--------------------------------------------------------------------------------
/espnet/nets/pytorch_backend/transformer/layer_norm.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 |
4 | # Copyright 2019 Shigeki Karita
5 | #  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)
6 |
7 | """Layer normalization module."""
8 |
9 | import torch
10 |
11 |
12 | class LayerNorm(torch.nn.LayerNorm):
13 |     """Layer normalization module.
14 |
15 |     Args:
16 |         nout (int): Output dim size.
17 |         dim (int): Dimension to be normalized.
18 |
19 |     """
20 |
21 |     def __init__(self, nout, dim=-1):
22 |         """Construct a LayerNorm object."""
23 |         super(LayerNorm, self).__init__(nout, eps=1e-12)
24 |         self.dim = dim
25 |
26 |     def forward(self, x):
27 |         """Apply layer normalization.
28 |
29 |         Args:
30 |             x (torch.Tensor): Input tensor.
31 |
32 |         Returns:
33 |             torch.Tensor: Normalized tensor.
34 |
35 |         """
36 |         if self.dim == -1:
37 |             return super(LayerNorm, self).forward(x)
38 |         return (
39 |             super(LayerNorm, self)
40 |             .forward(x.transpose(self.dim, -1))
41 |             .transpose(self.dim, -1)
42 |         )
43 |
--------------------------------------------------------------------------------
/espnet/nets/pytorch_backend/transformer/mask.py:
--------------------------------------------------------------------------------
1 | # Copyright 2019 Shigeki Karita
2 | #  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)
3 |
4 | """Mask module."""
5 |
6 | import torch
7 |
8 |
9 | def subsequent_mask(size, device="cpu", dtype=torch.bool):
10 |     """Create mask for subsequent steps (size, size).
11 |
12 |     :param int size: size of mask
13 |     :param str device: "cpu" or "cuda" or torch.Tensor.device
14 |     :param torch.dtype dtype: result dtype
15 |     :rtype: torch.Tensor
16 |     >>> subsequent_mask(3)
17 |     [[1, 0, 0],
18 |      [1, 1, 0],
19 |      [1, 1, 1]]
20 |     """
21 |     ret = torch.ones(size, size, device=device, dtype=dtype)
22 |     return torch.tril(ret, out=ret)
23 |
24 |
25 | def target_mask(ys_in_pad, ignore_id):
26 |     """Create mask for decoder self-attention.
27 |
28 |     :param torch.Tensor ys_in_pad: batch of padded target sequences (B, Lmax)
29 |     :param int ignore_id: index of padding
30 |     :rtype: torch.Tensor (B, Lmax, Lmax)
31 |     """
32 |     ys_mask = ys_in_pad != ignore_id
33 |     m = subsequent_mask(ys_mask.size(-1), device=ys_mask.device).unsqueeze(0)
34 |     return ys_mask.unsqueeze(-2) & m
35 |
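
target_mask combines the causal triangle with the key-side padding mask; a toy run (a sketch for illustration, not repository code; ignore_id = -1, token ids arbitrary):

import torch
from espnet.nets.pytorch_backend.transformer.mask import target_mask

# Sketch: 3 real tokens followed by one padded position; the padded
# key column is zeroed everywhere, on top of the lower-triangular mask.
print(target_mask(torch.tensor([[9, 1, 2, -1]]), ignore_id=-1)[0].int())
# tensor([[1, 0, 0, 0],
#         [1, 1, 0, 0],
#         [1, 1, 1, 0],
#         [1, 1, 1, 0]], dtype=torch.int32)
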
--------------------------------------------------------------------------------
/espnet/nets/pytorch_backend/transformer/positionwise_feed_forward.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 |
4 | # Copyright 2019 Shigeki Karita
5 | #  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)
6 |
7 | """Positionwise feed forward layer definition."""
8 |
9 | import torch
10 |
11 |
12 | class PositionwiseFeedForward(torch.nn.Module):
13 |     """Positionwise feed forward layer.
14 |
15 |     Args:
16 |         idim (int): Input dimension.
17 |         hidden_units (int): The number of hidden units.
18 |         dropout_rate (float): Dropout rate.
19 |
20 |     """
21 |
22 |     def __init__(self, idim, hidden_units, dropout_rate, activation=torch.nn.ReLU()):
23 |         """Construct a PositionwiseFeedForward object."""
24 |         super(PositionwiseFeedForward, self).__init__()
25 |         self.w_1 = torch.nn.Linear(idim, hidden_units)
26 |         self.w_2 = torch.nn.Linear(hidden_units, idim)
27 |         self.dropout = torch.nn.Dropout(dropout_rate)
28 |         self.activation = activation
29 |
30 |     def forward(self, x):
31 |         """Forward function."""
32 |         return self.w_2(self.dropout(self.activation(self.w_1(x))))
33 |
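
The layer expands to hidden_units and projects back to idim, so input and output shapes match; a quick check (a sketch for illustration, not repository code; the sizes are arbitrary):

import torch
from espnet.nets.pytorch_backend.transformer.positionwise_feed_forward import (
    PositionwiseFeedForward,
)

# Sketch: (batch, time, idim) goes in, the same shape comes out.
ff = PositionwiseFeedForward(idim=8, hidden_units=32, dropout_rate=0.1)
assert ff(torch.randn(2, 5, 8)).shape == (2, 5, 8)
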
--------------------------------------------------------------------------------
/espnet/nets/pytorch_backend/transformer/repeat.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 |
4 | # Copyright 2019 Shigeki Karita
5 | #  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)
6 |
7 | """Repeat the same layer definition."""
8 |
9 | import torch
10 |
11 |
12 | class MultiSequential(torch.nn.Sequential):
13 |     """Multi-input multi-output torch.nn.Sequential."""
14 |
15 |     def __init__(self, *args, layer_drop_rate=0.0):
16 |         """Initialize MultiSequential with layer_drop.
17 |
18 |         Args:
19 |             layer_drop_rate (float): Probability of dropping out each fn (layer).
20 |
21 |         """
22 |         super(MultiSequential, self).__init__(*args)
23 |         self.layer_drop_rate = layer_drop_rate
24 |
25 |     def forward(self, *args):
26 |         """Repeat."""
27 |         _probs = torch.empty(len(self)).uniform_()
28 |         for idx, m in enumerate(self):
29 |             if not self.training or (_probs[idx] >= self.layer_drop_rate):
30 |                 args = m(*args)
31 |         return args
32 |
33 |
34 | def repeat(N, fn, layer_drop_rate=0.0):
35 |     """Repeat module N times.
36 |
37 |     Args:
38 |         N (int): Number of times to repeat.
39 |         fn (Callable): Function to generate module.
40 |         layer_drop_rate (float): Probability of dropping out each fn (layer).
41 |
42 |     Returns:
43 |         MultiSequential: Repeated model instance.
44 |
45 |     """
46 |     return MultiSequential(*[fn(n) for n in range(N)], layer_drop_rate=layer_drop_rate)
47 |
--------------------------------------------------------------------------------
/espnet/nets/pytorch_backend/transformer/subsampling_without_posenc.py:
--------------------------------------------------------------------------------
1 | # Copyright 2020 Emiru Tsunoo
2 | #  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)
3 |
4 | """Subsampling layer definition."""
5 |
6 | import math
7 |
8 | import torch
9 |
10 |
11 | class Conv2dSubsamplingWOPosEnc(torch.nn.Module):
12 |     """Convolutional 2D subsampling.
13 |
14 |     Args:
15 |         idim (int): Input dimension.
16 |         odim (int): Output dimension.
17 |         dropout_rate (float): Dropout rate.
18 |         kernels (list): kernel sizes
19 |         strides (list): stride sizes
20 |
21 |     """
22 |
23 |     def __init__(self, idim, odim, dropout_rate, kernels, strides):
24 |         """Construct a Conv2dSubsamplingWOPosEnc object."""
25 |         assert len(kernels) == len(strides)
26 |         super().__init__()
27 |         conv = []
28 |         olen = idim
29 |         for i, (k, s) in enumerate(zip(kernels, strides)):
30 |             conv += [
31 |                 torch.nn.Conv2d(1 if i == 0 else odim, odim, k, s),
32 |                 torch.nn.ReLU(),
33 |             ]
34 |             olen = math.floor((olen - k) / s + 1)
35 |         self.conv = torch.nn.Sequential(*conv)
36 |         self.out = torch.nn.Linear(odim * olen, odim)
37 |         self.strides = strides
38 |         self.kernels = kernels
39 |
40 |     def forward(self, x, x_mask):
41 |         """Subsample x.
42 |
43 |         Args:
44 |             x (torch.Tensor): Input tensor (#batch, time, idim).
45 |             x_mask (torch.Tensor): Input mask (#batch, 1, time).
46 |
47 |         Returns:
48 |             torch.Tensor: Subsampled tensor (#batch, time', odim),
49 |                 where time' = time // 4.
50 |             torch.Tensor: Subsampled mask (#batch, 1, time'),
51 |                 where time' = time // 4.
52 |
53 |         """
54 |         x = x.unsqueeze(1)  # (b, c, t, f)
55 |         x = self.conv(x)
56 |         b, c, t, f = x.size()
57 |         x = self.out(x.transpose(1, 2).contiguous().view(b, t, c * f))
58 |         if x_mask is None:
59 |             return x, None
60 |         for k, s in zip(self.kernels, self.strides):
61 |             x_mask = x_mask[:, :, : -k + 1 : s]
62 |         return x, x_mask
63 |
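
Each conv layer shrinks an axis by olen = floor((olen - k) / s + 1), as in __init__ above; with the typical two kernel-3, stride-2 layers, 100 frames become 49 and then 24 (a sketch for illustration, not repository code):

import math

# Sketch: time-axis lengths through two (kernel=3, stride=2) layers.
olen = 100
for k, s in [(3, 2), (3, 2)]:
    olen = math.floor((olen - k) / s + 1)
print(olen)  # 24, roughly time // 4 as the forward() docstring says
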
28 | 29 | Returns: 30 | tuple[torch.Tensor, Any]: Tuple of 31 | torch.float32 scores for next token (n_vocab) 32 | and None 33 | 34 | """ 35 | return torch.tensor([1.0], device=x.device, dtype=x.dtype).expand(self.n), None 36 | 37 | def batch_score( 38 | self, ys: torch.Tensor, states: List[Any], xs: torch.Tensor 39 | ) -> Tuple[torch.Tensor, List[Any]]: 40 | """Score new token batch. 41 | 42 | Args: 43 | ys (torch.Tensor): torch.int64 prefix tokens (n_batch, ylen). 44 | states (List[Any]): Scorer states for prefix tokens. 45 | xs (torch.Tensor): 46 | The encoder feature that generates ys (n_batch, xlen, n_feat). 47 | 48 | Returns: 49 | tuple[torch.Tensor, List[Any]]: Tuple of 50 | batchfied scores for next token with shape of `(n_batch, n_vocab)` 51 | and next state list for ys. 52 | 53 | """ 54 | return ( 55 | torch.tensor([1.0], device=xs.device, dtype=xs.dtype).expand( 56 | ys.shape[0], self.n 57 | ), 58 | None, 59 | ) 60 | -------------------------------------------------------------------------------- /espnet/nets/scorers/uasr.py: -------------------------------------------------------------------------------- 1 | """ScorerInterface implementation for UASR.""" 2 | 3 | import numpy as np 4 | import torch 5 | 6 | from espnet.nets.ctc_prefix_score import CTCPrefixScore, CTCPrefixScoreTH 7 | from espnet.nets.scorers.ctc import CTCPrefixScorer 8 | 9 | 10 | class UASRPrefixScorer(CTCPrefixScorer): 11 | """Decoder interface wrapper for CTCPrefixScore.""" 12 | 13 | def __init__(self, eos: int): 14 | """Initialize class.""" 15 | self.eos = eos 16 | 17 | def init_state(self, x: torch.Tensor): 18 | """Get an initial state for decoding. 19 | 20 | Args: 21 | x (torch.Tensor): The encoded feature tensor 22 | 23 | Returns: initial state 24 | 25 | """ 26 | x[:, 0] = x[:, 0] - 100000000000 # simulate a no-blank CTC 27 | self.logp = ( 28 | torch.nn.functional.log_softmax(x, dim=1).detach().squeeze(0).cpu().numpy() 29 | ) 30 | # TODO(karita): use CTCPrefixScoreTH 31 | self.impl = CTCPrefixScore(self.logp, 0, self.eos, np) 32 | return 0, self.impl.initial_state() 33 | 34 | def batch_init_state(self, x: torch.Tensor): 35 | """Get an initial state for decoding. 
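
        Batched counterpart of ``init_state``: it prepares a
        ``CTCPrefixScoreTH`` over the log-softmax outputs, assuming a batch
        size of one (as noted inline below).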
36 | 37 | Args: 38 | x (torch.Tensor): The encoded feature tensor 39 | 40 | Returns: initial state 41 | 42 | """ 43 | x[:, 0] = x[:, 0] - 100000000000 # simulate a no-blank CTC 44 | logp = torch.nn.functional.log_softmax(x, dim=1).unsqueeze( 45 | 0 46 | ) # assuming batch_size = 1 47 | xlen = torch.tensor([logp.size(1)]) 48 | self.impl = CTCPrefixScoreTH(logp, xlen, 0, self.eos) 49 | return None 50 | -------------------------------------------------------------------------------- /espnet/optimizer/__init__.py: -------------------------------------------------------------------------------- 1 | """Initialize sub package.""" 2 | -------------------------------------------------------------------------------- /espnet/optimizer/parser.py: -------------------------------------------------------------------------------- 1 | """Common optimizer default config for multiple backends.""" 2 | 3 | 4 | def sgd(parser): 5 | """Add arguments.""" 6 | parser.add_argument("--lr", type=float, default=1.0, help="Learning rate") 7 | parser.add_argument("--weight-decay", type=float, default=0.0, help="Weight decay") 8 | return parser 9 | 10 | 11 | def adam(parser): 12 | """Add arguments.""" 13 | parser.add_argument("--lr", type=float, default=1e-3, help="Learning rate") 14 | parser.add_argument("--beta1", type=float, default=0.9, help="Beta1") 15 | parser.add_argument("--beta2", type=float, default=0.999, help="Beta2") 16 | parser.add_argument("--weight-decay", type=float, default=0.0, help="Weight decay") 17 | return parser 18 | 19 | 20 | def adadelta(parser): 21 | """Add arguments.""" 22 | parser.add_argument("--rho", type=float, default=0.95, help="Rho") 23 | parser.add_argument("--eps", type=float, default=1e-8, help="Eps") 24 | parser.add_argument("--weight-decay", type=float, default=0.0, help="Weight decay") 25 | return parser 26 | -------------------------------------------------------------------------------- /espnet/scheduler/__init__.py: -------------------------------------------------------------------------------- 1 | """Initialize sub package.""" 2 | -------------------------------------------------------------------------------- /espnet/scheduler/chainer.py: -------------------------------------------------------------------------------- 1 | """Chainer optimizer schedulers.""" 2 | 3 | from typing import List 4 | 5 | from chainer.optimizer import Optimizer 6 | 7 | from espnet.scheduler.scheduler import SchedulerInterface 8 | 9 | 10 | class ChainerScheduler: 11 | """Chainer optimizer scheduler.""" 12 | 13 | def __init__(self, schedulers: List[SchedulerInterface], optimizer: Optimizer): 14 | """Initialize class.""" 15 | self.schedulers = schedulers 16 | self.optimizer = optimizer 17 | self.init_values = dict() 18 | for s in self.schedulers: 19 | self.init_values[s.key] = getattr(self.optimizer, s.key) 20 | 21 | def step(self, n_iter: int): 22 | """Update optimizer by scheduling.""" 23 | for s in self.schedulers: 24 | new_val = self.init_values[s.key] * s.scale(n_iter) 25 | setattr(self.optimizer, s.key, new_val) 26 | -------------------------------------------------------------------------------- /espnet/scheduler/pytorch.py: -------------------------------------------------------------------------------- 1 | """PyTorch optimizer schedulers.""" 2 | 3 | from typing import List 4 | 5 | from torch.optim import Optimizer 6 | 7 | from espnet.scheduler.scheduler import SchedulerInterface 8 | 9 | 10 | class PyTorchScheduler: 11 | """PyTorch optimizer scheduler.""" 12 | 13 | def __init__(self, schedulers: 
List[SchedulerInterface], optimizer: Optimizer): 14 | """Initialize class.""" 15 | self.schedulers = schedulers 16 | self.optimizer = optimizer 17 | for s in self.schedulers: 18 | for group in optimizer.param_groups: 19 | group.setdefault("initial_" + s.key, group[s.key]) 20 | 21 | def step(self, n_iter: int): 22 | """Update optimizer by scheduling.""" 23 | for s in self.schedulers: 24 | for group in self.optimizer.param_groups: 25 | group[s.key] = group["initial_" + s.key] * s.scale(n_iter) 26 | -------------------------------------------------------------------------------- /espnet/st/__init__.py: -------------------------------------------------------------------------------- 1 | """Initialize sub package.""" 2 | -------------------------------------------------------------------------------- /espnet/st/pytorch_backend/__init__.py: -------------------------------------------------------------------------------- 1 | """Initialize sub package.""" 2 | -------------------------------------------------------------------------------- /espnet/transform/__init__.py: -------------------------------------------------------------------------------- 1 | """Initialize main package.""" 2 | -------------------------------------------------------------------------------- /espnet/transform/add_deltas.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def delta(feat, window): 5 | assert window > 0 6 | delta_feat = np.zeros_like(feat) 7 | for i in range(1, window + 1): 8 | delta_feat[:-i] += i * feat[i:] 9 | delta_feat[i:] += -i * feat[:-i] 10 | delta_feat[-i:] += i * feat[-1] 11 | delta_feat[:i] += -i * feat[0] 12 | delta_feat /= 2 * sum(i**2 for i in range(1, window + 1)) 13 | return delta_feat 14 | 15 | 16 | def add_deltas(x, window=2, order=2): 17 | feats = [x] 18 | for _ in range(order): 19 | feats.append(delta(feats[-1], window)) 20 | return np.concatenate(feats, axis=1) 21 | 22 | 23 | class AddDeltas(object): 24 | def __init__(self, window=2, order=2): 25 | self.window = window 26 | self.order = order 27 | 28 | def __repr__(self): 29 | return "{name}(window={window}, order={order})".format( 30 | name=self.__class__.__name__, window=self.window, order=self.order 31 | ) 32 | 33 | def __call__(self, x): 34 | return add_deltas(x, window=self.window, order=self.order) 35 | -------------------------------------------------------------------------------- /espnet/transform/channel_selector.py: -------------------------------------------------------------------------------- 1 | import numpy 2 | 3 | 4 | class ChannelSelector(object): 5 | """Select 1ch from multi-channel signal""" 6 | 7 | def __init__(self, train_channel="random", eval_channel=0, axis=1): 8 | self.train_channel = train_channel 9 | self.eval_channel = eval_channel 10 | self.axis = axis 11 | 12 | def __repr__(self): 13 | return ( 14 | "{name}(train_channel={train_channel}, " 15 | "eval_channel={eval_channel}, axis={axis})".format( 16 | name=self.__class__.__name__, 17 | train_channel=self.train_channel, 18 | eval_channel=self.eval_channel, 19 | axis=self.axis, 20 | ) 21 | ) 22 | 23 | def __call__(self, x, train=True): 24 | # Assuming x: [Time, Channel] by default 25 | 26 | if x.ndim <= self.axis: 27 | # If the dimension is insufficient, then unsqueeze 28 | # (e.g [Time] -> [Time, 1]) 29 | ind = tuple( 30 | slice(None) if i < x.ndim else None for i in range(self.axis + 1) 31 | ) 32 | x = x[ind] 33 | 34 | if train: 35 | channel = self.train_channel 36 | else: 37 | channel = 
self.eval_channel 38 | 39 | if channel == "random": 40 | ch = numpy.random.randint(0, x.shape[self.axis]) 41 | else: 42 | ch = channel 43 | 44 | ind = tuple(slice(None) if i != self.axis else ch for i in range(x.ndim)) 45 | return x[ind] 46 | -------------------------------------------------------------------------------- /espnet/transform/functional.py: -------------------------------------------------------------------------------- 1 | import inspect 2 | 3 | from espnet.transform.transform_interface import TransformInterface 4 | from espnet.utils.check_kwargs import check_kwargs 5 | 6 | 7 | class FuncTrans(TransformInterface): 8 | """Functional Transformation 9 | 10 | WARNING: 11 | Builtin or C/C++ functions may not work properly 12 | because this class heavily depends on the `inspect` module. 13 | 14 | Usage: 15 | 16 | >>> def foo_bar(x, a=1, b=2): 17 | ... '''Foo bar 18 | ... :param x: input 19 | ... :param int a: default 1 20 | ... :param int b: default 2 21 | ... ''' 22 | ... return x + a - b 23 | 24 | 25 | >>> class FooBar(FuncTrans): 26 | ... _func = foo_bar 27 | ... __doc__ = foo_bar.__doc__ 28 | """ 29 | 30 | _func = None 31 | 32 | def __init__(self, **kwargs): 33 | self.kwargs = kwargs 34 | check_kwargs(self.func, kwargs) 35 | 36 | def __call__(self, x): 37 | return self.func(x, **self.kwargs) 38 | 39 | @classmethod 40 | def add_arguments(cls, parser): 41 | fname = cls._func.__name__.replace("_", "-") 42 | group = parser.add_argument_group(fname + " transformation setting") 43 | for k, v in cls.default_params().items(): 44 | # TODO(karita): get help and choices from docstring? 45 | attr = k.replace("_", "-") 46 | group.add_argument(f"--{fname}-{attr}", default=v, type=type(v)) 47 | return parser 48 | 49 | @property 50 | def func(self): 51 | return type(self)._func 52 | 53 | @classmethod 54 | def default_params(cls): 55 | try: 56 | d = dict(inspect.signature(cls._func).parameters) 57 | except ValueError: 58 | d = dict() 59 | return { 60 | k: v.default for k, v in d.items() if v.default != inspect.Parameter.empty 61 | } 62 | 63 | def __repr__(self): 64 | params = self.default_params() 65 | params.update(**self.kwargs) 66 | ret = self.__class__.__name__ + "(" 67 | if len(params) == 0: 68 | return ret + ")" 69 | for k, v in params.items(): 70 | ret += "{}={}, ".format(k, v) 71 | return ret[:-2] + ")" 72 | -------------------------------------------------------------------------------- /espnet/transform/transform_interface.py: -------------------------------------------------------------------------------- 1 | # TODO(karita): add this to all the transform impl. 
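# A transform here is just a callable mapping one array to another;
# espnet.transform.transformation.Transformation composes a configured list of
# such transforms and applies them in sequence.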
2 | class TransformInterface: 3 | """Transform Interface""" 4 | 5 | def __call__(self, x): 6 | raise NotImplementedError("__call__ method is not implemented") 7 | 8 | @classmethod 9 | def add_arguments(cls, parser): 10 | return parser 11 | 12 | def __repr__(self): 13 | return self.__class__.__name__ + "()" 14 | 15 | 16 | class Identity(TransformInterface): 17 | """Identity Function""" 18 | 19 | def __call__(self, x): 20 | return x 21 | -------------------------------------------------------------------------------- /espnet/transform/wpe.py: -------------------------------------------------------------------------------- 1 | class WPE(object): 2 | def __init__( 3 | self, taps=10, delay=3, iterations=3, psd_context=0, statistics_mode="full" 4 | ): 5 | self.taps = taps 6 | self.delay = delay 7 | self.iterations = iterations 8 | self.psd_context = psd_context 9 | self.statistics_mode = statistics_mode 10 | 11 | def __repr__(self): 12 | return ( 13 | "{name}(taps={taps}, delay={delay}, " 14 | "iterations={iterations}, psd_context={psd_context}, " 15 | "statistics_mode={statistics_mode})".format( 16 | name=self.__class__.__name__, 17 | taps=self.taps, 18 | delay=self.delay, 19 | iterations=self.iterations, 20 | psd_context=self.psd_context, 21 | statistics_mode=self.statistics_mode, 22 | ) 23 | ) 24 | 25 | def __call__(self, xs): 26 | """Return enhanced signal 27 | 28 | :param np.ndarray xs: (Time, Channel, Frequency) 29 | :return: enhanced_xs 30 | :rtype: np.ndarray 31 | 32 | """ 33 | from nara_wpe.wpe import wpe 34 | 35 | # nara_wpe.wpe: (F, C, T) 36 | xs = wpe( 37 | xs.transpose((2, 1, 0)), 38 | taps=self.taps, 39 | delay=self.delay, 40 | iterations=self.iterations, 41 | psd_context=self.psd_context, 42 | statistics_mode=self.statistics_mode, 43 | ) 44 | return xs.transpose(2, 1, 0) 45 | -------------------------------------------------------------------------------- /espnet/tts/__init__.py: -------------------------------------------------------------------------------- 1 | """Initialize sub package.""" 2 | -------------------------------------------------------------------------------- /espnet/tts/pytorch_backend/__init__.py: -------------------------------------------------------------------------------- 1 | """Initialize sub package.""" 2 | -------------------------------------------------------------------------------- /espnet/utils/__init__.py: -------------------------------------------------------------------------------- 1 | """Initialize sub package.""" 2 | -------------------------------------------------------------------------------- /espnet/utils/check_kwargs.py: -------------------------------------------------------------------------------- 1 | import inspect 2 | 3 | 4 | def check_kwargs(func, kwargs, name=None): 5 | """Check that kwargs are valid for func 6 | 7 | If kwargs are invalid, raise TypeError, just as Python does by default 8 | :param function func: function to be validated 9 | :param dict kwargs: keyword arguments for func 10 | :param str name: name used in TypeError (default is func name) 11 | """ 12 | try: 13 | params = inspect.signature(func).parameters 14 | except ValueError: 15 | return 16 | if name is None: 17 | name = func.__name__ 18 | for k in kwargs.keys(): 19 | if k not in params: 20 | raise TypeError(f"{name}() got an unexpected keyword argument '{k}'") 21 | -------------------------------------------------------------------------------- /espnet/utils/cli_utils.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from 
collections.abc import Sequence 3 | from distutils.util import strtobool as dist_strtobool 4 | 5 | import numpy 6 | 7 | 8 | def strtobool(x): 9 | # distutils.util.strtobool returns an integer, which is confusing, so cast it to bool 10 | return bool(dist_strtobool(x)) 11 | 12 | 13 | def get_commandline_args(): 14 | extra_chars = [ 15 | " ", 16 | ";", 17 | "&", 18 | "(", 19 | ")", 20 | "|", 21 | "^", 22 | "<", 23 | ">", 24 | "?", 25 | "*", 26 | "[", 27 | "]", 28 | "$", 29 | "`", 30 | '"', 31 | "\\", 32 | "!", 33 | "{", 34 | "}", 35 | ] 36 | 37 | # Escape the extra characters for shell 38 | argv = [ 39 | arg.replace("'", "'\\''") 40 | if all(char not in arg for char in extra_chars) 41 | else "'" + arg.replace("'", "'\\''") + "'" 42 | for arg in sys.argv 43 | ] 44 | 45 | return sys.executable + " " + " ".join(argv) 46 | 47 | 48 | def is_scipy_wav_style(value): 49 | # Check whether value is a Tuple[int, numpy.ndarray] 50 | return ( 51 | isinstance(value, Sequence) 52 | and len(value) == 2 53 | and isinstance(value[0], int) 54 | and isinstance(value[1], numpy.ndarray) 55 | ) 56 | 57 | 58 | def assert_scipy_wav_style(value): 59 | assert is_scipy_wav_style( 60 | value 61 | ), "Must be Tuple[int, numpy.ndarray], but got {}".format( 62 | type(value) 63 | if not isinstance(value, Sequence) 64 | else "{}[{}]".format(type(value), ", ".join(str(type(v)) for v in value)) 65 | ) 66 | -------------------------------------------------------------------------------- /espnet/utils/deterministic_utils.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | 4 | import chainer 5 | import torch 6 | 7 | 8 | def set_deterministic_pytorch(args): 9 | """Ensures pytorch produces deterministic results depending on the program arguments 10 | 11 | :param Namespace args: The program arguments 12 | """ 13 | # seed setting 14 | torch.manual_seed(args.seed) 15 | 16 | # debug mode setting 17 | # 0 would be fastest, but 1 seems to be reasonable 18 | # considering reproducibility 19 | # remove type check 20 | torch.backends.cudnn.deterministic = True 21 | torch.backends.cudnn.benchmark = ( 22 | False # https://github.com/pytorch/pytorch/issues/6351 23 | ) 24 | if args.debugmode < 2: 25 | chainer.config.type_check = False 26 | logging.info("torch type check is disabled") 27 | # use deterministic computation or not 28 | if args.debugmode < 1: 29 | torch.backends.cudnn.deterministic = False 30 | torch.backends.cudnn.benchmark = True 31 | logging.info("torch cudnn deterministic is disabled") 32 | 33 | 34 | def set_deterministic_chainer(args): 35 | """Ensures chainer produces deterministic results depending on the program arguments 36 | 37 | :param Namespace args: The program arguments 38 | """ 39 | # seed setting (chainer seed may not need it) 40 | os.environ["CHAINER_SEED"] = str(args.seed) 41 | logging.info("chainer seed = " + os.environ["CHAINER_SEED"]) 42 | 43 | # debug mode setting 44 | # 0 would be fastest, but 1 seems to be reasonable 45 | # considering reproducibility 46 | # remove type check 47 | if args.debugmode < 2: 48 | chainer.config.type_check = False 49 | logging.info("chainer type check is disabled") 50 | # use deterministic computation or not 51 | if args.debugmode < 1: 52 | chainer.config.cudnn_deterministic = False 53 | logging.info("chainer cudnn deterministic is disabled") 54 | else: 55 | chainer.config.cudnn_deterministic = True 56 | -------------------------------------------------------------------------------- /espnet/utils/dynamic_import.py: 
-------------------------------------------------------------------------------- 1 | import importlib 2 | 3 | 4 | def dynamic_import(import_path, alias=dict()): 5 | """Dynamically import a module and class 6 | 7 | :param str import_path: syntax 'module_name:class_name' 8 | e.g., 'espnet.transform.add_deltas:AddDeltas' 9 | :param dict alias: shortcut for registered class 10 | :return: imported class 11 | """ 12 | if import_path not in alias and ":" not in import_path: 13 | raise ValueError( 14 | "import_path should be one of {} or " 15 | 'include ":", e.g. "espnet.transform.add_deltas:AddDeltas" : ' 16 | "{}".format(set(alias), import_path) 17 | ) 18 | if ":" not in import_path: 19 | import_path = alias[import_path] 20 | 21 | module_name, objname = import_path.split(":") 22 | m = importlib.import_module(module_name) 23 | return getattr(m, objname) 24 | -------------------------------------------------------------------------------- /espnet/utils/fill_missing_args.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Copyright 2018 Nagoya University (Tomoki Hayashi) 4 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 5 | 6 | import argparse 7 | import logging 8 | 9 | 10 | def fill_missing_args(args, add_arguments): 11 | """Fill missing arguments in args. 12 | 13 | Args: 14 | args (Namespace or None): Namespace containing hyperparameters. 15 | add_arguments (function): Function to add arguments. 16 | 17 | Returns: 18 | Namespace: Arguments with missing values filled in with defaults. 19 | 20 | Examples: 21 | >>> from argparse import Namespace 22 | >>> from espnet.nets.pytorch_backend.e2e_tts_tacotron2 import Tacotron2 23 | >>> args = Namespace() 24 | >>> fill_missing_args(args, Tacotron2.add_arguments_fn) 25 | Namespace(aconv_chans=32, aconv_filts=15, adim=512, atype='location', ...) 26 | 27 | """ 28 | # check argument type 29 | assert isinstance(args, argparse.Namespace) or args is None 30 | assert callable(add_arguments) 31 | 32 | # get default arguments 33 | default_args, _ = add_arguments(argparse.ArgumentParser()).parse_known_args() 34 | 35 | # convert to dict 36 | args = {} if args is None else vars(args) 37 | default_args = vars(default_args) 38 | 39 | for key, value in default_args.items(): 40 | if key not in args: 41 | logging.info( 42 | 'attribute "%s" does not exist. use default %s.' 
% (key, str(value)) 43 | ) 44 | args[key] = value 45 | 46 | return argparse.Namespace(**args) 47 | -------------------------------------------------------------------------------- /espnet/utils/training/__init__.py: -------------------------------------------------------------------------------- 1 | """Initialize sub package.""" 2 | -------------------------------------------------------------------------------- /espnet/utils/training/evaluator.py: -------------------------------------------------------------------------------- 1 | from chainer.training.extensions import Evaluator 2 | 3 | from espnet.utils.training.tensorboard_logger import TensorboardLogger 4 | 5 | 6 | class BaseEvaluator(Evaluator): 7 | """Base Evaluator in ESPnet""" 8 | 9 | def __call__(self, trainer=None): 10 | ret = super().__call__(trainer) 11 | try: 12 | if trainer is not None: 13 | # force tensorboard to report evaluation log 14 | tb_logger = trainer.get_extension(TensorboardLogger.default_name) 15 | tb_logger(trainer) 16 | except ValueError: 17 | pass 18 | return ret 19 | -------------------------------------------------------------------------------- /espnet/utils/training/tensorboard_logger.py: -------------------------------------------------------------------------------- 1 | from chainer.training.extension import Extension 2 | 3 | 4 | class TensorboardLogger(Extension): 5 | """A tensorboard logger extension""" 6 | 7 | default_name = "espnet_tensorboard_logger" 8 | 9 | def __init__( 10 | self, logger, att_reporter=None, ctc_reporter=None, entries=None, epoch=0 11 | ): 12 | """Init the extension 13 | 14 | :param SummaryWriter logger: The logger to use 15 | :param PlotAttentionReporter att_reporter: The (optional) PlotAttentionReporter 16 | :param entries: The entries to watch 17 | :param int epoch: The starting epoch 18 | """ 19 | self._entries = entries 20 | self._att_reporter = att_reporter 21 | self._ctc_reporter = ctc_reporter 22 | self._logger = logger 23 | self._epoch = epoch 24 | 25 | def __call__(self, trainer): 26 | """Updates the events file with the new values 27 | 28 | :param trainer: The trainer 29 | """ 30 | observation = trainer.observation 31 | for k, v in observation.items(): 32 | if (self._entries is not None) and (k not in self._entries): 33 | continue 34 | if k is not None and v is not None: 35 | if "cupy" in str(type(v)): 36 | v = v.get() 37 | if "cupy" in str(type(k)): 38 | k = k.get() 39 | self._logger.add_scalar(k, v, trainer.updater.iteration) 40 | if ( 41 | self._att_reporter is not None 42 | and trainer.updater.get_iterator("main").epoch > self._epoch 43 | ): 44 | self._epoch = trainer.updater.get_iterator("main").epoch 45 | self._att_reporter.log_attentions(self._logger, trainer.updater.iteration) 46 | if ( 47 | self._ctc_reporter is not None 48 | and trainer.updater.get_iterator("main").epoch > self._epoch 49 | ): 50 | self._epoch = trainer.updater.get_iterator("main").epoch 51 | self._ctc_reporter.log_ctc_probs(self._logger, trainer.updater.iteration) 52 | -------------------------------------------------------------------------------- /espnet/utils/training/train_utils.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | import chainer 4 | 5 | 6 | def check_early_stop(trainer, epochs): 7 | """Checks an early stopping trigger and warns the user if it's the case 8 | 9 | :param trainer: The trainer used for training 10 | :param epochs: The maximum number of epochs 11 | """ 12 | end_epoch = 
trainer.updater.get_iterator("main").epoch 13 | if end_epoch < (epochs - 1): 14 | logging.warning( 15 | "Hit early stop at epoch " 16 | + str(end_epoch) 17 | + "\nYou can change the patience or set it to 0 to run all epochs" 18 | ) 19 | 20 | 21 | def set_early_stop(trainer, args, is_lm=False): 22 | """Sets the early stop trigger given the program arguments 23 | 24 | :param trainer: The trainer used for training 25 | :param args: The program arguments 26 | :param is_lm: If the trainer is for a LM (epoch instead of epochs) 27 | """ 28 | patience = args.patience 29 | criterion = args.early_stop_criterion 30 | epochs = args.epoch if is_lm else args.epochs 31 | mode = "max" if "acc" in criterion else "min" 32 | if patience > 0: 33 | trainer.stop_trigger = chainer.training.triggers.EarlyStoppingTrigger( 34 | monitor=criterion, 35 | mode=mode, 36 | patients=patience, 37 | max_trigger=(epochs, "epoch"), 38 | ) 39 | -------------------------------------------------------------------------------- /espnet/version.txt: -------------------------------------------------------------------------------- 1 | 202308 2 | -------------------------------------------------------------------------------- /espnet2/__init__.py: -------------------------------------------------------------------------------- 1 | """Initialize espnet2 package.""" 2 | 3 | # from espnet import __version__ # NOQA 4 | -------------------------------------------------------------------------------- /espnet2/asr/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/espnet2/asr/__init__.py -------------------------------------------------------------------------------- /espnet2/asr/decoder/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/espnet2/asr/decoder/__init__.py -------------------------------------------------------------------------------- /espnet2/asr/decoder/abs_decoder.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import Tuple 3 | 4 | import torch 5 | 6 | from espnet.nets.scorer_interface import ScorerInterface 7 | 8 | 9 | class AbsDecoder(torch.nn.Module, ScorerInterface, ABC): 10 | @abstractmethod 11 | def forward( 12 | self, 13 | hs_pad: torch.Tensor, 14 | hlens: torch.Tensor, 15 | ys_in_pad: torch.Tensor, 16 | ys_in_lens: torch.Tensor, 17 | ) -> Tuple[torch.Tensor, torch.Tensor]: 18 | raise NotImplementedError 19 | -------------------------------------------------------------------------------- /espnet2/asr/encoder/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/espnet2/asr/encoder/__init__.py -------------------------------------------------------------------------------- /espnet2/asr/encoder/abs_encoder.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import Optional, Tuple 3 | 4 | import torch 5 | 6 | 7 | class AbsEncoder(torch.nn.Module, ABC): 8 | @abstractmethod 9 | def output_size(self) -> int: 10 | raise NotImplementedError 11 | 12 | @abstractmethod 13 | def forward( 14 | self, 15 | xs_pad: 
torch.Tensor, 16 | ilens: torch.Tensor, 17 | prev_states: torch.Tensor = None, 18 | ) -> Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]: 19 | raise NotImplementedError 20 | -------------------------------------------------------------------------------- /espnet2/asr/frontend/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/espnet2/asr/frontend/__init__.py -------------------------------------------------------------------------------- /espnet2/asr/frontend/abs_frontend.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import Tuple 3 | 4 | import torch 5 | 6 | 7 | class AbsFrontend(torch.nn.Module, ABC): 8 | @abstractmethod 9 | def output_size(self) -> int: 10 | raise NotImplementedError 11 | 12 | @abstractmethod 13 | def forward( 14 | self, input: torch.Tensor, input_lengths: torch.Tensor 15 | ) -> Tuple[torch.Tensor, torch.Tensor]: 16 | raise NotImplementedError 17 | -------------------------------------------------------------------------------- /espnet2/asr/layers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/espnet2/asr/layers/__init__.py -------------------------------------------------------------------------------- /espnet2/asr/postencoder/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/espnet2/asr/postencoder/__init__.py -------------------------------------------------------------------------------- /espnet2/asr/postencoder/abs_postencoder.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import Tuple 3 | 4 | import torch 5 | 6 | 7 | class AbsPostEncoder(torch.nn.Module, ABC): 8 | @abstractmethod 9 | def output_size(self) -> int: 10 | raise NotImplementedError 11 | 12 | @abstractmethod 13 | def forward( 14 | self, input: torch.Tensor, input_lengths: torch.Tensor 15 | ) -> Tuple[torch.Tensor, torch.Tensor]: 16 | raise NotImplementedError 17 | -------------------------------------------------------------------------------- /espnet2/asr/preencoder/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/espnet2/asr/preencoder/__init__.py -------------------------------------------------------------------------------- /espnet2/asr/preencoder/abs_preencoder.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import Tuple 3 | 4 | import torch 5 | 6 | 7 | class AbsPreEncoder(torch.nn.Module, ABC): 8 | @abstractmethod 9 | def output_size(self) -> int: 10 | raise NotImplementedError 11 | 12 | @abstractmethod 13 | def forward( 14 | self, input: torch.Tensor, input_lengths: torch.Tensor 15 | ) -> Tuple[torch.Tensor, torch.Tensor]: 16 | raise NotImplementedError 17 | -------------------------------------------------------------------------------- /espnet2/asr/preencoder/linear.py: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # 2021, Carnegie Mellon University; Xuankai Chang 3 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 4 | 5 | """Linear Projection.""" 6 | 7 | from typing import Tuple 8 | 9 | import torch 10 | from typeguard import check_argument_types 11 | 12 | from espnet2.asr.preencoder.abs_preencoder import AbsPreEncoder 13 | 14 | 15 | class LinearProjection(AbsPreEncoder): 16 | """Linear Projection Preencoder.""" 17 | 18 | def __init__(self, input_size: int, output_size: int, dropout: float = 0.0): 19 | """Initialize the module.""" 20 | assert check_argument_types() 21 | super().__init__() 22 | 23 | self.output_dim = output_size 24 | self.linear_out = torch.nn.Linear(input_size, output_size) 25 | self.dropout = torch.nn.Dropout(dropout) 26 | 27 | def forward( 28 | self, input: torch.Tensor, input_lengths: torch.Tensor 29 | ) -> Tuple[torch.Tensor, torch.Tensor]: 30 | """Forward.""" 31 | output = self.linear_out(self.dropout(input)) 32 | return output, input_lengths # no state in this layer 33 | 34 | def output_size(self) -> int: 35 | """Get the output size.""" 36 | return self.output_dim 37 | -------------------------------------------------------------------------------- /espnet2/asr/specaug/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/espnet2/asr/specaug/__init__.py -------------------------------------------------------------------------------- /espnet2/asr/specaug/abs_specaug.py: -------------------------------------------------------------------------------- 1 | from typing import Optional, Tuple 2 | 3 | import torch 4 | 5 | 6 | class AbsSpecAug(torch.nn.Module): 7 | """Abstract class for the augmentation of spectrogram 8 | 9 | The process-flow: 10 | 11 | Frontend -> SpecAug -> Normalization -> Encoder -> Decoder 12 | """ 13 | 14 | def forward( 15 | self, x: torch.Tensor, x_lengths: torch.Tensor = None 16 | ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: 17 | raise NotImplementedError 18 | -------------------------------------------------------------------------------- /espnet2/asr/state_spaces/__init__.py: -------------------------------------------------------------------------------- 1 | """Initialize sub package.""" 2 | -------------------------------------------------------------------------------- /espnet2/asr/state_spaces/ff.py: -------------------------------------------------------------------------------- 1 | # This code is derived from https://github.com/HazyResearch/state-spaces 2 | 3 | """Implementation of FFN block in the style of Transformers.""" 4 | 5 | from functools import partial 6 | 7 | from torch import nn 8 | 9 | from espnet2.asr.state_spaces.base import SequenceModule 10 | from espnet2.asr.state_spaces.components import DropoutNd, LinearActivation 11 | 12 | 13 | class FF(SequenceModule): 14 | def __init__( 15 | self, 16 | d_input, 17 | expand=2, 18 | d_output=None, 19 | transposed=False, 20 | activation="gelu", 21 | initializer=None, 22 | dropout=0.0, 23 | tie_dropout=False, 24 | ): 25 | super().__init__() 26 | self.d_output = d_input if d_output is None else d_output 27 | self.transposed = transposed 28 | d_inner = expand * d_input 29 | 30 | linear1 = LinearActivation( 31 | d_input, 32 | d_inner, 33 | transposed=transposed, 34 | activation=activation, 35 | initializer=initializer, 36 | activate=True, 
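            # first projection widens d_input to d_inner (= expand * d_input);
            # the activation is applied inside LinearActivation (activate=True)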
37 | ) 38 | dropout_cls = ( 39 | partial(DropoutNd, transposed=self.transposed) 40 | if tie_dropout 41 | else nn.Dropout 42 | ) 43 | # dropout_cls = nn.Dropout2d if self.transposed else nn.Dropout 44 | drop = dropout_cls(dropout) if dropout > 0.0 else nn.Identity() 45 | 46 | linear2 = LinearActivation( 47 | d_inner, 48 | self.d_output, 49 | transposed=transposed, 50 | activation=None, 51 | initializer=initializer, 52 | activate=False, 53 | ) 54 | 55 | self.ff = nn.Sequential( 56 | linear1, 57 | drop, 58 | linear2, 59 | ) 60 | 61 | def forward(self, x, *args, **kwargs): 62 | return self.ff(x), None 63 | 64 | def step(self, x, state, **kwargs): 65 | # x: [batch, d_input] 66 | if self.transposed: 67 | # expects: [batch, d_input, seq_len] 68 | return self.ff(x.unsqueeze(-1)).squeeze(-1), state 69 | else: 70 | return self.ff(x), state 71 | -------------------------------------------------------------------------------- /espnet2/asr/state_spaces/registry.py: -------------------------------------------------------------------------------- 1 | layer = { 2 | "s4": "espnet2.asr.state_spaces.s4.S4", 3 | "ff": "espnet2.asr.state_spaces.ff.FF", 4 | "mha": "espnet2.asr.state_spaces.attention.MultiHeadedAttention", 5 | } 6 | -------------------------------------------------------------------------------- /espnet2/asr/transducer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/espnet2/asr/transducer/__init__.py -------------------------------------------------------------------------------- /espnet2/asr/transducer/rnnt_multi_blank/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from espnet2.asr.transducer.rnnt_multi_blank.rnnt_multi_blank import ( 16 | MultiblankRNNTLossNumba, 17 | ) 18 | 19 | __all__ = ["MultiblankRNNTLossNumba"] 20 | -------------------------------------------------------------------------------- /espnet2/asr/transducer/rnnt_multi_blank/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | -------------------------------------------------------------------------------- /espnet2/asr/transducer/rnnt_multi_blank/utils/cpu_utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # 15 | # Copyright 2018-2019, Mingkun Huang 16 | # 17 | # Licensed under the Apache License, Version 2.0 (the "License"); 18 | # you may not use this file except in compliance with the License. 19 | # You may obtain a copy of the License at 20 | # 21 | # http://www.apache.org/licenses/LICENSE-2.0 22 | # 23 | # Unless required by applicable law or agreed to in writing, software 24 | # distributed under the License is distributed on an "AS IS" BASIS, 25 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 26 | # See the License for the specific language governing permissions and 27 | # limitations under the License. 28 | -------------------------------------------------------------------------------- /espnet2/asr/transducer/rnnt_multi_blank/utils/cuda_utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # 15 | # Copyright 2018-2019, Mingkun Huang 16 | # 17 | # Licensed under the Apache License, Version 2.0 (the "License"); 18 | # you may not use this file except in compliance with the License. 19 | # You may obtain a copy of the License at 20 | # 21 | # http://www.apache.org/licenses/LICENSE-2.0 22 | # 23 | # Unless required by applicable law or agreed to in writing, software 24 | # distributed under the License is distributed on an "AS IS" BASIS, 25 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 26 | # See the License for the specific language governing permissions and 27 | # limitations under the License. 28 | -------------------------------------------------------------------------------- /espnet2/asr/transducer/rnnt_multi_blank/utils/global_constants.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # 15 | # Copyright 2018-2019, Mingkun Huang 16 | # 17 | # Licensed under the Apache License, Version 2.0 (the "License"); 18 | # you may not use this file except in compliance with the License. 19 | # You may obtain a copy of the License at 20 | # 21 | # http://www.apache.org/licenses/LICENSE-2.0 22 | # 23 | # Unless required by applicable law or agreed to in writing, software 24 | # distributed under the License is distributed on an "AS IS" BASIS, 25 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 26 | # See the License for the specific language governing permissions and 27 | # limitations under the License. 28 | 29 | 30 | import enum 31 | 32 | import numpy as np 33 | from numba import float32 34 | 35 | # Internal globals 36 | _THREADS_PER_BLOCK = 32 37 | _WARP_SIZE = 32 38 | _DTYPE = float32 39 | 40 | # Constants 41 | FP32_INF = np.inf 42 | FP32_NEG_INF = -np.inf 43 | THRESHOLD = 1e-1 44 | 45 | """ 46 | Getters 47 | """ 48 | 49 | 50 | def threads_per_block(): 51 | global _THREADS_PER_BLOCK 52 | return _THREADS_PER_BLOCK 53 | 54 | 55 | def warp_size(): 56 | global _WARP_SIZE 57 | return _WARP_SIZE 58 | 59 | 60 | def dtype(): 61 | global _DTYPE 62 | return _DTYPE 63 | 64 | 65 | # RNNT STATUS 66 | class RNNTStatus(enum.Enum): 67 | RNNT_STATUS_SUCCESS = 0 68 | RNNT_STATUS_INVALID_VALUE = 1 69 | -------------------------------------------------------------------------------- /espnet2/asr_transducer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/espnet2/asr_transducer/__init__.py -------------------------------------------------------------------------------- /espnet2/asr_transducer/decoder/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/espnet2/asr_transducer/decoder/__init__.py -------------------------------------------------------------------------------- /espnet2/asr_transducer/decoder/blocks/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/espnet2/asr_transducer/decoder/blocks/__init__.py -------------------------------------------------------------------------------- /espnet2/asr_transducer/decoder/modules/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/espnet2/asr_transducer/decoder/modules/__init__.py -------------------------------------------------------------------------------- /espnet2/asr_transducer/decoder/modules/mega/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/espnet2/asr_transducer/decoder/modules/mega/__init__.py -------------------------------------------------------------------------------- /espnet2/asr_transducer/decoder/modules/rwkv/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/espnet2/asr_transducer/decoder/modules/rwkv/__init__.py -------------------------------------------------------------------------------- /espnet2/asr_transducer/decoder/modules/rwkv/cuda/wkv_op.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | Based on https://github.com/BlinkDL/RWKV-LM/blob/main/RWKV-v4/cuda/wkv_op.cpp 3 | Function signatures were modified based on https://github.com/huggingface/transformers/blob/main/src/transformers/kernels/rwkv/wkv_op.cpp 4 | 5 | */ 6 | 7 | #include <torch/extension.h> 8 | 9 | void cuda_forward(int B, int T, int C, float *w, float *u, float *k, float *v, float *y); 10 | 11 | void cuda_backward(int B, int T, int C, float *w, float *u, float *k, float *v, float *y, float *gy, float *gw, float *gu, float *gk, float *gv); 12 | 13 | void forward(torch::Tensor &w, torch::Tensor &u, torch::Tensor &k, torch::Tensor &v, torch::Tensor &y) { 14 | const int B = k.size(0); 15 | const int T = k.size(1); 16 | const int C = k.size(2); 17 | 18 | cuda_forward(B, T, C, w.data_ptr<float>(), u.data_ptr<float>(), k.data_ptr<float>(), v.data_ptr<float>(), y.data_ptr<float>()); 19 | } 20 | 21 | void backward(torch::Tensor &w, torch::Tensor &u, torch::Tensor &k, torch::Tensor &v, torch::Tensor &y, torch::Tensor &gy, torch::Tensor &gw, torch::Tensor &gu, torch::Tensor &gk, torch::Tensor &gv) { 22 | const int B = k.size(0); 23 | const int T = k.size(1); 24 | const int C = k.size(2); 25 | 26 | cuda_backward(B, T, C, w.data_ptr<float>(), u.data_ptr<float>(), k.data_ptr<float>(), v.data_ptr<float>(), y.data_ptr<float>(), gy.data_ptr<float>(), gw.data_ptr<float>(), gu.data_ptr<float>(), gk.data_ptr<float>(), gv.data_ptr<float>()); 27 | } 28 | 29 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 30 | m.def("forward", &forward, "wkv forward"); 31 | m.def("backward", &backward, "wkv backward"); 32 | } 33 | 34 | TORCH_LIBRARY(wkv, m) { 35 | m.def("forward", forward); 36 | m.def("backward", backward); 37 | } 38 | -------------------------------------------------------------------------------- /espnet2/asr_transducer/encoder/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/espnet2/asr_transducer/encoder/__init__.py -------------------------------------------------------------------------------- /espnet2/asr_transducer/encoder/blocks/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/espnet2/asr_transducer/encoder/blocks/__init__.py -------------------------------------------------------------------------------- /espnet2/asr_transducer/encoder/modules/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/espnet2/asr_transducer/encoder/modules/__init__.py -------------------------------------------------------------------------------- 
/espnet2/asr_transducer/frontend/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/espnet2/asr_transducer/frontend/__init__.py -------------------------------------------------------------------------------- /espnet2/asvspoof/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/espnet2/asvspoof/__init__.py -------------------------------------------------------------------------------- /espnet2/asvspoof/decoder/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/espnet2/asvspoof/decoder/__init__.py -------------------------------------------------------------------------------- /espnet2/asvspoof/decoder/abs_decoder.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import Tuple 3 | 4 | import torch 5 | 6 | 7 | class AbsDecoder(torch.nn.Module, ABC): 8 | @abstractmethod 9 | def forward( 10 | self, 11 | input: torch.Tensor, 12 | ilens: torch.Tensor, 13 | ) -> Tuple[torch.Tensor, torch.Tensor]: 14 | raise NotImplementedError 15 | -------------------------------------------------------------------------------- /espnet2/asvspoof/decoder/linear_decoder.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | import torch 4 | 5 | from espnet2.asvspoof.decoder.abs_decoder import AbsDecoder 6 | 7 | 8 | class LinearDecoder(AbsDecoder): 9 | """Linear decoder for ASV spoofing detection""" 10 | 11 | def __init__( 12 | self, 13 | encoder_output_size: int, 14 | ): 15 | super().__init__() 16 | # TODO1 (checkpoint3): initialize a linear projection layer 17 | 18 | def forward(self, input: torch.Tensor, ilens: Optional[torch.Tensor]): 19 | """Forward.
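
        The TODO steps below sketch the intended flow: mean-pool the hidden
        states over time, apply the linear projection, and return the result.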
20 | Args: 21 | input (torch.Tensor): hidden_space [Batch, T, F] 22 | ilens (torch.Tensor): input lengths [Batch] 23 | """ 24 | # TODO2 (checkpoint3): compute mean over time-domain (dimension 1) 25 | 26 | # TODO3 (checkpoint3): apply the projection layer 27 | 28 | # TODO4 (checkpoint3): change the return value 29 | return None 30 | -------------------------------------------------------------------------------- /espnet2/asvspoof/loss/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/espnet2/asvspoof/loss/__init__.py -------------------------------------------------------------------------------- /espnet2/asvspoof/loss/abs_loss.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | 3 | import torch 4 | 5 | EPS = torch.finfo(torch.get_default_dtype()).eps 6 | 7 | 8 | class AbsASVSpoofLoss(torch.nn.Module, ABC): 9 | """Base class for all ASV Spoofing loss modules.""" 10 | 11 | # the name will be the key that appears in the reporter 12 | @property 13 | def name(self) -> str: 14 | raise NotImplementedError 15 | 16 | @abstractmethod 17 | def forward( 18 | self, 19 | ref, 20 | inf, 21 | ) -> torch.Tensor: 22 | # the return tensor should have shape (batch,) 23 | raise NotImplementedError 24 | 25 | @abstractmethod 26 | def score( 27 | self, 28 | pred, 29 | ) -> torch.Tensor: 30 | raise NotImplementedError 31 | -------------------------------------------------------------------------------- /espnet2/asvspoof/loss/binary_loss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from espnet2.asvspoof.loss.abs_loss import AbsASVSpoofLoss 4 | from espnet.nets.pytorch_backend.nets_utils import to_device 5 | 6 | 7 | class ASVSpoofBinaryLoss(AbsASVSpoofLoss): 8 | """Binary loss for ASV Spoofing.""" 9 | 10 | def __init__( 11 | self, 12 | weight: float = 1.0, 13 | ): 14 | super().__init__() 15 | self.weight = weight 16 | self.sigmoid = torch.nn.Sigmoid() 17 | self.loss = torch.nn.BCELoss(reduction="mean") 18 | 19 | def forward(self, pred: torch.Tensor, label: torch.Tensor, **kwargs): 20 | """Forward.
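
        Example (illustrative shapes, matching the Args below):

        >>> loss_fn = ASVSpoofBinaryLoss()
        >>> pred = torch.randn(8, 2)             # raw model outputs
        >>> label = torch.randint(0, 2, (8, 2))  # 0/1 ground truth
        >>> loss = loss_fn(pred, label)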
21 | Args: 22 | pred (torch.Tensor): prediction probability [Batch, 2] 23 | label (torch.Tensor): ground truth label [Batch, 2] 24 | """ 25 | loss = self.loss(self.sigmoid(pred.view(-1)), label.view(-1).float()) 26 | return loss 27 | 28 | def score(self, pred: torch.Tensor): 29 | return pred 30 | -------------------------------------------------------------------------------- /espnet2/asvspoof/loss/oc_softmax_loss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from espnet2.asvspoof.loss.abs_loss import AbsASVSpoofLoss 4 | from espnet.nets.pytorch_backend.nets_utils import to_device 5 | 6 | 7 | class ASVSpoofOCSoftmaxLoss(AbsASVSpoofLoss): 8 | """One-class softmax loss for ASV Spoofing.""" 9 | 10 | def __init__( 11 | self, 12 | weight: float = 1.0, 13 | enc_dim: int = 128, 14 | m_real: float = 0.5, 15 | m_fake: float = 0.2, 16 | alpha: float = 20.0, 17 | ): 18 | super().__init__() 19 | self.weight = weight 20 | self.feat_dim = enc_dim 21 | self.m_real = m_real 22 | self.m_fake = m_fake 23 | self.alpha = alpha 24 | self.center = torch.nn.Parameter(torch.randn(1, self.feat_dim)) 25 | torch.nn.init.kaiming_uniform_(self.center, 0.25) 26 | self.softplus = torch.nn.Softplus() 27 | 28 | def forward(self, label: torch.Tensor, emb: torch.Tensor, **kwargs): 29 | """Forward. 30 | Args: 31 | label (torch.Tensor): ground truth label [Batch, 1] 32 | emb (torch.Tensor): encoder embedding output [Batch, T, enc_dim] 33 | """ 34 | emb = torch.mean(emb, dim=1) 35 | w = torch.nn.functional.normalize(self.center, p=2, dim=1) 36 | x = torch.nn.functional.normalize(emb, p=2, dim=1) 37 | 38 | # TODO1 (exercise 2): compute scores based on w and x 39 | 40 | # TODO2 (exercise 2): calculate the score bias based on m_real and m_fake 41 | 42 | # TODO3 (exercise 2): apply alpha and softplus 43 | 44 | # TODO4 (exercise 2): return the final loss 45 | return None 46 | 47 | def score(self, emb: torch.Tensor): 48 | """Prediction. 49 | Args: 50 | emb (torch.Tensor): encoder embedding output [Batch, T, enc_dim] 51 | """ 52 | emb = torch.mean(emb, dim=1) 53 | w = torch.nn.functional.normalize(self.center, p=2, dim=1) 54 | x = torch.nn.functional.normalize(emb, p=2, dim=1) 55 | 56 | # TODO5 (exercise 2): compute scores 57 | -------------------------------------------------------------------------------- /espnet2/bin/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/espnet2/bin/__init__.py -------------------------------------------------------------------------------- /espnet2/bin/asr_train.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | from espnet2.tasks.asr import ASRTask 3 | 4 | 5 | def get_parser(): 6 | parser = ASRTask.get_parser() 7 | return parser 8 | 9 | 10 | def main(cmd=None): 11 | r"""ASR training.
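
    Thin command-line entry point: every option is defined by ``ASRTask``'s
    argument parser, so ``--print_config`` dumps the full configuration.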
12 | 13 | Example: 14 | 15 | % python asr_train.py asr --print_config --optim adadelta \ 16 | > conf/train_asr.yaml 17 | % python asr_train.py --config conf/train_asr.yaml 18 | """ 19 | ASRTask.main(cmd=cmd) 20 | 21 | 22 | if __name__ == "__main__": 23 | main() 24 | -------------------------------------------------------------------------------- /espnet2/bin/asr_transducer_train.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | from espnet2.tasks.asr_transducer import ASRTransducerTask 4 | 5 | 6 | def get_parser(): 7 | """Get parser for ASR Transducer task.""" 8 | parser = ASRTransducerTask.get_parser() 9 | return parser 10 | 11 | 12 | def main(cmd=None): 13 | r"""ASR Transducer training. 14 | 15 | Example: 16 | 17 | % python asr_transducer_train.py asr --print_config \ 18 | --optim adadelta > conf/train_asr.yaml 19 | % python asr_transducer_train.py \ 20 | --config conf/tuning/transducer/train_rnn_transducer.yaml 21 | """ 22 | ASRTransducerTask.main(cmd=cmd) 23 | 24 | 25 | if __name__ == "__main__": 26 | main() 27 | -------------------------------------------------------------------------------- /espnet2/bin/asvspoof_train.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | from espnet2.tasks.asvspoof import ASVSpoofTask 3 | 4 | 5 | def get_parser(): 6 | parser = ASVSpoofTask.get_parser() 7 | return parser 8 | 9 | 10 | def main(cmd=None): 11 | r"""ASVSpoof training. 12 | Example: 13 | % python asvspoof_train.py asr --print_config --optim adadelta \ 14 | > conf/train_asvspoof.yaml 15 | % python asvspoof_train.py --config conf/train_asvspoof.yaml 16 | """ 17 | ASVSpoofTask.main(cmd=cmd) 18 | 19 | 20 | if __name__ == "__main__": 21 | main() 22 | -------------------------------------------------------------------------------- /espnet2/bin/diar_train.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | from espnet2.tasks.diar import DiarizationTask 4 | 5 | 6 | def get_parser(): 7 | parser = DiarizationTask.get_parser() 8 | return parser 9 | 10 | 11 | def main(cmd=None): 12 | r"""Speaker diarization training. 13 | 14 | Example: 15 | % python diar_train.py diar --print_config --optim adadelta \ 16 | > conf/train_diar.yaml 17 | % python diar_train.py --config conf/train_diar.yaml 18 | """ 19 | DiarizationTask.main(cmd=cmd) 20 | 21 | 22 | if __name__ == "__main__": 23 | main() 24 | -------------------------------------------------------------------------------- /espnet2/bin/enh_s2t_train.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | from espnet2.tasks.enh_s2t import EnhS2TTask 3 | 4 | 5 | def get_parser(): 6 | parser = EnhS2TTask.get_parser() 7 | return parser 8 | 9 | 10 | def main(cmd=None): 11 | r"""EnhS2T training. 
12 | 
13 |     Example:
14 | 
15 |         % python enh_s2t_train.py enh_s2t --print_config --optim adadelta \
16 |                 > conf/train_enh_s2t.yaml
17 |         % python enh_s2t_train.py --config conf/train_enh_s2t.yaml
18 |     """
19 |     EnhS2TTask.main(cmd=cmd)
20 | 
21 | 
22 | if __name__ == "__main__":
23 |     main()
24 | 
--------------------------------------------------------------------------------
/espnet2/bin/enh_train.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | from espnet2.tasks.enh import EnhancementTask
3 | 
4 | 
5 | def get_parser():
6 |     parser = EnhancementTask.get_parser()
7 |     return parser
8 | 
9 | 
10 | def main(cmd=None):
11 |     r"""Enhancement frontend training.
12 | 
13 |     Example:
14 | 
15 |         % python enh_train.py enh --print_config --optim adadelta \
16 |                 > conf/train_enh.yaml
17 |         % python enh_train.py --config conf/train_enh.yaml
18 |     """
19 |     EnhancementTask.main(cmd=cmd)
20 | 
21 | 
22 | if __name__ == "__main__":
23 |     main()
24 | 
--------------------------------------------------------------------------------
/espnet2/bin/enh_tse_train.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | from espnet2.tasks.enh_tse import TargetSpeakerExtractionTask
3 | 
4 | 
5 | def get_parser():
6 |     parser = TargetSpeakerExtractionTask.get_parser()
7 |     return parser
8 | 
9 | 
10 | def main(cmd=None):
11 |     r"""Target Speaker Extraction model training.
12 | 
13 |     Example:
14 | 
15 |         % python enh_tse_train.py asr --print_config --optim adadelta \
16 |                 > conf/train_enh.yaml
17 |         % python enh_tse_train.py --config conf/train_enh.yaml
18 |     """
19 |     TargetSpeakerExtractionTask.main(cmd=cmd)
20 | 
21 | 
22 | if __name__ == "__main__":
23 |     main()
24 | 
--------------------------------------------------------------------------------
/espnet2/bin/gan_svs_train.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | from espnet2.tasks.gan_svs import GANSVSTask
3 | 
4 | 
5 | def get_parser():
6 |     parser = GANSVSTask.get_parser()
7 |     return parser
8 | 
9 | 
10 | def main(cmd=None):
11 |     """GAN-based SVS training.
12 | 
13 |     Example:
14 | 
15 |         % python gan_svs_train.py --print_config --optim1 adadelta
16 |         % python gan_svs_train.py --config conf/train.yaml
17 |     """
18 |     GANSVSTask.main(cmd=cmd)
19 | 
20 | 
21 | if __name__ == "__main__":
22 |     main()
23 | 
--------------------------------------------------------------------------------
/espnet2/bin/gan_tts_train.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | from espnet2.tasks.gan_tts import GANTTSTask
3 | 
4 | 
5 | def get_parser():
6 |     parser = GANTTSTask.get_parser()
7 |     return parser
8 | 
9 | 
10 | def main(cmd=None):
11 |     """GAN-based TTS training.
12 | 
13 |     Example:
14 | 
15 |         % python gan_tts_train.py --print_config --optim1 adadelta
16 |         % python gan_tts_train.py --config conf/train.yaml
17 |     """
18 |     GANTTSTask.main(cmd=cmd)
19 | 
20 | 
21 | if __name__ == "__main__":
22 |     main()
23 | 
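All of these bin entry points follow the same pattern: get_parser() exposes the task's argparse parser, and main() simply delegates to the task's main(). The same training run can therefore be launched programmatically; a sketch (the config path is illustrative only):

    # equivalent to: python enh_train.py --config conf/train_enh.yaml
    from espnet2.tasks.enh import EnhancementTask

    EnhancementTask.main(cmd=["--config", "conf/train_enh.yaml"])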
--------------------------------------------------------------------------------
/espnet2/bin/hubert_train.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | from espnet2.tasks.hubert import HubertTask
3 | 
4 | 
5 | def get_parser():
6 |     parser = HubertTask.get_parser()
7 |     return parser
8 | 
9 | 
10 | def main(cmd=None):
11 |     """Hubert pretraining.
12 | 
13 |     Example:
14 |         % python hubert_train.py asr --print_config --optim adadelta \
15 |                 > conf/hubert_asr.yaml
16 |         % python hubert_train.py --config conf/train_asr.yaml
17 |     """
18 |     HubertTask.main(cmd=cmd)
19 | 
20 | 
21 | if __name__ == "__main__":
22 |     main()
23 | 
--------------------------------------------------------------------------------
/espnet2/bin/lm_train.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | from espnet2.tasks.lm import LMTask
3 | 
4 | 
5 | def get_parser():
6 |     parser = LMTask.get_parser()
7 |     return parser
8 | 
9 | 
10 | def main(cmd=None):
11 |     """LM training.
12 | 
13 |     Example:
14 | 
15 |         % python lm_train.py asr --print_config --optim adadelta
16 |         % python lm_train.py --config conf/train_asr.yaml
17 |     """
18 |     LMTask.main(cmd=cmd)
19 | 
20 | 
21 | if __name__ == "__main__":
22 |     main()
23 | 
--------------------------------------------------------------------------------
/espnet2/bin/mt_train.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | from espnet2.tasks.mt import MTTask
3 | 
4 | 
5 | def get_parser():
6 |     parser = MTTask.get_parser()
7 |     return parser
8 | 
9 | 
10 | def main(cmd=None):
11 |     r"""MT training.
12 | 
13 |     Example:
14 | 
15 |         % python mt_train.py st --print_config --optim adadelta \
16 |                 > conf/train_mt.yaml
17 |         % python mt_train.py --config conf/train_mt.yaml
18 |     """
19 |     MTTask.main(cmd=cmd)
20 | 
21 | 
22 | if __name__ == "__main__":
23 |     main()
24 | 
--------------------------------------------------------------------------------
/espnet2/bin/s2t_train.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | from espnet2.tasks.s2t import S2TTask
3 | 
4 | 
5 | def get_parser():
6 |     parser = S2TTask.get_parser()
7 |     return parser
8 | 
9 | 
10 | def main(cmd=None):
11 |     r"""S2T training.
12 | 
13 |     Example:
14 | 
15 |         % python s2t_train.py s2t --print_config --optim adadelta \
16 |                 > conf/train_s2t.yaml
17 |         % python s2t_train.py --config conf/train_s2t.yaml
18 |     """
19 |     S2TTask.main(cmd=cmd)
20 | 
21 | 
22 | if __name__ == "__main__":
23 |     main()
24 | 
--------------------------------------------------------------------------------
/espnet2/bin/slu_train.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | from espnet2.tasks.slu import SLUTask
3 | 
4 | 
5 | def get_parser():
6 |     parser = SLUTask.get_parser()
7 |     return parser
8 | 
9 | 
10 | def main(cmd=None):
11 |     r"""SLU training.
12 | 
13 |     Example:
14 | 
15 |         % python slu_train.py slu --print_config --optim adadelta \
16 |                 > conf/train_slu.yaml
17 |         % python slu_train.py --config conf/train_slu.yaml
18 |     """
19 |     SLUTask.main(cmd=cmd)
20 | 
21 | 
22 | if __name__ == "__main__":
23 |     main()
24 | 
--------------------------------------------------------------------------------
/espnet2/bin/spk_train.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | 
3 | from espnet2.tasks.spk import SpeakerTask
4 | 
5 | 
6 | def get_parser():
7 |     parser = SpeakerTask.get_parser()
8 |     return parser
9 | 
10 | 
11 | def main(cmd=None):
12 |     r"""Speaker embedding extractor training. The trained model can be used for
13 |     speaker verification, open-set speaker identification, and as
14 |     embeddings for various other tasks, including speaker diarization.
15 | 
16 |     Example:
17 |         % python spk_train.py --print_config --optim adadelta \
18 |                 > conf/train_spk.yaml
19 |         % python spk_train.py --config conf/train_spk.yaml
20 |     """
21 |     SpeakerTask.main(cmd=cmd)
22 | 
23 | 
24 | if __name__ == "__main__":
25 |     main()
26 | 
--------------------------------------------------------------------------------
/espnet2/bin/st_train.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | from espnet2.tasks.st import STTask
3 | 
4 | 
5 | def get_parser():
6 |     parser = STTask.get_parser()
7 |     return parser
8 | 
9 | 
10 | def main(cmd=None):
11 |     r"""ST training.
12 | 
13 |     Example:
14 | 
15 |         % python st_train.py st --print_config --optim adadelta \
16 |                 > conf/train_st.yaml
17 |         % python st_train.py --config conf/train_st.yaml
18 |     """
19 |     STTask.main(cmd=cmd)
20 | 
21 | 
22 | if __name__ == "__main__":
23 |     main()
24 | 
--------------------------------------------------------------------------------
/espnet2/bin/svs_train.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | from espnet2.tasks.svs import SVSTask
3 | 
4 | 
5 | def get_parser():
6 |     parser = SVSTask.get_parser()
7 |     return parser
8 | 
9 | 
10 | def main(cmd=None):
11 |     """SVS training.
12 | 
13 |     Example:
14 | 
15 |         % python svs_train.py svs --print_config --optim adadelta
16 |         % python svs_train.py --config conf/train_svs.yaml
17 |     """
18 |     SVSTask.main(cmd=cmd)
19 | 
20 | 
21 | if __name__ == "__main__":
22 |     main()
23 | 
--------------------------------------------------------------------------------
/espnet2/bin/tts_train.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | from espnet2.tasks.tts import TTSTask
3 | 
4 | 
5 | def get_parser():
6 |     parser = TTSTask.get_parser()
7 |     return parser
8 | 
9 | 
10 | def main(cmd=None):
11 |     """TTS training.
12 | 
13 |     Example:
14 | 
15 |         % python tts_train.py asr --print_config --optim adadelta
16 |         % python tts_train.py --config conf/train_asr.yaml
17 |     """
18 |     TTSTask.main(cmd=cmd)
19 | 
20 | 
21 | if __name__ == "__main__":
22 |     main()
23 | 
--------------------------------------------------------------------------------
/espnet2/bin/uasr_train.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | from espnet2.tasks.uasr import UASRTask
3 | 
4 | 
5 | def get_parser():
6 |     parser = UASRTask.get_parser()
7 |     return parser
8 | 
9 | 
10 | def main(cmd=None):
11 |     r"""UASR training.
12 | 13 | Example: 14 | 15 | % python uasr_train.py uasr --print_config --optim adadelta \ 16 | > conf/train_uasr.yaml 17 | % python uasr_train.py --config conf/train_uasr.yaml 18 | """ 19 | UASRTask.main(cmd=cmd) 20 | 21 | 22 | if __name__ == "__main__": 23 | main() 24 | -------------------------------------------------------------------------------- /espnet2/diar/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/espnet2/diar/__init__.py -------------------------------------------------------------------------------- /espnet2/diar/abs_diar.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from collections import OrderedDict 3 | from typing import Tuple 4 | 5 | import torch 6 | 7 | 8 | class AbsDiarization(torch.nn.Module, ABC): 9 | # @abstractmethod 10 | # def output_size(self) -> int: 11 | # raise NotImplementedError 12 | 13 | @abstractmethod 14 | def forward( 15 | self, 16 | input: torch.Tensor, 17 | ilens: torch.Tensor, 18 | ) -> Tuple[torch.Tensor, torch.Tensor, OrderedDict]: 19 | raise NotImplementedError 20 | 21 | @abstractmethod 22 | def forward_rawwav( 23 | self, input: torch.Tensor, ilens: torch.Tensor 24 | ) -> Tuple[torch.Tensor, torch.Tensor, OrderedDict]: 25 | raise NotImplementedError 26 | -------------------------------------------------------------------------------- /espnet2/diar/attractor/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/espnet2/diar/attractor/__init__.py -------------------------------------------------------------------------------- /espnet2/diar/attractor/abs_attractor.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import Tuple 3 | 4 | import torch 5 | 6 | 7 | class AbsAttractor(torch.nn.Module, ABC): 8 | @abstractmethod 9 | def forward( 10 | self, 11 | enc_input: torch.Tensor, 12 | ilens: torch.Tensor, 13 | dec_input: torch.Tensor, 14 | ) -> Tuple[torch.Tensor, torch.Tensor]: 15 | raise NotImplementedError 16 | -------------------------------------------------------------------------------- /espnet2/diar/decoder/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/espnet2/diar/decoder/__init__.py -------------------------------------------------------------------------------- /espnet2/diar/decoder/abs_decoder.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import Tuple 3 | 4 | import torch 5 | 6 | 7 | class AbsDecoder(torch.nn.Module, ABC): 8 | @abstractmethod 9 | def forward( 10 | self, 11 | input: torch.Tensor, 12 | ilens: torch.Tensor, 13 | ) -> Tuple[torch.Tensor, torch.Tensor]: 14 | raise NotImplementedError 15 | 16 | @property 17 | @abstractmethod 18 | def num_spk(self): 19 | raise NotImplementedError 20 | -------------------------------------------------------------------------------- /espnet2/diar/decoder/linear_decoder.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from 
espnet2.diar.decoder.abs_decoder import AbsDecoder 4 | 5 | 6 | class LinearDecoder(AbsDecoder): 7 | """Linear decoder for speaker diarization""" 8 | 9 | def __init__( 10 | self, 11 | encoder_output_size: int, 12 | num_spk: int = 2, 13 | ): 14 | super().__init__() 15 | self._num_spk = num_spk 16 | self.linear_decoder = torch.nn.Linear(encoder_output_size, num_spk) 17 | 18 | def forward(self, input: torch.Tensor, ilens: torch.Tensor): 19 | """Forward. 20 | 21 | Args: 22 | input (torch.Tensor): hidden_space [Batch, T, F] 23 | ilens (torch.Tensor): input lengths [Batch] 24 | """ 25 | 26 | output = self.linear_decoder(input) 27 | 28 | return output 29 | 30 | @property 31 | def num_spk(self): 32 | return self._num_spk 33 | -------------------------------------------------------------------------------- /espnet2/diar/label_processor.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from espnet2.layers.label_aggregation import LabelAggregate 4 | 5 | 6 | class LabelProcessor(torch.nn.Module): 7 | """Label aggregator for speaker diarization""" 8 | 9 | def __init__( 10 | self, win_length: int = 512, hop_length: int = 128, center: bool = True 11 | ): 12 | super().__init__() 13 | self.label_aggregator = LabelAggregate(win_length, hop_length, center) 14 | 15 | def forward(self, input: torch.Tensor, ilens: torch.Tensor): 16 | """Forward. 17 | 18 | Args: 19 | input: (Batch, Nsamples, Label_dim) 20 | ilens: (Batch) 21 | Returns: 22 | output: (Batch, Frames, Label_dim) 23 | olens: (Batch) 24 | 25 | """ 26 | 27 | output, olens = self.label_aggregator(input, ilens) 28 | 29 | return output, olens 30 | -------------------------------------------------------------------------------- /espnet2/diar/layers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/espnet2/diar/layers/__init__.py -------------------------------------------------------------------------------- /espnet2/diar/layers/abs_mask.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from collections import OrderedDict 3 | from typing import Tuple 4 | 5 | import torch 6 | 7 | 8 | class AbsMask(torch.nn.Module, ABC): 9 | @property 10 | @abstractmethod 11 | def max_num_spk(self) -> int: 12 | raise NotImplementedError 13 | 14 | @abstractmethod 15 | def forward( 16 | self, 17 | input, 18 | ilens, 19 | bottleneck_feat, 20 | num_spk, 21 | ) -> Tuple[Tuple[torch.Tensor], torch.Tensor, OrderedDict]: 22 | raise NotImplementedError 23 | -------------------------------------------------------------------------------- /espnet2/diar/separator/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/espnet2/diar/separator/__init__.py -------------------------------------------------------------------------------- /espnet2/enh/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/espnet2/enh/__init__.py -------------------------------------------------------------------------------- /espnet2/enh/abs_enh.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, 
abstractmethod 2 | from collections import OrderedDict 3 | from typing import Tuple 4 | 5 | import torch 6 | 7 | 8 | class AbsEnhancement(torch.nn.Module, ABC): 9 | # @abstractmethod 10 | # def output_size(self) -> int: 11 | # raise NotImplementedError 12 | 13 | @abstractmethod 14 | def forward( 15 | self, 16 | input: torch.Tensor, 17 | ilens: torch.Tensor, 18 | ) -> Tuple[torch.Tensor, torch.Tensor, OrderedDict]: 19 | raise NotImplementedError 20 | 21 | @abstractmethod 22 | def forward_rawwav( 23 | self, input: torch.Tensor, ilens: torch.Tensor 24 | ) -> Tuple[torch.Tensor, torch.Tensor, OrderedDict]: 25 | raise NotImplementedError 26 | -------------------------------------------------------------------------------- /espnet2/enh/decoder/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/espnet2/enh/decoder/__init__.py -------------------------------------------------------------------------------- /espnet2/enh/decoder/abs_decoder.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import Tuple 3 | 4 | import torch 5 | 6 | 7 | class AbsDecoder(torch.nn.Module, ABC): 8 | @abstractmethod 9 | def forward( 10 | self, 11 | input: torch.Tensor, 12 | ilens: torch.Tensor, 13 | ) -> Tuple[torch.Tensor, torch.Tensor]: 14 | raise NotImplementedError 15 | 16 | def forward_streaming(self, input_frame: torch.Tensor): 17 | raise NotImplementedError 18 | 19 | def streaming_merge(self, chunks: torch.Tensor, ilens: torch.tensor = None): 20 | """streaming_merge. It merges the frame-level processed audio chunks 21 | in the streaming *simulation*. It is noted that, in real applications, 22 | the processed audio should be sent to the output channel frame by frame. 23 | You may refer to this function to manage your streaming output buffer. 24 | 25 | Args: 26 | chunks: List [(B, frame_size),] 27 | ilens: [B] 28 | Returns: 29 | merge_audio: [B, T] 30 | """ 31 | 32 | raise NotImplementedError 33 | -------------------------------------------------------------------------------- /espnet2/enh/decoder/null_decoder.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from espnet2.enh.decoder.abs_decoder import AbsDecoder 4 | 5 | 6 | class NullDecoder(AbsDecoder): 7 | """Null decoder, return the same args.""" 8 | 9 | def __init__(self): 10 | super().__init__() 11 | 12 | def forward(self, input: torch.Tensor, ilens: torch.Tensor): 13 | """Forward. The input should be the waveform already. 
14 | 
15 |         Args:
16 |             input (torch.Tensor): wav [Batch, sample]
17 |             ilens (torch.Tensor): input lengths [Batch]
18 |         """
19 |         return input, ilens
20 | 
--------------------------------------------------------------------------------
/espnet2/enh/encoder/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/espnet2/enh/encoder/__init__.py
--------------------------------------------------------------------------------
/espnet2/enh/encoder/abs_encoder.py:
--------------------------------------------------------------------------------
1 | from abc import ABC, abstractmethod
2 | from typing import Tuple
3 | 
4 | import torch
5 | 
6 | 
7 | class AbsEncoder(torch.nn.Module, ABC):
8 |     @abstractmethod
9 |     def forward(
10 |         self,
11 |         input: torch.Tensor,
12 |         ilens: torch.Tensor,
13 |     ) -> Tuple[torch.Tensor, torch.Tensor]:
14 |         raise NotImplementedError
15 | 
16 |     @property
17 |     @abstractmethod
18 |     def output_dim(self) -> int:
19 |         raise NotImplementedError
20 | 
21 |     def forward_streaming(self, input: torch.Tensor):
22 |         raise NotImplementedError
23 | 
24 |     def streaming_frame(self, audio: torch.Tensor):
25 |         """streaming_frame. It splits the continuous audio into frame-level
26 |         audio chunks in the streaming *simulation*. It is noted that this
27 |         function takes the entire long audio as input for a streaming simulation.
28 |         You may refer to this function to manage your streaming input
29 |         buffer in a real streaming application.
30 | 
31 |         Args:
32 |             audio: (B, T)
33 |         Returns:
34 |             chunked: List [(B, frame_size),]
35 |         """
36 |         raise NotImplementedError
37 | 
--------------------------------------------------------------------------------
/espnet2/enh/encoder/null_encoder.py:
--------------------------------------------------------------------------------
1 | import torch
2 | 
3 | from espnet2.enh.encoder.abs_encoder import AbsEncoder
4 | 
5 | 
6 | class NullEncoder(AbsEncoder):
7 |     """Null encoder."""
8 | 
9 |     def __init__(self):
10 |         super().__init__()
11 | 
12 |     @property
13 |     def output_dim(self) -> int:
14 |         return 1
15 | 
16 |     def forward(self, input: torch.Tensor, ilens: torch.Tensor):
17 |         """Forward.
18 | 19 | Args: 20 | input (torch.Tensor): mixed speech [Batch, sample] 21 | ilens (torch.Tensor): input lengths [Batch] 22 | """ 23 | return input, ilens 24 | -------------------------------------------------------------------------------- /espnet2/enh/extractor/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/espnet2/enh/extractor/__init__.py -------------------------------------------------------------------------------- /espnet2/enh/extractor/abs_extractor.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from collections import OrderedDict 3 | from typing import Tuple 4 | 5 | import torch 6 | 7 | 8 | class AbsExtractor(torch.nn.Module, ABC): 9 | @abstractmethod 10 | def forward( 11 | self, 12 | input: torch.Tensor, 13 | ilens: torch.Tensor, 14 | input_aux: torch.Tensor, 15 | ilens_aux: torch.Tensor, 16 | suffix_tag: str = "", 17 | ) -> Tuple[Tuple[torch.Tensor], torch.Tensor, OrderedDict]: 18 | raise NotImplementedError 19 | -------------------------------------------------------------------------------- /espnet2/enh/layers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/espnet2/enh/layers/__init__.py -------------------------------------------------------------------------------- /espnet2/enh/layers/conv_utils.py: -------------------------------------------------------------------------------- 1 | # noqa: E501 ported from https://discuss.pytorch.org/t/utility-function-for-calculating-the-shape-of-a-conv-output/11173/7 2 | import math 3 | 4 | 5 | def num2tuple(num): 6 | return num if isinstance(num, tuple) else (num, num) 7 | 8 | 9 | def conv2d_output_shape(h_w, kernel_size=1, stride=1, pad=0, dilation=1): 10 | h_w, kernel_size, stride, pad, dilation = ( 11 | num2tuple(h_w), 12 | num2tuple(kernel_size), 13 | num2tuple(stride), 14 | num2tuple(pad), 15 | num2tuple(dilation), 16 | ) 17 | pad = num2tuple(pad[0]), num2tuple(pad[1]) 18 | 19 | h = math.floor( 20 | (h_w[0] + sum(pad[0]) - dilation[0] * (kernel_size[0] - 1) - 1) / stride[0] + 1 21 | ) 22 | w = math.floor( 23 | (h_w[1] + sum(pad[1]) - dilation[1] * (kernel_size[1] - 1) - 1) / stride[1] + 1 24 | ) 25 | 26 | return h, w 27 | 28 | 29 | def convtransp2d_output_shape( 30 | h_w, kernel_size=1, stride=1, pad=0, dilation=1, out_pad=0 31 | ): 32 | h_w, kernel_size, stride, pad, dilation, out_pad = ( 33 | num2tuple(h_w), 34 | num2tuple(kernel_size), 35 | num2tuple(stride), 36 | num2tuple(pad), 37 | num2tuple(dilation), 38 | num2tuple(out_pad), 39 | ) 40 | pad = num2tuple(pad[0]), num2tuple(pad[1]) 41 | 42 | h = ( 43 | (h_w[0] - 1) * stride[0] 44 | - sum(pad[0]) 45 | + dilation[0] * (kernel_size[0] - 1) 46 | + out_pad[0] 47 | + 1 48 | ) 49 | w = ( 50 | (h_w[1] - 1) * stride[1] 51 | - sum(pad[1]) 52 | + dilation[1] * (kernel_size[1] - 1) 53 | + out_pad[1] 54 | + 1 55 | ) 56 | 57 | return h, w 58 | -------------------------------------------------------------------------------- /espnet2/enh/loss/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/espnet2/enh/loss/__init__.py 
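As a quick sanity check of the output-shape formulas in conv_utils.py above, the helper agrees with torch.nn.Conv2d; the sizes below are arbitrary:

    import torch
    from espnet2.enh.layers.conv_utils import conv2d_output_shape

    # h = floor((64 + 2*1 - 1*(3 - 1) - 1) / 2 + 1) = 32, and likewise w = 50
    print(conv2d_output_shape((64, 100), kernel_size=3, stride=2, pad=1))  # (32, 50)

    conv = torch.nn.Conv2d(1, 1, kernel_size=3, stride=2, padding=1)
    print(conv(torch.zeros(1, 1, 64, 100)).shape[-2:])  # torch.Size([32, 50])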
--------------------------------------------------------------------------------
/espnet2/enh/loss/criterions/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/espnet2/enh/loss/criterions/__init__.py
--------------------------------------------------------------------------------
/espnet2/enh/loss/criterions/abs_loss.py:
--------------------------------------------------------------------------------
1 | from abc import ABC, abstractmethod
2 | 
3 | import torch
4 | 
5 | EPS = torch.finfo(torch.get_default_dtype()).eps
6 | 
7 | 
8 | class AbsEnhLoss(torch.nn.Module, ABC):
9 |     """Base class for all Enhancement loss modules."""
10 | 
11 |     # the name will be the key that appears in the reporter
12 |     @property
13 |     def name(self) -> str:
14 |         raise NotImplementedError
15 | 
16 |     # This property specifies whether the criterion will only
17 |     # be evaluated during the inference stage
18 |     @property
19 |     def only_for_test(self) -> bool:
20 |         return False
21 | 
22 |     @abstractmethod
23 |     def forward(
24 |         self,
25 |         ref,
26 |         inf,
27 |     ) -> torch.Tensor:
28 |         # the returned tensor should have shape (batch,)
29 |         raise NotImplementedError
30 | 
--------------------------------------------------------------------------------
/espnet2/enh/loss/wrappers/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/espnet2/enh/loss/wrappers/__init__.py
--------------------------------------------------------------------------------
/espnet2/enh/loss/wrappers/abs_wrapper.py:
--------------------------------------------------------------------------------
1 | from abc import ABC, abstractmethod
2 | from typing import Dict, List, Tuple
3 | 
4 | import torch
5 | 
6 | 
7 | class AbsLossWrapper(torch.nn.Module, ABC):
8 |     """Base class for all Enhancement loss wrapper modules."""
9 | 
10 |     # The weight for the current loss in the multi-task learning.
11 |     # The overall training target will be combined as:
12 |     # loss = weight_1 * loss_1 + ... + weight_N * loss_N
13 |     weight = 1.0
14 | 
15 |     @abstractmethod
16 |     def forward(
17 |         self,
18 |         ref: List,
19 |         inf: List,
20 |         others: Dict,
21 |     ) -> Tuple[torch.Tensor, Dict, Dict]:
22 |         raise NotImplementedError
23 | 
--------------------------------------------------------------------------------
/espnet2/enh/loss/wrappers/dpcl_solver.py:
--------------------------------------------------------------------------------
1 | from espnet2.enh.loss.criterions.abs_loss import AbsEnhLoss
2 | from espnet2.enh.loss.wrappers.abs_wrapper import AbsLossWrapper
3 | 
4 | 
5 | class DPCLSolver(AbsLossWrapper):
6 |     def __init__(self, criterion: AbsEnhLoss, weight=1.0):
7 |         super().__init__()
8 |         self.criterion = criterion
9 |         self.weight = weight
10 | 
11 |     def forward(self, ref, inf, others={}):
12 |         """A naive DPCL solver.
13 | 
14 |         Args:
15 |             ref (List[torch.Tensor]): [(batch, ...), ...] x n_spk
16 |             inf (List[torch.Tensor]): [(batch, ...), ...]
17 |             others (Dict): other data included in this solver,
18 |                 e.g. "tf_embedding": learned embedding of all T-F bins (B, T * F, D)
19 | 
20 |         Returns:
21 |             loss (torch.Tensor): the DPCL loss averaged over the batch
22 |             stats (dict): for collecting training status
23 |             others: reserved
24 |         """
25 |         assert "tf_embedding" in others
26 | 
27 |         loss = self.criterion(ref, others["tf_embedding"]).mean()
28 | 
29 |         stats = dict()
30 |         stats[self.criterion.name] = loss.detach()
31 | 
32 |         return loss.mean(), stats, {}
33 | 
--------------------------------------------------------------------------------
/espnet2/enh/loss/wrappers/fixed_order.py:
--------------------------------------------------------------------------------
1 | from collections import defaultdict
2 | 
3 | import torch
4 | 
5 | from espnet2.enh.loss.criterions.abs_loss import AbsEnhLoss
6 | from espnet2.enh.loss.wrappers.abs_wrapper import AbsLossWrapper
7 | 
8 | 
9 | class FixedOrderSolver(AbsLossWrapper):
10 |     def __init__(self, criterion: AbsEnhLoss, weight=1.0):
11 |         super().__init__()
12 |         self.criterion = criterion
13 |         self.weight = weight
14 | 
15 |     def forward(self, ref, inf, others={}):
16 |         """A naive fixed-order solver.
17 | 
18 |         Args:
19 |             ref (List[torch.Tensor]): [(batch, ...), ...] x n_spk
20 |             inf (List[torch.Tensor]): [(batch, ...), ...]
21 | 
22 |         Returns:
23 |             loss (torch.Tensor): loss under the fixed (identity) permutation
24 |             stats (dict): for collecting training status
25 |             others: reserved
26 |         """
27 |         assert len(ref) == len(inf), (len(ref), len(inf))
28 |         num_spk = len(ref)
29 | 
30 |         loss = 0.0
31 |         stats = defaultdict(list)
32 |         for r, i in zip(ref, inf):
33 |             loss += torch.mean(self.criterion(r, i)) / num_spk
34 |             for k, v in getattr(self.criterion, "stats", {}).items():
35 |                 stats[k].append(v)
36 | 
37 |         for k, v in stats.items():
38 |             stats[k] = torch.stack(v, dim=1).mean()
39 |         stats[self.criterion.name] = loss.detach()
40 | 
41 |         perm = torch.arange(num_spk).unsqueeze(0).repeat(ref[0].size(0), 1)
42 |         return loss.mean(), dict(stats), {"perm": perm}
43 | 
--------------------------------------------------------------------------------
/espnet2/enh/separator/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/espnet2/enh/separator/__init__.py
--------------------------------------------------------------------------------
/espnet2/enh/separator/abs_separator.py:
--------------------------------------------------------------------------------
1 | from abc import ABC, abstractmethod
2 | from collections import OrderedDict
3 | from typing import Dict, Optional, Tuple
4 | 
5 | import torch
6 | 
7 | 
8 | class AbsSeparator(torch.nn.Module, ABC):
9 |     @abstractmethod
10 |     def forward(
11 |         self,
12 |         input: torch.Tensor,
13 |         ilens: torch.Tensor,
14 |         additional: Optional[Dict] = None,
15 |     ) -> Tuple[Tuple[torch.Tensor], torch.Tensor, OrderedDict]:
16 |         raise NotImplementedError
17 | 
18 |     def forward_streaming(
19 |         self,
20 |         input_frame: torch.Tensor,
21 |         buffer=None,
22 |     ):
23 |         raise NotImplementedError
24 | 
25 |     @property
26 |     @abstractmethod
27 |     def num_spk(self):
28 |         raise NotImplementedError
29 | 
--------------------------------------------------------------------------------
/espnet2/fileio/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/espnet2/fileio/__init__.py
--------------------------------------------------------------------------------
/espnet2/fst/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/espnet2/fst/__init__.py
--------------------------------------------------------------------------------
/espnet2/gan_svs/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/espnet2/gan_svs/__init__.py
--------------------------------------------------------------------------------
/espnet2/gan_svs/abs_gan_svs.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 Tomoki Hayashi
2 | # Copyright 2022 Yifeng Yu
3 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
4 | 
5 | """GAN-based SVS abstract class."""
6 | 
7 | from abc import ABC, abstractmethod
8 | from typing import Dict, Union
9 | 
10 | import torch
11 | 
12 | from espnet2.svs.abs_svs import AbsSVS
13 | 
14 | 
15 | class AbsGANSVS(AbsSVS, ABC):
16 |     """GAN-based SVS model abstract class."""
17 | 
18 |     @abstractmethod
19 |     def forward(
20 |         self,
21 |         forward_generator,
22 |         *args,
23 |         **kwargs,
24 |     ) -> Dict[str, Union[torch.Tensor, Dict[str, torch.Tensor], int]]:
25 |         """Return generator or discriminator loss."""
26 |         raise NotImplementedError
27 | 
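The `forward_generator` flag means a single model object serves both optimization phases of GAN training. Schematically, a trainer alternates the two calls as below; this is a sketch only, with hypothetical optimizer names, and it assumes the returned dict carries its total under a "loss" key (espnet2's actual trainer is more involved):

    for batch in loader:
        # generator phase: returns the generator's loss dict
        g_out = model(forward_generator=True, **batch)
        g_out["loss"].backward()
        opt_g.step()
        opt_g.zero_grad()

        # discriminator phase: returns the discriminator's loss dict
        d_out = model(forward_generator=False, **batch)
        d_out["loss"].backward()
        opt_d.step()
        opt_d.zero_grad()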
--------------------------------------------------------------------------------
/espnet2/gan_svs/avocodo/__init__.py:
--------------------------------------------------------------------------------
1 | from espnet2.gan_svs.avocodo.avocodo import (
2 |     MDC,
3 |     SBD,
4 |     AvocodoDiscriminator,
5 |     AvocodoDiscriminatorPlus,
6 |     AvocodoGenerator,
7 |     CoMBD,
8 |     CoMBDBlock,
9 |     SBDBlock,
10 | )
11 | 
12 | __all__ = [
13 |     "MDC",
14 |     "SBD",
15 |     "AvocodoDiscriminator",
16 |     "AvocodoDiscriminatorPlus",
17 |     "AvocodoGenerator",
18 |     "CoMBD",
19 |     "CoMBDBlock",
20 |     "SBDBlock",
21 | ]
22 | 
--------------------------------------------------------------------------------
/espnet2/gan_svs/joint/__init__.py:
--------------------------------------------------------------------------------
1 | from espnet2.gan_svs.joint.joint_score2wav import JointScore2Wav  # NOQA
2 | 
--------------------------------------------------------------------------------
/espnet2/gan_svs/uhifigan/__init__.py:
--------------------------------------------------------------------------------
1 | from espnet2.gan_svs.uhifigan.sine_generator import SineGen
2 | from espnet2.gan_svs.uhifigan.uhifigan import UHiFiGANGenerator
3 | 
4 | __all__ = [
5 |     "UHiFiGANGenerator",
6 |     "SineGen",
7 | ]
8 | 
--------------------------------------------------------------------------------
/espnet2/gan_svs/utils/__init__.py:
--------------------------------------------------------------------------------
1 | from espnet2.gan_svs.utils.expand_f0 import expand_f0
2 | 
--------------------------------------------------------------------------------
/espnet2/gan_svs/utils/expand_f0.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Yifeng Yu
2 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
3 | 
4 | """Function to expand frame-level F0 to the waveform sample level."""
5 | 
6 | from typing import Optional, Tuple
7 | 
8 | import torch
9 | import torch.nn.functional as F
10 | 
11 | 
12 | def expand_f0(f0_frame, hop_length, method="interpolation"):
13 |     """Expand f0 to the output wave length.
14 | 
15 |     Args:
16 |         f0_frame (Tensor): Input tensor (B, 1, frame_len).
17 |         hop_length (int): Hop length.
18 |         method (str): Method to expand f0. Choose either 'interpolation' or 'repeat'.
19 | 
20 |     Returns:
21 |         Tensor: Output tensor (B, wav_len).
22 | 
23 |     """
24 |     frame_length = f0_frame.size(2)
25 |     signal_length = frame_length * hop_length
26 |     if method == "interpolation":
27 |         f0_sample = F.interpolate(
28 |             f0_frame, size=signal_length, mode="linear", align_corners=False
29 |         )
30 |     elif method == "repeat":
31 |         f0_sample = f0_frame.repeat_interleave(hop_length, dim=2)[:, :, :signal_length]
32 |     else:
33 |         raise ValueError("Invalid method. Choose either 'interpolation' or 'repeat'.")
34 |     f0_sample = f0_sample.squeeze(1)[
35 |         :, :signal_length
36 |     ]  # drop the channel dimension and trim to signal_length
37 |     return f0_sample
38 | 
--------------------------------------------------------------------------------
/espnet2/gan_svs/visinger2/__init__.py:
--------------------------------------------------------------------------------
1 | from espnet2.gan_svs.visinger2.visinger2_vocoder import (
2 |     Generator_Harm,
3 |     Generator_Noise,
4 |     VISinger2Discriminator,
5 |     VISinger2VocoderGenerator,
6 | )
7 | 
8 | __all__ = [
9 |     "Generator_Harm",
10 |     "Generator_Noise",
11 |     "VISinger2Discriminator",
12 |     "VISinger2VocoderGenerator",
13 | ]
14 | 
--------------------------------------------------------------------------------
/espnet2/gan_svs/vits/__init__.py:
--------------------------------------------------------------------------------
1 | from espnet2.gan_svs.vits.vits import VITS  # NOQA
2 | 
--------------------------------------------------------------------------------
/espnet2/gan_svs/vits/modules.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | 
4 | # Copyright 2022 Yifeng Yu
5 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
6 | 
7 | import torch
8 | 
9 | 
10 | class Projection(torch.nn.Module):
11 |     def __init__(self, hidden_channels, out_channels):
12 |         super().__init__()
13 |         self.hidden_channels = hidden_channels
14 |         self.out_channels = out_channels
15 |         self.proj = torch.nn.Conv1d(hidden_channels, out_channels * 2, 1)
16 | 
17 |     def forward(self, x, x_mask):
18 |         # x shape: (B, attention_dim, T_text)
19 |         stats = self.proj(x) * x_mask
20 |         m_p, logs_p = torch.split(stats, self.out_channels, dim=1)
21 |         return m_p, logs_p
22 | 
23 | 
24 | def sequence_mask(length, max_length=None):
25 |     if max_length is None:
26 |         max_length = length.max()
27 |     x = torch.arange(max_length, dtype=length.dtype, device=length.device)
28 |     return x.unsqueeze(0) < length.unsqueeze(1)
29 | 
--------------------------------------------------------------------------------
/espnet2/gan_tts/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/espnet2/gan_tts/__init__.py
--------------------------------------------------------------------------------
/espnet2/gan_tts/abs_gan_tts.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 Tomoki Hayashi
2 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
3 | 
4 | """GAN-based TTS abstract class."""
5 | 
6 | from abc import ABC, abstractmethod
7 | from typing import Dict, Union
8 | 
9 | import torch
10 | 
11 | 
from espnet2.tts.abs_tts import AbsTTS 12 | 13 | 14 | class AbsGANTTS(AbsTTS, ABC): 15 | """GAN-based TTS model abstract class.""" 16 | 17 | @abstractmethod 18 | def forward( 19 | self, 20 | forward_generator, 21 | *args, 22 | **kwargs, 23 | ) -> Dict[str, Union[torch.Tensor, Dict[str, torch.Tensor], int]]: 24 | """Return generator or discriminator loss.""" 25 | raise NotImplementedError 26 | -------------------------------------------------------------------------------- /espnet2/gan_tts/hifigan/__init__.py: -------------------------------------------------------------------------------- 1 | from espnet2.gan_tts.hifigan.hifigan import ( 2 | HiFiGANGenerator, 3 | HiFiGANMultiPeriodDiscriminator, 4 | HiFiGANMultiScaleDiscriminator, 5 | HiFiGANMultiScaleMultiPeriodDiscriminator, 6 | HiFiGANPeriodDiscriminator, 7 | HiFiGANScaleDiscriminator, 8 | ) 9 | 10 | __all__ = [ 11 | "HiFiGANGenerator", 12 | "HiFiGANMultiPeriodDiscriminator", 13 | "HiFiGANMultiScaleDiscriminator", 14 | "HiFiGANMultiScaleMultiPeriodDiscriminator", 15 | "HiFiGANPeriodDiscriminator", 16 | "HiFiGANScaleDiscriminator", 17 | ] 18 | -------------------------------------------------------------------------------- /espnet2/gan_tts/jets/__init__.py: -------------------------------------------------------------------------------- 1 | from espnet2.gan_tts.jets.jets import JETS # NOQA 2 | -------------------------------------------------------------------------------- /espnet2/gan_tts/joint/__init__.py: -------------------------------------------------------------------------------- 1 | from espnet2.gan_tts.joint.joint_text2wav import JointText2Wav # NOQA 2 | -------------------------------------------------------------------------------- /espnet2/gan_tts/melgan/__init__.py: -------------------------------------------------------------------------------- 1 | from espnet2.gan_tts.melgan.melgan import MelGANDiscriminator # NOQA 2 | from espnet2.gan_tts.melgan.melgan import MelGANGenerator # NOQA 3 | from espnet2.gan_tts.melgan.melgan import MelGANMultiScaleDiscriminator # NOQA 4 | -------------------------------------------------------------------------------- /espnet2/gan_tts/parallel_wavegan/__init__.py: -------------------------------------------------------------------------------- 1 | from espnet2.gan_tts.parallel_wavegan.parallel_wavegan import ( 2 | ParallelWaveGANDiscriminator, 3 | ParallelWaveGANGenerator, 4 | ) 5 | 6 | __all__ = ["ParallelWaveGANDiscriminator", "ParallelWaveGANGenerator"] 7 | -------------------------------------------------------------------------------- /espnet2/gan_tts/style_melgan/__init__.py: -------------------------------------------------------------------------------- 1 | from espnet2.gan_tts.style_melgan.style_melgan import StyleMelGANDiscriminator # NOQA 2 | from espnet2.gan_tts.style_melgan.style_melgan import StyleMelGANGenerator # NOQA 3 | -------------------------------------------------------------------------------- /espnet2/gan_tts/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from espnet2.gan_tts.utils.get_random_segments import get_random_segments # NOQA 2 | from espnet2.gan_tts.utils.get_random_segments import get_segments # NOQA 3 | -------------------------------------------------------------------------------- /espnet2/gan_tts/utils/get_random_segments.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Tomoki Hayashi 2 | # Apache 2.0 
(http://www.apache.org/licenses/LICENSE-2.0)
3 | 
4 | """Function to get random segments."""
5 | 
6 | from typing import Optional, Tuple
7 | 
8 | import torch
9 | 
10 | 
11 | def get_random_segments(
12 |     x: torch.Tensor,
13 |     x_lengths: torch.Tensor,
14 |     segment_size: int,
15 | ) -> Tuple[torch.Tensor, torch.Tensor]:
16 |     """Get random segments.
17 | 
18 |     Args:
19 |         x (Tensor): Input tensor (B, C, T).
20 |         x_lengths (Tensor): Length tensor (B,).
21 |         segment_size (int): Segment size.
22 | 
23 |     Returns:
24 |         Tensor: Segmented tensor (B, C, segment_size).
25 |         Tensor: Start index tensor (B,).
26 | 
27 |     """
28 |     b, c, t = x.size()
29 |     max_start_idx = x_lengths - segment_size
30 |     max_start_idx[max_start_idx < 0] = 0
31 |     start_idxs = (torch.rand([b]).to(x.device) * max_start_idx).to(
32 |         dtype=torch.long,
33 |     )
34 |     segments = get_segments(x, start_idxs, segment_size)
35 | 
36 |     return segments, start_idxs
37 | 
38 | 
39 | def get_segments(
40 |     x: torch.Tensor,
41 |     start_idxs: torch.Tensor,
42 |     segment_size: int,
43 | ) -> torch.Tensor:
44 |     """Get segments.
45 | 
46 |     Args:
47 |         x (Tensor): Input tensor (B, C, T).
48 |         start_idxs (Tensor): Start index tensor (B,).
49 |         segment_size (int): Segment size.
50 | 
51 |     Returns:
52 |         Tensor: Segmented tensor (B, C, segment_size).
53 | 
54 |     """
55 |     b, c, t = x.size()
56 |     segments = x.new_zeros(b, c, segment_size)
57 |     for i, start_idx in enumerate(start_idxs):
58 |         segments[i] = x[i, :, start_idx : start_idx + segment_size]
59 |     return segments
60 | 
--------------------------------------------------------------------------------
/espnet2/gan_tts/vits/__init__.py:
--------------------------------------------------------------------------------
1 | from espnet2.gan_tts.vits.vits import VITS  # NOQA
2 | 
--------------------------------------------------------------------------------
/espnet2/gan_tts/vits/monotonic_align/core.pyx:
--------------------------------------------------------------------------------
1 | """Maximum path calculation module with cython optimization.
2 | 
3 | This code is copied from https://github.com/jaywalnut310/vits with the code format modified.
4 | 5 | """ 6 | 7 | cimport cython 8 | 9 | from cython.parallel import prange 10 | 11 | 12 | @cython.boundscheck(False) 13 | @cython.wraparound(False) 14 | cdef void maximum_path_each(int[:, ::1] path, float[:, ::1] value, int t_y, int t_x, float max_neg_val=-1e9) nogil: 15 | cdef int x 16 | cdef int y 17 | cdef float v_prev 18 | cdef float v_cur 19 | cdef float tmp 20 | cdef int index = t_x - 1 21 | 22 | for y in range(t_y): 23 | for x in range(max(0, t_x + y - t_y), min(t_x, y + 1)): 24 | if x == y: 25 | v_cur = max_neg_val 26 | else: 27 | v_cur = value[y - 1, x] 28 | if x == 0: 29 | if y == 0: 30 | v_prev = 0.0 31 | else: 32 | v_prev = max_neg_val 33 | else: 34 | v_prev = value[y - 1, x - 1] 35 | value[y, x] += max(v_prev, v_cur) 36 | 37 | for y in range(t_y - 1, -1, -1): 38 | path[y, index] = 1 39 | if index != 0 and (index == y or value[y - 1, index] < value[y - 1, index - 1]): 40 | index = index - 1 41 | 42 | 43 | @cython.boundscheck(False) 44 | @cython.wraparound(False) 45 | cpdef void maximum_path_c(int[:, :, ::1] paths, float[:, :, ::1] values, int[::1] t_ys, int[::1] t_xs) nogil: 46 | cdef int b = paths.shape[0] 47 | cdef int i 48 | for i in prange(b, nogil=True): 49 | maximum_path_each(paths[i], values[i], t_ys[i], t_xs[i]) 50 | -------------------------------------------------------------------------------- /espnet2/gan_tts/vits/monotonic_align/setup.py: -------------------------------------------------------------------------------- 1 | """Setup cython code.""" 2 | 3 | from Cython.Build import cythonize 4 | from setuptools import Extension, setup 5 | from setuptools.command.build_ext import build_ext as _build_ext 6 | 7 | 8 | class build_ext(_build_ext): 9 | """Overwrite build_ext.""" 10 | 11 | def finalize_options(self): 12 | """Prevent numpy from thinking it is still in its setup process.""" 13 | _build_ext.finalize_options(self) 14 | __builtins__.__NUMPY_SETUP__ = False 15 | import numpy 16 | 17 | self.include_dirs.append(numpy.get_include()) 18 | 19 | 20 | exts = [ 21 | Extension( 22 | name="core", 23 | sources=["core.pyx"], 24 | ) 25 | ] 26 | setup( 27 | name="monotonic_align", 28 | ext_modules=cythonize(exts, language_level=3), 29 | cmdclass={"build_ext": build_ext}, 30 | ) 31 | -------------------------------------------------------------------------------- /espnet2/gan_tts/wavenet/__init__.py: -------------------------------------------------------------------------------- 1 | from espnet2.gan_tts.wavenet.wavenet import WaveNet # NOQA 2 | -------------------------------------------------------------------------------- /espnet2/hubert/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/espnet2/hubert/__init__.py -------------------------------------------------------------------------------- /espnet2/iterators/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/espnet2/iterators/__init__.py -------------------------------------------------------------------------------- /espnet2/iterators/abs_iter_factory.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import Iterator 3 | 4 | 5 | class AbsIterFactory(ABC): 6 | @abstractmethod 7 | def build_iter(self, epoch: int, shuffle: bool = None) -> 
Iterator: 8 | raise NotImplementedError 9 | -------------------------------------------------------------------------------- /espnet2/iterators/multiple_iter_factory.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from typing import Callable, Collection, Iterator 3 | 4 | import numpy as np 5 | from typeguard import check_argument_types 6 | 7 | from espnet2.iterators.abs_iter_factory import AbsIterFactory 8 | 9 | 10 | class MultipleIterFactory(AbsIterFactory): 11 | def __init__( 12 | self, 13 | build_funcs: Collection[Callable[[], AbsIterFactory]], 14 | seed: int = 0, 15 | shuffle: bool = False, 16 | ): 17 | assert check_argument_types() 18 | self.build_funcs = list(build_funcs) 19 | self.seed = seed 20 | self.shuffle = shuffle 21 | 22 | def build_iter(self, epoch: int, shuffle: bool = None) -> Iterator: 23 | if shuffle is None: 24 | shuffle = self.shuffle 25 | 26 | build_funcs = list(self.build_funcs) 27 | 28 | if shuffle: 29 | np.random.RandomState(epoch + self.seed).shuffle(build_funcs) 30 | 31 | for i, build_func in enumerate(build_funcs): 32 | logging.info(f"Building {i}th iter-factory...") 33 | iter_factory = build_func() 34 | assert isinstance(iter_factory, AbsIterFactory), type(iter_factory) 35 | yield from iter_factory.build_iter(epoch, shuffle) 36 | -------------------------------------------------------------------------------- /espnet2/layers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/espnet2/layers/__init__.py -------------------------------------------------------------------------------- /espnet2/layers/abs_normalize.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import Tuple 3 | 4 | import torch 5 | 6 | 7 | class AbsNormalize(torch.nn.Module, ABC): 8 | @abstractmethod 9 | def forward( 10 | self, input: torch.Tensor, input_lengths: torch.Tensor = None 11 | ) -> Tuple[torch.Tensor, torch.Tensor]: 12 | # return output, output_lengths 13 | raise NotImplementedError 14 | -------------------------------------------------------------------------------- /espnet2/layers/inversible_interface.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import Tuple 3 | 4 | import torch 5 | 6 | 7 | class InversibleInterface(ABC): 8 | @abstractmethod 9 | def inverse( 10 | self, input: torch.Tensor, input_lengths: torch.Tensor = None 11 | ) -> Tuple[torch.Tensor, torch.Tensor]: 12 | # return output, output_lengths 13 | raise NotImplementedError 14 | -------------------------------------------------------------------------------- /espnet2/lm/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/espnet2/lm/__init__.py -------------------------------------------------------------------------------- /espnet2/lm/abs_model.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import Tuple 3 | 4 | import torch 5 | 6 | from espnet.nets.scorer_interface import BatchScorerInterface 7 | 8 | 9 | class AbsLM(torch.nn.Module, BatchScorerInterface, ABC): 10 | """The abstract LM class 11 | 12 | To 
share the loss calculation among different models,
13 |     we use the delegate pattern here:
14 |     the instance of this class should be passed to "LanguageModel".
15 | 
16 |     >>> from espnet2.lm.abs_model import AbsLM
17 |     >>> lm = AbsLM()
18 |     >>> model = ESPnetLanguageModel(lm=lm)
19 | 
20 |     This "model" is one of the mediator objects for the "Task" class.
21 | 
22 |     """
23 | 
24 |     @abstractmethod
25 |     def forward(
26 |         self, input: torch.Tensor, hidden: torch.Tensor
27 |     ) -> Tuple[torch.Tensor, torch.Tensor]:
28 |         raise NotImplementedError
29 | 
--------------------------------------------------------------------------------
/espnet2/main_funcs/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/espnet2/main_funcs/__init__.py
--------------------------------------------------------------------------------
/espnet2/mt/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/espnet2/mt/__init__.py
--------------------------------------------------------------------------------
/espnet2/mt/frontend/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/espnet2/mt/frontend/__init__.py
--------------------------------------------------------------------------------
/espnet2/mt/frontend/embedding.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # 2020, Technische Universität München; Ludwig Kürzinger
3 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
4 | 
5 | """Embedding Frontend for text based inputs."""
6 | 
7 | from typing import Tuple
8 | 
9 | import torch
10 | from typeguard import check_argument_types
11 | 
12 | from espnet2.asr.frontend.abs_frontend import AbsFrontend
13 | from espnet.nets.pytorch_backend.transformer.embedding import PositionalEncoding
14 | 
15 | 
16 | class Embedding(AbsFrontend):
17 |     """Embedding Frontend for text based inputs."""
18 | 
19 |     def __init__(
20 |         self,
21 |         input_size: int = 400,
22 |         embed_dim: int = 400,
23 |         pos_enc_class=PositionalEncoding,
24 |         positional_dropout_rate: float = 0.1,
25 |     ):
26 |         """Initialize.
27 | 
28 |         Args:
29 |             input_size: Number of input tokens.
30 |             embed_dim: Embedding size.
31 |             pos_enc_class: PositionalEncoding or ScaledPositionalEncoding
32 |             positional_dropout_rate: dropout rate after adding positional encoding
33 |         """
34 |         assert check_argument_types()
35 |         super().__init__()
36 |         self.embed_dim = embed_dim
37 |         # TODO(sdalmia): check for padding idx
38 |         self.embed = torch.nn.Sequential(
39 |             torch.nn.Embedding(input_size, embed_dim),
40 |             pos_enc_class(embed_dim, positional_dropout_rate),
41 |         )
42 | 
43 |     def forward(
44 |         self, input: torch.Tensor, input_lengths: torch.Tensor
45 |     ) -> Tuple[torch.Tensor, torch.Tensor]:
46 |         """Embed the input tokens and add positional encodings.
47 | 
48 |         Args:
49 |             input: Input token ids (B, T).
50 |             input_lengths: Input lengths within batch.
51 | 
52 |         Returns:
53 |             Tensor: Output with dimensions (B, T, D).
54 |             Tensor: Output lengths within batch.
55 |         """
56 |         x = self.embed(input)
57 | 
58 |         return x, input_lengths
59 | 
60 |     def output_size(self) -> int:
61 |         """Return output length of feature dimension D, i.e. the embedding dim."""
62 |         return self.embed_dim
63 | 
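A quick shape check for the Embedding frontend above; the sizes are arbitrary, and the snippet assumes the espnet imports resolve:

    import torch

    from espnet2.mt.frontend.embedding import Embedding

    frontend = Embedding(input_size=400, embed_dim=256)
    tokens = torch.randint(0, 400, (2, 50))  # (B, T) token ids
    lengths = torch.tensor([50, 42])
    feats, feat_lengths = frontend(tokens, lengths)
    print(feats.shape)             # torch.Size([2, 50, 256])
    print(frontend.output_size())  # 256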
the embedding dim.""" 62 | return self.embed_dim 63 | -------------------------------------------------------------------------------- /espnet2/optimizers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/espnet2/optimizers/__init__.py -------------------------------------------------------------------------------- /espnet2/optimizers/sgd.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from typeguard import check_argument_types 3 | 4 | 5 | class SGD(torch.optim.SGD): 6 | """Thin subclass of torch.optim.SGD that binds the required argument 'lr' to a default 7 | 8 | Note that 9 | the arguments of the optimizer invoked by AbsTask.main() 10 | must have default values, except for 'params'. 11 | 12 | torch.optim.SGD is the only stock optimizer whose 'lr' has no default value. 13 | """ 14 | 15 | def __init__( 16 | self, 17 | params, 18 | lr: float = 0.1, 19 | momentum: float = 0.0, 20 | dampening: float = 0.0, 21 | weight_decay: float = 0.0, 22 | nesterov: bool = False, 23 | ): 24 | assert check_argument_types() 25 | super().__init__( 26 | params, 27 | lr=lr, 28 | momentum=momentum, 29 | dampening=dampening, 30 | weight_decay=weight_decay, 31 | nesterov=nesterov, 32 | ) 33 | -------------------------------------------------------------------------------- /espnet2/s2t/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/espnet2/s2t/__init__.py -------------------------------------------------------------------------------- /espnet2/samplers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/espnet2/samplers/__init__.py -------------------------------------------------------------------------------- /espnet2/samplers/abs_sampler.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import Iterator, Tuple 3 | 4 | from torch.utils.data import Sampler 5 | 6 | 7 | class AbsSampler(Sampler, ABC): 8 | @abstractmethod 9 | def __len__(self) -> int: 10 | raise NotImplementedError 11 | 12 | @abstractmethod 13 | def __iter__(self) -> Iterator[Tuple[str, ...]]: 14 | raise NotImplementedError 15 | 16 | def generate(self, seed): 17 | return list(self) 18 | -------------------------------------------------------------------------------- /espnet2/schedulers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/espnet2/schedulers/__init__.py -------------------------------------------------------------------------------- /espnet2/schedulers/abs_scheduler.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | 3 | import torch.optim.lr_scheduler as L 4 | 5 | 6 | class AbsScheduler(ABC): 7 | @abstractmethod 8 | def step(self, epoch: int = None): 9 | pass 10 | 11 | @abstractmethod 12 | def state_dict(self): 13 | pass 14 | 15 | @abstractmethod 16 | def load_state_dict(self, state): 17 | pass 18 | 19 | 20 | # If you need to define a custom scheduler, please
inherit from these classes 21 | class AbsBatchStepScheduler(AbsScheduler): 22 | @abstractmethod 23 | def step(self, epoch: int = None): 24 | pass 25 | 26 | @abstractmethod 27 | def state_dict(self): 28 | pass 29 | 30 | @abstractmethod 31 | def load_state_dict(self, state): 32 | pass 33 | 34 | 35 | class AbsEpochStepScheduler(AbsScheduler): 36 | @abstractmethod 37 | def step(self, epoch: int = None): 38 | pass 39 | 40 | @abstractmethod 41 | def state_dict(self): 42 | pass 43 | 44 | @abstractmethod 45 | def load_state_dict(self, state): 46 | pass 47 | 48 | 49 | class AbsValEpochStepScheduler(AbsEpochStepScheduler): 50 | @abstractmethod 51 | def step(self, val, epoch: int = None): 52 | pass 53 | 54 | @abstractmethod 55 | def state_dict(self): 56 | pass 57 | 58 | @abstractmethod 59 | def load_state_dict(self, state): 60 | pass 61 | 62 | 63 | # Create alias types for isinstance checks 64 | # Note(kamo): Currently PyTorch doesn't provide base classes 65 | # that distinguish these scheduler kinds. 66 | AbsValEpochStepScheduler.register(L.ReduceLROnPlateau) 67 | for s in [ 68 | L.ReduceLROnPlateau, 69 | L.LambdaLR, 70 | L.StepLR, 71 | L.MultiStepLR, 72 | L.ExponentialLR, 73 | L.CosineAnnealingLR, 74 | ]: 75 | AbsEpochStepScheduler.register(s) 76 | 77 | AbsBatchStepScheduler.register(L.CyclicLR) 78 | for s in [ 79 | L.OneCycleLR, 80 | L.CosineAnnealingWarmRestarts, 81 | ]: 82 | AbsBatchStepScheduler.register(s) 83 | -------------------------------------------------------------------------------- /espnet2/schedulers/warmup_lr.py: -------------------------------------------------------------------------------- 1 | """Warm up learning rate scheduler module.""" 2 | from typing import Union 3 | 4 | import torch 5 | from torch.optim.lr_scheduler import _LRScheduler 6 | from typeguard import check_argument_types 7 | 8 | from espnet2.schedulers.abs_scheduler import AbsBatchStepScheduler 9 | 10 | 11 | class WarmupLR(_LRScheduler, AbsBatchStepScheduler): 12 | """The WarmupLR scheduler 13 | 14 | This scheduler is almost the same as the NoamLR scheduler except for the following difference: 15 | 16 | NoamLR: 17 | lr = optimizer.lr * model_size ** -0.5 18 | * min(step ** -0.5, step * warmup_step ** -1.5) 19 | WarmupLR: 20 | lr = optimizer.lr * warmup_step ** 0.5 21 | * min(step ** -0.5, step * warmup_step ** -1.5) 22 | 23 | Note that the maximum lr equals optimizer.lr in this scheduler.
24 | 25 | """ 26 | 27 | def __init__( 28 | self, 29 | optimizer: torch.optim.Optimizer, 30 | warmup_steps: Union[int, float] = 25000, 31 | last_epoch: int = -1, 32 | ): 33 | assert check_argument_types() 34 | self.warmup_steps = warmup_steps 35 | 36 | # __init__() must be invoked before setting field 37 | # because step() is also invoked in __init__() 38 | super().__init__(optimizer, last_epoch) 39 | 40 | def __repr__(self): 41 | return f"{self.__class__.__name__}(warmup_steps={self.warmup_steps})" 42 | 43 | def get_lr(self): 44 | step_num = self.last_epoch + 1 45 | return [ 46 | lr 47 | * self.warmup_steps**0.5 48 | * min(step_num**-0.5, step_num * self.warmup_steps**-1.5) 49 | for lr in self.base_lrs 50 | ] 51 | -------------------------------------------------------------------------------- /espnet2/slu/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/espnet2/slu/__init__.py -------------------------------------------------------------------------------- /espnet2/slu/postdecoder/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/espnet2/slu/postdecoder/__init__.py -------------------------------------------------------------------------------- /espnet2/slu/postdecoder/abs_postdecoder.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | 3 | import torch 4 | 5 | 6 | class AbsPostDecoder(torch.nn.Module, ABC): 7 | @abstractmethod 8 | def output_size(self) -> int: 9 | raise NotImplementedError 10 | 11 | @abstractmethod 12 | def forward( 13 | self, 14 | transcript_input_ids: torch.LongTensor, 15 | transcript_attention_mask: torch.LongTensor, 16 | transcript_token_type_ids: torch.LongTensor, 17 | transcript_position_ids: torch.LongTensor, 18 | ) -> torch.Tensor: 19 | raise NotImplementedError 20 | 21 | @abstractmethod 22 | def convert_examples_to_features( 23 | self, data: list, max_seq_length: int, output_size: int 24 | ): 25 | raise NotImplementedError 26 | -------------------------------------------------------------------------------- /espnet2/slu/postencoder/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/espnet2/slu/postencoder/__init__.py -------------------------------------------------------------------------------- /espnet2/spk/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/espnet2/spk/__init__.py -------------------------------------------------------------------------------- /espnet2/spk/encoder/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/espnet2/spk/encoder/__init__.py -------------------------------------------------------------------------------- /espnet2/spk/layers/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/espnet2/spk/layers/__init__.py -------------------------------------------------------------------------------- /espnet2/spk/loss/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/espnet2/spk/loss/__init__.py -------------------------------------------------------------------------------- /espnet2/spk/loss/abs_loss.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/python 2 | # -*- encoding: utf-8 -*- 3 | # code from https://github.com/clovaai/voxceleb_trainer/blob/master/loss/aamsoftmax.py 4 | # Adapted from https://github.com/wujiyang/Face_Pytorch (Apache License) 5 | from abc import ABC, abstractmethod 6 | 7 | import torch 8 | import torch.nn as nn 9 | 10 | 11 | class AbsLoss(nn.Module): 12 | def __init__(self, nout: int, **kwargs): 13 | super().__init__() 14 | 15 | @abstractmethod 16 | def forward(self, x: torch.Tensor, label=None) -> torch.Tensor: 17 | raise NotImplementedError 18 | -------------------------------------------------------------------------------- /espnet2/spk/pooling/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/espnet2/spk/pooling/__init__.py -------------------------------------------------------------------------------- /espnet2/spk/pooling/abs_pooling.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | 3 | import torch 4 | 5 | 6 | class AbsPooling(torch.nn.Module, ABC): 7 | @abstractmethod 8 | def forward(self, input: torch.Tensor) -> torch.Tensor: 9 | raise NotImplementedError 10 | -------------------------------------------------------------------------------- /espnet2/spk/pooling/chn_attn_stat_pooling.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | from espnet2.spk.pooling.abs_pooling import AbsPooling 5 | 6 | 7 | class ChnAttnStatPooling(AbsPooling): 8 | """ 9 | Aggregates frame-level features to single utterance-level feature. 10 | Proposed in B.Desplanques et al., "ECAPA-TDNN: Emphasized Channel 11 | Attention, Propagation and Aggregation in TDNN Based Speaker Verification" 12 | 13 | args: 14 | input_size: dimensionality of the input frame-level embeddings. 15 | Determined by encoder hyperparameter. 
16 | For this pooling layer, the output dimensionality will be double 17 | the input_size 18 | """ 19 | 20 | def __init__(self, input_size: int = 1536): 21 | super().__init__() 22 | self.attention = nn.Sequential( 23 | nn.Conv1d(input_size * 3, 128, kernel_size=1), 24 | nn.ReLU(), 25 | nn.BatchNorm1d(128), 26 | nn.Conv1d(128, input_size, kernel_size=1), 27 | nn.Softmax(dim=2), 28 | ) 29 | 30 | def forward(self, x): 31 | t = x.size()[-1] 32 | global_x = torch.cat( 33 | ( 34 | x, 35 | torch.mean(x, dim=2, keepdim=True).repeat(1, 1, t), 36 | torch.sqrt( 37 | torch.var(x, dim=2, keepdim=True).clamp(min=1e-4, max=1e4) 38 | ).repeat(1, 1, t), 39 | ), 40 | dim=1, 41 | ) 42 | 43 | w = self.attention(global_x) 44 | 45 | mu = torch.sum(x * w, dim=2) 46 | sg = torch.sqrt( 47 | (torch.sum((x**2) * w, dim=2) - mu**2).clamp(min=1e-4, max=1e4) 48 | ) 49 | 50 | x = torch.cat((mu, sg), dim=1) 51 | 52 | return x 53 | -------------------------------------------------------------------------------- /espnet2/spk/projector/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/espnet2/spk/projector/__init__.py -------------------------------------------------------------------------------- /espnet2/spk/projector/abs_projector.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | 3 | import torch 4 | 5 | 6 | class AbsProjector(torch.nn.Module, ABC): 7 | @abstractmethod 8 | def output_size(self) -> int: 9 | raise NotImplementedError 10 | 11 | @abstractmethod 12 | def forward(self, utt_embd: torch.Tensor) -> torch.Tensor: 13 | raise NotImplementedError 14 | -------------------------------------------------------------------------------- /espnet2/spk/projector/rawnet3_projector.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from espnet2.spk.projector.abs_projector import AbsProjector 4 | 5 | 6 | class RawNet3Projector(AbsProjector): 7 | def __init__(self, input_size, output_size): 8 | super().__init__() 9 | self._output_size = output_size 10 | 11 | self.bn = torch.nn.BatchNorm1d(input_size) 12 | self.fc = torch.nn.Linear(input_size, output_size) 13 | 14 | def output_size(self): 15 | return self._output_size 16 | 17 | def forward(self, x): 18 | return self.fc(self.bn(x)) 19 | -------------------------------------------------------------------------------- /espnet2/st/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/espnet2/st/__init__.py -------------------------------------------------------------------------------- /espnet2/svs/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/espnet2/svs/__init__.py -------------------------------------------------------------------------------- /espnet2/svs/abs_svs.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Tomoki Hayashi 2 | # Copyright 2021 Carnegie Mellon University (Jiatong Shi) 3 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 4 | 5 | """Singing-voice-synthesis abstract class.""" 6 | 7 | from abc import ABC, abstractmethod 8 | from
typing import Dict, Tuple 9 | 10 | import torch 11 | 12 | 13 | class AbsSVS(torch.nn.Module, ABC): 14 | """SVS abstract class.""" 15 | 16 | @abstractmethod 17 | def forward( 18 | self, 19 | text: torch.Tensor, 20 | text_lengths: torch.Tensor, 21 | feats: torch.Tensor, 22 | feats_lengths: torch.Tensor, 23 | **kwargs, 24 | ) -> Tuple[torch.Tensor, Dict[str, torch.Tensor], torch.Tensor]: 25 | """Calculate outputs and return the loss tensor.""" 26 | raise NotImplementedError 27 | 28 | @abstractmethod 29 | def inference( 30 | self, 31 | text: torch.Tensor, 32 | **kwargs, 33 | ) -> Dict[str, torch.Tensor]: 34 | """Return predicted output as a dict.""" 35 | raise NotImplementedError 36 | 37 | @property 38 | def require_raw_singing(self): 39 | """Return whether or not raw_singing is required.""" 40 | return False 41 | 42 | @property 43 | def require_vocoder(self): 44 | """Return whether or not vocoder is required.""" 45 | return True 46 | -------------------------------------------------------------------------------- /espnet2/svs/feats_extract/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/espnet2/svs/feats_extract/__init__.py -------------------------------------------------------------------------------- /espnet2/svs/naive_rnn/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/espnet2/svs/naive_rnn/__init__.py -------------------------------------------------------------------------------- /espnet2/svs/singing_tacotron/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/espnet2/svs/singing_tacotron/__init__.py -------------------------------------------------------------------------------- /espnet2/svs/xiaoice/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/espnet2/svs/xiaoice/__init__.py -------------------------------------------------------------------------------- /espnet2/tasks/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/espnet2/tasks/__init__.py -------------------------------------------------------------------------------- /espnet2/text/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/espnet2/text/__init__.py -------------------------------------------------------------------------------- /espnet2/text/abs_tokenizer.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import Iterable, List 3 | 4 | 5 | class AbsTokenizer(ABC): 6 | @abstractmethod 7 | def text2tokens(self, line: str) -> List[str]: 8 | raise NotImplementedError 9 | 10 | @abstractmethod 11 | def tokens2text(self, tokens: Iterable[str]) -> str: 12 | raise NotImplementedError 13 | -------------------------------------------------------------------------------- 
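The AbsTokenizer interface above is the contract that every tokenizer in this tree implements: text2tokens() splits a string into tokens, and tokens2text() inverts it. As a minimal sketch (not a file in this repo; the class name is hypothetical), a character-level tokenizer satisfying the interface could look like this:

from typing import Iterable, List

from espnet2.text.abs_tokenizer import AbsTokenizer


class SimpleCharTokenizer(AbsTokenizer):
    """Toy tokenizer: every character is one token."""

    def text2tokens(self, line: str) -> List[str]:
        return list(line)

    def tokens2text(self, tokens: Iterable[str]) -> str:
        return "".join(tokens)

For any string s, tokens2text(text2tokens(s)) round-trips exactly; the concrete tokenizers that follow (Hugging Face, SentencePiece, word-level) aim to preserve the same property up to normalization.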
/espnet2/text/hugging_face_token_id_converter.py: -------------------------------------------------------------------------------- 1 | from typing import Iterable, List, Union 2 | 3 | import numpy as np 4 | from typeguard import check_argument_types 5 | 6 | try: 7 | from transformers import AutoTokenizer 8 | 9 | is_transformers_available = True 10 | except ImportError: 11 | is_transformers_available = False 12 | 13 | 14 | class HuggingFaceTokenIDConverter: 15 | def __init__( 16 | self, 17 | model_name_or_path: str, 18 | ): 19 | assert check_argument_types() 20 | 21 | if not is_transformers_available: 22 | raise ImportError( 23 | "`transformers` is not available. Please install it via `pip install" 24 | " transformers` or `cd /path/to/espnet/tools && . ./activate_python.sh" 25 | " && ./installers/install_transformers.sh`." 26 | ) 27 | 28 | self.tokenizer = AutoTokenizer.from_pretrained(model_name_or_path) 29 | 30 | def get_num_vocabulary_size(self) -> int: 31 | return self.tokenizer.vocab_size 32 | 33 | def ids2tokens(self, integers: Union[np.ndarray, Iterable[int]]) -> List[str]: 34 | return self.tokenizer.convert_ids_to_tokens(integers) 35 | 36 | def tokens2ids(self, tokens: Iterable[str]) -> List[int]: 37 | return self.tokenizer.convert_tokens_to_ids(tokens) 38 | -------------------------------------------------------------------------------- /espnet2/text/hugging_face_tokenizer.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | from typing import Iterable, List, Union 3 | 4 | from typeguard import check_argument_types 5 | 6 | from espnet2.text.abs_tokenizer import AbsTokenizer 7 | 8 | try: 9 | from transformers import AutoTokenizer 10 | 11 | is_transformers_available = True 12 | except ImportError: 13 | is_transformers_available = False 14 | 15 | 16 | class HuggingFaceTokenizer(AbsTokenizer): 17 | def __init__(self, model: Union[Path, str]): 18 | assert check_argument_types() 19 | 20 | if not is_transformers_available: 21 | raise ImportError( 22 | "`transformers` is not available. Please install it via `pip install" 23 | " transformers` or `cd /path/to/espnet/tools && . ./activate_python.sh" 24 | " && ./installers/install_transformers.sh`." 25 | ) 26 | 27 | self.model = str(model) 28 | # NOTE(kamo): 29 | # Don't build tokenizer in __init__() 30 | # because it's not picklable and it may cause following error, 31 | # "TypeError: can't pickle SwigPyObject objects", 32 | # when giving it as argument of "multiprocessing.Process()". 33 | self.tokenizer = None 34 | 35 | def __repr__(self): 36 | return f'{self.__class__.__name__}(model="{self.model}")' 37 | 38 | def _build_tokenizer(self): 39 | # Build Hugging Face tokenizer lazily. 
40 | if self.tokenizer is None: 41 | self.tokenizer = AutoTokenizer.from_pretrained(self.model) 42 | 43 | def text2tokens(self, line: str) -> List[str]: 44 | self._build_tokenizer() 45 | return self.tokenizer.tokenize(line) 46 | 47 | def tokens2text(self, tokens: Iterable[str]) -> str: 48 | self._build_tokenizer() 49 | return ( 50 | self.tokenizer.batch_decode( 51 | [self.tokenizer.convert_tokens_to_ids(tokens)], skip_special_tokens=True 52 | )[0] 53 | .replace("\n", " ") 54 | .strip() 55 | ) 56 | -------------------------------------------------------------------------------- /espnet2/text/korean_cleaner.py: -------------------------------------------------------------------------------- 1 | # Referenced from https://github.com/hccho2/Tacotron-Wavenet-Vocoder-Korean 2 | 3 | import re 4 | 5 | 6 | class KoreanCleaner: 7 | @classmethod 8 | def _normalize_numbers(cls, text): 9 | number_to_kor = { 10 | "0": "영", 11 | "1": "일", 12 | "2": "이", 13 | "3": "삼", 14 | "4": "사", 15 | "5": "오", 16 | "6": "육", 17 | "7": "칠", 18 | "8": "팔", 19 | "9": "구", 20 | } 21 | new_text = "".join( 22 | number_to_kor[char] if char in number_to_kor.keys() else char 23 | for char in text 24 | ) 25 | return new_text 26 | 27 | @classmethod 28 | def _normalize_english_text(cls, text): 29 | upper_alphabet_to_kor = { 30 | "A": "에이", 31 | "B": "비", 32 | "C": "씨", 33 | "D": "디", 34 | "E": "이", 35 | "F": "에프", 36 | "G": "지", 37 | "H": "에이치", 38 | "I": "아이", 39 | "J": "제이", 40 | "K": "케이", 41 | "L": "엘", 42 | "M": "엠", 43 | "N": "엔", 44 | "O": "오", 45 | "P": "피", 46 | "Q": "큐", 47 | "R": "알", 48 | "S": "에스", 49 | "T": "티", 50 | "U": "유", 51 | "V": "브이", 52 | "W": "더블유", 53 | "X": "엑스", 54 | "Y": "와이", 55 | "Z": "지", 56 | } 57 | new_text = re.sub("[a-z]+", lambda x: str.upper(x.group()), text) 58 | new_text = "".join( 59 | upper_alphabet_to_kor[char] 60 | if char in upper_alphabet_to_kor.keys() 61 | else char 62 | for char in new_text 63 | ) 64 | 65 | return new_text 66 | 67 | @classmethod 68 | def normalize_text(cls, text): 69 | # stage 0 : text strip 70 | text = text.strip() 71 | 72 | # stage 1 : normalize numbers 73 | text = cls._normalize_numbers(text) 74 | 75 | # stage 2 : normalize english text 76 | text = cls._normalize_english_text(text) 77 | return text 78 | -------------------------------------------------------------------------------- /espnet2/text/sentencepiece_tokenizer.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | from typing import Dict, Iterable, List, Union 3 | 4 | import sentencepiece as spm 5 | from typeguard import check_argument_types 6 | 7 | from espnet2.text.abs_tokenizer import AbsTokenizer 8 | 9 | 10 | class SentencepiecesTokenizer(AbsTokenizer): 11 | def __init__(self, model: Union[Path, str], encode_kwargs: Dict = dict()): 12 | assert check_argument_types() 13 | self.model = str(model) 14 | # NOTE(kamo): 15 | # Don't build SentencePieceProcessor in __init__() 16 | # because it's not picklable and it may cause following error, 17 | # "TypeError: can't pickle SwigPyObject objects", 18 | # when giving it as argument of "multiprocessing.Process()". 19 | self.sp = None 20 | self.encode_kwargs = encode_kwargs 21 | 22 | def __repr__(self): 23 | return f'{self.__class__.__name__}(model="{self.model}")' 24 | 25 | def _build_sentence_piece_processor(self): 26 | # Build SentencePieceProcessor lazily. 
27 | if self.sp is None: 28 | self.sp = spm.SentencePieceProcessor() 29 | self.sp.load(self.model) 30 | 31 | def text2tokens(self, line: str) -> List[str]: 32 | self._build_sentence_piece_processor() 33 | return self.sp.EncodeAsPieces(line, **self.encode_kwargs) 34 | 35 | def tokens2text(self, tokens: Iterable[str]) -> str: 36 | self._build_sentence_piece_processor() 37 | return self.sp.DecodePieces(list(tokens)) 38 | -------------------------------------------------------------------------------- /espnet2/text/word_tokenizer.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | from pathlib import Path 3 | from typing import Iterable, List, Union 4 | 5 | from typeguard import check_argument_types 6 | 7 | from espnet2.text.abs_tokenizer import AbsTokenizer 8 | 9 | 10 | class WordTokenizer(AbsTokenizer): 11 | def __init__( 12 | self, 13 | delimiter: str = None, 14 | non_linguistic_symbols: Union[Path, str, Iterable[str]] = None, 15 | remove_non_linguistic_symbols: bool = False, 16 | ): 17 | assert check_argument_types() 18 | self.delimiter = delimiter 19 | 20 | if not remove_non_linguistic_symbols and non_linguistic_symbols is not None: 21 | warnings.warn( 22 | "non_linguistic_symbols is only used " 23 | "when remove_non_linguistic_symbols = True" 24 | ) 25 | 26 | if non_linguistic_symbols is None: 27 | self.non_linguistic_symbols = set() 28 | elif isinstance(non_linguistic_symbols, (Path, str)): 29 | non_linguistic_symbols = Path(non_linguistic_symbols) 30 | try: 31 | with non_linguistic_symbols.open("r", encoding="utf-8") as f: 32 | self.non_linguistic_symbols = set(line.rstrip() for line in f) 33 | except FileNotFoundError: 34 | warnings.warn(f"{non_linguistic_symbols} doesn't exist.") 35 | self.non_linguistic_symbols = set() 36 | else: 37 | self.non_linguistic_symbols = set(non_linguistic_symbols) 38 | self.remove_non_linguistic_symbols = remove_non_linguistic_symbols 39 | 40 | def __repr__(self): 41 | return f'{self.__class__.__name__}(delimiter="{self.delimiter}")' 42 | 43 | def text2tokens(self, line: str) -> List[str]: 44 | tokens = [] 45 | for t in line.split(self.delimiter): 46 | if self.remove_non_linguistic_symbols and t in self.non_linguistic_symbols: 47 | continue 48 | tokens.append(t) 49 | return tokens 50 | 51 | def tokens2text(self, tokens: Iterable[str]) -> str: 52 | if self.delimiter is None: 53 | delimiter = " " 54 | else: 55 | delimiter = self.delimiter 56 | return delimiter.join(tokens) 57 | -------------------------------------------------------------------------------- /espnet2/torch_utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/espnet2/torch_utils/__init__.py -------------------------------------------------------------------------------- /espnet2/torch_utils/add_gradient_noise.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def add_gradient_noise( 5 | model: torch.nn.Module, 6 | iteration: int, 7 | duration: float = 100, 8 | eta: float = 1.0, 9 | scale_factor: float = 0.55, 10 | ): 11 | """Adds noise from a standard normal distribution to the gradients. 12 | 13 | The standard deviation (`sigma`) is controlled 14 | by the three hyper-parameters below. 15 | `sigma` goes to zero (no noise) with more iterations. 16 | 17 | Args: 18 | model: Model. 19 | iteration: Number of iterations. 
20 | duration: {100, 1000}: Number of iterations between 21 | each decay of `sigma`. 22 | eta: {0.01, 0.3, 1.0}: The magnitude of `sigma`. 23 | scale_factor: {0.55}: The decay exponent of `sigma`. 24 | """ 25 | interval = (iteration // duration) + 1 26 | sigma = eta / interval**scale_factor 27 | for param in model.parameters(): 28 | if param.grad is not None: 29 | _shape = param.grad.size() 30 | noise = sigma * torch.randn(_shape).to(param.device) 31 | param.grad += noise 32 | -------------------------------------------------------------------------------- /espnet2/torch_utils/forward_adaptor.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from typeguard import check_argument_types 3 | 4 | 5 | class ForwardAdaptor(torch.nn.Module): 6 | """Wrapper module to parallelize a specified method 7 | 8 | torch.nn.DataParallel parallelizes only "forward()", 9 | so a method with any other name cannot be parallelized 10 | without wrapping the module as this class does. 11 | 12 | Examples: 13 | >>> class A(torch.nn.Module): 14 | ... def foo(self, x): 15 | ... ... 16 | >>> model = A() 17 | >>> model = ForwardAdaptor(model, "foo") 18 | >>> model = torch.nn.DataParallel(model, device_ids=[0, 1]) 19 | >>> x = torch.randn(2, 10) 20 | >>> model(x) 21 | """ 22 | 23 | def __init__(self, module: torch.nn.Module, name: str): 24 | assert check_argument_types() 25 | super().__init__() 26 | self.module = module 27 | self.name = name 28 | if not hasattr(module, name): 29 | raise ValueError(f"{module} doesn't have {name}") 30 | 31 | def forward(self, *args, **kwargs): 32 | func = getattr(self.module, self.name) 33 | return func(*args, **kwargs) 34 | -------------------------------------------------------------------------------- /espnet2/torch_utils/get_layer_from_string.py: -------------------------------------------------------------------------------- 1 | import difflib 2 | 3 | import torch 4 | 5 | 6 | def get_layer(l_name, library=torch.nn): 7 | """Return layer object handler from library e.g. from torch.nn 8 | 9 | E.g. if l_name=="elu", returns torch.nn.ELU. 10 | 11 | Args: 12 | l_name (string): Case insensitive name for layer in library (e.g. 'elu'). 13 | library (module): Library/module in which to search for the object handler 14 | named l_name, e.g. torch.nn. 15 | 16 | Returns: 17 | layer_handler (object): handler for the requested layer e.g.
(torch.nn.ELU) 18 | 19 | """ 20 | 21 | all_torch_layers = [x for x in dir(library)] 22 | match = [x for x in all_torch_layers if l_name.lower() == x.lower()] 23 | if len(match) == 0: 24 | close_matches = difflib.get_close_matches( 25 | l_name, [x.lower() for x in all_torch_layers] 26 | ) 27 | raise NotImplementedError( 28 | "Layer with name {} not found in {}.\n Closest matches: {}".format( 29 | l_name, str(library), close_matches 30 | ) 31 | ) 32 | elif len(match) > 1: 33 | close_matches = difflib.get_close_matches( 34 | l_name, [x.lower() for x in all_torch_layers] 35 | ) 36 | raise NotImplementedError( 37 | "Multiple matches for layer with name {} found in {}.\n " 38 | "All matches: {}".format(l_name, str(library), close_matches) 39 | ) 40 | else: 41 | # valid 42 | layer_handler = getattr(library, match[0]) 43 | return layer_handler 44 | -------------------------------------------------------------------------------- /espnet2/torch_utils/pytorch_version.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def pytorch_cudnn_version() -> str: 5 | message = ( 6 | f"pytorch.version={torch.__version__}, " 7 | f"cuda.available={torch.cuda.is_available()}, " 8 | ) 9 | 10 | if torch.backends.cudnn.enabled: 11 | message += ( 12 | f"cudnn.version={torch.backends.cudnn.version()}, " 13 | f"cudnn.benchmark={torch.backends.cudnn.benchmark}, " 14 | f"cudnn.deterministic={torch.backends.cudnn.deterministic}" 15 | ) 16 | return message 17 | -------------------------------------------------------------------------------- /espnet2/torch_utils/recursive_op.py: -------------------------------------------------------------------------------- 1 | """Torch utility module.""" 2 | import torch 3 | 4 | if torch.distributed.is_available(): 5 | from torch.distributed import ReduceOp 6 | 7 | 8 | def recursive_sum(obj, weight: torch.Tensor, distributed: bool = False): 9 | assert weight.dim() == 1, weight.size() 10 | if isinstance(obj, (tuple, list)): 11 | return type(obj)(recursive_sum(v, weight, distributed) for v in obj) 12 | elif isinstance(obj, dict): 13 | return {k: recursive_sum(v, weight, distributed) for k, v in obj.items()} 14 | elif isinstance(obj, torch.Tensor): 15 | assert obj.size() == weight.size(), (obj.size(), weight.size()) 16 | obj = (obj * weight.type(obj.dtype)).sum() 17 | if distributed: 18 | torch.distributed.all_reduce(obj, op=ReduceOp.SUM) 19 | return obj 20 | elif obj is None: 21 | return None 22 | else: 23 | raise ValueError(type(obj)) 24 | 25 | 26 | def recursive_divide(a, b: torch.Tensor): 27 | if isinstance(a, (tuple, list)): 28 | return type(a)(recursive_divide(v, b) for v in a) 29 | elif isinstance(a, dict): 30 | return {k: recursive_divide(v, b) for k, v in a.items()} 31 | elif isinstance(a, torch.Tensor): 32 | assert a.size() == b.size(), (a.size(), b.size()) 33 | return a / b.type(a.dtype) 34 | elif a is None: 35 | return None 36 | else: 37 | raise ValueError(type(a)) 38 | 39 | 40 | def recursive_average(obj, weight: torch.Tensor, distributed: bool = False): 41 | obj = recursive_sum(obj, weight, distributed) 42 | weight = weight.sum() 43 | if distributed: 44 | torch.distributed.all_reduce(weight, op=ReduceOp.SUM) 45 | # Normalize weight to sum to 1 46 | obj = recursive_divide(obj, weight) 47 | return obj, weight 48 | -------------------------------------------------------------------------------- /espnet2/torch_utils/set_all_random_seed.py:
-------------------------------------------------------------------------------- 1 | import random 2 | 3 | import numpy as np 4 | import torch 5 | 6 | 7 | def set_all_random_seed(seed: int): 8 | random.seed(seed) 9 | np.random.seed(seed) 10 | torch.random.manual_seed(seed) 11 | -------------------------------------------------------------------------------- /espnet2/train/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/espnet2/train/__init__.py -------------------------------------------------------------------------------- /espnet2/train/abs_espnet_model.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import Dict, Tuple 3 | 4 | import torch 5 | 6 | 7 | class AbsESPnetModel(torch.nn.Module, ABC): 8 | """The common abstract class among all tasks 9 | 10 | "ESPnetModel" refers to a class that inherits torch.nn.Module, 11 | holds the DNN models it forwards to as member fields 12 | (a.k.a. the delegate pattern), 13 | and defines "loss", "stats", and "weight" for the task. 14 | 15 | If you intend to implement a new task in ESPnet, 16 | the model must inherit this class. 17 | In other words, the "mediator" objects between 18 | our training system and your task class are 19 | just these three values: loss, stats, and weight. 20 | 21 | Example: 22 | >>> from espnet2.tasks.abs_task import AbsTask 23 | >>> class YourESPnetModel(AbsESPnetModel): 24 | ...     def forward(self, input, input_lengths): 25 | ...         ... 26 | ...         return loss, stats, weight 27 | >>> class YourTask(AbsTask): 28 | ...     @classmethod 29 | ...     def build_model(cls, args: argparse.Namespace) -> YourESPnetModel: 30 | """ 31 | 32 | @abstractmethod 33 | def forward( 34 | self, **batch: torch.Tensor 35 | ) -> Tuple[torch.Tensor, Dict[str, torch.Tensor], torch.Tensor]: 36 | raise NotImplementedError 37 | 38 | @abstractmethod 39 | def collect_feats(self, **batch: torch.Tensor) -> Dict[str, torch.Tensor]: 40 | raise NotImplementedError 41 | -------------------------------------------------------------------------------- /espnet2/tts/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/espnet2/tts/__init__.py -------------------------------------------------------------------------------- /espnet2/tts/abs_tts.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Tomoki Hayashi 2 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 3 | 4 | """Text-to-speech abstract class.""" 5 | 6 | from abc import ABC, abstractmethod 7 | from typing import Dict, Tuple 8 | 9 | import torch 10 | 11 | 12 | class AbsTTS(torch.nn.Module, ABC): 13 | """TTS abstract class.""" 14 | 15 | @abstractmethod 16 | def forward( 17 | self, 18 | text: torch.Tensor, 19 | text_lengths: torch.Tensor, 20 | feats: torch.Tensor, 21 | feats_lengths: torch.Tensor, 22 | **kwargs, 23 | ) -> Tuple[torch.Tensor, Dict[str, torch.Tensor], torch.Tensor]: 24 | """Calculate outputs and return the loss tensor.""" 25 | raise NotImplementedError 26 | 27 | @abstractmethod 28 | def inference( 29 | self, 30 | text: torch.Tensor, 31 | **kwargs, 32 | ) -> Dict[str, torch.Tensor]: 33 | """Return predicted output as a dict.""" 34 | raise NotImplementedError 35 |
36 | @property 37 | def require_raw_speech(self): 38 | """Return whether or not raw_speech is required.""" 39 | return False 40 | 41 | @property 42 | def require_vocoder(self): 43 | """Return whether or not vocoder is required.""" 44 | return True 45 | -------------------------------------------------------------------------------- /espnet2/tts/fastspeech/__init__.py: -------------------------------------------------------------------------------- 1 | from espnet2.tts.fastspeech.fastspeech import FastSpeech # NOQA 2 | -------------------------------------------------------------------------------- /espnet2/tts/fastspeech2/__init__.py: -------------------------------------------------------------------------------- 1 | from espnet2.tts.fastspeech2.fastspeech2 import FastSpeech2 # NOQA 2 | -------------------------------------------------------------------------------- /espnet2/tts/feats_extract/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/espnet2/tts/feats_extract/__init__.py -------------------------------------------------------------------------------- /espnet2/tts/feats_extract/abs_feats_extract.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import Any, Dict, Tuple 3 | 4 | import torch 5 | 6 | 7 | class AbsFeatsExtract(torch.nn.Module, ABC): 8 | @abstractmethod 9 | def output_size(self) -> int: 10 | raise NotImplementedError 11 | 12 | @abstractmethod 13 | def get_parameters(self) -> Dict[str, Any]: 14 | raise NotImplementedError 15 | 16 | @abstractmethod 17 | def forward( 18 | self, input: torch.Tensor, input_lengths: torch.Tensor 19 | ) -> Tuple[torch.Tensor, torch.Tensor]: 20 | raise NotImplementedError 21 | -------------------------------------------------------------------------------- /espnet2/tts/gst/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/espnet2/tts/gst/__init__.py -------------------------------------------------------------------------------- /espnet2/tts/prodiff/__init__.py: -------------------------------------------------------------------------------- 1 | from espnet2.tts.prodiff.prodiff import ProDiff # NOQA 2 | -------------------------------------------------------------------------------- /espnet2/tts/tacotron2/__init__.py: -------------------------------------------------------------------------------- 1 | from espnet2.tts.tacotron2.tacotron2 import Tacotron2 # NOQA 2 | -------------------------------------------------------------------------------- /espnet2/tts/transformer/__init__.py: -------------------------------------------------------------------------------- 1 | from espnet2.tts.transformer.transformer import Transformer # NOQA 2 | -------------------------------------------------------------------------------- /espnet2/tts/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from espnet2.tts.utils.duration_calculator import DurationCalculator 2 | from espnet2.tts.utils.parallel_wavegan_pretrained_vocoder import ( 3 | ParallelWaveGANPretrainedVocoder, 4 | ) 5 | 6 | __all__ = ["DurationCalculator", "ParallelWaveGANPretrainedVocoder"] 7 | 
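Tying the abstract classes above together: a concrete task model implements the (loss, stats, weight) contract of abs_espnet_model.py. A hedged sketch (the class, the 80-dim feature size, and the use of batch size as the weight are illustrative assumptions, not code from this repo):

from typing import Dict, Tuple

import torch

from espnet2.train.abs_espnet_model import AbsESPnetModel


class ToyESPnetModel(AbsESPnetModel):
    """Hypothetical model: reconstructs 80-dim features with an MSE loss."""

    def __init__(self):
        super().__init__()
        self.proj = torch.nn.Linear(80, 80)

    def forward(
        self, feats: torch.Tensor, feats_lengths: torch.Tensor
    ) -> Tuple[torch.Tensor, Dict[str, torch.Tensor], torch.Tensor]:
        loss = torch.nn.functional.mse_loss(self.proj(feats), feats)
        stats = {"loss": loss.detach()}
        # The trainer averages stats weighted by this value; batch size
        # is the usual choice.
        weight = torch.tensor(float(feats.size(0)))
        return loss, stats, weight

    def collect_feats(
        self, feats: torch.Tensor, feats_lengths: torch.Tensor
    ) -> Dict[str, torch.Tensor]:
        return {"feats": feats, "feats_lengths": feats_lengths}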
-------------------------------------------------------------------------------- /espnet2/tts/utils/parallel_wavegan_pretrained_vocoder.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Tomoki Hayashi 2 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 3 | 4 | """Wrapper class for the vocoder model trained with parallel_wavegan repo.""" 5 | 6 | import logging 7 | import os 8 | from pathlib import Path 9 | from typing import Optional, Union 10 | 11 | import torch 12 | import yaml 13 | 14 | 15 | class ParallelWaveGANPretrainedVocoder(torch.nn.Module): 16 | """Wrapper class to load the vocoder trained with parallel_wavegan repo.""" 17 | 18 | def __init__( 19 | self, 20 | model_file: Union[Path, str], 21 | config_file: Optional[Union[Path, str]] = None, 22 | ): 23 | """Initialize ParallelWaveGANPretrainedVocoder module.""" 24 | super().__init__() 25 | try: 26 | from parallel_wavegan.utils import load_model 27 | except ImportError: 28 | logging.error( 29 | "`parallel_wavegan` is not installed. " 30 | "Please install via `pip install -U parallel_wavegan`." 31 | ) 32 | raise 33 | if config_file is None: 34 | dirname = os.path.dirname(str(model_file)) 35 | config_file = os.path.join(dirname, "config.yml") 36 | with open(config_file) as f: 37 | config = yaml.load(f, Loader=yaml.Loader) 38 | self.fs = config["sampling_rate"] 39 | self.vocoder = load_model(model_file, config) 40 | if hasattr(self.vocoder, "remove_weight_norm"): 41 | self.vocoder.remove_weight_norm() 42 | self.normalize_before = False 43 | if hasattr(self.vocoder, "mean"): 44 | self.normalize_before = True 45 | 46 | @torch.no_grad() 47 | def forward(self, feats: torch.Tensor) -> torch.Tensor: 48 | """Generate waveform with pretrained vocoder. 49 | 50 | Args: 51 | feats (Tensor): Feature tensor (T_feats, #mels). 52 | 53 | Returns: 54 | Tensor: Generated waveform tensor (T_wav). 
55 | 56 | """ 57 | return self.vocoder.inference( 58 | feats, 59 | normalize_before=self.normalize_before, 60 | ).view(-1) 61 | -------------------------------------------------------------------------------- /espnet2/uasr/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/espnet2/uasr/__init__.py -------------------------------------------------------------------------------- /espnet2/uasr/discriminator/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/espnet2/uasr/discriminator/__init__.py -------------------------------------------------------------------------------- /espnet2/uasr/discriminator/abs_discriminator.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | 3 | import torch 4 | 5 | 6 | class AbsDiscriminator(torch.nn.Module, ABC): 7 | @abstractmethod 8 | def forward( 9 | self, 10 | xs_pad: torch.Tensor, 11 | padding_mask: torch.Tensor, 12 | ) -> torch.Tensor: 13 | raise NotImplementedError 14 | -------------------------------------------------------------------------------- /espnet2/uasr/generator/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/espnet2/uasr/generator/__init__.py -------------------------------------------------------------------------------- /espnet2/uasr/generator/abs_generator.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import Optional, Tuple 3 | 4 | import torch 5 | 6 | 7 | class AbsGenerator(torch.nn.Module, ABC): 8 | @abstractmethod 9 | def output_size(self) -> int: 10 | raise NotImplementedError 11 | 12 | @abstractmethod 13 | def forward( 14 | self, 15 | xs_pad: torch.Tensor, 16 | ilens: torch.Tensor, 17 | ) -> Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]: 18 | raise NotImplementedError 19 | -------------------------------------------------------------------------------- /espnet2/uasr/loss/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/espnet2/uasr/loss/__init__.py -------------------------------------------------------------------------------- /espnet2/uasr/loss/abs_loss.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | 3 | import torch 4 | 5 | EPS = torch.finfo(torch.get_default_dtype()).eps 6 | 7 | 8 | class AbsUASRLoss(torch.nn.Module, ABC): 9 | """Base class for all UASR loss modules.""" 10 | 11 | # the name will be the key that appears in the reporter 12 | @property 13 | def name(self) -> str: 14 | raise NotImplementedError 15 | 16 | @abstractmethod 17 | def forward( 18 | self, 19 | ) -> torch.Tensor: 20 | # the returned tensor should have shape (batch,) 21 | raise NotImplementedError 22 | -------------------------------------------------------------------------------- /espnet2/uasr/loss/discriminator_loss.py: -------------------------------------------------------------------------------- 1 | import
torch 2 | import torch.nn.functional as F 3 | from typeguard import check_argument_types 4 | 5 | from espnet2.uasr.loss.abs_loss import AbsUASRLoss 6 | from espnet2.utils.types import str2bool 7 | 8 | 9 | class UASRDiscriminatorLoss(AbsUASRLoss): 10 | """discriminator loss for UASR.""" 11 | 12 | def __init__( 13 | self, 14 | weight: float = 1.0, 15 | smoothing: float = 0.0, 16 | smoothing_one_side: str2bool = False, 17 | reduction: str = "sum", 18 | ): 19 | super().__init__() 20 | assert check_argument_types() 21 | self.weight = weight 22 | self.smoothing = smoothing 23 | self.smoothing_one_sided = smoothing_one_side 24 | self.reduction = reduction 25 | 26 | def forward( 27 | self, 28 | dense_y: torch.Tensor, 29 | token_y: torch.Tensor, 30 | is_discriminative_step: str2bool, 31 | ): 32 | """Forward. 33 | 34 | Args: 35 | dense_y: predicted logits of generated samples 36 | token_y: predicted logits of real samples 37 | """ 38 | if self.weight > 0: 39 | fake_smooth = self.smoothing 40 | real_smooth = self.smoothing 41 | if self.smoothing_one_sided: 42 | fake_smooth = 0 43 | 44 | if is_discriminative_step: 45 | loss_dense = F.binary_cross_entropy_with_logits( 46 | dense_y, 47 | dense_y.new_ones(dense_y.shape) - fake_smooth, 48 | reduction=self.reduction, 49 | ) 50 | loss_token = F.binary_cross_entropy_with_logits( 51 | token_y, 52 | token_y.new_zeros(token_y.shape) + real_smooth, 53 | reduction=self.reduction, 54 | ) 55 | else: 56 | loss_dense = F.binary_cross_entropy_with_logits( 57 | dense_y, 58 | dense_y.new_zeros(dense_y.shape) + fake_smooth, 59 | reduction=self.reduction, 60 | ) 61 | loss_token = None 62 | 63 | return loss_dense, loss_token 64 | else: 65 | return 0 66 | -------------------------------------------------------------------------------- /espnet2/uasr/loss/phoneme_diversity_loss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from typeguard import check_argument_types 3 | 4 | from espnet2.uasr.loss.abs_loss import AbsUASRLoss 5 | from espnet2.utils.types import str2bool 6 | 7 | 8 | class UASRPhonemeDiversityLoss(AbsUASRLoss): 9 | """phoneme diversity loss for UASR.""" 10 | 11 | def __init__( 12 | self, 13 | weight: float = 1.0, 14 | ): 15 | super().__init__() 16 | assert check_argument_types() 17 | 18 | self.weight = weight 19 | 20 | def forward( 21 | self, dense_x: torch.Tensor, sample_size: int, is_discriminative_step: str2bool 22 | ): 23 | """Forward. 
24 | 25 | Args: 26 | dense_x: predicted logits of generated samples 27 | sample_size: batch size 28 | is_discriminative_step: whether the current step trains the discriminator 29 | """ 30 | if self.weight > 0 and not is_discriminative_step: 31 | batch_size, time_length, channel_size = dense_x.shape 32 | 33 | avg_probs = torch.softmax( 34 | dense_x.reshape(-1, channel_size).float(), dim=-1 35 | ).mean(dim=0) 36 | phoneme_ppl = torch.exp( 37 | -torch.sum(avg_probs * torch.log(avg_probs + 1e-7), dim=-1) 38 | ) 39 | phoneme_diversity_loss = ( 40 | (channel_size - phoneme_ppl) / channel_size 41 | ) * sample_size 42 | 43 | return phoneme_diversity_loss 44 | else: 45 | return 0 46 | -------------------------------------------------------------------------------- /espnet2/uasr/loss/pseudo_label_loss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from typeguard import check_argument_types 4 | 5 | from espnet2.uasr.loss.abs_loss import AbsUASRLoss 6 | from espnet2.utils.types import str2bool 7 | 8 | 9 | class UASRPseudoLabelLoss(AbsUASRLoss): 10 | """auxiliary pseudo label loss for UASR.""" 11 | 12 | def __init__( 13 | self, 14 | weight: float = 1.0, 15 | input_dim: int = 128, 16 | output_dim: int = 64, 17 | downsample_rate: int = 2, 18 | ignore_index: int = -1, 19 | reduction: str = "none", 20 | ): 21 | super().__init__() 22 | assert check_argument_types() 23 | 24 | self.weight = weight 25 | self.input_dim = input_dim 26 | self.output_dim = output_dim 27 | self.downsample_rate = downsample_rate 28 | self.ignore_index = ignore_index 29 | self.reduction = reduction 30 | 31 | if self.weight > 0: 32 | self.decoder = torch.nn.Linear(self.input_dim, self.output_dim) 33 | 34 | def forward( 35 | self, 36 | inter_x: torch.Tensor, 37 | pseudo_labels: torch.Tensor, 38 | is_discriminative_step: str2bool, 39 | ): 40 | """Forward. 41 | 42 | Args: 43 | inter_x: intermediate output of the generator 44 | pseudo_labels: frame-level pseudo labels for auxiliary supervision 45 | is_discriminative_step: whether the current step trains the discriminator 46 | """ 47 | if self.weight > 0 and not is_discriminative_step and pseudo_labels is not None: 48 | inter_x = self.decoder(inter_x) 49 | 50 | if self.downsample_rate > 1: 51 | pseudo_labels = pseudo_labels[:, :: self.downsample_rate] 52 | valid_time_length = min(pseudo_labels.shape[1], inter_x.shape[1]) 53 | pseudo_label_loss = F.cross_entropy( 54 | inter_x[:, :valid_time_length].transpose(1, 2), 55 | pseudo_labels[:, :valid_time_length], 56 | ignore_index=self.ignore_index, 57 | reduction=self.reduction, 58 | ) 59 | pseudo_label_loss = pseudo_label_loss.mean() * pseudo_label_loss.shape[0] 60 | 61 | return pseudo_label_loss 62 | else: 63 | return 0 64 | -------------------------------------------------------------------------------- /espnet2/uasr/loss/smoothness_penalty.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from typeguard import check_argument_types 4 | 5 | from espnet2.uasr.loss.abs_loss import AbsUASRLoss 6 | 7 | 8 | class UASRSmoothnessPenalty(AbsUASRLoss): 9 | """smoothness penalty for UASR.""" 10 | 11 | def __init__( 12 | self, 13 | weight: float = 1.0, 14 | reduction: str = "none", 15 | ): 16 | super().__init__() 17 | assert check_argument_types() 18 | 19 | self.weight = weight 20 | self.reduction = reduction 21 | 22 | def forward( 23 | self, 24 | dense_logits: torch.Tensor, 25 | dense_padding_mask: torch.Tensor, 26 | sample_size: int, 27 | is_discriminative_step: bool, 28 | ): 29 | """Forward.
30 | 31 | Args: 32 | dense_logits: output logits of the generator 33 | dense_padding_mask: padding mask of the logits 34 | sample_size: batch size 35 | is_discriminative_step: whether the current step trains the discriminator 36 | """ 37 | if self.weight > 0 and not is_discriminative_step: 38 | smoothness_penalty = F.mse_loss( 39 | dense_logits[:, :-1], dense_logits[:, 1:], reduction=self.reduction 40 | ) 41 | smoothness_penalty[dense_padding_mask[:, 1:]] = 0 42 | smoothness_penalty = smoothness_penalty.mean() * sample_size 43 | 44 | return smoothness_penalty 45 | else: 46 | return 0 47 | -------------------------------------------------------------------------------- /espnet2/uasr/segmenter/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/espnet2/uasr/segmenter/__init__.py -------------------------------------------------------------------------------- /espnet2/uasr/segmenter/abs_segmenter.py: -------------------------------------------------------------------------------- 1 | """ 2 | Segmenter definition for the UASR task 3 | 4 | Practically, the generator output (at frame level) may 5 | predict the same phoneme for consecutive frames, which makes 6 | it too easy for the discriminator. So the segmenter here 7 | merges consecutive frames that get similar predictions from the generator. 8 | """ 9 | 10 | from abc import ABC, abstractmethod 11 | 12 | import torch 13 | 14 | 15 | class AbsSegmenter(torch.nn.Module, ABC): 16 | @abstractmethod 17 | def pre_segment( 18 | self, 19 | xs_pad: torch.Tensor, 20 | ilens: torch.Tensor, 21 | ) -> torch.Tensor: 22 | raise NotImplementedError 23 | 24 | @abstractmethod 25 | def logit_segment( 26 | self, 27 | xs_pad: torch.Tensor, 28 | ilens: torch.Tensor, 29 | ) -> torch.Tensor: 30 | raise NotImplementedError 31 | -------------------------------------------------------------------------------- /espnet2/uasr/segmenter/random_segmenter.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | import torch 4 | from typeguard import check_argument_types 5 | 6 | from espnet2.uasr.segmenter.abs_segmenter import AbsSegmenter 7 | from espnet2.utils.types import str2bool 8 | 9 | 10 | class RandomSegmenter(AbsSegmenter): 11 | def __init__( 12 | self, 13 | subsample_rate: float = 0.25, 14 | mean_pool: str2bool = True, 15 | mean_join_pool: str2bool = False, 16 | remove_zeros: str2bool = False, 17 | ): 18 | super().__init__() 19 | assert check_argument_types() 20 | self.subsample_rate = subsample_rate 21 | 22 | def pre_segment( 23 | self, 24 | xs_pad: torch.Tensor, 25 | padding_mask: torch.Tensor, 26 | ) -> torch.Tensor: 27 | target_num = math.ceil(xs_pad.size(1) * self.subsample_rate) 28 | ones = torch.ones(xs_pad.shape[:-1], device=xs_pad.device) 29 | indices, _ = ones.multinomial(target_num).sort(dim=-1) 30 | indices_ld = indices.unsqueeze(-1).expand(-1, -1, xs_pad.size(-1)) 31 | xs_pad = xs_pad.gather(1, indices_ld) 32 | padding_mask = padding_mask.gather(1, index=indices) 33 | return xs_pad, padding_mask 34 | 35 | def logit_segment( 36 | self, 37 | xs_pad: torch.Tensor, 38 | padding_mask: torch.Tensor, 39 | ) -> torch.Tensor: 40 | return xs_pad, padding_mask 41 | -------------------------------------------------------------------------------- /espnet2/utils/__init__.py: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/espnet2/utils/__init__.py -------------------------------------------------------------------------------- /espnet2/utils/build_dataclass.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import dataclasses 3 | 4 | from typeguard import check_type 5 | 6 | 7 | def build_dataclass(dataclass, args: argparse.Namespace): 8 | """Helper function to build dataclass from 'args'.""" 9 | kwargs = {} 10 | for field in dataclasses.fields(dataclass): 11 | if not hasattr(args, field.name): 12 | raise ValueError( 13 | f"args doesn't have {field.name}. You need to add it to the ArgumentParser." 14 | ) 15 | check_type(field.name, getattr(args, field.name), field.type) 16 | kwargs[field.name] = getattr(args, field.name) 17 | return dataclass(**kwargs) 18 | -------------------------------------------------------------------------------- /espnet2/utils/config_argparse.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from pathlib import Path 3 | 4 | import yaml 5 | 6 | 7 | class ArgumentParser(argparse.ArgumentParser): 8 | """Simple implementation of ArgumentParser supporting a config file 9 | 10 | This class originated from https://github.com/bw2/ConfigArgParse, 11 | but it lacks some of the features that package has, and differs as follows: 12 | 13 | - Multiple config files are not supported 14 | - "--config" is added automatically as an option. 15 | - No formats other than yaml are supported 16 | - Argument types are not checked 17 | 18 | """ 19 | 20 | def __init__(self, *args, **kwargs): 21 | super().__init__(*args, **kwargs) 22 | self.add_argument("--config", help="Give config file in yaml format") 23 | 24 | def parse_known_args(self, args=None, namespace=None): 25 | # Parse once beforehand to pick up "--config" 26 | _args, _ = super().parse_known_args(args, namespace) 27 | if _args.config is not None: 28 | if not Path(_args.config).exists(): 29 | self.error(f"No such file: {_args.config}") 30 | 31 | with open(_args.config, "r", encoding="utf-8") as f: 32 | d = yaml.safe_load(f) 33 | if not isinstance(d, dict): 34 | self.error(f"Config file has non-dict value: {_args.config}") 35 | 36 | for key in d: 37 | for action in self._actions: 38 | if key == action.dest: 39 | break 40 | else: 41 | self.error(f"unrecognized arguments: {key} (from {_args.config})") 42 | 43 | # NOTE(kamo): Ignore "--config" from a config file 44 | # NOTE(kamo): Unlike "configargparse", this module doesn't check types, 45 | # i.e. any value can be set regardless of the declared argument type. 46 | self.set_defaults(**d) 47 | return super().parse_known_args(args, namespace) 48 | -------------------------------------------------------------------------------- /espnet2/utils/get_default_kwargs.py: -------------------------------------------------------------------------------- 1 | import inspect 2 | 3 | 4 | class Invalid: 5 | """Marker object for a non-serializable object""" 6 | 7 | 8 | def get_default_kwargs(func): 9 | """Get the default values of the input function.
--------------------------------------------------------------------------------
/espnet2/utils/get_default_kwargs.py:
--------------------------------------------------------------------------------
 1 | import inspect
 2 | 
 3 | 
 4 | class Invalid:
 5 |     """Marker object for a non-YAML-serializable value"""
 6 | 
 7 | 
 8 | def get_default_kwargs(func):
 9 |     """Get the default values of the input function.
10 | 
11 |     Examples:
12 |         >>> def func(a, b=3):  pass
13 |         >>> get_default_kwargs(func)
14 |         {'b': 3}
15 | 
16 |     """
17 | 
18 |     def yaml_serializable(value):
19 |         # isinstance(x, tuple) includes namedtuple, so type is used here
20 |         if type(value) is tuple:
21 |             return yaml_serializable(list(value))
22 |         elif isinstance(value, set):
23 |             return yaml_serializable(list(value))
24 |         elif isinstance(value, dict):
25 |             if not all(isinstance(k, str) for k in value):
26 |                 return Invalid
27 |             retval = {}
28 |             for k, v in value.items():
29 |                 v2 = yaml_serializable(v)
30 |                 # Register only valid objects
31 |                 if v2 not in (Invalid, inspect.Parameter.empty):
32 |                     retval[k] = v2
33 |             return retval
34 |         elif isinstance(value, list):
35 |             retval = []
36 |             for v in value:
37 |                 v2 = yaml_serializable(v)
38 |                 # If any element in the list is invalid,
39 |                 # the whole list becomes invalid
40 |                 if v2 is Invalid:
41 |                     return Invalid
42 |                 else:
43 |                     retval.append(v2)
44 |             return retval
45 |         elif value in (inspect.Parameter.empty, None):
46 |             return value
47 |         elif isinstance(value, (float, int, complex, bool, str, bytes)):
48 |             return value
49 |         else:
50 |             return Invalid
51 | 
52 |     # params: An ordered mapping of inspect.Parameter
53 |     params = inspect.signature(func).parameters
54 |     data = {p.name: p.default for p in params.values()}
55 |     # Remove objects that are not YAML-serializable
56 |     data = yaml_serializable(data)
57 |     return data
--------------------------------------------------------------------------------
/espnet2/utils/kwargs2args.py:
--------------------------------------------------------------------------------
 1 | import inspect
 2 | 
 3 | 
 4 | def func(a: int, b, *, c, **kwargs):
 5 |     # NOTE: dummy function; not used by kwargs2args below.
 6 |     pass
 7 | 
 8 | 
 9 | def kwargs2args(func, kwargs):
10 |     """Convert a kwargs dict to a positional-argument tuple for 'func'.
11 | 
12 |     The tuple is ordered by the signature of 'func' and truncated at the
13 |     first parameter that 'kwargs' does not provide.
14 |     """
15 |     parameters = inspect.signature(func).parameters
16 |     d = {k: i for i, k in enumerate(parameters)}
17 |     args = [None for i in range(len(parameters))]
18 |     for k, v in kwargs.items():
19 |         if k in d:
20 |             args[d[k]] = v
21 | 
22 |     for i, v in enumerate(args):
23 |         if v is None:
24 |             break
25 |     else:
26 |         # No gap found: every parameter was provided, so keep them all.
27 |         i = len(args)
28 | 
29 |     return tuple(args[:i])
--------------------------------------------------------------------------------
/espnet2/utils/yaml_no_alias_safe_dump.py:
--------------------------------------------------------------------------------
 1 | import yaml
 2 | 
 3 | 
 4 | class NoAliasSafeDumper(yaml.SafeDumper):
 5 |     # Disable anchors/aliases in the YAML output because they are hard to read
 6 |     def ignore_aliases(self, data):
 7 |         return True
 8 | 
 9 | 
10 | def yaml_no_alias_safe_dump(data, stream=None, **kwargs):
11 |     """Safe-dump in yaml with no anchor/alias"""
12 |     return yaml.dump(
13 |         data, stream, allow_unicode=True, Dumper=NoAliasSafeDumper, **kwargs
14 |     )
15 | 
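A small sanity check tying the three helpers above together (a sketch; the function f and its default values are ours):

    from espnet2.utils.get_default_kwargs import get_default_kwargs
    from espnet2.utils.kwargs2args import kwargs2args
    from espnet2.utils.yaml_no_alias_safe_dump import yaml_no_alias_safe_dump

    def f(a, b=3, c=(1, 2), d=print):
        pass

    defaults = get_default_kwargs(f)
    # 'a' has no default and d=print is not YAML-serializable, so both are
    # dropped; the tuple default is converted to a list.
    print(defaults)                            # {'b': 3, 'c': [1, 2]}
    print(yaml_no_alias_safe_dump(defaults))   # plain YAML without anchors/aliases

    # kwargs2args stops at the first parameter that the kwargs do not cover.
    print(kwargs2args(f, {"a": 1, "b": 2}))    # (1, 2)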
--------------------------------------------------------------------------------
/pretrained/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/litagin02/vits-japros-webui/3a05a613c617e98388e66921d37a88004476b81e/pretrained/.gitkeep
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | # espnet==202308
 2 | espnet_tts_frontend
 3 | faster-whisper==0.9.0
 4 | gradio==3.44.4
 5 | h5py
 6 | jaconv==0.3.4
 7 | jamo==0.4.1
 8 | kaldiio==2.18.0
 9 | librosa==0.10.1
10 | # pyopenjtalk==0.3.2
11 | pyopenjtalk-prebuilt==0.3.0
12 | pyworld==0.3.4
13 | sentencepiece==0.1.99
14 | tensorboard==2.14
15 | torch_complex
16 | typeguard==2.13.3
--------------------------------------------------------------------------------
/setup.bat:
--------------------------------------------------------------------------------
 1 | @echo off
 2 | 
 3 | echo Creating virtual environment...
 4 | python -m venv venv
 5 | 
 6 | if errorlevel 1 (
 7 |     echo Error: Failed to create virtual environment.
 8 |     exit /b
 9 | )
10 | 
11 | echo Installing torch...
12 | venv\Scripts\pip3 install torch torchaudio --index-url https://download.pytorch.org/whl/cu118
13 | 
14 | if errorlevel 1 (
15 |     echo Error: Failed to install torch and torchaudio.
16 |     exit /b
17 | )
18 | 
19 | echo Installing packages from requirements.txt...
20 | venv\Scripts\pip install -r requirements.txt
21 | 
22 | if errorlevel 1 (
23 |     echo Error: Failed to install packages.
24 |     exit /b
25 | )
26 | 
27 | echo Downloading pretrained model...
28 | if not exist pretrained mkdir pretrained
29 | curl -L "https://huggingface.co/litagin/vits-japros-pretrained/resolve/main/pretrained.pth" -o "pretrained\pretrained.pth"
30 | 
31 | if errorlevel 1 (
32 |     echo Error: Failed to download pretrained model.
33 |     exit /b
34 | )
35 | 
36 | if not exist "weights\pretrained\" mkdir "weights\pretrained\"
37 | 
38 | if not exist "weights\pretrained\pretrained.pth" (
39 |     echo Copying pretrained model to weights/pretrained/...
40 |     copy "pretrained\pretrained.pth" "weights\pretrained\pretrained.pth"
41 | )
42 | 
43 | echo Setup complete.
44 | 
45 | pause
--------------------------------------------------------------------------------
/train.py:
--------------------------------------------------------------------------------
 1 | import argparse
 2 | import subprocess
 3 | import sys
 4 | 
 5 | python = sys.executable
 6 | 
 7 | 
 8 | def run_train(
 9 |     model_name: str,
10 |     max_epoch: int = 200,
11 |     batch_bins: int = 1000000,
12 |     output_dir: str = "outputs",
13 | ) -> str:
14 |     cmd = [python, "-m", "espnet2.bin.gan_tts_train"]
15 |     from conf.train_args import train_args
16 | 
17 |     # Format a copy rather than mutating the imported list in place, so that
18 |     # repeated calls (e.g. from the webui) always start from the templates.
19 |     formatted_args = [
20 |         arg.format(model_name=model_name, output_dir=output_dir) for arg in train_args
21 |     ]
22 | 
23 |     cmd.extend(formatted_args)
24 |     cmd.extend(["--batch_bins", str(batch_bins)])
25 |     cmd.extend(["--max_epoch", str(max_epoch)])
26 | 
27 |     print(" ".join(cmd))
28 |     print("Submitted to subprocess.")
29 |     subprocess.Popen(cmd, stdout=sys.stdout, stderr=sys.stdout)
30 | 
31 |     return "Training has started. See the terminal and TensorBoard for details."
32 | 
33 | 
34 | if __name__ == "__main__":
35 |     parser = argparse.ArgumentParser()
36 |     parser.add_argument("--model-name", type=str, required=True)
37 |     parser.add_argument("--max-epoch", type=int, default=200)
38 |     parser.add_argument("--batch-bins", type=int, default=1000000)
39 |     parser.add_argument("--output-dir", type=str, default="outputs")
40 |     args = parser.parse_args()
41 | 
42 |     run_train(args.model_name, args.max_epoch, args.batch_bins, args.output_dir)
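For reference, train.py can also be invoked directly from a terminal; the model name below is a placeholder and the remaining values simply restate the defaults:

    python train.py --model-name my_model --max-epoch 200 --batch-bins 1000000 --output-dir outputs

Note that run_train() only submits the training subprocess and returns immediately; progress appears in the terminal and in TensorBoard.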
--------------------------------------------------------------------------------
/update.bat:
--------------------------------------------------------------------------------
 1 | @echo off
 2 | echo Updating the repository...
 3 | 
 4 | cd /d %~dp0
 5 | 
 6 | git pull
 7 | 
 8 | venv\Scripts\pip install -r requirements.txt
 9 | 
10 | echo Update complete.
11 | pause
12 | 
--------------------------------------------------------------------------------
/webui_infer.bat:
--------------------------------------------------------------------------------
 1 | @echo off
 2 | 
 3 | echo Running webui_infer.py...
 4 | venv\Scripts\python webui_infer.py
 5 | 
 6 | if errorlevel 1 (
 7 |     echo Error: Failed to run webui_infer.py.
 8 |     pause
 9 |     exit /b
10 | )
11 | 
12 | pause
13 | 
--------------------------------------------------------------------------------
/webui_train.bat:
--------------------------------------------------------------------------------
 1 | @echo off
 2 | 
 3 | echo Running webui_train.py...
 4 | venv\Scripts\python webui_train.py
 5 | 
 6 | if errorlevel 1 (
 7 |     echo Error: Failed to run webui_train.py.
 8 |     pause
 9 |     exit /b
10 | )
11 | 
12 | pause
13 | 
--------------------------------------------------------------------------------
/weights/.gitignore:
--------------------------------------------------------------------------------
1 | *
2 | !.gitignore
--------------------------------------------------------------------------------