├── LICENSE ├── README.md ├── conftest.py ├── docs ├── Makefile ├── README.md ├── _apidoc_templates │ ├── module.rst │ └── package.rst ├── conf.py ├── contributing.md ├── docs-requirements.txt ├── experiment.md ├── images │ └── logo_noname_rounded_big.png ├── index.rst ├── installation.md ├── multigpu.md └── tutorials.md ├── lint-requirements.txt ├── pip-wheel-metadata └── speechbrain.dist-info │ ├── LICENSE │ ├── METADATA │ └── top_level.txt ├── pyproject.toml ├── pytest.ini ├── recipes └── LibriMix │ ├── .DS_Store │ ├── __pycache__ │ └── prepare_data.cpython-38.pyc │ ├── extra-dependencies.txt │ ├── meta │ ├── __pycache__ │ │ └── preprocess_dynamic_mixing.cpython-38.pyc │ └── preprocess_dynamic_mixing.py │ ├── prepare_data.py │ └── separation │ ├── .DS_Store │ ├── __pycache__ │ └── dynamic_mixing.cpython-38.pyc │ ├── csv_wham │ ├── libri2mix_dev.csv │ ├── libri2mix_test.csv │ ├── libri2mix_train-360.csv │ ├── libri3mix_dev.csv │ ├── libri3mix_test.csv │ └── libri3mix_train-360.csv │ ├── dynamic_mixing.py │ ├── hparams │ ├── .DS_Store │ ├── dprnn-libri2mix-unified-gm.yaml │ └── sepformer-libri2mix-unified-gm.yaml │ ├── test_dprnn_libri2mix_unified_gm.sh │ ├── test_sepformer_libri2mix_unified_gm.sh │ ├── train.py │ ├── train_dprnn_libri2mix_unified_gm.sh │ ├── train_sepformer_libri2mix_unified_gm.sh │ ├── train_unified.py │ └── train_unified_gm.py ├── requirements.txt ├── samples ├── audio_samples │ ├── csv_example.csv │ ├── csv_example2.csv │ ├── csv_example3.csv │ ├── csv_example_multichannel.csv │ ├── example1.wav │ ├── example2.flac │ ├── example3.sph │ ├── example4.raw │ ├── example5.wav │ ├── example6.wav │ ├── example_fr.wav │ ├── example_multichannel.wav │ ├── example_noisy.wav │ ├── multi_mic │ │ ├── noise_0.70225_-0.70225_0.11704.flac │ │ ├── noise_diffuse.flac │ │ ├── speech_-0.82918_0.55279_-0.082918.flac │ │ └── speech_-0.98894_0_0.14834.flac │ ├── nn_training_samples │ │ ├── debug.csv │ │ ├── dev.csv │ │ ├── dev.json │ │ ├── spk1_snt1.pkl │ │ ├── spk1_snt1.wav │ │ ├── spk1_snt2.pkl │ │ ├── spk1_snt2.wav │ │ ├── spk1_snt3.pkl │ │ ├── spk1_snt3.wav │ │ ├── spk1_snt4.pkl │ │ ├── spk1_snt4.wav │ │ ├── spk1_snt5.pkl │ │ ├── spk1_snt5.wav │ │ ├── spk1_snt6.pkl │ │ ├── spk1_snt6.wav │ │ ├── spk2_snt1.pkl │ │ ├── spk2_snt1.wav │ │ ├── spk2_snt2.pkl │ │ ├── spk2_snt2.wav │ │ ├── spk2_snt3.pkl │ │ ├── spk2_snt3.wav │ │ ├── spk2_snt4.pkl │ │ ├── spk2_snt4.wav │ │ ├── spk2_snt5.pkl │ │ ├── spk2_snt5.wav │ │ ├── spk2_snt6.pkl │ │ ├── spk2_snt6.wav │ │ ├── test.csv │ │ ├── train.csv │ │ └── train.json │ ├── sourcesep_samples │ │ ├── csv_example_sourcesep_mixture.csv │ │ ├── csv_example_sourcesep_source1.csv │ │ ├── csv_example_sourcesep_source2.csv │ │ ├── minimal_example_convtasnet_cv.csv │ │ ├── minimal_example_convtasnet_tr.csv │ │ ├── minimal_example_convtasnet_tt.csv │ │ ├── mixture_0.wav │ │ ├── mixture_1.wav │ │ ├── mixture_2.wav │ │ ├── mixture_3.wav │ │ ├── source1_0.wav │ │ ├── source1_1.wav │ │ ├── source1_2.wav │ │ ├── source1_3.wav │ │ ├── source2_0.wav │ │ ├── source2_1.wav │ │ ├── source2_2.wav │ │ └── source2_3.wav │ ├── test_csv_merge.csv │ ├── test_mixture.wav │ └── vad │ │ ├── train.json │ │ ├── train.wav │ │ ├── valid.json │ │ └── valid.wav ├── label_samples │ ├── hyp.csv │ └── ref.csv ├── noise_samples │ ├── noise.csv │ ├── noise1.wav │ ├── noise2.wav │ ├── noise3.wav │ ├── noise4.wav │ ├── noise5.wav │ ├── noise_multichannel.csv │ ├── noise_multichannel.wav │ └── noise_rel.csv ├── plda_xvect_samples │ ├── enrol_stat_xvect.pkl │ ├── expected_plda_scores.pkl │ ├── 
test_stat_xvect.pkl │ └── train_stat_xvect.pkl ├── rir_samples │ ├── rir1.wav │ ├── rir2.wav │ ├── rir3.wav │ ├── rir4.wav │ ├── rir_multichannel.csv │ ├── rir_multichannel.wav │ ├── rirs.csv │ └── rirs_rel.csv ├── rttm_samples │ ├── ReadMe.md │ ├── ref_rttm │ │ └── ES2014c.rttm │ └── sys_rttm │ │ └── ES2014c.rttm ├── text_samples │ ├── hdf5_example.h5 │ ├── label_dict.pkl │ └── readme.txt └── voxceleb_samples │ ├── meta │ └── iden_split.txt │ ├── readme.txt │ └── wav │ ├── dev.csv │ ├── id10001 │ └── 1zcIwhmdeo4 │ │ ├── 00001.wav │ │ ├── 00002.wav │ │ └── 00003.wav │ ├── id10002 │ └── xTV-jFAUKcw │ │ ├── 00001.wav │ │ ├── 00002.wav │ │ └── 00003.wav │ └── train.csv ├── setup.py ├── speechbrain.egg-info ├── PKG-INFO ├── SOURCES.txt ├── dependency_links.txt ├── requires.txt └── top_level.txt ├── speechbrain ├── __init__.py ├── __pycache__ │ ├── __init__.cpython-37.pyc │ ├── __init__.cpython-38.pyc │ ├── core.cpython-37.pyc │ └── core.cpython-38.pyc ├── alignment │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-37.pyc │ │ └── __init__.cpython-38.pyc │ ├── aligner.py │ └── ctc_segmentation.py ├── core.py ├── dataio │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-37.pyc │ │ ├── __init__.cpython-38.pyc │ │ ├── batch.cpython-37.pyc │ │ ├── batch.cpython-38.pyc │ │ ├── dataio.cpython-37.pyc │ │ ├── dataio.cpython-38.pyc │ │ ├── dataloader.cpython-37.pyc │ │ ├── dataloader.cpython-38.pyc │ │ ├── dataset.cpython-37.pyc │ │ ├── dataset.cpython-38.pyc │ │ ├── encoder.cpython-37.pyc │ │ ├── encoder.cpython-38.pyc │ │ ├── iterators.cpython-37.pyc │ │ ├── iterators.cpython-38.pyc │ │ ├── legacy.cpython-37.pyc │ │ ├── legacy.cpython-38.pyc │ │ ├── preprocess.cpython-37.pyc │ │ ├── preprocess.cpython-38.pyc │ │ ├── sampler.cpython-37.pyc │ │ ├── sampler.cpython-38.pyc │ │ ├── wer.cpython-37.pyc │ │ └── wer.cpython-38.pyc │ ├── batch.py │ ├── dataio.py │ ├── dataloader.py │ ├── dataset.py │ ├── encoder.py │ ├── iterators.py │ ├── legacy.py │ ├── preprocess.py │ ├── sampler.py │ └── wer.py ├── decoders │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-37.pyc │ │ ├── __init__.cpython-38.pyc │ │ ├── ctc.cpython-37.pyc │ │ ├── ctc.cpython-38.pyc │ │ ├── seq2seq.cpython-37.pyc │ │ └── seq2seq.cpython-38.pyc │ ├── ctc.py │ ├── seq2seq.py │ └── transducer.py ├── lm │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-37.pyc │ │ └── __init__.cpython-38.pyc │ ├── arpa.py │ ├── counting.py │ └── ngram.py ├── lobes │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-37.pyc │ │ ├── __init__.cpython-38.pyc │ │ └── augment.cpython-38.pyc │ ├── augment.py │ ├── beamform_multimic.py │ ├── features.py │ └── models │ │ ├── CRDNN.py │ │ ├── ContextNet.py │ │ ├── ECAPA_TDNN.py │ │ ├── ESPnetVGG.py │ │ ├── MetricGAN.py │ │ ├── RNNLM.py │ │ ├── VanillaNN.py │ │ ├── Xvector.py │ │ ├── __init__.py │ │ ├── __pycache__ │ │ ├── __init__.cpython-37.pyc │ │ ├── __init__.cpython-38.pyc │ │ ├── conv_tasnet.cpython-38.pyc │ │ ├── dual_path.cpython-37.pyc │ │ ├── dual_path.cpython-38.pyc │ │ ├── dual_path2.cpython-38.pyc │ │ ├── dual_path_context.cpython-38.pyc │ │ ├── dual_path_conv.cpython-38.pyc │ │ ├── dual_path_conv2.cpython-38.pyc │ │ ├── dual_path_multi_scale.cpython-38.pyc │ │ ├── dual_path_splitnet.cpython-38.pyc │ │ ├── dual_path_splitnet_exchange.cpython-38.pyc │ │ ├── galr.cpython-38.pyc │ │ ├── norms.cpython-38.pyc │ │ ├── torch_utils.cpython-38.pyc │ │ ├── u_net.cpython-38.pyc │ │ └── unet.cpython-38.pyc │ │ ├── conv_tasnet.py │ │ ├── convolution.py │ │ ├── dual_path.py │ 
│ ├── fairseq_wav2vec.py │ │ ├── galr.py │ │ ├── huggingface_wav2vec.py │ │ ├── norms.py │ │ ├── segan_model.py │ │ ├── torch_utils.py │ │ ├── transformer │ │ ├── Conformer.py │ │ ├── Transformer.py │ │ ├── TransformerASR.py │ │ ├── TransformerLM.py │ │ ├── TransformerSE.py │ │ ├── TransformerST.py │ │ ├── Transformer_GALR.py │ │ ├── Transformer_old.py │ │ ├── __init__.py │ │ └── __pycache__ │ │ │ ├── Conformer.cpython-37.pyc │ │ │ ├── Conformer.cpython-38.pyc │ │ │ ├── Transformer.cpython-37.pyc │ │ │ ├── Transformer.cpython-38.pyc │ │ │ ├── Transformer_GALR.cpython-38.pyc │ │ │ ├── __init__.cpython-37.pyc │ │ │ └── __init__.cpython-38.pyc │ │ └── unet.py ├── log-config.yaml ├── nnet │ ├── CNN.py │ ├── RNN.py │ ├── __init__.py │ ├── __pycache__ │ │ ├── CNN.cpython-37.pyc │ │ ├── CNN.cpython-38.pyc │ │ ├── RNN.cpython-37.pyc │ │ ├── RNN.cpython-38.pyc │ │ ├── __init__.cpython-37.pyc │ │ ├── __init__.cpython-38.pyc │ │ ├── activations.cpython-37.pyc │ │ ├── activations.cpython-38.pyc │ │ ├── attention.cpython-37.pyc │ │ ├── attention.cpython-38.pyc │ │ ├── containers.cpython-37.pyc │ │ ├── containers.cpython-38.pyc │ │ ├── dropout.cpython-37.pyc │ │ ├── dropout.cpython-38.pyc │ │ ├── embedding.cpython-37.pyc │ │ ├── embedding.cpython-38.pyc │ │ ├── linear.cpython-37.pyc │ │ ├── linear.cpython-38.pyc │ │ ├── losses.cpython-37.pyc │ │ ├── losses.cpython-38.pyc │ │ ├── normalization.cpython-37.pyc │ │ ├── normalization.cpython-38.pyc │ │ ├── pooling.cpython-37.pyc │ │ ├── pooling.cpython-38.pyc │ │ ├── schedulers.cpython-37.pyc │ │ └── schedulers.cpython-38.pyc │ ├── activations.py │ ├── attention.py │ ├── complex_networks │ │ ├── __init__.py │ │ ├── c_CNN.py │ │ ├── c_RNN.py │ │ ├── c_linear.py │ │ ├── c_normalization.py │ │ └── c_ops.py │ ├── containers.py │ ├── dropout.py │ ├── embedding.py │ ├── linear.py │ ├── loss │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-37.pyc │ │ │ ├── __init__.cpython-38.pyc │ │ │ ├── stoi_loss.cpython-37.pyc │ │ │ └── stoi_loss.cpython-38.pyc │ │ ├── guidedattn_loss.py │ │ ├── stoi_loss.py │ │ └── transducer_loss.py │ ├── losses.py │ ├── normalization.py │ ├── pooling.py │ ├── quaternion_networks │ │ ├── __init__.py │ │ ├── q_CNN.py │ │ ├── q_RNN.py │ │ ├── q_linear.py │ │ ├── q_normalization.py │ │ └── q_ops.py │ ├── schedulers.py │ └── transducer │ │ ├── __init__.py │ │ └── transducer_joint.py ├── pretrained │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-37.pyc │ │ ├── __init__.cpython-38.pyc │ │ ├── fetching.cpython-37.pyc │ │ ├── fetching.cpython-38.pyc │ │ ├── interfaces.cpython-37.pyc │ │ └── interfaces.cpython-38.pyc │ ├── fetching.py │ └── interfaces.py ├── processing │ ├── NMF.py │ ├── PLDA_LDA.py │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-37.pyc │ │ ├── __init__.cpython-38.pyc │ │ ├── signal_processing.cpython-37.pyc │ │ ├── signal_processing.cpython-38.pyc │ │ ├── speech_augmentation.cpython-37.pyc │ │ └── speech_augmentation.cpython-38.pyc │ ├── decomposition.py │ ├── diarization.py │ ├── features.py │ ├── multi_mic.py │ ├── signal_processing.py │ └── speech_augmentation.py ├── tokenizers │ ├── SentencePiece.py │ ├── __init__.py │ └── __pycache__ │ │ ├── __init__.cpython-37.pyc │ │ └── __init__.cpython-38.pyc ├── utils │ ├── Accuracy.py │ ├── DER.py │ ├── __init__.py │ ├── __pycache__ │ │ ├── Accuracy.cpython-37.pyc │ │ ├── Accuracy.cpython-38.pyc │ │ ├── DER.cpython-37.pyc │ │ ├── DER.cpython-38.pyc │ │ ├── __init__.cpython-37.pyc │ │ ├── __init__.cpython-38.pyc │ │ ├── bleu.cpython-37.pyc │ │ ├── 
bleu.cpython-38.pyc │ │ ├── callchains.cpython-37.pyc │ │ ├── callchains.cpython-38.pyc │ │ ├── checkpoints.cpython-37.pyc │ │ ├── checkpoints.cpython-38.pyc │ │ ├── data_pipeline.cpython-37.pyc │ │ ├── data_pipeline.cpython-38.pyc │ │ ├── data_utils.cpython-37.pyc │ │ ├── data_utils.cpython-38.pyc │ │ ├── depgraph.cpython-37.pyc │ │ ├── depgraph.cpython-38.pyc │ │ ├── distributed.cpython-37.pyc │ │ ├── distributed.cpython-38.pyc │ │ ├── edit_distance.cpython-37.pyc │ │ ├── edit_distance.cpython-38.pyc │ │ ├── epoch_loop.cpython-37.pyc │ │ ├── epoch_loop.cpython-38.pyc │ │ ├── logger.cpython-37.pyc │ │ ├── logger.cpython-38.pyc │ │ ├── metric_stats.cpython-37.pyc │ │ ├── metric_stats.cpython-38.pyc │ │ ├── parameter_transfer.cpython-37.pyc │ │ ├── parameter_transfer.cpython-38.pyc │ │ ├── superpowers.cpython-37.pyc │ │ ├── superpowers.cpython-38.pyc │ │ ├── torch_audio_backend.cpython-37.pyc │ │ ├── torch_audio_backend.cpython-38.pyc │ │ ├── train_logger.cpython-37.pyc │ │ └── train_logger.cpython-38.pyc │ ├── bleu.py │ ├── callchains.py │ ├── checkpoints.py │ ├── data_pipeline.py │ ├── data_utils.py │ ├── depgraph.py │ ├── distributed.py │ ├── edit_distance.py │ ├── epoch_loop.py │ ├── logger.py │ ├── metric_stats.py │ ├── parameter_transfer.py │ ├── superpowers.py │ ├── torch_audio_backend.py │ └── train_logger.py └── version.txt ├── templates ├── README.md ├── enhancement │ ├── README.md │ ├── custom_model.py │ ├── mini_librispeech_prepare.py │ ├── train.py │ └── train.yaml ├── speaker_id │ ├── README.md │ ├── custom_model.py │ ├── mini_librispeech_prepare.py │ ├── train.py │ └── train.yaml └── speech_recognition │ ├── ASR │ ├── README.md │ ├── mini_librispeech_prepare.py │ ├── train.py │ └── train.yaml │ ├── LM │ ├── README.md │ ├── RNNLM.yaml │ ├── custom_model.py │ ├── data │ │ ├── test.txt │ │ ├── train.txt │ │ └── valid.txt │ ├── extra_requirements.txt │ └── train.py │ ├── README.md │ ├── Tokenizer │ ├── README.md │ ├── mini_librispeech_prepare.py │ ├── tokenizer.yaml │ └── train.py │ └── mini_librispeech_prepare.py ├── tests ├── .run-doctests.sh ├── .run-linters.sh ├── .run-recipe-tests.sh ├── .run-unittests.sh ├── integration │ ├── neural_networks │ │ ├── ASR_CTC │ │ │ ├── example_asr_ctc_experiment.py │ │ │ ├── example_asr_ctc_experiment_complex_net.py │ │ │ ├── example_asr_ctc_experiment_quaternion_net.py │ │ │ ├── hyperparams.yaml │ │ │ ├── hyperparams_complex_net.yaml │ │ │ └── hyperparams_quaternion_net.yaml │ │ ├── ASR_DNN_HMM │ │ │ ├── example_asr_dnn_hmm_experiment.py │ │ │ └── hyperparams.yaml │ │ ├── ASR_Transducer │ │ │ ├── example_asr_transducer_experiment.py │ │ │ └── hyperparams.yaml │ │ ├── ASR_alignment_forward │ │ │ ├── example_asr_alignment_forward_experiment.py │ │ │ └── hyperparams.yaml │ │ ├── ASR_alignment_viterbi │ │ │ ├── example_asr_alignment_viterbi_experiment.py │ │ │ └── hyperparams.yaml │ │ ├── ASR_seq2seq │ │ │ ├── example_asr_seq2seq_experiment.py │ │ │ └── hyperparams.yaml │ │ ├── G2P │ │ │ ├── example_g2p.py │ │ │ └── hyperparams.yaml │ │ ├── LM_RNN │ │ │ ├── example_lm_rnn_experiment.py │ │ │ └── hyperparams.yaml │ │ ├── VAD │ │ │ ├── example_vad.py │ │ │ └── hyperparams.yaml │ │ ├── autoencoder │ │ │ ├── example_auto_experiment.py │ │ │ └── hyperparams.yaml │ │ ├── enhance_GAN │ │ │ ├── example_enhance_gan_experiment.py │ │ │ ├── hyperparams.yaml │ │ │ └── models.yaml │ │ ├── separation │ │ │ ├── example_conv_tasnet.py │ │ │ └── hyperparams.yaml │ │ └── speaker_id │ │ │ ├── example_xvector_experiment.py │ │ │ └── hyperparams.yaml │ └── 
signal_processing │ │ ├── PLDA_xvector │ │ └── example_plda_experiment.py │ │ ├── example_add_babble.py │ │ ├── example_add_noise.py │ │ ├── example_add_reverb.py │ │ ├── example_do_clip.py │ │ ├── example_drop_chunk.py │ │ ├── example_drop_freq.py │ │ ├── example_speed_perturb.py │ │ ├── expected │ │ ├── add_babble │ │ │ └── save │ │ │ │ └── example1.flac │ │ ├── add_noise │ │ │ └── save │ │ │ │ └── example1.flac │ │ ├── add_reverb │ │ │ └── save │ │ │ │ └── example1.flac │ │ ├── do_clip │ │ │ └── save │ │ │ │ └── example1.flac │ │ ├── drop_chunk │ │ │ └── save │ │ │ │ └── example1.flac │ │ ├── drop_freq │ │ │ └── save │ │ │ │ └── example1.flac │ │ └── speed_perturb │ │ │ └── save │ │ │ └── example1.flac │ │ ├── hyperparams.yaml │ │ └── nmf_sourcesep │ │ ├── example_experiment.py │ │ └── hyperparams.yaml └── unittests │ ├── test_CNN.py │ ├── test_RNN.py │ ├── test_activations.py │ ├── test_arpa.py │ ├── test_attention.py │ ├── test_augment.py │ ├── test_batching.py │ ├── test_callchains.py │ ├── test_categorical_encoder.py │ ├── test_checkpoints.py │ ├── test_core.py │ ├── test_counting.py │ ├── test_ctc_segmentation.py │ ├── test_data_io.py │ ├── test_data_pipeline.py │ ├── test_dataloader.py │ ├── test_dataset.py │ ├── test_dependency_graph.py │ ├── test_dropout.py │ ├── test_edit_distance.py │ ├── test_embedding.py │ ├── test_epoch_loop.py │ ├── test_features.py │ ├── test_linear.py │ ├── test_losses.py │ ├── test_metrics.py │ ├── test_multi_mic.py │ ├── test_ngram_lm.py │ ├── test_normalization.py │ ├── test_pooling.py │ ├── test_pretrainer.py │ ├── test_samplers.py │ ├── test_schedulers.py │ ├── test_signal_processing.py │ ├── test_superpowers.py │ ├── test_tokenizer.py │ └── tokenizer_data │ └── dev-clean.csv └── tools ├── compute_wer.py └── der_eval └── md-eval.pl /README.md: -------------------------------------------------------------------------------- 1 | # Unifying Speech Enhancement and Separation 2 | 3 | This is the code implementation for paper [Unifying Speech Enhancement and Separation with Gradient Modulation for End-to-End Noise-Robust Speech Separation](https://arxiv.org/abs/2302.11131) that is built based on [SpeechBrain](https://github.com/speechbrain/speechbrain) toolkit. 4 | -------------------------------------------------------------------------------- /conftest.py: -------------------------------------------------------------------------------- 1 | collect_ignore = ["setup.py"] 2 | try: 3 | import numba # noqa: F401 4 | except ModuleNotFoundError: 5 | collect_ignore.append("speechbrain/nnet/loss/transducer_loss.py") 6 | try: 7 | import fairseq # noqa: F401 8 | except ModuleNotFoundError: 9 | collect_ignore.append("speechbrain/lobes/models/fairseq_wav2vec.py") 10 | try: 11 | from transformers import Wav2Vec2Model # noqa: F401 12 | except ModuleNotFoundError: 13 | collect_ignore.append("speechbrain/lobes/models/huggingface_wav2vec.py") 14 | try: 15 | import sacrebleu # noqa: F401 16 | except ModuleNotFoundError: 17 | collect_ignore.append("speechbrain/utils/bleu.py") 18 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = . 
9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | clean: 18 | rm -rf build 19 | rm -rf API 20 | @echo "You may also want to remove files not under version control." 21 | @echo "First run" 22 | @echo " git clean -n -d source" 23 | @echo "to see what would be deleted" 24 | @echo "Then if you're happy run" 25 | @echo " git clean -f -d source" 26 | @echo "This can help to clean out api-doc generated .rst files etc." 27 | 28 | # Catch-all target: route all unknown targets to Sphinx using the new 29 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 30 | %: Makefile 31 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 32 | -------------------------------------------------------------------------------- /docs/README.md: -------------------------------------------------------------------------------- 1 | # SpeechBrain documentation 2 | 3 | Please install additional dependencies: 4 | 5 | ``` 6 | pip install -r docs-requirements.txt 7 | ``` 8 | 9 | Then run: 10 | ``` 11 | make html 12 | ``` 13 | to build HTML documentation. Then open `build/html/index.html` 14 | 15 | ## Automatic API documentation from docstrings 16 | 17 | The documentation uses `sphinx.ext.napoleon` to support Google-style 18 | docstrings. Sphinx natively supports reStructuredText directives. 19 | 20 | Automatically generating documentation based on docstrings is not the 21 | core of Sphinx. For this, after much searching, we use better-apidoc. 22 | 23 | ## Future work 24 | 25 | Besides automatic API documentation, Sphinx will facilitate manual prose 26 | documentation. 27 | -------------------------------------------------------------------------------- /docs/_apidoc_templates/module.rst: -------------------------------------------------------------------------------- 1 | {# The :autogenerated: tag is picked up by breadcrumbs.html to suppress "Edit on Github" link #} 2 | :autogenerated: 3 | 4 | {{ fullname }} module 5 | {% for item in range(7 + fullname|length) -%}={%- endfor %} 6 | 7 | .. currentmodule:: {{ fullname }} 8 | 9 | .. automodule:: {{ fullname }} 10 | {% if members -%} 11 | :members: {{ members|join(", ") }} 12 | :undoc-members: 13 | :show-inheritance: 14 | :member-order: bysource 15 | 16 | Summary 17 | ------- 18 | 19 | {%- if exceptions %} 20 | 21 | Exceptions: 22 | 23 | .. autosummary:: 24 | :nosignatures: 25 | {% for item in exceptions %} 26 | {{ item }} 27 | {%- endfor %} 28 | {%- endif %} 29 | 30 | {%- if classes %} 31 | 32 | Classes: 33 | 34 | .. autosummary:: 35 | :nosignatures: 36 | {% for item in classes %} 37 | {{ item }} 38 | {%- endfor %} 39 | {%- endif %} 40 | 41 | {%- if functions %} 42 | 43 | Functions: 44 | 45 | .. autosummary:: 46 | :nosignatures: 47 | {% for item in functions %} 48 | {{ item }} 49 | {%- endfor %} 50 | {%- endif %} 51 | {%- endif %} 52 | 53 | {% set data = get_members(typ='data', in_list='__all__') %} 54 | {%- if data %} 55 | 56 | Data: 57 | 58 | .. 
autosummary:: 59 | :nosignatures: 60 | {% for item in data %} 61 | {{ item }} 62 | {%- endfor %} 63 | {%- endif %} 64 | 65 | {% set all_refs = get_members(in_list='__all__', include_imported=True, out_format='refs') %} 66 | {% if all_refs %} 67 | ``__all__``: {{ all_refs|join(", ") }} 68 | {%- endif %} 69 | 70 | 71 | {% if members %} 72 | Reference 73 | --------- 74 | 75 | {%- endif %} 76 | -------------------------------------------------------------------------------- /docs/docs-requirements.txt: -------------------------------------------------------------------------------- 1 | better-apidoc>=0.3.1 2 | numba 3 | recommonmark>=0.7.1 4 | six 5 | sphinx-rtd-theme>=0.4.3 6 | Sphinx>=3.4.3 7 | -------------------------------------------------------------------------------- /docs/images/logo_noname_rounded_big.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/docs/images/logo_noname_rounded_big.png -------------------------------------------------------------------------------- /docs/tutorials.md: -------------------------------------------------------------------------------- 1 | # Tutorials 2 | 3 | A good way to familiarize yourself with SpeechBrain is to take a look at the Colab tutorials that we made available. More tutorials will be made available as the project will progress. 4 | 5 | The full list of tutorials can be found on the official [website](https://speechbrain.github.io). All the tutorials are developed on the [Google Colab platform](https://colab.research.google.com). This allows users to directly try SpeechBrain on GPUs without the need to set up an environment. 6 | -------------------------------------------------------------------------------- /lint-requirements.txt: -------------------------------------------------------------------------------- 1 | black==19.10b0 2 | flake8==3.7.9 3 | pycodestyle==2.5.0 4 | pytest==5.4.1 5 | yamllint==1.23.0 6 | -------------------------------------------------------------------------------- /pip-wheel-metadata/speechbrain.dist-info/top_level.txt: -------------------------------------------------------------------------------- 1 | speechbrain 2 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.black] 2 | line-length = 80 3 | target-version = ['py38'] 4 | exclude = ''' 5 | 6 | ( 7 | /( 8 | \.eggs # exclude a few common directories in the 9 | | \.git # root of the project 10 | | \.mypy_cache 11 | | \.tox 12 | | \.venv 13 | )/ 14 | ) 15 | ''' 16 | -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | doctest_optionflags= ELLIPSIS 3 | 4 | python_files = 5 | test_*.py 6 | check_*.py 7 | example_*.py 8 | 9 | norecursedirs = results 10 | -------------------------------------------------------------------------------- /recipes/LibriMix/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/recipes/LibriMix/.DS_Store -------------------------------------------------------------------------------- /recipes/LibriMix/__pycache__/prepare_data.cpython-38.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/recipes/LibriMix/__pycache__/prepare_data.cpython-38.pyc -------------------------------------------------------------------------------- /recipes/LibriMix/extra-dependencies.txt: -------------------------------------------------------------------------------- 1 | mir-eval==0.6 2 | pyloudnorm 3 | 4 | -------------------------------------------------------------------------------- /recipes/LibriMix/meta/__pycache__/preprocess_dynamic_mixing.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/recipes/LibriMix/meta/__pycache__/preprocess_dynamic_mixing.cpython-38.pyc -------------------------------------------------------------------------------- /recipes/LibriMix/separation/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/recipes/LibriMix/separation/.DS_Store -------------------------------------------------------------------------------- /recipes/LibriMix/separation/__pycache__/dynamic_mixing.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/recipes/LibriMix/separation/__pycache__/dynamic_mixing.cpython-38.pyc -------------------------------------------------------------------------------- /recipes/LibriMix/separation/hparams/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/recipes/LibriMix/separation/hparams/.DS_Store -------------------------------------------------------------------------------- /recipes/LibriMix/separation/test_dprnn_libri2mix_unified_gm.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | cmd="/path/to/slurm.pl --quiet" 4 | 5 | source activate CONDA_ENV 6 | 7 | $cmd log/test-dprnn-libri2mix-unified-gm.log \ 8 | python train.py hparams/dprnn-libri2mix-unified-gm.yaml --data_folder /path/to/data/LibriMix/Libri2Mix/ --test_only True 9 | 10 | -------------------------------------------------------------------------------- /recipes/LibriMix/separation/test_sepformer_libri2mix_unified_gm.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | cmd="/path/to/slurm.pl --quiet" 4 | 5 | source activate CONDA_ENV 6 | 7 | $cmd log/test-sepformer-libri2mix-unified-gm.log \ 8 | python train.py hparams/sepformer-libri2mix-unified-gm.yaml --data_folder /path/to/data/LibriMix/Libri2Mix/ --test_only True 9 | 10 | -------------------------------------------------------------------------------- /recipes/LibriMix/separation/train_dprnn_libri2mix_unified_gm.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | cmd="/path/to/slurm.pl --quiet" 4 | 5 | source activate CONDA_ENV 6 | 7 | $cmd log/dprnn-libri2mix-unified-gm.log \ 8 | python train_unified_gm.py hparams/dprnn-libri2mix-unified-gm.yaml 
--data_folder /path/to/data/LibriMix/Libri2Mix/ --dynamic_mixing False 9 | 10 | -------------------------------------------------------------------------------- /recipes/LibriMix/separation/train_sepformer_libri2mix_unified_gm.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | cmd="/path/to/slurm.pl --quiet" 4 | 5 | source activate CONDA_ENV 6 | 7 | $cmd log/sepformer-libri2mix-unified-gm.log \ 8 | python train_unified_gm.py hparams/sepformer-libri2mix-unified-gm.yaml --data_folder /path/to/data/LibriMix/Libri2Mix/ --dynamic_mixing False 9 | 10 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | -r lint-requirements.txt 2 | huggingface_hub>=0.0.6 3 | hyperpyyaml>=0.0.1 4 | joblib>=0.14.1 5 | numpy>=1.17.0 6 | packaging 7 | pre-commit>=2.3.0 8 | scipy>=1.4.1 9 | sentencepiece>=0.1.91 10 | SoundFile; sys_platform == 'win32' 11 | torch>=1.8.0,<=1.8.1 12 | torchaudio>=0.7.2,<=0.8.1 13 | tqdm>=4.42.0 14 | -------------------------------------------------------------------------------- /samples/audio_samples/csv_example.csv: -------------------------------------------------------------------------------- 1 | ID, duration, wav, wav_format, wav_opts, spk_id, spk_id_format, spk_id_opts 2 | 3 | example1, 3.260, $data_folder/example1.wav, wav, , spk01, string, 4 | example2, 2.068, $data_folder/example2.flac, flac, , spk02, string, 5 | example3, 2.890, $data_folder/example3.sph, wav, , spk03, string, 6 | example5, 1.000, $data_folder/example5.wav, wav, start:10000 stop:26000, spk05, string, 7 | -------------------------------------------------------------------------------- /samples/audio_samples/csv_example2.csv: -------------------------------------------------------------------------------- 1 | ID, duration, wav, wav_format, wav_opts, spk_id, spk_id_format, spk_id_opts 2 | 3 | example1, 3.260, $data_folder/example1.wav, wav, , spk01, string, 4 | 5 | -------------------------------------------------------------------------------- /samples/audio_samples/csv_example3.csv: -------------------------------------------------------------------------------- 1 | ID, duration, wav, wav_format, wav_opts 2 | 3 | example1, 3.260, samples/audio_samples/example1.wav, wav, 4 | example2, 2.068, samples/audio_samples/example2.flac, flac, 5 | example5, 1.00, samples/audio_samples/example5.wav, wav, start:10000 stop:26000 6 | 7 | -------------------------------------------------------------------------------- /samples/audio_samples/csv_example_multichannel.csv: -------------------------------------------------------------------------------- 1 | ID, duration, wav, wav_format, wav_opts, spk_id, spk_id_format, spk_id_opts 2 | 3 | example1, 3.260, $data_folder/example_multichannel.wav, wav, , spk01, string, 4 | 5 | -------------------------------------------------------------------------------- /samples/audio_samples/example1.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/samples/audio_samples/example1.wav -------------------------------------------------------------------------------- /samples/audio_samples/example2.flac: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/samples/audio_samples/example2.flac -------------------------------------------------------------------------------- /samples/audio_samples/example3.sph: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/samples/audio_samples/example3.sph -------------------------------------------------------------------------------- /samples/audio_samples/example4.raw: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/samples/audio_samples/example4.raw -------------------------------------------------------------------------------- /samples/audio_samples/example5.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/samples/audio_samples/example5.wav -------------------------------------------------------------------------------- /samples/audio_samples/example6.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/samples/audio_samples/example6.wav -------------------------------------------------------------------------------- /samples/audio_samples/example_fr.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/samples/audio_samples/example_fr.wav -------------------------------------------------------------------------------- /samples/audio_samples/example_multichannel.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/samples/audio_samples/example_multichannel.wav -------------------------------------------------------------------------------- /samples/audio_samples/example_noisy.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/samples/audio_samples/example_noisy.wav -------------------------------------------------------------------------------- /samples/audio_samples/multi_mic/noise_0.70225_-0.70225_0.11704.flac: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/samples/audio_samples/multi_mic/noise_0.70225_-0.70225_0.11704.flac -------------------------------------------------------------------------------- /samples/audio_samples/multi_mic/noise_diffuse.flac: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/samples/audio_samples/multi_mic/noise_diffuse.flac -------------------------------------------------------------------------------- /samples/audio_samples/multi_mic/speech_-0.82918_0.55279_-0.082918.flac: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/samples/audio_samples/multi_mic/speech_-0.82918_0.55279_-0.082918.flac -------------------------------------------------------------------------------- /samples/audio_samples/multi_mic/speech_-0.98894_0_0.14834.flac: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/samples/audio_samples/multi_mic/speech_-0.98894_0_0.14834.flac -------------------------------------------------------------------------------- /samples/audio_samples/nn_training_samples/dev.csv: -------------------------------------------------------------------------------- 1 | ID, duration, wav, wav_format, wav_opts, spk_id, spk_id_format, spk_id_opts, ali, ali_format, ali_opts, phn, phn_format, phn_opts,char,char_format,char_opts 2 | spk1_snt5,2.6,$data_folder/spk1_snt5.wav, wav, ,spk1,string, ,$data_folder/spk1_snt5.pkl,pkl, ,s ah n vcl d ey ih z dh ax vcl b eh s cl t cl p aa r dx ax v dh ax w iy cl,string, ,s u n d a y i s t h e b e s t p a r t o f t h e w e e k,string, 3 | spk2_snt5,1.98,$data_folder/spk2_snt5.wav, wav, ,spk2,string, ,$data_folder/spk2_snt5.pkl,pkl, ,vcl jh ah m cl p dh ax f eh n s ae n hh er iy ah cl p dh ax vcl b ae ng cl,string, ,k e n p a I r s l a c k f u l l f l a v o r,string, 4 | -------------------------------------------------------------------------------- /samples/audio_samples/nn_training_samples/dev.json: -------------------------------------------------------------------------------- 1 | { 2 | "spk1_snt5": { 3 | "wav": "{data_root}/spk1_snt5.wav", 4 | "length": 2.6, 5 | "spk_id": "spk1", 6 | "ali": "{data_root}/spk1_snt5.pkl", 7 | "phn": "s ah n vcl d ey ih z dh ax vcl b eh s cl t cl p aa r dx ax v dh ax w iy cl", 8 | "char": "s u n d a y i s t h e b e s t p a r t o f t h e w e e k" 9 | }, 10 | "spk2_snt5": { 11 | "wav": "{data_root}/spk2_snt5.wav", 12 | "length": 1.98, 13 | "spk_id": "spk2", 14 | "ali": "{data_root}/spk2_snt5.pkl", 15 | "phn": "vcl jh ah m cl p dh ax f eh n s ae n hh er iy ah cl p dh ax vcl b ae ng cl", 16 | "char": "k e n p a i r s l a c k f u l l f l a v o r" 17 | } 18 | } -------------------------------------------------------------------------------- /samples/audio_samples/nn_training_samples/spk1_snt1.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/samples/audio_samples/nn_training_samples/spk1_snt1.pkl -------------------------------------------------------------------------------- /samples/audio_samples/nn_training_samples/spk1_snt1.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/samples/audio_samples/nn_training_samples/spk1_snt1.wav -------------------------------------------------------------------------------- /samples/audio_samples/nn_training_samples/spk1_snt2.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/samples/audio_samples/nn_training_samples/spk1_snt2.pkl 
-------------------------------------------------------------------------------- /samples/audio_samples/nn_training_samples/spk1_snt2.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/samples/audio_samples/nn_training_samples/spk1_snt2.wav -------------------------------------------------------------------------------- /samples/audio_samples/nn_training_samples/spk1_snt3.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/samples/audio_samples/nn_training_samples/spk1_snt3.pkl -------------------------------------------------------------------------------- /samples/audio_samples/nn_training_samples/spk1_snt3.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/samples/audio_samples/nn_training_samples/spk1_snt3.wav -------------------------------------------------------------------------------- /samples/audio_samples/nn_training_samples/spk1_snt4.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/samples/audio_samples/nn_training_samples/spk1_snt4.pkl -------------------------------------------------------------------------------- /samples/audio_samples/nn_training_samples/spk1_snt4.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/samples/audio_samples/nn_training_samples/spk1_snt4.wav -------------------------------------------------------------------------------- /samples/audio_samples/nn_training_samples/spk1_snt5.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/samples/audio_samples/nn_training_samples/spk1_snt5.pkl -------------------------------------------------------------------------------- /samples/audio_samples/nn_training_samples/spk1_snt5.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/samples/audio_samples/nn_training_samples/spk1_snt5.wav -------------------------------------------------------------------------------- /samples/audio_samples/nn_training_samples/spk1_snt6.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/samples/audio_samples/nn_training_samples/spk1_snt6.pkl -------------------------------------------------------------------------------- /samples/audio_samples/nn_training_samples/spk1_snt6.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/samples/audio_samples/nn_training_samples/spk1_snt6.wav 
-------------------------------------------------------------------------------- /samples/audio_samples/nn_training_samples/spk2_snt1.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/samples/audio_samples/nn_training_samples/spk2_snt1.pkl -------------------------------------------------------------------------------- /samples/audio_samples/nn_training_samples/spk2_snt1.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/samples/audio_samples/nn_training_samples/spk2_snt1.wav -------------------------------------------------------------------------------- /samples/audio_samples/nn_training_samples/spk2_snt2.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/samples/audio_samples/nn_training_samples/spk2_snt2.pkl -------------------------------------------------------------------------------- /samples/audio_samples/nn_training_samples/spk2_snt2.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/samples/audio_samples/nn_training_samples/spk2_snt2.wav -------------------------------------------------------------------------------- /samples/audio_samples/nn_training_samples/spk2_snt3.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/samples/audio_samples/nn_training_samples/spk2_snt3.pkl -------------------------------------------------------------------------------- /samples/audio_samples/nn_training_samples/spk2_snt3.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/samples/audio_samples/nn_training_samples/spk2_snt3.wav -------------------------------------------------------------------------------- /samples/audio_samples/nn_training_samples/spk2_snt4.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/samples/audio_samples/nn_training_samples/spk2_snt4.pkl -------------------------------------------------------------------------------- /samples/audio_samples/nn_training_samples/spk2_snt4.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/samples/audio_samples/nn_training_samples/spk2_snt4.wav -------------------------------------------------------------------------------- /samples/audio_samples/nn_training_samples/spk2_snt5.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/samples/audio_samples/nn_training_samples/spk2_snt5.pkl 
-------------------------------------------------------------------------------- /samples/audio_samples/nn_training_samples/spk2_snt5.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/samples/audio_samples/nn_training_samples/spk2_snt5.wav -------------------------------------------------------------------------------- /samples/audio_samples/nn_training_samples/spk2_snt6.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/samples/audio_samples/nn_training_samples/spk2_snt6.pkl -------------------------------------------------------------------------------- /samples/audio_samples/nn_training_samples/spk2_snt6.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/samples/audio_samples/nn_training_samples/spk2_snt6.wav -------------------------------------------------------------------------------- /samples/audio_samples/nn_training_samples/test.csv: -------------------------------------------------------------------------------- 1 | ID, duration, wav, wav_format, wav_opts, spk_id, spk_id_format, spk_id_opts, ali, ali_format, ali_opts, phn, phn_format, phn_opts,char,char_format,char_opts 2 | spk1_snt6,2.29,$data_folder/spk1_snt6.wav, wav, ,spk1,string, ,$data_folder/spk1_snt6.pkl,pkl, ,t h e p e n c I l s h a v e a l l b e e n u s e d,string, ,t h e c h i l d a l m o s t h u r t t h e s m a l l d o g ,string, 3 | spk2_snt6 ,1.8,$data_folder/spk2_snt6.wav, wav, ,spk2,string, ,$data_folder/spk2_snt6.pkl,pkl, ,j u m p t h e f e n c e a n d h u r r y u p t h e b a n k,string,,k e n p a I r s l a c k f u l l f l a v o r,string, 4 | -------------------------------------------------------------------------------- /samples/audio_samples/nn_training_samples/train.csv: -------------------------------------------------------------------------------- 1 | ID, duration, wav, wav_format, wav_opts, spk_id, spk_id_format, spk_id_opts, ali, ali_format, ali_opts, phn, phn_format, phn_opts,char,char_format,char_opts 2 | spk1_snt1,2.87,$data_folder/spk1_snt1.wav, wav, ,spk1,string, ,$data_folder/spk1_snt1.pkl,pkl, ,dh ax cl ch ay l vcl d ao l m ow s cl t hh er cl t sil dh ax s m ao l vcl d ao vcl,string, ,t h e c h i l d a l m o s t h u r t t h e s m a l l d o g ,string, 3 | spk1_snt2,3.15,$data_folder/spk1_snt2.wav, wav, ,spk1,string, ,$data_folder/spk1_snt2.pkl,pkl, ,vcl d r aa cl p dh ax cl t uw sil w eh n y uw ae vcl d dh ax f ih vcl g y er,string, ,d r o p t h e t u e w h e n y o u a d d t h e f i g u r e s,string, 4 | spk1_snt3,2.72,$data_folder/spk1_snt3.wav, wav, ,spk1,string, ,$data_folder/spk1_snt3.pkl,pkl, ,ae cl t dh ae cl t hh ay l eh v ax l dh iy eh r ih z cl p y uh,string, ,A t t h a t h i g h l e v e l t h e a i r i s p u r e,string, 5 | spk1_snt4,2.53,$data_folder/spk1_snt4.wav, wav, ,spk1,string, ,$data_folder/spk1_snt4.pkl,pkl, ,ey th ih n s cl t r ay cl p r ah n z vcl d aw n dh ax m ih vcl d ax,string, ,a t h i n s t r i p e r u n s d o w n t h e m i d d l e,string, 6 | spk2_snt1,2.01,$data_folder/spk2_snt1.wav, wav, ,spk2,string, ,$data_folder/spk2_snt1.pkl,pkl, ,w iy er sh ao r dh ax dx w ah n w ao r ih z ih n ah,string, ,w e a r e s u r e t h a t o n e w o r e 
i s e n o u g h,string, 7 | spk2_snt2,1.76,$data_folder/spk2_snt2.wav, wav, ,spk2,string, ,$data_folder/spk2_snt2.pkl,pkl, ,w ah cl t vcl jh oy dh eh r ih z ih n l ih v ih,string, ,w h a t j o y t h e r e i s i n l i v i n g,string, 8 | spk2_snt3,1.88,$data_folder/spk2_snt3.wav, wav, ,spk2,string, ,$data_folder/spk2_snt3.pkl,pkl, ,t eh r ah th ih n sh iy cl t f er m dh iy y eh l ow cl p ae vcl,string, ,t h e r a t I n s h I p f r o m t h e y a l l o w p a v,string, 9 | spk2_snt4,2.04,$data_folder/spk2_snt4.wav, wav, ,spk2,string, ,$data_folder/spk2_snt4.pkl,pkl, ,m eh n vcl d dh ax cl k ow cl t vcl b ih f ao r y uw vcl g ow aw cl,string, ,m e n t h e c o w t b e f o r e y o u g o o u t,string, 10 | -------------------------------------------------------------------------------- /samples/audio_samples/nn_training_samples/train.json: -------------------------------------------------------------------------------- 1 | { 2 | "spk1_snt1": { 3 | "wav": "{data_root}/spk1_snt1.wav", 4 | "length": 2.87, 5 | "spk_id": "spk1", 6 | "ali": "{data_root}/spk1_snt1.pkl", 7 | "phn": "dh ax cl ch ay l vcl d ao l m ow s cl t hh er cl t sil dh ax s m ao l vcl d ao vcl", 8 | "char": "t h e c h i l d a l m o s t h u r t t h e s m a l l d o g " 9 | }, 10 | "spk1_snt2": { 11 | "wav": "{data_root}/spk1_snt2.wav", 12 | "length": 3.15, 13 | "spk_id": "spk1", 14 | "ali": "{data_root}/spk1_snt2.pkl", 15 | "phn": "vcl d r aa cl p dh ax cl t uw sil w eh n y uw ae vcl d dh ax f ih vcl g y er", 16 | "char": "d r o p t h e t u e w h e n y o u a d d t h e f i g u r e s" 17 | }, 18 | "spk1_snt3": { 19 | "wav": "{data_root}/spk1_snt3.wav", 20 | "length": 2.72, 21 | "spk_id": "spk1", 22 | "ali": "{data_root}/spk1_snt3.pkl", 23 | "phn": "ae cl t dh ae cl t hh ay l eh v ax l dh iy eh r ih z cl p y uh", 24 | "char": "a t t h a t h i g h l e v e l t h e a i r i s p u r e" 25 | }, 26 | "spk1_snt4": { 27 | "wav": "{data_root}/spk1_snt4.wav", 28 | "length": 2.53, 29 | "spk_id": "spk1", 30 | "ali": "{data_root}/spk1_snt4.pkl", 31 | "phn": "ey th ih n s cl t r ay cl p r ah n z vcl d aw n dh ax m ih vcl d ax", 32 | "char": "a t h i n s t r i p e r u n s d o w n t h e m i d d l e" 33 | }, 34 | "spk2_snt1": { 35 | "wav": "{data_root}/spk2_snt1.wav", 36 | "length": 2.01, 37 | "spk_id": "spk2", 38 | "ali": "{data_root}/spk2_snt1.pkl", 39 | "phn": "w iy er sh ao r dh ax dx w ah n w ao r ih z ih n ah", 40 | "char": "w e a r e s u r e t h a t o n e w o r e i s e n o u g h" 41 | }, 42 | "spk2_snt2": { 43 | "wav": "{data_root}/spk2_snt2.wav", 44 | "length": 1.76, 45 | "spk_id": "spk2", 46 | "ali": "{data_root}/spk2_snt2.pkl", 47 | "phn": "w ah cl t vcl jh oy dh eh r ih z ih n l ih v ih", 48 | "char": "w h a t j o y t h e r e i s i n l i v i n g" 49 | }, 50 | "spk2_snt3": { 51 | "wav": "{data_root}/spk2_snt3.wav", 52 | "length": 1.88, 53 | "spk_id": "spk2", 54 | "ali": "{data_root}/spk2_snt3.pkl", 55 | "phn": "t eh r ah th ih n sh iy cl t f er m dh iy y eh l ow cl p ae vcl", 56 | "char": "t h e r a t i n s h i p f r o m t h e y a l l o w p a v" 57 | }, 58 | "spk2_snt4": { 59 | "wav": "{data_root}/spk2_snt4.wav", 60 | "length": 2.04, 61 | "spk_id": "spk2", 62 | "ali": "{data_root}/spk2_snt4.pkl", 63 | "phn": "m eh n vcl d dh ax cl k ow cl t vcl b ih f ao r y uw vcl g ow aw cl", 64 | "char": "m e n t h e c o w t b e f o r e y o u g o o u t" 65 | } 66 | } -------------------------------------------------------------------------------- /samples/audio_samples/sourcesep_samples/csv_example_sourcesep_mixture.csv: 
-------------------------------------------------------------------------------- 1 | ID, duration, wav, wav_format, wav_opts 2 | 3 | example0, 3.260, $data_folder/mixture_0.wav, wav, 4 | example1, 3.260, $data_folder/mixture_1.wav, wav, 5 | 6 | 7 | 8 | 9 | -------------------------------------------------------------------------------- /samples/audio_samples/sourcesep_samples/csv_example_sourcesep_source1.csv: -------------------------------------------------------------------------------- 1 | ID, duration, wav, wav_format, wav_opts 2 | 3 | example1, 3.260, $data_folder/source1_0.wav, wav, 4 | example3, 3.260, $data_folder/source1_1.wav, wav, 5 | 6 | 7 | 8 | 9 | -------------------------------------------------------------------------------- /samples/audio_samples/sourcesep_samples/csv_example_sourcesep_source2.csv: -------------------------------------------------------------------------------- 1 | ID, duration, wav, wav_format, wav_opts 2 | 3 | example2, 3.260, $data_folder/source2_0.wav, wav, 4 | example4, 3.260, $data_folder/source2_1.wav, wav, 5 | 6 | 7 | 8 | 9 | -------------------------------------------------------------------------------- /samples/audio_samples/sourcesep_samples/minimal_example_convtasnet_cv.csv: -------------------------------------------------------------------------------- 1 | ID, duration, mix_wav, mix_wav_format, mix_wav_opts, s1_wav, s1_wav_format, s1_wav_opts, s2_wav, s2_wav_format, s2_wav_opts 2 | 0,1.0,$data_root/mixture_2.wav,wav,,$data_root/source1_2.wav,wav,,$data_root/source2_2.wav,wav, 3 | -------------------------------------------------------------------------------- /samples/audio_samples/sourcesep_samples/minimal_example_convtasnet_tr.csv: -------------------------------------------------------------------------------- 1 | ID, duration, mix_wav, mix_wav_format, mix_wav_opts, s1_wav, s1_wav_format, s1_wav_opts, s2_wav, s2_wav_format, s2_wav_opts 2 | 0,1.0,$data_root/mixture_0.wav,wav,,$data_root/source1_0.wav,wav,,$data_root/source2_0.wav,wav, 3 | 1,1.0,$data_root/mixture_1.wav,wav,,$data_root/source1_1.wav,wav,,$data_root/source2_1.wav,wav, 4 | -------------------------------------------------------------------------------- /samples/audio_samples/sourcesep_samples/minimal_example_convtasnet_tt.csv: -------------------------------------------------------------------------------- 1 | ID, duration, mix_wav, mix_wav_format, mix_wav_opts, s1_wav, s1_wav_format, s1_wav_opts, s2_wav, s2_wav_format, s2_wav_opts 2 | 0,1.0,$data_root/mixture_3.wav,wav,,$data_root/source1_3.wav,wav,,$data_root/source2_3.wav,wav, 3 | -------------------------------------------------------------------------------- /samples/audio_samples/sourcesep_samples/mixture_0.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/samples/audio_samples/sourcesep_samples/mixture_0.wav -------------------------------------------------------------------------------- /samples/audio_samples/sourcesep_samples/mixture_1.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/samples/audio_samples/sourcesep_samples/mixture_1.wav -------------------------------------------------------------------------------- /samples/audio_samples/sourcesep_samples/mixture_2.wav: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/samples/audio_samples/sourcesep_samples/mixture_2.wav -------------------------------------------------------------------------------- /samples/audio_samples/sourcesep_samples/mixture_3.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/samples/audio_samples/sourcesep_samples/mixture_3.wav -------------------------------------------------------------------------------- /samples/audio_samples/sourcesep_samples/source1_0.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/samples/audio_samples/sourcesep_samples/source1_0.wav -------------------------------------------------------------------------------- /samples/audio_samples/sourcesep_samples/source1_1.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/samples/audio_samples/sourcesep_samples/source1_1.wav -------------------------------------------------------------------------------- /samples/audio_samples/sourcesep_samples/source1_2.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/samples/audio_samples/sourcesep_samples/source1_2.wav -------------------------------------------------------------------------------- /samples/audio_samples/sourcesep_samples/source1_3.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/samples/audio_samples/sourcesep_samples/source1_3.wav -------------------------------------------------------------------------------- /samples/audio_samples/sourcesep_samples/source2_0.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/samples/audio_samples/sourcesep_samples/source2_0.wav -------------------------------------------------------------------------------- /samples/audio_samples/sourcesep_samples/source2_1.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/samples/audio_samples/sourcesep_samples/source2_1.wav -------------------------------------------------------------------------------- /samples/audio_samples/sourcesep_samples/source2_2.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/samples/audio_samples/sourcesep_samples/source2_2.wav -------------------------------------------------------------------------------- /samples/audio_samples/sourcesep_samples/source2_3.wav: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/samples/audio_samples/sourcesep_samples/source2_3.wav -------------------------------------------------------------------------------- /samples/audio_samples/test_csv_merge.csv: -------------------------------------------------------------------------------- 1 | ID, duration, wav, wav_format, wav_opts, spk_id, spk_id_format, spk_id_opts 2 | 3 | example1, 3.260, $data_folder/example1.wav, wav, , spk01, string, 4 | example2, 2.068, $data_folder/example2.flac, flac, , spk02, string, 5 | example3, 2.890, $data_folder/example3.sph, wav, , spk03, string, 6 | example5, 1.000, $data_folder/example5.wav, wav, start:10000 stop:26000, spk05, string, 7 | 8 | example1, 3.260, $data_folder/example1.wav, wav, , spk01, string, 9 | 10 | -------------------------------------------------------------------------------- /samples/audio_samples/test_mixture.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/samples/audio_samples/test_mixture.wav -------------------------------------------------------------------------------- /samples/audio_samples/vad/train.json: -------------------------------------------------------------------------------- 1 | { 2 | "example_1": { 3 | "length": 32000, 4 | "wav": { 5 | "file": "{data_folder}/train.wav", 6 | "start": 0, 7 | "stop": 32000 8 | }, 9 | "speech": "0.52 0.85 1.32 1.83" 10 | }, 11 | "example_2": { 12 | "length": 32000, 13 | "wav": { 14 | "file": "{data_folder}/train.wav", 15 | "start": 32000, 16 | "stop": 64000 17 | }, 18 | "speech": "0.35 1.70" 19 | } 20 | } -------------------------------------------------------------------------------- /samples/audio_samples/vad/train.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/samples/audio_samples/vad/train.wav -------------------------------------------------------------------------------- /samples/audio_samples/vad/valid.json: -------------------------------------------------------------------------------- 1 | { 2 | "example_1": { 3 | "length": 32000, 4 | "wav": { 5 | "file": "{data_folder}/valid.wav", 6 | "start": 0, 7 | "stop": 32000 8 | }, 9 | "speech": "0.38 2" 10 | }, 11 | "example_2": { 12 | "length": 32000, 13 | "wav": { 14 | "file": "{data_folder}/valid.wav", 15 | "start": 32000, 16 | "stop": 64000 17 | }, 18 | "speech": "0 0.7" 19 | }, 20 | "example_3": { 21 | "length": 32000, 22 | "wav": { 23 | "file": "{data_folder}/valid.wav", 24 | "start": 64000, 25 | "stop": 96000 26 | }, 27 | "speech": "0.1 1.88" 28 | }, 29 | "example_4": { 30 | "length": 32000, 31 | "wav": { 32 | "file": "{data_folder}/valid.wav", 33 | "start": 96000, 34 | "stop": 128000 35 | }, 36 | "speech": "" 37 | } 38 | } -------------------------------------------------------------------------------- /samples/audio_samples/vad/valid.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/samples/audio_samples/vad/valid.wav -------------------------------------------------------------------------------- /samples/label_samples/hyp.csv: 
-------------------------------------------------------------------------------- 1 | ID,duration,phn,phn_format,phn_opts 2 | example1,3.1,a b d,string, 3 | example2,4.5,e f,string, 4 | -------------------------------------------------------------------------------- /samples/label_samples/ref.csv: -------------------------------------------------------------------------------- 1 | ID,duration,phn,phn_format,phn_opts 2 | example1,3.1,a b c,string, 3 | example2,4.5,d e f,string, 4 | -------------------------------------------------------------------------------- /samples/noise_samples/noise.csv: -------------------------------------------------------------------------------- 1 | ID, duration, wav, wav_format, wav_opts 2 | 3 | noise1, 33.12325, samples/noise_samples/noise1.wav, wav, 4 | noise2, 5.0, samples/noise_samples/noise2.wav, wav, 5 | noise3, 1.0, samples/noise_samples/noise3.wav, wav, start:0 stop:16000 6 | noise4, 17.65875, samples/noise_samples/noise4.wav, wav, 7 | noise5, 13.685625, samples/noise_samples/noise5.wav, wav, 8 | -------------------------------------------------------------------------------- /samples/noise_samples/noise1.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/samples/noise_samples/noise1.wav -------------------------------------------------------------------------------- /samples/noise_samples/noise2.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/samples/noise_samples/noise2.wav -------------------------------------------------------------------------------- /samples/noise_samples/noise3.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/samples/noise_samples/noise3.wav -------------------------------------------------------------------------------- /samples/noise_samples/noise4.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/samples/noise_samples/noise4.wav -------------------------------------------------------------------------------- /samples/noise_samples/noise5.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/samples/noise_samples/noise5.wav -------------------------------------------------------------------------------- /samples/noise_samples/noise_multichannel.csv: -------------------------------------------------------------------------------- 1 | ID, duration, wav, wav_format, wav_opts 2 | 3 | noise_multichannel1, 5.0, samples/noise_samples/noise_multichannel.wav, wav, start:0 stop:80000 4 | noise_multichannel2, 5.0, samples/noise_samples/noise_multichannel.wav, wav, start:80000 stop:160000 5 | -------------------------------------------------------------------------------- /samples/noise_samples/noise_multichannel.wav: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/samples/noise_samples/noise_multichannel.wav -------------------------------------------------------------------------------- /samples/noise_samples/noise_rel.csv: -------------------------------------------------------------------------------- 1 | ID, duration, wav, wav_format, wav_opts 2 | 3 | noise1, 33.12325, $noise_folder/noise1.wav, wav, 4 | noise2, 5.0, $noise_folder/noise2.wav, wav, 5 | noise3, 1.0, $noise_folder/noise3.wav, wav, start:0 stop:16000 6 | noise4, 17.65875, $noise_folder/noise4.wav, wav, 7 | noise5, 13.685625, $noise_folder/noise5.wav, wav, 8 | -------------------------------------------------------------------------------- /samples/plda_xvect_samples/enrol_stat_xvect.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/samples/plda_xvect_samples/enrol_stat_xvect.pkl -------------------------------------------------------------------------------- /samples/plda_xvect_samples/expected_plda_scores.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/samples/plda_xvect_samples/expected_plda_scores.pkl -------------------------------------------------------------------------------- /samples/plda_xvect_samples/test_stat_xvect.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/samples/plda_xvect_samples/test_stat_xvect.pkl -------------------------------------------------------------------------------- /samples/plda_xvect_samples/train_stat_xvect.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/samples/plda_xvect_samples/train_stat_xvect.pkl -------------------------------------------------------------------------------- /samples/rir_samples/rir1.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/samples/rir_samples/rir1.wav -------------------------------------------------------------------------------- /samples/rir_samples/rir2.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/samples/rir_samples/rir2.wav -------------------------------------------------------------------------------- /samples/rir_samples/rir3.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/samples/rir_samples/rir3.wav -------------------------------------------------------------------------------- /samples/rir_samples/rir4.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/samples/rir_samples/rir4.wav 
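--------------------------------------------------------------------------------
Note on the noise CSV lists above (noise.csv, noise_rel.csv) and the RIR lists that follow: each row gives an ID, a duration in seconds, a wav path that may contain a placeholder such as $noise_folder or $rir_folder, a format field, and an optional wav_opts string like "start:0 stop:16000" that selects a sample range. The snippet below is only a minimal sketch of how one such row could be interpreted; the canonical reader is SpeechBrain's legacy CSV data io, and the variable names here are hypothetical.

import torchaudio

# Hypothetical values standing in for one row of noise_rel.csv:
#   noise3, 1.0, $noise_folder/noise3.wav, wav, start:0 stop:16000
noise_folder = "samples/noise_samples"
wav_path = "$noise_folder/noise3.wav".replace("$noise_folder", noise_folder)
start, stop = 0, 16000  # wav_opts are expressed in samples, not seconds

signal, sample_rate = torchaudio.load(wav_path)  # tensor shaped [channels, time]
chunk = signal[:, start:stop]                    # keep only the requested sample range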
-------------------------------------------------------------------------------- /samples/rir_samples/rir_multichannel.csv: -------------------------------------------------------------------------------- 1 | ID, duration, wav, wav_format, wav_opts 2 | 3 | rir_multichannel, 0.5, samples/rir_samples/rir_multichannel.wav, wav, 4 | -------------------------------------------------------------------------------- /samples/rir_samples/rir_multichannel.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/samples/rir_samples/rir_multichannel.wav -------------------------------------------------------------------------------- /samples/rir_samples/rirs.csv: -------------------------------------------------------------------------------- 1 | ID, duration, wav, wav_format, wav_opts 2 | rir1, 1.0, samples/rir_samples/rir1.wav, wav, 3 | rir2, 1.3653125, samples/rir_samples/rir2.wav, wav, 4 | rir3, 2.0, samples/rir_samples/rir3.wav, wav, 5 | rir4, 0.5, samples/rir_samples/rir4.wav, wav, 6 | 7 | -------------------------------------------------------------------------------- /samples/rir_samples/rirs_rel.csv: -------------------------------------------------------------------------------- 1 | ID, duration, wav, wav_format, wav_opts 2 | 3 | rir1, 1.0, $rir_folder/rir1.wav, wav, 4 | rir2, 1.3653125, $rir_folder/rir2.wav, wav, 5 | rir3, 2.0, $rir_folder/rir3.wav, wav, 6 | rir4, 0.5, $rir_folder/rir4.wav, wav, 7 | 8 | -------------------------------------------------------------------------------- /samples/rttm_samples/ReadMe.md: -------------------------------------------------------------------------------- 1 | ## RTTM Files 2 | ###### The sample RTTM files given in this directory are generated from manual annotations from the AMI corpus (http://groups.inf.ed.ac.uk/ami/corpus/). 3 | ###### The AMI corpus and its annotations are released under the Creative Commons Attribution 4.0 International Public License agreement (CC BY 4.0). Use of this data implies agreement with the license terms. See also: https://creativecommons.org/licenses/by/4.0/ 4 | -------------------------------------------------------------------------------- /samples/text_samples/hdf5_example.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/samples/text_samples/hdf5_example.h5 -------------------------------------------------------------------------------- /samples/text_samples/label_dict.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/samples/text_samples/label_dict.pkl -------------------------------------------------------------------------------- /samples/text_samples/readme.txt: -------------------------------------------------------------------------------- 1 | hdf5_example.h5 contains an example of an HDF5-format dataset for text-only data. 2 | The structure of the file is as follows. 3 | 4 | hdf5_example.h5 - wrd - "good morning" 5 | - "good evening" 6 | - char - "g o o d _ m o r n i n g" 7 | - "g o o d _ e v e n i n g" 8 | 9 | The label_dict.pkl file is used by the HDF5 dataloader and dataset.
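A minimal sketch of inspecting this structure with h5py (this is not the SpeechBrain HDF5 dataloader itself; the "wrd" and "char" group names come from the diagram above, and the exact on-disk layout is assumed):

import h5py

with h5py.File("samples/text_samples/hdf5_example.h5", "r") as f:
    print(list(f.keys()))   # expected to include the "wrd" and "char" entries
    for key in f.keys():
        print(key, f[key])  # inspect each group/dataset without assuming its exact type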
10 | -------------------------------------------------------------------------------- /samples/voxceleb_samples/meta/iden_split.txt: -------------------------------------------------------------------------------- 1 | 1 id10001/1zcIwhmdeo4/00001.wav 2 | 2 id10001/1zcIwhmdeo4/00002.wav 3 | 3 id10001/1zcIwhmdeo4/00003.wav 4 | 1 id10002/xTV-jFAUKcw/00001.wav 5 | 2 id10002/xTV-jFAUKcw/00002.wav 6 | 3 id10002/xTV-jFAUKcw/00003.wav 7 | -------------------------------------------------------------------------------- /samples/voxceleb_samples/readme.txt: -------------------------------------------------------------------------------- 1 | 2 | This is a small sample dataset containing 6 audio clips taken from a subset of the voxceleb1 dataset (http://www.robots.ox.ac.uk/~vgg/data/voxceleb/), which is distributed under the Creative Commons Attribution 4.0 International License (https://creativecommons.org/licenses/by/4.0/). 3 | In this sample data, we have edited the train-dev-test split in the iden_split_sample.txt file. Please refer to http://www.robots.ox.ac.uk/~vgg/data/voxceleb/ for more information on the complete original dataset. 4 | 5 | -------------------------------------------------------------------------------- /samples/voxceleb_samples/wav/dev.csv: -------------------------------------------------------------------------------- 1 | ID,duration,wav,wav_format,wav_opts,spk_id,spk_id_format,spk_id_opts 2 | id10001---1zcIwhmdeo4---00001_0_300,3.0,$data_folder/id10001/1zcIwhmdeo4/00001.wav,wav,start:0 stop:48000,id10001,string, 3 | id10001---1zcIwhmdeo4---00001_300_600,3.0,$data_folder/id10001/1zcIwhmdeo4/00001.wav,wav,start:48000 stop:96000,id10001,string, 4 | id10002---xTV-jFAUKcw---00001_0_300,3.0,$data_folder/id10002/xTV-jFAUKcw/00001.wav,wav,start:0 stop:48000,id10002,string, 5 | -------------------------------------------------------------------------------- /samples/voxceleb_samples/wav/id10001/1zcIwhmdeo4/00001.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/samples/voxceleb_samples/wav/id10001/1zcIwhmdeo4/00001.wav -------------------------------------------------------------------------------- /samples/voxceleb_samples/wav/id10001/1zcIwhmdeo4/00002.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/samples/voxceleb_samples/wav/id10001/1zcIwhmdeo4/00002.wav -------------------------------------------------------------------------------- /samples/voxceleb_samples/wav/id10001/1zcIwhmdeo4/00003.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/samples/voxceleb_samples/wav/id10001/1zcIwhmdeo4/00003.wav -------------------------------------------------------------------------------- /samples/voxceleb_samples/wav/id10002/xTV-jFAUKcw/00001.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/samples/voxceleb_samples/wav/id10002/xTV-jFAUKcw/00001.wav -------------------------------------------------------------------------------- /samples/voxceleb_samples/wav/id10002/xTV-jFAUKcw/00002.wav:
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/samples/voxceleb_samples/wav/id10002/xTV-jFAUKcw/00002.wav -------------------------------------------------------------------------------- /samples/voxceleb_samples/wav/id10002/xTV-jFAUKcw/00003.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/samples/voxceleb_samples/wav/id10002/xTV-jFAUKcw/00003.wav -------------------------------------------------------------------------------- /samples/voxceleb_samples/wav/train.csv: -------------------------------------------------------------------------------- 1 | ID,duration,wav,wav_format,wav_opts,spk_id,spk_id_format,spk_id_opts 2 | id10001---1zcIwhmdeo4---00003_0_300,3.0,$data_folder/id10001/1zcIwhmdeo4/00003.wav,wav,start:0 stop:48000,id10001,string, 3 | id10002---xTV-jFAUKcw---00002_0_300,3.0,$data_folder/id10002/xTV-jFAUKcw/00002.wav,wav,start:0 stop:48000,id10002,string, 4 | id10001---1zcIwhmdeo4---00002_0_300,3.0,$data_folder/id10001/1zcIwhmdeo4/00002.wav,wav,start:0 stop:48000,id10001,string, 5 | id10002---xTV-jFAUKcw---00003_0_300,3.0,$data_folder/id10002/xTV-jFAUKcw/00003.wav,wav,start:0 stop:48000,id10002,string, 6 | id10001---1zcIwhmdeo4---00002_300_600,3.0,$data_folder/id10001/1zcIwhmdeo4/00002.wav,wav,start:48000 stop:96000,id10001,string, 7 | id10002---xTV-jFAUKcw---00003_300_600,3.0,$data_folder/id10002/xTV-jFAUKcw/00003.wav,wav,start:48000 stop:96000,id10002,string, 8 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import os 3 | import sys 4 | import site 5 | import setuptools 6 | from distutils.core import setup 7 | 8 | 9 | # Editable install in user site directory can be allowed with this hack: 10 | # https://github.com/pypa/pip/issues/7953. 
11 | site.ENABLE_USER_SITE = "--user" in sys.argv[1:] 12 | 13 | with open("README.md") as f: 14 | long_description = f.read() 15 | 16 | with open(os.path.join("speechbrain", "version.txt")) as f: 17 | version = f.read().strip() 18 | 19 | setup( 20 | name="speechbrain", 21 | version=version, 22 | description="All-in-one speech toolkit in pure Python and Pytorch", 23 | long_description=long_description, 24 | long_description_content_type="text/markdown", 25 | author="Mirco Ravanelli & Others", 26 | author_email="speechbrain@gmail.com", 27 | packages=setuptools.find_packages(), 28 | package_data={"speechbrain": ["version.txt", "log-config.yaml"]}, 29 | install_requires=[ 30 | "hyperpyyaml", 31 | "joblib", 32 | "numpy", 33 | "packaging", 34 | "scipy", 35 | "sentencepiece", 36 | "torch>=1.7,<=1.11", 37 | "torchaudio", 38 | "tqdm", 39 | "huggingface_hub", 40 | ], 41 | python_requires=">=3.7", 42 | url="https://speechbrain.github.io/", 43 | ) 44 | -------------------------------------------------------------------------------- /speechbrain.egg-info/dependency_links.txt: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /speechbrain.egg-info/requires.txt: -------------------------------------------------------------------------------- 1 | hyperpyyaml 2 | joblib 3 | numpy 4 | packaging 5 | scipy 6 | sentencepiece 7 | torch<=1.11,>=1.7 8 | torchaudio 9 | tqdm 10 | huggingface_hub 11 | -------------------------------------------------------------------------------- /speechbrain.egg-info/top_level.txt: -------------------------------------------------------------------------------- 1 | speechbrain 2 | -------------------------------------------------------------------------------- /speechbrain/__init__.py: -------------------------------------------------------------------------------- 1 | """ Comprehensive speech processing toolkit 2 | """ 3 | import os 4 | from .core import Stage, Brain, create_experiment_directory, parse_arguments 5 | from . import alignment # noqa 6 | from . import dataio # noqa 7 | from . import decoders # noqa 8 | from . import lobes # noqa 9 | from . import lm # noqa 10 | from . import nnet # noqa 11 | from . import processing # noqa 12 | from . import tokenizers # noqa 13 | from . 
import utils # noqa 14 | 15 | with open(os.path.join(os.path.dirname(__file__), "version.txt")) as f: 16 | version = f.read().strip() 17 | 18 | __all__ = [ 19 | "Stage", 20 | "Brain", 21 | "create_experiment_directory", 22 | "parse_arguments", 23 | ] 24 | 25 | __version__ = version 26 | -------------------------------------------------------------------------------- /speechbrain/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/speechbrain/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /speechbrain/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/speechbrain/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /speechbrain/__pycache__/core.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/speechbrain/__pycache__/core.cpython-37.pyc -------------------------------------------------------------------------------- /speechbrain/__pycache__/core.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/speechbrain/__pycache__/core.cpython-38.pyc -------------------------------------------------------------------------------- /speechbrain/alignment/__init__.py: -------------------------------------------------------------------------------- 1 | """Tools for aligning transcripts and speech signals 2 | """ 3 | -------------------------------------------------------------------------------- /speechbrain/alignment/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/speechbrain/alignment/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /speechbrain/alignment/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/speechbrain/alignment/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /speechbrain/dataio/__init__.py: -------------------------------------------------------------------------------- 1 | """Data loading and dataset preprocessing 2 | """ 3 | import os 4 | 5 | __all__ = [] 6 | for filename in os.listdir(os.path.dirname(__file__)): 7 | filename = os.path.basename(filename) 8 | if filename.endswith(".py") and not filename.startswith("__"): 9 | __all__.append(filename[:-3]) 10 | 11 | from . 
import * # noqa 12 | -------------------------------------------------------------------------------- /speechbrain/dataio/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/speechbrain/dataio/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /speechbrain/dataio/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/speechbrain/dataio/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /speechbrain/dataio/__pycache__/batch.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/speechbrain/dataio/__pycache__/batch.cpython-37.pyc -------------------------------------------------------------------------------- /speechbrain/dataio/__pycache__/batch.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/speechbrain/dataio/__pycache__/batch.cpython-38.pyc -------------------------------------------------------------------------------- /speechbrain/dataio/__pycache__/dataio.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/speechbrain/dataio/__pycache__/dataio.cpython-37.pyc -------------------------------------------------------------------------------- /speechbrain/dataio/__pycache__/dataio.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/speechbrain/dataio/__pycache__/dataio.cpython-38.pyc -------------------------------------------------------------------------------- /speechbrain/dataio/__pycache__/dataloader.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/speechbrain/dataio/__pycache__/dataloader.cpython-37.pyc -------------------------------------------------------------------------------- /speechbrain/dataio/__pycache__/dataloader.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/speechbrain/dataio/__pycache__/dataloader.cpython-38.pyc -------------------------------------------------------------------------------- /speechbrain/dataio/__pycache__/dataset.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/speechbrain/dataio/__pycache__/dataset.cpython-37.pyc 
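--------------------------------------------------------------------------------
The speechbrain/dataio/__init__.py listed above builds __all__ from the .py files it finds next to itself and then runs "from . import *", so every dataio submodule is imported into the package namespace. A small usage sketch of what this makes reachable (PaddedBatch is assumed to be defined in dataio/batch.py, as in upstream SpeechBrain):

import speechbrain

# The submodules were imported dynamically by dataio/__init__.py,
# so they are reachable as attributes without further imports:
print(speechbrain.dataio.batch)    # the batch submodule
print(speechbrain.dataio.dataset)  # the dataset submodule
padded_batch_cls = speechbrain.dataio.batch.PaddedBatch  # assumed class name from upstream SpeechBrain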
-------------------------------------------------------------------------------- /speechbrain/dataio/__pycache__/dataset.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/speechbrain/dataio/__pycache__/dataset.cpython-38.pyc -------------------------------------------------------------------------------- /speechbrain/dataio/__pycache__/encoder.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/speechbrain/dataio/__pycache__/encoder.cpython-37.pyc -------------------------------------------------------------------------------- /speechbrain/dataio/__pycache__/encoder.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/speechbrain/dataio/__pycache__/encoder.cpython-38.pyc -------------------------------------------------------------------------------- /speechbrain/dataio/__pycache__/iterators.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/speechbrain/dataio/__pycache__/iterators.cpython-37.pyc -------------------------------------------------------------------------------- /speechbrain/dataio/__pycache__/iterators.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/speechbrain/dataio/__pycache__/iterators.cpython-38.pyc -------------------------------------------------------------------------------- /speechbrain/dataio/__pycache__/legacy.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/speechbrain/dataio/__pycache__/legacy.cpython-37.pyc -------------------------------------------------------------------------------- /speechbrain/dataio/__pycache__/legacy.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/speechbrain/dataio/__pycache__/legacy.cpython-38.pyc -------------------------------------------------------------------------------- /speechbrain/dataio/__pycache__/preprocess.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/speechbrain/dataio/__pycache__/preprocess.cpython-37.pyc -------------------------------------------------------------------------------- /speechbrain/dataio/__pycache__/preprocess.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/speechbrain/dataio/__pycache__/preprocess.cpython-38.pyc -------------------------------------------------------------------------------- 
/speechbrain/dataio/__pycache__/sampler.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/speechbrain/dataio/__pycache__/sampler.cpython-37.pyc -------------------------------------------------------------------------------- /speechbrain/dataio/__pycache__/sampler.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/speechbrain/dataio/__pycache__/sampler.cpython-38.pyc -------------------------------------------------------------------------------- /speechbrain/dataio/__pycache__/wer.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/speechbrain/dataio/__pycache__/wer.cpython-37.pyc -------------------------------------------------------------------------------- /speechbrain/dataio/__pycache__/wer.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/speechbrain/dataio/__pycache__/wer.cpython-38.pyc -------------------------------------------------------------------------------- /speechbrain/dataio/preprocess.py: -------------------------------------------------------------------------------- 1 | """Preprocessors for audio""" 2 | import torch 3 | import functools 4 | from speechbrain.processing.speech_augmentation import Resample 5 | 6 | 7 | class AudioNormalizer: 8 | """Normalizes audio into a standard format 9 | 10 | Arguments 11 | --------- 12 | sample_rate : int 13 | The sampling rate to which the incoming signals should be converted. 14 | mix : {"avg-to-mono", "keep"} 15 | "avg-to-mono" - add all channels together and normalize by number of 16 | channels. This also removes the channel dimension, resulting in [time] 17 | format tensor. 18 | "keep" - don't normalize channel information 19 | 20 | Example 21 | ------- 22 | >>> import torchaudio 23 | >>> example_file = 'samples/audio_samples/example_multichannel.wav' 24 | >>> signal, sr = torchaudio.load(example_file, channels_first = False) 25 | >>> normalizer = AudioNormalizer(sample_rate=8000) 26 | >>> normalized = normalizer(signal, sr) 27 | >>> signal.shape 28 | torch.Size([33882, 2]) 29 | >>> normalized.shape 30 | torch.Size([16941]) 31 | 32 | NOTE 33 | ---- 34 | This will also upsample audio. However, upsampling cannot produce meaningful 35 | information in the bandwidth which it adds. Generally models will not work 36 | well for upsampled data if they have not specifically been trained to do so. 37 | """ 38 | 39 | def __init__(self, sample_rate=16000, mix="avg-to-mono"): 40 | self.sample_rate = sample_rate 41 | if mix not in ["avg-to-mono", "keep"]: 42 | raise ValueError(f"Unexpected mixing configuration {mix}") 43 | self.mix = mix 44 | self._cached_resample = functools.lru_cache(maxsize=12)(Resample) 45 | 46 | def __call__(self, audio, sample_rate): 47 | """Perform normalization 48 | 49 | Arguments 50 | --------- 51 | audio : tensor 52 | The input waveform torch tensor. Assuming [time, channels], 53 | or [time]. 
54 | """ 55 | resampler = self._cached_resample(sample_rate, self.sample_rate) 56 | resampled = resampler(audio.unsqueeze(0)).squeeze(0) 57 | return self._mix(resampled) 58 | 59 | def _mix(self, audio): 60 | """Handle channel mixing""" 61 | flat_input = audio.dim() == 1 62 | if self.mix == "avg-to-mono": 63 | if flat_input: 64 | return audio 65 | return torch.mean(audio, 1) 66 | if self.mix == "keep": 67 | return audio 68 | -------------------------------------------------------------------------------- /speechbrain/decoders/__init__.py: -------------------------------------------------------------------------------- 1 | """ Package containing the different decoders (ctc, beamsearch ...) 2 | """ 3 | from .seq2seq import * # noqa 4 | from .ctc import * # noqa 5 | -------------------------------------------------------------------------------- /speechbrain/decoders/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/speechbrain/decoders/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /speechbrain/decoders/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/speechbrain/decoders/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /speechbrain/decoders/__pycache__/ctc.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/speechbrain/decoders/__pycache__/ctc.cpython-37.pyc -------------------------------------------------------------------------------- /speechbrain/decoders/__pycache__/ctc.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/speechbrain/decoders/__pycache__/ctc.cpython-38.pyc -------------------------------------------------------------------------------- /speechbrain/decoders/__pycache__/seq2seq.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/speechbrain/decoders/__pycache__/seq2seq.cpython-37.pyc -------------------------------------------------------------------------------- /speechbrain/decoders/__pycache__/seq2seq.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/speechbrain/decoders/__pycache__/seq2seq.cpython-38.pyc -------------------------------------------------------------------------------- /speechbrain/lm/__init__.py: -------------------------------------------------------------------------------- 1 | """ Package defining language models 2 | """ 3 | -------------------------------------------------------------------------------- /speechbrain/lm/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/speechbrain/lm/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /speechbrain/lm/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/speechbrain/lm/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /speechbrain/lobes/__init__.py: -------------------------------------------------------------------------------- 1 | """ Package defining common blocks (DNN models, processing ...) 2 | 3 | This subpackage gathers higher level blocks, or "lobes". 4 | The classes here may leverage the extended YAML syntax. 5 | """ 6 | from . import models # noqa 7 | -------------------------------------------------------------------------------- /speechbrain/lobes/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/speechbrain/lobes/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /speechbrain/lobes/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/speechbrain/lobes/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /speechbrain/lobes/__pycache__/augment.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/speechbrain/lobes/__pycache__/augment.cpython-38.pyc -------------------------------------------------------------------------------- /speechbrain/lobes/beamform_multimic.py: -------------------------------------------------------------------------------- 1 | """Beamformer for multi-mic processing. 2 | 3 | Authors 4 | * Nauman Dawalatabad 5 | """ 6 | import torch 7 | from speechbrain.processing.features import ( 8 | STFT, 9 | ISTFT, 10 | ) 11 | 12 | from speechbrain.processing.multi_mic import ( 13 | Covariance, 14 | GccPhat, 15 | DelaySum, 16 | ) 17 | 18 | 19 | class DelaySum_Beamformer(torch.nn.Module): 20 | """Generate beamformed signal from multi-mic data using DelaySum beamforming. 21 | 22 | Arguments 23 | --------- 24 | sampling_rate : int (default: 16000) 25 | Sampling rate of audio signals. 26 | """ 27 | 28 | def __init__(self, sampling_rate=16000): 29 | super().__init__() 30 | self.fs = sampling_rate 31 | self.stft = STFT(sample_rate=self.fs) 32 | self.cov = Covariance() 33 | self.gccphat = GccPhat() 34 | self.delaysum = DelaySum() 35 | self.istft = ISTFT(sample_rate=self.fs) 36 | 37 | def forward(self, mics_signals): 38 | """Returns beamformed signal using multi-mic data. 39 | 40 | Arguments 41 | --------- 42 | mics_signals : tensor 43 | Set of audio signals to be transformed.
44 | """ 45 | with torch.no_grad(): 46 | 47 | Xs = self.stft(mics_signals) 48 | XXs = self.cov(Xs) 49 | tdoas = self.gccphat(XXs) 50 | Ys_ds = self.delaysum(Xs, tdoas) 51 | sig = self.istft(Ys_ds) 52 | 53 | return sig 54 | -------------------------------------------------------------------------------- /speechbrain/lobes/models/VanillaNN.py: -------------------------------------------------------------------------------- 1 | """Vanilla Neural Network for simple tests. 2 | 3 | Authors 4 | * Elena Rastorgueva 2020 5 | """ 6 | import torch 7 | import speechbrain as sb 8 | 9 | 10 | class VanillaNN(sb.nnet.containers.Sequential): 11 | """A simple vanilla Deep Neural Network. 12 | 13 | Arguments 14 | --------- 15 | activation : torch class 16 | A class used for constructing the activation layers. 17 | dnn_blocks : int 18 | The number of linear neural blocks to include. 19 | dnn_neurons : int 20 | The number of neurons in the linear layers. 21 | 22 | Example 23 | ------- 24 | >>> inputs = torch.rand([10, 120, 60]) 25 | >>> model = VanillaNN(input_shape=inputs.shape) 26 | >>> outputs = model(inputs) 27 | >>> outputs.shape 28 | torch.Size([10, 120, 512]) 29 | """ 30 | 31 | def __init__( 32 | self, 33 | input_shape, 34 | activation=torch.nn.LeakyReLU, 35 | dnn_blocks=2, 36 | dnn_neurons=512, 37 | ): 38 | super().__init__(input_shape=input_shape) 39 | 40 | for block_index in range(dnn_blocks): 41 | self.append( 42 | sb.nnet.linear.Linear, 43 | n_neurons=dnn_neurons, 44 | bias=True, 45 | layer_name="linear", 46 | ) 47 | self.append(activation(), layer_name="act") 48 | -------------------------------------------------------------------------------- /speechbrain/lobes/models/__init__.py: -------------------------------------------------------------------------------- 1 | """ Package defining neural network models (CRDNN, Xvectors ...)
2 | """ 3 | -------------------------------------------------------------------------------- /speechbrain/lobes/models/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/speechbrain/lobes/models/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /speechbrain/lobes/models/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/speechbrain/lobes/models/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /speechbrain/lobes/models/__pycache__/conv_tasnet.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/speechbrain/lobes/models/__pycache__/conv_tasnet.cpython-38.pyc -------------------------------------------------------------------------------- /speechbrain/lobes/models/__pycache__/dual_path.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/speechbrain/lobes/models/__pycache__/dual_path.cpython-37.pyc -------------------------------------------------------------------------------- /speechbrain/lobes/models/__pycache__/dual_path.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/speechbrain/lobes/models/__pycache__/dual_path.cpython-38.pyc -------------------------------------------------------------------------------- /speechbrain/lobes/models/__pycache__/dual_path2.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/speechbrain/lobes/models/__pycache__/dual_path2.cpython-38.pyc -------------------------------------------------------------------------------- /speechbrain/lobes/models/__pycache__/dual_path_context.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/speechbrain/lobes/models/__pycache__/dual_path_context.cpython-38.pyc -------------------------------------------------------------------------------- /speechbrain/lobes/models/__pycache__/dual_path_conv.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/speechbrain/lobes/models/__pycache__/dual_path_conv.cpython-38.pyc -------------------------------------------------------------------------------- /speechbrain/lobes/models/__pycache__/dual_path_conv2.cpython-38.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/speechbrain/lobes/models/__pycache__/dual_path_conv2.cpython-38.pyc -------------------------------------------------------------------------------- /speechbrain/lobes/models/__pycache__/dual_path_multi_scale.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/speechbrain/lobes/models/__pycache__/dual_path_multi_scale.cpython-38.pyc -------------------------------------------------------------------------------- /speechbrain/lobes/models/__pycache__/dual_path_splitnet.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/speechbrain/lobes/models/__pycache__/dual_path_splitnet.cpython-38.pyc -------------------------------------------------------------------------------- /speechbrain/lobes/models/__pycache__/dual_path_splitnet_exchange.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/speechbrain/lobes/models/__pycache__/dual_path_splitnet_exchange.cpython-38.pyc -------------------------------------------------------------------------------- /speechbrain/lobes/models/__pycache__/galr.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/speechbrain/lobes/models/__pycache__/galr.cpython-38.pyc -------------------------------------------------------------------------------- /speechbrain/lobes/models/__pycache__/norms.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/speechbrain/lobes/models/__pycache__/norms.cpython-38.pyc -------------------------------------------------------------------------------- /speechbrain/lobes/models/__pycache__/torch_utils.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/speechbrain/lobes/models/__pycache__/torch_utils.cpython-38.pyc -------------------------------------------------------------------------------- /speechbrain/lobes/models/__pycache__/u_net.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/speechbrain/lobes/models/__pycache__/u_net.cpython-38.pyc -------------------------------------------------------------------------------- /speechbrain/lobes/models/__pycache__/unet.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/speechbrain/lobes/models/__pycache__/unet.cpython-38.pyc -------------------------------------------------------------------------------- 
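A short usage sketch for the DelaySum_Beamformer defined in speechbrain/lobes/beamform_multimic.py above. The [batch, time, channels] input layout is an assumption based on the STFT/ISTFT blocks it wraps; this is an illustration, not part of the library:

import torch
from speechbrain.lobes.beamform_multimic import DelaySum_Beamformer

# One second of hypothetical 4-microphone audio at 16 kHz, shaped [batch, time, channels].
mics = torch.rand(1, 16000, 4)

beamformer = DelaySum_Beamformer(sampling_rate=16000)
enhanced = beamformer(mics)  # beamformed signal, computed under torch.no_grad() inside forward()
--------------------------------------------------------------------------------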
/speechbrain/lobes/models/transformer/__init__.py: -------------------------------------------------------------------------------- 1 | """High level processing blocks. 2 | 3 | This subpackage gathers higher level blocks, or "lobes". 4 | The classes here may leverage the extended YAML syntax. 5 | """ 6 | -------------------------------------------------------------------------------- /speechbrain/lobes/models/transformer/__pycache__/Conformer.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/speechbrain/lobes/models/transformer/__pycache__/Conformer.cpython-37.pyc -------------------------------------------------------------------------------- /speechbrain/lobes/models/transformer/__pycache__/Conformer.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/speechbrain/lobes/models/transformer/__pycache__/Conformer.cpython-38.pyc -------------------------------------------------------------------------------- /speechbrain/lobes/models/transformer/__pycache__/Transformer.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/speechbrain/lobes/models/transformer/__pycache__/Transformer.cpython-37.pyc -------------------------------------------------------------------------------- /speechbrain/lobes/models/transformer/__pycache__/Transformer.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/speechbrain/lobes/models/transformer/__pycache__/Transformer.cpython-38.pyc -------------------------------------------------------------------------------- /speechbrain/lobes/models/transformer/__pycache__/Transformer_GALR.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/speechbrain/lobes/models/transformer/__pycache__/Transformer_GALR.cpython-38.pyc -------------------------------------------------------------------------------- /speechbrain/lobes/models/transformer/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/speechbrain/lobes/models/transformer/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /speechbrain/lobes/models/transformer/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/speechbrain/lobes/models/transformer/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /speechbrain/log-config.yaml: -------------------------------------------------------------------------------- 1 | version: 1 2 | disable_existing_loggers: False 3 | formatters: 4 | 
simple: 5 | format: "%(asctime)s - %(name)s - %(levelname)s - %(message)s" 6 | console: 7 | format: "%(name)s - %(message)s" 8 | 9 | handlers: 10 | console: 11 | class: speechbrain.utils.logger.TqdmCompatibleStreamHandler 12 | level: INFO 13 | formatter: console 14 | stream: ext://sys.stdout 15 | 16 | file_handler: 17 | class: logging.FileHandler 18 | level: DEBUG 19 | formatter: simple 20 | filename: log.txt 21 | encoding: utf8 22 | 23 | root: 24 | level: DEBUG 25 | handlers: [console, file_handler] 26 | -------------------------------------------------------------------------------- /speechbrain/nnet/__init__.py: -------------------------------------------------------------------------------- 1 | """ Package containing the different neural networks layers 2 | """ 3 | import os 4 | 5 | __all__ = [] 6 | for filename in os.listdir(os.path.dirname(__file__)): 7 | filename = os.path.basename(filename) 8 | if filename.endswith(".py") and not filename.startswith("__"): 9 | __all__.append(filename[:-3]) 10 | 11 | from . import * # noqa 12 | from .loss import stoi_loss # noqa 13 | -------------------------------------------------------------------------------- /speechbrain/nnet/__pycache__/CNN.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/speechbrain/nnet/__pycache__/CNN.cpython-37.pyc -------------------------------------------------------------------------------- /speechbrain/nnet/__pycache__/CNN.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/speechbrain/nnet/__pycache__/CNN.cpython-38.pyc -------------------------------------------------------------------------------- /speechbrain/nnet/__pycache__/RNN.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/speechbrain/nnet/__pycache__/RNN.cpython-37.pyc -------------------------------------------------------------------------------- /speechbrain/nnet/__pycache__/RNN.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/speechbrain/nnet/__pycache__/RNN.cpython-38.pyc -------------------------------------------------------------------------------- /speechbrain/nnet/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/speechbrain/nnet/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /speechbrain/nnet/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/speechbrain/nnet/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /speechbrain/nnet/__pycache__/activations.cpython-37.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/speechbrain/nnet/__pycache__/activations.cpython-37.pyc -------------------------------------------------------------------------------- /speechbrain/nnet/__pycache__/activations.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/speechbrain/nnet/__pycache__/activations.cpython-38.pyc -------------------------------------------------------------------------------- /speechbrain/nnet/__pycache__/attention.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/speechbrain/nnet/__pycache__/attention.cpython-37.pyc -------------------------------------------------------------------------------- /speechbrain/nnet/__pycache__/attention.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/speechbrain/nnet/__pycache__/attention.cpython-38.pyc -------------------------------------------------------------------------------- /speechbrain/nnet/__pycache__/containers.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/speechbrain/nnet/__pycache__/containers.cpython-37.pyc -------------------------------------------------------------------------------- /speechbrain/nnet/__pycache__/containers.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/speechbrain/nnet/__pycache__/containers.cpython-38.pyc -------------------------------------------------------------------------------- /speechbrain/nnet/__pycache__/dropout.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/speechbrain/nnet/__pycache__/dropout.cpython-37.pyc -------------------------------------------------------------------------------- /speechbrain/nnet/__pycache__/dropout.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/speechbrain/nnet/__pycache__/dropout.cpython-38.pyc -------------------------------------------------------------------------------- /speechbrain/nnet/__pycache__/embedding.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/speechbrain/nnet/__pycache__/embedding.cpython-37.pyc -------------------------------------------------------------------------------- /speechbrain/nnet/__pycache__/embedding.cpython-38.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/speechbrain/nnet/__pycache__/embedding.cpython-38.pyc -------------------------------------------------------------------------------- /speechbrain/nnet/__pycache__/linear.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/speechbrain/nnet/__pycache__/linear.cpython-37.pyc -------------------------------------------------------------------------------- /speechbrain/nnet/__pycache__/linear.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/speechbrain/nnet/__pycache__/linear.cpython-38.pyc -------------------------------------------------------------------------------- /speechbrain/nnet/__pycache__/losses.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/speechbrain/nnet/__pycache__/losses.cpython-37.pyc -------------------------------------------------------------------------------- /speechbrain/nnet/__pycache__/losses.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/speechbrain/nnet/__pycache__/losses.cpython-38.pyc -------------------------------------------------------------------------------- /speechbrain/nnet/__pycache__/normalization.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/speechbrain/nnet/__pycache__/normalization.cpython-37.pyc -------------------------------------------------------------------------------- /speechbrain/nnet/__pycache__/normalization.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/speechbrain/nnet/__pycache__/normalization.cpython-38.pyc -------------------------------------------------------------------------------- /speechbrain/nnet/__pycache__/pooling.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/speechbrain/nnet/__pycache__/pooling.cpython-37.pyc -------------------------------------------------------------------------------- /speechbrain/nnet/__pycache__/pooling.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/speechbrain/nnet/__pycache__/pooling.cpython-38.pyc -------------------------------------------------------------------------------- /speechbrain/nnet/__pycache__/schedulers.cpython-37.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/speechbrain/nnet/__pycache__/schedulers.cpython-37.pyc -------------------------------------------------------------------------------- /speechbrain/nnet/__pycache__/schedulers.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/speechbrain/nnet/__pycache__/schedulers.cpython-38.pyc -------------------------------------------------------------------------------- /speechbrain/nnet/complex_networks/__init__.py: -------------------------------------------------------------------------------- 1 | """Package containing complex neural networks 2 | """ 3 | -------------------------------------------------------------------------------- /speechbrain/nnet/dropout.py: -------------------------------------------------------------------------------- 1 | """Library implementing dropout. 2 | 3 | Authors 4 | * Mirco Ravanelli 2020 5 | """ 6 | import torch # noqa: F401 7 | import logging 8 | import torch.nn as nn 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | 13 | class Dropout2d(nn.Module): 14 | """This function implements dropout 2d. It randomly put zeros on 15 | entire channels. 16 | 17 | 18 | Arguments 19 | --------- 20 | dropout_rate : float 21 | It is the dropout factor (between 0 and 1). 22 | inplace : bool 23 | If True, it uses inplace operations. 24 | 25 | Example 26 | ------- 27 | >>> drop = Dropout2d(drop_rate=0.5) 28 | >>> inputs = torch.rand(10, 50, 40) 29 | >>> output=drop(inputs) 30 | >>> output.shape 31 | torch.Size([10, 50, 40]) 32 | """ 33 | 34 | def __init__( 35 | self, drop_rate, inplace=False, 36 | ): 37 | super().__init__() 38 | self.drop_rate = drop_rate 39 | self.inplace = inplace 40 | self.drop = nn.Dropout2d(p=self.drop_rate, inplace=self.inplace) 41 | 42 | def forward(self, x): 43 | """Applies dropout 2d to the input tensor. 44 | 45 | Arguments 46 | --------- 47 | x : torch.Tensor (batch, time, channel1, channel2) 48 | input to normalize. 4d tensors are expected. 49 | """ 50 | 51 | # time must be the last 52 | x = x.transpose(1, 2).transpose(2, -1) 53 | x_drop = self.drop(x) 54 | x_drop = x_drop.transpose(-1, 1).transpose(2, -1) 55 | 56 | return x_drop 57 | -------------------------------------------------------------------------------- /speechbrain/nnet/loss/__init__.py: -------------------------------------------------------------------------------- 1 | """Package containing specific losses (transducer, stoi ...) 
2 | """ 3 | -------------------------------------------------------------------------------- /speechbrain/nnet/loss/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/speechbrain/nnet/loss/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /speechbrain/nnet/loss/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/speechbrain/nnet/loss/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /speechbrain/nnet/loss/__pycache__/stoi_loss.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/speechbrain/nnet/loss/__pycache__/stoi_loss.cpython-37.pyc -------------------------------------------------------------------------------- /speechbrain/nnet/loss/__pycache__/stoi_loss.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/speechbrain/nnet/loss/__pycache__/stoi_loss.cpython-38.pyc -------------------------------------------------------------------------------- /speechbrain/nnet/quaternion_networks/__init__.py: -------------------------------------------------------------------------------- 1 | """Package containing quaternion neural networks 2 | """ 3 | -------------------------------------------------------------------------------- /speechbrain/nnet/transducer/__init__.py: -------------------------------------------------------------------------------- 1 | """Package containing transducer neural networks 2 | """ 3 | -------------------------------------------------------------------------------- /speechbrain/pretrained/__init__.py: -------------------------------------------------------------------------------- 1 | """Pretrained models""" 2 | 3 | from .interfaces import * # noqa 4 | -------------------------------------------------------------------------------- /speechbrain/pretrained/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/speechbrain/pretrained/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /speechbrain/pretrained/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/speechbrain/pretrained/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /speechbrain/pretrained/__pycache__/fetching.cpython-37.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/speechbrain/pretrained/__pycache__/fetching.cpython-37.pyc -------------------------------------------------------------------------------- /speechbrain/pretrained/__pycache__/fetching.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/speechbrain/pretrained/__pycache__/fetching.cpython-38.pyc -------------------------------------------------------------------------------- /speechbrain/pretrained/__pycache__/interfaces.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/speechbrain/pretrained/__pycache__/interfaces.cpython-37.pyc -------------------------------------------------------------------------------- /speechbrain/pretrained/__pycache__/interfaces.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/speechbrain/pretrained/__pycache__/interfaces.cpython-38.pyc -------------------------------------------------------------------------------- /speechbrain/processing/__init__.py: -------------------------------------------------------------------------------- 1 | """ Package containing various techniques of speech processing 2 | """ 3 | -------------------------------------------------------------------------------- /speechbrain/processing/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/speechbrain/processing/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /speechbrain/processing/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/speechbrain/processing/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /speechbrain/processing/__pycache__/signal_processing.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/speechbrain/processing/__pycache__/signal_processing.cpython-37.pyc -------------------------------------------------------------------------------- /speechbrain/processing/__pycache__/signal_processing.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/speechbrain/processing/__pycache__/signal_processing.cpython-38.pyc -------------------------------------------------------------------------------- /speechbrain/processing/__pycache__/speech_augmentation.cpython-37.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/speechbrain/processing/__pycache__/speech_augmentation.cpython-37.pyc -------------------------------------------------------------------------------- /speechbrain/processing/__pycache__/speech_augmentation.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/speechbrain/processing/__pycache__/speech_augmentation.cpython-38.pyc -------------------------------------------------------------------------------- /speechbrain/tokenizers/__init__.py: -------------------------------------------------------------------------------- 1 | """ Package defining the SentencePiece tokenizer 2 | """ 3 | -------------------------------------------------------------------------------- /speechbrain/tokenizers/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/speechbrain/tokenizers/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /speechbrain/tokenizers/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/speechbrain/tokenizers/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /speechbrain/utils/Accuracy.py: -------------------------------------------------------------------------------- 1 | """Calculate accuracy. 2 | 3 | Authors 4 | * Jianyuan Zhong 2020 5 | """ 6 | import torch 7 | from speechbrain.dataio.dataio import length_to_mask 8 | 9 | 10 | def Accuracy(log_probabilities, targets, length=None): 11 | """Calculates the accuracy for predicted log probabilities and targets in a batch. 12 | 13 | Arguments 14 | ---------- 15 | log_probabilities : tensor 16 | Predicted log probabilities (batch_size, time, feature). 17 | targets : tensor 18 | Target (batch_size, time). 19 | length : tensor 20 | Length of target (batch_size,). 21 | 22 | Example 23 | ------- 24 | >>> probs = torch.tensor([[0.9, 0.1], [0.1, 0.9], [0.8, 0.2]]).unsqueeze(0) 25 | >>> acc = Accuracy(torch.log(probs), torch.tensor([1, 1, 0]).unsqueeze(0), torch.tensor([2/3])) 26 | >>> print(acc) 27 | (1.0, 2.0) 28 | """ 29 | if length is not None: 30 | mask = length_to_mask( 31 | length * targets.shape[1], max_len=targets.shape[1], 32 | ).bool() 33 | if len(targets.shape) == 3: 34 | mask = mask.unsqueeze(2).repeat(1, 1, targets.shape[2]) 35 | 36 | padded_pred = log_probabilities.argmax(-1) 37 | 38 | if length is not None: 39 | numerator = torch.sum( 40 | padded_pred.masked_select(mask) == targets.masked_select(mask) 41 | ) 42 | denominator = torch.sum(mask) 43 | else: 44 | numerator = torch.sum(padded_pred == targets) 45 | denominator = targets.shape[1] 46 | return float(numerator), float(denominator) 47 | 48 | 49 | class AccuracyStats: 50 | """Module for calculate the overall one-step-forward prediction accuracy. 
51 | 52 | Example 53 | ------- 54 | >>> probs = torch.tensor([[0.9, 0.1], [0.1, 0.9], [0.8, 0.2]]).unsqueeze(0) 55 | >>> stats = AccuracyStats() 56 | >>> stats.append(torch.log(probs), torch.tensor([1, 1, 0]).unsqueeze(0), torch.tensor([2/3])) 57 | >>> acc = stats.summarize() 58 | >>> print(acc) 59 | 0.5 60 | """ 61 | 62 | def __init__(self): 63 | self.correct = 0 64 | self.total = 0 65 | 66 | def append(self, log_probabilities, targets, length=None): 67 | """This function is for updating the stats according to the prediction 68 | and target in the current batch. 69 | 70 | Arguments 71 | ---------- 72 | log_probabilities : tensor 73 | Predicted log probabilities (batch_size, time, feature). 74 | targets : tensor 75 | Target (batch_size, time). 76 | length: tensor 77 | Length of target (batch_size,). 78 | """ 79 | numerator, denominator = Accuracy(log_probabilities, targets, length) 80 | self.correct += numerator 81 | self.total += denominator 82 | 83 | def summarize(self): 84 | return self.correct / self.total 85 | -------------------------------------------------------------------------------- /speechbrain/utils/__init__.py: -------------------------------------------------------------------------------- 1 | """ Package containing various tools (accuracy, checkpoints ...) 2 | """ 3 | import os 4 | 5 | __all__ = [] 6 | for filename in os.listdir(os.path.dirname(__file__)): 7 | filename = os.path.basename(filename) 8 | if filename.endswith(".py") and not filename.startswith("__"): 9 | __all__.append(filename[:-3]) 10 | 11 | from . import * # noqa 12 | -------------------------------------------------------------------------------- /speechbrain/utils/__pycache__/Accuracy.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/speechbrain/utils/__pycache__/Accuracy.cpython-37.pyc -------------------------------------------------------------------------------- /speechbrain/utils/__pycache__/Accuracy.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/speechbrain/utils/__pycache__/Accuracy.cpython-38.pyc -------------------------------------------------------------------------------- /speechbrain/utils/__pycache__/DER.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/speechbrain/utils/__pycache__/DER.cpython-37.pyc -------------------------------------------------------------------------------- /speechbrain/utils/__pycache__/DER.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/speechbrain/utils/__pycache__/DER.cpython-38.pyc -------------------------------------------------------------------------------- /speechbrain/utils/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/speechbrain/utils/__pycache__/__init__.cpython-37.pyc 
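The `Accuracy` function and `AccuracyStats` class above follow the SpeechBrain convention of relative lengths: the `length` tensor holds fractions of the padded time axis, and only the unpadded positions count towards the score. A minimal sketch of how the stats accumulate over a batch (not part of the repository; it assumes SpeechBrain is importable, and shapes/values are illustrative only):

```python
import torch
from speechbrain.utils.Accuracy import AccuracyStats

# Two utterances, 4 decoding steps, 5 output classes.
log_probs = torch.log_softmax(torch.randn(2, 4, 5), dim=-1)
targets = torch.randint(0, 5, (2, 4))
lengths = torch.tensor([1.0, 0.5])  # second utterance: only the first 2 steps are valid

stats = AccuracyStats()
stats.append(log_probs, targets, lengths)  # padded steps are masked out of both counts
print(stats.summarize())  # correct / total over the 4 + 2 = 6 unmasked steps
```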
-------------------------------------------------------------------------------- /speechbrain/utils/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/speechbrain/utils/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /speechbrain/utils/__pycache__/bleu.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/speechbrain/utils/__pycache__/bleu.cpython-37.pyc -------------------------------------------------------------------------------- /speechbrain/utils/__pycache__/bleu.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/speechbrain/utils/__pycache__/bleu.cpython-38.pyc -------------------------------------------------------------------------------- /speechbrain/utils/__pycache__/callchains.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/speechbrain/utils/__pycache__/callchains.cpython-37.pyc -------------------------------------------------------------------------------- /speechbrain/utils/__pycache__/callchains.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/speechbrain/utils/__pycache__/callchains.cpython-38.pyc -------------------------------------------------------------------------------- /speechbrain/utils/__pycache__/checkpoints.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/speechbrain/utils/__pycache__/checkpoints.cpython-37.pyc -------------------------------------------------------------------------------- /speechbrain/utils/__pycache__/checkpoints.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/speechbrain/utils/__pycache__/checkpoints.cpython-38.pyc -------------------------------------------------------------------------------- /speechbrain/utils/__pycache__/data_pipeline.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/speechbrain/utils/__pycache__/data_pipeline.cpython-37.pyc -------------------------------------------------------------------------------- /speechbrain/utils/__pycache__/data_pipeline.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/speechbrain/utils/__pycache__/data_pipeline.cpython-38.pyc 
-------------------------------------------------------------------------------- /speechbrain/utils/__pycache__/data_utils.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/speechbrain/utils/__pycache__/data_utils.cpython-37.pyc -------------------------------------------------------------------------------- /speechbrain/utils/__pycache__/data_utils.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/speechbrain/utils/__pycache__/data_utils.cpython-38.pyc -------------------------------------------------------------------------------- /speechbrain/utils/__pycache__/depgraph.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/speechbrain/utils/__pycache__/depgraph.cpython-37.pyc -------------------------------------------------------------------------------- /speechbrain/utils/__pycache__/depgraph.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/speechbrain/utils/__pycache__/depgraph.cpython-38.pyc -------------------------------------------------------------------------------- /speechbrain/utils/__pycache__/distributed.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/speechbrain/utils/__pycache__/distributed.cpython-37.pyc -------------------------------------------------------------------------------- /speechbrain/utils/__pycache__/distributed.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/speechbrain/utils/__pycache__/distributed.cpython-38.pyc -------------------------------------------------------------------------------- /speechbrain/utils/__pycache__/edit_distance.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/speechbrain/utils/__pycache__/edit_distance.cpython-37.pyc -------------------------------------------------------------------------------- /speechbrain/utils/__pycache__/edit_distance.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/speechbrain/utils/__pycache__/edit_distance.cpython-38.pyc -------------------------------------------------------------------------------- /speechbrain/utils/__pycache__/epoch_loop.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/speechbrain/utils/__pycache__/epoch_loop.cpython-37.pyc 
-------------------------------------------------------------------------------- /speechbrain/utils/__pycache__/epoch_loop.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/speechbrain/utils/__pycache__/epoch_loop.cpython-38.pyc -------------------------------------------------------------------------------- /speechbrain/utils/__pycache__/logger.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/speechbrain/utils/__pycache__/logger.cpython-37.pyc -------------------------------------------------------------------------------- /speechbrain/utils/__pycache__/logger.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/speechbrain/utils/__pycache__/logger.cpython-38.pyc -------------------------------------------------------------------------------- /speechbrain/utils/__pycache__/metric_stats.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/speechbrain/utils/__pycache__/metric_stats.cpython-37.pyc -------------------------------------------------------------------------------- /speechbrain/utils/__pycache__/metric_stats.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/speechbrain/utils/__pycache__/metric_stats.cpython-38.pyc -------------------------------------------------------------------------------- /speechbrain/utils/__pycache__/parameter_transfer.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/speechbrain/utils/__pycache__/parameter_transfer.cpython-37.pyc -------------------------------------------------------------------------------- /speechbrain/utils/__pycache__/parameter_transfer.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/speechbrain/utils/__pycache__/parameter_transfer.cpython-38.pyc -------------------------------------------------------------------------------- /speechbrain/utils/__pycache__/superpowers.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/speechbrain/utils/__pycache__/superpowers.cpython-37.pyc -------------------------------------------------------------------------------- /speechbrain/utils/__pycache__/superpowers.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/speechbrain/utils/__pycache__/superpowers.cpython-38.pyc 
-------------------------------------------------------------------------------- /speechbrain/utils/__pycache__/torch_audio_backend.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/speechbrain/utils/__pycache__/torch_audio_backend.cpython-37.pyc -------------------------------------------------------------------------------- /speechbrain/utils/__pycache__/torch_audio_backend.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/speechbrain/utils/__pycache__/torch_audio_backend.cpython-38.pyc -------------------------------------------------------------------------------- /speechbrain/utils/__pycache__/train_logger.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/speechbrain/utils/__pycache__/train_logger.cpython-37.pyc -------------------------------------------------------------------------------- /speechbrain/utils/__pycache__/train_logger.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/speechbrain/utils/__pycache__/train_logger.cpython-38.pyc -------------------------------------------------------------------------------- /speechbrain/utils/callchains.py: -------------------------------------------------------------------------------- 1 | """Chaining together callables, if some require relative lengths""" 2 | import inspect 3 | 4 | 5 | def lengths_arg_exists(func): 6 | """Returns True if func takes ``lengths`` keyword argument. 7 | 8 | Arguments 9 | --------- 10 | func : callable 11 | The function, method, or other callable to search for the lengths arg. 12 | """ 13 | spec = inspect.getfullargspec(func) 14 | return "lengths" in spec.args + spec.kwonlyargs 15 | 16 | 17 | class LengthsCapableChain: 18 | """Chain together callables. Can handle relative lengths. 19 | 20 | This is a more light-weight version of 21 | speechbrain.nnet.containers.LengthsCapableSequential 22 | 23 | Arguments 24 | --------- 25 | *funcs : list, optional 26 | Any number of functions or other callables, given in order of 27 | execution. 28 | 29 | Returns 30 | ------- 31 | Any 32 | The input as processed by each function. If no functions were given, 33 | simply returns the input. 34 | """ 35 | 36 | def __init__(self, *funcs): 37 | self.funcs = [] 38 | self.takes_lengths = [] 39 | for func in funcs: 40 | self.append(func) 41 | 42 | def __call__(self, x, lengths=None): 43 | """Run the chain of callables on the given input 44 | 45 | Arguments 46 | --------- 47 | x : Any 48 | The main input 49 | lengths : Any 50 | The lengths argument which will be conditionally passed to 51 | any functions in the chain that take a 'lengths' argument. 52 | In SpeechBrain the convention is to use relative lengths. 53 | 54 | Note 55 | ---- 56 | By convention, if a callable in the chain returns multiple outputs 57 | (returns a tuple), only the first output is passed to the next 58 | callable in the chain. 
59 | """ 60 | if not self.funcs: 61 | return x 62 | for func, give_lengths in zip(self.funcs, self.takes_lengths): 63 | if give_lengths: 64 | x = func(x, lengths) 65 | else: 66 | x = func(x) 67 | if isinstance(x, tuple): 68 | x = x[0] 69 | return x 70 | 71 | def append(self, func): 72 | """Add a function to the chain""" 73 | self.funcs.append(func) 74 | self.takes_lengths.append(lengths_arg_exists(func)) 75 | 76 | def __str__(self): 77 | clsname = self.__class__.__name__ 78 | if self.funcs: 79 | return f"{clsname}:\n" + "\n".join(str(f) for f in self.funcs) 80 | else: 81 | return f"Empty {clsname}" 82 | -------------------------------------------------------------------------------- /speechbrain/utils/superpowers.py: -------------------------------------------------------------------------------- 1 | """Superpowers which should be rarely used. 2 | 3 | This library contains functions for running shell commands. 4 | Remember, with great power comes great 5 | responsibility. 6 | 7 | Authors 8 | * Mirco Ravanelli 2020 9 | """ 10 | 11 | import logging 12 | import subprocess 13 | 14 | logger = logging.getLogger(__name__) 15 | 16 | 17 | def run_shell(cmd): 18 | r"""This function can be used to run a command in the bash shell. 19 | 20 | Arguments 21 | --------- 22 | cmd : str 23 | Shell command to run. 24 | 25 | Returns 26 | ------- 27 | bytes 28 | The captured standard output. 29 | bytes 30 | The captured standard error. 31 | int 32 | The returncode. 33 | 34 | Raises 35 | ------ 36 | OSError 37 | If returncode is not 0, i.e., command failed. 38 | 39 | Example 40 | ------- 41 | >>> out, err, code = run_shell("echo 'hello world'") 42 | >>> out.decode(errors="ignore") 43 | 'hello world\n' 44 | """ 45 | 46 | # Executing the command 47 | p = subprocess.Popen( 48 | cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True 49 | ) 50 | 51 | # Capturing standard output and error 52 | (output, err) = p.communicate() 53 | 54 | if p.returncode != 0: 55 | raise OSError(err.decode(errors="replace")) 56 | 57 | # Adding information in the logger 58 | msg = output.decode(errors="replace") + "\n" + err.decode(errors="replace") 59 | logger.debug(msg) 60 | 61 | return output, err, p.returncode 62 | -------------------------------------------------------------------------------- /speechbrain/utils/torch_audio_backend.py: -------------------------------------------------------------------------------- 1 | import platform 2 | 3 | 4 | def get_torchaudio_backend(): 5 | """Get the backend for torchaudio between soundfile and sox_io according to the OS. 6 | 7 | Allow users to use soundfile or sox_io according to their OS. 8 | 9 | Returns 10 | ------- 11 | str 12 | The torchaudio backend to use. 13 | """ 14 | current_system = platform.system() 15 | if current_system == "Windows": 16 | return "soundfile" 17 | else: 18 | return "sox_io" 19 | -------------------------------------------------------------------------------- /speechbrain/version.txt: -------------------------------------------------------------------------------- 1 | 0.5.10 2 | -------------------------------------------------------------------------------- /templates/README.md: -------------------------------------------------------------------------------- 1 | Templates 2 | --------- 3 | 4 | These templates should serve as a good starting point for developing new 5 | recipes with the SpeechBrain toolkit. They are simple, well-documented, 6 | and contain all the parts necessary for a working recipe.
They cover 7 | a broad spectrum of types of tasks that are encountered in speech 8 | research, such as sequence regression (enhancement), sequence 9 | to sequence (speech_recognition), and sequence classification (speaker ID). 10 | -------------------------------------------------------------------------------- /templates/enhancement/README.md: -------------------------------------------------------------------------------- 1 | # Template for Speech Enhancement 2 | 3 | This folder provides a working, well-documented example for training 4 | a speech enhancement model from scratch, based on a few hours of 5 | data. The data we use is from Mini Librispeech + OpenRIR. 6 | 7 | There are four files here: 8 | 9 | * `train.py`: the main code file, outlines entire training process. 10 | * `train.yaml`: the hyperparameters file, sets all parameters of execution. 11 | * `custom_model.py`: A file containing the definition of a PyTorch module. 12 | * `mini_librispeech_prepare.py`: If necessary, downloads and prepares data 13 | manifests. 14 | 15 | To train an enhancement model, just execute the following on the command-line: 16 | 17 | ```bash 18 | python train.py train.yaml --data_folder /path/to/save/mini_librispeech 19 | ``` 20 | 21 | This will automatically download and prepare the data manifest for mini 22 | librispeech, and then train a model with dynamically generated noisy 23 | samples, using noise, reverberation, and babble. 24 | 25 | More details about what each file does and how to make modifications 26 | are found within each file. The whole folder can be copied and used 27 | as a starting point for developing recipes doing regression tasks 28 | similar to speech enhancement. Please reach out to the SpeechBrain 29 | team if any errors are found or clarification is needed about how 30 | parts of the template work. Good Luck! 31 | 32 | [For more information, please take a look into the "Speech Enhancement from scratch" tutorial](https://colab.research.google.com/drive/18RyiuKupAhwWX7fh3LCatwQGU5eIS3TR?usp=sharing) 33 | -------------------------------------------------------------------------------- /templates/enhancement/custom_model.py: -------------------------------------------------------------------------------- 1 | """ 2 | This file contains a very simple PyTorch module to use for enhancement. 3 | 4 | To replace this model, change the `!new:` tag in the hyperparameter file 5 | to refer to a built-in SpeechBrain model or another file containing 6 | a custom PyTorch module. 7 | 8 | Authors 9 | * Peter Plantinga 2021 10 | """ 11 | import torch 12 | 13 | 14 | class CustomModel(torch.nn.Module): 15 | """Basic RNN model with projection layers between RNN layers. 16 | 17 | Arguments 18 | --------- 19 | input_size : int 20 | Size of the expected input in the 3rd dimension. 21 | rnn_size : int 22 | Number of neurons to use in rnn (for each direction -> and <-). 23 | projection : int 24 | Number of neurons in projection layer. 25 | layers : int 26 | Number of RNN layers to use. 27 | """ 28 | 29 | def __init__(self, input_size, rnn_size=256, projection=128, layers=2): 30 | super().__init__() 31 | self.layers = torch.nn.ModuleList() 32 | 33 | # Alternate RNN and projection layers. 
34 | for i in range(layers): 35 | self.layers.append( 36 | torch.nn.LSTM( 37 | input_size=input_size if i == 0 else projection, 38 | hidden_size=rnn_size, 39 | bidirectional=True, 40 | ) 41 | ) 42 | 43 | # Projection layer reduces size, except last layer, which 44 | # goes back to input size to create the mask 45 | linear_size = input_size if i == layers - 1 else projection 46 | self.layers.append( 47 | torch.nn.Linear( 48 | in_features=rnn_size * 2, out_features=linear_size, 49 | ) 50 | ) 51 | 52 | # Use ReLU to make sure outputs aren't negative (unhelpful for masking) 53 | self.layers.append(torch.nn.ReLU()) 54 | 55 | def forward(self, x): 56 | """Shift to time-first, pass layers, then back to batch-first.""" 57 | x = x.transpose(0, 1) 58 | for layer in self.layers: 59 | x = layer(x) 60 | if isinstance(x, tuple): 61 | x = x[0] 62 | x = x.transpose(0, 1) 63 | return x 64 | -------------------------------------------------------------------------------- /templates/speaker_id/README.md: -------------------------------------------------------------------------------- 1 | # Template for Speaker Identification 2 | 3 | This folder provides a working, well-documented example for training 4 | a speaker identification model from scratch, based on a few hours of 5 | data. The data we use is from Mini Librispeech + OpenRIR. 6 | 7 | There are four files here: 8 | 9 | * `train.py`: the main code file, outlines the entire training process. 10 | * `train.yaml`: the hyperparameters file, sets all parameters of execution. 11 | * `custom_model.py`: A file containing the definition of a PyTorch module. 12 | * `mini_librispeech_prepare.py`: If necessary, downloads and prepares data manifests. 13 | 14 | To train the speaker-id model, just execute the following on the command-line: 15 | 16 | ```bash 17 | python train.py train.yaml 18 | ``` 19 | 20 | This will automatically download and prepare the data manifest for mini 21 | librispeech, and then train a model with dynamically augmented samples. 22 | 23 | More details about what each file does and how to make modifications 24 | are found within each file. The whole folder can be copied and used 25 | as a starting point for developing recipes doing classification tasks 26 | similar to speech speaker-id (e.g, language-id, emotion classification, ..). 27 | Please reach out to the SpeechBrain 28 | team if any errors are found or clarification is needed about how 29 | parts of the template work. Good Luck! 30 | 31 | [For more information, please take a look into the "speaker-id from scratch" tutorial](https://colab.research.google.com/drive/1UwisnAjr8nQF3UnrkIJ4abBMAWzVwBMh?usp=sharing) 32 | -------------------------------------------------------------------------------- /templates/speech_recognition/ASR/README.md: -------------------------------------------------------------------------------- 1 | # Template for Speech Recognition 2 | This folder provides a working, well-documented example for training 3 | a seq2seq (+ CTC) speech recognizer model from scratch, based on a few hours of data. 4 | 5 | There are three files here: 6 | 7 | * `train.py`: the main code file, outlines the entire training process. 8 | * `train.yaml`: the hyperparameters file, sets all parameters of execution. 9 | * `mini_librispeech_prepare.py`: If necessary, downloads and prepares data manifests. 
10 | 11 | To train the speech recognition model, just execute the following on the command-line: 12 | 13 | ```bash 14 | python train.py train.yaml 15 | ``` 16 | 17 | We assume you already trained the tokenizer (see ../Tokenizer) and the language model (../LM). 18 | Training is done with the mini-librispeech dataset using a CRDNN model for encoding and a GRU for decoding. 19 | We pre-train with a larger model to ensure convergence (mini-librispeech is too small for training an e2e model from scratch). 20 | In a real case, you can skip pre-training and train from scratch on a larger dataset. 21 | 22 | -------------------------------------------------------------------------------- /templates/speech_recognition/ASR/mini_librispeech_prepare.py: -------------------------------------------------------------------------------- 1 | ../mini_librispeech_prepare.py -------------------------------------------------------------------------------- /templates/speech_recognition/LM/README.md: -------------------------------------------------------------------------------- 1 | # Language Model 2 | This folder contains a recipe for training language models. 3 | It supports both an RNN-based LM and a Transformer-based LM. 4 | The scripts rely on the HuggingFace `datasets` library, which manages data reading and loading from large text corpora. 5 | Training an LM on large text corpora might take weeks (or months) even on modern GPUs. In this template, for simplicity, we only use the training transcriptions of the mini-librispeech dataset. In the recipes, we assume you 6 | already ran the tokenizer training (see ../Tokenizer). 7 | 8 | # Extra Dependency: 9 | Make sure you have the HuggingFace `datasets` library installed. If not, type: 10 | pip install datasets 11 | 12 | # How to run: 13 | python train.py RNNLM.yaml 14 | -------------------------------------------------------------------------------- /templates/speech_recognition/LM/extra_requirements.txt: -------------------------------------------------------------------------------- 1 | # huggingface dataset 2 | datasets 3 | -------------------------------------------------------------------------------- /templates/speech_recognition/README.md: -------------------------------------------------------------------------------- 1 | # Training a Speech Recognizer 2 | 3 | This template implements a simple speech recognizer trained on mini-librispeech. In particular, it implements an offline end-to-end attention-based speech recognizer. A tokenizer is used to determine the word tokens to estimate. Search relies on beam search coupled with an RNN language model. 4 | 5 | Training such a system requires the following steps: 6 | 7 | 1. Train a tokenizer. 8 | Given the training transcriptions, the tokenizer decides which word pieces to allocate for training. The most atomic units are characters; the least atomic units are words. Most of the time, it is convenient to use tokens that are something in between characters and full words. 9 | SpeechBrain relies on the popular [SentencePiece](https://github.com/google/sentencepiece) for tokenization. To train the tokenizer: 10 | 11 | ``` 12 | cd Tokenizer 13 | python train.py tokenizer.yaml 14 | ``` 15 | 16 | 2. Train an LM 17 | Once the target tokens are defined, we can train a language model on top of them. To do it, we need a large text corpus (better if its language domain matches that of your target application).
In this example, we simply train the LM on top of the training transcriptions: 18 | 19 | ``` 20 | cd ../LM 21 | python train.py RNNLM.yaml 22 | ``` 23 | 24 | In a real case, training an LM is extremely computationally demanding. It is thus good practice to re-use existing LMs or fine-tune them. 25 | 26 | 3. Train the speech recognizer 27 | At this point, we can train our speech recognizer. In this case, we are using a simple CRDNN model with an autoregressive GRU decoder. An attention mechanism is employed between the encoder and the decoder. The final sequence of words is retrieved with beam search coupled with the RNN LM trained in the previous step. To train the ASR: 28 | 29 | ``` 30 | cd ../ASR 31 | python train.py train.yaml 32 | ``` 33 | 34 | This template can help you figure out how to set up SpeechBrain for implementing an e2e speech recognizer. However, in a real case, the system must be trained with much more data to provide acceptable performance. For a competitive recipe with more data, see for instance our recipes on LibriSpeech (https://github.com/speechbrain/speechbrain/tree/develop/recipes/LibriSpeech/ASR). 35 | 36 | [For more information, please take a look into the "ASR from scratch" tutorial](https://colab.research.google.com/drive/1aFgzrUv3udM_gNJNUoLaHIm78QHtxdIz?usp=sharing) 37 | -------------------------------------------------------------------------------- /templates/speech_recognition/Tokenizer/README.md: -------------------------------------------------------------------------------- 1 | # Tokenizer 2 | This folder contains the scripts to train a tokenizer using SentencePiece (https://github.com/google/sentencepiece). 3 | The tokenizer is trained on top of the training transcriptions. 4 | 5 | # How to run 6 | python train.py tokenizer.yaml 7 | -------------------------------------------------------------------------------- /templates/speech_recognition/Tokenizer/mini_librispeech_prepare.py: -------------------------------------------------------------------------------- 1 | ../mini_librispeech_prepare.py -------------------------------------------------------------------------------- /templates/speech_recognition/Tokenizer/tokenizer.yaml: -------------------------------------------------------------------------------- 1 | # ############################################################################ 2 | # Tokenizer: subword BPE tokenizer with unigram 1K 3 | # Training: Mini-LibriSpeech 4 | # Authors: Abdel Heba 2021 5 | # Mirco Ravanelli 2021 6 | # ############################################################################ 7 | 8 | 9 | # Set up folders for reading from and writing to 10 | data_folder: ../data 11 | output_folder: ./save 12 | 13 | # Path where data-specification files are stored 14 | train_annotation: ../train.json 15 | valid_annotation: ../valid.json 16 | test_annotation: ../test.json 17 | 18 | # Tokenizer parameters 19 | token_type: unigram # ["unigram", "bpe", "char"] 20 | token_output: 1000 # index(blank/eos/bos/unk) = 0 21 | character_coverage: 1.0 22 | annotation_read: words # field to read 23 | 24 | # Tokenizer object 25 | tokenizer: !name:speechbrain.tokenizers.SentencePiece.SentencePiece 26 | model_dir: !ref <output_folder> 27 | vocab_size: !ref <token_output> 28 | annotation_train: !ref <train_annotation> 29 | annotation_read: !ref <annotation_read> 30 | model_type: !ref <token_type> # ["unigram", "bpe", "char"] 31 | character_coverage: !ref <character_coverage> 32 | annotation_list_to_check: [!ref <train_annotation>, !ref <valid_annotation>] 33 | annotation_format: json 34 | --------------------------------------------------------------------------------
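A note on the HyperPyYAML tags used throughout these recipe files: `!ref <key>` copies the value of another node, `!new:` builds an object immediately at load time, and `!name:` stores a partially applied constructor to be called later, which is exactly what `train.py` below does with `hparams["tokenizer"]()`. A minimal sketch of how such a file resolves (not part of the recipe; it assumes `hyperpyyaml` and SpeechBrain with its SentencePiece dependency are installed, and the inline YAML string is illustrative only):

```python
from hyperpyyaml import load_hyperpyyaml

yaml_string = """
token_output: 1000
token_type: unigram
tokenizer: !name:speechbrain.tokenizers.SentencePiece.SentencePiece
    vocab_size: !ref <token_output>
    model_type: !ref <token_type>
"""

hparams = load_hyperpyyaml(yaml_string)
print(hparams["token_output"])  # 1000, a plain YAML scalar
# hparams["tokenizer"] is a partially applied constructor; supplying the remaining
# arguments (model_dir, annotation_train, ...) and calling it would start training.
```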
/templates/speech_recognition/Tokenizer/train.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env/python3 2 | """Script for training a BPE tokenizer on the top of CSV or JSON annotation files. 3 | The tokenizer converts words into sub-word units that can be used to train a 4 | language (LM) or an acoustic model (AM). 5 | When doing a speech recognition experiment you have to make 6 | sure that the acoustic and language models are trained with 7 | the same tokenizer. Otherwise, a token mismatch is introduced 8 | and beamsearch will produce bad results when combining AM and LM. 9 | 10 | To run this recipe, do the following: 11 | > python train.py tokenizer.yaml 12 | 13 | 14 | Authors 15 | * Abdel Heba 2021 16 | * Mirco Ravanelli 2021 17 | """ 18 | 19 | import sys 20 | import speechbrain as sb 21 | from hyperpyyaml import load_hyperpyyaml 22 | from mini_librispeech_prepare import prepare_mini_librispeech 23 | 24 | if __name__ == "__main__": 25 | 26 | # Load hyperparameters file with command-line overrides 27 | hparams_file, run_opts, overrides = sb.parse_arguments(sys.argv[1:]) 28 | with open(hparams_file) as fin: 29 | hparams = load_hyperpyyaml(fin, overrides) 30 | 31 | # Create experiment directory 32 | sb.create_experiment_directory( 33 | experiment_directory=hparams["output_folder"], 34 | hyperparams_to_save=hparams_file, 35 | overrides=overrides, 36 | ) 37 | 38 | # Data preparation, to be run on only one process. 39 | prepare_mini_librispeech( 40 | data_folder=hparams["data_folder"], 41 | save_json_train=hparams["train_annotation"], 42 | save_json_valid=hparams["valid_annotation"], 43 | save_json_test=hparams["test_annotation"], 44 | ) 45 | 46 | # Train tokenizer 47 | hparams["tokenizer"]() 48 | -------------------------------------------------------------------------------- /tests/.run-doctests.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e -u -o pipefail 3 | 4 | # To run doctests locally, the easiest approach is to do: 5 | # > pytest --doctest-modules speechbrain/ 6 | # However, we take this more complex approach to avoid testing files not 7 | # tracked by git. We filter out tests that require optional dependencies. 
8 | avoid="transducer_loss.py\|fairseq_wav2vec.py\|huggingface_wav2vec.py\|bleu.py" 9 | git ls-files speechbrain | grep -e "\.py$" | grep -v $avoid | xargs pytest --doctest-modules 10 | -------------------------------------------------------------------------------- /tests/.run-linters.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e -u -o pipefail 3 | 4 | echo "===Black===" 5 | git ls-files | grep -E "\.py$" | xargs black --check --diff 6 | echo "===Flake8===" 7 | git ls-files | grep -E "\.py$" | xargs flake8 --count --statistics 8 | echo "===Yamllint===" 9 | git ls-files | grep -E "\.yaml$|\.yml$" | xargs yamllint --no-warnings 10 | -------------------------------------------------------------------------------- /tests/.run-unittests.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e -u -o pipefail 3 | 4 | git ls-files tests/unittests | grep -e "\.py$" | xargs pytest 5 | -------------------------------------------------------------------------------- /tests/integration/neural_networks/ASR_CTC/hyperparams.yaml: -------------------------------------------------------------------------------- 1 | # Seed needs to be set at top of yaml, before objects with parameters are made 2 | # NOTE: Seed does not guarantee replicability with CTC 3 | seed: 1234 4 | __set_seed: !apply:torch.manual_seed [!ref ] 5 | 6 | # Training params 7 | N_epochs: 15 8 | lr: 0.002 9 | dataloader_options: 10 | batch_size: 1 11 | 12 | # Special tokens and labels 13 | blank_index: 0 14 | num_labels: 44 15 | 16 | 17 | # Model parameters 18 | activation: !name:torch.nn.LeakyReLU [] 19 | dropout: 0.15 20 | cnn_blocks: 1 21 | cnn_channels: (16,) 22 | cnn_kernelsize: (3, 3) 23 | rnn_layers: 1 24 | rnn_neurons: 128 25 | rnn_bidirectional: True 26 | dnn_blocks: 1 27 | dnn_neurons: 128 28 | 29 | compute_features: !new:speechbrain.lobes.features.MFCC 30 | 31 | mean_var_norm: !new:speechbrain.processing.features.InputNormalization 32 | norm_type: global 33 | 34 | model: !new:speechbrain.lobes.models.CRDNN.CRDNN 35 | input_shape: [null, null, 660] 36 | activation: !ref 37 | dropout: !ref 38 | cnn_blocks: !ref 39 | cnn_channels: !ref 40 | cnn_kernelsize: !ref 41 | time_pooling: True 42 | rnn_layers: !ref 43 | rnn_neurons: !ref 44 | rnn_bidirectional: !ref 45 | dnn_blocks: !ref 46 | dnn_neurons: !ref 47 | 48 | lin: !new:speechbrain.nnet.linear.Linear 49 | input_size: !ref 50 | n_neurons: !ref 51 | bias: False 52 | 53 | softmax: !new:speechbrain.nnet.activations.Softmax 54 | apply_log: True 55 | 56 | compute_cost: !name:speechbrain.nnet.losses.ctc_loss 57 | blank_index: !ref 58 | 59 | modules: 60 | model: !ref 61 | lin: !ref 62 | mean_var_norm: !ref 63 | 64 | opt_class: !name:torch.optim.Adam 65 | lr: !ref 66 | 67 | per_stats: !name:speechbrain.utils.metric_stats.ErrorRateStats 68 | -------------------------------------------------------------------------------- /tests/integration/neural_networks/ASR_CTC/hyperparams_complex_net.yaml: -------------------------------------------------------------------------------- 1 | # Seed needs to be set at top of yaml, before objects with parameters are made 2 | # NOTE: Seed does not guarantee replicability with CTC 3 | seed: 1234 4 | __set_seed: !apply:torch.manual_seed [!ref ] 5 | 6 | # Training params 7 | N_epochs: 25 8 | lr: 0.002 9 | dataloader_options: 10 | batch_size: 1 11 | 12 | # Special tokens and labels 13 | blank_index: 0 14 | num_labels: 44 # 43 phonemes + 1 blank 15 
16 | # Model parameters 17 | activation: !new:torch.nn.LeakyReLU 18 | 19 | compute_features: !new:speechbrain.lobes.features.MFCC 20 | 21 | mean_var_norm: !new:speechbrain.processing.features.InputNormalization 22 | norm_type: global 23 | 24 | 25 | model: !new:speechbrain.nnet.containers.Sequential 26 | input_shape: [null, null, 660] # input_size 27 | conv1: !name:speechbrain.nnet.complex_networks.c_CNN.CConv1d 28 | out_channels: 16 29 | kernel_size: 3 30 | nrm1: !name:speechbrain.nnet.complex_networks.c_normalization.CLayerNorm 31 | act1: !ref <activation> 32 | conv2: !name:speechbrain.nnet.complex_networks.c_CNN.CConv1d 33 | out_channels: 32 34 | kernel_size: 3 35 | nrm2: !name:speechbrain.nnet.complex_networks.c_normalization.CLayerNorm 36 | act2: !ref <activation> 37 | pooling: !new:speechbrain.nnet.pooling.Pooling1d 38 | pool_type: "avg" 39 | kernel_size: 3 40 | RNN: !name:speechbrain.nnet.complex_networks.c_RNN.CLiGRU 41 | hidden_size: 64 42 | bidirectional: True 43 | 44 | lin: !new:speechbrain.nnet.linear.Linear 45 | input_size: 256 46 | n_neurons: !ref <num_labels> 47 | bias: False 48 | 49 | softmax: !new:speechbrain.nnet.activations.Softmax 50 | apply_log: True 51 | 52 | modules: 53 | model: !ref <model> 54 | lin: !ref <lin> 55 | mean_var_norm: !ref <mean_var_norm> 56 | 57 | 58 | opt_class: !name:torch.optim.Adam 59 | lr: !ref <lr> 60 | 61 | compute_cost: !name:speechbrain.nnet.losses.ctc_loss 62 | blank_index: !ref <blank_index> 63 | 64 | per_stats: !name:speechbrain.utils.metric_stats.ErrorRateStats 65 | -------------------------------------------------------------------------------- /tests/integration/neural_networks/ASR_CTC/hyperparams_quaternion_net.yaml: -------------------------------------------------------------------------------- 1 | # Seed needs to be set at top of yaml, before objects with parameters are made 2 | # NOTE: Seed does not guarantee replicability with CTC 3 | seed: 1234 4 | __set_seed: !apply:torch.manual_seed [!ref <seed>] 5 | 6 | # Training params 7 | N_epochs: 25 8 | lr: 0.002 9 | dataloader_options: 10 | batch_size: 1 11 | 12 | # Special tokens and labels 13 | blank_index: 0 14 | num_labels: 44 # 43 phonemes + 1 blank 15 | 16 | # Model parameters 17 | activation: !new:torch.nn.LeakyReLU 18 | 19 | compute_features: !new:speechbrain.lobes.features.MFCC 20 | 21 | mean_var_norm: !new:speechbrain.processing.features.InputNormalization 22 | norm_type: global 23 | 24 | 25 | model: !new:speechbrain.nnet.containers.Sequential 26 | input_shape: [null, null, 660] # input_size 27 | conv1: !name:speechbrain.nnet.quaternion_networks.q_CNN.QConv1d 28 | out_channels: 16 29 | kernel_size: 3 30 | act1: !ref <activation> 31 | conv2: !name:speechbrain.nnet.quaternion_networks.q_CNN.QConv1d 32 | out_channels: 32 33 | kernel_size: 3 34 | act2: !ref <activation> 35 | pooling: !new:speechbrain.nnet.pooling.Pooling1d 36 | pool_type: "avg" 37 | kernel_size: 3 38 | RNN: !name:speechbrain.nnet.quaternion_networks.q_RNN.QLiGRU 39 | hidden_size: 64 40 | bidirectional: True 41 | 42 | lin: !new:speechbrain.nnet.linear.Linear 43 | input_size: 512 # 64 * 2 (bidirectional) * 4 (quaternion) 44 | n_neurons: !ref <num_labels> 45 | bias: False 46 | 47 | softmax: !new:speechbrain.nnet.activations.Softmax 48 | apply_log: True 49 | 50 | modules: 51 | model: !ref <model> 52 | lin: !ref <lin> 53 | mean_var_norm: !ref <mean_var_norm> 54 | 55 | 56 | opt_class: !name:torch.optim.Adam 57 | lr: !ref <lr> 58 | 59 | compute_cost: !name:speechbrain.nnet.losses.ctc_loss 60 | blank_index: !ref <blank_index> 61 | 62 | per_stats: !name:speechbrain.utils.metric_stats.ErrorRateStats 63 | --------------------------------------------------------------------------------
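For reference (this note and sketch are not files in the repository): the ASR_CTC hyperparameter files above are plain hyperpyyaml documents, so every `!ref <name>` resolves to a value defined earlier in the same file, and every `!new:`/`!name:` entry is instantiated (or wrapped as a partial) when the YAML is loaded. A minimal sketch of how the integration tests consume such a file; the path and the override value here are assumptions.

from hyperpyyaml import load_hyperpyyaml

# Loading resolves all !ref <...> placeholders and builds the declared objects.
with open("tests/integration/neural_networks/ASR_CTC/hyperparams.yaml") as fin:
    hparams = load_hyperpyyaml(fin, overrides={"N_epochs": 1})  # overrides are optional

model = hparams["model"]          # CRDNN (or the complex/quaternion net) built with the referenced params
opt_class = hparams["opt_class"]  # partial of torch.optim.Adam with lr already bound
modules = hparams["modules"]      # dict of modules, ready to hand to a SpeechBrain Brain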
/tests/integration/neural_networks/ASR_DNN_HMM/hyperparams.yaml: -------------------------------------------------------------------------------- 1 | # Seed needs to be set at top of yaml, before objects with parameters are made 2 | seed: 1234 3 | __set_seed: !apply:torch.manual_seed [!ref ] 4 | 5 | # Training params 6 | N_epochs: 15 7 | lr: 0.002 8 | dataloader_options: 9 | batch_size: 1 10 | 11 | # Model parameters 12 | compute_features: !new:speechbrain.lobes.features.MFCC 13 | 14 | mean_var_norm: !new:speechbrain.processing.features.InputNormalization 15 | norm_type: global 16 | 17 | 18 | linear1: !new:speechbrain.nnet.linear.Linear 19 | input_size: 660 20 | n_neurons: 1024 21 | bias: False 22 | 23 | activation: !new:torch.nn.LeakyReLU 24 | 25 | linear2: !new:speechbrain.nnet.linear.Linear 26 | input_size: 1024 27 | n_neurons: 43 28 | bias: False 29 | 30 | softmax: !new:speechbrain.nnet.activations.Softmax 31 | apply_log: True 32 | 33 | modules: 34 | linear1: !ref 35 | linear2: !ref 36 | mean_var_norm: !ref 37 | 38 | opt_class: !name:torch.optim.Adam 39 | lr: !ref 40 | 41 | compute_cost: !name:speechbrain.nnet.losses.nll_loss 42 | 43 | error_stats: !name:speechbrain.utils.metric_stats.MetricStats 44 | metric: !name:speechbrain.nnet.losses.classification_error 45 | reduction: batch 46 | -------------------------------------------------------------------------------- /tests/integration/neural_networks/ASR_alignment_forward/hyperparams.yaml: -------------------------------------------------------------------------------- 1 | # Seed needs to be set at top of yaml, before objects with parameters are made 2 | seed: 1234 3 | __set_seed: !apply:torch.manual_seed [!ref ] 4 | 5 | # Training params 6 | N_epochs: 15 7 | lr: 0.004 8 | dataloader_options: 9 | batch_size: 1 10 | 11 | # Model parameters 12 | activation: !name:torch.nn.LeakyReLU 13 | dropout: 0.15 14 | cnn_blocks: 1 15 | cnn_channels: (16,) 16 | cnn_kernelsize: (3, 3) 17 | rnn_layers: 1 18 | rnn_neurons: 128 19 | rnn_bidirectional: True 20 | dnn_blocks: 1 21 | dnn_neurons: 128 22 | 23 | 24 | compute_features: !new:speechbrain.lobes.features.MFCC 25 | 26 | mean_var_norm: !new:speechbrain.processing.features.InputNormalization 27 | norm_type: global 28 | 29 | model: !new:speechbrain.lobes.models.CRDNN.CRDNN 30 | input_shape: [null, null, 660] 31 | activation: !ref 32 | dropout: !ref 33 | cnn_blocks: !ref 34 | cnn_channels: !ref 35 | cnn_kernelsize: !ref 36 | time_pooling: False 37 | rnn_layers: !ref 38 | rnn_neurons: !ref 39 | rnn_bidirectional: !ref 40 | dnn_blocks: !ref 41 | dnn_neurons: !ref 42 | 43 | lin: !new:speechbrain.nnet.linear.Linear 44 | input_size: !ref 45 | n_neurons: 43 # 43 phonemes, no blank 46 | bias: False 47 | 48 | modules: 49 | model: !ref 50 | lin: !ref 51 | mean_var_norm: !ref 52 | 53 | opt_class: !name:torch.optim.Adam 54 | lr: !ref 55 | 56 | softmax: !new:speechbrain.nnet.activations.Softmax 57 | apply_log: True 58 | 59 | aligner: !new:speechbrain.alignment.aligner.HMMAligner 60 | 61 | compute_cost: !name:speechbrain.nnet.losses.nll_loss 62 | -------------------------------------------------------------------------------- /tests/integration/neural_networks/ASR_alignment_viterbi/hyperparams.yaml: -------------------------------------------------------------------------------- 1 | # Seed needs to be set at top of yaml, before objects with parameters are made 2 | seed: 1234 3 | __set_seed: !apply:torch.manual_seed [!ref ] 4 | 5 | # Training params 6 | N_epochs: 10 7 | lr: 0.004 8 | dataloader_options: 9 | 
batch_size: 1 10 | 11 | # Model parameters 12 | activation: !name:torch.nn.LeakyReLU 13 | dropout: 0.15 14 | cnn_blocks: 1 15 | cnn_channels: (16,) 16 | cnn_kernelsize: (3, 3) 17 | rnn_layers: 1 18 | rnn_neurons: 128 19 | rnn_bidirectional: True 20 | dnn_blocks: 1 21 | dnn_neurons: 128 22 | 23 | 24 | compute_features: !new:speechbrain.lobes.features.MFCC 25 | 26 | mean_var_norm: !new:speechbrain.processing.features.InputNormalization 27 | norm_type: global 28 | 29 | model: !new:speechbrain.lobes.models.CRDNN.CRDNN 30 | input_shape: [null, null, 660] 31 | activation: !ref 32 | dropout: !ref 33 | cnn_blocks: !ref 34 | cnn_channels: !ref 35 | cnn_kernelsize: !ref 36 | time_pooling: False 37 | rnn_layers: !ref 38 | rnn_neurons: !ref 39 | rnn_bidirectional: !ref 40 | dnn_blocks: !ref 41 | dnn_neurons: !ref 42 | 43 | lin: !new:speechbrain.nnet.linear.Linear 44 | input_size: !ref 45 | n_neurons: 43 # 43 phonemes, no blank 46 | bias: False 47 | 48 | modules: 49 | model: !ref 50 | lin: !ref 51 | mean_var_norm: !ref 52 | 53 | opt_class: !name:torch.optim.Adam 54 | lr: !ref 55 | 56 | softmax: !new:speechbrain.nnet.activations.Softmax 57 | apply_log: True 58 | 59 | aligner: !new:speechbrain.alignment.aligner.HMMAligner 60 | 61 | compute_cost: !name:speechbrain.nnet.losses.nll_loss 62 | -------------------------------------------------------------------------------- /tests/integration/neural_networks/ASR_seq2seq/hyperparams.yaml: -------------------------------------------------------------------------------- 1 | # Seed needs to be set at top of yaml, before objects with parameters are made 2 | seed: 1234 3 | __set_seed: !apply:torch.manual_seed [!ref ] 4 | 5 | # Training Parameters 6 | N_epochs: 10 7 | lr: 0.002 8 | dataloader_options: 9 | batch_size: 1 10 | 11 | # token information 12 | bos_index: 0 # eos_index = bos_index + 1 13 | num_labels: 45 14 | 15 | # Model parameters 16 | activation: !name:torch.nn.LeakyReLU [] 17 | dropout: 0.15 18 | cnn_blocks: 1 19 | cnn_channels: (16,) 20 | cnn_kernelsize: (3, 3) 21 | rnn_layers: 1 22 | rnn_neurons: 128 23 | rnn_bidirectional: True 24 | dnn_blocks: 1 25 | dnn_neurons: 128 26 | 27 | compute_features: !new:speechbrain.lobes.features.MFCC 28 | 29 | mean_var_norm: !new:speechbrain.processing.features.InputNormalization 30 | norm_type: global 31 | 32 | enc: !new:speechbrain.lobes.models.CRDNN.CRDNN 33 | input_shape: [null, null, 660] 34 | activation: !ref 35 | dropout: !ref 36 | cnn_blocks: !ref 37 | cnn_channels: !ref 38 | cnn_kernelsize: !ref 39 | time_pooling: True 40 | rnn_layers: !ref 41 | rnn_neurons: !ref 42 | rnn_bidirectional: !ref 43 | dnn_blocks: !ref 44 | dnn_neurons: !ref 45 | 46 | lin: !new:speechbrain.nnet.linear.Linear 47 | input_size: !ref 48 | n_neurons: !ref 49 | bias: False 50 | 51 | emb: !new:speechbrain.nnet.embedding.Embedding 52 | num_embeddings: !ref 53 | embedding_dim: 128 54 | 55 | dec: !new:speechbrain.nnet.RNN.AttentionalRNNDecoder 56 | enc_dim: 128 57 | input_size: 128 58 | rnn_type: gru 59 | attn_type: content 60 | hidden_size: !ref 61 | attn_dim: !ref 62 | num_layers: 1 63 | 64 | softmax: !new:speechbrain.nnet.activations.Softmax 65 | apply_log: True 66 | 67 | modules: 68 | enc: !ref 69 | emb: !ref 70 | dec: !ref 71 | lin: !ref 72 | mean_var_norm: !ref 73 | 74 | opt_class: !name:torch.optim.Adam 75 | lr: !ref 76 | 77 | searcher: !new:speechbrain.decoders.seq2seq.S2SRNNGreedySearcher 78 | embedding: !ref 79 | decoder: !ref 80 | linear: !ref 81 | bos_index: !ref 82 | eos_index: !ref + 1 83 | min_decode_ratio: 0 84 | 
max_decode_ratio: 0.1 85 | 86 | compute_cost: !name:speechbrain.nnet.losses.nll_loss 87 | 88 | per_stats: !name:speechbrain.utils.metric_stats.ErrorRateStats 89 | -------------------------------------------------------------------------------- /tests/integration/neural_networks/G2P/hyperparams.yaml: -------------------------------------------------------------------------------- 1 | # Seed needs to be set at top of yaml, before objects with parameters are made 2 | seed: 1234 3 | __set_seed: !apply:torch.manual_seed [!ref <seed>] 4 | 5 | # Training Parameters 6 | N_epochs: 10 7 | lr: 0.002 8 | dataloader_options: 9 | batch_size: 1 10 | 11 | # token information 12 | bos_index: 0 # eos_index = bos_index + 1 13 | num_phns: 45 # 43 phonemes + 1 bos + 1 eos 14 | num_chars: 26 # 24 chars + 1 bos + 1 eos 15 | 16 | 17 | # Model parameters 18 | activation: !name:torch.nn.LeakyReLU 19 | rnn_layers: 1 20 | rnn_neurons: 128 21 | rnn_bidirectional: True 22 | 23 | enc: !new:speechbrain.nnet.RNN.LSTM 24 | input_shape: [null, null, 128] 25 | bidirectional: True 26 | hidden_size: 64 27 | num_layers: 1 28 | dropout: 0.0 29 | 30 | lin: !new:speechbrain.nnet.linear.Linear 31 | input_size: !ref <rnn_neurons> 32 | n_neurons: !ref <num_phns> 33 | bias: False 34 | 35 | encoder_emb: !new:speechbrain.nnet.embedding.Embedding 36 | num_embeddings: !ref <num_chars> 37 | embedding_dim: 128 38 | 39 | emb: !new:speechbrain.nnet.embedding.Embedding 40 | num_embeddings: !ref <num_phns> 41 | embedding_dim: 128 42 | 43 | dec: !new:speechbrain.nnet.RNN.AttentionalRNNDecoder 44 | enc_dim: 128 45 | input_size: 128 46 | rnn_type: gru 47 | attn_type: content 48 | hidden_size: !ref <rnn_neurons> 49 | attn_dim: !ref <rnn_neurons> 50 | num_layers: 1 51 | 52 | softmax: !new:speechbrain.nnet.activations.Softmax 53 | apply_log: True 54 | 55 | modules: 56 | enc: !ref <enc> 57 | emb: !ref <emb> 58 | dec: !ref <dec> 59 | lin: !ref <lin> 60 | 61 | opt_class: !name:torch.optim.Adam 62 | lr: !ref <lr> 63 | 64 | searcher: !new:speechbrain.decoders.seq2seq.S2SRNNGreedySearcher 65 | embedding: !ref <emb> 66 | decoder: !ref <dec> 67 | linear: !ref <lin> 68 | bos_index: !ref <bos_index> 69 | eos_index: !ref <bos_index> + 1 70 | min_decode_ratio: 0 71 | max_decode_ratio: 0.1 72 | 73 | compute_cost: !name:speechbrain.nnet.losses.nll_loss 74 | 75 | per_stats: !name:speechbrain.utils.metric_stats.ErrorRateStats 76 | -------------------------------------------------------------------------------- /tests/integration/neural_networks/LM_RNN/hyperparams.yaml: -------------------------------------------------------------------------------- 1 | # Seed needs to be set at top of yaml, before objects with parameters are made 2 | seed: 1234 3 | __set_seed: !apply:torch.manual_seed [!ref <seed>] 4 | 5 | # Training Parameters 6 | N_epochs: 30 7 | lr: 0.01 8 | dataloader_options: 9 | batch_size: 8 10 | 11 | # token information 12 | bos_index: 0 # eos_index = bos_index + 1 13 | num_chars: 26 # 24 chars + 1 bos + 1 eos 14 | 15 | # Model parameters 16 | rnn_layers: 1 17 | rnn_neurons: 256 18 | emb_size: 128 19 | dropout: 0.0 20 | 21 | model: !new:speechbrain.lobes.models.RNNLM.RNNLM 22 | output_neurons: !ref <num_chars> 23 | embedding_dim: !ref <emb_size> 24 | rnn_neurons: !ref <rnn_neurons> 25 | rnn_layers: !ref <rnn_layers> 26 | dropout: !ref <dropout> 27 | 28 | modules: {model: !ref <model>} 29 | 30 | opt_class: !name:torch.optim.Adam 31 | lr: !ref <lr> 32 | 33 | epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter 34 | limit: !ref <N_epochs> 35 | 36 | log_softmax: !new:speechbrain.nnet.activations.Softmax 37 | apply_log: True 38 | 39 | compute_cost: !name:speechbrain.nnet.losses.nll_loss 40 | --------------------------------------------------------------------------------
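For reference (not a repository file): these integration-test YAMLs all end with a modules: dict and an opt_class: partial because that is what speechbrain.core.Brain consumes. A minimal, hypothetical wiring sketch, mirroring the pattern of tests/unittests/test_core.py further down in this dump; the hyperparams path and the L1 loss are assumptions made only for illustration.

import torch
import speechbrain as sb
from hyperpyyaml import load_hyperpyyaml


class MinimalBrain(sb.Brain):
    def compute_forward(self, batch, stage):
        # Uses only the "model" entry of the modules dict taken from the YAML.
        return self.modules.model(batch[0])

    def compute_objectives(self, predictions, batch, stage):
        return torch.nn.functional.l1_loss(predictions, batch[1])


with open("hyperparams.yaml") as fin:  # e.g. the LM_RNN file above
    hparams = load_hyperpyyaml(fin)

brain = MinimalBrain(
    modules=hparams["modules"],      # e.g. {"model": RNNLM(...)} for LM_RNN
    opt_class=hparams["opt_class"],  # torch.optim.Adam partial with lr already bound
    hparams=hparams,
)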
/tests/integration/neural_networks/VAD/hyperparams.yaml: -------------------------------------------------------------------------------- 1 | # Seed needs to be set at top of yaml, before objects with parameters are made 2 | seed: 1234 3 | __set_seed: !apply:torch.manual_seed [!ref <seed>] 4 | sample_rate: 16000 5 | example_length: 2 6 | 7 | 8 | # Training params 9 | N_epochs: 15 10 | lr: 0.01 11 | dataloader_options: 12 | batch_size: 1 13 | 14 | # Feature parameters 15 | n_mfcc: 20 16 | 17 | # Model parameters 18 | rnn_layers: 2 19 | rnn_neurons: 256 20 | emb_size: 23 21 | dropout: 0.1 22 | output_neurons: 1 23 | 24 | compute_features: !new:speechbrain.lobes.features.MFCC 25 | n_mfcc: !ref <n_mfcc> 26 | 27 | mean_var_norm: !new:speechbrain.processing.features.InputNormalization 28 | norm_type: global 29 | 30 | rnn: !new:speechbrain.nnet.RNN.LSTM 31 | input_size: !ref <n_mfcc> * 33 # d & dd = *3, 5 left & 5 right = *11 32 | hidden_size: !ref <rnn_neurons> 33 | num_layers: !ref <rnn_layers> 34 | dropout: !ref <dropout> 35 | bidirectional: False 36 | re_init: True 37 | 38 | lin: !new:speechbrain.nnet.linear.Linear 39 | input_size: !ref <rnn_neurons> 40 | n_neurons: !ref <output_neurons> 41 | bias: False 42 | 43 | modules: 44 | rnn: !ref <rnn> 45 | lin: !ref <lin> 46 | mean_var_norm: !ref <mean_var_norm> 47 | 48 | opt_class: !name:torch.optim.Adam 49 | lr: !ref <lr> 50 | 51 | compute_BCE_cost: !name:speechbrain.nnet.losses.compute_masked_loss 52 | -------------------------------------------------------------------------------- /tests/integration/neural_networks/autoencoder/hyperparams.yaml: -------------------------------------------------------------------------------- 1 | # Basic parameters 2 | # Seed needs to be set at top of yaml, before objects with parameters are made 3 | seed: 1234 4 | __set_seed: !apply:torch.manual_seed [!ref <seed>] 5 | use_tensorboard: False 6 | tensorboard_logs: runs 7 | 8 | # Training params 9 | N_epochs: 100 10 | lr: 0.004 11 | dataloader_options: 12 | batch_size: 2 13 | 14 | 15 | compute_features: !new:speechbrain.lobes.features.MFCC 16 | left_frames: 1 17 | right_frames: 1 18 | 19 | mean_var_norm: !new:speechbrain.processing.features.InputNormalization 20 | norm_type: global 21 | 22 | linear1: !new:speechbrain.nnet.linear.Linear 23 | input_size: 180 24 | n_neurons: 128 25 | bias: False 26 | 27 | activation: !new:torch.nn.LeakyReLU 28 | 29 | linear2: !new:speechbrain.nnet.linear.Linear 30 | input_size: 128 31 | n_neurons: 180 32 | bias: False 33 | 34 | modules: 35 | linear1: !ref <linear1> 36 | linear2: !ref <linear2> 37 | mean_var_norm: !ref <mean_var_norm> 38 | 39 | opt_class: !name:torch.optim.Adam 40 | lr: !ref <lr> 41 | 42 | compute_cost: !name:speechbrain.nnet.losses.mse_loss 43 | 44 | loss_tracker: !name:speechbrain.utils.metric_stats.MetricStats 45 | metric: !name:speechbrain.nnet.losses.mse_loss 46 | reduction: batch 47 | -------------------------------------------------------------------------------- /tests/integration/neural_networks/enhance_GAN/hyperparams.yaml: -------------------------------------------------------------------------------- 1 | # Basic parameters 2 | # Seed needs to be set at top of yaml, before objects with parameters are made 3 | seed: 1234 4 | __set_seed: !apply:torch.manual_seed [!ref <seed>] 5 | 6 | # Training params 7 | N_epochs: 5 8 | lr: 0.004 9 | dataloader_options: 10 | batch_size: 2 11 | 12 | models: !include:models.yaml 13 | 14 | add_noise: !new:speechbrain.processing.speech_augmentation.AddNoise 15 | 16 | modules: 17 | generator: !ref <models[generator]> 18 | discriminator: !ref <models[discriminator]> 19 | 20 | g_opt_class: !name:torch.optim.Adam 21 | lr: !ref <lr> 22 | d_opt_class: !name:torch.optim.Adam 23 | lr: !ref <lr> / 4 24 | 25 |
compute_cost: !name:speechbrain.nnet.losses.mse_loss 26 | -------------------------------------------------------------------------------- /tests/integration/neural_networks/enhance_GAN/models.yaml: -------------------------------------------------------------------------------- 1 | generator: !new:speechbrain.nnet.containers.Sequential 2 | input_shape: [null, null, 1] 3 | conv1: !name:speechbrain.nnet.CNN.Conv1d 4 | out_channels: 32 5 | kernel_size: 11 6 | activation: !new:torch.nn.LeakyReLU 7 | conv2: !name:speechbrain.nnet.CNN.Conv1d 8 | out_channels: 1 9 | kernel_size: 11 10 | tanh: !new:torch.nn.Tanh 11 | 12 | discriminator: !new:speechbrain.nnet.containers.Sequential 13 | input_shape: [null, null, 1] 14 | conv1: !name:speechbrain.nnet.CNN.Conv1d 15 | out_channels: 32 16 | kernel_size: 11 17 | stride: 8 18 | activation: !new:torch.nn.LeakyReLU 19 | conv2: !name:speechbrain.nnet.CNN.Conv1d 20 | out_channels: 1 21 | kernel_size: 11 22 | stride: 8 23 | sigmoid: !new:torch.nn.Sigmoid 24 | -------------------------------------------------------------------------------- /tests/integration/neural_networks/separation/hyperparams.yaml: -------------------------------------------------------------------------------- 1 | # ################################ 2 | # Model: ConvTasnet for source separation 3 | # Data : Minimal Example 4 | # Author: Cem Subakan 5 | # ################################ 6 | 7 | 8 | # Basic parameters 9 | # Seed needs to be set at top of yaml, before objects with parameters are made 10 | seed: 1234 11 | __set_seed: !apply:torch.manual_seed [!ref ] 12 | output_folder: !ref results/conv_tasnet/ 13 | save_folder: !ref /save 14 | train_log: !ref /train_log.txt 15 | 16 | # Training params 17 | N_epochs: 150 18 | lr: 0.002 19 | dataloader_options: 20 | batch_size: 1 21 | 22 | mask_net: !new:speechbrain.lobes.models.conv_tasnet.MaskNet 23 | N: 32 24 | B: 32 25 | H: 32 26 | P: 3 27 | X: 1 28 | R: 2 29 | C: 2 30 | norm_type: 'gLN' 31 | causal: False 32 | mask_nonlinear: 'relu' 33 | 34 | encoder: !new:speechbrain.lobes.models.dual_path.Encoder 35 | kernel_size: 16 36 | out_channels: 32 37 | 38 | decoder: !new:speechbrain.lobes.models.dual_path.Decoder 39 | in_channels: 32 40 | out_channels: 1 41 | kernel_size: 16 42 | stride: 8 43 | bias: False 44 | 45 | modules: 46 | mask_net: !ref 47 | encoder: !ref 48 | decoder: !ref 49 | 50 | opt_class: !name:torch.optim.Adam 51 | lr: !ref 52 | 53 | epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter 54 | limit: !ref 55 | 56 | train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger 57 | save_file: !ref 58 | -------------------------------------------------------------------------------- /tests/integration/neural_networks/speaker_id/hyperparams.yaml: -------------------------------------------------------------------------------- 1 | # Basic parameters 2 | seed: 1234 3 | __set_seed: !apply:torch.manual_seed [!ref ] 4 | 5 | # Training params 6 | N_epochs: 10 7 | lr: 0.001 8 | dataloader_options: 9 | batch_size: 8 10 | 11 | # Feature parameters 12 | n_mels: 24 13 | left_frames: 0 14 | right_frames: 0 15 | deltas: False 16 | 17 | # Number of speakers 18 | num_spks: 2 19 | 20 | 21 | compute_features: !new:speechbrain.lobes.features.Fbank 22 | n_mels: !ref 23 | left_frames: !ref 24 | right_frames: !ref 25 | deltas: !ref 26 | 27 | xvector_model: !new:speechbrain.lobes.models.Xvector.Xvector 28 | in_channels: !ref 29 | activation: !name:torch.nn.LeakyReLU 30 | tdnn_blocks: 5 31 | tdnn_channels: [512, 512, 512, 512, 1500] 32 | 
tdnn_kernel_sizes: [5, 3, 3, 1, 1] 33 | tdnn_dilations: [1, 2, 3, 1, 1] 34 | lin_neurons: 512 35 | 36 | classifier: !new:speechbrain.lobes.models.Xvector.Classifier 37 | input_shape: [null, null, 512] 38 | activation: !name:torch.nn.LeakyReLU 39 | lin_blocks: 1 40 | lin_neurons: 512 41 | out_neurons: !ref 42 | 43 | mean_var_norm: !new:speechbrain.processing.features.InputNormalization 44 | norm_type: global 45 | 46 | modules: 47 | xvector_model: !ref 48 | classifier: !ref 49 | mean_var_norm: !ref 50 | 51 | opt_class: !name:torch.optim.Adam 52 | lr: !ref 53 | 54 | compute_cost: !name:speechbrain.nnet.losses.nll_loss 55 | 56 | error_stats: !name:speechbrain.utils.metric_stats.MetricStats 57 | metric: !name:speechbrain.nnet.losses.classification_error 58 | reduction: batch 59 | -------------------------------------------------------------------------------- /tests/integration/signal_processing/PLDA_xvector/example_plda_experiment.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | import os 3 | import pickle 4 | import numpy 5 | from numpy import linalg as LA 6 | from speechbrain.processing.PLDA_LDA import StatObject_SB # noqa F401 7 | from speechbrain.processing.PLDA_LDA import PLDA 8 | from speechbrain.processing.PLDA_LDA import Ndx 9 | from speechbrain.processing.PLDA_LDA import fast_PLDA_scoring 10 | 11 | 12 | # Load params file 13 | experiment_dir = os.path.dirname(os.path.abspath(__file__)) 14 | data_folder = "../../../../../samples/plda_xvect_samples/" 15 | data_folder = os.path.abspath(experiment_dir + data_folder) 16 | 17 | # Xvectors stored as StatObject_SB 18 | train_file = data_folder + "/train_stat_xvect.pkl" 19 | enrol_file = data_folder + "/enrol_stat_xvect.pkl" 20 | test_file = data_folder + "/test_stat_xvect.pkl" 21 | scores_file = data_folder + "/expected_plda_scores.pkl" 22 | 23 | # Load Train 24 | with open(train_file, "rb") as input: 25 | train_obj = pickle.load(input) 26 | 27 | # Load Enrol 28 | with open(enrol_file, "rb") as input: 29 | enrol_obj = pickle.load(input) 30 | 31 | # Load Test 32 | with open(test_file, "rb") as input: 33 | test_obj = pickle.load(input) 34 | 35 | print("Training PLDA...") 36 | plda = PLDA() 37 | plda.plda(train_obj) 38 | 39 | # Preparing Ndx map 40 | models = enrol_obj.modelset 41 | testsegs = test_obj.modelset 42 | ndx_obj = Ndx(models=models, testsegs=testsegs) 43 | 44 | # PLDA scoring between enrol and test 45 | scores_plda = fast_PLDA_scoring( 46 | enrol_obj, test_obj, ndx_obj, plda.mean, plda.F, plda.Sigma 47 | ) 48 | print("PLDA score matrix: (Rows: Enrol, Columns: Test)") 49 | print(scores_plda.scoremat) 50 | 51 | with open(scores_file, "rb") as input: 52 | expected_score_matrix = pickle.load(input) 53 | 54 | print("Expected scores:\n", expected_score_matrix) 55 | 56 | # Ensuring the scores are proper (for integration test) 57 | dif = numpy.subtract(expected_score_matrix, scores_plda.scoremat) 58 | f_norm = LA.norm(dif, ord="fro") 59 | 60 | 61 | # Integration test: Ensure we get same score matrix 62 | def test_error(): 63 | assert f_norm < 0.1 64 | -------------------------------------------------------------------------------- /tests/integration/signal_processing/example_add_babble.py: -------------------------------------------------------------------------------- 1 | import os 2 | import speechbrain as sb 3 | from hyperpyyaml import load_hyperpyyaml 4 | from speechbrain.dataio.dataio import read_audio, write_audio 5 | 6 | output_folder = os.path.join("results", "add_babble") 
7 | experiment_dir = os.path.dirname(os.path.abspath(__file__)) 8 | hyperparams_file = os.path.join(experiment_dir, "hyperparams.yaml") 9 | 10 | 11 | def main(): 12 | overrides = { 13 | "output_folder": output_folder, 14 | "data_folder": os.path.join( 15 | experiment_dir, "..", "..", "..", "samples" 16 | ), 17 | "batch_size": 5, 18 | } 19 | with open(hyperparams_file) as fin: 20 | hyperparams = load_hyperpyyaml(fin, overrides) 21 | 22 | sb.create_experiment_directory( 23 | experiment_directory=output_folder, 24 | hyperparams_to_save=hyperparams_file, 25 | overrides=overrides, 26 | ) 27 | 28 | dataloader = sb.dataio.dataloader.make_dataloader( 29 | dataset=hyperparams["sample_data"], batch_size=hyperparams["batch_size"] 30 | ) 31 | for (id, (wav, wav_len),) in iter(dataloader): 32 | wav_babble = hyperparams["add_babble"](wav, wav_len) 33 | # save results on file 34 | for i, snt_id in enumerate(id): 35 | filepath = ( 36 | hyperparams["output_folder"] + "/save/" + snt_id + ".flac" 37 | ) 38 | write_audio(filepath, wav_babble[i], 16000) 39 | 40 | 41 | def test_bubble(): 42 | from glob import glob 43 | 44 | for filename in glob(os.path.join(output_folder, "save", "*.flac")): 45 | expected_file = filename.replace("results", "expected") 46 | actual = read_audio(filename) 47 | expected = read_audio(expected_file) 48 | assert actual.allclose(expected) 49 | 50 | 51 | if __name__ == "__main__": 52 | main() 53 | -------------------------------------------------------------------------------- /tests/integration/signal_processing/example_add_noise.py: -------------------------------------------------------------------------------- 1 | import os 2 | import speechbrain as sb 3 | from hyperpyyaml import load_hyperpyyaml 4 | from speechbrain.dataio.dataio import read_audio, write_audio 5 | 6 | output_folder = os.path.join("results", "add_noise") 7 | experiment_dir = os.path.dirname(os.path.abspath(__file__)) 8 | hyperparams_file = os.path.join(experiment_dir, "hyperparams.yaml") 9 | 10 | 11 | def main(): 12 | overrides = { 13 | "output_folder": output_folder, 14 | "data_folder": os.path.join( 15 | experiment_dir, "..", "..", "..", "samples" 16 | ), 17 | } 18 | with open(hyperparams_file) as fin: 19 | hyperparams = load_hyperpyyaml(fin, overrides) 20 | 21 | sb.create_experiment_directory( 22 | experiment_directory=output_folder, 23 | hyperparams_to_save=hyperparams_file, 24 | overrides=overrides, 25 | ) 26 | 27 | dataloader = sb.dataio.dataloader.make_dataloader( 28 | dataset=hyperparams["sample_data"], batch_size=hyperparams["batch_size"] 29 | ) 30 | for (id, (wav, wav_len),) in iter(dataloader): 31 | wav_noise = hyperparams["add_noise"](wav, wav_len) 32 | # save results on file 33 | for i, snt_id in enumerate(id): 34 | filepath = ( 35 | hyperparams["output_folder"] + "/save/" + snt_id + ".flac" 36 | ) 37 | write_audio(filepath, wav_noise[i], 16000) 38 | 39 | 40 | def test_noise(): 41 | from glob import glob 42 | 43 | for filename in glob(os.path.join(output_folder, "save", "*.flac")): 44 | expected_file = filename.replace("results", "expected") 45 | actual = read_audio(filename) 46 | expected = read_audio(expected_file) 47 | assert actual.allclose(expected) 48 | 49 | 50 | if __name__ == "__main__": 51 | main() 52 | -------------------------------------------------------------------------------- /tests/integration/signal_processing/example_add_reverb.py: -------------------------------------------------------------------------------- 1 | import os 2 | import speechbrain as sb 3 | from hyperpyyaml import 
load_hyperpyyaml 4 | from speechbrain.dataio.dataio import read_audio, write_audio 5 | 6 | output_folder = os.path.join("results", "add_reverb") 7 | experiment_dir = os.path.dirname(os.path.abspath(__file__)) 8 | hyperparams_file = os.path.join(experiment_dir, "hyperparams.yaml") 9 | 10 | 11 | def main(): 12 | overrides = { 13 | "output_folder": output_folder, 14 | "data_folder": os.path.join( 15 | experiment_dir, "..", "..", "..", "samples" 16 | ), 17 | } 18 | with open(hyperparams_file) as fin: 19 | hyperparams = load_hyperpyyaml(fin, overrides) 20 | 21 | sb.create_experiment_directory( 22 | experiment_directory=output_folder, 23 | hyperparams_to_save=hyperparams_file, 24 | overrides=overrides, 25 | ) 26 | 27 | dataloader = sb.dataio.dataloader.make_dataloader( 28 | dataset=hyperparams["sample_data"], batch_size=hyperparams["batch_size"] 29 | ) 30 | for (id, (wav, wav_len),) in iter(dataloader): 31 | wav_reverb = hyperparams["add_reverb"](wav, wav_len) 32 | # save results on file 33 | for i, snt_id in enumerate(id): 34 | filepath = ( 35 | hyperparams["output_folder"] + "/save/" + snt_id + ".flac" 36 | ) 37 | write_audio(filepath, wav_reverb[i], 16000) 38 | 39 | 40 | def test_reverb(): 41 | from glob import glob 42 | 43 | for filename in glob(os.path.join(output_folder, "save", "*.flac")): 44 | expected_file = filename.replace("results", "expected") 45 | actual = read_audio(filename) 46 | expected = read_audio(expected_file) 47 | assert actual.allclose(expected) 48 | 49 | 50 | if __name__ == "__main__": 51 | main() 52 | -------------------------------------------------------------------------------- /tests/integration/signal_processing/example_do_clip.py: -------------------------------------------------------------------------------- 1 | import os 2 | import speechbrain as sb 3 | from hyperpyyaml import load_hyperpyyaml 4 | from speechbrain.dataio.dataio import read_audio, write_audio 5 | 6 | output_folder = os.path.join("results", "do_clip") 7 | experiment_dir = os.path.dirname(os.path.abspath(__file__)) 8 | hyperparams_file = os.path.join(experiment_dir, "hyperparams.yaml") 9 | 10 | 11 | def main(): 12 | overrides = { 13 | "output_folder": output_folder, 14 | "data_folder": os.path.join( 15 | experiment_dir, "..", "..", "..", "samples" 16 | ), 17 | } 18 | with open(hyperparams_file) as fin: 19 | hyperparams = load_hyperpyyaml(fin, overrides) 20 | 21 | sb.create_experiment_directory( 22 | experiment_directory=output_folder, 23 | hyperparams_to_save=hyperparams_file, 24 | overrides=overrides, 25 | ) 26 | 27 | dataloader = sb.dataio.dataloader.make_dataloader( 28 | dataset=hyperparams["sample_data"], batch_size=hyperparams["batch_size"] 29 | ) 30 | for (id, (wav, wav_len),) in iter(dataloader): 31 | wav_clip = hyperparams["do_clip"](wav) 32 | # save results on file 33 | for i, snt_id in enumerate(id): 34 | filepath = ( 35 | hyperparams["output_folder"] + "/save/" + snt_id + ".flac" 36 | ) 37 | write_audio(filepath, wav_clip[i], 16000) 38 | 39 | 40 | def test_do_clip(): 41 | from glob import glob 42 | 43 | for filename in glob(os.path.join(output_folder, "save", "*.flac")): 44 | expected_file = filename.replace("results", "expected") 45 | actual = read_audio(filename) 46 | expected = read_audio(expected_file) 47 | assert actual.allclose(expected) 48 | 49 | 50 | if __name__ == "__main__": 51 | main() 52 | -------------------------------------------------------------------------------- /tests/integration/signal_processing/example_drop_chunk.py: 
-------------------------------------------------------------------------------- 1 | import os 2 | import speechbrain as sb 3 | from hyperpyyaml import load_hyperpyyaml 4 | from speechbrain.dataio.dataio import read_audio, write_audio 5 | 6 | output_folder = os.path.join("results", "drop_chunk") 7 | experiment_dir = os.path.dirname(os.path.abspath(__file__)) 8 | hyperparams_file = os.path.join(experiment_dir, "hyperparams.yaml") 9 | 10 | 11 | def main(): 12 | overrides = { 13 | "output_folder": output_folder, 14 | "data_folder": os.path.join( 15 | experiment_dir, "..", "..", "..", "samples" 16 | ), 17 | } 18 | with open(hyperparams_file) as fin: 19 | hyperparams = load_hyperpyyaml(fin, overrides) 20 | 21 | sb.create_experiment_directory( 22 | experiment_directory=output_folder, 23 | hyperparams_to_save=hyperparams_file, 24 | overrides=overrides, 25 | ) 26 | 27 | dataloader = sb.dataio.dataloader.make_dataloader( 28 | dataset=hyperparams["sample_data"], batch_size=hyperparams["batch_size"] 29 | ) 30 | for (id, (wav, wav_len),) in iter(dataloader): 31 | wav_drop = hyperparams["drop_chunk"](wav, wav_len) 32 | # save results on file 33 | for i, snt_id in enumerate(id): 34 | filepath = ( 35 | hyperparams["output_folder"] + "/save/" + snt_id + ".flac" 36 | ) 37 | write_audio(filepath, wav_drop[i], 16000) 38 | 39 | 40 | def test_chunk(): 41 | from glob import glob 42 | 43 | for filename in glob(os.path.join(output_folder, "save", "*.flac")): 44 | expected_file = filename.replace("results", "expected") 45 | actual = read_audio(filename) 46 | expected = read_audio(expected_file) 47 | assert actual.allclose(expected) 48 | 49 | 50 | if __name__ == "__main__": 51 | main() 52 | -------------------------------------------------------------------------------- /tests/integration/signal_processing/example_drop_freq.py: -------------------------------------------------------------------------------- 1 | import os 2 | import speechbrain as sb 3 | from hyperpyyaml import load_hyperpyyaml 4 | from speechbrain.dataio.dataio import read_audio, write_audio 5 | 6 | output_folder = os.path.join("results", "drop_freq") 7 | experiment_dir = os.path.dirname(os.path.abspath(__file__)) 8 | hyperparams_file = os.path.join(experiment_dir, "hyperparams.yaml") 9 | 10 | 11 | def main(): 12 | overrides = { 13 | "output_folder": output_folder, 14 | "data_folder": os.path.join( 15 | experiment_dir, "..", "..", "..", "samples" 16 | ), 17 | } 18 | with open(hyperparams_file) as fin: 19 | hyperparams = load_hyperpyyaml(fin, overrides) 20 | 21 | sb.create_experiment_directory( 22 | experiment_directory=output_folder, 23 | hyperparams_to_save=hyperparams_file, 24 | overrides=overrides, 25 | ) 26 | 27 | dataloader = sb.dataio.dataloader.make_dataloader( 28 | dataset=hyperparams["sample_data"], batch_size=hyperparams["batch_size"] 29 | ) 30 | for (id, (wav, wav_len),) in iter(dataloader): 31 | wav_drop = hyperparams["drop_freq"](wav) 32 | # save results on file 33 | for i, snt_id in enumerate(id): 34 | filepath = ( 35 | hyperparams["output_folder"] + "/save/" + snt_id + ".flac" 36 | ) 37 | write_audio(filepath, wav_drop[i], 16000) 38 | 39 | 40 | def test_drop_freq(): 41 | from glob import glob 42 | 43 | for filename in glob(os.path.join(output_folder, "save", "*.flac")): 44 | expected_file = filename.replace("results", "expected") 45 | actual = read_audio(filename) 46 | expected = read_audio(expected_file) 47 | assert actual.allclose(expected) 48 | 49 | 50 | if __name__ == "__main__": 51 | main() 52 | 
-------------------------------------------------------------------------------- /tests/integration/signal_processing/example_speed_perturb.py: -------------------------------------------------------------------------------- 1 | import os 2 | import speechbrain as sb 3 | from hyperpyyaml import load_hyperpyyaml 4 | from speechbrain.dataio.dataio import read_audio, write_audio 5 | 6 | output_folder = os.path.join("results", "speed_perturb") 7 | experiment_dir = os.path.dirname(os.path.abspath(__file__)) 8 | hyperparams_file = os.path.join(experiment_dir, "hyperparams.yaml") 9 | 10 | 11 | def main(): 12 | overrides = { 13 | "output_folder": output_folder, 14 | "data_folder": os.path.join( 15 | experiment_dir, "..", "..", "..", "samples" 16 | ), 17 | } 18 | with open(hyperparams_file) as fin: 19 | hyperparams = load_hyperpyyaml(fin, overrides) 20 | 21 | sb.create_experiment_directory( 22 | experiment_directory=output_folder, 23 | hyperparams_to_save=hyperparams_file, 24 | overrides=overrides, 25 | ) 26 | 27 | dataloader = sb.dataio.dataloader.make_dataloader( 28 | dataset=hyperparams["sample_data"], batch_size=hyperparams["batch_size"] 29 | ) 30 | for (id, (wav, wav_len),) in iter(dataloader): 31 | wav_perturb = hyperparams["speed_perturb"](wav) 32 | # save results on file 33 | for i, snt_id in enumerate(id): 34 | filepath = ( 35 | hyperparams["output_folder"] + "/save/" + snt_id + ".flac" 36 | ) 37 | write_audio(filepath, wav_perturb[i], 16000) 38 | 39 | 40 | def test_peturb(): 41 | from glob import glob 42 | 43 | for filename in glob(os.path.join(output_folder, "save", "*.flac")): 44 | expected_file = filename.replace("results", "expected") 45 | actual = read_audio(filename) 46 | expected = read_audio(expected_file) 47 | assert actual.allclose(expected) 48 | 49 | 50 | if __name__ == "__main__": 51 | main() 52 | -------------------------------------------------------------------------------- /tests/integration/signal_processing/expected/add_babble/save/example1.flac: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/tests/integration/signal_processing/expected/add_babble/save/example1.flac -------------------------------------------------------------------------------- /tests/integration/signal_processing/expected/add_noise/save/example1.flac: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/tests/integration/signal_processing/expected/add_noise/save/example1.flac -------------------------------------------------------------------------------- /tests/integration/signal_processing/expected/add_reverb/save/example1.flac: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/tests/integration/signal_processing/expected/add_reverb/save/example1.flac -------------------------------------------------------------------------------- /tests/integration/signal_processing/expected/do_clip/save/example1.flac: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/tests/integration/signal_processing/expected/do_clip/save/example1.flac 
-------------------------------------------------------------------------------- /tests/integration/signal_processing/expected/drop_chunk/save/example1.flac: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/tests/integration/signal_processing/expected/drop_chunk/save/example1.flac -------------------------------------------------------------------------------- /tests/integration/signal_processing/expected/drop_freq/save/example1.flac: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/tests/integration/signal_processing/expected/drop_freq/save/example1.flac -------------------------------------------------------------------------------- /tests/integration/signal_processing/expected/speed_perturb/save/example1.flac: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YUCHEN005/Unified-Enhance-Separation/1925be9e75835391ad2aed89a7e63a5b9b40e757/tests/integration/signal_processing/expected/speed_perturb/save/example1.flac -------------------------------------------------------------------------------- /tests/integration/signal_processing/hyperparams.yaml: -------------------------------------------------------------------------------- 1 | output_folder: !PLACEHOLDER 2 | data_folder: !PLACEHOLDER 3 | csv_file: !ref /audio_samples/csv_example2.csv 4 | sample_rate: 16000 5 | batch_size: 1 6 | 7 | sample_data: !new:speechbrain.dataio.legacy.ExtendedCSVDataset 8 | csvpath: !ref 9 | sorting: descending 10 | output_keys: [id, wav] 11 | replacements: 12 | data_folder: !ref /audio_samples 13 | 14 | add_babble: !new:speechbrain.processing.speech_augmentation.AddBabble 15 | speaker_count: 4 # Must set batch size to 5 or more 16 | snr_low: 0 17 | snr_high: 0 18 | 19 | add_reverb: !new:speechbrain.processing.speech_augmentation.AddReverb 20 | csv_file: !ref /rir_samples/rirs_rel.csv 21 | sorting: descending 22 | replacements: 23 | rir_folder: !ref /rir_samples 24 | 25 | add_noise: !new:speechbrain.processing.speech_augmentation.AddNoise 26 | csv_file: !ref /noise_samples/noise_rel.csv 27 | sorting: descending 28 | snr_low: 0 29 | snr_high: 0 30 | pad_noise: False 31 | start_index: 0 32 | replacements: 33 | noise_folder: !ref /noise_samples 34 | 35 | drop_freq: !new:speechbrain.processing.speech_augmentation.DropFreq 36 | drop_freq_low: 0.5 37 | drop_freq_high: 0.5 38 | drop_count_low: 1 39 | drop_count_high: 1 40 | drop_width: 0.05 41 | 42 | drop_chunk: !new:speechbrain.processing.speech_augmentation.DropChunk 43 | drop_length_low: 1000 44 | drop_length_high: 1000 45 | drop_count_low: 1 46 | drop_count_high: 1 47 | drop_start: 1000 48 | drop_end: 2000 49 | 50 | do_clip: !new:speechbrain.processing.speech_augmentation.DoClip 51 | clip_low: 0.01 52 | clip_high: 0.01 53 | 54 | speed_perturb: !new:speechbrain.processing.speech_augmentation.SpeedPerturb 55 | orig_freq: !ref 56 | speeds: [90] 57 | -------------------------------------------------------------------------------- /tests/integration/signal_processing/nmf_sourcesep/hyperparams.yaml: -------------------------------------------------------------------------------- 1 | # Basic parameters 2 | output_folder: results/minimal/nmf_sourcesep 3 | sample_rate: 16000 4 | 5 | # Data files 6 | data_folder: 
../../../../samples/audio_samples/sourcesep_samples 7 | csv_train: !ref /csv_example_sourcesep_source1.csv 8 | csv_train2: !ref /csv_example_sourcesep_source2.csv 9 | csv_test: !ref /csv_example_sourcesep_mixture.csv 10 | 11 | # NMF parameters 12 | N_epochs: 50 13 | K: 20 # this specifies the number of template vectors to use in NMF. 14 | N_batch: 200 15 | m: 513 # length of stft vectors 16 | win_length: 40 # window length (in ms) for stft 17 | hop_length: 10 # hop length (in ms) for stft 18 | 19 | # Experiment flags: 20 | save_reconstructed: False # saves the results 21 | copy_original_files: False # copies the original files 22 | 23 | train_data: !new:speechbrain.dataio.legacy.ExtendedCSVDataset 24 | csvpath: !ref 25 | output_keys: [wav] 26 | sorting: 'original' 27 | replacements: 28 | data_folder: !ref 29 | 30 | train_data2: !new:speechbrain.dataio.legacy.ExtendedCSVDataset 31 | csvpath: !ref 32 | output_keys: [wav] 33 | sorting: 'original' 34 | replacements: 35 | data_folder: !ref 36 | 37 | test_data: !new:speechbrain.dataio.legacy.ExtendedCSVDataset 38 | csvpath: !ref 39 | output_keys: [wav] 40 | sorting: 'original' 41 | replacements: 42 | data_folder: !ref 43 | 44 | loader_kwargs: 45 | batch_size: !ref 46 | 47 | compute_features: !new:speechbrain.processing.features.STFT 48 | sample_rate: !ref 49 | n_fft: 1024 50 | win_length: !ref 51 | hop_length: !ref 52 | -------------------------------------------------------------------------------- /tests/unittests/test_CNN.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn 3 | 4 | 5 | def test_SincConv(): 6 | 7 | from speechbrain.nnet.CNN import SincConv 8 | 9 | input = torch.rand([4, 16000]) 10 | convolve = SincConv( 11 | input_shape=input.shape, out_channels=8, kernel_size=65, padding="same" 12 | ) 13 | output = convolve(input) 14 | assert output.shape[-1] == 8 15 | 16 | assert torch.jit.trace(convolve, input) 17 | 18 | 19 | def test_Conv1d(): 20 | 21 | from speechbrain.nnet.CNN import Conv1d 22 | 23 | input = torch.tensor([-1, -1, -1, -1]).unsqueeze(0).unsqueeze(2).float() 24 | convolve = Conv1d( 25 | out_channels=1, kernel_size=1, input_shape=input.shape, padding="same" 26 | ) 27 | output = convolve(input) 28 | assert input.shape == output.shape 29 | 30 | convolve.conv.weight = torch.nn.Parameter( 31 | torch.tensor([-1]).float().unsqueeze(0).unsqueeze(1) 32 | ) 33 | convolve.conv.bias = torch.nn.Parameter(torch.tensor([0]).float()) 34 | output = convolve(input) 35 | assert torch.all(torch.eq(torch.ones(input.shape), output)) 36 | 37 | assert torch.jit.trace(convolve, input) 38 | 39 | 40 | def test_Conv2d(): 41 | 42 | from speechbrain.nnet.CNN import Conv2d 43 | 44 | input = torch.rand([4, 11, 32, 1]) 45 | convolve = Conv2d( 46 | out_channels=1, 47 | input_shape=input.shape, 48 | kernel_size=(1, 1), 49 | padding="same", 50 | ) 51 | output = convolve(input) 52 | assert output.shape[-1] == 1 53 | 54 | convolve.conv.weight = torch.nn.Parameter( 55 | torch.zeros(convolve.conv.weight.shape) 56 | ) 57 | convolve.conv.bias = torch.nn.Parameter(torch.tensor([0]).float()) 58 | output = convolve(input) 59 | assert torch.all(torch.eq(torch.zeros(input.shape), output)) 60 | 61 | convolve.conv.weight = torch.nn.Parameter( 62 | torch.ones(convolve.conv.weight.shape) 63 | ) 64 | convolve.conv.bias = torch.nn.Parameter(torch.tensor([0]).float()) 65 | output = convolve(input) 66 | assert torch.all(torch.eq(input, output)) 67 | 68 | assert torch.jit.trace(convolve, input) 69 | 
-------------------------------------------------------------------------------- /tests/unittests/test_activations.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn 3 | 4 | 5 | def test_softmax(): 6 | 7 | from speechbrain.nnet.activations import Softmax 8 | 9 | inputs = torch.tensor([1, 2, 3]).float() 10 | act = Softmax(apply_log=False) 11 | outputs = act(inputs) 12 | assert torch.argmax(outputs) == 2 13 | 14 | assert torch.jit.trace(act, inputs) 15 | -------------------------------------------------------------------------------- /tests/unittests/test_attention.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def test_rel_pos_MHA(): 5 | 6 | from speechbrain.nnet.attention import RelPosMHAXL 7 | 8 | bsz = 2 9 | emb_dim = 4 10 | k_len = [12, 10] 11 | q_len = [10, 12] 12 | bias = [True, False] 13 | head_dim = [4, None] 14 | 15 | for kl in k_len: 16 | for ql in q_len: 17 | for b in bias: 18 | for h in head_dim: 19 | relpos = RelPosMHAXL(emb_dim, num_heads=2, vbias=b, vdim=h) 20 | q = torch.rand((bsz, ql, emb_dim)) 21 | k = torch.rand((bsz, kl, emb_dim)) 22 | pos_embs = torch.rand((1, 2 * kl - 1, emb_dim)) 23 | relpos(q, k, k, pos_embs=pos_embs) 24 | -------------------------------------------------------------------------------- /tests/unittests/test_batching.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import torch 3 | 4 | 5 | def test_batch_pad_right_to(): 6 | from speechbrain.utils.data_utils import batch_pad_right 7 | import random 8 | 9 | n_channels = 40 10 | batch_lens = [1, 5] 11 | 12 | for b in batch_lens: 13 | tensors = [ 14 | torch.ones(n_channels, random.randint(10, 53),) for x in range(b) 15 | ] 16 | batched, lens = batch_pad_right(tensors) 17 | assert batched.shape[0] == b 18 | 19 | for b in batch_lens: 20 | tensors = [torch.ones(random.randint(10, 53),) for x in range(b)] 21 | batched, lens = batch_pad_right(tensors) 22 | assert batched.shape[0] == b 23 | 24 | 25 | def test_paddedbatch(): 26 | from speechbrain.dataio.batch import PaddedBatch 27 | 28 | batch = PaddedBatch( 29 | [ 30 | { 31 | "id": "ex1", 32 | "foo": torch.Tensor([1.0]), 33 | "bar": torch.Tensor([1.0, 2.0, 3.0]), 34 | }, 35 | { 36 | "id": "ex2", 37 | "foo": torch.Tensor([2.0, 1.0]), 38 | "bar": torch.Tensor([2.0]), 39 | }, 40 | ] 41 | ) 42 | batch.to(dtype=torch.half) 43 | assert batch.foo.data.dtype == torch.half 44 | assert batch["foo"][1].dtype == torch.half 45 | assert batch.bar.lengths.dtype == torch.half 46 | assert batch.foo.data.shape == torch.Size([2, 2]) 47 | assert batch.bar.data.shape == torch.Size([2, 3]) 48 | ids, foos, bars = batch 49 | assert ids == ["ex1", "ex2"] 50 | 51 | 52 | @pytest.mark.skipif(not torch.cuda.is_available(), reason="Requires CUDA") 53 | def test_pin_memory(): 54 | from speechbrain.dataio.batch import PaddedBatch 55 | 56 | batch = PaddedBatch( 57 | [ 58 | { 59 | "id": "ex1", 60 | "foo": torch.Tensor([1.0]), 61 | "bar": torch.Tensor([1.0, 2.0, 3.0]), 62 | }, 63 | { 64 | "id": "ex2", 65 | "foo": torch.Tensor([2.0, 1.0]), 66 | "bar": torch.Tensor([2.0]), 67 | }, 68 | ] 69 | ) 70 | batch.pin_memory() 71 | assert batch.foo.data.is_pinned() 72 | -------------------------------------------------------------------------------- /tests/unittests/test_callchains.py: -------------------------------------------------------------------------------- 1 | def test_lengths_arg_exists(): 2 | from 
speechbrain.utils.callchains import lengths_arg_exists 3 | 4 | def non_len_func(x): 5 | return x + 1 6 | 7 | def len_func(x, lengths): 8 | return x + lengths 9 | 10 | assert not lengths_arg_exists(non_len_func) 11 | assert lengths_arg_exists(len_func) 12 | 13 | 14 | def test_lengths_capable_chain(): 15 | from speechbrain.utils.callchains import LengthsCapableChain 16 | 17 | def non_len_func(x): 18 | return x + 1 19 | 20 | def len_func(x, lengths): 21 | return x + lengths 22 | 23 | def tuple_func(x): 24 | return x, x + 1 25 | 26 | chain = LengthsCapableChain(non_len_func, len_func) 27 | assert chain(1, 2) == 4 28 | assert chain(lengths=2, x=1) == 4 29 | chain.append(non_len_func) 30 | assert chain(1, 2) == 5 31 | chain.append(tuple_func) 32 | assert chain(1, 2) == 5 33 | -------------------------------------------------------------------------------- /tests/unittests/test_core.py: -------------------------------------------------------------------------------- 1 | def test_parse_arguments(): 2 | from speechbrain.core import parse_arguments 3 | 4 | filename, run_opts, overrides = parse_arguments( 5 | ["params.yaml", "--device=cpu", "--seed=3", "--data_folder", "TIMIT"] 6 | ) 7 | assert filename == "params.yaml" 8 | assert run_opts["device"] == "cpu" 9 | assert overrides == "seed: 3\ndata_folder: TIMIT" 10 | 11 | 12 | def test_brain(): 13 | import torch 14 | from speechbrain.core import Brain, Stage 15 | from torch.optim import SGD 16 | 17 | model = torch.nn.Linear(in_features=10, out_features=10) 18 | 19 | class SimpleBrain(Brain): 20 | def compute_forward(self, batch, stage): 21 | return self.modules.model(batch[0]) 22 | 23 | def compute_objectives(self, predictions, batch, stage): 24 | return torch.nn.functional.l1_loss(predictions, batch[1]) 25 | 26 | brain = SimpleBrain({"model": model}, lambda x: SGD(x, 0.1)) 27 | 28 | inputs = torch.rand(10, 10) 29 | targets = torch.rand(10, 10) 30 | train_set = ([inputs, targets],) 31 | valid_set = ([inputs, targets],) 32 | 33 | start_output = brain.compute_forward(inputs, Stage.VALID) 34 | start_loss = brain.compute_objectives(start_output, targets, Stage.VALID) 35 | brain.fit(epoch_counter=range(10), train_set=train_set, valid_set=valid_set) 36 | end_output = brain.compute_forward(inputs, Stage.VALID) 37 | end_loss = brain.compute_objectives(end_output, targets, Stage.VALID) 38 | assert end_loss < start_loss 39 | -------------------------------------------------------------------------------- /tests/unittests/test_counting.py: -------------------------------------------------------------------------------- 1 | def test_pad_ends(): 2 | from speechbrain.lm.counting import pad_ends 3 | 4 | assert next(pad_ends(["a", "b", "c"])) == "" 5 | assert next(pad_ends(["a", "b", "c"], pad_left=False)) == "a" 6 | assert list(pad_ends(["a", "b", "c"], pad_left=False))[-1] == "" 7 | assert list(pad_ends([], pad_left=False)) 8 | assert list(pad_ends([], pad_left=True)) 9 | 10 | 11 | def test_ngrams(): 12 | from speechbrain.lm.counting import ngrams 13 | 14 | assert next(ngrams(["a", "b", "c"], n=3)) == ("a", "b", "c") 15 | assert next(ngrams(["a", "b", "c"], n=1)) == ("a",) 16 | assert not list(ngrams(["a", "b", "c"], n=4)) 17 | assert list(ngrams(["a", "b", "c"], n=2)) == [("a", "b"), ("b", "c")] 18 | 19 | 20 | def test_ngrams_for_evaluation(): 21 | from speechbrain.lm.counting import ngrams_for_evaluation 22 | 23 | assert list(ngrams_for_evaluation(["a", "b", "c"], max_n=3)) == [ 24 | ("b", ("a",)), 25 | ("c", ("a", "b")), 26 | ] 27 | assert list( 28 | 
ngrams_for_evaluation(["a", "b", "c"], max_n=3, predict_first=True) 29 | ) == [("a", ()), ("b", ("a",)), ("c", ("a", "b"))] 30 | -------------------------------------------------------------------------------- /tests/unittests/test_dependency_graph.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | 4 | def test_dependency_graph(): 5 | from speechbrain.utils.depgraph import ( 6 | DependencyGraph, 7 | CircularDependencyError, 8 | ) 9 | 10 | dg = DependencyGraph() 11 | # a->b->c 12 | dg.add_edge("b", "c") 13 | dg.add_edge("a", "b") 14 | assert dg.is_valid() 15 | eval_order = [node.key for node in dg.get_evaluation_order()] 16 | assert eval_order == ["c", "b", "a"] 17 | dg = DependencyGraph() 18 | # a->b->c, a->c 19 | dg.add_edge("b", "c") 20 | dg.add_edge("a", "b") 21 | dg.add_edge("a", "c") 22 | eval_order = [node.key for node in dg.get_evaluation_order()] 23 | assert eval_order == ["c", "b", "a"] 24 | dg = DependencyGraph() 25 | # a->b, a->c 26 | dg.add_edge("a", "b") 27 | dg.add_edge("a", "c") 28 | eval_order = [node.key for node in dg.get_evaluation_order()] 29 | assert eval_order == ["c", "b", "a"] or eval_order == ["b", "c", "a"] 30 | dg = DependencyGraph() 31 | # a->b, c->d 32 | dg.add_edge("a", "b") 33 | dg.add_edge("c", "d") 34 | eval_order = [node.key for node in dg.get_evaluation_order()] 35 | valid_orders = [ 36 | ["d", "c", "b", "a"], 37 | ["d", "b", "c", "a"], 38 | ["d", "b", "a", "c"], 39 | ["b", "a", "d", "c"], 40 | ["b", "d", "a", "c"], 41 | ["b", "d", "c", "a"], 42 | ] 43 | assert eval_order in valid_orders 44 | dg = DependencyGraph() 45 | # a->b 46 | dg.add_node("a") 47 | dg.add_node("b") 48 | dg.add_edge("a", "b") 49 | eval_order = [node.key for node in dg.get_evaluation_order()] 50 | assert eval_order == ["b", "a"] 51 | dg = DependencyGraph() 52 | # a->b->a Impossible! 
53 | dg.add_edge("a", "b") 54 | dg.add_edge("b", "a") 55 | assert not dg.is_valid() 56 | with pytest.raises(CircularDependencyError): 57 | list(dg.get_evaluation_order()) 58 | dg = DependencyGraph() 59 | # a->b with data 60 | # should use uuids 61 | a_key = dg.add_node(data="a") 62 | assert a_key != "a" 63 | b_key = dg.add_node(data="b") 64 | dg.add_edge(a_key, b_key) 65 | eval_order_data = [node.data for node in dg.get_evaluation_order()] 66 | assert eval_order_data == ["b", "a"] 67 | # Adding same key in edge (implicitly) and then explicitly is ok: 68 | dg = DependencyGraph() 69 | dg.add_edge("a", "b") 70 | dg.add_node("a") 71 | eval_order = [node.key for node in dg.get_evaluation_order()] 72 | assert eval_order == ["b", "a"] 73 | # But adding same key twice explicitly will not work: 74 | with pytest.raises(ValueError): 75 | dg.add_node("a") 76 | -------------------------------------------------------------------------------- /tests/unittests/test_dropout.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn 3 | 4 | 5 | def test_dropout(): 6 | 7 | from speechbrain.nnet.dropout import Dropout2d 8 | 9 | inputs = torch.rand([4, 10, 32]) 10 | drop = Dropout2d(drop_rate=0.0) 11 | outputs = drop(inputs) 12 | assert torch.all(torch.eq(inputs, outputs)) 13 | 14 | drop = Dropout2d(drop_rate=1.0) 15 | outputs = drop(inputs) 16 | assert torch.all(torch.eq(torch.zeros(inputs.shape), outputs)) 17 | 18 | assert torch.jit.trace(drop, inputs) 19 | -------------------------------------------------------------------------------- /tests/unittests/test_edit_distance.py: -------------------------------------------------------------------------------- 1 | def test_accumulatable_wer_stats(): 2 | from speechbrain.utils.edit_distance import accumulatable_wer_stats 3 | 4 | refs = [[[1, 2, 3], [4, 5, 6]], [[7, 8], [9]]] 5 | hyps = [[[1, 2, 4], [5, 6]], [[7, 8], [10]]] 6 | # Test basic functionality: 7 | stats = accumulatable_wer_stats(refs[0], hyps[0]) 8 | assert stats["WER"] == 100.0 * 2 / 6 9 | stats = accumulatable_wer_stats(refs[1], hyps[1], stats) 10 | assert stats["WER"] == 100.0 * 3 / 9 11 | # Test edge cases: 12 | import math 13 | 14 | # No batches: 15 | stats = accumulatable_wer_stats([], []) 16 | assert stats["num_ref_tokens"] == 0 17 | assert math.isnan(stats["WER"]) 18 | # Empty hyp sequence: 19 | stats = accumulatable_wer_stats([[1, 2, 3]], [[]]) 20 | assert stats["num_ref_tokens"] == 3 21 | assert stats["WER"] == 100.0 22 | # Empty ref sequence: 23 | stats = accumulatable_wer_stats([[]], [[1, 2, 3]]) 24 | assert stats["num_ref_tokens"] == 0 25 | assert stats["insertions"] == 3 26 | assert math.isnan(stats["WER"]) 27 | 28 | 29 | def test_op_table(): 30 | from speechbrain.utils.edit_distance import op_table, EDIT_SYMBOLS 31 | 32 | assert len(op_table([1, 2, 3], [1, 2, 4])) == 4 33 | assert len(op_table([1, 2, 3], [1, 2, 4])[0]) == 4 34 | assert len(op_table([1, 2, 3], [])) == 4 35 | assert len(op_table([1, 2, 3], [])[0]) == 1 36 | assert op_table([1, 2, 3], [1, 2, 4])[3][3] == EDIT_SYMBOLS["sub"] 37 | assert op_table([1, 2, 3], [1, 2, 4])[2][2] == EDIT_SYMBOLS["eq"] 38 | assert op_table([1, 2, 3], [1, 2, 4])[0][0] == EDIT_SYMBOLS["eq"] 39 | 40 | 41 | def test_alignment(): 42 | from speechbrain.utils.edit_distance import alignment, EDIT_SYMBOLS 43 | 44 | I = EDIT_SYMBOLS["ins"] # noqa: E741, here I is a good var name 45 | D = EDIT_SYMBOLS["del"] 46 | S = EDIT_SYMBOLS["sub"] 47 | E = EDIT_SYMBOLS["eq"] 48 | table = [[I, I, I, I], [D, E, 
49 |     assert alignment(table) == [(E, 0, 0), (E, 1, 1), (S, 2, 2)]
50 |
51 |
52 | def test_count_ops():
53 |     from speechbrain.utils.edit_distance import count_ops, EDIT_SYMBOLS
54 |
55 |     I = EDIT_SYMBOLS["ins"]  # noqa: E741, here I is a good var name
56 |     D = EDIT_SYMBOLS["del"]
57 |     S = EDIT_SYMBOLS["sub"]
58 |     E = EDIT_SYMBOLS["eq"]
59 |     table = [[I, I, I, I], [D, E, I, I], [D, D, E, I], [D, D, D, S]]
60 |     assert count_ops(table)["insertions"] == 0
61 |     assert count_ops(table)["deletions"] == 0
62 |     assert count_ops(table)["substitutions"] == 1
63 |
--------------------------------------------------------------------------------
/tests/unittests/test_embedding.py:
--------------------------------------------------------------------------------
1 | import torch
2 |
3 |
4 | def test_embedding():
5 |
6 |     from speechbrain.nnet.embedding import Embedding
7 |
8 |     # create one hot vector and consider blank as zero vector
9 |     embedding_dim = 39
10 |     blank_id = 39
11 |     size_dict = 40
12 |     emb = Embedding(
13 |         num_embeddings=size_dict, consider_as_one_hot=True, blank_id=blank_id,
14 |     )
15 |     inputs = torch.Tensor([10, 5, 2, 0, 39]).long()
16 |     output = emb(inputs)
17 |     assert output.shape == (5, 39)
18 |
19 |     # use standard embedding layer
20 |     embedding_dim = 128
21 |     emb = Embedding(num_embeddings=size_dict, embedding_dim=embedding_dim)
22 |     inputs = torch.randint(0, 40, (5, 10))
23 |     output = emb(inputs)
24 |     assert output.shape == (5, 10, 128)
25 |
26 |     assert torch.jit.trace(emb, inputs)
27 |
--------------------------------------------------------------------------------
/tests/unittests/test_epoch_loop.py:
--------------------------------------------------------------------------------
1 | def test_epoch_loop_recovery(tmpdir):
2 |     from speechbrain.utils.checkpoints import Checkpointer
3 |     from speechbrain.utils.epoch_loop import EpochCounter
4 |
5 |     epoch_counter = EpochCounter(2)
6 |     recoverer = Checkpointer(tmpdir, {"epoch": epoch_counter})
7 |     for epoch in epoch_counter:
8 |         assert epoch == 1
9 |         # Save a mid-epoch checkpoint:
10 |         recoverer.save_checkpoint(end_of_epoch=False)
11 |         # Simulate interruption
12 |         break
13 |     # Now after recovery still at epoch 1:
14 |     recoverer.recover_if_possible()
15 |     second_epoch = False  # Will manually update this
16 |     for epoch in epoch_counter:
17 |         if not second_epoch:
18 |             assert epoch == 1
19 |             recoverer.save_checkpoint(end_of_epoch=True)
20 |             second_epoch = True
21 |         else:
22 |             assert epoch == 2
23 |             # Again simulate interruption
24 |             break
25 |     # Now after recovery we are in epoch 2:
26 |     recoverer.recover_if_possible()
27 |     loop_runs = 0
28 |     for epoch in epoch_counter:
29 |         assert epoch == 2
30 |         loop_runs += 1
31 |         recoverer.save_checkpoint(end_of_epoch=True)
32 |     # And that is that:
33 |     assert loop_runs == 1
34 |     # And now after recovery, no more epochs:
35 |     recoverer.recover_if_possible()
36 |     for epoch in epoch_counter:
37 |         # Will not get here:
38 |         assert False
39 |
--------------------------------------------------------------------------------
/tests/unittests/test_linear.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn
3 |
4 |
5 | def test_linear():
6 |
7 |     from speechbrain.nnet.linear import Linear
8 |
9 |     inputs = torch.rand(1, 2, 4)
10 |     lin_t = Linear(n_neurons=4, input_size=inputs.shape[-1], bias=False)
11 |     lin_t.w.weight = torch.nn.Parameter(torch.eye(inputs.shape[-1]))
12 |     outputs = lin_t(inputs)
13 |     assert torch.all(torch.eq(inputs, outputs))
14 |
15 |     assert torch.jit.trace(lin_t, inputs)
16 |
--------------------------------------------------------------------------------
/tests/unittests/test_multi_mic.py:
--------------------------------------------------------------------------------
1 | import torch
2 |
3 |
4 | def test_gccphat():
5 |
6 |     from speechbrain.processing.features import STFT
7 |     from speechbrain.processing.multi_mic import Covariance, GccPhat
8 |
9 |     # Creating the test signal
10 |     fs = 16000
11 |
12 |     delay = 60
13 |
14 |     sig = torch.randn([10, fs])
15 |     sig_delayed = torch.cat((torch.zeros([10, delay]), sig[:, 0:-delay]), 1)
16 |
17 |     xs = torch.stack((sig_delayed, sig), -1)
18 |
19 |     stft = STFT(sample_rate=fs)
20 |     Xs = stft(xs)
21 |
22 |     # Computing the covariance matrix for GCC-PHAT
23 |     cov = Covariance()
24 |     gccphat = GccPhat()
25 |
26 |     XXs = cov(Xs)
27 |     tdoas = torch.abs(gccphat(XXs))
28 |
29 |     n_valid_tdoas = torch.sum(torch.abs(tdoas[..., 1] - delay) < 1e-3)
30 |     assert n_valid_tdoas == Xs.shape[0] * Xs.shape[1]
31 |     assert torch.jit.trace(stft, xs)
32 |     assert torch.jit.trace(cov, Xs)
33 |     assert torch.jit.trace(gccphat, XXs)
34 |
--------------------------------------------------------------------------------
/tests/unittests/test_ngram_lm.py:
--------------------------------------------------------------------------------
1 | def test_backoff_ngram_lm():
2 |     from speechbrain.lm.ngram import BackoffNgramLM
3 |     import math
4 |
5 |     HALF = math.log(0.5)
6 |     ngrams = {
7 |         1: {tuple(): {"a": HALF, "b": HALF}},
8 |         2: {("a",): {"a": HALF, "b": HALF}, ("b",): {"a": HALF}},
9 |     }
10 |     backoffs = {1: {("b",): 0.0}}
11 |     lm = BackoffNgramLM(ngrams, backoffs)
12 |     # The basic cases covered by the ngrams and backoffs:
13 |     assert lm.logprob("a", ()) == HALF
14 |     assert lm.logprob("b", ()) == HALF
15 |     assert lm.logprob("a", ("a",)) == HALF
16 |     assert lm.logprob("a", ("b",)) == HALF
17 |     assert lm.logprob("b", ("a",)) == HALF
18 |     assert lm.logprob("b", ("b",)) == HALF
19 |     # Edge cases
20 |     # Too large context:
21 |     assert lm.logprob("a", ("a", "a")) == HALF
22 |     assert lm.logprob("b", ("a", "b")) == HALF
23 |     # OOV:
24 |     assert lm.logprob("c", ()) == float("-inf")
25 |     # OOV in context:
26 |     assert lm.logprob("a", ("c",)) == HALF
27 |
--------------------------------------------------------------------------------
/tests/unittests/test_pooling.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn
3 |
4 |
5 | def test_pooling1d():
6 |
7 |     from speechbrain.nnet.pooling import Pooling1d
8 |
9 |     input = torch.tensor([1, 3, 2]).unsqueeze(0).unsqueeze(-1).float()
10 |     pool = Pooling1d("max", 3)
11 |     output = pool(input)
12 |     assert output == 3
13 |
14 |     pool = Pooling1d("avg", 3)
15 |     output = pool(input)
16 |     assert output == 2
17 |
18 |     assert torch.jit.trace(pool, input)
19 |
20 |
21 | def test_pooling2d():
22 |
23 |     from speechbrain.nnet.pooling import Pooling2d
24 |
25 |     input = torch.tensor([[1, 3, 2], [4, 6, 5]]).float().unsqueeze(0)
26 |     pool = Pooling2d("max", (2, 3))
27 |     output = pool(input)
28 |     assert output == 6
29 |
30 |     input = torch.tensor([[1, 3, 2], [4, 6, 5]]).float().unsqueeze(0)
31 |     pool = Pooling2d("max", (1, 3))
32 |     output = pool(input)
33 |     assert output[0][0] == 3
34 |     assert output[0][1] == 6
35 |
36 |     input = torch.tensor([[1, 3, 2], [4, 6, 5]]).float().unsqueeze(0)
37 |     pool = Pooling2d("avg", (2, 3))
38 |     output = pool(input)
39 |     assert output == 3.5
40 |
41 |     input = torch.tensor([[1, 3, 2], [4, 6, 5]]).float().unsqueeze(0)
42 |     pool = Pooling2d("avg", (1, 3))
43 |     output = pool(input)
44 |     assert output[0][0] == 2
45 |     assert output[0][1] == 5
46 |
47 |     assert torch.jit.trace(pool, input)
48 |
--------------------------------------------------------------------------------
/tests/unittests/test_pretrainer.py:
--------------------------------------------------------------------------------
1 | def test_pretrainer(tmpdir):
2 |     import torch
3 |     from torch.nn import Linear
4 |
5 |     # save a model in tmpdir/original/model.ckpt
6 |     first_model = Linear(32, 32)
7 |     pretrained_dir = tmpdir / "original"
8 |     pretrained_dir.mkdir()
9 |     with open(pretrained_dir / "model.ckpt", "wb") as fo:
10 |         torch.save(first_model.state_dict(), fo)
11 |
12 |     # Make a new model and Pretrainer
13 |     pretrained_model = Linear(32, 32)
14 |     assert not torch.all(torch.eq(pretrained_model.weight, first_model.weight))
15 |     from speechbrain.utils.parameter_transfer import Pretrainer
16 |
17 |     pt = Pretrainer(
18 |         collect_in=tmpdir / "reused", loadables={"model": pretrained_model}
19 |     )
20 |     pt.collect_files(default_source=pretrained_dir)
21 |     pt.load_collected()
22 |     assert torch.all(torch.eq(pretrained_model.weight, first_model.weight))
23 |
--------------------------------------------------------------------------------
/tests/unittests/test_samplers.py:
--------------------------------------------------------------------------------
1 | import torch
2 |
3 |
4 | def test_ConcatDatasetBatchSampler():
5 |     from torch.utils.data import TensorDataset, ConcatDataset, DataLoader
6 |     from speechbrain.dataio.sampler import (
7 |         ReproducibleRandomSampler,
8 |         ConcatDatasetBatchSampler,
9 |     )
10 |     import numpy as np
11 |
12 |     datasets = []
13 |     for i in range(3):
14 |         if i == 0:
15 |             datasets.append(TensorDataset(torch.arange(i * 10, (i + 1) * 10)))
16 |         else:
17 |             datasets.append(TensorDataset(torch.arange(i * 6, (i + 1) * 6)))
18 |
19 |     samplers = [ReproducibleRandomSampler(x) for x in datasets]
20 |     dataset = ConcatDataset(datasets)
21 |     loader = DataLoader(
22 |         dataset, batch_sampler=ConcatDatasetBatchSampler(samplers, [1, 1, 1]),
23 |     )
24 |
25 |     concat_data = []
26 |
27 |     for data in loader:
28 |         concat_data.append([x.item() for x in data[0]])
29 |     concat_data = np.array(concat_data)
30 |
31 |     non_cat_data = []
32 |     for i in range(len(samplers)):
33 |         c_data = []
34 |         loader = DataLoader(dataset.datasets[i], sampler=samplers[i],)
35 |
36 |         for data in loader:
37 |             c_data.append(data[0].item())
38 |
39 |         non_cat_data.append(c_data)
40 |
41 |     minlen = min([len(x) for x in non_cat_data])
42 |     non_cat_data = [x[:minlen] for x in non_cat_data]
43 |     non_cat_data = np.array(non_cat_data)
44 |     np.testing.assert_array_equal(non_cat_data.T, concat_data)
45 |
--------------------------------------------------------------------------------
/tests/unittests/test_schedulers.py:
--------------------------------------------------------------------------------
1 | def test_NewBobScheduler():
2 |
3 |     from speechbrain.nnet.schedulers import NewBobScheduler
4 |
5 |     scheduler = NewBobScheduler(initial_value=0.8)
6 |
7 |     prev_lr, next_lr = scheduler(1.0)
8 |     assert prev_lr == 0.8
9 |     assert next_lr == 0.8
10 |
11 |     prev_lr, next_lr = scheduler(1.1)
12 |     assert next_lr == 0.4
13 |
14 |     prev_lr, next_lr = scheduler(0.5)
15 |     assert next_lr == 0.4
16 |
17 |     scheduler = NewBobScheduler(initial_value=0.8, patient=3)
18 |     prev_lr, next_lr = scheduler(1.0)
19 |     assert next_lr == 0.8
20 |
21 |     prev_lr, next_lr = scheduler(1.1)
22 |     prev_lr, next_lr = scheduler(1.1)
23 |     prev_lr, next_lr = scheduler(1.1)
24 |     assert next_lr == 0.8
25 |
26 |     prev_lr, next_lr = scheduler(1.1)
27 |     assert next_lr == 0.4
28 |     assert scheduler.current_patient == 3
29 |
--------------------------------------------------------------------------------
/tests/unittests/test_signal_processing.py:
--------------------------------------------------------------------------------
1 | import torch
2 |
3 |
4 | def test_normalize():
5 |
6 |     from speechbrain.processing.signal_processing import compute_amplitude
7 |     from speechbrain.processing.signal_processing import rescale
8 |     import random
9 |     import numpy as np
10 |
11 |     for scale in ["dB", "linear"]:
12 |         for amp_type in ["peak", "avg"]:
13 |             for test_vec in [
14 |                 torch.zeros((100)),
15 |                 torch.rand((10, 100)),
16 |                 torch.rand((10, 100, 5)),
17 |             ]:
18 |
19 |                 lengths = (
20 |                     test_vec.size(1)
21 |                     if len(test_vec.shape) > 1
22 |                     else test_vec.size(0)
23 |                 )
24 |                 amp = compute_amplitude(test_vec, lengths, amp_type, scale)
25 |                 scaled_back = rescale(
26 |                     random.random() * test_vec, lengths, amp, amp_type, scale,
27 |                 )
28 |                 np.testing.assert_array_almost_equal(
29 |                     scaled_back.numpy(), test_vec.numpy()
30 |                 )
31 |
--------------------------------------------------------------------------------
/tests/unittests/test_superpowers.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import pytest
3 |
4 |
5 | @pytest.mark.skipif(
6 |     sys.platform.startswith("win"),
7 |     reason="shell tools not necessarily available on Windows",
8 | )
9 | def test_run_shell():
10 |     from speechbrain.utils.superpowers import run_shell
11 |
12 |     out, err, code = run_shell("echo -n hello")
13 |     assert out.decode() == "hello"
14 |     assert err.decode() == ""
15 |     assert code == 0
16 |
17 |     with pytest.raises(OSError):
18 |         run_shell("false")
19 |
20 |     # This last run is just to check that a bytes
21 |     # sequence that is returned in an incompatible encoding (not UTF-8)
22 |     # does not cause an error.
23 |     output, _, _ = run_shell("echo -n pöö | iconv -t LATIN1")
24 |     assert output.decode("latin1") == "pöö"
25 |
--------------------------------------------------------------------------------