├── LICENSE ├── MIR_ST500 ├── README.md ├── hparams │ └── train_audio_ssl.yaml ├── huggingface_interface.py ├── prepare_benchmarks.py ├── train_audio_ssl.py └── utils.py ├── N20EMv2 ├── audio_only │ ├── README.md │ ├── extract_ssl_feats.py │ ├── hparams │ │ ├── extract_ssl_feats.yaml │ │ └── train_audio_ssl.yaml │ ├── huggingface_interface.py │ ├── prepare_benchmarks.py │ ├── prepare_n20emv2.py │ ├── train_audio_ssl.py │ └── utils.py ├── audio_visual │ ├── README.md │ ├── fusion.py │ ├── hparams │ │ ├── train_rca_a.yaml │ │ └── train_rca_av.yaml │ ├── prepare_musan.py │ ├── prepare_n20emv2.py │ ├── synthesis_noise.py │ ├── train_rca_a.py │ ├── train_rca_av.py │ └── utils.py └── video_only │ ├── README.md │ ├── decoder.py │ ├── extract_ssl_feats.py │ ├── fairseq_interface.py │ ├── hparams │ ├── extract_ssl_feats.yaml │ └── train_video_ssl.yaml │ ├── hubert.py │ ├── hubert_asr.py │ ├── hubert_dataset.py │ ├── hubert_pretraining.py │ ├── prepare_n20emv2.py │ ├── resnet.py │ ├── sequence_generator.py │ ├── train_video_ssl.py │ └── utils.py ├── README.md ├── assets ├── framework.png ├── noise_test.png ├── results.png └── results2.png ├── dependencies.txt ├── requirements.txt ├── setup.py └── speechbrain ├── __init__.py ├── alignment ├── __init__.py ├── aligner.py └── ctc_segmentation.py ├── core.py ├── dataio ├── __init__.py ├── batch.py ├── dataio.py ├── dataloader.py ├── dataset.py ├── encoder.py ├── iterators.py ├── legacy.py ├── preprocess.py ├── sampler.py └── wer.py ├── decoders ├── __init__.py ├── ctc.py ├── seq2seq.py └── transducer.py ├── lm ├── __init__.py ├── arpa.py ├── counting.py └── ngram.py ├── lobes ├── __init__.py ├── augment.py ├── beamform_multimic.py ├── features.py └── models │ ├── CRDNN.py │ ├── ContextNet.py │ ├── ECAPA_TDNN.py │ ├── ESPnetVGG.py │ ├── IMU_CRNN.py │ ├── MetricGAN.py │ ├── MetricGAN_U.py │ ├── QuasiRNN.py │ ├── RNNLM.py │ ├── VanillaNN.py │ ├── Xvector.py │ ├── __init__.py │ ├── conv_tasnet.py │ ├── convolution.py │ ├── decoder.py │ ├── dual_path.py │ ├── fairseq_wav2vec.py │ ├── hubert.py │ ├── hubert_asr.py │ ├── hubert_dataset.py │ ├── hubert_pretraining.py │ ├── huggingface_wav2vec.py │ ├── resnet.py │ ├── segan_model.py │ ├── sequence_generator.py │ ├── transformer │ ├── Conformer.py │ ├── Transformer.py │ ├── TransformerASR.py │ ├── TransformerLM.py │ ├── TransformerSE.py │ ├── TransformerST.py │ └── __init__.py │ └── utils.py ├── log-config.yaml ├── nnet ├── CNN.py ├── RNN.py ├── __init__.py ├── activations.py ├── attention.py ├── complex_networks │ ├── __init__.py │ ├── c_CNN.py │ ├── c_RNN.py │ ├── c_linear.py │ ├── c_normalization.py │ └── c_ops.py ├── containers.py ├── dropout.py ├── embedding.py ├── linear.py ├── loss │ ├── __init__.py │ ├── guidedattn_loss.py │ ├── si_snr_loss.py │ ├── stoi_loss.py │ └── transducer_loss.py ├── losses.py ├── normalization.py ├── pooling.py ├── quaternion_networks │ ├── __init__.py │ ├── q_CNN.py │ ├── q_RNN.py │ ├── q_linear.py │ ├── q_normalization.py │ └── q_ops.py ├── schedulers.py └── transducer │ ├── __init__.py │ └── transducer_joint.py ├── pretrained ├── __init__.py ├── fetching.py └── interfaces.py ├── processing ├── NMF.py ├── PLDA_LDA.py ├── __init__.py ├── decomposition.py ├── diarization.py ├── features.py ├── multi_mic.py ├── signal_processing.py └── speech_augmentation.py ├── tokenizers ├── SentencePiece.py └── __init__.py ├── utils ├── Accuracy.py ├── DER.py ├── __init__.py ├── bleu.py ├── callchains.py ├── checkpoints.py ├── data_pipeline.py ├── data_utils.py ├── depgraph.py ├── distributed.py ├── edit_distance.py ├── epoch_loop.py ├── hpopt.py ├── logger.py ├── metric_stats.py ├── parameter_transfer.py ├── superpowers.py ├── torch_audio_backend.py └── train_logger.py └── version.txt /MIR_ST500/README.md: -------------------------------------------------------------------------------- 1 | # Audio-only Automatic Music Transcription with MIR-ST500 dataset 2 | This sub-project contains recipes for trianing benchmark AMT system using MIR-ST500 dataset. 3 | 4 | ## Prerequisites 5 | 1. Before running our scripts, you need to download, preprocess and save the datasets properly. For polyphonic singing recordings, we use [spleeter](https://github.com/deezer/spleeter) to extract the vocal part. Besides, to meet the requirements of self-supervised-learning models in our project, we resample the audio data into 16 kHz. We provide sample code in `prepare_benchmarks.py`. For the annotations, we save them to a json file. 6 | 7 | The file organization for MIR-ST500 should be: 8 | ``` 9 | /path/to/MIR_ST500 10 | ├── wav16kHz 11 | ├── train 12 | ├── song1 13 | ├── vocals.wav 14 | ├── song2 15 | ├── ... 16 | ├── test 17 | ├── Annotations.json 18 | ``` 19 | 20 | The file organization for ISMIR and TONAS should be: 21 | ``` 22 | /path/to/ISMIR or /path/to/TONAS 23 | ├── wav16kHz 24 | ├── song1 25 | ├── vocals.wav 26 | ├── song2 27 | ├── ... 28 | ├── Annotations.json 29 | ``` 30 | 31 | 32 | 2. Prepare benchmark AMT for singing datasets including MIR-ST500, TONAS, and ISMIR2014, run: 33 | ``` 34 | python prepare_benchmarks.py --duration --frame_rate 49.8 --mir_st500 /path/to/MIR_ST500 --ismir /path/to/ISMIR2014 --tonas /path/to/TONAS 35 | ``` 36 | The option `--duration` refers to the length of utterances during the training. To parallelize the training, we split the whole song into short utterances during the training. The evaluation is conducted on the whole song. As a default, we use `5` s in our paper. The option `--frame_rate` refers to the frame rate of frame-level annotations. As a default, we use `49.8` fps, which is the frame rate of wav2vec 2.0 features. 37 | 38 | After running this script, the file organization for MIR-ST500 should be: 39 | ``` 40 | /path/to/MIR_ST500 41 | ├── wav16kHz 42 | ├── train 43 | ├── song1 44 | ├── vocals.wav 45 | ├── annotation.json 46 | ├── frame_anno.npy 47 | ├── song2 48 | ├── ... 49 | ├── test 50 | ├── Annotations.json 51 | ``` 52 | 53 | The file organization for ISMIR and TONAS should be: 54 | ``` 55 | /path/to/ISMIR or /path/to/TONAS 56 | ├── wav16kHz 57 | ├── song1 58 | ├── vocals.wav 59 | ├── annotation.json 60 | ├── frame_anno.npy 61 | ├── song2 62 | ├── ... 63 | ├── Annotations.json 64 | ``` 65 | 66 | The resulted csv files are save in the same root folder: 67 | ``` 68 | ├── data 69 | ├── dur_ 70 | ├── mir_st500_train.csv 71 | ├── mir_st500_test.csv 72 | ├── ismir2014.csv 73 | ├── tonas.csv 74 | ├── prepare_benchmarks.py 75 | ``` 76 | 77 | ## How to run 78 | We provide basic runnning scripts for those who intend to follow our research. You can change the hyperparameters or even the types of self-supervised-learning (SSL) models in your own project. To reproduce `ours variant 1` in our paper, run: 79 | ``` 80 | CUDA_VISIBLE_DEVICES=0,1 python train_audio_ssl.py hparams/train_audio_ssl.yaml --data_parallel_backend --data_folder /path/to/MIR_ST500 --attempt 1 --dur_threshold 5 --linear_prob_epochs 2 --number_of_epochs 10 --ssl_model wav2vec2-large-lv60 81 | ``` 82 | The option `--linear_prob_epochs` refers to the number of epochs for linear probing in our paper. The option `--ssl_model` refers to the self-supervised-learning (SSL) model we used. Although we use data parallel (DP) in our experiments, we also provide distributed data parallel (DDP) version (remember to change the `batch_size` to avoid OOM): 83 | ``` 84 | CUDA_VISIBLE_DEVICES=0,1 python -m torch.distributed.launch --nproc_per_node=2 train_audio_ssl.py hparams/train_audio_ssl.yaml --distributed_launch --distributed_backend='nccl' --find_unused_parameters --data_folder /path/to/MIR_ST500 --attempt 1 --dur_threshold 5 --linear_prob_epochs 2 --number_of_epochs 10 --ssl_model wav2vec2-large-lv60 85 | ``` 86 | 87 | NOTE: For SSL models, we used `wav2vec2-large-lv60` as the wav2vec 2.0 pretrained on speech data and `wav2vec2-large-960h-lv60-self` as the wav2vec 2.0 pretrained and finetuned on speech data. To facilitate the follow-up research, our repo supports the following SSL models: [wav2vec2](https://arxiv.org/abs/2006.11477), [HuBERT](https://arxiv.org/abs/2106.07447), [data2vec](https://arxiv.org/abs/2202.03555), [WavLM](https://arxiv.org/abs/2110.13900). Please find the checkpoint name in [Huggingface](https://huggingface.co/models). 88 | 89 | ## Results 90 | We provide our trained AMT model of `ours 1`[[model link](https://drive.google.com/drive/folders/18IvMt8vrtZewCjCSy6DTPfZzhJw4SI95?usp=sharing)] in the paper. 91 |

92 | 93 |

-------------------------------------------------------------------------------- /MIR_ST500/hparams/train_audio_ssl.yaml: -------------------------------------------------------------------------------- 1 | # ################################ 2 | # Model: wav2vec2 + Linear 3 | # Authors: Xiangming Gu 2022 4 | # ################################ 5 | 6 | # Seed needs to be set at top of yaml, before objects with parameters are made 7 | seed: 1986 8 | __set_seed: !apply:torch.manual_seed [!ref ] 9 | attempt: 1 10 | dur_threshold: 5 11 | ssl_model: wav2vec2-large-lv60 12 | output_folder: !ref results//train_audio_ssl_dur_attempt/ 13 | save_folder: !ref /save 14 | csv_folder: !ref data/dur_s 15 | train_log: !ref /train_log.txt 16 | 17 | # URL for the biggest Fairseq english data2vec model. 18 | wav2vec2_hub: !ref facebook/ 19 | wav2vec2_local: !ref ssl_model/ 20 | 21 | # Data files 22 | data_folder: !PLACEHOLDER # e,g./path/to/N20EMv2 23 | # noise/ris dataset will automatically be downloaded 24 | data_folder_rirs: !ref 25 | ckpt_interval_minutes: 25 # save checkpoint every N min 26 | train_csv: !ref /mir_st500_train.csv 27 | valid_csv: !ref /mir_st500_valid.csv 28 | test_csv: 29 | - !ref /mir_st500_test.csv 30 | - !ref /tonas.csv 31 | - !ref /ismir2014.csv 32 | 33 | noise_type: babble 34 | snr_db: -10 35 | add_noise: False 36 | # Training parameters 37 | number_of_epochs: 10 38 | lr: 0.0003 39 | lr_wav2vec: 0.00005 40 | sorting: ascending 41 | auto_mix_prec: False 42 | sample_rate: 16000 43 | frame_rate: 49.8 44 | linear_prob_epochs: 2 45 | pretrain: False 46 | pretrain_folder: ../pretrain_model 47 | save_model: False 48 | save_model_folder: ../save_model 49 | 50 | # Evaluating parameters 51 | onset_threshold: 0.4 52 | offset_threshold: 0.5 53 | onset_tolerance: 0.05 54 | pitch_tolerance: 50 55 | 56 | # With data_parallel batch_size is split into N jobs 57 | # With DDP batch_size is multiplied by N jobs 58 | # Must be 3 per GPU to fit 32GB of VRAM 59 | batch_size: 8 60 | test_batch_size: 1 61 | num_workers: 8 62 | 63 | # Dataloader options 64 | train_dataloader_opts: 65 | batch_size: !ref 66 | num_workers: !ref 67 | 68 | valid_dataloader_opts: 69 | batch_size: !ref 70 | num_workers: !ref 71 | 72 | test_dataloader_opts: 73 | batch_size: !ref 74 | num_workers: !ref 75 | 76 | # Model parameters 77 | freeze_wav2vec: False 78 | 79 | # Outputs 80 | pitch_octave_num: 4 81 | pitch_class_num: 12 82 | feat_dim: 1024 83 | output_neurons: 20 # 2+pitch_octave+pitch_class+2 84 | 85 | # 86 | # Functions and classes 87 | # 88 | epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter 89 | limit: !ref 90 | 91 | # augmentation: !new:speechbrain.lobes.augment.TimeDomainSpecAugment 92 | # sample_rate: !ref 93 | # speeds: [95, 100, 105] 94 | 95 | wav2vec2: !new:huggingface_interface.HuggingFaceWav2Vec2 96 | source: !ref 97 | output_norm: True 98 | freeze: !ref 99 | save_path: !ref # !ref /wav2vec2_checkpoint 100 | 101 | ##### 102 | # Uncomment this block if you prefer to use a Fairseq pretrained model instead 103 | # of a HuggingFace one. Here, we provide an URL that is obtained from the 104 | # Fairseq github for the multilingual XLSR. 105 | # 106 | # wav2vec2_url: https://dl.fbaipublicfiles.com/fairseq/wav2vec/w2v_large_lv_fsh_swbd_cv.pt 107 | # wav2vec2: !new:nets.fairseq_interface.FairseqWav2Vec2 108 | # pretrained_path: !ref 109 | # output_norm: True 110 | # freeze: False 111 | # save_path: !ref 112 | 113 | model: !new:speechbrain.nnet.linear.Linear 114 | input_size: !ref 115 | n_neurons: !ref 116 | 117 | modules: 118 | wav2vec2: !ref 119 | model: !ref 120 | 121 | log_softmax: !new:speechbrain.nnet.activations.Softmax 122 | apply_log: True 123 | 124 | onset_positive_weight: 15.0 125 | offset_positive_weight: 1.0 126 | onset_criterion: !name:speechbrain.nnet.losses.bce_loss 127 | reduction: mean 128 | allowed_len_diff: 3 129 | label_smoothing: 0.0 130 | 131 | offset_criterion: !name:speechbrain.nnet.losses.bce_loss 132 | reduction: mean 133 | allowed_len_diff: 3 134 | label_smoothing: 0.0 135 | 136 | octave_criterion: !name:speechbrain.nnet.losses.nll_loss 137 | reduction: mean 138 | allowed_len_diff: 3 139 | label_smoothing: 0.0 140 | 141 | pitch_criterion: !name:speechbrain.nnet.losses.nll_loss 142 | reduction: mean 143 | allowed_len_diff: 3 144 | label_smoothing: 0.0 145 | 146 | model_opt_class: !name:torch.optim.Adadelta 147 | lr: !ref 148 | rho: 0.95 149 | eps: 1.e-8 150 | 151 | wav2vec_opt_class: !name:torch.optim.Adam 152 | lr: !ref 153 | 154 | lr_annealing_model: !new:speechbrain.nnet.schedulers.NewBobScheduler 155 | initial_value: !ref 156 | improvement_threshold: 0.0025 157 | annealing_factor: 0.8 158 | patient: 0 159 | 160 | lr_annealing_wav2vec: !new:speechbrain.nnet.schedulers.NewBobScheduler 161 | initial_value: !ref 162 | improvement_threshold: 0.0025 163 | annealing_factor: 0.9 164 | patient: 0 165 | 166 | 167 | checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer 168 | checkpoints_dir: !ref 169 | recoverables: 170 | wav2vec2: !ref 171 | model: !ref 172 | scheduler_model: !ref 173 | scheduler_wav2vec: !ref 174 | counter: !ref 175 | 176 | train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger 177 | save_file: !ref 178 | precision: 3 179 | 180 | onset_stats: !name:speechbrain.utils.metric_stats.MetricStats 181 | metric: !name:speechbrain.nnet.losses.bce_loss 182 | reduction: batch 183 | allowed_len_diff: 3 184 | label_smoothing: 0.0 185 | 186 | offset_stats: !name:speechbrain.utils.metric_stats.MetricStats 187 | metric: !name:speechbrain.nnet.losses.bce_loss 188 | reduction: batch 189 | allowed_len_diff: 3 190 | label_smoothing: 0.0 191 | 192 | octave_stats: !name:speechbrain.utils.metric_stats.MetricStats 193 | metric: !name:speechbrain.nnet.losses.nll_loss 194 | reduction: batch 195 | allowed_len_diff: 3 196 | label_smoothing: 0.0 197 | 198 | pitch_stats: !name:speechbrain.utils.metric_stats.MetricStats 199 | metric: !name:speechbrain.nnet.losses.nll_loss 200 | reduction: batch 201 | allowed_len_diff: 3 202 | label_smoothing: 0.0 -------------------------------------------------------------------------------- /N20EMv2/audio_only/README.md: -------------------------------------------------------------------------------- 1 | # Audio-only Automatic Music Transcription with N20EMv2 dataset 2 | This sub-project contains recipes for trianing audio-only AMT system using N20EMv2 dataset. 3 | 4 | ## Prerequisites 5 | 1. Before running our scripts, you need to download, preprocess and save the datasets properly. For polyphonic singing recordings, we use [spleeter](https://github.com/deezer/spleeter) to extract the vocal part. Besides, to meet the requirements of self-supervised-learning models in our project, we resample the audio data into 16 kHz. We provide sample code in `prepare_benchmarks.py`. For the annotations, we save them to a json file. 6 | 7 | The file organization for MIR-ST500 should be: 8 | ``` 9 | /path/to/MIR_ST500 10 | ├── wav16kHz 11 | ├── train 12 | ├── song1 13 | ├── vocals.wav 14 | ├── song2 15 | ├── ... 16 | ├── test 17 | ├── Annotations.json 18 | ``` 19 | 20 | The file organization for ISMIR and TONAS should be: 21 | ``` 22 | /path/to/ISMIR or /path/to/TONAS 23 | ├── wav16kHz 24 | ├── song1 25 | ├── vocals.wav 26 | ├── song2 27 | ├── ... 28 | ├── Annotations.json 29 | ``` 30 | 31 | The file organization for N20EMv2 should be: 32 | ``` 33 | /path/to/N20EMv2 34 | ├── data 35 | ├── song1 36 | ├── vocals.wav 37 | ├── video_50fps.npy 38 | ├── song2 39 | ├── ... 40 | ├── annotations.json 41 | ``` 42 | 43 | 44 | 2. Prepare benchmark AMT for singing datasets including MIR-ST500, TONAS, and ISMIR2014, run: 45 | ``` 46 | python prepare_benchmarks.py --duration --frame_rate 49.8 --mir_st500 /path/to/MIR_ST500 --ismir /path/to/ISMIR2014 --tonas /path/to/TONAS 47 | ``` 48 | The option `--duration` refers to the length of utterances during the training. To parallelize the training, we split the whole song into short utterances during the training. The evaluation is conducted on the whole song. As a default, we use `5` s in our paper. The option `--frame_rate` refers to the frame rate of frame-level annotations. As a default, we use `49.8` fps, which is the frame rate of wav2vec 2.0 features. 49 | 50 | After running this script, the file organization for MIR-ST500 should be: 51 | ``` 52 | /path/to/MIR_ST500 53 | ├── wav16kHz 54 | ├── train 55 | ├── song1 56 | ├── vocals.wav 57 | ├── annotation.json 58 | ├── frame_anno.npy 59 | ├── song2 60 | ├── ... 61 | ├── test 62 | ├── Annotations.json 63 | ``` 64 | 65 | The file organization for ISMIR and TONAS should be: 66 | ``` 67 | /path/to/ISMIR or /path/to/TONAS 68 | ├── wav16kHz 69 | ├── song1 70 | ├── vocals.wav 71 | ├── annotation.json 72 | ├── frame_anno.npy 73 | ├── song2 74 | ├── ... 75 | ├── Annotations.json 76 | ``` 77 | 78 | 3. Prepare N20EMv2 dataset, run: 79 | ``` 80 | python prepare_n20emv2.py --duration --frame_rate 49.8 --n20emv2 /path/to/n20emv2 81 | ``` 82 | 83 | After running this script, the file organization for N20EMv2 should be: 84 | ``` 85 | /path/to/N20EMv2 86 | ├── data 87 | ├── song1 88 | ├── vocals.wav 89 | ├── video_50fps.npy 90 | ├── note_anno.json 91 | ├── audio_anno 92 | ├── 49.8fps 93 | ├── audio_frame_anno.npy 94 | ├── song2 95 | ├── ... 96 | ├── annotations.json 97 | ``` 98 | 99 | The resulted csv files are save in the same root folder: 100 | ``` 101 | ├── data 102 | ├── dur_ 103 | ├── mir_st500_train.csv 104 | ├── mir_st500_test.csv 105 | ├── n20emv2_train.csv 106 | ├── n20emv2_valid.csv 107 | ├── n20emv2_test.csv 108 | ├── ismir2014.csv 109 | ├── tonas.csv 110 | ├── mix_train.csv 111 | ├── prepare_benchmarks.py 112 | ├── prepare_n20emv2.py 113 | ``` 114 | 115 | ## How to run 116 | We provide basic runnning scripts for those who intend to follow our research. You can change the hyperparameters or even the types of self-supervised-learning (SSL) models in your own project. To reproduce `ours variant 2` in our paper, run: 117 | ``` 118 | CUDA_VISIBLE_DEVICES=0,1 python train_audio_ssl.py hparams/train_audio_ssl.yaml --data_parallel_backend --mix_train True --data_folder /path/to/N20EMv2 --attempt 1 --dur_threshold 5 --linear_prob_epochs 2 --number_of_epochs 10 --ssl_model wav2vec2-large-lv60 119 | ``` 120 | The option `--mix_train` refers to whether to mix the training data of MIR_ST500 and N20EMv2. If `False`, only N20EMv2 is used during training. The option `--linear_prob_epochs` refers to the number of epochs for linear probing in our paper. The option `--ssl_model` refers to the self-supervised-learning (SSL) model we used. Although we use data parallel (DP) in our experiments, we also provide distributed data parallel (DDP) version (remember to change the `batch_size` to avoid OOM): 121 | ``` 122 | CUDA_VISIBLE_DEVICES=0,1 python -m torch.distributed.launch --nproc_per_node=2 train_audio_ssl.py hparams/train_audio_ssl.yaml --distributed_launch --distributed_backend='nccl' --find_unused_parameters --mix_train True --data_folder /path/to/N20EMv2 --attempt 1 --dur_threshold 5 --linear_prob_epochs 2 --number_of_epochs 10 --ssl_model wav2vec2-large-lv60 123 | ``` 124 | 125 | NOTE: For SSL models, we used `wav2vec2-large-lv60` as the wav2vec 2.0 pretrained on speech data and `wav2vec2-large-960h-lv60-self` as the wav2vec 2.0 pretrained and finetuned on speech data. To facilitate the follow-up research, our repo supports the following SSL models: [wav2vec2](https://arxiv.org/abs/2006.11477), [HuBERT](https://arxiv.org/abs/2106.07447), [data2vec](https://arxiv.org/abs/2202.03555), [WavLM](https://arxiv.org/abs/2110.13900). Please find the checkpoint name in [Huggingface](https://huggingface.co/models). 126 | 127 | ## Results 128 | We provide our trained AMT model of `ours 2`[[model link](https://drive.google.com/drive/folders/1FZFWf0JXDs2Esmu9GZqxmp5Wev5AclWU?usp=share_link)] in the paper. 129 | 130 | Results on Benchmark datasets for AMT task: 131 |

132 | 133 |

134 | 135 | Results on N20EMv2 dataset for AMT task: 136 |

137 | 138 |

139 | -------------------------------------------------------------------------------- /N20EMv2/audio_only/hparams/extract_ssl_feats.yaml: -------------------------------------------------------------------------------- 1 | # ################################ 2 | # Model: wav2vec2 + Linear 3 | # Authors: Xiangming Gu 2022 4 | # ################################ 5 | 6 | # Seed needs to be set at top of yaml, before objects with parameters are made 7 | seed: 1986 8 | __set_seed: !apply:torch.manual_seed [!ref ] 9 | attempt: 1 10 | dur_threshold: 5 11 | mix_train: True 12 | ssl_model: wav2vec2-large-lv60 13 | output_folder: !ref results/_mix/train_audio_ssl_dur_attempt/ 14 | save_folder: !ref /save 15 | csv_folder: !ref data/dur_s 16 | train_log: !ref /train_log.txt 17 | 18 | # URL for the biggest Fairseq english data2vec model. 19 | wav2vec2_hub: !ref facebook/ 20 | wav2vec2_local: !ref ssl_model/ 21 | 22 | # Data files 23 | data_folder: !PLACEHOLDER # e,g./path/to/N20EMv2 24 | # noise/ris dataset will automatically be downloaded 25 | data_folder_rirs: !ref 26 | ckpt_interval_minutes: 25 # save checkpoint every N min 27 | train_csv: !ref /n20em_train.csv 28 | mix_train_csv: !ref /mix_train.csv 29 | valid_csv: !ref /n20em_valid.csv 30 | test_csv: 31 | - !ref /n20em_test.csv 32 | - !ref /n20em_valid.csv 33 | - !ref /n20em_valid.csv 34 | 35 | noise_type: babble 36 | snr_db: -10 37 | add_noise: False 38 | # Training parameters 39 | number_of_epochs: 10 40 | lr: 0.0003 41 | lr_wav2vec: 0.00005 42 | sorting: ascending 43 | auto_mix_prec: False 44 | sample_rate: 16000 45 | frame_rate: 49.8 46 | linear_prob_epochs: 2 47 | pretrain: False 48 | pretrain_folder: ../pretrain_model 49 | save_model: True 50 | save_model_folder: ../save_model 51 | 52 | # Evaluating parameters 53 | onset_threshold: 0.4 54 | offset_threshold: 0.5 55 | onset_tolerance: 0.05 56 | pitch_tolerance: 50 57 | 58 | # With data_parallel batch_size is split into N jobs 59 | # With DDP batch_size is multiplied by N jobs 60 | # Must be 3 per GPU to fit 32GB of VRAM 61 | batch_size: 8 62 | test_batch_size: 1 63 | num_workers: 8 64 | 65 | # Dataloader options 66 | train_dataloader_opts: 67 | batch_size: !ref 68 | num_workers: !ref 69 | 70 | valid_dataloader_opts: 71 | batch_size: !ref 72 | num_workers: !ref 73 | 74 | test_dataloader_opts: 75 | batch_size: !ref 76 | num_workers: !ref 77 | 78 | # Model parameters 79 | freeze_wav2vec: False 80 | 81 | # Outputs 82 | pitch_octave_num: 4 83 | pitch_class_num: 12 84 | feat_dim: 1024 85 | output_neurons: 20 # 2+pitch_octave+pitch_class+2 86 | 87 | # 88 | # Functions and classes 89 | # 90 | epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter 91 | limit: !ref 92 | 93 | # augmentation: !new:speechbrain.lobes.augment.TimeDomainSpecAugment 94 | # sample_rate: !ref 95 | # speeds: [95, 100, 105] 96 | 97 | wav2vec2: !new:huggingface_interface.HuggingFaceWav2Vec2 98 | source: !ref 99 | output_norm: True 100 | freeze: !ref 101 | save_path: !ref # !ref /wav2vec2_checkpoint 102 | 103 | ##### 104 | # Uncomment this block if you prefer to use a Fairseq pretrained model instead 105 | # of a HuggingFace one. Here, we provide an URL that is obtained from the 106 | # Fairseq github for the multilingual XLSR. 107 | # 108 | # wav2vec2_url: https://dl.fbaipublicfiles.com/fairseq/wav2vec/w2v_large_lv_fsh_swbd_cv.pt 109 | # wav2vec2: !new:nets.fairseq_interface.FairseqWav2Vec2 110 | # pretrained_path: !ref 111 | # output_norm: True 112 | # freeze: False 113 | # save_path: !ref 114 | 115 | model: !new:speechbrain.nnet.linear.Linear 116 | input_size: !ref 117 | n_neurons: !ref 118 | 119 | modules: 120 | wav2vec2: !ref 121 | model: !ref 122 | 123 | log_softmax: !new:speechbrain.nnet.activations.Softmax 124 | apply_log: True 125 | 126 | onset_positive_weight: 15.0 127 | offset_positive_weight: 1.0 128 | onset_criterion: !name:speechbrain.nnet.losses.bce_loss 129 | reduction: mean 130 | allowed_len_diff: 3 131 | label_smoothing: 0.0 132 | 133 | offset_criterion: !name:speechbrain.nnet.losses.bce_loss 134 | reduction: mean 135 | allowed_len_diff: 3 136 | label_smoothing: 0.0 137 | 138 | octave_criterion: !name:speechbrain.nnet.losses.nll_loss 139 | reduction: mean 140 | allowed_len_diff: 3 141 | label_smoothing: 0.0 142 | 143 | pitch_criterion: !name:speechbrain.nnet.losses.nll_loss 144 | reduction: mean 145 | allowed_len_diff: 3 146 | label_smoothing: 0.0 147 | 148 | model_opt_class: !name:torch.optim.Adadelta 149 | lr: !ref 150 | rho: 0.95 151 | eps: 1.e-8 152 | 153 | wav2vec_opt_class: !name:torch.optim.Adam 154 | lr: !ref 155 | 156 | lr_annealing_model: !new:speechbrain.nnet.schedulers.NewBobScheduler 157 | initial_value: !ref 158 | improvement_threshold: 0.0025 159 | annealing_factor: 0.8 160 | patient: 0 161 | 162 | lr_annealing_wav2vec: !new:speechbrain.nnet.schedulers.NewBobScheduler 163 | initial_value: !ref 164 | improvement_threshold: 0.0025 165 | annealing_factor: 0.9 166 | patient: 0 167 | 168 | 169 | checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer 170 | checkpoints_dir: !ref 171 | recoverables: 172 | wav2vec2: !ref 173 | model: !ref 174 | scheduler_model: !ref 175 | scheduler_wav2vec: !ref 176 | counter: !ref 177 | 178 | train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger 179 | save_file: !ref 180 | precision: 3 181 | 182 | onset_stats: !name:speechbrain.utils.metric_stats.MetricStats 183 | metric: !name:speechbrain.nnet.losses.bce_loss 184 | reduction: batch 185 | allowed_len_diff: 3 186 | label_smoothing: 0.0 187 | 188 | offset_stats: !name:speechbrain.utils.metric_stats.MetricStats 189 | metric: !name:speechbrain.nnet.losses.bce_loss 190 | reduction: batch 191 | allowed_len_diff: 3 192 | label_smoothing: 0.0 193 | 194 | octave_stats: !name:speechbrain.utils.metric_stats.MetricStats 195 | metric: !name:speechbrain.nnet.losses.nll_loss 196 | reduction: batch 197 | allowed_len_diff: 3 198 | label_smoothing: 0.0 199 | 200 | pitch_stats: !name:speechbrain.utils.metric_stats.MetricStats 201 | metric: !name:speechbrain.nnet.losses.nll_loss 202 | reduction: batch 203 | allowed_len_diff: 3 204 | label_smoothing: 0.0 -------------------------------------------------------------------------------- /N20EMv2/audio_only/prepare_n20emv2.py: -------------------------------------------------------------------------------- 1 | """ 2 | Data preparation for datasets of automatic music transcription 3 | 4 | Authors 5 | * Xiangming Gu 2022 6 | """ 7 | import os 8 | import csv 9 | import json 10 | import argparse 11 | import torchaudio 12 | import numpy as np 13 | from tqdm import tqdm 14 | from utils import note2frame 15 | from speechbrain.dataio.dataio import merge_csvs 16 | SAMPLERATE = 16000 17 | 18 | 19 | def prepare_frame_anno(folder, frame_rate=49.8): 20 | """ 21 | This function processes the frame-level annotations for each song 22 | """ 23 | json_file = os.path.join(folder, "annotations.json") 24 | folder_data = os.path.join(folder, "data") 25 | # open ground truth data 26 | with open(json_file) as f: 27 | annotations = json.load(f) 28 | f.close() 29 | # traverse the whole dataset 30 | for entry in tqdm(annotations.keys()): 31 | anno = annotations[entry]["midi"] 32 | json_path = os.path.join(folder_data, entry, "note_anno.json") 33 | # save json file 34 | with open(json_path, "w") as f: 35 | json.dump(anno, f) 36 | f.close() 37 | # save frame-level annotations 38 | wav_file = os.path.join(folder_data, entry, "vocals.wav") 39 | audio, fs = torchaudio.load(wav_file) 40 | assert fs == SAMPLERATE 41 | assert audio.shape[0] == 1 42 | duration = audio.shape[1] / SAMPLERATE 43 | length = round(duration * frame_rate) 44 | frame_label = note2frame(gt_data=anno, length=length, frame_size=1/frame_rate) 45 | # print(length) 46 | assert frame_label.shape[0] == length 47 | # save frame-level annotation 48 | os.makedirs(os.path.join(folder_data, entry, "audio_anno", str(frame_rate) + "fps"), exist_ok=True) 49 | npy_path = os.path.join(folder_data, entry, "audio_anno", str(frame_rate) + "fps", "audio_frame_anno.npy") 50 | np.save(npy_path, frame_label) 51 | 52 | 53 | def prepare_csv_n20emv2(folder, csv_folder="./data", frame_rate=49.8, dur_thrd=5): 54 | """ 55 | This function creates csv files for speechbrain to process, dur_thrd is the threshold for the duration 56 | """ 57 | 58 | # initialize the csv lines 59 | csv_train_lines = [["ID", "duration", "wav", "utter_id", "utter_num", "frame_anno", "song_anno"]] 60 | csv_valid_lines = [["ID", "duration", "wav", "utter_id", "utter_num", "frame_anno", "song_anno"]] 61 | csv_test_lines = [["ID", "duration", "wav", "utter_id", "utter_num", "frame_anno", "song_anno"]] 62 | # load the annotations 63 | json_file = os.path.join(folder, "annotations.json") 64 | folder_data = os.path.join(folder, "data") 65 | # open ground truth data 66 | with open(json_file) as f: 67 | annotations = json.load(f) 68 | f.close() 69 | # traverse the whole dataset 70 | for entry in tqdm(annotations.keys()): 71 | split = annotations[entry]["split"] 72 | audio_path = os.path.join(folder_data, entry, "vocals.wav") 73 | anno_path = os.path.join(folder_data, entry, "audio_anno", str(frame_rate) + "fps", "audio_frame_anno.npy") 74 | song_anno_path = os.path.join(folder_data, entry, "note_anno.json") 75 | 76 | # load the audio 77 | audio, fs = torchaudio.load(audio_path) # audio: [1, N] for mono or [2, N] for stero 78 | assert fs == SAMPLERATE 79 | duration = audio.shape[1] / SAMPLERATE 80 | 81 | # split the whole song into utterances 82 | utter_num = round(duration / dur_thrd) 83 | for i in range(1, utter_num+1): 84 | ID = entry + "_" + str(i) 85 | if i == utter_num: 86 | dur = duration - (utter_num - 1) * dur_thrd 87 | assert 0 < dur <= dur_thrd * 3 / 2 88 | else: 89 | dur = dur_thrd 90 | csv_line = [ 91 | ID, str(dur), audio_path, str(i), str(utter_num), anno_path, song_anno_path, 92 | ] 93 | if split == "train": 94 | csv_train_lines.append(csv_line) 95 | elif split == "valid": 96 | csv_valid_lines.append(csv_line) 97 | elif split == "test": 98 | csv_test_lines.append(csv_line) 99 | # save csv files 100 | save_folder = os.path.join(csv_folder, "dur_" + str(dur_thrd) + "s") 101 | os.makedirs(save_folder, exist_ok=True) 102 | save_train_path = os.path.join(save_folder, "n20em_train.csv") 103 | save_valid_path = os.path.join(save_folder, "n20em_valid.csv") 104 | save_test_path = os.path.join(save_folder, "n20em_test.csv") 105 | # train 106 | with open(save_train_path, mode="w") as csv_f: 107 | csv_writer = csv.writer(csv_f, delimiter=",", quotechar='"', quoting=csv.QUOTE_MINIMAL) 108 | for line in csv_train_lines: 109 | csv_writer.writerow(line) 110 | # valid 111 | with open(save_valid_path, mode="w") as csv_f: 112 | csv_writer = csv.writer(csv_f, delimiter=",", quotechar='"', quoting=csv.QUOTE_MINIMAL) 113 | for line in csv_valid_lines: 114 | csv_writer.writerow(line) 115 | # test 116 | with open(save_test_path, mode="w") as csv_f: 117 | csv_writer = csv.writer(csv_f, delimiter=",", quotechar='"', quoting=csv.QUOTE_MINIMAL) 118 | for line in csv_test_lines: 119 | csv_writer.writerow(line) 120 | 121 | 122 | if __name__ == "__main__": 123 | parser = argparse.ArgumentParser() 124 | parser.add_argument("--duration", type=int, default=5, help="the threshold for duration") 125 | parser.add_argument("--frame_rate", type=float, default=49.8, help="The frame-rate for SSL models") 126 | parser.add_argument("--n20emv2", type=str, default="/path/to/N20EMv2", help="The path to save N20EMv2 dataset") 127 | args = parser.parse_args() 128 | 129 | prepare_frame_anno(folder=args.n20emv2, frame_rate=args.frame_rate) 130 | prepare_csv_n20emv2(folder=args.n20emv2, frame_rate=args.frame_rate, dur_thrd=args.duration) 131 | save_folder = os.path.join("./data", "dur_" + str(args.duration) + "s") 132 | 133 | merge_files = ["mir_st500_train.csv", "n20em_train.csv"] 134 | merge_name = "mix_train.csv" 135 | merge_csvs( 136 | data_folder=save_folder, csv_lst=merge_files, merged_csv=merge_name, 137 | ) -------------------------------------------------------------------------------- /N20EMv2/audio_visual/README.md: -------------------------------------------------------------------------------- 1 | # Audio-Visual Automatic Music Transcription with N20EMv2 dataset 2 | This sub-project contains recipes for training audio-visual AMT system using N20EMv2 dataset. 3 | 4 | ## Prerequisites 5 | Before running our scripts, you need to simulate the noisy environments by synthesizing the noisy data. We provide the synthesized noisy data in N20EMv2. If you want to obtain these data from scratch, you can follow the procedure. 6 | 7 | Firstly, you should download and decompress the [MUSAN](https://www.openslr.org/17/). Assmue you save the data in `/path/to/MUSAN`, which contains the following folders `./music, ./speech, ./noise`. We borrow the code from [AVHuBERT](https://github.com/facebookresearch/av_hubert/tree/main/avhubert/preparation) to process the MUSAN dataset. Run the following code: 8 | ``` 9 | python prepare_musan.py --musan /path/to/MUSAN --nshard --slurm_partition 10 | ``` 11 | This will: (1) split raw audios into 10-second clips, (2) generate babble noise from MUSAN speech audio, (3) count number of frames per clip. The whole data will be sharded into ${nshard} parts and each job processes one part. 12 | 13 | Next, we synthesize the noisy data considering four noise types, including `accomp`, `babble`, `white`, `natural`, and five noise levels, including `-10dB`, `-5dB`, `0dB`, `5dB`, `10dB`. Run the following code: 14 | ``` 15 | python synthesis_noise.py --musan /path/to/MUSAN --n20emv2 /path/to/N20EMv2 16 | ``` 17 | 18 | The file organization for N20EMv2 should be: 19 | ``` 20 | /path/to/N20EMv2 21 | ├── data 22 | ├── song1 23 | ├── vocals.wav 24 | ├── accomp.wav 25 | ├── video_50fps.npy 26 | ├── note_anno.json 27 | ├── audio_anno 28 | ├── video_anno 29 | ├── noise_data 30 | ├── accomp 31 | ├── SNR_-10dB.wav 32 | ├── SNR_-5dB.wav 33 | ├── SNR_0dB.wav 34 | ├── SNR_5dB.wav 35 | ├── SNR_10dB.wav 36 | ├── babble 37 | ├── ... 38 | ├── natural 39 | ├── ... 40 | ├── white 41 | ├── ... 42 | ├── song2 43 | ├── ... 44 | ├── annotations.json 45 | ``` 46 | 47 | 48 | ## How to run 49 | 1. Firstly, we train our audio-only AMT system (check `N20EMv2/audio_only/README.md`) and video-only AMT system (check `N20EMv2/video_only/README.md`). Make sure the model are well saved in order to extract features as follows. 50 | 51 | 2. Secondly, we fix the gradients of audio encoder and video encoder. Practically, we extract the acoustic features from audio-only AMT system. To do so, run following commands: 52 | ``` 53 | cd N20EMv2/audio_only 54 | CUDA_VISIBLE_DEVICES=0,1 python extract_ssl_feats.py hparams/extract_ssl_feats.yaml --data_parallel_backend --mix_train True --data_folder /path/to/N20EMv2 --attempt 1 --dur_threshold 5 --linear_prob_epochs 2 --number_of_epochs 10 --ssl_model wav2vec2-large-lv60 --save_model True --add_noise True --noise_type --snr_db 55 | ``` 56 | Compared to the run script, we enable four extra options. `--add_noise` refers to noisy input or clean input, `--noise_type` refers to the type of noise, including `accomp`, `babble`, `white`, `natural`, while `--snr_db` refers to the noise level, including `-10`, `-5`, `0`, `5`, `10` dB. Finally, `--save_model==True` means that we will save the SSL model and AMT classifier to the folder `N20EMv2/save_model` 57 | 58 | Then we extract the visual features from video-only AMT system. To do so, run following commands: 59 | ``` 60 | CUDA_VISIBLE_DEVICES=0,1 python extract_ssl_feats.py hparams/extract_ssl_feats.yaml --data_parallel_backend --data_folder /path/to/N20EMv2 --attempt 1 --dur_threshold 5 --linear_prob_epochs 2 --number_of_epochs 10 61 | ``` 62 | 63 | 3. Prepare csv files for N20EMv2 feats, run: 64 | ``` 65 | python prepare_n20emv2.py --duration --n20emv2 /path/to/n20emv2 66 | ``` 67 | 68 | 4. Run the second training stage of audio-visual AMT system, run: 69 | ``` 70 | CUDA_VISIBLE_DEVICES=0,1 python train_rca_av.py hparams/train_rca_av.yaml --data_folder /path/to/N20EMv2 --lr 0.003 --add_noise True --snr_db --noise_type --data_parallel_backend 71 | ``` 72 | We also provide counterpart audio-only AMT system trained via two stages, run: 73 | ``` 74 | CUDA_VISIBLE_DEVICES=0,1 python train_rca_a.py hparams/train_rca_a.yaml --data_folder /path/to/N20EMv2 --lr 0.003 --add_noise True --snr_db --noise_type --data_parallel_backend 75 | ``` 76 | 77 | ## Results 78 | 79 | Results on N20EMv2 dataset for audio-visual / audio-only AMT task under the perturbation of musical accompaniments: 80 | 81 |

82 | 83 |

-------------------------------------------------------------------------------- /N20EMv2/audio_visual/hparams/train_rca_a.yaml: -------------------------------------------------------------------------------- 1 | # ################################ 2 | # Model: wav2vec2 + Linear 3 | # Authors: Xiangming Gu 2022 4 | # ################################ 5 | 6 | # Seed needs to be set at top of yaml, before objects with parameters are made 7 | seed: 1986 8 | __set_seed: !apply:torch.manual_seed [!ref ] 9 | attempt: 1 10 | dur_threshold: 5 11 | noise_type: natural 12 | snr_db: -10 13 | add_noise: True 14 | lr: 0.0003 15 | output_folder: !ref results/audio_only_rca//noise___db_lr/ 16 | save_folder: !ref /save 17 | csv_folder: !ref data_feat/dur_s 18 | train_log: !ref /train_log.txt 19 | 20 | # Data files 21 | data_folder: !PLACEHOLDER # e,g./path/to/DSing 22 | # noise/ris dataset will automatically be downloaded 23 | data_folder_rirs: !ref 24 | ckpt_interval_minutes: 25 # save checkpoint every N min 25 | train_csv: !ref /n20em_train.csv 26 | valid_csv: !ref /n20em_valid.csv 27 | test_csv: 28 | - !ref /n20em_test.csv 29 | - !ref /n20em_valid.csv 30 | 31 | # Training parameters 32 | number_of_epochs: 10 33 | sorting: ascending 34 | auto_mix_prec: False 35 | audio_sample_rate: 49.8 36 | video_sample_rate: 50 37 | frame_rate: 49.8 38 | linear_prob_epochs: 2 39 | pretrain: True 40 | pretrain_folder: ../save_model 41 | 42 | # Evaluating parameters 43 | onset_threshold: 0.4 44 | offset_threshold: 0.5 45 | onset_tolerance: 0.05 46 | pitch_tolerance: 50 47 | 48 | # With data_parallel batch_size is split into N jobs 49 | # With DDP batch_size is multiplied by N jobs 50 | # Must be 3 per GPU to fit 32GB of VRAM 51 | batch_size: 8 52 | test_batch_size: 1 53 | num_workers: 8 54 | 55 | # Dataloader options 56 | train_dataloader_opts: 57 | batch_size: !ref 58 | num_workers: !ref 59 | 60 | valid_dataloader_opts: 61 | batch_size: !ref 62 | num_workers: !ref 63 | 64 | test_dataloader_opts: 65 | batch_size: !ref 66 | num_workers: !ref 67 | 68 | # Outputs 69 | pitch_octave_num: 4 70 | pitch_class_num: 12 71 | feat_dim: 1024 72 | output_neurons: 20 # 2+pitch_octave+pitch_class+2 73 | 74 | # 75 | # Functions and classes 76 | # 77 | epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter 78 | limit: !ref 79 | 80 | head: !new:speechbrain.nnet.linear.Linear 81 | input_size: !ref 82 | n_neurons: !ref 83 | 84 | fusion: !new:fusion.FusionRCA 85 | 86 | model: !new:torch.nn.ModuleList 87 | - [!ref , !ref ] 88 | 89 | modules: 90 | head: !ref 91 | fusion: !ref 92 | 93 | log_softmax: !new:speechbrain.nnet.activations.Softmax 94 | apply_log: True 95 | 96 | onset_positive_weight: 15.0 97 | offset_positive_weight: 1.0 98 | onset_criterion: !name:speechbrain.nnet.losses.bce_loss 99 | reduction: mean 100 | allowed_len_diff: 3 101 | label_smoothing: 0.0 102 | 103 | offset_criterion: !name:speechbrain.nnet.losses.bce_loss 104 | reduction: mean 105 | allowed_len_diff: 3 106 | label_smoothing: 0.0 107 | 108 | octave_criterion: !name:speechbrain.nnet.losses.nll_loss 109 | reduction: mean 110 | allowed_len_diff: 3 111 | label_smoothing: 0.0 112 | 113 | pitch_criterion: !name:speechbrain.nnet.losses.nll_loss 114 | reduction: mean 115 | allowed_len_diff: 3 116 | label_smoothing: 0.0 117 | 118 | model_opt_class: !name:torch.optim.Adadelta 119 | lr: !ref 120 | rho: 0.95 121 | eps: 1.e-8 122 | 123 | lr_annealing_model: !new:speechbrain.nnet.schedulers.NewBobScheduler 124 | initial_value: !ref 125 | improvement_threshold: 0.0025 126 | annealing_factor: 0.8 127 | patient: 0 128 | 129 | checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer 130 | checkpoints_dir: !ref 131 | recoverables: 132 | model: !ref 133 | scheduler_model: !ref 134 | counter: !ref 135 | 136 | train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger 137 | save_file: !ref 138 | precision: 3 139 | 140 | onset_stats: !name:speechbrain.utils.metric_stats.MetricStats 141 | metric: !name:speechbrain.nnet.losses.bce_loss 142 | reduction: batch 143 | allowed_len_diff: 3 144 | label_smoothing: 0.0 145 | 146 | offset_stats: !name:speechbrain.utils.metric_stats.MetricStats 147 | metric: !name:speechbrain.nnet.losses.bce_loss 148 | reduction: batch 149 | allowed_len_diff: 3 150 | label_smoothing: 0.0 151 | 152 | octave_stats: !name:speechbrain.utils.metric_stats.MetricStats 153 | metric: !name:speechbrain.nnet.losses.nll_loss 154 | reduction: batch 155 | allowed_len_diff: 3 156 | label_smoothing: 0.0 157 | 158 | pitch_stats: !name:speechbrain.utils.metric_stats.MetricStats 159 | metric: !name:speechbrain.nnet.losses.nll_loss 160 | reduction: batch 161 | allowed_len_diff: 3 162 | label_smoothing: 0.0 -------------------------------------------------------------------------------- /N20EMv2/audio_visual/hparams/train_rca_av.yaml: -------------------------------------------------------------------------------- 1 | # ################################ 2 | # Model: wav2vec2 + Linear 3 | # Authors: Xiangming Gu 2022 4 | # ################################ 5 | 6 | # Seed needs to be set at top of yaml, before objects with parameters are made 7 | seed: 1986 8 | __set_seed: !apply:torch.manual_seed [!ref ] 9 | attempt: 1 10 | dur_threshold: 5 11 | noise_type: natural 12 | snr_db: -10 13 | add_noise: True 14 | lr: 0.0003 15 | output_folder: !ref results/audio_visual_rca//noise___db_lr/ 16 | save_folder: !ref /save 17 | csv_folder: !ref data_feat/dur_s 18 | train_log: !ref /train_log.txt 19 | 20 | # Data files 21 | data_folder: !PLACEHOLDER # e,g./path/to/DSing 22 | # noise/ris dataset will automatically be downloaded 23 | data_folder_rirs: !ref 24 | ckpt_interval_minutes: 25 # save checkpoint every N min 25 | train_csv: !ref /n20em_train.csv 26 | valid_csv: !ref /n20em_valid.csv 27 | test_csv: 28 | - !ref /n20em_test.csv 29 | - !ref /n20em_valid.csv 30 | 31 | # Training parameters 32 | number_of_epochs: 10 33 | sorting: ascending 34 | auto_mix_prec: False 35 | audio_sample_rate: 49.8 36 | video_sample_rate: 50 37 | frame_rate: 49.8 38 | linear_prob_epochs: 2 39 | pretrain: True 40 | pretrain_folder: ../save_model 41 | 42 | # Evaluating parameters 43 | onset_threshold: 0.4 44 | offset_threshold: 0.5 45 | onset_tolerance: 0.05 46 | pitch_tolerance: 50 47 | 48 | # With data_parallel batch_size is split into N jobs 49 | # With DDP batch_size is multiplied by N jobs 50 | # Must be 3 per GPU to fit 32GB of VRAM 51 | batch_size: 8 52 | test_batch_size: 1 53 | num_workers: 8 54 | 55 | # Dataloader options 56 | train_dataloader_opts: 57 | batch_size: !ref 58 | num_workers: !ref 59 | 60 | valid_dataloader_opts: 61 | batch_size: !ref 62 | num_workers: !ref 63 | 64 | test_dataloader_opts: 65 | batch_size: !ref 66 | num_workers: !ref 67 | 68 | # Outputs 69 | pitch_octave_num: 4 70 | pitch_class_num: 12 71 | feat_dim: 1024 72 | output_neurons: 20 # 2+pitch_octave+pitch_class+2 73 | 74 | # 75 | # Functions and classes 76 | # 77 | epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter 78 | limit: !ref 79 | 80 | head: !new:speechbrain.nnet.linear.Linear 81 | input_size: !ref 82 | n_neurons: !ref 83 | 84 | fusion: !new:fusion.FusionRCA 85 | 86 | model: !new:torch.nn.ModuleList 87 | - [!ref , !ref ] 88 | 89 | modules: 90 | head: !ref 91 | fusion: !ref 92 | 93 | log_softmax: !new:speechbrain.nnet.activations.Softmax 94 | apply_log: True 95 | 96 | onset_positive_weight: 15.0 97 | offset_positive_weight: 1.0 98 | onset_criterion: !name:speechbrain.nnet.losses.bce_loss 99 | reduction: mean 100 | allowed_len_diff: 3 101 | label_smoothing: 0.0 102 | 103 | offset_criterion: !name:speechbrain.nnet.losses.bce_loss 104 | reduction: mean 105 | allowed_len_diff: 3 106 | label_smoothing: 0.0 107 | 108 | octave_criterion: !name:speechbrain.nnet.losses.nll_loss 109 | reduction: mean 110 | allowed_len_diff: 3 111 | label_smoothing: 0.0 112 | 113 | pitch_criterion: !name:speechbrain.nnet.losses.nll_loss 114 | reduction: mean 115 | allowed_len_diff: 3 116 | label_smoothing: 0.0 117 | 118 | model_opt_class: !name:torch.optim.Adadelta 119 | lr: !ref 120 | rho: 0.95 121 | eps: 1.e-8 122 | 123 | lr_annealing_model: !new:speechbrain.nnet.schedulers.NewBobScheduler 124 | initial_value: !ref 125 | improvement_threshold: 0.0025 126 | annealing_factor: 0.8 127 | patient: 0 128 | 129 | checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer 130 | checkpoints_dir: !ref 131 | recoverables: 132 | model: !ref 133 | scheduler_model: !ref 134 | counter: !ref 135 | 136 | train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger 137 | save_file: !ref 138 | precision: 3 139 | 140 | onset_stats: !name:speechbrain.utils.metric_stats.MetricStats 141 | metric: !name:speechbrain.nnet.losses.bce_loss 142 | reduction: batch 143 | allowed_len_diff: 3 144 | label_smoothing: 0.0 145 | 146 | offset_stats: !name:speechbrain.utils.metric_stats.MetricStats 147 | metric: !name:speechbrain.nnet.losses.bce_loss 148 | reduction: batch 149 | allowed_len_diff: 3 150 | label_smoothing: 0.0 151 | 152 | octave_stats: !name:speechbrain.utils.metric_stats.MetricStats 153 | metric: !name:speechbrain.nnet.losses.nll_loss 154 | reduction: batch 155 | allowed_len_diff: 3 156 | label_smoothing: 0.0 157 | 158 | pitch_stats: !name:speechbrain.utils.metric_stats.MetricStats 159 | metric: !name:speechbrain.nnet.losses.nll_loss 160 | reduction: batch 161 | allowed_len_diff: 3 162 | label_smoothing: 0.0 -------------------------------------------------------------------------------- /N20EMv2/audio_visual/prepare_n20emv2.py: -------------------------------------------------------------------------------- 1 | """ 2 | Data preparation for N20EMv2 of singing voice transcription 3 | The input to model needs to be spectrum features 4 | Authors 5 | * Xiangming Gu 2022 6 | """ 7 | import os 8 | import csv 9 | import json 10 | import torch 11 | import argparse 12 | from tqdm import tqdm 13 | SAMPLERATE=16000 14 | 15 | 16 | def prepare_csv_n20emv2_feat(folder, csv_folder="./data_feat", frame_rate=49.8, dur_thrd=5): 17 | """ 18 | This function creates csv files for speechbrain to process, dur_thrd is the threshold for the duration 19 | """ 20 | 21 | # initialize the csv lines 22 | csv_train_lines = [["ID", "duration", "audio", "video", "utter_id", "utter_num", "frame_anno", "song_anno"]] 23 | csv_valid_lines = [["ID", "duration", "audio", "video", "utter_id", "utter_num", "frame_anno", "song_anno"]] 24 | csv_test_lines = [["ID", "duration", "audio", "video", "utter_id", "utter_num", "frame_anno", "song_anno"]] 25 | # load the annotations 26 | json_file = os.path.join(folder, "annotations.json") 27 | folder_data = os.path.join(folder, "data") 28 | # open ground truth data 29 | with open(json_file) as f: 30 | annotations = json.load(f) 31 | f.close() 32 | # traverse the whole dataset 33 | for entry in tqdm(annotations.keys()): 34 | split = annotations[entry]["split"] 35 | audio_path = os.path.join(folder_data, entry, "noise_data", "clean_feats.pt") 36 | video_path = os.path.join(folder_data, entry, "noise_data", "video_feats.pt") 37 | anno_path = os.path.join(folder_data, entry, "audio_anno", str(frame_rate) + "fps", "audio_frame_anno.npy") 38 | song_anno_path = os.path.join(folder_data, entry, "note_anno.json") 39 | 40 | # load the audio 41 | audio = torch.load(audio_path) 42 | video = torch.load(video_path) 43 | frame1 = audio.shape[0] 44 | frame2 = video.shape[0] 45 | duration = frame1 / 49.8 # audio frame-rate 46 | 47 | # split the whole song into utterances 48 | utter_num = round(duration / dur_thrd) 49 | for i in range(1, utter_num+1): 50 | ID = entry + "_" + str(i) 51 | if i == utter_num: 52 | dur = duration - (utter_num - 1) * dur_thrd 53 | assert 0 < dur <= dur_thrd * 3 / 2 54 | else: 55 | dur = dur_thrd 56 | csv_line = [ 57 | ID, str(dur), audio_path, video_path, str(i), str(utter_num), anno_path, song_anno_path, 58 | ] 59 | if split == "train": 60 | csv_train_lines.append(csv_line) 61 | elif split == "valid": 62 | csv_valid_lines.append(csv_line) 63 | elif split == "test": 64 | csv_test_lines.append(csv_line) 65 | # save csv files 66 | save_folder = os.path.join(csv_folder, "dur_" + str(dur_thrd) + "s") 67 | os.makedirs(save_folder, exist_ok=True) 68 | save_train_path = os.path.join(save_folder, "n20em_train.csv") 69 | save_valid_path = os.path.join(save_folder, "n20em_valid.csv") 70 | save_test_path = os.path.join(save_folder, "n20em_test.csv") 71 | # train 72 | with open(save_train_path, mode="w") as csv_f: 73 | csv_writer = csv.writer(csv_f, delimiter=",", quotechar='"', quoting=csv.QUOTE_MINIMAL) 74 | for line in csv_train_lines: 75 | csv_writer.writerow(line) 76 | # valid 77 | with open(save_valid_path, mode="w") as csv_f: 78 | csv_writer = csv.writer(csv_f, delimiter=",", quotechar='"', quoting=csv.QUOTE_MINIMAL) 79 | for line in csv_valid_lines: 80 | csv_writer.writerow(line) 81 | # test 82 | with open(save_test_path, mode="w") as csv_f: 83 | csv_writer = csv.writer(csv_f, delimiter=",", quotechar='"', quoting=csv.QUOTE_MINIMAL) 84 | for line in csv_test_lines: 85 | csv_writer.writerow(line) 86 | 87 | if __name__ == "__main__": 88 | parser = argparse.ArgumentParser() 89 | parser.add_argument("--duration", type=int, default=5, help="the threshold for duration") 90 | parser.add_argument("--frame_rate", type=float, default=49.8, help="The frame-rate for SSL models") 91 | parser.add_argument("--n20emv2", type=str, default="/path/to/N20EMv2", help="The path to save N20EMv2 dataset") 92 | args = parser.parse_args() 93 | prepare_csv_n20emv2_feat(folder=args.n20emv2, frame_rate=args.frame_rate, dur_thrd=args.duration) -------------------------------------------------------------------------------- /N20EMv2/audio_visual/utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | Functions for automatic music transcription 3 | 4 | Authors 5 | * Xiangming Gu 2022 6 | """ 7 | import numpy as np 8 | 9 | 10 | def note2frame(gt_data, length, frame_size=1/49.8, pitch_shift=0): 11 | """ 12 | This function transforms the note-level annotations into the frame-level annotations 13 | Adapted from https://github.com/york135/singing_transcription_ICASSP2021/blob/master/AST/data_utils/audio_dataset.py 14 | """ 15 | new_label = [] 16 | 17 | cur_note = 0 18 | cur_note_onset = gt_data[cur_note][0] 19 | cur_note_offset = gt_data[cur_note][1] 20 | cur_note_pitch = gt_data[cur_note][2] + pitch_shift 21 | 22 | # start from C2 (36) to B5 (83), total: 4 classes. This is a little confusing 23 | octave_start = 0 24 | octave_end = 3 25 | pitch_class_num = 12 26 | # frame_size = 1/ 49 # 1024.0 / 44100.0 27 | 28 | for i in range(length): 29 | cur_time = i * frame_size 30 | 31 | if abs(cur_time - cur_note_onset) <= (frame_size / 2.0): 32 | # First dim : onset 33 | # Second dim : no pitch 34 | if i == 0 or new_label[-1][0] != 1: 35 | my_oct = int(min(max(octave_start, (cur_note_pitch- 36)//pitch_class_num), octave_end)) - octave_start 36 | my_pitch_class = cur_note_pitch % pitch_class_num 37 | label = [1, 0, my_oct, my_pitch_class] 38 | new_label.append(label) 39 | else: 40 | my_oct = int(min(max(octave_start, (cur_note_pitch- 36)//pitch_class_num), octave_end)) - octave_start 41 | my_pitch_class = cur_note_pitch % pitch_class_num 42 | label = [0, 0, my_oct, my_pitch_class] 43 | new_label.append(label) 44 | 45 | elif cur_time < cur_note_onset or cur_note >= len(gt_data): 46 | # For the frame that doesn't belong to any note 47 | label = [0, 1, octave_end+1, pitch_class_num] 48 | new_label.append(label) 49 | 50 | elif abs(cur_time - cur_note_offset) <= (frame_size / 2.0): 51 | # For the offset frame 52 | my_oct = int(min(max(octave_start, (cur_note_pitch- 36)//pitch_class_num), octave_end)) - octave_start 53 | my_pitch_class = cur_note_pitch % pitch_class_num 54 | label = [0, 1, my_oct, my_pitch_class] 55 | 56 | cur_note = cur_note + 1 57 | if cur_note < len(gt_data): 58 | cur_note_onset = gt_data[cur_note][0] 59 | cur_note_offset = gt_data[cur_note][1] 60 | cur_note_pitch = gt_data[cur_note][2] + pitch_shift 61 | if abs(cur_time - cur_note_onset) <= (frame_size / 2.0): 62 | my_oct = int(min(max(octave_start, (cur_note_pitch- 36)//pitch_class_num), octave_end)) - octave_start 63 | my_pitch_class = cur_note_pitch % pitch_class_num 64 | label[0] = 1 65 | label[1] = 0 66 | label[2] = my_oct 67 | label[3] = my_pitch_class 68 | 69 | new_label.append(label) 70 | 71 | else: 72 | # For the voiced frame 73 | my_oct = int(min(max(octave_start, (cur_note_pitch- 36)//pitch_class_num), octave_end)) - octave_start 74 | my_pitch_class = cur_note_pitch % pitch_class_num 75 | 76 | label = [0, 0, my_oct, my_pitch_class] 77 | new_label.append(label) 78 | 79 | return np.array(new_label) 80 | 81 | 82 | def frame2note(frame_info, onset_thres, offset_thres, frame_size=1/49.8): 83 | """ 84 | This function transforms the frame-level predictions into the note-level predictions. 85 | Parse frame info [(onset_probs, offset_probs, pitch_class)...] into desired label format. 86 | Adapted from https://github.com/york135/singing_transcription_ICASSP2021/blob/master/AST/predictor.py 87 | """ 88 | 89 | result = [] 90 | current_onset = None 91 | pitch_counter = [] 92 | 93 | last_onset = 0.0 94 | onset_seq = np.array([frame_info[i][0] for i in range(len(frame_info))]) 95 | 96 | local_max_size = 3 97 | current_time = 0.0 98 | 99 | onset_seq_length = len(onset_seq) 100 | 101 | for i in range(len(frame_info)): 102 | 103 | current_time = frame_size*i 104 | info = frame_info[i] 105 | 106 | backward_frames = i - local_max_size 107 | if backward_frames < 0: 108 | backward_frames = 0 109 | 110 | forward_frames = i + local_max_size + 1 111 | if forward_frames > onset_seq_length - 1: 112 | forward_frames = onset_seq_length - 1 113 | 114 | # local max and more than threshold 115 | if info[0] >= onset_thres and onset_seq[i] == np.amax(onset_seq[backward_frames : forward_frames]): 116 | 117 | if current_onset is None: 118 | current_onset = current_time 119 | last_onset = info[0] - onset_thres 120 | 121 | else: 122 | if len(pitch_counter) > 0: 123 | result.append([current_onset, current_time, max(set(pitch_counter), key=pitch_counter.count) + 36]) 124 | 125 | current_onset = current_time 126 | last_onset = info[0] - onset_thres 127 | pitch_counter = [] 128 | 129 | elif info[1] >= offset_thres: # If is offset 130 | if current_onset is not None: 131 | if len(pitch_counter) > 0: 132 | result.append([current_onset, current_time, max(set(pitch_counter), key=pitch_counter.count) + 36]) 133 | current_onset = None 134 | 135 | pitch_counter = [] 136 | 137 | # If current_onset exist, add count for the pitch 138 | if current_onset is not None: 139 | final_pitch = int(info[2]* 12 + info[3]) 140 | if info[2] != 4 and info[3] != 12: 141 | # if final_pitch != 60: 142 | pitch_counter.append(final_pitch) 143 | 144 | if current_onset is not None: 145 | if len(pitch_counter) > 0: 146 | result.append([current_onset, current_time, max(set(pitch_counter), key=pitch_counter.count) + 36]) 147 | current_onset = None 148 | 149 | return result 150 | 151 | 152 | class AverageMeter(object): 153 | """Computes and stores the average and current value""" 154 | 155 | def __init__(self): 156 | self.reset() 157 | 158 | def reset(self): 159 | self.val = 0 160 | self.avg = 0 161 | self.sum = 0 162 | self.count = 0 163 | 164 | def update(self, val, n=1): 165 | self.val = val 166 | self.sum += val * n 167 | self.count += n 168 | self.avg = self.sum / self.count -------------------------------------------------------------------------------- /N20EMv2/video_only/README.md: -------------------------------------------------------------------------------- 1 | # Video-only Automatic Music Transcription with N20EMv2 dataset 2 | This sub-project contains recipes for trianing video-only AMT system using N20EMv2 dataset. 3 | 4 | ## Prerequisites 5 | 1. Before running our scripts, you need to download, preprocess and save the N20EMv2 properly. For your convenience, we already crop the video clips of lip movements without releasing the identity of each subject. 6 | 7 | The file organization for N20EMv2 should be: 8 | ``` 9 | /path/to/N20EMv2 10 | ├── data 11 | ├── song1 12 | ├── vocals.wav 13 | ├── video_50fps.npy 14 | ├── song2 15 | ├── ... 16 | ├── annotations.json 17 | ``` 18 | 19 | 20 | 2. Prepare N20EMv2 dataset, run: 21 | ``` 22 | python prepare_n20emv2.py --duration --frame_rate 50 --n20emv2 /path/to/n20emv2 23 | ``` 24 | 25 | The option `--duration` refers to the length of utterances during the training. To parallelize the training, we split the whole song into short utterances during the training. The evaluation is conducted on the whole song. As a default, we use `5` s, which is the same as audio-only automatic music transcription. The option `--frame_rate` refers to the frame rate of frame-level annotations. As a default, we use `50` fps, which is also the frame rate of video input. 26 | 27 | After running this script, the file organization for N20EMv2 should be: 28 | ``` 29 | /path/to/N20EMv2 30 | ├── data 31 | ├── song1 32 | ├── vocals.wav 33 | ├── video_50fps.npy 34 | ├── note_anno.json 35 | ├── video_anno 36 | ├── 50fps 37 | ├── video_frame_anno.npy 38 | ├── song2 39 | ├── ... 40 | ├── annotations.json 41 | ``` 42 | 43 | The resulted csv files are save in the same root folder: 44 | ``` 45 | ├── data 46 | ├── frame_rate 47 | ├── dur_ 48 | ├── n20emv2_train.csv 49 | ├── n20emv2_valid.csv 50 | ├── n20emv2_test.csv 51 | ├── prepare_n20emv2.py 52 | ``` 53 | 54 | ## How to run 55 | We provide basic runnning scripts for those who intend to follow our research. You can change the hyperparameters or even the types of self-supervised-learning (SSL) models in your own project. To reproduce video-only sing voice transcription model in our paper, first download the AV-HuBERT model pretrained on audio-visual data: 56 | ``` 57 | mkdir ssl_model/AVHuBERT 58 | cd ssl_model/AVHuBERT 59 | wget https://dl.fbaipublicfiles.com/avhubert/model/lrs3_vox/clean-pretrain/large_vox_iter5.pt 60 | ``` 61 | 62 | Then run the following command: 63 | ``` 64 | CUDA_VISIBLE_DEVICES=0,1 python train_video_ssl.py hparams/train_video_ssl.yaml --data_parallel_backend --data_folder /path/to/N20EMv2 --attempt 1 --dur_threshold 5 --linear_prob_epochs 2 --number_of_epochs 10 65 | ``` 66 | The option `--linear_prob_epochs` refers to the number of epochs for linear probing in our paper. We provide the config for AVHuBERT pretrained on audio-visual speech data. If you intend to use the config for AVHuBERT pretrained and finetuned on audio-visual speech data, please rewrite `hparams/train_video_ssl.yaml` to change the model. Although we use data parallel (DP) in our experiments, we also provide distributed data parallel (DDP) version (remember to change the `batch_size` to avoid OOM): 67 | ``` 68 | CUDA_VISIBLE_DEVICES=0,1 python -m torch.distributed.launch --nproc_per_node=2 train_video_ssl.py hparams/train_video_ssl.yaml --distributed_launch --distributed_backend='nccl' --find_unused_parameters --data_folder /path/to/N20EMv2 --attempt 1 --dur_threshold 5 --linear_prob_epochs 2 --number_of_epochs 10 69 | ``` 70 | 71 | 72 | ## Results 73 | We provide our video-only automatic music transcription model [[model link](https://drive.google.com/drive/folders/1u82GaLM4AWtfp5VzDHryxCNUZglN0bYe?usp=sharing)] in the paper. 74 |

75 | 76 |

-------------------------------------------------------------------------------- /N20EMv2/video_only/hparams/extract_ssl_feats.yaml: -------------------------------------------------------------------------------- 1 | # ################################ 2 | # Model: avhubert + Linear 3 | # Authors: Xiangming Gu 2022 4 | # ################################ 5 | 6 | # Seed needs to be set at top of yaml, before objects with parameters are made 7 | seed: 1986 8 | __set_seed: !apply:torch.manual_seed [!ref ] 9 | attempt: 1 10 | dur_threshold: 5 11 | overlap: 0 12 | frame_rate: 50 13 | sample_rate: 50 14 | output_folder: !ref results/AVHuBERT/train_video_ssl_dur_attempt/ 15 | save_folder: !ref /save 16 | csv_folder: !ref data/frame_rate/dur_s 17 | train_log: !ref /train_log.txt 18 | 19 | # Data files 20 | data_folder: !PLACEHOLDER # e,g./path/to/DSing 21 | # noise/ris dataset will automatically be downloaded 22 | data_folder_rirs: !ref 23 | ckpt_interval_minutes: 25 # save checkpoint every N min 24 | train_csv: !ref /n20em_train.csv 25 | valid_csv: !ref /n20em_valid.csv 26 | test_csv: 27 | - !ref /n20em_test.csv 28 | - !ref /n20em_valid.csv 29 | - !ref /n20em_train.csv 30 | 31 | # Training parameters 32 | number_of_epochs: 10 33 | lr: 0.0003 34 | lr_encoder: 0.00005 35 | sorting: ascending 36 | auto_mix_prec: False 37 | linear_prob_epochs: 2 38 | 39 | split_noise: False 40 | pretrain: False 41 | pretrain_folder: ../pretrain_model 42 | save_model: False 43 | save_model_folder: ../save_model 44 | 45 | # Evaluating parameters 46 | onset_threshold: 0.4 47 | offset_threshold: 0.5 48 | onset_tolerance: 0.05 49 | offset_tolerance: 0.05 50 | pitch_tolerance: 50 51 | 52 | # With data_parallel batch_size is split into N jobs 53 | # With DDP batch_size is multiplied by N jobs 54 | # Must be 3 per GPU to fit 32GB of VRAM 55 | batch_size: 8 56 | test_batch_size: 1 57 | 58 | # Dataloader options 59 | train_dataloader_opts: 60 | batch_size: !ref 61 | num_workers: 8 62 | 63 | valid_dataloader_opts: 64 | batch_size: !ref 65 | num_workers: 8 66 | 67 | test_dataloader_opts: 68 | batch_size: !ref 69 | num_workers: 8 70 | 71 | # Model parameters 72 | freeze_encoder: False 73 | 74 | # Outputs 75 | pitch_octave_num: 4 76 | pitch_class_num: 12 77 | feat_dim: 1024 78 | output_neurons: 20 # 2+pitch_octave+pitch_class+2 79 | 80 | # 81 | # Functions and classes 82 | # 83 | epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter 84 | limit: !ref 85 | 86 | # augmentation: !new:speechbrain.lobes.augment.TimeDomainSpecAugment 87 | # sample_rate: !ref 88 | # speeds: [95, 100, 105] 89 | 90 | avhubert_url: https://dl.fbaipublicfiles.com/avhubert/model/lrs3_vox/clean-pretrain/large_vox_iter5.pt 91 | encoder: !new:fairseq_interface.FairseqAVHubertPretrain 92 | pretrained_path: !ref 93 | output_norm: True 94 | freeze: !ref 95 | save_path: ssl_model/AVHuBERT/large_vox_iter5.pt 96 | 97 | head: !new:speechbrain.nnet.linear.Linear 98 | input_size: !ref 99 | n_neurons: !ref 100 | 101 | modules: 102 | encoder: !ref 103 | head: !ref 104 | 105 | log_softmax: !new:speechbrain.nnet.activations.Softmax 106 | apply_log: True 107 | 108 | onset_positive_weight: 15.0 109 | offset_positive_weight: 1.0 110 | onset_criterion: !name:speechbrain.nnet.losses.bce_loss 111 | reduction: mean 112 | allowed_len_diff: 3 113 | label_smoothing: 0.0 114 | 115 | offset_criterion: !name:speechbrain.nnet.losses.bce_loss 116 | reduction: mean 117 | allowed_len_diff: 3 118 | label_smoothing: 0.0 119 | 120 | octave_criterion: !name:speechbrain.nnet.losses.nll_loss 121 | reduction: mean 122 | allowed_len_diff: 3 123 | label_smoothing: 0.0 124 | 125 | pitch_criterion: !name:speechbrain.nnet.losses.nll_loss 126 | reduction: mean 127 | allowed_len_diff: 3 128 | label_smoothing: 0.0 129 | 130 | head_opt_class: !name:torch.optim.Adadelta 131 | lr: !ref 132 | rho: 0.95 133 | eps: 1.e-8 134 | 135 | encoder_opt_class: !name:torch.optim.Adam 136 | lr: !ref 137 | 138 | lr_annealing_head: !new:speechbrain.nnet.schedulers.NewBobScheduler 139 | initial_value: !ref 140 | improvement_threshold: 0.0025 141 | annealing_factor: 0.8 142 | patient: 0 143 | 144 | lr_annealing_encoder: !new:speechbrain.nnet.schedulers.NewBobScheduler 145 | initial_value: !ref 146 | improvement_threshold: 0.0025 147 | annealing_factor: 0.9 148 | patient: 0 149 | 150 | 151 | checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer 152 | checkpoints_dir: !ref 153 | recoverables: 154 | encoder: !ref 155 | head: !ref 156 | scheduler_head: !ref 157 | scheduler_encoder: !ref 158 | counter: !ref 159 | 160 | train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger 161 | save_file: !ref 162 | precision: 3 163 | 164 | onset_stats: !name:speechbrain.utils.metric_stats.MetricStats 165 | metric: !name:speechbrain.nnet.losses.bce_loss 166 | reduction: batch 167 | allowed_len_diff: 3 168 | label_smoothing: 0.0 169 | 170 | offset_stats: !name:speechbrain.utils.metric_stats.MetricStats 171 | metric: !name:speechbrain.nnet.losses.bce_loss 172 | reduction: batch 173 | allowed_len_diff: 3 174 | label_smoothing: 0.0 175 | 176 | octave_stats: !name:speechbrain.utils.metric_stats.MetricStats 177 | metric: !name:speechbrain.nnet.losses.nll_loss 178 | reduction: batch 179 | allowed_len_diff: 3 180 | label_smoothing: 0.0 181 | 182 | pitch_stats: !name:speechbrain.utils.metric_stats.MetricStats 183 | metric: !name:speechbrain.nnet.losses.nll_loss 184 | reduction: batch 185 | allowed_len_diff: 3 186 | label_smoothing: 0.0 -------------------------------------------------------------------------------- /N20EMv2/video_only/hparams/train_video_ssl.yaml: -------------------------------------------------------------------------------- 1 | # ################################ 2 | # Model: avhubert + Linear 3 | # Authors: Xiangming Gu 2022 4 | # ################################ 5 | 6 | # Seed needs to be set at top of yaml, before objects with parameters are made 7 | seed: 1986 8 | __set_seed: !apply:torch.manual_seed [!ref ] 9 | attempt: 1 10 | dur_threshold: 5 11 | overlap: 0 12 | frame_rate: 50 13 | sample_rate: 50 14 | output_folder: !ref results/AVHuBERT/train_video_ssl_dur_attempt/ 15 | save_folder: !ref /save 16 | csv_folder: !ref data/frame_rate/dur_s 17 | train_log: !ref /train_log.txt 18 | 19 | # Data files 20 | data_folder: !PLACEHOLDER # e,g./path/to/DSing 21 | # noise/ris dataset will automatically be downloaded 22 | data_folder_rirs: !ref 23 | ckpt_interval_minutes: 25 # save checkpoint every N min 24 | train_csv: !ref /n20em_train.csv 25 | valid_csv: !ref /n20em_valid.csv 26 | test_csv: 27 | - !ref /n20em_test.csv 28 | - !ref /n20em_valid.csv 29 | 30 | # Training parameters 31 | number_of_epochs: 10 32 | lr: 0.0003 33 | lr_encoder: 0.00005 34 | sorting: ascending 35 | auto_mix_prec: False 36 | linear_prob_epochs: 2 37 | 38 | split_noise: False 39 | pretrain: False 40 | pretrain_folder: ../pretrain_model 41 | save_model: False 42 | save_model_folder: ../save_model 43 | 44 | # Evaluating parameters 45 | onset_threshold: 0.4 46 | offset_threshold: 0.5 47 | onset_tolerance: 0.05 48 | offset_tolerance: 0.05 49 | pitch_tolerance: 50 50 | 51 | # With data_parallel batch_size is split into N jobs 52 | # With DDP batch_size is multiplied by N jobs 53 | # Must be 3 per GPU to fit 32GB of VRAM 54 | batch_size: 8 55 | test_batch_size: 1 56 | 57 | # Dataloader options 58 | train_dataloader_opts: 59 | batch_size: !ref 60 | num_workers: 8 61 | 62 | valid_dataloader_opts: 63 | batch_size: !ref 64 | num_workers: 8 65 | 66 | test_dataloader_opts: 67 | batch_size: !ref 68 | num_workers: 8 69 | 70 | # Model parameters 71 | freeze_encoder: False 72 | 73 | # Outputs 74 | pitch_octave_num: 4 75 | pitch_class_num: 12 76 | feat_dim: 1024 77 | output_neurons: 20 # 2+pitch_octave+pitch_class+2 78 | 79 | # 80 | # Functions and classes 81 | # 82 | epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter 83 | limit: !ref 84 | 85 | # augmentation: !new:speechbrain.lobes.augment.TimeDomainSpecAugment 86 | # sample_rate: !ref 87 | # speeds: [95, 100, 105] 88 | 89 | avhubert_url: https://dl.fbaipublicfiles.com/avhubert/model/lrs3_vox/clean-pretrain/large_vox_iter5.pt 90 | encoder: !new:fairseq_interface.FairseqAVHubertPretrain 91 | pretrained_path: !ref 92 | output_norm: True 93 | freeze: !ref 94 | save_path: ssl_model/AVHuBERT/large_vox_iter5.pt 95 | 96 | head: !new:speechbrain.nnet.linear.Linear 97 | input_size: !ref 98 | n_neurons: !ref 99 | 100 | modules: 101 | encoder: !ref 102 | head: !ref 103 | 104 | log_softmax: !new:speechbrain.nnet.activations.Softmax 105 | apply_log: True 106 | 107 | onset_positive_weight: 15.0 108 | offset_positive_weight: 1.0 109 | onset_criterion: !name:speechbrain.nnet.losses.bce_loss 110 | reduction: mean 111 | allowed_len_diff: 3 112 | label_smoothing: 0.0 113 | 114 | offset_criterion: !name:speechbrain.nnet.losses.bce_loss 115 | reduction: mean 116 | allowed_len_diff: 3 117 | label_smoothing: 0.0 118 | 119 | octave_criterion: !name:speechbrain.nnet.losses.nll_loss 120 | reduction: mean 121 | allowed_len_diff: 3 122 | label_smoothing: 0.0 123 | 124 | pitch_criterion: !name:speechbrain.nnet.losses.nll_loss 125 | reduction: mean 126 | allowed_len_diff: 3 127 | label_smoothing: 0.0 128 | 129 | head_opt_class: !name:torch.optim.Adadelta 130 | lr: !ref 131 | rho: 0.95 132 | eps: 1.e-8 133 | 134 | encoder_opt_class: !name:torch.optim.Adam 135 | lr: !ref 136 | 137 | lr_annealing_head: !new:speechbrain.nnet.schedulers.NewBobScheduler 138 | initial_value: !ref 139 | improvement_threshold: 0.0025 140 | annealing_factor: 0.8 141 | patient: 0 142 | 143 | lr_annealing_encoder: !new:speechbrain.nnet.schedulers.NewBobScheduler 144 | initial_value: !ref 145 | improvement_threshold: 0.0025 146 | annealing_factor: 0.9 147 | patient: 0 148 | 149 | 150 | checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer 151 | checkpoints_dir: !ref 152 | recoverables: 153 | encoder: !ref 154 | head: !ref 155 | scheduler_head: !ref 156 | scheduler_encoder: !ref 157 | counter: !ref 158 | 159 | train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger 160 | save_file: !ref 161 | precision: 3 162 | 163 | onset_stats: !name:speechbrain.utils.metric_stats.MetricStats 164 | metric: !name:speechbrain.nnet.losses.bce_loss 165 | reduction: batch 166 | allowed_len_diff: 3 167 | label_smoothing: 0.0 168 | 169 | offset_stats: !name:speechbrain.utils.metric_stats.MetricStats 170 | metric: !name:speechbrain.nnet.losses.bce_loss 171 | reduction: batch 172 | allowed_len_diff: 3 173 | label_smoothing: 0.0 174 | 175 | octave_stats: !name:speechbrain.utils.metric_stats.MetricStats 176 | metric: !name:speechbrain.nnet.losses.nll_loss 177 | reduction: batch 178 | allowed_len_diff: 3 179 | label_smoothing: 0.0 180 | 181 | pitch_stats: !name:speechbrain.utils.metric_stats.MetricStats 182 | metric: !name:speechbrain.nnet.losses.nll_loss 183 | reduction: batch 184 | allowed_len_diff: 3 185 | label_smoothing: 0.0 -------------------------------------------------------------------------------- /N20EMv2/video_only/prepare_n20emv2.py: -------------------------------------------------------------------------------- 1 | """ 2 | Data preparation for datasets of automatic music transcription 3 | 4 | Authors 5 | * Xiangming Gu 2022 6 | """ 7 | import os 8 | import csv 9 | import json 10 | import argparse 11 | import numpy as np 12 | from tqdm import tqdm 13 | from utils import note2frame 14 | 15 | 16 | def prepare_frame_anno(folder, frame_rate=50): 17 | """ 18 | This function processes the frame-level annotations for each song 19 | """ 20 | json_file = os.path.join(folder, "annotations.json") 21 | folder_data = os.path.join(folder, "data") 22 | # open ground truth data 23 | with open(json_file) as f: 24 | annotations = json.load(f) 25 | f.close() 26 | # traverse the whole dataset 27 | for entry in tqdm(annotations.keys()): 28 | anno = annotations[entry]["midi"] 29 | json_path = os.path.join(folder_data, entry, "note_anno.json") 30 | # save json file 31 | with open(json_path, "w") as f: 32 | json.dump(anno, f) 33 | f.close() 34 | # load video file 35 | video_file = os.path.join(folder_data, entry, "video_" + str(frame_rate) + "fps.npy") 36 | video = np.load(video_file) 37 | # compute duration and length 38 | length = video.shape[0] 39 | frame_label = note2frame(gt_data=anno, length=length, frame_size=1/frame_rate) 40 | assert frame_label.shape[0] == length 41 | # save frame-level annotation 42 | os.makedirs(os.path.join(folder_data, entry, "video_anno", str(frame_rate) + "fps"), exist_ok=True) 43 | frame_anno_path = os.path.join(folder_data, entry, "video_anno", str(frame_rate) + "fps", "video_frame_anno.npy") 44 | np.save(frame_anno_path, frame_label) 45 | 46 | 47 | def prepare_csv_n20emv2(folder, csv_folder="./data", frame_rate=50, dur_thrd=5): 48 | """ 49 | This function creates csv files for speechbrain to process, dur_thrd is the threshold for the duration 50 | """ 51 | 52 | # initialize the csv lines 53 | csv_train_lines = [["ID", "duration", "video", "utter_id", "utter_num", "frame_anno", "song_anno"]] 54 | csv_valid_lines = [["ID", "duration", "video", "utter_id", "utter_num", "frame_anno", "song_anno"]] 55 | csv_test_lines = [["ID", "duration", "video", "utter_id", "utter_num", "frame_anno", "song_anno"]] 56 | # load the annotations 57 | json_file = os.path.join(folder, "annotations.json") 58 | folder_data = os.path.join(folder, "data") 59 | # open ground truth data 60 | with open(json_file) as f: 61 | annotations = json.load(f) 62 | f.close() 63 | # traverse the whole dataset 64 | for entry in tqdm(annotations.keys()): 65 | split = annotations[entry]["split"] 66 | video_path = os.path.join(folder_data, entry, "video_" + str(frame_rate) + "fps.npy") 67 | anno_path = os.path.join(folder_data, entry, "video_anno", str(frame_rate) + "fps", "video_frame_anno.npy") 68 | song_anno_path = os.path.join(folder_data, entry, "note_anno.json") 69 | 70 | # load the video 71 | video = np.load(video_path) 72 | duration = video.shape[0] / frame_rate 73 | 74 | # split the whole song into utterances 75 | is_end = False 76 | cur_i = 1 77 | cur_time = 0 78 | utter_lines = [] 79 | stride = dur_thrd 80 | while not is_end: 81 | ID = entry + "_" + str(cur_i) 82 | # whether is the end 83 | if duration - cur_time <= dur_thrd * 3 / 2: 84 | is_end = True 85 | dur = duration - cur_time 86 | utter_num = cur_i 87 | else: 88 | dur = dur_thrd 89 | 90 | # determine the csv_line 91 | utter_lines.append((ID, dur)) 92 | 93 | # update variables 94 | cur_i = cur_i + 1 95 | cur_time = cur_time + stride 96 | 97 | for i in range(1, utter_num + 1): 98 | ID, dur = utter_lines[i - 1] 99 | csv_line = [ 100 | ID, str(dur), video_path, str(i), str(utter_num), anno_path, song_anno_path, 101 | ] 102 | if split == "train": 103 | csv_train_lines.append(csv_line) 104 | elif split == "valid": 105 | csv_valid_lines.append(csv_line) 106 | elif split == "test": 107 | csv_test_lines.append(csv_line) 108 | 109 | # save csv files 110 | save_folder = os.path.join(csv_folder, "frame_rate" + str(frame_rate), "dur_" + str(dur_thrd) + "s") 111 | os.makedirs(save_folder, exist_ok=True) 112 | save_train_path = os.path.join(save_folder, "n20em_train.csv") 113 | save_valid_path = os.path.join(save_folder, "n20em_valid.csv") 114 | save_test_path = os.path.join(save_folder, "n20em_test.csv") 115 | # train 116 | with open(save_train_path, mode="w") as csv_f: 117 | csv_writer = csv.writer(csv_f, delimiter=",", quotechar='"', quoting=csv.QUOTE_MINIMAL) 118 | for line in csv_train_lines: 119 | csv_writer.writerow(line) 120 | # valid 121 | with open(save_valid_path, mode="w") as csv_f: 122 | csv_writer = csv.writer(csv_f, delimiter=",", quotechar='"', quoting=csv.QUOTE_MINIMAL) 123 | for line in csv_valid_lines: 124 | csv_writer.writerow(line) 125 | # test 126 | with open(save_test_path, mode="w") as csv_f: 127 | csv_writer = csv.writer(csv_f, delimiter=",", quotechar='"', quoting=csv.QUOTE_MINIMAL) 128 | for line in csv_test_lines: 129 | csv_writer.writerow(line) 130 | 131 | 132 | if __name__ == "__main__": 133 | parser = argparse.ArgumentParser() 134 | parser.add_argument("--frame_rate", type=int, default=50, help="the frame rate for log fbanks features") 135 | parser.add_argument("--duration", type=int, default=5, help="the threshold for duration") 136 | parser.add_argument("--n20emv2", type=str, default="/path/to/N20EMv2", help="The path to save N20EMv2 dataset") 137 | args = parser.parse_args() 138 | prepare_frame_anno(folder=args.n20emv2, frame_rate=args.frame_rate) 139 | prepare_csv_n20emv2(folder=args.n20emv2, frame_rate=args.frame_rate, dur_thrd=args.duration) -------------------------------------------------------------------------------- /assets/framework.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guxm2021/SVT_SpeechBrain/a9dc323cd6dd8f751f71cbfeff368b8a5c5eba87/assets/framework.png -------------------------------------------------------------------------------- /assets/noise_test.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guxm2021/SVT_SpeechBrain/a9dc323cd6dd8f751f71cbfeff368b8a5c5eba87/assets/noise_test.png -------------------------------------------------------------------------------- /assets/results.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guxm2021/SVT_SpeechBrain/a9dc323cd6dd8f751f71cbfeff368b8a5c5eba87/assets/results.png -------------------------------------------------------------------------------- /assets/results2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guxm2021/SVT_SpeechBrain/a9dc323cd6dd8f751f71cbfeff368b8a5c5eba87/assets/results2.png -------------------------------------------------------------------------------- /dependencies.txt: -------------------------------------------------------------------------------- 1 | transformers 2 | datasets 3 | scikit-learn 4 | mir_eval 5 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | black==19.10b0 2 | flake8==3.7.9 3 | pycodestyle==2.5.0 4 | pytest==5.4.1 5 | yamllint==1.23.0 6 | huggingface_hub>=0.0.6 7 | hyperpyyaml>=0.0.1 8 | joblib>=0.14.1 9 | numpy>=1.17.0 10 | packaging 11 | pre-commit>=2.3.0 12 | scipy>=1.4.1 13 | sentencepiece>=0.1.91 14 | SoundFile; sys_platform == 'win32' 15 | torch>=1.8.0,<=1.10.1 16 | torchaudio>=0.8.0,<=0.10.1 17 | tqdm>=4.42.0 18 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import os 3 | import sys 4 | import site 5 | import setuptools 6 | from distutils.core import setup 7 | 8 | 9 | # Editable install in user site directory can be allowed with this hack: 10 | # https://github.com/pypa/pip/issues/7953. 11 | site.ENABLE_USER_SITE = "--user" in sys.argv[1:] 12 | 13 | with open("README.md") as f: 14 | long_description = f.read() 15 | 16 | with open(os.path.join("speechbrain", "version.txt")) as f: 17 | version = f.read().strip() 18 | 19 | setup( 20 | name="speechbrain", 21 | version=version, 22 | description="All-in-one speech toolkit in pure Python and Pytorch", 23 | long_description=long_description, 24 | long_description_content_type="text/markdown", 25 | author="Mirco Ravanelli & Others", 26 | author_email="speechbrain@gmail.com", 27 | packages=setuptools.find_packages(), 28 | package_data={"speechbrain": ["version.txt", "log-config.yaml"]}, 29 | install_requires=[ 30 | "hyperpyyaml", 31 | "joblib", 32 | "numpy", 33 | "packaging", 34 | "scipy", 35 | "sentencepiece", 36 | "torch>=1.7,<=1.11", 37 | "torchaudio", 38 | "tqdm", 39 | "huggingface_hub", 40 | ], 41 | python_requires=">=3.7", 42 | url="https://speechbrain.github.io/", 43 | ) 44 | -------------------------------------------------------------------------------- /speechbrain/__init__.py: -------------------------------------------------------------------------------- 1 | """ Comprehensive speech processing toolkit 2 | """ 3 | import os 4 | from .core import Stage, Brain, create_experiment_directory, parse_arguments 5 | from . import alignment # noqa 6 | from . import dataio # noqa 7 | from . import decoders # noqa 8 | from . import lobes # noqa 9 | from . import lm # noqa 10 | from . import nnet # noqa 11 | from . import processing # noqa 12 | from . import tokenizers # noqa 13 | from . import utils # noqa 14 | 15 | with open(os.path.join(os.path.dirname(__file__), "version.txt")) as f: 16 | version = f.read().strip() 17 | 18 | __all__ = [ 19 | "Stage", 20 | "Brain", 21 | "create_experiment_directory", 22 | "parse_arguments", 23 | ] 24 | 25 | __version__ = version 26 | -------------------------------------------------------------------------------- /speechbrain/alignment/__init__.py: -------------------------------------------------------------------------------- 1 | """Tools for aligning transcripts and speech signals 2 | """ 3 | -------------------------------------------------------------------------------- /speechbrain/dataio/__init__.py: -------------------------------------------------------------------------------- 1 | """Data loading and dataset preprocessing 2 | """ 3 | import os 4 | 5 | __all__ = [] 6 | for filename in os.listdir(os.path.dirname(__file__)): 7 | filename = os.path.basename(filename) 8 | if filename.endswith(".py") and not filename.startswith("__"): 9 | __all__.append(filename[:-3]) 10 | 11 | from . import * # noqa 12 | -------------------------------------------------------------------------------- /speechbrain/dataio/preprocess.py: -------------------------------------------------------------------------------- 1 | """Preprocessors for audio""" 2 | import torch 3 | import functools 4 | from speechbrain.processing.speech_augmentation import Resample 5 | 6 | 7 | class AudioNormalizer: 8 | """Normalizes audio into a standard format 9 | 10 | Arguments 11 | --------- 12 | sample_rate : int 13 | The sampling rate to which the incoming signals should be converted. 14 | mix : {"avg-to-mono", "keep"} 15 | "avg-to-mono" - add all channels together and normalize by number of 16 | channels. This also removes the channel dimension, resulting in [time] 17 | format tensor. 18 | "keep" - don't normalize channel information 19 | 20 | Example 21 | ------- 22 | >>> import torchaudio 23 | >>> example_file = 'samples/audio_samples/example_multichannel.wav' 24 | >>> signal, sr = torchaudio.load(example_file, channels_first = False) 25 | >>> normalizer = AudioNormalizer(sample_rate=8000) 26 | >>> normalized = normalizer(signal, sr) 27 | >>> signal.shape 28 | torch.Size([33882, 2]) 29 | >>> normalized.shape 30 | torch.Size([16941]) 31 | 32 | NOTE 33 | ---- 34 | This will also upsample audio. However, upsampling cannot produce meaningful 35 | information in the bandwidth which it adds. Generally models will not work 36 | well for upsampled data if they have not specifically been trained to do so. 37 | """ 38 | 39 | def __init__(self, sample_rate=16000, mix="avg-to-mono"): 40 | self.sample_rate = sample_rate 41 | if mix not in ["avg-to-mono", "keep"]: 42 | raise ValueError(f"Unexpected mixing configuration {mix}") 43 | self.mix = mix 44 | self._cached_resample = functools.lru_cache(maxsize=12)(Resample) 45 | 46 | def __call__(self, audio, sample_rate): 47 | """Perform normalization 48 | 49 | Arguments 50 | --------- 51 | audio : tensor 52 | The input waveform torch tensor. Assuming [time, channels], 53 | or [time]. 54 | """ 55 | resampler = self._cached_resample(sample_rate, self.sample_rate) 56 | resampled = resampler(audio.unsqueeze(0)).squeeze(0) 57 | return self._mix(resampled) 58 | 59 | def _mix(self, audio): 60 | """Handle channel mixing""" 61 | flat_input = audio.dim() == 1 62 | if self.mix == "avg-to-mono": 63 | if flat_input: 64 | return audio 65 | return torch.mean(audio, 1) 66 | if self.mix == "keep": 67 | return audio 68 | -------------------------------------------------------------------------------- /speechbrain/dataio/wer.py: -------------------------------------------------------------------------------- 1 | """WER print functions. 2 | 3 | The functions here are used to print the computed statistics 4 | with human-readable formatting. 5 | They have a file argument, but you can also just use 6 | contextlib.redirect_stdout, which may give a nicer syntax. 7 | 8 | Authors 9 | * Aku Rouhe 2020 10 | """ 11 | import sys 12 | from speechbrain.utils import edit_distance 13 | 14 | 15 | def print_wer_summary(wer_details, file=sys.stdout): 16 | """Prints out WER summary details in human-readable format. 17 | 18 | This function essentially mirrors the Kaldi compute-wer output format. 19 | 20 | Arguments 21 | --------- 22 | wer_details : dict 23 | Dict of wer summary details, 24 | see ``speechbrain.utils.edit_distance.wer_summary`` 25 | for format. 26 | file : stream 27 | Where to write. (default: sys.stdout) 28 | """ 29 | print( 30 | "%WER {WER:.2f} [ {num_edits} / {num_scored_tokens}, {insertions} ins, {deletions} del, {substitutions} sub ]".format( # noqa 31 | **wer_details 32 | ), 33 | file=file, 34 | end="", 35 | ) 36 | print( 37 | " [PARTIAL]" 38 | if wer_details["num_scored_sents"] < wer_details["num_ref_sents"] 39 | else "", 40 | file=file, 41 | ) 42 | print( 43 | "%SER {SER:.2f} [ {num_erraneous_sents} / {num_scored_sents} ]".format( 44 | **wer_details 45 | ), 46 | file=file, 47 | ) 48 | print( 49 | "Scored {num_scored_sents} sentences, {num_absent_sents} not present in hyp.".format( # noqa 50 | **wer_details 51 | ), 52 | file=file, 53 | ) 54 | 55 | 56 | def print_alignments( 57 | details_by_utterance, file=sys.stdout, empty_symbol="", separator=" ; " 58 | ): 59 | """Print WER summary and alignments. 60 | 61 | Arguments 62 | --------- 63 | details_by_utterance : list 64 | List of wer details by utterance, 65 | see ``speechbrain.utils.edit_distance.wer_details_by_utterance`` 66 | for format. Has to have alignments included. 67 | file : stream 68 | Where to write. (default: sys.stdout) 69 | empty_symbol : str 70 | Symbol to use when aligning to nothing. 71 | separator : str 72 | String that separates each token in the output. Note the spaces in the 73 | default. 74 | """ 75 | _print_alignments_global_header( 76 | file=file, empty_symbol=empty_symbol, separator=separator 77 | ) 78 | for dets in details_by_utterance: 79 | if dets["scored"]: 80 | _print_alignment_header(dets, file=file) 81 | _print_alignment( 82 | dets["alignment"], 83 | dets["ref_tokens"], 84 | dets["hyp_tokens"], 85 | file=file, 86 | empty_symbol=empty_symbol, 87 | separator=separator, 88 | ) 89 | 90 | 91 | # The following internal functions are used to 92 | # print out more specific things 93 | def _print_top_wer_utts(top_non_empty, top_empty, file=sys.stdout): 94 | print("=" * 80, file=file) 95 | print("UTTERANCES WITH HIGHEST WER", file=file) 96 | if top_non_empty: 97 | print( 98 | "Non-empty hypotheses -- utterances for which output was produced:", 99 | file=file, 100 | ) 101 | for dets in top_non_empty: 102 | print("{key} %WER {WER:.2f}".format(**dets), file=file) 103 | else: 104 | print("No utterances which had produced output!", file=file) 105 | if top_empty: 106 | print( 107 | "Empty hypotheses -- utterances for which no output was produced:", 108 | file=file, 109 | ) 110 | for dets in top_empty: 111 | print("{key} %WER {WER:.2f}".format(**dets), file=file) 112 | else: 113 | print("No utterances which had not produced output!", file=file) 114 | 115 | 116 | def _print_top_wer_spks(spks_by_wer, file=sys.stdout): 117 | print("=" * 80, file=file) 118 | print("SPEAKERS WITH HIGHEST WER", file=file) 119 | for dets in spks_by_wer: 120 | print("{speaker} %WER {WER:.2f}".format(**dets), file=file) 121 | 122 | 123 | def _print_alignment( 124 | alignment, a, b, empty_symbol="", separator=" ; ", file=sys.stdout 125 | ): 126 | # First, get equal length text for all: 127 | a_padded = [] 128 | b_padded = [] 129 | ops_padded = [] 130 | for op, i, j in alignment: # i indexes a, j indexes b 131 | op_string = str(op) 132 | a_string = str(a[i]) if i is not None else empty_symbol 133 | b_string = str(b[j]) if j is not None else empty_symbol 134 | # NOTE: the padding does not actually compute printed length, 135 | # but hopefully we can assume that printed length is 136 | # at most the str len 137 | pad_length = max(len(op_string), len(a_string), len(b_string)) 138 | a_padded.append(a_string.center(pad_length)) 139 | b_padded.append(b_string.center(pad_length)) 140 | ops_padded.append(op_string.center(pad_length)) 141 | # Then print, in the order Ref, op, Hyp 142 | print(separator.join(a_padded), file=file) 143 | print(separator.join(ops_padded), file=file) 144 | print(separator.join(b_padded), file=file) 145 | 146 | 147 | def _print_alignments_global_header( 148 | empty_symbol="", separator=" ; ", file=sys.stdout 149 | ): 150 | print("=" * 80, file=file) 151 | print("ALIGNMENTS", file=file) 152 | print("", file=file) 153 | print("Format:", file=file) 154 | print(", WER DETAILS", file=file) 155 | # Print the format with the actual 156 | # print_alignment function, using artificial data: 157 | a = ["reference", "on", "the", "first", "line"] 158 | b = ["and", "hypothesis", "on", "the", "third"] 159 | alignment = [ 160 | (edit_distance.EDIT_SYMBOLS["ins"], None, 0), 161 | (edit_distance.EDIT_SYMBOLS["sub"], 0, 1), 162 | (edit_distance.EDIT_SYMBOLS["eq"], 1, 2), 163 | (edit_distance.EDIT_SYMBOLS["eq"], 2, 3), 164 | (edit_distance.EDIT_SYMBOLS["sub"], 3, 4), 165 | (edit_distance.EDIT_SYMBOLS["del"], 4, None), 166 | ] 167 | _print_alignment( 168 | alignment, 169 | a, 170 | b, 171 | file=file, 172 | empty_symbol=empty_symbol, 173 | separator=separator, 174 | ) 175 | 176 | 177 | def _print_alignment_header(wer_details, file=sys.stdout): 178 | print("=" * 80, file=file) 179 | print( 180 | "{key}, %WER {WER:.2f} [ {num_edits} / {num_ref_tokens}, {insertions} ins, {deletions} del, {substitutions} sub ]".format( # noqa 181 | **wer_details 182 | ), 183 | file=file, 184 | ) 185 | -------------------------------------------------------------------------------- /speechbrain/decoders/__init__.py: -------------------------------------------------------------------------------- 1 | """ Package containing the different decoders (ctc, beamsearch ...) 2 | """ 3 | from .seq2seq import * # noqa 4 | from .ctc import * # noqa 5 | -------------------------------------------------------------------------------- /speechbrain/lm/__init__.py: -------------------------------------------------------------------------------- 1 | """ Package defining language models 2 | """ 3 | -------------------------------------------------------------------------------- /speechbrain/lm/counting.py: -------------------------------------------------------------------------------- 1 | """ 2 | N-gram counting, discounting, interpolation, and backoff 3 | 4 | Authors 5 | * Aku Rouhe 2020 6 | """ 7 | import itertools 8 | 9 | 10 | # The following functions are essentially copying the NLTK ngram counting 11 | # pipeline with minor differences. Written from scratch, but with enough 12 | # inspiration that I feel I want to mention the inspiration source: 13 | # NLTK is licenced under the Apache 2.0 Licence, same as SpeechBrain 14 | # See https://github.com/nltk/nltk 15 | # The NLTK implementation is highly focused on getting lazy evaluation. 16 | def pad_ends( 17 | sequence, pad_left=True, left_pad_symbol="", right_pad_symbol="" 18 | ): 19 | """ 20 | Pad sentence ends with start- and end-of-sentence tokens 21 | 22 | In speech recognition, it is important to predict the end of sentence 23 | and use the start of sentence to condition predictions. Typically this 24 | is done by adding special tokens (usually and ) at the ends of 25 | each sentence. The token should not be predicted, so some special 26 | care needs to be taken for unigrams. 27 | 28 | Arguments 29 | --------- 30 | sequence : iterator 31 | The sequence (any iterable type) to pad. 32 | pad_left : bool 33 | Whether to pad on the left side as well. True by default. 34 | left_pad_symbol : any 35 | The token to use for left side padding. "" by default. 36 | right_pad_symbol : any 37 | The token to use for right side padding. "" by default. 38 | 39 | Returns 40 | ------- 41 | generator 42 | A generator that yields the padded sequence. 43 | 44 | Example 45 | ------- 46 | >>> for token in pad_ends(["Speech", "Brain"]): 47 | ... print(token) 48 | 49 | Speech 50 | Brain 51 | 52 | 53 | """ 54 | if pad_left: 55 | return itertools.chain( 56 | (left_pad_symbol,), tuple(sequence), (right_pad_symbol,) 57 | ) 58 | else: 59 | return itertools.chain(tuple(sequence), (right_pad_symbol,)) 60 | 61 | 62 | def ngrams(sequence, n): 63 | """ 64 | Produce all Nth order N-grams from the sequence. 65 | 66 | This will generally be used in an N-gram counting pipeline. 67 | 68 | Arguments 69 | --------- 70 | sequence : iterator 71 | The sequence from which to produce N-grams. 72 | n : int 73 | The order of N-grams to produce 74 | 75 | Yields 76 | ------ 77 | tuple 78 | Yields each ngram as a tuple. 79 | 80 | Example 81 | ------- 82 | >>> for ngram in ngrams("Brain", 3): 83 | ... print(ngram) 84 | ('B', 'r', 'a') 85 | ('r', 'a', 'i') 86 | ('a', 'i', 'n') 87 | 88 | """ 89 | if n <= 0: 90 | raise ValueError("N must be >=1") 91 | # Handle the unigram case specially: 92 | if n == 1: 93 | for token in sequence: 94 | yield (token,) 95 | return 96 | iterator = iter(sequence) 97 | history = [] 98 | for hist_length, token in enumerate(iterator, start=1): 99 | history.append(token) 100 | if hist_length == n - 1: 101 | break 102 | else: # For-else is obscure but fits here perfectly 103 | return 104 | for token in iterator: 105 | yield tuple(history) + (token,) 106 | history.append(token) 107 | del history[0] 108 | return 109 | 110 | 111 | def ngrams_for_evaluation(sequence, max_n, predict_first=False): 112 | """ 113 | Produce each token with the appropriate context. 114 | 115 | The function produces as large N-grams as possible, so growing from 116 | unigrams/bigrams to max_n. 117 | 118 | E.G. when your model is a trigram model, you'll still only have one token 119 | of context (the start of sentence) for the first token. 120 | 121 | In general this is useful when evaluating an N-gram model. 122 | 123 | Arguments 124 | --------- 125 | sequence : iterator 126 | The sequence to produce tokens and context from. 127 | max_n : int 128 | The maximum N-gram length to produce. 129 | predict_first : bool 130 | To produce the first token in the sequence to predict (without 131 | context) or not. Essentially this should be False when the start of 132 | sentence symbol is the first in the sequence. 133 | 134 | Yields 135 | ------ 136 | Any 137 | The token to predict 138 | tuple 139 | The context to predict conditional on. 140 | 141 | Example 142 | ------- 143 | >>> for token, context in ngrams_for_evaluation("Brain", 3, True): 144 | ... print(f"p( {token} |{' ' if context else ''}{' '.join(context)} )") 145 | p( B | ) 146 | p( r | B ) 147 | p( a | B r ) 148 | p( i | r a ) 149 | p( n | a i ) 150 | """ 151 | if max_n <= 0: 152 | raise ValueError("Max N must be >=1") 153 | iterator = iter(sequence) 154 | history = [] 155 | if not predict_first: 156 | history.append(next(iterator)) 157 | for token in iterator: 158 | if len(history) == max_n: 159 | del history[0] 160 | yield token, tuple(history) 161 | history.append(token) 162 | return 163 | -------------------------------------------------------------------------------- /speechbrain/lobes/__init__.py: -------------------------------------------------------------------------------- 1 | """ Package defining common blocks (DNN models, processing ...) 2 | 3 | This subpackage gathers higher level blocks, or "lobes". 4 | The classes here may leverage the extended YAML syntax. 5 | """ 6 | from . import models # noqa 7 | -------------------------------------------------------------------------------- /speechbrain/lobes/beamform_multimic.py: -------------------------------------------------------------------------------- 1 | """Beamformer for multi-mic processing. 2 | 3 | Authors 4 | * Nauman Dawalatabad 5 | """ 6 | import torch 7 | from speechbrain.processing.features import ( 8 | STFT, 9 | ISTFT, 10 | ) 11 | 12 | from speechbrain.processing.multi_mic import ( 13 | Covariance, 14 | GccPhat, 15 | DelaySum, 16 | ) 17 | 18 | 19 | class DelaySum_Beamformer(torch.nn.Module): 20 | """Generate beamformed signal from multi-mic data using DelaySum beamforming. 21 | 22 | Arguments 23 | --------- 24 | sampling_rate : int (default: 16000) 25 | Sampling rate of audio signals. 26 | """ 27 | 28 | def __init__(self, sampling_rate=16000): 29 | super().__init__() 30 | self.fs = sampling_rate 31 | self.stft = STFT(sample_rate=self.fs) 32 | self.cov = Covariance() 33 | self.gccphat = GccPhat() 34 | self.delaysum = DelaySum() 35 | self.istft = ISTFT(sample_rate=self.fs) 36 | 37 | def forward(self, mics_signals): 38 | """Returns beamformed signal using multi-mic data. 39 | 40 | Arguments 41 | --------- 42 | mics_sginal : tensor 43 | Set of audio signals to be transformed. 44 | """ 45 | with torch.no_grad(): 46 | 47 | Xs = self.stft(mics_signals) 48 | XXs = self.cov(Xs) 49 | tdoas = self.gccphat(XXs) 50 | Ys_ds = self.delaysum(Xs, tdoas) 51 | sig = self.istft(Ys_ds) 52 | 53 | return sig 54 | -------------------------------------------------------------------------------- /speechbrain/lobes/models/ESPnetVGG.py: -------------------------------------------------------------------------------- 1 | """This lobes replicate the encoder first introduced in ESPNET v1 2 | 3 | source: https://github.com/espnet/espnet/blob/master/espnet/nets/pytorch_backend/rnn/encoders.py 4 | 5 | Authors 6 | * Titouan Parcollet 2020 7 | """ 8 | import torch 9 | import speechbrain as sb 10 | 11 | 12 | class ESPnetVGG(sb.nnet.containers.Sequential): 13 | """This model is a combination of CNNs and RNNs following 14 | the ESPnet encoder. (VGG+RNN+MLP+tanh()) 15 | 16 | Arguments 17 | --------- 18 | input_shape : tuple 19 | The shape of an example expected input. 20 | activation : torch class 21 | A class used for constructing the activation layers. For CNN and DNN. 22 | dropout : float 23 | Neuron dropout rate, applied to RNN only. 24 | cnn_channels : list of ints 25 | A list of the number of output channels for each CNN block. 26 | rnn_class : torch class 27 | The type of RNN to use (LiGRU, LSTM, GRU, RNN) 28 | rnn_layers : int 29 | The number of recurrent layers to include. 30 | rnn_neurons : int 31 | Number of neurons in each layer of the RNN. 32 | rnn_bidirectional : bool 33 | Whether this model will process just forward or both directions. 34 | projection_neurons : int 35 | The number of neurons in the last linear layer. 36 | 37 | Example 38 | ------- 39 | >>> inputs = torch.rand([10, 40, 60]) 40 | >>> model = ESPnetVGG(input_shape=inputs.shape) 41 | >>> outputs = model(inputs) 42 | >>> outputs.shape 43 | torch.Size([10, 10, 512]) 44 | """ 45 | 46 | def __init__( 47 | self, 48 | input_shape, 49 | activation=torch.nn.ReLU, 50 | dropout=0.15, 51 | cnn_channels=[64, 128], 52 | rnn_class=sb.nnet.RNN.LSTM, 53 | rnn_layers=4, 54 | rnn_neurons=512, 55 | rnn_bidirectional=True, 56 | rnn_re_init=False, 57 | projection_neurons=512, 58 | ): 59 | super().__init__(input_shape=input_shape) 60 | 61 | self.append(sb.nnet.containers.Sequential, layer_name="VGG") 62 | 63 | self.append( 64 | sb.nnet.CNN.Conv2d, 65 | out_channels=cnn_channels[0], 66 | kernel_size=(3, 3), 67 | layer_name="conv_1_1", 68 | ) 69 | self.append(activation(), layer_name="act_1_1") 70 | self.append( 71 | sb.nnet.CNN.Conv2d, 72 | out_channels=cnn_channels[0], 73 | kernel_size=(3, 3), 74 | layer_name="conv_1_2", 75 | ) 76 | self.append(activation(), layer_name="act_1_2") 77 | self.append( 78 | sb.nnet.pooling.Pooling2d( 79 | pool_type="max", kernel_size=(2, 2), pool_axis=(1, 2), 80 | ), 81 | layer_name="pooling_1", 82 | ) 83 | 84 | self.append( 85 | sb.nnet.CNN.Conv2d, 86 | out_channels=cnn_channels[1], 87 | kernel_size=(3, 3), 88 | layer_name="conv_2_1", 89 | ) 90 | self.append(activation(), layer_name="act_2_1") 91 | self.append( 92 | sb.nnet.CNN.Conv2d, 93 | out_channels=cnn_channels[1], 94 | kernel_size=(3, 3), 95 | layer_name="conv_2_2", 96 | ) 97 | self.append(activation(), layer_name="act_2_2") 98 | self.append( 99 | sb.nnet.pooling.Pooling2d( 100 | pool_type="max", kernel_size=(2, 2), pool_axis=(1, 2), 101 | ), 102 | layer_name="pooling_2", 103 | ) 104 | 105 | if rnn_layers > 0: 106 | self.append( 107 | rnn_class, 108 | layer_name="RNN", 109 | hidden_size=rnn_neurons, 110 | num_layers=rnn_layers, 111 | dropout=dropout, 112 | bidirectional=rnn_bidirectional, 113 | re_init=rnn_re_init, 114 | ) 115 | 116 | self.append( 117 | sb.nnet.linear.Linear, 118 | n_neurons=projection_neurons, 119 | layer_name="proj", 120 | ) 121 | self.append(torch.nn.Tanh(), layer_name="proj_act") 122 | -------------------------------------------------------------------------------- /speechbrain/lobes/models/IMU_CRNN.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | 6 | 7 | 8 | def check_model(model): 9 | pytorch_total_params = sum(p.numel() for p in model.parameters()) 10 | pytorch_train_params = sum(p.numel() for p in model.parameters() if p.requires_grad) 11 | print('Totalparams:', format(pytorch_total_params, ',')) 12 | print('Trainableparams:', format(pytorch_train_params, ',')) 13 | 14 | 15 | class IMU_CRNN_GRU(nn.Module): # IMU_CRNN_Ott_GRU_3 16 | ''' 17 | Modified net from Ott 2022 18 | GRU 2 with fewer neurons 19 | ''' 20 | 21 | def __init__(self, dropout_cnn=0.5, dropout_rnn=0.2, rnn_width=60): 22 | super().__init__() 23 | 24 | channel_num_1 = 128 25 | channel_num_2 = 200 26 | 27 | self.down = nn.AvgPool1d(kernel_size=10, stride=5, padding=4) 28 | 29 | self.conv1 = nn.Conv1d(in_channels=8, out_channels=channel_num_1, kernel_size=3, stride=1, padding=1) # floor(500 + 2*p - 3 + 1) = 500 30 | self.pool1 = nn.MaxPool1d(kernel_size=2) 31 | self.norm1 = nn.BatchNorm1d(num_features=channel_num_1) 32 | self.drop1 = nn.Dropout(p=dropout_cnn) 33 | 34 | self.conv2 = nn.Conv1d(in_channels=channel_num_1, out_channels=channel_num_2, kernel_size=3, stride=1, padding=1) # (250 + 2*2 - 4) / 1 = 250 35 | self.norm2 = nn.BatchNorm1d(num_features=channel_num_2) 36 | self.drop2 = nn.Dropout(p=dropout_cnn) # [B, C2, T] 37 | 38 | self.rnn = nn.GRU(input_size=channel_num_2, hidden_size=rnn_width, num_layers=2, 39 | bias=True, batch_first=True, dropout=dropout_rnn, bidirectional=True) 40 | self.drop3 = nn.Dropout(p=dropout_rnn) 41 | 42 | self.fc = nn.Linear(in_features=rnn_width*2, out_features=1) 43 | 44 | def forward(self, x, cls=True): 45 | ''' 46 | If don't want classification output, set cls=False 47 | ''' 48 | if ('CUDA_VISIBLE_DEVICES' not in os.environ) or len(os.environ['CUDA_VISIBLE_DEVICES']) > 1: 49 | self.rnn.flatten_parameters() 50 | x = self.down(x) # [B, 64, 500] 51 | 52 | x = F.relu_(self.conv1(x)) 53 | x = self.pool1(x) 54 | x = self.norm1(x) 55 | x = self.drop1(x) # [B, C1=200, T=50] 56 | 57 | x = F.relu_(self.conv2(x)) 58 | x = self.norm2(x) 59 | x = self.drop2(x) # [B, C2=200, T=25] 60 | 61 | x = x.permute([0, 2, 1]) # [B, T=25, C2=256] 62 | x, _ = self.rnn(x) # [B, T=25, 512] 63 | x = self.drop3(x) # [B, T, F=120] 64 | 65 | if cls==True: 66 | x = torch.sigmoid(self.fc(x)) 67 | x = x.squeeze() 68 | else: 69 | pass 70 | 71 | return x 72 | 73 | 74 | 75 | class ConvBlock(nn.Module): 76 | def __init__(self, in_channels, out_channels): 77 | super(ConvBlock, self).__init__() 78 | 79 | self.conv1 = nn.Conv1d(in_channels=in_channels, 80 | out_channels=out_channels, 81 | kernel_size=3, 82 | stride=1, 83 | padding=1, 84 | bias=False) 85 | 86 | self.conv2 = nn.Conv1d(in_channels=out_channels, 87 | out_channels=out_channels, 88 | kernel_size=3, 89 | stride=1, 90 | padding=1, 91 | bias=False) 92 | 93 | self.bn1 = nn.BatchNorm1d(out_channels) 94 | self.bn2 = nn.BatchNorm1d(out_channels) 95 | 96 | def forward(self, input): 97 | """ 98 | Args: 99 | input: (batch_size, in_channels, time_steps, freq_bins) 100 | 101 | Outputs: 102 | output: (batch_size, out_channels, classes_num) 103 | """ 104 | 105 | x = F.relu_(self.bn1(self.conv1(input))) 106 | x = F.relu_(self.bn2(self.conv2(x))) 107 | 108 | return x 109 | 110 | 111 | # if __name__ == '__main__': 112 | # main() 113 | -------------------------------------------------------------------------------- /speechbrain/lobes/models/MetricGAN.py: -------------------------------------------------------------------------------- 1 | """Generator and discriminator used in MetricGAN 2 | 3 | Authors: 4 | * Szu-Wei Fu 2020 5 | """ 6 | import torch 7 | import speechbrain as sb 8 | from torch import nn 9 | from torch.nn.utils import spectral_norm 10 | 11 | 12 | def xavier_init_layer( 13 | in_size, out_size=None, spec_norm=True, layer_type=nn.Linear, **kwargs 14 | ): 15 | "Create a layer with spectral norm, xavier uniform init and zero bias" 16 | if out_size is None: 17 | out_size = in_size 18 | 19 | layer = layer_type(in_size, out_size, **kwargs) 20 | if spec_norm: 21 | layer = spectral_norm(layer) 22 | 23 | # Perform initialization 24 | nn.init.xavier_uniform_(layer.weight, gain=1.0) 25 | nn.init.zeros_(layer.bias) 26 | 27 | return layer 28 | 29 | 30 | def shifted_sigmoid(x): 31 | return 1.2 / (1 + torch.exp(-(1 / 1.6) * x)) 32 | 33 | 34 | class Learnable_sigmoid(nn.Module): 35 | def __init__(self, in_features=257): 36 | super().__init__() 37 | self.slope = nn.Parameter(torch.ones(in_features)) 38 | self.slope.requiresGrad = True # set requiresGrad to true! 39 | 40 | # self.scale = nn.Parameter(torch.ones(1)) 41 | # self.scale.requiresGrad = True # set requiresGrad to true! 42 | 43 | def forward(self, x): 44 | return 1.2 * torch.sigmoid(self.slope * x) 45 | 46 | 47 | class EnhancementGenerator(nn.Module): 48 | """Simple LSTM for enhancement with custom initialization. 49 | 50 | Arguments 51 | --------- 52 | input_size : int 53 | Size of the input tensor's last dimension. 54 | hidden_size : int 55 | Number of neurons to use in the LSTM layers. 56 | num_layers : int 57 | Number of layers to use in the LSTM. 58 | dropout : int 59 | Fraction of neurons to drop during training. 60 | """ 61 | 62 | def __init__( 63 | self, input_size=257, hidden_size=200, num_layers=2, dropout=0, 64 | ): 65 | super().__init__() 66 | self.activation = nn.LeakyReLU(negative_slope=0.3) 67 | 68 | self.blstm = sb.nnet.RNN.LSTM( 69 | input_size=input_size, 70 | hidden_size=hidden_size, 71 | num_layers=num_layers, 72 | dropout=dropout, 73 | bidirectional=True, 74 | ) 75 | """ 76 | Use orthogonal init for recurrent layers, xavier uniform for input layers 77 | Bias is 0 78 | """ 79 | for name, param in self.blstm.named_parameters(): 80 | if "bias" in name: 81 | nn.init.zeros_(param) 82 | elif "weight_ih" in name: 83 | nn.init.xavier_uniform_(param) 84 | elif "weight_hh" in name: 85 | nn.init.orthogonal_(param) 86 | 87 | self.linear1 = xavier_init_layer(400, 300, spec_norm=False) 88 | self.linear2 = xavier_init_layer(300, 257, spec_norm=False) 89 | 90 | self.Learnable_sigmoid = Learnable_sigmoid() 91 | self.sigmoid = nn.Sigmoid() 92 | 93 | def forward(self, x, lengths): 94 | out, _ = self.blstm(x, lengths=lengths) 95 | 96 | out = self.linear1(out) 97 | out = self.activation(out) 98 | 99 | out = self.linear2(out) 100 | out = self.Learnable_sigmoid(out) 101 | 102 | return out 103 | 104 | 105 | class MetricDiscriminator(nn.Module): 106 | """Metric estimator for enhancement training. 107 | 108 | Consists of: 109 | * four 2d conv layers 110 | * channel averaging 111 | * three linear layers 112 | 113 | Arguments 114 | --------- 115 | kernel_size : tuple 116 | The dimensions of the 2-d kernel used for convolution. 117 | base_channels : int 118 | Number of channels used in each conv layer. 119 | """ 120 | 121 | def __init__( 122 | self, kernel_size=(5, 5), base_channels=15, activation=nn.LeakyReLU, 123 | ): 124 | super().__init__() 125 | 126 | self.activation = activation(negative_slope=0.3) 127 | 128 | self.BN = nn.BatchNorm2d(num_features=2, momentum=0.01) 129 | 130 | self.conv1 = xavier_init_layer( 131 | 2, base_channels, layer_type=nn.Conv2d, kernel_size=kernel_size 132 | ) 133 | self.conv2 = xavier_init_layer( 134 | base_channels, layer_type=nn.Conv2d, kernel_size=kernel_size 135 | ) 136 | self.conv3 = xavier_init_layer( 137 | base_channels, layer_type=nn.Conv2d, kernel_size=kernel_size 138 | ) 139 | self.conv4 = xavier_init_layer( 140 | base_channels, layer_type=nn.Conv2d, kernel_size=kernel_size 141 | ) 142 | 143 | self.Linear1 = xavier_init_layer(base_channels, out_size=50) 144 | self.Linear2 = xavier_init_layer(in_size=50, out_size=10) 145 | self.Linear3 = xavier_init_layer(in_size=10, out_size=1) 146 | 147 | def forward(self, x): 148 | out = self.BN(x) 149 | 150 | out = self.conv1(out) 151 | out = self.activation(out) 152 | 153 | out = self.conv2(out) 154 | out = self.activation(out) 155 | 156 | out = self.conv3(out) 157 | out = self.activation(out) 158 | 159 | out = self.conv4(out) 160 | out = self.activation(out) 161 | 162 | out = torch.mean(out, (2, 3)) 163 | 164 | out = self.Linear1(out) 165 | out = self.activation(out) 166 | 167 | out = self.Linear2(out) 168 | out = self.activation(out) 169 | 170 | out = self.Linear3(out) 171 | 172 | return out 173 | -------------------------------------------------------------------------------- /speechbrain/lobes/models/MetricGAN_U.py: -------------------------------------------------------------------------------- 1 | """Generator and discriminator used in MetricGAN-U 2 | 3 | Authors: 4 | * Szu-Wei Fu 2020 5 | """ 6 | import torch 7 | import speechbrain as sb 8 | from torch import nn 9 | from torch.nn.utils import spectral_norm 10 | 11 | 12 | def xavier_init_layer( 13 | in_size, out_size=None, spec_norm=True, layer_type=nn.Linear, **kwargs 14 | ): 15 | "Create a layer with spectral norm, xavier uniform init and zero bias" 16 | if out_size is None: 17 | out_size = in_size 18 | 19 | layer = layer_type(in_size, out_size, **kwargs) 20 | if spec_norm: 21 | layer = spectral_norm(layer) 22 | 23 | # Perform initialization 24 | nn.init.xavier_uniform_(layer.weight, gain=1.0) 25 | nn.init.zeros_(layer.bias) 26 | 27 | return layer 28 | 29 | 30 | class EnhancementGenerator(nn.Module): 31 | """Simple LSTM for enhancement with custom initialization. 32 | 33 | Arguments 34 | --------- 35 | input_size : int 36 | Size of the input tensor's last dimension. 37 | hidden_size : int 38 | Number of neurons to use in the LSTM layers. 39 | num_layers : int 40 | Number of layers to use in the LSTM. 41 | lin_dim: int 42 | Number of neurons in the last two linear layers. 43 | dropout : int 44 | Fraction of neurons to drop during training. 45 | 46 | Example 47 | ------- 48 | >>> inputs = torch.rand([10, 100, 40]) 49 | >>> model = EnhancementGenerator(input_size=40, hidden_size=50) 50 | >>> outputs = model(inputs, lengths=torch.ones([10])) 51 | >>> outputs.shape 52 | torch.Size([10, 100, 40]) 53 | """ 54 | 55 | def __init__( 56 | self, 57 | input_size=257, 58 | hidden_size=200, 59 | num_layers=2, 60 | lin_dim=300, 61 | dropout=0, 62 | ): 63 | super().__init__() 64 | self.activation = nn.LeakyReLU(negative_slope=0.3) 65 | 66 | self.blstm = sb.nnet.RNN.LSTM( 67 | input_size=input_size, 68 | hidden_size=hidden_size, 69 | num_layers=num_layers, 70 | dropout=dropout, 71 | bidirectional=True, 72 | ) 73 | """ 74 | Use orthogonal init for recurrent layers, xavier uniform for input layers 75 | Bias is 0 76 | """ 77 | for name, param in self.blstm.named_parameters(): 78 | if "bias" in name: 79 | nn.init.zeros_(param) 80 | elif "weight_ih" in name: 81 | nn.init.xavier_uniform_(param) 82 | elif "weight_hh" in name: 83 | nn.init.orthogonal_(param) 84 | 85 | self.linear1 = xavier_init_layer( 86 | hidden_size * 2, lin_dim, spec_norm=False 87 | ) 88 | self.linear2 = xavier_init_layer(lin_dim, input_size, spec_norm=False) 89 | 90 | self.sigmoid = nn.Sigmoid() 91 | 92 | def forward(self, x, lengths): 93 | out, _ = self.blstm(x, lengths=lengths) 94 | 95 | out = self.linear1(out) 96 | out = self.activation(out) 97 | 98 | out = self.linear2(out) 99 | out = self.sigmoid(out) 100 | 101 | return out 102 | 103 | 104 | class MetricDiscriminator(nn.Module): 105 | """Metric estimator for enhancement training. 106 | 107 | Consists of: 108 | * four 2d conv layers 109 | * channel averaging 110 | * three linear layers 111 | 112 | Arguments 113 | --------- 114 | kernel_size : tuple 115 | The dimensions of the 2-d kernel used for convolution. 116 | base_channels : int 117 | Number of channels used in each conv layer. 118 | lin_dim1: int 119 | Dimensionality of the first linear layer. 120 | lin_dim2: int 121 | Dimensionality of the second linear layer. 122 | 123 | 124 | Example 125 | ------- 126 | >>> inputs = torch.rand([1, 1, 100, 257]) 127 | >>> model = MetricDiscriminator() 128 | >>> outputs = model(inputs) 129 | >>> outputs.shape 130 | torch.Size([1, 1]) 131 | """ 132 | 133 | # FCN 134 | def __init__( 135 | self, 136 | kernel_size=(5, 5), 137 | base_channels=15, 138 | activation=nn.LeakyReLU, 139 | lin_dim1=50, 140 | lin_dim2=10, 141 | ): 142 | super().__init__() 143 | 144 | self.activation = activation(negative_slope=0.3) 145 | 146 | self.BN = nn.BatchNorm2d(num_features=1, momentum=0.01) 147 | 148 | self.conv1 = xavier_init_layer( 149 | 1, base_channels, layer_type=nn.Conv2d, kernel_size=kernel_size 150 | ) 151 | self.conv2 = xavier_init_layer( 152 | base_channels, layer_type=nn.Conv2d, kernel_size=kernel_size 153 | ) 154 | self.conv3 = xavier_init_layer( 155 | base_channels, layer_type=nn.Conv2d, kernel_size=kernel_size 156 | ) 157 | self.conv4 = xavier_init_layer( 158 | base_channels, layer_type=nn.Conv2d, kernel_size=kernel_size 159 | ) 160 | 161 | self.Linear1 = xavier_init_layer(base_channels, out_size=lin_dim1) 162 | self.Linear2 = xavier_init_layer(in_size=lin_dim1, out_size=lin_dim2) 163 | self.Linear3 = xavier_init_layer(in_size=lin_dim2, out_size=1) 164 | 165 | def forward(self, x): 166 | 167 | out = self.conv1(x) 168 | out = self.activation(out) 169 | 170 | out = self.conv2(out) 171 | out = self.activation(out) 172 | 173 | out = self.conv3(out) 174 | out = self.activation(out) 175 | 176 | out = self.conv4(out) 177 | out = self.activation(out) 178 | 179 | out = torch.mean(out, (2, 3)) 180 | 181 | out = self.Linear1(out) 182 | out = self.activation(out) 183 | 184 | out = self.Linear2(out) 185 | out = self.activation(out) 186 | 187 | out = self.Linear3(out) 188 | 189 | return out 190 | -------------------------------------------------------------------------------- /speechbrain/lobes/models/RNNLM.py: -------------------------------------------------------------------------------- 1 | """Implementation of a Recurrent Language Model. 2 | 3 | Authors 4 | * Mirco Ravanelli 2020 5 | * Peter Plantinga 2020 6 | * Ju-Chieh Chou 2020 7 | * Titouan Parcollet 2020 8 | * Abdel 2020 9 | """ 10 | import torch 11 | from torch import nn 12 | import speechbrain as sb 13 | 14 | 15 | class RNNLM(nn.Module): 16 | """This model is a combination of embedding layer, RNN, DNN. 17 | It can be used for RNNLM. 18 | 19 | Arguments 20 | --------- 21 | output_neurons : int 22 | Number of entries in embedding table, also the number of neurons in 23 | output layer. 24 | embedding_dim : int 25 | Size of embedding vectors (default 128). 26 | activation : torch class 27 | A class used for constructing the activation layers for DNN. 28 | dropout : float 29 | Neuron dropout rate applied to embedding, RNN, and DNN. 30 | rnn_class : torch class 31 | The type of RNN to use in RNNLM network (LiGRU, LSTM, GRU, RNN) 32 | rnn_layers : int 33 | The number of recurrent layers to include. 34 | rnn_neurons : int 35 | Number of neurons in each layer of the RNN. 36 | rnn_re_init : bool 37 | Whether to initialize rnn with orthogonal initialization. 38 | rnn_return_hidden : bool 39 | Whether to return hidden states (default True). 40 | dnn_blocks : int 41 | The number of linear neural blocks to include. 42 | dnn_neurons : int 43 | The number of neurons in the linear layers. 44 | 45 | Example 46 | ------- 47 | >>> model = RNNLM(output_neurons=5) 48 | >>> inputs = torch.Tensor([[1, 2, 3]]) 49 | >>> outputs = model(inputs) 50 | >>> outputs.shape 51 | torch.Size([1, 3, 5]) 52 | """ 53 | 54 | def __init__( 55 | self, 56 | output_neurons, 57 | embedding_dim=128, 58 | activation=torch.nn.LeakyReLU, 59 | dropout=0.15, 60 | rnn_class=sb.nnet.RNN.LSTM, 61 | rnn_layers=2, 62 | rnn_neurons=1024, 63 | rnn_re_init=False, 64 | return_hidden=False, 65 | dnn_blocks=1, 66 | dnn_neurons=512, 67 | ): 68 | super().__init__() 69 | self.embedding = sb.nnet.embedding.Embedding( 70 | num_embeddings=output_neurons, embedding_dim=embedding_dim 71 | ) 72 | self.dropout = nn.Dropout(p=dropout) 73 | self.rnn = rnn_class( 74 | input_size=embedding_dim, 75 | hidden_size=rnn_neurons, 76 | num_layers=rnn_layers, 77 | dropout=dropout, 78 | re_init=rnn_re_init, 79 | ) 80 | self.return_hidden = return_hidden 81 | self.reshape = False 82 | 83 | self.dnn = sb.nnet.containers.Sequential( 84 | input_shape=[None, None, rnn_neurons] 85 | ) 86 | for block_index in range(dnn_blocks): 87 | self.dnn.append( 88 | sb.nnet.linear.Linear, 89 | n_neurons=dnn_neurons, 90 | bias=True, 91 | layer_name="linear", 92 | ) 93 | self.dnn.append(sb.nnet.normalization.LayerNorm, layer_name="norm") 94 | self.dnn.append(activation(), layer_name="act") 95 | self.dnn.append(torch.nn.Dropout(p=dropout), layer_name="dropout") 96 | 97 | self.out = sb.nnet.linear.Linear( 98 | input_size=dnn_neurons, n_neurons=output_neurons 99 | ) 100 | 101 | def forward(self, x, hx=None): 102 | 103 | x = self.embedding(x) 104 | x = self.dropout(x) 105 | 106 | # If 2d tensor, add a time-axis 107 | # This is used for inference time 108 | if len(x.shape) == 2: 109 | x = x.unsqueeze(dim=1) 110 | self.reshape = True 111 | 112 | x, hidden = self.rnn(x, hx) 113 | x = self.dnn(x) 114 | out = self.out(x) 115 | 116 | if self.reshape: 117 | out = out.squeeze(dim=1) 118 | 119 | if self.return_hidden: 120 | return out, hidden 121 | else: 122 | return out 123 | -------------------------------------------------------------------------------- /speechbrain/lobes/models/VanillaNN.py: -------------------------------------------------------------------------------- 1 | """Vanilla Neural Network for simple tests. 2 | 3 | Authors 4 | * Elena Rastorgueva 2020 5 | """ 6 | import torch 7 | import speechbrain as sb 8 | 9 | 10 | class VanillaNN(sb.nnet.containers.Sequential): 11 | """A simple vanilla Deep Neural Network. 12 | 13 | Arguments 14 | --------- 15 | activation : torch class 16 | A class used for constructing the activation layers. 17 | dnn_blocks : int 18 | The number of linear neural blocks to include. 19 | dnn_neurons : int 20 | The number of neurons in the linear layers. 21 | 22 | Example 23 | ------- 24 | >>> inputs = torch.rand([10, 120, 60]) 25 | >>> model = VanillaNN(input_shape=inputs.shape) 26 | >>> outputs = model(inputs) 27 | >>> outputs.shape 28 | torch.Size([10, 120, 512]) 29 | """ 30 | 31 | def __init__( 32 | self, 33 | input_shape, 34 | activation=torch.nn.LeakyReLU, 35 | dnn_blocks=2, 36 | dnn_neurons=512, 37 | ): 38 | super().__init__(input_shape=input_shape) 39 | 40 | for block_index in range(dnn_blocks): 41 | self.append( 42 | sb.nnet.linear.Linear, 43 | n_neurons=dnn_neurons, 44 | bias=True, 45 | layer_name="linear", 46 | ) 47 | self.append(activation(), layer_name="act") 48 | -------------------------------------------------------------------------------- /speechbrain/lobes/models/__init__.py: -------------------------------------------------------------------------------- 1 | """ Package defining neural netword models (CRDNN, Xvectors ...) 2 | """ 3 | -------------------------------------------------------------------------------- /speechbrain/lobes/models/convolution.py: -------------------------------------------------------------------------------- 1 | """This is a module to ensemble a convolution (depthwise) encoder with or without residule connection. 2 | 3 | Authors 4 | * Jianyuan Zhong 2020 5 | """ 6 | import torch 7 | from speechbrain.nnet.CNN import Conv2d 8 | from speechbrain.nnet.containers import Sequential 9 | from speechbrain.nnet.normalization import BatchNorm2d 10 | 11 | 12 | class ConvolutionFrontEnd(Sequential): 13 | """This is a module to ensemble a convolution (depthwise) encoder with or 14 | without residual connection. 15 | 16 | Arguments 17 | ---------- 18 | out_channels: int 19 | Number of output channels of this model (default 640). 20 | out_channels: Optional(list[int]) 21 | Number of output channels for each of block. 22 | kernel_size: int 23 | Kernel size of convolution layers (default 3). 24 | strides: Optional(list[int]) 25 | Striding factor for each block, this stride is applied at the last convolution layer at each block. 26 | num_blocks: int 27 | Number of block (default 21). 28 | num_per_layers: int 29 | Number of convolution layers for each block (default 5). 30 | dropout: float 31 | Dropout (default 0.15). 32 | activation: torch class 33 | Activation function for each block (default Swish). 34 | norm: torch class 35 | Normalization to regularize the model (default BatchNorm1d). 36 | residuals: Optional(list[bool]) 37 | Whether apply residual connection at each block (default None). 38 | 39 | Example 40 | ------- 41 | >>> x = torch.rand((8, 30, 10)) 42 | >>> conv = ConvolutionFrontEnd(input_shape=x.shape) 43 | >>> out = conv(x) 44 | >>> out.shape 45 | torch.Size([8, 8, 3, 512]) 46 | """ 47 | 48 | def __init__( 49 | self, 50 | input_shape, 51 | num_blocks=3, 52 | num_layers_per_block=5, 53 | out_channels=[128, 256, 512], 54 | kernel_sizes=[3, 3, 3], 55 | strides=[1, 2, 2], 56 | dilations=[1, 1, 1], 57 | residuals=[True, True, True], 58 | conv_module=Conv2d, 59 | activation=torch.nn.LeakyReLU, 60 | norm=BatchNorm2d, 61 | dropout=0.1, 62 | ): 63 | super().__init__(input_shape=input_shape) 64 | for i in range(num_blocks): 65 | self.append( 66 | ConvBlock, 67 | num_layers=num_layers_per_block, 68 | out_channels=out_channels[i], 69 | kernel_size=kernel_sizes[i], 70 | stride=strides[i], 71 | dilation=dilations[i], 72 | residual=residuals[i], 73 | conv_module=conv_module, 74 | activation=activation, 75 | norm=norm, 76 | dropout=dropout, 77 | layer_name=f"convblock_{i}", 78 | ) 79 | 80 | 81 | class ConvBlock(torch.nn.Module): 82 | """An implementation of convolution block with 1d or 2d convolutions (depthwise). 83 | 84 | Arguments 85 | ---------- 86 | out_channels : int 87 | Number of output channels of this model (default 640). 88 | kernel_size : int 89 | Kernel size of convolution layers (default 3). 90 | strides : int 91 | Striding factor for this block (default 1). 92 | num_layers : int 93 | Number of depthwise convolution layers for this block. 94 | activation : torch class 95 | Activation function for this block. 96 | norm : torch class 97 | Normalization to regularize the model (default BatchNorm1d). 98 | residuals: bool 99 | Whether apply residual connection at this block (default None). 100 | 101 | Example 102 | ------- 103 | >>> x = torch.rand((8, 30, 10)) 104 | >>> conv = ConvBlock(2, 16, input_shape=x.shape) 105 | >>> out = conv(x) 106 | >>> out.shape 107 | torch.Size([8, 30, 10, 16]) 108 | """ 109 | 110 | def __init__( 111 | self, 112 | num_layers, 113 | out_channels, 114 | input_shape, 115 | kernel_size=3, 116 | stride=1, 117 | dilation=1, 118 | residual=False, 119 | conv_module=Conv2d, 120 | activation=torch.nn.LeakyReLU, 121 | norm=None, 122 | dropout=0.1, 123 | ): 124 | super().__init__() 125 | 126 | self.convs = Sequential(input_shape=input_shape) 127 | 128 | for i in range(num_layers): 129 | self.convs.append( 130 | conv_module, 131 | out_channels=out_channels, 132 | kernel_size=kernel_size, 133 | stride=stride if i == num_layers - 1 else 1, 134 | dilation=dilation, 135 | layer_name=f"conv_{i}", 136 | ) 137 | if norm is not None: 138 | self.convs.append(norm, layer_name=f"norm_{i}") 139 | self.convs.append(activation(), layer_name=f"act_{i}") 140 | self.convs.append( 141 | torch.nn.Dropout(dropout), layer_name=f"dropout_{i}" 142 | ) 143 | 144 | self.reduce_conv = None 145 | self.drop = None 146 | if residual: 147 | self.reduce_conv = Sequential(input_shape=input_shape) 148 | self.reduce_conv.append( 149 | conv_module, 150 | out_channels=out_channels, 151 | kernel_size=1, 152 | stride=stride, 153 | layer_name="conv", 154 | ) 155 | self.reduce_conv.append(norm, layer_name="norm") 156 | self.drop = torch.nn.Dropout(dropout) 157 | 158 | def forward(self, x): 159 | out = self.convs(x) 160 | if self.reduce_conv: 161 | out = out + self.reduce_conv(x) 162 | out = self.drop(out) 163 | 164 | return out 165 | -------------------------------------------------------------------------------- /speechbrain/lobes/models/transformer/TransformerLM.py: -------------------------------------------------------------------------------- 1 | """An implementation of Transformer Language model. 2 | 3 | Authors 4 | * Jianyuan Zhong 5 | * Samuele Cornell 6 | """ 7 | 8 | 9 | import torch # noqa 42 10 | from torch import nn 11 | 12 | from speechbrain.nnet.linear import Linear 13 | from speechbrain.nnet.normalization import LayerNorm 14 | from speechbrain.nnet.containers import ModuleList 15 | from speechbrain.lobes.models.transformer.Transformer import ( 16 | TransformerInterface, 17 | get_lookahead_mask, 18 | get_key_padding_mask, 19 | NormalizedEmbedding, 20 | ) 21 | 22 | 23 | class TransformerLM(TransformerInterface): 24 | """This is an implementation of transformer language model. 25 | 26 | The architecture is based on the paper "Attention Is All You Need": https://arxiv.org/pdf/1706.03762.pdf 27 | 28 | Arguments 29 | ---------- 30 | d_model : int 31 | The number of expected features in the encoder/decoder inputs (default=512). 32 | nhead : int 33 | The number of heads in the multiheadattention models (default=8). 34 | num_encoder_layers : int 35 | The number of sub-encoder-layers in the encoder (default=6). 36 | num_decoder_layers : int 37 | The number of sub-decoder-layers in the decoder (default=6). 38 | dim_ffn : int 39 | The dimension of the feedforward network model (default=2048). 40 | dropout : int 41 | The dropout value (default=0.1). 42 | activation: torch class 43 | The activation function of encoder/decoder intermediate layer, relu or gelu (default=relu). 44 | 45 | Example 46 | ------- 47 | >>> src = torch.randint(0, 720, [8, 120]) 48 | >>> net = TransformerLM(720, 512, 8, 1, 0, 1024, activation=torch.nn.GELU) 49 | >>> enc_out = net.forward(src) 50 | >>> print(enc_out.shape) 51 | torch.Size([8, 120, 720]) 52 | """ 53 | 54 | def __init__( 55 | self, 56 | vocab, 57 | d_model=512, 58 | nhead=8, 59 | num_encoder_layers=12, 60 | num_decoder_layers=0, 61 | d_ffn=2048, 62 | dropout=0.1, 63 | activation=nn.ReLU, 64 | positional_encoding="fixed_abs_sine", 65 | normalize_before=False, 66 | d_embedding=None, 67 | max_length=2500, 68 | causal=True, 69 | attention_type="regularMHA", 70 | ): 71 | super().__init__( 72 | d_model=d_model, 73 | nhead=nhead, 74 | num_encoder_layers=num_encoder_layers, 75 | num_decoder_layers=num_decoder_layers, 76 | d_ffn=d_ffn, 77 | dropout=dropout, 78 | activation=activation, 79 | positional_encoding=positional_encoding, 80 | normalize_before=normalize_before, 81 | max_length=max_length, 82 | causal=causal, 83 | attention_type=attention_type, 84 | ) 85 | 86 | self.d_embedding = d_embedding 87 | if d_embedding is None: 88 | self.d_embedding = d_model 89 | 90 | self.custom_src_module = NormalizedEmbedding(self.d_embedding, vocab) 91 | 92 | self.embedding_proj = None 93 | if d_embedding is not None: 94 | self.embedding_proj = Linear( 95 | input_size=self.d_embedding, n_neurons=d_model 96 | ) 97 | 98 | self.output_proj = ModuleList( 99 | Linear(input_size=d_model, n_neurons=d_model), 100 | LayerNorm(d_model, eps=1e-6), 101 | Linear(input_size=d_model, n_neurons=vocab), 102 | ) 103 | 104 | self.num_encoder_layers = num_encoder_layers 105 | self.num_decoder_layers = num_decoder_layers 106 | 107 | # reset the params of the transformer model 108 | self._reset_params() 109 | 110 | def forward(self, src, hx=None): 111 | """ 112 | Arguments 113 | --------- 114 | src : tensor 115 | The sequence to the encoder (required). 116 | """ 117 | src_mask, src_key_padding_mask = self.make_masks(src) 118 | 119 | src = self.custom_src_module(src) 120 | if self.embedding_proj is not None: 121 | src = self.embedding_proj(src) 122 | src = src + self.positional_encoding(src) 123 | if self.num_encoder_layers > 0: 124 | encoder_out, _ = self.encoder( 125 | src=src, 126 | src_mask=src_mask, 127 | src_key_padding_mask=src_key_padding_mask, 128 | ) 129 | 130 | if self.num_decoder_layers > 0: 131 | encoder_out, _ = self.decoder( 132 | src=src, 133 | tgt=src, 134 | tgt_mask=src_mask, 135 | tgt_key_padding_mask=src_key_padding_mask, 136 | ) 137 | 138 | pred = self.output_proj(encoder_out) 139 | 140 | return pred 141 | 142 | def _reset_params(self): 143 | for p in self.parameters(): 144 | if p.dim() > 1: 145 | torch.nn.init.xavier_normal_(p) 146 | 147 | def make_masks( 148 | self, src, pad_idx=0, look_ahead_mask=True, padding_mask=True 149 | ): 150 | src_mask = None 151 | if look_ahead_mask: 152 | src_mask = get_lookahead_mask(src) 153 | 154 | src_key_padding_mask = None 155 | if padding_mask: 156 | src_key_padding_mask = get_key_padding_mask(src, pad_idx) 157 | 158 | return src_mask, src_key_padding_mask 159 | -------------------------------------------------------------------------------- /speechbrain/lobes/models/transformer/TransformerSE.py: -------------------------------------------------------------------------------- 1 | """CNN Transformer model for SE in the SpeechBrain style. 2 | 3 | Authors 4 | * Chien-Feng Liao 2020 5 | """ 6 | import torch # noqa E402 7 | from torch import nn 8 | from speechbrain.nnet.linear import Linear 9 | from speechbrain.lobes.models.transformer.Transformer import ( 10 | TransformerInterface, 11 | get_lookahead_mask, 12 | ) 13 | 14 | 15 | class CNNTransformerSE(TransformerInterface): 16 | """This is an implementation of transformer model with CNN pre-encoder for SE. 17 | 18 | Arguments 19 | --------- 20 | d_model : int 21 | The number of expected features in the encoder inputs. 22 | output_size : int 23 | The number of neurons in the output layer. 24 | output_activation : torch class 25 | The activation function of the output layer (default=ReLU). 26 | nhead : int 27 | The number of heads in the multi-head attention models (default=8). 28 | num_layers : int 29 | The number of sub-layers in the transformer (default=8). 30 | d_ffn : int 31 | The number of expected features in the encoder layers (default=512). 32 | dropout : int 33 | The dropout value (default=0.1). 34 | activation : torch class 35 | The activation function of intermediate layers (default=LeakyReLU). 36 | causal : bool 37 | True for causal setting, the model is forbidden to see future frames (default=True). 38 | custom_emb_module : torch class 39 | Module that processes the input features before the transformer model. 40 | 41 | Example 42 | ------- 43 | >>> src = torch.rand([8, 120, 256]) 44 | >>> net = CNNTransformerSE(d_model=256, output_size=257) 45 | >>> out = net(src) 46 | >>> out.shape 47 | torch.Size([8, 120, 257]) 48 | """ 49 | 50 | def __init__( 51 | self, 52 | d_model, 53 | output_size, 54 | output_activation=nn.ReLU, 55 | nhead=8, 56 | num_layers=8, 57 | d_ffn=512, 58 | dropout=0.1, 59 | activation=nn.LeakyReLU, 60 | causal=True, 61 | custom_emb_module=None, 62 | normalize_before=False, 63 | ): 64 | super().__init__( 65 | d_model=d_model, 66 | nhead=nhead, 67 | num_encoder_layers=num_layers, 68 | num_decoder_layers=0, 69 | d_ffn=d_ffn, 70 | dropout=dropout, 71 | activation=activation, 72 | positional_encoding=None, 73 | normalize_before=normalize_before, 74 | causal=causal, 75 | ) 76 | 77 | self.custom_emb_module = custom_emb_module 78 | self.output_layer = Linear(output_size, input_size=d_model, bias=False) 79 | self.output_activation = output_activation() 80 | 81 | def forward(self, x, src_key_padding_mask=None): 82 | if self.causal: 83 | self.attn_mask = get_lookahead_mask(x) 84 | else: 85 | self.attn_mask = None 86 | 87 | if self.custom_emb_module is not None: 88 | x = self.custom_emb_module(x) 89 | 90 | encoder_output, _ = self.encoder( 91 | src=x, 92 | src_mask=self.attn_mask, 93 | src_key_padding_mask=src_key_padding_mask, 94 | ) 95 | 96 | output = self.output_layer(encoder_output) 97 | output = self.output_activation(output) 98 | 99 | return output 100 | -------------------------------------------------------------------------------- /speechbrain/lobes/models/transformer/__init__.py: -------------------------------------------------------------------------------- 1 | """High level processing blocks. 2 | 3 | This subpackage gathers higher level blocks, or "lobes". 4 | The classes here may leverage the extended YAML syntax. 5 | """ 6 | -------------------------------------------------------------------------------- /speechbrain/log-config.yaml: -------------------------------------------------------------------------------- 1 | version: 1 2 | disable_existing_loggers: False 3 | formatters: 4 | simple: 5 | format: "%(asctime)s - %(name)s - %(levelname)s - %(message)s" 6 | console: 7 | format: "%(name)s - %(message)s" 8 | 9 | handlers: 10 | console: 11 | class: speechbrain.utils.logger.TqdmCompatibleStreamHandler 12 | level: INFO 13 | formatter: console 14 | stream: ext://sys.stdout 15 | 16 | file_handler: 17 | class: logging.FileHandler 18 | level: DEBUG 19 | formatter: simple 20 | filename: log.txt 21 | encoding: utf8 22 | 23 | root: 24 | level: DEBUG 25 | handlers: [console, file_handler] 26 | -------------------------------------------------------------------------------- /speechbrain/nnet/__init__.py: -------------------------------------------------------------------------------- 1 | """ Package containing the different neural networks layers 2 | """ 3 | import os 4 | 5 | __all__ = [] 6 | for filename in os.listdir(os.path.dirname(__file__)): 7 | filename = os.path.basename(filename) 8 | if filename.endswith(".py") and not filename.startswith("__"): 9 | __all__.append(filename[:-3]) 10 | 11 | from . import * # noqa 12 | from .loss import stoi_loss # noqa 13 | -------------------------------------------------------------------------------- /speechbrain/nnet/activations.py: -------------------------------------------------------------------------------- 1 | """Library implementing activation functions. 2 | 3 | Authors 4 | * Mirco Ravanelli 2020 5 | * Jianyuan Zhong 2020 6 | """ 7 | 8 | import torch 9 | import logging 10 | import torch.nn.functional as F 11 | 12 | logger = logging.getLogger(__name__) 13 | 14 | 15 | class Softmax(torch.nn.Module): 16 | """Computes the softmax of a 2d, 3d, or 4d input tensor. 17 | 18 | Arguments 19 | --------- 20 | apply_log : bool 21 | Whether to apply the log function before softmax. 22 | dim : int 23 | If the dimension where softmax is applied. 24 | 25 | Example 26 | ------- 27 | >>> classifier = Softmax() 28 | >>> inputs = torch.rand(10, 50, 40) 29 | >>> output = classifier(inputs) 30 | >>> output.shape 31 | torch.Size([10, 50, 40]) 32 | """ 33 | 34 | def __init__(self, apply_log=False, dim=-1): 35 | super().__init__() 36 | 37 | if apply_log: 38 | self.act = torch.nn.LogSoftmax(dim=dim) 39 | else: 40 | self.act = torch.nn.Softmax(dim=dim) 41 | 42 | def forward(self, x): 43 | """Returns the softmax of the input tensor. 44 | 45 | Arguments 46 | --------- 47 | x : torch.Tensor 48 | Input tensor. 49 | """ 50 | # Reshaping the tensors 51 | dims = x.shape 52 | 53 | if len(dims) == 3: 54 | x = x.reshape(dims[0] * dims[1], dims[2]) 55 | 56 | if len(dims) == 4: 57 | x = x.reshape(dims[0] * dims[1], dims[2], dims[3]) 58 | 59 | x_act = self.act(x) 60 | 61 | # Retrieving the original shape format 62 | if len(dims) == 3: 63 | x_act = x_act.reshape(dims[0], dims[1], dims[2]) 64 | 65 | if len(dims) == 4: 66 | x_act = x_act.reshape(dims[0], dims[1], dims[2], dims[3]) 67 | 68 | return x_act 69 | 70 | 71 | class GumbelSoftmax(torch.nn.Module): 72 | """Samples from the Gumbel-Softmax distribution and optionally discretizes. 73 | 74 | Reference: https://arxiv.org/abs/1611.00712, https://arxiv.org/abs/1611.01144 75 | 76 | Arguments 77 | ---------- 78 | tau: float 79 | non-negative scalar temperature 80 | hard: bool 81 | if True, the returned samples will be discretized as one-hot vectors, but will be differentiated as if it is the soft sample in autograd 82 | dim: int 83 | A dimension along which softmax will be computed (default: -1). 84 | 85 | Example 86 | ------- 87 | >>> x = torch.randn((8, 40, 120)) 88 | >>> act = GumbelSoftmax(0.8, True) 89 | >>> x = act(x) 90 | """ 91 | 92 | def __init__(self, tau, hard=False, apply_log=False): 93 | super().__init__() 94 | self.tau = tau 95 | self.hard = hard 96 | self.apply_log = apply_log 97 | 98 | def forward(self, x): 99 | if self.apply_log: 100 | return torch.log(F.gumbel_softmax(x, tau=self.tau, hard=self.hard)) 101 | return F.gumbel_softmax(x, tau=self.tau, hard=self.hard) 102 | 103 | 104 | class Swish(torch.nn.Module): 105 | """ The class implements the Swish activation function from 106 | https://arxiv.org/pdf/2005.03191.pdf 107 | 108 | given input x. Swish(x) = x / (1 + exp(beta * x)) 109 | 110 | Arguments 111 | --------- 112 | beta: float 113 | Beta value. 114 | 115 | Example 116 | ------- 117 | >>> x = torch.randn((8, 40, 120)) 118 | >>> act = Swish() 119 | >>> x = act(x) 120 | """ 121 | 122 | def __init__(self, beta=1): 123 | super().__init__() 124 | self.beta = beta 125 | self.sigmoid = torch.nn.Sigmoid() 126 | 127 | def forward(self, x): 128 | """Returns the Swished input tensor. 129 | 130 | Arguments 131 | --------- 132 | x : torch.Tensor 133 | Input tensor. 134 | """ 135 | return x * self.sigmoid(self.beta * x) 136 | -------------------------------------------------------------------------------- /speechbrain/nnet/complex_networks/__init__.py: -------------------------------------------------------------------------------- 1 | """Package containing complex neural networks 2 | """ 3 | -------------------------------------------------------------------------------- /speechbrain/nnet/complex_networks/c_linear.py: -------------------------------------------------------------------------------- 1 | """Library implementing complex-valued linear transformation. 2 | 3 | Authors 4 | * Titouan Parcollet 2020 5 | """ 6 | 7 | import torch 8 | import logging 9 | from speechbrain.nnet.complex_networks.c_ops import ( 10 | affect_init, 11 | complex_init, 12 | unitary_init, 13 | complex_linear_op, 14 | check_complex_input, 15 | ) 16 | 17 | logger = logging.getLogger(__name__) 18 | 19 | 20 | class CLinear(torch.nn.Module): 21 | """This function implements a fully connected complex-valued 22 | linear layer: y = Wx + b. y, W, x and b are thus complex 23 | numbers. A complex number is written as: r + xi. A tensor of 24 | complex numbers x = [batch, 32] can be understood as 25 | [batch, 0:15] = R and [batch, 16:31] = Xi. Thus the features 26 | dimension is cut in half (must be divisible by 2). 27 | 28 | Arguments 29 | --------- 30 | n_neurons : int 31 | It is the number of output neurons (i.e, the dimensionality of the 32 | output). Please note that these are complex-valued neurons. If 256 33 | neurons are specified, the output dimension will be 512. 34 | input_shape : tuple 35 | Expected size of the input. 36 | bias : bool 37 | if True, the additive bias b is adopted. 38 | init_criterion : str , optional 39 | (glorot, he). 40 | This parameter controls the initialization criterion of the weights. 41 | It is combined with weights_init to build the initialization method of 42 | the complex-valued weights (default "glorot"). 43 | weight_init : str, optional 44 | (complex, unitary). 45 | This parameter defines the initialization procedure of the 46 | complex-valued weights (default "complex"). "complex" will generate random complex-valued 47 | weights following the init_criterion and the complex polar form. 48 | "unitary" will normalize the weights to lie on the unit circle. 49 | More details in: "Deep Complex Networks", Trabelsi C. et al. 50 | 51 | Example 52 | ------- 53 | >>> inputs = torch.rand(10, 50, 40) 54 | >>> lin = CLinear(n_neurons=100, input_shape=inputs.shape) 55 | >>> output = lin(inputs) 56 | >>> output.shape 57 | torch.Size([10, 50, 200]) 58 | """ 59 | 60 | def __init__( 61 | self, 62 | n_neurons, 63 | input_shape, 64 | bias=True, 65 | init_criterion="glorot", 66 | weight_init="complex", 67 | ): 68 | super().__init__() 69 | self.n_neurons = n_neurons 70 | self.bias = bias 71 | self.init_criterion = init_criterion 72 | self.weight_init = weight_init 73 | 74 | # When initialising with speechbrain the input_shape is an integer ! 75 | # we need to transform it into a list it works with all the question ops 76 | if isinstance(input_shape, int): 77 | input_shape = [1, input_shape] 78 | 79 | # Check the complex_valued form of the input 80 | check_complex_input(input_shape) 81 | 82 | # Computing the complex dimensionality of the input 83 | self.in_features = input_shape[-1] // 2 84 | self.out_features = self.n_neurons 85 | 86 | # Two weight matrices are created for the real and imaginary parts of 87 | # the weights. This will also allow an easier complex product. 88 | self.real_weight = torch.nn.Parameter( 89 | torch.Tensor(self.in_features, self.out_features) 90 | ) 91 | self.imag_weight = torch.nn.Parameter( 92 | torch.Tensor(self.in_features, self.out_features) 93 | ) 94 | 95 | if self.bias: 96 | self.b = torch.nn.Parameter(torch.Tensor(2 * self.out_features)) 97 | else: 98 | self.b = torch.Tensor(2 * self.out_features).requires_grad_(False) 99 | 100 | # Managing the weight initialization and bias 101 | self.winit = {"complex": complex_init, "unitary": unitary_init}[ 102 | self.weight_init 103 | ] 104 | 105 | affect_init( 106 | self.real_weight, self.imag_weight, self.winit, init_criterion 107 | ) 108 | 109 | def forward(self, x): 110 | """Returns the linear transformation of input tensor. 111 | 112 | Arguments 113 | --------- 114 | x : torch.Tensor 115 | Input to transform linearly. 116 | """ 117 | wx = complex_linear_op(x, self.real_weight, self.imag_weight, self.b) 118 | 119 | return wx 120 | -------------------------------------------------------------------------------- /speechbrain/nnet/dropout.py: -------------------------------------------------------------------------------- 1 | """Library implementing dropout. 2 | 3 | Authors 4 | * Mirco Ravanelli 2020 5 | """ 6 | import torch # noqa: F401 7 | import logging 8 | import torch.nn as nn 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | 13 | class Dropout2d(nn.Module): 14 | """This function implements dropout 2d. It randomly put zeros on 15 | entire channels. 16 | 17 | 18 | Arguments 19 | --------- 20 | dropout_rate : float 21 | It is the dropout factor (between 0 and 1). 22 | inplace : bool 23 | If True, it uses inplace operations. 24 | 25 | Example 26 | ------- 27 | >>> drop = Dropout2d(drop_rate=0.5) 28 | >>> inputs = torch.rand(10, 50, 40) 29 | >>> output=drop(inputs) 30 | >>> output.shape 31 | torch.Size([10, 50, 40]) 32 | """ 33 | 34 | def __init__( 35 | self, drop_rate, inplace=False, 36 | ): 37 | super().__init__() 38 | self.drop_rate = drop_rate 39 | self.inplace = inplace 40 | self.drop = nn.Dropout2d(p=self.drop_rate, inplace=self.inplace) 41 | 42 | def forward(self, x): 43 | """Applies dropout 2d to the input tensor. 44 | 45 | Arguments 46 | --------- 47 | x : torch.Tensor (batch, time, channel1, channel2) 48 | input to normalize. 4d tensors are expected. 49 | """ 50 | 51 | # time must be the last 52 | x = x.transpose(1, 2).transpose(2, -1) 53 | x_drop = self.drop(x) 54 | x_drop = x_drop.transpose(-1, 1).transpose(2, -1) 55 | 56 | return x_drop 57 | -------------------------------------------------------------------------------- /speechbrain/nnet/embedding.py: -------------------------------------------------------------------------------- 1 | """Library implementing embedding. 2 | 3 | Authors 4 | * Abdelwahab Heba 2020 5 | """ 6 | 7 | import torch 8 | import logging 9 | import torch.nn as nn 10 | 11 | logger = logging.getLogger(__name__) 12 | 13 | 14 | class Embedding(nn.Module): 15 | """Computes an embedding x = wx. 16 | 17 | Arguments 18 | --------- 19 | num_embeddings : int 20 | Size of the dictionary of embeddings. 21 | embedding_dim : int 22 | It is the dim of embedding (i.e, the dimensionality of the output). 23 | consider_as_one_hot : bool 24 | Create non-trainable one-hot vector. 25 | blank_id : int 26 | If consider_as_one_hot == True: consider the embedding as one_hot 27 | and use blank_index as zero one_hot vector. 28 | 29 | Example 30 | ------- 31 | >>> from speechbrain.nnet.embedding import Embedding 32 | >>> import torch 33 | >>> emb = Embedding( 34 | ... num_embeddings=40, 35 | ... embedding_dim=39, 36 | ... consider_as_one_hot=True, 37 | ... blank_id=39 38 | ... ) 39 | >>> inputs = torch.Tensor([10,5,2,0,39]).long() 40 | >>> output = emb(inputs) 41 | >>> output.shape 42 | torch.Size([5, 39]) 43 | >>> output 44 | tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 45 | 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 46 | 0., 0., 0.], 47 | [0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 48 | 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 49 | 0., 0., 0.], 50 | [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 51 | 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 52 | 0., 0., 0.], 53 | [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 54 | 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 55 | 0., 0., 0.], 56 | [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 57 | 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 58 | 0., 0., 0.]]) 59 | >>> emb = Embedding(num_embeddings=5, embedding_dim=3, consider_as_one_hot=False) 60 | >>> e = emb(torch.LongTensor([[0, 1, 2], [3, 4, 2]])) 61 | >>> e.shape 62 | torch.Size([2, 3, 3]) 63 | """ 64 | 65 | def __init__( 66 | self, 67 | num_embeddings, 68 | embedding_dim=128, 69 | consider_as_one_hot=False, 70 | blank_id=0, 71 | ): 72 | 73 | super().__init__() 74 | self.num_embeddings = num_embeddings 75 | self.consider_as_one_hot = consider_as_one_hot 76 | if self.consider_as_one_hot: 77 | self.embedding_dim = self.num_embeddings - 1 78 | else: 79 | self.embedding_dim = embedding_dim 80 | self.blank_id = blank_id 81 | 82 | if self.consider_as_one_hot: 83 | # deal with blank_id, the output should be embedding_dim-1 as we consider blank output as zeros one_hot vect 84 | # padding_idx fix the idx row to zeros 85 | self.Embedding = nn.Embedding( 86 | self.num_embeddings, 87 | self.embedding_dim, 88 | padding_idx=self.blank_id, 89 | ) 90 | one_hot = torch.eye(self.embedding_dim) 91 | if self.blank_id + 1 != self.num_embeddings: 92 | self.Embedding.weight.data[self.blank_id + 1 :] = one_hot[ 93 | self.blank_id : 94 | ] 95 | if self.blank_id != 0: 96 | self.Embedding.weight.data[: self.blank_id] = one_hot[ 97 | : self.blank_id 98 | ] 99 | self.Embedding.weight.requires_grad = False 100 | else: 101 | self.Embedding = nn.Embedding( 102 | self.num_embeddings, self.embedding_dim 103 | ) 104 | 105 | def forward(self, x): 106 | """Returns the embedding of input tensor. 107 | 108 | Arguments 109 | --------- 110 | x : torch.Tensor 111 | Input to embed. 112 | """ 113 | # pytorch embedding layer only accept long dtype 114 | return self.Embedding(x.long()) 115 | -------------------------------------------------------------------------------- /speechbrain/nnet/linear.py: -------------------------------------------------------------------------------- 1 | """Library implementing linear transformation. 2 | 3 | Authors 4 | * Mirco Ravanelli 2020 5 | * Davide Borra 2021 6 | """ 7 | 8 | import torch 9 | import logging 10 | import torch.nn as nn 11 | 12 | logger = logging.getLogger(__name__) 13 | 14 | 15 | class Linear(torch.nn.Module): 16 | """Computes a linear transformation y = wx + b. 17 | 18 | Arguments 19 | --------- 20 | n_neurons : int 21 | It is the number of output neurons (i.e, the dimensionality of the 22 | output). 23 | input_shape: tuple 24 | It is the shape of the input tensor. 25 | input_size: int 26 | Size of the input tensor. 27 | bias : bool 28 | If True, the additive bias b is adopted. 29 | combine_dims : bool 30 | If True and the input is 4D, combine 3rd and 4th dimensions of input. 31 | 32 | Example 33 | ------- 34 | >>> inputs = torch.rand(10, 50, 40) 35 | >>> lin_t = Linear(input_shape=(10, 50, 40), n_neurons=100) 36 | >>> output = lin_t(inputs) 37 | >>> output.shape 38 | torch.Size([10, 50, 100]) 39 | """ 40 | 41 | def __init__( 42 | self, 43 | n_neurons, 44 | input_shape=None, 45 | input_size=None, 46 | bias=True, 47 | combine_dims=False, 48 | ): 49 | super().__init__() 50 | self.combine_dims = combine_dims 51 | 52 | if input_shape is None and input_size is None: 53 | raise ValueError("Expected one of input_shape or input_size") 54 | 55 | if input_size is None: 56 | input_size = input_shape[-1] 57 | if len(input_shape) == 4 and self.combine_dims: 58 | input_size = input_shape[2] * input_shape[3] 59 | 60 | # Weights are initialized following pytorch approach 61 | self.w = nn.Linear(input_size, n_neurons, bias=bias) 62 | 63 | def forward(self, x): 64 | """Returns the linear transformation of input tensor. 65 | 66 | Arguments 67 | --------- 68 | x : torch.Tensor 69 | Input to transform linearly. 70 | """ 71 | if x.ndim == 4 and self.combine_dims: 72 | x = x.reshape(x.shape[0], x.shape[1], x.shape[2] * x.shape[3]) 73 | 74 | wx = self.w(x) 75 | 76 | return wx 77 | 78 | 79 | class LinearWithConstraint(Linear): 80 | """Computes a linear transformation y = wx + b with kernel max-norm constaint. 81 | This corresponds to set an upper bound for the kernel norm. 82 | 83 | Arguments 84 | --------- 85 | n_neurons : int 86 | It is the number of output neurons (i.e, the dimensionality of the 87 | output). 88 | input_shape: tuple 89 | It is the shape of the input tensor. 90 | input_size: int 91 | Size of the input tensor. 92 | bias : bool 93 | If True, the additive bias b is adopted. 94 | combine_dims : bool 95 | If True and the input is 4D, combine 3rd and 4th dimensions of input. 96 | max_norm : float 97 | Kernel max-norm 98 | 99 | Example 100 | ------- 101 | >>> inputs = torch.rand(100,) 102 | >>> max_norm = 1. 103 | >>> lin_t_contrained = LinearWithConstraint(input_size=inputs.shape[0], n_neurons=2, max_norm=max_norm) 104 | >>> output = lin_t_contrained(inputs) 105 | >>> torch.any(torch.norm(lin_t_contrained.w.weight.data, p=2, dim=0)>max_norm) 106 | tensor(False) 107 | """ 108 | 109 | def __init__(self, *args, max_norm=1, **kwargs): 110 | self.max_norm = max_norm 111 | super(LinearWithConstraint, self).__init__(*args, **kwargs) 112 | 113 | def forward(self, x): 114 | """Returns the linear transformation of input tensor. 115 | 116 | Arguments 117 | --------- 118 | x : torch.Tensor 119 | Input to transform linearly. 120 | """ 121 | self.w.weight.data = torch.renorm( 122 | self.w.weight.data, p=2, dim=0, maxnorm=self.max_norm 123 | ) 124 | return super(LinearWithConstraint, self).forward(x) 125 | -------------------------------------------------------------------------------- /speechbrain/nnet/loss/__init__.py: -------------------------------------------------------------------------------- 1 | """Package containing specific losses (transducer, stoi ...) 2 | """ 3 | -------------------------------------------------------------------------------- /speechbrain/nnet/loss/guidedattn_loss.py: -------------------------------------------------------------------------------- 1 | """The Guided Attention Loss implementation 2 | 3 | This loss can be used to speed up the training of 4 | models in which the correspondence between inputs and 5 | outputs is roughly linear, and the attention alignments 6 | are expected to be approximately diagonal, such as Grapheme-to-Phoneme 7 | and Text-to-Speech 8 | 9 | Authors 10 | * Artem Ploujnikov 2021 11 | """ 12 | 13 | import torch 14 | from torch import nn 15 | 16 | 17 | class GuidedAttentionLoss(nn.Module): 18 | """ 19 | A loss implementation that forces attention matrices to be 20 | near-diagonal, imposing progressively larger penalties for paying 21 | attention to regions far away from the diagonal). It is useful 22 | for sequence-to-sequence models in which the sequence of outputs 23 | is expected to corrsespond closely to the sequence of inputs, 24 | such as TTS or G2P 25 | 26 | https://arxiv.org/abs/1710.08969 27 | 28 | The implementation is inspired by the R9Y9 DeepVoice3 model 29 | https://github.com/r9y9/deepvoice3_pytorch 30 | 31 | It should be roughly equivalent to it; however, it has been 32 | fully vectorized. 33 | 34 | Arguments 35 | --------- 36 | sigma: 37 | the guided attention weight 38 | 39 | Example 40 | ------- 41 | NOTE: In a real scenario, the input_lengths and 42 | target_lengths would come from a data batch, 43 | whereas alignments would come from a model 44 | >>> import torch 45 | >>> from speechbrain.nnet.loss.guidedattn_loss import GuidedAttentionLoss 46 | >>> loss = GuidedAttentionLoss(sigma=0.2) 47 | >>> input_lengths = torch.tensor([2, 3]) 48 | >>> target_lengths = torch.tensor([3, 4]) 49 | >>> alignments = torch.tensor( 50 | ... [ 51 | ... [ 52 | ... [0.8, 0.2, 0.0], 53 | ... [0.4, 0.6, 0.0], 54 | ... [0.2, 0.8, 0.0], 55 | ... [0.0, 0.0, 0.0], 56 | ... ], 57 | ... [ 58 | ... [0.6, 0.2, 0.2], 59 | ... [0.1, 0.7, 0.2], 60 | ... [0.3, 0.4, 0.3], 61 | ... [0.2, 0.3, 0.5], 62 | ... ], 63 | ... ] 64 | ... ) 65 | >>> loss(alignments, input_lengths, target_lengths) 66 | tensor(0.1142) 67 | """ 68 | 69 | def __init__(self, sigma=0.2): 70 | super().__init__() 71 | self.sigma = sigma 72 | self.weight_factor = 2 * (sigma ** 2) 73 | 74 | def forward( 75 | self, 76 | attention, 77 | input_lengths, 78 | target_lengths, 79 | max_input_len=None, 80 | max_target_len=None, 81 | ): 82 | """ 83 | Computes the guided attention loss for a single batch 84 | 85 | Arguments 86 | --------- 87 | attention: torch.Tensor 88 | A padded attention/alignments matrix 89 | (batch, targets, inputs) 90 | input_lengths: torch.tensor 91 | A (batch, lengths) tensor of input lengths 92 | target_lengths: torch.tensor 93 | A (batch, lengths) tensor of target lengths 94 | max_input_len: int 95 | The maximum input length - optional, 96 | if not computed will be set to the maximum 97 | of target_lengths. Setting it explicitly 98 | might be necessary when using data parallelism 99 | max_target_len: int 100 | The maximum target length - optional, 101 | if not computed will be set to the maximum 102 | of target_lengths. Setting it explicitly 103 | might be necessary when using data parallelism 104 | 105 | 106 | Returns 107 | ------- 108 | loss: torch.Tensor 109 | A single-element tensor with the loss value 110 | """ 111 | soft_mask = self.guided_attentions( 112 | input_lengths, target_lengths, max_input_len, max_target_len 113 | ) 114 | return (attention * soft_mask.transpose(-1, -2)).mean() 115 | 116 | def guided_attentions( 117 | self, 118 | input_lengths, 119 | target_lengths, 120 | max_input_len=None, 121 | max_target_len=None, 122 | ): 123 | """ 124 | Computes guided attention matrices 125 | 126 | Arguments 127 | --------- 128 | input_lengths: torch.Tensor 129 | A tensor of input lengths 130 | target_lengths: torch.Tensor 131 | A tensor of target lengths 132 | max_input_len: int 133 | The maximum input length - optional, 134 | if not computed will be set to the maximum 135 | of target_lengths. Setting it explicitly 136 | might be necessary when using data parallelism 137 | max_target_len: int 138 | The maximum target length - optional, 139 | if not computed will be set to the maximum 140 | of target_lengths. Setting it explicitly 141 | might be necessary when using data parallelism 142 | 143 | Returns 144 | ------- 145 | soft_mask: torch.Tensor 146 | The guided attention tensor of shape (batch, max_input_len, max_target_len) 147 | """ 148 | input_lengths_broad = input_lengths.view(-1, 1, 1) 149 | target_lengths_broad = target_lengths.view(-1, 1, 1) 150 | if max_input_len is None: 151 | max_input_len = input_lengths.max() 152 | if max_target_len is None: 153 | max_target_len = target_lengths.max() 154 | input_mesh, target_mesh = torch.meshgrid( 155 | torch.arange(max_input_len).to(input_lengths.device), 156 | torch.arange(max_target_len).to(target_lengths.device), 157 | ) 158 | input_mesh, target_mesh = ( 159 | input_mesh.unsqueeze(0), 160 | target_mesh.unsqueeze(0), 161 | ) 162 | input_lengths_broad = input_lengths.view(-1, 1, 1) 163 | target_lengths_broad = target_lengths.view(-1, 1, 1) 164 | soft_mask = 1.0 - torch.exp( 165 | -( 166 | ( 167 | input_mesh / input_lengths_broad 168 | - target_mesh / target_lengths_broad 169 | ) 170 | ** 2 171 | ) 172 | / self.weight_factor 173 | ) 174 | outside = (input_mesh >= input_lengths_broad) | ( 175 | target_mesh >= target_lengths_broad 176 | ) 177 | soft_mask[outside] = 0.0 178 | return soft_mask 179 | -------------------------------------------------------------------------------- /speechbrain/nnet/loss/si_snr_loss.py: -------------------------------------------------------------------------------- 1 | """ 2 | # Authors: 3 | * Szu-Wei, Fu 2021 4 | * Mirco Ravanelli 2020 5 | * Samuele Cornell 2020 6 | * Hwidong Na 2020 7 | * Yan Gao 2020 8 | * Titouan Parcollet 2020 9 | """ 10 | 11 | import torch 12 | import numpy as np 13 | 14 | smallVal = np.finfo("float").eps # To avoid divide by zero 15 | 16 | 17 | def si_snr_loss(y_pred_batch, y_true_batch, lens, reduction="mean"): 18 | """Compute the si_snr score and return -1 * that score. 19 | 20 | This function can be used as a loss function for training 21 | with SGD-based updates. 22 | 23 | Arguments 24 | --------- 25 | y_pred_batch : torch.Tensor 26 | The degraded (enhanced) waveforms. 27 | y_true_batch : torch.Tensor 28 | The clean (reference) waveforms. 29 | lens : torch.Tensor 30 | The relative lengths of the waveforms within the batch. 31 | reduction : str 32 | The type of reduction ("mean" or "batch") to use. 33 | 34 | Example 35 | ------- 36 | """ 37 | 38 | y_pred_batch = torch.squeeze(y_pred_batch, dim=-1) 39 | y_true_batch = torch.squeeze(y_true_batch, dim=-1) 40 | 41 | batch_size = y_pred_batch.shape[0] 42 | SI_SNR = torch.zeros(batch_size) 43 | 44 | for i in range(0, batch_size): # Run over mini-batches 45 | s_target = y_true_batch[i, 0 : int(lens[i] * y_pred_batch.shape[1])] 46 | s_estimate = y_pred_batch[i, 0 : int(lens[i] * y_pred_batch.shape[1])] 47 | 48 | # s_target = s / ||s||^2 49 | dot = torch.sum(s_estimate * s_target, dim=0, keepdim=True) 50 | s_target_energy = ( 51 | torch.sum(s_target ** 2, dim=0, keepdim=True) + smallVal 52 | ) 53 | proj = dot * s_target / s_target_energy 54 | 55 | # e_noise = s' - s_target 56 | e_noise = s_estimate - proj 57 | 58 | # SI-SNR = 10 * log_10(||s_target||^2 / ||e_noise||^2) 59 | si_snr_beforelog = torch.sum(proj ** 2, dim=0) / ( 60 | torch.sum(e_noise ** 2, dim=0) + smallVal 61 | ) 62 | SI_SNR[i] = 10 * torch.log10(si_snr_beforelog + smallVal) 63 | 64 | if reduction == "mean": 65 | return -SI_SNR.mean() 66 | 67 | return -SI_SNR 68 | -------------------------------------------------------------------------------- /speechbrain/nnet/quaternion_networks/__init__.py: -------------------------------------------------------------------------------- 1 | """Package containing quaternion neural networks 2 | """ 3 | -------------------------------------------------------------------------------- /speechbrain/nnet/quaternion_networks/q_normalization.py: -------------------------------------------------------------------------------- 1 | """Library implementing quaternion-valued normalization. 2 | 3 | Authors 4 | * Titouan Parcollet 2020 5 | """ 6 | 7 | import torch 8 | from torch.nn import Parameter 9 | 10 | 11 | class QBatchNorm(torch.nn.Module): 12 | """This class implements the simplest form of a quaternion batchnorm as 13 | described in : "Quaternion Convolutional Neural Network for 14 | Color Image Classification and Forensics", Qilin Y. et al. 15 | 16 | Arguments 17 | --------- 18 | input_size : int 19 | Expected size of the dimension to be normalized. 20 | dim : int, optional 21 | It defines the axis that should be normalized. It usually correspond to 22 | the channel dimension (default -1). 23 | gamma_init : float, optional 24 | First value of gamma to be used (mean) (default 1.0). 25 | beta_param : bool, optional 26 | When set to True the beta parameter of the BN is applied (default True). 27 | momentum : float, optional 28 | It defines the momentum as for the real-valued batch-normalization (default 0.1). 29 | eps : float, optional 30 | Term used to stabilize operation (default 1e-4). 31 | track_running_stats : bool, optional 32 | Equivalent to the real-valued batchnormalization parameter. 33 | When True, stats are tracked. When False, solely statistics computed 34 | over the batch are used (default True). 35 | 36 | 37 | Example 38 | ------- 39 | >>> inp_tensor = torch.rand([10, 40]) 40 | >>> QBN = QBatchNorm(input_size=40) 41 | >>> out_tensor = QBN(inp_tensor) 42 | >>> out_tensor.shape 43 | torch.Size([10, 40]) 44 | 45 | """ 46 | 47 | def __init__( 48 | self, 49 | input_size, 50 | dim=-1, 51 | gamma_init=1.0, 52 | beta_param=True, 53 | momentum=0.1, 54 | eps=1e-4, 55 | track_running_stats=True, 56 | ): 57 | super(QBatchNorm, self).__init__() 58 | 59 | self.num_features = input_size // 4 60 | self.gamma_init = gamma_init 61 | self.beta_param = beta_param 62 | self.momentum = momentum 63 | self.dim = dim 64 | self.eps = eps 65 | self.track_running_stats = track_running_stats 66 | 67 | self.gamma = Parameter(torch.full([self.num_features], self.gamma_init)) 68 | self.beta = Parameter( 69 | torch.zeros(self.num_features * 4), requires_grad=self.beta_param 70 | ) 71 | 72 | # instantiate moving statistics 73 | if track_running_stats: 74 | self.register_buffer( 75 | "running_mean", torch.zeros(self.num_features * 4) 76 | ) 77 | self.register_buffer("running_var", torch.ones(self.num_features)) 78 | self.register_buffer( 79 | "num_batches_tracked", torch.tensor(0, dtype=torch.long) 80 | ) 81 | else: 82 | self.register_parameter("running_mean", None) 83 | self.register_parameter("running_var", None) 84 | self.register_parameter("num_batches_tracked", None) 85 | 86 | def forward(self, input): 87 | """Returns the normalized input tensor. 88 | 89 | Arguments 90 | --------- 91 | input : torch.Tensor (batch, time, [channels]) 92 | Input to normalize. It can be 2d, 3d, 4d. 93 | """ 94 | 95 | exponential_average_factor = 0.0 96 | 97 | # Entering training mode 98 | if self.training: 99 | if self.num_batches_tracked is not None: 100 | self.num_batches_tracked = self.num_batches_tracked + 1 101 | 102 | if self.momentum is None: # use cumulative moving average 103 | exponential_average_factor = ( 104 | 1.0 / self.num_batches_tracked.item() 105 | ) 106 | else: # use exponential moving average 107 | exponential_average_factor = self.momentum 108 | 109 | # Get mean along batch axis 110 | mu = torch.mean(input, dim=0) 111 | mu_r, mu_i, mu_j, mu_k = torch.chunk(mu, 4, dim=self.dim) 112 | 113 | # Get variance along batch axis 114 | delta = input - mu 115 | delta_r, delta_i, delta_j, delta_k = torch.chunk( 116 | delta, 4, dim=self.dim 117 | ) 118 | quat_variance = torch.mean( 119 | (delta_r ** 2 + delta_i ** 2 + delta_j ** 2 + delta_k ** 2), 120 | dim=0, 121 | ) 122 | 123 | denominator = torch.sqrt(quat_variance + self.eps) 124 | 125 | # x - mu / sqrt(var + e) 126 | out = input / torch.cat( 127 | [denominator, denominator, denominator, denominator], 128 | dim=self.dim, 129 | ) 130 | 131 | # Update the running stats 132 | if self.track_running_stats: 133 | self.running_mean = ( 134 | 1 - exponential_average_factor 135 | ) * self.running_mean + exponential_average_factor * mu.view( 136 | self.running_mean.size() 137 | ) 138 | 139 | self.running_var = ( 140 | 1 - exponential_average_factor 141 | ) * self.running_var + exponential_average_factor * quat_variance.view( 142 | self.running_var.size() 143 | ) 144 | else: 145 | q_var = torch.cat( 146 | [ 147 | self.running_var, 148 | self.running_var, 149 | self.running_var, 150 | self.running_var, 151 | ], 152 | dim=self.dim, 153 | ) 154 | out = (input - self.running_mean) / q_var 155 | 156 | # lambda * (x - mu / sqrt(var + e)) + beta 157 | 158 | q_gamma = torch.cat( 159 | [self.gamma, self.gamma, self.gamma, self.gamma], dim=self.dim 160 | ) 161 | out = (q_gamma * out) + self.beta 162 | 163 | return out 164 | -------------------------------------------------------------------------------- /speechbrain/nnet/transducer/__init__.py: -------------------------------------------------------------------------------- 1 | """Package containing transducer neural networks 2 | """ 3 | -------------------------------------------------------------------------------- /speechbrain/nnet/transducer/transducer_joint.py: -------------------------------------------------------------------------------- 1 | """Library implementing transducer_joint. 2 | 3 | Author 4 | Abdelwahab HEBA 2020 5 | """ 6 | 7 | import torch 8 | import logging 9 | import torch.nn as nn 10 | 11 | logger = logging.getLogger(__name__) 12 | 13 | 14 | class Transducer_joint(nn.Module): 15 | """Computes joint tensor between Transcription network (TN) & Prediction network (PN) 16 | 17 | Arguments 18 | --------- 19 | joint_network : torch.class (neural network modules) 20 | if joint == "concat", we call this network after the concatenation of TN and PN 21 | if None, we don't use this network. 22 | joint : joint the two tensors by ("sum",or "concat") option. 23 | nonlinearity : torch class 24 | Activation function used after the joint between TN and PN 25 | Type of nonlinearity (tanh, relu). 26 | 27 | Example 28 | ------- 29 | >>> from speechbrain.nnet.transducer.transducer_joint import Transducer_joint 30 | >>> from speechbrain.nnet.linear import Linear 31 | >>> input_TN = torch.rand(8, 200, 1, 40) 32 | >>> input_PN = torch.rand(8, 1, 12, 40) 33 | >>> joint_network = Linear(input_size=80, n_neurons=80) 34 | >>> TJoint = Transducer_joint(joint_network, joint="concat") 35 | >>> output = TJoint(input_TN, input_PN) 36 | >>> output.shape 37 | torch.Size([8, 200, 12, 80]) 38 | """ 39 | 40 | def __init__( 41 | self, joint_network=None, joint="sum", nonlinearity=torch.nn.LeakyReLU 42 | ): 43 | super().__init__() 44 | self.joint_network = joint_network 45 | self.joint = joint 46 | self.nonlinearity = nonlinearity() 47 | 48 | def init_params(self, first_input): 49 | """ 50 | Arguments 51 | --------- 52 | first_input : tensor 53 | A first input used for initializing the parameters. 54 | """ 55 | self.joint_network(first_input) 56 | 57 | def forward(self, input_TN, input_PN): 58 | """Returns the fusion of inputs tensors. 59 | 60 | Arguments 61 | --------- 62 | input_TN : torch.Tensor 63 | Input from Transcription Network. 64 | 65 | input_PN : torch.Tensor 66 | Input from Prediction Network. 67 | """ 68 | if len(input_TN.shape) != len(input_PN.shape): 69 | raise ValueError("Arg 1 and 2 must be have same size") 70 | if not (len(input_TN.shape) != 4 or len(input_TN.shape) != 1): 71 | raise ValueError("Tensors 1 and 2 must have dim=1 or dim=4") 72 | 73 | if self.joint == "sum": 74 | joint = input_TN + input_PN 75 | 76 | if self.joint == "concat": 77 | # For training 78 | if len(input_TN.shape) == 4: 79 | dim = len(input_TN.shape) - 1 80 | xs = input_TN 81 | ymat = input_PN 82 | sz = [ 83 | max(i, j) for i, j in zip(xs.size()[:-1], ymat.size()[:-1]) 84 | ] 85 | xs = xs.expand(torch.Size(sz + [xs.shape[-1]])) 86 | ymat = ymat.expand(torch.Size(sz + [ymat.shape[-1]])) 87 | joint = torch.cat((xs, ymat), dim=dim) 88 | # For evaluation 89 | elif len(input_TN.shape) == 1: 90 | joint = torch.cat((input_TN, input_PN), dim=0) 91 | 92 | if self.joint_network is not None: 93 | joint = self.joint_network(joint) 94 | 95 | return self.nonlinearity(joint) 96 | -------------------------------------------------------------------------------- /speechbrain/pretrained/__init__.py: -------------------------------------------------------------------------------- 1 | """Pretrained models""" 2 | 3 | from .interfaces import * # noqa 4 | -------------------------------------------------------------------------------- /speechbrain/pretrained/fetching.py: -------------------------------------------------------------------------------- 1 | """Downloads or otherwise fetches pretrained models 2 | 3 | Authors: 4 | * Aku Rouhe 2021 5 | * Samuele Cornell 2021 6 | """ 7 | import urllib.request 8 | import urllib.error 9 | import pathlib 10 | import logging 11 | import huggingface_hub 12 | from requests.exceptions import HTTPError 13 | 14 | logger = logging.getLogger(__name__) 15 | 16 | 17 | def _missing_ok_unlink(path): 18 | # missing_ok=True was added to Path.unlink() in Python 3.8 19 | # This does the same. 20 | try: 21 | path.unlink() 22 | except FileNotFoundError: 23 | pass 24 | 25 | 26 | def fetch( 27 | filename, 28 | source, 29 | savedir="./pretrained_model_checkpoints", 30 | overwrite=False, 31 | save_filename=None, 32 | use_auth_token=False, 33 | ): 34 | """Ensures you have a local copy of the file, returns its path 35 | 36 | In case the source is an external location, downloads the file. In case 37 | the source is already accessible on the filesystem, creates a symlink in 38 | the savedir. Thus, the side effects of this function always look similar: 39 | savedir/save_filename can be used to access the file. And save_filename 40 | defaults to the filename arg. 41 | 42 | Arguments 43 | --------- 44 | filename : str 45 | Name of the file including extensions. 46 | source : str 47 | Where to look for the file. This is interpreted in special ways: 48 | First, if the source begins with "http://" or "https://", it is 49 | interpreted as a web address and the file is downloaded. 50 | Second, if the source is a valid directory path, a symlink is 51 | created to the file. 52 | Otherwise, the source is interpreted as a Huggingface model hub ID, and 53 | the file is downloaded from there. 54 | savedir : str 55 | Path where to save downloads/symlinks. 56 | overwrite : bool 57 | If True, always overwrite existing savedir/filename file and download 58 | or recreate the link. If False (as by default), if savedir/filename 59 | exists, assume it is correct and don't download/relink. Note that 60 | Huggingface local cache is always used - with overwrite=True we just 61 | relink from the local cache. 62 | save_filename : str 63 | The filename to use for saving this file. Defaults to filename if not 64 | given. 65 | use_auth_token : bool (default: False) 66 | If true Hugginface's auth_token will be used to load private models from the HuggingFace Hub, 67 | default is False because majority of models are public. 68 | Returns 69 | ------- 70 | pathlib.Path 71 | Path to file on local file system. 72 | 73 | Raises 74 | ------ 75 | ValueError 76 | If file is not found 77 | """ 78 | if save_filename is None: 79 | save_filename = filename 80 | savedir = pathlib.Path(savedir) 81 | savedir.mkdir(parents=True, exist_ok=True) 82 | sourcefile = f"{source}/{filename}" 83 | destination = savedir / save_filename 84 | if destination.exists() and not overwrite: 85 | MSG = f"Fetch {filename}: Using existing file/symlink in {str(destination)}." 86 | logger.info(MSG) 87 | return destination 88 | if str(source).startswith("http:") or str(source).startswith("https:"): 89 | # Interpret source as web address. 90 | MSG = ( 91 | f"Fetch {filename}: Downloading from normal URL {str(sourcefile)}." 92 | ) 93 | logger.info(MSG) 94 | # Download 95 | try: 96 | urllib.request.urlretrieve(sourcefile, destination) 97 | except urllib.error.URLError: 98 | raise ValueError( 99 | f"Interpreted {source} as web address, but could not download." 100 | ) 101 | elif pathlib.Path(source).is_dir(): 102 | # Interpret source as local directory path 103 | # Just symlink 104 | sourcepath = pathlib.Path(sourcefile).absolute() 105 | MSG = f"Fetch {filename}: Linking to local file in {str(sourcepath)}." 106 | logger.info(MSG) 107 | _missing_ok_unlink(destination) 108 | destination.symlink_to(sourcepath) 109 | else: 110 | # Interpret source as huggingface hub ID 111 | # Use huggingface hub's fancy cached download. 112 | MSG = f"Fetch {filename}: Delegating to Huggingface hub, source {str(source)}." 113 | logger.info(MSG) 114 | url = huggingface_hub.hf_hub_url(source, filename) 115 | try: 116 | fetched_file = huggingface_hub.cached_download(url, use_auth_token) 117 | except HTTPError as e: 118 | if e.response.status_code == 404: 119 | raise ValueError("File not found on HF hub") 120 | else: 121 | raise 122 | # Huggingface hub downloads to etag filename, symlink to the expected one: 123 | sourcepath = pathlib.Path(fetched_file).absolute() 124 | _missing_ok_unlink(destination) 125 | destination.symlink_to(sourcepath) 126 | return destination 127 | -------------------------------------------------------------------------------- /speechbrain/processing/NMF.py: -------------------------------------------------------------------------------- 1 | """Non-negative matrix factorization 2 | 3 | Authors 4 | * Cem Subakan 5 | """ 6 | import torch 7 | from speechbrain.processing.features import spectral_magnitude 8 | import speechbrain.processing.features as spf 9 | 10 | 11 | def spectral_phase(stft, power=2, log=False): 12 | """Returns the phase of a complex spectrogram. 13 | 14 | Arguments 15 | --------- 16 | stft : torch.Tensor 17 | A tensor, output from the stft function. 18 | 19 | Example 20 | ------- 21 | >>> BS, nfft, T = 10, 20, 300 22 | >>> X_stft = torch.randn(BS, nfft//2 + 1, T, 2) 23 | >>> phase_mix = spectral_phase(X_stft) 24 | """ 25 | 26 | phase = torch.atan2(stft[:, :, :, 1], stft[:, :, :, 0]) 27 | 28 | return phase 29 | 30 | 31 | def NMF_separate_spectra(Whats, Xmix): 32 | """This function separates the mixture signals, given NMF template matrices. 33 | 34 | Arguments 35 | --------- 36 | Whats : list 37 | This list contains the list [W1, W2], where W1 W2 are respectively 38 | the NMF template matrices that correspond to source1 and source2. 39 | W1, W2 are of size [nfft/2 + 1, K], where nfft is the fft size for STFT, 40 | and K is the number of vectors (templates) in W. 41 | Xmix : torch.tensor 42 | This is the magnitude spectra for the mixtures. 43 | The size is [BS x T x nfft//2 + 1] where, 44 | BS = batch size, nfft = fft size, T = number of time steps in the spectra. 45 | 46 | Outputs 47 | ------- 48 | X1hat : Separated spectrum for source1 49 | Size = [BS x (nfft/2 +1) x T] where, 50 | BS = batch size, nfft = fft size, T = number of time steps in the spectra. 51 | X2hat : Separated Spectrum for source2 52 | The size definitions are the same as above. 53 | 54 | Example 55 | -------- 56 | >>> BS, nfft, T = 4, 20, 400 57 | >>> K1, K2 = 10, 10 58 | >>> W1hat = torch.randn(nfft//2 + 1, K1) 59 | >>> W2hat = torch.randn(nfft//2 + 1, K2) 60 | >>> Whats = [W1hat, W2hat] 61 | >>> Xmix = torch.randn(BS, T, nfft//2 + 1) 62 | >>> X1hat, X2hat = NMF_separate_spectra(Whats, Xmix) 63 | """ 64 | 65 | W1, W2 = Whats 66 | 67 | nmixtures = Xmix.shape[0] 68 | Xmix = Xmix.permute(0, 2, 1).reshape(-1, Xmix.size(-1)).t() 69 | n = Xmix.shape[1] 70 | eps = 1e-20 71 | 72 | # Normalize input 73 | g = Xmix.sum(dim=0) + eps 74 | z = Xmix / g 75 | 76 | # initialize 77 | w = torch.cat([W1, W2], dim=1) 78 | K = w.size(1) 79 | K1 = W1.size(1) 80 | 81 | h = 0.1 * torch.rand(K, n) 82 | h /= torch.sum(h, dim=0) + eps 83 | 84 | for ep in range(1000): 85 | v = z / (torch.matmul(w, h) + eps) 86 | 87 | nh = h * torch.matmul(w.t(), v) 88 | h = nh / (torch.sum(nh, dim=0) + eps) 89 | 90 | h *= g 91 | Xhat1 = torch.matmul(w[:, :K1], h[:K1, :]) 92 | Xhat1 = torch.split(Xhat1.unsqueeze(0), Xhat1.size(1) // nmixtures, dim=2) 93 | Xhat1 = torch.cat(Xhat1, dim=0) 94 | 95 | Xhat2 = torch.matmul(w[:, K1:], h[K1:, :]) 96 | Xhat2 = torch.split(Xhat2.unsqueeze(0), Xhat2.size(1) // nmixtures, dim=2) 97 | Xhat2 = torch.cat(Xhat2, dim=0) 98 | 99 | return Xhat1, Xhat2 100 | 101 | 102 | def reconstruct_results( 103 | X1hat, X2hat, X_stft, sample_rate, win_length, hop_length, 104 | ): 105 | 106 | """This function reconstructs the separated spectra into waveforms. 107 | 108 | Arguments 109 | --------- 110 | Xhat1 : torch.tensor 111 | The separated spectrum for source 1 of size [BS, nfft/2 + 1, T], 112 | where, BS = batch size, nfft = fft size, T = length of the spectra. 113 | Xhat2 : torch.tensor 114 | The separated spectrum for source 2 of size [BS, nfft/2 + 1, T]. 115 | The size definitions are the same as Xhat1. 116 | X_stft : torch.tensor 117 | This is the magnitude spectra for the mixtures. 118 | The size is [BS x nfft//2 + 1 x T x 2] where, 119 | BS = batch size, nfft = fft size, T = number of time steps in the spectra. 120 | The last dimension is to represent complex numbers. 121 | sample_rate : int 122 | The sampling rate (in Hz) in which we would like to save the results. 123 | win_length : int 124 | The length of stft windows (in ms). 125 | hop_length : int 126 | The length with which we shift the STFT windows (in ms). 127 | 128 | Returns 129 | ------- 130 | x1hats : list 131 | List of waveforms for source 1. 132 | x2hats : list 133 | List of waveforms for source 2. 134 | 135 | Example 136 | ------- 137 | >>> BS, nfft, T = 10, 512, 16000 138 | >>> sample_rate, win_length, hop_length = 16000, 25, 10 139 | >>> X1hat = torch.randn(BS, nfft//2 + 1, T) 140 | >>> X2hat = torch.randn(BS, nfft//2 + 1, T) 141 | >>> X_stft = torch.randn(BS, nfft//2 + 1, T, 2) 142 | >>> x1hats, x2hats = reconstruct_results(X1hat, X2hat, X_stft, sample_rate, win_length, hop_length) 143 | """ 144 | 145 | ISTFT = spf.ISTFT( 146 | sample_rate=sample_rate, win_length=win_length, hop_length=hop_length 147 | ) 148 | 149 | phase_mix = spectral_phase(X_stft) 150 | mag_mix = spectral_magnitude(X_stft, power=2) 151 | 152 | x1hats, x2hats = [], [] 153 | eps = 1e-25 154 | for i in range(X1hat.shape[0]): 155 | X1hat_stft = ( 156 | (X1hat[i] / (eps + X1hat[i] + X2hat[i])).unsqueeze(-1) 157 | * mag_mix[i].unsqueeze(-1) 158 | * torch.cat( 159 | [ 160 | torch.cos(phase_mix[i].unsqueeze(-1)), 161 | torch.sin(phase_mix[i].unsqueeze(-1)), 162 | ], 163 | dim=-1, 164 | ) 165 | ) 166 | 167 | X2hat_stft = ( 168 | (X2hat[i] / (eps + X1hat[i] + X2hat[i])).unsqueeze(-1) 169 | * mag_mix[i].unsqueeze(-1) 170 | * torch.cat( 171 | [ 172 | torch.cos(phase_mix[i].unsqueeze(-1)), 173 | torch.sin(phase_mix[i].unsqueeze(-1)), 174 | ], 175 | dim=-1, 176 | ) 177 | ) 178 | X1hat_stft = X1hat_stft.unsqueeze(0).permute(0, 2, 1, 3) 179 | X2hat_stft = X2hat_stft.unsqueeze(0).permute(0, 2, 1, 3) 180 | shat1 = ISTFT(X1hat_stft) 181 | shat2 = ISTFT(X2hat_stft) 182 | 183 | div_factor = 10 184 | x1 = shat1 / (div_factor * shat1.std()) 185 | x2 = shat2 / (div_factor * shat2.std()) 186 | 187 | x1hats.append(x1) 188 | x2hats.append(x2) 189 | return x1hats, x2hats 190 | -------------------------------------------------------------------------------- /speechbrain/processing/__init__.py: -------------------------------------------------------------------------------- 1 | """ Package containing various techniques of speech processing 2 | """ 3 | -------------------------------------------------------------------------------- /speechbrain/tokenizers/__init__.py: -------------------------------------------------------------------------------- 1 | """ Package defining the SentencePiece tokenizer 2 | """ 3 | -------------------------------------------------------------------------------- /speechbrain/utils/Accuracy.py: -------------------------------------------------------------------------------- 1 | """Calculate accuracy. 2 | 3 | Authors 4 | * Jianyuan Zhong 2020 5 | """ 6 | import torch 7 | from speechbrain.dataio.dataio import length_to_mask 8 | 9 | 10 | def Accuracy(log_probabilities, targets, length=None): 11 | """Calculates the accuracy for predicted log probabilities and targets in a batch. 12 | 13 | Arguments 14 | ---------- 15 | log_probabilities : tensor 16 | Predicted log probabilities (batch_size, time, feature). 17 | targets : tensor 18 | Target (batch_size, time). 19 | length : tensor 20 | Length of target (batch_size,). 21 | 22 | Example 23 | ------- 24 | >>> probs = torch.tensor([[0.9, 0.1], [0.1, 0.9], [0.8, 0.2]]).unsqueeze(0) 25 | >>> acc = Accuracy(torch.log(probs), torch.tensor([1, 1, 0]).unsqueeze(0), torch.tensor([2/3])) 26 | >>> print(acc) 27 | (1.0, 2.0) 28 | """ 29 | if length is not None: 30 | mask = length_to_mask( 31 | length * targets.shape[1], max_len=targets.shape[1], 32 | ).bool() 33 | if len(targets.shape) == 3: 34 | mask = mask.unsqueeze(2).repeat(1, 1, targets.shape[2]) 35 | 36 | padded_pred = log_probabilities.argmax(-1) 37 | 38 | if length is not None: 39 | numerator = torch.sum( 40 | padded_pred.masked_select(mask) == targets.masked_select(mask) 41 | ) 42 | denominator = torch.sum(mask) 43 | else: 44 | numerator = torch.sum(padded_pred == targets) 45 | denominator = targets.shape[0] * targets.shape[1] # (batch_size * time) 46 | return float(numerator), float(denominator) 47 | 48 | 49 | class AccuracyStats: 50 | """Module for calculate the overall one-step-forward prediction accuracy. 51 | 52 | Example 53 | ------- 54 | >>> probs = torch.tensor([[0.9, 0.1], [0.1, 0.9], [0.8, 0.2]]).unsqueeze(0) 55 | >>> stats = AccuracyStats() 56 | >>> stats.append(torch.log(probs), torch.tensor([1, 1, 0]).unsqueeze(0), torch.tensor([2/3])) 57 | >>> acc = stats.summarize() 58 | >>> print(acc) 59 | 0.5 60 | """ 61 | 62 | def __init__(self): 63 | self.correct = 0 64 | self.total = 0 65 | 66 | def append(self, log_probabilities, targets, length=None): 67 | """This function is for updating the stats according to the prediction 68 | and target in the current batch. 69 | 70 | Arguments 71 | ---------- 72 | log_probabilities : tensor 73 | Predicted log probabilities (batch_size, time, feature). 74 | targets : tensor 75 | Target (batch_size, time). 76 | length: tensor 77 | Length of target (batch_size,). 78 | """ 79 | numerator, denominator = Accuracy(log_probabilities, targets, length) 80 | self.correct += numerator 81 | self.total += denominator 82 | 83 | def summarize(self): 84 | return self.correct / self.total -------------------------------------------------------------------------------- /speechbrain/utils/DER.py: -------------------------------------------------------------------------------- 1 | """Calculates Diarization Error Rate (DER) which is the sum of Missed Speaker (MS), 2 | False Alarm (FA), and Speaker Error Rate (SER) using md-eval-22.pl from NIST RT Evaluation. 3 | 4 | Authors 5 | * Neville Ryant 2018 6 | * Nauman Dawalatabad 2020 7 | 8 | Credits 9 | This code is adapted from https://github.com/nryant/dscore 10 | """ 11 | 12 | import os 13 | import re 14 | import subprocess 15 | import numpy as np 16 | 17 | FILE_IDS = re.compile(r"(?<=Speaker Diarization for).+(?=\*\*\*)") 18 | SCORED_SPEAKER_TIME = re.compile(r"(?<=SCORED SPEAKER TIME =)[\d.]+") 19 | MISS_SPEAKER_TIME = re.compile(r"(?<=MISSED SPEAKER TIME =)[\d.]+") 20 | FA_SPEAKER_TIME = re.compile(r"(?<=FALARM SPEAKER TIME =)[\d.]+") 21 | ERROR_SPEAKER_TIME = re.compile(r"(?<=SPEAKER ERROR TIME =)[\d.]+") 22 | 23 | 24 | def rectify(arr): 25 | """Corrects corner cases and converts scores into percentage. 26 | """ 27 | 28 | # Numerator and denominator both 0. 29 | arr[np.isnan(arr)] = 0 30 | 31 | # Numerator > 0, but denominator = 0. 32 | arr[np.isinf(arr)] = 1 33 | arr *= 100.0 34 | 35 | return arr 36 | 37 | 38 | def DER( 39 | ref_rttm, 40 | sys_rttm, 41 | ignore_overlap=False, 42 | collar=0.25, 43 | individual_file_scores=False, 44 | ): 45 | """Computes Missed Speaker percentage (MS), False Alarm (FA), 46 | Speaker Error Rate (SER), and Diarization Error Rate (DER). 47 | 48 | Arguments 49 | --------- 50 | ref_rttm : str 51 | The path of reference/groundtruth RTTM file. 52 | sys_rttm : str 53 | The path of the system generated RTTM file. 54 | individual_file : bool 55 | If True, returns scores for each file in order. 56 | collar : float 57 | Forgiveness collar. 58 | ignore_overlap : bool 59 | If True, ignores overlapping speech during evaluation. 60 | 61 | Returns 62 | ------- 63 | MS : float array 64 | Missed Speech. 65 | FA : float array 66 | False Alarms. 67 | SER : float array 68 | Speaker Error Rates. 69 | DER : float array 70 | Diarization Error Rates. 71 | 72 | Example 73 | ------- 74 | >>> import pytest 75 | >>> pytest.skip('Skipping because of Perl dependency') 76 | >>> ref_rttm = "../../samples/rttm_samples/ref_rttm/ES2014c.rttm" 77 | >>> sys_rttm = "../../samples/rttm_samples/sys_rttm/ES2014c.rttm" 78 | >>> ignore_overlap = True 79 | >>> collar = 0.25 80 | >>> individual_file_scores = True 81 | >>> Scores = DER(ref_rttm, sys_rttm, ignore_overlap, collar, individual_file_scores) 82 | >>> print (Scores) 83 | (array([0., 0.]), array([0., 0.]), array([7.16923618, 7.16923618]), array([7.16923618, 7.16923618])) 84 | """ 85 | 86 | curr = os.path.abspath(os.path.dirname(__file__)) 87 | mdEval = os.path.join(curr, "../../tools/der_eval/md-eval.pl") 88 | 89 | cmd = [ 90 | mdEval, 91 | "-af", 92 | "-r", 93 | ref_rttm, 94 | "-s", 95 | sys_rttm, 96 | "-c", 97 | str(collar), 98 | ] 99 | if ignore_overlap: 100 | cmd.append("-1") 101 | 102 | try: 103 | stdout = subprocess.check_output(cmd, stderr=subprocess.STDOUT) 104 | 105 | except subprocess.CalledProcessError as ex: 106 | stdout = ex.output 107 | 108 | else: 109 | stdout = stdout.decode("utf-8") 110 | 111 | # Get all recording IDs 112 | file_ids = [m.strip() for m in FILE_IDS.findall(stdout)] 113 | file_ids = [ 114 | file_id[2:] if file_id.startswith("f=") else file_id 115 | for file_id in file_ids 116 | ] 117 | 118 | scored_speaker_times = np.array( 119 | [float(m) for m in SCORED_SPEAKER_TIME.findall(stdout)] 120 | ) 121 | 122 | miss_speaker_times = np.array( 123 | [float(m) for m in MISS_SPEAKER_TIME.findall(stdout)] 124 | ) 125 | 126 | fa_speaker_times = np.array( 127 | [float(m) for m in FA_SPEAKER_TIME.findall(stdout)] 128 | ) 129 | 130 | error_speaker_times = np.array( 131 | [float(m) for m in ERROR_SPEAKER_TIME.findall(stdout)] 132 | ) 133 | 134 | with np.errstate(invalid="ignore", divide="ignore"): 135 | tot_error_times = ( 136 | miss_speaker_times + fa_speaker_times + error_speaker_times 137 | ) 138 | miss_speaker_frac = miss_speaker_times / scored_speaker_times 139 | fa_speaker_frac = fa_speaker_times / scored_speaker_times 140 | sers_frac = error_speaker_times / scored_speaker_times 141 | ders_frac = tot_error_times / scored_speaker_times 142 | 143 | # Values in percentage of scored_speaker_time 144 | miss_speaker = rectify(miss_speaker_frac) 145 | fa_speaker = rectify(fa_speaker_frac) 146 | sers = rectify(sers_frac) 147 | ders = rectify(ders_frac) 148 | 149 | if individual_file_scores: 150 | return miss_speaker, fa_speaker, sers, ders 151 | else: 152 | return miss_speaker[-1], fa_speaker[-1], sers[-1], ders[-1] 153 | -------------------------------------------------------------------------------- /speechbrain/utils/__init__.py: -------------------------------------------------------------------------------- 1 | """ Package containing various tools (accuracy, checkpoints ...) 2 | """ 3 | import os 4 | 5 | __all__ = [] 6 | for filename in os.listdir(os.path.dirname(__file__)): 7 | filename = os.path.basename(filename) 8 | if filename.endswith(".py") and not filename.startswith("__"): 9 | __all__.append(filename[:-3]) 10 | 11 | from . import * # noqa 12 | -------------------------------------------------------------------------------- /speechbrain/utils/bleu.py: -------------------------------------------------------------------------------- 1 | from speechbrain.utils.metric_stats import MetricStats 2 | 3 | 4 | def merge_words(sequences): 5 | """Merge successive words into phrase, putting space between each word 6 | 7 | Arguments 8 | --------- 9 | sequences : list 10 | Each item contains a list, and this list contains a word sequence. 11 | Returns 12 | ------- 13 | The list contains phrase sequences. 14 | """ 15 | results = [] 16 | for seq in sequences: 17 | words = " ".join(seq) 18 | results.append(words) 19 | return results 20 | 21 | 22 | class BLEUStats(MetricStats): 23 | """A class for tracking BLEU (https://www.aclweb.org/anthology/P02-1040.pdf). 24 | Arguments 25 | --------- 26 | merge_words: bool 27 | Whether to merge the successive words to create sentences. 28 | Example 29 | ------- 30 | >>> bleu = BLEUStats() 31 | >>> i2l = {0: 'a', 1: 'b'} 32 | >>> bleu.append( 33 | ... ids=['utterance1'], 34 | ... predict=[[0, 1, 1]], 35 | ... targets=[[[0, 1, 0]], [[0, 1, 1]], [[1, 1, 0]]], 36 | ... ind2lab=lambda batch: [[i2l[int(x)] for x in seq] for seq in batch], 37 | ... ) 38 | >>> stats = bleu.summarize() 39 | >>> stats['BLEU'] 40 | 0.0 41 | """ 42 | 43 | def __init__( 44 | self, lang="en", merge_words=True, 45 | ): 46 | 47 | self.clear() 48 | self.merge_words = merge_words 49 | 50 | self.predicts = [] 51 | self.targets = None 52 | 53 | def append( 54 | self, ids, predict, targets, ind2lab=None, 55 | ): 56 | """Add stats to the relevant containers. 57 | * See MetricStats.append() 58 | Arguments 59 | --------- 60 | ids : list 61 | List of ids corresponding to utterances. 62 | predict : torch.tensor 63 | A predicted output, for comparison with the target output 64 | targets : list 65 | list of references (when measuring BLEU, one sentence could have more 66 | than one target translation). 67 | ind2lab : callable 68 | Callable that maps from indices to labels, operating on batches, 69 | for writing alignments. 70 | """ 71 | self.ids.extend(ids) 72 | 73 | if ind2lab is not None: 74 | predict = ind2lab(predict) 75 | targets = [ind2lab(t) for t in targets] 76 | 77 | if self.merge_words: 78 | predict = merge_words(predict) 79 | targets = [merge_words(t) for t in targets] 80 | 81 | self.predicts.extend(predict) 82 | if self.targets is None: 83 | self.targets = targets 84 | else: 85 | assert len(self.targets) == len(targets) 86 | for i in range(len(self.targets)): 87 | self.targets[i].extend(targets[i]) 88 | 89 | def summarize(self, field=None): 90 | """Summarize the BLEU and return relevant statistics. 91 | * See MetricStats.summarize() 92 | """ 93 | 94 | # Check extra-dependency for computing the bleu score 95 | try: 96 | import sacrebleu 97 | except ImportError: 98 | print( 99 | "Please install sacrebleu (https://github.com/mjpost/sacreble) in order to use the BLEU metric" 100 | ) 101 | 102 | scores = sacrebleu.corpus_bleu(self.predicts, self.targets) 103 | details = {} 104 | details["BLEU"] = scores.score 105 | details["BP"] = scores.bp 106 | details["ratio"] = scores.sys_len / scores.ref_len 107 | details["hyp_len"] = scores.sys_len 108 | details["ref_len"] = scores.ref_len 109 | details["precisions"] = scores.precisions 110 | 111 | self.scores = scores 112 | self.summary = details 113 | 114 | # Add additional, more generic key 115 | self.summary["bleu_score"] = self.summary["BLEU"] 116 | 117 | if field is not None: 118 | return self.summary[field] 119 | else: 120 | return self.summary 121 | 122 | def write_stats(self, filestream): 123 | """Write all relevant info (e.g., error rate alignments) to file. 124 | * See MetricStats.write_stats() 125 | """ 126 | if not self.summary: 127 | self.summarize() 128 | 129 | print(self.scores, file=filestream) 130 | -------------------------------------------------------------------------------- /speechbrain/utils/callchains.py: -------------------------------------------------------------------------------- 1 | """Chaining together callables, if some require relative lengths""" 2 | import inspect 3 | 4 | 5 | def lengths_arg_exists(func): 6 | """Returns True if func takes ``lengths`` keyword argument. 7 | 8 | Arguments 9 | --------- 10 | func : callable 11 | The function, method, or other callable to search for the lengths arg. 12 | """ 13 | spec = inspect.getfullargspec(func) 14 | return "lengths" in spec.args + spec.kwonlyargs 15 | 16 | 17 | class LengthsCapableChain: 18 | """Chain together callables. Can handle relative lengths. 19 | 20 | This is a more light-weight version of 21 | speechbrain.nnet.containers.LengthsCapableSequential 22 | 23 | Arguments 24 | --------- 25 | *funcs : list, optional 26 | Any number of functions or other callables, given in order of 27 | execution. 28 | 29 | Returns 30 | ------- 31 | Any 32 | The input as processed by each function. If no functions were given, 33 | simply returns the input. 34 | """ 35 | 36 | def __init__(self, *funcs): 37 | self.funcs = [] 38 | self.takes_lengths = [] 39 | for func in funcs: 40 | self.append(func) 41 | 42 | def __call__(self, x, lengths=None): 43 | """Run the chain of callables on the given input 44 | 45 | Arguments 46 | --------- 47 | x : Any 48 | The main input 49 | lengths : Any 50 | The lengths argument which will be conditionally passed to 51 | any functions in the chain that take a 'lengths' argument. 52 | In SpeechBrain the convention is to use relative lengths. 53 | 54 | Note 55 | ---- 56 | By convention, if a callable in the chain returns multiple outputs 57 | (returns a tuple), only the first output is passed to the next 58 | callable in the chain. 59 | """ 60 | if not self.funcs: 61 | return x 62 | for func, give_lengths in zip(self.funcs, self.takes_lengths): 63 | if give_lengths: 64 | x = func(x, lengths) 65 | else: 66 | x = func(x) 67 | if isinstance(x, tuple): 68 | x = x[0] 69 | return x 70 | 71 | def append(self, func): 72 | """Add a function to the chain""" 73 | self.funcs.append(func) 74 | self.takes_lengths.append(lengths_arg_exists(func)) 75 | 76 | def __str__(self): 77 | clsname = self.__class__.__name__ 78 | if self.funcs: 79 | return f"{clsname}:\n" + "\n".join(str(f) for f in self.funcs) 80 | else: 81 | return f"Empty {clsname}" 82 | -------------------------------------------------------------------------------- /speechbrain/utils/epoch_loop.py: -------------------------------------------------------------------------------- 1 | """Implements a checkpointable epoch counter (loop), optionally integrating early stopping. 2 | 3 | Authors 4 | * Aku Rouhe 2020 5 | * Davide Borra 2021 6 | """ 7 | from .checkpoints import register_checkpoint_hooks 8 | from .checkpoints import mark_as_saver 9 | from .checkpoints import mark_as_loader 10 | import logging 11 | 12 | logger = logging.getLogger(__name__) 13 | 14 | 15 | @register_checkpoint_hooks 16 | class EpochCounter: 17 | """An epoch counter which can save and recall its state. 18 | 19 | Use this as the iterator for epochs. 20 | Note that this iterator gives you the numbers from [1 ... limit] not 21 | [0 ... limit-1] as range(limit) would. 22 | 23 | Example 24 | ------- 25 | >>> from speechbrain.utils.checkpoints import Checkpointer 26 | >>> tmpdir = getfixture('tmpdir') 27 | >>> epoch_counter = EpochCounter(10) 28 | >>> recoverer = Checkpointer(tmpdir, {"epoch": epoch_counter}) 29 | >>> recoverer.recover_if_possible() 30 | >>> # Now after recovery, 31 | >>> # the epoch starts from where it left off! 32 | >>> for epoch in epoch_counter: 33 | ... # Run training... 34 | ... ckpt = recoverer.save_checkpoint() 35 | """ 36 | 37 | def __init__(self, limit): 38 | self.current = 0 39 | self.limit = int(limit) 40 | 41 | def __iter__(self): 42 | return self 43 | 44 | def __next__(self): 45 | if self.current < self.limit: 46 | self.current += 1 47 | logger.info(f"Going into epoch {self.current}") 48 | return self.current 49 | raise StopIteration 50 | 51 | @mark_as_saver 52 | def _save(self, path): 53 | with open(path, "w") as fo: 54 | fo.write(str(self.current)) 55 | 56 | @mark_as_loader 57 | def _recover(self, path, end_of_epoch=True, device=None): 58 | # NOTE: end_of_epoch = True by default so that when 59 | # loaded in parameter transfer, this starts a new epoch. 60 | # However, parameter transfer to EpochCounter should 61 | # probably never be used really. 62 | del device # Not used. 63 | with open(path) as fi: 64 | saved_value = int(fi.read()) 65 | if end_of_epoch: 66 | self.current = saved_value 67 | else: 68 | self.current = saved_value - 1 69 | 70 | 71 | class EpochCounterWithStopper(EpochCounter): 72 | """An epoch counter which can save and recall its state, integrating an early stopper by tracking a target metric. 73 | 74 | Arguments 75 | --------- 76 | limit: int 77 | maximum number of epochs 78 | limit_to_stop : int 79 | maximum number of consecutive epochs without improvements in performance 80 | limit_warmup : int 81 | number of epochs to wait until start checking for early stopping 82 | direction : "max" or "min" 83 | direction to optimize the target metric 84 | 85 | Example 86 | ------- 87 | >>> limit = 10 88 | >>> limit_to_stop = 5 89 | >>> limit_warmup = 2 90 | >>> direction = "min" 91 | >>> epoch_counter = EpochCounterWithStopper(limit, limit_to_stop, limit_warmup, direction) 92 | >>> for epoch in epoch_counter: 93 | ... # Run training... 94 | ... # Track a validation metric, 95 | ... current_valid_metric = 0 96 | ... # get the current valid metric (get current_valid_metric) 97 | ... if epoch_counter.should_stop(current=epoch, 98 | ... current_metric=current_valid_metric,): 99 | ... epoch_counter.current = epoch_counter.limit # skipping unpromising epochs 100 | """ 101 | 102 | def __init__(self, limit, limit_to_stop, limit_warmup, direction): 103 | super().__init__(limit) 104 | self.limit_to_stop = limit_to_stop 105 | self.limit_warmup = limit_warmup 106 | self.direction = direction 107 | 108 | self.best_limit = 0 109 | self.min_delta = 1e-6 110 | 111 | if self.limit_to_stop < 0: 112 | raise ValueError("Stopper 'limit_to_stop' must be >= 0") 113 | if self.limit_warmup < 0: 114 | raise ValueError("Stopper 'limit_warmup' must be >= 0") 115 | if self.direction == "min": 116 | self.th, self.sign = float("inf"), 1 117 | elif self.direction == "max": 118 | self.th, self.sign = -float("inf"), -1 119 | else: 120 | raise ValueError("Stopper 'direction' must be 'min' or 'max'") 121 | 122 | def should_stop(self, current, current_metric): 123 | should_stop = False 124 | if current > self.limit_warmup: 125 | if self.sign * current_metric < self.sign * ( 126 | (1 - self.min_delta) * self.th 127 | ): 128 | self.best_limit = current 129 | self.th = current_metric 130 | should_stop = (current - self.best_limit) >= self.limit_to_stop 131 | return should_stop 132 | -------------------------------------------------------------------------------- /speechbrain/utils/logger.py: -------------------------------------------------------------------------------- 1 | """Managing the logger, utilities 2 | 3 | Author 4 | * Fang-Pen Lin 2012 https://fangpenlin.com/posts/2012/08/26/good-logging-practice-in-python/ 5 | * Peter Plantinga 2020 6 | * Aku Rouhe 2020 7 | """ 8 | 9 | import sys 10 | import os 11 | import yaml 12 | import tqdm 13 | import logging 14 | import logging.config 15 | import math 16 | import torch 17 | from speechbrain.utils.data_utils import recursive_update 18 | from speechbrain.utils.superpowers import run_shell 19 | 20 | ORDERS_ABBREV = { 21 | -24: "y", 22 | -21: "z", 23 | -18: "a", 24 | -15: "f", 25 | -12: "p", 26 | -9: "n", 27 | -6: "µ", 28 | -3: "m", 29 | 0: "", 30 | 3: "k", 31 | 6: "M", 32 | 9: "G", 33 | 12: "T", 34 | 15: "P", 35 | 18: "E", 36 | 21: "Z", 37 | 24: "Y", 38 | } 39 | 40 | # Short scale 41 | # Negative powers of ten in lowercase, positive in uppercase 42 | ORDERS_WORDS = { 43 | -24: "septillionths", 44 | -21: "sextillionths", 45 | -18: "quintillionths", 46 | -15: "quadrillionths", 47 | -12: "trillionths", 48 | -9: "billionths", 49 | -6: "millionths", 50 | -3: "thousandths", 51 | 0: "", 52 | 3: "Thousand", 53 | 6: "Million", 54 | 9: "Billion", 55 | 12: "Trillion", 56 | 15: "Quadrillion", 57 | 18: "Quintillion", 58 | 21: "Sextillion", 59 | 24: "Septillion", 60 | } 61 | 62 | 63 | class TqdmCompatibleStreamHandler(logging.StreamHandler): 64 | """TQDM compatible StreamHandler. 65 | 66 | Writes and prints should be passed through tqdm.tqdm.write 67 | so that the tqdm progressbar doesn't get messed up. 68 | """ 69 | 70 | def emit(self, record): 71 | try: 72 | msg = self.format(record) 73 | stream = self.stream 74 | tqdm.tqdm.write(msg, end=self.terminator, file=stream) 75 | self.flush() 76 | except RecursionError: 77 | raise 78 | except Exception: 79 | self.handleError(record) 80 | 81 | 82 | def setup_logging( 83 | config_path="log-config.yaml", overrides={}, default_level=logging.INFO, 84 | ): 85 | """Setup logging configuration. 86 | 87 | Arguments 88 | --------- 89 | config_path : str 90 | The path to a logging config file. 91 | default_level : int 92 | The level to use if the config file is not found. 93 | overrides : dict 94 | A dictionary of the same structure as the config dict 95 | with any updated values that need to be applied. 96 | """ 97 | if os.path.exists(config_path): 98 | with open(config_path, "rt") as f: 99 | config = yaml.safe_load(f) 100 | recursive_update(config, overrides) 101 | logging.config.dictConfig(config) 102 | else: 103 | logging.basicConfig(level=default_level) 104 | 105 | 106 | def format_order_of_magnitude(number, abbreviate=True): 107 | """Formats number to the appropriate order of magnitude for printing. 108 | 109 | Arguments 110 | --------- 111 | number : int, float 112 | The number to format. 113 | abbreviate : bool 114 | Whether to use abbreviations (k,M,G) or words (Thousand, Million, 115 | Billion). Numbers will be either like: "123.5k" or "123.5 Thousand". 116 | 117 | Returns 118 | ------- 119 | str 120 | The formatted number. Note that the order of magnitude token is part 121 | of the string. 122 | 123 | Example 124 | ------- 125 | >>> print(format_order_of_magnitude(123456)) 126 | 123.5k 127 | >>> print(format_order_of_magnitude(0.00000123, abbreviate=False)) 128 | 1.2 millionths 129 | >>> print(format_order_of_magnitude(5, abbreviate=False)) 130 | 5 131 | """ 132 | style = ORDERS_ABBREV if abbreviate else ORDERS_WORDS 133 | precision = "{num:3.1f}" 134 | order = 3 * math.floor(math.log(math.fabs(number), 1000)) 135 | # Fallback for very large numbers: 136 | while order not in style and order != 0: 137 | order = order - math.copysign(3, order) # Bring 3 units towards 0 138 | order_token = style[order] 139 | if order != 0: 140 | formatted_number = precision.format(num=number / 10 ** order) 141 | else: 142 | if isinstance(number, int): 143 | formatted_number = str(number) 144 | else: 145 | formatted_number = precision.format(num=number) 146 | if abbreviate or not order_token: 147 | return formatted_number + order_token 148 | else: 149 | return formatted_number + " " + order_token 150 | 151 | 152 | def get_environment_description(): 153 | """Returns a string describing the current Python / SpeechBrain environment. 154 | 155 | Useful for making experiments as replicable as possible. 156 | 157 | Returns 158 | ------- 159 | str 160 | The string is formatted ready to be written to a file. 161 | 162 | Example 163 | ------- 164 | >>> get_environment_description().splitlines()[0] 165 | 'SpeechBrain system description' 166 | """ 167 | python_version_str = "Python version:\n" + sys.version + "\n" 168 | try: 169 | freezed, _, _ = run_shell("pip freeze") 170 | python_packages_str = "Installed Python packages:\n" 171 | python_packages_str += freezed.decode(errors="replace") 172 | except OSError: 173 | python_packages_str = "Could not list python packages with pip freeze" 174 | try: 175 | git_hash, _, _ = run_shell("git rev-parse --short HEAD") 176 | git_str = "Git revision:\n" + git_hash.decode(errors="replace") 177 | except OSError: 178 | git_str = "Could not get git revision" 179 | if torch.cuda.is_available(): 180 | cuda_str = "Cuda version:\n" + torch.version.cuda 181 | else: 182 | cuda_str = "CUDA not available" 183 | result = "SpeechBrain system description\n" 184 | result += "==============================\n" 185 | result += python_version_str 186 | result += "==============================\n" 187 | result += python_packages_str 188 | result += "==============================\n" 189 | result += git_str 190 | result += "==============================\n" 191 | result += cuda_str 192 | return result 193 | -------------------------------------------------------------------------------- /speechbrain/utils/superpowers.py: -------------------------------------------------------------------------------- 1 | """Superpowers which should be sparingly used. 2 | 3 | This library contains functions for importing python files and 4 | for running shell commands. Remember, with great power comes great 5 | responsibility. 6 | 7 | Authors 8 | * Mirco Ravanelli 2020 9 | * Aku Rouhe 2021 10 | """ 11 | 12 | import logging 13 | import subprocess 14 | import importlib 15 | import pathlib 16 | 17 | logger = logging.getLogger(__name__) 18 | 19 | 20 | def import_from_path(path): 21 | r"""Import module from absolute path 22 | 23 | Arguments 24 | --------- 25 | path : str, pathlib.Path 26 | The path to the module to import 27 | 28 | Returns 29 | ------- 30 | module 31 | The loaded module 32 | 33 | >>> modulepath = getfixture("tmpdir") / "helloer.py" 34 | >>> with open(modulepath, "w") as fo: 35 | ... _ = fo.write("def a():\n\treturn 'hello'") 36 | >>> helloer = import_from_path(modulepath) 37 | >>> helloer.a() 38 | 'hello' 39 | 40 | Implementation taken from: 41 | https://docs.python.org/3/library/importlib.html#importing-a-source-file-directly 42 | """ 43 | path = pathlib.Path(path) 44 | modulename = path.with_suffix("").name 45 | spec = importlib.util.spec_from_file_location(modulename, path) 46 | module = importlib.util.module_from_spec(spec) 47 | spec.loader.exec_module(module) 48 | return module 49 | 50 | 51 | def run_shell(cmd): 52 | r"""This function can be used to run a command in the bash shell. 53 | 54 | Arguments 55 | --------- 56 | cmd : str 57 | Shell command to run. 58 | 59 | Returns 60 | ------- 61 | bytes 62 | The captured standard output. 63 | bytes 64 | The captured standard error. 65 | int 66 | The returncode. 67 | 68 | Raises 69 | ------ 70 | OSError 71 | If returncode is not 0, i.e., command failed. 72 | 73 | Example 74 | ------- 75 | >>> out, err, code = run_shell("echo 'hello world'") 76 | >>> out.decode(errors="ignore") 77 | 'hello world\n' 78 | """ 79 | 80 | # Executing the command 81 | p = subprocess.Popen( 82 | cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True 83 | ) 84 | 85 | # Capturing standard output and error 86 | (output, err) = p.communicate() 87 | 88 | if p.returncode != 0: 89 | raise OSError(err.decode(errors="replace")) 90 | 91 | # Adding information in the logger 92 | msg = output.decode(errors="replace") + "\n" + err.decode(errors="replace") 93 | logger.debug(msg) 94 | 95 | return output, err, p.returncode 96 | -------------------------------------------------------------------------------- /speechbrain/utils/torch_audio_backend.py: -------------------------------------------------------------------------------- 1 | import platform 2 | import logging 3 | import torchaudio 4 | 5 | logger = logging.getLogger(__name__) 6 | 7 | 8 | def check_torchaudio_backend(): 9 | """Checks the torchaudio backend and sets it to soundfile if 10 | windows is detected. 11 | """ 12 | current_system = platform.system() 13 | if current_system == "Windows": 14 | logger.warn( 15 | "The torchaudio backend is switched to 'soundfile'. Note that 'sox_io' is not supported on Windows." 16 | ) 17 | torchaudio.set_audio_backend("soundfile") 18 | -------------------------------------------------------------------------------- /speechbrain/version.txt: -------------------------------------------------------------------------------- 1 | 0.5.11 2 | --------------------------------------------------------------------------------