├── LICENSE
├── MIR_ST500
    ├── README.md
    ├── hparams
    │   └── train_audio_ssl.yaml
    ├── huggingface_interface.py
    ├── prepare_benchmarks.py
    ├── train_audio_ssl.py
    └── utils.py
├── N20EMv2
    ├── audio_only
    │   ├── README.md
    │   ├── extract_ssl_feats.py
    │   ├── hparams
    │   │   ├── extract_ssl_feats.yaml
    │   │   └── train_audio_ssl.yaml
    │   ├── huggingface_interface.py
    │   ├── prepare_benchmarks.py
    │   ├── prepare_n20emv2.py
    │   ├── train_audio_ssl.py
    │   └── utils.py
    ├── audio_visual
    │   ├── README.md
    │   ├── fusion.py
    │   ├── hparams
    │   │   ├── train_rca_a.yaml
    │   │   └── train_rca_av.yaml
    │   ├── prepare_musan.py
    │   ├── prepare_n20emv2.py
    │   ├── synthesis_noise.py
    │   ├── train_rca_a.py
    │   ├── train_rca_av.py
    │   └── utils.py
    └── video_only
    │   ├── README.md
    │   ├── decoder.py
    │   ├── extract_ssl_feats.py
    │   ├── fairseq_interface.py
    │   ├── hparams
    │       ├── extract_ssl_feats.yaml
    │       └── train_video_ssl.yaml
    │   ├── hubert.py
    │   ├── hubert_asr.py
    │   ├── hubert_dataset.py
    │   ├── hubert_pretraining.py
    │   ├── prepare_n20emv2.py
    │   ├── resnet.py
    │   ├── sequence_generator.py
    │   ├── train_video_ssl.py
    │   └── utils.py
├── README.md
├── assets
    ├── framework.png
    ├── noise_test.png
    ├── results.png
    └── results2.png
├── dependencies.txt
├── requirements.txt
├── setup.py
└── speechbrain
    ├── __init__.py
    ├── alignment
        ├── __init__.py
        ├── aligner.py
        └── ctc_segmentation.py
    ├── core.py
    ├── dataio
        ├── __init__.py
        ├── batch.py
        ├── dataio.py
        ├── dataloader.py
        ├── dataset.py
        ├── encoder.py
        ├── iterators.py
        ├── legacy.py
        ├── preprocess.py
        ├── sampler.py
        └── wer.py
    ├── decoders
        ├── __init__.py
        ├── ctc.py
        ├── seq2seq.py
        └── transducer.py
    ├── lm
        ├── __init__.py
        ├── arpa.py
        ├── counting.py
        └── ngram.py
    ├── lobes
        ├── __init__.py
        ├── augment.py
        ├── beamform_multimic.py
        ├── features.py
        └── models
        │   ├── CRDNN.py
        │   ├── ContextNet.py
        │   ├── ECAPA_TDNN.py
        │   ├── ESPnetVGG.py
        │   ├── IMU_CRNN.py
        │   ├── MetricGAN.py
        │   ├── MetricGAN_U.py
        │   ├── QuasiRNN.py
        │   ├── RNNLM.py
        │   ├── VanillaNN.py
        │   ├── Xvector.py
        │   ├── __init__.py
        │   ├── conv_tasnet.py
        │   ├── convolution.py
        │   ├── decoder.py
        │   ├── dual_path.py
        │   ├── fairseq_wav2vec.py
        │   ├── hubert.py
        │   ├── hubert_asr.py
        │   ├── hubert_dataset.py
        │   ├── hubert_pretraining.py
        │   ├── huggingface_wav2vec.py
        │   ├── resnet.py
        │   ├── segan_model.py
        │   ├── sequence_generator.py
        │   ├── transformer
        │       ├── Conformer.py
        │       ├── Transformer.py
        │       ├── TransformerASR.py
        │       ├── TransformerLM.py
        │       ├── TransformerSE.py
        │       ├── TransformerST.py
        │       └── __init__.py
        │   └── utils.py
    ├── log-config.yaml
    ├── nnet
        ├── CNN.py
        ├── RNN.py
        ├── __init__.py
        ├── activations.py
        ├── attention.py
        ├── complex_networks
        │   ├── __init__.py
        │   ├── c_CNN.py
        │   ├── c_RNN.py
        │   ├── c_linear.py
        │   ├── c_normalization.py
        │   └── c_ops.py
        ├── containers.py
        ├── dropout.py
        ├── embedding.py
        ├── linear.py
        ├── loss
        │   ├── __init__.py
        │   ├── guidedattn_loss.py
        │   ├── si_snr_loss.py
        │   ├── stoi_loss.py
        │   └── transducer_loss.py
        ├── losses.py
        ├── normalization.py
        ├── pooling.py
        ├── quaternion_networks
        │   ├── __init__.py
        │   ├── q_CNN.py
        │   ├── q_RNN.py
        │   ├── q_linear.py
        │   ├── q_normalization.py
        │   └── q_ops.py
        ├── schedulers.py
        └── transducer
        │   ├── __init__.py
        │   └── transducer_joint.py
    ├── pretrained
        ├── __init__.py
        ├── fetching.py
        └── interfaces.py
    ├── processing
        ├── NMF.py
        ├── PLDA_LDA.py
        ├── __init__.py
        ├── decomposition.py
        ├── diarization.py
        ├── features.py
        ├── multi_mic.py
        ├── signal_processing.py
        └── speech_augmentation.py
    ├── tokenizers
        ├── SentencePiece.py
        └── __init__.py
    ├── utils
        ├── Accuracy.py
        ├── DER.py
        ├── __init__.py
        ├── bleu.py
        ├── callchains.py
        ├── checkpoints.py
        ├── data_pipeline.py
        ├── data_utils.py
        ├── depgraph.py
        ├── distributed.py
        ├── edit_distance.py
        ├── epoch_loop.py
        ├── hpopt.py
        ├── logger.py
        ├── metric_stats.py
        ├── parameter_transfer.py
        ├── superpowers.py
        ├── torch_audio_backend.py
        └── train_logger.py
    └── version.txt


/MIR_ST500/README.md:
--------------------------------------------------------------------------------
 1 | # Audio-only Automatic Music Transcription with MIR-ST500 dataset
 2 | This sub-project contains recipes for trianing benchmark AMT system using MIR-ST500 dataset.
 3 | 
 4 | ## Prerequisites
 5 | 1. Before running our scripts, you need to download, preprocess and save the datasets properly. For polyphonic singing recordings, we use [spleeter](https://github.com/deezer/spleeter) to extract the vocal part. Besides, to meet the requirements of self-supervised-learning models in our project, we resample the audio data into 16 kHz. We provide sample code in `prepare_benchmarks.py`. For the annotations, we save them to a json file. 
 6 | 
 7 | The file organization for MIR-ST500 should be:
 8 | ```
 9 | /path/to/MIR_ST500
10 | ├── wav16kHz
11 |     ├── train
12 |         ├── song1
13 |             ├── vocals.wav
14 |         ├── song2
15 |         ├── ...
16 |     ├── test
17 | ├── Annotations.json
18 | ```
19 | 
20 | The file organization for ISMIR and TONAS should be:
21 | ```
22 | /path/to/ISMIR or /path/to/TONAS
23 | ├── wav16kHz
24 |     ├── song1
25 |         ├── vocals.wav
26 |     ├── song2
27 |     ├── ...
28 | ├── Annotations.json
29 | ```
30 | 
31 | 
32 | 2. Prepare benchmark AMT for singing datasets including MIR-ST500, TONAS, and ISMIR2014, run:
33 | ```
34 | python prepare_benchmarks.py --duration <duration> --frame_rate 49.8 --mir_st500 /path/to/MIR_ST500 --ismir /path/to/ISMIR2014 --tonas /path/to/TONAS
35 | ```
36 | The option `--duration` refers to the length of utterances during the training. To parallelize the training, we split the whole song into short utterances during the training. The evaluation is conducted on the whole song. As a default, we use `5` s in our paper. The option `--frame_rate` refers to the frame rate of frame-level annotations. As a default, we use `49.8` fps, which is the frame rate of wav2vec 2.0 features.
37 | 
38 | After running this script, the file organization for MIR-ST500 should be:
39 | ```
40 | /path/to/MIR_ST500
41 | ├── wav16kHz
42 |     ├── train
43 |         ├── song1
44 |             ├── vocals.wav
45 |             ├── annotation.json
46 |             ├── frame_anno.npy
47 |         ├── song2
48 |         ├── ...
49 |     ├── test
50 | ├── Annotations.json
51 | ```
52 | 
53 | The file organization for ISMIR and TONAS should be:
54 | ```
55 | /path/to/ISMIR or /path/to/TONAS
56 | ├── wav16kHz
57 |     ├── song1
58 |         ├── vocals.wav
59 |         ├── annotation.json
60 |         ├── frame_anno.npy
61 |     ├── song2
62 |     ├── ...
63 | ├── Annotations.json
64 | ```
65 | 
66 | The resulted csv files are save in the same root folder: 
67 | ```
68 | ├── data
69 |     ├── dur_<duration>
70 |         ├── mir_st500_train.csv
71 |         ├── mir_st500_test.csv
72 |         ├── ismir2014.csv
73 |         ├── tonas.csv
74 | ├── prepare_benchmarks.py
75 | ```
76 | 
77 | ## How to run
78 | We provide basic runnning scripts for those who intend to follow our research. You can change the hyperparameters or even the types of self-supervised-learning (SSL) models in your own project. To reproduce `ours variant 1` in our paper, run:
79 | ```
80 | CUDA_VISIBLE_DEVICES=0,1 python train_audio_ssl.py hparams/train_audio_ssl.yaml --data_parallel_backend --data_folder /path/to/MIR_ST500 --attempt 1 --dur_threshold 5 --linear_prob_epochs 2 --number_of_epochs 10 --ssl_model wav2vec2-large-lv60
81 | ```
82 | The option `--linear_prob_epochs` refers to the number of epochs for linear probing in our paper. The option `--ssl_model` refers to the self-supervised-learning (SSL) model we used. Although we use data parallel (DP) in our experiments, we also provide distributed data parallel (DDP) version (remember to change the `batch_size` to avoid OOM):
83 | ```
84 | CUDA_VISIBLE_DEVICES=0,1 python -m torch.distributed.launch --nproc_per_node=2 train_audio_ssl.py hparams/train_audio_ssl.yaml --distributed_launch --distributed_backend='nccl' --find_unused_parameters --data_folder /path/to/MIR_ST500 --attempt 1 --dur_threshold 5 --linear_prob_epochs 2 --number_of_epochs 10 --ssl_model wav2vec2-large-lv60
85 | ```
86 | 
87 | NOTE: For SSL models, we used `wav2vec2-large-lv60` as the wav2vec 2.0 pretrained on speech data and `wav2vec2-large-960h-lv60-self` as the wav2vec 2.0 pretrained and finetuned on speech data. To facilitate the follow-up research, our repo supports the following SSL models: [wav2vec2](https://arxiv.org/abs/2006.11477), [HuBERT](https://arxiv.org/abs/2106.07447), [data2vec](https://arxiv.org/abs/2202.03555), [WavLM](https://arxiv.org/abs/2110.13900). Please find the checkpoint name in [Huggingface](https://huggingface.co/models).
88 | 
89 | ## Results
90 | We provide our trained AMT model of `ours 1`[[model link](https://drive.google.com/drive/folders/18IvMt8vrtZewCjCSy6DTPfZzhJw4SI95?usp=sharing)] in the paper.
91 | <p align="center">
92 | <img src="../assets/results.png" alt="" data-canonical-src="../assets/results.png width="100%"/>
93 | </p>


--------------------------------------------------------------------------------
/MIR_ST500/hparams/train_audio_ssl.yaml:
--------------------------------------------------------------------------------
  1 | # ################################
  2 | # Model: wav2vec2 + Linear
  3 | # Authors: Xiangming Gu 2022
  4 | # ################################
  5 | 
  6 | # Seed needs to be set at top of yaml, before objects with parameters are made
  7 | seed: 1986
  8 | __set_seed: !apply:torch.manual_seed [!ref <seed>]
  9 | attempt: 1
 10 | dur_threshold: 5
 11 | ssl_model: wav2vec2-large-lv60
 12 | output_folder: !ref results/<ssl_model>/train_audio_ssl_dur<dur_threshold>_attempt<attempt>/<seed>
 13 | save_folder: !ref <output_folder>/save
 14 | csv_folder: !ref data/dur_<dur_threshold>s
 15 | train_log: !ref <output_folder>/train_log.txt
 16 | 
 17 | # URL for the biggest Fairseq english data2vec model.
 18 | wav2vec2_hub: !ref facebook/<ssl_model>
 19 | wav2vec2_local: !ref ssl_model/<ssl_model>
 20 | 
 21 | # Data files
 22 | data_folder: !PLACEHOLDER # e,g./path/to/N20EMv2
 23 | # noise/ris dataset will automatically be downloaded
 24 | data_folder_rirs: !ref <data_folder>
 25 | ckpt_interval_minutes: 25 # save checkpoint every N min
 26 | train_csv: !ref <csv_folder>/mir_st500_train.csv
 27 | valid_csv: !ref <csv_folder>/mir_st500_valid.csv
 28 | test_csv:
 29 |    - !ref <csv_folder>/mir_st500_test.csv
 30 |    - !ref <csv_folder>/tonas.csv
 31 |    - !ref <csv_folder>/ismir2014.csv
 32 | 
 33 | noise_type: babble
 34 | snr_db: -10
 35 | add_noise: False
 36 | # Training parameters
 37 | number_of_epochs: 10
 38 | lr: 0.0003
 39 | lr_wav2vec: 0.00005
 40 | sorting: ascending
 41 | auto_mix_prec: False
 42 | sample_rate: 16000
 43 | frame_rate: 49.8
 44 | linear_prob_epochs: 2
 45 | pretrain: False
 46 | pretrain_folder: ../pretrain_model
 47 | save_model: False
 48 | save_model_folder: ../save_model
 49 | 
 50 | # Evaluating parameters
 51 | onset_threshold: 0.4
 52 | offset_threshold: 0.5
 53 | onset_tolerance: 0.05
 54 | pitch_tolerance: 50
 55 | 
 56 | # With data_parallel batch_size is split into N jobs
 57 | # With DDP batch_size is multiplied by N jobs
 58 | # Must be 3 per GPU to fit 32GB of VRAM
 59 | batch_size: 8
 60 | test_batch_size: 1
 61 | num_workers: 8
 62 | 
 63 | # Dataloader options
 64 | train_dataloader_opts:
 65 |    batch_size: !ref <batch_size>
 66 |    num_workers: !ref <num_workers>
 67 | 
 68 | valid_dataloader_opts:
 69 |    batch_size: !ref <test_batch_size>
 70 |    num_workers: !ref <num_workers>
 71 | 
 72 | test_dataloader_opts:
 73 |    batch_size: !ref <test_batch_size>
 74 |    num_workers: !ref <num_workers>
 75 | 
 76 | # Model parameters
 77 | freeze_wav2vec: False
 78 | 
 79 | # Outputs
 80 | pitch_octave_num: 4
 81 | pitch_class_num: 12
 82 | feat_dim: 1024
 83 | output_neurons: 20  # 2+pitch_octave+pitch_class+2
 84 | 
 85 | #
 86 | # Functions and classes
 87 | #
 88 | epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter
 89 |    limit: !ref <number_of_epochs>
 90 | 
 91 | # augmentation: !new:speechbrain.lobes.augment.TimeDomainSpecAugment
 92 | #    sample_rate: !ref <sample_rate>
 93 | #    speeds: [95, 100, 105]
 94 | 
 95 | wav2vec2: !new:huggingface_interface.HuggingFaceWav2Vec2
 96 |    source: !ref <wav2vec2_hub>
 97 |    output_norm: True
 98 |    freeze: !ref <freeze_wav2vec>
 99 |    save_path: !ref <wav2vec2_local> # !ref <save_folder>/wav2vec2_checkpoint
100 | 
101 | #####
102 | # Uncomment this block if you prefer to use a Fairseq pretrained model instead
103 | # of a HuggingFace one. Here, we provide an URL that is obtained from the
104 | # Fairseq github for the multilingual XLSR.
105 | #
106 | # wav2vec2_url: https://dl.fbaipublicfiles.com/fairseq/wav2vec/w2v_large_lv_fsh_swbd_cv.pt
107 | # wav2vec2: !new:nets.fairseq_interface.FairseqWav2Vec2
108 | #    pretrained_path: !ref <wav2vec2_url>
109 | #    output_norm: True
110 | #    freeze: False
111 | #    save_path: !ref <wav2vec2_local>
112 | 
113 | model: !new:speechbrain.nnet.linear.Linear
114 |    input_size: !ref <feat_dim>
115 |    n_neurons: !ref <output_neurons>
116 | 
117 | modules:
118 |    wav2vec2: !ref <wav2vec2>
119 |    model: !ref <model>
120 | 
121 | log_softmax: !new:speechbrain.nnet.activations.Softmax
122 |    apply_log: True
123 | 
124 | onset_positive_weight: 15.0
125 | offset_positive_weight: 1.0
126 | onset_criterion: !name:speechbrain.nnet.losses.bce_loss
127 |    reduction: mean
128 |    allowed_len_diff: 3
129 |    label_smoothing: 0.0
130 | 
131 | offset_criterion: !name:speechbrain.nnet.losses.bce_loss
132 |    reduction: mean
133 |    allowed_len_diff: 3
134 |    label_smoothing: 0.0
135 | 
136 | octave_criterion: !name:speechbrain.nnet.losses.nll_loss
137 |    reduction: mean
138 |    allowed_len_diff: 3
139 |    label_smoothing: 0.0
140 | 
141 | pitch_criterion: !name:speechbrain.nnet.losses.nll_loss
142 |    reduction: mean
143 |    allowed_len_diff: 3
144 |    label_smoothing: 0.0
145 | 
146 | model_opt_class: !name:torch.optim.Adadelta
147 |    lr: !ref <lr>
148 |    rho: 0.95
149 |    eps: 1.e-8
150 | 
151 | wav2vec_opt_class: !name:torch.optim.Adam
152 |    lr: !ref <lr_wav2vec>
153 | 
154 | lr_annealing_model: !new:speechbrain.nnet.schedulers.NewBobScheduler
155 |    initial_value: !ref <lr>
156 |    improvement_threshold: 0.0025
157 |    annealing_factor: 0.8
158 |    patient: 0
159 | 
160 | lr_annealing_wav2vec: !new:speechbrain.nnet.schedulers.NewBobScheduler
161 |    initial_value: !ref <lr_wav2vec>
162 |    improvement_threshold: 0.0025
163 |    annealing_factor: 0.9
164 |    patient: 0
165 | 
166 | 
167 | checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
168 |    checkpoints_dir: !ref <save_folder>
169 |    recoverables:
170 |       wav2vec2: !ref <wav2vec2>
171 |       model: !ref <model>
172 |       scheduler_model: !ref <lr_annealing_model>
173 |       scheduler_wav2vec: !ref <lr_annealing_wav2vec>
174 |       counter: !ref <epoch_counter>
175 | 
176 | train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
177 |    save_file: !ref <train_log>
178 |    precision: 3
179 | 
180 | onset_stats: !name:speechbrain.utils.metric_stats.MetricStats
181 |     metric: !name:speechbrain.nnet.losses.bce_loss
182 |         reduction: batch
183 |         allowed_len_diff: 3
184 |         label_smoothing: 0.0
185 | 
186 | offset_stats: !name:speechbrain.utils.metric_stats.MetricStats
187 |     metric: !name:speechbrain.nnet.losses.bce_loss
188 |         reduction: batch
189 |         allowed_len_diff: 3
190 |         label_smoothing: 0.0
191 | 
192 | octave_stats: !name:speechbrain.utils.metric_stats.MetricStats
193 |     metric: !name:speechbrain.nnet.losses.nll_loss
194 |         reduction: batch
195 |         allowed_len_diff: 3
196 |         label_smoothing: 0.0
197 | 
198 | pitch_stats: !name:speechbrain.utils.metric_stats.MetricStats
199 |     metric: !name:speechbrain.nnet.losses.nll_loss
200 |         reduction: batch
201 |         allowed_len_diff: 3
202 |         label_smoothing: 0.0


--------------------------------------------------------------------------------
/N20EMv2/audio_only/README.md:
--------------------------------------------------------------------------------
  1 | # Audio-only Automatic Music Transcription with N20EMv2 dataset
  2 | This sub-project contains recipes for trianing audio-only AMT system using N20EMv2 dataset.
  3 | 
  4 | ## Prerequisites
  5 | 1. Before running our scripts, you need to download, preprocess and save the datasets properly. For polyphonic singing recordings, we use [spleeter](https://github.com/deezer/spleeter) to extract the vocal part. Besides, to meet the requirements of self-supervised-learning models in our project, we resample the audio data into 16 kHz. We provide sample code in `prepare_benchmarks.py`. For the annotations, we save them to a json file. 
  6 | 
  7 | The file organization for MIR-ST500 should be:
  8 | ```
  9 | /path/to/MIR_ST500
 10 | ├── wav16kHz
 11 |     ├── train
 12 |         ├── song1
 13 |             ├── vocals.wav
 14 |         ├── song2
 15 |         ├── ...
 16 |     ├── test
 17 | ├── Annotations.json
 18 | ```
 19 | 
 20 | The file organization for ISMIR and TONAS should be:
 21 | ```
 22 | /path/to/ISMIR or /path/to/TONAS
 23 | ├── wav16kHz
 24 |     ├── song1
 25 |         ├── vocals.wav
 26 |     ├── song2
 27 |     ├── ...
 28 | ├── Annotations.json
 29 | ```
 30 | 
 31 | The file organization for N20EMv2 should be:
 32 | ```
 33 | /path/to/N20EMv2
 34 | ├── data
 35 |     ├── song1
 36 |         ├── vocals.wav
 37 |         ├── video_50fps.npy
 38 |     ├── song2
 39 |     ├── ...
 40 | ├── annotations.json
 41 | ```
 42 | 
 43 | 
 44 | 2. Prepare benchmark AMT for singing datasets including MIR-ST500, TONAS, and ISMIR2014, run:
 45 | ```
 46 | python prepare_benchmarks.py --duration <duration> --frame_rate 49.8 --mir_st500 /path/to/MIR_ST500 --ismir /path/to/ISMIR2014 --tonas /path/to/TONAS
 47 | ```
 48 | The option `--duration` refers to the length of utterances during the training. To parallelize the training, we split the whole song into short utterances during the training. The evaluation is conducted on the whole song. As a default, we use `5` s in our paper. The option `--frame_rate` refers to the frame rate of frame-level annotations. As a default, we use `49.8` fps, which is the frame rate of wav2vec 2.0 features.
 49 | 
 50 | After running this script, the file organization for MIR-ST500 should be:
 51 | ```
 52 | /path/to/MIR_ST500
 53 | ├── wav16kHz
 54 |     ├── train
 55 |         ├── song1
 56 |             ├── vocals.wav
 57 |             ├── annotation.json
 58 |             ├── frame_anno.npy
 59 |         ├── song2
 60 |         ├── ...
 61 |     ├── test
 62 | ├── Annotations.json
 63 | ```
 64 | 
 65 | The file organization for ISMIR and TONAS should be:
 66 | ```
 67 | /path/to/ISMIR or /path/to/TONAS
 68 | ├── wav16kHz
 69 |     ├── song1
 70 |         ├── vocals.wav
 71 |         ├── annotation.json
 72 |         ├── frame_anno.npy
 73 |     ├── song2
 74 |     ├── ...
 75 | ├── Annotations.json
 76 | ```
 77 | 
 78 | 3. Prepare N20EMv2 dataset, run:
 79 | ```
 80 | python prepare_n20emv2.py --duration <duration> --frame_rate 49.8 --n20emv2 /path/to/n20emv2
 81 | ```
 82 | 
 83 | After running this script, the file organization for N20EMv2 should be:
 84 | ```
 85 | /path/to/N20EMv2
 86 | ├── data
 87 |     ├── song1
 88 |         ├── vocals.wav
 89 |         ├── video_50fps.npy
 90 |         ├── note_anno.json
 91 |         ├── audio_anno
 92 |             ├── 49.8fps
 93 |                 ├── audio_frame_anno.npy
 94 |     ├── song2
 95 |     ├── ...
 96 | ├── annotations.json
 97 | ```
 98 | 
 99 | The resulted csv files are save in the same root folder: 
100 | ```
101 | ├── data
102 |     ├── dur_<duration>
103 |         ├── mir_st500_train.csv
104 |         ├── mir_st500_test.csv
105 |         ├── n20emv2_train.csv
106 |         ├── n20emv2_valid.csv
107 |         ├── n20emv2_test.csv
108 |         ├── ismir2014.csv
109 |         ├── tonas.csv
110 |         ├── mix_train.csv
111 | ├── prepare_benchmarks.py
112 | ├── prepare_n20emv2.py
113 | ```
114 | 
115 | ## How to run
116 | We provide basic runnning scripts for those who intend to follow our research. You can change the hyperparameters or even the types of self-supervised-learning (SSL) models in your own project. To reproduce `ours variant 2` in our paper, run:
117 | ```
118 | CUDA_VISIBLE_DEVICES=0,1 python train_audio_ssl.py hparams/train_audio_ssl.yaml --data_parallel_backend --mix_train True --data_folder /path/to/N20EMv2 --attempt 1 --dur_threshold 5 --linear_prob_epochs 2 --number_of_epochs 10 --ssl_model wav2vec2-large-lv60
119 | ```
120 | The option `--mix_train` refers to whether to mix the training data of MIR_ST500 and N20EMv2. If `False`, only N20EMv2 is used during training. The option `--linear_prob_epochs` refers to the number of epochs for linear probing in our paper. The option `--ssl_model` refers to the self-supervised-learning (SSL) model we used. Although we use data parallel (DP) in our experiments, we also provide distributed data parallel (DDP) version (remember to change the `batch_size` to avoid OOM):
121 | ```
122 | CUDA_VISIBLE_DEVICES=0,1 python -m torch.distributed.launch --nproc_per_node=2 train_audio_ssl.py hparams/train_audio_ssl.yaml --distributed_launch --distributed_backend='nccl' --find_unused_parameters --mix_train True --data_folder /path/to/N20EMv2 --attempt 1 --dur_threshold 5 --linear_prob_epochs 2 --number_of_epochs 10 --ssl_model wav2vec2-large-lv60
123 | ```
124 | 
125 | NOTE: For SSL models, we used `wav2vec2-large-lv60` as the wav2vec 2.0 pretrained on speech data and `wav2vec2-large-960h-lv60-self` as the wav2vec 2.0 pretrained and finetuned on speech data. To facilitate the follow-up research, our repo supports the following SSL models: [wav2vec2](https://arxiv.org/abs/2006.11477), [HuBERT](https://arxiv.org/abs/2106.07447), [data2vec](https://arxiv.org/abs/2202.03555), [WavLM](https://arxiv.org/abs/2110.13900). Please find the checkpoint name in [Huggingface](https://huggingface.co/models).
126 | 
127 | ## Results
128 | We provide our trained AMT model of `ours 2`[[model link](https://drive.google.com/drive/folders/1FZFWf0JXDs2Esmu9GZqxmp5Wev5AclWU?usp=share_link)] in the paper.
129 | 
130 | Results on Benchmark datasets for AMT task:
131 | <p align="center">
132 | <img src="../../assets/results.png" alt="" data-canonical-src="../../assets/results.png" width="100%"/>
133 | </p>
134 | 
135 | Results on N20EMv2 dataset for AMT task:
136 | <p align="center">
137 | <img src="../../assets/results2.png" alt="" data-canonical-src="../../assets/results2.png" width="100%"/>
138 | </p>
139 | 


--------------------------------------------------------------------------------
/N20EMv2/audio_only/hparams/extract_ssl_feats.yaml:
--------------------------------------------------------------------------------
  1 | # ################################
  2 | # Model: wav2vec2 + Linear
  3 | # Authors: Xiangming Gu 2022
  4 | # ################################
  5 | 
  6 | # Seed needs to be set at top of yaml, before objects with parameters are made
  7 | seed: 1986
  8 | __set_seed: !apply:torch.manual_seed [!ref <seed>]
  9 | attempt: 1
 10 | dur_threshold: 5
 11 | mix_train: True
 12 | ssl_model: wav2vec2-large-lv60
 13 | output_folder: !ref results/<ssl_model>_mix<mix_train>/train_audio_ssl_dur<dur_threshold>_attempt<attempt>/<seed>
 14 | save_folder: !ref <output_folder>/save
 15 | csv_folder: !ref data/dur_<dur_threshold>s
 16 | train_log: !ref <output_folder>/train_log.txt
 17 | 
 18 | # URL for the biggest Fairseq english data2vec model.
 19 | wav2vec2_hub: !ref facebook/<ssl_model>
 20 | wav2vec2_local: !ref ssl_model/<ssl_model>
 21 | 
 22 | # Data files
 23 | data_folder: !PLACEHOLDER # e,g./path/to/N20EMv2
 24 | # noise/ris dataset will automatically be downloaded
 25 | data_folder_rirs: !ref <data_folder>
 26 | ckpt_interval_minutes: 25 # save checkpoint every N min
 27 | train_csv: !ref <csv_folder>/n20em_train.csv
 28 | mix_train_csv: !ref <csv_folder>/mix_train.csv
 29 | valid_csv: !ref <csv_folder>/n20em_valid.csv
 30 | test_csv:
 31 |    - !ref <csv_folder>/n20em_test.csv
 32 |    - !ref <csv_folder>/n20em_valid.csv
 33 |    - !ref <csv_folder>/n20em_valid.csv
 34 | 
 35 | noise_type: babble
 36 | snr_db: -10
 37 | add_noise: False
 38 | # Training parameters
 39 | number_of_epochs: 10
 40 | lr: 0.0003
 41 | lr_wav2vec: 0.00005
 42 | sorting: ascending
 43 | auto_mix_prec: False
 44 | sample_rate: 16000
 45 | frame_rate: 49.8
 46 | linear_prob_epochs: 2
 47 | pretrain: False
 48 | pretrain_folder: ../pretrain_model
 49 | save_model: True
 50 | save_model_folder: ../save_model
 51 | 
 52 | # Evaluating parameters
 53 | onset_threshold: 0.4
 54 | offset_threshold: 0.5
 55 | onset_tolerance: 0.05
 56 | pitch_tolerance: 50
 57 | 
 58 | # With data_parallel batch_size is split into N jobs
 59 | # With DDP batch_size is multiplied by N jobs
 60 | # Must be 3 per GPU to fit 32GB of VRAM
 61 | batch_size: 8
 62 | test_batch_size: 1
 63 | num_workers: 8
 64 | 
 65 | # Dataloader options
 66 | train_dataloader_opts:
 67 |    batch_size: !ref <batch_size>
 68 |    num_workers: !ref <num_workers>
 69 | 
 70 | valid_dataloader_opts:
 71 |    batch_size: !ref <test_batch_size>
 72 |    num_workers: !ref <num_workers>
 73 | 
 74 | test_dataloader_opts:
 75 |    batch_size: !ref <test_batch_size>
 76 |    num_workers: !ref <num_workers>
 77 | 
 78 | # Model parameters
 79 | freeze_wav2vec: False
 80 | 
 81 | # Outputs
 82 | pitch_octave_num: 4
 83 | pitch_class_num: 12
 84 | feat_dim: 1024
 85 | output_neurons: 20  # 2+pitch_octave+pitch_class+2
 86 | 
 87 | #
 88 | # Functions and classes
 89 | #
 90 | epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter
 91 |    limit: !ref <number_of_epochs>
 92 | 
 93 | # augmentation: !new:speechbrain.lobes.augment.TimeDomainSpecAugment
 94 | #    sample_rate: !ref <sample_rate>
 95 | #    speeds: [95, 100, 105]
 96 | 
 97 | wav2vec2: !new:huggingface_interface.HuggingFaceWav2Vec2
 98 |    source: !ref <wav2vec2_hub>
 99 |    output_norm: True
100 |    freeze: !ref <freeze_wav2vec>
101 |    save_path: !ref <wav2vec2_local> # !ref <save_folder>/wav2vec2_checkpoint
102 | 
103 | #####
104 | # Uncomment this block if you prefer to use a Fairseq pretrained model instead
105 | # of a HuggingFace one. Here, we provide an URL that is obtained from the
106 | # Fairseq github for the multilingual XLSR.
107 | #
108 | # wav2vec2_url: https://dl.fbaipublicfiles.com/fairseq/wav2vec/w2v_large_lv_fsh_swbd_cv.pt
109 | # wav2vec2: !new:nets.fairseq_interface.FairseqWav2Vec2
110 | #    pretrained_path: !ref <wav2vec2_url>
111 | #    output_norm: True
112 | #    freeze: False
113 | #    save_path: !ref <wav2vec2_local>
114 | 
115 | model: !new:speechbrain.nnet.linear.Linear
116 |    input_size: !ref <feat_dim>
117 |    n_neurons: !ref <output_neurons>
118 | 
119 | modules:
120 |    wav2vec2: !ref <wav2vec2>
121 |    model: !ref <model>
122 | 
123 | log_softmax: !new:speechbrain.nnet.activations.Softmax
124 |    apply_log: True
125 | 
126 | onset_positive_weight: 15.0
127 | offset_positive_weight: 1.0
128 | onset_criterion: !name:speechbrain.nnet.losses.bce_loss
129 |    reduction: mean
130 |    allowed_len_diff: 3
131 |    label_smoothing: 0.0
132 | 
133 | offset_criterion: !name:speechbrain.nnet.losses.bce_loss
134 |    reduction: mean
135 |    allowed_len_diff: 3
136 |    label_smoothing: 0.0
137 | 
138 | octave_criterion: !name:speechbrain.nnet.losses.nll_loss
139 |    reduction: mean
140 |    allowed_len_diff: 3
141 |    label_smoothing: 0.0
142 | 
143 | pitch_criterion: !name:speechbrain.nnet.losses.nll_loss
144 |    reduction: mean
145 |    allowed_len_diff: 3
146 |    label_smoothing: 0.0
147 | 
148 | model_opt_class: !name:torch.optim.Adadelta
149 |    lr: !ref <lr>
150 |    rho: 0.95
151 |    eps: 1.e-8
152 | 
153 | wav2vec_opt_class: !name:torch.optim.Adam
154 |    lr: !ref <lr_wav2vec>
155 | 
156 | lr_annealing_model: !new:speechbrain.nnet.schedulers.NewBobScheduler
157 |    initial_value: !ref <lr>
158 |    improvement_threshold: 0.0025
159 |    annealing_factor: 0.8
160 |    patient: 0
161 | 
162 | lr_annealing_wav2vec: !new:speechbrain.nnet.schedulers.NewBobScheduler
163 |    initial_value: !ref <lr_wav2vec>
164 |    improvement_threshold: 0.0025
165 |    annealing_factor: 0.9
166 |    patient: 0
167 | 
168 | 
169 | checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
170 |    checkpoints_dir: !ref <save_folder>
171 |    recoverables:
172 |       wav2vec2: !ref <wav2vec2>
173 |       model: !ref <model>
174 |       scheduler_model: !ref <lr_annealing_model>
175 |       scheduler_wav2vec: !ref <lr_annealing_wav2vec>
176 |       counter: !ref <epoch_counter>
177 | 
178 | train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
179 |    save_file: !ref <train_log>
180 |    precision: 3
181 | 
182 | onset_stats: !name:speechbrain.utils.metric_stats.MetricStats
183 |     metric: !name:speechbrain.nnet.losses.bce_loss
184 |         reduction: batch
185 |         allowed_len_diff: 3
186 |         label_smoothing: 0.0
187 | 
188 | offset_stats: !name:speechbrain.utils.metric_stats.MetricStats
189 |     metric: !name:speechbrain.nnet.losses.bce_loss
190 |         reduction: batch
191 |         allowed_len_diff: 3
192 |         label_smoothing: 0.0
193 | 
194 | octave_stats: !name:speechbrain.utils.metric_stats.MetricStats
195 |     metric: !name:speechbrain.nnet.losses.nll_loss
196 |         reduction: batch
197 |         allowed_len_diff: 3
198 |         label_smoothing: 0.0
199 | 
200 | pitch_stats: !name:speechbrain.utils.metric_stats.MetricStats
201 |     metric: !name:speechbrain.nnet.losses.nll_loss
202 |         reduction: batch
203 |         allowed_len_diff: 3
204 |         label_smoothing: 0.0


--------------------------------------------------------------------------------
/N20EMv2/audio_only/prepare_n20emv2.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Data preparation for datasets of automatic music transcription
  3 | 
  4 | Authors
  5 | * Xiangming Gu 2022
  6 | """
  7 | import os
  8 | import csv
  9 | import json
 10 | import argparse
 11 | import torchaudio
 12 | import numpy as np
 13 | from tqdm import tqdm
 14 | from utils import note2frame
 15 | from speechbrain.dataio.dataio import merge_csvs
 16 | SAMPLERATE = 16000
 17 | 
 18 | 
 19 | def prepare_frame_anno(folder, frame_rate=49.8):
 20 |     """
 21 |     This function processes the frame-level annotations for each song
 22 |     """
 23 |     json_file = os.path.join(folder, "annotations.json")
 24 |     folder_data = os.path.join(folder, "data")
 25 |     # open ground truth data
 26 |     with open(json_file) as f:
 27 |         annotations = json.load(f)
 28 |     f.close()
 29 |     # traverse the whole dataset
 30 |     for entry in tqdm(annotations.keys()):
 31 |         anno = annotations[entry]["midi"]
 32 |         json_path = os.path.join(folder_data, entry, "note_anno.json")
 33 |         # save json file
 34 |         with open(json_path, "w") as f:
 35 |             json.dump(anno, f)
 36 |         f.close()
 37 |         # save frame-level annotations
 38 |         wav_file = os.path.join(folder_data, entry, "vocals.wav")
 39 |         audio, fs = torchaudio.load(wav_file)
 40 |         assert fs == SAMPLERATE
 41 |         assert audio.shape[0] == 1
 42 |         duration = audio.shape[1] / SAMPLERATE
 43 |         length = round(duration * frame_rate)
 44 |         frame_label = note2frame(gt_data=anno, length=length, frame_size=1/frame_rate)
 45 |         # print(length)
 46 |         assert frame_label.shape[0] == length
 47 |         # save frame-level annotation
 48 |         os.makedirs(os.path.join(folder_data, entry, "audio_anno", str(frame_rate) + "fps"), exist_ok=True)
 49 |         npy_path = os.path.join(folder_data, entry, "audio_anno", str(frame_rate) + "fps", "audio_frame_anno.npy")
 50 |         np.save(npy_path, frame_label)
 51 | 
 52 | 
 53 | def prepare_csv_n20emv2(folder, csv_folder="./data", frame_rate=49.8, dur_thrd=5):
 54 |     """
 55 |     This function creates csv files for speechbrain to process, dur_thrd is the threshold for the duration
 56 |     """
 57 | 
 58 |     # initialize the csv lines
 59 |     csv_train_lines = [["ID", "duration", "wav", "utter_id", "utter_num", "frame_anno", "song_anno"]]
 60 |     csv_valid_lines = [["ID", "duration", "wav", "utter_id", "utter_num", "frame_anno", "song_anno"]]
 61 |     csv_test_lines = [["ID", "duration", "wav", "utter_id", "utter_num", "frame_anno", "song_anno"]]
 62 |     # load the annotations
 63 |     json_file = os.path.join(folder, "annotations.json")
 64 |     folder_data = os.path.join(folder, "data")
 65 |     # open ground truth data
 66 |     with open(json_file) as f:
 67 |         annotations = json.load(f)
 68 |     f.close()
 69 |     # traverse the whole dataset
 70 |     for entry in tqdm(annotations.keys()):
 71 |         split = annotations[entry]["split"]
 72 |         audio_path = os.path.join(folder_data, entry, "vocals.wav")
 73 |         anno_path = os.path.join(folder_data, entry, "audio_anno", str(frame_rate) + "fps", "audio_frame_anno.npy")
 74 |         song_anno_path = os.path.join(folder_data, entry, "note_anno.json")
 75 | 
 76 |         # load the audio
 77 |         audio, fs = torchaudio.load(audio_path)  # audio: [1, N] for mono or [2, N] for stero
 78 |         assert fs == SAMPLERATE
 79 |         duration = audio.shape[1] / SAMPLERATE
 80 | 
 81 |         # split the whole song into utterances
 82 |         utter_num = round(duration / dur_thrd)
 83 |         for i in range(1, utter_num+1):
 84 |             ID = entry + "_" + str(i)
 85 |             if i == utter_num:
 86 |                 dur = duration - (utter_num - 1) * dur_thrd
 87 |                 assert 0 < dur <= dur_thrd * 3 / 2
 88 |             else:
 89 |                 dur = dur_thrd
 90 |             csv_line = [
 91 |                 ID, str(dur), audio_path, str(i), str(utter_num), anno_path, song_anno_path,
 92 |             ]
 93 |             if split == "train":
 94 |                 csv_train_lines.append(csv_line)
 95 |             elif split == "valid":
 96 |                 csv_valid_lines.append(csv_line)
 97 |             elif split == "test":
 98 |                 csv_test_lines.append(csv_line)
 99 |     # save csv files
100 |     save_folder = os.path.join(csv_folder, "dur_" + str(dur_thrd) + "s")
101 |     os.makedirs(save_folder, exist_ok=True)
102 |     save_train_path = os.path.join(save_folder, "n20em_train.csv")
103 |     save_valid_path = os.path.join(save_folder, "n20em_valid.csv")
104 |     save_test_path = os.path.join(save_folder, "n20em_test.csv")
105 |     # train
106 |     with open(save_train_path, mode="w") as csv_f:
107 |         csv_writer = csv.writer(csv_f, delimiter=",", quotechar='"', quoting=csv.QUOTE_MINIMAL)
108 |         for line in csv_train_lines:
109 |             csv_writer.writerow(line)
110 |     # valid
111 |     with open(save_valid_path, mode="w") as csv_f:
112 |         csv_writer = csv.writer(csv_f, delimiter=",", quotechar='"', quoting=csv.QUOTE_MINIMAL)
113 |         for line in csv_valid_lines:
114 |             csv_writer.writerow(line)
115 |     # test
116 |     with open(save_test_path, mode="w") as csv_f:
117 |         csv_writer = csv.writer(csv_f, delimiter=",", quotechar='"', quoting=csv.QUOTE_MINIMAL)
118 |         for line in csv_test_lines:
119 |             csv_writer.writerow(line)
120 |  
121 | 
122 | if __name__ == "__main__":
123 |     parser = argparse.ArgumentParser()
124 |     parser.add_argument("--duration", type=int, default=5, help="the threshold for duration")
125 |     parser.add_argument("--frame_rate", type=float, default=49.8, help="The frame-rate for SSL models")
126 |     parser.add_argument("--n20emv2", type=str, default="/path/to/N20EMv2", help="The path to save N20EMv2 dataset")
127 |     args = parser.parse_args()
128 |     
129 |     prepare_frame_anno(folder=args.n20emv2, frame_rate=args.frame_rate)
130 |     prepare_csv_n20emv2(folder=args.n20emv2, frame_rate=args.frame_rate, dur_thrd=args.duration)
131 |     save_folder = os.path.join("./data", "dur_" + str(args.duration) + "s")
132 |     
133 |     merge_files = ["mir_st500_train.csv", "n20em_train.csv"]
134 |     merge_name = "mix_train.csv"
135 |     merge_csvs(
136 |             data_folder=save_folder, csv_lst=merge_files, merged_csv=merge_name,
137 |         )


--------------------------------------------------------------------------------
/N20EMv2/audio_visual/README.md:
--------------------------------------------------------------------------------
 1 | # Audio-Visual Automatic Music Transcription with N20EMv2 dataset
 2 | This sub-project contains recipes for training audio-visual AMT system using N20EMv2 dataset.
 3 | 
 4 | ## Prerequisites
 5 | Before running our scripts, you need to simulate the noisy environments by synthesizing the noisy data. We provide the synthesized noisy data in N20EMv2. If you want to obtain these data from scratch, you can follow the procedure.
 6 | 
 7 | Firstly, you should download and decompress the [MUSAN](https://www.openslr.org/17/). Assmue you save the data in `/path/to/MUSAN`, which contains the following folders `./music, ./speech, ./noise`. We borrow the code from [AVHuBERT](https://github.com/facebookresearch/av_hubert/tree/main/avhubert/preparation) to process the MUSAN dataset. Run the following code:
 8 | ```
 9 | python prepare_musan.py --musan /path/to/MUSAN --nshard <nshard>  --slurm_partition <slurm_partition>
10 | ```
11 | This will: (1) split raw audios into 10-second clips, (2) generate babble noise from MUSAN speech audio, (3) count number of frames per clip. The whole data will be sharded into ${nshard} parts and each job processes one part.
12 | 
13 | Next, we synthesize the noisy data considering four noise types, including `accomp`, `babble`, `white`, `natural`, and five noise levels, including `-10dB`, `-5dB`, `0dB`, `5dB`, `10dB`. Run the following code:
14 | ```
15 | python synthesis_noise.py --musan /path/to/MUSAN --n20emv2 /path/to/N20EMv2
16 | ```
17 | 
18 | The file organization for N20EMv2 should be:
19 | ```
20 | /path/to/N20EMv2
21 | ├── data
22 |     ├── song1
23 |         ├── vocals.wav
24 |         ├── accomp.wav
25 |         ├── video_50fps.npy
26 |         ├── note_anno.json
27 |         ├── audio_anno
28 |         ├── video_anno
29 |         ├── noise_data
30 |             ├── accomp
31 |                 ├── SNR_-10dB.wav
32 |                 ├── SNR_-5dB.wav
33 |                 ├── SNR_0dB.wav
34 |                 ├── SNR_5dB.wav
35 |                 ├── SNR_10dB.wav
36 |             ├── babble
37 |                 ├── ...
38 |             ├── natural
39 |                 ├── ...
40 |             ├── white
41 |                 ├── ...
42 |     ├── song2
43 |     ├── ...
44 | ├── annotations.json
45 | ```
46 | 
47 | 
48 | ## How to run
49 | 1. Firstly, we train our audio-only AMT system (check `N20EMv2/audio_only/README.md`) and video-only AMT system (check `N20EMv2/video_only/README.md`). Make sure the model are well saved in order to extract features as follows.
50 | 
51 | 2. Secondly, we fix the gradients of audio encoder and video encoder. Practically, we extract the acoustic features from audio-only AMT system. To do so, run following commands:
52 | ```
53 | cd N20EMv2/audio_only
54 | CUDA_VISIBLE_DEVICES=0,1 python extract_ssl_feats.py hparams/extract_ssl_feats.yaml --data_parallel_backend --mix_train True --data_folder /path/to/N20EMv2 --attempt 1 --dur_threshold 5 --linear_prob_epochs 2 --number_of_epochs 10 --ssl_model wav2vec2-large-lv60 --save_model True --add_noise True --noise_type <noise_type> --snr_db <snr_db> 
55 | ```
56 | Compared to the run script, we enable four extra options. `--add_noise` refers to noisy input or clean input, `--noise_type` refers to the type of noise, including `accomp`, `babble`, `white`, `natural`, while `--snr_db` refers to the noise level, including `-10`, `-5`, `0`, `5`, `10` dB. Finally, `--save_model==True` means that we will save the SSL model and AMT classifier to the folder `N20EMv2/save_model` 
57 | 
58 | Then we extract the visual features from video-only AMT system. To do so, run following commands:
59 | ```
60 | CUDA_VISIBLE_DEVICES=0,1 python extract_ssl_feats.py hparams/extract_ssl_feats.yaml --data_parallel_backend --data_folder /path/to/N20EMv2 --attempt 1 --dur_threshold 5 --linear_prob_epochs 2 --number_of_epochs 10
61 | ```
62 | 
63 | 3. Prepare csv files for N20EMv2 feats, run:
64 | ```
65 | python prepare_n20emv2.py --duration <duration> --n20emv2 /path/to/n20emv2
66 | ```
67 | 
68 | 4. Run the second training stage of audio-visual AMT system, run:
69 | ```
70 | CUDA_VISIBLE_DEVICES=0,1 python train_rca_av.py hparams/train_rca_av.yaml --data_folder /path/to/N20EMv2 --lr 0.003 --add_noise True --snr_db <snr_db> --noise_type <noise_type> --data_parallel_backend
71 | ```
72 | We also provide counterpart audio-only AMT system trained via two stages, run:
73 | ```
74 | CUDA_VISIBLE_DEVICES=0,1 python train_rca_a.py hparams/train_rca_a.yaml --data_folder /path/to/N20EMv2 --lr 0.003 --add_noise True --snr_db <snr_db> --noise_type <noise_type> --data_parallel_backend
75 | ```
76 | 
77 | ## Results
78 | 
79 | Results on N20EMv2 dataset for audio-visual / audio-only AMT task under the perturbation of musical accompaniments:
80 | 
81 | <p align="center">
82 | <img src="../../assets/noise_test.png" alt="" data-canonical-src="../../assets/noise_test.png" width="100%"/>
83 | </p>


--------------------------------------------------------------------------------
/N20EMv2/audio_visual/hparams/train_rca_a.yaml:
--------------------------------------------------------------------------------
  1 | # ################################
  2 | # Model: wav2vec2 + Linear
  3 | # Authors: Xiangming Gu 2022
  4 | # ################################
  5 | 
  6 | # Seed needs to be set at top of yaml, before objects with parameters are made
  7 | seed: 1986
  8 | __set_seed: !apply:torch.manual_seed [!ref <seed>]
  9 | attempt: 1
 10 | dur_threshold: 5
 11 | noise_type: natural
 12 | snr_db: -10
 13 | add_noise: True
 14 | lr: 0.0003
 15 | output_folder: !ref results/audio_only_rca/<noise_type>/noise_<add_noise>_<noise_type>_<snr_db>db_lr<lr>/<seed>
 16 | save_folder: !ref <output_folder>/save
 17 | csv_folder: !ref data_feat/dur_<dur_threshold>s
 18 | train_log: !ref <output_folder>/train_log.txt
 19 | 
 20 | # Data files
 21 | data_folder: !PLACEHOLDER # e,g./path/to/DSing
 22 | # noise/ris dataset will automatically be downloaded
 23 | data_folder_rirs: !ref <data_folder>
 24 | ckpt_interval_minutes: 25 # save checkpoint every N min
 25 | train_csv: !ref <csv_folder>/n20em_train.csv
 26 | valid_csv: !ref <csv_folder>/n20em_valid.csv
 27 | test_csv:
 28 |    - !ref <csv_folder>/n20em_test.csv
 29 |    - !ref <csv_folder>/n20em_valid.csv
 30 | 
 31 | # Training parameters
 32 | number_of_epochs: 10
 33 | sorting: ascending
 34 | auto_mix_prec: False
 35 | audio_sample_rate: 49.8
 36 | video_sample_rate: 50
 37 | frame_rate: 49.8
 38 | linear_prob_epochs: 2
 39 | pretrain: True
 40 | pretrain_folder: ../save_model
 41 | 
 42 | # Evaluating parameters
 43 | onset_threshold: 0.4
 44 | offset_threshold: 0.5
 45 | onset_tolerance: 0.05
 46 | pitch_tolerance: 50
 47 | 
 48 | # With data_parallel batch_size is split into N jobs
 49 | # With DDP batch_size is multiplied by N jobs
 50 | # Must be 3 per GPU to fit 32GB of VRAM
 51 | batch_size: 8
 52 | test_batch_size: 1
 53 | num_workers: 8
 54 | 
 55 | # Dataloader options
 56 | train_dataloader_opts:
 57 |    batch_size: !ref <batch_size>
 58 |    num_workers: !ref <num_workers>
 59 | 
 60 | valid_dataloader_opts:
 61 |    batch_size: !ref <test_batch_size>
 62 |    num_workers: !ref <num_workers>
 63 | 
 64 | test_dataloader_opts:
 65 |    batch_size: !ref <test_batch_size>
 66 |    num_workers: !ref <num_workers>
 67 | 
 68 | # Outputs
 69 | pitch_octave_num: 4
 70 | pitch_class_num: 12
 71 | feat_dim: 1024
 72 | output_neurons: 20  # 2+pitch_octave+pitch_class+2
 73 | 
 74 | #
 75 | # Functions and classes
 76 | #
 77 | epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter
 78 |    limit: !ref <number_of_epochs>
 79 | 
 80 | head: !new:speechbrain.nnet.linear.Linear
 81 |    input_size: !ref <feat_dim>
 82 |    n_neurons: !ref <output_neurons>
 83 | 
 84 | fusion: !new:fusion.FusionRCA
 85 | 
 86 | model: !new:torch.nn.ModuleList
 87 |    - [!ref <fusion>, !ref <head>]
 88 | 
 89 | modules:
 90 |    head: !ref <head>
 91 |    fusion: !ref <fusion>
 92 | 
 93 | log_softmax: !new:speechbrain.nnet.activations.Softmax
 94 |    apply_log: True
 95 | 
 96 | onset_positive_weight: 15.0
 97 | offset_positive_weight: 1.0
 98 | onset_criterion: !name:speechbrain.nnet.losses.bce_loss
 99 |    reduction: mean
100 |    allowed_len_diff: 3
101 |    label_smoothing: 0.0
102 | 
103 | offset_criterion: !name:speechbrain.nnet.losses.bce_loss
104 |    reduction: mean
105 |    allowed_len_diff: 3
106 |    label_smoothing: 0.0
107 | 
108 | octave_criterion: !name:speechbrain.nnet.losses.nll_loss
109 |    reduction: mean
110 |    allowed_len_diff: 3
111 |    label_smoothing: 0.0
112 | 
113 | pitch_criterion: !name:speechbrain.nnet.losses.nll_loss
114 |    reduction: mean
115 |    allowed_len_diff: 3
116 |    label_smoothing: 0.0
117 | 
118 | model_opt_class: !name:torch.optim.Adadelta
119 |    lr: !ref <lr>
120 |    rho: 0.95
121 |    eps: 1.e-8
122 | 
123 | lr_annealing_model: !new:speechbrain.nnet.schedulers.NewBobScheduler
124 |    initial_value: !ref <lr>
125 |    improvement_threshold: 0.0025
126 |    annealing_factor: 0.8
127 |    patient: 0
128 | 
129 | checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
130 |    checkpoints_dir: !ref <save_folder>
131 |    recoverables:
132 |       model: !ref <model>
133 |       scheduler_model: !ref <lr_annealing_model>
134 |       counter: !ref <epoch_counter>
135 | 
136 | train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
137 |    save_file: !ref <train_log>
138 |    precision: 3
139 | 
140 | onset_stats: !name:speechbrain.utils.metric_stats.MetricStats
141 |     metric: !name:speechbrain.nnet.losses.bce_loss
142 |         reduction: batch
143 |         allowed_len_diff: 3
144 |         label_smoothing: 0.0
145 | 
146 | offset_stats: !name:speechbrain.utils.metric_stats.MetricStats
147 |     metric: !name:speechbrain.nnet.losses.bce_loss
148 |         reduction: batch
149 |         allowed_len_diff: 3
150 |         label_smoothing: 0.0
151 | 
152 | octave_stats: !name:speechbrain.utils.metric_stats.MetricStats
153 |     metric: !name:speechbrain.nnet.losses.nll_loss
154 |         reduction: batch
155 |         allowed_len_diff: 3
156 |         label_smoothing: 0.0
157 | 
158 | pitch_stats: !name:speechbrain.utils.metric_stats.MetricStats
159 |     metric: !name:speechbrain.nnet.losses.nll_loss
160 |         reduction: batch
161 |         allowed_len_diff: 3
162 |         label_smoothing: 0.0


--------------------------------------------------------------------------------
/N20EMv2/audio_visual/hparams/train_rca_av.yaml:
--------------------------------------------------------------------------------
  1 | # ################################
  2 | # Model: wav2vec2 + Linear
  3 | # Authors: Xiangming Gu 2022
  4 | # ################################
  5 | 
  6 | # Seed needs to be set at top of yaml, before objects with parameters are made
  7 | seed: 1986
  8 | __set_seed: !apply:torch.manual_seed [!ref <seed>]
  9 | attempt: 1
 10 | dur_threshold: 5
 11 | noise_type: natural
 12 | snr_db: -10
 13 | add_noise: True
 14 | lr: 0.0003
 15 | output_folder: !ref results/audio_visual_rca/<noise_type>/noise_<add_noise>_<noise_type>_<snr_db>db_lr<lr>/<seed>
 16 | save_folder: !ref <output_folder>/save
 17 | csv_folder: !ref data_feat/dur_<dur_threshold>s
 18 | train_log: !ref <output_folder>/train_log.txt
 19 | 
 20 | # Data files
 21 | data_folder: !PLACEHOLDER # e,g./path/to/DSing
 22 | # noise/ris dataset will automatically be downloaded
 23 | data_folder_rirs: !ref <data_folder>
 24 | ckpt_interval_minutes: 25 # save checkpoint every N min
 25 | train_csv: !ref <csv_folder>/n20em_train.csv
 26 | valid_csv: !ref <csv_folder>/n20em_valid.csv
 27 | test_csv:
 28 |    - !ref <csv_folder>/n20em_test.csv
 29 |    - !ref <csv_folder>/n20em_valid.csv
 30 | 
 31 | # Training parameters
 32 | number_of_epochs: 10
 33 | sorting: ascending
 34 | auto_mix_prec: False
 35 | audio_sample_rate: 49.8
 36 | video_sample_rate: 50
 37 | frame_rate: 49.8
 38 | linear_prob_epochs: 2
 39 | pretrain: True
 40 | pretrain_folder: ../save_model
 41 | 
 42 | # Evaluating parameters
 43 | onset_threshold: 0.4
 44 | offset_threshold: 0.5
 45 | onset_tolerance: 0.05
 46 | pitch_tolerance: 50
 47 | 
 48 | # With data_parallel batch_size is split into N jobs
 49 | # With DDP batch_size is multiplied by N jobs
 50 | # Must be 3 per GPU to fit 32GB of VRAM
 51 | batch_size: 8
 52 | test_batch_size: 1
 53 | num_workers: 8
 54 | 
 55 | # Dataloader options
 56 | train_dataloader_opts:
 57 |    batch_size: !ref <batch_size>
 58 |    num_workers: !ref <num_workers>
 59 | 
 60 | valid_dataloader_opts:
 61 |    batch_size: !ref <test_batch_size>
 62 |    num_workers: !ref <num_workers>
 63 | 
 64 | test_dataloader_opts:
 65 |    batch_size: !ref <test_batch_size>
 66 |    num_workers: !ref <num_workers>
 67 | 
 68 | # Outputs
 69 | pitch_octave_num: 4
 70 | pitch_class_num: 12
 71 | feat_dim: 1024
 72 | output_neurons: 20  # 2+pitch_octave+pitch_class+2
 73 | 
 74 | #
 75 | # Functions and classes
 76 | #
 77 | epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter
 78 |    limit: !ref <number_of_epochs>
 79 | 
 80 | head: !new:speechbrain.nnet.linear.Linear
 81 |    input_size: !ref <feat_dim>
 82 |    n_neurons: !ref <output_neurons>
 83 | 
 84 | fusion: !new:fusion.FusionRCA
 85 | 
 86 | model: !new:torch.nn.ModuleList
 87 |    - [!ref <fusion>, !ref <head>]
 88 | 
 89 | modules:
 90 |    head: !ref <head>
 91 |    fusion: !ref <fusion>
 92 | 
 93 | log_softmax: !new:speechbrain.nnet.activations.Softmax
 94 |    apply_log: True
 95 | 
 96 | onset_positive_weight: 15.0
 97 | offset_positive_weight: 1.0
 98 | onset_criterion: !name:speechbrain.nnet.losses.bce_loss
 99 |    reduction: mean
100 |    allowed_len_diff: 3
101 |    label_smoothing: 0.0
102 | 
103 | offset_criterion: !name:speechbrain.nnet.losses.bce_loss
104 |    reduction: mean
105 |    allowed_len_diff: 3
106 |    label_smoothing: 0.0
107 | 
108 | octave_criterion: !name:speechbrain.nnet.losses.nll_loss
109 |    reduction: mean
110 |    allowed_len_diff: 3
111 |    label_smoothing: 0.0
112 | 
113 | pitch_criterion: !name:speechbrain.nnet.losses.nll_loss
114 |    reduction: mean
115 |    allowed_len_diff: 3
116 |    label_smoothing: 0.0
117 | 
118 | model_opt_class: !name:torch.optim.Adadelta
119 |    lr: !ref <lr>
120 |    rho: 0.95
121 |    eps: 1.e-8
122 | 
123 | lr_annealing_model: !new:speechbrain.nnet.schedulers.NewBobScheduler
124 |    initial_value: !ref <lr>
125 |    improvement_threshold: 0.0025
126 |    annealing_factor: 0.8
127 |    patient: 0
128 | 
129 | checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
130 |    checkpoints_dir: !ref <save_folder>
131 |    recoverables:
132 |       model: !ref <model>
133 |       scheduler_model: !ref <lr_annealing_model>
134 |       counter: !ref <epoch_counter>
135 | 
136 | train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
137 |    save_file: !ref <train_log>
138 |    precision: 3
139 | 
140 | onset_stats: !name:speechbrain.utils.metric_stats.MetricStats
141 |     metric: !name:speechbrain.nnet.losses.bce_loss
142 |         reduction: batch
143 |         allowed_len_diff: 3
144 |         label_smoothing: 0.0
145 | 
146 | offset_stats: !name:speechbrain.utils.metric_stats.MetricStats
147 |     metric: !name:speechbrain.nnet.losses.bce_loss
148 |         reduction: batch
149 |         allowed_len_diff: 3
150 |         label_smoothing: 0.0
151 | 
152 | octave_stats: !name:speechbrain.utils.metric_stats.MetricStats
153 |     metric: !name:speechbrain.nnet.losses.nll_loss
154 |         reduction: batch
155 |         allowed_len_diff: 3
156 |         label_smoothing: 0.0
157 | 
158 | pitch_stats: !name:speechbrain.utils.metric_stats.MetricStats
159 |     metric: !name:speechbrain.nnet.losses.nll_loss
160 |         reduction: batch
161 |         allowed_len_diff: 3
162 |         label_smoothing: 0.0


--------------------------------------------------------------------------------
/N20EMv2/audio_visual/prepare_n20emv2.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Data preparation for N20EMv2 of singing voice transcription
 3 | The input to model needs to be spectrum features
 4 | Authors
 5 | * Xiangming Gu 2022
 6 | """
 7 | import os
 8 | import csv
 9 | import json
10 | import torch
11 | import argparse
12 | from tqdm import tqdm
13 | SAMPLERATE=16000
14 | 
15 | 
16 | def prepare_csv_n20emv2_feat(folder, csv_folder="./data_feat", frame_rate=49.8, dur_thrd=5):
17 |     """
18 |     This function creates csv files for speechbrain to process, dur_thrd is the threshold for the duration
19 |     """
20 | 
21 |     # initialize the csv lines
22 |     csv_train_lines = [["ID", "duration", "audio", "video", "utter_id", "utter_num", "frame_anno", "song_anno"]]
23 |     csv_valid_lines = [["ID", "duration", "audio", "video", "utter_id", "utter_num", "frame_anno", "song_anno"]]
24 |     csv_test_lines = [["ID", "duration", "audio", "video", "utter_id", "utter_num", "frame_anno", "song_anno"]]
25 |     # load the annotations
26 |     json_file = os.path.join(folder, "annotations.json")
27 |     folder_data = os.path.join(folder, "data")
28 |     # open ground truth data
29 |     with open(json_file) as f:
30 |         annotations = json.load(f)
31 |     f.close()
32 |     # traverse the whole dataset
33 |     for entry in tqdm(annotations.keys()):
34 |         split = annotations[entry]["split"]
35 |         audio_path = os.path.join(folder_data, entry, "noise_data", "clean_feats.pt")
36 |         video_path = os.path.join(folder_data, entry, "noise_data", "video_feats.pt")
37 |         anno_path = os.path.join(folder_data, entry, "audio_anno", str(frame_rate) + "fps", "audio_frame_anno.npy")
38 |         song_anno_path = os.path.join(folder_data, entry, "note_anno.json")
39 | 
40 |         # load the audio
41 |         audio = torch.load(audio_path)
42 |         video = torch.load(video_path)
43 |         frame1 = audio.shape[0]
44 |         frame2 = video.shape[0]
45 |         duration = frame1 / 49.8  # audio frame-rate 
46 | 
47 |         # split the whole song into utterances
48 |         utter_num = round(duration / dur_thrd)
49 |         for i in range(1, utter_num+1):
50 |             ID = entry + "_" + str(i)
51 |             if i == utter_num:
52 |                 dur = duration - (utter_num - 1) * dur_thrd
53 |                 assert 0 < dur <= dur_thrd * 3 / 2
54 |             else:
55 |                 dur = dur_thrd
56 |             csv_line = [
57 |                 ID, str(dur), audio_path, video_path, str(i), str(utter_num), anno_path, song_anno_path,
58 |             ]
59 |             if split == "train":
60 |                 csv_train_lines.append(csv_line)
61 |             elif split == "valid":
62 |                 csv_valid_lines.append(csv_line)
63 |             elif split == "test":
64 |                 csv_test_lines.append(csv_line)
65 |     # save csv files
66 |     save_folder = os.path.join(csv_folder, "dur_" + str(dur_thrd) + "s")
67 |     os.makedirs(save_folder, exist_ok=True)
68 |     save_train_path = os.path.join(save_folder, "n20em_train.csv")
69 |     save_valid_path = os.path.join(save_folder, "n20em_valid.csv")
70 |     save_test_path = os.path.join(save_folder, "n20em_test.csv")
71 |     # train
72 |     with open(save_train_path, mode="w") as csv_f:
73 |         csv_writer = csv.writer(csv_f, delimiter=",", quotechar='"', quoting=csv.QUOTE_MINIMAL)
74 |         for line in csv_train_lines:
75 |             csv_writer.writerow(line)
76 |     # valid
77 |     with open(save_valid_path, mode="w") as csv_f:
78 |         csv_writer = csv.writer(csv_f, delimiter=",", quotechar='"', quoting=csv.QUOTE_MINIMAL)
79 |         for line in csv_valid_lines:
80 |             csv_writer.writerow(line)
81 |     # test
82 |     with open(save_test_path, mode="w") as csv_f:
83 |         csv_writer = csv.writer(csv_f, delimiter=",", quotechar='"', quoting=csv.QUOTE_MINIMAL)
84 |         for line in csv_test_lines:
85 |             csv_writer.writerow(line)
86 | 
87 | if __name__ == "__main__":
88 |     parser = argparse.ArgumentParser()
89 |     parser.add_argument("--duration", type=int, default=5, help="the threshold for duration")
90 |     parser.add_argument("--frame_rate", type=float, default=49.8, help="The frame-rate for SSL models")
91 |     parser.add_argument("--n20emv2", type=str, default="/path/to/N20EMv2", help="The path to save N20EMv2 dataset")
92 |     args = parser.parse_args()
93 |     prepare_csv_n20emv2_feat(folder=args.n20emv2, frame_rate=args.frame_rate, dur_thrd=args.duration)


--------------------------------------------------------------------------------
/N20EMv2/audio_visual/utils.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Functions for automatic music transcription
  3 | 
  4 | Authors
  5 | * Xiangming Gu 2022
  6 | """
  7 | import numpy as np
  8 | 
  9 | 
 10 | def note2frame(gt_data, length, frame_size=1/49.8, pitch_shift=0):
 11 |     """
 12 |     This function transforms the note-level annotations into the frame-level annotations
 13 |     Adapted from https://github.com/york135/singing_transcription_ICASSP2021/blob/master/AST/data_utils/audio_dataset.py
 14 |     """
 15 |     new_label = []
 16 | 
 17 |     cur_note = 0
 18 |     cur_note_onset = gt_data[cur_note][0]
 19 |     cur_note_offset = gt_data[cur_note][1]
 20 |     cur_note_pitch = gt_data[cur_note][2] + pitch_shift
 21 | 
 22 |     # start from C2 (36) to B5 (83), total: 4 classes. This is a little confusing
 23 |     octave_start = 0
 24 |     octave_end = 3
 25 |     pitch_class_num = 12
 26 |     # frame_size = 1/ 49 # 1024.0 / 44100.0
 27 | 
 28 |     for i in range(length):
 29 |         cur_time = i * frame_size
 30 | 
 31 |         if abs(cur_time - cur_note_onset) <= (frame_size / 2.0):
 32 |             # First dim : onset
 33 |             # Second dim : no pitch
 34 |             if i == 0 or new_label[-1][0] != 1:
 35 |                 my_oct = int(min(max(octave_start, (cur_note_pitch- 36)//pitch_class_num), octave_end)) - octave_start
 36 |                 my_pitch_class = cur_note_pitch % pitch_class_num
 37 |                 label = [1, 0, my_oct, my_pitch_class]
 38 |                 new_label.append(label)
 39 |             else:
 40 |                 my_oct = int(min(max(octave_start, (cur_note_pitch- 36)//pitch_class_num), octave_end)) - octave_start
 41 |                 my_pitch_class = cur_note_pitch % pitch_class_num
 42 |                 label = [0, 0, my_oct, my_pitch_class]
 43 |                 new_label.append(label)
 44 | 
 45 |         elif cur_time < cur_note_onset or cur_note >= len(gt_data):
 46 |             # For the frame that doesn't belong to any note
 47 |             label = [0, 1, octave_end+1, pitch_class_num]
 48 |             new_label.append(label)
 49 | 
 50 |         elif abs(cur_time - cur_note_offset) <= (frame_size / 2.0):
 51 |             # For the offset frame
 52 |             my_oct = int(min(max(octave_start, (cur_note_pitch- 36)//pitch_class_num), octave_end)) - octave_start
 53 |             my_pitch_class = cur_note_pitch % pitch_class_num
 54 |             label = [0, 1, my_oct, my_pitch_class]
 55 | 
 56 |             cur_note = cur_note + 1
 57 |             if cur_note < len(gt_data):
 58 |                 cur_note_onset = gt_data[cur_note][0]
 59 |                 cur_note_offset = gt_data[cur_note][1]
 60 |                 cur_note_pitch = gt_data[cur_note][2] + pitch_shift
 61 |                 if abs(cur_time - cur_note_onset)  <= (frame_size / 2.0):
 62 |                     my_oct = int(min(max(octave_start, (cur_note_pitch- 36)//pitch_class_num), octave_end)) - octave_start
 63 |                     my_pitch_class = cur_note_pitch % pitch_class_num
 64 |                     label[0] = 1
 65 |                     label[1] = 0
 66 |                     label[2] = my_oct
 67 |                     label[3] = my_pitch_class
 68 | 
 69 |             new_label.append(label)
 70 | 
 71 |         else:
 72 |             # For the voiced frame
 73 |             my_oct = int(min(max(octave_start, (cur_note_pitch- 36)//pitch_class_num), octave_end)) - octave_start
 74 |             my_pitch_class = cur_note_pitch % pitch_class_num
 75 | 
 76 |             label = [0, 0, my_oct, my_pitch_class]
 77 |             new_label.append(label)
 78 | 
 79 |     return np.array(new_label)
 80 | 
 81 | 
 82 | def frame2note(frame_info, onset_thres, offset_thres, frame_size=1/49.8):
 83 |     """
 84 |     This function transforms the frame-level predictions into the note-level predictions.
 85 |     Parse frame info [(onset_probs, offset_probs, pitch_class)...] into desired label format.
 86 |     Adapted from https://github.com/york135/singing_transcription_ICASSP2021/blob/master/AST/predictor.py
 87 |     """
 88 | 
 89 |     result = []
 90 |     current_onset = None
 91 |     pitch_counter = []
 92 | 
 93 |     last_onset = 0.0
 94 |     onset_seq = np.array([frame_info[i][0] for i in range(len(frame_info))])
 95 | 
 96 |     local_max_size = 3
 97 |     current_time = 0.0
 98 | 
 99 |     onset_seq_length = len(onset_seq)
100 | 
101 |     for i in range(len(frame_info)):
102 | 
103 |         current_time = frame_size*i
104 |         info = frame_info[i]
105 | 
106 |         backward_frames = i - local_max_size
107 |         if backward_frames < 0:
108 |             backward_frames = 0
109 | 
110 |         forward_frames = i + local_max_size + 1
111 |         if forward_frames > onset_seq_length - 1:
112 |             forward_frames = onset_seq_length - 1
113 | 
114 |         # local max and more than threshold
115 |         if info[0] >= onset_thres and onset_seq[i] == np.amax(onset_seq[backward_frames : forward_frames]):
116 | 
117 |             if current_onset is None:
118 |                 current_onset = current_time
119 |                 last_onset = info[0] - onset_thres
120 | 
121 |             else:
122 |                 if len(pitch_counter) > 0:
123 |                     result.append([current_onset, current_time, max(set(pitch_counter), key=pitch_counter.count) + 36])
124 | 
125 |                 current_onset = current_time
126 |                 last_onset = info[0] - onset_thres
127 |                 pitch_counter = []
128 | 
129 |         elif info[1] >= offset_thres:  # If is offset
130 |             if current_onset is not None:
131 |                 if len(pitch_counter) > 0:
132 |                     result.append([current_onset, current_time, max(set(pitch_counter), key=pitch_counter.count) + 36])
133 |                 current_onset = None
134 | 
135 |                 pitch_counter = []
136 | 
137 |         # If current_onset exist, add count for the pitch
138 |         if current_onset is not None:
139 |             final_pitch = int(info[2]* 12 + info[3])
140 |             if info[2] != 4 and info[3] != 12:
141 |             # if final_pitch != 60:
142 |                 pitch_counter.append(final_pitch)
143 | 
144 |     if current_onset is not None:
145 |         if len(pitch_counter) > 0:
146 |             result.append([current_onset, current_time, max(set(pitch_counter), key=pitch_counter.count) + 36])
147 |         current_onset = None
148 | 
149 |     return result
150 | 
151 | 
152 | class AverageMeter(object):
153 |     """Computes and stores the average and current value"""
154 | 
155 |     def __init__(self):
156 |         self.reset()
157 | 
158 |     def reset(self):
159 |         self.val = 0
160 |         self.avg = 0
161 |         self.sum = 0
162 |         self.count = 0
163 | 
164 |     def update(self, val, n=1):
165 |         self.val = val
166 |         self.sum += val * n
167 |         self.count += n
168 |         self.avg = self.sum / self.count


--------------------------------------------------------------------------------
/N20EMv2/video_only/README.md:
--------------------------------------------------------------------------------
 1 | # Video-only Automatic Music Transcription with N20EMv2 dataset
 2 | This sub-project contains recipes for trianing video-only AMT system using N20EMv2 dataset.
 3 | 
 4 | ## Prerequisites
 5 | 1. Before running our scripts, you need to download, preprocess and save the N20EMv2 properly. For your convenience, we already crop the video clips of lip movements without releasing the identity of each subject.
 6 | 
 7 | The file organization for N20EMv2 should be:
 8 | ```
 9 | /path/to/N20EMv2
10 | ├── data
11 |     ├── song1
12 |         ├── vocals.wav
13 |         ├── video_50fps.npy
14 |     ├── song2
15 |     ├── ...
16 | ├── annotations.json
17 | ```
18 | 
19 | 
20 | 2. Prepare N20EMv2 dataset, run:
21 | ```
22 | python prepare_n20emv2.py --duration <duration> --frame_rate 50 --n20emv2 /path/to/n20emv2
23 | ```
24 | 
25 | The option `--duration` refers to the length of utterances during the training. To parallelize the training, we split the whole song into short utterances during the training. The evaluation is conducted on the whole song. As a default, we use `5` s, which is the same as audio-only automatic music transcription. The option `--frame_rate` refers to the frame rate of frame-level annotations. As a default, we use `50` fps, which is also the frame rate of video input.
26 | 
27 | After running this script, the file organization for N20EMv2 should be:
28 | ```
29 | /path/to/N20EMv2
30 | ├── data
31 |     ├── song1
32 |         ├── vocals.wav
33 |         ├── video_50fps.npy
34 |         ├── note_anno.json
35 |         ├── video_anno
36 |             ├── 50fps
37 |                 ├── video_frame_anno.npy
38 |     ├── song2
39 |     ├── ...
40 | ├── annotations.json
41 | ```
42 | 
43 | The resulted csv files are save in the same root folder: 
44 | ```
45 | ├── data
46 |     ├── frame_rate<frame_rate>
47 |         ├── dur_<duration>
48 |             ├── n20emv2_train.csv
49 |             ├── n20emv2_valid.csv
50 |             ├── n20emv2_test.csv
51 | ├── prepare_n20emv2.py
52 | ```
53 | 
54 | ## How to run
55 | We provide basic runnning scripts for those who intend to follow our research. You can change the hyperparameters or even the types of self-supervised-learning (SSL) models in your own project. To reproduce video-only sing voice transcription model in our paper, first download the AV-HuBERT model pretrained on audio-visual data:
56 | ```
57 | mkdir ssl_model/AVHuBERT
58 | cd ssl_model/AVHuBERT
59 | wget https://dl.fbaipublicfiles.com/avhubert/model/lrs3_vox/clean-pretrain/large_vox_iter5.pt
60 | ```
61 | 
62 | Then run the following command:
63 | ```
64 | CUDA_VISIBLE_DEVICES=0,1 python train_video_ssl.py hparams/train_video_ssl.yaml --data_parallel_backend --data_folder /path/to/N20EMv2 --attempt 1 --dur_threshold 5 --linear_prob_epochs 2 --number_of_epochs 10
65 | ```
66 | The option `--linear_prob_epochs` refers to the number of epochs for linear probing in our paper. We provide the config for AVHuBERT pretrained on audio-visual speech data. If you intend to use the config for AVHuBERT pretrained and finetuned on audio-visual speech data, please rewrite `hparams/train_video_ssl.yaml` to change the model. Although we use data parallel (DP) in our experiments, we also provide distributed data parallel (DDP) version (remember to change the `batch_size` to avoid OOM):
67 | ```
68 | CUDA_VISIBLE_DEVICES=0,1 python -m torch.distributed.launch --nproc_per_node=2 train_video_ssl.py hparams/train_video_ssl.yaml --distributed_launch --distributed_backend='nccl' --find_unused_parameters --data_folder /path/to/N20EMv2 --attempt 1 --dur_threshold 5 --linear_prob_epochs 2 --number_of_epochs 10
69 | ```
70 | 
71 | 
72 | ## Results
73 | We provide our video-only automatic music transcription model [[model link](https://drive.google.com/drive/folders/1u82GaLM4AWtfp5VzDHryxCNUZglN0bYe?usp=sharing)] in the paper.
74 | <p align="center">
75 | <img src="../../assets/results2.png" alt="" data-canonical-src="../../assets/results2.png" width="100%"/>
76 | </p>


--------------------------------------------------------------------------------
/N20EMv2/video_only/hparams/extract_ssl_feats.yaml:
--------------------------------------------------------------------------------
  1 | # ################################
  2 | # Model: avhubert + Linear
  3 | # Authors: Xiangming Gu 2022
  4 | # ################################
  5 | 
  6 | # Seed needs to be set at top of yaml, before objects with parameters are made
  7 | seed: 1986
  8 | __set_seed: !apply:torch.manual_seed [!ref <seed>]
  9 | attempt: 1
 10 | dur_threshold: 5
 11 | overlap: 0
 12 | frame_rate: 50
 13 | sample_rate: 50
 14 | output_folder: !ref results/AVHuBERT/train_video_ssl_dur<dur_threshold>_attempt<attempt>/<seed>
 15 | save_folder: !ref <output_folder>/save
 16 | csv_folder: !ref data/frame_rate<frame_rate>/dur_<dur_threshold>s
 17 | train_log: !ref <output_folder>/train_log.txt
 18 | 
 19 | # Data files
 20 | data_folder: !PLACEHOLDER # e,g./path/to/DSing
 21 | # noise/ris dataset will automatically be downloaded
 22 | data_folder_rirs: !ref <data_folder>
 23 | ckpt_interval_minutes: 25 # save checkpoint every N min
 24 | train_csv: !ref <csv_folder>/n20em_train.csv
 25 | valid_csv: !ref <csv_folder>/n20em_valid.csv
 26 | test_csv:
 27 |    - !ref <csv_folder>/n20em_test.csv
 28 |    - !ref <csv_folder>/n20em_valid.csv
 29 |    - !ref <csv_folder>/n20em_train.csv
 30 | 
 31 | # Training parameters
 32 | number_of_epochs: 10
 33 | lr: 0.0003
 34 | lr_encoder: 0.00005
 35 | sorting: ascending
 36 | auto_mix_prec: False
 37 | linear_prob_epochs: 2
 38 | 
 39 | split_noise: False
 40 | pretrain: False
 41 | pretrain_folder: ../pretrain_model
 42 | save_model: False
 43 | save_model_folder: ../save_model
 44 | 
 45 | # Evaluating parameters
 46 | onset_threshold: 0.4
 47 | offset_threshold: 0.5
 48 | onset_tolerance: 0.05
 49 | offset_tolerance: 0.05
 50 | pitch_tolerance: 50
 51 | 
 52 | # With data_parallel batch_size is split into N jobs
 53 | # With DDP batch_size is multiplied by N jobs
 54 | # Must be 3 per GPU to fit 32GB of VRAM
 55 | batch_size: 8
 56 | test_batch_size: 1
 57 | 
 58 | # Dataloader options
 59 | train_dataloader_opts:
 60 |    batch_size: !ref <batch_size>
 61 |    num_workers: 8
 62 | 
 63 | valid_dataloader_opts:
 64 |    batch_size: !ref <test_batch_size>
 65 |    num_workers: 8
 66 | 
 67 | test_dataloader_opts:
 68 |    batch_size: !ref <test_batch_size>
 69 |    num_workers: 8
 70 | 
 71 | # Model parameters
 72 | freeze_encoder: False
 73 | 
 74 | # Outputs
 75 | pitch_octave_num: 4
 76 | pitch_class_num: 12
 77 | feat_dim: 1024
 78 | output_neurons: 20  # 2+pitch_octave+pitch_class+2
 79 | 
 80 | #
 81 | # Functions and classes
 82 | #
 83 | epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter
 84 |    limit: !ref <number_of_epochs>
 85 | 
 86 | # augmentation: !new:speechbrain.lobes.augment.TimeDomainSpecAugment
 87 | #    sample_rate: !ref <sample_rate>
 88 | #    speeds: [95, 100, 105]
 89 | 
 90 | avhubert_url: https://dl.fbaipublicfiles.com/avhubert/model/lrs3_vox/clean-pretrain/large_vox_iter5.pt
 91 | encoder: !new:fairseq_interface.FairseqAVHubertPretrain
 92 |    pretrained_path: !ref <avhubert_url>
 93 |    output_norm: True
 94 |    freeze: !ref <freeze_encoder>
 95 |    save_path: ssl_model/AVHuBERT/large_vox_iter5.pt
 96 | 
 97 | head: !new:speechbrain.nnet.linear.Linear
 98 |    input_size: !ref <feat_dim>
 99 |    n_neurons: !ref <output_neurons>
100 | 
101 | modules:
102 |    encoder: !ref <encoder>
103 |    head: !ref <head>
104 | 
105 | log_softmax: !new:speechbrain.nnet.activations.Softmax
106 |    apply_log: True
107 | 
108 | onset_positive_weight: 15.0
109 | offset_positive_weight: 1.0
110 | onset_criterion: !name:speechbrain.nnet.losses.bce_loss
111 |    reduction: mean
112 |    allowed_len_diff: 3
113 |    label_smoothing: 0.0
114 | 
115 | offset_criterion: !name:speechbrain.nnet.losses.bce_loss
116 |    reduction: mean
117 |    allowed_len_diff: 3
118 |    label_smoothing: 0.0
119 | 
120 | octave_criterion: !name:speechbrain.nnet.losses.nll_loss
121 |    reduction: mean
122 |    allowed_len_diff: 3
123 |    label_smoothing: 0.0
124 | 
125 | pitch_criterion: !name:speechbrain.nnet.losses.nll_loss
126 |    reduction: mean
127 |    allowed_len_diff: 3
128 |    label_smoothing: 0.0
129 | 
130 | head_opt_class: !name:torch.optim.Adadelta
131 |    lr: !ref <lr>
132 |    rho: 0.95
133 |    eps: 1.e-8
134 | 
135 | encoder_opt_class: !name:torch.optim.Adam
136 |    lr: !ref <lr_encoder>
137 | 
138 | lr_annealing_head: !new:speechbrain.nnet.schedulers.NewBobScheduler
139 |    initial_value: !ref <lr>
140 |    improvement_threshold: 0.0025
141 |    annealing_factor: 0.8
142 |    patient: 0
143 | 
144 | lr_annealing_encoder: !new:speechbrain.nnet.schedulers.NewBobScheduler
145 |    initial_value: !ref <lr_encoder>
146 |    improvement_threshold: 0.0025
147 |    annealing_factor: 0.9
148 |    patient: 0
149 | 
150 | 
151 | checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
152 |    checkpoints_dir: !ref <save_folder>
153 |    recoverables:
154 |       encoder: !ref <encoder>
155 |       head: !ref <head>
156 |       scheduler_head: !ref <lr_annealing_head>
157 |       scheduler_encoder: !ref <lr_annealing_encoder>
158 |       counter: !ref <epoch_counter>
159 | 
160 | train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
161 |    save_file: !ref <train_log>
162 |    precision: 3
163 | 
164 | onset_stats: !name:speechbrain.utils.metric_stats.MetricStats
165 |     metric: !name:speechbrain.nnet.losses.bce_loss
166 |         reduction: batch
167 |         allowed_len_diff: 3
168 |         label_smoothing: 0.0
169 | 
170 | offset_stats: !name:speechbrain.utils.metric_stats.MetricStats
171 |     metric: !name:speechbrain.nnet.losses.bce_loss
172 |         reduction: batch
173 |         allowed_len_diff: 3
174 |         label_smoothing: 0.0
175 | 
176 | octave_stats: !name:speechbrain.utils.metric_stats.MetricStats
177 |     metric: !name:speechbrain.nnet.losses.nll_loss
178 |         reduction: batch
179 |         allowed_len_diff: 3
180 |         label_smoothing: 0.0
181 | 
182 | pitch_stats: !name:speechbrain.utils.metric_stats.MetricStats
183 |     metric: !name:speechbrain.nnet.losses.nll_loss
184 |         reduction: batch
185 |         allowed_len_diff: 3
186 |         label_smoothing: 0.0


--------------------------------------------------------------------------------
/N20EMv2/video_only/hparams/train_video_ssl.yaml:
--------------------------------------------------------------------------------
  1 | # ################################
  2 | # Model: avhubert + Linear
  3 | # Authors: Xiangming Gu 2022
  4 | # ################################
  5 | 
  6 | # Seed needs to be set at top of yaml, before objects with parameters are made
  7 | seed: 1986
  8 | __set_seed: !apply:torch.manual_seed [!ref <seed>]
  9 | attempt: 1
 10 | dur_threshold: 5
 11 | overlap: 0
 12 | frame_rate: 50
 13 | sample_rate: 50
 14 | output_folder: !ref results/AVHuBERT/train_video_ssl_dur<dur_threshold>_attempt<attempt>/<seed>
 15 | save_folder: !ref <output_folder>/save
 16 | csv_folder: !ref data/frame_rate<frame_rate>/dur_<dur_threshold>s
 17 | train_log: !ref <output_folder>/train_log.txt
 18 | 
 19 | # Data files
 20 | data_folder: !PLACEHOLDER # e,g./path/to/DSing
 21 | # noise/ris dataset will automatically be downloaded
 22 | data_folder_rirs: !ref <data_folder>
 23 | ckpt_interval_minutes: 25 # save checkpoint every N min
 24 | train_csv: !ref <csv_folder>/n20em_train.csv
 25 | valid_csv: !ref <csv_folder>/n20em_valid.csv
 26 | test_csv:
 27 |    - !ref <csv_folder>/n20em_test.csv
 28 |    - !ref <csv_folder>/n20em_valid.csv
 29 | 
 30 | # Training parameters
 31 | number_of_epochs: 10
 32 | lr: 0.0003
 33 | lr_encoder: 0.00005
 34 | sorting: ascending
 35 | auto_mix_prec: False
 36 | linear_prob_epochs: 2
 37 | 
 38 | split_noise: False
 39 | pretrain: False
 40 | pretrain_folder: ../pretrain_model
 41 | save_model: False
 42 | save_model_folder: ../save_model
 43 | 
 44 | # Evaluating parameters
 45 | onset_threshold: 0.4
 46 | offset_threshold: 0.5
 47 | onset_tolerance: 0.05
 48 | offset_tolerance: 0.05
 49 | pitch_tolerance: 50
 50 | 
 51 | # With data_parallel batch_size is split into N jobs
 52 | # With DDP batch_size is multiplied by N jobs
 53 | # Must be 3 per GPU to fit 32GB of VRAM
 54 | batch_size: 8
 55 | test_batch_size: 1
 56 | 
 57 | # Dataloader options
 58 | train_dataloader_opts:
 59 |    batch_size: !ref <batch_size>
 60 |    num_workers: 8
 61 | 
 62 | valid_dataloader_opts:
 63 |    batch_size: !ref <test_batch_size>
 64 |    num_workers: 8
 65 | 
 66 | test_dataloader_opts:
 67 |    batch_size: !ref <test_batch_size>
 68 |    num_workers: 8
 69 | 
 70 | # Model parameters
 71 | freeze_encoder: False
 72 | 
 73 | # Outputs
 74 | pitch_octave_num: 4
 75 | pitch_class_num: 12
 76 | feat_dim: 1024
 77 | output_neurons: 20  # 2+pitch_octave+pitch_class+2
 78 | 
 79 | #
 80 | # Functions and classes
 81 | #
 82 | epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter
 83 |    limit: !ref <number_of_epochs>
 84 | 
 85 | # augmentation: !new:speechbrain.lobes.augment.TimeDomainSpecAugment
 86 | #    sample_rate: !ref <sample_rate>
 87 | #    speeds: [95, 100, 105]
 88 | 
 89 | avhubert_url: https://dl.fbaipublicfiles.com/avhubert/model/lrs3_vox/clean-pretrain/large_vox_iter5.pt
 90 | encoder: !new:fairseq_interface.FairseqAVHubertPretrain
 91 |    pretrained_path: !ref <avhubert_url>
 92 |    output_norm: True
 93 |    freeze: !ref <freeze_encoder>
 94 |    save_path: ssl_model/AVHuBERT/large_vox_iter5.pt
 95 | 
 96 | head: !new:speechbrain.nnet.linear.Linear
 97 |    input_size: !ref <feat_dim>
 98 |    n_neurons: !ref <output_neurons>
 99 | 
100 | modules:
101 |    encoder: !ref <encoder>
102 |    head: !ref <head>
103 | 
104 | log_softmax: !new:speechbrain.nnet.activations.Softmax
105 |    apply_log: True
106 | 
107 | onset_positive_weight: 15.0
108 | offset_positive_weight: 1.0
109 | onset_criterion: !name:speechbrain.nnet.losses.bce_loss
110 |    reduction: mean
111 |    allowed_len_diff: 3
112 |    label_smoothing: 0.0
113 | 
114 | offset_criterion: !name:speechbrain.nnet.losses.bce_loss
115 |    reduction: mean
116 |    allowed_len_diff: 3
117 |    label_smoothing: 0.0
118 | 
119 | octave_criterion: !name:speechbrain.nnet.losses.nll_loss
120 |    reduction: mean
121 |    allowed_len_diff: 3
122 |    label_smoothing: 0.0
123 | 
124 | pitch_criterion: !name:speechbrain.nnet.losses.nll_loss
125 |    reduction: mean
126 |    allowed_len_diff: 3
127 |    label_smoothing: 0.0
128 | 
129 | head_opt_class: !name:torch.optim.Adadelta
130 |    lr: !ref <lr>
131 |    rho: 0.95
132 |    eps: 1.e-8
133 | 
134 | encoder_opt_class: !name:torch.optim.Adam
135 |    lr: !ref <lr_encoder>
136 | 
137 | lr_annealing_head: !new:speechbrain.nnet.schedulers.NewBobScheduler
138 |    initial_value: !ref <lr>
139 |    improvement_threshold: 0.0025
140 |    annealing_factor: 0.8
141 |    patient: 0
142 | 
143 | lr_annealing_encoder: !new:speechbrain.nnet.schedulers.NewBobScheduler
144 |    initial_value: !ref <lr_encoder>
145 |    improvement_threshold: 0.0025
146 |    annealing_factor: 0.9
147 |    patient: 0
148 | 
149 | 
150 | checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
151 |    checkpoints_dir: !ref <save_folder>
152 |    recoverables:
153 |       encoder: !ref <encoder>
154 |       head: !ref <head>
155 |       scheduler_head: !ref <lr_annealing_head>
156 |       scheduler_encoder: !ref <lr_annealing_encoder>
157 |       counter: !ref <epoch_counter>
158 | 
159 | train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
160 |    save_file: !ref <train_log>
161 |    precision: 3
162 | 
163 | onset_stats: !name:speechbrain.utils.metric_stats.MetricStats
164 |     metric: !name:speechbrain.nnet.losses.bce_loss
165 |         reduction: batch
166 |         allowed_len_diff: 3
167 |         label_smoothing: 0.0
168 | 
169 | offset_stats: !name:speechbrain.utils.metric_stats.MetricStats
170 |     metric: !name:speechbrain.nnet.losses.bce_loss
171 |         reduction: batch
172 |         allowed_len_diff: 3
173 |         label_smoothing: 0.0
174 | 
175 | octave_stats: !name:speechbrain.utils.metric_stats.MetricStats
176 |     metric: !name:speechbrain.nnet.losses.nll_loss
177 |         reduction: batch
178 |         allowed_len_diff: 3
179 |         label_smoothing: 0.0
180 | 
181 | pitch_stats: !name:speechbrain.utils.metric_stats.MetricStats
182 |     metric: !name:speechbrain.nnet.losses.nll_loss
183 |         reduction: batch
184 |         allowed_len_diff: 3
185 |         label_smoothing: 0.0


--------------------------------------------------------------------------------
/N20EMv2/video_only/prepare_n20emv2.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Data preparation for datasets of automatic music transcription
  3 | 
  4 | Authors
  5 | * Xiangming Gu 2022
  6 | """
  7 | import os
  8 | import csv
  9 | import json
 10 | import argparse
 11 | import numpy as np
 12 | from tqdm import tqdm
 13 | from utils import note2frame
 14 | 
 15 | 
 16 | def prepare_frame_anno(folder, frame_rate=50):
 17 |     """
 18 |     This function processes the frame-level annotations for each song
 19 |     """
 20 |     json_file = os.path.join(folder, "annotations.json")
 21 |     folder_data = os.path.join(folder, "data")
 22 |     # open ground truth data
 23 |     with open(json_file) as f:
 24 |         annotations = json.load(f)
 25 |     f.close()
 26 |     # traverse the whole dataset
 27 |     for entry in tqdm(annotations.keys()):
 28 |         anno = annotations[entry]["midi"]
 29 |         json_path = os.path.join(folder_data, entry, "note_anno.json")
 30 |         # save json file
 31 |         with open(json_path, "w") as f:
 32 |             json.dump(anno, f)
 33 |         f.close()
 34 |         # load video file
 35 |         video_file = os.path.join(folder_data, entry, "video_" + str(frame_rate) + "fps.npy")
 36 |         video = np.load(video_file)
 37 |         # compute duration and length
 38 |         length = video.shape[0]
 39 |         frame_label = note2frame(gt_data=anno, length=length, frame_size=1/frame_rate)
 40 |         assert frame_label.shape[0] == length
 41 |         # save frame-level annotation
 42 |         os.makedirs(os.path.join(folder_data, entry, "video_anno", str(frame_rate) + "fps"), exist_ok=True)
 43 |         frame_anno_path = os.path.join(folder_data, entry, "video_anno", str(frame_rate) + "fps", "video_frame_anno.npy")
 44 |         np.save(frame_anno_path, frame_label)
 45 | 
 46 | 
 47 | def prepare_csv_n20emv2(folder, csv_folder="./data", frame_rate=50, dur_thrd=5):
 48 |     """
 49 |     This function creates csv files for speechbrain to process, dur_thrd is the threshold for the duration
 50 |     """
 51 | 
 52 |     # initialize the csv lines
 53 |     csv_train_lines = [["ID", "duration", "video", "utter_id", "utter_num", "frame_anno", "song_anno"]]
 54 |     csv_valid_lines = [["ID", "duration", "video", "utter_id", "utter_num", "frame_anno", "song_anno"]]
 55 |     csv_test_lines = [["ID", "duration", "video", "utter_id", "utter_num", "frame_anno", "song_anno"]]
 56 |     # load the annotations
 57 |     json_file = os.path.join(folder, "annotations.json")
 58 |     folder_data = os.path.join(folder, "data")
 59 |     # open ground truth data
 60 |     with open(json_file) as f:
 61 |         annotations = json.load(f)
 62 |     f.close()
 63 |     # traverse the whole dataset
 64 |     for entry in tqdm(annotations.keys()):
 65 |         split = annotations[entry]["split"]
 66 |         video_path = os.path.join(folder_data, entry, "video_" + str(frame_rate) + "fps.npy")
 67 |         anno_path = os.path.join(folder_data, entry, "video_anno", str(frame_rate) + "fps", "video_frame_anno.npy")
 68 |         song_anno_path = os.path.join(folder_data, entry, "note_anno.json")
 69 | 
 70 |         # load the video
 71 |         video = np.load(video_path)
 72 |         duration = video.shape[0] / frame_rate
 73 | 
 74 |         # split the whole song into utterances
 75 |         is_end = False
 76 |         cur_i = 1
 77 |         cur_time = 0
 78 |         utter_lines = []
 79 |         stride = dur_thrd
 80 |         while not is_end:
 81 |             ID = entry + "_" + str(cur_i)
 82 |             # whether is the end
 83 |             if duration - cur_time <= dur_thrd * 3 / 2:
 84 |                 is_end = True
 85 |                 dur = duration - cur_time
 86 |                 utter_num = cur_i
 87 |             else:
 88 |                 dur = dur_thrd
 89 |             
 90 |             # determine the csv_line
 91 |             utter_lines.append((ID, dur))
 92 |             
 93 |             # update variables
 94 |             cur_i = cur_i + 1
 95 |             cur_time = cur_time + stride
 96 |         
 97 |         for i in range(1, utter_num + 1):
 98 |             ID, dur = utter_lines[i - 1]
 99 |             csv_line = [
100 |                 ID, str(dur), video_path, str(i), str(utter_num), anno_path, song_anno_path,
101 |             ]
102 |             if split == "train":
103 |                 csv_train_lines.append(csv_line)
104 |             elif split == "valid":
105 |                 csv_valid_lines.append(csv_line)
106 |             elif split == "test":
107 |                 csv_test_lines.append(csv_line)
108 | 
109 |     # save csv files
110 |     save_folder = os.path.join(csv_folder, "frame_rate" + str(frame_rate), "dur_" + str(dur_thrd) + "s")
111 |     os.makedirs(save_folder, exist_ok=True)
112 |     save_train_path = os.path.join(save_folder, "n20em_train.csv")
113 |     save_valid_path = os.path.join(save_folder, "n20em_valid.csv")
114 |     save_test_path = os.path.join(save_folder, "n20em_test.csv")
115 |     # train
116 |     with open(save_train_path, mode="w") as csv_f:
117 |         csv_writer = csv.writer(csv_f, delimiter=",", quotechar='"', quoting=csv.QUOTE_MINIMAL)
118 |         for line in csv_train_lines:
119 |             csv_writer.writerow(line)
120 |     # valid
121 |     with open(save_valid_path, mode="w") as csv_f:
122 |         csv_writer = csv.writer(csv_f, delimiter=",", quotechar='"', quoting=csv.QUOTE_MINIMAL)
123 |         for line in csv_valid_lines:
124 |             csv_writer.writerow(line)
125 |     # test
126 |     with open(save_test_path, mode="w") as csv_f:
127 |         csv_writer = csv.writer(csv_f, delimiter=",", quotechar='"', quoting=csv.QUOTE_MINIMAL)
128 |         for line in csv_test_lines:
129 |             csv_writer.writerow(line)
130 | 
131 | 
132 | if __name__ == "__main__":
133 |     parser = argparse.ArgumentParser()
134 |     parser.add_argument("--frame_rate", type=int, default=50, help="the frame rate for log fbanks features")
135 |     parser.add_argument("--duration", type=int, default=5, help="the threshold for duration")
136 |     parser.add_argument("--n20emv2", type=str, default="/path/to/N20EMv2", help="The path to save N20EMv2 dataset")
137 |     args = parser.parse_args()
138 |     prepare_frame_anno(folder=args.n20emv2, frame_rate=args.frame_rate)
139 |     prepare_csv_n20emv2(folder=args.n20emv2, frame_rate=args.frame_rate, dur_thrd=args.duration)


--------------------------------------------------------------------------------
/assets/framework.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/guxm2021/SVT_SpeechBrain/a9dc323cd6dd8f751f71cbfeff368b8a5c5eba87/assets/framework.png


--------------------------------------------------------------------------------
/assets/noise_test.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/guxm2021/SVT_SpeechBrain/a9dc323cd6dd8f751f71cbfeff368b8a5c5eba87/assets/noise_test.png


--------------------------------------------------------------------------------
/assets/results.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/guxm2021/SVT_SpeechBrain/a9dc323cd6dd8f751f71cbfeff368b8a5c5eba87/assets/results.png


--------------------------------------------------------------------------------
/assets/results2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/guxm2021/SVT_SpeechBrain/a9dc323cd6dd8f751f71cbfeff368b8a5c5eba87/assets/results2.png


--------------------------------------------------------------------------------
/dependencies.txt:
--------------------------------------------------------------------------------
1 | transformers
2 | datasets
3 | scikit-learn
4 | mir_eval
5 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | black==19.10b0
 2 | flake8==3.7.9
 3 | pycodestyle==2.5.0
 4 | pytest==5.4.1
 5 | yamllint==1.23.0
 6 | huggingface_hub>=0.0.6
 7 | hyperpyyaml>=0.0.1
 8 | joblib>=0.14.1
 9 | numpy>=1.17.0
10 | packaging
11 | pre-commit>=2.3.0
12 | scipy>=1.4.1
13 | sentencepiece>=0.1.91
14 | SoundFile; sys_platform == 'win32'
15 | torch>=1.8.0,<=1.10.1
16 | torchaudio>=0.8.0,<=0.10.1
17 | tqdm>=4.42.0
18 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | import os
 3 | import sys
 4 | import site
 5 | import setuptools
 6 | from distutils.core import setup
 7 | 
 8 | 
 9 | # Editable install in user site directory can be allowed with this hack:
10 | # https://github.com/pypa/pip/issues/7953.
11 | site.ENABLE_USER_SITE = "--user" in sys.argv[1:]
12 | 
13 | with open("README.md") as f:
14 |     long_description = f.read()
15 | 
16 | with open(os.path.join("speechbrain", "version.txt")) as f:
17 |     version = f.read().strip()
18 | 
19 | setup(
20 |     name="speechbrain",
21 |     version=version,
22 |     description="All-in-one speech toolkit in pure Python and Pytorch",
23 |     long_description=long_description,
24 |     long_description_content_type="text/markdown",
25 |     author="Mirco Ravanelli & Others",
26 |     author_email="speechbrain@gmail.com",
27 |     packages=setuptools.find_packages(),
28 |     package_data={"speechbrain": ["version.txt", "log-config.yaml"]},
29 |     install_requires=[
30 |         "hyperpyyaml",
31 |         "joblib",
32 |         "numpy",
33 |         "packaging",
34 |         "scipy",
35 |         "sentencepiece",
36 |         "torch>=1.7,<=1.11",
37 |         "torchaudio",
38 |         "tqdm",
39 |         "huggingface_hub",
40 |     ],
41 |     python_requires=">=3.7",
42 |     url="https://speechbrain.github.io/",
43 | )
44 | 


--------------------------------------------------------------------------------
/speechbrain/__init__.py:
--------------------------------------------------------------------------------
 1 | """ Comprehensive speech processing toolkit
 2 | """
 3 | import os
 4 | from .core import Stage, Brain, create_experiment_directory, parse_arguments
 5 | from . import alignment  # noqa
 6 | from . import dataio  # noqa
 7 | from . import decoders  # noqa
 8 | from . import lobes  # noqa
 9 | from . import lm  # noqa
10 | from . import nnet  # noqa
11 | from . import processing  # noqa
12 | from . import tokenizers  # noqa
13 | from . import utils  # noqa
14 | 
15 | with open(os.path.join(os.path.dirname(__file__), "version.txt")) as f:
16 |     version = f.read().strip()
17 | 
18 | __all__ = [
19 |     "Stage",
20 |     "Brain",
21 |     "create_experiment_directory",
22 |     "parse_arguments",
23 | ]
24 | 
25 | __version__ = version
26 | 


--------------------------------------------------------------------------------
/speechbrain/alignment/__init__.py:
--------------------------------------------------------------------------------
1 | """Tools for aligning transcripts and speech signals
2 | """
3 | 


--------------------------------------------------------------------------------
/speechbrain/dataio/__init__.py:
--------------------------------------------------------------------------------
 1 | """Data loading and dataset preprocessing
 2 | """
 3 | import os
 4 | 
 5 | __all__ = []
 6 | for filename in os.listdir(os.path.dirname(__file__)):
 7 |     filename = os.path.basename(filename)
 8 |     if filename.endswith(".py") and not filename.startswith("__"):
 9 |         __all__.append(filename[:-3])
10 | 
11 | from . import *  # noqa
12 | 


--------------------------------------------------------------------------------
/speechbrain/dataio/preprocess.py:
--------------------------------------------------------------------------------
 1 | """Preprocessors for audio"""
 2 | import torch
 3 | import functools
 4 | from speechbrain.processing.speech_augmentation import Resample
 5 | 
 6 | 
 7 | class AudioNormalizer:
 8 |     """Normalizes audio into a standard format
 9 | 
10 |     Arguments
11 |     ---------
12 |     sample_rate : int
13 |         The sampling rate to which the incoming signals should be converted.
14 |     mix : {"avg-to-mono", "keep"}
15 |         "avg-to-mono" - add all channels together and normalize by number of
16 |         channels. This also removes the channel dimension, resulting in [time]
17 |         format tensor.
18 |         "keep" - don't normalize channel information
19 | 
20 |     Example
21 |     -------
22 |     >>> import torchaudio
23 |     >>> example_file = 'samples/audio_samples/example_multichannel.wav'
24 |     >>> signal, sr = torchaudio.load(example_file, channels_first = False)
25 |     >>> normalizer = AudioNormalizer(sample_rate=8000)
26 |     >>> normalized = normalizer(signal, sr)
27 |     >>> signal.shape
28 |     torch.Size([33882, 2])
29 |     >>> normalized.shape
30 |     torch.Size([16941])
31 | 
32 |     NOTE
33 |     ----
34 |     This will also upsample audio. However, upsampling cannot produce meaningful
35 |     information in the bandwidth which it adds. Generally models will not work
36 |     well for upsampled data if they have not specifically been trained to do so.
37 |     """
38 | 
39 |     def __init__(self, sample_rate=16000, mix="avg-to-mono"):
40 |         self.sample_rate = sample_rate
41 |         if mix not in ["avg-to-mono", "keep"]:
42 |             raise ValueError(f"Unexpected mixing configuration {mix}")
43 |         self.mix = mix
44 |         self._cached_resample = functools.lru_cache(maxsize=12)(Resample)
45 | 
46 |     def __call__(self, audio, sample_rate):
47 |         """Perform normalization
48 | 
49 |         Arguments
50 |         ---------
51 |         audio : tensor
52 |             The input waveform torch tensor. Assuming [time, channels],
53 |             or [time].
54 |         """
55 |         resampler = self._cached_resample(sample_rate, self.sample_rate)
56 |         resampled = resampler(audio.unsqueeze(0)).squeeze(0)
57 |         return self._mix(resampled)
58 | 
59 |     def _mix(self, audio):
60 |         """Handle channel mixing"""
61 |         flat_input = audio.dim() == 1
62 |         if self.mix == "avg-to-mono":
63 |             if flat_input:
64 |                 return audio
65 |             return torch.mean(audio, 1)
66 |         if self.mix == "keep":
67 |             return audio
68 | 


--------------------------------------------------------------------------------
/speechbrain/dataio/wer.py:
--------------------------------------------------------------------------------
  1 | """WER print functions.
  2 | 
  3 | The functions here are used to print the computed statistics
  4 | with human-readable formatting.
  5 | They have a file argument, but you can also just use
  6 | contextlib.redirect_stdout, which may give a nicer syntax.
  7 | 
  8 | Authors
  9 |  * Aku Rouhe 2020
 10 | """
 11 | import sys
 12 | from speechbrain.utils import edit_distance
 13 | 
 14 | 
 15 | def print_wer_summary(wer_details, file=sys.stdout):
 16 |     """Prints out WER summary details in human-readable format.
 17 | 
 18 |     This function essentially mirrors the Kaldi compute-wer output format.
 19 | 
 20 |     Arguments
 21 |     ---------
 22 |     wer_details : dict
 23 |         Dict of wer summary details,
 24 |         see ``speechbrain.utils.edit_distance.wer_summary``
 25 |         for format.
 26 |     file : stream
 27 |         Where to write. (default: sys.stdout)
 28 |     """
 29 |     print(
 30 |         "%WER {WER:.2f} [ {num_edits} / {num_scored_tokens}, {insertions} ins, {deletions} del, {substitutions} sub ]".format(  # noqa
 31 |             **wer_details
 32 |         ),
 33 |         file=file,
 34 |         end="",
 35 |     )
 36 |     print(
 37 |         " [PARTIAL]"
 38 |         if wer_details["num_scored_sents"] < wer_details["num_ref_sents"]
 39 |         else "",
 40 |         file=file,
 41 |     )
 42 |     print(
 43 |         "%SER {SER:.2f} [ {num_erraneous_sents} / {num_scored_sents} ]".format(
 44 |             **wer_details
 45 |         ),
 46 |         file=file,
 47 |     )
 48 |     print(
 49 |         "Scored {num_scored_sents} sentences, {num_absent_sents} not present in hyp.".format(  # noqa
 50 |             **wer_details
 51 |         ),
 52 |         file=file,
 53 |     )
 54 | 
 55 | 
 56 | def print_alignments(
 57 |     details_by_utterance, file=sys.stdout, empty_symbol="<eps>", separator=" ; "
 58 | ):
 59 |     """Print WER summary and alignments.
 60 | 
 61 |     Arguments
 62 |     ---------
 63 |     details_by_utterance : list
 64 |         List of wer details by utterance,
 65 |         see ``speechbrain.utils.edit_distance.wer_details_by_utterance``
 66 |         for format. Has to have alignments included.
 67 |     file : stream
 68 |         Where to write. (default: sys.stdout)
 69 |     empty_symbol : str
 70 |         Symbol to use when aligning to nothing.
 71 |     separator : str
 72 |         String that separates each token in the output. Note the spaces in the
 73 |         default.
 74 |     """
 75 |     _print_alignments_global_header(
 76 |         file=file, empty_symbol=empty_symbol, separator=separator
 77 |     )
 78 |     for dets in details_by_utterance:
 79 |         if dets["scored"]:
 80 |             _print_alignment_header(dets, file=file)
 81 |             _print_alignment(
 82 |                 dets["alignment"],
 83 |                 dets["ref_tokens"],
 84 |                 dets["hyp_tokens"],
 85 |                 file=file,
 86 |                 empty_symbol=empty_symbol,
 87 |                 separator=separator,
 88 |             )
 89 | 
 90 | 
 91 | # The following internal functions are used to
 92 | # print out more specific things
 93 | def _print_top_wer_utts(top_non_empty, top_empty, file=sys.stdout):
 94 |     print("=" * 80, file=file)
 95 |     print("UTTERANCES WITH HIGHEST WER", file=file)
 96 |     if top_non_empty:
 97 |         print(
 98 |             "Non-empty hypotheses -- utterances for which output was produced:",
 99 |             file=file,
100 |         )
101 |         for dets in top_non_empty:
102 |             print("{key} %WER {WER:.2f}".format(**dets), file=file)
103 |     else:
104 |         print("No utterances which had produced output!", file=file)
105 |     if top_empty:
106 |         print(
107 |             "Empty hypotheses -- utterances for which no output was produced:",
108 |             file=file,
109 |         )
110 |         for dets in top_empty:
111 |             print("{key} %WER {WER:.2f}".format(**dets), file=file)
112 |     else:
113 |         print("No utterances which had not produced output!", file=file)
114 | 
115 | 
116 | def _print_top_wer_spks(spks_by_wer, file=sys.stdout):
117 |     print("=" * 80, file=file)
118 |     print("SPEAKERS WITH HIGHEST WER", file=file)
119 |     for dets in spks_by_wer:
120 |         print("{speaker} %WER {WER:.2f}".format(**dets), file=file)
121 | 
122 | 
123 | def _print_alignment(
124 |     alignment, a, b, empty_symbol="<eps>", separator=" ; ", file=sys.stdout
125 | ):
126 |     # First, get equal length text for all:
127 |     a_padded = []
128 |     b_padded = []
129 |     ops_padded = []
130 |     for op, i, j in alignment:  # i indexes a, j indexes b
131 |         op_string = str(op)
132 |         a_string = str(a[i]) if i is not None else empty_symbol
133 |         b_string = str(b[j]) if j is not None else empty_symbol
134 |         # NOTE: the padding does not actually compute printed length,
135 |         # but hopefully we can assume that printed length is
136 |         # at most the str len
137 |         pad_length = max(len(op_string), len(a_string), len(b_string))
138 |         a_padded.append(a_string.center(pad_length))
139 |         b_padded.append(b_string.center(pad_length))
140 |         ops_padded.append(op_string.center(pad_length))
141 |     # Then print, in the order Ref, op, Hyp
142 |     print(separator.join(a_padded), file=file)
143 |     print(separator.join(ops_padded), file=file)
144 |     print(separator.join(b_padded), file=file)
145 | 
146 | 
147 | def _print_alignments_global_header(
148 |     empty_symbol="<eps>", separator=" ; ", file=sys.stdout
149 | ):
150 |     print("=" * 80, file=file)
151 |     print("ALIGNMENTS", file=file)
152 |     print("", file=file)
153 |     print("Format:", file=file)
154 |     print("<utterance-id>, WER DETAILS", file=file)
155 |     # Print the format with the actual
156 |     # print_alignment function, using artificial data:
157 |     a = ["reference", "on", "the", "first", "line"]
158 |     b = ["and", "hypothesis", "on", "the", "third"]
159 |     alignment = [
160 |         (edit_distance.EDIT_SYMBOLS["ins"], None, 0),
161 |         (edit_distance.EDIT_SYMBOLS["sub"], 0, 1),
162 |         (edit_distance.EDIT_SYMBOLS["eq"], 1, 2),
163 |         (edit_distance.EDIT_SYMBOLS["eq"], 2, 3),
164 |         (edit_distance.EDIT_SYMBOLS["sub"], 3, 4),
165 |         (edit_distance.EDIT_SYMBOLS["del"], 4, None),
166 |     ]
167 |     _print_alignment(
168 |         alignment,
169 |         a,
170 |         b,
171 |         file=file,
172 |         empty_symbol=empty_symbol,
173 |         separator=separator,
174 |     )
175 | 
176 | 
177 | def _print_alignment_header(wer_details, file=sys.stdout):
178 |     print("=" * 80, file=file)
179 |     print(
180 |         "{key}, %WER {WER:.2f} [ {num_edits} / {num_ref_tokens}, {insertions} ins, {deletions} del, {substitutions} sub ]".format(  # noqa
181 |             **wer_details
182 |         ),
183 |         file=file,
184 |     )
185 | 


--------------------------------------------------------------------------------
/speechbrain/decoders/__init__.py:
--------------------------------------------------------------------------------
1 | """ Package containing the different decoders (ctc, beamsearch ...)
2 | """
3 | from .seq2seq import *  # noqa
4 | from .ctc import *  # noqa
5 | 


--------------------------------------------------------------------------------
/speechbrain/lm/__init__.py:
--------------------------------------------------------------------------------
1 | """ Package defining language models
2 | """
3 | 


--------------------------------------------------------------------------------
/speechbrain/lm/counting.py:
--------------------------------------------------------------------------------
  1 | """
  2 | N-gram counting, discounting, interpolation, and backoff
  3 | 
  4 | Authors
  5 |  * Aku Rouhe 2020
  6 | """
  7 | import itertools
  8 | 
  9 | 
 10 | # The following functions are essentially copying the NLTK ngram counting
 11 | # pipeline with minor differences. Written from scratch, but with enough
 12 | # inspiration that I feel I want to mention the inspiration source:
 13 | # NLTK is licenced under the Apache 2.0 Licence, same as SpeechBrain
 14 | # See https://github.com/nltk/nltk
 15 | # The NLTK implementation is highly focused on getting lazy evaluation.
 16 | def pad_ends(
 17 |     sequence, pad_left=True, left_pad_symbol="<s>", right_pad_symbol="</s>"
 18 | ):
 19 |     """
 20 |     Pad sentence ends with start- and end-of-sentence tokens
 21 | 
 22 |     In speech recognition, it is important to predict the end of sentence
 23 |     and use the start of sentence to condition predictions. Typically this
 24 |     is done by adding special tokens (usually <s> and </s>) at the ends of
 25 |     each sentence. The <s> token should not be predicted, so some special
 26 |     care needs to be taken for unigrams.
 27 | 
 28 |     Arguments
 29 |     ---------
 30 |     sequence : iterator
 31 |         The sequence (any iterable type) to pad.
 32 |     pad_left : bool
 33 |         Whether to pad on the left side as well. True by default.
 34 |     left_pad_symbol : any
 35 |         The token to use for left side padding. "<s>" by default.
 36 |     right_pad_symbol : any
 37 |         The token to use for right side padding. "</s>" by default.
 38 | 
 39 |     Returns
 40 |     -------
 41 |     generator
 42 |         A generator that yields the padded sequence.
 43 | 
 44 |     Example
 45 |     -------
 46 |     >>> for token in pad_ends(["Speech", "Brain"]):
 47 |     ...     print(token)
 48 |     <s>
 49 |     Speech
 50 |     Brain
 51 |     </s>
 52 | 
 53 |     """
 54 |     if pad_left:
 55 |         return itertools.chain(
 56 |             (left_pad_symbol,), tuple(sequence), (right_pad_symbol,)
 57 |         )
 58 |     else:
 59 |         return itertools.chain(tuple(sequence), (right_pad_symbol,))
 60 | 
 61 | 
 62 | def ngrams(sequence, n):
 63 |     """
 64 |     Produce all Nth order N-grams from the sequence.
 65 | 
 66 |     This will generally be used in an N-gram counting pipeline.
 67 | 
 68 |     Arguments
 69 |     ---------
 70 |     sequence : iterator
 71 |         The sequence from which to produce N-grams.
 72 |     n : int
 73 |         The order of N-grams to produce
 74 | 
 75 |     Yields
 76 |     ------
 77 |     tuple
 78 |         Yields each ngram as a tuple.
 79 | 
 80 |     Example
 81 |     -------
 82 |     >>> for ngram in ngrams("Brain", 3):
 83 |     ...     print(ngram)
 84 |     ('B', 'r', 'a')
 85 |     ('r', 'a', 'i')
 86 |     ('a', 'i', 'n')
 87 | 
 88 |     """
 89 |     if n <= 0:
 90 |         raise ValueError("N must be >=1")
 91 |     # Handle the unigram case specially:
 92 |     if n == 1:
 93 |         for token in sequence:
 94 |             yield (token,)
 95 |         return
 96 |     iterator = iter(sequence)
 97 |     history = []
 98 |     for hist_length, token in enumerate(iterator, start=1):
 99 |         history.append(token)
100 |         if hist_length == n - 1:
101 |             break
102 |     else:  # For-else is obscure but fits here perfectly
103 |         return
104 |     for token in iterator:
105 |         yield tuple(history) + (token,)
106 |         history.append(token)
107 |         del history[0]
108 |     return
109 | 
110 | 
111 | def ngrams_for_evaluation(sequence, max_n, predict_first=False):
112 |     """
113 |     Produce each token with the appropriate context.
114 | 
115 |     The function produces as large N-grams as possible, so growing from
116 |     unigrams/bigrams to max_n.
117 | 
118 |     E.G. when your model is a trigram model, you'll still only have one token
119 |     of context (the start of sentence) for the first token.
120 | 
121 |     In general this is useful when evaluating an N-gram model.
122 | 
123 |     Arguments
124 |     ---------
125 |     sequence : iterator
126 |         The sequence to produce tokens and context from.
127 |     max_n : int
128 |         The maximum N-gram length to produce.
129 |     predict_first : bool
130 |         To produce the first token in the sequence to predict (without
131 |         context) or not. Essentially this should be False when the start of
132 |         sentence symbol is the first in the sequence.
133 | 
134 |     Yields
135 |     ------
136 |     Any
137 |         The token to predict
138 |     tuple
139 |         The context to predict conditional on.
140 | 
141 |     Example
142 |     -------
143 |     >>> for token, context in ngrams_for_evaluation("Brain", 3, True):
144 |     ...     print(f"p( {token} |{' ' if context else ''}{' '.join(context)} )")
145 |     p( B | )
146 |     p( r | B )
147 |     p( a | B r )
148 |     p( i | r a )
149 |     p( n | a i )
150 |     """
151 |     if max_n <= 0:
152 |         raise ValueError("Max N must be >=1")
153 |     iterator = iter(sequence)
154 |     history = []
155 |     if not predict_first:
156 |         history.append(next(iterator))
157 |     for token in iterator:
158 |         if len(history) == max_n:
159 |             del history[0]
160 |         yield token, tuple(history)
161 |         history.append(token)
162 |     return
163 | 


--------------------------------------------------------------------------------
/speechbrain/lobes/__init__.py:
--------------------------------------------------------------------------------
1 | """ Package defining common blocks (DNN models, processing ...)
2 | 
3 | This subpackage gathers higher level blocks, or "lobes".
4 | The classes here may leverage the extended YAML syntax.
5 | """
6 | from . import models  # noqa
7 | 


--------------------------------------------------------------------------------
/speechbrain/lobes/beamform_multimic.py:
--------------------------------------------------------------------------------
 1 | """Beamformer for multi-mic processing.
 2 | 
 3 | Authors
 4 |  * Nauman Dawalatabad
 5 | """
 6 | import torch
 7 | from speechbrain.processing.features import (
 8 |     STFT,
 9 |     ISTFT,
10 | )
11 | 
12 | from speechbrain.processing.multi_mic import (
13 |     Covariance,
14 |     GccPhat,
15 |     DelaySum,
16 | )
17 | 
18 | 
19 | class DelaySum_Beamformer(torch.nn.Module):
20 |     """Generate beamformed signal from multi-mic data using DelaySum beamforming.
21 | 
22 |     Arguments
23 |     ---------
24 |     sampling_rate : int (default: 16000)
25 |         Sampling rate of audio signals.
26 |     """
27 | 
28 |     def __init__(self, sampling_rate=16000):
29 |         super().__init__()
30 |         self.fs = sampling_rate
31 |         self.stft = STFT(sample_rate=self.fs)
32 |         self.cov = Covariance()
33 |         self.gccphat = GccPhat()
34 |         self.delaysum = DelaySum()
35 |         self.istft = ISTFT(sample_rate=self.fs)
36 | 
37 |     def forward(self, mics_signals):
38 |         """Returns beamformed signal using multi-mic data.
39 | 
40 |         Arguments
41 |         ---------
42 |         mics_sginal : tensor
43 |             Set of audio signals to be transformed.
44 |         """
45 |         with torch.no_grad():
46 | 
47 |             Xs = self.stft(mics_signals)
48 |             XXs = self.cov(Xs)
49 |             tdoas = self.gccphat(XXs)
50 |             Ys_ds = self.delaysum(Xs, tdoas)
51 |             sig = self.istft(Ys_ds)
52 | 
53 |         return sig
54 | 


--------------------------------------------------------------------------------
/speechbrain/lobes/models/ESPnetVGG.py:
--------------------------------------------------------------------------------
  1 | """This lobes replicate the encoder first introduced in ESPNET v1
  2 | 
  3 | source: https://github.com/espnet/espnet/blob/master/espnet/nets/pytorch_backend/rnn/encoders.py
  4 | 
  5 | Authors
  6 |  * Titouan Parcollet 2020
  7 | """
  8 | import torch
  9 | import speechbrain as sb
 10 | 
 11 | 
 12 | class ESPnetVGG(sb.nnet.containers.Sequential):
 13 |     """This model is a combination of CNNs and RNNs following
 14 |         the ESPnet encoder. (VGG+RNN+MLP+tanh())
 15 | 
 16 |     Arguments
 17 |     ---------
 18 |     input_shape : tuple
 19 |         The shape of an example expected input.
 20 |     activation : torch class
 21 |         A class used for constructing the activation layers. For CNN and DNN.
 22 |     dropout : float
 23 |         Neuron dropout rate, applied to RNN only.
 24 |     cnn_channels : list of ints
 25 |         A list of the number of output channels for each CNN block.
 26 |     rnn_class : torch class
 27 |         The type of RNN to use (LiGRU, LSTM, GRU, RNN)
 28 |     rnn_layers : int
 29 |         The number of recurrent layers to include.
 30 |     rnn_neurons : int
 31 |         Number of neurons in each layer of the RNN.
 32 |     rnn_bidirectional : bool
 33 |         Whether this model will process just forward or both directions.
 34 |     projection_neurons : int
 35 |         The number of neurons in the last linear layer.
 36 | 
 37 |     Example
 38 |     -------
 39 |     >>> inputs = torch.rand([10, 40, 60])
 40 |     >>> model = ESPnetVGG(input_shape=inputs.shape)
 41 |     >>> outputs = model(inputs)
 42 |     >>> outputs.shape
 43 |     torch.Size([10, 10, 512])
 44 |     """
 45 | 
 46 |     def __init__(
 47 |         self,
 48 |         input_shape,
 49 |         activation=torch.nn.ReLU,
 50 |         dropout=0.15,
 51 |         cnn_channels=[64, 128],
 52 |         rnn_class=sb.nnet.RNN.LSTM,
 53 |         rnn_layers=4,
 54 |         rnn_neurons=512,
 55 |         rnn_bidirectional=True,
 56 |         rnn_re_init=False,
 57 |         projection_neurons=512,
 58 |     ):
 59 |         super().__init__(input_shape=input_shape)
 60 | 
 61 |         self.append(sb.nnet.containers.Sequential, layer_name="VGG")
 62 | 
 63 |         self.append(
 64 |             sb.nnet.CNN.Conv2d,
 65 |             out_channels=cnn_channels[0],
 66 |             kernel_size=(3, 3),
 67 |             layer_name="conv_1_1",
 68 |         )
 69 |         self.append(activation(), layer_name="act_1_1")
 70 |         self.append(
 71 |             sb.nnet.CNN.Conv2d,
 72 |             out_channels=cnn_channels[0],
 73 |             kernel_size=(3, 3),
 74 |             layer_name="conv_1_2",
 75 |         )
 76 |         self.append(activation(), layer_name="act_1_2")
 77 |         self.append(
 78 |             sb.nnet.pooling.Pooling2d(
 79 |                 pool_type="max", kernel_size=(2, 2), pool_axis=(1, 2),
 80 |             ),
 81 |             layer_name="pooling_1",
 82 |         )
 83 | 
 84 |         self.append(
 85 |             sb.nnet.CNN.Conv2d,
 86 |             out_channels=cnn_channels[1],
 87 |             kernel_size=(3, 3),
 88 |             layer_name="conv_2_1",
 89 |         )
 90 |         self.append(activation(), layer_name="act_2_1")
 91 |         self.append(
 92 |             sb.nnet.CNN.Conv2d,
 93 |             out_channels=cnn_channels[1],
 94 |             kernel_size=(3, 3),
 95 |             layer_name="conv_2_2",
 96 |         )
 97 |         self.append(activation(), layer_name="act_2_2")
 98 |         self.append(
 99 |             sb.nnet.pooling.Pooling2d(
100 |                 pool_type="max", kernel_size=(2, 2), pool_axis=(1, 2),
101 |             ),
102 |             layer_name="pooling_2",
103 |         )
104 | 
105 |         if rnn_layers > 0:
106 |             self.append(
107 |                 rnn_class,
108 |                 layer_name="RNN",
109 |                 hidden_size=rnn_neurons,
110 |                 num_layers=rnn_layers,
111 |                 dropout=dropout,
112 |                 bidirectional=rnn_bidirectional,
113 |                 re_init=rnn_re_init,
114 |             )
115 | 
116 |         self.append(
117 |             sb.nnet.linear.Linear,
118 |             n_neurons=projection_neurons,
119 |             layer_name="proj",
120 |         )
121 |         self.append(torch.nn.Tanh(), layer_name="proj_act")
122 | 


--------------------------------------------------------------------------------
/speechbrain/lobes/models/IMU_CRNN.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import torch
  3 | import torch.nn as nn
  4 | import torch.nn.functional as F
  5 | 
  6 | 
  7 | 
  8 | def check_model(model):
  9 |     pytorch_total_params = sum(p.numel() for p in model.parameters())
 10 |     pytorch_train_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
 11 |     print('Totalparams:', format(pytorch_total_params, ','))
 12 |     print('Trainableparams:', format(pytorch_train_params, ','))
 13 | 
 14 | 
 15 | class IMU_CRNN_GRU(nn.Module): # IMU_CRNN_Ott_GRU_3
 16 |     '''
 17 |     Modified net from Ott 2022
 18 |     GRU 2 with fewer neurons
 19 |     '''
 20 | 
 21 |     def __init__(self, dropout_cnn=0.5, dropout_rnn=0.2, rnn_width=60):
 22 |         super().__init__()
 23 | 
 24 |         channel_num_1 = 128
 25 |         channel_num_2 = 200
 26 | 
 27 |         self.down = nn.AvgPool1d(kernel_size=10, stride=5, padding=4)
 28 | 
 29 |         self.conv1 = nn.Conv1d(in_channels=8, out_channels=channel_num_1, kernel_size=3, stride=1, padding=1)  # floor(500 + 2*p - 3 + 1) = 500
 30 |         self.pool1 = nn.MaxPool1d(kernel_size=2)
 31 |         self.norm1 = nn.BatchNorm1d(num_features=channel_num_1)
 32 |         self.drop1 = nn.Dropout(p=dropout_cnn)
 33 | 
 34 |         self.conv2 = nn.Conv1d(in_channels=channel_num_1, out_channels=channel_num_2, kernel_size=3, stride=1, padding=1)  # (250 + 2*2 - 4) / 1 = 250
 35 |         self.norm2 = nn.BatchNorm1d(num_features=channel_num_2)
 36 |         self.drop2 = nn.Dropout(p=dropout_cnn)  # [B, C2, T]
 37 | 
 38 |         self.rnn = nn.GRU(input_size=channel_num_2, hidden_size=rnn_width, num_layers=2,
 39 |                            bias=True, batch_first=True, dropout=dropout_rnn, bidirectional=True)
 40 |         self.drop3 = nn.Dropout(p=dropout_rnn)
 41 | 
 42 |         self.fc = nn.Linear(in_features=rnn_width*2, out_features=1)
 43 | 
 44 |     def forward(self, x, cls=True):
 45 |         '''
 46 |         If don't want classification output, set cls=False
 47 |         '''
 48 |         if ('CUDA_VISIBLE_DEVICES' not in os.environ) or len(os.environ['CUDA_VISIBLE_DEVICES']) > 1:
 49 |             self.rnn.flatten_parameters()
 50 |         x = self.down(x)  # [B, 64, 500]
 51 | 
 52 |         x = F.relu_(self.conv1(x))
 53 |         x = self.pool1(x)
 54 |         x = self.norm1(x)
 55 |         x = self.drop1(x)  # [B, C1=200, T=50]
 56 | 
 57 |         x = F.relu_(self.conv2(x))
 58 |         x = self.norm2(x)
 59 |         x = self.drop2(x)  # [B, C2=200, T=25]
 60 | 
 61 |         x = x.permute([0, 2, 1])  # [B, T=25, C2=256]
 62 |         x, _ = self.rnn(x)  # [B, T=25, 512]
 63 |         x = self.drop3(x) # [B, T, F=120]
 64 | 
 65 |         if cls==True:
 66 |             x = torch.sigmoid(self.fc(x))
 67 |             x = x.squeeze()
 68 |         else:
 69 |             pass
 70 | 
 71 |         return x
 72 | 
 73 | 
 74 | 
 75 | class ConvBlock(nn.Module):
 76 |     def __init__(self, in_channels, out_channels):
 77 |         super(ConvBlock, self).__init__()
 78 | 
 79 |         self.conv1 = nn.Conv1d(in_channels=in_channels,
 80 |                                out_channels=out_channels,
 81 |                                kernel_size=3,
 82 |                                stride=1,
 83 |                                padding=1,
 84 |                                bias=False)
 85 | 
 86 |         self.conv2 = nn.Conv1d(in_channels=out_channels,
 87 |                                out_channels=out_channels,
 88 |                                kernel_size=3,
 89 |                                stride=1,
 90 |                                padding=1,
 91 |                                bias=False)
 92 | 
 93 |         self.bn1 = nn.BatchNorm1d(out_channels)
 94 |         self.bn2 = nn.BatchNorm1d(out_channels)
 95 | 
 96 |     def forward(self, input):
 97 |         """
 98 |         Args:
 99 |           input: (batch_size, in_channels, time_steps, freq_bins)
100 | 
101 |         Outputs:
102 |           output: (batch_size, out_channels, classes_num)
103 |         """
104 | 
105 |         x = F.relu_(self.bn1(self.conv1(input)))
106 |         x = F.relu_(self.bn2(self.conv2(x)))
107 | 
108 |         return x
109 | 
110 | 
111 | # if __name__ == '__main__':
112 | #     main()
113 | 


--------------------------------------------------------------------------------
/speechbrain/lobes/models/MetricGAN.py:
--------------------------------------------------------------------------------
  1 | """Generator and discriminator used in MetricGAN
  2 | 
  3 | Authors:
  4 | * Szu-Wei Fu 2020
  5 | """
  6 | import torch
  7 | import speechbrain as sb
  8 | from torch import nn
  9 | from torch.nn.utils import spectral_norm
 10 | 
 11 | 
 12 | def xavier_init_layer(
 13 |     in_size, out_size=None, spec_norm=True, layer_type=nn.Linear, **kwargs
 14 | ):
 15 |     "Create a layer with spectral norm, xavier uniform init and zero bias"
 16 |     if out_size is None:
 17 |         out_size = in_size
 18 | 
 19 |     layer = layer_type(in_size, out_size, **kwargs)
 20 |     if spec_norm:
 21 |         layer = spectral_norm(layer)
 22 | 
 23 |     # Perform initialization
 24 |     nn.init.xavier_uniform_(layer.weight, gain=1.0)
 25 |     nn.init.zeros_(layer.bias)
 26 | 
 27 |     return layer
 28 | 
 29 | 
 30 | def shifted_sigmoid(x):
 31 |     return 1.2 / (1 + torch.exp(-(1 / 1.6) * x))
 32 | 
 33 | 
 34 | class Learnable_sigmoid(nn.Module):
 35 |     def __init__(self, in_features=257):
 36 |         super().__init__()
 37 |         self.slope = nn.Parameter(torch.ones(in_features))
 38 |         self.slope.requiresGrad = True  # set requiresGrad to true!
 39 | 
 40 |         # self.scale = nn.Parameter(torch.ones(1))
 41 |         # self.scale.requiresGrad = True # set requiresGrad to true!
 42 | 
 43 |     def forward(self, x):
 44 |         return 1.2 * torch.sigmoid(self.slope * x)
 45 | 
 46 | 
 47 | class EnhancementGenerator(nn.Module):
 48 |     """Simple LSTM for enhancement with custom initialization.
 49 | 
 50 |     Arguments
 51 |     ---------
 52 |     input_size : int
 53 |         Size of the input tensor's last dimension.
 54 |     hidden_size : int
 55 |         Number of neurons to use in the LSTM layers.
 56 |     num_layers : int
 57 |         Number of layers to use in the LSTM.
 58 |     dropout : int
 59 |         Fraction of neurons to drop during training.
 60 |     """
 61 | 
 62 |     def __init__(
 63 |         self, input_size=257, hidden_size=200, num_layers=2, dropout=0,
 64 |     ):
 65 |         super().__init__()
 66 |         self.activation = nn.LeakyReLU(negative_slope=0.3)
 67 | 
 68 |         self.blstm = sb.nnet.RNN.LSTM(
 69 |             input_size=input_size,
 70 |             hidden_size=hidden_size,
 71 |             num_layers=num_layers,
 72 |             dropout=dropout,
 73 |             bidirectional=True,
 74 |         )
 75 |         """
 76 |         Use orthogonal init for recurrent layers, xavier uniform for input layers
 77 |         Bias is 0
 78 |         """
 79 |         for name, param in self.blstm.named_parameters():
 80 |             if "bias" in name:
 81 |                 nn.init.zeros_(param)
 82 |             elif "weight_ih" in name:
 83 |                 nn.init.xavier_uniform_(param)
 84 |             elif "weight_hh" in name:
 85 |                 nn.init.orthogonal_(param)
 86 | 
 87 |         self.linear1 = xavier_init_layer(400, 300, spec_norm=False)
 88 |         self.linear2 = xavier_init_layer(300, 257, spec_norm=False)
 89 | 
 90 |         self.Learnable_sigmoid = Learnable_sigmoid()
 91 |         self.sigmoid = nn.Sigmoid()
 92 | 
 93 |     def forward(self, x, lengths):
 94 |         out, _ = self.blstm(x, lengths=lengths)
 95 | 
 96 |         out = self.linear1(out)
 97 |         out = self.activation(out)
 98 | 
 99 |         out = self.linear2(out)
100 |         out = self.Learnable_sigmoid(out)
101 | 
102 |         return out
103 | 
104 | 
105 | class MetricDiscriminator(nn.Module):
106 |     """Metric estimator for enhancement training.
107 | 
108 |     Consists of:
109 |      * four 2d conv layers
110 |      * channel averaging
111 |      * three linear layers
112 | 
113 |     Arguments
114 |     ---------
115 |     kernel_size : tuple
116 |         The dimensions of the 2-d kernel used for convolution.
117 |     base_channels : int
118 |         Number of channels used in each conv layer.
119 |     """
120 | 
121 |     def __init__(
122 |         self, kernel_size=(5, 5), base_channels=15, activation=nn.LeakyReLU,
123 |     ):
124 |         super().__init__()
125 | 
126 |         self.activation = activation(negative_slope=0.3)
127 | 
128 |         self.BN = nn.BatchNorm2d(num_features=2, momentum=0.01)
129 | 
130 |         self.conv1 = xavier_init_layer(
131 |             2, base_channels, layer_type=nn.Conv2d, kernel_size=kernel_size
132 |         )
133 |         self.conv2 = xavier_init_layer(
134 |             base_channels, layer_type=nn.Conv2d, kernel_size=kernel_size
135 |         )
136 |         self.conv3 = xavier_init_layer(
137 |             base_channels, layer_type=nn.Conv2d, kernel_size=kernel_size
138 |         )
139 |         self.conv4 = xavier_init_layer(
140 |             base_channels, layer_type=nn.Conv2d, kernel_size=kernel_size
141 |         )
142 | 
143 |         self.Linear1 = xavier_init_layer(base_channels, out_size=50)
144 |         self.Linear2 = xavier_init_layer(in_size=50, out_size=10)
145 |         self.Linear3 = xavier_init_layer(in_size=10, out_size=1)
146 | 
147 |     def forward(self, x):
148 |         out = self.BN(x)
149 | 
150 |         out = self.conv1(out)
151 |         out = self.activation(out)
152 | 
153 |         out = self.conv2(out)
154 |         out = self.activation(out)
155 | 
156 |         out = self.conv3(out)
157 |         out = self.activation(out)
158 | 
159 |         out = self.conv4(out)
160 |         out = self.activation(out)
161 | 
162 |         out = torch.mean(out, (2, 3))
163 | 
164 |         out = self.Linear1(out)
165 |         out = self.activation(out)
166 | 
167 |         out = self.Linear2(out)
168 |         out = self.activation(out)
169 | 
170 |         out = self.Linear3(out)
171 | 
172 |         return out
173 | 


--------------------------------------------------------------------------------
/speechbrain/lobes/models/MetricGAN_U.py:
--------------------------------------------------------------------------------
  1 | """Generator and discriminator used in MetricGAN-U
  2 | 
  3 | Authors:
  4 | * Szu-Wei Fu 2020
  5 | """
  6 | import torch
  7 | import speechbrain as sb
  8 | from torch import nn
  9 | from torch.nn.utils import spectral_norm
 10 | 
 11 | 
 12 | def xavier_init_layer(
 13 |     in_size, out_size=None, spec_norm=True, layer_type=nn.Linear, **kwargs
 14 | ):
 15 |     "Create a layer with spectral norm, xavier uniform init and zero bias"
 16 |     if out_size is None:
 17 |         out_size = in_size
 18 | 
 19 |     layer = layer_type(in_size, out_size, **kwargs)
 20 |     if spec_norm:
 21 |         layer = spectral_norm(layer)
 22 | 
 23 |     # Perform initialization
 24 |     nn.init.xavier_uniform_(layer.weight, gain=1.0)
 25 |     nn.init.zeros_(layer.bias)
 26 | 
 27 |     return layer
 28 | 
 29 | 
 30 | class EnhancementGenerator(nn.Module):
 31 |     """Simple LSTM for enhancement with custom initialization.
 32 | 
 33 |     Arguments
 34 |     ---------
 35 |     input_size : int
 36 |         Size of the input tensor's last dimension.
 37 |     hidden_size : int
 38 |         Number of neurons to use in the LSTM layers.
 39 |     num_layers : int
 40 |         Number of layers to use in the LSTM.
 41 |     lin_dim: int
 42 |         Number of neurons in the last two linear layers.
 43 |     dropout : int
 44 |         Fraction of neurons to drop during training.
 45 | 
 46 |     Example
 47 |     -------
 48 |     >>> inputs = torch.rand([10, 100, 40])
 49 |     >>> model = EnhancementGenerator(input_size=40, hidden_size=50)
 50 |     >>> outputs = model(inputs, lengths=torch.ones([10]))
 51 |     >>> outputs.shape
 52 |     torch.Size([10, 100, 40])
 53 |     """
 54 | 
 55 |     def __init__(
 56 |         self,
 57 |         input_size=257,
 58 |         hidden_size=200,
 59 |         num_layers=2,
 60 |         lin_dim=300,
 61 |         dropout=0,
 62 |     ):
 63 |         super().__init__()
 64 |         self.activation = nn.LeakyReLU(negative_slope=0.3)
 65 | 
 66 |         self.blstm = sb.nnet.RNN.LSTM(
 67 |             input_size=input_size,
 68 |             hidden_size=hidden_size,
 69 |             num_layers=num_layers,
 70 |             dropout=dropout,
 71 |             bidirectional=True,
 72 |         )
 73 |         """
 74 |         Use orthogonal init for recurrent layers, xavier uniform for input layers
 75 |         Bias is 0
 76 |         """
 77 |         for name, param in self.blstm.named_parameters():
 78 |             if "bias" in name:
 79 |                 nn.init.zeros_(param)
 80 |             elif "weight_ih" in name:
 81 |                 nn.init.xavier_uniform_(param)
 82 |             elif "weight_hh" in name:
 83 |                 nn.init.orthogonal_(param)
 84 | 
 85 |         self.linear1 = xavier_init_layer(
 86 |             hidden_size * 2, lin_dim, spec_norm=False
 87 |         )
 88 |         self.linear2 = xavier_init_layer(lin_dim, input_size, spec_norm=False)
 89 | 
 90 |         self.sigmoid = nn.Sigmoid()
 91 | 
 92 |     def forward(self, x, lengths):
 93 |         out, _ = self.blstm(x, lengths=lengths)
 94 | 
 95 |         out = self.linear1(out)
 96 |         out = self.activation(out)
 97 | 
 98 |         out = self.linear2(out)
 99 |         out = self.sigmoid(out)
100 | 
101 |         return out
102 | 
103 | 
104 | class MetricDiscriminator(nn.Module):
105 |     """Metric estimator for enhancement training.
106 | 
107 |     Consists of:
108 |      * four 2d conv layers
109 |      * channel averaging
110 |      * three linear layers
111 | 
112 |     Arguments
113 |     ---------
114 |     kernel_size : tuple
115 |         The dimensions of the 2-d kernel used for convolution.
116 |     base_channels : int
117 |         Number of channels used in each conv layer.
118 |     lin_dim1: int
119 |         Dimensionality of the first linear layer.
120 |     lin_dim2: int
121 |         Dimensionality of the second linear layer.
122 | 
123 | 
124 |     Example
125 |     -------
126 |     >>> inputs = torch.rand([1, 1, 100, 257])
127 |     >>> model = MetricDiscriminator()
128 |     >>> outputs = model(inputs)
129 |     >>> outputs.shape
130 |     torch.Size([1, 1])
131 |     """
132 | 
133 |     # FCN
134 |     def __init__(
135 |         self,
136 |         kernel_size=(5, 5),
137 |         base_channels=15,
138 |         activation=nn.LeakyReLU,
139 |         lin_dim1=50,
140 |         lin_dim2=10,
141 |     ):
142 |         super().__init__()
143 | 
144 |         self.activation = activation(negative_slope=0.3)
145 | 
146 |         self.BN = nn.BatchNorm2d(num_features=1, momentum=0.01)
147 | 
148 |         self.conv1 = xavier_init_layer(
149 |             1, base_channels, layer_type=nn.Conv2d, kernel_size=kernel_size
150 |         )
151 |         self.conv2 = xavier_init_layer(
152 |             base_channels, layer_type=nn.Conv2d, kernel_size=kernel_size
153 |         )
154 |         self.conv3 = xavier_init_layer(
155 |             base_channels, layer_type=nn.Conv2d, kernel_size=kernel_size
156 |         )
157 |         self.conv4 = xavier_init_layer(
158 |             base_channels, layer_type=nn.Conv2d, kernel_size=kernel_size
159 |         )
160 | 
161 |         self.Linear1 = xavier_init_layer(base_channels, out_size=lin_dim1)
162 |         self.Linear2 = xavier_init_layer(in_size=lin_dim1, out_size=lin_dim2)
163 |         self.Linear3 = xavier_init_layer(in_size=lin_dim2, out_size=1)
164 | 
165 |     def forward(self, x):
166 | 
167 |         out = self.conv1(x)
168 |         out = self.activation(out)
169 | 
170 |         out = self.conv2(out)
171 |         out = self.activation(out)
172 | 
173 |         out = self.conv3(out)
174 |         out = self.activation(out)
175 | 
176 |         out = self.conv4(out)
177 |         out = self.activation(out)
178 | 
179 |         out = torch.mean(out, (2, 3))
180 | 
181 |         out = self.Linear1(out)
182 |         out = self.activation(out)
183 | 
184 |         out = self.Linear2(out)
185 |         out = self.activation(out)
186 | 
187 |         out = self.Linear3(out)
188 | 
189 |         return out
190 | 


--------------------------------------------------------------------------------
/speechbrain/lobes/models/RNNLM.py:
--------------------------------------------------------------------------------
  1 | """Implementation of a Recurrent Language Model.
  2 | 
  3 | Authors
  4 |  * Mirco Ravanelli 2020
  5 |  * Peter Plantinga 2020
  6 |  * Ju-Chieh Chou 2020
  7 |  * Titouan Parcollet 2020
  8 |  * Abdel 2020
  9 | """
 10 | import torch
 11 | from torch import nn
 12 | import speechbrain as sb
 13 | 
 14 | 
 15 | class RNNLM(nn.Module):
 16 |     """This model is a combination of embedding layer, RNN, DNN.
 17 |     It can be used for RNNLM.
 18 | 
 19 |     Arguments
 20 |     ---------
 21 |     output_neurons : int
 22 |         Number of entries in embedding table, also the number of neurons in
 23 |         output layer.
 24 |     embedding_dim : int
 25 |         Size of embedding vectors (default 128).
 26 |     activation : torch class
 27 |         A class used for constructing the activation layers for DNN.
 28 |     dropout : float
 29 |         Neuron dropout rate applied to embedding, RNN, and DNN.
 30 |     rnn_class : torch class
 31 |         The type of RNN to use in RNNLM network (LiGRU, LSTM, GRU, RNN)
 32 |     rnn_layers : int
 33 |         The number of recurrent layers to include.
 34 |     rnn_neurons : int
 35 |         Number of neurons in each layer of the RNN.
 36 |     rnn_re_init : bool
 37 |         Whether to initialize rnn with orthogonal initialization.
 38 |     rnn_return_hidden : bool
 39 |         Whether to return hidden states (default True).
 40 |     dnn_blocks : int
 41 |         The number of linear neural blocks to include.
 42 |     dnn_neurons : int
 43 |         The number of neurons in the linear layers.
 44 | 
 45 |     Example
 46 |     -------
 47 |     >>> model = RNNLM(output_neurons=5)
 48 |     >>> inputs = torch.Tensor([[1, 2, 3]])
 49 |     >>> outputs = model(inputs)
 50 |     >>> outputs.shape
 51 |     torch.Size([1, 3, 5])
 52 |     """
 53 | 
 54 |     def __init__(
 55 |         self,
 56 |         output_neurons,
 57 |         embedding_dim=128,
 58 |         activation=torch.nn.LeakyReLU,
 59 |         dropout=0.15,
 60 |         rnn_class=sb.nnet.RNN.LSTM,
 61 |         rnn_layers=2,
 62 |         rnn_neurons=1024,
 63 |         rnn_re_init=False,
 64 |         return_hidden=False,
 65 |         dnn_blocks=1,
 66 |         dnn_neurons=512,
 67 |     ):
 68 |         super().__init__()
 69 |         self.embedding = sb.nnet.embedding.Embedding(
 70 |             num_embeddings=output_neurons, embedding_dim=embedding_dim
 71 |         )
 72 |         self.dropout = nn.Dropout(p=dropout)
 73 |         self.rnn = rnn_class(
 74 |             input_size=embedding_dim,
 75 |             hidden_size=rnn_neurons,
 76 |             num_layers=rnn_layers,
 77 |             dropout=dropout,
 78 |             re_init=rnn_re_init,
 79 |         )
 80 |         self.return_hidden = return_hidden
 81 |         self.reshape = False
 82 | 
 83 |         self.dnn = sb.nnet.containers.Sequential(
 84 |             input_shape=[None, None, rnn_neurons]
 85 |         )
 86 |         for block_index in range(dnn_blocks):
 87 |             self.dnn.append(
 88 |                 sb.nnet.linear.Linear,
 89 |                 n_neurons=dnn_neurons,
 90 |                 bias=True,
 91 |                 layer_name="linear",
 92 |             )
 93 |             self.dnn.append(sb.nnet.normalization.LayerNorm, layer_name="norm")
 94 |             self.dnn.append(activation(), layer_name="act")
 95 |             self.dnn.append(torch.nn.Dropout(p=dropout), layer_name="dropout")
 96 | 
 97 |         self.out = sb.nnet.linear.Linear(
 98 |             input_size=dnn_neurons, n_neurons=output_neurons
 99 |         )
100 | 
101 |     def forward(self, x, hx=None):
102 | 
103 |         x = self.embedding(x)
104 |         x = self.dropout(x)
105 | 
106 |         # If 2d tensor, add a time-axis
107 |         # This is used for inference time
108 |         if len(x.shape) == 2:
109 |             x = x.unsqueeze(dim=1)
110 |             self.reshape = True
111 | 
112 |         x, hidden = self.rnn(x, hx)
113 |         x = self.dnn(x)
114 |         out = self.out(x)
115 | 
116 |         if self.reshape:
117 |             out = out.squeeze(dim=1)
118 | 
119 |         if self.return_hidden:
120 |             return out, hidden
121 |         else:
122 |             return out
123 | 


--------------------------------------------------------------------------------
/speechbrain/lobes/models/VanillaNN.py:
--------------------------------------------------------------------------------
 1 | """Vanilla Neural Network for simple tests.
 2 | 
 3 | Authors
 4 | * Elena Rastorgueva 2020
 5 | """
 6 | import torch
 7 | import speechbrain as sb
 8 | 
 9 | 
10 | class VanillaNN(sb.nnet.containers.Sequential):
11 |     """A simple vanilla Deep Neural Network.
12 | 
13 |     Arguments
14 |     ---------
15 |     activation : torch class
16 |         A class used for constructing the activation layers.
17 |     dnn_blocks : int
18 |         The number of linear neural blocks to include.
19 |     dnn_neurons : int
20 |         The number of neurons in the linear layers.
21 | 
22 |     Example
23 |     -------
24 |     >>> inputs = torch.rand([10, 120, 60])
25 |     >>> model = VanillaNN(input_shape=inputs.shape)
26 |     >>> outputs = model(inputs)
27 |     >>> outputs.shape
28 |     torch.Size([10, 120, 512])
29 |     """
30 | 
31 |     def __init__(
32 |         self,
33 |         input_shape,
34 |         activation=torch.nn.LeakyReLU,
35 |         dnn_blocks=2,
36 |         dnn_neurons=512,
37 |     ):
38 |         super().__init__(input_shape=input_shape)
39 | 
40 |         for block_index in range(dnn_blocks):
41 |             self.append(
42 |                 sb.nnet.linear.Linear,
43 |                 n_neurons=dnn_neurons,
44 |                 bias=True,
45 |                 layer_name="linear",
46 |             )
47 |             self.append(activation(), layer_name="act")
48 | 


--------------------------------------------------------------------------------
/speechbrain/lobes/models/__init__.py:
--------------------------------------------------------------------------------
1 | """ Package defining neural netword models (CRDNN, Xvectors ...)
2 | """
3 | 


--------------------------------------------------------------------------------
/speechbrain/lobes/models/convolution.py:
--------------------------------------------------------------------------------
  1 | """This is a module to ensemble a convolution (depthwise) encoder with or without residule connection.
  2 | 
  3 | Authors
  4 |  * Jianyuan Zhong 2020
  5 | """
  6 | import torch
  7 | from speechbrain.nnet.CNN import Conv2d
  8 | from speechbrain.nnet.containers import Sequential
  9 | from speechbrain.nnet.normalization import BatchNorm2d
 10 | 
 11 | 
 12 | class ConvolutionFrontEnd(Sequential):
 13 |     """This is a module to ensemble a convolution (depthwise) encoder with or
 14 |     without residual connection.
 15 | 
 16 |      Arguments
 17 |     ----------
 18 |     out_channels: int
 19 |         Number of output channels of this model (default 640).
 20 |     out_channels: Optional(list[int])
 21 |         Number of output channels for each of block.
 22 |     kernel_size: int
 23 |         Kernel size of convolution layers (default 3).
 24 |     strides: Optional(list[int])
 25 |         Striding factor for each block, this stride is applied at the last convolution layer at each block.
 26 |     num_blocks: int
 27 |         Number of block (default 21).
 28 |     num_per_layers: int
 29 |         Number of convolution layers for each block (default 5).
 30 |     dropout: float
 31 |         Dropout (default 0.15).
 32 |     activation: torch class
 33 |         Activation function for each block (default Swish).
 34 |     norm: torch class
 35 |         Normalization to regularize the model (default BatchNorm1d).
 36 |     residuals: Optional(list[bool])
 37 |         Whether apply residual connection at each block (default None).
 38 | 
 39 |     Example
 40 |     -------
 41 |     >>> x = torch.rand((8, 30, 10))
 42 |     >>> conv = ConvolutionFrontEnd(input_shape=x.shape)
 43 |     >>> out = conv(x)
 44 |     >>> out.shape
 45 |     torch.Size([8, 8, 3, 512])
 46 |     """
 47 | 
 48 |     def __init__(
 49 |         self,
 50 |         input_shape,
 51 |         num_blocks=3,
 52 |         num_layers_per_block=5,
 53 |         out_channels=[128, 256, 512],
 54 |         kernel_sizes=[3, 3, 3],
 55 |         strides=[1, 2, 2],
 56 |         dilations=[1, 1, 1],
 57 |         residuals=[True, True, True],
 58 |         conv_module=Conv2d,
 59 |         activation=torch.nn.LeakyReLU,
 60 |         norm=BatchNorm2d,
 61 |         dropout=0.1,
 62 |     ):
 63 |         super().__init__(input_shape=input_shape)
 64 |         for i in range(num_blocks):
 65 |             self.append(
 66 |                 ConvBlock,
 67 |                 num_layers=num_layers_per_block,
 68 |                 out_channels=out_channels[i],
 69 |                 kernel_size=kernel_sizes[i],
 70 |                 stride=strides[i],
 71 |                 dilation=dilations[i],
 72 |                 residual=residuals[i],
 73 |                 conv_module=conv_module,
 74 |                 activation=activation,
 75 |                 norm=norm,
 76 |                 dropout=dropout,
 77 |                 layer_name=f"convblock_{i}",
 78 |             )
 79 | 
 80 | 
 81 | class ConvBlock(torch.nn.Module):
 82 |     """An implementation of convolution block with 1d or 2d convolutions (depthwise).
 83 | 
 84 |     Arguments
 85 |     ----------
 86 |     out_channels : int
 87 |         Number of output channels of this model (default 640).
 88 |     kernel_size : int
 89 |         Kernel size of convolution layers (default 3).
 90 |     strides : int
 91 |         Striding factor for this block (default 1).
 92 |     num_layers : int
 93 |         Number of depthwise convolution layers for this block.
 94 |     activation : torch class
 95 |         Activation function for this block.
 96 |     norm : torch class
 97 |         Normalization to regularize the model (default BatchNorm1d).
 98 |     residuals: bool
 99 |         Whether apply residual connection at this block (default None).
100 | 
101 |     Example
102 |     -------
103 |     >>> x = torch.rand((8, 30, 10))
104 |     >>> conv = ConvBlock(2, 16, input_shape=x.shape)
105 |     >>> out = conv(x)
106 |     >>> out.shape
107 |     torch.Size([8, 30, 10, 16])
108 |     """
109 | 
110 |     def __init__(
111 |         self,
112 |         num_layers,
113 |         out_channels,
114 |         input_shape,
115 |         kernel_size=3,
116 |         stride=1,
117 |         dilation=1,
118 |         residual=False,
119 |         conv_module=Conv2d,
120 |         activation=torch.nn.LeakyReLU,
121 |         norm=None,
122 |         dropout=0.1,
123 |     ):
124 |         super().__init__()
125 | 
126 |         self.convs = Sequential(input_shape=input_shape)
127 | 
128 |         for i in range(num_layers):
129 |             self.convs.append(
130 |                 conv_module,
131 |                 out_channels=out_channels,
132 |                 kernel_size=kernel_size,
133 |                 stride=stride if i == num_layers - 1 else 1,
134 |                 dilation=dilation,
135 |                 layer_name=f"conv_{i}",
136 |             )
137 |             if norm is not None:
138 |                 self.convs.append(norm, layer_name=f"norm_{i}")
139 |             self.convs.append(activation(), layer_name=f"act_{i}")
140 |             self.convs.append(
141 |                 torch.nn.Dropout(dropout), layer_name=f"dropout_{i}"
142 |             )
143 | 
144 |         self.reduce_conv = None
145 |         self.drop = None
146 |         if residual:
147 |             self.reduce_conv = Sequential(input_shape=input_shape)
148 |             self.reduce_conv.append(
149 |                 conv_module,
150 |                 out_channels=out_channels,
151 |                 kernel_size=1,
152 |                 stride=stride,
153 |                 layer_name="conv",
154 |             )
155 |             self.reduce_conv.append(norm, layer_name="norm")
156 |             self.drop = torch.nn.Dropout(dropout)
157 | 
158 |     def forward(self, x):
159 |         out = self.convs(x)
160 |         if self.reduce_conv:
161 |             out = out + self.reduce_conv(x)
162 |             out = self.drop(out)
163 | 
164 |         return out
165 | 


--------------------------------------------------------------------------------
/speechbrain/lobes/models/transformer/TransformerLM.py:
--------------------------------------------------------------------------------
  1 | """An implementation of Transformer Language model.
  2 | 
  3 | Authors
  4 | * Jianyuan Zhong
  5 | * Samuele Cornell
  6 | """
  7 | 
  8 | 
  9 | import torch  # noqa 42
 10 | from torch import nn
 11 | 
 12 | from speechbrain.nnet.linear import Linear
 13 | from speechbrain.nnet.normalization import LayerNorm
 14 | from speechbrain.nnet.containers import ModuleList
 15 | from speechbrain.lobes.models.transformer.Transformer import (
 16 |     TransformerInterface,
 17 |     get_lookahead_mask,
 18 |     get_key_padding_mask,
 19 |     NormalizedEmbedding,
 20 | )
 21 | 
 22 | 
 23 | class TransformerLM(TransformerInterface):
 24 |     """This is an implementation of transformer language model.
 25 | 
 26 |     The architecture is based on the paper "Attention Is All You Need": https://arxiv.org/pdf/1706.03762.pdf
 27 | 
 28 |     Arguments
 29 |     ----------
 30 |     d_model : int
 31 |         The number of expected features in the encoder/decoder inputs (default=512).
 32 |     nhead : int
 33 |         The number of heads in the multiheadattention models (default=8).
 34 |     num_encoder_layers : int
 35 |         The number of sub-encoder-layers in the encoder (default=6).
 36 |     num_decoder_layers : int
 37 |         The number of sub-decoder-layers in the decoder (default=6).
 38 |     dim_ffn : int
 39 |         The dimension of the feedforward network model (default=2048).
 40 |     dropout : int
 41 |         The dropout value (default=0.1).
 42 |     activation: torch class
 43 |         The activation function of encoder/decoder intermediate layer, relu or gelu (default=relu).
 44 | 
 45 |     Example
 46 |     -------
 47 |     >>> src = torch.randint(0, 720, [8, 120])
 48 |     >>> net = TransformerLM(720, 512, 8, 1, 0, 1024, activation=torch.nn.GELU)
 49 |     >>> enc_out = net.forward(src)
 50 |     >>> print(enc_out.shape)
 51 |     torch.Size([8, 120, 720])
 52 |     """
 53 | 
 54 |     def __init__(
 55 |         self,
 56 |         vocab,
 57 |         d_model=512,
 58 |         nhead=8,
 59 |         num_encoder_layers=12,
 60 |         num_decoder_layers=0,
 61 |         d_ffn=2048,
 62 |         dropout=0.1,
 63 |         activation=nn.ReLU,
 64 |         positional_encoding="fixed_abs_sine",
 65 |         normalize_before=False,
 66 |         d_embedding=None,
 67 |         max_length=2500,
 68 |         causal=True,
 69 |         attention_type="regularMHA",
 70 |     ):
 71 |         super().__init__(
 72 |             d_model=d_model,
 73 |             nhead=nhead,
 74 |             num_encoder_layers=num_encoder_layers,
 75 |             num_decoder_layers=num_decoder_layers,
 76 |             d_ffn=d_ffn,
 77 |             dropout=dropout,
 78 |             activation=activation,
 79 |             positional_encoding=positional_encoding,
 80 |             normalize_before=normalize_before,
 81 |             max_length=max_length,
 82 |             causal=causal,
 83 |             attention_type=attention_type,
 84 |         )
 85 | 
 86 |         self.d_embedding = d_embedding
 87 |         if d_embedding is None:
 88 |             self.d_embedding = d_model
 89 | 
 90 |         self.custom_src_module = NormalizedEmbedding(self.d_embedding, vocab)
 91 | 
 92 |         self.embedding_proj = None
 93 |         if d_embedding is not None:
 94 |             self.embedding_proj = Linear(
 95 |                 input_size=self.d_embedding, n_neurons=d_model
 96 |             )
 97 | 
 98 |         self.output_proj = ModuleList(
 99 |             Linear(input_size=d_model, n_neurons=d_model),
100 |             LayerNorm(d_model, eps=1e-6),
101 |             Linear(input_size=d_model, n_neurons=vocab),
102 |         )
103 | 
104 |         self.num_encoder_layers = num_encoder_layers
105 |         self.num_decoder_layers = num_decoder_layers
106 | 
107 |         # reset the params of the transformer model
108 |         self._reset_params()
109 | 
110 |     def forward(self, src, hx=None):
111 |         """
112 |         Arguments
113 |         ---------
114 |         src : tensor
115 |             The sequence to the encoder (required).
116 |         """
117 |         src_mask, src_key_padding_mask = self.make_masks(src)
118 | 
119 |         src = self.custom_src_module(src)
120 |         if self.embedding_proj is not None:
121 |             src = self.embedding_proj(src)
122 |         src = src + self.positional_encoding(src)
123 |         if self.num_encoder_layers > 0:
124 |             encoder_out, _ = self.encoder(
125 |                 src=src,
126 |                 src_mask=src_mask,
127 |                 src_key_padding_mask=src_key_padding_mask,
128 |             )
129 | 
130 |         if self.num_decoder_layers > 0:
131 |             encoder_out, _ = self.decoder(
132 |                 src=src,
133 |                 tgt=src,
134 |                 tgt_mask=src_mask,
135 |                 tgt_key_padding_mask=src_key_padding_mask,
136 |             )
137 | 
138 |         pred = self.output_proj(encoder_out)
139 | 
140 |         return pred
141 | 
142 |     def _reset_params(self):
143 |         for p in self.parameters():
144 |             if p.dim() > 1:
145 |                 torch.nn.init.xavier_normal_(p)
146 | 
147 |     def make_masks(
148 |         self, src, pad_idx=0, look_ahead_mask=True, padding_mask=True
149 |     ):
150 |         src_mask = None
151 |         if look_ahead_mask:
152 |             src_mask = get_lookahead_mask(src)
153 | 
154 |         src_key_padding_mask = None
155 |         if padding_mask:
156 |             src_key_padding_mask = get_key_padding_mask(src, pad_idx)
157 | 
158 |         return src_mask, src_key_padding_mask
159 | 


--------------------------------------------------------------------------------
/speechbrain/lobes/models/transformer/TransformerSE.py:
--------------------------------------------------------------------------------
  1 | """CNN Transformer model for SE in the SpeechBrain style.
  2 | 
  3 | Authors
  4 | * Chien-Feng Liao 2020
  5 | """
  6 | import torch  # noqa E402
  7 | from torch import nn
  8 | from speechbrain.nnet.linear import Linear
  9 | from speechbrain.lobes.models.transformer.Transformer import (
 10 |     TransformerInterface,
 11 |     get_lookahead_mask,
 12 | )
 13 | 
 14 | 
 15 | class CNNTransformerSE(TransformerInterface):
 16 |     """This is an implementation of transformer model with CNN pre-encoder for SE.
 17 | 
 18 |     Arguments
 19 |     ---------
 20 |     d_model : int
 21 |         The number of expected features in the encoder inputs.
 22 |     output_size : int
 23 |         The number of neurons in the output layer.
 24 |     output_activation : torch class
 25 |         The activation function of the output layer (default=ReLU).
 26 |     nhead : int
 27 |         The number of heads in the multi-head attention models (default=8).
 28 |     num_layers : int
 29 |         The number of sub-layers in the transformer (default=8).
 30 |     d_ffn : int
 31 |         The number of expected features in the encoder layers (default=512).
 32 |     dropout : int
 33 |         The dropout value (default=0.1).
 34 |     activation : torch class
 35 |         The activation function of intermediate layers (default=LeakyReLU).
 36 |     causal : bool
 37 |         True for causal setting, the model is forbidden to see future frames (default=True).
 38 |     custom_emb_module : torch class
 39 |         Module that processes the input features before the transformer model.
 40 | 
 41 |     Example
 42 |     -------
 43 |     >>> src = torch.rand([8, 120, 256])
 44 |     >>> net = CNNTransformerSE(d_model=256, output_size=257)
 45 |     >>> out = net(src)
 46 |     >>> out.shape
 47 |     torch.Size([8, 120, 257])
 48 |     """
 49 | 
 50 |     def __init__(
 51 |         self,
 52 |         d_model,
 53 |         output_size,
 54 |         output_activation=nn.ReLU,
 55 |         nhead=8,
 56 |         num_layers=8,
 57 |         d_ffn=512,
 58 |         dropout=0.1,
 59 |         activation=nn.LeakyReLU,
 60 |         causal=True,
 61 |         custom_emb_module=None,
 62 |         normalize_before=False,
 63 |     ):
 64 |         super().__init__(
 65 |             d_model=d_model,
 66 |             nhead=nhead,
 67 |             num_encoder_layers=num_layers,
 68 |             num_decoder_layers=0,
 69 |             d_ffn=d_ffn,
 70 |             dropout=dropout,
 71 |             activation=activation,
 72 |             positional_encoding=None,
 73 |             normalize_before=normalize_before,
 74 |             causal=causal,
 75 |         )
 76 | 
 77 |         self.custom_emb_module = custom_emb_module
 78 |         self.output_layer = Linear(output_size, input_size=d_model, bias=False)
 79 |         self.output_activation = output_activation()
 80 | 
 81 |     def forward(self, x, src_key_padding_mask=None):
 82 |         if self.causal:
 83 |             self.attn_mask = get_lookahead_mask(x)
 84 |         else:
 85 |             self.attn_mask = None
 86 | 
 87 |         if self.custom_emb_module is not None:
 88 |             x = self.custom_emb_module(x)
 89 | 
 90 |         encoder_output, _ = self.encoder(
 91 |             src=x,
 92 |             src_mask=self.attn_mask,
 93 |             src_key_padding_mask=src_key_padding_mask,
 94 |         )
 95 | 
 96 |         output = self.output_layer(encoder_output)
 97 |         output = self.output_activation(output)
 98 | 
 99 |         return output
100 | 


--------------------------------------------------------------------------------
/speechbrain/lobes/models/transformer/__init__.py:
--------------------------------------------------------------------------------
1 | """High level processing blocks.
2 | 
3 | This subpackage gathers higher level blocks, or "lobes".
4 | The classes here may leverage the extended YAML syntax.
5 | """
6 | 


--------------------------------------------------------------------------------
/speechbrain/log-config.yaml:
--------------------------------------------------------------------------------
 1 | version: 1
 2 | disable_existing_loggers: False
 3 | formatters:
 4 |   simple:
 5 |     format: "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
 6 |   console:
 7 |     format: "%(name)s - %(message)s"
 8 | 
 9 | handlers:
10 |   console:
11 |     class: speechbrain.utils.logger.TqdmCompatibleStreamHandler
12 |     level: INFO
13 |     formatter: console
14 |     stream: ext://sys.stdout
15 | 
16 |   file_handler:
17 |     class: logging.FileHandler
18 |     level: DEBUG
19 |     formatter: simple
20 |     filename: log.txt
21 |     encoding: utf8
22 | 
23 | root:
24 |   level: DEBUG
25 |   handlers: [console, file_handler]
26 | 


--------------------------------------------------------------------------------
/speechbrain/nnet/__init__.py:
--------------------------------------------------------------------------------
 1 | """ Package containing the different neural networks layers
 2 | """
 3 | import os
 4 | 
 5 | __all__ = []
 6 | for filename in os.listdir(os.path.dirname(__file__)):
 7 |     filename = os.path.basename(filename)
 8 |     if filename.endswith(".py") and not filename.startswith("__"):
 9 |         __all__.append(filename[:-3])
10 | 
11 | from . import *  # noqa
12 | from .loss import stoi_loss  # noqa
13 | 


--------------------------------------------------------------------------------
/speechbrain/nnet/activations.py:
--------------------------------------------------------------------------------
  1 | """Library implementing activation functions.
  2 | 
  3 | Authors
  4 |  * Mirco Ravanelli 2020
  5 |  * Jianyuan Zhong 2020
  6 | """
  7 | 
  8 | import torch
  9 | import logging
 10 | import torch.nn.functional as F
 11 | 
 12 | logger = logging.getLogger(__name__)
 13 | 
 14 | 
 15 | class Softmax(torch.nn.Module):
 16 |     """Computes the softmax of a 2d, 3d, or 4d input tensor.
 17 | 
 18 |     Arguments
 19 |     ---------
 20 |     apply_log : bool
 21 |         Whether to apply the log function before softmax.
 22 |     dim : int
 23 |         If the dimension where softmax is applied.
 24 | 
 25 |     Example
 26 |     -------
 27 |     >>> classifier = Softmax()
 28 |     >>> inputs = torch.rand(10, 50, 40)
 29 |     >>> output = classifier(inputs)
 30 |     >>> output.shape
 31 |     torch.Size([10, 50, 40])
 32 |     """
 33 | 
 34 |     def __init__(self, apply_log=False, dim=-1):
 35 |         super().__init__()
 36 | 
 37 |         if apply_log:
 38 |             self.act = torch.nn.LogSoftmax(dim=dim)
 39 |         else:
 40 |             self.act = torch.nn.Softmax(dim=dim)
 41 | 
 42 |     def forward(self, x):
 43 |         """Returns the softmax of the input tensor.
 44 | 
 45 |         Arguments
 46 |         ---------
 47 |         x : torch.Tensor
 48 |             Input tensor.
 49 |         """
 50 |         # Reshaping the tensors
 51 |         dims = x.shape
 52 | 
 53 |         if len(dims) == 3:
 54 |             x = x.reshape(dims[0] * dims[1], dims[2])
 55 | 
 56 |         if len(dims) == 4:
 57 |             x = x.reshape(dims[0] * dims[1], dims[2], dims[3])
 58 | 
 59 |         x_act = self.act(x)
 60 | 
 61 |         # Retrieving the original shape format
 62 |         if len(dims) == 3:
 63 |             x_act = x_act.reshape(dims[0], dims[1], dims[2])
 64 | 
 65 |         if len(dims) == 4:
 66 |             x_act = x_act.reshape(dims[0], dims[1], dims[2], dims[3])
 67 | 
 68 |         return x_act
 69 | 
 70 | 
 71 | class GumbelSoftmax(torch.nn.Module):
 72 |     """Samples from the Gumbel-Softmax distribution and optionally discretizes.
 73 | 
 74 |     Reference: https://arxiv.org/abs/1611.00712, https://arxiv.org/abs/1611.01144
 75 | 
 76 |     Arguments
 77 |     ----------
 78 |     tau: float
 79 |         non-negative scalar temperature
 80 |     hard: bool
 81 |         if True, the returned samples will be discretized as one-hot vectors, but will be differentiated as if it is the soft sample in autograd
 82 |     dim: int
 83 |         A dimension along which softmax will be computed (default: -1).
 84 | 
 85 |     Example
 86 |     -------
 87 |     >>> x = torch.randn((8, 40, 120))
 88 |     >>> act = GumbelSoftmax(0.8, True)
 89 |     >>> x = act(x)
 90 |     """
 91 | 
 92 |     def __init__(self, tau, hard=False, apply_log=False):
 93 |         super().__init__()
 94 |         self.tau = tau
 95 |         self.hard = hard
 96 |         self.apply_log = apply_log
 97 | 
 98 |     def forward(self, x):
 99 |         if self.apply_log:
100 |             return torch.log(F.gumbel_softmax(x, tau=self.tau, hard=self.hard))
101 |         return F.gumbel_softmax(x, tau=self.tau, hard=self.hard)
102 | 
103 | 
104 | class Swish(torch.nn.Module):
105 |     """ The class implements the Swish activation function from
106 |     https://arxiv.org/pdf/2005.03191.pdf
107 | 
108 |     given input x. Swish(x) = x / (1 + exp(beta * x))
109 | 
110 |     Arguments
111 |     ---------
112 |     beta: float
113 |         Beta value.
114 | 
115 |     Example
116 |     -------
117 |     >>> x = torch.randn((8, 40, 120))
118 |     >>> act = Swish()
119 |     >>> x = act(x)
120 |     """
121 | 
122 |     def __init__(self, beta=1):
123 |         super().__init__()
124 |         self.beta = beta
125 |         self.sigmoid = torch.nn.Sigmoid()
126 | 
127 |     def forward(self, x):
128 |         """Returns the Swished input tensor.
129 | 
130 |         Arguments
131 |         ---------
132 |         x : torch.Tensor
133 |             Input tensor.
134 |         """
135 |         return x * self.sigmoid(self.beta * x)
136 | 


--------------------------------------------------------------------------------
/speechbrain/nnet/complex_networks/__init__.py:
--------------------------------------------------------------------------------
1 | """Package containing complex neural networks
2 | """
3 | 


--------------------------------------------------------------------------------
/speechbrain/nnet/complex_networks/c_linear.py:
--------------------------------------------------------------------------------
  1 | """Library implementing complex-valued linear transformation.
  2 | 
  3 | Authors
  4 |  * Titouan Parcollet 2020
  5 | """
  6 | 
  7 | import torch
  8 | import logging
  9 | from speechbrain.nnet.complex_networks.c_ops import (
 10 |     affect_init,
 11 |     complex_init,
 12 |     unitary_init,
 13 |     complex_linear_op,
 14 |     check_complex_input,
 15 | )
 16 | 
 17 | logger = logging.getLogger(__name__)
 18 | 
 19 | 
 20 | class CLinear(torch.nn.Module):
 21 |     """This function implements a fully connected complex-valued
 22 |     linear layer: y = Wx + b. y, W, x and b are thus complex
 23 |     numbers. A complex number is written as: r + xi. A tensor of
 24 |     complex numbers x = [batch, 32] can be understood as
 25 |     [batch, 0:15] = R and [batch, 16:31] = Xi. Thus the features
 26 |     dimension is cut in half (must be divisible by 2).
 27 | 
 28 |     Arguments
 29 |     ---------
 30 |     n_neurons : int
 31 |         It is the number of output neurons (i.e, the dimensionality of the
 32 |         output). Please note that these are complex-valued neurons. If 256
 33 |         neurons are specified, the output dimension will be 512.
 34 |     input_shape : tuple
 35 |         Expected size of the input.
 36 |     bias : bool
 37 |         if True, the additive bias b is adopted.
 38 |     init_criterion : str , optional
 39 |         (glorot, he).
 40 |         This parameter controls the initialization criterion of the weights.
 41 |         It is combined with weights_init to build the initialization method of
 42 |         the complex-valued weights (default "glorot").
 43 |     weight_init : str, optional
 44 |         (complex, unitary).
 45 |         This parameter defines the initialization procedure of the
 46 |         complex-valued weights (default "complex"). "complex" will generate random complex-valued
 47 |         weights following the init_criterion and the complex polar form.
 48 |         "unitary" will normalize the weights to lie on the unit circle.
 49 |         More details in: "Deep Complex Networks", Trabelsi C. et al.
 50 | 
 51 |     Example
 52 |     -------
 53 |     >>> inputs = torch.rand(10, 50, 40)
 54 |     >>> lin = CLinear(n_neurons=100, input_shape=inputs.shape)
 55 |     >>> output = lin(inputs)
 56 |     >>> output.shape
 57 |     torch.Size([10, 50, 200])
 58 |     """
 59 | 
 60 |     def __init__(
 61 |         self,
 62 |         n_neurons,
 63 |         input_shape,
 64 |         bias=True,
 65 |         init_criterion="glorot",
 66 |         weight_init="complex",
 67 |     ):
 68 |         super().__init__()
 69 |         self.n_neurons = n_neurons
 70 |         self.bias = bias
 71 |         self.init_criterion = init_criterion
 72 |         self.weight_init = weight_init
 73 | 
 74 |         # When initialising with speechbrain the input_shape is an integer !
 75 |         # we need to transform it into a list it works with all the question ops
 76 |         if isinstance(input_shape, int):
 77 |             input_shape = [1, input_shape]
 78 | 
 79 |         # Check the complex_valued form of the input
 80 |         check_complex_input(input_shape)
 81 | 
 82 |         # Computing the complex dimensionality of the input
 83 |         self.in_features = input_shape[-1] // 2
 84 |         self.out_features = self.n_neurons
 85 | 
 86 |         # Two weight matrices are created for the real and imaginary parts of
 87 |         # the weights. This will also allow an easier complex product.
 88 |         self.real_weight = torch.nn.Parameter(
 89 |             torch.Tensor(self.in_features, self.out_features)
 90 |         )
 91 |         self.imag_weight = torch.nn.Parameter(
 92 |             torch.Tensor(self.in_features, self.out_features)
 93 |         )
 94 | 
 95 |         if self.bias:
 96 |             self.b = torch.nn.Parameter(torch.Tensor(2 * self.out_features))
 97 |         else:
 98 |             self.b = torch.Tensor(2 * self.out_features).requires_grad_(False)
 99 | 
100 |         # Managing the weight initialization and bias
101 |         self.winit = {"complex": complex_init, "unitary": unitary_init}[
102 |             self.weight_init
103 |         ]
104 | 
105 |         affect_init(
106 |             self.real_weight, self.imag_weight, self.winit, init_criterion
107 |         )
108 | 
109 |     def forward(self, x):
110 |         """Returns the linear transformation of input tensor.
111 | 
112 |         Arguments
113 |         ---------
114 |         x : torch.Tensor
115 |             Input to transform linearly.
116 |         """
117 |         wx = complex_linear_op(x, self.real_weight, self.imag_weight, self.b)
118 | 
119 |         return wx
120 | 


--------------------------------------------------------------------------------
/speechbrain/nnet/dropout.py:
--------------------------------------------------------------------------------
 1 | """Library implementing dropout.
 2 | 
 3 | Authors
 4 |  * Mirco Ravanelli 2020
 5 | """
 6 | import torch  # noqa: F401
 7 | import logging
 8 | import torch.nn as nn
 9 | 
10 | logger = logging.getLogger(__name__)
11 | 
12 | 
13 | class Dropout2d(nn.Module):
14 |     """This function implements dropout 2d. It randomly put zeros on
15 |     entire channels.
16 | 
17 | 
18 |     Arguments
19 |     ---------
20 |     dropout_rate : float
21 |         It is the dropout factor (between 0 and 1).
22 |     inplace : bool
23 |         If True, it uses inplace operations.
24 | 
25 |     Example
26 |     -------
27 |     >>> drop = Dropout2d(drop_rate=0.5)
28 |     >>> inputs = torch.rand(10, 50, 40)
29 |     >>> output=drop(inputs)
30 |     >>> output.shape
31 |     torch.Size([10, 50, 40])
32 |     """
33 | 
34 |     def __init__(
35 |         self, drop_rate, inplace=False,
36 |     ):
37 |         super().__init__()
38 |         self.drop_rate = drop_rate
39 |         self.inplace = inplace
40 |         self.drop = nn.Dropout2d(p=self.drop_rate, inplace=self.inplace)
41 | 
42 |     def forward(self, x):
43 |         """Applies dropout 2d to the input tensor.
44 | 
45 |         Arguments
46 |         ---------
47 |         x : torch.Tensor (batch, time, channel1, channel2)
48 |             input to normalize. 4d tensors are expected.
49 |         """
50 | 
51 |         # time must be the last
52 |         x = x.transpose(1, 2).transpose(2, -1)
53 |         x_drop = self.drop(x)
54 |         x_drop = x_drop.transpose(-1, 1).transpose(2, -1)
55 | 
56 |         return x_drop
57 | 


--------------------------------------------------------------------------------
/speechbrain/nnet/embedding.py:
--------------------------------------------------------------------------------
  1 | """Library implementing embedding.
  2 | 
  3 | Authors
  4 |  * Abdelwahab Heba 2020
  5 | """
  6 | 
  7 | import torch
  8 | import logging
  9 | import torch.nn as nn
 10 | 
 11 | logger = logging.getLogger(__name__)
 12 | 
 13 | 
 14 | class Embedding(nn.Module):
 15 |     """Computes an embedding x = wx.
 16 | 
 17 |     Arguments
 18 |     ---------
 19 |     num_embeddings : int
 20 |         Size of the dictionary of embeddings.
 21 |     embedding_dim : int
 22 |         It is the dim of embedding (i.e, the dimensionality of the output).
 23 |     consider_as_one_hot : bool
 24 |         Create non-trainable one-hot vector.
 25 |     blank_id : int
 26 |         If consider_as_one_hot == True: consider the embedding as one_hot
 27 |         and use blank_index as zero one_hot vector.
 28 | 
 29 |     Example
 30 |     -------
 31 |     >>> from speechbrain.nnet.embedding import Embedding
 32 |     >>> import torch
 33 |     >>> emb = Embedding(
 34 |     ...     num_embeddings=40,
 35 |     ...     embedding_dim=39,
 36 |     ...     consider_as_one_hot=True,
 37 |     ...     blank_id=39
 38 |     ... )
 39 |     >>> inputs = torch.Tensor([10,5,2,0,39]).long()
 40 |     >>> output = emb(inputs)
 41 |     >>> output.shape
 42 |     torch.Size([5, 39])
 43 |     >>> output
 44 |     tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.,
 45 |              0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
 46 |              0., 0., 0.],
 47 |             [0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
 48 |              0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
 49 |              0., 0., 0.],
 50 |             [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
 51 |              0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
 52 |              0., 0., 0.],
 53 |             [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
 54 |              0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
 55 |              0., 0., 0.],
 56 |             [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
 57 |              0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
 58 |              0., 0., 0.]])
 59 |     >>> emb = Embedding(num_embeddings=5, embedding_dim=3, consider_as_one_hot=False)
 60 |     >>> e = emb(torch.LongTensor([[0, 1, 2], [3, 4, 2]]))
 61 |     >>> e.shape
 62 |     torch.Size([2, 3, 3])
 63 |      """
 64 | 
 65 |     def __init__(
 66 |         self,
 67 |         num_embeddings,
 68 |         embedding_dim=128,
 69 |         consider_as_one_hot=False,
 70 |         blank_id=0,
 71 |     ):
 72 | 
 73 |         super().__init__()
 74 |         self.num_embeddings = num_embeddings
 75 |         self.consider_as_one_hot = consider_as_one_hot
 76 |         if self.consider_as_one_hot:
 77 |             self.embedding_dim = self.num_embeddings - 1
 78 |         else:
 79 |             self.embedding_dim = embedding_dim
 80 |         self.blank_id = blank_id
 81 | 
 82 |         if self.consider_as_one_hot:
 83 |             # deal with blank_id, the output should be embedding_dim-1 as we consider blank output as zeros one_hot vect
 84 |             # padding_idx fix the idx row to zeros
 85 |             self.Embedding = nn.Embedding(
 86 |                 self.num_embeddings,
 87 |                 self.embedding_dim,
 88 |                 padding_idx=self.blank_id,
 89 |             )
 90 |             one_hot = torch.eye(self.embedding_dim)
 91 |             if self.blank_id + 1 != self.num_embeddings:
 92 |                 self.Embedding.weight.data[self.blank_id + 1 :] = one_hot[
 93 |                     self.blank_id :
 94 |                 ]
 95 |             if self.blank_id != 0:
 96 |                 self.Embedding.weight.data[: self.blank_id] = one_hot[
 97 |                     : self.blank_id
 98 |                 ]
 99 |             self.Embedding.weight.requires_grad = False
100 |         else:
101 |             self.Embedding = nn.Embedding(
102 |                 self.num_embeddings, self.embedding_dim
103 |             )
104 | 
105 |     def forward(self, x):
106 |         """Returns the embedding of input tensor.
107 | 
108 |         Arguments
109 |         ---------
110 |         x : torch.Tensor
111 |            Input to embed.
112 |         """
113 |         # pytorch embedding layer only accept long dtype
114 |         return self.Embedding(x.long())
115 | 


--------------------------------------------------------------------------------
/speechbrain/nnet/linear.py:
--------------------------------------------------------------------------------
  1 | """Library implementing linear transformation.
  2 | 
  3 | Authors
  4 |  * Mirco Ravanelli 2020
  5 |  * Davide Borra 2021
  6 | """
  7 | 
  8 | import torch
  9 | import logging
 10 | import torch.nn as nn
 11 | 
 12 | logger = logging.getLogger(__name__)
 13 | 
 14 | 
 15 | class Linear(torch.nn.Module):
 16 |     """Computes a linear transformation y = wx + b.
 17 | 
 18 |     Arguments
 19 |     ---------
 20 |     n_neurons : int
 21 |         It is the number of output neurons (i.e, the dimensionality of the
 22 |         output).
 23 |     input_shape: tuple
 24 |         It is the shape of the input tensor.
 25 |     input_size: int
 26 |         Size of the input tensor.
 27 |     bias : bool
 28 |         If True, the additive bias b is adopted.
 29 |     combine_dims : bool
 30 |         If True and the input is 4D, combine 3rd and 4th dimensions of input.
 31 | 
 32 |     Example
 33 |     -------
 34 |     >>> inputs = torch.rand(10, 50, 40)
 35 |     >>> lin_t = Linear(input_shape=(10, 50, 40), n_neurons=100)
 36 |     >>> output = lin_t(inputs)
 37 |     >>> output.shape
 38 |     torch.Size([10, 50, 100])
 39 |     """
 40 | 
 41 |     def __init__(
 42 |         self,
 43 |         n_neurons,
 44 |         input_shape=None,
 45 |         input_size=None,
 46 |         bias=True,
 47 |         combine_dims=False,
 48 |     ):
 49 |         super().__init__()
 50 |         self.combine_dims = combine_dims
 51 | 
 52 |         if input_shape is None and input_size is None:
 53 |             raise ValueError("Expected one of input_shape or input_size")
 54 | 
 55 |         if input_size is None:
 56 |             input_size = input_shape[-1]
 57 |             if len(input_shape) == 4 and self.combine_dims:
 58 |                 input_size = input_shape[2] * input_shape[3]
 59 | 
 60 |         # Weights are initialized following pytorch approach
 61 |         self.w = nn.Linear(input_size, n_neurons, bias=bias)
 62 | 
 63 |     def forward(self, x):
 64 |         """Returns the linear transformation of input tensor.
 65 | 
 66 |         Arguments
 67 |         ---------
 68 |         x : torch.Tensor
 69 |             Input to transform linearly.
 70 |         """
 71 |         if x.ndim == 4 and self.combine_dims:
 72 |             x = x.reshape(x.shape[0], x.shape[1], x.shape[2] * x.shape[3])
 73 | 
 74 |         wx = self.w(x)
 75 | 
 76 |         return wx
 77 | 
 78 | 
 79 | class LinearWithConstraint(Linear):
 80 |     """Computes a linear transformation y = wx + b with kernel max-norm constaint.
 81 |     This corresponds to set an upper bound for the kernel norm.
 82 | 
 83 |     Arguments
 84 |     ---------
 85 |     n_neurons : int
 86 |         It is the number of output neurons (i.e, the dimensionality of the
 87 |         output).
 88 |     input_shape: tuple
 89 |         It is the shape of the input tensor.
 90 |     input_size: int
 91 |         Size of the input tensor.
 92 |     bias : bool
 93 |         If True, the additive bias b is adopted.
 94 |     combine_dims : bool
 95 |         If True and the input is 4D, combine 3rd and 4th dimensions of input.
 96 |     max_norm : float
 97 |         Kernel max-norm
 98 | 
 99 |     Example
100 |     -------
101 |     >>> inputs = torch.rand(100,)
102 |     >>> max_norm = 1.
103 |     >>> lin_t_contrained = LinearWithConstraint(input_size=inputs.shape[0], n_neurons=2, max_norm=max_norm)
104 |     >>> output = lin_t_contrained(inputs)
105 |     >>> torch.any(torch.norm(lin_t_contrained.w.weight.data, p=2, dim=0)>max_norm)
106 |     tensor(False)
107 |     """
108 | 
109 |     def __init__(self, *args, max_norm=1, **kwargs):
110 |         self.max_norm = max_norm
111 |         super(LinearWithConstraint, self).__init__(*args, **kwargs)
112 | 
113 |     def forward(self, x):
114 |         """Returns the linear transformation of input tensor.
115 | 
116 |         Arguments
117 |         ---------
118 |         x : torch.Tensor
119 |             Input to transform linearly.
120 |         """
121 |         self.w.weight.data = torch.renorm(
122 |             self.w.weight.data, p=2, dim=0, maxnorm=self.max_norm
123 |         )
124 |         return super(LinearWithConstraint, self).forward(x)
125 | 


--------------------------------------------------------------------------------
/speechbrain/nnet/loss/__init__.py:
--------------------------------------------------------------------------------
1 | """Package containing specific losses (transducer, stoi ...)
2 | """
3 | 


--------------------------------------------------------------------------------
/speechbrain/nnet/loss/guidedattn_loss.py:
--------------------------------------------------------------------------------
  1 | """The Guided Attention Loss implementation
  2 | 
  3 | This loss can be used to speed up the training of
  4 | models in which the correspondence between inputs and
  5 | outputs is roughly linear, and the attention alignments
  6 | are expected to be approximately diagonal, such as Grapheme-to-Phoneme
  7 | and Text-to-Speech
  8 | 
  9 | Authors
 10 | * Artem Ploujnikov 2021
 11 | """
 12 | 
 13 | import torch
 14 | from torch import nn
 15 | 
 16 | 
 17 | class GuidedAttentionLoss(nn.Module):
 18 |     """
 19 |     A loss implementation that forces attention matrices to be
 20 |     near-diagonal, imposing progressively larger penalties for paying
 21 |     attention to regions far away from the diagonal). It is useful
 22 |     for sequence-to-sequence models in which the sequence of outputs
 23 |     is expected to corrsespond closely to the sequence of inputs,
 24 |     such as TTS or G2P
 25 | 
 26 |     https://arxiv.org/abs/1710.08969
 27 | 
 28 |     The implementation is inspired by the R9Y9 DeepVoice3 model
 29 |     https://github.com/r9y9/deepvoice3_pytorch
 30 | 
 31 |     It should be roughly equivalent to it; however, it has been
 32 |     fully vectorized.
 33 | 
 34 |     Arguments
 35 |     ---------
 36 |     sigma:
 37 |         the guided attention weight
 38 | 
 39 |     Example
 40 |     -------
 41 |     NOTE: In a real scenario, the input_lengths and
 42 |     target_lengths would come from a data batch,
 43 |     whereas alignments would come from a model
 44 |     >>> import torch
 45 |     >>> from speechbrain.nnet.loss.guidedattn_loss import GuidedAttentionLoss
 46 |     >>> loss = GuidedAttentionLoss(sigma=0.2)
 47 |     >>> input_lengths = torch.tensor([2, 3])
 48 |     >>> target_lengths = torch.tensor([3, 4])
 49 |     >>> alignments = torch.tensor(
 50 |     ...     [
 51 |     ...         [
 52 |     ...             [0.8, 0.2, 0.0],
 53 |     ...             [0.4, 0.6, 0.0],
 54 |     ...             [0.2, 0.8, 0.0],
 55 |     ...             [0.0, 0.0, 0.0],
 56 |     ...         ],
 57 |     ...         [
 58 |     ...             [0.6, 0.2, 0.2],
 59 |     ...             [0.1, 0.7, 0.2],
 60 |     ...             [0.3, 0.4, 0.3],
 61 |     ...             [0.2, 0.3, 0.5],
 62 |     ...         ],
 63 |     ...     ]
 64 |     ... )
 65 |     >>> loss(alignments, input_lengths, target_lengths)
 66 |     tensor(0.1142)
 67 |     """
 68 | 
 69 |     def __init__(self, sigma=0.2):
 70 |         super().__init__()
 71 |         self.sigma = sigma
 72 |         self.weight_factor = 2 * (sigma ** 2)
 73 | 
 74 |     def forward(
 75 |         self,
 76 |         attention,
 77 |         input_lengths,
 78 |         target_lengths,
 79 |         max_input_len=None,
 80 |         max_target_len=None,
 81 |     ):
 82 |         """
 83 |         Computes the guided attention loss for a single batch
 84 | 
 85 |         Arguments
 86 |         ---------
 87 |         attention: torch.Tensor
 88 |             A padded attention/alignments matrix
 89 |             (batch, targets, inputs)
 90 |         input_lengths: torch.tensor
 91 |             A (batch, lengths) tensor of input lengths
 92 |         target_lengths: torch.tensor
 93 |             A (batch, lengths) tensor of target lengths
 94 |         max_input_len: int
 95 |             The maximum input length - optional,
 96 |             if not computed will be set to the maximum
 97 |             of target_lengths. Setting it explicitly
 98 |             might be necessary when using data parallelism
 99 |         max_target_len: int
100 |             The maximum target length - optional,
101 |             if not computed will be set to the maximum
102 |             of target_lengths. Setting it explicitly
103 |             might be necessary when using data parallelism
104 | 
105 | 
106 |         Returns
107 |         -------
108 |         loss: torch.Tensor
109 |             A single-element tensor with the loss value
110 |         """
111 |         soft_mask = self.guided_attentions(
112 |             input_lengths, target_lengths, max_input_len, max_target_len
113 |         )
114 |         return (attention * soft_mask.transpose(-1, -2)).mean()
115 | 
116 |     def guided_attentions(
117 |         self,
118 |         input_lengths,
119 |         target_lengths,
120 |         max_input_len=None,
121 |         max_target_len=None,
122 |     ):
123 |         """
124 |         Computes guided attention matrices
125 | 
126 |         Arguments
127 |         ---------
128 |         input_lengths: torch.Tensor
129 |             A tensor of input lengths
130 |         target_lengths: torch.Tensor
131 |             A tensor of target lengths
132 |         max_input_len: int
133 |             The maximum input length - optional,
134 |             if not computed will be set to the maximum
135 |             of target_lengths. Setting it explicitly
136 |             might be necessary when using data parallelism
137 |         max_target_len: int
138 |             The maximum target length - optional,
139 |             if not computed will be set to the maximum
140 |             of target_lengths. Setting it explicitly
141 |             might be necessary when using data parallelism
142 | 
143 |         Returns
144 |         -------
145 |         soft_mask: torch.Tensor
146 |             The guided attention tensor of shape (batch, max_input_len, max_target_len)
147 |         """
148 |         input_lengths_broad = input_lengths.view(-1, 1, 1)
149 |         target_lengths_broad = target_lengths.view(-1, 1, 1)
150 |         if max_input_len is None:
151 |             max_input_len = input_lengths.max()
152 |         if max_target_len is None:
153 |             max_target_len = target_lengths.max()
154 |         input_mesh, target_mesh = torch.meshgrid(
155 |             torch.arange(max_input_len).to(input_lengths.device),
156 |             torch.arange(max_target_len).to(target_lengths.device),
157 |         )
158 |         input_mesh, target_mesh = (
159 |             input_mesh.unsqueeze(0),
160 |             target_mesh.unsqueeze(0),
161 |         )
162 |         input_lengths_broad = input_lengths.view(-1, 1, 1)
163 |         target_lengths_broad = target_lengths.view(-1, 1, 1)
164 |         soft_mask = 1.0 - torch.exp(
165 |             -(
166 |                 (
167 |                     input_mesh / input_lengths_broad
168 |                     - target_mesh / target_lengths_broad
169 |                 )
170 |                 ** 2
171 |             )
172 |             / self.weight_factor
173 |         )
174 |         outside = (input_mesh >= input_lengths_broad) | (
175 |             target_mesh >= target_lengths_broad
176 |         )
177 |         soft_mask[outside] = 0.0
178 |         return soft_mask
179 | 


--------------------------------------------------------------------------------
/speechbrain/nnet/loss/si_snr_loss.py:
--------------------------------------------------------------------------------
 1 | """
 2 | # Authors:
 3 |  * Szu-Wei, Fu 2021
 4 |  * Mirco Ravanelli 2020
 5 |  * Samuele Cornell 2020
 6 |  * Hwidong Na 2020
 7 |  * Yan Gao 2020
 8 |  * Titouan Parcollet 2020
 9 | """
10 | 
11 | import torch
12 | import numpy as np
13 | 
14 | smallVal = np.finfo("float").eps  # To avoid divide by zero
15 | 
16 | 
17 | def si_snr_loss(y_pred_batch, y_true_batch, lens, reduction="mean"):
18 |     """Compute the si_snr score and return -1 * that score.
19 | 
20 |     This function can be used as a loss function for training
21 |     with SGD-based updates.
22 | 
23 |     Arguments
24 |     ---------
25 |     y_pred_batch : torch.Tensor
26 |         The degraded (enhanced) waveforms.
27 |     y_true_batch : torch.Tensor
28 |         The clean (reference) waveforms.
29 |     lens : torch.Tensor
30 |         The relative lengths of the waveforms within the batch.
31 |     reduction : str
32 |         The type of reduction ("mean" or "batch") to use.
33 | 
34 |     Example
35 |     -------
36 |     """
37 | 
38 |     y_pred_batch = torch.squeeze(y_pred_batch, dim=-1)
39 |     y_true_batch = torch.squeeze(y_true_batch, dim=-1)
40 | 
41 |     batch_size = y_pred_batch.shape[0]
42 |     SI_SNR = torch.zeros(batch_size)
43 | 
44 |     for i in range(0, batch_size):  # Run over mini-batches
45 |         s_target = y_true_batch[i, 0 : int(lens[i] * y_pred_batch.shape[1])]
46 |         s_estimate = y_pred_batch[i, 0 : int(lens[i] * y_pred_batch.shape[1])]
47 | 
48 |         # s_target = <s', s>s / ||s||^2
49 |         dot = torch.sum(s_estimate * s_target, dim=0, keepdim=True)
50 |         s_target_energy = (
51 |             torch.sum(s_target ** 2, dim=0, keepdim=True) + smallVal
52 |         )
53 |         proj = dot * s_target / s_target_energy
54 | 
55 |         # e_noise = s' - s_target
56 |         e_noise = s_estimate - proj
57 | 
58 |         # SI-SNR = 10 * log_10(||s_target||^2 / ||e_noise||^2)
59 |         si_snr_beforelog = torch.sum(proj ** 2, dim=0) / (
60 |             torch.sum(e_noise ** 2, dim=0) + smallVal
61 |         )
62 |         SI_SNR[i] = 10 * torch.log10(si_snr_beforelog + smallVal)
63 | 
64 |     if reduction == "mean":
65 |         return -SI_SNR.mean()
66 | 
67 |     return -SI_SNR
68 | 


--------------------------------------------------------------------------------
/speechbrain/nnet/quaternion_networks/__init__.py:
--------------------------------------------------------------------------------
1 | """Package containing quaternion neural networks
2 | """
3 | 


--------------------------------------------------------------------------------
/speechbrain/nnet/quaternion_networks/q_normalization.py:
--------------------------------------------------------------------------------
  1 | """Library implementing quaternion-valued normalization.
  2 | 
  3 | Authors
  4 |  * Titouan Parcollet 2020
  5 | """
  6 | 
  7 | import torch
  8 | from torch.nn import Parameter
  9 | 
 10 | 
 11 | class QBatchNorm(torch.nn.Module):
 12 |     """This class implements the simplest form of a quaternion batchnorm as
 13 |     described in : "Quaternion Convolutional Neural Network for
 14 |     Color Image Classification and Forensics", Qilin Y. et al.
 15 | 
 16 |     Arguments
 17 |     ---------
 18 |     input_size : int
 19 |         Expected size of the dimension to be normalized.
 20 |     dim : int, optional
 21 |         It defines the axis that should be normalized. It usually correspond to
 22 |         the channel dimension (default -1).
 23 |     gamma_init : float, optional
 24 |         First value of gamma to be used (mean) (default 1.0).
 25 |     beta_param : bool, optional
 26 |         When set to True the beta parameter of the BN is applied (default True).
 27 |     momentum : float, optional
 28 |         It defines the momentum as for the real-valued batch-normalization (default 0.1).
 29 |     eps : float, optional
 30 |         Term used to stabilize operation (default 1e-4).
 31 |     track_running_stats : bool, optional
 32 |         Equivalent to the real-valued batchnormalization parameter.
 33 |         When True, stats are tracked. When False, solely statistics computed
 34 |         over the batch are used (default True).
 35 | 
 36 | 
 37 |     Example
 38 |     -------
 39 |     >>> inp_tensor = torch.rand([10, 40])
 40 |     >>> QBN = QBatchNorm(input_size=40)
 41 |     >>> out_tensor = QBN(inp_tensor)
 42 |     >>> out_tensor.shape
 43 |     torch.Size([10, 40])
 44 | 
 45 |     """
 46 | 
 47 |     def __init__(
 48 |         self,
 49 |         input_size,
 50 |         dim=-1,
 51 |         gamma_init=1.0,
 52 |         beta_param=True,
 53 |         momentum=0.1,
 54 |         eps=1e-4,
 55 |         track_running_stats=True,
 56 |     ):
 57 |         super(QBatchNorm, self).__init__()
 58 | 
 59 |         self.num_features = input_size // 4
 60 |         self.gamma_init = gamma_init
 61 |         self.beta_param = beta_param
 62 |         self.momentum = momentum
 63 |         self.dim = dim
 64 |         self.eps = eps
 65 |         self.track_running_stats = track_running_stats
 66 | 
 67 |         self.gamma = Parameter(torch.full([self.num_features], self.gamma_init))
 68 |         self.beta = Parameter(
 69 |             torch.zeros(self.num_features * 4), requires_grad=self.beta_param
 70 |         )
 71 | 
 72 |         # instantiate moving statistics
 73 |         if track_running_stats:
 74 |             self.register_buffer(
 75 |                 "running_mean", torch.zeros(self.num_features * 4)
 76 |             )
 77 |             self.register_buffer("running_var", torch.ones(self.num_features))
 78 |             self.register_buffer(
 79 |                 "num_batches_tracked", torch.tensor(0, dtype=torch.long)
 80 |             )
 81 |         else:
 82 |             self.register_parameter("running_mean", None)
 83 |             self.register_parameter("running_var", None)
 84 |             self.register_parameter("num_batches_tracked", None)
 85 | 
 86 |     def forward(self, input):
 87 |         """Returns the normalized input tensor.
 88 | 
 89 |         Arguments
 90 |         ---------
 91 |         input : torch.Tensor (batch, time, [channels])
 92 |             Input to normalize. It can be 2d, 3d, 4d.
 93 |         """
 94 | 
 95 |         exponential_average_factor = 0.0
 96 | 
 97 |         # Entering training mode
 98 |         if self.training:
 99 |             if self.num_batches_tracked is not None:
100 |                 self.num_batches_tracked = self.num_batches_tracked + 1
101 | 
102 |             if self.momentum is None:  # use cumulative moving average
103 |                 exponential_average_factor = (
104 |                     1.0 / self.num_batches_tracked.item()
105 |                 )
106 |             else:  # use exponential moving average
107 |                 exponential_average_factor = self.momentum
108 | 
109 |             # Get mean along batch axis
110 |             mu = torch.mean(input, dim=0)
111 |             mu_r, mu_i, mu_j, mu_k = torch.chunk(mu, 4, dim=self.dim)
112 | 
113 |             # Get variance along batch axis
114 |             delta = input - mu
115 |             delta_r, delta_i, delta_j, delta_k = torch.chunk(
116 |                 delta, 4, dim=self.dim
117 |             )
118 |             quat_variance = torch.mean(
119 |                 (delta_r ** 2 + delta_i ** 2 + delta_j ** 2 + delta_k ** 2),
120 |                 dim=0,
121 |             )
122 | 
123 |             denominator = torch.sqrt(quat_variance + self.eps)
124 | 
125 |             # x - mu / sqrt(var + e)
126 |             out = input / torch.cat(
127 |                 [denominator, denominator, denominator, denominator],
128 |                 dim=self.dim,
129 |             )
130 | 
131 |             # Update the running stats
132 |             if self.track_running_stats:
133 |                 self.running_mean = (
134 |                     1 - exponential_average_factor
135 |                 ) * self.running_mean + exponential_average_factor * mu.view(
136 |                     self.running_mean.size()
137 |                 )
138 | 
139 |                 self.running_var = (
140 |                     1 - exponential_average_factor
141 |                 ) * self.running_var + exponential_average_factor * quat_variance.view(
142 |                     self.running_var.size()
143 |                 )
144 |         else:
145 |             q_var = torch.cat(
146 |                 [
147 |                     self.running_var,
148 |                     self.running_var,
149 |                     self.running_var,
150 |                     self.running_var,
151 |                 ],
152 |                 dim=self.dim,
153 |             )
154 |             out = (input - self.running_mean) / q_var
155 | 
156 |         # lambda * (x - mu / sqrt(var + e)) + beta
157 | 
158 |         q_gamma = torch.cat(
159 |             [self.gamma, self.gamma, self.gamma, self.gamma], dim=self.dim
160 |         )
161 |         out = (q_gamma * out) + self.beta
162 | 
163 |         return out
164 | 


--------------------------------------------------------------------------------
/speechbrain/nnet/transducer/__init__.py:
--------------------------------------------------------------------------------
1 | """Package containing transducer neural networks
2 | """
3 | 


--------------------------------------------------------------------------------
/speechbrain/nnet/transducer/transducer_joint.py:
--------------------------------------------------------------------------------
 1 | """Library implementing transducer_joint.
 2 | 
 3 | Author
 4 |     Abdelwahab HEBA 2020
 5 | """
 6 | 
 7 | import torch
 8 | import logging
 9 | import torch.nn as nn
10 | 
11 | logger = logging.getLogger(__name__)
12 | 
13 | 
14 | class Transducer_joint(nn.Module):
15 |     """Computes joint tensor between Transcription network (TN) & Prediction network (PN)
16 | 
17 |     Arguments
18 |     ---------
19 |     joint_network : torch.class (neural network modules)
20 |         if joint == "concat", we call this network after the concatenation of TN and PN
21 |         if None, we don't use this network.
22 |     joint : joint the two tensors by ("sum",or "concat") option.
23 |     nonlinearity : torch class
24 |         Activation function used after the joint between TN and PN
25 |          Type of nonlinearity (tanh, relu).
26 | 
27 |     Example
28 |     -------
29 |     >>> from speechbrain.nnet.transducer.transducer_joint import Transducer_joint
30 |     >>> from speechbrain.nnet.linear import Linear
31 |     >>> input_TN = torch.rand(8, 200, 1, 40)
32 |     >>> input_PN = torch.rand(8, 1, 12, 40)
33 |     >>> joint_network = Linear(input_size=80, n_neurons=80)
34 |     >>> TJoint = Transducer_joint(joint_network, joint="concat")
35 |     >>> output = TJoint(input_TN, input_PN)
36 |     >>> output.shape
37 |     torch.Size([8, 200, 12, 80])
38 |     """
39 | 
40 |     def __init__(
41 |         self, joint_network=None, joint="sum", nonlinearity=torch.nn.LeakyReLU
42 |     ):
43 |         super().__init__()
44 |         self.joint_network = joint_network
45 |         self.joint = joint
46 |         self.nonlinearity = nonlinearity()
47 | 
48 |     def init_params(self, first_input):
49 |         """
50 |         Arguments
51 |         ---------
52 |         first_input : tensor
53 |             A first input used for initializing the parameters.
54 |         """
55 |         self.joint_network(first_input)
56 | 
57 |     def forward(self, input_TN, input_PN):
58 |         """Returns the fusion of inputs tensors.
59 | 
60 |         Arguments
61 |         ---------
62 |         input_TN : torch.Tensor
63 |            Input from Transcription Network.
64 | 
65 |         input_PN : torch.Tensor
66 |            Input from Prediction Network.
67 |         """
68 |         if len(input_TN.shape) != len(input_PN.shape):
69 |             raise ValueError("Arg 1 and 2 must be have same size")
70 |         if not (len(input_TN.shape) != 4 or len(input_TN.shape) != 1):
71 |             raise ValueError("Tensors 1 and 2 must have dim=1 or dim=4")
72 | 
73 |         if self.joint == "sum":
74 |             joint = input_TN + input_PN
75 | 
76 |         if self.joint == "concat":
77 |             # For training
78 |             if len(input_TN.shape) == 4:
79 |                 dim = len(input_TN.shape) - 1
80 |                 xs = input_TN
81 |                 ymat = input_PN
82 |                 sz = [
83 |                     max(i, j) for i, j in zip(xs.size()[:-1], ymat.size()[:-1])
84 |                 ]
85 |                 xs = xs.expand(torch.Size(sz + [xs.shape[-1]]))
86 |                 ymat = ymat.expand(torch.Size(sz + [ymat.shape[-1]]))
87 |                 joint = torch.cat((xs, ymat), dim=dim)
88 |             # For evaluation
89 |             elif len(input_TN.shape) == 1:
90 |                 joint = torch.cat((input_TN, input_PN), dim=0)
91 | 
92 |             if self.joint_network is not None:
93 |                 joint = self.joint_network(joint)
94 | 
95 |         return self.nonlinearity(joint)
96 | 


--------------------------------------------------------------------------------
/speechbrain/pretrained/__init__.py:
--------------------------------------------------------------------------------
1 | """Pretrained models"""
2 | 
3 | from .interfaces import *  # noqa
4 | 


--------------------------------------------------------------------------------
/speechbrain/pretrained/fetching.py:
--------------------------------------------------------------------------------
  1 | """Downloads or otherwise fetches pretrained models
  2 | 
  3 | Authors:
  4 |  * Aku Rouhe 2021
  5 |  * Samuele Cornell 2021
  6 | """
  7 | import urllib.request
  8 | import urllib.error
  9 | import pathlib
 10 | import logging
 11 | import huggingface_hub
 12 | from requests.exceptions import HTTPError
 13 | 
 14 | logger = logging.getLogger(__name__)
 15 | 
 16 | 
 17 | def _missing_ok_unlink(path):
 18 |     # missing_ok=True was added to Path.unlink() in Python 3.8
 19 |     # This does the same.
 20 |     try:
 21 |         path.unlink()
 22 |     except FileNotFoundError:
 23 |         pass
 24 | 
 25 | 
 26 | def fetch(
 27 |     filename,
 28 |     source,
 29 |     savedir="./pretrained_model_checkpoints",
 30 |     overwrite=False,
 31 |     save_filename=None,
 32 |     use_auth_token=False,
 33 | ):
 34 |     """Ensures you have a local copy of the file, returns its path
 35 | 
 36 |     In case the source is an external location, downloads the file.  In case
 37 |     the source is already accessible on the filesystem, creates a symlink in
 38 |     the savedir. Thus, the side effects of this function always look similar:
 39 |     savedir/save_filename can be used to access the file. And save_filename
 40 |     defaults to the filename arg.
 41 | 
 42 |     Arguments
 43 |     ---------
 44 |     filename : str
 45 |         Name of the file including extensions.
 46 |     source : str
 47 |         Where to look for the file. This is interpreted in special ways:
 48 |         First, if the source begins with "http://" or "https://", it is
 49 |         interpreted as a web address and the file is downloaded.
 50 |         Second, if the source is a valid directory path, a symlink is
 51 |         created to the file.
 52 |         Otherwise, the source is interpreted as a Huggingface model hub ID, and
 53 |         the file is downloaded from there.
 54 |     savedir : str
 55 |         Path where to save downloads/symlinks.
 56 |     overwrite : bool
 57 |         If True, always overwrite existing savedir/filename file and download
 58 |         or recreate the link. If False (as by default), if savedir/filename
 59 |         exists, assume it is correct and don't download/relink. Note that
 60 |         Huggingface local cache is always used - with overwrite=True we just
 61 |         relink from the local cache.
 62 |     save_filename : str
 63 |         The filename to use for saving this file. Defaults to filename if not
 64 |         given.
 65 |     use_auth_token : bool (default: False)
 66 |         If true Hugginface's auth_token will be used to load private models from the HuggingFace Hub,
 67 |         default is False because majority of models are public.
 68 |     Returns
 69 |     -------
 70 |     pathlib.Path
 71 |         Path to file on local file system.
 72 | 
 73 |     Raises
 74 |     ------
 75 |     ValueError
 76 |         If file is not found
 77 |     """
 78 |     if save_filename is None:
 79 |         save_filename = filename
 80 |     savedir = pathlib.Path(savedir)
 81 |     savedir.mkdir(parents=True, exist_ok=True)
 82 |     sourcefile = f"{source}/{filename}"
 83 |     destination = savedir / save_filename
 84 |     if destination.exists() and not overwrite:
 85 |         MSG = f"Fetch {filename}: Using existing file/symlink in {str(destination)}."
 86 |         logger.info(MSG)
 87 |         return destination
 88 |     if str(source).startswith("http:") or str(source).startswith("https:"):
 89 |         # Interpret source as web address.
 90 |         MSG = (
 91 |             f"Fetch {filename}: Downloading from normal URL {str(sourcefile)}."
 92 |         )
 93 |         logger.info(MSG)
 94 |         # Download
 95 |         try:
 96 |             urllib.request.urlretrieve(sourcefile, destination)
 97 |         except urllib.error.URLError:
 98 |             raise ValueError(
 99 |                 f"Interpreted {source} as web address, but could not download."
100 |             )
101 |     elif pathlib.Path(source).is_dir():
102 |         # Interpret source as local directory path
103 |         # Just symlink
104 |         sourcepath = pathlib.Path(sourcefile).absolute()
105 |         MSG = f"Fetch {filename}: Linking to local file in {str(sourcepath)}."
106 |         logger.info(MSG)
107 |         _missing_ok_unlink(destination)
108 |         destination.symlink_to(sourcepath)
109 |     else:
110 |         # Interpret source as huggingface hub ID
111 |         # Use huggingface hub's fancy cached download.
112 |         MSG = f"Fetch {filename}: Delegating to Huggingface hub, source {str(source)}."
113 |         logger.info(MSG)
114 |         url = huggingface_hub.hf_hub_url(source, filename)
115 |         try:
116 |             fetched_file = huggingface_hub.cached_download(url, use_auth_token)
117 |         except HTTPError as e:
118 |             if e.response.status_code == 404:
119 |                 raise ValueError("File not found on HF hub")
120 |             else:
121 |                 raise
122 |         # Huggingface hub downloads to etag filename, symlink to the expected one:
123 |         sourcepath = pathlib.Path(fetched_file).absolute()
124 |         _missing_ok_unlink(destination)
125 |         destination.symlink_to(sourcepath)
126 |     return destination
127 | 


--------------------------------------------------------------------------------
/speechbrain/processing/NMF.py:
--------------------------------------------------------------------------------
  1 | """Non-negative matrix factorization
  2 | 
  3 | Authors
  4 |  * Cem Subakan
  5 | """
  6 | import torch
  7 | from speechbrain.processing.features import spectral_magnitude
  8 | import speechbrain.processing.features as spf
  9 | 
 10 | 
 11 | def spectral_phase(stft, power=2, log=False):
 12 |     """Returns the phase of a complex spectrogram.
 13 | 
 14 |     Arguments
 15 |     ---------
 16 |     stft : torch.Tensor
 17 |         A tensor, output from the stft function.
 18 | 
 19 |     Example
 20 |     -------
 21 |     >>> BS, nfft, T = 10, 20, 300
 22 |     >>> X_stft = torch.randn(BS, nfft//2 + 1, T, 2)
 23 |     >>> phase_mix = spectral_phase(X_stft)
 24 |     """
 25 | 
 26 |     phase = torch.atan2(stft[:, :, :, 1], stft[:, :, :, 0])
 27 | 
 28 |     return phase
 29 | 
 30 | 
 31 | def NMF_separate_spectra(Whats, Xmix):
 32 |     """This function separates the mixture signals, given NMF template matrices.
 33 | 
 34 |     Arguments
 35 |     ---------
 36 |     Whats : list
 37 |         This list contains the list [W1, W2], where W1 W2 are respectively
 38 |         the NMF template matrices that correspond to source1 and source2.
 39 |         W1, W2 are of size [nfft/2 + 1, K], where nfft is the fft size for STFT,
 40 |         and K is the number of vectors (templates) in W.
 41 |     Xmix : torch.tensor
 42 |         This is the magnitude spectra for the mixtures.
 43 |         The size is [BS x T x nfft//2 + 1] where,
 44 |         BS = batch size, nfft = fft size, T = number of time steps in the spectra.
 45 | 
 46 |     Outputs
 47 |     -------
 48 |     X1hat : Separated spectrum for source1
 49 |         Size = [BS x (nfft/2 +1) x T] where,
 50 |         BS = batch size, nfft = fft size, T = number of time steps in the spectra.
 51 |     X2hat : Separated Spectrum for source2
 52 |         The size definitions are the same as above.
 53 | 
 54 |     Example
 55 |     --------
 56 |     >>> BS, nfft, T = 4, 20, 400
 57 |     >>> K1, K2 = 10, 10
 58 |     >>> W1hat = torch.randn(nfft//2 + 1, K1)
 59 |     >>> W2hat = torch.randn(nfft//2 + 1, K2)
 60 |     >>> Whats = [W1hat, W2hat]
 61 |     >>> Xmix = torch.randn(BS, T, nfft//2 + 1)
 62 |     >>> X1hat, X2hat = NMF_separate_spectra(Whats, Xmix)
 63 |     """
 64 | 
 65 |     W1, W2 = Whats
 66 | 
 67 |     nmixtures = Xmix.shape[0]
 68 |     Xmix = Xmix.permute(0, 2, 1).reshape(-1, Xmix.size(-1)).t()
 69 |     n = Xmix.shape[1]
 70 |     eps = 1e-20
 71 | 
 72 |     # Normalize input
 73 |     g = Xmix.sum(dim=0) + eps
 74 |     z = Xmix / g
 75 | 
 76 |     # initialize
 77 |     w = torch.cat([W1, W2], dim=1)
 78 |     K = w.size(1)
 79 |     K1 = W1.size(1)
 80 | 
 81 |     h = 0.1 * torch.rand(K, n)
 82 |     h /= torch.sum(h, dim=0) + eps
 83 | 
 84 |     for ep in range(1000):
 85 |         v = z / (torch.matmul(w, h) + eps)
 86 | 
 87 |         nh = h * torch.matmul(w.t(), v)
 88 |         h = nh / (torch.sum(nh, dim=0) + eps)
 89 | 
 90 |     h *= g
 91 |     Xhat1 = torch.matmul(w[:, :K1], h[:K1, :])
 92 |     Xhat1 = torch.split(Xhat1.unsqueeze(0), Xhat1.size(1) // nmixtures, dim=2)
 93 |     Xhat1 = torch.cat(Xhat1, dim=0)
 94 | 
 95 |     Xhat2 = torch.matmul(w[:, K1:], h[K1:, :])
 96 |     Xhat2 = torch.split(Xhat2.unsqueeze(0), Xhat2.size(1) // nmixtures, dim=2)
 97 |     Xhat2 = torch.cat(Xhat2, dim=0)
 98 | 
 99 |     return Xhat1, Xhat2
100 | 
101 | 
102 | def reconstruct_results(
103 |     X1hat, X2hat, X_stft, sample_rate, win_length, hop_length,
104 | ):
105 | 
106 |     """This function reconstructs the separated spectra into waveforms.
107 | 
108 |     Arguments
109 |     ---------
110 |     Xhat1 : torch.tensor
111 |         The separated spectrum for source 1 of size [BS, nfft/2 + 1, T],
112 |         where,  BS = batch size, nfft = fft size, T = length of the spectra.
113 |     Xhat2 : torch.tensor
114 |         The separated spectrum for source 2 of size [BS, nfft/2 + 1, T].
115 |         The size definitions are the same as Xhat1.
116 |     X_stft : torch.tensor
117 |         This is the magnitude spectra for the mixtures.
118 |         The size is [BS x nfft//2 + 1 x T x 2] where,
119 |         BS = batch size, nfft = fft size, T = number of time steps in the spectra.
120 |         The last dimension is to represent complex numbers.
121 |     sample_rate : int
122 |         The sampling rate (in Hz) in which we would like to save the results.
123 |     win_length : int
124 |         The length of stft windows (in ms).
125 |     hop_length : int
126 |         The length with which we shift the STFT windows (in ms).
127 | 
128 |     Returns
129 |     -------
130 |     x1hats : list
131 |         List of waveforms for source 1.
132 |     x2hats : list
133 |         List of waveforms for source 2.
134 | 
135 |     Example
136 |     -------
137 |     >>> BS, nfft, T = 10, 512, 16000
138 |     >>> sample_rate, win_length, hop_length = 16000, 25, 10
139 |     >>> X1hat = torch.randn(BS, nfft//2 + 1, T)
140 |     >>> X2hat = torch.randn(BS, nfft//2 + 1, T)
141 |     >>> X_stft = torch.randn(BS, nfft//2 + 1, T, 2)
142 |     >>> x1hats, x2hats = reconstruct_results(X1hat, X2hat, X_stft, sample_rate, win_length, hop_length)
143 |     """
144 | 
145 |     ISTFT = spf.ISTFT(
146 |         sample_rate=sample_rate, win_length=win_length, hop_length=hop_length
147 |     )
148 | 
149 |     phase_mix = spectral_phase(X_stft)
150 |     mag_mix = spectral_magnitude(X_stft, power=2)
151 | 
152 |     x1hats, x2hats = [], []
153 |     eps = 1e-25
154 |     for i in range(X1hat.shape[0]):
155 |         X1hat_stft = (
156 |             (X1hat[i] / (eps + X1hat[i] + X2hat[i])).unsqueeze(-1)
157 |             * mag_mix[i].unsqueeze(-1)
158 |             * torch.cat(
159 |                 [
160 |                     torch.cos(phase_mix[i].unsqueeze(-1)),
161 |                     torch.sin(phase_mix[i].unsqueeze(-1)),
162 |                 ],
163 |                 dim=-1,
164 |             )
165 |         )
166 | 
167 |         X2hat_stft = (
168 |             (X2hat[i] / (eps + X1hat[i] + X2hat[i])).unsqueeze(-1)
169 |             * mag_mix[i].unsqueeze(-1)
170 |             * torch.cat(
171 |                 [
172 |                     torch.cos(phase_mix[i].unsqueeze(-1)),
173 |                     torch.sin(phase_mix[i].unsqueeze(-1)),
174 |                 ],
175 |                 dim=-1,
176 |             )
177 |         )
178 |         X1hat_stft = X1hat_stft.unsqueeze(0).permute(0, 2, 1, 3)
179 |         X2hat_stft = X2hat_stft.unsqueeze(0).permute(0, 2, 1, 3)
180 |         shat1 = ISTFT(X1hat_stft)
181 |         shat2 = ISTFT(X2hat_stft)
182 | 
183 |         div_factor = 10
184 |         x1 = shat1 / (div_factor * shat1.std())
185 |         x2 = shat2 / (div_factor * shat2.std())
186 | 
187 |         x1hats.append(x1)
188 |         x2hats.append(x2)
189 |     return x1hats, x2hats
190 | 


--------------------------------------------------------------------------------
/speechbrain/processing/__init__.py:
--------------------------------------------------------------------------------
1 | """ Package containing various techniques of speech processing
2 | """
3 | 


--------------------------------------------------------------------------------
/speechbrain/tokenizers/__init__.py:
--------------------------------------------------------------------------------
1 | """ Package defining the SentencePiece tokenizer
2 | """
3 | 


--------------------------------------------------------------------------------
/speechbrain/utils/Accuracy.py:
--------------------------------------------------------------------------------
 1 | """Calculate accuracy.
 2 | 
 3 | Authors
 4 | * Jianyuan Zhong 2020
 5 | """
 6 | import torch
 7 | from speechbrain.dataio.dataio import length_to_mask
 8 | 
 9 | 
10 | def Accuracy(log_probabilities, targets, length=None):
11 |     """Calculates the accuracy for predicted log probabilities and targets in a batch.
12 | 
13 |     Arguments
14 |     ----------
15 |     log_probabilities : tensor
16 |         Predicted log probabilities (batch_size, time, feature).
17 |     targets : tensor
18 |         Target (batch_size, time).
19 |     length : tensor
20 |         Length of target (batch_size,).
21 | 
22 |     Example
23 |     -------
24 |     >>> probs = torch.tensor([[0.9, 0.1], [0.1, 0.9], [0.8, 0.2]]).unsqueeze(0)
25 |     >>> acc = Accuracy(torch.log(probs), torch.tensor([1, 1, 0]).unsqueeze(0), torch.tensor([2/3]))
26 |     >>> print(acc)
27 |     (1.0, 2.0)
28 |     """
29 |     if length is not None:
30 |         mask = length_to_mask(
31 |             length * targets.shape[1], max_len=targets.shape[1],
32 |         ).bool()
33 |         if len(targets.shape) == 3:
34 |             mask = mask.unsqueeze(2).repeat(1, 1, targets.shape[2])
35 | 
36 |     padded_pred = log_probabilities.argmax(-1)
37 | 
38 |     if length is not None:
39 |         numerator = torch.sum(
40 |             padded_pred.masked_select(mask) == targets.masked_select(mask)
41 |         )
42 |         denominator = torch.sum(mask)
43 |     else:
44 |         numerator = torch.sum(padded_pred == targets)
45 |         denominator = targets.shape[0] * targets.shape[1]  # (batch_size * time)
46 |     return float(numerator), float(denominator)
47 | 
48 | 
49 | class AccuracyStats:
50 |     """Module for calculate the overall one-step-forward prediction accuracy.
51 | 
52 |     Example
53 |     -------
54 |     >>> probs = torch.tensor([[0.9, 0.1], [0.1, 0.9], [0.8, 0.2]]).unsqueeze(0)
55 |     >>> stats = AccuracyStats()
56 |     >>> stats.append(torch.log(probs), torch.tensor([1, 1, 0]).unsqueeze(0), torch.tensor([2/3]))
57 |     >>> acc = stats.summarize()
58 |     >>> print(acc)
59 |     0.5
60 |     """
61 | 
62 |     def __init__(self):
63 |         self.correct = 0
64 |         self.total = 0
65 | 
66 |     def append(self, log_probabilities, targets, length=None):
67 |         """This function is for updating the stats according to the prediction
68 |         and target in the current batch.
69 | 
70 |         Arguments
71 |         ----------
72 |         log_probabilities : tensor
73 |             Predicted log probabilities (batch_size, time, feature).
74 |         targets : tensor
75 |             Target (batch_size, time).
76 |         length: tensor
77 |             Length of target (batch_size,).
78 |         """
79 |         numerator, denominator = Accuracy(log_probabilities, targets, length)
80 |         self.correct += numerator
81 |         self.total += denominator
82 | 
83 |     def summarize(self):
84 |         return self.correct / self.total


--------------------------------------------------------------------------------
/speechbrain/utils/DER.py:
--------------------------------------------------------------------------------
  1 | """Calculates Diarization Error Rate (DER) which is the sum of Missed Speaker (MS),
  2 | False Alarm (FA), and Speaker Error Rate (SER) using md-eval-22.pl from NIST RT Evaluation.
  3 | 
  4 | Authors
  5 |  * Neville Ryant 2018
  6 |  * Nauman Dawalatabad 2020
  7 | 
  8 | Credits
  9 |  This code is adapted from https://github.com/nryant/dscore
 10 | """
 11 | 
 12 | import os
 13 | import re
 14 | import subprocess
 15 | import numpy as np
 16 | 
 17 | FILE_IDS = re.compile(r"(?<=Speaker Diarization for).+(?=\*\*\*)")
 18 | SCORED_SPEAKER_TIME = re.compile(r"(?<=SCORED SPEAKER TIME =)[\d.]+")
 19 | MISS_SPEAKER_TIME = re.compile(r"(?<=MISSED SPEAKER TIME =)[\d.]+")
 20 | FA_SPEAKER_TIME = re.compile(r"(?<=FALARM SPEAKER TIME =)[\d.]+")
 21 | ERROR_SPEAKER_TIME = re.compile(r"(?<=SPEAKER ERROR TIME =)[\d.]+")
 22 | 
 23 | 
 24 | def rectify(arr):
 25 |     """Corrects corner cases and converts scores into percentage.
 26 |     """
 27 | 
 28 |     # Numerator and denominator both 0.
 29 |     arr[np.isnan(arr)] = 0
 30 | 
 31 |     # Numerator > 0, but denominator = 0.
 32 |     arr[np.isinf(arr)] = 1
 33 |     arr *= 100.0
 34 | 
 35 |     return arr
 36 | 
 37 | 
 38 | def DER(
 39 |     ref_rttm,
 40 |     sys_rttm,
 41 |     ignore_overlap=False,
 42 |     collar=0.25,
 43 |     individual_file_scores=False,
 44 | ):
 45 |     """Computes Missed Speaker percentage (MS), False Alarm (FA),
 46 |     Speaker Error Rate (SER), and Diarization Error Rate (DER).
 47 | 
 48 |     Arguments
 49 |     ---------
 50 |     ref_rttm : str
 51 |         The path of reference/groundtruth RTTM file.
 52 |     sys_rttm : str
 53 |         The path of the system generated RTTM file.
 54 |     individual_file : bool
 55 |         If True, returns scores for each file in order.
 56 |     collar : float
 57 |         Forgiveness collar.
 58 |     ignore_overlap : bool
 59 |         If True, ignores overlapping speech during evaluation.
 60 | 
 61 |     Returns
 62 |     -------
 63 |     MS : float array
 64 |         Missed Speech.
 65 |     FA : float array
 66 |         False Alarms.
 67 |     SER : float array
 68 |         Speaker Error Rates.
 69 |     DER : float array
 70 |         Diarization Error Rates.
 71 | 
 72 |     Example
 73 |     -------
 74 |     >>> import pytest
 75 |     >>> pytest.skip('Skipping because of Perl dependency')
 76 |     >>> ref_rttm = "../../samples/rttm_samples/ref_rttm/ES2014c.rttm"
 77 |     >>> sys_rttm = "../../samples/rttm_samples/sys_rttm/ES2014c.rttm"
 78 |     >>> ignore_overlap = True
 79 |     >>> collar = 0.25
 80 |     >>> individual_file_scores = True
 81 |     >>> Scores = DER(ref_rttm, sys_rttm, ignore_overlap, collar, individual_file_scores)
 82 |     >>> print (Scores)
 83 |     (array([0., 0.]), array([0., 0.]), array([7.16923618, 7.16923618]), array([7.16923618, 7.16923618]))
 84 |     """
 85 | 
 86 |     curr = os.path.abspath(os.path.dirname(__file__))
 87 |     mdEval = os.path.join(curr, "../../tools/der_eval/md-eval.pl")
 88 | 
 89 |     cmd = [
 90 |         mdEval,
 91 |         "-af",
 92 |         "-r",
 93 |         ref_rttm,
 94 |         "-s",
 95 |         sys_rttm,
 96 |         "-c",
 97 |         str(collar),
 98 |     ]
 99 |     if ignore_overlap:
100 |         cmd.append("-1")
101 | 
102 |     try:
103 |         stdout = subprocess.check_output(cmd, stderr=subprocess.STDOUT)
104 | 
105 |     except subprocess.CalledProcessError as ex:
106 |         stdout = ex.output
107 | 
108 |     else:
109 |         stdout = stdout.decode("utf-8")
110 | 
111 |         # Get all recording IDs
112 |         file_ids = [m.strip() for m in FILE_IDS.findall(stdout)]
113 |         file_ids = [
114 |             file_id[2:] if file_id.startswith("f=") else file_id
115 |             for file_id in file_ids
116 |         ]
117 | 
118 |         scored_speaker_times = np.array(
119 |             [float(m) for m in SCORED_SPEAKER_TIME.findall(stdout)]
120 |         )
121 | 
122 |         miss_speaker_times = np.array(
123 |             [float(m) for m in MISS_SPEAKER_TIME.findall(stdout)]
124 |         )
125 | 
126 |         fa_speaker_times = np.array(
127 |             [float(m) for m in FA_SPEAKER_TIME.findall(stdout)]
128 |         )
129 | 
130 |         error_speaker_times = np.array(
131 |             [float(m) for m in ERROR_SPEAKER_TIME.findall(stdout)]
132 |         )
133 | 
134 |         with np.errstate(invalid="ignore", divide="ignore"):
135 |             tot_error_times = (
136 |                 miss_speaker_times + fa_speaker_times + error_speaker_times
137 |             )
138 |             miss_speaker_frac = miss_speaker_times / scored_speaker_times
139 |             fa_speaker_frac = fa_speaker_times / scored_speaker_times
140 |             sers_frac = error_speaker_times / scored_speaker_times
141 |             ders_frac = tot_error_times / scored_speaker_times
142 | 
143 |         # Values in percentage of scored_speaker_time
144 |         miss_speaker = rectify(miss_speaker_frac)
145 |         fa_speaker = rectify(fa_speaker_frac)
146 |         sers = rectify(sers_frac)
147 |         ders = rectify(ders_frac)
148 | 
149 |         if individual_file_scores:
150 |             return miss_speaker, fa_speaker, sers, ders
151 |         else:
152 |             return miss_speaker[-1], fa_speaker[-1], sers[-1], ders[-1]
153 | 


--------------------------------------------------------------------------------
/speechbrain/utils/__init__.py:
--------------------------------------------------------------------------------
 1 | """ Package containing various tools (accuracy, checkpoints ...)
 2 | """
 3 | import os
 4 | 
 5 | __all__ = []
 6 | for filename in os.listdir(os.path.dirname(__file__)):
 7 |     filename = os.path.basename(filename)
 8 |     if filename.endswith(".py") and not filename.startswith("__"):
 9 |         __all__.append(filename[:-3])
10 | 
11 | from . import *  # noqa
12 | 


--------------------------------------------------------------------------------
/speechbrain/utils/bleu.py:
--------------------------------------------------------------------------------
  1 | from speechbrain.utils.metric_stats import MetricStats
  2 | 
  3 | 
  4 | def merge_words(sequences):
  5 |     """Merge successive words into phrase, putting space between each word
  6 | 
  7 |     Arguments
  8 |     ---------
  9 |     sequences : list
 10 |         Each item contains a list, and this list contains a word sequence.
 11 |     Returns
 12 |     -------
 13 |     The list contains phrase sequences.
 14 |     """
 15 |     results = []
 16 |     for seq in sequences:
 17 |         words = " ".join(seq)
 18 |         results.append(words)
 19 |     return results
 20 | 
 21 | 
 22 | class BLEUStats(MetricStats):
 23 |     """A class for tracking BLEU (https://www.aclweb.org/anthology/P02-1040.pdf).
 24 |     Arguments
 25 |     ---------
 26 |     merge_words: bool
 27 |         Whether to merge the successive words to create sentences.
 28 |     Example
 29 |     -------
 30 |     >>> bleu = BLEUStats()
 31 |     >>> i2l = {0: 'a', 1: 'b'}
 32 |     >>> bleu.append(
 33 |     ...     ids=['utterance1'],
 34 |     ...     predict=[[0, 1, 1]],
 35 |     ...     targets=[[[0, 1, 0]], [[0, 1, 1]], [[1, 1, 0]]],
 36 |     ...     ind2lab=lambda batch: [[i2l[int(x)] for x in seq] for seq in batch],
 37 |     ... )
 38 |     >>> stats = bleu.summarize()
 39 |     >>> stats['BLEU']
 40 |     0.0
 41 |     """
 42 | 
 43 |     def __init__(
 44 |         self, lang="en", merge_words=True,
 45 |     ):
 46 | 
 47 |         self.clear()
 48 |         self.merge_words = merge_words
 49 | 
 50 |         self.predicts = []
 51 |         self.targets = None
 52 | 
 53 |     def append(
 54 |         self, ids, predict, targets, ind2lab=None,
 55 |     ):
 56 |         """Add stats to the relevant containers.
 57 |         * See MetricStats.append()
 58 |         Arguments
 59 |         ---------
 60 |         ids : list
 61 |             List of ids corresponding to utterances.
 62 |         predict : torch.tensor
 63 |             A predicted output, for comparison with the target output
 64 |         targets : list
 65 |             list of references (when measuring BLEU, one sentence could have more
 66 |                                 than one target translation).
 67 |         ind2lab : callable
 68 |             Callable that maps from indices to labels, operating on batches,
 69 |             for writing alignments.
 70 |         """
 71 |         self.ids.extend(ids)
 72 | 
 73 |         if ind2lab is not None:
 74 |             predict = ind2lab(predict)
 75 |             targets = [ind2lab(t) for t in targets]
 76 | 
 77 |         if self.merge_words:
 78 |             predict = merge_words(predict)
 79 |             targets = [merge_words(t) for t in targets]
 80 | 
 81 |         self.predicts.extend(predict)
 82 |         if self.targets is None:
 83 |             self.targets = targets
 84 |         else:
 85 |             assert len(self.targets) == len(targets)
 86 |             for i in range(len(self.targets)):
 87 |                 self.targets[i].extend(targets[i])
 88 | 
 89 |     def summarize(self, field=None):
 90 |         """Summarize the BLEU and return relevant statistics.
 91 |         * See MetricStats.summarize()
 92 |         """
 93 | 
 94 |         # Check extra-dependency for computing the bleu score
 95 |         try:
 96 |             import sacrebleu
 97 |         except ImportError:
 98 |             print(
 99 |                 "Please install sacrebleu (https://github.com/mjpost/sacreble) in order to use the BLEU metric"
100 |             )
101 | 
102 |         scores = sacrebleu.corpus_bleu(self.predicts, self.targets)
103 |         details = {}
104 |         details["BLEU"] = scores.score
105 |         details["BP"] = scores.bp
106 |         details["ratio"] = scores.sys_len / scores.ref_len
107 |         details["hyp_len"] = scores.sys_len
108 |         details["ref_len"] = scores.ref_len
109 |         details["precisions"] = scores.precisions
110 | 
111 |         self.scores = scores
112 |         self.summary = details
113 | 
114 |         # Add additional, more generic key
115 |         self.summary["bleu_score"] = self.summary["BLEU"]
116 | 
117 |         if field is not None:
118 |             return self.summary[field]
119 |         else:
120 |             return self.summary
121 | 
122 |     def write_stats(self, filestream):
123 |         """Write all relevant info (e.g., error rate alignments) to file.
124 |         * See MetricStats.write_stats()
125 |         """
126 |         if not self.summary:
127 |             self.summarize()
128 | 
129 |         print(self.scores, file=filestream)
130 | 


--------------------------------------------------------------------------------
/speechbrain/utils/callchains.py:
--------------------------------------------------------------------------------
 1 | """Chaining together callables, if some require relative lengths"""
 2 | import inspect
 3 | 
 4 | 
 5 | def lengths_arg_exists(func):
 6 |     """Returns True if func takes ``lengths`` keyword argument.
 7 | 
 8 |     Arguments
 9 |     ---------
10 |     func : callable
11 |         The function, method, or other callable to search for the lengths arg.
12 |     """
13 |     spec = inspect.getfullargspec(func)
14 |     return "lengths" in spec.args + spec.kwonlyargs
15 | 
16 | 
17 | class LengthsCapableChain:
18 |     """Chain together callables. Can handle relative lengths.
19 | 
20 |     This is a more light-weight version of
21 |     speechbrain.nnet.containers.LengthsCapableSequential
22 | 
23 |     Arguments
24 |     ---------
25 |     *funcs : list, optional
26 |         Any number of functions or other callables, given in order of
27 |         execution.
28 | 
29 |     Returns
30 |     -------
31 |     Any
32 |         The input as processed by each function. If no functions were given,
33 |         simply returns the input.
34 |     """
35 | 
36 |     def __init__(self, *funcs):
37 |         self.funcs = []
38 |         self.takes_lengths = []
39 |         for func in funcs:
40 |             self.append(func)
41 | 
42 |     def __call__(self, x, lengths=None):
43 |         """Run the chain of callables on the given input
44 | 
45 |         Arguments
46 |         ---------
47 |         x : Any
48 |             The main input
49 |         lengths : Any
50 |             The lengths argument which will be conditionally passed to
51 |             any functions in the chain that take a 'lengths' argument.
52 |             In SpeechBrain the convention is to use relative lengths.
53 | 
54 |         Note
55 |         ----
56 |         By convention, if a callable in the chain returns multiple outputs
57 |         (returns a tuple), only the first output is passed to the next
58 |         callable in the chain.
59 |         """
60 |         if not self.funcs:
61 |             return x
62 |         for func, give_lengths in zip(self.funcs, self.takes_lengths):
63 |             if give_lengths:
64 |                 x = func(x, lengths)
65 |             else:
66 |                 x = func(x)
67 |             if isinstance(x, tuple):
68 |                 x = x[0]
69 |         return x
70 | 
71 |     def append(self, func):
72 |         """Add a function to the chain"""
73 |         self.funcs.append(func)
74 |         self.takes_lengths.append(lengths_arg_exists(func))
75 | 
76 |     def __str__(self):
77 |         clsname = self.__class__.__name__
78 |         if self.funcs:
79 |             return f"{clsname}:\n" + "\n".join(str(f) for f in self.funcs)
80 |         else:
81 |             return f"Empty {clsname}"
82 | 


--------------------------------------------------------------------------------
/speechbrain/utils/epoch_loop.py:
--------------------------------------------------------------------------------
  1 | """Implements a checkpointable epoch counter (loop), optionally integrating early stopping.
  2 | 
  3 | Authors
  4 |  * Aku Rouhe 2020
  5 |  * Davide Borra 2021
  6 | """
  7 | from .checkpoints import register_checkpoint_hooks
  8 | from .checkpoints import mark_as_saver
  9 | from .checkpoints import mark_as_loader
 10 | import logging
 11 | 
 12 | logger = logging.getLogger(__name__)
 13 | 
 14 | 
 15 | @register_checkpoint_hooks
 16 | class EpochCounter:
 17 |     """An epoch counter which can save and recall its state.
 18 | 
 19 |     Use this as the iterator for epochs.
 20 |     Note that this iterator gives you the numbers from [1 ... limit] not
 21 |     [0 ... limit-1] as range(limit) would.
 22 | 
 23 |     Example
 24 |     -------
 25 |     >>> from speechbrain.utils.checkpoints import Checkpointer
 26 |     >>> tmpdir = getfixture('tmpdir')
 27 |     >>> epoch_counter = EpochCounter(10)
 28 |     >>> recoverer = Checkpointer(tmpdir, {"epoch": epoch_counter})
 29 |     >>> recoverer.recover_if_possible()
 30 |     >>> # Now after recovery,
 31 |     >>> # the epoch starts from where it left off!
 32 |     >>> for epoch in epoch_counter:
 33 |     ...     # Run training...
 34 |     ...     ckpt = recoverer.save_checkpoint()
 35 |     """
 36 | 
 37 |     def __init__(self, limit):
 38 |         self.current = 0
 39 |         self.limit = int(limit)
 40 | 
 41 |     def __iter__(self):
 42 |         return self
 43 | 
 44 |     def __next__(self):
 45 |         if self.current < self.limit:
 46 |             self.current += 1
 47 |             logger.info(f"Going into epoch {self.current}")
 48 |             return self.current
 49 |         raise StopIteration
 50 | 
 51 |     @mark_as_saver
 52 |     def _save(self, path):
 53 |         with open(path, "w") as fo:
 54 |             fo.write(str(self.current))
 55 | 
 56 |     @mark_as_loader
 57 |     def _recover(self, path, end_of_epoch=True, device=None):
 58 |         # NOTE: end_of_epoch = True by default so that when
 59 |         #  loaded in parameter transfer, this starts a new epoch.
 60 |         #  However, parameter transfer to EpochCounter should
 61 |         #  probably never be used really.
 62 |         del device  # Not used.
 63 |         with open(path) as fi:
 64 |             saved_value = int(fi.read())
 65 |             if end_of_epoch:
 66 |                 self.current = saved_value
 67 |             else:
 68 |                 self.current = saved_value - 1
 69 | 
 70 | 
 71 | class EpochCounterWithStopper(EpochCounter):
 72 |     """An epoch counter which can save and recall its state, integrating an early stopper by tracking a target metric.
 73 | 
 74 |     Arguments
 75 |     ---------
 76 |     limit: int
 77 |         maximum number of epochs
 78 |     limit_to_stop : int
 79 |         maximum number of consecutive epochs without improvements in performance
 80 |     limit_warmup : int
 81 |         number of epochs to wait until start checking for early stopping
 82 |     direction : "max" or "min"
 83 |         direction to optimize the target metric
 84 | 
 85 |     Example
 86 |     -------
 87 |     >>> limit = 10
 88 |     >>> limit_to_stop = 5
 89 |     >>> limit_warmup = 2
 90 |     >>> direction = "min"
 91 |     >>> epoch_counter = EpochCounterWithStopper(limit, limit_to_stop, limit_warmup, direction)
 92 |     >>> for epoch in epoch_counter:
 93 |     ...     # Run training...
 94 |     ...     # Track a validation metric,
 95 |     ...     current_valid_metric = 0
 96 |     ...     # get the current valid metric (get current_valid_metric)
 97 |     ...     if epoch_counter.should_stop(current=epoch,
 98 |     ...                                  current_metric=current_valid_metric,):
 99 |     ...         epoch_counter.current = epoch_counter.limit  # skipping unpromising epochs
100 |     """
101 | 
102 |     def __init__(self, limit, limit_to_stop, limit_warmup, direction):
103 |         super().__init__(limit)
104 |         self.limit_to_stop = limit_to_stop
105 |         self.limit_warmup = limit_warmup
106 |         self.direction = direction
107 | 
108 |         self.best_limit = 0
109 |         self.min_delta = 1e-6
110 | 
111 |         if self.limit_to_stop < 0:
112 |             raise ValueError("Stopper 'limit_to_stop' must be >= 0")
113 |         if self.limit_warmup < 0:
114 |             raise ValueError("Stopper 'limit_warmup' must be >= 0")
115 |         if self.direction == "min":
116 |             self.th, self.sign = float("inf"), 1
117 |         elif self.direction == "max":
118 |             self.th, self.sign = -float("inf"), -1
119 |         else:
120 |             raise ValueError("Stopper 'direction' must be 'min' or 'max'")
121 | 
122 |     def should_stop(self, current, current_metric):
123 |         should_stop = False
124 |         if current > self.limit_warmup:
125 |             if self.sign * current_metric < self.sign * (
126 |                 (1 - self.min_delta) * self.th
127 |             ):
128 |                 self.best_limit = current
129 |                 self.th = current_metric
130 |             should_stop = (current - self.best_limit) >= self.limit_to_stop
131 |         return should_stop
132 | 


--------------------------------------------------------------------------------
/speechbrain/utils/logger.py:
--------------------------------------------------------------------------------
  1 | """Managing the logger, utilities
  2 | 
  3 | Author
  4 |  * Fang-Pen Lin 2012 https://fangpenlin.com/posts/2012/08/26/good-logging-practice-in-python/
  5 |  * Peter Plantinga 2020
  6 |  * Aku Rouhe 2020
  7 | """
  8 | 
  9 | import sys
 10 | import os
 11 | import yaml
 12 | import tqdm
 13 | import logging
 14 | import logging.config
 15 | import math
 16 | import torch
 17 | from speechbrain.utils.data_utils import recursive_update
 18 | from speechbrain.utils.superpowers import run_shell
 19 | 
 20 | ORDERS_ABBREV = {
 21 |     -24: "y",
 22 |     -21: "z",
 23 |     -18: "a",
 24 |     -15: "f",
 25 |     -12: "p",
 26 |     -9: "n",
 27 |     -6: "µ",
 28 |     -3: "m",
 29 |     0: "",
 30 |     3: "k",
 31 |     6: "M",
 32 |     9: "G",
 33 |     12: "T",
 34 |     15: "P",
 35 |     18: "E",
 36 |     21: "Z",
 37 |     24: "Y",
 38 | }
 39 | 
 40 | # Short scale
 41 | # Negative powers of ten in lowercase, positive in uppercase
 42 | ORDERS_WORDS = {
 43 |     -24: "septillionths",
 44 |     -21: "sextillionths",
 45 |     -18: "quintillionths",
 46 |     -15: "quadrillionths",
 47 |     -12: "trillionths",
 48 |     -9: "billionths",
 49 |     -6: "millionths",
 50 |     -3: "thousandths",
 51 |     0: "",
 52 |     3: "Thousand",
 53 |     6: "Million",
 54 |     9: "Billion",
 55 |     12: "Trillion",
 56 |     15: "Quadrillion",
 57 |     18: "Quintillion",
 58 |     21: "Sextillion",
 59 |     24: "Septillion",
 60 | }
 61 | 
 62 | 
 63 | class TqdmCompatibleStreamHandler(logging.StreamHandler):
 64 |     """TQDM compatible StreamHandler.
 65 | 
 66 |     Writes and prints should be passed through tqdm.tqdm.write
 67 |     so that the tqdm progressbar doesn't get messed up.
 68 |     """
 69 | 
 70 |     def emit(self, record):
 71 |         try:
 72 |             msg = self.format(record)
 73 |             stream = self.stream
 74 |             tqdm.tqdm.write(msg, end=self.terminator, file=stream)
 75 |             self.flush()
 76 |         except RecursionError:
 77 |             raise
 78 |         except Exception:
 79 |             self.handleError(record)
 80 | 
 81 | 
 82 | def setup_logging(
 83 |     config_path="log-config.yaml", overrides={}, default_level=logging.INFO,
 84 | ):
 85 |     """Setup logging configuration.
 86 | 
 87 |     Arguments
 88 |     ---------
 89 |     config_path : str
 90 |         The path to a logging config file.
 91 |     default_level : int
 92 |         The level to use if the config file is not found.
 93 |     overrides : dict
 94 |         A dictionary of the same structure as the config dict
 95 |         with any updated values that need to be applied.
 96 |     """
 97 |     if os.path.exists(config_path):
 98 |         with open(config_path, "rt") as f:
 99 |             config = yaml.safe_load(f)
100 |         recursive_update(config, overrides)
101 |         logging.config.dictConfig(config)
102 |     else:
103 |         logging.basicConfig(level=default_level)
104 | 
105 | 
106 | def format_order_of_magnitude(number, abbreviate=True):
107 |     """Formats number to the appropriate order of magnitude for printing.
108 | 
109 |     Arguments
110 |     ---------
111 |     number : int, float
112 |         The number to format.
113 |     abbreviate : bool
114 |         Whether to use abbreviations (k,M,G) or words (Thousand, Million,
115 |         Billion). Numbers will be either like: "123.5k" or "123.5 Thousand".
116 | 
117 |     Returns
118 |     -------
119 |     str
120 |         The formatted number. Note that the order of magnitude token is part
121 |         of the string.
122 | 
123 |     Example
124 |     -------
125 |     >>> print(format_order_of_magnitude(123456))
126 |     123.5k
127 |     >>> print(format_order_of_magnitude(0.00000123, abbreviate=False))
128 |     1.2 millionths
129 |     >>> print(format_order_of_magnitude(5, abbreviate=False))
130 |     5
131 |     """
132 |     style = ORDERS_ABBREV if abbreviate else ORDERS_WORDS
133 |     precision = "{num:3.1f}"
134 |     order = 3 * math.floor(math.log(math.fabs(number), 1000))
135 |     # Fallback for very large numbers:
136 |     while order not in style and order != 0:
137 |         order = order - math.copysign(3, order)  # Bring 3 units towards 0
138 |     order_token = style[order]
139 |     if order != 0:
140 |         formatted_number = precision.format(num=number / 10 ** order)
141 |     else:
142 |         if isinstance(number, int):
143 |             formatted_number = str(number)
144 |         else:
145 |             formatted_number = precision.format(num=number)
146 |     if abbreviate or not order_token:
147 |         return formatted_number + order_token
148 |     else:
149 |         return formatted_number + " " + order_token
150 | 
151 | 
152 | def get_environment_description():
153 |     """Returns a string describing the current Python / SpeechBrain environment.
154 | 
155 |     Useful for making experiments as replicable as possible.
156 | 
157 |     Returns
158 |     -------
159 |     str
160 |         The string is formatted ready to be written to a file.
161 | 
162 |     Example
163 |     -------
164 |     >>> get_environment_description().splitlines()[0]
165 |     'SpeechBrain system description'
166 |     """
167 |     python_version_str = "Python version:\n" + sys.version + "\n"
168 |     try:
169 |         freezed, _, _ = run_shell("pip freeze")
170 |         python_packages_str = "Installed Python packages:\n"
171 |         python_packages_str += freezed.decode(errors="replace")
172 |     except OSError:
173 |         python_packages_str = "Could not list python packages with pip freeze"
174 |     try:
175 |         git_hash, _, _ = run_shell("git rev-parse --short HEAD")
176 |         git_str = "Git revision:\n" + git_hash.decode(errors="replace")
177 |     except OSError:
178 |         git_str = "Could not get git revision"
179 |     if torch.cuda.is_available():
180 |         cuda_str = "Cuda version:\n" + torch.version.cuda
181 |     else:
182 |         cuda_str = "CUDA not available"
183 |     result = "SpeechBrain system description\n"
184 |     result += "==============================\n"
185 |     result += python_version_str
186 |     result += "==============================\n"
187 |     result += python_packages_str
188 |     result += "==============================\n"
189 |     result += git_str
190 |     result += "==============================\n"
191 |     result += cuda_str
192 |     return result
193 | 


--------------------------------------------------------------------------------
/speechbrain/utils/superpowers.py:
--------------------------------------------------------------------------------
 1 | """Superpowers which should be sparingly used.
 2 | 
 3 | This library contains functions for importing python files and
 4 | for running shell commands. Remember, with great power comes great
 5 | responsibility.
 6 | 
 7 | Authors
 8 |  * Mirco Ravanelli 2020
 9 |  * Aku Rouhe 2021
10 | """
11 | 
12 | import logging
13 | import subprocess
14 | import importlib
15 | import pathlib
16 | 
17 | logger = logging.getLogger(__name__)
18 | 
19 | 
20 | def import_from_path(path):
21 |     r"""Import module from absolute path
22 | 
23 |     Arguments
24 |     ---------
25 |     path : str, pathlib.Path
26 |         The path to the module to import
27 | 
28 |     Returns
29 |     -------
30 |     module
31 |         The loaded module
32 | 
33 |     >>> modulepath = getfixture("tmpdir") / "helloer.py"
34 |     >>> with open(modulepath, "w") as fo:
35 |     ...     _ = fo.write("def a():\n\treturn 'hello'")
36 |     >>> helloer = import_from_path(modulepath)
37 |     >>> helloer.a()
38 |     'hello'
39 | 
40 |     Implementation taken from:
41 |     https://docs.python.org/3/library/importlib.html#importing-a-source-file-directly
42 |     """
43 |     path = pathlib.Path(path)
44 |     modulename = path.with_suffix("").name
45 |     spec = importlib.util.spec_from_file_location(modulename, path)
46 |     module = importlib.util.module_from_spec(spec)
47 |     spec.loader.exec_module(module)
48 |     return module
49 | 
50 | 
51 | def run_shell(cmd):
52 |     r"""This function can be used to run a command in the bash shell.
53 | 
54 |     Arguments
55 |     ---------
56 |     cmd : str
57 |         Shell command to run.
58 | 
59 |     Returns
60 |     -------
61 |     bytes
62 |         The captured standard output.
63 |     bytes
64 |         The captured standard error.
65 |     int
66 |         The returncode.
67 | 
68 |     Raises
69 |     ------
70 |     OSError
71 |         If returncode is not 0, i.e., command failed.
72 | 
73 |     Example
74 |     -------
75 |     >>> out, err, code = run_shell("echo 'hello world'")
76 |     >>> out.decode(errors="ignore")
77 |     'hello world\n'
78 |     """
79 | 
80 |     # Executing the command
81 |     p = subprocess.Popen(
82 |         cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True
83 |     )
84 | 
85 |     # Capturing standard output and error
86 |     (output, err) = p.communicate()
87 | 
88 |     if p.returncode != 0:
89 |         raise OSError(err.decode(errors="replace"))
90 | 
91 |     # Adding information in the logger
92 |     msg = output.decode(errors="replace") + "\n" + err.decode(errors="replace")
93 |     logger.debug(msg)
94 | 
95 |     return output, err, p.returncode
96 | 


--------------------------------------------------------------------------------
/speechbrain/utils/torch_audio_backend.py:
--------------------------------------------------------------------------------
 1 | import platform
 2 | import logging
 3 | import torchaudio
 4 | 
 5 | logger = logging.getLogger(__name__)
 6 | 
 7 | 
 8 | def check_torchaudio_backend():
 9 |     """Checks the torchaudio backend and sets it to soundfile if
10 |     windows is detected.
11 |     """
12 |     current_system = platform.system()
13 |     if current_system == "Windows":
14 |         logger.warn(
15 |             "The torchaudio backend is switched to 'soundfile'. Note that 'sox_io' is not supported on Windows."
16 |         )
17 |         torchaudio.set_audio_backend("soundfile")
18 | 


--------------------------------------------------------------------------------
/speechbrain/version.txt:
--------------------------------------------------------------------------------
1 | 0.5.11
2 | 


--------------------------------------------------------------------------------