├── tasks ├── __init__.py ├── slot_filling │ ├── __pycache__ │ │ ├── metric.cpython-38.pyc │ │ └── dataset.cpython-38.pyc │ ├── readme.md │ ├── sf.sh │ ├── metric.py │ ├── slot_filling.py │ └── dataset.py ├── phoneme_recognition │ ├── __pycache__ │ │ └── text.cpython-38.pyc │ ├── phoneme.txt │ ├── readme.md │ ├── pr.sh │ ├── phoneme_recognition.py │ └── text.py ├── intent_classification │ ├── __pycache__ │ │ ├── dataset.cpython-38.pyc │ │ └── customtrain.cpython-38.pyc │ ├── readme.md │ ├── customtrain.py │ ├── dataset.py │ ├── ic.sh │ └── intent_cls.py ├── tts │ ├── readme.md │ ├── LTS │ │ ├── train.yaml │ │ ├── preprocess.yaml │ │ └── model.yaml │ ├── L2ARCTIC │ │ ├── train.yaml │ │ ├── preprocess.yaml │ │ └── model.yaml │ └── transformer.py ├── keyword_spotting │ ├── readme.md │ ├── ks.sh │ └── keyword_spotting.py ├── speaker_classification │ ├── readme.md │ ├── sp_cls.sh │ ├── sp_cls_esd.sh │ └── speaker_recg.py └── asr │ ├── readme.md │ ├── asr_fleurs.sh │ └── asr.py ├── __pycache__ ├── data.cpython-38.pyc ├── model.cpython-38.pyc ├── utils.cpython-38.pyc ├── modules.cpython-38.pyc └── modeling_wav2vec2.cpython-38.pyc ├── requirements.txt ├── LICENSE ├── emotion_cls.sh ├── train.py ├── README.md ├── modules.py └── data.py /tasks/__init__.py: -------------------------------------------------------------------------------- 1 |
-------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | huggingface-hub==0.9.1 2 | transformers==4.22.1 3 | datasets==2.4.0 4 | click==8.1.3 5 | six==1.16.0 6 | pandas==1.5.0 7 | librosa==0.9.2 8 | h5py==3.7.0 9 | tensorboard==2.10.0 10 | setuptools==59.5.0 11 | adapter-transformers==3.1.0 12 | loralib==0.1.1 13 | jiwer==2.5.1 14 | fairseq==0.12.2 15 | tensorboardX==2.5.1 16 | ipython==8.5.0 17 | path==16.5.0 18 | matplotlib==3.6.1 19 | webrtcvad==2.0.10 20 | editdistance==0.6.0 21 | flashlight==0.1.1 22 | -------------------------------------------------------------------------------- /tasks/tts/readme.md: -------------------------------------------------------------------------------- 1 | # TTS 2 | We use [Comprehensive-Transformer-TTS](https://github.com/keonlee9420/Comprehensive-Transformer-TTS) with "transformer" as the backbone model, and implement fine-tuning, prefix-tuning, LoRA, the bottleneck adapter, and ConvAdapter on it. 3 | 4 | Follow the README in Comprehensive-Transformer-TTS, place the "LTS" (for LibriTTS) and "L2ARCTIC" (for L2ARCTIC) folders under its "config" folder, and replace the library file "../model/transformer.py" with the "transformer.py" provided here. 5 | 6 | We start from a checkpoint trained for 900,000 steps on the VCTK dataset and fine-tune it on LTS and L2ARCTIC for 4,000 steps each.
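For clarity, here is a minimal sketch of the file placement described above, assuming this repository and a fresh clone of Comprehensive-Transformer-TTS sit side by side (the local directory names are illustrative, not prescribed by either repo):

```bash
# Sketch only: adjust the paths to your own checkouts.
git clone https://github.com/keonlee9420/Comprehensive-Transformer-TTS.git

# Place the provided dataset configs under the library's "config" folder.
cp -r speech-adapters/tasks/tts/LTS      Comprehensive-Transformer-TTS/config/LTS
cp -r speech-adapters/tasks/tts/L2ARCTIC Comprehensive-Transformer-TTS/config/L2ARCTIC

# Swap in the adapter-enabled backbone.
cp speech-adapters/tasks/tts/transformer.py Comprehensive-Transformer-TTS/model/transformer.py
```

Training then follows the upstream README; the tuning method (fine-tune, prefix-tuning, LoRA, bottleneck, ConvAdapter) is selected via the `adapter:` block at the bottom of each `model.yaml`.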
7 | -------------------------------------------------------------------------------- /tasks/phoneme_recognition/phoneme.txt: -------------------------------------------------------------------------------- 1 | SIL 2 | SPN 3 | AA0 4 | AA1 5 | AA2 6 | AE0 7 | AE1 8 | AE2 9 | AH0 10 | AH1 11 | AH2 12 | AO0 13 | AO1 14 | AO2 15 | AW0 16 | AW1 17 | AW2 18 | AY0 19 | AY1 20 | AY2 21 | B 22 | CH 23 | D 24 | DH 25 | EH0 26 | EH1 27 | EH2 28 | ER0 29 | ER1 30 | ER2 31 | EY0 32 | EY1 33 | EY2 34 | F 35 | G 36 | HH 37 | IH0 38 | IH1 39 | IH2 40 | IY0 41 | IY1 42 | IY2 43 | JH 44 | K 45 | L 46 | M 47 | N 48 | NG 49 | OW0 50 | OW1 51 | OW2 52 | OY0 53 | OY1 54 | OY2 55 | P 56 | R 57 | S 58 | SH 59 | T 60 | TH 61 | UH0 62 | UH1 63 | UH2 64 | UW0 65 | UW1 66 | UW2 67 | V 68 | W 69 | Y 70 | Z 71 | ZH -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Deep Cognition and Language Research (DeCLaRe) Lab 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /tasks/tts/LTS/train.yaml: -------------------------------------------------------------------------------- 1 | seed: 1234 2 | dist_config: 3 | dist_backend: "nccl" 4 | dist_url: "tcp://localhost:54321" 5 | world_size: 1 6 | path: 7 | ckpt_path: "./output/ckpt/LTS/LTS_TRANSFORMER/LTS_PREFIX" 8 | log_path: "./output/log/LTS/LTS_TRANSFORMER/LTS_PREFIX" 9 | result_path: "./output/result/LTS/LTS_TRANSFORMER/LTS_PREFIX" 10 | optimizer: 11 | batch_size: 16 12 | betas: [0.9, 0.98] 13 | eps: 0.000000001 14 | weight_decay: 0.0 15 | grad_clip_thresh: 1.0 16 | grad_acc_step: 1 17 | warm_up_step: 4000 18 | anneal_steps: [300000, 400000, 500000] 19 | anneal_rate: 0.3 20 | loss: 21 | noise_loss: "l1" 22 | dur_loss: "mse" 23 | pitch_loss: "l1" 24 | cwt_loss: "l1" 25 | # cwt_add_f0_loss: false 26 | lambda_f0: 1.0 27 | lambda_uv: 1.0 28 | lambda_ph_dur: 1.0 29 | lambda_word_dur: 0.0 # lambda_word_dur should not be activated, otherwise it will produce NaN value (For VCTK) 30 | lambda_sent_dur: 1.0 31 | step: 32 | total_step: 940000 33 | log_step: 100 34 | synth_step: 1000 35 | val_step: 1000 36 | save_step: 2500 37 | var_start_steps: 50000 38 | duration: 39 | binarization_start_steps: 6000 40 | binarization_loss_enable_steps: 18000 41 | binarization_loss_warmup_steps: 10000 42 | prosody: 43 | gmm_mdn_beta: 0.02 44 | prosody_loss_enable_steps: 100000 45 | 46 | 47 | -------------------------------------------------------------------------------- /tasks/tts/L2ARCTIC/train.yaml: -------------------------------------------------------------------------------- 1 | seed: 1234 2 | dist_config: 3 | dist_backend: "nccl" 4 | dist_url: "tcp://localhost:54321" 5 | world_size: 1 6 | path: 7 | ckpt_path: "./output/ckpt/L2ARCTIC/L2ARCTIC_TRANSFORMER/L2ARCTIC-CONVADAPTER" 8 | log_path: "./output/log/L2ARCTIC/L2ARCTIC_TRANSFORMER/L2ARCTIC-CONVADAPTER" 9 | result_path: "./output/result/L2ARCTIC/L2ARCTIC_TRANSFORMER/L2ARCTIC-CONVADAPTER" 10 | optimizer: 11 | batch_size: 16 12 | betas: [0.9, 0.98] 13 | eps: 0.000000001 14 | weight_decay: 0.0 15 | grad_clip_thresh: 1.0 16 | grad_acc_step: 1 17 | warm_up_step: 4000 18 | anneal_steps: [300000, 400000, 500000] 19 | anneal_rate: 0.3 20 | loss: 21 | noise_loss: "l1" 22 | dur_loss: "mse" 23 | pitch_loss: "l1" 24 | cwt_loss: "l1" 25 | # cwt_add_f0_loss: false 26 | lambda_f0: 1.0 27 | lambda_uv: 1.0 28 | lambda_ph_dur: 1.0 29 | lambda_word_dur: 0.0 # lambda_word_dur should not be activated, otherwise it will produce NaN value (For VCTK) 30 | lambda_sent_dur: 1.0 31 | step: 32 | total_step: 940000 33 | log_step: 100 34 | synth_step: 1000 35 | val_step: 1000 36 | save_step: 2500 37 | var_start_steps: 50000 38 | duration: 39 | binarization_start_steps: 6000 40 | binarization_loss_enable_steps: 18000 41 | binarization_loss_warmup_steps: 10000 42 | prosody: 43 | gmm_mdn_beta: 0.02 44 | prosody_loss_enable_steps: 100000 45 | -------------------------------------------------------------------------------- /tasks/tts/LTS/preprocess.yaml: -------------------------------------------------------------------------------- 1 | dataset: "LTS" 2 | 3 | path: 4 | corpus_path: "/666/dsets/LTS/" 5 | wav_tag: "mic1" 6 | wav_dir: "train-clean-100" 7 | txt_dir: "train-clean-100" 8 | lexicon_path: "lexicon/librispeech-lexicon.txt" 9 | raw_path: "./raw_data/LTS" 10 | preprocessed_path: "./preprocessed_data/LTS" 11 | 12 | preprocessing: 13 | speaker_embedder: "DeepSpeaker" # support 'none', 'DeepSpeaker' 14 | 
speaker_embedder_cuda: False # since it's too large to load in a single GPU 15 | val_size: 512 16 | text: 17 | text_cleaners: ["english_cleaners"] 18 | language: "en" 19 | audio: 20 | trim_top_db: 23 21 | sampling_rate: 22050 22 | max_wav_value: 32768.0 23 | stft: 24 | filter_length: 1024 25 | hop_length: 256 26 | win_length: 1024 27 | mel: 28 | n_mel_channels: 80 29 | mel_fmin: 0 30 | mel_fmax: 8000 # please set to 8000 for HiFi-GAN vocoder, set to null for MelGAN vocoder 31 | pitch: 32 | pitch_type: "cwt" # support 'frame', 'ph', 'cwt' 33 | pitch_norm: "log" # support 'standard', 'log' 34 | pitch_norm_eps: 0.000000001 35 | pitch_ar: False 36 | with_f0: True 37 | with_f0cwt: True 38 | use_uv: True 39 | cwt_scales: -1 40 | energy: 41 | feature: "phoneme_level" # support 'phoneme_level' or 'frame_level' 42 | normalization: True 43 | duration: 44 | beta_binomial_scaling_factor: 1.0 45 | -------------------------------------------------------------------------------- /tasks/tts/L2ARCTIC/preprocess.yaml: -------------------------------------------------------------------------------- 1 | dataset: "L2ARCTIC" 2 | 3 | path: 4 | corpus_path: "/666/dsets/l2arctic/" 5 | wav_tag: "mic1" 6 | wav_dir: "wav" 7 | txt_dir: "txt" 8 | lexicon_path: "lexicon/librispeech-lexicon.txt" 9 | raw_path: "./raw_data/L2ARCTIC" 10 | preprocessed_path: "./preprocessed_data/L2ARCTIC" 11 | 12 | preprocessing: 13 | speaker_embedder: "DeepSpeaker" # support 'none', 'DeepSpeaker' 14 | speaker_embedder_cuda: False # since it's too large to load in a single GPU 15 | val_size: 512 16 | text: 17 | text_cleaners: ["english_cleaners"] 18 | language: "en" 19 | audio: 20 | trim_top_db: 23 21 | sampling_rate: 22050 22 | max_wav_value: 32768.0 23 | stft: 24 | filter_length: 1024 25 | hop_length: 256 26 | win_length: 1024 27 | mel: 28 | n_mel_channels: 80 29 | mel_fmin: 0 30 | mel_fmax: 8000 # please set to 8000 for HiFi-GAN vocoder, set to null for MelGAN vocoder 31 | pitch: 32 | pitch_type: "cwt" # support 'frame', 'ph', 'cwt' 33 | pitch_norm: "log" # support 'standard', 'log' 34 | pitch_norm_eps: 0.000000001 35 | pitch_ar: False 36 | with_f0: True 37 | with_f0cwt: True 38 | use_uv: True 39 | cwt_scales: -1 40 | energy: 41 | feature: "phoneme_level" # support 'phoneme_level' or 'frame_level' 42 | normalization: True 43 | duration: 44 | beta_binomial_scaling_factor: 1.0 45 | -------------------------------------------------------------------------------- /tasks/slot_filling/readme.md: -------------------------------------------------------------------------------- 1 | # Dataset 2 | ### SNIPS dataset 3 | Here is the download [link](https://huggingface.co/datasets/s3prl/SNIPS) 4 | # Train 5 | Here is an example use SNIPS dataset and fine tuning it. 
6 | ```python 7 | CUDA_VISIBLE_DEVICES=2,3 python slot_filling.py \ 8 | --dataset snips \ 9 | --data_dir '/data/path/Dataset/SNIPS/' \ 10 | --output_dir '/data/path/output_earlystop_sf_finetune_2e4_scheduler' \ 11 | --do_train True \ 12 | --do_eval True \ 13 | --do_predict False \ 14 | --evaluation_strategy "steps" \ 15 | --save_strategy "steps" \ 16 | --max_steps 50000 \ 17 | --save_steps 5000 \ 18 | --eval_steps 200 \ 19 | --learning_rate 2e-4 \ 20 | --feat_adapter_name "conv_adapter" \ 21 | --trans_adapter_name "adapterblock" \ 22 | --output_adapter False \ 23 | --mh_adapter False \ 24 | --prefixtuning False \ 25 | --prefix_tuning False \ 26 | --lora_adapter False \ 27 | --feat_enc_adapter False \ 28 | --fine_tune True \ 29 | --per_device_train_batch_size 8 \ 30 | --gradient_accumulation_steps 1 \ 31 | --per_device_eval_batch_size 8 \ 32 | --num_train_epochs 30 \ 33 | --warmup_ratio 0.1 \ 34 | --logging_steps 100 \ 35 | --logging_dir '/data/path/output_earlystop_sf_finetune_2e4_scheduler/log' \ 36 | --load_best_model_at_end True \ 37 | --metric_for_best_model "slot_type_f1" 38 | ``` 39 | We also placed examples according to each training method in "sf.sh", using the following command to start new sf task: 40 | ```python 41 | bash sf.sh 42 | ``` 43 | -------------------------------------------------------------------------------- /tasks/intent_classification/readme.md: -------------------------------------------------------------------------------- 1 | # Dataset 2 | ## Fluent Speech Commands 3 | Here is the download [link](https://fluent.ai/fluent-speech-commands-a-dataset-for-spoken-language-understanding-research/) 4 | # Train 5 | Here is an example use fluent speech commands dataset and fine tuning it. 6 | ```python 7 | CUDA_VISIBLE_DEVICES=0,1 python intent_cls.py \ 8 | --dataset fluent_commands \ 9 | --data_dir '/data/path/Dataset/fluent_speech_commands_dataset' \ 10 | --output_dir '/data/path/Output/output_earlystop_ic_finetune_2e4' \ 11 | --do_train True \ 12 | --do_eval True \ 13 | --do_predict False \ 14 | --evaluation_strategy "steps" \ 15 | --save_strategy "steps" \ 16 | --save_steps 500 \ 17 | --eval_steps 25 \ 18 | --learning_rate 2e-4 \ 19 | --feat_adapter_name "conv_adapter" \ 20 | --trans_adapter_name "adapterblock" \ 21 | --output_adapter False \ 22 | --mh_adapter False \ 23 | --prefix_tuning False \ 24 | --lora_adapter False \ 25 | --feat_enc_adapter False \ 26 | --fine_tune True \ 27 | --per_device_train_batch_size 8 \ 28 | --gradient_accumulation_steps 4 \ 29 | --per_device_eval_batch_size 8 \ 30 | --num_train_epochs 100 \ 31 | --warmup_ratio 0.1 \ 32 | --logging_steps 20 \ 33 | --logging_dir '/data/path/Output/output_earlystop_ic_finetune_2e4/log' \ 34 | --load_best_model_at_end True \ 35 | --metric_for_best_model "acc" 36 | ``` 37 | 38 | We also placed examples according to each training method in "ic.sh", using the following command to start new ic task: 39 | ```python 40 | bash ic.sh 41 | ``` 42 | -------------------------------------------------------------------------------- /tasks/phoneme_recognition/readme.md: -------------------------------------------------------------------------------- 1 | # Dataset 2 | ### Librispeech Dataset 3 | Here is the download [link](https://huggingface.co/datasets/librispeech_asr) 4 | # Train 5 | Here is an example use librispeech dataset and fine tuning it. 
6 | ```python 7 | CUDA_VISIBLE_DEVICES=0,1 python phoneme_recognition.py \ 8 | --dataset "librispeech" \ 9 | --data_dir '/data/path/hf_datasets' \ 10 | --output_dir '/data/path/Output/output_earlystop_pr_librispeech_finetune_2e2' \ 11 | --group_by_length True \ 12 | --do_train True \ 13 | --do_eval True \ 14 | --do_predict False \ 15 | --fp16 True \ 16 | --gradient_checkpointing True \ 17 | --evaluation_strategy "steps" \ 18 | --save_strategy "steps" \ 19 | --save_steps 200 \ 20 | --eval_steps 100 \ 21 | --learning_rate 2e-2 \ 22 | --feat_adapter_name "conv_adapter" \ 23 | --trans_adapter_name "bottleneck" \ 24 | --output_adapter False \ 25 | --mh_adapter False \ 26 | --prefix_tuning False \ 27 | --lora_adapter False \ 28 | --feat_enc_adapter False \ 29 | --fine_tune True \ 30 | --per_device_train_batch_size 16 \ 31 | --gradient_accumulation_steps 4 \ 32 | --per_device_eval_batch_size 16 \ 33 | --num_train_epochs 30 \ 34 | --weight_decay=0.005 \ 35 | --warmup_steps=1000 \ 36 | --logging_steps 20 \ 37 | --logging_dir '/data/path/Output/output_earlystop_pr_librispeech_finetune_2e2/log' \ 38 | --load_best_model_at_end True \ 39 | --metric_for_best_model "per" \ 40 | --greater_is_better False 41 | ``` 42 | We also placed examples according to each training method in "pr.sh", using the following command to start new pr task: 43 | 44 | ```python 45 | bash pr.sh 46 | ``` 47 | 48 | -------------------------------------------------------------------------------- /tasks/keyword_spotting/readme.md: -------------------------------------------------------------------------------- 1 | # Dataset 2 | 3 | ### Google Speech Commands Dataset 4 | Here is the download [link](http://download.tensorflow.org/data/speech_commands_v0.01.tar.gz) 5 | , also can use this [link](https://github.com/NVIDIA/NeMo/blob/v0.10.1/examples/asr/notebooks/3_Speech_Commands_using_NeMo.ipynb) to download. 6 | 7 | Only use these ['off', 'up', 'stop', 'four', 'no', 'down', 'left', 'go', 'yes', 'on', 'right'] classes, same as in [this](https://arxiv.org/ftp/arxiv/papers/2101/2101.04792.pdf) paper 8 | # Train 9 | Here is an example use speech commands dataset and fine tuning it. 
10 | ```python 11 | CUDA_VISIBLE_DEVICES=0,1 python keyword_spotting.py \ 12 | --output_dir '/data/path/output_earlystop_ks_finetune_8e6' \ 13 | --do_train False \ 14 | --do_eval False \ 15 | --do_predict True \ 16 | --evaluation_strategy "steps" \ 17 | --save_strategy "steps" \ 18 | --save_steps 500 \ 19 | --eval_steps 25 \ 20 | --learning_rate 8e-6 \ 21 | --feat_adapter_name "conv_adapter" \ 22 | --trans_adapter_name "bottleneck" \ 23 | --output_adapter False \ 24 | --mh_adapter False \ 25 | --prefix_tuning False \ 26 | --lora_adapter False \ 27 | --feat_enc_adapter False \ 28 | --fine_tune True \ 29 | --per_device_train_batch_size 64 \ 30 | --gradient_accumulation_steps 4 \ 31 | --per_device_eval_batch_size 64 \ 32 | --num_train_epochs 100 \ 33 | --warmup_ratio 0.1 \ 34 | --logging_steps 20 \ 35 | --logging_dir '/data/path/output_earlystop_ks_finetune_8e6/log' \ 36 | --load_best_model_at_end True \ 37 | --metric_for_best_model "accuracy" 38 | ``` 39 | We also placed examples according to each training method in "ks.sh", using the following command to start new ks task: 40 | ```python 41 | bash ks.sh 42 | ``` 43 | -------------------------------------------------------------------------------- /tasks/intent_classification/customtrain.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | from torch import nn 4 | from transformers import Trainer 5 | 6 | class CustomTrainer(Trainer): 7 | def compute_loss(self, model, inputs, return_outputs=False): 8 | labels = inputs.get("labels") 9 | # forward pass 10 | outputs = model(**inputs) 11 | 12 | intent_logits = outputs.get("logits") 13 | 14 | intent_loss = 0 15 | start_index = 0 16 | predicted_intent = [] 17 | 18 | values_per_slot = [6,14,4] 19 | loss_fct = nn.CrossEntropyLoss().to(labels.device) 20 | 21 | 22 | for slot in range(3): 23 | end_index = start_index + values_per_slot[slot] 24 | subset = intent_logits[:, start_index:end_index] 25 | 26 | # breakpoint() 27 | 28 | intent_loss += loss_fct(subset, labels[:, slot]) 29 | predicted_intent.append(subset.max(1)[1]) 30 | 31 | # breakpoint() 32 | 33 | start_index = end_index 34 | 35 | def idx2slots(indices: torch.Tensor): 36 | action_idx, object_idx, location_idx = indices.cpu().tolist() 37 | return ( 38 | self.Sy_intent["action"][action_idx], 39 | self.Sy_intent["object"][object_idx], 40 | self.Sy_intent["location"][location_idx], 41 | ) 42 | 43 | return (intent_loss, outputs) if return_outputs else intent_loss 44 | 45 | def compute_metrics(eval_pred): 46 | 47 | action = eval_pred.predictions[:, :6].argmax(axis=1) 48 | object_ = eval_pred.predictions[:, 6:20].argmax(axis=1) 49 | location = eval_pred.predictions[:, 20:].argmax(axis=1) 50 | 51 | predicted_intent = np.vstack((action, object_, location)).T 52 | 53 | acc_list = (predicted_intent == eval_pred.label_ids).prod(1).astype(np.float32).tolist() 54 | 55 | acc = sum(acc_list) * 1.0 / len(acc_list) 56 | 57 | return {"acc":acc} -------------------------------------------------------------------------------- /tasks/speaker_classification/readme.md: -------------------------------------------------------------------------------- 1 | # Dataset 2 | ### ESD Dataset 3 | Here is the download [link](https://github.com/HLTSingapore/Emotional-Speech-Data) 4 | ### VCTK Dataset 5 | 1. load from huggingface ```load_dataset("vctk", split='train', cache_dir='/data/path/VCTK')``` 6 | 2. 
or can download raw data from [link](https://datashare.ed.ac.uk/handle/10283/2651) and follow the data preparation strategy of [nuwave](https://github.com/mindslab-ai/nuwave) 7 | # Train 8 | Here is an example use vctk dataset and fine tuning it. 9 | ```python 10 | CUDA_VISIBLE_DEVICES=2,3 python speaker_recg.py \ 11 | --dataset vctk\ 12 | --data_dir "/data/yingting/VCTK_Wav/wav48/" \ 13 | --output_dir '/data/yingting/output_earlystop_sp_finetune_8e6' \ 14 | --do_train True \ 15 | --do_eval True \ 16 | --do_predict False \ 17 | --evaluation_strategy "steps" \ 18 | --save_strategy "steps" \ 19 | --save_steps 500 \ 20 | --eval_steps 25 \ 21 | --learning_rate 8e-6 \ 22 | --feat_adapter_name "conv_adapter" \ 23 | --trans_adapter_name "bottleneck" \ 24 | --output_adapter False \ 25 | --mh_adapter False \ 26 | --prefix_tuning False \ 27 | --lora_adapter False \ 28 | --feat_enc_adapter False \ 29 | --fine_tune True \ 30 | --per_device_train_batch_size 64 \ 31 | --gradient_accumulation_steps 4 \ 32 | --per_device_eval_batch_size 64 \ 33 | --num_train_epochs 100 \ 34 | --warmup_ratio 0.1 \ 35 | --logging_steps 20 \ 36 | --logging_dir '/data/yingting/output_earlystop_sp_finetune_8e6/log' \ 37 | --load_best_model_at_end True \ 38 | --metric_for_best_model "accuracy" 39 | ``` 40 | We also placed examples according to each training method in "sp_cls.sh", using the following command to start new sr task: 41 | ```python 42 | bash sp_cls.sh 43 | ``` 44 | -------------------------------------------------------------------------------- /tasks/asr/readme.md: -------------------------------------------------------------------------------- 1 | # Datasets 2 | 3 | ### ESD Dataset 4 | Here is the download [link](https://github.com/HLTSingapore/Emotional-Speech-Data) 5 | ### FLEURS Dataset 6 | Here is the download [link](https://huggingface.co/datasets/google/fleurs) 7 | ### Librispeech Dataset 8 | Here is the download [link](https://huggingface.co/datasets/librispeech_asr) 9 | 10 | # Train 11 | Here is an example use fleurs dataset and fine tuning it. 
12 | ```python 13 | CUDA_VISIBLE_DEVICES=2,3 python asr.py \ 14 | --output_dir '/data/path/output_earlystop_asr_fleurs_finetune_2e3' \ 15 | --dataset "fleurs" \ 16 | --data_dir '/data/path/Dataset/fleurs' \ 17 | --group_by_length True \ 18 | --do_train False \ 19 | --do_eval False \ 20 | --do_predict True \ 21 | --fp16 True \ 22 | --gradient_checkpointing True \ 23 | --evaluation_strategy "steps" \ 24 | --save_strategy "steps" \ 25 | --save_steps 500 \ 26 | --eval_steps 100 \ 27 | --learning_rate 2e-3 \ 28 | --feat_adapter_name "conv_adapter" \ 29 | --trans_adapter_name "adapterblock" \ 30 | --output_adapter False \ 31 | --mh_adapter False \ 32 | --prefix_tuning False \ 33 | --lora_adapter False \ 34 | --feat_enc_adapter False \ 35 | --fine_tune True \ 36 | --per_device_train_batch_size 32 \ 37 | --per_device_eval_batch_size 32 \ 38 | --num_train_epochs 50 \ 39 | --weight_decay=0.005 \ 40 | --warmup_steps=1000 \ 41 | --logging_steps 50 \ 42 | --logging_dir '/data/path/output_earlystop_asr_fleurs_finetune_2e3/log' \ 43 | --load_best_model_at_end True \ 44 | --metric_for_best_model "wer" \ 45 | --greater_is_better False 46 | ``` 47 | 48 | We also placed examples according to each training method in "asr_fleurs.sh", using the following command to start new asr task: 49 | ```python 50 | bash asr_fleurs.sh 51 | ``` 52 | -------------------------------------------------------------------------------- /tasks/tts/L2ARCTIC/model.yaml: -------------------------------------------------------------------------------- 1 | block_type: "transformer" # ["transformer_fs2", "transformer", "fastformer", "lstransformer", "conformer", "reformer"] 2 | external_speaker_dim: 512 3 | 4 | duration_modeling: 5 | learn_alignment: True 6 | aligner_temperature: 0.0005 7 | 8 | prosody_modeling: 9 | model_type: "none" # ["none", "du2021", "liu2021"] 10 | 11 | # Du et al., 2021 12 | # This is only supported under supervised duration modeling (learn_alignment: False) 13 | du2021: 14 | extractor_kernel_size: 9 15 | predictor_kernel_size: [9, 5] 16 | predictor_num_gaussians: 20 17 | predictor_dropout: 0.2 18 | 19 | # Liu et al., 2021 20 | # This is only tested under supervised duration modeling (learn_alignment: False) 21 | liu2021: 22 | bottleneck_size_u: 256 23 | bottleneck_size_p: 4 24 | ref_enc_filters: [32, 32, 64, 64, 128, 128] 25 | ref_enc_size: [3, 3] 26 | ref_enc_strides: [1, 2] # '1' is to keep the sequence length 27 | ref_enc_pad: [1, 1] 28 | ref_enc_gru_size: 32 29 | ref_attention_dropout: 0. 
30 | token_num: 32 31 | predictor_kernel_size: 3 # [9, 5] for non-parallel predictor / 3 for parallel predictor 32 | predictor_dropout: 0.5 33 | 34 | transformer_fs2: 35 | encoder_layer: 4 36 | encoder_head: 2 37 | encoder_hidden: 256 38 | decoder_layer: 6 39 | decoder_head: 2 40 | decoder_hidden: 256 41 | ffn_kernel_size: 9 42 | encoder_dropout: 0.1 43 | decoder_dropout: 0.1 44 | 45 | transformer: 46 | encoder_layer: 4 47 | encoder_head: 2 48 | encoder_hidden: 256 49 | decoder_layer: 6 50 | decoder_head: 2 51 | decoder_hidden: 256 52 | conv_filter_size: 1024 53 | conv_kernel_size: [9, 1] 54 | encoder_dropout: 0.2 55 | decoder_dropout: 0.2 56 | 57 | conformer: 58 | encoder_layer: 4 59 | encoder_head: 8 60 | encoder_hidden: 256 61 | decoder_layer: 6 62 | decoder_head: 8 63 | decoder_hidden: 256 64 | feed_forward_expansion_factor: 4 65 | conv_expansion_factor: 2 66 | conv_kernel_size: 31 67 | half_step_residual: True 68 | encoder_dropout: 0.1 69 | decoder_dropout: 0.1 70 | 71 | variance_predictor: 72 | filter_size: 256 73 | predictor_grad: 0.1 74 | predictor_layers: 2 75 | predictor_kernel: 5 76 | cwt_hidden_size: 128 77 | cwt_std_scale: 0.8 78 | dur_predictor_layers: 2 79 | dur_predictor_kernel: 3 80 | dropout: 0.5 81 | ffn_padding: "SAME" 82 | ffn_act: "gelu" 83 | 84 | variance_embedding: 85 | use_pitch_embed: True 86 | pitch_n_bins: 300 87 | use_energy_embed: True 88 | energy_n_bins: 256 89 | energy_quantization: "linear" # support 'linear' or 'log', 'log' is allowed only if the energy values are not normalized during preprocessing 90 | 91 | multi_speaker: True 92 | 93 | max_seq_len: 1500 # max sequence length of VCTK is 1298 94 | 95 | vocoder: 96 | model: "HiFi-GAN" # support 'HiFi-GAN', 'MelGAN' 97 | speaker: "universal" # support 'LJSpeech', 'universal' 98 | 99 | adapter: 100 | # fine-tune 101 | fine_tune: False 102 | # prefix_tuning 103 | prefix_tuning: False 104 | prefix_projection: False 105 | prefix_seq_len: 30 106 | prefix_dropout_prob: 0.1 107 | # bottleneck 108 | output_bottleneck: False 109 | # lora 110 | lora: False 111 | # tiny_attention 112 | tiny_attention: False 113 | # tiny_external_attention 114 | tiny_external_attention: False 115 | # tiny conformer 116 | tiny_conformer: False 117 | # proposed conv adapter 118 | conv_adapter: True 119 | -------------------------------------------------------------------------------- /tasks/tts/LTS/model.yaml: -------------------------------------------------------------------------------- 1 | block_type: "transformer" # ["transformer_fs2", "transformer", "fastformer", "lstransformer", "conformer", "reformer"] 2 | external_speaker_dim: 512 3 | 4 | duration_modeling: 5 | learn_alignment: True 6 | aligner_temperature: 0.0005 7 | 8 | prosody_modeling: 9 | model_type: "none" # ["none", "du2021", "liu2021"] 10 | 11 | # Du et al., 2021 12 | # This is only supported under supervised duration modeling (learn_alignment: False) 13 | du2021: 14 | extractor_kernel_size: 9 15 | predictor_kernel_size: [9, 5] 16 | predictor_num_gaussians: 20 17 | predictor_dropout: 0.2 18 | 19 | # Liu et al., 2021 20 | # This is only tested under supervised duration modeling (learn_alignment: False) 21 | liu2021: 22 | bottleneck_size_u: 256 23 | bottleneck_size_p: 4 24 | ref_enc_filters: [32, 32, 64, 64, 128, 128] 25 | ref_enc_size: [3, 3] 26 | ref_enc_strides: [1, 2] # '1' is to keep the sequence length 27 | ref_enc_pad: [1, 1] 28 | ref_enc_gru_size: 32 29 | ref_attention_dropout: 0. 
30 | token_num: 32 31 | predictor_kernel_size: 3 # [9, 5] for non-parallel predictor / 3 for parallel predictor 32 | predictor_dropout: 0.5 33 | 34 | transformer_fs2: 35 | encoder_layer: 4 36 | encoder_head: 2 37 | encoder_hidden: 256 38 | decoder_layer: 6 39 | decoder_head: 2 40 | decoder_hidden: 256 41 | ffn_kernel_size: 9 42 | encoder_dropout: 0.1 43 | decoder_dropout: 0.1 44 | 45 | transformer: 46 | encoder_layer: 4 47 | encoder_head: 2 48 | encoder_hidden: 256 49 | decoder_layer: 6 50 | decoder_head: 2 51 | decoder_hidden: 256 52 | conv_filter_size: 1024 53 | conv_kernel_size: [9, 1] 54 | encoder_dropout: 0.2 55 | decoder_dropout: 0.2 56 | 57 | conformer: 58 | encoder_layer: 4 59 | encoder_head: 8 60 | encoder_hidden: 256 61 | decoder_layer: 6 62 | decoder_head: 8 63 | decoder_hidden: 256 64 | feed_forward_expansion_factor: 4 65 | conv_expansion_factor: 2 66 | conv_kernel_size: 31 67 | half_step_residual: True 68 | encoder_dropout: 0.1 69 | decoder_dropout: 0.1 70 | 71 | ########## 72 | reformer: 73 | depth: 2 74 | encoder_head: 8 75 | decoder_head: 8 76 | ######### 77 | 78 | variance_predictor: 79 | filter_size: 256 80 | predictor_grad: 0.1 81 | predictor_layers: 2 82 | predictor_kernel: 5 83 | cwt_hidden_size: 128 84 | cwt_std_scale: 0.8 85 | dur_predictor_layers: 2 86 | dur_predictor_kernel: 3 87 | dropout: 0.5 88 | ffn_padding: "SAME" 89 | ffn_act: "gelu" 90 | 91 | variance_embedding: 92 | use_pitch_embed: True 93 | pitch_n_bins: 300 94 | use_energy_embed: True 95 | energy_n_bins: 256 96 | energy_quantization: "linear" # support 'linear' or 'log', 'log' is allowed only if the energy values are not normalized during preprocessing 97 | 98 | multi_speaker: True 99 | 100 | max_seq_len: 3000 # max sequence length of VCTK is 1298 101 | 102 | vocoder: 103 | model: "HiFi-GAN" # support 'HiFi-GAN', 'MelGAN' 104 | speaker: "universal" # support 'LJSpeech', 'universal' 105 | 106 | adapter: 107 | # fine-tune 108 | fine_tune: False 109 | # prefix_tuning 110 | prefix_tuning: True 111 | prefix_projection: False 112 | prefix_seq_len: 30 113 | prefix_dropout_prob: 0.1 114 | # bottleneck 115 | output_bottleneck: False 116 | # lora 117 | lora: False 118 | # tiny_attention 119 | tiny_attention: False 120 | # tiny_external_attention 121 | tiny_external_attention: False 122 | # tiny conformer 123 | tiny_conformer: False 124 | # proposed conv adapter 125 | conv_adapter: False 126 | -------------------------------------------------------------------------------- /tasks/intent_classification/dataset.py: -------------------------------------------------------------------------------- 1 | from torch.utils.data import Dataset 2 | from os.path import join 3 | import csv 4 | import librosa 5 | from pathlib import Path 6 | from transformers import Wav2Vec2Processor 7 | import random 8 | random.seed(4) 9 | 10 | class ICDataset(Dataset): 11 | def __init__(self, root_dir, mode, processor: Wav2Vec2Processor): 12 | 13 | self.processor = processor 14 | if mode=="train": 15 | file_name = "train_data.csv" 16 | elif mode=="valid": 17 | file_name = "valid_data.csv" 18 | elif mode=="test": 19 | file_name = "test_data.csv" 20 | else: 21 | raise "mode need be one of {train, valid, test}" 22 | csv_file = join(root_dir, "data", file_name) 23 | 24 | samples = [] 25 | 26 | with open(csv_file, newline='') as csvfile: 27 | reader = csv.reader(csvfile) 28 | line_count = 0 29 | for i, row in enumerate(reader): 30 | if line_count == 0: 31 | line_count += 1 32 | else: 33 | line_count += 1 34 | samples.append({ 35 | 
"path":join(root_dir, row[1]), 36 | "speaker_id":row[2], 37 | "text":row[3], 38 | "action":row[4], 39 | "object":row[5], 40 | "location":row[6] 41 | }) 42 | # Shuffle 43 | if mode == "train": 44 | random.shuffle(samples) 45 | 46 | self.id2label = {0: 'change language', 1: 'activate', 2: 'deactivate', 3: 'increase', 4: 'decrease', 47 | 5: 'bring', 6: 'none_object', 7: 'music', 8: 'lights', 9: 'volume', 10: 'heat', 48 | 11: 'lamp', 12: 'newspaper', 13: 'juice', 14: 'socks', 15: 'Chinese', 16: 'Korean', 49 | 17: 'English', 18: 'German', 19: 'shoes', 20: 'none_location', 21: 'kitchen', 50 | 22: 'bedroom', 23: 'washroom'} 51 | self.label2id = {v:k for k,v in self.id2label.items()} 52 | self.Sy_intent = {'action': {'change language': 0, 0: 'change language', 'activate': 1, 1: 'activate', 53 | 'deactivate': 2, 2: 'deactivate', 'increase': 3, 3: 'increase', 54 | 'decrease': 4, 4: 'decrease', 'bring': 5, 5: 'bring'}, 55 | 'object': {'none': 0, 0: 'none', 'music': 1, 1: 'music', 'lights': 2, 2: 'lights', 56 | 'volume': 3, 3: 'volume', 'heat': 4, 4: 'heat', 'lamp': 5, 5: 'lamp', 57 | 'newspaper': 6, 6: 'newspaper', 'juice': 7, 7: 'juice', 'socks': 8, 8: 'socks', 58 | 'Chinese': 9, 9: 'Chinese', 'Korean': 10, 10: 'Korean', 'English': 11, 11: 'English', 59 | 'German': 12, 12: 'German', 'shoes': 13, 13: 'shoes'}, 60 | 'location': {'none': 0, 0: 'none', 'kitchen': 1, 1: 'kitchen', 61 | 'bedroom': 2, 2: 'bedroom', 'washroom': 3, 3: 'washroom'}} 62 | 63 | self.samples = samples 64 | self.num_labels = len(self.id2label) 65 | 66 | def __getitem__(self, index): 67 | 68 | audio_wav, _ = librosa.load(self.samples[index]["path"], sr=16000, mono=True) 69 | 70 | inputs = self.processor(audio_wav, padding="max_length", max_length=220000, truncation=True, sampling_rate=16000, return_tensors="pt") 71 | 72 | label = [] 73 | for slot in ["action", "object", "location"]: 74 | value = self.samples[index][slot] 75 | label.append(self.Sy_intent[slot][value]) 76 | 77 | return {'input_values' : inputs.input_values.squeeze(0), 78 | 'attention_mask' : inputs.attention_mask.squeeze(0), 79 | 'labels': label 80 | # 'intents': label 81 | } 82 | 83 | def __len__(self): 84 | return len(self.samples) 85 | 86 | if __name__=="__main__": 87 | path = "/data/path/Dataset/fluent_speech_commands_dataset/" 88 | processor = Wav2Vec2Processor.from_pretrained("jonatasgrosman/wav2vec2-large-xlsr-53-english") 89 | icdataset = ICDataset(path, "train", processor) 90 | print("len of icdataset:", len(icdataset)) 91 | print(icdataset[0]) -------------------------------------------------------------------------------- /tasks/keyword_spotting/ks.sh: -------------------------------------------------------------------------------- 1 | ##### Fine-tune ###### 2 | CUDA_VISIBLE_DEVICES=0,1 python keyword_spotting.py \ 3 | --output_dir '/data/path/output_earlystop_ks_finetune_8e6' \ 4 | --do_train False \ 5 | --do_eval False \ 6 | --do_predict True \ 7 | --evaluation_strategy "steps" \ 8 | --save_strategy "steps" \ 9 | --save_steps 500 \ 10 | --eval_steps 25 \ 11 | --learning_rate 8e-6 \ 12 | --feat_adapter_name "conv_adapter" \ 13 | --trans_adapter_name "bottleneck" \ 14 | --output_adapter False \ 15 | --mh_adapter False \ 16 | --prefix_tuning False \ 17 | --lora_adapter False \ 18 | --feat_enc_adapter False \ 19 | --fine_tune True \ 20 | --per_device_train_batch_size 64 \ 21 | --gradient_accumulation_steps 4 \ 22 | --per_device_eval_batch_size 64 \ 23 | --num_train_epochs 100 \ 24 | --warmup_ratio 0.1 \ 25 | --logging_steps 20 \ 26 | --logging_dir 
'/data/path/output_earlystop_ks_finetune_8e6/log' \ 27 | --load_best_model_at_end True \ 28 | --metric_for_best_model "accuracy" 29 | 30 | ##### Bottleneck ###### 31 | CUDA_VISIBLE_DEVICES=0,1 python keyword_spotting.py \ 32 | --output_dir '/data/path/output_earlystop_ks_bottleneck_8e6' \ 33 | --do_train False \ 34 | --do_eval False \ 35 | --do_predict True \ 36 | --evaluation_strategy "steps" \ 37 | --save_strategy "steps" \ 38 | --save_steps 500 \ 39 | --eval_steps 25 \ 40 | --learning_rate 8e-6 \ 41 | --feat_adapter_name "conv_adapter" \ 42 | --trans_adapter_name "bottleneck" \ 43 | --output_adapter True \ 44 | --mh_adapter False \ 45 | --prefix_tuning False \ 46 | --lora_adapter False \ 47 | --feat_enc_adapter False \ 48 | --fine_tune False \ 49 | --per_device_train_batch_size 64 \ 50 | --gradient_accumulation_steps 4 \ 51 | --per_device_eval_batch_size 64 \ 52 | --num_train_epochs 100 \ 53 | --warmup_ratio 0.1 \ 54 | --logging_steps 20 \ 55 | --logging_dir '/data/path/output_earlystop_ks_bottleneck_8e6/log' \ 56 | --load_best_model_at_end True \ 57 | --metric_for_best_model "accuracy" 58 | 59 | ##### Lora ###### 60 | CUDA_VISIBLE_DEVICES=0,1 python keyword_spotting.py \ 61 | --output_dir '/data/path/output_earlystop_ks_lora_8e6' \ 62 | --do_train False \ 63 | --do_eval False \ 64 | --do_predict True \ 65 | --evaluation_strategy "steps" \ 66 | --save_strategy "steps" \ 67 | --save_steps 500 \ 68 | --eval_steps 25 \ 69 | --learning_rate 8e-6 \ 70 | --feat_adapter_name "conv_adapter" \ 71 | --trans_adapter_name "bottleneck" \ 72 | --output_adapter False \ 73 | --mh_adapter False \ 74 | --prefix_tuning False \ 75 | --lora_adapter True \ 76 | --feat_enc_adapter False \ 77 | --fine_tune False \ 78 | --per_device_train_batch_size 64 \ 79 | --gradient_accumulation_steps 4 \ 80 | --per_device_eval_batch_size 64 \ 81 | --num_train_epochs 100 \ 82 | --warmup_ratio 0.1 \ 83 | --logging_steps 20 \ 84 | --logging_dir '/data/path/output_earlystop_ks_lora_8e6/log' \ 85 | --load_best_model_at_end True \ 86 | --metric_for_best_model "accuracy" 87 | 88 | ##### Prefix-tuning ###### 89 | CUDA_VISIBLE_DEVICES=2,3 python keyword_spotting.py \ 90 | --output_dir '/data/path/output_earlystop_ks_prefixtuningmy_2e4' \ 91 | --do_train False \ 92 | --do_eval False \ 93 | --do_predict True \ 94 | --evaluation_strategy "steps" \ 95 | --save_strategy "steps" \ 96 | --save_steps 500 \ 97 | --eval_steps 25 \ 98 | --learning_rate 2e-4 \ 99 | --feat_adapter_name "conv_adapter" \ 100 | --trans_adapter_name "bottleneck" \ 101 | --output_adapter False \ 102 | --mh_adapter False \ 103 | --prefix_tuning True \ 104 | --lora_adapter False \ 105 | --feat_enc_adapter False \ 106 | --fine_tune False \ 107 | --per_device_train_batch_size 64 \ 108 | --gradient_accumulation_steps 4 \ 109 | --per_device_eval_batch_size 64 \ 110 | --num_train_epochs 100 \ 111 | --warmup_ratio 0.1 \ 112 | --logging_steps 20 \ 113 | --logging_dir '/data/path/output_earlystop_ks_prefixtuningmy_2e4/log' \ 114 | --load_best_model_at_end True \ 115 | --metric_for_best_model "accuracy" 116 | 117 | ##### Adapterblock ###### 118 | CUDA_VISIBLE_DEVICES=2,3 python keyword_spotting.py \ 119 | --data_dir "/data/path/Dataset/SpeechCommand/12classes/" \ 120 | --output_dir '/data/path/output_earlystop_ks_adapterblock_2e4' \ 121 | --do_train True \ 122 | --do_eval True \ 123 | --do_predict False \ 124 | --evaluation_strategy "steps" \ 125 | --save_strategy "steps" \ 126 | --save_steps 500 \ 127 | --eval_steps 25 \ 128 | --learning_rate 2e-4 \ 129 | --feat_adapter_name 
"conv_adapter" \ 130 | --trans_adapter_name "adapterblock" \ 131 | --output_adapter True \ 132 | --mh_adapter False \ 133 | --prefix_tuning False \ 134 | --lora_adapter False \ 135 | --feat_enc_adapter False \ 136 | --fine_tune False \ 137 | --per_device_train_batch_size 64 \ 138 | --gradient_accumulation_steps 4 \ 139 | --per_device_eval_batch_size 64 \ 140 | --num_train_epochs 100 \ 141 | --warmup_ratio 0.1 \ 142 | --logging_steps 20 \ 143 | --logging_dir '/data/path/output_earlystop_ks_adapterblock_2e4/log' \ 144 | --load_best_model_at_end True \ 145 | --metric_for_best_model "accuracy" 146 | 147 | -------------------------------------------------------------------------------- /emotion_cls.sh: -------------------------------------------------------------------------------- 1 | ##### Fine-tune ###### 2 | CUDA_VISIBLE_DEVICES=2,3 python train.py \ 3 | --dataset "meld" \ 4 | --data_dir "/data/path/MELD.Raw" \ 5 | --output_dir '/data/path/output_earlystop_ser_meld_finetune_2e3' \ 6 | --do_train True \ 7 | --do_eval True \ 8 | --do_predict False \ 9 | --evaluation_strategy "steps" \ 10 | --save_strategy "steps" \ 11 | --save_steps 500 \ 12 | --eval_steps 25 \ 13 | --learning_rate 2e-3 \ 14 | --feat_adapter_name "conv_adapter" \ 15 | --trans_adapter_name "adapterblock" \ 16 | --output_adapter False \ 17 | --mh_adapter False \ 18 | --prefix_tuning False \ 19 | --lora_adapter False \ 20 | --feat_enc_adapter False \ 21 | --fine_tune True \ 22 | --per_device_train_batch_size 64 \ 23 | --gradient_accumulation_steps 4 \ 24 | --per_device_eval_batch_size 64 \ 25 | --num_train_epochs 100 \ 26 | --warmup_ratio 0.1 \ 27 | --logging_steps 20 \ 28 | --logging_dir '/data/path/output_earlystop_ser_meld_finetune_2e3/log' \ 29 | --load_best_model_at_end True \ 30 | --metric_for_best_model "f1" 31 | 32 | ##### Bottleneck ###### 33 | CUDA_VISIBLE_DEVICES=2,3 python train.py \ 34 | --dataset "meld" \ 35 | --data_dir "/data/path/MELD.Raw" \ 36 | --output_dir '/data/path/output_earlystop_ser_meld_bottleneck_2e3' \ 37 | --do_train True \ 38 | --do_eval True \ 39 | --do_predict False \ 40 | --evaluation_strategy "steps" \ 41 | --save_strategy "steps" \ 42 | --save_steps 500 \ 43 | --eval_steps 25 \ 44 | --learning_rate 2e-3 \ 45 | --feat_adapter_name "conv_adapter" \ 46 | --trans_adapter_name "bottleneck" \ 47 | --output_adapter True \ 48 | --mh_adapter False \ 49 | --prefix_tuning False \ 50 | --lora_adapter False \ 51 | --feat_enc_adapter False \ 52 | --fine_tune False \ 53 | --per_device_train_batch_size 64 \ 54 | --gradient_accumulation_steps 4 \ 55 | --per_device_eval_batch_size 64 \ 56 | --num_train_epochs 100 \ 57 | --warmup_ratio 0.1 \ 58 | --logging_steps 20 \ 59 | --logging_dir '/data/path/output_earlystop_ser_meld_bottleneck_2e3/log' \ 60 | --load_best_model_at_end True \ 61 | --metric_for_best_model "f1" 62 | 63 | ##### Prefix-tuning ###### 64 | CUDA_VISIBLE_DEVICES=2,3 python train.py \ 65 | --dataset "meld" \ 66 | --data_dir "/data/path/MELD.Raw" \ 67 | --output_dir '/data/path/output_earlystop_ser_meld_prefix_2e3' \ 68 | --do_train True \ 69 | --do_eval True \ 70 | --do_predict False \ 71 | --evaluation_strategy "steps" \ 72 | --save_strategy "steps" \ 73 | --save_steps 500 \ 74 | --eval_steps 25 \ 75 | --learning_rate 2e-3 \ 76 | --feat_adapter_name "conv_adapter" \ 77 | --trans_adapter_name "adapterblock" \ 78 | --output_adapter False \ 79 | --mh_adapter False \ 80 | --prefix_tuning True \ 81 | --lora_adapter False \ 82 | --feat_enc_adapter False \ 83 | --fine_tune False \ 84 | 
--per_device_train_batch_size 64 \ 85 | --gradient_accumulation_steps 4 \ 86 | --per_device_eval_batch_size 64 \ 87 | --num_train_epochs 100 \ 88 | --warmup_ratio 0.1 \ 89 | --logging_steps 20 \ 90 | --logging_dir '/data/path/output_earlystop_ser_meld_prefix_2e3/log' \ 91 | --load_best_model_at_end True \ 92 | --metric_for_best_model "f1" 93 | 94 | ##### Lora ###### 95 | CUDA_VISIBLE_DEVICES=2,3 python train.py \ 96 | --dataset "meld" \ 97 | --data_dir "/data/path/MELD.Raw" \ 98 | --output_dir '/data/path/output_earlystop_ser_meld_lora_2e3' \ 99 | --do_train True \ 100 | --do_eval True \ 101 | --do_predict False \ 102 | --evaluation_strategy "steps" \ 103 | --save_strategy "steps" \ 104 | --save_steps 500 \ 105 | --eval_steps 25 \ 106 | --learning_rate 2e-3 \ 107 | --feat_adapter_name "conv_adapter" \ 108 | --trans_adapter_name "adapterblock" \ 109 | --output_adapter False \ 110 | --mh_adapter False \ 111 | --prefix_tuning False \ 112 | --lora_adapter True \ 113 | --feat_enc_adapter False \ 114 | --fine_tune False \ 115 | --per_device_train_batch_size 64 \ 116 | --gradient_accumulation_steps 4 \ 117 | --per_device_eval_batch_size 64 \ 118 | --num_train_epochs 100 \ 119 | --warmup_ratio 0.1 \ 120 | --logging_steps 20 \ 121 | --logging_dir '/data/path/output_earlystop_ser_meld_lora_2e3/log' \ 122 | --load_best_model_at_end True \ 123 | --metric_for_best_model "f1" 124 | 125 | ##### Adapterblock ###### 126 | CUDA_VISIBLE_DEVICES=2,3 python train.py \ 127 | --dataset "meld" \ 128 | --data_dir "/data/path/MELD.Raw" \ 129 | --output_dir '/data/path/output_earlystop_ser_meld_adapterblock_2e3' \ 130 | --do_train True \ 131 | --do_eval True \ 132 | --do_predict False \ 133 | --evaluation_strategy "steps" \ 134 | --save_strategy "steps" \ 135 | --save_steps 500 \ 136 | --eval_steps 25 \ 137 | --learning_rate 2e-3 \ 138 | --feat_adapter_name "conv_adapter" \ 139 | --trans_adapter_name "adapterblock" \ 140 | --output_adapter True \ 141 | --mh_adapter False \ 142 | --prefix_tuning False \ 143 | --lora_adapter False \ 144 | --feat_enc_adapter False \ 145 | --fine_tune False \ 146 | --per_device_train_batch_size 64 \ 147 | --gradient_accumulation_steps 4 \ 148 | --per_device_eval_batch_size 64 \ 149 | --num_train_epochs 100 \ 150 | --warmup_ratio 0.1 \ 151 | --logging_steps 20 \ 152 | --logging_dir '/data/path/output_earlystop_ser_meld_adapterblock_2e3/log' \ 153 | --load_best_model_at_end True \ 154 | --metric_for_best_model "f1" 155 | 156 | -------------------------------------------------------------------------------- /tasks/speaker_classification/sp_cls.sh: -------------------------------------------------------------------------------- 1 | 2 | ##### Fine-tune ###### 3 | CUDA_VISIBLE_DEVICES=2,3 python speaker_recg.py \ 4 | --dataset vctk\ 5 | --data_dir "/data/yingting/VCTK_Wav/wav48/" \ 6 | --output_dir '/data/yingting/output_earlystop_sp_finetune_8e6' \ 7 | --do_train True \ 8 | --do_eval True \ 9 | --do_predict False \ 10 | --evaluation_strategy "steps" \ 11 | --save_strategy "steps" \ 12 | --save_steps 500 \ 13 | --eval_steps 25 \ 14 | --learning_rate 8e-6 \ 15 | --feat_adapter_name "conv_adapter" \ 16 | --trans_adapter_name "bottleneck" \ 17 | --output_adapter False \ 18 | --mh_adapter False \ 19 | --prefix_tuning False \ 20 | --lora_adapter False \ 21 | --feat_enc_adapter False \ 22 | --fine_tune True \ 23 | --per_device_train_batch_size 64 \ 24 | --gradient_accumulation_steps 4 \ 25 | --per_device_eval_batch_size 64 \ 26 | --num_train_epochs 100 \ 27 | --warmup_ratio 0.1 \ 28 | 
--logging_steps 20 \ 29 | --logging_dir '/data/yingting/output_earlystop_sp_finetune_8e6/log' \ 30 | --load_best_model_at_end True \ 31 | --metric_for_best_model "accuracy" 32 | 33 | ##### Bottleneck ###### 34 | CUDA_VISIBLE_DEVICES=2,3 python speaker_recg.py \ 35 | --dataset vctk\ 36 | --data_dir "/data/yingting/VCTK_Wav/wav48/" \ 37 | --output_dir '/data/yingting/output_earlystop_sp_bottleneck_8e6' \ 38 | --do_train True \ 39 | --do_eval True \ 40 | --do_predict False \ 41 | --evaluation_strategy "steps" \ 42 | --save_strategy "steps" \ 43 | --save_steps 500 \ 44 | --eval_steps 25 \ 45 | --learning_rate 8e-6 \ 46 | --feat_adapter_name "conv_adapter" \ 47 | --trans_adapter_name "bottleneck" \ 48 | --output_adapter True \ 49 | --mh_adapter False \ 50 | --prefix_tuning False \ 51 | --lora_adapter False \ 52 | --feat_enc_adapter False \ 53 | --fine_tune False \ 54 | --per_device_train_batch_size 64 \ 55 | --gradient_accumulation_steps 4 \ 56 | --per_device_eval_batch_size 64 \ 57 | --num_train_epochs 100 \ 58 | --warmup_ratio 0.1 \ 59 | --logging_steps 20 \ 60 | --logging_dir '/data/yingting/output_earlystop_sp_bottleneck_8e6/log' \ 61 | --load_best_model_at_end True \ 62 | --metric_for_best_model "accuracy" 63 | 64 | ##### Lora ###### 65 | CUDA_VISIBLE_DEVICES=2,3 python speaker_recg.py \ 66 | --dataset vctk\ 67 | --data_dir "/data/yingting/VCTK_Wav/wav48/" \ 68 | --output_dir '/data/yingting/output_earlystop_sp_lora_2e4' \ 69 | --do_train True \ 70 | --do_eval True \ 71 | --do_predict False \ 72 | --evaluation_strategy "steps" \ 73 | --save_strategy "steps" \ 74 | --save_steps 500 \ 75 | --eval_steps 25 \ 76 | --learning_rate 2e-4 \ 77 | --feat_adapter_name "conv_adapter" \ 78 | --trans_adapter_name "bottleneck" \ 79 | --output_adapter False \ 80 | --mh_adapter False \ 81 | --prefix_tuning False \ 82 | --lora_adapter True \ 83 | --feat_enc_adapter False \ 84 | --fine_tune False \ 85 | --per_device_train_batch_size 64 \ 86 | --gradient_accumulation_steps 4 \ 87 | --per_device_eval_batch_size 64 \ 88 | --num_train_epochs 100 \ 89 | --warmup_ratio 0.1 \ 90 | --logging_steps 20 \ 91 | --logging_dir '/data/yingting/output_earlystop_sp_lora_2e4/log' \ 92 | --load_best_model_at_end True \ 93 | --metric_for_best_model "accuracy" 94 | 95 | ##### Prefix-tuning ###### 96 | CUDA_VISIBLE_DEVICES=2,3 python speaker_recg.py \ 97 | --dataset vctk\ 98 | --data_dir "/data/yingting/VCTK_Wav/wav48/" \ 99 | --output_dir '/data/yingting/output_earlystop_sp_prefix_2e3' \ 100 | --do_train True \ 101 | --do_eval True \ 102 | --do_predict False \ 103 | --evaluation_strategy "steps" \ 104 | --save_strategy "steps" \ 105 | --save_steps 500 \ 106 | --eval_steps 25 \ 107 | --learning_rate 2e-3 \ 108 | --feat_adapter_name "conv_adapter" \ 109 | --trans_adapter_name "bottleneck" \ 110 | --output_adapter False \ 111 | --mh_adapter False \ 112 | --prefix_tuning True \ 113 | --lora_adapter False \ 114 | --feat_enc_adapter False \ 115 | --fine_tune False \ 116 | --per_device_train_batch_size 64 \ 117 | --gradient_accumulation_steps 4 \ 118 | --per_device_eval_batch_size 64 \ 119 | --num_train_epochs 100 \ 120 | --warmup_ratio 0.1 \ 121 | --logging_steps 20 \ 122 | --logging_dir '/data/yingting/output_earlystop_sp_prefix_2e3/log' \ 123 | --load_best_model_at_end True \ 124 | --metric_for_best_model "accuracy" 125 | 126 | ##### Adapterblock ###### 127 | CUDA_VISIBLE_DEVICES=2,3 python speaker_recg.py \ 128 | --dataset vctk\ 129 | --data_dir "/data/yingting/VCTK_Wav/wav48/" \ 130 | --output_dir 
'/data/yingting/output_earlystop_sr_vctk_adapterblock_2e3' \ 131 | --do_train True \ 132 | --do_eval True \ 133 | --do_predict False \ 134 | --evaluation_strategy "steps" \ 135 | --save_strategy "steps" \ 136 | --save_steps 500 \ 137 | --eval_steps 25 \ 138 | --learning_rate 2e-3 \ 139 | --feat_adapter_name "conv_adapter" \ 140 | --trans_adapter_name "adapterblock" \ 141 | --output_adapter True \ 142 | --mh_adapter False \ 143 | --prefix_tuning False \ 144 | --lora_adapter False \ 145 | --feat_enc_adapter False \ 146 | --fine_tune False \ 147 | --per_device_train_batch_size 64 \ 148 | --gradient_accumulation_steps 4 \ 149 | --per_device_eval_batch_size 64 \ 150 | --num_train_epochs 100 \ 151 | --warmup_ratio 0.1 \ 152 | --logging_steps 20 \ 153 | --logging_dir '/data/yingting/output_earlystop_sr_vctk_adapterblock_2e3/log' \ 154 | --load_best_model_at_end True \ 155 | --metric_for_best_model "accuracy" 156 | 157 | 158 | -------------------------------------------------------------------------------- /tasks/speaker_classification/sp_cls_esd.sh: -------------------------------------------------------------------------------- 1 | 2 | ##### Fine-tune ###### 3 | CUDA_VISIBLE_DEVICES=0,1 python speaker_recg.py \ 4 | --dataset esd\ 5 | --data_dir "/data/yingting/Dataset/ESD/en/" \ 6 | --output_dir '/data/yingting/output_earlystop_sp_esd_finetune_8e6' \ 7 | --do_train True \ 8 | --do_eval True \ 9 | --do_predict False \ 10 | --evaluation_strategy "steps" \ 11 | --save_strategy "steps" \ 12 | --save_steps 500 \ 13 | --eval_steps 25 \ 14 | --learning_rate 8e-6 \ 15 | --feat_adapter_name "conv_adapter" \ 16 | --trans_adapter_name "bottleneck" \ 17 | --output_adapter False \ 18 | --mh_adapter False \ 19 | --prefix_tuning False \ 20 | --lora_adapter False \ 21 | --feat_enc_adapter False \ 22 | --fine_tune True \ 23 | --per_device_train_batch_size 64 \ 24 | --gradient_accumulation_steps 4 \ 25 | --per_device_eval_batch_size 64 \ 26 | --num_train_epochs 100 \ 27 | --warmup_ratio 0.1 \ 28 | --logging_steps 20 \ 29 | --logging_dir '/data/yingting/output_earlystop_sp_esd_finetune_8e6/log' \ 30 | --load_best_model_at_end True \ 31 | --metric_for_best_model "accuracy" 32 | 33 | ##### Bottleneck ###### 34 | CUDA_VISIBLE_DEVICES=0,1 python speaker_recg.py \ 35 | --dataset esd\ 36 | --data_dir "/data/yingting/Dataset/ESD/en/" \ 37 | --output_dir '/data/yingting/output_earlystop_sp_esd_bottleneck_8e6' \ 38 | --do_train True \ 39 | --do_eval True \ 40 | --do_predict False \ 41 | --evaluation_strategy "steps" \ 42 | --save_strategy "steps" \ 43 | --save_steps 500 \ 44 | --eval_steps 25 \ 45 | --learning_rate 8e-6 \ 46 | --feat_adapter_name "conv_adapter" \ 47 | --trans_adapter_name "bottleneck" \ 48 | --output_adapter True \ 49 | --mh_adapter False \ 50 | --prefix_tuning False \ 51 | --lora_adapter False \ 52 | --feat_enc_adapter False \ 53 | --fine_tune False \ 54 | --per_device_train_batch_size 64 \ 55 | --gradient_accumulation_steps 4 \ 56 | --per_device_eval_batch_size 64 \ 57 | --num_train_epochs 100 \ 58 | --warmup_ratio 0.1 \ 59 | --logging_steps 20 \ 60 | --logging_dir '/data/yingting/output_earlystop_sp_esd_bottleneck_8e6/log' \ 61 | --load_best_model_at_end True \ 62 | --metric_for_best_model "accuracy" 63 | 64 | ##### Lora ###### 65 | CUDA_VISIBLE_DEVICES=0,1 python speaker_recg.py \ 66 | --dataset esd\ 67 | --data_dir "/data/yingting/ESD/en/" \ 68 | --output_dir '/data/yingting/output_earlystop_sp_esd_lora_8e6' \ 69 | --do_train False \ 70 | --do_eval False \ 71 | --do_predict True \ 72 | 
--evaluation_strategy "steps" \ 73 | --save_strategy "steps" \ 74 | --save_steps 500 \ 75 | --eval_steps 25 \ 76 | --learning_rate 8e-6 \ 77 | --feat_adapter_name "conv_adapter" \ 78 | --trans_adapter_name "bottleneck" \ 79 | --output_adapter False \ 80 | --mh_adapter False \ 81 | --prefix_tuning False \ 82 | --lora_adapter True \ 83 | --feat_enc_adapter False \ 84 | --fine_tune False \ 85 | --per_device_train_batch_size 64 \ 86 | --gradient_accumulation_steps 4 \ 87 | --per_device_eval_batch_size 64 \ 88 | --num_train_epochs 100 \ 89 | --warmup_ratio 0.1 \ 90 | --logging_steps 20 \ 91 | --logging_dir '/data/yingting/output_earlystop_sp_esd_lora_8e6/log' \ 92 | --load_best_model_at_end True \ 93 | --metric_for_best_model "accuracy" 94 | 95 | ##### Prefix-tuning ###### 96 | CUDA_VISIBLE_DEVICES=0,1 python speaker_recg.py \ 97 | --dataset esd\ 98 | --data_dir "/data/yingting/ESD/en/" \ 99 | --output_dir '/data/yingting/output_earlystop_sp_esd_prefix_8e6' \ 100 | --do_train False \ 101 | --do_eval False \ 102 | --do_predict True \ 103 | --evaluation_strategy "steps" \ 104 | --save_strategy "steps" \ 105 | --save_steps 500 \ 106 | --eval_steps 25 \ 107 | --learning_rate 8e-6 \ 108 | --feat_adapter_name "conv_adapter" \ 109 | --trans_adapter_name "bottleneck" \ 110 | --output_adapter False \ 111 | --mh_adapter False \ 112 | --prefix_tuning True \ 113 | --lora_adapter False \ 114 | --feat_enc_adapter False \ 115 | --fine_tune False \ 116 | --per_device_train_batch_size 64 \ 117 | --gradient_accumulation_steps 4 \ 118 | --per_device_eval_batch_size 64 \ 119 | --num_train_epochs 100 \ 120 | --warmup_ratio 0.1 \ 121 | --logging_steps 20 \ 122 | --logging_dir '/data/yingting/output_earlystop_sp_esd_prefix_8e6/log' \ 123 | --load_best_model_at_end True \ 124 | --metric_for_best_model "accuracy" 125 | 126 | ##### Adapterblock ###### 127 | CUDA_VISIBLE_DEVICES=0,3 python speaker_recg.py \ 128 | --dataset esd \ 129 | --data_dir "/data/yingting/Dataset/ESD/en/" \ 130 | --output_dir '/data/yingting/output_earlystop_sr_esd_adapterblock_2e3_scale16' \ 131 | --do_train True \ 132 | --do_eval True \ 133 | --do_predict False \ 134 | --evaluation_strategy "steps" \ 135 | --save_strategy "steps" \ 136 | --save_steps 500 \ 137 | --eval_steps 25 \ 138 | --learning_rate 2e-3 \ 139 | --feat_adapter_name "conv_adapter" \ 140 | --trans_adapter_name "adapterblock" \ 141 | --output_adapter True \ 142 | --mh_adapter False \ 143 | --prefix_tuning False \ 144 | --lora_adapter False \ 145 | --feat_enc_adapter False \ 146 | --fine_tune False \ 147 | --per_device_train_batch_size 64 \ 148 | --gradient_accumulation_steps 4 \ 149 | --per_device_eval_batch_size 64 \ 150 | --num_train_epochs 100 \ 151 | --warmup_ratio 0.1 \ 152 | --logging_steps 20 \ 153 | --logging_dir '/data/yingting/output_earlystop_sr_esd_adapterblock_2e3_scale16/log' \ 154 | --load_best_model_at_end True \ 155 | --metric_for_best_model "accuracy" 156 | 157 | 158 | -------------------------------------------------------------------------------- /tasks/intent_classification/ic.sh: -------------------------------------------------------------------------------- 1 | ##### Fine-tune ###### 2 | CUDA_VISIBLE_DEVICES=0,1 python intent_cls.py \ 3 | --dataset fluent_commands \ 4 | --data_dir '/data/path/Dataset/fluent_speech_commands_dataset' \ 5 | --output_dir '/data/path/Output/output_earlystop_ic_finetune_2e4' \ 6 | --do_train True \ 7 | --do_eval True \ 8 | --do_predict False \ 9 | --evaluation_strategy "steps" \ 10 | --save_strategy "steps" \ 11 | --save_steps 
500 \ 12 | --eval_steps 25 \ 13 | --learning_rate 2e-4 \ 14 | --feat_adapter_name "conv_adapter" \ 15 | --trans_adapter_name "adapterblock" \ 16 | --output_adapter False \ 17 | --mh_adapter False \ 18 | --prefix_tuning False \ 19 | --lora_adapter False \ 20 | --feat_enc_adapter False \ 21 | --fine_tune True \ 22 | --per_device_train_batch_size 8 \ 23 | --gradient_accumulation_steps 4 \ 24 | --per_device_eval_batch_size 8 \ 25 | --num_train_epochs 100 \ 26 | --warmup_ratio 0.1 \ 27 | --logging_steps 20 \ 28 | --logging_dir '/data/path/Output/output_earlystop_ic_finetune_2e4/log' \ 29 | --load_best_model_at_end True \ 30 | --metric_for_best_model "acc" \ 31 | 32 | ##### Bottleneck ###### 33 | CUDA_VISIBLE_DEVICES=0,3 python intent_cls.py \ 34 | --dataset fluent_commands \ 35 | --data_dir '/data/path/Dataset/fluent_speech_commands_dataset' \ 36 | --output_dir '/data/path/Output/output_earlystop_ic_bottleneck_2e4' \ 37 | --do_train True \ 38 | --do_eval True \ 39 | --do_predict False \ 40 | --evaluation_strategy "steps" \ 41 | --save_strategy "steps" \ 42 | --save_steps 500 \ 43 | --eval_steps 25 \ 44 | --learning_rate 2e-4 \ 45 | --feat_adapter_name "conv_adapter" \ 46 | --trans_adapter_name "bottleneck" \ 47 | --output_adapter True \ 48 | --mh_adapter False \ 49 | --prefix_tuning False \ 50 | --lora_adapter False \ 51 | --feat_enc_adapter False \ 52 | --fine_tune False \ 53 | --per_device_train_batch_size 8 \ 54 | --gradient_accumulation_steps 4 \ 55 | --per_device_eval_batch_size 8 \ 56 | --num_train_epochs 100 \ 57 | --warmup_ratio 0.1 \ 58 | --logging_steps 20 \ 59 | --logging_dir '/data/path/Output/output_earlystop_ic_bottleneck_2e4/log' \ 60 | --load_best_model_at_end True \ 61 | --metric_for_best_model "acc" \ 62 | 63 | ##### Lora ###### 64 | CUDA_VISIBLE_DEVICES=2,3 python intent_cls.py \ 65 | --dataset fluent_commands \ 66 | --data_dir '/data/path/Dataset/fluent_speech_commands_dataset' \ 67 | --output_dir '/data/path/Output/output_earlystop_ic_lora_2e4' \ 68 | --do_train True \ 69 | --do_eval True \ 70 | --do_predict False \ 71 | --evaluation_strategy "steps" \ 72 | --save_strategy "steps" \ 73 | --save_steps 500 \ 74 | --eval_steps 25 \ 75 | --learning_rate 2e-4 \ 76 | --feat_adapter_name "conv_adapter" \ 77 | --trans_adapter_name "adapterblock" \ 78 | --output_adapter False \ 79 | --mh_adapter False \ 80 | --prefix_tuning False \ 81 | --lora_adapter True \ 82 | --feat_enc_adapter False \ 83 | --fine_tune False \ 84 | --per_device_train_batch_size 8 \ 85 | --gradient_accumulation_steps 4 \ 86 | --per_device_eval_batch_size 8 \ 87 | --num_train_epochs 100 \ 88 | --warmup_ratio 0.1 \ 89 | --logging_steps 20 \ 90 | --logging_dir '/data/path/Output/output_earlystop_ic_lora_2e4/log' \ 91 | --load_best_model_at_end True \ 92 | --metric_for_best_model "acc" \ 93 | 94 | ##### Prefix-tuning ###### 95 | CUDA_VISIBLE_DEVICES=0,1 python intent_cls.py \ 96 | --dataset fluent_commands \ 97 | --data_dir '/data/path/Dataset/fluent_speech_commands_dataset' \ 98 | --output_dir '/data/path/Output/output_earlystop_ic_prefix_2e3' \ 99 | --do_train True \ 100 | --do_eval True \ 101 | --do_predict False \ 102 | --evaluation_strategy "steps" \ 103 | --save_strategy "steps" \ 104 | --save_steps 500 \ 105 | --eval_steps 25 \ 106 | --learning_rate 2e-3 \ 107 | --feat_adapter_name "conv_adapter" \ 108 | --trans_adapter_name "bottleneck" \ 109 | --output_adapter False \ 110 | --mh_adapter False \ 111 | --prefix_tuning True \ 112 | --lora_adapter False \ 113 | --feat_enc_adapter False \ 114 | --fine_tune False 
\ 115 | --per_device_train_batch_size 8 \ 116 | --gradient_accumulation_steps 4 \ 117 | --per_device_eval_batch_size 8 \ 118 | --num_train_epochs 100 \ 119 | --warmup_ratio 0.1 \ 120 | --logging_steps 20 \ 121 | --logging_dir '/data/path/Output/output_earlystop_ic_prefix_2e3/log' \ 122 | --load_best_model_at_end True \ 123 | --metric_for_best_model "acc" \ 124 | 125 | ##### Adapterblock ###### 126 | CUDA_VISIBLE_DEVICES=0,3 python intent_cls.py \ 127 | --dataset fluent_commands \ 128 | --data_dir '/data/path/Dataset/fluent_speech_commands_dataset' \ 129 | --output_dir '/data/path/Output/output_earlystop_ic_adapterblock_2e4' \ 130 | --do_train True \ 131 | --do_eval True \ 132 | --do_predict False \ 133 | --evaluation_strategy "steps" \ 134 | --save_strategy "steps" \ 135 | --save_steps 500 \ 136 | --eval_steps 25 \ 137 | --learning_rate 2e-4 \ 138 | --feat_adapter_name "conv_adapter" \ 139 | --trans_adapter_name "adapterblock" \ 140 | --output_adapter True \ 141 | --mh_adapter False \ 142 | --prefix_tuning False \ 143 | --lora_adapter False \ 144 | --feat_enc_adapter False \ 145 | --fine_tune False \ 146 | --per_device_train_batch_size 8 \ 147 | --gradient_accumulation_steps 4 \ 148 | --per_device_eval_batch_size 8 \ 149 | --num_train_epochs 100 \ 150 | --warmup_ratio 0.1 \ 151 | --logging_steps 20 \ 152 | --logging_dir '/data/path/Output/output_earlystop_ic_adapterblock_2e4/log' \ 153 | --load_best_model_at_end True \ 154 | --metric_for_best_model "acc" \ 155 | 156 | -------------------------------------------------------------------------------- /tasks/keyword_spotting/keyword_spotting.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from dataclasses import field, dataclass 4 | from typing import * 5 | from gc import callbacks 6 | from transformers import set_seed, Wav2Vec2Processor, Wav2Vec2Config, TrainingArguments, HfArgumentParser, EarlyStoppingCallback 7 | from transformers.integrations import TensorBoardCallback 8 | 9 | # import sys 10 | # sys.path.append("..") 11 | 12 | from path import Path 13 | import sys 14 | folder = Path(__file__).abspath() 15 | sys.path.append(folder.parent.parent.parent) 16 | 17 | import os 18 | from os.path import join 19 | import utils 20 | from modules import CustomTrainer 21 | from modeling_wav2vec2 import Wav2Vec2ForSequenceClassification 22 | from data import get_ks_cls_data, compute_metrics 23 | 24 | @dataclass 25 | class DataTrainingArguments(TrainingArguments): 26 | data_dir: Optional[str] = field( 27 | default="/data/path/KS/12classes/", metadata={"help": "The dir of the dataset."} 28 | ) 29 | feat_adapter_name: Optional[str] = field( 30 | default="conv_adapter", metadata={"help": "The type of adapter, should be chosen among in {conv_adapter }."} 31 | ) 32 | trans_adapter_name: Optional[str] = field( 33 | default="bottleneck", metadata={"help": "The type of adapter, should be chosen among in {conv_adapter, bottleneck, adapterblock}."} 34 | ) 35 | output_adapter: Optional[bool] = field( 36 | default=False, metadata={"help": "use adapter after FFN"} 37 | ) 38 | mh_adapter: Optional[bool] = field( 39 | default=False, metadata={"help": "use adapter after multi-head attention"} 40 | ) 41 | prefix_tuning: Optional[bool] = field( 42 | default=False, metadata={"help": "use prefix-tuning in multi-head attention, implemented by us"} 43 | ) 44 | prefix_seq_len: Optional[int] = field( 45 | default=30, metadata={"help": "prefix sequence length"} 46 | ) 47 | prefix_projection: Optional[bool] = 
field( 48 | default=False, 49 | metadata={ 50 | "help": "Apply a two-layer MLP head over the prefix embeddings" 51 | } 52 | ) 53 | prefix_dropout_prob: Optional[bool] = field( 54 | default=0.1, 55 | metadata={ 56 | "help": "The dropout probability used in the models" 57 | } 58 | ) 59 | feat_enc_adapter: Optional[bool] = field( 60 | default=False, metadata={"help": "use conv_adapter in feature encoder and Adapterblock in "} 61 | ) 62 | lora_adapter: Optional[bool] = field( 63 | default=False, metadata={"help": "use lora_adapter in feature encoder"} 64 | ) 65 | fine_tune: Optional[bool] = field( 66 | default=False, metadata={"help": "if fine-tune wav2vec2 or not"} 67 | ) 68 | 69 | 70 | def main(): 71 | set_seed(1314) 72 | 73 | parser = HfArgumentParser(DataTrainingArguments) 74 | args = parser.parse_args_into_dataclasses()[0] 75 | 76 | #processor 77 | processor = Wav2Vec2Processor.from_pretrained("jonatasgrosman/wav2vec2-large-xlsr-53-english") 78 | 79 | # audio dataset 80 | ### ESD 81 | train_set, max_len_train = get_ks_cls_data(args.data_dir, processor, "train") 82 | valid_set, max_len_valid = get_ks_cls_data(args.data_dir, processor, "evaluation") 83 | test_set, max_len_test = get_ks_cls_data(args.data_dir, processor, "test") 84 | 85 | # config 86 | config = Wav2Vec2Config.from_pretrained("jonatasgrosman/wav2vec2-large-xlsr-53-english", 87 | num_labels=train_set.num_labels, 88 | label2id = train_set.label2id, 89 | id2label = train_set.id2label 90 | ) 91 | 92 | config.adapter_name = args.trans_adapter_name 93 | config.output_adapter = args.output_adapter 94 | config.mh_adapter = args.mh_adapter 95 | config.prefix_tuning = args.prefix_tuning 96 | config.feat_enc_adapter = args.feat_enc_adapter 97 | config.lora_adapter = args.lora_adapter 98 | config.prefix_seq_len = args.prefix_seq_len 99 | config.prefix_projection = args.prefix_projection 100 | config.prefix_dropout_prob = args.prefix_dropout_prob 101 | 102 | 103 | # load pretrained model 104 | model = Wav2Vec2ForSequenceClassification.from_pretrained("jonatasgrosman/wav2vec2-large-xlsr-53-english", config=config) 105 | 106 | print(model) 107 | 108 | print("\n #Train: {}, #Valid: {}, #Test: {} ".format(len(train_set), len(valid_set), len(test_set))) 109 | print(" #Train Max len: {}, #Valid Max len: {}, #Test Max len: {} \n".format(max_len_train, max_len_valid, max_len_test)) 110 | 111 | ## freeze all params exclude promptblock and classification head 112 | print("------>>> Trainable params(before freeze):", sum(p.numel() for p in model.parameters() if p.requires_grad)) 113 | if not args.fine_tune: 114 | model.freeze_exclude_prompt() 115 | print("------>>> Trainable params(after freeze):", sum(p.numel() for p in model.parameters() if p.requires_grad)) 116 | 117 | for name, param in model.named_parameters(): 118 | if param.requires_grad: 119 | print(name, param.requires_grad, param.size()) 120 | 121 | trainer = CustomTrainer( 122 | model, 123 | args, 124 | train_dataset=train_set, 125 | eval_dataset=valid_set, 126 | tokenizer=processor, 127 | compute_metrics=compute_metrics, 128 | callbacks = [EarlyStoppingCallback(early_stopping_patience = 5)] 129 | ) 130 | 131 | save_dir = join(args.output_dir, "best_model") 132 | if args.do_train: # train and test 133 | trainer.train(resume_from_checkpoint=None) 134 | trainer.save_model(save_dir) 135 | 136 | test_metrics = trainer.predict(test_set).metrics 137 | print(test_metrics) 138 | 139 | if args.do_predict: # only for test 140 | device = trainer.model.device 141 | trainer.model = 
trainer.model.from_pretrained(save_dir).to(device) 142 | test_metrics= trainer.predict(test_set).metrics 143 | print(test_metrics) 144 | 145 | 146 | if __name__ == "__main__": 147 | main() 148 | 149 | 150 | -------------------------------------------------------------------------------- /tasks/intent_classification/intent_cls.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from dataclasses import field, dataclass 4 | from typing import * 5 | from gc import callbacks 6 | from transformers import set_seed, Wav2Vec2Processor, Wav2Vec2Config, TrainingArguments, HfArgumentParser, EarlyStoppingCallback 7 | from transformers.integrations import TensorBoardCallback 8 | from os.path import join 9 | 10 | set_seed(1314) 11 | 12 | from path import Path 13 | import sys 14 | folder = Path(__file__).abspath() 15 | sys.path.append(folder.parent) 16 | sys.path.append(folder.parent.parent.parent) 17 | 18 | from dataset import ICDataset 19 | from customtrain import CustomTrainer, compute_metrics 20 | from modeling_wav2vec2 import Wav2Vec2ForSequenceClassification 21 | 22 | 23 | @dataclass 24 | class DataTrainingArguments(TrainingArguments): 25 | dataset: Optional[str] = field( 26 | default="esd", metadata={"help": "dataset name"} 27 | ) 28 | data_dir: Optional[str] = field( 29 | default="/data/path/ESD/en/", metadata={"help": "The dir of the dataset."} 30 | ) 31 | feat_adapter_name: Optional[str] = field( 32 | default="conv_adapter", metadata={"help": "The type of adapter, should be chosen among in {conv_adapter }."} 33 | ) 34 | trans_adapter_name: Optional[str] = field( 35 | default="bottleneck", metadata={"help": "The type of adapter, should be chosen among in {conv_adapter, bottleneck, adapterblock}."} 36 | ) 37 | output_adapter: Optional[bool] = field( 38 | default=False, metadata={"help": "use adapter after FFN"} 39 | ) 40 | mh_adapter: Optional[bool] = field( 41 | default=False, metadata={"help": "use adapter after multi-head attention"} 42 | ) 43 | prefix_tuning: Optional[bool] = field( 44 | default=False, metadata={"help": "use prefix-tuning in multi-head attention, implemented by us"} 45 | ) 46 | prefix_seq_len: Optional[int] = field( 47 | default=30, metadata={"help": "prefix sequence length"} 48 | ) 49 | prefix_projection: Optional[bool] = field( 50 | default=False, 51 | metadata={ 52 | "help": "Apply a two-layer MLP head over the prefix embeddings" 53 | } 54 | ) 55 | prefix_dropout_prob: Optional[bool] = field( 56 | default=0.1, 57 | metadata={ 58 | "help": "The dropout probability used in the models" 59 | } 60 | ) 61 | feat_enc_adapter: Optional[bool] = field( 62 | default=False, metadata={"help": "use conv_adapter in feature encoder and Adapterblock in "} 63 | ) 64 | lora_adapter: Optional[bool] = field( 65 | default=False, metadata={"help": "use lora_adapter in feature encoder"} 66 | ) 67 | fine_tune: Optional[bool] = field( 68 | default=False, metadata={"help": "if fine-tune wav2vec2 or not"} 69 | ) 70 | 71 | def main(): 72 | 73 | # args 74 | parser = HfArgumentParser(DataTrainingArguments) 75 | args = parser.parse_args_into_dataclasses()[0] 76 | 77 | #processor 78 | processor = Wav2Vec2Processor.from_pretrained("jonatasgrosman/wav2vec2-large-xlsr-53-english") 79 | 80 | #dataset fluent_commands 81 | train_set = ICDataset(args.data_dir, "train", processor) 82 | valid_set = ICDataset(args.data_dir, "valid", processor) 83 | test_set = ICDataset(args.data_dir, "test", processor) 84 | 85 | # breakpoint() 86 | 87 | # config 88 | 
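# num_labels, label2id and id2label come from the training split so the classification
# head matches the dataset; the adapter/prefix/LoRA flags parsed above are attached to
# this same config below, before the pretrained wav2vec2 model is loaded.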
config = Wav2Vec2Config.from_pretrained("jonatasgrosman/wav2vec2-large-xlsr-53-english", 89 | num_labels=train_set.num_labels, 90 | label2id = train_set.label2id, 91 | id2label = train_set.id2label 92 | ) 93 | 94 | config.adapter_name = args.trans_adapter_name 95 | config.output_adapter = args.output_adapter 96 | config.mh_adapter = args.mh_adapter 97 | # config.prefixtuning = args.prefixtuning 98 | config.prefix_tuning = args.prefix_tuning 99 | config.feat_enc_adapter = args.feat_enc_adapter 100 | config.lora_adapter = args.lora_adapter 101 | config.prefix_seq_len = args.prefix_seq_len 102 | config.prefix_projection = args.prefix_projection 103 | config.prefix_dropout_prob = args.prefix_dropout_prob 104 | 105 | 106 | # load pretrained model 107 | model = Wav2Vec2ForSequenceClassification.from_pretrained("jonatasgrosman/wav2vec2-large-xlsr-53-english", config=config) 108 | 109 | print(model) 110 | 111 | print("\n #Train: {}, #Valid: {}, #Test: {} \n".format(len(train_set), len(valid_set), len(test_set))) 112 | # print(" #Train Max len: {}, #Valid Max len: {}, #Test Max len: {} \n".format(max_len_train, max_len_valid, max_len_test)) 113 | 114 | ## freeze all params exclude promptblock and classification head 115 | print("------>>> Trainable params(before freeze):", sum(p.numel() for p in model.parameters() if p.requires_grad)) 116 | if not args.fine_tune: 117 | model.freeze_exclude_prompt() 118 | print("------>>> Trainable params(after freeze):", sum(p.numel() for p in model.parameters() if p.requires_grad)) 119 | 120 | for name, param in model.named_parameters(): 121 | if param.requires_grad: 122 | print(name, param.requires_grad, param.size()) 123 | 124 | trainer = CustomTrainer( 125 | model, 126 | args, 127 | train_dataset=train_set, 128 | eval_dataset=valid_set, 129 | tokenizer=processor, 130 | compute_metrics=compute_metrics, 131 | callbacks = [EarlyStoppingCallback(early_stopping_patience = 3)] 132 | ) 133 | # breakpoint() 134 | 135 | save_dir = join(args.output_dir, "best_model") 136 | if args.do_train: # train and test 137 | trainer.train(resume_from_checkpoint=None) #join(args.output_dir, "checkpoint-4000") 138 | trainer.save_model(save_dir) 139 | test_metrics = trainer.predict(test_set).metrics 140 | print(test_metrics) 141 | 142 | if args.do_predict: # only for test 143 | device = trainer.model.device 144 | trainer.model = trainer.model.from_pretrained(save_dir).to(device) 145 | test_metrics= trainer.predict(test_set).metrics 146 | print(test_metrics) 147 | 148 | 149 | if __name__ == "__main__": 150 | main() -------------------------------------------------------------------------------- /tasks/slot_filling/sf.sh: -------------------------------------------------------------------------------- 1 | ##### Fine-tune ###### 2 | CUDA_VISIBLE_DEVICES=2,3 python slot_filling.py \ 3 | --dataset snips \ 4 | --data_dir '/data/path/Dataset/SNIPS/' \ 5 | --output_dir '/data/path/output_earlystop_sf_finetune_2e4_scheduler' \ 6 | --do_train True \ 7 | --do_eval True \ 8 | --do_predict False \ 9 | --evaluation_strategy "steps" \ 10 | --save_strategy "steps" \ 11 | --max_steps 50000 \ 12 | --save_steps 5000 \ 13 | --eval_steps 200 \ 14 | --learning_rate 2e-4 \ 15 | --feat_adapter_name "conv_adapter" \ 16 | --trans_adapter_name "adapterblock" \ 17 | --output_adapter False \ 18 | --mh_adapter False \ 19 | --prefixtuning False \ 20 | --prefix_tuning False \ 21 | --lora_adapter False \ 22 | --feat_enc_adapter False \ 23 | --fine_tune True \ 24 | --per_device_train_batch_size 8 \ 25 | 
--gradient_accumulation_steps 1 \ 26 | --per_device_eval_batch_size 8 \ 27 | --num_train_epochs 30 \ 28 | --warmup_ratio 0.1 \ 29 | --logging_steps 100 \ 30 | --logging_dir '/data/path/output_earlystop_sf_finetune_2e4_scheduler/log' \ 31 | --load_best_model_at_end True \ 32 | --metric_for_best_model "slot_type_f1" 33 | 34 | ##### Bottleneck ###### 35 | CUDA_VISIBLE_DEVICES=2,3 python slot_filling.py \ 36 | --dataset snips \ 37 | --data_dir '/data/path/Dataset/SNIPS/' \ 38 | --output_dir '/data/path/output_earlystop_sf_bottleneck_2e4_scheduler' \ 39 | --do_train True \ 40 | --do_eval True \ 41 | --do_predict False \ 42 | --evaluation_strategy "steps" \ 43 | --save_strategy "steps" \ 44 | --max_steps 50000 \ 45 | --save_steps 5000 \ 46 | --eval_steps 200 \ 47 | --learning_rate 2e-4 \ 48 | --feat_adapter_name "conv_adapter" \ 49 | --trans_adapter_name "bottleneck" \ 50 | --output_adapter True \ 51 | --mh_adapter False \ 52 | --prefixtuning False \ 53 | --prefix_tuning False \ 54 | --lora_adapter False \ 55 | --feat_enc_adapter False \ 56 | --fine_tune False \ 57 | --per_device_train_batch_size 8 \ 58 | --gradient_accumulation_steps 1 \ 59 | --per_device_eval_batch_size 8 \ 60 | --num_train_epochs 30 \ 61 | --warmup_ratio 0.1 \ 62 | --logging_steps 100 \ 63 | --logging_dir '/data/path/output_earlystop_sf_bottleneck_2e4_scheduler/log' \ 64 | --load_best_model_at_end True \ 65 | --metric_for_best_model "slot_type_f1" 66 | 67 | ##### Prefix-tuning ###### 68 | CUDA_VISIBLE_DEVICES=2,3 python slot_filling.py \ 69 | --dataset snips \ 70 | --data_dir '/data/path/Dataset/SNIPS/' \ 71 | --output_dir '/data/path/output_earlystop_sf_prefix_2e3_scheduler' \ 72 | --do_train True \ 73 | --do_eval True \ 74 | --do_predict False \ 75 | --evaluation_strategy "steps" \ 76 | --save_strategy "steps" \ 77 | --max_steps 50000 \ 78 | --save_steps 5000 \ 79 | --eval_steps 200 \ 80 | --learning_rate 2e-3 \ 81 | --feat_adapter_name "conv_adapter" \ 82 | --trans_adapter_name "bottleneck" \ 83 | --output_adapter False \ 84 | --mh_adapter False \ 85 | --prefixtuning False \ 86 | --prefix_tuning True \ 87 | --lora_adapter False \ 88 | --feat_enc_adapter False \ 89 | --fine_tune False \ 90 | --per_device_train_batch_size 8 \ 91 | --gradient_accumulation_steps 1 \ 92 | --per_device_eval_batch_size 8 \ 93 | --num_train_epochs 30 \ 94 | --warmup_ratio 0.1 \ 95 | --logging_steps 100 \ 96 | --logging_dir '/data/path/output_earlystop_sf_prefix_2e3_scheduler/log' \ 97 | --load_best_model_at_end True \ 98 | --metric_for_best_model "slot_type_f1" 99 | 100 | ##### Lora ###### 101 | CUDA_VISIBLE_DEVICES=0,1 python slot_filling.py \ 102 | --dataset snips \ 103 | --data_dir '/data/path/Dataset/SNIPS/' \ 104 | --output_dir '/data/path/output_earlystop_sf_lora_2e4_scheduler' \ 105 | --do_train True \ 106 | --do_eval True \ 107 | --do_predict False \ 108 | --evaluation_strategy "steps" \ 109 | --save_strategy "steps" \ 110 | --max_steps 50000 \ 111 | --save_steps 5000 \ 112 | --eval_steps 200 \ 113 | --learning_rate 2e-4 \ 114 | --feat_adapter_name "conv_adapter" \ 115 | --trans_adapter_name "bottleneck" \ 116 | --output_adapter False \ 117 | --mh_adapter False \ 118 | --prefixtuning False \ 119 | --prefix_tuning False \ 120 | --lora_adapter True \ 121 | --feat_enc_adapter False \ 122 | --fine_tune False \ 123 | --per_device_train_batch_size 8 \ 124 | --gradient_accumulation_steps 1 \ 125 | --per_device_eval_batch_size 8 \ 126 | --num_train_epochs 30 \ 127 | --warmup_ratio 0.1 \ 128 | --logging_steps 100 \ 129 | --logging_dir 
'/data/path/output_earlystop_sf_lora_2e4_scheduler/log' \ 130 | --load_best_model_at_end True \ 131 | --metric_for_best_model "slot_type_f1" 132 | 133 | ##### Adapterblock ###### 134 | CUDA_VISIBLE_DEVICES=2,3 python slot_filling.py \ 135 | --dataset snips \ 136 | --data_dir '/data/path/Dataset/SNIPS/' \ 137 | --output_dir '/data/path/output_earlystop_sf_adapterblock_2e5_scheduler' \ 138 | --do_train True \ 139 | --do_eval True \ 140 | --do_predict False \ 141 | --evaluation_strategy "steps" \ 142 | --save_strategy "steps" \ 143 | --max_steps 50000 \ 144 | --save_steps 5000 \ 145 | --eval_steps 200 \ 146 | --learning_rate 2e-5 \ 147 | --feat_adapter_name "conv_adapter" \ 148 | --trans_adapter_name "adapterblock" \ 149 | --output_adapter True \ 150 | --mh_adapter False \ 151 | --prefixtuning False \ 152 | --prefix_tuning False \ 153 | --lora_adapter False \ 154 | --feat_enc_adapter False \ 155 | --fine_tune False \ 156 | --per_device_train_batch_size 8 \ 157 | --gradient_accumulation_steps 1 \ 158 | --per_device_eval_batch_size 8 \ 159 | --num_train_epochs 30 \ 160 | --warmup_ratio 0.1 \ 161 | --logging_steps 100 \ 162 | --logging_dir '/data/path/output_earlystop_sf_adapterblock_2e5_scheduler/log' \ 163 | --load_best_model_at_end True \ 164 | --metric_for_best_model "slot_type_f1" -------------------------------------------------------------------------------- /tasks/asr/asr_fleurs.sh: -------------------------------------------------------------------------------- 1 | ##### Fine-tune ###### 2 | CUDA_VISIBLE_DEVICES=2,3 python asr.py \ 3 | --output_dir '/data/path/output_earlystop_asr_fleurs_finetune_2e3' \ 4 | --dataset "fleurs" \ 5 | --data_dir '/data/path/Dataset/fleurs' \ 6 | --group_by_length True \ 7 | --do_train False \ 8 | --do_eval False \ 9 | --do_predict True \ 10 | --fp16 True \ 11 | --gradient_checkpointing True \ 12 | --evaluation_strategy "steps" \ 13 | --save_strategy "steps" \ 14 | --save_steps 500 \ 15 | --eval_steps 100 \ 16 | --learning_rate 2e-3 \ 17 | --feat_adapter_name "conv_adapter" \ 18 | --trans_adapter_name "adapterblock" \ 19 | --output_adapter False \ 20 | --mh_adapter False \ 21 | --prefix_tuning False \ 22 | --lora_adapter False \ 23 | --feat_enc_adapter False \ 24 | --fine_tune True \ 25 | --per_device_train_batch_size 32 \ 26 | --per_device_eval_batch_size 32 \ 27 | --num_train_epochs 50 \ 28 | --weight_decay=0.005 \ 29 | --warmup_steps=1000 \ 30 | --logging_steps 50 \ 31 | --logging_dir '/data/path/output_earlystop_asr_fleurs_finetune_2e3/log' \ 32 | --load_best_model_at_end True \ 33 | --metric_for_best_model "wer" \ 34 | --greater_is_better False 35 | # --gradient_accumulation_steps 4 \ 36 | 37 | ##### Bottleneck ###### 38 | CUDA_VISIBLE_DEVICES=2,3 python asr.py \ 39 | --output_dir '/data/path/output_earlystop_asr_fleurs_bottleneck_2e3' \ 40 | --dataset "fleurs" \ 41 | --data_dir '/data/path/Dataset/fleurs' \ 42 | --group_by_length True \ 43 | --do_train False \ 44 | --do_eval False \ 45 | --do_predict True \ 46 | --fp16 True \ 47 | --gradient_checkpointing True \ 48 | --evaluation_strategy "steps" \ 49 | --save_strategy "steps" \ 50 | --save_steps 500 \ 51 | --eval_steps 100 \ 52 | --learning_rate 2e-3 \ 53 | --feat_adapter_name "conv_adapter" \ 54 | --trans_adapter_name "bottleneck" \ 55 | --output_adapter True \ 56 | --mh_adapter False \ 57 | --prefix_tuning False \ 58 | --lora_adapter False \ 59 | --feat_enc_adapter False \ 60 | --fine_tune False \ 61 | --per_device_train_batch_size 32 \ 62 | --per_device_eval_batch_size 32 \ 63 | --num_train_epochs 
50 \ 64 | --weight_decay=0.005 \ 65 | --warmup_steps=1000 \ 66 | --logging_steps 50 \ 67 | --logging_dir '/data/path/output_earlystop_asr_fleurs_bottleneck_2e3/log' \ 68 | --load_best_model_at_end True \ 69 | --metric_for_best_model "wer" \ 70 | --greater_is_better False 71 | # --gradient_accumulation_steps 4 \ 72 | 73 | ##### Prefix-tuning ###### 74 | CUDA_VISIBLE_DEVICES=2,3 python asr.py \ 75 | --output_dir '/data/path/output_earlystop_asr_fleurs_prefixtuning_2e3' \ 76 | --dataset "fleurs" \ 77 | --data_dir '/data/path/Dataset/fleurs' \ 78 | --group_by_length True \ 79 | --do_train False \ 80 | --do_eval False \ 81 | --do_predict True \ 82 | --fp16 True \ 83 | --gradient_checkpointing True \ 84 | --evaluation_strategy "steps" \ 85 | --save_strategy "steps" \ 86 | --save_steps 500 \ 87 | --eval_steps 100 \ 88 | --learning_rate 2e-3 \ 89 | --feat_adapter_name "conv_adapter" \ 90 | --trans_adapter_name "bottleneck" \ 91 | --output_adapter False \ 92 | --mh_adapter False \ 93 | --prefix_tuning True \ 94 | --lora_adapter False \ 95 | --feat_enc_adapter False \ 96 | --fine_tune False \ 97 | --per_device_train_batch_size 32 \ 98 | --per_device_eval_batch_size 32 \ 99 | --num_train_epochs 50 \ 100 | --weight_decay=0.005 \ 101 | --warmup_steps=1000 \ 102 | --logging_steps 50 \ 103 | --logging_dir '/data/path/output_earlystop_asr_fleurs_prefixtuning_2e3/log' \ 104 | --load_best_model_at_end True \ 105 | --metric_for_best_model "wer" \ 106 | --greater_is_better False 107 | # --gradient_accumulation_steps 4 \ 108 | 109 | ##### Lora ###### 110 | CUDA_VISIBLE_DEVICES=2,3 python asr.py \ 111 | --output_dir '/data/path/output_earlystop_asr_fleurs_lora_2e3' \ 112 | --dataset "fleurs" \ 113 | --data_dir '/data/path/Dataset/fleurs' \ 114 | --group_by_length True \ 115 | --do_train False \ 116 | --do_eval False \ 117 | --do_predict True \ 118 | --fp16 True \ 119 | --gradient_checkpointing True \ 120 | --evaluation_strategy "steps" \ 121 | --save_strategy "steps" \ 122 | --save_steps 500 \ 123 | --eval_steps 100 \ 124 | --learning_rate 2e-3 \ 125 | --feat_adapter_name "conv_adapter" \ 126 | --trans_adapter_name "bottleneck" \ 127 | --output_adapter False \ 128 | --mh_adapter False \ 129 | --prefix_tuning False \ 130 | --lora_adapter True \ 131 | --feat_enc_adapter False \ 132 | --fine_tune False \ 133 | --per_device_train_batch_size 32 \ 134 | --per_device_eval_batch_size 32 \ 135 | --num_train_epochs 50 \ 136 | --weight_decay=0.005 \ 137 | --warmup_steps=1000 \ 138 | --logging_steps 50 \ 139 | --logging_dir '/data/path/output_earlystop_asr_fleurs_lora_2e3/log' \ 140 | --load_best_model_at_end True \ 141 | --metric_for_best_model "wer" \ 142 | --greater_is_better False 143 | # --gradient_accumulation_steps 4 \ 144 | 145 | ##### Adapterblock ###### 146 | CUDA_VISIBLE_DEVICES=2,3 python asr.py \ 147 | --output_dir '/data/path/output_earlystop_asr_fleurs_adapterblock_2e3' \ 148 | --dataset "fleurs" \ 149 | --data_dir '/data/path/Dataset/fleurs' \ 150 | --group_by_length True \ 151 | --do_train False \ 152 | --do_eval False \ 153 | --do_predict True \ 154 | --fp16 True \ 155 | --gradient_checkpointing True \ 156 | --evaluation_strategy "steps" \ 157 | --save_strategy "steps" \ 158 | --save_steps 500 \ 159 | --eval_steps 100 \ 160 | --learning_rate 2e-3 \ 161 | --feat_adapter_name "conv_adapter" \ 162 | --trans_adapter_name "adapterblock" \ 163 | --output_adapter True \ 164 | --mh_adapter False \ 165 | --prefix_tuning False \ 166 | --lora_adapter False \ 167 | --feat_enc_adapter False \ 168 | --fine_tune False \ 
169 | --per_device_train_batch_size 32 \ 170 | --per_device_eval_batch_size 32 \ 171 | --num_train_epochs 50 \ 172 | --weight_decay=0.005 \ 173 | --warmup_steps=1000 \ 174 | --logging_steps 50 \ 175 | --logging_dir '/data/path/output_earlystop_asr_fleurs_adapterblock_2e3/log' \ 176 | --load_best_model_at_end True \ 177 | --metric_for_best_model "wer" \ 178 | --greater_is_better False 179 | # --gradient_accumulation_steps 4 \ -------------------------------------------------------------------------------- /tasks/phoneme_recognition/pr.sh: -------------------------------------------------------------------------------- 1 | ##### Fine-tune ###### 2 | CUDA_VISIBLE_DEVICES=0,1 python phoneme_recognition.py \ 3 | --dataset "librispeech" \ 4 | --data_dir '/data/path/hf_datasets' \ 5 | --output_dir '/data/path/Output/output_earlystop_pr_librispeech_finetune_2e2' \ 6 | --group_by_length True \ 7 | --do_train True \ 8 | --do_eval True \ 9 | --do_predict False \ 10 | --fp16 True \ 11 | --gradient_checkpointing True \ 12 | --evaluation_strategy "steps" \ 13 | --save_strategy "steps" \ 14 | --save_steps 200 \ 15 | --eval_steps 100 \ 16 | --learning_rate 2e-2 \ 17 | --feat_adapter_name "conv_adapter" \ 18 | --trans_adapter_name "bottleneck" \ 19 | --output_adapter False \ 20 | --mh_adapter False \ 21 | --prefix_tuning False \ 22 | --lora_adapter False \ 23 | --feat_enc_adapter False \ 24 | --fine_tune True \ 25 | --per_device_train_batch_size 16 \ 26 | --gradient_accumulation_steps 4 \ 27 | --per_device_eval_batch_size 16 \ 28 | --num_train_epochs 30 \ 29 | --weight_decay=0.005 \ 30 | --warmup_steps=1000 \ 31 | --logging_steps 20 \ 32 | --logging_dir '/data/path/Output/output_earlystop_pr_librispeech_finetune_2e2/log' \ 33 | --load_best_model_at_end True \ 34 | --metric_for_best_model "per" \ 35 | --greater_is_better False 36 | 37 | ##### Bottleneck ###### 38 | CUDA_VISIBLE_DEVICES=0,3 python phoneme_recognition.py \ 39 | --dataset "librispeech" \ 40 | --data_dir '/data/path/hf_datasets' \ 41 | --output_dir '/data/path/Output/output_earlystop_pr_librispeech_bottleneck_2e3' \ 42 | --group_by_length True \ 43 | --do_train True \ 44 | --do_eval True \ 45 | --do_predict False \ 46 | --fp16 True \ 47 | --gradient_checkpointing True \ 48 | --evaluation_strategy "steps" \ 49 | --save_strategy "steps" \ 50 | --save_steps 200 \ 51 | --eval_steps 100 \ 52 | --learning_rate 2e-3 \ 53 | --feat_adapter_name "conv_adapter" \ 54 | --trans_adapter_name "bottleneck" \ 55 | --output_adapter True \ 56 | --mh_adapter False \ 57 | --prefix_tuning False \ 58 | --lora_adapter False \ 59 | --feat_enc_adapter False \ 60 | --fine_tune False \ 61 | --per_device_train_batch_size 16 \ 62 | --gradient_accumulation_steps 4 \ 63 | --per_device_eval_batch_size 16 \ 64 | --num_train_epochs 100 \ 65 | --weight_decay=0.005 \ 66 | --warmup_steps=1000 \ 67 | --logging_steps 20 \ 68 | --logging_dir '/data/path/Output/output_earlystop_pr_librispeech_bottleneck_2e3/log' \ 69 | --load_best_model_at_end True \ 70 | --metric_for_best_model "per" \ 71 | --greater_is_better False 72 | 73 | 74 | ##### Lora ###### 75 | CUDA_VISIBLE_DEVICES=0,1 python phoneme_recognition.py \ 76 | --dataset "librispeech" \ 77 | --data_dir '/data/path/hf_datasets' \ 78 | --output_dir '/data/path/Output/output_earlystop_pr_librispeech_lora_2e2' \ 79 | --group_by_length True \ 80 | --do_train True \ 81 | --do_eval True \ 82 | --do_predict False \ 83 | --fp16 True \ 84 | --gradient_checkpointing True \ 85 | --evaluation_strategy "steps" \ 86 | --save_strategy "steps" \ 87 
| --save_steps 200 \ 88 | --eval_steps 100 \ 89 | --learning_rate 2e-2 \ 90 | --feat_adapter_name "conv_adapter" \ 91 | --trans_adapter_name "bottleneck" \ 92 | --output_adapter False \ 93 | --mh_adapter False \ 94 | --prefix_tuning False \ 95 | --lora_adapter True \ 96 | --feat_enc_adapter False \ 97 | --fine_tune False \ 98 | --per_device_train_batch_size 16 \ 99 | --gradient_accumulation_steps 4 \ 100 | --per_device_eval_batch_size 16 \ 101 | --num_train_epochs 30 \ 102 | --weight_decay=0.005 \ 103 | --warmup_steps=1000 \ 104 | --logging_steps 20 \ 105 | --logging_dir '/data/path/Output/output_earlystop_pr_librispeech_lora_2e2/log' \ 106 | --load_best_model_at_end True \ 107 | --metric_for_best_model "per" \ 108 | --greater_is_better False 109 | 110 | ##### Prefix-tuning ###### 111 | CUDA_VISIBLE_DEVICES=0,1 python phoneme_recognition.py \ 112 | --dataset "librispeech" \ 113 | --data_dir '/data/path/hf_datasets' \ 114 | --output_dir '/data/path/Output/output_earlystop_pr_librispeech_prefixtuning_2e2' \ 115 | --group_by_length True \ 116 | --do_train True \ 117 | --do_eval True \ 118 | --do_predict False \ 119 | --fp16 True \ 120 | --gradient_checkpointing True \ 121 | --evaluation_strategy "steps" \ 122 | --save_strategy "steps" \ 123 | --save_steps 200 \ 124 | --eval_steps 100 \ 125 | --learning_rate 2e-2 \ 126 | --feat_adapter_name "conv_adapter" \ 127 | --trans_adapter_name "bottleneck" \ 128 | --output_adapter False \ 129 | --mh_adapter False \ 130 | --prefix_tuning True \ 131 | --lora_adapter False \ 132 | --feat_enc_adapter False \ 133 | --fine_tune False \ 134 | --per_device_train_batch_size 16 \ 135 | --gradient_accumulation_steps 4 \ 136 | --per_device_eval_batch_size 16 \ 137 | --num_train_epochs 30 \ 138 | --weight_decay=0.005 \ 139 | --warmup_steps=1000 \ 140 | --logging_steps 20 \ 141 | --logging_dir '/data/path/Output/output_earlystop_pr_librispeech_prefixtuning_2e2/log' \ 142 | --load_best_model_at_end True \ 143 | --metric_for_best_model "per" \ 144 | --greater_is_better False 145 | 146 | ##### Adapterblock ###### 147 | CUDA_VISIBLE_DEVICES=0,1 python phoneme_recognition.py \ 148 | --dataset "librispeech" \ 149 | --data_dir '/data/path/hf_datasets' \ 150 | --output_dir '/data/path/Output/output_earlystop_pr_librispeech_adapterblock_2e2' \ 151 | --group_by_length True \ 152 | --do_train True \ 153 | --do_eval True \ 154 | --do_predict False \ 155 | --fp16 True \ 156 | --gradient_checkpointing True \ 157 | --evaluation_strategy "steps" \ 158 | --save_strategy "steps" \ 159 | --save_steps 200 \ 160 | --eval_steps 100 \ 161 | --learning_rate 2e-2 \ 162 | --feat_adapter_name "conv_adapter" \ 163 | --trans_adapter_name "adapterblock" \ 164 | --output_adapter True \ 165 | --mh_adapter False \ 166 | --prefix_tuning False \ 167 | --lora_adapter False \ 168 | --feat_enc_adapter False \ 169 | --fine_tune False \ 170 | --per_device_train_batch_size 16 \ 171 | --gradient_accumulation_steps 4 \ 172 | --per_device_eval_batch_size 16 \ 173 | --num_train_epochs 30 \ 174 | --weight_decay=0.005 \ 175 | --warmup_steps=1000 \ 176 | --logging_steps 20 \ 177 | --logging_dir '/data/path/Output/output_earlystop_pr_librispeech_adapterblock_2e2/log' \ 178 | --load_best_model_at_end True \ 179 | --metric_for_best_model "per" \ 180 | --greater_is_better False 181 | 182 | -------------------------------------------------------------------------------- /tasks/slot_filling/metric.py: -------------------------------------------------------------------------------- 1 | import re 2 | import numpy as np 3 
| import editdistance as ed 4 | 5 | from transformers import Wav2Vec2CTCTokenizer, Wav2Vec2FeatureExtractor, Wav2Vec2Processor 6 | 7 | 8 | def cer(hypothesis, groundtruth, **kwargs): 9 | err = 0 10 | tot = 0 11 | for p, t in zip(hypothesis, groundtruth): 12 | err += float(ed.eval(p, t)) 13 | tot += len(t) 14 | return err / tot 15 | 16 | def clean(ref): 17 | ref = re.sub(r'B\-(\S+) ', '', ref) 18 | ref = re.sub(r' E\-(\S+)', '', ref) 19 | return ref 20 | 21 | def parse(hyp, ref): 22 | gex = re.compile(r'B\-(\S+) (.+?) E\-\1') 23 | 24 | # breakpoint() 25 | 26 | hyp = re.sub(r' +', ' ', hyp) 27 | ref = re.sub(r' +', ' ', ref) 28 | 29 | # breakpoint() 30 | 31 | hyp_slots = gex.findall(hyp) 32 | ref_slots = gex.findall(ref) 33 | 34 | ref_slots = ';'.join([':'.join([x[1], x[0]]) for x in ref_slots]) 35 | if len(hyp_slots)>0: 36 | hyp_slots = ';'.join([':'.join([clean(x[1]), x[0]]) for x in hyp_slots]) 37 | else: 38 | hyp_slots = '' 39 | 40 | ref = clean(ref) 41 | hyp = clean(hyp) 42 | 43 | # breakpoint() 44 | 45 | return ref, hyp, ref_slots, hyp_slots 46 | 47 | #processor 48 | tokenizer = Wav2Vec2CTCTokenizer("vocab_snips.json", unk_token="[UNK]", pad_token="[PAD]", word_delimiter_token="|") 49 | feature_extractor = Wav2Vec2FeatureExtractor(feature_size=1, sampling_rate=16000, padding_value=0.0, do_normalize=True, return_attention_mask=True) 50 | processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer) 51 | 52 | def compute_metric(pred): 53 | pred_logits = pred.predictions 54 | pred_ids = np.argmax(pred_logits, axis=-1) 55 | 56 | pred.label_ids[pred.label_ids == -100] = processor.tokenizer.pad_token_id 57 | 58 | pred_str = processor.batch_decode(pred_ids) 59 | # we do not want to group tokens when computing the metrics 60 | label_str = processor.batch_decode(pred.label_ids, group_tokens=False) 61 | 62 | hypothesis = pred_str 63 | groundtruth = label_str 64 | 65 | value_hyps = [] 66 | value_refs = [] 67 | F1s = [] 68 | for p, t in zip(hypothesis, groundtruth): 69 | ref_text, hyp_text, ref_slots, hyp_slots = parse(p, t) 70 | ref_slots = ref_slots.split(';') 71 | hyp_slots = hyp_slots.split(';') 72 | unique_slots = [] 73 | ref_dict = {} 74 | hyp_dict = {} 75 | if ref_slots[0] != '': 76 | for ref_slot in ref_slots: 77 | v, k = ref_slot.split(':') 78 | ref_dict.setdefault(k, []) 79 | ref_dict[k].append(v) 80 | if hyp_slots[0] != '': 81 | for hyp_slot in hyp_slots: 82 | v, k = hyp_slot.split(':') 83 | hyp_dict.setdefault(k, []) 84 | hyp_dict[k].append(v) 85 | # Slot Type F1 evaluation 86 | if len(hyp_dict.keys()) == 0 and len(ref_dict.keys()) == 0: 87 | F1 = 1.0 88 | elif len(hyp_dict.keys()) == 0: 89 | F1 = 0.0 90 | elif len(ref_dict.keys()) == 0: 91 | F1 = 0.0 92 | else: 93 | P, R = 0.0, 0.0 94 | for slot in ref_dict: 95 | if slot in hyp_dict: 96 | R += 1 97 | R = R / len(ref_dict.keys()) 98 | for slot in hyp_dict: 99 | if slot in ref_dict: 100 | P += 1 101 | P = P / len(hyp_dict.keys()) 102 | F1 = 2*P*R/(P+R) if (P+R) > 0 else 0.0 103 | F1s.append(F1) 104 | # Slot Value WER/CER evaluation 105 | unique_slots = list(ref_dict.keys()) 106 | for slot in unique_slots: 107 | for ref_i, ref_v in enumerate(ref_dict[slot]): 108 | if slot not in hyp_dict: 109 | hyp_v = '' 110 | value_refs.append(ref_v) 111 | value_hyps.append(hyp_v) 112 | else: 113 | min_cer = 100 114 | best_hyp_v = "" 115 | for hyp_v in hyp_dict[slot]: 116 | tmp_cer = cer([hyp_v], [ref_v]) 117 | if min_cer > tmp_cer: 118 | min_cer = tmp_cer 119 | best_hyp_v = hyp_v 120 | value_refs.append(ref_v) 121 | 
value_hyps.append(best_hyp_v) 122 | 123 | return {"slot_type_f1" : sum(F1s) / len(F1s), 124 | "slot_value_cer" : cer(value_hyps, value_refs)} 125 | 126 | def slot_type_f1_(hypothesis, groundtruth, **kwargs): 127 | F1s = [] 128 | for p, t in zip(hypothesis, groundtruth): 129 | ref_text, hyp_text, ref_slots, hyp_slots = parse(p, t) 130 | ref_slots = ref_slots.split(';') 131 | hyp_slots = hyp_slots.split(';') 132 | unique_slots = [] 133 | ref_dict = {} 134 | hyp_dict = {} 135 | if ref_slots[0] != '': 136 | for ref_slot in ref_slots: 137 | v, k = ref_slot.split(':') 138 | ref_dict.setdefault(k, []) 139 | ref_dict[k].append(v) 140 | if hyp_slots[0] != '': 141 | for hyp_slot in hyp_slots: 142 | v, k = hyp_slot.split(':') 143 | hyp_dict.setdefault(k, []) 144 | hyp_dict[k].append(v) 145 | # Slot Type F1 evaluation 146 | if len(hyp_dict.keys()) == 0 and len(ref_dict.keys()) == 0: 147 | F1 = 1.0 148 | elif len(hyp_dict.keys()) == 0: 149 | F1 = 0.0 150 | elif len(ref_dict.keys()) == 0: 151 | F1 = 0.0 152 | else: 153 | P, R = 0.0, 0.0 154 | for slot in ref_dict: 155 | if slot in hyp_dict: 156 | R += 1 157 | R = R / len(ref_dict.keys()) 158 | for slot in hyp_dict: 159 | if slot in ref_dict: 160 | P += 1 161 | P = P / len(hyp_dict.keys()) 162 | F1 = 2*P*R/(P+R) if (P+R) > 0 else 0.0 163 | F1s.append(F1) 164 | return sum(F1s) / len(F1s) 165 | 166 | def slot_value_cer(hypothesis, groundtruth, **kwargs): 167 | value_hyps = [] 168 | value_refs = [] 169 | for p, t in zip(hypothesis, groundtruth): 170 | ref_text, hyp_text, ref_slots, hyp_slots = parse(p, t) 171 | ref_slots = ref_slots.split(';') 172 | hyp_slots = hyp_slots.split(';') 173 | unique_slots = [] 174 | ref_dict = {} 175 | hyp_dict = {} 176 | if ref_slots[0] != '': 177 | for ref_slot in ref_slots: 178 | v, k = ref_slot.split(':') 179 | ref_dict.setdefault(k, []) 180 | ref_dict[k].append(v) 181 | if hyp_slots[0] != '': 182 | for hyp_slot in hyp_slots: 183 | v, k = hyp_slot.split(':') 184 | hyp_dict.setdefault(k, []) 185 | hyp_dict[k].append(v) 186 | # Slot Value WER/CER evaluation 187 | unique_slots = list(ref_dict.keys()) 188 | for slot in unique_slots: 189 | for ref_i, ref_v in enumerate(ref_dict[slot]): 190 | if slot not in hyp_dict: 191 | hyp_v = '' 192 | value_refs.append(ref_v) 193 | value_hyps.append(hyp_v) 194 | else: 195 | min_cer = 100 196 | best_hyp_v = "" 197 | for hyp_v in hyp_dict[slot]: 198 | tmp_cer = cer([hyp_v], [ref_v]) 199 | if min_cer > tmp_cer: 200 | min_cer = tmp_cer 201 | best_hyp_v = hyp_v 202 | value_refs.append(ref_v) 203 | value_hyps.append(best_hyp_v) 204 | 205 | return cer(value_hyps, value_refs) -------------------------------------------------------------------------------- /tasks/speaker_classification/speaker_recg.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from dataclasses import field, dataclass 4 | from typing import * 5 | from gc import callbacks 6 | from transformers import set_seed, Wav2Vec2Processor, Wav2Vec2Config, TrainingArguments, HfArgumentParser, EarlyStoppingCallback 7 | from transformers.integrations import TensorBoardCallback 8 | 9 | set_seed(1314) 10 | 11 | from path import Path 12 | import sys 13 | folder = Path(__file__).abspath() 14 | sys.path.append(folder.parent.parent.parent) 15 | 16 | import os 17 | from os.path import join 18 | import utils 19 | from modules import CustomTrainer 20 | from modeling_wav2vec2 import Wav2Vec2ForSequenceClassification 21 | from data import get_sp_cls_data, compute_metrics, get_sp_vctk_data 22 
| 23 | @dataclass 24 | class DataTrainingArguments(TrainingArguments): 25 | dataset: Optional[str] = field( 26 | default="esd", metadata={"help": "dataset name"} 27 | ) 28 | data_dir: Optional[str] = field( 29 | default="/data/path/VCTK/", metadata={"help": "The dir of the dataset."} 30 | ) 31 | feat_adapter_name: Optional[str] = field( 32 | default="conv_adapter", metadata={"help": "The type of adapter, should be chosen among in {conv_adapter }."} 33 | ) 34 | trans_adapter_name: Optional[str] = field( 35 | default="bottleneck", metadata={"help": "The type of adapter, should be chosen among in {conv_adapter, bottleneck, adapterblock}."} 36 | ) 37 | output_adapter: Optional[bool] = field( 38 | default=False, metadata={"help": "use adapter after FFN"} 39 | ) 40 | mh_adapter: Optional[bool] = field( 41 | default=False, metadata={"help": "use adapter after multi-head attention"} 42 | ) 43 | prefix_tuning: Optional[bool] = field( 44 | default=False, metadata={"help": "use prefix-tuning in multi-head attention, implemented by us"} 45 | ) 46 | prefix_seq_len: Optional[int] = field( 47 | default=30, metadata={"help": "prefix sequence length"} 48 | ) 49 | prefix_projection: Optional[bool] = field( 50 | default=False, 51 | metadata={ 52 | "help": "Apply a two-layer MLP head over the prefix embeddings" 53 | } 54 | ) 55 | prefix_dropout_prob: Optional[bool] = field( 56 | default=0.1, 57 | metadata={ 58 | "help": "The dropout probability used in the models" 59 | } 60 | ) 61 | feat_enc_adapter: Optional[bool] = field( 62 | default=False, metadata={"help": "use conv_adapter in feature encoder and Adapterblock in "} 63 | ) 64 | lora_adapter: Optional[bool] = field( 65 | default=False, metadata={"help": "use lora_adapter in feature encoder"} 66 | ) 67 | fine_tune: Optional[bool] = field( 68 | default=False, metadata={"help": "if fine-tune wav2vec2 or not"} 69 | ) 70 | 71 | 72 | 73 | def main(): 74 | 75 | # args 76 | parser = HfArgumentParser(DataTrainingArguments) 77 | args = parser.parse_args_into_dataclasses()[0] 78 | 79 | #processor 80 | processor = Wav2Vec2Processor.from_pretrained("jonatasgrosman/wav2vec2-large-xlsr-53-english") 81 | 82 | # audio dataset 83 | if args.dataset.lower() == "esd": 84 | ### ESD "/data/path/ESD/en/" 85 | train_set, max_len_train = get_sp_cls_data(args.data_dir, processor, "train") 86 | valid_set, max_len_valid = get_sp_cls_data(args.data_dir, processor, "evaluation") 87 | test_set, max_len_test = get_sp_cls_data(args.data_dir, processor, "test") 88 | elif args.dataset.lower() == "vctk": 89 | ### VCTK "/data/path/VCTK_Wav/wav48/" 90 | train_set, _ = get_sp_vctk_data(args.data_dir, processor, "train") 91 | valid_set, _ = get_sp_vctk_data(args.data_dir, processor, "evaluation") 92 | test_set, _ = get_sp_vctk_data(args.data_dir, processor, "test") 93 | else: 94 | raise NotImplementedError 95 | 96 | print("len of train_set:", len(train_set)) 97 | print("len of valid_set:", len(valid_set)) 98 | print("len of test_set:", len(test_set)) 99 | 100 | # config 101 | config = Wav2Vec2Config.from_pretrained("jonatasgrosman/wav2vec2-large-xlsr-53-english", 102 | num_labels=train_set.num_labels, 103 | label2id = train_set.label2id, 104 | id2label = train_set.id2label 105 | ) 106 | 107 | config.adapter_name = args.trans_adapter_name 108 | config.output_adapter = args.output_adapter 109 | config.mh_adapter = args.mh_adapter 110 | config.prefix_tuning = args.prefix_tuning 111 | config.feat_enc_adapter = args.feat_enc_adapter 112 | config.lora_adapter = args.lora_adapter 113 | 
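# prefix-tuning hyper-parameters: prefix sequence length, optional two-layer MLP
# projection, and dropout probability (see the DataTrainingArguments help strings above)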
config.prefix_seq_len = args.prefix_seq_len 114 | config.prefix_projection = args.prefix_projection 115 | config.prefix_dropout_prob = args.prefix_dropout_prob 116 | 117 | 118 | # load pretrained model 119 | model = Wav2Vec2ForSequenceClassification.from_pretrained("jonatasgrosman/wav2vec2-large-xlsr-53-english", config=config) 120 | 121 | print(model) 122 | 123 | print("\n #Train: {}, #Valid: {}, #Test: {} \n".format(len(train_set), len(valid_set), len(test_set))) 124 | # print(" #Train Max len: {}, #Valid Max len: {}, #Test Max len: {} \n".format(max_len_train, max_len_valid, max_len_test)) 125 | 126 | ## freeze all params exclude promptblock and classification head 127 | print("------>>> Trainable params(before freeze):", sum(p.numel() for p in model.parameters() if p.requires_grad)) 128 | if not args.fine_tune: 129 | model.freeze_exclude_prompt() 130 | # print("------>>> Trainable params(after freeze):", sum(p.numel() for p in model.parameters() if p.requires_grad)) 131 | 132 | if args.fine_tune: 133 | free_layers = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19] 134 | 135 | for name, param in model.named_parameters(): 136 | for num in free_layers: 137 | if f"wav2vec2.encoder.layers.{num}." in name: 138 | param.requires_grad = False 139 | for name, param in model.named_parameters(): 140 | if param.requires_grad: 141 | print(name, param.requires_grad, param.size()) 142 | print("------>>> Trainable params(after freeze):", sum(p.numel() for p in model.parameters() if p.requires_grad)) 143 | # breakpoint() 144 | 145 | trainer = CustomTrainer( 146 | model, 147 | args, 148 | train_dataset=train_set, 149 | eval_dataset=valid_set, 150 | tokenizer=processor, 151 | compute_metrics=compute_metrics, 152 | callbacks = [EarlyStoppingCallback(early_stopping_patience = 5)] 153 | ) 154 | save_dir = join(args.output_dir, "best_model") 155 | if args.do_train: # train and test 156 | trainer.train(resume_from_checkpoint=None) 157 | trainer.save_model(save_dir) 158 | test_metrics = trainer.predict(test_set).metrics 159 | print(test_metrics) 160 | 161 | if args.do_predict: # only for test 162 | device = trainer.model.device 163 | trainer.model = trainer.model.from_pretrained(save_dir).to(device) 164 | test_metrics= trainer.predict(test_set).metrics 165 | print(test_metrics) 166 | 167 | 168 | if __name__ == "__main__": 169 | main() 170 | 171 | -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from dataclasses import field, dataclass 4 | from typing import * 5 | from gc import callbacks 6 | from transformers import set_seed, Wav2Vec2Processor, Wav2Vec2Config, TrainingArguments, EarlyStoppingCallback, HfArgumentParser 7 | from transformers.integrations import TensorBoardCallback 8 | 9 | import os 10 | from os.path import join 11 | import utils 12 | from modules import CustomTrainer 13 | from modeling_wav2vec2 import Wav2Vec2ForSequenceClassification 14 | from data import get_data, compute_metrics, get_emo_cls_iemocap_data, get_emo_meld_data, compute_metrics_macro_f1 15 | 16 | @dataclass 17 | class DataTrainingArguments(TrainingArguments): 18 | dataset: Optional[str] = field( 19 | default="esd", metadata={"help": "dataset name"} 20 | ) 21 | data_dir: Optional[str] = field( 22 | default="/data/path/ESD/en/", metadata={"help": "The dir of the dataset."} 23 | ) 24 | feat_adapter_name: Optional[str] = field( 25 | default="conv_adapter", metadata={"help": "The 
type of adapter, should be chosen among in {conv_adapter }."} 26 | ) 27 | trans_adapter_name: Optional[str] = field( 28 | default="bottleneck", metadata={"help": "The type of adapter, should be chosen among in {conv_adapter, bottleneck, adapterblock}."} 29 | ) 30 | output_adapter: Optional[bool] = field( 31 | default=False, metadata={"help": "use adapter after FFN"} 32 | ) 33 | mh_adapter: Optional[bool] = field( 34 | default=False, metadata={"help": "use adapter after multi-head attention"} 35 | ) 36 | prefix_tuning: Optional[bool] = field( 37 | default=False, metadata={"help": "use prefix-tuning in multi-head attention, implemented by us"} 38 | ) 39 | prefix_seq_len: Optional[int] = field( 40 | default=30, metadata={"help": "prefix sequence length"} 41 | ) 42 | prefix_projection: Optional[bool] = field( 43 | default=False, 44 | metadata={ 45 | "help": "Apply a two-layer MLP head over the prefix embeddings" 46 | } 47 | ) 48 | prefix_dropout_prob: Optional[bool] = field( 49 | default=0.1, 50 | metadata={ 51 | "help": "The dropout probability used in the models" 52 | } 53 | ) 54 | feat_enc_adapter: Optional[bool] = field( 55 | default=False, metadata={"help": "use conv_adapter in feature encoder and Adapterblock in "} 56 | ) 57 | lora_adapter: Optional[bool] = field( 58 | default=False, metadata={"help": "use lora_adapter in feature encoder"} 59 | ) 60 | fine_tune: Optional[bool] = field( 61 | default=False, metadata={"help": "if fine-tune wav2vec2 or not"} 62 | ) 63 | 64 | 65 | 66 | def main(): 67 | set_seed(1314) 68 | 69 | # args 70 | parser = HfArgumentParser(DataTrainingArguments) 71 | args = parser.parse_args_into_dataclasses()[0] 72 | 73 | #processor 74 | processor = Wav2Vec2Processor.from_pretrained("jonatasgrosman/wav2vec2-large-xlsr-53-english") 75 | 76 | # audio dataset 77 | if args.dataset.lower() == "esd": 78 | ### ESD 79 | train_set, max_len_train = get_data(args.data_dir, processor, "train") 80 | valid_set, max_len_valid = get_data(args.data_dir, processor, "evaluation") 81 | test_set, max_len_test = get_data(args.data_dir, processor, "test") 82 | 83 | # train_set = get_emo_cls_data(args.data_dir, processor, "train") # for read wav when need 84 | # valid_set = get_emo_cls_data(args.data_dir, processor, "evaluation") 85 | # test_set = get_emo_cls_data(args.data_dir, processor, "test") 86 | elif args.dataset.lower() == "iemocap": 87 | ### IEMOCAP 88 | wav_file_names, emotions = utils.get_iemocap_labels(args.data_dir) 89 | 90 | train_set, max_len_train = get_emo_cls_iemocap_data(args.data_dir, processor, "train", wav_file_names, emotions) 91 | valid_set, max_len_valid = get_emo_cls_iemocap_data(args.data_dir, processor, "evaluation", wav_file_names, emotions) 92 | test_set, max_len_test = get_emo_cls_iemocap_data(args.data_dir, processor, "test", wav_file_names, emotions) 93 | 94 | elif args.dataset.lower() == "meld": 95 | ### MELD 96 | train_set, max_len_train = get_emo_meld_data(args.data_dir, processor, "train") 97 | valid_set, max_len_valid = get_emo_meld_data(args.data_dir, processor, "evaluation") 98 | test_set, max_len_test = get_emo_meld_data(args.data_dir, processor, "test") 99 | 100 | # config 101 | config = Wav2Vec2Config.from_pretrained("jonatasgrosman/wav2vec2-large-xlsr-53-english", 102 | num_labels=train_set.num_labels, 103 | label2id = train_set.label2id, 104 | id2label = train_set.id2label 105 | ) 106 | 107 | config.adapter_name = args.trans_adapter_name 108 | config.output_adapter = args.output_adapter 109 | config.mh_adapter = args.mh_adapter 110 | 
config.prefix_tuning = args.prefix_tuning 111 | config.feat_enc_adapter = args.feat_enc_adapter 112 | config.lora_adapter = args.lora_adapter 113 | config.prefix_seq_len = args.prefix_seq_len 114 | config.prefix_projection = args.prefix_projection 115 | config.prefix_dropout_prob = args.prefix_dropout_prob 116 | 117 | 118 | # load pretrained model 119 | model = Wav2Vec2ForSequenceClassification.from_pretrained("jonatasgrosman/wav2vec2-large-xlsr-53-english", config=config) 120 | 121 | print(model) 122 | 123 | print("\n #Train: {}, #Valid: {}, #Test: {} ".format(len(train_set), len(valid_set), len(test_set))) 124 | # print(" #Train Max len: {}, #Valid Max len: {}, #Test Max len: {} \n".format(max_len_train, max_len_valid, max_len_test)) 125 | print("\n") 126 | 127 | ## freeze all params exclude promptblock and classification head 128 | print("------>>> Trainable params(before freeze):", sum(p.numel() for p in model.parameters() if p.requires_grad)) 129 | if not args.fine_tune: 130 | model.freeze_exclude_prompt() 131 | print("------>>> Trainable params(after freeze):", sum(p.numel() for p in model.parameters() if p.requires_grad)) 132 | 133 | for name, param in model.named_parameters(): 134 | if param.requires_grad: 135 | print(name, param.requires_grad, param.size()) 136 | 137 | if args.metric_for_best_model == "f1": 138 | com_metrics = compute_metrics_macro_f1 139 | else: 140 | com_metrics = compute_metrics 141 | 142 | 143 | trainer = CustomTrainer( 144 | model, 145 | args, 146 | train_dataset=train_set, 147 | eval_dataset=valid_set, 148 | tokenizer=processor, 149 | # compute_metrics=compute_metrics, 150 | compute_metrics=com_metrics, 151 | callbacks = [EarlyStoppingCallback(early_stopping_patience = 5)] 152 | ) 153 | 154 | save_dir = join(args.output_dir, "best_model") 155 | if args.do_train: # train and test 156 | trainer.train(resume_from_checkpoint=None) 157 | trainer.save_model(save_dir) 158 | 159 | test_metrics = trainer.predict(test_set).metrics 160 | print(test_metrics) 161 | 162 | if args.do_predict: # only for test 163 | device = trainer.model.device 164 | trainer.model = trainer.model.from_pretrained(save_dir).to(device) 165 | test_metrics= trainer.predict(test_set).metrics 166 | print(test_metrics) 167 | 168 | 169 | if __name__ == "__main__": 170 | main() 171 | 172 | 173 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Evaluating parameter-efficient transfer learning approaches on SURE benchmark for speech understanding 2 | 3 | ## [Paper](https://arxiv.org/pdf/2303.03267.pdf) 4 | 5 | ## Motivation 6 | 7 | Fine-tuning is widely used as the default algorithm for transfer learning from pre-trained models. Parameter inefficiency can however arise when, during transfer learning, all the parameters of a large pre-trained model need to be updated for individual downstream tasks. As the number of parameters grows, fine-tuning is prone to overfitting and catastrophic forgetting. In addition, full fine-tuning can become prohibitively expensive when the model is used for many tasks. To mitigate this issue, parameter-efficient transfer learning algorithms, such as adapters and prefix tuning, have been proposed as a way to introduce a few trainable parameters that can be plugged into large pre-trained language models such as BERT, and HuBERT. 
In this paper, we introduce the Speech UndeRstanding Evaluation (SURE) benchmark for parameter-efficient learning for various speech-processing tasks. Additionally, we introduce a new adapter, ConvAdapter, based on 1D convolution. We show that ConvAdapter outperforms the standard adapters while showing comparable performance against prefix tuning and LoRA with only 0.94% of trainable parameters on some of the tasks in SURE. We further explore the effectiveness of parameter-efficient transfer learning for speech synthesis tasks such as Text-to-Speech (TTS). 8 | 9 | ![image](https://user-images.githubusercontent.com/35062414/221511052-a6f4c44a-f779-4fca-9142-6ea10254b764.png) 10 | 11 | ![image](https://user-images.githubusercontent.com/35062414/221511119-27c65410-3086-4509-8927-1ce43efc13af.png) 12 | 13 | ## Installation 14 | * Set up environments 15 | ```bash 16 | conda create --name speechprompt python==3.8.5 17 | conda activate speechprompt 18 | conda install pytorch==1.10.0 torchvision==0.11.0 torchaudio==0.10.0 -c pytorch 19 | ``` 20 | * Install other dependencies 21 | ```bash 22 | pip install -r requirements.txt 23 | ``` 24 | 25 | ## Supported tasks and datasets 26 | 27 | ![image](https://user-images.githubusercontent.com/35062414/221520253-3fba52bf-ff2f-4a2a-8199-be75d4de3989.png) 28 | 29 | 30 | ## How to run 31 | First, we need to specify the dataset and arguments. Let's use "esd" as the dataset and "finetune" as the tuning method for the speech emotion recognition task as an example: 32 | 33 | ```bash 34 | CUDA_VISIBLE_DEVICES=2,3 python train.py \ 35 | --dataset "esd" \ 36 | --data_dir "/data/path/ESD" \ 37 | --output_dir '/data/path/output_earlystop_ser_esd_finetune_2e3' \ 38 | --do_train True \ 39 | --do_eval True \ 40 | --do_predict False \ 41 | --evaluation_strategy "steps" \ 42 | --save_strategy "steps" \ 43 | --save_steps 500 \ 44 | --eval_steps 25 \ 45 | --learning_rate 2e-3 \ 46 | --feat_adapter_name "conv_adapter" \ 47 | --trans_adapter_name "adapterblock" \ 48 | --output_adapter False \ 49 | --mh_adapter False \ 50 | --prefix_tuning False \ 51 | --lora_adapter False \ 52 | --feat_enc_adapter False \ 53 | --fine_tune True \ 54 | --per_device_train_batch_size 64 \ 55 | --gradient_accumulation_steps 4 \ 56 | --per_device_eval_batch_size 64 \ 57 | --num_train_epochs 100 \ 58 | --warmup_ratio 0.1 \ 59 | --logging_steps 20 \ 60 | --logging_dir '/data/path/output_earlystop_ser_esd_finetune_2e3/log' \ 61 | --load_best_model_at_end True \ 62 | --metric_for_best_model "f1" 63 | ``` 64 | 65 | #### Parameters 66 | 67 | * dataset: specify the dataset, such as "esd", "fleurs", "fluent_commands", etc. 
68 | * data_dir: path to the dataset directory, for instance, "../data/path/ESD"
69 | * output_dir: path for checkpoints and logs, for instance, '../data/path/output_earlystop_ser_esd_finetune_2e3'
70 | * do_train: set to True to run training
71 | * do_eval: set to True to run evaluation
72 | * do_predict: set to True to run inference/testing only
73 | * evaluation_strategy: standard Hugging Face Trainer argument; see the official documentation
74 | * save_strategy: standard Hugging Face Trainer argument
75 | * save_steps: standard Hugging Face Trainer argument
76 | * eval_steps: standard Hugging Face Trainer argument
77 | * learning_rate: standard Hugging Face Trainer argument
78 | * feat_adapter_name: the adapter type added in the feature encoder; not used in this paper and can be skipped
79 | * trans_adapter_name: the adapter type added in the transformer layers, "adapterblock" for ConvAdapter and "bottleneck" for Bottleneck Adapter
80 | * output_adapter: True to insert the adapter after the feed-forward block of every transformer layer; only applies to ConvAdapter and Bottleneck Adapter
81 | * mh_adapter: True to insert the adapter after the multi-head attention of every transformer layer; only applies to ConvAdapter and Bottleneck Adapter
82 | * prefix_tuning: True to enable prefix-tuning
83 | * lora_adapter: True to enable LoRA
84 | * feat_enc_adapter: True to add an adapter in the feature encoder of wav2vec2
85 | * fine_tune: True to fully fine-tune the backbone instead of inserting adapters
86 | * per_device_train_batch_size: standard Hugging Face Trainer argument
87 | * gradient_accumulation_steps: standard Hugging Face Trainer argument
88 | * per_device_eval_batch_size: standard Hugging Face Trainer argument
89 | * num_train_epochs: standard Hugging Face Trainer argument
90 | * warmup_ratio: standard Hugging Face Trainer argument
91 | * logging_steps: standard Hugging Face Trainer argument
92 | * logging_dir: standard Hugging Face Trainer argument
93 | * load_best_model_at_end: standard Hugging Face Trainer argument
94 | * metric_for_best_model: standard Hugging Face Trainer argument
95 | 
96 | #### Emotion classification
97 | Let's further explain the five training methods of the model.
For example, to start a new emotion classification task, we set the corresponding parameters as below:
98 | ```python
99 | ## finetune
100 | --fine_tune True
101 | ## bottleneck
102 | --trans_adapter_name "bottleneck"
103 | --output_adapter True
104 | ## prefix-tuning
105 | --prefix_tuning True
106 | ## lora
107 | --lora_adapter True
108 | ## ConvAdapter
109 | --trans_adapter_name "adapterblock"
110 | --output_adapter True
111 | ```
112 | 
113 | We also provide an example for each training method in "emotion_cls.sh"; use the following command to start a new emotion classification task:
114 | ```python
115 | bash emotion_cls.sh
116 | ```
117 | 
118 | ## Tensorboard
119 | To further monitor the convergence of model training, we can view the log files with Tensorboard:
120 | ```python
121 | tensorboard --logdir=/data/path/output_earlystop_asr_fleurs_lora_2e3/log --bind_all
122 | ```
123 | 
124 | ## Citation
125 | ```python
126 | @inproceedings{li2023evaluating,
127 |   title={Evaluating Parameter-Efficient Transfer Learning Approaches on SURE Benchmark for Speech Understanding},
128 |   author={Li, Yingting and Mehrish, Ambuj and Zhao, Shuai and Bhardwaj, Rishabh and Zadeh, Amir and Majumder, Navonil and Mihalcea, Rada and Poria, Soujanya},
129 |   booktitle={ICASSP},
130 |   year={2023}
131 | }
132 | ```
133 | Note: Please cite our paper if you find this repository useful.
134 | 
--------------------------------------------------------------------------------
/modules.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torch import nn
3 | from typing import *
4 | 
5 | from transformers import Trainer
6 | from transformers.adapters import AdapterConfig
7 | from transformers.adapters.modeling import Adapter
8 | 
9 | from transformers.activations import ACT2FN
10 | 
11 | from torch.nn import init
12 | from torch import Tensor
13 | 
14 | class SELayer4Vision(nn.Module):
15 |     def __init__(self, channel, reduction=16):
16 |         super(SELayer4Vision, self).__init__()
17 |         self.avg_pool = nn.AdaptiveAvgPool2d(1)
18 |         self.fc = nn.Sequential(
19 |             nn.Linear(channel, channel // reduction, bias=False),
20 |             nn.ReLU(inplace=True),
21 |             nn.Linear(channel // reduction, channel, bias=False),
22 |             nn.Sigmoid()
23 |         )
24 | 
25 |     def forward(self, x):
26 |         b, c, _, _ = x.size()
27 |         y = self.avg_pool(x).view(b, c)
28 |         y = self.fc(y).view(b, c, 1, 1)
29 |         return x * y.expand_as(x)
30 | 
31 | 
32 | class SELayer(nn.Module):
33 |     def __init__(self, channel, reduction=16):
34 |         super(SELayer, self).__init__()
35 |         self.avg_pool = nn.AdaptiveAvgPool1d(1)
36 |         self.fc = nn.Sequential(
37 |             nn.Linear(channel, channel // reduction, bias=False),
38 |             nn.ReLU(inplace=True),
39 |             nn.Linear(channel // reduction, channel, bias=False),
40 |             nn.Sigmoid()
41 |         )
42 | 
43 |     def forward(self, x):
44 |         residual = x
45 |         b, c, _ = x.size()
46 |         y = self.avg_pool(x).view(b, c)
47 |         y = self.fc(y).view(b, c, 1)
48 |         # return residual + x * y.expand_as(x)
49 |         return x * y.expand_as(x)
50 | '''
51 | def depthwise_conv5X5(in_planes, out_planes, stride=1):
52 |     return nn.Conv1d(in_planes, out_planes, kernel_size=5, stride=stride, bias=False, groups=4)
53 | 
54 | def conv1x1(in_planes, out_planes, stride=1):
55 |     "1x1 convolution without padding"
56 |     return nn.Conv1d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False)
57 | '''
58 | 
59 | class PrefixEncoder(nn.Module):
60 |     #code from P-tuning-v2
61 |     #https://github.com/THUDM/P-tuning-v2/blob/main/model/prefix_encoder.py
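    # Fields read from `config` by this module (see __init__ and forward below):
    #   prefix_projection  - if True, reparameterize the prefix through a two-layer MLP
    #   prefix_seq_len     - number of prefix tokens prepended to each layer
    #   hidden_size        - transformer hidden dimension
    #   prefix_hidden_size - bottleneck width of the MLP (only used when prefix_projection is True)
    #   num_hidden_layers  - number of transformer layers that receive the past key/values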
62 | r''' 63 | The torch.nn model to encode the prefix 64 | 65 | Input shape: (batch-size, prefix-length) 66 | 67 | Output shape: (batch-size, prefix-length, 2*layers*hidden) 68 | ''' 69 | def __init__(self, config): 70 | super().__init__() 71 | self.prefix_projection = config.prefix_projection 72 | if self.prefix_projection: 73 | # Use a two-layer MLP to encode the prefix 74 | self.embedding = torch.nn.Embedding(config.prefix_seq_len, config.hidden_size) 75 | self.trans = torch.nn.Sequential( 76 | torch.nn.Linear(config.hidden_size, config.prefix_hidden_size), 77 | torch.nn.Tanh(), 78 | torch.nn.Linear(config.prefix_hidden_size, config.num_hidden_layers * 2 * config.hidden_size) 79 | ) 80 | else: 81 | self.embedding = torch.nn.Embedding(config.prefix_seq_len, config.num_hidden_layers * 2 * config.hidden_size) 82 | 83 | def forward(self, prefix: torch.Tensor): 84 | if self.prefix_projection: 85 | prefix_tokens = self.embedding(prefix) 86 | past_key_values = self.trans(prefix_tokens) 87 | else: 88 | past_key_values = self.embedding(prefix) 89 | return past_key_values 90 | 91 | class LinearNorm(nn.Module): 92 | """ LinearNorm Projection """ 93 | 94 | def __init__(self, in_features, out_features, bias=False): 95 | super(LinearNorm, self).__init__() 96 | self.linear = nn.Linear(in_features, out_features, bias) 97 | 98 | nn.init.xavier_uniform_(self.linear.weight) 99 | if bias: 100 | nn.init.constant_(self.linear.bias, 0.0) 101 | 102 | def forward(self, x): 103 | x = self.linear(x) 104 | return x 105 | 106 | class Swish(nn.Module): 107 | """ 108 | Swish is a smooth, non-monotonic function that consistently matches or outperforms ReLU on deep networks applied 109 | to a variety of challenging domains such as Image classification and Machine translation. 110 | """ 111 | def __init__(self): 112 | super(Swish, self).__init__() 113 | 114 | def forward(self, inputs): 115 | return inputs * inputs.sigmoid() 116 | 117 | class FeedForwardModule(nn.Module): 118 | """ 119 | Conformer Feed Forward Module follow pre-norm residual units and apply layer normalization within the residual unit 120 | and on the input before the first linear layer. This module also apply Swish activation and dropout, which helps 121 | regularizing the network. 122 | Args: 123 | encoder_dim (int): Dimension of conformer encoder 124 | expansion_factor (int): Expansion factor of feed forward module. 125 | dropout_p (float): Ratio of dropout 126 | Inputs: inputs 127 | - **inputs** (batch, time, dim): Tensor contains input sequences 128 | Outputs: outputs 129 | - **outputs** (batch, time, dim): Tensor produces by feed forward module. 
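    Note: the down-projection back to encoder_dim is commented out below, so this module actually
    returns tensors of shape (batch, time, encoder_dim * expansion_factor).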
130 | """ 131 | def __init__( 132 | self, 133 | encoder_dim: int = 512, 134 | expansion_factor: float = 4, 135 | dropout_p: float = 0.1, 136 | ) -> None: 137 | super(FeedForwardModule, self).__init__() 138 | self.sequential = nn.Sequential( 139 | nn.LayerNorm(encoder_dim), 140 | # LinearNorm(encoder_dim, encoder_dim, bias=True), 141 | LinearNorm(encoder_dim, int(encoder_dim * expansion_factor), bias=True), 142 | Swish(), 143 | nn.Dropout(p=dropout_p), 144 | # LinearNorm(int(encoder_dim * expansion_factor), encoder_dim, bias=True), 145 | # nn.Dropout(p=dropout_p), 146 | ) 147 | 148 | def forward(self, inputs: Tensor, past_key_value=None) -> Tensor: 149 | return self.sequential(inputs) 150 | 151 | class AdapterBlock(nn.Module): 152 | def __init__(self, in_dim, out_dim, kernel_size=1, stride=1, bias=False): 153 | super(AdapterBlock, self).__init__() 154 | self.layer_norm1 = nn.LayerNorm(in_dim) 155 | self.conv1 = nn.Conv1d(in_dim, out_dim, kernel_size=3, stride=stride, bias=bias,groups=out_dim, padding='same') 156 | self.relu1 = nn.ReLU(inplace=True) 157 | # self.se1 = SELayer(out_dim) 158 | self.conv2 = nn.Conv1d(out_dim, out_dim, kernel_size=5, stride=stride, bias=False, groups=out_dim, padding='same') 159 | # self.se2 = SELayer(out_dim) 160 | self.conv3 = nn.Conv1d(out_dim, in_dim, kernel_size=3, stride=stride, bias=bias,groups=out_dim, padding='same') 161 | # self.relu2 = nn.ReLU(inplace=True) 162 | self.se3 = SELayer(in_dim) 163 | # self.layer_norm2 = nn.LayerNorm(out_dim) 164 | # self.dropout = nn.Dropout(p=0.1) 165 | def forward(self, x, residual_input): 166 | out = self.layer_norm1(x) 167 | out = torch.transpose(out,-1,-2) 168 | out = self.conv1(out) 169 | out = self.relu1(out) 170 | out = self.conv2(out) 171 | out = self.conv3(out) 172 | out = self.se3(out) 173 | # out = self.dropout(out) 174 | out = torch.transpose(out,-1,-2) 175 | out = residual_input + out #skip connection 176 | return out 177 | 178 | class BottleneckAdapter(nn.Module): 179 | def __init__(self, adapter_name, input_size, down_sample): 180 | super(BottleneckAdapter, self).__init__() 181 | self.config = AdapterConfig(mh_adapter=True, output_adapter=True, reduction_factor=16, non_linearity="relu") 182 | self.bottleneck_adapter = Adapter(adapter_name, input_size=input_size, down_sample=down_sample, config=self.config) 183 | def forward(self, x, residual_input): 184 | output, down, up = self.bottleneck_adapter(x, residual_input) 185 | return output 186 | 187 | 188 | class CustomTrainer(Trainer): 189 | def compute_loss(self, model, inputs, return_outputs=False): 190 | labels = inputs.get("labels") 191 | # forward pass 192 | outputs = model(**inputs) 193 | 194 | logits = outputs.get("logits") 195 | 196 | # compute custom loss (suppose one has 3 labels with different weights) 197 | # loss_fct = nn.CrossEntropyLoss(weight=torch.tensor([1.0, 2.0, 3.0])).to(labels.device) #add weight or not? 
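        # default: unweighted cross-entropy over num_labels classes; the weighted
        # variant above can be enabled if the emotion classes are imbalanced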
198 | loss_fct = nn.CrossEntropyLoss().to(labels.device) 199 | loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1)) 200 | return (loss, outputs) if return_outputs else loss 201 | 202 | 203 | 204 | -------------------------------------------------------------------------------- /tasks/slot_filling/slot_filling.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import math 4 | 5 | from dataclasses import field, dataclass 6 | from datasets import load_dataset 7 | from typing import * 8 | from gc import callbacks 9 | from transformers import set_seed, Wav2Vec2CTCTokenizer, Wav2Vec2Processor, Wav2Vec2FeatureExtractor, Wav2Vec2Config, Wav2Vec2Tokenizer 10 | from transformers import TrainingArguments, EarlyStoppingCallback, HfArgumentParser, Trainer 11 | from transformers.integrations import TensorBoardCallback 12 | 13 | # import sys 14 | # sys.path.append("..") 15 | from path import Path 16 | import sys 17 | folder = Path(__file__).abspath() 18 | sys.path.append(folder.parent.parent.parent) 19 | 20 | import os 21 | from os.path import join 22 | import utils 23 | 24 | from dataset import SnipsDataset 25 | from data import DataCollatorCTCWithPadding 26 | from metric import compute_metric 27 | from modeling_wav2vec2 import Wav2Vec2ForCTC 28 | 29 | from torch.optim.lr_scheduler import LambdaLR 30 | 31 | @dataclass 32 | class DataTrainingArguments(TrainingArguments): 33 | dataset: Optional[str] = field( 34 | default="esd", metadata={"help": "dataset name"} 35 | ) 36 | data_dir: Optional[str] = field( 37 | default="/data/path/ESD/en/", metadata={"help": "The dir of the dataset."} 38 | ) 39 | feat_adapter_name: Optional[str] = field( 40 | default="conv_adapter", metadata={"help": "The type of adapter, should be chosen among in {conv_adapter }."} 41 | ) 42 | trans_adapter_name: Optional[str] = field( 43 | default="bottleneck", metadata={"help": "The type of adapter, should be chosen among in {conv_adapter, bottleneck, adapterblock}."} 44 | ) 45 | output_adapter: Optional[bool] = field( 46 | default=False, metadata={"help": "use adapter after FFN"} 47 | ) 48 | mh_adapter: Optional[bool] = field( 49 | default=False, metadata={"help": "use adapter after multi-head attention"} 50 | ) 51 | prefix_tuning: Optional[bool] = field( 52 | default=False, metadata={"help": "use prefix-tuning in multi-head attention, implemented by us"} 53 | ) 54 | prefix_seq_len: Optional[int] = field( 55 | default=30, metadata={"help": "prefix sequence length"} 56 | ) 57 | prefix_projection: Optional[bool] = field( 58 | default=False, 59 | metadata={ 60 | "help": "Apply a two-layer MLP head over the prefix embeddings" 61 | } 62 | ) 63 | prefix_dropout_prob: Optional[bool] = field( 64 | default=0.1, 65 | metadata={ 66 | "help": "The dropout probability used in the models" 67 | } 68 | ) 69 | feat_enc_adapter: Optional[bool] = field( 70 | default=False, metadata={"help": "use conv_adapter in feature encoder and Adapterblock in "} 71 | ) 72 | lora_adapter: Optional[bool] = field( 73 | default=False, metadata={"help": "use lora_adapter in feature encoder"} 74 | ) 75 | fine_tune: Optional[bool] = field( 76 | default=False, metadata={"help": "if fine-tune wav2vec2 or not"} 77 | ) 78 | 79 | def get_linear_schedule_with_warmup(optimizer, num_warmup_steps, num_constant_steps, num_training_steps, last_epoch=-1): 80 | def lr_lambda(current_step: int): 81 | if current_step < num_warmup_steps: 82 | return float(current_step) / float(max(1, 
num_warmup_steps)) 83 | elif current_step >= num_warmup_steps and current_step < num_constant_steps: 84 | return float(1.0) 85 | return max( 86 | 0.0, float(num_training_steps - current_step) / float(max(1, num_training_steps - num_constant_steps)) 87 | ) 88 | 89 | return LambdaLR(optimizer, lr_lambda, last_epoch) 90 | 91 | 92 | class CustomTrainer(Trainer): 93 | def __init__(self, *args, **kwargs): 94 | super().__init__(*args, **kwargs) 95 | self.constant_ratio = 0.4 96 | self.num_constant_steps = -1 97 | def create_scheduler(self, num_training_steps: int, optimizer: torch.optim.Optimizer = None): 98 | if self.lr_scheduler is None: 99 | self.lr_scheduler = get_linear_schedule_with_warmup( 100 | self.optimizer if optimizer is None else optimizer, 101 | num_warmup_steps=self.args.get_warmup_steps(num_training_steps), 102 | num_constant_steps=self.get_keep_constant_steps(num_training_steps), 103 | num_training_steps=num_training_steps) 104 | return self.lr_scheduler 105 | def get_keep_constant_steps(self, num_training_steps: int): 106 | keep_constant_steps = ( 107 | self.num_constant_steps if self.num_constant_steps > 0 else math.ceil(num_training_steps * (self.constant_ratio + self.args.warmup_ratio)) 108 | ) 109 | return keep_constant_steps 110 | 111 | def main(): 112 | set_seed(1314) 113 | 114 | # args 115 | parser = HfArgumentParser(DataTrainingArguments) 116 | args = parser.parse_args_into_dataclasses()[0] 117 | 118 | #processor 119 | tokenizer = Wav2Vec2CTCTokenizer("vocab_snips.json", unk_token="[UNK]", pad_token="[PAD]", word_delimiter_token="|") 120 | feature_extractor = Wav2Vec2FeatureExtractor(feature_size=1, sampling_rate=16000, padding_value=0.0, do_normalize=True, return_attention_mask=True) 121 | processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer) 122 | 123 | # audio dataset 124 | # snips 125 | if args.dataset.lower() == "snips": 126 | train_set = SnipsDataset(args.data_dir, processor, "train") 127 | valid_set = SnipsDataset(args.data_dir, processor, "valid") 128 | test_set = SnipsDataset(args.data_dir, processor, "test") 129 | elif args.dataset.lower() == "voxceleb": 130 | pass 131 | else: 132 | raise NotImplementedError 133 | 134 | # config 135 | config = Wav2Vec2Config.from_pretrained("jonatasgrosman/wav2vec2-large-xlsr-53-english", vocab_size=len(processor.tokenizer)) 136 | config._name_or_path = "" 137 | 138 | config.adapter_name = args.trans_adapter_name 139 | config.output_adapter = args.output_adapter 140 | config.mh_adapter = args.mh_adapter 141 | config.prefix_tuning = args.prefix_tuning 142 | config.feat_enc_adapter = args.feat_enc_adapter 143 | config.lora_adapter = args.lora_adapter 144 | config.prefix_seq_len = args.prefix_seq_len 145 | config.prefix_projection = args.prefix_projection 146 | config.prefix_dropout_prob = args.prefix_dropout_prob 147 | config.ctc_loss_reduction = "mean" 148 | config.pad_token_id = processor.tokenizer.pad_token_id 149 | 150 | 151 | # load pretrained model 152 | model = Wav2Vec2ForCTC.from_pretrained("jonatasgrosman/wav2vec2-large-xlsr-53-english", config=config, ignore_mismatched_sizes=True) 153 | model.freeze_feature_encoder() 154 | 155 | print("\n #Train: {}, #Valid: {}, #Test: {} ".format(len(train_set), len(valid_set), len(test_set))) 156 | 157 | ## freeze all params exclude promptblock and classification head 158 | print("------>>> Trainable params(before freeze):", sum(p.numel() for p in model.parameters() if p.requires_grad)) 159 | if not args.fine_tune: 160 | model.freeze_exclude_prompt() 
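    # with fine_tune=False, only the injected prompt/adapter blocks and the output head remain trainable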
161 | print("------>>> Trainable params(after freeze):", sum(p.numel() for p in model.parameters() if p.requires_grad)) 162 | 163 | # for name, param in model.named_parameters(): 164 | # if param.requires_grad: 165 | # print(name, param.requires_grad, param.size()) 166 | 167 | data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True, max_length=200000) 168 | 169 | trainer = CustomTrainer( 170 | model=model, 171 | data_collator=data_collator, 172 | args=args, 173 | compute_metrics=compute_metric, 174 | train_dataset=train_set, 175 | eval_dataset=valid_set, 176 | tokenizer=processor.tokenizer, 177 | # callbacks = [EarlyStoppingCallback(early_stopping_patience = 5)] 178 | callbacks = [TensorBoardCallback], 179 | ) 180 | 181 | save_dir = join(args.output_dir, "best_model") 182 | if args.do_train: # train and test 183 | trainer.train(resume_from_checkpoint=None) 184 | trainer.save_model(save_dir) 185 | 186 | test_metrics = trainer.predict(test_set).metrics 187 | print(test_metrics) 188 | 189 | if args.do_predict: # only for test 190 | from torch.utils.data import DataLoader 191 | device = trainer.model.device 192 | trainer.model = trainer.model.from_pretrained(save_dir).to(device) 193 | 194 | print(trainer.predict) 195 | 196 | test_metrics = trainer.predict(test_set).metrics 197 | print(test_metrics) 198 | 199 | if __name__ == "__main__": 200 | main() -------------------------------------------------------------------------------- /tasks/slot_filling/dataset.py: -------------------------------------------------------------------------------- 1 | import os 2 | import random 3 | 4 | import torch 5 | import torch.nn as nn 6 | from torch.utils.data.dataset import Dataset 7 | import torchaudio 8 | from os.path import join 9 | from pathlib import Path 10 | import json 11 | import re 12 | from os.path import exists 13 | import numpy as np 14 | 15 | from transformers import set_seed, Wav2Vec2Processor, Wav2Vec2CTCTokenizer, Wav2Vec2FeatureExtractor 16 | 17 | chars_to_ignore_regex = '[\,\?\.\!\-\;\:\"]' 18 | def remove_special_characters(txt): 19 | txt = re.sub(chars_to_ignore_regex, '', txt)#.lower() 20 | return txt 21 | 22 | def extract_all_chars(all_texts): 23 | all_text = " ".join(all_texts) 24 | vocab = list(set(all_text)) 25 | return {"vocab": [vocab], "all_text": [all_text]} 26 | 27 | class SnipsDataset(Dataset): 28 | def __init__(self, base_path, processor, mode): 29 | self.base_path = base_path 30 | self.processor = processor 31 | self.mode = mode 32 | 33 | self.datas = self.get_datas()#[:500] 34 | 35 | def get_datas(self): 36 | train_splits = [] 37 | valid_splits = [] 38 | test_splits = [] 39 | vocab_snips = [] 40 | all_text = [] 41 | with open(join(self.base_path, "all.iob.snips.txt"), "r") as f: 42 | lines = f.readlines() 43 | for line in lines: 44 | splits = line.strip().split() 45 | utterence_id = splits[0] 46 | mode = utterence_id.split("-")[2] 47 | if mode == "train": 48 | train_splits.append(line) 49 | elif mode == "valid": 50 | valid_splits.append(line) 51 | else: 52 | test_splits.append(line) 53 | 54 | u_id_text, slot_names = line.strip().split("\t") 55 | u_id = u_id_text.split(" ")[0] 56 | text = u_id_text.split(" ")[1:][1:-1] 57 | clean_text = remove_special_characters(" ".join(text)) 58 | slot = slot_names.split(" ")[1:-1] 59 | all_text.append(clean_text) 60 | if not os.path.exists("vocab.txt"): 61 | 62 | vocabs_snips = extract_all_chars(all_text) 63 | vocab_list = list(set(vocabs_snips["vocab"][0])) 64 | vocab_dict = {v: k for k, v in enumerate(vocab_list)} 65 
| vocab_dict["|"] = vocab_dict[" "] 66 | del vocab_dict[" "] 67 | vocab_dict["[UNK]"] = len(vocab_dict) 68 | vocab_dict["[PAD]"] = len(vocab_dict) 69 | 70 | with open("vocab.txt", "w") as vf: 71 | for vl in vocab_list: 72 | vf.writelines(vl) 73 | 74 | with open('vocab.json', 'w') as vocab_file: 75 | json.dump(vocab_dict, vocab_file) 76 | 77 | if not os.path.exists("vocab_snips.json"): 78 | vocabs_snips = extract_all_chars(all_text) 79 | vocab_list = list(set(vocabs_snips["vocab"][0])) 80 | vocab_dict = {v: k for k, v in enumerate(vocab_list)} 81 | 82 | slots_file = join(self.base_path, "slots.txt") 83 | org_slots = open(slots_file).read().split('\n') 84 | slots = [] 85 | for slot in org_slots[1:]: 86 | slots.append('B-'+slot) 87 | slots.append('E-'+slot) 88 | for slot in slots: 89 | vocab_dict[slot] = len(vocab_dict) 90 | 91 | vocab_dict["|"] = vocab_dict[" "] 92 | del vocab_dict[" "] 93 | vocab_dict["[UNK]"] = len(vocab_dict) 94 | vocab_dict["[PAD]"] = len(vocab_dict) 95 | 96 | with open('vocab_snips.json', 'w') as vocab_file: 97 | json.dump(vocab_dict, vocab_file) 98 | 99 | if self.mode == "train": 100 | return train_splits 101 | elif self.mode == "valid": 102 | return valid_splits 103 | else: 104 | return test_splits 105 | 106 | def __len__(self): 107 | return len(self.datas) 108 | 109 | def __getitem__(self, idx): 110 | u_id_text, slot_names = self.datas[idx].strip().split("\t") 111 | u_id = u_id_text.split(" ")[0] 112 | sent = u_id_text.split(" ")[1:][1:-1] 113 | iobs = slot_names.split(" ")[1:-1] 114 | 115 | processed_seqs = [] 116 | for i, (wrd, iob) in enumerate(zip(sent, iobs)): 117 | if wrd in "?!.,;-": 118 | continue 119 | if wrd == '&': 120 | wrd = 'AND' 121 | if iob != 'O' and (i == 0 or iobs[i-1] != iob): 122 | processed_seqs.append('B-'+iob) 123 | processed_seqs.append("|") 124 | processed_seqs.append(wrd) 125 | if iob != 'O' and (i == len(sent)-1 or iobs[i+1] != iob): 126 | processed_seqs.append("|") 127 | processed_seqs.append('E-'+iob) 128 | processed_seqs.append("|") 129 | if i == (len(sent)-1): 130 | pass 131 | else: 132 | processed_seqs.append("|") 133 | 134 | # breakpoint() 135 | text_slot = self.processor.tokenizer.encode(" ".join(processed_seqs)) 136 | 137 | wav_path = join(self.base_path, self.mode, u_id+".wav") 138 | wav, sr = torchaudio.load(wav_path) 139 | wav = wav.squeeze(0) 140 | input_value = self.processor(wav, sampling_rate=self.processor.feature_extractor.sampling_rate).input_values[0] 141 | 142 | # breakpoint() 143 | return {"input_values":input_value, 144 | "labels":text_slot} 145 | 146 | import abc 147 | class _BaseTextEncoder(abc.ABC): 148 | @abc.abstractmethod 149 | def encode(self, s): 150 | raise NotImplementedError 151 | 152 | @abc.abstractmethod 153 | def decode(self, ids, ignore_repeat=False): 154 | raise NotImplementedError 155 | 156 | @abc.abstractproperty 157 | def vocab_size(self): 158 | raise NotImplementedError 159 | 160 | @abc.abstractproperty 161 | def token_type(self): 162 | raise NotImplementedError 163 | 164 | @abc.abstractclassmethod 165 | def load_from_file(cls, vocab_file): 166 | raise NotImplementedError 167 | 168 | @property 169 | def pad_idx(self): 170 | return 0 171 | 172 | @property 173 | def eos_idx(self): 174 | return 1 175 | 176 | @property 177 | def unk_idx(self): 178 | return 2 179 | 180 | def __repr__(self): 181 | return "<{} vocab_size={}>".format(type(self).__name__, self.vocab_size) 182 | 183 | class CharacterTextSlotEncoder(_BaseTextEncoder): 184 | def __init__(self, vocab_list, slots): 185 | # Note that vocab_list 
must not contain , and 186 | # =0, =1, =2 187 | self._vocab_list = ["[PAD]", "|", "[UNK]"] + vocab_list 188 | self._vocab2idx = {v: idx for idx, v in enumerate(self._vocab_list)} 189 | self.slots = slots 190 | self.slot2id = {self.slots[i]:(i+len(self._vocab_list)) for i in range(len(self.slots))} 191 | self.id2slot = {(i+len(self._vocab_list)):self.slots[i] for i in range(len(self.slots))} 192 | 193 | 194 | def encode(self, s): 195 | # Always strip trailing space, \r and \n 196 | sent, iobs = s.strip('\r\n ').split('\t') 197 | sent = sent.split(' ')[1:-1] 198 | iobs = iobs.split(' ')[1:-1] 199 | tokens = [] 200 | for i, (wrd, iob) in enumerate(zip(sent, iobs)): 201 | if wrd in "?!.,;-": 202 | continue 203 | if wrd == '&': 204 | wrd = 'AND' 205 | if iob != 'O' and (i == 0 or iobs[i-1] != iob): 206 | tokens.append(self.slot2id['B-'+iob]) 207 | tokens += [self.vocab_to_idx(v) for v in wrd] 208 | if iob != 'O' and (i == len(sent)-1 or iobs[i+1] != iob): 209 | tokens.append(self.slot2id['E-'+iob]) 210 | if i == (len(sent)-1): 211 | tokens.append(self.eos_idx) 212 | else: 213 | tokens.append(self.vocab_to_idx(' ')) 214 | return tokens 215 | 216 | def decode(self, idxs, ignore_repeat=False): 217 | vocabs = [] 218 | for t, idx in enumerate(idxs): 219 | v = self.idx_to_vocab(idx) 220 | if idx == self.pad_idx or (ignore_repeat and t > 0 and idx == idxs[t-1]): 221 | continue 222 | elif idx == self.eos_idx: 223 | break 224 | else: 225 | vocabs.append(v) 226 | return "".join(vocabs) 227 | 228 | @classmethod 229 | def load_from_file(cls, vocab_file, slots_file): 230 | with open(vocab_file, "r") as f: 231 | # Do not strip space because character based text encoder should 232 | # have a space token 233 | vocab_list = [line.strip("\r\n") for line in f] 234 | org_slots = open(slots_file).read().split('\n') 235 | slots = [] 236 | for slot in org_slots[1:]: 237 | slots.append('B-'+slot) 238 | slots.append('E-'+slot) 239 | return cls(vocab_list, slots) 240 | 241 | @property 242 | def vocab_size(self): 243 | return len(self._vocab_list) + len(self.slots) 244 | 245 | @property 246 | def token_type(self): 247 | return 'character-slot' 248 | 249 | def vocab_to_idx(self, vocab): 250 | return self._vocab2idx.get(vocab, self.unk_idx) 251 | 252 | def idx_to_vocab(self, idx): 253 | idx = int(idx) 254 | if idx < len(self._vocab_list): 255 | return self._vocab_list[idx] 256 | else: 257 | token = self.id2slot[idx] 258 | if token[0] == 'B': 259 | return token + ' ' 260 | elif token[0] == 'E': 261 | return ' ' + token 262 | else: 263 | raise ValueError('id2slot get:', token) 264 | 265 | 266 | 267 | if __name__ == "__main__": 268 | base_path = "/data/path/Dataset/SNIPS" 269 | 270 | #processor 271 | tokenizer = Wav2Vec2CTCTokenizer("vocab_snips.json", unk_token="[UNK]", pad_token="[PAD]", word_delimiter_token="|") 272 | feature_extractor = Wav2Vec2FeatureExtractor(feature_size=1, sampling_rate=16000, padding_value=0.0, do_normalize=True, return_attention_mask=True) 273 | processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer) 274 | 275 | train_data = SnipsDataset(base_path, processor, "train") 276 | print("wav :", train_data[0]["input_values"].shape) 277 | print("text_slot :", train_data[0]["labels"]) 278 | print("decode text_slot:", tokenizer.decode(train_data[0]["labels"])) 279 | # breakpoint() 280 | 281 | 282 | 283 | 284 | -------------------------------------------------------------------------------- /tasks/phoneme_recognition/phoneme_recognition.py: 
-------------------------------------------------------------------------------- 1 | from datasets import load_dataset, load_metric 2 | 3 | import re 4 | import json 5 | import torch 6 | 7 | from path import Path 8 | import sys 9 | from os.path import join 10 | folder = Path(__file__).abspath() 11 | sys.path.append(folder.parent.parent.parent) 12 | 13 | from data import LibriPhoneDataset, DataCollatorCTCWithPadding 14 | from text import load_text_encoder 15 | from transformers import Trainer 16 | 17 | from transformers import (set_seed, Wav2Vec2CTCTokenizer, 18 | Wav2Vec2FeatureExtractor, Wav2Vec2Processor, Wav2Vec2Config, 19 | TrainingArguments, EarlyStoppingCallback, HfArgumentParser) 20 | 21 | from modeling_wav2vec2 import Wav2Vec2ForCTC 22 | 23 | from torch.utils.data import DataLoader, DistributedSampler 24 | from functools import partial 25 | import torchaudio 26 | from torch.nn.utils.rnn import pad_sequence 27 | import numpy as np 28 | 29 | import datasets 30 | from datasets import Dataset 31 | from torch.distributed import is_initialized 32 | from typing import Optional 33 | from dataclasses import field, dataclass 34 | 35 | from transformers.integrations import TensorBoardCallback 36 | 37 | @dataclass 38 | class DataTrainingArguments(TrainingArguments): 39 | dataset: Optional[str] = field( 40 | default="esd", metadata={"help": "dataset name"} 41 | ) 42 | data_dir: Optional[str] = field( 43 | default="/data/path/ESD/en/", metadata={"help": "The dir of the dataset."} 44 | ) 45 | feat_adapter_name: Optional[str] = field( 46 | default="conv_adapter", metadata={"help": "The type of adapter, should be chosen among in {conv_adapter }."} 47 | ) 48 | trans_adapter_name: Optional[str] = field( 49 | default="bottleneck", metadata={"help": "The type of adapter, should be chosen among in {conv_adapter, bottleneck, adapterblock}."} 50 | ) 51 | output_adapter: Optional[bool] = field( 52 | default=False, metadata={"help": "use adapter after FFN"} 53 | ) 54 | mh_adapter: Optional[bool] = field( 55 | default=False, metadata={"help": "use adapter after multi-head attention"} 56 | ) 57 | prefix_tuning: Optional[bool] = field( 58 | default=False, metadata={"help": "use prefix-tuning in multi-head attention, implemented by us"} 59 | ) 60 | prefix_seq_len: Optional[int] = field( 61 | default=30, metadata={"help": "prefix sequence length"} 62 | ) 63 | prefix_projection: Optional[bool] = field( 64 | default=False, 65 | metadata={ 66 | "help": "Apply a two-layer MLP head over the prefix embeddings" 67 | } 68 | ) 69 | prefix_dropout_prob: Optional[bool] = field( 70 | default=0.1, 71 | metadata={ 72 | "help": "The dropout probability used in the models" 73 | } 74 | ) 75 | feat_enc_adapter: Optional[bool] = field( 76 | default=False, metadata={"help": "use conv_adapter in feature encoder and Adapterblock in "} 77 | ) 78 | lora_adapter: Optional[bool] = field( 79 | default=False, metadata={"help": "use lora_adapter in feature encoder"} 80 | ) 81 | fine_tune: Optional[bool] = field( 82 | default=False, metadata={"help": "if fine-tune wav2vec2 or not"} 83 | ) 84 | 85 | def seed_worker(_): 86 | """ 87 | Helper function to set worker seed during Dataloader initialization. 88 | """ 89 | worker_seed = torch.initial_seed() % 2**32 90 | set_seed(worker_seed) 91 | 92 | class CustomTrainer(Trainer): 93 | def get_train_dataloader(self) -> DataLoader: 94 | """ 95 | Returns the training [`~torch.utils.data.DataLoader`]. 
96 |         Will use no sampler if `train_dataset` does not implement `__len__`, a random sampler (adapted to distributed
97 |         training if necessary) otherwise.
98 |         Subclass and override this method if you want to inject some custom behavior.
99 |         """
100 |         if self.train_dataset is None:
101 |             raise ValueError("Trainer: training requires a train_dataset.")
102 |         train_dataset = self.train_dataset
103 | 
104 |         train_sampler = DistributedSampler(train_dataset) if is_initialized() else None
105 | 
106 |         collate_fn = partial(self.collect_audio_batch, split="train")
107 | 
108 |         return DataLoader(
109 |             train_dataset,
110 |             batch_size=self._train_batch_size,
111 |             sampler=train_sampler,
112 |             # collate_fn=data_collator,
113 |             collate_fn=collate_fn,
114 |             drop_last=self.args.dataloader_drop_last,
115 |             num_workers=self.args.dataloader_num_workers,
116 |             pin_memory=self.args.dataloader_pin_memory,
117 |             worker_init_fn=seed_worker,
118 |         )
119 | 
120 |     def get_eval_dataloader(self, eval_dataset: Optional[Dataset] = None) -> DataLoader:
121 |         """
122 |         Returns the evaluation [`~torch.utils.data.DataLoader`].
123 |         Subclass and override this method if you want to inject some custom behavior.
124 |         Args:
125 |             eval_dataset (`torch.utils.data.Dataset`, *optional*):
126 |                 If provided, will override `self.eval_dataset`. If it is a [`~datasets.Dataset`], columns not accepted
127 |                 by the `model.forward()` method are automatically removed. It must implement `__len__`.
128 |         """
129 |         if eval_dataset is None and self.eval_dataset is None:
130 |             raise ValueError("Trainer: evaluation requires an eval_dataset.")
131 |         eval_dataset = eval_dataset if eval_dataset is not None else self.eval_dataset
132 | 
133 |         collate_fn = partial(self.collect_audio_batch, split="dev")
134 | 
135 |         return DataLoader(
136 |             eval_dataset,
137 |             # sampler=eval_sampler,
138 |             shuffle=False,
139 |             batch_size=self.args.eval_batch_size,
140 |             collate_fn=collate_fn,
141 |             drop_last=self.args.dataloader_drop_last,
142 |             num_workers=self.args.dataloader_num_workers,
143 |             pin_memory=self.args.dataloader_pin_memory,
144 |         )
145 | 
146 |     def get_test_dataloader(self, test_dataset: Dataset) -> DataLoader:
147 |         """
148 |         Returns the test [`~torch.utils.data.DataLoader`].
149 |         Subclass and override this method if you want to inject some custom behavior.
150 |         Args:
151 |             test_dataset (`torch.utils.data.Dataset`, *optional*):
152 |                 The test dataset to use. If it is a [`~datasets.Dataset`], columns not accepted by the
153 |                 `model.forward()` method are automatically removed. It must implement `__len__`.
154 |         """
155 |         data_collator = self.data_collator
156 | 
157 |         collate_fn = partial(self.collect_audio_batch, split="test")
158 | 
159 |         # We use the same batch_size as for eval.
160 |         return DataLoader(
161 |             test_dataset,
162 |             batch_size=self.args.eval_batch_size,
163 |             collate_fn=collate_fn,
164 |             drop_last=self.args.dataloader_drop_last,
165 |             num_workers=self.args.dataloader_num_workers,
166 |             pin_memory=self.args.dataloader_pin_memory,
167 |         )
168 | 
169 |     def collect_audio_batch(self, batch, split, half_batch_size_wav_len=300000):
170 |         '''Collects a batch, should be list of tuples (audio_path , list of int token )
171 |         e.g. [(file1,txt1),(file2,txt2),...]
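        For split == 'train', a batch whose first waveform is longer than half_batch_size_wav_len
        samples is cut to half its size to limit memory usage.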
172 | ''' 173 | def audio_reader(filepath): 174 | wav, sample_rate = torchaudio.load(filepath) 175 | return wav.reshape(-1) 176 | 177 | # Bucketed batch should be [[(file1,txt1),(file2,txt2),...]] 178 | if type(batch[0]) is not tuple: 179 | batch = batch[0] 180 | 181 | # Make sure that batch size is reasonable 182 | first_len = audio_reader(str(batch[0][0])).size(0) 183 | if split == 'train': 184 | if first_len > half_batch_size_wav_len and len(batch) > 1: 185 | batch = batch[:len(batch)//2] 186 | 187 | # Read batch 188 | file, audio_feat, audio_len, text = [], [], [], [] 189 | with torch.no_grad(): 190 | for b in batch: 191 | file.append(str(b[0]).split('/')[-1].split('.')[0]) 192 | feat = audio_reader(str(b[0])).numpy() 193 | audio_feat.append(feat) 194 | audio_len.append(len(feat)) 195 | text.append(torch.LongTensor(b[1]).numpy()) 196 | 197 | # Descending audio length within each batch 198 | audio_len, file, audio_feat, text = zip(*[(feat_len, f_name, feat, txt) 199 | for feat_len, f_name, feat, txt in sorted(zip(audio_len, file, audio_feat, text), reverse=True, key=lambda x:x[0])]) 200 | 201 | # return audio_feat, text, file 202 | 203 | labels = [torch.FloatTensor(label) for label in text] 204 | labels = pad_sequence(labels, padding_value=-100).transpose(0,1) 205 | 206 | wavs = [torch.FloatTensor(wav) for wav in audio_feat] 207 | wavs = pad_sequence(wavs).transpose(0,1) 208 | 209 | return {"input_values":wavs, 210 | "labels":labels} 211 | 212 | 213 | # tokenizer 214 | tokenizer = load_text_encoder(mode="word", vocab_file="phoneme.txt") 215 | 216 | wer_metric = load_metric("wer") 217 | 218 | def compute_metrics(pred): 219 | 220 | pred_logits = pred.predictions 221 | pred_ids = np.argmax(pred_logits, axis=-1) 222 | 223 | pred.label_ids[pred.label_ids == -100] = tokenizer._vocab2idx[""] 224 | 225 | pred.label_ids = pred.label_ids.astype(int) 226 | 227 | pred_str = [[tokenizer.decode(seq)] for seq in pred_ids] 228 | label_str = [[tokenizer.decode(seq.tolist())] for seq in pred.label_ids] 229 | 230 | per = wer_metric.compute(predictions=pred_str, references=label_str) 231 | 232 | return {"per": per} 233 | 234 | def main(): 235 | set_seed(1314) 236 | # args 237 | parser = HfArgumentParser(DataTrainingArguments) 238 | args = parser.parse_args_into_dataclasses()[0] 239 | 240 | 241 | 242 | # audio dataset 243 | if args.dataset.lower() == "timit": 244 | timit = load_dataset("timit_asr", cache_dir="/data/path/Dataset/timit_asr/") 245 | elif args.dataset.lower() == "librispeech": 246 | train_path = "/data/path/hf_datasets/downloads/extracted/baf2e051c7d5c26b3b25db6157338d0eca8b961c9f49f25f65e10b0d583678e1/LibriSpeech" 247 | dev_path = "/data/path/hf_datasets/downloads/extracted/d89a8a1d668652cbb712b0970ff79b3e200655cf354aa6e8b87660ee441a7edf/LibriSpeech" 248 | test_path = "/data/path/hf_datasets/downloads/extracted/f6e39073841bee74aaa6f25d34420963669676bf57915cf6ad2403a7a833df68/LibriSpeech" 249 | word2phonemes_path = "/home/path/PromptSpeech/tasks/phoneme_recognition/word2phonemes.json" 250 | 251 | kwargs = {'num_workers': 24, 'train': ['train-clean-100'], 'dev': ['dev-clean'], 'test': ['test-clean']} 252 | 253 | dev_dataset = LibriPhoneDataset(kwargs['dev'], tokenizer, 1, dev_path, word2phonemes_path, **kwargs) 254 | test_dataset = LibriPhoneDataset(kwargs['test'], tokenizer, 1, test_path, word2phonemes_path, **kwargs) 255 | kwargs["ratio"] = 1.0 256 | kwargs["offset"] = 0 257 | train_dataset = LibriPhoneDataset(kwargs['train'], tokenizer, 1, train_path, word2phonemes_path, **kwargs) 258 | else: 
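        # only "timit" and "librispeech" are currently wired up for phoneme recognition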
259 | raise NotImplementedError 260 | 261 | config = Wav2Vec2Config.from_pretrained("jonatasgrosman/wav2vec2-large-xlsr-53-english", vocab_size=tokenizer.vocab_size) 262 | 263 | config.adapter_name = args.trans_adapter_name 264 | config.output_adapter = args.output_adapter 265 | config.mh_adapter = args.mh_adapter 266 | config.prefix_tuning = args.prefix_tuning 267 | config.feat_enc_adapter = args.feat_enc_adapter 268 | config.lora_adapter = args.lora_adapter 269 | config.prefix_seq_len = args.prefix_seq_len 270 | config.prefix_projection = args.prefix_projection 271 | config.prefix_dropout_prob = args.prefix_dropout_prob 272 | config.ctc_loss_reduction = "mean" 273 | config.pad_token_id = tokenizer._vocab2idx[""] 274 | 275 | model = Wav2Vec2ForCTC.from_pretrained("jonatasgrosman/wav2vec2-large-xlsr-53-english", config=config, ignore_mismatched_sizes=True) 276 | 277 | model.freeze_feature_encoder() 278 | 279 | print("------>>> Trainable params(before freeze):", sum(p.numel() for p in model.parameters() if p.requires_grad)) 280 | if not args.fine_tune: 281 | model.freeze_exclude_prompt() 282 | print("------>>> Trainable params(after freeze):", sum(p.numel() for p in model.parameters() if p.requires_grad)) 283 | 284 | trainer = CustomTrainer( 285 | model=model, 286 | args=args, 287 | compute_metrics=compute_metrics, 288 | train_dataset=train_dataset, 289 | eval_dataset=dev_dataset, 290 | tokenizer=tokenizer, 291 | # callbacks = [TensorBoardCallback], 292 | callbacks = [EarlyStoppingCallback(early_stopping_patience = 5)] 293 | ) 294 | 295 | save_dir = join(args.output_dir, "best_model") 296 | if args.do_train: # train and test 297 | trainer.train(resume_from_checkpoint=None) #join(args.output_dir, "checkpoint-4400") 298 | trainer.save_model(save_dir) 299 | 300 | test_metrics = trainer.predict(test_dataset).metrics 301 | print(test_metrics) 302 | 303 | if args.do_predict: # only for test 304 | device = trainer.model.device 305 | trainer.model = trainer.model.from_pretrained(save_dir).to(device) 306 | 307 | test_metrics = trainer.predict(test_dataset).metrics 308 | print(test_metrics) 309 | 310 | if __name__ == "__main__": 311 | main() 312 | 313 | 314 | 315 | -------------------------------------------------------------------------------- /tasks/phoneme_recognition/text.py: -------------------------------------------------------------------------------- 1 | """Modified from tensorflow_datasets.features.text.* 2 | 3 | Reference: https://www.tensorflow.org/datasets/api_docs/python/tfds/features/text_lib 4 | """ 5 | import abc 6 | import os 7 | 8 | BERT_FIRST_IDX = 997 # Replacing the 2 tokens right before english starts as & 9 | BERT_LAST_IDX = 29635 # Drop rest of tokens 10 | 11 | 12 | class _BaseTextEncoder(abc.ABC): 13 | @abc.abstractmethod 14 | def encode(self, s): 15 | raise NotImplementedError 16 | 17 | @abc.abstractmethod 18 | def decode(self, ids, ignore_repeat=False): 19 | raise NotImplementedError 20 | 21 | @abc.abstractproperty 22 | def vocab_size(self): 23 | raise NotImplementedError 24 | 25 | @abc.abstractproperty 26 | def token_type(self): 27 | raise NotImplementedError 28 | 29 | @abc.abstractclassmethod 30 | def load_from_file(cls, vocab_file): 31 | raise NotImplementedError 32 | 33 | @property 34 | def pad_idx(self): 35 | return 0 36 | 37 | @property 38 | def eos_idx(self): 39 | return 1 40 | 41 | @property 42 | def unk_idx(self): 43 | return 2 44 | 45 | def __repr__(self): 46 | return "<{} vocab_size={}>".format(type(self).__name__, self.vocab_size) 47 | 48 | def 
save_pretrained(self, save_directory, **kwargs): 49 | os.makedirs(save_directory, exist_ok=True) 50 | 51 | # for attribute_name in self.attributes: 52 | # attribute = getattr(self, attribute_name) 53 | # # Include the processor class in the attribute config so this processor can then be reloaded with the 54 | # # `AutoProcessor` API. 55 | # if hasattr(attribute, "_set_processor_class"): 56 | # attribute._set_processor_class(self.__class__.__name__) 57 | # attribute.save_pretrained(save_directory) 58 | pass 59 | 60 | @classmethod 61 | def from_pretrained(cls, pretrained_model_name_or_path, **kwargs): 62 | # args = cls._get_arguments_from_pretrained(pretrained_model_name_or_path, **kwargs) 63 | # return cls(*args) 64 | pass 65 | 66 | 67 | class CharacterTextEncoder(_BaseTextEncoder): 68 | def __init__(self, vocab_list): 69 | # Note that vocab_list must not contain , and 70 | # =0, =1, =2 71 | self._vocab_list = ["", "", ""] + vocab_list 72 | self._vocab2idx = {v: idx for idx, v in enumerate(self._vocab_list)} 73 | 74 | def encode(self, s): 75 | # Always strip trailing space, \r and \n 76 | s = s.strip("\r\n ") 77 | # Manually append eos to the end 78 | return [self.vocab_to_idx(v) for v in s] + [self.eos_idx] 79 | 80 | def decode(self, idxs, ignore_repeat=False): 81 | vocabs = [] 82 | for t, idx in enumerate(idxs): 83 | v = self.idx_to_vocab(idx) 84 | if idx == self.pad_idx or (ignore_repeat and t > 0 and idx == idxs[t-1]): 85 | continue 86 | elif idx == self.eos_idx: 87 | break 88 | else: 89 | vocabs.append(v) 90 | return "".join(vocabs) 91 | 92 | @classmethod 93 | def load_from_file(cls, vocab_file): 94 | with open(vocab_file, "r") as f: 95 | # Do not strip space because character based text encoder should 96 | # have a space token 97 | vocab_list = [line.strip("\r\n") for line in f] 98 | return cls(vocab_list) 99 | 100 | @property 101 | def vocab_size(self): 102 | return len(self._vocab_list) 103 | 104 | @property 105 | def token_type(self): 106 | return 'character' 107 | 108 | def vocab_to_idx(self, vocab): 109 | return self._vocab2idx.get(vocab, self.unk_idx) 110 | 111 | def idx_to_vocab(self, idx): 112 | return self._vocab_list[idx] 113 | 114 | class CharacterTextSlotEncoder(_BaseTextEncoder): 115 | def __init__(self, vocab_list, slots): 116 | # Note that vocab_list must not contain , and 117 | # =0, =1, =2 118 | self._vocab_list = ["", "", ""] + vocab_list 119 | self._vocab2idx = {v: idx for idx, v in enumerate(self._vocab_list)} 120 | self.slots = slots 121 | self.slot2id = {self.slots[i]:(i+len(self._vocab_list)) for i in range(len(self.slots))} 122 | self.id2slot = {(i+len(self._vocab_list)):self.slots[i] for i in range(len(self.slots))} 123 | 124 | 125 | def encode(self, s): 126 | # Always strip trailing space, \r and \n 127 | sent, iobs = s.strip('\r\n ').split('\t') 128 | sent = sent.split(' ')[1:-1] 129 | iobs = iobs.split(' ')[1:-1] 130 | tokens = [] 131 | for i, (wrd, iob) in enumerate(zip(sent, iobs)): 132 | if wrd in "?!.,;-": 133 | continue 134 | if wrd == '&': 135 | wrd = 'AND' 136 | if iob != 'O' and (i == 0 or iobs[i-1] != iob): 137 | tokens.append(self.slot2id['B-'+iob]) 138 | tokens += [self.vocab_to_idx(v) for v in wrd] 139 | if iob != 'O' and (i == len(sent)-1 or iobs[i+1] != iob): 140 | tokens.append(self.slot2id['E-'+iob]) 141 | if i == (len(sent)-1): 142 | tokens.append(self.eos_idx) 143 | else: 144 | tokens.append(self.vocab_to_idx(' ')) 145 | return tokens 146 | 147 | def decode(self, idxs, ignore_repeat=False): 148 | vocabs = [] 149 | for t, idx in 
enumerate(idxs): 150 | v = self.idx_to_vocab(idx) 151 | if idx == self.pad_idx or (ignore_repeat and t > 0 and idx == idxs[t-1]): 152 | continue 153 | elif idx == self.eos_idx: 154 | break 155 | else: 156 | vocabs.append(v) 157 | return "".join(vocabs) 158 | 159 | @classmethod 160 | def load_from_file(cls, vocab_file, slots_file): 161 | with open(vocab_file, "r") as f: 162 | # Do not strip space because character based text encoder should 163 | # have a space token 164 | vocab_list = [line.strip("\r\n") for line in f] 165 | org_slots = open(slots_file).read().split('\n') 166 | slots = [] 167 | for slot in org_slots[1:]: 168 | slots.append('B-'+slot) 169 | slots.append('E-'+slot) 170 | return cls(vocab_list, slots) 171 | 172 | @property 173 | def vocab_size(self): 174 | return len(self._vocab_list) + len(self.slots) 175 | 176 | @property 177 | def token_type(self): 178 | return 'character-slot' 179 | 180 | def vocab_to_idx(self, vocab): 181 | return self._vocab2idx.get(vocab, self.unk_idx) 182 | 183 | def idx_to_vocab(self, idx): 184 | idx = int(idx) 185 | if idx < len(self._vocab_list): 186 | return self._vocab_list[idx] 187 | else: 188 | token = self.id2slot[idx] 189 | if token[0] == 'B': 190 | return token + ' ' 191 | elif token[0] == 'E': 192 | return ' ' + token 193 | else: 194 | raise ValueError('id2slot get:', token) 195 | 196 | 197 | 198 | class SubwordTextEncoder(_BaseTextEncoder): 199 | def __init__(self, spm): 200 | if spm.pad_id() != 0 or spm.eos_id() != 1 or spm.unk_id() != 2: 201 | raise ValueError( 202 | "Please train sentencepiece model with following argument:\n" 203 | "--pad_id=0 --eos_id=1 --unk_id=2 --bos_id=-1 --model_type=bpe --eos_piece=") 204 | self.spm = spm 205 | 206 | def encode(self, s): 207 | return self.spm.encode_as_ids(s) 208 | 209 | def decode(self, idxs, ignore_repeat=False): 210 | crop_idx = [] 211 | for t, idx in enumerate(idxs): 212 | if idx == self.eos_idx: 213 | break 214 | elif idx == self.pad_idx or (ignore_repeat and t > 0 and idx == idxs[t-1]): 215 | continue 216 | else: 217 | crop_idx.append(idx) 218 | return self.spm.decode_ids(crop_idx) 219 | 220 | @classmethod 221 | def load_from_file(cls, filepath): 222 | import sentencepiece as splib 223 | spm = splib.SentencePieceProcessor() 224 | spm.load(filepath) 225 | spm.set_encode_extra_options(":eos") 226 | return cls(spm) 227 | 228 | @property 229 | def vocab_size(self): 230 | return len(self.spm) 231 | 232 | @property 233 | def token_type(self): 234 | return 'subword' 235 | 236 | 237 | class SubwordTextSlotEncoder(_BaseTextEncoder): 238 | def __init__(self, spm, slots): 239 | if spm.pad_id() != 0 or spm.eos_id() != 1 or spm.unk_id() != 2: 240 | raise ValueError( 241 | "Please train sentencepiece model with following argument:\n" 242 | "--pad_id=0 --eos_id=1 --unk_id=2 --bos_id=-1 --model_type=bpe --eos_piece=") 243 | self.spm = spm 244 | self.slots = slots 245 | self.slot2id = {self.slots[i]:(i+len(self.spm)) for i in range(len(self.slots))} 246 | self.id2slot = {(i+len(self.spm)):self.slots[i] for i in range(len(self.slots))} 247 | 248 | def encode(self, s): 249 | sent, iobs = s.strip().split('\t') 250 | sent = sent.split(' ')[1:-1] 251 | iobs = iobs.split(' ')[1:-1] 252 | tokens = [] 253 | for i, (wrd, iob) in enumerate(zip(sent, iobs)): 254 | if wrd in "?!.,;-": 255 | continue 256 | if wrd == '&': 257 | wrd = 'AND' 258 | if iob != 'O' and (i == 0 or iobs[i-1] != iob): 259 | tokens.append(self.slot2id['B-'+iob]) 260 | tokens += self.spm.encode_as_ids(wrd)[:-1] #if i != len(sent)-1 else 
self.spm.encode_as_ids(wrd) 261 | if iob != 'O' and (i == len(sent)-1 or iobs[i+1] != iob): 262 | tokens.append(self.slot2id['E-'+iob]) 263 | if tokens[-1] != 1: 264 | tokens.append(1) 265 | return tokens #self.spm.encode_as_ids(s) 266 | 267 | def decode(self, idxs, ignore_repeat=False): 268 | crop_idx = [] 269 | for t, idx in enumerate(idxs): 270 | if idx == self.eos_idx: 271 | break 272 | elif idx == self.pad_idx or (ignore_repeat and t > 0 and idx == idxs[t-1]): 273 | continue 274 | else: 275 | crop_idx.append(idx) 276 | sent = [] 277 | ret = [] 278 | for i, x in enumerate(crop_idx): 279 | if x >= len(self.spm): 280 | ret.append(self.spm.decode_ids(sent) + [self.id2slot[x]]) 281 | else: 282 | sent.append(x) 283 | return ret 284 | 285 | @classmethod 286 | def load_from_file(cls, filepath, slots_file): 287 | import sentencepiece as splib 288 | spm = splib.SentencePieceProcessor() 289 | spm.load(filepath) 290 | spm.set_encode_extra_options(":eos") 291 | org_slots = open(slots_file).read().split('\n') 292 | slots = [] 293 | for slot in org_slots[1:]: 294 | slots.append('B-'+slot) 295 | slots.append('E-'+slot) 296 | return cls(spm, slots) 297 | 298 | @property 299 | def vocab_size(self): 300 | return len(self.spm) + len(self.slots) 301 | 302 | @property 303 | def token_type(self): 304 | return 'subword-slot' 305 | 306 | 307 | 308 | class WordTextEncoder(CharacterTextEncoder): 309 | def encode(self, s): 310 | # Always strip trailing space, \r and \n 311 | s = s.strip("\r\n ") 312 | # Space as the delimiter between words 313 | words = s.split(" ") 314 | # Manually append eos to the end 315 | return [self.vocab_to_idx(v) for v in words] + [self.eos_idx] 316 | 317 | def decode(self, idxs, ignore_repeat=False): 318 | vocabs = [] 319 | for t, idx in enumerate(idxs): 320 | v = self.idx_to_vocab(idx) 321 | if idx == self.eos_idx: 322 | break 323 | elif idx == self.pad_idx or (ignore_repeat and t > 0 and idx == idxs[t-1]): 324 | continue 325 | else: 326 | vocabs.append(v) 327 | return " ".join(vocabs) 328 | 329 | @property 330 | def token_type(self): 331 | return 'word' 332 | 333 | 334 | class BertTextEncoder(_BaseTextEncoder): 335 | """Bert Tokenizer. 
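    Token ids are shifted down by BERT_FIRST_IDX on encode (and shifted back on decode) so that only
    the English span of the BERT vocabulary is kept.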
336 | 337 | https://github.com/huggingface/pytorch-transformers/blob/master/pytorch_transformers/tokenization_bert.py 338 | """ 339 | 340 | def __init__(self, tokenizer): 341 | self._tokenizer = tokenizer 342 | self._tokenizer.pad_token = "" 343 | self._tokenizer.eos_token = "" 344 | self._tokenizer.unk_token = "" 345 | 346 | def encode(self, s): 347 | # Reduce vocab size manually 348 | reduced_idx = [] 349 | for idx in self._tokenizer.encode(s): 350 | try: 351 | r_idx = idx-BERT_FIRST_IDX 352 | assert r_idx > 0 353 | reduced_idx.append(r_idx) 354 | except: 355 | reduced_idx.append(self.unk_idx) 356 | reduced_idx.append(self.eos_idx) 357 | return reduced_idx 358 | 359 | def decode(self, idxs, ignore_repeat=False): 360 | crop_idx = [] 361 | for t, idx in enumerate(idxs): 362 | if idx == self.eos_idx: 363 | break 364 | elif idx == self.pad_idx or (ignore_repeat and t > 0 and idx == idxs[t-1]): 365 | continue 366 | else: 367 | # Shift to correct idx for bert tokenizer 368 | crop_idx.append(idx+BERT_FIRST_IDX) 369 | return self._tokenizer.decode(crop_idx) 370 | 371 | @property 372 | def vocab_size(self): 373 | return BERT_LAST_IDX-BERT_FIRST_IDX+1 374 | 375 | @property 376 | def token_type(self): 377 | return "bert" 378 | 379 | @classmethod 380 | def load_from_file(cls, vocab_file): 381 | from pytorch_transformers import BertTokenizer 382 | return cls(BertTokenizer.from_pretrained(vocab_file)) 383 | 384 | @property 385 | def pad_idx(self): 386 | return 0 387 | 388 | @property 389 | def eos_idx(self): 390 | return 1 391 | 392 | @property 393 | def unk_idx(self): 394 | return 2 395 | 396 | 397 | def load_text_encoder(mode, vocab_file, slots_file=None): 398 | if mode == "character": 399 | return CharacterTextEncoder.load_from_file(vocab_file) 400 | elif mode == "character-slot": 401 | return CharacterTextSlotEncoder.load_from_file(vocab_file, slots_file) 402 | elif mode == "subword": 403 | return SubwordTextEncoder.load_from_file(vocab_file) 404 | elif mode == "subword-slot": 405 | return SubwordTextSlotEncoder.load_from_file(vocab_file, slots_file) 406 | elif mode == "word": 407 | return WordTextEncoder.load_from_file(vocab_file) 408 | elif mode.startswith("bert-"): 409 | return BertTextEncoder.load_from_file(mode) 410 | else: 411 | raise NotImplementedError("`{}` is not yet supported.".format(mode)) 412 | -------------------------------------------------------------------------------- /tasks/tts/transformer.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import numpy as np 4 | from torch.nn import functional as F 5 | 6 | from text.symbols import symbols 7 | 8 | from .constants import PAD 9 | from .blocks import ( 10 | get_sinusoid_encoding_table, 11 | LinearNorm, 12 | ) 13 | 14 | import loralib as lora 15 | from .adapters import PrefixEncoder, AdapterBlock, BottleneckAdapter 16 | 17 | class TextEncoder(nn.Module): 18 | """ Text Encoder """ 19 | 20 | def __init__(self, config): 21 | super(TextEncoder, self).__init__() 22 | 23 | n_position = config["max_seq_len"] + 1 24 | n_src_vocab = len(symbols) + 1 25 | d_word_vec = config["transformer"]["encoder_hidden"] 26 | n_layers = config["transformer"]["encoder_layer"] 27 | n_head = config["transformer"]["encoder_head"] 28 | d_k = d_v = ( 29 | config["transformer"]["encoder_hidden"] 30 | // config["transformer"]["encoder_head"] 31 | ) 32 | d_model = config["transformer"]["encoder_hidden"] 33 | d_inner = config["transformer"]["conv_filter_size"] 34 | kernel_size = 
config["transformer"]["conv_kernel_size"] 35 | dropout = config["transformer"]["encoder_dropout"] 36 | 37 | self.max_seq_len = config["max_seq_len"] 38 | self.d_model = d_model 39 | 40 | self.src_word_emb = nn.Embedding( 41 | n_src_vocab, d_word_vec, padding_idx=PAD 42 | ) 43 | self.position_enc = nn.Parameter( 44 | get_sinusoid_encoding_table(n_position, d_word_vec).unsqueeze(0), 45 | requires_grad=False, 46 | ) 47 | 48 | self.layer_stack = nn.ModuleList( 49 | [ 50 | FFTBlock( 51 | config, d_model, n_head, d_k, d_v, d_inner, kernel_size, dropout=dropout, is_decoder=False 52 | ) 53 | for _ in range(n_layers) 54 | ] 55 | ) 56 | 57 | self.config = config 58 | if config['adapter']['prefix_tuning']: 59 | self.num_heads = n_head 60 | self.prefix_seq_len = self.config["adapter"]["prefix_seq_len"] 61 | self.hidden_size = d_word_vec 62 | self.n_embd = self.hidden_size // self.num_heads 63 | self.prefix_tokens = torch.arange(self.prefix_seq_len).long() 64 | self.prefix_dropout = torch.nn.Dropout(config["adapter"]["prefix_dropout_prob"]) 65 | self.num_layers = n_layers 66 | 67 | self.prefix_encoder = PrefixEncoder(config, num_hidden_layers=self.num_layers, hidden_size=self.hidden_size) 68 | 69 | def get_prefix_tuning(self, batch_size): 70 | 71 | prefix_tokens = self.prefix_tokens.unsqueeze(0).expand(batch_size, -1)#.to(self.layer_norm.device) 72 | past_key_values = self.prefix_encoder(prefix_tokens) 73 | # bsz, seqlen, _ = past_key_values.shape 74 | past_key_values = past_key_values.view( 75 | batch_size, 76 | self.prefix_seq_len, 77 | self.num_layers * 2, 78 | self.num_heads, 79 | self.n_embd 80 | ) 81 | past_key_values = self.prefix_dropout(past_key_values) 82 | past_key_values = past_key_values.permute([2, 0, 3, 1, 4]).split(2) 83 | return past_key_values 84 | 85 | def forward(self, src_seq, mask, return_attns=False): #(16, 231, 256) 86 | 87 | enc_slf_attn_list = [] 88 | batch_size, max_len = src_seq.shape[0], src_seq.shape[1] 89 | 90 | if self.config['adapter']['prefix_tuning']: 91 | past_key_values=self.get_prefix_tuning(batch_size=batch_size) 92 | else: 93 | past_key_values=None 94 | 95 | # -- Prepare masks 96 | slf_attn_mask = mask.unsqueeze(1).expand(-1, max_len, -1) 97 | 98 | # -- Forward 99 | src_word_emb = self.src_word_emb(src_seq) 100 | if not self.training and src_seq.shape[1] > self.max_seq_len: 101 | enc_output = src_word_emb + get_sinusoid_encoding_table( 102 | src_seq.shape[1], self.d_model 103 | )[: src_seq.shape[1], :].unsqueeze(0).expand(batch_size, -1, -1).to( 104 | src_seq.device 105 | ) 106 | else: 107 | enc_output = src_word_emb + self.position_enc[ 108 | :, :max_len, : 109 | ].expand(batch_size, -1, -1) 110 | 111 | for i, enc_layer in enumerate(self.layer_stack): 112 | 113 | past_key_value = past_key_values[i] if past_key_values is not None else None 114 | 115 | enc_output, enc_slf_attn = enc_layer( 116 | enc_output, mask=mask, slf_attn_mask=slf_attn_mask, past_key_value=past_key_value 117 | ) 118 | if return_attns: 119 | enc_slf_attn_list += [enc_slf_attn] 120 | 121 | return enc_output, src_word_emb 122 | 123 | 124 | class Decoder(nn.Module): 125 | """ Decoder """ 126 | 127 | def __init__(self, config): 128 | super(Decoder, self).__init__() 129 | 130 | n_position = config["max_seq_len"] + 1 131 | d_word_vec = config["transformer"]["decoder_hidden"] 132 | n_layers = config["transformer"]["decoder_layer"] 133 | n_head = config["transformer"]["decoder_head"] 134 | d_k = d_v = ( 135 | config["transformer"]["decoder_hidden"] 136 | // config["transformer"]["decoder_head"] 137 | 
) 138 | d_model = config["transformer"]["decoder_hidden"] 139 | d_inner = config["transformer"]["conv_filter_size"] 140 | kernel_size = config["transformer"]["conv_kernel_size"] 141 | dropout = config["transformer"]["decoder_dropout"] 142 | 143 | self.max_seq_len = config["max_seq_len"] 144 | self.d_model = d_model 145 | 146 | self.position_enc = nn.Parameter( 147 | get_sinusoid_encoding_table(n_position, d_word_vec).unsqueeze(0), 148 | requires_grad=False, 149 | ) 150 | 151 | self.layer_stack = nn.ModuleList( 152 | [ 153 | FFTBlock( 154 | config, d_model, n_head, d_k, d_v, d_inner, kernel_size, dropout=dropout, is_decoder=True 155 | ) 156 | for _ in range(n_layers) 157 | ] 158 | ) 159 | 160 | self.config = config 161 | if config['adapter']['prefix_tuning']: 162 | self.num_heads = n_head 163 | self.prefix_seq_len = self.config["adapter"]["prefix_seq_len"] 164 | self.hidden_size = d_word_vec 165 | self.n_embd = self.hidden_size // self.num_heads 166 | self.prefix_tokens = torch.arange(self.prefix_seq_len).long() 167 | self.prefix_dropout = torch.nn.Dropout(config["adapter"]["prefix_dropout_prob"]) 168 | self.num_layers = n_layers 169 | 170 | self.prefix_encoder = PrefixEncoder(config, num_hidden_layers=self.num_layers, hidden_size=self.hidden_size) 171 | 172 | def get_prefix_tuning(self, batch_size): 173 | 174 | prefix_tokens = self.prefix_tokens.unsqueeze(0).expand(batch_size, -1)#.to(self.layer_norm.device) 175 | past_key_values = self.prefix_encoder(prefix_tokens) 176 | # bsz, seqlen, _ = past_key_values.shape 177 | past_key_values = past_key_values.view( 178 | batch_size, 179 | self.prefix_seq_len, 180 | self.num_layers * 2, 181 | self.num_heads, 182 | self.n_embd 183 | ) 184 | past_key_values = self.prefix_dropout(past_key_values) 185 | past_key_values = past_key_values.permute([2, 0, 3, 1, 4]).split(2) 186 | return past_key_values 187 | 188 | def forward(self, enc_seq, mask, return_attns=False): # (16, 2842, 256) 189 | 190 | dec_slf_attn_list = [] 191 | batch_size, max_len = enc_seq.shape[0], enc_seq.shape[1] 192 | 193 | if self.config['adapter']['prefix_tuning']: 194 | past_key_values=self.get_prefix_tuning(batch_size=batch_size) 195 | else: 196 | past_key_values=None 197 | 198 | # -- Forward 199 | if not self.training and enc_seq.shape[1] > self.max_seq_len: 200 | # -- Prepare masks 201 | slf_attn_mask = mask.unsqueeze(1).expand(-1, max_len, -1) 202 | dec_output = enc_seq + get_sinusoid_encoding_table( 203 | enc_seq.shape[1], self.d_model 204 | )[: enc_seq.shape[1], :].unsqueeze(0).expand(batch_size, -1, -1).to( 205 | enc_seq.device 206 | ) 207 | else: 208 | max_len = min(max_len, self.max_seq_len) 209 | 210 | # -- Prepare masks 211 | slf_attn_mask = mask.unsqueeze(1).expand(-1, max_len, -1) 212 | dec_output = enc_seq[:, :max_len, :] + self.position_enc[ 213 | :, :max_len, : 214 | ].expand(batch_size, -1, -1) 215 | mask = mask[:, :max_len] 216 | slf_attn_mask = slf_attn_mask[:, :, :max_len] 217 | 218 | for i, dec_layer in enumerate(self.layer_stack): 219 | 220 | past_key_value = past_key_values[i] if past_key_values is not None else None 221 | 222 | dec_output, dec_slf_attn = dec_layer( 223 | dec_output, mask=mask, slf_attn_mask=slf_attn_mask, past_key_value=past_key_value 224 | ) 225 | if return_attns: 226 | dec_slf_attn_list += [dec_slf_attn] 227 | 228 | return dec_output, mask 229 | 230 | 231 | class FFTBlock(nn.Module): 232 | """ FFT Block """ 233 | 234 | def __init__(self, config, d_model, n_head, d_k, d_v, d_inner, kernel_size, dropout=0.1, is_decoder=False): 235 | 
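# An FFTBlock is self-attention followed by a position-wise Conv1d feed-forward network
# (FastSpeech-style). When config["adapter"]["output_bottleneck"] or
# config["adapter"]["conv_adapter"] is enabled, a small adapter module is applied after the
# feed-forward sub-layer in forward(). For prefix-tuning, forward() additionally receives
# past_key_value for this layer: a tensor of shape (2, batch, n_head, prefix_seq_len,
# hidden_size // n_head) produced by Encoder/Decoder.get_prefix_tuning(), whose two halves
# are prepended to the attention keys and values inside MultiHeadAttention.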
super(FFTBlock, self).__init__() 236 | self.slf_attn = MultiHeadAttention(config, n_head, d_model, d_k, d_v, dropout=dropout) 237 | self.pos_ffn = PositionwiseFeedForward( 238 | d_model, d_inner, kernel_size, dropout=dropout 239 | ) 240 | 241 | self.config = config 242 | if config["adapter"]["output_bottleneck"]: 243 | self.adapterblock = BottleneckAdapter("bottleneck_adapter", d_model, int(d_model/2)) 244 | elif config["adapter"]["conv_adapter"]: 245 | self.adapterblock = AdapterBlock(d_model, int(d_model/2)) 246 | 247 | def forward(self, enc_input, mask=None, slf_attn_mask=None, past_key_value=None): 248 | 249 | enc_output, enc_slf_attn = self.slf_attn( 250 | enc_input, enc_input, enc_input, mask=slf_attn_mask, past_key_value=past_key_value 251 | ) 252 | if mask is not None: 253 | enc_output = enc_output.masked_fill(mask.unsqueeze(-1), 0) 254 | 255 | enc_output = self.pos_ffn(enc_output) 256 | 257 | if self.config["adapter"]["output_bottleneck"]: 258 | enc_output = self.adapterblock(x=enc_output, residual_input=enc_output) 259 | elif self.config["adapter"]["conv_adapter"]: 260 | enc_output = self.adapterblock(x=enc_output, residual_input=enc_output) 261 | elif self.config["adapter"]["tiny_attention"]: 262 | enc_output = enc_output + self.tiny_attn(hidden_states=enc_output) 263 | elif self.config["adapter"]["tiny_external_attention"]: 264 | enc_output = enc_output + self.tiny_attn(hidden_states=enc_output) 265 | elif self.config["adapter"]["tiny_conformer"]: 266 | enc_output = enc_output + self.tiny_conformer(hidden_states=enc_output) 267 | 268 | if mask is not None: 269 | enc_output = enc_output.masked_fill(mask.unsqueeze(-1), 0) 270 | 271 | return enc_output, enc_slf_attn 272 | 273 | 274 | class MultiHeadAttention(nn.Module): 275 | """ Multi-Head Attention """ 276 | 277 | def __init__(self, config, n_head, d_model, d_k, d_v, dropout=0.1): 278 | super(MultiHeadAttention, self).__init__() 279 | 280 | self.n_head = n_head 281 | self.d_k = d_k 282 | self.d_v = d_v 283 | 284 | self.config = config 285 | 286 | if config["adapter"]["lora"]: 287 | self.w_qs = lora.Linear(d_model, n_head * d_k, r=8) 288 | self.w_ks = LinearNorm(d_model, n_head * d_k) 289 | self.w_vs = lora.Linear(d_model, n_head * d_v, r=8) 290 | else: 291 | self.w_qs = LinearNorm(d_model, n_head * d_k) 292 | self.w_ks = LinearNorm(d_model, n_head * d_k) 293 | self.w_vs = LinearNorm(d_model, n_head * d_v) 294 | 295 | self.attention = ScaledDotProductAttention(temperature=np.power(d_k, 0.5)) 296 | self.layer_norm = nn.LayerNorm(d_model) 297 | 298 | self.fc = LinearNorm(n_head * d_v, d_model) 299 | 300 | self.dropout = nn.Dropout(dropout) 301 | 302 | def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int, dim: int): 303 | return tensor.view(bsz, seq_len, self.n_head, dim).transpose(1, 2).contiguous() 304 | 305 | def forward(self, q, k, v, mask=None, past_key_value=None): 306 | 307 | d_k, d_v, n_head = self.d_k, self.d_v, self.n_head 308 | 309 | sz_b, len_q, _ = q.size() 310 | sz_b, len_k, _ = k.size() 311 | sz_b, len_v, _ = v.size() 312 | 313 | residual = q 314 | 315 | batch_size = q.size(0) 316 | 317 | if past_key_value is not None: # prefix-tuning 318 | mask = mask.repeat(n_head, 1, 1) 319 | 320 | key_states = self._shape(self.w_ks(k), -1, batch_size, self.d_k) 321 | value_states = self._shape(self.w_vs(v), -1, batch_size, self.d_v) 322 | key_states = torch.cat([past_key_value[0], key_states], dim=2) 323 | value_states = torch.cat([past_key_value[1], value_states], dim=2) 324 | 325 | prefix_attention_mask = 
torch.ones(batch_size, self.config['adapter']['prefix_seq_len']).to(mask.device) 326 | prefix_attention_mask = 1.0 - prefix_attention_mask 327 | prefix_attention_mask = prefix_attention_mask[:, None, :].repeat(n_head, mask.size(-1), 1) 328 | 329 | mask = torch.tensor(torch.cat((prefix_attention_mask, mask), dim=-1), dtype=torch.bool) 330 | 331 | q = self.w_qs(q).view(sz_b, len_q, n_head, d_k).permute(2, 0, 1, 3).contiguous().view(-1, len_q, d_k) 332 | len_k = key_states.size(2) 333 | len_v = value_states.size(2) 334 | k = key_states.permute(1,0,2,3).contiguous().view(-1, len_k, d_k) 335 | v = value_states.permute(1,0,2,3).contiguous().view(-1, len_v, d_v) 336 | 337 | output, attn = self.attention(q, k, v, mask=mask) 338 | 339 | else: 340 | 341 | q = self.w_qs(q).view(sz_b, len_q, n_head, d_k) 342 | k = self.w_ks(k).view(sz_b, len_k, n_head, d_k) 343 | v = self.w_vs(v).view(sz_b, len_v, n_head, d_v) 344 | q = q.permute(2, 0, 1, 3).contiguous().view(-1, len_q, d_k) # (n*b) x lq x dk 345 | k = k.permute(2, 0, 1, 3).contiguous().view(-1, len_k, d_k) # (n*b) x lk x dk 346 | v = v.permute(2, 0, 1, 3).contiguous().view(-1, len_v, d_v) # (n*b) x lv x dv 347 | 348 | mask = mask.repeat(n_head, 1, 1) # (n*b) x .. x .. 349 | output, attn = self.attention(q, k, v, mask=mask) 350 | 351 | output = output.view(n_head, sz_b, len_q, d_v) 352 | output = ( 353 | output.permute(1, 2, 0, 3).contiguous().view(sz_b, len_q, -1) 354 | ) # b x lq x (n*dv) 355 | 356 | output = self.dropout(self.fc(output)) 357 | output = self.layer_norm(output + residual) 358 | 359 | return output, attn 360 | 361 | 362 | class ScaledDotProductAttention(nn.Module): 363 | """ Scaled Dot-Product Attention """ 364 | 365 | def __init__(self, temperature): 366 | super(ScaledDotProductAttention, self).__init__() 367 | self.temperature = temperature 368 | self.softmax = nn.Softmax(dim=2) 369 | 370 | def forward(self, q, k, v, mask=None): 371 | 372 | attn = torch.bmm(q, k.transpose(1, 2)) 373 | attn = attn / self.temperature 374 | 375 | if mask is not None: 376 | attn = attn.masked_fill(mask, -np.inf) 377 | 378 | attn = self.softmax(attn) 379 | output = torch.bmm(attn, v) 380 | 381 | return output, attn 382 | 383 | 384 | class PositionwiseFeedForward(nn.Module): 385 | """ A two-feed-forward-layer """ 386 | 387 | def __init__(self, d_in, d_hid, kernel_size, dropout=0.1): 388 | super(PositionwiseFeedForward, self).__init__() 389 | 390 | # Use Conv1D 391 | # position-wise 392 | self.w_1 = nn.Conv1d( 393 | d_in, 394 | d_hid, 395 | kernel_size=kernel_size[0], 396 | padding=(kernel_size[0] - 1) // 2, 397 | ) 398 | # position-wise 399 | self.w_2 = nn.Conv1d( 400 | d_hid, 401 | d_in, 402 | kernel_size=kernel_size[1], 403 | padding=(kernel_size[1] - 1) // 2, 404 | ) 405 | 406 | self.layer_norm = nn.LayerNorm(d_in) 407 | self.dropout = nn.Dropout(dropout) 408 | 409 | def forward(self, x): 410 | residual = x 411 | output = x.transpose(1, 2) 412 | output = self.w_2(F.relu(self.w_1(output))) 413 | output = output.transpose(1, 2) 414 | output = self.dropout(output) 415 | output = self.layer_norm(output + residual) 416 | 417 | return output 418 | -------------------------------------------------------------------------------- /tasks/asr/asr.py: -------------------------------------------------------------------------------- 1 | 2 | from datasets import load_dataset, load_metric#, Audio 3 | # from datasets import ClassLabel 4 | import random 5 | import pandas as pd 6 | 7 | import numpy as np 8 | import torch 9 | 10 | from dataclasses import field, dataclass 
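# tasks/asr/asr.py: CTC ASR with wav2vec2 ("jonatasgrosman/wav2vec2-large-xlsr-53-english"),
# supporting the parameter-efficient options declared in DataTrainingArguments below
# (bottleneck / conv adapters, prefix-tuning, LoRA) as well as full fine-tuning, on the
# esd, meld, fleurs, voxpopuli and librispeech datasets.
#
# Minimal launch sketch -- the output path and hyperparameters are hypothetical; only the
# argument names come from DataTrainingArguments / HF TrainingArguments. The repo also
# ships tasks/asr/asr_fleurs.sh as a launcher script:
#
#   python tasks/asr/asr.py \
#       --dataset fleurs \
#       --output_dir outputs/asr_fleurs_bottleneck \
#       --trans_adapter_name bottleneck \
#       --output_adapter True \
#       --do_train --do_predict \
#       --per_device_train_batch_size 4 \
#       --learning_rate 5e-4 \
#       --num_train_epochs 10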
11 | from typing import * 12 | from transformers import (set_seed, Wav2Vec2CTCTokenizer, 13 | Wav2Vec2FeatureExtractor, Wav2Vec2Processor, Wav2Vec2Config, 14 | TrainingArguments, EarlyStoppingCallback, HfArgumentParser) 15 | from transformers.integrations import TensorBoardCallback 16 | from torch.optim.lr_scheduler import LambdaLR 17 | 18 | from path import Path 19 | import sys 20 | folder = Path(__file__).abspath() 21 | sys.path.append(folder.parent.parent.parent) 22 | 23 | from os.path import join 24 | import math 25 | 26 | import utils 27 | from transformers import Trainer 28 | from modeling_wav2vec2 import Wav2Vec2ForCTC 29 | from data import get_asr_data, DataCollatorCTCWithPadding, get_asr_meld_data, get_asr_esd_vocab_dict 30 | from datasets import load_metric 31 | 32 | import re 33 | import json 34 | import statistics 35 | 36 | @dataclass 37 | class DataTrainingArguments(TrainingArguments): 38 | dataset: Optional[str] = field( 39 | default="esd", metadata={"help": "dataset name"} 40 | ) 41 | data_dir: Optional[str] = field( 42 | default="/data/path/ESD/en/", metadata={"help": "The dir of the dataset."} 43 | ) 44 | feat_adapter_name: Optional[str] = field( 45 | default="conv_adapter", metadata={"help": "The type of adapter, should be chosen among in {conv_adapter }."} 46 | ) 47 | trans_adapter_name: Optional[str] = field( 48 | default="bottleneck", metadata={"help": "The type of adapter, should be chosen among in {conv_adapter, bottleneck, adapterblock}."} 49 | ) 50 | output_adapter: Optional[bool] = field( 51 | default=False, metadata={"help": "use adapter after FFN"} 52 | ) 53 | mh_adapter: Optional[bool] = field( 54 | default=False, metadata={"help": "use adapter after multi-head attention"} 55 | ) 56 | prefix_tuning: Optional[bool] = field( 57 | default=False, metadata={"help": "use prefix-tuning in multi-head attention, implemented by us"} 58 | ) 59 | prefix_seq_len: Optional[int] = field( 60 | default=30, metadata={"help": "prefix sequence length"} 61 | ) 62 | prefix_projection: Optional[bool] = field( 63 | default=False, 64 | metadata={ 65 | "help": "Apply a two-layer MLP head over the prefix embeddings" 66 | } 67 | ) 68 | prefix_dropout_prob: Optional[bool] = field( 69 | default=0.1, 70 | metadata={ 71 | "help": "The dropout probability used in the models" 72 | } 73 | ) 74 | feat_enc_adapter: Optional[bool] = field( 75 | default=False, metadata={"help": "use conv_adapter in feature encoder and Adapterblock in "} 76 | ) 77 | lora_adapter: Optional[bool] = field( 78 | default=False, metadata={"help": "use lora_adapter in feature encoder"} 79 | ) 80 | fine_tune: Optional[bool] = field( 81 | default=False, metadata={"help": "if fine-tune wav2vec2 or not"} 82 | ) 83 | 84 | def get_mean_length(dataset): 85 | all_lens = [] 86 | for idx, _ in enumerate(dataset): 87 | all_lens.append(dataset[idx]["input_length"]) 88 | return statistics.mean(all_lens) 89 | 90 | def get_linear_schedule_with_warmup(optimizer, num_warmup_steps, num_constant_steps, num_training_steps, last_epoch=-1): 91 | def lr_lambda(current_step: int): 92 | if current_step < num_warmup_steps: 93 | return float(current_step) / float(max(1, num_warmup_steps)) 94 | elif current_step >= num_warmup_steps and current_step < num_constant_steps: 95 | return float(1.0) 96 | return max( 97 | 0.0, float(num_training_steps - current_step) / float(max(1, num_training_steps - num_constant_steps)) 98 | ) 99 | 100 | return LambdaLR(optimizer, lr_lambda, last_epoch) 101 | 102 | 103 | class CustomTrainer(Trainer): 104 | def 
__init__(self, *args, **kwargs): 105 | super().__init__(*args, **kwargs) 106 | self.constant_ratio = 0.4 107 | self.num_constant_steps = -1 108 | def create_scheduler(self, num_training_steps: int, optimizer: torch.optim.Optimizer = None): 109 | if self.lr_scheduler is None: 110 | self.lr_scheduler = get_linear_schedule_with_warmup( 111 | self.optimizer if optimizer is None else optimizer, 112 | num_warmup_steps=self.args.get_warmup_steps(num_training_steps), 113 | num_constant_steps=self.get_keep_constant_steps(num_training_steps), 114 | num_training_steps=num_training_steps) 115 | return self.lr_scheduler 116 | def get_keep_constant_steps(self, num_training_steps: int): 117 | keep_constant_steps = ( 118 | self.num_constant_steps if self.num_constant_steps > 0 else math.ceil(num_training_steps * (self.constant_ratio + self.args.warmup_ratio)) 119 | ) 120 | return keep_constant_steps 121 | 122 | def main(): 123 | set_seed(1314) 124 | 125 | # args 126 | parser = HfArgumentParser(DataTrainingArguments) 127 | args = parser.parse_args_into_dataclasses()[0] 128 | 129 | vocab_json = None 130 | 131 | # audio dataset 132 | if args.dataset.lower() == "esd": 133 | get_asr_esd_vocab_dict(args.data_dir) ## create esd vocab dict 134 | 135 | vocab_json = 'vocab_esd.json' 136 | #processor 137 | tokenizer = Wav2Vec2CTCTokenizer(vocab_json, unk_token="[UNK]", pad_token="[PAD]", word_delimiter_token="|") 138 | feature_extractor = Wav2Vec2FeatureExtractor(feature_size=1, sampling_rate=16000, padding_value=0.0, do_normalize=True, return_attention_mask=True) 139 | processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer) 140 | 141 | train_set, max_len_train = get_asr_data(args.data_dir, processor, "train") 142 | valid_set, max_len_valid = get_asr_data(args.data_dir, processor, "evaluation") 143 | test_set, max_len_test = get_asr_data(args.data_dir, processor, "test") 144 | 145 | elif args.dataset.lower() == "meld": 146 | train_set, max_len_train = get_asr_meld_data(args.data_dir, processor, "train") 147 | valid_set, max_len_valid = get_asr_meld_data(args.data_dir, processor, "evaluation") 148 | test_set, max_len_test = get_asr_meld_data(args.data_dir, processor, "test") 149 | elif args.dataset.lower() == "fleurs": 150 | 151 | fleurs = load_dataset("google/xtreme_s", "fleurs.en_us", cache_dir="/data/path/fleurs") 152 | fleurs = fleurs.remove_columns(["num_samples", "raw_transcription", "gender", "lang_id","language", "lang_group_id"]) 153 | 154 | chars_to_ignore_regex = '[\,\?\.\!\-\;\:\"]' 155 | def remove_special_characters(batch): 156 | batch["transcription"] = re.sub(chars_to_ignore_regex, '', batch["transcription"]).lower() 157 | return batch 158 | fleurs = fleurs.map(remove_special_characters) 159 | 160 | def extract_all_chars(batch): 161 | all_text = " ".join(batch["transcription"]) 162 | vocab = list(set(all_text)) 163 | return {"vocab": [vocab], "all_text": [all_text]} 164 | 165 | vocabs = fleurs.map(extract_all_chars, batched=True, batch_size=-1, keep_in_memory=True, remove_columns=fleurs.column_names["train"]) 166 | vocab_list = list(set(vocabs["train"]["vocab"][0]) | set(vocabs["test"]["vocab"][0])) 167 | 168 | vocab_dict = {v: k for k, v in enumerate(vocab_list)} 169 | 170 | vocab_dict["|"] = vocab_dict[" "] 171 | del vocab_dict[" "] 172 | vocab_dict["[UNK]"] = len(vocab_dict) 173 | vocab_dict["[PAD]"] = len(vocab_dict) 174 | 175 | with open('vocab_fleurs.json', 'w') as vocab_file: 176 | json.dump(vocab_dict, vocab_file) 177 | 178 | vocab_json = 'vocab_fleurs.json' 179 
| 180 | train_set = fleurs["train"] 181 | valid_set = fleurs["validation"] 182 | test_set = fleurs["test"] 183 | elif args.dataset.lower() == "voxpopuli": 184 | 185 | voxpopuli = load_dataset("facebook/voxpopuli", "en", cache_dir="/data/path/voxpopuli") 186 | voxpopuli = voxpopuli.remove_columns(["language", "raw_text", "gender", "speaker_id","is_gold_transcript", "accent"]) 187 | 188 | chars_to_ignore_regex = '[\,\?\.\!\-\;\:\"]' 189 | def remove_special_characters(batch): 190 | batch["normalized_text"] = re.sub(chars_to_ignore_regex, '', batch["normalized_text"]).lower() 191 | return batch 192 | voxpopuli = voxpopuli.map(remove_special_characters) 193 | 194 | def extract_all_chars(batch): 195 | all_text = " ".join(batch["normalized_text"]) 196 | vocab = list(set(all_text)) 197 | return {"vocab": [vocab], "all_text": [all_text]} 198 | 199 | vocabs = voxpopuli.map(extract_all_chars, batched=True, batch_size=-1, keep_in_memory=True, remove_columns=voxpopuli.column_names["train"]) 200 | vocab_list = list(set(vocabs["train"]["vocab"][0]) | set(vocabs["test"]["vocab"][0])) 201 | 202 | vocab_dict = {v: k for k, v in enumerate(vocab_list)} 203 | 204 | vocab_dict["|"] = vocab_dict[" "] 205 | del vocab_dict[" "] 206 | vocab_dict["[UNK]"] = len(vocab_dict) 207 | vocab_dict["[PAD]"] = len(vocab_dict) 208 | 209 | with open('vocab_voxpopuli.json', 'w') as vocab_file: 210 | json.dump(vocab_dict, vocab_file) 211 | 212 | vocab_json = 'vocab_voxpopuli.json' 213 | 214 | train_set = voxpopuli["train"] 215 | valid_set = voxpopuli["validation"] 216 | test_set = voxpopuli["test"] 217 | elif args.dataset.lower() == "librispeech": 218 | librispeech_train = load_dataset('librispeech_asr', 'clean', split='train.100', cache_dir='/data/path/hf_datasets') 219 | librispeech_dev = load_dataset('librispeech_asr', 'clean', split='validation', cache_dir='/data/path/hf_datasets') 220 | librispeech_test = load_dataset('librispeech_asr', 'clean', split='test', cache_dir='/data/path/hf_datasets') 221 | 222 | chars_to_ignore_regex = '[\,\?\.\!\-\;\:\"]' 223 | def remove_special_characters(batch): 224 | batch["text"] = re.sub(chars_to_ignore_regex, '', batch["text"]).lower() 225 | return batch 226 | librispeech_train = librispeech_train.map(remove_special_characters) 227 | librispeech_dev = librispeech_dev.map(remove_special_characters) 228 | librispeech_test = librispeech_test.map(remove_special_characters) 229 | 230 | def extract_all_chars(batch): 231 | all_text = " ".join(batch["text"]) 232 | vocab = list(set(all_text)) 233 | return {"vocab": [vocab], "all_text": [all_text]} 234 | 235 | vocabs_train = librispeech_train.map(extract_all_chars, batched=True, batch_size=-1, keep_in_memory=True, remove_columns=librispeech_train.column_names) 236 | vocab_list = list(set(vocabs_train["vocab"][0])) 237 | 238 | vocab_dict = {v: k for k, v in enumerate(vocab_list)} 239 | 240 | vocab_dict["|"] = vocab_dict[" "] 241 | del vocab_dict[" "] 242 | vocab_dict["[UNK]"] = len(vocab_dict) 243 | vocab_dict["[PAD]"] = len(vocab_dict) 244 | 245 | with open('vocab_librispeech.json', 'w') as vocab_file: 246 | json.dump(vocab_dict, vocab_file) 247 | 248 | vocab_json = 'vocab_librispeech.json' 249 | 250 | train_set = librispeech_train 251 | valid_set = librispeech_dev 252 | test_set = librispeech_test 253 | 254 | if args.dataset.lower() in ["librispeech", "voxpopuli", "fleurs"]: 255 | #processor 256 | tokenizer = Wav2Vec2CTCTokenizer(vocab_json, unk_token="[UNK]", pad_token="[PAD]", word_delimiter_token="|") 257 | feature_extractor = 
Wav2Vec2FeatureExtractor(feature_size=1, sampling_rate=16000, padding_value=0.0, do_normalize=True, return_attention_mask=True) 258 | processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer) 259 | 260 | # prepare dataset 261 | def prepare_dataset(batch): 262 | audio = batch["audio"] 263 | 264 | # batched output is "un-batched" 265 | batch["input_values"] = processor(audio["array"], sampling_rate=audio["sampling_rate"]).input_values[0] 266 | # batch["input_values"] = processor(audio["array"], padding="True", max_length=160000, truncation=True, sampling_rate=audio["sampling_rate"], return_tensors="pt").input_values[0] 267 | batch["input_length"] = len(batch["input_values"]) 268 | 269 | with processor.as_target_processor(): 270 | if args.dataset.lower() == "fleurs": 271 | batch["labels"] = processor(batch["transcription"]).input_ids 272 | elif args.dataset.lower() == "voxpopuli": 273 | batch["labels"] = processor(batch["normalized_text"]).input_ids 274 | elif args.dataset.lower() == "librispeech": 275 | batch["labels"] = processor(batch["text"]).input_ids 276 | # breakpoint() 277 | return batch 278 | 279 | train_set = train_set.map(prepare_dataset, remove_columns=train_set.column_names) 280 | valid_set = valid_set.map(prepare_dataset, remove_columns=valid_set.column_names) 281 | test_set = test_set.map(prepare_dataset, remove_columns=test_set.column_names) 282 | 283 | print(train_set) 284 | print(valid_set) 285 | print(test_set) 286 | 287 | 288 | # config 289 | config = Wav2Vec2Config.from_pretrained("jonatasgrosman/wav2vec2-large-xlsr-53-english", vocab_size=len(processor.tokenizer)) 290 | config._name_or_path = "" 291 | 292 | config.adapter_name = args.trans_adapter_name 293 | config.output_adapter = args.output_adapter 294 | config.mh_adapter = args.mh_adapter 295 | config.prefix_tuning = args.prefix_tuning 296 | config.feat_enc_adapter = args.feat_enc_adapter 297 | config.lora_adapter = args.lora_adapter 298 | config.prefix_seq_len = args.prefix_seq_len 299 | config.prefix_projection = args.prefix_projection 300 | config.prefix_dropout_prob = args.prefix_dropout_prob 301 | config.ctc_loss_reduction = "mean" 302 | config.pad_token_id = processor.tokenizer.pad_token_id 303 | 304 | 305 | # load pretrained model 306 | model = Wav2Vec2ForCTC.from_pretrained("jonatasgrosman/wav2vec2-large-xlsr-53-english", config=config, ignore_mismatched_sizes=True) 307 | model.freeze_feature_encoder() 308 | 309 | # print(model) 310 | 311 | print("\n #Train: {}, #Valid: {}, #Test: {} ".format(len(train_set), len(valid_set), len(test_set))) 312 | # print(" #Train Max len: {}, #Valid Max len: {}, #Test Max len: {} \n".format(max_len_train, max_len_valid, max_len_test)) 313 | 314 | ## freeze all params exclude promptblock and classification head 315 | print("------>>> Trainable params(before freeze):", sum(p.numel() for p in model.parameters() if p.requires_grad)) 316 | if not args.fine_tune: 317 | model.freeze_exclude_prompt() 318 | print("------>>> Trainable params(after freeze):", sum(p.numel() for p in model.parameters() if p.requires_grad)) 319 | 320 | # for name, param in model.named_parameters(): 321 | # if param.requires_grad: 322 | # print(name, param.requires_grad, param.size()) 323 | 324 | data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True, max_length=200000) 325 | wer_metric = load_metric("wer") 326 | 327 | def compute_metrics(pred): 328 | 329 | pred_logits = pred.predictions 330 | pred_ids = np.argmax(pred_logits, axis=-1) 331 | 332 | 
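# DataCollatorCTCWithPadding (data.py) fills padded label positions with -100 so they are
# ignored by the CTC loss; map them back to the tokenizer's pad token before decoding,
# otherwise batch_decode cannot handle the -100 ids and the WER would be computed against
# corrupted references.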
pred.label_ids[pred.label_ids == -100] = processor.tokenizer.pad_token_id 333 | 334 | pred_str = processor.batch_decode(pred_ids) 335 | # we do not want to group tokens when computing the metrics 336 | label_str = processor.batch_decode(pred.label_ids, group_tokens=False) 337 | 338 | wer = wer_metric.compute(predictions=pred_str, references=label_str) 339 | 340 | return {"wer": wer} 341 | 342 | trainer = CustomTrainer( 343 | model=model, 344 | data_collator=data_collator, 345 | args=args, 346 | compute_metrics=compute_metrics, 347 | train_dataset=train_set, 348 | eval_dataset=valid_set, 349 | tokenizer=processor.feature_extractor, ####changed 350 | # callbacks = [EarlyStoppingCallback(early_stopping_patience = 5)] 351 | callbacks = [TensorBoardCallback], 352 | ) 353 | 354 | 355 | save_dir = join(args.output_dir, "best_model") 356 | if args.do_train: # train and test 357 | trainer.train(resume_from_checkpoint=None) 358 | trainer.save_model(save_dir) 359 | 360 | test_metrics = trainer.predict(test_set).metrics 361 | print(test_metrics) 362 | 363 | if args.do_predict: # only for test 364 | device = trainer.model.device 365 | trainer.model = trainer.model.from_pretrained(save_dir).to(device) 366 | 367 | print(trainer.predict) 368 | 369 | test_metrics = trainer.predict(test_set).metrics 370 | print(test_metrics) 371 | 372 | 373 | if __name__ == "__main__": 374 | main() -------------------------------------------------------------------------------- /data.py: -------------------------------------------------------------------------------- 1 | from transformers import Wav2Vec2Processor 2 | from datasets import load_metric 3 | from torch.utils.data import Dataset, DataLoader 4 | import numpy as np 5 | 6 | import utils 7 | 8 | import torch 9 | import os 10 | import re 11 | from os.path import join 12 | from glob import glob 13 | from torch import nn 14 | import random 15 | import torch.nn.functional as F 16 | 17 | # import librosa as rosa 18 | import librosa 19 | # from omegaconf import OmegaConf as OC 20 | from tqdm import tqdm 21 | import multiprocessing as mp 22 | 23 | from dataclasses import dataclass, field 24 | from typing import Any, Dict, List, Optional, Union 25 | 26 | from datasets import ClassLabel 27 | import pandas as pd 28 | # from IPython.display import display, HTML 29 | 30 | import soundfile as sf 31 | from pathlib import Path 32 | from tqdm import tqdm 33 | import json 34 | 35 | from sklearn.metrics import f1_score 36 | from sklearn.metrics.pairwise import cosine_similarity 37 | 38 | def compute_metrics(eval_pred): 39 | """Computes accuracy on a batch of predictions""" 40 | predictions = np.argmax(eval_pred.predictions, axis=1) 41 | metric = load_metric("accuracy") 42 | return metric.compute(predictions=predictions, references=eval_pred.label_ids) 43 | 44 | def compute_metrics_macro_f1(eval_pred): 45 | """Computes accuracy on a batch of predictions""" 46 | predictions = np.argmax(eval_pred.predictions, axis=1) 47 | metric = load_metric("f1") 48 | 49 | # return metric.compute(predictions=predictions, references=eval_pred.label_ids, average='macro') 50 | return metric.compute(predictions=predictions, references=eval_pred.label_ids, average='weighted') 51 | 52 | class SpeechDataset(Dataset): 53 | def __init__(self, audios, labels, processor: Wav2Vec2Processor, sample_rate, all_labels): 54 | self.audios = audios 55 | self.labels = labels 56 | self.processor = processor 57 | self.sample_rate = sample_rate 58 | label2id, id2label = dict(), dict() 59 | for i, label in 
enumerate(all_labels): 60 | label2id[label] = str(i) 61 | id2label[str(i)] = label 62 | 63 | self.num_labels = len(all_labels) 64 | self.label2id = label2id 65 | self.id2label = id2label 66 | 67 | def __getitem__(self, index): 68 | audio_wav = self.audios[index] 69 | 70 | inputs = self.processor(audio_wav, padding="max_length", max_length=40000, truncation=True, sampling_rate=self.sample_rate, return_tensors="pt") 71 | 72 | label = self.labels[index] 73 | 74 | return {'input_values':inputs.input_values.squeeze(0), 75 | 'attention_mask':inputs.attention_mask.squeeze(0), 76 | 'labels': label} 77 | 78 | def __len__(self): 79 | return len(self.audios) 80 | 81 | class AsrDataset(Dataset): 82 | def __init__(self, audios, texts, processor: Wav2Vec2Processor, sample_rate): 83 | self.audios = audios 84 | self.texts = texts 85 | self.processor = processor 86 | self.sample_rate = sample_rate 87 | self.chars_to_ignore_regex = '[\,\?\.\!\-\;\:\"]' 88 | 89 | def remove_special_characters(self, text): 90 | text = re.sub(self.chars_to_ignore_regex, '', text).lower() 91 | return text 92 | 93 | def __getitem__(self, index): 94 | audio_wav = self.audios[index] 95 | text = self.texts[index] 96 | text = self.remove_special_characters(text) 97 | inputs = self.processor(audio_wav, padding="max_length", max_length=40000, truncation=True, sampling_rate=self.sample_rate, return_tensors="pt") 98 | 99 | with self.processor.as_target_processor(): 100 | label = self.processor(text).input_ids 101 | 102 | # breakpoint() 103 | 104 | # import IPython.display as ipd 105 | # ipd.Audio(data=np.asarray(audio_wav), autoplay=True, rate=16000) 106 | input_values = inputs.input_values.squeeze(0) 107 | return {'input_values' : input_values, 108 | 'input_length' : len(input_values), 109 | 'attention_mask' : inputs.attention_mask.squeeze(0), 110 | 'text': text, 111 | 'labels': label} 112 | 113 | def __len__(self): 114 | return len(self.audios) 115 | 116 | class LibriPhoneDataset(Dataset): 117 | def __init__(self, split, tokenizer, bucket_size, path, word2phonemes_path, ascending=False, **kwargs): 118 | # Setup 119 | self.path = path 120 | self.bucket_size = bucket_size 121 | 122 | with open(word2phonemes_path, "r") as f: 123 | word2phonemes = json.load(f) 124 | 125 | # List all wave files 126 | file_list = [] 127 | for s in split: 128 | split_list = list(Path(join(path, s)).rglob("*.flac")) 129 | assert len(split_list) > 0, "No data found @ {}".format(join(path,s)) 130 | file_list += split_list 131 | 132 | text = [] 133 | for f in tqdm(file_list, desc='word -> phonemes'): 134 | text.append(self.read_text(str(f), word2phonemes, tokenizer)) 135 | 136 | self.file_list, self.text = zip(*[(f_name, txt) 137 | for f_name, txt in sorted(zip(file_list, text), reverse=not ascending, key=lambda x:len(x[1]))]) 138 | 139 | # self.file_list = self.file_list[:100] 140 | # self.text = self.text[:100] 141 | 142 | def __getitem__(self, index): 143 | if self.bucket_size > 1: 144 | index = min(len(self.file_list)-self.bucket_size, index) 145 | return [(f_path, txt) for f_path, txt in 146 | zip(self.file_list[index:index+self.bucket_size], self.text[index:index+self.bucket_size])] 147 | else: 148 | return self.file_list[index], self.text[index] 149 | 150 | def __len__(self): 151 | return len(self.file_list) 152 | 153 | def read_text(self, file, word2phonemes, tokenizer): 154 | '''Get transcription of target wave file, 155 | it's somewhat redundant for accessing each txt multiplt times, 156 | but it works fine with multi-thread''' 157 | src_file = 
'-'.join(file.split('-')[:-1])+'.trans.txt' 158 | idx = file.split('/')[-1].split('.')[0] 159 | 160 | with open(src_file, 'r') as fp: 161 | for line in fp: 162 | if idx == line.split(' ')[0]: 163 | transcription = line[:-1].split(' ', 1)[1] 164 | phonemes = [] 165 | for word in transcription.split(): 166 | phonemes += word2phonemes[word] 167 | return tokenizer.encode(' '.join(phonemes)) 168 | 169 | 170 | 171 | class DataCollatorCTCWithPadding: 172 | """ 173 | Data collator that will dynamically pad the inputs received. 174 | Args: 175 | processor (:class:`~transformers.Wav2Vec2Processor`) 176 | The processor used for proccessing the data. 177 | padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`): 178 | Select a strategy to pad the returned sequences (according to the model's padding side and padding index) 179 | among: 180 | * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single 181 | sequence if provided). 182 | * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the 183 | maximum acceptable input length for the model if that argument is not provided. 184 | * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of 185 | different lengths). 186 | max_length (:obj:`int`, `optional`): 187 | Maximum length of the ``input_values`` of the returned list and optionally padding length (see above). 188 | max_length_labels (:obj:`int`, `optional`): 189 | Maximum length of the ``labels`` returned list and optionally padding length (see above). 190 | pad_to_multiple_of (:obj:`int`, `optional`): 191 | If set will pad the sequence to a multiple of the provided value. 192 | This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >= 193 | 7.5 (Volta). 
194 | """ 195 | def __init__(self, 196 | processor: Wav2Vec2Processor, 197 | padding: Union[bool, str] = True, 198 | max_length: Optional[int] = None, 199 | max_length_labels: Optional[int] = None, 200 | pad_to_multiple_of: Optional[int] = None, 201 | pad_to_multiple_of_labels: Optional[int] = None): 202 | 203 | self.processor = processor 204 | self.padding = padding 205 | self.max_length = max_length 206 | self.max_length_labels = max_length_labels 207 | self.pad_to_multiple_of = pad_to_multiple_of 208 | self.pad_to_multiple_of_labels = pad_to_multiple_of_labels 209 | 210 | def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]: 211 | # split inputs and labels since they have to be of different lenghts and need 212 | # different padding methods 213 | input_features = [{"input_values": feature["input_values"]} for feature in features] 214 | label_features = [{"input_ids": feature["labels"]} for feature in features] 215 | 216 | batch = self.processor.pad( 217 | input_features, 218 | # padding=self.padding, 219 | padding='max_length', 220 | max_length=self.max_length, 221 | pad_to_multiple_of=self.pad_to_multiple_of, 222 | return_tensors="pt", 223 | truncation=True 224 | ) 225 | 226 | assert batch["input_values"].size(1) == 200000 227 | 228 | with self.processor.as_target_processor(): 229 | labels_batch = self.processor.pad( 230 | label_features, 231 | padding=self.padding, 232 | max_length=self.max_length_labels, 233 | pad_to_multiple_of=self.pad_to_multiple_of_labels, 234 | return_tensors="pt", 235 | ) 236 | 237 | # replace padding with -100 to ignore loss correctly 238 | labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100) 239 | 240 | batch["labels"] = labels 241 | 242 | return batch 243 | 244 | 245 | 246 | def get_data(data_dir, processor, mode): 247 | data_list = utils.get_file_list(data_dir, mode) 248 | if mode == "train": 249 | import random 250 | random.seed(100) 251 | random.shuffle(data_list) 252 | data_wavs, data_labels, sample_rate, all_labels = utils.read_wav(data_list) 253 | data_set = SpeechDataset(data_wavs, data_labels, processor, sample_rate, all_labels) 254 | max_length = max([len(d_wav) for d_wav in data_wavs]) 255 | return data_set, max_length 256 | 257 | def get_sp_cls_data(data_dir, processor, mode): 258 | data_list = utils.get_file_list(data_dir, mode) 259 | if mode == "train": 260 | import random 261 | random.seed(100) 262 | random.shuffle(data_list) 263 | data_wavs, data_labels, sample_rate, all_labels = utils.read_sp_cls_wav(data_list) 264 | data_set = SpeechDataset(data_wavs, data_labels, processor, sample_rate, all_labels) 265 | max_length = max([len(d_wav) for d_wav in data_wavs]) 266 | return data_set, max_length 267 | 268 | def get_sp_vctk_data(data_dir, processor, mode): 269 | from collections import Counter 270 | 271 | data_list = utils.get_vctk_files(data_dir, mode) 272 | if mode == "train": 273 | import random 274 | random.seed(100) 275 | random.shuffle(data_list) 276 | data_wavs, data_labels, sample_rate, all_labels = utils.read_sp_vstk_wav(data_list) 277 | data_set = SpeechDataset(data_wavs, data_labels, processor, sample_rate, all_labels) 278 | max_length = max([len(d_wav) for d_wav in data_wavs]) 279 | return data_set, max_length 280 | 281 | def get_sv_vctk_data(data_dir, processor, mode): 282 | #from collections import Counter 283 | data_list = utils.get_vctk_files_no_overlap(data_dir, mode)[:512] 284 | if mode == "train": 285 | import random 286 | random.seed(100) 
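# Fixed seed keeps the training-file order reproducible across runs; note that the
# speaker-verification file list above is capped to the first 512 utterances ([:512]).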
287 | random.shuffle(data_list) 288 | data_wavs, data_labels, sample_rate, all_labels = utils.read_sv_vstk_wav(data_list) 289 | data_set = SpeechDataset(data_wavs, data_labels, processor, sample_rate, all_labels) 290 | max_length = max([len(d_wav) for d_wav in data_wavs]) 291 | return data_set, data_list ,max_length 292 | 293 | 294 | def get_emo_cls_iemocap_data(data_dir, processor, mode, wav_file_names, emotions): 295 | from collections import Counter 296 | data_list = utils.get_iemocap_files(data_dir, mode) 297 | if mode == "train": 298 | import random 299 | random.seed(100) 300 | random.shuffle(data_list) 301 | emotion_labels = ['sad', 'xxx', 'ang', 'fru', 'fea', 'exc', 'hap', 'sur', 'oth', 'neu', 'dis'] 302 | 303 | data_wavs, data_labels, sample_rate = utils.read_emo_iemocap_wav(data_list, wav_file_names, emotions, emotion_labels) 304 | data_set = SpeechDataset(data_wavs, data_labels, processor, sample_rate, emotion_labels) 305 | max_length = max([len(d_wav) for d_wav in data_wavs]) 306 | return data_set, max_length 307 | 308 | 309 | def get_emo_meld_data(data_dir, processor, mode): #/data/path/MELD.Raw 310 | data_list = utils.get_meld_files(data_dir, mode) 311 | labels_dict = utils.get_emo_meld_label(data_dir, mode) 312 | emotion_labels = ['neutral', 'joy', 'sadness', 'anger', 'surprise', 'fear', 'disgust'] 313 | 314 | data_wavs, data_labels, sample_rate = utils.read_emo_meld_wav(data_list, labels_dict, emotion_labels) 315 | data_set = SpeechDataset(data_wavs, data_labels, processor, sample_rate, emotion_labels) 316 | max_length = max([len(d_wav) for d_wav in data_wavs]) 317 | return data_set, max_length 318 | 319 | 320 | def get_ks_cls_data(data_dir, processor, mode): 321 | data_list = utils.get_ks_file_list(data_dir, mode) 322 | if mode == "train": 323 | import random 324 | random.seed(100) 325 | random.shuffle(data_list) 326 | data_wavs, data_labels, sample_rate, all_labels = utils.read_ks_cls_wav(data_list) 327 | data_set = SpeechDataset(data_wavs, data_labels, processor, sample_rate, all_labels) 328 | max_length = max([len(d_wav) for d_wav in data_wavs]) 329 | return data_set, max_length 330 | 331 | def get_asr_esd_vocab_dict(data_dir): 332 | all_texts = utils.read_text(data_dir) 333 | def extract_all_chars(texts): 334 | all_text = " ".join(texts) 335 | vocab = list(set(all_text)) 336 | return {"vocab":vocab} 337 | 338 | vocabs = extract_all_chars(all_texts) 339 | 340 | vocab_list = vocabs["vocab"] 341 | vocab_dict = {v: k for k, v in enumerate(vocab_list)} 342 | 343 | vocab_dict["|"] = vocab_dict[" "] 344 | del vocab_dict[" "] 345 | vocab_dict["[UNK]"] = len(vocab_dict) 346 | vocab_dict["[PAD]"] = len(vocab_dict) 347 | 348 | with open('vocab_esd.json', 'w') as vocab_file: 349 | json.dump(vocab_dict, vocab_file) 350 | 351 | 352 | 353 | def get_asr_data(data_dir, processor, mode): 354 | data_list = utils.get_file_list(data_dir, mode) 355 | all_texts = utils.read_text(data_dir) 356 | if mode == "train": 357 | import random 358 | random.seed(100) 359 | random.shuffle(data_list) 360 | data_wavs, data_texts, sample_rate = utils.read_asr_wav(data_list, all_texts) 361 | data_set = AsrDataset(data_wavs, data_texts, processor, sample_rate) 362 | max_length = max([len(d_wav) for d_wav in data_wavs]) 363 | return data_set, max_length 364 | 365 | def get_asr_meld_data(data_dir, processor, mode): 366 | data_list = utils.get_meld_files(data_dir, mode) 367 | utterances_dict = utils.read_meld_text(data_dir, mode) 368 | if mode == "train": 369 | import random 370 | random.seed(100) 371 | 
random.shuffle(data_list) 372 | data_wavs, data_texts, sample_rate = utils.read_asr_meld_wav(data_list, utterances_dict) 373 | data_set = AsrDataset(data_wavs, data_texts, processor, sample_rate) 374 | max_length = max([len(d_wav) for d_wav in data_wavs]) 375 | return data_set, max_length 376 | 377 | if __name__=="__main__": 378 | processor = Wav2Vec2Processor.from_pretrained("jonatasgrosman/wav2vec2-large-xlsr-53-english") 379 | # dataset, max_length = get_asr_data("/data/path/ESD/en/", processor, "test") 380 | # dataset = get_emo_cls_data("/data/path/ESD/en/", processor, "test") 381 | 382 | # get_sp_vctk_data("/data/path/VCTK_Wav/wav48/", processor, "train") 383 | # get_emo_cls_iemocap_data("/data/path/IEMOCAP/IEMOCAP_full_release/", processor, "train") #['train', 'evaluation', 'test']: 384 | # get_emo_meld_data("/data/path/MELD.Raw/", processor, "train") 385 | # get_emo_meld_data("/data/path/MELD.Raw/", processor, "evaluation") 386 | # get_emo_meld_data("/data/path/MELD.Raw/", processor, "test") 387 | 388 | # get_asr_meld_data("/data/path/MELD.Raw/", processor, "train") 389 | # get_asr_meld_data("/data/path/MELD.Raw/", processor, "evaluation") 390 | # get_asr_meld_data("/data/path/MELD.Raw/", processor, "test") 391 | 392 | 393 | ##### process VCTK data from .flac to .pt 394 | 395 | 396 | # def wav2pt(wav): 397 | # y,_ = rosa.load(wav, sr = 48000, mono = True) 398 | # y,_ = rosa.effects.trim(y, 15) 399 | # pt_name = os.path.splitext(wav)[0]+'.pt' 400 | # pt = torch.tensor(y) 401 | # torch.save(pt ,pt_name) 402 | # del y, pt 403 | # return 404 | 405 | # if __name__=='__main__': 406 | # vctk_dataset = load_dataset("vctk", cache_dir='/data/path/VCTK') 407 | # dir = "/data/path/VCTK/downloads/extracted/data/wav48_silence_trimmed" 408 | # wavs = glob(os.path.join(dir, '*/*.flac')) 409 | # pool = mp.Pool(processes = 64) 410 | # with tqdm(total = len(wavs)) as pbar: 411 | # for _ in tqdm(pool.imap_unordered(wav2pt, wavs)): 412 | # pbar.update() 413 | --------------------------------------------------------------------------------