├── .gitignore ├── .gitmodules ├── CODE_OF_CONDUCT.md ├── LICENSE ├── README.md ├── SECURITY.md ├── Speech2C ├── README.md └── speech2c │ ├── __init__.py │ ├── config │ ├── base_100h.yaml │ ├── base_10h.yaml │ └── speech2c_base_librispeech.yaml │ ├── criterions │ ├── __init__.py │ ├── ctc_ce.py │ └── speech2c_criterion.py │ ├── data │ └── speech2c_dataset.py │ ├── models │ ├── modules │ │ ├── ctc_prefix_score.py │ │ ├── multihead_attention.py │ │ ├── relative_pos_enc.py │ │ ├── transformer_decoder.py │ │ ├── transformer_decoder_layer.py │ │ └── transformer_encoder.py │ ├── speech2c.py │ ├── speech2c_asr.py │ └── t5_transformer_lm.py │ ├── squence_generator.py │ └── tasks │ └── speech2c_pretraining.py ├── Speech2S ├── README.md └── speech2s │ ├── __init__.py │ ├── config │ ├── finetune_asr │ │ ├── speechut_base_100h.yaml │ │ ├── speechut_large_100h.yaml │ │ └── speechut_large_960h.yaml │ └── pretrain │ │ ├── speechut_base_librispeech.yaml │ │ └── speechut_large_librilight.yaml │ ├── criterions │ ├── __init__.py │ ├── ctc_ce.py │ └── speechut_criterion.py │ ├── data │ ├── concat_dataset.py │ ├── hubert_dataset.py │ ├── language_trible_dataset.py │ ├── load_langpair_dataset.py │ └── multimodal_corpus_dataset.py │ ├── models │ ├── __init__.py │ ├── speechut.py │ ├── speechut_asr.py │ ├── speechut_st.py │ └── t5_transformer_lm.py │ ├── modules │ ├── __init__.py │ ├── ctc_prefix_score.py │ ├── learned_positional_embedding.py │ ├── multihead_attention.py │ ├── relative_pos_enc.py │ ├── transformer_decoder.py │ ├── transformer_encoder.py │ ├── transformer_layer.py │ └── w2v_encoder.py │ ├── scripts copy │ ├── pretrain_speechut │ │ ├── base_speechut_for_asr.sh │ │ ├── base_speechut_for_st.sh │ │ ├── base_speechut_for_st_enfr.sh │ │ └── large_speechut_for_asr.sh │ ├── tune_speechut_asr │ │ ├── finetune960h_large_edctc.sh │ │ ├── finetune_base_edctc.sh │ │ ├── inference_edctc.sh │ │ ├── inference_edctclm.sh │ │ ├── inference_lm_nj.sh │ │ └── inference_nj.sh │ └── tune_speechut_st │ │ ├── finetune_base_mustc_enxx.sh │ │ └── inference_st.sh │ ├── scripts │ ├── __init__.py │ ├── average_checkpoints.py │ ├── build_sym_alignment.py │ ├── compare_namespaces.py │ ├── compound_split_bleu.sh │ ├── constraints │ │ ├── extract.py │ │ └── validate.py │ ├── convert_dictionary.lua │ ├── convert_model.lua │ ├── count_docs.py │ ├── read_binarized.py │ ├── rm_pt.py │ ├── sacrebleu.sh │ ├── shard_docs.py │ ├── split_train_valid_docs.py │ ├── spm_decode.py │ ├── spm_encode.py │ ├── spm_train.py │ └── test_fsdp.sh │ ├── stpretrain_scripts │ ├── base_sc2c_enes.sh │ ├── base_sc2c_esen.sh │ ├── config.yaml │ ├── config │ │ ├── finetune_asr │ │ │ ├── base_100h.yaml │ │ │ └── large_960h.yaml │ │ ├── pretrain │ │ │ ├── mbart.yaml │ │ │ └── sc2t_base_librispeech.yaml │ │ └── translation │ │ │ └── text2code.yaml │ ├── config_mbart.yaml │ ├── data_process │ │ ├── extract_hubert_feature_itp.sh │ │ ├── merge_code.py │ │ ├── txt2idx.sh │ │ ├── txt2spm.sh │ │ └── wmt │ │ │ ├── normalize_en_text.py │ │ │ └── normalize_es_text.py │ ├── decode_text2code_beam2.sh │ ├── eval2.sh │ ├── eval3.sh │ ├── finetune_enes.sh │ ├── finetune_esen.sh │ ├── inference_ed.sh │ └── train_text2code │ │ ├── base_ReleaseIter2_text2unicode_from400k.sh │ │ ├── base_ReleaseIter2_text2unicode_from400k_es.sh │ │ ├── base_ReleaseIter2_text2unicode_from400k_es2.sh │ │ ├── decode_text2code.sh │ │ ├── decode_text2code_beam2.sh │ │ ├── inference_code_bleu.sh │ │ └── inference_code_wer.sh │ └── tasks │ └── joint_sc2t_pretrain.py ├── SpeechLM ├── README.md ├── 
SpeechLM.py ├── dataset │ ├── CommonVoice │ │ └── v4 │ │ │ └── en │ │ │ └── en-de │ │ │ ├── config_base_ende.yaml │ │ │ ├── config_large_ende.yaml │ │ │ ├── dev-sample100_st_en_de_local.tsv │ │ │ ├── spm_char_st_en_de.model │ │ │ ├── spm_char_st_en_de.txt │ │ │ └── spm_char_st_en_de.vocab │ ├── LibriLM │ │ ├── hidden_unit │ │ │ └── bin-idx │ │ │ │ ├── config.yaml │ │ │ │ ├── dict.km.txt │ │ │ │ └── dict.ltr.txt │ │ └── phone_unit │ │ │ └── bin-idx │ │ │ ├── config.yaml │ │ │ ├── dict.ltr.txt │ │ │ └── dict.phn.txt │ └── LibriSpeech │ │ ├── asr │ │ ├── dict.ltr.txt │ │ ├── train_sample100.ltr │ │ └── train_sample100.tsv │ │ ├── fast_phone2unit │ │ ├── config.yaml │ │ ├── config_generate.yaml │ │ ├── dict.PHN.txt │ │ ├── dict.km.txt │ │ ├── dict.phn.txt │ │ ├── genset_examples.tsv │ │ └── train_exmples.tsv │ │ ├── hidden_unit │ │ ├── dict.km.txt │ │ ├── train_sample100.km │ │ └── train_sample100.tsv │ │ └── phone_unit │ │ ├── dict.phn.txt │ │ ├── train_sample100.phn │ │ └── train_sample100.tsv ├── modules.py └── speechlm │ ├── __init__.py │ ├── config │ ├── decode │ │ ├── infer_fsqlm.yaml │ │ ├── infer_kenlm.yaml │ │ └── infer_viterbi.yaml │ ├── finetune │ │ ├── speechlm_base_100h.yaml │ │ └── speechlm_large_960h.yaml │ └── pretrain │ │ ├── speechlm_base_librispeech.yaml │ │ ├── speechlm_large_librilight.yaml │ │ └── speechlmp_base_cfg.pt │ ├── criterions │ ├── __init__.py │ ├── fasttext2unit_loss.py │ └── speechlm_criterion.py │ ├── data │ ├── concat_dataset.py │ ├── hubert_dataset.py │ ├── language_trible_dataset.py │ ├── load_langpair_dataset.py │ ├── multimodal_corpus_dataset.py │ └── text_to_unit_dataset.py │ ├── data_process │ ├── covost2 │ │ ├── mp3_to_wav.py │ │ └── prepare_covost_data.py │ ├── filter_paireddata_by_len.py │ ├── get_t2u_manifest.py │ ├── get_t2u_manifest_textonly.py │ ├── phoneize_with_sil.py │ ├── phoneme_tokenizer │ │ ├── ltr2kaldi_phn_sil025.py │ │ ├── mean5_and_std25_sil14_spn32.dict │ │ └── repeat_withou_insert_sil_less_4375.py │ ├── prepare_covost2_enxx.sh │ ├── prepare_phn2ltr_librilm.sh │ ├── txt2idx.sh │ └── wrd2ltr.py │ ├── generate_unit.py │ ├── infer.py │ ├── models │ ├── __init__.py │ ├── fasttext2unit.py │ ├── speechlm.py │ ├── speechlm_ctcasr.py │ └── speechlm_st.py │ ├── modules │ ├── __init__.py │ ├── learned_positional_embedding.py │ ├── multihead_attention.py │ ├── relative_pos_enc.py │ ├── transformer_decoder.py │ ├── transformer_encoder.py │ ├── transformer_layer.py │ └── w2v_encoder.py │ ├── scripts │ ├── pretrain_speechlm │ │ ├── base_speechlmh.sh │ │ ├── base_speechlmp.sh │ │ └── large_speechlmp.sh │ ├── tokenizer_fastT2U │ │ ├── generate.sh │ │ ├── infer.sh │ │ └── train_s_5e-4.sh │ ├── tune_speechlm_asr │ │ ├── finetune_base_ctc.sh │ │ ├── finetune_large_ctc.sh │ │ ├── inference_ctc.sh │ │ ├── inference_ctc_kenlm.sh │ │ ├── inference_ctc_large.sh │ │ └── inference_ctc_large_fsqlm.sh │ └── tune_speechlm_st │ │ ├── ft_base_covost_enxx.sh │ │ ├── ft_large_covost_enxx.sh │ │ ├── inference_base.sh │ │ └── inference_large.sh │ ├── tasks │ ├── fast_text_to_unit.py │ └── joint_sc2t_pretrain.py │ └── unit_generator.py ├── SpeechT5 ├── README.md ├── results │ ├── ablation_study.png │ ├── asr.png │ ├── se.png │ ├── sid.png │ ├── st.png │ ├── tts.png │ └── vc.png ├── scripts │ ├── generate_class.py │ └── generate_speech.py ├── speecht5 │ ├── __init__.py │ ├── criterions │ │ ├── __init__.py │ │ ├── speech_pretrain_criterion.py │ │ ├── speech_to_text_loss.py │ │ ├── speecht5_criterion.py │ │ ├── text_pretrain_criterion.py │ │ └── text_to_speech_loss.py │ 
├── data │ │ ├── __init__.py │ │ ├── multitask_dataset.py │ │ ├── speech_dataset.py │ │ ├── speech_to_class_dataset.py │ │ ├── speech_to_speech_dataset.py │ │ ├── speech_to_text_dataset.py │ │ ├── text_dataset.py │ │ └── text_to_speech_dataset.py │ ├── models │ │ ├── __init__.py │ │ ├── modules │ │ │ ├── __init__.py │ │ │ ├── decoder.py │ │ │ ├── encoder.py │ │ │ ├── multihead_attention.py │ │ │ ├── speaker_decoder_postnet.py │ │ │ ├── speech_decoder_postnet.py │ │ │ ├── speech_decoder_prenet.py │ │ │ ├── speech_encoder_postnet.py │ │ │ ├── speech_encoder_prenet.py │ │ │ ├── text_decoder_postnet.py │ │ │ ├── text_decoder_prenet.py │ │ │ ├── text_encoder_prenet.py │ │ │ └── transformer_layer.py │ │ ├── speecht5.py │ │ └── t5_transformer_lm.py │ ├── sequence_generator.py │ └── tasks │ │ ├── __init__.py │ │ └── speecht5.py └── speecht5_framework.png ├── SpeechUT ├── README.md ├── dataset │ ├── LibriSpeech │ │ ├── dict.km.txt │ │ ├── dict.kmu.txt │ │ ├── dict.ltr.txt │ │ └── dict.txt │ └── MuSTC │ │ ├── dict.km.txt │ │ ├── dict.kmu.txt │ │ ├── en_de │ │ ├── config.yaml │ │ ├── config_ende.yaml │ │ ├── dict.kmu.txt │ │ ├── dict.spm.txt │ │ └── spm_unigram10000.model │ │ ├── en_es │ │ ├── config.yaml │ │ ├── config_enes.yaml │ │ ├── dict.kmu.txt │ │ ├── dict.spm.txt │ │ └── spm_unigram10000.model │ │ └── en_fr │ │ ├── config.yaml │ │ ├── config_enfr.yaml │ │ ├── dict.kmu.txt │ │ ├── dict.spm.txt │ │ └── spm_unigram10000.model └── speechut │ ├── __init__.py │ ├── config │ ├── finetune_asr │ │ ├── speechut_base_100h.yaml │ │ ├── speechut_large_100h.yaml │ │ └── speechut_large_960h.yaml │ └── pretrain │ │ ├── speechut_base_librispeech.yaml │ │ └── speechut_large_librilight.yaml │ ├── criterions │ ├── __init__.py │ ├── ctc_ce.py │ └── speechut_criterion.py │ ├── data │ ├── concat_dataset.py │ ├── hubert_dataset.py │ ├── language_trible_dataset.py │ ├── load_langpair_dataset.py │ └── multimodal_corpus_dataset.py │ ├── models │ ├── __init__.py │ ├── speechut.py │ ├── speechut_asr.py │ ├── speechut_st.py │ └── t5_transformer_lm.py │ ├── modules │ ├── __init__.py │ ├── ctc_prefix_score.py │ ├── learned_positional_embedding.py │ ├── multihead_attention.py │ ├── relative_pos_enc.py │ ├── transformer_decoder.py │ ├── transformer_encoder.py │ ├── transformer_layer.py │ └── w2v_encoder.py │ ├── scripts │ ├── pretrain_speechut │ │ ├── base_speechut_for_asr.sh │ │ ├── base_speechut_for_st.sh │ │ ├── base_speechut_for_st_enfr.sh │ │ └── large_speechut_for_asr.sh │ ├── tune_speechut_asr │ │ ├── finetune960h_large_edctc.sh │ │ ├── finetune_base_edctc.sh │ │ ├── inference_edctc.sh │ │ ├── inference_edctclm.sh │ │ ├── inference_lm_nj.sh │ │ └── inference_nj.sh │ └── tune_speechut_st │ │ ├── finetune_base_mustc_enxx.sh │ │ └── inference_st.sh │ ├── squence_generator.py │ └── tasks │ └── joint_sc2t_pretrain.py ├── VATLM ├── README.md └── vat_hubert │ ├── requirements.txt │ └── vathubert │ ├── __init__.py │ ├── conf │ ├── finetune │ │ ├── base_lrs3_30h_av.yaml │ │ ├── base_lrs3_30h_v.yaml │ │ ├── base_vox_30h_av.yaml │ │ ├── base_vox_30h_v.yaml │ │ ├── base_vox_433h_av.yaml │ │ ├── base_vox_433h_v.yaml │ │ ├── large_vox_30h_av.yaml │ │ ├── large_vox_30h_v.yaml │ │ ├── large_vox_433h_av.yaml │ │ └── large_vox_433h_v.yaml │ ├── pretrain │ │ ├── base_lrs3_iter5.yaml │ │ ├── base_vox_iter5.yaml │ │ └── large_vox_iter5.yaml │ └── s2s_decode.yaml │ ├── criterions │ ├── __init__.py │ └── vathubert_criterion.py │ ├── data │ ├── audiohubert_dataset.py │ ├── onlyaudiohubert_dataset.py │ ├── texthubert_dataset.py │ ├── utils.py 
│ └── vathubert_dataset.py │ ├── decode_avhubert_lrs3.sh │ ├── infer_s2s.py │ ├── models │ ├── decoder.py │ ├── resnet.py │ ├── utils.py │ ├── vathubert.py │ └── vathubert_asr.py │ ├── scripts │ ├── finetune_avsr │ │ ├── base_lrs3_finetune30_av.sh │ │ ├── base_vox_finetune30_av.sh │ │ ├── base_vox_finetune433_av.sh │ │ ├── large_vox_finetune30_av.sh │ │ └── large_vox_finetune433_av.sh │ ├── finetune_vsr │ │ ├── base_lrs3_finetune30_v.sh │ │ ├── base_vox_finetune30_v.sh │ │ ├── base_vox_finetune433_v.sh │ │ ├── large_vox_finetune30_v.sh │ │ └── large_vox_finetune433_v.sh │ └── pretrain │ │ ├── base_lsr3_pretrain_iter5.sh │ │ ├── base_vox_pretrain_iter5.sh │ │ └── large_vox_pretrain_iter5.sh │ ├── sequence_generator.py │ ├── tasks │ └── vathubert_pretraining.py │ └── utils.py ├── WavLLM ├── README.md ├── download │ └── download.sh └── wavllm │ ├── __init__.py │ ├── criterions │ └── cross_entropy_acc.py │ ├── data │ ├── speechllm_dataset.py │ └── tokenizer.py │ ├── inference │ ├── generate.py │ └── sequence_generator.py │ ├── models │ ├── llama.py │ ├── speechllm_model.py │ ├── wavlm.py │ └── whisper_encoder.py │ ├── modules │ └── convolution.py │ ├── requirements.txt │ ├── scripts │ └── inference_sft.sh │ ├── tasks │ └── speechllm_task.py │ ├── test_data │ ├── CoT-task-story.tsv │ ├── CoT-task.tsv │ ├── II-task.tsv │ ├── SQA.tsv │ ├── SQQA.tsv │ ├── asr.tsv │ ├── audio │ │ ├── CoT-task-story.wav │ │ ├── CoT-task.wav │ │ ├── II-task.wav │ │ ├── asr.flac │ │ ├── emo.wav │ │ ├── sqa.wav │ │ ├── sqqa.wav │ │ ├── st.flac │ │ └── sv.wav │ ├── dict.txt │ ├── emo.tsv │ ├── en2de.tsv │ ├── gaokao.tsv │ └── sv.tsv │ └── tokenizer │ └── tokenizer.model └── YiTrans ├── .gitignore ├── exp_scripts ├── finetune_ASR │ └── finetune_hubert24_mbart24_en.sh ├── finetune_MT │ └── finetune_mbart_en-de.sh ├── finetune_ST │ └── en-de │ │ └── jtst_pt36s2_mustc.sh └── pretrain │ ├── pretrain_pt36_adaptor_step1.sh │ └── pretrain_pt36_adaptor_step2.sh ├── readme.md └── yitrans_iwslt22 ├── __init__.py ├── config ├── finetune_asr │ └── large_mustc.yaml ├── finetune_mt │ └── mt_translation.yaml └── pretrain │ ├── joint_base.yaml │ └── joint_large.yaml ├── criterions ├── __init__.py ├── ctc_ce.py ├── joint_step1_criterion.py ├── joint_step1_split_batch_criterion.py └── joint_step2_criterion.py ├── data ├── concat_dataset.py ├── denoising_dataset.py ├── lang_pair_mask_dataset.py ├── load_langpair_dataset.py ├── multimodal_corpus_dataset.py └── speech2c_dataset.py ├── models ├── __init__.py ├── _hubert_mt.py ├── finetune_asr.py ├── finetune_mt.py ├── finetune_st.py ├── pretrain_ed.py └── pretrain_ed_step2.py ├── modules ├── __init__.py ├── multihead_attention.py ├── multimodal_transformer_decoder.py ├── relative_pos_enc.py ├── transformer_decoder.py ├── transformer_decoder_layer.py └── w2v_encoder.py ├── sequence_generator.py └── tasks ├── iwslt_joint_pretraining.py └── iwslt_translation_from_pretrain.py /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/ -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "SpeechT5/fairseq"] 2 | path = SpeechT5/fairseq 3 | url = https://github.com/pytorch/fairseq 4 | [submodule "Speech2C/fairseq"] 5 | path = Speech2C/fairseq 6 | url = https://github.com/facebookresearch/fairseq.git 7 | [submodule "YiTrans/fairseq"] 8 | path = YiTrans/fairseq 9 | url = https://github.com/facebookresearch/fairseq 
10 | [submodule "SpeechLM/fairseq"]
11 | path = SpeechLM/fairseq
12 | url = https://github.com/facebookresearch/fairseq.git
13 | [submodule "SpeechUT/fairseq"]
14 | path = SpeechUT/fairseq
15 | url = https://github.com/facebookresearch/fairseq.git
16 | [submodule "VATLM/fairseq"]
17 | path = VATLM/fairseq
18 | url = https://github.com/facebookresearch/fairseq.git
19 | [submodule "Speech2S/fairseq"]
20 | path = Speech2S/fairseq
21 | url = https://github.com/facebookresearch/fairseq.git
22 | branch = adding_womenbios
23 | [submodule "WavLLM/fairseq"]
24 | path = WavLLM/fairseq
25 | url = https://github.com/pytorch/fairseq.git
26 | 
--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
1 | # Microsoft Open Source Code of Conduct
2 | 
3 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
4 | 
5 | Resources:
6 | 
7 | - [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/)
8 | - [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/)
9 | - Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns
10 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) Microsoft Corporation.
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE
22 | 
--------------------------------------------------------------------------------
/Speech2C/speech2c/__init__.py:
--------------------------------------------------------------------------------
1 | from . 
import data, tasks, criterions, models # noqa -------------------------------------------------------------------------------- /Speech2C/speech2c/config/base_100h.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | 3 | common: 4 | fp16: true 5 | log_format: json 6 | log_interval: 200 7 | tensorboard_logdir: tblog 8 | seed: 1337 9 | 10 | checkpoint: 11 | no_epoch_checkpoints: true 12 | best_checkpoint_metric: dec_accuracy 13 | maximize_best_checkpoint_metric: true 14 | 15 | distributed_training: 16 | ddp_backend: c10d 17 | find_unused_parameters: true 18 | distributed_world_size: 1 19 | distributed_port: 29671 20 | nprocs_per_node: 8 21 | 22 | task: 23 | _name: speech2c_pretraining 24 | data: ??? 25 | fine_tuning: true 26 | label_dir: ??? 27 | normalize: false # must be consistent with pre-training 28 | labels: ["ltr"] 29 | single_target: true 30 | add_decoder: true 31 | pad_audio: true 32 | random_crop: false 33 | 34 | dataset: 35 | num_workers: 6 36 | max_tokens: 3200000 37 | skip_invalid_size_inputs_valid_test: true 38 | train_subset: train_100h 39 | valid_subset: dev_other 40 | 41 | criterion: 42 | _name: ctc_ce 43 | zero_infinity: true 44 | 45 | optimization: 46 | max_update: 80000 47 | lr: [0.00004] 48 | sentence_avg: true 49 | update_freq: [1] 50 | 51 | optimizer: 52 | _name: adam 53 | adam_betas: (0.9,0.98) 54 | adam_eps: 1e-08 55 | 56 | lr_scheduler: 57 | _name: tri_stage 58 | phase_ratio: [0.1, 0.4, 0.5] 59 | final_lr_scale: 0.05 60 | 61 | model: 62 | _name: speech2c_ctc 63 | w2v_path: ??? 64 | apply_mask: true 65 | mask_prob: 0.65 66 | mask_channel_prob: 0.5 67 | mask_channel_length: 64 68 | layerdrop: 0.1 69 | decoder_layerdrop: 0.1 70 | activation_dropout: 0.1 71 | feature_grad_mult: 0.0 72 | freeze_finetune_updates: 25000 73 | 74 | hydra: 75 | job: 76 | config: 77 | override_dirname: 78 | kv_sep: '-' 79 | item_sep: '__' 80 | exclude_keys: 81 | - run 82 | - task.data 83 | - task.label_dir 84 | - model.w2v_path 85 | - dataset.train_subset 86 | - dataset.valid_subset 87 | - criterion.wer_kenlm_model 88 | - criterion.wer_lexicon 89 | run: 90 | dir: ??? 91 | sweep: 92 | dir: ??? 93 | subdir: ${hydra.job.config_name}__${hydra.job.override_dirname} 94 | -------------------------------------------------------------------------------- /Speech2C/speech2c/config/speech2c_base_librispeech.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | 3 | common: 4 | fp16: true 5 | log_format: json 6 | log_interval: 200 7 | seed: 1337 8 | tensorboard_logdir: tblog 9 | 10 | checkpoint: 11 | save_interval_updates: 25000 12 | keep_interval_updates: 1 13 | no_epoch_checkpoints: true 14 | 15 | 16 | distributed_training: 17 | ddp_backend: no_c10d 18 | distributed_backend: 'nccl' 19 | distributed_world_size: 32 20 | distributed_port: 29671 21 | nprocs_per_node: 8 22 | find_unused_parameters: true 23 | 24 | task: 25 | _name: speech2c_pretraining 26 | data: ??? 27 | label_dir: ??? 28 | labels: ??? 
29 | label_rate: ${model.label_rate} 30 | sample_rate: 16000 31 | max_sample_size: 250000 32 | min_sample_size: 32000 33 | pad_audio: false 34 | random_crop: true 35 | normalize: false # must be consistent with extractor 36 | add_decoder: true 37 | 38 | dataset: 39 | num_workers: 6 40 | max_tokens: 1400000 41 | skip_invalid_size_inputs_valid_test: true 42 | validate_interval: 5 43 | validate_interval_updates: 10000 44 | 45 | criterion: 46 | _name: speech2c 47 | pred_masked_weight: 1.0 48 | pred_nomask_weight: 0.0 49 | loss_weights: [10,] 50 | 51 | optimization: 52 | max_update: 400000 53 | lr: [0.0005] 54 | clip_norm: 10.0 55 | 56 | optimizer: 57 | _name: adam 58 | adam_betas: (0.9,0.98) 59 | adam_eps: 1e-06 60 | weight_decay: 0.01 61 | 62 | lr_scheduler: 63 | _name: polynomial_decay 64 | warmup_updates: 32000 65 | 66 | model: 67 | _name: speech2c 68 | label_rate: ??? 69 | skip_masked: false 70 | skip_nomask: false 71 | mask_prob: 0.80 72 | extractor_mode: default 73 | conv_feature_layers: '[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2' 74 | final_dim: 256 75 | encoder_layerdrop: 0.05 76 | dropout_input: 0.1 77 | dropout_features: 0.1 78 | dropout: 0.1 79 | attention_dropout: 0.1 80 | feature_grad_mult: 0.1 81 | untie_final_proj: true 82 | activation_dropout: 0.0 83 | use_rel_pos_enc: true 84 | decoder_dict_size: -1 85 | 86 | hydra: 87 | job: 88 | config: 89 | override_dirname: 90 | kv_sep: '-' 91 | item_sep: '__' 92 | exclude_keys: 93 | - run 94 | - task.data 95 | - task.label_dir 96 | run: 97 | dir: ??? 98 | sweep: 99 | dir: ??? 100 | subdir: ${hydra.job.config_name}__${hydra.job.override_dirname} 101 | -------------------------------------------------------------------------------- /Speech2C/speech2c/criterions/__init__.py: -------------------------------------------------------------------------------- 1 | import importlib 2 | import os 3 | 4 | 5 | for file in os.listdir(os.path.dirname(__file__)): 6 | if file.endswith(".py") and not file.startswith("_"): 7 | criterion_name = file[: file.find(".py")] 8 | importlib.import_module( 9 | "speech2c.criterions." 
+ criterion_name
10 |         )
11 | 
--------------------------------------------------------------------------------
/Speech2C/speech2c/models/modules/relative_pos_enc.py:
--------------------------------------------------------------------------------
1 | # --------------------------------------------------------
2 | # Pre-Training Transformer Decoder for End-to-End ASR Model with Unpaired Speech Data (https://arxiv.org/abs/2203.17113)
3 | # Github source: https://github.com/microsoft/SpeechT5/tree/main/Speech2C
4 | # Copyright (c) 2022 Microsoft
5 | # Licensed under The MIT License [see LICENSE for details]
6 | # Based on fairseq code bases
7 | # https://github.com/pytorch/fairseq
8 | # --------------------------------------------------------
9 | 
10 | import torch
11 | 
12 | class RelativePositionalEncoding(torch.nn.Module):
13 |     def __init__(self, d_model, maxlen=1000, embed_v=False):
14 |         super(RelativePositionalEncoding, self).__init__()
15 | 
16 |         self.d_model = d_model
17 |         self.maxlen = maxlen
18 |         self.pe_k = torch.nn.Embedding(2*maxlen, d_model)
19 |         if embed_v:
20 |             self.pe_v = torch.nn.Embedding(2*maxlen, d_model)
21 |         self.embed_v = embed_v
22 | 
23 | 
24 |     def forward(self, pos_seq, incremental_state=None):
25 |         pos_seq[pos_seq < -self.maxlen] = -self.maxlen
26 |         pos_seq[pos_seq >= self.maxlen] = self.maxlen - 1
27 |         pos_seq = pos_seq + self.maxlen
28 | 
29 |         if incremental_state is not None:
30 |             pos_seq = pos_seq[-1:]
31 | 
32 |         if self.embed_v:
33 |             return self.pe_k(pos_seq), self.pe_v(pos_seq)
34 |         else:
35 |             return self.pe_k(pos_seq), None
36 | 
--------------------------------------------------------------------------------
/Speech2C/speech2c/models/t5_transformer_lm.py:
--------------------------------------------------------------------------------
1 | # --------------------------------------------------------
2 | # Pre-Training Transformer Decoder for End-to-End ASR Model with Unpaired Speech Data (https://arxiv.org/abs/2203.17113)
3 | # Github source: https://github.com/microsoft/SpeechT5/tree/main/Speech2C
4 | # Copyright (c) 2022 Microsoft
5 | # Licensed under The MIT License [see LICENSE for details]
6 | # Based on fairseq code bases
7 | # https://github.com/pytorch/fairseq
8 | # --------------------------------------------------------
9 | 
10 | from fairseq.models import (
11 |     register_model_architecture,
12 | )
13 | from fairseq.models.transformer_lm import base_lm_architecture
14 | 
15 | 
16 | @register_model_architecture(model_name="transformer_lm", arch_name="transformer_lm_t5")
17 | def transformer_lm_t5(args):
18 |     args.decoder_embed_dim = getattr(args, "decoder_embed_dim", 1280)
19 |     args.decoder_ffn_embed_dim = getattr(args, "decoder_ffn_embed_dim", 6144)
20 |     args.decoder_layers = getattr(args, "decoder_layers", 20)
21 |     args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 16)
22 |     args.dropout = getattr(args, "dropout", 0.1)
23 |     args.attention_dropout = getattr(args, "attention_dropout", 0.1)
24 |     args.activation_fn = getattr(args, "activation_fn", "gelu")
25 |     base_lm_architecture(args)
26 | 
--------------------------------------------------------------------------------
/Speech2S/speech2s/__init__.py:
--------------------------------------------------------------------------------
1 | from . 
import data, tasks, criterions, models 2 | -------------------------------------------------------------------------------- /Speech2S/speech2s/criterions/__init__.py: -------------------------------------------------------------------------------- 1 | import importlib 2 | import os 3 | 4 | for file in os.listdir(os.path.dirname(__file__)): 5 | if file.endswith(".py") and not file.startswith("_"): 6 | criterion_name = file[: file.find(".py")] 7 | importlib.import_module( 8 | "speechut.criterions." + criterion_name 9 | ) 10 | -------------------------------------------------------------------------------- /Speech2S/speech2s/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/SpeechT5/5d66cf5f37e97f4a1999ad519537decc16d852af/Speech2S/speech2s/models/__init__.py -------------------------------------------------------------------------------- /Speech2S/speech2s/models/t5_transformer_lm.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Pre-Training Transformer Decoder for End-to-End ASR Model with Unpaired Speech Data (https://arxiv.org/abs/2203.17113) 3 | # Github source: https://github.com/microsoft/SpeechT5/tree/main/Speech2C 4 | # Copyright (c) 2022 Microsoft 5 | # Licensed under The MIT License [see LICENSE for details] 6 | # Based on fairseq code bases 7 | # https://github.com/pytorch/fairseq 8 | # -------------------------------------------------------- 9 | 10 | from fairseq.models import ( 11 | register_model_architecture, 12 | ) 13 | from fairseq.models.transformer_lm import base_lm_architecture 14 | 15 | 16 | @register_model_architecture(model_name="transformer_lm", arch_name="transformer_lm_t5") 17 | def transformer_lm_t5(args): 18 | args.decoder_embed_dim = getattr(args, "decoder_embed_dim", 1280) 19 | args.decoder_ffn_embed_dim = getattr(args, "decoder_ffn_embed_dim", 6144) 20 | args.decoder_layers = getattr(args, "decoder_layers", 20) 21 | args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 16) 22 | args.dropout = getattr(args, "dropout", 0.1) 23 | args.attention_dropout = getattr(args, "attention_dropout", 0.1) 24 | args.activation_fn = getattr(args, "activation_fn", "gelu") 25 | base_lm_architecture(args) 26 | -------------------------------------------------------------------------------- /Speech2S/speech2s/modules/__init__.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Copyright (c) 2022 Microsoft 3 | # Licensed under The MIT License [see LICENSE for details] 4 | # Based on fairseq code bases 5 | # https://github.com/facebookresearch/fairseq 6 | # -------------------------------------------------------- 7 | 8 | from .learned_positional_embedding import LearnedPositionalEmbedding 9 | from .multihead_attention import MultiheadAttention 10 | from .relative_pos_enc import RelativePositionalEncoding 11 | from .transformer_layer import TransformerEncoderLayerBase, TransformerDecoderLayerBase 12 | from .w2v_encoder import TransformerEncoder, TransformerSentenceEncoderLayer 13 | from .transformer_encoder import TransformerEncoderBase 14 | from .transformer_decoder import TransformerDecoderScriptable, TransformerDecoderBaseScriptable 15 | 16 | __all__ = [ 17 | "MultiheadAttention", 18 | "RelativePositionalEncoding", 19 | "LearnedPositionalEmbedding", 20 | 
"TransformerEncoderLayerBase", 21 | "TransformerDecoderLayerBase", 22 | "TransformerEncoder", 23 | "TransformerSentenceEncoderLayer", 24 | "TransformerEncoderBase", 25 | "TransformerDecoderScriptable", 26 | "TransformerDecoderBaseScriptable", 27 | ] 28 | -------------------------------------------------------------------------------- /Speech2S/speech2s/modules/relative_pos_enc.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Copyright (c) 2022 Microsoft 3 | # Licensed under The MIT License [see LICENSE for details] 4 | # Based on fairseq code bases 5 | # https://github.com/facebookresearch/fairseq 6 | # -------------------------------------------------------- 7 | 8 | import torch 9 | 10 | class RelativePositionalEncoding(torch.nn.Module): 11 | def __init__(self, d_model, maxlen=1000, embed_v=False): 12 | super(RelativePositionalEncoding, self).__init__() 13 | 14 | self.d_model = d_model 15 | self.maxlen = maxlen 16 | self.pe_k = torch.nn.Embedding(2*maxlen, d_model) 17 | if embed_v: 18 | self.pe_v = torch.nn.Embedding(2*maxlen, d_model) 19 | self.embed_v = embed_v 20 | 21 | 22 | def forward(self, pos_seq, incremental_state=None): 23 | pos_seq[pos_seq < -self.maxlen] = -self.maxlen 24 | pos_seq[pos_seq >= self.maxlen] = self.maxlen - 1 25 | pos_seq = pos_seq + self.maxlen 26 | 27 | if incremental_state is not None: 28 | pos_seq = pos_seq[-1:] 29 | 30 | if self.embed_v: 31 | return self.pe_k(pos_seq), self.pe_v(pos_seq) 32 | else: 33 | return self.pe_k(pos_seq), None 34 | -------------------------------------------------------------------------------- /Speech2S/speech2s/scripts copy/pretrain_speechut/base_speechut_for_asr.sh: -------------------------------------------------------------------------------- 1 | # #################################### 2 | # SpeechUT Base model # 3 | # #################################### 4 | [ $# -lt 2 ] && echo "Usage: $0 [mount=${PWD}] [world_size=32] [update_freq=1]" && exit 1 5 | [ ${PWD##*/} != SpeechUT ] && echo "Error: dir not match! Switch to SpeechUT/ and run it again!" 
&& exit 1
6 | DATA_DIR=$1
7 | TEXT_DATA_DIR=$2
8 | mount=$3
9 | world_size=$4
10 | update_freq=$5
11 | [ -z $mount ] && mount=${PWD}
12 | [ -z $world_size ] && world_size=32
13 | [ -z $update_freq ] && update_freq=1
14 | 
15 | CODE_ROOT=${PWD}
16 | MODEL_DIR="${mount}/exp/pretrain/base_speechut4asr_${world_size}gpu_${update_freq}accum"
17 | [ -d $MODEL_DIR ] || mkdir -p $MODEL_DIR
18 | 
19 | python $CODE_ROOT/fairseq/fairseq_cli/hydra_train.py \
20 |     --config-dir $CODE_ROOT/speechut/config/pretrain \
21 |     --config-name speechut_base_librispeech \
22 |     common.user_dir=$CODE_ROOT/speechut \
23 |     \
24 |     task.labels='["km"]' \
25 |     model.label_rate=50 \
26 |     task.data=$DATA_DIR \
27 |     task.label_dir=$DATA_DIR \
28 |     task.text_cfg.text_data=$TEXT_DATA_DIR \
29 |     \
30 |     dataset.train_subset=\"train_960+pseudo_libritext.kmu-ltr+merge_960.kmu-none\" \
31 |     dataset.valid_subset=\"dev_clean+dev.kmu-ltr+dev.kmu-none\" \
32 |     dataset.num_workers=0 \
33 |     dataset.max_tokens=1400000 \
34 |     distributed_training.distributed_world_size=${world_size} \
35 |     optimization.update_freq=[${update_freq}] \
36 |     \
37 |     common.tensorboard_logdir=$MODEL_DIR \
38 |     checkpoint.save_dir=$MODEL_DIR \
39 |     hydra.run.dir=$MODEL_DIR \
40 |     hydra.job.name=base_speechut4asr_${world_size}gpu_${update_freq}accum
41 | 
--------------------------------------------------------------------------------
/Speech2S/speech2s/scripts copy/pretrain_speechut/base_speechut_for_st.sh:
--------------------------------------------------------------------------------
1 | # ####################################
2 | # SpeechUT Base model #
3 | # ####################################
4 | [ $# -lt 3 ] && echo "Usage: $0 <DATA_DIR> <TEXT_DATA_DIR> <lang> [mount=${PWD}] [world_size=32] [update_freq=1]" && exit 1
5 | [ ${PWD##*/} != SpeechUT ] && echo "Error: dir not match! Switch to SpeechUT/ and run it again!" 
&& exit 1 6 | DATA_DIR=$1 7 | TEXT_DATA_DIR=$2 8 | lang=$3 9 | mount=$4 10 | world_size=$5 11 | update_freq=$6 12 | [ -z $mount ] && mount=${PWD} 13 | [ -z $world_size ] && world_size=32 14 | [ -z $update_freq ] && update_freq=1 15 | 16 | CODE_ROOT=${PWD} 17 | MODEL_DIR="${mount}/exp/pretrain/base_speechut4en${lang}_${world_size}gpu_${update_freq}accum" 18 | [ -d $MODEL_DIR ] || mkdir -p $MODEL_DIR 19 | 20 | python $CODE_ROOT/fairseq/fairseq_cli/hydra_train.py \ 21 | --config-dir $CODE_ROOT/speechut/config/pretrain \ 22 | --config-name speechut_base_librispeech \ 23 | common.user_dir=$CODE_ROOT/speechut \ 24 | \ 25 | task.labels='["km"]' \ 26 | model.label_rate=50 \ 27 | task.data=$DATA_DIR \ 28 | task.label_dir=$DATA_DIR \ 29 | task.text_cfg.text_data=$TEXT_DATA_DIR \ 30 | \ 31 | model.add_text_ctc=false \ 32 | model.text_transformer.share_decoder_input_output_embed=true \ 33 | criterion.u2t_ed_weight=1.0 \ 34 | criterion.u2t_ctc_weight=0 \ 35 | \ 36 | dataset.train_subset=\"train_960,mustcuns_${lang}+pseudo_wmt_en${lang}.kmu-spm+train_960.kmu-none,mustcuns_${lang}.kmu-none\" \ 37 | dataset.valid_subset=\"dev_clean+pseudo_valid.kmu-spm+dev.kmu-none\" \ 38 | dataset.num_workers=0 \ 39 | dataset.max_tokens=1400000 \ 40 | distributed_training.distributed_world_size=${world_size} \ 41 | optimization.update_freq=[${update_freq}] \ 42 | \ 43 | common.tensorboard_logdir=$MODEL_DIR \ 44 | checkpoint.save_dir=$MODEL_DIR \ 45 | hydra.run.dir=$MODEL_DIR \ 46 | hydra.job.name=base_speechut4en${lang}_${world_size}gpu_${update_freq}accum 47 | 48 | -------------------------------------------------------------------------------- /Speech2S/speech2s/scripts copy/pretrain_speechut/base_speechut_for_st_enfr.sh: -------------------------------------------------------------------------------- 1 | # #################################### 2 | # SpeechUT Base model # 3 | # #################################### 4 | [ $# -lt 3 ] && echo "Usage: $0 [lang=fr] [mount=${PWD}] [world_size=32] [update_freq=1]" && exit 1 5 | [ ${PWD##*/} != SpeechUT ] && echo "Error: dir not match! Switch to SpeechUT/ and run it again!" 
&& exit 1 6 | DATA_DIR=$1 7 | TEXT_DATA_DIR=$2 8 | lang=$3 9 | mount=$4 10 | world_size=$5 11 | update_freq=$6 12 | [ -z $lang ] && lang=fr 13 | [ -z $mount ] && mount=${PWD} 14 | [ -z $world_size ] && world_size=32 15 | [ -z $update_freq ] && update_freq=1 16 | 17 | CODE_ROOT=${PWD} 18 | MODEL_DIR="${mount}/exp/pretrain/base_speechut4en${lang}_${world_size}gpu_${update_freq}accum" 19 | [ -d $MODEL_DIR ] || mkdir -p $MODEL_DIR 20 | 21 | python $CODE_ROOT/fairseq/fairseq_cli/hydra_train.py \ 22 | --config-dir $CODE_ROOT/speechut/config/pretrain \ 23 | --config-name speechut_base_librispeech \ 24 | common.user_dir=$CODE_ROOT/speechut \ 25 | \ 26 | task.labels='["km"]' \ 27 | model.label_rate=50 \ 28 | task.data=$DATA_DIR \ 29 | task.label_dir=$DATA_DIR \ 30 | task.text_cfg.text_data=$TEXT_DATA_DIR \ 31 | \ 32 | model.add_text_ctc=false \ 33 | criterion.u2t_ed_weight=1.0 \ 34 | criterion.u2t_ctc_weight=0 \ 35 | \ 36 | dataset.train_subset=\"train_960,pretrain_mustc+pseudo_wmt14_enfr.kmu-spm+train_960.kmu-none,pretrain_mustc.kmu-none\" \ 37 | dataset.valid_subset=\"dev_clean+pseudo_valid.kmu-spm+dev.kmu-none\" \ 38 | dataset.num_workers=0 \ 39 | dataset.max_tokens=1400000 \ 40 | optimization.max_update=600000 \ 41 | distributed_training.distributed_world_size=${world_size} \ 42 | optimization.update_freq=[${update_freq}] \ 43 | \ 44 | common.tensorboard_logdir=$MODEL_DIR \ 45 | checkpoint.save_dir=$MODEL_DIR \ 46 | hydra.run.dir=$MODEL_DIR \ 47 | hydra.job.name=base_speechut4en${lang}_${world_size}gpu_${update_freq}accum 48 | 49 | -------------------------------------------------------------------------------- /Speech2S/speech2s/scripts copy/pretrain_speechut/large_speechut_for_asr.sh: -------------------------------------------------------------------------------- 1 | # #################################### 2 | # SpeechUT Large model # 3 | # #################################### 4 | [ $# -lt 2 ] && echo "Usage: $0 [mount=${PWD}] [world_size=32] [update_freq=4]" && exit 1 5 | [ ${PWD##*/} != SpeechUT ] && echo "Error: dir not match! Switch to SpeechUT/ and run it again!" 
&& exit 1
6 | DATA_DIR=$1
7 | TEXT_DATA_DIR=$2
8 | mount=$3
9 | world_size=$4
10 | update_freq=$5
11 | [ -z $mount ] && mount=${PWD}
12 | [ -z $world_size ] && world_size=32
13 | [ -z $update_freq ] && update_freq=4
14 | 
15 | CODE_ROOT=${PWD}
16 | MODEL_DIR="${mount}/exp/pretrain/large_speechut4asr_${world_size}gpu_${update_freq}accum"
17 | [ -d $MODEL_DIR ] || mkdir -p $MODEL_DIR
18 | 
19 | python $CODE_ROOT/fairseq/fairseq_cli/hydra_train.py \
20 |     --config-dir $CODE_ROOT/speechut/config/pretrain \
21 |     --config-name speechut_large_librilight \
22 |     common.user_dir=$CODE_ROOT/speechut \
23 |     \
24 |     task.labels='["km"]' \
25 |     model.label_rate=50 \
26 |     task.data=$DATA_DIR \
27 |     task.label_dir=$DATA_DIR \
28 |     task.text_cfg.text_data=$TEXT_DATA_DIR \
29 |     \
30 |     dataset.train_subset=\"train_small+pseudo_libritext.kmu-ltr\" \
31 |     dataset.valid_subset=\"dev_clean+dev.kmu-ltr\" \
32 |     dataset.num_workers=0 \
33 |     dataset.max_tokens=900000 \
34 |     distributed_training.distributed_world_size=${world_size} \
35 |     optimization.update_freq=[${update_freq}] \
36 |     \
37 |     common.tensorboard_logdir=$MODEL_DIR \
38 |     checkpoint.save_dir=$MODEL_DIR \
39 |     hydra.run.dir=$MODEL_DIR \
40 |     hydra.job.name=large_speechut4asr_${world_size}gpu_${update_freq}accum
41 | 
--------------------------------------------------------------------------------
/Speech2S/speech2s/scripts copy/tune_speechut_asr/finetune960h_large_edctc.sh:
--------------------------------------------------------------------------------
1 | # ####################################
2 | # SpeechUT Large model #
3 | # ####################################
4 | [ $# -lt 3 ] && echo "Usage: $0 <w2v_path> <DATA_DIR> <cpt> [mount=${PWD}] [world_size=8] [update_freq=3]" && exit 1
5 | [ ${PWD##*/} != SpeechUT ] && echo "Error: dir not match! Switch to SpeechUT/ and run it again!" 
&& exit 1 6 | 7 | w2v_path=$1 8 | DATA_DIR=$2 9 | cpt=$3 10 | mount=$4 11 | world_size=$5 12 | update_freq=$6 13 | [ -z $mount ] && mount=${PWD} 14 | [ -z $world_size ] && world_size=8 15 | [ -z $update_freq ] && update_freq=3 16 | 17 | CODE_ROOT=${PWD} 18 | 19 | exp_name=${w2v_path%/*} 20 | exp_name=${exp_name##*/} 21 | MODEL_DIR="${mount}/exp/finetune_asr/$exp_name/960h_edctc80k_from_${cpt}_bz3.3m_lr1e-5" 22 | [ -d $MODEL_DIR ] || mkdir -p $MODEL_DIR 23 | 24 | python $CODE_ROOT/fairseq/fairseq_cli/hydra_train.py \ 25 | --config-dir $CODE_ROOT/speechut/config/finetune_asr \ 26 | --config-name speechut_large_960h \ 27 | common.user_dir=$CODE_ROOT/speechut \ 28 | \ 29 | task.data=$DATA_DIR \ 30 | task.label_dir=$DATA_DIR \ 31 | model.w2v_path=${w2v_path} \ 32 | \ 33 | optimization.lr=[0.00001] \ 34 | optimization.max_update=80000 \ 35 | dataset.max_tokens=1100000 \ 36 | optimization.update_freq=[${update_freq}] \ 37 | distributed_training.distributed_world_size=${world_size} \ 38 | \ 39 | dataset.train_subset="train_960" \ 40 | dataset.valid_subset="dev_other" \ 41 | \ 42 | common.tensorboard_logdir=$MODEL_DIR \ 43 | checkpoint.save_dir=$MODEL_DIR \ 44 | hydra.run.dir=$MODEL_DIR \ 45 | hydra.job.name=960h_edctc80k_from_${cpt}_bz3.3m_lr1e-5 46 | -------------------------------------------------------------------------------- /Speech2S/speech2s/scripts copy/tune_speechut_asr/finetune_base_edctc.sh: -------------------------------------------------------------------------------- 1 | # #################################### 2 | # SpeechUT Base model # 3 | # #################################### 4 | [ $# -lt 3 ] && echo "Usage: $0 [mount=${PWD}] [world_size=8] [update_freq=2]" && exit 1 5 | [ ${PWD##*/} != SpeechUT ] && echo "Error: dir not match! Switch to SpeechUT/ and run it again!" 
&& exit 1 6 | 7 | w2v_path=$1 8 | DATA_DIR=$2 9 | cpt=$3 10 | mount=$4 11 | world_size=$5 12 | update_freq=$6 13 | [ -z $mount ] && mount=${PWD} 14 | [ -z $world_size ] && world_size=8 15 | [ -z $update_freq ] && update_freq=2 16 | 17 | CODE_ROOT=${PWD} 18 | 19 | exp_name=${w2v_path%/*} 20 | exp_name=${exp_name##*/} 21 | MODEL_DIR="${mount}/exp/finetune_asr/$exp_name/edctc40k_from_${cpt}_bz2.6m_lr1e-5" 22 | [ -d $MODEL_DIR ] || mkdir -p $MODEL_DIR 23 | 24 | python $CODE_ROOT/fairseq/fairseq_cli/hydra_train.py \ 25 | --config-dir $CODE_ROOT/speechut/config/finetune_asr \ 26 | --config-name speechut_base_100h \ 27 | common.user_dir=$CODE_ROOT/speechut \ 28 | \ 29 | task.data=$DATA_DIR \ 30 | task.label_dir=$DATA_DIR \ 31 | model.w2v_path=${w2v_path} \ 32 | \ 33 | optimization.lr=[0.00001] \ 34 | optimization.max_update=40000 \ 35 | dataset.max_tokens=1300000 \ 36 | optimization.update_freq=[${update_freq}] \ 37 | distributed_training.distributed_world_size=${world_size} \ 38 | \ 39 | dataset.train_subset="train_clean_100" \ 40 | dataset.valid_subset="dev_other" \ 41 | \ 42 | common.tensorboard_logdir=$MODEL_DIR \ 43 | checkpoint.save_dir=$MODEL_DIR \ 44 | hydra.run.dir=$MODEL_DIR \ 45 | hydra.job.name=edctc40k_from_${cpt}_bz2.6m_lr1e-5 46 | -------------------------------------------------------------------------------- /Speech2S/speech2s/scripts copy/tune_speechut_asr/inference_edctc.sh: -------------------------------------------------------------------------------- 1 | ##################################### 2 | # SpeechUT ASR model # 3 | ##################################### 4 | [ $# -lt 2 ] && echo "Usage: $0 [gen-set=dev_other] [beam_size=10] [ctc_weight=0.2] [--normalize]" && exit 1 5 | [ ${PWD##*/} != SpeechUT ] && echo "Error: dir not match! Switch to SpeechUT/ and run it again!" && exit 1 6 | 7 | model_path=$1 8 | DATA_DIR=$2 9 | gen_set=$3 10 | beam_size=$4 11 | ctc_weight=$5 12 | extra=$6 13 | [ -z $extra ] && echo "Assert decoding base model! If you are decoding large model, please add '--normalize' at the end..." 14 | [ -z $gen_set ] && gen_set="dev_other" 15 | [ -z $beam_size ] && beam_size=10 16 | [ -z $ctc_weight ] && ctc_weight=0.2 17 | [ $ctc_weight == 0 ] && [ $beam_size != 1 ] && echo "Change beam size to 1 as no ctc-decoding used..." && beam_size=1 18 | [ $ctc_weight != 0 ] && extra="$extra --batch-size 1" 19 | 20 | src_dir=${model_path%/*} 21 | cpt=${model_path##*/} 22 | cpt=${cpt%.*} 23 | 24 | CODE_ROOT=${PWD} 25 | 26 | for subset in ${gen_set//,/ }; do 27 | results_path=$src_dir/decode_${cpt}/beam${beam_size}_ctc${ctc_weight}/${subset}_${world_size}_${rank} 28 | [ ! 
-d $results_path ] && mkdir -p $results_path 29 | 30 | python $CODE_ROOT/fairseq/fairseq_cli/generate.py $DATA_DIR \ 31 | --user-dir $CODE_ROOT/speechut \ 32 | --label-dir ${DATA_DIR} \ 33 | --labels '["ltr"]' \ 34 | --single-target \ 35 | --post-process letter \ 36 | --gen-subset ${subset} \ 37 | --max-tokens 2000000 \ 38 | \ 39 | --task joint_sc2t_pretraining \ 40 | --add-decoder-target \ 41 | --fine-tuning \ 42 | --pad-audio \ 43 | --random-crop \ 44 | \ 45 | --ctc-weight ${ctc_weight} $extra \ 46 | --beam ${beam_size} \ 47 | \ 48 | --path ${model_path} \ 49 | --results-path $results_path \ 50 | \ 51 | --scoring wer --max-len-a 0.00078125 --max-len-b 200 \ 52 | & 53 | done 54 | wait 55 | 56 | 57 | for subset in ${gen_set//,/ }; do 58 | results_path=$src_dir/decode_${cpt}/beam${beam_size}_ctc${ctc_weight}/${subset}_${world_size}_${rank} 59 | echo $results_path 60 | tail -n 1 $results_path/generate-*.txt 61 | done 62 | -------------------------------------------------------------------------------- /Speech2S/speech2s/scripts copy/tune_speechut_st/inference_st.sh: -------------------------------------------------------------------------------- 1 | # #################################### 2 | # SpeechUT Base model # 3 | # #################################### 4 | [ $# -lt 3 ] && echo "Usage: $0 [gen-set=dev] [beam_size=10] [lenpen=1.0]" && exit 0 5 | [ ${PWD##*/} != SpeechUT ] && echo "Error: dir not match! Switch to SpeechUT/ and run it again!" && exit 1 6 | 7 | model_path=$1 8 | DATA_DIR=$2 9 | lang=$3 10 | gen_set=$4 11 | beam_size=$5 12 | lenpen=$6 13 | [ -z $gen_set ] && gen_set="dev" 14 | [ -z $beam_size ] && beam_size=10 15 | [ -z $lenpen ] && lenpen=1 16 | src_dir=${model_path%/*} 17 | cpt=${model_path##*/} 18 | cpt=${cpt%.*} 19 | 20 | CODE_ROOT=${PWD} 21 | results_path=$src_dir/decode_${cpt}_beam${beam_size}/${gen_set} 22 | [ ! 
-d $results_path ] && mkdir -p $results_path 23 | 24 | python $CODE_ROOT/fairseq/fairseq_cli/generate.py $DATA_DIR \ 25 | --gen-subset ${gen_set}_st \ 26 | --max-tokens 2000000 \ 27 | --max-source-positions 2000000 \ 28 | --num-workers 0 \ 29 | \ 30 | --user-dir $CODE_ROOT/speechut \ 31 | --task speech_to_text \ 32 | --config-yaml config_en${lang}.yaml \ 33 | \ 34 | --path ${model_path} \ 35 | --results-path $results_path \ 36 | \ 37 | --scoring sacrebleu --max-len-a 0 --max-len-b 512 \ 38 | --beam ${beam_size} \ 39 | --lenpen $lenpen \ 40 | # --model-overrides "{'model':{'w2v_path':'/path/to/your/pretrained/model.pt'}}" \ 41 | 42 | echo $results_path 43 | tail -n 1 $results_path/generate-*.txt 44 | sleep 1s 45 | -------------------------------------------------------------------------------- /Speech2S/speech2s/scripts/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/SpeechT5/5d66cf5f37e97f4a1999ad519537decc16d852af/Speech2S/speech2s/scripts/__init__.py -------------------------------------------------------------------------------- /Speech2S/speech2s/scripts/compare_namespaces.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """Helper script to compare two argparse.Namespace objects.""" 3 | 4 | from argparse import Namespace # noqa 5 | 6 | 7 | def main(): 8 | 9 | ns1 = eval(input("Namespace 1: ")) 10 | ns2 = eval(input("Namespace 2: ")) 11 | 12 | def keys(ns): 13 | ks = set() 14 | for k in dir(ns): 15 | if not k.startswith("_"): 16 | ks.add(k) 17 | return ks 18 | 19 | k1 = keys(ns1) 20 | k2 = keys(ns2) 21 | 22 | def print_keys(ks, ns1, ns2=None): 23 | for k in ks: 24 | if ns2 is None: 25 | print("{}\t{}".format(k, getattr(ns1, k, None))) 26 | else: 27 | print( 28 | "{}\t{}\t{}".format(k, getattr(ns1, k, None), getattr(ns2, k, None)) 29 | ) 30 | 31 | print("Keys unique to namespace 1:") 32 | print_keys(k1 - k2, ns1) 33 | print() 34 | 35 | print("Keys unique to namespace 2:") 36 | print_keys(k2 - k1, ns2) 37 | print() 38 | 39 | print("Overlapping keys with different values:") 40 | ks = [k for k in k1 & k2 if getattr(ns1, k, "None") != getattr(ns2, k, "None")] 41 | print_keys(ks, ns1, ns2) 42 | print() 43 | 44 | 45 | if __name__ == "__main__": 46 | main() 47 | -------------------------------------------------------------------------------- /Speech2S/speech2s/scripts/compound_split_bleu.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ $# -ne 1 ]; then 4 | echo "usage: $0 GENERATE_PY_OUTPUT" 5 | exit 1 6 | fi 7 | 8 | GEN=$1 9 | 10 | SYS=$GEN.sys 11 | REF=$GEN.ref 12 | 13 | if [ $(tail -n 1 $GEN | grep BLEU | wc -l) -ne 1 ]; then 14 | echo "not done generating" 15 | exit 16 | fi 17 | 18 | grep ^H $GEN | awk -F '\t' '{print $NF}' | perl -ple 's{(\S)-(\S)}{$1 ##AT##-##AT## $2}g' > $SYS 19 | grep ^T $GEN | cut -f2- | perl -ple 's{(\S)-(\S)}{$1 ##AT##-##AT## $2}g' > $REF 20 | fairseq-score --sys $SYS --ref $REF 21 | -------------------------------------------------------------------------------- /Speech2S/speech2s/scripts/constraints/validate.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # 3 | # Copyright (c) Facebook, Inc. and its affiliates. 4 | # 5 | # This source code is licensed under the MIT license found in the 6 | # LICENSE file in the root directory of this source tree. 
7 | 8 | import sys 9 | 10 | 11 | """Reads in a fairseq output file, and verifies that the constraints 12 | (C- lines) are present in the output (the first H- line). Assumes that 13 | constraints are listed prior to the first hypothesis. 14 | """ 15 | 16 | constraints = [] 17 | found = 0 18 | total = 0 19 | for line in sys.stdin: 20 | if line.startswith("C-"): 21 | constraints.append(line.rstrip().split("\t")[1]) 22 | elif line.startswith("H-"): 23 | text = line.split("\t")[2] 24 | 25 | for constraint in constraints: 26 | total += 1 27 | if constraint in text: 28 | found += 1 29 | else: 30 | print(f"No {constraint} in {text}", file=sys.stderr) 31 | 32 | constraints = [] 33 | 34 | print(f"Found {found} / {total} = {100 * found / total:.1f}%") 35 | -------------------------------------------------------------------------------- /Speech2S/speech2s/scripts/convert_dictionary.lua: -------------------------------------------------------------------------------- 1 | -- Copyright (c) Facebook, Inc. and its affiliates. 2 | -- 3 | -- This source code is licensed under the MIT license found in the 4 | -- LICENSE file in the root directory of this source tree. 5 | -- 6 | -- Usage: convert_dictionary.lua 7 | require 'fairseq' 8 | require 'torch' 9 | require 'paths' 10 | 11 | if #arg < 1 then 12 | print('usage: convert_dictionary.lua ') 13 | os.exit(1) 14 | end 15 | if not paths.filep(arg[1]) then 16 | print('error: file does not exit: ' .. arg[1]) 17 | os.exit(1) 18 | end 19 | 20 | dict = torch.load(arg[1]) 21 | dst = paths.basename(arg[1]):gsub('.th7', '.txt') 22 | assert(dst:match('.txt$')) 23 | 24 | f = io.open(dst, 'w') 25 | for idx, symbol in ipairs(dict.index_to_symbol) do 26 | if idx > dict.cutoff then 27 | break 28 | end 29 | f:write(symbol) 30 | f:write(' ') 31 | f:write(dict.index_to_freq[idx]) 32 | f:write('\n') 33 | end 34 | f:close() 35 | -------------------------------------------------------------------------------- /Speech2S/speech2s/scripts/count_docs.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. 3 | # 4 | # This source code is licensed under the MIT license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | """ 7 | Count the number of documents and average number of lines and tokens per 8 | document in a large file. Documents should be separated by a single empty line. 
9 | """ 10 | 11 | import argparse 12 | import gzip 13 | import sys 14 | 15 | import numpy as np 16 | 17 | 18 | def main(): 19 | parser = argparse.ArgumentParser() 20 | parser.add_argument("input") 21 | parser.add_argument("--gzip", action="store_true") 22 | args = parser.parse_args() 23 | 24 | def gopen(): 25 | if args.gzip: 26 | return gzip.open(args.input, "r") 27 | else: 28 | return open(args.input, "r", encoding="utf-8") 29 | 30 | num_lines = [] 31 | num_toks = [] 32 | with gopen() as h: 33 | num_docs = 1 34 | num_lines_in_doc = 0 35 | num_toks_in_doc = 0 36 | for i, line in enumerate(h): 37 | if len(line.strip()) == 0: # empty line indicates new document 38 | num_docs += 1 39 | num_lines.append(num_lines_in_doc) 40 | num_toks.append(num_toks_in_doc) 41 | num_lines_in_doc = 0 42 | num_toks_in_doc = 0 43 | else: 44 | num_lines_in_doc += 1 45 | num_toks_in_doc += len(line.rstrip().split()) 46 | if i % 1000000 == 0: 47 | print(i, file=sys.stderr, end="", flush=True) 48 | elif i % 100000 == 0: 49 | print(".", file=sys.stderr, end="", flush=True) 50 | print(file=sys.stderr, flush=True) 51 | 52 | print("found {} docs".format(num_docs)) 53 | print("average num lines per doc: {}".format(np.mean(num_lines))) 54 | print("average num toks per doc: {}".format(np.mean(num_toks))) 55 | 56 | 57 | if __name__ == "__main__": 58 | main() 59 | -------------------------------------------------------------------------------- /Speech2S/speech2s/scripts/read_binarized.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. 3 | # 4 | # This source code is licensed under the MIT license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | import argparse 8 | 9 | from fairseq.data import Dictionary, data_utils, indexed_dataset 10 | 11 | 12 | def get_parser(): 13 | parser = argparse.ArgumentParser( 14 | description="writes text from binarized file to stdout" 15 | ) 16 | # fmt: off 17 | parser.add_argument('--dataset-impl', help='dataset implementation', 18 | choices=indexed_dataset.get_available_dataset_impl()) 19 | parser.add_argument('--dict', metavar='FP', help='dictionary containing known words', default=None) 20 | parser.add_argument('--input', metavar='FP', required=True, help='binarized file to read') 21 | # fmt: on 22 | 23 | return parser 24 | 25 | 26 | def main(): 27 | parser = get_parser() 28 | args = parser.parse_args() 29 | 30 | dictionary = Dictionary.load(args.dict) if args.dict is not None else None 31 | dataset = data_utils.load_indexed_dataset( 32 | args.input, 33 | dictionary, 34 | dataset_impl=args.dataset_impl, 35 | default="lazy", 36 | ) 37 | 38 | for tensor_line in dataset: 39 | if dictionary is None: 40 | line = " ".join([str(int(x)) for x in tensor_line]) 41 | else: 42 | line = dictionary.string(tensor_line) 43 | 44 | print(line) 45 | 46 | 47 | if __name__ == "__main__": 48 | main() 49 | -------------------------------------------------------------------------------- /Speech2S/speech2s/scripts/sacrebleu.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ $# -ne 4 ]; then 4 | echo "usage: $0 TESTSET SRCLANG TGTLANG GEN" 5 | exit 1 6 | fi 7 | 8 | TESTSET=$1 9 | SRCLANG=$2 10 | TGTLANG=$3 11 | 12 | GEN=$4 13 | 14 | if ! 
command -v sacremoses &> /dev/null 15 | then 16 | echo "sacremoses could not be found, please install with: pip install sacremoses" 17 | exit 18 | fi 19 | 20 | grep ^H $GEN \ 21 | | sed 's/^H\-//' \ 22 | | sort -n -k 1 \ 23 | | cut -f 3 \ 24 | | sacremoses detokenize \ 25 | > $GEN.sorted.detok 26 | 27 | sacrebleu --test-set $TESTSET --language-pair "${SRCLANG}-${TGTLANG}" < $GEN.sorted.detok 28 | -------------------------------------------------------------------------------- /Speech2S/speech2s/scripts/shard_docs.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. 3 | # 4 | # This source code is licensed under the MIT license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | """ 7 | Split a large file into shards while respecting document boundaries. Documents 8 | should be separated by a single empty line. 9 | """ 10 | 11 | import argparse 12 | import contextlib 13 | 14 | 15 | def main(): 16 | parser = argparse.ArgumentParser() 17 | parser.add_argument("input") 18 | parser.add_argument("--num-shards", type=int) 19 | args = parser.parse_args() 20 | 21 | assert args.num_shards is not None and args.num_shards > 1 22 | 23 | with open(args.input, "r", encoding="utf-8") as h: 24 | with contextlib.ExitStack() as stack: 25 | outputs = [ 26 | stack.enter_context( 27 | open(args.input + ".shard" + str(i), "w", encoding="utf-8") 28 | ) 29 | for i in range(args.num_shards) 30 | ] 31 | 32 | doc = [] 33 | first_doc = [True] * args.num_shards 34 | 35 | def output_doc(i): 36 | if not first_doc[i]: 37 | outputs[i].write("\n") 38 | first_doc[i] = False 39 | for line in doc: 40 | outputs[i].write(line) 41 | doc.clear() 42 | 43 | num_docs = 0 44 | for line in h: 45 | if line.strip() == "": # empty line indicates new document 46 | output_doc(num_docs % args.num_shards) 47 | num_docs += 1 48 | else: 49 | doc.append(line) 50 | output_doc(num_docs % args.num_shards) 51 | 52 | 53 | if __name__ == "__main__": 54 | main() 55 | -------------------------------------------------------------------------------- /Speech2S/speech2s/scripts/spm_decode.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright (c) Facebook, Inc. and its affiliates. 3 | # All rights reserved. 4 | # 5 | # This source code is licensed under the license found in the 6 | # LICENSE file in the root directory of this source tree. 
7 | 8 | from __future__ import absolute_import, division, print_function, unicode_literals 9 | 10 | import argparse 11 | 12 | import sentencepiece as spm 13 | 14 | 15 | def main(): 16 | parser = argparse.ArgumentParser() 17 | parser.add_argument( 18 | "--model", required=True, help="sentencepiece model to use for decoding" 19 | ) 20 | parser.add_argument("--input", required=True, help="input file to decode") 21 | parser.add_argument("--input_format", choices=["piece", "id"], default="piece") 22 | args = parser.parse_args() 23 | 24 | sp = spm.SentencePieceProcessor() 25 | sp.Load(args.model) 26 | 27 | if args.input_format == "piece": 28 | 29 | def decode(input): 30 | return "".join(sp.DecodePieces(input)) 31 | 32 | elif args.input_format == "id": 33 | 34 | def decode(input): 35 | return "".join(sp.DecodeIds(input)) 36 | 37 | else: 38 | raise NotImplementedError 39 | 40 | def tok2int(tok): 41 | # remap reference-side (represented as <>) to 0 42 | return int(tok) if tok != "<>" else 0 43 | 44 | with open(args.input, "r", encoding="utf-8") as h: 45 | for line in h: 46 | if args.input_format == "id": 47 | print(decode(list(map(tok2int, line.rstrip().split())))) 48 | elif args.input_format == "piece": 49 | print(decode(line.rstrip().split())) 50 | 51 | 52 | if __name__ == "__main__": 53 | main() 54 | -------------------------------------------------------------------------------- /Speech2S/speech2s/scripts/spm_train.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright (c) Facebook, Inc. and its affiliates. 3 | # All rights reserved. 4 | # 5 | # This source code is licensed under the license found in the 6 | # LICENSE file in the root directory of this source tree. 7 | 8 | from __future__ import absolute_import, division, print_function, unicode_literals 9 | 10 | import sys 11 | 12 | import sentencepiece as spm 13 | 14 | 15 | if __name__ == "__main__": 16 | spm.SentencePieceTrainer.Train(" ".join(sys.argv[1:])) 17 | -------------------------------------------------------------------------------- /Speech2S/speech2s/scripts/test_fsdp.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | rm -rf fsdp_dummy 3 | mkdir -p fsdp_dummy 4 | CUDA_VISIBLE_DEVICES=0,1,2,3 fairseq-train /private/home/sshleifer/data-bin/stories_mmap \ 5 | --ddp-backend fully_sharded --fp16 --fp16-init-scale 4 \ 6 | --cpu-offload --checkpoint-activations \ 7 | --task language_modeling --tokens-per-sample 256 --batch-size 8 \ 8 | --arch transformer_lm_gpt2_tiny \ 9 | --optimizer cpu_adam --adam-betas "(0.9,0.98)" \ 10 | --lr 0.0001 --lr-scheduler polynomial_decay --warmup-updates 5 --total-num-update 10 \ 11 | --max-update 5 --log-format json --log-interval 1 \ 12 | --save-interval-updates 5 --save-dir fsdp_dummy --disable-validation \ 13 | --restore-file x.pt "$@" 14 | 15 | # Now we try to load the checkpoint 16 | CUDA_VISIBLE_DEVICES=0,1 fairseq-train /private/home/sshleifer/data-bin/stories_mmap \ 17 | --ddp-backend fully_sharded --fp16 --fp16-init-scale 4 \ 18 | --cpu-offload --checkpoint-activations \ 19 | --task language_modeling --tokens-per-sample 256 --batch-size 8 \ 20 | --arch transformer_lm_gpt2_tiny \ 21 | --optimizer cpu_adam --adam-betas "(0.9,0.98)" \ 22 | --lr 0.0001 --lr-scheduler polynomial_decay --warmup-updates 5 --total-num-update 10 \ 23 | --max-update 2 --log-format json --log-interval 1 \ 24 | --save-interval-updates 2 --save-dir fsdp_dummy 25 | 
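# A minimal usage sketch for the SentencePiece helpers spm_train.py and
# spm_decode.py listed just above. The corpus file corpus.txt, the model
# prefix spm_unigram_10000, and the piece-format hypothesis file hyp.spm are
# assumed names for illustration only, not files shipped with this repo.
#
#   # train a unigram model with a 10k-piece vocabulary
#   python spm_train.py --input=corpus.txt --model_prefix=spm_unigram_10000 \
#       --vocab_size=10000 --model_type=unigram
#
#   # map piece-tokenized output back to raw text
#   python spm_decode.py --model=spm_unigram_10000.model \
#       --input=hyp.spm --input_format=piece > hyp.txt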
-------------------------------------------------------------------------------- /Speech2S/speech2s/stpretrain_scripts/base_sc2c_enes.sh: -------------------------------------------------------------------------------- 1 | 2 | # #################################### 3 | # Hubert SCT2T ED model # 4 | # #################################### 5 | 6 | world_size=$1 7 | update_freq=$2 8 | exp_name=$3 9 | [ -z $world_size ] && world_size=8 10 | [ -z $update_freq ] && update_freq=1 11 | [ -z $exp_name ] && exp_name=sc2t_base_enes_${world_size}gpu_${update_freq}accum6666 12 | 13 | 14 | FAIRSEQ_ROOT=/mnt/output/users/v-kunwei/code/fairseq_mlstku 15 | CONFIG_DIR=/mnt/output/users/v-kunwei/code/stpretrain_scripts/config 16 | DATA_DIR="/mnt/output/users/v-kunwei/data/s2s_data/speech_enes" 17 | TEXT_DATA_DIR="/mnt/output/users/v-kunwei/data/s2s_data/text_enes/bin-idx" 18 | MODEL_DIR="/mnt/output/v-kunwei/data/s2s_data/exp/S2S_enes/$exp_name" 19 | 20 | [ -d $MODEL_DIR ] || mkdir -p $MODEL_DIR 21 | 22 | 23 | python $FAIRSEQ_ROOT/fairseq_cli/hydra_train.py \ 24 | --config-dir $CONFIG_DIR/pretrain \ 25 | --config-name sc2t_base_librispeech \ 26 | \ 27 | +task.store_labels=true \ 28 | task.labels='["km"]' \ 29 | model.label_rate=50 \ 30 | task.data=$DATA_DIR \ 31 | task.label_dir=$DATA_DIR \ 32 | task.text_cfg.text_data=$TEXT_DATA_DIR \ 33 | +task.text_cfg.data_config=config.yaml \ 34 | task.text_cfg.text_maxtokens_ratio=3.0 \ 35 | \ 36 | +criterion.dec_loss_type="ce" \ 37 | \ 38 | criterion.text_weight=1.0 \ 39 | \ 40 | model.use_rel_pos_enc=true \ 41 | +model.code_use_rel_pos_enc=true \ 42 | +model.pad_with_code=true \ 43 | model.text_transformer.no_scale_embedding=true \ 44 | model.text_transformer.layernorm_embedding=true \ 45 | +model.share_decoder_input_output_embed=true \ 46 | \ 47 | dataset.train_subset=\"train_all+en.kmu-spm\" \ 48 | dataset.valid_subset=\"valid+en_valid.kmu-spm\" \ 49 | dataset.num_workers=0 \ 50 | dataset.max_tokens=1000000 \ 51 | optimization.update_freq=[${update_freq}] \ 52 | optimization.max_update=400000 \ 53 | \ 54 | distributed_training.distributed_world_size=${world_size} \ 55 | \ 56 | common.tensorboard_logdir=$MODEL_DIR \ 57 | checkpoint.save_dir=$MODEL_DIR \ 58 | hydra.run.dir=$MODEL_DIR \ 59 | hydra.job.name=${exp_name} 60 | 61 | 62 | sleep 5m 63 | echo "All finished" 64 | 65 | -------------------------------------------------------------------------------- /Speech2S/speech2s/stpretrain_scripts/base_sc2c_esen.sh: -------------------------------------------------------------------------------- 1 | 2 | # #################################### 3 | # Hubert SCT2T ED model # 4 | # #################################### 5 | 6 | world_size=$1 7 | update_freq=$2 8 | exp_name=$3 9 | [ -z $world_size ] && world_size=24 10 | [ -z $update_freq ] && update_freq=3 11 | [ -z $exp_name ] && exp_name=sc2t_base_esen_${world_size}gpu_${update_freq}accum1 12 | 13 | 14 | FAIRSEQ_ROOT=/mnt/output/users/v-kunwei/code/fairseq_mlstku 15 | CONFIG_DIR=/mnt/output/users/v-kunwei/code/stpretrain_scripts/config 16 | DATA_DIR="/mnt/output/users/v-kunwei/data/s2s_data/speech_esen" 17 | TEXT_DATA_DIR="/mnt/output/users/v-kunwei/data/s2s_data/text_esen" 18 | MODEL_DIR="/mnt/output/v-kunwei/data/s2s_data/exp/S2S_esen/$exp_name" 19 | 20 | [ -d $MODEL_DIR ] || mkdir -p $MODEL_DIR 21 | 22 | 23 | python $FAIRSEQ_ROOT/fairseq_cli/hydra_train.py \ 24 | --config-dir $CONFIG_DIR/pretrain \ 25 | --config-name sc2t_base_librispeech \ 26 | \ 27 | +task.store_labels=true \ 28 | task.labels='["km"]' \ 29 | 
model.label_rate=50 \ 30 | task.data=$DATA_DIR \ 31 | task.label_dir=$DATA_DIR \ 32 | task.text_cfg.text_data=$TEXT_DATA_DIR \ 33 | +task.text_cfg.data_config=config.yaml \ 34 | task.text_cfg.text_maxtokens_ratio=3.0 \ 35 | \ 36 | +criterion.dec_loss_type="ce" \ 37 | \ 38 | criterion.text_weight=1.0 \ 39 | \ 40 | model.use_rel_pos_enc=true \ 41 | +model.code_use_rel_pos_enc=true \ 42 | +model.pad_with_code=true \ 43 | model.text_transformer.no_scale_embedding=true \ 44 | model.text_transformer.layernorm_embedding=true \ 45 | +model.share_decoder_input_output_embed=true \ 46 | \ 47 | dataset.train_subset=\"train+en.kmu-spm\" \ 48 | dataset.valid_subset=\"valid+en_valid.kmu-spm\" \ 49 | dataset.num_workers=0 \ 50 | dataset.max_tokens=1000000 \ 51 | optimization.update_freq=[${update_freq}] \ 52 | optimization.max_update=400000 \ 53 | \ 54 | distributed_training.distributed_world_size=${world_size} \ 55 | \ 56 | common.tensorboard_logdir=$MODEL_DIR \ 57 | checkpoint.save_dir=$MODEL_DIR \ 58 | hydra.run.dir=$MODEL_DIR \ 59 | hydra.job.name=${exp_name} 60 | 61 | 62 | sleep 5m 63 | echo "All finished" 64 | 65 | -------------------------------------------------------------------------------- /Speech2S/speech2s/stpretrain_scripts/config.yaml: -------------------------------------------------------------------------------- 1 | audio_root: ./ 2 | standardize_audio: true 3 | use_audio_input: true 4 | vocab_filename: dict.txt 5 | -------------------------------------------------------------------------------- /Speech2S/speech2s/stpretrain_scripts/config/finetune_asr/large_960h.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | 3 | common: 4 | fp16: true 5 | log_format: json 6 | log_interval: 200 7 | tensorboard_logdir: tblog 8 | 9 | checkpoint: 10 | save_interval: 1 11 | keep_last_epochs: 10 12 | keep_best_checkpoints: 5 13 | best_checkpoint_metric: wer 14 | restore_file: checkpoint_last.pt 15 | 16 | distributed_training: 17 | ddp_backend: c10d 18 | find_unused_parameters: true 19 | distributed_world_size: 24 20 | distributed_port: -1 21 | nprocs_per_node: 8 22 | 23 | task: 24 | _name: hubert_pretraining 25 | data: ??? 26 | fine_tuning: true 27 | label_dir: ??? 28 | normalize: true # must be consistent with pre-training 29 | labels: ["ltr"] 30 | single_target: true 31 | add_decoder: false 32 | pad_audio: false 33 | random_crop: true 34 | tokenizer: "none" 35 | sp_path: None 36 | 37 | dataset: 38 | num_workers: 0 39 | max_tokens: 1280000 40 | skip_invalid_size_inputs_valid_test: true 41 | valid_subset: dev_other 42 | required_batch_size_multiple: 1 43 | 44 | criterion: 45 | _name: ctc 46 | zero_infinity: true 47 | 48 | optimization: 49 | max_update: 200000 50 | lr: [0.00003] 51 | sentence_avg: true 52 | update_freq: [1] 53 | 54 | optimizer: 55 | _name: adam 56 | adam_betas: (0.9,0.98) 57 | adam_eps: 1e-08 58 | weight_decay: 0.0 59 | 60 | lr_scheduler: 61 | _name: tri_stage 62 | phase_ratio: [0.1, 0.4, 0.5] 63 | final_lr_scale: 0.05 64 | 65 | model: 66 | _name: hubert_ctc 67 | w2v_path: ??? 
68 | apply_mask: true 69 | mask_prob: 0.5 70 | mask_channel_prob: 0.25 71 | mask_channel_length: 64 72 | layerdrop: 0.0 73 | decoder_layerdrop: 0.1 74 | activation_dropout: 0.1 75 | feature_grad_mult: 0.0 76 | freeze_finetune_updates: 0 77 | add_decoder: false 78 | 79 | hydra: 80 | job: 81 | config: 82 | override_dirname: 83 | kv_sep: '-' 84 | item_sep: '__' 85 | exclude_keys: 86 | - run 87 | - task.data 88 | - task.label_dir 89 | - model.w2v_path 90 | - dataset.train_subset 91 | - dataset.valid_subset 92 | - criterion.wer_kenlm_model 93 | - criterion.wer_lexicon 94 | run: 95 | dir: ??? 96 | sweep: 97 | dir: ??? 98 | subdir: ${hydra.job.config_name}__${hydra.job.override_dirname} 99 | -------------------------------------------------------------------------------- /Speech2S/speech2s/stpretrain_scripts/config/translation/text2code.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | 3 | common: 4 | fp16: true 5 | log_format: json 6 | log_interval: 200 7 | tensorboard_logdir: tblog 8 | seed: 1337 9 | 10 | checkpoint: 11 | save_interval: 1000000 12 | keep_last_epochs: 5 13 | save_interval_updates: 1000 14 | keep_interval_updates_pattern: 10000 15 | keep_interval_updates: 5 16 | best_checkpoint_metric: accuracy 17 | maximize_best_checkpoint_metric: true 18 | 19 | distributed_training: 20 | ddp_backend: c10d 21 | find_unused_parameters: true 22 | distributed_world_size: 1 23 | nprocs_per_node: 8 24 | 25 | 26 | criterion: 27 | _name: "label_smoothed_cross_entropy" 28 | 29 | 30 | task: 31 | _name: "translation_from_jst" 32 | 33 | dataset: 34 | num_workers: 0 35 | max_tokens: 4096 36 | skip_invalid_size_inputs_valid_test: true 37 | validate_after_updates: ${model.freeze_finetune_updates} 38 | validate_interval: ${checkpoint.save_interval} 39 | validate_interval_updates: ${checkpoint.save_interval_updates} 40 | train_subset: train_clean_100 41 | valid_subset: dev_clean 42 | required_batch_size_multiple: 1 43 | 44 | optimizer: 45 | _name: adam 46 | adam_betas: (0.9,0.98) 47 | adam_eps: 1e-06 48 | weight_decay: 0.0 49 | 50 | lr_scheduler: 51 | _name: tri_stage 52 | phase_ratio: [0.1, 0.4, 0.5] 53 | final_lr_scale: 0.05 54 | 55 | model: 56 | _name: hubert_t2c 57 | w2v_path: ??? 58 | layerdrop: 0.1 59 | decoder_layerdrop: 0.1 60 | activation_dropout: 0.1 61 | feature_grad_mult: 0.0 62 | freeze_finetune_updates: 0 63 | 64 | hydra: 65 | job: 66 | config: 67 | override_dirname: 68 | kv_sep: '-' 69 | item_sep: '__' 70 | exclude_keys: 71 | - run 72 | - task.data 73 | - task.label_dir 74 | - model.w2v_path 75 | - dataset.train_subset 76 | - dataset.valid_subset 77 | run: 78 | dir: ??? 79 | sweep: 80 | dir: ??? 81 | subdir: ${hydra.job.config_name}__${hydra.job.override_dirname} 82 | -------------------------------------------------------------------------------- /Speech2S/speech2s/stpretrain_scripts/data_process/extract_hubert_feature_itp.sh: -------------------------------------------------------------------------------- 1 | 2 | if [ ! 
-d ${HOME}/azcopy_linux_amd64_10.11.0 ]; then 3 | CURRENT_DIR=`pwd` 4 | cd ${HOME} && wget https://azcopyvnext.azureedge.net/release20210616/azcopy_linux_amd64_10.11.0.tar.gz && tar -zxvf azcopy_linux_amd64_10.11.0.tar.gz && rm -f azcopy_linux_amd64_10.11.0.tar.gz && cd ${CURRENT_DIR} 5 | fi 6 | export PATH=$PATH:${HOME}/azcopy_linux_amd64_10.11.0/:${HOME}/.local/bin 7 | export PYTHONPATH=$PYTHONPATH:/mnt/output/users/v-kunwei/code/fairseq 8 | 9 | rank=$1 10 | nshard=$2 11 | split=$3 12 | [ -z $rank ] && echo "please specify rank" 13 | [ -z $nshard ] && nshard=1 14 | [ -z $split ] && split="train" 15 | 16 | 17 | FAIRSEQ_ROOT=/mnt/output/users/v-kunwei/code/fairseq 18 | ckpt_path=/mnt/output/users/v-kunwei/code/fairseq/examples/speech_to_speech/mhubert_base_vp_en_es_fr_it3.pt 19 | tsv_dir=/home/v-kunwei 20 | 21 | feat_dir=${HOME}/$split 22 | python $FAIRSEQ_ROOT/examples/hubert/simple_kmeans/dump_hubert_feature.py ${tsv_dir} ${split} ${ckpt_path} 9 ${nshard} ${rank} ${feat_dir} || exit 1 23 | 24 | 25 | echo "-------------------------------------------------------------------------------------------" 26 | echo "---------------------------------- done ---------------------------------------------" 27 | echo "-------------------------------------------------------------------------------------------" 28 | 29 | km_path=/mnt/output/users/v-kunwei/code/fairseq/examples/speech_to_speech/mhubert_base_vp_en_es_fr_it3_L11_km1000.bin 30 | lab_dir=${HOME}/${split} 31 | python $FAIRSEQ_ROOT/examples/hubert/simple_kmeans/dump_km_label.py ${feat_dir} ${split} ${km_path} ${nshard} ${rank} ${lab_dir} 32 | 33 | 34 | # sas="?sv=2020-08-04&st=2022-01-02T04%3A58%3A15Z&se=2022-06-01T04%3A58%3A00Z&sr=c&sp=racwdl&sig=NyZKOHivgesEoZ8yvLsVT6aZMYQZMevLLmXNOTaWyvU%3D" 35 | # blob="https://msranlcmtteamdrive.blob.core.windows.net/teamdrive/v-ziqzhang/data/stbert/data/librispeech/libri_960/hubert_release_iter2_layer9_kmeans/${split}" 36 | # azcopy copy $feat_dir/${split}_${rank}_${nshard}.len "$blob/$sas" 37 | # azcopy copy $feat_dir/${split}_${rank}_${nshard}.npy "$blob/$sas" 38 | # azcopy copy $lab_dir "$blob/$sas" --recursive 39 | 40 | 41 | 42 | -------------------------------------------------------------------------------- /Speech2S/speech2s/stpretrain_scripts/data_process/merge_code.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import torch 3 | 4 | 5 | def main(): 6 | for line in sys.stdin: 7 | line = line.rstrip() 8 | codes = list(map(int, line.split())) 9 | merged_codes = torch.unique_consecutive(torch.tensor(codes)).numpy() 10 | merged_codes = map(str, merged_codes) 11 | print(" ".join(merged_codes)) 12 | 13 | if __name__ == "__main__": 14 | main() 15 | -------------------------------------------------------------------------------- /Speech2S/speech2s/stpretrain_scripts/data_process/txt2idx.sh: -------------------------------------------------------------------------------- 1 | [ $# -lt 3 ] && echo "Usage: $0 " && exit 0 2 | 3 | if [ ! -d ${HOME}/sentencepiece ]; then 4 | CURRENT_DIR=`pwd` 5 | cd ${HOME} 6 | git clone https://github.com/google/sentencepiece.git 7 | cd sentencepiece 8 | mkdir build && cd build 9 | cmake .. 
&& make -j 16 10 | sudo make install 11 | sudo ldconfig -v 12 | cd ${HOME} 13 | cd ${CURRENT_DIR} 14 | fi 15 | 16 | input=$1 17 | outdir=$2 18 | DICT=$3 19 | suffix=$4 20 | outname=${input##*/} 21 | outname=${outname%.txt*} 22 | [ -z $input ] && echo "You must specify a source file" && exit 1 23 | 24 | [ -z $DICT ] && echo "No dict was specified!" && exit 1 25 | [ -z $outdir ] && outdir=${input%/*} 26 | [ -z $outdir ] && outdir="." 27 | [ ! -d $outdir ] && mkdir -p $outdir 28 | 29 | echo "Dict : $DICT" 30 | echo "------------------------------- creating idx/bin--------------------------------------------" 31 | echo "$input --> $outdir/${outname}${suffix}.idx" 32 | fairseq-preprocess \ 33 | --only-source \ 34 | --trainpref $input \ 35 | --destdir $outdir \ 36 | --thresholdsrc 0 \ 37 | --srcdict ${DICT} \ 38 | --workers 40 39 | 40 | mv $outdir/train.idx $outdir/${outname}${suffix}.idx 41 | mv $outdir/train.bin $outdir/${outname}${suffix}.bin 42 | echo "----------------------------------- done --------------------------------------------" 43 | 44 | -------------------------------------------------------------------------------- /Speech2S/speech2s/stpretrain_scripts/data_process/txt2spm.sh: -------------------------------------------------------------------------------- 1 | [ $# -lt 2 ] && echo "Usage: $0 " && exit 0 2 | 3 | if [ ! -d ${HOME}/sentencepiece ]; then 4 | CURRENT_DIR=`pwd` 5 | cd ${HOME} 6 | git clone https://github.com/google/sentencepiece.git 7 | cd sentencepiece 8 | mkdir build && cd build 9 | cmake .. && make -j 16 10 | sudo make install 11 | sudo ldconfig -v 12 | cd ${HOME} 13 | cd ${CURRENT_DIR} 14 | fi 15 | 16 | input=$1 17 | outdir=$2 18 | MODEL=$3 19 | suffix=$4 20 | outname=${input##*/} 21 | outname=${outname%.wrd*} 22 | [ -z $input ] && echo "You must specify a source file" && exit 1 23 | 24 | [ -z $MODEL ] && MODEL=/mnt/default/v-ziqzhang/data/stbert/data/librispeech/hubert_release_iter2_layer9_kmeans/spm_unigram_10000.model && echo "No spm model was specified!, set default to $MODEL" 25 | [ -z $outdir ] && outdir=${input%/*} 26 | [ -z $outdir ] && outdir="." 27 | [ ! 
-d $outdir ] && mkdir -p $outdir 28 | 29 | echo "Output: $outdir/$outname.spm" 30 | 31 | echo "------------------------------- tokenize text...--------------------------------------------" 32 | spm_encode --model=$MODEL < ${input} > $outdir/$outname.spm || exit 1 33 | echo "----------------------------------- done --------------------------------------------" 34 | -------------------------------------------------------------------------------- /Speech2S/speech2s/stpretrain_scripts/data_process/wmt/normalize_en_text.py: -------------------------------------------------------------------------------- 1 | import re 2 | import sys 3 | import regex 4 | import argparse 5 | from tqdm import tqdm 6 | from num2words import num2words 7 | 8 | def writefile(filename, lines): 9 | with open(filename, 'w', encoding='utf-8') as f: 10 | f.writelines(lines) 11 | 12 | def main(): 13 | parser = argparse.ArgumentParser() 14 | parser.add_argument("--input", "-i", required=True, type=str) 15 | parser.add_argument("--output", "-o", required=True, type=str) 16 | args = parser.parse_args() 17 | outlines = [] 18 | 19 | with open(f"{args.input}", 'r') as f: 20 | inputs = f.readlines() 21 | 22 | for line in tqdm(inputs): 23 | line = line.strip().upper() 24 | line = re.sub(u"([^\u0041-\u005a\u0061-\u007a\u0030-\u0039\'])", " ", line) 25 | items = [] 26 | for item in line.split(): 27 | if item.isdigit(): 28 | try: 29 | item = num2words(item) 30 | except Exception as e: 31 | print(line) 32 | raise(e) 33 | items.append(item) 34 | line = " ".join(items) 35 | line = line.replace("-", " ") 36 | line = line.upper() 37 | line = line.replace("' S", "'S") 38 | line = line.replace(" ", "|") 39 | line = " ".join(line) + " |" 40 | outlines.append(line + '\n') 41 | # print(line) 42 | 43 | writefile(args.output, outlines) 44 | 45 | if __name__ == "__main__": 46 | main() 47 | -------------------------------------------------------------------------------- /Speech2S/speech2s/stpretrain_scripts/data_process/wmt/normalize_es_text.py: -------------------------------------------------------------------------------- 1 | import re 2 | import sys 3 | import regex 4 | import argparse 5 | import re,string 6 | from tqdm import tqdm 7 | from num2words import num2words 8 | 9 | def writefile(filename, lines): 10 | with open(filename, 'w', encoding='utf-8') as f: 11 | f.writelines(lines) 12 | 13 | def main(): 14 | parser = argparse.ArgumentParser() 15 | parser.add_argument("--input", "-i", required=True, type=str) 16 | parser.add_argument("--output", "-o", required=True, type=str) 17 | args = parser.parse_args() 18 | outlines = [] 19 | 20 | with open(f"{args.input}", 'r') as f: 21 | inputs = f.readlines() 22 | 23 | for line in tqdm(inputs): 24 | line = line.strip() 25 | line = re.sub(u"([^\u0041-\u005a\u0061-\u007a\u0030-\u0039\u00d1\u00f1\'])", " ", line) 26 | items = [] 27 | punc='~`!#$%^&*()_+-=|\';":/.,?><~.' 
28 | for item in line.split(): 29 | if item.isdigit(): 30 | try: 31 | item = num2words(item, lang='es') 32 | except Exception as e: 33 | print(line) 34 | raise(e) 35 | items.append(item) 36 | line = " ".join(items) 37 | line = (re.sub(r"[%s]+" %punc, "",line)) 38 | line = line.replace("-", " ") 39 | line = line.lower() 40 | line = line.replace("' S", "'S") 41 | line = line.replace(" ", "|") 42 | line = " ".join(line) + " |" 43 | outlines.append(line + '\n') 44 | # print(line) 45 | 46 | writefile(args.output, outlines) 47 | 48 | if __name__ == "__main__": 49 | main() 50 | -------------------------------------------------------------------------------- /Speech2S/speech2s/stpretrain_scripts/decode_text2code_beam2.sh: -------------------------------------------------------------------------------- 1 | 2 | ##################################### 3 | # Hubert ED model # 4 | ##################################### 5 | [ $# -lt 1 ] && echo "Usage: $0 " && exit 0 6 | #source /mnt/default/v-ziqzhang/.bashrc_sing 7 | 8 | model_path=$1 9 | gen_set=$2 10 | tgt=$3 11 | src="ltr" 12 | max_tokens=$4 13 | word_size=$5 14 | rank=$6 15 | outdir=$7 16 | 17 | [ -z $tgt ] && tgt="kmu" 18 | [ -z $gen_set ] && gen_set="dev_clean" 19 | [ -z $word_size ] && word_size=1 20 | [ -z $rank ] && rank=0 21 | [ -z $max_tokens ] && max_tokens=16000 22 | 23 | FAIRSEQ_ROOT=/mnt/output/users/v-kunwei/code/fairseq_mlstku 24 | DATA_DIR=/home/v-kunwei/ 25 | [ $gen_set == "test" ] && DATA_DIR=/mnt/output/users/v-kunwei/code/fairseq_mlstku 26 | [ -z $outdir ] && outdir=$DATA_DIR 27 | 28 | 29 | results_path=$outdir/pseudo_${gen_set}_${rank} 30 | [ ! -d $results_path ] && mkdir -p $results_path 31 | 32 | for subset in $gen_set; do 33 | python $FAIRSEQ_ROOT/fairseq_cli/generate_mt_label.py $DATA_DIR \ 34 | --path ${model_path} \ 35 | --task "translation_from_jst" \ 36 | --max-target-positions 18000 \ 37 | --gen-subset $subset \ 38 | -t $tgt -s "ltr" \ 39 | --dataset-impl "raw" \ 40 | --max-tokens ${max_tokens} \ 41 | --beam 2 \ 42 | --max-len-a 3 --max-len-b 100 \ 43 | --results-path $results_path \ 44 | --distributed-world-size $word_size --distributed-rank $rank \ 45 | 46 | echo "$model" > $results_path/model.record 47 | sleep 1s 48 | done | tee $results_path/decode.log 49 | 50 | sleep 2s 51 | -------------------------------------------------------------------------------- /Speech2S/speech2s/stpretrain_scripts/eval2.sh: -------------------------------------------------------------------------------- 1 | lmweight=0 2 | num_gpus=8 3 | python examples/speech_recognition/new/infer.py --config-dir /mnt/output/users/v-kunwei/code/fairseq/examples/speech_recognition/new/conf \ 4 | --config-name infer task=audio_finetuning task.data=/home/v-kunwei common.user_dir=/mnt/output/users/v-kunwei/code/fairseq/examples/data2vec \ 5 | task.labels=ltr decoding.type=viterbi \ 6 | decoding.lexicon=models/es_eval/espeak_dict.txt \ 7 | decoding.unique_wer_file=True \ 8 | dataset.gen_subset=test \ 9 | common_eval.path=/mnt/output/users/v-kunwei/code/fairseq/models/es_eval/espeak_26lang_m10.pt decoding.beam=1500 distributed_training.distributed_world_size=${num_gpus} \ 10 | decoding.results_path=/home/v-kunwei 11 | 12 | #sclite -h "/home/v-kunwei/hypo.units" -r "/home/v-kunwei/ref.units" -i rm -o all stdout > "./result.txt" 13 | -------------------------------------------------------------------------------- /Speech2S/speech2s/stpretrain_scripts/eval3.sh: -------------------------------------------------------------------------------- 1 | #$subset=test 2 | 
python examples/speech_recognition/infer.py /home/v-kunwei --task audio_finetuning \ 3 | --nbest 1 --path /mnt/output/users/v-kunwei/code/fairseq/models/es_eval/espeak_26lang_m10.pt --gen-subset test --results-path /home/v-kunwei --criterion ctc --labels ltr --max-tokens 4000000 \ 4 | --post-process letter 5 | -------------------------------------------------------------------------------- /Speech2S/speech2s/stpretrain_scripts/inference_ed.sh: -------------------------------------------------------------------------------- 1 | ##################################### 2 | # Hubert base model # 3 | ##################################### 4 | [ $# -lt 1 ] && echo "Usage: $0 " && exit 0 5 | 6 | model_path=$1 7 | src_dir=${model_path%/*} 8 | cpt=${model_path##*/} 9 | cpt=${cpt%.*} 10 | 11 | #beam_size=$2 12 | gen_set=$2 13 | #lang=$4 14 | [ -z $gen_set ] && gen_set="test_et" 15 | [ -z $beam_size ] && beam_size=2 16 | [ -z $lang ] && lang="fr" 17 | 18 | 19 | #DATA_DIR=/mnt/output/users/v-kunwei/data/s2s_data/fin_enes 20 | DATA_DIR=/home/v-kunwei 21 | FAIRSEQ_ROOT=/mnt/output/users/v-kunwei/code/fairseq_mlstku 22 | 23 | for subset in $gen_set; do 24 | results_path=$src_dir/decode_${cpt}_beam${beam_size}/${subset} 25 | [ ! -d $results_path ] && mkdir -p $results_path 26 | 27 | python $FAIRSEQ_ROOT/fairseq_cli/generate.py \ 28 | $DATA_DIR --label-dir ${DATA_DIR} \ 29 | --labels '["spm"]' --gen-subset ${subset} \ 30 | --max-tokens 9000000 --task hubert_pretraining \ 31 | --add-decoder --fine-tuning --random-crop \ 32 | --path ${model_path} --results-path /home/v-kunwei --scoring sacrebleu \ 33 | --max-len-a 0 --max-len-b 900 \ 34 | --beam 10 --single-target 35 | 36 | tail -n 1 /home/v-kunwei/generate-*.txt 37 | sleep 1s 38 | done 39 | -------------------------------------------------------------------------------- /Speech2S/speech2s/stpretrain_scripts/train_text2code/base_ReleaseIter2_text2unicode_from400k.sh: -------------------------------------------------------------------------------- 1 | ##################################### 2 | # Hubert mt model # 3 | ##################################### 4 | [ $# -gt 3 ] && echo "Usage: $0 " && exit 0 5 | world_size=$1 6 | update_freq=$2 7 | w2v_path=$3 8 | Mount="" 9 | 10 | [ -z $world_size ] && world_size=8 11 | [ -z $update_freq ] && update_freq=1 12 | [ -z $w2v_path ] && w2v_path="/mnt/output/users/v-kunwei/data/s2s_data/model_wo_emb_32_1004.pt" 13 | 14 | 15 | langs="ltr,kmu" 16 | FAIRSEQ_ROOT=/mnt/output/users/v-kunwei/code/fairseq_mlstku 17 | CONFIG_ROOT=/mnt/output/users/v-kunwei/code/stpretrain_scripts/config/translation 18 | DATA_DIR=/mnt/output/users/v-kunwei/data/s2s_data/en_asr_data/ 19 | 20 | ### set save-dir 21 | MODEL_DIR="/mnt/output/users/v-kunwei/data/s2s_data/exp/text2unicode_en" 22 | exp_name="base_pt400k_releaseiter2_${world_size}gpu_${update_freq}accum_lr1e-4_alll" 23 | MODEL_DIR=$MODEL_DIR/$exp_name 24 | [ -d $MODEL_DIR ] || mkdir -p $MODEL_DIR 25 | 26 | 27 | python $FAIRSEQ_ROOT/fairseq_cli/hydra_train.py \ 28 | --config-dir $CONFIG_ROOT \ 29 | --config-name text2code \ 30 | +task.data=$DATA_DIR \ 31 | dataset.dataset_impl="raw" \ 32 | +task.source_lang="ltr" +task.target_lang="kmu" \ 33 | +task.normalize=false \ 34 | \ 35 | +criterion.label_smoothing=0.1 \ 36 | +criterion.report_accuracy=true \ 37 | optimizer.weight_decay=0.00001 \ 38 | +lr_scheduler.lr="[0.0001]" \ 39 | optimization.max_update=500000 \ 40 | \ 41 | +model.dropout=0.1 \ 42 | +model.attention_dropout=0.1 \ 43 | model.activation_dropout=0.1 \ 44 | model.decoder_layerdrop=0 \ 
45 | model.layerdrop=0 \ 46 | model.w2v_path=$w2v_path \ 47 | +model.text_transformer_encoder_layers=6 \ 48 | \ 49 | dataset.train_subset="en_train" \ 50 | dataset.valid_subset="en_dev" \ 51 | optimization.update_freq=[${update_freq}] \ 52 | optimization.clip_norm=5 \ 53 | \ 54 | common.seed=222 \ 55 | common.log_interval=100 \ 56 | common.log_format="json" \ 57 | \ 58 | distributed_training.distributed_world_size=${world_size} \ 59 | distributed_training.nprocs_per_node=8 \ 60 | distributed_training.ddp_backend="legacy_ddp" \ 61 | \ 62 | common.tensorboard_logdir=$MODEL_DIR \ 63 | checkpoint.save_dir=$MODEL_DIR \ 64 | hydra.run.dir=$MODEL_DIR \ 65 | hydra.job.name=${exp_name} \ 66 | 67 | sleep 10s 68 | # sleep infinity 69 | 70 | 71 | -------------------------------------------------------------------------------- /Speech2S/speech2s/stpretrain_scripts/train_text2code/base_ReleaseIter2_text2unicode_from400k_es.sh: -------------------------------------------------------------------------------- 1 | ##################################### 2 | # Hubert mt model # 3 | ##################################### 4 | [ $# -gt 3 ] && echo "Usage: $0 " && exit 0 5 | world_size=$1 6 | update_freq=$2 7 | w2v_path=$3 8 | Mount="" 9 | 10 | [ -z $world_size ] && world_size=8 11 | [ -z $update_freq ] && update_freq=1 12 | [ -z $w2v_path ] && w2v_path="/mnt/output/users/v-kunwei/data/s2s_data/model_es_emb_90_1004.pt" 13 | 14 | 15 | langs="ltr,kmu" 16 | FAIRSEQ_ROOT=/mnt/output/users/v-kunwei/code/fairseq_mlstku 17 | CONFIG_ROOT=/mnt/output/users/v-kunwei/code/stpretrain_scripts/config/translation 18 | DATA_DIR=/mnt/output/users/v-kunwei/data/s2s_data/es_no_data/ 19 | 20 | ### set save-dir 21 | MODEL_DIR="/mnt/output/users/v-kunwei/data/s2s_data/exp/text2unicode_es" 22 | exp_name="base_pt400k_releaseiter2_${world_size}gpu_${update_freq}accum_lr1e-4_no" 23 | MODEL_DIR=$MODEL_DIR/$exp_name 24 | [ -d $MODEL_DIR ] || mkdir -p $MODEL_DIR 25 | 26 | 27 | python $FAIRSEQ_ROOT/fairseq_cli/hydra_train.py \ 28 | --config-dir $CONFIG_ROOT \ 29 | --config-name text2code \ 30 | +task.data=$DATA_DIR \ 31 | dataset.dataset_impl="raw" \ 32 | +task.source_lang="ltr" +task.target_lang="kmu" \ 33 | +task.normalize=false \ 34 | \ 35 | +criterion.label_smoothing=0.1 \ 36 | +criterion.report_accuracy=true \ 37 | optimizer.weight_decay=0.00001 \ 38 | +lr_scheduler.lr="[0.0001]" \ 39 | optimization.max_update=500000 \ 40 | \ 41 | +model.dropout=0.1 \ 42 | +model.attention_dropout=0.1 \ 43 | model.activation_dropout=0.1 \ 44 | model.decoder_layerdrop=0 \ 45 | model.layerdrop=0 \ 46 | model.w2v_path=$w2v_path \ 47 | +model.text_transformer_encoder_layers=6 \ 48 | \ 49 | dataset.train_subset="es_train" \ 50 | dataset.valid_subset="es_dev" \ 51 | optimization.update_freq=[${update_freq}] \ 52 | optimization.clip_norm=5 \ 53 | \ 54 | common.seed=222 \ 55 | common.log_interval=100 \ 56 | common.log_format="json" \ 57 | \ 58 | distributed_training.distributed_world_size=${world_size} \ 59 | distributed_training.nprocs_per_node=8 \ 60 | distributed_training.ddp_backend="legacy_ddp" \ 61 | \ 62 | common.tensorboard_logdir=$MODEL_DIR \ 63 | checkpoint.save_dir=$MODEL_DIR \ 64 | hydra.run.dir=$MODEL_DIR \ 65 | hydra.job.name=${exp_name} \ 66 | 67 | sleep 10s 68 | # sleep infinity 69 | 70 | 71 | -------------------------------------------------------------------------------- /Speech2S/speech2s/stpretrain_scripts/train_text2code/base_ReleaseIter2_text2unicode_from400k_es2.sh: 
-------------------------------------------------------------------------------- 1 | ##################################### 2 | # Hubert mt model # 3 | ##################################### 4 | [ $# -gt 3 ] && echo "Usage: $0 " && exit 0 5 | world_size=$1 6 | update_freq=$2 7 | w2v_path=$3 8 | Mount="" 9 | 10 | [ -z $world_size ] && world_size=8 11 | [ -z $update_freq ] && update_freq=1 12 | [ -z $w2v_path ] && w2v_path="/mnt/output/users/v-kunwei/data/s2s_data/model_es_emb_81_1004.pt" 13 | 14 | 15 | langs="ltr,kmu" 16 | FAIRSEQ_ROOT=/mnt/output/users/v-kunwei/code/fairseq_mlstku 17 | CONFIG_ROOT=/mnt/output/users/v-kunwei/code/stpretrain_scripts/config/translation 18 | DATA_DIR=/mnt/output/users/v-kunwei/data/s2s_data/es_asrl_data/ 19 | 20 | ### set save-dir 21 | MODEL_DIR="/mnt/output/users/v-kunwei/data/s2s_data/exp/text2unicode_es" 22 | exp_name="base_pt400k_releaseiter2_${world_size}gpu_${update_freq}accum_lr1e-4_ll" 23 | MODEL_DIR=$MODEL_DIR/$exp_name 24 | [ -d $MODEL_DIR ] || mkdir -p $MODEL_DIR 25 | 26 | 27 | python $FAIRSEQ_ROOT/fairseq_cli/hydra_train.py \ 28 | --config-dir $CONFIG_ROOT \ 29 | --config-name text2code \ 30 | +task.data=$DATA_DIR \ 31 | dataset.dataset_impl="raw" \ 32 | +task.source_lang="ltr" +task.target_lang="kmu" \ 33 | +task.normalize=false \ 34 | \ 35 | +criterion.label_smoothing=0.1 \ 36 | +criterion.report_accuracy=true \ 37 | optimizer.weight_decay=0.00001 \ 38 | +lr_scheduler.lr="[0.0001]" \ 39 | optimization.max_update=500000 \ 40 | \ 41 | +model.dropout=0.1 \ 42 | +model.attention_dropout=0.1 \ 43 | model.activation_dropout=0.1 \ 44 | model.decoder_layerdrop=0 \ 45 | model.layerdrop=0 \ 46 | model.w2v_path=$w2v_path \ 47 | +model.text_transformer_encoder_layers=6 \ 48 | \ 49 | dataset.train_subset="es_train" \ 50 | dataset.valid_subset="es_dev" \ 51 | optimization.update_freq=[${update_freq}] \ 52 | optimization.clip_norm=5 \ 53 | \ 54 | common.seed=222 \ 55 | common.log_interval=100 \ 56 | common.log_format="json" \ 57 | \ 58 | distributed_training.distributed_world_size=${world_size} \ 59 | distributed_training.nprocs_per_node=8 \ 60 | distributed_training.ddp_backend="legacy_ddp" \ 61 | \ 62 | common.tensorboard_logdir=$MODEL_DIR \ 63 | checkpoint.save_dir=$MODEL_DIR \ 64 | hydra.run.dir=$MODEL_DIR \ 65 | hydra.job.name=${exp_name} \ 66 | 67 | sleep 10s 68 | # sleep infinity 69 | 70 | 71 | -------------------------------------------------------------------------------- /Speech2S/speech2s/stpretrain_scripts/train_text2code/decode_text2code.sh: -------------------------------------------------------------------------------- 1 | 2 | ##################################### 3 | # Hubert ED model # 4 | ##################################### 5 | [ $# -lt 1 ] && echo "Usage: $0 " && exit 0 6 | #source /mnt/default/v-ziqzhang/.bashrc_sing 7 | 8 | model_path=$1 9 | gen_set=$2 10 | tgt=$3 11 | src="ltr" 12 | max_tokens=$4 13 | word_size=$5 14 | rank=$6 15 | outdir=$7 16 | 17 | [ -z $tgt ] && tgt="kmu" 18 | [ -z $gen_set ] && gen_set="dev_clean" 19 | [ -z $word_size ] && word_size=1 20 | [ -z $rank ] && rank=0 21 | [ -z $max_tokens ] && max_tokens=2000 22 | 23 | FAIRSEQ_ROOT=/mnt/output/users/v-kunwei/code/fairseq_mlst 24 | DATA_DIR=${gen_set%/*} 25 | gen_set=${gen_set##*/} 26 | [ $gen_set == "test" ] && DATA_DIR=/mnt/output/users/v-kunwei/data/s2s_data/en_asr_data 27 | [ -z $outdir ] && outdir=$DATA_DIR 28 | 29 | 30 | results_path=$outdir/pseudo_${gen_set}_${rank} 31 | [ ! 
-d $results_path ] && mkdir -p $results_path 32 | 33 | for subset in $gen_set; do 34 | python $FAIRSEQ_ROOT/fairseq_cli/generate_mt_label.py $DATA_DIR \ 35 | --path ${model_path} \ 36 | --task "translation_from_jst" \ 37 | --max-target-positions 3000 \ 38 | --gen-subset $subset \ 39 | -t $tgt -s "ltr" \ 40 | --max-tokens ${max_tokens} \ 41 | --dataset-impl "raw" \ 42 | --max-len-a 2 --max-len-b 100 \ 43 | --results-path $results_path \ 44 | --skip-invalid-size-inputs-valid-test \ 45 | --distributed-world-size $word_size --distributed-rank $rank \ 46 | 47 | echo "$model" > $results_path/model.record 48 | sleep 1s 49 | done | tee $results_path/decode.log 50 | 51 | sleep 2s 52 | -------------------------------------------------------------------------------- /Speech2S/speech2s/stpretrain_scripts/train_text2code/decode_text2code_beam2.sh: -------------------------------------------------------------------------------- 1 | 2 | ##################################### 3 | # Hubert ED model # 4 | ##################################### 5 | [ $# -lt 1 ] && echo "Usage: $0 " && exit 0 6 | #source /mnt/default/v-ziqzhang/.bashrc_sing 7 | 8 | model_path=$1 9 | gen_set=$2 10 | tgt=$3 11 | src="ltr" 12 | max_tokens=$4 13 | word_size=$5 14 | rank=$6 15 | outdir=$7 16 | 17 | [ -z $tgt ] && tgt="kmu" 18 | [ -z $gen_set ] && gen_set="dev_clean" 19 | [ -z $word_size ] && word_size=1 20 | [ -z $rank ] && rank=0 21 | [ -z $max_tokens ] && max_tokens=2000 22 | 23 | FAIRSEQ_ROOT=/mnt/output/users/v-kunwei/code/fairseq_mlstku 24 | DATA_DIR=${gen_set%/*} 25 | gen_set=${gen_set##*/} 26 | [ $gen_set == "test" ] && DATA_DIR=/mnt/output/users/v-kunwei/code/fairseq_mlstku 27 | [ -z $outdir ] && outdir=$DATA_DIR 28 | 29 | 30 | results_path=$outdir/pseudo_${gen_set}_${rank} 31 | [ ! -d $results_path ] && mkdir -p $results_path 32 | 33 | for subset in $gen_set; do 34 | python $FAIRSEQ_ROOT/fairseq_cli/generate_mt_label.py $DATA_DIR \ 35 | --path ${model_path} \ 36 | --task "translation_from_jst" \ 37 | --max-target-positions 3000 \ 38 | --gen-subset $subset \ 39 | -t $tgt -s "ltr" \ 40 | --dataset-impl "raw" \ 41 | --max-tokens ${max_tokens} \ 42 | --beam 2 \ 43 | --max-len-a 2 --max-len-b 100 \ 44 | --results-path $results_path \ 45 | --skip-invalid-size-inputs-valid-test \ 46 | --distributed-world-size $word_size --distributed-rank $rank \ 47 | 48 | echo "$model" > $results_path/model.record 49 | sleep 1s 50 | done | tee $results_path/decode.log 51 | 52 | sleep 2s 53 | -------------------------------------------------------------------------------- /Speech2S/speech2s/stpretrain_scripts/train_text2code/inference_code_bleu.sh: -------------------------------------------------------------------------------- 1 | 2 | ##################################### 3 | # Hubert ED model # 4 | ##################################### 5 | [ $# -lt 1 ] && echo "Usage: $0 " && exit 0 6 | 7 | model_path=$1 8 | src_dir=${model_path%/*} 9 | cpt=${model_path##*/} 10 | cpt=${cpt%.*} 11 | 12 | gen_set=$2 13 | tgt=$3 14 | outdir=$4 15 | src="ltr" 16 | [ -z $tgt ] && tgt="kmu" 17 | [ -z $gen_set ] && gen_set="es_dev" 18 | [ -z $outdir ] && outdir=$src_dir/decode_${cpt} 19 | 20 | DATA_DIR=/mnt/output/users/v-kunwei/data/s2s_data/es_asr_data/ 21 | # DATA_DIR=/mnt/default/v-ziqzhang/data/stbert/data/librispeech/speech2c_joint_splitenc_400k/ltr-$tgt 22 | # DATA_DIR=/mnt/default/v-ziqzhang/data/stbert/data/librispeech/speech2c_400k/ltr-$tgt 23 | FAIRSEQ_ROOT=/mnt/output/users/v-kunwei/code/fairseq_mlst 24 | 25 | langs="ltr,$tgt" 26 | 27 | for subset in 
$gen_set; do 28 | results_path=$outdir/${subset} 29 | [ ! -d $results_path ] && mkdir -p $results_path 30 | 31 | python $FAIRSEQ_ROOT/fairseq_cli/generate.py $DATA_DIR \ 32 | --path ${model_path} \ 33 | --task "translation_from_jst" \ 34 | --max-target-positions 3000 \ 35 | --gen-subset $subset \ 36 | -t $tgt -s "ltr" --dataset-impl "raw" \ 37 | --batch-size 16 \ 38 | --max-len-a 2 --max-len-b 400 \ 39 | --results-path $results_path \ 40 | --scoring sacrebleu $extra 41 | 42 | echo $results_path 43 | tail -n 1 $results_path/generate-*.txt 44 | sleep 1s 45 | done 46 | 47 | # --distributed-world-size 1000 --distributed-rank 0 \ 48 | 49 | sleep 2s 50 | 51 | # cat generate-newstest2020_enja.txt | grep "^D-" | cut -d'-' -f 2- | sort -n -k1 | cut -f3 > decode-newstest2020_enja.txt 52 | # sacrebleu -t wmt20 -l en-ja -i decode-newstest2020_enja.txt --tokenize char 53 | -------------------------------------------------------------------------------- /Speech2S/speech2s/stpretrain_scripts/train_text2code/inference_code_wer.sh: -------------------------------------------------------------------------------- 1 | 2 | ##################################### 3 | # Hubert ED model # 4 | ##################################### 5 | [ $# -lt 1 ] && echo "Usage: $0 " && exit 0 6 | 7 | model_path=$1 8 | src_dir=${model_path%/*} 9 | cpt=${model_path##*/} 10 | cpt=${cpt%.*} 11 | 12 | gen_set=$2 13 | tgt=$3 14 | outdir=$4 15 | src="ltr" 16 | [ -z $tgt ] && tgt="kmu" 17 | [ -z $gen_set ] && gen_set="en_dev" 18 | [ -z $outdir ] && outdir=$src_dir/decode_${cpt} 19 | 20 | # DATA_DIR=/mnt/default/v-ziqzhang/data/stbert/data/librispeech/hubert_release_iter2_layer9_kmeans/ltr-$tgt 21 | # DATA_DIR=/mnt/default/v-ziqzhang/data/stbert/data/librispeech/speech2c_joint_splitenc_400k/ltr-$tgt 22 | #DATA_DIR=/mnt/default/v-ziqzhang/data/stbert/data/librispeech/speech2c_400k/ltr-$tgt 23 | DATA_DIR=/mnt/output/users/v-kunwei/data/s2s_data/es_asr_data/ 24 | FAIRSEQ_ROOT=/mnt/output/users/v-kunwei/code/fairseq_mlst 25 | 26 | langs="ltr,$tgt" 27 | 28 | for subset in $gen_set; do 29 | results_path=$outdir/${subset} 30 | [ ! 
-d $results_path ] && mkdir -p $results_path 31 | 32 | python $FAIRSEQ_ROOT/fairseq_cli/generate.py $DATA_DIR \ 33 | --path ${model_path} \ 34 | --task "translation_from_jst" \ 35 | --max-target-positions 3000 \ 36 | --gen-subset $subset \ 37 | -t $tgt -s "ltr" --dataset-impl "raw" \ 38 | --batch-size 16 \ 39 | --max-len-a 2 --max-len-b 400 \ 40 | --results-path $results_path \ 41 | --scoring wer 42 | 43 | echo $results_path 44 | tail -n 1 $results_path/generate-*.txt 45 | sleep 1s 46 | done 47 | 48 | # --distributed-world-size 1000 --distributed-rank 0 \ 49 | 50 | sleep 2s 51 | 52 | # cat generate-newstest2020_enja.txt | grep "^D-" | cut -d'-' -f 2- | sort -n -k1 | cut -f3 > decode-newstest2020_enja.txt 53 | # sacrebleu -t wmt20 -l en-ja -i decode-newstest2020_enja.txt --tokenize char 54 | -------------------------------------------------------------------------------- /SpeechLM/dataset/CommonVoice/v4/en/en-de/config_base_ende.yaml: -------------------------------------------------------------------------------- 1 | bpe_tokenizer: 2 | bpe: sentencepiece 3 | sentencepiece_model: spm_char_st_en_de.model 4 | 5 | shuffle: false 6 | use_audio_input: true 7 | use_sample_rate: 16000 8 | standardize_audio: false 9 | vocab_filename: spm_char_st_en_de.txt 10 | 11 | # required by speech_to_text task but never used 12 | input_channels: 1 13 | input_feat_per_channel: 1 14 | 15 | -------------------------------------------------------------------------------- /SpeechLM/dataset/CommonVoice/v4/en/en-de/config_large_ende.yaml: -------------------------------------------------------------------------------- 1 | bpe_tokenizer: 2 | bpe: sentencepiece 3 | sentencepiece_model: spm_char_st_en_de.model 4 | 5 | shuffle: false 6 | use_audio_input: true 7 | use_sample_rate: 16000 8 | standardize_audio: true 9 | vocab_filename: spm_char_st_en_de.txt 10 | 11 | # required by speech_to_text task but never used 12 | input_channels: 1 13 | input_feat_per_channel: 1 14 | 15 | -------------------------------------------------------------------------------- /SpeechLM/dataset/CommonVoice/v4/en/en-de/spm_char_st_en_de.model: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/SpeechT5/5d66cf5f37e97f4a1999ad519537decc16d852af/SpeechLM/dataset/CommonVoice/v4/en/en-de/spm_char_st_en_de.model -------------------------------------------------------------------------------- /SpeechLM/dataset/CommonVoice/v4/en/en-de/spm_char_st_en_de.txt: -------------------------------------------------------------------------------- 1 | ▁ 1 2 | e 1 3 | n 1 4 | i 1 5 | r 1 6 | t 1 7 | s 1 8 | a 1 9 | d 1 10 | h 1 11 | u 1 12 | l 1 13 | o 1 14 | c 1 15 | g 1 16 | m 1 17 | . 1 18 | b 1 19 | f 1 20 | w 1 21 | k 1 22 | z 1 23 | S 1 24 | v 1 25 | p 1 26 | , 1 27 | D 1 28 | ü 1 29 | E 1 30 | ä 1 31 | A 1 32 | B 1 33 | M 1 34 | G 1 35 | " 1 36 | F 1 37 | K 1 38 | P 1 39 | W 1 40 | T 1 41 | y 1 42 | H 1 43 | ö 1 44 | I 1 45 | R 1 46 | L 1 47 | - 1 48 | C 1 49 | V 1 50 | N 1 51 | ß 1 52 | Z 1 53 | J 1 54 | U 1 55 | j 1 56 | O 1 57 | x 1 58 | ? 1 59 | ! 
1 60 | ' 1 61 | q 1 62 | Y 1 63 | Ü 1 64 | : 1 65 | Q 1 66 | Ä 1 67 | Ö 1 68 | ; 1 69 | ( 1 70 | ) 1 71 | X 1 72 | 0 1 73 | 1 1 74 | [ 1 75 | ] 1 76 | é 1 77 | 2 1 78 | & 1 79 | 3 1 80 | 5 1 81 | 4 1 82 | 7 1 83 | 9 1 84 | 8 1 85 | 6 1 86 | / 1 87 | á 1 88 | ō 1 89 | ó 1 90 | ñ 1 91 | ú 1 92 | í 1 93 | ā 1 94 | è 1 95 | * 1 96 | ć 1 97 | à 1 98 | ê 1 99 | ë 1 100 | ¡ 1 101 | ç 1 102 | ð 1 103 | ã 1 104 | č 1 105 | ū 1 106 | % 1 107 | É 1 108 | â 1 109 | ø 1 110 | š 1 111 | å 1 112 | ô 1 113 | ł 1 114 | œ 1 115 | ş 1 116 | Š 1 117 | _ 1 118 | Î 1 119 | Ó 1 120 | æ 1 121 | ï 1 122 | ă 1 123 | ě 1 124 | ī 1 125 | ı 1 126 | ʻ 1 127 | ʿ 1 128 | π 1 129 | и 1 130 | к 1 131 | = 1 132 | à 1 133 | Ø 1 134 | î 1 135 | û 1 136 | þ 1 137 | ċ 1 138 | Č 1 139 | ę 1 140 | ğ 1 141 | ń 1 142 | Ō 1 143 | ő 1 144 | ř 1 145 | ž 1 146 | ǎ 1 147 | α 1 148 | В 1 149 | е 1 150 | з 1 151 | й 1 152 | л 1 153 | н 1 154 | ь 1 155 | я 1 156 | ṃ 1 157 | ạ 1 158 | ụ 1 159 | → 1 160 | ≡ 1 161 | 京 1 162 | 大 1 163 | 都 1 164 | 阪 1 165 | -------------------------------------------------------------------------------- /SpeechLM/dataset/LibriLM/hidden_unit/bin-idx/config.yaml: -------------------------------------------------------------------------------- 1 | vocab_filename: dict.ltr.txt 2 | src_vocab_filename: dict.km.txt 3 | 4 | -------------------------------------------------------------------------------- /SpeechLM/dataset/LibriLM/hidden_unit/bin-idx/dict.ltr.txt: -------------------------------------------------------------------------------- 1 | ../../phone_unit/bin-idx/dict.ltr.txt -------------------------------------------------------------------------------- /SpeechLM/dataset/LibriLM/phone_unit/bin-idx/config.yaml: -------------------------------------------------------------------------------- 1 | vocab_filename: dict.ltr.txt 2 | src_vocab_filename: dict.phn.txt 3 | 4 | -------------------------------------------------------------------------------- /SpeechLM/dataset/LibriLM/phone_unit/bin-idx/dict.ltr.txt: -------------------------------------------------------------------------------- 1 | | 803288730 2 | E 439294199 3 | T 319071758 4 | A 277306732 5 | O 263784364 6 | N 239361162 7 | I 237353011 8 | H 223346762 9 | S 220175453 10 | R 203352500 11 | D 152198685 12 | L 141597450 13 | U 98913389 14 | M 87138757 15 | C 84680142 16 | W 81375101 17 | F 80240665 18 | G 70642902 19 | Y 68388038 20 | P 58436929 21 | B 52538531 22 | V 33250231 23 | K 26906609 24 | ' 9162896 25 | X 5075632 26 | J 4746771 27 | Q 3401794 28 | Z 2186971 29 | 1 30 | -------------------------------------------------------------------------------- /SpeechLM/dataset/LibriSpeech/asr/dict.ltr.txt: -------------------------------------------------------------------------------- 1 | ../../LibriLM/phone_unit/bin-idx/dict.ltr.txt -------------------------------------------------------------------------------- /SpeechLM/dataset/LibriSpeech/fast_phone2unit/config.yaml: -------------------------------------------------------------------------------- 1 | audio_root: /home/v-ziqzhang/dataset/librispeech_phone2unit 2 | features: 3 | energy_max: 5.733445167541504 4 | energy_min: 1.0e-08 5 | eps: 1.0e-05 6 | hop_length: 256 7 | pitch_max: 6.608609099713706 8 | pitch_min: 1.0e-08 9 | sample_rate: 16000 10 | sample_rate: 16000 11 | vocab_filename: dict.km.txt 12 | src_vocab_filename: dict.phn.txt 13 | 14 | -------------------------------------------------------------------------------- /SpeechLM/dataset/LibriSpeech/fast_phone2unit/config_generate.yaml: 
-------------------------------------------------------------------------------- 1 | audio_root: /home/v-ziqzhang/dataset/librispeech_phone2unit 2 | features: 3 | energy_max: 5.733445167541504 4 | energy_min: 1.0e-08 5 | eps: 1.0e-05 6 | hop_length: 256 7 | pitch_max: 6.608609099713706 8 | pitch_min: 1.0e-08 9 | sample_rate: 16000 10 | sample_rate: 16000 11 | vocab_filename: dict.km.txt 12 | src_vocab_filename: dict.PHN.txt 13 | 14 | -------------------------------------------------------------------------------- /SpeechLM/dataset/LibriSpeech/fast_phone2unit/dict.PHN.txt: -------------------------------------------------------------------------------- 1 | | 0 2 | 1 3 | ' 2 4 | AA 3 5 | AE 4 6 | AH 5 7 | AO 6 8 | AW 7 9 | AY 8 10 | B 9 11 | CH 10 12 | D 11 13 | DH 12 14 | EH 13 15 | ER 14 16 | EY 15 17 | F 16 18 | G 17 19 | HH 18 20 | IH 19 21 | IY 20 22 | JH 21 23 | K 22 24 | L 23 25 | M 24 26 | N 25 27 | NG 26 28 | OW 27 29 | OY 28 30 | P 29 31 | R 30 32 | S 31 33 | SH 32 34 | T 33 35 | TH 34 36 | UH 35 37 | UW 36 38 | V 37 39 | W 38 40 | Y 39 41 | Z 40 42 | ZH 41 43 | -------------------------------------------------------------------------------- /SpeechLM/dataset/LibriSpeech/fast_phone2unit/dict.phn.txt: -------------------------------------------------------------------------------- 1 | | 0 2 | 1 3 | ' 2 4 | AA 3 5 | AE 4 6 | AH 5 7 | AO 6 8 | AW 7 9 | AY 8 10 | B 9 11 | CH 10 12 | D 11 13 | DH 12 14 | EH 13 15 | ER 14 16 | EY 15 17 | F 16 18 | G 17 19 | HH 18 20 | IH 19 21 | IY 20 22 | JH 21 23 | K 22 24 | L 23 25 | M 24 26 | N 25 27 | NG 26 28 | OW 27 29 | OY 28 30 | P 29 31 | R 30 32 | S 31 33 | SH 32 34 | T 33 35 | TH 34 36 | UH 35 37 | UW 36 38 | V 37 39 | W 38 40 | Y 39 41 | Z 40 42 | ZH 41 43 | -------------------------------------------------------------------------------- /SpeechLM/speechlm/__init__.py: -------------------------------------------------------------------------------- 1 | from . import data, tasks, criterions, models 2 | -------------------------------------------------------------------------------- /SpeechLM/speechlm/config/decode/infer_fsqlm.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | 3 | defaults: 4 | - model: null 5 | 6 | hydra: 7 | run: 8 | dir: ${common_eval.results_path}/beam${decoding.beam}_th${decoding.beamthreshold}_lmw${decoding.lmweight}_wrd${decoding.wordscore}_sil${decoding.silweight} 9 | sweep: 10 | dir: ${common_eval.results_path} 11 | subdir: beam${decoding.beam}_th${decoding.beamthreshold}_lmw${decoding.lmweight}_wrd${decoding.wordscore}_sil${decoding.silweight} 12 | 13 | task: 14 | _name: joint_sc2t_pretraining 15 | data: ??? 16 | label_dir: ??? 17 | labels: ["ltr"] 18 | store_labels: true 19 | single_target: true 20 | fine_tuning: true 21 | normalize: ??? # must be consistent with pre-training 22 | add_decoder_target: false 23 | pad_audio: false 24 | random_crop: true 25 | hubert_tokenizer: "none" 26 | sp_path: None 27 | 28 | decoding: 29 | type: fairseqlm 30 | lexicon: ??? 31 | lmpath: ??? 32 | beamthreshold: 25 33 | beam: 500 34 | lmweight: 2 35 | wordscore: -1 36 | silweight: 0 37 | unique_wer_file: true 38 | common_eval: 39 | results_path: ??? 40 | path: ??? 41 | post_process: letter 42 | dataset: 43 | max_tokens: 1100000 44 | gen_subset: ??? 
45 | -------------------------------------------------------------------------------- /SpeechLM/speechlm/config/decode/infer_kenlm.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | 3 | defaults: 4 | - model: null 5 | 6 | hydra: 7 | run: 8 | dir: ${common_eval.results_path}/beam${decoding.beam}_th${decoding.beamthreshold}_lmw${decoding.lmweight}_wrd${decoding.wordscore}_sil${decoding.silweight} 9 | sweep: 10 | dir: ${common_eval.results_path} 11 | subdir: beam${decoding.beam}_th${decoding.beamthreshold}_lmw${decoding.lmweight}_wrd${decoding.wordscore}_sil${decoding.silweight} 12 | 13 | task: 14 | _name: joint_sc2t_pretraining 15 | data: ??? 16 | label_dir: ??? 17 | labels: ["ltr"] 18 | store_labels: true 19 | single_target: true 20 | fine_tuning: true 21 | normalize: ??? # must be consistent with pre-training 22 | add_decoder_target: false 23 | pad_audio: false 24 | random_crop: true 25 | hubert_tokenizer: "none" 26 | sp_path: None 27 | 28 | decoding: 29 | type: kenlm 30 | lexicon: ??? 31 | lmpath: ??? 32 | beamthreshold: 100 33 | beam: 500 34 | lmweight: 2 35 | wordscore: -1 36 | silweight: 0 37 | unique_wer_file: true 38 | common_eval: 39 | results_path: ??? 40 | path: ??? 41 | post_process: letter 42 | dataset: 43 | max_tokens: 1100000 44 | gen_subset: ??? 45 | -------------------------------------------------------------------------------- /SpeechLM/speechlm/config/decode/infer_viterbi.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | 3 | defaults: 4 | - model: null 5 | 6 | hydra: 7 | run: 8 | dir: ${common_eval.results_path}/viterbi 9 | sweep: 10 | dir: ${common_eval.results_path} 11 | subdir: viterbi 12 | 13 | task: 14 | _name: joint_sc2t_pretraining 15 | data: ??? 16 | label_dir: ??? 17 | labels: ["ltr"] 18 | store_labels: true 19 | single_target: true 20 | fine_tuning: true 21 | normalize: ??? # must be consistent with pre-training 22 | add_decoder_target: false 23 | pad_audio: false 24 | random_crop: true 25 | hubert_tokenizer: "none" 26 | sp_path: None 27 | 28 | decoding: 29 | type: viterbi 30 | unique_wer_file: true 31 | common_eval: 32 | results_path: ??? 33 | path: ??? 34 | post_process: letter 35 | dataset: 36 | batch_size: 1 37 | gen_subset: ??? 38 | -------------------------------------------------------------------------------- /SpeechLM/speechlm/config/finetune/speechlm_base_100h.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | 3 | common: 4 | fp16: true 5 | log_format: json 6 | log_interval: 200 7 | tensorboard_logdir: tblog 8 | seed: 1337 9 | 10 | checkpoint: 11 | save_interval: 1 12 | keep_last_epochs: 1 13 | keep_best_checkpoints: -1 14 | best_checkpoint_metric: wer 15 | restore_file: checkpoint_last.pt 16 | 17 | distributed_training: 18 | ddp_backend: legacy_ddp 19 | find_unused_parameters: true 20 | distributed_world_size: 1 21 | distributed_port: -1 22 | nprocs_per_node: 8 23 | 24 | task: 25 | _name: joint_sc2t_pretraining 26 | data: ??? 27 | fine_tuning: true 28 | label_dir: ??? 
29 | normalize: false # must be consistent with pre-training 30 | labels: ["ltr"] 31 | store_labels: true 32 | single_target: true 33 | add_decoder_target: false 34 | pad_audio: false 35 | random_crop: true 36 | hubert_tokenizer: "none" 37 | sp_path: None 38 | 39 | dataset: 40 | num_workers: 0 41 | max_tokens: 1600000 42 | skip_invalid_size_inputs_valid_test: true 43 | train_subset: train_100 44 | valid_subset: dev_other 45 | required_batch_size_multiple: 1 46 | 47 | criterion: 48 | _name: ctc 49 | zero_infinity: true 50 | 51 | optimization: 52 | max_update: 30000 53 | lr: [0.00001] 54 | sentence_avg: true 55 | update_freq: [1] 56 | 57 | optimizer: 58 | _name: adam 59 | adam_betas: (0.9,0.98) 60 | adam_eps: 1e-08 61 | weight_decay: 0.0 62 | 63 | lr_scheduler: 64 | _name: tri_stage 65 | phase_ratio: [0.1, 0.4, 0.5] 66 | final_lr_scale: 0.05 67 | 68 | model: 69 | _name: speechlm_ctc 70 | w2v_path: ??? 71 | apply_mask: true 72 | mask_prob: 0.65 73 | mask_channel_prob: 0.5 74 | mask_channel_length: 64 75 | layerdrop: 0.1 76 | activation_dropout: 0.1 77 | feature_grad_mult: 0.0 78 | freeze_finetune_updates: 0 79 | 80 | hydra: 81 | job: 82 | config: 83 | override_dirname: 84 | kv_sep: '-' 85 | item_sep: '__' 86 | exclude_keys: 87 | - run 88 | - task.data 89 | - task.label_dir 90 | - model.w2v_path 91 | - dataset.train_subset 92 | - dataset.valid_subset 93 | - criterion.wer_kenlm_model 94 | - criterion.wer_lexicon 95 | run: 96 | dir: ??? 97 | sweep: 98 | dir: ??? 99 | subdir: ${hydra.job.config_name}__${hydra.job.override_dirname} 100 | -------------------------------------------------------------------------------- /SpeechLM/speechlm/config/finetune/speechlm_large_960h.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | 3 | common: 4 | fp16: true 5 | log_format: json 6 | log_interval: 200 7 | tensorboard_logdir: tblog 8 | 9 | checkpoint: 10 | save_interval: 1 11 | keep_last_epochs: 5 12 | keep_best_checkpoints: 5 13 | best_checkpoint_metric: wer 14 | restore_file: checkpoint_last.pt 15 | 16 | distributed_training: 17 | ddp_backend: legacy_ddp 18 | find_unused_parameters: true 19 | distributed_world_size: 32 20 | distributed_port: -1 21 | nprocs_per_node: 8 22 | 23 | task: 24 | _name: joint_sc2t_pretraining 25 | data: ??? 26 | fine_tuning: true 27 | label_dir: ??? 28 | normalize: true # must be consistent with pre-training 29 | labels: ["ltr"] 30 | store_labels: true 31 | single_target: true 32 | add_decoder_target: false 33 | pad_audio: false 34 | random_crop: true 35 | hubert_tokenizer: "none" 36 | sp_path: None 37 | 38 | dataset: 39 | num_workers: 0 40 | max_tokens: 900000 41 | skip_invalid_size_inputs_valid_test: true 42 | train_subset: train_960 43 | valid_subset: dev_other 44 | required_batch_size_multiple: 1 45 | 46 | criterion: 47 | _name: ctc 48 | zero_infinity: true 49 | 50 | optimization: 51 | max_update: 200000 52 | lr: [0.00001] 53 | sentence_avg: true 54 | update_freq: [1] 55 | 56 | optimizer: 57 | _name: adam 58 | adam_betas: (0.9,0.98) 59 | adam_eps: 1e-08 60 | weight_decay: 0.0 61 | 62 | lr_scheduler: 63 | _name: tri_stage 64 | phase_ratio: [0.1, 0.4, 0.5] 65 | final_lr_scale: 0.05 66 | 67 | model: 68 | _name: speechlm_ctc 69 | w2v_path: ??? 
70 | apply_mask: true 71 | mask_prob: 0.5 72 | mask_channel_prob: 0.25 73 | mask_channel_length: 64 74 | layerdrop: 0.0 75 | activation_dropout: 0.1 76 | feature_grad_mult: 0.0 77 | freeze_finetune_updates: 0 78 | 79 | hydra: 80 | job: 81 | config: 82 | override_dirname: 83 | kv_sep: '-' 84 | item_sep: '__' 85 | exclude_keys: 86 | - run 87 | - task.data 88 | - task.label_dir 89 | - model.w2v_path 90 | - dataset.train_subset 91 | - dataset.valid_subset 92 | - criterion.wer_kenlm_model 93 | - criterion.wer_lexicon 94 | run: 95 | dir: ??? 96 | sweep: 97 | dir: ??? 98 | subdir: ${hydra.job.config_name}__${hydra.job.override_dirname} 99 | -------------------------------------------------------------------------------- /SpeechLM/speechlm/config/pretrain/speechlmp_base_cfg.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/SpeechT5/5d66cf5f37e97f4a1999ad519537decc16d852af/SpeechLM/speechlm/config/pretrain/speechlmp_base_cfg.pt -------------------------------------------------------------------------------- /SpeechLM/speechlm/criterions/__init__.py: -------------------------------------------------------------------------------- 1 | import importlib 2 | import os 3 | 4 | for file in os.listdir(os.path.dirname(__file__)): 5 | if file.endswith(".py") and not file.startswith("_"): 6 | criterion_name = file[: file.find(".py")] 7 | importlib.import_module( 8 | "speechlm.criterions." + criterion_name 9 | ) 10 | -------------------------------------------------------------------------------- /SpeechLM/speechlm/data_process/covost2/mp3_to_wav.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from tqdm import tqdm 3 | from pydub import AudioSegment 4 | import torchaudio 5 | import os 6 | 7 | def mp3_convert_wav(mp3_file, wav_file): 8 | try: 9 | sound = AudioSegment.from_mp3(mp3_file) 10 | sound=sound.set_frame_rate(16000) 11 | sound=sound.set_channels(1) 12 | sound=sound.set_sample_width(2) 13 | sound.export(wav_file, format="wav") 14 | except Exception as e: 15 | print(e) 16 | 17 | def main(): 18 | parser = argparse.ArgumentParser() 19 | parser.add_argument("--input", "-i", required=True, type=str) 20 | parser.add_argument("--shard", "-n", required=True, type=int) 21 | parser.add_argument("--rank", "-r", required=True, type=int) 22 | args = parser.parse_args() 23 | 24 | assert args.rank < args.shard, f"rank: {args.rank} >= shard: {args.shard}" 25 | 26 | with open(args.input, 'r') as f: 27 | files = [line.strip() for line in f ] 28 | 29 | mp3_files = files[args.rank::args.shard] 30 | for mp3_file in tqdm(mp3_files): 31 | wav_file = mp3_file.replace("/clips/", "/wav/").replace(".mp3", ".wav") 32 | if os.path.exists(wav_file): 33 | try: 34 | torchaudio.info(wav_file) 35 | except Exception as e: 36 | print(e) 37 | mp3_convert_wav(mp3_file, wav_file) 38 | else: 39 | mp3_convert_wav(mp3_file, wav_file) 40 | 41 | if __name__ == "__main__": 42 | main() 43 | -------------------------------------------------------------------------------- /SpeechLM/speechlm/data_process/filter_paireddata_by_len.py: -------------------------------------------------------------------------------- 1 | # ---------------------------------------------------------------------------- 2 | # SpeechLM: Enhanced Speech Pre-Training with Unpaired Textual Data (https://arxiv.org/abs/2209.15329) 3 | # Github source: https://github.com/microsoft/SpeechT5/tree/main/SpeechLM 4 | # Code based on fairseq: 
https://github.com/facebookresearch/fairseq/tree/272c4c5197250997148fb12c0db6306035f166a4 5 | # 6 | # Copyright (c) 2022 Microsoft 7 | # Licensed under The MIT License [see LICENSE for details] 8 | # ---------------------------------------------------------------------------- 9 | 10 | import os 11 | import argparse 12 | from tqdm import tqdm 13 | import numpy as np 14 | 15 | 16 | lg_label = "__label__{}" 17 | 18 | def writefile(filename, lines): 19 | with open(filename, 'w', encoding='utf-8') as f: 20 | f.writelines(lines) 21 | 22 | 23 | def main(): 24 | parser = argparse.ArgumentParser() 25 | parser.add_argument("--input", "-i", required=True, type=str) 26 | parser.add_argument("--output", "-o", required=True, type=str) 27 | parser.add_argument("--src", "-s", required=True, type=str) 28 | parser.add_argument("--tgt", "-t", required=True, type=str) 29 | parser.add_argument("--max-len", "-m", default=2998, type=int) 30 | args = parser.parse_args() 31 | 32 | src_lines, tgt_lines = [], [] 33 | with open(f"{args.input}.{args.src}", 'r') as f1, open(f"{args.input}.{args.tgt}", 'r') as f2: 34 | for src_line, tgt_line in tqdm(zip(f1, f2)): 35 | src_len = len(src_line.strip().split()) 36 | tgt_len = len(tgt_line.strip().split()) 37 | if src_len < args.max_len and src_len > 0 and tgt_len < args.max_len and tgt_len > 0: 38 | src_lines.append(src_line) 39 | tgt_lines.append(tgt_line) 40 | 41 | writefile(f"{args.output}.{args.src}", src_lines) 42 | writefile(f"{args.output}.{args.tgt}", tgt_lines) 43 | 44 | if __name__ == "__main__": 45 | main() 46 | 47 | 48 | 49 | -------------------------------------------------------------------------------- /SpeechLM/speechlm/data_process/phoneme_tokenizer/repeat_withou_insert_sil_less_4375.py: -------------------------------------------------------------------------------- 1 | # ---------------------------------------------------------------------------- 2 | # SpeechLM: Enhanced Speech Pre-Training with Unpaired Textual Data (https://arxiv.org/abs/2209.15329) 3 | # Github source: https://github.com/microsoft/SpeechT5/tree/main/SpeechLM 4 | # Code based on fairseq: https://github.com/facebookresearch/fairseq/tree/272c4c5197250997148fb12c0db6306035f166a4 5 | # 6 | # Copyright (c) 2022 Microsoft 7 | # Licensed under The MIT License [see LICENSE for details] 8 | # ---------------------------------------------------------------------------- 9 | 10 | import sys, json, tqdm 11 | import numpy as np 12 | 13 | input_file = sys.argv[1] 14 | mean_and_std_file = sys.argv[2] 15 | out_file = sys.argv[3] 16 | 17 | mean_and_std = json.load(open(mean_and_std_file, 'r')) 18 | 19 | with open(input_file, 'r') as f, open(out_file, 'w') as w: 20 | for line in tqdm.tqdm(f): 21 | l = line.split() 22 | 23 | new_l = [] 24 | for phn in l: 25 | if phn not in mean_and_std: 26 | mean_and_std[phn] = [5, 2.5] 27 | print(f'unk phone {phn}') 28 | n = max(1, round(np.random.normal(loc=mean_and_std[phn][0], scale=mean_and_std[phn][1]))) 29 | new_l.extend([phn] * int(n)) 30 | 31 | minus = 0 32 | while len(new_l) >= 4375: 33 | minus += 1 34 | new_l = [] 35 | for phn in l: 36 | n = max(1, round(mean_and_std[phn][0] - minus)) 37 | new_l.extend([phn] * n) 38 | print(f"too long line try minus {minus}") 39 | 40 | w.write(' '.join(new_l)+'\n') 41 | 42 | -------------------------------------------------------------------------------- /SpeechLM/speechlm/data_process/prepare_covost2_enxx.sh: -------------------------------------------------------------------------------- 1 | 2 | #!/bin/bash 3 | [ ${PWD##*/} != 
SpeechLM ] && echo "Error: dir not match! Switch to SpeechLM/ and run it again!" && exit 1 4 | [ $# -lt 1 ] && echo "Usage: $0 <lang> [root=${PWD}/dataset/CommonVoice/v4]" && exit 0 5 | cwd=${PWD} 6 | src=${PWD}/speechlm/data_process 7 | lang=$1 8 | root=$2 9 | [ -z $root ] && root="${PWD}/dataset/CommonVoice/v4" 10 | set -e -o pipefail -u 11 | 12 | 13 | ### step1, convert mp3 to wav 14 | cd $root/en && mkdir -p wav 15 | cut -f2 validated.tsv | sed '1d' | sed "s|^|${root}/en/clips/|" > validated.id 16 | for i in $(seq 0 39); do 17 | echo extracting $i; 18 | python $src/covost2/mp3_to_wav.py -i validated.id -n 40 -r $i & 19 | done 20 | wait 21 | cd $cwd 22 | 23 | 24 | ### step2, manifest 25 | datadir="$root/en/en-$lang" && mkdir -p $datadir && cd $datadir 26 | python /mnt/default/v-ziqzhang/code/stpretrain_scripts/data_process/covost2/prepare_covost_data.py --data-root $root --src-lang en --tgt-lang $lang --vocab-type char 27 | mv ../*en_${lang}.* ./ 28 | 29 | # adjust config_base_en${lang}.yaml 30 | echo "bpe_tokenizer:" > config_base_en${lang}.yaml 31 | echo " bpe: sentencepiece" >> config_base_en${lang}.yaml 32 | echo " sentencepiece_model: spm_char_st_en_de.model" >> config_base_en${lang}.yaml 33 | echo "" >> config_base_en${lang}.yaml 34 | echo "shuffle: false" >> config_base_en${lang}.yaml 35 | echo "use_audio_input: true" >> config_base_en${lang}.yaml 36 | echo "use_sample_rate: 16000" >> config_base_en${lang}.yaml 37 | echo "standardize_audio: false" >> config_base_en${lang}.yaml 38 | echo "vocab_filename: spm_char_st_en_de.txt" >> config_base_en${lang}.yaml 39 | echo "" >> config_base_en${lang}.yaml 40 | echo "# required by speech_to_text task but never used" >> config_base_en${lang}.yaml 41 | echo "input_channels: 1" >> config_base_en${lang}.yaml 42 | echo "input_feat_per_channel: 1" >> config_base_en${lang}.yaml 43 | echo "" >> config_base_en${lang}.yaml 44 | # adjust config_large_en${lang}.yaml 45 | cat config_base_en${lang}.yaml | sed "s|standardize_audio: false|standardize_audio: true|" > config_large_en${lang}.yaml 46 | -------------------------------------------------------------------------------- /SpeechLM/speechlm/data_process/txt2idx.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | [ $# -lt 3 ] && echo "Usage: $0 <input> <outdir> <dict> [suffix]" && exit 0 3 | 4 | input=$1 5 | outdir=$2 6 | DICT=$3 7 | suffix=$4 8 | outname=${input##*/} 9 | outname=${outname%.txt*} 10 | [ -z $input ] && echo "You must specify a source file" && exit 1 11 | 12 | [ -z $DICT ] && echo "No dict was specified!" && exit 1 13 | [ -z $outdir ] && outdir=${input%/*} 14 | [ -z $outdir ] && outdir="." 15 | [ !
-d $outdir ] && mkdir -p $outdir 16 | 17 | echo "------------------------------- creating idx/bin--------------------------------------------" 18 | echo "$input --> $outdir/${outname}${suffix}.idx" 19 | fairseq-preprocess \ 20 | --only-source \ 21 | --trainpref $input \ 22 | --destdir $outdir \ 23 | --thresholdsrc 0 \ 24 | --srcdict ${DICT} \ 25 | --workers 40 26 | 27 | mv $outdir/train.idx $outdir/${outname}${suffix}.idx 28 | mv $outdir/train.bin $outdir/${outname}${suffix}.bin 29 | echo "----------------------------------- done --------------------------------------------" 30 | 31 | -------------------------------------------------------------------------------- /SpeechLM/speechlm/data_process/wrd2ltr.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | def main(): 4 | for line in sys.stdin: 5 | line = line.replace("", "") 6 | line = " ".join(line.strip().split()) 7 | line = line.replace(" ", "|").upper() + "|" 8 | print(" ".join(line)) 9 | 10 | if __name__ == "__main__": 11 | main() 12 | 13 | -------------------------------------------------------------------------------- /SpeechLM/speechlm/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/SpeechT5/5d66cf5f37e97f4a1999ad519537decc16d852af/SpeechLM/speechlm/models/__init__.py -------------------------------------------------------------------------------- /SpeechLM/speechlm/models/speechlm_ctcasr.py: -------------------------------------------------------------------------------- 1 | # ---------------------------------------------------------------------------- 2 | # SpeechLM: Enhanced Speech Pre-Training with Unpaired Textual Data (https://arxiv.org/abs/2209.15329) 3 | # Github source: https://github.com/microsoft/SpeechT5/tree/main/SpeechLM 4 | # Code based on fairseq: https://github.com/facebookresearch/fairseq/tree/272c4c5197250997148fb12c0db6306035f166a4 5 | # 6 | # Copyright (c) 2022 Microsoft 7 | # Licensed under The MIT License [see LICENSE for details] 8 | # ---------------------------------------------------------------------------- 9 | 10 | from dataclasses import dataclass 11 | from fairseq.models import BaseFairseqModel, register_model 12 | from fairseq.tasks import FairseqTask 13 | 14 | from fairseq.models.hubert import HubertAsrConfig, HubertCtc, HubertEncoder 15 | 16 | @dataclass 17 | class SpeechLMCtcConfig(HubertAsrConfig): 18 | pass 19 | 20 | 21 | @register_model("speechlm_ctc", dataclass=SpeechLMCtcConfig) 22 | class SpeechLMCtc(HubertCtc): 23 | def __init__(self, cfg: SpeechLMCtcConfig, w2v_encoder: BaseFairseqModel): 24 | super().__init__(cfg, w2v_encoder) 25 | 26 | @classmethod 27 | def build_model(cls, cfg: SpeechLMCtcConfig, task: FairseqTask): 28 | """Build a new model instance.""" 29 | w2v_encoder = SpeechLMEncoder(cfg, task) 30 | return cls(cfg, w2v_encoder) 31 | 32 | 33 | class SpeechLMEncoder(HubertEncoder): 34 | def __init__(self, cfg: HubertAsrConfig, task): 35 | super().__init__(cfg, task) 36 | 37 | if (task.target_dictionary is not None) and ( 38 | hasattr(self.w2v_model, "unit_encoder_ctc_head") 39 | ): 40 | self.proj = self.w2v_model.unit_encoder_ctc_head 41 | self.conv_ctc_proj = True 42 | else: 43 | self.conv_ctc_proj = False 44 | 45 | def forward(self, source, padding_mask, tbc=True, **kwargs): 46 | results = super().forward( 47 | source, 48 | padding_mask, 49 | tbc, 50 | **kwargs, 51 | ) 52 | if self.conv_ctc_proj: 53 | padding_mask = 
self.w2v_model.downsample_ctc_padding_mask(results["padding_mask"]) 54 | results["encoder_padding_mask"] = padding_mask 55 | results["padding_mask"] = padding_mask 56 | return results 57 | -------------------------------------------------------------------------------- /SpeechLM/speechlm/modules/__init__.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # The YiTrans End-to-End Speech Translation System for IWSLT 2022 Offline Shared Task (https://arxiv.org/abs/2206.05777) 3 | # Github source: https://github.com/microsoft/SpeechT5/tree/main/YiTrans 4 | # Copyright (c) 2022 Microsoft 5 | # Licensed under The MIT License [see LICENSE for details] 6 | # Based on fairseq code bases 7 | # https://github.com/facebookresearch/fairseq 8 | # -------------------------------------------------------- 9 | 10 | from .multihead_attention import MultiheadAttention 11 | from .relative_pos_enc import RelativePositionalEncoding 12 | from .transformer_layer import TransformerEncoderLayerBase, TransformerDecoderLayerBase 13 | from .w2v_encoder import TransformerEncoder, TransformerSentenceEncoderLayer 14 | from .learned_positional_embedding import LearnedPositionalEmbedding 15 | 16 | __all__ = [ 17 | "MultiheadAttention", 18 | "RelativePositionalEncoding", 19 | "TransformerEncoderLayerBase", 20 | "TransformerDecoderLayerBase", 21 | "TransformerEncoder", 22 | "TransformerSentenceEncoderLayer" 23 | ] 24 | -------------------------------------------------------------------------------- /SpeechLM/speechlm/modules/relative_pos_enc.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Pre-Training Transformer Decoder for End-to-End ASR Model with Unpaired Speech Data (https://arxiv.org/abs/2203.17113) 3 | # Github source: https://github.com/microsoft/SpeechT5/tree/main/Speech2C 4 | # Copyright (c) 2022 Microsoft 5 | # Licensed under The MIT License [see LICENSE for details] 6 | # Based on fairseq code bases 7 | # https://github.com/pytorch/fairseq 8 | # -------------------------------------------------------- 9 | 10 | import torch 11 | 12 | class RelativePositionalEncoding(torch.nn.Module): 13 | def __init__(self, d_model, maxlen=1000, embed_v=False): 14 | super(RelativePositionalEncoding, self).__init__() 15 | 16 | self.d_model = d_model 17 | self.maxlen = maxlen 18 | self.pe_k = torch.nn.Embedding(2*maxlen, d_model) 19 | if embed_v: 20 | self.pe_v = torch.nn.Embedding(2*maxlen, d_model) 21 | self.embed_v = embed_v 22 | 23 | 24 | def forward(self, pos_seq, incremental_state=None): 25 | pos_seq[pos_seq < -self.maxlen] = -self.maxlen 26 | pos_seq[pos_seq >= self.maxlen] = self.maxlen - 1 27 | pos_seq = pos_seq + self.maxlen 28 | 29 | if incremental_state is not None: 30 | pos_seq = pos_seq[-1:] 31 | 32 | if self.embed_v: 33 | return self.pe_k(pos_seq), self.pe_v(pos_seq) 34 | else: 35 | return self.pe_k(pos_seq), None 36 | -------------------------------------------------------------------------------- /SpeechLM/speechlm/scripts/pretrain_speechlm/base_speechlmh.sh: -------------------------------------------------------------------------------- 1 | # #################################### 2 | # SpeechLM-H Base model # 3 | # #################################### 4 | [ $# -lt 2 ] && echo "Usage: $0 [mount=${PWD}] [world_size=32] [update_freq=1]" && exit 1 5 | [ ${PWD##*/} != SpeechLM ] && echo "Error: dir not match! 
Switch to SpeechLM/ and run it again!" && exit 1 6 | DATA_DIR=$1 7 | TEXT_DATA_DIR=$2 8 | mount=$3 9 | world_size=$4 10 | update_freq=$5 11 | [ -z $mount ] && mount=${PWD} 12 | [ -z $world_size ] && world_size=32 13 | [ -z $update_freq ] && update_freq=1 14 | 15 | CODE_ROOT=${PWD} 16 | MODEL_DIR="${mount}/exp/pretrain/base_speechlmh_${world_size}gpu_${update_freq}accum" 17 | [ -d $MODEL_DIR ] || mkdir -p $MODEL_DIR 18 | 19 | python $CODE_ROOT/fairseq/fairseq_cli/hydra_train.py \ 20 | --config-dir $CODE_ROOT/speechlm/config/pretrain \ 21 | --config-name speechlm_base_librispeech \ 22 | common.user_dir=$CODE_ROOT/speechlm \ 23 | \ 24 | task.labels='["km"]' \ 25 | model.label_rate=50 \ 26 | task.data=$DATA_DIR \ 27 | task.label_dir=$DATA_DIR \ 28 | task.text_cfg.text_data=$TEXT_DATA_DIR \ 29 | \ 30 | dataset.train_subset=\"train_960+train_text.km-ltr\" \ 31 | dataset.valid_subset=\"dev_clean+dev_clean.km-ltr\" \ 32 | dataset.num_workers=0 \ 33 | dataset.max_tokens=1400000 \ 34 | distributed_training.distributed_world_size=${world_size} \ 35 | optimization.update_freq=[${update_freq}] \ 36 | \ 37 | common.tensorboard_logdir=$MODEL_DIR \ 38 | checkpoint.save_dir=$MODEL_DIR \ 39 | hydra.run.dir=$MODEL_DIR \ 40 | hydra.job.name=pretrain 41 | 42 | # data_dir="/mnt/default/v-ziqzhang/data/stbert/data/librispeech/hubert_release_iter2_layer9_kmeans/local" 43 | # text_data_dir="/mnt/default/v-ziqzhang/dataset/LibriLM/from_fastT2U/bin-idx" 44 | -------------------------------------------------------------------------------- /SpeechLM/speechlm/scripts/pretrain_speechlm/base_speechlmp.sh: -------------------------------------------------------------------------------- 1 | # #################################### 2 | # SpeechLM-P Base model # 3 | # #################################### 4 | [ $# -lt 2 ] && echo "Usage: $0 [mount=${PWD}] [world_size=32] [update_freq=1]" && exit 1 5 | [ ${PWD##*/} != SpeechLM ] && echo "Error: dir not match! Switch to SpeechLM/ and run it again!" 
&& exit 1 6 | DATA_DIR=$1 7 | TEXT_DATA_DIR=$2 8 | mount=$3 9 | world_size=$4 10 | update_freq=$5 11 | [ -z $mount ] && mount=${PWD} 12 | [ -z $world_size ] && world_size=32 13 | [ -z $update_freq ] && update_freq=1 14 | 15 | CODE_ROOT=${PWD} 16 | MODEL_DIR="${mount}/exp/pretrain/base_speechlmp_${world_size}gpu_${update_freq}accum" 17 | [ -d $MODEL_DIR ] || mkdir -p $MODEL_DIR 18 | 19 | python $CODE_ROOT/fairseq/fairseq_cli/hydra_train.py \ 20 | --config-dir $CODE_ROOT/speechlm/config/pretrain \ 21 | --config-name speechlm_base_librispeech \ 22 | common.user_dir=$CODE_ROOT/speechlm \ 23 | \ 24 | task.labels='["phn"]' \ 25 | model.label_rate=100 \ 26 | task.data=$DATA_DIR \ 27 | task.label_dir=$DATA_DIR \ 28 | task.text_cfg.text_data=$TEXT_DATA_DIR \ 29 | \ 30 | dataset.train_subset=\"train_960+train_text.phn-ltr\" \ 31 | dataset.valid_subset=\"dev_clean+dev_clean.phn-ltr\" \ 32 | dataset.num_workers=0 \ 33 | dataset.max_tokens=1400000 \ 34 | distributed_training.distributed_world_size=${world_size} \ 35 | optimization.update_freq=[${update_freq}] \ 36 | \ 37 | common.tensorboard_logdir=$MODEL_DIR \ 38 | checkpoint.save_dir=$MODEL_DIR \ 39 | hydra.run.dir=$MODEL_DIR \ 40 | hydra.job.name=pretrain 41 | 42 | # data_dir="/stdblob/users/v-ziqzhang/dataset/LibriLM/phn2char_sanych/tri4b_mono_label" 43 | # text_data_dir="/stdblob/users/v-ziqzhang/dataset/LibriLM/phn2char_sanych/filt2k_sil025_m5std25_sil14_spn32/bin-idx" 44 | -------------------------------------------------------------------------------- /SpeechLM/speechlm/scripts/pretrain_speechlm/large_speechlmp.sh: -------------------------------------------------------------------------------- 1 | # #################################### 2 | # SpeechLM-P Large model # 3 | # #################################### 4 | [ $# -lt 2 ] && echo "Usage: $0 [mount=${PWD}] [world_size=32] [update_freq=4]" && exit 1 5 | [ ${PWD##*/} != SpeechLM ] && echo "Error: dir not match! Switch to SpeechLM/ and run it again!" 
&& exit 1 6 | DATA_DIR=$1 7 | TEXT_DATA_DIR=$2 8 | mount=$3 9 | world_size=$4 10 | update_freq=$5 11 | [ -z $mount ] && mount=${PWD} 12 | [ -z $world_size ] && world_size=32 13 | [ -z $update_freq ] && update_freq=4 14 | 15 | CODE_ROOT=${PWD} 16 | MODEL_DIR="${mount}/exp/pretrain/large_speechlmp_${world_size}gpu_${update_freq}accum" 17 | [ -d $MODEL_DIR ] || mkdir -p $MODEL_DIR 18 | 19 | python $CODE_ROOT/fairseq/fairseq_cli/hydra_train.py \ 20 | --config-dir $CODE_ROOT/speechlm/config/pretrain \ 21 | --config-name speechlm_large_librilight \ 22 | common.user_dir=$CODE_ROOT/speechlm \ 23 | \ 24 | task.labels='["phn"]' \ 25 | model.label_rate=50 \ 26 | task.data=$DATA_DIR \ 27 | task.label_dir=$DATA_DIR \ 28 | task.text_cfg.text_data=$TEXT_DATA_DIR \ 29 | \ 30 | dataset.train_subset=\"train_60k+train_text.phn-ltr\" \ 31 | dataset.valid_subset=\"dev_clean+dev_clean.phn-ltr\" \ 32 | dataset.num_workers=1 \ 33 | dataset.max_tokens=900000 \ 34 | distributed_training.distributed_world_size=${world_size} \ 35 | optimization.update_freq=[${update_freq}] \ 36 | \ 37 | common.fp16_scale_tolerance=0.1 \ 38 | common.tensorboard_logdir=$MODEL_DIR \ 39 | checkpoint.save_dir=$MODEL_DIR \ 40 | hydra.run.dir=$MODEL_DIR \ 41 | hydra.job.name=pretrain 42 | 43 | # data_dir="/stdblob/users/v-ziqzhang/dataset/librilight/chunkdata" 44 | # text_data_dir="/stdblob/users/v-ziqzhang/dataset/LibriLM/phn2char_sanych/filt2k_sil025_m5std25_sil14_spn32/bin-idx" 45 | -------------------------------------------------------------------------------- /SpeechLM/speechlm/scripts/tokenizer_fastT2U/generate.sh: -------------------------------------------------------------------------------- 1 | ##################################### 2 | # Fast Text2Unit Model # 3 | ##################################### 4 | [ $# -lt 2 ] && echo "Usage: $0 <model_path> <gen_set> [outdir={gen_set%/*}]" && exit 0 5 | [ ${PWD##*/} != SpeechLM ] && echo "Error: dir not match! Switch to SpeechLM/ and run it again!" && exit 1 6 | 7 | model_path=$1 8 | src_dir=${model_path%/*} 9 | cpt=${model_path##*/} 10 | cpt=${cpt%.*} 11 | 12 | gen_set=$2 13 | outdir=$3 14 | 15 | DATA_DIR=${gen_set%/*} 16 | gen_set=${gen_set##*/} 17 | [ -z $outdir ] && outdir=${DATA_DIR} 18 | 19 | CODE_ROOT=${PWD} 20 | 21 | nj=4 22 | for rank in $(seq 0 $((nj-1))); do 23 | results_path=$outdir/pseudo_${gen_set}/${rank} 24 | [ ! -d $results_path ] && mkdir -p $results_path 25 | echo "$model_path" > $results_path/model.record 26 | 27 | python $CODE_ROOT/speechlm/generate_unit.py $DATA_DIR \ 28 | --user-dir $CODE_ROOT/speechlm \ 29 | --config-yaml config_generate.yaml \ 30 | --path ${model_path} \ 31 | --task fast_text_to_unit \ 32 | --gen-subset $gen_set \ 33 | \ 34 | --beam 1 \ 35 | --max-tokens 10000 \ 36 | --results-path $results_path \ 37 | --scoring sacrebleu \ 38 | --skip-invalid-size-inputs-valid-test \ 39 | --distributed-world-size $nj --distributed-rank ${rank} \ 40 | & 41 | done 42 | wait 43 | -------------------------------------------------------------------------------- /SpeechLM/speechlm/scripts/tokenizer_fastT2U/infer.sh: -------------------------------------------------------------------------------- 1 | ##################################### 2 | # Fast Text2Unit Model # 3 | ##################################### 4 | [ $# -lt 2 ] && echo "Usage: $0 <model_path> <gen_set>" && exit 0 5 | [ ${PWD##*/} != SpeechLM ] && echo "Error: dir not match! Switch to SpeechLM/ and run it again!"
&& exit 1 6 | 7 | model_path=$1 8 | src_dir=${model_path%/*} 9 | cpt=${model_path##*/} 10 | cpt=${cpt%.*} 11 | 12 | gen_set=$2 13 | 14 | DATA_DIR=${gen_set%/*} 15 | gen_set=${gen_set##*/} 16 | outdir=$src_dir/decode_${cpt} 17 | 18 | CODE_ROOT=${PWD} 19 | 20 | for subset in ${gen_set//,/ }; do 21 | results_path=$outdir/phone2unit_${subset} 22 | [ ! -d $results_path ] && mkdir -p $results_path 23 | 24 | python $CODE_ROOT/speechlm/generate_unit.py $DATA_DIR \ 25 | --user-dir $CODE_ROOT/speechlm \ 26 | --config-yaml config.yaml \ 27 | --path ${model_path} \ 28 | --task fast_text_to_unit \ 29 | --gen-subset $subset \ 30 | \ 31 | --beam 1 \ 32 | --max-tokens 10000 \ 33 | --results-path $results_path \ 34 | --scoring sacrebleu 35 | 36 | echo $results_path 37 | tail -n 1 $results_path/generate-*.txt 38 | sleep 1s 39 | done 40 | 41 | # --distributed-world-size 1000 --distributed-rank 0 \ 42 | -------------------------------------------------------------------------------- /SpeechLM/speechlm/scripts/tokenizer_fastT2U/train_s_5e-4.sh: -------------------------------------------------------------------------------- 1 | ##################################### 2 | # Fast Text2Unit Model # 3 | ##################################### 4 | [ $# -lt 1 ] && echo "Usage: $0 <data_dir> [mount] [world_size=4] [update_freq=1]" && exit 0 5 | [ ${PWD##*/} != SpeechLM ] && echo "Error: dir not match! Switch to SpeechLM/ and run it again!" && exit 1 6 | 7 | DATA_DIR=$1 8 | mount=$2 9 | world_size=$3 10 | update_freq=$4 11 | [ -z $mount ] && mount=${PWD} 12 | [ -z $world_size ] && world_size=4 13 | [ -z $update_freq ] && update_freq=1 14 | 15 | CODE_ROOT=${PWD} 16 | MODEL_DIR="$mount/exp/fast_text2unit/small_lr5e-4_tristage_ls0.1_${world_size}gpu_${update_freq}accum" 17 | [ -d $MODEL_DIR ] || mkdir -p $MODEL_DIR 18 | 19 | fairseq-train ${DATA_DIR} --save-dir ${MODEL_DIR} \ 20 | --config-yaml config.yaml \ 21 | --user-dir $CODE_ROOT/speechlm \ 22 | --train-subset train_100 --valid-subset dev_clean \ 23 | --num-workers 4 --max-tokens 20000 \ 24 | --distributed-world-size ${world_size} --update-freq ${update_freq} \ 25 | \ 26 | --task fast_text_to_unit --criterion fasttext2unit_criterion --arch fasttext2unit_s \ 27 | --label-smoothing 0.1 \ 28 | \ 29 | --clip-norm 5.0 --n-frames-per-step 1 \ 30 | --dropout 0.1 --attention-dropout 0.1 \ 31 | --optimizer adam --lr 5e-4 --lr-scheduler tri_stage --phase-ratio [0.3,0.0,0.7] --max-update 10000 \ 32 | --seed 1 --best-checkpoint-metric accuracy --maximize-best-checkpoint-metric \ 33 | \ 34 | --save-interval 2 \ 35 | --tensorboard-logdir ${MODEL_DIR} \ 36 | --fp16 --find-unused-parameters \ 37 | | tee ${MODEL_DIR}/train.log 38 | 39 | # DATA_DIR=/mnt/default/v-ziqzhang/dataset/librispeech_phone2unit/phone2unit 40 | -------------------------------------------------------------------------------- /SpeechLM/speechlm/scripts/tune_speechlm_asr/finetune_base_ctc.sh: -------------------------------------------------------------------------------- 1 | # #################################### 2 | # SpeechLM Base model # 3 | # #################################### 4 | [ $# -lt 3 ] && echo "Usage: $0 <w2v_path> <data_dir> <cpt_tag> [mount=${PWD}] [world_size=8] [update_freq=1]" && exit 1 5 | [ ${PWD##*/} != SpeechLM ] && echo "Error: dir not match! Switch to SpeechLM/ and run it again!"
&& exit 1 6 | 7 | w2v_path=$1 8 | DATA_DIR=$2 9 | cpt=$3 10 | mount=$4 11 | world_size=$5 12 | update_freq=$6 13 | [ -z $mount ] && mount=${PWD} 14 | [ -z $world_size ] && world_size=8 15 | [ -z $update_freq ] && update_freq=1 16 | 17 | CODE_ROOT=${PWD} 18 | 19 | exp_name=${w2v_path%/*} 20 | exp_name=${exp_name##*/} 21 | MODEL_DIR="${mount}/exp/finetune_asr/$exp_name/ctc30k_from_${cpt}_bz1.6m_lr1e-5" 22 | [ -d $MODEL_DIR ] || mkdir -p $MODEL_DIR 23 | 24 | python $CODE_ROOT/fairseq/fairseq_cli/hydra_train.py \ 25 | --config-dir $CODE_ROOT/speechlm/config/finetune \ 26 | --config-name speechlm_base_100h \ 27 | common.user_dir=$CODE_ROOT/speechlm \ 28 | \ 29 | task.data=$DATA_DIR \ 30 | task.label_dir=$DATA_DIR \ 31 | model.w2v_path=${w2v_path} \ 32 | \ 33 | optimization.lr=[0.00001] \ 34 | optimization.max_update=30000 \ 35 | dataset.max_tokens=1600000 \ 36 | optimization.update_freq=[${update_freq}] \ 37 | distributed_training.distributed_world_size=${world_size} \ 38 | \ 39 | dataset.train_subset="train_clean_100" \ 40 | dataset.valid_subset="dev_other" \ 41 | \ 42 | common.tensorboard_logdir=$MODEL_DIR \ 43 | checkpoint.save_dir=$MODEL_DIR \ 44 | hydra.run.dir=$MODEL_DIR \ 45 | hydra.job.name=${exp_name} 46 | 47 | # model_path=/mnt/default/v-ziqzhang/data/speechulm/exp/base/base_speechlmp_32gpu_1accum/checkpoint_298_400000.pt 48 | # data_dir=/home/v-ziqzhang/dataset/LibriSpeech/asr 49 | -------------------------------------------------------------------------------- /SpeechLM/speechlm/scripts/tune_speechlm_asr/finetune_large_ctc.sh: -------------------------------------------------------------------------------- 1 | # #################################### 2 | # SpeechLM Large model # 3 | # #################################### 4 | [ $# -lt 3 ] && echo "Usage: $0 [mount=${PWD}] [world_size=8] [update_freq=4]" && exit 1 5 | [ ${PWD##*/} != SpeechLM ] && echo "Error: dir not match! Switch to SpeechLM/ and run it again!" 
&& exit 1 6 | 7 | w2v_path=$1 8 | DATA_DIR=$2 9 | cpt=$3 10 | mount=$4 11 | world_size=$5 12 | update_freq=$6 13 | [ -z $mount ] && mount=${PWD} 14 | [ -z $world_size ] && world_size=8 15 | [ -z $update_freq ] && update_freq=4 16 | 17 | CODE_ROOT=${PWD} 18 | 19 | exp_name=${w2v_path%/*} 20 | exp_name=${exp_name##*/} 21 | MODEL_DIR="${mount}/exp/finetune_asr/$exp_name/ctc200k_from_${cpt}_bz3.6m_lr1e-5" 22 | [ -d $MODEL_DIR ] || mkdir -p $MODEL_DIR 23 | 24 | python $CODE_ROOT/fairseq/fairseq_cli/hydra_train.py \ 25 | --config-dir $CODE_ROOT/speechlm/config/finetune \ 26 | --config-name speechlm_large_960h \ 27 | common.user_dir=$CODE_ROOT/speechlm \ 28 | \ 29 | task.data=$DATA_DIR \ 30 | task.label_dir=$DATA_DIR \ 31 | model.w2v_path=${w2v_path} \ 32 | \ 33 | optimization.lr=[0.00001] \ 34 | optimization.max_update=200000 \ 35 | dataset.max_tokens=900000 \ 36 | optimization.update_freq=[${update_freq}] \ 37 | distributed_training.distributed_world_size=${world_size} \ 38 | \ 39 | dataset.train_subset="train_960" \ 40 | dataset.valid_subset="dev_other" \ 41 | \ 42 | common.tensorboard_logdir=$MODEL_DIR \ 43 | checkpoint.save_dir=$MODEL_DIR \ 44 | hydra.run.dir=$MODEL_DIR \ 45 | hydra.job.name=${exp_name} 46 | 47 | # model_path=/mnt/default/v-ziqzhang/data/speechulm/exp/large/large_speechlmp_32gpu_4accum/checkpoint_31_400000.pt 48 | # data_dir=/home/v-ziqzhang/dataset/LibriSpeech/asr 49 | -------------------------------------------------------------------------------- /SpeechLM/speechlm/scripts/tune_speechlm_asr/inference_ctc.sh: -------------------------------------------------------------------------------- 1 | ##################################### 2 | # SpeechLM Base model # 3 | ##################################### 4 | [ $# -lt 2 ] && echo "Usage: $0 [gen-set=dev_clean,dev_other,test_clean,test_other]" && exit 1 5 | [ ${PWD##*/} != SpeechLM ] && echo "Error: dir not match! Switch to SpeechLM/ and run it again!" && exit 1 6 | 7 | model_path=$1 8 | DATA_DIR=$2 9 | gen_set=$3 10 | [ -z $gen_set ] && gen_set="dev_clean,dev_other,test_clean,test_other" 11 | src_dir=${model_path%/*} 12 | cpt=${model_path##*/} 13 | cpt=${cpt%.*} 14 | 15 | CODE_ROOT=${PWD} 16 | 17 | for subset in ${gen_set//,/ }; do 18 | results_path=$src_dir/decode_${cpt}_ctc/${subset} 19 | [ ! -d $results_path ] && mkdir -p $results_path 20 | 21 | python $CODE_ROOT/speechlm/infer.py \ 22 | --config-dir $CODE_ROOT/speechlm/config/decode \ 23 | --config-name infer_viterbi \ 24 | common.user_dir=$CODE_ROOT/speechlm \ 25 | \ 26 | dataset.gen_subset=${subset} \ 27 | task.data=$DATA_DIR task.label_dir=$DATA_DIR task.normalize=false \ 28 | common_eval.results_path=${results_path} common_eval.path=${model_path} \ 29 | \ 30 | common_eval.quiet=true \ 31 | & 32 | done 33 | wait 34 | 35 | ### important to know 36 | # When loading the fine-tuned model for decoding, fairseq also loads the pre-trained model to use its states['model'] to build the model instance. 37 | # To prevent the error about the w2v_path (if you don't have the pre-trained model at w2v_path), we set common_eval.model_overrides to override 38 | # the w2v_path by speechlmp_base_cfg.pt. speechlmp_base_cfg.pt is just a pre-trained model checkpoint without parameters (only contains config). 39 | # So, if you have trained a model with different model config (e.g. different encoder layers), you should modify the common_eval.model_overrides to your own. 
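# For example (illustrative only; adjust the path to your own checkout), the config-only checkpoint shipped in this repo can be passed via the override below, appended to the python command above: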
40 | # common_eval.model_overrides=\"{\'w2v_path\':\'$CODE_ROOT/speechlm/config/pretrain/speechlmp_base_cfg.pt\'}\" \ 41 | -------------------------------------------------------------------------------- /SpeechLM/speechlm/scripts/tune_speechlm_asr/inference_ctc_kenlm.sh: -------------------------------------------------------------------------------- 1 | ##################################### 2 | # SpeechLM Base model # 3 | ##################################### 4 | [ $# -lt 2 ] && echo "Usage: $0 [gen-set=dev_clean,dev_other,test_clean,test_other]" && exit 1 5 | [ ${PWD##*/} != SpeechLM ] && echo "Error: dir not match! Switch to SpeechLM/ and run it again!" && exit 1 6 | 7 | model_path=$1 8 | DATA_DIR=$2 9 | gen_set=$3 10 | [ -z $gen_set ] && gen_set="dev_clean,dev_other,test_clean,test_other" 11 | src_dir=${model_path%/*} 12 | cpt=${model_path##*/} 13 | cpt=${cpt%.*} 14 | 15 | CODE_ROOT=${PWD} 16 | path_to_lexicon=${DATA_DIR}/librispeech_lexicon.lst 17 | path_to_lm=${DATA_DIR}/4-gram.arpa 18 | [ ! -f $path_to_lexicon ] && echo "Error: $path_to_lexicon not found !" && exit 1 19 | [ ! -f $path_to_lm ] && echo "Error: $path_to_lm not found !" && exit 1 20 | 21 | for subset in ${gen_set//,/ }; do 22 | results_path=$src_dir/decode_${cpt}_ctc/${subset} 23 | [ ! -d $results_path ] && mkdir -p $results_path 24 | 25 | python $CODE_ROOT/speechlm/infer.py \ 26 | --config-dir $CODE_ROOT/speechlm/config/decode \ 27 | --config-name infer_kenlm \ 28 | common.user_dir=$CODE_ROOT/speechlm \ 29 | \ 30 | dataset.gen_subset=${subset} \ 31 | task.data=$DATA_DIR task.label_dir=$DATA_DIR task.normalize=false \ 32 | common_eval.results_path=${results_path} common_eval.path=${model_path} \ 33 | \ 34 | decoding.lexicon=$path_to_lexicon \ 35 | decoding.lmpath=$path_to_lm \ 36 | decoding.beam=1500 \ 37 | \ 38 | common_eval.quiet=false \ 39 | & 40 | done 41 | wait 42 | 43 | ### important to know 44 | # When loading the fine-tuned model for decoding, fairseq also loads the pre-trained model to use its states['model'] to build the model instance. 45 | # To prevent the error about the w2v_path (if you don't have the pre-trained model at w2v_path), we set common_eval.model_overrides to override 46 | # the w2v_path by speechlmp_base_cfg.pt. speechlmp_base_cfg.pt is just a pre-trained model checkpoint without parameters (only contains config). 47 | # So, if you have trained a model with different model config (e.g. different encoder layers), you should modify the common_eval.model_overrides to your own. 48 | # common_eval.model_overrides=\"{\'w2v_path\':\'$CODE_ROOT/speechlm/config/pretrain/speechlmp_base_cfg.pt\'}\" \ 49 | -------------------------------------------------------------------------------- /SpeechLM/speechlm/scripts/tune_speechlm_asr/inference_ctc_large.sh: -------------------------------------------------------------------------------- 1 | ##################################### 2 | # SpeechLM Large model # 3 | ##################################### 4 | [ $# -lt 2 ] && echo "Usage: $0 [gen-set=dev_clean,dev_other,test_clean,test_other]" && exit 1 5 | [ ${PWD##*/} != SpeechLM ] && echo "Error: dir not match! Switch to SpeechLM/ and run it again!" && exit 1 6 | 7 | model_path=$1 8 | DATA_DIR=$2 9 | gen_set=$3 10 | [ -z $gen_set ] && gen_set="dev_clean,dev_other,test_clean,test_other" 11 | src_dir=${model_path%/*} 12 | cpt=${model_path##*/} 13 | cpt=${cpt%.*} 14 | 15 | CODE_ROOT=${PWD} 16 | 17 | for subset in ${gen_set//,/ }; do 18 | results_path=$src_dir/decode_${cpt}_ctc/${subset} 19 | [ ! 
-d $results_path ] && mkdir -p $results_path 20 | 21 | python $CODE_ROOT/speechlm/infer.py \ 22 | --config-dir $CODE_ROOT/speechlm/config/decode \ 23 | --config-name infer_viterbi \ 24 | common.user_dir=$CODE_ROOT/speechlm \ 25 | \ 26 | dataset.gen_subset=${subset} \ 27 | task.data=$DATA_DIR task.label_dir=$DATA_DIR task.normalize=true \ 28 | common_eval.results_path=${results_path} common_eval.path=${model_path} \ 29 | \ 30 | common_eval.quiet=true \ 31 | & 32 | done 33 | wait 34 | 35 | # model_path=/mnt/default/v-ziqzhang/data/speechulm/finetune_asr/large_speechlmp_32gpu_4accum/ctc200k_from_400k_bz3.6m_lr1e-5/checkpoint_convert.pt 36 | # data_dir=/home/v-ziqzhang/dataset/LibriSpeech/asr 37 | -------------------------------------------------------------------------------- /SpeechLM/speechlm/scripts/tune_speechlm_asr/inference_ctc_large_fsqlm.sh: -------------------------------------------------------------------------------- 1 | ##################################### 2 | # SpeechLM Large model # 3 | ##################################### 4 | [ $# -lt 2 ] && echo "Usage: $0 [gen-set=dev_clean,dev_other,test_clean,test_other]" && exit 1 5 | [ ${PWD##*/} != SpeechLM ] && echo "Error: dir not match! Switch to SpeechLM/ and run it again!" && exit 1 6 | 7 | model_path=$1 8 | DATA_DIR=$2 9 | gen_set=$3 10 | [ -z $gen_set ] && gen_set="dev_clean,dev_other,test_clean,test_other" 11 | src_dir=${model_path%/*} 12 | cpt=${model_path##*/} 13 | cpt=${cpt%.*} 14 | 15 | CODE_ROOT=${PWD} 16 | path_to_lexicon=${DATA_DIR}/librispeech_lexicon.lst 17 | path_to_lm=${DATA_DIR}/fairseq_word_lm/lm_librispeech_word_transformer.pt 18 | [ ! -f $path_to_lexicon ] && echo "Error: $path_to_lexicon not found !" && exit 1 19 | [ ! -f $path_to_lm ] && echo "Error: $path_to_lm not found !" && exit 1 20 | 21 | for subset in ${gen_set//,/ }; do 22 | results_path=$src_dir/decode_${cpt}_ctc/${subset} 23 | [ ! -d $results_path ] && mkdir -p $results_path 24 | 25 | python $CODE_ROOT/speechlm/infer.py \ 26 | --config-dir $CODE_ROOT/speechlm/config/decode \ 27 | --config-name infer_fsqlm \ 28 | common.user_dir=$CODE_ROOT/speechlm \ 29 | \ 30 | dataset.gen_subset=${subset} \ 31 | task.data=$DATA_DIR task.label_dir=$DATA_DIR task.normalize=true \ 32 | common_eval.results_path=${results_path} common_eval.path=${model_path} \ 33 | \ 34 | decoding.lexicon=$path_to_lexicon \ 35 | decoding.lmpath=$path_to_lm \ 36 | decoding.lmweight=0.90 \ 37 | decoding.wordscore=-0.31 \ 38 | decoding.beam=500 \ 39 | \ 40 | common_eval.quiet=false \ 41 | & 42 | done 43 | wait 44 | 45 | # model_path=/mnt/default/v-ziqzhang/data/speechulm/finetune_asr/large_speechlmp_32gpu_4accum/ctc200k_from_400k_bz3.6m_lr1e-5/checkpoint_convert.pt 46 | # data_dir=/home/v-ziqzhang/dataset/LibriSpeech/asr 47 | -------------------------------------------------------------------------------- /SpeechLM/speechlm/scripts/tune_speechlm_st/inference_base.sh: -------------------------------------------------------------------------------- 1 | # #################################### 2 | # SpeechLM Base model # 3 | # #################################### 4 | [ $# -lt 3 ] && echo "Usage: $0 [gen-set=dev] [beam_size=5] [lenpen=1.0]" && exit 0 5 | [ ${PWD##*/} != SpeechLM ] && echo "Error: dir not match! Switch to SpeechLM/ and run it again!" 
&& exit 1 6 | 7 | model_path=$1 8 | DATA_DIR=$2 9 | lang=$3 10 | gen_set=$4 11 | beam_size=$5 12 | lenpen=$6 13 | [ -z $gen_set ] && gen_set="dev" 14 | [ -z $beam_size ] && beam_size=5 15 | [ -z $lenpen ] && lenpen=1 16 | src_dir=${model_path%/*} 17 | cpt=${model_path##*/} 18 | cpt=${cpt%.*} 19 | 20 | CODE_ROOT=${PWD} 21 | results_path=$src_dir/decode_${cpt}_beam${beam_size}/${gen_set} 22 | [ ! -d $results_path ] && mkdir -p $results_path 23 | 24 | python $CODE_ROOT/fairseq/fairseq_cli/generate.py $DATA_DIR \ 25 | --gen-subset ${gen_set}_st_en_${lang}_local \ 26 | --max-tokens 2300000 \ 27 | --max-source-positions 2300000 \ 28 | --num-workers 0 \ 29 | \ 30 | --user-dir $CODE_ROOT/speechlm \ 31 | --task speech_to_text \ 32 | --config-yaml config_base_en${lang}.yaml \ 33 | \ 34 | --path ${model_path} \ 35 | --results-path $results_path \ 36 | \ 37 | --scoring sacrebleu --max-len-a 0 --max-len-b 512 \ 38 | --beam ${beam_size} \ 39 | --lenpen $lenpen \ 40 | 41 | echo $results_path 42 | tail -n 1 $results_path/generate-*.txt 43 | sleep 1s 44 | 45 | # model_path=/mnt/default/v-ziqzhang/data/speechulm/finetune_covost/base_speechlmp_32gpu_1accum/legacy_ende_from_400k_bz3.2m_lr1e-4/checkpoint_best_convert.pt 46 | # data_dir=dataset/CommonVoice/v4/en/en-de 47 | -------------------------------------------------------------------------------- /SpeechLM/speechlm/scripts/tune_speechlm_st/inference_large.sh: -------------------------------------------------------------------------------- 1 | # #################################### 2 | # SpeechLM Base model # 3 | # #################################### 4 | [ $# -lt 3 ] && echo "Usage: $0 [gen-set=dev] [beam_size=5] [lenpen=1.0]" && exit 0 5 | [ ${PWD##*/} != SpeechLM ] && echo "Error: dir not match! Switch to SpeechLM/ and run it again!" && exit 1 6 | 7 | model_path=$1 8 | DATA_DIR=$2 9 | lang=$3 10 | gen_set=$4 11 | beam_size=$5 12 | lenpen=$6 13 | [ -z $gen_set ] && gen_set="dev" 14 | [ -z $beam_size ] && beam_size=5 15 | [ -z $lenpen ] && lenpen=1 16 | src_dir=${model_path%/*} 17 | cpt=${model_path##*/} 18 | cpt=${cpt%.*} 19 | 20 | CODE_ROOT=${PWD} 21 | results_path=$src_dir/decode_${cpt}_beam${beam_size}/${gen_set} 22 | [ ! 
-d $results_path ] && mkdir -p $results_path 23 | 24 | python $CODE_ROOT/fairseq/fairseq_cli/generate.py $DATA_DIR \ 25 | --gen-subset ${gen_set}_st_en_${lang}_local \ 26 | --max-tokens 2300000 \ 27 | --max-source-positions 2300000 \ 28 | --num-workers 0 \ 29 | \ 30 | --user-dir $CODE_ROOT/speechlm \ 31 | --task speech_to_text \ 32 | --config-yaml config_large_en${lang}.yaml \ 33 | \ 34 | --path ${model_path} \ 35 | --results-path $results_path \ 36 | \ 37 | --scoring sacrebleu --max-len-a 0 --max-len-b 512 \ 38 | --beam ${beam_size} \ 39 | --lenpen $lenpen \ 40 | 41 | echo $results_path 42 | tail -n 1 $results_path/generate-*.txt 43 | sleep 1s 44 | 45 | # model_path=/mnt/default/v-ziqzhang/data/speechulm/finetune_covost/large_speechlmp_32gpu_4accum/legacy_ende_from_400k_bz3.6m_lr1e-4/checkpoint.avgnbest_convert.pt 46 | # data_dir=dataset/CommonVoice/v4/en/en-de 47 | -------------------------------------------------------------------------------- /SpeechT5/results/ablation_study.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/SpeechT5/5d66cf5f37e97f4a1999ad519537decc16d852af/SpeechT5/results/ablation_study.png -------------------------------------------------------------------------------- /SpeechT5/results/asr.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/SpeechT5/5d66cf5f37e97f4a1999ad519537decc16d852af/SpeechT5/results/asr.png -------------------------------------------------------------------------------- /SpeechT5/results/se.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/SpeechT5/5d66cf5f37e97f4a1999ad519537decc16d852af/SpeechT5/results/se.png -------------------------------------------------------------------------------- /SpeechT5/results/sid.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/SpeechT5/5d66cf5f37e97f4a1999ad519537decc16d852af/SpeechT5/results/sid.png -------------------------------------------------------------------------------- /SpeechT5/results/st.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/SpeechT5/5d66cf5f37e97f4a1999ad519537decc16d852af/SpeechT5/results/st.png -------------------------------------------------------------------------------- /SpeechT5/results/tts.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/SpeechT5/5d66cf5f37e97f4a1999ad519537decc16d852af/SpeechT5/results/tts.png -------------------------------------------------------------------------------- /SpeechT5/results/vc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/SpeechT5/5d66cf5f37e97f4a1999ad519537decc16d852af/SpeechT5/results/vc.png -------------------------------------------------------------------------------- /SpeechT5/speecht5/__init__.py: -------------------------------------------------------------------------------- 1 | from . 
import data, tasks, criterions, models # noqa -------------------------------------------------------------------------------- /SpeechT5/speecht5/criterions/__init__.py: -------------------------------------------------------------------------------- 1 | import importlib 2 | import os 3 | 4 | 5 | for file in os.listdir(os.path.dirname(__file__)): 6 | if file.endswith(".py") and not file.startswith("_"): 7 | criterion_name = file[: file.find(".py")] 8 | importlib.import_module( 9 | "speecht5.criterions." + criterion_name 10 | ) -------------------------------------------------------------------------------- /SpeechT5/speecht5/data/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/SpeechT5/5d66cf5f37e97f4a1999ad519537decc16d852af/SpeechT5/speecht5/data/__init__.py -------------------------------------------------------------------------------- /SpeechT5/speecht5/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .speecht5 import * # noqa 2 | from .t5_transformer_lm import * # noqa 3 | -------------------------------------------------------------------------------- /SpeechT5/speecht5/models/modules/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/SpeechT5/5d66cf5f37e97f4a1999ad519537decc16d852af/SpeechT5/speecht5/models/modules/__init__.py -------------------------------------------------------------------------------- /SpeechT5/speecht5/models/modules/text_encoder_prenet.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # SpeechT5: Unified-Modal Encoder-Decoder Pre-Training for Spoken Language Processing (https://arxiv.org/abs/2110.07205) 3 | # Github source: https://github.com/microsoft/SpeechT5/tree/main/SpeechT5 4 | # Copyright (c) 2021 Microsoft 5 | # Licensed under The MIT License [see LICENSE for details] 6 | # Based on fairseq and espnet code bases 7 | # https://github.com/pytorch/fairseq; https://github.com/espnet/espnet 8 | # -------------------------------------------------------- 9 | 10 | import torch.nn as nn 11 | 12 | from espnet.nets.pytorch_backend.transformer.embedding import PositionalEncoding 13 | from espnet.nets.pytorch_backend.transformer.embedding import ScaledPositionalEncoding 14 | 15 | 16 | class TextEncoderPrenet(nn.Module): 17 | """ 18 | 19 | Args: 20 | in_channels (int): the number of input channels 21 | mid_channels (int): the number of intermediate channels 22 | out_channels (int): the number of output channels 23 | kernel_sizes (List[int]): the kernel size for each convolutional layer 24 | """ 25 | 26 | def __init__( 27 | self, 28 | embed_tokens, 29 | args, 30 | ): 31 | super(TextEncoderPrenet, self).__init__() 32 | self.padding_idx = embed_tokens.padding_idx 33 | # define encoder prenet 34 | # get positional encoding class 35 | pos_enc_class = ( 36 | ScaledPositionalEncoding if args.enc_use_scaled_pos_enc else PositionalEncoding 37 | ) 38 | 39 | self.encoder_prenet = nn.Sequential( 40 | embed_tokens, 41 | pos_enc_class(args.encoder_embed_dim, args.transformer_enc_positional_dropout_rate, max_len=args.max_text_positions), 42 | ) 43 | 44 | def forward(self, src_tokens): 45 | return self.encoder_prenet(src_tokens), src_tokens.eq(self.padding_idx) 46 | -------------------------------------------------------------------------------- 
/SpeechT5/speecht5/models/t5_transformer_lm.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # SpeechT5: Unified-Modal Encoder-Decoder Pre-Training for Spoken Language Processing (https://arxiv.org/abs/2110.07205) 3 | # Github source: https://github.com/microsoft/SpeechT5/tree/main/SpeechT5 4 | # Copyright (c) 2021 Microsoft 5 | # Licensed under The MIT License [see LICENSE for details] 6 | # Based on fairseq and espnet code bases 7 | # https://github.com/pytorch/fairseq; https://github.com/espnet/espnet 8 | # -------------------------------------------------------- 9 | 10 | from fairseq.models import ( 11 | register_model_architecture, 12 | ) 13 | from fairseq.models.transformer_lm import base_lm_architecture 14 | 15 | 16 | @register_model_architecture(model_name="transformer_lm", arch_name="transformer_lm_t5") 17 | def transformer_lm_t5(args): 18 | args.decoder_embed_dim = getattr(args, "decoder_embed_dim", 1280) 19 | args.decoder_ffn_embed_dim = getattr(args, "decoder_ffn_embed_dim", 6144) 20 | args.decoder_layers = getattr(args, "decoder_layers", 20) 21 | args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 16) 22 | args.dropout = getattr(args, "dropout", 0.1) 23 | args.attention_dropout = getattr(args, "attention_dropout", 0.1) 24 | args.activation_fn = getattr(args, "activation_fn", "gelu") 25 | base_lm_architecture(args) 26 | -------------------------------------------------------------------------------- /SpeechT5/speecht5/tasks/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/SpeechT5/5d66cf5f37e97f4a1999ad519537decc16d852af/SpeechT5/speecht5/tasks/__init__.py -------------------------------------------------------------------------------- /SpeechT5/speecht5_framework.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/SpeechT5/5d66cf5f37e97f4a1999ad519537decc16d852af/SpeechT5/speecht5_framework.png -------------------------------------------------------------------------------- /SpeechUT/dataset/LibriSpeech/dict.kmu.txt: -------------------------------------------------------------------------------- 1 | dict.km.txt -------------------------------------------------------------------------------- /SpeechUT/dataset/LibriSpeech/dict.ltr.txt: -------------------------------------------------------------------------------- 1 | | 803288730 2 | E 439294199 3 | T 319071758 4 | A 277306732 5 | O 263784364 6 | N 239361162 7 | I 237353011 8 | H 223346762 9 | S 220175453 10 | R 203352500 11 | D 152198685 12 | L 141597450 13 | U 98913389 14 | M 87138757 15 | C 84680142 16 | W 81375101 17 | F 80240665 18 | G 70642902 19 | Y 68388038 20 | P 58436929 21 | B 52538531 22 | V 33250231 23 | K 26906609 24 | ' 9162896 25 | X 5075632 26 | J 4746771 27 | Q 3401794 28 | Z 2186971 29 | 1 30 | -------------------------------------------------------------------------------- /SpeechUT/dataset/LibriSpeech/dict.txt: -------------------------------------------------------------------------------- 1 | | 94802 2 | E 51860 3 | T 38431 4 | A 33152 5 | O 31495 6 | N 28855 7 | I 28794 8 | H 27187 9 | S 26071 10 | R 23546 11 | D 18289 12 | L 16308 13 | U 12400 14 | M 10685 15 | W 10317 16 | C 9844 17 | F 9062 18 | G 8924 19 | Y 8226 20 | P 6890 21 | B 6339 22 | V 3936 23 | K 3456 24 | ' 1023 25 | X 636 26 | J 598 27 | Q 437 28 | Z 213 
29 | -------------------------------------------------------------------------------- /SpeechUT/dataset/MuSTC/dict.km.txt: -------------------------------------------------------------------------------- 1 | ../LibriSpeech/dict.km.txt -------------------------------------------------------------------------------- /SpeechUT/dataset/MuSTC/dict.kmu.txt: -------------------------------------------------------------------------------- 1 | ../LibriSpeech/dict.km.txt -------------------------------------------------------------------------------- /SpeechUT/dataset/MuSTC/en_de/config.yaml: -------------------------------------------------------------------------------- 1 | vocab_filename: dict.spm.txt 2 | src_vocab_filename: dict.kmu.txt 3 | 4 | -------------------------------------------------------------------------------- /SpeechUT/dataset/MuSTC/en_de/config_ende.yaml: -------------------------------------------------------------------------------- 1 | bpe_tokenizer: 2 | bpe: sentencepiece 3 | sentencepiece_model: spm_unigram10000.model 4 | 5 | sampling_alpha: 1.0 6 | shuffle: false 7 | use_audio_input: true 8 | use_sample_rate: 16000 9 | 10 | vocab_filename: dict.spm.txt 11 | 12 | # required by speech_to_text task but never used 13 | input_channels: 1 14 | input_feat_per_channel: 1 15 | -------------------------------------------------------------------------------- /SpeechUT/dataset/MuSTC/en_de/dict.kmu.txt: -------------------------------------------------------------------------------- 1 | ../../LibriSpeech/dict.km.txt -------------------------------------------------------------------------------- /SpeechUT/dataset/MuSTC/en_de/spm_unigram10000.model: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/SpeechT5/5d66cf5f37e97f4a1999ad519537decc16d852af/SpeechUT/dataset/MuSTC/en_de/spm_unigram10000.model -------------------------------------------------------------------------------- /SpeechUT/dataset/MuSTC/en_es/config.yaml: -------------------------------------------------------------------------------- 1 | vocab_filename: dict.spm.txt 2 | src_vocab_filename: dict.kmu.txt 3 | 4 | -------------------------------------------------------------------------------- /SpeechUT/dataset/MuSTC/en_es/config_enes.yaml: -------------------------------------------------------------------------------- 1 | bpe_tokenizer: 2 | bpe: sentencepiece 3 | sentencepiece_model: spm_unigram10000.model 4 | 5 | sampling_alpha: 1.0 6 | shuffle: false 7 | use_audio_input: true 8 | use_sample_rate: 16000 9 | 10 | vocab_filename: dict.spm.txt 11 | 12 | # required by speech_to_text task but never used 13 | input_channels: 1 14 | input_feat_per_channel: 1 15 | -------------------------------------------------------------------------------- /SpeechUT/dataset/MuSTC/en_es/dict.kmu.txt: -------------------------------------------------------------------------------- 1 | ../../LibriSpeech/dict.km.txt -------------------------------------------------------------------------------- /SpeechUT/dataset/MuSTC/en_es/spm_unigram10000.model: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/SpeechT5/5d66cf5f37e97f4a1999ad519537decc16d852af/SpeechUT/dataset/MuSTC/en_es/spm_unigram10000.model -------------------------------------------------------------------------------- /SpeechUT/dataset/MuSTC/en_fr/config.yaml: -------------------------------------------------------------------------------- 1 
| vocab_filename: dict.spm.txt 2 | src_vocab_filename: dict.kmu.txt 3 | 4 | -------------------------------------------------------------------------------- /SpeechUT/dataset/MuSTC/en_fr/config_enfr.yaml: -------------------------------------------------------------------------------- 1 | bpe_tokenizer: 2 | bpe: sentencepiece 3 | sentencepiece_model: spm_unigram10000.model 4 | 5 | sampling_alpha: 1.0 6 | shuffle: false 7 | use_audio_input: true 8 | use_sample_rate: 16000 9 | 10 | vocab_filename: dict.spm.txt 11 | 12 | # required by speech_to_text task but never used 13 | input_channels: 1 14 | input_feat_per_channel: 1 15 | -------------------------------------------------------------------------------- /SpeechUT/dataset/MuSTC/en_fr/dict.kmu.txt: -------------------------------------------------------------------------------- 1 | ../../LibriSpeech/dict.km.txt -------------------------------------------------------------------------------- /SpeechUT/dataset/MuSTC/en_fr/spm_unigram10000.model: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/SpeechT5/5d66cf5f37e97f4a1999ad519537decc16d852af/SpeechUT/dataset/MuSTC/en_fr/spm_unigram10000.model -------------------------------------------------------------------------------- /SpeechUT/speechut/__init__.py: -------------------------------------------------------------------------------- 1 | from . import data, tasks, criterions, models 2 | -------------------------------------------------------------------------------- /SpeechUT/speechut/criterions/__init__.py: -------------------------------------------------------------------------------- 1 | import importlib 2 | import os 3 | 4 | for file in os.listdir(os.path.dirname(__file__)): 5 | if file.endswith(".py") and not file.startswith("_"): 6 | criterion_name = file[: file.find(".py")] 7 | importlib.import_module( 8 | "speechut.criterions." 
+ criterion_name 9 | ) 10 | -------------------------------------------------------------------------------- /SpeechUT/speechut/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/SpeechT5/5d66cf5f37e97f4a1999ad519537decc16d852af/SpeechUT/speechut/models/__init__.py -------------------------------------------------------------------------------- /SpeechUT/speechut/models/t5_transformer_lm.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Pre-Training Transformer Decoder for End-to-End ASR Model with Unpaired Speech Data (https://arxiv.org/abs/2203.17113) 3 | # Github source: https://github.com/microsoft/SpeechT5/tree/main/Speech2C 4 | # Copyright (c) 2022 Microsoft 5 | # Licensed under The MIT License [see LICENSE for details] 6 | # Based on fairseq code bases 7 | # https://github.com/pytorch/fairseq 8 | # -------------------------------------------------------- 9 | 10 | from fairseq.models import ( 11 | register_model_architecture, 12 | ) 13 | from fairseq.models.transformer_lm import base_lm_architecture 14 | 15 | 16 | @register_model_architecture(model_name="transformer_lm", arch_name="transformer_lm_t5") 17 | def transformer_lm_t5(args): 18 | args.decoder_embed_dim = getattr(args, "decoder_embed_dim", 1280) 19 | args.decoder_ffn_embed_dim = getattr(args, "decoder_ffn_embed_dim", 6144) 20 | args.decoder_layers = getattr(args, "decoder_layers", 20) 21 | args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 16) 22 | args.dropout = getattr(args, "dropout", 0.1) 23 | args.attention_dropout = getattr(args, "attention_dropout", 0.1) 24 | args.activation_fn = getattr(args, "activation_fn", "gelu") 25 | base_lm_architecture(args) 26 | -------------------------------------------------------------------------------- /SpeechUT/speechut/modules/__init__.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Copyright (c) 2022 Microsoft 3 | # Licensed under The MIT License [see LICENSE for details] 4 | # Based on fairseq code bases 5 | # https://github.com/facebookresearch/fairseq 6 | # -------------------------------------------------------- 7 | 8 | from .learned_positional_embedding import LearnedPositionalEmbedding 9 | from .multihead_attention import MultiheadAttention 10 | from .relative_pos_enc import RelativePositionalEncoding 11 | from .transformer_layer import TransformerEncoderLayerBase, TransformerDecoderLayerBase 12 | from .w2v_encoder import TransformerEncoder, TransformerSentenceEncoderLayer 13 | from .transformer_encoder import TransformerEncoderBase 14 | from .transformer_decoder import TransformerDecoderScriptable, TransformerDecoderBaseScriptable 15 | 16 | __all__ = [ 17 | "MultiheadAttention", 18 | "RelativePositionalEncoding", 19 | "LearnedPositionalEmbedding", 20 | "TransformerEncoderLayerBase", 21 | "TransformerDecoderLayerBase", 22 | "TransformerEncoder", 23 | "TransformerSentenceEncoderLayer", 24 | "TransformerEncoderBase", 25 | "TransformerDecoderScriptable", 26 | "TransformerDecoderBaseScriptable", 27 | ] 28 | -------------------------------------------------------------------------------- /SpeechUT/speechut/modules/relative_pos_enc.py: -------------------------------------------------------------------------------- 1 | # 
-------------------------------------------------------- 2 | # Copyright (c) 2022 Microsoft 3 | # Licensed under The MIT License [see LICENSE for details] 4 | # Based on fairseq code bases 5 | # https://github.com/facebookresearch/fairseq 6 | # -------------------------------------------------------- 7 | 8 | import torch 9 | 10 | class RelativePositionalEncoding(torch.nn.Module): 11 | def __init__(self, d_model, maxlen=1000, embed_v=False): 12 | super(RelativePositionalEncoding, self).__init__() 13 | 14 | self.d_model = d_model 15 | self.maxlen = maxlen 16 | self.pe_k = torch.nn.Embedding(2*maxlen, d_model) 17 | if embed_v: 18 | self.pe_v = torch.nn.Embedding(2*maxlen, d_model) 19 | self.embed_v = embed_v 20 | 21 | 22 | def forward(self, pos_seq, incremental_state=None): 23 | pos_seq[pos_seq < -self.maxlen] = -self.maxlen 24 | pos_seq[pos_seq >= self.maxlen] = self.maxlen - 1 25 | pos_seq = pos_seq + self.maxlen 26 | 27 | if incremental_state is not None: 28 | pos_seq = pos_seq[-1:] 29 | 30 | if self.embed_v: 31 | return self.pe_k(pos_seq), self.pe_v(pos_seq) 32 | else: 33 | return self.pe_k(pos_seq), None 34 | -------------------------------------------------------------------------------- /SpeechUT/speechut/scripts/pretrain_speechut/base_speechut_for_asr.sh: -------------------------------------------------------------------------------- 1 | # #################################### 2 | # SpeechUT Base model # 3 | # #################################### 4 | [ $# -lt 2 ] && echo "Usage: $0 [mount=${PWD}] [world_size=32] [update_freq=1]" && exit 1 5 | [ ${PWD##*/} != SpeechUT ] && echo "Error: dir not match! Switch to SpeechUT/ and run it again!" && exit 1 6 | DATA_DIR=$1 7 | TEXT_DATA_DIR=$2 8 | mount=$3 9 | world_size=$4 10 | update_freq=$5 11 | [ -z $mount ] && mount=${PWD} 12 | [ -z $world_size ] && world_size=32 13 | [ -z $update_freq ] && update_freq=1 14 | 15 | CODE_ROOT=${PWD} 16 | MODEL_DIR="${mount}/exp/pretrain/base_speechut4asr_${world_size}gpu_${update_freq}accum" 17 | [ -d $MODEL_DIR ] || mkdir -p $MODEL_DIR 18 | 19 | python $CODE_ROOT/fairseq/fairseq_cli/hydra_train.py \ 20 | --config-dir $CODE_ROOT/speechut/config/pretrain \ 21 | --config-name speechut_base_librispeech \ 22 | common.user_dir=$CODE_ROOT/speechut \ 23 | \ 24 | task.labels='["km"]' \ 25 | model.label_rate=50 \ 26 | task.data=$DATA_DIR \ 27 | task.label_dir=$DATA_DIR \ 28 | task.text_cfg.text_data=$TEXT_DATA_DIR \ 29 | \ 30 | dataset.train_subset=\"train_960+pseudo_libritext.kmu-ltr+merge_960.kmu-none\" \ 31 | dataset.valid_subset=\"dev_clean+dev.kmu-ltr+dev.kmu-none\" \ 32 | dataset.num_workers=0 \ 33 | dataset.max_tokens=1400000 \ 34 | distributed_training.distributed_world_size=${world_size} \ 35 | optimization.update_freq=[${update_freq}] \ 36 | \ 37 | common.tensorboard_logdir=$MODEL_DIR \ 38 | checkpoint.save_dir=$MODEL_DIR \ 39 | hydra.run.dir=$MODEL_DIR \ 40 | hydra.job.name=base_speechut4asr_${world_size}gpu_${update_freq}accum 41 | -------------------------------------------------------------------------------- /SpeechUT/speechut/scripts/pretrain_speechut/base_speechut_for_st.sh: -------------------------------------------------------------------------------- 1 | # #################################### 2 | # SpeechUT Base model # 3 | # #################################### 4 | [ $# -lt 3 ] && echo "Usage: $0 [mount=${PWD}] [world_size=32] [update_freq=1]" && exit 1 5 | [ ${PWD##*/} != SpeechUT ] && echo "Error: dir not match! Switch to SpeechUT/ and run it again!" 
&& exit 1 6 | DATA_DIR=$1 7 | TEXT_DATA_DIR=$2 8 | lang=$3 9 | mount=$4 10 | world_size=$5 11 | update_freq=$6 12 | [ -z $mount ] && mount=${PWD} 13 | [ -z $world_size ] && world_size=32 14 | [ -z $update_freq ] && update_freq=1 15 | 16 | CODE_ROOT=${PWD} 17 | MODEL_DIR="${mount}/exp/pretrain/base_speechut4en${lang}_${world_size}gpu_${update_freq}accum" 18 | [ -d $MODEL_DIR ] || mkdir -p $MODEL_DIR 19 | 20 | python $CODE_ROOT/fairseq/fairseq_cli/hydra_train.py \ 21 | --config-dir $CODE_ROOT/speechut/config/pretrain \ 22 | --config-name speechut_base_librispeech \ 23 | common.user_dir=$CODE_ROOT/speechut \ 24 | \ 25 | task.labels='["km"]' \ 26 | model.label_rate=50 \ 27 | task.data=$DATA_DIR \ 28 | task.label_dir=$DATA_DIR \ 29 | task.text_cfg.text_data=$TEXT_DATA_DIR \ 30 | \ 31 | model.add_text_ctc=false \ 32 | model.text_transformer.share_decoder_input_output_embed=true \ 33 | criterion.u2t_ed_weight=1.0 \ 34 | criterion.u2t_ctc_weight=0 \ 35 | \ 36 | dataset.train_subset=\"train_960,mustcuns_${lang}+pseudo_wmt_en${lang}.kmu-spm+train_960.kmu-none,mustcuns_${lang}.kmu-none\" \ 37 | dataset.valid_subset=\"dev_clean+pseudo_valid.kmu-spm+dev.kmu-none\" \ 38 | dataset.num_workers=0 \ 39 | dataset.max_tokens=1400000 \ 40 | distributed_training.distributed_world_size=${world_size} \ 41 | optimization.update_freq=[${update_freq}] \ 42 | \ 43 | common.tensorboard_logdir=$MODEL_DIR \ 44 | checkpoint.save_dir=$MODEL_DIR \ 45 | hydra.run.dir=$MODEL_DIR \ 46 | hydra.job.name=base_speechut4en${lang}_${world_size}gpu_${update_freq}accum 47 | 48 | -------------------------------------------------------------------------------- /SpeechUT/speechut/scripts/pretrain_speechut/base_speechut_for_st_enfr.sh: -------------------------------------------------------------------------------- 1 | # #################################### 2 | # SpeechUT Base model # 3 | # #################################### 4 | [ $# -lt 3 ] && echo "Usage: $0 [lang=fr] [mount=${PWD}] [world_size=32] [update_freq=1]" && exit 1 5 | [ ${PWD##*/} != SpeechUT ] && echo "Error: dir not match! Switch to SpeechUT/ and run it again!" 
&& exit 1 6 | DATA_DIR=$1 7 | TEXT_DATA_DIR=$2 8 | lang=$3 9 | mount=$4 10 | world_size=$5 11 | update_freq=$6 12 | [ -z $lang ] && lang=fr 13 | [ -z $mount ] && mount=${PWD} 14 | [ -z $world_size ] && world_size=32 15 | [ -z $update_freq ] && update_freq=1 16 | 17 | CODE_ROOT=${PWD} 18 | MODEL_DIR="${mount}/exp/pretrain/base_speechut4en${lang}_${world_size}gpu_${update_freq}accum" 19 | [ -d $MODEL_DIR ] || mkdir -p $MODEL_DIR 20 | 21 | python $CODE_ROOT/fairseq/fairseq_cli/hydra_train.py \ 22 | --config-dir $CODE_ROOT/speechut/config/pretrain \ 23 | --config-name speechut_base_librispeech \ 24 | common.user_dir=$CODE_ROOT/speechut \ 25 | \ 26 | task.labels='["km"]' \ 27 | model.label_rate=50 \ 28 | task.data=$DATA_DIR \ 29 | task.label_dir=$DATA_DIR \ 30 | task.text_cfg.text_data=$TEXT_DATA_DIR \ 31 | \ 32 | model.add_text_ctc=false \ 33 | criterion.u2t_ed_weight=1.0 \ 34 | criterion.u2t_ctc_weight=0 \ 35 | \ 36 | dataset.train_subset=\"train_960,pretrain_mustc+pseudo_wmt14_enfr.kmu-spm+train_960.kmu-none,pretrain_mustc.kmu-none\" \ 37 | dataset.valid_subset=\"dev_clean+pseudo_valid.kmu-spm+dev.kmu-none\" \ 38 | dataset.num_workers=0 \ 39 | dataset.max_tokens=1400000 \ 40 | optimization.max_update=600000 \ 41 | distributed_training.distributed_world_size=${world_size} \ 42 | optimization.update_freq=[${update_freq}] \ 43 | \ 44 | common.tensorboard_logdir=$MODEL_DIR \ 45 | checkpoint.save_dir=$MODEL_DIR \ 46 | hydra.run.dir=$MODEL_DIR \ 47 | hydra.job.name=base_speechut4en${lang}_${world_size}gpu_${update_freq}accum 48 | 49 | -------------------------------------------------------------------------------- /SpeechUT/speechut/scripts/pretrain_speechut/large_speechut_for_asr.sh: -------------------------------------------------------------------------------- 1 | # #################################### 2 | # SpeechUT Large model # 3 | # #################################### 4 | [ $# -lt 2 ] && echo "Usage: $0 [mount=${PWD}] [world_size=32] [update_freq=4]" && exit 1 5 | [ ${PWD##*/} != SpeechUT ] && echo "Error: dir not match! Switch to SpeechUT/ and run it again!" 
&& exit 1 6 | DATA_DIR=$1 7 | TEXT_DATA_DIR=$2 8 | mount=$3 9 | world_size=$4 10 | update_freq=$5 11 | [ -z $mount ] && mount=${PWD} 12 | [ -z $world_size ] && world_size=32 13 | [ -z $update_freq ] && update_freq=4 14 | 15 | CODE_ROOT=${PWD} 16 | MODEL_DIR="${mount}/exp/pretrain/large_speechut4asr_${world_size}gpu_${update_freq}accum" 17 | [ -d $MODEL_DIR ] || mkdir -p $MODEL_DIR 18 | 19 | python $CODE_ROOT/fairseq/fairseq_cli/hydra_train.py \ 20 | --config-dir $CODE_ROOT/speechut/config/pretrain \ 21 | --config-name speechut_large_librilight \ 22 | common.user_dir=$CODE_ROOT/speechut \ 23 | \ 24 | task.labels='["km"]' \ 25 | model.label_rate=50 \ 26 | task.data=$DATA_DIR \ 27 | task.label_dir=$DATA_DIR \ 28 | task.text_cfg.text_data=$TEXT_DATA_DIR \ 29 | \ 30 | dataset.train_subset=\"train_small+pseudo_libritext.kmu-ltr\" \ 31 | dataset.valid_subset=\"dev_clean+dev.kmu-ltr\" \ 32 | dataset.num_workers=0 \ 33 | dataset.max_tokens=900000 \ 34 | distributed_training.distributed_world_size=${world_size} \ 35 | optimization.update_freq=[${update_freq}] \ 36 | \ 37 | common.tensorboard_logdir=$MODEL_DIR \ 38 | checkpoint.save_dir=$MODEL_DIR \ 39 | hydra.run.dir=$MODEL_DIR \ 40 | hydra.job.name=large_speechut4asr_${world_size}gpu_${update_freq}accum 41 | -------------------------------------------------------------------------------- /SpeechUT/speechut/scripts/tune_speechut_asr/finetune960h_large_edctc.sh: -------------------------------------------------------------------------------- 1 | # #################################### 2 | # SpeechUT Large model # 3 | # #################################### 4 | [ $# -lt 3 ] && echo "Usage: $0 [mount=${PWD}] [world_size=8] [update_freq=3]" && exit 1 5 | [ ${PWD##*/} != SpeechUT ] && echo "Error: dir not match! Switch to SpeechUT/ and run it again!" 
&& exit 1 6 | 7 | w2v_path=$1 8 | DATA_DIR=$2 9 | cpt=$3 10 | mount=$4 11 | world_size=$5 12 | update_freq=$6 13 | [ -z $mount ] && mount=${PWD} 14 | [ -z $world_size ] && world_size=8 15 | [ -z $update_freq ] && update_freq=3 16 | 17 | CODE_ROOT=${PWD} 18 | 19 | exp_name=${w2v_path%/*} 20 | exp_name=${exp_name##*/} 21 | MODEL_DIR="${mount}/exp/finetune_asr/$exp_name/960h_edctc80k_from_${cpt}_bz3.3m_lr1e-5" 22 | [ -d $MODEL_DIR ] || mkdir -p $MODEL_DIR 23 | 24 | python $CODE_ROOT/fairseq/fairseq_cli/hydra_train.py \ 25 | --config-dir $CODE_ROOT/speechut/config/finetune_asr \ 26 | --config-name speechut_large_960h \ 27 | common.user_dir=$CODE_ROOT/speechut \ 28 | \ 29 | task.data=$DATA_DIR \ 30 | task.label_dir=$DATA_DIR \ 31 | model.w2v_path=${w2v_path} \ 32 | \ 33 | optimization.lr=[0.00001] \ 34 | optimization.max_update=80000 \ 35 | dataset.max_tokens=1100000 \ 36 | optimization.update_freq=[${update_freq}] \ 37 | distributed_training.distributed_world_size=${world_size} \ 38 | \ 39 | dataset.train_subset="train_960" \ 40 | dataset.valid_subset="dev_other" \ 41 | \ 42 | common.tensorboard_logdir=$MODEL_DIR \ 43 | checkpoint.save_dir=$MODEL_DIR \ 44 | hydra.run.dir=$MODEL_DIR \ 45 | hydra.job.name=960h_edctc80k_from_${cpt}_bz3.3m_lr1e-5 46 | -------------------------------------------------------------------------------- /SpeechUT/speechut/scripts/tune_speechut_asr/finetune_base_edctc.sh: -------------------------------------------------------------------------------- 1 | # #################################### 2 | # SpeechUT Base model # 3 | # #################################### 4 | [ $# -lt 3 ] && echo "Usage: $0 [mount=${PWD}] [world_size=8] [update_freq=2]" && exit 1 5 | [ ${PWD##*/} != SpeechUT ] && echo "Error: dir not match! Switch to SpeechUT/ and run it again!" 
&& exit 1 6 | 7 | w2v_path=$1 8 | DATA_DIR=$2 9 | cpt=$3 10 | mount=$4 11 | world_size=$5 12 | update_freq=$6 13 | [ -z $mount ] && mount=${PWD} 14 | [ -z $world_size ] && world_size=8 15 | [ -z $update_freq ] && update_freq=2 16 | 17 | CODE_ROOT=${PWD} 18 | 19 | exp_name=${w2v_path%/*} 20 | exp_name=${exp_name##*/} 21 | MODEL_DIR="${mount}/exp/finetune_asr/$exp_name/edctc40k_from_${cpt}_bz2.6m_lr1e-5" 22 | [ -d $MODEL_DIR ] || mkdir -p $MODEL_DIR 23 | 24 | python $CODE_ROOT/fairseq/fairseq_cli/hydra_train.py \ 25 | --config-dir $CODE_ROOT/speechut/config/finetune_asr \ 26 | --config-name speechut_base_100h \ 27 | common.user_dir=$CODE_ROOT/speechut \ 28 | \ 29 | task.data=$DATA_DIR \ 30 | task.label_dir=$DATA_DIR \ 31 | model.w2v_path=${w2v_path} \ 32 | \ 33 | optimization.lr=[0.00001] \ 34 | optimization.max_update=40000 \ 35 | dataset.max_tokens=1300000 \ 36 | optimization.update_freq=[${update_freq}] \ 37 | distributed_training.distributed_world_size=${world_size} \ 38 | \ 39 | dataset.train_subset="train_clean_100" \ 40 | dataset.valid_subset="dev_other" \ 41 | \ 42 | common.tensorboard_logdir=$MODEL_DIR \ 43 | checkpoint.save_dir=$MODEL_DIR \ 44 | hydra.run.dir=$MODEL_DIR \ 45 | hydra.job.name=edctc40k_from_${cpt}_bz2.6m_lr1e-5 46 | -------------------------------------------------------------------------------- /SpeechUT/speechut/scripts/tune_speechut_asr/inference_edctc.sh: -------------------------------------------------------------------------------- 1 | ##################################### 2 | # SpeechUT ASR model # 3 | ##################################### 4 | [ $# -lt 2 ] && echo "Usage: $0 [gen-set=dev_other] [beam_size=10] [ctc_weight=0.2] [--normalize]" && exit 1 5 | [ ${PWD##*/} != SpeechUT ] && echo "Error: dir not match! Switch to SpeechUT/ and run it again!" && exit 1 6 | 7 | model_path=$1 8 | DATA_DIR=$2 9 | gen_set=$3 10 | beam_size=$4 11 | ctc_weight=$5 12 | extra=$6 13 | [ -z $extra ] && echo "Assert decoding base model! If you are decoding large model, please add '--normalize' at the end..." 14 | [ -z $gen_set ] && gen_set="dev_other" 15 | [ -z $beam_size ] && beam_size=10 16 | [ -z $ctc_weight ] && ctc_weight=0.2 17 | [ $ctc_weight == 0 ] && [ $beam_size != 1 ] && echo "Change beam size to 1 as no ctc-decoding used..." && beam_size=1 18 | [ $ctc_weight != 0 ] && extra="$extra --batch-size 1" 19 | 20 | src_dir=${model_path%/*} 21 | cpt=${model_path##*/} 22 | cpt=${cpt%.*} 23 | 24 | CODE_ROOT=${PWD} 25 | 26 | for subset in ${gen_set//,/ }; do 27 | results_path=$src_dir/decode_${cpt}/beam${beam_size}_ctc${ctc_weight}/${subset}_${world_size}_${rank} 28 | [ ! 
-d $results_path ] && mkdir -p $results_path 29 | 30 | python $CODE_ROOT/fairseq/fairseq_cli/generate.py $DATA_DIR \ 31 | --user-dir $CODE_ROOT/speechut \ 32 | --label-dir ${DATA_DIR} \ 33 | --labels '["ltr"]' \ 34 | --single-target \ 35 | --post-process letter \ 36 | --gen-subset ${subset} \ 37 | --max-tokens 2000000 \ 38 | \ 39 | --task joint_sc2t_pretraining \ 40 | --add-decoder-target \ 41 | --fine-tuning \ 42 | --pad-audio \ 43 | --random-crop \ 44 | \ 45 | --ctc-weight ${ctc_weight} $extra \ 46 | --beam ${beam_size} \ 47 | \ 48 | --path ${model_path} \ 49 | --results-path $results_path \ 50 | \ 51 | --scoring wer --max-len-a 0.00078125 --max-len-b 200 \ 52 | & 53 | done 54 | wait 55 | 56 | 57 | for subset in ${gen_set//,/ }; do 58 | results_path=$src_dir/decode_${cpt}/beam${beam_size}_ctc${ctc_weight}/${subset}_${world_size}_${rank} 59 | echo $results_path 60 | tail -n 1 $results_path/generate-*.txt 61 | done 62 | -------------------------------------------------------------------------------- /SpeechUT/speechut/scripts/tune_speechut_asr/inference_edctclm.sh: -------------------------------------------------------------------------------- 1 | ##################################### 2 | # SpeechUT ASR model # 3 | ##################################### 4 | [ $# -lt 2 ] && echo "Usage: $0 [gen-set=dev_other] [beam_size=30] [ctc_weight=0.3] [lm_weight=0.7] [lm_path] [--normalize]" && exit 1 5 | [ ${PWD##*/} != SpeechUT ] && echo "Error: dir not match! Switch to SpeechUT/ and run it again!" && exit 1 6 | 7 | model_path=$1 8 | DATA_DIR=$2 9 | gen_set=$3 10 | beam_size=$4 11 | ctc_weight=$5 12 | lm_weight=$6 13 | lm_path=$7 14 | extra=$8 15 | [ -z $extra ] && echo "Assert decoding base model! If you are decoding large model, please add '--normalize' at the end..." 16 | [ -z $gen_set ] && gen_set="dev_other" 17 | [ -z $beam_size ] && beam_size=30 18 | [ -z $ctc_weight ] && ctc_weight=0.3 19 | [ -z $lm_weight ] && lm_weight=0.7 20 | [ -z $lm_path ] && lm_path="/mnt/default/v-junyiao/librispeech/lm/lm_ctc_form/checkpoint_best.pt" 21 | [ $ctc_weight == 0 ] && [ $beam_size != 1 ] && echo "Change beam size to 1 and lm_weight to 0 as no ctc-decoding used..." && beam_size=1 && lm_weight=0 22 | [ $ctc_weight != 0 ] && extra="$extra --batch-size 1" 23 | 24 | src_dir=${model_path%/*} 25 | cpt=${model_path##*/} 26 | cpt=${cpt%.*} 27 | 28 | CODE_ROOT=${PWD} 29 | 30 | for subset in ${gen_set//,/ }; do 31 | results_path=$src_dir/decode_${cpt}/beam${beam_size}_ctc${ctc_weight}_lm${lm_weight}/${subset}_${world_size}_${rank} 32 | [ ! 
-d $results_path ] && mkdir -p $results_path 33 | 34 | python $CODE_ROOT/fairseq/fairseq_cli/generate.py $DATA_DIR \ 35 | --user-dir $CODE_ROOT/speechut \ 36 | --label-dir ${DATA_DIR} \ 37 | --labels '["ltr"]' \ 38 | --single-target \ 39 | --post-process letter \ 40 | --gen-subset ${subset} \ 41 | --max-tokens 800000 \ 42 | \ 43 | --task joint_sc2t_pretraining \ 44 | --add-decoder-target \ 45 | --fine-tuning \ 46 | --pad-audio \ 47 | --random-crop \ 48 | \ 49 | --ctc-weight ${ctc_weight} $extra \ 50 | --lm-weight ${lm_weight} --lm-path ${lm_path} \ 51 | --beam ${beam_size} \ 52 | \ 53 | --path ${model_path} \ 54 | --results-path ${results_path} \ 55 | \ 56 | --scoring wer --max-len-a 0.00078125 --max-len-b 200 \ 57 | & 58 | done 59 | wait 60 | 61 | 62 | for subset in ${gen_set//,/ }; do 63 | results_path=$src_dir/decode_${cpt}/beam${beam_size}_ctc${ctc_weight}_lm${lm_weight}/${subset}_${world_size}_${rank} 64 | echo $results_path 65 | tail -n 1 $results_path/generate-*.txt 66 | done 67 | -------------------------------------------------------------------------------- /SpeechUT/speechut/scripts/tune_speechut_st/inference_st.sh: -------------------------------------------------------------------------------- 1 | # #################################### 2 | # SpeechUT Base model # 3 | # #################################### 4 | [ $# -lt 3 ] && echo "Usage: $0 [gen-set=dev] [beam_size=10] [lenpen=1.0]" && exit 0 5 | [ ${PWD##*/} != SpeechUT ] && echo "Error: dir not match! Switch to SpeechUT/ and run it again!" && exit 1 6 | 7 | model_path=$1 8 | DATA_DIR=$2 9 | lang=$3 10 | gen_set=$4 11 | beam_size=$5 12 | lenpen=$6 13 | [ -z $gen_set ] && gen_set="dev" 14 | [ -z $beam_size ] && beam_size=10 15 | [ -z $lenpen ] && lenpen=1 16 | src_dir=${model_path%/*} 17 | cpt=${model_path##*/} 18 | cpt=${cpt%.*} 19 | 20 | CODE_ROOT=${PWD} 21 | results_path=$src_dir/decode_${cpt}_beam${beam_size}/${gen_set} 22 | [ ! -d $results_path ] && mkdir -p $results_path 23 | 24 | python $CODE_ROOT/fairseq/fairseq_cli/generate.py $DATA_DIR \ 25 | --gen-subset ${gen_set}_st \ 26 | --max-tokens 2000000 \ 27 | --max-source-positions 2000000 \ 28 | --num-workers 0 \ 29 | \ 30 | --user-dir $CODE_ROOT/speechut \ 31 | --task speech_to_text \ 32 | --config-yaml config_en${lang}.yaml \ 33 | \ 34 | --path ${model_path} \ 35 | --results-path $results_path \ 36 | \ 37 | --scoring sacrebleu --max-len-a 0 --max-len-b 512 \ 38 | --beam ${beam_size} \ 39 | --lenpen $lenpen \ 40 | # --model-overrides "{'model':{'w2v_path':'/path/to/your/pretrained/model.pt'}}" \ 41 | 42 | echo $results_path 43 | tail -n 1 $results_path/generate-*.txt 44 | sleep 1s 45 | -------------------------------------------------------------------------------- /VATLM/vat_hubert/requirements.txt: -------------------------------------------------------------------------------- 1 | python-speech-features==0.6 2 | scipy==1.5.4 3 | opencv-python==4.5.4.60 4 | sentencepiece==0.1.96 5 | editdistance==0.6.0 6 | kaldiio==2.17.2 -------------------------------------------------------------------------------- /VATLM/vat_hubert/vathubert/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 
5 | 6 | # from .hubert import * # noqa 7 | # from .hubert_asr import * # noqa 8 | # from .hubert_dataset import * 9 | # from .hubert_pretraining import * 10 | # from .hubert_criterion import * 11 | from . import data, tasks, criterions, models -------------------------------------------------------------------------------- /VATLM/vat_hubert/vathubert/conf/s2s_decode.yaml: -------------------------------------------------------------------------------- 1 | common: 2 | user_dir: ??? 3 | 4 | generation: 5 | beam: 50 6 | max_len_a: 1.0 7 | max_len_b: 0 8 | lenpen: 1.0 9 | lm_weight: 0 10 | 11 | common_eval: 12 | results_path: ??? 13 | path: ??? 14 | 15 | dataset: 16 | max_tokens: 1000 17 | gen_subset: valid 18 | num_workers: 0 19 | 20 | override: 21 | noise_prob: 0.0 22 | noise_snr: 0 23 | modalities: ??? 24 | -------------------------------------------------------------------------------- /VATLM/vat_hubert/vathubert/criterions/__init__.py: -------------------------------------------------------------------------------- 1 | import importlib 2 | import os 3 | 4 | for file in os.listdir(os.path.dirname(__file__)): 5 | if file.endswith(".py") and not file.startswith("_"): 6 | criterion_name = file[: file.find(".py")] 7 | importlib.import_module( 8 | "vathubert.criterions." + criterion_name 9 | ) 10 | -------------------------------------------------------------------------------- /VATLM/vat_hubert/vathubert/decode_avhubert_lrs3.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | decode_path=/path/to/finetuned_model 4 | finetuned_model=checkpoint_best.pt 5 | beam=50 6 | data=$1 7 | [ -z $data ] && data="test" 8 | 9 | python -B infer_s2s.py --config-dir /path/to/vat_hubert/vathubert/conf/ --config-name s2s_decode.yaml \ 10 | dataset.gen_subset=${data} common_eval.path=${decode_path}/checkpoints/${finetuned_model} \ 11 | common_eval.results_path=${decode_path}/${finetuned_model}_${data}_video_beam${beam} \ 12 | override.modalities=["video"] \ 13 | common.user_dir=/path/to/vat_hubert/vathubert \ 14 | override.data=/path/to/data \ 15 | override.label_dir=/path/to/data \ 16 | generation.beam=${beam} 17 | 18 | -------------------------------------------------------------------------------- /VATLM/vat_hubert/vathubert/scripts/finetune_avsr/base_lrs3_finetune30_av.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ngpu=$1 4 | updatefreq=$2 5 | max_tokens=$3 6 | pretrained_model_path=$4 7 | save_path=$5 8 | 9 | python /path/to/fairseq/fairseq_cli/hydra_train.py \ 10 | --config-dir /path/to/vat_hubert/vathubert/conf/finetune --config-name base_lrs3_30h_av.yaml \ 11 | task.data=/path/to/30h_data_tsv \ 12 | task.label_dir=/path/to/30h_data_tsv \ 13 | task.tokenizer_bpe_model=/path/to/sentencepiece/model \ 14 | task.modalities=["audio","video"] \ 15 | model.w2v_path=${pretrained_model_path} \ 16 | hydra.run.dir=${save_path} \ 17 | common.user_dir=/path/to/vat_hubert/vathubert \ 18 | distributed_training.distributed_world_size=${ngpu} \ 19 | distributed_training.ddp_backend="no_c10d" \ 20 | optimization.update_freq=[${updatefreq}] \ 21 | dataset.max_tokens=${max_tokens} \ 22 | +task.use_supervised_data=False \ 23 | +task.use_extra_textdata=False \ 24 | +task.use_extra_audiodata=False \ 25 | 26 | 27 | 28 | -------------------------------------------------------------------------------- /VATLM/vat_hubert/vathubert/scripts/finetune_avsr/base_vox_finetune30_av.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ngpu=$1 4 | updatefreq=$2 5 | max_tokens=$3 6 | pretrained_model_path=$4 7 | save_path=$5 8 | 9 | python /path/to/fairseq/fairseq_cli/hydra_train.py \ 10 | --config-dir /path/to/vat_hubert/vathubert/conf/finetune --config-name base_vox_30h_av.yaml \ 11 | task.data=/path/to/30h_data_tsv \ 12 | task.label_dir=/path/to/30h_data_tsv \ 13 | task.tokenizer_bpe_model=/path/to/sentencepiece/model \ 14 | task.modalities=["audio","video"] \ 15 | model.w2v_path=${pretrained_model_path} \ 16 | hydra.run.dir=${save_path} \ 17 | common.user_dir=/path/to/vat_hubert/vathubert \ 18 | distributed_training.distributed_world_size=${ngpu} \ 19 | distributed_training.ddp_backend="no_c10d" \ 20 | optimization.update_freq=[${updatefreq}] \ 21 | dataset.max_tokens=${max_tokens} \ 22 | +task.use_supervised_data=False \ 23 | +task.use_extra_textdata=False \ 24 | +task.use_extra_audiodata=False \ 25 | 26 | 27 | 28 | 29 | -------------------------------------------------------------------------------- /VATLM/vat_hubert/vathubert/scripts/finetune_avsr/base_vox_finetune433_av.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ngpu=$1 4 | updatefreq=$2 5 | max_tokens=$3 6 | pretrained_model_path=$4 7 | save_path=$5 8 | 9 | python /path/to/fairseq/fairseq_cli/hydra_train.py \ 10 | --config-dir /path/to/vat_hubert/vathubert/conf/finetune --config-name base_vox_433h_av.yaml \ 11 | task.data=/path/to/433h_data_tsv \ 12 | task.label_dir=/path/to/433h_data_tsv \ 13 | task.tokenizer_bpe_model=/path/to/sentencepiece/model \ 14 | task.modalities=["audio","video"] \ 15 | model.w2v_path=${pretrained_model_path} \ 16 | hydra.run.dir=${save_path} \ 17 | common.user_dir=/path/to/vat_hubert/vathubert \ 18 | distributed_training.distributed_world_size=${ngpu} \ 19 | distributed_training.ddp_backend="no_c10d" \ 20 | optimization.update_freq=[${updatefreq}] \ 21 | dataset.max_tokens=${max_tokens} \ 22 | +task.use_supervised_data=False \ 23 | +task.use_extra_textdata=False \ 24 | +task.use_extra_audiodata=False \ 25 | 26 | 27 | 28 | 29 | -------------------------------------------------------------------------------- /VATLM/vat_hubert/vathubert/scripts/finetune_avsr/large_vox_finetune30_av.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ngpu=$1 4 | updatefreq=$2 5 | max_tokens=$3 6 | pretrained_model_path=$4 7 | save_path=$5 8 | 9 | python /path/to/fairseq/fairseq_cli/hydra_train.py \ 10 | --config-dir /path/to/vat_hubert/vathubert/conf/finetune --config-name large_vox_30h_av.yaml \ 11 | task.data=/path/to/30h_data_tsv \ 12 | task.label_dir=/path/to/30h_data_tsv \ 13 | task.tokenizer_bpe_model=/path/to/sentencepiece/model \ 14 | task.modalities=["audio","video"] \ 15 | model.w2v_path=${pretrained_model_path} \ 16 | hydra.run.dir=${save_path} \ 17 | common.user_dir=/path/to/vat_hubert/vathubert \ 18 | distributed_training.distributed_world_size=${ngpu} \ 19 | distributed_training.ddp_backend="no_c10d" \ 20 | optimization.update_freq=[${updatefreq}] \ 21 | dataset.max_tokens=${max_tokens} \ 22 | +task.use_supervised_data=False \ 23 | +task.use_extra_textdata=False \ 24 | +task.use_extra_audiodata=False \ -------------------------------------------------------------------------------- /VATLM/vat_hubert/vathubert/scripts/finetune_avsr/large_vox_finetune433_av.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ngpu=$1 4 | updatefreq=$2 5 | max_tokens=$3 6 | pretrained_model_path=$4 7 | save_path=$5 8 | 9 | python /path/to/fairseq/fairseq_cli/hydra_train.py \ 10 | --config-dir /path/to/vat_hubert/vathubert/conf/finetune --config-name large_vox_433h_av.yaml \ 11 | task.data=/path/to/433h_data_tsv \ 12 | task.label_dir=/path/to/433h_data_tsv \ 13 | task.tokenizer_bpe_model=/path/to/sentencepiece/model \ 14 | task.modalities=["audio","video"] \ 15 | model.w2v_path=${pretrained_model_path} \ 16 | hydra.run.dir=${save_path} \ 17 | common.user_dir=/path/to/vat_hubert/vathubert \ 18 | distributed_training.distributed_world_size=${ngpu} \ 19 | distributed_training.ddp_backend="no_c10d" \ 20 | optimization.update_freq=[${updatefreq}] \ 21 | dataset.max_tokens=${max_tokens} \ 22 | +task.use_supervised_data=False \ 23 | +task.use_extra_textdata=False \ 24 | +task.use_extra_audiodata=False \ 25 | 26 | 27 | 28 | 29 | -------------------------------------------------------------------------------- /VATLM/vat_hubert/vathubert/scripts/finetune_vsr/base_lrs3_finetune30_v.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ngpu=$1 4 | updatefreq=$2 5 | max_tokens=$3 6 | pretrained_model_path=$4 7 | save_path=$5 8 | 9 | python /path/to/fairseq/fairseq_cli/hydra_train.py \ 10 | --config-dir /path/to/vat_hubert/vathubert/conf/finetune --config-name base_lrs3_30h_v.yaml \ 11 | task.data=/path/to/30h_data_tsv \ 12 | task.label_dir=/path/to/30h_data_tsv \ 13 | task.tokenizer_bpe_model=/path/to/sentencepiece/model \ 14 | task.modalities=["video"] \ 15 | model.w2v_path=${pretrained_model_path} \ 16 | hydra.run.dir=${save_path} \ 17 | common.user_dir=/path/to/vat_hubert/vathubert \ 18 | distributed_training.distributed_world_size=${ngpu} \ 19 | distributed_training.ddp_backend="no_c10d" \ 20 | optimization.update_freq=[${updatefreq}] \ 21 | dataset.max_tokens=${max_tokens} \ 22 | +task.use_supervised_data=False \ 23 | +task.use_extra_textdata=False \ 24 | +task.use_extra_audiodata=False \ 25 | 26 | 27 | 28 | -------------------------------------------------------------------------------- /VATLM/vat_hubert/vathubert/scripts/finetune_vsr/base_vox_finetune30_v.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ngpu=$1 4 | updatefreq=$2 5 | max_tokens=$3 6 | pretrained_model_path=$4 7 | save_path=$5 8 | 9 | python /path/to/fairseq/fairseq_cli/hydra_train.py \ 10 | --config-dir /path/to/vat_hubert/vathubert/conf/finetune --config-name base_vox_30h_v.yaml \ 11 | task.data=/path/to/30h_data_tsv \ 12 | task.label_dir=/path/to/30h_data_tsv \ 13 | task.tokenizer_bpe_model=/path/to/sentencepiece/model \ 14 | task.modalities=["video"] \ 15 | model.w2v_path=${pretrained_model_path} \ 16 | hydra.run.dir=${save_path} \ 17 | common.user_dir=/path/to/vat_hubert/vathubert \ 18 | distributed_training.distributed_world_size=${ngpu} \ 19 | distributed_training.ddp_backend="no_c10d" \ 20 | optimization.update_freq=[${updatefreq}] \ 21 | dataset.max_tokens=${max_tokens} \ 22 | +task.use_supervised_data=False \ 23 | +task.use_extra_textdata=False \ 24 | +task.use_extra_audiodata=False \ 25 | 26 | 27 | 28 | -------------------------------------------------------------------------------- /VATLM/vat_hubert/vathubert/scripts/finetune_vsr/base_vox_finetune433_v.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ngpu=$1 4 | updatefreq=$2 5 | max_tokens=$3 6 | pretrained_model_path=$4 7 | save_path=$5 8 | 9 | python /path/to/fairseq/fairseq_cli/hydra_train.py \ 10 | --config-dir /path/to/vat_hubert/vathubert/conf/finetune --config-name base_vox_433h_v.yaml \ 11 | task.data=/path/to/433h_data_tsv \ 12 | task.label_dir=/path/to/433h_data_tsv \ 13 | task.tokenizer_bpe_model=/path/to/sentencepiece/model \ 14 | task.modalities=["video"] \ 15 | model.w2v_path=${pretrained_model_path} \ 16 | hydra.run.dir=${save_path} \ 17 | common.user_dir=/path/to/vat_hubert/vathubert \ 18 | distributed_training.distributed_world_size=${ngpu} \ 19 | distributed_training.ddp_backend="no_c10d" \ 20 | optimization.update_freq=[${updatefreq}] \ 21 | dataset.max_tokens=${max_tokens} \ 22 | +task.use_supervised_data=False \ 23 | +task.use_extra_textdata=False \ 24 | +task.use_extra_audiodata=False \ 25 | 26 | 27 | 28 | 29 | -------------------------------------------------------------------------------- /VATLM/vat_hubert/vathubert/scripts/finetune_vsr/large_vox_finetune30_v.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ngpu=$1 4 | updatefreq=$2 5 | max_tokens=$3 6 | pretrained_model_path=$4 7 | save_path=$5 8 | 9 | python /path/to/fairseq/fairseq_cli/hydra_train.py \ 10 | --config-dir /path/to/vat_hubert/vathubert/conf/finetune --config-name large_vox_30h_v.yaml \ 11 | task.data=/path/to/30h_data_tsv \ 12 | task.label_dir=/path/to/30h_data_tsv \ 13 | task.tokenizer_bpe_model=/path/to/sentencepiece/model \ 14 | task.modalities=["video"] \ 15 | model.w2v_path=${pretrained_model_path} \ 16 | hydra.run.dir=${save_path} \ 17 | common.user_dir=/path/to/vat_hubert/vathubert \ 18 | distributed_training.distributed_world_size=${ngpu} \ 19 | distributed_training.ddp_backend="no_c10d" \ 20 | optimization.update_freq=[${updatefreq}] \ 21 | dataset.max_tokens=${max_tokens} \ 22 | +task.use_supervised_data=False \ 23 | +task.use_extra_textdata=False \ 24 | +task.use_extra_audiodata=False \ 25 | 26 | 27 | 28 | -------------------------------------------------------------------------------- /VATLM/vat_hubert/vathubert/scripts/finetune_vsr/large_vox_finetune433_v.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ngpu=$1 4 | updatefreq=$2 5 | max_tokens=$3 6 | pretrained_model_path=$4 7 | save_path=$5 8 | 9 | python /path/to/fairseq/fairseq_cli/hydra_train.py \ 10 | --config-dir /path/to/vat_hubert/vathubert/conf/finetune --config-name large_vox_433h_v.yaml \ 11 | task.data=/path/to/433h_data_tsv \ 12 | task.label_dir=/path/to/433h_data_tsv \ 13 | task.tokenizer_bpe_model=/path/to/sentencepiece/model \ 14 | task.modalities=["video"] \ 15 | model.w2v_path=${pretrained_model_path} \ 16 | hydra.run.dir=${save_path} \ 17 | common.user_dir=/path/to/vat_hubert/vathubert \ 18 | distributed_training.distributed_world_size=${ngpu} \ 19 | distributed_training.ddp_backend="no_c10d" \ 20 | optimization.update_freq=[${updatefreq}] \ 21 | dataset.max_tokens=${max_tokens} \ 22 | +task.use_supervised_data=False \ 23 | +task.use_extra_textdata=False \ 24 | +task.use_extra_audiodata=False \ 25 | 26 | 27 | 28 | 29 | -------------------------------------------------------------------------------- /VATLM/vat_hubert/vathubert/scripts/pretrain/base_lsr3_pretrain_iter5.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | ngpu=$1 3 | updatefreq=$2 4 | datapath=/LocalData/vatlm_related/fbankdata 5 | save_path=$3 6 | 7 | python /path/to/fairseq/fairseq_cli/hydra_train.py \ 8 | --config-dir /path/to/vat_hubert/vathubert/conf/pretrain --config-name base_lrs3_iter5.yaml \ 9 | task.data=${datapath}/433pre_lrs3_433h_tsv \ 10 | task.label_dir=${datapath}/433pre_lrs3_433h_tsv \ 11 | +task.sup_data_path=${datapath}/433pre_tedv3_phone_concat_tsv2 \ 12 | +task.sup_manifest=${datapath}/433pre_tedv3_phone_concat_tsv2 \ 13 | +task.onlytext_manifest=${datapath}/433pre_cantab_tsv \ 14 | +task.onlyaudio_manifest=${datapath}/433pre_giga_tsv_km \ 15 | hydra.run.dir=${save_path} \ 16 | common.user_dir=/path/to/vat_hubert/vathubert \ 17 | distributed_training.distributed_world_size=${ngpu} \ 18 | optimization.update_freq=[${updatefreq}] \ 19 | dataset.max_tokens=3000 \ 20 | model.label_rate=25 \ 21 | common.log_interval=200 \ 22 | checkpoint.save_interval=5 \ 23 | +task.sample_distributions=\"0.08,0.1,0.15,0.15\" \ 24 | +criterion.banlance_loss_weights=[1.0,1.0] \ 25 | dataset.data_buffer_size=40 \ 26 | +task.use_supervised_data=True \ 27 | +task.use_extra_textdata=True \ 28 | +task.use_extra_audiodata=True \ 29 | 30 | 31 | -------------------------------------------------------------------------------- /VATLM/vat_hubert/vathubert/scripts/pretrain/base_vox_pretrain_iter5.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | ngpu=$1 3 | updatefreq=$2 4 | datapath=/LocalData/vatlm_related/fbankdata 5 | save_path=$3 6 | 7 | 8 | python /path/to/fairseq/fairseq_cli/hydra_train.py \ 9 | --config-dir /path/to/vat_hubert/vathubert/conf/pretrain --config-name base_vox_iter5.yaml \ 10 | task.data=${datapath}/fbank_lrs3_vox_tsv \ 11 | task.label_dir=${datapath}/fbank_lrs3_vox_tsv \ 12 | +task.sup_data_path=${datapath}/fbank_tedv3_phone_concat_vox_tsv \ 13 | +task.sup_manifest=${datapath}/fbank_tedv3_phone_concat_vox_tsv \ 14 | +task.onlytext_manifest=${datapath}/cantab2_vox_tsv \ 15 | +task.onlyaudio_manifest=${datapath}/fbank_giga_vox_tsv_km \ 16 | hydra.run.dir=${save_path} \ 17 | common.user_dir=/path/to/vat_hubert/vathubert \ 18 | distributed_training.distributed_world_size=${ngpu} \ 19 | optimization.update_freq=[${updatefreq}] \ 20 | dataset.max_tokens=3000 \ 21 | model.label_rate=25 \ 22 | common.log_interval=200 \ 23 | checkpoint.save_interval=5 \ 24 | +task.sample_distributions=\"0.13,0.15,0.32,0.3\" \ 25 | +criterion.banlance_loss_weights=[1.0,1.0] \ 26 | dataset.data_buffer_size=40 \ 27 | +task.use_supervised_data=True \ 28 | +task.use_extra_textdata=True \ 29 | +task.use_extra_audiodata=True \ 30 | 31 | -------------------------------------------------------------------------------- /VATLM/vat_hubert/vathubert/scripts/pretrain/large_vox_pretrain_iter5.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | unset WORLD_SIZE 3 | ngpu=$1 4 | updatefreq=$2 5 | datapath=/LocalData/vatlm_related/fbankdata 6 | save_path=$3 7 | 8 | 9 | python /path/to/fairseq/fairseq_cli/hydra_train.py \ 10 | --config-dir /path/to/vat_hubert/vathubert/conf/pretrain --config-name large_vox_iter5.yaml \ 11 | task.data=${datapath}/fbank_lrs3_vox_tsv \ 12 | task.label_dir=${datapath}/fbank_lrs3_vox_tsv \ 13 | +task.sup_data_path=${datapath}/fbank_tedv3_phone_concat_vox_tsv \ 14 | +task.sup_manifest=${datapath}/fbank_tedv3_phone_concat_vox_tsv \ 15 | 
+task.onlytext_manifest=${datapath}/cantab2_vox_tsv \ 16 | +task.onlyaudio_manifest=${datapath}/fbank_giga_vox_tsv_km \ 17 | hydra.run.dir=${save_path} \ 18 | common.user_dir=/path/to/vat_hubert/vathubert \ 19 | distributed_training.distributed_world_size=${ngpu} \ 20 | optimization.update_freq=[${updatefreq}] \ 21 | dataset.max_tokens=3000 \ 22 | model.label_rate=25 \ 23 | common.log_interval=200 \ 24 | checkpoint.save_interval=5 \ 25 | +task.sample_distributions=\"0.13,0.15,0.32,0.3\" \ 26 | +criterion.banlance_loss_weights=[1.0,1.0] \ 27 | dataset.data_buffer_size=40 \ 28 | +task.use_supervised_data=True \ 29 | +task.use_extra_textdata=True \ 30 | +task.use_extra_audiodata=True \ 31 | 32 | -------------------------------------------------------------------------------- /WavLLM/download/download.sh: -------------------------------------------------------------------------------- 1 | stage=$1 2 | # WavLLM model 3 | if [ "$stage" -eq 0 ]; then 4 | url_p1="https://valle.blob.core.windows.net/share/wavllm/fi" 5 | url_p2="nal.pt?sv=2021-10-04&st=2024-04-24T04%3A50%3A" 6 | url_p3="15Z&se=2025-04-25T04%3A50%3A00Z&sr=b&sp=r&si" 7 | url_p4="g=M82edjKinydPiVd86oS78ZS9L" 8 | url_p5="TVxg0%2F2om3IaEkodIo%3D" 9 | curl -o final.pt ${url_p1}${url_p2}${url_p3}${url_p4}${url_p5} 10 | else 11 | # gaokao_audio 12 | url_p1="https://valle.blob.core.windows.net/share/wavllm/ga" 13 | url_p2="okao_audio.zip?sv=2021-10-04&st=2024-04-24T04%3A58%3A" 14 | url_p3="56Z&se=2025-04-25T04%3A58%3A00Z&sr=b&sp=r&s" 15 | url_p4="ig=0ql1dkz59%2FSxRHkz1ajtC" 16 | url_p5="yfCR5Hva4UISlIfDrOO%2BRc%3D" 17 | curl -o gaokao_audio.zip ${url_p1}${url_p2}${url_p3}${url_p4}${url_p5} 18 | 19 | # gaokao_transcript 20 | url_p1="https://valle.blob.core.windows.net/share/wavllm/ga" 21 | url_p2="okao_text.zip?sv=2021-10-04&st=2024-04-24T04%3A57%3A" 22 | url_p3="37Z&se=2025-04-25T04%3A57%3A00Z&sr=b&sp=r&s" 23 | url_p4="ig=n5QKXU3F9RiP6SxHl6uVEJ" 24 | url_p5="8m7WZ3iEeOGns1BoIozvI%3D" 25 | curl -o gaokao_text.zip ${url_p1}${url_p2}${url_p3}${url_p4}${url_p5} 26 | fi -------------------------------------------------------------------------------- /WavLLM/wavllm/__init__.py: -------------------------------------------------------------------------------- 1 | from . import criterions 2 | -------------------------------------------------------------------------------- /WavLLM/wavllm/data/tokenizer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 
5 | 6 | import csv 7 | import io 8 | import logging 9 | import re 10 | from collections import defaultdict 11 | from pathlib import Path 12 | from typing import Dict, List, Optional 13 | from dataclasses import dataclass 14 | 15 | import os 16 | from sentencepiece import SentencePieceProcessor 17 | from copy import deepcopy 18 | 19 | logger = logging.getLogger(__name__) 20 | 21 | 22 | 23 | class Tokenizer: 24 | def __init__(self, model_path: str): 25 | # reload tokenizer 26 | assert os.path.isfile(model_path), model_path 27 | self.sp_model = SentencePieceProcessor(model_file=model_path) 28 | logger.info(f"Reloaded SentencePiece model from {model_path}") 29 | 30 | # BOS / EOS token IDs 31 | self.n_words: int = self.sp_model.vocab_size() 32 | self.bos_id: int = self.sp_model.bos_id() 33 | self.eos_id: int = self.sp_model.eos_id() 34 | self.pad_id: int = self.sp_model.pad_id() 35 | self.unk_id: int = self.sp_model.unk_id() 36 | logger.info(f"#words: {self.n_words} - BOS ID: {self.bos_id} - EOS ID: {self.eos_id} - PAD ID: {self.pad_id} - UNK ID: {self.unk_id}") 37 | assert self.sp_model.vocab_size() == self.sp_model.get_piece_size() 38 | 39 | def encode(self, s: str, bos: bool, eos: bool) -> List[int]: 40 | assert type(s) is str 41 | t = self.sp_model.encode(s) 42 | if bos: 43 | t = [self.bos_id] + t 44 | if eos: 45 | t = t + [self.eos_id] 46 | return t 47 | 48 | def decode(self, t: List[int]) -> str: 49 | return self.sp_model.decode(t) -------------------------------------------------------------------------------- /WavLLM/wavllm/requirements.txt: -------------------------------------------------------------------------------- 1 | fairscale==0.4.13 2 | fairseq==0.12.2 3 | numpy==1.24.3 4 | omegaconf==2.0.6 5 | sentencepiece==0.1.99 6 | torch==2.0.1 7 | transformers==4.32.1 8 | -------------------------------------------------------------------------------- /WavLLM/wavllm/scripts/inference_sft.sh: -------------------------------------------------------------------------------- 1 | export CUDA_VISIBLE_DEVICES=0 2 | export HYDRA_FULL_ERROR=1 3 | export PYTHONPATH=$$PYTHONPATH:${PWD} 4 | 5 | model_path=$1 6 | [ -z $model_path ] && model_path="?" 7 | 8 | src_dir=${model_path%/*} 9 | cpt=${model_path##*/} 10 | cpt=${cpt%.*} 11 | 12 | gen_set=$2 13 | [ -z $gen_set ] && gen_set="?" 14 | [ -z $beam_size ] && beam_size=1 15 | 16 | 17 | FAIRSEQ_ROOT=${PWD} 18 | DATA_DIR=$FAIRSEQ_ROOT/examples/wavllm/test_data 19 | 20 | for subset in $gen_set; do 21 | results_path=$src_dir/decode_${cpt}_beam${beam_size}/${subset} 22 | [ ! 
-d $results_path ] && mkdir -p $results_path 23 | 24 | python $FAIRSEQ_ROOT/examples/wavllm/inference/generate.py $DATA_DIR \ 25 | --user-dir examples/wavllm \ 26 | --tokenizer-path $FAIRSEQ_ROOT/examples/wavllm/tokenizer/tokenizer.model \ 27 | --gen-subset ${subset} \ 28 | \ 29 | --task speechllm_task \ 30 | \ 31 | --path ${model_path} \ 32 | --results-path $results_path \ 33 | \ 34 | --scoring wer \ 35 | --skip-invalid-size-inputs-valid-test \ 36 | --max-tokens 1600000 \ 37 | --sampling --beam 1 --nbest 1 --temperature 0.5 \ 38 | --max-len-a 0 --max-len-b 512 39 | done -------------------------------------------------------------------------------- /WavLLM/wavllm/test_data/CoT-task-story.tsv: -------------------------------------------------------------------------------- 1 | id audio n_frames prompt tgt_text with_speech orig_story 2 | 0 SpeechT5/WavLLM/fairseq/examples/wavllm/test_data/audio/CoT-task-story.wav 1079348 First of all, transcribe the audio recording into text, capturing every spoken word; Additionally given this audio clip and text, can you condense it into a clear, concise summary, no more than 20 words?; Lastly disregarding the sound, translate this English summary into German. Bis zum Jahr 2500 ist die Erde eine umweltfreundliche Utopie mit fortschrittlicher KI, neuronaler Vernetzung und einer perfekten Mischung aus Technologie und Natur. True In the year 2500, Earth gleamed like a sapphire, a futuristic utopia where harmony reigned. Skyscrapers, draped in lush greenery, stretched towards the heavens, their glass surfaces reflecting the tranquil azure of a pollution-free sky. Humanity had transcended past conflicts, embracing an era of shared consciousness through neural connectivity. Autonomous vehicles glided silently on solar pathways, while people mingled in serene communal spaces, their basic needs met by advanced AI that predicted and catered to their every whim. The Great Reconciliation had merged technology with nature, and in this new world, every individual thrived, their potential limited only by the expanses of their own creativity. The utopia wasn't just a place; it was the pulse of civilization, beating in perfect rhythm with the universe. -------------------------------------------------------------------------------- /WavLLM/wavllm/test_data/CoT-task.tsv: -------------------------------------------------------------------------------- 1 | id audio n_frames prompt with_speech tgt_text 2 | 0 SpeechT5/WavLLM/fairseq/examples/wavllm/test_data/audio/CoT-task.wav 214437 First of all, transcribe the audio recording into text, capturing every spoken word; Additionally given this audio clip and text, can you condense it into a clear, concise summary, no more than 20 words?; Lastly disregarding the sound, translate this English summary into German. True Drei Filme aus dem asiatisch-pazifischen Raum im Rennen in Cannes -------------------------------------------------------------------------------- /WavLLM/wavllm/test_data/II-task.tsv: -------------------------------------------------------------------------------- 1 | id audio n_frames with_speech prompt tgt_text 2 | 0 SpeechT5/WavLLM/fairseq/examples/wavllm/test_data/audio/II-task.wav 111111 True To begin, Transcribe the audio recording into text, capturing every spoken word; Subsequently, How does the woman finally decide to go home? A. By bus; B. In the man’s car; C. 
In her father’s car.; Furthermore, ignore the audio clip, What is the capital of New Zealand?; Lastly, Continue the narrative of given audio clip in a coherent and engaging way ASR+SQA+SFT+Continue -------------------------------------------------------------------------------- /WavLLM/wavllm/test_data/SQA.tsv: -------------------------------------------------------------------------------- 1 | id audio n_frames prompt tgt_text with_speech 2 | 0 SpeechT5/WavLLM/fairseq/examples/wavllm/test_data/audio/sqa.wav 111111 What will the man do next? A. Start to take exercise; B. Do as he always does; C. Change his working time. A True 3 | -------------------------------------------------------------------------------- /WavLLM/wavllm/test_data/SQQA.tsv: -------------------------------------------------------------------------------- 1 | id audio n_frames prompt tgt_text with_speech 2 | 0 SpeechT5/WavLLM/fairseq/examples/wavllm/test_data/audio/sqqa.wav 182574 The fundamental theorem of calculus is a theorem that links the concept of the derivative of a function with the concept of the integral . True -------------------------------------------------------------------------------- /WavLLM/wavllm/test_data/asr.tsv: -------------------------------------------------------------------------------- 1 | id audio n_frames prompt tgt_text with_speech 2 | 0 SpeechT5/WavLLM/fairseq/examples/wavllm/test_data/audio/asr.flac 166960 Based on the attached audio, generate a comprehensive text transcription of the spoken content. he hoped there would be stew for dinner turnips and carrots and bruised potatoes and fat mutton pieces to be ladled out in thick peppered flour fattened sauce True 3 | -------------------------------------------------------------------------------- /WavLLM/wavllm/test_data/audio/CoT-task-story.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/SpeechT5/5d66cf5f37e97f4a1999ad519537decc16d852af/WavLLM/wavllm/test_data/audio/CoT-task-story.wav -------------------------------------------------------------------------------- /WavLLM/wavllm/test_data/audio/CoT-task.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/SpeechT5/5d66cf5f37e97f4a1999ad519537decc16d852af/WavLLM/wavllm/test_data/audio/CoT-task.wav -------------------------------------------------------------------------------- /WavLLM/wavllm/test_data/audio/II-task.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/SpeechT5/5d66cf5f37e97f4a1999ad519537decc16d852af/WavLLM/wavllm/test_data/audio/II-task.wav -------------------------------------------------------------------------------- /WavLLM/wavllm/test_data/audio/asr.flac: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/SpeechT5/5d66cf5f37e97f4a1999ad519537decc16d852af/WavLLM/wavllm/test_data/audio/asr.flac -------------------------------------------------------------------------------- /WavLLM/wavllm/test_data/audio/emo.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/SpeechT5/5d66cf5f37e97f4a1999ad519537decc16d852af/WavLLM/wavllm/test_data/audio/emo.wav -------------------------------------------------------------------------------- /WavLLM/wavllm/test_data/audio/sqa.wav: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/SpeechT5/5d66cf5f37e97f4a1999ad519537decc16d852af/WavLLM/wavllm/test_data/audio/sqa.wav -------------------------------------------------------------------------------- /WavLLM/wavllm/test_data/audio/sqqa.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/SpeechT5/5d66cf5f37e97f4a1999ad519537decc16d852af/WavLLM/wavllm/test_data/audio/sqqa.wav -------------------------------------------------------------------------------- /WavLLM/wavllm/test_data/audio/st.flac: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/SpeechT5/5d66cf5f37e97f4a1999ad519537decc16d852af/WavLLM/wavllm/test_data/audio/st.flac -------------------------------------------------------------------------------- /WavLLM/wavllm/test_data/audio/sv.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/SpeechT5/5d66cf5f37e97f4a1999ad519537decc16d852af/WavLLM/wavllm/test_data/audio/sv.wav -------------------------------------------------------------------------------- /WavLLM/wavllm/test_data/dict.txt: -------------------------------------------------------------------------------- 1 | 1 1 2 | 2 2 3 | 3 3 4 | 4 4 5 | 5 5 6 | -------------------------------------------------------------------------------- /WavLLM/wavllm/test_data/emo.tsv: -------------------------------------------------------------------------------- 1 | id audio n_frames prompt tgt_text with_speech 2 | 0 SpeechT5/WavLLM/fairseq/examples/wavllm/test_data/audio/emo.wav 12345 Can you describe the emotional condition of the speaker in the provided audio clip? sad True 3 | -------------------------------------------------------------------------------- /WavLLM/wavllm/test_data/en2de.tsv: -------------------------------------------------------------------------------- 1 | id audio n_frames tgt_text prompt with_speech 2 | 0 SpeechT5/WavLLM/fairseq/examples/wavllm/test_data/audio/st.flac 34560 Sie wird schon in Ordnung sein. Translate the audio clip into German. True 3 | -------------------------------------------------------------------------------- /WavLLM/wavllm/test_data/sv.tsv: -------------------------------------------------------------------------------- 1 | id audio n_frames prompt tgt_text with_speech 2 | 0 SpeechT5/WavLLM/fairseq/examples/wavllm/test_data/audio/sv.wav 351362 Is there only one speaker in the audio clip? 
Yes True 3 | -------------------------------------------------------------------------------- /WavLLM/wavllm/tokenizer/tokenizer.model: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/SpeechT5/5d66cf5f37e97f4a1999ad519537decc16d852af/WavLLM/wavllm/tokenizer/tokenizer.model -------------------------------------------------------------------------------- /YiTrans/.gitignore: -------------------------------------------------------------------------------- 1 | **/__pycache__ 2 | 3 | -------------------------------------------------------------------------------- /YiTrans/exp_scripts/finetune_ASR/finetune_hubert24_mbart24_en.sh: -------------------------------------------------------------------------------- 1 | world_size=$1 2 | update_freq=$2 3 | [ -z $world_size ] && world_size=8 4 | [ -z $update_freq ] && update_freq=8 5 | 6 | EXP_NAME=train_iwslt_asr_hubert24_mbart24_norel 7 | SAVE_DIR=${HOME}/data/iwslt/asr_v3/${EXP_NAME} 8 | 9 | DATA_ROOT=${HOME}/dataset/iwslt_mustc 10 | LABEL_DIR=${DATA_ROOT}/fine-tune_en_bpe250k 11 | SP_PATH=${LABEL_DIR}/sentence.bpe.model 12 | retain_dict=${LABEL_DIR}/index_en_onlyMUSTC 13 | W2V_PATH=${HOME}/dataset/iwslt_mustc/pretrain_ed_model_cfg.pt 14 | 15 | TRAIN_SUBSET=train_asr_MUSTC 16 | VALID_SUBSET=dev_asr_MUSTC 17 | 18 | 19 | mbart_path="/mnt/default/v-junyiao/released_exsp/mbart50.pretrained/model.pt" 20 | hubert_path="/mnt/default/v-junyiao/speechexp/fairseq_mlst/hubert_large_librivox_released/checkpoint_last.pt" 21 | 22 | CODE_ROOT=${HOME}/code/SpeechT5/YiTrans 23 | 24 | python $CODE_ROOT/fairseq/fairseq_cli/hydra_train.py \ 25 | --config-dir $CODE_ROOT/yitrans_iwslt22/config/finetune_asr \ 26 | --config-name large_mustc \ 27 | common.user_dir=$CODE_ROOT/yitrans_iwslt22 \ 28 | distributed_training.distributed_world_size=$world_size \ 29 | optimization.update_freq=[$update_freq] \ 30 | \ 31 | dataset.max_tokens=400001 \ 32 | dataset.num_workers=0 \ 33 | optimization.max_update=120000 \ 34 | \ 35 | task._name="iwslt_joint_pretraining" \ 36 | task.data=${DATA_ROOT} \ 37 | task.label_dir=${LABEL_DIR} \ 38 | +task.store_labels=True \ 39 | task.hubert_tokenizer="sentencepiece" \ 40 | task.sp_path=${SP_PATH} \ 41 | task.max_keep_size=400000 \ 42 | criterion.dec_weight=0.5 \ 43 | \ 44 | model._name="yitrans_asr" \ 45 | model.w2v_path=${W2V_PATH} \ 46 | +model.reuse_text_emb=true \ 47 | +model.share_ctc_decoder_embed=true \ 48 | +model.retain_dict_path=${retain_dict} \ 49 | model.freeze_finetune_updates=15000 \ 50 | \ 51 | +model.no_pretrained_weights=true \ 52 | +model.use_rel_pos_enc=false \ 53 | +model.encoder_layers=24 \ 54 | +model.add_text_encoder=true \ 55 | +model.share_s2t_t2t_embeddings=false \ 56 | +model.share_enc_dec_embeddings=false \ 57 | +model.add_adaptor=false \ 58 | +model.load_pretrained_w2v_from=$hubert_path \ 59 | +model.load_pretrained_mbart_from=$mbart_path \ 60 | \ 61 | dataset.train_subset=${TRAIN_SUBSET} \ 62 | dataset.valid_subset=${VALID_SUBSET} \ 63 | checkpoint.save_dir=${SAVE_DIR} \ 64 | common.tensorboard_logdir=${SAVE_DIR} \ 65 | hydra.run.dir=${SAVE_DIR} \ 66 | hydra.job.name=${EXP_NAME} 67 | 68 | -------------------------------------------------------------------------------- /YiTrans/exp_scripts/pretrain/pretrain_pt36_adaptor_step1.sh: -------------------------------------------------------------------------------- 1 | export HYDRA_FULL_ERROR=1 2 | YiTrans=/home/v-ziqzhang/Code/SpeechT5/YiTrans 3 | DATA_DIR=/mnt/default/lozhou/speechdata/hubert_data 4 
| LABEL_DIR=${DATA_DIR}/layer9_k500_label 5 | SP_PATH=${LABEL_DIR}/spm_unigram8000.model 6 | TEXT_DATA_DIR=/mnt/default/lozhou/speechdata/text_data/v3/bin_idx_step1 7 | EXP_NAME=pretrain_pt36_addadaptor_bpecode_large_step1 8 | SAVE_DIR=${HOME}/data/speechexp/${EXP_NAME} 9 | W2V_PATH=${HOME}/data/speechexp/hubert_large_librivox_released/checkpoint_last.pt 10 | MBART_PATH=${HOME}/data/speechexp/mbart50.pretrained/model.pt 11 | 12 | python ${YiTrans}/fairseq/fairseq_cli/hydra_train.py \ 13 | --config-dir ${YiTrans}/yitrans_iwslt22/config/pretrain \ 14 | --config-name joint_large \ 15 | common.user_dir=${YiTrans}/yitrans_iwslt22 \ 16 | \ 17 | task.data=$DATA_DIR \ 18 | task.labels='["km"]' \ 19 | task.label_dir=$LABEL_DIR \ 20 | task.text_cfg.text_data=$TEXT_DATA_DIR \ 21 | +task.hubert_tokenizer="sentencepiece" \ 22 | +task.sp_path=${SP_PATH} \ 23 | \ 24 | model.label_rate=50 \ 25 | model.encoder_layers=12 \ 26 | +model.load_pretrained_w2v_from=${W2V_PATH} \ 27 | +model.load_pretrained_mbart_from=${MBART_PATH} \ 28 | \ 29 | dataset.train_subset=\"train_LS,train_MUSTC+mono_deduped_filt_sort.en_XX.en_XX,mt8corpus_filt_slct.en_XX-de_DE\" \ 30 | dataset.valid_subset=\"dev_MUSTC+valid.en_XX-de_DE,dev_MUSTC+valid.en_XX-ja_XX,dev_MUSTC+valid.en_XX-zh_CN,dev_MUSTC+dev4x.en_XX.en_XX\" \ 31 | dataset.max_tokens=300000 \ 32 | \ 33 | distributed_training.distributed_world_size=8 \ 34 | distributed_training.nprocs_per_node=8 \ 35 | optimization.update_freq=[2] \ 36 | \ 37 | common.tensorboard_logdir=$SAVE_DIR \ 38 | checkpoint.save_dir=$SAVE_DIR \ 39 | hydra.run.dir=$SAVE_DIR \ 40 | hydra.job.name=$EXP_NAME \ 41 | checkpoint.reset_optimizer=true \ 42 | checkpoint.reset_dataloader=true 43 | 44 | 45 | 46 | # dataset.train_subset=\"train_CV,train_EUR,train_LS,train_MUSTC,train_TEDLIUM,train_VP+mono_deduped_filt_sort.en_XX.en_XX,mt8corpus_filt_slct.en_XX-de_DE,mt8corpus_filt_slct.en_XX-ja_XX,mt8corpus_filt_slct.en_XX-zh_CN\" \ 47 | -------------------------------------------------------------------------------- /YiTrans/exp_scripts/pretrain/pretrain_pt36_adaptor_step2.sh: -------------------------------------------------------------------------------- 1 | EXP_NAME=train_speech_text_joint_adaptor_large_step2_300k 2 | SAVE_DIR=/datablob/users/v-junyiao/speechexp/fairseq_mlst/${EXP_NAME} 3 | DATA_ROOT=/datablob/users/v-junyiao/speechdata/hubert_mlst 4 | LABEL_DIR=${DATA_ROOT}/fine-tune_en_bpe250k_full 5 | W2V_PATH=/mnt/default/v-junyiao/speechexp/train_speech_text_joint_addadaptor_bpecode_large_step1_mbartpt_400k/checkpoint_last_up.pt 6 | TEXT_DATA_DIR=/datablob/users/v-junyiao/speechdata/text_data/v4/bin-idx 7 | SP_PATH=${LABEL_DIR}/sentence.bpe.model 8 | # export CUDA_VISIBLE_DEVICES=1 9 | python fairseq_cli/hydra_train.py \ 10 | --config-dir examples/hubert/config/pretrain \ 11 | --config-name pretrain_step2 \ 12 | distributed_training.distributed_world_size=64 \ 13 | distributed_training.nprocs_per_node=8 \ 14 | \ 15 | dataset.train_subset=\"train_COVOST,train_asr_VP,train_punc_TEDLIUM,train_asr_MUSTC,train_punc_LS,train_asr_EUR+covost2.en_XX-ja_XX,covost2.en_XX-zh_CN,covost_eurST.en_XX-de_DE,mt8corpus_domain45.en_XX-ja_XX,mt8corpus_filt_slct80_domain44.en_XX-de_DE,mt8corpus_filt_slct80_domain40.en_XX-zh_CN,train.en_XX-de_DE,train.en_XX-ja_XX,train.en_XX-zh_CN\" \ 16 | dataset.valid_subset=\"dev_asr_MUSTC+valid.en_XX-de_DE,dev_asr_MUSTC+valid.en_XX-ja_XX,dev_asr_MUSTC+valid.en_XX-zh_CN\" \ 17 | dataset.max_tokens=480001 \ 18 | dataset.num_workers=0 \ 19 | optimization.update_freq=[1] \ 20 | 
optimization.max_update=300000 \ 21 | \ 22 | task.hubert_tokenizer="sentencepiece" \ 23 | task.sp_path=${SP_PATH} \ 24 | task.max_keep_size=480000 \ 25 | +task.split_modality_batch=true \ 26 | +task.speech_tgt_lang="en_XX" \ 27 | +task.mbart_style_lang_id=true \ 28 | +task.text_sampling_alpha=1.0 \ 29 | +task.store_labels=true \ 30 | model.freeze_finetune_updates=15000 \ 31 | criterion.dec_weight=0.5 \ 32 | +model.reuse_text_emb=true \ 33 | +model.share_ctc_decoder_embed=true \ 34 | +model.share_speech_text_embeddings=true \ 35 | \ 36 | task.data=${DATA_ROOT} \ 37 | task.label_dir=${LABEL_DIR} \ 38 | task.text_cfg.text_data=${TEXT_DATA_DIR} \ 39 | model.w2v_path=${W2V_PATH} \ 40 | checkpoint.save_dir=${SAVE_DIR} \ 41 | common.tensorboard_logdir=${SAVE_DIR} \ 42 | hydra.run.dir=${SAVE_DIR} \ 43 | hydra.job.name=${EXP_NAME} 44 | 45 | sleep infinity 46 | -------------------------------------------------------------------------------- /YiTrans/yitrans_iwslt22/__init__.py: -------------------------------------------------------------------------------- 1 | from . import data, tasks, criterions, models 2 | -------------------------------------------------------------------------------- /YiTrans/yitrans_iwslt22/config/finetune_mt/mt_translation.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | 3 | common: 4 | fp16: true 5 | log_format: json 6 | log_interval: 200 7 | tensorboard_logdir: tblog 8 | seed: 1337 9 | 10 | checkpoint: 11 | save_interval: 1000000 12 | keep_last_epochs: 5 13 | save_interval_updates: 10000 14 | keep_interval_updates_pattern: 20000 15 | keep_interval_updates: 5 16 | keep_best_checkpoints: 5 17 | best_checkpoint_metric: accuracy 18 | maximize_best_checkpoint_metric: true 19 | 20 | distributed_training: 21 | ddp_backend: legacy_ddp 22 | find_unused_parameters: true 23 | distributed_world_size: -1 24 | nprocs_per_node: 8 25 | 26 | 27 | criterion: 28 | _name: "label_smoothed_cross_entropy" 29 | label_smoothing: 0.2 30 | report_accuracy: true 31 | 32 | 33 | task: 34 | _name: "iwslt_translation_from_pretrained" 35 | 36 | dataset: 37 | num_workers: 6 38 | max_tokens: 3200000 39 | skip_invalid_size_inputs_valid_test: true 40 | validate_after_updates: ${model.freeze_finetune_updates} 41 | validate_interval: ${checkpoint.save_interval} 42 | validate_interval_updates: ${checkpoint.save_interval_updates} 43 | train_subset: train_100 44 | valid_subset: dev_other 45 | required_batch_size_multiple: 1 46 | 47 | optimizer: 48 | _name: adam 49 | adam_betas: (0.9,0.98) 50 | adam_eps: 1e-06 51 | weight_decay: 0.0 52 | 53 | lr_scheduler: 54 | lr: [0.0001] 55 | _name: polynomial_decay 56 | warmup_updates: 5000 57 | total_num_update: 200000 58 | 59 | model: 60 | _name: finetune_mt 61 | w2v_path: ??? 62 | apply_mask: true 63 | mask_prob: 0.65 64 | mask_channel_prob: 0.5 65 | mask_channel_length: 64 66 | layerdrop: 0.1 67 | decoder_layerdrop: 0.1 68 | activation_dropout: 0.1 69 | feature_grad_mult: 0.0 70 | freeze_finetune_updates: 0 71 | 72 | hydra: 73 | job: 74 | config: 75 | override_dirname: 76 | kv_sep: '-' 77 | item_sep: '__' 78 | exclude_keys: 79 | - run 80 | - task.data 81 | - task.label_dir 82 | - model.w2v_path 83 | - dataset.train_subset 84 | - dataset.valid_subset 85 | run: 86 | dir: ??? 87 | sweep: 88 | dir: ??? 
89 | subdir: ${hydra.job.config_name}__${hydra.job.override_dirname} 90 | -------------------------------------------------------------------------------- /YiTrans/yitrans_iwslt22/criterions/__init__.py: -------------------------------------------------------------------------------- 1 | import importlib 2 | import os 3 | 4 | for file in os.listdir(os.path.dirname(__file__)): 5 | if file.endswith(".py") and not file.startswith("_"): 6 | criterion_name = file[: file.find(".py")] 7 | importlib.import_module( 8 | "yitrans_iwslt22.criterions." + criterion_name 9 | ) 10 | -------------------------------------------------------------------------------- /YiTrans/yitrans_iwslt22/data/lang_pair_mask_dataset.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # The YiTrans End-to-End Speech Translation System for IWSLT 2022 Offline Shared Task (https://arxiv.org/abs/2206.05777) 3 | # Github source: https://github.com/microsoft/SpeechT5/tree/main/YiTrans 4 | # Copyright (c) 2022 Microsoft 5 | # Licensed under The MIT License [see LICENSE for details] 6 | # Based on fairseq code bases 7 | # https://github.com/facebookresearch/fairseq 8 | # -------------------------------------------------------- 9 | 10 | """ 11 | Modified from https://github.com/facebookresearch/fairseq/blob/main/fairseq/data/audio/multi_modality_dataset.py 12 | """ 13 | 14 | 15 | from typing import Optional 16 | 17 | import numpy as np 18 | import torch 19 | from fairseq.data import ( 20 | LanguagePairDataset, 21 | ) 22 | from fairseq.data.audio.multi_modality_dataset import LangPairMaskDataset as FairseqLangPairMaskDataset 23 | 24 | class LangPairMaskDataset(FairseqLangPairMaskDataset): 25 | def __init__( 26 | self, 27 | dataset: LanguagePairDataset, 28 | src_eos: int, 29 | src_bos: Optional[int] = None, 30 | noise_id: Optional[int] = -1, 31 | mask_ratio: Optional[float] = 0, 32 | mask_type: Optional[str] = "random", 33 | ): 34 | super().__init__( 35 | dataset, 36 | src_eos, 37 | src_bos, 38 | noise_id, 39 | mask_ratio, 40 | mask_type, 41 | ) 42 | def mask_src_tokens(self, sample): 43 | src_item = sample["source"] 44 | mask = None 45 | if self.mask_type == "random": 46 | mask = torch.rand(len(src_item)).le(self.mask_ratio) 47 | else: 48 | mask = torch.ones(len(src_item)) 49 | mask[: int(len(src_item) * (1 - self.mask_ratio))] = 0 50 | mask = mask.eq(1) 51 | if src_item[0] == self.src_bos: 52 | mask[0] = False 53 | if src_item[-1] == self.src_eos: 54 | mask[-1] = False 55 | mask_src_item = src_item.masked_fill(mask, self.noise_id) 56 | smp = sample 57 | smp["source"] = mask_src_item 58 | return smp 59 | 60 | def collater(self, samples, pad_to_length=None): 61 | return self.dataset.collater(samples, pad_to_length=pad_to_length) 62 | 63 | -------------------------------------------------------------------------------- /YiTrans/yitrans_iwslt22/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/SpeechT5/5d66cf5f37e97f4a1999ad519537decc16d852af/YiTrans/yitrans_iwslt22/models/__init__.py -------------------------------------------------------------------------------- /YiTrans/yitrans_iwslt22/modules/__init__.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # The YiTrans End-to-End Speech Translation System for IWSLT 2022 Offline Shared Task 
(https://arxiv.org/abs/2206.05777) 3 | # Github source: https://github.com/microsoft/SpeechT5/tree/main/YiTrans 4 | # Copyright (c) 2022 Microsoft 5 | # Licensed under The MIT License [see LICENSE for details] 6 | # Based on fairseq code bases 7 | # https://github.com/facebookresearch/fairseq 8 | # -------------------------------------------------------- 9 | 10 | from .multihead_attention import MultiheadAttention 11 | from .relative_pos_enc import RelativePositionalEncoding 12 | from .transformer_decoder_layer import TransformerDecoderLayerBase 13 | from .w2v_encoder import TransformerEncoder, TransformerSentenceEncoderLayer 14 | from .multimodal_transformer_decoder import MultimodalTransformerDecoder 15 | 16 | __all__ = [ 17 | "MultiheadAttention", 18 | "RelativePositionalEncoding", 19 | "TransformerDecoderLayerBase", 20 | "TransformerEncoder", 21 | "TransformerSentenceEncoderLayer", 22 | "MultimodalTransformerDecoder", 23 | ] 24 | -------------------------------------------------------------------------------- /YiTrans/yitrans_iwslt22/modules/relative_pos_enc.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Pre-Training Transformer Decoder for End-to-End ASR Model with Unpaired Speech Data (https://arxiv.org/abs/2203.17113) 3 | # Github source: https://github.com/microsoft/SpeechT5/tree/main/Speech2C 4 | # Copyright (c) 2022 Microsoft 5 | # Licensed under The MIT License [see LICENSE for details] 6 | # Based on fairseq code bases 7 | # https://github.com/pytorch/fairseq 8 | # -------------------------------------------------------- 9 | 10 | import torch 11 | 12 | class RelativePositionalEncoding(torch.nn.Module): 13 | def __init__(self, d_model, maxlen=1000, embed_v=False): 14 | super(RelativePositionalEncoding, self).__init__() 15 | 16 | self.d_model = d_model 17 | self.maxlen = maxlen 18 | self.pe_k = torch.nn.Embedding(2*maxlen, d_model) 19 | if embed_v: 20 | self.pe_v = torch.nn.Embedding(2*maxlen, d_model) 21 | self.embed_v = embed_v 22 | 23 | 24 | def forward(self, pos_seq, incremental_state=None): 25 | pos_seq[pos_seq < -self.maxlen] = -self.maxlen 26 | pos_seq[pos_seq >= self.maxlen] = self.maxlen - 1 27 | pos_seq = pos_seq + self.maxlen 28 | 29 | if incremental_state is not None: 30 | pos_seq = pos_seq[-1:] 31 | 32 | if self.embed_v: 33 | return self.pe_k(pos_seq), self.pe_v(pos_seq) 34 | else: 35 | return self.pe_k(pos_seq), None 36 | --------------------------------------------------------------------------------
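The `RelativePositionalEncoding` module above embeds signed relative offsets rather than absolute positions: offsets are clamped to `[-maxlen, maxlen - 1]`, shifted into the non-negative range, and looked up in an embedding table of size `2 * maxlen`. A minimal usage sketch follows; the import path, `d_model`, `maxlen`, and sequence length are illustrative assumptions, not values taken from the YiTrans configs.

```python
import torch

# Hypothetical import path for illustration; adjust to wherever the module lives in your checkout.
from yitrans_iwslt22.modules.relative_pos_enc import RelativePositionalEncoding

pos_emb = RelativePositionalEncoding(d_model=768, maxlen=160, embed_v=False)

T = 12                                 # toy sequence length
idx = torch.arange(T)
rel_pos = idx[:, None] - idx[None, :]  # (T, T) signed offsets i - j; forward() clamps them to the table range

pos_k, pos_v = pos_emb(rel_pos)
print(pos_k.shape)                     # torch.Size([12, 12, 768]); pos_v is None because embed_v=False
```

A caller such as a relative-attention encoder layer would typically fold `pos_k` into the attention score computation as a position-dependent bias; constructing the module with `embed_v=True` additionally returns a value-side embedding for the same offsets.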