├── .gitignore ├── .gitmodules ├── CODE_OF_CONDUCT.md ├── LICENSE ├── README.md ├── SECURITY.md ├── Speech2C ├── README.md └── speech2c │ ├── __init__.py │ ├── config │ ├── base_100h.yaml │ ├── base_10h.yaml │ └── speech2c_base_librispeech.yaml │ ├── criterions │ ├── __init__.py │ ├── ctc_ce.py │ └── speech2c_criterion.py │ ├── data │ └── speech2c_dataset.py │ ├── models │ ├── modules │ │ ├── ctc_prefix_score.py │ │ ├── multihead_attention.py │ │ ├── relative_pos_enc.py │ │ ├── transformer_decoder.py │ │ ├── transformer_decoder_layer.py │ │ └── transformer_encoder.py │ ├── speech2c.py │ ├── speech2c_asr.py │ └── t5_transformer_lm.py │ ├── squence_generator.py │ └── tasks │ └── speech2c_pretraining.py ├── Speech2S ├── README.md └── speech2s │ ├── __init__.py │ ├── config │ ├── finetune_asr │ │ ├── speechut_base_100h.yaml │ │ ├── speechut_large_100h.yaml │ │ └── speechut_large_960h.yaml │ └── pretrain │ │ ├── speechut_base_librispeech.yaml │ │ └── speechut_large_librilight.yaml │ ├── criterions │ ├── __init__.py │ ├── ctc_ce.py │ └── speechut_criterion.py │ ├── data │ ├── concat_dataset.py │ ├── hubert_dataset.py │ ├── language_trible_dataset.py │ ├── load_langpair_dataset.py │ └── multimodal_corpus_dataset.py │ ├── models │ ├── __init__.py │ ├── speechut.py │ ├── speechut_asr.py │ ├── speechut_st.py │ └── t5_transformer_lm.py │ ├── modules │ ├── __init__.py │ ├── ctc_prefix_score.py │ ├── learned_positional_embedding.py │ ├── multihead_attention.py │ ├── relative_pos_enc.py │ ├── transformer_decoder.py │ ├── transformer_encoder.py │ ├── transformer_layer.py │ └── w2v_encoder.py │ ├── scripts copy │ ├── pretrain_speechut │ │ ├── base_speechut_for_asr.sh │ │ ├── base_speechut_for_st.sh │ │ ├── base_speechut_for_st_enfr.sh │ │ └── large_speechut_for_asr.sh │ ├── tune_speechut_asr │ │ ├── finetune960h_large_edctc.sh │ │ ├── finetune_base_edctc.sh │ │ ├── inference_edctc.sh │ │ ├── inference_edctclm.sh │ │ ├── inference_lm_nj.sh │ │ └── inference_nj.sh │ └── tune_speechut_st │ │ ├── finetune_base_mustc_enxx.sh │ │ └── inference_st.sh │ ├── scripts │ ├── __init__.py │ ├── average_checkpoints.py │ ├── build_sym_alignment.py │ ├── compare_namespaces.py │ ├── compound_split_bleu.sh │ ├── constraints │ │ ├── extract.py │ │ └── validate.py │ ├── convert_dictionary.lua │ ├── convert_model.lua │ ├── count_docs.py │ ├── read_binarized.py │ ├── rm_pt.py │ ├── sacrebleu.sh │ ├── shard_docs.py │ ├── split_train_valid_docs.py │ ├── spm_decode.py │ ├── spm_encode.py │ ├── spm_train.py │ └── test_fsdp.sh │ ├── stpretrain_scripts │ ├── base_sc2c_enes.sh │ ├── base_sc2c_esen.sh │ ├── config.yaml │ ├── config │ │ ├── finetune_asr │ │ │ ├── base_100h.yaml │ │ │ └── large_960h.yaml │ │ ├── pretrain │ │ │ ├── mbart.yaml │ │ │ └── sc2t_base_librispeech.yaml │ │ └── translation │ │ │ └── text2code.yaml │ ├── config_mbart.yaml │ ├── data_process │ │ ├── extract_hubert_feature_itp.sh │ │ ├── merge_code.py │ │ ├── txt2idx.sh │ │ ├── txt2spm.sh │ │ └── wmt │ │ │ ├── normalize_en_text.py │ │ │ └── normalize_es_text.py │ ├── decode_text2code_beam2.sh │ ├── eval2.sh │ ├── eval3.sh │ ├── finetune_enes.sh │ ├── finetune_esen.sh │ ├── inference_ed.sh │ └── train_text2code │ │ ├── base_ReleaseIter2_text2unicode_from400k.sh │ │ ├── base_ReleaseIter2_text2unicode_from400k_es.sh │ │ ├── base_ReleaseIter2_text2unicode_from400k_es2.sh │ │ ├── decode_text2code.sh │ │ ├── decode_text2code_beam2.sh │ │ ├── inference_code_bleu.sh │ │ └── inference_code_wer.sh │ └── tasks │ └── joint_sc2t_pretrain.py ├── SpeechLM ├── README.md ├── 
SpeechLM.py ├── dataset │ ├── CommonVoice │ │ └── v4 │ │ │ └── en │ │ │ └── en-de │ │ │ ├── config_base_ende.yaml │ │ │ ├── config_large_ende.yaml │ │ │ ├── dev-sample100_st_en_de_local.tsv │ │ │ ├── spm_char_st_en_de.model │ │ │ ├── spm_char_st_en_de.txt │ │ │ └── spm_char_st_en_de.vocab │ ├── LibriLM │ │ ├── hidden_unit │ │ │ └── bin-idx │ │ │ │ ├── config.yaml │ │ │ │ ├── dict.km.txt │ │ │ │ └── dict.ltr.txt │ │ └── phone_unit │ │ │ └── bin-idx │ │ │ ├── config.yaml │ │ │ ├── dict.ltr.txt │ │ │ └── dict.phn.txt │ └── LibriSpeech │ │ ├── asr │ │ ├── dict.ltr.txt │ │ ├── train_sample100.ltr │ │ └── train_sample100.tsv │ │ ├── fast_phone2unit │ │ ├── config.yaml │ │ ├── config_generate.yaml │ │ ├── dict.PHN.txt │ │ ├── dict.km.txt │ │ ├── dict.phn.txt │ │ ├── genset_examples.tsv │ │ └── train_exmples.tsv │ │ ├── hidden_unit │ │ ├── dict.km.txt │ │ ├── train_sample100.km │ │ └── train_sample100.tsv │ │ └── phone_unit │ │ ├── dict.phn.txt │ │ ├── train_sample100.phn │ │ └── train_sample100.tsv ├── modules.py └── speechlm │ ├── __init__.py │ ├── config │ ├── decode │ │ ├── infer_fsqlm.yaml │ │ ├── infer_kenlm.yaml │ │ └── infer_viterbi.yaml │ ├── finetune │ │ ├── speechlm_base_100h.yaml │ │ └── speechlm_large_960h.yaml │ └── pretrain │ │ ├── speechlm_base_librispeech.yaml │ │ ├── speechlm_large_librilight.yaml │ │ └── speechlmp_base_cfg.pt │ ├── criterions │ ├── __init__.py │ ├── fasttext2unit_loss.py │ └── speechlm_criterion.py │ ├── data │ ├── concat_dataset.py │ ├── hubert_dataset.py │ ├── language_trible_dataset.py │ ├── load_langpair_dataset.py │ ├── multimodal_corpus_dataset.py │ └── text_to_unit_dataset.py │ ├── data_process │ ├── covost2 │ │ ├── mp3_to_wav.py │ │ └── prepare_covost_data.py │ ├── filter_paireddata_by_len.py │ ├── get_t2u_manifest.py │ ├── get_t2u_manifest_textonly.py │ ├── phoneize_with_sil.py │ ├── phoneme_tokenizer │ │ ├── ltr2kaldi_phn_sil025.py │ │ ├── mean5_and_std25_sil14_spn32.dict │ │ └── repeat_withou_insert_sil_less_4375.py │ ├── prepare_covost2_enxx.sh │ ├── prepare_phn2ltr_librilm.sh │ ├── txt2idx.sh │ └── wrd2ltr.py │ ├── generate_unit.py │ ├── infer.py │ ├── models │ ├── __init__.py │ ├── fasttext2unit.py │ ├── speechlm.py │ ├── speechlm_ctcasr.py │ └── speechlm_st.py │ ├── modules │ ├── __init__.py │ ├── learned_positional_embedding.py │ ├── multihead_attention.py │ ├── relative_pos_enc.py │ ├── transformer_decoder.py │ ├── transformer_encoder.py │ ├── transformer_layer.py │ └── w2v_encoder.py │ ├── scripts │ ├── pretrain_speechlm │ │ ├── base_speechlmh.sh │ │ ├── base_speechlmp.sh │ │ └── large_speechlmp.sh │ ├── tokenizer_fastT2U │ │ ├── generate.sh │ │ ├── infer.sh │ │ └── train_s_5e-4.sh │ ├── tune_speechlm_asr │ │ ├── finetune_base_ctc.sh │ │ ├── finetune_large_ctc.sh │ │ ├── inference_ctc.sh │ │ ├── inference_ctc_kenlm.sh │ │ ├── inference_ctc_large.sh │ │ └── inference_ctc_large_fsqlm.sh │ └── tune_speechlm_st │ │ ├── ft_base_covost_enxx.sh │ │ ├── ft_large_covost_enxx.sh │ │ ├── inference_base.sh │ │ └── inference_large.sh │ ├── tasks │ ├── fast_text_to_unit.py │ └── joint_sc2t_pretrain.py │ └── unit_generator.py ├── SpeechT5 ├── README.md ├── results │ ├── ablation_study.png │ ├── asr.png │ ├── se.png │ ├── sid.png │ ├── st.png │ ├── tts.png │ └── vc.png ├── scripts │ ├── generate_class.py │ └── generate_speech.py ├── speecht5 │ ├── __init__.py │ ├── criterions │ │ ├── __init__.py │ │ ├── speech_pretrain_criterion.py │ │ ├── speech_to_text_loss.py │ │ ├── speecht5_criterion.py │ │ ├── text_pretrain_criterion.py │ │ └── text_to_speech_loss.py │ 
├── data │ │ ├── __init__.py │ │ ├── multitask_dataset.py │ │ ├── speech_dataset.py │ │ ├── speech_to_class_dataset.py │ │ ├── speech_to_speech_dataset.py │ │ ├── speech_to_text_dataset.py │ │ ├── text_dataset.py │ │ └── text_to_speech_dataset.py │ ├── models │ │ ├── __init__.py │ │ ├── modules │ │ │ ├── __init__.py │ │ │ ├── decoder.py │ │ │ ├── encoder.py │ │ │ ├── multihead_attention.py │ │ │ ├── speaker_decoder_postnet.py │ │ │ ├── speech_decoder_postnet.py │ │ │ ├── speech_decoder_prenet.py │ │ │ ├── speech_encoder_postnet.py │ │ │ ├── speech_encoder_prenet.py │ │ │ ├── text_decoder_postnet.py │ │ │ ├── text_decoder_prenet.py │ │ │ ├── text_encoder_prenet.py │ │ │ └── transformer_layer.py │ │ ├── speecht5.py │ │ └── t5_transformer_lm.py │ ├── sequence_generator.py │ └── tasks │ │ ├── __init__.py │ │ └── speecht5.py └── speecht5_framework.png ├── SpeechUT ├── README.md ├── dataset │ ├── LibriSpeech │ │ ├── dict.km.txt │ │ ├── dict.kmu.txt │ │ ├── dict.ltr.txt │ │ └── dict.txt │ └── MuSTC │ │ ├── dict.km.txt │ │ ├── dict.kmu.txt │ │ ├── en_de │ │ ├── config.yaml │ │ ├── config_ende.yaml │ │ ├── dict.kmu.txt │ │ ├── dict.spm.txt │ │ └── spm_unigram10000.model │ │ ├── en_es │ │ ├── config.yaml │ │ ├── config_enes.yaml │ │ ├── dict.kmu.txt │ │ ├── dict.spm.txt │ │ └── spm_unigram10000.model │ │ └── en_fr │ │ ├── config.yaml │ │ ├── config_enfr.yaml │ │ ├── dict.kmu.txt │ │ ├── dict.spm.txt │ │ └── spm_unigram10000.model └── speechut │ ├── __init__.py │ ├── config │ ├── finetune_asr │ │ ├── speechut_base_100h.yaml │ │ ├── speechut_large_100h.yaml │ │ └── speechut_large_960h.yaml │ └── pretrain │ │ ├── speechut_base_librispeech.yaml │ │ └── speechut_large_librilight.yaml │ ├── criterions │ ├── __init__.py │ ├── ctc_ce.py │ └── speechut_criterion.py │ ├── data │ ├── concat_dataset.py │ ├── hubert_dataset.py │ ├── language_trible_dataset.py │ ├── load_langpair_dataset.py │ └── multimodal_corpus_dataset.py │ ├── models │ ├── __init__.py │ ├── speechut.py │ ├── speechut_asr.py │ ├── speechut_st.py │ └── t5_transformer_lm.py │ ├── modules │ ├── __init__.py │ ├── ctc_prefix_score.py │ ├── learned_positional_embedding.py │ ├── multihead_attention.py │ ├── relative_pos_enc.py │ ├── transformer_decoder.py │ ├── transformer_encoder.py │ ├── transformer_layer.py │ └── w2v_encoder.py │ ├── scripts │ ├── pretrain_speechut │ │ ├── base_speechut_for_asr.sh │ │ ├── base_speechut_for_st.sh │ │ ├── base_speechut_for_st_enfr.sh │ │ └── large_speechut_for_asr.sh │ ├── tune_speechut_asr │ │ ├── finetune960h_large_edctc.sh │ │ ├── finetune_base_edctc.sh │ │ ├── inference_edctc.sh │ │ ├── inference_edctclm.sh │ │ ├── inference_lm_nj.sh │ │ └── inference_nj.sh │ └── tune_speechut_st │ │ ├── finetune_base_mustc_enxx.sh │ │ └── inference_st.sh │ ├── squence_generator.py │ └── tasks │ └── joint_sc2t_pretrain.py ├── VATLM ├── README.md └── vat_hubert │ ├── requirements.txt │ └── vathubert │ ├── __init__.py │ ├── conf │ ├── finetune │ │ ├── base_lrs3_30h_av.yaml │ │ ├── base_lrs3_30h_v.yaml │ │ ├── base_vox_30h_av.yaml │ │ ├── base_vox_30h_v.yaml │ │ ├── base_vox_433h_av.yaml │ │ ├── base_vox_433h_v.yaml │ │ ├── large_vox_30h_av.yaml │ │ ├── large_vox_30h_v.yaml │ │ ├── large_vox_433h_av.yaml │ │ └── large_vox_433h_v.yaml │ ├── pretrain │ │ ├── base_lrs3_iter5.yaml │ │ ├── base_vox_iter5.yaml │ │ └── large_vox_iter5.yaml │ └── s2s_decode.yaml │ ├── criterions │ ├── __init__.py │ └── vathubert_criterion.py │ ├── data │ ├── audiohubert_dataset.py │ ├── onlyaudiohubert_dataset.py │ ├── texthubert_dataset.py │ ├── utils.py 
│ └── vathubert_dataset.py │ ├── decode_avhubert_lrs3.sh │ ├── infer_s2s.py │ ├── models │ ├── decoder.py │ ├── resnet.py │ ├── utils.py │ ├── vathubert.py │ └── vathubert_asr.py │ ├── scripts │ ├── finetune_avsr │ │ ├── base_lrs3_finetune30_av.sh │ │ ├── base_vox_finetune30_av.sh │ │ ├── base_vox_finetune433_av.sh │ │ ├── large_vox_finetune30_av.sh │ │ └── large_vox_finetune433_av.sh │ ├── finetune_vsr │ │ ├── base_lrs3_finetune30_v.sh │ │ ├── base_vox_finetune30_v.sh │ │ ├── base_vox_finetune433_v.sh │ │ ├── large_vox_finetune30_v.sh │ │ └── large_vox_finetune433_v.sh │ └── pretrain │ │ ├── base_lsr3_pretrain_iter5.sh │ │ ├── base_vox_pretrain_iter5.sh │ │ └── large_vox_pretrain_iter5.sh │ ├── sequence_generator.py │ ├── tasks │ └── vathubert_pretraining.py │ └── utils.py ├── WavLLM ├── README.md ├── download │ └── download.sh └── wavllm │ ├── __init__.py │ ├── criterions │ └── cross_entropy_acc.py │ ├── data │ ├── speechllm_dataset.py │ └── tokenizer.py │ ├── inference │ ├── generate.py │ └── sequence_generator.py │ ├── models │ ├── llama.py │ ├── speechllm_model.py │ ├── wavlm.py │ └── whisper_encoder.py │ ├── modules │ └── convolution.py │ ├── requirements.txt │ ├── scripts │ └── inference_sft.sh │ ├── tasks │ └── speechllm_task.py │ ├── test_data │ ├── CoT-task-story.tsv │ ├── CoT-task.tsv │ ├── II-task.tsv │ ├── SQA.tsv │ ├── SQQA.tsv │ ├── asr.tsv │ ├── audio │ │ ├── CoT-task-story.wav │ │ ├── CoT-task.wav │ │ ├── II-task.wav │ │ ├── asr.flac │ │ ├── emo.wav │ │ ├── sqa.wav │ │ ├── sqqa.wav │ │ ├── st.flac │ │ └── sv.wav │ ├── dict.txt │ ├── emo.tsv │ ├── en2de.tsv │ ├── gaokao.tsv │ └── sv.tsv │ └── tokenizer │ └── tokenizer.model └── YiTrans ├── .gitignore ├── exp_scripts ├── finetune_ASR │ └── finetune_hubert24_mbart24_en.sh ├── finetune_MT │ └── finetune_mbart_en-de.sh ├── finetune_ST │ └── en-de │ │ └── jtst_pt36s2_mustc.sh └── pretrain │ ├── pretrain_pt36_adaptor_step1.sh │ └── pretrain_pt36_adaptor_step2.sh ├── readme.md └── yitrans_iwslt22 ├── __init__.py ├── config ├── finetune_asr │ └── large_mustc.yaml ├── finetune_mt │ └── mt_translation.yaml └── pretrain │ ├── joint_base.yaml │ └── joint_large.yaml ├── criterions ├── __init__.py ├── ctc_ce.py ├── joint_step1_criterion.py ├── joint_step1_split_batch_criterion.py └── joint_step2_criterion.py ├── data ├── concat_dataset.py ├── denoising_dataset.py ├── lang_pair_mask_dataset.py ├── load_langpair_dataset.py ├── multimodal_corpus_dataset.py └── speech2c_dataset.py ├── models ├── __init__.py ├── _hubert_mt.py ├── finetune_asr.py ├── finetune_mt.py ├── finetune_st.py ├── pretrain_ed.py └── pretrain_ed_step2.py ├── modules ├── __init__.py ├── multihead_attention.py ├── multimodal_transformer_decoder.py ├── relative_pos_enc.py ├── transformer_decoder.py ├── transformer_decoder_layer.py └── w2v_encoder.py ├── sequence_generator.py └── tasks ├── iwslt_joint_pretraining.py └── iwslt_translation_from_pretrain.py /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/ -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "SpeechT5/fairseq"] 2 | path = SpeechT5/fairseq 3 | url = https://github.com/pytorch/fairseq 4 | [submodule "Speech2C/fairseq"] 5 | path = Speech2C/fairseq 6 | url = https://github.com/facebookresearch/fairseq.git 7 | [submodule "YiTrans/fairseq"] 8 | path = YiTrans/fairseq 9 | url = https://github.com/facebookresearch/fairseq 
10 | [submodule "SpeechLM/fairseq"]
11 | path = SpeechLM/fairseq
12 | url = https://github.com/facebookresearch/fairseq.git
13 | [submodule "SpeechUT/fairseq"]
14 | path = SpeechUT/fairseq
15 | url = https://github.com/facebookresearch/fairseq.git
16 | [submodule "VATLM/fairseq"]
17 | path = VATLM/fairseq
18 | url = https://github.com/facebookresearch/fairseq.git
19 | [submodule "Speech2S/fairseq"]
20 | path = Speech2S/fairseq
21 | url = https://github.com/facebookresearch/fairseq.git
22 | branch = adding_womenbios
23 | [submodule "WavLLM/fairseq"]
24 | path = WavLLM/fairseq
25 | url = https://github.com/pytorch/fairseq.git
26 | 
--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
1 | # Microsoft Open Source Code of Conduct
2 | 
3 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
4 | 
5 | Resources:
6 | 
7 | - [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/)
8 | - [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/)
9 | - Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns
10 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) Microsoft Corporation.
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE
22 | 
--------------------------------------------------------------------------------
/Speech2C/speech2c/__init__.py:
--------------------------------------------------------------------------------
1 | from . 
import data, tasks, criterions, models # noqa -------------------------------------------------------------------------------- /Speech2C/speech2c/config/base_100h.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | 3 | common: 4 | fp16: true 5 | log_format: json 6 | log_interval: 200 7 | tensorboard_logdir: tblog 8 | seed: 1337 9 | 10 | checkpoint: 11 | no_epoch_checkpoints: true 12 | best_checkpoint_metric: dec_accuracy 13 | maximize_best_checkpoint_metric: true 14 | 15 | distributed_training: 16 | ddp_backend: c10d 17 | find_unused_parameters: true 18 | distributed_world_size: 1 19 | distributed_port: 29671 20 | nprocs_per_node: 8 21 | 22 | task: 23 | _name: speech2c_pretraining 24 | data: ??? 25 | fine_tuning: true 26 | label_dir: ??? 27 | normalize: false # must be consistent with pre-training 28 | labels: ["ltr"] 29 | single_target: true 30 | add_decoder: true 31 | pad_audio: true 32 | random_crop: false 33 | 34 | dataset: 35 | num_workers: 6 36 | max_tokens: 3200000 37 | skip_invalid_size_inputs_valid_test: true 38 | train_subset: train_100h 39 | valid_subset: dev_other 40 | 41 | criterion: 42 | _name: ctc_ce 43 | zero_infinity: true 44 | 45 | optimization: 46 | max_update: 80000 47 | lr: [0.00004] 48 | sentence_avg: true 49 | update_freq: [1] 50 | 51 | optimizer: 52 | _name: adam 53 | adam_betas: (0.9,0.98) 54 | adam_eps: 1e-08 55 | 56 | lr_scheduler: 57 | _name: tri_stage 58 | phase_ratio: [0.1, 0.4, 0.5] 59 | final_lr_scale: 0.05 60 | 61 | model: 62 | _name: speech2c_ctc 63 | w2v_path: ??? 64 | apply_mask: true 65 | mask_prob: 0.65 66 | mask_channel_prob: 0.5 67 | mask_channel_length: 64 68 | layerdrop: 0.1 69 | decoder_layerdrop: 0.1 70 | activation_dropout: 0.1 71 | feature_grad_mult: 0.0 72 | freeze_finetune_updates: 25000 73 | 74 | hydra: 75 | job: 76 | config: 77 | override_dirname: 78 | kv_sep: '-' 79 | item_sep: '__' 80 | exclude_keys: 81 | - run 82 | - task.data 83 | - task.label_dir 84 | - model.w2v_path 85 | - dataset.train_subset 86 | - dataset.valid_subset 87 | - criterion.wer_kenlm_model 88 | - criterion.wer_lexicon 89 | run: 90 | dir: ??? 91 | sweep: 92 | dir: ??? 93 | subdir: ${hydra.job.config_name}__${hydra.job.override_dirname} 94 | -------------------------------------------------------------------------------- /Speech2C/speech2c/config/speech2c_base_librispeech.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | 3 | common: 4 | fp16: true 5 | log_format: json 6 | log_interval: 200 7 | seed: 1337 8 | tensorboard_logdir: tblog 9 | 10 | checkpoint: 11 | save_interval_updates: 25000 12 | keep_interval_updates: 1 13 | no_epoch_checkpoints: true 14 | 15 | 16 | distributed_training: 17 | ddp_backend: no_c10d 18 | distributed_backend: 'nccl' 19 | distributed_world_size: 32 20 | distributed_port: 29671 21 | nprocs_per_node: 8 22 | find_unused_parameters: true 23 | 24 | task: 25 | _name: speech2c_pretraining 26 | data: ??? 27 | label_dir: ??? 28 | labels: ??? 
29 | label_rate: ${model.label_rate} 30 | sample_rate: 16000 31 | max_sample_size: 250000 32 | min_sample_size: 32000 33 | pad_audio: false 34 | random_crop: true 35 | normalize: false # must be consistent with extractor 36 | add_decoder: true 37 | 38 | dataset: 39 | num_workers: 6 40 | max_tokens: 1400000 41 | skip_invalid_size_inputs_valid_test: true 42 | validate_interval: 5 43 | validate_interval_updates: 10000 44 | 45 | criterion: 46 | _name: speech2c 47 | pred_masked_weight: 1.0 48 | pred_nomask_weight: 0.0 49 | loss_weights: [10,] 50 | 51 | optimization: 52 | max_update: 400000 53 | lr: [0.0005] 54 | clip_norm: 10.0 55 | 56 | optimizer: 57 | _name: adam 58 | adam_betas: (0.9,0.98) 59 | adam_eps: 1e-06 60 | weight_decay: 0.01 61 | 62 | lr_scheduler: 63 | _name: polynomial_decay 64 | warmup_updates: 32000 65 | 66 | model: 67 | _name: speech2c 68 | label_rate: ??? 69 | skip_masked: false 70 | skip_nomask: false 71 | mask_prob: 0.80 72 | extractor_mode: default 73 | conv_feature_layers: '[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2' 74 | final_dim: 256 75 | encoder_layerdrop: 0.05 76 | dropout_input: 0.1 77 | dropout_features: 0.1 78 | dropout: 0.1 79 | attention_dropout: 0.1 80 | feature_grad_mult: 0.1 81 | untie_final_proj: true 82 | activation_dropout: 0.0 83 | use_rel_pos_enc: true 84 | decoder_dict_size: -1 85 | 86 | hydra: 87 | job: 88 | config: 89 | override_dirname: 90 | kv_sep: '-' 91 | item_sep: '__' 92 | exclude_keys: 93 | - run 94 | - task.data 95 | - task.label_dir 96 | run: 97 | dir: ??? 98 | sweep: 99 | dir: ??? 100 | subdir: ${hydra.job.config_name}__${hydra.job.override_dirname} 101 | -------------------------------------------------------------------------------- /Speech2C/speech2c/criterions/__init__.py: -------------------------------------------------------------------------------- 1 | import importlib 2 | import os 3 | 4 | 5 | for file in os.listdir(os.path.dirname(__file__)): 6 | if file.endswith(".py") and not file.startswith("_"): 7 | criterion_name = file[: file.find(".py")] 8 | importlib.import_module( 9 | "speech2c.criterions." 
+ criterion_name
10 |         )
11 | 
--------------------------------------------------------------------------------
/Speech2C/speech2c/models/modules/relative_pos_enc.py:
--------------------------------------------------------------------------------
1 | # --------------------------------------------------------
2 | # Pre-Training Transformer Decoder for End-to-End ASR Model with Unpaired Speech Data (https://arxiv.org/abs/2203.17113)
3 | # Github source: https://github.com/microsoft/SpeechT5/tree/main/Speech2C
4 | # Copyright (c) 2022 Microsoft
5 | # Licensed under The MIT License [see LICENSE for details]
6 | # Based on fairseq code bases
7 | # https://github.com/pytorch/fairseq
8 | # --------------------------------------------------------
9 | 
10 | import torch
11 | 
12 | class RelativePositionalEncoding(torch.nn.Module):
13 |     def __init__(self, d_model, maxlen=1000, embed_v=False):
14 |         super(RelativePositionalEncoding, self).__init__()
15 | 
16 |         self.d_model = d_model
17 |         self.maxlen = maxlen
18 |         self.pe_k = torch.nn.Embedding(2*maxlen, d_model)
19 |         if embed_v:
20 |             self.pe_v = torch.nn.Embedding(2*maxlen, d_model)
21 |         self.embed_v = embed_v
22 | 
23 | 
24 |     def forward(self, pos_seq, incremental_state=None):
25 |         pos_seq[pos_seq < -self.maxlen] = -self.maxlen
26 |         pos_seq[pos_seq >= self.maxlen] = self.maxlen - 1
27 |         pos_seq = pos_seq + self.maxlen
28 | 
29 |         if incremental_state is not None:
30 |             pos_seq = pos_seq[-1:]
31 | 
32 |         if self.embed_v:
33 |             return self.pe_k(pos_seq), self.pe_v(pos_seq)
34 |         else:
35 |             return self.pe_k(pos_seq), None
36 | 
--------------------------------------------------------------------------------
/Speech2C/speech2c/models/t5_transformer_lm.py:
--------------------------------------------------------------------------------
1 | # --------------------------------------------------------
2 | # Pre-Training Transformer Decoder for End-to-End ASR Model with Unpaired Speech Data (https://arxiv.org/abs/2203.17113)
3 | # Github source: https://github.com/microsoft/SpeechT5/tree/main/Speech2C
4 | # Copyright (c) 2022 Microsoft
5 | # Licensed under The MIT License [see LICENSE for details]
6 | # Based on fairseq code bases
7 | # https://github.com/pytorch/fairseq
8 | # --------------------------------------------------------
9 | 
10 | from fairseq.models import (
11 |     register_model_architecture,
12 | )
13 | from fairseq.models.transformer_lm import base_lm_architecture
14 | 
15 | 
16 | @register_model_architecture(model_name="transformer_lm", arch_name="transformer_lm_t5")
17 | def transformer_lm_t5(args):
18 |     args.decoder_embed_dim = getattr(args, "decoder_embed_dim", 1280)
19 |     args.decoder_ffn_embed_dim = getattr(args, "decoder_ffn_embed_dim", 6144)
20 |     args.decoder_layers = getattr(args, "decoder_layers", 20)
21 |     args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 16)
22 |     args.dropout = getattr(args, "dropout", 0.1)
23 |     args.attention_dropout = getattr(args, "attention_dropout", 0.1)
24 |     args.activation_fn = getattr(args, "activation_fn", "gelu")
25 |     base_lm_architecture(args)
26 | 
--------------------------------------------------------------------------------
/Speech2S/speech2s/__init__.py:
--------------------------------------------------------------------------------
1 | from . 
import data, tasks, criterions, models 2 | -------------------------------------------------------------------------------- /Speech2S/speech2s/criterions/__init__.py: -------------------------------------------------------------------------------- 1 | import importlib 2 | import os 3 | 4 | for file in os.listdir(os.path.dirname(__file__)): 5 | if file.endswith(".py") and not file.startswith("_"): 6 | criterion_name = file[: file.find(".py")] 7 | importlib.import_module( 8 | "speechut.criterions." + criterion_name 9 | ) 10 | -------------------------------------------------------------------------------- /Speech2S/speech2s/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/SpeechT5/5d66cf5f37e97f4a1999ad519537decc16d852af/Speech2S/speech2s/models/__init__.py -------------------------------------------------------------------------------- /Speech2S/speech2s/models/t5_transformer_lm.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Pre-Training Transformer Decoder for End-to-End ASR Model with Unpaired Speech Data (https://arxiv.org/abs/2203.17113) 3 | # Github source: https://github.com/microsoft/SpeechT5/tree/main/Speech2C 4 | # Copyright (c) 2022 Microsoft 5 | # Licensed under The MIT License [see LICENSE for details] 6 | # Based on fairseq code bases 7 | # https://github.com/pytorch/fairseq 8 | # -------------------------------------------------------- 9 | 10 | from fairseq.models import ( 11 | register_model_architecture, 12 | ) 13 | from fairseq.models.transformer_lm import base_lm_architecture 14 | 15 | 16 | @register_model_architecture(model_name="transformer_lm", arch_name="transformer_lm_t5") 17 | def transformer_lm_t5(args): 18 | args.decoder_embed_dim = getattr(args, "decoder_embed_dim", 1280) 19 | args.decoder_ffn_embed_dim = getattr(args, "decoder_ffn_embed_dim", 6144) 20 | args.decoder_layers = getattr(args, "decoder_layers", 20) 21 | args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 16) 22 | args.dropout = getattr(args, "dropout", 0.1) 23 | args.attention_dropout = getattr(args, "attention_dropout", 0.1) 24 | args.activation_fn = getattr(args, "activation_fn", "gelu") 25 | base_lm_architecture(args) 26 | -------------------------------------------------------------------------------- /Speech2S/speech2s/modules/__init__.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Copyright (c) 2022 Microsoft 3 | # Licensed under The MIT License [see LICENSE for details] 4 | # Based on fairseq code bases 5 | # https://github.com/facebookresearch/fairseq 6 | # -------------------------------------------------------- 7 | 8 | from .learned_positional_embedding import LearnedPositionalEmbedding 9 | from .multihead_attention import MultiheadAttention 10 | from .relative_pos_enc import RelativePositionalEncoding 11 | from .transformer_layer import TransformerEncoderLayerBase, TransformerDecoderLayerBase 12 | from .w2v_encoder import TransformerEncoder, TransformerSentenceEncoderLayer 13 | from .transformer_encoder import TransformerEncoderBase 14 | from .transformer_decoder import TransformerDecoderScriptable, TransformerDecoderBaseScriptable 15 | 16 | __all__ = [ 17 | "MultiheadAttention", 18 | "RelativePositionalEncoding", 19 | "LearnedPositionalEmbedding", 20 | 
"TransformerEncoderLayerBase", 21 | "TransformerDecoderLayerBase", 22 | "TransformerEncoder", 23 | "TransformerSentenceEncoderLayer", 24 | "TransformerEncoderBase", 25 | "TransformerDecoderScriptable", 26 | "TransformerDecoderBaseScriptable", 27 | ] 28 | -------------------------------------------------------------------------------- /Speech2S/speech2s/modules/relative_pos_enc.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Copyright (c) 2022 Microsoft 3 | # Licensed under The MIT License [see LICENSE for details] 4 | # Based on fairseq code bases 5 | # https://github.com/facebookresearch/fairseq 6 | # -------------------------------------------------------- 7 | 8 | import torch 9 | 10 | class RelativePositionalEncoding(torch.nn.Module): 11 | def __init__(self, d_model, maxlen=1000, embed_v=False): 12 | super(RelativePositionalEncoding, self).__init__() 13 | 14 | self.d_model = d_model 15 | self.maxlen = maxlen 16 | self.pe_k = torch.nn.Embedding(2*maxlen, d_model) 17 | if embed_v: 18 | self.pe_v = torch.nn.Embedding(2*maxlen, d_model) 19 | self.embed_v = embed_v 20 | 21 | 22 | def forward(self, pos_seq, incremental_state=None): 23 | pos_seq[pos_seq < -self.maxlen] = -self.maxlen 24 | pos_seq[pos_seq >= self.maxlen] = self.maxlen - 1 25 | pos_seq = pos_seq + self.maxlen 26 | 27 | if incremental_state is not None: 28 | pos_seq = pos_seq[-1:] 29 | 30 | if self.embed_v: 31 | return self.pe_k(pos_seq), self.pe_v(pos_seq) 32 | else: 33 | return self.pe_k(pos_seq), None 34 | -------------------------------------------------------------------------------- /Speech2S/speech2s/scripts copy/pretrain_speechut/base_speechut_for_asr.sh: -------------------------------------------------------------------------------- 1 | # #################################### 2 | # SpeechUT Base model # 3 | # #################################### 4 | [ $# -lt 2 ] && echo "Usage: $0 [mount=${PWD}] [world_size=32] [update_freq=1]" && exit 1 5 | [ ${PWD##*/} != SpeechUT ] && echo "Error: dir not match! Switch to SpeechUT/ and run it again!" 
&& exit 1
6 | DATA_DIR=$1
7 | TEXT_DATA_DIR=$2
8 | mount=$3
9 | world_size=$4
10 | update_freq=$5
11 | [ -z $mount ] && mount=${PWD}
12 | [ -z $world_size ] && world_size=32
13 | [ -z $update_freq ] && update_freq=1
14 | 
15 | CODE_ROOT=${PWD}
16 | MODEL_DIR="${mount}/exp/pretrain/base_speechut4asr_${world_size}gpu_${update_freq}accum"
17 | [ -d $MODEL_DIR ] || mkdir -p $MODEL_DIR
18 | 
19 | python $CODE_ROOT/fairseq/fairseq_cli/hydra_train.py \
20 |     --config-dir $CODE_ROOT/speechut/config/pretrain \
21 |     --config-name speechut_base_librispeech \
22 |     common.user_dir=$CODE_ROOT/speechut \
23 |     \
24 |     task.labels='["km"]' \
25 |     model.label_rate=50 \
26 |     task.data=$DATA_DIR \
27 |     task.label_dir=$DATA_DIR \
28 |     task.text_cfg.text_data=$TEXT_DATA_DIR \
29 |     \
30 |     dataset.train_subset=\"train_960+pseudo_libritext.kmu-ltr+merge_960.kmu-none\" \
31 |     dataset.valid_subset=\"dev_clean+dev.kmu-ltr+dev.kmu-none\" \
32 |     dataset.num_workers=0 \
33 |     dataset.max_tokens=1400000 \
34 |     distributed_training.distributed_world_size=${world_size} \
35 |     optimization.update_freq=[${update_freq}] \
36 |     \
37 |     common.tensorboard_logdir=$MODEL_DIR \
38 |     checkpoint.save_dir=$MODEL_DIR \
39 |     hydra.run.dir=$MODEL_DIR \
40 |     hydra.job.name=base_speechut4asr_${world_size}gpu_${update_freq}accum
41 | 
--------------------------------------------------------------------------------
/Speech2S/speech2s/scripts copy/pretrain_speechut/base_speechut_for_st.sh:
--------------------------------------------------------------------------------
1 | # ####################################
2 | # SpeechUT Base model #
3 | # ####################################
4 | [ $# -lt 3 ] && echo "Usage: $0 <DATA_DIR> <TEXT_DATA_DIR> <lang> [mount=${PWD}] [world_size=32] [update_freq=1]" && exit 1
5 | [ ${PWD##*/} != SpeechUT ] && echo "Error: dir not match! Switch to SpeechUT/ and run it again!" 
&& exit 1 6 | DATA_DIR=$1 7 | TEXT_DATA_DIR=$2 8 | lang=$3 9 | mount=$4 10 | world_size=$5 11 | update_freq=$6 12 | [ -z $mount ] && mount=${PWD} 13 | [ -z $world_size ] && world_size=32 14 | [ -z $update_freq ] && update_freq=1 15 | 16 | CODE_ROOT=${PWD} 17 | MODEL_DIR="${mount}/exp/pretrain/base_speechut4en${lang}_${world_size}gpu_${update_freq}accum" 18 | [ -d $MODEL_DIR ] || mkdir -p $MODEL_DIR 19 | 20 | python $CODE_ROOT/fairseq/fairseq_cli/hydra_train.py \ 21 | --config-dir $CODE_ROOT/speechut/config/pretrain \ 22 | --config-name speechut_base_librispeech \ 23 | common.user_dir=$CODE_ROOT/speechut \ 24 | \ 25 | task.labels='["km"]' \ 26 | model.label_rate=50 \ 27 | task.data=$DATA_DIR \ 28 | task.label_dir=$DATA_DIR \ 29 | task.text_cfg.text_data=$TEXT_DATA_DIR \ 30 | \ 31 | model.add_text_ctc=false \ 32 | model.text_transformer.share_decoder_input_output_embed=true \ 33 | criterion.u2t_ed_weight=1.0 \ 34 | criterion.u2t_ctc_weight=0 \ 35 | \ 36 | dataset.train_subset=\"train_960,mustcuns_${lang}+pseudo_wmt_en${lang}.kmu-spm+train_960.kmu-none,mustcuns_${lang}.kmu-none\" \ 37 | dataset.valid_subset=\"dev_clean+pseudo_valid.kmu-spm+dev.kmu-none\" \ 38 | dataset.num_workers=0 \ 39 | dataset.max_tokens=1400000 \ 40 | distributed_training.distributed_world_size=${world_size} \ 41 | optimization.update_freq=[${update_freq}] \ 42 | \ 43 | common.tensorboard_logdir=$MODEL_DIR \ 44 | checkpoint.save_dir=$MODEL_DIR \ 45 | hydra.run.dir=$MODEL_DIR \ 46 | hydra.job.name=base_speechut4en${lang}_${world_size}gpu_${update_freq}accum 47 | 48 | -------------------------------------------------------------------------------- /Speech2S/speech2s/scripts copy/pretrain_speechut/base_speechut_for_st_enfr.sh: -------------------------------------------------------------------------------- 1 | # #################################### 2 | # SpeechUT Base model # 3 | # #################################### 4 | [ $# -lt 3 ] && echo "Usage: $0 [lang=fr] [mount=${PWD}] [world_size=32] [update_freq=1]" && exit 1 5 | [ ${PWD##*/} != SpeechUT ] && echo "Error: dir not match! Switch to SpeechUT/ and run it again!" 
&& exit 1 6 | DATA_DIR=$1 7 | TEXT_DATA_DIR=$2 8 | lang=$3 9 | mount=$4 10 | world_size=$5 11 | update_freq=$6 12 | [ -z $lang ] && lang=fr 13 | [ -z $mount ] && mount=${PWD} 14 | [ -z $world_size ] && world_size=32 15 | [ -z $update_freq ] && update_freq=1 16 | 17 | CODE_ROOT=${PWD} 18 | MODEL_DIR="${mount}/exp/pretrain/base_speechut4en${lang}_${world_size}gpu_${update_freq}accum" 19 | [ -d $MODEL_DIR ] || mkdir -p $MODEL_DIR 20 | 21 | python $CODE_ROOT/fairseq/fairseq_cli/hydra_train.py \ 22 | --config-dir $CODE_ROOT/speechut/config/pretrain \ 23 | --config-name speechut_base_librispeech \ 24 | common.user_dir=$CODE_ROOT/speechut \ 25 | \ 26 | task.labels='["km"]' \ 27 | model.label_rate=50 \ 28 | task.data=$DATA_DIR \ 29 | task.label_dir=$DATA_DIR \ 30 | task.text_cfg.text_data=$TEXT_DATA_DIR \ 31 | \ 32 | model.add_text_ctc=false \ 33 | criterion.u2t_ed_weight=1.0 \ 34 | criterion.u2t_ctc_weight=0 \ 35 | \ 36 | dataset.train_subset=\"train_960,pretrain_mustc+pseudo_wmt14_enfr.kmu-spm+train_960.kmu-none,pretrain_mustc.kmu-none\" \ 37 | dataset.valid_subset=\"dev_clean+pseudo_valid.kmu-spm+dev.kmu-none\" \ 38 | dataset.num_workers=0 \ 39 | dataset.max_tokens=1400000 \ 40 | optimization.max_update=600000 \ 41 | distributed_training.distributed_world_size=${world_size} \ 42 | optimization.update_freq=[${update_freq}] \ 43 | \ 44 | common.tensorboard_logdir=$MODEL_DIR \ 45 | checkpoint.save_dir=$MODEL_DIR \ 46 | hydra.run.dir=$MODEL_DIR \ 47 | hydra.job.name=base_speechut4en${lang}_${world_size}gpu_${update_freq}accum 48 | 49 | -------------------------------------------------------------------------------- /Speech2S/speech2s/scripts copy/pretrain_speechut/large_speechut_for_asr.sh: -------------------------------------------------------------------------------- 1 | # #################################### 2 | # SpeechUT Large model # 3 | # #################################### 4 | [ $# -lt 2 ] && echo "Usage: $0 [mount=${PWD}] [world_size=32] [update_freq=4]" && exit 1 5 | [ ${PWD##*/} != SpeechUT ] && echo "Error: dir not match! Switch to SpeechUT/ and run it again!" 
&& exit 1
6 | DATA_DIR=$1
7 | TEXT_DATA_DIR=$2
8 | mount=$3
9 | world_size=$4
10 | update_freq=$5
11 | [ -z $mount ] && mount=${PWD}
12 | [ -z $world_size ] && world_size=32
13 | [ -z $update_freq ] && update_freq=4
14 | 
15 | CODE_ROOT=${PWD}
16 | MODEL_DIR="${mount}/exp/pretrain/large_speechut4asr_${world_size}gpu_${update_freq}accum"
17 | [ -d $MODEL_DIR ] || mkdir -p $MODEL_DIR
18 | 
19 | python $CODE_ROOT/fairseq/fairseq_cli/hydra_train.py \
20 |     --config-dir $CODE_ROOT/speechut/config/pretrain \
21 |     --config-name speechut_large_librilight \
22 |     common.user_dir=$CODE_ROOT/speechut \
23 |     \
24 |     task.labels='["km"]' \
25 |     model.label_rate=50 \
26 |     task.data=$DATA_DIR \
27 |     task.label_dir=$DATA_DIR \
28 |     task.text_cfg.text_data=$TEXT_DATA_DIR \
29 |     \
30 |     dataset.train_subset=\"train_small+pseudo_libritext.kmu-ltr\" \
31 |     dataset.valid_subset=\"dev_clean+dev.kmu-ltr\" \
32 |     dataset.num_workers=0 \
33 |     dataset.max_tokens=900000 \
34 |     distributed_training.distributed_world_size=${world_size} \
35 |     optimization.update_freq=[${update_freq}] \
36 |     \
37 |     common.tensorboard_logdir=$MODEL_DIR \
38 |     checkpoint.save_dir=$MODEL_DIR \
39 |     hydra.run.dir=$MODEL_DIR \
40 |     hydra.job.name=large_speechut4asr_${world_size}gpu_${update_freq}accum
41 | 
--------------------------------------------------------------------------------
/Speech2S/speech2s/scripts copy/tune_speechut_asr/finetune960h_large_edctc.sh:
--------------------------------------------------------------------------------
1 | # ####################################
2 | # SpeechUT Large model #
3 | # ####################################
4 | [ $# -lt 3 ] && echo "Usage: $0 <w2v_path> <DATA_DIR> <cpt> [mount=${PWD}] [world_size=8] [update_freq=3]" && exit 1
5 | [ ${PWD##*/} != SpeechUT ] && echo "Error: dir not match! Switch to SpeechUT/ and run it again!" 
&& exit 1 6 | 7 | w2v_path=$1 8 | DATA_DIR=$2 9 | cpt=$3 10 | mount=$4 11 | world_size=$5 12 | update_freq=$6 13 | [ -z $mount ] && mount=${PWD} 14 | [ -z $world_size ] && world_size=8 15 | [ -z $update_freq ] && update_freq=3 16 | 17 | CODE_ROOT=${PWD} 18 | 19 | exp_name=${w2v_path%/*} 20 | exp_name=${exp_name##*/} 21 | MODEL_DIR="${mount}/exp/finetune_asr/$exp_name/960h_edctc80k_from_${cpt}_bz3.3m_lr1e-5" 22 | [ -d $MODEL_DIR ] || mkdir -p $MODEL_DIR 23 | 24 | python $CODE_ROOT/fairseq/fairseq_cli/hydra_train.py \ 25 | --config-dir $CODE_ROOT/speechut/config/finetune_asr \ 26 | --config-name speechut_large_960h \ 27 | common.user_dir=$CODE_ROOT/speechut \ 28 | \ 29 | task.data=$DATA_DIR \ 30 | task.label_dir=$DATA_DIR \ 31 | model.w2v_path=${w2v_path} \ 32 | \ 33 | optimization.lr=[0.00001] \ 34 | optimization.max_update=80000 \ 35 | dataset.max_tokens=1100000 \ 36 | optimization.update_freq=[${update_freq}] \ 37 | distributed_training.distributed_world_size=${world_size} \ 38 | \ 39 | dataset.train_subset="train_960" \ 40 | dataset.valid_subset="dev_other" \ 41 | \ 42 | common.tensorboard_logdir=$MODEL_DIR \ 43 | checkpoint.save_dir=$MODEL_DIR \ 44 | hydra.run.dir=$MODEL_DIR \ 45 | hydra.job.name=960h_edctc80k_from_${cpt}_bz3.3m_lr1e-5 46 | -------------------------------------------------------------------------------- /Speech2S/speech2s/scripts copy/tune_speechut_asr/finetune_base_edctc.sh: -------------------------------------------------------------------------------- 1 | # #################################### 2 | # SpeechUT Base model # 3 | # #################################### 4 | [ $# -lt 3 ] && echo "Usage: $0 [mount=${PWD}] [world_size=8] [update_freq=2]" && exit 1 5 | [ ${PWD##*/} != SpeechUT ] && echo "Error: dir not match! Switch to SpeechUT/ and run it again!" 
&& exit 1 6 | 7 | w2v_path=$1 8 | DATA_DIR=$2 9 | cpt=$3 10 | mount=$4 11 | world_size=$5 12 | update_freq=$6 13 | [ -z $mount ] && mount=${PWD} 14 | [ -z $world_size ] && world_size=8 15 | [ -z $update_freq ] && update_freq=2 16 | 17 | CODE_ROOT=${PWD} 18 | 19 | exp_name=${w2v_path%/*} 20 | exp_name=${exp_name##*/} 21 | MODEL_DIR="${mount}/exp/finetune_asr/$exp_name/edctc40k_from_${cpt}_bz2.6m_lr1e-5" 22 | [ -d $MODEL_DIR ] || mkdir -p $MODEL_DIR 23 | 24 | python $CODE_ROOT/fairseq/fairseq_cli/hydra_train.py \ 25 | --config-dir $CODE_ROOT/speechut/config/finetune_asr \ 26 | --config-name speechut_base_100h \ 27 | common.user_dir=$CODE_ROOT/speechut \ 28 | \ 29 | task.data=$DATA_DIR \ 30 | task.label_dir=$DATA_DIR \ 31 | model.w2v_path=${w2v_path} \ 32 | \ 33 | optimization.lr=[0.00001] \ 34 | optimization.max_update=40000 \ 35 | dataset.max_tokens=1300000 \ 36 | optimization.update_freq=[${update_freq}] \ 37 | distributed_training.distributed_world_size=${world_size} \ 38 | \ 39 | dataset.train_subset="train_clean_100" \ 40 | dataset.valid_subset="dev_other" \ 41 | \ 42 | common.tensorboard_logdir=$MODEL_DIR \ 43 | checkpoint.save_dir=$MODEL_DIR \ 44 | hydra.run.dir=$MODEL_DIR \ 45 | hydra.job.name=edctc40k_from_${cpt}_bz2.6m_lr1e-5 46 | -------------------------------------------------------------------------------- /Speech2S/speech2s/scripts copy/tune_speechut_asr/inference_edctc.sh: -------------------------------------------------------------------------------- 1 | ##################################### 2 | # SpeechUT ASR model # 3 | ##################################### 4 | [ $# -lt 2 ] && echo "Usage: $0 [gen-set=dev_other] [beam_size=10] [ctc_weight=0.2] [--normalize]" && exit 1 5 | [ ${PWD##*/} != SpeechUT ] && echo "Error: dir not match! Switch to SpeechUT/ and run it again!" && exit 1 6 | 7 | model_path=$1 8 | DATA_DIR=$2 9 | gen_set=$3 10 | beam_size=$4 11 | ctc_weight=$5 12 | extra=$6 13 | [ -z $extra ] && echo "Assert decoding base model! If you are decoding large model, please add '--normalize' at the end..." 14 | [ -z $gen_set ] && gen_set="dev_other" 15 | [ -z $beam_size ] && beam_size=10 16 | [ -z $ctc_weight ] && ctc_weight=0.2 17 | [ $ctc_weight == 0 ] && [ $beam_size != 1 ] && echo "Change beam size to 1 as no ctc-decoding used..." && beam_size=1 18 | [ $ctc_weight != 0 ] && extra="$extra --batch-size 1" 19 | 20 | src_dir=${model_path%/*} 21 | cpt=${model_path##*/} 22 | cpt=${cpt%.*} 23 | 24 | CODE_ROOT=${PWD} 25 | 26 | for subset in ${gen_set//,/ }; do 27 | results_path=$src_dir/decode_${cpt}/beam${beam_size}_ctc${ctc_weight}/${subset}_${world_size}_${rank} 28 | [ ! 
-d $results_path ] && mkdir -p $results_path 29 | 30 | python $CODE_ROOT/fairseq/fairseq_cli/generate.py $DATA_DIR \ 31 | --user-dir $CODE_ROOT/speechut \ 32 | --label-dir ${DATA_DIR} \ 33 | --labels '["ltr"]' \ 34 | --single-target \ 35 | --post-process letter \ 36 | --gen-subset ${subset} \ 37 | --max-tokens 2000000 \ 38 | \ 39 | --task joint_sc2t_pretraining \ 40 | --add-decoder-target \ 41 | --fine-tuning \ 42 | --pad-audio \ 43 | --random-crop \ 44 | \ 45 | --ctc-weight ${ctc_weight} $extra \ 46 | --beam ${beam_size} \ 47 | \ 48 | --path ${model_path} \ 49 | --results-path $results_path \ 50 | \ 51 | --scoring wer --max-len-a 0.00078125 --max-len-b 200 \ 52 | & 53 | done 54 | wait 55 | 56 | 57 | for subset in ${gen_set//,/ }; do 58 | results_path=$src_dir/decode_${cpt}/beam${beam_size}_ctc${ctc_weight}/${subset}_${world_size}_${rank} 59 | echo $results_path 60 | tail -n 1 $results_path/generate-*.txt 61 | done 62 | -------------------------------------------------------------------------------- /Speech2S/speech2s/scripts copy/tune_speechut_st/inference_st.sh: -------------------------------------------------------------------------------- 1 | # #################################### 2 | # SpeechUT Base model # 3 | # #################################### 4 | [ $# -lt 3 ] && echo "Usage: $0 [gen-set=dev] [beam_size=10] [lenpen=1.0]" && exit 0 5 | [ ${PWD##*/} != SpeechUT ] && echo "Error: dir not match! Switch to SpeechUT/ and run it again!" && exit 1 6 | 7 | model_path=$1 8 | DATA_DIR=$2 9 | lang=$3 10 | gen_set=$4 11 | beam_size=$5 12 | lenpen=$6 13 | [ -z $gen_set ] && gen_set="dev" 14 | [ -z $beam_size ] && beam_size=10 15 | [ -z $lenpen ] && lenpen=1 16 | src_dir=${model_path%/*} 17 | cpt=${model_path##*/} 18 | cpt=${cpt%.*} 19 | 20 | CODE_ROOT=${PWD} 21 | results_path=$src_dir/decode_${cpt}_beam${beam_size}/${gen_set} 22 | [ ! 
-d $results_path ] && mkdir -p $results_path 23 | 24 | python $CODE_ROOT/fairseq/fairseq_cli/generate.py $DATA_DIR \ 25 | --gen-subset ${gen_set}_st \ 26 | --max-tokens 2000000 \ 27 | --max-source-positions 2000000 \ 28 | --num-workers 0 \ 29 | \ 30 | --user-dir $CODE_ROOT/speechut \ 31 | --task speech_to_text \ 32 | --config-yaml config_en${lang}.yaml \ 33 | \ 34 | --path ${model_path} \ 35 | --results-path $results_path \ 36 | \ 37 | --scoring sacrebleu --max-len-a 0 --max-len-b 512 \ 38 | --beam ${beam_size} \ 39 | --lenpen $lenpen \ 40 | # --model-overrides "{'model':{'w2v_path':'/path/to/your/pretrained/model.pt'}}" \ 41 | 42 | echo $results_path 43 | tail -n 1 $results_path/generate-*.txt 44 | sleep 1s 45 | -------------------------------------------------------------------------------- /Speech2S/speech2s/scripts/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/SpeechT5/5d66cf5f37e97f4a1999ad519537decc16d852af/Speech2S/speech2s/scripts/__init__.py -------------------------------------------------------------------------------- /Speech2S/speech2s/scripts/compare_namespaces.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """Helper script to compare two argparse.Namespace objects.""" 3 | 4 | from argparse import Namespace # noqa 5 | 6 | 7 | def main(): 8 | 9 | ns1 = eval(input("Namespace 1: ")) 10 | ns2 = eval(input("Namespace 2: ")) 11 | 12 | def keys(ns): 13 | ks = set() 14 | for k in dir(ns): 15 | if not k.startswith("_"): 16 | ks.add(k) 17 | return ks 18 | 19 | k1 = keys(ns1) 20 | k2 = keys(ns2) 21 | 22 | def print_keys(ks, ns1, ns2=None): 23 | for k in ks: 24 | if ns2 is None: 25 | print("{}\t{}".format(k, getattr(ns1, k, None))) 26 | else: 27 | print( 28 | "{}\t{}\t{}".format(k, getattr(ns1, k, None), getattr(ns2, k, None)) 29 | ) 30 | 31 | print("Keys unique to namespace 1:") 32 | print_keys(k1 - k2, ns1) 33 | print() 34 | 35 | print("Keys unique to namespace 2:") 36 | print_keys(k2 - k1, ns2) 37 | print() 38 | 39 | print("Overlapping keys with different values:") 40 | ks = [k for k in k1 & k2 if getattr(ns1, k, "None") != getattr(ns2, k, "None")] 41 | print_keys(ks, ns1, ns2) 42 | print() 43 | 44 | 45 | if __name__ == "__main__": 46 | main() 47 | -------------------------------------------------------------------------------- /Speech2S/speech2s/scripts/compound_split_bleu.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ $# -ne 1 ]; then 4 | echo "usage: $0 GENERATE_PY_OUTPUT" 5 | exit 1 6 | fi 7 | 8 | GEN=$1 9 | 10 | SYS=$GEN.sys 11 | REF=$GEN.ref 12 | 13 | if [ $(tail -n 1 $GEN | grep BLEU | wc -l) -ne 1 ]; then 14 | echo "not done generating" 15 | exit 16 | fi 17 | 18 | grep ^H $GEN | awk -F '\t' '{print $NF}' | perl -ple 's{(\S)-(\S)}{$1 ##AT##-##AT## $2}g' > $SYS 19 | grep ^T $GEN | cut -f2- | perl -ple 's{(\S)-(\S)}{$1 ##AT##-##AT## $2}g' > $REF 20 | fairseq-score --sys $SYS --ref $REF 21 | -------------------------------------------------------------------------------- /Speech2S/speech2s/scripts/constraints/validate.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # 3 | # Copyright (c) Facebook, Inc. and its affiliates. 4 | # 5 | # This source code is licensed under the MIT license found in the 6 | # LICENSE file in the root directory of this source tree. 
7 | 8 | import sys 9 | 10 | 11 | """Reads in a fairseq output file, and verifies that the constraints 12 | (C- lines) are present in the output (the first H- line). Assumes that 13 | constraints are listed prior to the first hypothesis. 14 | """ 15 | 16 | constraints = [] 17 | found = 0 18 | total = 0 19 | for line in sys.stdin: 20 | if line.startswith("C-"): 21 | constraints.append(line.rstrip().split("\t")[1]) 22 | elif line.startswith("H-"): 23 | text = line.split("\t")[2] 24 | 25 | for constraint in constraints: 26 | total += 1 27 | if constraint in text: 28 | found += 1 29 | else: 30 | print(f"No {constraint} in {text}", file=sys.stderr) 31 | 32 | constraints = [] 33 | 34 | print(f"Found {found} / {total} = {100 * found / total:.1f}%") 35 | -------------------------------------------------------------------------------- /Speech2S/speech2s/scripts/convert_dictionary.lua: -------------------------------------------------------------------------------- 1 | -- Copyright (c) Facebook, Inc. and its affiliates. 2 | -- 3 | -- This source code is licensed under the MIT license found in the 4 | -- LICENSE file in the root directory of this source tree. 5 | -- 6 | -- Usage: convert_dictionary.lua 7 | require 'fairseq' 8 | require 'torch' 9 | require 'paths' 10 | 11 | if #arg < 1 then 12 | print('usage: convert_dictionary.lua ') 13 | os.exit(1) 14 | end 15 | if not paths.filep(arg[1]) then 16 | print('error: file does not exit: ' .. arg[1]) 17 | os.exit(1) 18 | end 19 | 20 | dict = torch.load(arg[1]) 21 | dst = paths.basename(arg[1]):gsub('.th7', '.txt') 22 | assert(dst:match('.txt$')) 23 | 24 | f = io.open(dst, 'w') 25 | for idx, symbol in ipairs(dict.index_to_symbol) do 26 | if idx > dict.cutoff then 27 | break 28 | end 29 | f:write(symbol) 30 | f:write(' ') 31 | f:write(dict.index_to_freq[idx]) 32 | f:write('\n') 33 | end 34 | f:close() 35 | -------------------------------------------------------------------------------- /Speech2S/speech2s/scripts/count_docs.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. 3 | # 4 | # This source code is licensed under the MIT license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | """ 7 | Count the number of documents and average number of lines and tokens per 8 | document in a large file. Documents should be separated by a single empty line. 
9 | """ 10 | 11 | import argparse 12 | import gzip 13 | import sys 14 | 15 | import numpy as np 16 | 17 | 18 | def main(): 19 | parser = argparse.ArgumentParser() 20 | parser.add_argument("input") 21 | parser.add_argument("--gzip", action="store_true") 22 | args = parser.parse_args() 23 | 24 | def gopen(): 25 | if args.gzip: 26 | return gzip.open(args.input, "r") 27 | else: 28 | return open(args.input, "r", encoding="utf-8") 29 | 30 | num_lines = [] 31 | num_toks = [] 32 | with gopen() as h: 33 | num_docs = 1 34 | num_lines_in_doc = 0 35 | num_toks_in_doc = 0 36 | for i, line in enumerate(h): 37 | if len(line.strip()) == 0: # empty line indicates new document 38 | num_docs += 1 39 | num_lines.append(num_lines_in_doc) 40 | num_toks.append(num_toks_in_doc) 41 | num_lines_in_doc = 0 42 | num_toks_in_doc = 0 43 | else: 44 | num_lines_in_doc += 1 45 | num_toks_in_doc += len(line.rstrip().split()) 46 | if i % 1000000 == 0: 47 | print(i, file=sys.stderr, end="", flush=True) 48 | elif i % 100000 == 0: 49 | print(".", file=sys.stderr, end="", flush=True) 50 | print(file=sys.stderr, flush=True) 51 | 52 | print("found {} docs".format(num_docs)) 53 | print("average num lines per doc: {}".format(np.mean(num_lines))) 54 | print("average num toks per doc: {}".format(np.mean(num_toks))) 55 | 56 | 57 | if __name__ == "__main__": 58 | main() 59 | -------------------------------------------------------------------------------- /Speech2S/speech2s/scripts/read_binarized.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. 3 | # 4 | # This source code is licensed under the MIT license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | import argparse 8 | 9 | from fairseq.data import Dictionary, data_utils, indexed_dataset 10 | 11 | 12 | def get_parser(): 13 | parser = argparse.ArgumentParser( 14 | description="writes text from binarized file to stdout" 15 | ) 16 | # fmt: off 17 | parser.add_argument('--dataset-impl', help='dataset implementation', 18 | choices=indexed_dataset.get_available_dataset_impl()) 19 | parser.add_argument('--dict', metavar='FP', help='dictionary containing known words', default=None) 20 | parser.add_argument('--input', metavar='FP', required=True, help='binarized file to read') 21 | # fmt: on 22 | 23 | return parser 24 | 25 | 26 | def main(): 27 | parser = get_parser() 28 | args = parser.parse_args() 29 | 30 | dictionary = Dictionary.load(args.dict) if args.dict is not None else None 31 | dataset = data_utils.load_indexed_dataset( 32 | args.input, 33 | dictionary, 34 | dataset_impl=args.dataset_impl, 35 | default="lazy", 36 | ) 37 | 38 | for tensor_line in dataset: 39 | if dictionary is None: 40 | line = " ".join([str(int(x)) for x in tensor_line]) 41 | else: 42 | line = dictionary.string(tensor_line) 43 | 44 | print(line) 45 | 46 | 47 | if __name__ == "__main__": 48 | main() 49 | -------------------------------------------------------------------------------- /Speech2S/speech2s/scripts/sacrebleu.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ $# -ne 4 ]; then 4 | echo "usage: $0 TESTSET SRCLANG TGTLANG GEN" 5 | exit 1 6 | fi 7 | 8 | TESTSET=$1 9 | SRCLANG=$2 10 | TGTLANG=$3 11 | 12 | GEN=$4 13 | 14 | if ! 
command -v sacremoses &> /dev/null 15 | then 16 | echo "sacremoses could not be found, please install with: pip install sacremoses" 17 | exit 18 | fi 19 | 20 | grep ^H $GEN \ 21 | | sed 's/^H\-//' \ 22 | | sort -n -k 1 \ 23 | | cut -f 3 \ 24 | | sacremoses detokenize \ 25 | > $GEN.sorted.detok 26 | 27 | sacrebleu --test-set $TESTSET --language-pair "${SRCLANG}-${TGTLANG}" < $GEN.sorted.detok 28 | -------------------------------------------------------------------------------- /Speech2S/speech2s/scripts/shard_docs.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. 3 | # 4 | # This source code is licensed under the MIT license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | """ 7 | Split a large file into shards while respecting document boundaries. Documents 8 | should be separated by a single empty line. 9 | """ 10 | 11 | import argparse 12 | import contextlib 13 | 14 | 15 | def main(): 16 | parser = argparse.ArgumentParser() 17 | parser.add_argument("input") 18 | parser.add_argument("--num-shards", type=int) 19 | args = parser.parse_args() 20 | 21 | assert args.num_shards is not None and args.num_shards > 1 22 | 23 | with open(args.input, "r", encoding="utf-8") as h: 24 | with contextlib.ExitStack() as stack: 25 | outputs = [ 26 | stack.enter_context( 27 | open(args.input + ".shard" + str(i), "w", encoding="utf-8") 28 | ) 29 | for i in range(args.num_shards) 30 | ] 31 | 32 | doc = [] 33 | first_doc = [True] * args.num_shards 34 | 35 | def output_doc(i): 36 | if not first_doc[i]: 37 | outputs[i].write("\n") 38 | first_doc[i] = False 39 | for line in doc: 40 | outputs[i].write(line) 41 | doc.clear() 42 | 43 | num_docs = 0 44 | for line in h: 45 | if line.strip() == "": # empty line indicates new document 46 | output_doc(num_docs % args.num_shards) 47 | num_docs += 1 48 | else: 49 | doc.append(line) 50 | output_doc(num_docs % args.num_shards) 51 | 52 | 53 | if __name__ == "__main__": 54 | main() 55 | -------------------------------------------------------------------------------- /Speech2S/speech2s/scripts/spm_decode.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright (c) Facebook, Inc. and its affiliates. 3 | # All rights reserved. 4 | # 5 | # This source code is licensed under the license found in the 6 | # LICENSE file in the root directory of this source tree. 
7 | 8 | from __future__ import absolute_import, division, print_function, unicode_literals 9 | 10 | import argparse 11 | 12 | import sentencepiece as spm 13 | 14 | 15 | def main(): 16 | parser = argparse.ArgumentParser() 17 | parser.add_argument( 18 | "--model", required=True, help="sentencepiece model to use for decoding" 19 | ) 20 | parser.add_argument("--input", required=True, help="input file to decode") 21 | parser.add_argument("--input_format", choices=["piece", "id"], default="piece") 22 | args = parser.parse_args() 23 | 24 | sp = spm.SentencePieceProcessor() 25 | sp.Load(args.model) 26 | 27 | if args.input_format == "piece": 28 | 29 | def decode(input): 30 | return "".join(sp.DecodePieces(input)) 31 | 32 | elif args.input_format == "id": 33 | 34 | def decode(input): 35 | return "".join(sp.DecodeIds(input)) 36 | 37 | else: 38 | raise NotImplementedError 39 | 40 | def tok2int(tok): 41 | # remap reference-side (represented as <>) to 0 42 | return int(tok) if tok != "<>" else 0 43 | 44 | with open(args.input, "r", encoding="utf-8") as h: 45 | for line in h: 46 | if args.input_format == "id": 47 | print(decode(list(map(tok2int, line.rstrip().split())))) 48 | elif args.input_format == "piece": 49 | print(decode(line.rstrip().split())) 50 | 51 | 52 | if __name__ == "__main__": 53 | main() 54 | -------------------------------------------------------------------------------- /Speech2S/speech2s/scripts/spm_train.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright (c) Facebook, Inc. and its affiliates. 3 | # All rights reserved. 4 | # 5 | # This source code is licensed under the license found in the 6 | # LICENSE file in the root directory of this source tree. 7 | 8 | from __future__ import absolute_import, division, print_function, unicode_literals 9 | 10 | import sys 11 | 12 | import sentencepiece as spm 13 | 14 | 15 | if __name__ == "__main__": 16 | spm.SentencePieceTrainer.Train(" ".join(sys.argv[1:])) 17 | -------------------------------------------------------------------------------- /Speech2S/speech2s/scripts/test_fsdp.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | rm -rf fsdp_dummy 3 | mkdir -p fsdp_dummy 4 | CUDA_VISIBLE_DEVICES=0,1,2,3 fairseq-train /private/home/sshleifer/data-bin/stories_mmap \ 5 | --ddp-backend fully_sharded --fp16 --fp16-init-scale 4 \ 6 | --cpu-offload --checkpoint-activations \ 7 | --task language_modeling --tokens-per-sample 256 --batch-size 8 \ 8 | --arch transformer_lm_gpt2_tiny \ 9 | --optimizer cpu_adam --adam-betas "(0.9,0.98)" \ 10 | --lr 0.0001 --lr-scheduler polynomial_decay --warmup-updates 5 --total-num-update 10 \ 11 | --max-update 5 --log-format json --log-interval 1 \ 12 | --save-interval-updates 5 --save-dir fsdp_dummy --disable-validation \ 13 | --restore-file x.pt "$@" 14 | 15 | # Now we try to load the checkpoint 16 | CUDA_VISIBLE_DEVICES=0,1 fairseq-train /private/home/sshleifer/data-bin/stories_mmap \ 17 | --ddp-backend fully_sharded --fp16 --fp16-init-scale 4 \ 18 | --cpu-offload --checkpoint-activations \ 19 | --task language_modeling --tokens-per-sample 256 --batch-size 8 \ 20 | --arch transformer_lm_gpt2_tiny \ 21 | --optimizer cpu_adam --adam-betas "(0.9,0.98)" \ 22 | --lr 0.0001 --lr-scheduler polynomial_decay --warmup-updates 5 --total-num-update 10 \ 23 | --max-update 2 --log-format json --log-interval 1 \ 24 | --save-interval-updates 2 --save-dir fsdp_dummy 25 | 
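# A minimal usage sketch for the SentencePiece helpers spm_train.py and
# spm_decode.py listed just above. The corpus file corpus.txt, the model
# prefix spm_unigram_10000, and the piece-format hypothesis file hyp.spm are
# assumed names for illustration only, not files shipped with this repo.
#
#   # train a unigram model with a 10k-piece vocabulary
#   python spm_train.py --input=corpus.txt --model_prefix=spm_unigram_10000 \
#       --vocab_size=10000 --model_type=unigram
#
#   # map piece-tokenized output back to raw text
#   python spm_decode.py --model=spm_unigram_10000.model \
#       --input=hyp.spm --input_format=piece > hyp.txt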
-------------------------------------------------------------------------------- /Speech2S/speech2s/stpretrain_scripts/base_sc2c_enes.sh: -------------------------------------------------------------------------------- 1 | 2 | # #################################### 3 | # Hubert SCT2T ED model # 4 | # #################################### 5 | 6 | world_size=$1 7 | update_freq=$2 8 | exp_name=$3 9 | [ -z $world_size ] && world_size=8 10 | [ -z $update_freq ] && update_freq=1 11 | [ -z $exp_name ] && exp_name=sc2t_base_enes_${world_size}gpu_${update_freq}accum6666 12 | 13 | 14 | FAIRSEQ_ROOT=/mnt/output/users/v-kunwei/code/fairseq_mlstku 15 | CONFIG_DIR=/mnt/output/users/v-kunwei/code/stpretrain_scripts/config 16 | DATA_DIR="/mnt/output/users/v-kunwei/data/s2s_data/speech_enes" 17 | TEXT_DATA_DIR="/mnt/output/users/v-kunwei/data/s2s_data/text_enes/bin-idx" 18 | MODEL_DIR="/mnt/output/v-kunwei/data/s2s_data/exp/S2S_enes/$exp_name" 19 | 20 | [ -d $MODEL_DIR ] || mkdir -p $MODEL_DIR 21 | 22 | 23 | python $FAIRSEQ_ROOT/fairseq_cli/hydra_train.py \ 24 | --config-dir $CONFIG_DIR/pretrain \ 25 | --config-name sc2t_base_librispeech \ 26 | \ 27 | +task.store_labels=true \ 28 | task.labels='["km"]' \ 29 | model.label_rate=50 \ 30 | task.data=$DATA_DIR \ 31 | task.label_dir=$DATA_DIR \ 32 | task.text_cfg.text_data=$TEXT_DATA_DIR \ 33 | +task.text_cfg.data_config=config.yaml \ 34 | task.text_cfg.text_maxtokens_ratio=3.0 \ 35 | \ 36 | +criterion.dec_loss_type="ce" \ 37 | \ 38 | criterion.text_weight=1.0 \ 39 | \ 40 | model.use_rel_pos_enc=true \ 41 | +model.code_use_rel_pos_enc=true \ 42 | +model.pad_with_code=true \ 43 | model.text_transformer.no_scale_embedding=true \ 44 | model.text_transformer.layernorm_embedding=true \ 45 | +model.share_decoder_input_output_embed=true \ 46 | \ 47 | dataset.train_subset=\"train_all+en.kmu-spm\" \ 48 | dataset.valid_subset=\"valid+en_valid.kmu-spm\" \ 49 | dataset.num_workers=0 \ 50 | dataset.max_tokens=1000000 \ 51 | optimization.update_freq=[${update_freq}] \ 52 | optimization.max_update=400000 \ 53 | \ 54 | distributed_training.distributed_world_size=${world_size} \ 55 | \ 56 | common.tensorboard_logdir=$MODEL_DIR \ 57 | checkpoint.save_dir=$MODEL_DIR \ 58 | hydra.run.dir=$MODEL_DIR \ 59 | hydra.job.name=${exp_name} 60 | 61 | 62 | sleep 5m 63 | echo "All finished" 64 | 65 | -------------------------------------------------------------------------------- /Speech2S/speech2s/stpretrain_scripts/base_sc2c_esen.sh: -------------------------------------------------------------------------------- 1 | 2 | # #################################### 3 | # Hubert SCT2T ED model # 4 | # #################################### 5 | 6 | world_size=$1 7 | update_freq=$2 8 | exp_name=$3 9 | [ -z $world_size ] && world_size=24 10 | [ -z $update_freq ] && update_freq=3 11 | [ -z $exp_name ] && exp_name=sc2t_base_esen_${world_size}gpu_${update_freq}accum1 12 | 13 | 14 | FAIRSEQ_ROOT=/mnt/output/users/v-kunwei/code/fairseq_mlstku 15 | CONFIG_DIR=/mnt/output/users/v-kunwei/code/stpretrain_scripts/config 16 | DATA_DIR="/mnt/output/users/v-kunwei/data/s2s_data/speech_esen" 17 | TEXT_DATA_DIR="/mnt/output/users/v-kunwei/data/s2s_data/text_esen" 18 | MODEL_DIR="/mnt/output/v-kunwei/data/s2s_data/exp/S2S_esen/$exp_name" 19 | 20 | [ -d $MODEL_DIR ] || mkdir -p $MODEL_DIR 21 | 22 | 23 | python $FAIRSEQ_ROOT/fairseq_cli/hydra_train.py \ 24 | --config-dir $CONFIG_DIR/pretrain \ 25 | --config-name sc2t_base_librispeech \ 26 | \ 27 | +task.store_labels=true \ 28 | task.labels='["km"]' \ 29 | 
model.label_rate=50 \ 30 | task.data=$DATA_DIR \ 31 | task.label_dir=$DATA_DIR \ 32 | task.text_cfg.text_data=$TEXT_DATA_DIR \ 33 | +task.text_cfg.data_config=config.yaml \ 34 | task.text_cfg.text_maxtokens_ratio=3.0 \ 35 | \ 36 | +criterion.dec_loss_type="ce" \ 37 | \ 38 | criterion.text_weight=1.0 \ 39 | \ 40 | model.use_rel_pos_enc=true \ 41 | +model.code_use_rel_pos_enc=true \ 42 | +model.pad_with_code=true \ 43 | model.text_transformer.no_scale_embedding=true \ 44 | model.text_transformer.layernorm_embedding=true \ 45 | +model.share_decoder_input_output_embed=true \ 46 | \ 47 | dataset.train_subset=\"train+en.kmu-spm\" \ 48 | dataset.valid_subset=\"valid+en_valid.kmu-spm\" \ 49 | dataset.num_workers=0 \ 50 | dataset.max_tokens=1000000 \ 51 | optimization.update_freq=[${update_freq}] \ 52 | optimization.max_update=400000 \ 53 | \ 54 | distributed_training.distributed_world_size=${world_size} \ 55 | \ 56 | common.tensorboard_logdir=$MODEL_DIR \ 57 | checkpoint.save_dir=$MODEL_DIR \ 58 | hydra.run.dir=$MODEL_DIR \ 59 | hydra.job.name=${exp_name} 60 | 61 | 62 | sleep 5m 63 | echo "All finished" 64 | 65 | -------------------------------------------------------------------------------- /Speech2S/speech2s/stpretrain_scripts/config.yaml: -------------------------------------------------------------------------------- 1 | audio_root: ./ 2 | standardize_audio: true 3 | use_audio_input: true 4 | vocab_filename: dict.txt 5 | -------------------------------------------------------------------------------- /Speech2S/speech2s/stpretrain_scripts/config/finetune_asr/large_960h.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | 3 | common: 4 | fp16: true 5 | log_format: json 6 | log_interval: 200 7 | tensorboard_logdir: tblog 8 | 9 | checkpoint: 10 | save_interval: 1 11 | keep_last_epochs: 10 12 | keep_best_checkpoints: 5 13 | best_checkpoint_metric: wer 14 | restore_file: checkpoint_last.pt 15 | 16 | distributed_training: 17 | ddp_backend: c10d 18 | find_unused_parameters: true 19 | distributed_world_size: 24 20 | distributed_port: -1 21 | nprocs_per_node: 8 22 | 23 | task: 24 | _name: hubert_pretraining 25 | data: ??? 26 | fine_tuning: true 27 | label_dir: ??? 28 | normalize: true # must be consistent with pre-training 29 | labels: ["ltr"] 30 | single_target: true 31 | add_decoder: false 32 | pad_audio: false 33 | random_crop: true 34 | tokenizer: "none" 35 | sp_path: None 36 | 37 | dataset: 38 | num_workers: 0 39 | max_tokens: 1280000 40 | skip_invalid_size_inputs_valid_test: true 41 | valid_subset: dev_other 42 | required_batch_size_multiple: 1 43 | 44 | criterion: 45 | _name: ctc 46 | zero_infinity: true 47 | 48 | optimization: 49 | max_update: 200000 50 | lr: [0.00003] 51 | sentence_avg: true 52 | update_freq: [1] 53 | 54 | optimizer: 55 | _name: adam 56 | adam_betas: (0.9,0.98) 57 | adam_eps: 1e-08 58 | weight_decay: 0.0 59 | 60 | lr_scheduler: 61 | _name: tri_stage 62 | phase_ratio: [0.1, 0.4, 0.5] 63 | final_lr_scale: 0.05 64 | 65 | model: 66 | _name: hubert_ctc 67 | w2v_path: ??? 
68 | apply_mask: true 69 | mask_prob: 0.5 70 | mask_channel_prob: 0.25 71 | mask_channel_length: 64 72 | layerdrop: 0.0 73 | decoder_layerdrop: 0.1 74 | activation_dropout: 0.1 75 | feature_grad_mult: 0.0 76 | freeze_finetune_updates: 0 77 | add_decoder: false 78 | 79 | hydra: 80 | job: 81 | config: 82 | override_dirname: 83 | kv_sep: '-' 84 | item_sep: '__' 85 | exclude_keys: 86 | - run 87 | - task.data 88 | - task.label_dir 89 | - model.w2v_path 90 | - dataset.train_subset 91 | - dataset.valid_subset 92 | - criterion.wer_kenlm_model 93 | - criterion.wer_lexicon 94 | run: 95 | dir: ??? 96 | sweep: 97 | dir: ??? 98 | subdir: ${hydra.job.config_name}__${hydra.job.override_dirname} 99 | -------------------------------------------------------------------------------- /Speech2S/speech2s/stpretrain_scripts/config/translation/text2code.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | 3 | common: 4 | fp16: true 5 | log_format: json 6 | log_interval: 200 7 | tensorboard_logdir: tblog 8 | seed: 1337 9 | 10 | checkpoint: 11 | save_interval: 1000000 12 | keep_last_epochs: 5 13 | save_interval_updates: 1000 14 | keep_interval_updates_pattern: 10000 15 | keep_interval_updates: 5 16 | best_checkpoint_metric: accuracy 17 | maximize_best_checkpoint_metric: true 18 | 19 | distributed_training: 20 | ddp_backend: c10d 21 | find_unused_parameters: true 22 | distributed_world_size: 1 23 | nprocs_per_node: 8 24 | 25 | 26 | criterion: 27 | _name: "label_smoothed_cross_entropy" 28 | 29 | 30 | task: 31 | _name: "translation_from_jst" 32 | 33 | dataset: 34 | num_workers: 0 35 | max_tokens: 4096 36 | skip_invalid_size_inputs_valid_test: true 37 | validate_after_updates: ${model.freeze_finetune_updates} 38 | validate_interval: ${checkpoint.save_interval} 39 | validate_interval_updates: ${checkpoint.save_interval_updates} 40 | train_subset: train_clean_100 41 | valid_subset: dev_clean 42 | required_batch_size_multiple: 1 43 | 44 | optimizer: 45 | _name: adam 46 | adam_betas: (0.9,0.98) 47 | adam_eps: 1e-06 48 | weight_decay: 0.0 49 | 50 | lr_scheduler: 51 | _name: tri_stage 52 | phase_ratio: [0.1, 0.4, 0.5] 53 | final_lr_scale: 0.05 54 | 55 | model: 56 | _name: hubert_t2c 57 | w2v_path: ??? 58 | layerdrop: 0.1 59 | decoder_layerdrop: 0.1 60 | activation_dropout: 0.1 61 | feature_grad_mult: 0.0 62 | freeze_finetune_updates: 0 63 | 64 | hydra: 65 | job: 66 | config: 67 | override_dirname: 68 | kv_sep: '-' 69 | item_sep: '__' 70 | exclude_keys: 71 | - run 72 | - task.data 73 | - task.label_dir 74 | - model.w2v_path 75 | - dataset.train_subset 76 | - dataset.valid_subset 77 | run: 78 | dir: ??? 79 | sweep: 80 | dir: ??? 81 | subdir: ${hydra.job.config_name}__${hydra.job.override_dirname} 82 | -------------------------------------------------------------------------------- /Speech2S/speech2s/stpretrain_scripts/data_process/extract_hubert_feature_itp.sh: -------------------------------------------------------------------------------- 1 | 2 | if [ ! 
-d ${HOME}/azcopy_linux_amd64_10.11.0 ]; then 3 | CURRENT_DIR=`pwd` 4 | cd ${HOME} && wget https://azcopyvnext.azureedge.net/release20210616/azcopy_linux_amd64_10.11.0.tar.gz && tar -zxvf azcopy_linux_amd64_10.11.0.tar.gz && rm -f azcopy_linux_amd64_10.11.0.tar.gz && cd ${CURRENT_DIR} 5 | fi 6 | export PATH=$PATH:${HOME}/azcopy_linux_amd64_10.11.0/:${HOME}/.local/bin 7 | export PYTHONPATH=$PYTHONPATH:/mnt/output/users/v-kunwei/code/fairseq 8 | 9 | rank=$1 10 | nshard=$2 11 | split=$3 12 | [ -z $rank ] && echo "please specify rank" 13 | [ -z $nshard ] && nshard=1 14 | [ -z $split ] && split="train" 15 | 16 | 17 | FAIRSEQ_ROOT=/mnt/output/users/v-kunwei/code/fairseq 18 | ckpt_path=/mnt/output/users/v-kunwei/code/fairseq/examples/speech_to_speech/mhubert_base_vp_en_es_fr_it3.pt 19 | tsv_dir=/home/v-kunwei 20 | 21 | feat_dir=${HOME}/$split 22 | python $FAIRSEQ_ROOT/examples/hubert/simple_kmeans/dump_hubert_feature.py ${tsv_dir} ${split} ${ckpt_path} 9 ${nshard} ${rank} ${feat_dir} || exit 1 23 | 24 | 25 | echo "-------------------------------------------------------------------------------------------" 26 | echo "---------------------------------- done ---------------------------------------------" 27 | echo "-------------------------------------------------------------------------------------------" 28 | 29 | km_path=/mnt/output/users/v-kunwei/code/fairseq/examples/speech_to_speech/mhubert_base_vp_en_es_fr_it3_L11_km1000.bin 30 | lab_dir=${HOME}/${split} 31 | python $FAIRSEQ_ROOT/examples/hubert/simple_kmeans/dump_km_label.py ${feat_dir} ${split} ${km_path} ${nshard} ${rank} ${lab_dir} 32 | 33 | 34 | # sas="?sv=2020-08-04&st=2022-01-02T04%3A58%3A15Z&se=2022-06-01T04%3A58%3A00Z&sr=c&sp=racwdl&sig=NyZKOHivgesEoZ8yvLsVT6aZMYQZMevLLmXNOTaWyvU%3D" 35 | # blob="https://msranlcmtteamdrive.blob.core.windows.net/teamdrive/v-ziqzhang/data/stbert/data/librispeech/libri_960/hubert_release_iter2_layer9_kmeans/${split}" 36 | # azcopy copy $feat_dir/${split}_${rank}_${nshard}.len "$blob/$sas" 37 | # azcopy copy $feat_dir/${split}_${rank}_${nshard}.npy "$blob/$sas" 38 | # azcopy copy $lab_dir "$blob/$sas" --recursive 39 | 40 | 41 | 42 | -------------------------------------------------------------------------------- /Speech2S/speech2s/stpretrain_scripts/data_process/merge_code.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import torch 3 | 4 | 5 | def main(): 6 | for line in sys.stdin: 7 | line = line.rstrip() 8 | codes = list(map(int, line.split())) 9 | merged_codes = torch.unique_consecutive(torch.tensor(codes)).numpy() 10 | merged_codes = map(str, merged_codes) 11 | print(" ".join(merged_codes)) 12 | 13 | if __name__ == "__main__": 14 | main() 15 | -------------------------------------------------------------------------------- /Speech2S/speech2s/stpretrain_scripts/data_process/txt2idx.sh: -------------------------------------------------------------------------------- 1 | [ $# -lt 3 ] && echo "Usage: $0 " && exit 0 2 | 3 | if [ ! -d ${HOME}/sentencepiece ]; then 4 | CURRENT_DIR=`pwd` 5 | cd ${HOME} 6 | git clone https://github.com/google/sentencepiece.git 7 | cd sentencepiece 8 | mkdir build && cd build 9 | cmake .. 
&& make -j 16 10 | sudo make install 11 | sudo ldconfig -v 12 | cd ${HOME} 13 | cd ${CURRENT_DIR} 14 | fi 15 | 16 | input=$1 17 | outdir=$2 18 | DICT=$3 19 | suffix=$4 20 | outname=${input##*/} 21 | outname=${outname%.txt*} 22 | [ -z $input ] && echo "You must specify a source file" && exit 1 23 | 24 | [ -z $DICT ] && echo "No dict was specified!" && exit 1 25 | [ -z $outdir ] && outdir=${input%/*} 26 | [ -z $outdir ] && outdir="." 27 | [ ! -d $outdir ] && mkdir -p $outdir 28 | 29 | echo "Dict : $DICT" 30 | echo "------------------------------- creating idx/bin--------------------------------------------" 31 | echo "$input --> $outdir/${outname}${suffix}.idx" 32 | fairseq-preprocess \ 33 | --only-source \ 34 | --trainpref $input \ 35 | --destdir $outdir \ 36 | --thresholdsrc 0 \ 37 | --srcdict ${DICT} \ 38 | --workers 40 39 | 40 | mv $outdir/train.idx $outdir/${outname}${suffix}.idx 41 | mv $outdir/train.bin $outdir/${outname}${suffix}.bin 42 | echo "----------------------------------- done --------------------------------------------" 43 | 44 | -------------------------------------------------------------------------------- /Speech2S/speech2s/stpretrain_scripts/data_process/txt2spm.sh: -------------------------------------------------------------------------------- 1 | [ $# -lt 2 ] && echo "Usage: $0 " && exit 0 2 | 3 | if [ ! -d ${HOME}/sentencepiece ]; then 4 | CURRENT_DIR=`pwd` 5 | cd ${HOME} 6 | git clone https://github.com/google/sentencepiece.git 7 | cd sentencepiece 8 | mkdir build && cd build 9 | cmake .. && make -j 16 10 | sudo make install 11 | sudo ldconfig -v 12 | cd ${HOME} 13 | cd ${CURRENT_DIR} 14 | fi 15 | 16 | input=$1 17 | outdir=$2 18 | MODEL=$3 19 | suffix=$4 20 | outname=${input##*/} 21 | outname=${outname%.wrd*} 22 | [ -z $input ] && echo "You must specify a source file" && exit 1 23 | 24 | [ -z $MODEL ] && MODEL=/mnt/default/v-ziqzhang/data/stbert/data/librispeech/hubert_release_iter2_layer9_kmeans/spm_unigram_10000.model && echo "No spm model was specified!, set default to $MODEL" 25 | [ -z $outdir ] && outdir=${input%/*} 26 | [ -z $outdir ] && outdir="." 27 | [ ! 
-d $outdir ] && mkdir -p $outdir 28 | 29 | echo "Output: $outdir/$outname.spm" 30 | 31 | echo "------------------------------- tokenize text...--------------------------------------------" 32 | spm_encode --model=$MODEL < ${input} > $outdir/$outname.spm || exit 1 33 | echo "----------------------------------- done --------------------------------------------" 34 | -------------------------------------------------------------------------------- /Speech2S/speech2s/stpretrain_scripts/data_process/wmt/normalize_en_text.py: -------------------------------------------------------------------------------- 1 | import re 2 | import sys 3 | import regex 4 | import argparse 5 | from tqdm import tqdm 6 | from num2words import num2words 7 | 8 | def writefile(filename, lines): 9 | with open(filename, 'w', encoding='utf-8') as f: 10 | f.writelines(lines) 11 | 12 | def main(): 13 | parser = argparse.ArgumentParser() 14 | parser.add_argument("--input", "-i", required=True, type=str) 15 | parser.add_argument("--output", "-o", required=True, type=str) 16 | args = parser.parse_args() 17 | outlines = [] 18 | 19 | with open(f"{args.input}", 'r') as f: 20 | inputs = f.readlines() 21 | 22 | for line in tqdm(inputs): 23 | line = line.strip().upper() 24 | line = re.sub(u"([^\u0041-\u005a\u0061-\u007a\u0030-\u0039\'])", " ", line) 25 | items = [] 26 | for item in line.split(): 27 | if item.isdigit(): 28 | try: 29 | item = num2words(item) 30 | except Exception as e: 31 | print(line) 32 | raise(e) 33 | items.append(item) 34 | line = " ".join(items) 35 | line = line.replace("-", " ") 36 | line = line.upper() 37 | line = line.replace("' S", "'S") 38 | line = line.replace(" ", "|") 39 | line = " ".join(line) + " |" 40 | outlines.append(line + '\n') 41 | # print(line) 42 | 43 | writefile(args.output, outlines) 44 | 45 | if __name__ == "__main__": 46 | main() 47 | -------------------------------------------------------------------------------- /Speech2S/speech2s/stpretrain_scripts/data_process/wmt/normalize_es_text.py: -------------------------------------------------------------------------------- 1 | import re 2 | import sys 3 | import regex 4 | import argparse 5 | import re,string 6 | from tqdm import tqdm 7 | from num2words import num2words 8 | 9 | def writefile(filename, lines): 10 | with open(filename, 'w', encoding='utf-8') as f: 11 | f.writelines(lines) 12 | 13 | def main(): 14 | parser = argparse.ArgumentParser() 15 | parser.add_argument("--input", "-i", required=True, type=str) 16 | parser.add_argument("--output", "-o", required=True, type=str) 17 | args = parser.parse_args() 18 | outlines = [] 19 | 20 | with open(f"{args.input}", 'r') as f: 21 | inputs = f.readlines() 22 | 23 | for line in tqdm(inputs): 24 | line = line.strip() 25 | line = re.sub(u"([^\u0041-\u005a\u0061-\u007a\u0030-\u0039\u00d1\u00f1\'])", " ", line) 26 | items = [] 27 | punc='~`!#$%^&*()_+-=|\';":/.,?><~.' 
28 | for item in line.split(): 29 | if item.isdigit(): 30 | try: 31 | item = num2words(item, lang='es') 32 | except Exception as e: 33 | print(line) 34 | raise(e) 35 | items.append(item) 36 | line = " ".join(items) 37 | line = (re.sub(r"[%s]+" %punc, "",line)) 38 | line = line.replace("-", " ") 39 | line = line.lower() 40 | line = line.replace("' S", "'S") 41 | line = line.replace(" ", "|") 42 | line = " ".join(line) + " |" 43 | outlines.append(line + '\n') 44 | # print(line) 45 | 46 | writefile(args.output, outlines) 47 | 48 | if __name__ == "__main__": 49 | main() 50 | -------------------------------------------------------------------------------- /Speech2S/speech2s/stpretrain_scripts/decode_text2code_beam2.sh: -------------------------------------------------------------------------------- 1 | 2 | ##################################### 3 | # Hubert ED model # 4 | ##################################### 5 | [ $# -lt 1 ] && echo "Usage: $0 " && exit 0 6 | #source /mnt/default/v-ziqzhang/.bashrc_sing 7 | 8 | model_path=$1 9 | gen_set=$2 10 | tgt=$3 11 | src="ltr" 12 | max_tokens=$4 13 | word_size=$5 14 | rank=$6 15 | outdir=$7 16 | 17 | [ -z $tgt ] && tgt="kmu" 18 | [ -z $gen_set ] && gen_set="dev_clean" 19 | [ -z $word_size ] && word_size=1 20 | [ -z $rank ] && rank=0 21 | [ -z $max_tokens ] && max_tokens=16000 22 | 23 | FAIRSEQ_ROOT=/mnt/output/users/v-kunwei/code/fairseq_mlstku 24 | DATA_DIR=/home/v-kunwei/ 25 | [ $gen_set == "test" ] && DATA_DIR=/mnt/output/users/v-kunwei/code/fairseq_mlstku 26 | [ -z $outdir ] && outdir=$DATA_DIR 27 | 28 | 29 | results_path=$outdir/pseudo_${gen_set}_${rank} 30 | [ ! -d $results_path ] && mkdir -p $results_path 31 | 32 | for subset in $gen_set; do 33 | python $FAIRSEQ_ROOT/fairseq_cli/generate_mt_label.py $DATA_DIR \ 34 | --path ${model_path} \ 35 | --task "translation_from_jst" \ 36 | --max-target-positions 18000 \ 37 | --gen-subset $subset \ 38 | -t $tgt -s "ltr" \ 39 | --dataset-impl "raw" \ 40 | --max-tokens ${max_tokens} \ 41 | --beam 2 \ 42 | --max-len-a 3 --max-len-b 100 \ 43 | --results-path $results_path \ 44 | --distributed-world-size $word_size --distributed-rank $rank \ 45 | 46 | echo "$model" > $results_path/model.record 47 | sleep 1s 48 | done | tee $results_path/decode.log 49 | 50 | sleep 2s 51 | -------------------------------------------------------------------------------- /Speech2S/speech2s/stpretrain_scripts/eval2.sh: -------------------------------------------------------------------------------- 1 | lmweight=0 2 | num_gpus=8 3 | python examples/speech_recognition/new/infer.py --config-dir /mnt/output/users/v-kunwei/code/fairseq/examples/speech_recognition/new/conf \ 4 | --config-name infer task=audio_finetuning task.data=/home/v-kunwei common.user_dir=/mnt/output/users/v-kunwei/code/fairseq/examples/data2vec \ 5 | task.labels=ltr decoding.type=viterbi \ 6 | decoding.lexicon=models/es_eval/espeak_dict.txt \ 7 | decoding.unique_wer_file=True \ 8 | dataset.gen_subset=test \ 9 | common_eval.path=/mnt/output/users/v-kunwei/code/fairseq/models/es_eval/espeak_26lang_m10.pt decoding.beam=1500 distributed_training.distributed_world_size=${num_gpus} \ 10 | decoding.results_path=/home/v-kunwei 11 | 12 | #sclite -h "/home/v-kunwei/hypo.units" -r "/home/v-kunwei/ref.units" -i rm -o all stdout > "./result.txt" 13 | -------------------------------------------------------------------------------- /Speech2S/speech2s/stpretrain_scripts/eval3.sh: -------------------------------------------------------------------------------- 1 | #$subset=test 2 | 
python examples/speech_recognition/infer.py /home/v-kunwei --task audio_finetuning \ 3 | --nbest 1 --path /mnt/output/users/v-kunwei/code/fairseq/models/es_eval/espeak_26lang_m10.pt --gen-subset test --results-path /home/v-kunwei --criterion ctc --labels ltr --max-tokens 4000000 \ 4 | --post-process letter 5 | -------------------------------------------------------------------------------- /Speech2S/speech2s/stpretrain_scripts/inference_ed.sh: -------------------------------------------------------------------------------- 1 | ##################################### 2 | # Hubert base model # 3 | ##################################### 4 | [ $# -lt 1 ] && echo "Usage: $0 " && exit 0 5 | 6 | model_path=$1 7 | src_dir=${model_path%/*} 8 | cpt=${model_path##*/} 9 | cpt=${cpt%.*} 10 | 11 | #beam_size=$2 12 | gen_set=$2 13 | #lang=$4 14 | [ -z $gen_set ] && gen_set="test_et" 15 | [ -z $beam_size ] && beam_size=2 16 | [ -z $lang ] && lang="fr" 17 | 18 | 19 | #DATA_DIR=/mnt/output/users/v-kunwei/data/s2s_data/fin_enes 20 | DATA_DIR=/home/v-kunwei 21 | FAIRSEQ_ROOT=/mnt/output/users/v-kunwei/code/fairseq_mlstku 22 | 23 | for subset in $gen_set; do 24 | results_path=$src_dir/decode_${cpt}_beam${beam_size}/${subset} 25 | [ ! -d $results_path ] && mkdir -p $results_path 26 | 27 | python $FAIRSEQ_ROOT/fairseq_cli/generate.py \ 28 | $DATA_DIR --label-dir ${DATA_DIR} \ 29 | --labels '["spm"]' --gen-subset ${subset} \ 30 | --max-tokens 9000000 --task hubert_pretraining \ 31 | --add-decoder --fine-tuning --random-crop \ 32 | --path ${model_path} --results-path /home/v-kunwei --scoring sacrebleu \ 33 | --max-len-a 0 --max-len-b 900 \ 34 | --beam 10 --single-target 35 | 36 | tail -n 1 /home/v-kunwei/generate-*.txt 37 | sleep 1s 38 | done 39 | -------------------------------------------------------------------------------- /Speech2S/speech2s/stpretrain_scripts/train_text2code/base_ReleaseIter2_text2unicode_from400k.sh: -------------------------------------------------------------------------------- 1 | ##################################### 2 | # Hubert mt model # 3 | ##################################### 4 | [ $# -gt 3 ] && echo "Usage: $0 " && exit 0 5 | world_size=$1 6 | update_freq=$2 7 | w2v_path=$3 8 | Mount="" 9 | 10 | [ -z $world_size ] && world_size=8 11 | [ -z $update_freq ] && update_freq=1 12 | [ -z $w2v_path ] && w2v_path="/mnt/output/users/v-kunwei/data/s2s_data/model_wo_emb_32_1004.pt" 13 | 14 | 15 | langs="ltr,kmu" 16 | FAIRSEQ_ROOT=/mnt/output/users/v-kunwei/code/fairseq_mlstku 17 | CONFIG_ROOT=/mnt/output/users/v-kunwei/code/stpretrain_scripts/config/translation 18 | DATA_DIR=/mnt/output/users/v-kunwei/data/s2s_data/en_asr_data/ 19 | 20 | ### set save-dir 21 | MODEL_DIR="/mnt/output/users/v-kunwei/data/s2s_data/exp/text2unicode_en" 22 | exp_name="base_pt400k_releaseiter2_${world_size}gpu_${update_freq}accum_lr1e-4_alll" 23 | MODEL_DIR=$MODEL_DIR/$exp_name 24 | [ -d $MODEL_DIR ] || mkdir -p $MODEL_DIR 25 | 26 | 27 | python $FAIRSEQ_ROOT/fairseq_cli/hydra_train.py \ 28 | --config-dir $CONFIG_ROOT \ 29 | --config-name text2code \ 30 | +task.data=$DATA_DIR \ 31 | dataset.dataset_impl="raw" \ 32 | +task.source_lang="ltr" +task.target_lang="kmu" \ 33 | +task.normalize=false \ 34 | \ 35 | +criterion.label_smoothing=0.1 \ 36 | +criterion.report_accuracy=true \ 37 | optimizer.weight_decay=0.00001 \ 38 | +lr_scheduler.lr="[0.0001]" \ 39 | optimization.max_update=500000 \ 40 | \ 41 | +model.dropout=0.1 \ 42 | +model.attention_dropout=0.1 \ 43 | model.activation_dropout=0.1 \ 44 | model.decoder_layerdrop=0 \ 
45 | model.layerdrop=0 \ 46 | model.w2v_path=$w2v_path \ 47 | +model.text_transformer_encoder_layers=6 \ 48 | \ 49 | dataset.train_subset="en_train" \ 50 | dataset.valid_subset="en_dev" \ 51 | optimization.update_freq=[${update_freq}] \ 52 | optimization.clip_norm=5 \ 53 | \ 54 | common.seed=222 \ 55 | common.log_interval=100 \ 56 | common.log_format="json" \ 57 | \ 58 | distributed_training.distributed_world_size=${world_size} \ 59 | distributed_training.nprocs_per_node=8 \ 60 | distributed_training.ddp_backend="legacy_ddp" \ 61 | \ 62 | common.tensorboard_logdir=$MODEL_DIR \ 63 | checkpoint.save_dir=$MODEL_DIR \ 64 | hydra.run.dir=$MODEL_DIR \ 65 | hydra.job.name=${exp_name} \ 66 | 67 | sleep 10s 68 | # sleep infinity 69 | 70 | 71 | -------------------------------------------------------------------------------- /Speech2S/speech2s/stpretrain_scripts/train_text2code/base_ReleaseIter2_text2unicode_from400k_es.sh: -------------------------------------------------------------------------------- 1 | ##################################### 2 | # Hubert mt model # 3 | ##################################### 4 | [ $# -gt 3 ] && echo "Usage: $0 " && exit 0 5 | world_size=$1 6 | update_freq=$2 7 | w2v_path=$3 8 | Mount="" 9 | 10 | [ -z $world_size ] && world_size=8 11 | [ -z $update_freq ] && update_freq=1 12 | [ -z $w2v_path ] && w2v_path="/mnt/output/users/v-kunwei/data/s2s_data/model_es_emb_90_1004.pt" 13 | 14 | 15 | langs="ltr,kmu" 16 | FAIRSEQ_ROOT=/mnt/output/users/v-kunwei/code/fairseq_mlstku 17 | CONFIG_ROOT=/mnt/output/users/v-kunwei/code/stpretrain_scripts/config/translation 18 | DATA_DIR=/mnt/output/users/v-kunwei/data/s2s_data/es_no_data/ 19 | 20 | ### set save-dir 21 | MODEL_DIR="/mnt/output/users/v-kunwei/data/s2s_data/exp/text2unicode_es" 22 | exp_name="base_pt400k_releaseiter2_${world_size}gpu_${update_freq}accum_lr1e-4_no" 23 | MODEL_DIR=$MODEL_DIR/$exp_name 24 | [ -d $MODEL_DIR ] || mkdir -p $MODEL_DIR 25 | 26 | 27 | python $FAIRSEQ_ROOT/fairseq_cli/hydra_train.py \ 28 | --config-dir $CONFIG_ROOT \ 29 | --config-name text2code \ 30 | +task.data=$DATA_DIR \ 31 | dataset.dataset_impl="raw" \ 32 | +task.source_lang="ltr" +task.target_lang="kmu" \ 33 | +task.normalize=false \ 34 | \ 35 | +criterion.label_smoothing=0.1 \ 36 | +criterion.report_accuracy=true \ 37 | optimizer.weight_decay=0.00001 \ 38 | +lr_scheduler.lr="[0.0001]" \ 39 | optimization.max_update=500000 \ 40 | \ 41 | +model.dropout=0.1 \ 42 | +model.attention_dropout=0.1 \ 43 | model.activation_dropout=0.1 \ 44 | model.decoder_layerdrop=0 \ 45 | model.layerdrop=0 \ 46 | model.w2v_path=$w2v_path \ 47 | +model.text_transformer_encoder_layers=6 \ 48 | \ 49 | dataset.train_subset="es_train" \ 50 | dataset.valid_subset="es_dev" \ 51 | optimization.update_freq=[${update_freq}] \ 52 | optimization.clip_norm=5 \ 53 | \ 54 | common.seed=222 \ 55 | common.log_interval=100 \ 56 | common.log_format="json" \ 57 | \ 58 | distributed_training.distributed_world_size=${world_size} \ 59 | distributed_training.nprocs_per_node=8 \ 60 | distributed_training.ddp_backend="legacy_ddp" \ 61 | \ 62 | common.tensorboard_logdir=$MODEL_DIR \ 63 | checkpoint.save_dir=$MODEL_DIR \ 64 | hydra.run.dir=$MODEL_DIR \ 65 | hydra.job.name=${exp_name} \ 66 | 67 | sleep 10s 68 | # sleep infinity 69 | 70 | 71 | -------------------------------------------------------------------------------- /Speech2S/speech2s/stpretrain_scripts/train_text2code/base_ReleaseIter2_text2unicode_from400k_es2.sh: 
-------------------------------------------------------------------------------- 1 | ##################################### 2 | # Hubert mt model # 3 | ##################################### 4 | [ $# -gt 3 ] && echo "Usage: $0 " && exit 0 5 | world_size=$1 6 | update_freq=$2 7 | w2v_path=$3 8 | Mount="" 9 | 10 | [ -z $world_size ] && world_size=8 11 | [ -z $update_freq ] && update_freq=1 12 | [ -z $w2v_path ] && w2v_path="/mnt/output/users/v-kunwei/data/s2s_data/model_es_emb_81_1004.pt" 13 | 14 | 15 | langs="ltr,kmu" 16 | FAIRSEQ_ROOT=/mnt/output/users/v-kunwei/code/fairseq_mlstku 17 | CONFIG_ROOT=/mnt/output/users/v-kunwei/code/stpretrain_scripts/config/translation 18 | DATA_DIR=/mnt/output/users/v-kunwei/data/s2s_data/es_asrl_data/ 19 | 20 | ### set save-dir 21 | MODEL_DIR="/mnt/output/users/v-kunwei/data/s2s_data/exp/text2unicode_es" 22 | exp_name="base_pt400k_releaseiter2_${world_size}gpu_${update_freq}accum_lr1e-4_ll" 23 | MODEL_DIR=$MODEL_DIR/$exp_name 24 | [ -d $MODEL_DIR ] || mkdir -p $MODEL_DIR 25 | 26 | 27 | python $FAIRSEQ_ROOT/fairseq_cli/hydra_train.py \ 28 | --config-dir $CONFIG_ROOT \ 29 | --config-name text2code \ 30 | +task.data=$DATA_DIR \ 31 | dataset.dataset_impl="raw" \ 32 | +task.source_lang="ltr" +task.target_lang="kmu" \ 33 | +task.normalize=false \ 34 | \ 35 | +criterion.label_smoothing=0.1 \ 36 | +criterion.report_accuracy=true \ 37 | optimizer.weight_decay=0.00001 \ 38 | +lr_scheduler.lr="[0.0001]" \ 39 | optimization.max_update=500000 \ 40 | \ 41 | +model.dropout=0.1 \ 42 | +model.attention_dropout=0.1 \ 43 | model.activation_dropout=0.1 \ 44 | model.decoder_layerdrop=0 \ 45 | model.layerdrop=0 \ 46 | model.w2v_path=$w2v_path \ 47 | +model.text_transformer_encoder_layers=6 \ 48 | \ 49 | dataset.train_subset="es_train" \ 50 | dataset.valid_subset="es_dev" \ 51 | optimization.update_freq=[${update_freq}] \ 52 | optimization.clip_norm=5 \ 53 | \ 54 | common.seed=222 \ 55 | common.log_interval=100 \ 56 | common.log_format="json" \ 57 | \ 58 | distributed_training.distributed_world_size=${world_size} \ 59 | distributed_training.nprocs_per_node=8 \ 60 | distributed_training.ddp_backend="legacy_ddp" \ 61 | \ 62 | common.tensorboard_logdir=$MODEL_DIR \ 63 | checkpoint.save_dir=$MODEL_DIR \ 64 | hydra.run.dir=$MODEL_DIR \ 65 | hydra.job.name=${exp_name} \ 66 | 67 | sleep 10s 68 | # sleep infinity 69 | 70 | 71 | -------------------------------------------------------------------------------- /Speech2S/speech2s/stpretrain_scripts/train_text2code/decode_text2code.sh: -------------------------------------------------------------------------------- 1 | 2 | ##################################### 3 | # Hubert ED model # 4 | ##################################### 5 | [ $# -lt 1 ] && echo "Usage: $0 " && exit 0 6 | #source /mnt/default/v-ziqzhang/.bashrc_sing 7 | 8 | model_path=$1 9 | gen_set=$2 10 | tgt=$3 11 | src="ltr" 12 | max_tokens=$4 13 | word_size=$5 14 | rank=$6 15 | outdir=$7 16 | 17 | [ -z $tgt ] && tgt="kmu" 18 | [ -z $gen_set ] && gen_set="dev_clean" 19 | [ -z $word_size ] && word_size=1 20 | [ -z $rank ] && rank=0 21 | [ -z $max_tokens ] && max_tokens=2000 22 | 23 | FAIRSEQ_ROOT=/mnt/output/users/v-kunwei/code/fairseq_mlst 24 | DATA_DIR=${gen_set%/*} 25 | gen_set=${gen_set##*/} 26 | [ $gen_set == "test" ] && DATA_DIR=/mnt/output/users/v-kunwei/data/s2s_data/en_asr_data 27 | [ -z $outdir ] && outdir=$DATA_DIR 28 | 29 | 30 | results_path=$outdir/pseudo_${gen_set}_${rank} 31 | [ ! 
-d $results_path ] && mkdir -p $results_path 32 | 33 | for subset in $gen_set; do 34 | python $FAIRSEQ_ROOT/fairseq_cli/generate_mt_label.py $DATA_DIR \ 35 | --path ${model_path} \ 36 | --task "translation_from_jst" \ 37 | --max-target-positions 3000 \ 38 | --gen-subset $subset \ 39 | -t $tgt -s "ltr" \ 40 | --max-tokens ${max_tokens} \ 41 | --dataset-impl "raw" \ 42 | --max-len-a 2 --max-len-b 100 \ 43 | --results-path $results_path \ 44 | --skip-invalid-size-inputs-valid-test \ 45 | --distributed-world-size $word_size --distributed-rank $rank \ 46 | 47 | echo "$model" > $results_path/model.record 48 | sleep 1s 49 | done | tee $results_path/decode.log 50 | 51 | sleep 2s 52 | -------------------------------------------------------------------------------- /Speech2S/speech2s/stpretrain_scripts/train_text2code/decode_text2code_beam2.sh: -------------------------------------------------------------------------------- 1 | 2 | ##################################### 3 | # Hubert ED model # 4 | ##################################### 5 | [ $# -lt 1 ] && echo "Usage: $0 " && exit 0 6 | #source /mnt/default/v-ziqzhang/.bashrc_sing 7 | 8 | model_path=$1 9 | gen_set=$2 10 | tgt=$3 11 | src="ltr" 12 | max_tokens=$4 13 | word_size=$5 14 | rank=$6 15 | outdir=$7 16 | 17 | [ -z $tgt ] && tgt="kmu" 18 | [ -z $gen_set ] && gen_set="dev_clean" 19 | [ -z $word_size ] && word_size=1 20 | [ -z $rank ] && rank=0 21 | [ -z $max_tokens ] && max_tokens=2000 22 | 23 | FAIRSEQ_ROOT=/mnt/output/users/v-kunwei/code/fairseq_mlstku 24 | DATA_DIR=${gen_set%/*} 25 | gen_set=${gen_set##*/} 26 | [ $gen_set == "test" ] && DATA_DIR=/mnt/output/users/v-kunwei/code/fairseq_mlstku 27 | [ -z $outdir ] && outdir=$DATA_DIR 28 | 29 | 30 | results_path=$outdir/pseudo_${gen_set}_${rank} 31 | [ ! -d $results_path ] && mkdir -p $results_path 32 | 33 | for subset in $gen_set; do 34 | python $FAIRSEQ_ROOT/fairseq_cli/generate_mt_label.py $DATA_DIR \ 35 | --path ${model_path} \ 36 | --task "translation_from_jst" \ 37 | --max-target-positions 3000 \ 38 | --gen-subset $subset \ 39 | -t $tgt -s "ltr" \ 40 | --dataset-impl "raw" \ 41 | --max-tokens ${max_tokens} \ 42 | --beam 2 \ 43 | --max-len-a 2 --max-len-b 100 \ 44 | --results-path $results_path \ 45 | --skip-invalid-size-inputs-valid-test \ 46 | --distributed-world-size $word_size --distributed-rank $rank \ 47 | 48 | echo "$model" > $results_path/model.record 49 | sleep 1s 50 | done | tee $results_path/decode.log 51 | 52 | sleep 2s 53 | -------------------------------------------------------------------------------- /Speech2S/speech2s/stpretrain_scripts/train_text2code/inference_code_bleu.sh: -------------------------------------------------------------------------------- 1 | 2 | ##################################### 3 | # Hubert ED model # 4 | ##################################### 5 | [ $# -lt 1 ] && echo "Usage: $0 " && exit 0 6 | 7 | model_path=$1 8 | src_dir=${model_path%/*} 9 | cpt=${model_path##*/} 10 | cpt=${cpt%.*} 11 | 12 | gen_set=$2 13 | tgt=$3 14 | outdir=$4 15 | src="ltr" 16 | [ -z $tgt ] && tgt="kmu" 17 | [ -z $gen_set ] && gen_set="es_dev" 18 | [ -z $outdir ] && outdir=$src_dir/decode_${cpt} 19 | 20 | DATA_DIR=/mnt/output/users/v-kunwei/data/s2s_data/es_asr_data/ 21 | # DATA_DIR=/mnt/default/v-ziqzhang/data/stbert/data/librispeech/speech2c_joint_splitenc_400k/ltr-$tgt 22 | # DATA_DIR=/mnt/default/v-ziqzhang/data/stbert/data/librispeech/speech2c_400k/ltr-$tgt 23 | FAIRSEQ_ROOT=/mnt/output/users/v-kunwei/code/fairseq_mlst 24 | 25 | langs="ltr,$tgt" 26 | 27 | for subset in 
$gen_set; do 28 | results_path=$outdir/${subset} 29 | [ ! -d $results_path ] && mkdir -p $results_path 30 | 31 | python $FAIRSEQ_ROOT/fairseq_cli/generate.py $DATA_DIR \ 32 | --path ${model_path} \ 33 | --task "translation_from_jst" \ 34 | --max-target-positions 3000 \ 35 | --gen-subset $subset \ 36 | -t $tgt -s "ltr" --dataset-impl "raw" \ 37 | --batch-size 16 \ 38 | --max-len-a 2 --max-len-b 400 \ 39 | --results-path $results_path \ 40 | --scoring sacrebleu $extra 41 | 42 | echo $results_path 43 | tail -n 1 $results_path/generate-*.txt 44 | sleep 1s 45 | done 46 | 47 | # --distributed-world-size 1000 --distributed-rank 0 \ 48 | 49 | sleep 2s 50 | 51 | # cat generate-newstest2020_enja.txt | grep "^D-" | cut -d'-' -f 2- | sort -n -k1 | cut -f3 > decode-newstest2020_enja.txt 52 | # sacrebleu -t wmt20 -l en-ja -i decode-newstest2020_enja.txt --tokenize char 53 | -------------------------------------------------------------------------------- /Speech2S/speech2s/stpretrain_scripts/train_text2code/inference_code_wer.sh: -------------------------------------------------------------------------------- 1 | 2 | ##################################### 3 | # Hubert ED model # 4 | ##################################### 5 | [ $# -lt 1 ] && echo "Usage: $0 " && exit 0 6 | 7 | model_path=$1 8 | src_dir=${model_path%/*} 9 | cpt=${model_path##*/} 10 | cpt=${cpt%.*} 11 | 12 | gen_set=$2 13 | tgt=$3 14 | outdir=$4 15 | src="ltr" 16 | [ -z $tgt ] && tgt="kmu" 17 | [ -z $gen_set ] && gen_set="en_dev" 18 | [ -z $outdir ] && outdir=$src_dir/decode_${cpt} 19 | 20 | # DATA_DIR=/mnt/default/v-ziqzhang/data/stbert/data/librispeech/hubert_release_iter2_layer9_kmeans/ltr-$tgt 21 | # DATA_DIR=/mnt/default/v-ziqzhang/data/stbert/data/librispeech/speech2c_joint_splitenc_400k/ltr-$tgt 22 | #DATA_DIR=/mnt/default/v-ziqzhang/data/stbert/data/librispeech/speech2c_400k/ltr-$tgt 23 | DATA_DIR=/mnt/output/users/v-kunwei/data/s2s_data/es_asr_data/ 24 | FAIRSEQ_ROOT=/mnt/output/users/v-kunwei/code/fairseq_mlst 25 | 26 | langs="ltr,$tgt" 27 | 28 | for subset in $gen_set; do 29 | results_path=$outdir/${subset} 30 | [ ! 
-d $results_path ] && mkdir -p $results_path 31 | 32 | python $FAIRSEQ_ROOT/fairseq_cli/generate.py $DATA_DIR \ 33 | --path ${model_path} \ 34 | --task "translation_from_jst" \ 35 | --max-target-positions 3000 \ 36 | --gen-subset $subset \ 37 | -t $tgt -s "ltr" --dataset-impl "raw" \ 38 | --batch-size 16 \ 39 | --max-len-a 2 --max-len-b 400 \ 40 | --results-path $results_path \ 41 | --scoring wer 42 | 43 | echo $results_path 44 | tail -n 1 $results_path/generate-*.txt 45 | sleep 1s 46 | done 47 | 48 | # --distributed-world-size 1000 --distributed-rank 0 \ 49 | 50 | sleep 2s 51 | 52 | # cat generate-newstest2020_enja.txt | grep "^D-" | cut -d'-' -f 2- | sort -n -k1 | cut -f3 > decode-newstest2020_enja.txt 53 | # sacrebleu -t wmt20 -l en-ja -i decode-newstest2020_enja.txt --tokenize char 54 | -------------------------------------------------------------------------------- /SpeechLM/dataset/CommonVoice/v4/en/en-de/config_base_ende.yaml: -------------------------------------------------------------------------------- 1 | bpe_tokenizer: 2 | bpe: sentencepiece 3 | sentencepiece_model: spm_char_st_en_de.model 4 | 5 | shuffle: false 6 | use_audio_input: true 7 | use_sample_rate: 16000 8 | standardize_audio: false 9 | vocab_filename: spm_char_st_en_de.txt 10 | 11 | # required by speech_to_text task but never used 12 | input_channels: 1 13 | input_feat_per_channel: 1 14 | 15 | -------------------------------------------------------------------------------- /SpeechLM/dataset/CommonVoice/v4/en/en-de/config_large_ende.yaml: -------------------------------------------------------------------------------- 1 | bpe_tokenizer: 2 | bpe: sentencepiece 3 | sentencepiece_model: spm_char_st_en_de.model 4 | 5 | shuffle: false 6 | use_audio_input: true 7 | use_sample_rate: 16000 8 | standardize_audio: true 9 | vocab_filename: spm_char_st_en_de.txt 10 | 11 | # required by speech_to_text task but never used 12 | input_channels: 1 13 | input_feat_per_channel: 1 14 | 15 | -------------------------------------------------------------------------------- /SpeechLM/dataset/CommonVoice/v4/en/en-de/spm_char_st_en_de.model: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/SpeechT5/5d66cf5f37e97f4a1999ad519537decc16d852af/SpeechLM/dataset/CommonVoice/v4/en/en-de/spm_char_st_en_de.model -------------------------------------------------------------------------------- /SpeechLM/dataset/CommonVoice/v4/en/en-de/spm_char_st_en_de.txt: -------------------------------------------------------------------------------- 1 | ▁ 1 2 | e 1 3 | n 1 4 | i 1 5 | r 1 6 | t 1 7 | s 1 8 | a 1 9 | d 1 10 | h 1 11 | u 1 12 | l 1 13 | o 1 14 | c 1 15 | g 1 16 | m 1 17 | . 1 18 | b 1 19 | f 1 20 | w 1 21 | k 1 22 | z 1 23 | S 1 24 | v 1 25 | p 1 26 | , 1 27 | D 1 28 | ü 1 29 | E 1 30 | ä 1 31 | A 1 32 | B 1 33 | M 1 34 | G 1 35 | " 1 36 | F 1 37 | K 1 38 | P 1 39 | W 1 40 | T 1 41 | y 1 42 | H 1 43 | ö 1 44 | I 1 45 | R 1 46 | L 1 47 | - 1 48 | C 1 49 | V 1 50 | N 1 51 | ß 1 52 | Z 1 53 | J 1 54 | U 1 55 | j 1 56 | O 1 57 | x 1 58 | ? 1 59 | ! 
1 60 | ' 1 61 | q 1 62 | Y 1 63 | Ü 1 64 | : 1 65 | Q 1 66 | Ä 1 67 | Ö 1 68 | ; 1 69 | ( 1 70 | ) 1 71 | X 1 72 | 0 1 73 | 1 1 74 | [ 1 75 | ] 1 76 | é 1 77 | 2 1 78 | & 1 79 | 3 1 80 | 5 1 81 | 4 1 82 | 7 1 83 | 9 1 84 | 8 1 85 | 6 1 86 | / 1 87 | á 1 88 | ō 1 89 | ó 1 90 | ñ 1 91 | ú 1 92 | í 1 93 | ā 1 94 | è 1 95 | * 1 96 | ć 1 97 | à 1 98 | ê 1 99 | ë 1 100 | ¡ 1 101 | ç 1 102 | ð 1 103 | ã 1 104 | č 1 105 | ū 1 106 | % 1 107 | É 1 108 | â 1 109 | ø 1 110 | š 1 111 | å 1 112 | ô 1 113 | ł 1 114 | œ 1 115 | ş 1 116 | Š 1 117 | _ 1 118 | Î 1 119 | Ó 1 120 | æ 1 121 | ï 1 122 | ă 1 123 | ě 1 124 | ī 1 125 | ı 1 126 | ʻ 1 127 | ʿ 1 128 | π 1 129 | и 1 130 | к 1 131 | = 1 132 | à 1 133 | Ø 1 134 | î 1 135 | û 1 136 | þ 1 137 | ċ 1 138 | Č 1 139 | ę 1 140 | ğ 1 141 | ń 1 142 | Ō 1 143 | ő 1 144 | ř 1 145 | ž 1 146 | ǎ 1 147 | α 1 148 | В 1 149 | е 1 150 | з 1 151 | й 1 152 | л 1 153 | н 1 154 | ь 1 155 | я 1 156 | ṃ 1 157 | ạ 1 158 | ụ 1 159 | → 1 160 | ≡ 1 161 | 京 1 162 | 大 1 163 | 都 1 164 | 阪 1 165 | -------------------------------------------------------------------------------- /SpeechLM/dataset/LibriLM/hidden_unit/bin-idx/config.yaml: -------------------------------------------------------------------------------- 1 | vocab_filename: dict.ltr.txt 2 | src_vocab_filename: dict.km.txt 3 | 4 | -------------------------------------------------------------------------------- /SpeechLM/dataset/LibriLM/hidden_unit/bin-idx/dict.ltr.txt: -------------------------------------------------------------------------------- 1 | ../../phone_unit/bin-idx/dict.ltr.txt -------------------------------------------------------------------------------- /SpeechLM/dataset/LibriLM/phone_unit/bin-idx/config.yaml: -------------------------------------------------------------------------------- 1 | vocab_filename: dict.ltr.txt 2 | src_vocab_filename: dict.phn.txt 3 | 4 | -------------------------------------------------------------------------------- /SpeechLM/dataset/LibriLM/phone_unit/bin-idx/dict.ltr.txt: -------------------------------------------------------------------------------- 1 | | 803288730 2 | E 439294199 3 | T 319071758 4 | A 277306732 5 | O 263784364 6 | N 239361162 7 | I 237353011 8 | H 223346762 9 | S 220175453 10 | R 203352500 11 | D 152198685 12 | L 141597450 13 | U 98913389 14 | M 87138757 15 | C 84680142 16 | W 81375101 17 | F 80240665 18 | G 70642902 19 | Y 68388038 20 | P 58436929 21 | B 52538531 22 | V 33250231 23 | K 26906609 24 | ' 9162896 25 | X 5075632 26 | J 4746771 27 | Q 3401794 28 | Z 2186971 29 | 1 30 | -------------------------------------------------------------------------------- /SpeechLM/dataset/LibriSpeech/asr/dict.ltr.txt: -------------------------------------------------------------------------------- 1 | ../../LibriLM/phone_unit/bin-idx/dict.ltr.txt -------------------------------------------------------------------------------- /SpeechLM/dataset/LibriSpeech/fast_phone2unit/config.yaml: -------------------------------------------------------------------------------- 1 | audio_root: /home/v-ziqzhang/dataset/librispeech_phone2unit 2 | features: 3 | energy_max: 5.733445167541504 4 | energy_min: 1.0e-08 5 | eps: 1.0e-05 6 | hop_length: 256 7 | pitch_max: 6.608609099713706 8 | pitch_min: 1.0e-08 9 | sample_rate: 16000 10 | sample_rate: 16000 11 | vocab_filename: dict.km.txt 12 | src_vocab_filename: dict.phn.txt 13 | 14 | -------------------------------------------------------------------------------- /SpeechLM/dataset/LibriSpeech/fast_phone2unit/config_generate.yaml: 
-------------------------------------------------------------------------------- 1 | audio_root: /home/v-ziqzhang/dataset/librispeech_phone2unit 2 | features: 3 | energy_max: 5.733445167541504 4 | energy_min: 1.0e-08 5 | eps: 1.0e-05 6 | hop_length: 256 7 | pitch_max: 6.608609099713706 8 | pitch_min: 1.0e-08 9 | sample_rate: 16000 10 | sample_rate: 16000 11 | vocab_filename: dict.km.txt 12 | src_vocab_filename: dict.PHN.txt 13 | 14 | -------------------------------------------------------------------------------- /SpeechLM/dataset/LibriSpeech/fast_phone2unit/dict.PHN.txt: -------------------------------------------------------------------------------- 1 | | 0 2 | 1 3 | ' 2 4 | AA 3 5 | AE 4 6 | AH 5 7 | AO 6 8 | AW 7 9 | AY 8 10 | B 9 11 | CH 10 12 | D 11 13 | DH 12 14 | EH 13 15 | ER 14 16 | EY 15 17 | F 16 18 | G 17 19 | HH 18 20 | IH 19 21 | IY 20 22 | JH 21 23 | K 22 24 | L 23 25 | M 24 26 | N 25 27 | NG 26 28 | OW 27 29 | OY 28 30 | P 29 31 | R 30 32 | S 31 33 | SH 32 34 | T 33 35 | TH 34 36 | UH 35 37 | UW 36 38 | V 37 39 | W 38 40 | Y 39 41 | Z 40 42 | ZH 41 43 | -------------------------------------------------------------------------------- /SpeechLM/dataset/LibriSpeech/fast_phone2unit/dict.phn.txt: -------------------------------------------------------------------------------- 1 | | 0 2 | 1 3 | ' 2 4 | AA 3 5 | AE 4 6 | AH 5 7 | AO 6 8 | AW 7 9 | AY 8 10 | B 9 11 | CH 10 12 | D 11 13 | DH 12 14 | EH 13 15 | ER 14 16 | EY 15 17 | F 16 18 | G 17 19 | HH 18 20 | IH 19 21 | IY 20 22 | JH 21 23 | K 22 24 | L 23 25 | M 24 26 | N 25 27 | NG 26 28 | OW 27 29 | OY 28 30 | P 29 31 | R 30 32 | S 31 33 | SH 32 34 | T 33 35 | TH 34 36 | UH 35 37 | UW 36 38 | V 37 39 | W 38 40 | Y 39 41 | Z 40 42 | ZH 41 43 | -------------------------------------------------------------------------------- /SpeechLM/speechlm/__init__.py: -------------------------------------------------------------------------------- 1 | from . import data, tasks, criterions, models 2 | -------------------------------------------------------------------------------- /SpeechLM/speechlm/config/decode/infer_fsqlm.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | 3 | defaults: 4 | - model: null 5 | 6 | hydra: 7 | run: 8 | dir: ${common_eval.results_path}/beam${decoding.beam}_th${decoding.beamthreshold}_lmw${decoding.lmweight}_wrd${decoding.wordscore}_sil${decoding.silweight} 9 | sweep: 10 | dir: ${common_eval.results_path} 11 | subdir: beam${decoding.beam}_th${decoding.beamthreshold}_lmw${decoding.lmweight}_wrd${decoding.wordscore}_sil${decoding.silweight} 12 | 13 | task: 14 | _name: joint_sc2t_pretraining 15 | data: ??? 16 | label_dir: ??? 17 | labels: ["ltr"] 18 | store_labels: true 19 | single_target: true 20 | fine_tuning: true 21 | normalize: ??? # must be consistent with pre-training 22 | add_decoder_target: false 23 | pad_audio: false 24 | random_crop: true 25 | hubert_tokenizer: "none" 26 | sp_path: None 27 | 28 | decoding: 29 | type: fairseqlm 30 | lexicon: ??? 31 | lmpath: ??? 32 | beamthreshold: 25 33 | beam: 500 34 | lmweight: 2 35 | wordscore: -1 36 | silweight: 0 37 | unique_wer_file: true 38 | common_eval: 39 | results_path: ??? 40 | path: ??? 41 | post_process: letter 42 | dataset: 43 | max_tokens: 1100000 44 | gen_subset: ??? 
45 | -------------------------------------------------------------------------------- /SpeechLM/speechlm/config/decode/infer_kenlm.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | 3 | defaults: 4 | - model: null 5 | 6 | hydra: 7 | run: 8 | dir: ${common_eval.results_path}/beam${decoding.beam}_th${decoding.beamthreshold}_lmw${decoding.lmweight}_wrd${decoding.wordscore}_sil${decoding.silweight} 9 | sweep: 10 | dir: ${common_eval.results_path} 11 | subdir: beam${decoding.beam}_th${decoding.beamthreshold}_lmw${decoding.lmweight}_wrd${decoding.wordscore}_sil${decoding.silweight} 12 | 13 | task: 14 | _name: joint_sc2t_pretraining 15 | data: ??? 16 | label_dir: ??? 17 | labels: ["ltr"] 18 | store_labels: true 19 | single_target: true 20 | fine_tuning: true 21 | normalize: ??? # must be consistent with pre-training 22 | add_decoder_target: false 23 | pad_audio: false 24 | random_crop: true 25 | hubert_tokenizer: "none" 26 | sp_path: None 27 | 28 | decoding: 29 | type: kenlm 30 | lexicon: ??? 31 | lmpath: ??? 32 | beamthreshold: 100 33 | beam: 500 34 | lmweight: 2 35 | wordscore: -1 36 | silweight: 0 37 | unique_wer_file: true 38 | common_eval: 39 | results_path: ??? 40 | path: ??? 41 | post_process: letter 42 | dataset: 43 | max_tokens: 1100000 44 | gen_subset: ??? 45 | -------------------------------------------------------------------------------- /SpeechLM/speechlm/config/decode/infer_viterbi.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | 3 | defaults: 4 | - model: null 5 | 6 | hydra: 7 | run: 8 | dir: ${common_eval.results_path}/viterbi 9 | sweep: 10 | dir: ${common_eval.results_path} 11 | subdir: viterbi 12 | 13 | task: 14 | _name: joint_sc2t_pretraining 15 | data: ??? 16 | label_dir: ??? 17 | labels: ["ltr"] 18 | store_labels: true 19 | single_target: true 20 | fine_tuning: true 21 | normalize: ??? # must be consistent with pre-training 22 | add_decoder_target: false 23 | pad_audio: false 24 | random_crop: true 25 | hubert_tokenizer: "none" 26 | sp_path: None 27 | 28 | decoding: 29 | type: viterbi 30 | unique_wer_file: true 31 | common_eval: 32 | results_path: ??? 33 | path: ??? 34 | post_process: letter 35 | dataset: 36 | batch_size: 1 37 | gen_subset: ??? 38 | -------------------------------------------------------------------------------- /SpeechLM/speechlm/config/finetune/speechlm_base_100h.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | 3 | common: 4 | fp16: true 5 | log_format: json 6 | log_interval: 200 7 | tensorboard_logdir: tblog 8 | seed: 1337 9 | 10 | checkpoint: 11 | save_interval: 1 12 | keep_last_epochs: 1 13 | keep_best_checkpoints: -1 14 | best_checkpoint_metric: wer 15 | restore_file: checkpoint_last.pt 16 | 17 | distributed_training: 18 | ddp_backend: legacy_ddp 19 | find_unused_parameters: true 20 | distributed_world_size: 1 21 | distributed_port: -1 22 | nprocs_per_node: 8 23 | 24 | task: 25 | _name: joint_sc2t_pretraining 26 | data: ??? 27 | fine_tuning: true 28 | label_dir: ??? 
29 | normalize: false # must be consistent with pre-training 30 | labels: ["ltr"] 31 | store_labels: true 32 | single_target: true 33 | add_decoder_target: false 34 | pad_audio: false 35 | random_crop: true 36 | hubert_tokenizer: "none" 37 | sp_path: None 38 | 39 | dataset: 40 | num_workers: 0 41 | max_tokens: 1600000 42 | skip_invalid_size_inputs_valid_test: true 43 | train_subset: train_100 44 | valid_subset: dev_other 45 | required_batch_size_multiple: 1 46 | 47 | criterion: 48 | _name: ctc 49 | zero_infinity: true 50 | 51 | optimization: 52 | max_update: 30000 53 | lr: [0.00001] 54 | sentence_avg: true 55 | update_freq: [1] 56 | 57 | optimizer: 58 | _name: adam 59 | adam_betas: (0.9,0.98) 60 | adam_eps: 1e-08 61 | weight_decay: 0.0 62 | 63 | lr_scheduler: 64 | _name: tri_stage 65 | phase_ratio: [0.1, 0.4, 0.5] 66 | final_lr_scale: 0.05 67 | 68 | model: 69 | _name: speechlm_ctc 70 | w2v_path: ??? 71 | apply_mask: true 72 | mask_prob: 0.65 73 | mask_channel_prob: 0.5 74 | mask_channel_length: 64 75 | layerdrop: 0.1 76 | activation_dropout: 0.1 77 | feature_grad_mult: 0.0 78 | freeze_finetune_updates: 0 79 | 80 | hydra: 81 | job: 82 | config: 83 | override_dirname: 84 | kv_sep: '-' 85 | item_sep: '__' 86 | exclude_keys: 87 | - run 88 | - task.data 89 | - task.label_dir 90 | - model.w2v_path 91 | - dataset.train_subset 92 | - dataset.valid_subset 93 | - criterion.wer_kenlm_model 94 | - criterion.wer_lexicon 95 | run: 96 | dir: ??? 97 | sweep: 98 | dir: ??? 99 | subdir: ${hydra.job.config_name}__${hydra.job.override_dirname} 100 | -------------------------------------------------------------------------------- /SpeechLM/speechlm/config/finetune/speechlm_large_960h.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | 3 | common: 4 | fp16: true 5 | log_format: json 6 | log_interval: 200 7 | tensorboard_logdir: tblog 8 | 9 | checkpoint: 10 | save_interval: 1 11 | keep_last_epochs: 5 12 | keep_best_checkpoints: 5 13 | best_checkpoint_metric: wer 14 | restore_file: checkpoint_last.pt 15 | 16 | distributed_training: 17 | ddp_backend: legacy_ddp 18 | find_unused_parameters: true 19 | distributed_world_size: 32 20 | distributed_port: -1 21 | nprocs_per_node: 8 22 | 23 | task: 24 | _name: joint_sc2t_pretraining 25 | data: ??? 26 | fine_tuning: true 27 | label_dir: ??? 28 | normalize: true # must be consistent with pre-training 29 | labels: ["ltr"] 30 | store_labels: true 31 | single_target: true 32 | add_decoder_target: false 33 | pad_audio: false 34 | random_crop: true 35 | hubert_tokenizer: "none" 36 | sp_path: None 37 | 38 | dataset: 39 | num_workers: 0 40 | max_tokens: 900000 41 | skip_invalid_size_inputs_valid_test: true 42 | train_subset: train_960 43 | valid_subset: dev_other 44 | required_batch_size_multiple: 1 45 | 46 | criterion: 47 | _name: ctc 48 | zero_infinity: true 49 | 50 | optimization: 51 | max_update: 200000 52 | lr: [0.00001] 53 | sentence_avg: true 54 | update_freq: [1] 55 | 56 | optimizer: 57 | _name: adam 58 | adam_betas: (0.9,0.98) 59 | adam_eps: 1e-08 60 | weight_decay: 0.0 61 | 62 | lr_scheduler: 63 | _name: tri_stage 64 | phase_ratio: [0.1, 0.4, 0.5] 65 | final_lr_scale: 0.05 66 | 67 | model: 68 | _name: speechlm_ctc 69 | w2v_path: ??? 
70 | apply_mask: true 71 | mask_prob: 0.5 72 | mask_channel_prob: 0.25 73 | mask_channel_length: 64 74 | layerdrop: 0.0 75 | activation_dropout: 0.1 76 | feature_grad_mult: 0.0 77 | freeze_finetune_updates: 0 78 | 79 | hydra: 80 | job: 81 | config: 82 | override_dirname: 83 | kv_sep: '-' 84 | item_sep: '__' 85 | exclude_keys: 86 | - run 87 | - task.data 88 | - task.label_dir 89 | - model.w2v_path 90 | - dataset.train_subset 91 | - dataset.valid_subset 92 | - criterion.wer_kenlm_model 93 | - criterion.wer_lexicon 94 | run: 95 | dir: ??? 96 | sweep: 97 | dir: ??? 98 | subdir: ${hydra.job.config_name}__${hydra.job.override_dirname} 99 | -------------------------------------------------------------------------------- /SpeechLM/speechlm/config/pretrain/speechlmp_base_cfg.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/SpeechT5/5d66cf5f37e97f4a1999ad519537decc16d852af/SpeechLM/speechlm/config/pretrain/speechlmp_base_cfg.pt -------------------------------------------------------------------------------- /SpeechLM/speechlm/criterions/__init__.py: -------------------------------------------------------------------------------- 1 | import importlib 2 | import os 3 | 4 | for file in os.listdir(os.path.dirname(__file__)): 5 | if file.endswith(".py") and not file.startswith("_"): 6 | criterion_name = file[: file.find(".py")] 7 | importlib.import_module( 8 | "speechlm.criterions." + criterion_name 9 | ) 10 | -------------------------------------------------------------------------------- /SpeechLM/speechlm/data_process/covost2/mp3_to_wav.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from tqdm import tqdm 3 | from pydub import AudioSegment 4 | import torchaudio 5 | import os 6 | 7 | def mp3_convert_wav(mp3_file, wav_file): 8 | try: 9 | sound = AudioSegment.from_mp3(mp3_file) 10 | sound=sound.set_frame_rate(16000) 11 | sound=sound.set_channels(1) 12 | sound=sound.set_sample_width(2) 13 | sound.export(wav_file, format="wav") 14 | except Exception as e: 15 | print(e) 16 | 17 | def main(): 18 | parser = argparse.ArgumentParser() 19 | parser.add_argument("--input", "-i", required=True, type=str) 20 | parser.add_argument("--shard", "-n", required=True, type=int) 21 | parser.add_argument("--rank", "-r", required=True, type=int) 22 | args = parser.parse_args() 23 | 24 | assert args.rank < args.shard, f"rank: {args.rank} >= shard: {args.shard}" 25 | 26 | with open(args.input, 'r') as f: 27 | files = [line.strip() for line in f ] 28 | 29 | mp3_files = files[args.rank::args.shard] 30 | for mp3_file in tqdm(mp3_files): 31 | wav_file = mp3_file.replace("/clips/", "/wav/").replace(".mp3", ".wav") 32 | if os.path.exists(wav_file): 33 | try: 34 | torchaudio.info(wav_file) 35 | except Exception as e: 36 | print(e) 37 | mp3_convert_wav(mp3_file, wav_file) 38 | else: 39 | mp3_convert_wav(mp3_file, wav_file) 40 | 41 | if __name__ == "__main__": 42 | main() 43 | -------------------------------------------------------------------------------- /SpeechLM/speechlm/data_process/filter_paireddata_by_len.py: -------------------------------------------------------------------------------- 1 | # ---------------------------------------------------------------------------- 2 | # SpeechLM: Enhanced Speech Pre-Training with Unpaired Textual Data (https://arxiv.org/abs/2209.15329) 3 | # Github source: https://github.com/microsoft/SpeechT5/tree/main/SpeechLM 4 | # Code based on fairseq: 
https://github.com/facebookresearch/fairseq/tree/272c4c5197250997148fb12c0db6306035f166a4 5 | # 6 | # Copyright (c) 2022 Microsoft 7 | # Licensed under The MIT License [see LICENSE for details] 8 | # ---------------------------------------------------------------------------- 9 | 10 | import os 11 | import argparse 12 | from tqdm import tqdm 13 | import numpy as np 14 | 15 | 16 | lg_label = "__label__{}" 17 | 18 | def writefile(filename, lines): 19 | with open(filename, 'w', encoding='utf-8') as f: 20 | f.writelines(lines) 21 | 22 | 23 | def main(): 24 | parser = argparse.ArgumentParser() 25 | parser.add_argument("--input", "-i", required=True, type=str) 26 | parser.add_argument("--output", "-o", required=True, type=str) 27 | parser.add_argument("--src", "-s", required=True, type=str) 28 | parser.add_argument("--tgt", "-t", required=True, type=str) 29 | parser.add_argument("--max-len", "-m", default=2998, type=int) 30 | args = parser.parse_args() 31 | 32 | src_lines, tgt_lines = [], [] 33 | with open(f"{args.input}.{args.src}", 'r') as f1, open(f"{args.input}.{args.tgt}", 'r') as f2: 34 | for src_line, tgt_line in tqdm(zip(f1, f2)): 35 | src_len = len(src_line.strip().split()) 36 | tgt_len = len(tgt_line.strip().split()) 37 | if src_len < args.max_len and src_len > 0 and tgt_len < args.max_len and tgt_len > 0: 38 | src_lines.append(src_line) 39 | tgt_lines.append(tgt_line) 40 | 41 | writefile(f"{args.output}.{args.src}", src_lines) 42 | writefile(f"{args.output}.{args.tgt}", tgt_lines) 43 | 44 | if __name__ == "__main__": 45 | main() 46 | 47 | 48 | 49 | -------------------------------------------------------------------------------- /SpeechLM/speechlm/data_process/phoneme_tokenizer/repeat_withou_insert_sil_less_4375.py: -------------------------------------------------------------------------------- 1 | # ---------------------------------------------------------------------------- 2 | # SpeechLM: Enhanced Speech Pre-Training with Unpaired Textual Data (https://arxiv.org/abs/2209.15329) 3 | # Github source: https://github.com/microsoft/SpeechT5/tree/main/SpeechLM 4 | # Code based on fairseq: https://github.com/facebookresearch/fairseq/tree/272c4c5197250997148fb12c0db6306035f166a4 5 | # 6 | # Copyright (c) 2022 Microsoft 7 | # Licensed under The MIT License [see LICENSE for details] 8 | # ---------------------------------------------------------------------------- 9 | 10 | import sys, json, tqdm 11 | import numpy as np 12 | 13 | input_file = sys.argv[1] 14 | mean_and_std_file = sys.argv[2] 15 | out_file = sys.argv[3] 16 | 17 | mean_and_std = json.load(open(mean_and_std_file, 'r')) 18 | 19 | with open(input_file, 'r') as f, open(out_file, 'w') as w: 20 | for line in tqdm.tqdm(f): 21 | l = line.split() 22 | 23 | new_l = [] 24 | for phn in l: 25 | if phn not in mean_and_std: 26 | mean_and_std[phn] = [5, 2.5] 27 | print(f'unk phone {phn}') 28 | n = max(1, round(np.random.normal(loc=mean_and_std[phn][0], scale=mean_and_std[phn][1]))) 29 | new_l.extend([phn] * int(n)) 30 | 31 | minus = 0 32 | while len(new_l) >= 4375: 33 | minus += 1 34 | new_l = [] 35 | for phn in l: 36 | n = max(1, round(mean_and_std[phn][0] - minus)) 37 | new_l.extend([phn] * n) 38 | print(f"too long line try minus {minus}") 39 | 40 | w.write(' '.join(new_l)+'\n') 41 | 42 | -------------------------------------------------------------------------------- /SpeechLM/speechlm/data_process/prepare_covost2_enxx.sh: -------------------------------------------------------------------------------- 1 | 2 | #!/bin/bash 3 | [ ${PWD##*/} != 
SpeechLM ] && echo "Error: dir not match! Switch to SpeechLM/ and run it again!" && exit 1 4 | [ $# -lt 1 ] && echo "Usage: $0 <lang> [root=${PWD}/dataset/CommonVoice/v4]" && exit 0 5 | cwd=${PWD} 6 | src=${PWD}/speechlm/data_process 7 | lang=$1 8 | root=$2 9 | [ -z $root ] && root="${PWD}/dataset/CommonVoice/v4" 10 | set -e -o pipefail -u 11 | 12 | 13 | ### step1, convert mp3 to wav 14 | cd $root/en && mkdir -p wav 15 | cut -f2 validated.tsv | sed '1d' | sed "s|^|${root}/en/clips/|" > validated.id 16 | for i in $(seq 0 39); do 17 | echo extracting $i; 18 | python $src/covost2/mp3_to_wav.py -i validated.id -n 40 -r $i & 19 | done 20 | wait 21 | cd $cwd 22 | 23 | 24 | ### step2, manifest 25 | datadir="$root/en/en-$lang" && mkdir -p $datadir && cd $datadir 26 | python /mnt/default/v-ziqzhang/code/stpretrain_scripts/data_process/covost2/prepare_covost_data.py --data-root $root --src-lang en --tgt-lang $lang --vocab-type char 27 | mv ../*en_${lang}.* ./ 28 | 29 | # adjust config_base_en${lang}.yaml 30 | echo "bpe_tokenizer:" > config_base_en${lang}.yaml 31 | echo " bpe: sentencepiece" >> config_base_en${lang}.yaml 32 | echo " sentencepiece_model: spm_char_st_en_de.model" >> config_base_en${lang}.yaml 33 | echo "" >> config_base_en${lang}.yaml 34 | echo "shuffle: false" >> config_base_en${lang}.yaml 35 | echo "use_audio_input: true" >> config_base_en${lang}.yaml 36 | echo "use_sample_rate: 16000" >> config_base_en${lang}.yaml 37 | echo "standardize_audio: false" >> config_base_en${lang}.yaml 38 | echo "vocab_filename: spm_char_st_en_de.txt" >> config_base_en${lang}.yaml 39 | echo "" >> config_base_en${lang}.yaml 40 | echo "# required by speech_to_text task but never used" >> config_base_en${lang}.yaml 41 | echo "input_channels: 1" >> config_base_en${lang}.yaml 42 | echo "input_feat_per_channel: 1" >> config_base_en${lang}.yaml 43 | echo "" >> config_base_en${lang}.yaml 44 | # adjust config_large_en${lang}.yaml 45 | cat config_base_en${lang}.yaml | sed "s|standardize_audio: false|standardize_audio: true|" > config_large_en${lang}.yaml 46 | -------------------------------------------------------------------------------- /SpeechLM/speechlm/data_process/txt2idx.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | [ $# -lt 3 ] && echo "Usage: $0 <input> <outdir> <dict> [suffix]" && exit 0 3 | 4 | input=$1 5 | outdir=$2 6 | DICT=$3 7 | suffix=$4 8 | outname=${input##*/} 9 | outname=${outname%.txt*} 10 | [ -z $input ] && echo "You must specify a source file" && exit 1 11 | 12 | [ -z $DICT ] && echo "No dict was specified!" && exit 1 13 | [ -z $outdir ] && outdir=${input%/*} 14 | [ -z $outdir ] && outdir="." 15 | [ !
-d $outdir ] && mkdir -p $outdir 16 | 17 | echo "------------------------------- creating idx/bin--------------------------------------------" 18 | echo "$input --> $outdir/${outname}${suffix}.idx" 19 | fairseq-preprocess \ 20 | --only-source \ 21 | --trainpref $input \ 22 | --destdir $outdir \ 23 | --thresholdsrc 0 \ 24 | --srcdict ${DICT} \ 25 | --workers 40 26 | 27 | mv $outdir/train.idx $outdir/${outname}${suffix}.idx 28 | mv $outdir/train.bin $outdir/${outname}${suffix}.bin 29 | echo "----------------------------------- done --------------------------------------------" 30 | 31 | -------------------------------------------------------------------------------- /SpeechLM/speechlm/data_process/wrd2ltr.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | def main(): 4 | for line in sys.stdin: 5 | line = line.replace("", "") 6 | line = " ".join(line.strip().split()) 7 | line = line.replace(" ", "|").upper() + "|" 8 | print(" ".join(line)) 9 | 10 | if __name__ == "__main__": 11 | main() 12 | 13 | -------------------------------------------------------------------------------- /SpeechLM/speechlm/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/SpeechT5/5d66cf5f37e97f4a1999ad519537decc16d852af/SpeechLM/speechlm/models/__init__.py -------------------------------------------------------------------------------- /SpeechLM/speechlm/models/speechlm_ctcasr.py: -------------------------------------------------------------------------------- 1 | # ---------------------------------------------------------------------------- 2 | # SpeechLM: Enhanced Speech Pre-Training with Unpaired Textual Data (https://arxiv.org/abs/2209.15329) 3 | # Github source: https://github.com/microsoft/SpeechT5/tree/main/SpeechLM 4 | # Code based on fairseq: https://github.com/facebookresearch/fairseq/tree/272c4c5197250997148fb12c0db6306035f166a4 5 | # 6 | # Copyright (c) 2022 Microsoft 7 | # Licensed under The MIT License [see LICENSE for details] 8 | # ---------------------------------------------------------------------------- 9 | 10 | from dataclasses import dataclass 11 | from fairseq.models import BaseFairseqModel, register_model 12 | from fairseq.tasks import FairseqTask 13 | 14 | from fairseq.models.hubert import HubertAsrConfig, HubertCtc, HubertEncoder 15 | 16 | @dataclass 17 | class SpeechLMCtcConfig(HubertAsrConfig): 18 | pass 19 | 20 | 21 | @register_model("speechlm_ctc", dataclass=SpeechLMCtcConfig) 22 | class SpeechLMCtc(HubertCtc): 23 | def __init__(self, cfg: SpeechLMCtcConfig, w2v_encoder: BaseFairseqModel): 24 | super().__init__(cfg, w2v_encoder) 25 | 26 | @classmethod 27 | def build_model(cls, cfg: SpeechLMCtcConfig, task: FairseqTask): 28 | """Build a new model instance.""" 29 | w2v_encoder = SpeechLMEncoder(cfg, task) 30 | return cls(cfg, w2v_encoder) 31 | 32 | 33 | class SpeechLMEncoder(HubertEncoder): 34 | def __init__(self, cfg: HubertAsrConfig, task): 35 | super().__init__(cfg, task) 36 | 37 | if (task.target_dictionary is not None) and ( 38 | hasattr(self.w2v_model, "unit_encoder_ctc_head") 39 | ): 40 | self.proj = self.w2v_model.unit_encoder_ctc_head 41 | self.conv_ctc_proj = True 42 | else: 43 | self.conv_ctc_proj = False 44 | 45 | def forward(self, source, padding_mask, tbc=True, **kwargs): 46 | results = super().forward( 47 | source, 48 | padding_mask, 49 | tbc, 50 | **kwargs, 51 | ) 52 | if self.conv_ctc_proj: 53 | padding_mask = 
self.w2v_model.downsample_ctc_padding_mask(results["padding_mask"]) 54 | results["encoder_padding_mask"] = padding_mask 55 | results["padding_mask"] = padding_mask 56 | return results 57 | -------------------------------------------------------------------------------- /SpeechLM/speechlm/modules/__init__.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # The YiTrans End-to-End Speech Translation System for IWSLT 2022 Offline Shared Task (https://arxiv.org/abs/2206.05777) 3 | # Github source: https://github.com/microsoft/SpeechT5/tree/main/YiTrans 4 | # Copyright (c) 2022 Microsoft 5 | # Licensed under The MIT License [see LICENSE for details] 6 | # Based on fairseq code bases 7 | # https://github.com/facebookresearch/fairseq 8 | # -------------------------------------------------------- 9 | 10 | from .multihead_attention import MultiheadAttention 11 | from .relative_pos_enc import RelativePositionalEncoding 12 | from .transformer_layer import TransformerEncoderLayerBase, TransformerDecoderLayerBase 13 | from .w2v_encoder import TransformerEncoder, TransformerSentenceEncoderLayer 14 | from .learned_positional_embedding import LearnedPositionalEmbedding 15 | 16 | __all__ = [ 17 | "MultiheadAttention", 18 | "RelativePositionalEncoding", 19 | "TransformerEncoderLayerBase", 20 | "TransformerDecoderLayerBase", 21 | "TransformerEncoder", 22 | "TransformerSentenceEncoderLayer" 23 | ] 24 | -------------------------------------------------------------------------------- /SpeechLM/speechlm/modules/relative_pos_enc.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Pre-Training Transformer Decoder for End-to-End ASR Model with Unpaired Speech Data (https://arxiv.org/abs/2203.17113) 3 | # Github source: https://github.com/microsoft/SpeechT5/tree/main/Speech2C 4 | # Copyright (c) 2022 Microsoft 5 | # Licensed under The MIT License [see LICENSE for details] 6 | # Based on fairseq code bases 7 | # https://github.com/pytorch/fairseq 8 | # -------------------------------------------------------- 9 | 10 | import torch 11 | 12 | class RelativePositionalEncoding(torch.nn.Module): 13 | def __init__(self, d_model, maxlen=1000, embed_v=False): 14 | super(RelativePositionalEncoding, self).__init__() 15 | 16 | self.d_model = d_model 17 | self.maxlen = maxlen 18 | self.pe_k = torch.nn.Embedding(2*maxlen, d_model) 19 | if embed_v: 20 | self.pe_v = torch.nn.Embedding(2*maxlen, d_model) 21 | self.embed_v = embed_v 22 | 23 | 24 | def forward(self, pos_seq, incremental_state=None): 25 | pos_seq[pos_seq < -self.maxlen] = -self.maxlen 26 | pos_seq[pos_seq >= self.maxlen] = self.maxlen - 1 27 | pos_seq = pos_seq + self.maxlen 28 | 29 | if incremental_state is not None: 30 | pos_seq = pos_seq[-1:] 31 | 32 | if self.embed_v: 33 | return self.pe_k(pos_seq), self.pe_v(pos_seq) 34 | else: 35 | return self.pe_k(pos_seq), None 36 | -------------------------------------------------------------------------------- /SpeechLM/speechlm/scripts/pretrain_speechlm/base_speechlmh.sh: -------------------------------------------------------------------------------- 1 | # #################################### 2 | # SpeechLM-H Base model # 3 | # #################################### 4 | [ $# -lt 2 ] && echo "Usage: $0 [mount=${PWD}] [world_size=32] [update_freq=1]" && exit 1 5 | [ ${PWD##*/} != SpeechLM ] && echo "Error: dir not match! 
Switch to SpeechLM/ and run it again!" && exit 1 6 | DATA_DIR=$1 7 | TEXT_DATA_DIR=$2 8 | mount=$3 9 | world_size=$4 10 | update_freq=$5 11 | [ -z $mount ] && mount=${PWD} 12 | [ -z $world_size ] && world_size=32 13 | [ -z $update_freq ] && update_freq=1 14 | 15 | CODE_ROOT=${PWD} 16 | MODEL_DIR="${mount}/exp/pretrain/base_speechlmh_${world_size}gpu_${update_freq}accum" 17 | [ -d $MODEL_DIR ] || mkdir -p $MODEL_DIR 18 | 19 | python $CODE_ROOT/fairseq/fairseq_cli/hydra_train.py \ 20 | --config-dir $CODE_ROOT/speechlm/config/pretrain \ 21 | --config-name speechlm_base_librispeech \ 22 | common.user_dir=$CODE_ROOT/speechlm \ 23 | \ 24 | task.labels='["km"]' \ 25 | model.label_rate=50 \ 26 | task.data=$DATA_DIR \ 27 | task.label_dir=$DATA_DIR \ 28 | task.text_cfg.text_data=$TEXT_DATA_DIR \ 29 | \ 30 | dataset.train_subset=\"train_960+train_text.km-ltr\" \ 31 | dataset.valid_subset=\"dev_clean+dev_clean.km-ltr\" \ 32 | dataset.num_workers=0 \ 33 | dataset.max_tokens=1400000 \ 34 | distributed_training.distributed_world_size=${world_size} \ 35 | optimization.update_freq=[${update_freq}] \ 36 | \ 37 | common.tensorboard_logdir=$MODEL_DIR \ 38 | checkpoint.save_dir=$MODEL_DIR \ 39 | hydra.run.dir=$MODEL_DIR \ 40 | hydra.job.name=pretrain 41 | 42 | # data_dir="/mnt/default/v-ziqzhang/data/stbert/data/librispeech/hubert_release_iter2_layer9_kmeans/local" 43 | # text_data_dir="/mnt/default/v-ziqzhang/dataset/LibriLM/from_fastT2U/bin-idx" 44 | -------------------------------------------------------------------------------- /SpeechLM/speechlm/scripts/pretrain_speechlm/base_speechlmp.sh: -------------------------------------------------------------------------------- 1 | # #################################### 2 | # SpeechLM-P Base model # 3 | # #################################### 4 | [ $# -lt 2 ] && echo "Usage: $0 [mount=${PWD}] [world_size=32] [update_freq=1]" && exit 1 5 | [ ${PWD##*/} != SpeechLM ] && echo "Error: dir not match! Switch to SpeechLM/ and run it again!" 
&& exit 1 6 | DATA_DIR=$1 7 | TEXT_DATA_DIR=$2 8 | mount=$3 9 | world_size=$4 10 | update_freq=$5 11 | [ -z $mount ] && mount=${PWD} 12 | [ -z $world_size ] && world_size=32 13 | [ -z $update_freq ] && update_freq=1 14 | 15 | CODE_ROOT=${PWD} 16 | MODEL_DIR="${mount}/exp/pretrain/base_speechlmp_${world_size}gpu_${update_freq}accum" 17 | [ -d $MODEL_DIR ] || mkdir -p $MODEL_DIR 18 | 19 | python $CODE_ROOT/fairseq/fairseq_cli/hydra_train.py \ 20 | --config-dir $CODE_ROOT/speechlm/config/pretrain \ 21 | --config-name speechlm_base_librispeech \ 22 | common.user_dir=$CODE_ROOT/speechlm \ 23 | \ 24 | task.labels='["phn"]' \ 25 | model.label_rate=100 \ 26 | task.data=$DATA_DIR \ 27 | task.label_dir=$DATA_DIR \ 28 | task.text_cfg.text_data=$TEXT_DATA_DIR \ 29 | \ 30 | dataset.train_subset=\"train_960+train_text.phn-ltr\" \ 31 | dataset.valid_subset=\"dev_clean+dev_clean.phn-ltr\" \ 32 | dataset.num_workers=0 \ 33 | dataset.max_tokens=1400000 \ 34 | distributed_training.distributed_world_size=${world_size} \ 35 | optimization.update_freq=[${update_freq}] \ 36 | \ 37 | common.tensorboard_logdir=$MODEL_DIR \ 38 | checkpoint.save_dir=$MODEL_DIR \ 39 | hydra.run.dir=$MODEL_DIR \ 40 | hydra.job.name=pretrain 41 | 42 | # data_dir="/stdblob/users/v-ziqzhang/dataset/LibriLM/phn2char_sanych/tri4b_mono_label" 43 | # text_data_dir="/stdblob/users/v-ziqzhang/dataset/LibriLM/phn2char_sanych/filt2k_sil025_m5std25_sil14_spn32/bin-idx" 44 | -------------------------------------------------------------------------------- /SpeechLM/speechlm/scripts/pretrain_speechlm/large_speechlmp.sh: -------------------------------------------------------------------------------- 1 | # #################################### 2 | # SpeechLM-P Large model # 3 | # #################################### 4 | [ $# -lt 2 ] && echo "Usage: $0 [mount=${PWD}] [world_size=32] [update_freq=4]" && exit 1 5 | [ ${PWD##*/} != SpeechLM ] && echo "Error: dir not match! Switch to SpeechLM/ and run it again!" 
&& exit 1 6 | DATA_DIR=$1 7 | TEXT_DATA_DIR=$2 8 | mount=$3 9 | world_size=$4 10 | update_freq=$5 11 | [ -z $mount ] && mount=${PWD} 12 | [ -z $world_size ] && world_size=32 13 | [ -z $update_freq ] && update_freq=4 14 | 15 | CODE_ROOT=${PWD} 16 | MODEL_DIR="${mount}/exp/pretrain/large_speechlmp_${world_size}gpu_${update_freq}accum" 17 | [ -d $MODEL_DIR ] || mkdir -p $MODEL_DIR 18 | 19 | python $CODE_ROOT/fairseq/fairseq_cli/hydra_train.py \ 20 | --config-dir $CODE_ROOT/speechlm/config/pretrain \ 21 | --config-name speechlm_large_librilight \ 22 | common.user_dir=$CODE_ROOT/speechlm \ 23 | \ 24 | task.labels='["phn"]' \ 25 | model.label_rate=50 \ 26 | task.data=$DATA_DIR \ 27 | task.label_dir=$DATA_DIR \ 28 | task.text_cfg.text_data=$TEXT_DATA_DIR \ 29 | \ 30 | dataset.train_subset=\"train_60k+train_text.phn-ltr\" \ 31 | dataset.valid_subset=\"dev_clean+dev_clean.phn-ltr\" \ 32 | dataset.num_workers=1 \ 33 | dataset.max_tokens=900000 \ 34 | distributed_training.distributed_world_size=${world_size} \ 35 | optimization.update_freq=[${update_freq}] \ 36 | \ 37 | common.fp16_scale_tolerance=0.1 \ 38 | common.tensorboard_logdir=$MODEL_DIR \ 39 | checkpoint.save_dir=$MODEL_DIR \ 40 | hydra.run.dir=$MODEL_DIR \ 41 | hydra.job.name=pretrain 42 | 43 | # data_dir="/stdblob/users/v-ziqzhang/dataset/librilight/chunkdata" 44 | # text_data_dir="/stdblob/users/v-ziqzhang/dataset/LibriLM/phn2char_sanych/filt2k_sil025_m5std25_sil14_spn32/bin-idx" 45 | -------------------------------------------------------------------------------- /SpeechLM/speechlm/scripts/tokenizer_fastT2U/generate.sh: -------------------------------------------------------------------------------- 1 | ##################################### 2 | # Fast Text2Unit Model # 3 | ##################################### 4 | [ $# -lt 2 ] && echo "Usage: $0 <model_path> <gen_set> [outdir={gen_set%/*}]" && exit 0 5 | [ ${PWD##*/} != SpeechLM ] && echo "Error: dir not match! Switch to SpeechLM/ and run it again!" && exit 1 6 | 7 | model_path=$1 8 | src_dir=${model_path%/*} 9 | cpt=${model_path##*/} 10 | cpt=${cpt%.*} 11 | 12 | gen_set=$2 13 | outdir=$3 14 | 15 | DATA_DIR=${gen_set%/*} 16 | gen_set=${gen_set##*/} 17 | [ -z $outdir ] && outdir=${DATA_DIR} 18 | 19 | CODE_ROOT=${PWD} 20 | 21 | nj=4 22 | for rank in $(seq 0 $((nj-1))); do 23 | results_path=$outdir/pseudo_${gen_set}/${rank} 24 | [ ! -d $results_path ] && mkdir -p $results_path 25 | echo "$model_path" > $results_path/model.record 26 | 27 | python $CODE_ROOT/speechlm/generate_unit.py $DATA_DIR \ 28 | --user-dir $CODE_ROOT/speechlm \ 29 | --config-yaml config_generate.yaml \ 30 | --path ${model_path} \ 31 | --task fast_text_to_unit \ 32 | --gen-subset $gen_set \ 33 | \ 34 | --beam 1 \ 35 | --max-tokens 10000 \ 36 | --results-path $results_path \ 37 | --scoring sacrebleu \ 38 | --skip-invalid-size-inputs-valid-test \ 39 | --distributed-world-size $nj --distributed-rank ${rank} \ 40 | & 41 | done 42 | wait 43 | -------------------------------------------------------------------------------- /SpeechLM/speechlm/scripts/tokenizer_fastT2U/infer.sh: -------------------------------------------------------------------------------- 1 | ##################################### 2 | # Fast Text2Unit Model # 3 | ##################################### 4 | [ $# -lt 2 ] && echo "Usage: $0 <model_path> <gen_set>" && exit 0 5 | [ ${PWD##*/} != SpeechLM ] && echo "Error: dir not match! Switch to SpeechLM/ and run it again!"
&& exit 1 6 | 7 | model_path=$1 8 | src_dir=${model_path%/*} 9 | cpt=${model_path##*/} 10 | cpt=${cpt%.*} 11 | 12 | gen_set=$2 13 | 14 | DATA_DIR=${gen_set%/*} 15 | gen_set=${gen_set##*/} 16 | outdir=$src_dir/decode_${cpt} 17 | 18 | CODE_ROOT=${PWD} 19 | 20 | for subset in ${gen_set//,/ }; do 21 | results_path=$outdir/phone2unit_${subset} 22 | [ ! -d $results_path ] && mkdir -p $results_path 23 | 24 | python $CODE_ROOT/speechlm/generate_unit.py $DATA_DIR \ 25 | --user-dir $CODE_ROOT/speechlm \ 26 | --config-yaml config.yaml \ 27 | --path ${model_path} \ 28 | --task fast_text_to_unit \ 29 | --gen-subset $subset \ 30 | \ 31 | --beam 1 \ 32 | --max-tokens 10000 \ 33 | --results-path $results_path \ 34 | --scoring sacrebleu 35 | 36 | echo $results_path 37 | tail -n 1 $results_path/generate-*.txt 38 | sleep 1s 39 | done 40 | 41 | # --distributed-world-size 1000 --distributed-rank 0 \ 42 | -------------------------------------------------------------------------------- /SpeechLM/speechlm/scripts/tokenizer_fastT2U/train_s_5e-4.sh: -------------------------------------------------------------------------------- 1 | ##################################### 2 | # Fast Text2Unit Model # 3 | ##################################### 4 | [ $# -lt 1 ] && echo "Usage: $0 <data_dir> [mount] [world_size=4] [update_freq=1]" && exit 0 5 | [ ${PWD##*/} != SpeechLM ] && echo "Error: dir not match! Switch to SpeechLM/ and run it again!" && exit 1 6 | 7 | DATA_DIR=$1 8 | mount=$2 9 | world_size=$3 10 | update_freq=$4 11 | [ -z $mount ] && mount=${PWD} 12 | [ -z $world_size ] && world_size=4 13 | [ -z $update_freq ] && update_freq=1 14 | 15 | CODE_ROOT=${PWD} 16 | MODEL_DIR="$mount/exp/fast_text2unit/small_lr5e-4_tristage_ls0.1_${world_size}gpu_${update_freq}accum" 17 | [ -d $MODEL_DIR ] || mkdir -p $MODEL_DIR 18 | 19 | fairseq-train ${DATA_DIR} --save-dir ${MODEL_DIR} \ 20 | --config-yaml config.yaml \ 21 | --user-dir $CODE_ROOT/speechlm \ 22 | --train-subset train_100 --valid-subset dev_clean \ 23 | --num-workers 4 --max-tokens 20000 \ 24 | --distributed-world-size ${world_size} --update-freq ${update_freq} \ 25 | \ 26 | --task fast_text_to_unit --criterion fasttext2unit_criterion --arch fasttext2unit_s \ 27 | --label-smoothing 0.1 \ 28 | \ 29 | --clip-norm 5.0 --n-frames-per-step 1 \ 30 | --dropout 0.1 --attention-dropout 0.1 \ 31 | --optimizer adam --lr 5e-4 --lr-scheduler tri_stage --phase-ratio [0.3,0.0,0.7] --max-update 10000 \ 32 | --seed 1 --best-checkpoint-metric accuracy --maximize-best-checkpoint-metric \ 33 | \ 34 | --save-interval 2 \ 35 | --tensorboard-logdir ${MODEL_DIR} \ 36 | --fp16 --find-unused-parameters \ 37 | | tee ${MODEL_DIR}/train.log 38 | 39 | # DATA_DIR=/mnt/default/v-ziqzhang/dataset/librispeech_phone2unit/phone2unit 40 | -------------------------------------------------------------------------------- /SpeechLM/speechlm/scripts/tune_speechlm_asr/finetune_base_ctc.sh: -------------------------------------------------------------------------------- 1 | # #################################### 2 | # SpeechLM Base model # 3 | # #################################### 4 | [ $# -lt 3 ] && echo "Usage: $0 <w2v_path> <data_dir> <cpt_tag> [mount=${PWD}] [world_size=8] [update_freq=1]" && exit 1 5 | [ ${PWD##*/} != SpeechLM ] && echo "Error: dir not match! Switch to SpeechLM/ and run it again!"
&& exit 1 6 | 7 | w2v_path=$1 8 | DATA_DIR=$2 9 | cpt=$3 10 | mount=$4 11 | world_size=$5 12 | update_freq=$6 13 | [ -z $mount ] && mount=${PWD} 14 | [ -z $world_size ] && world_size=8 15 | [ -z $update_freq ] && update_freq=1 16 | 17 | CODE_ROOT=${PWD} 18 | 19 | exp_name=${w2v_path%/*} 20 | exp_name=${exp_name##*/} 21 | MODEL_DIR="${mount}/exp/finetune_asr/$exp_name/ctc30k_from_${cpt}_bz1.6m_lr1e-5" 22 | [ -d $MODEL_DIR ] || mkdir -p $MODEL_DIR 23 | 24 | python $CODE_ROOT/fairseq/fairseq_cli/hydra_train.py \ 25 | --config-dir $CODE_ROOT/speechlm/config/finetune \ 26 | --config-name speechlm_base_100h \ 27 | common.user_dir=$CODE_ROOT/speechlm \ 28 | \ 29 | task.data=$DATA_DIR \ 30 | task.label_dir=$DATA_DIR \ 31 | model.w2v_path=${w2v_path} \ 32 | \ 33 | optimization.lr=[0.00001] \ 34 | optimization.max_update=30000 \ 35 | dataset.max_tokens=1600000 \ 36 | optimization.update_freq=[${update_freq}] \ 37 | distributed_training.distributed_world_size=${world_size} \ 38 | \ 39 | dataset.train_subset="train_clean_100" \ 40 | dataset.valid_subset="dev_other" \ 41 | \ 42 | common.tensorboard_logdir=$MODEL_DIR \ 43 | checkpoint.save_dir=$MODEL_DIR \ 44 | hydra.run.dir=$MODEL_DIR \ 45 | hydra.job.name=${exp_name} 46 | 47 | # model_path=/mnt/default/v-ziqzhang/data/speechulm/exp/base/base_speechlmp_32gpu_1accum/checkpoint_298_400000.pt 48 | # data_dir=/home/v-ziqzhang/dataset/LibriSpeech/asr 49 | -------------------------------------------------------------------------------- /SpeechLM/speechlm/scripts/tune_speechlm_asr/finetune_large_ctc.sh: -------------------------------------------------------------------------------- 1 | # #################################### 2 | # SpeechLM Large model # 3 | # #################################### 4 | [ $# -lt 3 ] && echo "Usage: $0 [mount=${PWD}] [world_size=8] [update_freq=4]" && exit 1 5 | [ ${PWD##*/} != SpeechLM ] && echo "Error: dir not match! Switch to SpeechLM/ and run it again!" 
&& exit 1 6 | 7 | w2v_path=$1 8 | DATA_DIR=$2 9 | cpt=$3 10 | mount=$4 11 | world_size=$5 12 | update_freq=$6 13 | [ -z $mount ] && mount=${PWD} 14 | [ -z $world_size ] && world_size=8 15 | [ -z $update_freq ] && update_freq=4 16 | 17 | CODE_ROOT=${PWD} 18 | 19 | exp_name=${w2v_path%/*} 20 | exp_name=${exp_name##*/} 21 | MODEL_DIR="${mount}/exp/finetune_asr/$exp_name/ctc200k_from_${cpt}_bz3.6m_lr1e-5" 22 | [ -d $MODEL_DIR ] || mkdir -p $MODEL_DIR 23 | 24 | python $CODE_ROOT/fairseq/fairseq_cli/hydra_train.py \ 25 | --config-dir $CODE_ROOT/speechlm/config/finetune \ 26 | --config-name speechlm_large_960h \ 27 | common.user_dir=$CODE_ROOT/speechlm \ 28 | \ 29 | task.data=$DATA_DIR \ 30 | task.label_dir=$DATA_DIR \ 31 | model.w2v_path=${w2v_path} \ 32 | \ 33 | optimization.lr=[0.00001] \ 34 | optimization.max_update=200000 \ 35 | dataset.max_tokens=900000 \ 36 | optimization.update_freq=[${update_freq}] \ 37 | distributed_training.distributed_world_size=${world_size} \ 38 | \ 39 | dataset.train_subset="train_960" \ 40 | dataset.valid_subset="dev_other" \ 41 | \ 42 | common.tensorboard_logdir=$MODEL_DIR \ 43 | checkpoint.save_dir=$MODEL_DIR \ 44 | hydra.run.dir=$MODEL_DIR \ 45 | hydra.job.name=${exp_name} 46 | 47 | # model_path=/mnt/default/v-ziqzhang/data/speechulm/exp/large/large_speechlmp_32gpu_4accum/checkpoint_31_400000.pt 48 | # data_dir=/home/v-ziqzhang/dataset/LibriSpeech/asr 49 | -------------------------------------------------------------------------------- /SpeechLM/speechlm/scripts/tune_speechlm_asr/inference_ctc.sh: -------------------------------------------------------------------------------- 1 | ##################################### 2 | # SpeechLM Base model # 3 | ##################################### 4 | [ $# -lt 2 ] && echo "Usage: $0 [gen-set=dev_clean,dev_other,test_clean,test_other]" && exit 1 5 | [ ${PWD##*/} != SpeechLM ] && echo "Error: dir not match! Switch to SpeechLM/ and run it again!" && exit 1 6 | 7 | model_path=$1 8 | DATA_DIR=$2 9 | gen_set=$3 10 | [ -z $gen_set ] && gen_set="dev_clean,dev_other,test_clean,test_other" 11 | src_dir=${model_path%/*} 12 | cpt=${model_path##*/} 13 | cpt=${cpt%.*} 14 | 15 | CODE_ROOT=${PWD} 16 | 17 | for subset in ${gen_set//,/ }; do 18 | results_path=$src_dir/decode_${cpt}_ctc/${subset} 19 | [ ! -d $results_path ] && mkdir -p $results_path 20 | 21 | python $CODE_ROOT/speechlm/infer.py \ 22 | --config-dir $CODE_ROOT/speechlm/config/decode \ 23 | --config-name infer_viterbi \ 24 | common.user_dir=$CODE_ROOT/speechlm \ 25 | \ 26 | dataset.gen_subset=${subset} \ 27 | task.data=$DATA_DIR task.label_dir=$DATA_DIR task.normalize=false \ 28 | common_eval.results_path=${results_path} common_eval.path=${model_path} \ 29 | \ 30 | common_eval.quiet=true \ 31 | & 32 | done 33 | wait 34 | 35 | ### important to know 36 | # When loading the fine-tuned model for decoding, fairseq also loads the pre-trained model to use its states['model'] to build the model instance. 37 | # To prevent the error about the w2v_path (if you don't have the pre-trained model at w2v_path), we set common_eval.model_overrides to override 38 | # the w2v_path by speechlmp_base_cfg.pt. speechlmp_base_cfg.pt is just a pre-trained model checkpoint without parameters (only contains config). 39 | # So, if you have trained a model with different model config (e.g. different encoder layers), you should modify the common_eval.model_overrides to your own. 
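# For example (illustrative only; adjust the path to your own checkout), the config-only checkpoint shipped in this repo can be passed via the override below, appended to the python command above: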
40 | # common_eval.model_overrides=\"{\'w2v_path\':\'$CODE_ROOT/speechlm/config/pretrain/speechlmp_base_cfg.pt\'}\" \ 41 | -------------------------------------------------------------------------------- /SpeechLM/speechlm/scripts/tune_speechlm_asr/inference_ctc_kenlm.sh: -------------------------------------------------------------------------------- 1 | ##################################### 2 | # SpeechLM Base model # 3 | ##################################### 4 | [ $# -lt 2 ] && echo "Usage: $0 [gen-set=dev_clean,dev_other,test_clean,test_other]" && exit 1 5 | [ ${PWD##*/} != SpeechLM ] && echo "Error: dir not match! Switch to SpeechLM/ and run it again!" && exit 1 6 | 7 | model_path=$1 8 | DATA_DIR=$2 9 | gen_set=$3 10 | [ -z $gen_set ] && gen_set="dev_clean,dev_other,test_clean,test_other" 11 | src_dir=${model_path%/*} 12 | cpt=${model_path##*/} 13 | cpt=${cpt%.*} 14 | 15 | CODE_ROOT=${PWD} 16 | path_to_lexicon=${DATA_DIR}/librispeech_lexicon.lst 17 | path_to_lm=${DATA_DIR}/4-gram.arpa 18 | [ ! -f $path_to_lexicon ] && echo "Error: $path_to_lexicon not found !" && exit 1 19 | [ ! -f $path_to_lm ] && echo "Error: $path_to_lm not found !" && exit 1 20 | 21 | for subset in ${gen_set//,/ }; do 22 | results_path=$src_dir/decode_${cpt}_ctc/${subset} 23 | [ ! -d $results_path ] && mkdir -p $results_path 24 | 25 | python $CODE_ROOT/speechlm/infer.py \ 26 | --config-dir $CODE_ROOT/speechlm/config/decode \ 27 | --config-name infer_kenlm \ 28 | common.user_dir=$CODE_ROOT/speechlm \ 29 | \ 30 | dataset.gen_subset=${subset} \ 31 | task.data=$DATA_DIR task.label_dir=$DATA_DIR task.normalize=false \ 32 | common_eval.results_path=${results_path} common_eval.path=${model_path} \ 33 | \ 34 | decoding.lexicon=$path_to_lexicon \ 35 | decoding.lmpath=$path_to_lm \ 36 | decoding.beam=1500 \ 37 | \ 38 | common_eval.quiet=false \ 39 | & 40 | done 41 | wait 42 | 43 | ### important to know 44 | # When loading the fine-tuned model for decoding, fairseq also loads the pre-trained model to use its states['model'] to build the model instance. 45 | # To prevent the error about the w2v_path (if you don't have the pre-trained model at w2v_path), we set common_eval.model_overrides to override 46 | # the w2v_path by speechlmp_base_cfg.pt. speechlmp_base_cfg.pt is just a pre-trained model checkpoint without parameters (only contains config). 47 | # So, if you have trained a model with different model config (e.g. different encoder layers), you should modify the common_eval.model_overrides to your own. 48 | # common_eval.model_overrides=\"{\'w2v_path\':\'$CODE_ROOT/speechlm/config/pretrain/speechlmp_base_cfg.pt\'}\" \ 49 | -------------------------------------------------------------------------------- /SpeechLM/speechlm/scripts/tune_speechlm_asr/inference_ctc_large.sh: -------------------------------------------------------------------------------- 1 | ##################################### 2 | # SpeechLM Large model # 3 | ##################################### 4 | [ $# -lt 2 ] && echo "Usage: $0 [gen-set=dev_clean,dev_other,test_clean,test_other]" && exit 1 5 | [ ${PWD##*/} != SpeechLM ] && echo "Error: dir not match! Switch to SpeechLM/ and run it again!" && exit 1 6 | 7 | model_path=$1 8 | DATA_DIR=$2 9 | gen_set=$3 10 | [ -z $gen_set ] && gen_set="dev_clean,dev_other,test_clean,test_other" 11 | src_dir=${model_path%/*} 12 | cpt=${model_path##*/} 13 | cpt=${cpt%.*} 14 | 15 | CODE_ROOT=${PWD} 16 | 17 | for subset in ${gen_set//,/ }; do 18 | results_path=$src_dir/decode_${cpt}_ctc/${subset} 19 | [ ! 
-d $results_path ] && mkdir -p $results_path 20 | 21 | python $CODE_ROOT/speechlm/infer.py \ 22 | --config-dir $CODE_ROOT/speechlm/config/decode \ 23 | --config-name infer_viterbi \ 24 | common.user_dir=$CODE_ROOT/speechlm \ 25 | \ 26 | dataset.gen_subset=${subset} \ 27 | task.data=$DATA_DIR task.label_dir=$DATA_DIR task.normalize=true \ 28 | common_eval.results_path=${results_path} common_eval.path=${model_path} \ 29 | \ 30 | common_eval.quiet=true \ 31 | & 32 | done 33 | wait 34 | 35 | # model_path=/mnt/default/v-ziqzhang/data/speechulm/finetune_asr/large_speechlmp_32gpu_4accum/ctc200k_from_400k_bz3.6m_lr1e-5/checkpoint_convert.pt 36 | # data_dir=/home/v-ziqzhang/dataset/LibriSpeech/asr 37 | -------------------------------------------------------------------------------- /SpeechLM/speechlm/scripts/tune_speechlm_asr/inference_ctc_large_fsqlm.sh: -------------------------------------------------------------------------------- 1 | ##################################### 2 | # SpeechLM Large model # 3 | ##################################### 4 | [ $# -lt 2 ] && echo "Usage: $0 [gen-set=dev_clean,dev_other,test_clean,test_other]" && exit 1 5 | [ ${PWD##*/} != SpeechLM ] && echo "Error: dir not match! Switch to SpeechLM/ and run it again!" && exit 1 6 | 7 | model_path=$1 8 | DATA_DIR=$2 9 | gen_set=$3 10 | [ -z $gen_set ] && gen_set="dev_clean,dev_other,test_clean,test_other" 11 | src_dir=${model_path%/*} 12 | cpt=${model_path##*/} 13 | cpt=${cpt%.*} 14 | 15 | CODE_ROOT=${PWD} 16 | path_to_lexicon=${DATA_DIR}/librispeech_lexicon.lst 17 | path_to_lm=${DATA_DIR}/fairseq_word_lm/lm_librispeech_word_transformer.pt 18 | [ ! -f $path_to_lexicon ] && echo "Error: $path_to_lexicon not found !" && exit 1 19 | [ ! -f $path_to_lm ] && echo "Error: $path_to_lm not found !" && exit 1 20 | 21 | for subset in ${gen_set//,/ }; do 22 | results_path=$src_dir/decode_${cpt}_ctc/${subset} 23 | [ ! -d $results_path ] && mkdir -p $results_path 24 | 25 | python $CODE_ROOT/speechlm/infer.py \ 26 | --config-dir $CODE_ROOT/speechlm/config/decode \ 27 | --config-name infer_fsqlm \ 28 | common.user_dir=$CODE_ROOT/speechlm \ 29 | \ 30 | dataset.gen_subset=${subset} \ 31 | task.data=$DATA_DIR task.label_dir=$DATA_DIR task.normalize=true \ 32 | common_eval.results_path=${results_path} common_eval.path=${model_path} \ 33 | \ 34 | decoding.lexicon=$path_to_lexicon \ 35 | decoding.lmpath=$path_to_lm \ 36 | decoding.lmweight=0.90 \ 37 | decoding.wordscore=-0.31 \ 38 | decoding.beam=500 \ 39 | \ 40 | common_eval.quiet=false \ 41 | & 42 | done 43 | wait 44 | 45 | # model_path=/mnt/default/v-ziqzhang/data/speechulm/finetune_asr/large_speechlmp_32gpu_4accum/ctc200k_from_400k_bz3.6m_lr1e-5/checkpoint_convert.pt 46 | # data_dir=/home/v-ziqzhang/dataset/LibriSpeech/asr 47 | -------------------------------------------------------------------------------- /SpeechLM/speechlm/scripts/tune_speechlm_st/inference_base.sh: -------------------------------------------------------------------------------- 1 | # #################################### 2 | # SpeechLM Base model # 3 | # #################################### 4 | [ $# -lt 3 ] && echo "Usage: $0 [gen-set=dev] [beam_size=5] [lenpen=1.0]" && exit 0 5 | [ ${PWD##*/} != SpeechLM ] && echo "Error: dir not match! Switch to SpeechLM/ and run it again!" 
&& exit 1 6 | 7 | model_path=$1 8 | DATA_DIR=$2 9 | lang=$3 10 | gen_set=$4 11 | beam_size=$5 12 | lenpen=$6 13 | [ -z $gen_set ] && gen_set="dev" 14 | [ -z $beam_size ] && beam_size=5 15 | [ -z $lenpen ] && lenpen=1 16 | src_dir=${model_path%/*} 17 | cpt=${model_path##*/} 18 | cpt=${cpt%.*} 19 | 20 | CODE_ROOT=${PWD} 21 | results_path=$src_dir/decode_${cpt}_beam${beam_size}/${gen_set} 22 | [ ! -d $results_path ] && mkdir -p $results_path 23 | 24 | python $CODE_ROOT/fairseq/fairseq_cli/generate.py $DATA_DIR \ 25 | --gen-subset ${gen_set}_st_en_${lang}_local \ 26 | --max-tokens 2300000 \ 27 | --max-source-positions 2300000 \ 28 | --num-workers 0 \ 29 | \ 30 | --user-dir $CODE_ROOT/speechlm \ 31 | --task speech_to_text \ 32 | --config-yaml config_base_en${lang}.yaml \ 33 | \ 34 | --path ${model_path} \ 35 | --results-path $results_path \ 36 | \ 37 | --scoring sacrebleu --max-len-a 0 --max-len-b 512 \ 38 | --beam ${beam_size} \ 39 | --lenpen $lenpen \ 40 | 41 | echo $results_path 42 | tail -n 1 $results_path/generate-*.txt 43 | sleep 1s 44 | 45 | # model_path=/mnt/default/v-ziqzhang/data/speechulm/finetune_covost/base_speechlmp_32gpu_1accum/legacy_ende_from_400k_bz3.2m_lr1e-4/checkpoint_best_convert.pt 46 | # data_dir=dataset/CommonVoice/v4/en/en-de 47 | -------------------------------------------------------------------------------- /SpeechLM/speechlm/scripts/tune_speechlm_st/inference_large.sh: -------------------------------------------------------------------------------- 1 | # #################################### 2 | # SpeechLM Base model # 3 | # #################################### 4 | [ $# -lt 3 ] && echo "Usage: $0 [gen-set=dev] [beam_size=5] [lenpen=1.0]" && exit 0 5 | [ ${PWD##*/} != SpeechLM ] && echo "Error: dir not match! Switch to SpeechLM/ and run it again!" && exit 1 6 | 7 | model_path=$1 8 | DATA_DIR=$2 9 | lang=$3 10 | gen_set=$4 11 | beam_size=$5 12 | lenpen=$6 13 | [ -z $gen_set ] && gen_set="dev" 14 | [ -z $beam_size ] && beam_size=5 15 | [ -z $lenpen ] && lenpen=1 16 | src_dir=${model_path%/*} 17 | cpt=${model_path##*/} 18 | cpt=${cpt%.*} 19 | 20 | CODE_ROOT=${PWD} 21 | results_path=$src_dir/decode_${cpt}_beam${beam_size}/${gen_set} 22 | [ ! 
-d $results_path ] && mkdir -p $results_path 23 | 24 | python $CODE_ROOT/fairseq/fairseq_cli/generate.py $DATA_DIR \ 25 | --gen-subset ${gen_set}_st_en_${lang}_local \ 26 | --max-tokens 2300000 \ 27 | --max-source-positions 2300000 \ 28 | --num-workers 0 \ 29 | \ 30 | --user-dir $CODE_ROOT/speechlm \ 31 | --task speech_to_text \ 32 | --config-yaml config_large_en${lang}.yaml \ 33 | \ 34 | --path ${model_path} \ 35 | --results-path $results_path \ 36 | \ 37 | --scoring sacrebleu --max-len-a 0 --max-len-b 512 \ 38 | --beam ${beam_size} \ 39 | --lenpen $lenpen \ 40 | 41 | echo $results_path 42 | tail -n 1 $results_path/generate-*.txt 43 | sleep 1s 44 | 45 | # model_path=/mnt/default/v-ziqzhang/data/speechulm/finetune_covost/large_speechlmp_32gpu_4accum/legacy_ende_from_400k_bz3.6m_lr1e-4/checkpoint.avgnbest_convert.pt 46 | # data_dir=dataset/CommonVoice/v4/en/en-de 47 | -------------------------------------------------------------------------------- /SpeechT5/results/ablation_study.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/SpeechT5/5d66cf5f37e97f4a1999ad519537decc16d852af/SpeechT5/results/ablation_study.png -------------------------------------------------------------------------------- /SpeechT5/results/asr.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/SpeechT5/5d66cf5f37e97f4a1999ad519537decc16d852af/SpeechT5/results/asr.png -------------------------------------------------------------------------------- /SpeechT5/results/se.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/SpeechT5/5d66cf5f37e97f4a1999ad519537decc16d852af/SpeechT5/results/se.png -------------------------------------------------------------------------------- /SpeechT5/results/sid.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/SpeechT5/5d66cf5f37e97f4a1999ad519537decc16d852af/SpeechT5/results/sid.png -------------------------------------------------------------------------------- /SpeechT5/results/st.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/SpeechT5/5d66cf5f37e97f4a1999ad519537decc16d852af/SpeechT5/results/st.png -------------------------------------------------------------------------------- /SpeechT5/results/tts.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/SpeechT5/5d66cf5f37e97f4a1999ad519537decc16d852af/SpeechT5/results/tts.png -------------------------------------------------------------------------------- /SpeechT5/results/vc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/SpeechT5/5d66cf5f37e97f4a1999ad519537decc16d852af/SpeechT5/results/vc.png -------------------------------------------------------------------------------- /SpeechT5/speecht5/__init__.py: -------------------------------------------------------------------------------- 1 | from . 
import data, tasks, criterions, models # noqa -------------------------------------------------------------------------------- /SpeechT5/speecht5/criterions/__init__.py: -------------------------------------------------------------------------------- 1 | import importlib 2 | import os 3 | 4 | 5 | for file in os.listdir(os.path.dirname(__file__)): 6 | if file.endswith(".py") and not file.startswith("_"): 7 | criterion_name = file[: file.find(".py")] 8 | importlib.import_module( 9 | "speecht5.criterions." + criterion_name 10 | ) -------------------------------------------------------------------------------- /SpeechT5/speecht5/data/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/SpeechT5/5d66cf5f37e97f4a1999ad519537decc16d852af/SpeechT5/speecht5/data/__init__.py -------------------------------------------------------------------------------- /SpeechT5/speecht5/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .speecht5 import * # noqa 2 | from .t5_transformer_lm import * # noqa 3 | -------------------------------------------------------------------------------- /SpeechT5/speecht5/models/modules/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/SpeechT5/5d66cf5f37e97f4a1999ad519537decc16d852af/SpeechT5/speecht5/models/modules/__init__.py -------------------------------------------------------------------------------- /SpeechT5/speecht5/models/modules/text_encoder_prenet.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # SpeechT5: Unified-Modal Encoder-Decoder Pre-Training for Spoken Language Processing (https://arxiv.org/abs/2110.07205) 3 | # Github source: https://github.com/microsoft/SpeechT5/tree/main/SpeechT5 4 | # Copyright (c) 2021 Microsoft 5 | # Licensed under The MIT License [see LICENSE for details] 6 | # Based on fairseq and espnet code bases 7 | # https://github.com/pytorch/fairseq; https://github.com/espnet/espnet 8 | # -------------------------------------------------------- 9 | 10 | import torch.nn as nn 11 | 12 | from espnet.nets.pytorch_backend.transformer.embedding import PositionalEncoding 13 | from espnet.nets.pytorch_backend.transformer.embedding import ScaledPositionalEncoding 14 | 15 | 16 | class TextEncoderPrenet(nn.Module): 17 | """ 18 | 19 | Args: 20 | in_channels (int): the number of input channels 21 | mid_channels (int): the number of intermediate channels 22 | out_channels (int): the number of output channels 23 | kernel_sizes (List[int]): the kernel size for each convolutional layer 24 | """ 25 | 26 | def __init__( 27 | self, 28 | embed_tokens, 29 | args, 30 | ): 31 | super(TextEncoderPrenet, self).__init__() 32 | self.padding_idx = embed_tokens.padding_idx 33 | # define encoder prenet 34 | # get positional encoding class 35 | pos_enc_class = ( 36 | ScaledPositionalEncoding if args.enc_use_scaled_pos_enc else PositionalEncoding 37 | ) 38 | 39 | self.encoder_prenet = nn.Sequential( 40 | embed_tokens, 41 | pos_enc_class(args.encoder_embed_dim, args.transformer_enc_positional_dropout_rate, max_len=args.max_text_positions), 42 | ) 43 | 44 | def forward(self, src_tokens): 45 | return self.encoder_prenet(src_tokens), src_tokens.eq(self.padding_idx) 46 | -------------------------------------------------------------------------------- 
/SpeechT5/speecht5/models/t5_transformer_lm.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # SpeechT5: Unified-Modal Encoder-Decoder Pre-Training for Spoken Language Processing (https://arxiv.org/abs/2110.07205) 3 | # Github source: https://github.com/microsoft/SpeechT5/tree/main/SpeechT5 4 | # Copyright (c) 2021 Microsoft 5 | # Licensed under The MIT License [see LICENSE for details] 6 | # Based on fairseq and espnet code bases 7 | # https://github.com/pytorch/fairseq; https://github.com/espnet/espnet 8 | # -------------------------------------------------------- 9 | 10 | from fairseq.models import ( 11 | register_model_architecture, 12 | ) 13 | from fairseq.models.transformer_lm import base_lm_architecture 14 | 15 | 16 | @register_model_architecture(model_name="transformer_lm", arch_name="transformer_lm_t5") 17 | def transformer_lm_t5(args): 18 | args.decoder_embed_dim = getattr(args, "decoder_embed_dim", 1280) 19 | args.decoder_ffn_embed_dim = getattr(args, "decoder_ffn_embed_dim", 6144) 20 | args.decoder_layers = getattr(args, "decoder_layers", 20) 21 | args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 16) 22 | args.dropout = getattr(args, "dropout", 0.1) 23 | args.attention_dropout = getattr(args, "attention_dropout", 0.1) 24 | args.activation_fn = getattr(args, "activation_fn", "gelu") 25 | base_lm_architecture(args) 26 | -------------------------------------------------------------------------------- /SpeechT5/speecht5/tasks/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/SpeechT5/5d66cf5f37e97f4a1999ad519537decc16d852af/SpeechT5/speecht5/tasks/__init__.py -------------------------------------------------------------------------------- /SpeechT5/speecht5_framework.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/SpeechT5/5d66cf5f37e97f4a1999ad519537decc16d852af/SpeechT5/speecht5_framework.png -------------------------------------------------------------------------------- /SpeechUT/dataset/LibriSpeech/dict.kmu.txt: -------------------------------------------------------------------------------- 1 | dict.km.txt -------------------------------------------------------------------------------- /SpeechUT/dataset/LibriSpeech/dict.ltr.txt: -------------------------------------------------------------------------------- 1 | | 803288730 2 | E 439294199 3 | T 319071758 4 | A 277306732 5 | O 263784364 6 | N 239361162 7 | I 237353011 8 | H 223346762 9 | S 220175453 10 | R 203352500 11 | D 152198685 12 | L 141597450 13 | U 98913389 14 | M 87138757 15 | C 84680142 16 | W 81375101 17 | F 80240665 18 | G 70642902 19 | Y 68388038 20 | P 58436929 21 | B 52538531 22 | V 33250231 23 | K 26906609 24 | ' 9162896 25 | X 5075632 26 | J 4746771 27 | Q 3401794 28 | Z 2186971 29 | 1 30 | -------------------------------------------------------------------------------- /SpeechUT/dataset/LibriSpeech/dict.txt: -------------------------------------------------------------------------------- 1 | | 94802 2 | E 51860 3 | T 38431 4 | A 33152 5 | O 31495 6 | N 28855 7 | I 28794 8 | H 27187 9 | S 26071 10 | R 23546 11 | D 18289 12 | L 16308 13 | U 12400 14 | M 10685 15 | W 10317 16 | C 9844 17 | F 9062 18 | G 8924 19 | Y 8226 20 | P 6890 21 | B 6339 22 | V 3936 23 | K 3456 24 | ' 1023 25 | X 636 26 | J 598 27 | Q 437 28 | Z 213 
29 | -------------------------------------------------------------------------------- /SpeechUT/dataset/MuSTC/dict.km.txt: -------------------------------------------------------------------------------- 1 | ../LibriSpeech/dict.km.txt -------------------------------------------------------------------------------- /SpeechUT/dataset/MuSTC/dict.kmu.txt: -------------------------------------------------------------------------------- 1 | ../LibriSpeech/dict.km.txt -------------------------------------------------------------------------------- /SpeechUT/dataset/MuSTC/en_de/config.yaml: -------------------------------------------------------------------------------- 1 | vocab_filename: dict.spm.txt 2 | src_vocab_filename: dict.kmu.txt 3 | 4 | -------------------------------------------------------------------------------- /SpeechUT/dataset/MuSTC/en_de/config_ende.yaml: -------------------------------------------------------------------------------- 1 | bpe_tokenizer: 2 | bpe: sentencepiece 3 | sentencepiece_model: spm_unigram10000.model 4 | 5 | sampling_alpha: 1.0 6 | shuffle: false 7 | use_audio_input: true 8 | use_sample_rate: 16000 9 | 10 | vocab_filename: dict.spm.txt 11 | 12 | # required by speech_to_text task but never used 13 | input_channels: 1 14 | input_feat_per_channel: 1 15 | -------------------------------------------------------------------------------- /SpeechUT/dataset/MuSTC/en_de/dict.kmu.txt: -------------------------------------------------------------------------------- 1 | ../../LibriSpeech/dict.km.txt -------------------------------------------------------------------------------- /SpeechUT/dataset/MuSTC/en_de/spm_unigram10000.model: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/SpeechT5/5d66cf5f37e97f4a1999ad519537decc16d852af/SpeechUT/dataset/MuSTC/en_de/spm_unigram10000.model -------------------------------------------------------------------------------- /SpeechUT/dataset/MuSTC/en_es/config.yaml: -------------------------------------------------------------------------------- 1 | vocab_filename: dict.spm.txt 2 | src_vocab_filename: dict.kmu.txt 3 | 4 | -------------------------------------------------------------------------------- /SpeechUT/dataset/MuSTC/en_es/config_enes.yaml: -------------------------------------------------------------------------------- 1 | bpe_tokenizer: 2 | bpe: sentencepiece 3 | sentencepiece_model: spm_unigram10000.model 4 | 5 | sampling_alpha: 1.0 6 | shuffle: false 7 | use_audio_input: true 8 | use_sample_rate: 16000 9 | 10 | vocab_filename: dict.spm.txt 11 | 12 | # required by speech_to_text task but never used 13 | input_channels: 1 14 | input_feat_per_channel: 1 15 | -------------------------------------------------------------------------------- /SpeechUT/dataset/MuSTC/en_es/dict.kmu.txt: -------------------------------------------------------------------------------- 1 | ../../LibriSpeech/dict.km.txt -------------------------------------------------------------------------------- /SpeechUT/dataset/MuSTC/en_es/spm_unigram10000.model: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/SpeechT5/5d66cf5f37e97f4a1999ad519537decc16d852af/SpeechUT/dataset/MuSTC/en_es/spm_unigram10000.model -------------------------------------------------------------------------------- /SpeechUT/dataset/MuSTC/en_fr/config.yaml: -------------------------------------------------------------------------------- 1 
| vocab_filename: dict.spm.txt 2 | src_vocab_filename: dict.kmu.txt 3 | 4 | -------------------------------------------------------------------------------- /SpeechUT/dataset/MuSTC/en_fr/config_enfr.yaml: -------------------------------------------------------------------------------- 1 | bpe_tokenizer: 2 | bpe: sentencepiece 3 | sentencepiece_model: spm_unigram10000.model 4 | 5 | sampling_alpha: 1.0 6 | shuffle: false 7 | use_audio_input: true 8 | use_sample_rate: 16000 9 | 10 | vocab_filename: dict.spm.txt 11 | 12 | # required by speech_to_text task but never used 13 | input_channels: 1 14 | input_feat_per_channel: 1 15 | -------------------------------------------------------------------------------- /SpeechUT/dataset/MuSTC/en_fr/dict.kmu.txt: -------------------------------------------------------------------------------- 1 | ../../LibriSpeech/dict.km.txt -------------------------------------------------------------------------------- /SpeechUT/dataset/MuSTC/en_fr/spm_unigram10000.model: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/SpeechT5/5d66cf5f37e97f4a1999ad519537decc16d852af/SpeechUT/dataset/MuSTC/en_fr/spm_unigram10000.model -------------------------------------------------------------------------------- /SpeechUT/speechut/__init__.py: -------------------------------------------------------------------------------- 1 | from . import data, tasks, criterions, models 2 | -------------------------------------------------------------------------------- /SpeechUT/speechut/criterions/__init__.py: -------------------------------------------------------------------------------- 1 | import importlib 2 | import os 3 | 4 | for file in os.listdir(os.path.dirname(__file__)): 5 | if file.endswith(".py") and not file.startswith("_"): 6 | criterion_name = file[: file.find(".py")] 7 | importlib.import_module( 8 | "speechut.criterions." 
+ criterion_name 9 | ) 10 | -------------------------------------------------------------------------------- /SpeechUT/speechut/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/SpeechT5/5d66cf5f37e97f4a1999ad519537decc16d852af/SpeechUT/speechut/models/__init__.py -------------------------------------------------------------------------------- /SpeechUT/speechut/models/t5_transformer_lm.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Pre-Training Transformer Decoder for End-to-End ASR Model with Unpaired Speech Data (https://arxiv.org/abs/2203.17113) 3 | # Github source: https://github.com/microsoft/SpeechT5/tree/main/Speech2C 4 | # Copyright (c) 2022 Microsoft 5 | # Licensed under The MIT License [see LICENSE for details] 6 | # Based on fairseq code bases 7 | # https://github.com/pytorch/fairseq 8 | # -------------------------------------------------------- 9 | 10 | from fairseq.models import ( 11 | register_model_architecture, 12 | ) 13 | from fairseq.models.transformer_lm import base_lm_architecture 14 | 15 | 16 | @register_model_architecture(model_name="transformer_lm", arch_name="transformer_lm_t5") 17 | def transformer_lm_t5(args): 18 | args.decoder_embed_dim = getattr(args, "decoder_embed_dim", 1280) 19 | args.decoder_ffn_embed_dim = getattr(args, "decoder_ffn_embed_dim", 6144) 20 | args.decoder_layers = getattr(args, "decoder_layers", 20) 21 | args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 16) 22 | args.dropout = getattr(args, "dropout", 0.1) 23 | args.attention_dropout = getattr(args, "attention_dropout", 0.1) 24 | args.activation_fn = getattr(args, "activation_fn", "gelu") 25 | base_lm_architecture(args) 26 | -------------------------------------------------------------------------------- /SpeechUT/speechut/modules/__init__.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Copyright (c) 2022 Microsoft 3 | # Licensed under The MIT License [see LICENSE for details] 4 | # Based on fairseq code bases 5 | # https://github.com/facebookresearch/fairseq 6 | # -------------------------------------------------------- 7 | 8 | from .learned_positional_embedding import LearnedPositionalEmbedding 9 | from .multihead_attention import MultiheadAttention 10 | from .relative_pos_enc import RelativePositionalEncoding 11 | from .transformer_layer import TransformerEncoderLayerBase, TransformerDecoderLayerBase 12 | from .w2v_encoder import TransformerEncoder, TransformerSentenceEncoderLayer 13 | from .transformer_encoder import TransformerEncoderBase 14 | from .transformer_decoder import TransformerDecoderScriptable, TransformerDecoderBaseScriptable 15 | 16 | __all__ = [ 17 | "MultiheadAttention", 18 | "RelativePositionalEncoding", 19 | "LearnedPositionalEmbedding", 20 | "TransformerEncoderLayerBase", 21 | "TransformerDecoderLayerBase", 22 | "TransformerEncoder", 23 | "TransformerSentenceEncoderLayer", 24 | "TransformerEncoderBase", 25 | "TransformerDecoderScriptable", 26 | "TransformerDecoderBaseScriptable", 27 | ] 28 | -------------------------------------------------------------------------------- /SpeechUT/speechut/modules/relative_pos_enc.py: -------------------------------------------------------------------------------- 1 | # 
-------------------------------------------------------- 2 | # Copyright (c) 2022 Microsoft 3 | # Licensed under The MIT License [see LICENSE for details] 4 | # Based on fairseq code bases 5 | # https://github.com/facebookresearch/fairseq 6 | # -------------------------------------------------------- 7 | 8 | import torch 9 | 10 | class RelativePositionalEncoding(torch.nn.Module): 11 | def __init__(self, d_model, maxlen=1000, embed_v=False): 12 | super(RelativePositionalEncoding, self).__init__() 13 | 14 | self.d_model = d_model 15 | self.maxlen = maxlen 16 | self.pe_k = torch.nn.Embedding(2*maxlen, d_model) 17 | if embed_v: 18 | self.pe_v = torch.nn.Embedding(2*maxlen, d_model) 19 | self.embed_v = embed_v 20 | 21 | 22 | def forward(self, pos_seq, incremental_state=None): 23 | pos_seq[pos_seq < -self.maxlen] = -self.maxlen 24 | pos_seq[pos_seq >= self.maxlen] = self.maxlen - 1 25 | pos_seq = pos_seq + self.maxlen 26 | 27 | if incremental_state is not None: 28 | pos_seq = pos_seq[-1:] 29 | 30 | if self.embed_v: 31 | return self.pe_k(pos_seq), self.pe_v(pos_seq) 32 | else: 33 | return self.pe_k(pos_seq), None 34 | -------------------------------------------------------------------------------- /SpeechUT/speechut/scripts/pretrain_speechut/base_speechut_for_asr.sh: -------------------------------------------------------------------------------- 1 | # #################################### 2 | # SpeechUT Base model # 3 | # #################################### 4 | [ $# -lt 2 ] && echo "Usage: $0 [mount=${PWD}] [world_size=32] [update_freq=1]" && exit 1 5 | [ ${PWD##*/} != SpeechUT ] && echo "Error: dir not match! Switch to SpeechUT/ and run it again!" && exit 1 6 | DATA_DIR=$1 7 | TEXT_DATA_DIR=$2 8 | mount=$3 9 | world_size=$4 10 | update_freq=$5 11 | [ -z $mount ] && mount=${PWD} 12 | [ -z $world_size ] && world_size=32 13 | [ -z $update_freq ] && update_freq=1 14 | 15 | CODE_ROOT=${PWD} 16 | MODEL_DIR="${mount}/exp/pretrain/base_speechut4asr_${world_size}gpu_${update_freq}accum" 17 | [ -d $MODEL_DIR ] || mkdir -p $MODEL_DIR 18 | 19 | python $CODE_ROOT/fairseq/fairseq_cli/hydra_train.py \ 20 | --config-dir $CODE_ROOT/speechut/config/pretrain \ 21 | --config-name speechut_base_librispeech \ 22 | common.user_dir=$CODE_ROOT/speechut \ 23 | \ 24 | task.labels='["km"]' \ 25 | model.label_rate=50 \ 26 | task.data=$DATA_DIR \ 27 | task.label_dir=$DATA_DIR \ 28 | task.text_cfg.text_data=$TEXT_DATA_DIR \ 29 | \ 30 | dataset.train_subset=\"train_960+pseudo_libritext.kmu-ltr+merge_960.kmu-none\" \ 31 | dataset.valid_subset=\"dev_clean+dev.kmu-ltr+dev.kmu-none\" \ 32 | dataset.num_workers=0 \ 33 | dataset.max_tokens=1400000 \ 34 | distributed_training.distributed_world_size=${world_size} \ 35 | optimization.update_freq=[${update_freq}] \ 36 | \ 37 | common.tensorboard_logdir=$MODEL_DIR \ 38 | checkpoint.save_dir=$MODEL_DIR \ 39 | hydra.run.dir=$MODEL_DIR \ 40 | hydra.job.name=base_speechut4asr_${world_size}gpu_${update_freq}accum 41 | -------------------------------------------------------------------------------- /SpeechUT/speechut/scripts/pretrain_speechut/base_speechut_for_st.sh: -------------------------------------------------------------------------------- 1 | # #################################### 2 | # SpeechUT Base model # 3 | # #################################### 4 | [ $# -lt 3 ] && echo "Usage: $0 [mount=${PWD}] [world_size=32] [update_freq=1]" && exit 1 5 | [ ${PWD##*/} != SpeechUT ] && echo "Error: dir not match! Switch to SpeechUT/ and run it again!" 
&& exit 1 6 | DATA_DIR=$1 7 | TEXT_DATA_DIR=$2 8 | lang=$3 9 | mount=$4 10 | world_size=$5 11 | update_freq=$6 12 | [ -z $mount ] && mount=${PWD} 13 | [ -z $world_size ] && world_size=32 14 | [ -z $update_freq ] && update_freq=1 15 | 16 | CODE_ROOT=${PWD} 17 | MODEL_DIR="${mount}/exp/pretrain/base_speechut4en${lang}_${world_size}gpu_${update_freq}accum" 18 | [ -d $MODEL_DIR ] || mkdir -p $MODEL_DIR 19 | 20 | python $CODE_ROOT/fairseq/fairseq_cli/hydra_train.py \ 21 | --config-dir $CODE_ROOT/speechut/config/pretrain \ 22 | --config-name speechut_base_librispeech \ 23 | common.user_dir=$CODE_ROOT/speechut \ 24 | \ 25 | task.labels='["km"]' \ 26 | model.label_rate=50 \ 27 | task.data=$DATA_DIR \ 28 | task.label_dir=$DATA_DIR \ 29 | task.text_cfg.text_data=$TEXT_DATA_DIR \ 30 | \ 31 | model.add_text_ctc=false \ 32 | model.text_transformer.share_decoder_input_output_embed=true \ 33 | criterion.u2t_ed_weight=1.0 \ 34 | criterion.u2t_ctc_weight=0 \ 35 | \ 36 | dataset.train_subset=\"train_960,mustcuns_${lang}+pseudo_wmt_en${lang}.kmu-spm+train_960.kmu-none,mustcuns_${lang}.kmu-none\" \ 37 | dataset.valid_subset=\"dev_clean+pseudo_valid.kmu-spm+dev.kmu-none\" \ 38 | dataset.num_workers=0 \ 39 | dataset.max_tokens=1400000 \ 40 | distributed_training.distributed_world_size=${world_size} \ 41 | optimization.update_freq=[${update_freq}] \ 42 | \ 43 | common.tensorboard_logdir=$MODEL_DIR \ 44 | checkpoint.save_dir=$MODEL_DIR \ 45 | hydra.run.dir=$MODEL_DIR \ 46 | hydra.job.name=base_speechut4en${lang}_${world_size}gpu_${update_freq}accum 47 | 48 | -------------------------------------------------------------------------------- /SpeechUT/speechut/scripts/pretrain_speechut/base_speechut_for_st_enfr.sh: -------------------------------------------------------------------------------- 1 | # #################################### 2 | # SpeechUT Base model # 3 | # #################################### 4 | [ $# -lt 3 ] && echo "Usage: $0 [lang=fr] [mount=${PWD}] [world_size=32] [update_freq=1]" && exit 1 5 | [ ${PWD##*/} != SpeechUT ] && echo "Error: dir not match! Switch to SpeechUT/ and run it again!" 
&& exit 1 6 | DATA_DIR=$1 7 | TEXT_DATA_DIR=$2 8 | lang=$3 9 | mount=$4 10 | world_size=$5 11 | update_freq=$6 12 | [ -z $lang ] && lang=fr 13 | [ -z $mount ] && mount=${PWD} 14 | [ -z $world_size ] && world_size=32 15 | [ -z $update_freq ] && update_freq=1 16 | 17 | CODE_ROOT=${PWD} 18 | MODEL_DIR="${mount}/exp/pretrain/base_speechut4en${lang}_${world_size}gpu_${update_freq}accum" 19 | [ -d $MODEL_DIR ] || mkdir -p $MODEL_DIR 20 | 21 | python $CODE_ROOT/fairseq/fairseq_cli/hydra_train.py \ 22 | --config-dir $CODE_ROOT/speechut/config/pretrain \ 23 | --config-name speechut_base_librispeech \ 24 | common.user_dir=$CODE_ROOT/speechut \ 25 | \ 26 | task.labels='["km"]' \ 27 | model.label_rate=50 \ 28 | task.data=$DATA_DIR \ 29 | task.label_dir=$DATA_DIR \ 30 | task.text_cfg.text_data=$TEXT_DATA_DIR \ 31 | \ 32 | model.add_text_ctc=false \ 33 | criterion.u2t_ed_weight=1.0 \ 34 | criterion.u2t_ctc_weight=0 \ 35 | \ 36 | dataset.train_subset=\"train_960,pretrain_mustc+pseudo_wmt14_enfr.kmu-spm+train_960.kmu-none,pretrain_mustc.kmu-none\" \ 37 | dataset.valid_subset=\"dev_clean+pseudo_valid.kmu-spm+dev.kmu-none\" \ 38 | dataset.num_workers=0 \ 39 | dataset.max_tokens=1400000 \ 40 | optimization.max_update=600000 \ 41 | distributed_training.distributed_world_size=${world_size} \ 42 | optimization.update_freq=[${update_freq}] \ 43 | \ 44 | common.tensorboard_logdir=$MODEL_DIR \ 45 | checkpoint.save_dir=$MODEL_DIR \ 46 | hydra.run.dir=$MODEL_DIR \ 47 | hydra.job.name=base_speechut4en${lang}_${world_size}gpu_${update_freq}accum 48 | 49 | -------------------------------------------------------------------------------- /SpeechUT/speechut/scripts/pretrain_speechut/large_speechut_for_asr.sh: -------------------------------------------------------------------------------- 1 | # #################################### 2 | # SpeechUT Large model # 3 | # #################################### 4 | [ $# -lt 2 ] && echo "Usage: $0 [mount=${PWD}] [world_size=32] [update_freq=4]" && exit 1 5 | [ ${PWD##*/} != SpeechUT ] && echo "Error: dir not match! Switch to SpeechUT/ and run it again!" 
&& exit 1 6 | DATA_DIR=$1 7 | TEXT_DATA_DIR=$2 8 | mount=$3 9 | world_size=$4 10 | update_freq=$5 11 | [ -z $mount ] && mount=${PWD} 12 | [ -z $world_size ] && world_size=32 13 | [ -z $update_freq ] && update_freq=4 14 | 15 | CODE_ROOT=${PWD} 16 | MODEL_DIR="${mount}/exp/pretrain/large_speechut4asr_${world_size}gpu_${update_freq}accum" 17 | [ -d $MODEL_DIR ] || mkdir -p $MODEL_DIR 18 | 19 | python $CODE_ROOT/fairseq/fairseq_cli/hydra_train.py \ 20 | --config-dir $CODE_ROOT/speechut/config/pretrain \ 21 | --config-name speechut_large_librilight \ 22 | common.user_dir=$CODE_ROOT/speechut \ 23 | \ 24 | task.labels='["km"]' \ 25 | model.label_rate=50 \ 26 | task.data=$DATA_DIR \ 27 | task.label_dir=$DATA_DIR \ 28 | task.text_cfg.text_data=$TEXT_DATA_DIR \ 29 | \ 30 | dataset.train_subset=\"train_small+pseudo_libritext.kmu-ltr\" \ 31 | dataset.valid_subset=\"dev_clean+dev.kmu-ltr\" \ 32 | dataset.num_workers=0 \ 33 | dataset.max_tokens=900000 \ 34 | distributed_training.distributed_world_size=${world_size} \ 35 | optimization.update_freq=[${update_freq}] \ 36 | \ 37 | common.tensorboard_logdir=$MODEL_DIR \ 38 | checkpoint.save_dir=$MODEL_DIR \ 39 | hydra.run.dir=$MODEL_DIR \ 40 | hydra.job.name=large_speechut4asr_${world_size}gpu_${update_freq}accum 41 | -------------------------------------------------------------------------------- /SpeechUT/speechut/scripts/tune_speechut_asr/finetune960h_large_edctc.sh: -------------------------------------------------------------------------------- 1 | # #################################### 2 | # SpeechUT Large model # 3 | # #################################### 4 | [ $# -lt 3 ] && echo "Usage: $0 [mount=${PWD}] [world_size=8] [update_freq=3]" && exit 1 5 | [ ${PWD##*/} != SpeechUT ] && echo "Error: dir not match! Switch to SpeechUT/ and run it again!" 
&& exit 1 6 | 7 | w2v_path=$1 8 | DATA_DIR=$2 9 | cpt=$3 10 | mount=$4 11 | world_size=$5 12 | update_freq=$6 13 | [ -z $mount ] && mount=${PWD} 14 | [ -z $world_size ] && world_size=8 15 | [ -z $update_freq ] && update_freq=3 16 | 17 | CODE_ROOT=${PWD} 18 | 19 | exp_name=${w2v_path%/*} 20 | exp_name=${exp_name##*/} 21 | MODEL_DIR="${mount}/exp/finetune_asr/$exp_name/960h_edctc80k_from_${cpt}_bz3.3m_lr1e-5" 22 | [ -d $MODEL_DIR ] || mkdir -p $MODEL_DIR 23 | 24 | python $CODE_ROOT/fairseq/fairseq_cli/hydra_train.py \ 25 | --config-dir $CODE_ROOT/speechut/config/finetune_asr \ 26 | --config-name speechut_large_960h \ 27 | common.user_dir=$CODE_ROOT/speechut \ 28 | \ 29 | task.data=$DATA_DIR \ 30 | task.label_dir=$DATA_DIR \ 31 | model.w2v_path=${w2v_path} \ 32 | \ 33 | optimization.lr=[0.00001] \ 34 | optimization.max_update=80000 \ 35 | dataset.max_tokens=1100000 \ 36 | optimization.update_freq=[${update_freq}] \ 37 | distributed_training.distributed_world_size=${world_size} \ 38 | \ 39 | dataset.train_subset="train_960" \ 40 | dataset.valid_subset="dev_other" \ 41 | \ 42 | common.tensorboard_logdir=$MODEL_DIR \ 43 | checkpoint.save_dir=$MODEL_DIR \ 44 | hydra.run.dir=$MODEL_DIR \ 45 | hydra.job.name=960h_edctc80k_from_${cpt}_bz3.3m_lr1e-5 46 | -------------------------------------------------------------------------------- /SpeechUT/speechut/scripts/tune_speechut_asr/finetune_base_edctc.sh: -------------------------------------------------------------------------------- 1 | # #################################### 2 | # SpeechUT Base model # 3 | # #################################### 4 | [ $# -lt 3 ] && echo "Usage: $0 [mount=${PWD}] [world_size=8] [update_freq=2]" && exit 1 5 | [ ${PWD##*/} != SpeechUT ] && echo "Error: dir not match! Switch to SpeechUT/ and run it again!" 
&& exit 1 6 | 7 | w2v_path=$1 8 | DATA_DIR=$2 9 | cpt=$3 10 | mount=$4 11 | world_size=$5 12 | update_freq=$6 13 | [ -z $mount ] && mount=${PWD} 14 | [ -z $world_size ] && world_size=8 15 | [ -z $update_freq ] && update_freq=2 16 | 17 | CODE_ROOT=${PWD} 18 | 19 | exp_name=${w2v_path%/*} 20 | exp_name=${exp_name##*/} 21 | MODEL_DIR="${mount}/exp/finetune_asr/$exp_name/edctc40k_from_${cpt}_bz2.6m_lr1e-5" 22 | [ -d $MODEL_DIR ] || mkdir -p $MODEL_DIR 23 | 24 | python $CODE_ROOT/fairseq/fairseq_cli/hydra_train.py \ 25 | --config-dir $CODE_ROOT/speechut/config/finetune_asr \ 26 | --config-name speechut_base_100h \ 27 | common.user_dir=$CODE_ROOT/speechut \ 28 | \ 29 | task.data=$DATA_DIR \ 30 | task.label_dir=$DATA_DIR \ 31 | model.w2v_path=${w2v_path} \ 32 | \ 33 | optimization.lr=[0.00001] \ 34 | optimization.max_update=40000 \ 35 | dataset.max_tokens=1300000 \ 36 | optimization.update_freq=[${update_freq}] \ 37 | distributed_training.distributed_world_size=${world_size} \ 38 | \ 39 | dataset.train_subset="train_clean_100" \ 40 | dataset.valid_subset="dev_other" \ 41 | \ 42 | common.tensorboard_logdir=$MODEL_DIR \ 43 | checkpoint.save_dir=$MODEL_DIR \ 44 | hydra.run.dir=$MODEL_DIR \ 45 | hydra.job.name=edctc40k_from_${cpt}_bz2.6m_lr1e-5 46 | -------------------------------------------------------------------------------- /SpeechUT/speechut/scripts/tune_speechut_asr/inference_edctc.sh: -------------------------------------------------------------------------------- 1 | ##################################### 2 | # SpeechUT ASR model # 3 | ##################################### 4 | [ $# -lt 2 ] && echo "Usage: $0 [gen-set=dev_other] [beam_size=10] [ctc_weight=0.2] [--normalize]" && exit 1 5 | [ ${PWD##*/} != SpeechUT ] && echo "Error: dir not match! Switch to SpeechUT/ and run it again!" && exit 1 6 | 7 | model_path=$1 8 | DATA_DIR=$2 9 | gen_set=$3 10 | beam_size=$4 11 | ctc_weight=$5 12 | extra=$6 13 | [ -z $extra ] && echo "Assert decoding base model! If you are decoding large model, please add '--normalize' at the end..." 14 | [ -z $gen_set ] && gen_set="dev_other" 15 | [ -z $beam_size ] && beam_size=10 16 | [ -z $ctc_weight ] && ctc_weight=0.2 17 | [ $ctc_weight == 0 ] && [ $beam_size != 1 ] && echo "Change beam size to 1 as no ctc-decoding used..." && beam_size=1 18 | [ $ctc_weight != 0 ] && extra="$extra --batch-size 1" 19 | 20 | src_dir=${model_path%/*} 21 | cpt=${model_path##*/} 22 | cpt=${cpt%.*} 23 | 24 | CODE_ROOT=${PWD} 25 | 26 | for subset in ${gen_set//,/ }; do 27 | results_path=$src_dir/decode_${cpt}/beam${beam_size}_ctc${ctc_weight}/${subset}_${world_size}_${rank} 28 | [ ! 
-d $results_path ] && mkdir -p $results_path 29 | 30 | python $CODE_ROOT/fairseq/fairseq_cli/generate.py $DATA_DIR \ 31 | --user-dir $CODE_ROOT/speechut \ 32 | --label-dir ${DATA_DIR} \ 33 | --labels '["ltr"]' \ 34 | --single-target \ 35 | --post-process letter \ 36 | --gen-subset ${subset} \ 37 | --max-tokens 2000000 \ 38 | \ 39 | --task joint_sc2t_pretraining \ 40 | --add-decoder-target \ 41 | --fine-tuning \ 42 | --pad-audio \ 43 | --random-crop \ 44 | \ 45 | --ctc-weight ${ctc_weight} $extra \ 46 | --beam ${beam_size} \ 47 | \ 48 | --path ${model_path} \ 49 | --results-path $results_path \ 50 | \ 51 | --scoring wer --max-len-a 0.00078125 --max-len-b 200 \ 52 | & 53 | done 54 | wait 55 | 56 | 57 | for subset in ${gen_set//,/ }; do 58 | results_path=$src_dir/decode_${cpt}/beam${beam_size}_ctc${ctc_weight}/${subset}_${world_size}_${rank} 59 | echo $results_path 60 | tail -n 1 $results_path/generate-*.txt 61 | done 62 | -------------------------------------------------------------------------------- /SpeechUT/speechut/scripts/tune_speechut_asr/inference_edctclm.sh: -------------------------------------------------------------------------------- 1 | ##################################### 2 | # SpeechUT ASR model # 3 | ##################################### 4 | [ $# -lt 2 ] && echo "Usage: $0 [gen-set=dev_other] [beam_size=30] [ctc_weight=0.3] [lm_weight=0.7] [lm_path] [--normalize]" && exit 1 5 | [ ${PWD##*/} != SpeechUT ] && echo "Error: dir not match! Switch to SpeechUT/ and run it again!" && exit 1 6 | 7 | model_path=$1 8 | DATA_DIR=$2 9 | gen_set=$3 10 | beam_size=$4 11 | ctc_weight=$5 12 | lm_weight=$6 13 | lm_path=$7 14 | extra=$8 15 | [ -z $extra ] && echo "Assert decoding base model! If you are decoding large model, please add '--normalize' at the end..." 16 | [ -z $gen_set ] && gen_set="dev_other" 17 | [ -z $beam_size ] && beam_size=30 18 | [ -z $ctc_weight ] && ctc_weight=0.3 19 | [ -z $lm_weight ] && lm_weight=0.7 20 | [ -z $lm_path ] && lm_path="/mnt/default/v-junyiao/librispeech/lm/lm_ctc_form/checkpoint_best.pt" 21 | [ $ctc_weight == 0 ] && [ $beam_size != 1 ] && echo "Change beam size to 1 and lm_weight to 0 as no ctc-decoding used..." && beam_size=1 && lm_weight=0 22 | [ $ctc_weight != 0 ] && extra="$extra --batch-size 1" 23 | 24 | src_dir=${model_path%/*} 25 | cpt=${model_path##*/} 26 | cpt=${cpt%.*} 27 | 28 | CODE_ROOT=${PWD} 29 | 30 | for subset in ${gen_set//,/ }; do 31 | results_path=$src_dir/decode_${cpt}/beam${beam_size}_ctc${ctc_weight}_lm${lm_weight}/${subset}_${world_size}_${rank} 32 | [ ! 
-d $results_path ] && mkdir -p $results_path 33 | 34 | python $CODE_ROOT/fairseq/fairseq_cli/generate.py $DATA_DIR \ 35 | --user-dir $CODE_ROOT/speechut \ 36 | --label-dir ${DATA_DIR} \ 37 | --labels '["ltr"]' \ 38 | --single-target \ 39 | --post-process letter \ 40 | --gen-subset ${subset} \ 41 | --max-tokens 800000 \ 42 | \ 43 | --task joint_sc2t_pretraining \ 44 | --add-decoder-target \ 45 | --fine-tuning \ 46 | --pad-audio \ 47 | --random-crop \ 48 | \ 49 | --ctc-weight ${ctc_weight} $extra \ 50 | --lm-weight ${lm_weight} --lm-path ${lm_path} \ 51 | --beam ${beam_size} \ 52 | \ 53 | --path ${model_path} \ 54 | --results-path ${results_path} \ 55 | \ 56 | --scoring wer --max-len-a 0.00078125 --max-len-b 200 \ 57 | & 58 | done 59 | wait 60 | 61 | 62 | for subset in ${gen_set//,/ }; do 63 | results_path=$src_dir/decode_${cpt}/beam${beam_size}_ctc${ctc_weight}_lm${lm_weight}/${subset}_${world_size}_${rank} 64 | echo $results_path 65 | tail -n 1 $results_path/generate-*.txt 66 | done 67 | -------------------------------------------------------------------------------- /SpeechUT/speechut/scripts/tune_speechut_st/inference_st.sh: -------------------------------------------------------------------------------- 1 | # #################################### 2 | # SpeechUT Base model # 3 | # #################################### 4 | [ $# -lt 3 ] && echo "Usage: $0 [gen-set=dev] [beam_size=10] [lenpen=1.0]" && exit 0 5 | [ ${PWD##*/} != SpeechUT ] && echo "Error: dir not match! Switch to SpeechUT/ and run it again!" && exit 1 6 | 7 | model_path=$1 8 | DATA_DIR=$2 9 | lang=$3 10 | gen_set=$4 11 | beam_size=$5 12 | lenpen=$6 13 | [ -z $gen_set ] && gen_set="dev" 14 | [ -z $beam_size ] && beam_size=10 15 | [ -z $lenpen ] && lenpen=1 16 | src_dir=${model_path%/*} 17 | cpt=${model_path##*/} 18 | cpt=${cpt%.*} 19 | 20 | CODE_ROOT=${PWD} 21 | results_path=$src_dir/decode_${cpt}_beam${beam_size}/${gen_set} 22 | [ ! -d $results_path ] && mkdir -p $results_path 23 | 24 | python $CODE_ROOT/fairseq/fairseq_cli/generate.py $DATA_DIR \ 25 | --gen-subset ${gen_set}_st \ 26 | --max-tokens 2000000 \ 27 | --max-source-positions 2000000 \ 28 | --num-workers 0 \ 29 | \ 30 | --user-dir $CODE_ROOT/speechut \ 31 | --task speech_to_text \ 32 | --config-yaml config_en${lang}.yaml \ 33 | \ 34 | --path ${model_path} \ 35 | --results-path $results_path \ 36 | \ 37 | --scoring sacrebleu --max-len-a 0 --max-len-b 512 \ 38 | --beam ${beam_size} \ 39 | --lenpen $lenpen \ 40 | # --model-overrides "{'model':{'w2v_path':'/path/to/your/pretrained/model.pt'}}" \ 41 | 42 | echo $results_path 43 | tail -n 1 $results_path/generate-*.txt 44 | sleep 1s 45 | -------------------------------------------------------------------------------- /VATLM/vat_hubert/requirements.txt: -------------------------------------------------------------------------------- 1 | python-speech-features==0.6 2 | scipy==1.5.4 3 | opencv-python==4.5.4.60 4 | sentencepiece==0.1.96 5 | editdistance==0.6.0 6 | kaldiio==2.17.2 -------------------------------------------------------------------------------- /VATLM/vat_hubert/vathubert/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 
5 | 6 | # from .hubert import * # noqa 7 | # from .hubert_asr import * # noqa 8 | # from .hubert_dataset import * 9 | # from .hubert_pretraining import * 10 | # from .hubert_criterion import * 11 | from . import data, tasks, criterions, models -------------------------------------------------------------------------------- /VATLM/vat_hubert/vathubert/conf/s2s_decode.yaml: -------------------------------------------------------------------------------- 1 | common: 2 | user_dir: ??? 3 | 4 | generation: 5 | beam: 50 6 | max_len_a: 1.0 7 | max_len_b: 0 8 | lenpen: 1.0 9 | lm_weight: 0 10 | 11 | common_eval: 12 | results_path: ??? 13 | path: ??? 14 | 15 | dataset: 16 | max_tokens: 1000 17 | gen_subset: valid 18 | num_workers: 0 19 | 20 | override: 21 | noise_prob: 0.0 22 | noise_snr: 0 23 | modalities: ??? 24 | -------------------------------------------------------------------------------- /VATLM/vat_hubert/vathubert/criterions/__init__.py: -------------------------------------------------------------------------------- 1 | import importlib 2 | import os 3 | 4 | for file in os.listdir(os.path.dirname(__file__)): 5 | if file.endswith(".py") and not file.startswith("_"): 6 | criterion_name = file[: file.find(".py")] 7 | importlib.import_module( 8 | "vathubert.criterions." + criterion_name 9 | ) 10 | -------------------------------------------------------------------------------- /VATLM/vat_hubert/vathubert/decode_avhubert_lrs3.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | decode_path=/path/to/finetuned_model 4 | finetuned_model=checkpoint_best.pt 5 | beam=50 6 | data=$1 7 | [ -z $data ] && data="test" 8 | 9 | python -B infer_s2s.py --config-dir /path/to/vat_hubert/vathubert/conf/ --config-name s2s_decode.yaml \ 10 | dataset.gen_subset=${data} common_eval.path=${decode_path}/checkpoints/${finetuned_model} \ 11 | common_eval.results_path=${decode_path}/${finetuned_model}_${data}_video_beam${beam} \ 12 | override.modalities=["video"] \ 13 | common.user_dir=/path/to/vat_hubert/vathubert \ 14 | override.data=/path/to/data \ 15 | override.label_dir=/path/to/data \ 16 | generation.beam=${beam} 17 | 18 | -------------------------------------------------------------------------------- /VATLM/vat_hubert/vathubert/scripts/finetune_avsr/base_lrs3_finetune30_av.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ngpu=$1 4 | updatefreq=$2 5 | max_tokens=$3 6 | pretrained_model_path=$4 7 | save_path=$5 8 | 9 | python /path/to/fairseq/fairseq_cli/hydra_train.py \ 10 | --config-dir /path/to/vat_hubert/vathubert/conf/finetune --config-name base_lrs3_30h_av.yaml \ 11 | task.data=/path/to/30h_data_tsv \ 12 | task.label_dir=/path/to/30h_data_tsv \ 13 | task.tokenizer_bpe_model=/path/to/sentencepiece/model \ 14 | task.modalities=["audio","video"] \ 15 | model.w2v_path=${pretrained_model_path} \ 16 | hydra.run.dir=${save_path} \ 17 | common.user_dir=/path/to/vat_hubert/vathubert \ 18 | distributed_training.distributed_world_size=${ngpu} \ 19 | distributed_training.ddp_backend="no_c10d" \ 20 | optimization.update_freq=[${updatefreq}] \ 21 | dataset.max_tokens=${max_tokens} \ 22 | +task.use_supervised_data=False \ 23 | +task.use_extra_textdata=False \ 24 | +task.use_extra_audiodata=False \ 25 | 26 | 27 | 28 | -------------------------------------------------------------------------------- /VATLM/vat_hubert/vathubert/scripts/finetune_avsr/base_vox_finetune30_av.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ngpu=$1 4 | updatefreq=$2 5 | max_tokens=$3 6 | pretrained_model_path=$4 7 | save_path=$5 8 | 9 | python /path/to/fairseq/fairseq_cli/hydra_train.py \ 10 | --config-dir /path/to/vat_hubert/vathubert/conf/finetune --config-name base_vox_30h_av.yaml \ 11 | task.data=/path/to/30h_data_tsv \ 12 | task.label_dir=/path/to/30h_data_tsv \ 13 | task.tokenizer_bpe_model=/path/to/sentencepiece/model \ 14 | task.modalities=["audio","video"] \ 15 | model.w2v_path=${pretrained_model_path} \ 16 | hydra.run.dir=${save_path} \ 17 | common.user_dir=/path/to/vat_hubert/vathubert \ 18 | distributed_training.distributed_world_size=${ngpu} \ 19 | distributed_training.ddp_backend="no_c10d" \ 20 | optimization.update_freq=[${updatefreq}] \ 21 | dataset.max_tokens=${max_tokens} \ 22 | +task.use_supervised_data=False \ 23 | +task.use_extra_textdata=False \ 24 | +task.use_extra_audiodata=False \ 25 | 26 | 27 | 28 | 29 | -------------------------------------------------------------------------------- /VATLM/vat_hubert/vathubert/scripts/finetune_avsr/base_vox_finetune433_av.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ngpu=$1 4 | updatefreq=$2 5 | max_tokens=$3 6 | pretrained_model_path=$4 7 | save_path=$5 8 | 9 | python /path/to/fairseq/fairseq_cli/hydra_train.py \ 10 | --config-dir /path/to/vat_hubert/vathubert/conf/finetune --config-name base_vox_433h_av.yaml \ 11 | task.data=/path/to/433h_data_tsv \ 12 | task.label_dir=/path/to/433h_data_tsv \ 13 | task.tokenizer_bpe_model=/path/to/sentencepiece/model \ 14 | task.modalities=["audio","video"] \ 15 | model.w2v_path=${pretrained_model_path} \ 16 | hydra.run.dir=${save_path} \ 17 | common.user_dir=/path/to/vat_hubert/vathubert \ 18 | distributed_training.distributed_world_size=${ngpu} \ 19 | distributed_training.ddp_backend="no_c10d" \ 20 | optimization.update_freq=[${updatefreq}] \ 21 | dataset.max_tokens=${max_tokens} \ 22 | +task.use_supervised_data=False \ 23 | +task.use_extra_textdata=False \ 24 | +task.use_extra_audiodata=False \ 25 | 26 | 27 | 28 | 29 | -------------------------------------------------------------------------------- /VATLM/vat_hubert/vathubert/scripts/finetune_avsr/large_vox_finetune30_av.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ngpu=$1 4 | updatefreq=$2 5 | max_tokens=$3 6 | pretrained_model_path=$4 7 | save_path=$5 8 | 9 | python /path/to/fairseq/fairseq_cli/hydra_train.py \ 10 | --config-dir /path/to/vat_hubert/vathubert/conf/finetune --config-name large_vox_30h_av.yaml \ 11 | task.data=/path/to/30h_data_tsv \ 12 | task.label_dir=/path/to/30h_data_tsv \ 13 | task.tokenizer_bpe_model=/path/to/sentencepiece/model \ 14 | task.modalities=["audio","video"] \ 15 | model.w2v_path=${pretrained_model_path} \ 16 | hydra.run.dir=${save_path} \ 17 | common.user_dir=/path/to/vat_hubert/vathubert \ 18 | distributed_training.distributed_world_size=${ngpu} \ 19 | distributed_training.ddp_backend="no_c10d" \ 20 | optimization.update_freq=[${updatefreq}] \ 21 | dataset.max_tokens=${max_tokens} \ 22 | +task.use_supervised_data=False \ 23 | +task.use_extra_textdata=False \ 24 | +task.use_extra_audiodata=False \ -------------------------------------------------------------------------------- /VATLM/vat_hubert/vathubert/scripts/finetune_avsr/large_vox_finetune433_av.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ngpu=$1 4 | updatefreq=$2 5 | max_tokens=$3 6 | pretrained_model_path=$4 7 | save_path=$5 8 | 9 | python /path/to/fairseq/fairseq_cli/hydra_train.py \ 10 | --config-dir /path/to/vat_hubert/vathubert/conf/finetune --config-name large_vox_433h_av.yaml \ 11 | task.data=/path/to/433h_data_tsv \ 12 | task.label_dir=/path/to/433h_data_tsv \ 13 | task.tokenizer_bpe_model=/path/to/sentencepiece/model \ 14 | task.modalities=["audio","video"] \ 15 | model.w2v_path=${pretrained_model_path} \ 16 | hydra.run.dir=${save_path} \ 17 | common.user_dir=/path/to/vat_hubert/vathubert \ 18 | distributed_training.distributed_world_size=${ngpu} \ 19 | distributed_training.ddp_backend="no_c10d" \ 20 | optimization.update_freq=[${updatefreq}] \ 21 | dataset.max_tokens=${max_tokens} \ 22 | +task.use_supervised_data=False \ 23 | +task.use_extra_textdata=False \ 24 | +task.use_extra_audiodata=False \ 25 | 26 | 27 | 28 | 29 | -------------------------------------------------------------------------------- /VATLM/vat_hubert/vathubert/scripts/finetune_vsr/base_lrs3_finetune30_v.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ngpu=$1 4 | updatefreq=$2 5 | max_tokens=$3 6 | pretrained_model_path=$4 7 | save_path=$5 8 | 9 | python /path/to/fairseq/fairseq_cli/hydra_train.py \ 10 | --config-dir /path/to/vat_hubert/vathubert/conf/finetune --config-name base_lrs3_30h_v.yaml \ 11 | task.data=/path/to/30h_data_tsv \ 12 | task.label_dir=/path/to/30h_data_tsv \ 13 | task.tokenizer_bpe_model=/path/to/sentencepiece/model \ 14 | task.modalities=["video"] \ 15 | model.w2v_path=${pretrained_model_path} \ 16 | hydra.run.dir=${save_path} \ 17 | common.user_dir=/path/to/vat_hubert/vathubert \ 18 | distributed_training.distributed_world_size=${ngpu} \ 19 | distributed_training.ddp_backend="no_c10d" \ 20 | optimization.update_freq=[${updatefreq}] \ 21 | dataset.max_tokens=${max_tokens} \ 22 | +task.use_supervised_data=False \ 23 | +task.use_extra_textdata=False \ 24 | +task.use_extra_audiodata=False \ 25 | 26 | 27 | 28 | -------------------------------------------------------------------------------- /VATLM/vat_hubert/vathubert/scripts/finetune_vsr/base_vox_finetune30_v.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ngpu=$1 4 | updatefreq=$2 5 | max_tokens=$3 6 | pretrained_model_path=$4 7 | save_path=$5 8 | 9 | python /path/to/fairseq/fairseq_cli/hydra_train.py \ 10 | --config-dir /path/to/vat_hubert/vathubert/conf/finetune --config-name base_vox_30h_v.yaml \ 11 | task.data=/path/to/30h_data_tsv \ 12 | task.label_dir=/path/to/30h_data_tsv \ 13 | task.tokenizer_bpe_model=/path/to/sentencepiece/model \ 14 | task.modalities=["video"] \ 15 | model.w2v_path=${pretrained_model_path} \ 16 | hydra.run.dir=${save_path} \ 17 | common.user_dir=/path/to/vat_hubert/vathubert \ 18 | distributed_training.distributed_world_size=${ngpu} \ 19 | distributed_training.ddp_backend="no_c10d" \ 20 | optimization.update_freq=[${updatefreq}] \ 21 | dataset.max_tokens=${max_tokens} \ 22 | +task.use_supervised_data=False \ 23 | +task.use_extra_textdata=False \ 24 | +task.use_extra_audiodata=False \ 25 | 26 | 27 | 28 | -------------------------------------------------------------------------------- /VATLM/vat_hubert/vathubert/scripts/finetune_vsr/base_vox_finetune433_v.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ngpu=$1 4 | updatefreq=$2 5 | max_tokens=$3 6 | pretrained_model_path=$4 7 | save_path=$5 8 | 9 | python /path/to/fairseq/fairseq_cli/hydra_train.py \ 10 | --config-dir /path/to/vat_hubert/vathubert/conf/finetune --config-name base_vox_433h_v.yaml \ 11 | task.data=/path/to/433h_data_tsv \ 12 | task.label_dir=/path/to/433h_data_tsv \ 13 | task.tokenizer_bpe_model=/path/to/sentencepiece/model \ 14 | task.modalities=["video"] \ 15 | model.w2v_path=${pretrained_model_path} \ 16 | hydra.run.dir=${save_path} \ 17 | common.user_dir=/path/to/vat_hubert/vathubert \ 18 | distributed_training.distributed_world_size=${ngpu} \ 19 | distributed_training.ddp_backend="no_c10d" \ 20 | optimization.update_freq=[${updatefreq}] \ 21 | dataset.max_tokens=${max_tokens} \ 22 | +task.use_supervised_data=False \ 23 | +task.use_extra_textdata=False \ 24 | +task.use_extra_audiodata=False \ 25 | 26 | 27 | 28 | 29 | -------------------------------------------------------------------------------- /VATLM/vat_hubert/vathubert/scripts/finetune_vsr/large_vox_finetune30_v.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ngpu=$1 4 | updatefreq=$2 5 | max_tokens=$3 6 | pretrained_model_path=$4 7 | save_path=$5 8 | 9 | python /path/to/fairseq/fairseq_cli/hydra_train.py \ 10 | --config-dir /path/to/vat_hubert/vathubert/conf/finetune --config-name large_vox_30h_v.yaml \ 11 | task.data=/path/to/30h_data_tsv \ 12 | task.label_dir=/path/to/30h_data_tsv \ 13 | task.tokenizer_bpe_model=/path/to/sentencepiece/model \ 14 | task.modalities=["video"] \ 15 | model.w2v_path=${pretrained_model_path} \ 16 | hydra.run.dir=${save_path} \ 17 | common.user_dir=/path/to/vat_hubert/vathubert \ 18 | distributed_training.distributed_world_size=${ngpu} \ 19 | distributed_training.ddp_backend="no_c10d" \ 20 | optimization.update_freq=[${updatefreq}] \ 21 | dataset.max_tokens=${max_tokens} \ 22 | +task.use_supervised_data=False \ 23 | +task.use_extra_textdata=False \ 24 | +task.use_extra_audiodata=False \ 25 | 26 | 27 | 28 | -------------------------------------------------------------------------------- /VATLM/vat_hubert/vathubert/scripts/finetune_vsr/large_vox_finetune433_v.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ngpu=$1 4 | updatefreq=$2 5 | max_tokens=$3 6 | pretrained_model_path=$4 7 | save_path=$5 8 | 9 | python /path/to/fairseq/fairseq_cli/hydra_train.py \ 10 | --config-dir /path/to/vat_hubert/vathubert/conf/finetune --config-name large_vox_433h_v.yaml \ 11 | task.data=/path/to/433h_data_tsv \ 12 | task.label_dir=/path/to/433h_data_tsv \ 13 | task.tokenizer_bpe_model=/path/to/sentencepiece/model \ 14 | task.modalities=["video"] \ 15 | model.w2v_path=${pretrained_model_path} \ 16 | hydra.run.dir=${save_path} \ 17 | common.user_dir=/path/to/vat_hubert/vathubert \ 18 | distributed_training.distributed_world_size=${ngpu} \ 19 | distributed_training.ddp_backend="no_c10d" \ 20 | optimization.update_freq=[${updatefreq}] \ 21 | dataset.max_tokens=${max_tokens} \ 22 | +task.use_supervised_data=False \ 23 | +task.use_extra_textdata=False \ 24 | +task.use_extra_audiodata=False \ 25 | 26 | 27 | 28 | 29 | -------------------------------------------------------------------------------- /VATLM/vat_hubert/vathubert/scripts/pretrain/base_lsr3_pretrain_iter5.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | ngpu=$1 3 | updatefreq=$2 4 | datapath=/LocalData/vatlm_related/fbankdata 5 | save_path=$3 6 | 7 | python /path/to/fairseq/fairseq_cli/hydra_train.py \ 8 | --config-dir /path/to/vat_hubert/vathubert/conf/pretrain --config-name base_lrs3_iter5.yaml \ 9 | task.data=${datapath}/433pre_lrs3_433h_tsv \ 10 | task.label_dir=${datapath}/433pre_lrs3_433h_tsv \ 11 | +task.sup_data_path=${datapath}/433pre_tedv3_phone_concat_tsv2 \ 12 | +task.sup_manifest=${datapath}/433pre_tedv3_phone_concat_tsv2 \ 13 | +task.onlytext_manifest=${datapath}/433pre_cantab_tsv \ 14 | +task.onlyaudio_manifest=${datapath}/433pre_giga_tsv_km \ 15 | hydra.run.dir=${save_path} \ 16 | common.user_dir=/path/to/vat_hubert/vathubert \ 17 | distributed_training.distributed_world_size=${ngpu} \ 18 | optimization.update_freq=[${updatefreq}] \ 19 | dataset.max_tokens=3000 \ 20 | model.label_rate=25 \ 21 | common.log_interval=200 \ 22 | checkpoint.save_interval=5 \ 23 | +task.sample_distributions=\"0.08,0.1,0.15,0.15\" \ 24 | +criterion.banlance_loss_weights=[1.0,1.0] \ 25 | dataset.data_buffer_size=40 \ 26 | +task.use_supervised_data=True \ 27 | +task.use_extra_textdata=True \ 28 | +task.use_extra_audiodata=True \ 29 | 30 | 31 | -------------------------------------------------------------------------------- /VATLM/vat_hubert/vathubert/scripts/pretrain/base_vox_pretrain_iter5.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | ngpu=$1 3 | updatefreq=$2 4 | datapath=/LocalData/vatlm_related/fbankdata 5 | save_path=$3 6 | 7 | 8 | python /path/to/fairseq/fairseq_cli/hydra_train.py \ 9 | --config-dir /path/to/vat_hubert/vathubert/conf/pretrain --config-name base_vox_iter5.yaml \ 10 | task.data=${datapath}/fbank_lrs3_vox_tsv \ 11 | task.label_dir=${datapath}/fbank_lrs3_vox_tsv \ 12 | +task.sup_data_path=${datapath}/fbank_tedv3_phone_concat_vox_tsv \ 13 | +task.sup_manifest=${datapath}/fbank_tedv3_phone_concat_vox_tsv \ 14 | +task.onlytext_manifest=${datapath}/cantab2_vox_tsv \ 15 | +task.onlyaudio_manifest=${datapath}/fbank_giga_vox_tsv_km \ 16 | hydra.run.dir=${save_path} \ 17 | common.user_dir=/path/to/vat_hubert/vathubert \ 18 | distributed_training.distributed_world_size=${ngpu} \ 19 | optimization.update_freq=[${updatefreq}] \ 20 | dataset.max_tokens=3000 \ 21 | model.label_rate=25 \ 22 | common.log_interval=200 \ 23 | checkpoint.save_interval=5 \ 24 | +task.sample_distributions=\"0.13,0.15,0.32,0.3\" \ 25 | +criterion.banlance_loss_weights=[1.0,1.0] \ 26 | dataset.data_buffer_size=40 \ 27 | +task.use_supervised_data=True \ 28 | +task.use_extra_textdata=True \ 29 | +task.use_extra_audiodata=True \ 30 | 31 | -------------------------------------------------------------------------------- /VATLM/vat_hubert/vathubert/scripts/pretrain/large_vox_pretrain_iter5.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | unset WORLD_SIZE 3 | ngpu=$1 4 | updatefreq=$2 5 | datapath=/LocalData/vatlm_related/fbankdata 6 | save_path=$3 7 | 8 | 9 | python /path/to/fairseq/fairseq_cli/hydra_train.py \ 10 | --config-dir /path/to/vat_hubert/vathubert/conf/pretrain --config-name large_vox_iter5.yaml \ 11 | task.data=${datapath}/fbank_lrs3_vox_tsv \ 12 | task.label_dir=${datapath}/fbank_lrs3_vox_tsv \ 13 | +task.sup_data_path=${datapath}/fbank_tedv3_phone_concat_vox_tsv \ 14 | +task.sup_manifest=${datapath}/fbank_tedv3_phone_concat_vox_tsv \ 15 | 
+task.onlytext_manifest=${datapath}/cantab2_vox_tsv \ 16 | +task.onlyaudio_manifest=${datapath}/fbank_giga_vox_tsv_km \ 17 | hydra.run.dir=${save_path} \ 18 | common.user_dir=/path/to/vat_hubert/vathubert \ 19 | distributed_training.distributed_world_size=${ngpu} \ 20 | optimization.update_freq=[${updatefreq}] \ 21 | dataset.max_tokens=3000 \ 22 | model.label_rate=25 \ 23 | common.log_interval=200 \ 24 | checkpoint.save_interval=5 \ 25 | +task.sample_distributions=\"0.13,0.15,0.32,0.3\" \ 26 | +criterion.banlance_loss_weights=[1.0,1.0] \ 27 | dataset.data_buffer_size=40 \ 28 | +task.use_supervised_data=True \ 29 | +task.use_extra_textdata=True \ 30 | +task.use_extra_audiodata=True \ 31 | 32 | -------------------------------------------------------------------------------- /WavLLM/download/download.sh: -------------------------------------------------------------------------------- 1 | stage=$1 2 | # WavLLM model 3 | if [ "$stage" -eq 0 ]; then 4 | url_p1="https://valle.blob.core.windows.net/share/wavllm/fi" 5 | url_p2="nal.pt?sv=2021-10-04&st=2024-04-24T04%3A50%3A" 6 | url_p3="15Z&se=2025-04-25T04%3A50%3A00Z&sr=b&sp=r&si" 7 | url_p4="g=M82edjKinydPiVd86oS78ZS9L" 8 | url_p5="TVxg0%2F2om3IaEkodIo%3D" 9 | curl -o final.pt ${url_p1}${url_p2}${url_p3}${url_p4}${url_p5} 10 | else 11 | # gaokao_audio 12 | url_p1="https://valle.blob.core.windows.net/share/wavllm/ga" 13 | url_p2="okao_audio.zip?sv=2021-10-04&st=2024-04-24T04%3A58%3A" 14 | url_p3="56Z&se=2025-04-25T04%3A58%3A00Z&sr=b&sp=r&s" 15 | url_p4="ig=0ql1dkz59%2FSxRHkz1ajtC" 16 | url_p5="yfCR5Hva4UISlIfDrOO%2BRc%3D" 17 | curl -o gaokao_audio.zip ${url_p1}${url_p2}${url_p3}${url_p4}${url_p5} 18 | 19 | # gaokao_transcript 20 | url_p1="https://valle.blob.core.windows.net/share/wavllm/ga" 21 | url_p2="okao_text.zip?sv=2021-10-04&st=2024-04-24T04%3A57%3A" 22 | url_p3="37Z&se=2025-04-25T04%3A57%3A00Z&sr=b&sp=r&s" 23 | url_p4="ig=n5QKXU3F9RiP6SxHl6uVEJ" 24 | url_p5="8m7WZ3iEeOGns1BoIozvI%3D" 25 | curl -o gaokao_text.zip ${url_p1}${url_p2}${url_p3}${url_p4}${url_p5} 26 | fi -------------------------------------------------------------------------------- /WavLLM/wavllm/__init__.py: -------------------------------------------------------------------------------- 1 | from . import criterions 2 | -------------------------------------------------------------------------------- /WavLLM/wavllm/data/tokenizer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 
5 | 6 | import csv 7 | import io 8 | import logging 9 | import re 10 | from collections import defaultdict 11 | from pathlib import Path 12 | from typing import Dict, List, Optional 13 | from dataclasses import dataclass 14 | 15 | import os 16 | from sentencepiece import SentencePieceProcessor 17 | from copy import deepcopy 18 | 19 | logger = logging.getLogger(__name__) 20 | 21 | 22 | 23 | class Tokenizer: 24 | def __init__(self, model_path: str): 25 | # reload tokenizer 26 | assert os.path.isfile(model_path), model_path 27 | self.sp_model = SentencePieceProcessor(model_file=model_path) 28 | logger.info(f"Reloaded SentencePiece model from {model_path}") 29 | 30 | # BOS / EOS token IDs 31 | self.n_words: int = self.sp_model.vocab_size() 32 | self.bos_id: int = self.sp_model.bos_id() 33 | self.eos_id: int = self.sp_model.eos_id() 34 | self.pad_id: int = self.sp_model.pad_id() 35 | self.unk_id: int = self.sp_model.unk_id() 36 | logger.info(f"#words: {self.n_words} - BOS ID: {self.bos_id} - EOS ID: {self.eos_id} - PAD ID: {self.pad_id} - UNK ID: {self.unk_id}") 37 | assert self.sp_model.vocab_size() == self.sp_model.get_piece_size() 38 | 39 | def encode(self, s: str, bos: bool, eos: bool) -> List[int]: 40 | assert type(s) is str 41 | t = self.sp_model.encode(s) 42 | if bos: 43 | t = [self.bos_id] + t 44 | if eos: 45 | t = t + [self.eos_id] 46 | return t 47 | 48 | def decode(self, t: List[int]) -> str: 49 | return self.sp_model.decode(t) -------------------------------------------------------------------------------- /WavLLM/wavllm/requirements.txt: -------------------------------------------------------------------------------- 1 | fairscale==0.4.13 2 | fairseq==0.12.2 3 | numpy==1.24.3 4 | omegaconf==2.0.6 5 | sentencepiece==0.1.99 6 | torch==2.0.1 7 | transformers==4.32.1 8 | -------------------------------------------------------------------------------- /WavLLM/wavllm/scripts/inference_sft.sh: -------------------------------------------------------------------------------- 1 | export CUDA_VISIBLE_DEVICES=0 2 | export HYDRA_FULL_ERROR=1 3 | export PYTHONPATH=$$PYTHONPATH:${PWD} 4 | 5 | model_path=$1 6 | [ -z $model_path ] && model_path="?" 7 | 8 | src_dir=${model_path%/*} 9 | cpt=${model_path##*/} 10 | cpt=${cpt%.*} 11 | 12 | gen_set=$2 13 | [ -z $gen_set ] && gen_set="?" 14 | [ -z $beam_size ] && beam_size=1 15 | 16 | 17 | FAIRSEQ_ROOT=${PWD} 18 | DATA_DIR=$FAIRSEQ_ROOT/examples/wavllm/test_data 19 | 20 | for subset in $gen_set; do 21 | results_path=$src_dir/decode_${cpt}_beam${beam_size}/${subset} 22 | [ ! 
-d $results_path ] && mkdir -p $results_path 23 | 24 | python $FAIRSEQ_ROOT/examples/wavllm/inference/generate.py $DATA_DIR \ 25 | --user-dir examples/wavllm \ 26 | --tokenizer-path $FAIRSEQ_ROOT/examples/wavllm/tokenizer/tokenizer.model \ 27 | --gen-subset ${subset} \ 28 | \ 29 | --task speechllm_task \ 30 | \ 31 | --path ${model_path} \ 32 | --results-path $results_path \ 33 | \ 34 | --scoring wer \ 35 | --skip-invalid-size-inputs-valid-test \ 36 | --max-tokens 1600000 \ 37 | --sampling --beam 1 --nbest 1 --temperature 0.5 \ 38 | --max-len-a 0 --max-len-b 512 39 | done -------------------------------------------------------------------------------- /WavLLM/wavllm/test_data/CoT-task-story.tsv: -------------------------------------------------------------------------------- 1 | id audio n_frames prompt tgt_text with_speech orig_story 2 | 0 SpeechT5/WavLLM/fairseq/examples/wavllm/test_data/audio/CoT-task-story.wav 1079348 First of all, transcribe the audio recording into text, capturing every spoken word; Additionally given this audio clip and text, can you condense it into a clear, concise summary, no more than 20 words?; Lastly disregarding the sound, translate this English summary into German. Bis zum Jahr 2500 ist die Erde eine umweltfreundliche Utopie mit fortschrittlicher KI, neuronaler Vernetzung und einer perfekten Mischung aus Technologie und Natur. True In the year 2500, Earth gleamed like a sapphire, a futuristic utopia where harmony reigned. Skyscrapers, draped in lush greenery, stretched towards the heavens, their glass surfaces reflecting the tranquil azure of a pollution-free sky. Humanity had transcended past conflicts, embracing an era of shared consciousness through neural connectivity. Autonomous vehicles glided silently on solar pathways, while people mingled in serene communal spaces, their basic needs met by advanced AI that predicted and catered to their every whim. The Great Reconciliation had merged technology with nature, and in this new world, every individual thrived, their potential limited only by the expanses of their own creativity. The utopia wasn't just a place; it was the pulse of civilization, beating in perfect rhythm with the universe. -------------------------------------------------------------------------------- /WavLLM/wavllm/test_data/CoT-task.tsv: -------------------------------------------------------------------------------- 1 | id audio n_frames prompt with_speech tgt_text 2 | 0 SpeechT5/WavLLM/fairseq/examples/wavllm/test_data/audio/CoT-task.wav 214437 First of all, transcribe the audio recording into text, capturing every spoken word; Additionally given this audio clip and text, can you condense it into a clear, concise summary, no more than 20 words?; Lastly disregarding the sound, translate this English summary into German. True Drei Filme aus dem asiatisch-pazifischen Raum im Rennen in Cannes -------------------------------------------------------------------------------- /WavLLM/wavllm/test_data/II-task.tsv: -------------------------------------------------------------------------------- 1 | id audio n_frames with_speech prompt tgt_text 2 | 0 SpeechT5/WavLLM/fairseq/examples/wavllm/test_data/audio/II-task.wav 111111 True To begin, Transcribe the audio recording into text, capturing every spoken word; Subsequently, How does the woman finally decide to go home? A. By bus; B. In the man’s car; C. 
In her father’s car.; Furthermore, ignore the audio clip, What is the capital of New Zealand?; Lastly, Continue the narrative of given audio clip in a coherent and engaging way ASR+SQA+SFT+Continue -------------------------------------------------------------------------------- /WavLLM/wavllm/test_data/SQA.tsv: -------------------------------------------------------------------------------- 1 | id audio n_frames prompt tgt_text with_speech 2 | 0 SpeechT5/WavLLM/fairseq/examples/wavllm/test_data/audio/sqa.wav 111111 What will the man do next? A. Start to take exercise; B. Do as he always does; C. Change his working time. A True 3 | -------------------------------------------------------------------------------- /WavLLM/wavllm/test_data/SQQA.tsv: -------------------------------------------------------------------------------- 1 | id audio n_frames prompt tgt_text with_speech 2 | 0 SpeechT5/WavLLM/fairseq/examples/wavllm/test_data/audio/sqqa.wav 182574 The fundamental theorem of calculus is a theorem that links the concept of the derivative of a function with the concept of the integral . True -------------------------------------------------------------------------------- /WavLLM/wavllm/test_data/asr.tsv: -------------------------------------------------------------------------------- 1 | id audio n_frames prompt tgt_text with_speech 2 | 0 SpeechT5/WavLLM/fairseq/examples/wavllm/test_data/audio/asr.flac 166960 Based on the attached audio, generate a comprehensive text transcription of the spoken content. he hoped there would be stew for dinner turnips and carrots and bruised potatoes and fat mutton pieces to be ladled out in thick peppered flour fattened sauce True 3 | -------------------------------------------------------------------------------- /WavLLM/wavllm/test_data/audio/CoT-task-story.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/SpeechT5/5d66cf5f37e97f4a1999ad519537decc16d852af/WavLLM/wavllm/test_data/audio/CoT-task-story.wav -------------------------------------------------------------------------------- /WavLLM/wavllm/test_data/audio/CoT-task.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/SpeechT5/5d66cf5f37e97f4a1999ad519537decc16d852af/WavLLM/wavllm/test_data/audio/CoT-task.wav -------------------------------------------------------------------------------- /WavLLM/wavllm/test_data/audio/II-task.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/SpeechT5/5d66cf5f37e97f4a1999ad519537decc16d852af/WavLLM/wavllm/test_data/audio/II-task.wav -------------------------------------------------------------------------------- /WavLLM/wavllm/test_data/audio/asr.flac: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/SpeechT5/5d66cf5f37e97f4a1999ad519537decc16d852af/WavLLM/wavllm/test_data/audio/asr.flac -------------------------------------------------------------------------------- /WavLLM/wavllm/test_data/audio/emo.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/SpeechT5/5d66cf5f37e97f4a1999ad519537decc16d852af/WavLLM/wavllm/test_data/audio/emo.wav -------------------------------------------------------------------------------- /WavLLM/wavllm/test_data/audio/sqa.wav: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/SpeechT5/5d66cf5f37e97f4a1999ad519537decc16d852af/WavLLM/wavllm/test_data/audio/sqa.wav -------------------------------------------------------------------------------- /WavLLM/wavllm/test_data/audio/sqqa.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/SpeechT5/5d66cf5f37e97f4a1999ad519537decc16d852af/WavLLM/wavllm/test_data/audio/sqqa.wav -------------------------------------------------------------------------------- /WavLLM/wavllm/test_data/audio/st.flac: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/SpeechT5/5d66cf5f37e97f4a1999ad519537decc16d852af/WavLLM/wavllm/test_data/audio/st.flac -------------------------------------------------------------------------------- /WavLLM/wavllm/test_data/audio/sv.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/SpeechT5/5d66cf5f37e97f4a1999ad519537decc16d852af/WavLLM/wavllm/test_data/audio/sv.wav -------------------------------------------------------------------------------- /WavLLM/wavllm/test_data/dict.txt: -------------------------------------------------------------------------------- 1 | 1 1 2 | 2 2 3 | 3 3 4 | 4 4 5 | 5 5 6 | -------------------------------------------------------------------------------- /WavLLM/wavllm/test_data/emo.tsv: -------------------------------------------------------------------------------- 1 | id audio n_frames prompt tgt_text with_speech 2 | 0 SpeechT5/WavLLM/fairseq/examples/wavllm/test_data/audio/emo.wav 12345 Can you describe the emotional condition of the speaker in the provided audio clip? sad True 3 | -------------------------------------------------------------------------------- /WavLLM/wavllm/test_data/en2de.tsv: -------------------------------------------------------------------------------- 1 | id audio n_frames tgt_text prompt with_speech 2 | 0 SpeechT5/WavLLM/fairseq/examples/wavllm/test_data/audio/st.flac 34560 Sie wird schon in Ordnung sein. Translate the audio clip into German. True 3 | -------------------------------------------------------------------------------- /WavLLM/wavllm/test_data/sv.tsv: -------------------------------------------------------------------------------- 1 | id audio n_frames prompt tgt_text with_speech 2 | 0 SpeechT5/WavLLM/fairseq/examples/wavllm/test_data/audio/sv.wav 351362 Is there only one speaker in the audio clip? 
Yes True 3 | -------------------------------------------------------------------------------- /WavLLM/wavllm/tokenizer/tokenizer.model: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/SpeechT5/5d66cf5f37e97f4a1999ad519537decc16d852af/WavLLM/wavllm/tokenizer/tokenizer.model -------------------------------------------------------------------------------- /YiTrans/.gitignore: -------------------------------------------------------------------------------- 1 | **/__pycache__ 2 | 3 | -------------------------------------------------------------------------------- /YiTrans/exp_scripts/finetune_ASR/finetune_hubert24_mbart24_en.sh: -------------------------------------------------------------------------------- 1 | world_size=$1 2 | update_freq=$2 3 | [ -z $world_size ] && world_size=8 4 | [ -z $update_freq ] && update_freq=8 5 | 6 | EXP_NAME=train_iwslt_asr_hubert24_mbart24_norel 7 | SAVE_DIR=${HOME}/data/iwslt/asr_v3/${EXP_NAME} 8 | 9 | DATA_ROOT=${HOME}/dataset/iwslt_mustc 10 | LABEL_DIR=${DATA_ROOT}/fine-tune_en_bpe250k 11 | SP_PATH=${LABEL_DIR}/sentence.bpe.model 12 | retain_dict=${LABEL_DIR}/index_en_onlyMUSTC 13 | W2V_PATH=${HOME}/dataset/iwslt_mustc/pretrain_ed_model_cfg.pt 14 | 15 | TRAIN_SUBSET=train_asr_MUSTC 16 | VALID_SUBSET=dev_asr_MUSTC 17 | 18 | 19 | mbart_path="/mnt/default/v-junyiao/released_exsp/mbart50.pretrained/model.pt" 20 | hubert_path="/mnt/default/v-junyiao/speechexp/fairseq_mlst/hubert_large_librivox_released/checkpoint_last.pt" 21 | 22 | CODE_ROOT=${HOME}/code/SpeechT5/YiTrans 23 | 24 | python $CODE_ROOT/fairseq/fairseq_cli/hydra_train.py \ 25 | --config-dir $CODE_ROOT/yitrans_iwslt22/config/finetune_asr \ 26 | --config-name large_mustc \ 27 | common.user_dir=$CODE_ROOT/yitrans_iwslt22 \ 28 | distributed_training.distributed_world_size=$world_size \ 29 | optimization.update_freq=[$update_freq] \ 30 | \ 31 | dataset.max_tokens=400001 \ 32 | dataset.num_workers=0 \ 33 | optimization.max_update=120000 \ 34 | \ 35 | task._name="iwslt_joint_pretraining" \ 36 | task.data=${DATA_ROOT} \ 37 | task.label_dir=${LABEL_DIR} \ 38 | +task.store_labels=True \ 39 | task.hubert_tokenizer="sentencepiece" \ 40 | task.sp_path=${SP_PATH} \ 41 | task.max_keep_size=400000 \ 42 | criterion.dec_weight=0.5 \ 43 | \ 44 | model._name="yitrans_asr" \ 45 | model.w2v_path=${W2V_PATH} \ 46 | +model.reuse_text_emb=true \ 47 | +model.share_ctc_decoder_embed=true \ 48 | +model.retain_dict_path=${retain_dict} \ 49 | model.freeze_finetune_updates=15000 \ 50 | \ 51 | +model.no_pretrained_weights=true \ 52 | +model.use_rel_pos_enc=false \ 53 | +model.encoder_layers=24 \ 54 | +model.add_text_encoder=true \ 55 | +model.share_s2t_t2t_embeddings=false \ 56 | +model.share_enc_dec_embeddings=false \ 57 | +model.add_adaptor=false \ 58 | +model.load_pretrained_w2v_from=$hubert_path \ 59 | +model.load_pretrained_mbart_from=$mbart_path \ 60 | \ 61 | dataset.train_subset=${TRAIN_SUBSET} \ 62 | dataset.valid_subset=${VALID_SUBSET} \ 63 | checkpoint.save_dir=${SAVE_DIR} \ 64 | common.tensorboard_logdir=${SAVE_DIR} \ 65 | hydra.run.dir=${SAVE_DIR} \ 66 | hydra.job.name=${EXP_NAME} 67 | 68 | -------------------------------------------------------------------------------- /YiTrans/exp_scripts/pretrain/pretrain_pt36_adaptor_step1.sh: -------------------------------------------------------------------------------- 1 | export HYDRA_FULL_ERROR=1 2 | YiTrans=/home/v-ziqzhang/Code/SpeechT5/YiTrans 3 | DATA_DIR=/mnt/default/lozhou/speechdata/hubert_data 4 
| LABEL_DIR=${DATA_DIR}/layer9_k500_label 5 | SP_PATH=${LABEL_DIR}/spm_unigram8000.model 6 | TEXT_DATA_DIR=/mnt/default/lozhou/speechdata/text_data/v3/bin_idx_step1 7 | EXP_NAME=pretrain_pt36_addadaptor_bpecode_large_step1 8 | SAVE_DIR=${HOME}/data/speechexp/${EXP_NAME} 9 | W2V_PATH=${HOME}/data/speechexp/hubert_large_librivox_released/checkpoint_last.pt 10 | MBART_PATH=${HOME}/data/speechexp/mbart50.pretrained/model.pt 11 | 12 | python ${YiTrans}/fairseq/fairseq_cli/hydra_train.py \ 13 | --config-dir ${YiTrans}/yitrans_iwslt22/config/pretrain \ 14 | --config-name joint_large \ 15 | common.user_dir=${YiTrans}/yitrans_iwslt22 \ 16 | \ 17 | task.data=$DATA_DIR \ 18 | task.labels='["km"]' \ 19 | task.label_dir=$LABEL_DIR \ 20 | task.text_cfg.text_data=$TEXT_DATA_DIR \ 21 | +task.hubert_tokenizer="sentencepiece" \ 22 | +task.sp_path=${SP_PATH} \ 23 | \ 24 | model.label_rate=50 \ 25 | model.encoder_layers=12 \ 26 | +model.load_pretrained_w2v_from=${W2V_PATH} \ 27 | +model.load_pretrained_mbart_from=${MBART_PATH} \ 28 | \ 29 | dataset.train_subset=\"train_LS,train_MUSTC+mono_deduped_filt_sort.en_XX.en_XX,mt8corpus_filt_slct.en_XX-de_DE\" \ 30 | dataset.valid_subset=\"dev_MUSTC+valid.en_XX-de_DE,dev_MUSTC+valid.en_XX-ja_XX,dev_MUSTC+valid.en_XX-zh_CN,dev_MUSTC+dev4x.en_XX.en_XX\" \ 31 | dataset.max_tokens=300000 \ 32 | \ 33 | distributed_training.distributed_world_size=8 \ 34 | distributed_training.nprocs_per_node=8 \ 35 | optimization.update_freq=[2] \ 36 | \ 37 | common.tensorboard_logdir=$SAVE_DIR \ 38 | checkpoint.save_dir=$SAVE_DIR \ 39 | hydra.run.dir=$SAVE_DIR \ 40 | hydra.job.name=$EXP_NAME \ 41 | checkpoint.reset_optimizer=true \ 42 | checkpoint.reset_dataloader=true 43 | 44 | 45 | 46 | # dataset.train_subset=\"train_CV,train_EUR,train_LS,train_MUSTC,train_TEDLIUM,train_VP+mono_deduped_filt_sort.en_XX.en_XX,mt8corpus_filt_slct.en_XX-de_DE,mt8corpus_filt_slct.en_XX-ja_XX,mt8corpus_filt_slct.en_XX-zh_CN\" \ 47 | -------------------------------------------------------------------------------- /YiTrans/exp_scripts/pretrain/pretrain_pt36_adaptor_step2.sh: -------------------------------------------------------------------------------- 1 | EXP_NAME=train_speech_text_joint_adaptor_large_step2_300k 2 | SAVE_DIR=/datablob/users/v-junyiao/speechexp/fairseq_mlst/${EXP_NAME} 3 | DATA_ROOT=/datablob/users/v-junyiao/speechdata/hubert_mlst 4 | LABEL_DIR=${DATA_ROOT}/fine-tune_en_bpe250k_full 5 | W2V_PATH=/mnt/default/v-junyiao/speechexp/train_speech_text_joint_addadaptor_bpecode_large_step1_mbartpt_400k/checkpoint_last_up.pt 6 | TEXT_DATA_DIR=/datablob/users/v-junyiao/speechdata/text_data/v4/bin-idx 7 | SP_PATH=${LABEL_DIR}/sentence.bpe.model 8 | # export CUDA_VISIBLE_DEVICES=1 9 | python fairseq_cli/hydra_train.py \ 10 | --config-dir examples/hubert/config/pretrain \ 11 | --config-name pretrain_step2 \ 12 | distributed_training.distributed_world_size=64 \ 13 | distributed_training.nprocs_per_node=8 \ 14 | \ 15 | dataset.train_subset=\"train_COVOST,train_asr_VP,train_punc_TEDLIUM,train_asr_MUSTC,train_punc_LS,train_asr_EUR+covost2.en_XX-ja_XX,covost2.en_XX-zh_CN,covost_eurST.en_XX-de_DE,mt8corpus_domain45.en_XX-ja_XX,mt8corpus_filt_slct80_domain44.en_XX-de_DE,mt8corpus_filt_slct80_domain40.en_XX-zh_CN,train.en_XX-de_DE,train.en_XX-ja_XX,train.en_XX-zh_CN\" \ 16 | dataset.valid_subset=\"dev_asr_MUSTC+valid.en_XX-de_DE,dev_asr_MUSTC+valid.en_XX-ja_XX,dev_asr_MUSTC+valid.en_XX-zh_CN\" \ 17 | dataset.max_tokens=480001 \ 18 | dataset.num_workers=0 \ 19 | optimization.update_freq=[1] \ 20 | 
optimization.max_update=300000 \ 21 | \ 22 | task.hubert_tokenizer="sentencepiece" \ 23 | task.sp_path=${SP_PATH} \ 24 | task.max_keep_size=480000 \ 25 | +task.split_modality_batch=true \ 26 | +task.speech_tgt_lang="en_XX" \ 27 | +task.mbart_style_lang_id=true \ 28 | +task.text_sampling_alpha=1.0 \ 29 | +task.store_labels=true \ 30 | model.freeze_finetune_updates=15000 \ 31 | criterion.dec_weight=0.5 \ 32 | +model.reuse_text_emb=true \ 33 | +model.share_ctc_decoder_embed=true \ 34 | +model.share_speech_text_embeddings=true \ 35 | \ 36 | task.data=${DATA_ROOT} \ 37 | task.label_dir=${LABEL_DIR} \ 38 | task.text_cfg.text_data=${TEXT_DATA_DIR} \ 39 | model.w2v_path=${W2V_PATH} \ 40 | checkpoint.save_dir=${SAVE_DIR} \ 41 | common.tensorboard_logdir=${SAVE_DIR} \ 42 | hydra.run.dir=${SAVE_DIR} \ 43 | hydra.job.name=${EXP_NAME} 44 | 45 | sleep infinity 46 | -------------------------------------------------------------------------------- /YiTrans/yitrans_iwslt22/__init__.py: -------------------------------------------------------------------------------- 1 | from . import data, tasks, criterions, models 2 | -------------------------------------------------------------------------------- /YiTrans/yitrans_iwslt22/config/finetune_mt/mt_translation.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | 3 | common: 4 | fp16: true 5 | log_format: json 6 | log_interval: 200 7 | tensorboard_logdir: tblog 8 | seed: 1337 9 | 10 | checkpoint: 11 | save_interval: 1000000 12 | keep_last_epochs: 5 13 | save_interval_updates: 10000 14 | keep_interval_updates_pattern: 20000 15 | keep_interval_updates: 5 16 | keep_best_checkpoints: 5 17 | best_checkpoint_metric: accuracy 18 | maximize_best_checkpoint_metric: true 19 | 20 | distributed_training: 21 | ddp_backend: legacy_ddp 22 | find_unused_parameters: true 23 | distributed_world_size: -1 24 | nprocs_per_node: 8 25 | 26 | 27 | criterion: 28 | _name: "label_smoothed_cross_entropy" 29 | label_smoothing: 0.2 30 | report_accuracy: true 31 | 32 | 33 | task: 34 | _name: "iwslt_translation_from_pretrained" 35 | 36 | dataset: 37 | num_workers: 6 38 | max_tokens: 3200000 39 | skip_invalid_size_inputs_valid_test: true 40 | validate_after_updates: ${model.freeze_finetune_updates} 41 | validate_interval: ${checkpoint.save_interval} 42 | validate_interval_updates: ${checkpoint.save_interval_updates} 43 | train_subset: train_100 44 | valid_subset: dev_other 45 | required_batch_size_multiple: 1 46 | 47 | optimizer: 48 | _name: adam 49 | adam_betas: (0.9,0.98) 50 | adam_eps: 1e-06 51 | weight_decay: 0.0 52 | 53 | lr_scheduler: 54 | lr: [0.0001] 55 | _name: polynomial_decay 56 | warmup_updates: 5000 57 | total_num_update: 200000 58 | 59 | model: 60 | _name: finetune_mt 61 | w2v_path: ??? 62 | apply_mask: true 63 | mask_prob: 0.65 64 | mask_channel_prob: 0.5 65 | mask_channel_length: 64 66 | layerdrop: 0.1 67 | decoder_layerdrop: 0.1 68 | activation_dropout: 0.1 69 | feature_grad_mult: 0.0 70 | freeze_finetune_updates: 0 71 | 72 | hydra: 73 | job: 74 | config: 75 | override_dirname: 76 | kv_sep: '-' 77 | item_sep: '__' 78 | exclude_keys: 79 | - run 80 | - task.data 81 | - task.label_dir 82 | - model.w2v_path 83 | - dataset.train_subset 84 | - dataset.valid_subset 85 | run: 86 | dir: ??? 87 | sweep: 88 | dir: ??? 
89 | subdir: ${hydra.job.config_name}__${hydra.job.override_dirname} 90 | -------------------------------------------------------------------------------- /YiTrans/yitrans_iwslt22/criterions/__init__.py: -------------------------------------------------------------------------------- 1 | import importlib 2 | import os 3 | 4 | for file in os.listdir(os.path.dirname(__file__)): 5 | if file.endswith(".py") and not file.startswith("_"): 6 | criterion_name = file[: file.find(".py")] 7 | importlib.import_module( 8 | "yitrans_iwslt22.criterions." + criterion_name 9 | ) 10 | -------------------------------------------------------------------------------- /YiTrans/yitrans_iwslt22/data/lang_pair_mask_dataset.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # The YiTrans End-to-End Speech Translation System for IWSLT 2022 Offline Shared Task (https://arxiv.org/abs/2206.05777) 3 | # Github source: https://github.com/microsoft/SpeechT5/tree/main/YiTrans 4 | # Copyright (c) 2022 Microsoft 5 | # Licensed under The MIT License [see LICENSE for details] 6 | # Based on fairseq code bases 7 | # https://github.com/facebookresearch/fairseq 8 | # -------------------------------------------------------- 9 | 10 | """ 11 | Modified from https://github.com/facebookresearch/fairseq/blob/main/fairseq/data/audio/multi_modality_dataset.py 12 | """ 13 | 14 | 15 | from typing import Optional 16 | 17 | import numpy as np 18 | import torch 19 | from fairseq.data import ( 20 | LanguagePairDataset, 21 | ) 22 | from fairseq.data.audio.multi_modality_dataset import LangPairMaskDataset as FairseqLangPairMaskDataset 23 | 24 | class LangPairMaskDataset(FairseqLangPairMaskDataset): 25 | def __init__( 26 | self, 27 | dataset: LanguagePairDataset, 28 | src_eos: int, 29 | src_bos: Optional[int] = None, 30 | noise_id: Optional[int] = -1, 31 | mask_ratio: Optional[float] = 0, 32 | mask_type: Optional[str] = "random", 33 | ): 34 | super().__init__( 35 | dataset, 36 | src_eos, 37 | src_bos, 38 | noise_id, 39 | mask_ratio, 40 | mask_type, 41 | ) 42 | def mask_src_tokens(self, sample): 43 | src_item = sample["source"] 44 | mask = None 45 | if self.mask_type == "random": 46 | mask = torch.rand(len(src_item)).le(self.mask_ratio) 47 | else: 48 | mask = torch.ones(len(src_item)) 49 | mask[: int(len(src_item) * (1 - self.mask_ratio))] = 0 50 | mask = mask.eq(1) 51 | if src_item[0] == self.src_bos: 52 | mask[0] = False 53 | if src_item[-1] == self.src_eos: 54 | mask[-1] = False 55 | mask_src_item = src_item.masked_fill(mask, self.noise_id) 56 | smp = sample 57 | smp["source"] = mask_src_item 58 | return smp 59 | 60 | def collater(self, samples, pad_to_length=None): 61 | return self.dataset.collater(samples, pad_to_length=pad_to_length) 62 | 63 | -------------------------------------------------------------------------------- /YiTrans/yitrans_iwslt22/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/SpeechT5/5d66cf5f37e97f4a1999ad519537decc16d852af/YiTrans/yitrans_iwslt22/models/__init__.py -------------------------------------------------------------------------------- /YiTrans/yitrans_iwslt22/modules/__init__.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # The YiTrans End-to-End Speech Translation System for IWSLT 2022 Offline Shared Task 
(https://arxiv.org/abs/2206.05777) 3 | # Github source: https://github.com/microsoft/SpeechT5/tree/main/YiTrans 4 | # Copyright (c) 2022 Microsoft 5 | # Licensed under The MIT License [see LICENSE for details] 6 | # Based on fairseq code bases 7 | # https://github.com/facebookresearch/fairseq 8 | # -------------------------------------------------------- 9 | 10 | from .multihead_attention import MultiheadAttention 11 | from .relative_pos_enc import RelativePositionalEncoding 12 | from .transformer_decoder_layer import TransformerDecoderLayerBase 13 | from .w2v_encoder import TransformerEncoder, TransformerSentenceEncoderLayer 14 | from .multimodal_transformer_decoder import MultimodalTransformerDecoder 15 | 16 | __all__ = [ 17 | "MultiheadAttention", 18 | "RelativePositionalEncoding", 19 | "TransformerDecoderLayerBase", 20 | "TransformerEncoder", 21 | "TransformerSentenceEncoderLayer", 22 | "MultimodalTransformerDecoder", 23 | ] 24 | -------------------------------------------------------------------------------- /YiTrans/yitrans_iwslt22/modules/relative_pos_enc.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Pre-Training Transformer Decoder for End-to-End ASR Model with Unpaired Speech Data (https://arxiv.org/abs/2203.17113) 3 | # Github source: https://github.com/microsoft/SpeechT5/tree/main/Speech2C 4 | # Copyright (c) 2022 Microsoft 5 | # Licensed under The MIT License [see LICENSE for details] 6 | # Based on fairseq code bases 7 | # https://github.com/pytorch/fairseq 8 | # -------------------------------------------------------- 9 | 10 | import torch 11 | 12 | class RelativePositionalEncoding(torch.nn.Module): 13 | def __init__(self, d_model, maxlen=1000, embed_v=False): 14 | super(RelativePositionalEncoding, self).__init__() 15 | 16 | self.d_model = d_model 17 | self.maxlen = maxlen 18 | self.pe_k = torch.nn.Embedding(2*maxlen, d_model) 19 | if embed_v: 20 | self.pe_v = torch.nn.Embedding(2*maxlen, d_model) 21 | self.embed_v = embed_v 22 | 23 | 24 | def forward(self, pos_seq, incremental_state=None): 25 | pos_seq[pos_seq < -self.maxlen] = -self.maxlen 26 | pos_seq[pos_seq >= self.maxlen] = self.maxlen - 1 27 | pos_seq = pos_seq + self.maxlen 28 | 29 | if incremental_state is not None: 30 | pos_seq = pos_seq[-1:] 31 | 32 | if self.embed_v: 33 | return self.pe_k(pos_seq), self.pe_v(pos_seq) 34 | else: 35 | return self.pe_k(pos_seq), None 36 | --------------------------------------------------------------------------------
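The `RelativePositionalEncoding` module above embeds signed relative offsets rather than absolute positions: offsets are clamped to `[-maxlen, maxlen - 1]`, shifted into the non-negative range, and looked up in an embedding table of size `2 * maxlen`. A minimal usage sketch follows; the import path, `d_model`, `maxlen`, and sequence length are illustrative assumptions, not values taken from the YiTrans configs.

```python
import torch

# Hypothetical import path for illustration; adjust to wherever the module lives in your checkout.
from yitrans_iwslt22.modules.relative_pos_enc import RelativePositionalEncoding

pos_emb = RelativePositionalEncoding(d_model=768, maxlen=160, embed_v=False)

T = 12                                 # toy sequence length
idx = torch.arange(T)
rel_pos = idx[:, None] - idx[None, :]  # (T, T) signed offsets i - j; forward() clamps them to the table range

pos_k, pos_v = pos_emb(rel_pos)
print(pos_k.shape)                     # torch.Size([12, 12, 768]); pos_v is None because embed_v=False
```

A caller such as a relative-attention encoder layer would typically fold `pos_k` into the attention score computation as a position-dependent bias; constructing the module with `embed_v=True` additionally returns a value-side embedding for the same offsets.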