├── .gitignore ├── AudioCodec ├── MimiCodec │ ├── config │ │ └── mimi24k.yaml │ ├── dataloaders │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-312.pyc │ │ │ └── base_dataloader.cpython-312.pyc │ │ └── base_dataloader.py │ ├── get_scp.py │ ├── inference.py │ ├── losses │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-310.pyc │ │ │ ├── __init__.cpython-312.pyc │ │ │ ├── __init__.cpython-37.pyc │ │ │ ├── __init__.cpython-38.pyc │ │ │ ├── __init__.cpython-39.pyc │ │ │ ├── basic_loss.cpython-312.pyc │ │ │ ├── basic_loss.cpython-37.pyc │ │ │ ├── basic_loss.cpython-38.pyc │ │ │ ├── basic_loss.cpython-39.pyc │ │ │ ├── discriminator_loss.cpython-312.pyc │ │ │ ├── discriminator_loss.cpython-37.pyc │ │ │ ├── discriminator_loss.cpython-38.pyc │ │ │ ├── discriminator_loss.cpython-39.pyc │ │ │ ├── enh_loss.cpython-312.pyc │ │ │ ├── enh_loss.cpython-38.pyc │ │ │ ├── enh_loss.cpython-39.pyc │ │ │ ├── generator_loss.cpython-310.pyc │ │ │ ├── generator_loss.cpython-312.pyc │ │ │ ├── generator_loss.cpython-37.pyc │ │ │ ├── generator_loss.cpython-38.pyc │ │ │ └── generator_loss.cpython-39.pyc │ │ ├── basic_loss.py │ │ ├── discriminator_loss.py │ │ ├── enh_loss.py │ │ └── generator_loss.py │ ├── models │ │ ├── MimiCodec.py │ │ ├── __init__.py │ │ └── __pycache__ │ │ │ ├── MimiCodec.cpython-312.pyc │ │ │ ├── MimiCodec.cpython-38.pyc │ │ │ ├── __init__.cpython-312.pyc │ │ │ └── __init__.cpython-38.pyc │ ├── modules │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-310.pyc │ │ │ ├── __init__.cpython-312.pyc │ │ │ ├── __init__.cpython-37.pyc │ │ │ ├── __init__.cpython-38.pyc │ │ │ ├── __init__.cpython-39.pyc │ │ │ ├── conv.cpython-310.pyc │ │ │ ├── conv.cpython-312.pyc │ │ │ ├── conv.cpython-37.pyc │ │ │ ├── conv.cpython-38.pyc │ │ │ ├── conv.cpython-39.pyc │ │ │ ├── gating.cpython-312.pyc │ │ │ ├── lstm.cpython-310.pyc │ │ │ ├── lstm.cpython-37.pyc │ │ │ ├── lstm.cpython-38.pyc │ │ │ ├── lstm.cpython-39.pyc │ │ │ ├── 
norm.cpython-310.pyc │ │ │ ├── norm.cpython-37.pyc │ │ │ ├── norm.cpython-38.pyc │ │ │ ├── norm.cpython-39.pyc │ │ │ ├── resample.cpython-312.pyc │ │ │ ├── rope.cpython-312.pyc │ │ │ ├── seanet.cpython-310.pyc │ │ │ ├── seanet.cpython-312.pyc │ │ │ ├── seanet.cpython-37.pyc │ │ │ ├── seanet.cpython-38.pyc │ │ │ ├── seanet.cpython-39.pyc │ │ │ ├── streaming.cpython-312.pyc │ │ │ ├── streaming.cpython-38.pyc │ │ │ ├── transformer.cpython-310.pyc │ │ │ ├── transformer.cpython-312.pyc │ │ │ ├── transformer.cpython-37.pyc │ │ │ ├── transformer.cpython-38.pyc │ │ │ └── transformer.cpython-39.pyc │ │ ├── commons │ │ │ ├── __init__.py │ │ │ ├── __pycache__ │ │ │ │ ├── __init__.cpython-310.pyc │ │ │ │ ├── __init__.cpython-312.pyc │ │ │ │ ├── __init__.cpython-37.pyc │ │ │ │ ├── __init__.cpython-38.pyc │ │ │ │ ├── __init__.cpython-39.pyc │ │ │ │ ├── base_layers.cpython-310.pyc │ │ │ │ ├── base_layers.cpython-312.pyc │ │ │ │ ├── base_layers.cpython-37.pyc │ │ │ │ ├── base_layers.cpython-38.pyc │ │ │ │ ├── base_layers.cpython-39.pyc │ │ │ │ ├── ops.cpython-310.pyc │ │ │ │ ├── ops.cpython-312.pyc │ │ │ │ ├── ops.cpython-37.pyc │ │ │ │ ├── ops.cpython-38.pyc │ │ │ │ ├── ops.cpython-39.pyc │ │ │ │ ├── pqmf.cpython-310.pyc │ │ │ │ ├── pqmf.cpython-312.pyc │ │ │ │ ├── pqmf.cpython-37.pyc │ │ │ │ ├── pqmf.cpython-38.pyc │ │ │ │ ├── pqmf.cpython-39.pyc │ │ │ │ ├── torch_stft.cpython-310.pyc │ │ │ │ ├── torch_stft.cpython-312.pyc │ │ │ │ ├── torch_stft.cpython-37.pyc │ │ │ │ ├── torch_stft.cpython-38.pyc │ │ │ │ └── torch_stft.cpython-39.pyc │ │ │ ├── base_layers.py │ │ │ ├── ops.py │ │ │ ├── position_encoding.py │ │ │ ├── pqmf.py │ │ │ └── torch_stft.py │ │ ├── conv.py │ │ ├── discriminators │ │ │ ├── __init__.py │ │ │ ├── __pycache__ │ │ │ │ ├── __init__.cpython-312.pyc │ │ │ │ ├── frequency_discriminator.cpython-312.pyc │ │ │ │ ├── period_discriminator.cpython-312.pyc │ │ │ │ └── scale_discriminator.cpython-312.pyc │ │ │ ├── combd_sbd.py │ │ │ ├── frequency_discriminator.py │ │ │ 
├── frequency_discriminator_bak.py │ │ │ ├── mrd.py │ │ │ ├── period_discriminator.py │ │ │ └── scale_discriminator.py │ │ ├── gating.py │ │ ├── loss.py │ │ ├── resample.py │ │ ├── rope.py │ │ ├── seanet.py │ │ ├── streaming.py │ │ └── transformer.py │ ├── path.sh │ ├── quantization │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-310.pyc │ │ │ ├── __init__.cpython-312.pyc │ │ │ ├── base.cpython-310.pyc │ │ │ ├── base.cpython-312.pyc │ │ │ ├── core_vq.cpython-310.pyc │ │ │ ├── core_vq.cpython-312.pyc │ │ │ ├── vq.cpython-310.pyc │ │ │ ├── vq.cpython-312.pyc │ │ │ ├── vq_dc.cpython-310.pyc │ │ │ └── vq_dc.cpython-312.pyc │ │ ├── base.py │ │ ├── core_vq.py │ │ ├── vq.py │ │ └── vq_dc.py │ ├── run.sh │ ├── semantic_features │ │ ├── WavLM.py │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── WavLM.cpython-312.pyc │ │ │ ├── WavLM.cpython-38.pyc │ │ │ ├── __init__.cpython-312.pyc │ │ │ ├── modules.cpython-312.pyc │ │ │ ├── modules.cpython-38.pyc │ │ │ └── wavlm_feature.cpython-312.pyc │ │ ├── hubert_feature.py │ │ ├── modules.py │ │ ├── w2vec2bert_feature.py │ │ ├── wavlm_feature.py │ │ └── whisper_feature.py │ ├── train.py │ └── utils │ │ ├── __init__.py │ │ ├── __pycache__ │ │ ├── __init__.cpython-310.pyc │ │ ├── __init__.cpython-312.pyc │ │ ├── __init__.cpython-37.pyc │ │ ├── __init__.cpython-38.pyc │ │ ├── __init__.cpython-39.pyc │ │ ├── compile.cpython-312.pyc │ │ ├── ddp_utils.cpython-38.pyc │ │ ├── ddp_utils.cpython-39.pyc │ │ ├── hifigan_mel.cpython-310.pyc │ │ ├── hifigan_mel.cpython-312.pyc │ │ ├── hifigan_mel.cpython-37.pyc │ │ ├── hifigan_mel.cpython-38.pyc │ │ ├── hifigan_mel.cpython-39.pyc │ │ ├── utils.cpython-310.pyc │ │ ├── utils.cpython-312.pyc │ │ ├── utils.cpython-37.pyc │ │ ├── utils.cpython-38.pyc │ │ └── utils.cpython-39.pyc │ │ ├── autocast.py │ │ ├── compile.py │ │ ├── ddp_utils.py │ │ ├── hifigan_mel.py │ │ ├── sampling.py │ │ └── utils.py └── readme.md ├── DataPipeline └── readme.md ├── Evaluation ├── codec │ ├── 
compute_dnsmos.sh │ ├── compute_mcd.py │ ├── compute_metrics.sh │ ├── compute_ms_stft_loss.py │ ├── compute_pesq.py │ ├── compute_sisnr.py │ ├── compute_ssim.py │ ├── compute_stoi.py │ └── compute_visqol.py └── readme.md ├── MLLM ├── egs │ └── moshi_ft │ │ ├── data_scripts │ │ ├── create_data_json.py │ │ └── offline_tokenization.py │ │ ├── local │ │ ├── asr_whisperx.py │ │ └── vad_segment.py │ │ ├── readme.md │ │ ├── run.sh │ │ └── utils │ │ ├── run.pl │ │ └── split_scp.pl ├── models │ ├── __pycache__ │ │ └── model.cpython-312.pyc │ ├── model.py │ └── model_lora.py ├── modules │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-312.pyc │ │ ├── conv.cpython-312.pyc │ │ ├── gating.cpython-312.pyc │ │ ├── resample.cpython-312.pyc │ │ ├── rope.cpython-312.pyc │ │ ├── seanet.cpython-312.pyc │ │ ├── streaming.cpython-312.pyc │ │ ├── transformer.cpython-312.pyc │ │ └── transformer_lora.cpython-312.pyc │ ├── conv.py │ ├── gating.py │ ├── resample.py │ ├── rope.py │ ├── seanet.py │ ├── streaming.py │ ├── transformer.py │ └── transformer_lora.py ├── readme.md ├── tools │ ├── data_scripts │ │ ├── create_data_json.py │ │ ├── filter_scp.py │ │ ├── find_peer_utts.py │ │ ├── merge_then_split.py │ │ ├── offline_tokenization.py │ │ └── select_spk2utt.py │ ├── kaldi │ │ └── utils │ │ │ ├── add_disambig.pl │ │ │ ├── add_lex_disambig.pl │ │ │ ├── analyze_segments.pl │ │ │ ├── apply_map.pl │ │ │ ├── best_wer.sh │ │ │ ├── build_const_arpa_lm.sh │ │ │ ├── build_kenlm_model_from_arpa.sh │ │ │ ├── combine_data.sh │ │ │ ├── convert_ctm.pl │ │ │ ├── convert_slf.pl │ │ │ ├── convert_slf_parallel.sh │ │ │ ├── copy_data_dir.sh │ │ │ ├── create_data_link.pl │ │ │ ├── create_split_dir.pl │ │ │ ├── ctm │ │ │ ├── convert_ctm.pl │ │ │ ├── fix_ctm.sh │ │ │ └── resolve_ctm_overlaps.py │ │ │ ├── data │ │ │ ├── combine_data.sh │ │ │ ├── combine_short_segments.sh │ │ │ ├── convert_data_dir_to_whole.sh │ │ │ ├── copy_data_dir.sh │ │ │ ├── extend_segment_times.py │ │ │ ├── 
extract_wav_segments_data_dir.sh │ │ │ ├── fix_data_dir.sh │ │ │ ├── fix_subsegment_feats.pl │ │ │ ├── get_allowed_durations.py │ │ │ ├── get_frame_shift.sh │ │ │ ├── get_num_frames.sh │ │ │ ├── get_reco2dur.sh │ │ │ ├── get_reco2utt_for_data.sh │ │ │ ├── get_segments_for_data.sh │ │ │ ├── get_uniform_subsegments.py │ │ │ ├── get_utt2dur.sh │ │ │ ├── get_utt2num_frames.sh │ │ │ ├── internal │ │ │ │ ├── choose_utts_to_combine.py │ │ │ │ ├── combine_segments_to_recording.py │ │ │ │ ├── modify_speaker_info.py │ │ │ │ └── perturb_volume.py │ │ │ ├── limit_feature_dim.sh │ │ │ ├── modify_speaker_info.sh │ │ │ ├── modify_speaker_info_to_recording.sh │ │ │ ├── normalize_data_range.pl │ │ │ ├── perturb_data_dir_speed.sh │ │ │ ├── perturb_data_dir_speed_3way.sh │ │ │ ├── perturb_data_dir_volume.sh │ │ │ ├── perturb_speed_to_allowed_lengths.py │ │ │ ├── remove_dup_utts.sh │ │ │ ├── resample_data_dir.sh │ │ │ ├── shift_and_combine_feats.sh │ │ │ ├── shift_feats.sh │ │ │ ├── split_data.sh │ │ │ ├── subsegment_data_dir.sh │ │ │ ├── subset_data_dir.sh │ │ │ └── validate_data_dir.sh │ │ │ ├── dict_dir_add_pronprobs.sh │ │ │ ├── eps2disambig.pl │ │ │ ├── filt.py │ │ │ ├── filter_scp.pl │ │ │ ├── filter_scps.pl │ │ │ ├── find_arpa_oovs.pl │ │ │ ├── fix_ctm.sh │ │ │ ├── fix_data_dir.sh │ │ │ ├── format_lm.sh │ │ │ ├── format_lm_sri.sh │ │ │ ├── gen_topo.pl │ │ │ ├── int2sym.pl │ │ │ ├── kwslist_post_process.pl │ │ │ ├── lang │ │ │ ├── add_lex_disambig.pl │ │ │ ├── add_unigrams_arpa.pl │ │ │ ├── adjust_unk_arpa.pl │ │ │ ├── adjust_unk_graph.sh │ │ │ ├── bpe │ │ │ │ ├── add_final_optional_silence.sh │ │ │ │ ├── apply_bpe.py │ │ │ │ ├── bidi.py │ │ │ │ ├── learn_bpe.py │ │ │ │ ├── prepend_words.py │ │ │ │ └── reverse.py │ │ │ ├── check_g_properties.pl │ │ │ ├── check_phones_compatible.sh │ │ │ ├── compute_sentence_probs_arpa.py │ │ │ ├── extend_lang.sh │ │ │ ├── get_word_position_phone_map.pl │ │ │ ├── grammar │ │ │ │ ├── augment_phones_txt.py │ │ │ │ └── augment_words_txt.py │ │ │ ├── 
internal │ │ │ │ ├── apply_unk_lm.sh │ │ │ │ ├── arpa2fst_constrained.py │ │ │ │ └── modify_unk_pron.py │ │ │ ├── limit_arpa_unk_history.py │ │ │ ├── make_kn_lm.py │ │ │ ├── make_lexicon_fst.py │ │ │ ├── make_lexicon_fst_silprob.py │ │ │ ├── make_phone_bigram_lang.sh │ │ │ ├── make_phone_lm.py │ │ │ ├── make_position_dependent_subword_lexicon.py │ │ │ ├── make_subword_lexicon_fst.py │ │ │ ├── make_unk_lm.sh │ │ │ ├── ngram_entropy_pruning.py │ │ │ ├── prepare_lang.sh │ │ │ ├── validate_disambig_sym_file.pl │ │ │ └── validate_lang.pl │ │ │ ├── ln.pl │ │ │ ├── make_absolute.sh │ │ │ ├── make_lexicon_fst.pl │ │ │ ├── make_lexicon_fst_silprob.pl │ │ │ ├── make_unigram_grammar.pl │ │ │ ├── map_arpa_lm.pl │ │ │ ├── mkgraph.sh │ │ │ ├── mkgraph_lookahead.sh │ │ │ ├── nnet-cpu │ │ │ ├── make_nnet_config.pl │ │ │ ├── make_nnet_config_block.pl │ │ │ ├── make_nnet_config_preconditioned.pl │ │ │ └── update_learning_rates.pl │ │ │ ├── nnet │ │ │ ├── gen_dct_mat.py │ │ │ ├── gen_hamm_mat.py │ │ │ ├── gen_splice.py │ │ │ ├── make_blstm_proto.py │ │ │ ├── make_cnn_proto.py │ │ │ ├── make_lstm_proto.py │ │ │ ├── make_nnet_proto.py │ │ │ └── subset_data_tr_cv.sh │ │ │ ├── nnet3 │ │ │ └── convert_config_tdnn_to_affine.py │ │ │ ├── parallel │ │ │ ├── limit_num_gpus.sh │ │ │ ├── pbs.pl │ │ │ ├── queue.pl │ │ │ ├── retry.pl │ │ │ ├── run.pl │ │ │ └── slurm.pl │ │ │ ├── parse_options.sh │ │ │ ├── pbs.pl │ │ │ ├── perturb_data_dir_speed.sh │ │ │ ├── pinyin_map.pl │ │ │ ├── prepare_extended_lang.sh │ │ │ ├── prepare_lang.sh │ │ │ ├── prepare_online_nnet_dist_build.sh │ │ │ ├── queue.pl │ │ │ ├── remove_data_links.sh │ │ │ ├── remove_oovs.pl │ │ │ ├── require_argument.sh │ │ │ ├── require_argument_all.sh │ │ │ ├── retry.pl │ │ │ ├── reverse_arpa.py │ │ │ ├── rnnlm_compute_scores.sh │ │ │ ├── run.pl │ │ │ ├── s2eps.pl │ │ │ ├── scoring │ │ │ ├── wer_ops_details.pl │ │ │ ├── wer_per_spk_details.pl │ │ │ ├── wer_per_utt_details.pl │ │ │ └── wer_report.pl │ │ │ ├── segmentation.pl │ │ │ ├── 
show_lattice.sh │ │ │ ├── shuffle_list.pl │ │ │ ├── slurm.pl │ │ │ ├── spk2utt_to_utt2spk.pl │ │ │ ├── split_data.sh │ │ │ ├── split_scp.pl │ │ │ ├── ssh.pl │ │ │ ├── subset_data_dir.sh │ │ │ ├── subset_data_dir_tr_cv.sh │ │ │ ├── subset_scp.pl │ │ │ ├── subword │ │ │ ├── prepare_lang_subword.sh │ │ │ └── prepare_subword_text.sh │ │ │ ├── summarize_logs.pl │ │ │ ├── summarize_warnings.pl │ │ │ ├── sym2int.pl │ │ │ ├── train_arpa_with_kenlm.sh │ │ │ ├── utt2spk_to_spk2utt.pl │ │ │ ├── validate_data_dir.sh │ │ │ ├── validate_dict_dir.pl │ │ │ ├── validate_lang.pl │ │ │ ├── validate_text.pl │ │ │ └── write_kwslist.pl │ └── tokenizer │ │ ├── MimiCodec │ │ ├── __pycache__ │ │ │ ├── mimi_tokenizer.cpython-310.pyc │ │ │ └── mimi_tokenizer.cpython-312.pyc │ │ ├── mimi_config.yaml │ │ ├── mimi_tokenizer.py │ │ └── model │ │ │ ├── models │ │ │ ├── MimiCodec.py │ │ │ ├── __init__.py │ │ │ └── __pycache__ │ │ │ │ ├── MimiCodec.cpython-312.pyc │ │ │ │ └── __init__.cpython-312.pyc │ │ │ ├── modules │ │ │ ├── __init__.py │ │ │ ├── __pycache__ │ │ │ │ ├── __init__.cpython-312.pyc │ │ │ │ ├── conv.cpython-312.pyc │ │ │ │ ├── gating.cpython-312.pyc │ │ │ │ ├── resample.cpython-312.pyc │ │ │ │ ├── rope.cpython-312.pyc │ │ │ │ ├── seanet.cpython-312.pyc │ │ │ │ ├── streaming.cpython-312.pyc │ │ │ │ └── transformer.cpython-312.pyc │ │ │ ├── conv.py │ │ │ ├── gating.py │ │ │ ├── resample.py │ │ │ ├── rope.py │ │ │ ├── seanet.py │ │ │ ├── streaming.py │ │ │ └── transformer.py │ │ │ ├── quantization │ │ │ ├── __init__.py │ │ │ ├── __pycache__ │ │ │ │ ├── __init__.cpython-312.pyc │ │ │ │ ├── base.cpython-312.pyc │ │ │ │ ├── core_vq.cpython-312.pyc │ │ │ │ └── vq.cpython-312.pyc │ │ │ ├── base.py │ │ │ ├── core_vq.py │ │ │ └── vq.py │ │ │ └── utils │ │ │ ├── __init__.py │ │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-312.pyc │ │ │ └── compile.cpython-312.pyc │ │ │ └── compile.py │ │ ├── Text2ID │ │ ├── __pycache__ │ │ │ ├── moshi_text_tokenizer.cpython-312.pyc │ │ │ └── 
text_tokenizer.cpython-38.pyc │ │ └── moshi_text_tokenizer.py │ │ ├── __pycache__ │ │ ├── abs_tokenizer.cpython-310.pyc │ │ ├── abs_tokenizer.cpython-312.pyc │ │ ├── abs_tokenizer.cpython-38.pyc │ │ └── common.cpython-38.pyc │ │ ├── abs_tokenizer.py │ │ └── common.py ├── trainer │ ├── finetuning_full_ds.py │ ├── finetuning_full_fsdp.py │ └── finetuning_lora.py └── utils │ ├── __init__.py │ ├── __pycache__ │ ├── __init__.cpython-310.pyc │ ├── __init__.cpython-312.pyc │ ├── abs_scheduler.cpython-312.pyc │ ├── arguments.cpython-312.pyc │ ├── compile.cpython-312.pyc │ ├── dataloader.cpython-310.pyc │ ├── dataloader.cpython-312.pyc │ ├── reporter.cpython-312.pyc │ ├── sampling.cpython-312.pyc │ ├── task_definition.cpython-312.pyc │ └── train_utils.cpython-312.pyc │ ├── abs_scheduler.py │ ├── arguments.py │ ├── autocast.py │ ├── compile.py │ ├── dataloader.py │ ├── reporter.py │ ├── sampling.py │ ├── task_definition.py │ └── train_utils.py ├── MLLM_v2 ├── configs │ └── llama3.yaml ├── egs │ ├── extract_tokens │ │ ├── data_scripts │ │ │ ├── create_data_json.py │ │ │ └── offline_tokenization.py │ │ ├── get_wav.py │ │ ├── local │ │ │ ├── asr_whisperx.py │ │ │ ├── asr_whisperx_tar.py │ │ │ └── vad_segment.py │ │ ├── run.sh │ │ └── utils │ │ │ ├── run.pl │ │ │ └── split_scp.pl │ ├── moshi_ft │ │ ├── data_scripts │ │ │ ├── create_data_json.py │ │ │ └── offline_tokenization.py │ │ ├── local │ │ │ ├── asr_whisperx.py │ │ │ └── vad_segment.py │ │ ├── readme.md │ │ ├── run.sh │ │ └── utils │ │ │ ├── run.pl │ │ │ └── split_scp.pl │ └── pretraining │ │ ├── data_scripts │ │ ├── create_data_json.py │ │ ├── emilia │ │ │ ├── config.json │ │ │ ├── env.sh │ │ │ ├── main.py │ │ │ ├── models │ │ │ │ ├── __init__.py │ │ │ │ ├── dnsmos.py │ │ │ │ ├── separate_fast.py │ │ │ │ ├── silero_vad.py │ │ │ │ └── whisper_asr.py │ │ │ ├── requirements.txt │ │ │ └── utils │ │ │ │ ├── __init__.py │ │ │ │ ├── logger.py │ │ │ │ └── tool.py │ │ ├── filter_scp.py │ │ ├── offline_tokenization_tar.py │ │ ├── 
text_tokenization.py │ │ ├── text_tokenization_scp.py │ │ └── text_tokenization_utt2json.py │ │ ├── extract_token.sh │ │ ├── infer.sh │ │ ├── local │ │ ├── asr_whisperx.py │ │ ├── asr_whisperx_tar.py │ │ ├── offline_codec_tokenization.py │ │ └── vad_segment.py │ │ ├── path.sh │ │ ├── prepare_broadcast_data.sh │ │ ├── readme.md │ │ ├── run.sh │ │ └── utils │ │ ├── run.pl │ │ └── split_scp.pl ├── infer_no_streaming.py ├── models │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-310.pyc │ │ ├── __init__.cpython-312.pyc │ │ ├── __init__.cpython-39.pyc │ │ ├── config.cpython-310.pyc │ │ ├── config.cpython-312.pyc │ │ ├── config.cpython-39.pyc │ │ ├── lit_model.cpython-310.pyc │ │ ├── lit_model.cpython-312.pyc │ │ ├── lit_model.cpython-39.pyc │ │ ├── llama_streaming.cpython-310.pyc │ │ ├── llama_streaming.cpython-312.pyc │ │ ├── llama_streaming.cpython-39.pyc │ │ ├── llama_streaming_lora.cpython-310.pyc │ │ ├── mlp.cpython-310.pyc │ │ ├── mlp.cpython-312.pyc │ │ ├── mlp.cpython-39.pyc │ │ ├── model.cpython-310.pyc │ │ └── model.cpython-312.pyc │ ├── config.py │ ├── lit_model.py │ ├── llama_streaming.py │ ├── mlp.py │ ├── model.py │ ├── model_llama.py │ └── model_lora.py ├── modules │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-312.pyc │ │ ├── __init__.cpython-38.pyc │ │ ├── conv.cpython-312.pyc │ │ ├── conv.cpython-38.pyc │ │ ├── gating.cpython-312.pyc │ │ ├── resample.cpython-312.pyc │ │ ├── rope.cpython-312.pyc │ │ ├── seanet.cpython-312.pyc │ │ ├── streaming.cpython-312.pyc │ │ ├── streaming.cpython-38.pyc │ │ ├── transformer.cpython-312.pyc │ │ └── transformer_lora.cpython-312.pyc │ ├── conv.py │ ├── gating.py │ ├── resample.py │ ├── rope.py │ ├── seanet.py │ ├── streaming.py │ ├── transformer.py │ └── transformer_lora.py ├── moshi │ ├── __init__.py │ ├── __pycache__ │ │ └── __init__.cpython-312.pyc │ ├── client.py │ ├── client_utils.py │ ├── models │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-312.pyc │ │ │ ├── 
compression.cpython-312.pyc │ │ │ ├── lm.cpython-312.pyc │ │ │ └── loaders.cpython-312.pyc │ │ ├── compression.py │ │ ├── lm.py │ │ └── loaders.py │ ├── modules │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-312.pyc │ │ │ ├── conv.cpython-312.pyc │ │ │ ├── gating.cpython-312.pyc │ │ │ ├── resample.cpython-312.pyc │ │ │ ├── rope.cpython-312.pyc │ │ │ ├── seanet.cpython-312.pyc │ │ │ ├── streaming.cpython-312.pyc │ │ │ └── transformer.cpython-312.pyc │ │ ├── conv.py │ │ ├── conv_test.py │ │ ├── gating.py │ │ ├── resample.py │ │ ├── rope.py │ │ ├── seanet.py │ │ ├── seanet_test.py │ │ ├── streaming.py │ │ └── transformer.py │ ├── quantization │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-312.pyc │ │ │ ├── base.cpython-312.pyc │ │ │ ├── core_vq.cpython-312.pyc │ │ │ └── vq.cpython-312.pyc │ │ ├── base.py │ │ ├── core_vq.py │ │ └── vq.py │ ├── server.py │ └── utils │ │ ├── __init__.py │ │ ├── __pycache__ │ │ ├── __init__.cpython-312.pyc │ │ ├── compile.cpython-312.pyc │ │ └── sampling.cpython-312.pyc │ │ ├── autocast.py │ │ ├── compile.py │ │ └── sampling.py ├── readme.md ├── tools │ ├── data_scripts │ │ ├── create_data_json.py │ │ ├── filter_scp.py │ │ ├── find_peer_utts.py │ │ ├── merge_then_split.py │ │ ├── offline_tokenization.py │ │ └── select_spk2utt.py │ ├── kaldi │ │ └── utils │ │ │ ├── add_disambig.pl │ │ │ ├── add_lex_disambig.pl │ │ │ ├── analyze_segments.pl │ │ │ ├── apply_map.pl │ │ │ ├── best_wer.sh │ │ │ ├── build_const_arpa_lm.sh │ │ │ ├── build_kenlm_model_from_arpa.sh │ │ │ ├── combine_data.sh │ │ │ ├── convert_ctm.pl │ │ │ ├── convert_slf.pl │ │ │ ├── convert_slf_parallel.sh │ │ │ ├── copy_data_dir.sh │ │ │ ├── create_data_link.pl │ │ │ ├── create_split_dir.pl │ │ │ ├── ctm │ │ │ ├── convert_ctm.pl │ │ │ ├── fix_ctm.sh │ │ │ └── resolve_ctm_overlaps.py │ │ │ ├── data │ │ │ ├── combine_data.sh │ │ │ ├── combine_short_segments.sh │ │ │ ├── convert_data_dir_to_whole.sh │ │ │ ├── copy_data_dir.sh │ │ │ ├── 
extend_segment_times.py │ │ │ ├── extract_wav_segments_data_dir.sh │ │ │ ├── fix_data_dir.sh │ │ │ ├── fix_subsegment_feats.pl │ │ │ ├── get_allowed_durations.py │ │ │ ├── get_frame_shift.sh │ │ │ ├── get_num_frames.sh │ │ │ ├── get_reco2dur.sh │ │ │ ├── get_reco2utt_for_data.sh │ │ │ ├── get_segments_for_data.sh │ │ │ ├── get_uniform_subsegments.py │ │ │ ├── get_utt2dur.sh │ │ │ ├── get_utt2num_frames.sh │ │ │ ├── internal │ │ │ │ ├── choose_utts_to_combine.py │ │ │ │ ├── combine_segments_to_recording.py │ │ │ │ ├── modify_speaker_info.py │ │ │ │ └── perturb_volume.py │ │ │ ├── limit_feature_dim.sh │ │ │ ├── modify_speaker_info.sh │ │ │ ├── modify_speaker_info_to_recording.sh │ │ │ ├── normalize_data_range.pl │ │ │ ├── perturb_data_dir_speed.sh │ │ │ ├── perturb_data_dir_speed_3way.sh │ │ │ ├── perturb_data_dir_volume.sh │ │ │ ├── perturb_speed_to_allowed_lengths.py │ │ │ ├── remove_dup_utts.sh │ │ │ ├── resample_data_dir.sh │ │ │ ├── shift_and_combine_feats.sh │ │ │ ├── shift_feats.sh │ │ │ ├── split_data.sh │ │ │ ├── subsegment_data_dir.sh │ │ │ ├── subset_data_dir.sh │ │ │ └── validate_data_dir.sh │ │ │ ├── dict_dir_add_pronprobs.sh │ │ │ ├── eps2disambig.pl │ │ │ ├── filt.py │ │ │ ├── filter_scp.pl │ │ │ ├── filter_scps.pl │ │ │ ├── find_arpa_oovs.pl │ │ │ ├── fix_ctm.sh │ │ │ ├── fix_data_dir.sh │ │ │ ├── format_lm.sh │ │ │ ├── format_lm_sri.sh │ │ │ ├── gen_topo.pl │ │ │ ├── int2sym.pl │ │ │ ├── kwslist_post_process.pl │ │ │ ├── lang │ │ │ ├── add_lex_disambig.pl │ │ │ ├── add_unigrams_arpa.pl │ │ │ ├── adjust_unk_arpa.pl │ │ │ ├── adjust_unk_graph.sh │ │ │ ├── bpe │ │ │ │ ├── add_final_optional_silence.sh │ │ │ │ ├── apply_bpe.py │ │ │ │ ├── bidi.py │ │ │ │ ├── learn_bpe.py │ │ │ │ ├── prepend_words.py │ │ │ │ └── reverse.py │ │ │ ├── check_g_properties.pl │ │ │ ├── check_phones_compatible.sh │ │ │ ├── compute_sentence_probs_arpa.py │ │ │ ├── extend_lang.sh │ │ │ ├── get_word_position_phone_map.pl │ │ │ ├── grammar │ │ │ │ ├── augment_phones_txt.py │ │ │ │ 
└── augment_words_txt.py │ │ │ ├── internal │ │ │ │ ├── apply_unk_lm.sh │ │ │ │ ├── arpa2fst_constrained.py │ │ │ │ └── modify_unk_pron.py │ │ │ ├── limit_arpa_unk_history.py │ │ │ ├── make_kn_lm.py │ │ │ ├── make_lexicon_fst.py │ │ │ ├── make_lexicon_fst_silprob.py │ │ │ ├── make_phone_bigram_lang.sh │ │ │ ├── make_phone_lm.py │ │ │ ├── make_position_dependent_subword_lexicon.py │ │ │ ├── make_subword_lexicon_fst.py │ │ │ ├── make_unk_lm.sh │ │ │ ├── ngram_entropy_pruning.py │ │ │ ├── prepare_lang.sh │ │ │ ├── validate_disambig_sym_file.pl │ │ │ └── validate_lang.pl │ │ │ ├── ln.pl │ │ │ ├── make_absolute.sh │ │ │ ├── make_lexicon_fst.pl │ │ │ ├── make_lexicon_fst_silprob.pl │ │ │ ├── make_unigram_grammar.pl │ │ │ ├── map_arpa_lm.pl │ │ │ ├── mkgraph.sh │ │ │ ├── mkgraph_lookahead.sh │ │ │ ├── nnet-cpu │ │ │ ├── make_nnet_config.pl │ │ │ ├── make_nnet_config_block.pl │ │ │ ├── make_nnet_config_preconditioned.pl │ │ │ └── update_learning_rates.pl │ │ │ ├── nnet │ │ │ ├── gen_dct_mat.py │ │ │ ├── gen_hamm_mat.py │ │ │ ├── gen_splice.py │ │ │ ├── make_blstm_proto.py │ │ │ ├── make_cnn_proto.py │ │ │ ├── make_lstm_proto.py │ │ │ ├── make_nnet_proto.py │ │ │ └── subset_data_tr_cv.sh │ │ │ ├── nnet3 │ │ │ └── convert_config_tdnn_to_affine.py │ │ │ ├── parallel │ │ │ ├── limit_num_gpus.sh │ │ │ ├── pbs.pl │ │ │ ├── queue.pl │ │ │ ├── retry.pl │ │ │ ├── run.pl │ │ │ └── slurm.pl │ │ │ ├── parse_options.sh │ │ │ ├── pbs.pl │ │ │ ├── perturb_data_dir_speed.sh │ │ │ ├── pinyin_map.pl │ │ │ ├── prepare_extended_lang.sh │ │ │ ├── prepare_lang.sh │ │ │ ├── prepare_online_nnet_dist_build.sh │ │ │ ├── queue.pl │ │ │ ├── remove_data_links.sh │ │ │ ├── remove_oovs.pl │ │ │ ├── require_argument.sh │ │ │ ├── require_argument_all.sh │ │ │ ├── retry.pl │ │ │ ├── reverse_arpa.py │ │ │ ├── rnnlm_compute_scores.sh │ │ │ ├── run.pl │ │ │ ├── s2eps.pl │ │ │ ├── scoring │ │ │ ├── wer_ops_details.pl │ │ │ ├── wer_per_spk_details.pl │ │ │ ├── wer_per_utt_details.pl │ │ │ └── wer_report.pl │ │ 
│ ├── segmentation.pl │ │ │ ├── show_lattice.sh │ │ │ ├── shuffle_list.pl │ │ │ ├── slurm.pl │ │ │ ├── spk2utt_to_utt2spk.pl │ │ │ ├── split_data.sh │ │ │ ├── split_scp.pl │ │ │ ├── ssh.pl │ │ │ ├── subset_data_dir.sh │ │ │ ├── subset_data_dir_tr_cv.sh │ │ │ ├── subset_scp.pl │ │ │ ├── subword │ │ │ ├── prepare_lang_subword.sh │ │ │ └── prepare_subword_text.sh │ │ │ ├── summarize_logs.pl │ │ │ ├── summarize_warnings.pl │ │ │ ├── sym2int.pl │ │ │ ├── train_arpa_with_kenlm.sh │ │ │ ├── utt2spk_to_spk2utt.pl │ │ │ ├── validate_data_dir.sh │ │ │ ├── validate_dict_dir.pl │ │ │ ├── validate_lang.pl │ │ │ ├── validate_text.pl │ │ │ └── write_kwslist.pl │ └── tokenizer │ │ ├── GLM4V │ │ ├── __init__.py │ │ ├── configuration_whisper.py │ │ ├── cosyvoice │ │ │ ├── __init__.py │ │ │ ├── __pycache__ │ │ │ │ ├── __init__.cpython-310.pyc │ │ │ │ └── __init__.cpython-312.pyc │ │ │ ├── bin │ │ │ │ ├── inference.py │ │ │ │ └── train.py │ │ │ ├── cli │ │ │ │ ├── __init__.py │ │ │ │ ├── cosyvoice.py │ │ │ │ ├── frontend.py │ │ │ │ └── model.py │ │ │ ├── dataset │ │ │ │ ├── __init__.py │ │ │ │ ├── dataset.py │ │ │ │ └── processor.py │ │ │ ├── flow │ │ │ │ ├── __pycache__ │ │ │ │ │ ├── decoder.cpython-310.pyc │ │ │ │ │ ├── decoder.cpython-312.pyc │ │ │ │ │ ├── flow.cpython-310.pyc │ │ │ │ │ ├── flow.cpython-312.pyc │ │ │ │ │ ├── flow_matching.cpython-310.pyc │ │ │ │ │ ├── flow_matching.cpython-312.pyc │ │ │ │ │ ├── length_regulator.cpython-310.pyc │ │ │ │ │ └── length_regulator.cpython-312.pyc │ │ │ │ ├── decoder.py │ │ │ │ ├── flow.py │ │ │ │ ├── flow_gradtts.py │ │ │ │ ├── flow_matching.py │ │ │ │ ├── flow_matching_dit.py │ │ │ │ ├── length_regulator.py │ │ │ │ └── stable │ │ │ │ │ ├── adp.py │ │ │ │ │ ├── blocks.py │ │ │ │ │ ├── dit.py │ │ │ │ │ ├── dit_v2.py │ │ │ │ │ ├── sampling.py │ │ │ │ │ ├── stable_diffusion.py │ │ │ │ │ ├── stable_diffusion_test.py │ │ │ │ │ ├── transformer.py │ │ │ │ │ └── transformer_use_mask.py │ │ │ ├── hifigan │ │ │ │ ├── __pycache__ │ │ │ │ │ ├── 
f0_predictor.cpython-310.pyc │ │ │ │ │ ├── f0_predictor.cpython-312.pyc │ │ │ │ │ ├── generator.cpython-310.pyc │ │ │ │ │ └── generator.cpython-312.pyc │ │ │ │ ├── f0_predictor.py │ │ │ │ └── generator.py │ │ │ ├── llm │ │ │ │ ├── __pycache__ │ │ │ │ │ ├── llm.cpython-310.pyc │ │ │ │ │ └── llm.cpython-312.pyc │ │ │ │ └── llm.py │ │ │ ├── transformer │ │ │ │ ├── __init__.py │ │ │ │ ├── __pycache__ │ │ │ │ │ ├── __init__.cpython-310.pyc │ │ │ │ │ ├── __init__.cpython-312.pyc │ │ │ │ │ ├── activation.cpython-310.pyc │ │ │ │ │ ├── activation.cpython-312.pyc │ │ │ │ │ ├── attention.cpython-310.pyc │ │ │ │ │ ├── attention.cpython-312.pyc │ │ │ │ │ ├── convolution.cpython-310.pyc │ │ │ │ │ ├── convolution.cpython-312.pyc │ │ │ │ │ ├── embedding.cpython-310.pyc │ │ │ │ │ ├── embedding.cpython-312.pyc │ │ │ │ │ ├── encoder.cpython-310.pyc │ │ │ │ │ ├── encoder.cpython-312.pyc │ │ │ │ │ ├── encoder_layer.cpython-310.pyc │ │ │ │ │ ├── encoder_layer.cpython-312.pyc │ │ │ │ │ ├── label_smoothing_loss.cpython-310.pyc │ │ │ │ │ ├── label_smoothing_loss.cpython-312.pyc │ │ │ │ │ ├── positionwise_feed_forward.cpython-310.pyc │ │ │ │ │ ├── positionwise_feed_forward.cpython-312.pyc │ │ │ │ │ ├── subsampling.cpython-310.pyc │ │ │ │ │ └── subsampling.cpython-312.pyc │ │ │ │ ├── activation.py │ │ │ │ ├── attention.py │ │ │ │ ├── convolution.py │ │ │ │ ├── decoder.py │ │ │ │ ├── decoder_layer.py │ │ │ │ ├── embedding.py │ │ │ │ ├── encoder.py │ │ │ │ ├── encoder_layer.py │ │ │ │ ├── label_smoothing_loss.py │ │ │ │ ├── positionwise_feed_forward.py │ │ │ │ └── subsampling.py │ │ │ └── utils │ │ │ │ ├── __init__.py │ │ │ │ ├── __pycache__ │ │ │ │ ├── __init__.cpython-310.pyc │ │ │ │ ├── __init__.cpython-312.pyc │ │ │ │ ├── block_mask_util.cpython-310.pyc │ │ │ │ ├── block_mask_util.cpython-312.pyc │ │ │ │ ├── class_utils.cpython-310.pyc │ │ │ │ ├── class_utils.cpython-312.pyc │ │ │ │ ├── common.cpython-310.pyc │ │ │ │ ├── common.cpython-312.pyc │ │ │ │ ├── mask.cpython-310.pyc │ │ │ │ └── 
mask.cpython-312.pyc │ │ │ │ ├── block_mask_util.py │ │ │ │ ├── class_utils.py │ │ │ │ ├── common.py │ │ │ │ ├── executor.py │ │ │ │ ├── file_utils.py │ │ │ │ ├── frontend_utils.py │ │ │ │ ├── mask.py │ │ │ │ ├── scheduler.py │ │ │ │ └── train_utils.py │ │ ├── flow_inference.py │ │ ├── generation_whisper.py │ │ ├── modeling_whisper.py │ │ ├── semantic.py │ │ ├── third_party │ │ │ └── Matcha-TTS │ │ │ │ ├── .env.example │ │ │ │ ├── .github │ │ │ │ ├── PULL_REQUEST_TEMPLATE.md │ │ │ │ ├── codecov.yml │ │ │ │ ├── dependabot.yml │ │ │ │ └── release-drafter.yml │ │ │ │ ├── .gitignore │ │ │ │ ├── .pre-commit-config.yaml │ │ │ │ ├── .project-root │ │ │ │ ├── .pylintrc │ │ │ │ ├── LICENSE │ │ │ │ ├── MANIFEST.in │ │ │ │ ├── Makefile │ │ │ │ ├── README.md │ │ │ │ ├── configs │ │ │ │ ├── __init__.py │ │ │ │ ├── callbacks │ │ │ │ │ ├── default.yaml │ │ │ │ │ ├── model_checkpoint.yaml │ │ │ │ │ ├── model_summary.yaml │ │ │ │ │ ├── none.yaml │ │ │ │ │ └── rich_progress_bar.yaml │ │ │ │ ├── data │ │ │ │ │ ├── hi-fi_en-US_female.yaml │ │ │ │ │ ├── ljspeech.yaml │ │ │ │ │ └── vctk.yaml │ │ │ │ ├── debug │ │ │ │ │ ├── default.yaml │ │ │ │ │ ├── fdr.yaml │ │ │ │ │ ├── limit.yaml │ │ │ │ │ ├── overfit.yaml │ │ │ │ │ └── profiler.yaml │ │ │ │ ├── eval.yaml │ │ │ │ ├── experiment │ │ │ │ │ ├── hifi_dataset_piper_phonemizer.yaml │ │ │ │ │ ├── ljspeech.yaml │ │ │ │ │ ├── ljspeech_min_memory.yaml │ │ │ │ │ └── multispeaker.yaml │ │ │ │ ├── extras │ │ │ │ │ └── default.yaml │ │ │ │ ├── hparams_search │ │ │ │ │ └── mnist_optuna.yaml │ │ │ │ ├── hydra │ │ │ │ │ └── default.yaml │ │ │ │ ├── local │ │ │ │ │ └── .gitkeep │ │ │ │ ├── logger │ │ │ │ │ ├── aim.yaml │ │ │ │ │ ├── comet.yaml │ │ │ │ │ ├── csv.yaml │ │ │ │ │ ├── many_loggers.yaml │ │ │ │ │ ├── mlflow.yaml │ │ │ │ │ ├── neptune.yaml │ │ │ │ │ ├── tensorboard.yaml │ │ │ │ │ └── wandb.yaml │ │ │ │ ├── model │ │ │ │ │ ├── cfm │ │ │ │ │ │ └── default.yaml │ │ │ │ │ ├── decoder │ │ │ │ │ │ └── default.yaml │ │ │ │ │ ├── encoder │ │ │ │ │ │ 
└── default.yaml │ │ │ │ │ ├── matcha.yaml │ │ │ │ │ └── optimizer │ │ │ │ │ │ └── adam.yaml │ │ │ │ ├── paths │ │ │ │ │ └── default.yaml │ │ │ │ ├── train.yaml │ │ │ │ └── trainer │ │ │ │ │ ├── cpu.yaml │ │ │ │ │ ├── ddp.yaml │ │ │ │ │ ├── ddp_sim.yaml │ │ │ │ │ ├── default.yaml │ │ │ │ │ ├── gpu.yaml │ │ │ │ │ └── mps.yaml │ │ │ │ ├── matcha │ │ │ │ ├── VERSION │ │ │ │ ├── __init__.py │ │ │ │ ├── app.py │ │ │ │ ├── cli.py │ │ │ │ ├── data │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── components │ │ │ │ │ │ └── __init__.py │ │ │ │ │ └── text_mel_datamodule.py │ │ │ │ ├── hifigan │ │ │ │ │ ├── LICENSE │ │ │ │ │ ├── README.md │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── config.py │ │ │ │ │ ├── denoiser.py │ │ │ │ │ ├── env.py │ │ │ │ │ ├── meldataset.py │ │ │ │ │ ├── models.py │ │ │ │ │ └── xutils.py │ │ │ │ ├── models │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── baselightningmodule.py │ │ │ │ │ ├── components │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ ├── decoder.py │ │ │ │ │ │ ├── flow_matching.py │ │ │ │ │ │ ├── text_encoder.py │ │ │ │ │ │ └── transformer.py │ │ │ │ │ └── matcha_tts.py │ │ │ │ ├── onnx │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── export.py │ │ │ │ │ └── infer.py │ │ │ │ ├── text │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── cleaners.py │ │ │ │ │ ├── numbers.py │ │ │ │ │ └── symbols.py │ │ │ │ ├── train.py │ │ │ │ └── utils │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── audio.py │ │ │ │ │ ├── generate_data_statistics.py │ │ │ │ │ ├── instantiators.py │ │ │ │ │ ├── logging_utils.py │ │ │ │ │ ├── model.py │ │ │ │ │ ├── monotonic_align │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── core.pyx │ │ │ │ │ └── setup.py │ │ │ │ │ ├── pylogger.py │ │ │ │ │ ├── rich_utils.py │ │ │ │ │ └── utils.py │ │ │ │ ├── notebooks │ │ │ │ └── .gitkeep │ │ │ │ ├── pyproject.toml │ │ │ │ ├── requirements.txt │ │ │ │ ├── scripts │ │ │ │ └── schedule.sh │ │ │ │ ├── setup.py │ │ │ │ └── synthesis.ipynb │ │ └── utils.py │ │ ├── MimiCodec │ │ ├── mimi_config.yaml │ │ ├── mimi_tokenizer.py │ │ └── model │ │ │ ├── models │ 
│ │ ├── MimiCodec.py │ │ │ └── __init__.py │ │ │ ├── modules │ │ │ ├── __init__.py │ │ │ ├── conv.py │ │ │ ├── gating.py │ │ │ ├── resample.py │ │ │ ├── rope.py │ │ │ ├── seanet.py │ │ │ ├── streaming.py │ │ │ └── transformer.py │ │ │ ├── quantization │ │ │ ├── __init__.py │ │ │ ├── base.py │ │ │ ├── core_vq.py │ │ │ └── vq.py │ │ │ └── utils │ │ │ ├── __init__.py │ │ │ └── compile.py │ │ ├── Text2ID │ │ ├── moshi_text_tokenizer.py │ │ └── text_tokenizer.py │ │ ├── abs_tokenizer.py │ │ └── common.py ├── trainer │ ├── pre_training_full.py │ ├── pre_training_lora.py │ └── pre_training_lora_ds.py └── utils │ ├── __init__.py │ ├── abs_scheduler.py │ ├── arguments.py │ ├── autocast.py │ ├── compile.py │ ├── dataloader.py │ ├── reporter.py │ ├── sampling.py │ ├── task_definition.py │ └── train_utils.py ├── RSTnet.pdf ├── RSTnet.png ├── demos ├── .DS_Store └── tts │ ├── setence_level_text_audio_interleaved_1272-128104-0006_sample.wav │ ├── setence_level_text_audio_interleaved_1272-141231-0011_sample.wav │ ├── setence_level_text_audio_interleaved_174-168635-0014_sample.wav │ ├── setence_level_text_audio_interleaved_251-137823-0008_sample.wav │ ├── setence_level_text_audio_interleaved_652-129742-0018_sample.wav │ └── setence_level_text_audio_interleaved_777-126732-0080_sample.wav └── readme.md /.gitignore: -------------------------------------------------------------------------------- 1 | **/__pycache__/ 2 | __pycache__/ 3 | *.pyc 4 | processed/ 5 | speech_data/ 6 | *.pt 7 | data/ 8 | ckpts/ 9 | debug_data/ 10 | debug_data_processed/ -------------------------------------------------------------------------------- /AudioCodec/MimiCodec/dataloaders/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/AudioCodec/MimiCodec/dataloaders/__init__.py -------------------------------------------------------------------------------- 
/AudioCodec/MimiCodec/dataloaders/__pycache__/__init__.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/AudioCodec/MimiCodec/dataloaders/__pycache__/__init__.cpython-312.pyc -------------------------------------------------------------------------------- /AudioCodec/MimiCodec/dataloaders/__pycache__/base_dataloader.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/AudioCodec/MimiCodec/dataloaders/__pycache__/base_dataloader.cpython-312.pyc -------------------------------------------------------------------------------- /AudioCodec/MimiCodec/losses/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/AudioCodec/MimiCodec/losses/__init__.py -------------------------------------------------------------------------------- /AudioCodec/MimiCodec/losses/__pycache__/__init__.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/AudioCodec/MimiCodec/losses/__pycache__/__init__.cpython-310.pyc -------------------------------------------------------------------------------- /AudioCodec/MimiCodec/losses/__pycache__/__init__.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/AudioCodec/MimiCodec/losses/__pycache__/__init__.cpython-312.pyc -------------------------------------------------------------------------------- /AudioCodec/MimiCodec/losses/__pycache__/__init__.cpython-37.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/AudioCodec/MimiCodec/losses/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /AudioCodec/MimiCodec/losses/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/AudioCodec/MimiCodec/losses/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /AudioCodec/MimiCodec/losses/__pycache__/__init__.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/AudioCodec/MimiCodec/losses/__pycache__/__init__.cpython-39.pyc -------------------------------------------------------------------------------- /AudioCodec/MimiCodec/losses/__pycache__/basic_loss.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/AudioCodec/MimiCodec/losses/__pycache__/basic_loss.cpython-312.pyc -------------------------------------------------------------------------------- /AudioCodec/MimiCodec/losses/__pycache__/basic_loss.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/AudioCodec/MimiCodec/losses/__pycache__/basic_loss.cpython-37.pyc -------------------------------------------------------------------------------- /AudioCodec/MimiCodec/losses/__pycache__/basic_loss.cpython-38.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/AudioCodec/MimiCodec/losses/__pycache__/basic_loss.cpython-38.pyc -------------------------------------------------------------------------------- /AudioCodec/MimiCodec/losses/__pycache__/basic_loss.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/AudioCodec/MimiCodec/losses/__pycache__/basic_loss.cpython-39.pyc -------------------------------------------------------------------------------- /AudioCodec/MimiCodec/losses/__pycache__/discriminator_loss.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/AudioCodec/MimiCodec/losses/__pycache__/discriminator_loss.cpython-312.pyc -------------------------------------------------------------------------------- /AudioCodec/MimiCodec/losses/__pycache__/discriminator_loss.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/AudioCodec/MimiCodec/losses/__pycache__/discriminator_loss.cpython-37.pyc -------------------------------------------------------------------------------- /AudioCodec/MimiCodec/losses/__pycache__/discriminator_loss.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/AudioCodec/MimiCodec/losses/__pycache__/discriminator_loss.cpython-38.pyc -------------------------------------------------------------------------------- 
/AudioCodec/MimiCodec/losses/__pycache__/discriminator_loss.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/AudioCodec/MimiCodec/losses/__pycache__/discriminator_loss.cpython-39.pyc -------------------------------------------------------------------------------- /AudioCodec/MimiCodec/losses/__pycache__/enh_loss.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/AudioCodec/MimiCodec/losses/__pycache__/enh_loss.cpython-312.pyc -------------------------------------------------------------------------------- /AudioCodec/MimiCodec/losses/__pycache__/enh_loss.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/AudioCodec/MimiCodec/losses/__pycache__/enh_loss.cpython-38.pyc -------------------------------------------------------------------------------- /AudioCodec/MimiCodec/losses/__pycache__/enh_loss.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/AudioCodec/MimiCodec/losses/__pycache__/enh_loss.cpython-39.pyc -------------------------------------------------------------------------------- /AudioCodec/MimiCodec/losses/__pycache__/generator_loss.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/AudioCodec/MimiCodec/losses/__pycache__/generator_loss.cpython-310.pyc -------------------------------------------------------------------------------- 
/AudioCodec/MimiCodec/losses/__pycache__/generator_loss.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/AudioCodec/MimiCodec/losses/__pycache__/generator_loss.cpython-312.pyc -------------------------------------------------------------------------------- /AudioCodec/MimiCodec/losses/__pycache__/generator_loss.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/AudioCodec/MimiCodec/losses/__pycache__/generator_loss.cpython-37.pyc -------------------------------------------------------------------------------- /AudioCodec/MimiCodec/losses/__pycache__/generator_loss.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/AudioCodec/MimiCodec/losses/__pycache__/generator_loss.cpython-38.pyc -------------------------------------------------------------------------------- /AudioCodec/MimiCodec/losses/__pycache__/generator_loss.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/AudioCodec/MimiCodec/losses/__pycache__/generator_loss.cpython-39.pyc -------------------------------------------------------------------------------- /AudioCodec/MimiCodec/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/AudioCodec/MimiCodec/models/__init__.py -------------------------------------------------------------------------------- 
/AudioCodec/MimiCodec/models/__pycache__/MimiCodec.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/AudioCodec/MimiCodec/models/__pycache__/MimiCodec.cpython-312.pyc -------------------------------------------------------------------------------- /AudioCodec/MimiCodec/models/__pycache__/MimiCodec.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/AudioCodec/MimiCodec/models/__pycache__/MimiCodec.cpython-38.pyc -------------------------------------------------------------------------------- /AudioCodec/MimiCodec/models/__pycache__/__init__.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/AudioCodec/MimiCodec/models/__pycache__/__init__.cpython-312.pyc -------------------------------------------------------------------------------- /AudioCodec/MimiCodec/models/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/AudioCodec/MimiCodec/models/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /AudioCodec/MimiCodec/modules/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Kyutai, all rights reserved. 2 | # This source code is licensed under the license found in the 3 | # LICENSE file in the root directory of this source tree. 4 | 5 | # Copyright (c) Meta Platforms, Inc. and affiliates. 6 | # All rights reserved. 
7 | # 8 | # This source code is licensed under the license found in the 9 | # LICENSE file in the root directory of this source tree. 10 | """Modules used for building the models.""" 11 | 12 | # flake8: noqa 13 | from .conv import ( 14 | NormConv1d, 15 | NormConvTranspose1d, 16 | StreamingConv1d, 17 | StreamingConvTranspose1d, 18 | pad_for_conv1d, 19 | pad1d, 20 | unpad1d, 21 | ) 22 | from .seanet import SEANetEncoder, SEANetDecoder 23 | from .transformer import StreamingTransformer 24 | -------------------------------------------------------------------------------- /AudioCodec/MimiCodec/modules/__pycache__/__init__.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/AudioCodec/MimiCodec/modules/__pycache__/__init__.cpython-310.pyc -------------------------------------------------------------------------------- /AudioCodec/MimiCodec/modules/__pycache__/__init__.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/AudioCodec/MimiCodec/modules/__pycache__/__init__.cpython-312.pyc -------------------------------------------------------------------------------- /AudioCodec/MimiCodec/modules/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/AudioCodec/MimiCodec/modules/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /AudioCodec/MimiCodec/modules/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/AudioCodec/MimiCodec/modules/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /AudioCodec/MimiCodec/modules/__pycache__/__init__.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/AudioCodec/MimiCodec/modules/__pycache__/__init__.cpython-39.pyc -------------------------------------------------------------------------------- /AudioCodec/MimiCodec/modules/__pycache__/conv.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/AudioCodec/MimiCodec/modules/__pycache__/conv.cpython-310.pyc -------------------------------------------------------------------------------- /AudioCodec/MimiCodec/modules/__pycache__/conv.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/AudioCodec/MimiCodec/modules/__pycache__/conv.cpython-312.pyc -------------------------------------------------------------------------------- /AudioCodec/MimiCodec/modules/__pycache__/conv.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/AudioCodec/MimiCodec/modules/__pycache__/conv.cpython-37.pyc -------------------------------------------------------------------------------- /AudioCodec/MimiCodec/modules/__pycache__/conv.cpython-38.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/AudioCodec/MimiCodec/modules/__pycache__/conv.cpython-38.pyc -------------------------------------------------------------------------------- /AudioCodec/MimiCodec/modules/__pycache__/conv.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/AudioCodec/MimiCodec/modules/__pycache__/conv.cpython-39.pyc -------------------------------------------------------------------------------- /AudioCodec/MimiCodec/modules/__pycache__/gating.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/AudioCodec/MimiCodec/modules/__pycache__/gating.cpython-312.pyc -------------------------------------------------------------------------------- /AudioCodec/MimiCodec/modules/__pycache__/lstm.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/AudioCodec/MimiCodec/modules/__pycache__/lstm.cpython-310.pyc -------------------------------------------------------------------------------- /AudioCodec/MimiCodec/modules/__pycache__/lstm.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/AudioCodec/MimiCodec/modules/__pycache__/lstm.cpython-37.pyc -------------------------------------------------------------------------------- /AudioCodec/MimiCodec/modules/__pycache__/lstm.cpython-38.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/AudioCodec/MimiCodec/modules/__pycache__/lstm.cpython-38.pyc -------------------------------------------------------------------------------- /AudioCodec/MimiCodec/modules/__pycache__/lstm.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/AudioCodec/MimiCodec/modules/__pycache__/lstm.cpython-39.pyc -------------------------------------------------------------------------------- /AudioCodec/MimiCodec/modules/__pycache__/norm.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/AudioCodec/MimiCodec/modules/__pycache__/norm.cpython-310.pyc -------------------------------------------------------------------------------- /AudioCodec/MimiCodec/modules/__pycache__/norm.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/AudioCodec/MimiCodec/modules/__pycache__/norm.cpython-37.pyc -------------------------------------------------------------------------------- /AudioCodec/MimiCodec/modules/__pycache__/norm.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/AudioCodec/MimiCodec/modules/__pycache__/norm.cpython-38.pyc -------------------------------------------------------------------------------- /AudioCodec/MimiCodec/modules/__pycache__/norm.cpython-39.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/AudioCodec/MimiCodec/modules/__pycache__/norm.cpython-39.pyc -------------------------------------------------------------------------------- /AudioCodec/MimiCodec/modules/__pycache__/resample.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/AudioCodec/MimiCodec/modules/__pycache__/resample.cpython-312.pyc -------------------------------------------------------------------------------- /AudioCodec/MimiCodec/modules/__pycache__/rope.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/AudioCodec/MimiCodec/modules/__pycache__/rope.cpython-312.pyc -------------------------------------------------------------------------------- /AudioCodec/MimiCodec/modules/__pycache__/seanet.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/AudioCodec/MimiCodec/modules/__pycache__/seanet.cpython-310.pyc -------------------------------------------------------------------------------- /AudioCodec/MimiCodec/modules/__pycache__/seanet.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/AudioCodec/MimiCodec/modules/__pycache__/seanet.cpython-312.pyc -------------------------------------------------------------------------------- /AudioCodec/MimiCodec/modules/__pycache__/seanet.cpython-37.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/AudioCodec/MimiCodec/modules/__pycache__/seanet.cpython-37.pyc -------------------------------------------------------------------------------- /AudioCodec/MimiCodec/modules/__pycache__/seanet.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/AudioCodec/MimiCodec/modules/__pycache__/seanet.cpython-38.pyc -------------------------------------------------------------------------------- /AudioCodec/MimiCodec/modules/__pycache__/seanet.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/AudioCodec/MimiCodec/modules/__pycache__/seanet.cpython-39.pyc -------------------------------------------------------------------------------- /AudioCodec/MimiCodec/modules/__pycache__/streaming.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/AudioCodec/MimiCodec/modules/__pycache__/streaming.cpython-312.pyc -------------------------------------------------------------------------------- /AudioCodec/MimiCodec/modules/__pycache__/streaming.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/AudioCodec/MimiCodec/modules/__pycache__/streaming.cpython-38.pyc -------------------------------------------------------------------------------- /AudioCodec/MimiCodec/modules/__pycache__/transformer.cpython-310.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/AudioCodec/MimiCodec/modules/__pycache__/transformer.cpython-310.pyc -------------------------------------------------------------------------------- /AudioCodec/MimiCodec/modules/__pycache__/transformer.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/AudioCodec/MimiCodec/modules/__pycache__/transformer.cpython-312.pyc -------------------------------------------------------------------------------- /AudioCodec/MimiCodec/modules/__pycache__/transformer.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/AudioCodec/MimiCodec/modules/__pycache__/transformer.cpython-37.pyc -------------------------------------------------------------------------------- /AudioCodec/MimiCodec/modules/__pycache__/transformer.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/AudioCodec/MimiCodec/modules/__pycache__/transformer.cpython-38.pyc -------------------------------------------------------------------------------- /AudioCodec/MimiCodec/modules/__pycache__/transformer.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/AudioCodec/MimiCodec/modules/__pycache__/transformer.cpython-39.pyc -------------------------------------------------------------------------------- /AudioCodec/MimiCodec/modules/commons/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/AudioCodec/MimiCodec/modules/commons/__init__.py -------------------------------------------------------------------------------- /AudioCodec/MimiCodec/modules/commons/__pycache__/__init__.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/AudioCodec/MimiCodec/modules/commons/__pycache__/__init__.cpython-310.pyc -------------------------------------------------------------------------------- /AudioCodec/MimiCodec/modules/commons/__pycache__/__init__.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/AudioCodec/MimiCodec/modules/commons/__pycache__/__init__.cpython-312.pyc -------------------------------------------------------------------------------- /AudioCodec/MimiCodec/modules/commons/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/AudioCodec/MimiCodec/modules/commons/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /AudioCodec/MimiCodec/modules/commons/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/AudioCodec/MimiCodec/modules/commons/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /AudioCodec/MimiCodec/modules/commons/__pycache__/__init__.cpython-39.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/AudioCodec/MimiCodec/modules/commons/__pycache__/__init__.cpython-39.pyc -------------------------------------------------------------------------------- /AudioCodec/MimiCodec/modules/commons/__pycache__/base_layers.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/AudioCodec/MimiCodec/modules/commons/__pycache__/base_layers.cpython-310.pyc -------------------------------------------------------------------------------- /AudioCodec/MimiCodec/modules/commons/__pycache__/base_layers.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/AudioCodec/MimiCodec/modules/commons/__pycache__/base_layers.cpython-312.pyc -------------------------------------------------------------------------------- /AudioCodec/MimiCodec/modules/commons/__pycache__/base_layers.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/AudioCodec/MimiCodec/modules/commons/__pycache__/base_layers.cpython-37.pyc -------------------------------------------------------------------------------- /AudioCodec/MimiCodec/modules/commons/__pycache__/base_layers.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/AudioCodec/MimiCodec/modules/commons/__pycache__/base_layers.cpython-38.pyc -------------------------------------------------------------------------------- 
/AudioCodec/MimiCodec/modules/commons/__pycache__/base_layers.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/AudioCodec/MimiCodec/modules/commons/__pycache__/base_layers.cpython-39.pyc -------------------------------------------------------------------------------- /AudioCodec/MimiCodec/modules/commons/__pycache__/ops.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/AudioCodec/MimiCodec/modules/commons/__pycache__/ops.cpython-310.pyc -------------------------------------------------------------------------------- /AudioCodec/MimiCodec/modules/commons/__pycache__/ops.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/AudioCodec/MimiCodec/modules/commons/__pycache__/ops.cpython-312.pyc -------------------------------------------------------------------------------- /AudioCodec/MimiCodec/modules/commons/__pycache__/ops.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/AudioCodec/MimiCodec/modules/commons/__pycache__/ops.cpython-37.pyc -------------------------------------------------------------------------------- /AudioCodec/MimiCodec/modules/commons/__pycache__/ops.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/AudioCodec/MimiCodec/modules/commons/__pycache__/ops.cpython-38.pyc -------------------------------------------------------------------------------- 
/AudioCodec/MimiCodec/modules/commons/__pycache__/ops.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/AudioCodec/MimiCodec/modules/commons/__pycache__/ops.cpython-39.pyc -------------------------------------------------------------------------------- /AudioCodec/MimiCodec/modules/commons/__pycache__/pqmf.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/AudioCodec/MimiCodec/modules/commons/__pycache__/pqmf.cpython-310.pyc -------------------------------------------------------------------------------- /AudioCodec/MimiCodec/modules/commons/__pycache__/pqmf.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/AudioCodec/MimiCodec/modules/commons/__pycache__/pqmf.cpython-312.pyc -------------------------------------------------------------------------------- /AudioCodec/MimiCodec/modules/commons/__pycache__/pqmf.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/AudioCodec/MimiCodec/modules/commons/__pycache__/pqmf.cpython-37.pyc -------------------------------------------------------------------------------- /AudioCodec/MimiCodec/modules/commons/__pycache__/pqmf.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/AudioCodec/MimiCodec/modules/commons/__pycache__/pqmf.cpython-38.pyc -------------------------------------------------------------------------------- 
/AudioCodec/MimiCodec/modules/commons/__pycache__/pqmf.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/AudioCodec/MimiCodec/modules/commons/__pycache__/pqmf.cpython-39.pyc -------------------------------------------------------------------------------- /AudioCodec/MimiCodec/modules/commons/__pycache__/torch_stft.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/AudioCodec/MimiCodec/modules/commons/__pycache__/torch_stft.cpython-310.pyc -------------------------------------------------------------------------------- /AudioCodec/MimiCodec/modules/commons/__pycache__/torch_stft.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/AudioCodec/MimiCodec/modules/commons/__pycache__/torch_stft.cpython-312.pyc -------------------------------------------------------------------------------- /AudioCodec/MimiCodec/modules/commons/__pycache__/torch_stft.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/AudioCodec/MimiCodec/modules/commons/__pycache__/torch_stft.cpython-37.pyc -------------------------------------------------------------------------------- /AudioCodec/MimiCodec/modules/commons/__pycache__/torch_stft.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/AudioCodec/MimiCodec/modules/commons/__pycache__/torch_stft.cpython-38.pyc 
-------------------------------------------------------------------------------- /AudioCodec/MimiCodec/modules/commons/__pycache__/torch_stft.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/AudioCodec/MimiCodec/modules/commons/__pycache__/torch_stft.cpython-39.pyc -------------------------------------------------------------------------------- /AudioCodec/MimiCodec/modules/discriminators/__init__.py: -------------------------------------------------------------------------------- 1 | from modules.discriminators.frequency_discriminator import MultiFrequencyDiscriminator 2 | from modules.discriminators.period_discriminator import MultiPeriodDiscriminator 3 | from modules.discriminators.scale_discriminator import MultiScaleDiscriminator 4 | -------------------------------------------------------------------------------- /AudioCodec/MimiCodec/modules/discriminators/__pycache__/__init__.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/AudioCodec/MimiCodec/modules/discriminators/__pycache__/__init__.cpython-312.pyc -------------------------------------------------------------------------------- /AudioCodec/MimiCodec/modules/discriminators/__pycache__/frequency_discriminator.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/AudioCodec/MimiCodec/modules/discriminators/__pycache__/frequency_discriminator.cpython-312.pyc -------------------------------------------------------------------------------- /AudioCodec/MimiCodec/modules/discriminators/__pycache__/period_discriminator.cpython-312.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/AudioCodec/MimiCodec/modules/discriminators/__pycache__/period_discriminator.cpython-312.pyc -------------------------------------------------------------------------------- /AudioCodec/MimiCodec/modules/discriminators/__pycache__/scale_discriminator.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/AudioCodec/MimiCodec/modules/discriminators/__pycache__/scale_discriminator.cpython-312.pyc -------------------------------------------------------------------------------- /AudioCodec/MimiCodec/path.sh: -------------------------------------------------------------------------------- 1 | export LC_ALL=C 2 | export PYTHONIOENCODING=UTF-8 3 | export OMP_NUM_THREADS=1 4 | 5 | # python import root 6 | export PYTHONPATH=${PYTHONPATH}:./ 7 | -------------------------------------------------------------------------------- /AudioCodec/MimiCodec/quantization/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Kyutai, all rights reserved. 2 | # This source code is licensed under the license found in the 3 | # LICENSE file in the root directory of this source tree. 4 | 5 | # Copyright (c) Meta Platforms, Inc. and affiliates. 6 | # All rights reserved. 7 | # 8 | # This source code is licensed under the license found in the 9 | # LICENSE file in the root directory of this source tree. 
10 | """RVQ.""" 11 | # flake8: noqa 12 | from .vq import ResidualVectorQuantizer, SplitResidualVectorQuantizer 13 | from .base import BaseQuantizer, DummyQuantizer, QuantizedResult 14 | -------------------------------------------------------------------------------- /AudioCodec/MimiCodec/quantization/__pycache__/__init__.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/AudioCodec/MimiCodec/quantization/__pycache__/__init__.cpython-310.pyc -------------------------------------------------------------------------------- /AudioCodec/MimiCodec/quantization/__pycache__/__init__.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/AudioCodec/MimiCodec/quantization/__pycache__/__init__.cpython-312.pyc -------------------------------------------------------------------------------- /AudioCodec/MimiCodec/quantization/__pycache__/base.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/AudioCodec/MimiCodec/quantization/__pycache__/base.cpython-310.pyc -------------------------------------------------------------------------------- /AudioCodec/MimiCodec/quantization/__pycache__/base.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/AudioCodec/MimiCodec/quantization/__pycache__/base.cpython-312.pyc -------------------------------------------------------------------------------- /AudioCodec/MimiCodec/quantization/__pycache__/core_vq.cpython-310.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/AudioCodec/MimiCodec/quantization/__pycache__/core_vq.cpython-310.pyc -------------------------------------------------------------------------------- /AudioCodec/MimiCodec/quantization/__pycache__/core_vq.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/AudioCodec/MimiCodec/quantization/__pycache__/core_vq.cpython-312.pyc -------------------------------------------------------------------------------- /AudioCodec/MimiCodec/quantization/__pycache__/vq.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/AudioCodec/MimiCodec/quantization/__pycache__/vq.cpython-310.pyc -------------------------------------------------------------------------------- /AudioCodec/MimiCodec/quantization/__pycache__/vq.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/AudioCodec/MimiCodec/quantization/__pycache__/vq.cpython-312.pyc -------------------------------------------------------------------------------- /AudioCodec/MimiCodec/quantization/__pycache__/vq_dc.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/AudioCodec/MimiCodec/quantization/__pycache__/vq_dc.cpython-310.pyc -------------------------------------------------------------------------------- /AudioCodec/MimiCodec/quantization/__pycache__/vq_dc.cpython-312.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/AudioCodec/MimiCodec/quantization/__pycache__/vq_dc.cpython-312.pyc -------------------------------------------------------------------------------- /AudioCodec/MimiCodec/semantic_features/__init__.py: -------------------------------------------------------------------------------- 1 | from semantic_features.wavlm_feature import WavLMFeature 2 | from semantic_features.WavLM import WavLM, WavLMConfig 3 | from semantic_features.hubert_feature import HuBertFeature -------------------------------------------------------------------------------- /AudioCodec/MimiCodec/semantic_features/__pycache__/WavLM.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/AudioCodec/MimiCodec/semantic_features/__pycache__/WavLM.cpython-312.pyc -------------------------------------------------------------------------------- /AudioCodec/MimiCodec/semantic_features/__pycache__/WavLM.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/AudioCodec/MimiCodec/semantic_features/__pycache__/WavLM.cpython-38.pyc -------------------------------------------------------------------------------- /AudioCodec/MimiCodec/semantic_features/__pycache__/__init__.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/AudioCodec/MimiCodec/semantic_features/__pycache__/__init__.cpython-312.pyc -------------------------------------------------------------------------------- 
/AudioCodec/MimiCodec/semantic_features/__pycache__/modules.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/AudioCodec/MimiCodec/semantic_features/__pycache__/modules.cpython-312.pyc -------------------------------------------------------------------------------- /AudioCodec/MimiCodec/semantic_features/__pycache__/modules.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/AudioCodec/MimiCodec/semantic_features/__pycache__/modules.cpython-38.pyc -------------------------------------------------------------------------------- /AudioCodec/MimiCodec/semantic_features/__pycache__/wavlm_feature.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/AudioCodec/MimiCodec/semantic_features/__pycache__/wavlm_feature.cpython-312.pyc -------------------------------------------------------------------------------- /AudioCodec/MimiCodec/semantic_features/hubert_feature.py: -------------------------------------------------------------------------------- 1 | from transformers import HubertModel, Wav2Vec2Processor, Wav2Vec2FeatureExtractor 2 | import torch 3 | import torch.nn as nn 4 | 5 | class HuBertFeature(nn.Module): 6 | def __init__(self, ckpt_path, device='cpu'): 7 | super(HuBertFeature, self).__init__() 8 | self.processor = Wav2Vec2FeatureExtractor.from_pretrained(ckpt_path) 9 | self.model = HubertModel.from_pretrained(ckpt_path) 10 | self.model.eval() 11 | self.model = self.model.to(device) 12 | self.device = device 13 | self.freeze() 14 | 15 | def freeze(self): 16 | for param in self.model.parameters(): 17 | param.requires_grad = False 18 | 19 | def 
extract(self, x): 20 | """ 21 | Extract features from HuBert model 22 | Input: 23 | Output: 24 | """ 25 | if len(x.size()) == 3: 26 | x = x.squeeze(1) # from (B,1,T) ---> (B, T) 27 | assert len(x.size()) == 2 28 | 29 | with torch.no_grad(): 30 | outputs = self.model(x) 31 | last_hidden_state = outputs['last_hidden_state'].to(torch.float32) # (B, ssl_dim, T) 32 | return last_hidden_state 33 | -------------------------------------------------------------------------------- /AudioCodec/MimiCodec/semantic_features/w2vec2bert_feature.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/AudioCodec/MimiCodec/semantic_features/w2vec2bert_feature.py -------------------------------------------------------------------------------- /AudioCodec/MimiCodec/semantic_features/wavlm_feature.py: -------------------------------------------------------------------------------- 1 | """ 2 | this code aims to extract semantic features from pre-trained WavLM model 3 | """ 4 | import torch 5 | from semantic_features.WavLM import WavLM, WavLMConfig 6 | import torchaudio 7 | import torch.nn as nn 8 | 9 | class WavLMFeature(nn.Module): 10 | def __init__(self, ckpt_path, device='cpu'): 11 | super().__init__() 12 | checkpoint = torch.load(ckpt_path) 13 | self.cfg = WavLMConfig(checkpoint['cfg']) 14 | self.model = WavLM(self.cfg) 15 | self.model.load_state_dict(checkpoint['model']) 16 | self.model.eval() 17 | self.model = self.model.to(device) 18 | self.device = device 19 | self.freeze() 20 | 21 | def freeze(self): 22 | for param in self.model.parameters(): 23 | param.requires_grad = False 24 | 25 | def extract(self, x): 26 | """ 27 | extract the feature from last layer of wavlm 28 | input: 29 | output: 30 | """ 31 | if len(x.size()) == 3: 32 | x = x.squeeze(1) # from (B,1,T) ---> (B, T) 33 | assert len(x.size()) == 2 34 | #x = torch.cat([x, torch.zeros(x.shape[0], 
320).to(x.device)], dim=1) 35 | if self.cfg.normalize: 36 | wav_input_16khz = torch.nn.functional.layer_norm(x.to(self.device) , x.shape) 37 | rep = self.model.extract_features(wav_input_16khz)[0] 38 | return rep 39 | 40 | 41 | -------------------------------------------------------------------------------- /AudioCodec/MimiCodec/semantic_features/whisper_feature.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/AudioCodec/MimiCodec/semantic_features/whisper_feature.py -------------------------------------------------------------------------------- /AudioCodec/MimiCodec/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/AudioCodec/MimiCodec/utils/__init__.py -------------------------------------------------------------------------------- /AudioCodec/MimiCodec/utils/__pycache__/__init__.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/AudioCodec/MimiCodec/utils/__pycache__/__init__.cpython-310.pyc -------------------------------------------------------------------------------- /AudioCodec/MimiCodec/utils/__pycache__/__init__.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/AudioCodec/MimiCodec/utils/__pycache__/__init__.cpython-312.pyc -------------------------------------------------------------------------------- /AudioCodec/MimiCodec/utils/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/AudioCodec/MimiCodec/utils/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /AudioCodec/MimiCodec/utils/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/AudioCodec/MimiCodec/utils/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /AudioCodec/MimiCodec/utils/__pycache__/__init__.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/AudioCodec/MimiCodec/utils/__pycache__/__init__.cpython-39.pyc -------------------------------------------------------------------------------- /AudioCodec/MimiCodec/utils/__pycache__/compile.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/AudioCodec/MimiCodec/utils/__pycache__/compile.cpython-312.pyc -------------------------------------------------------------------------------- /AudioCodec/MimiCodec/utils/__pycache__/ddp_utils.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/AudioCodec/MimiCodec/utils/__pycache__/ddp_utils.cpython-38.pyc -------------------------------------------------------------------------------- /AudioCodec/MimiCodec/utils/__pycache__/ddp_utils.cpython-39.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/AudioCodec/MimiCodec/utils/__pycache__/ddp_utils.cpython-39.pyc -------------------------------------------------------------------------------- /AudioCodec/MimiCodec/utils/__pycache__/hifigan_mel.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/AudioCodec/MimiCodec/utils/__pycache__/hifigan_mel.cpython-310.pyc -------------------------------------------------------------------------------- /AudioCodec/MimiCodec/utils/__pycache__/hifigan_mel.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/AudioCodec/MimiCodec/utils/__pycache__/hifigan_mel.cpython-312.pyc -------------------------------------------------------------------------------- /AudioCodec/MimiCodec/utils/__pycache__/hifigan_mel.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/AudioCodec/MimiCodec/utils/__pycache__/hifigan_mel.cpython-37.pyc -------------------------------------------------------------------------------- /AudioCodec/MimiCodec/utils/__pycache__/hifigan_mel.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/AudioCodec/MimiCodec/utils/__pycache__/hifigan_mel.cpython-38.pyc -------------------------------------------------------------------------------- /AudioCodec/MimiCodec/utils/__pycache__/hifigan_mel.cpython-39.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/AudioCodec/MimiCodec/utils/__pycache__/hifigan_mel.cpython-39.pyc -------------------------------------------------------------------------------- /AudioCodec/MimiCodec/utils/__pycache__/utils.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/AudioCodec/MimiCodec/utils/__pycache__/utils.cpython-310.pyc -------------------------------------------------------------------------------- /AudioCodec/MimiCodec/utils/__pycache__/utils.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/AudioCodec/MimiCodec/utils/__pycache__/utils.cpython-312.pyc -------------------------------------------------------------------------------- /AudioCodec/MimiCodec/utils/__pycache__/utils.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/AudioCodec/MimiCodec/utils/__pycache__/utils.cpython-37.pyc -------------------------------------------------------------------------------- /AudioCodec/MimiCodec/utils/__pycache__/utils.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/AudioCodec/MimiCodec/utils/__pycache__/utils.cpython-38.pyc -------------------------------------------------------------------------------- /AudioCodec/MimiCodec/utils/__pycache__/utils.cpython-39.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/AudioCodec/MimiCodec/utils/__pycache__/utils.cpython-39.pyc -------------------------------------------------------------------------------- /AudioCodec/readme.md: -------------------------------------------------------------------------------- 1 | ## Streaming Audio Codec Model 2 | In this part, we introduce the training code for streaming audio codecs. We currently support training the SOTA codec, MimiCodec.
3 | We plan to add more advanced streaming codec models in the future. 4 | 5 | - [x] The training and inference code for MimiCodec. 6 | - [ ] Support other advanced streaming codecs. 7 | -------------------------------------------------------------------------------- /DataPipeline/readme.md: -------------------------------------------------------------------------------- 1 | ## Large-scale data processing pipeline 2 | In this part, we introduce how to collect large-scale datasets for MLLM training. We mainly care about two types of audio data: 3 | 4 | - [ ] Single-streaming speech data (TTS level speech data)
5 | - [ ] Two-streaming or Multi-streaming speech data (conversation speech data) 6 | 7 | In this version, we only provide a data preprocessing pipeline for pre-collected multi-stream speech data (i.e. Fisher), and it is temporarily integrated in [MLLM/egs/moshi_ft/readme.md](../MLLM/egs/moshi_ft/readme.md) -------------------------------------------------------------------------------- /Evaluation/codec/compute_dnsmos.sh: -------------------------------------------------------------------------------- 1 | # DNSMOS is a reference-free evaluation metric 2 | audio_path='' 3 | cd DNS-Challenge/DNSMOS 4 | python dnsmos_local.py -t $audio_path -o output.csv -p 5 | 6 | -------------------------------------------------------------------------------- /Evaluation/readme.md: -------------------------------------------------------------------------------- 1 | ## Evaluation and Benchmark dataset 2 | We provide the evaluation metrics and evaluation datasets for audio codecs and speech-text models. 3 | 4 | - [x] Audio Codec Evaluation Metrics 5 | - [ ] Audio Codec evaluation benchmark dataset 6 | - [ ] Speech-text foundation model evaluation metrics 7 | - [ ] Speech-text foundation model benchmark dataset 8 | 9 | -------------------------------------------------------------------------------- /MLLM/models/__pycache__/model.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/MLLM/models/__pycache__/model.cpython-312.pyc -------------------------------------------------------------------------------- /MLLM/modules/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Kyutai, all rights reserved. 2 | # This source code is licensed under the license found in the 3 | # LICENSE file in the root directory of this source tree. 4 | 5 | # Copyright (c) Meta Platforms, Inc. and affiliates.
6 | # All rights reserved. 7 | # 8 | # This source code is licensed under the license found in the 9 | # LICENSE file in the root directory of this source tree. 10 | """Modules used for building the models.""" 11 | 12 | # flake8: noqa 13 | from modules.conv import ( 14 | NormConv1d, 15 | NormConvTranspose1d, 16 | StreamingConv1d, 17 | StreamingConvTranspose1d, 18 | pad_for_conv1d, 19 | pad1d, 20 | unpad1d, 21 | ) 22 | from modules.seanet import SEANetEncoder, SEANetDecoder 23 | from modules.transformer import StreamingTransformer 24 | -------------------------------------------------------------------------------- /MLLM/modules/__pycache__/__init__.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/MLLM/modules/__pycache__/__init__.cpython-312.pyc -------------------------------------------------------------------------------- /MLLM/modules/__pycache__/conv.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/MLLM/modules/__pycache__/conv.cpython-312.pyc -------------------------------------------------------------------------------- /MLLM/modules/__pycache__/gating.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/MLLM/modules/__pycache__/gating.cpython-312.pyc -------------------------------------------------------------------------------- /MLLM/modules/__pycache__/resample.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/MLLM/modules/__pycache__/resample.cpython-312.pyc 
-------------------------------------------------------------------------------- /MLLM/modules/__pycache__/rope.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/MLLM/modules/__pycache__/rope.cpython-312.pyc -------------------------------------------------------------------------------- /MLLM/modules/__pycache__/seanet.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/MLLM/modules/__pycache__/seanet.cpython-312.pyc -------------------------------------------------------------------------------- /MLLM/modules/__pycache__/streaming.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/MLLM/modules/__pycache__/streaming.cpython-312.pyc -------------------------------------------------------------------------------- /MLLM/modules/__pycache__/transformer.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/MLLM/modules/__pycache__/transformer.cpython-312.pyc -------------------------------------------------------------------------------- /MLLM/modules/__pycache__/transformer_lora.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/MLLM/modules/__pycache__/transformer_lora.cpython-312.pyc -------------------------------------------------------------------------------- /MLLM/readme.md: -------------------------------------------------------------------------------- 1 | ## 
Multi-modal LLM (speech-text foundation models) 2 | In this part, we provide the training details of speech-text foundation models. We will includes: 3 | 4 | - [x] Moshi finetuning code, including full-parameter finetuning and LORA finetuning 5 | - [ ] Moshi pre-training and post-training code 6 | - [ ] More advanced speech-text foundation model (by ourselves) 7 | 8 | -------------------------------------------------------------------------------- /MLLM/tools/data_scripts/filter_scp.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | ref_f = sys.argv[1] 4 | in_f = sys.argv[2] 5 | try: 6 | writer = open(sys.argv[3], 'w', encoding='utf-8') 7 | stream_out = False 8 | except: 9 | stream_out = True 10 | 11 | # output is in the order of ref_f 12 | ref = [] 13 | for line in open(ref_f, encoding='utf-8'): 14 | uttid = line.strip().split()[0] 15 | ref.append(uttid) 16 | 17 | in_dic = {} 18 | for line in open(in_f, encoding='utf-8'): 19 | elems = line.strip().split() 20 | uttid = elems[0] 21 | ctx = " ".join(elems[1:]) 22 | in_dic[uttid] = ctx 23 | 24 | for e in ref: 25 | if e in in_dic: 26 | if stream_out: 27 | print(f"{e} {in_dic[e]}") 28 | else: 29 | writer.write(f"{e} {in_dic[e]}\n") 30 | -------------------------------------------------------------------------------- /MLLM/tools/data_scripts/select_spk2utt.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import argparse 3 | 4 | def get_parser(): 5 | parser = argparse.ArgumentParser( 6 | description="Revise the spk2utt file: it only contans a subset of the utts", 7 | formatter_class=argparse.ArgumentDefaultsHelpFormatter, 8 | ) 9 | parser.add_argument("--in-spk2utt", type=str, help="original spk2utt file") 10 | parser.add_argument("--out-spk2utt", type=str, help="revised spk2utt file") 11 | parser.add_argument("--subset-list", type=str, help="list of utt subset") 12 | return parser 13 | 14 | def main(args): 15 | 
args = get_parser().parse_args(args) 16 | 17 | utts = open(args.subset_list).readlines() 18 | utts = [line.strip().split()[0] for line in utts] 19 | utts = {x: None for x in utts} 20 | 21 | writer = open(args.out_spk2utt, 'w') 22 | for line in open(args.in_spk2utt): 23 | line = line.strip().split() 24 | spk_id, spk_utts = line[0], line[1:] 25 | spk_utts = [utt for utt in spk_utts if utt in utts] 26 | 27 | out_str = " ".join([spk_id] + spk_utts) 28 | writer.write(out_str + "\n") 29 | 30 | if __name__ == "__main__": 31 | main(sys.argv[1:]) 32 | -------------------------------------------------------------------------------- /MLLM/tools/kaldi/utils/build_const_arpa_lm.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Copyright 2014 Guoguo Chen 4 | # Apache 2.0 5 | 6 | # This script reads in an Arpa format language model, and converts it into the 7 | # ConstArpaLm format language model. 8 | 9 | # begin configuration section 10 | # end configuration section 11 | 12 | [ -f path.sh ] && . ./path.sh; 13 | 14 | . 
utils/parse_options.sh 15 | 16 | if [ $# != 3 ]; then 17 | echo "Usage: " 18 | echo " $0 [options] <arpa-lm-path> <old-lang-dir> <new-lang-dir>" 19 | echo "e.g.:" 20 | echo " $0 data/local/lm/3-gram.full.arpa.gz data/lang/ data/lang_test_tgmed" 21 | echo "Options" 22 | exit 1; 23 | fi 24 | 25 | export LC_ALL=C 26 | 27 | arpa_lm=$1 28 | old_lang=$2 29 | new_lang=$3 30 | 31 | mkdir -p $new_lang 32 | 33 | mkdir -p $new_lang 34 | cp -r $old_lang/* $new_lang 35 | 36 | unk=`cat $old_lang/oov.int` 37 | bos=`grep "^<s>\s" $old_lang/words.txt | awk '{print $2}'` 38 | eos=`grep "^</s>\s" $old_lang/words.txt | awk '{print $2}'` 39 | if [[ -z $bos || -z $eos ]]; then 40 | echo "$0: <s> and </s> symbols are not in $old_lang/words.txt" 41 | exit 1 42 | fi 43 | if [[ -z $unk ]]; then 44 | echo "$0: can't find oov symbol id in $old_lang/oov.int" 45 | exit 1 46 | fi 47 | 48 | 49 | arpa-to-const-arpa --bos-symbol=$bos \ 50 | --eos-symbol=$eos --unk-symbol=$unk \ 51 | "gunzip -c $arpa_lm | utils/map_arpa_lm.pl $new_lang/words.txt|" $new_lang/G.carpa || exit 1; 52 | 53 | exit 0; 54 | -------------------------------------------------------------------------------- /MLLM/tools/kaldi/utils/build_kenlm_model_from_arpa.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # 2020 author Jiayu DU 3 | # Apache 2.0 4 | 5 | # This script reads in an Arpa format language model, and converts it into the 6 | # KenLM format language model. 7 | 8 | [ -f path.sh ] && . ./path.sh; 9 | 10 | # begin configuration section 11 | kenlm_opts="" # e.g. "-q 8 -b 8" for 8bits quantization 12 | model_type="trie" # "trie" or "probing". trie is smaller, probing is faster. 13 | # end configuration section 14 | 15 | . 
utils/parse_options.sh 16 | 17 | if [ $# != 2 ]; then 18 | echo "Usage: " 19 | echo " $0 [options] " 20 | echo "e.g.:" 21 | echo " $0 data/local/lm/4gram.arpa data/lang_test/G.trie" 22 | echo "Options:" 23 | echo " --model-type can be either \"trie\" or \"probing\"" 24 | echo " --kenlm-opts directly pass through to kenlm" 25 | echo " e.g. for 8bits quantization, feed \"-q 8 -b 8\"" 26 | exit 1; 27 | fi 28 | 29 | export LC_ALL=C 30 | 31 | arpa_lm=$1 32 | kenlm=$2 33 | 34 | if ! which build_binary >& /dev/null ; then 35 | echo "$0: cannot find KenLM's build_binary tool," 36 | echo "check kenlm installation (tools/extras/install_kenlm_query_only.sh)." 37 | exit 1 38 | fi 39 | 40 | mkdir -p $(dirname $kenlm) 41 | build_binary $kenlm_opts $model_type $arpa_lm $kenlm 42 | 43 | echo "$0: Successfully built arpa into kenlm format: $kenlm" 44 | exit 0 45 | -------------------------------------------------------------------------------- /MLLM/tools/kaldi/utils/ctm/fix_ctm.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | 3 | stmfile=$1 4 | ctmfile=$2 5 | 6 | segments_stm=`cat $stmfile | cut -f 1 -d ' ' | sort -u` 7 | segments_ctm=`cat $ctmfile | cut -f 1 -d ' ' | sort -u` 8 | 9 | segments_stm_count=`echo "$segments_stm" | wc -l ` 10 | segments_ctm_count=`echo "$segments_ctm" | wc -l ` 11 | 12 | #echo $segments_stm_count 13 | #echo $segments_ctm_count 14 | 15 | if [ "$segments_stm_count" -gt "$segments_ctm_count" ] ; then 16 | pp=$( diff <(echo "$segments_stm") <(echo "$segments_ctm" ) | grep "^<" | sed "s/^< *//g") 17 | ( 18 | for elem in $pp ; do 19 | echo "$elem 1 0 0 EMPTY_RECOGNIZED_PHRASE" 20 | done 21 | ) >> $ctmfile 22 | echo "FIXED CTM FILE" 23 | exit 0 24 | elif [ "$segments_stm_count" -lt "$segments_ctm_count" ] ; then 25 | echo "Segment STM count: $segments_stm_count" 26 | echo "Segment CTM count: $segments_ctm_count" 27 | echo "FAILURE FIXING CTM FILE" 28 | exit 1 29 | else 30 | exit 0 31 | fi 32 | 33 | -------------------------------------------------------------------------------- /MLLM/tools/kaldi/utils/data/get_num_frames.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # This script works out the approximate number of frames in a training directory. 4 | # This is sometimes needed by higher-level scripts 5 | 6 | 7 | if [ -f path.sh ]; then . ./path.sh; fi 8 | . parse_options.sh || exit 1; 9 | 10 | if [ $# -ne 1 ]; then 11 | ( 12 | echo "Usage: $0 " 13 | echo "Prints the number of frames of data in the data-dir" 14 | ) 1>&2 15 | fi 16 | 17 | data=$1 18 | 19 | if [ ! 
-f $data/utt2dur ]; then 20 | utils/data/get_utt2dur.sh $data 1>&2 || exit 1 21 | fi 22 | 23 | frame_shift=$(utils/data/get_frame_shift.sh $data) || exit 1 24 | 25 | awk -v s=$frame_shift '{n += $2} END{printf("%.0f\n", (n / s))}' <$data/utt2dur 26 | -------------------------------------------------------------------------------- /MLLM/tools/kaldi/utils/data/get_reco2utt_for_data.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | # Copyright 2016 Vimal Manohar 4 | # Apache 2.0 5 | 6 | if [ $# -ne 1 ]; then 7 | echo "This script outputs a mapping from recording to a list of utterances " 8 | echo "corresponding to the recording. It is analogous to the content of " 9 | echo "a spk2utt file, but is indexed by recording instead of speaker." 10 | echo "Usage: get_reco2utt.sh " 11 | echo " e.g.: get_reco2utt.sh data/train" 12 | exit 1 13 | fi 14 | 15 | data=$1 16 | 17 | if [ ! -s $data/segments ]; then 18 | utils/data/get_segments_for_data.sh $data > $data/segments 19 | fi 20 | 21 | cut -d ' ' -f 1,2 $data/segments | utils/utt2spk_to_spk2utt.pl 22 | -------------------------------------------------------------------------------- /MLLM/tools/kaldi/utils/data/get_segments_for_data.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # This script operates on a data directory, such as in data/train/, 4 | # and writes new segments to stdout. The file 'segments' maps from 5 | # utterance to time offsets into a recording, with the format: 6 | # 7 | # This script assumes utterance and recording ids are the same (i.e., that 8 | # wav.scp is indexed by utterance), and uses durations from 'utt2dur', 9 | # created if necessary by get_utt2dur.sh. 10 | 11 | . ./path.sh 12 | 13 | if [ $# != 1 ]; then 14 | echo "Usage: $0 [options] " 15 | echo "e.g.:" 16 | echo " $0 data/train > data/train/segments" 17 | exit 1 18 | fi 19 | 20 | data=$1 21 | 22 | if [ ! 
-s $data/utt2dur ]; then 23 | utils/data/get_utt2dur.sh $data 1>&2 || exit 1; 24 | fi 25 | 26 | # 0 27 | awk '{ print $1, $1, 0, $2 }' $data/utt2dur 28 | 29 | exit 0 30 | -------------------------------------------------------------------------------- /MLLM/tools/kaldi/utils/data/get_utt2num_frames.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | # Copyright 2016 Vimal Manohar 4 | # Apache 2.0. 5 | 6 | cmd=run.pl 7 | nj=4 8 | 9 | frame_shift=0.01 10 | frame_overlap=0.015 11 | 12 | . utils/parse_options.sh 13 | . ./path.sh 14 | 15 | if [ $# -ne 1 ]; then 16 | echo "This script writes a file utt2num_frames with the " 17 | echo "number of frames in each utterance as measured based on the " 18 | echo "duration of the utterances (in utt2dur) and the specified " 19 | echo "frame_shift and frame_overlap." 20 | echo "Usage: $0 " 21 | exit 1 22 | fi 23 | 24 | data=$1 25 | 26 | if [ -s $data/utt2num_frames ]; then 27 | echo "$0: $data/utt2num_frames already present!" 28 | exit 0; 29 | fi 30 | 31 | if [ ! -f $data/feats.scp ]; then 32 | utils/data/get_utt2dur.sh --nj ${nj} --cmd "$cmd" $data 33 | awk -v fs=$frame_shift -v fovlp=$frame_overlap \ 34 | '{print $1" "int( ($2 - fovlp) / fs)}' $data/utt2dur > $data/utt2num_frames 35 | exit 0 36 | fi 37 | 38 | utils/split_data.sh --per-utt $data $nj || exit 1 39 | $cmd JOB=1:$nj $data/log/get_utt2num_frames.JOB.log \ 40 | feat-to-len scp:$data/split${nj}utt/JOB/feats.scp ark,t:$data/split${nj}utt/JOB/utt2num_frames || exit 1 41 | 42 | for n in `seq $nj`; do 43 | cat $data/split${nj}utt/$n/utt2num_frames 44 | done > $data/utt2num_frames 45 | 46 | echo "$0: Computed and wrote $data/utt2num_frames" 47 | -------------------------------------------------------------------------------- /MLLM/tools/kaldi/utils/data/resample_data_dir.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | 3 | # Copyright 2016 Vimal Manohar 4 | # 2018 Xiaohui Zhang 5 | # Apache 2.0. 6 | 7 | if [ $# -ne 2 ]; then 8 | echo "This script adds a sox line in wav.scp to resample the audio at a " 9 | echo "different sampling-rate" 10 | echo "Usage: $0 " 11 | echo " e.g.: $0 8000 data/dev" 12 | exit 1 13 | fi 14 | 15 | freq=$1 16 | dir=$2 17 | 18 | sox=`which sox` || { echo "Could not find sox in PATH"; exit 1; } 19 | 20 | if [ -f $dir/feats.scp ]; then 21 | mkdir -p $dir/.backup 22 | mv $dir/feats.scp $dir/.backup/ 23 | if [ -f $dir/cmvn.scp ]; then 24 | mv $dir/cmvn.scp $dir/.backup/ 25 | fi 26 | echo "$0: feats.scp already exists. Moving it to $dir/.backup" 27 | fi 28 | 29 | # After resampling we cannot compute utt2dur from wav.scp any more, 30 | # so we create utt2dur now, in case it's needed later 31 | if [ ! -s $dir/utt2dur ]; then 32 | utils/data/get_utt2dur.sh $dir 1>&2 || exit 1; 33 | fi 34 | 35 | mv $dir/wav.scp $dir/wav.scp.tmp 36 | cat $dir/wav.scp.tmp | python -c "import sys 37 | for line in sys.stdin.readlines(): 38 | splits = line.strip().split() 39 | if splits[-1] == '|': 40 | out_line = line.strip() + ' $sox -t wav - -c 1 -b 16 -t wav - rate $freq |' 41 | else: 42 | out_line = '{0} cat {1} | $sox -t wav - -c 1 -b 16 -t wav - rate $freq |'.format(splits[0], ' '.join(splits[1:])) 43 | print (out_line)" > ${dir}/wav.scp 44 | rm $dir/wav.scp.tmp 45 | -------------------------------------------------------------------------------- /MLLM/tools/kaldi/utils/eps2disambig.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | # Copyright 2010-2011 Microsoft Corporation 3 | # 2015 Guoguo Chen 4 | 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 
7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 12 | # KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED 13 | # WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, 14 | # MERCHANTABLITY OR NON-INFRINGEMENT. 15 | # See the Apache 2 License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | # This script replaces epsilon with #0 on the input side only, of the G.fst 19 | # acceptor. 20 | 21 | while(<>){ 22 | if (/\s+#0\s+/) { 23 | print STDERR "$0: ERROR: LM has word #0, " . 24 | "which is reserved as disambiguation symbol\n"; 25 | exit 1; 26 | } 27 | s:^(\d+\s+\d+\s+)\<eps\>(\s+):$1#0$2:; 28 | print; 29 | } 30 | -------------------------------------------------------------------------------- /MLLM/tools/kaldi/utils/filt.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Apache 2.0 4 | 5 | from __future__ import print_function 6 | import sys 7 | 8 | vocab=set() 9 | with open(sys.argv[1]) as vocabfile: 10 | for line in vocabfile: 11 | vocab.add(line.strip()) 12 | 13 | with open(sys.argv[2]) as textfile: 14 | for line in textfile: 15 | print(" ".join([word if word in vocab else '<UNK>' for word in line.strip().split()])) 16 | -------------------------------------------------------------------------------- /MLLM/tools/kaldi/utils/fix_ctm.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | 3 | stmfile=$1 4 | ctmfile=$2 5 | 6 | segments_stm=`cat $stmfile | cut -f 1 -d ' ' | sort -u` 7 | segments_ctm=`cat $ctmfile | cut -f 1 -d ' ' | sort -u` 8 | 9 | segments_stm_count=`echo "$segments_stm" | wc -l ` 10 | segments_ctm_count=`echo "$segments_ctm" | wc -l ` 11 | 12 | #echo $segments_stm_count 13 | #echo $segments_ctm_count 14 | 15 | if [ "$segments_stm_count" -gt "$segments_ctm_count" ] ; then 16 | pp=$( diff <(echo "$segments_stm") <(echo "$segments_ctm" ) | grep "^<" | sed "s/^< *//g") 17 | ( 18 | for elem in $pp ; do 19 | echo "$elem 1 0 0 EMPTY_RECOGNIZED_PHRASE" 20 | done 21 | ) >> $ctmfile 22 | echo "FIXED CTM FILE" 23 | exit 0 24 | elif [ "$segments_stm_count" -lt "$segments_ctm_count" ] ; then 25 | echo "Segment STM count: $segments_stm_count" 26 | echo "Segment CTM count: $segments_ctm_count" 27 | echo "FAILURE FIXING CTM FILE" 28 | exit 1 29 | else 30 | exit 0 31 | fi 32 | 33 | -------------------------------------------------------------------------------- /MLLM/tools/kaldi/utils/lang/bpe/prepend_words.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | # This script, prepend '|' to every words in the transcript to mark 4 | # the beginning of the words for finding the initial-space of every word 5 | # after decoding. 
6 | 7 | import sys 8 | import io 9 | import re 10 | 11 | whitespace = re.compile("[ \t]+") 12 | infile = io.TextIOWrapper(sys.stdin.buffer, encoding='latin-1') 13 | output = io.TextIOWrapper(sys.stdout.buffer, encoding='latin-1') 14 | for line in infile: 15 | words = whitespace.split(line.strip(" \t\r\n")) 16 | output.write(' '.join([ "|"+word for word in words]) + '\n') 17 | -------------------------------------------------------------------------------- /MLLM/tools/kaldi/utils/lang/bpe/reverse.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | # This script, reverse all latin and digits sequences 5 | # (including words like MP3) to put them in the right order in the images. 6 | 7 | import re, os, sys, io 8 | 9 | in_stream = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8') 10 | out_stream = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8') 11 | for line in in_stream: 12 | out_stream.write(re.sub(r'[a-zA-Z0-9][a-zA-Z0-9\s\.\:]*[a-zA-Z0-9]', 13 | lambda m:m.group(0)[::-1], line)) 14 | -------------------------------------------------------------------------------- /MLLM/tools/kaldi/utils/make_absolute.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # This script replaces the command readlink -f (which is not portable). 4 | # It turns a pathname into an absolute pathname, including following soft links. 5 | target_file=$1 6 | 7 | cd $(dirname $target_file) 8 | target_file=$(basename "$target_file") 9 | 10 | # Iterate down a (possible) chain of symlinks 11 | while [ -L "$target_file" ]; do 12 | target_file=$(readlink $target_file) 13 | cd $(dirname $target_file) 14 | target_file=$(basename $target_file) 15 | done 16 | 17 | # Compute the canonicalized name by finding the physical path 18 | # for the directory we're in and appending the target file. 
19 | phys_dir=$(pwd -P) 20 | result=$phys_dir/$target_file 21 | echo $result 22 | -------------------------------------------------------------------------------- /MLLM/tools/kaldi/utils/require_argument.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # To be sourced by another script 4 | 5 | set -euo pipefail 6 | 7 | if [ $# -ne 1 ]; then 8 | echo "Usage: $0 " >&2 9 | echo " e.g.: $0 --data-dir" >&2 10 | fi 11 | 12 | key=$1 13 | 14 | name=$(sed -e s/^--// -e s/-/_/g <<< "$key") 15 | 16 | if eval '[ -z "$'$name'" ]'; then 17 | echo "$0: option $key is required" >&2 18 | echo >&2 19 | echo "$help_message" >&2 20 | exit 1 21 | fi 22 | 23 | -------------------------------------------------------------------------------- /MLLM/tools/kaldi/utils/require_argument_all.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # To be sourced by another script 4 | 5 | for i in $@; do 6 | . utils/require_argument.sh $i 7 | done 8 | 9 | -------------------------------------------------------------------------------- /MLLM/tools/kaldi/utils/s2eps.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | # Copyright 2010-2011 Microsoft Corporation 3 | 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 11 | # KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED 12 | # WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, 13 | # MERCHANTABLITY OR NON-INFRINGEMENT. 14 | # See the Apache 2 License for the specific language governing permissions and 15 | # limitations under the License. 
16 | 17 | # This script replaces <s> and </s> with <eps> (on both input and output sides), 18 | # for the G.fst acceptor. 19 | 20 | while(<>){ 21 | @A = split(" ", $_); 22 | if ( @A >= 4 ) { 23 | if ($A[2] eq "<s>" || $A[2] eq "</s>") { $A[2] = "<eps>"; } 24 | if ($A[3] eq "<s>" || $A[3] eq "</s>") { $A[3] = "<eps>"; } 25 | } 26 | print join("\t", @A) . "\n"; 27 | } 28 | -------------------------------------------------------------------------------- /MLLM/tools/kaldi/utils/shuffle_list.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | 3 | # Copyright 2013 Johns Hopkins University (author: Daniel Povey) 4 | 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 12 | # KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED 13 | # WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, 14 | # MERCHANTABLITY OR NON-INFRINGEMENT. 15 | # See the Apache 2 License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | 19 | if ($ARGV[0] eq "--srand") { 20 | $n = $ARGV[1]; 21 | $n =~ m/\d+/ || die "Bad argument to --srand option: \"$n\""; 22 | srand($ARGV[1]); 23 | shift; 24 | shift; 25 | } else { 26 | srand(0); # Gives inconsistent behavior if we don't seed. 27 | } 28 | 29 | if (@ARGV > 1 || $ARGV[0] =~ m/^-.+/) { # >1 args, or an option we 30 | # don't understand. 
31 | print "Usage: shuffle_list.pl [--srand N] [input file] > output\n"; 32 | print "randomizes the order of lines of input.\n"; 33 | exit(1); 34 | } 35 | 36 | @lines; 37 | while (<>) { 38 | push @lines, [ (rand(), $_)] ; 39 | } 40 | 41 | @lines = sort { $a->[0] cmp $b->[0] } @lines; 42 | foreach $l (@lines) { 43 | print $l->[1]; 44 | } 45 | -------------------------------------------------------------------------------- /MLLM/tools/kaldi/utils/spk2utt_to_utt2spk.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | # Copyright 2010-2011 Microsoft Corporation 3 | 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 11 | # KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED 12 | # WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, 13 | # MERCHANTABLITY OR NON-INFRINGEMENT. 14 | # See the Apache 2 License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | 18 | while(<>){ 19 | @A = split(" ", $_); 20 | @A > 1 || die "Invalid line in spk2utt file: $_"; 21 | $s = shift @A; 22 | foreach $u ( @A ) { 23 | print "$u $s\n"; 24 | } 25 | } 26 | 27 | 28 | -------------------------------------------------------------------------------- /MLLM/tools/kaldi/utils/summarize_warnings.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | 3 | # Copyright 2012 Johns Hopkins University (Author: Daniel Povey). Apache 2.0. 4 | 5 | @ARGV != 1 && print STDERR "Usage: summarize_warnings.pl \n" && exit 1; 6 | 7 | $dir = $ARGV[0]; 8 | 9 | ! 
-d $dir && print STDERR "summarize_warnings.pl: no such directory $dir\n" && exit 1; 10 | 11 | $dir =~ s:/$::; # Remove trailing slash. 12 | 13 | 14 | # Group the files into categories where all have the same base-name. 15 | foreach $f (glob ("$dir/*.log")) { 16 | $f_category = $f; 17 | # do next expression twice; s///g doesn't work as they overlap. 18 | $f_category =~ s:\.\d+\.:.*.:; 19 | $f_category =~ s:\.\d+\.:.*.:; 20 | $fmap{$f_category} .= " $f"; 21 | } 22 | 23 | sub split_hundreds { # split list of filenames into groups of 100. 24 | my $names = shift @_; 25 | my @A = split(" ", $names); 26 | my @ans = (); 27 | while (@A > 0) { 28 | my $group = ""; 29 | for ($x = 0; $x < 100 && @A>0; $x++) { 30 | $fname = pop @A; 31 | $group .= "$fname "; 32 | } 33 | push @ans, $group; 34 | } 35 | return @ans; 36 | } 37 | 38 | foreach $c (keys %fmap) { 39 | $n = 0; 40 | foreach $fgroup (split_hundreds($fmap{$c})) { 41 | $n += `grep -w WARNING $fgroup | wc -l`; 42 | } 43 | if ($n != 0) { 44 | print "$n warnings in $c\n" 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /MLLM/tools/kaldi/utils/utt2spk_to_spk2utt.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | # Copyright 2010-2011 Microsoft Corporation 3 | 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 11 | # KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED 12 | # WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, 13 | # MERCHANTABLITY OR NON-INFRINGEMENT. 14 | # See the Apache 2 License for the specific language governing permissions and 15 | # limitations under the License. 
16 | 17 | # converts an utt2spk file to a spk2utt file. 18 | # Takes input from the stdin or from a file argument; 19 | # output goes to the standard out. 20 | 21 | if ( @ARGV > 1 ) { 22 | die "Usage: utt2spk_to_spk2utt.pl [ utt2spk ] > spk2utt"; 23 | } 24 | 25 | while(<>){ 26 | @A = split(" ", $_); 27 | @A == 2 || die "Invalid line in utt2spk file: $_"; 28 | ($u,$s) = @A; 29 | if(!$seen_spk{$s}) { 30 | $seen_spk{$s} = 1; 31 | push @spklist, $s; 32 | } 33 | push (@{$spk_hash{$s}}, "$u"); 34 | } 35 | foreach $s (@spklist) { 36 | $l = join(' ',@{$spk_hash{$s}}); 37 | print "$s $l\n"; 38 | } 39 | -------------------------------------------------------------------------------- /MLLM/tools/tokenizer/MimiCodec/__pycache__/mimi_tokenizer.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/MLLM/tools/tokenizer/MimiCodec/__pycache__/mimi_tokenizer.cpython-310.pyc -------------------------------------------------------------------------------- /MLLM/tools/tokenizer/MimiCodec/__pycache__/mimi_tokenizer.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/MLLM/tools/tokenizer/MimiCodec/__pycache__/mimi_tokenizer.cpython-312.pyc -------------------------------------------------------------------------------- /MLLM/tools/tokenizer/MimiCodec/mimi_config.yaml: -------------------------------------------------------------------------------- 1 | generator: 2 | name: MimiCodec 3 | config: 4 | encoder_rates: [8, 6, 5, 4] 5 | codebook_size: 2048 6 | codebook_dim: 256 7 | rvq_layers: 8 -------------------------------------------------------------------------------- /MLLM/tools/tokenizer/MimiCodec/model/models/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/MLLM/tools/tokenizer/MimiCodec/model/models/__init__.py -------------------------------------------------------------------------------- /MLLM/tools/tokenizer/MimiCodec/model/models/__pycache__/MimiCodec.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/MLLM/tools/tokenizer/MimiCodec/model/models/__pycache__/MimiCodec.cpython-312.pyc -------------------------------------------------------------------------------- /MLLM/tools/tokenizer/MimiCodec/model/models/__pycache__/__init__.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/MLLM/tools/tokenizer/MimiCodec/model/models/__pycache__/__init__.cpython-312.pyc -------------------------------------------------------------------------------- /MLLM/tools/tokenizer/MimiCodec/model/modules/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/MLLM/tools/tokenizer/MimiCodec/model/modules/__init__.py -------------------------------------------------------------------------------- /MLLM/tools/tokenizer/MimiCodec/model/modules/__pycache__/__init__.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/MLLM/tools/tokenizer/MimiCodec/model/modules/__pycache__/__init__.cpython-312.pyc -------------------------------------------------------------------------------- /MLLM/tools/tokenizer/MimiCodec/model/modules/__pycache__/conv.cpython-312.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/MLLM/tools/tokenizer/MimiCodec/model/modules/__pycache__/conv.cpython-312.pyc -------------------------------------------------------------------------------- /MLLM/tools/tokenizer/MimiCodec/model/modules/__pycache__/gating.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/MLLM/tools/tokenizer/MimiCodec/model/modules/__pycache__/gating.cpython-312.pyc -------------------------------------------------------------------------------- /MLLM/tools/tokenizer/MimiCodec/model/modules/__pycache__/resample.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/MLLM/tools/tokenizer/MimiCodec/model/modules/__pycache__/resample.cpython-312.pyc -------------------------------------------------------------------------------- /MLLM/tools/tokenizer/MimiCodec/model/modules/__pycache__/rope.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/MLLM/tools/tokenizer/MimiCodec/model/modules/__pycache__/rope.cpython-312.pyc -------------------------------------------------------------------------------- /MLLM/tools/tokenizer/MimiCodec/model/modules/__pycache__/seanet.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/MLLM/tools/tokenizer/MimiCodec/model/modules/__pycache__/seanet.cpython-312.pyc 
-------------------------------------------------------------------------------- /MLLM/tools/tokenizer/MimiCodec/model/modules/__pycache__/streaming.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/MLLM/tools/tokenizer/MimiCodec/model/modules/__pycache__/streaming.cpython-312.pyc -------------------------------------------------------------------------------- /MLLM/tools/tokenizer/MimiCodec/model/modules/__pycache__/transformer.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/MLLM/tools/tokenizer/MimiCodec/model/modules/__pycache__/transformer.cpython-312.pyc -------------------------------------------------------------------------------- /MLLM/tools/tokenizer/MimiCodec/model/quantization/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Kyutai, all rights reserved. 2 | # This source code is licensed under the license found in the 3 | # LICENSE file in the root directory of this source tree. 4 | 5 | # Copyright (c) Meta Platforms, Inc. and affiliates. 6 | # All rights reserved. 7 | # 8 | # This source code is licensed under the license found in the 9 | # LICENSE file in the root directory of this source tree. 
10 | """RVQ.""" 11 | # flake8: noqa 12 | from .vq import ResidualVectorQuantizer, SplitResidualVectorQuantizer 13 | from .base import BaseQuantizer, DummyQuantizer, QuantizedResult 14 | -------------------------------------------------------------------------------- /MLLM/tools/tokenizer/MimiCodec/model/quantization/__pycache__/__init__.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/MLLM/tools/tokenizer/MimiCodec/model/quantization/__pycache__/__init__.cpython-312.pyc -------------------------------------------------------------------------------- /MLLM/tools/tokenizer/MimiCodec/model/quantization/__pycache__/base.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/MLLM/tools/tokenizer/MimiCodec/model/quantization/__pycache__/base.cpython-312.pyc -------------------------------------------------------------------------------- /MLLM/tools/tokenizer/MimiCodec/model/quantization/__pycache__/core_vq.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/MLLM/tools/tokenizer/MimiCodec/model/quantization/__pycache__/core_vq.cpython-312.pyc -------------------------------------------------------------------------------- /MLLM/tools/tokenizer/MimiCodec/model/quantization/__pycache__/vq.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/MLLM/tools/tokenizer/MimiCodec/model/quantization/__pycache__/vq.cpython-312.pyc -------------------------------------------------------------------------------- 
/MLLM/tools/tokenizer/MimiCodec/model/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/MLLM/tools/tokenizer/MimiCodec/model/utils/__init__.py -------------------------------------------------------------------------------- /MLLM/tools/tokenizer/MimiCodec/model/utils/__pycache__/__init__.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/MLLM/tools/tokenizer/MimiCodec/model/utils/__pycache__/__init__.cpython-312.pyc -------------------------------------------------------------------------------- /MLLM/tools/tokenizer/MimiCodec/model/utils/__pycache__/compile.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/MLLM/tools/tokenizer/MimiCodec/model/utils/__pycache__/compile.cpython-312.pyc -------------------------------------------------------------------------------- /MLLM/tools/tokenizer/Text2ID/__pycache__/moshi_text_tokenizer.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/MLLM/tools/tokenizer/Text2ID/__pycache__/moshi_text_tokenizer.cpython-312.pyc -------------------------------------------------------------------------------- /MLLM/tools/tokenizer/Text2ID/__pycache__/text_tokenizer.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/MLLM/tools/tokenizer/Text2ID/__pycache__/text_tokenizer.cpython-38.pyc 
-------------------------------------------------------------------------------- /MLLM/tools/tokenizer/__pycache__/abs_tokenizer.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/MLLM/tools/tokenizer/__pycache__/abs_tokenizer.cpython-310.pyc -------------------------------------------------------------------------------- /MLLM/tools/tokenizer/__pycache__/abs_tokenizer.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/MLLM/tools/tokenizer/__pycache__/abs_tokenizer.cpython-312.pyc -------------------------------------------------------------------------------- /MLLM/tools/tokenizer/__pycache__/abs_tokenizer.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/MLLM/tools/tokenizer/__pycache__/abs_tokenizer.cpython-38.pyc -------------------------------------------------------------------------------- /MLLM/tools/tokenizer/__pycache__/common.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/MLLM/tools/tokenizer/__pycache__/common.cpython-38.pyc -------------------------------------------------------------------------------- /MLLM/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Kyutai, all rights reserved. 2 | # This source code is licensed under the license found in the 3 | # LICENSE file in the root directory of this source tree. 4 | 5 | # Copyright (c) Meta Platforms, Inc. and affiliates. 6 | # All rights reserved. 
7 | # 8 | # This source code is licensed under the license found in the 9 | # LICENSE file in the root directory of this source tree. 10 | """Utilities.""" 11 | -------------------------------------------------------------------------------- /MLLM/utils/__pycache__/__init__.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/MLLM/utils/__pycache__/__init__.cpython-310.pyc -------------------------------------------------------------------------------- /MLLM/utils/__pycache__/__init__.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/MLLM/utils/__pycache__/__init__.cpython-312.pyc -------------------------------------------------------------------------------- /MLLM/utils/__pycache__/abs_scheduler.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/MLLM/utils/__pycache__/abs_scheduler.cpython-312.pyc -------------------------------------------------------------------------------- /MLLM/utils/__pycache__/arguments.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/MLLM/utils/__pycache__/arguments.cpython-312.pyc -------------------------------------------------------------------------------- /MLLM/utils/__pycache__/compile.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/MLLM/utils/__pycache__/compile.cpython-312.pyc 
-------------------------------------------------------------------------------- /MLLM/utils/__pycache__/dataloader.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/MLLM/utils/__pycache__/dataloader.cpython-310.pyc -------------------------------------------------------------------------------- /MLLM/utils/__pycache__/dataloader.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/MLLM/utils/__pycache__/dataloader.cpython-312.pyc -------------------------------------------------------------------------------- /MLLM/utils/__pycache__/reporter.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/MLLM/utils/__pycache__/reporter.cpython-312.pyc -------------------------------------------------------------------------------- /MLLM/utils/__pycache__/sampling.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/MLLM/utils/__pycache__/sampling.cpython-312.pyc -------------------------------------------------------------------------------- /MLLM/utils/__pycache__/task_definition.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/MLLM/utils/__pycache__/task_definition.cpython-312.pyc -------------------------------------------------------------------------------- /MLLM/utils/__pycache__/train_utils.cpython-312.pyc: 
"""Write an index of the source ``.wav`` files to ``wav.scp``.

Every file matching ``/home-dongchao/data/source/*.wav`` produces one line of
the form ``<basename> <full path>`` in the output file.
"""
import os
import glob

names = glob.glob("/home-dongchao/data/source/*.wav")

# Use a context manager so the index is flushed and closed even if a write
# fails, and pin the encoding so the result does not depend on the locale
# (filter_scp.py reads/writes its scp files as UTF-8 as well).
with open('/home-dongchao/code3/RSTnet_private/MLLM/egs/extract_tokens/wav.scp',
          'w', encoding='utf-8') as f:
    for name in names:
        bs_name = os.path.basename(name)
        f.write(f"{bs_name} {name}\n")
"""Filter an scp-style file by the utterance ids listed in a reference file.

Usage: ``filter_scp.py <ref_file> <in_file> [<out_file>]``

Both inputs are whitespace-separated text files whose first field is the
utterance id.  For every id of ``ref_file`` that also occurs in ``in_file``
the line ``<id> <rest of the in_file entry>`` is emitted, in the order of
``ref_file``.  Output goes to ``out_file`` when it is given, otherwise to
stdout.
"""
import sys

ref_f = sys.argv[1]
in_f = sys.argv[2]

# The third argument is optional.  Test for it explicitly instead of relying
# on a bare ``except`` (which also swallowed KeyboardInterrupt and hid real
# errors).  If the file cannot be opened we still fall back to stdout, as the
# original did, but say so on stderr rather than failing silently.
writer = None
if len(sys.argv) > 3:
    try:
        writer = open(sys.argv[3], 'w', encoding='utf-8')
    except OSError as err:
        print(f"warning: cannot open {sys.argv[3]} ({err}); writing to stdout",
              file=sys.stderr)
stream_out = writer is None

# output is in the order of ref_f
ref = []
with open(ref_f, encoding='utf-8') as ref_file:
    for line in ref_file:
        elems = line.split()
        if elems:  # skip blank lines instead of raising IndexError
            ref.append(elems[0])

# Map utterance id -> remaining fields re-joined with single spaces; a later
# duplicate id overrides an earlier one.
in_dic = {}
with open(in_f, encoding='utf-8') as in_file:
    for line in in_file:
        elems = line.split()
        if not elems:  # skip blank lines
            continue
        in_dic[elems[0]] = " ".join(elems[1:])

try:
    for e in ref:
        if e in in_dic:
            if stream_out:
                print(f"{e} {in_dic[e]}")
            else:
                writer.write(f"{e} {in_dic[e]}\n")
finally:
    # Make sure buffered output reaches the disk.
    if writer is not None:
        writer.close()
-------------------------------------------------------------------------------- 1 | ## Main idea 2 | 3 | 1. Preprocess the dataset 4 | 5 | First, prepare the environment following [Emilia](https://github.com/open-mmlab/Amphion/tree/main/preprocessors/Emilia). The Emilia code is in `data_scripts/emilia`. 6 | 7 | Then, modify the paths in `prepare_broadcast_data.sh` and run it. The dataloader will yield a token sequence of shape [B, 9, t_text+t_audio+2]: 8 | ``` 9 | <|begin_of_text|>[] ··· [] <|text_emply_token|>··· 10 | ···<|semantic_emply_token|> 11 | <|semantic_emply_token|>··· 12 | ``` 13 | 14 | 2. Pre-training 15 | 16 | 3. Post-training 17 | 18 | 4. Inference 19 | 20 | -------------------------------------------------------------------------------- /MLLM_v2/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/MLLM_v2/models/__init__.py -------------------------------------------------------------------------------- /MLLM_v2/models/__pycache__/__init__.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/MLLM_v2/models/__pycache__/__init__.cpython-310.pyc -------------------------------------------------------------------------------- /MLLM_v2/models/__pycache__/__init__.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/MLLM_v2/models/__pycache__/__init__.cpython-312.pyc -------------------------------------------------------------------------------- /MLLM_v2/models/__pycache__/__init__.cpython-39.pyc: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/MLLM_v2/models/__pycache__/__init__.cpython-39.pyc -------------------------------------------------------------------------------- /MLLM_v2/models/__pycache__/config.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/MLLM_v2/models/__pycache__/config.cpython-310.pyc -------------------------------------------------------------------------------- /MLLM_v2/models/__pycache__/config.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/MLLM_v2/models/__pycache__/config.cpython-312.pyc -------------------------------------------------------------------------------- /MLLM_v2/models/__pycache__/config.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/MLLM_v2/models/__pycache__/config.cpython-39.pyc -------------------------------------------------------------------------------- /MLLM_v2/models/__pycache__/lit_model.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/MLLM_v2/models/__pycache__/lit_model.cpython-310.pyc -------------------------------------------------------------------------------- /MLLM_v2/models/__pycache__/lit_model.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/MLLM_v2/models/__pycache__/lit_model.cpython-312.pyc 
-------------------------------------------------------------------------------- /MLLM_v2/models/__pycache__/lit_model.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/MLLM_v2/models/__pycache__/lit_model.cpython-39.pyc -------------------------------------------------------------------------------- /MLLM_v2/models/__pycache__/llama_streaming.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/MLLM_v2/models/__pycache__/llama_streaming.cpython-310.pyc -------------------------------------------------------------------------------- /MLLM_v2/models/__pycache__/llama_streaming.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/MLLM_v2/models/__pycache__/llama_streaming.cpython-312.pyc -------------------------------------------------------------------------------- /MLLM_v2/models/__pycache__/llama_streaming.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/MLLM_v2/models/__pycache__/llama_streaming.cpython-39.pyc -------------------------------------------------------------------------------- /MLLM_v2/models/__pycache__/llama_streaming_lora.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/MLLM_v2/models/__pycache__/llama_streaming_lora.cpython-310.pyc -------------------------------------------------------------------------------- 
/MLLM_v2/models/__pycache__/mlp.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/MLLM_v2/models/__pycache__/mlp.cpython-310.pyc -------------------------------------------------------------------------------- /MLLM_v2/models/__pycache__/mlp.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/MLLM_v2/models/__pycache__/mlp.cpython-312.pyc -------------------------------------------------------------------------------- /MLLM_v2/models/__pycache__/mlp.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/MLLM_v2/models/__pycache__/mlp.cpython-39.pyc -------------------------------------------------------------------------------- /MLLM_v2/models/__pycache__/model.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/MLLM_v2/models/__pycache__/model.cpython-310.pyc -------------------------------------------------------------------------------- /MLLM_v2/models/__pycache__/model.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/MLLM_v2/models/__pycache__/model.cpython-312.pyc -------------------------------------------------------------------------------- /MLLM_v2/models/mlp.py: -------------------------------------------------------------------------------- 1 | import math 2 | from dataclasses import dataclass 3 | from typing import Any, Dict, List, Optional, Tuple, Type, Union 
4 | 5 | import torch 6 | import torch.nn as nn 7 | from torch.nn import functional as F 8 | from typing_extensions import Self 9 | from models import lit_model 10 | from models.config import Config as BaseConfig 11 | 12 | 13 | -------------------------------------------------------------------------------- /MLLM_v2/modules/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Kyutai, all rights reserved. 2 | # This source code is licensed under the license found in the 3 | # LICENSE file in the root directory of this source tree. 4 | 5 | # Copyright (c) Meta Platforms, Inc. and affiliates. 6 | # All rights reserved. 7 | # 8 | # This source code is licensed under the license found in the 9 | # LICENSE file in the root directory of this source tree. 10 | """Modules used for building the models.""" 11 | 12 | # flake8: noqa 13 | from modules.conv import ( 14 | NormConv1d, 15 | NormConvTranspose1d, 16 | StreamingConv1d, 17 | StreamingConvTranspose1d, 18 | pad_for_conv1d, 19 | pad1d, 20 | unpad1d, 21 | ) 22 | from modules.seanet import SEANetEncoder, SEANetDecoder 23 | from modules.transformer import StreamingTransformer 24 | -------------------------------------------------------------------------------- /MLLM_v2/modules/__pycache__/__init__.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/MLLM_v2/modules/__pycache__/__init__.cpython-312.pyc -------------------------------------------------------------------------------- /MLLM_v2/modules/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/MLLM_v2/modules/__pycache__/__init__.cpython-38.pyc 
-------------------------------------------------------------------------------- /MLLM_v2/modules/__pycache__/conv.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/MLLM_v2/modules/__pycache__/conv.cpython-312.pyc -------------------------------------------------------------------------------- /MLLM_v2/modules/__pycache__/conv.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/MLLM_v2/modules/__pycache__/conv.cpython-38.pyc -------------------------------------------------------------------------------- /MLLM_v2/modules/__pycache__/gating.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/MLLM_v2/modules/__pycache__/gating.cpython-312.pyc -------------------------------------------------------------------------------- /MLLM_v2/modules/__pycache__/resample.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/MLLM_v2/modules/__pycache__/resample.cpython-312.pyc -------------------------------------------------------------------------------- /MLLM_v2/modules/__pycache__/rope.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/MLLM_v2/modules/__pycache__/rope.cpython-312.pyc -------------------------------------------------------------------------------- /MLLM_v2/modules/__pycache__/seanet.cpython-312.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/MLLM_v2/modules/__pycache__/seanet.cpython-312.pyc -------------------------------------------------------------------------------- /MLLM_v2/modules/__pycache__/streaming.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/MLLM_v2/modules/__pycache__/streaming.cpython-312.pyc -------------------------------------------------------------------------------- /MLLM_v2/modules/__pycache__/streaming.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/MLLM_v2/modules/__pycache__/streaming.cpython-38.pyc -------------------------------------------------------------------------------- /MLLM_v2/modules/__pycache__/transformer.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/MLLM_v2/modules/__pycache__/transformer.cpython-312.pyc -------------------------------------------------------------------------------- /MLLM_v2/modules/__pycache__/transformer_lora.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/MLLM_v2/modules/__pycache__/transformer_lora.cpython-312.pyc -------------------------------------------------------------------------------- /MLLM_v2/moshi/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Kyutai, all rights reserved. 
2 | # This source code is licensed under the license found in the 3 | # LICENSE file in the root directory of this source tree. 4 | 5 | """ 6 | moshi is the inference codebase for Kyutai audio generation models. 7 | 8 | The code has been adapted from Audiocraft, see LICENSE.audiocraft 9 | Copyright (c) Meta Platforms, Inc. and affiliates. 10 | """ 11 | 12 | # flake8: noqa 13 | from . import utils 14 | from . import modules 15 | from . import models 16 | from . import quantization 17 | 18 | __version__ = "0.1.0" 19 | -------------------------------------------------------------------------------- /MLLM_v2/moshi/__pycache__/__init__.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/MLLM_v2/moshi/__pycache__/__init__.cpython-312.pyc -------------------------------------------------------------------------------- /MLLM_v2/moshi/models/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Kyutai, all rights reserved. 2 | # This source code is licensed under the license found in the 3 | # LICENSE file in the root directory of this source tree. 
4 | """ 5 | Models for the compression model Moshi, 6 | """ 7 | 8 | # flake8: noqa 9 | from moshi.models.compression import ( 10 | CompressionModel, 11 | MimiModel, 12 | ) 13 | from moshi.models.lm import LMModel, LMGen 14 | from moshi.models.loaders import get_mimi, get_moshi_lm 15 | -------------------------------------------------------------------------------- /MLLM_v2/moshi/models/__pycache__/__init__.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/MLLM_v2/moshi/models/__pycache__/__init__.cpython-312.pyc -------------------------------------------------------------------------------- /MLLM_v2/moshi/models/__pycache__/compression.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/MLLM_v2/moshi/models/__pycache__/compression.cpython-312.pyc -------------------------------------------------------------------------------- /MLLM_v2/moshi/models/__pycache__/lm.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/MLLM_v2/moshi/models/__pycache__/lm.cpython-312.pyc -------------------------------------------------------------------------------- /MLLM_v2/moshi/models/__pycache__/loaders.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/MLLM_v2/moshi/models/__pycache__/loaders.cpython-312.pyc -------------------------------------------------------------------------------- /MLLM_v2/moshi/modules/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 
Kyutai, all rights reserved. 2 | # This source code is licensed under the license found in the 3 | # LICENSE file in the root directory of this source tree. 4 | 5 | # Copyright (c) Meta Platforms, Inc. and affiliates. 6 | # All rights reserved. 7 | # 8 | # This source code is licensed under the license found in the 9 | # LICENSE file in the root directory of this source tree. 10 | """Modules used for building the models.""" 11 | 12 | # flake8: noqa 13 | from moshi.modules.conv import ( 14 | NormConv1d, 15 | NormConvTranspose1d, 16 | StreamingConv1d, 17 | StreamingConvTranspose1d, 18 | pad_for_conv1d, 19 | pad1d, 20 | unpad1d, 21 | ) 22 | from moshi.modules.seanet import SEANetEncoder, SEANetDecoder 23 | from moshi.modules.transformer import StreamingTransformer 24 | -------------------------------------------------------------------------------- /MLLM_v2/moshi/modules/__pycache__/__init__.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/MLLM_v2/moshi/modules/__pycache__/__init__.cpython-312.pyc -------------------------------------------------------------------------------- /MLLM_v2/moshi/modules/__pycache__/conv.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/MLLM_v2/moshi/modules/__pycache__/conv.cpython-312.pyc -------------------------------------------------------------------------------- /MLLM_v2/moshi/modules/__pycache__/gating.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/MLLM_v2/moshi/modules/__pycache__/gating.cpython-312.pyc -------------------------------------------------------------------------------- 
/MLLM_v2/moshi/modules/__pycache__/resample.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/MLLM_v2/moshi/modules/__pycache__/resample.cpython-312.pyc -------------------------------------------------------------------------------- /MLLM_v2/moshi/modules/__pycache__/rope.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/MLLM_v2/moshi/modules/__pycache__/rope.cpython-312.pyc -------------------------------------------------------------------------------- /MLLM_v2/moshi/modules/__pycache__/seanet.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/MLLM_v2/moshi/modules/__pycache__/seanet.cpython-312.pyc -------------------------------------------------------------------------------- /MLLM_v2/moshi/modules/__pycache__/streaming.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/MLLM_v2/moshi/modules/__pycache__/streaming.cpython-312.pyc -------------------------------------------------------------------------------- /MLLM_v2/moshi/modules/__pycache__/transformer.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/MLLM_v2/moshi/modules/__pycache__/transformer.cpython-312.pyc -------------------------------------------------------------------------------- /MLLM_v2/moshi/quantization/__init__.py: 
-------------------------------------------------------------------------------- 1 | # Copyright (c) Kyutai, all rights reserved. 2 | # This source code is licensed under the license found in the 3 | # LICENSE file in the root directory of this source tree. 4 | 5 | # Copyright (c) Meta Platforms, Inc. and affiliates. 6 | # All rights reserved. 7 | # 8 | # This source code is licensed under the license found in the 9 | # LICENSE file in the root directory of this source tree. 10 | """RVQ.""" 11 | # flake8: noqa 12 | from .vq import ResidualVectorQuantizer, SplitResidualVectorQuantizer 13 | from .base import BaseQuantizer, DummyQuantizer, QuantizedResult 14 | -------------------------------------------------------------------------------- /MLLM_v2/moshi/quantization/__pycache__/__init__.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/MLLM_v2/moshi/quantization/__pycache__/__init__.cpython-312.pyc -------------------------------------------------------------------------------- /MLLM_v2/moshi/quantization/__pycache__/base.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/MLLM_v2/moshi/quantization/__pycache__/base.cpython-312.pyc -------------------------------------------------------------------------------- /MLLM_v2/moshi/quantization/__pycache__/core_vq.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/MLLM_v2/moshi/quantization/__pycache__/core_vq.cpython-312.pyc -------------------------------------------------------------------------------- /MLLM_v2/moshi/quantization/__pycache__/vq.cpython-312.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/MLLM_v2/moshi/quantization/__pycache__/vq.cpython-312.pyc -------------------------------------------------------------------------------- /MLLM_v2/moshi/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Kyutai, all rights reserved. 2 | # This source code is licensed under the license found in the 3 | # LICENSE file in the root directory of this source tree. 4 | 5 | # Copyright (c) Meta Platforms, Inc. and affiliates. 6 | # All rights reserved. 7 | # 8 | # This source code is licensed under the license found in the 9 | # LICENSE file in the root directory of this source tree. 10 | """Utilities.""" 11 | -------------------------------------------------------------------------------- /MLLM_v2/moshi/utils/__pycache__/__init__.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/MLLM_v2/moshi/utils/__pycache__/__init__.cpython-312.pyc -------------------------------------------------------------------------------- /MLLM_v2/moshi/utils/__pycache__/compile.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/MLLM_v2/moshi/utils/__pycache__/compile.cpython-312.pyc -------------------------------------------------------------------------------- /MLLM_v2/moshi/utils/__pycache__/sampling.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/MLLM_v2/moshi/utils/__pycache__/sampling.cpython-312.pyc 
-------------------------------------------------------------------------------- /MLLM_v2/readme.md: -------------------------------------------------------------------------------- 1 | ## Multi-modal LLM (speech-text foundation models) 2 | In this part, we provide the training details of speech-text foundation models. 3 | We provide a moshi-style pre-training code. 4 | 5 | ## How to start it? 6 | 7 | ### Step 0: refer to litgpt https://github.com/Lightning-AI/litgpt/ to download the desired LLM checkpoints 8 | 9 | ### Step 1: refer to egs/pretraining, and check the extract_token.sh for data preprocessing 10 | 11 | ### Step 2: refer to egs/pretraining, and check the run.sh for model pre-training 12 | 13 | ### Step 3: refer to egs/pretraining, and check the infer.sh for inference 14 | 15 | 16 | -------------------------------------------------------------------------------- /MLLM_v2/tools/data_scripts/filter_scp.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | ref_f = sys.argv[1] 4 | in_f = sys.argv[2] 5 | try: 6 | writer = open(sys.argv[3], 'w', encoding='utf-8') 7 | stream_out = False 8 | except: 9 | stream_out = True 10 | 11 | # output is in the order of ref_f 12 | ref = [] 13 | for line in open(ref_f, encoding='utf-8'): 14 | uttid = line.strip().split()[0] 15 | ref.append(uttid) 16 | 17 | in_dic = {} 18 | for line in open(in_f, encoding='utf-8'): 19 | elems = line.strip().split() 20 | uttid = elems[0] 21 | ctx = " ".join(elems[1:]) 22 | in_dic[uttid] = ctx 23 | 24 | for e in ref: 25 | if e in in_dic: 26 | if stream_out: 27 | print(f"{e} {in_dic[e]}") 28 | else: 29 | writer.write(f"{e} {in_dic[e]}\n") 30 | -------------------------------------------------------------------------------- /MLLM_v2/tools/data_scripts/select_spk2utt.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import argparse 3 | 4 | def get_parser(): 5 | parser = 
argparse.ArgumentParser( 6 | description="Revise the spk2utt file: it only contans a subset of the utts", 7 | formatter_class=argparse.ArgumentDefaultsHelpFormatter, 8 | ) 9 | parser.add_argument("--in-spk2utt", type=str, help="original spk2utt file") 10 | parser.add_argument("--out-spk2utt", type=str, help="revised spk2utt file") 11 | parser.add_argument("--subset-list", type=str, help="list of utt subset") 12 | return parser 13 | 14 | def main(args): 15 | args = get_parser().parse_args(args) 16 | 17 | utts = open(args.subset_list).readlines() 18 | utts = [line.strip().split()[0] for line in utts] 19 | utts = {x: None for x in utts} 20 | 21 | writer = open(args.out_spk2utt, 'w') 22 | for line in open(args.in_spk2utt): 23 | line = line.strip().split() 24 | spk_id, spk_utts = line[0], line[1:] 25 | spk_utts = [utt for utt in spk_utts if utt in utts] 26 | 27 | out_str = " ".join([spk_id] + spk_utts) 28 | writer.write(out_str + "\n") 29 | 30 | if __name__ == "__main__": 31 | main(sys.argv[1:]) 32 | -------------------------------------------------------------------------------- /MLLM_v2/tools/kaldi/utils/build_const_arpa_lm.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Copyright 2014 Guoguo Chen 4 | # Apache 2.0 5 | 6 | # This script reads in an Arpa format language model, and converts it into the 7 | # ConstArpaLm format language model. 8 | 9 | # begin configuration section 10 | # end configuration section 11 | 12 | [ -f path.sh ] && . ./path.sh; 13 | 14 | . 
utils/parse_options.sh 15 | 16 | if [ $# != 3 ]; then 17 | echo "Usage: " 18 | echo " $0 [options] " 19 | echo "e.g.:" 20 | echo " $0 data/local/lm/3-gram.full.arpa.gz data/lang/ data/lang_test_tgmed" 21 | echo "Options" 22 | exit 1; 23 | fi 24 | 25 | export LC_ALL=C 26 | 27 | arpa_lm=$1 28 | old_lang=$2 29 | new_lang=$3 30 | 31 | mkdir -p $new_lang 32 | 33 | mkdir -p $new_lang 34 | cp -r $old_lang/* $new_lang 35 | 36 | unk=`cat $old_lang/oov.int` 37 | bos=`grep "^\s" $old_lang/words.txt | awk '{print $2}'` 38 | eos=`grep "^\s" $old_lang/words.txt | awk '{print $2}'` 39 | if [[ -z $bos || -z $eos ]]; then 40 | echo "$0: and symbols are not in $old_lang/words.txt" 41 | exit 1 42 | fi 43 | if [[ -z $unk ]]; then 44 | echo "$0: can't find oov symbol id in $old_lang/oov.int" 45 | exit 1 46 | fi 47 | 48 | 49 | arpa-to-const-arpa --bos-symbol=$bos \ 50 | --eos-symbol=$eos --unk-symbol=$unk \ 51 | "gunzip -c $arpa_lm | utils/map_arpa_lm.pl $new_lang/words.txt|" $new_lang/G.carpa || exit 1; 52 | 53 | exit 0; 54 | -------------------------------------------------------------------------------- /MLLM_v2/tools/kaldi/utils/build_kenlm_model_from_arpa.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # 2020 author Jiayu DU 3 | # Apache 2.0 4 | 5 | # This script reads in an Arpa format language model, and converts it into the 6 | # KenLM format language model. 7 | 8 | [ -f path.sh ] && . ./path.sh; 9 | 10 | # begin configuration section 11 | kenlm_opts="" # e.g. "-q 8 -b 8" for 8bits quantization 12 | model_type="trie" # "trie" or "probing". trie is smaller, probing is faster. 13 | # end configuration section 14 | 15 | . 
utils/parse_options.sh 16 | 17 | if [ $# != 2 ]; then 18 | echo "Usage: " 19 | echo " $0 [options] " 20 | echo "e.g.:" 21 | echo " $0 data/local/lm/4gram.arpa data/lang_test/G.trie" 22 | echo "Options:" 23 | echo " --model-type can be either \"trie\" or \"probing\"" 24 | echo " --kenlm-opts directly pass through to kenlm" 25 | echo " e.g. for 8bits quantization, feed \"-q 8 -b 8\"" 26 | exit 1; 27 | fi 28 | 29 | export LC_ALL=C 30 | 31 | arpa_lm=$1 32 | kenlm=$2 33 | 34 | if ! which build_binary >& /dev/null ; then 35 | echo "$0: cannot find KenLM's build_binary tool," 36 | echo "check kenlm installation (tools/extras/install_kenlm_query_only.sh)." 37 | exit 1 38 | fi 39 | 40 | mkdir -p $(dirname $kenlm) 41 | build_binary $kenlm_opts $model_type $arpa_lm $kenlm 42 | 43 | echo "$0: Successfully built arpa into kenlm format: $kenlm" 44 | exit 0 45 | -------------------------------------------------------------------------------- /MLLM_v2/tools/kaldi/utils/ctm/fix_ctm.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | 3 | stmfile=$1 4 | ctmfile=$2 5 | 6 | segments_stm=`cat $stmfile | cut -f 1 -d ' ' | sort -u` 7 | segments_ctm=`cat $ctmfile | cut -f 1 -d ' ' | sort -u` 8 | 9 | segments_stm_count=`echo "$segments_stm" | wc -l ` 10 | segments_ctm_count=`echo "$segments_ctm" | wc -l ` 11 | 12 | #echo $segments_stm_count 13 | #echo $segments_ctm_count 14 | 15 | if [ "$segments_stm_count" -gt "$segments_ctm_count" ] ; then 16 | pp=$( diff <(echo "$segments_stm") <(echo "$segments_ctm" ) | grep "^<" | sed "s/^< *//g") 17 | ( 18 | for elem in $pp ; do 19 | echo "$elem 1 0 0 EMPTY_RECOGNIZED_PHRASE" 20 | done 21 | ) >> $ctmfile 22 | echo "FIXED CTM FILE" 23 | exit 0 24 | elif [ "$segments_stm_count" -lt "$segments_ctm_count" ] ; then 25 | echo "Segment STM count: $segments_stm_count" 26 | echo "Segment CTM count: $segments_ctm_count" 27 | echo "FAILURE FIXING CTM FILE" 28 | exit 1 29 | else 30 | exit 0 31 | fi 32 | 33 | -------------------------------------------------------------------------------- /MLLM_v2/tools/kaldi/utils/data/get_num_frames.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # This script works out the approximate number of frames in a training directory. 4 | # This is sometimes needed by higher-level scripts 5 | 6 | 7 | if [ -f path.sh ]; then . ./path.sh; fi 8 | . parse_options.sh || exit 1; 9 | 10 | if [ $# -ne 1 ]; then 11 | ( 12 | echo "Usage: $0 " 13 | echo "Prints the number of frames of data in the data-dir" 14 | ) 1>&2 15 | fi 16 | 17 | data=$1 18 | 19 | if [ ! 
-f $data/utt2dur ]; then 20 | utils/data/get_utt2dur.sh $data 1>&2 || exit 1 21 | fi 22 | 23 | frame_shift=$(utils/data/get_frame_shift.sh $data) || exit 1 24 | 25 | awk -v s=$frame_shift '{n += $2} END{printf("%.0f\n", (n / s))}' <$data/utt2dur 26 | -------------------------------------------------------------------------------- /MLLM_v2/tools/kaldi/utils/data/get_reco2utt_for_data.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | # Copyright 2016 Vimal Manohar 4 | # Apache 2.0 5 | 6 | if [ $# -ne 1 ]; then 7 | echo "This script outputs a mapping from recording to a list of utterances " 8 | echo "corresponding to the recording. It is analogous to the content of " 9 | echo "a spk2utt file, but is indexed by recording instead of speaker." 10 | echo "Usage: get_reco2utt.sh " 11 | echo " e.g.: get_reco2utt.sh data/train" 12 | exit 1 13 | fi 14 | 15 | data=$1 16 | 17 | if [ ! -s $data/segments ]; then 18 | utils/data/get_segments_for_data.sh $data > $data/segments 19 | fi 20 | 21 | cut -d ' ' -f 1,2 $data/segments | utils/utt2spk_to_spk2utt.pl 22 | -------------------------------------------------------------------------------- /MLLM_v2/tools/kaldi/utils/data/get_segments_for_data.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # This script operates on a data directory, such as in data/train/, 4 | # and writes new segments to stdout. The file 'segments' maps from 5 | # utterance to time offsets into a recording, with the format: 6 | # 7 | # This script assumes utterance and recording ids are the same (i.e., that 8 | # wav.scp is indexed by utterance), and uses durations from 'utt2dur', 9 | # created if necessary by get_utt2dur.sh. 10 | 11 | . ./path.sh 12 | 13 | if [ $# != 1 ]; then 14 | echo "Usage: $0 [options] " 15 | echo "e.g.:" 16 | echo " $0 data/train > data/train/segments" 17 | exit 1 18 | fi 19 | 20 | data=$1 21 | 22 | if [ ! 
-s $data/utt2dur ]; then 23 | utils/data/get_utt2dur.sh $data 1>&2 || exit 1; 24 | fi 25 | 26 | # 0 27 | awk '{ print $1, $1, 0, $2 }' $data/utt2dur 28 | 29 | exit 0 30 | -------------------------------------------------------------------------------- /MLLM_v2/tools/kaldi/utils/data/get_utt2num_frames.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | # Copyright 2016 Vimal Manohar 4 | # Apache 2.0. 5 | 6 | cmd=run.pl 7 | nj=4 8 | 9 | frame_shift=0.01 10 | frame_overlap=0.015 11 | 12 | . utils/parse_options.sh 13 | . ./path.sh 14 | 15 | if [ $# -ne 1 ]; then 16 | echo "This script writes a file utt2num_frames with the " 17 | echo "number of frames in each utterance as measured based on the " 18 | echo "duration of the utterances (in utt2dur) and the specified " 19 | echo "frame_shift and frame_overlap." 20 | echo "Usage: $0 " 21 | exit 1 22 | fi 23 | 24 | data=$1 25 | 26 | if [ -s $data/utt2num_frames ]; then 27 | echo "$0: $data/utt2num_frames already present!" 28 | exit 0; 29 | fi 30 | 31 | if [ ! -f $data/feats.scp ]; then 32 | utils/data/get_utt2dur.sh --nj ${nj} --cmd "$cmd" $data 33 | awk -v fs=$frame_shift -v fovlp=$frame_overlap \ 34 | '{print $1" "int( ($2 - fovlp) / fs)}' $data/utt2dur > $data/utt2num_frames 35 | exit 0 36 | fi 37 | 38 | utils/split_data.sh --per-utt $data $nj || exit 1 39 | $cmd JOB=1:$nj $data/log/get_utt2num_frames.JOB.log \ 40 | feat-to-len scp:$data/split${nj}utt/JOB/feats.scp ark,t:$data/split${nj}utt/JOB/utt2num_frames || exit 1 41 | 42 | for n in `seq $nj`; do 43 | cat $data/split${nj}utt/$n/utt2num_frames 44 | done > $data/utt2num_frames 45 | 46 | echo "$0: Computed and wrote $data/utt2num_frames" 47 | -------------------------------------------------------------------------------- /MLLM_v2/tools/kaldi/utils/data/resample_data_dir.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | 3 | # Copyright 2016 Vimal Manohar 4 | # 2018 Xiaohui Zhang 5 | # Apache 2.0. 6 | 7 | if [ $# -ne 2 ]; then 8 | echo "This script adds a sox line in wav.scp to resample the audio at a " 9 | echo "different sampling-rate" 10 | echo "Usage: $0 " 11 | echo " e.g.: $0 8000 data/dev" 12 | exit 1 13 | fi 14 | 15 | freq=$1 16 | dir=$2 17 | 18 | sox=`which sox` || { echo "Could not find sox in PATH"; exit 1; } 19 | 20 | if [ -f $dir/feats.scp ]; then 21 | mkdir -p $dir/.backup 22 | mv $dir/feats.scp $dir/.backup/ 23 | if [ -f $dir/cmvn.scp ]; then 24 | mv $dir/cmvn.scp $dir/.backup/ 25 | fi 26 | echo "$0: feats.scp already exists. Moving it to $dir/.backup" 27 | fi 28 | 29 | # After resampling we cannot compute utt2dur from wav.scp any more, 30 | # so we create utt2dur now, in case it's needed later 31 | if [ ! -s $dir/utt2dur ]; then 32 | utils/data/get_utt2dur.sh $dir 1>&2 || exit 1; 33 | fi 34 | 35 | mv $dir/wav.scp $dir/wav.scp.tmp 36 | cat $dir/wav.scp.tmp | python -c "import sys 37 | for line in sys.stdin.readlines(): 38 | splits = line.strip().split() 39 | if splits[-1] == '|': 40 | out_line = line.strip() + ' $sox -t wav - -c 1 -b 16 -t wav - rate $freq |' 41 | else: 42 | out_line = '{0} cat {1} | $sox -t wav - -c 1 -b 16 -t wav - rate $freq |'.format(splits[0], ' '.join(splits[1:])) 43 | print (out_line)" > ${dir}/wav.scp 44 | rm $dir/wav.scp.tmp 45 | -------------------------------------------------------------------------------- /MLLM_v2/tools/kaldi/utils/eps2disambig.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | # Copyright 2010-2011 Microsoft Corporation 3 | # 2015 Guoguo Chen 4 | 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 
7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 12 | # KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED 13 | # WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, 14 | # MERCHANTABLITY OR NON-INFRINGEMENT. 15 | # See the Apache 2 License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | # This script replaces epsilon with #0 on the input side only, of the G.fst 19 | # acceptor. 20 | 21 | while(<>){ 22 | if (/\s+#0\s+/) { 23 | print STDERR "$0: ERROR: LM has word #0, " . 24 | "which is reserved as disambiguation symbol\n"; 25 | exit 1; 26 | } 27 | s:^(\d+\s+\d+\s+)\(\s+):$1#0$2:; 28 | print; 29 | } 30 | -------------------------------------------------------------------------------- /MLLM_v2/tools/kaldi/utils/filt.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Apache 2.0 4 | 5 | from __future__ import print_function 6 | import sys 7 | 8 | vocab=set() 9 | with open(sys.argv[1]) as vocabfile: 10 | for line in vocabfile: 11 | vocab.add(line.strip()) 12 | 13 | with open(sys.argv[2]) as textfile: 14 | for line in textfile: 15 | print(" ".join([word if word in vocab else '' for word in line.strip().split()])) 16 | -------------------------------------------------------------------------------- /MLLM_v2/tools/kaldi/utils/fix_ctm.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | 3 | stmfile=$1 4 | ctmfile=$2 5 | 6 | segments_stm=`cat $stmfile | cut -f 1 -d ' ' | sort -u` 7 | segments_ctm=`cat $ctmfile | cut -f 1 -d ' ' | sort -u` 8 | 9 | segments_stm_count=`echo "$segments_stm" | wc -l ` 10 | segments_ctm_count=`echo "$segments_ctm" | wc -l ` 11 | 12 | #echo $segments_stm_count 13 | #echo $segments_ctm_count 14 | 15 | if [ "$segments_stm_count" -gt "$segments_ctm_count" ] ; then 16 | pp=$( diff <(echo "$segments_stm") <(echo "$segments_ctm" ) | grep "^<" | sed "s/^< *//g") 17 | ( 18 | for elem in $pp ; do 19 | echo "$elem 1 0 0 EMPTY_RECOGNIZED_PHRASE" 20 | done 21 | ) >> $ctmfile 22 | echo "FIXED CTM FILE" 23 | exit 0 24 | elif [ "$segments_stm_count" -lt "$segments_ctm_count" ] ; then 25 | echo "Segment STM count: $segments_stm_count" 26 | echo "Segment CTM count: $segments_ctm_count" 27 | echo "FAILURE FIXING CTM FILE" 28 | exit 1 29 | else 30 | exit 0 31 | fi 32 | 33 | -------------------------------------------------------------------------------- /MLLM_v2/tools/kaldi/utils/lang/bpe/prepend_words.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | # This script, prepend '|' to every words in the transcript to mark 4 | # the beginning of the words for finding the initial-space of every word 5 | # after decoding. 
6 | 7 | import sys 8 | import io 9 | import re 10 | 11 | whitespace = re.compile("[ \t]+") 12 | infile = io.TextIOWrapper(sys.stdin.buffer, encoding='latin-1') 13 | output = io.TextIOWrapper(sys.stdout.buffer, encoding='latin-1') 14 | for line in infile: 15 | words = whitespace.split(line.strip(" \t\r\n")) 16 | output.write(' '.join([ "|"+word for word in words]) + '\n') 17 | -------------------------------------------------------------------------------- /MLLM_v2/tools/kaldi/utils/lang/bpe/reverse.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | # This script, reverse all latin and digits sequences 5 | # (including words like MP3) to put them in the right order in the images. 6 | 7 | import re, os, sys, io 8 | 9 | in_stream = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8') 10 | out_stream = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8') 11 | for line in in_stream: 12 | out_stream.write(re.sub(r'[a-zA-Z0-9][a-zA-Z0-9\s\.\:]*[a-zA-Z0-9]', 13 | lambda m:m.group(0)[::-1], line)) 14 | -------------------------------------------------------------------------------- /MLLM_v2/tools/kaldi/utils/make_absolute.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # This script replaces the command readlink -f (which is not portable). 4 | # It turns a pathname into an absolute pathname, including following soft links. 5 | target_file=$1 6 | 7 | cd $(dirname $target_file) 8 | target_file=$(basename "$target_file") 9 | 10 | # Iterate down a (possible) chain of symlinks 11 | while [ -L "$target_file" ]; do 12 | target_file=$(readlink $target_file) 13 | cd $(dirname $target_file) 14 | target_file=$(basename $target_file) 15 | done 16 | 17 | # Compute the canonicalized name by finding the physical path 18 | # for the directory we're in and appending the target file. 
19 | phys_dir=$(pwd -P) 20 | result=$phys_dir/$target_file 21 | echo $result 22 | -------------------------------------------------------------------------------- /MLLM_v2/tools/kaldi/utils/require_argument.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # To be sourced by another script 4 | 5 | set -euo pipefail 6 | 7 | if [ $# -ne 1 ]; then 8 | echo "Usage: $0 " >&2 9 | echo " e.g.: $0 --data-dir" >&2 10 | fi 11 | 12 | key=$1 13 | 14 | name=$(sed -e s/^--// -e s/-/_/g <<< "$key") 15 | 16 | if eval '[ -z "$'$name'" ]'; then 17 | echo "$0: option $key is required" >&2 18 | echo >&2 19 | echo "$help_message" >&2 20 | exit 1 21 | fi 22 | 23 | -------------------------------------------------------------------------------- /MLLM_v2/tools/kaldi/utils/require_argument_all.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # To be sourced by another script 4 | 5 | for i in $@; do 6 | . utils/require_argument.sh $i 7 | done 8 | 9 | -------------------------------------------------------------------------------- /MLLM_v2/tools/kaldi/utils/s2eps.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | # Copyright 2010-2011 Microsoft Corporation 3 | 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 11 | # KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED 12 | # WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, 13 | # MERCHANTABLITY OR NON-INFRINGEMENT. 14 | # See the Apache 2 License for the specific language governing permissions and 15 | # limitations under the License. 
16 | 17 | # This script replaces and with (on both input and output sides), 18 | # for the G.fst acceptor. 19 | 20 | while(<>){ 21 | @A = split(" ", $_); 22 | if ( @A >= 4 ) { 23 | if ($A[2] eq "" || $A[2] eq "") { $A[2] = ""; } 24 | if ($A[3] eq "" || $A[3] eq "") { $A[3] = ""; } 25 | } 26 | print join("\t", @A) . "\n"; 27 | } 28 | -------------------------------------------------------------------------------- /MLLM_v2/tools/kaldi/utils/shuffle_list.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | 3 | # Copyright 2013 Johns Hopkins University (author: Daniel Povey) 4 | 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 12 | # KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED 13 | # WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, 14 | # MERCHANTABLITY OR NON-INFRINGEMENT. 15 | # See the Apache 2 License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | 19 | if ($ARGV[0] eq "--srand") { 20 | $n = $ARGV[1]; 21 | $n =~ m/\d+/ || die "Bad argument to --srand option: \"$n\""; 22 | srand($ARGV[1]); 23 | shift; 24 | shift; 25 | } else { 26 | srand(0); # Gives inconsistent behavior if we don't seed. 27 | } 28 | 29 | if (@ARGV > 1 || $ARGV[0] =~ m/^-.+/) { # >1 args, or an option we 30 | # don't understand. 
31 | print "Usage: shuffle_list.pl [--srand N] [input file] > output\n"; 32 | print "randomizes the order of lines of input.\n"; 33 | exit(1); 34 | } 35 | 36 | @lines; 37 | while (<>) { 38 | push @lines, [ (rand(), $_)] ; 39 | } 40 | 41 | @lines = sort { $a->[0] cmp $b->[0] } @lines; 42 | foreach $l (@lines) { 43 | print $l->[1]; 44 | } 45 | -------------------------------------------------------------------------------- /MLLM_v2/tools/kaldi/utils/spk2utt_to_utt2spk.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | # Copyright 2010-2011 Microsoft Corporation 3 | 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 11 | # KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED 12 | # WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, 13 | # MERCHANTABLITY OR NON-INFRINGEMENT. 14 | # See the Apache 2 License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | 18 | while(<>){ 19 | @A = split(" ", $_); 20 | @A > 1 || die "Invalid line in spk2utt file: $_"; 21 | $s = shift @A; 22 | foreach $u ( @A ) { 23 | print "$u $s\n"; 24 | } 25 | } 26 | 27 | 28 | -------------------------------------------------------------------------------- /MLLM_v2/tools/kaldi/utils/summarize_warnings.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | 3 | # Copyright 2012 Johns Hopkins University (Author: Daniel Povey). Apache 2.0. 4 | 5 | @ARGV != 1 && print STDERR "Usage: summarize_warnings.pl \n" && exit 1; 6 | 7 | $dir = $ARGV[0]; 8 | 9 | ! 
-d $dir && print STDERR "summarize_warnings.pl: no such directory $dir\n" && exit 1; 10 | 11 | $dir =~ s:/$::; # Remove trailing slash. 12 | 13 | 14 | # Group the files into categories where all have the same base-name. 15 | foreach $f (glob ("$dir/*.log")) { 16 | $f_category = $f; 17 | # do next expression twice; s///g doesn't work as they overlap. 18 | $f_category =~ s:\.\d+\.:.*.:; 19 | $f_category =~ s:\.\d+\.:.*.:; 20 | $fmap{$f_category} .= " $f"; 21 | } 22 | 23 | sub split_hundreds { # split list of filenames into groups of 100. 24 | my $names = shift @_; 25 | my @A = split(" ", $names); 26 | my @ans = (); 27 | while (@A > 0) { 28 | my $group = ""; 29 | for ($x = 0; $x < 100 && @A>0; $x++) { 30 | $fname = pop @A; 31 | $group .= "$fname "; 32 | } 33 | push @ans, $group; 34 | } 35 | return @ans; 36 | } 37 | 38 | foreach $c (keys %fmap) { 39 | $n = 0; 40 | foreach $fgroup (split_hundreds($fmap{$c})) { 41 | $n += `grep -w WARNING $fgroup | wc -l`; 42 | } 43 | if ($n != 0) { 44 | print "$n warnings in $c\n" 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /MLLM_v2/tools/kaldi/utils/utt2spk_to_spk2utt.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | # Copyright 2010-2011 Microsoft Corporation 3 | 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 11 | # KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED 12 | # WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, 13 | # MERCHANTABLITY OR NON-INFRINGEMENT. 14 | # See the Apache 2 License for the specific language governing permissions and 15 | # limitations under the License. 
# Converts an utt2spk file to a spk2utt file.
# Input:  lines of "<utterance> <speaker>", from the file given as the single
#         optional argument or from stdin.
# Output: one "<speaker> <utt1> <utt2> ..." line per speaker on stdout.
#         Speakers appear in order of first occurrence; utterances keep their
#         input order.

die "Usage: utt2spk_to_spk2utt.pl [ utt2spk ] > spk2utt" if @ARGV > 1;

my @spklist;    # speakers in first-seen order
my %spk_hash;   # speaker -> array ref of that speaker's utterances

while (<>) {
  my @fields = split(" ", $_);
  die "Invalid line in utt2spk file: $_" unless @fields == 2;
  my ($u, $s) = @fields;

  # A speaker with no entry yet is being seen for the first time.
  push @spklist, $s unless exists $spk_hash{$s};
  push @{ $spk_hash{$s} }, $u;
}

for my $s (@spklist) {
  my $l = join(' ', @{ $spk_hash{$s} });
  print "$s $l\n";
}
https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/MLLM_v2/tools/tokenizer/GLM4V/cosyvoice/__pycache__/__init__.cpython-312.pyc -------------------------------------------------------------------------------- /MLLM_v2/tools/tokenizer/GLM4V/cosyvoice/cli/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/MLLM_v2/tools/tokenizer/GLM4V/cosyvoice/cli/__init__.py -------------------------------------------------------------------------------- /MLLM_v2/tools/tokenizer/GLM4V/cosyvoice/dataset/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/MLLM_v2/tools/tokenizer/GLM4V/cosyvoice/dataset/__init__.py -------------------------------------------------------------------------------- /MLLM_v2/tools/tokenizer/GLM4V/cosyvoice/flow/__pycache__/decoder.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/MLLM_v2/tools/tokenizer/GLM4V/cosyvoice/flow/__pycache__/decoder.cpython-310.pyc -------------------------------------------------------------------------------- /MLLM_v2/tools/tokenizer/GLM4V/cosyvoice/flow/__pycache__/decoder.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/MLLM_v2/tools/tokenizer/GLM4V/cosyvoice/flow/__pycache__/decoder.cpython-312.pyc -------------------------------------------------------------------------------- /MLLM_v2/tools/tokenizer/GLM4V/cosyvoice/flow/__pycache__/flow.cpython-310.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/MLLM_v2/tools/tokenizer/GLM4V/cosyvoice/flow/__pycache__/flow.cpython-310.pyc -------------------------------------------------------------------------------- /MLLM_v2/tools/tokenizer/GLM4V/cosyvoice/flow/__pycache__/flow.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/MLLM_v2/tools/tokenizer/GLM4V/cosyvoice/flow/__pycache__/flow.cpython-312.pyc -------------------------------------------------------------------------------- /MLLM_v2/tools/tokenizer/GLM4V/cosyvoice/flow/__pycache__/flow_matching.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/MLLM_v2/tools/tokenizer/GLM4V/cosyvoice/flow/__pycache__/flow_matching.cpython-310.pyc -------------------------------------------------------------------------------- /MLLM_v2/tools/tokenizer/GLM4V/cosyvoice/flow/__pycache__/flow_matching.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/MLLM_v2/tools/tokenizer/GLM4V/cosyvoice/flow/__pycache__/flow_matching.cpython-312.pyc -------------------------------------------------------------------------------- /MLLM_v2/tools/tokenizer/GLM4V/cosyvoice/flow/__pycache__/length_regulator.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/MLLM_v2/tools/tokenizer/GLM4V/cosyvoice/flow/__pycache__/length_regulator.cpython-310.pyc 
-------------------------------------------------------------------------------- /MLLM_v2/tools/tokenizer/GLM4V/cosyvoice/flow/__pycache__/length_regulator.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/MLLM_v2/tools/tokenizer/GLM4V/cosyvoice/flow/__pycache__/length_regulator.cpython-312.pyc -------------------------------------------------------------------------------- /MLLM_v2/tools/tokenizer/GLM4V/cosyvoice/hifigan/__pycache__/f0_predictor.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/MLLM_v2/tools/tokenizer/GLM4V/cosyvoice/hifigan/__pycache__/f0_predictor.cpython-310.pyc -------------------------------------------------------------------------------- /MLLM_v2/tools/tokenizer/GLM4V/cosyvoice/hifigan/__pycache__/f0_predictor.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/MLLM_v2/tools/tokenizer/GLM4V/cosyvoice/hifigan/__pycache__/f0_predictor.cpython-312.pyc -------------------------------------------------------------------------------- /MLLM_v2/tools/tokenizer/GLM4V/cosyvoice/hifigan/__pycache__/generator.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/MLLM_v2/tools/tokenizer/GLM4V/cosyvoice/hifigan/__pycache__/generator.cpython-310.pyc -------------------------------------------------------------------------------- /MLLM_v2/tools/tokenizer/GLM4V/cosyvoice/hifigan/__pycache__/generator.cpython-312.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/MLLM_v2/tools/tokenizer/GLM4V/cosyvoice/hifigan/__pycache__/generator.cpython-312.pyc -------------------------------------------------------------------------------- /MLLM_v2/tools/tokenizer/GLM4V/cosyvoice/llm/__pycache__/llm.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/MLLM_v2/tools/tokenizer/GLM4V/cosyvoice/llm/__pycache__/llm.cpython-310.pyc -------------------------------------------------------------------------------- /MLLM_v2/tools/tokenizer/GLM4V/cosyvoice/llm/__pycache__/llm.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/MLLM_v2/tools/tokenizer/GLM4V/cosyvoice/llm/__pycache__/llm.cpython-312.pyc -------------------------------------------------------------------------------- /MLLM_v2/tools/tokenizer/GLM4V/cosyvoice/transformer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/MLLM_v2/tools/tokenizer/GLM4V/cosyvoice/transformer/__init__.py -------------------------------------------------------------------------------- /MLLM_v2/tools/tokenizer/GLM4V/cosyvoice/transformer/__pycache__/__init__.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/MLLM_v2/tools/tokenizer/GLM4V/cosyvoice/transformer/__pycache__/__init__.cpython-310.pyc -------------------------------------------------------------------------------- /MLLM_v2/tools/tokenizer/GLM4V/cosyvoice/transformer/__pycache__/__init__.cpython-312.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/MLLM_v2/tools/tokenizer/GLM4V/cosyvoice/transformer/__pycache__/__init__.cpython-312.pyc -------------------------------------------------------------------------------- /MLLM_v2/tools/tokenizer/GLM4V/cosyvoice/transformer/__pycache__/activation.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/MLLM_v2/tools/tokenizer/GLM4V/cosyvoice/transformer/__pycache__/activation.cpython-310.pyc -------------------------------------------------------------------------------- /MLLM_v2/tools/tokenizer/GLM4V/cosyvoice/transformer/__pycache__/activation.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/MLLM_v2/tools/tokenizer/GLM4V/cosyvoice/transformer/__pycache__/activation.cpython-312.pyc -------------------------------------------------------------------------------- /MLLM_v2/tools/tokenizer/GLM4V/cosyvoice/transformer/__pycache__/attention.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/MLLM_v2/tools/tokenizer/GLM4V/cosyvoice/transformer/__pycache__/attention.cpython-310.pyc -------------------------------------------------------------------------------- /MLLM_v2/tools/tokenizer/GLM4V/cosyvoice/transformer/__pycache__/attention.cpython-312.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/MLLM_v2/tools/tokenizer/GLM4V/cosyvoice/transformer/__pycache__/attention.cpython-312.pyc -------------------------------------------------------------------------------- /MLLM_v2/tools/tokenizer/GLM4V/cosyvoice/transformer/__pycache__/convolution.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/MLLM_v2/tools/tokenizer/GLM4V/cosyvoice/transformer/__pycache__/convolution.cpython-310.pyc -------------------------------------------------------------------------------- /MLLM_v2/tools/tokenizer/GLM4V/cosyvoice/transformer/__pycache__/convolution.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/MLLM_v2/tools/tokenizer/GLM4V/cosyvoice/transformer/__pycache__/convolution.cpython-312.pyc -------------------------------------------------------------------------------- /MLLM_v2/tools/tokenizer/GLM4V/cosyvoice/transformer/__pycache__/embedding.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/MLLM_v2/tools/tokenizer/GLM4V/cosyvoice/transformer/__pycache__/embedding.cpython-310.pyc -------------------------------------------------------------------------------- /MLLM_v2/tools/tokenizer/GLM4V/cosyvoice/transformer/__pycache__/embedding.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/MLLM_v2/tools/tokenizer/GLM4V/cosyvoice/transformer/__pycache__/embedding.cpython-312.pyc 
-------------------------------------------------------------------------------- /MLLM_v2/tools/tokenizer/GLM4V/cosyvoice/transformer/__pycache__/encoder.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/MLLM_v2/tools/tokenizer/GLM4V/cosyvoice/transformer/__pycache__/encoder.cpython-310.pyc -------------------------------------------------------------------------------- /MLLM_v2/tools/tokenizer/GLM4V/cosyvoice/transformer/__pycache__/encoder.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/MLLM_v2/tools/tokenizer/GLM4V/cosyvoice/transformer/__pycache__/encoder.cpython-312.pyc -------------------------------------------------------------------------------- /MLLM_v2/tools/tokenizer/GLM4V/cosyvoice/transformer/__pycache__/encoder_layer.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/MLLM_v2/tools/tokenizer/GLM4V/cosyvoice/transformer/__pycache__/encoder_layer.cpython-310.pyc -------------------------------------------------------------------------------- /MLLM_v2/tools/tokenizer/GLM4V/cosyvoice/transformer/__pycache__/encoder_layer.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/MLLM_v2/tools/tokenizer/GLM4V/cosyvoice/transformer/__pycache__/encoder_layer.cpython-312.pyc -------------------------------------------------------------------------------- /MLLM_v2/tools/tokenizer/GLM4V/cosyvoice/transformer/__pycache__/label_smoothing_loss.cpython-310.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/MLLM_v2/tools/tokenizer/GLM4V/cosyvoice/transformer/__pycache__/label_smoothing_loss.cpython-310.pyc -------------------------------------------------------------------------------- /MLLM_v2/tools/tokenizer/GLM4V/cosyvoice/transformer/__pycache__/label_smoothing_loss.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/MLLM_v2/tools/tokenizer/GLM4V/cosyvoice/transformer/__pycache__/label_smoothing_loss.cpython-312.pyc -------------------------------------------------------------------------------- /MLLM_v2/tools/tokenizer/GLM4V/cosyvoice/transformer/__pycache__/positionwise_feed_forward.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/MLLM_v2/tools/tokenizer/GLM4V/cosyvoice/transformer/__pycache__/positionwise_feed_forward.cpython-310.pyc -------------------------------------------------------------------------------- /MLLM_v2/tools/tokenizer/GLM4V/cosyvoice/transformer/__pycache__/positionwise_feed_forward.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/MLLM_v2/tools/tokenizer/GLM4V/cosyvoice/transformer/__pycache__/positionwise_feed_forward.cpython-312.pyc -------------------------------------------------------------------------------- /MLLM_v2/tools/tokenizer/GLM4V/cosyvoice/transformer/__pycache__/subsampling.cpython-310.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/MLLM_v2/tools/tokenizer/GLM4V/cosyvoice/transformer/__pycache__/subsampling.cpython-310.pyc -------------------------------------------------------------------------------- /MLLM_v2/tools/tokenizer/GLM4V/cosyvoice/transformer/__pycache__/subsampling.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/MLLM_v2/tools/tokenizer/GLM4V/cosyvoice/transformer/__pycache__/subsampling.cpython-312.pyc -------------------------------------------------------------------------------- /MLLM_v2/tools/tokenizer/GLM4V/cosyvoice/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/MLLM_v2/tools/tokenizer/GLM4V/cosyvoice/utils/__init__.py -------------------------------------------------------------------------------- /MLLM_v2/tools/tokenizer/GLM4V/cosyvoice/utils/__pycache__/__init__.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/MLLM_v2/tools/tokenizer/GLM4V/cosyvoice/utils/__pycache__/__init__.cpython-310.pyc -------------------------------------------------------------------------------- /MLLM_v2/tools/tokenizer/GLM4V/cosyvoice/utils/__pycache__/__init__.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/MLLM_v2/tools/tokenizer/GLM4V/cosyvoice/utils/__pycache__/__init__.cpython-312.pyc -------------------------------------------------------------------------------- 
/MLLM_v2/tools/tokenizer/GLM4V/cosyvoice/utils/__pycache__/block_mask_util.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/MLLM_v2/tools/tokenizer/GLM4V/cosyvoice/utils/__pycache__/block_mask_util.cpython-310.pyc -------------------------------------------------------------------------------- /MLLM_v2/tools/tokenizer/GLM4V/cosyvoice/utils/__pycache__/block_mask_util.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/MLLM_v2/tools/tokenizer/GLM4V/cosyvoice/utils/__pycache__/block_mask_util.cpython-312.pyc -------------------------------------------------------------------------------- /MLLM_v2/tools/tokenizer/GLM4V/cosyvoice/utils/__pycache__/class_utils.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/MLLM_v2/tools/tokenizer/GLM4V/cosyvoice/utils/__pycache__/class_utils.cpython-310.pyc -------------------------------------------------------------------------------- /MLLM_v2/tools/tokenizer/GLM4V/cosyvoice/utils/__pycache__/class_utils.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/MLLM_v2/tools/tokenizer/GLM4V/cosyvoice/utils/__pycache__/class_utils.cpython-312.pyc -------------------------------------------------------------------------------- /MLLM_v2/tools/tokenizer/GLM4V/cosyvoice/utils/__pycache__/common.cpython-310.pyc: -------------------------------------------------------------------------------- 
def create_grid_mask(seq_length, trunck_length, fill_triangle):
    """Build a square block-wise ("grid") mask.

    Row/column indices are grouped into consecutive chunks of
    ``trunck_length``. Entry ``[i, j]`` is 1 whenever ``i`` and ``j`` fall in
    the same chunk. With ``fill_triangle`` the lower triangle, including the
    main diagonal, is set to 1 as well.

    Args:
        seq_length: Side length of the mask; must be positive.
        trunck_length: Chunk size; must be positive. The last chunk is
            shorter when ``seq_length`` is not a multiple of it.
        fill_triangle: If true, start from a lower-triangular mask of ones
            instead of an all-zero mask.

    Returns:
        A ``(seq_length, seq_length)`` floating-point tensor of zeros and
        ones.

    Raises:
        AssertionError: If ``seq_length`` is not positive.
        ValueError: If ``trunck_length`` is not positive.
    """
    assert seq_length > 0
    # Zero used to fail with a bare ZeroDivisionError and negative values
    # silently produced a meaningless mask, so reject both up front.
    if trunck_length <= 0:
        raise ValueError(
            f"trunck_length must be positive, got {trunck_length!r}"
        )

    # Base mask, built without taking any seen_length into account.
    if fill_triangle:
        # Lower triangle and main diagonal are all 1.
        mask = torch.tril(torch.ones(seq_length, seq_length))
    else:
        mask = torch.zeros(seq_length, seq_length)

    # Chunk number of every position; two positions share a chunk exactly
    # when their chunk numbers are equal, which marks the diagonal blocks.
    chunk_idx = torch.div(
        torch.arange(seq_length), trunck_length, rounding_mode="floor"
    )
    same_chunk = chunk_idx.unsqueeze(1) == chunk_idx.unsqueeze(0)
    mask[same_chunk] = 1

    return mask
18 | - [ ] Did you **run pre-commit hooks** with `pre-commit run -a` command? 19 | 20 | ## Did you have fun? 21 | 22 | Make sure you had fun coding 🙃 23 | -------------------------------------------------------------------------------- /MLLM_v2/tools/tokenizer/GLM4V/third_party/Matcha-TTS/.github/codecov.yml: -------------------------------------------------------------------------------- 1 | coverage: 2 | status: 3 | # measures overall project coverage 4 | project: 5 | default: 6 | threshold: 100% # how much decrease in coverage is needed to not consider success 7 | 8 | # measures PR or single commit coverage 9 | patch: 10 | default: 11 | threshold: 100% # how much decrease in coverage is needed to not consider success 12 | 13 | 14 | # project: off 15 | # patch: off 16 | -------------------------------------------------------------------------------- /MLLM_v2/tools/tokenizer/GLM4V/third_party/Matcha-TTS/.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | # To get started with Dependabot version updates, you'll need to specify which 2 | # package ecosystems to update and where the package manifests are located. 
3 | # Please see the documentation for all configuration options: 4 | # https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates 5 | 6 | version: 2 7 | updates: 8 | - package-ecosystem: "pip" # See documentation for possible values 9 | directory: "/" # Location of package manifests 10 | target-branch: "dev" 11 | schedule: 12 | interval: "daily" 13 | ignore: 14 | - dependency-name: "pytorch-lightning" 15 | update-types: ["version-update:semver-patch"] 16 | - dependency-name: "torchmetrics" 17 | update-types: ["version-update:semver-patch"] 18 | -------------------------------------------------------------------------------- /MLLM_v2/tools/tokenizer/GLM4V/third_party/Matcha-TTS/.github/release-drafter.yml: -------------------------------------------------------------------------------- 1 | name-template: "v$RESOLVED_VERSION" 2 | tag-template: "v$RESOLVED_VERSION" 3 | 4 | categories: 5 | - title: "🚀 Features" 6 | labels: 7 | - "feature" 8 | - "enhancement" 9 | - title: "🐛 Bug Fixes" 10 | labels: 11 | - "fix" 12 | - "bugfix" 13 | - "bug" 14 | - title: "🧹 Maintenance" 15 | labels: 16 | - "maintenance" 17 | - "dependencies" 18 | - "refactoring" 19 | - "cosmetic" 20 | - "chore" 21 | - title: "📝️ Documentation" 22 | labels: 23 | - "documentation" 24 | - "docs" 25 | 26 | change-template: "- $TITLE @$AUTHOR (#$NUMBER)" 27 | change-title-escapes: '\<*_&' # You can add # and @ to disable mentions 28 | 29 | version-resolver: 30 | major: 31 | labels: 32 | - "major" 33 | minor: 34 | labels: 35 | - "minor" 36 | patch: 37 | labels: 38 | - "patch" 39 | default: patch 40 | 41 | template: | 42 | ## Changes 43 | 44 | $CHANGES 45 | -------------------------------------------------------------------------------- /MLLM_v2/tools/tokenizer/GLM4V/third_party/Matcha-TTS/.project-root: -------------------------------------------------------------------------------- 1 | # this file is required for inferring the project root directory 2 | # do not 
delete 3 | -------------------------------------------------------------------------------- /MLLM_v2/tools/tokenizer/GLM4V/third_party/Matcha-TTS/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Shivam Mehta 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /MLLM_v2/tools/tokenizer/GLM4V/third_party/Matcha-TTS/MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.md 2 | include LICENSE.txt 3 | include requirements.*.txt 4 | include *.cff 5 | include requirements.txt 6 | include matcha/VERSION 7 | recursive-include matcha *.json 8 | recursive-include matcha *.html 9 | recursive-include matcha *.png 10 | recursive-include matcha *.md 11 | recursive-include matcha *.py 12 | recursive-include matcha *.pyx 13 | recursive-exclude tests * 14 | prune tests* 15 | -------------------------------------------------------------------------------- /MLLM_v2/tools/tokenizer/GLM4V/third_party/Matcha-TTS/Makefile: -------------------------------------------------------------------------------- 1 | 2 | help: ## Show help 3 | @grep -E '^[.a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-30s\033[0m %s\n", $$1, $$2}' 4 | 5 | clean: ## Clean autogenerated files 6 | rm -rf dist 7 | find . -type f -name "*.DS_Store" -ls -delete 8 | find . | grep -E "(__pycache__|\.pyc|\.pyo)" | xargs rm -rf 9 | find . | grep -E ".pytest_cache" | xargs rm -rf 10 | find . 
| grep -E ".ipynb_checkpoints" | xargs rm -rf 11 | rm -f .coverage 12 | 13 | clean-logs: ## Clean logs 14 | rm -rf logs/** 15 | 16 | create-package: ## Create wheel and tar gz 17 | rm -rf dist/ 18 | python setup.py bdist_wheel --plat-name=manylinux1_x86_64 19 | python setup.py sdist 20 | python -m twine upload dist/* --verbose --skip-existing 21 | 22 | format: ## Run pre-commit hooks 23 | pre-commit run -a 24 | 25 | sync: ## Merge changes from main branch to your current branch 26 | git pull 27 | git pull origin main 28 | 29 | test: ## Run not slow tests 30 | pytest -k "not slow" 31 | 32 | test-full: ## Run all tests 33 | pytest 34 | 35 | train-ljspeech: ## Train the model 36 | python matcha/train.py experiment=ljspeech 37 | 38 | train-ljspeech-min: ## Train the model with minimum memory 39 | python matcha/train.py experiment=ljspeech_min_memory 40 | 41 | start_app: ## Start the app 42 | python matcha/app.py 43 | -------------------------------------------------------------------------------- /MLLM_v2/tools/tokenizer/GLM4V/third_party/Matcha-TTS/configs/__init__.py: -------------------------------------------------------------------------------- 1 | # this file is needed here to include configs when building project as a package 2 | -------------------------------------------------------------------------------- /MLLM_v2/tools/tokenizer/GLM4V/third_party/Matcha-TTS/configs/callbacks/default.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - model_checkpoint.yaml 3 | - model_summary.yaml 4 | - rich_progress_bar.yaml 5 | - _self_ 6 | -------------------------------------------------------------------------------- /MLLM_v2/tools/tokenizer/GLM4V/third_party/Matcha-TTS/configs/callbacks/model_checkpoint.yaml: -------------------------------------------------------------------------------- 1 | # https://lightning.ai/docs/pytorch/stable/api/lightning.pytorch.callbacks.ModelCheckpoint.html 2 | 3 | model_checkpoint: 4 
| _target_: lightning.pytorch.callbacks.ModelCheckpoint 5 | dirpath: ${paths.output_dir}/checkpoints # directory to save the model file 6 | filename: checkpoint_{epoch:03d} # checkpoint filename 7 | monitor: epoch # name of the logged metric which determines when model is improving 8 | verbose: False # verbosity mode 9 | save_last: true # additionally always save an exact copy of the last checkpoint to a file last.ckpt 10 | save_top_k: 10 # save k best models (determined by above metric) 11 | mode: "max" # "max" means higher metric value is better, can be also "min" 12 | auto_insert_metric_name: True # when True, the checkpoints filenames will contain the metric name 13 | save_weights_only: False # if True, then only the model’s weights will be saved 14 | every_n_train_steps: null # number of training steps between checkpoints 15 | train_time_interval: null # checkpoints are monitored at the specified time interval 16 | every_n_epochs: 100 # number of epochs between checkpoints 17 | save_on_train_epoch_end: null # whether to run checkpointing at the end of the training epoch or the end of validation 18 | -------------------------------------------------------------------------------- /MLLM_v2/tools/tokenizer/GLM4V/third_party/Matcha-TTS/configs/callbacks/model_summary.yaml: -------------------------------------------------------------------------------- 1 | # https://lightning.ai/docs/pytorch/stable/api/lightning.pytorch.callbacks.RichModelSummary.html 2 | 3 | model_summary: 4 | _target_: lightning.pytorch.callbacks.RichModelSummary 5 | max_depth: 3 # the maximum depth of layer nesting that the summary will include 6 | -------------------------------------------------------------------------------- /MLLM_v2/tools/tokenizer/GLM4V/third_party/Matcha-TTS/configs/callbacks/none.yaml: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/MLLM_v2/tools/tokenizer/GLM4V/third_party/Matcha-TTS/configs/callbacks/none.yaml -------------------------------------------------------------------------------- /MLLM_v2/tools/tokenizer/GLM4V/third_party/Matcha-TTS/configs/callbacks/rich_progress_bar.yaml: -------------------------------------------------------------------------------- 1 | # https://lightning.ai/docs/pytorch/latest/api/lightning.pytorch.callbacks.RichProgressBar.html 2 | 3 | rich_progress_bar: 4 | _target_: lightning.pytorch.callbacks.RichProgressBar 5 | -------------------------------------------------------------------------------- /MLLM_v2/tools/tokenizer/GLM4V/third_party/Matcha-TTS/configs/data/hi-fi_en-US_female.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - ljspeech 3 | - _self_ 4 | 5 | # Dataset URL: https://ast-astrec.nict.go.jp/en/release/hi-fi-captain/ 6 | _target_: matcha.data.text_mel_datamodule.TextMelDataModule 7 | name: hi-fi_en-US_female 8 | train_filelist_path: data/filelists/hi-fi-captain-en-us-female_train.txt 9 | valid_filelist_path: data/filelists/hi-fi-captain-en-us-female_val.txt 10 | batch_size: 32 11 | cleaners: [english_cleaners_piper] 12 | data_statistics: # Computed for this dataset 13 | mel_mean: -6.38385 14 | mel_std: 2.541796 15 | -------------------------------------------------------------------------------- /MLLM_v2/tools/tokenizer/GLM4V/third_party/Matcha-TTS/configs/data/ljspeech.yaml: -------------------------------------------------------------------------------- 1 | _target_: matcha.data.text_mel_datamodule.TextMelDataModule 2 | name: ljspeech 3 | train_filelist_path: data/filelists/ljs_audio_text_train_filelist.txt 4 | valid_filelist_path: data/filelists/ljs_audio_text_val_filelist.txt 5 | batch_size: 32 6 | num_workers: 20 7 | pin_memory: True 8 | cleaners: [english_cleaners2] 9 | add_blank: True 10 | 
n_spks: 1 11 | n_fft: 1024 12 | n_feats: 80 13 | sample_rate: 22050 14 | hop_length: 256 15 | win_length: 1024 16 | f_min: 0 17 | f_max: 8000 18 | data_statistics: # Computed for ljspeech dataset 19 | mel_mean: -5.536622 20 | mel_std: 2.116101 21 | seed: ${seed} 22 | -------------------------------------------------------------------------------- /MLLM_v2/tools/tokenizer/GLM4V/third_party/Matcha-TTS/configs/data/vctk.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - ljspeech 3 | - _self_ 4 | 5 | _target_: matcha.data.text_mel_datamodule.TextMelDataModule 6 | name: vctk 7 | train_filelist_path: data/filelists/vctk_audio_sid_text_train_filelist.txt 8 | valid_filelist_path: data/filelists/vctk_audio_sid_text_val_filelist.txt 9 | batch_size: 32 10 | add_blank: True 11 | n_spks: 109 12 | data_statistics: # Computed for vctk dataset 13 | mel_mean: -6.630575 14 | mel_std: 2.482914 15 | -------------------------------------------------------------------------------- /MLLM_v2/tools/tokenizer/GLM4V/third_party/Matcha-TTS/configs/debug/default.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | # default debugging setup, runs 1 full epoch 4 | # other debugging configs can inherit from this one 5 | 6 | # overwrite task name so debugging logs are stored in separate folder 7 | task_name: "debug" 8 | 9 | # disable callbacks and loggers during debugging 10 | # callbacks: null 11 | # logger: null 12 | 13 | extras: 14 | ignore_warnings: False 15 | enforce_tags: False 16 | 17 | # sets level of all command line loggers to 'DEBUG' 18 | # https://hydra.cc/docs/tutorials/basic/running_your_app/logging/ 19 | hydra: 20 | job_logging: 21 | root: 22 | level: DEBUG 23 | 24 | # use this to also set hydra loggers to 'DEBUG' 25 | # verbose: True 26 | 27 | trainer: 28 | max_epochs: 1 29 | accelerator: cpu # debuggers don't like gpus 30 | devices: 1 # debuggers don't like 
multiprocessing 31 | detect_anomaly: true # raise exception if NaN or +/-inf is detected in any tensor 32 | 33 | data: 34 | num_workers: 0 # debuggers don't like multiprocessing 35 | pin_memory: False # disable gpu memory pin 36 | -------------------------------------------------------------------------------- /MLLM_v2/tools/tokenizer/GLM4V/third_party/Matcha-TTS/configs/debug/fdr.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | # runs 1 train, 1 validation and 1 test step 4 | 5 | defaults: 6 | - default 7 | 8 | trainer: 9 | fast_dev_run: true 10 | -------------------------------------------------------------------------------- /MLLM_v2/tools/tokenizer/GLM4V/third_party/Matcha-TTS/configs/debug/limit.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | # uses only 1% of the training data and 5% of validation/test data 4 | 5 | defaults: 6 | - default 7 | 8 | trainer: 9 | max_epochs: 3 10 | limit_train_batches: 0.01 11 | limit_val_batches: 0.05 12 | limit_test_batches: 0.05 13 | -------------------------------------------------------------------------------- /MLLM_v2/tools/tokenizer/GLM4V/third_party/Matcha-TTS/configs/debug/overfit.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | # overfits to 3 batches 4 | 5 | defaults: 6 | - default 7 | 8 | trainer: 9 | max_epochs: 20 10 | overfit_batches: 3 11 | 12 | # model ckpt and early stopping need to be disabled during overfitting 13 | callbacks: null 14 | -------------------------------------------------------------------------------- /MLLM_v2/tools/tokenizer/GLM4V/third_party/Matcha-TTS/configs/debug/profiler.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | # runs with execution time profiling 4 | 5 | defaults: 6 | - default 7 | 8 | trainer: 9 | 
max_epochs: 1 10 | # profiler: "simple" 11 | profiler: "advanced" 12 | # profiler: "pytorch" 13 | accelerator: gpu 14 | 15 | limit_train_batches: 0.02 16 | -------------------------------------------------------------------------------- /MLLM_v2/tools/tokenizer/GLM4V/third_party/Matcha-TTS/configs/eval.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | defaults: 4 | - _self_ 5 | - data: mnist # choose datamodule with `test_dataloader()` for evaluation 6 | - model: mnist 7 | - logger: null 8 | - trainer: default 9 | - paths: default 10 | - extras: default 11 | - hydra: default 12 | 13 | task_name: "eval" 14 | 15 | tags: ["dev"] 16 | 17 | # passing checkpoint path is necessary for evaluation 18 | ckpt_path: ??? 19 | -------------------------------------------------------------------------------- /MLLM_v2/tools/tokenizer/GLM4V/third_party/Matcha-TTS/configs/experiment/hifi_dataset_piper_phonemizer.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | # to execute this experiment run: 4 | # python train.py experiment=hifi_dataset_piper_phonemizer 5 | 6 | defaults: 7 | - override /data: hi-fi_en-US_female.yaml 8 | 9 | # all parameters below will be merged with parameters from default configurations set above 10 | # this allows you to overwrite only specified parameters 11 | 12 | tags: ["hi-fi", "single_speaker", "piper_phonemizer", "en_US", "female"] 13 | 14 | run_name: hi-fi_en-US_female_piper_phonemizer 15 | -------------------------------------------------------------------------------- /MLLM_v2/tools/tokenizer/GLM4V/third_party/Matcha-TTS/configs/experiment/ljspeech.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | # to execute this experiment run: 4 | # python train.py experiment=ljspeech 5 | 6 | defaults: 7 | - override /data: ljspeech.yaml 8 | 9 | # all parameters below will be
merged with parameters from default configurations set above 10 | # this allows you to overwrite only specified parameters 11 | 12 | tags: ["ljspeech"] 13 | 14 | run_name: ljspeech 15 | -------------------------------------------------------------------------------- /MLLM_v2/tools/tokenizer/GLM4V/third_party/Matcha-TTS/configs/experiment/ljspeech_min_memory.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | # to execute this experiment run: 4 | # python train.py experiment=ljspeech_min_memory 5 | 6 | defaults: 7 | - override /data: ljspeech.yaml 8 | 9 | # all parameters below will be merged with parameters from default configurations set above 10 | # this allows you to overwrite only specified parameters 11 | 12 | tags: ["ljspeech"] 13 | 14 | run_name: ljspeech_min 15 | 16 | 17 | model: 18 | out_size: 172 19 | -------------------------------------------------------------------------------- /MLLM_v2/tools/tokenizer/GLM4V/third_party/Matcha-TTS/configs/experiment/multispeaker.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | # to execute this experiment run: 4 | # python train.py experiment=multispeaker 5 | 6 | defaults: 7 | - override /data: vctk.yaml 8 | 9 | # all parameters below will be merged with parameters from default configurations set above 10 | # this allows you to overwrite only specified parameters 11 | 12 | tags: ["multispeaker"] 13 | 14 | run_name: multispeaker 15 | -------------------------------------------------------------------------------- /MLLM_v2/tools/tokenizer/GLM4V/third_party/Matcha-TTS/configs/extras/default.yaml: -------------------------------------------------------------------------------- 1 | # disable python warnings if they annoy you 2 | ignore_warnings: False 3 | 4 | # ask user for tags if none are provided in the config 5 | enforce_tags: True 6 | 7 | # pretty print config tree at the start of the run
using Rich library 8 | print_config: True 9 | -------------------------------------------------------------------------------- /MLLM_v2/tools/tokenizer/GLM4V/third_party/Matcha-TTS/configs/hydra/default.yaml: -------------------------------------------------------------------------------- 1 | # https://hydra.cc/docs/configure_hydra/intro/ 2 | 3 | # enable color logging 4 | defaults: 5 | - override hydra_logging: colorlog 6 | - override job_logging: colorlog 7 | 8 | # output directory, generated dynamically on each run 9 | run: 10 | dir: ${paths.log_dir}/${task_name}/${run_name}/runs/${now:%Y-%m-%d}_${now:%H-%M-%S} 11 | sweep: 12 | dir: ${paths.log_dir}/${task_name}/${run_name}/multiruns/${now:%Y-%m-%d}_${now:%H-%M-%S} 13 | subdir: ${hydra.job.num} 14 | 15 | job_logging: 16 | handlers: 17 | file: 18 | # Incorporates fix from https://github.com/facebookresearch/hydra/pull/2242 19 | filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log 20 | -------------------------------------------------------------------------------- /MLLM_v2/tools/tokenizer/GLM4V/third_party/Matcha-TTS/configs/local/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/MLLM_v2/tools/tokenizer/GLM4V/third_party/Matcha-TTS/configs/local/.gitkeep -------------------------------------------------------------------------------- /MLLM_v2/tools/tokenizer/GLM4V/third_party/Matcha-TTS/configs/logger/aim.yaml: -------------------------------------------------------------------------------- 1 | # https://aimstack.io/ 2 | 3 | # example usage in lightning module: 4 | # https://github.com/aimhubio/aim/blob/main/examples/pytorch_lightning_track.py 5 | 6 | # open the Aim UI with the following command (run in the folder containing the `.aim` folder): 7 | # `aim up` 8 | 9 | aim: 10 | _target_: aim.pytorch_lightning.AimLogger 11 | repo: ${paths.root_dir} # .aim folder will be 
created here 12 | # repo: "aim://ip_address:port" # can instead provide IP address pointing to Aim remote tracking server which manages the repo, see https://aimstack.readthedocs.io/en/latest/using/remote_tracking.html# 13 | 14 | # aim allows to group runs under experiment name 15 | experiment: null # any string, set to "default" if not specified 16 | 17 | train_metric_prefix: "train/" 18 | val_metric_prefix: "val/" 19 | test_metric_prefix: "test/" 20 | 21 | # sets the tracking interval in seconds for system usage metrics (CPU, GPU, memory, etc.) 22 | system_tracking_interval: 10 # set to null to disable system metrics tracking 23 | 24 | # enable/disable logging of system params such as installed packages, git info, env vars, etc. 25 | log_system_params: true 26 | 27 | # enable/disable tracking console logs (default value is true) 28 | capture_terminal_logs: false # set to false to avoid infinite console log loop issue https://github.com/aimhubio/aim/issues/2550 29 | -------------------------------------------------------------------------------- /MLLM_v2/tools/tokenizer/GLM4V/third_party/Matcha-TTS/configs/logger/comet.yaml: -------------------------------------------------------------------------------- 1 | # https://www.comet.ml 2 | 3 | comet: 4 | _target_: lightning.pytorch.loggers.comet.CometLogger 5 | api_key: ${oc.env:COMET_API_TOKEN} # api key is loaded from environment variable 6 | save_dir: "${paths.output_dir}" 7 | project_name: "lightning-hydra-template" 8 | rest_api_key: null 9 | # experiment_name: "" 10 | experiment_key: null # set to resume experiment 11 | offline: False 12 | prefix: "" 13 | -------------------------------------------------------------------------------- /MLLM_v2/tools/tokenizer/GLM4V/third_party/Matcha-TTS/configs/logger/csv.yaml: -------------------------------------------------------------------------------- 1 | # csv logger built in lightning 2 | 3 | csv: 4 | _target_: lightning.pytorch.loggers.csv_logs.CSVLogger 5 | save_dir: 
"${paths.output_dir}" 6 | name: "csv/" 7 | prefix: "" 8 | -------------------------------------------------------------------------------- /MLLM_v2/tools/tokenizer/GLM4V/third_party/Matcha-TTS/configs/logger/many_loggers.yaml: -------------------------------------------------------------------------------- 1 | # train with many loggers at once 2 | 3 | defaults: 4 | # - comet 5 | - csv 6 | # - mlflow 7 | # - neptune 8 | - tensorboard 9 | - wandb 10 | -------------------------------------------------------------------------------- /MLLM_v2/tools/tokenizer/GLM4V/third_party/Matcha-TTS/configs/logger/mlflow.yaml: -------------------------------------------------------------------------------- 1 | # https://mlflow.org 2 | 3 | mlflow: 4 | _target_: lightning.pytorch.loggers.mlflow.MLFlowLogger 5 | # experiment_name: "" 6 | # run_name: "" 7 | tracking_uri: ${paths.log_dir}/mlflow/mlruns # run `mlflow ui` command inside the `logs/mlflow/` dir to open the UI 8 | tags: null 9 | # save_dir: "./mlruns" 10 | prefix: "" 11 | artifact_location: null 12 | # run_id: "" 13 | -------------------------------------------------------------------------------- /MLLM_v2/tools/tokenizer/GLM4V/third_party/Matcha-TTS/configs/logger/neptune.yaml: -------------------------------------------------------------------------------- 1 | # https://neptune.ai 2 | 3 | neptune: 4 | _target_: lightning.pytorch.loggers.neptune.NeptuneLogger 5 | api_key: ${oc.env:NEPTUNE_API_TOKEN} # api key is loaded from environment variable 6 | project: username/lightning-hydra-template 7 | # name: "" 8 | log_model_checkpoints: True 9 | prefix: "" 10 | -------------------------------------------------------------------------------- /MLLM_v2/tools/tokenizer/GLM4V/third_party/Matcha-TTS/configs/logger/tensorboard.yaml: -------------------------------------------------------------------------------- 1 | # https://www.tensorflow.org/tensorboard/ 2 | 3 | tensorboard: 4 | _target_: 
lightning.pytorch.loggers.tensorboard.TensorBoardLogger 5 | save_dir: "${paths.output_dir}/tensorboard/" 6 | name: null 7 | log_graph: False 8 | default_hp_metric: True 9 | prefix: "" 10 | # version: "" 11 | -------------------------------------------------------------------------------- /MLLM_v2/tools/tokenizer/GLM4V/third_party/Matcha-TTS/configs/logger/wandb.yaml: -------------------------------------------------------------------------------- 1 | # https://wandb.ai 2 | 3 | wandb: 4 | _target_: lightning.pytorch.loggers.wandb.WandbLogger 5 | # name: "" # name of the run (normally generated by wandb) 6 | save_dir: "${paths.output_dir}" 7 | offline: False 8 | id: null # pass correct id to resume experiment! 9 | anonymous: null # enable anonymous logging 10 | project: "lightning-hydra-template" 11 | log_model: False # upload lightning ckpts 12 | prefix: "" # a string to put at the beginning of metric keys 13 | # entity: "" # set to name of your wandb team 14 | group: "" 15 | tags: [] 16 | job_type: "" 17 | -------------------------------------------------------------------------------- /MLLM_v2/tools/tokenizer/GLM4V/third_party/Matcha-TTS/configs/model/cfm/default.yaml: -------------------------------------------------------------------------------- 1 | name: CFM 2 | solver: euler 3 | sigma_min: 1e-4 4 | -------------------------------------------------------------------------------- /MLLM_v2/tools/tokenizer/GLM4V/third_party/Matcha-TTS/configs/model/decoder/default.yaml: -------------------------------------------------------------------------------- 1 | channels: [256, 256] 2 | dropout: 0.05 3 | attention_head_dim: 64 4 | n_blocks: 1 5 | num_mid_blocks: 2 6 | num_heads: 2 7 | act_fn: snakebeta 8 | -------------------------------------------------------------------------------- /MLLM_v2/tools/tokenizer/GLM4V/third_party/Matcha-TTS/configs/model/encoder/default.yaml: -------------------------------------------------------------------------------- 1 | encoder_type: 
RoPE Encoder 2 | encoder_params: 3 | n_feats: ${model.n_feats} 4 | n_channels: 192 5 | filter_channels: 768 6 | filter_channels_dp: 256 7 | n_heads: 2 8 | n_layers: 6 9 | kernel_size: 3 10 | p_dropout: 0.1 11 | spk_emb_dim: 64 12 | n_spks: 1 13 | prenet: true 14 | 15 | duration_predictor_params: 16 | filter_channels_dp: ${model.encoder.encoder_params.filter_channels_dp} 17 | kernel_size: 3 18 | p_dropout: ${model.encoder.encoder_params.p_dropout} 19 | -------------------------------------------------------------------------------- /MLLM_v2/tools/tokenizer/GLM4V/third_party/Matcha-TTS/configs/model/matcha.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - _self_ 3 | - encoder: default.yaml 4 | - decoder: default.yaml 5 | - cfm: default.yaml 6 | - optimizer: adam.yaml 7 | 8 | _target_: matcha.models.matcha_tts.MatchaTTS 9 | n_vocab: 178 10 | n_spks: ${data.n_spks} 11 | spk_emb_dim: 64 12 | n_feats: 80 13 | data_statistics: ${data.data_statistics} 14 | out_size: null # Must be divisible by 4 15 | prior_loss: true 16 | -------------------------------------------------------------------------------- /MLLM_v2/tools/tokenizer/GLM4V/third_party/Matcha-TTS/configs/model/optimizer/adam.yaml: -------------------------------------------------------------------------------- 1 | _target_: torch.optim.Adam 2 | _partial_: true 3 | lr: 1e-4 4 | weight_decay: 0.0 5 | -------------------------------------------------------------------------------- /MLLM_v2/tools/tokenizer/GLM4V/third_party/Matcha-TTS/configs/paths/default.yaml: -------------------------------------------------------------------------------- 1 | # path to root directory 2 | # this requires PROJECT_ROOT environment variable to exist 3 | # you can replace it with "." 
if you want the root to be the current working directory 4 | root_dir: ${oc.env:PROJECT_ROOT} 5 | 6 | # path to data directory 7 | data_dir: ${paths.root_dir}/data/ 8 | 9 | # path to logging directory 10 | log_dir: ${paths.root_dir}/logs/ 11 | 12 | # path to output directory, created dynamically by hydra 13 | # path generation pattern is specified in `configs/hydra/default.yaml` 14 | # use it to store all files generated during the run, like ckpts and metrics 15 | output_dir: ${hydra:runtime.output_dir} 16 | 17 | # path to working directory 18 | work_dir: ${hydra:runtime.cwd} 19 | -------------------------------------------------------------------------------- /MLLM_v2/tools/tokenizer/GLM4V/third_party/Matcha-TTS/configs/trainer/cpu.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - default 3 | 4 | accelerator: cpu 5 | devices: 1 6 | -------------------------------------------------------------------------------- /MLLM_v2/tools/tokenizer/GLM4V/third_party/Matcha-TTS/configs/trainer/ddp.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - default 3 | 4 | strategy: ddp 5 | 6 | accelerator: gpu 7 | devices: [0,1] 8 | num_nodes: 1 9 | sync_batchnorm: True 10 | -------------------------------------------------------------------------------- /MLLM_v2/tools/tokenizer/GLM4V/third_party/Matcha-TTS/configs/trainer/ddp_sim.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - default 3 | 4 | # simulate DDP on CPU, useful for debugging 5 | accelerator: cpu 6 | devices: 2 7 | strategy: ddp_spawn 8 | -------------------------------------------------------------------------------- /MLLM_v2/tools/tokenizer/GLM4V/third_party/Matcha-TTS/configs/trainer/default.yaml: -------------------------------------------------------------------------------- 1 | _target_: lightning.pytorch.trainer.Trainer 2 | 3 | default_root_dir: 
${paths.output_dir} 4 | 5 | max_epochs: -1 6 | 7 | accelerator: gpu 8 | devices: [0] 9 | 10 | # mixed precision for extra speed-up 11 | precision: 16-mixed 12 | 13 | # perform a validation loop every N training epochs 14 | check_val_every_n_epoch: 1 15 | 16 | # set True to ensure deterministic results 17 | # makes training slower but gives more reproducibility than just setting seeds 18 | deterministic: False 19 | 20 | gradient_clip_val: 5.0 21 | -------------------------------------------------------------------------------- /MLLM_v2/tools/tokenizer/GLM4V/third_party/Matcha-TTS/configs/trainer/gpu.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - default 3 | 4 | accelerator: gpu 5 | devices: 1 6 | -------------------------------------------------------------------------------- /MLLM_v2/tools/tokenizer/GLM4V/third_party/Matcha-TTS/configs/trainer/mps.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - default 3 | 4 | accelerator: mps 5 | devices: 1 6 | -------------------------------------------------------------------------------- /MLLM_v2/tools/tokenizer/GLM4V/third_party/Matcha-TTS/matcha/VERSION: -------------------------------------------------------------------------------- 1 | 0.0.5.1 2 | -------------------------------------------------------------------------------- /MLLM_v2/tools/tokenizer/GLM4V/third_party/Matcha-TTS/matcha/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/MLLM_v2/tools/tokenizer/GLM4V/third_party/Matcha-TTS/matcha/__init__.py -------------------------------------------------------------------------------- /MLLM_v2/tools/tokenizer/GLM4V/third_party/Matcha-TTS/matcha/data/__init__.py: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/MLLM_v2/tools/tokenizer/GLM4V/third_party/Matcha-TTS/matcha/data/__init__.py -------------------------------------------------------------------------------- /MLLM_v2/tools/tokenizer/GLM4V/third_party/Matcha-TTS/matcha/data/components/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/MLLM_v2/tools/tokenizer/GLM4V/third_party/Matcha-TTS/matcha/data/components/__init__.py -------------------------------------------------------------------------------- /MLLM_v2/tools/tokenizer/GLM4V/third_party/Matcha-TTS/matcha/hifigan/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Jungil Kong 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /MLLM_v2/tools/tokenizer/GLM4V/third_party/Matcha-TTS/matcha/hifigan/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/MLLM_v2/tools/tokenizer/GLM4V/third_party/Matcha-TTS/matcha/hifigan/__init__.py -------------------------------------------------------------------------------- /MLLM_v2/tools/tokenizer/GLM4V/third_party/Matcha-TTS/matcha/hifigan/config.py: -------------------------------------------------------------------------------- 1 | v1 = { 2 | "resblock": "1", 3 | "num_gpus": 0, 4 | "batch_size": 16, 5 | "learning_rate": 0.0004, 6 | "adam_b1": 0.8, 7 | "adam_b2": 0.99, 8 | "lr_decay": 0.999, 9 | "seed": 1234, 10 | "upsample_rates": [8, 8, 2, 2], 11 | "upsample_kernel_sizes": [16, 16, 4, 4], 12 | "upsample_initial_channel": 512, 13 | "resblock_kernel_sizes": [3, 7, 11], 14 | "resblock_dilation_sizes": [[1, 3, 5], [1, 3, 5], [1, 3, 5]], 15 | "resblock_initial_channel": 256, 16 | "segment_size": 8192, 17 | "num_mels": 80, 18 | "num_freq": 1025, 19 | "n_fft": 1024, 20 | "hop_size": 256, 21 | "win_size": 1024, 22 | "sampling_rate": 22050, 23 | "fmin": 0, 24 | "fmax": 8000, 25 | "fmax_loss": None, 26 | "num_workers": 4, 27 | "dist_config": {"dist_backend": "nccl", "dist_url": "tcp://localhost:54321", "world_size": 1}, 28 | } 29 | -------------------------------------------------------------------------------- /MLLM_v2/tools/tokenizer/GLM4V/third_party/Matcha-TTS/matcha/hifigan/env.py: -------------------------------------------------------------------------------- 1 | 
""" from https://github.com/jik876/hifi-gan """ 2 | 3 | import os 4 | import shutil 5 | 6 | 7 | class AttrDict(dict): 8 | def __init__(self, *args, **kwargs): 9 | super().__init__(*args, **kwargs) 10 | self.__dict__ = self 11 | 12 | 13 | def build_env(config, config_name, path): 14 | t_path = os.path.join(path, config_name) 15 | if config != t_path: 16 | os.makedirs(path, exist_ok=True) 17 | shutil.copyfile(config, os.path.join(path, config_name)) 18 | -------------------------------------------------------------------------------- /MLLM_v2/tools/tokenizer/GLM4V/third_party/Matcha-TTS/matcha/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/MLLM_v2/tools/tokenizer/GLM4V/third_party/Matcha-TTS/matcha/models/__init__.py -------------------------------------------------------------------------------- /MLLM_v2/tools/tokenizer/GLM4V/third_party/Matcha-TTS/matcha/models/components/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/MLLM_v2/tools/tokenizer/GLM4V/third_party/Matcha-TTS/matcha/models/components/__init__.py -------------------------------------------------------------------------------- /MLLM_v2/tools/tokenizer/GLM4V/third_party/Matcha-TTS/matcha/onnx/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/MLLM_v2/tools/tokenizer/GLM4V/third_party/Matcha-TTS/matcha/onnx/__init__.py -------------------------------------------------------------------------------- /MLLM_v2/tools/tokenizer/GLM4V/third_party/Matcha-TTS/matcha/text/symbols.py: -------------------------------------------------------------------------------- 1 | """ from 
https://github.com/keithito/tacotron 2 | 3 | Defines the set of symbols used in text input to the model. 4 | """ 5 | _pad = "_" 6 | _punctuation = ';:,.!?¡¿—…"«»“” ' 7 | _letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz" 8 | _letters_ipa = ( 9 | "ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ" 10 | ) 11 | 12 | 13 | # Export all symbols: 14 | symbols = [_pad] + list(_punctuation) + list(_letters) + list(_letters_ipa) 15 | 16 | # Special symbol ids 17 | SPACE_ID = symbols.index(" ") 18 | -------------------------------------------------------------------------------- /MLLM_v2/tools/tokenizer/GLM4V/third_party/Matcha-TTS/matcha/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from matcha.utils.instantiators import instantiate_callbacks, instantiate_loggers 2 | from matcha.utils.logging_utils import log_hyperparameters 3 | from matcha.utils.pylogger import get_pylogger 4 | from matcha.utils.rich_utils import enforce_tags, print_config_tree 5 | from matcha.utils.utils import extras, get_metric_value, task_wrapper 6 | -------------------------------------------------------------------------------- /MLLM_v2/tools/tokenizer/GLM4V/third_party/Matcha-TTS/matcha/utils/monotonic_align/__init__.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | 4 | from matcha.utils.monotonic_align.core import maximum_path_c 5 | 6 | 7 | def maximum_path(value, mask): 8 | """Cython optimised version. 
9 | value: [b, t_x, t_y] 10 | mask: [b, t_x, t_y] 11 | """ 12 | value = value * mask 13 | device = value.device 14 | dtype = value.dtype 15 | value = value.data.cpu().numpy().astype(np.float32) 16 | path = np.zeros_like(value).astype(np.int32) 17 | mask = mask.data.cpu().numpy() 18 | 19 | t_x_max = mask.sum(1)[:, 0].astype(np.int32) 20 | t_y_max = mask.sum(2)[:, 0].astype(np.int32) 21 | maximum_path_c(path, value, t_x_max, t_y_max) 22 | return torch.from_numpy(path).to(device=device, dtype=dtype) 23 | -------------------------------------------------------------------------------- /MLLM_v2/tools/tokenizer/GLM4V/third_party/Matcha-TTS/matcha/utils/monotonic_align/setup.py: -------------------------------------------------------------------------------- 1 | # from distutils.core import setup 2 | # from Cython.Build import cythonize 3 | # import numpy 4 | 5 | # setup(name='monotonic_align', 6 | # ext_modules=cythonize("core.pyx"), 7 | # include_dirs=[numpy.get_include()]) 8 | -------------------------------------------------------------------------------- /MLLM_v2/tools/tokenizer/GLM4V/third_party/Matcha-TTS/matcha/utils/pylogger.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from lightning.pytorch.utilities import rank_zero_only 4 | 5 | 6 | def get_pylogger(name: str = __name__) -> logging.Logger: 7 | """Initializes a multi-GPU-friendly python command line logger. 8 | 9 | :param name: The name of the logger, defaults to ``__name__``. 10 | 11 | :return: A logger object. 
12 | """ 13 | logger = logging.getLogger(name) 14 | 15 | # this ensures all logging levels get marked with the rank zero decorator 16 | # otherwise logs would get multiplied for each GPU process in multi-GPU setup 17 | logging_levels = ("debug", "info", "warning", "error", "exception", "fatal", "critical") 18 | for level in logging_levels: 19 | setattr(logger, level, rank_zero_only(getattr(logger, level))) 20 | 21 | return logger 22 | -------------------------------------------------------------------------------- /MLLM_v2/tools/tokenizer/GLM4V/third_party/Matcha-TTS/notebooks/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/MLLM_v2/tools/tokenizer/GLM4V/third_party/Matcha-TTS/notebooks/.gitkeep -------------------------------------------------------------------------------- /MLLM_v2/tools/tokenizer/GLM4V/third_party/Matcha-TTS/pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools", "wheel", "cython==0.29.35", "numpy==1.24.3", "packaging"] 3 | 4 | [tool.black] 5 | line-length = 120 6 | target-version = ['py310'] 7 | exclude = ''' 8 | 9 | ( 10 | /( 11 | \.eggs # exclude a few common directories in the 12 | | \.git # root of the project 13 | | \.hg 14 | | \.mypy_cache 15 | | \.tox 16 | | \.venv 17 | | _build 18 | | buck-out 19 | | build 20 | | dist 21 | )/ 22 | | foo.py # also separately exclude a file named foo.py in 23 | # the root of the project 24 | ) 25 | ''' 26 | 27 | [tool.pytest.ini_options] 28 | addopts = [ 29 | "--color=yes", 30 | "--durations=0", 31 | "--strict-markers", 32 | "--doctest-modules", 33 | ] 34 | filterwarnings = [ 35 | "ignore::DeprecationWarning", 36 | "ignore::UserWarning", 37 | ] 38 | log_cli = "True" 39 | markers = [ 40 | "slow: slow tests", 41 | ] 42 | minversion = "6.0" 43 | testpaths = "tests/" 44 | 45 | 
[tool.coverage.report] 46 | exclude_lines = [ 47 | "pragma: nocover", 48 | "raise NotImplementedError", 49 | "raise NotImplementedError()", 50 | "if __name__ == .__main__.:", 51 | ] 52 | -------------------------------------------------------------------------------- /MLLM_v2/tools/tokenizer/GLM4V/third_party/Matcha-TTS/requirements.txt: -------------------------------------------------------------------------------- 1 | # --------- pytorch --------- # 2 | torch>=2.0.0 3 | torchvision>=0.15.0 4 | lightning>=2.0.0 5 | torchmetrics>=0.11.4 6 | 7 | # --------- hydra --------- # 8 | hydra-core==1.3.2 9 | hydra-colorlog==1.2.0 10 | hydra-optuna-sweeper==1.2.0 11 | 12 | # --------- loggers --------- # 13 | # wandb 14 | # neptune-client 15 | # mlflow 16 | # comet-ml 17 | # aim>=3.16.2 # no lower than 3.16.2, see https://github.com/aimhubio/aim/issues/2550 18 | 19 | # --------- others --------- # 20 | rootutils # standardizing the project root setup 21 | pre-commit # hooks for applying linters on commit 22 | rich # beautiful text formatting in terminal 23 | pytest # tests 24 | # sh # for running bash commands in some tests (linux/macos only) 25 | phonemizer # phonemization of text 26 | tensorboard 27 | librosa 28 | Cython 29 | numpy 30 | einops 31 | inflect 32 | Unidecode 33 | scipy 34 | torchaudio 35 | matplotlib 36 | pandas 37 | conformer==0.3.2 38 | diffusers==0.25.0 39 | notebook 40 | ipywidgets 41 | gradio==3.43.2 42 | gdown 43 | wget 44 | seaborn 45 | piper_phonemize 46 | -------------------------------------------------------------------------------- /MLLM_v2/tools/tokenizer/GLM4V/third_party/Matcha-TTS/scripts/schedule.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Schedule execution of many runs 3 | # Run from root folder with: bash scripts/schedule.sh 4 | 5 | python src/train.py trainer.max_epochs=5 logger=csv 6 | 7 | python src/train.py trainer.max_epochs=10 logger=csv 8 | 
-------------------------------------------------------------------------------- /MLLM_v2/tools/tokenizer/MimiCodec/mimi_config.yaml: -------------------------------------------------------------------------------- 1 | generator: 2 | name: MimiCodec 3 | config: 4 | encoder_rates: [8, 6, 5, 4] 5 | codebook_size: 2048 6 | codebook_dim: 256 7 | rvq_layers: 8 -------------------------------------------------------------------------------- /MLLM_v2/tools/tokenizer/MimiCodec/model/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/MLLM_v2/tools/tokenizer/MimiCodec/model/models/__init__.py -------------------------------------------------------------------------------- /MLLM_v2/tools/tokenizer/MimiCodec/model/modules/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/MLLM_v2/tools/tokenizer/MimiCodec/model/modules/__init__.py -------------------------------------------------------------------------------- /MLLM_v2/tools/tokenizer/MimiCodec/model/quantization/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Kyutai, all rights reserved. 2 | # This source code is licensed under the license found in the 3 | # LICENSE file in the root directory of this source tree. 4 | 5 | # Copyright (c) Meta Platforms, Inc. and affiliates. 6 | # All rights reserved. 7 | # 8 | # This source code is licensed under the license found in the 9 | # LICENSE file in the root directory of this source tree. 
10 | """RVQ.""" 11 | # flake8: noqa 12 | from .vq import ResidualVectorQuantizer, SplitResidualVectorQuantizer 13 | from .base import BaseQuantizer, DummyQuantizer, QuantizedResult 14 | -------------------------------------------------------------------------------- /MLLM_v2/tools/tokenizer/MimiCodec/model/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/MLLM_v2/tools/tokenizer/MimiCodec/model/utils/__init__.py -------------------------------------------------------------------------------- /MLLM_v2/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Kyutai, all rights reserved. 2 | # This source code is licensed under the license found in the 3 | # LICENSE file in the root directory of this source tree. 4 | 5 | # Copyright (c) Meta Platforms, Inc. and affiliates. 6 | # All rights reserved. 7 | # 8 | # This source code is licensed under the license found in the 9 | # LICENSE file in the root directory of this source tree. 
10 | """Utilities.""" 11 | -------------------------------------------------------------------------------- /RSTnet.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/RSTnet.pdf -------------------------------------------------------------------------------- /RSTnet.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/RSTnet.png -------------------------------------------------------------------------------- /demos/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/demos/.DS_Store -------------------------------------------------------------------------------- /demos/tts/setence_level_text_audio_interleaved_1272-128104-0006_sample.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/demos/tts/setence_level_text_audio_interleaved_1272-128104-0006_sample.wav -------------------------------------------------------------------------------- /demos/tts/setence_level_text_audio_interleaved_1272-141231-0011_sample.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/demos/tts/setence_level_text_audio_interleaved_1272-141231-0011_sample.wav -------------------------------------------------------------------------------- /demos/tts/setence_level_text_audio_interleaved_174-168635-0014_sample.wav: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/demos/tts/setence_level_text_audio_interleaved_174-168635-0014_sample.wav -------------------------------------------------------------------------------- /demos/tts/setence_level_text_audio_interleaved_251-137823-0008_sample.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/demos/tts/setence_level_text_audio_interleaved_251-137823-0008_sample.wav -------------------------------------------------------------------------------- /demos/tts/setence_level_text_audio_interleaved_652-129742-0018_sample.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/demos/tts/setence_level_text_audio_interleaved_652-129742-0018_sample.wav -------------------------------------------------------------------------------- /demos/tts/setence_level_text_audio_interleaved_777-126732-0080_sample.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangdongchao/RSTnet/5f01e1fbefe947342ac404e80fed8fbe5cc3da03/demos/tts/setence_level_text_audio_interleaved_777-126732-0080_sample.wav --------------------------------------------------------------------------------