├── README.md ├── chime_eval_result.log ├── cmd.sh ├── conf ├── fbank.conf ├── fbank_16k.conf ├── fbank_16k_64ms_16ms.conf ├── fbank_8k.conf ├── mfcc.conf ├── mfcc_hires.conf ├── mfcc_hires_16k.conf ├── mfcc_hires_8k.conf ├── mfcc_sad.conf ├── online_cmvn.conf └── spectrogram_16k_64ms_16ms.conf ├── data ├── chime7_eval_all_CH │ ├── cmn_slide_fbank_htk.list │ ├── f1.rttm │ └── oracle.rttm └── dipco_dev_all_CH │ ├── cmn_slide_fbank_htk.list │ ├── f1.rttm │ └── oracle.rttm ├── dipco_dev_result.log ├── doc ├── CHiME_2023_DASR_wang.pdf ├── ICASS2024.pdf ├── NN_v3.jpg └── results.jpg ├── embedding_raw └── voxceleb │ ├── cluster_center_128.npy │ ├── cluster_center_256.npy │ ├── cluster_center_64.npy │ ├── cluster_center_64.txt │ ├── cluster_label_128.txt │ ├── cluster_label_256.txt │ ├── cluster_label_64.txt │ ├── speakers.txt │ ├── xvector_cluster_center_128.npy │ ├── xvector_cluster_center_256.npy │ └── xvector_cluster_center_64.npy ├── exp ├── S2S │ └── Batchsize20_4speakers_Segment800s_Mixup0.5_CHiME6MAMSELabel_SimuCHiME6_Mixer6MAMSELabel_SimuMixer6_SimuDipcoDevNoise_all_data_512_all0Dropout_6layers_weight_input_DIM │ │ ├── MULTI_MAM_SE_S2S_model.model6_chime7_eval_all_CH_f1_fusion │ │ ├── rttm_th0.35 │ │ ├── rttm_th0.35_pp │ │ ├── rttm_th0.40 │ │ ├── rttm_th0.40_pp │ │ ├── rttm_th0.45 │ │ ├── rttm_th0.45_pp │ │ ├── rttm_th0.50 │ │ ├── rttm_th0.50_pp │ │ ├── rttm_th0.55 │ │ ├── rttm_th0.55_pp │ │ ├── rttm_th0.60 │ │ ├── rttm_th0.60_pp │ │ ├── rttm_th0.65 │ │ └── rttm_th0.65_pp │ │ └── MULTI_MAM_SE_S2S_model.model6_dipco_dev_all_CH_f1_fusion │ │ ├── rttm_th0.35 │ │ ├── rttm_th0.35_pp │ │ ├── rttm_th0.40 │ │ ├── rttm_th0.40_pp │ │ ├── rttm_th0.45 │ │ ├── rttm_th0.45_pp │ │ ├── rttm_th0.50 │ │ ├── rttm_th0.50_pp │ │ ├── rttm_th0.55 │ │ ├── rttm_th0.55_pp │ │ ├── rttm_th0.60 │ │ ├── rttm_th0.60_pp │ │ ├── rttm_th0.65 │ │ └── rttm_th0.65_pp └── nnet3_recipe_ivector │ ├── extractor │ ├── 10.ie │ ├── final.dubm │ ├── final.ie │ ├── final.ie.id │ ├── final.mat │ ├── global_cmvn.stats │ ├── num_jobs │ ├── online_cmvn.conf │ └── splice_opts │ ├── ivectors_chime7_eval_all_CH_f1 │ └── ivectors_spk.txt │ ├── ivectors_chime7_train_array_Oracle │ └── ivectors_spk.txt │ └── ivectors_dipco_dev_all_CH_f1 │ └── ivectors_spk.txt ├── local ├── HTK.py ├── __pycache__ │ ├── HTK.cpython-39.pyc │ ├── config.cpython-310.pyc │ ├── config.cpython-39.pyc │ ├── conformer2.cpython-310.pyc │ ├── conformer2.cpython-39.pyc │ ├── model_S2S_weight_input_DIM.cpython-39.pyc │ ├── reader_s2s.cpython-39.pyc │ ├── utils.cpython-39.pyc │ └── utils_s2s.cpython-39.pyc ├── analysis_diarization.sh ├── config.py ├── conformer2.py ├── decode_MULTI_SE_MA_MSE_S2S_CH_fusion.py ├── decode_MULTI_SE_MA_MSE_S2S_CH_fusion_models_fusion.py ├── decode_S2S_model.sh ├── decode_S2S_models_fusion.sh ├── extract_feature.sh ├── extract_ivector_session_level.sh ├── loss_function.py ├── md-eval-22.pl ├── model_S2S_weight_input_DIM.py ├── postprocessing_s2s.py ├── prepare_ivector_extractor_dir_with_rttm.py ├── reader_s2s.py ├── reader_sc_s2s.py ├── rttm_filter_with_vad.py ├── run_MAMSE_S2S_chime7_ws_input_DIM.py ├── split_long_segment_s2s.py ├── train_Pretrain_DDP_S2S.py ├── utils.py └── utils_s2s.py ├── path.sh ├── requirements.txt ├── run_decode.sh ├── steps ├── align_basis_fmllr.sh ├── align_basis_fmllr_lats.sh ├── align_fmllr.sh ├── align_fmllr_lats.sh ├── align_lvtln.sh ├── align_raw_fmllr.sh ├── align_sgmm2.sh ├── align_si.sh ├── append_feats.sh ├── best_path_weights.sh ├── cleanup │ ├── clean_and_segment_data.sh │ ├── clean_and_segment_data_nnet3.sh │ 
├── combine_short_segments.py │ ├── create_segments_from_ctm.pl │ ├── debug_lexicon.sh │ ├── decode_fmllr_segmentation.sh │ ├── decode_segmentation.sh │ ├── decode_segmentation_nnet3.sh │ ├── find_bad_utts.sh │ ├── find_bad_utts_nnet.sh │ ├── internal │ │ ├── align_ctm_ref.py │ │ ├── compute_tf_idf.py │ │ ├── ctm_to_text.pl │ │ ├── get_ctm_edits.py │ │ ├── get_non_scored_words.py │ │ ├── get_pron_stats.py │ │ ├── make_one_biased_lm.py │ │ ├── modify_ctm_edits.py │ │ ├── resolve_ctm_edits_overlaps.py │ │ ├── retrieve_similar_docs.py │ │ ├── segment_ctm_edits.py │ │ ├── segment_ctm_edits_mild.py │ │ ├── split_text_into_docs.pl │ │ ├── stitch_documents.py │ │ ├── taint_ctm_edits.py │ │ └── tf_idf.py │ ├── lattice_oracle_align.sh │ ├── make_biased_lm_graphs.sh │ ├── make_biased_lms.py │ ├── make_segmentation_data_dir.sh │ ├── make_segmentation_graph.sh │ ├── make_utterance_fsts.pl │ ├── make_utterance_graph.sh │ ├── segment_long_utterances.sh │ ├── segment_long_utterances_nnet3.sh │ └── split_long_utterance.sh ├── combine_ali_dirs.sh ├── combine_lat_dirs.sh ├── combine_trans_dirs.sh ├── compare_alignments.sh ├── compute_cmvn_stats.sh ├── compute_vad_decision.sh ├── conf │ ├── append_eval_to_ctm.py │ ├── append_prf_to_ctm.py │ ├── apply_calibration.sh │ ├── convert_ctm_to_tra.py │ ├── get_ctm_conf.sh │ ├── lattice_depth_per_frame.sh │ ├── parse_arpa_unigrams.py │ ├── prepare_calibration_data.py │ ├── prepare_word_categories.py │ └── train_calibration.sh ├── copy_ali_dir.sh ├── copy_lat_dir.sh ├── copy_trans_dir.sh ├── data │ ├── augment_data_dir.py │ ├── data_dir_manipulation_lib.py │ ├── make_musan.py │ ├── make_musan.sh │ └── reverberate_data_dir.py ├── decode.sh ├── decode_basis_fmllr.sh ├── decode_biglm.sh ├── decode_combine.sh ├── decode_fmllr.sh ├── decode_fmllr_extra.sh ├── decode_fmmi.sh ├── decode_fromlats.sh ├── decode_lvtln.sh ├── decode_nnet.sh ├── decode_nolats.sh ├── decode_raw_fmllr.sh ├── decode_sgmm2.sh ├── decode_sgmm2_fromlats.sh ├── decode_sgmm2_rescore.sh ├── decode_sgmm2_rescore_project.sh ├── decode_si.sh ├── decode_with_map.sh ├── diagnostic │ ├── analyze_alignments.sh │ ├── analyze_lats.sh │ ├── analyze_lattice_depth_stats.py │ └── analyze_phone_length_stats.py ├── dict │ ├── apply_g2p.sh │ ├── apply_g2p_phonetisaurus.sh │ ├── apply_lexicon_edits.py │ ├── get_pron_stats.py │ ├── internal │ │ ├── get_subsegments.py │ │ ├── prune_pron_candidates.py │ │ └── sum_arc_info.py │ ├── learn_lexicon_bayesian.sh │ ├── learn_lexicon_greedy.sh │ ├── merge_learned_lexicons.py │ ├── prons_to_lexicon.py │ ├── prune_pron_candidates.py │ ├── select_prons_bayesian.py │ ├── select_prons_greedy.py │ ├── train_g2p.sh │ └── train_g2p_phonetisaurus.sh ├── get_ctm.sh ├── get_ctm_conf.sh ├── get_ctm_conf_fast.sh ├── get_ctm_fast.sh ├── get_fmllr_basis.sh ├── get_lexicon_probs.sh ├── get_prons.sh ├── get_train_ctm.sh ├── info │ ├── chain_dir_info.pl │ ├── gmm_dir_info.pl │ ├── nnet2_dir_info.pl │ ├── nnet3_dir_info.pl │ └── nnet3_disc_dir_info.pl ├── libs │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-38.pyc │ │ └── common.cpython-38.pyc │ ├── common.py │ └── nnet3 │ │ ├── __init__.py │ │ ├── report │ │ ├── __init__.py │ │ └── log_parse.py │ │ ├── train │ │ ├── __init__.py │ │ ├── chain_objf │ │ │ ├── __init__.py │ │ │ └── acoustic_model.py │ │ ├── common.py │ │ ├── dropout_schedule.py │ │ └── frame_level_objf │ │ │ ├── __init__.py │ │ │ ├── acoustic_model.py │ │ │ ├── common.py │ │ │ └── raw_model.py │ │ └── xconfig │ │ ├── __init__.py │ │ ├── attention.py │ │ ├── basic_layers.py │ 
│ ├── composite_layers.py │ │ ├── convolution.py │ │ ├── gru.py │ │ ├── layers.py │ │ ├── lstm.py │ │ ├── parser.py │ │ ├── stats_layer.py │ │ ├── trivial_layers.py │ │ └── utils.py ├── lmrescore.sh ├── lmrescore_const_arpa.sh ├── lmrescore_const_arpa_undeterminized.sh ├── lmrescore_rnnlm_lat.sh ├── make_denlats.sh ├── make_denlats_sgmm2.sh ├── make_fbank.sh ├── make_fbank_pitch.sh ├── make_index.sh ├── make_mfcc.sh ├── make_mfcc_pitch.sh ├── make_mfcc_pitch_online.sh ├── make_phone_graph.sh ├── make_plp.sh ├── make_plp_pitch.sh ├── nnet │ ├── align.sh │ ├── decode.sh │ ├── ivector │ │ ├── extract_ivectors.sh │ │ ├── train_diag_ubm.sh │ │ └── train_ivector_extractor.sh │ ├── make_bn_feats.sh │ ├── make_denlats.sh │ ├── make_fmllr_feats.sh │ ├── make_fmmi_feats.sh │ ├── make_priors.sh │ ├── pretrain_dbn.sh │ ├── train.sh │ ├── train_mmi.sh │ ├── train_mpe.sh │ └── train_scheduler.sh ├── nnet2 │ ├── adjust_priors.sh │ ├── align.sh │ ├── check_ivectors_compatible.sh │ ├── convert_lda_to_raw.sh │ ├── convert_nnet1_to_nnet2.sh │ ├── create_appended_model.sh │ ├── decode.sh │ ├── dump_bottleneck_features.sh │ ├── get_egs.sh │ ├── get_egs2.sh │ ├── get_egs_discriminative2.sh │ ├── get_ivector_id.sh │ ├── get_lda.sh │ ├── get_lda_block.sh │ ├── get_num_frames.sh │ ├── get_perturbed_feats.sh │ ├── make_denlats.sh │ ├── make_multisplice_configs.py │ ├── relabel_egs.sh │ ├── relabel_egs2.sh │ ├── remove_egs.sh │ ├── retrain_fast.sh │ ├── retrain_simple2.sh │ ├── retrain_tanh.sh │ ├── train_block.sh │ ├── train_convnet_accel2.sh │ ├── train_discriminative.sh │ ├── train_discriminative2.sh │ ├── train_discriminative_multilang2.sh │ ├── train_more.sh │ ├── train_more2.sh │ ├── train_multilang2.sh │ ├── train_multisplice_accel2.sh │ ├── train_multisplice_ensemble.sh │ ├── train_pnorm.sh │ ├── train_pnorm_accel2.sh │ ├── train_pnorm_bottleneck_fast.sh │ ├── train_pnorm_ensemble.sh │ ├── train_pnorm_fast.sh │ ├── train_pnorm_multisplice.sh │ ├── train_pnorm_multisplice2.sh │ ├── train_pnorm_simple.sh │ ├── train_pnorm_simple2.sh │ ├── train_tanh.sh │ ├── train_tanh_bottleneck.sh │ ├── train_tanh_fast.sh │ └── update_nnet.sh ├── nnet3 │ ├── adjust_priors.sh │ ├── align.sh │ ├── align_lats.sh │ ├── chain │ │ ├── align_lats.sh │ │ ├── build_tree.sh │ │ ├── build_tree_multiple_sources.sh │ │ ├── e2e │ │ │ ├── README.txt │ │ │ ├── compute_biphone_stats.py │ │ │ ├── get_egs_e2e.sh │ │ │ ├── prepare_e2e.sh │ │ │ ├── text_to_phones.py │ │ │ └── train_e2e.py │ │ ├── gen_topo.pl │ │ ├── gen_topo.py │ │ ├── gen_topo2.py │ │ ├── gen_topo3.py │ │ ├── gen_topo4.py │ │ ├── gen_topo5.py │ │ ├── gen_topo_orig.py │ │ ├── get_egs.sh │ │ ├── get_model_context.sh │ │ ├── get_phone_post.sh │ │ ├── make_weighted_den_fst.sh │ │ ├── multilingual │ │ │ └── combine_egs.sh │ │ ├── train.py │ │ └── train_tdnn.sh │ ├── chain2 │ │ ├── combine_egs.sh │ │ ├── compute_preconditioning_matrix.sh │ │ ├── get_raw_egs.sh │ │ ├── internal │ │ │ ├── get_best_model.sh │ │ │ └── get_train_schedule.py │ │ ├── process_egs.sh │ │ ├── randomize_egs.sh │ │ ├── train.sh │ │ ├── validate_processed_egs.sh │ │ ├── validate_randomized_egs.sh │ │ └── validate_raw_egs.sh │ ├── components.py │ ├── compute_output.sh │ ├── convert_nnet2_to_nnet3.py │ ├── decode.sh │ ├── decode_grammar.sh │ ├── decode_lookahead.sh │ ├── decode_looped.sh │ ├── decode_score_fusion.sh │ ├── decode_semisup.sh │ ├── dot │ │ ├── descriptor_parser.py │ │ └── nnet3_to_dot.py │ ├── get_degs.sh │ ├── get_egs.sh │ ├── get_egs_discriminative.sh │ ├── get_egs_targets.sh │ ├── get_saturation.pl 
│ ├── get_successful_models.py │ ├── lstm │ │ ├── make_configs.py │ │ └── train.sh │ ├── make_bottleneck_features.sh │ ├── make_denlats.sh │ ├── make_tdnn_configs.py │ ├── multilingual │ │ ├── allocate_multilingual_examples.py │ │ └── combine_egs.sh │ ├── nnet3_to_dot.sh │ ├── remove_egs.sh │ ├── report │ │ ├── convert_model.py │ │ ├── generate_plots.py │ │ └── summarize_compute_debug_timing.py │ ├── tdnn │ │ ├── make_configs.py │ │ ├── train.sh │ │ └── train_raw_nnet.sh │ ├── train_discriminative.sh │ ├── train_dnn.py │ ├── train_raw_dnn.py │ ├── train_raw_rnn.py │ ├── train_rnn.py │ ├── train_tdnn.sh │ ├── xconfig_to_config.py │ └── xconfig_to_configs.py ├── online │ ├── decode.sh │ ├── nnet2 │ │ ├── align.sh │ │ ├── copy_data_dir.sh │ │ ├── copy_ivector_dir.sh │ │ ├── decode.sh │ │ ├── dump_nnet_activations.sh │ │ ├── extract_ivectors.sh │ │ ├── extract_ivectors_online.sh │ │ ├── get_egs.sh │ │ ├── get_egs2.sh │ │ ├── get_egs_discriminative2.sh │ │ ├── get_pca_transform.sh │ │ ├── make_denlats.sh │ │ ├── prepare_online_decoding.sh │ │ ├── prepare_online_decoding_retrain.sh │ │ ├── prepare_online_decoding_transfer.sh │ │ ├── train_diag_ubm.sh │ │ └── train_ivector_extractor.sh │ ├── nnet3 │ │ ├── decode.sh │ │ ├── decode_wake_word.sh │ │ └── prepare_online_decoding.sh │ └── prepare_online_decoding.sh ├── oracle_wer.sh ├── overlap │ ├── get_overlap_segments.py │ ├── get_overlap_targets.py │ ├── output_to_rttm.py │ ├── post_process_output.sh │ └── prepare_overlap_graph.py ├── paste_feats.sh ├── pytorchnn │ ├── check_py.py │ ├── compute_sentence_scores.py │ ├── data.py │ ├── lmrescore_nbest_pytorchnn.sh │ ├── model.py │ └── train.py ├── resegment_data.sh ├── resegment_text.sh ├── rnnlmrescore.sh ├── score_kaldi.sh ├── score_kaldi_compare.sh ├── scoring │ ├── score_kaldi_cer.sh │ ├── score_kaldi_compare.sh │ └── score_kaldi_wer.sh ├── search_index.sh ├── segmentation │ ├── ali_to_targets.sh │ ├── combine_targets_dirs.sh │ ├── convert_targets_dir_to_whole_recording.sh │ ├── convert_utt2spk_and_segments_to_rttm.py │ ├── copy_targets_dir.sh │ ├── decode_sad.sh │ ├── detect_speech_activity.sh │ ├── evaluate_segmentation.pl │ ├── get_targets_for_out_of_segments.sh │ ├── internal │ │ ├── arc_info_to_targets.py │ │ ├── find_oov_phone.py │ │ ├── get_default_targets_for_out_of_segments.py │ │ ├── get_transform_probs_mat.py │ │ ├── merge_segment_targets_to_recording.py │ │ ├── merge_targets.py │ │ ├── prepare_sad_graph.py │ │ ├── resample_targets.py │ │ ├── sad_to_segments.py │ │ └── verify_phones_list.py │ ├── lats_to_targets.sh │ ├── merge_targets_dirs.sh │ ├── post_process_sad_to_segments.sh │ ├── prepare_targets_gmm.sh │ ├── resample_targets_dir.sh │ └── validate_targets_dir.sh ├── select_feats.sh ├── shift_feats.sh ├── subset_ali_dir.sh ├── tandem │ ├── align_fmllr.sh │ ├── align_sgmm2.sh │ ├── align_si.sh │ ├── decode.sh │ ├── decode_fmllr.sh │ ├── decode_sgmm2.sh │ ├── decode_si.sh │ ├── make_denlats.sh │ ├── make_denlats_sgmm2.sh │ ├── mk_aslf_lda_mllt.sh │ ├── mk_aslf_sgmm2.sh │ ├── train_deltas.sh │ ├── train_lda_mllt.sh │ ├── train_mllt.sh │ ├── train_mmi.sh │ ├── train_mmi_sgmm2.sh │ ├── train_mono.sh │ ├── train_sat.sh │ ├── train_sgmm2.sh │ └── train_ubm.sh ├── tfrnnlm │ ├── check_py.py │ ├── check_tensorflow_installed.sh │ ├── lmrescore_rnnlm_lat.sh │ ├── lmrescore_rnnlm_lat_pruned.sh │ ├── lstm.py │ ├── lstm_fast.py │ ├── reader.py │ └── vanilla_rnnlm.py ├── train_deltas.sh ├── train_diag_ubm.sh ├── train_lda_mllt.sh ├── train_lvtln.sh ├── train_map.sh ├── train_mmi.sh ├── 
train_mmi_fmmi.sh ├── train_mmi_fmmi_indirect.sh ├── train_mmi_sgmm2.sh ├── train_mono.sh ├── train_mpe.sh ├── train_nnet.sh ├── train_quick.sh ├── train_raw_sat.sh ├── train_sat.sh ├── train_sat_basis.sh ├── train_segmenter.sh ├── train_sgmm2.sh ├── train_sgmm2_group.sh ├── train_smbr.sh ├── train_ubm.sh └── word_align_lattices.sh └── utils ├── add_disambig.pl ├── add_lex_disambig.pl ├── analyze_segments.pl ├── apply_map.pl ├── best_wer.sh ├── build_const_arpa_lm.sh ├── build_kenlm_model_from_arpa.sh ├── combine_data.sh ├── convert_ctm.pl ├── convert_slf.pl ├── convert_slf_parallel.sh ├── copy_data_dir.sh ├── create_data_link.pl ├── create_split_dir.pl ├── ctm ├── convert_ctm.pl ├── fix_ctm.sh └── resolve_ctm_overlaps.py ├── data ├── combine_data.sh ├── combine_short_segments.sh ├── convert_data_dir_to_whole.sh ├── copy_data_dir.sh ├── extend_segment_times.py ├── extract_wav_segments_data_dir.sh ├── fix_data_dir.sh ├── fix_subsegment_feats.pl ├── get_allowed_durations.py ├── get_frame_shift.sh ├── get_num_frames.sh ├── get_reco2dur.sh ├── get_reco2utt_for_data.sh ├── get_segments_for_data.sh ├── get_uniform_subsegments.py ├── get_utt2dur.sh ├── get_utt2num_frames.sh ├── internal │ ├── choose_utts_to_combine.py │ ├── combine_segments_to_recording.py │ ├── modify_speaker_info.py │ └── perturb_volume.py ├── limit_feature_dim.sh ├── modify_speaker_info.sh ├── modify_speaker_info_to_recording.sh ├── normalize_data_range.pl ├── perturb_data_dir_speed.sh ├── perturb_data_dir_speed_3way.sh ├── perturb_data_dir_volume.sh ├── perturb_speed_to_allowed_lengths.py ├── remove_dup_utts.sh ├── resample_data_dir.sh ├── shift_and_combine_feats.sh ├── shift_feats.sh ├── split_data.sh ├── subsegment_data_dir.sh ├── subset_data_dir.sh └── validate_data_dir.sh ├── dict_dir_add_pronprobs.sh ├── eps2disambig.pl ├── filt.py ├── filter_scp.pl ├── filter_scps.pl ├── find_arpa_oovs.pl ├── fix_ctm.sh ├── fix_data_dir.sh ├── format_lm.sh ├── format_lm_sri.sh ├── gen_topo.pl ├── int2sym.pl ├── kwslist_post_process.pl ├── lang ├── add_lex_disambig.pl ├── add_unigrams_arpa.pl ├── adjust_unk_arpa.pl ├── adjust_unk_graph.sh ├── bpe │ ├── add_final_optional_silence.sh │ ├── apply_bpe.py │ ├── bidi.py │ ├── learn_bpe.py │ ├── prepend_words.py │ └── reverse.py ├── check_g_properties.pl ├── check_phones_compatible.sh ├── compute_sentence_probs_arpa.py ├── extend_lang.sh ├── get_word_position_phone_map.pl ├── grammar │ ├── augment_phones_txt.py │ └── augment_words_txt.py ├── internal │ ├── apply_unk_lm.sh │ ├── arpa2fst_constrained.py │ └── modify_unk_pron.py ├── limit_arpa_unk_history.py ├── make_kn_lm.py ├── make_lexicon_fst.py ├── make_lexicon_fst_silprob.py ├── make_phone_bigram_lang.sh ├── make_phone_lm.py ├── make_position_dependent_subword_lexicon.py ├── make_subword_lexicon_fst.py ├── make_unk_lm.sh ├── prepare_lang.sh ├── validate_disambig_sym_file.pl └── validate_lang.pl ├── ln.pl ├── make_absolute.sh ├── make_lexicon_fst.pl ├── make_lexicon_fst_silprob.pl ├── make_unigram_grammar.pl ├── map_arpa_lm.pl ├── mkgraph.sh ├── mkgraph_lookahead.sh ├── nnet-cpu ├── make_nnet_config.pl ├── make_nnet_config_block.pl ├── make_nnet_config_preconditioned.pl └── update_learning_rates.pl ├── nnet ├── gen_dct_mat.py ├── gen_hamm_mat.py ├── gen_splice.py ├── make_blstm_proto.py ├── make_cnn_proto.py ├── make_lstm_proto.py ├── make_nnet_proto.py └── subset_data_tr_cv.sh ├── nnet3 └── convert_config_tdnn_to_affine.py ├── parallel ├── limit_num_gpus.sh ├── pbs.pl ├── queue.pl ├── retry.pl ├── run.pl └── slurm.pl ├── parse_options.sh ├── 
pbs.pl ├── perturb_data_dir_speed.sh ├── pinyin_map.pl ├── prepare_extended_lang.sh ├── prepare_lang.sh ├── prepare_online_nnet_dist_build.sh ├── queue.pl ├── remove_data_links.sh ├── remove_oovs.pl ├── retry.pl ├── reverse_arpa.py ├── rnnlm_compute_scores.sh ├── run.pl ├── s2eps.pl ├── scoring ├── wer_ops_details.pl ├── wer_per_spk_details.pl ├── wer_per_utt_details.pl └── wer_report.pl ├── segmentation.pl ├── show_lattice.sh ├── shuffle_list.pl ├── slurm.pl ├── spk2utt_to_utt2spk.pl ├── split_data.sh ├── split_scp.pl ├── ssh.pl ├── subset_data_dir.sh ├── subset_data_dir_tr_cv.sh ├── subset_scp.pl ├── subword ├── prepare_lang_subword.sh └── prepare_subword_text.sh ├── summarize_logs.pl ├── summarize_warnings.pl ├── sym2int.pl ├── train_arpa_with_kenlm.sh ├── utt2spk_to_spk2utt.pl ├── validate_data_dir.sh ├── validate_dict_dir.pl ├── validate_lang.pl ├── validate_text.pl └── write_kwslist.pl /cmd.sh: -------------------------------------------------------------------------------- 1 | # you can change cmd.sh depending on what type of queue you are using. 2 | # If you have no queueing system and want to run on a local machine, you 3 | # can change all instances 'queue.pl' to run.pl (but be careful and run 4 | # commands one by one: most recipes will exhaust the memory on your 5 | # machine). queue.pl works with GridEngine (qsub). slurm.pl works 6 | # with slurm. Different queues are configured differently, with different 7 | # queue names and different ways of specifying things like memory; 8 | # to account for these differences you can create and edit the file 9 | # conf/queue.conf to match your queue's configuration. Search for 10 | # conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, 11 | # or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. 12 | 13 | export train_cmd="run.pl" 14 | -------------------------------------------------------------------------------- /conf/fbank.conf: -------------------------------------------------------------------------------- 1 | --use-energy=false 2 | --num-mel-bins=40 3 | --low-freq=20 4 | --high-freq=-400 5 | -------------------------------------------------------------------------------- /conf/fbank_16k.conf: -------------------------------------------------------------------------------- 1 | --use-energy=false 2 | --num-mel-bins=40 3 | --low-freq=20 4 | --high-freq=-400 5 | -------------------------------------------------------------------------------- /conf/fbank_16k_64ms_16ms.conf: -------------------------------------------------------------------------------- 1 | --use-energy=false 2 | --num-mel-bins=40 3 | --low-freq=20 4 | --high-freq=-400 5 | --frame-length=64 6 | --frame-shift=16 7 | -------------------------------------------------------------------------------- /conf/fbank_8k.conf: -------------------------------------------------------------------------------- 1 | --use-energy=false 2 | --num-mel-bins=40 3 | --low-freq=20 4 | --high-freq=-400 5 | --sample-frequency=8000 6 | -------------------------------------------------------------------------------- /conf/mfcc.conf: -------------------------------------------------------------------------------- 1 | --use-energy=false 2 | --sample-frequency=16000 3 | -------------------------------------------------------------------------------- /conf/mfcc_hires.conf: -------------------------------------------------------------------------------- 1 | # config for high-resolution MFCC features, intended for neural network training. 
2 | # Note: we keep all cepstra, so it has the same info as filterbank features, 3 | # but MFCC is more easily compressible (because less correlated) which is why 4 | # we prefer this method. 5 | --use-energy=false # use average of log energy, not energy. 6 | --sample-frequency=16000 7 | --num-mel-bins=40 8 | --num-ceps=40 9 | --low-freq=40 10 | --high-freq=-400 11 | -------------------------------------------------------------------------------- /conf/mfcc_hires_16k.conf: -------------------------------------------------------------------------------- 1 | # config for high-resolution MFCC features, intended for neural network training. 2 | # Note: we keep all cepstra, so it has the same info as filterbank features, 3 | # but MFCC is more easily compressible (because less correlated) which is why 4 | # we prefer this method. 5 | --use-energy=false # use average of log energy, not energy. 6 | --sample-frequency=16000 7 | --num-mel-bins=40 8 | --num-ceps=40 9 | --low-freq=40 10 | --high-freq=-400 11 | -------------------------------------------------------------------------------- /conf/mfcc_hires_8k.conf: -------------------------------------------------------------------------------- 1 | # config for high-resolution MFCC features, intended for neural network training. 2 | # Note: we keep all cepstra, so it has the same info as filterbank features, 3 | # but MFCC is more easily compressible (because less correlated) which is why 4 | # we prefer this method. 5 | --use-energy=false # use average of log energy, not energy. 6 | --sample-frequency=8000 7 | --num-mel-bins=40 8 | --num-ceps=40 9 | --low-freq=40 10 | --high-freq=-200 11 | -------------------------------------------------------------------------------- /conf/mfcc_sad.conf: -------------------------------------------------------------------------------- 1 | # config for high-resolution MFCC features, intended for SAD neural network training. 2 | # Note: we keep all cepstra, so it has the same info as filterbank features, 3 | # but MFCC is more easily compressible (because less correlated) which is why 4 | # we prefer this method. 5 | --use-energy=false # use average of log energy, not energy. 
6 | --sample-frequency=16000 7 | --num-mel-bins=40 8 | --num-ceps=40 9 | --low-freq=40 10 | --high-freq=-400 11 | -------------------------------------------------------------------------------- /conf/online_cmvn.conf: -------------------------------------------------------------------------------- 1 | # configuration file for apply-cmvn-online, used in the script ../local/run_online_decoding.sh 2 | -------------------------------------------------------------------------------- /conf/spectrogram_16k_64ms_16ms.conf: -------------------------------------------------------------------------------- 1 | --frame-length=64 2 | --frame-shift=16 3 | -------------------------------------------------------------------------------- /doc/CHiME_2023_DASR_wang.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liyunlongaaa/NSD-MS2S/6acb1fed78be709c0e332353468ada08bfcc2515/doc/CHiME_2023_DASR_wang.pdf -------------------------------------------------------------------------------- /doc/ICASS2024.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liyunlongaaa/NSD-MS2S/6acb1fed78be709c0e332353468ada08bfcc2515/doc/ICASS2024.pdf -------------------------------------------------------------------------------- /doc/NN_v3.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liyunlongaaa/NSD-MS2S/6acb1fed78be709c0e332353468ada08bfcc2515/doc/NN_v3.jpg -------------------------------------------------------------------------------- /doc/results.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liyunlongaaa/NSD-MS2S/6acb1fed78be709c0e332353468ada08bfcc2515/doc/results.jpg -------------------------------------------------------------------------------- /embedding_raw/voxceleb/cluster_center_128.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liyunlongaaa/NSD-MS2S/6acb1fed78be709c0e332353468ada08bfcc2515/embedding_raw/voxceleb/cluster_center_128.npy -------------------------------------------------------------------------------- /embedding_raw/voxceleb/cluster_center_256.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liyunlongaaa/NSD-MS2S/6acb1fed78be709c0e332353468ada08bfcc2515/embedding_raw/voxceleb/cluster_center_256.npy -------------------------------------------------------------------------------- /embedding_raw/voxceleb/cluster_center_64.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liyunlongaaa/NSD-MS2S/6acb1fed78be709c0e332353468ada08bfcc2515/embedding_raw/voxceleb/cluster_center_64.npy -------------------------------------------------------------------------------- /embedding_raw/voxceleb/xvector_cluster_center_128.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liyunlongaaa/NSD-MS2S/6acb1fed78be709c0e332353468ada08bfcc2515/embedding_raw/voxceleb/xvector_cluster_center_128.npy -------------------------------------------------------------------------------- /embedding_raw/voxceleb/xvector_cluster_center_256.npy: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/liyunlongaaa/NSD-MS2S/6acb1fed78be709c0e332353468ada08bfcc2515/embedding_raw/voxceleb/xvector_cluster_center_256.npy -------------------------------------------------------------------------------- /embedding_raw/voxceleb/xvector_cluster_center_64.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liyunlongaaa/NSD-MS2S/6acb1fed78be709c0e332353468ada08bfcc2515/embedding_raw/voxceleb/xvector_cluster_center_64.npy -------------------------------------------------------------------------------- /exp/nnet3_recipe_ivector/extractor/10.ie: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liyunlongaaa/NSD-MS2S/6acb1fed78be709c0e332353468ada08bfcc2515/exp/nnet3_recipe_ivector/extractor/10.ie -------------------------------------------------------------------------------- /exp/nnet3_recipe_ivector/extractor/final.dubm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liyunlongaaa/NSD-MS2S/6acb1fed78be709c0e332353468ada08bfcc2515/exp/nnet3_recipe_ivector/extractor/final.dubm -------------------------------------------------------------------------------- /exp/nnet3_recipe_ivector/extractor/final.ie: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liyunlongaaa/NSD-MS2S/6acb1fed78be709c0e332353468ada08bfcc2515/exp/nnet3_recipe_ivector/extractor/final.ie -------------------------------------------------------------------------------- /exp/nnet3_recipe_ivector/extractor/final.ie.id: -------------------------------------------------------------------------------- 1 | 3acf506c5892d1f607da22efbc9e7933 2 | -------------------------------------------------------------------------------- /exp/nnet3_recipe_ivector/extractor/final.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liyunlongaaa/NSD-MS2S/6acb1fed78be709c0e332353468ada08bfcc2515/exp/nnet3_recipe_ivector/extractor/final.mat -------------------------------------------------------------------------------- /exp/nnet3_recipe_ivector/extractor/global_cmvn.stats: -------------------------------------------------------------------------------- 1 | [ 2 | 3.06165e+11 -8.240025e+09 -2.253718e+10 3.054559e+09 -4.238454e+10 -3.825784e+10 -5.03306e+10 -2.026265e+10 -2.347276e+10 -4.240301e+09 -1.706322e+10 -6.892789e+09 -2.357631e+10 -2.507509e+09 -1.907804e+10 -6.255032e+09 -1.384562e+10 -2.108998e+09 -8.082981e+09 -7.889935e+08 -3.700739e+09 -7.177256e+07 -6.844363e+08 -3.111713e+07 1.112144e+09 -1.571209e+08 1.9715e+09 -8.712586e+08 1.764505e+09 -1.274736e+09 1.756529e+09 -9.595976e+08 1.610006e+09 -7.968066e+08 1.349054e+09 -3.42071e+08 6.27247e+08 -8.501681e+08 -4.32481e+08 -5.583656e+08 3.028656e+09 3 | 3.189034e+13 1.238453e+12 1.439817e+12 1.378808e+12 2.032086e+12 1.896199e+12 2.230804e+12 1.426913e+12 1.446148e+12 1.219986e+12 1.22271e+12 1.199805e+12 1.094108e+12 8.138162e+11 7.389557e+11 4.92009e+11 4.304046e+11 2.647395e+11 1.897839e+11 9.905068e+10 5.47939e+10 1.789002e+10 2.644892e+09 7.090682e+08 9.178836e+09 2.227656e+10 3.951766e+10 5.257301e+10 6.400752e+10 7.014398e+10 7.660171e+10 8.050034e+10 8.482946e+10 8.021838e+10 6.389043e+10 5.309594e+10 4.975399e+10 3.896006e+10 2.983068e+10 2.09207e+10 0 ] 4 | 
-------------------------------------------------------------------------------- /exp/nnet3_recipe_ivector/extractor/num_jobs: -------------------------------------------------------------------------------- 1 | 40 2 | -------------------------------------------------------------------------------- /exp/nnet3_recipe_ivector/extractor/online_cmvn.conf: -------------------------------------------------------------------------------- 1 | # configuration file for apply-cmvn-online, used in the script ../local/run_online_decoding.sh 2 | -------------------------------------------------------------------------------- /exp/nnet3_recipe_ivector/extractor/splice_opts: -------------------------------------------------------------------------------- 1 | --left-context=3 --right-context=3 2 | -------------------------------------------------------------------------------- /local/HTK.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import numpy 4 | import struct 5 | 6 | 7 | def readHtk(filename): 8 | ''' 9 | Reads the features in a HTK file, and returns them in a 2-D numpy array. 10 | ''' 11 | with open(filename, "rb") as f: 12 | # Read header 13 | nSamples, sampPeriod, sampSize, parmKind = struct.unpack(">iihh", f.read(12)) 14 | # sampPeriod and parmKind will be omitted 15 | # Read data 16 | data = struct.unpack(">%df" % (nSamples * sampSize / 4), f.read(nSamples * sampSize)) 17 | # return numpy.array(data).reshape(nSamples, int(sampSize / 4)) 18 | return nSamples, sampPeriod, sampSize, parmKind, data 19 | 20 | def readHtk_start_end(filename, start, end): 21 | with open(filename, "rb") as f: 22 | # Read header 23 | nSamples, sampPeriod, sampSize, parmKind = struct.unpack(">iihh", f.read(12)) 24 | # sampPeriod and parmKind will be omitted 25 | f.seek(start * sampSize,1) 26 | # Read data 27 | data = struct.unpack(">%df" % ((end - start) * sampSize / 4), f.read((end - start) * sampSize)) 28 | # return numpy.array(data).reshape(nSamples, int(sampSize 1 4)) 29 | return nSamples, sampPeriod, sampSize, parmKind, data 30 | 31 | def readHtk_info(filename): 32 | with open(filename, "rb") as f: 33 | # Read header 34 | nSamples, sampPeriod, sampSize, parmKind = struct.unpack(">iihh", f.read(12)) 35 | return nSamples, sampPeriod, sampSize, parmKind 36 | 37 | def writeHtk(filename, feature, sampPeriod=3200, parmKind=9): 38 | ''' 39 | Writes the features in a 2-D numpy array into a HTK file. 40 | ''' 41 | with open(filename, "wb") as f: 42 | # Write header 43 | nSamples = feature.shape[0] 44 | sampSize = feature.shape[1] * 4 45 | f.write(struct.pack(">iihh", nSamples, sampPeriod, sampSize, parmKind)) 46 | # Write data 47 | f.write(struct.pack(">%df" % (nSamples * sampSize / 4), *feature.ravel())) 48 | 49 | 50 | def read_wav_start_end(path, start, end): 51 | dur = end - start 52 | with open(path, "rb") as f: 53 | f.seek(44 + start * 2, 1) 54 | data = struct.unpack("<%dh" % (dur), f.read(dur*2)) 55 | #print(dur, numpy.array(data).shape) 56 | return numpy.array(data) / 32768. 
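# Example (editorial addition, not part of the original script; the path below is a placeholder):
# read a whole HTK feature file and reshape the flat float tuple into a frames x dims array.
#
#   nSamples, sampPeriod, sampSize, parmKind, data = readHtk("feats/S02_U01.fea")
#   feats = numpy.array(data).reshape(nSamples, sampSize // 4)   # sampSize is bytes per frame, 4 bytes per float
#   writeHtk("feats/S02_U01_copy.fea", feats, sampPeriod, parmKind)  # round-trip write with the same header fields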
-------------------------------------------------------------------------------- /local/__pycache__/HTK.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liyunlongaaa/NSD-MS2S/6acb1fed78be709c0e332353468ada08bfcc2515/local/__pycache__/HTK.cpython-39.pyc -------------------------------------------------------------------------------- /local/__pycache__/config.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liyunlongaaa/NSD-MS2S/6acb1fed78be709c0e332353468ada08bfcc2515/local/__pycache__/config.cpython-310.pyc -------------------------------------------------------------------------------- /local/__pycache__/config.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liyunlongaaa/NSD-MS2S/6acb1fed78be709c0e332353468ada08bfcc2515/local/__pycache__/config.cpython-39.pyc -------------------------------------------------------------------------------- /local/__pycache__/conformer2.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liyunlongaaa/NSD-MS2S/6acb1fed78be709c0e332353468ada08bfcc2515/local/__pycache__/conformer2.cpython-310.pyc -------------------------------------------------------------------------------- /local/__pycache__/conformer2.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liyunlongaaa/NSD-MS2S/6acb1fed78be709c0e332353468ada08bfcc2515/local/__pycache__/conformer2.cpython-39.pyc -------------------------------------------------------------------------------- /local/__pycache__/model_S2S_weight_input_DIM.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liyunlongaaa/NSD-MS2S/6acb1fed78be709c0e332353468ada08bfcc2515/local/__pycache__/model_S2S_weight_input_DIM.cpython-39.pyc -------------------------------------------------------------------------------- /local/__pycache__/reader_s2s.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liyunlongaaa/NSD-MS2S/6acb1fed78be709c0e332353468ada08bfcc2515/local/__pycache__/reader_s2s.cpython-39.pyc -------------------------------------------------------------------------------- /local/__pycache__/utils.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liyunlongaaa/NSD-MS2S/6acb1fed78be709c0e332353468ada08bfcc2515/local/__pycache__/utils.cpython-39.pyc -------------------------------------------------------------------------------- /local/__pycache__/utils_s2s.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liyunlongaaa/NSD-MS2S/6acb1fed78be709c0e332353468ada08bfcc2515/local/__pycache__/utils_s2s.cpython-39.pyc -------------------------------------------------------------------------------- /local/analysis_diarization.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # 3 | 4 | score_area= 5 | collar=0 6 | uem=None 7 | . 
./utils/parse_options.sh 8 | ref_rttm_path=$1 9 | hyp_rttm_path=$2 10 | tempdir=$( mktemp -d /tmp/eval_diarization.XXXXXX ) 11 | if [ -f $uem ];then 12 | echo uem 13 | local/md-eval-22.pl $score_area -u $uem -c $collar -afc -r $ref_rttm_path -s $hyp_rttm_path 2>/dev/null > ${tempdir}/temp.info 14 | else 15 | local/md-eval-22.pl $score_area -c $collar -afc -r $ref_rttm_path -s $hyp_rttm_path 2>/dev/null > ${tempdir}/temp.info 16 | fi 17 | grep SCORED ${tempdir}/temp.info | cut -d "=" -f 2 | cut -d " " -f 1 > ${tempdir}/SCORED.list 18 | grep MISSED ${tempdir}/temp.info | cut -d "=" -f 2 | cut -d " " -f 1 > ${tempdir}/MISSED.list 19 | grep FALARM ${tempdir}/temp.info | cut -d "=" -f 2 | cut -d " " -f 1 > ${tempdir}/FALARM.list 20 | grep "SPEAKER ERROR" ${tempdir}/temp.info | cut -d "=" -f 2 | cut -d " " -f 1 > ${tempdir}/SPEAKER.list 21 | grep OVERALL ${tempdir}/temp.info | cut -d "=" -f 4 | cut -d ")" -f 1 > ${tempdir}/session.list 22 | sed -i '$d' ${tempdir}/session.list 23 | echo "ALL" >> ${tempdir}/session.list 24 | for l in `cat ${tempdir}/session.list`;do 25 | grep $l $ref_rttm_path | awk '{print $8}' | sort | uniq | wc -l 26 | done > ${tempdir}/oracle_spknum.list 27 | 28 | for l in `cat ${tempdir}/session.list`;do 29 | grep $l $hyp_rttm_path | awk '{print $8}' | sort | uniq | wc -l 30 | done > ${tempdir}/diarized_spknum.list 31 | 32 | paste -d " " ${tempdir}/session.list ${tempdir}/SCORED.list ${tempdir}/MISSED.list \ 33 | ${tempdir}/FALARM.list ${tempdir}/SPEAKER.list ${tempdir}/oracle_spknum.list \ 34 | ${tempdir}/diarized_spknum.list > ${tempdir}/temp.details 35 | 36 | awk '{printf "%s %.2f %.2f %.2f %.2f %d %d\n",$1,$4/$2*100,$3/$2*100,$5/$2*100,($3+$4+$5)/$2*100,$6,$7}' ${tempdir}/temp.details > ${tempdir}/temp.info1 37 | echo "session FA MISS SPKERR DER ORACLE_SPKNUM DIARIZED_SPKNUM" > ${tempdir}/temp.details 38 | grep -v "ALL" ${tempdir}/temp.info1 | sort -n -k 5 >> ${tempdir}/temp.details 39 | grep "ALL" ${tempdir}/temp.info1 >> ${tempdir}/temp.details 40 | 41 | column -t ${tempdir}/temp.details 42 | 43 | rm -rf ${tempdir} -------------------------------------------------------------------------------- /local/config.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | configs3_4Speakers_ivector_ivector128_xvectors128_S2S_MA_MSE_DIM = {"input_dim": 40, 4 | "average_pooling": 301, 5 | "cnn_configs": [[2, 64, 3, 1], [64, 64, 3, 1], [64, 128, 3, (2, 1)], [128, 128, 3, 1]], 6 | "conformer_layers": 6, 7 | "conformer_conv_kernel_size": 15, 8 | "conformer_ff_dropout": 0.1, 9 | "decoder_layers": 6, 10 | "decoder_num_heads": 8, 11 | "decoder_ffn_num_hiddens": 1024, 12 | "decoder_mlp_num_hiddens": 512, 13 | "decoder_attn_dropout": 0.0, 14 | "decoder_dropout": 0.0, 15 | "decode_Time": 800, 16 | "fea_dim": 512, 17 | "embedding_path1": "embedding_raw/voxceleb/cluster_center_128.npy", 18 | "ma_mse_layers_1":1, 19 | "embedding_path2": "embedding_raw/voxceleb/xvector_cluster_center_128.npy", 20 | "ma_mse_layers_2":1, 21 | "output_speaker": 4 22 | } 23 | 24 | configs3_2Speakers_ivector_ivector128_xvectors128_S2S_MA_MSE_DIM = {"input_dim": 40, 25 | "average_pooling": 301, 26 | "cnn_configs": [[2, 64, 3, 1], [64, 64, 3, 1], [64, 128, 3, (2, 1)], [128, 128, 3, 1]], 27 | "conformer_layers": 6, 28 | "conformer_conv_kernel_size": 15, 29 | "conformer_ff_dropout": 0.1, 30 | "decoder_layers": 6, 31 | "decoder_num_heads": 8, 32 | "decoder_ffn_num_hiddens": 1024, 33 | "decoder_mlp_num_hiddens": 512, 34 | "decoder_attn_dropout": 0.0, 35 | 
"decoder_dropout": 0.0, 36 | "decode_Time": 800, 37 | "fea_dim": 512, 38 | "embedding_path1": "embedding_raw/voxceleb/cluster_center_128.npy", 39 | "ma_mse_layers_1":3, 40 | "embedding_path2": "embedding_raw/voxceleb/xvector_cluster_center_128.npy", 41 | "ma_mse_layers_2":3, 42 | "output_speaker": 2 43 | } 44 | 45 | 46 | configs = { 47 | "configs3_4Speakers_ivector_ivector128_xvectors128_S2S_MA_MSE_DIM": configs3_4Speakers_ivector_ivector128_xvectors128_S2S_MA_MSE_DIM, 48 | "configs3_2Speakers_ivector_ivector128_xvectors128_S2S_MA_MSE_DIM": configs3_2Speakers_ivector_ivector128_xvectors128_S2S_MA_MSE_DIM, 49 | } 50 | -------------------------------------------------------------------------------- /local/run_MAMSE_S2S_chime7_ws_input_DIM.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import os 4 | import sys 5 | from train_Pretrain_DDP_S2S import Train 6 | from model_S2S_weight_input_DIM import MULTI_MAM_SE_S2S_model 7 | from config import configs3_4Speakers_ivector_ivector128_xvectors128_S2S_MA_MSE_DIM as config_train 8 | import torch 9 | from reader_sc_s2s import Fbank_Embedding_Label_Mask, collate_fn_mask, RTTM_to_Speaker_Mask 10 | 11 | 12 | data="CHiME6MAMSELabel_SimuCHiME6_Mixer6MAMSELabel_SimuMixer6_SimuDipcoDevNoise" # train data name 13 | feature_scp = f"data/{data}/cmn_slide_fbank_htk.list" # fbank 14 | ivector_path = f"data/{data}/ivectors_spk.txt" # i-vector 15 | oracle_rttm = f"data/{data}/oracle.rttm" 16 | 17 | max_utt_durance = 800 18 | batchsize = 20 19 | mixup_rate=0.5 20 | 21 | output_dir = f"exp/S2S/Batchsize{batchsize}_4speakers_Segment{max_utt_durance}s_Mixup{mixup_rate}_{data}_all_data_512_all0Dropout_6layers_weight_input_DIM" 22 | print('exp will be saved in', output_dir) 23 | if not os.path.exists(output_dir): 24 | os.makedirs(output_dir, exist_ok=True) 25 | label_2classes = RTTM_to_Speaker_Mask(oracle_rttm, differ_silence_inference_speech = False) 26 | 27 | multiple_4speakers_2classes = Fbank_Embedding_Label_Mask(feature_scp, ivector_path, label_2classes, append_speaker=True, diff_speaker=True, min_speaker=2, max_speaker=4, max_utt_durance=max_utt_durance, frame_shift=int(max_utt_durance/4*3), mixup_rate=mixup_rate, alpha=0.5) 28 | 29 | 30 | os.system("cp {} {}/{}".format(os.path.abspath(sys.argv[0]), output_dir, os.path.basename(sys.argv[0]))) 31 | os.system("cp {} {}/{}".format("local_gb/model_S2S_weight_input_DIM.py", output_dir, "model.py")) 32 | optimizer = torch.optim.Adam 33 | loss_fn = torch.nn.BCEWithLogitsLoss() 34 | 35 | 36 | train = Train(multiple_4speakers_2classes, collate_fn_mask, MULTI_MAM_SE_S2S_model, config_train, "MULTI_MAM_SE_S2S_model", output_dir, optimizer, loss_fn, batchsize=batchsize, accumulation_steps=[(0, 1)], lr=0.0001, start_epoch=0, end_epoch=6, num_workers=12) 37 | train.train(updata_utt=True) 38 | -------------------------------------------------------------------------------- /local/split_long_segment_s2s.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import numpy as np 3 | import os 4 | import sys 5 | 6 | def split_segment(prob, sess, spk, start, end, max_dur=2000): 7 | dur = end - start 8 | if dur <= max_dur: 9 | print("SPEAKER {} 1 {:.2f} {:.2f} {} ".format(sess, start/100., dur/100., spk)) 10 | else: 11 | tosplit = int(start+100 + np.argmin(prob[int(start+100):int(end-100)])) 12 | split_segment(prob, sess, spk, start, tosplit) 13 | split_segment(prob, sess, spk, tosplit, end) 14 | 15 | 
16 | prob_array_dir = sys.argv[1] 17 | input_rttm = sys.argv[2] 18 | prob_array = [os.path.join(prob_array_dir, l) for l in os.listdir(prob_array_dir)] 19 | prob_label = {} 20 | #print(prob_array_dir, input_rttm) 21 | for p in prob_array: 22 | if p.find(".npy") == -1: continue 23 | session = os.path.basename(p).split('.')[0] 24 | if session.find("CH") != -1 and session.find("S") != -1: 25 | sess = session.split("_")[0] 26 | elif session.find("CH") != -1 and session.find("S") == -1: 27 | sess = "_".join(session.split("_")[:-1]) 28 | else: 29 | sess = session 30 | prob_label[sess] = np.load(os.path.join(p)) #num_spk, len 31 | IN = open(input_rttm) 32 | for l in IN: 33 | #print(l) 34 | line = l.split(" ") 35 | session = line[1] 36 | if line[-2] != "": 37 | spk = line[-2] 38 | else: 39 | spk = line[-3] 40 | #print(line[3] ) 41 | start = np.int64(np.float64(line[3]) * 100 ) 42 | dur = np.int64(np.float64(line[4]) * 100) 43 | end = start + dur 44 | if dur <= 2000: 45 | print(l.rstrip()) 46 | #pass 47 | else: 48 | split_segment(prob_label[session][int(spk)], session, spk, start, end, max_dur=2000) 49 | -------------------------------------------------------------------------------- /local/utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import torch 4 | import logging 5 | import sys 6 | import pdb 7 | 8 | 9 | def save_checkpoint(model, optimizer, filename): 10 | try: 11 | torch.save({'model': model.state_dict(), 'optimizer': optimizer.state_dict()}, filename) 12 | except: 13 | torch.save({'model': model.state_dict(), \ 14 | 'optimizer_tsvad': optimizer['tsvad'].state_dict(), \ 15 | 'optimizer_resnet': optimizer['resnet'].state_dict()}, filename) 16 | 17 | def load_checkpoint(model, optimizer, filename): 18 | checkpoint = torch.load(filename) 19 | if model is not None: 20 | model.load_state_dict(checkpoint['model']) 21 | if optimizer is not None: 22 | optimizer.load_state_dict(checkpoint['optimizer']) 23 | 24 | 25 | def load_checkpoint_join_training(model, optimizer, filename): 26 | checkpoint = torch.load(filename) 27 | # pdb.set_trace() 28 | if model is not None: 29 | model_dict = model.state_dict() 30 | # pdb.set_trace() 31 | state_dict_2 = {k:v for k,v in checkpoint['model'].items()} 32 | # pdb.set_trace() 33 | model_dict.update(state_dict_2) 34 | model.load_state_dict(model_dict) 35 | # model_dict['FC.2.weight'] - checkpoint['model']['FC.2.weight'] 36 | # pdb.set_trace() 37 | # model.load_state_dict(checkpoint['model']) 38 | # pdb.set_trace() 39 | if optimizer is not None and 'join_train' in filename: 40 | print('load optimizer') 41 | optimizer.load_state_dict(checkpoint['optimizer']) 42 | 43 | def get_logger(filename): 44 | # Logging configuration: set the basic configuration of the logging system 45 | log_formatter = logging.Formatter(fmt='%(asctime)s [%(processName)s, %(process)s] [%(levelname)-5.5s] %(message)s', datefmt='%m-%d %H:%M') 46 | logger = logging.getLogger() 47 | logger.setLevel(logging.DEBUG) 48 | # File logger 49 | file_handler = logging.FileHandler("{}.log".format(filename)) 50 | file_handler.setFormatter(log_formatter) 51 | file_handler.setLevel(logging.DEBUG) 52 | logger.addHandler(file_handler) 53 | # Stderr logger 54 | std_handler = logging.StreamHandler(sys.stdout) 55 | std_handler.setFormatter(log_formatter) 56 | std_handler.setLevel(logging.DEBUG) 57 | logger.addHandler(std_handler) 58 | return logger 59 | -------------------------------------------------------------------------------- 
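A minimal usage sketch for the helpers in local/utils.py (editorial addition): the model, optimizer, and output path below are placeholders rather than anything from the repository, and the import assumes local/ is on PYTHONPATH, as in the repository's training scripts.

import torch
from utils import get_logger, save_checkpoint, load_checkpoint

logger = get_logger("exp/S2S/example_run")        # writes exp/S2S/example_run.log and echoes to stdout
model = torch.nn.Linear(40, 4)                    # placeholder model
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

save_checkpoint(model, optimizer, "exp/S2S/example_run.model0")   # stores model + optimizer state dicts
load_checkpoint(model, optimizer, "exp/S2S/example_run.model0")   # restores both in place
logger.info("checkpoint restored")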
/path.sh: -------------------------------------------------------------------------------- 1 | #export KALDI_ROOT=/yrfs1/intern/glzhong/kaldi 2 | export KALDI_ROOT=/home/yoos/Documents/code/kaldi 3 | export LD_LIBRARY_PATH=$KALDI_ROOT/tools/openfst/lib:$LD_LIBRARY_PATH 4 | export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/tools/sph2pipe_v2.5:$KALDI_ROOT/tools/sctk/bin:$PWD:$PATH 5 | [ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 6 | . $KALDI_ROOT/tools/config/common_path.sh 7 | export LC_ALL=C 8 | export LD_LIBRARY_PATH=$KALDI_ROOT/src/lib:$KALDI_ROOT/tools/openfst/lib:$LD_LIBRARY_PATH 9 | #LD_LIBRARY_PATH=/yrfs5/sre/leisun8/tools/kaldi_cuda9/tools/sox/lib:$LD_LIBRARY_PATH 10 | #PATH=/yrfs5/sre/leisun8/tools/kaldi_cuda9/tools/sox/bin:$PATH 11 | 12 | #PATH=/home4/intern/rywang9/tools/sox/:$PATH 13 | #LD_LIBRARY_PATH=/home4/intern/rywang9/tools/sox/lib:$LD_LIBRARY_PATH 14 | #export PATH=/home/intern/stniu/anaconda3/bin/:$PATH 15 | #export PATH=/home4/intern/stniu/anaconda3/envs/mss/bin:$PATH 16 | #export PATH=/opt/lib/cuda-9.0_cudnn-v7.1.4/bin${PATH:+:${PATH}} 17 | #export LD_LIBRARY_PATH=/opt/lib/cuda-9.0_cudnn-v7.1.4/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}} 18 | 19 | export PATH=/opt/lib/cuda-10.2/bin${PATH:+:${PATH}} 20 | export LD_LIBRARY_PATH=/opt/lib/cudnn/cudnn-10.2-v7.6.5.32/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}} 21 | export LD_LIBRARY_PATH=/work1/sre/leisun8/tools/libsndfile/lib/:$LD_LIBRARY_PATH 22 | #export LD_LIBRARY_PATH=/home4/intern/mkhe/anaconda3/envs/torch/lib:$LD_LIBRARY_PATH 23 | #. path_v100.sh 24 | export PATH=/home4/intern/mkhe/anaconda3/bin/:$PATH 25 | export LD_LIBRARY_PATH=/home4/intern/mkhe/anaconda3/lib:$LD_LIBRARY_PATH 26 | 27 | export PATH=/home4/intern/stniu/libs/ffmpeg/bin/:$PATH 28 | export LD_LIBRARY_PATH=/home4/intern/stniu/libs/ffmpeg/lib:$LD_LIBRARY_PATH 29 | #CUDA_LAUNCH_BLOCKING=1 30 | #export NCCL_IB_DISABLE=1 31 | # NCCL_DEBUG=INFO 32 | 33 | NCCL_SOCKET_IFNAME=eth0 34 | 35 | #export PATH=/home3/cv1/hangchen2/anaconda3/envs/py38+cu102/bin/:$PATH 36 | #export LD_LIBRARY_PATH=/home3/cv1/hangchen2/anaconda3/envs/py38+cu102/lib:$LD_LIBRARY_PATH 37 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | einops==0.7.0 2 | matplotlib==3.6.2 3 | numpy<1.28.0 4 | scipy==1.11.4 5 | torch==2.1.1 6 | tqdm==4.65.0 -------------------------------------------------------------------------------- /run_decode.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # single model decode 4 | bash /train8/sppro/gbyang/code/NSD-MS2S/local/decode_S2S_model.sh --stage 3 --data chime7_eval_all_CH --diarized_rttm data/chime7_eval_all_CH/f1.rttm --affix f1 5 | 6 | # models fusion decode 7 | bash /train8/sppro/gbyang/code/NSD-MS2S/local/decode_S2S_models_fusion.sh --stage 3 --data chime7_eval_all_CH --diarized_rttm data/chime7_eval_all_CH/f1.rttm --affix f1 -------------------------------------------------------------------------------- /steps/cleanup/internal/ctm_to_text.pl: -------------------------------------------------------------------------------- 1 | #! /usr/bin/perl 2 | 3 | # Copyright 2016 Vimal Manohar 4 | # Apache 2.0. 5 | 6 | use strict; 7 | use warnings; 8 | 9 | if (scalar @ARGV != 1 && scalar @ARGV != 3) { 10 | my $usage = < []. 
18 | This script assumes the CTM to be in NIST sorted order given by UNIX 19 | sort command "sort +0 -1 +1 -2 +2nb -3" 20 | 21 | Usage: ctm_to_text.pl [--non-scored-words ] > 22 | END 23 | die $usage; 24 | } 25 | 26 | my $non_scored_words_list = ""; 27 | if (scalar @ARGV > 1) { 28 | if ($ARGV[0] eq "--non-scored-words") { 29 | shift @ARGV; 30 | $non_scored_words_list = shift @ARGV; 31 | } else { 32 | die "Unknown option $ARGV[0]\n"; 33 | } 34 | } 35 | 36 | my %non_scored_words; 37 | $non_scored_words{""} = 1; 38 | 39 | if ($non_scored_words_list ne "") { 40 | open NONSCORED, $non_scored_words_list or die "Failed to open $non_scored_words_list"; 41 | 42 | while () { 43 | chomp; 44 | my @F = split; 45 | $non_scored_words{$F[0]} = 1; 46 | } 47 | 48 | close NONSCORED; 49 | } 50 | 51 | my $ctm_file = shift @ARGV; 52 | open CTM, $ctm_file or die "Failed to open $ctm_file"; 53 | 54 | my $prev_utt = ""; 55 | my @text; 56 | 57 | while () { 58 | chomp; 59 | my @F = split; 60 | 61 | my $utt = $F[0]; 62 | if ($utt ne $prev_utt && $prev_utt ne "") { 63 | if (scalar @text > 0) { 64 | print $prev_utt . " " . join(" ", @text) . "\n"; 65 | } 66 | @text = (); 67 | } 68 | 69 | if (scalar @F < 5 || scalar @F > 6) { 70 | die "Invalid line $_ in CTM $ctm_file\n"; 71 | } 72 | 73 | if (!defined $non_scored_words{$F[4]}) { 74 | push @text, $F[4]; 75 | } 76 | 77 | $prev_utt = $utt; 78 | } 79 | 80 | close CTM; 81 | 82 | if (scalar @text > 0) { 83 | print $prev_utt . " " . join(" ", @text) . "\n"; 84 | } 85 | -------------------------------------------------------------------------------- /steps/cleanup/internal/split_text_into_docs.pl: -------------------------------------------------------------------------------- 1 | #! /usr/bin/perl 2 | 3 | # Copyright 2017 Vimal Manohar 4 | # Apache 2.0. 
5 | 6 | # If 'text' contains: 7 | # utterance1 A B C D 8 | # utterance2 C B 9 | # and you ran: 10 | # split_text_into_docs.pl --max-words 2 text doc2text docs 11 | # then 'doc2text' would contain: 12 | # utterance1-1 utterance1 13 | # utterance1-2 utterance1 14 | # utterance2-1 utterance2 15 | # and 'docs' would contain: 16 | # utterance1-1 A B 17 | # utterance1-2 C D 18 | # utterance2-1 C B 19 | 20 | use warnings; 21 | use strict; 22 | 23 | my $max_words = 1000; 24 | 25 | my $usage = "Usage: steps/cleanup/internal/split_text_into_docs.pl [--max-words ] text doc2text docs\n"; 26 | 27 | while (@ARGV > 3) { 28 | if ($ARGV[0] eq "--max-words") { 29 | shift @ARGV; 30 | $max_words = shift @ARGV; 31 | } else { 32 | print STDERR "$usage"; 33 | exit (1); 34 | } 35 | } 36 | 37 | if (scalar @ARGV != 3) { 38 | print STDERR "$usage"; 39 | exit (1); 40 | } 41 | 42 | sub min ($$) { $_[$_[0] > $_[1]] } 43 | 44 | open TEXT, $ARGV[0] or die "$0: Could not open file $ARGV[0] for reading\n"; 45 | open DOC2TEXT, ">", $ARGV[1] or die "$0: Could not open file $ARGV[1] for writing\n"; 46 | open DOCS, ">", $ARGV[2] or die "$0: Could not open file $ARGV[2] for writing\n"; 47 | 48 | while () { 49 | chomp; 50 | my @F = split; 51 | my $utt = shift @F; 52 | my $num_words = scalar @F; 53 | 54 | if ($num_words <= $max_words) { 55 | print DOCS "$_\n"; 56 | print DOC2TEXT "$utt $utt\n"; 57 | next; 58 | } 59 | 60 | my $num_docs = int($num_words / $max_words) + 1; 61 | my $num_words_shift = int($num_words / $num_docs) + 1; 62 | my $words_per_doc = $num_words_shift; 63 | 64 | #print STDERR ("$utt num-words=$num_words num-docs=$num_docs words-per-doc=$words_per_doc\n"); 65 | 66 | for (my $i = 0; $i < $num_docs; $i++) { 67 | my $st = $i*$num_words_shift; 68 | my $end = min($st + $words_per_doc, $num_words) - 1; 69 | print DOCS ("$utt-$i " . join(" ", @F[$st..$end]) . "\n"); 70 | print DOC2TEXT "$utt-$i $utt\n"; 71 | } 72 | } 73 | -------------------------------------------------------------------------------- /steps/cleanup/make_utterance_fsts.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | use warnings; #sed replacement for -w perl parameter 3 | 4 | # makes unigram decoding-graph FSTs specific to each utterances, where the 5 | # supplied top-n-words list together with the supervision text of the utterance are 6 | # combined. 7 | 8 | if (@ARGV != 1) { 9 | print STDERR "** Warning: this script is deprecated and will be removed. See\n" . 10 | "** steps/cleanup/make_biased_lm_graphs.sh.\n" . 11 | "Usage: make_utterance_fsts.pl top-words-file.txt < text-archive > fsts-archive\n" . 12 | "e.g.: utils/sym2int.pl -f 2- data/lang/words.txt data/train/text | \\\n" . 13 | " make_utterance_fsts.pl exp/foo/top_words.int | compile-train-graphs-fsts ... 
\n"; 14 | exit(1); 15 | } 16 | 17 | ($top_words_file) = @ARGV; 18 | 19 | open(F, "<$top_words_file") || die "opening $top_words_file"; 20 | 21 | %top_word_probs = ( ); 22 | 23 | while() { 24 | @A = split; 25 | (@A == 2 && $A[0] > 0.0) || die "Bad line $_ in $top_words_file"; 26 | $A[1] =~ m/^[0-9]+$/ || die "Expecting numeric word-ids in $top_words_file: $_\n"; 27 | $top_word_probs{$A[1]} += $A[0]; 28 | } 29 | 30 | while () { 31 | @A = split; 32 | $utterance_id = shift @A; 33 | print "$utterance_id\n"; 34 | $num_words = @A + 0; # length of array @A 35 | %word_probs = %top_word_probs; 36 | foreach $w (@A) { 37 | $w =~ m/^[0-9]+$/ || die "Expecting numeric word-ids as stdin: $_"; 38 | $word_probs{$w} += 1.0 / $num_words; 39 | } 40 | foreach $w (keys %word_probs) { 41 | $prob = $word_probs{$w}; 42 | $prob > 0.0 || die "Word $w with bad probability $prob, utterance-id = $utterance_id\n"; 43 | $cost = -log($prob); 44 | print "0 0 $w $w $cost\n"; 45 | } 46 | $final_cost = -log(1.0 / $num_words); 47 | print "0 $final_cost\n"; 48 | print "\n"; # Empty line terminates the FST in the text-archive format. 49 | } 50 | -------------------------------------------------------------------------------- /steps/compute_vad_decision.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2017 Vimal Manohar 4 | # Apache 2.0 5 | 6 | # To be run from .. (one directory up from here) 7 | # see ../run.sh for example 8 | 9 | # Compute energy based VAD output 10 | 11 | nj=4 12 | cmd=run.pl 13 | vad_config=conf/vad.conf 14 | 15 | echo "$0 $@" # Print the command line for logging 16 | 17 | if [ -f path.sh ]; then . ./path.sh; fi 18 | . parse_options.sh || exit 1; 19 | 20 | if [ $# -lt 1 ] || [ $# -gt 3 ]; then 21 | echo "Usage: $0 [options] [ []]"; 22 | echo "e.g.: $0 data/train exp/make_vad mfcc" 23 | echo "Note: defaults to /log, and defaults to /data" 24 | echo " Options:" 25 | echo " --vad-config # config passed to compute-vad-energy" 26 | echo " --nj # number of parallel jobs" 27 | echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." 28 | exit 1; 29 | fi 30 | 31 | data=$1 32 | if [ $# -ge 2 ]; then 33 | logdir=$2 34 | else 35 | logdir=$data/log 36 | fi 37 | if [ $# -ge 3 ]; then 38 | vaddir=$3 39 | else 40 | vaddir=$data/data 41 | fi 42 | 43 | 44 | # make $vaddir an absolute pathname. 45 | vaddir=`perl -e '($dir,$pwd)= @ARGV; if($dir!~m:^/:) { $dir = "$pwd/$dir"; } print $dir; ' $vaddir ${PWD}` 46 | 47 | # use "name" as part of name of the archive. 48 | name=`basename $data` 49 | 50 | mkdir -p $vaddir || exit 1; 51 | mkdir -p $logdir || exit 1; 52 | 53 | if [ -f $data/vad.scp ]; then 54 | mkdir -p $data/.backup 55 | echo "$0: moving $data/vad.scp to $data/.backup" 56 | mv $data/vad.scp $data/.backup 57 | fi 58 | 59 | for f in $data/feats.scp "$vad_config"; do 60 | if [ ! 
-f $f ]; then 61 | echo "compute_vad_decision.sh: no such file $f" 62 | exit 1; 63 | fi 64 | done 65 | 66 | utils/split_data.sh $data $nj || exit 1; 67 | sdata=$data/split$nj; 68 | 69 | $cmd JOB=1:$nj $logdir/vad_${name}.JOB.log \ 70 | compute-vad --config=$vad_config scp:$sdata/JOB/feats.scp \ 71 | ark,scp:$vaddir/vad_${name}.JOB.ark,$vaddir/vad_${name}.JOB.scp || exit 1 72 | 73 | for ((n=1; n<=nj; n++)); do 74 | cat $vaddir/vad_${name}.$n.scp || exit 1; 75 | done > $data/vad.scp 76 | 77 | nc=`cat $data/vad.scp | wc -l` 78 | nu=`cat $data/feats.scp | wc -l` 79 | if [ $nc -ne $nu ]; then 80 | echo "**Warning it seems not all of the speakers got VAD output ($nc != $nu);" 81 | echo "**validate_data_dir.sh will fail; you might want to use fix_data_dir.sh" 82 | [ $nc -eq 0 ] && exit 1; 83 | fi 84 | 85 | 86 | echo "Created VAD output for $name" 87 | -------------------------------------------------------------------------------- /steps/conf/append_prf_to_ctm.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Copyright 2015 Brno University of Technology (author: Karel Vesely) 4 | # Apache 2.0 5 | 6 | from __future__ import print_function 7 | import sys 8 | 9 | # Append Levenshtein alignment of 'hypothesis' and 'reference' into 'CTM': 10 | # (parsed from the 'prf' output of 'sclite') 11 | 12 | # The tags in appended column are: 13 | # 'C' = correct 14 | # 'S' = substitution 15 | # 'I' = insertion 16 | # 'U' = unknown (not part of scored segment) 17 | 18 | # Parse options, 19 | if len(sys.argv) != 4: 20 | print("Usage: %s prf ctm_in ctm_out" % __file__) 21 | sys.exit(1) 22 | prf_file, ctm_file, ctm_out_file = sys.argv[1:] 23 | 24 | if ctm_out_file == '-': ctm_out_file = '/dev/stdout' 25 | 26 | # Load the prf file, 27 | prf = [] 28 | with open(prf_file) as f: 29 | for l in f: 30 | # Store the data, 31 | if l[:5] == 'File:': 32 | file_id = l.split()[1] 33 | if l[:8] == 'Channel:': 34 | chan = l.split()[1] 35 | if l[:5] == 'H_T1:': 36 | h_t1 = l 37 | if l[:5] == 'Eval:': 38 | evl = l 39 | prf.append((file_id,chan,h_t1,evl)) 40 | 41 | # Parse the prf records into dictionary, 42 | prf_dict = dict() 43 | for (f,c,t,e) in prf: 44 | t_pos = 0 # position in the 't' string, 45 | while t_pos < len(t): 46 | t1 = t[t_pos:].split(' ',1)[0] # get 1st token at 't_pos' 47 | try: 48 | # get word evaluation letter 'C,S,I', 49 | evl = e[t_pos] if e[t_pos] != ' ' else 'C' 50 | # add to dictionary, 51 | key='%s,%s' % (f,c) # file,channel 52 | if key not in prf_dict: prf_dict[key] = dict() 53 | prf_dict[key][float(t1)] = evl 54 | except ValueError: 55 | pass 56 | t_pos += len(t1)+1 # advance position for parsing, 57 | 58 | # Load the ctm file (with confidences), 59 | with open(ctm_file) as f: 60 | ctm = [ l.split() for l in f ] 61 | 62 | # Append the sclite alignment tags to ctm, 63 | ctm_out = [] 64 | for f, chan, beg, dur, wrd, conf in ctm: 65 | # U = unknown, C = correct, S = substitution, I = insertion, 66 | sclite_tag = 'U' 67 | try: 68 | sclite_tag = prf_dict[('%s,%s'%(f,chan)).lower()][float(beg)] 69 | except KeyError: 70 | pass 71 | ctm_out.append([f,chan,beg,dur,wrd,conf,sclite_tag]) 72 | 73 | # Save the augmented ctm file, 74 | with open(ctm_out_file, 'w') as f: 75 | f.writelines([' '.join(ctm_record)+'\n' for ctm_record in ctm_out]) 76 | 77 | -------------------------------------------------------------------------------- /steps/conf/convert_ctm_to_tra.py: -------------------------------------------------------------------------------- 1 | 
#!/usr/bin/env python 2 | 3 | # Copyright 2015 Brno University of Technology (author: Karel Vesely) 4 | # Apache 2.0 5 | 6 | from __future__ import print_function 7 | import sys, operator 8 | 9 | # This scripts loads a 'ctm' file and converts it into the 'tra' format: 10 | # "utt-key word1 word2 word3 ... wordN" 11 | # The 'utt-key' is the 1st column in the CTM. 12 | 13 | # Typically the CTM contains: 14 | # - utterance-relative timimng (i.e. prepared without 'utils/convert_ctm.pl') 15 | # - confidences 16 | 17 | if len(sys.argv) != 3: 18 | print('Usage: %s ctm-in tra-out' % __file__) 19 | sys.exit(1) 20 | dummy, ctm_in, tra_out = sys.argv 21 | 22 | if ctm_in == '-': ctm_in = '/dev/stdin' 23 | if tra_out == '-': tra_out = '/dev/stdout' 24 | 25 | # Load the 'ctm' into dictionary, 26 | tra = dict() 27 | with open(ctm_in) as f: 28 | for l in f: 29 | utt, ch, beg, dur, wrd, conf = l.split() 30 | if not utt in tra: tra[utt] = [] 31 | tra[utt].append((float(beg),wrd)) 32 | 33 | # Store the in 'tra' format, 34 | with open(tra_out,'w') as f: 35 | for utt,tuples in tra.items(): 36 | tuples.sort(key = operator.itemgetter(0)) # Sort by 'beg' time, 37 | f.write('%s %s\n' % (utt,' '.join([t[1] for t in tuples]))) 38 | 39 | -------------------------------------------------------------------------------- /steps/conf/lattice_depth_per_frame.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # Copyright 2015 Brno University of Technology (Author: Karel Vesely) 3 | # Licensed under the Apache License, Version 2.0 (the "License") 4 | 5 | # Extract lattice-depth for each frame. 6 | 7 | # Begin configuration 8 | cmd=run.pl 9 | # End configuration 10 | 11 | echo "$0 $@" # Print the command line for logging 12 | 13 | [ -f path.sh ] && . ./path.sh # source the path. 14 | . parse_options.sh || exit 1; 15 | 16 | if [ $# != 2 ]; then 17 | echo "usage: $0 [opts] " 18 | echo "main options (for others, see top of script file)" 19 | echo " --config # config containing options" 20 | echo " --cmd" 21 | exit 1; 22 | fi 23 | 24 | set -euo pipefail 25 | 26 | latdir=$1 27 | dir=$2 28 | 29 | [ ! -f $latdir/lat.1.gz ] && echo "Missing $latdir/lat.1.gz" && exit 1 30 | nj=$(cat $latdir/num_jobs) 31 | 32 | # Get the pdf-posterior vectors, 33 | $cmd JOB=1:$nj $dir/log/lattice_depth_per_frame.JOB.log \ 34 | lattice-depth-per-frame "ark:gunzip -c $latdir/lat.JOB.gz |" ark,t:$dir/lattice_frame_depth.JOB.ark 35 | # Merge, 36 | for ((n=1; n<=nj; n++)); do cat $dir/lattice_frame_depth.${n}.ark; done >$dir/lattice_frame_depth.ark 37 | rm $dir/lattice_frame_depth.*.ark 38 | 39 | # Done! 
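# Illustrative follow-up (not part of the upstream script): lattice_frame_depth.ark is a
# text archive, assumed here to hold one utterance per line as "utt-id d1 d2 ... dN" with
# one integer lattice depth per frame. Under that assumption, a per-utterance average
# depth could be read off with something like:
#   awk '{s=0; for(i=2;i<=NF;i++) s+=$i; if (NF>1) printf("%s %.2f\n", $1, s/(NF-1));}' \
#     $dir/lattice_frame_depth.ark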
40 | -------------------------------------------------------------------------------- /steps/conf/parse_arpa_unigrams.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Copyright 2015 Brno University of Technology (author: Karel Vesely) 4 | # Apache 2.0 5 | 6 | from __future__ import print_function 7 | import sys, gzip, re 8 | 9 | # Parse options, 10 | if len(sys.argv) != 4: 11 | print("Usage: %s " % __file__) 12 | sys.exit(0) 13 | words_txt, arpa_gz, unigrams_out = sys.argv[1:] 14 | 15 | if arpa_gz == '-': arpa_gz = '/dev/stdin' 16 | if unigrams_out == '-': unigrams_out = '/dev/stdout' 17 | 18 | # Load the words.txt, 19 | words = [ l.split() for l in open(words_txt) ] 20 | 21 | # Load the unigram probabilities in 10log from ARPA, 22 | wrd_log10 = dict() 23 | with gzip.open(arpa_gz,'r') as f: 24 | read = False 25 | for l in f: 26 | if l.strip() == '\\1-grams:': read = True 27 | if l.strip() == '\\2-grams:': break 28 | if read and len(l.split())>=2: 29 | log10_p_unigram, wrd = re.split('[\t ]+',l.strip(),2)[:2] 30 | wrd_log10[wrd] = float(log10_p_unigram) 31 | 32 | # Create list, 'wrd id log_p_unigram', 33 | words_unigram = [[wrd, id, (wrd_log10[wrd] if wrd in wrd_log10 else -99)] for wrd,id in words ] 34 | 35 | print(words_unigram[0], file=sys.stderr) 36 | # Store, 37 | with open(unigrams_out,'w') as f: 38 | f.writelines(['%s %s %g\n' % (w,i,p) for (w,i,p) in words_unigram]) 39 | 40 | -------------------------------------------------------------------------------- /steps/conf/prepare_word_categories.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Copyright 2015 Brno University of Technology (author: Karel Vesely) 4 | # Apache 2.0 5 | 6 | import sys 7 | 8 | from optparse import OptionParser 9 | desc = """ 10 | Prepare mapping of words into categories. Each word with minimal frequency 11 | has its own category, the rest is merged into single class. 12 | """ 13 | usage = "%prog [opts] words.txt ctm category_mapping" 14 | parser = OptionParser(usage=usage, description=desc) 15 | parser.add_option("--min-count", help="Minimum word-count to have a single word category. 
[default %default]", type='int', default=20) 16 | (o, args) = parser.parse_args() 17 | 18 | if len(args) != 3: 19 | parser.print_help() 20 | sys.exit(1) 21 | words_file, text_file, category_mapping_file = args 22 | 23 | if text_file == '-': text_file = '/dev/stdin' 24 | if category_mapping_file == '-': category_mapping_file = '/dev/stdout' 25 | 26 | # Read the words from the 'tra' file, 27 | with open(text_file) as f: 28 | text_words = [ l.split()[1:] for l in f ] 29 | 30 | # Flatten the array of arrays of words, 31 | import itertools 32 | text_words = list(itertools.chain.from_iterable(text_words)) 33 | 34 | # Count the words (regardless if correct or incorrect), 35 | word_counts = dict() 36 | for w in text_words: 37 | if w not in word_counts: word_counts[w] = 0 38 | word_counts[w] += 1 39 | 40 | # Read the words.txt, 41 | with open(words_file) as f: 42 | word_id = [ l.split() for l in f ] 43 | 44 | # Append the categories, 45 | n=1 46 | word_id_cat=[] 47 | for word, idx in word_id: 48 | cat = 0 49 | if word in word_counts: 50 | if word_counts[word] > o.min_count: 51 | cat = n; n += 1 52 | word_id_cat.append([word, idx, str(cat)]) 53 | 54 | # Store the mapping, 55 | with open(category_mapping_file,'w') as f: 56 | f.writelines([' '.join(record)+'\n' for record in word_id_cat]) 57 | -------------------------------------------------------------------------------- /steps/data/data_dir_manipulation_lib.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | 3 | def RunKaldiCommand(command, wait = True): 4 | """ Runs commands frequently seen in Kaldi scripts. These are usually a 5 | sequence of commands connected by pipes, so we use shell=True """ 6 | #logger.info("Running the command\n{0}".format(command)) 7 | p = subprocess.Popen(command, shell = True, 8 | stdout = subprocess.PIPE, 9 | stderr = subprocess.PIPE) 10 | 11 | if wait: 12 | [stdout, stderr] = p.communicate() 13 | if p.returncode is not 0: 14 | raise Exception("There was an error while running the command {0}\n------------\n{1}".format(command, stderr)) 15 | return stdout, stderr 16 | else: 17 | return p 18 | -------------------------------------------------------------------------------- /steps/data/make_musan.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # Copyright 2015 David Snyder 3 | # 2019 Phani Sankar Nidadavolu 4 | # Apache 2.0. 5 | # 6 | # This script creates the MUSAN data directory. 7 | # Consists of babble, music and noise files. 8 | # Used to create augmented data 9 | # The required dataset is freely available at http://www.openslr.org/17/ 10 | 11 | # The corpus can be cited as follows: 12 | # @misc{musan2015, 13 | # author = {David Snyder and Guoguo Chen and Daniel Povey}, 14 | # title = {{MUSAN}: {A} {M}usic, {S}peech, and {N}oise {C}orpus}, 15 | # year = {2015}, 16 | # eprint = {1510.08484}, 17 | # note = {arXiv:1510.08484v1} 18 | # } 19 | 20 | set -e 21 | use_vocals=true 22 | sampling_rate=16000 23 | stage=0 24 | 25 | echo "$0 $@" # Print the command line for logging 26 | 27 | if [ -f path.sh ]; then . ./path.sh; fi 28 | . 
parse_options.sh || exit 1; 29 | 30 | if [ $# -ne 2 ]; then 31 | echo USAGE: $0 input_dir output_dir 32 | echo input_dir is the path where the MUSAN corpus is located 33 | echo e.g: $0 /export/corpora/JHU/musan data 34 | echo "main options (for others, see top of script file)" 35 | echo " --sampling-rate # Sampling frequency of source dir" 36 | echo " --use-vocals # Use vocals from music portion of MUSAN corpus" 37 | exit 1; 38 | fi 39 | 40 | in_dir=$1 41 | data_dir=$2 42 | 43 | mkdir -p local/musan.tmp 44 | 45 | # The below script will create the musan corpus 46 | steps/data/make_musan.py --use-vocals ${use_vocals} \ 47 | --sampling-rate ${sampling_rate} \ 48 | ${in_dir} ${data_dir}/musan || exit 1; 49 | 50 | utils/fix_data_dir.sh ${data_dir}/musan 51 | 52 | grep "music" ${data_dir}/musan/utt2spk > local/musan.tmp/utt2spk_music 53 | grep "speech" ${data_dir}/musan/utt2spk > local/musan.tmp/utt2spk_speech 54 | grep "noise" ${data_dir}/musan/utt2spk > local/musan.tmp/utt2spk_noise 55 | 56 | utils/subset_data_dir.sh --utt-list local/musan.tmp/utt2spk_music \ 57 | ${data_dir}/musan ${data_dir}/musan_music 58 | utils/subset_data_dir.sh --utt-list local/musan.tmp/utt2spk_speech \ 59 | ${data_dir}/musan ${data_dir}/musan_speech 60 | utils/subset_data_dir.sh --utt-list local/musan.tmp/utt2spk_noise \ 61 | ${data_dir}/musan ${data_dir}/musan_noise 62 | 63 | utils/fix_data_dir.sh ${data_dir}/musan_music 64 | utils/fix_data_dir.sh ${data_dir}/musan_speech 65 | utils/fix_data_dir.sh ${data_dir}/musan_noise 66 | 67 | rm -rf local/musan.tmp 68 | 69 | for name in speech noise music; do 70 | utils/data/get_reco2dur.sh ${data_dir}/musan_${name} 71 | done 72 | -------------------------------------------------------------------------------- /steps/decode_combine.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Copyright 2012 Johns Hopkins University (Author: Daniel Povey). Apache 2.0. 4 | 5 | # Combine two decoding directories by composing the lattices (we 6 | # apply a weight to each of the original weights, by default 0.5 each). 7 | # Note, this is not the only combination method, or the most normal combination 8 | # method. See also egs/wsj/s5/local/score_combine.sh. 9 | 10 | # Begin configuration section. 11 | weight1=0.5 # Weight on 1st set of lattices. 12 | cmd=run.pl 13 | skip_scoring=false 14 | # End configuration section. 15 | 16 | echo "$0 $@" # Print the command line for logging 17 | 18 | [ -f ./path.sh ] && . ./path.sh; # source the path. 19 | . parse_options.sh || exit 1; 20 | 21 | if [ $# -ne 5 ]; then 22 | echo "Usage: steps/decode_combine.sh [options] " 23 | echo " e.g.: steps/decode_combine.sh data/lang data/test exp/dir1/decode exp/dir2/decode exp/combine_1_2/decode" 24 | echo "main options (for others, see top of script file)" 25 | echo " --config # config containing options" 26 | echo " --cmd # Command to run in parallel with" 27 | echo " --weight1 # Weight on 1st set of lattices (default 0.5)" 28 | exit 1; 29 | fi 30 | 31 | data=$1 32 | lang_or_graphdir=$2 33 | srcdir1=$3 34 | srcdir2=$4 35 | dir=$5 36 | 37 | for f in $data/utt2spk $lang_or_graphdir/phones.txt $srcdir1/lat.1.gz $srcdir2/lat.1.gz; do 38 | [ ! 
-f $f ] && echo "$0: no such file $f" && exit 1; 39 | done 40 | 41 | nj1=`cat $srcdir1/num_jobs` || exit 1; 42 | nj2=`cat $srcdir2/num_jobs` || exit 1; 43 | [ $nj1 -ne $nj2 ] && echo "$0: mismatch in number of jobs $nj1 versus $nj2" && exit 1; 44 | nj=$nj1 45 | 46 | mkdir -p $dir/log 47 | echo $nj > $dir/num_jobs 48 | 49 | # The lattice-interp command does the score interpolation (with composition), 50 | # and the lattice-copy-backoff replaces the result with the 1st lattice, in 51 | # cases where the composed result was empty. 52 | $cmd JOB=1:$nj $dir/log/interp.JOB.log \ 53 | lattice-interp --alpha=$weight1 "ark:gunzip -c $srcdir1/lat.JOB.gz|" \ 54 | "ark,s,cs:gunzip -c $srcdir2/lat.JOB.gz|" ark:- \| \ 55 | lattice-copy-backoff "ark,s,cs:gunzip -c $srcdir1/lat.JOB.gz|" ark,s,cs:- \ 56 | "ark:|gzip -c >$dir/lat.JOB.gz" || exit 1; 57 | 58 | if ! $skip_scoring ; then 59 | [ ! -x local/score.sh ] && \ 60 | echo "Not scoring because local/score.sh does not exist or not executable." && exit 1; 61 | local/score.sh --cmd "$cmd" $data $lang_or_graphdir $dir || 62 | { echo "$0: Scoring failed. (ignore by '--skip-scoring true')"; exit 1; } 63 | fi 64 | 65 | exit 0; 66 | -------------------------------------------------------------------------------- /steps/diagnostic/analyze_alignments.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # 3 | # Copyright Johns Hopkins University (Author: Daniel Povey) 2016. Apache 2.0. 4 | 5 | # This script performs some analysis of alignments on disk, currently in terms 6 | # of phone lengths, including lengths of leading and trailing silences 7 | 8 | 9 | # begin configuration section. 10 | cmd=run.pl 11 | #end configuration section. 12 | 13 | echo "$0 $@" # Print the command line for logging 14 | 15 | [ -f ./path.sh ] && . ./path.sh 16 | . parse_options.sh || exit 1; 17 | 18 | if [ $# -ne 2 ]; then 19 | echo "Usage: $0 [options] " 20 | echo " Options:" 21 | echo " --cmd (run.pl|queue.pl...) # specify how to run the sub-processes." 22 | echo "e.g.:" 23 | echo "$0 data/lang exp/tri4b" 24 | echo "This script writes some diagnostics to /log/alignments.log" 25 | exit 1; 26 | fi 27 | 28 | lang=$1 29 | dir=$2 30 | 31 | model=$dir/final.mdl 32 | 33 | for f in $lang/words.txt $model $dir/ali.1.gz $dir/num_jobs; do 34 | [ ! -f $f ] && echo "$0: expecting file $f to exist" && exit 1; 35 | done 36 | 37 | num_jobs=$(cat $dir/num_jobs) || exit 1 38 | 39 | mkdir -p $dir/log 40 | 41 | rm $dir/phone_stats.*.gz 2>/dev/null || true 42 | 43 | $cmd JOB=1:$num_jobs $dir/log/get_phone_alignments.JOB.log \ 44 | set -o pipefail '&&' ali-to-phones --write-lengths=true "$model" \ 45 | "ark:gunzip -c $dir/ali.JOB.gz|" ark,t:- \| \ 46 | sed -E 's/^[^ ]+ //' \| \ 47 | awk 'BEGIN{FS=" ; "; OFS="\n";} {print "begin " $1; if (NF>1) print "end " $NF; for (n=1;n<=NF;n++) print "all " $n; }' \| \ 48 | sort \| uniq -c \| gzip -c '>' $dir/phone_stats.JOB.gz || exit 1 49 | 50 | if ! 
$cmd $dir/log/analyze_alignments.log \ 51 | gunzip -c "$dir/phone_stats.*.gz" \| \ 52 | steps/diagnostic/analyze_phone_length_stats.py $lang; then 53 | echo "$0: analyze_phone_length_stats.py failed, but ignoring the error (it's just for diagnostics)" 54 | fi 55 | 56 | grep WARNING $dir/log/analyze_alignments.log 57 | echo "$0: see stats in $dir/log/analyze_alignments.log" 58 | 59 | rm $dir/phone_stats.*.gz 60 | 61 | exit 0 62 | -------------------------------------------------------------------------------- /steps/libs/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | # Copyright 2016 Vimal Manohar 4 | # Apache 2.0. 5 | 6 | """ This package contains modules and subpackages used in kaldi scripts. 7 | """ 8 | 9 | from . import common 10 | 11 | __all__ = ["common"] 12 | -------------------------------------------------------------------------------- /steps/libs/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liyunlongaaa/NSD-MS2S/6acb1fed78be709c0e332353468ada08bfcc2515/steps/libs/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /steps/libs/__pycache__/common.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liyunlongaaa/NSD-MS2S/6acb1fed78be709c0e332353468ada08bfcc2515/steps/libs/__pycache__/common.cpython-38.pyc -------------------------------------------------------------------------------- /steps/libs/nnet3/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | # Copyright 2016 Johns Hopkins University (Dan Povey) 4 | # 2016 Vimal Manohar 5 | # 2016 Vijayaditya Peddinti 6 | # 2016 Yiming Wang 7 | # Apache 2.0. 8 | 9 | 10 | # This module has the python functions which facilitate the use of nnet3 toolkit 11 | # It has two sub-modules 12 | # xconfig : Library for parsing high level description of neural networks 13 | # train : Library for training scripts 14 | -------------------------------------------------------------------------------- /steps/libs/nnet3/report/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | # Copyright 2016 Vimal Manohar 4 | # Apache 2.0. 5 | 6 | from . import log_parse 7 | 8 | __all__ = ["log_parse"] 9 | -------------------------------------------------------------------------------- /steps/libs/nnet3/train/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | # Copyright 2016 Vimal Manohar 3 | # Apache 2.0 4 | 5 | """ This library has classes and methods commonly used for training nnet3 6 | neural networks. 7 | 8 | It has separate submodules for frame-level objectives and chain objective: 9 | frame_level_objf -- For both recurrent and non-recurrent architectures 10 | chain_objf -- LF-MMI objective training 11 | """ 12 | -------------------------------------------------------------------------------- /steps/libs/nnet3/train/chain_objf/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | # Copyright 2016 Vimal Manohar 4 | # Apache 2.0. 5 | 6 | """ This is a subpackage containing modules for training of 7 | deep neural network acoustic model with chain objective. 8 | """ 9 | 10 | from . 
import acoustic_model 11 | 12 | __all__ = ["acoustic_model"] 13 | -------------------------------------------------------------------------------- /steps/libs/nnet3/train/frame_level_objf/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | # Copyright 2016 Vimal Manohar 4 | # Apache 2.0 5 | 6 | """ This library has classes and methods commonly used for training nnet3 7 | neural networks with frame-level objectives. 8 | """ 9 | 10 | from . import common 11 | from . import raw_model 12 | from . import acoustic_model 13 | 14 | __all__ = ["common", "raw_model", "acoustic_model"] 15 | -------------------------------------------------------------------------------- /steps/libs/nnet3/xconfig/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2016 Johns Hopkins University (Dan Povey) 2 | # 2016 Vijayaditya Peddinti 3 | # 2016 Yiming Wang 4 | # Apache 2.0. 5 | 6 | """This library has classes and methods to form neural network computation graphs, 7 | in the nnet3 framework, using higher level abstractions called 'layers' 8 | (e.g. sub-graphs like LSTMS ). 9 | 10 | Note : We use the term 'layer' though the computation graph can have a highly 11 | non-linear structure as, other terms such as nodes/components have already been 12 | used in C++ codebase of nnet3. 13 | 14 | This is basically a config parser module, where the configs have very concise 15 | descriptions of a neural network. 16 | 17 | This module has methods to convert the xconfigs into a configs interpretable by 18 | nnet3 C++ library. 19 | 20 | It generates three different configs: 21 | 'init.config' : which is the config with the info necessary for computing 22 | the preconditioning matrix i.e., LDA transform 23 | e.g. 24 | input-node name=input dim=40 25 | input-node name=ivector dim=100 26 | output-node name=output input=Append(Offset(input, -2), Offset(input, -1), input, Offset(input, 1), Offset(input, 2), ReplaceIndex(ivector, t, 0)) objective=linear 27 | 28 | 'ref.config' : which is a version of the config file used to generate 29 | a model for getting left and right context (it doesn't read 30 | anything for the LDA-like transform and/or 31 | presoftmax-prior-scale components) 32 | 33 | 'final.config' : which has the actual config used to initialize the model used 34 | in training i.e, it has file paths for LDA transform and 35 | other initialization files 36 | """ 37 | 38 | 39 | __all__ = ["utils", "layers", "parser"] 40 | -------------------------------------------------------------------------------- /steps/libs/nnet3/xconfig/layers.py: -------------------------------------------------------------------------------- 1 | # Copyright 2016 Johns Hopkins University (Dan Povey) 2 | # 2016 Vijayaditya Peddinti 3 | # 2016 Yiming Wang 4 | # Apache 2.0. 5 | 6 | from .basic_layers import * 7 | from .convolution import * 8 | from .attention import * 9 | from .lstm import * 10 | from .gru import * 11 | from .stats_layer import * 12 | from .trivial_layers import * 13 | from .composite_layers import * 14 | -------------------------------------------------------------------------------- /steps/lmrescore_const_arpa.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Copyright 2014 Guoguo Chen 4 | # Apache 2.0 5 | 6 | # This script rescores lattices with the ConstArpaLm format language model. 7 | 8 | # Begin configuration section. 
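# Illustrative invocation (not from the original header; the directory names are examples
# only, while the argument order old-lang new-lang data in-decode-dir out-decode-dir
# matches the positional arguments parsed further below):
#   steps/lmrescore_const_arpa.sh data/lang_test_tgsmall data/lang_test_fglarge \
#     data/dev exp/tri3/decode_dev_tgsmall exp/tri3/decode_dev_tgsmall_fglarge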
9 | cmd=run.pl 10 | skip_scoring=false 11 | stage=1 12 | scoring_opts= 13 | # End configuration section. 14 | 15 | echo "$0 $@" # Print the command line for logging 16 | 17 | . ./utils/parse_options.sh 18 | 19 | if [ $# != 5 ]; then 20 | echo "Does language model rescoring of lattices (remove old LM, add new LM)" 21 | echo "Usage: $0 [options] \\" 22 | echo " " 23 | echo "options: [--cmd (run.pl|queue.pl [queue opts])]" 24 | exit 1; 25 | fi 26 | 27 | [ -f path.sh ] && . ./path.sh; 28 | 29 | oldlang=$1 30 | newlang=$2 31 | data=$3 32 | indir=$4 33 | outdir=$5 34 | 35 | oldlm=$oldlang/G.fst 36 | newlm=$newlang/G.carpa 37 | ! cmp $oldlang/words.txt $newlang/words.txt &&\ 38 | echo "$0: Warning: vocabularies may be incompatible." 39 | [ ! -f $oldlm ] && echo "$0: Missing file $oldlm" && exit 1; 40 | [ ! -f $newlm ] && echo "$0: Missing file $newlm" && exit 1; 41 | ! ls $indir/lat.*.gz >/dev/null &&\ 42 | echo "$0: No lattices input directory $indir" && exit 1; 43 | 44 | if ! cmp -s $oldlang/words.txt $newlang/words.txt; then 45 | echo "$0: $oldlang/words.txt and $newlang/words.txt differ: make sure you know what you are doing."; 46 | fi 47 | 48 | oldlmcommand="fstproject --project_output=true $oldlm |" 49 | 50 | mkdir -p $outdir/log 51 | nj=`cat $indir/num_jobs` || exit 1; 52 | cp $indir/num_jobs $outdir 53 | 54 | if [ $stage -le 1 ]; then 55 | $cmd JOB=1:$nj $outdir/log/rescorelm.JOB.log \ 56 | lattice-lmrescore --lm-scale=-1.0 \ 57 | "ark:gunzip -c $indir/lat.JOB.gz|" "$oldlmcommand" ark:- \| \ 58 | lattice-lmrescore-const-arpa --lm-scale=1.0 \ 59 | ark:- "$newlm" "ark,t:|gzip -c>$outdir/lat.JOB.gz" || exit 1; 60 | fi 61 | 62 | if ! $skip_scoring && [ $stage -le 2 ]; then 63 | err_msg="Not scoring because local/score.sh does not exist or not executable." 64 | [ ! -x local/score.sh ] && echo $err_msg && exit 1; 65 | local/score.sh --cmd "$cmd" $scoring_opts $data $newlang $outdir 66 | else 67 | echo "Not scoring because requested so..." 68 | fi 69 | 70 | exit 0; 71 | -------------------------------------------------------------------------------- /steps/nnet2/check_ivectors_compatible.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # Copyright (c) 2016, Johns Hopkins University (Yenda Trmal ) 3 | # License: Apache 2.0 4 | 5 | # Begin configuration section. 6 | # End configuration section 7 | 8 | #echo >&2 "$0 $@" # Print the command line for logging 9 | if [ $# != 2 ] ; then 10 | echo >&2 "Usage: $0 " 11 | echo >&2 " e.g.: $0 exp/nnet3/extractor exp/nnet3/ivectors_dev10h.pem" 12 | fi 13 | 14 | dir_a=$1 15 | dir_b=$2 16 | 17 | id_a=$(steps/nnet2/get_ivector_id.sh $dir_a) 18 | ret_a=$? 19 | id_b=$(steps/nnet2/get_ivector_id.sh $dir_b) 20 | ret_b=$? 21 | 22 | if [ ! -z "$id_a" ] && [ ! -z "${id_b}" ] ; then 23 | if [ "${id_a}" == "${id_b}" ]; then 24 | exit 0 25 | else 26 | echo >&2 "$0: ERROR: iVector id ${id_a} in $dir_a and the iVector id ${id_b} in $dir_b do not match" 27 | echo >&2 "$0: ERROR: that means that the systems are not compatible." 28 | exit 1 29 | fi 30 | elif [ -z "$id_a" ] && [ -z "${id_b}" ] ; then 31 | echo >&2 "$0: WARNING: The directories do not contain iVector ID." 32 | echo >&2 "$0: WARNING: That means it's you who's reponsible for keeping " 33 | echo >&2 "$0: WARNING: the directories compatible" 34 | exit 0 35 | else 36 | echo >&2 "$0: WARNING: One of the directories do not contain iVector ID." 
37 | echo >&2 "$0: WARNING: That means it's you who's reponsible for keeping " 38 | echo >&2 "$0: WARNING: the directories compatible" 39 | exit 0 40 | fi 41 | -------------------------------------------------------------------------------- /steps/nnet2/convert_nnet1_to_nnet2.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Copyright 2014 Johns Hopkins University (Author: Daniel Povey). 4 | # Apache 2.0. 5 | 6 | # This script converts nnet1 into nnet2 models. 7 | # Note, it doesn't support all possible types of nnet1 models. 8 | 9 | # Begin configuration section 10 | cleanup=true 11 | cmd=run.pl 12 | # End configuration section. 13 | 14 | echo "$0 $@" # Print the command line for logging 15 | 16 | [ -f ./path.sh ] && . ./path.sh; # source the path. 17 | . parse_options.sh || exit 1; 18 | 19 | 20 | if [ $# -ne 2 ]; then 21 | echo "Usage: $0 [options] " 22 | echo "e.g.: $0 exp/dnn4b_pretrain-dbn_dnn exp/dnn4b_nnet2" 23 | exit 1; 24 | fi 25 | 26 | src=$1 27 | dir=$2 28 | 29 | mkdir -p $dir/log || exit 1; 30 | 31 | for f in $src/final.mdl $src/final.feature_transform $src/ali_train_pdf.counts; do 32 | [ ! -f $f ] && echo "$0: expected file $f to exist" && exit 1 33 | done 34 | 35 | cp $src/phones.txt $dir 2>/dev/null 36 | 37 | $cmd $dir/log/convert_feature_transform.log \ 38 | nnet1-to-raw-nnet $src/final.feature_transform $dir/0.raw || exit 1; 39 | 40 | 41 | if [ -f $src/final.nnet ]; then 42 | echo "$0: $src/final.nnet exists, using it as input." 43 | $cmd $dir/log/convert_model.log \ 44 | nnet1-to-raw-nnet $src/final.nnet $dir/1.raw || exit 1; 45 | elif [ -f $src/final.dbn ]; then 46 | echo "$0: $src/final.dbn exists, using it as input." 47 | num_leaves=$(am-info $src/final.mdl | grep -w pdfs | awk '{print $NF}') || exit 1; 48 | dbn_output_dim=$(nnet-info exp/dnn4b_pretrain-dbn/6.dbn | grep component | tail -n 1 | sed s:,::g | awk '{print $NF}') || exit 1; 49 | [ -z "$dbn_output_dim" ] && exit 1; 50 | 51 | cat > $dir/final_layer.conf <) 3 | # License: Apache 2.0 4 | 5 | # Begin configuration section. 6 | # End configuration section 7 | set -e -o pipefail 8 | set -o nounset # Treat unset variables as an error 9 | 10 | # End configuration section. 11 | 12 | #echo >&2 "$0 $@" # Print the command line for logging 13 | 14 | if [ -f path.sh ]; then . ./path.sh; fi 15 | . parse_options.sh || exit 1; 16 | 17 | 18 | if [ $# != 1 ]; then 19 | echo >&2 "Usage: $0 " 20 | echo >&2 " e.g.: $0 exp/nnet3/extractor" 21 | exit 1 22 | fi 23 | 24 | ivecdir=$1 25 | 26 | if [ -f $ivecdir/final.ie.id ] ; then 27 | cat $ivecdir/final.ie.id 28 | elif [ -f $ivecdir/final.ie ] ; then 29 | # note the creation can fail in case the extractor directory 30 | # is not read-only media or the user des not have access rights 31 | # in that case we will just behave as if the id is not available 32 | id=$(md5sum $ivecdir/final.ie | awk '{print $1}') 33 | echo "$id" > $ivecdir/final.ie.id || true 34 | echo "$id" 35 | else 36 | exit 0 37 | fi 38 | 39 | exit 0 40 | 41 | 42 | 43 | -------------------------------------------------------------------------------- /steps/nnet2/get_num_frames.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # This script works out the approximate number of frames in a training directory. 4 | # This is sometimes needed by higher-level scripts 5 | 6 | 7 | if [ -f path.sh ]; then . ./path.sh; fi 8 | . 
parse_options.sh || exit 1; 9 | 10 | if [ $# -ne 1 ]; then 11 | ( 12 | echo "Usage: $0 " 13 | echo "Prints the number of frames of data in the data-dir" 14 | ) 1>&2 15 | fi 16 | 17 | data=$1 18 | 19 | if [ ! -f $data/utt2dur ]; then 20 | utils/data/get_utt2dur.sh $data 1>&2 || exit 1 21 | fi 22 | 23 | frame_shift=$(utils/data/get_frame_shift.sh $data) || exit 1 24 | 25 | awk -v s=$frame_shift '{n += $2} END{printf("%.0f\n", (n / s))}' <$data/utt2dur 26 | -------------------------------------------------------------------------------- /steps/nnet2/remove_egs.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Copyright 2014 Johns Hopkins University (Author: Daniel Povey). 4 | # Apache 2.0. 5 | 6 | # This script removes the examples in an egs/ directory, e.g. 7 | # steps/nnet2/remove_egs.sh exp/nnet4b/egs/ 8 | # We give it its own script because we need to be careful about 9 | # things that are soft links to something in storage/ (i.e. remove the 10 | # data that's linked to as well as the soft link), and we want to not 11 | # delete the examples if someone has done "touch $dir/egs/.nodelete". 12 | 13 | 14 | if [ $# != 1 ]; then 15 | echo "Usage: $0 " 16 | echo "e.g.: $0 data/nnet4b/egs/" 17 | echo "e.g.: $0 data/nnet4b_mpe/degs/" 18 | echo "This script is usually equivalent to 'rm /egs.* /degs.*' but it follows" 19 | echo "soft links to /storage/; and it avoids deleting anything in the directory if" 20 | echo "someone did 'touch /.nodelete" 21 | exit 1; 22 | fi 23 | 24 | egs=$1 25 | 26 | if [ ! -d $egs ]; then 27 | echo "$0: expected directory $egs to exist" 28 | exit 1; 29 | fi 30 | 31 | if [ -f $egs/.nodelete ]; then 32 | echo "$0: not deleting egs in $egs since $egs/.nodelete exists" 33 | exit 0; 34 | fi 35 | 36 | 37 | 38 | for f in $egs/egs.*.ark $egs/degs.*.ark $egs/cegs.*.ark; do 39 | if [ -L $f ]; then 40 | rm $(dirname $f)/$(readlink $f) # this will print a warning if it fails. 41 | fi 42 | rm $f 2>/dev/null 43 | done 44 | 45 | 46 | echo "$0: Finished deleting examples in $egs" 47 | -------------------------------------------------------------------------------- /steps/nnet3/chain/e2e/README.txt: -------------------------------------------------------------------------------- 1 | The scripts related to end2end chain training are in this directory 2 | Currently it has 3 scripts: 3 | 4 | ** prepare_e2e.sh which is almost equivalent 5 | to regular chain's build-tree.sh (i.e. it creates the tree and 6 | the transition-model) except it does not require any previously 7 | trained models (in other terms, it does what stages -3 and -2 8 | of steps/train_mono.sh do). 9 | 10 | ** get_egs_e2e.sh: this is simlilar to chain/get_egs.sh except it 11 | uses training FSTs (instead of lattices) to generate end2end egs. 12 | 13 | ** train_e2e.py: this is very similar to chain/train.py but 14 | with fewer stages (e.g. it does not compute the preconditioning matrix) 15 | 16 | 17 | For details please see the comments at top of local/chain/e2e/run_flatstart_*.sh 18 | and also src/chain/chain-generic-numerator.h. 
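A rough flat-start sequence, for orientation only (the option names and paths below are
illustrative assumptions; the run_flatstart_*.sh scripts mentioned above are the
authoritative examples):

  # build the tree and transition model directly, with no previously trained system
  steps/nnet3/chain/e2e/prepare_e2e.sh --nj 30 --shared-phones true \
    data/train data/lang exp/chain/e2e_tree
  # then make end-to-end egs from training FSTs with get_egs_e2e.sh and train with
  # train_e2e.py, following the usage messages of those two scripts.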
19 | -------------------------------------------------------------------------------- /steps/nnet3/chain/e2e/text_to_phones.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Copyright 2017 Hossein Hadian 4 | # Apache 2.0 5 | 6 | 7 | """ This reads data/train/text from standard input, converts the word transcriptions 8 | to phone transcriptions using the provided lexicon, 9 | and writes them to standard output. 10 | """ 11 | from __future__ import print_function 12 | 13 | import argparse 14 | from os.path import join 15 | import sys 16 | import copy 17 | import random 18 | 19 | parser = argparse.ArgumentParser(description="""This script reads 20 | data/train/text from std input and converts the word transcriptions 21 | to phone transcriptions using the provided lexicon""") 22 | parser.add_argument('langdir', type=str) 23 | parser.add_argument('--edge-silprob', type=float, default=0.8, 24 | help="""Probability of optional silence at the beginning 25 | and end.""") 26 | parser.add_argument('--between-silprob', type=float, default=0.2, 27 | help="Probability of optional silence between the words.") 28 | 29 | 30 | args = parser.parse_args() 31 | 32 | # optional silence 33 | sil = open(join(args.langdir, 34 | "phones/optional_silence.txt")).readline().strip() 35 | 36 | oov_word = open(join(args.langdir, "oov.txt")).readline().strip() 37 | 38 | 39 | # load the lexicon 40 | lexicon = {} 41 | with open(join(args.langdir, "phones/align_lexicon.txt")) as f: 42 | for line in f: 43 | line = line.strip(); 44 | parts = line.split() 45 | lexicon[parts[0]] = parts[2:] # ignore parts[1] 46 | 47 | n_tot = 0 48 | n_fail = 0 49 | for line in sys.stdin: 50 | line = line.strip().split() 51 | key = line[0] 52 | word_trans = line[1:] # word-level transcription 53 | phone_trans = [] # phone-level transcription 54 | if random.random() < args.edge_silprob: 55 | phone_trans += [sil] 56 | for i in range(len(word_trans)): 57 | n_tot += 1 58 | word = word_trans[i] 59 | if word not in lexicon: 60 | n_fail += 1 61 | if n_fail < 20: 62 | sys.stderr.write("{} not found in lexicon, replacing with {}\n".format(word, oov_word)) 63 | elif n_fail == 20: 64 | sys.stderr.write("Not warning about OOVs any more.\n") 65 | pronunciation = lexicon[oov_word] 66 | else: 67 | pronunciation = copy.deepcopy(lexicon[word]) 68 | phone_trans += pronunciation 69 | prob = args.between_silprob if i < len(word_trans) - 1 else args.edge_silprob 70 | if random.random() < prob: 71 | phone_trans += [sil] 72 | print(key + " " + " ".join(phone_trans)) 73 | 74 | sys.stderr.write("Done. {} out of {} were OOVs.\n".format(n_fail, n_tot)) 75 | -------------------------------------------------------------------------------- /steps/nnet3/chain/gen_topo.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | 3 | # Copyright 2012 Johns Hopkins University (author: Daniel Povey) 4 | 5 | # Generate a topology file. This allows control of the number of states in the 6 | # non-silence HMMs, and in the silence HMMs. This is a modified version of 7 | # 'utils/gen_topo.pl' that generates a different type of topology, one that we 8 | # believe should be useful in the 'chain' model. Note: right now it doesn't 9 | # have any real options, and it treats silence and nonsilence the same. The 10 | # intention is that you write different versions of this script, or add options, 11 | # if you experiment with it. 
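# For reference, a reconstruction (not part of the original comment): the angle-bracket
# markup in the print statements below does not survive in this listing; in the upstream
# script each phone gets a 'chain'-style topology entry of roughly the form
#   <State> 0 <PdfClass> 0 <Transition> 1 0.5 <Transition> 2 0.5 </State>
#   <State> 1 <PdfClass> 1 <Transition> 1 0.5 <Transition> 2 0.5 </State>
#   <State> 2 </State>
# wrapped in <Topology>/<TopologyEntry>/<ForPhones> blocks.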
12 | 13 | if (@ARGV != 2) { 14 | print STDERR "Usage: utils/gen_topo.pl \n"; 15 | print STDERR "e.g.: utils/gen_topo.pl 4:5:6:7:8:9:10 1:2:3\n"; 16 | exit (1); 17 | } 18 | 19 | ($nonsil_phones, $sil_phones) = @ARGV; 20 | 21 | $nonsil_phones =~ s/:/ /g; 22 | $sil_phones =~ s/:/ /g; 23 | $nonsil_phones =~ m/^\d[ \d]+$/ || die "$0: bad arguments @ARGV\n"; 24 | $sil_phones =~ m/^\d[ \d]*$/ || die "$0: bad arguments @ARGV\n"; 25 | 26 | print "\n"; 27 | print "\n"; 28 | print "\n"; 29 | print "$nonsil_phones $sil_phones\n"; 30 | print "\n"; 31 | # The next two lines may look like a bug, but they are as intended. State 0 has 32 | # no self-loop, it happens exactly once. And it can go either to state 1 (with 33 | # a self-loop) or to state 2, so we can have zero or more instances of state 1 34 | # following state 0. 35 | # We make the transition-probs 0.5 so they normalize, to keep the code happy. 36 | # In fact, we always set the transition probability scale to 0.0 in the 'chain' 37 | # code, so they are never used. 38 | print " 0 0 1 0.5 2 0.5 \n"; 39 | print " 1 1 1 0.5 2 0.5 \n"; 40 | print " 2 \n"; 41 | print "\n"; 42 | print "\n"; 43 | -------------------------------------------------------------------------------- /steps/nnet3/chain/gen_topo.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Copyright 2012 Johns Hopkins University (author: Daniel Povey) 4 | 5 | # This script was modified around 11.11.2016, when the code was extended to 6 | # support having a different pdf-class on the self loop. 7 | 8 | # Generate a topology file. This allows control of the number of states in the 9 | # non-silence HMMs, and in the silence HMMs. This is a modified version of 10 | # 'utils/gen_topo.pl' that generates a different type of topology, one that we 11 | # believe should be useful in the 'chain' model. Note: right now it doesn't 12 | # have any real options, and it treats silence and nonsilence the same. The 13 | # intention is that you write different versions of this script, or add options, 14 | # if you experiment with it. 15 | 16 | from __future__ import print_function 17 | import argparse 18 | 19 | 20 | parser = argparse.ArgumentParser(description="Usage: steps/nnet3/chain/gen_topo.py " 21 | " " 22 | "e.g.: steps/nnet3/chain/gen_topo.pl 4:5:6:7:8:9:10 1:2:3\n", 23 | epilog="See egs/swbd/s5c/local/chain/train_tdnn_a.sh for example of usage."); 24 | parser.add_argument("nonsilence_phones", type=str, 25 | help="List of non-silence phones as integers, separated by colons, e.g. 4:5:6:7:8:9"); 26 | parser.add_argument("silence_phones", type=str, 27 | help="List of silence phones as integers, separated by colons, e.g. 1:2:3"); 28 | 29 | args = parser.parse_args() 30 | 31 | silence_phones = [ int(x) for x in args.silence_phones.split(":") ] 32 | nonsilence_phones = [ int(x) for x in args.nonsilence_phones.split(":") ] 33 | all_phones = silence_phones + nonsilence_phones 34 | 35 | print("") 36 | print("") 37 | print("") 38 | print(" ".join([str(x) for x in all_phones])) 39 | print("") 40 | # We make the transition-probs 0.5 so they normalize, to keep the code happy. 41 | # In fact, we always set the transition probability scale to 0.0 in the 'chain' 42 | # code, so they are never used. 43 | # Note: the will actually happen on the incoming arc because 44 | # we always build the graph with "reorder=true". 
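# Reconstruction for reference (not part of the original comment): the angle-bracket tags
# in the two print() calls below are lost in this copy; in the upstream script they emit
# roughly
#   <State> 0 <ForwardPdfClass> 0 <SelfLoopPdfClass> 1 <Transition> 0 0.5 <Transition> 1 0.5 </State>
#   <State> 1 </State>
# i.e. a single emitting state whose self-loop uses a different pdf-class (1) than the
# forward transition (0), plus a final non-emitting state.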
45 | print(" 0 0 1 0 0.5 1 0.5 ") 46 | print(" 1 ") 47 | print("") 48 | print("") 49 | -------------------------------------------------------------------------------- /steps/nnet3/chain/gen_topo3.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Copyright 2012 Johns Hopkins University (author: Daniel Povey) 4 | 5 | # Generate a topology file. This allows control of the number of states in the 6 | # non-silence HMMs, and in the silence HMMs. This is a modified version of 7 | # 'utils/gen_topo.pl' that generates a different type of topology, one that we 8 | # believe should be useful in the 'chain' model. Note: right now it doesn't 9 | # have any real options, and it treats silence and nonsilence the same. The 10 | # intention is that you write different versions of this script, or add options, 11 | # if you experiment with it. 12 | 13 | from __future__ import print_function 14 | import argparse 15 | 16 | 17 | parser = argparse.ArgumentParser(description="Usage: steps/nnet3/chain/gen_topo.py " 18 | " " 19 | "e.g.: steps/nnet3/chain/gen_topo.pl 4:5:6:7:8:9:10 1:2:3\n", 20 | epilog="See egs/swbd/s5c/local/chain/train_tdnn_a.sh for example of usage."); 21 | parser.add_argument("nonsilence_phones", type=str, 22 | help="List of non-silence phones as integers, separated by colons, e.g. 4:5:6:7:8:9"); 23 | parser.add_argument("silence_phones", type=str, 24 | help="List of silence phones as integers, separated by colons, e.g. 1:2:3"); 25 | 26 | args = parser.parse_args() 27 | 28 | silence_phones = [ int(x) for x in args.silence_phones.split(":") ] 29 | nonsilence_phones = [ int(x) for x in args.nonsilence_phones.split(":") ] 30 | all_phones = silence_phones + nonsilence_phones 31 | 32 | print("") 33 | print("") 34 | print("") 35 | print(" ".join([str(x) for x in all_phones])) 36 | print("") 37 | print(" 0 0 0 0.5 1 0.5 ") 38 | print(" 1 ") 39 | print("") 40 | print("") 41 | 42 | -------------------------------------------------------------------------------- /steps/nnet3/chain/gen_topo4.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Copyright 2012 Johns Hopkins University (author: Daniel Povey) 4 | 5 | # Generate a topology file. This allows control of the number of states in the 6 | # non-silence HMMs, and in the silence HMMs. This is a modified version of 7 | # 'utils/gen_topo.pl' that generates a different type of topology, one that we 8 | # believe should be useful in the 'chain' model. Note: right now it doesn't 9 | # have any real options, and it treats silence and nonsilence the same. The 10 | # intention is that you write different versions of this script, or add options, 11 | # if you experiment with it. 12 | 13 | from __future__ import print_function 14 | import argparse 15 | 16 | 17 | parser = argparse.ArgumentParser(description="Usage: steps/nnet3/chain/gen_topo.py " 18 | " " 19 | "e.g.: steps/nnet3/chain/gen_topo.pl 4:5:6:7:8:9:10 1:2:3\n", 20 | epilog="See egs/swbd/s5c/local/chain/train_tdnn_a.sh for example of usage."); 21 | parser.add_argument("nonsilence_phones", type=str, 22 | help="List of non-silence phones as integers, separated by colons, e.g. 4:5:6:7:8:9"); 23 | parser.add_argument("silence_phones", type=str, 24 | help="List of silence phones as integers, separated by colons, e.g. 
1:2:3"); 25 | 26 | args = parser.parse_args() 27 | 28 | silence_phones = [ int(x) for x in args.silence_phones.split(":") ] 29 | nonsilence_phones = [ int(x) for x in args.nonsilence_phones.split(":") ] 30 | all_phones = silence_phones + nonsilence_phones 31 | 32 | print("") 33 | print("") 34 | print("") 35 | print(" ".join([str(x) for x in all_phones])) 36 | print("") 37 | # state 0 is obligatory (occurs once) 38 | print(" 0 0 1 0.3333 2 0.3333 3 0.3333 ") 39 | # state 1 is used only when >2 frames 40 | print(" 1 1 1 0.5 2 0.5 ") 41 | # state 2 is used only when >=2 frames (and occurs once) 42 | print(" 2 2 3 1.0 ") 43 | print(" 3 ") # final nonemitting state 44 | print("") 45 | print("") 46 | 47 | -------------------------------------------------------------------------------- /steps/nnet3/chain/gen_topo5.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Copyright 2012 Johns Hopkins University (author: Daniel Povey) 4 | 5 | # Generate a topology file. This allows control of the number of states in the 6 | # non-silence HMMs, and in the silence HMMs. This is a modified version of 7 | # 'utils/gen_topo.pl' that generates a different type of topology, one that we 8 | # believe should be useful in the 'chain' model. Note: right now it doesn't 9 | # have any real options, and it treats silence and nonsilence the same. The 10 | # intention is that you write different versions of this script, or add options, 11 | # if you experiment with it. 12 | 13 | from __future__ import print_function 14 | import argparse 15 | 16 | 17 | parser = argparse.ArgumentParser(description="Usage: steps/nnet3/chain/gen_topo.py " 18 | " " 19 | "e.g.: steps/nnet3/chain/gen_topo.pl 4:5:6:7:8:9:10 1:2:3\n", 20 | epilog="See egs/swbd/s5c/local/chain/train_tdnn_a.sh for example of usage."); 21 | parser.add_argument("nonsilence_phones", type=str, 22 | help="List of non-silence phones as integers, separated by colons, e.g. 4:5:6:7:8:9"); 23 | parser.add_argument("silence_phones", type=str, 24 | help="List of silence phones as integers, separated by colons, e.g. 1:2:3"); 25 | 26 | args = parser.parse_args() 27 | 28 | silence_phones = [ int(x) for x in args.silence_phones.split(":") ] 29 | nonsilence_phones = [ int(x) for x in args.nonsilence_phones.split(":") ] 30 | all_phones = silence_phones + nonsilence_phones 31 | 32 | print("") 33 | print("") 34 | print("") 35 | print(" ".join([str(x) for x in all_phones])) 36 | print("") 37 | # state 0 is nonemitting 38 | print(" 0 1 0.5 2 0.5 ") 39 | # state 1 is for when we traverse it in 1 state 40 | print(" 1 0 4 1.0 ") 41 | # state 2 is for when we traverse it in >1 state, for the first state. 42 | print(" 2 2 3 1.0 ") 43 | # state 3 is for the self-loop. Use pdf-class 1 here so that the default 44 | # phone-class clustering (which uses only pdf-class 1 by default) gets only 45 | # stats from longer phones. 46 | print(" 3 1 3 0.5 4 0.5 ") 47 | print(" 4 ") 48 | print("") 49 | print("") 50 | 51 | -------------------------------------------------------------------------------- /steps/nnet3/chain2/internal/get_best_model.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2019 Idiap Research Institute (Author: Srikanth Madikeri). Apache 2.0. 4 | # This script is the equivalent of get_successful_models function in the python library. 
5 | # It takes a list of models and returns either the best model (the deafult) or a list of 6 | # models to average. 7 | 8 | models_to_average=false 9 | difference_threshold=1.0 10 | output=output 11 | 12 | 13 | # echo "$0 $@" # Print the command line for logging 14 | 15 | if [ -f path.sh ]; then . ./path.sh; fi 16 | . parse_options.sh || exit 1; 17 | 18 | if [ $# -lt 1 ]; then 19 | echo "Usage: $0: [options] .... " 20 | echo "where is one of the n models to choose from." 21 | echo "" 22 | echo "--models-to-average: when true, returns the models to be averaged rather than the single best model" 23 | echo "--difference-threshold: used to reject models. models with objf < max-value - difference_threshold are rejected" 24 | echo "--output: the objf of the this output layer is used for model selection" 25 | echo "" 26 | exit 1; 27 | fi 28 | 29 | if ! $models_to_average; then 30 | if [ $# -eq 1 ]; then 31 | basename $1 | tr '.' ' ' | awk '{ print $(NF-1) }' 32 | exit 0; 33 | fi 34 | model_log_list=$(for arg in $*; do echo $arg; done) 35 | first_log=$1 36 | log_line=`fgrep -m 1 "Overall average objective function for '$output' is" $first_log` 37 | colno=`echo $log_line | cut -d '=' -f1 | wc -w` 38 | ((colno+=2)) 39 | filename=$(fgrep -m 1 "Overall average objective function for '$output' is" $model_log_list | \ 40 | cut -d ' ' -f1,$colno | tr ':' ' ' | \ 41 | awk '{print $1,$3}' | \ 42 | sort -k2,2 -g | tail -1 | cut -d ' ' -f1) 43 | basename $filename | tr '.' ' ' | awk '{ print $(NF-1) }' 44 | fi 45 | -------------------------------------------------------------------------------- /steps/nnet3/chain2/validate_processed_egs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2019 Johns Hopkins University (Author: Daniel Povey). Apache 2.0. 4 | # Copyright 2019 Idiap Research Institute (Author: Srikanth Madikeri). Apache 2.0. 5 | # 6 | # This script validates a directory containing 'processed' egs for 'chain' 7 | # training, i.e. the output of process_egs.sh. It also helps to document the 8 | # expectations on such a directory. 9 | 10 | 11 | if [ -f path.sh ]; then . ./path.sh; fi 12 | 13 | 14 | if [ $# != 1 ]; then 15 | echo "Usage: $0 " 16 | echo " e.g.: $0 exp/chain/tdnn1a_sp/processed_egs" 17 | echo "" 18 | echo "Validates that the processed-egs dir has the expected format" 19 | fi 20 | 21 | dir=$1 22 | 23 | # Note: the .ark files are not actually consumed directly downstream (only via 24 | # the top-level .scp files), but we check them anyway for now. 25 | for f in $dir/train.scp $dir/info.txt \ 26 | $dir/heldout_subset.{ark,scp} $dir/train_subset.{ark,scp} \ 27 | $dir/train.1.scp $dir/train.1.ark; do 28 | if ! [ -f $f -a -s $f ]; then 29 | echo "$0: expected file $f to exist and be nonempty." 30 | exit 1 31 | fi 32 | done 33 | 34 | 35 | if [ $(awk '/^dir_type/ { print $2; }' <$dir/info.txt) != "processed_chain_egs" ]; then 36 | grep dir_type $dir/info.txt 37 | echo "$0: dir_type should be processed_chain_egs in $dir/info.txt" 38 | exit 1 39 | fi 40 | 41 | lang=$(awk '/^lang / {print $2; }' <$dir/info.txt) 42 | 43 | for f in $dir/misc/$lang.{trans_mdl,normalization.fst,den.fst}; do 44 | if ! [ -f $f -a -s $f ]; then 45 | echo "$0: expected file $f to exist and be nonempty." 
46 | exit 1 47 | fi 48 | done 49 | 50 | echo "$0: sucessfully validated processed egs in $dir" 51 | -------------------------------------------------------------------------------- /steps/nnet3/chain2/validate_randomized_egs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2019 Johns Hopkins University (Author: Daniel Povey). Apache 2.0. 4 | # Copyright 2019 Idiap Research Institute (Author: Srikanth Madikeri). Apache 2.0. 5 | # 6 | # This script validates a directory containing 'randomized' egs for 'chain' 7 | # training, i.e. the output of randomize_egs.sh (this is the final form of the 8 | # egs which is consumed by the training script). It also helps to document the 9 | # expectations on such a directory. 10 | 11 | 12 | if [ -f path.sh ]; then . ./path.sh; fi 13 | 14 | 15 | if [ $# != 1 ]; then 16 | echo "Usage: $0 " 17 | echo " e.g.: $0 exp/chain/tdnn1a_sp/egs" 18 | echo "" 19 | echo "Validates that the final (randomized) egs dir has the expected format" 20 | fi 21 | 22 | dir=$1 23 | 24 | # Note: the .ark files are not actually consumed directly downstream (only via 25 | # the top-level .scp files), but we check them anyway for now. 26 | for f in $dir/train.1.scp $dir/info.txt \ 27 | $dir/heldout_subset.scp $dir/train_subset.scp; do 28 | if ! [ -f $f -a -s $f ]; then 29 | echo "$0: expected file $f to exist and be nonempty." 30 | exit 1 31 | fi 32 | done 33 | 34 | 35 | if [ $(awk '/^dir_type/ { print $2; }' <$dir/info.txt) != "randomized_chain_egs" ]; then 36 | grep dir_type $dir/info.txt 37 | echo "$0: dir_type should be randomized_chain_egs in $dir/info.txt" 38 | exit 1 39 | fi 40 | 41 | langs=$(awk '/^langs / {$1 = ""; print; }' <$dir/info.txt) 42 | num_scp_files=$(awk '/^num_scp_files / { print $2; }' <$dir/info.txt) 43 | 44 | if [ -z "$langs" ]; then 45 | echo "$0: expecting the list of languages to be nonempty in $dir/info.txt" 46 | exit 1 47 | fi 48 | 49 | for lang in $langs; do 50 | for f in $dir/misc/$lang.{trans_mdl,normalization.fst,den.fst} $dir/info_${lang}.txt; do 51 | if ! [ -f $f -a -s $f ]; then 52 | echo "$0: expected file $f to exist and be nonempty." 53 | exit 1 54 | fi 55 | done 56 | done 57 | 58 | for i in $(seq $num_scp_files); do 59 | if ! [ -s $dir/train.$i.scp ]; then 60 | echo "$0: expected file $dir/train.$i.scp to exist and be nonempty." 61 | exit 1 62 | fi 63 | done 64 | 65 | 66 | echo "$0: sucessfully validated randomized egs in $dir" 67 | -------------------------------------------------------------------------------- /steps/nnet3/chain2/validate_raw_egs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2019 Johns Hopkins University (Author: Daniel Povey). Apache 2.0. 4 | # Copyright 2019 Idiap Research Institute (Author: Srikanth Madikeri). Apache 2.0. 5 | # 6 | # This script validates a directory containing 'raw' egs for 'chain' training. 7 | # It also helps to document the expectations on such a directory. 8 | 9 | 10 | 11 | if [ -f path.sh ]; then . ./path.sh; fi 12 | 13 | 14 | if [ $# != 1 ]; then 15 | echo "Usage: $0 " 16 | echo " e.g.: $0 exp/chaina/tdnn1a_sp/raw_egs" 17 | echo "" 18 | echo "Validates that the raw-egs dir has the expected format" 19 | fi 20 | 21 | dir=$1 22 | 23 | for f in $dir/all.scp $dir/cegs.1.ark $dir/info.txt \ 24 | $dir/misc/utt2spk; do 25 | if ! [ -s $f ]; then 26 | echo "$0: expected file $f to exist and be nonempty." 
27 | exit 1 28 | fi 29 | done 30 | 31 | 32 | if [ $(awk '/^dir_type/ { print $2; }' <$dir/info.txt) != "raw_chain_egs" ]; then 33 | grep dir_type $dir/info.txt 34 | echo "$0: dir_type should be raw_chain_egs in $dir/info.txt" 35 | exit 1 36 | fi 37 | 38 | lang=$(awk '/^lang / {print $2; }' <$dir/info.txt) 39 | 40 | for f in $dir/misc/$lang.{trans_mdl,normalization.fst,den.fst}; do 41 | if ! [ -s $f ]; then 42 | echo "$0: expected file $f to exist and be nonempty." 43 | exit 1 44 | fi 45 | done 46 | 47 | echo "$0: sucessfully validated raw egs in $dir" 48 | -------------------------------------------------------------------------------- /steps/nnet3/nnet3_to_dot.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # script showing use of nnet3_to_dot.py 4 | # Copyright 2015 Johns Hopkins University (Author: Vijayaditya Peddinti). 5 | 6 | # Begin configuration section. 7 | component_attributes="name,type" 8 | node_prefixes="" 9 | info_bin=nnet3-am-info 10 | echo "$0 $@" # Print the command line for logging 11 | 12 | [ -f ./path.sh ] && . ./path.sh; # source the path. 13 | . parse_options.sh || exit 1; 14 | 15 | if [ $# != 3 ]; then 16 | echo "Usage: $0 [opts] " 17 | echo " e.g.: $0 exp/sdm1/nnet3/lstm_sp/0.mdl lstm.dot lstm.png" 18 | echo "" 19 | echo "Main options (for others, see top of script file)" 20 | echo " --info-bin # Name of the binary to generate the nnet3 file" 21 | echo " --component-attributes # attributes to be printed in nnet3 components" 22 | echo " --node-prefixes # list of prefixes. Nnet3 components/component-nodes with the same prefix" 23 | echo " # will be clustered together in the dot-graph" 24 | 25 | 26 | exit 1; 27 | fi 28 | 29 | model=$1 30 | dot_file=$2 31 | output_file=$3 32 | 33 | attr=${node_prefixes:+ --node-prefixes "$node_prefixes"} 34 | $info_bin $model | \ 35 | steps/nnet3/dot/nnet3_to_dot.py \ 36 | --component-attributes "$component_attributes" \ 37 | $attr $dot_file 38 | echo "Generated the dot file $dot_file" 39 | 40 | command -v dot >/dev/null 2>&1 || { echo >&2 "This script requires dot but it's not installed. Please compile $dot_file with dot"; exit 1; } 41 | dot -Tpdf $dot_file -o $output_file 42 | -------------------------------------------------------------------------------- /steps/nnet3/remove_egs.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Copyright 2014 Johns Hopkins University (Author: Daniel Povey). 4 | # Apache 2.0. 5 | 6 | # This script removes the examples in an egs/ directory, e.g. 7 | # steps/nnet2/remove_egs.sh exp/nnet4b/egs/ 8 | # We give it its own script because we need to be careful about 9 | # things that are soft links to something in storage/ (i.e. remove the 10 | # data that's linked to as well as the soft link), and we want to not 11 | # delete the examples if someone has done "touch $dir/egs/.nodelete". 12 | 13 | 14 | if [ $# != 1 ]; then 15 | echo "Usage: $0 " 16 | echo "e.g.: $0 data/nnet4b/egs/" 17 | echo "e.g.: $0 data/nnet4b_mpe/degs/" 18 | echo "This script is usually equivalent to 'rm /egs.* /degs.*' but it follows" 19 | echo "soft links to /storage/; and it avoids deleting anything in the directory if" 20 | echo "someone did 'touch /.nodelete" 21 | exit 1; 22 | fi 23 | 24 | egs=$1 25 | 26 | if [ ! 
-d $egs ]; then 27 | echo "$0: expected directory $egs to exist" 28 | exit 1; 29 | fi 30 | 31 | if [ -f $egs/.nodelete ]; then 32 | echo "$0: not deleting egs in $egs since $egs/.nodelete exists" 33 | exit 0; 34 | fi 35 | 36 | 37 | 38 | for f in $egs/egs.*.ark $egs/degs.*.ark $egs/cegs.*.ark; do 39 | if [ -L $f ]; then 40 | rm $(dirname $f)/$(readlink $f) # this will print a warning if it fails. 41 | fi 42 | rm $f 2>/dev/null 43 | done 44 | 45 | 46 | echo "$0: Finished deleting examples in $egs" 47 | -------------------------------------------------------------------------------- /steps/online/nnet2/copy_ivector_dir.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Copyright 2017 Johns Hopkins University (author: Hossein Hadian) 4 | # Apache 2.0 5 | 6 | # This script copies the necessary parts of an online ivector directory 7 | # optionally applying a mapping to the ivector_online.scp file 8 | 9 | utt2orig= 10 | 11 | . utils/parse_options.sh 12 | 13 | if [ $# != 2 ]; then 14 | echo "Usage: " 15 | echo " $0 [options] " 16 | echo "e.g.:" 17 | echo " $0 exp/nnet3/online_ivector_train exp/nnet3/online_ivector_train_fs" 18 | echo "Options" 19 | echo " --utt2orig= # utterance id mapping to use" 20 | exit 1; 21 | fi 22 | 23 | 24 | srcdir=$1 25 | destdir=$2 26 | 27 | if [ ! -f $srcdir/ivector_period ]; then 28 | echo "$0: no such file $srcdir/ivector_period" 29 | exit 1; 30 | fi 31 | 32 | if [ "$destdir" == "$srcdir" ]; then 33 | echo "$0: this script requires and to be different." 34 | exit 1 35 | fi 36 | 37 | set -e; 38 | 39 | mkdir -p $destdir 40 | cp -r $srcdir/{conf,ivector_period} $destdir 41 | if [ -z $utt2orig ]; then 42 | cp $srcdir/ivector_online.scp $destdir 43 | else 44 | utils/apply_map.pl -f 2 $srcdir/ivector_online.scp < $utt2orig > $destdir/ivector_online.scp 45 | fi 46 | cp $srcdir/final.ie.id $destdir 47 | 48 | echo "$0: Copied necessary parts of online ivector directory $srcdir to $destdir" 49 | -------------------------------------------------------------------------------- /steps/online/nnet2/get_pca_transform.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Copyright 2016 David Snyder 4 | # 5 | # This script computes a PCA transform on top of spliced features processed with 6 | # apply-cmvn-online. 7 | # 8 | # 9 | # Apache 2.0. 10 | 11 | # Begin configuration. 12 | cmd=run.pl 13 | config= 14 | stage=0 15 | dim=40 # The dim after applying PCA 16 | normalize_variance=true # If the PCA transform normalizes the variance 17 | normalize_mean=true # If the PCA transform centers 18 | splice_opts= 19 | online_cmvn_opts= 20 | max_utts=5000 # maximum number of files to use 21 | subsample=5 # subsample features with this periodicity 22 | 23 | echo "$0 $@" # Print the command line for logging 24 | 25 | [ -f path.sh ] && . ./path.sh 26 | . parse_options.sh || exit 1; 27 | 28 | if [ $# != 2 ]; then 29 | echo "Usage: steps/nnet2/get_pca_transform.sh [options] " 30 | echo " e.g.: steps/train_pca_transform.sh data/train_si84 exp/tri2b" 31 | echo "Main options (for others, see top of script file)" 32 | echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." 33 | echo " --config # config containing options" 34 | echo " --stage # stage to do partial re-run from." 35 | exit 1; 36 | fi 37 | 38 | data=$1 39 | dir=$2 40 | 41 | for f in $data/feats.scp ; do 42 | [ ! 
-f "$f" ] && echo "$0: expecting file $f to exist" && exit 1 43 | done 44 | 45 | mkdir -p $dir/log 46 | 47 | echo "$splice_opts" >$dir/splice_opts # keep track of frame-splicing options 48 | # so that later stages of system building can know what they were. 49 | echo $online_cmvn_opts > $dir/online_cmvn.conf # keep track of options to CMVN. 50 | 51 | # create global_cmvn.stats 52 | if ! matrix-sum --binary=false scp:$data/cmvn.scp - >$dir/global_cmvn.stats 2>/dev/null; then 53 | echo "$0: Error summing cmvn stats" 54 | exit 1 55 | fi 56 | 57 | feats="ark,s,cs:utils/subset_scp.pl --quiet $max_utts $data/feats.scp | apply-cmvn-online $online_cmvn_opts $dir/global_cmvn.stats scp:- ark:- | splice-feats $splice_opts ark:- ark:- | subsample-feats --n=$subsample ark:- ark:- |" 58 | 59 | if [ $stage -le 0 ]; then 60 | $cmd $dir/log/pca_est.log \ 61 | est-pca --dim=$dim --normalize-variance=$normalize_variance \ 62 | --normalize-mean=$normalize_mean "$feats" $dir/final.mat || exit 1; 63 | fi 64 | 65 | echo "Done estimating PCA transform in $dir" 66 | 67 | exit 0 68 | -------------------------------------------------------------------------------- /steps/overlap/post_process_output.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Copyright 2015-17 Vimal Manohar 4 | # 2020 Desh Raj 5 | # Apache 2.0. 6 | 7 | # This script post-processes the output of the overlap neural network, 8 | # which is in the form of frame-level alignments, into an RTTM file. 9 | # The alignments must be 0/1/2 denoting silence/single/overlap. Based 10 | # on this, this script can also be used to get single speaker regions. 11 | 12 | set -e -o pipefail -u 13 | . ./path.sh 14 | 15 | cmd=run.pl 16 | stage=-10 17 | nj=18 18 | 19 | region_type=overlap # change this to "single" to get only single-speaker regions 20 | 21 | # The values below are in seconds 22 | frame_shift=0.01 23 | segment_padding=0.2 24 | min_segment_dur=0 25 | merge_consecutive_max_dur=inf 26 | 27 | . utils/parse_options.sh 28 | 29 | if [ $# -ne 3 ]; then 30 | echo "This script post-processes the output of steps/segmentation/decode_sad.sh, " 31 | echo "which is in the form of frame-level alignments, into kaldi segments. " 32 | echo "The alignments must be speech activity detection marks i.e. 1 for silence " 33 | echo "and 2 for speech." 34 | echo "Usage: $0 " 35 | echo " e.g.: $0 data/dev_aspire_whole exp/vad_dev_aspire" 36 | exit 1 37 | fi 38 | 39 | data_dir=$1 40 | output_dir=$2 # Alignment directory containing frame-level SAD labels 41 | dir=$3 42 | 43 | mkdir -p $dir 44 | 45 | for f in $output_dir/ali.1.gz $output_dir/num_jobs; do 46 | if [ ! 
-f $f ]; then 47 | echo "$0: Could not find file $f" && exit 1 48 | fi 49 | done 50 | 51 | nj=`cat $output_dir/num_jobs` || exit 1 52 | utils/split_data.sh $data_dir $nj 53 | 54 | utils/data/get_utt2dur.sh $data_dir 55 | 56 | if [ $stage -le 0 ]; then 57 | $cmd JOB=1:$nj $dir/log/segmentation.JOB.log \ 58 | copy-int-vector "ark:gunzip -c $output_dir/ali.JOB.gz |" ark,t:- \| \ 59 | steps/overlap/output_to_rttm.py \ 60 | --region-type=$region_type \ 61 | --frame-shift=$frame_shift --segment-padding=$segment_padding \ 62 | --min-segment-dur=$min_segment_dur --merge-consecutive-max-dur=$merge_consecutive_max_dur \ 63 | --utt2dur=$data_dir/utt2dur - $dir/rttm_${region_type}.JOB 64 | fi 65 | 66 | echo $nj > $dir/num_jobs 67 | 68 | for n in $(seq $nj); do 69 | cat $dir/rttm_${region_type}.$n 70 | done > $dir/rttm_${region_type} 71 | -------------------------------------------------------------------------------- /steps/pytorchnn/check_py.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | -------------------------------------------------------------------------------- /steps/pytorchnn/data.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import print_function 4 | 5 | import os 6 | import torch 7 | 8 | 9 | class Dictionary(object): 10 | def __init__(self): 11 | self.word2idx = {} 12 | self.idx2word = [] 13 | 14 | def read_vocab(self, path): 15 | with open(path, 'r', encoding='utf-8') as f: 16 | for line in f: 17 | word = line.split() 18 | assert (len(word) == 2) 19 | word = word[0] 20 | if word not in self.word2idx: 21 | self.idx2word.append(word) 22 | self.word2idx[word] = len(self.idx2word) - 1 23 | 24 | def __len__(self): 25 | return len(self.idx2word) 26 | 27 | 28 | class Corpus(object): 29 | def __init__(self, path): 30 | self.dictionary = Dictionary() 31 | self.dictionary.read_vocab(os.path.join(path, 'words.txt')) 32 | self.train = self.tokenize(os.path.join(path, 'train.txt')) 33 | self.valid = self.tokenize(os.path.join(path, 'valid.txt')) 34 | self.test = self.tokenize(os.path.join(path, 'test.txt')) 35 | 36 | def tokenize(self, path): 37 | """Tokenizes a text file.""" 38 | assert os.path.exists(path) 39 | with open(path, 'r', encoding='utf-8') as f: 40 | all_ids = [] 41 | for line in f: 42 | words = line.split() + [''] 43 | ids = [] 44 | for word in words: 45 | if word in self.dictionary.word2idx: 46 | ids.append(self.dictionary.word2idx[word]) 47 | else: 48 | ids.append(self.dictionary.word2idx['']) 49 | all_ids.append(torch.tensor(ids).type(torch.int64)) 50 | data = torch.cat(all_ids) 51 | 52 | return data 53 | -------------------------------------------------------------------------------- /steps/score_kaldi_compare.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # Copyright 2016 Nicolas Serrano 3 | # Apache 2.0 4 | 5 | [ -f ./path.sh ] && . ./path.sh 6 | 7 | # begin configuration section. 8 | cmd=run.pl 9 | replications=10000 10 | #end configuration section. 11 | 12 | echo "$0 $@" # Print the command line for logging 13 | [ -f ./path.sh ] && . ./path.sh 14 | . parse_options.sh || exit 1; 15 | 16 | if [ $# -ne 3 ]; then 17 | echo "Usage: $0 [--cmd (run.pl|queue.pl...)] " 18 | echo " Options:" 19 | echo " --cmd (run.pl|queue.pl...) # specify how to run the sub-processes." 
20 | echo " --replications # number of bootstrap evaluation to compute confidence." 21 | exit 1; 22 | fi 23 | 24 | dir1=$1 25 | dir2=$2 26 | dir_compare=$3 27 | 28 | mkdir -p $dir_compare/log 29 | 30 | for d in $dir1 $dir2; do 31 | for f in test_filt.txt best_wer; do 32 | [ ! -f $d/$f ] && echo "$0: no such file $d/$f" && exit 1; 33 | done 34 | done 35 | 36 | 37 | best_wer_file1=$(awk '{print $NF}' $dir1/best_wer) 38 | best_transcript_file1=$(echo $best_wer_file1 | sed -e 's=.*/wer_==' | \ 39 | awk -v FS='_' -v dir=$dir1 '{print dir"/penalty_"$2"/"$1".txt"}') 40 | 41 | best_wer_file2=$(awk '{print $NF}' $dir2/best_wer) 42 | best_transcript_file2=$(echo $best_wer_file2 | sed -e 's=.*/wer_==' | \ 43 | awk -v FS='_' -v dir=$dir2 '{print dir"/penalty_"$2"/"$1".txt"}') 44 | 45 | $cmd $dir_compare/log/score_compare.log \ 46 | compute-wer-bootci --replications=$replications \ 47 | ark:$dir1/test_filt.txt ark:$best_transcript_file1 ark:$best_transcript_file2 \ 48 | '>' $dir_compare/wer_bootci_comparison || exit 1; 49 | 50 | exit 0; 51 | -------------------------------------------------------------------------------- /steps/scoring/score_kaldi_compare.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # Copyright 2016 Nicolas Serrano 3 | # Apache 2.0 4 | 5 | [ -f ./path.sh ] && . ./path.sh 6 | 7 | # begin configuration section. 8 | cmd=run.pl 9 | replications=10000 10 | #end configuration section. 11 | 12 | echo "$0 $@" # Print the command line for logging 13 | [ -f ./path.sh ] && . ./path.sh 14 | . parse_options.sh || exit 1; 15 | 16 | if [ $# -ne 3 ]; then 17 | echo "Usage: $0 [--cmd (run.pl|queue.pl...)] " 18 | echo " Options:" 19 | echo " --cmd (run.pl|queue.pl...) # specify how to run the sub-processes." 20 | echo " --replications # number of bootstrap evaluation to compute confidence." 21 | exit 1; 22 | fi 23 | 24 | dir1=$1 25 | dir2=$2 26 | dir_compare=$3 27 | 28 | mkdir -p $dir_compare/log 29 | 30 | for d in $dir1 $dir2; do 31 | for f in test_filt.txt best_wer; do 32 | [ ! -f $d/$f ] && echo "$0: no such file $d/$f" && exit 1; 33 | done 34 | done 35 | 36 | 37 | best_wer_file1=$(awk '{print $NF}' $dir1/best_wer) 38 | best_transcript_file1=$(echo $best_wer_file1 | sed -e 's=.*/wer_==' | \ 39 | awk -v FS='_' -v dir=$dir1 '{print dir"/penalty_"$2"/"$1".txt"}') 40 | 41 | best_wer_file2=$(awk '{print $NF}' $dir2/best_wer) 42 | best_transcript_file2=$(echo $best_wer_file2 | sed -e 's=.*/wer_==' | \ 43 | awk -v FS='_' -v dir=$dir2 '{print dir"/penalty_"$2"/"$1".txt"}') 44 | 45 | $cmd $dir_compare/log/score_compare.log \ 46 | compute-wer-bootci --replications=$replications \ 47 | ark:$dir1/test_filt.txt ark:$best_transcript_file1 ark:$best_transcript_file2 \ 48 | '>' $dir_compare/wer_bootci_comparison || exit 1; 49 | 50 | exit 0; 51 | -------------------------------------------------------------------------------- /steps/search_index.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Copyright 2012 Johns Hopkins University (Author: Guoguo Chen) 4 | # Apache 2.0 5 | 6 | # Begin configuration section. 7 | cmd=run.pl 8 | nbest=-1 9 | strict=true 10 | indices_dir= 11 | frame_subsampling_factor=1 12 | # End configuration section. 13 | 14 | echo "$0 $@" # Print the command line for logging 15 | 16 | [ -f ./path.sh ] && . ./path.sh; # source the path. 17 | . 
parse_options.sh || exit 1; 18 | 19 | if [ $# != 2 ]; then 20 | echo "Usage: steps/search_index.sh [options] " 21 | echo " e.g.: steps/search_index.sh data/kws exp/sgmm2_5a_mmi/decode/kws/" 22 | echo "" 23 | echo "main options (for others, see top of script file)" 24 | echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." 25 | echo " --nbest # return n best results. (-1 means all)" 26 | echo " --indices-dir # where the indices should be stored, by default it will be in " 27 | exit 1; 28 | fi 29 | 30 | 31 | kwsdatadir=$1; 32 | kwsdir=$2; 33 | 34 | if [ -z $indices_dir ] ; then 35 | indices_dir=$kwsdir 36 | fi 37 | 38 | mkdir -p $kwsdir/log; 39 | nj=`cat $indices_dir/num_jobs` || exit 1; 40 | if [ -f $kwsdatadir/keywords.fsts.gz ]; then 41 | keywords="\"gunzip -c $kwsdatadir/keywords.fsts.gz|\"" 42 | elif [ -f $kwsdatadir/keywords.fsts ]; then 43 | keywords=$kwsdatadir/keywords.fsts; 44 | else 45 | echo "$0: no such file $kwsdatadir/keywords.fsts[.gz]" && exit 1; 46 | fi 47 | 48 | for f in $indices_dir/index.1.gz ; do 49 | [ ! -f $f ] && echo "make_index.sh: no such file $f" && exit 1; 50 | done 51 | 52 | $cmd JOB=1:$nj $kwsdir/log/search.JOB.log \ 53 | kws-search --strict=$strict --negative-tolerance=-1 \ 54 | --frame-subsampling-factor=${frame_subsampling_factor} \ 55 | "ark:gzip -cdf $indices_dir/index.JOB.gz|" ark:$keywords \ 56 | "ark,t:|gzip -c > $kwsdir/result.JOB.gz" \ 57 | "ark,t:|gzip -c > $kwsdir/stats.JOB.gz" || exit 1; 58 | 59 | exit 0; 60 | -------------------------------------------------------------------------------- /steps/segmentation/combine_targets_dirs.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Copyright 2017 Nagendra Kumar Goel 4 | # 2018 Vimal Manohar 5 | # Apache 2.0. 6 | 7 | # This script combines targets directory into a new targets directory 8 | # containing targets from all the input targets directories. 9 | 10 | echo "$0 $@" # Print the command line for logging 11 | 12 | if [ -f path.sh ]; then . ./path.sh; fi 13 | . parse_options.sh || exit 1; 14 | 15 | if [ $# -lt 3 ]; then 16 | echo "Usage: $0 [options] ..." 
17 | echo "e.g.: $0 data/train exp/targets_combined exp/targets_1 exp/targets_2" 18 | exit 1; 19 | fi 20 | 21 | export LC_ALL=C 22 | 23 | data=$1; 24 | shift; 25 | dest=$1; 26 | shift; 27 | first_src=$1; 28 | 29 | mkdir -p $dest; 30 | rm -f $dest/{targets.*.ark,frame_subsampling_factor} 2>/dev/null 31 | 32 | frame_subsampling_factor=1 33 | if [ -f $first_src/frame_subsampling_factor ]; then 34 | cp $first_src/frame_subsampling_factor $dest 35 | frame_subsampling_factor=$(cat $dest/frame_subsampling_factor) 36 | fi 37 | 38 | for d in $*; do 39 | this_frame_subsampling_factor=1 40 | if [ -f $d/frame_subsampling_factor ]; then 41 | this_frame_subsampling_factor=$(cat $d/frame_subsampling_factor) 42 | fi 43 | 44 | if [ $this_frame_subsampling_factor != $frame_subsampling_factor ]; then 45 | echo "$0: Cannot combine targets directories with different frame-subsampling-factors" 1>&2 46 | exit 1 47 | fi 48 | 49 | cat $d/targets.scp 50 | done | sort -k1,1 > $dest/targets.scp || exit 1 51 | 52 | steps/segmentation/validate_targets_dir.sh $dest $data || exit 1 53 | 54 | echo "Combined targets and stored in $dest" 55 | exit 0 56 | -------------------------------------------------------------------------------- /steps/segmentation/copy_targets_dir.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Copyright 2017 Nagendra Kumar Goel 4 | # 2014 Johns Hopkins University (author: Nagendra K Goel) 5 | # Apache 2.0 6 | 7 | # This script makes a copy of targets directory (by copying targets.scp), 8 | # possibly adding a specified prefix or a suffix to the utterance names. 9 | 10 | # begin configuration section 11 | utt_prefix= 12 | utt_suffix= 13 | # end configuration section 14 | 15 | if [ -f ./path.sh ]; then . ./path.sh; fi 16 | . ./utils/parse_options.sh 17 | 18 | if [ $# != 2 ]; then 19 | echo "Usage: " 20 | echo " $0 [options] " 21 | echo "e.g.:" 22 | echo " $0 --utt-prefix=1- exp/segmentation_1a/train_whole_combined_targets_sub3 exp/segmentation_1a/train_whole_combined_targets_sub3_rev1" 23 | echo "Options" 24 | echo " --utt-prefix= # Prefix for utterance ids, default empty" 25 | echo " --utt-suffix= # Suffix for utterance ids, default empty" 26 | exit 1; 27 | fi 28 | 29 | export LC_ALL=C 30 | 31 | srcdir=$1 32 | destdir=$2 33 | 34 | mkdir -p $destdir 35 | 36 | if [ -f $srcdir/frame_subsampling_factor ]; then 37 | cp $srcdir/frame_subsampling_factor $destdir 38 | fi 39 | 40 | cat $srcdir/targets.scp | awk -v p=$utt_prefix -v s=$utt_suffix \ 41 | '{printf("%s %s%s%s\n", $1, p, $1, s);}' > $destdir/utt_map 42 | 43 | cat $srcdir/targets.scp | utils/apply_map.pl -f 1 $destdir/utt_map | \ 44 | sort -k1,1 > $destdir/targets.scp 45 | 46 | echo "$0: copied targets from $srcdir to $destdir" 47 | -------------------------------------------------------------------------------- /steps/segmentation/decode_sad.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Copyright 2016 Vimal Manohar 4 | # Apache 2.0. 5 | 6 | # This script does Viterbi decoding using a matrix of frame log-likelihoods 7 | # with the columns corresponding to the pdfs. 8 | # It is a wrapper around the binary decode-faster. 9 | 10 | set -e 11 | set -o pipefail 12 | 13 | cmd=run.pl 14 | nj=4 15 | acwt=0.1 16 | beam=8 17 | max_active=1000 18 | transform= # Transformation matrix to apply on the input archives read from output.scp 19 | 20 | . ./path.sh 21 | 22 | . 
utils/parse_options.sh 23 | 24 | if [ $# -ne 3 ]; then 25 | echo "Usage: $0 " 26 | echo " e.g.: $0 " 27 | exit 1 28 | fi 29 | 30 | graph_dir=$1 31 | nnet_output_dir=$2 32 | dir=$3 33 | 34 | mkdir -p $dir/log 35 | 36 | echo $nj > $dir/num_jobs 37 | 38 | for f in $graph_dir/HCLG.fst $nnet_output_dir/output.scp $extra_files; do 39 | if [ ! -f $f ]; then 40 | echo "$0: Could not find file $f" 41 | exit 1 42 | fi 43 | done 44 | 45 | rspecifier="ark:utils/split_scp.pl -j $nj \$[JOB-1] $nnet_output_dir/output.scp | copy-feats scp:- ark:- |" 46 | 47 | # Apply a transformation on the input matrix to combine 48 | # probs from different columns to pseudo-likelihoods 49 | if [ ! -z "$transform" ]; then 50 | rspecifier="$rspecifier transform-feats $transform ark:- ark:- |" 51 | fi 52 | 53 | # Convert pseudo-likelihoods to pseudo log-likelihood 54 | rspecifier="$rspecifier copy-matrix --apply-log ark:- ark:- |" 55 | 56 | decoder_opts+=(--acoustic-scale=$acwt --beam=$beam --max-active=$max_active) 57 | 58 | $cmd JOB=1:$nj $dir/log/decode.JOB.log \ 59 | decode-faster ${decoder_opts[@]} \ 60 | $graph_dir/HCLG.fst "$rspecifier" \ 61 | ark:/dev/null "ark:| gzip -c > $dir/ali.JOB.gz" 62 | -------------------------------------------------------------------------------- /steps/segmentation/internal/find_oov_phone.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Copyright 2017 Vimal Manohar 4 | # Apache 2.0 5 | 6 | """This script finds the OOV phone by reading the OOV word from 7 | oov.int in the <lang> input directory and the lexicon 8 | <lang>/phones/align_lexicon.int. 9 | It prints the OOV phone to stdout, if it can find a single phone 10 | mapping for the OOV word.""" 11 | from __future__ import print_function 12 | 13 | import sys 14 | 15 | 16 | def main(): 17 | if len(sys.argv) != 2: 18 | raise RuntimeError("Usage: {0} <lang>".format(sys.argv[0])) 19 | 20 | lang = sys.argv[1] 21 | 22 | oov_int = int(open("{0}/oov.int".format(lang)).readline()) 23 | assert oov_int > 0 24 | 25 | oov_mapped_to_multiple_phones = False 26 | for line in open("{0}/phones/align_lexicon.int".format(lang)): 27 | parts = line.strip().split() 28 | 29 | if len(parts) < 3: 30 | raise RuntimeError("Could not parse line {0} in " 31 | "{1}/phones/align_lexicon.int" 32 | "".format(line, lang)) 33 | 34 | w = int(parts[0]) 35 | if w != oov_int: 36 | continue 37 | 38 | if len(parts[2:]) > 1: 39 | # Try to find a single phone mapping for OOV 40 | oov_mapped_to_multiple_phones = True 41 | continue 42 | 43 | p = int(parts[2]) 44 | print ("{0}".format(p)) 45 | 46 | raise SystemExit(0) 47 | 48 | if oov_mapped_to_multiple_phones: 49 | raise RuntimeError("OOV word found, but is mapped to multiple phones. " 50 | "This is an unusual case.") 51 | 52 | raise RuntimeError("Could not find OOV word in " 53 | "{0}/phones/align_lexicon.int".format(lang)) 54 | 55 | 56 | if __name__ == "__main__": 57 | main() 58 | -------------------------------------------------------------------------------- /steps/segmentation/internal/verify_phones_list.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Copyright 2017 Vimal Manohar 4 | # Apache 2.0 5 | 6 | """This script verifies the list of phones read from stdin are valid 7 | phones present in lang/phones.txt.""" 8 | 9 | import argparse 10 | import sys 11 | 12 | def get_args(): 13 | parser = argparse.ArgumentParser(description=""" 14 | This script verifies the list of phones read from stdin are valid 15 | phones present in lang/phones.txt.""") 16 | 17 | parser.add_argument("phones", type=str, 18 | help="File containing the list of all phones as the " 19 | "first column") 20 | 21 | args = parser.parse_args() 22 | return args 23 | 24 | 25 | def main(): 26 | args = get_args() 27 | phones = set() 28 | for line in open(args.phones): 29 | phones.add(line.strip().split()[0]) 30 | 31 | for line in sys.stdin.readlines(): 32 | p = line.strip() 33 | 34 | if p not in phones: 35 | sys.stderr.write("Could not find phone {p} in {f}" 36 | "\n".format(p=p, f=args.phones)) 37 | raise SystemExit(1) 38 | 39 | 40 | if __name__ == "__main__": 41 | main() 42 | -------------------------------------------------------------------------------- /steps/segmentation/post_process_sad_to_segments.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Copyright 2015-17 Vimal Manohar 4 | # Apache 2.0. 5 | 6 | # This script post-processes the output of steps/segmentation/decode_sad.sh, 7 | # which is in the form of frame-level alignments, into a 'segments' file. 8 | # The alignments must be speech activity detection marks i.e. 1 for silence 9 | # and 2 for speech. 10 | 11 | set -e -o pipefail -u 12 | . ./path.sh 13 | 14 | cmd=run.pl 15 | stage=-10 16 | nj=18 17 | 18 | # The values below are in seconds 19 | frame_shift=0.01 20 | segment_padding=0.2 21 | min_segment_dur=0 22 | merge_consecutive_max_dur=0 23 | 24 | . utils/parse_options.sh 25 | 26 | if [ $# -ne 3 ]; then 27 | echo "This script post-processes the output of steps/segmentation/decode_sad.sh, " 28 | echo "which is in the form of frame-level alignments, into kaldi segments. " 29 | echo "The alignments must be speech activity detection marks i.e. 1 for silence " 30 | echo "and 2 for speech." 31 | echo "Usage: $0 " 32 | echo " e.g.: $0 data/dev_aspire_whole exp/vad_dev_aspire" 33 | exit 1 34 | fi 35 | 36 | data_dir=$1 37 | vad_dir=$2 # Alignment directory containing frame-level SAD labels 38 | dir=$3 39 | 40 | mkdir -p $dir 41 | 42 | for f in $vad_dir/ali.1.gz $vad_dir/num_jobs; do 43 | if [ ! 
-f $f ]; then 44 | echo "$0: Could not find file $f" && exit 1 45 | fi 46 | done 47 | 48 | nj=`cat $vad_dir/num_jobs` || exit 1 49 | utils/split_data.sh $data_dir $nj 50 | 51 | utils/data/get_utt2dur.sh $data_dir 52 | 53 | if [ $stage -le 0 ]; then 54 | $cmd JOB=1:$nj $dir/log/segmentation.JOB.log \ 55 | copy-int-vector "ark:gunzip -c $vad_dir/ali.JOB.gz |" ark,t:- \| \ 56 | steps/segmentation/internal/sad_to_segments.py \ 57 | --frame-shift=$frame_shift --segment-padding=$segment_padding \ 58 | --min-segment-dur=$min_segment_dur --merge-consecutive-max-dur=$merge_consecutive_max_dur \ 59 | --utt2dur=$data_dir/utt2dur - $dir/segments.JOB 60 | fi 61 | 62 | echo $nj > $dir/num_jobs 63 | 64 | for n in $(seq $nj); do 65 | cat $dir/segments.$n 66 | done > $dir/segments 67 | -------------------------------------------------------------------------------- /steps/subset_ali_dir.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Copyright 2017 Vimal Manohar 4 | # Apache 2.0. 5 | 6 | cmd=run.pl 7 | 8 | if [ -f ./path.sh ]; then . ./path.sh; fi 9 | 10 | . ./utils/parse_options.sh 11 | 12 | if [ $# -ne 4 ]; then 13 | cat < from the 16 | original alignment directory containing alignments for utterances in 17 | . 18 | 19 | The number of split jobs in the output alignment directory is 20 | equal to the number of jobs in the original alignment directory, 21 | unless the subset data directory has too few speakers. 22 | 23 | Usage: $0 [options] 24 | e.g.: $0 data/train_sp data/train exp/tri3_ali_sp exp/tri3_ali 25 | 26 | Options: 27 | --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs. 28 | EOF 29 | exit 1 30 | fi 31 | 32 | data=$1 33 | subset_data=$2 34 | ali_dir=$3 35 | dir=$4 36 | 37 | nj=$(cat $ali_dir/num_jobs) || exit 1 38 | utils/split_data.sh $data $nj 39 | 40 | mkdir -p $dir 41 | cp $ali_dir/{final.mdl,*.mat,*_opts,tree} $dir/ || true 42 | cp -r $ali_dir/phones $dir 2>/dev/null || true 43 | 44 | $cmd JOB=1:$nj $dir/log/copy_alignments.JOB.log \ 45 | copy-int-vector "ark:gunzip -c $ali_dir/ali.JOB.gz |" \ 46 | ark,scp:$dir/ali_tmp.JOB.ark,$dir/ali_tmp.JOB.scp || exit 1 47 | 48 | for n in `seq $nj`; do 49 | cat $dir/ali_tmp.$n.scp 50 | done > $dir/ali_tmp.scp 51 | 52 | num_spk=$(cat $subset_data/spk2utt | wc -l) 53 | if [ $num_spk -lt $nj ]; then 54 | nj=$num_spk 55 | fi 56 | 57 | utils/split_data.sh $subset_data $nj 58 | $cmd JOB=1:$nj $dir/log/filter_alignments.JOB.log \ 59 | copy-int-vector \ 60 | "scp:utils/filter_scp.pl $subset_data/split${nj}/JOB/utt2spk $dir/ali_tmp.scp |" \ 61 | "ark:| gzip -c > $dir/ali.JOB.gz" || exit 1 62 | 63 | echo $nj > $dir/num_jobs 64 | 65 | rm $dir/ali_tmp.*.{ark,scp} $dir/ali_tmp.scp 66 | 67 | exit 0 68 | -------------------------------------------------------------------------------- /steps/tfrnnlm/check_py.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | -------------------------------------------------------------------------------- /steps/tfrnnlm/check_tensorflow_installed.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # this script checks if TF is installed to be used with python 4 | # and if TF related binaries in kaldi is ready to use 5 | . ./path.sh 6 | 7 | if which lattice-lmrescore-tf-rnnlm 2>&1>/dev/null; then 8 | echo TensorFlow relate binaries found. This is good. 9 | else 10 | echo TF related binaries not compiled. 
11 | echo You need to go to tools/ and run extras/install_tensorflow_cc.sh first 12 | echo and then do \"make\" under both src/tfrnnlm and src/tfrnnlmbin 13 | exit 1 14 | fi 15 | 16 | echo 17 | 18 | if python steps/tfrnnlm/check_py.py 2>/dev/null; then 19 | echo TensorFlow ready to use on the python side. This is good. 20 | else 21 | echo TensorFlow not found on the python side. 22 | echo Please go to tools/ and run extras/install_tensorflow_py.sh to install it 23 | echo If you already have TensorFlow installed somewhere else, you would need 24 | echo to add it to your PATH 25 | exit 1 26 | fi 27 | -------------------------------------------------------------------------------- /steps/word_align_lattices.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Copyright Johns Hopkins University (Author: Daniel Povey) 2012 4 | # Apache 2.0. 5 | 6 | # Begin configuration section. 7 | silence_label=0 8 | cmd=run.pl 9 | # End configuration section. 10 | 11 | echo "$0 $@" # Print the command line for logging 12 | 13 | for x in `seq 2`; do 14 | [ "$1" == "--silence-label" ] && silence_label=$2 && shift 2; 15 | [ "$1" == "--cmd" ] && cmd="$2" && shift 2; 16 | done 17 | 18 | if [ $# != 3 ]; then 19 | echo "Word-align lattices (make the arcs sync up with words)" 20 | echo "" 21 | echo "Usage: $0 [options] " 22 | echo "options: [--cmd (run.pl|queue.pl [queue opts])] [--silence-label ]" 23 | exit 1; 24 | fi 25 | 26 | . ./path.sh || exit 1; 27 | 28 | lang=$1 29 | indir=$2 30 | outdir=$3 31 | 32 | mdl=`dirname $indir`/final.mdl 33 | wbfile=$lang/phones/word_boundary.int 34 | 35 | for f in $mdl $wbfile $indir/num_jobs; do 36 | [ ! -f $f ] && echo "word_align_lattices.sh: no such file $f" && exit 1; 37 | done 38 | 39 | mkdir -p $outdir/log 40 | 41 | 42 | cp $indir/num_jobs $outdir; 43 | nj=`cat $indir/num_jobs` 44 | 45 | $cmd JOB=1:$nj $outdir/log/align.JOB.log \ 46 | lattice-align-words --silence-label=$silence_label --test=true \ 47 | $wbfile $mdl "ark:gunzip -c $indir/lat.JOB.gz|" "ark,t:|gzip -c >$outdir/lat.JOB.gz" || exit 1; 48 | 49 | -------------------------------------------------------------------------------- /utils/add_disambig.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | # Copyright 2010-2011 Microsoft Corporation 3 | 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 11 | # KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED 12 | # WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, 13 | # MERCHANTABLITY OR NON-INFRINGEMENT. 14 | # See the Apache 2 License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | 18 | # Adds some specified number of disambig symbols to a symbol table. 19 | # Adds these as #1, #2, etc. 20 | # If the --include-zero option is specified, includes an extra one 21 | # #0. 
22 | 23 | $include_zero = 0; 24 | if($ARGV[0] eq "--include-zero") { 25 | $include_zero = 1; 26 | shift @ARGV; 27 | } 28 | 29 | if(@ARGV != 2) { 30 | die "Usage: add_disambig.pl [--include-zero] symtab.txt num_extra > symtab_out.txt "; 31 | } 32 | 33 | 34 | $input = $ARGV[0]; 35 | $nsyms = $ARGV[1]; 36 | 37 | open(F, "<$input") || die "Opening file $input"; 38 | 39 | while(<F>) { 40 | @A = split(" ", $_); 41 | @A == 2 || die "Bad line $_"; 42 | $lastsym = $A[1]; 43 | print; 44 | } 45 | 46 | if(!defined($lastsym)){ 47 | die "Empty symbol file?"; 48 | } 49 | 50 | if($include_zero) { 51 | $lastsym++; 52 | print "#0 $lastsym\n"; 53 | } 54 | 55 | for($n = 1; $n <= $nsyms; $n++) { 56 | $y = $n + $lastsym; 57 | print "#$n $y\n"; 58 | } 59 | -------------------------------------------------------------------------------- /utils/analyze_segments.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | # Copyright 2015 GoVivace Inc. (Author: Nagendra Kumar Goel) 3 | 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 11 | # KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED 12 | # WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, 13 | # MERCHANTABLITY OR NON-INFRINGEMENT. 14 | # See the Apache 2 License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | # Analyze a segments file and print important stats on it. 18 | 19 | $dur = $total = 0; 20 | $maxDur = 0; 21 | $minDur = 9999999999; 22 | $n = 0; 23 | while(<>){ 24 | chomp; 25 | @t = split(/\s+/); 26 | $dur = $t[3] - $t[2]; 27 | $total += $dur; 28 | if ($dur > $maxDur) { 29 | $maxSegId = $t[0]; 30 | $maxDur = $dur; 31 | } 32 | if ($dur < $minDur) { 33 | $minSegId = $t[0]; 34 | $minDur = $dur; 35 | } 36 | $n++; 37 | } 38 | $avg=$total/$n; 39 | $hrs = $total/3600; 40 | print "Total $hrs hours of data\n"; 41 | print "Average segment length $avg seconds\n"; 42 | print "Segment $maxSegId has length of $maxDur seconds\n"; 43 | print "Segment $minSegId has length of $minDur seconds\n"; 44 | -------------------------------------------------------------------------------- /utils/best_wer.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # 3 | # Copyright 2010-2011 Microsoft Corporation 4 | 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 12 | # KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED 13 | # WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, 14 | # MERCHANTABLITY OR NON-INFRINGEMENT. 15 | # See the Apache 2 License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | # To be run from one directory above this script. 19 | 20 | perl -e 'while(<>){ 21 | s/\|(\d)/\| $1/g; s/(\d)\|/$1 \|/g; 22 | if (m/[WS]ER (\S+)/ && (!defined $bestwer || $bestwer > $1)){ $bestwer = $1; $bestline=$_; } # kaldi "compute-wer" tool. 
23 | elsif (m: (Mean|Sum/Avg|)\s*\|\s*\S+\s+\S+\s+\|\s+\S+\s+\S+\s+\S+\s+\S+\s+(\S+)\s+\S+\s+\|: 24 | && (!defined $bestwer || $bestwer > $2)){ $bestwer = $2; $bestline=$_; } } # sclite. 25 | if (defined $bestline){ print $bestline; } ' | \ 26 | awk 'BEGIN{ FS="%WER"; } { if(NF == 2) { print FS$2" "$1; } else { print $0; }}' | \ 27 | awk 'BEGIN{ FS="Sum/Avg"; } { if(NF == 2) { print $2" "$1; } else { print $0; }}' | \ 28 | awk '{ if($1!~/%WER/) { print "%WER "$9" "$0; } else { print $0; }}' | \ 29 | sed -e 's|\s\s*| |g' -e 's|\:$||' -e 's|\:\s*\|\s*$||' 30 | 31 | 32 | 33 | -------------------------------------------------------------------------------- /utils/build_const_arpa_lm.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Copyright 2014 Guoguo Chen 4 | # Apache 2.0 5 | 6 | # This script reads in an Arpa format language model, and converts it into the 7 | # ConstArpaLm format language model. 8 | 9 | # begin configuration section 10 | # end configuration section 11 | 12 | [ -f path.sh ] && . ./path.sh; 13 | 14 | . utils/parse_options.sh 15 | 16 | if [ $# != 3 ]; then 17 | echo "Usage: " 18 | echo " $0 [options] " 19 | echo "e.g.:" 20 | echo " $0 data/local/lm/3-gram.full.arpa.gz data/lang/ data/lang_test_tgmed" 21 | echo "Options" 22 | exit 1; 23 | fi 24 | 25 | export LC_ALL=C 26 | 27 | arpa_lm=$1 28 | old_lang=$2 29 | new_lang=$3 30 | 31 | mkdir -p $new_lang 32 | 33 | mkdir -p $new_lang 34 | cp -r $old_lang/* $new_lang 35 | 36 | unk=`cat $old_lang/oov.int` 37 | bos=`grep "^<s>\s" $old_lang/words.txt | awk '{print $2}'` 38 | eos=`grep "^</s>\s" $old_lang/words.txt | awk '{print $2}'` 39 | if [[ -z $bos || -z $eos ]]; then 40 | echo "$0: <s> and </s> symbols are not in $old_lang/words.txt" 41 | exit 1 42 | fi 43 | if [[ -z $unk ]]; then 44 | echo "$0: can't find oov symbol id in $old_lang/oov.int" 45 | exit 1 46 | fi 47 | 48 | 49 | arpa-to-const-arpa --bos-symbol=$bos \ 50 | --eos-symbol=$eos --unk-symbol=$unk \ 51 | "gunzip -c $arpa_lm | utils/map_arpa_lm.pl $new_lang/words.txt|" $new_lang/G.carpa || exit 1; 52 | 53 | exit 0; 54 | -------------------------------------------------------------------------------- /utils/build_kenlm_model_from_arpa.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # 2020 author Jiayu DU 3 | # Apache 2.0 4 | 5 | # This script reads in an Arpa format language model, and converts it into the 6 | # KenLM format language model. 7 | 8 | [ -f path.sh ] && . ./path.sh; 9 | 10 | # begin configuration section 11 | kenlm_opts="" # e.g. "-q 8 -b 8" for 8bits quantization 12 | model_type="trie" # "trie" or "probing". trie is smaller, probing is faster. 13 | # end configuration section 14 | 15 | . utils/parse_options.sh 16 | 17 | if [ $# != 2 ]; then 18 | echo "Usage: " 19 | echo " $0 [options] " 20 | echo "e.g.:" 21 | echo " $0 data/local/lm/4gram.arpa data/lang_test/G.trie" 22 | echo "Options:" 23 | echo " --model-type can be either \"trie\" or \"probing\"" 24 | echo " --kenlm-opts directly pass through to kenlm" 25 | echo " e.g. for 8bits quantization, feed \"-q 8 -b 8\"" 26 | exit 1; 27 | fi 28 | 29 | export LC_ALL=C 30 | 31 | arpa_lm=$1 32 | kenlm=$2 33 | 34 | if ! which build_binary >& /dev/null ; then 35 | echo "$0: cannot find KenLM's build_binary tool," 36 | echo "check kaldi installation (tools/extras/install_kenlm_query_only.sh)." 
37 | exit 1 38 | fi 39 | 40 | mkdir -p $(dirname $kenlm) 41 | build_binary $kenlm_opts $model_type $arpa_lm $kenlm 42 | 43 | echo "$0: Successfully built arpa into kenlm format: $kenlm" 44 | exit 0 45 | -------------------------------------------------------------------------------- /utils/ctm/fix_ctm.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | stmfile=$1 4 | ctmfile=$2 5 | 6 | segments_stm=`cat $stmfile | cut -f 1 -d ' ' | sort -u` 7 | segments_ctm=`cat $ctmfile | cut -f 1 -d ' ' | sort -u` 8 | 9 | segments_stm_count=`echo "$segments_stm" | wc -l ` 10 | segments_ctm_count=`echo "$segments_ctm" | wc -l ` 11 | 12 | #echo $segments_stm_count 13 | #echo $segments_ctm_count 14 | 15 | if [ "$segments_stm_count" -gt "$segments_ctm_count" ] ; then 16 | pp=$( diff <(echo "$segments_stm") <(echo "$segments_ctm" ) | grep "^<" | sed "s/^< *//g") 17 | ( 18 | for elem in $pp ; do 19 | echo "$elem 1 0 0 EMPTY_RECOGNIZED_PHRASE" 20 | done 21 | ) >> $ctmfile 22 | echo "FIXED CTM FILE" 23 | exit 0 24 | elif [ "$segments_stm_count" -lt "$segments_ctm_count" ] ; then 25 | echo "Segment STM count: $segments_stm_count" 26 | echo "Segment CTM count: $segments_ctm_count" 27 | echo "FAILURE FIXING CTM FILE" 28 | exit 1 29 | else 30 | exit 0 31 | fi 32 | 33 | -------------------------------------------------------------------------------- /utils/data/convert_data_dir_to_whole.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | # Copyright 2016-2018 Vimal Manohar 4 | # Apache 2.0 5 | 6 | # This scripts converts a data directory into a "whole" data directory 7 | # by removing the segments and using the recordings themselves as 8 | # utterances 9 | 10 | set -o pipefail 11 | 12 | . ./path.sh 13 | 14 | . utils/parse_options.sh 15 | 16 | if [ $# -ne 2 ]; then 17 | echo "Usage: convert_data_dir_to_whole.sh " 18 | echo " e.g.: convert_data_dir_to_whole.sh data/dev data/dev_whole" 19 | exit 1 20 | fi 21 | 22 | data=$1 23 | dir=$2 24 | 25 | if [ ! -f $data/segments ]; then 26 | echo "$0: Data directory already does not contain segments. So just copying it." 
27 | utils/copy_data_dir.sh $data $dir 28 | exit 0 29 | fi 30 | 31 | mkdir -p $dir 32 | cp $data/wav.scp $dir 33 | if [ -f $data/reco2file_and_channel ]; then 34 | cp $data/reco2file_and_channel $dir; 35 | fi 36 | 37 | mkdir -p $dir/.backup 38 | if [ -f $dir/feats.scp ]; then 39 | mv $dir/feats.scp $dir/.backup 40 | fi 41 | if [ -f $dir/cmvn.scp ]; then 42 | mv $dir/cmvn.scp $dir/.backup 43 | fi 44 | if [ -f $dir/utt2spk ]; then 45 | mv $dir/utt2spk $dir/.backup 46 | fi 47 | 48 | [ -f $data/stm ] && cp $data/stm $dir 49 | [ -f $data/glm ] && cp $data/glm $dir 50 | 51 | utils/data/internal/combine_segments_to_recording.py \ 52 | --write-reco2utt=$dir/reco2sorted_utts $data/segments $dir/utt2spk || exit 1 53 | 54 | if [ -f $data/text ]; then 55 | utils/apply_map.pl -f 2- $data/text < $dir/reco2sorted_utts > $dir/text || exit 1 56 | fi 57 | 58 | rm $dir/reco2sorted_utts 59 | 60 | utils/fix_data_dir.sh $dir || exit 1 61 | 62 | exit 0 63 | -------------------------------------------------------------------------------- /utils/data/extract_wav_segments_data_dir.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Copyright 2017 Hossein Hadian 4 | # Apache 2.0 5 | 6 | # This script copies a data directory (which has a 'segments' file), extracting 7 | # wav segments (according to the 'segments' file) 8 | # so that the resulting data directory does not have a 'segments' file anymore. 9 | 10 | nj=4 11 | cmd=run.pl 12 | 13 | . ./utils/parse_options.sh 14 | . ./path.sh 15 | 16 | if [ $# != 2 ]; then 17 | echo "Usage: $0 " 18 | echo " This script copies data directory to and removes" 19 | echo " the 'segments' file by extracting the wav segments." 20 | echo "Options: " 21 | echo " --nj # number of parallel jobs" 22 | echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." 23 | exit 1; 24 | fi 25 | 26 | 27 | export LC_ALL=C 28 | 29 | srcdir=$1 30 | dir=$2 31 | logdir=$dir/log 32 | 33 | if ! mkdir -p $dir/data; then 34 | echo "$0: failed to create directory $dir/data" 35 | exit 1 36 | fi 37 | mkdir -p $logdir 38 | 39 | set -eu -o pipefail 40 | utils/copy_data_dir.sh $srcdir $dir 41 | 42 | split_segments="" 43 | for n in $(seq $nj); do 44 | split_segments="$split_segments $logdir/segments.$n" 45 | done 46 | 47 | utils/split_scp.pl $srcdir/segments $split_segments 48 | 49 | $cmd JOB=1:$nj $logdir/extract_wav_segments.JOB.log \ 50 | extract-segments scp,p:$srcdir/wav.scp $logdir/segments.JOB \ 51 | ark,scp:$dir/data/wav_segments.JOB.ark,$dir/data/wav_segments.JOB.scp 52 | 53 | # concatenate the .scp files together. 54 | for n in $(seq $nj); do 55 | cat $dir/data/wav_segments.$n.scp 56 | done > $dir/data/wav_segments.scp 57 | 58 | cat $dir/data/wav_segments.scp | awk '{ print $1 " wav-copy " $2 " - |" }' >$dir/wav.scp 59 | rm $dir/{segments,reco2file_and_channel} 2>/dev/null || true 60 | -------------------------------------------------------------------------------- /utils/data/get_frame_shift.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Copyright 2016 Johns Hopkins University (author: Daniel Povey) 4 | # Apache 2.0 5 | 6 | # This script takes as input a data directory, such as data/train/, preferably 7 | # with utt2dur file already existing (or the utt2dur file will be created if 8 | # not), and it attempts to work out the approximate frame shift by comparing the 9 | # utt2dur with the output of feat-to-len on the feats.scp. It prints it out. 
10 | # if the shift is very close to, but above, 0.01 (the normal frame shift) it 11 | # rounds it down. 12 | 13 | . utils/parse_options.sh 14 | . ./path.sh 15 | 16 | if [ $# != 1 ]; then 17 | cat >&2 <) 19 | e.g.: frame_shift=\$($0 data/train) 20 | 21 | This script prints the frame-shift in seconds (e.g. 0.01) to the standard out. 22 | Its output is intended to be captured in a shell variable. 23 | 24 | If does not contain the file utt2dur, this script may invoke 25 | utils/data/get_utt2dur.sh, which will require write permission to . 26 | EOF 27 | exit 1 28 | fi 29 | 30 | export LC_ALL=C 31 | 32 | dir=$1 33 | 34 | if [[ -s $dir/frame_shift ]]; then 35 | cat $dir/frame_shift 36 | exit 37 | fi 38 | 39 | if [ ! -f $dir/feats.scp ]; then 40 | echo "$0: $dir/feats.scp does not exist" 1>&2 41 | exit 1 42 | fi 43 | 44 | if [ ! -s $dir/utt2dur ]; then 45 | if [ ! -e $dir/wav.scp ] && [ ! -s $dir/segments ]; then 46 | echo "$0: neither $dir/wav.scp nor $dir/segments exist; assuming a frame shift of 0.01." 1>&2 47 | echo 0.01 48 | exit 0 49 | fi 50 | echo "$0: $dir/utt2dur does not exist: creating it" 1>&2 51 | utils/data/get_utt2dur.sh 1>&2 $dir || exit 1 52 | fi 53 | 54 | temp=$(mktemp /tmp/tmp.XXXX) || exit 1 55 | 56 | feat-to-len --print-args=false "scp:head -n 10 $dir/feats.scp|" ark,t:- > $temp 57 | 58 | if [[ ! -s $temp ]]; then 59 | rm $temp 60 | echo "$0: error running feat-to-len" 1>&2 61 | exit 1 62 | fi 63 | 64 | frame_shift=$(head -n 10 $dir/utt2dur | paste - $temp | awk ' 65 | { dur += $2; frames += $4; } 66 | END { shift = dur / frames; 67 | if (shift > 0.01 && shift < 0.0102) shift = 0.01; 68 | print shift; }') || exit 1; 69 | 70 | rm $temp 71 | 72 | echo $frame_shift > $dir/frame_shift 73 | echo $frame_shift 74 | exit 0 75 | -------------------------------------------------------------------------------- /utils/data/get_num_frames.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # This script works out the approximate number of frames in a training directory. 4 | # This is sometimes needed by higher-level scripts 5 | 6 | 7 | if [ -f path.sh ]; then . ./path.sh; fi 8 | . parse_options.sh || exit 1; 9 | 10 | if [ $# -ne 1 ]; then 11 | ( 12 | echo "Usage: $0 " 13 | echo "Prints the number of frames of data in the data-dir" 14 | ) 1>&2 15 | fi 16 | 17 | data=$1 18 | 19 | if [ ! -f $data/utt2dur ]; then 20 | utils/data/get_utt2dur.sh $data 1>&2 || exit 1 21 | fi 22 | 23 | frame_shift=$(utils/data/get_frame_shift.sh $data) || exit 1 24 | 25 | awk -v s=$frame_shift '{n += $2} END{printf("%.0f\n", (n / s))}' <$data/utt2dur 26 | -------------------------------------------------------------------------------- /utils/data/get_reco2utt_for_data.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | # Copyright 2016 Vimal Manohar 4 | # Apache 2.0 5 | 6 | if [ $# -ne 1 ]; then 7 | echo "This script outputs a mapping from recording to a list of utterances " 8 | echo "corresponding to the recording. It is analogous to the content of " 9 | echo "a spk2utt file, but is indexed by recording instead of speaker." 10 | echo "Usage: get_reco2utt.sh " 11 | echo " e.g.: get_reco2utt.sh data/train" 12 | exit 1 13 | fi 14 | 15 | data=$1 16 | 17 | if [ ! 
-s $data/segments ]; then 18 | utils/data/get_segments_for_data.sh $data > $data/segments 19 | fi 20 | 21 | cut -d ' ' -f 1,2 $data/segments | utils/utt2spk_to_spk2utt.pl 22 | -------------------------------------------------------------------------------- /utils/data/get_segments_for_data.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # This script operates on a data directory, such as in data/train/, 4 | # and writes new segments to stdout. The file 'segments' maps from 5 | # utterance to time offsets into a recording, with the format: 6 | # 7 | # This script assumes utterance and recording ids are the same (i.e., that 8 | # wav.scp is indexed by utterance), and uses durations from 'utt2dur', 9 | # created if necessary by get_utt2dur.sh. 10 | 11 | . ./path.sh 12 | 13 | if [ $# != 1 ]; then 14 | echo "Usage: $0 [options] " 15 | echo "e.g.:" 16 | echo " $0 data/train > data/train/segments" 17 | exit 1 18 | fi 19 | 20 | data=$1 21 | 22 | if [ ! -s $data/utt2dur ]; then 23 | utils/data/get_utt2dur.sh $data 1>&2 || exit 1; 24 | fi 25 | 26 | # 0 27 | awk '{ print $1, $1, 0, $2 }' $data/utt2dur 28 | 29 | exit 0 30 | -------------------------------------------------------------------------------- /utils/data/get_utt2num_frames.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | # Copyright 2016 Vimal Manohar 4 | # Apache 2.0. 5 | 6 | cmd=run.pl 7 | nj=4 8 | 9 | frame_shift=0.01 10 | frame_overlap=0.015 11 | 12 | . utils/parse_options.sh 13 | . ./path.sh 14 | 15 | if [ $# -ne 1 ]; then 16 | echo "This script writes a file utt2num_frames with the " 17 | echo "number of frames in each utterance as measured based on the " 18 | echo "duration of the utterances (in utt2dur) and the specified " 19 | echo "frame_shift and frame_overlap." 20 | echo "Usage: $0 " 21 | exit 1 22 | fi 23 | 24 | data=$1 25 | 26 | if [ -s $data/utt2num_frames ]; then 27 | echo "$0: $data/utt2num_frames already present!" 28 | exit 0; 29 | fi 30 | 31 | if [ ! -f $data/feats.scp ]; then 32 | utils/data/get_utt2dur.sh --nj ${nj} --cmd "$cmd" $data 33 | awk -v fs=$frame_shift -v fovlp=$frame_overlap \ 34 | '{print $1" "int( ($2 - fovlp) / fs)}' $data/utt2dur > $data/utt2num_frames 35 | exit 0 36 | fi 37 | 38 | utils/split_data.sh --per-utt $data $nj || exit 1 39 | $cmd JOB=1:$nj $data/log/get_utt2num_frames.JOB.log \ 40 | feat-to-len scp:$data/split${nj}utt/JOB/feats.scp ark,t:$data/split${nj}utt/JOB/utt2num_frames || exit 1 41 | 42 | for n in `seq $nj`; do 43 | cat $data/split${nj}utt/$n/utt2num_frames 44 | done > $data/utt2num_frames 45 | 46 | echo "$0: Computed and wrote $data/utt2num_frames" 47 | -------------------------------------------------------------------------------- /utils/data/internal/combine_segments_to_recording.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | # Copyright 2018 Vimal Manohar 4 | # Apache 2.0 5 | 6 | from __future__ import print_function 7 | import argparse 8 | import sys 9 | import collections 10 | from collections import defaultdict 11 | 12 | def get_args(): 13 | parser = argparse.ArgumentParser(description=""" 14 | This script combines segments into utterances at 15 | recording-level and write out new utt2spk file with reco-id as the 16 | speakers. If --write-reco2utt is provided, it writes a mapping from 17 | recording-id to the list of utterances sorted by start and end times. 
18 | This map can be used to combine text corresponding to the segments to 19 | recording-level.""") 20 | 21 | parser.add_argument("--write-reco2utt", help="If provided, writes a " 22 | "mapping from recording-id to list of utterances " 23 | "sorted by start and end times.") 24 | parser.add_argument("segments_in", help="Input segments file") 25 | parser.add_argument("utt2spk_out", help="Output utt2spk file") 26 | 27 | args = parser.parse_args() 28 | 29 | return args 30 | 31 | 32 | def main(): 33 | args = get_args() 34 | 35 | utt2reco = {} 36 | segments_for_reco = defaultdict(list) 37 | for line in open(args.segments_in): 38 | parts = line.strip().split() 39 | 40 | if len(parts) < 4: 41 | raise TypeError("bad line in segments file {}".format(line)) 42 | 43 | utt = parts[0] 44 | reco = parts[1] 45 | start_time = parts[2] 46 | end_time = parts[3] 47 | 48 | segments_for_reco[reco].append((utt, start_time, end_time)) 49 | utt2reco[utt] = reco 50 | 51 | if args.write_reco2utt is not None: 52 | with open(args.write_reco2utt, 'w') as reco2utt_writer, \ 53 | open(args.utt2spk_out, 'w') as utt2spk_writer: 54 | for reco, segments_in_reco in segments_for_reco.items(): 55 | utts = ' '.join([seg[0] for seg in sorted( 56 | segments_in_reco, key=lambda x:(x[1], x[2]))]) 57 | print("{0} {1}".format(reco, utts), file=reco2utt_writer) 58 | print ("{0} {0}".format(reco), file=utt2spk_writer) 59 | else: 60 | with open(args.utt2spk_out, 'w') as utt2spk_writer: 61 | for reco in segments_for_reco.keys(): 62 | print ("{0} {0}".format(reco), file=utt2spk_writer) 63 | 64 | 65 | if __name__ == "__main__": 66 | main() 67 | -------------------------------------------------------------------------------- /utils/data/limit_feature_dim.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Copyright 2016 Alibaba Robotics Corp. (author: Xingyu Na) 4 | # Apache 2.0 5 | 6 | # The script creates a new data directory by selecting a specified 7 | # dimension range of the features in the source directory. 8 | 9 | . utils/parse_options.sh 10 | 11 | if [ $# != 3 ]; then 12 | echo "Usage: " 13 | echo " $0 " 14 | echo "The script creates a new data directory by selecting a specified" 15 | echo "dimension range of the features in the source directory." 16 | echo "e.g.:" 17 | echo " $0 0:39 data/train_hires_pitch data/train_hires" 18 | exit 1; 19 | fi 20 | 21 | feat_dim_range=$1 22 | srcdir=$2 23 | destdir=$3 24 | 25 | if [ "$destdir" == "$srcdir" ]; then 26 | echo "$0: this script requires and to be different." 27 | exit 1 28 | fi 29 | 30 | if [ ! -f $srcdir/feats.scp ]; then 31 | echo "$0: no such file $srcdir/feats.scp" 32 | exit 1; 33 | fi 34 | 35 | mkdir -p $destdir 36 | utils/copy_data_dir.sh $srcdir $destdir 37 | 38 | if [ -f $destdir/cmvn.scp ]; then 39 | rm $destdir/cmvn.scp 40 | echo "$0: warning: removing $destdir/cmvn.cp, you will have to regenerate it from the features." 41 | fi 42 | 43 | rm $destdir/feats.scp 44 | sed 's/$/\[:,'${feat_dim_range}'\]/' $srcdir/feats.scp | \ 45 | utils/data/normalize_data_range.pl > $destdir/feats.scp 46 | 47 | [ ! -f $srcdir/text ] && validate_opts="$validate_opts --no-text" 48 | utils/validate_data_dir.sh $validate_opts $destdir 49 | -------------------------------------------------------------------------------- /utils/data/modify_speaker_info_to_recording.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Copyright 2017 Vimal Manohar 4 | # Apache 2.0. 
5 | 6 | # Copy the data directory, but modify it to use the recording-id as the 7 | # speaker. This is useful to get matching speaker information in the 8 | # whole recording data directory. 9 | # Note that this also appends the recording-id as a prefix to the 10 | # utterance-id. 11 | 12 | if [ $# -ne 2 ]; then 13 | echo "Usage: $0 " 14 | echo " e.g.: $0 data/train data/train_recospk" 15 | exit 1 16 | fi 17 | 18 | in_data=$1 19 | out_data=$2 20 | 21 | mkdir -p $out_data 22 | 23 | for f in wav.scp segments utt2spk; do 24 | if [ ! -f $in_data/$f ]; then 25 | echo "$0: Could not find file $in_data/$f" 26 | exit 1 27 | fi 28 | done 29 | 30 | cp $in_data/wav.scp $out_data/ || exit 1 31 | cp $in_data/reco2file_and_channel $out_data/ 2> /dev/null || true 32 | awk '{print $1" "$2"-"$1}' $in_data/segments > \ 33 | $out_data/old2new.uttmap || exit 1 34 | utils/apply_map.pl -f 1 $out_data/old2new.uttmap < $in_data/segments > \ 35 | $out_data/segments || exit 1 36 | awk '{print $1" "$2}' $out_data/segments > $out_data/utt2spk || exit 1 37 | utils/utt2spk_to_spk2utt.pl $out_data/utt2spk > $out_data/spk2utt || exit 1 38 | 39 | if [ -f $in_data/text ]; then 40 | utils/apply_map.pl -f 1 $out_data/old2new.uttmap < $in_data/text > \ 41 | $out_data/text || exit 1 42 | fi 43 | 44 | if [ -f $in_data/feats.scp ]; then 45 | utils/apply_map.pl -f 1 $out_data/old2new.uttmap < $in_data/feats.scp > \ 46 | $out_data/feats.scp || exit 1 47 | fi 48 | 49 | utils/fix_data_dir.sh $out_data || exit 1 50 | utils/validate_data_dir.sh --no-text --no-feats $out_data || exit 1 51 | -------------------------------------------------------------------------------- /utils/data/remove_dup_utts.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Remove excess utterances once they appear more than a specified 4 | # number of times with the same transcription, in a data set. 5 | # E.g. useful for removing excess "uh-huh" from training. 6 | 7 | if [ $# != 3 ]; then 8 | echo "Usage: remove_dup_utts.sh max-count " 9 | echo "e.g.: remove_dup_utts.sh 10 data/train data/train_nodup" 10 | echo "This script is used to filter out utterances that have from over-represented" 11 | echo "transcriptions (such as 'uh-huh'), by limiting the number of repetitions of" 12 | echo "any given word-sequence to a specified value. It's often used to get" 13 | echo "subsets for early stages of training." 14 | exit 1; 15 | fi 16 | 17 | maxcount=$1 18 | srcdir=$2 19 | destdir=$3 20 | mkdir -p $destdir 21 | 22 | [ ! -f $srcdir/text ] && echo "$0: Invalid input directory $srcdir" && exit 1; 23 | 24 | ! mkdir -p $destdir && echo "$0: could not create directory $destdir" && exit 1; 25 | 26 | ! [ "$maxcount" -gt 1 ] && echo "$0: invalid max-count '$maxcount'" && exit 1; 27 | 28 | cp $srcdir/* $destdir 29 | cat $srcdir/text | \ 30 | perl -e ' 31 | $maxcount = shift @ARGV; 32 | @all = (); 33 | $p1 = 103349; $p2 = 71147; $k = 0; 34 | sub random { # our own random number generator: predictable. 
35 | $k = ($k + $p1) % $p2; 36 | return ($k / $p2); 37 | } 38 | while(<>) { 39 | push @all, $_; 40 | @A = split(" ", $_); 41 | shift @A; 42 | $text = join(" ", @A); 43 | $count{$text} ++; 44 | } 45 | foreach $line (@all) { 46 | @A = split(" ", $line); 47 | shift @A; 48 | $text = join(" ", @A); 49 | $n = $count{$text}; 50 | if ($n < $maxcount || random() < ($maxcount / $n)) { 51 | print $line; 52 | } 53 | }' $maxcount >$destdir/text 54 | 55 | echo "Reduced number of utterances from `cat $srcdir/text | wc -l` to `cat $destdir/text | wc -l`" 56 | 57 | echo "Using fix_data_dir.sh to reconcile the other files." 58 | utils/fix_data_dir.sh $destdir 59 | rm -r $destdir/.backup 60 | 61 | exit 0 62 | -------------------------------------------------------------------------------- /utils/data/resample_data_dir.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | # Copyright 2016 Vimal Manohar 4 | # 2018 Xiaohui Zhang 5 | # Apache 2.0. 6 | 7 | if [ $# -ne 2 ]; then 8 | echo "This script adds a sox line in wav.scp to resample the audio at a " 9 | echo "different sampling-rate" 10 | echo "Usage: $0 " 11 | echo " e.g.: $0 8000 data/dev" 12 | exit 1 13 | fi 14 | 15 | freq=$1 16 | dir=$2 17 | 18 | sox=`which sox` || { echo "Could not find sox in PATH"; exit 1; } 19 | 20 | if [ -f $dir/feats.scp ]; then 21 | mkdir -p $dir/.backup 22 | mv $dir/feats.scp $dir/.backup/ 23 | if [ -f $dir/cmvn.scp ]; then 24 | mv $dir/cmvn.scp $dir/.backup/ 25 | fi 26 | echo "$0: feats.scp already exists. Moving it to $dir/.backup" 27 | fi 28 | 29 | # After resampling we cannot compute utt2dur from wav.scp any more, 30 | # so we create utt2dur now, in case it's needed later 31 | if [ ! -s $dir/utt2dur ]; then 32 | utils/data/get_utt2dur.sh $dir 1>&2 || exit 1; 33 | fi 34 | 35 | mv $dir/wav.scp $dir/wav.scp.tmp 36 | cat $dir/wav.scp.tmp | python -c "import sys 37 | for line in sys.stdin.readlines(): 38 | splits = line.strip().split() 39 | if splits[-1] == '|': 40 | out_line = line.strip() + ' $sox -t wav - -c 1 -b 16 -t wav - rate $freq |' 41 | else: 42 | out_line = '{0} cat {1} | $sox -t wav - -c 1 -b 16 -t wav - rate $freq |'.format(splits[0], ' '.join(splits[1:])) 43 | print (out_line)" > ${dir}/wav.scp 44 | rm $dir/wav.scp.tmp 45 | -------------------------------------------------------------------------------- /utils/data/shift_and_combine_feats.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Copyright 2017 Hossein Hadian 4 | 5 | # Apache 2.0 6 | 7 | write_utt2orig= # if provided, this script will write 8 | # a mapping of shifted utterance ids 9 | # to the original ones into the file 10 | # specified by this option 11 | 12 | echo "$0 $@" # Print the command line for logging 13 | if [ -f path.sh ]; then . ./path.sh; fi 14 | . utils/parse_options.sh 15 | 16 | if [ $# != 3 ]; then 17 | echo "Usage: $0 " 18 | echo "e.g.: $0 3 data/train data/train_fs3" 19 | echo "For use in perturbing data for discriminative training and alignment of" 20 | echo "frame-subsampled systems, this script uses utils/data/shift_feats.sh" 21 | echo "and utils/data/combine_data.sh to shift the features" 22 | echo " different ways and combine them." 23 | echo "E.g. if is 3, this script will combine" 24 | echo "the data frame-shifted by -1, 0 and 1 (c.f. shift-feats)." 25 | exit 1 26 | fi 27 | 28 | frame_subsampling_factor=$1 29 | srcdir=$2 30 | destdir=$3 31 | 32 | if [ ! 
-f $srcdir/feats.scp ]; then 33 | echo "$0: expected $srcdir/feats.scp to exist" 34 | exit 1 35 | fi 36 | 37 | if [ -f $destdir/feats.scp ]; then 38 | echo "$0: $destdir/feats.scp already exists: refusing to run this (please delete $destdir/feats.scp if you want this to run)" 39 | exit 1 40 | fi 41 | 42 | if [ ! -z $write_utt2orig ]; then 43 | awk '{print $1 " " $1}' $srcdir/feats.scp >$write_utt2orig 44 | fi 45 | 46 | tmp_shift_destdirs=() 47 | for frame_shift in `seq $[-(frame_subsampling_factor/2)] $[-(frame_subsampling_factor/2) + frame_subsampling_factor - 1]`; do 48 | if [ "$frame_shift" == 0 ]; then continue; fi 49 | utils/data/shift_feats.sh $frame_shift $srcdir ${destdir}_fs$frame_shift || exit 1 50 | tmp_shift_destdirs+=("${destdir}_fs$frame_shift") 51 | if [ ! -z $write_utt2orig ]; then 52 | awk -v prefix="fs$frame_shift-" '{printf("%s%s %s\n", prefix, $1, $1);}' $srcdir/feats.scp >>$write_utt2orig 53 | fi 54 | done 55 | utils/data/combine_data.sh $destdir $srcdir ${tmp_shift_destdirs[@]} || exit 1 56 | rm -r ${tmp_shift_destdirs[@]} 57 | 58 | utils/validate_data_dir.sh $destdir 59 | 60 | src_nf=`cat $srcdir/feats.scp | wc -l` 61 | dest_nf=`cat $destdir/feats.scp | wc -l` 62 | if [ $[src_nf*frame_subsampling_factor] -ne $dest_nf ]; then 63 | echo "There was a problem. Expected number of feature lines in destination dir to be $[src_nf*frame_subsampling_factor];" 64 | exit 1; 65 | fi 66 | 67 | echo "$0: Successfully generated $frame_subsampling_factor-way shifted version of data in $srcdir, in $destdir" 68 | -------------------------------------------------------------------------------- /utils/data/shift_feats.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Copyright 2016 Vimal Manohar 4 | # 2017 Hossein Hadian 5 | # Apache 2.0 6 | 7 | echo "$0 $@" # Print the command line for logging 8 | if [ -f path.sh ]; then . ./path.sh; fi 9 | . parse_options.sh || exit 1; 10 | 11 | if [ $# != 3 ]; then 12 | echo " Usage: $0 " 13 | echo "e.g.: $0 -1 data/train data/train_fs-1" 14 | echo "The script creates a new data directory with the features modified" 15 | echo "using the program shift-feats with the specified frame-shift." 16 | echo "This program automatically adds the prefix 'fs-' to the" 17 | echo "utterance and speaker names. See also utils/data/shift_and_combine_feats.sh" 18 | exit 1 19 | fi 20 | 21 | frame_shift=$1 22 | srcdir=$2 23 | destdir=$3 24 | 25 | 26 | if [ "$destdir" == "$srcdir" ]; then 27 | echo "$0: this script requires and to be different." 28 | exit 1 29 | fi 30 | 31 | if [ ! -f $srcdir/feats.scp ]; then 32 | echo "$0: no such file $srcdir/feats.scp" 33 | exit 1; 34 | fi 35 | 36 | utt_prefix="fs$frame_shift-" 37 | spk_prefix="fs$frame_shift-" 38 | 39 | mkdir -p $destdir 40 | utils/copy_data_dir.sh --utt-prefix $utt_prefix --spk-prefix $spk_prefix \ 41 | $srcdir $destdir 42 | 43 | if grep --quiet "'" $srcdir/feats.scp; then 44 | echo "$0: the input features already use single quotes. Can't proceed." 
45 | exit 1; 46 | fi 47 | 48 | awk -v shift=$frame_shift 'NF == 2 {uttid=$1; feat=$2; qt="";} \ 49 | NF > 2 {idx=index($0, " "); uttid=$1; feat=substr($0, idx + 1); qt="\x27";} \ 50 | NF {print uttid " shift-feats --print-args=false --shift=" shift, qt feat qt " - |";}' \ 51 | $destdir/feats.scp >$destdir/feats_shifted.scp 52 | mv -f $destdir/feats_shifted.scp $destdir/feats.scp 53 | 54 | echo "$0: Done" 55 | 56 | -------------------------------------------------------------------------------- /utils/eps2disambig.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | # Copyright 2010-2011 Microsoft Corporation 3 | # 2015 Guoguo Chen 4 | 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 12 | # KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED 13 | # WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, 14 | # MERCHANTABLITY OR NON-INFRINGEMENT. 15 | # See the Apache 2 License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | # This script replaces epsilon with #0 on the input side only, of the G.fst 19 | # acceptor. 20 | 21 | while(<>){ 22 | if (/\s+#0\s+/) { 23 | print STDERR "$0: ERROR: LM has word #0, " . 24 | "which is reserved as disambiguation symbol\n"; 25 | exit 1; 26 | } 27 | s:^(\d+\s+\d+\s+)\(\s+):$1#0$2:; 28 | print; 29 | } 30 | -------------------------------------------------------------------------------- /utils/filt.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Apache 2.0 4 | 5 | from __future__ import print_function 6 | import sys 7 | 8 | vocab=set() 9 | with open(sys.argv[1]) as vocabfile: 10 | for line in vocabfile: 11 | vocab.add(line.strip()) 12 | 13 | with open(sys.argv[2]) as textfile: 14 | for line in textfile: 15 | print(" ".join([word if word in vocab else '' for word in line.strip().split()])) 16 | -------------------------------------------------------------------------------- /utils/find_arpa_oovs.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | # Copyright 2010-2011 Microsoft Corporation 3 | 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 11 | # KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED 12 | # WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, 13 | # MERCHANTABLITY OR NON-INFRINGEMENT. 14 | # See the Apache 2 License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | 18 | if ( @ARGV < 1 && @ARGV > 2) { 19 | die "Usage: find_arpa_oovs.pl words.txt [lm.arpa]\n"; 20 | # This program finds words in the arpa file that are not symbols 21 | # in the OpenFst-format symbol table words.txt. It prints them 22 | # on the standard output, one per line. 
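  # Illustrative example (the word "FOO" is hypothetical): if lm.arpa contains
  # a 1-gram entry for FOO but words.txt has no line for FOO, then "FOO" is
  # printed, one OOV word per line.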
23 | } 24 | 25 | $symtab = shift @ARGV; 26 | open(S, "<$symtab") || die "Failed opening symbol table file $symtab\n"; 27 | while(){ 28 | @A = split(" ", $_); 29 | @A == 2 || die "Bad line in symbol table file: $_"; 30 | $seen{$A[0]} = 1; 31 | } 32 | 33 | $found_data=0; 34 | $curgram=0; 35 | while(<>) { # Find the \data\ marker. 36 | if(m:^\\data\\\s*$:) { $found_data=1; last; } 37 | } 38 | 39 | if ($found_data==0) { 40 | print STDERR "find_arpa_oovs.pl: found no \\data\\ marker in the ARPA input.\n"; 41 | exit(1); 42 | } 43 | 44 | while(<>) { 45 | if(m/^\\(\d+)\-grams:\s*$/) { 46 | $curgram = $1; 47 | if($curgram > 1) { 48 | last; # This is an optimization as we can get the vocab from the 1-grams 49 | } 50 | } elsif($curgram > 0) { 51 | @A = split(" ", $_); 52 | if(@A > 1) { 53 | shift @A; 54 | for($n=0;$n<$curgram;$n++) { 55 | $word = $A[$n]; 56 | if(!defined $word) { print STDERR "Unusual line $_ (line $.) in arpa file.\n"; } 57 | $in_arpa{$word} = 1; 58 | } 59 | } else { 60 | if(@A > 0 && $A[0] !~ m:\\end\\:) { 61 | print STDERR "Unusual line $_ (line $.) in arpa file\n"; 62 | } 63 | } 64 | } 65 | } 66 | 67 | foreach $w (keys %in_arpa) { 68 | if(!defined $seen{$w} && $w ne "" && $w ne "") { 69 | print "$w\n"; 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /utils/fix_ctm.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | stmfile=$1 4 | ctmfile=$2 5 | 6 | segments_stm=`cat $stmfile | cut -f 1 -d ' ' | sort -u` 7 | segments_ctm=`cat $ctmfile | cut -f 1 -d ' ' | sort -u` 8 | 9 | segments_stm_count=`echo "$segments_stm" | wc -l ` 10 | segments_ctm_count=`echo "$segments_ctm" | wc -l ` 11 | 12 | #echo $segments_stm_count 13 | #echo $segments_ctm_count 14 | 15 | if [ "$segments_stm_count" -gt "$segments_ctm_count" ] ; then 16 | pp=$( diff <(echo "$segments_stm") <(echo "$segments_ctm" ) | grep "^<" | sed "s/^< *//g") 17 | ( 18 | for elem in $pp ; do 19 | echo "$elem 1 0 0 EMPTY_RECOGNIZED_PHRASE" 20 | done 21 | ) >> $ctmfile 22 | echo "FIXED CTM FILE" 23 | exit 0 24 | elif [ "$segments_stm_count" -lt "$segments_ctm_count" ] ; then 25 | echo "Segment STM count: $segments_stm_count" 26 | echo "Segment CTM count: $segments_ctm_count" 27 | echo "FAILURE FIXING CTM FILE" 28 | exit 1 29 | else 30 | exit 0 31 | fi 32 | 33 | -------------------------------------------------------------------------------- /utils/int2sym.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | # Copyright 2010-2012 Microsoft Corporation Johns Hopkins University (Author: Daniel Povey) 3 | # Apache 2.0. 4 | 5 | undef $field_begin; 6 | undef $field_end; 7 | 8 | 9 | if ($ARGV[0] eq "-f") { 10 | shift @ARGV; 11 | $field_spec = shift @ARGV; 12 | if ($field_spec =~ m/^\d+$/) { 13 | $field_begin = $field_spec - 1; $field_end = $field_spec - 1; 14 | } 15 | if ($field_spec =~ m/^(\d*)[-:](\d*)/) { # accept e.g. 1:10 as a courtesy (properly, 1-10) 16 | if ($1 ne "") { 17 | $field_begin = $1 - 1; # Change to zero-based indexing. 18 | } 19 | if ($2 ne "") { 20 | $field_end = $2 - 1; # Change to zero-based indexing. 21 | } 22 | } 23 | if (!defined $field_begin && !defined $field_end) { 24 | die "Bad argument to -f option: $field_spec"; 25 | } 26 | } 27 | $symtab = shift @ARGV; 28 | if(!defined $symtab) { 29 | print STDERR "Usage: int2sym.pl [options] symtab [input] > output\n" . 30 | "options: [-f (|-)]\n" . 
31 | "e.g.: -f 2, or -f 3-4\n"; 32 | exit(1); 33 | } 34 | 35 | open(F, "<$symtab") || die "Error opening symbol table file $symtab"; 36 | while() { 37 | @A = split(" ", $_); 38 | @A == 2 || die "bad line in symbol table file: $_"; 39 | $int2sym{$A[1]} = $A[0]; 40 | } 41 | 42 | sub int2sym { 43 | my $a = shift @_; 44 | my $pos = shift @_; 45 | if($a !~ m:^\d+$:) { # not all digits.. 46 | $pos1 = $pos+1; # make it one-based. 47 | die "int2sym.pl: found noninteger token $a [in position $pos1]\n"; 48 | } 49 | $s = $int2sym{$a}; 50 | if(!defined ($s)) { 51 | die "int2sym.pl: integer $a not in symbol table $symtab."; 52 | } 53 | return $s; 54 | } 55 | 56 | $error = 0; 57 | while (<>) { 58 | @A = split(" ", $_); 59 | for ($pos = 0; $pos <= $#A; $pos++) { 60 | $a = $A[$pos]; 61 | if ( (!defined $field_begin || $pos >= $field_begin) 62 | && (!defined $field_end || $pos <= $field_end)) { 63 | $a = int2sym($a, $pos); 64 | } 65 | print $a . " "; 66 | } 67 | print "\n"; 68 | } 69 | 70 | 71 | 72 | -------------------------------------------------------------------------------- /utils/lang/add_unigrams_arpa.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | 3 | # Copyright 2018 Xiaohui Zhang 4 | # Apache 2.0. 5 | # 6 | use strict; 7 | use warnings; 8 | use Getopt::Long; 9 | 10 | my $Usage = < output-arpa 13 | contains a list of words and their probabilities, e.g. "jack 0.2". All probs will be 14 | scaled by a positive scalar and then be used as the unigram prob. of the added word. 15 | The scale should approximiately relect the OOV rate of the language in concern. 16 | EOU 17 | 18 | my @F; 19 | my @OOVS; 20 | 21 | if (@ARGV != 2) { 22 | die $Usage; 23 | } 24 | 25 | # Gets parameters. 26 | my $oov_prob_file = shift @ARGV; 27 | my $scale = shift @ARGV; 28 | my $arpa_in = shift @ARGV; 29 | my $arpa_out = shift @ARGV; 30 | 31 | # Opens files. 32 | open(F, "<$oov_prob_file") || die "$0: Fail to open $oov_prob_file\n"; 33 | while () { push @OOVS, $_; } 34 | my $num_oovs = @OOVS; 35 | 36 | $scale > 0.0 || die "Bad scale"; 37 | print STDERR "$0: Creating LM file with additional unigrams, using $oov_prob_file\n"; 38 | 39 | my %vocab; 40 | my $unigram = 0; 41 | my $num_unigrams = 0; 42 | my @lines; 43 | 44 | # Parse and record the head and unigrams in the ARPA LM. 45 | while() { 46 | if (m/^ngram 1=(\d+)/) { $num_unigrams = $1; } 47 | 48 | if (m/^\\2-grams:$/) { last; } 49 | if (m/^\\1-grams:$/) { $unigram = 1; push(@lines, $_); next; } 50 | if (m/^\\2-grams:$/) { $unigram = 0; } 51 | 52 | my @col = split(" ", $_); 53 | if ( $unigram == 1 ) { 54 | # Record in-vocab words into a map. 55 | if ( @col > 0 ) { 56 | my $word = $col[1]; 57 | $vocab{$word} = 1; 58 | push(@lines, $_); 59 | } else { 60 | # Insert out-of-vocab words and their probs into the unigram list. 61 | foreach my $l (@OOVS) { 62 | my @A = split(" ", $l); 63 | @A == 2 || die "bad line in oov2prob: $_;"; 64 | my $word = $A[0]; 65 | my $prob = $A[1]; 66 | if (exists($vocab{$word})) { next; } 67 | $num_unigrams ++; 68 | my $log10prob = (log($prob * $scale) / log(10.0)); 69 | $vocab{$word} = 1; 70 | my $line = sprintf("%.6f\t$word\n", $log10prob); 71 | push(@lines, $line); 72 | } 73 | } 74 | } else { push(@lines, $_); } 75 | } 76 | 77 | # Print the head and unigrams, with the updated # unigrams in the head. 78 | foreach my $l (@lines) { 79 | if ($l =~ m/ngram 1=/) { 80 | print "ngram 1=$num_unigrams\n"; 81 | } else { 82 | print $l; 83 | } 84 | } 85 | 86 | # Print the left fields. 
87 | print "\n\\2-grams:\n"; 88 | while() { 89 | print; 90 | } 91 | 92 | close(F); 93 | exit 0 94 | -------------------------------------------------------------------------------- /utils/lang/adjust_unk_arpa.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | 3 | # Copyright 2018 Xiaohui Zhang 4 | # Apache 2.0. 5 | # 6 | use strict; 7 | use warnings; 8 | use Getopt::Long; 9 | 10 | my $Usage = < output-arpa 13 | 14 | Allowed options: 15 | --fixed-value (true|false) : If true, interpret the unk-scale as a fixed value we'll set to 16 | the unigram prob of the OOV dict entry, rather than using it to 17 | scale the probs. In this case higher order n-grams containing 18 | the OOV dict entry remain untouched. This is useful when the OOV 19 | dict entry doesn't appear in n-grams (n>1) as the predicted word. 20 | EOU 21 | 22 | my $fixed_value = "false"; 23 | GetOptions('fixed-value=s' => \$fixed_value); 24 | 25 | ($fixed_value eq "true" || $fixed_value eq "false") || 26 | die "$0: Bad value for option --fixed-value\n"; 27 | 28 | if (@ARGV != 2) { 29 | die $Usage; 30 | } 31 | 32 | # Gets parameters. 33 | my $unk_word = shift @ARGV; 34 | my $unk_scale = shift @ARGV; 35 | my $arpa_in = shift @ARGV; 36 | my $arpa_out = shift @ARGV; 37 | 38 | $unk_scale > 0.0 || die "Bad unk_scale"; # this must be positive 39 | if ( $fixed_value eq "true" ) { 40 | print STDERR "$0: Setting the unigram prob of $unk_word in LM file as $unk_scale.\n"; 41 | } else { 42 | print STDERR "$0: Scaling the probs of ngrams where $unk_word is the predicted word in LM file by $unk_scale.\n"; 43 | } 44 | 45 | my $ngram = 0; # the order of ngram we are visiting 46 | 47 | # Change the unigram prob of the unk-word in the ARPA LM. 48 | while() { 49 | if (m/^\\1-grams:$/) { $ngram = 1; } 50 | if (m/^\\2-grams:$/) { $ngram = 2; } 51 | if (m/^\\3-grams:$/) { $ngram = 3; } 52 | if (m/^\\4-grams:$/) { $ngram = 4; } 53 | if (m/^\\5-grams:$/) { $ngram = 5; } 54 | my @col = split(" ", $_); 55 | if ( @col > 1 && $ngram > 0 && $col[$ngram] eq $unk_word ) { 56 | if ( $fixed_value eq "true" && $ngram == 1 ) { 57 | $col[0] = (log($unk_scale) / log(10.0)); 58 | } elsif ($fixed_value eq "false" ) { 59 | $col[0] += (log($unk_scale) / log(10.0)); 60 | } 61 | my $line = join("\t", @col); 62 | print "$line\n"; 63 | } else { 64 | print; 65 | } 66 | } 67 | 68 | exit 0 69 | -------------------------------------------------------------------------------- /utils/lang/adjust_unk_graph.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # Copyright 2018 Xiaohui Zhang 3 | # Apache 2.0 4 | 5 | # This script copies a fully expanded decoding graph (HCLG.fst) and scales the scores 6 | # of all arcs whose output symbol is a user-specified OOV symbol (or any other word). 7 | # This achieves an equivalent effect of utils/lang/adjust_unk_arpa.pl, which scales 8 | # the LM prob of all ngrams predicting an OOV symbol, while avoiding re-creating the graph. 9 | 10 | set -o pipefail 11 | 12 | if [ $# != 4 ]; then 13 | echo "Usage: utils/adjust_unk_graph.sh " 14 | echo "e.g.: utils/adjust_unk_graph.sh \"\" 0.1 exp/tri1/graph exp/tri1/graph_unk_scale_0.1" 15 | exit 1; 16 | fi 17 | 18 | if [ -f path.sh ]; then . 
./path.sh; fi 19 | 20 | oov_word=$1 21 | unk_scale=$2 22 | graphdir_in=$3 23 | graphdir_out=$4 24 | 25 | mkdir -p $graphdir_out 26 | 27 | required="HCLG.fst words.txt disambig_tid.int num_pdfs phones phones.txt words.txt" 28 | for f in $required; do 29 | [ ! -e $graphdir_in/$f ] && echo "adjust_unk_graph.sh: expected $graphdir_in/$f to exist" && exit 1; 30 | cp -r $graphdir_in/$f $graphdir_out 31 | done 32 | 33 | cp -r $graphdir_in/{disambig_tid.int,num_pdfs,phones,phones.txt,words.txt} $graphdir_out 34 | 35 | oov_id=`echo $oov_word | utils/sym2int.pl $graphdir_in/words.txt` 36 | [ -z $oov_id ] && echo "adjust_unk_graph.sh: the specified oov symbol $oov_word is out of the vocabulary." && exit 1; 37 | fstprint $graphdir_in/HCLG.fst | awk -v oov=$oov_id -v unk_scale=$unk_scale '{if($4==oov) $5=$5-log(unk_scale);print $0}' | \ 38 | fstcompile | fstconvert --fst_type=const > $graphdir_out/HCLG.fst || exit 1; 39 | -------------------------------------------------------------------------------- /utils/lang/bpe/add_final_optional_silence.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | . ./path.sh 3 | 4 | final_sil_prob=0.5 5 | 6 | echo "$0 $@" # Print the command line for logging 7 | 8 | . ./utils/parse_options.sh 9 | 10 | if [ $# -ne 1 ]; then 11 | echo "Usage: $0 " 12 | echo " Add final optional silence to lexicon FSTs (L.fst and L_disambig.fst) in" 13 | echo " lang/ directory ." 14 | echo " This can be useful in systems with byte-pair encoded (BPE) lexicons, in which" 15 | echo " the word-initial silence is part of the lexicon, so we turn off the standard" 16 | echo " optional silence in the lexicon" 17 | echo "options:" 18 | echo " --final-sil-prob # default 0.5" 19 | exit 1; 20 | fi 21 | 22 | lang=$1 23 | 24 | if [ $lang/phones/final_sil_prob -nt $lang/phones/nonsilence.txt ]; then 25 | echo "$0 $lang/phones/final_sil_prob exists. Exiting..." 26 | exit 1; 27 | fi 28 | 29 | silphone=$(cat $lang/phones/optional_silence.int) 30 | 31 | sil_eq_zero=$(echo $(perl -e "if ( $final_sil_prob == 0.0) {print 'true';} else {print 'false';}")) 32 | sil_eq_one=$(echo $(perl -e "if ( $final_sil_prob == 1.0) {print 'true';} else {print 'false';}")) 33 | sil_lt_zero=$(echo $(perl -e "if ( $final_sil_prob < 0.0) {print 'true';} else {print 'false';}")) 34 | sil_gt_one=$(echo $(perl -e "if ( $final_sil_prob > 1.0) {print 'true';} else {print 'false';}")) 35 | 36 | if $sil_lt_zero || $sil_gt_one; then 37 | echo "$0 final-sil-prob should be between 0.0 and 1.0. Final silence was not added." 38 | exit 1; 39 | else 40 | if $sil_eq_zero; then 41 | echo "$0 final-sil-prob = 0 => Final silence was not added." 
42 | exit 0; 43 | elif $sil_eq_one; then 44 | ( echo "0 1 $silphone 0"; 45 | echo "1" ) | fstcompile > $lang/final_sil.fst 46 | else 47 | log_silprob=$(echo $(perl -e "print log $final_sil_prob")) 48 | ( echo "0 1 $silphone 0 $log_silprob"; 49 | echo "0 $log_silprob"; 50 | echo "1" ) | fstcompile > $lang/final_sil.fst 51 | fi 52 | mv $lang/L.fst $lang/L.fst.orig 53 | mv $lang/L_disambig.fst $lang/L_disambig.fst.orig 54 | fstconcat $lang/L.fst.orig $lang/final_sil.fst | fstarcsort --sort_type=olabel > $lang/L.fst 55 | fstconcat $lang/L_disambig.fst.orig $lang/final_sil.fst | fstarcsort --sort_type=olabel > $lang/L_disambig.fst 56 | echo "$final_sil_prob" > $lang/phones/final_sil_prob 57 | fi 58 | -------------------------------------------------------------------------------- /utils/lang/bpe/bidi.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright 2018 Chun-Chieh Chang 3 | 4 | # This script is largely written by Stephen Rawls 5 | # and uses the python package https://pypi.org/project/PyICU_BiDi/ 6 | # The code leaves right to left text alone and reverses left to right text. 7 | 8 | import icu_bidi 9 | import io 10 | import sys 11 | import unicodedata 12 | # R=strong right-to-left; AL=strong arabic right-to-left 13 | rtl_set = set(chr(i) for i in range(sys.maxunicode) 14 | if unicodedata.bidirectional(chr(i)) in ['R','AL']) 15 | def determine_text_direction(text): 16 | # Easy case first 17 | for char in text: 18 | if char in rtl_set: 19 | return icu_bidi.UBiDiLevel.UBIDI_RTL 20 | # If we made it here we did not encounter any strongly rtl char 21 | return icu_bidi.UBiDiLevel.UBIDI_LTR 22 | 23 | def utf8_visual_to_logical(text): 24 | text_dir = determine_text_direction(text) 25 | 26 | bidi = icu_bidi.Bidi() 27 | bidi.inverse = True 28 | bidi.reordering_mode = icu_bidi.UBiDiReorderingMode.UBIDI_REORDER_INVERSE_LIKE_DIRECT 29 | bidi.reordering_options = icu_bidi.UBiDiReorderingOption.UBIDI_OPTION_DEFAULT # icu_bidi.UBiDiReorderingOption.UBIDI_OPTION_INSERT_MARKS 30 | 31 | bidi.set_para(text, text_dir, None) 32 | 33 | res = bidi.get_reordered(0 | icu_bidi.UBidiWriteReorderedOpt.UBIDI_DO_MIRRORING | icu_bidi.UBidiWriteReorderedOpt.UBIDI_KEEP_BASE_COMBINING) 34 | 35 | return res 36 | 37 | def utf8_logical_to_visual(text): 38 | text_dir = determine_text_direction(text) 39 | 40 | bidi = icu_bidi.Bidi() 41 | 42 | bidi.reordering_mode = icu_bidi.UBiDiReorderingMode.UBIDI_REORDER_DEFAULT 43 | bidi.reordering_options = icu_bidi.UBiDiReorderingOption.UBIDI_OPTION_DEFAULT #icu_bidi.UBiDiReorderingOption.UBIDI_OPTION_INSERT_MARKS 44 | 45 | bidi.set_para(text, text_dir, None) 46 | 47 | res = bidi.get_reordered(0 | icu_bidi.UBidiWriteReorderedOpt.UBIDI_DO_MIRRORING | icu_bidi.UBidiWriteReorderedOpt.UBIDI_KEEP_BASE_COMBINING) 48 | 49 | return res 50 | 51 | 52 | ##main## 53 | sys.stdin = io.TextIOWrapper(sys.stdin.buffer, encoding="utf8") 54 | sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding="utf8") 55 | for line in sys.stdin: 56 | line = line.strip() 57 | line = utf8_logical_to_visual(line)[::-1] 58 | sys.stdout.write(line + '\n') 59 | -------------------------------------------------------------------------------- /utils/lang/bpe/prepend_words.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | # This script, prepend '|' to every words in the transcript to mark 4 | # the beginning of the words for finding the initial-space of every word 5 | # after decoding. 
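# A minimal illustrative example (the input text is hypothetical):
#   echo "hello world" | utils/lang/bpe/prepend_words.py
# prints:
#   |hello |world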
6 | 7 | import sys 8 | import io 9 | import re 10 | 11 | whitespace = re.compile("[ \t]+") 12 | infile = io.TextIOWrapper(sys.stdin.buffer, encoding='latin-1') 13 | output = io.TextIOWrapper(sys.stdout.buffer, encoding='latin-1') 14 | for line in infile: 15 | words = whitespace.split(line.strip(" \t\r\n")) 16 | output.write(' '.join([ "|"+word for word in words]) + '\n') 17 | -------------------------------------------------------------------------------- /utils/lang/bpe/reverse.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | # This script, reverse all latin and digits sequences 5 | # (including words like MP3) to put them in the right order in the images. 6 | 7 | import re, os, sys, io 8 | 9 | in_stream = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8') 10 | out_stream = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8') 11 | for line in in_stream: 12 | out_stream.write(re.sub(r'[a-zA-Z0-9][a-zA-Z0-9\s\.\:]*[a-zA-Z0-9]', 13 | lambda m:m.group(0)[::-1], line)) 14 | -------------------------------------------------------------------------------- /utils/lang/check_phones_compatible.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # Copyright 2016 Hang Lyu 3 | 4 | # Licensed udner the Apache License, Version 2.0 (the "Lincense"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 11 | # KIND, EITHER EXPRESS OF IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED 12 | # WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, 13 | # MERCHANTABLITY OR NON-INFRINGEMENT. 14 | # See the Apache 2 License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | # This script exits with status zero if the phone symbols tables are the same 18 | # except for possible differences in disambiguation symbols (meaning that all 19 | # symbols except those beginning with a # are mapped to the same values). 20 | # Otherwise it prints a warning and exits with status 1. 21 | # For the sake of compatibility with other scripts that did not write the 22 | # phones.txt to model directories, this script exits silently with status 0 23 | # if one of the phone symbol tables does not exist. 24 | 25 | . utils/parse_options.sh || exit 1; 26 | 27 | if [ $# -ne 2 ]; then 28 | echo "Usage: utils/lang/check_phones_compatible.sh " 29 | echo "e.g.: utils/lang/check_phones_compatible.sh data/lang/phones.txt exp/tri3/phones.txt" 30 | exit 1; 31 | fi 32 | 33 | table_first=$1 34 | table_second=$2 35 | 36 | # check if the files exist or not 37 | if [ ! -f $table_first ]; then 38 | if [ ! -f $table_second ]; then 39 | echo "$0: Error! Both of the two phones-symbol tables are absent." 40 | echo "Please check your command" 41 | exit 1; 42 | else 43 | # The phones-symbol-table1 is absent. The model directory maybe created by old script. 44 | # For back compatibility, this script exits silently with status 0. 45 | exit 0; 46 | fi 47 | elif [ ! -f $table_second ]; then 48 | # The phones-symbol-table2 is absent. The model directory maybe created by old script. 49 | # For back compatibility, this script exits silently with status 0. 
50 | exit 0; 51 | fi 52 | 53 | # Check if the two tables are the same (except for possible difference in disambiguation symbols). 54 | if ! cmp -s <(grep -v "^#" $table_first) <(grep -v "^#" $table_second); then 55 | echo "$0: phone symbol tables $table_first and $table_second are not compatible." 56 | exit 1; 57 | fi 58 | 59 | exit 0; 60 | -------------------------------------------------------------------------------- /utils/ln.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | use File::Spec; 3 | 4 | if ( @ARGV < 2 ) { 5 | print STDERR "usage: ln.pl input1 input2 dest-dir\n" . 6 | "This script does a soft link of input1, input2, etc." . 7 | "to dest-dir, using relative links where possible\n" . 8 | "Note: input-n and dest-dir may both be absolute pathnames,\n" . 9 | "or relative pathnames, relative to the current directlory.\n"; 10 | exit(1); 11 | } 12 | 13 | $dir = pop @ARGV; 14 | if ( ! -d $dir ) { 15 | print STDERR "ln.pl: last argument must be a directory ($dir is not a directory)\n"; 16 | exit(1); 17 | } 18 | 19 | $ans = 1; # true. 20 | 21 | $absdir = File::Spec->rel2abs($dir); # Get $dir as abs path. 22 | defined $absdir || die "No such directory $dir"; 23 | foreach $file (@ARGV) { 24 | $absfile = File::Spec->rel2abs($file); # Get $file as abs path. 25 | defined $absfile || die "No such file or directory: $file"; 26 | @absdir_split = split("/", $absdir); 27 | @absfile_split = split("/", $absfile); 28 | 29 | $newfile = $absdir . "/" . $absfile_split[$#absfile_split]; # we'll use this 30 | # as the destination in the link command. 31 | $num_removed = 0; 32 | while (@absdir_split > 0 && $absdir_split[0] eq $absfile_split[0]) { 33 | shift @absdir_split; 34 | shift @absfile_split; 35 | $num_removed++; 36 | } 37 | if (-l $newfile) { # newfile is already a link -> safe to delete it. 38 | unlink($newfile); # "unlink" just means delete. 39 | } 40 | if ($num_removed == 0) { # will use absolute pathnames. 41 | $oldfile = "/" . join("/", @absfile_split); 42 | $ret = symlink($oldfile, $newfile); 43 | } else { 44 | $num_dots = @absdir_split; 45 | $oldfile = join("/", @absfile_split); 46 | for ($n = 0; $n < $num_dots; $n++) { 47 | $oldfile = "../" . $oldfile; 48 | } 49 | $ret = symlink($oldfile, $newfile); 50 | } 51 | $ans = $ans && $ret; 52 | if (! $ret) { 53 | print STDERR "Error linking $oldfile to $newfile\n"; 54 | } 55 | } 56 | 57 | exit ($ans == 1 ? 0 : 1); 58 | 59 | -------------------------------------------------------------------------------- /utils/make_absolute.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # This script replaces the command readlink -f (which is not portable). 4 | # It turns a pathname into an absolute pathname, including following soft links. 5 | target_file=$1 6 | 7 | cd $(dirname $target_file) 8 | target_file=$(basename "$target_file") 9 | 10 | # Iterate down a (possible) chain of symlinks 11 | while [ -L "$target_file" ]; do 12 | target_file=$(readlink $target_file) 13 | cd $(dirname $target_file) 14 | target_file=$(basename $target_file) 15 | done 16 | 17 | # Compute the canonicalized name by finding the physical path 18 | # for the directory we're in and appending the target file. 
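# e.g. (illustrative; the resolved absolute prefix below is hypothetical):
#   utils/make_absolute.sh data/train   ->   /abs/path/to/egs/foo/data/train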
19 | phys_dir=$(pwd -P) 20 | result=$phys_dir/$target_file 21 | echo $result 22 | -------------------------------------------------------------------------------- /utils/make_unigram_grammar.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | # Copyright 2010-2012 Microsoft Corporation Johns Hopkins University (Author: Daniel Povey) 3 | 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 11 | # KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED 12 | # WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, 13 | # MERCHANTABLITY OR NON-INFRINGEMENT. 14 | # See the Apache 2 License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | # This script is used in discriminative training. 18 | # This script makes a simple unigram-loop version of G.fst 19 | # using a unigram grammar estimated from some training transcripts. 20 | # This is for MMI training. 21 | # We don't have any silences in G.fst; these are supplied by the 22 | # optional silences in the lexicon. 23 | 24 | # Note: the symbols in the transcripts become the input and output 25 | # symbols of G.txt; these can be numeric or not. 26 | 27 | if(@ARGV != 0) { 28 | die "Usage: make_unigram_grammar.pl < text-transcripts > G.txt" 29 | } 30 | 31 | $totcount = 0; 32 | $nl = 0; 33 | while (<>) { 34 | @A = split(" ", $_); 35 | foreach $a (@A) { 36 | $count{$a}++; 37 | $totcount++; 38 | } 39 | $nl++; 40 | $totcount++; # Treat end-of-sentence as a symbol for purposes of 41 | # $totcount, so the grammar is properly stochastic. This doesn't 42 | # become , it just becomes the final-prob. 43 | } 44 | 45 | foreach $a (keys %count) { 46 | $prob = $count{$a} / $totcount; 47 | $cost = -log($prob); # Negated natural-log probs. 48 | print "0\t0\t$a\t$a\t$cost\n"; 49 | } 50 | # Zero final-cost. 51 | $final_prob = $nl / $totcount; 52 | $final_cost = -log($final_prob); 53 | print "0\t$final_cost\n"; 54 | 55 | -------------------------------------------------------------------------------- /utils/nnet/gen_dct_mat.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Copyright 2012 Brno University of Technology (author: Karel Vesely) 4 | 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 12 | # KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED 13 | # WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, 14 | # MERCHANTABLITY OR NON-INFRINGEMENT. 15 | # See the Apache 2 License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | # ./gen_dct_mat.py 19 | # script generates matrix with DCT transform, which is sparse 20 | # and takes into account that data-layout is along frequency axis, 21 | # while DCT is done along temporal axis. 
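# Illustrative invocation (the option values are only an example):
#   utils/nnet/gen_dct_mat.py --fea-dim=23 --splice=5 --dct-basis=6 > dct.mat
# This writes a sparse (dct_basis*fea_dim) x ((2*splice+1)*fea_dim) matrix
# in Kaldi text-matrix format.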
22 | 23 | from __future__ import division 24 | from __future__ import print_function 25 | from math import * 26 | import sys 27 | 28 | 29 | from optparse import OptionParser 30 | 31 | def print_on_same_line(text): 32 | print(text, end=' ') 33 | 34 | parser = OptionParser() 35 | parser.add_option('--fea-dim', dest='dim', help='feature dimension') 36 | parser.add_option('--splice', dest='splice', help='applied splice value') 37 | parser.add_option('--dct-basis', dest='dct_basis', help='number of DCT basis') 38 | (options, args) = parser.parse_args() 39 | 40 | if(options.dim == None): 41 | parser.print_help() 42 | sys.exit(1) 43 | 44 | dim=int(options.dim) 45 | splice=int(options.splice) 46 | dct_basis=int(options.dct_basis) 47 | 48 | timeContext=2*splice+1 49 | 50 | 51 | #generate the DCT matrix 52 | M_PI = 3.1415926535897932384626433832795 53 | M_SQRT2 = 1.4142135623730950488016887 54 | 55 | 56 | #generate sparse DCT matrix 57 | print('[') 58 | for k in range(dct_basis): 59 | for m in range(dim): 60 | for n in range(timeContext): 61 | if(n==0): 62 | print_on_same_line(m*'0 ') 63 | else: 64 | print_on_same_line((dim-1)*'0 ') 65 | print_on_same_line(str(sqrt(2.0/timeContext)*cos(M_PI/timeContext*k*(n+0.5)))) 66 | if(n==timeContext-1): 67 | print_on_same_line((dim-m-1)*'0 ') 68 | print() 69 | print() 70 | 71 | print(']') 72 | 73 | -------------------------------------------------------------------------------- /utils/nnet/gen_hamm_mat.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Copyright 2012 Brno University of Technology (author: Karel Vesely) 4 | 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 12 | # KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED 13 | # WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, 14 | # MERCHANTABLITY OR NON-INFRINGEMENT. 15 | # See the Apache 2 License for the specific language governing permissions and 16 | # limitations under the License. 
17 | 18 | # ./gen_hamm_mat.py 19 | # script generates diagonal matrix with hamming window values 20 | 21 | from __future__ import division 22 | from __future__ import print_function 23 | from math import * 24 | import sys 25 | 26 | 27 | from optparse import OptionParser 28 | 29 | def print_on_same_line(text): 30 | print(text, end=' ') 31 | 32 | parser = OptionParser() 33 | parser.add_option('--fea-dim', dest='dim', help='feature dimension') 34 | parser.add_option('--splice', dest='splice', help='applied splice value') 35 | (options, args) = parser.parse_args() 36 | 37 | if(options.dim == None): 38 | parser.print_help() 39 | sys.exit(1) 40 | 41 | dim=int(options.dim) 42 | splice=int(options.splice) 43 | 44 | 45 | #generate the diagonal matrix with hammings 46 | M_2PI = 6.283185307179586476925286766559005 47 | 48 | dim_mat=(2*splice+1)*dim 49 | timeContext=2*splice+1 50 | print('[') 51 | for row in range(dim_mat): 52 | for col in range(dim_mat): 53 | if col!=row: 54 | print_on_same_line('0') 55 | else: 56 | i=int(row/dim) 57 | print_on_same_line(str(0.54 - 0.46*cos((M_2PI * i) / (timeContext-1)))) 58 | print() 59 | 60 | print(']') 61 | 62 | 63 | -------------------------------------------------------------------------------- /utils/nnet/gen_splice.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Copyright 2012 Brno University of Technology (author: Karel Vesely) 4 | 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 12 | # KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED 13 | # WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, 14 | # MERCHANTABLITY OR NON-INFRINGEMENT. 15 | # See the Apache 2 License for the specific language governing permissions and 16 | # limitations under the License. 
17 | 18 | # ./gen_splice.py 19 | # generates Component 20 | 21 | from __future__ import print_function 22 | from math import * 23 | import sys 24 | 25 | 26 | from optparse import OptionParser 27 | 28 | def print_on_same_line(text): 29 | print(text, end=' ') 30 | 31 | parser = OptionParser() 32 | parser.add_option('--fea-dim', dest='dim_in', help='feature dimension') 33 | parser.add_option('--splice', dest='splice', help='number of frames to concatenate with the central frame') 34 | parser.add_option('--splice-step', dest='splice_step', help='splicing step (frames dont need to be consecutive, --splice 3 --splice-step 2 will select offsets: -6 -4 -2 0 2 4 6)', default='1' ) 35 | (options, args) = parser.parse_args() 36 | 37 | if(options.dim_in == None): 38 | parser.print_help() 39 | sys.exit(1) 40 | 41 | dim_in=int(options.dim_in) 42 | splice=int(options.splice) 43 | splice_step=int(options.splice_step) 44 | 45 | dim_out=(2*splice+1)*dim_in 46 | 47 | print(' {0} {1}'.format(dim_out, dim_in)) 48 | print_on_same_line('[') 49 | 50 | splice_vec = list(range(-splice*splice_step, splice*splice_step+1, splice_step)) 51 | for idx in range(len(splice_vec)): 52 | print_on_same_line(splice_vec[idx]) 53 | 54 | print(']') 55 | 56 | -------------------------------------------------------------------------------- /utils/nnet/subset_data_tr_cv.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # 3 | # Copyright 2017 Brno University of Technology (Author: Karel Vesely); 4 | # Apache 2.0 5 | 6 | # This scripts splits 'data' directory into two parts: 7 | # - training set with 90% of speakers 8 | # - held-out set with 10% of speakers (cv) 9 | # (to be used in frame cross-entropy training of 'nnet1' models), 10 | 11 | # The script also accepts a list of held-out set speakers by '--cv-spk-list' 12 | # (with perturbed data, we pass the list of speakers externally). 13 | # The remaining set of speakers is the the training set. 14 | 15 | cv_spk_percent=10 16 | cv_spk_list= # To be used with perturbed data, 17 | seed=777 18 | cv_utt_percent= # ignored (compatibility), 19 | . utils/parse_options.sh 20 | 21 | if [ $# != 3 ]; then 22 | echo "Usage: $0 [opts] " 23 | echo " --cv-spk-percent N (default 10)" 24 | echo " --cv-spk-list (a pre-defined list with cv speakers)" 25 | exit 1; 26 | fi 27 | 28 | set -euo pipefail 29 | 30 | src_data=$1 31 | trn_data=$2 32 | cv_data=$3 33 | 34 | [ ! -r $src_data/spk2utt ] && echo "Missing '$src_data/spk2utt'. Error!" && exit 1 35 | 36 | tmp=$(mktemp -d /tmp/${USER}_XXXXX) 37 | 38 | if [ -z "$cv_spk_list" ]; then 39 | # Select 'cv_spk_percent' speakers randomly, 40 | cat $src_data/spk2utt | awk '{ print $1; }' | utils/shuffle_list.pl --srand $seed >$tmp/speakers 41 | n_spk=$(wc -l <$tmp/speakers) 42 | n_spk_cv=$(perl -e "print int($cv_spk_percent * $n_spk / 100); ") 43 | # 44 | head -n $n_spk_cv $tmp/speakers >$tmp/speakers_cv 45 | tail -n+$((n_spk_cv+1)) $tmp/speakers >$tmp/speakers_trn 46 | else 47 | # Use pre-defined list of speakers, 48 | cp $cv_spk_list $tmp/speakers_cv 49 | join -v2 <(sort $cv_spk_list) <(awk '{ print $1; }' <$src_data/spk2utt | sort) >$tmp/speakers_trn 50 | fi 51 | 52 | # Sanity checks, 53 | n_spk=$(wc -l <$src_data/spk2utt) 54 | echo "Speakers, src=$n_spk, trn=$(wc -l <$tmp/speakers_trn), cv=$(wc -l $tmp/speakers_cv)" 55 | overlap=$(join <(sort $tmp/speakers_trn) <(sort $tmp/speakers_cv) | wc -l) 56 | [ $overlap != 0 ] && \ 57 | echo "WARNING, speaker overlap detected!" 
&& \ 58 | join <(sort $tmp/speakers_trn) <(sort $tmp/speakers_cv) | head && \ 59 | echo '...' 60 | 61 | # Create new data dirs, 62 | utils/data/subset_data_dir.sh --spk-list $tmp/speakers_trn $src_data $trn_data 63 | utils/data/subset_data_dir.sh --spk-list $tmp/speakers_cv $src_data $cv_data 64 | 65 | -------------------------------------------------------------------------------- /utils/parallel/limit_num_gpus.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # This script functions as a wrapper of a bash command that uses GPUs. 4 | # 5 | # It sets the CUDA_VISIBLE_DEVICES variable so that it limits the number of GPUs 6 | # used for programs. It is neccesary for running a job on the grid if the job 7 | # would automatically grabs all resources available on the system, e.g. a 8 | # TensorFlow program. 9 | 10 | num_gpus=1 # this variable indicates how many GPUs we will allow the command 11 | # passed to this script will run on. We achieve this by setting the 12 | # CUDA_VISIBLE_DEVICES variable 13 | set -e 14 | 15 | if [ "$1" == "--num-gpus" ]; then 16 | num_gpus=$2 17 | shift 18 | shift 19 | fi 20 | 21 | if ! printf "%d" "$num_gpus" >/dev/null || [ $num_gpus -le -1 ]; then 22 | echo $0: Must pass a positive interger or 0 after --num-gpus 23 | echo e.g. $0 --num-gpus 2 local/tfrnnlm/run_lstm.sh 24 | exit 1 25 | fi 26 | 27 | if [ $# -eq 0 ]; then 28 | echo "Usage: $0 [--num-gpus ] [...]" 29 | echo "Runs with args after setting CUDA_VISIBLE_DEVICES to " 30 | echo "make sure exactly GPUs are visible (default: 1)." 31 | exit 1 32 | fi 33 | 34 | CUDA_VISIBLE_DEVICES= 35 | num_total_gpus=`nvidia-smi -L | wc -l` 36 | num_gpus_assigned=0 37 | 38 | if [ $num_gpus -eq 0 ] ; then 39 | echo "$0: Running the job on CPU. Disabling submitting to gpu" 40 | export CUDA_VISIBLE_DEVICES="" 41 | else 42 | for i in `seq 0 $[$num_total_gpus-1]`; do 43 | # going over all GPUs and check if it is idle, and add to the list if yes 44 | if nvidia-smi -i $i | grep "No running processes found" >/dev/null; then 45 | CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES}$i, && num_gpus_assigned=$[$num_gpus_assigned+1] 46 | fi 47 | # once we have enough GPUs, break out of the loop 48 | [ $num_gpus_assigned -eq $num_gpus ] && break 49 | done 50 | 51 | [ $num_gpus_assigned -ne $num_gpus ] && echo Could not find enough idle GPUs && exit 1 52 | 53 | export CUDA_VISIBLE_DEVICES=$(echo $CUDA_VISIBLE_DEVICES | sed "s=,$==g") 54 | 55 | echo "$0: Running the job on GPU(s) $CUDA_VISIBLE_DEVICES" 56 | fi 57 | 58 | "$@" 59 | -------------------------------------------------------------------------------- /utils/prepare_online_nnet_dist_build.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Copyright 2015 Johns Hopkins University (Author: Vijayaditya Peddinti) 4 | # Guoguo Chen 5 | # Apache 2.0 6 | # Script to prepare the distribution from the online-nnet build 7 | 8 | other_files= #other files to be included in the build 9 | other_dirs= 10 | conf_files="ivector_extractor.conf mfcc.conf online_cmvn.conf online_nnet2_decoding.conf splice.conf" 11 | ivec_extractor_files="final.dubm final.ie final.mat global_cmvn.stats online_cmvn.conf splice_opts" 12 | 13 | echo "$0 $@" # Print the command line for logging 14 | [ -f path.sh ] && . ./path.sh; 15 | . 
parse_options.sh || exit 1; 16 | 17 | if [ $# -ne 3 ]; then 18 | echo "Usage: $0 " 19 | echo "e.g.: $0 data/lang exp/nnet2_online/nnet_ms_a_online tedlium.tgz" 20 | exit 1; 21 | fi 22 | 23 | lang=$1 24 | modeldir=$2 25 | tgzfile=$3 26 | 27 | for f in $lang/phones.txt $other_files; do 28 | [ ! -f $f ] && echo "$0: no such file $f" && exit 1; 29 | done 30 | 31 | build_files= 32 | for d in $modeldir/conf $modeldir/ivector_extractor; do 33 | [ ! -d $d ] && echo "$0: no such directory $d" && exit 1; 34 | done 35 | 36 | for f in $ivec_extractor_files; do 37 | f=$modeldir/ivector_extractor/$f 38 | [ ! -f $f ] && echo "$0: no such file $f" && exit 1; 39 | build_files="$build_files $f" 40 | done 41 | 42 | # Makes a copy of the original config files, as we will change the absolute path 43 | # to relative. 44 | rm -rf $modeldir/conf_abs_path 45 | mkdir -p $modeldir/conf_abs_path 46 | cp -r $modeldir/conf/* $modeldir/conf_abs_path 47 | 48 | for f in $conf_files; do 49 | [ ! -f $modeldir/conf/$f ] && \ 50 | echo "$0: no such file $modeldir/conf/$f" && exit 1; 51 | # Changes absolute path to relative path. The path entries in the config file 52 | # are generated by scripts and it is safe to assume that they have structure: 53 | # variable=path 54 | cat $modeldir/conf_abs_path/$f | perl -e ' 55 | use File::Spec; 56 | while() { 57 | chomp; 58 | @col = split("=", $_); 59 | if (@col == 2 && (-f $col[1])) { 60 | $col[1] = File::Spec->abs2rel($col[1]); 61 | print "$col[0]=$col[1]\n"; 62 | } else { 63 | print "$_\n"; 64 | } 65 | } 66 | ' > $modeldir/conf/$f 67 | build_files="$build_files $modeldir/conf/$f" 68 | done 69 | 70 | tar -hczvf $tgzfile $lang $build_files $other_files $other_dirs \ 71 | $modeldir/final.mdl $modeldir/tree >/dev/null 72 | 73 | # Changes back to absolute path. 74 | rm -rf $modeldir/conf 75 | mv $modeldir/conf_abs_path $modeldir/conf 76 | -------------------------------------------------------------------------------- /utils/remove_data_links.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # This program searches within a directory for soft links that 4 | # appear to be created by 'create_data_link.pl' to a 'storage/' subdirectory, 5 | # and it removes both the soft links and the things they point to. 6 | # for instance, if you have a soft link 7 | # foo/egs/1.1.egs -> storage/2/1.1.egs 8 | # it will remove both foo/egs/storage/2/1.1.egs, and foo/egs/1.1.egs. 9 | 10 | ret=0 11 | 12 | dry_run=false 13 | 14 | if [ "$1" == "--dry-run" ]; then 15 | dry_run=true 16 | shift 17 | fi 18 | 19 | if [ $# == 0 ]; then 20 | echo "Usage: $0 [--dry-run] " 21 | echo "e.g.: $0 exp/nnet4a/egs/" 22 | echo " Removes from any subdirectories of the command-line arguments, soft links that " 23 | echo " appear to have been created by utils/create_data_link.pl, as well as the things" 24 | echo " that those soft links point to. Will typically be called on a directory prior" 25 | echo " to 'rm -r' on that directory, to ensure that data that was distributed on other" 26 | echo " volumes also gets deleted." 27 | echo " With --dry-run, just prints what it would do." 28 | fi 29 | 30 | for dir in $*; do 31 | if [ ! 
-d $dir ]; then 32 | echo "$0: not a directory: $dir" 33 | ret=1 34 | else 35 | for subdir in $(find $dir -type d); do 36 | if [ -d $subdir/storage ]; then 37 | for x in $(ls $subdir); do 38 | f=$subdir/$x 39 | if [ -L $f ] && [[ $(readlink $f) == storage/* ]]; then 40 | target=$subdir/$(readlink $f) 41 | if $dry_run; then 42 | echo rm $f $target 43 | else 44 | rm $f $target 45 | fi 46 | fi 47 | done 48 | fi 49 | done 50 | fi 51 | done 52 | 53 | exit $ret 54 | -------------------------------------------------------------------------------- /utils/remove_oovs.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | # Copyright 2010-2011 Microsoft Corporation 3 | 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 11 | # KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED 12 | # WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, 13 | # MERCHANTABLITY OR NON-INFRINGEMENT. 14 | # See the Apache 2 License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | # This script removes lines that contain these OOVs on either the 18 | # third or fourth fields of the line. It is intended to remove arcs 19 | # with OOVs on, from FSTs (probably compiled from ARPAs with OOVs in). 20 | 21 | if ( @ARGV < 1 && @ARGV > 2) { 22 | die "Usage: remove_oovs.pl unk_list.txt [ printed-fst ]\n"; 23 | } 24 | 25 | $unklist = shift @ARGV; 26 | open(S, "<$unklist") || die "Failed opening unknown-symbol list $unklist\n"; 27 | while(){ 28 | @A = split(" ", $_); 29 | @A == 1 || die "Bad line in unknown-symbol list: $_"; 30 | $unk{$A[0]} = 1; 31 | } 32 | 33 | $num_removed = 0; 34 | while(<>){ 35 | @A = split(" ", $_); 36 | if(defined $unk{$A[2]} || defined $unk{$A[3]}) { 37 | $num_removed++; 38 | } else { 39 | print; 40 | } 41 | } 42 | print STDERR "remove_oovs.pl: removed $num_removed lines.\n"; 43 | 44 | -------------------------------------------------------------------------------- /utils/s2eps.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | # Copyright 2010-2011 Microsoft Corporation 3 | 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 11 | # KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED 12 | # WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, 13 | # MERCHANTABLITY OR NON-INFRINGEMENT. 14 | # See the Apache 2 License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | # This script replaces and with (on both input and output sides), 18 | # for the G.fst acceptor. 19 | 20 | while(<>){ 21 | @A = split(" ", $_); 22 | if ( @A >= 4 ) { 23 | if ($A[2] eq "" || $A[2] eq "") { $A[2] = ""; } 24 | if ($A[3] eq "" || $A[3] eq "") { $A[3] = ""; } 25 | } 26 | print join("\t", @A) . 
"\n"; 27 | } 28 | -------------------------------------------------------------------------------- /utils/scoring/wer_report.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | # Copyright 2015 Johns Hopkins University (author: Jan Trmal ) 3 | 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 11 | # KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED 12 | # WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, 13 | # MERCHANTABLITY OR NON-INFRINGEMENT. 14 | # See the Apache 2 License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | 18 | # This script reads per-utt table generated for example during scoring 19 | # and outpus the WER similar to the format the compute-wer utility 20 | # or the utils/best_wer.pl produces 21 | # i.e. from table containing lines in this format 22 | # SUM raw 23344 243230 176178 46771 9975 20281 77027 16463 23 | # produces something output like this 24 | # %WER 31.67 [ 77027 / 243230, 9975 ins, 20281 del, 46771 sub ] 25 | # NB: if the STDIN stream will contain more of the SUM raw entries, 26 | # the best one will be found and printed 27 | # 28 | # If the script is called with parameters, it uses them pro provide 29 | # a description of the output 30 | # i.e. 31 | # cat per-spk-report | utils/scoring/wer_report.pl Full set 32 | # the following output will be produced 33 | # %WER 31.67 [ 77027 / 243230, 9975 ins, 20281 del, 46771 sub ] Full set 34 | 35 | 36 | while () { 37 | if ( m:SUM\s+raw:) { 38 | @F = split; 39 | if ((!defined $wer) || ($wer > $F[8])) { 40 | $corr=$F[4]; 41 | $sub=$F[5]; 42 | $ins=$F[6]; 43 | $del=$F[7]; 44 | $wer=$F[8]; 45 | $words=$F[3]; 46 | } 47 | } 48 | } 49 | 50 | if (defined $wer) { 51 | $wer_str = sprintf("%.2f", (100.0 * $wer) / $words); 52 | print "%WER $wer_str [ $wer / $words, $ins ins, $del del, $sub sub ]"; 53 | print " " . join(" ", @ARGV) if @ARGV > 0; 54 | print "\n"; 55 | } 56 | -------------------------------------------------------------------------------- /utils/show_lattice.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | format=pdf # pdf svg 4 | mode=save # display save 5 | lm_scale=0.0 6 | acoustic_scale=0.0 7 | #end of config 8 | 9 | . utils/parse_options.sh 10 | 11 | if [ $# != 3 ]; then 12 | echo "usage: $0 [--mode display|save] [--format pdf|svg] " 13 | echo "e.g.: $0 utt-0001 \"test/lat.*.gz\" tri1/graph/words.txt" 14 | exit 1; 15 | fi 16 | 17 | . ./path.sh 18 | 19 | uttid=$1 20 | lat=$2 21 | words=$3 22 | 23 | tmpdir=$(mktemp -d /tmp/kaldi.XXXX); # trap "rm -r $tmpdir" EXIT # cleanup 24 | 25 | gunzip -c $lat | lattice-to-fst --lm-scale=$lm_scale --acoustic-scale=$acoustic_scale ark:- "scp,p:echo $uttid $tmpdir/$uttid.fst|" || exit 1; 26 | ! 
-------------------------------------------------------------------------------- /utils/show_lattice.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | format=pdf # pdf svg 4 | mode=save # display save 5 | lm_scale=0.0 6 | acoustic_scale=0.0 7 | #end of config 8 | 9 | . utils/parse_options.sh 10 | 11 | if [ $# != 3 ]; then 12 | echo "usage: $0 [--mode display|save] [--format pdf|svg] <utterance-id> <lattice-ark> <word-list>" 13 | echo "e.g.: $0 utt-0001 \"test/lat.*.gz\" tri1/graph/words.txt" 14 | exit 1; 15 | fi 16 | 17 | . ./path.sh 18 | 19 | uttid=$1 20 | lat=$2 21 | words=$3 22 | 23 | tmpdir=$(mktemp -d /tmp/kaldi.XXXX); # trap "rm -r $tmpdir" EXIT # cleanup 24 | 25 | gunzip -c $lat | lattice-to-fst --lm-scale=$lm_scale --acoustic-scale=$acoustic_scale ark:- "scp,p:echo $uttid $tmpdir/$uttid.fst|" || exit 1; 26 | ! [ -s $tmpdir/$uttid.fst ] && \ 27 | echo "Failed to extract lattice for utterance $uttid (not present?)" && exit 1; 28 | fstdraw --portrait=true --osymbols=$words $tmpdir/$uttid.fst | dot -T${format} > $tmpdir/$uttid.${format} 29 | 30 | if [ "$(uname)" == "Darwin" ]; then 31 | doc_open=open 32 | elif [ "$(expr substr $(uname -s) 1 5)" == "Linux" ]; then 33 | doc_open=xdg-open 34 | elif [ $mode == "display" ] ; then 35 | echo "Cannot automatically open the file on your operating system" 36 | mode=save 37 | fi 38 | 39 | [ $mode == "display" ] && $doc_open $tmpdir/$uttid.${format} 40 | [[ $mode == "display" && $? -ne 0 ]] && echo "Failed to open ${format} format." && mode=save 41 | [ $mode == "save" ] && echo "Saving to $uttid.${format}" && cp $tmpdir/$uttid.${format} . 42 | 43 | exit 0 44 |
-------------------------------------------------------------------------------- /utils/shuffle_list.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | 3 | # Copyright 2013 Johns Hopkins University (author: Daniel Povey) 4 | 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 12 | # KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED 13 | # WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, 14 | # MERCHANTABLITY OR NON-INFRINGEMENT. 15 | # See the Apache 2 License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | 19 | if ($ARGV[0] eq "--srand") { 20 | $n = $ARGV[1]; 21 | $n =~ m/\d+/ || die "Bad argument to --srand option: \"$n\""; 22 | srand($ARGV[1]); 23 | shift; 24 | shift; 25 | } else { 26 | srand(0); # Gives inconsistent behavior if we don't seed. 27 | } 28 | 29 | if (@ARGV > 1 || $ARGV[0] =~ m/^-.+/) { # >1 args, or an option we 30 | # don't understand. 31 | print "Usage: shuffle_list.pl [--srand N] [input file] > output\n"; 32 | print "randomizes the order of lines of input.\n"; 33 | exit(1); 34 | } 35 | 36 | @lines; 37 | while (<>) { 38 | push @lines, [ (rand(), $_)] ; 39 | } 40 | 41 | @lines = sort { $a->[0] cmp $b->[0] } @lines; 42 | foreach $l (@lines) { 43 | print $l->[1]; 44 | } 45 |
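A small sketch of typical use, with hypothetical paths: drawing a reproducible random subset of speakers, which is essentially what subset_data_dir_tr_cv.sh below does internally:
# pick 10 random speakers, reproducibly (seed 777)
awk '{print $1}' data/train/spk2utt | utils/shuffle_list.pl --srand 777 | head -n 10 > heldout_speakers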
-------------------------------------------------------------------------------- /utils/spk2utt_to_utt2spk.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | # Copyright 2010-2011 Microsoft Corporation 3 | 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 11 | # KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED 12 | # WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, 13 | # MERCHANTABLITY OR NON-INFRINGEMENT. 14 | # See the Apache 2 License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | 18 | while(<>){ 19 | @A = split(" ", $_); 20 | @A > 1 || die "Invalid line in spk2utt file: $_"; 21 | $s = shift @A; 22 | foreach $u ( @A ) { 23 | print "$u $s\n"; 24 | } 25 | } 26 | 27 | 28 |
-------------------------------------------------------------------------------- /utils/subset_data_dir_tr_cv.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # 3 | # Copyright 2017 Brno University of Technology (Author: Karel Vesely); 4 | # Apache 2.0 5 | 6 | # This script splits the 'data' directory into two parts: 7 | # - training set with 90% of speakers 8 | # - held-out set with 10% of speakers (cv) 9 | # (to be used in frame cross-entropy training of 'nnet1' models), 10 | 11 | # The script also accepts a list of held-out set speakers by '--cv-spk-list' 12 | # (with perturbed data, we pass the list of speakers externally). 13 | # The remaining set of speakers is the training set. 14 | 15 | cv_spk_percent=10 16 | cv_spk_list= # To be used with perturbed data, 17 | seed=777 18 | cv_utt_percent= # ignored (compatibility), 19 | . utils/parse_options.sh 20 | 21 | if [ $# != 3 ]; then 22 | echo "Usage: $0 [opts] <src-data> <trn-data> <cv-data>" 23 | echo " --cv-spk-percent N (default 10)" 24 | echo " --cv-spk-list <file> (a pre-defined list with cv speakers)" 25 | exit 1; 26 | fi 27 | 28 | set -euo pipefail 29 | 30 | src_data=$1 31 | trn_data=$2 32 | cv_data=$3 33 | 34 | [ ! -r $src_data/spk2utt ] && echo "Missing '$src_data/spk2utt'. Error!" && exit 1 35 | 36 | tmp=$(mktemp -d /tmp/${USER}_XXXXX) 37 | 38 | if [ -z "$cv_spk_list" ]; then 39 | # Select 'cv_spk_percent' speakers randomly, 40 | cat $src_data/spk2utt | awk '{ print $1; }' | utils/shuffle_list.pl --srand $seed >$tmp/speakers 41 | n_spk=$(wc -l <$tmp/speakers) 42 | n_spk_cv=$(perl -e "print int($cv_spk_percent * $n_spk / 100); ") 43 | # 44 | head -n $n_spk_cv $tmp/speakers >$tmp/speakers_cv 45 | tail -n+$((n_spk_cv+1)) $tmp/speakers >$tmp/speakers_trn 46 | else 47 | # Use pre-defined list of speakers, 48 | cp $cv_spk_list $tmp/speakers_cv 49 | join -v2 <(sort $cv_spk_list) <(awk '{ print $1; }' <$src_data/spk2utt | sort) >$tmp/speakers_trn 50 | fi 51 | 52 | # Sanity checks, 53 | n_spk=$(wc -l <$src_data/spk2utt) 54 | echo "Speakers, src=$n_spk, trn=$(wc -l <$tmp/speakers_trn), cv=$(wc -l <$tmp/speakers_cv)" 55 | overlap=$(join <(sort $tmp/speakers_trn) <(sort $tmp/speakers_cv) | wc -l) 56 | [ $overlap != 0 ] && \ 57 | echo "WARNING, speaker overlap detected!" && \ 58 | join <(sort $tmp/speakers_trn) <(sort $tmp/speakers_cv) | head && \ 59 | echo '...' 60 | 61 | # Create new data dirs, 62 | utils/data/subset_data_dir.sh --spk-list $tmp/speakers_trn $src_data $trn_data 63 | utils/data/subset_data_dir.sh --spk-list $tmp/speakers_cv $src_data $cv_data 64 | 65 |
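A minimal invocation sketch with the conventional nnet1-style output directory names (the directory names are hypothetical):
# hold out 10% of speakers for cross-validation, keep 90% for training
utils/subset_data_dir_tr_cv.sh --cv-spk-percent 10 data/train data/train_tr90 data/train_cv10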
-------------------------------------------------------------------------------- /utils/subword/prepare_subword_text.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # 2019 Dongji Gao 4 | 5 | # This script generates subword text from word text. 6 | # For example, international -> inter@@ nation@@ al 7 | # @@ here is the separator indicating the position of a subword within a word. 8 | # A subword directly followed by the separator can only appear at the beginning or middle of a word. 9 | # Special words (e.g. noise markers) can be kept unsplit by adding them to the option "--glossaries" 10 | 11 | # Begin configuration section 12 | separator="@@" 13 | glossaries= 14 | # End configuration section 15 | 16 | . utils/parse_options.sh 17 | 18 | echo "$0 $@" 19 | 20 | if [ $# -ne 3 ]; then 21 | echo "Usage: utils/subword/prepare_subword_text.sh <word-text> <pair-code> <subword-text>" 22 | echo "e.g.: utils/subword/prepare_subword_text.sh data/train/text data/local/pair_code.txt data/train/text_subword" 23 | echo " --separator <separator> # default: @@" 24 | echo " --glossaries <glossary-words> # words reserved from subword splitting" 25 | exit 1; 26 | fi 27 | 28 | word_text=$1 29 | pair_code=$2 30 | subword_text=$3 31 | 32 | [ ! -f $word_text ] && echo "Word text $word_text does not exist." && exit 1; 33 | 34 | grep -q $separator $word_text && echo "$0: Error, word text file contains separator $separator. This might be a subword text file or you need to choose a different separator" && exit 1; 35 | 36 | glossaries_opt= 37 | [ -z "$glossaries" ] || glossaries_opt="--glossaries $glossaries" 38 | cut -d ' ' -f2- $word_text | \ 39 | utils/lang/bpe/apply_bpe.py -c $pair_code --separator $separator $glossaries_opt > ${word_text}.sub 40 | if [ $word_text == $subword_text ]; then 41 | mv $word_text ${word_text}.old 42 | cut -d ' ' -f1 ${word_text}.old | paste -d ' ' - ${word_text}.sub > $subword_text 43 | else 44 | cut -d ' ' -f1 $word_text | paste -d ' ' - ${word_text}.sub > $subword_text 45 | fi 46 | 47 | rm ${word_text}.sub 48 | echo "Subword text created." 49 |
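A sketch of the intended effect, assuming a BPE pair-code file learned beforehand (for instance with utils/lang/bpe/learn_bpe.py); the paths are hypothetical:
utils/subword/prepare_subword_text.sh --separator "@@" \
  data/train/text data/local/pair_code.txt data/train/text_subword
# a transcript line such as "utt1 international trade" might come out as
# "utt1 inter@@ nation@@ al trade", depending on the learned merges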
"-o 4 -S 50% --prune 0 5 7 7" 16 | 17 | if [ $# != 4 ]; then 18 | echo "$0 " 19 | echo "e.g. $0 train.txt words.txt wdir 4gram" 20 | exit 1 21 | fi 22 | 23 | text=$1 24 | symbol_table=$2 25 | dir=$3 26 | arpa_name=$4 27 | 28 | if ! which lmplz >& /dev/null ; then 29 | echo "$0: cannot find training tool *lmplz*." 30 | echo "tools/extras/install_kenlm_query_only.sh installs kenlm at tools/kenlm" 31 | echo "it only supports runtime mode, to actually train an arpa using KenLM," 32 | echo "you need a complete KenLM installation(depends on EIGEN and BOOST)," 33 | echo "follow KenLM's building instructions at (https://github.com/kpu/kenlm)" 34 | exit 1 35 | fi 36 | 37 | # the text should be properly pre-processed, e.g: 38 | # cleand, normalized and possibly word-segmented 39 | 40 | # get rid off irrelavent symbols 41 | grep -v '' $symbol_table \ 42 | | grep -v '#0' \ 43 | | grep -v '' | grep -v '' \ 44 | | grep -v '' | grep -v '' \ 45 | | awk '{print $1}' \ 46 | > $dir/ngram.vocab 47 | 48 | # To make sure that kenlm & kaldi have strictly the same vocabulary: 49 | # 1. feed vocabulary into kenlm via --limit_vocab_file 50 | # 2. cat vocabulary to training text, so each word at least appear once 51 | # 52 | # TL;DR reason: 53 | # Unlike SRILM's -limit-vocab, kenlm's --limit_vocab_file option 54 | # spcifies a *valid* set of vocabulary, whereas *valid but unseen* 55 | # words are discarded in final arpa. 56 | # So the trick is, 57 | # we explicitly add kaldi's vocab(one word per line) to training text, 58 | # making each word appear at least once. 59 | # kenlm never prunes unigram, 60 | # so this always generates consistent kenlm vocabuary as kaldi has. 61 | # The effect of this is like add-one smoothing to unigram counts, 62 | # shouldn't have significant impacts in practice. 63 | cat $dir/ngram.vocab $text \ 64 | | lmplz $kenlm_opts --limit_vocab_file $dir/ngram.vocab \ 65 | > $dir/${arpa_name}.arpa 66 | 67 | echo "$0: Done training arpa to: $dir/${arpa_name}.arpa" 68 | -------------------------------------------------------------------------------- /utils/utt2spk_to_spk2utt.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | # Copyright 2010-2011 Microsoft Corporation 3 | 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 11 | # KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED 12 | # WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, 13 | # MERCHANTABLITY OR NON-INFRINGEMENT. 14 | # See the Apache 2 License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | # converts an utt2spk file to a spk2utt file. 18 | # Takes input from the stdin or from a file argument; 19 | # output goes to the standard out. 
-------------------------------------------------------------------------------- /utils/utt2spk_to_spk2utt.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | # Copyright 2010-2011 Microsoft Corporation 3 | 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 11 | # KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED 12 | # WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, 13 | # MERCHANTABLITY OR NON-INFRINGEMENT. 14 | # See the Apache 2 License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | # converts an utt2spk file to a spk2utt file. 18 | # Takes input from the stdin or from a file argument; 19 | # output goes to the standard out. 20 | 21 | if ( @ARGV > 1 ) { 22 | die "Usage: utt2spk_to_spk2utt.pl [ utt2spk ] > spk2utt"; 23 | } 24 | 25 | while(<>){ 26 | @A = split(" ", $_); 27 | @A == 2 || die "Invalid line in utt2spk file: $_"; 28 | ($u,$s) = @A; 29 | if(!$seen_spk{$s}) { 30 | $seen_spk{$s} = 1; 31 | push @spklist, $s; 32 | } 33 | push (@{$spk_hash{$s}}, "$u"); 34 | } 35 | foreach $s (@spklist) { 36 | $l = join(' ',@{$spk_hash{$s}}); 37 | print "$s $l\n"; 38 | } 39 |
--------------------------------------------------------------------------------