├── asr ├── local │ ├── path.sh │ ├── cut_r1_r2.sh │ ├── text_format.pl │ ├── text_normalize.pl │ ├── aishell4_process_empty_text_speaker.py │ ├── apply_map_noid.py │ ├── apply_map_new.py │ ├── data.sh │ ├── st_cmds_data_prep.sh │ ├── aishell_data_prep.sh │ ├── aidatatang_data_prep.sh │ ├── alimeeting_process_donothing.py │ ├── text2textgrid.py │ ├── alimeeting_process_textgrid.py │ ├── alimeeting_process_textgrid_time_new.py │ ├── alimeeting_data_prep.sh │ └── aishell4_data_prep.sh ├── conf │ ├── pitch.conf │ ├── fbank.conf │ ├── decode_asr_rnn.yaml │ ├── decode_asr_rnn_noctc.yaml │ ├── decode_asr_rnn_nolm.yaml │ ├── decode_asr_rnn_onlyctc.yaml │ ├── decode_asr_transformer.yaml │ ├── pbs.conf │ ├── queue.conf │ ├── slurm.conf │ ├── train_lm_transformer.yaml │ ├── train_asr_transformer.yaml │ ├── array │ │ ├── train_asr_transformer.yaml │ │ ├── train_asr_transformer_noctc.yaml │ │ ├── train_asr_transformer_test.yaml │ │ ├── train_asr_conformer.yaml │ │ ├── train_asr_conformer_noctc.yaml │ │ ├── train_asr_transformer_rir.yaml │ │ └── train_asr_conformer_rir.yaml │ ├── train_asr_conformer_batch.yaml │ ├── train_asr_conformer.yaml │ └── train_asr_conformer_add_array.yaml ├── pyscripts │ ├── utils │ │ ├── get_yaml.py │ │ ├── make_token_list_from_config.py │ │ └── print_args.py │ └── feats │ │ └── feat-to-shape.py ├── scripts │ ├── feats │ │ ├── make_stft.sh │ │ ├── make_fbank.sh │ │ └── feat_to_shape.sh │ ├── utils │ │ ├── download_from_google_drive.sh │ │ ├── show_asr_result.sh │ │ └── perturb_data_dir_speed.sh │ └── audio │ │ └── format_wav_scp.sh ├── path.sh ├── README.md ├── run_local_conformer_near_alimeeting.sh ├── run_local_multispeaker_conformer_alimeeting.sh ├── cmd.sh └── db.sh ├── speaker ├── dscore │ ├── scorelib │ │ ├── tests │ │ │ ├── __init__.py │ │ │ ├── test_write.uem │ │ │ ├── test_load.uem │ │ │ ├── sys.rttm │ │ │ ├── ref.rttm │ │ │ ├── test_score.py │ │ │ ├── test_uem.py │ │ │ ├── test_turn.py │ │ │ └── test_metrics.py │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── rttm.cpython-36.pyc │ │ │ ├── six.cpython-36.pyc │ │ │ ├── turn.cpython-36.pyc │ │ │ ├── uem.cpython-36.pyc │ │ │ ├── metrics.cpython-36.pyc │ │ │ ├── score.cpython-36.pyc │ │ │ ├── utils.cpython-36.pyc │ │ │ ├── __init__.cpython-36.pyc │ │ │ └── argparse.cpython-36.pyc │ │ ├── argparse.py │ │ ├── utils.py │ │ ├── rttm.py │ │ └── uem.py │ ├── requirements.txt │ ├── LICENSE │ └── validate_rttm.py ├── scripts │ ├── segments_to_lab.sh │ ├── extract_feature.sh │ ├── test.sh │ ├── do_segmentation.sh │ ├── run_cluster.sh │ ├── extract_embeddings.sh │ ├── choose_first_channel.py │ └── segment_to_lab.py ├── conf │ ├── sad.conf │ └── mfcc_hires.conf ├── VBx │ ├── __init__.py │ ├── __pycache__ │ │ ├── features.cpython-36.pyc │ │ ├── kaldi_utils.cpython-36.pyc │ │ ├── VB_diarization.cpython-36.pyc │ │ └── diarization_lib.cpython-36.pyc │ ├── models │ │ ├── __pycache__ │ │ │ └── resnet.cpython-36.pyc │ │ └── resnet.py │ ├── extract.sh │ ├── free_gpu.sh │ └── kaldi_utils.py ├── requirements.txt ├── path.sh ├── cmd.sh ├── README.md ├── local │ ├── make_textgrid_rttm.py │ ├── meeting_speaker_number_process.py │ ├── meeting_statistic.py │ ├── train_sad.sh │ └── segmentation │ │ └── tuning │ │ ├── train_lstm_sad_1a.sh │ │ └── train_stats_sad_1a.sh └── run.sh ├── ISCA.png ├── .DS_Store ├── alibaba.ai ├── alibaba.png ├── fig_aishell.jpg ├── ExchangeChannal └── ExchangeChannal.py └── README.md /asr/local/path.sh: -------------------------------------------------------------------------------- 1 | 
-------------------------------------------------------------------------------- /speaker/dscore/scorelib/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /asr/conf/pitch.conf: -------------------------------------------------------------------------------- 1 | --sample-frequency=16000 2 | -------------------------------------------------------------------------------- /speaker/scripts/segments_to_lab.sh: -------------------------------------------------------------------------------- 1 | 2 | 3 | -------------------------------------------------------------------------------- /speaker/conf/sad.conf: -------------------------------------------------------------------------------- 1 | affix=_1a 2 | nnet_type=stats 3 | -------------------------------------------------------------------------------- /asr/pyscripts/utils/get_yaml.py: -------------------------------------------------------------------------------- 1 | ../../../../../utils/get_yaml.py -------------------------------------------------------------------------------- /asr/scripts/feats/make_stft.sh: -------------------------------------------------------------------------------- 1 | ../../../../../utils/make_stft.sh -------------------------------------------------------------------------------- /asr/scripts/feats/make_fbank.sh: -------------------------------------------------------------------------------- 1 | ../../../../../utils/make_fbank.sh -------------------------------------------------------------------------------- /ISCA.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yufan-aslp/AliMeeting/HEAD/ISCA.png -------------------------------------------------------------------------------- /asr/conf/fbank.conf: -------------------------------------------------------------------------------- 1 | --sample-frequency=16000 2 | --num-mel-bins=80 3 | -------------------------------------------------------------------------------- /asr/pyscripts/feats/feat-to-shape.py: -------------------------------------------------------------------------------- 1 | ../../../../../utils/feat-to-shape.py -------------------------------------------------------------------------------- /asr/scripts/feats/feat_to_shape.sh: -------------------------------------------------------------------------------- 1 | ../../../../../utils/feat_to_shape.sh -------------------------------------------------------------------------------- /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yufan-aslp/AliMeeting/HEAD/.DS_Store -------------------------------------------------------------------------------- /alibaba.ai: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yufan-aslp/AliMeeting/HEAD/alibaba.ai -------------------------------------------------------------------------------- /alibaba.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yufan-aslp/AliMeeting/HEAD/alibaba.png -------------------------------------------------------------------------------- /fig_aishell.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yufan-aslp/AliMeeting/HEAD/fig_aishell.jpg 
-------------------------------------------------------------------------------- /asr/scripts/utils/download_from_google_drive.sh: -------------------------------------------------------------------------------- 1 | ../../../../../utils/download_from_google_drive.sh -------------------------------------------------------------------------------- /speaker/dscore/scorelib/tests/test_write.uem: -------------------------------------------------------------------------------- 1 | FILE1 1 0.000 10.600 2 | FILE2 1 4.500 13.240 3 | -------------------------------------------------------------------------------- /speaker/dscore/scorelib/__init__.py: -------------------------------------------------------------------------------- 1 | """Diarization system scoring.""" 2 | __version__ = '1.1.0' 3 | -------------------------------------------------------------------------------- /speaker/dscore/requirements.txt: -------------------------------------------------------------------------------- 1 | intervaltree>=3.0.0 2 | numpy>=1.16.2 3 | scipy>=0.17.0 4 | tabulate>=0.5.0 5 | -------------------------------------------------------------------------------- /speaker/dscore/scorelib/tests/test_load.uem: -------------------------------------------------------------------------------- 1 | FILE1 1 0 15 2 | FILE1 1 25 30.4 3 | ; A comment. 4 | FILE2 1 4.5 13.24 -------------------------------------------------------------------------------- /speaker/VBx/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2018 Phonexia 2 | # Author: Jan Profant 3 | # All Rights Reserved 4 | -------------------------------------------------------------------------------- /asr/conf/decode_asr_rnn.yaml: -------------------------------------------------------------------------------- 1 | beam_size: 20 2 | penalty: 0.0 3 | maxlenratio: 0.0 4 | minlenratio: 0.0 5 | ctc_weight: 0.6 6 | lm_weight: 0.3 7 | -------------------------------------------------------------------------------- /asr/conf/decode_asr_rnn_noctc.yaml: -------------------------------------------------------------------------------- 1 | beam_size: 20 2 | penalty: 0.0 3 | maxlenratio: 0.0 4 | minlenratio: 0.0 5 | ctc_weight: 0.0 6 | lm_weight: 0.3 7 | -------------------------------------------------------------------------------- /asr/conf/decode_asr_rnn_nolm.yaml: -------------------------------------------------------------------------------- 1 | beam_size: 20 2 | penalty: 0.0 3 | maxlenratio: 0.0 4 | minlenratio: 0.0 5 | ctc_weight: 0.6 6 | lm_weight: 0.0 7 | -------------------------------------------------------------------------------- /asr/conf/decode_asr_rnn_onlyctc.yaml: -------------------------------------------------------------------------------- 1 | beam_size: 20 2 | penalty: 0.0 3 | maxlenratio: 0.0 4 | minlenratio: 0.0 5 | ctc_weight: 1.0 6 | lm_weight: 0.0 7 | -------------------------------------------------------------------------------- /asr/conf/decode_asr_transformer.yaml: -------------------------------------------------------------------------------- 1 | beam_size: 10 2 | penalty: 0.0 3 | maxlenratio: 0.0 4 | minlenratio: 0.0 5 | ctc_weight: 0.5 6 | lm_weight: 0.7 7 | -------------------------------------------------------------------------------- /speaker/VBx/__pycache__/features.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yufan-aslp/AliMeeting/HEAD/speaker/VBx/__pycache__/features.cpython-36.pyc 
-------------------------------------------------------------------------------- /speaker/VBx/__pycache__/kaldi_utils.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yufan-aslp/AliMeeting/HEAD/speaker/VBx/__pycache__/kaldi_utils.cpython-36.pyc -------------------------------------------------------------------------------- /speaker/VBx/__pycache__/VB_diarization.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yufan-aslp/AliMeeting/HEAD/speaker/VBx/__pycache__/VB_diarization.cpython-36.pyc -------------------------------------------------------------------------------- /speaker/VBx/models/__pycache__/resnet.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yufan-aslp/AliMeeting/HEAD/speaker/VBx/models/__pycache__/resnet.cpython-36.pyc -------------------------------------------------------------------------------- /speaker/VBx/__pycache__/diarization_lib.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yufan-aslp/AliMeeting/HEAD/speaker/VBx/__pycache__/diarization_lib.cpython-36.pyc -------------------------------------------------------------------------------- /speaker/dscore/scorelib/__pycache__/rttm.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yufan-aslp/AliMeeting/HEAD/speaker/dscore/scorelib/__pycache__/rttm.cpython-36.pyc -------------------------------------------------------------------------------- /speaker/dscore/scorelib/__pycache__/six.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yufan-aslp/AliMeeting/HEAD/speaker/dscore/scorelib/__pycache__/six.cpython-36.pyc -------------------------------------------------------------------------------- /speaker/dscore/scorelib/__pycache__/turn.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yufan-aslp/AliMeeting/HEAD/speaker/dscore/scorelib/__pycache__/turn.cpython-36.pyc -------------------------------------------------------------------------------- /speaker/dscore/scorelib/__pycache__/uem.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yufan-aslp/AliMeeting/HEAD/speaker/dscore/scorelib/__pycache__/uem.cpython-36.pyc -------------------------------------------------------------------------------- /speaker/dscore/scorelib/__pycache__/metrics.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yufan-aslp/AliMeeting/HEAD/speaker/dscore/scorelib/__pycache__/metrics.cpython-36.pyc -------------------------------------------------------------------------------- /speaker/dscore/scorelib/__pycache__/score.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yufan-aslp/AliMeeting/HEAD/speaker/dscore/scorelib/__pycache__/score.cpython-36.pyc -------------------------------------------------------------------------------- /speaker/dscore/scorelib/__pycache__/utils.cpython-36.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/yufan-aslp/AliMeeting/HEAD/speaker/dscore/scorelib/__pycache__/utils.cpython-36.pyc -------------------------------------------------------------------------------- /speaker/dscore/scorelib/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yufan-aslp/AliMeeting/HEAD/speaker/dscore/scorelib/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /speaker/dscore/scorelib/__pycache__/argparse.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yufan-aslp/AliMeeting/HEAD/speaker/dscore/scorelib/__pycache__/argparse.cpython-36.pyc -------------------------------------------------------------------------------- /speaker/requirements.txt: -------------------------------------------------------------------------------- 1 | numpy 2 | scipy 3 | sklearn 4 | numexpr 5 | h5py 6 | onnxruntime 7 | soundfile 8 | kaldi_io 9 | torch==1.6.0 10 | tabulate 11 | intervaltree 12 | textgrid 13 | -------------------------------------------------------------------------------- /speaker/scripts/extract_feature.sh: -------------------------------------------------------------------------------- 1 | . ./path.sh 2 | . ./cmd.sh 3 | 4 | dataset=$1 5 | mfccdir=$2 6 | nj=$3 7 | 8 | steps/make_mfcc.sh --nj $nj --cmd "$train_cmd" \ 9 | --mfcc-config conf/mfcc_hires.conf \ 10 | $dataset $dataset/make_mfcc $mfccdir 11 | 12 | -------------------------------------------------------------------------------- /asr/local/cut_r1_r2.sh: -------------------------------------------------------------------------------- 1 | dir=$1 2 | 3 | mkdir -p $dir/R1_R2/ 4 | 5 | grep "R1" $dir/wav.scp > $dir/R1_R2/wav.scp 6 | grep "R2" $dir/wav.scp >> $dir/R1_R2/wav.scp 7 | 8 | cp $dir/text $dir/R1_R2/text 9 | cp $dir/utt2spk $dir/R1_R2/utt2spk 10 | cp $dir/spk2utt $dir/R1_R2/spk2utt 11 | 12 | ./utils/fix_data_dir.sh $dir/R1_R2 13 | -------------------------------------------------------------------------------- /asr/local/text_format.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | use warnings; #sed replacement for -w perl parameter 3 | # Copyright Chao Weng 4 | 5 | # normalizations for hkust transcripts 6 | # see the docs/trans-guidelines.pdf for details 7 | 8 | while (<STDIN>) { 9 | @A = split(" ", $_); 10 | if (@A == 1) { 11 | next; 12 | } 13 | print $_ 14 | } 15 | -------------------------------------------------------------------------------- /asr/conf/pbs.conf: -------------------------------------------------------------------------------- 1 | # Default configuration 2 | command qsub -V -v PATH -S /bin/bash 3 | option name=* -N $0 4 | option mem=* -l mem=$0 5 | option mem=0 # Do not add anything to qsub_opts 6 | option num_threads=* -l ncpus=$0 7 | option num_threads=1 # Do not add anything to qsub_opts 8 | option num_nodes=* -l nodes=$0:ppn=1 9 | default gpu=0 10 | option gpu=0 11 | option gpu=* -l ngpus=$0 12 | -------------------------------------------------------------------------------- /speaker/scripts/test.sh: -------------------------------------------------------------------------------- 1 | 2 | # Set the Kaldi root below for your own environment if needed 3 | #export KALDI_ROOT=/home/work_nfs/common/kaldi-20190604-cuda10 4 | 5 |
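# Usage (added note; assumed calling convention, matching the positional reads below):
#   test.sh <work_path> <wav_dir>
# Moves each recording listed in <work_path>/wav.scp to <wav_dir>/<utt_id>.wav.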
work_path=$1 6 | wav_dir=$2 7 | while read text_file 8 | do 9 | audio=`echo $text_file | awk '{print $1}'` 10 | audio_path=`echo $text_file | awk '{print $2}'` 11 | mv $audio_path $wav_dir/${audio}.wav 12 | done < $work_path/wav.scp 13 | 14 | -------------------------------------------------------------------------------- /speaker/conf/mfcc_hires.conf: -------------------------------------------------------------------------------- 1 | # config for high-resolution MFCC features, intended for neural network training. 2 | # Note: we keep all cepstra, so it has the same info as filterbank features, 3 | # but MFCC is more easily compressible (because less correlated) which is why 4 | # we prefer this method. 5 | --use-energy=false # use average of log energy, not energy. 6 | --sample-frequency=16000 7 | --num-mel-bins=40 8 | --num-ceps=40 9 | --low-freq=40 10 | --high-freq=-400 11 | 12 | -------------------------------------------------------------------------------- /asr/conf/queue.conf: -------------------------------------------------------------------------------- 1 | # Default configuration 2 | command qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64* 3 | option name=* -N $0 4 | option mem=* -l mem_free=$0,ram_free=$0 5 | option mem=0 # Do not add anything to qsub_opts 6 | option num_threads=* -pe smp $0 7 | option num_threads=1 # Do not add anything to qsub_opts 8 | option max_jobs_run=* -tc $0 9 | option num_nodes=* -pe mpi $0 # You must set this PE as allocation_rule=1 10 | default gpu=0 11 | option gpu=0 12 | option gpu=* -l gpu=$0 -q g.q 13 | -------------------------------------------------------------------------------- /speaker/dscore/scorelib/argparse.py: -------------------------------------------------------------------------------- 1 | """Custom argument parser and action classes.""" 2 | from __future__ import absolute_import 3 | from __future__ import print_function 4 | from __future__ import unicode_literals 5 | import argparse 6 | import sys 7 | 8 | 9 | __all__ = ['ArgumentParser'] 10 | 11 | 12 | class ArgumentParser(argparse.ArgumentParser): 13 | """Sub-class of ``ArgumentParser`` that writes errors to STDERR.""" 14 | def error(self, message): 15 | sys.stderr.write('error: %s\n' % message) 16 | self.print_help() 17 | sys.exit(2) 18 | -------------------------------------------------------------------------------- /asr/conf/slurm.conf: -------------------------------------------------------------------------------- 1 | # Default configuration 2 | command sbatch --export=PATH 3 | option name=* --job-name $0 4 | option time=* --time $0 5 | option mem=* --mem-per-cpu $0 6 | option mem=0 7 | option num_threads=* --cpus-per-task $0 8 | option num_threads=1 --cpus-per-task 1 9 | option num_nodes=* --nodes $0 10 | default gpu=0 11 | option gpu=0 -p cpu 12 | option gpu=* -p gpu --gres=gpu:$0 -c $0 # Recommend allocating at least as many CPUs as GPUs 13 | # note: the --max-jobs-run option is supported as a special case 14 | # by slurm.pl and you don't have to handle it in the config file.
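# Example (added note; assumed expansion): with the command and options above, a job
# submitted with gpu=1 is launched roughly as: sbatch --export=PATH -p gpu --gres=gpu:1 -c 1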
15 | -------------------------------------------------------------------------------- /asr/conf/train_lm_transformer.yaml: -------------------------------------------------------------------------------- 1 | lm: transformer 2 | lm_conf: 3 | pos_enc: null 4 | embed_unit: 128 5 | att_unit: 512 6 | head: 8 7 | unit: 2048 8 | layer: 16 9 | dropout_rate: 0.1 10 | 11 | # optimization related 12 | grad_clip: 5.0 13 | batch_type: numel 14 | batch_bins: 2000000 15 | accum_grad: 1 16 | max_epoch: 15 # 15 epochs are enough 17 | 18 | optim: adam 19 | optim_conf: 20 | lr: 0.001 21 | scheduler: warmuplr 22 | scheduler_conf: 23 | warmup_steps: 25000 24 | 25 | best_model_criterion: 26 | - - valid 27 | - loss 28 | - min 29 | keep_nbest_models: 10 # 10 is good. 30 | -------------------------------------------------------------------------------- /speaker/dscore/scorelib/tests/sys.rttm: -------------------------------------------------------------------------------- 1 | SPEAKER FILE1 1 0.101 0.500 speaker2 2 | SPEAKER FILE1 1 0.680 1.533 speaker2 3 | SPEAKER FILE1 1 1.300 3.020 speaker1 4 | SPEAKER FILE1 1 4.698 2.425 speaker2 5 | SPEAKER FILE1 1 12.239 4.816 speaker1 6 | SPEAKER FILE1 1 17.555 0.964 speaker2 7 | SPEAKER FILE1 1 22.930 2.190 speaker2 8 | SPEAKER FILE1 1 25.734 3.211 speaker2 9 | SPEAKER FILE1 1 29.927 2.110 speaker2 10 | SPEAKER FILE1 1 32.242 2.280 speaker2 11 | SPEAKER FILE1 1 34.900 5.314 speaker1 12 | -------------------------------------------------------------------------------- /speaker/path.sh: -------------------------------------------------------------------------------- 1 | #export KALDI_ROOT=/home/work_nfs/common/kaldi-20190604-cuda10 2 | export KALDI_ROOT=/home/yf01084755/workspace/kaldi/ 3 | export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/tools/sph2pipe_v2.5:$PWD:$PATH 4 | [ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 5 | . $KALDI_ROOT/tools/config/common_path.sh 6 | export LC_ALL=C 7 | 8 | export PATH=/usr/local/cuda-10.2/bin:$PATH 9 | export LD_LIBRARY_PATH=/usr/local/cuda-10.2/lib64:$LD_LIBRARY_PATH 10 | 11 | SDT_ROOT=`pwd` 12 | export PYTHONPATH=$SDT_ROOT:$PYTHONPATH 13 | export PYTHONPATH=$SDT_ROOT/local:$PYTHONPATH 14 | export PYTHONPATH=$SDT_ROOT/scripts:$PYTHONPATH 15 | 16 | -------------------------------------------------------------------------------- /speaker/dscore/scorelib/tests/ref.rttm: -------------------------------------------------------------------------------- 1 | SPEAKER FILE1 1 0.000 0.558 speaker1 2 | SPEAKER FILE1 1 0.678 1.533 speaker2 3 | SPEAKER FILE1 1 1.100 3.228 speaker1 4 | SPEAKER FILE1 1 4.698 2.425 speaker2 5 | SPEAKER FILE1 1 12.239 2.241 speaker1 6 | SPEAKER FILE1 1 15.384 1.609 speaker1 7 | SPEAKER FILE1 1 17.555 0.964 speaker2 8 | SPEAKER FILE1 1 22.930 2.190 speaker2 9 | SPEAKER FILE1 1 25.734 3.211 speaker2 10 | SPEAKER FILE1 1 29.927 4.643 speaker2 11 | SPEAKER FILE1 1 34.235 0.387 speaker1 12 | SPEAKER FILE1 1 34.932 2.314 speaker1 13 | SPEAKER FILE1 1 35.303 5.743 speaker2 14 | -------------------------------------------------------------------------------- /speaker/scripts/do_segmentation.sh: -------------------------------------------------------------------------------- 1 | . ./path.sh 2 | .
./cmd.sh 3 | 4 | # local/segmentation/detect_speech_activity.sh --nj 4 --stage 0 --cmd run.pl \ 5 | # /home/lycheng/workspace/corecode/Python/kaldi_chime6_sad/0012_sad_v1/corpus/test_0328/ \ 6 | # exp/segmentation_1a/tdnn_stats_sad_1a/ \ 7 | # /home/lycheng/workspace/corecode/Python/kaldi_chime6_sad/0012_sad_v1/corpus/test_0328/mfcc/ \ 8 | # sad_work/test_0328/ \ 9 | # /home/lycheng/workspace/corecode/Python/kaldi_chime6_sad/0012_sad_v1/corpus/test_0328/sad/ 10 | 11 | data_dir=$1 12 | sad_work=$2 13 | sad_result=$3 14 | nj=$4 15 | 16 | echo "$data_dir" 17 | echo "$sad_work" 18 | echo "$sad_result" 19 | echo "$nj" 20 | 21 | local/segmentation/detect_speech_activity.sh --nj $nj --stage 0 --cmd run.pl \ 22 | $data_dir exp/segmentation_1a/tdnn_stats_sad_1a/ $data_dir/feat/mfcc $sad_work $sad_result 23 | 24 | -------------------------------------------------------------------------------- /speaker/scripts/run_cluster.sh: -------------------------------------------------------------------------------- 1 | 2 | . path.sh 3 | . ./cmd.sh # source cmd.sh so that $train_cmd used below is defined 4 | work_path=$1 5 | exp_path=$work_path/exp 6 | 7 | for audio in $(awk '{print $1}' $work_path/wav.scp); 8 | do 9 | filename="${audio}" 10 | echo $filename 11 | $train_cmd $exp_path/cluster_${filename}.log \ 12 | python VBx/vbhmm.py --init AHC+VB \ 13 | --out-rttm-dir $work_path/rttm \ 14 | --xvec-ark-file $work_path/embedding/${audio}.ark \ 15 | --segments-file $work_path/embedding/${audio}.seg \ 16 | --xvec-transform VBx/models/ResNet101_16kHz/transform.h5 \ 17 | --plda-file VBx/models/ResNet101_16kHz/plda \ 18 | --threshold -0.015 \ 19 | --lda-dim 128 \ 20 | --Fa 0.3 \ 21 | --Fb 17 \ 22 | --loopP 0.99 & 23 | echo "$filename job submitted" 24 | done 25 | 26 | -------------------------------------------------------------------------------- /speaker/cmd.sh: -------------------------------------------------------------------------------- 1 | # you can change cmd.sh depending on what type of queue you are using. 2 | # If you have no queueing system and want to run on a local machine, you 3 | # can change all instances of 'queue.pl' to 'run.pl' (but be careful and run 4 | # commands one by one: most recipes will exhaust the memory on your 5 | # machine). queue.pl works with GridEngine (qsub). slurm.pl works 6 | # with slurm. Different queues are configured differently, with different 7 | # queue names and different ways of specifying things like memory; 8 | # to account for these differences you can create and edit the file 9 | # conf/queue.conf to match your queue's configuration. Search for 10 | # conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, 11 | # or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl.
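# For example, on a Slurm cluster one could use instead (added suggestion; slurm.pl as named above):
#export train_cmd="slurm.pl --mem 4G"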
12 | 13 | #export train_cmd="queue.pl -q all.q --mem 4G" 14 | export train_cmd="run.pl -q all.q --mem 4G" 15 | 16 | 17 | -------------------------------------------------------------------------------- /asr/pyscripts/utils/make_token_list_from_config.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import argparse 4 | import os 5 | 6 | import yaml 7 | 8 | 9 | def get_parser(): 10 | parser = argparse.ArgumentParser( 11 | description="make a token list file (tokens.txt) from a model config YAML file", 12 | formatter_class=argparse.ArgumentDefaultsHelpFormatter, 13 | ) 14 | parser.add_argument("inyaml") 15 | return parser 16 | 17 | 18 | def main(): 19 | args = get_parser().parse_args() 20 | with open(args.inyaml, "r") as f: 21 | indict = yaml.load(f, Loader=yaml.Loader) 22 | 23 | if "token_list" not in indict: 24 | raise AttributeError("token_list is not found in config.") 25 | 26 | token_list = os.path.dirname(args.inyaml) + "/tokens.txt" 27 | with open(token_list, "w") as f: 28 | for token in indict["token_list"]: 29 | f.write(f"{token}\n") 30 | 31 | 32 | if __name__ == "__main__": 33 | main() 34 | -------------------------------------------------------------------------------- /asr/pyscripts/utils/print_args.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys 3 | 4 | 5 | def get_commandline_args(no_executable=True): 6 | extra_chars = [ 7 | " ", 8 | ";", 9 | "&", 10 | "|", 11 | "<", 12 | ">", 13 | "?", 14 | "*", 15 | "~", 16 | "`", 17 | '"', 18 | "'", 19 | "\\", 20 | "{", 21 | "}", 22 | "(", 23 | ")", 24 | ] 25 | 26 | # Escape the extra characters for shell 27 | argv = [ 28 | arg.replace("'", "'\\''") 29 | if all(char not in arg for char in extra_chars) 30 | else "'" + arg.replace("'", "'\\''") + "'" 31 | for arg in sys.argv 32 | ] 33 | 34 | if no_executable: 35 | return " ".join(argv[1:]) 36 | else: 37 | return sys.executable + " " + " ".join(argv) 38 | 39 | 40 | def main(): 41 | print(get_commandline_args()) 42 | 43 | 44 | if __name__ == "__main__": 45 | main() 46 | -------------------------------------------------------------------------------- /asr/path.sh: -------------------------------------------------------------------------------- 1 | MAIN_ROOT=/home/yf352572/workspace/espnet/ 2 | KALDI_ROOT=$MAIN_ROOT/tools/kaldi 3 | 4 | export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PATH 5 | [ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 6 | . $KALDI_ROOT/tools/config/common_path.sh 7 | export LC_ALL=C 8 | 9 | . "${MAIN_ROOT}"/tools/activate_python.sh && . "${MAIN_ROOT}"/tools/extra_path.sh 10 | export PATH=$MAIN_ROOT/utils:$MAIN_ROOT/espnet/bin:$PATH 11 | 12 | export OMP_NUM_THREADS=1 13 | 14 | # NOTE(kan-bayashi): Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C 15 | export PYTHONIOENCODING=UTF-8 16 | 17 | # You need to change or unset NCCL_SOCKET_IFNAME according to your network environment 18 | # https://docs.nvidia.com/deeplearning/sdk/nccl-developer-guide/docs/env.html#nccl-socket-ifname 19 | export NCCL_SOCKET_IFNAME="^lo,docker,virbr,vmnet,vboxnet" 20 | 21 | # NOTE(kamo): Source at the last to overwrite the setting 22 | .
local/path.sh 23 | -------------------------------------------------------------------------------- /asr/README.md: -------------------------------------------------------------------------------- 1 | # Automatic Speech Recognition (ASR) 2 | 3 | 4 | 5 | ## Usage 6 | 7 | For the ASR track, we provide two baseline systems: single-speaker and multi-speaker ASR. For single-speaker ASR, please run all steps in `./run_local_conformer_near_alimeeting.sh`, while `./run_local_multispeaker_conformer_alimeeting.sh` is used for multi-speaker ASR. 8 | 9 | 10 | **The main stages:** 11 | 12 | 1. We use the Conformer ASR model implementation in ESPnet2. Please install the latest ESPnet toolkit and copy all our files to `espnet/egs2/AliMeeting/asr`. 13 | 2. Data preparation, language model training, and ASR model training are all included in `asr_local.sh`. 14 | 3. First, please run `./run_local_conformer_near_alimeeting.sh` to train the single-speaker ASR model. Then, run `run_local_multispeaker_conformer_alimeeting.sh` to train the multi-speaker ASR model. Please note that you don’t need to repeat the data preparation procedure for the multi-speaker ASR training, since all the preparation is done during the first training. 15 | 16 | 17 | 18 | 19 | ## Reference 20 | 1. [ESPnet](https://github.com/espnet/espnet.git) 21 | 2. [VBx](https://github.com/BUTSpeechFIT/VBx) 22 | 23 | -------------------------------------------------------------------------------- /speaker/scripts/extract_embeddings.sh: -------------------------------------------------------------------------------- 1 | 2 | # Set the Kaldi root below for your own environment if needed 3 | #export KALDI_ROOT=/home/work_nfs/common/kaldi-20190604-cuda10 4 | 5 | work_path=$1 6 | #wav_path=$2 7 | exp_path=$work_path/exp 8 | 9 | mkdir -p $exp_path || exit 1; 10 | 11 | #for audio in $(awk '{print $1}' $work_path/wav.scp) 12 | while read text_file 13 | do 14 | audio=`echo $text_file | awk '{print $1}'` 15 | tmp=`echo $text_file | awk '{print $2}'` 16 | wav_path=`dirname $tmp` 17 | filename="${audio}" 18 | echo $filename > $exp_path/${filename}_wav_list.txt 19 | echo "Submitting prediction job for $filename" 20 | $train_cmd $work_path/exp/extract_embedding_${filename}.log \ 21 | python VBx/predict.py --in-file-list $exp_path/${filename}_wav_list.txt \ 22 | --in-lab-dir $work_path/vad \ 23 | --in-wav-dir $wav_path \ 24 | --out-ark-fn $work_path/embedding/${audio}.ark \ 25 | --out-seg-fn $work_path/embedding/${audio}.seg \ 26 | --weights VBx/models/ResNet101_16kHz/nnet/final.onnx \ 27 | --backend onnx & 28 | echo "${filename} submitted" 29 | done < $work_path/wav.scp 30 | echo "extract embedding finished!"
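# Note (added): the per-recording extraction jobs above run in the background ('&'); if a
# following stage needs the embeddings immediately, add a 'wait' here before relying on the
# .ark/.seg outputs.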
31 | 32 | -------------------------------------------------------------------------------- /asr/local/text_normalize.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | use warnings; #sed replacement for -w perl parameter 3 | # Copyright Chao Weng 4 | 5 | # normalizations for hkust transcripts 6 | # see the docs/trans-guidelines.pdf for details 7 | 8 | while (<STDIN>) { 9 | @A = split(" ", $_); 10 | print "$A[0] "; 11 | for ($n = 1; $n < @A; $n++) { 12 | $tmp = $A[$n]; 13 | if ($tmp =~ //) {$tmp =~ s:::g;} 14 | if ($tmp =~ /<%>/) {$tmp =~ s:<%>::g;} 15 | if ($tmp =~ /<->/) {$tmp =~ s:<->::g;} 16 | if ($tmp =~ /<\$>/) {$tmp =~ s:<\$>::g;} 17 | if ($tmp =~ /<#>/) {$tmp =~ s:<#>::g;} 18 | if ($tmp =~ /<_>/) {$tmp =~ s:<_>::g;} 19 | if ($tmp =~ //) {$tmp =~ s:::g;} 20 | if ($tmp =~ /`/) {$tmp =~ s:`::g;} 21 | if ($tmp =~ /&/) {$tmp =~ s:&::g;} 22 | if ($tmp =~ /,/) {$tmp =~ s:,::g;} 23 | if ($tmp =~ /[a-zA-Z]/) {$tmp=uc($tmp);} 24 | if ($tmp =~ /Ａ/) {$tmp =~ s:Ａ:A:g;} 25 | if ($tmp =~ /a/) {$tmp =~ s:a:A:g;} 26 | if ($tmp =~ /b/) {$tmp =~ s:b:B:g;} 27 | if ($tmp =~ /c/) {$tmp =~ s:c:C:g;} 28 | if ($tmp =~ /k/) {$tmp =~ s:k:K:g;} 29 | if ($tmp =~ /t/) {$tmp =~ s:t:T:g;} 30 | if ($tmp =~ /,/) {$tmp =~ s:,::g;} 31 | if ($tmp =~ /丶/) {$tmp =~ s:丶::g;} 32 | if ($tmp =~ /。/) {$tmp =~ s:。::g;} 33 | if ($tmp =~ /、/) {$tmp =~ s:、::g;} 34 | if ($tmp =~ /？/) {$tmp =~ s:？::g;} 35 | print "$tmp "; 36 | } 37 | print "\n"; 38 | } 39 | -------------------------------------------------------------------------------- /speaker/dscore/LICENSE: -------------------------------------------------------------------------------- 1 | BSD 2-Clause License 2 | 3 | Copyright (c) 2018, Neville Ryant 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | * Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | * Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 17 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 20 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 22 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 23 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 24 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | -------------------------------------------------------------------------------- /speaker/README.md: -------------------------------------------------------------------------------- 1 | # Speaker Diarization 2 | 3 | 4 | ## Usage 5 | 6 | For the speaker diarization track, only the first channel of the recordings is used. 7 | 8 | 9 | **The main stages:** 10 | 11 | 1. We build on the Kaldi toolkit.
Please install Kaldi and create the symlinks `ln -s /export/kaldi/utils/ utils` and `ln -s /export/kaldi/steps/ steps`. 12 | 2. Stage 1 is for data preparation and stage 2 is for voice activity detection (VAD). 13 | 3. When using the `VBx` toolkit for diarization, please convert the segments file to the `.lab` format. Use `scripts/segment_to_lab.py` to change the file format. 14 | 4. Speaker diarization consists of speaker-embedding extraction and speaker-embedding clustering. In our baseline system, the `VBx` toolkit is used to extract the speaker embeddings. 15 | 5. For speaker-embedding clustering, the code produces a hypothesis RTTM for each audio in `wav.scp`. 16 | 6. We obtain the reference RTTM from the ground-truth transcripts. 17 | 7. We use the `dscore` toolkit to compute the DER results. 18 | 19 | 20 | ## Pre-trained Model Download 21 | 22 | Download the model from this [link](https://speech-lab-share-data.oss-cn-shanghai.aliyuncs.com/AliMeeting/speaker_part.tgz). Then move the `exp` directory into our `speaker` directory and move `ResNet101_16kHz` into `speaker/VBx/models`. 23 | 24 | 25 | 26 | ## Reference 27 | 1. [kaldi-sad-model](http://kaldi-asr.org/models/m12) 28 | 2. [VBx](https://github.com/BUTSpeechFIT/VBx) 29 | 3. [dscore](https://github.com/nryant/dscore.git) 30 | 31 | -------------------------------------------------------------------------------- /speaker/VBx/extract.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # -*- coding: utf-8 -*- 3 | # 4 | # @Authors: Federico Landini 5 | # @Emails: landini@fit.vutbr.cz 6 | 7 | MODEL=$1 8 | WEIGHTS=$2 9 | WAV_DIR=$3 10 | LAB_DIR=$4 11 | FILE_LIST=$5 12 | OUT_DIR=$6 13 | DEVICE=$7 14 | 15 | EMBED_DIM=256 16 | DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" 17 | 18 | mkdir -pv $OUT_DIR 19 | 20 | TASKFILE=$OUT_DIR/xv_task 21 | rm -f $TASKFILE 22 | 23 | mkdir -p $OUT_DIR/lists $OUT_DIR/xvectors $OUT_DIR/segments 24 | while IFS= read -r line; do 25 | mkdir -p "$(dirname $OUT_DIR/lists/$line)" 26 | grep $line $FILE_LIST > $OUT_DIR/lists/$line".txt" 27 | OUT_ARK_FILE=$OUT_DIR/xvectors/$line.ark 28 | OUT_SEG_FILE=$OUT_DIR/segments/$line 29 | mkdir -p "$(dirname $OUT_ARK_FILE)" 30 | mkdir -p "$(dirname $OUT_SEG_FILE)" 31 | if [[ "$DEVICE" == "gpu" ]]; then 32 | echo "python $DIR/predict.py --seg-len 144 --seg-jump 24 --model $MODEL --weights $WEIGHTS --gpus=\$($DIR/free_gpu.sh) $MDL_WEIGHTS --ndim 64 --embed-dim $EMBED_DIM --in-file-list $OUT_DIR/lists/$line".txt" --in-lab-dir $LAB_DIR --in-wav-dir $WAV_DIR --out-ark-fn $OUT_ARK_FILE --out-seg-fn $OUT_SEG_FILE" >> $TASKFILE 33 | else 34 | echo "python $DIR/predict.py --seg-len 144 --seg-jump 24 --model $MODEL --weights $WEIGHTS --gpus= $MDL_WEIGHTS --ndim 64 --embed-dim $EMBED_DIM --in-file-list $OUT_DIR/lists/$line".txt" --in-lab-dir $LAB_DIR --in-wav-dir $WAV_DIR --out-ark-fn $OUT_ARK_FILE --out-seg-fn $OUT_SEG_FILE" >> $TASKFILE 35 | fi 36 | done < $FILE_LIST 37 | -------------------------------------------------------------------------------- /asr/conf/train_asr_transformer.yaml: -------------------------------------------------------------------------------- 1 | # network architecture 2 | # encoder related 3 | encoder: transformer 4 | encoder_conf: 5 | output_size: 256 # dimension of attention 6 | attention_heads: 4 7 | linear_units: 2048 # the number of units of position-wise feed forward 8 | num_blocks: 12 # the number of encoder blocks 9 | dropout_rate: 0.1 10 |
positional_dropout_rate: 0.1 11 | attention_dropout_rate: 0.0 12 | input_layer: conv2d # encoder architecture type 13 | normalize_before: true 14 | 15 | # decoder related 16 | decoder: transformer 17 | decoder_conf: 18 | attention_heads: 4 19 | linear_units: 2048 20 | num_blocks: 6 21 | dropout_rate: 0.1 22 | positional_dropout_rate: 0.1 23 | self_attention_dropout_rate: 0.0 24 | src_attention_dropout_rate: 0.0 25 | 26 | # ctc related 27 | ctc_conf: 28 | ignore_nan_grad: true 29 | 30 | # hybrid CTC/attention 31 | model_conf: 32 | ctc_weight: 0.3 33 | lsm_weight: 0.1 # label smoothing option 34 | length_normalized_loss: false 35 | 36 | # minibatch related 37 | num_workers: 8 38 | batch_type: folded 39 | batch_bins: 80 # reduce/increase this number according to your GPU memory 40 | 41 | # optimization related 42 | accum_grad: 4 43 | grad_clip: 5 44 | max_epoch: 50 45 | val_scheduler_criterion: 46 | - valid 47 | - acc 48 | best_model_criterion: 49 | - - valid 50 | - acc 51 | - max 52 | keep_nbest_models: 10 53 | 54 | optim: adam 55 | optim_conf: 56 | lr: 0.002 57 | scheduler: warmuplr 58 | scheduler_conf: 59 | warmup_steps: 25000 60 | -------------------------------------------------------------------------------- /speaker/scripts/choose_first_channel.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | 4 | import soundfile as sf 5 | 6 | 7 | def main(args): 8 | init_wav_scp_file = args.init_wav_scp_file 9 | output_wav_path = args.output_wav_path 10 | output_wav_scp_file = args.output_wav_scp_file 11 | 12 | input_wav_scp = dict() 13 | with open(init_wav_scp_file, "r") as fr: 14 | lines = fr.readlines() 15 | for line in lines: 16 | parts = line.strip().split() 17 | input_wav_scp[parts[0]] = parts[1] 18 | 19 | output_wav_scp = dict() 20 | for utt_id in input_wav_scp.keys(): 21 | input_utt_path = input_wav_scp[utt_id] 22 | utt, sr = sf.read(input_utt_path) 23 | print(utt.shape) 24 | print(utt.T[0].shape) 25 | 26 | output_utt_path = os.path.join(output_wav_path, utt_id + ".wav") 27 | sf.write(output_utt_path, utt.T[0], sr) 28 | output_wav_scp[utt_id] = output_utt_path 29 | 30 | with open(output_wav_scp_file, "w") as fw: 31 | for (utt_id, utt_path) in output_wav_scp.items(): 32 | fw.write(f"{utt_id} {utt_path}\n") 33 | fw.flush() 34 | 35 | 36 | if __name__ == '__main__': 37 | parser = argparse.ArgumentParser("Choose one channel from multi-channel wav") 38 | parser.add_argument("--init_wav_scp_file", required=True, help="The init wav.scp file") 39 | parser.add_argument("--output_wav_path", required=True, help="The output wav path") 40 | parser.add_argument("--output_wav_scp_file", required=True, help="The output wav.scp file") 41 | args = parser.parse_args() 42 | main(args) 43 | -------------------------------------------------------------------------------- /speaker/dscore/validate_rttm.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """Validate RTTM files. 3 | 4 | To validate RTTM files ``f1.rttm``, ``f2.rttm``, ... 5 | 6 | python validate_rttm.py f1.rttm f2.rttm ...
7 | 8 | which will for each file report the following: 9 | 10 | - the number of unique file ids found 11 | - the number of speaker ids found 12 | - each line containing an error + an error message 13 | """ 14 | from __future__ import print_function 15 | from __future__ import unicode_literals 16 | import argparse 17 | import sys 18 | 19 | from scorelib import __version__ as VERSION 20 | from scorelib.rttm import validate_rttm 21 | from scorelib.utils import error, info 22 | 23 | 24 | if __name__ == '__main__': 25 | # Parse command line arguments. 26 | parser = argparse.ArgumentParser( 27 | description='Validate RTTM files.', add_help=True, 28 | usage='%(prog)s [options] rttm_fns') 29 | parser.add_argument( 30 | 'rttm_fns', nargs='+', help='RTTM files') 31 | parser.add_argument( 32 | '--version', action='version', 33 | version='%(prog)s ' + VERSION) 34 | if len(sys.argv) == 1: 35 | parser.print_help() 36 | sys.exit(1) 37 | args = parser.parse_args() 38 | 39 | for rttm_fn in args.rttm_fns: 40 | info('Validating %s...' % rttm_fn) 41 | file_ids, speaker_ids, error_messages = validate_rttm(rttm_fn) 42 | file_ids = sorted(file_ids) 43 | info('%d file ids found: %s' % 44 | (len(file_ids), ', '.join(file_ids))) 45 | speaker_ids = sorted(speaker_ids) 46 | info('%d speaker ids found: %s' % 47 | (len(speaker_ids), ', '.join(speaker_ids))) 48 | for msg in error_messages: 49 | error(msg, file=sys.stdout) 50 | -------------------------------------------------------------------------------- /speaker/VBx/free_gpu.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | usage() { 4 | echo "Usage: $0 [-h | --help]" 5 | } 6 | 7 | help() { 8 | echo "Get the list of available GPUs. 9 | " 10 | usage 11 | echo " 12 | Options: 13 | -h --help show this message 14 | 15 | Example: 16 | \$ $0 17 | 18 | 19 | " 20 | } 21 | 22 | 23 | # Parsing optional arguments. 24 | while [ $# -ge 0 ]; do 25 | param=`echo $1 | awk -F= '{print $1}'` 26 | value=`echo $1 | awk -F= '{print $2}'` 27 | case $param in 28 | -h | --help) 29 | help 30 | exit 31 | ;; 32 | --) 33 | shift 34 | break 35 | ;; 36 | -*) 37 | usage 38 | exit 1 39 | ;; 40 | *) 41 | break 42 | esac 43 | shift 44 | done 45 | 46 | 47 | # Parsing mandatory arguments. 48 | if [ $# -ne 0 ]; then 49 | usage 50 | exit 1 51 | fi 52 | 53 | 54 | # Get the number of GPU available. If the `nvidia-smi` command fails 55 | # we set this number to 0. 56 | nvidia_smi_output=$(nvidia-smi -L) 57 | if [ ${?} -eq 0 ]; then 58 | n_gpus=$(echo "${nvidia_smi_output}" | wc -l) 59 | else 60 | n_gpus=0 61 | fi 62 | 63 | 64 | if [ ${n_gpus} -eq 0 ]; then 65 | # There is no GPU/CUDA on this machine. 66 | exit 0 67 | fi 68 | 69 | for i in $(seq 1 $n_gpus); do 70 | gpu_id=$((i-1)) 71 | status=`nvidia-smi -i ${gpu_id} | grep -c "No running processes found"` 72 | if [ "$status" = "1" ]; then 73 | # if [ !
-z ${free_gpus} ]; then 74 | # free_gpus=${free_gpus} 75 | # break 76 | # fi 77 | free_gpus=${free_gpus}${gpu_id} 78 | break 79 | fi 80 | done 81 | 82 | echo ${free_gpus} 83 | -------------------------------------------------------------------------------- /asr/local/aishell4_process_empty_text_speaker.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Process the textgrid files 4 | """ 5 | import argparse 6 | import codecs 7 | from distutils.util import strtobool 8 | from pathlib import Path 9 | import textgrid 10 | import pdb 11 | 12 | 13 | 14 | def get_args(): 15 | parser = argparse.ArgumentParser(description="process the textgrid files") 16 | parser.add_argument("--path", type=str, required=True, help="Data path") 17 | args = parser.parse_args() 18 | return args 19 | 20 | def main(args): 21 | text = codecs.open(Path(args.path) / "text_process", "r", "utf-8") 22 | utt2spk_merge = codecs.open(Path(args.path) / "utt2spk_merge_process", "r", "utf-8") 23 | 24 | text_file_new = codecs.open(Path(args.path) / "text_new", "w", "utf-8") 25 | utt2spk_file_new = codecs.open(Path(args.path) / "utt2spk_merge_new", "w", "utf-8") 26 | 27 | all_segments = [] 28 | for line1,line2 in zip(text,utt2spk_merge): 29 | uttid1 = line1.strip().split(" ")[0] 30 | uttid2 = line2.strip().split(" ")[0] 31 | assert uttid1 == uttid2 32 | context1 = line1.strip().split(" ")[1].split("$") 33 | context2 = line2.strip().split(" ")[1].split("$") 34 | assert len(context1) == len(context2) 35 | text_file_new.write("%s " % (uttid1)) 36 | utt2spk_file_new.write("%s " % (uttid2)) 37 | for i in range(len(context1)): 38 | if context1[i]!="": 39 | text_file_new.write("SRC%s" % (context1[i])) 40 | utt2spk_file_new.write("SRC%s" % (context2[i])) 41 | text_file_new.write("\n") 42 | utt2spk_file_new.write("\n") 43 | text.close() 44 | utt2spk_merge.close() 45 | utt2spk_file_new.close() 46 | text_file_new.close() 47 | 48 | if __name__ == "__main__": 49 | args = get_args() 50 | main(args) 51 | -------------------------------------------------------------------------------- /asr/local/apply_map_noid.py: -------------------------------------------------------------------------------- 1 | import codecs 2 | import sys 3 | 4 | dict = sys.argv[1] 5 | input = sys.argv[2] 6 | output = sys.argv[3] 7 | unk = sys.argv[4] 8 | warning = sys.argv[5] 9 | unit_name = sys.argv[6] 10 | 11 | map = {} 12 | units = [] 13 | 14 | with codecs.open(dict, 'r', encoding='utf-8') as f1: 15 | for line in f1: 16 | if len(line.split('\t')) > 1: 17 | word = line.split('\t')[0] 18 | tokens = line.rstrip('\n').split('\t')[1] 19 | else: 20 | word = line.split(' ')[0] 21 | tokens = line.rstrip('\n').split(' ')[1] 22 | map[word] = tokens 23 | 24 | with codecs.open(input, 'r', encoding='utf-8') as f2: 25 | with codecs.open(output, 'w', encoding='utf-8') as f3 ,codecs.open(warning, 'w', encoding='utf-8') as f4: 26 | for line in f2: 27 | if len(line.split('\t')) > 1: 28 | head = line.split('\t')[0] 29 | sentence = line.rstrip('\n').split('\t')[1].split(' ') 30 | else: 31 | head = '' 32 | sentence = line.rstrip('\n').split(' ') 33 | result = '' 34 | for word in sentence: 35 | if len(word): 36 | if word in map: 37 | result += map[word] + ' ' 38 | for unit in map[word].split(' '): 39 | if unit not in units: 40 | units.append(unit) 41 | else: 42 | f4.write(word + '\n') 43 | result += unk + ' ' 44 | f3.write(result.rstrip(' ').lstrip(' ') + '\n') 45 | 46 | list.sort(units) 47 | units.insert(0, '') 48 | with 
codecs.open(unit_name, 'w', encoding='utf-8') as f5: 49 | for i in range(len(units)): 50 | f5.write(str(units[i]) + ' ' + str(i+1)+'\n') 51 | -------------------------------------------------------------------------------- /asr/local/apply_map_new.py: -------------------------------------------------------------------------------- 1 | import codecs 2 | import sys 3 | 4 | dict = sys.argv[1] 5 | input = sys.argv[2] 6 | output = sys.argv[3] 7 | unk = sys.argv[4] 8 | warning = sys.argv[5] 9 | unit_name = sys.argv[6] 10 | 11 | map = {} 12 | units = [] 13 | 14 | with codecs.open(dict, 'r', encoding='utf-8') as f1: 15 | for line in f1: 16 | if len(line.split('\t')) > 1: 17 | word = line.split('\t')[0] 18 | tokens = line.rstrip('\n').split('\t')[1] 19 | else: 20 | word = line.split(' ')[0] 21 | tokens = line.rstrip('\n').split(' ')[1] 22 | map[word] = tokens 23 | 24 | with codecs.open(input, 'r', encoding='utf-8') as f2: 25 | with codecs.open(output, 'w', encoding='utf-8') as f3 ,codecs.open(warning, 'w', encoding='utf-8') as f4: 26 | for line in f2: 27 | if len(line.split('\t')) > 1: 28 | head = line.split('\t')[0] 29 | sentence = line.rstrip('\n').split('\t')[1].split(' ') 30 | else: 31 | head = line.split(' ')[0] 32 | sentence = line.rstrip('\n').split(' ')[1:] 33 | result = head + '\t' 34 | for word in sentence: 35 | if len(word): 36 | if word in map: 37 | result += map[word] + ' ' 38 | for unit in map[word].split(' '): 39 | if unit not in units: 40 | units.append(unit) 41 | else: 42 | f4.write(word + '\n') 43 | result += unk + ' ' 44 | f3.write(result.rstrip(' ').lstrip(' ') + '\n') 45 | 46 | list.sort(units) 47 | units.insert(0, '') 48 | with codecs.open(unit_name, 'w', encoding='utf-8') as f5: 49 | for i in range(len(units)): 50 | f5.write(str(units[i]) + ' ' + str(i+1)+'\n') 51 | -------------------------------------------------------------------------------- /asr/scripts/utils/show_asr_result.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | mindepth=0 3 | maxdepth=1 4 | 5 | . utils/parse_options.sh 6 | 7 | if [ $# -gt 1 ]; then 8 | echo "Usage: $0 --mindepth 0 --maxdepth 1 [exp]" 1>&2 9 | echo "" 10 | echo "Show the system environments and the evaluation results in Markdown format." 11 | echo 'The default of <exp> is "exp/".' 12 | exit 1 13 | fi 14 | 15 | [ -f ./path.sh ] && .
./path.sh 16 | set -euo pipefail 17 | if [ $# -eq 1 ]; then 18 | exp=$1 19 | else 20 | exp=exp 21 | fi 22 | 23 | 24 | cat << EOF 25 | 26 | # RESULTS 27 | ## Environments 28 | - date: \`$(LC_ALL=C date)\` 29 | EOF 30 | 31 | python3 << EOF 32 | import sys, espnet, torch 33 | pyversion = sys.version.replace('\n', ' ') 34 | 35 | print(f"""- python version: \`{pyversion}\` 36 | - espnet version: \`espnet {espnet.__version__}\` 37 | - pytorch version: \`pytorch {torch.__version__}\`""") 38 | EOF 39 | 40 | cat << EOF 41 | - Git hash: \`$(git rev-parse HEAD)\` 42 | - Commit date: \`$(git log -1 --format='%cd')\` 43 | 44 | EOF 45 | 46 | while IFS= read -r expdir; do 47 | if ls "${expdir}"/*/*/score_*/result.txt &> /dev/null; then 48 | echo "## $(basename ${expdir})" 49 | for type in wer cer ter; do 50 | if ls "${expdir}"/*/*/score_${type}/result.txt &> /dev/null; then 51 | cat << EOF 52 | ### ${type^^} 53 | 54 | |dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| 55 | |---|---|---|---|---|---|---|---|---| 56 | EOF 57 | grep -H -e Avg "${expdir}"/*/*/score_${type}/result.txt \ 58 | | sed -e "s#${expdir}/\([^/]*/[^/]*\)/score_${type}/result.txt:#|\1#g" \ 59 | | sed -e 's#Sum/Avg##g' | tr '|' ' ' | tr -s ' ' '|' 60 | echo 61 | fi 62 | done 63 | fi 64 | 65 | 66 | done < <(find ${exp} -mindepth ${mindepth} -maxdepth ${maxdepth} -type d) 67 | -------------------------------------------------------------------------------- /asr/conf/array/train_asr_transformer.yaml: -------------------------------------------------------------------------------- 1 | # network architecture 2 | # encoder related 3 | encoder: transformer 4 | encoder_conf: 5 | output_size: 256 # dimension of attention 6 | attention_heads: 4 7 | linear_units: 2048 # the number of units of position-wise feed forward 8 | num_blocks: 12 # the number of encoder blocks 9 | dropout_rate: 0.1 10 | positional_dropout_rate: 0.1 11 | attention_dropout_rate: 0.0 12 | input_layer: conv2d # encoder architecture type 13 | normalize_before: true 14 | 15 | # decoder related 16 | decoder: transformer 17 | decoder_conf: 18 | attention_heads: 4 19 | linear_units: 2048 20 | num_blocks: 6 21 | dropout_rate: 0.1 22 | positional_dropout_rate: 0.1 23 | self_attention_dropout_rate: 0.0 24 | src_attention_dropout_rate: 0.0 25 | 26 | # ctc related 27 | ctc_conf: 28 | ignore_nan_grad: true 29 | 30 | # hybrid CTC/attention 31 | model_conf: 32 | ctc_weight: 0.3 33 | lsm_weight: 0.1 # label smoothing option 34 | length_normalized_loss: false 35 | 36 | # minibatch related 37 | num_workers: 8 38 | batch_type: numel 39 | batch_bins: 15000000 # reduce/increase this number according to your GPU memory 40 | 41 | # optimization related 42 | accum_grad: 4 43 | grad_clip: 5 44 | max_epoch: 100 45 | val_scheduler_criterion: 46 | - valid 47 | - acc 48 | best_model_criterion: 49 | - - valid 50 | - acc 51 | - max 52 | keep_nbest_models: 10 53 | 54 | optim: adam 55 | optim_conf: 56 | lr: 0.0005 57 | scheduler: warmuplr 58 | scheduler_conf: 59 | warmup_steps: 25000 60 | 61 | specaug: specaug 62 | specaug_conf: 63 | apply_time_warp: true 64 | time_warp_window: 5 65 | time_warp_mode: bicubic 66 | apply_freq_mask: true 67 | freq_mask_width_range: 68 | - 0 69 | - 30 70 | num_freq_mask: 2 71 | apply_time_mask: true 72 | time_mask_width_range: 73 | - 0 74 | - 40 75 | num_time_mask: 2 76 | -------------------------------------------------------------------------------- /asr/conf/array/train_asr_transformer_noctc.yaml: -------------------------------------------------------------------------------- 1 | 
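# NOTE (added, assumed intent): this config matches array/train_asr_transformer.yaml except that
# ctc_weight is 0 below, i.e. attention-only training; for decoding without CTC, pair it with
# conf/decode_asr_rnn_noctc.yaml.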
# network architecture 2 | # encoder related 3 | encoder: transformer 4 | encoder_conf: 5 | output_size: 256 # dimension of attention 6 | attention_heads: 4 7 | linear_units: 2048 # the number of units of position-wise feed forward 8 | num_blocks: 12 # the number of encoder blocks 9 | dropout_rate: 0.1 10 | positional_dropout_rate: 0.1 11 | attention_dropout_rate: 0.0 12 | input_layer: conv2d # encoder architecture type 13 | normalize_before: true 14 | 15 | # decoder related 16 | decoder: transformer 17 | decoder_conf: 18 | attention_heads: 4 19 | linear_units: 2048 20 | num_blocks: 6 21 | dropout_rate: 0.1 22 | positional_dropout_rate: 0.1 23 | self_attention_dropout_rate: 0.0 24 | src_attention_dropout_rate: 0.0 25 | 26 | # ctc related 27 | ctc_conf: 28 | ignore_nan_grad: true 29 | 30 | # hybrid CTC/attention 31 | model_conf: 32 | ctc_weight: 0 33 | lsm_weight: 0.1 # label smoothing option 34 | length_normalized_loss: false 35 | 36 | # minibatch related 37 | num_workers: 8 38 | batch_type: numel 39 | batch_bins: 15000000 # reduce/increase this number according to your GPU memory 40 | 41 | # optimization related 42 | accum_grad: 4 43 | grad_clip: 5 44 | max_epoch: 100 45 | val_scheduler_criterion: 46 | - valid 47 | - acc 48 | best_model_criterion: 49 | - - valid 50 | - acc 51 | - max 52 | keep_nbest_models: 10 53 | 54 | optim: adam 55 | optim_conf: 56 | lr: 0.0005 57 | scheduler: warmuplr 58 | scheduler_conf: 59 | warmup_steps: 25000 60 | 61 | specaug: specaug 62 | specaug_conf: 63 | apply_time_warp: true 64 | time_warp_window: 5 65 | time_warp_mode: bicubic 66 | apply_freq_mask: true 67 | freq_mask_width_range: 68 | - 0 69 | - 30 70 | num_freq_mask: 2 71 | apply_time_mask: true 72 | time_mask_width_range: 73 | - 0 74 | - 40 75 | num_time_mask: 2 76 | -------------------------------------------------------------------------------- /asr/conf/array/train_asr_transformer_test.yaml: -------------------------------------------------------------------------------- 1 | # network architecture 2 | # encoder related 3 | encoder: transformer 4 | encoder_conf: 5 | output_size: 256 # dimension of attention 6 | attention_heads: 4 7 | linear_units: 2048 # the number of units of position-wise feed forward 8 | num_blocks: 12 # the number of encoder blocks 9 | dropout_rate: 0.1 10 | positional_dropout_rate: 0.1 11 | attention_dropout_rate: 0.0 12 | input_layer: conv2d # encoder architecture type 13 | normalize_before: true 14 | 15 | # decoder related 16 | decoder: transformer 17 | decoder_conf: 18 | attention_heads: 4 19 | linear_units: 2048 20 | num_blocks: 6 21 | dropout_rate: 0.1 22 | positional_dropout_rate: 0.1 23 | self_attention_dropout_rate: 0.0 24 | src_attention_dropout_rate: 0.0 25 | 26 | # ctc related 27 | ctc_conf: 28 | ignore_nan_grad: true 29 | 30 | # hybrid CTC/attention 31 | model_conf: 32 | ctc_weight: 0.3 33 | lsm_weight: 0.1 # label smoothing option 34 | length_normalized_loss: false 35 | 36 | # minibatch related 37 | num_workers: 4 38 | batch_type: numel 39 | batch_bins: 150000000 # reduce/increase this number according to your GPU memory 40 | 41 | # optimization related 42 | accum_grad: 4 43 | grad_clip: 5 44 | max_epoch: 400 45 | val_scheduler_criterion: 46 | - valid 47 | - acc 48 | best_model_criterion: 49 | - - valid 50 | - acc 51 | - max 52 | keep_nbest_models: 10 53 | 54 | optim: adam 55 | optim_conf: 56 | lr: 0.0005 57 | scheduler: warmuplr 58 | scheduler_conf: 59 | warmup_steps: 25000 60 | 61 | specaug: specaug 62 | specaug_conf: 63 | apply_time_warp: true 64 | 
time_warp_window: 5 65 | time_warp_mode: bicubic 66 | apply_freq_mask: true 67 | freq_mask_width_range: 68 | - 0 69 | - 30 70 | num_freq_mask: 2 71 | apply_time_mask: true 72 | time_mask_width_range: 73 | - 0 74 | - 40 75 | num_time_mask: 2 76 | -------------------------------------------------------------------------------- /speaker/dscore/scorelib/utils.py: -------------------------------------------------------------------------------- 1 | """Utility functions.""" 2 | from __future__ import absolute_import 3 | from __future__ import print_function 4 | from __future__ import unicode_literals 5 | import itertools 6 | import sys 7 | 8 | from . import six 9 | 10 | __all__ = ['clip', 'error', 'format_float', 'groupby', 'info', 'warn', 'xor'] 11 | 12 | 13 | def error(msg, file=sys.stderr): 14 | """Log error message ``msg`` to stderr.""" 15 | msg = 'ERROR: %s' % msg 16 | if six.PY2: 17 | msg = msg.encode('utf-8') 18 | print(msg, file=file) 19 | 20 | 21 | def info(msg, print_level=False, file=sys.stdout): 22 | """Log info message ``msg`` to stdout.""" 23 | if print_level: 24 | msg = 'INFO: %s' %msg 25 | if six.PY2: 26 | msg = msg.encode('utf-8') 27 | print(msg, file=file) 28 | 29 | 30 | def warn(msg, file=sys.stderr): 31 | """Log warning message ``msg`` to stderr.""" 32 | msg = 'WARNING: %s' %msg 33 | if six.PY2: 34 | msg = msg.encode('utf-8') 35 | print(msg, file=file) 36 | 37 | 38 | def xor(x, y): 39 | """Return truth value of ``x`` XOR ``y``.""" 40 | return bool(x) != bool(y) 41 | 42 | 43 | def format_float(x, n_digits=3): 44 | """Format floating point number for output as string. 45 | 46 | Parameters 47 | ---------- 48 | x : float 49 | Number. 50 | 51 | n_digits : int, optional 52 | Number of decimal digits to round to. 53 | (Default: 3) 54 | 55 | Returns 56 | ------- 57 | s : str 58 | Formatted string. 59 | """ 60 | fmt_str = '%%.%df' % n_digits 61 | return fmt_str % round(x, n_digits) 62 | 63 | 64 | def clip(x, lower, upper): 65 | """Clip ``x`` to [``lower``, ``upper``].""" 66 | return min(max(x, lower), upper) 67 | 68 | 69 | def groupby(iterable, keyfunc): 70 | """Wrapper around ``itertools.groupby`` which sorts data first.""" 71 | iterable = sorted(iterable, key=keyfunc) 72 | for key, group in itertools.groupby(iterable, keyfunc): 73 | yield key, group 74 | -------------------------------------------------------------------------------- /asr/run_local_conformer_near_alimeeting.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # Set bash to 'debug' mode, it will exit on : 3 | # -e 'error', -u 'undefined variable', -o ... 
'error in pipeline', -x 'print commands', 4 | set -e 5 | set -u 6 | set -o pipefail 7 | 8 | ngpu=2 9 | device=0,1 10 | 11 | stage=2 12 | stop_stage=12 13 | 14 | 15 | train_set=Train_Ali_near 16 | valid_set=Eval_Ali_near 17 | test_sets="Test_Ali_near" 18 | asr_config=conf/array/train_asr_conformer.yaml 19 | inference_config=conf/decode_asr_rnn.yaml 20 | 21 | lm_config=conf/train_lm_transformer.yaml 22 | use_lm=true 23 | use_wordlm=false 24 | 25 | 26 | ./asr_local.sh \ 27 | --device ${device} \ 28 | --ngpu ${ngpu} \ 29 | --stage ${stage} \ 30 | --stop_stage ${stop_stage} \ 31 | --asr_exp exp/asr_train_asr_conformer_raw_zh_char_data_alimeeting \ 32 | --asr_stats_dir exp/asr_stats_conformer_raw_zh_char_data_alimeeting \ 33 | --lm_exp exp/lm_train_lm_transformer_zh_char_data_alimeeting \ 34 | --lm_stats_dir exp/lm_stats_zh_char_data_alimeeting \ 35 | --lang zh \ 36 | --audio_format wav \ 37 | --feats_type raw \ 38 | --token_type char \ 39 | --use_lm ${use_lm} \ 40 | --use_word_lm ${use_wordlm} \ 41 | --lm_config "${lm_config}" \ 42 | --asr_config "${asr_config}" \ 43 | --inference_config "${inference_config}" \ 44 | --train_set "${train_set}" \ 45 | --valid_set "${valid_set}" \ 46 | --test_sets "${test_sets}" \ 47 | --asr_speech_fold_length 512 \ 48 | --asr_text_fold_length 150 \ 49 | --lm_fold_length 150 \ 50 | --lm_train_text "data/${train_set}/text" "$@" 51 | -------------------------------------------------------------------------------- /asr/local/data.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # Set bash to 'debug' mode, it will exit on : 3 | # -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands', 4 | set -e 5 | set -u 6 | set -o pipefail 7 | 8 | log() { 9 | local fname=${BASH_SOURCE[1]##*/} 10 | echo -e "$(date '+%Y-%m-%dT%H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*" 11 | } 12 | help_message=$(cat << EOF 13 | Usage: $0 14 | EOF 15 | ) 16 | SECONDS=0 17 | 18 | log "$0 $*" 19 | 20 | 21 | . ./utils/parse_options.sh 22 | 23 | . ./db.sh 24 | . ./path.sh 25 | . ./cmd.sh 26 | 27 | 28 | if [ $# -gt 1 ]; then 29 | log "${help_message}" 30 | exit 2 31 | fi 32 | 33 | # aishell data preparation 34 | #if [ -z "${AISHELL}" ]; then 35 | # log "Error: \$AISHELL is not set in db.sh." 36 | # exit 2 37 | #fi 38 | # include both training, dev, and test set 39 | #local/aishell_data_prep.sh 40 | 41 | # aidatatang data preparation 42 | #if [ -z "${AIDATATANG}" ]; then 43 | # log "Error: \$AIDATATANG is not set in db.sh." 44 | # exit 2 45 | #fi 46 | # include both training, dev, and test set 47 | #local/aidatatang_data_prep.sh 48 | 49 | # st_cmd data preparation 50 | #if [ -z "${ST_CMD}" ]; then 51 | # log "Error: \$ST_CMD is not set in db.sh." 52 | # exit 2 53 | #fi 54 | # only have training and dev set 55 | #local/st_cmds_data_prep.sh 56 | 57 | # aishell4 data preparation 58 | #if [ -z "${AISHELL4}" ]; then 59 | # log "Error: \$AISHELL4 is not set in db.sh." 60 | # exit 2 61 | #fi 62 | # only have training and test set 63 | #local/aishell4_data_prep.sh --no_overlap true 64 | 65 | # alimeeting data preparation 66 | if [ -z "${AliMeeting}" ]; then 67 | log "Error: \$AliMeeting is not set in db.sh." 
68 | exit 2 69 | fi 70 | # only have training and test set 71 | local/alimeeting_data_prep.sh --no_overlap true --tgt test 72 | local/alimeeting_data_prep.sh --no_overlap true --tgt train 73 | 74 | # combine all training set 75 | #utils/combine_data.sh data/train data/*_train 76 | 77 | # combine all dev set 78 | #utils/combine_data.sh data/test data/*_test 79 | 80 | 81 | log "Successfully finished. [elapsed=${SECONDS}s]" 82 | -------------------------------------------------------------------------------- /asr/run_local_multispeaker_conformer_alimeeting.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # Set bash to 'debug' mode, it will exit on : 3 | # -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands', 4 | set -e 5 | set -u 6 | set -o pipefail 7 | 8 | ngpu=2 9 | device=4,5 10 | #stage 1 creat both near and far 11 | stage=7 12 | stop_stage=12 13 | 14 | 15 | train_set=Train_Ali_far 16 | valid_set=Eval_Ali_far 17 | test_sets="Test_Ali_far" 18 | asr_config=conf/array/train_asr_conformer.yaml 19 | inference_config=conf/decode_asr_rnn.yaml 20 | 21 | lm_config=conf/train_lm_transformer.yaml 22 | use_lm=true 23 | use_wordlm=false 24 | ./asr_local.sh \ 25 | --device ${device} \ 26 | --ngpu ${ngpu} \ 27 | --stage ${stage} \ 28 | --stop_stage ${stop_stage} \ 29 | --asr_exp exp/asr_train_asr_multispeaker_conformer_raw_zh_char_data_alimeeting \ 30 | --asr_stats_dir exp/asr_stats_multispeaker_conformer_raw_zh_char_data_alimeeting \ 31 | --lm_exp exp/lm_train_lm_multispeaker_transformer_zh_char_data_alimeeting \ 32 | --lm_stats_dir exp/lm_stats_multispeaker_zh_char_data_alimeeting \ 33 | --lang zh \ 34 | --audio_format wav \ 35 | --feats_type raw \ 36 | --token_type char \ 37 | --use_lm ${use_lm} \ 38 | --use_word_lm ${use_wordlm} \ 39 | --lm_config "${lm_config}" \ 40 | --asr_config "${asr_config}" \ 41 | --inference_config "${inference_config}" \ 42 | --train_set "${train_set}" \ 43 | --valid_set "${valid_set}" \ 44 | --test_sets "${test_sets}" \ 45 | --asr_speech_fold_length 1024 \ 46 | --asr_text_fold_length 600 \ 47 | --lm_fold_length 600 \ 48 | --lm_train_text "data/${train_set}/text" "$@" 49 | -------------------------------------------------------------------------------- /asr/conf/train_asr_conformer_batch.yaml: -------------------------------------------------------------------------------- 1 | # network architecture 2 | # encoder related 3 | encoder: conformer 4 | encoder_conf: 5 | output_size: 256 # dimension of attention 6 | attention_heads: 4 7 | linear_units: 2048 # the number of units of position-wise feed forward 8 | num_blocks: 12 # the number of encoder blocks 9 | dropout_rate: 0.1 10 | positional_dropout_rate: 0.1 11 | attention_dropout_rate: 0.0 12 | input_layer: conv2d # encoder architecture type 13 | normalize_before: true 14 | rel_pos_type: latest 15 | pos_enc_layer_type: rel_pos 16 | selfattention_layer_type: rel_selfattn 17 | activation_type: swish 18 | macaron_style: true 19 | use_cnn_module: true 20 | cnn_module_kernel: 15 21 | 22 | # decoder related 23 | decoder: transformer 24 | decoder_conf: 25 | attention_heads: 4 26 | linear_units: 2048 27 | num_blocks: 6 28 | dropout_rate: 0.1 29 | positional_dropout_rate: 0.1 30 | self_attention_dropout_rate: 0.0 31 | src_attention_dropout_rate: 0.0 32 | 33 | # ctc related 34 | ctc_conf: 35 | ignore_nan_grad: true 36 | 37 | # hybrid CTC/attention 38 | model_conf: 39 | ctc_weight: 0.3 40 | lsm_weight: 0.1 # label smoothing option 41 | 
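# With weight w = ctc_weight, training minimizes L = w * L_ctc + (1 - w) * L_att,
# so w = 0.3 keeps the attention decoder as the dominant objective while the
# CTC branch regularizes the encoder alignments.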
length_normalized_loss: false 42 | 43 | # minibatch related 44 | num_workers: 8 45 | batch_type: folded 46 | batch_bins: 64 # reduce/increase this number according to your GPU memory 47 | 48 | # optimization related 49 | accum_grad: 4 50 | grad_clip: 5 51 | max_epoch: 50 52 | val_scheduler_criterion: 53 | - valid 54 | - acc 55 | best_model_criterion: 56 | - - valid 57 | - acc 58 | - max 59 | keep_nbest_models: 10 60 | 61 | optim: adam 62 | optim_conf: 63 | lr: 0.0005 64 | scheduler: warmuplr 65 | scheduler_conf: 66 | warmup_steps: 30000 67 | 68 | specaug: specaug 69 | specaug_conf: 70 | apply_time_warp: true 71 | time_warp_window: 5 72 | time_warp_mode: bicubic 73 | apply_freq_mask: true 74 | freq_mask_width_range: 75 | - 0 76 | - 30 77 | num_freq_mask: 2 78 | apply_time_mask: true 79 | time_mask_width_range: 80 | - 0 81 | - 40 82 | num_time_mask: 2 83 | -------------------------------------------------------------------------------- /asr/conf/array/train_asr_conformer.yaml: -------------------------------------------------------------------------------- 1 | # network architecture 2 | # encoder related 3 | encoder: conformer 4 | encoder_conf: 5 | output_size: 256 # dimension of attention 6 | attention_heads: 4 7 | linear_units: 2048 # the number of units of position-wise feed forward 8 | num_blocks: 12 # the number of encoder blocks 9 | dropout_rate: 0.1 10 | positional_dropout_rate: 0.1 11 | attention_dropout_rate: 0.0 12 | input_layer: conv2d # encoder architecture type 13 | normalize_before: true 14 | rel_pos_type: latest 15 | pos_enc_layer_type: rel_pos 16 | selfattention_layer_type: rel_selfattn 17 | activation_type: swish 18 | macaron_style: true 19 | use_cnn_module: true 20 | cnn_module_kernel: 15 21 | 22 | # decoder related 23 | decoder: transformer 24 | decoder_conf: 25 | attention_heads: 4 26 | linear_units: 2048 27 | num_blocks: 6 28 | dropout_rate: 0.1 29 | positional_dropout_rate: 0.1 30 | self_attention_dropout_rate: 0.0 31 | src_attention_dropout_rate: 0.0 32 | 33 | # ctc related 34 | ctc_conf: 35 | ignore_nan_grad: true 36 | 37 | # hybrid CTC/attention 38 | model_conf: 39 | ctc_weight: 0.3 40 | lsm_weight: 0.1 # label smoothing option 41 | length_normalized_loss: false 42 | 43 | # minibatch related 44 | num_workers: 8 45 | batch_type: numel 46 | batch_bins: 15000000 # reduce/increase this number according to your GPU memory 47 | 48 | # optimization related 49 | accum_grad: 4 50 | grad_clip: 5 51 | max_epoch: 100 52 | val_scheduler_criterion: 53 | - valid 54 | - acc 55 | best_model_criterion: 56 | - - valid 57 | - acc 58 | - max 59 | keep_nbest_models: 10 60 | 61 | optim: adam 62 | optim_conf: 63 | lr: 0.0005 64 | scheduler: warmuplr 65 | scheduler_conf: 66 | warmup_steps: 25000 67 | 68 | specaug: specaug 69 | specaug_conf: 70 | apply_time_warp: true 71 | time_warp_window: 5 72 | time_warp_mode: bicubic 73 | apply_freq_mask: true 74 | freq_mask_width_range: 75 | - 0 76 | - 30 77 | num_freq_mask: 2 78 | apply_time_mask: true 79 | time_mask_width_range: 80 | - 0 81 | - 40 82 | num_time_mask: 2 83 | -------------------------------------------------------------------------------- /asr/conf/train_asr_conformer.yaml: -------------------------------------------------------------------------------- 1 | # network architecture 2 | # encoder related 3 | encoder: conformer 4 | encoder_conf: 5 | output_size: 256 # dimension of attention 6 | attention_heads: 4 7 | linear_units: 2048 # the number of units of position-wise feed forward 8 | num_blocks: 12 # the number of encoder blocks 9 | 
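# input_layer: conv2d (below) selects the standard two-layer strided
# convolutional front end, which subsamples the input feature sequence by a
# factor of 4 in time before the self-attention blocks.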
dropout_rate: 0.1 10 | positional_dropout_rate: 0.1 11 | attention_dropout_rate: 0.0 12 | input_layer: conv2d # encoder architecture type 13 | normalize_before: true 14 | rel_pos_type: latest 15 | pos_enc_layer_type: rel_pos 16 | selfattention_layer_type: rel_selfattn 17 | activation_type: swish 18 | macaron_style: true 19 | use_cnn_module: true 20 | cnn_module_kernel: 15 21 | 22 | # decoder related 23 | decoder: transformer 24 | decoder_conf: 25 | attention_heads: 4 26 | linear_units: 2048 27 | num_blocks: 6 28 | dropout_rate: 0.1 29 | positional_dropout_rate: 0.1 30 | self_attention_dropout_rate: 0.0 31 | src_attention_dropout_rate: 0.0 32 | 33 | # ctc related 34 | ctc_conf: 35 | ignore_nan_grad: true 36 | 37 | # hybrid CTC/attention 38 | model_conf: 39 | ctc_weight: 0.3 40 | lsm_weight: 0.1 # label smoothing option 41 | length_normalized_loss: false 42 | 43 | # minibatch related 44 | num_workers: 8 45 | batch_type: numel 46 | batch_bins: 15000000 # reduce/increase this number according to your GPU memory 47 | 48 | # optimization related 49 | accum_grad: 4 50 | grad_clip: 5 51 | max_epoch: 100 52 | val_scheduler_criterion: 53 | - valid 54 | - acc 55 | best_model_criterion: 56 | - - valid 57 | - acc 58 | - max 59 | keep_nbest_models: 10 60 | 61 | optim: adam 62 | optim_conf: 63 | lr: 0.001 64 | scheduler: warmuplr 65 | scheduler_conf: 66 | warmup_steps: 20000 67 | 68 | specaug: specaug 69 | specaug_conf: 70 | apply_time_warp: true 71 | time_warp_window: 5 72 | time_warp_mode: bicubic 73 | apply_freq_mask: true 74 | freq_mask_width_range: 75 | - 0 76 | - 30 77 | num_freq_mask: 2 78 | apply_time_mask: true 79 | time_mask_width_range: 80 | - 0 81 | - 40 82 | num_time_mask: 2 83 | 84 | -------------------------------------------------------------------------------- /asr/conf/array/train_asr_conformer_noctc.yaml: -------------------------------------------------------------------------------- 1 | # network architecture 2 | # encoder related 3 | encoder: conformer 4 | encoder_conf: 5 | output_size: 256 # dimension of attention 6 | attention_heads: 4 7 | linear_units: 2048 # the number of units of position-wise feed forward 8 | num_blocks: 12 # the number of encoder blocks 9 | dropout_rate: 0.1 10 | positional_dropout_rate: 0.1 11 | attention_dropout_rate: 0.0 12 | input_layer: conv2d # encoder architecture type 13 | normalize_before: true 14 | rel_pos_type: latest 15 | pos_enc_layer_type: rel_pos 16 | selfattention_layer_type: rel_selfattn 17 | activation_type: swish 18 | macaron_style: true 19 | use_cnn_module: true 20 | cnn_module_kernel: 15 21 | 22 | # decoder related 23 | decoder: transformer 24 | decoder_conf: 25 | attention_heads: 4 26 | linear_units: 2048 27 | num_blocks: 6 28 | dropout_rate: 0.1 29 | positional_dropout_rate: 0.1 30 | self_attention_dropout_rate: 0.0 31 | src_attention_dropout_rate: 0.0 32 | 33 | # ctc related 34 | ctc_conf: 35 | ignore_nan_grad: true 36 | 37 | # hybrid CTC/attention 38 | model_conf: 39 | ctc_weight: 0.0 40 | lsm_weight: 0.1 # label smoothing option 41 | length_normalized_loss: false 42 | 43 | # minibatch related 44 | num_workers: 8 45 | batch_type: numel 46 | batch_bins: 15000000 # reduce/increase this number according to your GPU memory 47 | 48 | # optimization related 49 | accum_grad: 4 50 | grad_clip: 5 51 | max_epoch: 100 52 | val_scheduler_criterion: 53 | - valid 54 | - acc 55 | best_model_criterion: 56 | - - valid 57 | - acc 58 | - max 59 | keep_nbest_models: 10 60 | 61 | optim: adam 62 | optim_conf: 63 | lr: 0.0005 64 | scheduler: warmuplr 65 | 
scheduler_conf: 66 | warmup_steps: 25000 67 | 68 | specaug: specaug 69 | specaug_conf: 70 | apply_time_warp: true 71 | time_warp_window: 5 72 | time_warp_mode: bicubic 73 | apply_freq_mask: true 74 | freq_mask_width_range: 75 | - 0 76 | - 30 77 | num_freq_mask: 2 78 | apply_time_mask: true 79 | time_mask_width_range: 80 | - 0 81 | - 40 82 | num_time_mask: 2 83 | -------------------------------------------------------------------------------- /asr/conf/train_asr_conformer_add_array.yaml: -------------------------------------------------------------------------------- 1 | # network architecture 2 | # encoder related 3 | encoder: conformer 4 | encoder_conf: 5 | output_size: 256 # dimension of attention 6 | attention_heads: 4 7 | linear_units: 2048 # the number of units of position-wise feed forward 8 | num_blocks: 12 # the number of encoder blocks 9 | dropout_rate: 0.1 10 | positional_dropout_rate: 0.1 11 | attention_dropout_rate: 0.0 12 | input_layer: conv2d # encoder architecture type 13 | normalize_before: true 14 | rel_pos_type: latest 15 | pos_enc_layer_type: rel_pos 16 | selfattention_layer_type: rel_selfattn 17 | activation_type: swish 18 | macaron_style: true 19 | use_cnn_module: true 20 | cnn_module_kernel: 15 21 | 22 | # decoder related 23 | decoder: transformer 24 | decoder_conf: 25 | attention_heads: 4 26 | linear_units: 2048 27 | num_blocks: 6 28 | dropout_rate: 0.1 29 | positional_dropout_rate: 0.1 30 | self_attention_dropout_rate: 0.0 31 | src_attention_dropout_rate: 0.0 32 | 33 | # ctc related 34 | ctc_conf: 35 | ignore_nan_grad: true 36 | 37 | # hybrid CTC/attention 38 | model_conf: 39 | ctc_weight: 0.3 40 | lsm_weight: 0.1 # label smoothing option 41 | length_normalized_loss: false 42 | 43 | # minibatch related 44 | num_workers: 8 45 | batch_type: numel 46 | batch_bins: 15000000 # reduce/increase this number according to your GPU memory 47 | 48 | # optimization related 49 | accum_grad: 4 50 | grad_clip: 5 51 | max_epoch: 100 52 | val_scheduler_criterion: 53 | - valid 54 | - acc 55 | best_model_criterion: 56 | - - valid 57 | - acc 58 | - max 59 | keep_nbest_models: 10 60 | 61 | optim: adam 62 | optim_conf: 63 | lr: 0.001 64 | scheduler: warmuplr 65 | scheduler_conf: 66 | warmup_steps: 20000 67 | 68 | specaug: specaug 69 | specaug_conf: 70 | apply_time_warp: true 71 | time_warp_window: 5 72 | time_warp_mode: bicubic 73 | apply_freq_mask: true 74 | freq_mask_width_range: 75 | - 0 76 | - 30 77 | num_freq_mask: 2 78 | apply_time_mask: true 79 | time_mask_width_range: 80 | - 0 81 | - 40 82 | num_time_mask: 2 83 | 84 | -------------------------------------------------------------------------------- /speaker/local/make_textgrid_rttm.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import tqdm 3 | import codecs 4 | import textgrid 5 | import pdb 6 | 7 | class Segment(object): 8 | def __init__(self, uttid, spkr, stime, etime, text): 9 | self.uttid = uttid 10 | self.spkr = spkr 11 | self.stime = round(stime, 2) 12 | self.etime = round(etime, 2) 13 | self.text = text 14 | 15 | def change_stime(self, time): 16 | self.stime = time 17 | 18 | def change_etime(self, time): 19 | self.etime = time 20 | 21 | 22 | def main(args): 23 | tg = textgrid.TextGrid.fromFile(args.input_textgrid_file) 24 | segments = [] 25 | spk = {} 26 | num_spk = 1 27 | uttid = args.uttid 28 | for i in range(tg.__len__()): 29 | for j in range(tg[i].__len__()): 30 | if tg[i][j].mark: 31 | if tg[i].name not in spk: 32 | spk[tg[i].name] = num_spk 33 | num_spk += 1 
                segments.append(
                    Segment(
                        uttid,
                        spk[tg[i].name],
                        tg[i][j].minTime,
                        tg[i][j].maxTime,
                        tg[i][j].mark.strip(),
                    )
                )
    segments = sorted(segments, key=lambda x: x.stime)

    rttm_file = codecs.open(args.output_rttm_file, "w", "utf-8")

    for i in range(len(segments)):
        # Standard 10-field RTTM SPEAKER record; the <NA> placeholders fill the
        # unused ortho/stype/conf/slat columns.
        fmt = "SPEAKER {:s} 1 {:.2f} {:.2f} <NA> <NA> {:s} <NA> <NA>"
        rttm_file.write(f"{fmt.format(segments[i].uttid, float(segments[i].stime), float(segments[i].etime) - float(segments[i].stime), str(segments[i].spkr))}\n")

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="Make rttm for true label")
    parser.add_argument("--input_textgrid_file", required=True, help="The textgrid file")
    parser.add_argument("--output_rttm_file", required=True, help="The output rttm file")
    parser.add_argument("--uttid", required=True, help="The utt id of the file")
    args = parser.parse_args()
    main(args)
-------------------------------------------------------------------------------- /asr/conf/array/train_asr_transformer_rir.yaml: --------------------------------------------------------------------------------
# network architecture
# encoder related
encoder: transformer
encoder_conf:
    output_size: 256    # dimension of attention
    attention_heads: 4
    linear_units: 2048  # the number of units of position-wise feed forward
    num_blocks: 12      # the number of encoder blocks
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    attention_dropout_rate: 0.0
    input_layer: conv2d # encoder architecture type
    normalize_before: true

# decoder related
decoder: transformer
decoder_conf:
    attention_heads: 4
    linear_units: 2048
    num_blocks: 6
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    self_attention_dropout_rate: 0.0
    src_attention_dropout_rate: 0.0

# ctc related
ctc_conf:
    ignore_nan_grad: true

# hybrid CTC/attention
model_conf:
    ctc_weight: 0.3
    lsm_weight: 0.1     # label smoothing option
    length_normalized_loss: false

# minibatch related
num_workers: 8
batch_type: numel
batch_bins: 15000000  # reduce/increase this number according to your GPU memory

# optimization related
accum_grad: 4
grad_clip: 5
max_epoch: 100
val_scheduler_criterion:
    - valid
    - acc
best_model_criterion:
-   - valid
    - acc
    - max
keep_nbest_models: 10

optim: adam
optim_conf:
    lr: 0.0005
scheduler: warmuplr
scheduler_conf:
    warmup_steps: 25000

specaug: specaug
specaug_conf:
    apply_time_warp: true
    time_warp_window: 5
    time_warp_mode: bicubic
    apply_freq_mask: true
    freq_mask_width_range:
    - 0
    - 30
    num_freq_mask: 2
    apply_time_mask: true
    time_mask_width_range:
    - 0
    - 40
    num_time_mask: 2

use_preprocessor: true
rir_scp: /mnt/fyu/data/nosie_and_rir_data/rir.scp
rir_utt_prefix: "BAC,aidatatang,st_cmd"
rir_apply_prob: 0.9
noise_scp: /mnt/fyu/data/nosie_and_rir_data/noisedata_select.scp
noise_utt_prefix: "BAC,aidatatang,st_cmd"
noise_db_range: "5_20"
noise_apply_prob: 0.9
-------------------------------------------------------------------------------- /speaker/scripts/segment_to_lab.py: --------------------------------------------------------------------------------
import argparse
import os
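# Converts a Kaldi 'segments' file (lines of "<segment-id> <recording-id>
# <start> <end>", times in seconds) into one <recording-id>.lab file per
# recording, each line "<start> <end> sp" marking a speech region, plus a
# label.scp index mapping recording ids to the generated .lab files.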
3 | 4 | 5 | def read_segments_file(segments_file): 6 | utt2segments = dict() 7 | with open(segments_file, "r") as fr: 8 | lines = fr.readlines() 9 | for line in lines: 10 | parts = line.strip().split() 11 | segment_utt_id, utt_id, start, end = parts[0], parts[1], float(parts[2]), float(parts[3]) 12 | if utt_id not in utt2segments: 13 | utt2segments[utt_id] = [] 14 | utt2segments[utt_id].append((segment_utt_id, start, end)) 15 | return utt2segments 16 | 17 | 18 | def write_label(label_file, label_list): 19 | with open(label_file, "w") as fw: 20 | for (start, end) in label_list: 21 | fw.write(f"{start} {end} sp\n") 22 | fw.flush() 23 | 24 | 25 | def write_label_scp_file(label_scp_file, label_scp: dict): 26 | with open(label_scp_file, "w") as fw: 27 | for (utt_id, label_path) in label_scp.items(): 28 | fw.write(f"{utt_id} {label_path}\n") 29 | fw.flush() 30 | 31 | 32 | def main(args): 33 | input_segments = args.input_segments 34 | label_path = args.label_path 35 | output_label_scp_file = args.output_label_scp_file 36 | 37 | utt2segments = read_segments_file(input_segments) 38 | print(f"Collect {len(utt2segments)} utt2segments in file {input_segments}") 39 | 40 | result_label_scp = dict() 41 | for utt_id in utt2segments.keys(): 42 | segment_list = utt2segments[utt_id] 43 | cur_label_path = os.path.join(label_path, f"{utt_id}.lab") 44 | write_label(cur_label_path, label_list=[(i1, i2) for (_, i1, i2) in segment_list]) 45 | result_label_scp[utt_id] = cur_label_path 46 | write_label_scp_file(output_label_scp_file, result_label_scp) 47 | print(f"Write {len(result_label_scp)} labels") 48 | 49 | 50 | if __name__ == '__main__': 51 | parser = argparse.ArgumentParser(description="Make the lab file for segments") 52 | parser.add_argument("--input_segments", required=True, help="The input segments file") 53 | parser.add_argument("--label_path", required=True, help="The label_path to save file.lab") 54 | parser.add_argument("--output_label_scp_file", required=True, help="The output label.scp file") 55 | 56 | args = parser.parse_args() 57 | main(args) 58 | 59 | -------------------------------------------------------------------------------- /asr/conf/array/train_asr_conformer_rir.yaml: -------------------------------------------------------------------------------- 1 | # network architecture 2 | # encoder related 3 | encoder: conformer 4 | encoder_conf: 5 | output_size: 256 # dimension of attention 6 | attention_heads: 4 7 | linear_units: 2048 # the number of units of position-wise feed forward 8 | num_blocks: 12 # the number of encoder blocks 9 | dropout_rate: 0.1 10 | positional_dropout_rate: 0.1 11 | attention_dropout_rate: 0.0 12 | input_layer: conv2d # encoder architecture type 13 | normalize_before: true 14 | rel_pos_type: latest 15 | pos_enc_layer_type: rel_pos 16 | selfattention_layer_type: rel_selfattn 17 | activation_type: swish 18 | macaron_style: true 19 | use_cnn_module: true 20 | cnn_module_kernel: 15 21 | 22 | # decoder related 23 | decoder: transformer 24 | decoder_conf: 25 | attention_heads: 4 26 | linear_units: 2048 27 | num_blocks: 6 28 | dropout_rate: 0.1 29 | positional_dropout_rate: 0.1 30 | self_attention_dropout_rate: 0.0 31 | src_attention_dropout_rate: 0.0 32 | 33 | # ctc related 34 | ctc_conf: 35 | ignore_nan_grad: true 36 | 37 | # hybrid CTC/attention 38 | model_conf: 39 | ctc_weight: 0.3 40 | lsm_weight: 0.1 # label smoothing option 41 | length_normalized_loss: false 42 | 43 | # minibatch related 44 | num_workers: 8 45 | batch_type: numel 46 | batch_bins: 15000000 # 
reduce/increase this number according to your GPU memory

# optimization related
accum_grad: 4
grad_clip: 5
max_epoch: 100
val_scheduler_criterion:
    - valid
    - acc
best_model_criterion:
-   - valid
    - acc
    - max
keep_nbest_models: 10

optim: adam
optim_conf:
    lr: 0.0005
scheduler: warmuplr
scheduler_conf:
    warmup_steps: 25000

specaug: specaug
specaug_conf:
    apply_time_warp: true
    time_warp_window: 5
    time_warp_mode: bicubic
    apply_freq_mask: true
    freq_mask_width_range:
    - 0
    - 30
    num_freq_mask: 2
    apply_time_mask: true
    time_mask_width_range:
    - 0
    - 40
    num_time_mask: 2

use_preprocessor: true
rir_scp: /mnt/fyu/data/nosie_and_rir_data/rir.scp
rir_utt_prefix: "BAC,aidatatang,st_cmd"
rir_apply_prob: 0.9
noise_scp: /mnt/fyu/data/nosie_and_rir_data/noisedata_select.scp
noise_utt_prefix: "BAC,aidatatang,st_cmd"
noise_db_range: "5_20"
noise_apply_prob: 0.9
-------------------------------------------------------------------------------- /ExchangeChannal/ExchangeChannal.py: --------------------------------------------------------------------------------
# coding:utf-8
import os
import sys
import wave
#import wavio
import numpy as np
import math

def _trivial__enter__(self):
    return self

def _self_close__exit__(self, exc_type, exc_value, traceback):
    self.close()

# Let wave.Wave_read/Wave_write be used as context managers on old Python versions.
wave.Wave_read.__exit__ = wave.Wave_write.__exit__ = _self_close__exit__
wave.Wave_read.__enter__ = wave.Wave_write.__enter__ = _trivial__enter__

class ExchangeChannal:

    __wavInfo = {'waveRate':0, 'sampleWidth':0, 'channelCnt':0, 'nframes':0}

    def __init__(self, argv):
        print("__init__ ExchangeChannal")

    def __mergeWave(self, inputArray, outputFile):
        # Merge the single-channel files back into one multi-channel wav.
        cmdline = "sox -M"
        for inputFile in inputArray :
            cmdline += " "
            cmdline += inputFile
        cmdline += " "
        cmdline += outputFile
        ret = os.system(cmdline)

        return ret

    def __removeTmpFile(self, inputArray):
        for inputFile in inputArray:
            os.remove(inputFile)

    def __splitChannel(self, inputFile, outputFile, channalID) :
        # Extract a single channel with sox remix.
        cmdline = ' '.join(("sox", inputFile, outputFile, "remix", str(channalID)))
        ret = os.system(cmdline)

        return ret

    def __exchange(self, strInFile, outDir, totalChannal, channalArray):

        fileArray = []
        newChannalArray = []
        namearray = os.path.basename(strInFile).split('.')

        for i in range(totalChannal) :
            fileName = outDir + '/' + namearray[0] + "_ch_" + str(i+1) + ".wav"
            self.__splitChannel(strInFile, fileName, i+1)
            fileArray.append([fileName, 0])

        for i in range(len(channalArray)) :
            for j in range(totalChannal) :
                if fileArray[j][1] == 1 :
                    continue
                if (j == channalArray[i] - 1) :
                    fileArray[j][1] = 1
                    newChannalArray.append(fileArray[j][0])
                    break

        outFile = outDir + '/' + namearray[0] + '.wav'
        self.__mergeWave(newChannalArray, outFile)

        self.__removeTmpFile(newChannalArray)

        return outFile

    def process(self, inputFile, outDir, totalChannal, channalArray):
        return self.__exchange(inputFile, outDir, totalChannal, channalArray)


usage = "python ExchangeChannal.py input.wav outDir channalcnt newchannals (example: ExchangeChannal.py input.wav outDir 8 3 4 2 1 7 8 6 5)"
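# The class above shells out to sox: "sox in.wav out.wav remix 3" extracts
# channel 3 into a mono file, and "sox -M a.wav b.wav out.wav" merges mono
# files back into a multi-channel wav. Example (from the usage string):
#
#   python ExchangeChannal.py input.wav outDir 8 3 4 2 1 7 8 6 5
#
# splits an 8-channel wav, then remerges it with old channel 3 first,
# old channel 4 second, and so on.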
print_function 4 | from __future__ import unicode_literals 5 | import os 6 | 7 | import numpy as np 8 | from numpy.testing import (assert_almost_equal, assert_equal, assert_raises_regex) 9 | 10 | from scorelib.rttm import load_rttm 11 | from scorelib.score import flatten_labels, score, turns_to_frames, Scores 12 | from scorelib.turn import Turn 13 | from scorelib.uem import UEM 14 | 15 | 16 | TEST_DIR = os.path.dirname(os.path.abspath(__file__)) 17 | 18 | 19 | def test_turns_to_frames(): 20 | # Check validation. 21 | turns = [ 22 | Turn(0, 11, speaker_id='S1', file_id='FILE1'), 23 | Turn(5, 11, speaker_id='S2', file_id='FILE2'), 24 | ] 25 | with assert_raises_regex(ValueError, 'Turns should be from'): 26 | labels = turns_to_frames(turns, [(0, 1)], step=0.1) 27 | 28 | # Check that file containing no speech returns an (n_frames, 0) array. 29 | labels = turns_to_frames([], [(0, 1)], step=0.1) 30 | assert_equal(labels, np.zeros((10, 0), dtype='int64')) 31 | 32 | # Check on toy input. 33 | expected_labels = np.zeros((120, 2), dtype='int64') 34 | expected_labels[:110, 0] = 1 35 | expected_labels[50:110, 1] = 1 36 | turns = [ 37 | Turn(0, 11, speaker_id='S1', file_id='FILE1'), 38 | Turn(5, 11, speaker_id='S2', file_id='FILE1'), 39 | ] 40 | labels = turns_to_frames(turns, [(0, 12)], step=0.1) 41 | assert_equal(labels, expected_labels) 42 | 43 | 44 | def test_flatten_labels(): 45 | # No speech. 46 | assert_equal(flatten_labels(np.zeros((5, 0), dtype='int64')), 47 | np.zeros(5, dtype='int64')) 48 | assert_equal(flatten_labels(np.zeros((5, 1), dtype='int64')), 49 | np.zeros(5, dtype='int64')) 50 | assert_equal(flatten_labels(np.zeros((5, 2), dtype='int64')), 51 | np.zeros(5, dtype='int64')) 52 | 53 | # Toy input with 2 speakers. 54 | labels = np.array( 55 | [[0, 0], 56 | [1, 0], 57 | [0, 1], 58 | [1, 1]], 59 | dtype='int64') 60 | assert_equal(flatten_labels(labels), 61 | np.arange(4, dtype='int64')) 62 | 63 | 64 | def test_score(): 65 | # Some real data. 66 | expected_scores = Scores( 67 | 'FILE1', 26.39309, 33.24631, 0.71880, 0.72958, 0.72415, 0.60075, 0.58534, 68 | 0.80471, 0.72543, 0.96810, 0.55872) 69 | ref_turns, _, _ = load_rttm(os.path.join(TEST_DIR, 'ref.rttm')) 70 | sys_turns, _, _ = load_rttm(os.path.join(TEST_DIR, 'sys.rttm')) 71 | uem = UEM({'FILE1' : [(0, 43)]}) 72 | file_scores, global_scores = score(ref_turns, sys_turns, uem) 73 | assert len(file_scores) == 1 74 | assert file_scores[-1].file_id == expected_scores.file_id 75 | assert_almost_equal(file_scores[-1][1:], expected_scores[1:], 3) 76 | expected_scores = expected_scores._replace(file_id='*** OVERALL ***') 77 | assert global_scores.file_id == expected_scores.file_id 78 | assert_almost_equal(global_scores[1:], expected_scores[1:], 3) 79 | -------------------------------------------------------------------------------- /asr/local/aishell_data_prep.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # Set bash to 'debug' mode, it will exit on : 3 | # -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands', 4 | set -e 5 | set -u 6 | set -o pipefail 7 | 8 | log() { 9 | local fname=${BASH_SOURCE[1]##*/} 10 | echo -e "$(date '+%Y-%m-%dT%H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*" 11 | } 12 | help_message=$(cat << EOF 13 | Usage: $0 14 | EOF 15 | ) 16 | 17 | SECONDS=0 18 | 19 | log "$0 $*" 20 | 21 | 22 | . ./utils/parse_options.sh 23 | 24 | . ./db.sh 25 | . ./path.sh 26 | . 
./cmd.sh 27 | 28 | 29 | if [ $# -gt 0 ]; then 30 | log "${help_message}" 31 | exit 2 32 | fi 33 | 34 | if [ -z "${AISHELL}" ]; then 35 | log "Error: \$AISHELL is not set in db.sh." 36 | exit 2 37 | fi 38 | 39 | if [ ! -d "${AISHELL}" ]; then 40 | log "Error: ${AISHELL} is empty." 41 | exit 2 42 | fi 43 | 44 | # To absolute path 45 | AISHELL=$(cd ${AISHELL}; pwd) 46 | aishell_audio_dir=${AISHELL}/data_aishell/wav 47 | aishell_text=${AISHELL}/data_aishell/transcript/aishell_transcript_v0.8.txt 48 | 49 | log "Aishell Data Preparation" 50 | train_dir=data/local/aishell_train 51 | dev_dir=data/local/aishell_dev 52 | test_dir=data/local/aishell_test 53 | tmp_dir=data/local/tmp 54 | 55 | mkdir -p $train_dir 56 | mkdir -p $dev_dir 57 | mkdir -p $test_dir 58 | mkdir -p $tmp_dir 59 | 60 | # find wav audio file for train, dev and test resp. 61 | find -L $aishell_audio_dir -iname "*.wav" > $tmp_dir/wav.flist 62 | n=$(wc -l < $tmp_dir/wav.flist) 63 | [ $n -ne 141925 ] && log Warning: expected 141925 data data files, found $n 64 | 65 | grep -i "wav/train" $tmp_dir/wav.flist > $train_dir/wav.flist || exit 1; 66 | grep -i "wav/dev" $tmp_dir/wav.flist > $dev_dir/wav.flist || exit 1; 67 | grep -i "wav/test" $tmp_dir/wav.flist > $test_dir/wav.flist || exit 1; 68 | 69 | rm -r $tmp_dir 70 | 71 | # transcriptions preparation 72 | for dir in $train_dir $dev_dir $test_dir; do 73 | sed -e 's/\.wav//' $dir/wav.flist | awk -F '/' '{print $NF}' > $dir/utt.list 74 | sed -e 's/\.wav//' $dir/wav.flist | awk -F '/' '{i=NF-1;printf("%s %s\n",$NF,$i)}' > $dir/utt2spk_all 75 | paste -d' ' $dir/utt.list $dir/wav.flist > $dir/wav.scp_all 76 | utils/filter_scp.pl -f 1 $dir/utt.list $aishell_text > $dir/transcripts.txt 77 | awk '{print $1}' $dir/transcripts.txt > $dir/utt.list 78 | utils/filter_scp.pl -f 1 $dir/utt.list $dir/utt2spk_all | sort -u > $dir/utt2spk 79 | utils/filter_scp.pl -f 1 $dir/utt.list $dir/wav.scp_all | sort -u > $dir/wav.scp 80 | sort -u $dir/transcripts.txt | local/text_normalize.pl > $dir/text 81 | utils/utt2spk_to_spk2utt.pl $dir/utt2spk > $dir/spk2utt 82 | done 83 | 84 | utils/copy_data_dir.sh --utt-prefix aishell- --spk-prefix aishell- \ 85 | $train_dir data/aishell_train 86 | utils/copy_data_dir.sh --utt-prefix aishell- --spk-prefix aishell- \ 87 | $dev_dir data/aishell_dev 88 | utils/copy_data_dir.sh --utt-prefix aishell- --spk-prefix aishell- \ 89 | $test_dir data/aishell_test 90 | 91 | # remove space in text 92 | for x in aishell_train aishell_dev aishell_test; do 93 | cp data/${x}/text data/${x}/text.org 94 | paste -d " " <(cut -f 1 -d" " data/${x}/text.org) <(cut -f 2- -d" " data/${x}/text.org | tr -d " ") \ 95 | > data/${x}/text 96 | rm data/${x}/text.org 97 | done 98 | -------------------------------------------------------------------------------- /asr/local/aidatatang_data_prep.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # Set bash to 'debug' mode, it will exit on : 3 | # -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands', 4 | set -e 5 | set -u 6 | set -o pipefail 7 | 8 | log() { 9 | local fname=${BASH_SOURCE[1]##*/} 10 | echo -e "$(date '+%Y-%m-%dT%H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*" 11 | } 12 | help_message=$(cat << EOF 13 | Usage: $0 14 | EOF 15 | ) 16 | 17 | SECONDS=0 18 | 19 | log "$0 $*" 20 | 21 | 22 | . ./utils/parse_options.sh 23 | 24 | . ./db.sh 25 | . ./path.sh 26 | . 
./cmd.sh 27 | 28 | 29 | if [ $# -gt 0 ]; then 30 | log "${help_message}" 31 | exit 2 32 | fi 33 | 34 | if [ -z "${AIDATATANG}" ]; then 35 | log "Error: \$AIDATATANG is not set in db.sh." 36 | exit 2 37 | fi 38 | 39 | if [ ! -d "${AIDATATANG}" ]; then 40 | log "Error: ${AIDATATANG} is empty." 41 | exit 2 42 | fi 43 | 44 | # To absolute path 45 | AIDATATANG=$(cd ${AIDATATANG}; pwd) 46 | aidatatang_audio_dir=${AIDATATANG}/corpus 47 | aidatatang_text=${AIDATATANG}/transcript/aidatatang_200_zh_transcript.txt 48 | 49 | log "Aidatatang Data Preparation" 50 | train_dir=data/local/aidatatang_train 51 | dev_dir=data/local/aidatatang_dev 52 | test_dir=data/local/aidatatang_test 53 | tmp_dir=data/local/tmp 54 | 55 | mkdir -p $train_dir 56 | mkdir -p $dev_dir 57 | mkdir -p $test_dir 58 | mkdir -p $tmp_dir 59 | 60 | # find wav audio file for train, dev and test resp. 61 | find -L $aidatatang_audio_dir -iname "*.wav" > $tmp_dir/wav.flist 62 | n=$(wc -l < $tmp_dir/wav.flist) 63 | [ $n -ne 237265 ] && log Warning: expected 237265 data data files, found $n 64 | 65 | grep -i "corpus/train" $tmp_dir/wav.flist > $train_dir/wav.flist || exit 1; 66 | grep -i "corpus/dev" $tmp_dir/wav.flist > $dev_dir/wav.flist || exit 1; 67 | grep -i "corpus/test" $tmp_dir/wav.flist > $test_dir/wav.flist || exit 1; 68 | 69 | rm -r $tmp_dir 70 | 71 | # transcriptions preparation 72 | for dir in $train_dir $dev_dir $test_dir; do 73 | sed -e 's/\.wav//' $dir/wav.flist | awk -F '/' '{print $NF}' > $dir/utt.list 74 | sed -e 's/\.wav//' $dir/wav.flist | awk -F '/' '{i=NF-1;printf("%s %s\n",$NF,$i)}' > $dir/utt2spk_all 75 | paste -d' ' $dir/utt.list $dir/wav.flist > $dir/wav.scp_all 76 | utils/filter_scp.pl -f 1 $dir/utt.list $aidatatang_text > $dir/transcripts.txt 77 | awk '{print $1}' $dir/transcripts.txt > $dir/utt.list 78 | utils/filter_scp.pl -f 1 $dir/utt.list $dir/utt2spk_all | sort -u > $dir/utt2spk 79 | utils/filter_scp.pl -f 1 $dir/utt.list $dir/wav.scp_all | sort -u > $dir/wav.scp 80 | sort -u $dir/transcripts.txt | local/text_normalize.pl > $dir/text 81 | utils/utt2spk_to_spk2utt.pl $dir/utt2spk > $dir/spk2utt 82 | done 83 | 84 | utils/copy_data_dir.sh --utt-prefix aidatatang- --spk-prefix aidatatang- \ 85 | $train_dir data/aidatatang_train 86 | utils/copy_data_dir.sh --utt-prefix aidatatang- --spk-prefix aidatatang- \ 87 | $dev_dir data/aidatatang_dev 88 | utils/copy_data_dir.sh --utt-prefix aidatatang- --spk-prefix aidatatang- \ 89 | $test_dir data/aidatatang_test 90 | 91 | # remove space in text 92 | for x in aidatatang_train aidatatang_dev aidatatang_test; do 93 | cp data/${x}/text data/${x}/text.org 94 | paste -d " " <(cut -f 1 -d" " data/${x}/text.org) <(cut -f 2- -d" " data/${x}/text.org | tr -d " ") \ 95 | > data/${x}/text 96 | rm data/${x}/text.org 97 | done 98 | -------------------------------------------------------------------------------- /speaker/dscore/scorelib/tests/test_uem.py: -------------------------------------------------------------------------------- 1 | """Tets for UEM utilities.""" 2 | from __future__ import absolute_import 3 | from __future__ import print_function 4 | from __future__ import unicode_literals 5 | import os 6 | import shutil 7 | import tempfile 8 | 9 | from numpy.testing import (assert_equal, assert_raises, assert_raises_regex) 10 | import pytest 11 | 12 | from scorelib.turn import Turn 13 | from scorelib.uem import gen_uem, load_uem, write_uem, UEM 14 | 15 | 16 | TEST_DIR = os.path.dirname(os.path.abspath(__file__)) 17 | TMP_DIR = 
tempfile.mkdtemp(prefix="dscore_scorelib_test_uem__") 18 | 19 | 20 | def test_UEM_input_validation(): 21 | # Test type validation. 22 | invalid_type_msg = 'Expected sequence of pairs' 23 | with assert_raises_regex(TypeError, invalid_type_msg): 24 | UEM()['fid'] = 1 25 | with assert_raises_regex(TypeError, invalid_type_msg): 26 | UEM()['fid'] = (1, (2, 3)) 27 | with assert_raises_regex(TypeError, invalid_type_msg): 28 | UEM()['fid'] = ('1', (2, 3)) 29 | with assert_raises_regex(TypeError, invalid_type_msg): 30 | UEM()['fid'] = ((1,), (2, 3)) 31 | with assert_raises_regex(TypeError, invalid_type_msg): 32 | UEM()['fid'] = ((3, 4, 5), (0, 1)) 33 | 34 | # Test value validation. 35 | invalid_timestamp_msg = 'Could not convert interval' 36 | with assert_raises_regex(ValueError, invalid_timestamp_msg): 37 | UEM()['fid'] = (('a', 'b'), (2, 3)) 38 | invalid_interval_msg = 'Invalid interval' 39 | with assert_raises_regex(ValueError, invalid_interval_msg): 40 | UEM()['fid'] = ((-1, 1), (2, 3)) 41 | with assert_raises_regex(ValueError, invalid_interval_msg): 42 | UEM()['fid'] = ((1, 1), (2, 3)) 43 | with assert_raises_regex(ValueError, invalid_interval_msg): 44 | UEM()['fid'] = ((-1, 1), (2, 3)) 45 | 46 | 47 | def test_UEM_merge_overlaps(): 48 | overlaps_uem = UEM({'fid' : [(0, 5), (4, 10)]}) 49 | no_overlaps_uem = UEM({'fid' : [(0, 10)]}) 50 | assert overlaps_uem == no_overlaps_uem 51 | 52 | 53 | def test_load_uem(): 54 | expected_uem = UEM({ 55 | 'FILE1' : [(0, 15), (25, 30.4)], 56 | 'FILE2' : [(4.5, 13.24)]}) 57 | loaded_uem = load_uem(os.path.join(TEST_DIR, 'test_load.uem')) 58 | assert expected_uem == loaded_uem 59 | 60 | 61 | def test_write_uem(): 62 | uem = UEM({ 63 | 'FILE1' : [(0, 5), (4, 10.6)], 64 | 'FILE2' : [(4.5, 13.24)]}) 65 | tmp_uemf = os.path.join(TMP_DIR, 'test_write.uem') 66 | write_uem(tmp_uemf, uem) 67 | expected_uemf = os.path.join(TEST_DIR, 'test_write.uem') 68 | with open(tmp_uemf, 'rb') as f, open(expected_uemf, 'rb') as g: 69 | assert f.read() == g.read() 70 | 71 | 72 | def test_gen_uem(): 73 | ref_turns = [ 74 | Turn(0, 14, speaker_id='SPK1', file_id='FILE1'), 75 | Turn(20, 25, speaker_id='SPK2', file_id='FILE1'), 76 | Turn(11, 45, speaker_id='SPK3', file_id='FILE2'), 77 | ] 78 | sys_turns = [ 79 | Turn(35, 46, speaker_id='SPK1', file_id='FILE1'), 80 | Turn(2.5, 14, speaker_id='SPK4', file_id='FILE2'), 81 | ] 82 | expected_uem = UEM({ 83 | 'FILE1' : [(0, 46)], 84 | 'FILE2' : [(2.5, 45)]}) 85 | assert expected_uem == gen_uem(ref_turns, sys_turns) 86 | 87 | 88 | def teardown_module(): 89 | if os.path.isdir(TMP_DIR): 90 | shutil.rmtree(TMP_DIR) 91 | -------------------------------------------------------------------------------- /speaker/local/meeting_speaker_number_process.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Process the textgrid files 4 | """ 5 | import argparse 6 | import codecs 7 | from distutils.util import strtobool 8 | from pathlib import Path 9 | import textgrid 10 | import pdb 11 | 12 | class Segment(object): 13 | def __init__(self, uttid, spkr, stime, etime, text): 14 | self.uttid = uttid 15 | self.spkr = spkr 16 | self.stime = round(stime, 2) 17 | self.etime = round(etime, 2) 18 | self.text = text 19 | 20 | def change_stime(self, time): 21 | self.stime = time 22 | 23 | def change_etime(self, time): 24 | self.etime = time 25 | 26 | 27 | def get_args(): 28 | parser = argparse.ArgumentParser(description="process the textgrid files") 29 | parser.add_argument("--path", type=str, 
required=True, help="textgrid path")
    parser.add_argument("--label_path", type=str, required=True, help="label rttm file path")
    parser.add_argument("--predict_path", type=str, required=True, help="predict rttm file path")
    args = parser.parse_args()
    return args

def main(args):
    textgrid_flist = codecs.open(Path(args.path)/"uttid_textgrid.flist", "r", "utf-8")


    # parse the textgrid file for each utterance
    speaker2_uttidset = []
    speaker3_uttidset = []
    speaker4_uttidset = []
    for line in textgrid_flist:
        uttid, textgrid_file = line.strip().split("\t")
        tg = textgrid.TextGrid()
        tg.read(textgrid_file)

        num_speaker = len(tg)
        if num_speaker == 2:
            speaker2_uttidset.append(uttid)
        elif num_speaker == 3:
            speaker3_uttidset.append(uttid)
        elif num_speaker == 4:
            speaker4_uttidset.append(uttid)
    textgrid_flist.close()

    speaker2_id_label = codecs.open(Path(args.label_path) / "speaker2_id", "w", "utf-8")
    speaker2_id_predict = codecs.open(Path(args.predict_path) / "speaker2_id", "w", "utf-8")
    speaker3_id_label = codecs.open(Path(args.label_path) / "speaker3_id", "w", "utf-8")
    speaker3_id_predict = codecs.open(Path(args.predict_path) / "speaker3_id", "w", "utf-8")
    speaker4_id_label = codecs.open(Path(args.label_path) / "speaker4_id", "w", "utf-8")
    speaker4_id_predict = codecs.open(Path(args.predict_path) / "speaker4_id", "w", "utf-8")

    for i in range(len(speaker2_uttidset)):
        speaker2_id_label.write("%s\n" % (args.label_path+"/"+speaker2_uttidset[i]+".rttm"))
        speaker2_id_predict.write("%s\n" % (args.predict_path+"/"+speaker2_uttidset[i]+".rttm"))
    for i in range(len(speaker3_uttidset)):
        speaker3_id_label.write("%s\n" % (args.label_path+"/"+speaker3_uttidset[i]+".rttm"))
        speaker3_id_predict.write("%s\n" % (args.predict_path+"/"+speaker3_uttidset[i]+".rttm"))
    for i in range(len(speaker4_uttidset)):
        speaker4_id_label.write("%s\n" % (args.label_path+"/"+speaker4_uttidset[i]+".rttm"))
        speaker4_id_predict.write("%s\n" % (args.predict_path+"/"+speaker4_uttidset[i]+".rttm"))

    speaker2_id_label.close()
    speaker2_id_predict.close()
    speaker3_id_label.close()
    speaker3_id_predict.close()
    speaker4_id_label.close()
    speaker4_id_predict.close()

if __name__ == "__main__":
    args = get_args()
    main(args)
-------------------------------------------------------------------------------- /asr/scripts/audio/format_wav_scp.sh: --------------------------------------------------------------------------------
#!/usr/bin/env bash
set -euo pipefail
SECONDS=0
log() {
    local fname=${BASH_SOURCE[1]##*/}
    echo -e "$(date '+%Y-%m-%dT%H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
}
help_message=$(cat << EOF
Usage: $0 <scp> <dir> [<logdir> [<outdir>]]
e.g.
$0 data/test/wav.scp data/test_format/

Format 'wav.scp': in short, change a "kaldi-datadir" into a "modified-kaldi-datadir".

The 'wav.scp' format in kaldi is very flexible,
e.g. it can use a unix pipe to describe a wav file,
but that sometimes looks confusing and makes scripts more complex.
This tool creates actual wav files from 'wav.scp'
and also segments wav files using 'segments'.
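Each line of 'segments' is expected to follow the Kaldi convention
"<segment-id> <recording-id> <start-time> <end-time>" (times in seconds),
e.g. "utt1-0001 utt1 0.00 4.32".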
21 | 22 | Options 23 | --fs 24 | --segments 25 | --nj 26 | --cmd 27 | EOF 28 | ) 29 | 30 | out_filename=wav.scp 31 | cmd=utils/run.pl 32 | nj=30 33 | fs=none 34 | segments= 35 | 36 | ref_channels= 37 | utt2ref_channels= 38 | 39 | audio_format=wav 40 | write_utt2num_samples=true 41 | 42 | log "$0 $*" 43 | . utils/parse_options.sh 44 | 45 | if [ $# -ne 2 ] && [ $# -ne 3 ] && [ $# -ne 4 ]; then 46 | log "${help_message}" 47 | log "Error: invalid command line arguments" 48 | exit 1 49 | fi 50 | 51 | . ./path.sh # Setup the environment 52 | 53 | scp=$1 54 | if [ ! -f "${scp}" ]; then 55 | log "${help_message}" 56 | echo "$0: Error: No such file: ${scp}" 57 | exit 1 58 | fi 59 | dir=$2 60 | 61 | 62 | if [ $# -eq 2 ]; then 63 | logdir=${dir}/logs 64 | outdir=${dir}/data 65 | 66 | elif [ $# -eq 3 ]; then 67 | logdir=$3 68 | outdir=${dir}/data 69 | 70 | elif [ $# -eq 4 ]; then 71 | logdir=$3 72 | outdir=$4 73 | fi 74 | 75 | 76 | mkdir -p ${logdir} 77 | 78 | rm -f "${dir}/${out_filename}" 79 | 80 | 81 | opts= 82 | if [ -n "${utt2ref_channels}" ]; then 83 | opts="--utt2ref-channels ${utt2ref_channels} " 84 | elif [ -n "${ref_channels}" ]; then 85 | opts="--ref-channels ${ref_channels} " 86 | fi 87 | 88 | 89 | if [ -n "${segments}" ]; then 90 | log "[info]: using ${segments}" 91 | nutt=$(<${segments} wc -l) 92 | nj=$((nj /dev/null 130 | 131 | # concatenate the .scp files together. 132 | for n in $(seq ${nj}); do 133 | cat "${outdir}/format.${n}/wav.scp" || exit 1; 134 | done > "${dir}/${out_filename}" || exit 1 135 | 136 | if "${write_utt2num_samples}"; then 137 | for n in $(seq ${nj}); do 138 | cat "${outdir}/format.${n}/utt2num_samples" || exit 1; 139 | done > "${dir}/utt2num_samples" || exit 1 140 | fi 141 | 142 | log "Successfully finished. [elapsed=${SECONDS}s]" 143 | -------------------------------------------------------------------------------- /asr/local/alimeeting_process_donothing.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Process the textgrid files 4 | """ 5 | import argparse 6 | import codecs 7 | from distutils.util import strtobool 8 | from pathlib import Path 9 | import textgrid 10 | import pdb 11 | 12 | class Segment(object): 13 | def __init__(self, uttid, spkr, stime, etime, text): 14 | self.uttid = uttid 15 | self.spkr = spkr 16 | self.stime = round(stime, 2) 17 | self.etime = round(etime, 2) 18 | self.text = text 19 | 20 | def change_stime(self, time): 21 | self.stime = time 22 | 23 | def change_etime(self, time): 24 | self.etime = time 25 | 26 | 27 | def get_args(): 28 | parser = argparse.ArgumentParser(description="process the textgrid files") 29 | parser.add_argument("--path", type=str, required=True, help="Data path") 30 | parser.add_argument( 31 | "--mars", 32 | type=strtobool, 33 | default=False, 34 | help="Whether to process mars data set.", 35 | ) 36 | args = parser.parse_args() 37 | return args 38 | 39 | def main(args): 40 | wav_scp = codecs.open(Path(args.path) / "wav.scp", "r", "utf-8") 41 | textgrid_flist = codecs.open(Path(args.path) / "textgrid.flist", "r", "utf-8") 42 | 43 | # get the path of textgrid file for each utterance 44 | utt2textgrid = {} 45 | for line in textgrid_flist: 46 | path = Path(line.strip()) 47 | uttid = path.stem 48 | utt2textgrid[uttid] = path 49 | 50 | # parse the textgrid file for each utterance 51 | all_segments = [] 52 | for line in wav_scp: 53 | uttid = line.strip().split(" ")[0] 54 | uttid_part=uttid 55 | if args.mars == True: 56 | uttid_list = uttid.split("_") 
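            # Array ("mars") utterance ids are assumed to carry extra suffixes
            # (e.g. a channel tag) after the first two "_"-separated fields, so
            # keep only those fields to look up the shared TextGrid below.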
            uttid_part = uttid_list[0] + "_" + uttid_list[1]
        if uttid_part not in utt2textgrid:
            print("%s doesn't have transcription" % uttid)
            continue

        segments = []
        tg = textgrid.TextGrid.fromFile(utt2textgrid[uttid_part])
        for i in range(tg.__len__()):
            for j in range(tg[i].__len__()):
                if tg[i][j].mark:
                    segments.append(
                        Segment(
                            uttid,
                            tg[i].name,
                            tg[i][j].minTime,
                            tg[i][j].maxTime,
                            tg[i][j].mark.strip(),
                        )
                    )

        segments = sorted(segments, key=lambda x: x.stime)
        all_segments += segments

    wav_scp.close()
    textgrid_flist.close()

    segments_file = codecs.open(Path(args.path) / "segments_all", "w", "utf-8")
    utt2spk_file = codecs.open(Path(args.path) / "utt2spk_all", "w", "utf-8")
    text_file = codecs.open(Path(args.path) / "text_all", "w", "utf-8")

    for i in range(len(all_segments)):
        utt_name = "%s-%s-%07d-%07d" % (
            all_segments[i].uttid,
            all_segments[i].spkr,
            all_segments[i].stime * 100,
            all_segments[i].etime * 100,
        )

        segments_file.write(
            "%s %s %.2f %.2f\n"
            % (
                utt_name,
                all_segments[i].uttid,
                all_segments[i].stime,
                all_segments[i].etime,
            )
        )
        utt2spk_file.write(
            "%s %s-%s\n" % (utt_name, all_segments[i].uttid, all_segments[i].spkr)
        )
        text_file.write("%s %s\n" % (utt_name, all_segments[i].text))

    segments_file.close()
    utt2spk_file.close()
    text_file.close()


if __name__ == "__main__":
    args = get_args()
    main(args)
-------------------------------------------------------------------------------- /README.md: --------------------------------------------------------------------------------
# M2MeT challenge baseline -- AliMeeting


This project provides the baseline system recipes for the ICASSP 2022 Multi-channel Multi-party Meeting Transcription Challenge (M2MeT). The challenge consists of two tracks, named ***Automatic Speech Recognition (ASR)*** and ***Speaker Diarization***. Detailed descriptions of each track can be found in its corresponding directory. The goal of this project is to simplify the training and evaluation procedures and to make it easy for participants to reproduce the baseline experiments and to develop novel methods.


## Setup

```shell
git clone https://github.com/yufan-aslp/AliMeeting.git
```

## Introduction

* [Speech Recognition Track](asr): Follow the detailed steps in `./asr`.
* [Speaker Diarization Track](speaker): Follow the detailed steps in `./speaker`.


## General steps

1. Prepare the training data for the speaker diarization and ASR models, respectively.
2. Follow the running steps of the speaker diarization experiment and obtain the `rttm` file. The `rttm` file includes the voice activity detection (VAD) and speaker diarization results, which will be used to compute the final Diarization Error Rate (DER) scores.
3. For the ASR track, we can train single-speaker or multi-speaker ASR models. The evaluation metric of ASR systems is Character Error Rate (CER).
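A minimal sketch of running both baselines with the entry scripts shipped in this repository (data paths in `db.sh`, GPU ids, and stage options must be adjusted to your environment first):

```shell
# Speaker diarization track: VAD, embedding extraction, clustering -> rttm
cd speaker
bash run.sh

# ASR track, e.g. the near-field single-speaker conformer baseline
cd ../asr
bash run_local_conformer_near_alimeeting.sh
```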
24 | 
25 | 
26 | 
27 | 
28 | ## Citation
29 | 
30 | If you use the challenge dataset or our baseline systems, please consider citing the following:
31 | 
32 |     @inproceedings{Yu2022M2MeT,
33 |       title={M2{M}e{T}: The {ICASSP} 2022 Multi-Channel Multi-Party Meeting Transcription Challenge},
34 |       author={Yu, Fan and Zhang, Shiliang and Fu, Yihui and Xie, Lei and Zheng, Siqi and Du, Zhihao and Huang, Weilong and Guo, Pengcheng and Yan, Zhijie and Ma, Bin and Xu, Xin and Bu, Hui},
35 |       booktitle={Proc. ICASSP},
36 |       year={2022},
37 |       organization={IEEE}
38 |     }
39 | 
40 |     @inproceedings{Yu2022Summary,
41 |       title={Summary On The {ICASSP} 2022 Multi-Channel Multi-Party Meeting Transcription Grand Challenge},
42 |       author={Yu, Fan and Zhang, Shiliang and Guo, Pengcheng and Fu, Yihui and Du, Zhihao and Zheng, Siqi and Huang, Weilong and Xie, Lei and Tan, Zheng-Hua and Wang, DeLiang and Qian, Yanmin and Lee, Kong Aik and Yan, Zhijie and Ma, Bin and Xu, Xin and Bu, Hui},
43 |       booktitle={Proc. ICASSP},
44 |       year={2022},
45 |       organization={IEEE}
46 |     }
47 | 
48 | Challenge introduction paper: M2MeT: The ICASSP 2022 Multi-Channel Multi-Party Meeting Transcription Challenge (https://arxiv.org/abs/2110.07393)
49 | 
50 | 
51 | Challenge summary paper: Summary On The ICASSP 2022 Multi-Channel Multi-Party Meeting Transcription Grand Challenge (https://arxiv.org/abs/2202.03647)
52 | 
53 | 
54 | The AliMeeting data can be downloaded at https://www.openslr.org/119
55 | 
56 | 
57 | The room configuration of the AliMeeting Train set can be downloaded at https://speech-lab-share-data.oss-cn-shanghai.aliyuncs.com/AliMeeting/AliMeeting_Trainset_Room.xlsx
58 | 
59 | 
60 | M2MeT challenge CodaLab (open evaluation platform for the Eval and Test sets of both tracks): https://codalab.lisn.upsaclay.fr/competitions/?q=M2MeT
61 | 
62 | 
63 | ## Organizing Committee
64 | * Lei Xie, AISHELL Foundation, China, xielei21st@gmail.com
65 | * Bin Ma, Principal Engineer at Alibaba, Singapore, b.ma@alibaba-inc.com
66 | * DeLiang Wang, Professor, Ohio State University, USA, dwang@cse.ohio-state.edu
67 | * Zheng-Hua Tan, Professor, Aalborg University, Denmark, zt@es.aau.dk
68 | * Kong Aik Lee, Senior Scientist, Institute for Infocomm Research, A*STAR, Singapore, kongaik.lee@ieee.org
69 | * Zhijie Yan, Director of Speech Lab at Alibaba, China, zhijie.yzj@alibaba-inc.com
70 | * Yanmin Qian, Associate Professor, Shanghai Jiao Tong University, China,
71 | yanminqian@sjtu.edu.cn
72 | * Hui Bu, CEO, AIShell Inc., China, buhui@aishelldata.com
73 | 
74 | ## Contributors
75 | 
76 | [Alibaba DAMO Academy Speech Lab](https://damo.alibaba.com/labs/speech/?lang=zh) [AISHELL Technology](http://www.aishelltech.com/sy) [ISCA](https://isca-speech.org/iscaweb/)
77 | 
78 | ## Code license
79 | 
80 | [Apache 2.0](./LICENSE)
81 | 
82 | 
--------------------------------------------------------------------------------
/speaker/dscore/scorelib/tests/test_turn.py:
--------------------------------------------------------------------------------
1 | """Tests for turn utilities."""
2 | from __future__ import absolute_import
3 | from __future__ import print_function
4 | from __future__ import unicode_literals
5 | 
6 | from intervaltree import Interval, IntervalTree
7 | 
8 | import pytest
9 | 
10 | from scorelib.turn import chop_tree, merge_turns, trim_turns, Turn
11 | from scorelib.uem import UEM
12 | 
13 | 
14 | def test_merge_turns():
15 |     expected_turns = [
16 |         Turn(0, 11, speaker_id='S1', file_id='FILE1'),
17 |         Turn(0, 11, speaker_id='S2', file_id='FILE1'),
18 |         Turn(0, 11, speaker_id='S1', file_id='FILE2'),
19 |     ]
20 |     turns = [
21 |         Turn(0, 10, speaker_id='S1', file_id='FILE1'),
22 |         Turn(9, 11, speaker_id='S1', file_id='FILE1'),
23 |         Turn(0, 11, speaker_id='S2', file_id='FILE1'),
24 |         Turn(0, 11, speaker_id='S1', file_id='FILE2'),
25 |     ]
26 |     assert set(expected_turns) == set(merge_turns(turns))
27 | 
28 | 
29 | def test_trim_turns():
30 |     turns = [
31 |         Turn(1, 5, speaker_id='S1', file_id='FILE1'),
32 |         Turn(6, 10, speaker_id='S1', file_id='FILE1'),
33 |         Turn(0, 10, speaker_id='S1', file_id='FILE2'),
34 |     ]
35 | 
36 |     # Trim with a UEM.
37 |     uem = UEM({'FILE1' : [(2, 6), (5.8, 7)],
38 |                'FILE2' : [(2, 3), (4, 5)]})
39 |     expected_turns = [
40 |         Turn(2, 5, speaker_id='S1', file_id='FILE1'),
41 |         Turn(6, 7, speaker_id='S1', file_id='FILE1'),
42 |         Turn(2, 3, speaker_id='S1', file_id='FILE2'),
43 |         Turn(4, 5, speaker_id='S1', file_id='FILE2'),
44 |     ]
45 |     assert set(expected_turns) == set(trim_turns(turns, uem))
46 | 
47 |     # Trim without UEM.
48 |     expected_turns = [
49 |         Turn(2, 5, speaker_id='S1', file_id='FILE1'),
50 |         Turn(6, 7, speaker_id='S1', file_id='FILE1'),
51 |         Turn(2, 7, speaker_id='S1', file_id='FILE2'),
52 |     ]
53 |     assert set(expected_turns) == set(trim_turns(turns, None, 2, 7))
54 | 
55 | 
56 | def test_chop_tree():
57 |     def _get_tree():
58 |         return IntervalTree.from_tuples(
59 |             [(1, 5, 'i1'),
60 |              (7, 10, 'i2'),
61 |             ])
62 | 
63 |     # Interval contained within chop region.
64 |     expected_tree = IntervalTree.from_tuples([(7, 10, 'i2')])
65 |     tree = _get_tree()
66 |     overlap_intervals = chop_tree(tree, 0, 6)
67 |     assert overlap_intervals == set([Interval(1, 5, 'i1')])
68 |     assert tree == expected_tree
69 | 
70 |     # Left overlap.
71 |     expected_tree = IntervalTree.from_tuples([(2, 5, 'i1'), (7, 10, 'i2')])
72 |     tree = _get_tree()
73 |     overlap_intervals = chop_tree(tree, 0, 2)
74 |     assert overlap_intervals == set([Interval(1, 5, 'i1')])
75 |     assert tree == expected_tree
76 | 
77 |     # Right overlap.
78 |     expected_tree = IntervalTree.from_tuples([(1, 5, 'i1'), (7, 9, 'i2')])
79 |     tree = _get_tree()
80 |     overlap_intervals = chop_tree(tree, 9, 11)
81 |     assert overlap_intervals == set([Interval(7, 10, 'i2')])
82 |     assert tree == expected_tree
83 | 
84 |     # Chop region contained within interval.
85 |     expected_tree = IntervalTree.from_tuples([(1, 2, 'i1'),
86 |                                               (3, 5, 'i1'),
87 |                                               (7, 10, 'i2')])
88 |     tree = _get_tree()
89 |     overlap_intervals = chop_tree(tree, 2, 3)
90 |     assert overlap_intervals == set([Interval(1, 5, 'i1')])
91 |     assert tree == expected_tree
92 | 
93 |     # Overlaps two intervals.
94 |     expected_tree = IntervalTree.from_tuples([(1, 4, 'i1'),
95 |                                               (8, 10, 'i2')])
96 |     tree = _get_tree()
97 |     overlap_intervals = chop_tree(tree, 4, 8)
98 |     assert overlap_intervals == set([Interval(1, 5, 'i1'),
99 |                                      Interval(7, 10, 'i2')])
100 |     assert tree == expected_tree
101 | 
102 |     # No overlap.
103 |     expected_tree = _get_tree()
104 |     tree = _get_tree()
105 |     overlap_intervals = chop_tree(tree, 6, 6.5)
106 |     assert overlap_intervals == set()
107 |     assert tree == expected_tree
108 | 
109 |     # No trivial overlaps.
110 |     expected_tree = _get_tree()
111 |     tree = _get_tree()
112 |     overlap_intervals = chop_tree(tree, 0, 1)
113 |     assert overlap_intervals == set()
114 |     assert tree == expected_tree
115 |     overlap_intervals = chop_tree(tree, 10, 11)
116 |     assert overlap_intervals == set()
117 |     assert tree == expected_tree
118 | 
--------------------------------------------------------------------------------
/asr/local/text2textgrid.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Convert text transcriptions to TextGrid files
4 | """
5 | import argparse
6 | import codecs
7 | from distutils.util import strtobool
8 | from pathlib import Path
9 | import textgrid
10 | import pdb
11 | 
12 | class Segment(object):
13 |     def __init__(self, uttid, spkr, stime, etime, text):
14 |         self.uttid = uttid
15 |         self.spkr = spkr
16 |         self.stime = round(stime, 2)
17 |         self.etime = round(etime, 2)
18 |         self.text = text
19 | 
20 |     def change_stime(self, time):
21 |         self.stime = time
22 | 
23 |     def change_etime(self, time):
24 |         self.etime = time
25 | 
26 | 
27 | def get_args():
28 |     parser = argparse.ArgumentParser(description="convert text transcriptions to TextGrid files")
29 |     parser.add_argument("--in_path", type=str, required=True, help="Text input path")
30 |     parser.add_argument("--out_path", type=str, required=True, help="Text output path")
31 |     parser.add_argument("--speaker_limit", type=strtobool, default=True, help="Restrict speaker labels to c1/S1/S2/S3")
32 |     args = parser.parse_args()
33 |     return args
34 | 
35 | 
36 | def main(args):
37 |     text = codecs.open(Path(args.in_path), "r", "utf-8")
38 |     #text_uttid = args.uttid
39 |     # collect the segments of each speaker
40 |     spk2textgrid = {}
41 |     xmin = 0
42 |     xmax = 0
43 |     for line in text:
44 |         uttlist = line.split()
45 |         utt_id = uttlist[0]
46 |         if utt_id == "编号" or utt_id == "文本":  # skip header rows ("编号" = "index", "文本" = "text")
47 |             continue
48 |         utt_text = uttlist[1]
49 |         utt_use = uttlist[2]
50 |         #pdb.set_trace()
51 |         utt_time_s, utt_time_e = uttlist[-1].strip('[').strip(']').split('][')
52 |         if float(utt_time_s) < 0:
53 |             raise ValueError(float(utt_time_s))
54 |         if float(utt_time_e) < 0:
55 |             raise ValueError(float(utt_time_e))
56 | 
57 |         if utt_use == "有效":  # keep only segments marked "有效" ("valid")
58 |             utt_speaker = uttlist[3]
59 |             if args.speaker_limit and (utt_speaker != "c1" and utt_speaker != "S1" and utt_speaker != "S2" and utt_speaker != "S3"):
60 |                 raise ValueError(str(utt_id) + " " + str(utt_speaker))
61 |             if utt_speaker not in spk2textgrid:
62 |                 spk2textgrid[utt_speaker] = []
63 |             xmax = max(xmax, float(utt_time_e))
64 |             spk2textgrid[utt_speaker].append(
65 |                 Segment(
66 |                     utt_id,
67 |                     utt_speaker,
68 |                     float(utt_time_s),
69 |                     float(utt_time_e),
70 |                     utt_text.strip(),
71 |                 )
72 |             )
73 |     text.close()
74 |     #pdb.set_trace()
75 |     #for segments in spk2textgrid.keys():
76 |     #    spk2textgrid[segments] = sorted(spk2textgrid[segments], key=lambda x: x.stime)
77 |     xmax = xmax + 0.01
78 |     textgrid = codecs.open(Path(args.out_path), "w", "utf-8")  # note: shadows the imported textgrid module from here on
79 |     textgrid.write("File type = \"ooTextFile\"\n")
80 |     textgrid.write("Object class = \"TextGrid\"\n\n")
81 | 
82 |     textgrid.write("xmin = %s\n" % (xmin))
83 |     textgrid.write("xmax = %s\n" % (xmax))
84 |     textgrid.write("tiers? <exists> \n")
85 |     textgrid.write("size = %s\n" % (len(spk2textgrid)))
86 |     textgrid.write("item []:\n")
87 |     num_spk = 1
88 |     for segments in spk2textgrid.keys():  # one IntervalTier per speaker
89 |         textgrid.write("\titem [%s]:\n" % (num_spk))
90 |         num_spk = num_spk + 1
91 |         textgrid.write("\t\tclass = \"IntervalTier\"\n")
92 |         textgrid.write("\t\tname = \"%s\"\n" % spk2textgrid[segments][0].spkr)
93 |         textgrid.write("\t\txmin = %s\n" % (xmin))
94 |         textgrid.write("\t\txmax = %s\n" % (xmax))
95 |         textgrid.write("\t\tintervals: size = %s\n" % (len(spk2textgrid[segments])))
96 |         #pdb.set_trace()
97 |         for i in range(len(spk2textgrid[segments])):
98 |             #spk2textgrid[segments][i]
99 |             #pdb.set_trace()
100 |             textgrid.write("\t\tintervals [%s]:\n" % (i+1))
101 |             textgrid.write("\t\t\txmin = %s\n" % (spk2textgrid[segments][i].stime))
102 |             textgrid.write("\t\t\txmax = %s\n" % (spk2textgrid[segments][i].etime))
103 |             textgrid.write("\t\t\ttext = \"%s\"\n" % (spk2textgrid[segments][i].text))
104 |             #textgrid.write("%s %s %s %s %s \n" % (spk2textgrid[segments][i].uttid, spk2textgrid[segments][i].spkr, spk2textgrid[segments][i].stime,
105 |             #spk2textgrid[segments][i].etime, spk2textgrid[segments][i].text))
106 |     textgrid.close()
107 | 
108 | 
109 | if __name__ == "__main__":
110 |     args = get_args()
111 |     main(args)
112 | 
--------------------------------------------------------------------------------
/asr/cmd.sh:
--------------------------------------------------------------------------------
1 | # ====== About run.pl, queue.pl, slurm.pl, and ssh.pl ======
2 | # Usage: <cmd>.pl [options] JOB=1:<nj> <log> <command...>
3 | # e.g.
4 | # run.pl --mem 4G JOB=1:10 echo.JOB.log echo JOB
5 | #
6 | # Options:
7 | # --time
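#
# An illustrative invocation matching the usage string above (an assumed
# example in the spirit of the "e.g." line, not part of the original file):
# it launches four parallel jobs, with "JOB" substituted by 1..4 in both
# the log path and the command itself.
#
#   utils/run.pl JOB=1:4 exp/demo/log/echo.JOB.log echo "running job JOB"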