├── .DS_Store
├── ExchangeChannal
│   └── ExchangeChannal.py
├── ISCA.png
├── README.md
├── alibaba.ai
├── alibaba.png
├── asr
│   ├── README.md
│   ├── asr_local.sh
│   ├── cmd.sh
│   ├── conf
│   │   ├── array
│   │   │   ├── train_asr_conformer.yaml
│   │   │   ├── train_asr_conformer_noctc.yaml
│   │   │   ├── train_asr_conformer_rir.yaml
│   │   │   ├── train_asr_transformer.yaml
│   │   │   ├── train_asr_transformer_noctc.yaml
│   │   │   ├── train_asr_transformer_rir.yaml
│   │   │   └── train_asr_transformer_test.yaml
│   │   ├── decode_asr_rnn.yaml
│   │   ├── decode_asr_rnn_noctc.yaml
│   │   ├── decode_asr_rnn_nolm.yaml
│   │   ├── decode_asr_rnn_onlyctc.yaml
│   │   ├── decode_asr_transformer.yaml
│   │   ├── fbank.conf
│   │   ├── pbs.conf
│   │   ├── pitch.conf
│   │   ├── queue.conf
│   │   ├── slurm.conf
│   │   ├── train_asr_conformer.yaml
│   │   ├── train_asr_conformer_add_array.yaml
│   │   ├── train_asr_conformer_batch.yaml
│   │   ├── train_asr_transformer.yaml
│   │   └── train_lm_transformer.yaml
│   ├── db.sh
│   ├── local
│   │   ├── aidatatang_data_prep.sh
│   │   ├── aishell4_data_prep.sh
│   │   ├── aishell4_data_prep_speaker.sh
│   │   ├── aishell4_process_empty_text_speaker.py
│   │   ├── aishell4_process_textgrid.py
│   │   ├── aishell4_process_textgrid_speaker.py
│   │   ├── aishell_data_prep.sh
│   │   ├── alimeeting_data_prep.sh
│   │   ├── alimeeting_data_prep_test.sh
│   │   ├── alimeeting_data_prep_time_new.sh
│   │   ├── alimeeting_process_donothing.py
│   │   ├── alimeeting_process_overlap.py
│   │   ├── alimeeting_process_overlap_force.py
│   │   ├── alimeeting_process_textgrid.py
│   │   ├── alimeeting_process_textgrid_speaker.py
│   │   ├── alimeeting_process_textgrid_time_new.py
│   │   ├── apply_map_new.py
│   │   ├── apply_map_noid.py
│   │   ├── cut_r1_r2.sh
│   │   ├── data.sh
│   │   ├── path.sh
│   │   ├── st_cmds_data_prep.sh
│   │   ├── text2textgrid.py
│   │   ├── text_format.pl
│   │   └── text_normalize.pl
│   ├── path.sh
│   ├── pyscripts
│   │   ├── audio
│   │   │   └── format_wav_scp.py
│   │   ├── feats
│   │   │   └── feat-to-shape.py
│   │   └── utils
│   │       ├── evaluate_mcd.py
│   │       ├── get_yaml.py
│   │       ├── make_token_list_from_config.py
│   │       ├── plot_sinc_filters.py
│   │       └── print_args.py
│   ├── run_local_conformer_near_alimeeting.sh
│   ├── run_local_multispeaker_conformer_alimeeting.sh
│   └── scripts
│       ├── audio
│       │   └── format_wav_scp.sh
│       ├── feats
│       │   ├── feat_to_shape.sh
│       │   ├── make_fbank.sh
│       │   └── make_stft.sh
│       └── utils
│           ├── download_from_google_drive.sh
│           ├── perturb_data_dir_speed.sh
│           └── show_asr_result.sh
├── fig_aishell.jpg
└── speaker
    ├── README.md
    ├── VBx
    │   ├── VB_diarization.py
    │   ├── __init__.py
    │   ├── __pycache__
    │   │   ├── VB_diarization.cpython-36.pyc
    │   │   ├── diarization_lib.cpython-36.pyc
    │   │   ├── features.cpython-36.pyc
    │   │   └── kaldi_utils.cpython-36.pyc
    │   ├── diarization_lib.py
    │   ├── extract.sh
    │   ├── features.py
    │   ├── free_gpu.sh
    │   ├── kaldi_utils.py
    │   ├── models
    │   │   ├── __pycache__
    │   │   │   └── resnet.cpython-36.pyc
    │   │   └── resnet.py
    │   ├── predict.py
    │   └── vbhmm.py
    ├── cmd.sh
    ├── conf
    │   ├── mfcc_hires.conf
    │   └── sad.conf
    ├── dscore
    │   ├── LICENSE
    │   ├── README.md
    │   ├── ref.rttm
    │   ├── requirements.txt
    │   ├── score.py
    │   ├── scorelib
    │   │   ├── __init__.py
    │   │   ├── __pycache__
    │   │   │   ├── __init__.cpython-36.pyc
    │   │   │   ├── argparse.cpython-36.pyc
    │   │   │   ├── metrics.cpython-36.pyc
    │   │   │   ├── rttm.cpython-36.pyc
    │   │   │   ├── score.cpython-36.pyc
    │   │   │   ├── six.cpython-36.pyc
    │   │   │   ├── turn.cpython-36.pyc
    │   │   │   ├── uem.cpython-36.pyc
    │   │   │   └── utils.cpython-36.pyc
    │   │   ├── argparse.py
    │   │   ├── md-eval-22.pl
    │   │   ├── metrics.py
    │   │   ├── rttm.py
    │   │   ├── score.py
    │   │   ├── six.py
    │   │   ├── tests
    │   │   │   ├── __init__.py
    │   │   │   ├── ref.rttm
    │   │   │   ├── sys.rttm
    │   │   │   ├── test_load.uem
    │   │   │   ├── test_metrics.py
    │   │   │   ├── test_score.py
    │   │   │   ├── test_turn.py
    │   │   │   ├── test_uem.py
    │   │   │   └── test_write.uem
    │   │   ├── turn.py
    │   │   ├── uem.py
    │   │   └── utils.py
    │   ├── sys.rttm
    │   └── validate_rttm.py
    ├── local
    │   ├── make_textgrid_rttm.py
    │   ├── meeting_speaker_number_process.py
    │   ├── meeting_statistic.py
    │   ├── segmentation
    │   │   ├── detect_speech_activity.sh
    │   │   └── tuning
    │   │       ├── train_lstm_sad_1a.sh
    │   │       └── train_stats_sad_1a.sh
    │   └── train_sad.sh
    ├── path.sh
    ├── requirements.txt
    ├── run.sh
    └── scripts
        ├── choose_first_channel.py
        ├── do_segmentation.sh
        ├── extract_embeddings.sh
        ├── extract_feature.sh
        ├── run_cluster.sh
        ├── segment_to_lab.py
        ├── segments_to_lab.sh
        └── test.sh
/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yufan-aslp/AliMeeting/692a034fd510f1720f547c2c91e0fbadc9c24bf2/.DS_Store
--------------------------------------------------------------------------------
/ExchangeChannal/ExchangeChannal.py:
--------------------------------------------------------------------------------
# coding:utf-8
import os
import sys
import wave


# Make wave.open() objects usable as context managers on Python versions
# where Wave_read/Wave_write do not implement the protocol themselves.
def _trivial__enter__(self):
    return self

def _self_close__exit__(self, exc_type, exc_value, traceback):
    self.close()

wave.Wave_read.__exit__ = wave.Wave_write.__exit__ = _self_close__exit__
wave.Wave_read.__enter__ = wave.Wave_write.__enter__ = _trivial__enter__


class ExchangeChannal:

    __wavInfo = {'waveRate': 0, 'sampleWidth': 0, 'channelCnt': 0, 'nframes': 0}

    def __init__(self, argv):
        print("__init__ ExchangeChannal")

    def __mergeWave(self, inputArray, outputFile):
        # Merge the single-channel files into one multi-channel file.
        cmdline = "sox -M"
        for inputFile in inputArray:
            cmdline += " " + inputFile
        cmdline += " " + outputFile
        return os.system(cmdline)

    def __removeTmpFile(self, inputArray):
        for inputFile in inputArray:
            os.remove(inputFile)

    def __splitChannel(self, inputFile, outputFile, channalID):
        # Extract a single channel (1-based index) from the input file.
        cmdline = ' '.join(("sox", inputFile, outputFile, "remix", str(channalID)))
        return os.system(cmdline)

    def __exchange(self, strInFile, outDir, totalChannal, channalArray):
        fileArray = []
        newChannalArray = []
        namearray = os.path.basename(strInFile).split('.')

        # Split the input into one temporary file per channel.
        for i in range(totalChannal):
            fileName = outDir + '/' + namearray[0] + "_ch_" + str(i + 1) + ".wav"
            self.__splitChannel(strInFile, fileName, i + 1)
            fileArray.append([fileName, 0])  # [path, used-flag]

        # Collect the per-channel files in the requested output order.
        for i in range(len(channalArray)):
            for j in range(totalChannal):
                if fileArray[j][1] == 1:
                    continue
                if j == channalArray[i] - 1:
                    fileArray[j][1] = 1
                    newChannalArray.append(fileArray[j][0])
                    break

        outFile = outDir + '/' + namearray[0] + '.wav'
        self.__mergeWave(newChannalArray, outFile)

        self.__removeTmpFile(newChannalArray)

        return outFile

    def process(self, inputFile, outDir, totalChannal, channalArray):
        return self.__exchange(inputFile, outDir, totalChannal, channalArray)


usage = "python xxx.py input.wav outDir channalcnt newchannals (example: xxx.py input.wav outDir 8 3 4 2 1 7 8 6 5)"

if __name__ == '__main__':
    __inputArray = []
    if len(sys.argv) < 5:
        print(usage)
        sys.exit(1)
    tool = ExchangeChannal(sys.argv)
    for i in range(4, len(sys.argv)):
        __inputArray.append(int(sys.argv[i]))
    tool.process(sys.argv[1], sys.argv[2], int(sys.argv[3]), __inputArray)

    sys.exit(0)
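
# Illustrative example (file names are hypothetical): reorder the channels of
# an 8-channel recording so that the output channel order is 3 4 2 1 7 8 6 5:
#
#     python ExchangeChannal.py input.wav outDir 8 3 4 2 1 7 8 6 5
#
# Under the hood this shells out to sox: one "sox input.wav outDir/input_ch_N.wav
# remix N" call per channel to split the input, followed by a single
# "sox -M outDir/input_ch_3.wav outDir/input_ch_4.wav ... outDir/input.wav"
# call to merge the reordered single-channel files back into one wav.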
--------------------------------------------------------------------------------
/ISCA.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yufan-aslp/AliMeeting/692a034fd510f1720f547c2c91e0fbadc9c24bf2/ISCA.png
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# M2MeT challenge baseline -- AliMeeting

This project provides the baseline system recipes for the ICASSP 2022 Multi-channel Multi-party Meeting Transcription Challenge (M2MeT). The challenge consists of two tracks, named ***Automatic Speech Recognition (ASR)*** and ***Speaker Diarization***. Detailed descriptions of each track can be found in its corresponding directory. The goal of this project is to simplify the training and evaluation procedures and make it flexible for participants to reproduce the baseline experiments and develop novel methods.

## Setup

```shell
git clone https://github.com/yufan-aslp/AliMeeting.git
```

## Introduction

* [Speech Recognition Track](asr): Follow the detailed steps in `./asr`.
* [Speaker Diarization Track](speaker): Follow the detailed steps in `./speaker`.

## General steps

1. Prepare the training data for the speaker diarization and ASR models, respectively.
2. Follow the running steps of the speaker diarization experiment to obtain the `rttm` file. The `rttm` file contains the voice activity detection (VAD) and speaker diarization results, which are used to compute the final Diarization Error Rate (DER) score; an example `rttm` line is shown below.
3. For the ASR track, we can train either single-speaker or multi-speaker ASR models. The evaluation metric for the ASR systems is the Character Error Rate (CER).
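
An `rttm` file contains one line per speech segment. A minimal illustrative line (the recording and speaker IDs here are hypothetical, not taken from AliMeeting):

```
SPEAKER R0001_M0001 1 12.35 4.28 <NA> <NA> SPK_01 <NA> <NA>
```

The fields are: segment type, recording ID, channel, segment onset (seconds), segment duration (seconds), and the speaker label; the `<NA>` columns are unused placeholders.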

## Citation

If you use the challenge dataset or our baseline systems, please consider citing the following papers:

    @inproceedings{Yu2022M2MeT,
      title={M2{M}e{T}: The {ICASSP} 2022 Multi-Channel Multi-Party Meeting Transcription Challenge},
      author={Yu, Fan and Zhang, Shiliang and Fu, Yihui and Xie, Lei and Zheng, Siqi and Du, Zhihao and Huang, Weilong and Guo, Pengcheng and Yan, Zhijie and Ma, Bin and Xu, Xin and Bu, Hui},
      booktitle={Proc. ICASSP},
      year={2022},
      organization={IEEE}
    }

    @inproceedings{Yu2022Summary,
      title={Summary On The {ICASSP} 2022 Multi-Channel Multi-Party Meeting Transcription Grand Challenge},
      author={Yu, Fan and Zhang, Shiliang and Guo, Pengcheng and Fu, Yihui and Du, Zhihao and Zheng, Siqi and Huang, Weilong and Xie, Lei and Tan, Zheng-Hua and Wang, DeLiang and Qian, Yanmin and Lee, Kong Aik and Yan, Zhijie and Ma, Bin and Xu, Xin and Bu, Hui},
      booktitle={Proc. ICASSP},
      year={2022},
      organization={IEEE}
    }

Challenge introduction paper: [M2MeT: The ICASSP 2022 Multi-Channel Multi-Party Meeting Transcription Challenge](https://arxiv.org/abs/2110.07393)

Challenge summary paper: [Summary On The ICASSP 2022 Multi-Channel Multi-Party Meeting Transcription Grand Challenge](https://arxiv.org/abs/2202.03647)

The AliMeeting data can be downloaded from https://www.openslr.org/119

The room configuration of the AliMeeting Train set can be downloaded from https://speech-lab-share-data.oss-cn-shanghai.aliyuncs.com/AliMeeting/AliMeeting_Trainset_Room.xlsx

M2MeT challenge CodaLab (the open evaluation platform for the Eval and Test sets of both tracks): https://codalab.lisn.upsaclay.fr/competitions/?q=M2MeT

## Organizing Committee
* Lei Xie, AISHELL Foundation, China, xielei21st@gmail.com
* Bin Ma, Principal Engineer at Alibaba, Singapore, b.ma@alibaba-inc.com
* DeLiang Wang, Professor, Ohio State University, USA, dwang@cse.ohio-state.edu
* Zheng-Hua Tan, Professor, Aalborg University, Denmark, zt@es.aau.dk
* Kong Aik Lee, Senior Scientist, Institute for Infocomm Research, A*STAR, Singapore, kongaik.lee@ieee.org
* Zhijie Yan, Director of Speech Lab at Alibaba, China, zhijie.yzj@alibaba-inc.com
* Yanmin Qian, Associate Professor, Shanghai Jiao Tong University, China, yanminqian@sjtu.edu.cn
* Hui Bu, CEO, AIShell Inc., China, buhui@aishelldata.com

## Contributors

[Alibaba Speech Lab](https://damo.alibaba.com/labs/speech/?lang=zh) · [AISHELL Technology](http://www.aishelltech.com/sy) · [ISCA](https://isca-speech.org/iscaweb/)

## Code license

[Apache 2.0](./LICENSE)
--------------------------------------------------------------------------------
/alibaba.ai:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yufan-aslp/AliMeeting/692a034fd510f1720f547c2c91e0fbadc9c24bf2/alibaba.ai
--------------------------------------------------------------------------------
/alibaba.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yufan-aslp/AliMeeting/692a034fd510f1720f547c2c91e0fbadc9c24bf2/alibaba.png
--------------------------------------------------------------------------------
/asr/README.md:
--------------------------------------------------------------------------------
# Automatic Speech Recognition (ASR)

## Usage

For the ASR track, we provide two baseline systems: single-speaker ASR and multi-speaker ASR. For single-speaker ASR, please run all steps in `./run_local_conformer_near_alimeeting.sh`, while `./run_local_multispeaker_conformer_alimeeting.sh` is used for multi-speaker ASR.

**The main stages:**

1. We use the Conformer ASR model implementation from ESPnet2. Please install the latest ESPnet toolkit and copy all of our files to `espnet/egs2/AliMeeting/asr`.
2. Data preparation, language model training, and ASR model training are all handled by `asr_local.sh`.
3. First, run `./run_local_conformer_near_alimeeting.sh` to train the single-speaker ASR model. Then, run `run_local_multispeaker_conformer_alimeeting.sh` to train the multi-speaker ASR model. Note that you do not need to repeat the data preparation procedure for the multi-speaker ASR training, since all of the preparation is done during the first training run. A sketch of the whole workflow is shown below.
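
A minimal sketch of the commands implied by the steps above. The ESPnet installation step is abbreviated, and the directory layout is an illustrative default rather than something prescribed by the recipe:

```shell
# Get ESPnet and install it following its official installation guide
# (https://espnet.github.io/espnet/installation.html).
git clone https://github.com/espnet/espnet.git

# Copy this recipe into the ESPnet egs2 tree.
mkdir -p espnet/egs2/AliMeeting/asr
cp -r AliMeeting/asr/* espnet/egs2/AliMeeting/asr/

# Train the single-speaker model first, then the multi-speaker model.
cd espnet/egs2/AliMeeting/asr
./run_local_conformer_near_alimeeting.sh
./run_local_multispeaker_conformer_alimeeting.sh
```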

## Reference
1. [ESPnet](https://github.com/espnet/espnet.git)
2. [VBx](https://github.com/BUTSpeechFIT/VBx)
--------------------------------------------------------------------------------
/asr/cmd.sh:
--------------------------------------------------------------------------------
# ====== About run.pl, queue.pl, slurm.pl, and ssh.pl ======
# Usage: <cmd>.pl [options] JOB=1:<nj> <log> <command...>
# e.g.
#     run.pl --mem 4G JOB=1:10 echo.JOB.log echo JOB
#
# Options:
#    --time <time>: Limit the maximum time to execute.