├── .gitignore ├── Experiment.ipynb ├── LICENSE.txt ├── README.en.md ├── README.md ├── data ├── diadiem │ ├── preprocess.py │ └── text.py └── vivos │ └── preprocess.py ├── data_format.md ├── egs ├── diadiem │ ├── __init__.py │ ├── analyze.py │ ├── extension │ │ ├── __init__.py │ │ ├── analyze.py │ │ ├── export.py │ │ ├── metrics.py │ │ ├── model.py │ │ └── text.py │ ├── load_data.py │ ├── model │ │ ├── __init__.py │ │ ├── etc │ │ │ ├── feat.params │ │ │ ├── idngram │ │ │ ├── sphinx_train.cfg │ │ │ ├── text │ │ │ ├── tmp.dic │ │ │ ├── tmp.filler │ │ │ ├── tmp.lm │ │ │ ├── tmp.phone │ │ │ ├── tmp_test.fileids │ │ │ ├── tmp_test.transcription │ │ │ ├── tmp_train.fileids │ │ │ ├── tmp_train.transcription │ │ │ └── vocab │ │ ├── model_parameters │ │ │ ├── tmp.cd_cont_200 │ │ │ │ ├── feat.params │ │ │ │ ├── mdef │ │ │ │ ├── means │ │ │ │ ├── mixture_weights │ │ │ │ ├── noisedict │ │ │ │ ├── transition_matrices │ │ │ │ └── variances │ │ │ ├── tmp.cd_cont_200_1 │ │ │ │ ├── means │ │ │ │ ├── mixture_weights │ │ │ │ ├── transition_matrices │ │ │ │ └── variances │ │ │ ├── tmp.cd_cont_200_2 │ │ │ │ ├── means │ │ │ │ ├── mixture_weights │ │ │ │ ├── transition_matrices │ │ │ │ └── variances │ │ │ ├── tmp.cd_cont_200_4 │ │ │ │ ├── means │ │ │ │ ├── mixture_weights │ │ │ │ ├── transition_matrices │ │ │ │ └── variances │ │ │ ├── tmp.cd_cont_initial │ │ │ │ ├── means │ │ │ │ ├── mixture_weights │ │ │ │ ├── transition_matrices │ │ │ │ └── variances │ │ │ ├── tmp.cd_cont_untied │ │ │ │ ├── feat.params │ │ │ │ ├── mdef │ │ │ │ ├── means │ │ │ │ ├── mixture_weights │ │ │ │ ├── noisedict │ │ │ │ ├── transition_matrices │ │ │ │ └── variances │ │ │ ├── tmp.ci_cont │ │ │ │ ├── feat.params │ │ │ │ ├── mdef │ │ │ │ ├── means │ │ │ │ ├── mixture_weights │ │ │ │ ├── noisedict │ │ │ │ ├── transition_matrices │ │ │ │ └── variances │ │ │ └── tmp.ci_cont_flatinitial │ │ │ │ ├── globalmean │ │ │ │ ├── globalvar │ │ │ │ ├── means │ │ │ │ ├── mixture_weights │ │ │ │ ├── transition_matrices │ │ │ │ └── 
variances │ │ └── text.py │ ├── test │ │ ├── CAFPHEE001.wav │ │ ├── CAFPHEE002.wav │ │ ├── CAFPHEE003.wav │ │ ├── CAFPHEE004.wav │ │ ├── CAFPHEE005.wav │ │ ├── CAFPHEE006.wav │ │ ├── CAFPHEE007.wav │ │ ├── CAFPHEE008.wav │ │ ├── CAFPHEE009.wav │ │ ├── CAFPHEE010.wav │ │ ├── DDUSNG0001.wav │ │ ├── DDUSNG0002.wav │ │ ├── DDUSNG0003.wav │ │ ├── DDUSNG0004.wav │ │ ├── DDUSNG0005.wav │ │ ├── DDUSNG0006.wav │ │ ├── DDUSNG0007.wav │ │ ├── DDUSNG0008.wav │ │ ├── DDUSNG0009.wav │ │ ├── DDUSNG0010.wav │ │ ├── KARAOKE001.wav │ │ ├── KARAOKE002.wav │ │ ├── KARAOKE003.wav │ │ ├── KARAOKE004.wav │ │ ├── KARAOKE005.wav │ │ ├── KARAOKE006.wav │ │ ├── KARAOKE007.wav │ │ ├── KARAOKE008.wav │ │ ├── KARAOKE009.wav │ │ ├── KARAOKE010.wav │ │ ├── KHASCHSAJN001.wav │ │ ├── KHASCHSAJN002.wav │ │ ├── KHASCHSAJN003.wav │ │ ├── KHASCHSAJN004.wav │ │ ├── KHASCHSAJN005.wav │ │ ├── KHASCHSAJN006.wav │ │ ├── KHASCHSAJN007.wav │ │ ├── KHASCHSAJN008.wav │ │ ├── KHASCHSAJN009.wav │ │ ├── KHASCHSAJN010.wav │ │ ├── KHOONG0001.wav │ │ ├── KHOONG0002.wav │ │ ├── KHOONG0003.wav │ │ ├── KHOONG0004.wav │ │ ├── KHOONG0005.wav │ │ ├── KHOONG0006.wav │ │ ├── KHOONG0007.wav │ │ ├── KHOONG0008.wav │ │ ├── KHOONG0009.wav │ │ ├── KHOONG0010.wav │ │ ├── MASTXA001.wav │ │ ├── MASTXA002.wav │ │ ├── MASTXA003.wav │ │ ├── MASTXA004.wav │ │ ├── MASTXA005.wav │ │ ├── MASTXA006.wav │ │ ├── MASTXA007.wav │ │ ├── MASTXA008.wav │ │ ├── MASTXA009.wav │ │ ├── MASTXA010.wav │ │ ├── TRAJMAYTEEM001.wav │ │ ├── TRAJMAYTEEM002.wav │ │ ├── TRAJMAYTEEM003.wav │ │ ├── TRAJMAYTEEM004.wav │ │ ├── TRAJMAYTEEM005.wav │ │ ├── TRAJMAYTEEM006.wav │ │ ├── TRAJMAYTEEM007.wav │ │ ├── TRAJMAYTEEM008.wav │ │ ├── TRAJMAYTEEM009.wav │ │ ├── TRAJMAYTEEM010.wav │ │ ├── TROWRLAJI001.wav │ │ ├── TROWRLAJI002.wav │ │ ├── TROWRLAJI003.wav │ │ ├── TROWRLAJI004.wav │ │ ├── TROWRLAJI005.wav │ │ ├── TROWRLAJI006.wav │ │ ├── TROWRLAJI007.wav │ │ ├── TROWRLAJI008.wav │ │ ├── TROWRLAJI009.wav │ │ └── TROWRLAJI010.wav │ ├── test_model.py │ ├── text.py │ └── 
train.py └── vivos │ ├── README.md │ ├── __init__.py │ ├── analyze.py │ ├── extension │ ├── __init__.py │ ├── analyze.py │ ├── cmd.sh │ ├── export.py │ ├── metrics.py │ ├── model.py │ ├── model_sphinx.py │ ├── path.sh │ ├── run_deltadelta.sh │ ├── run_lda_mllt.sh │ ├── run_lda_mllt_decode.sh │ ├── run_sat.sh │ ├── run_sgmm2.sh │ ├── text.py │ ├── transcript_deltadelta.sh │ └── transcriptions │ │ ├── audio │ │ ├── R001.wav │ │ ├── R002.wav │ │ ├── R003.wav │ │ ├── R004.wav │ │ ├── R005.wav │ │ ├── t1_tat_ca.wav │ │ └── t2_tro_nen.wav │ │ └── wav.scp │ ├── load_data.py │ ├── logs │ ├── 20181207_122900.md │ ├── 20181207_185000.md │ ├── 20181207_232600.md │ ├── 20181208_075100.md │ └── README.md │ ├── model │ ├── __init__.py │ ├── etc │ │ ├── feat.params │ │ ├── idngram │ │ ├── sphinx_train.cfg │ │ ├── text │ │ ├── tmp.dic │ │ ├── tmp.filler │ │ ├── tmp.lm │ │ ├── tmp.phone │ │ ├── tmp_test.fileids │ │ ├── tmp_test.transcription │ │ ├── tmp_train.fileids │ │ ├── tmp_train.transcription │ │ └── vocab │ ├── model_parameters │ │ ├── tmp.cd_cont_200 │ │ │ ├── feat.params │ │ │ ├── mdef │ │ │ ├── means │ │ │ ├── mixture_weights │ │ │ ├── noisedict │ │ │ ├── transition_matrices │ │ │ └── variances │ │ ├── tmp.cd_cont_200_1 │ │ │ ├── means │ │ │ ├── mixture_weights │ │ │ ├── transition_matrices │ │ │ └── variances │ │ ├── tmp.cd_cont_200_2 │ │ │ ├── means │ │ │ ├── mixture_weights │ │ │ ├── transition_matrices │ │ │ └── variances │ │ ├── tmp.cd_cont_200_4 │ │ │ ├── means │ │ │ ├── mixture_weights │ │ │ ├── transition_matrices │ │ │ └── variances │ │ ├── tmp.cd_cont_initial │ │ │ ├── means │ │ │ ├── mixture_weights │ │ │ ├── transition_matrices │ │ │ └── variances │ │ ├── tmp.cd_cont_untied │ │ │ ├── feat.params │ │ │ ├── mdef │ │ │ ├── means │ │ │ ├── mixture_weights │ │ │ ├── noisedict │ │ │ ├── transition_matrices │ │ │ └── variances │ │ ├── tmp.ci_cont │ │ │ ├── feat.params │ │ │ ├── mdef │ │ │ ├── means │ │ │ ├── mixture_weights │ │ │ ├── noisedict │ │ │ ├── 
transition_matrices │ │ │ └── variances │ │ └── tmp.ci_cont_flatinitial │ │ │ ├── globalmean │ │ │ ├── globalvar │ │ │ ├── means │ │ │ ├── mixture_weights │ │ │ ├── transition_matrices │ │ │ └── variances │ └── text.py │ ├── predict.py │ ├── predict_delta.sh │ ├── preprocess.py │ ├── preprocess_full.py │ ├── test │ ├── VIVOSDEV01_R003.wav │ ├── VIVOSDEV01_R012.wav │ ├── VIVOSDEV01_R027.wav │ ├── VIVOSDEV01_R028.wav │ ├── VIVOSDEV01_R034.wav │ ├── VIVOSDEV01_R043.wav │ ├── VIVOSDEV01_R044.wav │ └── VIVOSDEV01_R055.wav │ ├── test_model.py │ ├── text2.py │ └── train.py ├── insight ├── vivos.txt └── vlsp2018.txt ├── report ├── acl2017.sty ├── acl_natbib.bst ├── build.sh ├── eacl2017.bst ├── eacl2017.sty ├── notation.tex ├── technique_report.bib ├── technique_report.pdf └── technique_report.tex ├── tmp └── .gitkeep └── util └── eda_vlsp.py /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | data/vivos/raw/ 3 | data/vivos/corpus/ 4 | data/open_fpt/raw/FPTOpenSpeechData_Set001_V0.1 5 | data/open_fpt/raw/FPTOpenSpeechData_Set002_Part1_V0.1 6 | data/open_fpt/raw/FPTOpenSpeechData_Set002_Part2_V0.1 7 | **/tmp/ 8 | **/analyze/ 9 | /experiment/diadiem/tmp/ 10 | /data/vlsp/corpus/ 11 | /data/vlsp/wav 12 | /experiment/vlsp/extension/_pycache_/ 13 | **/**/__pycache__/ 14 | **/__pycache__/ 15 | .ipynb_checkpoints 16 | data/vlsp 17 | tmp 18 | !tmp/.gitkeep 19 | data/diadiem/ -------------------------------------------------------------------------------- /Experiment.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "print(\"Hello from Underthesea Automatic Speech Recognition Team\")" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "!lscpu" 19 | ] 20 | }, 21 | { 
22 | "cell_type": "code", 23 | "execution_count": null, 24 | "metadata": {}, 25 | "outputs": [], 26 | "source": [ 27 | "!free -m" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": null, 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "!df -h" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 1, 42 | "metadata": {}, 43 | "outputs": [ 44 | { 45 | "name": "stdout", 46 | "output_type": "stream", 47 | "text": [ 48 | "data\t\texperiment\t LICENSE.txt\tREADME.md\r\n", 49 | "data_format.md\tExperiment.ipynb README.en.md\treport\r\n" 50 | ] 51 | } 52 | ], 53 | "source": [ 54 | "!ls" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": 2, 60 | "metadata": {}, 61 | "outputs": [ 62 | { 63 | "name": "stdout", 64 | "output_type": "stream", 65 | "text": [ 66 | "diadiem vivos\r\n" 67 | ] 68 | } 69 | ], 70 | "source": [ 71 | "!ls data" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": 3, 77 | "metadata": {}, 78 | "outputs": [ 79 | { 80 | "name": "stdout", 81 | "output_type": "stream", 82 | "text": [ 83 | "corpus\tpreprocess.py raw\r\n" 84 | ] 85 | } 86 | ], 87 | "source": [ 88 | "!ls data/vivos" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": 4, 94 | "metadata": {}, 95 | "outputs": [ 96 | { 97 | "name": "stdout", 98 | "output_type": "stream", 99 | "text": [ 100 | "4.0K\tExperiment.ipynb\n", 101 | "36K\tLICENSE.txt\n", 102 | "4.0K\tREADME.en.md\n", 103 | "8.0K\tREADME.md\n", 104 | "5.2G\tdata\n", 105 | "4.0K\tdata_format.md\n", 106 | "312M\texperiment\n", 107 | "220K\treport\n" 108 | ] 109 | } 110 | ], 111 | "source": [ 112 | "!du -sh *" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": null, 118 | "metadata": { 119 | "collapsed": true 120 | }, 121 | "outputs": [], 122 | "source": [] 123 | } 124 | ], 125 | "metadata": { 126 | "kernelspec": { 127 | "display_name": "Python 3", 128 | "language": "python", 129 | "name": "python3" 130 | }, 
131 | "language_info": { 132 | "codemirror_mode": { 133 | "name": "ipython", 134 | "version": 3 135 | }, 136 | "file_extension": ".py", 137 | "mimetype": "text/x-python", 138 | "name": "python", 139 | "nbconvert_exporter": "python", 140 | "pygments_lexer": "ipython3", 141 | "version": "3.6.3" 142 | } 143 | }, 144 | "nbformat": 4, 145 | "nbformat_minor": 2 146 | } 147 | -------------------------------------------------------------------------------- /README.en.md: -------------------------------------------------------------------------------- 1 | # Vietnamese Automatic Speech Recognition 2 | 3 | ## Mục lục 4 | 5 | 6 | ## Huấn luyện mô hình 7 | 8 | ## Môi trường thử nghiệm 9 | 10 | * Ubuntu 16.04 11 | 12 | ## Cài đặt 13 | 14 | **Cài đặt Kaldi** theo hướng dẫn tại [http://kaldi-asr.org/doc/tutorial_setup.html](http://kaldi-asr.org/doc/tutorial_setup.html) 15 | 16 | ``` 17 | $ git clone https://github.com/kaldi-asr/kaldi.git kaldi-trunk --origin golden 18 | 19 | $ cd kaldi-trunk/tools/; make; 20 | 21 | $ extras/install_openblas.sh 22 | 23 | $ cd ../src; ./configure --openblas-root=../tools/OpenBLAS/install; make 24 | ``` 25 | 26 | **Cài đặt language modeling toolkit srilm** 27 | 28 | Cài đặt dependencies 29 | 30 | ``` 31 | $ apt-get install gawk 32 | ``` 33 | 34 | Cài đặt srilm 35 | 36 | ``` 37 | $ cd kaldi-trunk/tools 38 | $ wget -O srilm.tgz https://raw.githubusercontent.com/denizyuret/nlpcourse/master/download/srilm-1.7.0.tgz 39 | $ ./install_srilm.sh 40 | ... 
41 | Installation of SRILM finished successfully 42 | Please source the tools/env.sh in your path.sh to enable it 43 | ``` 44 | 45 | # Mô tả dữ liệu 46 | 47 | [Xem chi tiết](data_format.md) -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Nhận dạng tiếng nói tiếng Việt 2 | 3 | ![](https://img.shields.io/badge/made%20with-%E2%9D%A4-red.svg) 4 | ![](https://img.shields.io/badge/opensource-vietnamese-blue.svg) 5 | ![](https://img.shields.io/badge/build-passing-green.svg) 6 | 7 | Dự án nghiên cứu về bài toán *Nhận dạng tiếng nói tiếng Việt*, được phát triển bởi nhóm nghiên cứu xử lý ngôn ngữ tự nhiên tiếng Việt - [undertheseanlp](https://github.com/undertheseanlp/). Chứa mã nguồn các thử nghiệm cho việc xử lý dữ liệu, huấn luyện và đánh giá mô hình, cũng như cho phép dễ dàng tùy chỉnh mô hình đối với những tập dữ liệu mới. 8 | 9 | **Nhóm tác giả** 10 | 11 | * Vũ Anh ([anhv.ict91@gmail.com](anhv.ict91@gmail.com)) 12 | * Lê Phi Hùng ([lephihungch@gmail.com](lephihungch@gmail.com)) 13 | 14 | **Tham gia đóng góp** 15 | 16 | Mọi ý kiến đóng góp hoặc yêu cầu trợ giúp xin gửi vào mục [Issues](../../issues) của dự án. Các thảo luận được khuyến khích **sử dụng tiếng Việt** để dễ dàng trong quá trình trao đổi. 17 | 18 | Nếu bạn có kinh nghiệm trong bài toán này, muốn tham gia vào nhóm phát triển với vai trò là [Developer](https://github.com/undertheseanlp/underthesea/wiki/H%C6%B0%E1%BB%9Bng-d%E1%BA%ABn-%C4%91%C3%B3ng-g%C3%B3p#developercontributor), xin hãy đọc kỹ [Hướng dẫn tham gia đóng góp](https://github.com/undertheseanlp/underthesea/wiki/H%C6%B0%E1%BB%9Bng-d%E1%BA%ABn-%C4%91%C3%B3ng-g%C3%B3p#developercontributor). 
19 | 20 | ## Mục lục 21 | 22 | * [Yêu cầu hệ thống](#yêu-cầu-hệ-thống) 23 | * [Thiết lập môi trường](#thiết-lập-môi-trường) 24 | * [Hướng dẫn sử dụng](#hướng-dẫn-sử-dụng) 25 | * [Sử dụng mô hình đã huấn luyện](#sử-dụng-mô-hình-đã-huấn-luyện) 26 | * [Huấn luyện mô hình](#huấn-luyện-mô-hình) 27 | * [Kết quả thử nghiệm](#kết-quả-thử-nghiệm) 28 | * [Trích dẫn](#trích-dẫn) 29 | * [Bản quyền](#bản-quyền) 30 | 31 | ## Yêu cầu hệ thống 32 | 33 | * `Hệ điều hành: Ubuntu 16.04` 34 | * `Python 3.6+` 35 | * `conda 4+` 36 | 37 | 38 | ## Thiết lập môi trường 39 | 40 | **Cài đặt Kaldi** 41 | 42 | Để cài đặt Kaldi, thực hiện theo các bước tại [hướng dẫn](http://kaldi-asr.org/doc/tutorial_setup.html) 43 | 44 | ``` 45 | $ git clone https://github.com/kaldi-asr/kaldi.git kaldi-trunk --origin golden 46 | 47 | $ cd kaldi-trunk/tools/; make; 48 | 49 | $ extras/install_openblas.sh 50 | 51 | $ cd ../src; ./configure --openblas-root=../tools/OpenBLAS/install; make 52 | ``` 53 | 54 | **Cài đặt language modeling toolkit srilm** 55 | 56 | Cài đặt dependencies 57 | 58 | ``` 59 | $ apt-get install gawk 60 | ``` 61 | 62 | **Cài đặt srilm** 63 | 64 | ``` 65 | $ cd kaldi-trunk/tools 66 | $ wget -O srilm.tgz https://raw.githubusercontent.com/denizyuret/nlpcourse/master/download/srilm-1.7.0.tgz 67 | $ ./install_srilm.sh 68 | ... 69 | Installation of SRILM finished successfully 70 | Please source the tools/env.sh in your path.sh to enable it 71 | ``` 72 | 73 | ## Hướng dẫn sử dụng 74 | 75 | ### Huấn luyện mô hình 76 | 77 | **Mô tả dữ liệu**: [Xem chi tiết](data_format.md) 78 | 79 | Trước khi run train.py phải set lại đường dẫn tới kaldi_folder . 
80 | 81 | Method predict nên có thêm argument model_path nếu bạn đã thực hiện train trước đó (vì nếu không nó sẽ lấy theo tmp_path của model, mà tmp_path này random cho mỗi lần khởi tạo lại model để chuẩn bị cho việc chạy training mới) 82 | 83 | Thay đổi N_TRAIN và N_TEST trong init của KaldiSpeechRecognition để đổi giới hạn tập train/test 84 | 85 | Output folder sẽ nằm trong kaldi_folder/egs/uts_{tmp_number} với tmp_number được thấy khi run train.py (EX: "Init Kaldi Speech Recognition in number_of_tmp folder" - Will be updated soon) 86 | 87 | ## Kết quả thử nghiệm 88 | 89 | Huấn luyện trên tập dữ liệu VIVOS - OpenFPT, test trên tập VLSP 2018 90 | 91 | 92 | 93 | 96 | 97 | 98 | 99 | 100 |
Mô hình 94 | WER 95 |
GMM: MFCC + delta + LDA + MLLT75.27%
101 | 102 | Huấn luyện trên tập dữ liệu VIVOS, test trên tập VLSP 2018 103 | 104 | 105 | 106 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 |
Mô hình 107 | WER 108 |
GMM: MFCC + delta + LDA + MLLT79.80%
GMM: MFCC + delta82.03%
118 | 119 | ## Bản quyền 120 | 121 | Mã nguồn của dự án được phân phối theo giấy phép [GPL-3.0](LICENSE.txt). 122 | 123 | Dự án sử dụng tập dữ liệu **[VIVOS](https://ailab.hcmus.edu.vn/vivos/)** trong các thử nghiệm. Xin vui lòng kiểm tra lại thông tin trên website hoặc báo cáo khoa học tương ứng để biết thông tin về bản quyền và trích dẫn khi sử dụng tập dữ liệu này. -------------------------------------------------------------------------------- /data/diadiem/preprocess.py: -------------------------------------------------------------------------------- 1 | import shutil 2 | from os import mkdir 3 | import re 4 | from text import phone2word 5 | 6 | 7 | def create_train_text(): 8 | lines = open( 9 | "raw/huanluyen_diadiem_train.transcription").read().splitlines() 10 | output = [] 11 | for line in lines: 12 | m = re.match(r"^ (?P.*) \((?P.*)\)$", line) 13 | if m: 14 | text = phone2word(m.group("text").lower()) 15 | fileid = m.group("fileid") 16 | content = "{}|{}".format(fileid, text) 17 | output.append(content) 18 | pass 19 | else: 20 | raise Exception("Content not match.") 21 | text = "\n".join(output) 22 | open("corpus/train/text", "w").write(text) 23 | 24 | 25 | def create_test_text(): 26 | lines = open( 27 | "raw/huanluyen_diadiem_test.transcription").read().splitlines() 28 | output = [] 29 | for line in lines: 30 | m = re.match(r"^(?P.*) \((?P.*)\)$", line) 31 | if m: 32 | text = phone2word(m.group("text").lower()) 33 | fileid = m.group("fileid") 34 | content = "{}|{}".format(fileid, text) 35 | output.append(content) 36 | pass 37 | else: 38 | raise Exception("Text not match.") 39 | text = "\n".join(output) 40 | open("corpus/test/text", "w").write(text) 41 | 42 | 43 | try: 44 | shutil.rmtree("corpus") 45 | except: 46 | pass 47 | finally: 48 | mkdir("corpus") 49 | mkdir("corpus/train") 50 | mkdir("corpus/test") 51 | shutil.copytree("raw/wav/train", "corpus/train/wav") 52 | shutil.copytree("raw/wav/test", "corpus/test/wav") 53 | create_train_text() 54 | 
"""Mapping between Vietnamese orthography and ASCII telex-style phone strings.

Each row of ``rules_1`` is a base vowel followed by its five tone-marked
variants (huyền, sắc, hỏi, ngã, nặng); ``rules_2`` lists the two-letter ASCII
spellings of the compound letters (ă, â, ê, ô, ơ, ư, đ).
"""

rules_1 = [
    "aàáảãạ",
    "ăằắẳẵặ",
    "âầấẩẫậ",
    "eèéẻẽẹ",
    "êềếểễệ",
    "iìíỉĩị",
    "oòóỏõọ",
    "ôồốổỗộ",
    "ơờớởỡợ",
    "uùúủũụ",
    "ưừứửữự",
    "yỳýỷỹỵ"
]
rules_2 = [
    "awă",
    "aaâ",
    "eeê",
    "ooô",
    "owơ",
    "uwư",
    "ddđ"
]

# ASCII spelling of each compound base letter, e.g. "ă" -> "aw".
_base_for = {compound[2]: compound[0:2] for compound in rules_2}
# Tone suffixes in the same order as the toned variants in rules_1.
_TONES = "fsrxj"

# w2p: single Vietnamese character -> ASCII phone string.
w2p = {}
for _row in rules_1:
    _base = _base_for.get(_row[0], _row[0])
    for _tone, _toned in zip(_TONES, _row[1:]):
        w2p[_toned] = _base + _tone
for _compound in rules_2:
    w2p[_compound[2]] = _compound[0:2]

# p2w: inverse table, ASCII phone string -> Vietnamese character.
p2w = {_phone: _char for _char, _phone in w2p.items()}


def word2phone(word):
    """Spell *word* with ASCII phone strings; unmapped characters pass through."""
    return "".join(w2p.get(character, character) for character in word)


def phone2word(phone):
    """Greedy inverse of :func:`word2phone`.

    Tries the longest phone chunk first (3 characters, then 2); anything
    that matches no chunk is copied through unchanged.
    """
    pieces = []
    position = 0
    while position < len(phone):
        for width in (3, 2):
            chunk = phone[position:position + width]
            if chunk in p2w:
                pieces.append(p2w[chunk])
                position += width
                break
        else:
            pieces.append(phone[position])
            position += 1
    return "".join(pieces)


if __name__ == '__main__':
    cases = [
        ("con hoẵng", "con hoawxng"),
        ("lựu đạn", "luwju ddajn"),
        ("kiểm tra", "kieerm tra"),
        ("ủy ban", "ury ban"),
        ("cà phê", "caf phee"),
        ("khách sạn", "khasch sajn"),
        ("đúng", "ddusng"),
        ("xã hội", "xax hooji")
    ]
    for expected_word, expected_phone in cases:
        assert expected_word == phone2word(expected_phone)
        assert expected_phone == word2phone(expected_word)
"""Convert the raw VIVOS download (raw/) into the standard corpus layout
(corpus/train, corpus/test) described in data_format.md.

Runs as a script: executing this module rebuilds corpus/ from scratch.
"""
import shutil
from os import mkdir, walk
from os import listdir
from os.path import join


def _copy_waves(split):
    """Flatten raw/<split>/waves/<speaker>/*.wav into corpus/<split>/wav/.

    The raw layout groups wav files per speaker; the corpus layout keeps
    them all in a single flat directory (file names are globally unique).
    """
    waves_folder = "raw/{}/waves".format(split)
    corpus_waves_folder = "corpus/{}/wav".format(split)
    # Best-effort reset of the output folder (may not exist yet).
    shutil.rmtree(corpus_waves_folder, ignore_errors=True)
    mkdir(corpus_waves_folder)
    for root, dirs, files in walk(waves_folder):
        for speaker_dir in dirs:
            for wav_name in listdir(join(waves_folder, speaker_dir)):
                shutil.copy(
                    join(waves_folder, speaker_dir, wav_name),
                    join(corpus_waves_folder, wav_name))


def create_train_waves():
    """Copy the training wav files into the corpus layout."""
    _copy_waves("train")


def create_test_waves():
    """Copy the test wav files into the corpus layout."""
    _copy_waves("test")


def _convert_prompts(split):
    """Write corpus/<split>/text from raw/<split>/prompts.txt.

    Each prompts line is "<file_id> <words...>"; the corpus format is
    "<file_id>|<lowercased words>".
    """
    with open("raw/{}/prompts.txt".format(split)) as handle:
        content = handle.read()
    # ":" is punctuation noise in the prompts, never transcript content.
    content = content.replace(":", "")
    output = []
    for line in content.splitlines():
        items = line.split()
        if not items:  # tolerate stray blank lines in the raw prompts
            continue
        fileid = items[0]
        text = " ".join(items[1:]).lower()
        output.append("{}|{}".format(fileid, text))
    with open("corpus/{}/text".format(split), "w") as handle:
        handle.write("\n".join(output))


def create_train_text():
    """Build the training transcript file."""
    _convert_prompts("train")


def create_test_text():
    """Build the test transcript file."""
    _convert_prompts("test")


def create_gender():
    """Copy the speaker-gender listings verbatim into the corpus."""
    for split in ("train", "test"):
        with open("raw/{}/genders.txt".format(split)) as handle:
            content = handle.read()
        with open("corpus/{}/gender".format(split), "w") as handle:
            handle.write(content)


def _write_speakers(split):
    """Write corpus/<split>/speaker: "<speaker_id> <file_id>" per utterance.

    The speaker id is the prefix of the file id before the first "_".
    """
    with open("raw/{}/prompts.txt".format(split)) as handle:
        lines = handle.read().splitlines()
    entries = []
    for line in lines:
        items = line.split()
        if not items:
            continue
        file_id = items[0]
        speaker_id = file_id.split("_")[0]
        entries.append("{} {}".format(speaker_id, file_id))
    with open("corpus/{}/speaker".format(split), "w") as handle:
        handle.write("\n".join(entries))


def create_speaker():
    """Build the speaker mapping files for both splits."""
    for split in ("train", "test"):
        _write_speakers(split)


# Rebuild corpus/ from scratch (module-level on purpose: this file is a
# one-shot preprocessing script).
shutil.rmtree("corpus", ignore_errors=True)
mkdir("corpus")
mkdir("corpus/train")
mkdir("corpus/test")
create_train_waves()
create_test_waves()
create_train_text()
create_test_text()
create_gender()
create_speaker()
Được áp dụng trong các thí nghiệm của [`underthesea`](https://github.com/undertheseanlp/automatic_speech_recognition) từ phiên bản 1.2.0 9 | 10 | Các ví dụ mẫu: [`diadiem`](https://github.com/undertheseanlp/automatic_speech_recognition/tree/sphinx_lab/data/diadiem/corpus) corpus 11 | 12 | ### Tập dữ liệu 13 | 14 | Dữ liệu của bài toán nhận dạng tiếng nói được lưu trong một thư mục, gồm hai thư mục con `train` và `test`. 15 | 16 | * Dữ liệu huấn luyện được lưu trong thư mục `train` 17 | * Dữ liệu kiểm thử được lưu trong thư mục `test` 18 | 19 | Cấu trúc thư mục 20 | 21 | ``` 22 | . 23 | ├── train 24 | | ├── wav 25 | | | ├── train_01.wav 26 | | | ├── train_02.wav 27 | | | └── train_03.wav 28 | | ├── gender 29 | | ├── speaker 30 | | └── text 31 | └── test 32 | ├── wav 33 | | ├── test_01.wav 34 | | ├── test_02.wav 35 | | └── test_03.wav 36 | ├── gender 37 | ├── speaker 38 | └── text 39 | ``` 40 | 41 | Mỗi thư mục `train` và `test` gồm thư mục con `wav`, file `gender`, file `speaker` và file `text`. Trong thư mục `wav` có chứa các file âm thanh (với đuôi định dạng phổ biến là wav), chứa dữ liệu âm thanh. 
42 | 43 | File `text` chứa nội dung của từng câu nói với tên file âm thanh tương ứng 44 | 45 | *Format*: `|` 46 | 47 | ``` 48 | train_01|text content 01 49 | train_02|text content 02 50 | train_03|text content 03 51 | train_04|text content 04 52 | ``` 53 | 54 | File `speaker` chứa mô tả speaker id với câu nói tương ứng 55 | 56 | *Format*: ` ` 57 | 58 | ``` 59 | spk01 train_01 60 | spk01 train_02 61 | spk02 train_03 62 | spk02 train_04 63 | ``` 64 | 65 | File `gender` chứa thông tin về giới tính của speaker 66 | 67 | *Format*: ` ` 68 | 69 | ``` 70 | spk01 f 71 | spk02 m 72 | ``` 73 | 74 | Ký hiệu: 75 | 76 | * `f` (female): speaker có giới tính nữ 77 | * `m` (male): speakder có giới tính nam -------------------------------------------------------------------------------- /egs/diadiem/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/__init__.py -------------------------------------------------------------------------------- /egs/diadiem/analyze.py: -------------------------------------------------------------------------------- 1 | from model import transcript 2 | from os.path import join, dirname 3 | from extension.analyze import WERAnalyzeLogger 4 | 5 | corpus_folder = join(dirname(dirname(dirname(__file__))), "data", "diadiem", 6 | "corpus") 7 | 8 | 9 | def load_test(): 10 | lines = open(join(corpus_folder, "test", "text")).read().splitlines() 11 | lines = [line.split("|") for line in lines] 12 | wavs = [line[0] for line in lines] 13 | wavs = ["{}/test/wav/{}.wav".format(corpus_folder, wav) for wav in wavs] 14 | texts = [line[1] for line in lines] 15 | return wavs, texts 16 | 17 | 18 | wavs_test, texts_test = load_test() 19 | # texts_pred = [""] * len(texts_test) 20 | texts_pred = [transcript(wav_file) for wav_file in wavs_test] 21 | 22 | log_folder = join(dirname(__file__), "analyze") 23 | 
class WERAnalyzeLogger:
    """Write WER analysis artifacts for a set of test utterances.

    Produces, inside *log_folder*:
      * result.json            — {"WER": <mean per-utterance WER>}
      * wav/                   — a copy of every test wav file
      * speechrecognition.json — reference texts, predictions and wav paths
    """

    @staticmethod
    def log(wavs_test, texts_test, texts_pred, log_folder):
        """Compute the corpus WER and dump the analysis files.

        :param wavs_test: paths of the test wav files
        :param texts_test: reference transcripts (one per wav)
        :param texts_pred: predicted transcripts (one per wav)
        :param log_folder: existing output directory
        """
        wer = np.mean([calculate_wer(test.split(), pred.split())
                       for test, pred in zip(texts_test, texts_pred)])
        # Cast to a builtin float: json.dumps raises TypeError on numpy
        # scalars (np.mean/np.round return numpy.float64, not float).
        wer = float(np.round(wer, 4))
        result = {
            "WER": wer
        }
        content = json.dumps(result, ensure_ascii=False)
        log_file = join(log_folder, "result.json")
        write(log_file, content)
        # Mirror the wav files next to the log so the report is portable.
        wav_folder = join(log_folder, "wav")
        shutil.rmtree(wav_folder, ignore_errors=True)
        os.mkdir(wav_folder)
        for wav in wavs_test:
            new_path = join(wav_folder, basename(wav))
            shutil.copyfile(wav, new_path)
        # Relative paths, so the JSON stays valid if log_folder moves.
        wavs_test_new_path = [join("wav", basename(wav)) for wav in wavs_test]
        speech_recognition = {
            "texts_test": texts_test,
            "texts_pred": texts_pred,
            "wavs_test": wavs_test_new_path,
        }
        content = json.dumps(speech_recognition, ensure_ascii=False)
        log_file = join(log_folder, "speechrecognition.json")
        write(log_file, content)

        print("Result is written in {}".format(log_file))
        print("WER: {}%".format(wer * 100))
class SphinxSpeechRecognitionExporter:
    """Copy a trained Sphinx model's artifacts out of its scratch folder."""

    @staticmethod
    def export(model, export_folder):
        """Mirror the model's etc/ and model_parameters/ trees into
        *export_folder*, replacing any previous copy of each tree.

        :param model: object exposing a ``tmp_folder`` attribute (the
            training scratch directory)
        :param export_folder: destination directory (must already exist)
        """
        source_root = model.tmp_folder
        for subdir in ("etc", "model_parameters"):
            destination = join(export_folder, subdir)
            # Best-effort removal of a stale copy; copytree requires the
            # destination to be absent.
            try:
                shutil.rmtree(destination)
            except:
                pass
            finally:
                shutil.copytree(join(source_root, subdir), destination)
def calculate_wer(reference, hypothesis):
    """Word error rate via Levenshtein distance, normalized by reference length.

    O(n*m) time and space in the lengths of the two word sequences.

    :param reference: list of reference words
    :param hypothesis: list of hypothesis words
    :return: edit distance divided by ``len(reference)`` (a float);
        for an empty reference, 0.0 if the hypothesis is also empty,
        otherwise the number of inserted words.

    >>> round(calculate_wer("who is there".split(), "is there".split()), 4)
    0.3333
    >>> calculate_wer("who is there".split(), "".split())
    1.0
    >>> calculate_wer("".split(), "who is there".split())
    3.0
    """
    import numpy
    if len(reference) == 0:
        # Guard: the normalization below would divide by zero. Every
        # hypothesis word counts as an insertion against an empty reference.
        return 0.0 if len(hypothesis) == 0 else float(len(hypothesis))

    # int64 accumulator: the original uint8 silently overflowed (mod 256)
    # for sequences longer than 255 words.
    d = numpy.zeros((len(reference) + 1, len(hypothesis) + 1),
                    dtype=numpy.int64)
    # Base cases: transforming to/from the empty prefix.
    d[0, :] = numpy.arange(len(hypothesis) + 1)
    d[:, 0] = numpy.arange(len(reference) + 1)

    # Standard dynamic-programming fill.
    for i in range(1, len(reference) + 1):
        for j in range(1, len(hypothesis) + 1):
            if reference[i - 1] == hypothesis[j - 1]:
                d[i][j] = d[i - 1][j - 1]
            else:
                substitution = d[i - 1][j - 1] + 1
                insertion = d[i][j - 1] + 1
                deletion = d[i - 1][j] + 1
                d[i][j] = min(substitution, insertion, deletion)

    return d[len(reference)][len(hypothesis)] / float(len(reference))
18 | self._init_data() 19 | self._change_config() 20 | self._make_transcription() 21 | self._make_dictionary() 22 | self._make_filler() 23 | self._make_language_model() 24 | 25 | # ========================== # 26 | # Init Data 27 | # ========================== # 28 | def _init_data(self): 29 | os.system("cd {}; mkdir wav".format(self.tmp_folder)) 30 | 31 | os.system("cd {}; cp -r {}/train/wav wav/train".format(self.tmp_folder, 32 | self.corpus_folder)) 33 | os.system("cd {}; cp -r {}/test/wav wav/test".format(self.tmp_folder, 34 | self.corpus_folder)) 35 | 36 | ids = open( 37 | "{}/train/text".format(self.corpus_folder)).read().splitlines() 38 | ids = [item.split("|")[0] for item in ids] 39 | ids = ["train/{}".format(id) for id in ids] 40 | ids.append("") 41 | content = "\n".join(ids) 42 | open(os.path.join(self.tmp_folder, "etc", "tmp_train.fileids"), 43 | "w").write(content) 44 | 45 | ids = open( 46 | "{}/test/text".format(self.corpus_folder)).read().splitlines() 47 | ids = [item.split("|")[0] for item in ids] 48 | ids = ["test/{}".format(id) for id in ids] 49 | ids.append("") 50 | content = "\n".join(ids) 51 | open(os.path.join(self.tmp_folder, "etc", "tmp_test.fileids"), 52 | "w").write(content) 53 | 54 | # ========================== # 55 | # Config 56 | # ========================== # 57 | def _change_config(self): 58 | config_file = os.path.join(self.tmp_folder, "etc", "sphinx_train.cfg") 59 | config = SphinxConfig(config_file) 60 | config.set("$CFG_BASE_DIR", "\".\"") 61 | config.set("$CFG_WAVFILE_SRATE", 8000.0) 62 | config.set("$CFG_NUM_FILT", 31) 63 | config.set("$CFG_LO_FILT", 200) 64 | config.set("$CFG_HI_FILT", 3500) 65 | config.set("$CFG_WAVFILE_TYPE", "'raw'") 66 | config.set("$CFG_LANGUAGEMODEL", 67 | "\"$CFG_LIST_DIR/$CFG_DB_NAME.lm\"") 68 | config.set("$DEC_CFG_LANGUAGEMODEL", 69 | "\"$CFG_BASE_DIR/etc/${CFG_DB_NAME}.lm\"") 70 | 71 | # ========================== # 72 | # Transcription 73 | # ========================== # 74 | def 
_convert_transcription(self, in_file, out_file): 75 | lines = open(in_file).read().splitlines() 76 | output = [] 77 | for line in lines: 78 | fileid, word = line.split("|") 79 | phone = text.word2phone(word) 80 | content = " {} ({})".format(phone, fileid) 81 | output.append(content) 82 | content = "\n".join(output) 83 | open(out_file, "w").write(content) 84 | 85 | def _make_transcription(self): 86 | self._convert_transcription( 87 | "{}/train/text".format(self.corpus_folder), 88 | "{}/etc/tmp_train.transcription".format(self.tmp_folder)) 89 | self._convert_transcription( 90 | "{}/test/text".format(self.corpus_folder), 91 | "{}/etc/tmp_test.transcription".format(self.tmp_folder)) 92 | 93 | # ============================== # 94 | # Create dictionary and phones 95 | # ============================== # 96 | def _make_dictionary(self): 97 | lines = open( 98 | "{}/train/text".format(self.corpus_folder)).read().splitlines() 99 | phones = [] 100 | for line in lines: 101 | fileid, word = line.split("|") 102 | p = text.word2phone(word).split() 103 | phones += p 104 | phones = sorted(set(phones)) 105 | # create .dic files 106 | lines = [] 107 | phone_units = [] 108 | for p in phones: 109 | units = list(p) 110 | phone_units += units 111 | units = " ".join(units) 112 | line = "{:20s}{}".format(p, units) 113 | lines.append(line) 114 | open("{}/etc/tmp.dic".format(self.tmp_folder), "w").write( 115 | "\n".join(lines)) 116 | phone_units = sorted(set(phone_units)) 117 | phone_units.append("SIL") 118 | open("{}/etc/tmp.phone".format(self.tmp_folder), "w").write( 119 | "\n".join(phone_units)) 120 | 121 | def _make_filler(self): 122 | fillers = ["", "", ""] 123 | lines = ["{:20s}SIL".format(f) for f in fillers] 124 | open("{}/etc/tmp.filler".format(self.tmp_folder), "w").write( 125 | "\n".join(lines)) 126 | 127 | # ========================== # 128 | # Language Model 129 | # ========================== # 130 | def _make_cleaned_text(self): 131 | in_file = 
"{}/train/text".format(self.corpus_folder) 132 | out_file = "{}/etc/text".format(self.tmp_folder) 133 | lines = open(in_file).read().splitlines() 134 | output = [] 135 | for line in lines: 136 | fileid, word = line.split("|") 137 | phone = text.word2phone(word) 138 | content = " {} ".format(phone, fileid) 139 | output.append(content) 140 | content = "\n".join(output) 141 | open(out_file, "w").write(content) 142 | 143 | def _make_language_model(self): 144 | self._make_cleaned_text() 145 | etc_folder = os.path.join(self.tmp_folder, "etc") 146 | chdir = "cd {}; ".format(etc_folder) 147 | os.system(chdir + "text2wfreq < text | wfreq2vocab > vocab") 148 | os.system(chdir + "text2idngram -vocab vocab -idngram idngram < text") 149 | os.system( 150 | chdir + "idngram2lm -vocab_type 0 -idngram idngram -vocab vocab -arpa tmp.lm") 151 | 152 | def fit(self): 153 | chdir = "cd {}; ".format(self.tmp_folder) 154 | os.system(chdir + "sphinxtrain run") 155 | 156 | def predict(self, wav_file): 157 | command = "pocketsphinx_continuous -hmm {}/model_parameters/tmp.cd_cont_200 -samprate 8000 -lm {}/etc/tmp.lm -dict {}/etc/tmp.dic -infile {} -logfn yes".format( 158 | self.tmp_folder, self.tmp_folder, self.tmp_folder, wav_file) 159 | output = os.popen(command).read().strip() 160 | output = text.phone2word(output) 161 | return output 162 | 163 | 164 | class SphinxConfig: 165 | def __init__(self, config_file): 166 | self.file = config_file 167 | self.lines = open(config_file).read().splitlines() 168 | 169 | def save(self): 170 | content = "\n".join(self.lines) 171 | open(self.file, "w").write(content) 172 | 173 | def set(self, key, value): 174 | for i, line in enumerate(self.lines): 175 | if line.startswith(key): 176 | content = "{} = {};".format(key, value) 177 | self.lines[i] = content 178 | self.save() 179 | -------------------------------------------------------------------------------- /egs/diadiem/extension/text.py: 
# -----------------------------------------------------------------------------
# egs/diadiem/extension/text.py
# -----------------------------------------------------------------------------
# Bidirectional mapping between Vietnamese orthography and the ASCII
# "telex-like" phone strings used by this Sphinx recipe.
#
# rules_1: each row is a base vowel followed by its five toned variants
#          (tone marks f/s/r/x/j appended to the base spelling).
# rules_2: telex digraphs for the special letters (aw=ă, aa=â, ..., dd=đ).
rules_1 = [
    "aàáảãạ",
    "ăằắẳẵặ",
    "âầấẩẫậ",
    "eèéẻẽẹ",
    "êềếểễệ",
    "iìíỉĩị",
    "oòóỏõọ",
    "ôồốổỗộ",
    "ơờớởỡợ",
    "uùúủũụ",
    "ưừứửữự",
    "yỳýỷỹỵ"
]
rules_2 = [
    "awă",
    "aaâ",
    "eeê",
    "ooô",
    "owơ",
    "uwư",
    "ddđ"
]

# character -> phone table
w2p = {}
for row in rules_1:
    base, toned = row[0], row[1:]
    # spell the base with its telex digraph when one exists (e.g. ă -> aw)
    for digraph in rules_2:
        if digraph[2] == base:
            base = digraph[:2]
    for mark, letter in zip("fsrxj", toned):
        w2p[letter] = base + mark
# the special letters themselves map to their bare digraphs
for digraph in rules_2:
    w2p[digraph[2]] = digraph[:2]
# phone -> character is the inverse table
p2w = {phone: letter for letter, phone in w2p.items()}


def word2phone(word):
    """Transliterate a Vietnamese word into its ASCII phone string."""
    return "".join(w2p.get(letter, letter) for letter in word)


def phone2word(phone):
    """Invert word2phone: greedily match 3- then 2-character phone groups,
    copying unmatched characters through verbatim."""
    chars = []
    pos = 0
    while pos < len(phone):
        for width in (3, 2):
            group = phone[pos:pos + width]
            if group in p2w:
                chars.append(p2w[group])
                pos += width
                break
        else:
            chars.append(phone[pos])
            pos += 1
    return "".join(chars)

if __name__ == '__main__':
    tests = [
        ("con hoẵng", "con hoawxng"),
        ("lựu đạn", "luwju ddajn"),
        ("kiểm tra", "kieerm tra"),
        ("ủy ban", "ury ban"),
        ("cà phê", "caf phee"),
        ("khách sạn", "khasch sajn"),
        ("đúng", "ddusng"),
        ("xã hội", "xax hooji")
    ]
    for word, phone in tests:
        assert (word == phone2word(phone))
        assert (phone == word2phone(word))
# -----------------------------------------------------------------------------
# egs/diadiem/load_data.py
# -----------------------------------------------------------------------------
from os.path import dirname, join

# NOTE(review): the closing `"corpus")` of this call sits in the next chunk
# of this dump; the statement is re-joined here.
corpus_folder = join(dirname(dirname(dirname(__file__))), "data", "diadiem",
                     "corpus")
4 | "corpus") 5 | -------------------------------------------------------------------------------- /egs/diadiem/model/__init__.py: -------------------------------------------------------------------------------- 1 | from os.path import dirname 2 | import os 3 | import text 4 | 5 | 6 | def transcript(wav_file): 7 | tmp_folder = dirname(__file__) 8 | command = "pocketsphinx_continuous " \ 9 | "-hmm {0}/model_parameters/tmp.cd_cont_200 " \ 10 | "-samprate 8000 " \ 11 | "-lm {0}/etc/tmp.lm " \ 12 | "-dict {0}/etc/tmp.dic " \ 13 | "-infile {1} " \ 14 | "-logfn {0}/yes".format(tmp_folder, wav_file) 15 | with os.popen(command) as c: 16 | output = c.read().strip() 17 | output = text.phone2word(output) 18 | os.remove("{}/yes".format(tmp_folder)) 19 | return output 20 | -------------------------------------------------------------------------------- /egs/diadiem/model/etc/feat.params: -------------------------------------------------------------------------------- 1 | -lowerf __CFG_LO_FILT__ 2 | -upperf __CFG_HI_FILT__ 3 | -nfilt __CFG_NUM_FILT__ 4 | -transform __CFG_TRANSFORM__ 5 | -lifter __CFG_LIFTER__ 6 | -feat __CFG_FEATURE__ 7 | -svspec __CFG_SVSPEC__ 8 | -agc __CFG_AGC__ 9 | -cmn __CFG_CMN__ 10 | -varnorm __CFG_VARNORM__ 11 | -------------------------------------------------------------------------------- /egs/diadiem/model/etc/idngram: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/model/etc/idngram -------------------------------------------------------------------------------- /egs/diadiem/model/etc/tmp.dic: -------------------------------------------------------------------------------- 1 | ay a y 2 | caf c a f 3 | ddusng d d u s n g 4 | ka k a 5 | ke k e 6 | khasch k h a s c h 7 | khoong k h o o n g 8 | laji l a j i 9 | mast m a s t 10 | phee p h e e 11 | rao r a o 12 | sajn s a j n 13 | teem t e e m 
14 | trajm t r a j m 15 | trowr t r o w r 16 | xa x a -------------------------------------------------------------------------------- /egs/diadiem/model/etc/tmp.filler: -------------------------------------------------------------------------------- 1 | SIL 2 | SIL 3 | SIL -------------------------------------------------------------------------------- /egs/diadiem/model/etc/tmp.lm: -------------------------------------------------------------------------------- 1 | ############################################################################# 2 | ## Copyright (c) 1996, Carnegie Mellon University, Cambridge University, 3 | ## Ronald Rosenfeld and Philip Clarkson 4 | ## Version 3, Copyright (c) 2006, Carnegie Mellon University 5 | ## Contributors includes Wen Xu, Ananlada Chotimongkol, 6 | ## David Huggins-Daines, Arthur Chan and Alan Black 7 | ############################################################################# 8 | ============================================================================= 9 | =============== This file was produced by the CMU-Cambridge =============== 10 | =============== Statistical Language Modeling Toolkit =============== 11 | ============================================================================= 12 | This is a 3-gram language model, based on a vocabulary of 18 words, 13 | which begins "", "", "ay"... 14 | This is a CLOSED-vocabulary model 15 | (OOVs eliminated from training data and are forbidden in test data) 16 | Good-Turing discounting was applied. 17 | 1-gram frequency of frequency : 0 18 | 2-gram frequency of frequency : 0 0 0 0 0 0 0 19 | 3-gram frequency of frequency : 0 0 0 0 0 0 0 20 | 1-gram discounting ratios : 21 | 2-gram discounting ratios : 22 | 3-gram discounting ratios : 23 | This file is in the ARPA-standard format introduced by Doug Paul. 
24 | 25 | p(wd3|wd1,wd2)= if(trigram exists) p_3(wd1,wd2,wd3) 26 | else if(bigram w1,w2 exists) bo_wt_2(w1,w2)*p(wd3|wd2) 27 | else p(wd3|w2) 28 | 29 | p(wd2|wd1)= if(bigram exists) p_2(wd1,wd2) 30 | else bo_wt_1(wd1)*p_1(wd2) 31 | 32 | All probs and back-off weights (bo_wt) are given in log10 form. 33 | 34 | Data formats: 35 | 36 | Beginning of data mark: \data\ 37 | ngram 1=nr # number of 1-grams 38 | ngram 2=nr # number of 2-grams 39 | ngram 3=nr # number of 3-grams 40 | 41 | \1-grams: 42 | p_1 wd_1 bo_wt_1 43 | \2-grams: 44 | p_2 wd_1 wd_2 bo_wt_2 45 | \3-grams: 46 | p_3 wd_1 wd_2 wd_3 47 | 48 | end of data mark: \end\ 49 | 50 | \data\ 51 | ngram 1=18 52 | ngram 2=25 53 | ngram 3=32 54 | 55 | \1-grams: 56 | -0.5755 -3.5579 57 | -0.5754 -3.5587 58 | -1.6028 ay -2.6555 59 | -1.5908 caf -2.6672 60 | -1.2657 ddusng -2.8684 61 | -1.5982 ka -2.6601 62 | -1.5982 ke -2.5370 63 | -1.6066 khasch -2.6519 64 | -1.2289 khoong -2.9053 65 | -1.5817 laji -2.5534 66 | -1.5899 mast -2.6681 67 | -1.5908 phee -2.5444 68 | -1.5982 rao -2.6601 69 | -1.6066 sajn -2.5286 70 | -1.6028 teem -2.5324 71 | -1.6028 trajm -2.6555 72 | -1.5817 trowr -2.6760 73 | -1.5899 xa -2.5453 74 | 75 | \2-grams: 76 | -0.0001 0.0008 77 | -1.0152 caf 0.0009 78 | -0.6907 ddusng 0.0004 79 | -1.0226 ka 0.0009 80 | -1.0310 khasch 0.0009 81 | -0.6538 khoong 0.0004 82 | -1.0143 mast 0.0009 83 | -1.0273 trajm 0.0009 84 | -1.0053 trowr 0.0000 85 | -0.0009 ay teem 0.0009 86 | -0.0009 caf phee 0.0009 87 | -0.0004 ddusng 0.6900 88 | -0.0009 ka rao 0.0009 89 | -0.0009 ke 1.0219 90 | -0.0009 khasch sajn 0.0009 91 | -0.0004 khoong 0.6531 92 | -0.0009 laji 1.0055 93 | -0.0009 mast xa 0.0009 94 | -0.0009 phee 1.0145 95 | -0.0009 rao ke 0.0009 96 | -0.0009 sajn 1.0303 97 | -0.0009 teem 1.0266 98 | -0.0009 trajm ay 0.0009 99 | -0.0009 trowr laji 0.0009 100 | -0.0009 xa 1.0136 101 | 102 | \3-grams: 103 | -1.0163 caf 104 | -0.6903 ddusng 105 | -1.0227 ka 106 | -1.0312 khasch 107 | -0.6534 khoong 108 | -1.0144 mast 109 | 
-1.0274 trajm 110 | -1.0054 trowr 111 | -0.0009 caf phee 112 | -0.0004 ddusng 113 | -0.0009 ka rao 114 | -0.0009 khasch sajn 115 | -0.0004 khoong 116 | -0.0009 mast xa 117 | -0.0009 trajm ay 118 | -0.0009 trowr laji 119 | -0.0009 ay teem 120 | -0.0009 caf phee 121 | -0.0004 ddusng 122 | -0.0009 ka rao ke 123 | -0.0009 ke 124 | -0.0009 khasch sajn 125 | -0.0004 khoong 126 | -0.0009 laji 127 | -0.0009 mast xa 128 | -0.0009 phee 129 | -0.0009 rao ke 130 | -0.0009 sajn 131 | -0.0009 teem 132 | -0.0009 trajm ay teem 133 | -0.0009 trowr laji 134 | -0.0009 xa 135 | 136 | \end\ 137 | -------------------------------------------------------------------------------- /egs/diadiem/model/etc/tmp.phone: -------------------------------------------------------------------------------- 1 | a 2 | c 3 | d 4 | e 5 | f 6 | g 7 | h 8 | i 9 | j 10 | k 11 | l 12 | m 13 | n 14 | o 15 | p 16 | r 17 | s 18 | t 19 | u 20 | w 21 | x 22 | y 23 | SIL -------------------------------------------------------------------------------- /egs/diadiem/model/etc/tmp_test.fileids: -------------------------------------------------------------------------------- 1 | test/CAFPHEE001 2 | test/CAFPHEE002 3 | test/CAFPHEE003 4 | test/CAFPHEE004 5 | test/CAFPHEE005 6 | test/CAFPHEE006 7 | test/CAFPHEE007 8 | test/CAFPHEE008 9 | test/CAFPHEE009 10 | test/CAFPHEE010 11 | test/DDUSNG0001 12 | test/DDUSNG0002 13 | test/DDUSNG0003 14 | test/DDUSNG0004 15 | test/DDUSNG0005 16 | test/DDUSNG0006 17 | test/DDUSNG0007 18 | test/DDUSNG0008 19 | test/DDUSNG0009 20 | test/DDUSNG0010 21 | test/KARAOKE001 22 | test/KARAOKE002 23 | test/KARAOKE003 24 | test/KARAOKE004 25 | test/KARAOKE005 26 | test/KARAOKE006 27 | test/KARAOKE007 28 | test/KARAOKE008 29 | test/KARAOKE009 30 | test/KARAOKE010 31 | test/KHASCHSAJN001 32 | test/KHASCHSAJN002 33 | test/KHASCHSAJN003 34 | test/KHASCHSAJN004 35 | test/KHASCHSAJN005 36 | test/KHASCHSAJN006 37 | test/KHASCHSAJN007 38 | test/KHASCHSAJN008 39 | test/KHASCHSAJN009 40 | test/KHASCHSAJN010 41 
| test/KHOONG0001 42 | test/KHOONG0002 43 | test/KHOONG0003 44 | test/KHOONG0004 45 | test/KHOONG0005 46 | test/KHOONG0006 47 | test/KHOONG0007 48 | test/KHOONG0008 49 | test/KHOONG0009 50 | test/KHOONG0010 51 | test/MASTXA001 52 | test/MASTXA002 53 | test/MASTXA003 54 | test/MASTXA004 55 | test/MASTXA005 56 | test/MASTXA006 57 | test/MASTXA007 58 | test/MASTXA008 59 | test/MASTXA009 60 | test/MASTXA010 61 | test/TRAJMAYTEEM001 62 | test/TRAJMAYTEEM002 63 | test/TRAJMAYTEEM003 64 | test/TRAJMAYTEEM004 65 | test/TRAJMAYTEEM005 66 | test/TRAJMAYTEEM006 67 | test/TRAJMAYTEEM007 68 | test/TRAJMAYTEEM008 69 | test/TRAJMAYTEEM009 70 | test/TRAJMAYTEEM010 71 | test/TROWRLAJI001 72 | test/TROWRLAJI002 73 | test/TROWRLAJI003 74 | test/TROWRLAJI004 75 | test/TROWRLAJI005 76 | test/TROWRLAJI006 77 | test/TROWRLAJI007 78 | test/TROWRLAJI008 79 | test/TROWRLAJI009 80 | test/TROWRLAJI010 81 | -------------------------------------------------------------------------------- /egs/diadiem/model/etc/tmp_test.transcription: -------------------------------------------------------------------------------- 1 | caf phee (CAFPHEE001) 2 | caf phee (CAFPHEE002) 3 | caf phee (CAFPHEE003) 4 | caf phee (CAFPHEE004) 5 | caf phee (CAFPHEE005) 6 | caf phee (CAFPHEE006) 7 | caf phee (CAFPHEE007) 8 | caf phee (CAFPHEE008) 9 | caf phee (CAFPHEE009) 10 | caf phee (CAFPHEE010) 11 | ddusng (DDUSNG0001) 12 | ddusng (DDUSNG0002) 13 | ddusng (DDUSNG0003) 14 | ddusng (DDUSNG0004) 15 | ddusng (DDUSNG0005) 16 | ddusng (DDUSNG0006) 17 | ddusng (DDUSNG0007) 18 | ddusng (DDUSNG0008) 19 | ddusng (DDUSNG0009) 20 | ddusng (DDUSNG0010) 21 | ka rao ke (KARAOKE001) 22 | ka rao ke (KARAOKE002) 23 | ka rao ke (KARAOKE003) 24 | ka rao ke (KARAOKE004) 25 | ka rao ke (KARAOKE005) 26 | ka rao ke (KARAOKE006) 27 | ka rao ke (KARAOKE007) 28 | ka rao ke (KARAOKE008) 29 | ka rao ke (KARAOKE009) 30 | ka rao ke (KARAOKE010) 31 | khasch sajn (KHASCHSAJN001) 32 | khasch sajn (KHASCHSAJN002) 33 | khasch sajn (KHASCHSAJN003) 34 | 
khasch sajn (KHASCHSAJN004) 35 | khasch sajn (KHASCHSAJN005) 36 | khasch sajn (KHASCHSAJN006) 37 | khasch sajn (KHASCHSAJN007) 38 | khasch sajn (KHASCHSAJN008) 39 | khasch sajn (KHASCHSAJN009) 40 | khasch sajn (KHASCHSAJN010) 41 | khoong (KHOONG0001) 42 | khoong (KHOONG0002) 43 | khoong (KHOONG0003) 44 | khoong (KHOONG0004) 45 | khoong (KHOONG0005) 46 | khoong (KHOONG0006) 47 | khoong (KHOONG0007) 48 | khoong (KHOONG0008) 49 | khoong (KHOONG0009) 50 | khoong (KHOONG0010) 51 | mast xa (MASTXA001) 52 | mast xa (MASTXA002) 53 | mast xa (MASTXA003) 54 | mast xa (MASTXA004) 55 | mast xa (MASTXA005) 56 | mast xa (MASTXA006) 57 | mast xa (MASTXA007) 58 | mast xa (MASTXA008) 59 | mast xa (MASTXA009) 60 | mast xa (MASTXA010) 61 | trajm ay teem (TRAJMAYTEEM001) 62 | trajm ay teem (TRAJMAYTEEM002) 63 | trajm ay teem (TRAJMAYTEEM003) 64 | trajm ay teem (TRAJMAYTEEM004) 65 | trajm ay teem (TRAJMAYTEEM005) 66 | trajm ay teem (TRAJMAYTEEM006) 67 | trajm ay teem (TRAJMAYTEEM007) 68 | trajm ay teem (TRAJMAYTEEM008) 69 | trajm ay teem (TRAJMAYTEEM009) 70 | trajm ay teem (TRAJMAYTEEM010) 71 | trowr laji (TROWRLAJI001) 72 | trowr laji (TROWRLAJI002) 73 | trowr laji (TROWRLAJI003) 74 | trowr laji (TROWRLAJI004) 75 | trowr laji (TROWRLAJI005) 76 | trowr laji (TROWRLAJI006) 77 | trowr laji (TROWRLAJI007) 78 | trowr laji (TROWRLAJI008) 79 | trowr laji (TROWRLAJI009) 80 | trowr laji (TROWRLAJI010) -------------------------------------------------------------------------------- /egs/diadiem/model/etc/vocab: -------------------------------------------------------------------------------- 1 | ## Vocab generated by v2 of the CMU-Cambridge Statistcal 2 | ## Language Modeling toolkit. 
3 | ## 4 | ## Includes 18 words ## 5 | 6 | 7 | ay 8 | caf 9 | ddusng 10 | ka 11 | ke 12 | khasch 13 | khoong 14 | laji 15 | mast 16 | phee 17 | rao 18 | sajn 19 | teem 20 | trajm 21 | trowr 22 | xa 23 | -------------------------------------------------------------------------------- /egs/diadiem/model/model_parameters/tmp.cd_cont_200/feat.params: -------------------------------------------------------------------------------- 1 | -lowerf 200 2 | -upperf 3500 3 | -nfilt 31 4 | -transform dct 5 | -lifter 22 6 | -feat 1s_c_d_dd 7 | -agc none 8 | -cmn batch 9 | -varnorm no 10 | -------------------------------------------------------------------------------- /egs/diadiem/model/model_parameters/tmp.cd_cont_200/means: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/model/model_parameters/tmp.cd_cont_200/means -------------------------------------------------------------------------------- /egs/diadiem/model/model_parameters/tmp.cd_cont_200/mixture_weights: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/model/model_parameters/tmp.cd_cont_200/mixture_weights -------------------------------------------------------------------------------- /egs/diadiem/model/model_parameters/tmp.cd_cont_200/noisedict: -------------------------------------------------------------------------------- 1 | SIL 2 | SIL 3 | SIL -------------------------------------------------------------------------------- /egs/diadiem/model/model_parameters/tmp.cd_cont_200/transition_matrices: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/model/model_parameters/tmp.cd_cont_200/transition_matrices -------------------------------------------------------------------------------- /egs/diadiem/model/model_parameters/tmp.cd_cont_200/variances: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/model/model_parameters/tmp.cd_cont_200/variances -------------------------------------------------------------------------------- /egs/diadiem/model/model_parameters/tmp.cd_cont_200_1/means: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/model/model_parameters/tmp.cd_cont_200_1/means -------------------------------------------------------------------------------- /egs/diadiem/model/model_parameters/tmp.cd_cont_200_1/mixture_weights: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/model/model_parameters/tmp.cd_cont_200_1/mixture_weights -------------------------------------------------------------------------------- /egs/diadiem/model/model_parameters/tmp.cd_cont_200_1/transition_matrices: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/model/model_parameters/tmp.cd_cont_200_1/transition_matrices -------------------------------------------------------------------------------- /egs/diadiem/model/model_parameters/tmp.cd_cont_200_1/variances: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/model/model_parameters/tmp.cd_cont_200_1/variances -------------------------------------------------------------------------------- /egs/diadiem/model/model_parameters/tmp.cd_cont_200_2/means: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/model/model_parameters/tmp.cd_cont_200_2/means -------------------------------------------------------------------------------- /egs/diadiem/model/model_parameters/tmp.cd_cont_200_2/mixture_weights: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/model/model_parameters/tmp.cd_cont_200_2/mixture_weights -------------------------------------------------------------------------------- /egs/diadiem/model/model_parameters/tmp.cd_cont_200_2/transition_matrices: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/model/model_parameters/tmp.cd_cont_200_2/transition_matrices -------------------------------------------------------------------------------- /egs/diadiem/model/model_parameters/tmp.cd_cont_200_2/variances: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/model/model_parameters/tmp.cd_cont_200_2/variances 
-------------------------------------------------------------------------------- /egs/diadiem/model/model_parameters/tmp.cd_cont_200_4/means: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/model/model_parameters/tmp.cd_cont_200_4/means -------------------------------------------------------------------------------- /egs/diadiem/model/model_parameters/tmp.cd_cont_200_4/mixture_weights: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/model/model_parameters/tmp.cd_cont_200_4/mixture_weights -------------------------------------------------------------------------------- /egs/diadiem/model/model_parameters/tmp.cd_cont_200_4/transition_matrices: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/model/model_parameters/tmp.cd_cont_200_4/transition_matrices -------------------------------------------------------------------------------- /egs/diadiem/model/model_parameters/tmp.cd_cont_200_4/variances: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/model/model_parameters/tmp.cd_cont_200_4/variances -------------------------------------------------------------------------------- /egs/diadiem/model/model_parameters/tmp.cd_cont_initial/means: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/model/model_parameters/tmp.cd_cont_initial/means -------------------------------------------------------------------------------- /egs/diadiem/model/model_parameters/tmp.cd_cont_initial/mixture_weights: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/model/model_parameters/tmp.cd_cont_initial/mixture_weights -------------------------------------------------------------------------------- /egs/diadiem/model/model_parameters/tmp.cd_cont_initial/transition_matrices: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/model/model_parameters/tmp.cd_cont_initial/transition_matrices -------------------------------------------------------------------------------- /egs/diadiem/model/model_parameters/tmp.cd_cont_initial/variances: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/model/model_parameters/tmp.cd_cont_initial/variances -------------------------------------------------------------------------------- /egs/diadiem/model/model_parameters/tmp.cd_cont_untied/feat.params: -------------------------------------------------------------------------------- 1 | -lowerf 200 2 | -upperf 3500 3 | -nfilt 31 4 | -transform dct 5 | -lifter 22 6 | -feat 1s_c_d_dd 7 | -agc none 8 | -cmn batch 9 | -varnorm no 10 | -------------------------------------------------------------------------------- /egs/diadiem/model/model_parameters/tmp.cd_cont_untied/mdef: 
-------------------------------------------------------------------------------- 1 | # Generated by /usr/local/libexec/sphinxtrain/mk_mdef_gen on Fri Jan 5 10:29:45 2018 2 | 0.3 3 | 23 n_base 4 | 59 n_tri 5 | 328 n_state_map 6 | 246 n_tied_state 7 | 69 n_tied_ci_state 8 | 23 n_tied_tmat 9 | # 10 | # Columns definitions 11 | #base lft rt p attrib tmat ... state id's ... 12 | SIL - - - filler 0 0 1 2 N 13 | a - - - n/a 1 3 4 5 N 14 | c - - - n/a 2 6 7 8 N 15 | d - - - n/a 3 9 10 11 N 16 | e - - - n/a 4 12 13 14 N 17 | f - - - n/a 5 15 16 17 N 18 | g - - - n/a 6 18 19 20 N 19 | h - - - n/a 7 21 22 23 N 20 | i - - - n/a 8 24 25 26 N 21 | j - - - n/a 9 27 28 29 N 22 | k - - - n/a 10 30 31 32 N 23 | l - - - n/a 11 33 34 35 N 24 | m - - - n/a 12 36 37 38 N 25 | n - - - n/a 13 39 40 41 N 26 | o - - - n/a 14 42 43 44 N 27 | p - - - n/a 15 45 46 47 N 28 | r - - - n/a 16 48 49 50 N 29 | s - - - n/a 17 51 52 53 N 30 | t - - - n/a 18 54 55 56 N 31 | u - - - n/a 19 57 58 59 N 32 | w - - - n/a 20 60 61 62 N 33 | x - - - n/a 21 63 64 65 N 34 | y - - - n/a 22 66 67 68 N 35 | a c f i n/a 1 69 70 71 N 36 | a h s i n/a 1 72 73 74 N 37 | a k r e n/a 1 75 76 77 N 38 | a l j i n/a 1 78 79 80 N 39 | a m s i n/a 1 81 82 83 N 40 | a m y b n/a 1 84 85 86 N 41 | a r j i n/a 1 87 88 89 N 42 | a r o i n/a 1 90 91 92 N 43 | a s j i n/a 1 93 94 95 N 44 | a x SIL e n/a 1 96 97 98 N 45 | c SIL a b n/a 2 99 100 101 N 46 | c s h i n/a 2 102 103 104 N 47 | d SIL d b n/a 3 105 106 107 N 48 | d d u i n/a 3 108 109 110 N 49 | e e SIL e n/a 4 111 112 113 N 50 | e e m i n/a 4 114 115 116 N 51 | e h e i n/a 4 117 118 119 N 52 | e k SIL e n/a 4 120 121 122 N 53 | e t e i n/a 4 123 124 125 N 54 | f a p e n/a 5 126 127 128 N 55 | g n SIL e n/a 6 129 130 131 N 56 | h c s e n/a 7 132 133 134 N 57 | h k a i n/a 7 135 136 137 N 58 | h k o i n/a 7 138 139 140 N 59 | h p e i n/a 7 141 142 143 N 60 | i j SIL e n/a 8 144 145 146 N 61 | j a i i n/a 9 147 148 149 N 62 | j a m i n/a 9 150 151 152 N 63 | j a n i n/a 9 153 
154 155 N 64 | k SIL a b n/a 10 156 157 158 N 65 | k SIL h b n/a 10 159 160 161 N 66 | k o e b n/a 10 162 163 164 N 67 | l r a b n/a 11 165 166 167 N 68 | m SIL a b n/a 12 168 169 170 N 69 | m e SIL e n/a 12 171 172 173 N 70 | m j a e n/a 12 174 175 176 N 71 | n j SIL e n/a 13 177 178 179 N 72 | n o g i n/a 13 180 181 182 N 73 | n s g i n/a 13 183 184 185 N 74 | o a k e n/a 14 186 187 188 N 75 | o h o i n/a 14 189 190 191 N 76 | o o n i n/a 14 192 193 194 N 77 | o r w i n/a 14 195 196 197 N 78 | p f h b n/a 15 198 199 200 N 79 | r a a b n/a 16 201 202 203 N 80 | r t a i n/a 16 204 205 206 N 81 | r t o i n/a 16 207 208 209 N 82 | r w l e n/a 16 210 211 212 N 83 | s a c i n/a 17 213 214 215 N 84 | s a t i n/a 17 216 217 218 N 85 | s h a b n/a 17 219 220 221 N 86 | s u n i n/a 17 222 223 224 N 87 | t SIL r b n/a 18 225 226 227 N 88 | t s x e n/a 18 228 229 230 N 89 | t y e b n/a 18 231 232 233 N 90 | u d s i n/a 19 234 235 236 N 91 | w o r i n/a 20 237 238 239 N 92 | x t a b n/a 21 240 241 242 N 93 | y a t e n/a 22 243 244 245 N 94 | -------------------------------------------------------------------------------- /egs/diadiem/model/model_parameters/tmp.cd_cont_untied/means: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/model/model_parameters/tmp.cd_cont_untied/means -------------------------------------------------------------------------------- /egs/diadiem/model/model_parameters/tmp.cd_cont_untied/mixture_weights: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/model/model_parameters/tmp.cd_cont_untied/mixture_weights -------------------------------------------------------------------------------- 
/egs/diadiem/model/model_parameters/tmp.cd_cont_untied/noisedict: -------------------------------------------------------------------------------- 1 | SIL 2 | SIL 3 | SIL -------------------------------------------------------------------------------- /egs/diadiem/model/model_parameters/tmp.cd_cont_untied/transition_matrices: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/model/model_parameters/tmp.cd_cont_untied/transition_matrices -------------------------------------------------------------------------------- /egs/diadiem/model/model_parameters/tmp.cd_cont_untied/variances: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/model/model_parameters/tmp.cd_cont_untied/variances -------------------------------------------------------------------------------- /egs/diadiem/model/model_parameters/tmp.ci_cont/feat.params: -------------------------------------------------------------------------------- 1 | -lowerf 200 2 | -upperf 3500 3 | -nfilt 31 4 | -transform dct 5 | -lifter 22 6 | -feat 1s_c_d_dd 7 | -agc none 8 | -cmn batch 9 | -varnorm no 10 | -------------------------------------------------------------------------------- /egs/diadiem/model/model_parameters/tmp.ci_cont/mdef: -------------------------------------------------------------------------------- 1 | # Generated by /usr/local/libexec/sphinxtrain/mk_mdef_gen on Fri Jan 5 10:29:25 2018 2 | 0.3 3 | 23 n_base 4 | 0 n_tri 5 | 92 n_state_map 6 | 69 n_tied_state 7 | 69 n_tied_ci_state 8 | 23 n_tied_tmat 9 | # 10 | # Columns definitions 11 | #base lft rt p attrib tmat ... state id's ... 
12 | SIL - - - filler 0 0 1 2 N 13 | a - - - n/a 1 3 4 5 N 14 | c - - - n/a 2 6 7 8 N 15 | d - - - n/a 3 9 10 11 N 16 | e - - - n/a 4 12 13 14 N 17 | f - - - n/a 5 15 16 17 N 18 | g - - - n/a 6 18 19 20 N 19 | h - - - n/a 7 21 22 23 N 20 | i - - - n/a 8 24 25 26 N 21 | j - - - n/a 9 27 28 29 N 22 | k - - - n/a 10 30 31 32 N 23 | l - - - n/a 11 33 34 35 N 24 | m - - - n/a 12 36 37 38 N 25 | n - - - n/a 13 39 40 41 N 26 | o - - - n/a 14 42 43 44 N 27 | p - - - n/a 15 45 46 47 N 28 | r - - - n/a 16 48 49 50 N 29 | s - - - n/a 17 51 52 53 N 30 | t - - - n/a 18 54 55 56 N 31 | u - - - n/a 19 57 58 59 N 32 | w - - - n/a 20 60 61 62 N 33 | x - - - n/a 21 63 64 65 N 34 | y - - - n/a 22 66 67 68 N 35 | -------------------------------------------------------------------------------- /egs/diadiem/model/model_parameters/tmp.ci_cont/means: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/model/model_parameters/tmp.ci_cont/means -------------------------------------------------------------------------------- /egs/diadiem/model/model_parameters/tmp.ci_cont/mixture_weights: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/model/model_parameters/tmp.ci_cont/mixture_weights -------------------------------------------------------------------------------- /egs/diadiem/model/model_parameters/tmp.ci_cont/noisedict: -------------------------------------------------------------------------------- 1 | SIL 2 | SIL 3 | SIL -------------------------------------------------------------------------------- /egs/diadiem/model/model_parameters/tmp.ci_cont/transition_matrices: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/model/model_parameters/tmp.ci_cont/transition_matrices -------------------------------------------------------------------------------- /egs/diadiem/model/model_parameters/tmp.ci_cont/variances: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/model/model_parameters/tmp.ci_cont/variances -------------------------------------------------------------------------------- /egs/diadiem/model/model_parameters/tmp.ci_cont_flatinitial/globalmean: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/model/model_parameters/tmp.ci_cont_flatinitial/globalmean -------------------------------------------------------------------------------- /egs/diadiem/model/model_parameters/tmp.ci_cont_flatinitial/globalvar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/model/model_parameters/tmp.ci_cont_flatinitial/globalvar -------------------------------------------------------------------------------- /egs/diadiem/model/model_parameters/tmp.ci_cont_flatinitial/means: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/model/model_parameters/tmp.ci_cont_flatinitial/means -------------------------------------------------------------------------------- /egs/diadiem/model/model_parameters/tmp.ci_cont_flatinitial/mixture_weights: 
"""Mapping between Vietnamese orthography and a telex-like phone encoding.

Each accented vowel is encoded as its base vowel (possibly a two-letter
telex digraph such as ``aa`` for "â") followed by one tone letter from
``fsrxj`` (huyền, sắc, hỏi, ngã, nặng).  ``w2p`` maps one accented
character to its phone string; ``p2w`` is the inverse mapping.
"""

# One string per vowel group: the base letter first, then its five toned
# forms in tone order huyền, sắc, hỏi, ngã, nặng.
rules_1 = [
    "aàáảãạ",
    "ăằắẳẵặ",
    "âầấẩẫậ",
    "eèéẻẽẹ",
    "êềếểễệ",
    "iìíỉĩị",
    "oòóỏõọ",
    "ôồốổỗộ",
    "ơờớởỡợ",
    "uùúủũụ",
    "ưừứửữự",
    "yỳýỷỹỵ"
]
# Telex digraphs: the first two characters encode the third (e.g. "aw" -> "ă").
rules_2 = [
    "awă",
    "aaâ",
    "eeê",
    "ooô",
    "owơ",
    "uwư",
    "ddđ"
]
w2p = {}  # accented character -> phone string (e.g. "ủ" -> "ur")
p2w = {}  # phone string -> accented character

# Tone letters, aligned index-for-index with the toned forms in rules_1.
_TONES = "fsrxj"


def _build_tables():
    """Populate ``w2p``/``p2w`` from the rule tables.

    Kept in a helper so the loop variables do not leak into the module
    namespace (the original module-level loops left ``words``, ``rule``,
    ``tones`` etc. behind as module globals).
    """
    for group in rules_1:
        base, toned = group[0], group[1:]
        # If the base vowel is itself a telex target (â, ê, ô, ơ, ư, ă),
        # encode it with its digraph before appending the tone letter.
        for rule in rules_2:
            if base == rule[2]:
                base = rule[:2]
        for tone, ch in zip(_TONES, toned):
            w2p[ch] = base + tone
    # Tone-less telex targets (ă, â, ê, ô, ơ, ư, đ) map to the bare digraph.
    for rule in rules_2:
        w2p[rule[2]] = rule[:2]
    # Phones are unique here, so the inverse mapping is well-defined.
    for ch, phone in w2p.items():
        p2w[phone] = ch


_build_tables()


def word2phone(word):
    """Encode *word*: replace each accented character with its phone.

    Characters without a mapping (ASCII letters, spaces, punctuation)
    pass through unchanged.
    """
    return "".join(w2p.get(ch, ch) for ch in word)


def phone2word(phone):
    """Decode *phone* back to orthography.

    Greedily matches the longest known phone (3 characters, then 2) at
    each position; unmatched characters pass through unchanged.
    """
    chars = []
    i = 0
    length = len(phone)
    while i < length:
        for size in (3, 2):
            chunk = phone[i:i + size]
            if chunk in p2w:
                chars.append(p2w[chunk])
                i += size
                break
        else:
            chars.append(phone[i])
            i += 1
    return "".join(chars)


if __name__ == '__main__':
    tests = [
        ("con hoẵng", "con hoawxng"),
        ("lựu đạn", "luwju ddajn"),
        ("kiểm tra", "kieerm tra"),
        ("ủy ban", "ury ban"),
        ("cà phê", "caf phee"),
        ("khách sạn", "khasch sajn"),
        ("đúng", "ddusng"),
        ("xã hội", "xax hooji")
    ]
    for test in tests:
        assert (test[0] == phone2word(test[1]))
        assert (test[1] == word2phone(test[0]))
-------------------------------------------------------------------------------- /egs/diadiem/test/CAFPHEE004.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/CAFPHEE004.wav -------------------------------------------------------------------------------- /egs/diadiem/test/CAFPHEE005.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/CAFPHEE005.wav -------------------------------------------------------------------------------- /egs/diadiem/test/CAFPHEE006.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/CAFPHEE006.wav -------------------------------------------------------------------------------- /egs/diadiem/test/CAFPHEE007.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/CAFPHEE007.wav -------------------------------------------------------------------------------- /egs/diadiem/test/CAFPHEE008.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/CAFPHEE008.wav -------------------------------------------------------------------------------- /egs/diadiem/test/CAFPHEE009.wav: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/CAFPHEE009.wav -------------------------------------------------------------------------------- /egs/diadiem/test/CAFPHEE010.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/CAFPHEE010.wav -------------------------------------------------------------------------------- /egs/diadiem/test/DDUSNG0001.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/DDUSNG0001.wav -------------------------------------------------------------------------------- /egs/diadiem/test/DDUSNG0002.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/DDUSNG0002.wav -------------------------------------------------------------------------------- /egs/diadiem/test/DDUSNG0003.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/DDUSNG0003.wav -------------------------------------------------------------------------------- /egs/diadiem/test/DDUSNG0004.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/DDUSNG0004.wav -------------------------------------------------------------------------------- 
/egs/diadiem/test/DDUSNG0005.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/DDUSNG0005.wav -------------------------------------------------------------------------------- /egs/diadiem/test/DDUSNG0006.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/DDUSNG0006.wav -------------------------------------------------------------------------------- /egs/diadiem/test/DDUSNG0007.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/DDUSNG0007.wav -------------------------------------------------------------------------------- /egs/diadiem/test/DDUSNG0008.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/DDUSNG0008.wav -------------------------------------------------------------------------------- /egs/diadiem/test/DDUSNG0009.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/DDUSNG0009.wav -------------------------------------------------------------------------------- /egs/diadiem/test/DDUSNG0010.wav: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/DDUSNG0010.wav -------------------------------------------------------------------------------- /egs/diadiem/test/KARAOKE001.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/KARAOKE001.wav -------------------------------------------------------------------------------- /egs/diadiem/test/KARAOKE002.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/KARAOKE002.wav -------------------------------------------------------------------------------- /egs/diadiem/test/KARAOKE003.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/KARAOKE003.wav -------------------------------------------------------------------------------- /egs/diadiem/test/KARAOKE004.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/KARAOKE004.wav -------------------------------------------------------------------------------- /egs/diadiem/test/KARAOKE005.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/KARAOKE005.wav -------------------------------------------------------------------------------- 
/egs/diadiem/test/KARAOKE006.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/KARAOKE006.wav -------------------------------------------------------------------------------- /egs/diadiem/test/KARAOKE007.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/KARAOKE007.wav -------------------------------------------------------------------------------- /egs/diadiem/test/KARAOKE008.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/KARAOKE008.wav -------------------------------------------------------------------------------- /egs/diadiem/test/KARAOKE009.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/KARAOKE009.wav -------------------------------------------------------------------------------- /egs/diadiem/test/KARAOKE010.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/KARAOKE010.wav -------------------------------------------------------------------------------- /egs/diadiem/test/KHASCHSAJN001.wav: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/KHASCHSAJN001.wav -------------------------------------------------------------------------------- /egs/diadiem/test/KHASCHSAJN002.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/KHASCHSAJN002.wav -------------------------------------------------------------------------------- /egs/diadiem/test/KHASCHSAJN003.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/KHASCHSAJN003.wav -------------------------------------------------------------------------------- /egs/diadiem/test/KHASCHSAJN004.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/KHASCHSAJN004.wav -------------------------------------------------------------------------------- /egs/diadiem/test/KHASCHSAJN005.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/KHASCHSAJN005.wav -------------------------------------------------------------------------------- /egs/diadiem/test/KHASCHSAJN006.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/KHASCHSAJN006.wav -------------------------------------------------------------------------------- 
/egs/diadiem/test/KHASCHSAJN007.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/KHASCHSAJN007.wav -------------------------------------------------------------------------------- /egs/diadiem/test/KHASCHSAJN008.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/KHASCHSAJN008.wav -------------------------------------------------------------------------------- /egs/diadiem/test/KHASCHSAJN009.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/KHASCHSAJN009.wav -------------------------------------------------------------------------------- /egs/diadiem/test/KHASCHSAJN010.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/KHASCHSAJN010.wav -------------------------------------------------------------------------------- /egs/diadiem/test/KHOONG0001.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/KHOONG0001.wav -------------------------------------------------------------------------------- /egs/diadiem/test/KHOONG0002.wav: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/KHOONG0002.wav -------------------------------------------------------------------------------- /egs/diadiem/test/KHOONG0003.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/KHOONG0003.wav -------------------------------------------------------------------------------- /egs/diadiem/test/KHOONG0004.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/KHOONG0004.wav -------------------------------------------------------------------------------- /egs/diadiem/test/KHOONG0005.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/KHOONG0005.wav -------------------------------------------------------------------------------- /egs/diadiem/test/KHOONG0006.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/KHOONG0006.wav -------------------------------------------------------------------------------- /egs/diadiem/test/KHOONG0007.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/KHOONG0007.wav -------------------------------------------------------------------------------- 
/egs/diadiem/test/KHOONG0008.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/KHOONG0008.wav -------------------------------------------------------------------------------- /egs/diadiem/test/KHOONG0009.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/KHOONG0009.wav -------------------------------------------------------------------------------- /egs/diadiem/test/KHOONG0010.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/KHOONG0010.wav -------------------------------------------------------------------------------- /egs/diadiem/test/MASTXA001.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/MASTXA001.wav -------------------------------------------------------------------------------- /egs/diadiem/test/MASTXA002.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/MASTXA002.wav -------------------------------------------------------------------------------- /egs/diadiem/test/MASTXA003.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/MASTXA003.wav 
-------------------------------------------------------------------------------- /egs/diadiem/test/MASTXA004.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/MASTXA004.wav -------------------------------------------------------------------------------- /egs/diadiem/test/MASTXA005.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/MASTXA005.wav -------------------------------------------------------------------------------- /egs/diadiem/test/MASTXA006.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/MASTXA006.wav -------------------------------------------------------------------------------- /egs/diadiem/test/MASTXA007.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/MASTXA007.wav -------------------------------------------------------------------------------- /egs/diadiem/test/MASTXA008.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/MASTXA008.wav -------------------------------------------------------------------------------- /egs/diadiem/test/MASTXA009.wav: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/MASTXA009.wav -------------------------------------------------------------------------------- /egs/diadiem/test/MASTXA010.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/MASTXA010.wav -------------------------------------------------------------------------------- /egs/diadiem/test/TRAJMAYTEEM001.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/TRAJMAYTEEM001.wav -------------------------------------------------------------------------------- /egs/diadiem/test/TRAJMAYTEEM002.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/TRAJMAYTEEM002.wav -------------------------------------------------------------------------------- /egs/diadiem/test/TRAJMAYTEEM003.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/TRAJMAYTEEM003.wav -------------------------------------------------------------------------------- /egs/diadiem/test/TRAJMAYTEEM004.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/TRAJMAYTEEM004.wav -------------------------------------------------------------------------------- 
/egs/diadiem/test/TRAJMAYTEEM005.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/TRAJMAYTEEM005.wav -------------------------------------------------------------------------------- /egs/diadiem/test/TRAJMAYTEEM006.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/TRAJMAYTEEM006.wav -------------------------------------------------------------------------------- /egs/diadiem/test/TRAJMAYTEEM007.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/TRAJMAYTEEM007.wav -------------------------------------------------------------------------------- /egs/diadiem/test/TRAJMAYTEEM008.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/TRAJMAYTEEM008.wav -------------------------------------------------------------------------------- /egs/diadiem/test/TRAJMAYTEEM009.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/TRAJMAYTEEM009.wav -------------------------------------------------------------------------------- /egs/diadiem/test/TRAJMAYTEEM010.wav: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/TRAJMAYTEEM010.wav -------------------------------------------------------------------------------- /egs/diadiem/test/TROWRLAJI001.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/TROWRLAJI001.wav -------------------------------------------------------------------------------- /egs/diadiem/test/TROWRLAJI002.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/TROWRLAJI002.wav -------------------------------------------------------------------------------- /egs/diadiem/test/TROWRLAJI003.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/TROWRLAJI003.wav -------------------------------------------------------------------------------- /egs/diadiem/test/TROWRLAJI004.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/TROWRLAJI004.wav -------------------------------------------------------------------------------- /egs/diadiem/test/TROWRLAJI005.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/TROWRLAJI005.wav -------------------------------------------------------------------------------- 
/egs/diadiem/test/TROWRLAJI006.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/TROWRLAJI006.wav -------------------------------------------------------------------------------- /egs/diadiem/test/TROWRLAJI007.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/TROWRLAJI007.wav -------------------------------------------------------------------------------- /egs/diadiem/test/TROWRLAJI008.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/TROWRLAJI008.wav -------------------------------------------------------------------------------- /egs/diadiem/test/TROWRLAJI009.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/TROWRLAJI009.wav -------------------------------------------------------------------------------- /egs/diadiem/test/TROWRLAJI010.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/TROWRLAJI010.wav -------------------------------------------------------------------------------- /egs/diadiem/test_model.py: -------------------------------------------------------------------------------- 1 | from model import transcript 2 | from os.path import join, dirname 3 | from unittest import TestCase 4 | 5 | 6 | class 
TestSentiment(TestCase): 7 | def test_1(self): 8 | wav = join(dirname(__file__), "test", "CAFPHEE001.wav") 9 | actual = transcript(wav) 10 | expected = "cà phê" 11 | self.assertEqual(actual, expected) 12 | 13 | def test_2(self): 14 | wav = join(dirname(__file__), "test", "KHASCHSAJN003.wav") 15 | actual = transcript(wav) 16 | expected = "khách sạn" 17 | self.assertEqual(actual, expected) 18 | -------------------------------------------------------------------------------- /egs/diadiem/text.py: -------------------------------------------------------------------------------- 1 | rules_1 = [ 2 | "aàáảãạ", 3 | "ăằắẳẵặ", 4 | "âầấẩẫậ", 5 | "eèéẻẽẹ", 6 | "êềếểễệ", 7 | "iìíỉĩị", 8 | "oòóỏõọ", 9 | "ôồốổỗộ", 10 | "ơờớởỡợ", 11 | "uùúủũụ", 12 | "ưừứửữự", 13 | "yỳýỷỹỵ" 14 | ] 15 | rules_2 = [ 16 | "awă", 17 | "aaâ", 18 | "eeê", 19 | "ooô", 20 | "owơ", 21 | "uwư", 22 | "ddđ" 23 | ] 24 | w2p = {} 25 | p2w = {} 26 | for words in rules_1: 27 | original = words[0] 28 | words = words[1:] 29 | for rule in rules_2: 30 | if original == rule[2]: 31 | original = rule[0:2] 32 | tones = "fsrxj" 33 | for i, w in enumerate(words): 34 | w2p[w] = original + tones[i] 35 | for rule in rules_2: 36 | w2p[rule[2]] = rule[0:2] 37 | for key, value in w2p.items(): 38 | p2w[value] = key 39 | 40 | 41 | def word2phone(word): 42 | phone = "" 43 | for w in word: 44 | if w in w2p: 45 | phone += w2p[w] 46 | else: 47 | phone += w 48 | return phone 49 | 50 | 51 | def phone2word(phone): 52 | i = 0 53 | word = "" 54 | while i < len(phone): 55 | if phone[i:i+3] in p2w: 56 | p = phone[i:i+3] 57 | word += p2w[p] 58 | i += 3 59 | elif phone[i:i+2] in p2w: 60 | p = phone[i:i+2] 61 | word += p2w[p] 62 | i += 2 63 | else: 64 | p = phone[i:i+1] 65 | word += p 66 | i += 1 67 | return word 68 | 69 | if __name__ == '__main__': 70 | tests = [ 71 | ("con hoẵng", "con hoawxng"), 72 | ("lựu đạn", "luwju ddajn"), 73 | ("kiểm tra", "kieerm tra"), 74 | ("ủy ban", "ury ban"), 75 | ("cà phê", "caf phee"), 76 | ("khách sạn", "khasch 
sajn"), 77 | ("đúng", "ddusng"), 78 | ("xã hội", "xax hooji") 79 | ] 80 | for test in tests: 81 | assert (test[0] == phone2word(test[1])) 82 | assert (test[1] == word2phone(test[0])) 83 | -------------------------------------------------------------------------------- /egs/diadiem/train.py: -------------------------------------------------------------------------------- 1 | from extension.model import SphinxSpeechRecognition 2 | from extension.export import SphinxSpeechRecognitionExporter 3 | from load_data import corpus_folder 4 | from os.path import join, dirname 5 | 6 | tmp_folder = join(dirname(__file__), "tmp") 7 | export_folder = join(dirname(__file__), "model") 8 | 9 | model = SphinxSpeechRecognition(corpus_folder, tmp_folder) 10 | model.fit() 11 | SphinxSpeechRecognitionExporter.export(model, export_folder) 12 | # wav_file = join(tmp_folder, "etc", "wav", "train", "test", "CAFPHEE003.wav") 13 | # model.predict(wav_file) 14 | -------------------------------------------------------------------------------- /egs/vivos/README.md: -------------------------------------------------------------------------------- 1 | /home/anhv/anaconda3/envs/automatic_speech_recognition/bin/python /home/anhv/PycharmProjects/undertheseanlp/automatic_speech_recognition/egs/vivos/train.py --kaldi_folder /home/anhv/PycharmProjects/kaldi-trunk --corpus_folder /home/anhv/PycharmProjects/undertheseanlp/automatic_speech_recognition/data/vivos/corpus --nj 10 --method lda_mllt 2 | 3 | ===== Time Report ===== 4 | Mono 5 | 9:25 6 | 0:0 7 | 0:25 8 | Tri1 9 | 2:38 10 | 0:0 11 | 0:24 12 | Tri2a 13 | 2:38 14 | 0:0 15 | 0:24 16 | Tri3a 17 | 2:52 18 | 24:16 19 | 0:51 20 | Total time: 21 | 44:21 22 | 23 | 24 | ===== Score Report ===== 25 | Best WER 26 | %WER 79.80 [ 25926 / 32487, 245 ins, 5587 del, 20094 sub ] exp/tri3a/decode/wer_12 -------------------------------------------------------------------------------- /egs/vivos/__init__.py: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/vivos/__init__.py -------------------------------------------------------------------------------- /egs/vivos/analyze.py: -------------------------------------------------------------------------------- 1 | from model import transcript 2 | from os.path import join, dirname 3 | from extension.analyze import WERAnalyzeLogger 4 | 5 | corpus_folder = join(dirname(dirname(dirname(__file__))), "data", "vivos", 6 | "corpus") 7 | 8 | 9 | def load_test(): 10 | lines = open(join(corpus_folder, "test", "text")).read().splitlines() 11 | lines = [line.split("|") for line in lines] 12 | wavs = [line[0] for line in lines] 13 | wavs = ["{}/test/wav/{}.wav".format(corpus_folder, wav) for wav in wavs] 14 | texts = [line[1] for line in lines] 15 | return wavs, texts 16 | 17 | 18 | wavs_test, texts_test = load_test() 19 | # texts_pred = [""] * len(texts_test) 20 | texts_pred = [transcript(wav_file) for wav_file in wavs_test] 21 | 22 | log_folder = join(dirname(__file__), "analyze") 23 | 24 | WERAnalyzeLogger.log(wavs_test, texts_test, texts_pred, log_folder=log_folder) -------------------------------------------------------------------------------- /egs/vivos/extension/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/vivos/extension/__init__.py -------------------------------------------------------------------------------- /egs/vivos/extension/analyze.py: -------------------------------------------------------------------------------- 1 | import json 2 | import shutil 3 | from extension.metrics import calculate_wer 4 | from os.path import join, basename 5 | import os 6 | from underthesea.util.file_io import write 7 | 
import numpy as np 8 | 9 | 10 | class WERAnalyzeLogger: 11 | @staticmethod 12 | def log(wavs_test, texts_test, texts_pred, log_folder): 13 | wer = np.mean([calculate_wer(test.split(), pred.split()) 14 | for test, pred in zip(texts_test, texts_pred)]) 15 | wer = np.round(wer, 4) 16 | result = { 17 | "WER": wer 18 | } 19 | content = json.dumps(result, ensure_ascii=False) 20 | log_file = join(log_folder, "result.json") 21 | write(log_file, content) 22 | wav_folder = join(log_folder, "wav") 23 | try: 24 | shutil.rmtree(wav_folder) 25 | except: 26 | pass 27 | finally: 28 | os.mkdir(wav_folder) 29 | for wav in wavs_test: 30 | new_path = join(wav_folder, basename(wav)) 31 | shutil.copyfile(wav, new_path) 32 | wavs_test_new_path = [join("wav", basename(wav)) for wav in wavs_test] 33 | speech_recognition = { 34 | "texts_test": texts_test, 35 | "texts_pred": texts_pred, 36 | "wavs_test": wavs_test_new_path, 37 | } 38 | content = json.dumps(speech_recognition, ensure_ascii=False) 39 | log_file = join(log_folder, "speechrecognition.json") 40 | write(log_file, content) 41 | 42 | print("Result is written in {}".format(log_file)) 43 | print("WER: {}%".format(wer * 100)) 44 | -------------------------------------------------------------------------------- /egs/vivos/extension/cmd.sh: -------------------------------------------------------------------------------- 1 | # Setting local system jobs (local CPU - no external clusters) 2 | export train_cmd=run.pl 3 | export decode_cmd=run.pl -------------------------------------------------------------------------------- /egs/vivos/extension/export.py: -------------------------------------------------------------------------------- 1 | import shutil 2 | from os.path import join 3 | 4 | 5 | class SphinxSpeechRecognitionExporter: 6 | @staticmethod 7 | def export(model, export_folder): 8 | tmp_folder = model.tmp_folder 9 | try: 10 | shutil.rmtree(join(export_folder, "etc")) 11 | except: 12 | pass 13 | finally: 14 | 
shutil.copytree(join(tmp_folder, "etc"), 15 | join(export_folder, "etc")) 16 | 17 | try: 18 | shutil.rmtree(join(export_folder, "model_parameters")) 19 | except: 20 | pass 21 | finally: 22 | shutil.copytree(join(tmp_folder, "model_parameters"), 23 | join(export_folder, "model_parameters")) 24 | -------------------------------------------------------------------------------- /egs/vivos/extension/metrics.py: -------------------------------------------------------------------------------- 1 | def calculate_wer(reference, hypothesis): 2 | """ 3 | Calculation of WER with Levenshtein distance. 4 | Works only for iterables up to 254 elements (uint8). 5 | O(nm) time and space complexity. 6 | 7 | >>> calculate_wer("who is there".split(), "is there".split()) 8 | 1 9 | >>> calculate_wer("who is there".split(), "".split()) 10 | 3 11 | >>> calculate_wer("".split(), "who is there".split()) 12 | 3 13 | """ 14 | # initialisation 15 | import numpy 16 | d = numpy.zeros((len(reference) + 1) * (len(hypothesis) + 1), 17 | dtype=numpy.uint8) 18 | d = d.reshape((len(reference) + 1, len(hypothesis) + 1)) 19 | for i in range(len(reference) + 1): 20 | for j in range(len(hypothesis) + 1): 21 | if i == 0: 22 | d[0][j] = j 23 | elif j == 0: 24 | d[i][0] = i 25 | 26 | # computation 27 | for i in range(1, len(reference) + 1): 28 | for j in range(1, len(hypothesis) + 1): 29 | if reference[i - 1] == hypothesis[j - 1]: 30 | d[i][j] = d[i - 1][j - 1] 31 | else: 32 | substitution = d[i - 1][j - 1] + 1 33 | insertion = d[i][j - 1] + 1 34 | deletion = d[i - 1][j] + 1 35 | d[i][j] = min(substitution, insertion, deletion) 36 | 37 | return d[len(reference)][len(hypothesis)] / float(len(reference)) 38 | 39 | 40 | import unittest 41 | assertions = unittest.TestCase('__init__') 42 | 43 | if __name__ == '__main__': 44 | s = calculate_wer("khach san".split(), "khach san cua toi".split()) 45 | assertions.assertAlmostEqual(s, 1) 46 | s = calculate_wer("khach san cua".split(), "khach san cua toi".split()) 47 | 
assertions.assertAlmostEqual(s, 0.333, 3) 48 | -------------------------------------------------------------------------------- /egs/vivos/extension/model_sphinx.py: -------------------------------------------------------------------------------- 1 | import shutil 2 | import os 3 | import text 4 | 5 | N = 10000 6 | 7 | 8 | class SphinxSpeechRecognition: 9 | def __init__(self, corpus_folder, tmp_folder): 10 | print("Initial Sphinx Speech Recognition") 11 | self.corpus_folder = corpus_folder 12 | self.tmp_folder = tmp_folder 13 | try: 14 | shutil.rmtree(tmp_folder) 15 | except Exception as e: 16 | pass 17 | finally: 18 | os.mkdir(tmp_folder) 19 | os.system("cd {}; sphinxtrain -t tmp setup".format(tmp_folder)) 20 | self._init_data() 21 | self._change_config() 22 | self._make_transcription() 23 | self._make_dictionary() 24 | self._make_filler() 25 | self._make_language_model() 26 | 27 | # ========================== # 28 | # Init Data 29 | # ========================== # 30 | def _init_data(self): 31 | os.system("cd {}; mkdir wav".format(self.tmp_folder)) 32 | os.system("cd {}; mkdir wav/train".format(self.tmp_folder)) 33 | os.system("cd {}; mkdir wav/test".format(self.tmp_folder)) 34 | 35 | ids = open( 36 | "{}/train/text".format(self.corpus_folder)).read().splitlines()[:N] 37 | ids = [item.split("|")[0] for item in ids] 38 | for id in ids: 39 | shutil.copy2( 40 | "{}/train/wav/{}.wav".format(self.corpus_folder, id), 41 | "{}/wav/train/{}.wav".format(self.tmp_folder, id) 42 | ) 43 | 44 | ids = ["train/{}".format(id) for id in ids] 45 | ids.append("") 46 | content = "\n".join(ids) 47 | open(os.path.join(self.tmp_folder, "etc", "tmp_train.fileids"), 48 | "w").write(content) 49 | 50 | ids = open( 51 | "{}/test/text".format(self.corpus_folder)).read().splitlines() 52 | ids = [item.split("|")[0] for item in ids] 53 | for id in ids: 54 | shutil.copy2( 55 | "{}/test/wav/{}.wav".format(self.corpus_folder, id), 56 | "{}/wav/test/{}.wav".format(self.tmp_folder, id) 57 | ) 58 | 
ids = ["test/{}".format(id) for id in ids] 59 | ids.append("") 60 | content = "\n".join(ids) 61 | open(os.path.join(self.tmp_folder, "etc", "tmp_test.fileids"), 62 | "w").write(content) 63 | 64 | # ========================== # 65 | # Config 66 | # ========================== # 67 | def _change_config(self): 68 | config_file = os.path.join(self.tmp_folder, "etc", "sphinx_train.cfg") 69 | config = SphinxConfig(config_file) 70 | config.set("$CFG_BASE_DIR", "\".\"") 71 | config.set("$CFG_WAVFILE_SRATE", 8000.0) 72 | config.set("$CFG_NUM_FILT", 31) 73 | config.set("$CFG_LO_FILT", 200) 74 | config.set("$CFG_HI_FILT", 3500) 75 | config.set("$CFG_WAVFILE_TYPE", "'raw'") 76 | config.set("$CFG_LANGUAGEMODEL", 77 | "\"$CFG_LIST_DIR/$CFG_DB_NAME.lm\"") 78 | config.set("$DEC_CFG_LANGUAGEMODEL", 79 | "\"$CFG_BASE_DIR/etc/${CFG_DB_NAME}.lm\"") 80 | 81 | # ========================== # 82 | # Transcription 83 | # ========================== # 84 | def _convert_transcription(self, in_file, out_file): 85 | lines = open(in_file).read().splitlines()[:N] 86 | output = [] 87 | for line in lines: 88 | fileid, word = line.split("|") 89 | phone = text.word2phone(word) 90 | content = " {} ({})".format(phone, fileid) 91 | output.append(content) 92 | output.append("") 93 | content = "\n".join(output) 94 | open(out_file, "w").write(content) 95 | 96 | def _make_transcription(self): 97 | self._convert_transcription( 98 | "{}/train/text".format(self.corpus_folder), 99 | "{}/etc/tmp_train.transcription".format(self.tmp_folder)) 100 | self._convert_transcription( 101 | "{}/test/text".format(self.corpus_folder), 102 | "{}/etc/tmp_test.transcription".format(self.tmp_folder)) 103 | 104 | # ============================== # 105 | # Create dictionary and phones 106 | # ============================== # 107 | def _make_dictionary(self): 108 | lines = open( 109 | "{}/train/text".format(self.corpus_folder)).read().splitlines()[:N] 110 | phones = [] 111 | for line in lines: 112 | fileid, word = line.split("|") 
113 | p = text.word2phone(word).split() 114 | phones += p 115 | phones = sorted(set(phones)) 116 | # create .dic files 117 | lines = [] 118 | phone_units = [] 119 | for p in phones: 120 | units = list(p) 121 | phone_units += units 122 | units = " ".join(units) 123 | line = "{:20s}{}".format(p, units) 124 | lines.append(line) 125 | open("{}/etc/tmp.dic".format(self.tmp_folder), "w").write( 126 | "\n".join(lines)) 127 | phone_units = sorted(set(phone_units)) 128 | phone_units.append("SIL") 129 | open("{}/etc/tmp.phone".format(self.tmp_folder), "w").write( 130 | "\n".join(phone_units)) 131 | 132 | def _make_filler(self): 133 | fillers = ["", "", ""] 134 | lines = ["{:20s}SIL".format(f) for f in fillers] 135 | open("{}/etc/tmp.filler".format(self.tmp_folder), "w").write( 136 | "\n".join(lines)) 137 | 138 | # ========================== # 139 | # Language Model 140 | # ========================== # 141 | def _make_cleaned_text(self): 142 | in_file = "{}/train/text".format(self.corpus_folder) 143 | out_file = "{}/etc/text".format(self.tmp_folder) 144 | lines = open(in_file).read().splitlines()[:N] 145 | output = [] 146 | for line in lines: 147 | fileid, word = line.split("|") 148 | phone = text.word2phone(word) 149 | content = " {} ".format(phone, fileid) 150 | output.append(content) 151 | content = "\n".join(output) 152 | open(out_file, "w").write(content) 153 | 154 | def _make_language_model(self): 155 | self._make_cleaned_text() 156 | etc_folder = os.path.join(self.tmp_folder, "etc") 157 | chdir = "cd {}; ".format(etc_folder) 158 | os.system(chdir + "text2wfreq < text | wfreq2vocab > vocab") 159 | os.system(chdir + "text2idngram -vocab vocab -idngram idngram < text") 160 | os.system( 161 | chdir + "idngram2lm -vocab_type 0 -idngram idngram -vocab vocab -arpa tmp.lm") 162 | 163 | def fit(self): 164 | chdir = "cd {}; ".format(self.tmp_folder) 165 | os.system(chdir + "sphinxtrain run") 166 | 167 | def predict(self, wav_file): 168 | command = "pocketsphinx_continuous -hmm 
{}/model_parameters/tmp.cd_cont_200 -samprate 8000 -lm {}/etc/tmp.lm -dict {}/etc/tmp.dic -infile {} -logfn yes".format( 169 | self.tmp_folder, self.tmp_folder, self.tmp_folder, wav_file) 170 | output = os.popen(command).read().strip() 171 | output = text.phone2word(output) 172 | return output 173 | 174 | 175 | class SphinxConfig: 176 | def __init__(self, config_file): 177 | self.file = config_file 178 | self.lines = open(config_file).read().splitlines() 179 | 180 | def save(self): 181 | content = "\n".join(self.lines) 182 | open(self.file, "w").write(content) 183 | 184 | def set(self, key, value): 185 | for i, line in enumerate(self.lines): 186 | if line.startswith(key): 187 | content = "{} = {};".format(key, value) 188 | self.lines[i] = content 189 | self.save() 190 | -------------------------------------------------------------------------------- /egs/vivos/extension/path.sh: -------------------------------------------------------------------------------- 1 | # Defining Kaldi root directory 2 | export KALDI_ROOT=`pwd`/../.. 3 | 4 | # Setting paths to useful tools 5 | export PATH=$PWD/utils/:$KALDI_ROOT/src/bin:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/src/fstbin/:$KALDI_ROOT/src/gmmbin/:$KALDI_ROOT/src/featbin/:$KALDI_ROOT/src/lmbin/:$KALDI_ROOT/src/sgmm2bin/:$KALDI_ROOT/src/fgmmbin/:$KALDI_ROOT/src/latbin/:$PWD:$PATH 6 | 7 | # Defining audio data directory (modify it for your installation directory!) 8 | export DATA_ROOT=`pwd`/audio 9 | 10 | # Enable SRILM 11 | . $KALDI_ROOT/tools/env.sh 12 | 13 | # Variable needed for proper data sorting 14 | export LC_ALL=C -------------------------------------------------------------------------------- /egs/vivos/extension/run_deltadelta.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | . ./path.sh || exit 1 4 | . 
./cmd.sh || exit 1 5 | 6 | EXP_START=$(date +%s); 7 | 8 | nj=1 # number of parallel jobs 9 | lm_order=1 # language model order (n-gram quantity) 10 | 11 | # Safety mechanism (possible running this script with modified arguments) 12 | . utils/parse_options.sh || exit 1 13 | [[ $# -ge 1 ]] && { echo "Wrong arguments!"; exit 1; } 14 | 15 | # Removing previously created data (from last run.sh execution) 16 | rm -rf exp mfcc data/train/spk2utt data/train/cmvn.scp data/train/feats.scp data/train/split1 data/test/spk2utt data/test/cmvn.scp data/test/feats.scp data/test/split1 data/local/lang data/lang data/local/tmp data/local/dict/lexiconp.txt 17 | 18 | 19 | 20 | echo 21 | echo "===== PREPARING ACOUSTIC DATA =====" 22 | echo 23 | 24 | # Needs to be prepared by hand (or using self written scripts): 25 | # 26 | # spk2gender [ ] 27 | # wav.scp [ ] 28 | # text [ ] 29 | # utt2spk [ ] 30 | # corpus.txt [] 31 | 32 | # Making spk2utt files 33 | utils/utt2spk_to_spk2utt.pl data/train/utt2spk > data/train/spk2utt 34 | utils/utt2spk_to_spk2utt.pl data/test/utt2spk > data/test/spk2utt 35 | 36 | 37 | echo 38 | echo "===== FEATURES EXTRACTION =====" 39 | echo 40 | 41 | # Making feats.scp files 42 | mfccdir=mfcc 43 | # Uncomment and modify arguments in scripts below if you have any problems with data sorting 44 | # utils/validate_data_dir.sh data/train # script for checking prepared data - here: for data/train directory 45 | # utils/fix_data_dir.sh data/train # tool for data proper sorting if needed - here: for data/train directory 46 | steps/make_mfcc.sh --nj $nj --cmd "$train_cmd" data/train exp/make_mfcc/train $mfccdir 47 | steps/make_mfcc.sh --nj $nj --cmd "$train_cmd" data/test exp/make_mfcc/test $mfccdir 48 | 49 | 50 | # Making cmvn.scp files 51 | steps/compute_cmvn_stats.sh data/train exp/make_mfcc/train $mfccdir 52 | steps/compute_cmvn_stats.sh data/test exp/make_mfcc/test $mfccdir 53 | 54 | echo 55 | echo "===== PREPARING LANGUAGE DATA =====" 56 | echo 57 | 58 | # Needs to be 
prepared by hand (or using self written scripts): 59 | # 60 | # lexicon.txt [ ...] 61 | # nonsilence_phones.txt [] 62 | # silence_phones.txt [] 63 | # optional_silence.txt [] 64 | 65 | # Preparing language data 66 | utils/prepare_lang.sh data/local/dict "" data/local/lang data/lang 67 | 68 | echo 69 | echo "===== LANGUAGE MODEL CREATION =====" 70 | echo "===== MAKING lm.arpa =====" 71 | echo 72 | 73 | loc=`which ngram-count`; 74 | if [ -z $loc ]; then 75 | if uname -a | grep 64 >/dev/null; then 76 | sdir=$KALDI_ROOT/tools/srilm/bin/i686-m64 77 | else 78 | sdir=$KALDI_ROOT/tools/srilm/bin/i686 79 | fi 80 | if [ -f $sdir/ngram-count ]; then 81 | echo "Using SRILM language modelling tool from $sdir" 82 | export PATH=$PATH:$sdir 83 | else 84 | echo "SRILM toolkit is probably not installed. 85 | Instructions: tools/install_srilm.sh" 86 | exit 1 87 | fi 88 | fi 89 | 90 | local=data/local 91 | mkdir $local/tmp 92 | ngram-count -order $lm_order -write-vocab $local/tmp/vocab-full.txt -wbdiscount -text $local/corpus.txt -lm $local/tmp/lm.arpa 93 | 94 | echo 95 | echo "===== MAKING G.fst =====" 96 | echo 97 | 98 | lang=data/lang 99 | arpa2fst --disambig-symbol=#0 --read-symbol-table=$lang/words.txt $local/tmp/lm.arpa $lang/G.fst 100 | 101 | echo 102 | echo "===== MONO TRAINING =====" 103 | echo 104 | 105 | START=$(date +%s); 106 | steps/train_mono.sh --nj $nj \ 107 | --cmd "$train_cmd" data/train data/lang exp/mono || exit 1 108 | END=$(date +%s); 109 | MONO_TRAINING_TIME=$((END - START)) 110 | 111 | echo 112 | echo "===== MONO DECODING =====" 113 | echo 114 | 115 | START=$(date +%s); 116 | utils/mkgraph.sh --mono data/lang exp/mono exp/mono/graph || exit 1 117 | # steps/decode.sh --config conf/decode.config --nj 1 --cmd "$decode_cmd" \ 118 | # exp/mono/graph data/test exp/mono/decode 119 | END=$(date +%s); 120 | MONO_DECODING_TIME=$((END - START)) 121 | 122 | echo 123 | echo "===== MONO ALIGNMENT =====" 124 | echo 125 | 126 | START=$(date +%s); 127 | steps/align_si.sh --nj 
$nj --cmd "$train_cmd" \ 128 | data/train data/lang exp/mono exp/mono_ali || exit 1 129 | END=$(date +%s); 130 | MONO_ALIGNMENT_TIME=$((END - START)) 131 | 132 | echo 133 | echo "===== TRI1 (first triphone pass) TRAINING =====" 134 | echo 135 | 136 | START=$(date +%s); 137 | steps/train_deltas.sh --cmd "$train_cmd" 2500 20000 \ 138 | data/train data/lang exp/mono_ali exp/tri1 || exit 1 139 | END=$(date +%s); 140 | TRI1_TRAINING_TIME=$((END - START)) 141 | 142 | echo 143 | echo "===== TRI1 (first triphone pass) DECODING =====" 144 | echo 145 | 146 | START=$(date +%s); 147 | utils/mkgraph.sh data/lang exp/tri1 exp/tri1/graph || exit 1 148 | # steps/decode.sh --config conf/decode.config --nj 1 --cmd "$decode_cmd" \ 149 | # exp/tri1/graph data/test exp/tri1/decode 150 | END=$(date +%s); 151 | TRI1_DECODING_TIME=$((END - START)) 152 | 153 | echo 154 | echo "===== TRI1 ALIGNMENT =====" 155 | echo 156 | 157 | START=$(date +%s); 158 | steps/align_si.sh --nj $nj --cmd "$train_cmd" \ 159 | data/train data/lang exp/tri1 exp/tri1_ali || exit 1; 160 | END=$(date +%s); 161 | TRI1_ALIGNMENT_TIME=$((END - START)) 162 | 163 | echo 164 | echo "===== TRI2A TRAINING =====" 165 | echo 166 | 167 | START=$(date +%s); 168 | steps/train_deltas.sh --cmd "$train_cmd" 2500 20000 \ 169 | data/train data/lang exp/tri1_ali exp/tri2a || exit 1 170 | END=$(date +%s); 171 | TRI2A_TRAINING_TIME=$((END - START)) 172 | 173 | echo 174 | echo "===== TRI2A DECODING =====" 175 | echo 176 | 177 | START=$(date +%s); 178 | utils/mkgraph.sh data/lang exp/tri2a exp/tri2a/graph || exit 1 179 | steps/decode.sh --config conf/decode.config --nj 1 --cmd "$decode_cmd" \ 180 | exp/tri2a/graph data/test exp/tri2a/decode 181 | END=$(date +%s); 182 | TRI2A_DECODING_TIME=$((END - START)) 183 | 184 | echo 185 | echo "===== TRI2A ALIGNMENT =====" 186 | echo 187 | 188 | START=$(date +%s); 189 | steps/align_si.sh --nj $nj --cmd "$train_cmd" \ 190 | data/train data/lang exp/tri2a exp/tri2a_ali || exit 1; 191 | END=$(date 
+%s); 192 | TRI2A_ALIGNMENT_TIME=$((END - START)) 193 | 194 | echo 195 | echo "===== run.sh script is finished =====" 196 | echo 197 | 198 | EXP_END=$(date +%s); 199 | EXP_TIME=$((EXP_END - EXP_START)) 200 | 201 | log_file='exp.log' 202 | echo "" > $log_file 203 | echo "===== Time Report =====" >> $log_file 204 | echo "Mono" >> $log_file 205 | echo $MONO_TRAINING_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file 206 | echo $MONO_DECODING_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file 207 | echo $MONO_ALIGNMENT_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file 208 | 209 | echo "Tri1" >> $log_file 210 | echo $TRI1_TRAINING_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file 211 | echo $TRI1_DECODING_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file 212 | echo $TRI1_ALIGNMENT_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file 213 | 214 | echo "Tri2a" >> $log_file 215 | echo $TRI2A_TRAINING_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file 216 | echo $TRI2A_DECODING_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file 217 | echo $TRI2A_ALIGNMENT_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file 218 | 219 | echo "Total time:" >> $log_file 220 | echo $EXP_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file 221 | 222 | echo -e "\n" >> $log_file 223 | echo "===== Score Report =====" >> $log_file 224 | echo "Best WER" >> $log_file 225 | for x in exp/*/decode*; do [ -d $x ] && [[ $x =~ "$1" ]] && grep WER $x/wer_* | utils/best_wer.sh; done >> $log_file 226 | 227 | echo -e "\n" >> $log_file 228 | 229 | cat $log_file 230 | -------------------------------------------------------------------------------- /egs/vivos/extension/run_lda_mllt.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | . ./path.sh || exit 1 4 | . 
./cmd.sh || exit 1 5 | 6 | EXP_START=$(date +%s); 7 | 8 | nj=1 # number of parallel jobs 9 | lm_order=1 # language model order (n-gram quantity) 10 | 11 | # Safety mechanism (possible running this script with modified arguments) 12 | . utils/parse_options.sh || exit 1 13 | [[ $# -ge 1 ]] && { echo "Wrong arguments!"; exit 1; } 14 | 15 | # Removing previously created data (from last run.sh execution) 16 | rm -rf exp mfcc data/train/spk2utt data/train/cmvn.scp data/train/feats.scp data/train/split1 data/test/spk2utt data/test/cmvn.scp data/test/feats.scp data/test/split1 data/local/lang data/lang data/local/tmp data/local/dict/lexiconp.txt 17 | 18 | echo 19 | echo "===== PREPARING ACOUSTIC DATA =====" 20 | echo 21 | 22 | # Needs to be prepared by hand (or using self written scripts): 23 | # 24 | # spk2gender [ ] 25 | # wav.scp [ ] 26 | # text [ ] 27 | # utt2spk [ ] 28 | # corpus.txt [] 29 | 30 | # Making spk2utt files 31 | utils/utt2spk_to_spk2utt.pl data/train/utt2spk > data/train/spk2utt 32 | utils/utt2spk_to_spk2utt.pl data/test/utt2spk > data/test/spk2utt 33 | 34 | echo 35 | echo "===== FEATURES EXTRACTION =====" 36 | echo 37 | 38 | # Making feats.scp files 39 | mfccdir=mfcc 40 | # Uncomment and modify arguments in scripts below if you have any problems with data sorting 41 | # utils/validate_data_dir.sh data/train # script for checking prepared data - here: for data/train directory 42 | # utils/fix_data_dir.sh data/train # tool for data proper sorting if needed - here: for data/train directory 43 | steps/make_mfcc.sh --nj $nj --cmd "$train_cmd" data/train exp/make_mfcc/train $mfccdir 44 | steps/make_mfcc.sh --nj $nj --cmd "$train_cmd" data/test exp/make_mfcc/test $mfccdir 45 | 46 | # Making cmvn.scp files 47 | steps/compute_cmvn_stats.sh data/train exp/make_mfcc/train $mfccdir 48 | steps/compute_cmvn_stats.sh data/test exp/make_mfcc/test $mfccdir 49 | 50 | echo 51 | echo "===== PREPARING LANGUAGE DATA =====" 52 | echo 53 | 54 | # Needs to be prepared by hand (or 
using self written scripts): 55 | # 56 | # lexicon.txt [ ...] 57 | # nonsilence_phones.txt [] 58 | # silence_phones.txt [] 59 | # optional_silence.txt [] 60 | 61 | # Preparing language data 62 | utils/prepare_lang.sh data/local/dict "" data/local/lang data/lang 63 | 64 | echo 65 | echo "===== LANGUAGE MODEL CREATION =====" 66 | echo "===== MAKING lm.arpa =====" 67 | echo 68 | 69 | loc=`which ngram-count`; 70 | if [ -z $loc ]; then 71 | if uname -a | grep 64 >/dev/null; then 72 | sdir=$KALDI_ROOT/tools/srilm/bin/i686-m64 73 | else 74 | sdir=$KALDI_ROOT/tools/srilm/bin/i686 75 | fi 76 | if [ -f $sdir/ngram-count ]; then 77 | echo "Using SRILM language modelling tool from $sdir" 78 | export PATH=$PATH:$sdir 79 | else 80 | echo "SRILM toolkit is probably not installed. 81 | Instructions: tools/install_srilm.sh" 82 | exit 1 83 | fi 84 | fi 85 | 86 | local=data/local 87 | mkdir $local/tmp 88 | ngram-count -order $lm_order -write-vocab $local/tmp/vocab-full.txt -wbdiscount -text $local/corpus.txt -lm $local/tmp/lm.arpa 89 | 90 | echo 91 | echo "===== MAKING G.fst =====" 92 | echo 93 | 94 | lang=data/lang 95 | arpa2fst --disambig-symbol=#0 --read-symbol-table=$lang/words.txt $local/tmp/lm.arpa $lang/G.fst 96 | 97 | echo 98 | echo "===== MONO TRAINING =====" 99 | echo 100 | 101 | START=$(date +%s); 102 | steps/train_mono.sh --nj $nj \ 103 | --cmd "$train_cmd" data/train data/lang exp/mono || exit 1 104 | END=$(date +%s); 105 | MONO_TRAINING_TIME=$((END - START)) 106 | 107 | echo 108 | echo "===== MONO DECODING =====" 109 | echo 110 | 111 | START=$(date +%s); 112 | utils/mkgraph.sh --mono data/lang exp/mono exp/mono/graph || exit 1 113 | END=$(date +%s); 114 | MONO_DECODING_TIME=$((END - START)) 115 | 116 | echo 117 | echo "===== MONO ALIGNMENT =====" 118 | echo 119 | 120 | START=$(date +%s); 121 | steps/align_si.sh --nj $nj --cmd "$train_cmd" \ 122 | data/train data/lang exp/mono exp/mono_ali || exit 1 123 | END=$(date +%s); 124 | MONO_ALIGNMENT_TIME=$((END - START)) 125 | 
126 | echoalign 127 | echo "===== TRI1 (first triphone pass) TRAINING =====" 128 | echo 129 | 130 | START=$(date +%s); 131 | steps/train_deltas.sh --cmd "$train_cmd" 2500 20000 \ 132 | data/train data/lang exp/mono_ali exp/tri1 || exit 1 133 | END=$(date +%s); 134 | TRI1_TRAINING_TIME=$((END - START)) 135 | 136 | echo 137 | echo "===== TRI1 (first triphone pass) DECODING =====" 138 | echo 139 | 140 | START=$(date +%s); 141 | utils/mkgraph.sh data/lang exp/tri1 exp/tri1/graph || exit 1 142 | END=$(date +%s); 143 | TRI1_DECODING_TIME=$((END - START)) 144 | 145 | echo 146 | echo "===== TRI1 ALIGNMENT =====" 147 | echo 148 | 149 | START=$(date +%s); 150 | steps/align_si.sh --nj $nj --cmd "$train_cmd" \ 151 | data/train data/lang exp/tri1 exp/tri1_ali || exit 1; 152 | END=$(date +%s); 153 | TRI1_ALIGNMENT_TIME=$((END - START)) 154 | 155 | echo 156 | echo "===== TRI2A TRAINING =====" 157 | echo 158 | 159 | START=$(date +%s); 160 | steps/train_deltas.sh --cmd "$train_cmd" 2500 20000 \ 161 | data/train data/lang exp/tri1_ali exp/tri2a || exit 1 162 | END=$(date +%s); 163 | TRI2A_TRAINING_TIME=$((END - START)) 164 | 165 | echo 166 | echo "===== TRI2A DECODING =====" 167 | echo 168 | 169 | START=$(date +%s); 170 | utils/mkgraph.sh data/lang exp/tri2a exp/tri2a/graph || exit 1 171 | END=$(date +%s); 172 | TRI2A_DECODING_TIME=$((END - START)) 173 | 174 | echo 175 | echo "===== TRI2A ALIGNMENT =====" 176 | echo 177 | 178 | START=$(date +%s); 179 | steps/align_si.sh --nj $nj --cmd "$train_cmd" \ 180 | data/train data/lang exp/tri2a exp/tri2a_ali || exit 1; 181 | END=$(date +%s); 182 | TRI2A_ALIGNMENT_TIME=$((END - START)) 183 | 184 | echo 185 | echo "===== TRI3A TRAINING =====" 186 | echo 187 | 188 | START=$(date +%s); 189 | steps/train_lda_mllt.sh --cmd "$train_cmd" 2500 20000 \ 190 | data/train data/lang exp/tri2a_ali exp/tri3a || exit 1; 191 | END=$(date +%s); 192 | TRI3A_TRAINING_TIME=$((END - START)) 193 | 194 | echo 195 | echo "===== TRI3A DECODING =====" 196 | echo 197 | 
198 | START=$(date +%s); 199 | utils/mkgraph.sh data/lang exp/tri3a exp/tri3a/graph || exit 1 200 | steps/decode.sh --config conf/decode.config --nj 1 --cmd "$decode_cmd" \ 201 | exp/tri3a/graph data/test exp/tri3a/decode 202 | END=$(date +%s); 203 | TRI3A_DECODING_TIME=$((END - START)) 204 | 205 | echo 206 | echo "===== TRI3A ALIGNMENT =====" 207 | echo 208 | 209 | START=$(date +%s); 210 | steps/align_fmllr.sh --nj $nj --cmd "$train_cmd" \ 211 | data/train data/lang exp/tri3a exp/tri3a_ali || exit 1; 212 | END=$(date +%s); 213 | TRI3A_ALIGNMENT_TIME=$((END - START)) 214 | 215 | echo 216 | echo "===== run.sh script is finished =====" 217 | echo 218 | 219 | EXP_END=$(date +%s); 220 | EXP_TIME=$((EXP_END - EXP_START)) 221 | 222 | log_file='exp.log' 223 | echo "" > $log_file 224 | echo "===== Time Report =====" >> $log_file 225 | echo "Mono" >> $log_file 226 | echo $MONO_TRAINING_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file 227 | echo $MONO_DECODING_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file 228 | echo $MONO_ALIGNMENT_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file 229 | 230 | echo "Tri1" >> $log_file 231 | echo $TRI1_TRAINING_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file 232 | echo $TRI1_DECODING_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file 233 | echo $TRI1_ALIGNMENT_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file 234 | 235 | echo "Tri2a" >> $log_file 236 | echo $TRI2A_TRAINING_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file 237 | echo $TRI2A_DECODING_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file 238 | echo $TRI2A_ALIGNMENT_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file 239 | 240 | echo "Tri3a" >> $log_file 241 | echo $TRI3A_TRAINING_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file 242 | echo $TRI3A_DECODING_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file 243 | echo $TRI3A_ALIGNMENT_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file 244 
| 245 | echo "Total time:" >> $log_file 246 | echo $EXP_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file 247 | 248 | echo -e "\n" >> $log_file 249 | echo "===== Score Report =====" >> $log_file 250 | echo "Best WER" >> $log_file 251 | for x in exp/*/decode*; do [ -d $x ] && [[ $x =~ "$1" ]] && grep WER $x/wer_* | utils/best_wer.sh; done >> $log_file 252 | 253 | echo -e "\n" >> $log_file 254 | 255 | cat $log_file 256 | -------------------------------------------------------------------------------- /egs/vivos/extension/run_lda_mllt_decode.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | . ./path.sh || exit 1 4 | . ./cmd.sh || exit 1 5 | 6 | EXP_START=$(date +%s); 7 | 8 | nj=1 # number of parallel jobs 9 | lm_order=1 # language model order (n-gram quantity) 10 | 11 | # Safety mechanism (possible running this script with modified arguments) 12 | . utils/parse_options.sh || exit 1 13 | [[ $# -ge 1 ]] && { echo "Wrong arguments!"; exit 1; } 14 | 15 | # Removing previously created data (from last run.sh execution) 16 | rm -rf exp mfcc data/train/spk2utt data/train/cmvn.scp data/train/feats.scp data/train/split1 data/test/spk2utt data/test/cmvn.scp data/test/feats.scp data/test/split1 data/local/lang data/lang data/local/tmp data/local/dict/lexiconp.txt 17 | 18 | echo 19 | echo "===== PREPARING ACOUSTIC DATA =====" 20 | echo 21 | 22 | # Needs to be prepared by hand (or using self written scripts): 23 | # 24 | # spk2gender [ ] 25 | # wav.scp [ ] 26 | # text [ ] 27 | # utt2spk [ ] 28 | # corpus.txt [] 29 | 30 | # Making spk2utt files 31 | utils/utt2spk_to_spk2utt.pl data/train/utt2spk > data/train/spk2utt 32 | utils/utt2spk_to_spk2utt.pl data/test/utt2spk > data/test/spk2utt 33 | 34 | echo 35 | echo "===== FEATURES EXTRACTION =====" 36 | echo 37 | 38 | # Making feats.scp files 39 | mfccdir=mfcc 40 | # Uncomment and modify arguments in scripts below if you have any problems with data sorting 41 | # 
utils/validate_data_dir.sh data/train # script for checking prepared data - here: for data/train directory 42 | # utils/fix_data_dir.sh data/train # tool for data proper sorting if needed - here: for data/train directory 43 | steps/make_mfcc.sh --nj $nj --cmd "$train_cmd" data/train exp/make_mfcc/train $mfccdir 44 | steps/make_mfcc.sh --nj $nj --cmd "$train_cmd" data/test exp/make_mfcc/test $mfccdir 45 | 46 | # Making cmvn.scp files 47 | steps/compute_cmvn_stats.sh data/train exp/make_mfcc/train $mfccdir 48 | steps/compute_cmvn_stats.sh data/test exp/make_mfcc/test $mfccdir 49 | 50 | echo 51 | echo "===== PREPARING LANGUAGE DATA =====" 52 | echo 53 | 54 | # Needs to be prepared by hand (or using self written scripts): 55 | # 56 | # lexicon.txt [ ...] 57 | # nonsilence_phones.txt [] 58 | # silence_phones.txt [] 59 | # optional_silence.txt [] 60 | 61 | # Preparing language data 62 | utils/prepare_lang.sh data/local/dict "" data/local/lang data/lang 63 | 64 | echo 65 | echo "===== LANGUAGE MODEL CREATION =====" 66 | echo "===== MAKING lm.arpa =====" 67 | echo 68 | 69 | loc=`which ngram-count`; 70 | if [ -z $loc ]; then 71 | if uname -a | grep 64 >/dev/null; then 72 | sdir=$KALDI_ROOT/tools/srilm/bin/i686-m64 73 | else 74 | sdir=$KALDI_ROOT/tools/srilm/bin/i686 75 | fi 76 | if [ -f $sdir/ngram-count ]; then 77 | echo "Using SRILM language modelling tool from $sdir" 78 | export PATH=$PATH:$sdir 79 | else 80 | echo "SRILM toolkit is probably not installed. 
81 | Instructions: tools/install_srilm.sh" 82 | exit 1 83 | fi 84 | fi 85 | 86 | local=data/local 87 | mkdir $local/tmp 88 | ngram-count -order $lm_order -write-vocab $local/tmp/vocab-full.txt -wbdiscount -text $local/corpus.txt -lm $local/tmp/lm.arpa 89 | 90 | echo 91 | echo "===== MAKING G.fst =====" 92 | echo 93 | 94 | lang=data/lang 95 | arpa2fst --disambig-symbol=#0 --read-symbol-table=$lang/words.txt $local/tmp/lm.arpa $lang/G.fst 96 | 97 | echo 98 | echo "===== MONO TRAINING =====" 99 | echo 100 | 101 | START=$(date +%s); 102 | steps/train_mono.sh --nj $nj \ 103 | --cmd "$train_cmd" data/train data/lang exp/mono || exit 1 104 | END=$(date +%s); 105 | MONO_TRAINING_TIME=$((END - START)) 106 | 107 | echo 108 | echo "===== MONO DECODING =====" 109 | echo 110 | 111 | START=$(date +%s); 112 | utils/mkgraph.sh --mono data/lang exp/mono exp/mono/graph || exit 1 113 | steps/decode.sh --config conf/decode.config --nj 1 --cmd "$decode_cmd" \ 114 | exp/mono/graph data/test exp/mono/decode 115 | END=$(date +%s); 116 | MONO_DECODING_TIME=$((END - START)) 117 | 118 | echo 119 | echo "===== MONO ALIGNMENT =====" 120 | echo 121 | 122 | START=$(date +%s); 123 | steps/align_si.sh --nj $nj --cmd "$train_cmd" \ 124 | data/train data/lang exp/mono exp/mono_ali || exit 1 125 | END=$(date +%s); 126 | MONO_ALIGNMENT_TIME=$((END - START)) 127 | 128 | echo 129 | echo "===== TRI1 (first triphone pass) TRAINING =====" 130 | echo 131 | 132 | START=$(date +%s); 133 | steps/train_deltas.sh --cmd "$train_cmd" 2500 20000 \ 134 | data/train data/lang exp/mono_ali exp/tri1 || exit 1 135 | END=$(date +%s); 136 | TRI1_TRAINING_TIME=$((END - START)) 137 | 138 | echo 139 | echo "===== TRI1 (first triphone pass) DECODING =====" 140 | echo 141 | 142 | START=$(date +%s); 143 | utils/mkgraph.sh data/lang exp/tri1 exp/tri1/graph || exit 1 144 | steps/decode.sh --config conf/decode.config --nj 1 --cmd "$decode_cmd" \ 145 | exp/tri1/graph data/test exp/tri1/decode 146 | END=$(date +%s); 147 | 
TRI1_DECODING_TIME=$((END - START)) 148 | 149 | echo 150 | echo "===== TRI1 ALIGNMENT =====" 151 | echo 152 | 153 | START=$(date +%s); 154 | steps/align_si.sh --nj $nj --cmd "$train_cmd" \ 155 | data/train data/lang exp/tri1 exp/tri1_ali || exit 1; 156 | END=$(date +%s); 157 | TRI1_ALIGNMENT_TIME=$((END - START)) 158 | 159 | echo 160 | echo "===== TRI2A TRAINING =====" 161 | echo 162 | 163 | START=$(date +%s); 164 | steps/train_deltas.sh --cmd "$train_cmd" 2500 20000 \ 165 | data/train data/lang exp/tri1_ali exp/tri2a || exit 1 166 | END=$(date +%s); 167 | TRI2A_TRAINING_TIME=$((END - START)) 168 | 169 | echo 170 | echo "===== TRI2A DECODING =====" 171 | echo 172 | 173 | START=$(date +%s); 174 | utils/mkgraph.sh data/lang exp/tri2a exp/tri2a/graph || exit 1 175 | steps/decode.sh --config conf/decode.config --nj 1 --cmd "$decode_cmd" \ 176 | exp/tri2a/graph data/test exp/tri2a/decode 177 | END=$(date +%s); 178 | TRI2A_DECODING_TIME=$((END - START)) 179 | 180 | echo 181 | echo "===== TRI2A ALIGNMENT =====" 182 | echo 183 | 184 | START=$(date +%s); 185 | steps/align_si.sh --nj $nj --cmd "$train_cmd" \ 186 | data/train data/lang exp/tri2a exp/tri2a_ali || exit 1; 187 | END=$(date +%s); 188 | TRI2A_ALIGNMENT_TIME=$((END - START)) 189 | 190 | echo 191 | echo "===== TRI3A TRAINING =====" 192 | echo 193 | 194 | START=$(date +%s); 195 | steps/train_lda_mllt.sh --cmd "$train_cmd" 2500 20000 \ 196 | data/train data/lang exp/tri2a_ali exp/tri3a || exit 1; 197 | END=$(date +%s); 198 | TRI3A_TRAINING_TIME=$((END - START)) 199 | 200 | echo 201 | echo "===== TRI3A DECODING =====" 202 | echo 203 | 204 | START=$(date +%s); 205 | utils/mkgraph.sh data/lang exp/tri3a exp/tri3a/graph || exit 1 206 | steps/decode.sh --config conf/decode.config --nj 1 --cmd "$decode_cmd" \ 207 | exp/tri3a/graph data/test exp/tri3a/decode 208 | END=$(date +%s); 209 | TRI3A_DECODING_TIME=$((END - START)) 210 | 211 | echo 212 | echo "===== TRI3A ALIGNMENT =====" 213 | echo 214 | 215 | START=$(date +%s); 216 
| steps/align_fmllr.sh --nj $nj --cmd "$train_cmd" \ 217 | data/train data/lang exp/tri3a exp/tri3a_ali || exit 1; 218 | END=$(date +%s); 219 | TRI3A_ALIGNMENT_TIME=$((END - START)) 220 | 221 | echo 222 | echo "===== run.sh script is finished =====" 223 | echo 224 | 225 | EXP_END=$(date +%s); 226 | EXP_TIME=$((EXP_END - EXP_START)) 227 | 228 | log_file='exp.log' 229 | echo "" > $log_file 230 | echo "===== Time Report =====" >> $log_file 231 | echo "Mono" >> $log_file 232 | echo $MONO_TRAINING_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file 233 | echo $MONO_DECODING_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file 234 | echo $MONO_ALIGNMENT_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file 235 | 236 | echo "Tri1" >> $log_file 237 | echo $TRI1_TRAINING_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file 238 | echo $TRI1_DECODING_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file 239 | echo $TRI1_ALIGNMENT_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file 240 | 241 | echo "Tri2a" >> $log_file 242 | echo $TRI2A_TRAINING_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file 243 | echo $TRI2A_DECODING_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file 244 | echo $TRI2A_ALIGNMENT_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file 245 | 246 | echo "Tri3a" >> $log_file 247 | echo $TRI3A_TRAINING_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file 248 | echo $TRI3A_DECODING_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file 249 | echo $TRI3A_ALIGNMENT_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file 250 | 251 | echo "Total time:" >> $log_file 252 | echo $EXP_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file 253 | 254 | echo -e "\n" >> $log_file 255 | echo "===== Score Report =====" >> $log_file 256 | echo "Best WER" >> $log_file 257 | for x in exp/*/decode*; do [ -d $x ] && [[ $x =~ "$1" ]] && grep WER $x/wer_* | utils/best_wer.sh; done >> $log_file 258 | 259 | echo -e "\n" >> 
$log_file 260 | 261 | cat $log_file 262 | -------------------------------------------------------------------------------- /egs/vivos/extension/run_sat.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | . ./path.sh || exit 1 4 | . ./cmd.sh || exit 1 5 | 6 | EXP_START=$(date +%s); 7 | 8 | nj=1 # number of parallel jobs 9 | lm_order=1 # language model order (n-gram quantity) 10 | 11 | # Safety mechanism (possible running this script with modified arguments) 12 | . utils/parse_options.sh || exit 1 13 | [[ $# -ge 1 ]] && { echo "Wrong arguments!"; exit 1; } 14 | 15 | # Removing previously created data (from last run.sh execution) 16 | rm -rf exp mfcc data/train/spk2utt data/train/cmvn.scp data/train/feats.scp data/train/split1 data/test/spk2utt data/test/cmvn.scp data/test/feats.scp data/test/split1 data/local/lang data/lang data/local/tmp data/local/dict/lexiconp.txt 17 | 18 | echo 19 | echo "===== PREPARING ACOUSTIC DATA =====" 20 | echo 21 | 22 | # Needs to be prepared by hand (or using self written scripts): 23 | # 24 | # spk2gender [ ] 25 | # wav.scp [ ] 26 | # text [ ] 27 | # utt2spk [ ] 28 | # corpus.txt [] 29 | 30 | # Making spk2utt files 31 | utils/utt2spk_to_spk2utt.pl data/train/utt2spk > data/train/spk2utt 32 | utils/utt2spk_to_spk2utt.pl data/test/utt2spk > data/test/spk2utt 33 | 34 | echo 35 | echo "===== FEATURES EXTRACTION =====" 36 | echo 37 | 38 | # Making feats.scp files 39 | mfccdir=mfcc 40 | # Uncomment and modify arguments in scripts below if you have any problems with data sorting 41 | # utils/validate_data_dir.sh data/train # script for checking prepared data - here: for data/train directory 42 | # utils/fix_data_dir.sh data/train # tool for data proper sorting if needed - here: for data/train directory 43 | steps/make_mfcc.sh --nj $nj --cmd "$train_cmd" data/train exp/make_mfcc/train $mfccdir 44 | steps/make_mfcc.sh --nj $nj --cmd "$train_cmd" data/test exp/make_mfcc/test $mfccdir 45 | 
46 | # Making cmvn.scp files 47 | steps/compute_cmvn_stats.sh data/train exp/make_mfcc/train $mfccdir 48 | steps/compute_cmvn_stats.sh data/test exp/make_mfcc/test $mfccdir 49 | 50 | echo 51 | echo "===== PREPARING LANGUAGE DATA =====" 52 | echo 53 | 54 | # Needs to be prepared by hand (or using self written scripts): 55 | # 56 | # lexicon.txt [ ...] 57 | # nonsilence_phones.txt [] 58 | # silence_phones.txt [] 59 | # optional_silence.txt [] 60 | 61 | # Preparing language data 62 | utils/prepare_lang.sh data/local/dict "" data/local/lang data/lang 63 | 64 | echo 65 | echo "===== LANGUAGE MODEL CREATION =====" 66 | echo "===== MAKING lm.arpa =====" 67 | echo 68 | 69 | loc=`which ngram-count`; 70 | if [ -z $loc ]; then 71 | if uname -a | grep 64 >/dev/null; then 72 | sdir=$KALDI_ROOT/tools/srilm/bin/i686-m64 73 | else 74 | sdir=$KALDI_ROOT/tools/srilm/bin/i686 75 | fi 76 | if [ -f $sdir/ngram-count ]; then 77 | echo "Using SRILM language modelling tool from $sdir" 78 | export PATH=$PATH:$sdir 79 | else 80 | echo "SRILM toolkit is probably not installed. 
81 | Instructions: tools/install_srilm.sh" 82 | exit 1 83 | fi 84 | fi 85 | 86 | local=data/local 87 | mkdir $local/tmp 88 | ngram-count -order $lm_order -write-vocab $local/tmp/vocab-full.txt -wbdiscount -text $local/corpus.txt -lm $local/tmp/lm.arpa 89 | 90 | echo 91 | echo "===== MAKING G.fst =====" 92 | echo 93 | 94 | lang=data/lang 95 | arpa2fst --disambig-symbol=#0 --read-symbol-table=$lang/words.txt $local/tmp/lm.arpa $lang/G.fst 96 | 97 | echo 98 | echo "===== MONO TRAINING =====" 99 | echo 100 | 101 | START=$(date +%s); 102 | steps/train_mono.sh --nj $nj \ 103 | --cmd "$train_cmd" data/train data/lang exp/mono || exit 1 104 | END=$(date +%s); 105 | MONO_TRAINING_TIME=$((END - START)) 106 | 107 | echo 108 | echo "===== MONO DECODING =====" 109 | echo 110 | 111 | START=$(date +%s); 112 | utils/mkgraph.sh --mono data/lang exp/mono exp/mono/graph || exit 1 113 | 114 | END=$(date +%s); 115 | MONO_DECODING_TIME=$((END - START)) 116 | 117 | echo 118 | echo "===== MONO ALIGNMENT =====" 119 | echo 120 | 121 | START=$(date +%s); 122 | steps/align_si.sh --nj $nj --cmd "$train_cmd" \ 123 | data/train data/lang exp/mono exp/mono_ali || exit 1 124 | END=$(date +%s); 125 | MONO_ALIGNMENT_TIME=$((END - START)) 126 | 127 | echo 128 | echo "===== TRI1 (first triphone pass) TRAINING =====" 129 | echo 130 | 131 | START=$(date +%s); 132 | steps/train_deltas.sh --cmd "$train_cmd" 2500 20000 \ 133 | data/train data/lang exp/mono_ali exp/tri1 || exit 1 134 | END=$(date +%s); 135 | TRI1_TRAINING_TIME=$((END - START)) 136 | 137 | echo 138 | echo "===== TRI1 (first triphone pass) DECODING =====" 139 | echo 140 | 141 | START=$(date +%s); 142 | utils/mkgraph.sh data/lang exp/tri1 exp/tri1/graph || exit 1 143 | 144 | END=$(date +%s); 145 | TRI1_DECODING_TIME=$((END - START)) 146 | 147 | echo 148 | echo "===== TRI1 ALIGNMENT =====" 149 | echo 150 | 151 | START=$(date +%s); 152 | steps/align_si.sh --nj $nj --cmd "$train_cmd" \ 153 | data/train data/lang exp/tri1 exp/tri1_ali || exit 1; 
154 | END=$(date +%s); 155 | TRI1_ALIGNMENT_TIME=$((END - START)) 156 | 157 | echo 158 | echo "===== TRI2A TRAINING =====" 159 | echo 160 | 161 | START=$(date +%s); 162 | steps/train_deltas.sh --cmd "$train_cmd" 2500 20000 \ 163 | data/train data/lang exp/tri1_ali exp/tri2a || exit 1 164 | END=$(date +%s); 165 | TRI2A_TRAINING_TIME=$((END - START)) 166 | 167 | echo 168 | echo "===== TRI2A DECODING =====" 169 | echo 170 | 171 | START=$(date +%s); 172 | utils/mkgraph.sh data/lang exp/tri2a exp/tri2a/graph || exit 1 173 | 174 | END=$(date +%s); 175 | TRI2A_DECODING_TIME=$((END - START)) 176 | 177 | echo 178 | echo "===== TRI2A ALIGNMENT =====" 179 | echo 180 | 181 | START=$(date +%s); 182 | steps/align_si.sh --nj $nj --cmd "$train_cmd" \ 183 | data/train data/lang exp/tri2a exp/tri2a_ali || exit 1; 184 | END=$(date +%s); 185 | TRI2A_ALIGNMENT_TIME=$((END - START)) 186 | 187 | echo 188 | echo "===== TRI3A TRAINING =====" 189 | echo 190 | 191 | START=$(date +%s); 192 | steps/train_lda_mllt.sh --cmd "$train_cmd" 2500 20000 \ 193 | data/train data/lang exp/tri2a_ali exp/tri3a || exit 1; 194 | END=$(date +%s); 195 | TRI3A_TRAINING_TIME=$((END - START)) 196 | 197 | echo 198 | echo "===== TRI3A DECODING =====" 199 | echo 200 | 201 | START=$(date +%s); 202 | utils/mkgraph.sh data/lang exp/tri3a exp/tri3a/graph || exit 1 203 | 204 | END=$(date +%s); 205 | TRI3A_DECODING_TIME=$((END - START)) 206 | 207 | echo 208 | echo "===== TRI3A ALIGNMENT =====" 209 | echo 210 | 211 | START=$(date +%s); 212 | steps/align_fmllr.sh --nj $nj --cmd "$train_cmd" \ 213 | data/train data/lang exp/tri3a exp/tri3a_ali || exit 1; 214 | END=$(date +%s); 215 | TRI3A_ALIGNMENT_TIME=$((END - START)) 216 | 217 | 218 | echo 219 | echo "===== TRI4A TRAINING =====" 220 | echo 221 | 222 | START=$(date +%s); 223 | steps/train_sat.sh --cmd "$train_cmd" 2500 20000 \ 224 | data/train data/lang exp/tri3a_ali exp/tri4a || exit 1; 225 | END=$(date +%s); 226 | TRI4A_TRAINING_TIME=$((END - START)) 227 | 228 | echo 229 
| echo "===== TRI4A DECODING =====" 230 | echo 231 | 232 | START=$(date +%s); 233 | utils/mkgraph.sh data/lang exp/tri4a exp/tri4a/graph || exit 1 234 | 235 | END=$(date +%s); 236 | TRI4A_DECODING_TIME=$((END - START)) 237 | 238 | echo 239 | echo "===== TRI4A ALIGNMENT =====" 240 | echo 241 | 242 | START=$(date +%s); 243 | steps/align_fmllr.sh --nj $nj --cmd "$train_cmd" \ 244 | data/train data/lang exp/tri4a exp/tri4a_ali || exit 1; 245 | END=$(date +%s); 246 | TRI4A_ALIGNMENT_TIME=$((END - START)) 247 | 248 | echo 249 | echo "===== TRI5A TRAINING =====" 250 | echo 251 | 252 | START=$(date +%s); 253 | steps/train_sat.sh --cmd "$train_cmd" 3500 100000 \ 254 | data/train data/lang exp/tri4a_ali exp/tri5a || exit 1; 255 | END=$(date +%s); 256 | TRI5A_TRAINING_TIME=$((END - START)) 257 | 258 | echo 259 | echo "===== TRI5A DECODING =====" 260 | echo 261 | 262 | START=$(date +%s); 263 | utils/mkgraph.sh data/lang exp/tri5a exp/tri5a/graph || exit 1 264 | steps/decode.sh --config conf/decode.config --nj 1 --cmd "$decode_cmd" \ 265 | exp/tri5a/graph data/test exp/tri5a/decode 266 | END=$(date +%s); 267 | TRI5A_DECODING_TIME=$((END - START)) 268 | 269 | echo 270 | echo "===== TRI5A ALIGNMENT =====" 271 | echo 272 | 273 | START=$(date +%s); 274 | steps/align_fmllr.sh --nj $nj --cmd "$train_cmd" \ 275 | data/train data/lang exp/tri5a exp/tri5a_ali || exit 1; 276 | END=$(date +%s); 277 | TRI5A_ALIGNMENT_TIME=$((END - START)) 278 | 279 | echo 280 | echo "===== run.sh script is finished =====" 281 | echo 282 | 283 | EXP_END=$(date +%s); 284 | EXP_TIME=$((EXP_END - EXP_START)) 285 | 286 | log_file='exp.log' 287 | echo "" > $log_file 288 | echo "===== Time Report =====" >> $log_file 289 | echo "Mono" >> $log_file 290 | echo $MONO_TRAINING_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file 291 | echo $MONO_DECODING_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file 292 | echo $MONO_ALIGNMENT_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file 293 | 294 | echo 
"Tri1" >> $log_file 295 | echo $TRI1_TRAINING_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file 296 | echo $TRI1_DECODING_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file 297 | echo $TRI1_ALIGNMENT_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file 298 | 299 | echo "Tri2a" >> $log_file 300 | echo $TRI2A_TRAINING_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file 301 | echo $TRI2A_DECODING_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file 302 | echo $TRI2A_ALIGNMENT_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file 303 | 304 | echo "Tri3a" >> $log_file 305 | echo $TRI3A_TRAINING_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file 306 | echo $TRI3A_DECODING_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file 307 | echo $TRI3A_ALIGNMENT_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file 308 | 309 | echo "Tri4a" >> $log_file 310 | echo $TRI4A_TRAINING_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file 311 | echo $TRI4A_DECODING_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file 312 | echo $TRI4A_ALIGNMENT_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file 313 | 314 | echo "Tri5a" >> $log_file 315 | echo $TRI5A_TRAINING_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file 316 | echo $TRI5A_DECODING_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file 317 | echo $TRI5A_ALIGNMENT_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file 318 | 319 | echo "Total time:" >> $log_file 320 | echo $EXP_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file 321 | 322 | echo -e "\n" >> $log_file 323 | echo "===== Score Report =====" >> $log_file 324 | echo "Best WER" >> $log_file 325 | for x in exp/*/decode*; do [ -d $x ] && [[ $x =~ "$1" ]] && grep WER $x/wer_* | utils/best_wer.sh; done >> $log_file 326 | 327 | echo -e "\n" >> $log_file 328 | 329 | cat $log_file 330 | -------------------------------------------------------------------------------- /egs/vivos/extension/text.py: 
# -------------------------------------------------------------------------------
# /egs/vivos/extension/text.py
class PhoneConverter1:
    """Bidirectional converter between Vietnamese orthography and a
    Telex-like ASCII "phone" encoding (e.g. "cà phê" <-> "caf phee").

    Class attributes (the public interface):
        rules_1 -- per vowel: the base letter followed by its five
                   tone-marked forms, in the tone order of ``fsrxj``
                   (f=huyền, s=sắc, r=hỏi, x=ngã, j=nặng).
        rules_2 -- two ASCII letters + the special letter they spell
                   (e.g. "aw" -> 'ă', "dd" -> 'đ').
        w2p     -- single accented character -> ASCII phone string.
        p2w     -- exact inverse of ``w2p``.
    """

    rules_1 = [
        "aàáảãạ",
        "ăằắẳẵặ",
        "âầấẩẫậ",
        "eèéẻẽẹ",
        "êềếểễệ",
        "iìíỉĩị",
        "oòóỏõọ",
        "ôồốổỗộ",
        "ơờớởỡợ",
        "uùúủũụ",
        "ưừứửữự",
        "yỳýỷỹỵ",
    ]
    rules_2 = [
        "awă",
        "aaâ",
        "eeê",
        "ooô",
        "owơ",
        "uwư",
        "ddđ",
    ]

    # Build both lookup tables once, at class-creation time.
    w2p = {}
    for _group in rules_1:
        _base, _marked = _group[0], _group[1:]
        # Spell a special base letter with its ASCII digraph first
        # (e.g. 'ă' -> "aw"), so 'ằ' encodes as "aw" + "f" = "awf".
        for _rule in rules_2:
            if _base == _rule[2]:
                _base = _rule[:2]
        for _char, _tone in zip(_marked, "fsrxj"):
            w2p[_char] = _base + _tone
    # Tone-less special letters map straight to their digraphs.
    for _rule in rules_2:
        w2p[_rule[2]] = _rule[:2]
    # Inverse table; every phone string in w2p is unique, so no clashes.
    p2w = {_phone: _char for _char, _phone in w2p.items()}
    # Drop the loop temporaries so they do not leak into the class
    # namespace as spurious public attributes.
    del _group, _base, _marked, _rule, _char, _tone

    @staticmethod
    def word2phone(word):
        """Encode *word*: each accented character becomes its ASCII phone
        string; characters without a mapping pass through unchanged."""
        table = PhoneConverter1.w2p
        return "".join(table.get(ch, ch) for ch in word)

    @staticmethod
    def phone2word(phone):
        """Decode *phone* greedily: at each position try a 3-character
        phone (digraph + tone), then a 2-character one, otherwise copy
        the single character verbatim."""
        table = PhoneConverter1.p2w
        out = []
        i = 0
        n = len(phone)
        while i < n:
            for width in (3, 2):
                chunk = phone[i:i + width]
                if chunk in table:
                    out.append(table[chunk])
                    i += width
                    break
            else:
                out.append(phone[i])
                i += 1
        return "".join(out)


if __name__ == '__main__':
    # Round-trip sanity checks: each (word, phone) pair must convert
    # correctly in both directions.
    tests = [
        ("con hoẵng", "con hoawxng"),
        ("lựu đạn", "luwju ddajn"),
        ("kiểm tra", "kieerm tra"),
        ("ủy ban", "ury ban"),
        ("cà phê", "caf phee"),
        ("khách sạn", "khasch sajn"),
        ("đúng", "ddusng"),
        ("xã hội", "xax hooji"),
    ]
    for word, phone in tests:
        assert word == PhoneConverter1.phone2word(phone)
        assert phone == PhoneConverter1.word2phone(word)
# -------------------------------------------------------------------------------
# /egs/vivos/extension/transcript_deltadelta.sh: (next file in the dump)
#!/usr/bin/env bash
# -------------------------------------------------------------------------------
# /egs/vivos/extension/transcript_deltadelta.sh
#
# Decode the wav files listed in $transcript_folder/wav.scp with a trained
# delta + delta-delta GMM-HMM model (exp/tri2a):
#   audio -> MFCC features -> +deltas -> lattice -> best path -> words.
#
# FIX: the original carried a second, conflicting shebang ("#!/bin/bash")
# on line 2; the kernel only honours the first shebang, so the duplicate
# was removed.

. ./path.sh || exit 1
. ./cmd.sh || exit 1


model_folder=exp/tri2a            # trained tri2a (deltas) model directory
transcript_folder=transcriptions  # holds wav.scp for the audio to decode
output_folder=output

# Start from a clean output directory on every run.
rm -rf $output_folder
mkdir $output_folder

echo
echo "===== AUDIO -> FEATURE VECTORS ====="
echo

compute-mfcc-feats --config=conf/mfcc.conf \
    scp:$transcript_folder/wav.scp \
    ark,scp:$output_folder/feats.ark,$output_folder/feats.scp

# Append delta and delta-delta coefficients to match the tri2a features.
add-deltas \
    scp:$output_folder/feats.scp \
    ark:$output_folder/delta-feats.ark


echo
echo "===== TRAINED GMM-HMM + FEATURE VECTORS -> LATTICE ====="
echo

gmm-latgen-faster \
    --word-symbol-table=$model_folder/graph/words.txt \
    $model_folder/final.mdl \
    $model_folder/graph/HCLG.fst \
    ark:$output_folder/delta-feats.ark \
    ark,t:$output_folder/lattices.ark

echo
echo "===== LATTICE -> BEST PATH THROUGH LATTICE ====="
echo

lattice-best-path \
    --word-symbol-table=$model_folder/graph/words.txt \
    ark:$output_folder/lattices.ark \
    ark,t:$output_folder/one-best.tra

echo
echo "===== BEST PATH INTEGERS -> BEST PATH WORDS ====="
echo

# Map the word ids in the .tra file (fields 2..end) back to word strings.
utils/int2sym.pl -f 2- \
    $model_folder/graph/words.txt \
    $output_folder/one-best.tra \
    > $output_folder/one-best-hypothesis.txt

cat $output_folder/one-best-hypothesis.txt
# -------------------------------------------------------------------------------
# dump artifact retained from the flattened repository listing:
# /egs/vivos/extension/transcriptions/audio/R001.wav:
# https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/vivos/extension/transcriptions/audio/R001.wav
-------------------------------------------------------------------------------- /egs/vivos/extension/transcriptions/audio/R002.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/vivos/extension/transcriptions/audio/R002.wav -------------------------------------------------------------------------------- /egs/vivos/extension/transcriptions/audio/R003.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/vivos/extension/transcriptions/audio/R003.wav -------------------------------------------------------------------------------- /egs/vivos/extension/transcriptions/audio/R004.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/vivos/extension/transcriptions/audio/R004.wav -------------------------------------------------------------------------------- /egs/vivos/extension/transcriptions/audio/R005.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/vivos/extension/transcriptions/audio/R005.wav -------------------------------------------------------------------------------- /egs/vivos/extension/transcriptions/audio/t1_tat_ca.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/vivos/extension/transcriptions/audio/t1_tat_ca.wav 
from os.path import dirname
import os
import subprocess

import text


def transcript(wav_file):
    """Transcribe *wav_file* with the bundled PocketSphinx model.

    Decodes the audio with ``pocketsphinx_continuous`` using the acoustic
    model, language model and dictionary shipped next to this module,
    then maps the phone-level hypothesis back to Vietnamese words with
    ``text.phone2word``.

    :param wav_file: path to the input wav file (model expects 8 kHz)
    :return: the decoded transcript as a string
    :raises subprocess.CalledProcessError: if the decoder exits nonzero
    """
    tmp_folder = dirname(__file__)
    # PocketSphinx writes its (noisy) log here; removed again below.
    log_file = "{}/yes".format(tmp_folder)
    # Argument list instead of a shell string, so special characters in
    # the wav path cannot be interpreted by a shell.
    command = [
        "pocketsphinx_continuous",
        "-hmm", "{}/model_parameters/tmp.cd_cont_200".format(tmp_folder),
        "-samprate", "8000",
        "-lm", "{}/etc/tmp.lm".format(tmp_folder),
        "-dict", "{}/etc/tmp.dic".format(tmp_folder),
        "-infile", wav_file,
        "-logfn", log_file,
    ]
    try:
        output = subprocess.check_output(command).decode("utf-8").strip()
        return text.phone2word(output)
    finally:
        # Clean up the log even when decoding fails; the file may not
        # exist if pocketsphinx could not be started at all.
        if os.path.exists(log_file):
            os.remove(log_file)
-------------------------------------------------------------------------------- /egs/vivos/model/model_parameters/tmp.cd_cont_200/means: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/vivos/model/model_parameters/tmp.cd_cont_200/means -------------------------------------------------------------------------------- /egs/vivos/model/model_parameters/tmp.cd_cont_200/mixture_weights: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/vivos/model/model_parameters/tmp.cd_cont_200/mixture_weights -------------------------------------------------------------------------------- /egs/vivos/model/model_parameters/tmp.cd_cont_200/noisedict: -------------------------------------------------------------------------------- 1 | SIL 2 | SIL 3 | SIL -------------------------------------------------------------------------------- /egs/vivos/model/model_parameters/tmp.cd_cont_200/transition_matrices: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/vivos/model/model_parameters/tmp.cd_cont_200/transition_matrices -------------------------------------------------------------------------------- /egs/vivos/model/model_parameters/tmp.cd_cont_200/variances: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/vivos/model/model_parameters/tmp.cd_cont_200/variances -------------------------------------------------------------------------------- 
/egs/vivos/model/model_parameters/tmp.cd_cont_200_1/means: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/vivos/model/model_parameters/tmp.cd_cont_200_1/means -------------------------------------------------------------------------------- /egs/vivos/model/model_parameters/tmp.cd_cont_200_1/mixture_weights: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/vivos/model/model_parameters/tmp.cd_cont_200_1/mixture_weights -------------------------------------------------------------------------------- /egs/vivos/model/model_parameters/tmp.cd_cont_200_1/transition_matrices: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/vivos/model/model_parameters/tmp.cd_cont_200_1/transition_matrices -------------------------------------------------------------------------------- /egs/vivos/model/model_parameters/tmp.cd_cont_200_1/variances: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/vivos/model/model_parameters/tmp.cd_cont_200_1/variances -------------------------------------------------------------------------------- /egs/vivos/model/model_parameters/tmp.cd_cont_200_2/means: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/vivos/model/model_parameters/tmp.cd_cont_200_2/means 
-------------------------------------------------------------------------------- /egs/vivos/model/model_parameters/tmp.cd_cont_200_2/mixture_weights: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/vivos/model/model_parameters/tmp.cd_cont_200_2/mixture_weights -------------------------------------------------------------------------------- /egs/vivos/model/model_parameters/tmp.cd_cont_200_2/transition_matrices: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/vivos/model/model_parameters/tmp.cd_cont_200_2/transition_matrices -------------------------------------------------------------------------------- /egs/vivos/model/model_parameters/tmp.cd_cont_200_2/variances: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/vivos/model/model_parameters/tmp.cd_cont_200_2/variances -------------------------------------------------------------------------------- /egs/vivos/model/model_parameters/tmp.cd_cont_200_4/means: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/vivos/model/model_parameters/tmp.cd_cont_200_4/means -------------------------------------------------------------------------------- /egs/vivos/model/model_parameters/tmp.cd_cont_200_4/mixture_weights: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/vivos/model/model_parameters/tmp.cd_cont_200_4/mixture_weights -------------------------------------------------------------------------------- /egs/vivos/model/model_parameters/tmp.cd_cont_200_4/transition_matrices: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/vivos/model/model_parameters/tmp.cd_cont_200_4/transition_matrices -------------------------------------------------------------------------------- /egs/vivos/model/model_parameters/tmp.cd_cont_200_4/variances: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/vivos/model/model_parameters/tmp.cd_cont_200_4/variances -------------------------------------------------------------------------------- /egs/vivos/model/model_parameters/tmp.cd_cont_initial/means: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/vivos/model/model_parameters/tmp.cd_cont_initial/means -------------------------------------------------------------------------------- /egs/vivos/model/model_parameters/tmp.cd_cont_initial/mixture_weights: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/vivos/model/model_parameters/tmp.cd_cont_initial/mixture_weights -------------------------------------------------------------------------------- /egs/vivos/model/model_parameters/tmp.cd_cont_initial/transition_matrices: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/vivos/model/model_parameters/tmp.cd_cont_initial/transition_matrices -------------------------------------------------------------------------------- /egs/vivos/model/model_parameters/tmp.cd_cont_initial/variances: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/vivos/model/model_parameters/tmp.cd_cont_initial/variances -------------------------------------------------------------------------------- /egs/vivos/model/model_parameters/tmp.cd_cont_untied/feat.params: -------------------------------------------------------------------------------- 1 | -lowerf 200 2 | -upperf 3500 3 | -nfilt 31 4 | -transform dct 5 | -lifter 22 6 | -feat 1s_c_d_dd 7 | -agc none 8 | -cmn batch 9 | -varnorm no 10 | -------------------------------------------------------------------------------- /egs/vivos/model/model_parameters/tmp.cd_cont_untied/means: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/vivos/model/model_parameters/tmp.cd_cont_untied/means -------------------------------------------------------------------------------- /egs/vivos/model/model_parameters/tmp.cd_cont_untied/mixture_weights: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/vivos/model/model_parameters/tmp.cd_cont_untied/mixture_weights -------------------------------------------------------------------------------- 
/egs/vivos/model/model_parameters/tmp.cd_cont_untied/noisedict: -------------------------------------------------------------------------------- 1 | SIL 2 | SIL 3 | SIL -------------------------------------------------------------------------------- /egs/vivos/model/model_parameters/tmp.cd_cont_untied/transition_matrices: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/vivos/model/model_parameters/tmp.cd_cont_untied/transition_matrices -------------------------------------------------------------------------------- /egs/vivos/model/model_parameters/tmp.cd_cont_untied/variances: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/vivos/model/model_parameters/tmp.cd_cont_untied/variances -------------------------------------------------------------------------------- /egs/vivos/model/model_parameters/tmp.ci_cont/feat.params: -------------------------------------------------------------------------------- 1 | -lowerf 200 2 | -upperf 3500 3 | -nfilt 31 4 | -transform dct 5 | -lifter 22 6 | -feat 1s_c_d_dd 7 | -agc none 8 | -cmn batch 9 | -varnorm no 10 | -------------------------------------------------------------------------------- /egs/vivos/model/model_parameters/tmp.ci_cont/mdef: -------------------------------------------------------------------------------- 1 | # Generated by /usr/local/libexec/sphinxtrain/mk_mdef_gen on Sat Jan 6 09:51:27 2018 2 | 0.3 3 | 27 n_base 4 | 0 n_tri 5 | 108 n_state_map 6 | 81 n_tied_state 7 | 81 n_tied_ci_state 8 | 27 n_tied_tmat 9 | # 10 | # Columns definitions 11 | #base lft rt p attrib tmat ... state id's ... 
12 | 4 - - - n/a 0 0 1 2 N 13 | SIL - - - filler 1 3 4 5 N 14 | a - - - n/a 2 6 7 8 N 15 | b - - - n/a 3 9 10 11 N 16 | c - - - n/a 4 12 13 14 N 17 | d - - - n/a 5 15 16 17 N 18 | e - - - n/a 6 18 19 20 N 19 | f - - - n/a 7 21 22 23 N 20 | g - - - n/a 8 24 25 26 N 21 | h - - - n/a 9 27 28 29 N 22 | i - - - n/a 10 30 31 32 N 23 | j - - - n/a 11 33 34 35 N 24 | k - - - n/a 12 36 37 38 N 25 | l - - - n/a 13 39 40 41 N 26 | m - - - n/a 14 42 43 44 N 27 | n - - - n/a 15 45 46 47 N 28 | o - - - n/a 16 48 49 50 N 29 | p - - - n/a 17 51 52 53 N 30 | q - - - n/a 18 54 55 56 N 31 | r - - - n/a 19 57 58 59 N 32 | s - - - n/a 20 60 61 62 N 33 | t - - - n/a 21 63 64 65 N 34 | u - - - n/a 22 66 67 68 N 35 | v - - - n/a 23 69 70 71 N 36 | w - - - n/a 24 72 73 74 N 37 | x - - - n/a 25 75 76 77 N 38 | y - - - n/a 26 78 79 80 N 39 | -------------------------------------------------------------------------------- /egs/vivos/model/model_parameters/tmp.ci_cont/means: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/vivos/model/model_parameters/tmp.ci_cont/means -------------------------------------------------------------------------------- /egs/vivos/model/model_parameters/tmp.ci_cont/mixture_weights: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/vivos/model/model_parameters/tmp.ci_cont/mixture_weights -------------------------------------------------------------------------------- /egs/vivos/model/model_parameters/tmp.ci_cont/noisedict: -------------------------------------------------------------------------------- 1 | SIL 2 | SIL 3 | SIL -------------------------------------------------------------------------------- 
/egs/vivos/model/model_parameters/tmp.ci_cont/transition_matrices: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/vivos/model/model_parameters/tmp.ci_cont/transition_matrices -------------------------------------------------------------------------------- /egs/vivos/model/model_parameters/tmp.ci_cont/variances: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/vivos/model/model_parameters/tmp.ci_cont/variances -------------------------------------------------------------------------------- /egs/vivos/model/model_parameters/tmp.ci_cont_flatinitial/globalmean: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/vivos/model/model_parameters/tmp.ci_cont_flatinitial/globalmean -------------------------------------------------------------------------------- /egs/vivos/model/model_parameters/tmp.ci_cont_flatinitial/globalvar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/vivos/model/model_parameters/tmp.ci_cont_flatinitial/globalvar -------------------------------------------------------------------------------- /egs/vivos/model/model_parameters/tmp.ci_cont_flatinitial/means: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/vivos/model/model_parameters/tmp.ci_cont_flatinitial/means 
# Conversion between Vietnamese orthography and the flat ASCII phone
# encoding used by the acoustic model (a telex-like scheme: a digraph
# for each modified vowel, a trailing f/s/r/x/j letter for the tone).

# Each entry: a base vowel followed by its five tonal variants
# (huyền, sắc, hỏi, ngã, nặng — encoded as f, s, r, x, j).
rules_1 = [
    "aàáảãạ",
    "ăằắẳẵặ",
    "âầấẩẫậ",
    "eèéẻẽẹ",
    "êềếểễệ",
    "iìíỉĩị",
    "oòóỏõọ",
    "ôồốổỗộ",
    "ơờớởỡợ",
    "uùúủũụ",
    "ưừứửữự",
    "yỳýỷỹỵ"
]
# ASCII digraph (first two chars) for each modified base letter (third char).
rules_2 = [
    "awă",
    "aaâ",
    "eeê",
    "ooô",
    "owơ",
    "uwư",
    "ddđ"
]


def _build_maps():
    """Return the (word->phone, phone->word) single-character maps.

    Building the tables inside a helper keeps the loop variables out of
    the module namespace.
    """
    tones = "fsrxj"
    w2p = {}
    for group in rules_1:
        base, variants = group[0], group[1:]
        # A modified base letter ("ă", "ô", ...) is itself spelled with
        # its ASCII digraph in the phone encoding.
        for rule in rules_2:
            if base == rule[2]:
                base = rule[0:2]
        for i, variant in enumerate(variants):
            w2p[variant] = base + tones[i]
    for rule in rules_2:
        w2p[rule[2]] = rule[0:2]
    p2w = {phone: char for char, phone in w2p.items()}
    return w2p, p2w


w2p, p2w = _build_maps()


def word2phone(word):
    """Encode a word (or whole sentence) into its ASCII phone spelling.

    Characters without a mapping (plain consonants, spaces, ...) pass
    through unchanged.
    """
    return "".join(w2p.get(ch, ch) for ch in word)


def phone2word(phone):
    """Decode an ASCII phone spelling back into Vietnamese text.

    Greedily matches the longest known phone sequence (3 characters,
    then 2) at each position; unknown characters pass through unchanged.
    """
    i = 0
    word = ""
    while i < len(phone):
        if phone[i:i + 3] in p2w:
            word += p2w[phone[i:i + 3]]
            i += 3
        elif phone[i:i + 2] in p2w:
            word += p2w[phone[i:i + 2]]
            i += 2
        else:
            word += phone[i]
            i += 1
    return word


if __name__ == '__main__':
    tests = [
        ("con hoẵng", "con hoawxng"),
        ("lựu đạn", "luwju ddajn"),
        ("kiểm tra", "kieerm tra"),
        ("ủy ban", "ury ban"),
        ("cà phê", "caf phee"),
        ("khách sạn", "khasch sajn"),
        ("đúng", "ddusng"),
        ("xã hội", "xax hooji")
    ]
    for expected_word, expected_phone in tests:
        assert expected_word == phone2word(expected_phone)
        assert expected_phone == word2phone(expected_word)
method="delta", utils_path=None): 15 | # Model path usually is in etc at kaldi-trunk/egs/uts_{random_int}/exp 16 | model = model_path 17 | 18 | if not os.path.exists(os.path.join(model, "final.mdl")): 19 | raise Exception("Cannot find final.mdl model file with given model path.") 20 | if not os.path.exists(os.path.join(model, "graph")): 21 | raise Exception("Cannot find graph with given model path.") 22 | 23 | if utils_path is None: 24 | utils_path = os.path.join(os.path.dirname(os.path.dirname(model)), "utils") 25 | 26 | if not os.path.exists(os.path.join(utils_path, "int2sym.pl")): 27 | raise Exception( 28 | "Cannot find int2sym.pl file with given utils path, please make sure that you are provided correctly utils_path argument") 29 | 30 | # Prepare predict dir 31 | os.system("cd {}; rm -rf predict;".format(model)) 32 | os.system("cd {}; mkdir predict;".format(model)) 33 | os.system("cd {}/predict; mkdir config;".format(model)) 34 | os.system("cd {}/predict; mkdir experiment;".format(model)) 35 | os.system("cd {}/predict; mkdir transcriptions;".format(model)) 36 | os.system("cd {}/predict/experiment; mkdir triphones_deldel;".format(model)) 37 | 38 | # Copy pre-trained model 39 | os.system("cd {};cp final.mdl predict/experiment/triphones_deldel/final.mdl;".format(model)) 40 | 41 | os.system("cd {};cp -r graph predict/experiment/triphones_deldel/graph".format(model)) 42 | 43 | os.system("cd {}/predict/config; echo '--use-energy=true \n\ 44 | --sample-frequency=16000 \n\ 45 | --num-mel-bins=40 \n\ 46 | --frame-length=25 \n\ 47 | --frame-shift=10 \n\ 48 | --high-freq=0 \n\ 49 | --low-freq=0 \n\ 50 | --num-ceps=13 \n\ 51 | --window-type=hamming' > mfcc.conf".format(model)) 52 | os.system("cd {}/predict/transcriptions; echo 'result: {}' > wav.scp".format(model, wav_file)) 53 | os.system("cd {}/predict/transcriptions; echo 'VIVOSDEV16 result:' > spk2utt".format(model)) 54 | os.system("cd {}/predict/transcriptions; echo 'result: VIVOSDEV16' > utt2spk".format(model)) 55 | 
# os.system("cd {}/predict/transcriptions; echo 'VIVOSDEV02-R015 result' > utt2spk".format(model)) 56 | 57 | # Run predict 58 | os.system( 59 | "cd {}/predict; {}/src/featbin/compute-mfcc-feats --config=config/mfcc.conf \ 60 | scp:transcriptions/wav.scp ark,scp:transcriptions/feats.ark,transcriptions/feats.scp" \ 61 | .format(model, kaldi_folder)) 62 | 63 | os.system( 64 | "cd {}/predict; {}/src/featbin/compute-cmvn-stats --spk2utt=ark:transcriptions/spk2utt \ 65 | scp:transcriptions/feats.scp ark,scp:experiment/cmvn.ark,experiment/cmvn.scp" \ 66 | .format(model, kaldi_folder)) 67 | 68 | # os.system( 69 | # "cd {}/predict; {}/src/featbin/apply-cmvn --uut2spk=ark:transcriptions/utt2spk \ 70 | # scp:transcriptions/feats.scp ark,scp:experiment/cmvn.ark,experiment/cmvn.scp" \ 71 | # .format(model, kaldi_folder)) 72 | 73 | # delta 74 | if method == "delta": 75 | # os.system("cd {}/predict; {}/src/featbin/add-deltas \ 76 | # scp:transcriptions/feats.scp ark:transcriptions/delta-feats.ark" \ 77 | # .format(model, kaldi_folder)) 78 | 79 | # os.system("cd {}/predict; {}/src/gmmbin/gmm-latgen-faster \ 80 | # --max-active=7000 --beam=13.0 --lattice_beam=6.0 --acoustic-scale=0.83333 --allow-partial=true \ 81 | # --word-symbol-table=experiment/triphones_deldel/graph/words.txt \ 82 | # experiment/triphones_deldel/final.mdl \ 83 | # experiment/triphones_deldel/graph/HCLG.fst \ 84 | # ark:transcriptions/delta-feats.ark \ 85 | # ark,t:transcriptions/lattices.ark" \ 86 | # .format(model, kaldi_folder)) 87 | command = "cd {}/predict; {}/src/gmmbin/gmm-latgen-faster \ 88 | --max-active=7000 --beam=13.0 --lattice_beam=6.0 --acoustic-scale=0.83333 --allow-partial=true \ 89 | --word-symbol-table=experiment/triphones_deldel/graph/words.txt \ 90 | experiment/triphones_deldel/final.mdl \ 91 | experiment/triphones_deldel/graph/HCLG.fst \ 92 | 'ark,s,cs:{}/src/featbin/apply-cmvn \ 93 | --utt2spk=ark:transcriptions/utt2spk \ 94 | scp:experiment/cmvn.scp \ 95 | scp:transcriptions/feats.scp 
ark:- | \ 96 | {}/src/featbin/add-deltas ark:- ark:- |' 'ark,t:transcriptions/lattices.ark' 'ark:|gzip -c > experiment/lat.gz'" \ 97 | .format(model, kaldi_folder, kaldi_folder, kaldi_folder) 98 | os.system(command) 99 | elif method == "lda_mllt": 100 | os.system("cd {};cp final.mat predict/experiment/triphones_deldel/final.mat;".format(model)) 101 | 102 | os.system("cd {}/predict; {}/src/featbin/splice-feats \ 103 | scp:transcriptions/feats.scp \ 104 | ark:transcriptions/splice-feats.ark".format(model, kaldi_folder)) 105 | os.system("cd {}/predict; {}/src/featbin/transform-feats \ 106 | experiment/triphones_deldel/final.mat \ 107 | ark:transcriptions/splice-feats.ark \ 108 | ark:transcriptions/splice-transform-feats.ark".format(model, kaldi_folder)) 109 | os.system("cd {}/predict; {}/src/gmmbin/gmm-latgen-faster \ 110 | --word-symbol-table=experiment/triphones_deldel/graph/words.txt \ 111 | experiment/triphones_deldel/final.mdl experiment/triphones_deldel/graph/HCLG.fst \ 112 | ark:transcriptions/splice-transform-feats.ark ark,t:transcriptions/lattices.ark" \ 113 | .format(model, kaldi_folder)) 114 | else: 115 | raise Exception("The given method {} is not supported yet".format(method)) 116 | 117 | os.system("cd {}/predict; {}/src/latbin/lattice-best-path" 118 | " \ 119 | --word-symbol-table=experiment/triphones_deldel/graph/words.txt \ 120 | ark:transcriptions/lattices.ark \ 121 | ark,t:transcriptions/one-best.tra" \ 122 | .format(model, kaldi_folder)) 123 | 124 | os.system("cd {}/predict; {}/int2sym.pl" 125 | " -f 2- {}/predict/experiment/triphones_deldel/graph/words.txt transcriptions/one-best.tra \ 126 | > {}/predict/transcriptions/one-best-hypothesis.txt; echo $(<{}/predict/transcriptions/one-best-hypothesis.txt);" \ 127 | .format(model, utils_path, model, model, model)) 128 | 129 | result = open("{}/predict/transcriptions/one-best-hypothesis.txt".format(model)).read() 130 | # Result will stored in model_path/predict/transcriptions/one-best-hypothesis.txt 
under format test {predict_result} 131 | result = result[8:] 132 | print(result) 133 | return result 134 | 135 | 136 | if __name__ == "__main__": 137 | predict(args.kaldi_folder, args.wav, args.model_path, args.method, args.utils_path) 138 | -------------------------------------------------------------------------------- /egs/vivos/predict_delta.sh: -------------------------------------------------------------------------------- 1 | # Please don't charge this default config 2 | MODEL=/home/anhv/PycharmProjects/kaldi-trunk/egs/uts_443/exp/tri2a 3 | KALDI=/home/anhv/PycharmProjects/kaldi-trunk 4 | WAV=/home/anhv/PycharmProjects/undertheseanlp/automatic_speech_recognition/experiment/vivos/test/VIVOSDEV01_R034.wav 5 | 6 | # Variables 7 | # MODEL= 8 | # KALDI= 9 | # WAV= 10 | 11 | # Prepare predict dir 12 | cd $MODEL; 13 | rm -rf predict 14 | mkdir predict 15 | cd $MODEL/predict 16 | mkdir config; mkdir experiment; mkdir transcriptions 17 | cd $MODEL/predict/experiment 18 | mkdir triphones_delta 19 | 20 | # Copy pre-trained model 21 | cd $MODEL 22 | cp final.mdl predict/experiment/triphones_delta/final.mdl 23 | cp -r graph predict/experiment/triphones_delta/graph 24 | 25 | cd $MODEL/predict/config 26 | cat > mfcc.conf << EOL 27 | --use-energy=true 28 | --sample-frequency=16000 29 | --num-mel-bins=40 30 | --frame-length=25 31 | --frame-shift=10 32 | --high-freq=0 33 | --low-freq=0 34 | --num-ceps=13 35 | --window-type=hamming 36 | EOL 37 | 38 | # Prepare util 39 | cd $MODEL/predict/transcriptions 40 | echo "result: $WAV" > wav.scp 41 | echo "VIVOSDEV16 result:" > spk2utt 42 | echo "result: VIVOSDEV16" > utt2spk 43 | 44 | 45 | # Run predict 46 | cd $MODEL/predict; 47 | $KALDI/src/featbin/compute-mfcc-feats \ 48 | --config=config/mfcc.conf \ 49 | scp:transcriptions/wav.scp \ 50 | ark,scp:transcriptions/feats.ark,transcriptions/feats.scp 51 | $KALDI/src/featbin/compute-cmvn-stats --spk2utt=ark:transcriptions/spk2utt \ 52 | scp:transcriptions/feats.scp \ 53 | 
def create_train_waves():
    """Flatten the vivos train+test speaker folders into corpus/train/wav.

    Copies every wav file from data/vivos/raw/{train,test}/waves/<speaker>/
    into a single flat data/vivos/corpus/train/wav directory, overwriting
    files that share a name.
    """
    data_root = join(dirname(dirname(dirname(__file__))), "data", "vivos")
    source_folders = [
        join(data_root, "raw", "train", "waves"),
        join(data_root, "raw", "test", "waves"),
    ]
    corpus_waves_folder = join(data_root, "corpus", "train", "wav")
    # Start from an empty target directory; a missing directory is fine.
    shutil.rmtree(corpus_waves_folder, ignore_errors=True)
    mkdir(corpus_waves_folder)
    # One loop for both source trees instead of two duplicated copies.
    for waves_folder in source_folders:
        for _root, speaker_dirs, _files in walk(waves_folder):
            # NOTE(review): like the original, this assumes the speaker
            # folders sit directly under waves_folder — deeper nesting
            # would be joined against the wrong parent.
            for speaker_dir in speaker_dirs:
                for wav_name in listdir(join(waves_folder, speaker_dir)):
                    shutil.copy(
                        join(waves_folder, speaker_dir, wav_name),
                        join(corpus_waves_folder, wav_name))
join(dirname(dirname(dirname(__file__))), "data", "vlsp", 40 | "wav") 41 | corpus_waves_folder = join(dirname(dirname(dirname(__file__))), "data", "vivos", 42 | "corpus","test") 43 | corpus_short_waves_folder = join(dirname(dirname(dirname(__file__))), "data", "vivos", 44 | "corpus", "test_short") 45 | try: 46 | shutil.rmtree(corpus_waves_folder) 47 | shutil.rmtree(corpus_short_waves_folder) 48 | except: 49 | pass 50 | finally: 51 | mkdir(corpus_waves_folder) 52 | mkdir(corpus_short_waves_folder) 53 | mkdir(join(corpus_short_waves_folder,"wav")) 54 | 55 | shutil.copytree(waves_folder,join(corpus_waves_folder,"wav")) 56 | files = listdir(join(corpus_waves_folder,"wav")) 57 | for file in files: 58 | os.rename(join(corpus_waves_folder,"wav",file),join(corpus_waves_folder,"wav","{}_{}".format("global",file))) 59 | list_files = listdir(join(corpus_waves_folder,"wav")) 60 | list_files.sort() 61 | for index,file in enumerate(list_files): 62 | if index < 20: 63 | shutil.copyfile(join(corpus_waves_folder,"wav",file),join(corpus_short_waves_folder,"wav",file)) 64 | 65 | 66 | def create_train_text(): 67 | content_path = join(dirname(dirname(dirname(__file__))), "data", "vivos", 68 | "raw","train","prompts.txt") 69 | content_path2 = join(dirname(dirname(dirname(__file__))), "data", "vivos", 70 | "raw", "test", "prompts.txt") 71 | content = open(content_path).read() 72 | content = content.replace(":", "") 73 | 74 | content2 = open(content_path2).read() 75 | content2 = content2.replace(":", "") 76 | lines = content.splitlines() 77 | lines2 = content2.splitlines() 78 | output = [] 79 | for line in lines: 80 | items = line.split() 81 | fileid = items[0] 82 | text = " ".join(items[1:]).lower() 83 | content = "{}|{}".format(fileid, text) 84 | output.append(content) 85 | for line in lines2: 86 | items = line.split() 87 | fileid = items[0] 88 | text = " ".join(items[1:]).lower() 89 | content2 = "{}|{}".format(fileid, text) 90 | output.append(content2) 91 | text = "\n".join(output) 92 
| 93 | content_path = join(dirname(dirname(dirname(__file__))), "data", "vivos","corpus","train", "text") 94 | open(content_path, "w").write(text) 95 | 96 | 97 | def create_test_text(): 98 | content_path = join(dirname(dirname(dirname(__file__))), "data", "vlsp", "text") 99 | 100 | content = open(content_path).read() 101 | content = content.replace(":", "") 102 | lines = content.splitlines() 103 | output = [] 104 | output_short = [] 105 | short_counter = 0 106 | for line in lines: 107 | m = re.match(r"^(?P.*)\t(?P.*)$", line) 108 | if m: 109 | text = m.group("text") 110 | fileid = m.group("fileid") 111 | content = "{}|{}".format("global_{}".format(fileid), text) 112 | output.append(content) 113 | if short_counter < 20: 114 | output_short.append(content) 115 | short_counter += 1 116 | text = "\n".join(output) 117 | 118 | 119 | content_path = join(dirname(dirname(dirname(__file__))), "data", "vivos", "corpus", "test", "text") 120 | open(content_path, "w").write(text) 121 | 122 | text = "\n".join(output_short) 123 | 124 | content_path = join(dirname(dirname(dirname(__file__))), "data", "vivos", "corpus", "test_short", "text") 125 | open(content_path, "w").write(text) 126 | 127 | 128 | def create_gender(): 129 | content_path = join(dirname(dirname(dirname(__file__))), "data", "vivos", "raw", "train", "genders.txt") 130 | content = open(content_path).read() 131 | 132 | content_path2 = join(dirname(dirname(dirname(__file__))), "data", "vivos", "raw", "test", "genders.txt") 133 | content2 = open(content_path2).read() 134 | content = content2 + content 135 | 136 | output_path = join(dirname(dirname(dirname(__file__))), "data", "vivos", "corpus", "train", "gender") 137 | open(output_path, "w").write(content) 138 | 139 | content_test = "\n".join(["global m"]) 140 | 141 | output_test_path = join(dirname(dirname(dirname(__file__))), "data", "vivos", "corpus", "test", "gender") 142 | open(output_test_path, "w").write(content_test) 143 | 144 | output_test_path = 
join(dirname(dirname(dirname(__file__))), "data", "vivos", "corpus", "test_short", "gender") 145 | open(output_test_path, "w").write(content_test) 146 | 147 | 148 | def create_speaker(): 149 | content_path = join(dirname(dirname(dirname(__file__))), "data", "vivos", "raw", "train", "prompts.txt") 150 | content_path2 = join(dirname(dirname(dirname(__file__))), "data", "vivos", "raw", "test", "prompts.txt") 151 | lines = open(content_path).read().splitlines() 152 | files = [line.split()[0] for line in lines] 153 | tmp = [] 154 | 155 | for file_id in files: 156 | speaker_id = file_id.split("_")[0] 157 | content = "{} {}".format(speaker_id, file_id) 158 | tmp.append(content) 159 | 160 | # Merge vivos test to train dir 161 | lines2 = open(content_path2).read().splitlines() 162 | files2 = [line.split()[0] for line in lines2] 163 | 164 | for file_id in files2: 165 | speaker_id = file_id.split("_")[0] 166 | content = "{} {}".format(speaker_id, file_id) 167 | tmp.append(content) 168 | 169 | tmp.sort() 170 | 171 | content = "\n".join(tmp) 172 | 173 | content_path = join(dirname(dirname(dirname(__file__))), "data", "vivos", "corpus", "train", "speaker") 174 | open(content_path, "w").write(content) 175 | 176 | lines_test_path = join(dirname(dirname(dirname(__file__))), "data", "vlsp", "text") 177 | lines_test = open(lines_test_path).read().splitlines() 178 | test_output = [] 179 | short_test_output = [] 180 | short_test_counter = 0 181 | 182 | for line in lines_test: 183 | # print(line) 184 | m = re.match(r"^(?P.*)\t(?P.*)$", line) 185 | if m: 186 | # text = m.group("text") 187 | fileid = m.group("fileid") 188 | content = "global {}".format("global_{}".format(fileid)) 189 | 190 | test_output.append(content) 191 | if short_test_counter < 20: 192 | short_test_output.append(content) 193 | 194 | short_test_counter+=1 195 | content_path = join(dirname(dirname(dirname(__file__))), "data", "vivos", "corpus", "test", "speaker") 196 | content = "\n".join(test_output) 197 | 
open(content_path, "w").write(content) 198 | 199 | content_path = join(dirname(dirname(dirname(__file__))), "data", "vivos", "corpus", "test_short", "speaker") 200 | short_content = "\n".join(short_test_output) 201 | open(content_path, "w").write(short_content) 202 | 203 | try: 204 | shutil.rmtree(join(dirname(dirname(dirname(__file__))), "data", "vivos", "corpus")) 205 | except: 206 | pass 207 | finally: 208 | mkdir(join(dirname(dirname(dirname(__file__))), "data", "vivos", "corpus")) 209 | mkdir(join(dirname(dirname(dirname(__file__))), "data", "vivos", "corpus","train")) 210 | mkdir(join(dirname(dirname(dirname(__file__))), "data", "vivos", "corpus", "test")) 211 | mkdir(join(dirname(dirname(dirname(__file__))), "data", "vivos", "corpus", "test_short")) 212 | create_train_waves() 213 | create_test_waves() 214 | create_train_text() 215 | create_test_text() 216 | create_gender() 217 | create_speaker() 218 | -------------------------------------------------------------------------------- /egs/vivos/test/VIVOSDEV01_R003.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/vivos/test/VIVOSDEV01_R003.wav -------------------------------------------------------------------------------- /egs/vivos/test/VIVOSDEV01_R012.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/vivos/test/VIVOSDEV01_R012.wav -------------------------------------------------------------------------------- /egs/vivos/test/VIVOSDEV01_R027.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/vivos/test/VIVOSDEV01_R027.wav 
-------------------------------------------------------------------------------- /egs/vivos/test/VIVOSDEV01_R028.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/vivos/test/VIVOSDEV01_R028.wav -------------------------------------------------------------------------------- /egs/vivos/test/VIVOSDEV01_R034.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/vivos/test/VIVOSDEV01_R034.wav -------------------------------------------------------------------------------- /egs/vivos/test/VIVOSDEV01_R043.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/vivos/test/VIVOSDEV01_R043.wav -------------------------------------------------------------------------------- /egs/vivos/test/VIVOSDEV01_R044.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/vivos/test/VIVOSDEV01_R044.wav -------------------------------------------------------------------------------- /egs/vivos/test/VIVOSDEV01_R055.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/vivos/test/VIVOSDEV01_R055.wav -------------------------------------------------------------------------------- /egs/vivos/test_model.py: -------------------------------------------------------------------------------- 1 | from model import transcript 2 | from 
# Telex-style transliteration between Vietnamese orthography and ASCII
# "phone" strings.
#
# rules_1: each entry is a base vowel followed by its five tonal variants,
# in the fixed tone order huyền/sắc/hỏi/ngã/nặng (encoded as "fsrxj").
rules_1 = [
    "aàáảãạ",
    "ăằắẳẵặ",
    "âầấẩẫậ",
    "eèéẻẽẹ",
    "êềếểễệ",
    "iìíỉĩị",
    "oòóỏõọ",
    "ôồốổỗộ",
    "ơờớởỡợ",
    "uùúủũụ",
    "ưừứửữự",
    "yỳýỷỹỵ"
]
# rules_2: two-letter ASCII digraph followed by the marked letter it encodes.
rules_2 = [
    "awă",
    "aaâ",
    "eeê",
    "ooô",
    "owơ",
    "uwư",
    "ddđ"
]
# w2p maps one Vietnamese letter to its ASCII phone code; p2w is the inverse.
w2p = {}
p2w = {}
_tones = "fsrxj"
for _group in rules_1:
    _base = _group[0]
    # If the base letter is itself a marked letter (ă, â, ê, ...), spell it
    # with its ASCII digraph so the tonal code stays pure ASCII.
    for _rule in rules_2:
        if _base == _rule[2]:
            _base = _rule[0:2]
    for _pos, _letter in enumerate(_group[1:]):
        w2p[_letter] = _base + _tones[_pos]
for _rule in rules_2:
    w2p[_rule[2]] = _rule[0:2]
p2w = {code: letter for letter, code in w2p.items()}


def word2phone(word):
    """Spell *word* as an ASCII phone string; unmapped characters pass through."""
    return "".join(w2p.get(letter, letter) for letter in word)


def phone2word(phone):
    """Invert word2phone: greedily consume 3-char, then 2-char codes, else copy one char."""
    pieces = []
    pos = 0
    while pos < len(phone):
        for width in (3, 2):
            code = phone[pos:pos + width]
            if code in p2w:
                pieces.append(p2w[code])
                pos += width
                break
        else:
            pieces.append(phone[pos])
            pos += 1
    return "".join(pieces)


if __name__ == '__main__':
    cases = [
        ("con hoẵng", "con hoawxng"),
        ("lựu đạn", "luwju ddajn"),
        ("kiểm tra", "kieerm tra"),
        ("ủy ban", "ury ban"),
        ("cà phê", "caf phee"),
        ("khách sạn", "khasch sajn"),
        ("đúng", "ddusng"),
        ("xã hội", "xax hooji")
    ]
    for word, phone in cases:
        assert phone2word(phone) == word
        assert word2phone(word) == phone
from egs.vivos.extension.model import KaldiSpeechRecognition
from os.path import join, dirname
import argparse

parser = argparse.ArgumentParser(description='Train a Kaldi speech recognition model.')
parser.add_argument('--kaldi_folder', help='Kaldi dir path', required=True)
parser.add_argument('--corpus_folder', help='Corpus path to train', required=True)
parser.add_argument('--export_path', help='Export path will be able soon')
# Bug fix: argparse delivers strings from the CLI; without type=int a
# non-default --nj would be passed through as str.
parser.add_argument('--nj', help='Parallel number of jobs', type=int, default=1)
parser.add_argument('--method', help='Feature/training method (e.g. "deltadelta")',
                    default="deltadelta")


args = parser.parse_args()


def train(kaldi_folder, corpus_folder, export_folder=None, nj=1, method="deltadelta"):
    """Fit a KaldiSpeechRecognition model.

    kaldi_folder: path to the Kaldi installation.
    corpus_folder: path to the prepared corpus (see preprocess.py).
    export_folder: where the trained model should be exported; defaults to
        ./model next to this script.
        NOTE(review): export is not wired up yet -- KaldiSpeechRecognition
        takes no export path, so the value is currently unused.
    nj: number of parallel Kaldi jobs.
    method: feature pipeline understood by KaldiSpeechRecognition.
    """
    # Bug fix: the original unconditionally overwrote a caller-supplied
    # export_folder; only fall back to the default when none was given.
    if export_folder is None:
        export_folder = join(dirname(__file__), "model")
    params = {
        "method": method,
        "jobs": nj,
        "lm_order": 1,  # unigram language model
    }
    model = KaldiSpeechRecognition(corpus_folder, kaldi_folder, params)
    model.fit()


if __name__ == "__main__":
    train(args.kaldi_folder, args.corpus_folder, args.export_path, args.nj, args.method)
-------------------------------------------------------------------------------- 1 | # Dữ liệu VLSP 2018 2 | 3 | Tập dữ liệu VLSP 2018 có tất cả 796 câu. 4 | 5 | Dữ liệu gồm 796 câu nói với độ dài trung bình 40 tokens (max 104 tokens, min 0 tokens). 6 | Trong đó có một câu đặc biệt có id 0437, không chứa một tiếng nói nào, trong file wav tương ứng chỉ có tiếng xe máy chạy ngoài đường. 7 | 8 | Thông tin phân phối độ dài câu trong tập dữ liệu: 9 | 10 | ``` 11 | count 796.000000 12 | mean 40.812814 13 | std 22.313014 14 | min 0.000000 15 | 0% 0.000000 16 | 5% 9.000000 17 | 10% 13.000000 18 | 15.0% 16.000000 19 | 20% 19.000000 20 | 25% 22.000000 21 | 30.0% 25.000000 22 | 35% 28.000000 23 | 40% 31.000000 24 | 45% 34.000000 25 | 50% 38.000000 26 | 55.0% 41.000000 27 | 60.0% 46.000000 28 | 65% 49.000000 29 | 70% 53.000000 30 | 75% 58.000000 31 | 80% 62.000000 32 | 85.0% 68.000000 33 | 90% 73.000000 34 | 95% 81.000000 35 | 100% 104.000000 36 | max 104.000000 37 | ``` 38 | 39 | File âm thanh dài nhất cỡ 27 giây, ngắn nhất cỡ 1 giây, độ dài trung bình của file âm thanh là 9.5 giây. -------------------------------------------------------------------------------- /report/build.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | name="technique_report" 3 | pdflatex -file-line-error -interaction=nonstopmode -synctex=1 -output-format=pdf -output-directory=. $name.tex 4 | bibtex $name.aux 5 | pdflatex -file-line-error -interaction=nonstopmode -synctex=1 -output-format=pdf -output-directory=. $name.tex 6 | pdflatex -file-line-error -interaction=nonstopmode -synctex=1 -output-format=pdf -output-directory=. 
$name.tex 7 | 8 | rm -rf $name.blg 9 | rm -rf $name.log 10 | rm -rf $name.out 11 | rm -rf *.aux 12 | rm -rf $name.bbl 13 | rm -rf $name.synctex.gz -------------------------------------------------------------------------------- /report/notation.tex: -------------------------------------------------------------------------------- 1 | % Tensor 2 | \DeclareMathAlphabet{\mathsfit}{\encodingdefault}{\sfdefault}{m}{sl} 3 | \SetMathAlphabet{\mathsfit}{bold}{\encodingdefault}{\sfdefault}{bx}{n} 4 | \newcommand{\tens}[1]{\bm{\mathsfit{#1}}} 5 | \def\tA{{\tens{A}}} 6 | \def\tB{{\tens{B}}} 7 | \def\tC{{\tens{C}}} 8 | \def\tD{{\tens{D}}} 9 | \def\tE{{\tens{E}}} 10 | \def\tF{{\tens{F}}} 11 | \def\tG{{\tens{G}}} 12 | \def\tH{{\tens{H}}} 13 | \def\tI{{\tens{I}}} 14 | \def\tJ{{\tens{J}}} 15 | \def\tK{{\tens{K}}} 16 | \def\tL{{\tens{L}}} 17 | \def\tM{{\tens{M}}} 18 | \def\tN{{\tens{N}}} 19 | \def\tO{{\tens{O}}} 20 | \def\tP{{\tens{P}}} 21 | \def\tQ{{\tens{Q}}} 22 | \def\tR{{\tens{R}}} 23 | \def\tS{{\tens{S}}} 24 | \def\tT{{\tens{T}}} 25 | \def\tU{{\tens{U}}} 26 | \def\tV{{\tens{V}}} 27 | \def\tW{{\tens{W}}} 28 | \def\tX{{\tens{X}}} 29 | \def\tY{{\tens{Y}}} 30 | \def\tZ{{\tens{Z}}} 31 | \def\tx{{\tens{x}}} 32 | \def\ty{{\tens{y}}} -------------------------------------------------------------------------------- /report/technique_report.bib: -------------------------------------------------------------------------------- 1 | @article{DBLP:journals/corr/Le-Hong16, 2 | author = {Phuong Le{-}Hong}, 3 | title = {Vietnamese Named Entity Recognition using Token Regular Expressions 4 | and Bidirectional Inference}, 5 | journal = {CoRR}, 6 | volume = {abs/1610.05652}, 7 | year = {2016}, 8 | url = {http://arxiv.org/abs/1610.05652}, 9 | archivePrefix = {arXiv}, 10 | eprint = {1610.05652}, 11 | timestamp = {Wed, 07 Jun 2017 14:42:34 +0200}, 12 | biburl = {https://dblp.org/rec/bib/journals/corr/Le-Hong16}, 13 | bibsource = {dblp computer science bibliography, https://dblp.org} 14 | } 15 | 16 | 
@article{DBLP:journals/corr/abs-1708-07241, 17 | author = {Thai{-}Hoang Pham and 18 | Xuan{-}Khoai Pham and 19 | Tuan{-}Anh Nguyen and 20 | Phuong Le{-}Hong}, 21 | title = {{NNVLP:} {A} Neural Network-Based Vietnamese Language Processing Toolkit}, 22 | journal = {CoRR}, 23 | volume = {abs/1708.07241}, 24 | year = {2017}, 25 | url = {http://arxiv.org/abs/1708.07241}, 26 | archivePrefix = {arXiv}, 27 | eprint = {1708.07241}, 28 | timestamp = {Tue, 05 Sep 2017 10:03:46 +0200}, 29 | biburl = {https://dblp.org/rec/bib/journals/corr/abs-1708-07241}, 30 | bibsource = {dblp computer science bibliography, https://dblp.org} 31 | } 32 | 33 | @article{DBLP:journals/corr/abs-1801-01331, 34 | author = {Thanh Vu and 35 | Dat Quoc Nguyen and 36 | Dai Quoc Nguyen and 37 | Mark Dras and 38 | Mark Johnson}, 39 | title = {VnCoreNLP: {A} Vietnamese Natural Language Processing Toolkit}, 40 | journal = {CoRR}, 41 | volume = {abs/1801.01331}, 42 | year = {2018}, 43 | url = {http://arxiv.org/abs/1801.01331}, 44 | archivePrefix = {arXiv}, 45 | eprint = {1801.01331}, 46 | timestamp = {Thu, 01 Feb 2018 19:52:26 +0100}, 47 | biburl = {https://dblp.org/rec/bib/journals/corr/abs-1801-01331}, 48 | bibsource = {dblp computer science bibliography, https://dblp.org} 49 | } 50 | 51 | @article{DBLP:journals/corr/abs-1803-08463, 52 | author = {Pham Quang Nhat Minh}, 53 | title = {A Feature-Based Model for Nested Named-Entity Recognition at {VLSP-2018} 54 | {NER} Evaluation Campaign}, 55 | journal = {CoRR}, 56 | volume = {abs/1803.08463}, 57 | year = {2018}, 58 | url = {http://arxiv.org/abs/1803.08463}, 59 | archivePrefix = {arXiv}, 60 | eprint = {1803.08463}, 61 | timestamp = {Wed, 11 Apr 2018 11:12:46 +0200}, 62 | biburl = {https://dblp.org/rec/bib/journals/corr/abs-1803-08463}, 63 | bibsource = {dblp computer science bibliography, https://dblp.org} 64 | } -------------------------------------------------------------------------------- /report/technique_report.pdf: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/report/technique_report.pdf -------------------------------------------------------------------------------- /report/technique_report.tex: -------------------------------------------------------------------------------- 1 | \documentclass[11pt,a4paper]{article} 2 | \usepackage{acl2017} 3 | \usepackage{times} 4 | \usepackage{multirow} 5 | \usepackage{url} 6 | \usepackage{latexsym} 7 | \usepackage{graphicx} 8 | \usepackage{color} 9 | \usepackage{booktabs} 10 | \usepackage{amsmath} 11 | \usepackage[english,vietnam]{babel} 12 | \usepackage[utf8]{vietnam} 13 | 14 | \aclfinalcopy % Uncomment this line for the final submission 15 | %\def\eaclpaperid{***} % Enter the acl Paper ID here 16 | 17 | %\setlength\titlebox{5cm} 18 | % You can expand the titlebox if you need extra space 19 | % to show all the authors. Please do not make the titlebox 20 | % smaller than 5cm (the original size); we will check this 21 | % in the camera-ready version and ask you to change it back. 22 | 23 | \newcommand\BibTeX{B{\sc ib}\TeX} 24 | 25 | \title{Báo cáo kỹ thuật\\Module nhận dạng tiếng nói tiếng Việt\\ trong underthesea} 26 | 27 | \include{notation} 28 | 29 | \author{ 30 | Vũ Anh\\ 31 | underthesea\\ 32 | {\tt anhv.ict91@gmail.com} \\ 33 | \And 34 | Lê Phi Hùng \\ 35 | underthesea\\ 36 | {\tt lephihungch@gmail.com} \\ 37 | } 38 | 39 | \date{} 40 | 41 | \begin{document} 42 | \maketitle 43 | \begin{abstract} 44 | 45 | Trong báo cáo này, trong chúng mô tả hệ thống nhận dạng tiếng nói tiếng Việt trong underthesea. Trong đó, hệ thống sử dụng công cụ Kaldi để xây dựng module nhận dạng, kết quả được đánh giá trên tập dữ liệu test của VLSP 2018. 
Toàn bộ mã nguồn và tài liệu của dự án được phát hành dưới dạng mã nguồn mở tại địa chỉ \url{https://github.com/undertheseanlp/automatic_speech_recognition} 46 | 47 | \end{abstract} 48 | 49 | \section{Giới thiệu} 50 | 51 | \section{Mô tả hệ thống} 52 | 53 | Các thử nghiệm được thực hiện trên bộ công cụ nhận dạng tiếng nói Kaldi, được viết bằng C++. \footnote{http://kaldi-asr.org/} 54 | 55 | Mô hình xây dựng hệ thống nhận dạng tiếng nói 56 | 57 | \subsection{Chuẩn bị dữ liệu và các tài nguyên ngôn ngữ} 58 | 59 | Việc đầu tiên cần làm là chuẩn bị dữ liệu huấn luyện âm thanh - phụ đề. 60 | Gồm có các tập tin âm thanh (thường để ở định dạng wav) chứa các tiếng nói của người và các tập tin phụ đề tương ứng. 61 | 62 | Việc tiếp theo là xây dựng từ điển phát âm. 63 | Hình dung một cách đơn giản, từ điển phát âm sẽ chứa cách phát âm (cách phân chia các âm) tương ứng với từng tiếng. 64 | Ngoài ra trong hệ thống còn cần các âm câm (silence\_phones), các từ ngoài từ điển (out-of-vocabulary hay oov). 65 | 66 | 67 | Cuối cùng là chuẩn bị dữ liệu cho việc huấn luyện mô hình ngôn ngữ. 68 | Mô hình ngôn ngữ giúp cải thiện chất lượng của hệ thống nhận dạng tiếng nói, bằng cách đưa ra những khả năng có thể nhất trong một cụm từ. 69 | Hãy xem xét ví dụ hệ thống đang phải quyết định từ còn thiếu trong câu \textit{Tôi đi Hà \_ mấy ngày}. 70 | Nếu hệ thống sử dụng mô hình ngôn ngữ, có thể dễ dàng nhận ra từ \textit{Nội} là từ có khả năng còn thiếu nhất trong câu này. 71 | 72 | \subsection{Huấn luyện mô hình Gaussian Mixture Model} 73 | 74 | Bước đầu tiên là huấn luyện mô hình âm học, là thành phần chuyển các tín hiệu âm thanh thành dữ liệu văn bản.
75 | Mô hình huấn luyện thường sử dụng thuật toán Gaussian Mixture Model trên các tập đặc trưng phổ biến của âm thanh như MFCC (Mel-frequency cepstral coefficients) \footnote{Để biết thêm về đặc trưng này, xin tìm đọc tài liệu \href{http://www.lrc.tnu.edu.vn/upload/collection/brief/41619_13520141527406.pdf}{So sánh hai phương pháp trích chọn đặc trưng âm thanh: Đường bao phổ (MFCC) và cao độ Pitch trong việc tìm kiếm âm nhạc theo nội dung}}. Ngoài ra còn có các đặc trưng delta, lda, mltt hay sat. 76 | 77 | Bước thứ hai là huấn luyện mô hình ngôn ngữ 78 | 79 | \subsection{Quá trình giải mã} 80 | 81 | \begin{itemize} 82 | \item Tạo ra một đồ thị giải mã 83 | \item Tính điểm lại Lattice 84 | \end{itemize} 85 | 86 | \section{Đánh giá} 87 | 88 | \subsection{Tập dữ liệu} 89 | 90 | Có hai tập dữ liệu được sử dụng. Tập dữ liệu VIVOS và tập dữ liệu VLSP 2018. Trong đó, tập dữ liệu VIVOS được dùng để huấn luyện, tập dữ liệu VLSP 2018 được sử dụng để đánh giá kết quả mô hình. 91 | 92 | \subsection{Kết quả} 93 | 94 | % TODO To be updated 95 | 96 | \section{Conclusion} 97 | 98 | % TODO To be updated 99 | 100 | \section{Lời cảm ơn} 101 | 102 | Vì kiến thức còn hạn chế, trong phần mô tả kỹ thuật, tác giả có tham khảo các tài liệu \textit{Building Speech Recognition Systems with the Kaldi Toolkit} \footnote{https://engineering.jhu.edu/clsp/wp-content/uploads/sites/75/2016/06/Building-Speech-Recognition-Systems-with-the-Kaldi-Toolkit.pdf} 103 | 104 | \bibliography{technique_report} 105 | \bibliographystyle{acl_natbib} 106 | 107 | \end{document} -------------------------------------------------------------------------------- /tmp/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/tmp/.gitkeep -------------------------------------------------------------------------------- /util/eda_vlsp.py: 
from os import listdir
from os.path import join, dirname
import pandas as pd
import numpy as np
import librosa

# Repository root: one level above util/.
ROOT_FOLDER = dirname(dirname(__file__))


def stat_tokens(lines):
    """Print a percentile summary (every 5%) of per-line token counts.

    The first whitespace-separated field of each line is the file id and is
    excluded from the count.
    """
    counts = [len(line.split()[1:]) for line in lines]
    print(pd.Series(counts).describe(percentiles=np.linspace(0, 1, 21)))


def stat_text():
    """Print sentence count and token-length statistics for the VLSP text file."""
    print("\nText Data:")
    text_file = join(ROOT_FOLDER, "data", "vlsp", "text")
    with open(text_file, "r") as f:
        sentences = f.read().splitlines()
    print("VLSP 2018 DATA SET")
    print("\nTotal sentences:", len(sentences))
    stat_tokens(sentences)


def stat_acoustic():
    """Print total and per-file duration statistics over all VLSP wav files."""
    print("\nAcoustic Data:")
    wav_folder = join(ROOT_FOLDER, "data", "vlsp", "wav")
    durations = pd.Series(
        librosa.get_duration(filename=join(wav_folder, name))
        for name in listdir(wav_folder)
    )
    total = durations.sum()
    print(f"Total: {total:.2f} seconds ({total / 3600:.2f} hours)")
    print(durations.describe())


if __name__ == '__main__':
    stat_text()
    stat_acoustic()