├── .gitignore ├── Experiment.ipynb ├── LICENSE.txt ├── README.en.md ├── README.md ├── data ├── diadiem │ ├── preprocess.py │ └── text.py └── vivos │ └── preprocess.py ├── data_format.md ├── egs ├── diadiem │ ├── __init__.py │ ├── analyze.py │ ├── extension │ │ ├── __init__.py │ │ ├── analyze.py │ │ ├── export.py │ │ ├── metrics.py │ │ ├── model.py │ │ └── text.py │ ├── load_data.py │ ├── model │ │ ├── __init__.py │ │ ├── etc │ │ │ ├── feat.params │ │ │ ├── idngram │ │ │ ├── sphinx_train.cfg │ │ │ ├── text │ │ │ ├── tmp.dic │ │ │ ├── tmp.filler │ │ │ ├── tmp.lm │ │ │ ├── tmp.phone │ │ │ ├── tmp_test.fileids │ │ │ ├── tmp_test.transcription │ │ │ ├── tmp_train.fileids │ │ │ ├── tmp_train.transcription │ │ │ └── vocab │ │ ├── model_parameters │ │ │ ├── tmp.cd_cont_200 │ │ │ │ ├── feat.params │ │ │ │ ├── mdef │ │ │ │ ├── means │ │ │ │ ├── mixture_weights │ │ │ │ ├── noisedict │ │ │ │ ├── transition_matrices │ │ │ │ └── variances │ │ │ ├── tmp.cd_cont_200_1 │ │ │ │ ├── means │ │ │ │ ├── mixture_weights │ │ │ │ ├── transition_matrices │ │ │ │ └── variances │ │ │ ├── tmp.cd_cont_200_2 │ │ │ │ ├── means │ │ │ │ ├── mixture_weights │ │ │ │ ├── transition_matrices │ │ │ │ └── variances │ │ │ ├── tmp.cd_cont_200_4 │ │ │ │ ├── means │ │ │ │ ├── mixture_weights │ │ │ │ ├── transition_matrices │ │ │ │ └── variances │ │ │ ├── tmp.cd_cont_initial │ │ │ │ ├── means │ │ │ │ ├── mixture_weights │ │ │ │ ├── transition_matrices │ │ │ │ └── variances │ │ │ ├── tmp.cd_cont_untied │ │ │ │ ├── feat.params │ │ │ │ ├── mdef │ │ │ │ ├── means │ │ │ │ ├── mixture_weights │ │ │ │ ├── noisedict │ │ │ │ ├── transition_matrices │ │ │ │ └── variances │ │ │ ├── tmp.ci_cont │ │ │ │ ├── feat.params │ │ │ │ ├── mdef │ │ │ │ ├── means │ │ │ │ ├── mixture_weights │ │ │ │ ├── noisedict │ │ │ │ ├── transition_matrices │ │ │ │ └── variances │ │ │ └── tmp.ci_cont_flatinitial │ │ │ │ ├── globalmean │ │ │ │ ├── globalvar │ │ │ │ ├── means │ │ │ │ ├── mixture_weights │ │ │ │ ├── transition_matrices │ │ │ │ └── 
variances │ │ └── text.py │ ├── test │ │ ├── CAFPHEE001.wav │ │ ├── CAFPHEE002.wav │ │ ├── CAFPHEE003.wav │ │ ├── CAFPHEE004.wav │ │ ├── CAFPHEE005.wav │ │ ├── CAFPHEE006.wav │ │ ├── CAFPHEE007.wav │ │ ├── CAFPHEE008.wav │ │ ├── CAFPHEE009.wav │ │ ├── CAFPHEE010.wav │ │ ├── DDUSNG0001.wav │ │ ├── DDUSNG0002.wav │ │ ├── DDUSNG0003.wav │ │ ├── DDUSNG0004.wav │ │ ├── DDUSNG0005.wav │ │ ├── DDUSNG0006.wav │ │ ├── DDUSNG0007.wav │ │ ├── DDUSNG0008.wav │ │ ├── DDUSNG0009.wav │ │ ├── DDUSNG0010.wav │ │ ├── KARAOKE001.wav │ │ ├── KARAOKE002.wav │ │ ├── KARAOKE003.wav │ │ ├── KARAOKE004.wav │ │ ├── KARAOKE005.wav │ │ ├── KARAOKE006.wav │ │ ├── KARAOKE007.wav │ │ ├── KARAOKE008.wav │ │ ├── KARAOKE009.wav │ │ ├── KARAOKE010.wav │ │ ├── KHASCHSAJN001.wav │ │ ├── KHASCHSAJN002.wav │ │ ├── KHASCHSAJN003.wav │ │ ├── KHASCHSAJN004.wav │ │ ├── KHASCHSAJN005.wav │ │ ├── KHASCHSAJN006.wav │ │ ├── KHASCHSAJN007.wav │ │ ├── KHASCHSAJN008.wav │ │ ├── KHASCHSAJN009.wav │ │ ├── KHASCHSAJN010.wav │ │ ├── KHOONG0001.wav │ │ ├── KHOONG0002.wav │ │ ├── KHOONG0003.wav │ │ ├── KHOONG0004.wav │ │ ├── KHOONG0005.wav │ │ ├── KHOONG0006.wav │ │ ├── KHOONG0007.wav │ │ ├── KHOONG0008.wav │ │ ├── KHOONG0009.wav │ │ ├── KHOONG0010.wav │ │ ├── MASTXA001.wav │ │ ├── MASTXA002.wav │ │ ├── MASTXA003.wav │ │ ├── MASTXA004.wav │ │ ├── MASTXA005.wav │ │ ├── MASTXA006.wav │ │ ├── MASTXA007.wav │ │ ├── MASTXA008.wav │ │ ├── MASTXA009.wav │ │ ├── MASTXA010.wav │ │ ├── TRAJMAYTEEM001.wav │ │ ├── TRAJMAYTEEM002.wav │ │ ├── TRAJMAYTEEM003.wav │ │ ├── TRAJMAYTEEM004.wav │ │ ├── TRAJMAYTEEM005.wav │ │ ├── TRAJMAYTEEM006.wav │ │ ├── TRAJMAYTEEM007.wav │ │ ├── TRAJMAYTEEM008.wav │ │ ├── TRAJMAYTEEM009.wav │ │ ├── TRAJMAYTEEM010.wav │ │ ├── TROWRLAJI001.wav │ │ ├── TROWRLAJI002.wav │ │ ├── TROWRLAJI003.wav │ │ ├── TROWRLAJI004.wav │ │ ├── TROWRLAJI005.wav │ │ ├── TROWRLAJI006.wav │ │ ├── TROWRLAJI007.wav │ │ ├── TROWRLAJI008.wav │ │ ├── TROWRLAJI009.wav │ │ └── TROWRLAJI010.wav │ ├── test_model.py │ ├── text.py │ └── 
train.py └── vivos │ ├── README.md │ ├── __init__.py │ ├── analyze.py │ ├── extension │ ├── __init__.py │ ├── analyze.py │ ├── cmd.sh │ ├── export.py │ ├── metrics.py │ ├── model.py │ ├── model_sphinx.py │ ├── path.sh │ ├── run_deltadelta.sh │ ├── run_lda_mllt.sh │ ├── run_lda_mllt_decode.sh │ ├── run_sat.sh │ ├── run_sgmm2.sh │ ├── text.py │ ├── transcript_deltadelta.sh │ └── transcriptions │ │ ├── audio │ │ ├── R001.wav │ │ ├── R002.wav │ │ ├── R003.wav │ │ ├── R004.wav │ │ ├── R005.wav │ │ ├── t1_tat_ca.wav │ │ └── t2_tro_nen.wav │ │ └── wav.scp │ ├── load_data.py │ ├── logs │ ├── 20181207_122900.md │ ├── 20181207_185000.md │ ├── 20181207_232600.md │ ├── 20181208_075100.md │ └── README.md │ ├── model │ ├── __init__.py │ ├── etc │ │ ├── feat.params │ │ ├── idngram │ │ ├── sphinx_train.cfg │ │ ├── text │ │ ├── tmp.dic │ │ ├── tmp.filler │ │ ├── tmp.lm │ │ ├── tmp.phone │ │ ├── tmp_test.fileids │ │ ├── tmp_test.transcription │ │ ├── tmp_train.fileids │ │ ├── tmp_train.transcription │ │ └── vocab │ ├── model_parameters │ │ ├── tmp.cd_cont_200 │ │ │ ├── feat.params │ │ │ ├── mdef │ │ │ ├── means │ │ │ ├── mixture_weights │ │ │ ├── noisedict │ │ │ ├── transition_matrices │ │ │ └── variances │ │ ├── tmp.cd_cont_200_1 │ │ │ ├── means │ │ │ ├── mixture_weights │ │ │ ├── transition_matrices │ │ │ └── variances │ │ ├── tmp.cd_cont_200_2 │ │ │ ├── means │ │ │ ├── mixture_weights │ │ │ ├── transition_matrices │ │ │ └── variances │ │ ├── tmp.cd_cont_200_4 │ │ │ ├── means │ │ │ ├── mixture_weights │ │ │ ├── transition_matrices │ │ │ └── variances │ │ ├── tmp.cd_cont_initial │ │ │ ├── means │ │ │ ├── mixture_weights │ │ │ ├── transition_matrices │ │ │ └── variances │ │ ├── tmp.cd_cont_untied │ │ │ ├── feat.params │ │ │ ├── mdef │ │ │ ├── means │ │ │ ├── mixture_weights │ │ │ ├── noisedict │ │ │ ├── transition_matrices │ │ │ └── variances │ │ ├── tmp.ci_cont │ │ │ ├── feat.params │ │ │ ├── mdef │ │ │ ├── means │ │ │ ├── mixture_weights │ │ │ ├── noisedict │ │ │ ├── 
transition_matrices │ │ │ └── variances │ │ └── tmp.ci_cont_flatinitial │ │ │ ├── globalmean │ │ │ ├── globalvar │ │ │ ├── means │ │ │ ├── mixture_weights │ │ │ ├── transition_matrices │ │ │ └── variances │ └── text.py │ ├── predict.py │ ├── predict_delta.sh │ ├── preprocess.py │ ├── preprocess_full.py │ ├── test │ ├── VIVOSDEV01_R003.wav │ ├── VIVOSDEV01_R012.wav │ ├── VIVOSDEV01_R027.wav │ ├── VIVOSDEV01_R028.wav │ ├── VIVOSDEV01_R034.wav │ ├── VIVOSDEV01_R043.wav │ ├── VIVOSDEV01_R044.wav │ └── VIVOSDEV01_R055.wav │ ├── test_model.py │ ├── text2.py │ └── train.py ├── insight ├── vivos.txt └── vlsp2018.txt ├── report ├── acl2017.sty ├── acl_natbib.bst ├── build.sh ├── eacl2017.bst ├── eacl2017.sty ├── notation.tex ├── technique_report.bib ├── technique_report.pdf └── technique_report.tex ├── tmp └── .gitkeep └── util └── eda_vlsp.py /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | data/vivos/raw/ 3 | data/vivos/corpus/ 4 | data/open_fpt/raw/FPTOpenSpeechData_Set001_V0.1 5 | data/open_fpt/raw/FPTOpenSpeechData_Set002_Part1_V0.1 6 | data/open_fpt/raw/FPTOpenSpeechData_Set002_Part2_V0.1 7 | **/tmp/ 8 | **/analyze/ 9 | /experiment/diadiem/tmp/ 10 | /data/vlsp/corpus/ 11 | /data/vlsp/wav 12 | /experiment/vlsp/extension/_pycache_/ 13 | **/**/__pycache__/ 14 | **/__pycache__/ 15 | .ipynb_checkpoints 16 | data/vlsp 17 | tmp 18 | !tmp/.gitkeep 19 | data/diadiem/ -------------------------------------------------------------------------------- /Experiment.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "print(\"Hello from Underthesea Automatic Speech Recognition Team\")" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "!lscpu" 19 | ] 20 | }, 21 | { 
22 | "cell_type": "code", 23 | "execution_count": null, 24 | "metadata": {}, 25 | "outputs": [], 26 | "source": [ 27 | "!free -m" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": null, 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "!df -h" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 1, 42 | "metadata": {}, 43 | "outputs": [ 44 | { 45 | "name": "stdout", 46 | "output_type": "stream", 47 | "text": [ 48 | "data\t\texperiment\t LICENSE.txt\tREADME.md\r\n", 49 | "data_format.md\tExperiment.ipynb README.en.md\treport\r\n" 50 | ] 51 | } 52 | ], 53 | "source": [ 54 | "!ls" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": 2, 60 | "metadata": {}, 61 | "outputs": [ 62 | { 63 | "name": "stdout", 64 | "output_type": "stream", 65 | "text": [ 66 | "diadiem vivos\r\n" 67 | ] 68 | } 69 | ], 70 | "source": [ 71 | "!ls data" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": 3, 77 | "metadata": {}, 78 | "outputs": [ 79 | { 80 | "name": "stdout", 81 | "output_type": "stream", 82 | "text": [ 83 | "corpus\tpreprocess.py raw\r\n" 84 | ] 85 | } 86 | ], 87 | "source": [ 88 | "!ls data/vivos" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": 4, 94 | "metadata": {}, 95 | "outputs": [ 96 | { 97 | "name": "stdout", 98 | "output_type": "stream", 99 | "text": [ 100 | "4.0K\tExperiment.ipynb\n", 101 | "36K\tLICENSE.txt\n", 102 | "4.0K\tREADME.en.md\n", 103 | "8.0K\tREADME.md\n", 104 | "5.2G\tdata\n", 105 | "4.0K\tdata_format.md\n", 106 | "312M\texperiment\n", 107 | "220K\treport\n" 108 | ] 109 | } 110 | ], 111 | "source": [ 112 | "!du -sh *" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": null, 118 | "metadata": { 119 | "collapsed": true 120 | }, 121 | "outputs": [], 122 | "source": [] 123 | } 124 | ], 125 | "metadata": { 126 | "kernelspec": { 127 | "display_name": "Python 3", 128 | "language": "python", 129 | "name": "python3" 130 | }, 
131 | "language_info": { 132 | "codemirror_mode": { 133 | "name": "ipython", 134 | "version": 3 135 | }, 136 | "file_extension": ".py", 137 | "mimetype": "text/x-python", 138 | "name": "python", 139 | "nbconvert_exporter": "python", 140 | "pygments_lexer": "ipython3", 141 | "version": "3.6.3" 142 | } 143 | }, 144 | "nbformat": 4, 145 | "nbformat_minor": 2 146 | } 147 | -------------------------------------------------------------------------------- /README.en.md: -------------------------------------------------------------------------------- 1 | # Vietnamese Automatic Speech Recognition 2 | 3 | ## Mục lục 4 | 5 | 6 | ## Huấn luyện mô hình 7 | 8 | ## Môi trường thử nghiệm 9 | 10 | * Ubuntu 16.04 11 | 12 | ## Cài đặt 13 | 14 | **Cài đặt Kaldi** theo hướng dẫn tại [http://kaldi-asr.org/doc/tutorial_setup.html](http://kaldi-asr.org/doc/tutorial_setup.html) 15 | 16 | ``` 17 | $ git clone https://github.com/kaldi-asr/kaldi.git kaldi-trunk --origin golden 18 | 19 | $ cd kaldi-trunk/tools/; make; 20 | 21 | $ extras/install_openblas.sh 22 | 23 | $ cd ../src; ./configure --openblas-root=../tools/OpenBLAS/install; make 24 | ``` 25 | 26 | **Cài đặt language modeling toolkit srilm** 27 | 28 | Cài đặt dependencies 29 | 30 | ``` 31 | $ apt-get install gawk 32 | ``` 33 | 34 | Cài đặt srilm 35 | 36 | ``` 37 | $ cd kaldi-trunk/tools 38 | $ wget -O srilm.tgz https://raw.githubusercontent.com/denizyuret/nlpcourse/master/download/srilm-1.7.0.tgz 39 | $ ./install_srilm.sh 40 | ... 
41 | Installation of SRILM finished successfully 42 | Please source the tools/env.sh in your path.sh to enable it 43 | ``` 44 | 45 | # Mô tả dữ liệu 46 | 47 | [Xem chi tiết](data_format.md) -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Nhận dạng tiếng nói tiếng Việt 2 | 3 | ![](https://img.shields.io/badge/made%20with-%E2%9D%A4-red.svg) 4 | ![](https://img.shields.io/badge/opensource-vietnamese-blue.svg) 5 | ![](https://img.shields.io/badge/build-passing-green.svg) 6 | 7 | Dự án nghiên cứu về bài toán *Nhận dạng tiếng nói tiếng Việt*, được phát triển bởi nhóm nghiên cứu xử lý ngôn ngữ tự nhiên tiếng Việt - [undertheseanlp](https://github.com/undertheseanlp/). Chứa mã nguồn các thử nghiệm cho việc xử lý dữ liệu, huấn luyện và đánh giá mô hình, cũng như cho phép dễ dàng tùy chỉnh mô hình đối với những tập dữ liệu mới. 8 | 9 | **Nhóm tác giả** 10 | 11 | * Vũ Anh ([anhv.ict91@gmail.com](anhv.ict91@gmail.com)) 12 | * Lê Phi Hùng ([lephihungch@gmail.com](lephihungch@gmail.com)) 13 | 14 | **Tham gia đóng góp** 15 | 16 | Mọi ý kiến đóng góp hoặc yêu cầu trợ giúp xin gửi vào mục [Issues](../../issues) của dự án. Các thảo luận được khuyến khích **sử dụng tiếng Việt** để dễ dàng trong quá trình trao đổi. 17 | 18 | Nếu bạn có kinh nghiệm trong bài toán này, muốn tham gia vào nhóm phát triển với vai trò là [Developer](https://github.com/undertheseanlp/underthesea/wiki/H%C6%B0%E1%BB%9Bng-d%E1%BA%ABn-%C4%91%C3%B3ng-g%C3%B3p#developercontributor), xin hãy đọc kỹ [Hướng dẫn tham gia đóng góp](https://github.com/undertheseanlp/underthesea/wiki/H%C6%B0%E1%BB%9Bng-d%E1%BA%ABn-%C4%91%C3%B3ng-g%C3%B3p#developercontributor). 
19 | 20 | ## Mục lục 21 | 22 | * [Yêu cầu hệ thống](#yêu-cầu-hệ-thống) 23 | * [Thiết lập môi trường](#thiết-lập-môi-trường) 24 | * [Hướng dẫn sử dụng](#hướng-dẫn-sử-dụng) 25 | * [Sử dụng mô hình đã huấn luyện](#sử-dụng-mô-hình-đã-huấn-luyện) 26 | * [Huấn luyện mô hình](#huấn-luyện-mô-hình) 27 | * [Kết quả thử nghiệm](#kết-quả-thử-nghiệm) 28 | * [Trích dẫn](#trích-dẫn) 29 | * [Bản quyền](#bản-quyền) 30 | 31 | ## Yêu cầu hệ thống 32 | 33 | * `Hệ điều hành: Ubuntu 16.04` 34 | * `Python 3.6+` 35 | * `conda 4+` 36 | 37 | 38 | ## Thiết lập môi trường 39 | 40 | **Cài đặt Kaldi** 41 | 42 | Để cài đặt Kaldi, thực hiện theo các bước tại [hướng dẫn](http://kaldi-asr.org/doc/tutorial_setup.html) 43 | 44 | ``` 45 | $ git clone https://github.com/kaldi-asr/kaldi.git kaldi-trunk --origin golden 46 | 47 | $ cd kaldi-trunk/tools/; make; 48 | 49 | $ extras/install_openblas.sh 50 | 51 | $ cd ../src; ./configure --openblas-root=../tools/OpenBLAS/install; make 52 | ``` 53 | 54 | **Cài đặt language modeling toolkit srilm** 55 | 56 | Cài đặt dependencies 57 | 58 | ``` 59 | $ apt-get install gawk 60 | ``` 61 | 62 | **Cài đặt srilm** 63 | 64 | ``` 65 | $ cd kaldi-trunk/tools 66 | $ wget -O srilm.tgz https://raw.githubusercontent.com/denizyuret/nlpcourse/master/download/srilm-1.7.0.tgz 67 | $ ./install_srilm.sh 68 | ... 69 | Installation of SRILM finished successfully 70 | Please source the tools/env.sh in your path.sh to enable it 71 | ``` 72 | 73 | ## Hướng dẫn sử dụng 74 | 75 | ### Huấn luyện mô hình 76 | 77 | **Mô tả dữ liệu**: [Xem chi tiết](data_format.md) 78 | 79 | Trước khi run train.py phải set lại đường dẫn tới kaldi_folder . 
80 | 81 | Method predict nên có thêm argument model_path nếu bạn đã thực hiện train trước đó (vì nếu không nó sẽ lấy theo tmp_path của model, mà tmp_path này random cho mỗi lần khởi tạo lại model để chuẩn bị cho việc chạy training mới) 82 | 83 | Thay đổi N_TRAIN và N_TEST trong init của KaldiSpeechRecognition để đổi giới hạn tập train/test 84 | 85 | Output folder sẽ nằm trong kaldi_folder/egs/uts_{tmp_number} với tmp_number được thấy khi run train.py (EX: "Init Kaldi Speech Recognition in number_of_tmp folder" - Will be updated soon) 86 | 87 | ## Kết quả thử nghiệm 88 | 89 | Huấn luyện trên tập dữ liệu VIVOS - OpenFPT, test trên tập VLSP 2018 90 | 91 | 92 | 93 | 96 | 97 | 98 | 99 | 100 |
Mô hình 94 | WER 95 |
GMM: MFCC + delta + LDA + MLLT75.27%
101 | 102 | Huấn luyện trên tập dữ liệu VIVOS, test trên tập VLSP 2018 103 | 104 | 105 | 106 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 |
Mô hình 107 | WER 108 |
GMM: MFCC + delta + LDA + MLLT79.80%
GMM: MFCC + delta82.03%
118 | 119 | ## Bản quyền 120 | 121 | Mã nguồn của dự án được phân phối theo giấy phép [GPL-3.0](LICENSE.txt). 122 | 123 | Dự án sử dụng tập dữ liệu **[VIVOS](https://ailab.hcmus.edu.vn/vivos/)** trong các thử nghiệm. Xin vui lòng kiểm tra lại thông tin trên website hoặc báo cáo khoa học tương ứng để biết thông tin về bản quyền và trích dẫn khi sử dụng tập dữ liệu này. -------------------------------------------------------------------------------- /data/diadiem/preprocess.py: -------------------------------------------------------------------------------- 1 | import shutil 2 | from os import mkdir 3 | import re 4 | from text import phone2word 5 | 6 | 7 | def create_train_text(): 8 | lines = open( 9 | "raw/huanluyen_diadiem_train.transcription").read().splitlines() 10 | output = [] 11 | for line in lines: 12 | m = re.match(r"^ (?P.*) \((?P.*)\)$", line) 13 | if m: 14 | text = phone2word(m.group("text").lower()) 15 | fileid = m.group("fileid") 16 | content = "{}|{}".format(fileid, text) 17 | output.append(content) 18 | pass 19 | else: 20 | raise Exception("Content not match.") 21 | text = "\n".join(output) 22 | open("corpus/train/text", "w").write(text) 23 | 24 | 25 | def create_test_text(): 26 | lines = open( 27 | "raw/huanluyen_diadiem_test.transcription").read().splitlines() 28 | output = [] 29 | for line in lines: 30 | m = re.match(r"^(?P.*) \((?P.*)\)$", line) 31 | if m: 32 | text = phone2word(m.group("text").lower()) 33 | fileid = m.group("fileid") 34 | content = "{}|{}".format(fileid, text) 35 | output.append(content) 36 | pass 37 | else: 38 | raise Exception("Text not match.") 39 | text = "\n".join(output) 40 | open("corpus/test/text", "w").write(text) 41 | 42 | 43 | try: 44 | shutil.rmtree("corpus") 45 | except: 46 | pass 47 | finally: 48 | mkdir("corpus") 49 | mkdir("corpus/train") 50 | mkdir("corpus/test") 51 | shutil.copytree("raw/wav/train", "corpus/train/wav") 52 | shutil.copytree("raw/wav/test", "corpus/test/wav") 53 | create_train_text() 54 | 
"""Mapping between Vietnamese orthography and ASCII telex-style phone strings.

Each row of ``rules_1`` is a base vowel followed by its five tone-marked
variants (huyền, sắc, hỏi, ngã, nặng); ``rules_2`` lists the two-letter ASCII
spellings of the compound letters (ă, â, ê, ô, ơ, ư, đ).
"""

rules_1 = [
    "aàáảãạ",
    "ăằắẳẵặ",
    "âầấẩẫậ",
    "eèéẻẽẹ",
    "êềếểễệ",
    "iìíỉĩị",
    "oòóỏõọ",
    "ôồốổỗộ",
    "ơờớởỡợ",
    "uùúủũụ",
    "ưừứửữự",
    "yỳýỷỹỵ"
]
rules_2 = [
    "awă",
    "aaâ",
    "eeê",
    "ooô",
    "owơ",
    "uwư",
    "ddđ"
]

# ASCII spelling of each compound base letter, e.g. "ă" -> "aw".
_base_for = {compound[2]: compound[0:2] for compound in rules_2}
# Tone suffixes in the same order as the toned variants in rules_1.
_TONES = "fsrxj"

# w2p: single Vietnamese character -> ASCII phone string.
w2p = {}
for _row in rules_1:
    _base = _base_for.get(_row[0], _row[0])
    for _tone, _toned in zip(_TONES, _row[1:]):
        w2p[_toned] = _base + _tone
for _compound in rules_2:
    w2p[_compound[2]] = _compound[0:2]

# p2w: inverse table, ASCII phone string -> Vietnamese character.
p2w = {_phone: _char for _char, _phone in w2p.items()}


def word2phone(word):
    """Spell *word* with ASCII phone strings; unmapped characters pass through."""
    return "".join(w2p.get(character, character) for character in word)


def phone2word(phone):
    """Greedy inverse of :func:`word2phone`.

    Tries the longest phone chunk first (3 characters, then 2); anything
    that matches no chunk is copied through unchanged.
    """
    pieces = []
    position = 0
    while position < len(phone):
        for width in (3, 2):
            chunk = phone[position:position + width]
            if chunk in p2w:
                pieces.append(p2w[chunk])
                position += width
                break
        else:
            pieces.append(phone[position])
            position += 1
    return "".join(pieces)


if __name__ == '__main__':
    cases = [
        ("con hoẵng", "con hoawxng"),
        ("lựu đạn", "luwju ddajn"),
        ("kiểm tra", "kieerm tra"),
        ("ủy ban", "ury ban"),
        ("cà phê", "caf phee"),
        ("khách sạn", "khasch sajn"),
        ("đúng", "ddusng"),
        ("xã hội", "xax hooji")
    ]
    for expected_word, expected_phone in cases:
        assert expected_word == phone2word(expected_phone)
        assert expected_phone == word2phone(expected_word)
"""Convert the raw VIVOS download (raw/) into the standard corpus layout
(corpus/train, corpus/test) described in data_format.md.

Runs as a script: executing this module rebuilds corpus/ from scratch.
"""
import shutil
from os import mkdir, walk
from os import listdir
from os.path import join


def _copy_waves(split):
    """Flatten raw/<split>/waves/<speaker>/*.wav into corpus/<split>/wav/.

    The raw layout groups wav files per speaker; the corpus layout keeps
    them all in a single flat directory (file names are globally unique).
    """
    waves_folder = "raw/{}/waves".format(split)
    corpus_waves_folder = "corpus/{}/wav".format(split)
    # Best-effort reset of the output folder (may not exist yet).
    shutil.rmtree(corpus_waves_folder, ignore_errors=True)
    mkdir(corpus_waves_folder)
    for root, dirs, files in walk(waves_folder):
        for speaker_dir in dirs:
            for wav_name in listdir(join(waves_folder, speaker_dir)):
                shutil.copy(
                    join(waves_folder, speaker_dir, wav_name),
                    join(corpus_waves_folder, wav_name))


def create_train_waves():
    """Copy the training wav files into the corpus layout."""
    _copy_waves("train")


def create_test_waves():
    """Copy the test wav files into the corpus layout."""
    _copy_waves("test")


def _convert_prompts(split):
    """Write corpus/<split>/text from raw/<split>/prompts.txt.

    Each prompts line is "<file_id> <words...>"; the corpus format is
    "<file_id>|<lowercased words>".
    """
    with open("raw/{}/prompts.txt".format(split)) as handle:
        content = handle.read()
    # ":" is punctuation noise in the prompts, never transcript content.
    content = content.replace(":", "")
    output = []
    for line in content.splitlines():
        items = line.split()
        if not items:  # tolerate stray blank lines in the raw prompts
            continue
        fileid = items[0]
        text = " ".join(items[1:]).lower()
        output.append("{}|{}".format(fileid, text))
    with open("corpus/{}/text".format(split), "w") as handle:
        handle.write("\n".join(output))


def create_train_text():
    """Build the training transcript file."""
    _convert_prompts("train")


def create_test_text():
    """Build the test transcript file."""
    _convert_prompts("test")


def create_gender():
    """Copy the speaker-gender listings verbatim into the corpus."""
    for split in ("train", "test"):
        with open("raw/{}/genders.txt".format(split)) as handle:
            content = handle.read()
        with open("corpus/{}/gender".format(split), "w") as handle:
            handle.write(content)


def _write_speakers(split):
    """Write corpus/<split>/speaker: "<speaker_id> <file_id>" per utterance.

    The speaker id is the prefix of the file id before the first "_".
    """
    with open("raw/{}/prompts.txt".format(split)) as handle:
        lines = handle.read().splitlines()
    entries = []
    for line in lines:
        items = line.split()
        if not items:
            continue
        file_id = items[0]
        speaker_id = file_id.split("_")[0]
        entries.append("{} {}".format(speaker_id, file_id))
    with open("corpus/{}/speaker".format(split), "w") as handle:
        handle.write("\n".join(entries))


def create_speaker():
    """Build the speaker mapping files for both splits."""
    for split in ("train", "test"):
        _write_speakers(split)


# Rebuild corpus/ from scratch (module-level on purpose: this file is a
# one-shot preprocessing script).
shutil.rmtree("corpus", ignore_errors=True)
mkdir("corpus")
mkdir("corpus/train")
mkdir("corpus/test")
create_train_waves()
create_test_waves()
create_train_text()
create_test_text()
create_gender()
create_speaker()
Được áp dụng trong các thí nghiệm của [`underthesea`](https://github.com/undertheseanlp/automatic_speech_recognition) từ phiên bản 1.2.0 9 | 10 | Các ví dụ mẫu: [`diadiem`](https://github.com/undertheseanlp/automatic_speech_recognition/tree/sphinx_lab/data/diadiem/corpus) corpus 11 | 12 | ### Tập dữ liệu 13 | 14 | Dữ liệu của bài toán nhận dạng tiếng nói được lưu trong một thư mục, gồm hai thư mục con `train` và `test`. 15 | 16 | * Dữ liệu huấn luyện được lưu trong thư mục `train` 17 | * Dữ liệu kiểm thử được lưu trong thư mục `test` 18 | 19 | Cấu trúc thư mục 20 | 21 | ``` 22 | . 23 | ├── train 24 | | ├── wav 25 | | | ├── train_01.wav 26 | | | ├── train_02.wav 27 | | | └── train_03.wav 28 | | ├── gender 29 | | ├── speaker 30 | | └── text 31 | └── test 32 | ├── wav 33 | | ├── test_01.wav 34 | | ├── test_02.wav 35 | | └── test_03.wav 36 | ├── gender 37 | ├── speaker 38 | └── text 39 | ``` 40 | 41 | Mỗi thư mục `train` và `test` gồm thư mục con `wav`, file `gender`, file `speaker` và file `text`. Trong thư mục `wav` có chứa các file âm thanh (với đuôi định dạng phổ biến là wav), chứa dữ liệu âm thanh. 
42 | 43 | File `text` chứa nội dung của từng câu nói với tên file âm thanh tương ứng 44 | 45 | *Format*: `|` 46 | 47 | ``` 48 | train_01|text content 01 49 | train_02|text content 02 50 | train_03|text content 03 51 | train_04|text content 04 52 | ``` 53 | 54 | File `speaker` chứa mô tả speaker id với câu nói tương ứng 55 | 56 | *Format*: ` ` 57 | 58 | ``` 59 | spk01 train_01 60 | spk01 train_02 61 | spk02 train_03 62 | spk02 train_04 63 | ``` 64 | 65 | File `gender` chứa thông tin về giới tính của speaker 66 | 67 | *Format*: ` ` 68 | 69 | ``` 70 | spk01 f 71 | spk02 m 72 | ``` 73 | 74 | Ký hiệu: 75 | 76 | * `f` (female): speaker có giới tính nữ 77 | * `m` (male): speakder có giới tính nam -------------------------------------------------------------------------------- /egs/diadiem/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/__init__.py -------------------------------------------------------------------------------- /egs/diadiem/analyze.py: -------------------------------------------------------------------------------- 1 | from model import transcript 2 | from os.path import join, dirname 3 | from extension.analyze import WERAnalyzeLogger 4 | 5 | corpus_folder = join(dirname(dirname(dirname(__file__))), "data", "diadiem", 6 | "corpus") 7 | 8 | 9 | def load_test(): 10 | lines = open(join(corpus_folder, "test", "text")).read().splitlines() 11 | lines = [line.split("|") for line in lines] 12 | wavs = [line[0] for line in lines] 13 | wavs = ["{}/test/wav/{}.wav".format(corpus_folder, wav) for wav in wavs] 14 | texts = [line[1] for line in lines] 15 | return wavs, texts 16 | 17 | 18 | wavs_test, texts_test = load_test() 19 | # texts_pred = [""] * len(texts_test) 20 | texts_pred = [transcript(wav_file) for wav_file in wavs_test] 21 | 22 | log_folder = join(dirname(__file__), "analyze") 23 | 
class WERAnalyzeLogger:
    """Write WER analysis artifacts for a set of test utterances.

    Produces, inside *log_folder*:
      * result.json            — {"WER": <mean per-utterance WER>}
      * wav/                   — a copy of every test wav file
      * speechrecognition.json — reference texts, predictions and wav paths
    """

    @staticmethod
    def log(wavs_test, texts_test, texts_pred, log_folder):
        """Compute the corpus WER and dump the analysis files.

        :param wavs_test: paths of the test wav files
        :param texts_test: reference transcripts (one per wav)
        :param texts_pred: predicted transcripts (one per wav)
        :param log_folder: existing output directory
        """
        wer = np.mean([calculate_wer(test.split(), pred.split())
                       for test, pred in zip(texts_test, texts_pred)])
        # Cast to a builtin float: json.dumps raises TypeError on numpy
        # scalars (np.mean/np.round return numpy.float64, not float).
        wer = float(np.round(wer, 4))
        result = {
            "WER": wer
        }
        content = json.dumps(result, ensure_ascii=False)
        log_file = join(log_folder, "result.json")
        write(log_file, content)
        # Mirror the wav files next to the log so the report is portable.
        wav_folder = join(log_folder, "wav")
        shutil.rmtree(wav_folder, ignore_errors=True)
        os.mkdir(wav_folder)
        for wav in wavs_test:
            new_path = join(wav_folder, basename(wav))
            shutil.copyfile(wav, new_path)
        # Relative paths, so the JSON stays valid if log_folder moves.
        wavs_test_new_path = [join("wav", basename(wav)) for wav in wavs_test]
        speech_recognition = {
            "texts_test": texts_test,
            "texts_pred": texts_pred,
            "wavs_test": wavs_test_new_path,
        }
        content = json.dumps(speech_recognition, ensure_ascii=False)
        log_file = join(log_folder, "speechrecognition.json")
        write(log_file, content)

        print("Result is written in {}".format(log_file))
        print("WER: {}%".format(wer * 100))
class SphinxSpeechRecognitionExporter:
    """Copy a trained Sphinx model's artifacts out of its scratch folder."""

    @staticmethod
    def export(model, export_folder):
        """Mirror the model's etc/ and model_parameters/ trees into
        *export_folder*, replacing any previous copy of each tree.

        :param model: object exposing a ``tmp_folder`` attribute (the
            training scratch directory)
        :param export_folder: destination directory (must already exist)
        """
        source_root = model.tmp_folder
        for subdir in ("etc", "model_parameters"):
            destination = join(export_folder, subdir)
            # Best-effort removal of a stale copy; copytree requires the
            # destination to be absent.
            try:
                shutil.rmtree(destination)
            except:
                pass
            finally:
                shutil.copytree(join(source_root, subdir), destination)
def calculate_wer(reference, hypothesis):
    """Word error rate via Levenshtein distance, normalized by reference length.

    O(n*m) time and space in the lengths of the two word sequences.

    :param reference: list of reference words
    :param hypothesis: list of hypothesis words
    :return: edit distance divided by ``len(reference)`` (a float);
        for an empty reference, 0.0 if the hypothesis is also empty,
        otherwise the number of inserted words.

    >>> round(calculate_wer("who is there".split(), "is there".split()), 4)
    0.3333
    >>> calculate_wer("who is there".split(), "".split())
    1.0
    >>> calculate_wer("".split(), "who is there".split())
    3.0
    """
    import numpy
    if len(reference) == 0:
        # Guard: the normalization below would divide by zero. Every
        # hypothesis word counts as an insertion against an empty reference.
        return 0.0 if len(hypothesis) == 0 else float(len(hypothesis))

    # int64 accumulator: the original uint8 silently overflowed (mod 256)
    # for sequences longer than 255 words.
    d = numpy.zeros((len(reference) + 1, len(hypothesis) + 1),
                    dtype=numpy.int64)
    # Base cases: transforming to/from the empty prefix.
    d[0, :] = numpy.arange(len(hypothesis) + 1)
    d[:, 0] = numpy.arange(len(reference) + 1)

    # Standard dynamic-programming fill.
    for i in range(1, len(reference) + 1):
        for j in range(1, len(hypothesis) + 1):
            if reference[i - 1] == hypothesis[j - 1]:
                d[i][j] = d[i - 1][j - 1]
            else:
                substitution = d[i - 1][j - 1] + 1
                insertion = d[i][j - 1] + 1
                deletion = d[i - 1][j] + 1
                d[i][j] = min(substitution, insertion, deletion)

    return d[len(reference)][len(hypothesis)] / float(len(reference))
18 | self._init_data() 19 | self._change_config() 20 | self._make_transcription() 21 | self._make_dictionary() 22 | self._make_filler() 23 | self._make_language_model() 24 | 25 | # ========================== # 26 | # Init Data 27 | # ========================== # 28 | def _init_data(self): 29 | os.system("cd {}; mkdir wav".format(self.tmp_folder)) 30 | 31 | os.system("cd {}; cp -r {}/train/wav wav/train".format(self.tmp_folder, 32 | self.corpus_folder)) 33 | os.system("cd {}; cp -r {}/test/wav wav/test".format(self.tmp_folder, 34 | self.corpus_folder)) 35 | 36 | ids = open( 37 | "{}/train/text".format(self.corpus_folder)).read().splitlines() 38 | ids = [item.split("|")[0] for item in ids] 39 | ids = ["train/{}".format(id) for id in ids] 40 | ids.append("") 41 | content = "\n".join(ids) 42 | open(os.path.join(self.tmp_folder, "etc", "tmp_train.fileids"), 43 | "w").write(content) 44 | 45 | ids = open( 46 | "{}/test/text".format(self.corpus_folder)).read().splitlines() 47 | ids = [item.split("|")[0] for item in ids] 48 | ids = ["test/{}".format(id) for id in ids] 49 | ids.append("") 50 | content = "\n".join(ids) 51 | open(os.path.join(self.tmp_folder, "etc", "tmp_test.fileids"), 52 | "w").write(content) 53 | 54 | # ========================== # 55 | # Config 56 | # ========================== # 57 | def _change_config(self): 58 | config_file = os.path.join(self.tmp_folder, "etc", "sphinx_train.cfg") 59 | config = SphinxConfig(config_file) 60 | config.set("$CFG_BASE_DIR", "\".\"") 61 | config.set("$CFG_WAVFILE_SRATE", 8000.0) 62 | config.set("$CFG_NUM_FILT", 31) 63 | config.set("$CFG_LO_FILT", 200) 64 | config.set("$CFG_HI_FILT", 3500) 65 | config.set("$CFG_WAVFILE_TYPE", "'raw'") 66 | config.set("$CFG_LANGUAGEMODEL", 67 | "\"$CFG_LIST_DIR/$CFG_DB_NAME.lm\"") 68 | config.set("$DEC_CFG_LANGUAGEMODEL", 69 | "\"$CFG_BASE_DIR/etc/${CFG_DB_NAME}.lm\"") 70 | 71 | # ========================== # 72 | # Transcription 73 | # ========================== # 74 | def 
_convert_transcription(self, in_file, out_file): 75 | lines = open(in_file).read().splitlines() 76 | output = [] 77 | for line in lines: 78 | fileid, word = line.split("|") 79 | phone = text.word2phone(word) 80 | content = " {} ({})".format(phone, fileid) 81 | output.append(content) 82 | content = "\n".join(output) 83 | open(out_file, "w").write(content) 84 | 85 | def _make_transcription(self): 86 | self._convert_transcription( 87 | "{}/train/text".format(self.corpus_folder), 88 | "{}/etc/tmp_train.transcription".format(self.tmp_folder)) 89 | self._convert_transcription( 90 | "{}/test/text".format(self.corpus_folder), 91 | "{}/etc/tmp_test.transcription".format(self.tmp_folder)) 92 | 93 | # ============================== # 94 | # Create dictionary and phones 95 | # ============================== # 96 | def _make_dictionary(self): 97 | lines = open( 98 | "{}/train/text".format(self.corpus_folder)).read().splitlines() 99 | phones = [] 100 | for line in lines: 101 | fileid, word = line.split("|") 102 | p = text.word2phone(word).split() 103 | phones += p 104 | phones = sorted(set(phones)) 105 | # create .dic files 106 | lines = [] 107 | phone_units = [] 108 | for p in phones: 109 | units = list(p) 110 | phone_units += units 111 | units = " ".join(units) 112 | line = "{:20s}{}".format(p, units) 113 | lines.append(line) 114 | open("{}/etc/tmp.dic".format(self.tmp_folder), "w").write( 115 | "\n".join(lines)) 116 | phone_units = sorted(set(phone_units)) 117 | phone_units.append("SIL") 118 | open("{}/etc/tmp.phone".format(self.tmp_folder), "w").write( 119 | "\n".join(phone_units)) 120 | 121 | def _make_filler(self): 122 | fillers = ["", "", ""] 123 | lines = ["{:20s}SIL".format(f) for f in fillers] 124 | open("{}/etc/tmp.filler".format(self.tmp_folder), "w").write( 125 | "\n".join(lines)) 126 | 127 | # ========================== # 128 | # Language Model 129 | # ========================== # 130 | def _make_cleaned_text(self): 131 | in_file = 
"{}/train/text".format(self.corpus_folder) 132 | out_file = "{}/etc/text".format(self.tmp_folder) 133 | lines = open(in_file).read().splitlines() 134 | output = [] 135 | for line in lines: 136 | fileid, word = line.split("|") 137 | phone = text.word2phone(word) 138 | content = " {} ".format(phone, fileid) 139 | output.append(content) 140 | content = "\n".join(output) 141 | open(out_file, "w").write(content) 142 | 143 | def _make_language_model(self): 144 | self._make_cleaned_text() 145 | etc_folder = os.path.join(self.tmp_folder, "etc") 146 | chdir = "cd {}; ".format(etc_folder) 147 | os.system(chdir + "text2wfreq < text | wfreq2vocab > vocab") 148 | os.system(chdir + "text2idngram -vocab vocab -idngram idngram < text") 149 | os.system( 150 | chdir + "idngram2lm -vocab_type 0 -idngram idngram -vocab vocab -arpa tmp.lm") 151 | 152 | def fit(self): 153 | chdir = "cd {}; ".format(self.tmp_folder) 154 | os.system(chdir + "sphinxtrain run") 155 | 156 | def predict(self, wav_file): 157 | command = "pocketsphinx_continuous -hmm {}/model_parameters/tmp.cd_cont_200 -samprate 8000 -lm {}/etc/tmp.lm -dict {}/etc/tmp.dic -infile {} -logfn yes".format( 158 | self.tmp_folder, self.tmp_folder, self.tmp_folder, wav_file) 159 | output = os.popen(command).read().strip() 160 | output = text.phone2word(output) 161 | return output 162 | 163 | 164 | class SphinxConfig: 165 | def __init__(self, config_file): 166 | self.file = config_file 167 | self.lines = open(config_file).read().splitlines() 168 | 169 | def save(self): 170 | content = "\n".join(self.lines) 171 | open(self.file, "w").write(content) 172 | 173 | def set(self, key, value): 174 | for i, line in enumerate(self.lines): 175 | if line.startswith(key): 176 | content = "{} = {};".format(key, value) 177 | self.lines[i] = content 178 | self.save() 179 | -------------------------------------------------------------------------------- /egs/diadiem/extension/text.py: 
# -----------------------------------------------------------------------------
# egs/diadiem/extension/text.py
# -----------------------------------------------------------------------------
# Bidirectional mapping between Vietnamese orthography and the ASCII
# "telex-like" phone strings used by this Sphinx recipe.
#
# rules_1: each row is a base vowel followed by its five toned variants
#          (tone marks f/s/r/x/j appended to the base spelling).
# rules_2: telex digraphs for the special letters (aw=ă, aa=â, ..., dd=đ).
rules_1 = [
    "aàáảãạ",
    "ăằắẳẵặ",
    "âầấẩẫậ",
    "eèéẻẽẹ",
    "êềếểễệ",
    "iìíỉĩị",
    "oòóỏõọ",
    "ôồốổỗộ",
    "ơờớởỡợ",
    "uùúủũụ",
    "ưừứửữự",
    "yỳýỷỹỵ"
]
rules_2 = [
    "awă",
    "aaâ",
    "eeê",
    "ooô",
    "owơ",
    "uwư",
    "ddđ"
]

# character -> phone table
w2p = {}
for row in rules_1:
    base, toned = row[0], row[1:]
    # spell the base with its telex digraph when one exists (e.g. ă -> aw)
    for digraph in rules_2:
        if digraph[2] == base:
            base = digraph[:2]
    for mark, letter in zip("fsrxj", toned):
        w2p[letter] = base + mark
# the special letters themselves map to their bare digraphs
for digraph in rules_2:
    w2p[digraph[2]] = digraph[:2]
# phone -> character is the inverse table
p2w = {phone: letter for letter, phone in w2p.items()}


def word2phone(word):
    """Transliterate a Vietnamese word into its ASCII phone string."""
    return "".join(w2p.get(letter, letter) for letter in word)


def phone2word(phone):
    """Invert word2phone: greedily match 3- then 2-character phone groups,
    copying unmatched characters through verbatim."""
    chars = []
    pos = 0
    while pos < len(phone):
        for width in (3, 2):
            group = phone[pos:pos + width]
            if group in p2w:
                chars.append(p2w[group])
                pos += width
                break
        else:
            chars.append(phone[pos])
            pos += 1
    return "".join(chars)

if __name__ == '__main__':
    tests = [
        ("con hoẵng", "con hoawxng"),
        ("lựu đạn", "luwju ddajn"),
        ("kiểm tra", "kieerm tra"),
        ("ủy ban", "ury ban"),
        ("cà phê", "caf phee"),
        ("khách sạn", "khasch sajn"),
        ("đúng", "ddusng"),
        ("xã hội", "xax hooji")
    ]
    for word, phone in tests:
        assert (word == phone2word(phone))
        assert (phone == word2phone(word))
# -----------------------------------------------------------------------------
# egs/diadiem/load_data.py
# -----------------------------------------------------------------------------
from os.path import dirname, join

# NOTE(review): the closing `"corpus")` of this call sits in the next chunk
# of this dump; the statement is re-joined here.
corpus_folder = join(dirname(dirname(dirname(__file__))), "data", "diadiem",
                     "corpus")
4 | "corpus") 5 | -------------------------------------------------------------------------------- /egs/diadiem/model/__init__.py: -------------------------------------------------------------------------------- 1 | from os.path import dirname 2 | import os 3 | import text 4 | 5 | 6 | def transcript(wav_file): 7 | tmp_folder = dirname(__file__) 8 | command = "pocketsphinx_continuous " \ 9 | "-hmm {0}/model_parameters/tmp.cd_cont_200 " \ 10 | "-samprate 8000 " \ 11 | "-lm {0}/etc/tmp.lm " \ 12 | "-dict {0}/etc/tmp.dic " \ 13 | "-infile {1} " \ 14 | "-logfn {0}/yes".format(tmp_folder, wav_file) 15 | with os.popen(command) as c: 16 | output = c.read().strip() 17 | output = text.phone2word(output) 18 | os.remove("{}/yes".format(tmp_folder)) 19 | return output 20 | -------------------------------------------------------------------------------- /egs/diadiem/model/etc/feat.params: -------------------------------------------------------------------------------- 1 | -lowerf __CFG_LO_FILT__ 2 | -upperf __CFG_HI_FILT__ 3 | -nfilt __CFG_NUM_FILT__ 4 | -transform __CFG_TRANSFORM__ 5 | -lifter __CFG_LIFTER__ 6 | -feat __CFG_FEATURE__ 7 | -svspec __CFG_SVSPEC__ 8 | -agc __CFG_AGC__ 9 | -cmn __CFG_CMN__ 10 | -varnorm __CFG_VARNORM__ 11 | -------------------------------------------------------------------------------- /egs/diadiem/model/etc/idngram: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/model/etc/idngram -------------------------------------------------------------------------------- /egs/diadiem/model/etc/tmp.dic: -------------------------------------------------------------------------------- 1 | ay a y 2 | caf c a f 3 | ddusng d d u s n g 4 | ka k a 5 | ke k e 6 | khasch k h a s c h 7 | khoong k h o o n g 8 | laji l a j i 9 | mast m a s t 10 | phee p h e e 11 | rao r a o 12 | sajn s a j n 13 | teem t e e m 
14 | trajm t r a j m 15 | trowr t r o w r 16 | xa x a -------------------------------------------------------------------------------- /egs/diadiem/model/etc/tmp.filler: -------------------------------------------------------------------------------- 1 | SIL 2 | SIL 3 | SIL -------------------------------------------------------------------------------- /egs/diadiem/model/etc/tmp.lm: -------------------------------------------------------------------------------- 1 | ############################################################################# 2 | ## Copyright (c) 1996, Carnegie Mellon University, Cambridge University, 3 | ## Ronald Rosenfeld and Philip Clarkson 4 | ## Version 3, Copyright (c) 2006, Carnegie Mellon University 5 | ## Contributors includes Wen Xu, Ananlada Chotimongkol, 6 | ## David Huggins-Daines, Arthur Chan and Alan Black 7 | ############################################################################# 8 | ============================================================================= 9 | =============== This file was produced by the CMU-Cambridge =============== 10 | =============== Statistical Language Modeling Toolkit =============== 11 | ============================================================================= 12 | This is a 3-gram language model, based on a vocabulary of 18 words, 13 | which begins "", "", "ay"... 14 | This is a CLOSED-vocabulary model 15 | (OOVs eliminated from training data and are forbidden in test data) 16 | Good-Turing discounting was applied. 17 | 1-gram frequency of frequency : 0 18 | 2-gram frequency of frequency : 0 0 0 0 0 0 0 19 | 3-gram frequency of frequency : 0 0 0 0 0 0 0 20 | 1-gram discounting ratios : 21 | 2-gram discounting ratios : 22 | 3-gram discounting ratios : 23 | This file is in the ARPA-standard format introduced by Doug Paul. 
24 | 25 | p(wd3|wd1,wd2)= if(trigram exists) p_3(wd1,wd2,wd3) 26 | else if(bigram w1,w2 exists) bo_wt_2(w1,w2)*p(wd3|wd2) 27 | else p(wd3|w2) 28 | 29 | p(wd2|wd1)= if(bigram exists) p_2(wd1,wd2) 30 | else bo_wt_1(wd1)*p_1(wd2) 31 | 32 | All probs and back-off weights (bo_wt) are given in log10 form. 33 | 34 | Data formats: 35 | 36 | Beginning of data mark: \data\ 37 | ngram 1=nr # number of 1-grams 38 | ngram 2=nr # number of 2-grams 39 | ngram 3=nr # number of 3-grams 40 | 41 | \1-grams: 42 | p_1 wd_1 bo_wt_1 43 | \2-grams: 44 | p_2 wd_1 wd_2 bo_wt_2 45 | \3-grams: 46 | p_3 wd_1 wd_2 wd_3 47 | 48 | end of data mark: \end\ 49 | 50 | \data\ 51 | ngram 1=18 52 | ngram 2=25 53 | ngram 3=32 54 | 55 | \1-grams: 56 | -0.5755 -3.5579 57 | -0.5754 -3.5587 58 | -1.6028 ay -2.6555 59 | -1.5908 caf -2.6672 60 | -1.2657 ddusng -2.8684 61 | -1.5982 ka -2.6601 62 | -1.5982 ke -2.5370 63 | -1.6066 khasch -2.6519 64 | -1.2289 khoong -2.9053 65 | -1.5817 laji -2.5534 66 | -1.5899 mast -2.6681 67 | -1.5908 phee -2.5444 68 | -1.5982 rao -2.6601 69 | -1.6066 sajn -2.5286 70 | -1.6028 teem -2.5324 71 | -1.6028 trajm -2.6555 72 | -1.5817 trowr -2.6760 73 | -1.5899 xa -2.5453 74 | 75 | \2-grams: 76 | -0.0001 0.0008 77 | -1.0152 caf 0.0009 78 | -0.6907 ddusng 0.0004 79 | -1.0226 ka 0.0009 80 | -1.0310 khasch 0.0009 81 | -0.6538 khoong 0.0004 82 | -1.0143 mast 0.0009 83 | -1.0273 trajm 0.0009 84 | -1.0053 trowr 0.0000 85 | -0.0009 ay teem 0.0009 86 | -0.0009 caf phee 0.0009 87 | -0.0004 ddusng 0.6900 88 | -0.0009 ka rao 0.0009 89 | -0.0009 ke 1.0219 90 | -0.0009 khasch sajn 0.0009 91 | -0.0004 khoong 0.6531 92 | -0.0009 laji 1.0055 93 | -0.0009 mast xa 0.0009 94 | -0.0009 phee 1.0145 95 | -0.0009 rao ke 0.0009 96 | -0.0009 sajn 1.0303 97 | -0.0009 teem 1.0266 98 | -0.0009 trajm ay 0.0009 99 | -0.0009 trowr laji 0.0009 100 | -0.0009 xa 1.0136 101 | 102 | \3-grams: 103 | -1.0163 caf 104 | -0.6903 ddusng 105 | -1.0227 ka 106 | -1.0312 khasch 107 | -0.6534 khoong 108 | -1.0144 mast 109 | 
-1.0274 trajm 110 | -1.0054 trowr 111 | -0.0009 caf phee 112 | -0.0004 ddusng 113 | -0.0009 ka rao 114 | -0.0009 khasch sajn 115 | -0.0004 khoong 116 | -0.0009 mast xa 117 | -0.0009 trajm ay 118 | -0.0009 trowr laji 119 | -0.0009 ay teem 120 | -0.0009 caf phee 121 | -0.0004 ddusng 122 | -0.0009 ka rao ke 123 | -0.0009 ke 124 | -0.0009 khasch sajn 125 | -0.0004 khoong 126 | -0.0009 laji 127 | -0.0009 mast xa 128 | -0.0009 phee 129 | -0.0009 rao ke 130 | -0.0009 sajn 131 | -0.0009 teem 132 | -0.0009 trajm ay teem 133 | -0.0009 trowr laji 134 | -0.0009 xa 135 | 136 | \end\ 137 | -------------------------------------------------------------------------------- /egs/diadiem/model/etc/tmp.phone: -------------------------------------------------------------------------------- 1 | a 2 | c 3 | d 4 | e 5 | f 6 | g 7 | h 8 | i 9 | j 10 | k 11 | l 12 | m 13 | n 14 | o 15 | p 16 | r 17 | s 18 | t 19 | u 20 | w 21 | x 22 | y 23 | SIL -------------------------------------------------------------------------------- /egs/diadiem/model/etc/tmp_test.fileids: -------------------------------------------------------------------------------- 1 | test/CAFPHEE001 2 | test/CAFPHEE002 3 | test/CAFPHEE003 4 | test/CAFPHEE004 5 | test/CAFPHEE005 6 | test/CAFPHEE006 7 | test/CAFPHEE007 8 | test/CAFPHEE008 9 | test/CAFPHEE009 10 | test/CAFPHEE010 11 | test/DDUSNG0001 12 | test/DDUSNG0002 13 | test/DDUSNG0003 14 | test/DDUSNG0004 15 | test/DDUSNG0005 16 | test/DDUSNG0006 17 | test/DDUSNG0007 18 | test/DDUSNG0008 19 | test/DDUSNG0009 20 | test/DDUSNG0010 21 | test/KARAOKE001 22 | test/KARAOKE002 23 | test/KARAOKE003 24 | test/KARAOKE004 25 | test/KARAOKE005 26 | test/KARAOKE006 27 | test/KARAOKE007 28 | test/KARAOKE008 29 | test/KARAOKE009 30 | test/KARAOKE010 31 | test/KHASCHSAJN001 32 | test/KHASCHSAJN002 33 | test/KHASCHSAJN003 34 | test/KHASCHSAJN004 35 | test/KHASCHSAJN005 36 | test/KHASCHSAJN006 37 | test/KHASCHSAJN007 38 | test/KHASCHSAJN008 39 | test/KHASCHSAJN009 40 | test/KHASCHSAJN010 41 
| test/KHOONG0001 42 | test/KHOONG0002 43 | test/KHOONG0003 44 | test/KHOONG0004 45 | test/KHOONG0005 46 | test/KHOONG0006 47 | test/KHOONG0007 48 | test/KHOONG0008 49 | test/KHOONG0009 50 | test/KHOONG0010 51 | test/MASTXA001 52 | test/MASTXA002 53 | test/MASTXA003 54 | test/MASTXA004 55 | test/MASTXA005 56 | test/MASTXA006 57 | test/MASTXA007 58 | test/MASTXA008 59 | test/MASTXA009 60 | test/MASTXA010 61 | test/TRAJMAYTEEM001 62 | test/TRAJMAYTEEM002 63 | test/TRAJMAYTEEM003 64 | test/TRAJMAYTEEM004 65 | test/TRAJMAYTEEM005 66 | test/TRAJMAYTEEM006 67 | test/TRAJMAYTEEM007 68 | test/TRAJMAYTEEM008 69 | test/TRAJMAYTEEM009 70 | test/TRAJMAYTEEM010 71 | test/TROWRLAJI001 72 | test/TROWRLAJI002 73 | test/TROWRLAJI003 74 | test/TROWRLAJI004 75 | test/TROWRLAJI005 76 | test/TROWRLAJI006 77 | test/TROWRLAJI007 78 | test/TROWRLAJI008 79 | test/TROWRLAJI009 80 | test/TROWRLAJI010 81 | -------------------------------------------------------------------------------- /egs/diadiem/model/etc/tmp_test.transcription: -------------------------------------------------------------------------------- 1 | caf phee (CAFPHEE001) 2 | caf phee (CAFPHEE002) 3 | caf phee (CAFPHEE003) 4 | caf phee (CAFPHEE004) 5 | caf phee (CAFPHEE005) 6 | caf phee (CAFPHEE006) 7 | caf phee (CAFPHEE007) 8 | caf phee (CAFPHEE008) 9 | caf phee (CAFPHEE009) 10 | caf phee (CAFPHEE010) 11 | ddusng (DDUSNG0001) 12 | ddusng (DDUSNG0002) 13 | ddusng (DDUSNG0003) 14 | ddusng (DDUSNG0004) 15 | ddusng (DDUSNG0005) 16 | ddusng (DDUSNG0006) 17 | ddusng (DDUSNG0007) 18 | ddusng (DDUSNG0008) 19 | ddusng (DDUSNG0009) 20 | ddusng (DDUSNG0010) 21 | ka rao ke (KARAOKE001) 22 | ka rao ke (KARAOKE002) 23 | ka rao ke (KARAOKE003) 24 | ka rao ke (KARAOKE004) 25 | ka rao ke (KARAOKE005) 26 | ka rao ke (KARAOKE006) 27 | ka rao ke (KARAOKE007) 28 | ka rao ke (KARAOKE008) 29 | ka rao ke (KARAOKE009) 30 | ka rao ke (KARAOKE010) 31 | khasch sajn (KHASCHSAJN001) 32 | khasch sajn (KHASCHSAJN002) 33 | khasch sajn (KHASCHSAJN003) 34 | 
khasch sajn (KHASCHSAJN004) 35 | khasch sajn (KHASCHSAJN005) 36 | khasch sajn (KHASCHSAJN006) 37 | khasch sajn (KHASCHSAJN007) 38 | khasch sajn (KHASCHSAJN008) 39 | khasch sajn (KHASCHSAJN009) 40 | khasch sajn (KHASCHSAJN010) 41 | khoong (KHOONG0001) 42 | khoong (KHOONG0002) 43 | khoong (KHOONG0003) 44 | khoong (KHOONG0004) 45 | khoong (KHOONG0005) 46 | khoong (KHOONG0006) 47 | khoong (KHOONG0007) 48 | khoong (KHOONG0008) 49 | khoong (KHOONG0009) 50 | khoong (KHOONG0010) 51 | mast xa (MASTXA001) 52 | mast xa (MASTXA002) 53 | mast xa (MASTXA003) 54 | mast xa (MASTXA004) 55 | mast xa (MASTXA005) 56 | mast xa (MASTXA006) 57 | mast xa (MASTXA007) 58 | mast xa (MASTXA008) 59 | mast xa (MASTXA009) 60 | mast xa (MASTXA010) 61 | trajm ay teem (TRAJMAYTEEM001) 62 | trajm ay teem (TRAJMAYTEEM002) 63 | trajm ay teem (TRAJMAYTEEM003) 64 | trajm ay teem (TRAJMAYTEEM004) 65 | trajm ay teem (TRAJMAYTEEM005) 66 | trajm ay teem (TRAJMAYTEEM006) 67 | trajm ay teem (TRAJMAYTEEM007) 68 | trajm ay teem (TRAJMAYTEEM008) 69 | trajm ay teem (TRAJMAYTEEM009) 70 | trajm ay teem (TRAJMAYTEEM010) 71 | trowr laji (TROWRLAJI001) 72 | trowr laji (TROWRLAJI002) 73 | trowr laji (TROWRLAJI003) 74 | trowr laji (TROWRLAJI004) 75 | trowr laji (TROWRLAJI005) 76 | trowr laji (TROWRLAJI006) 77 | trowr laji (TROWRLAJI007) 78 | trowr laji (TROWRLAJI008) 79 | trowr laji (TROWRLAJI009) 80 | trowr laji (TROWRLAJI010) -------------------------------------------------------------------------------- /egs/diadiem/model/etc/vocab: -------------------------------------------------------------------------------- 1 | ## Vocab generated by v2 of the CMU-Cambridge Statistcal 2 | ## Language Modeling toolkit. 
3 | ## 4 | ## Includes 18 words ## 5 | 6 | 7 | ay 8 | caf 9 | ddusng 10 | ka 11 | ke 12 | khasch 13 | khoong 14 | laji 15 | mast 16 | phee 17 | rao 18 | sajn 19 | teem 20 | trajm 21 | trowr 22 | xa 23 | -------------------------------------------------------------------------------- /egs/diadiem/model/model_parameters/tmp.cd_cont_200/feat.params: -------------------------------------------------------------------------------- 1 | -lowerf 200 2 | -upperf 3500 3 | -nfilt 31 4 | -transform dct 5 | -lifter 22 6 | -feat 1s_c_d_dd 7 | -agc none 8 | -cmn batch 9 | -varnorm no 10 | -------------------------------------------------------------------------------- /egs/diadiem/model/model_parameters/tmp.cd_cont_200/means: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/model/model_parameters/tmp.cd_cont_200/means -------------------------------------------------------------------------------- /egs/diadiem/model/model_parameters/tmp.cd_cont_200/mixture_weights: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/model/model_parameters/tmp.cd_cont_200/mixture_weights -------------------------------------------------------------------------------- /egs/diadiem/model/model_parameters/tmp.cd_cont_200/noisedict: -------------------------------------------------------------------------------- 1 | SIL 2 | SIL 3 | SIL -------------------------------------------------------------------------------- /egs/diadiem/model/model_parameters/tmp.cd_cont_200/transition_matrices: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/model/model_parameters/tmp.cd_cont_200/transition_matrices -------------------------------------------------------------------------------- /egs/diadiem/model/model_parameters/tmp.cd_cont_200/variances: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/model/model_parameters/tmp.cd_cont_200/variances -------------------------------------------------------------------------------- /egs/diadiem/model/model_parameters/tmp.cd_cont_200_1/means: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/model/model_parameters/tmp.cd_cont_200_1/means -------------------------------------------------------------------------------- /egs/diadiem/model/model_parameters/tmp.cd_cont_200_1/mixture_weights: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/model/model_parameters/tmp.cd_cont_200_1/mixture_weights -------------------------------------------------------------------------------- /egs/diadiem/model/model_parameters/tmp.cd_cont_200_1/transition_matrices: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/model/model_parameters/tmp.cd_cont_200_1/transition_matrices -------------------------------------------------------------------------------- /egs/diadiem/model/model_parameters/tmp.cd_cont_200_1/variances: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/model/model_parameters/tmp.cd_cont_200_1/variances -------------------------------------------------------------------------------- /egs/diadiem/model/model_parameters/tmp.cd_cont_200_2/means: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/model/model_parameters/tmp.cd_cont_200_2/means -------------------------------------------------------------------------------- /egs/diadiem/model/model_parameters/tmp.cd_cont_200_2/mixture_weights: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/model/model_parameters/tmp.cd_cont_200_2/mixture_weights -------------------------------------------------------------------------------- /egs/diadiem/model/model_parameters/tmp.cd_cont_200_2/transition_matrices: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/model/model_parameters/tmp.cd_cont_200_2/transition_matrices -------------------------------------------------------------------------------- /egs/diadiem/model/model_parameters/tmp.cd_cont_200_2/variances: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/model/model_parameters/tmp.cd_cont_200_2/variances 
-------------------------------------------------------------------------------- /egs/diadiem/model/model_parameters/tmp.cd_cont_200_4/means: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/model/model_parameters/tmp.cd_cont_200_4/means -------------------------------------------------------------------------------- /egs/diadiem/model/model_parameters/tmp.cd_cont_200_4/mixture_weights: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/model/model_parameters/tmp.cd_cont_200_4/mixture_weights -------------------------------------------------------------------------------- /egs/diadiem/model/model_parameters/tmp.cd_cont_200_4/transition_matrices: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/model/model_parameters/tmp.cd_cont_200_4/transition_matrices -------------------------------------------------------------------------------- /egs/diadiem/model/model_parameters/tmp.cd_cont_200_4/variances: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/model/model_parameters/tmp.cd_cont_200_4/variances -------------------------------------------------------------------------------- /egs/diadiem/model/model_parameters/tmp.cd_cont_initial/means: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/model/model_parameters/tmp.cd_cont_initial/means -------------------------------------------------------------------------------- /egs/diadiem/model/model_parameters/tmp.cd_cont_initial/mixture_weights: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/model/model_parameters/tmp.cd_cont_initial/mixture_weights -------------------------------------------------------------------------------- /egs/diadiem/model/model_parameters/tmp.cd_cont_initial/transition_matrices: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/model/model_parameters/tmp.cd_cont_initial/transition_matrices -------------------------------------------------------------------------------- /egs/diadiem/model/model_parameters/tmp.cd_cont_initial/variances: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/model/model_parameters/tmp.cd_cont_initial/variances -------------------------------------------------------------------------------- /egs/diadiem/model/model_parameters/tmp.cd_cont_untied/feat.params: -------------------------------------------------------------------------------- 1 | -lowerf 200 2 | -upperf 3500 3 | -nfilt 31 4 | -transform dct 5 | -lifter 22 6 | -feat 1s_c_d_dd 7 | -agc none 8 | -cmn batch 9 | -varnorm no 10 | -------------------------------------------------------------------------------- /egs/diadiem/model/model_parameters/tmp.cd_cont_untied/mdef: 
-------------------------------------------------------------------------------- 1 | # Generated by /usr/local/libexec/sphinxtrain/mk_mdef_gen on Fri Jan 5 10:29:45 2018 2 | 0.3 3 | 23 n_base 4 | 59 n_tri 5 | 328 n_state_map 6 | 246 n_tied_state 7 | 69 n_tied_ci_state 8 | 23 n_tied_tmat 9 | # 10 | # Columns definitions 11 | #base lft rt p attrib tmat ... state id's ... 12 | SIL - - - filler 0 0 1 2 N 13 | a - - - n/a 1 3 4 5 N 14 | c - - - n/a 2 6 7 8 N 15 | d - - - n/a 3 9 10 11 N 16 | e - - - n/a 4 12 13 14 N 17 | f - - - n/a 5 15 16 17 N 18 | g - - - n/a 6 18 19 20 N 19 | h - - - n/a 7 21 22 23 N 20 | i - - - n/a 8 24 25 26 N 21 | j - - - n/a 9 27 28 29 N 22 | k - - - n/a 10 30 31 32 N 23 | l - - - n/a 11 33 34 35 N 24 | m - - - n/a 12 36 37 38 N 25 | n - - - n/a 13 39 40 41 N 26 | o - - - n/a 14 42 43 44 N 27 | p - - - n/a 15 45 46 47 N 28 | r - - - n/a 16 48 49 50 N 29 | s - - - n/a 17 51 52 53 N 30 | t - - - n/a 18 54 55 56 N 31 | u - - - n/a 19 57 58 59 N 32 | w - - - n/a 20 60 61 62 N 33 | x - - - n/a 21 63 64 65 N 34 | y - - - n/a 22 66 67 68 N 35 | a c f i n/a 1 69 70 71 N 36 | a h s i n/a 1 72 73 74 N 37 | a k r e n/a 1 75 76 77 N 38 | a l j i n/a 1 78 79 80 N 39 | a m s i n/a 1 81 82 83 N 40 | a m y b n/a 1 84 85 86 N 41 | a r j i n/a 1 87 88 89 N 42 | a r o i n/a 1 90 91 92 N 43 | a s j i n/a 1 93 94 95 N 44 | a x SIL e n/a 1 96 97 98 N 45 | c SIL a b n/a 2 99 100 101 N 46 | c s h i n/a 2 102 103 104 N 47 | d SIL d b n/a 3 105 106 107 N 48 | d d u i n/a 3 108 109 110 N 49 | e e SIL e n/a 4 111 112 113 N 50 | e e m i n/a 4 114 115 116 N 51 | e h e i n/a 4 117 118 119 N 52 | e k SIL e n/a 4 120 121 122 N 53 | e t e i n/a 4 123 124 125 N 54 | f a p e n/a 5 126 127 128 N 55 | g n SIL e n/a 6 129 130 131 N 56 | h c s e n/a 7 132 133 134 N 57 | h k a i n/a 7 135 136 137 N 58 | h k o i n/a 7 138 139 140 N 59 | h p e i n/a 7 141 142 143 N 60 | i j SIL e n/a 8 144 145 146 N 61 | j a i i n/a 9 147 148 149 N 62 | j a m i n/a 9 150 151 152 N 63 | j a n i n/a 9 153 
154 155 N 64 | k SIL a b n/a 10 156 157 158 N 65 | k SIL h b n/a 10 159 160 161 N 66 | k o e b n/a 10 162 163 164 N 67 | l r a b n/a 11 165 166 167 N 68 | m SIL a b n/a 12 168 169 170 N 69 | m e SIL e n/a 12 171 172 173 N 70 | m j a e n/a 12 174 175 176 N 71 | n j SIL e n/a 13 177 178 179 N 72 | n o g i n/a 13 180 181 182 N 73 | n s g i n/a 13 183 184 185 N 74 | o a k e n/a 14 186 187 188 N 75 | o h o i n/a 14 189 190 191 N 76 | o o n i n/a 14 192 193 194 N 77 | o r w i n/a 14 195 196 197 N 78 | p f h b n/a 15 198 199 200 N 79 | r a a b n/a 16 201 202 203 N 80 | r t a i n/a 16 204 205 206 N 81 | r t o i n/a 16 207 208 209 N 82 | r w l e n/a 16 210 211 212 N 83 | s a c i n/a 17 213 214 215 N 84 | s a t i n/a 17 216 217 218 N 85 | s h a b n/a 17 219 220 221 N 86 | s u n i n/a 17 222 223 224 N 87 | t SIL r b n/a 18 225 226 227 N 88 | t s x e n/a 18 228 229 230 N 89 | t y e b n/a 18 231 232 233 N 90 | u d s i n/a 19 234 235 236 N 91 | w o r i n/a 20 237 238 239 N 92 | x t a b n/a 21 240 241 242 N 93 | y a t e n/a 22 243 244 245 N 94 | -------------------------------------------------------------------------------- /egs/diadiem/model/model_parameters/tmp.cd_cont_untied/means: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/model/model_parameters/tmp.cd_cont_untied/means -------------------------------------------------------------------------------- /egs/diadiem/model/model_parameters/tmp.cd_cont_untied/mixture_weights: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/model/model_parameters/tmp.cd_cont_untied/mixture_weights -------------------------------------------------------------------------------- 
/egs/diadiem/model/model_parameters/tmp.cd_cont_untied/noisedict: -------------------------------------------------------------------------------- 1 | SIL 2 | SIL 3 | SIL -------------------------------------------------------------------------------- /egs/diadiem/model/model_parameters/tmp.cd_cont_untied/transition_matrices: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/model/model_parameters/tmp.cd_cont_untied/transition_matrices -------------------------------------------------------------------------------- /egs/diadiem/model/model_parameters/tmp.cd_cont_untied/variances: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/model/model_parameters/tmp.cd_cont_untied/variances -------------------------------------------------------------------------------- /egs/diadiem/model/model_parameters/tmp.ci_cont/feat.params: -------------------------------------------------------------------------------- 1 | -lowerf 200 2 | -upperf 3500 3 | -nfilt 31 4 | -transform dct 5 | -lifter 22 6 | -feat 1s_c_d_dd 7 | -agc none 8 | -cmn batch 9 | -varnorm no 10 | -------------------------------------------------------------------------------- /egs/diadiem/model/model_parameters/tmp.ci_cont/mdef: -------------------------------------------------------------------------------- 1 | # Generated by /usr/local/libexec/sphinxtrain/mk_mdef_gen on Fri Jan 5 10:29:25 2018 2 | 0.3 3 | 23 n_base 4 | 0 n_tri 5 | 92 n_state_map 6 | 69 n_tied_state 7 | 69 n_tied_ci_state 8 | 23 n_tied_tmat 9 | # 10 | # Columns definitions 11 | #base lft rt p attrib tmat ... state id's ... 
12 | SIL - - - filler 0 0 1 2 N 13 | a - - - n/a 1 3 4 5 N 14 | c - - - n/a 2 6 7 8 N 15 | d - - - n/a 3 9 10 11 N 16 | e - - - n/a 4 12 13 14 N 17 | f - - - n/a 5 15 16 17 N 18 | g - - - n/a 6 18 19 20 N 19 | h - - - n/a 7 21 22 23 N 20 | i - - - n/a 8 24 25 26 N 21 | j - - - n/a 9 27 28 29 N 22 | k - - - n/a 10 30 31 32 N 23 | l - - - n/a 11 33 34 35 N 24 | m - - - n/a 12 36 37 38 N 25 | n - - - n/a 13 39 40 41 N 26 | o - - - n/a 14 42 43 44 N 27 | p - - - n/a 15 45 46 47 N 28 | r - - - n/a 16 48 49 50 N 29 | s - - - n/a 17 51 52 53 N 30 | t - - - n/a 18 54 55 56 N 31 | u - - - n/a 19 57 58 59 N 32 | w - - - n/a 20 60 61 62 N 33 | x - - - n/a 21 63 64 65 N 34 | y - - - n/a 22 66 67 68 N 35 | -------------------------------------------------------------------------------- /egs/diadiem/model/model_parameters/tmp.ci_cont/means: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/model/model_parameters/tmp.ci_cont/means -------------------------------------------------------------------------------- /egs/diadiem/model/model_parameters/tmp.ci_cont/mixture_weights: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/model/model_parameters/tmp.ci_cont/mixture_weights -------------------------------------------------------------------------------- /egs/diadiem/model/model_parameters/tmp.ci_cont/noisedict: -------------------------------------------------------------------------------- 1 | SIL 2 | SIL 3 | SIL -------------------------------------------------------------------------------- /egs/diadiem/model/model_parameters/tmp.ci_cont/transition_matrices: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/model/model_parameters/tmp.ci_cont/transition_matrices -------------------------------------------------------------------------------- /egs/diadiem/model/model_parameters/tmp.ci_cont/variances: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/model/model_parameters/tmp.ci_cont/variances -------------------------------------------------------------------------------- /egs/diadiem/model/model_parameters/tmp.ci_cont_flatinitial/globalmean: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/model/model_parameters/tmp.ci_cont_flatinitial/globalmean -------------------------------------------------------------------------------- /egs/diadiem/model/model_parameters/tmp.ci_cont_flatinitial/globalvar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/model/model_parameters/tmp.ci_cont_flatinitial/globalvar -------------------------------------------------------------------------------- /egs/diadiem/model/model_parameters/tmp.ci_cont_flatinitial/means: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/model/model_parameters/tmp.ci_cont_flatinitial/means -------------------------------------------------------------------------------- /egs/diadiem/model/model_parameters/tmp.ci_cont_flatinitial/mixture_weights: 
"""Mapping between Vietnamese orthography and a telex-like phone encoding.

Each accented vowel is encoded as its base vowel (possibly a two-letter
telex digraph such as ``aa`` for "â") followed by one tone letter from
``fsrxj`` (huyền, sắc, hỏi, ngã, nặng).  ``w2p`` maps one accented
character to its phone string; ``p2w`` is the inverse mapping.
"""

# One string per vowel group: the base letter first, then its five toned
# forms in tone order huyền, sắc, hỏi, ngã, nặng.
rules_1 = [
    "aàáảãạ",
    "ăằắẳẵặ",
    "âầấẩẫậ",
    "eèéẻẽẹ",
    "êềếểễệ",
    "iìíỉĩị",
    "oòóỏõọ",
    "ôồốổỗộ",
    "ơờớởỡợ",
    "uùúủũụ",
    "ưừứửữự",
    "yỳýỷỹỵ"
]
# Telex digraphs: the first two characters encode the third (e.g. "aw" -> "ă").
rules_2 = [
    "awă",
    "aaâ",
    "eeê",
    "ooô",
    "owơ",
    "uwư",
    "ddđ"
]
w2p = {}  # accented character -> phone string (e.g. "ủ" -> "ur")
p2w = {}  # phone string -> accented character

# Tone letters, aligned index-for-index with the toned forms in rules_1.
_TONES = "fsrxj"


def _build_tables():
    """Populate ``w2p``/``p2w`` from the rule tables.

    Kept in a helper so the loop variables do not leak into the module
    namespace (the original module-level loops left ``words``, ``rule``,
    ``tones`` etc. behind as module globals).
    """
    for group in rules_1:
        base, toned = group[0], group[1:]
        # If the base vowel is itself a telex target (â, ê, ô, ơ, ư, ă),
        # encode it with its digraph before appending the tone letter.
        for rule in rules_2:
            if base == rule[2]:
                base = rule[:2]
        for tone, ch in zip(_TONES, toned):
            w2p[ch] = base + tone
    # Tone-less telex targets (ă, â, ê, ô, ơ, ư, đ) map to the bare digraph.
    for rule in rules_2:
        w2p[rule[2]] = rule[:2]
    # Phones are unique here, so the inverse mapping is well-defined.
    for ch, phone in w2p.items():
        p2w[phone] = ch


_build_tables()


def word2phone(word):
    """Encode *word*: replace each accented character with its phone.

    Characters without a mapping (ASCII letters, spaces, punctuation)
    pass through unchanged.
    """
    return "".join(w2p.get(ch, ch) for ch in word)


def phone2word(phone):
    """Decode *phone* back to orthography.

    Greedily matches the longest known phone (3 characters, then 2) at
    each position; unmatched characters pass through unchanged.
    """
    chars = []
    i = 0
    length = len(phone)
    while i < length:
        for size in (3, 2):
            chunk = phone[i:i + size]
            if chunk in p2w:
                chars.append(p2w[chunk])
                i += size
                break
        else:
            chars.append(phone[i])
            i += 1
    return "".join(chars)


if __name__ == '__main__':
    tests = [
        ("con hoẵng", "con hoawxng"),
        ("lựu đạn", "luwju ddajn"),
        ("kiểm tra", "kieerm tra"),
        ("ủy ban", "ury ban"),
        ("cà phê", "caf phee"),
        ("khách sạn", "khasch sajn"),
        ("đúng", "ddusng"),
        ("xã hội", "xax hooji")
    ]
    for test in tests:
        assert (test[0] == phone2word(test[1]))
        assert (test[1] == word2phone(test[0]))
-------------------------------------------------------------------------------- /egs/diadiem/test/CAFPHEE004.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/CAFPHEE004.wav -------------------------------------------------------------------------------- /egs/diadiem/test/CAFPHEE005.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/CAFPHEE005.wav -------------------------------------------------------------------------------- /egs/diadiem/test/CAFPHEE006.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/CAFPHEE006.wav -------------------------------------------------------------------------------- /egs/diadiem/test/CAFPHEE007.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/CAFPHEE007.wav -------------------------------------------------------------------------------- /egs/diadiem/test/CAFPHEE008.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/CAFPHEE008.wav -------------------------------------------------------------------------------- /egs/diadiem/test/CAFPHEE009.wav: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/CAFPHEE009.wav -------------------------------------------------------------------------------- /egs/diadiem/test/CAFPHEE010.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/CAFPHEE010.wav -------------------------------------------------------------------------------- /egs/diadiem/test/DDUSNG0001.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/DDUSNG0001.wav -------------------------------------------------------------------------------- /egs/diadiem/test/DDUSNG0002.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/DDUSNG0002.wav -------------------------------------------------------------------------------- /egs/diadiem/test/DDUSNG0003.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/DDUSNG0003.wav -------------------------------------------------------------------------------- /egs/diadiem/test/DDUSNG0004.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/DDUSNG0004.wav -------------------------------------------------------------------------------- 
/egs/diadiem/test/DDUSNG0005.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/DDUSNG0005.wav -------------------------------------------------------------------------------- /egs/diadiem/test/DDUSNG0006.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/DDUSNG0006.wav -------------------------------------------------------------------------------- /egs/diadiem/test/DDUSNG0007.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/DDUSNG0007.wav -------------------------------------------------------------------------------- /egs/diadiem/test/DDUSNG0008.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/DDUSNG0008.wav -------------------------------------------------------------------------------- /egs/diadiem/test/DDUSNG0009.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/DDUSNG0009.wav -------------------------------------------------------------------------------- /egs/diadiem/test/DDUSNG0010.wav: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/DDUSNG0010.wav -------------------------------------------------------------------------------- /egs/diadiem/test/KARAOKE001.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/KARAOKE001.wav -------------------------------------------------------------------------------- /egs/diadiem/test/KARAOKE002.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/KARAOKE002.wav -------------------------------------------------------------------------------- /egs/diadiem/test/KARAOKE003.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/KARAOKE003.wav -------------------------------------------------------------------------------- /egs/diadiem/test/KARAOKE004.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/KARAOKE004.wav -------------------------------------------------------------------------------- /egs/diadiem/test/KARAOKE005.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/KARAOKE005.wav -------------------------------------------------------------------------------- 
/egs/diadiem/test/KARAOKE006.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/KARAOKE006.wav -------------------------------------------------------------------------------- /egs/diadiem/test/KARAOKE007.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/KARAOKE007.wav -------------------------------------------------------------------------------- /egs/diadiem/test/KARAOKE008.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/KARAOKE008.wav -------------------------------------------------------------------------------- /egs/diadiem/test/KARAOKE009.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/KARAOKE009.wav -------------------------------------------------------------------------------- /egs/diadiem/test/KARAOKE010.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/KARAOKE010.wav -------------------------------------------------------------------------------- /egs/diadiem/test/KHASCHSAJN001.wav: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/KHASCHSAJN001.wav -------------------------------------------------------------------------------- /egs/diadiem/test/KHASCHSAJN002.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/KHASCHSAJN002.wav -------------------------------------------------------------------------------- /egs/diadiem/test/KHASCHSAJN003.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/KHASCHSAJN003.wav -------------------------------------------------------------------------------- /egs/diadiem/test/KHASCHSAJN004.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/KHASCHSAJN004.wav -------------------------------------------------------------------------------- /egs/diadiem/test/KHASCHSAJN005.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/KHASCHSAJN005.wav -------------------------------------------------------------------------------- /egs/diadiem/test/KHASCHSAJN006.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/KHASCHSAJN006.wav -------------------------------------------------------------------------------- 
/egs/diadiem/test/KHASCHSAJN007.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/KHASCHSAJN007.wav -------------------------------------------------------------------------------- /egs/diadiem/test/KHASCHSAJN008.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/KHASCHSAJN008.wav -------------------------------------------------------------------------------- /egs/diadiem/test/KHASCHSAJN009.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/KHASCHSAJN009.wav -------------------------------------------------------------------------------- /egs/diadiem/test/KHASCHSAJN010.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/KHASCHSAJN010.wav -------------------------------------------------------------------------------- /egs/diadiem/test/KHOONG0001.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/KHOONG0001.wav -------------------------------------------------------------------------------- /egs/diadiem/test/KHOONG0002.wav: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/KHOONG0002.wav -------------------------------------------------------------------------------- /egs/diadiem/test/KHOONG0003.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/KHOONG0003.wav -------------------------------------------------------------------------------- /egs/diadiem/test/KHOONG0004.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/KHOONG0004.wav -------------------------------------------------------------------------------- /egs/diadiem/test/KHOONG0005.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/KHOONG0005.wav -------------------------------------------------------------------------------- /egs/diadiem/test/KHOONG0006.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/KHOONG0006.wav -------------------------------------------------------------------------------- /egs/diadiem/test/KHOONG0007.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/KHOONG0007.wav -------------------------------------------------------------------------------- 
/egs/diadiem/test/KHOONG0008.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/KHOONG0008.wav -------------------------------------------------------------------------------- /egs/diadiem/test/KHOONG0009.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/KHOONG0009.wav -------------------------------------------------------------------------------- /egs/diadiem/test/KHOONG0010.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/KHOONG0010.wav -------------------------------------------------------------------------------- /egs/diadiem/test/MASTXA001.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/MASTXA001.wav -------------------------------------------------------------------------------- /egs/diadiem/test/MASTXA002.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/MASTXA002.wav -------------------------------------------------------------------------------- /egs/diadiem/test/MASTXA003.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/MASTXA003.wav 
-------------------------------------------------------------------------------- /egs/diadiem/test/MASTXA004.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/MASTXA004.wav -------------------------------------------------------------------------------- /egs/diadiem/test/MASTXA005.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/MASTXA005.wav -------------------------------------------------------------------------------- /egs/diadiem/test/MASTXA006.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/MASTXA006.wav -------------------------------------------------------------------------------- /egs/diadiem/test/MASTXA007.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/MASTXA007.wav -------------------------------------------------------------------------------- /egs/diadiem/test/MASTXA008.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/MASTXA008.wav -------------------------------------------------------------------------------- /egs/diadiem/test/MASTXA009.wav: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/MASTXA009.wav -------------------------------------------------------------------------------- /egs/diadiem/test/MASTXA010.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/MASTXA010.wav -------------------------------------------------------------------------------- /egs/diadiem/test/TRAJMAYTEEM001.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/TRAJMAYTEEM001.wav -------------------------------------------------------------------------------- /egs/diadiem/test/TRAJMAYTEEM002.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/TRAJMAYTEEM002.wav -------------------------------------------------------------------------------- /egs/diadiem/test/TRAJMAYTEEM003.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/TRAJMAYTEEM003.wav -------------------------------------------------------------------------------- /egs/diadiem/test/TRAJMAYTEEM004.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/TRAJMAYTEEM004.wav -------------------------------------------------------------------------------- 
/egs/diadiem/test/TRAJMAYTEEM005.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/TRAJMAYTEEM005.wav -------------------------------------------------------------------------------- /egs/diadiem/test/TRAJMAYTEEM006.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/TRAJMAYTEEM006.wav -------------------------------------------------------------------------------- /egs/diadiem/test/TRAJMAYTEEM007.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/TRAJMAYTEEM007.wav -------------------------------------------------------------------------------- /egs/diadiem/test/TRAJMAYTEEM008.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/TRAJMAYTEEM008.wav -------------------------------------------------------------------------------- /egs/diadiem/test/TRAJMAYTEEM009.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/TRAJMAYTEEM009.wav -------------------------------------------------------------------------------- /egs/diadiem/test/TRAJMAYTEEM010.wav: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/TRAJMAYTEEM010.wav -------------------------------------------------------------------------------- /egs/diadiem/test/TROWRLAJI001.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/TROWRLAJI001.wav -------------------------------------------------------------------------------- /egs/diadiem/test/TROWRLAJI002.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/TROWRLAJI002.wav -------------------------------------------------------------------------------- /egs/diadiem/test/TROWRLAJI003.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/TROWRLAJI003.wav -------------------------------------------------------------------------------- /egs/diadiem/test/TROWRLAJI004.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/TROWRLAJI004.wav -------------------------------------------------------------------------------- /egs/diadiem/test/TROWRLAJI005.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/TROWRLAJI005.wav -------------------------------------------------------------------------------- 
/egs/diadiem/test/TROWRLAJI006.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/TROWRLAJI006.wav -------------------------------------------------------------------------------- /egs/diadiem/test/TROWRLAJI007.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/TROWRLAJI007.wav -------------------------------------------------------------------------------- /egs/diadiem/test/TROWRLAJI008.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/TROWRLAJI008.wav -------------------------------------------------------------------------------- /egs/diadiem/test/TROWRLAJI009.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/TROWRLAJI009.wav -------------------------------------------------------------------------------- /egs/diadiem/test/TROWRLAJI010.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/TROWRLAJI010.wav -------------------------------------------------------------------------------- /egs/diadiem/test_model.py: -------------------------------------------------------------------------------- 1 | from model import transcript 2 | from os.path import join, dirname 3 | from unittest import TestCase 4 | 5 | 6 | class 
TestSentiment(TestCase): 7 | def test_1(self): 8 | wav = join(dirname(__file__), "test", "CAFPHEE001.wav") 9 | actual = transcript(wav) 10 | expected = "cà phê" 11 | self.assertEqual(actual, expected) 12 | 13 | def test_2(self): 14 | wav = join(dirname(__file__), "test", "KHASCHSAJN003.wav") 15 | actual = transcript(wav) 16 | expected = "khách sạn" 17 | self.assertEqual(actual, expected) 18 | -------------------------------------------------------------------------------- /egs/diadiem/text.py: -------------------------------------------------------------------------------- 1 | rules_1 = [ 2 | "aàáảãạ", 3 | "ăằắẳẵặ", 4 | "âầấẩẫậ", 5 | "eèéẻẽẹ", 6 | "êềếểễệ", 7 | "iìíỉĩị", 8 | "oòóỏõọ", 9 | "ôồốổỗộ", 10 | "ơờớởỡợ", 11 | "uùúủũụ", 12 | "ưừứửữự", 13 | "yỳýỷỹỵ" 14 | ] 15 | rules_2 = [ 16 | "awă", 17 | "aaâ", 18 | "eeê", 19 | "ooô", 20 | "owơ", 21 | "uwư", 22 | "ddđ" 23 | ] 24 | w2p = {} 25 | p2w = {} 26 | for words in rules_1: 27 | original = words[0] 28 | words = words[1:] 29 | for rule in rules_2: 30 | if original == rule[2]: 31 | original = rule[0:2] 32 | tones = "fsrxj" 33 | for i, w in enumerate(words): 34 | w2p[w] = original + tones[i] 35 | for rule in rules_2: 36 | w2p[rule[2]] = rule[0:2] 37 | for key, value in w2p.items(): 38 | p2w[value] = key 39 | 40 | 41 | def word2phone(word): 42 | phone = "" 43 | for w in word: 44 | if w in w2p: 45 | phone += w2p[w] 46 | else: 47 | phone += w 48 | return phone 49 | 50 | 51 | def phone2word(phone): 52 | i = 0 53 | word = "" 54 | while i < len(phone): 55 | if phone[i:i+3] in p2w: 56 | p = phone[i:i+3] 57 | word += p2w[p] 58 | i += 3 59 | elif phone[i:i+2] in p2w: 60 | p = phone[i:i+2] 61 | word += p2w[p] 62 | i += 2 63 | else: 64 | p = phone[i:i+1] 65 | word += p 66 | i += 1 67 | return word 68 | 69 | if __name__ == '__main__': 70 | tests = [ 71 | ("con hoẵng", "con hoawxng"), 72 | ("lựu đạn", "luwju ddajn"), 73 | ("kiểm tra", "kieerm tra"), 74 | ("ủy ban", "ury ban"), 75 | ("cà phê", "caf phee"), 76 | ("khách sạn", "khasch 
sajn"), 77 | ("đúng", "ddusng"), 78 | ("xã hội", "xax hooji") 79 | ] 80 | for test in tests: 81 | assert (test[0] == phone2word(test[1])) 82 | assert (test[1] == word2phone(test[0])) 83 | -------------------------------------------------------------------------------- /egs/diadiem/train.py: -------------------------------------------------------------------------------- 1 | from extension.model import SphinxSpeechRecognition 2 | from extension.export import SphinxSpeechRecognitionExporter 3 | from load_data import corpus_folder 4 | from os.path import join, dirname 5 | 6 | tmp_folder = join(dirname(__file__), "tmp") 7 | export_folder = join(dirname(__file__), "model") 8 | 9 | model = SphinxSpeechRecognition(corpus_folder, tmp_folder) 10 | model.fit() 11 | SphinxSpeechRecognitionExporter.export(model, export_folder) 12 | # wav_file = join(tmp_folder, "etc", "wav", "train", "test", "CAFPHEE003.wav") 13 | # model.predict(wav_file) 14 | -------------------------------------------------------------------------------- /egs/vivos/README.md: -------------------------------------------------------------------------------- 1 | /home/anhv/anaconda3/envs/automatic_speech_recognition/bin/python /home/anhv/PycharmProjects/undertheseanlp/automatic_speech_recognition/egs/vivos/train.py --kaldi_folder /home/anhv/PycharmProjects/kaldi-trunk --corpus_folder /home/anhv/PycharmProjects/undertheseanlp/automatic_speech_recognition/data/vivos/corpus --nj 10 --method lda_mllt 2 | 3 | ===== Time Report ===== 4 | Mono 5 | 9:25 6 | 0:0 7 | 0:25 8 | Tri1 9 | 2:38 10 | 0:0 11 | 0:24 12 | Tri2a 13 | 2:38 14 | 0:0 15 | 0:24 16 | Tri3a 17 | 2:52 18 | 24:16 19 | 0:51 20 | Total time: 21 | 44:21 22 | 23 | 24 | ===== Score Report ===== 25 | Best WER 26 | %WER 79.80 [ 25926 / 32487, 245 ins, 5587 del, 20094 sub ] exp/tri3a/decode/wer_12 -------------------------------------------------------------------------------- /egs/vivos/__init__.py: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/vivos/__init__.py -------------------------------------------------------------------------------- /egs/vivos/analyze.py: -------------------------------------------------------------------------------- 1 | from model import transcript 2 | from os.path import join, dirname 3 | from extension.analyze import WERAnalyzeLogger 4 | 5 | corpus_folder = join(dirname(dirname(dirname(__file__))), "data", "vivos", 6 | "corpus") 7 | 8 | 9 | def load_test(): 10 | lines = open(join(corpus_folder, "test", "text")).read().splitlines() 11 | lines = [line.split("|") for line in lines] 12 | wavs = [line[0] for line in lines] 13 | wavs = ["{}/test/wav/{}.wav".format(corpus_folder, wav) for wav in wavs] 14 | texts = [line[1] for line in lines] 15 | return wavs, texts 16 | 17 | 18 | wavs_test, texts_test = load_test() 19 | # texts_pred = [""] * len(texts_test) 20 | texts_pred = [transcript(wav_file) for wav_file in wavs_test] 21 | 22 | log_folder = join(dirname(__file__), "analyze") 23 | 24 | WERAnalyzeLogger.log(wavs_test, texts_test, texts_pred, log_folder=log_folder) -------------------------------------------------------------------------------- /egs/vivos/extension/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/vivos/extension/__init__.py -------------------------------------------------------------------------------- /egs/vivos/extension/analyze.py: -------------------------------------------------------------------------------- 1 | import json 2 | import shutil 3 | from extension.metrics import calculate_wer 4 | from os.path import join, basename 5 | import os 6 | from underthesea.util.file_io import write 7 | 
import numpy as np 8 | 9 | 10 | class WERAnalyzeLogger: 11 | @staticmethod 12 | def log(wavs_test, texts_test, texts_pred, log_folder): 13 | wer = np.mean([calculate_wer(test.split(), pred.split()) 14 | for test, pred in zip(texts_test, texts_pred)]) 15 | wer = np.round(wer, 4) 16 | result = { 17 | "WER": wer 18 | } 19 | content = json.dumps(result, ensure_ascii=False) 20 | log_file = join(log_folder, "result.json") 21 | write(log_file, content) 22 | wav_folder = join(log_folder, "wav") 23 | try: 24 | shutil.rmtree(wav_folder) 25 | except: 26 | pass 27 | finally: 28 | os.mkdir(wav_folder) 29 | for wav in wavs_test: 30 | new_path = join(wav_folder, basename(wav)) 31 | shutil.copyfile(wav, new_path) 32 | wavs_test_new_path = [join("wav", basename(wav)) for wav in wavs_test] 33 | speech_recognition = { 34 | "texts_test": texts_test, 35 | "texts_pred": texts_pred, 36 | "wavs_test": wavs_test_new_path, 37 | } 38 | content = json.dumps(speech_recognition, ensure_ascii=False) 39 | log_file = join(log_folder, "speechrecognition.json") 40 | write(log_file, content) 41 | 42 | print("Result is written in {}".format(log_file)) 43 | print("WER: {}%".format(wer * 100)) 44 | -------------------------------------------------------------------------------- /egs/vivos/extension/cmd.sh: -------------------------------------------------------------------------------- 1 | # Setting local system jobs (local CPU - no external clusters) 2 | export train_cmd=run.pl 3 | export decode_cmd=run.pl -------------------------------------------------------------------------------- /egs/vivos/extension/export.py: -------------------------------------------------------------------------------- 1 | import shutil 2 | from os.path import join 3 | 4 | 5 | class SphinxSpeechRecognitionExporter: 6 | @staticmethod 7 | def export(model, export_folder): 8 | tmp_folder = model.tmp_folder 9 | try: 10 | shutil.rmtree(join(export_folder, "etc")) 11 | except: 12 | pass 13 | finally: 14 | 
shutil.copytree(join(tmp_folder, "etc"), 15 | join(export_folder, "etc")) 16 | 17 | try: 18 | shutil.rmtree(join(export_folder, "model_parameters")) 19 | except: 20 | pass 21 | finally: 22 | shutil.copytree(join(tmp_folder, "model_parameters"), 23 | join(export_folder, "model_parameters")) 24 | -------------------------------------------------------------------------------- /egs/vivos/extension/metrics.py: -------------------------------------------------------------------------------- 1 | def calculate_wer(reference, hypothesis): 2 | """ 3 | Calculation of WER with Levenshtein distance. 4 | Works only for iterables up to 254 elements (uint8). 5 | O(nm) time and space complexity. 6 | 7 | >>> calculate_wer("who is there".split(), "is there".split()) 8 | 1 9 | >>> calculate_wer("who is there".split(), "".split()) 10 | 3 11 | >>> calculate_wer("".split(), "who is there".split()) 12 | 3 13 | """ 14 | # initialisation 15 | import numpy 16 | d = numpy.zeros((len(reference) + 1) * (len(hypothesis) + 1), 17 | dtype=numpy.uint8) 18 | d = d.reshape((len(reference) + 1, len(hypothesis) + 1)) 19 | for i in range(len(reference) + 1): 20 | for j in range(len(hypothesis) + 1): 21 | if i == 0: 22 | d[0][j] = j 23 | elif j == 0: 24 | d[i][0] = i 25 | 26 | # computation 27 | for i in range(1, len(reference) + 1): 28 | for j in range(1, len(hypothesis) + 1): 29 | if reference[i - 1] == hypothesis[j - 1]: 30 | d[i][j] = d[i - 1][j - 1] 31 | else: 32 | substitution = d[i - 1][j - 1] + 1 33 | insertion = d[i][j - 1] + 1 34 | deletion = d[i - 1][j] + 1 35 | d[i][j] = min(substitution, insertion, deletion) 36 | 37 | return d[len(reference)][len(hypothesis)] / float(len(reference)) 38 | 39 | 40 | import unittest 41 | assertions = unittest.TestCase('__init__') 42 | 43 | if __name__ == '__main__': 44 | s = calculate_wer("khach san".split(), "khach san cua toi".split()) 45 | assertions.assertAlmostEqual(s, 1) 46 | s = calculate_wer("khach san cua".split(), "khach san cua toi".split()) 47 | 
assertions.assertAlmostEqual(s, 0.333, 3) 48 | -------------------------------------------------------------------------------- /egs/vivos/extension/model_sphinx.py: -------------------------------------------------------------------------------- 1 | import shutil 2 | import os 3 | import text 4 | 5 | N = 10000 6 | 7 | 8 | class SphinxSpeechRecognition: 9 | def __init__(self, corpus_folder, tmp_folder): 10 | print("Initial Sphinx Speech Recognition") 11 | self.corpus_folder = corpus_folder 12 | self.tmp_folder = tmp_folder 13 | try: 14 | shutil.rmtree(tmp_folder) 15 | except Exception as e: 16 | pass 17 | finally: 18 | os.mkdir(tmp_folder) 19 | os.system("cd {}; sphinxtrain -t tmp setup".format(tmp_folder)) 20 | self._init_data() 21 | self._change_config() 22 | self._make_transcription() 23 | self._make_dictionary() 24 | self._make_filler() 25 | self._make_language_model() 26 | 27 | # ========================== # 28 | # Init Data 29 | # ========================== # 30 | def _init_data(self): 31 | os.system("cd {}; mkdir wav".format(self.tmp_folder)) 32 | os.system("cd {}; mkdir wav/train".format(self.tmp_folder)) 33 | os.system("cd {}; mkdir wav/test".format(self.tmp_folder)) 34 | 35 | ids = open( 36 | "{}/train/text".format(self.corpus_folder)).read().splitlines()[:N] 37 | ids = [item.split("|")[0] for item in ids] 38 | for id in ids: 39 | shutil.copy2( 40 | "{}/train/wav/{}.wav".format(self.corpus_folder, id), 41 | "{}/wav/train/{}.wav".format(self.tmp_folder, id) 42 | ) 43 | 44 | ids = ["train/{}".format(id) for id in ids] 45 | ids.append("") 46 | content = "\n".join(ids) 47 | open(os.path.join(self.tmp_folder, "etc", "tmp_train.fileids"), 48 | "w").write(content) 49 | 50 | ids = open( 51 | "{}/test/text".format(self.corpus_folder)).read().splitlines() 52 | ids = [item.split("|")[0] for item in ids] 53 | for id in ids: 54 | shutil.copy2( 55 | "{}/test/wav/{}.wav".format(self.corpus_folder, id), 56 | "{}/wav/test/{}.wav".format(self.tmp_folder, id) 57 | ) 58 | 
ids = ["test/{}".format(id) for id in ids] 59 | ids.append("") 60 | content = "\n".join(ids) 61 | open(os.path.join(self.tmp_folder, "etc", "tmp_test.fileids"), 62 | "w").write(content) 63 | 64 | # ========================== # 65 | # Config 66 | # ========================== # 67 | def _change_config(self): 68 | config_file = os.path.join(self.tmp_folder, "etc", "sphinx_train.cfg") 69 | config = SphinxConfig(config_file) 70 | config.set("$CFG_BASE_DIR", "\".\"") 71 | config.set("$CFG_WAVFILE_SRATE", 8000.0) 72 | config.set("$CFG_NUM_FILT", 31) 73 | config.set("$CFG_LO_FILT", 200) 74 | config.set("$CFG_HI_FILT", 3500) 75 | config.set("$CFG_WAVFILE_TYPE", "'raw'") 76 | config.set("$CFG_LANGUAGEMODEL", 77 | "\"$CFG_LIST_DIR/$CFG_DB_NAME.lm\"") 78 | config.set("$DEC_CFG_LANGUAGEMODEL", 79 | "\"$CFG_BASE_DIR/etc/${CFG_DB_NAME}.lm\"") 80 | 81 | # ========================== # 82 | # Transcription 83 | # ========================== # 84 | def _convert_transcription(self, in_file, out_file): 85 | lines = open(in_file).read().splitlines()[:N] 86 | output = [] 87 | for line in lines: 88 | fileid, word = line.split("|") 89 | phone = text.word2phone(word) 90 | content = " {} ({})".format(phone, fileid) 91 | output.append(content) 92 | output.append("") 93 | content = "\n".join(output) 94 | open(out_file, "w").write(content) 95 | 96 | def _make_transcription(self): 97 | self._convert_transcription( 98 | "{}/train/text".format(self.corpus_folder), 99 | "{}/etc/tmp_train.transcription".format(self.tmp_folder)) 100 | self._convert_transcription( 101 | "{}/test/text".format(self.corpus_folder), 102 | "{}/etc/tmp_test.transcription".format(self.tmp_folder)) 103 | 104 | # ============================== # 105 | # Create dictionary and phones 106 | # ============================== # 107 | def _make_dictionary(self): 108 | lines = open( 109 | "{}/train/text".format(self.corpus_folder)).read().splitlines()[:N] 110 | phones = [] 111 | for line in lines: 112 | fileid, word = line.split("|") 
113 | p = text.word2phone(word).split() 114 | phones += p 115 | phones = sorted(set(phones)) 116 | # create .dic files 117 | lines = [] 118 | phone_units = [] 119 | for p in phones: 120 | units = list(p) 121 | phone_units += units 122 | units = " ".join(units) 123 | line = "{:20s}{}".format(p, units) 124 | lines.append(line) 125 | open("{}/etc/tmp.dic".format(self.tmp_folder), "w").write( 126 | "\n".join(lines)) 127 | phone_units = sorted(set(phone_units)) 128 | phone_units.append("SIL") 129 | open("{}/etc/tmp.phone".format(self.tmp_folder), "w").write( 130 | "\n".join(phone_units)) 131 | 132 | def _make_filler(self): 133 | fillers = ["", "", ""] 134 | lines = ["{:20s}SIL".format(f) for f in fillers] 135 | open("{}/etc/tmp.filler".format(self.tmp_folder), "w").write( 136 | "\n".join(lines)) 137 | 138 | # ========================== # 139 | # Language Model 140 | # ========================== # 141 | def _make_cleaned_text(self): 142 | in_file = "{}/train/text".format(self.corpus_folder) 143 | out_file = "{}/etc/text".format(self.tmp_folder) 144 | lines = open(in_file).read().splitlines()[:N] 145 | output = [] 146 | for line in lines: 147 | fileid, word = line.split("|") 148 | phone = text.word2phone(word) 149 | content = " {} ".format(phone, fileid) 150 | output.append(content) 151 | content = "\n".join(output) 152 | open(out_file, "w").write(content) 153 | 154 | def _make_language_model(self): 155 | self._make_cleaned_text() 156 | etc_folder = os.path.join(self.tmp_folder, "etc") 157 | chdir = "cd {}; ".format(etc_folder) 158 | os.system(chdir + "text2wfreq < text | wfreq2vocab > vocab") 159 | os.system(chdir + "text2idngram -vocab vocab -idngram idngram < text") 160 | os.system( 161 | chdir + "idngram2lm -vocab_type 0 -idngram idngram -vocab vocab -arpa tmp.lm") 162 | 163 | def fit(self): 164 | chdir = "cd {}; ".format(self.tmp_folder) 165 | os.system(chdir + "sphinxtrain run") 166 | 167 | def predict(self, wav_file): 168 | command = "pocketsphinx_continuous -hmm 
{}/model_parameters/tmp.cd_cont_200 -samprate 8000 -lm {}/etc/tmp.lm -dict {}/etc/tmp.dic -infile {} -logfn yes".format( 169 | self.tmp_folder, self.tmp_folder, self.tmp_folder, wav_file) 170 | output = os.popen(command).read().strip() 171 | output = text.phone2word(output) 172 | return output 173 | 174 | 175 | class SphinxConfig: 176 | def __init__(self, config_file): 177 | self.file = config_file 178 | self.lines = open(config_file).read().splitlines() 179 | 180 | def save(self): 181 | content = "\n".join(self.lines) 182 | open(self.file, "w").write(content) 183 | 184 | def set(self, key, value): 185 | for i, line in enumerate(self.lines): 186 | if line.startswith(key): 187 | content = "{} = {};".format(key, value) 188 | self.lines[i] = content 189 | self.save() 190 | -------------------------------------------------------------------------------- /egs/vivos/extension/path.sh: -------------------------------------------------------------------------------- 1 | # Defining Kaldi root directory 2 | export KALDI_ROOT=`pwd`/../.. 3 | 4 | # Setting paths to useful tools 5 | export PATH=$PWD/utils/:$KALDI_ROOT/src/bin:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/src/fstbin/:$KALDI_ROOT/src/gmmbin/:$KALDI_ROOT/src/featbin/:$KALDI_ROOT/src/lmbin/:$KALDI_ROOT/src/sgmm2bin/:$KALDI_ROOT/src/fgmmbin/:$KALDI_ROOT/src/latbin/:$PWD:$PATH 6 | 7 | # Defining audio data directory (modify it for your installation directory!) 8 | export DATA_ROOT=`pwd`/audio 9 | 10 | # Enable SRILM 11 | . $KALDI_ROOT/tools/env.sh 12 | 13 | # Variable needed for proper data sorting 14 | export LC_ALL=C -------------------------------------------------------------------------------- /egs/vivos/extension/run_deltadelta.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | . ./path.sh || exit 1 4 | . 
./cmd.sh || exit 1 5 | 6 | EXP_START=$(date +%s); 7 | 8 | nj=1 # number of parallel jobs 9 | lm_order=1 # language model order (n-gram quantity) 10 | 11 | # Safety mechanism (possible running this script with modified arguments) 12 | . utils/parse_options.sh || exit 1 13 | [[ $# -ge 1 ]] && { echo "Wrong arguments!"; exit 1; } 14 | 15 | # Removing previously created data (from last run.sh execution) 16 | rm -rf exp mfcc data/train/spk2utt data/train/cmvn.scp data/train/feats.scp data/train/split1 data/test/spk2utt data/test/cmvn.scp data/test/feats.scp data/test/split1 data/local/lang data/lang data/local/tmp data/local/dict/lexiconp.txt 17 | 18 | 19 | 20 | echo 21 | echo "===== PREPARING ACOUSTIC DATA =====" 22 | echo 23 | 24 | # Needs to be prepared by hand (or using self written scripts): 25 | # 26 | # spk2gender [ ] 27 | # wav.scp [ ] 28 | # text [ ] 29 | # utt2spk [ ] 30 | # corpus.txt [] 31 | 32 | # Making spk2utt files 33 | utils/utt2spk_to_spk2utt.pl data/train/utt2spk > data/train/spk2utt 34 | utils/utt2spk_to_spk2utt.pl data/test/utt2spk > data/test/spk2utt 35 | 36 | 37 | echo 38 | echo "===== FEATURES EXTRACTION =====" 39 | echo 40 | 41 | # Making feats.scp files 42 | mfccdir=mfcc 43 | # Uncomment and modify arguments in scripts below if you have any problems with data sorting 44 | # utils/validate_data_dir.sh data/train # script for checking prepared data - here: for data/train directory 45 | # utils/fix_data_dir.sh data/train # tool for data proper sorting if needed - here: for data/train directory 46 | steps/make_mfcc.sh --nj $nj --cmd "$train_cmd" data/train exp/make_mfcc/train $mfccdir 47 | steps/make_mfcc.sh --nj $nj --cmd "$train_cmd" data/test exp/make_mfcc/test $mfccdir 48 | 49 | 50 | # Making cmvn.scp files 51 | steps/compute_cmvn_stats.sh data/train exp/make_mfcc/train $mfccdir 52 | steps/compute_cmvn_stats.sh data/test exp/make_mfcc/test $mfccdir 53 | 54 | echo 55 | echo "===== PREPARING LANGUAGE DATA =====" 56 | echo 57 | 58 | # Needs to be 
prepared by hand (or using self written scripts): 59 | # 60 | # lexicon.txt [ ...] 61 | # nonsilence_phones.txt [] 62 | # silence_phones.txt [] 63 | # optional_silence.txt [] 64 | 65 | # Preparing language data 66 | utils/prepare_lang.sh data/local/dict "" data/local/lang data/lang 67 | 68 | echo 69 | echo "===== LANGUAGE MODEL CREATION =====" 70 | echo "===== MAKING lm.arpa =====" 71 | echo 72 | 73 | loc=`which ngram-count`; 74 | if [ -z $loc ]; then 75 | if uname -a | grep 64 >/dev/null; then 76 | sdir=$KALDI_ROOT/tools/srilm/bin/i686-m64 77 | else 78 | sdir=$KALDI_ROOT/tools/srilm/bin/i686 79 | fi 80 | if [ -f $sdir/ngram-count ]; then 81 | echo "Using SRILM language modelling tool from $sdir" 82 | export PATH=$PATH:$sdir 83 | else 84 | echo "SRILM toolkit is probably not installed. 85 | Instructions: tools/install_srilm.sh" 86 | exit 1 87 | fi 88 | fi 89 | 90 | local=data/local 91 | mkdir $local/tmp 92 | ngram-count -order $lm_order -write-vocab $local/tmp/vocab-full.txt -wbdiscount -text $local/corpus.txt -lm $local/tmp/lm.arpa 93 | 94 | echo 95 | echo "===== MAKING G.fst =====" 96 | echo 97 | 98 | lang=data/lang 99 | arpa2fst --disambig-symbol=#0 --read-symbol-table=$lang/words.txt $local/tmp/lm.arpa $lang/G.fst 100 | 101 | echo 102 | echo "===== MONO TRAINING =====" 103 | echo 104 | 105 | START=$(date +%s); 106 | steps/train_mono.sh --nj $nj \ 107 | --cmd "$train_cmd" data/train data/lang exp/mono || exit 1 108 | END=$(date +%s); 109 | MONO_TRAINING_TIME=$((END - START)) 110 | 111 | echo 112 | echo "===== MONO DECODING =====" 113 | echo 114 | 115 | START=$(date +%s); 116 | utils/mkgraph.sh --mono data/lang exp/mono exp/mono/graph || exit 1 117 | # steps/decode.sh --config conf/decode.config --nj 1 --cmd "$decode_cmd" \ 118 | # exp/mono/graph data/test exp/mono/decode 119 | END=$(date +%s); 120 | MONO_DECODING_TIME=$((END - START)) 121 | 122 | echo 123 | echo "===== MONO ALIGNMENT =====" 124 | echo 125 | 126 | START=$(date +%s); 127 | steps/align_si.sh --nj 
$nj --cmd "$train_cmd" \ 128 | data/train data/lang exp/mono exp/mono_ali || exit 1 129 | END=$(date +%s); 130 | MONO_ALIGNMENT_TIME=$((END - START)) 131 | 132 | echo 133 | echo "===== TRI1 (first triphone pass) TRAINING =====" 134 | echo 135 | 136 | START=$(date +%s); 137 | steps/train_deltas.sh --cmd "$train_cmd" 2500 20000 \ 138 | data/train data/lang exp/mono_ali exp/tri1 || exit 1 139 | END=$(date +%s); 140 | TRI1_TRAINING_TIME=$((END - START)) 141 | 142 | echo 143 | echo "===== TRI1 (first triphone pass) DECODING =====" 144 | echo 145 | 146 | START=$(date +%s); 147 | utils/mkgraph.sh data/lang exp/tri1 exp/tri1/graph || exit 1 148 | # steps/decode.sh --config conf/decode.config --nj 1 --cmd "$decode_cmd" \ 149 | # exp/tri1/graph data/test exp/tri1/decode 150 | END=$(date +%s); 151 | TRI1_DECODING_TIME=$((END - START)) 152 | 153 | echo 154 | echo "===== TRI1 ALIGNMENT =====" 155 | echo 156 | 157 | START=$(date +%s); 158 | steps/align_si.sh --nj $nj --cmd "$train_cmd" \ 159 | data/train data/lang exp/tri1 exp/tri1_ali || exit 1; 160 | END=$(date +%s); 161 | TRI1_ALIGNMENT_TIME=$((END - START)) 162 | 163 | echo 164 | echo "===== TRI2A TRAINING =====" 165 | echo 166 | 167 | START=$(date +%s); 168 | steps/train_deltas.sh --cmd "$train_cmd" 2500 20000 \ 169 | data/train data/lang exp/tri1_ali exp/tri2a || exit 1 170 | END=$(date +%s); 171 | TRI2A_TRAINING_TIME=$((END - START)) 172 | 173 | echo 174 | echo "===== TRI2A DECODING =====" 175 | echo 176 | 177 | START=$(date +%s); 178 | utils/mkgraph.sh data/lang exp/tri2a exp/tri2a/graph || exit 1 179 | steps/decode.sh --config conf/decode.config --nj 1 --cmd "$decode_cmd" \ 180 | exp/tri2a/graph data/test exp/tri2a/decode 181 | END=$(date +%s); 182 | TRI2A_DECODING_TIME=$((END - START)) 183 | 184 | echo 185 | echo "===== TRI2A ALIGNMENT =====" 186 | echo 187 | 188 | START=$(date +%s); 189 | steps/align_si.sh --nj $nj --cmd "$train_cmd" \ 190 | data/train data/lang exp/tri2a exp/tri2a_ali || exit 1; 191 | END=$(date 
+%s); 192 | TRI2A_ALIGNMENT_TIME=$((END - START)) 193 | 194 | echo 195 | echo "===== run.sh script is finished =====" 196 | echo 197 | 198 | EXP_END=$(date +%s); 199 | EXP_TIME=$((EXP_END - EXP_START)) 200 | 201 | log_file='exp.log' 202 | echo "" > $log_file 203 | echo "===== Time Report =====" >> $log_file 204 | echo "Mono" >> $log_file 205 | echo $MONO_TRAINING_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file 206 | echo $MONO_DECODING_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file 207 | echo $MONO_ALIGNMENT_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file 208 | 209 | echo "Tri1" >> $log_file 210 | echo $TRI1_TRAINING_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file 211 | echo $TRI1_DECODING_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file 212 | echo $TRI1_ALIGNMENT_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file 213 | 214 | echo "Tri2a" >> $log_file 215 | echo $TRI2A_TRAINING_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file 216 | echo $TRI2A_DECODING_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file 217 | echo $TRI2A_ALIGNMENT_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file 218 | 219 | echo "Total time:" >> $log_file 220 | echo $EXP_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file 221 | 222 | echo -e "\n" >> $log_file 223 | echo "===== Score Report =====" >> $log_file 224 | echo "Best WER" >> $log_file 225 | for x in exp/*/decode*; do [ -d $x ] && [[ $x =~ "$1" ]] && grep WER $x/wer_* | utils/best_wer.sh; done >> $log_file 226 | 227 | echo -e "\n" >> $log_file 228 | 229 | cat $log_file 230 | -------------------------------------------------------------------------------- /egs/vivos/extension/run_lda_mllt.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | . ./path.sh || exit 1 4 | . 
./cmd.sh || exit 1 5 | 6 | EXP_START=$(date +%s); 7 | 8 | nj=1 # number of parallel jobs 9 | lm_order=1 # language model order (n-gram quantity) 10 | 11 | # Safety mechanism (possible running this script with modified arguments) 12 | . utils/parse_options.sh || exit 1 13 | [[ $# -ge 1 ]] && { echo "Wrong arguments!"; exit 1; } 14 | 15 | # Removing previously created data (from last run.sh execution) 16 | rm -rf exp mfcc data/train/spk2utt data/train/cmvn.scp data/train/feats.scp data/train/split1 data/test/spk2utt data/test/cmvn.scp data/test/feats.scp data/test/split1 data/local/lang data/lang data/local/tmp data/local/dict/lexiconp.txt 17 | 18 | echo 19 | echo "===== PREPARING ACOUSTIC DATA =====" 20 | echo 21 | 22 | # Needs to be prepared by hand (or using self written scripts): 23 | # 24 | # spk2gender [ ] 25 | # wav.scp [ ] 26 | # text [ ] 27 | # utt2spk [ ] 28 | # corpus.txt [] 29 | 30 | # Making spk2utt files 31 | utils/utt2spk_to_spk2utt.pl data/train/utt2spk > data/train/spk2utt 32 | utils/utt2spk_to_spk2utt.pl data/test/utt2spk > data/test/spk2utt 33 | 34 | echo 35 | echo "===== FEATURES EXTRACTION =====" 36 | echo 37 | 38 | # Making feats.scp files 39 | mfccdir=mfcc 40 | # Uncomment and modify arguments in scripts below if you have any problems with data sorting 41 | # utils/validate_data_dir.sh data/train # script for checking prepared data - here: for data/train directory 42 | # utils/fix_data_dir.sh data/train # tool for data proper sorting if needed - here: for data/train directory 43 | steps/make_mfcc.sh --nj $nj --cmd "$train_cmd" data/train exp/make_mfcc/train $mfccdir 44 | steps/make_mfcc.sh --nj $nj --cmd "$train_cmd" data/test exp/make_mfcc/test $mfccdir 45 | 46 | # Making cmvn.scp files 47 | steps/compute_cmvn_stats.sh data/train exp/make_mfcc/train $mfccdir 48 | steps/compute_cmvn_stats.sh data/test exp/make_mfcc/test $mfccdir 49 | 50 | echo 51 | echo "===== PREPARING LANGUAGE DATA =====" 52 | echo 53 | 54 | # Needs to be prepared by hand (or 
using self written scripts): 55 | # 56 | # lexicon.txt [ ...] 57 | # nonsilence_phones.txt [] 58 | # silence_phones.txt [] 59 | # optional_silence.txt [] 60 | 61 | # Preparing language data 62 | utils/prepare_lang.sh data/local/dict "" data/local/lang data/lang 63 | 64 | echo 65 | echo "===== LANGUAGE MODEL CREATION =====" 66 | echo "===== MAKING lm.arpa =====" 67 | echo 68 | 69 | loc=`which ngram-count`; 70 | if [ -z $loc ]; then 71 | if uname -a | grep 64 >/dev/null; then 72 | sdir=$KALDI_ROOT/tools/srilm/bin/i686-m64 73 | else 74 | sdir=$KALDI_ROOT/tools/srilm/bin/i686 75 | fi 76 | if [ -f $sdir/ngram-count ]; then 77 | echo "Using SRILM language modelling tool from $sdir" 78 | export PATH=$PATH:$sdir 79 | else 80 | echo "SRILM toolkit is probably not installed. 81 | Instructions: tools/install_srilm.sh" 82 | exit 1 83 | fi 84 | fi 85 | 86 | local=data/local 87 | mkdir $local/tmp 88 | ngram-count -order $lm_order -write-vocab $local/tmp/vocab-full.txt -wbdiscount -text $local/corpus.txt -lm $local/tmp/lm.arpa 89 | 90 | echo 91 | echo "===== MAKING G.fst =====" 92 | echo 93 | 94 | lang=data/lang 95 | arpa2fst --disambig-symbol=#0 --read-symbol-table=$lang/words.txt $local/tmp/lm.arpa $lang/G.fst 96 | 97 | echo 98 | echo "===== MONO TRAINING =====" 99 | echo 100 | 101 | START=$(date +%s); 102 | steps/train_mono.sh --nj $nj \ 103 | --cmd "$train_cmd" data/train data/lang exp/mono || exit 1 104 | END=$(date +%s); 105 | MONO_TRAINING_TIME=$((END - START)) 106 | 107 | echo 108 | echo "===== MONO DECODING =====" 109 | echo 110 | 111 | START=$(date +%s); 112 | utils/mkgraph.sh --mono data/lang exp/mono exp/mono/graph || exit 1 113 | END=$(date +%s); 114 | MONO_DECODING_TIME=$((END - START)) 115 | 116 | echo 117 | echo "===== MONO ALIGNMENT =====" 118 | echo 119 | 120 | START=$(date +%s); 121 | steps/align_si.sh --nj $nj --cmd "$train_cmd" \ 122 | data/train data/lang exp/mono exp/mono_ali || exit 1 123 | END=$(date +%s); 124 | MONO_ALIGNMENT_TIME=$((END - START)) 125 | 
126 | echoalign 127 | echo "===== TRI1 (first triphone pass) TRAINING =====" 128 | echo 129 | 130 | START=$(date +%s); 131 | steps/train_deltas.sh --cmd "$train_cmd" 2500 20000 \ 132 | data/train data/lang exp/mono_ali exp/tri1 || exit 1 133 | END=$(date +%s); 134 | TRI1_TRAINING_TIME=$((END - START)) 135 | 136 | echo 137 | echo "===== TRI1 (first triphone pass) DECODING =====" 138 | echo 139 | 140 | START=$(date +%s); 141 | utils/mkgraph.sh data/lang exp/tri1 exp/tri1/graph || exit 1 142 | END=$(date +%s); 143 | TRI1_DECODING_TIME=$((END - START)) 144 | 145 | echo 146 | echo "===== TRI1 ALIGNMENT =====" 147 | echo 148 | 149 | START=$(date +%s); 150 | steps/align_si.sh --nj $nj --cmd "$train_cmd" \ 151 | data/train data/lang exp/tri1 exp/tri1_ali || exit 1; 152 | END=$(date +%s); 153 | TRI1_ALIGNMENT_TIME=$((END - START)) 154 | 155 | echo 156 | echo "===== TRI2A TRAINING =====" 157 | echo 158 | 159 | START=$(date +%s); 160 | steps/train_deltas.sh --cmd "$train_cmd" 2500 20000 \ 161 | data/train data/lang exp/tri1_ali exp/tri2a || exit 1 162 | END=$(date +%s); 163 | TRI2A_TRAINING_TIME=$((END - START)) 164 | 165 | echo 166 | echo "===== TRI2A DECODING =====" 167 | echo 168 | 169 | START=$(date +%s); 170 | utils/mkgraph.sh data/lang exp/tri2a exp/tri2a/graph || exit 1 171 | END=$(date +%s); 172 | TRI2A_DECODING_TIME=$((END - START)) 173 | 174 | echo 175 | echo "===== TRI2A ALIGNMENT =====" 176 | echo 177 | 178 | START=$(date +%s); 179 | steps/align_si.sh --nj $nj --cmd "$train_cmd" \ 180 | data/train data/lang exp/tri2a exp/tri2a_ali || exit 1; 181 | END=$(date +%s); 182 | TRI2A_ALIGNMENT_TIME=$((END - START)) 183 | 184 | echo 185 | echo "===== TRI3A TRAINING =====" 186 | echo 187 | 188 | START=$(date +%s); 189 | steps/train_lda_mllt.sh --cmd "$train_cmd" 2500 20000 \ 190 | data/train data/lang exp/tri2a_ali exp/tri3a || exit 1; 191 | END=$(date +%s); 192 | TRI3A_TRAINING_TIME=$((END - START)) 193 | 194 | echo 195 | echo "===== TRI3A DECODING =====" 196 | echo 197 | 
198 | START=$(date +%s); 199 | utils/mkgraph.sh data/lang exp/tri3a exp/tri3a/graph || exit 1 200 | steps/decode.sh --config conf/decode.config --nj 1 --cmd "$decode_cmd" \ 201 | exp/tri3a/graph data/test exp/tri3a/decode 202 | END=$(date +%s); 203 | TRI3A_DECODING_TIME=$((END - START)) 204 | 205 | echo 206 | echo "===== TRI3A ALIGNMENT =====" 207 | echo 208 | 209 | START=$(date +%s); 210 | steps/align_fmllr.sh --nj $nj --cmd "$train_cmd" \ 211 | data/train data/lang exp/tri3a exp/tri3a_ali || exit 1; 212 | END=$(date +%s); 213 | TRI3A_ALIGNMENT_TIME=$((END - START)) 214 | 215 | echo 216 | echo "===== run.sh script is finished =====" 217 | echo 218 | 219 | EXP_END=$(date +%s); 220 | EXP_TIME=$((EXP_END - EXP_START)) 221 | 222 | log_file='exp.log' 223 | echo "" > $log_file 224 | echo "===== Time Report =====" >> $log_file 225 | echo "Mono" >> $log_file 226 | echo $MONO_TRAINING_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file 227 | echo $MONO_DECODING_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file 228 | echo $MONO_ALIGNMENT_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file 229 | 230 | echo "Tri1" >> $log_file 231 | echo $TRI1_TRAINING_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file 232 | echo $TRI1_DECODING_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file 233 | echo $TRI1_ALIGNMENT_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file 234 | 235 | echo "Tri2a" >> $log_file 236 | echo $TRI2A_TRAINING_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file 237 | echo $TRI2A_DECODING_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file 238 | echo $TRI2A_ALIGNMENT_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file 239 | 240 | echo "Tri3a" >> $log_file 241 | echo $TRI3A_TRAINING_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file 242 | echo $TRI3A_DECODING_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file 243 | echo $TRI3A_ALIGNMENT_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file 244 
| 245 | echo "Total time:" >> $log_file 246 | echo $EXP_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file 247 | 248 | echo -e "\n" >> $log_file 249 | echo "===== Score Report =====" >> $log_file 250 | echo "Best WER" >> $log_file 251 | for x in exp/*/decode*; do [ -d $x ] && [[ $x =~ "$1" ]] && grep WER $x/wer_* | utils/best_wer.sh; done >> $log_file 252 | 253 | echo -e "\n" >> $log_file 254 | 255 | cat $log_file 256 | -------------------------------------------------------------------------------- /egs/vivos/extension/run_lda_mllt_decode.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | . ./path.sh || exit 1 4 | . ./cmd.sh || exit 1 5 | 6 | EXP_START=$(date +%s); 7 | 8 | nj=1 # number of parallel jobs 9 | lm_order=1 # language model order (n-gram quantity) 10 | 11 | # Safety mechanism (possible running this script with modified arguments) 12 | . utils/parse_options.sh || exit 1 13 | [[ $# -ge 1 ]] && { echo "Wrong arguments!"; exit 1; } 14 | 15 | # Removing previously created data (from last run.sh execution) 16 | rm -rf exp mfcc data/train/spk2utt data/train/cmvn.scp data/train/feats.scp data/train/split1 data/test/spk2utt data/test/cmvn.scp data/test/feats.scp data/test/split1 data/local/lang data/lang data/local/tmp data/local/dict/lexiconp.txt 17 | 18 | echo 19 | echo "===== PREPARING ACOUSTIC DATA =====" 20 | echo 21 | 22 | # Needs to be prepared by hand (or using self written scripts): 23 | # 24 | # spk2gender [ ] 25 | # wav.scp [ ] 26 | # text [ ] 27 | # utt2spk [ ] 28 | # corpus.txt [] 29 | 30 | # Making spk2utt files 31 | utils/utt2spk_to_spk2utt.pl data/train/utt2spk > data/train/spk2utt 32 | utils/utt2spk_to_spk2utt.pl data/test/utt2spk > data/test/spk2utt 33 | 34 | echo 35 | echo "===== FEATURES EXTRACTION =====" 36 | echo 37 | 38 | # Making feats.scp files 39 | mfccdir=mfcc 40 | # Uncomment and modify arguments in scripts below if you have any problems with data sorting 41 | # 
utils/validate_data_dir.sh data/train # script for checking prepared data - here: for data/train directory 42 | # utils/fix_data_dir.sh data/train # tool for data proper sorting if needed - here: for data/train directory 43 | steps/make_mfcc.sh --nj $nj --cmd "$train_cmd" data/train exp/make_mfcc/train $mfccdir 44 | steps/make_mfcc.sh --nj $nj --cmd "$train_cmd" data/test exp/make_mfcc/test $mfccdir 45 | 46 | # Making cmvn.scp files 47 | steps/compute_cmvn_stats.sh data/train exp/make_mfcc/train $mfccdir 48 | steps/compute_cmvn_stats.sh data/test exp/make_mfcc/test $mfccdir 49 | 50 | echo 51 | echo "===== PREPARING LANGUAGE DATA =====" 52 | echo 53 | 54 | # Needs to be prepared by hand (or using self written scripts): 55 | # 56 | # lexicon.txt [ ...] 57 | # nonsilence_phones.txt [] 58 | # silence_phones.txt [] 59 | # optional_silence.txt [] 60 | 61 | # Preparing language data 62 | utils/prepare_lang.sh data/local/dict "" data/local/lang data/lang 63 | 64 | echo 65 | echo "===== LANGUAGE MODEL CREATION =====" 66 | echo "===== MAKING lm.arpa =====" 67 | echo 68 | 69 | loc=`which ngram-count`; 70 | if [ -z $loc ]; then 71 | if uname -a | grep 64 >/dev/null; then 72 | sdir=$KALDI_ROOT/tools/srilm/bin/i686-m64 73 | else 74 | sdir=$KALDI_ROOT/tools/srilm/bin/i686 75 | fi 76 | if [ -f $sdir/ngram-count ]; then 77 | echo "Using SRILM language modelling tool from $sdir" 78 | export PATH=$PATH:$sdir 79 | else 80 | echo "SRILM toolkit is probably not installed. 
81 | Instructions: tools/install_srilm.sh" 82 | exit 1 83 | fi 84 | fi 85 | 86 | local=data/local 87 | mkdir $local/tmp 88 | ngram-count -order $lm_order -write-vocab $local/tmp/vocab-full.txt -wbdiscount -text $local/corpus.txt -lm $local/tmp/lm.arpa 89 | 90 | echo 91 | echo "===== MAKING G.fst =====" 92 | echo 93 | 94 | lang=data/lang 95 | arpa2fst --disambig-symbol=#0 --read-symbol-table=$lang/words.txt $local/tmp/lm.arpa $lang/G.fst 96 | 97 | echo 98 | echo "===== MONO TRAINING =====" 99 | echo 100 | 101 | START=$(date +%s); 102 | steps/train_mono.sh --nj $nj \ 103 | --cmd "$train_cmd" data/train data/lang exp/mono || exit 1 104 | END=$(date +%s); 105 | MONO_TRAINING_TIME=$((END - START)) 106 | 107 | echo 108 | echo "===== MONO DECODING =====" 109 | echo 110 | 111 | START=$(date +%s); 112 | utils/mkgraph.sh --mono data/lang exp/mono exp/mono/graph || exit 1 113 | steps/decode.sh --config conf/decode.config --nj 1 --cmd "$decode_cmd" \ 114 | exp/mono/graph data/test exp/mono/decode 115 | END=$(date +%s); 116 | MONO_DECODING_TIME=$((END - START)) 117 | 118 | echo 119 | echo "===== MONO ALIGNMENT =====" 120 | echo 121 | 122 | START=$(date +%s); 123 | steps/align_si.sh --nj $nj --cmd "$train_cmd" \ 124 | data/train data/lang exp/mono exp/mono_ali || exit 1 125 | END=$(date +%s); 126 | MONO_ALIGNMENT_TIME=$((END - START)) 127 | 128 | echo 129 | echo "===== TRI1 (first triphone pass) TRAINING =====" 130 | echo 131 | 132 | START=$(date +%s); 133 | steps/train_deltas.sh --cmd "$train_cmd" 2500 20000 \ 134 | data/train data/lang exp/mono_ali exp/tri1 || exit 1 135 | END=$(date +%s); 136 | TRI1_TRAINING_TIME=$((END - START)) 137 | 138 | echo 139 | echo "===== TRI1 (first triphone pass) DECODING =====" 140 | echo 141 | 142 | START=$(date +%s); 143 | utils/mkgraph.sh data/lang exp/tri1 exp/tri1/graph || exit 1 144 | steps/decode.sh --config conf/decode.config --nj 1 --cmd "$decode_cmd" \ 145 | exp/tri1/graph data/test exp/tri1/decode 146 | END=$(date +%s); 147 | 
TRI1_DECODING_TIME=$((END - START)) 148 | 149 | echo 150 | echo "===== TRI1 ALIGNMENT =====" 151 | echo 152 | 153 | START=$(date +%s); 154 | steps/align_si.sh --nj $nj --cmd "$train_cmd" \ 155 | data/train data/lang exp/tri1 exp/tri1_ali || exit 1; 156 | END=$(date +%s); 157 | TRI1_ALIGNMENT_TIME=$((END - START)) 158 | 159 | echo 160 | echo "===== TRI2A TRAINING =====" 161 | echo 162 | 163 | START=$(date +%s); 164 | steps/train_deltas.sh --cmd "$train_cmd" 2500 20000 \ 165 | data/train data/lang exp/tri1_ali exp/tri2a || exit 1 166 | END=$(date +%s); 167 | TRI2A_TRAINING_TIME=$((END - START)) 168 | 169 | echo 170 | echo "===== TRI2A DECODING =====" 171 | echo 172 | 173 | START=$(date +%s); 174 | utils/mkgraph.sh data/lang exp/tri2a exp/tri2a/graph || exit 1 175 | steps/decode.sh --config conf/decode.config --nj 1 --cmd "$decode_cmd" \ 176 | exp/tri2a/graph data/test exp/tri2a/decode 177 | END=$(date +%s); 178 | TRI2A_DECODING_TIME=$((END - START)) 179 | 180 | echo 181 | echo "===== TRI2A ALIGNMENT =====" 182 | echo 183 | 184 | START=$(date +%s); 185 | steps/align_si.sh --nj $nj --cmd "$train_cmd" \ 186 | data/train data/lang exp/tri2a exp/tri2a_ali || exit 1; 187 | END=$(date +%s); 188 | TRI2A_ALIGNMENT_TIME=$((END - START)) 189 | 190 | echo 191 | echo "===== TRI3A TRAINING =====" 192 | echo 193 | 194 | START=$(date +%s); 195 | steps/train_lda_mllt.sh --cmd "$train_cmd" 2500 20000 \ 196 | data/train data/lang exp/tri2a_ali exp/tri3a || exit 1; 197 | END=$(date +%s); 198 | TRI3A_TRAINING_TIME=$((END - START)) 199 | 200 | echo 201 | echo "===== TRI3A DECODING =====" 202 | echo 203 | 204 | START=$(date +%s); 205 | utils/mkgraph.sh data/lang exp/tri3a exp/tri3a/graph || exit 1 206 | steps/decode.sh --config conf/decode.config --nj 1 --cmd "$decode_cmd" \ 207 | exp/tri3a/graph data/test exp/tri3a/decode 208 | END=$(date +%s); 209 | TRI3A_DECODING_TIME=$((END - START)) 210 | 211 | echo 212 | echo "===== TRI3A ALIGNMENT =====" 213 | echo 214 | 215 | START=$(date +%s); 216 
| steps/align_fmllr.sh --nj $nj --cmd "$train_cmd" \ 217 | data/train data/lang exp/tri3a exp/tri3a_ali || exit 1; 218 | END=$(date +%s); 219 | TRI3A_ALIGNMENT_TIME=$((END - START)) 220 | 221 | echo 222 | echo "===== run.sh script is finished =====" 223 | echo 224 | 225 | EXP_END=$(date +%s); 226 | EXP_TIME=$((EXP_END - EXP_START)) 227 | 228 | log_file='exp.log' 229 | echo "" > $log_file 230 | echo "===== Time Report =====" >> $log_file 231 | echo "Mono" >> $log_file 232 | echo $MONO_TRAINING_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file 233 | echo $MONO_DECODING_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file 234 | echo $MONO_ALIGNMENT_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file 235 | 236 | echo "Tri1" >> $log_file 237 | echo $TRI1_TRAINING_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file 238 | echo $TRI1_DECODING_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file 239 | echo $TRI1_ALIGNMENT_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file 240 | 241 | echo "Tri2a" >> $log_file 242 | echo $TRI2A_TRAINING_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file 243 | echo $TRI2A_DECODING_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file 244 | echo $TRI2A_ALIGNMENT_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file 245 | 246 | echo "Tri3a" >> $log_file 247 | echo $TRI3A_TRAINING_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file 248 | echo $TRI3A_DECODING_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file 249 | echo $TRI3A_ALIGNMENT_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file 250 | 251 | echo "Total time:" >> $log_file 252 | echo $EXP_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file 253 | 254 | echo -e "\n" >> $log_file 255 | echo "===== Score Report =====" >> $log_file 256 | echo "Best WER" >> $log_file 257 | for x in exp/*/decode*; do [ -d $x ] && [[ $x =~ "$1" ]] && grep WER $x/wer_* | utils/best_wer.sh; done >> $log_file 258 | 259 | echo -e "\n" >> 
$log_file 260 | 261 | cat $log_file 262 | -------------------------------------------------------------------------------- /egs/vivos/extension/run_sat.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | . ./path.sh || exit 1 4 | . ./cmd.sh || exit 1 5 | 6 | EXP_START=$(date +%s); 7 | 8 | nj=1 # number of parallel jobs 9 | lm_order=1 # language model order (n-gram quantity) 10 | 11 | # Safety mechanism (possible running this script with modified arguments) 12 | . utils/parse_options.sh || exit 1 13 | [[ $# -ge 1 ]] && { echo "Wrong arguments!"; exit 1; } 14 | 15 | # Removing previously created data (from last run.sh execution) 16 | rm -rf exp mfcc data/train/spk2utt data/train/cmvn.scp data/train/feats.scp data/train/split1 data/test/spk2utt data/test/cmvn.scp data/test/feats.scp data/test/split1 data/local/lang data/lang data/local/tmp data/local/dict/lexiconp.txt 17 | 18 | echo 19 | echo "===== PREPARING ACOUSTIC DATA =====" 20 | echo 21 | 22 | # Needs to be prepared by hand (or using self written scripts): 23 | # 24 | # spk2gender [ ] 25 | # wav.scp [ ] 26 | # text [ ] 27 | # utt2spk [ ] 28 | # corpus.txt [] 29 | 30 | # Making spk2utt files 31 | utils/utt2spk_to_spk2utt.pl data/train/utt2spk > data/train/spk2utt 32 | utils/utt2spk_to_spk2utt.pl data/test/utt2spk > data/test/spk2utt 33 | 34 | echo 35 | echo "===== FEATURES EXTRACTION =====" 36 | echo 37 | 38 | # Making feats.scp files 39 | mfccdir=mfcc 40 | # Uncomment and modify arguments in scripts below if you have any problems with data sorting 41 | # utils/validate_data_dir.sh data/train # script for checking prepared data - here: for data/train directory 42 | # utils/fix_data_dir.sh data/train # tool for data proper sorting if needed - here: for data/train directory 43 | steps/make_mfcc.sh --nj $nj --cmd "$train_cmd" data/train exp/make_mfcc/train $mfccdir 44 | steps/make_mfcc.sh --nj $nj --cmd "$train_cmd" data/test exp/make_mfcc/test $mfccdir 45 | 
46 | # Making cmvn.scp files 47 | steps/compute_cmvn_stats.sh data/train exp/make_mfcc/train $mfccdir 48 | steps/compute_cmvn_stats.sh data/test exp/make_mfcc/test $mfccdir 49 | 50 | echo 51 | echo "===== PREPARING LANGUAGE DATA =====" 52 | echo 53 | 54 | # Needs to be prepared by hand (or using self written scripts): 55 | # 56 | # lexicon.txt [ ...] 57 | # nonsilence_phones.txt [] 58 | # silence_phones.txt [] 59 | # optional_silence.txt [] 60 | 61 | # Preparing language data 62 | utils/prepare_lang.sh data/local/dict "" data/local/lang data/lang 63 | 64 | echo 65 | echo "===== LANGUAGE MODEL CREATION =====" 66 | echo "===== MAKING lm.arpa =====" 67 | echo 68 | 69 | loc=`which ngram-count`; 70 | if [ -z $loc ]; then 71 | if uname -a | grep 64 >/dev/null; then 72 | sdir=$KALDI_ROOT/tools/srilm/bin/i686-m64 73 | else 74 | sdir=$KALDI_ROOT/tools/srilm/bin/i686 75 | fi 76 | if [ -f $sdir/ngram-count ]; then 77 | echo "Using SRILM language modelling tool from $sdir" 78 | export PATH=$PATH:$sdir 79 | else 80 | echo "SRILM toolkit is probably not installed. 
81 | Instructions: tools/install_srilm.sh" 82 | exit 1 83 | fi 84 | fi 85 | 86 | local=data/local 87 | mkdir $local/tmp 88 | ngram-count -order $lm_order -write-vocab $local/tmp/vocab-full.txt -wbdiscount -text $local/corpus.txt -lm $local/tmp/lm.arpa 89 | 90 | echo 91 | echo "===== MAKING G.fst =====" 92 | echo 93 | 94 | lang=data/lang 95 | arpa2fst --disambig-symbol=#0 --read-symbol-table=$lang/words.txt $local/tmp/lm.arpa $lang/G.fst 96 | 97 | echo 98 | echo "===== MONO TRAINING =====" 99 | echo 100 | 101 | START=$(date +%s); 102 | steps/train_mono.sh --nj $nj \ 103 | --cmd "$train_cmd" data/train data/lang exp/mono || exit 1 104 | END=$(date +%s); 105 | MONO_TRAINING_TIME=$((END - START)) 106 | 107 | echo 108 | echo "===== MONO DECODING =====" 109 | echo 110 | 111 | START=$(date +%s); 112 | utils/mkgraph.sh --mono data/lang exp/mono exp/mono/graph || exit 1 113 | 114 | END=$(date +%s); 115 | MONO_DECODING_TIME=$((END - START)) 116 | 117 | echo 118 | echo "===== MONO ALIGNMENT =====" 119 | echo 120 | 121 | START=$(date +%s); 122 | steps/align_si.sh --nj $nj --cmd "$train_cmd" \ 123 | data/train data/lang exp/mono exp/mono_ali || exit 1 124 | END=$(date +%s); 125 | MONO_ALIGNMENT_TIME=$((END - START)) 126 | 127 | echo 128 | echo "===== TRI1 (first triphone pass) TRAINING =====" 129 | echo 130 | 131 | START=$(date +%s); 132 | steps/train_deltas.sh --cmd "$train_cmd" 2500 20000 \ 133 | data/train data/lang exp/mono_ali exp/tri1 || exit 1 134 | END=$(date +%s); 135 | TRI1_TRAINING_TIME=$((END - START)) 136 | 137 | echo 138 | echo "===== TRI1 (first triphone pass) DECODING =====" 139 | echo 140 | 141 | START=$(date +%s); 142 | utils/mkgraph.sh data/lang exp/tri1 exp/tri1/graph || exit 1 143 | 144 | END=$(date +%s); 145 | TRI1_DECODING_TIME=$((END - START)) 146 | 147 | echo 148 | echo "===== TRI1 ALIGNMENT =====" 149 | echo 150 | 151 | START=$(date +%s); 152 | steps/align_si.sh --nj $nj --cmd "$train_cmd" \ 153 | data/train data/lang exp/tri1 exp/tri1_ali || exit 1; 
154 | END=$(date +%s); 155 | TRI1_ALIGNMENT_TIME=$((END - START)) 156 | 157 | echo 158 | echo "===== TRI2A TRAINING =====" 159 | echo 160 | 161 | START=$(date +%s); 162 | steps/train_deltas.sh --cmd "$train_cmd" 2500 20000 \ 163 | data/train data/lang exp/tri1_ali exp/tri2a || exit 1 164 | END=$(date +%s); 165 | TRI2A_TRAINING_TIME=$((END - START)) 166 | 167 | echo 168 | echo "===== TRI2A DECODING =====" 169 | echo 170 | 171 | START=$(date +%s); 172 | utils/mkgraph.sh data/lang exp/tri2a exp/tri2a/graph || exit 1 173 | 174 | END=$(date +%s); 175 | TRI2A_DECODING_TIME=$((END - START)) 176 | 177 | echo 178 | echo "===== TRI2A ALIGNMENT =====" 179 | echo 180 | 181 | START=$(date +%s); 182 | steps/align_si.sh --nj $nj --cmd "$train_cmd" \ 183 | data/train data/lang exp/tri2a exp/tri2a_ali || exit 1; 184 | END=$(date +%s); 185 | TRI2A_ALIGNMENT_TIME=$((END - START)) 186 | 187 | echo 188 | echo "===== TRI3A TRAINING =====" 189 | echo 190 | 191 | START=$(date +%s); 192 | steps/train_lda_mllt.sh --cmd "$train_cmd" 2500 20000 \ 193 | data/train data/lang exp/tri2a_ali exp/tri3a || exit 1; 194 | END=$(date +%s); 195 | TRI3A_TRAINING_TIME=$((END - START)) 196 | 197 | echo 198 | echo "===== TRI3A DECODING =====" 199 | echo 200 | 201 | START=$(date +%s); 202 | utils/mkgraph.sh data/lang exp/tri3a exp/tri3a/graph || exit 1 203 | 204 | END=$(date +%s); 205 | TRI3A_DECODING_TIME=$((END - START)) 206 | 207 | echo 208 | echo "===== TRI3A ALIGNMENT =====" 209 | echo 210 | 211 | START=$(date +%s); 212 | steps/align_fmllr.sh --nj $nj --cmd "$train_cmd" \ 213 | data/train data/lang exp/tri3a exp/tri3a_ali || exit 1; 214 | END=$(date +%s); 215 | TRI3A_ALIGNMENT_TIME=$((END - START)) 216 | 217 | 218 | echo 219 | echo "===== TRI4A TRAINING =====" 220 | echo 221 | 222 | START=$(date +%s); 223 | steps/train_sat.sh --cmd "$train_cmd" 2500 20000 \ 224 | data/train data/lang exp/tri3a_ali exp/tri4a || exit 1; 225 | END=$(date +%s); 226 | TRI4A_TRAINING_TIME=$((END - START)) 227 | 228 | echo 229 
| echo "===== TRI4A DECODING =====" 230 | echo 231 | 232 | START=$(date +%s); 233 | utils/mkgraph.sh data/lang exp/tri4a exp/tri4a/graph || exit 1 234 | 235 | END=$(date +%s); 236 | TRI4A_DECODING_TIME=$((END - START)) 237 | 238 | echo 239 | echo "===== TRI4A ALIGNMENT =====" 240 | echo 241 | 242 | START=$(date +%s); 243 | steps/align_fmllr.sh --nj $nj --cmd "$train_cmd" \ 244 | data/train data/lang exp/tri4a exp/tri4a_ali || exit 1; 245 | END=$(date +%s); 246 | TRI4A_ALIGNMENT_TIME=$((END - START)) 247 | 248 | echo 249 | echo "===== TRI5A TRAINING =====" 250 | echo 251 | 252 | START=$(date +%s); 253 | steps/train_sat.sh --cmd "$train_cmd" 3500 100000 \ 254 | data/train data/lang exp/tri4a_ali exp/tri5a || exit 1; 255 | END=$(date +%s); 256 | TRI5A_TRAINING_TIME=$((END - START)) 257 | 258 | echo 259 | echo "===== TRI5A DECODING =====" 260 | echo 261 | 262 | START=$(date +%s); 263 | utils/mkgraph.sh data/lang exp/tri5a exp/tri5a/graph || exit 1 264 | steps/decode.sh --config conf/decode.config --nj 1 --cmd "$decode_cmd" \ 265 | exp/tri5a/graph data/test exp/tri5a/decode 266 | END=$(date +%s); 267 | TRI5A_DECODING_TIME=$((END - START)) 268 | 269 | echo 270 | echo "===== TRI5A ALIGNMENT =====" 271 | echo 272 | 273 | START=$(date +%s); 274 | steps/align_fmllr.sh --nj $nj --cmd "$train_cmd" \ 275 | data/train data/lang exp/tri5a exp/tri5a_ali || exit 1; 276 | END=$(date +%s); 277 | TRI5A_ALIGNMENT_TIME=$((END - START)) 278 | 279 | echo 280 | echo "===== run.sh script is finished =====" 281 | echo 282 | 283 | EXP_END=$(date +%s); 284 | EXP_TIME=$((EXP_END - EXP_START)) 285 | 286 | log_file='exp.log' 287 | echo "" > $log_file 288 | echo "===== Time Report =====" >> $log_file 289 | echo "Mono" >> $log_file 290 | echo $MONO_TRAINING_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file 291 | echo $MONO_DECODING_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file 292 | echo $MONO_ALIGNMENT_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file 293 | 294 | echo 
"Tri1" >> $log_file 295 | echo $TRI1_TRAINING_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file 296 | echo $TRI1_DECODING_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file 297 | echo $TRI1_ALIGNMENT_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file 298 | 299 | echo "Tri2a" >> $log_file 300 | echo $TRI2A_TRAINING_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file 301 | echo $TRI2A_DECODING_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file 302 | echo $TRI2A_ALIGNMENT_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file 303 | 304 | echo "Tri3a" >> $log_file 305 | echo $TRI3A_TRAINING_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file 306 | echo $TRI3A_DECODING_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file 307 | echo $TRI3A_ALIGNMENT_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file 308 | 309 | echo "Tri4a" >> $log_file 310 | echo $TRI4A_TRAINING_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file 311 | echo $TRI4A_DECODING_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file 312 | echo $TRI4A_ALIGNMENT_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file 313 | 314 | echo "Tri5a" >> $log_file 315 | echo $TRI5A_TRAINING_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file 316 | echo $TRI5A_DECODING_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file 317 | echo $TRI5A_ALIGNMENT_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file 318 | 319 | echo "Total time:" >> $log_file 320 | echo $EXP_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file 321 | 322 | echo -e "\n" >> $log_file 323 | echo "===== Score Report =====" >> $log_file 324 | echo "Best WER" >> $log_file 325 | for x in exp/*/decode*; do [ -d $x ] && [[ $x =~ "$1" ]] && grep WER $x/wer_* | utils/best_wer.sh; done >> $log_file 326 | 327 | echo -e "\n" >> $log_file 328 | 329 | cat $log_file 330 | -------------------------------------------------------------------------------- /egs/vivos/extension/text.py: 
# -------------------------------------------------------------------------------
# /egs/vivos/extension/text.py
class PhoneConverter1:
    """Bidirectional converter between Vietnamese orthography and a
    Telex-like ASCII "phone" encoding (e.g. "cà phê" <-> "caf phee").

    Class attributes (the public interface):
        rules_1 -- per vowel: the base letter followed by its five
                   tone-marked forms, in the tone order of ``fsrxj``
                   (f=huyền, s=sắc, r=hỏi, x=ngã, j=nặng).
        rules_2 -- two ASCII letters + the special letter they spell
                   (e.g. "aw" -> 'ă', "dd" -> 'đ').
        w2p     -- single accented character -> ASCII phone string.
        p2w     -- exact inverse of ``w2p``.
    """

    rules_1 = [
        "aàáảãạ",
        "ăằắẳẵặ",
        "âầấẩẫậ",
        "eèéẻẽẹ",
        "êềếểễệ",
        "iìíỉĩị",
        "oòóỏõọ",
        "ôồốổỗộ",
        "ơờớởỡợ",
        "uùúủũụ",
        "ưừứửữự",
        "yỳýỷỹỵ",
    ]
    rules_2 = [
        "awă",
        "aaâ",
        "eeê",
        "ooô",
        "owơ",
        "uwư",
        "ddđ",
    ]

    # Build both lookup tables once, at class-creation time.
    w2p = {}
    for _group in rules_1:
        _base, _marked = _group[0], _group[1:]
        # Spell a special base letter with its ASCII digraph first
        # (e.g. 'ă' -> "aw"), so 'ằ' encodes as "aw" + "f" = "awf".
        for _rule in rules_2:
            if _base == _rule[2]:
                _base = _rule[:2]
        for _char, _tone in zip(_marked, "fsrxj"):
            w2p[_char] = _base + _tone
    # Tone-less special letters map straight to their digraphs.
    for _rule in rules_2:
        w2p[_rule[2]] = _rule[:2]
    # Inverse table; every phone string in w2p is unique, so no clashes.
    p2w = {_phone: _char for _char, _phone in w2p.items()}
    # Drop the loop temporaries so they do not leak into the class
    # namespace as spurious public attributes.
    del _group, _base, _marked, _rule, _char, _tone

    @staticmethod
    def word2phone(word):
        """Encode *word*: each accented character becomes its ASCII phone
        string; characters without a mapping pass through unchanged."""
        table = PhoneConverter1.w2p
        return "".join(table.get(ch, ch) for ch in word)

    @staticmethod
    def phone2word(phone):
        """Decode *phone* greedily: at each position try a 3-character
        phone (digraph + tone), then a 2-character one, otherwise copy
        the single character verbatim."""
        table = PhoneConverter1.p2w
        out = []
        i = 0
        n = len(phone)
        while i < n:
            for width in (3, 2):
                chunk = phone[i:i + width]
                if chunk in table:
                    out.append(table[chunk])
                    i += width
                    break
            else:
                out.append(phone[i])
                i += 1
        return "".join(out)


if __name__ == '__main__':
    # Round-trip sanity checks: each (word, phone) pair must convert
    # correctly in both directions.
    tests = [
        ("con hoẵng", "con hoawxng"),
        ("lựu đạn", "luwju ddajn"),
        ("kiểm tra", "kieerm tra"),
        ("ủy ban", "ury ban"),
        ("cà phê", "caf phee"),
        ("khách sạn", "khasch sajn"),
        ("đúng", "ddusng"),
        ("xã hội", "xax hooji"),
    ]
    for word, phone in tests:
        assert word == PhoneConverter1.phone2word(phone)
        assert phone == PhoneConverter1.word2phone(word)
# -------------------------------------------------------------------------------
# /egs/vivos/extension/transcript_deltadelta.sh: (next file in the dump)
#!/usr/bin/env bash
# -------------------------------------------------------------------------------
# /egs/vivos/extension/transcript_deltadelta.sh
#
# Decode the wav files listed in $transcript_folder/wav.scp with a trained
# delta + delta-delta GMM-HMM model (exp/tri2a):
#   audio -> MFCC features -> +deltas -> lattice -> best path -> words.
#
# FIX: the original carried a second, conflicting shebang ("#!/bin/bash")
# on line 2; the kernel only honours the first shebang, so the duplicate
# was removed.

. ./path.sh || exit 1
. ./cmd.sh || exit 1


model_folder=exp/tri2a            # trained tri2a (deltas) model directory
transcript_folder=transcriptions  # holds wav.scp for the audio to decode
output_folder=output

# Start from a clean output directory on every run.
rm -rf $output_folder
mkdir $output_folder

echo
echo "===== AUDIO -> FEATURE VECTORS ====="
echo

compute-mfcc-feats --config=conf/mfcc.conf \
    scp:$transcript_folder/wav.scp \
    ark,scp:$output_folder/feats.ark,$output_folder/feats.scp

# Append delta and delta-delta coefficients to match the tri2a features.
add-deltas \
    scp:$output_folder/feats.scp \
    ark:$output_folder/delta-feats.ark


echo
echo "===== TRAINED GMM-HMM + FEATURE VECTORS -> LATTICE ====="
echo

gmm-latgen-faster \
    --word-symbol-table=$model_folder/graph/words.txt \
    $model_folder/final.mdl \
    $model_folder/graph/HCLG.fst \
    ark:$output_folder/delta-feats.ark \
    ark,t:$output_folder/lattices.ark

echo
echo "===== LATTICE -> BEST PATH THROUGH LATTICE ====="
echo

lattice-best-path \
    --word-symbol-table=$model_folder/graph/words.txt \
    ark:$output_folder/lattices.ark \
    ark,t:$output_folder/one-best.tra

echo
echo "===== BEST PATH INTEGERS -> BEST PATH WORDS ====="
echo

# Map the word ids in the .tra file (fields 2..end) back to word strings.
utils/int2sym.pl -f 2- \
    $model_folder/graph/words.txt \
    $output_folder/one-best.tra \
    > $output_folder/one-best-hypothesis.txt

cat $output_folder/one-best-hypothesis.txt
# -------------------------------------------------------------------------------
# dump artifact retained from the flattened repository listing:
# /egs/vivos/extension/transcriptions/audio/R001.wav:
# https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/vivos/extension/transcriptions/audio/R001.wav
-------------------------------------------------------------------------------- /egs/vivos/extension/transcriptions/audio/R002.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/vivos/extension/transcriptions/audio/R002.wav -------------------------------------------------------------------------------- /egs/vivos/extension/transcriptions/audio/R003.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/vivos/extension/transcriptions/audio/R003.wav -------------------------------------------------------------------------------- /egs/vivos/extension/transcriptions/audio/R004.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/vivos/extension/transcriptions/audio/R004.wav -------------------------------------------------------------------------------- /egs/vivos/extension/transcriptions/audio/R005.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/vivos/extension/transcriptions/audio/R005.wav -------------------------------------------------------------------------------- /egs/vivos/extension/transcriptions/audio/t1_tat_ca.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/vivos/extension/transcriptions/audio/t1_tat_ca.wav 
from os.path import dirname
import os
import subprocess

import text


def transcript(wav_file):
    """Transcribe *wav_file* with the bundled PocketSphinx model.

    Decodes the audio with ``pocketsphinx_continuous`` using the acoustic
    model, language model and dictionary shipped next to this module,
    then maps the phone-level hypothesis back to Vietnamese words with
    ``text.phone2word``.

    :param wav_file: path to the input wav file (model expects 8 kHz)
    :return: the decoded transcript as a string
    :raises subprocess.CalledProcessError: if the decoder exits nonzero
    """
    tmp_folder = dirname(__file__)
    # PocketSphinx writes its (noisy) log here; removed again below.
    log_file = "{}/yes".format(tmp_folder)
    # Argument list instead of a shell string, so special characters in
    # the wav path cannot be interpreted by a shell.
    command = [
        "pocketsphinx_continuous",
        "-hmm", "{}/model_parameters/tmp.cd_cont_200".format(tmp_folder),
        "-samprate", "8000",
        "-lm", "{}/etc/tmp.lm".format(tmp_folder),
        "-dict", "{}/etc/tmp.dic".format(tmp_folder),
        "-infile", wav_file,
        "-logfn", log_file,
    ]
    try:
        output = subprocess.check_output(command).decode("utf-8").strip()
        return text.phone2word(output)
    finally:
        # Clean up the log even when decoding fails; the file may not
        # exist if pocketsphinx could not be started at all.
        if os.path.exists(log_file):
            os.remove(log_file)
-------------------------------------------------------------------------------- /egs/vivos/model/model_parameters/tmp.cd_cont_200/means: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/vivos/model/model_parameters/tmp.cd_cont_200/means -------------------------------------------------------------------------------- /egs/vivos/model/model_parameters/tmp.cd_cont_200/mixture_weights: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/vivos/model/model_parameters/tmp.cd_cont_200/mixture_weights -------------------------------------------------------------------------------- /egs/vivos/model/model_parameters/tmp.cd_cont_200/noisedict: -------------------------------------------------------------------------------- 1 | SIL 2 | SIL 3 | SIL -------------------------------------------------------------------------------- /egs/vivos/model/model_parameters/tmp.cd_cont_200/transition_matrices: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/vivos/model/model_parameters/tmp.cd_cont_200/transition_matrices -------------------------------------------------------------------------------- /egs/vivos/model/model_parameters/tmp.cd_cont_200/variances: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/vivos/model/model_parameters/tmp.cd_cont_200/variances -------------------------------------------------------------------------------- 
/egs/vivos/model/model_parameters/tmp.cd_cont_200_1/means: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/vivos/model/model_parameters/tmp.cd_cont_200_1/means -------------------------------------------------------------------------------- /egs/vivos/model/model_parameters/tmp.cd_cont_200_1/mixture_weights: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/vivos/model/model_parameters/tmp.cd_cont_200_1/mixture_weights -------------------------------------------------------------------------------- /egs/vivos/model/model_parameters/tmp.cd_cont_200_1/transition_matrices: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/vivos/model/model_parameters/tmp.cd_cont_200_1/transition_matrices -------------------------------------------------------------------------------- /egs/vivos/model/model_parameters/tmp.cd_cont_200_1/variances: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/vivos/model/model_parameters/tmp.cd_cont_200_1/variances -------------------------------------------------------------------------------- /egs/vivos/model/model_parameters/tmp.cd_cont_200_2/means: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/vivos/model/model_parameters/tmp.cd_cont_200_2/means 
-------------------------------------------------------------------------------- /egs/vivos/model/model_parameters/tmp.cd_cont_200_2/mixture_weights: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/vivos/model/model_parameters/tmp.cd_cont_200_2/mixture_weights -------------------------------------------------------------------------------- /egs/vivos/model/model_parameters/tmp.cd_cont_200_2/transition_matrices: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/vivos/model/model_parameters/tmp.cd_cont_200_2/transition_matrices -------------------------------------------------------------------------------- /egs/vivos/model/model_parameters/tmp.cd_cont_200_2/variances: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/vivos/model/model_parameters/tmp.cd_cont_200_2/variances -------------------------------------------------------------------------------- /egs/vivos/model/model_parameters/tmp.cd_cont_200_4/means: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/vivos/model/model_parameters/tmp.cd_cont_200_4/means -------------------------------------------------------------------------------- /egs/vivos/model/model_parameters/tmp.cd_cont_200_4/mixture_weights: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/vivos/model/model_parameters/tmp.cd_cont_200_4/mixture_weights -------------------------------------------------------------------------------- /egs/vivos/model/model_parameters/tmp.cd_cont_200_4/transition_matrices: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/vivos/model/model_parameters/tmp.cd_cont_200_4/transition_matrices -------------------------------------------------------------------------------- /egs/vivos/model/model_parameters/tmp.cd_cont_200_4/variances: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/vivos/model/model_parameters/tmp.cd_cont_200_4/variances -------------------------------------------------------------------------------- /egs/vivos/model/model_parameters/tmp.cd_cont_initial/means: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/vivos/model/model_parameters/tmp.cd_cont_initial/means -------------------------------------------------------------------------------- /egs/vivos/model/model_parameters/tmp.cd_cont_initial/mixture_weights: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/vivos/model/model_parameters/tmp.cd_cont_initial/mixture_weights -------------------------------------------------------------------------------- /egs/vivos/model/model_parameters/tmp.cd_cont_initial/transition_matrices: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/vivos/model/model_parameters/tmp.cd_cont_initial/transition_matrices -------------------------------------------------------------------------------- /egs/vivos/model/model_parameters/tmp.cd_cont_initial/variances: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/vivos/model/model_parameters/tmp.cd_cont_initial/variances -------------------------------------------------------------------------------- /egs/vivos/model/model_parameters/tmp.cd_cont_untied/feat.params: -------------------------------------------------------------------------------- 1 | -lowerf 200 2 | -upperf 3500 3 | -nfilt 31 4 | -transform dct 5 | -lifter 22 6 | -feat 1s_c_d_dd 7 | -agc none 8 | -cmn batch 9 | -varnorm no 10 | -------------------------------------------------------------------------------- /egs/vivos/model/model_parameters/tmp.cd_cont_untied/means: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/vivos/model/model_parameters/tmp.cd_cont_untied/means -------------------------------------------------------------------------------- /egs/vivos/model/model_parameters/tmp.cd_cont_untied/mixture_weights: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/vivos/model/model_parameters/tmp.cd_cont_untied/mixture_weights -------------------------------------------------------------------------------- 
/egs/vivos/model/model_parameters/tmp.cd_cont_untied/noisedict: -------------------------------------------------------------------------------- 1 | SIL 2 | SIL 3 | SIL -------------------------------------------------------------------------------- /egs/vivos/model/model_parameters/tmp.cd_cont_untied/transition_matrices: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/vivos/model/model_parameters/tmp.cd_cont_untied/transition_matrices -------------------------------------------------------------------------------- /egs/vivos/model/model_parameters/tmp.cd_cont_untied/variances: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/vivos/model/model_parameters/tmp.cd_cont_untied/variances -------------------------------------------------------------------------------- /egs/vivos/model/model_parameters/tmp.ci_cont/feat.params: -------------------------------------------------------------------------------- 1 | -lowerf 200 2 | -upperf 3500 3 | -nfilt 31 4 | -transform dct 5 | -lifter 22 6 | -feat 1s_c_d_dd 7 | -agc none 8 | -cmn batch 9 | -varnorm no 10 | -------------------------------------------------------------------------------- /egs/vivos/model/model_parameters/tmp.ci_cont/mdef: -------------------------------------------------------------------------------- 1 | # Generated by /usr/local/libexec/sphinxtrain/mk_mdef_gen on Sat Jan 6 09:51:27 2018 2 | 0.3 3 | 27 n_base 4 | 0 n_tri 5 | 108 n_state_map 6 | 81 n_tied_state 7 | 81 n_tied_ci_state 8 | 27 n_tied_tmat 9 | # 10 | # Columns definitions 11 | #base lft rt p attrib tmat ... state id's ... 
12 | 4 - - - n/a 0 0 1 2 N 13 | SIL - - - filler 1 3 4 5 N 14 | a - - - n/a 2 6 7 8 N 15 | b - - - n/a 3 9 10 11 N 16 | c - - - n/a 4 12 13 14 N 17 | d - - - n/a 5 15 16 17 N 18 | e - - - n/a 6 18 19 20 N 19 | f - - - n/a 7 21 22 23 N 20 | g - - - n/a 8 24 25 26 N 21 | h - - - n/a 9 27 28 29 N 22 | i - - - n/a 10 30 31 32 N 23 | j - - - n/a 11 33 34 35 N 24 | k - - - n/a 12 36 37 38 N 25 | l - - - n/a 13 39 40 41 N 26 | m - - - n/a 14 42 43 44 N 27 | n - - - n/a 15 45 46 47 N 28 | o - - - n/a 16 48 49 50 N 29 | p - - - n/a 17 51 52 53 N 30 | q - - - n/a 18 54 55 56 N 31 | r - - - n/a 19 57 58 59 N 32 | s - - - n/a 20 60 61 62 N 33 | t - - - n/a 21 63 64 65 N 34 | u - - - n/a 22 66 67 68 N 35 | v - - - n/a 23 69 70 71 N 36 | w - - - n/a 24 72 73 74 N 37 | x - - - n/a 25 75 76 77 N 38 | y - - - n/a 26 78 79 80 N 39 | -------------------------------------------------------------------------------- /egs/vivos/model/model_parameters/tmp.ci_cont/means: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/vivos/model/model_parameters/tmp.ci_cont/means -------------------------------------------------------------------------------- /egs/vivos/model/model_parameters/tmp.ci_cont/mixture_weights: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/vivos/model/model_parameters/tmp.ci_cont/mixture_weights -------------------------------------------------------------------------------- /egs/vivos/model/model_parameters/tmp.ci_cont/noisedict: -------------------------------------------------------------------------------- 1 | SIL 2 | SIL 3 | SIL -------------------------------------------------------------------------------- 
/egs/vivos/model/model_parameters/tmp.ci_cont/transition_matrices: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/vivos/model/model_parameters/tmp.ci_cont/transition_matrices -------------------------------------------------------------------------------- /egs/vivos/model/model_parameters/tmp.ci_cont/variances: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/vivos/model/model_parameters/tmp.ci_cont/variances -------------------------------------------------------------------------------- /egs/vivos/model/model_parameters/tmp.ci_cont_flatinitial/globalmean: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/vivos/model/model_parameters/tmp.ci_cont_flatinitial/globalmean -------------------------------------------------------------------------------- /egs/vivos/model/model_parameters/tmp.ci_cont_flatinitial/globalvar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/vivos/model/model_parameters/tmp.ci_cont_flatinitial/globalvar -------------------------------------------------------------------------------- /egs/vivos/model/model_parameters/tmp.ci_cont_flatinitial/means: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/vivos/model/model_parameters/tmp.ci_cont_flatinitial/means 
# Conversion between Vietnamese orthography and the flat ASCII phone
# encoding used by the acoustic model (a telex-like scheme: a digraph
# for each modified vowel, a trailing f/s/r/x/j letter for the tone).

# Each entry: a base vowel followed by its five tonal variants
# (huyền, sắc, hỏi, ngã, nặng — encoded as f, s, r, x, j).
rules_1 = [
    "aàáảãạ",
    "ăằắẳẵặ",
    "âầấẩẫậ",
    "eèéẻẽẹ",
    "êềếểễệ",
    "iìíỉĩị",
    "oòóỏõọ",
    "ôồốổỗộ",
    "ơờớởỡợ",
    "uùúủũụ",
    "ưừứửữự",
    "yỳýỷỹỵ"
]
# ASCII digraph (first two chars) for each modified base letter (third char).
rules_2 = [
    "awă",
    "aaâ",
    "eeê",
    "ooô",
    "owơ",
    "uwư",
    "ddđ"
]


def _build_maps():
    """Return the (word->phone, phone->word) single-character maps.

    Building the tables inside a helper keeps the loop variables out of
    the module namespace.
    """
    tones = "fsrxj"
    w2p = {}
    for group in rules_1:
        base, variants = group[0], group[1:]
        # A modified base letter ("ă", "ô", ...) is itself spelled with
        # its ASCII digraph in the phone encoding.
        for rule in rules_2:
            if base == rule[2]:
                base = rule[0:2]
        for i, variant in enumerate(variants):
            w2p[variant] = base + tones[i]
    for rule in rules_2:
        w2p[rule[2]] = rule[0:2]
    p2w = {phone: char for char, phone in w2p.items()}
    return w2p, p2w


w2p, p2w = _build_maps()


def word2phone(word):
    """Encode a word (or whole sentence) into its ASCII phone spelling.

    Characters without a mapping (plain consonants, spaces, ...) pass
    through unchanged.
    """
    return "".join(w2p.get(ch, ch) for ch in word)


def phone2word(phone):
    """Decode an ASCII phone spelling back into Vietnamese text.

    Greedily matches the longest known phone sequence (3 characters,
    then 2) at each position; unknown characters pass through unchanged.
    """
    i = 0
    word = ""
    while i < len(phone):
        if phone[i:i + 3] in p2w:
            word += p2w[phone[i:i + 3]]
            i += 3
        elif phone[i:i + 2] in p2w:
            word += p2w[phone[i:i + 2]]
            i += 2
        else:
            word += phone[i]
            i += 1
    return word


if __name__ == '__main__':
    tests = [
        ("con hoẵng", "con hoawxng"),
        ("lựu đạn", "luwju ddajn"),
        ("kiểm tra", "kieerm tra"),
        ("ủy ban", "ury ban"),
        ("cà phê", "caf phee"),
        ("khách sạn", "khasch sajn"),
        ("đúng", "ddusng"),
        ("xã hội", "xax hooji")
    ]
    for expected_word, expected_phone in tests:
        assert expected_word == phone2word(expected_phone)
        assert expected_phone == word2phone(expected_word)
method="delta", utils_path=None): 15 | # Model path usually is in etc at kaldi-trunk/egs/uts_{random_int}/exp 16 | model = model_path 17 | 18 | if not os.path.exists(os.path.join(model, "final.mdl")): 19 | raise Exception("Cannot find final.mdl model file with given model path.") 20 | if not os.path.exists(os.path.join(model, "graph")): 21 | raise Exception("Cannot find graph with given model path.") 22 | 23 | if utils_path is None: 24 | utils_path = os.path.join(os.path.dirname(os.path.dirname(model)), "utils") 25 | 26 | if not os.path.exists(os.path.join(utils_path, "int2sym.pl")): 27 | raise Exception( 28 | "Cannot find int2sym.pl file with given utils path, please make sure that you are provided correctly utils_path argument") 29 | 30 | # Prepare predict dir 31 | os.system("cd {}; rm -rf predict;".format(model)) 32 | os.system("cd {}; mkdir predict;".format(model)) 33 | os.system("cd {}/predict; mkdir config;".format(model)) 34 | os.system("cd {}/predict; mkdir experiment;".format(model)) 35 | os.system("cd {}/predict; mkdir transcriptions;".format(model)) 36 | os.system("cd {}/predict/experiment; mkdir triphones_deldel;".format(model)) 37 | 38 | # Copy pre-trained model 39 | os.system("cd {};cp final.mdl predict/experiment/triphones_deldel/final.mdl;".format(model)) 40 | 41 | os.system("cd {};cp -r graph predict/experiment/triphones_deldel/graph".format(model)) 42 | 43 | os.system("cd {}/predict/config; echo '--use-energy=true \n\ 44 | --sample-frequency=16000 \n\ 45 | --num-mel-bins=40 \n\ 46 | --frame-length=25 \n\ 47 | --frame-shift=10 \n\ 48 | --high-freq=0 \n\ 49 | --low-freq=0 \n\ 50 | --num-ceps=13 \n\ 51 | --window-type=hamming' > mfcc.conf".format(model)) 52 | os.system("cd {}/predict/transcriptions; echo 'result: {}' > wav.scp".format(model, wav_file)) 53 | os.system("cd {}/predict/transcriptions; echo 'VIVOSDEV16 result:' > spk2utt".format(model)) 54 | os.system("cd {}/predict/transcriptions; echo 'result: VIVOSDEV16' > utt2spk".format(model)) 55 | 
# os.system("cd {}/predict/transcriptions; echo 'VIVOSDEV02-R015 result' > utt2spk".format(model)) 56 | 57 | # Run predict 58 | os.system( 59 | "cd {}/predict; {}/src/featbin/compute-mfcc-feats --config=config/mfcc.conf \ 60 | scp:transcriptions/wav.scp ark,scp:transcriptions/feats.ark,transcriptions/feats.scp" \ 61 | .format(model, kaldi_folder)) 62 | 63 | os.system( 64 | "cd {}/predict; {}/src/featbin/compute-cmvn-stats --spk2utt=ark:transcriptions/spk2utt \ 65 | scp:transcriptions/feats.scp ark,scp:experiment/cmvn.ark,experiment/cmvn.scp" \ 66 | .format(model, kaldi_folder)) 67 | 68 | # os.system( 69 | # "cd {}/predict; {}/src/featbin/apply-cmvn --uut2spk=ark:transcriptions/utt2spk \ 70 | # scp:transcriptions/feats.scp ark,scp:experiment/cmvn.ark,experiment/cmvn.scp" \ 71 | # .format(model, kaldi_folder)) 72 | 73 | # delta 74 | if method == "delta": 75 | # os.system("cd {}/predict; {}/src/featbin/add-deltas \ 76 | # scp:transcriptions/feats.scp ark:transcriptions/delta-feats.ark" \ 77 | # .format(model, kaldi_folder)) 78 | 79 | # os.system("cd {}/predict; {}/src/gmmbin/gmm-latgen-faster \ 80 | # --max-active=7000 --beam=13.0 --lattice_beam=6.0 --acoustic-scale=0.83333 --allow-partial=true \ 81 | # --word-symbol-table=experiment/triphones_deldel/graph/words.txt \ 82 | # experiment/triphones_deldel/final.mdl \ 83 | # experiment/triphones_deldel/graph/HCLG.fst \ 84 | # ark:transcriptions/delta-feats.ark \ 85 | # ark,t:transcriptions/lattices.ark" \ 86 | # .format(model, kaldi_folder)) 87 | command = "cd {}/predict; {}/src/gmmbin/gmm-latgen-faster \ 88 | --max-active=7000 --beam=13.0 --lattice_beam=6.0 --acoustic-scale=0.83333 --allow-partial=true \ 89 | --word-symbol-table=experiment/triphones_deldel/graph/words.txt \ 90 | experiment/triphones_deldel/final.mdl \ 91 | experiment/triphones_deldel/graph/HCLG.fst \ 92 | 'ark,s,cs:{}/src/featbin/apply-cmvn \ 93 | --utt2spk=ark:transcriptions/utt2spk \ 94 | scp:experiment/cmvn.scp \ 95 | scp:transcriptions/feats.scp 
ark:- | \ 96 | {}/src/featbin/add-deltas ark:- ark:- |' 'ark,t:transcriptions/lattices.ark' 'ark:|gzip -c > experiment/lat.gz'" \ 97 | .format(model, kaldi_folder, kaldi_folder, kaldi_folder) 98 | os.system(command) 99 | elif method == "lda_mllt": 100 | os.system("cd {};cp final.mat predict/experiment/triphones_deldel/final.mat;".format(model)) 101 | 102 | os.system("cd {}/predict; {}/src/featbin/splice-feats \ 103 | scp:transcriptions/feats.scp \ 104 | ark:transcriptions/splice-feats.ark".format(model, kaldi_folder)) 105 | os.system("cd {}/predict; {}/src/featbin/transform-feats \ 106 | experiment/triphones_deldel/final.mat \ 107 | ark:transcriptions/splice-feats.ark \ 108 | ark:transcriptions/splice-transform-feats.ark".format(model, kaldi_folder)) 109 | os.system("cd {}/predict; {}/src/gmmbin/gmm-latgen-faster \ 110 | --word-symbol-table=experiment/triphones_deldel/graph/words.txt \ 111 | experiment/triphones_deldel/final.mdl experiment/triphones_deldel/graph/HCLG.fst \ 112 | ark:transcriptions/splice-transform-feats.ark ark,t:transcriptions/lattices.ark" \ 113 | .format(model, kaldi_folder)) 114 | else: 115 | raise Exception("The given method {} is not supported yet".format(method)) 116 | 117 | os.system("cd {}/predict; {}/src/latbin/lattice-best-path" 118 | " \ 119 | --word-symbol-table=experiment/triphones_deldel/graph/words.txt \ 120 | ark:transcriptions/lattices.ark \ 121 | ark,t:transcriptions/one-best.tra" \ 122 | .format(model, kaldi_folder)) 123 | 124 | os.system("cd {}/predict; {}/int2sym.pl" 125 | " -f 2- {}/predict/experiment/triphones_deldel/graph/words.txt transcriptions/one-best.tra \ 126 | > {}/predict/transcriptions/one-best-hypothesis.txt; echo $(<{}/predict/transcriptions/one-best-hypothesis.txt);" \ 127 | .format(model, utils_path, model, model, model)) 128 | 129 | result = open("{}/predict/transcriptions/one-best-hypothesis.txt".format(model)).read() 130 | # Result will stored in model_path/predict/transcriptions/one-best-hypothesis.txt 
under format test {predict_result} 131 | result = result[8:] 132 | print(result) 133 | return result 134 | 135 | 136 | if __name__ == "__main__": 137 | predict(args.kaldi_folder, args.wav, args.model_path, args.method, args.utils_path) 138 | -------------------------------------------------------------------------------- /egs/vivos/predict_delta.sh: -------------------------------------------------------------------------------- 1 | # Please don't charge this default config 2 | MODEL=/home/anhv/PycharmProjects/kaldi-trunk/egs/uts_443/exp/tri2a 3 | KALDI=/home/anhv/PycharmProjects/kaldi-trunk 4 | WAV=/home/anhv/PycharmProjects/undertheseanlp/automatic_speech_recognition/experiment/vivos/test/VIVOSDEV01_R034.wav 5 | 6 | # Variables 7 | # MODEL= 8 | # KALDI= 9 | # WAV= 10 | 11 | # Prepare predict dir 12 | cd $MODEL; 13 | rm -rf predict 14 | mkdir predict 15 | cd $MODEL/predict 16 | mkdir config; mkdir experiment; mkdir transcriptions 17 | cd $MODEL/predict/experiment 18 | mkdir triphones_delta 19 | 20 | # Copy pre-trained model 21 | cd $MODEL 22 | cp final.mdl predict/experiment/triphones_delta/final.mdl 23 | cp -r graph predict/experiment/triphones_delta/graph 24 | 25 | cd $MODEL/predict/config 26 | cat > mfcc.conf << EOL 27 | --use-energy=true 28 | --sample-frequency=16000 29 | --num-mel-bins=40 30 | --frame-length=25 31 | --frame-shift=10 32 | --high-freq=0 33 | --low-freq=0 34 | --num-ceps=13 35 | --window-type=hamming 36 | EOL 37 | 38 | # Prepare util 39 | cd $MODEL/predict/transcriptions 40 | echo "result: $WAV" > wav.scp 41 | echo "VIVOSDEV16 result:" > spk2utt 42 | echo "result: VIVOSDEV16" > utt2spk 43 | 44 | 45 | # Run predict 46 | cd $MODEL/predict; 47 | $KALDI/src/featbin/compute-mfcc-feats \ 48 | --config=config/mfcc.conf \ 49 | scp:transcriptions/wav.scp \ 50 | ark,scp:transcriptions/feats.ark,transcriptions/feats.scp 51 | $KALDI/src/featbin/compute-cmvn-stats --spk2utt=ark:transcriptions/spk2utt \ 52 | scp:transcriptions/feats.scp \ 53 | 
def create_train_waves():
    """Flatten the vivos train+test speaker folders into corpus/train/wav.

    Copies every wav file from data/vivos/raw/{train,test}/waves/<speaker>/
    into a single flat data/vivos/corpus/train/wav directory, overwriting
    files that share a name.
    """
    data_root = join(dirname(dirname(dirname(__file__))), "data", "vivos")
    source_folders = [
        join(data_root, "raw", "train", "waves"),
        join(data_root, "raw", "test", "waves"),
    ]
    corpus_waves_folder = join(data_root, "corpus", "train", "wav")
    # Start from an empty target directory; a missing directory is fine.
    shutil.rmtree(corpus_waves_folder, ignore_errors=True)
    mkdir(corpus_waves_folder)
    # One loop for both source trees instead of two duplicated copies.
    for waves_folder in source_folders:
        for _root, speaker_dirs, _files in walk(waves_folder):
            # NOTE(review): like the original, this assumes the speaker
            # folders sit directly under waves_folder — deeper nesting
            # would be joined against the wrong parent.
            for speaker_dir in speaker_dirs:
                for wav_name in listdir(join(waves_folder, speaker_dir)):
                    shutil.copy(
                        join(waves_folder, speaker_dir, wav_name),
                        join(corpus_waves_folder, wav_name))
join(dirname(dirname(dirname(__file__))), "data", "vlsp", 40 | "wav") 41 | corpus_waves_folder = join(dirname(dirname(dirname(__file__))), "data", "vivos", 42 | "corpus","test") 43 | corpus_short_waves_folder = join(dirname(dirname(dirname(__file__))), "data", "vivos", 44 | "corpus", "test_short") 45 | try: 46 | shutil.rmtree(corpus_waves_folder) 47 | shutil.rmtree(corpus_short_waves_folder) 48 | except: 49 | pass 50 | finally: 51 | mkdir(corpus_waves_folder) 52 | mkdir(corpus_short_waves_folder) 53 | mkdir(join(corpus_short_waves_folder,"wav")) 54 | 55 | shutil.copytree(waves_folder,join(corpus_waves_folder,"wav")) 56 | files = listdir(join(corpus_waves_folder,"wav")) 57 | for file in files: 58 | os.rename(join(corpus_waves_folder,"wav",file),join(corpus_waves_folder,"wav","{}_{}".format("global",file))) 59 | list_files = listdir(join(corpus_waves_folder,"wav")) 60 | list_files.sort() 61 | for index,file in enumerate(list_files): 62 | if index < 20: 63 | shutil.copyfile(join(corpus_waves_folder,"wav",file),join(corpus_short_waves_folder,"wav",file)) 64 | 65 | 66 | def create_train_text(): 67 | content_path = join(dirname(dirname(dirname(__file__))), "data", "vivos", 68 | "raw","train","prompts.txt") 69 | content_path2 = join(dirname(dirname(dirname(__file__))), "data", "vivos", 70 | "raw", "test", "prompts.txt") 71 | content = open(content_path).read() 72 | content = content.replace(":", "") 73 | 74 | content2 = open(content_path2).read() 75 | content2 = content2.replace(":", "") 76 | lines = content.splitlines() 77 | lines2 = content2.splitlines() 78 | output = [] 79 | for line in lines: 80 | items = line.split() 81 | fileid = items[0] 82 | text = " ".join(items[1:]).lower() 83 | content = "{}|{}".format(fileid, text) 84 | output.append(content) 85 | for line in lines2: 86 | items = line.split() 87 | fileid = items[0] 88 | text = " ".join(items[1:]).lower() 89 | content2 = "{}|{}".format(fileid, text) 90 | output.append(content2) 91 | text = "\n".join(output) 92 
| 93 | content_path = join(dirname(dirname(dirname(__file__))), "data", "vivos","corpus","train", "text") 94 | open(content_path, "w").write(text) 95 | 96 | 97 | def create_test_text(): 98 | content_path = join(dirname(dirname(dirname(__file__))), "data", "vlsp", "text") 99 | 100 | content = open(content_path).read() 101 | content = content.replace(":", "") 102 | lines = content.splitlines() 103 | output = [] 104 | output_short = [] 105 | short_counter = 0 106 | for line in lines: 107 | m = re.match(r"^(?P.*)\t(?P.*)$", line) 108 | if m: 109 | text = m.group("text") 110 | fileid = m.group("fileid") 111 | content = "{}|{}".format("global_{}".format(fileid), text) 112 | output.append(content) 113 | if short_counter < 20: 114 | output_short.append(content) 115 | short_counter += 1 116 | text = "\n".join(output) 117 | 118 | 119 | content_path = join(dirname(dirname(dirname(__file__))), "data", "vivos", "corpus", "test", "text") 120 | open(content_path, "w").write(text) 121 | 122 | text = "\n".join(output_short) 123 | 124 | content_path = join(dirname(dirname(dirname(__file__))), "data", "vivos", "corpus", "test_short", "text") 125 | open(content_path, "w").write(text) 126 | 127 | 128 | def create_gender(): 129 | content_path = join(dirname(dirname(dirname(__file__))), "data", "vivos", "raw", "train", "genders.txt") 130 | content = open(content_path).read() 131 | 132 | content_path2 = join(dirname(dirname(dirname(__file__))), "data", "vivos", "raw", "test", "genders.txt") 133 | content2 = open(content_path2).read() 134 | content = content2 + content 135 | 136 | output_path = join(dirname(dirname(dirname(__file__))), "data", "vivos", "corpus", "train", "gender") 137 | open(output_path, "w").write(content) 138 | 139 | content_test = "\n".join(["global m"]) 140 | 141 | output_test_path = join(dirname(dirname(dirname(__file__))), "data", "vivos", "corpus", "test", "gender") 142 | open(output_test_path, "w").write(content_test) 143 | 144 | output_test_path = 
join(dirname(dirname(dirname(__file__))), "data", "vivos", "corpus", "test_short", "gender") 145 | open(output_test_path, "w").write(content_test) 146 | 147 | 148 | def create_speaker(): 149 | content_path = join(dirname(dirname(dirname(__file__))), "data", "vivos", "raw", "train", "prompts.txt") 150 | content_path2 = join(dirname(dirname(dirname(__file__))), "data", "vivos", "raw", "test", "prompts.txt") 151 | lines = open(content_path).read().splitlines() 152 | files = [line.split()[0] for line in lines] 153 | tmp = [] 154 | 155 | for file_id in files: 156 | speaker_id = file_id.split("_")[0] 157 | content = "{} {}".format(speaker_id, file_id) 158 | tmp.append(content) 159 | 160 | # Merge vivos test to train dir 161 | lines2 = open(content_path2).read().splitlines() 162 | files2 = [line.split()[0] for line in lines2] 163 | 164 | for file_id in files2: 165 | speaker_id = file_id.split("_")[0] 166 | content = "{} {}".format(speaker_id, file_id) 167 | tmp.append(content) 168 | 169 | tmp.sort() 170 | 171 | content = "\n".join(tmp) 172 | 173 | content_path = join(dirname(dirname(dirname(__file__))), "data", "vivos", "corpus", "train", "speaker") 174 | open(content_path, "w").write(content) 175 | 176 | lines_test_path = join(dirname(dirname(dirname(__file__))), "data", "vlsp", "text") 177 | lines_test = open(lines_test_path).read().splitlines() 178 | test_output = [] 179 | short_test_output = [] 180 | short_test_counter = 0 181 | 182 | for line in lines_test: 183 | # print(line) 184 | m = re.match(r"^(?P.*)\t(?P.*)$", line) 185 | if m: 186 | # text = m.group("text") 187 | fileid = m.group("fileid") 188 | content = "global {}".format("global_{}".format(fileid)) 189 | 190 | test_output.append(content) 191 | if short_test_counter < 20: 192 | short_test_output.append(content) 193 | 194 | short_test_counter+=1 195 | content_path = join(dirname(dirname(dirname(__file__))), "data", "vivos", "corpus", "test", "speaker") 196 | content = "\n".join(test_output) 197 | 
open(content_path, "w").write(content) 198 | 199 | content_path = join(dirname(dirname(dirname(__file__))), "data", "vivos", "corpus", "test_short", "speaker") 200 | short_content = "\n".join(short_test_output) 201 | open(content_path, "w").write(short_content) 202 | 203 | try: 204 | shutil.rmtree(join(dirname(dirname(dirname(__file__))), "data", "vivos", "corpus")) 205 | except: 206 | pass 207 | finally: 208 | mkdir(join(dirname(dirname(dirname(__file__))), "data", "vivos", "corpus")) 209 | mkdir(join(dirname(dirname(dirname(__file__))), "data", "vivos", "corpus","train")) 210 | mkdir(join(dirname(dirname(dirname(__file__))), "data", "vivos", "corpus", "test")) 211 | mkdir(join(dirname(dirname(dirname(__file__))), "data", "vivos", "corpus", "test_short")) 212 | create_train_waves() 213 | create_test_waves() 214 | create_train_text() 215 | create_test_text() 216 | create_gender() 217 | create_speaker() 218 | -------------------------------------------------------------------------------- /egs/vivos/test/VIVOSDEV01_R003.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/vivos/test/VIVOSDEV01_R003.wav -------------------------------------------------------------------------------- /egs/vivos/test/VIVOSDEV01_R012.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/vivos/test/VIVOSDEV01_R012.wav -------------------------------------------------------------------------------- /egs/vivos/test/VIVOSDEV01_R027.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/vivos/test/VIVOSDEV01_R027.wav 
-------------------------------------------------------------------------------- /egs/vivos/test/VIVOSDEV01_R028.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/vivos/test/VIVOSDEV01_R028.wav -------------------------------------------------------------------------------- /egs/vivos/test/VIVOSDEV01_R034.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/vivos/test/VIVOSDEV01_R034.wav -------------------------------------------------------------------------------- /egs/vivos/test/VIVOSDEV01_R043.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/vivos/test/VIVOSDEV01_R043.wav -------------------------------------------------------------------------------- /egs/vivos/test/VIVOSDEV01_R044.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/vivos/test/VIVOSDEV01_R044.wav -------------------------------------------------------------------------------- /egs/vivos/test/VIVOSDEV01_R055.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/vivos/test/VIVOSDEV01_R055.wav -------------------------------------------------------------------------------- /egs/vivos/test_model.py: -------------------------------------------------------------------------------- 1 | from model import transcript 2 | from 
# Telex-style transliteration between Vietnamese orthography and ASCII
# "phone" strings.
#
# rules_1: each entry is a base vowel followed by its five tonal variants,
# in the fixed tone order huyền/sắc/hỏi/ngã/nặng (encoded as "fsrxj").
rules_1 = [
    "aàáảãạ",
    "ăằắẳẵặ",
    "âầấẩẫậ",
    "eèéẻẽẹ",
    "êềếểễệ",
    "iìíỉĩị",
    "oòóỏõọ",
    "ôồốổỗộ",
    "ơờớởỡợ",
    "uùúủũụ",
    "ưừứửữự",
    "yỳýỷỹỵ"
]
# rules_2: two-letter ASCII digraph followed by the marked letter it encodes.
rules_2 = [
    "awă",
    "aaâ",
    "eeê",
    "ooô",
    "owơ",
    "uwư",
    "ddđ"
]
# w2p maps one Vietnamese letter to its ASCII phone code; p2w is the inverse.
w2p = {}
p2w = {}
_tones = "fsrxj"
for _group in rules_1:
    _base = _group[0]
    # If the base letter is itself a marked letter (ă, â, ê, ...), spell it
    # with its ASCII digraph so the tonal code stays pure ASCII.
    for _rule in rules_2:
        if _base == _rule[2]:
            _base = _rule[0:2]
    for _pos, _letter in enumerate(_group[1:]):
        w2p[_letter] = _base + _tones[_pos]
for _rule in rules_2:
    w2p[_rule[2]] = _rule[0:2]
p2w = {code: letter for letter, code in w2p.items()}


def word2phone(word):
    """Spell *word* as an ASCII phone string; unmapped characters pass through."""
    return "".join(w2p.get(letter, letter) for letter in word)


def phone2word(phone):
    """Invert word2phone: greedily consume 3-char, then 2-char codes, else copy one char."""
    pieces = []
    pos = 0
    while pos < len(phone):
        for width in (3, 2):
            code = phone[pos:pos + width]
            if code in p2w:
                pieces.append(p2w[code])
                pos += width
                break
        else:
            pieces.append(phone[pos])
            pos += 1
    return "".join(pieces)


if __name__ == '__main__':
    cases = [
        ("con hoẵng", "con hoawxng"),
        ("lựu đạn", "luwju ddajn"),
        ("kiểm tra", "kieerm tra"),
        ("ủy ban", "ury ban"),
        ("cà phê", "caf phee"),
        ("khách sạn", "khasch sajn"),
        ("đúng", "ddusng"),
        ("xã hội", "xax hooji")
    ]
    for word, phone in cases:
        assert phone2word(phone) == word
        assert word2phone(word) == phone
from egs.vivos.extension.model import KaldiSpeechRecognition
from os.path import join, dirname
import argparse

parser = argparse.ArgumentParser(description='Train a Kaldi speech recognition model.')
parser.add_argument('--kaldi_folder', help='Kaldi dir path', required=True)
parser.add_argument('--corpus_folder', help='Corpus path to train', required=True)
parser.add_argument('--export_path', help='Export path will be able soon')
# Bug fix: argparse delivers strings from the CLI; without type=int a
# non-default --nj would be passed through as str.
parser.add_argument('--nj', help='Parallel number of jobs', type=int, default=1)
parser.add_argument('--method', help='Feature/training method (e.g. "deltadelta")',
                    default="deltadelta")


args = parser.parse_args()


def train(kaldi_folder, corpus_folder, export_folder=None, nj=1, method="deltadelta"):
    """Fit a KaldiSpeechRecognition model.

    kaldi_folder: path to the Kaldi installation.
    corpus_folder: path to the prepared corpus (see preprocess.py).
    export_folder: where the trained model should be exported; defaults to
        ./model next to this script.
        NOTE(review): export is not wired up yet -- KaldiSpeechRecognition
        takes no export path, so the value is currently unused.
    nj: number of parallel Kaldi jobs.
    method: feature pipeline understood by KaldiSpeechRecognition.
    """
    # Bug fix: the original unconditionally overwrote a caller-supplied
    # export_folder; only fall back to the default when none was given.
    if export_folder is None:
        export_folder = join(dirname(__file__), "model")
    params = {
        "method": method,
        "jobs": nj,
        "lm_order": 1,  # unigram language model
    }
    model = KaldiSpeechRecognition(corpus_folder, kaldi_folder, params)
    model.fit()


if __name__ == "__main__":
    train(args.kaldi_folder, args.corpus_folder, args.export_path, args.nj, args.method)
-------------------------------------------------------------------------------- 1 | # Dữ liệu VLSP 2018 2 | 3 | Tập dữ liệu VLSP 2018 có tất cả 796 câu. 4 | 5 | Dữ liệu gồm 796 câu nói với độ dài trung bình 40 tokens (max 104 tokens, min 0 tokens). 6 | Trong đó có một câu đặc biệt có id 0437, không chứa một tiếng nói nào, trong file wav tương ứng chỉ có tiếng xe máy chạy ngoài đường. 7 | 8 | Thông tin phân phối độ dài câu trong tập dữ liệu: 9 | 10 | ``` 11 | count 796.000000 12 | mean 40.812814 13 | std 22.313014 14 | min 0.000000 15 | 0% 0.000000 16 | 5% 9.000000 17 | 10% 13.000000 18 | 15.0% 16.000000 19 | 20% 19.000000 20 | 25% 22.000000 21 | 30.0% 25.000000 22 | 35% 28.000000 23 | 40% 31.000000 24 | 45% 34.000000 25 | 50% 38.000000 26 | 55.0% 41.000000 27 | 60.0% 46.000000 28 | 65% 49.000000 29 | 70% 53.000000 30 | 75% 58.000000 31 | 80% 62.000000 32 | 85.0% 68.000000 33 | 90% 73.000000 34 | 95% 81.000000 35 | 100% 104.000000 36 | max 104.000000 37 | ``` 38 | 39 | File âm thanh dài nhất cỡ 27 giây, ngắn nhất cỡ 1 giây, độ dài trung bình của file âm thanh là 9.5 giây. -------------------------------------------------------------------------------- /report/build.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | name="technique_report" 3 | pdflatex -file-line-error -interaction=nonstopmode -synctex=1 -output-format=pdf -output-directory=. $name.tex 4 | bibtex $name.aux 5 | pdflatex -file-line-error -interaction=nonstopmode -synctex=1 -output-format=pdf -output-directory=. $name.tex 6 | pdflatex -file-line-error -interaction=nonstopmode -synctex=1 -output-format=pdf -output-directory=. 
$name.tex 7 | 8 | rm -rf $name.blg 9 | rm -rf $name.log 10 | rm -rf $name.out 11 | rm -rf *.aux 12 | rm -rf $name.bbl 13 | rm -rf $name.synctex.gz -------------------------------------------------------------------------------- /report/notation.tex: -------------------------------------------------------------------------------- 1 | % Tensor 2 | \DeclareMathAlphabet{\mathsfit}{\encodingdefault}{\sfdefault}{m}{sl} 3 | \SetMathAlphabet{\mathsfit}{bold}{\encodingdefault}{\sfdefault}{bx}{n} 4 | \newcommand{\tens}[1]{\bm{\mathsfit{#1}}} 5 | \def\tA{{\tens{A}}} 6 | \def\tB{{\tens{B}}} 7 | \def\tC{{\tens{C}}} 8 | \def\tD{{\tens{D}}} 9 | \def\tE{{\tens{E}}} 10 | \def\tF{{\tens{F}}} 11 | \def\tG{{\tens{G}}} 12 | \def\tH{{\tens{H}}} 13 | \def\tI{{\tens{I}}} 14 | \def\tJ{{\tens{J}}} 15 | \def\tK{{\tens{K}}} 16 | \def\tL{{\tens{L}}} 17 | \def\tM{{\tens{M}}} 18 | \def\tN{{\tens{N}}} 19 | \def\tO{{\tens{O}}} 20 | \def\tP{{\tens{P}}} 21 | \def\tQ{{\tens{Q}}} 22 | \def\tR{{\tens{R}}} 23 | \def\tS{{\tens{S}}} 24 | \def\tT{{\tens{T}}} 25 | \def\tU{{\tens{U}}} 26 | \def\tV{{\tens{V}}} 27 | \def\tW{{\tens{W}}} 28 | \def\tX{{\tens{X}}} 29 | \def\tY{{\tens{Y}}} 30 | \def\tZ{{\tens{Z}}} 31 | \def\tx{{\tens{x}}} 32 | \def\ty{{\tens{y}}} -------------------------------------------------------------------------------- /report/technique_report.bib: -------------------------------------------------------------------------------- 1 | @article{DBLP:journals/corr/Le-Hong16, 2 | author = {Phuong Le{-}Hong}, 3 | title = {Vietnamese Named Entity Recognition using Token Regular Expressions 4 | and Bidirectional Inference}, 5 | journal = {CoRR}, 6 | volume = {abs/1610.05652}, 7 | year = {2016}, 8 | url = {http://arxiv.org/abs/1610.05652}, 9 | archivePrefix = {arXiv}, 10 | eprint = {1610.05652}, 11 | timestamp = {Wed, 07 Jun 2017 14:42:34 +0200}, 12 | biburl = {https://dblp.org/rec/bib/journals/corr/Le-Hong16}, 13 | bibsource = {dblp computer science bibliography, https://dblp.org} 14 | } 15 | 16 | 
@article{DBLP:journals/corr/abs-1708-07241, 17 | author = {Thai{-}Hoang Pham and 18 | Xuan{-}Khoai Pham and 19 | Tuan{-}Anh Nguyen and 20 | Phuong Le{-}Hong}, 21 | title = {{NNVLP:} {A} Neural Network-Based Vietnamese Language Processing Toolkit}, 22 | journal = {CoRR}, 23 | volume = {abs/1708.07241}, 24 | year = {2017}, 25 | url = {http://arxiv.org/abs/1708.07241}, 26 | archivePrefix = {arXiv}, 27 | eprint = {1708.07241}, 28 | timestamp = {Tue, 05 Sep 2017 10:03:46 +0200}, 29 | biburl = {https://dblp.org/rec/bib/journals/corr/abs-1708-07241}, 30 | bibsource = {dblp computer science bibliography, https://dblp.org} 31 | } 32 | 33 | @article{DBLP:journals/corr/abs-1801-01331, 34 | author = {Thanh Vu and 35 | Dat Quoc Nguyen and 36 | Dai Quoc Nguyen and 37 | Mark Dras and 38 | Mark Johnson}, 39 | title = {VnCoreNLP: {A} Vietnamese Natural Language Processing Toolkit}, 40 | journal = {CoRR}, 41 | volume = {abs/1801.01331}, 42 | year = {2018}, 43 | url = {http://arxiv.org/abs/1801.01331}, 44 | archivePrefix = {arXiv}, 45 | eprint = {1801.01331}, 46 | timestamp = {Thu, 01 Feb 2018 19:52:26 +0100}, 47 | biburl = {https://dblp.org/rec/bib/journals/corr/abs-1801-01331}, 48 | bibsource = {dblp computer science bibliography, https://dblp.org} 49 | } 50 | 51 | @article{DBLP:journals/corr/abs-1803-08463, 52 | author = {Pham Quang Nhat Minh}, 53 | title = {A Feature-Based Model for Nested Named-Entity Recognition at {VLSP-2018} 54 | {NER} Evaluation Campaign}, 55 | journal = {CoRR}, 56 | volume = {abs/1803.08463}, 57 | year = {2018}, 58 | url = {http://arxiv.org/abs/1803.08463}, 59 | archivePrefix = {arXiv}, 60 | eprint = {1803.08463}, 61 | timestamp = {Wed, 11 Apr 2018 11:12:46 +0200}, 62 | biburl = {https://dblp.org/rec/bib/journals/corr/abs-1803-08463}, 63 | bibsource = {dblp computer science bibliography, https://dblp.org} 64 | } -------------------------------------------------------------------------------- /report/technique_report.pdf: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/report/technique_report.pdf -------------------------------------------------------------------------------- /report/technique_report.tex: -------------------------------------------------------------------------------- 1 | \documentclass[11pt,a4paper]{article} 2 | \usepackage{acl2017} 3 | \usepackage{times} 4 | \usepackage{multirow} 5 | \usepackage{url} 6 | \usepackage{latexsym} 7 | \usepackage{graphicx} 8 | \usepackage{color} 9 | \usepackage{booktabs} 10 | \usepackage{amsmath} 11 | \usepackage[english,vietnam]{babel} 12 | \usepackage[utf8]{vietnam} 13 | 14 | \aclfinalcopy % Uncomment this line for the final submission 15 | %\def\eaclpaperid{***} % Enter the acl Paper ID here 16 | 17 | %\setlength\titlebox{5cm} 18 | % You can expand the titlebox if you need extra space 19 | % to show all the authors. Please do not make the titlebox 20 | % smaller than 5cm (the original size); we will check this 21 | % in the camera-ready version and ask you to change it back. 22 | 23 | \newcommand\BibTeX{B{\sc ib}\TeX} 24 | 25 | \title{Báo cáo kỹ thuật\\Module nhận dạng tiếng nói tiếng Việt\\ trong underthesea} 26 | 27 | \include{notation} 28 | 29 | \author{ 30 | Vũ Anh\\ 31 | underthesea\\ 32 | {\tt anhv.ict91@gmail.com} \\ 33 | \And 34 | Lê Phi Hùng \\ 35 | underthesea\\ 36 | {\tt lephihungch@gmail.com} \\ 37 | } 38 | 39 | \date{} 40 | 41 | \begin{document} 42 | \maketitle 43 | \begin{abstract} 44 | 45 | Trong báo cáo này, trong chúng mô tả hệ thống nhận dạng tiếng nói tiếng Việt trong underthesea. Trong đó, hệ thống sử dụng công cụ Kaldi để xây dựng module nhận dạng, kết quả được đánh giá trên tập dữ liệu test của VLSP 2018. 
Toàn bộ mã nguồn và tài liệu của dự án được phát hành dưới dạng mã nguồn mở tại địa chỉ \url{https://github.com/undertheseanlp/automatic_speech_recognition} 46 | 47 | \end{abstract} 48 | 49 | \section{Giới thiệu} 50 | 51 | \section{Mô tả hệ thống} 52 | 53 | Các thử nghiệm được thực hiện trên bộ công cụ nhận dạng tiếng nói Kaldi, được viết bằng C++. \footnote{http://kaldi-asr.org/} 54 | 55 | Mô hình xây dựng hệ thống nhận dạng tiếng nói 56 | 57 | \subsection{Chuẩn bị dữ liệu và các tài nguyên ngôn ngữ} 58 | 59 | Việc đầu tiên cần làm là chuẩn bị dữ liệu huấn luyện âm thanh - phụ đề. 60 | Gồm có các tập tin âm thanh (thường để ở định dạng wav) chứa các tiếng nói của người và các tập tin phụ đề tương ứng. 61 | 62 | Việc tiếp theo là xây dựng từ điển phát âm. 63 | Hình dung một cách đơn giản, từ điển phát âm sẽ chứa cách phát âm (cách phân chia các âm) tương ứng với từng tiếng. 64 | Ngoài ra trong hệ thống còn cần các âm câm (silence\_phones), các từ ngoài từ điển (out-of-vocabulary hay oov). 65 | 66 | 67 | Cuối cùng là chuẩn bị dữ liệu cho việc huấn luyện mô hình ngôn ngữ. 68 | Mô hình ngôn ngữ giúp cải thiện chất lượng của hệ thống nhận dạng tiếng nói, bằng cách đưa ra những khả năng có thể nhất trong một cụm từ. 69 | Hãy xem xét ví dụ hệ thống đang phải quyết định từ còn thiếu trong câu \textit{Tôi đi Hà \_ mấy ngày}. 70 | Nếu hệ thống sử dụng mô hình ngôn ngữ, có thể dễ dàng nhận ra từ \textit{Nội} là từ có khả năng còn thiếu nhất trong câu này. 71 | 72 | \subsection{Huấn luyện mô hình Gaussian Mixture Model} 73 | 74 | Bước đầu tiên là huấn luyện mô hình âm học, là thành phần chuyển các tín hiệu âm thanh thành dữ liệu văn bản.
75 | Mô hình huấn luyện thường sử dụng thuật toán Gaussian Mixture Model trên các tập đặc trưng phổ biến của âm thanh như MFCC (Mel-frequency cepstral coefficients) \footnote{Để biết thêm về đặc trưng này, xin tìm đọc tài liệu \href{http://www.lrc.tnu.edu.vn/upload/collection/brief/41619_13520141527406.pdf}{So sánh hai phương pháp trích chọn đặc trưng âm thanh: Đường bao phổ (MFCC) và cao độ Pitch trong việc tìm kiếm âm nhạc theo nội dung}}. Ngoài ra còn có các đặc trưng delta, lda, mltt hay sat. 76 | 77 | Bước thứ hai là huấn luyện mô hình ngôn ngữ 78 | 79 | \subsection{Quá trình giải mã} 80 | 81 | \begin{itemize} 82 | \item Tạo ra một đồ thị giải mã 83 | \item Tính điểm lại Lattice 84 | \end{itemize} 85 | 86 | \section{Đánh giá} 87 | 88 | \subsection{Tập dữ liệu} 89 | 90 | Có hai tập dữ liệu được sử dụng. Tập dữ liệu VIVOS và tập dữ liệu VLSP 2018. Trong đó, tập dữ liệu VIVOS được dùng để huấn luyện, tập dữ liệu VLSP 2018 được sử dụng để đánh giá kết quả mô hình. 91 | 92 | \subsection{Kết quả} 93 | 94 | % TODO To be updated 95 | 96 | \section{Conclusion} 97 | 98 | % TODO To be updated 99 | 100 | \section{Lời cảm ơn} 101 | 102 | Vì kiến thức còn hạn chế, trong phần mô tả kỹ thuật, tác giả có tham khảo các tài liệu \textit{Building Speech Recognition Systems with the Kaldi Toolkit} \footnote{https://engineering.jhu.edu/clsp/wp-content/uploads/sites/75/2016/06/Building-Speech-Recognition-Systems-with-the-Kaldi-Toolkit.pdf} 103 | 104 | \bibliography{technique_report} 105 | \bibliographystyle{acl_natbib} 106 | 107 | \end{document} -------------------------------------------------------------------------------- /tmp/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/tmp/.gitkeep -------------------------------------------------------------------------------- /util/eda_vlsp.py: 
from os import listdir
from os.path import join, dirname
import pandas as pd
import numpy as np
import librosa

# Repository root: one level above util/.
ROOT_FOLDER = dirname(dirname(__file__))


def stat_tokens(lines):
    """Print a percentile summary (every 5%) of per-line token counts.

    The first whitespace-separated field of each line is the file id and is
    excluded from the count.
    """
    counts = [len(line.split()[1:]) for line in lines]
    print(pd.Series(counts).describe(percentiles=np.linspace(0, 1, 21)))


def stat_text():
    """Print sentence count and token-length statistics for the VLSP text file."""
    print("\nText Data:")
    text_file = join(ROOT_FOLDER, "data", "vlsp", "text")
    with open(text_file, "r") as f:
        sentences = f.read().splitlines()
    print("VLSP 2018 DATA SET")
    print("\nTotal sentences:", len(sentences))
    stat_tokens(sentences)


def stat_acoustic():
    """Print total and per-file duration statistics over all VLSP wav files."""
    print("\nAcoustic Data:")
    wav_folder = join(ROOT_FOLDER, "data", "vlsp", "wav")
    durations = pd.Series(
        librosa.get_duration(filename=join(wav_folder, name))
        for name in listdir(wav_folder)
    )
    total = durations.sum()
    print(f"Total: {total:.2f} seconds ({total / 3600:.2f} hours)")
    print(durations.describe())


if __name__ == '__main__':
    stat_text()
    stat_acoustic()