├── .gitignore
├── Experiment.ipynb
├── LICENSE.txt
├── README.en.md
├── README.md
├── data
│   ├── diadiem
│   │   ├── preprocess.py
│   │   └── text.py
│   └── vivos
│       └── preprocess.py
├── data_format.md
├── egs
│   ├── diadiem
│   │   ├── __init__.py
│   │   ├── analyze.py
│   │   ├── extension
│   │   │   ├── __init__.py
│   │   │   ├── analyze.py
│   │   │   ├── export.py
│   │   │   ├── metrics.py
│   │   │   ├── model.py
│   │   │   └── text.py
│   │   ├── load_data.py
│   │   ├── model
│   │   │   ├── __init__.py
│   │   │   ├── etc
│   │   │   │   ├── feat.params
│   │   │   │   ├── idngram
│   │   │   │   ├── sphinx_train.cfg
│   │   │   │   ├── text
│   │   │   │   ├── tmp.dic
│   │   │   │   ├── tmp.filler
│   │   │   │   ├── tmp.lm
│   │   │   │   ├── tmp.phone
│   │   │   │   ├── tmp_test.fileids
│   │   │   │   ├── tmp_test.transcription
│   │   │   │   ├── tmp_train.fileids
│   │   │   │   ├── tmp_train.transcription
│   │   │   │   └── vocab
│   │   │   ├── model_parameters
│   │   │   │   ├── tmp.cd_cont_200
│   │   │   │   │   ├── feat.params
│   │   │   │   │   ├── mdef
│   │   │   │   │   ├── means
│   │   │   │   │   ├── mixture_weights
│   │   │   │   │   ├── noisedict
│   │   │   │   │   ├── transition_matrices
│   │   │   │   │   └── variances
│   │   │   │   ├── tmp.cd_cont_200_1
│   │   │   │   │   ├── means
│   │   │   │   │   ├── mixture_weights
│   │   │   │   │   ├── transition_matrices
│   │   │   │   │   └── variances
│   │   │   │   ├── tmp.cd_cont_200_2
│   │   │   │   │   ├── means
│   │   │   │   │   ├── mixture_weights
│   │   │   │   │   ├── transition_matrices
│   │   │   │   │   └── variances
│   │   │   │   ├── tmp.cd_cont_200_4
│   │   │   │   │   ├── means
│   │   │   │   │   ├── mixture_weights
│   │   │   │   │   ├── transition_matrices
│   │   │   │   │   └── variances
│   │   │   │   ├── tmp.cd_cont_initial
│   │   │   │   │   ├── means
│   │   │   │   │   ├── mixture_weights
│   │   │   │   │   ├── transition_matrices
│   │   │   │   │   └── variances
│   │   │   │   ├── tmp.cd_cont_untied
│   │   │   │   │   ├── feat.params
│   │   │   │   │   ├── mdef
│   │   │   │   │   ├── means
│   │   │   │   │   ├── mixture_weights
│   │   │   │   │   ├── noisedict
│   │   │   │   │   ├── transition_matrices
│   │   │   │   │   └── variances
│   │   │   │   ├── tmp.ci_cont
│   │   │   │   │   ├── feat.params
│   │   │   │   │   ├── mdef
│   │   │   │   │   ├── means
│   │   │   │   │   ├── mixture_weights
│   │   │   │   │   ├── noisedict
│   │   │   │   │   ├── transition_matrices
│   │   │   │   │   └── variances
│   │   │   │   └── tmp.ci_cont_flatinitial
│   │   │   │       ├── globalmean
│   │   │   │       ├── globalvar
│   │   │   │       ├── means
│   │   │   │       ├── mixture_weights
│   │   │   │       ├── transition_matrices
│   │   │   │       └── variances
│   │   │   └── text.py
│   │   ├── test
│   │   │   ├── CAFPHEE001.wav
│   │   │   ├── CAFPHEE002.wav
│   │   │   ├── CAFPHEE003.wav
│   │   │   ├── CAFPHEE004.wav
│   │   │   ├── CAFPHEE005.wav
│   │   │   ├── CAFPHEE006.wav
│   │   │   ├── CAFPHEE007.wav
│   │   │   ├── CAFPHEE008.wav
│   │   │   ├── CAFPHEE009.wav
│   │   │   ├── CAFPHEE010.wav
│   │   │   ├── DDUSNG0001.wav
│   │   │   ├── DDUSNG0002.wav
│   │   │   ├── DDUSNG0003.wav
│   │   │   ├── DDUSNG0004.wav
│   │   │   ├── DDUSNG0005.wav
│   │   │   ├── DDUSNG0006.wav
│   │   │   ├── DDUSNG0007.wav
│   │   │   ├── DDUSNG0008.wav
│   │   │   ├── DDUSNG0009.wav
│   │   │   ├── DDUSNG0010.wav
│   │   │   ├── KARAOKE001.wav
│   │   │   ├── KARAOKE002.wav
│   │   │   ├── KARAOKE003.wav
│   │   │   ├── KARAOKE004.wav
│   │   │   ├── KARAOKE005.wav
│   │   │   ├── KARAOKE006.wav
│   │   │   ├── KARAOKE007.wav
│   │   │   ├── KARAOKE008.wav
│   │   │   ├── KARAOKE009.wav
│   │   │   ├── KARAOKE010.wav
│   │   │   ├── KHASCHSAJN001.wav
│   │   │   ├── KHASCHSAJN002.wav
│   │   │   ├── KHASCHSAJN003.wav
│   │   │   ├── KHASCHSAJN004.wav
│   │   │   ├── KHASCHSAJN005.wav
│   │   │   ├── KHASCHSAJN006.wav
│   │   │   ├── KHASCHSAJN007.wav
│   │   │   ├── KHASCHSAJN008.wav
│   │   │   ├── KHASCHSAJN009.wav
│   │   │   ├── KHASCHSAJN010.wav
│   │   │   ├── KHOONG0001.wav
│   │   │   ├── KHOONG0002.wav
│   │   │   ├── KHOONG0003.wav
│   │   │   ├── KHOONG0004.wav
│   │   │   ├── KHOONG0005.wav
│   │   │   ├── KHOONG0006.wav
│   │   │   ├── KHOONG0007.wav
│   │   │   ├── KHOONG0008.wav
│   │   │   ├── KHOONG0009.wav
│   │   │   ├── KHOONG0010.wav
│   │   │   ├── MASTXA001.wav
│   │   │   ├── MASTXA002.wav
│   │   │   ├── MASTXA003.wav
│   │   │   ├── MASTXA004.wav
│   │   │   ├── MASTXA005.wav
│   │   │   ├── MASTXA006.wav
│   │   │   ├── MASTXA007.wav
│   │   │   ├── MASTXA008.wav
│   │   │   ├── MASTXA009.wav
│   │   │   ├── MASTXA010.wav
│   │   │   ├── TRAJMAYTEEM001.wav
│   │   │   ├── TRAJMAYTEEM002.wav
│   │   │   ├── TRAJMAYTEEM003.wav
│   │   │   ├── TRAJMAYTEEM004.wav
│   │   │   ├── TRAJMAYTEEM005.wav
│   │   │   ├── TRAJMAYTEEM006.wav
│   │   │   ├── TRAJMAYTEEM007.wav
│   │   │   ├── TRAJMAYTEEM008.wav
│   │   │   ├── TRAJMAYTEEM009.wav
│   │   │   ├── TRAJMAYTEEM010.wav
│   │   │   ├── TROWRLAJI001.wav
│   │   │   ├── TROWRLAJI002.wav
│   │   │   ├── TROWRLAJI003.wav
│   │   │   ├── TROWRLAJI004.wav
│   │   │   ├── TROWRLAJI005.wav
│   │   │   ├── TROWRLAJI006.wav
│   │   │   ├── TROWRLAJI007.wav
│   │   │   ├── TROWRLAJI008.wav
│   │   │   ├── TROWRLAJI009.wav
│   │   │   └── TROWRLAJI010.wav
│   │   ├── test_model.py
│   │   ├── text.py
│   │   └── train.py
│   └── vivos
│       ├── README.md
│       ├── __init__.py
│       ├── analyze.py
│       ├── extension
│       │   ├── __init__.py
│       │   ├── analyze.py
│       │   ├── cmd.sh
│       │   ├── export.py
│       │   ├── metrics.py
│       │   ├── model.py
│       │   ├── model_sphinx.py
│       │   ├── path.sh
│       │   ├── run_deltadelta.sh
│       │   ├── run_lda_mllt.sh
│       │   ├── run_lda_mllt_decode.sh
│       │   ├── run_sat.sh
│       │   ├── run_sgmm2.sh
│       │   ├── text.py
│       │   ├── transcript_deltadelta.sh
│       │   └── transcriptions
│       │       ├── audio
│       │       │   ├── R001.wav
│       │       │   ├── R002.wav
│       │       │   ├── R003.wav
│       │       │   ├── R004.wav
│       │       │   ├── R005.wav
│       │       │   ├── t1_tat_ca.wav
│       │       │   └── t2_tro_nen.wav
│       │       └── wav.scp
│       ├── load_data.py
│       ├── logs
│       │   ├── 20181207_122900.md
│       │   ├── 20181207_185000.md
│       │   ├── 20181207_232600.md
│       │   ├── 20181208_075100.md
│       │   └── README.md
│       ├── model
│       │   ├── __init__.py
│       │   ├── etc
│       │   │   ├── feat.params
│       │   │   ├── idngram
│       │   │   ├── sphinx_train.cfg
│       │   │   ├── text
│       │   │   ├── tmp.dic
│       │   │   ├── tmp.filler
│       │   │   ├── tmp.lm
│       │   │   ├── tmp.phone
│       │   │   ├── tmp_test.fileids
│       │   │   ├── tmp_test.transcription
│       │   │   ├── tmp_train.fileids
│       │   │   ├── tmp_train.transcription
│       │   │   └── vocab
│       │   ├── model_parameters
│       │   │   ├── tmp.cd_cont_200
│       │   │   │   ├── feat.params
│       │   │   │   ├── mdef
│       │   │   │   ├── means
│       │   │   │   ├── mixture_weights
│       │   │   │   ├── noisedict
│       │   │   │   ├── transition_matrices
│       │   │   │   └── variances
│       │   │   ├── tmp.cd_cont_200_1
│       │   │   │   ├── means
│       │   │   │   ├── mixture_weights
│       │   │   │   ├── transition_matrices
│       │   │   │   └── variances
│       │   │   ├── tmp.cd_cont_200_2
│       │   │   │   ├── means
│       │   │   │   ├── mixture_weights
│       │   │   │   ├── transition_matrices
│       │   │   │   └── variances
│       │   │   ├── tmp.cd_cont_200_4
│       │   │   │   ├── means
│       │   │   │   ├── mixture_weights
│       │   │   │   ├── transition_matrices
│       │   │   │   └── variances
│       │   │   ├── tmp.cd_cont_initial
│       │   │   │   ├── means
│       │   │   │   ├── mixture_weights
│       │   │   │   ├── transition_matrices
│       │   │   │   └── variances
│       │   │   ├── tmp.cd_cont_untied
│       │   │   │   ├── feat.params
│       │   │   │   ├── mdef
│       │   │   │   ├── means
│       │   │   │   ├── mixture_weights
│       │   │   │   ├── noisedict
│       │   │   │   ├── transition_matrices
│       │   │   │   └── variances
│       │   │   ├── tmp.ci_cont
│       │   │   │   ├── feat.params
│       │   │   │   ├── mdef
│       │   │   │   ├── means
│       │   │   │   ├── mixture_weights
│       │   │   │   ├── noisedict
│       │   │   │   ├── transition_matrices
│       │   │   │   └── variances
│       │   │   └── tmp.ci_cont_flatinitial
│       │   │       ├── globalmean
│       │   │       ├── globalvar
│       │   │       ├── means
│       │   │       ├── mixture_weights
│       │   │       ├── transition_matrices
│       │   │       └── variances
│       │   └── text.py
│       ├── predict.py
│       ├── predict_delta.sh
│       ├── preprocess.py
│       ├── preprocess_full.py
│       ├── test
│       │   ├── VIVOSDEV01_R003.wav
│       │   ├── VIVOSDEV01_R012.wav
│       │   ├── VIVOSDEV01_R027.wav
│       │   ├── VIVOSDEV01_R028.wav
│       │   ├── VIVOSDEV01_R034.wav
│       │   ├── VIVOSDEV01_R043.wav
│       │   ├── VIVOSDEV01_R044.wav
│       │   └── VIVOSDEV01_R055.wav
│       ├── test_model.py
│       ├── text2.py
│       └── train.py
├── insight
│   ├── vivos.txt
│   └── vlsp2018.txt
├── report
│   ├── acl2017.sty
│   ├── acl_natbib.bst
│   ├── build.sh
│   ├── eacl2017.bst
│   ├── eacl2017.sty
│   ├── notation.tex
│   ├── technique_report.bib
│   ├── technique_report.pdf
│   └── technique_report.tex
├── tmp
│   └── .gitkeep
└── util
    └── eda_vlsp.py
/.gitignore:
--------------------------------------------------------------------------------
1 | .idea
2 | data/vivos/raw/
3 | data/vivos/corpus/
4 | data/open_fpt/raw/FPTOpenSpeechData_Set001_V0.1
5 | data/open_fpt/raw/FPTOpenSpeechData_Set002_Part1_V0.1
6 | data/open_fpt/raw/FPTOpenSpeechData_Set002_Part2_V0.1
7 | **/tmp/
8 | **/analyze/
9 | /experiment/diadiem/tmp/
10 | /data/vlsp/corpus/
11 | /data/vlsp/wav
12 | /experiment/vlsp/extension/__pycache__/
13 | **/**/__pycache__/
14 | **/__pycache__/
15 | .ipynb_checkpoints
16 | data/vlsp
17 | tmp
18 | !tmp/.gitkeep
19 | data/diadiem/
--------------------------------------------------------------------------------
/Experiment.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "print(\"Hello from Underthesea Automatic Speech Recognition Team\")"
10 | ]
11 | },
12 | {
13 | "cell_type": "code",
14 | "execution_count": null,
15 | "metadata": {},
16 | "outputs": [],
17 | "source": [
18 | "!lscpu"
19 | ]
20 | },
21 | {
22 | "cell_type": "code",
23 | "execution_count": null,
24 | "metadata": {},
25 | "outputs": [],
26 | "source": [
27 | "!free -m"
28 | ]
29 | },
30 | {
31 | "cell_type": "code",
32 | "execution_count": null,
33 | "metadata": {},
34 | "outputs": [],
35 | "source": [
36 | "!df -h"
37 | ]
38 | },
39 | {
40 | "cell_type": "code",
41 | "execution_count": 1,
42 | "metadata": {},
43 | "outputs": [
44 | {
45 | "name": "stdout",
46 | "output_type": "stream",
47 | "text": [
48 | "data\t\texperiment\t LICENSE.txt\tREADME.md\r\n",
49 | "data_format.md\tExperiment.ipynb README.en.md\treport\r\n"
50 | ]
51 | }
52 | ],
53 | "source": [
54 | "!ls"
55 | ]
56 | },
57 | {
58 | "cell_type": "code",
59 | "execution_count": 2,
60 | "metadata": {},
61 | "outputs": [
62 | {
63 | "name": "stdout",
64 | "output_type": "stream",
65 | "text": [
66 | "diadiem vivos\r\n"
67 | ]
68 | }
69 | ],
70 | "source": [
71 | "!ls data"
72 | ]
73 | },
74 | {
75 | "cell_type": "code",
76 | "execution_count": 3,
77 | "metadata": {},
78 | "outputs": [
79 | {
80 | "name": "stdout",
81 | "output_type": "stream",
82 | "text": [
83 | "corpus\tpreprocess.py raw\r\n"
84 | ]
85 | }
86 | ],
87 | "source": [
88 | "!ls data/vivos"
89 | ]
90 | },
91 | {
92 | "cell_type": "code",
93 | "execution_count": 4,
94 | "metadata": {},
95 | "outputs": [
96 | {
97 | "name": "stdout",
98 | "output_type": "stream",
99 | "text": [
100 | "4.0K\tExperiment.ipynb\n",
101 | "36K\tLICENSE.txt\n",
102 | "4.0K\tREADME.en.md\n",
103 | "8.0K\tREADME.md\n",
104 | "5.2G\tdata\n",
105 | "4.0K\tdata_format.md\n",
106 | "312M\texperiment\n",
107 | "220K\treport\n"
108 | ]
109 | }
110 | ],
111 | "source": [
112 | "!du -sh *"
113 | ]
114 | },
115 | {
116 | "cell_type": "code",
117 | "execution_count": null,
118 | "metadata": {
119 | "collapsed": true
120 | },
121 | "outputs": [],
122 | "source": []
123 | }
124 | ],
125 | "metadata": {
126 | "kernelspec": {
127 | "display_name": "Python 3",
128 | "language": "python",
129 | "name": "python3"
130 | },
131 | "language_info": {
132 | "codemirror_mode": {
133 | "name": "ipython",
134 | "version": 3
135 | },
136 | "file_extension": ".py",
137 | "mimetype": "text/x-python",
138 | "name": "python",
139 | "nbconvert_exporter": "python",
140 | "pygments_lexer": "ipython3",
141 | "version": "3.6.3"
142 | }
143 | },
144 | "nbformat": 4,
145 | "nbformat_minor": 2
146 | }
147 |
--------------------------------------------------------------------------------
/README.en.md:
--------------------------------------------------------------------------------
1 | # Vietnamese Automatic Speech Recognition
2 |
3 | ## Mục lục
4 |
5 |
6 | ## Huấn luyện mô hình
7 |
8 | ## Môi trường thử nghiệm
9 |
10 | * Ubuntu 16.04
11 |
12 | ## Cài đặt
13 |
14 | **Cài đặt Kaldi** theo hướng dẫn tại [http://kaldi-asr.org/doc/tutorial_setup.html](http://kaldi-asr.org/doc/tutorial_setup.html)
15 |
16 | ```
17 | $ git clone https://github.com/kaldi-asr/kaldi.git kaldi-trunk --origin golden
18 |
19 | $ cd kaldi-trunk/tools/; make;
20 |
21 | $ extras/install_openblas.sh
22 |
23 | $ cd ../src; ./configure --openblas-root=../tools/OpenBLAS/install; make
24 | ```
25 |
26 | **Cài đặt language modeling toolkit srilm**
27 |
28 | Cài đặt dependencies
29 |
30 | ```
31 | $ apt-get install gawk
32 | ```
33 |
34 | Cài đặt srilm
35 |
36 | ```
37 | $ cd kaldi-trunk/tools
38 | $ wget -O srilm.tgz https://raw.githubusercontent.com/denizyuret/nlpcourse/master/download/srilm-1.7.0.tgz
39 | $ ./install_srilm.sh
40 | ...
41 | Installation of SRILM finished successfully
42 | Please source the tools/env.sh in your path.sh to enable it
43 | ```
44 |
45 | # Mô tả dữ liệu
46 |
47 | [Xem chi tiết](data_format.md)
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Vietnamese Speech Recognition
2 |
3 | 
4 | 
5 | 
6 |
7 | A research project on *Vietnamese speech recognition*, developed by the Vietnamese natural language processing research group - [undertheseanlp](https://github.com/undertheseanlp/). It contains the source code of experiments for data processing, model training and evaluation, and makes it easy to adapt the models to new datasets.
8 |
9 | **Authors**
10 |
11 | * Vũ Anh ([anhv.ict91@gmail.com](anhv.ict91@gmail.com))
12 | * Lê Phi Hùng ([lephihungch@gmail.com](lephihungch@gmail.com))
13 |
14 | **Contributing**
15 |
16 | Please post feedback or requests for help in the project's [Issues](../../issues). Discussions are encouraged **in Vietnamese** to keep the exchange easy for everyone.
17 |
18 | If you have experience with this problem and want to join the development team as a [Developer](https://github.com/undertheseanlp/underthesea/wiki/H%C6%B0%E1%BB%9Bng-d%E1%BA%ABn-%C4%91%C3%B3ng-g%C3%B3p#developercontributor), please read the [contribution guide](https://github.com/undertheseanlp/underthesea/wiki/H%C6%B0%E1%BB%9Bng-d%E1%BA%ABn-%C4%91%C3%B3ng-g%C3%B3p#developercontributor) carefully.
19 |
20 | ## Table of Contents
21 |
22 | * [System requirements](#system-requirements)
23 | * [Environment setup](#environment-setup)
24 | * [Usage](#usage)
25 | * [Using a pretrained model](#using-a-pretrained-model)
26 | * [Training the model](#training-the-model)
27 | * [Experimental results](#experimental-results)
28 | * [Citation](#citation)
29 | * [License](#license)
30 |
31 | ## System requirements
32 |
33 | * `OS: Ubuntu 16.04`
34 | * `Python 3.6+`
35 | * `conda 4+`
36 |
37 |
38 | ## Environment setup
39 |
40 | **Install Kaldi**
41 |
42 | To install Kaldi, follow the steps in the [setup tutorial](http://kaldi-asr.org/doc/tutorial_setup.html)
43 |
44 | ```
45 | $ git clone https://github.com/kaldi-asr/kaldi.git kaldi-trunk --origin golden
46 |
47 | $ cd kaldi-trunk/tools/; make;
48 |
49 | $ extras/install_openblas.sh
50 |
51 | $ cd ../src; ./configure --openblas-root=../tools/OpenBLAS/install; make
52 | ```
53 |
54 | **Install the srilm language modeling toolkit**
55 |
56 | Install the dependencies
57 |
58 | ```
59 | $ apt-get install gawk
60 | ```
61 |
62 | **Install srilm**
63 |
64 | ```
65 | $ cd kaldi-trunk/tools
66 | $ wget -O srilm.tgz https://raw.githubusercontent.com/denizyuret/nlpcourse/master/download/srilm-1.7.0.tgz
67 | $ ./install_srilm.sh
68 | ...
69 | Installation of SRILM finished successfully
70 | Please source the tools/env.sh in your path.sh to enable it
71 | ```
72 |
73 | ## Usage
74 |
75 | ### Training the model
76 |
77 | **Data description**: [see details](data_format.md)
78 |
79 | Before running `train.py`, set the path to your `kaldi_folder`.
80 |
81 | The `predict` method should be given a `model_path` argument if you trained in an earlier run: without it, prediction uses the model's `tmp_path`, which is randomized each time the model is re-initialized in preparation for a new training run.
82 |
83 | Change `N_TRAIN` and `N_TEST` in the `__init__` of `KaldiSpeechRecognition` to adjust the size limits of the train/test sets.
84 |
85 | The output folder is `kaldi_folder/egs/uts_{tmp_number}`, where `tmp_number` is printed when `train.py` starts ("Init Kaldi Speech Recognition in number_of_tmp folder" - will be updated soon). A usage sketch follows below.
86 |
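A minimal end-to-end sketch under the assumptions above. `KaldiSpeechRecognition` lives in `egs/vivos/extension/model.py`; the constructor arguments and the `model_path` keyword shown here are illustrative, so check the source for the exact API:

```
from extension.model import KaldiSpeechRecognition

# Illustrative paths: point kaldi_folder at your Kaldi checkout
kaldi_folder = "/opt/kaldi-trunk"
corpus_folder = "../../data/vivos/corpus"

# Initialization picks a random tmp_number and prints the uts_{tmp_number} folder
model = KaldiSpeechRecognition(corpus_folder, kaldi_folder)
model.fit()

# Pass model_path when reusing a model trained in an earlier run
text = model.predict("test/VIVOSDEV01_R003.wav", model_path="/opt/kaldi-trunk/egs/uts_42")
print(text)
```
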
87 | ## Experimental results
88 |
89 | Trained on the VIVOS and OpenFPT datasets, tested on VLSP 2018
90 |
91 | | Model | WER |
92 | |-------|-----|
93 | | GMM: MFCC + delta + LDA + MLLT | 75.27% |
94 |
95 | Trained on the VIVOS dataset, tested on VLSP 2018
96 |
97 | | Model | WER |
98 | |-------|-----|
99 | | GMM: MFCC + delta + LDA + MLLT | 79.80% |
100 | | GMM: MFCC + delta | 82.03% |
118 |
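The WER (word error rate) figures are computed from the Levenshtein distance between reference and hypothesis (see `egs/*/extension/metrics.py`): WER = (S + D + I) / N, where S, D, and I count substituted, deleted, and inserted words and N is the number of words in the reference. For example, the reference "khach san cua" against the hypothesis "khach san cua toi" contains one insertion, so WER = 1/3 ≈ 33.3%.
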
119 | ## License
120 |
121 | The project's source code is distributed under the [GPL-3.0](LICENSE.txt) license.
122 |
123 | The project uses the **[VIVOS](https://ailab.hcmus.edu.vn/vivos/)** dataset in its experiments. Please check the dataset's website or the corresponding paper for licensing and citation requirements before using it.
--------------------------------------------------------------------------------
/data/diadiem/preprocess.py:
--------------------------------------------------------------------------------
1 | import shutil
2 | from os import mkdir
3 | import re
4 | from text import phone2word
5 |
6 |
7 | def create_train_text():
8 | lines = open(
9 | "raw/huanluyen_diadiem_train.transcription").read().splitlines()
10 | output = []
11 | for line in lines:
12 | m = re.match(r"^ (?P<text>.*) \((?P<fileid>.*)\)$", line)
13 | if m:
14 | text = phone2word(m.group("text").lower())
15 | fileid = m.group("fileid")
16 | content = "{}|{}".format(fileid, text)
17 | output.append(content)
18 | pass
19 | else:
20 | raise Exception("Content not match.")
21 | text = "\n".join(output)
22 | open("corpus/train/text", "w").write(text)
23 |
24 |
25 | def create_test_text():
26 | lines = open(
27 | "raw/huanluyen_diadiem_test.transcription").read().splitlines()
28 | output = []
29 | for line in lines:
30 | m = re.match(r"^(?P<text>.*) \((?P<fileid>.*)\)$", line)
31 | if m:
32 | text = phone2word(m.group("text").lower())
33 | fileid = m.group("fileid")
34 | content = "{}|{}".format(fileid, text)
35 | output.append(content)
36 | pass
37 | else:
38 | raise Exception("Text not match.")
39 | text = "\n".join(output)
40 | open("corpus/test/text", "w").write(text)
41 |
42 |
43 | try:
44 | shutil.rmtree("corpus")
45 | except:
46 | pass
47 | finally:
48 | mkdir("corpus")
49 | mkdir("corpus/train")
50 | mkdir("corpus/test")
51 | shutil.copytree("raw/wav/train", "corpus/train/wav")
52 | shutil.copytree("raw/wav/test", "corpus/test/wav")
53 | create_train_text()
54 | create_test_text()
55 |
--------------------------------------------------------------------------------
/data/diadiem/text.py:
--------------------------------------------------------------------------------
1 | rules_1 = [
2 | "aàáảãạ",
3 | "ăằắẳẵặ",
4 | "âầấẩẫậ",
5 | "eèéẻẽẹ",
6 | "êềếểễệ",
7 | "iìíỉĩị",
8 | "oòóỏõọ",
9 | "ôồốổỗộ",
10 | "ơờớởỡợ",
11 | "uùúủũụ",
12 | "ưừứửữự",
13 | "yỳýỷỹỵ"
14 | ]
15 | rules_2 = [
16 | "awă",
17 | "aaâ",
18 | "eeê",
19 | "ooô",
20 | "owơ",
21 | "uwư",
22 | "ddđ"
23 | ]
24 | w2p = {}
25 | p2w = {}
26 | for words in rules_1:
27 | original = words[0]
28 | words = words[1:]
29 | for rule in rules_2:
30 | if original == rule[2]:
31 | original = rule[0:2]
32 | tones = "fsrxj"
33 | for i, w in enumerate(words):
34 | w2p[w] = original + tones[i]
35 | for rule in rules_2:
36 | w2p[rule[2]] = rule[0:2]
37 | for key, value in w2p.items():
38 | p2w[value] = key
39 |
40 |
41 | def word2phone(word):
42 | phone = ""
43 | for w in word:
44 | if w in w2p:
45 | phone += w2p[w]
46 | else:
47 | phone += w
48 | return phone
49 |
50 |
51 | def phone2word(phone):
52 | i = 0
53 | word = ""
54 | while i < len(phone):
55 | if phone[i:i+3] in p2w:
56 | p = phone[i:i+3]
57 | word += p2w[p]
58 | i += 3
59 | elif phone[i:i+2] in p2w:
60 | p = phone[i:i+2]
61 | word += p2w[p]
62 | i += 2
63 | else:
64 | p = phone[i:i+1]
65 | word += p
66 | i += 1
67 | return word
68 |
69 | if __name__ == '__main__':
70 | tests = [
71 | ("con hoẵng", "con hoawxng"),
72 | ("lựu đạn", "luwju ddajn"),
73 | ("kiểm tra", "kieerm tra"),
74 | ("ủy ban", "ury ban"),
75 | ("cà phê", "caf phee"),
76 | ("khách sạn", "khasch sajn"),
77 | ("đúng", "ddusng"),
78 | ("xã hội", "xax hooji")
79 | ]
80 | for test in tests:
81 | assert (test[0] == phone2word(test[1]))
82 | assert (test[1] == word2phone(test[0]))
83 |
--------------------------------------------------------------------------------
/data/vivos/preprocess.py:
--------------------------------------------------------------------------------
1 | import shutil
2 | from os import mkdir, walk
3 | from os import listdir
4 | from os.path import join
5 |
6 |
7 | def create_train_waves():
8 | waves_folder = "raw/train/waves"
9 | corpus_waves_folder = "corpus/train/wav"
10 | try:
11 | shutil.rmtree(corpus_waves_folder)
12 | except:
13 | pass
14 | finally:
15 | mkdir(corpus_waves_folder)
16 | for root, dirs, files in walk(waves_folder):
17 | for dir in dirs:
18 | for f in listdir(join(waves_folder, dir)):
19 | shutil.copy(
20 | join(waves_folder, dir, f),
21 | join(corpus_waves_folder, f))
22 |
23 |
24 | def create_test_waves():
25 | waves_folder = "raw/test/waves"
26 | corpus_waves_folder = "corpus/test/wav"
27 | try:
28 | shutil.rmtree(corpus_waves_folder)
29 | except:
30 | pass
31 | finally:
32 | mkdir(corpus_waves_folder)
33 | for root, dirs, files in walk(waves_folder):
34 | for dir in dirs:
35 | for f in listdir(join(waves_folder, dir)):
36 | shutil.copy(
37 | join(waves_folder, dir, f),
38 | join(corpus_waves_folder, f))
39 |
40 |
41 | def create_train_text():
42 | content = open("raw/train/prompts.txt").read()
43 | content = content.replace(":", "")
44 | lines = content.splitlines()
45 | output = []
46 | for line in lines:
47 | items = line.split()
48 | fileid = items[0]
49 | text = " ".join(items[1:]).lower()
50 | content = "{}|{}".format(fileid, text)
51 | output.append(content)
52 | text = "\n".join(output)
53 | open("corpus/train/text", "w").write(text)
54 |
55 |
56 | def create_test_text():
57 | content = open("raw/test/prompts.txt").read()
58 | content = content.replace(":", "")
59 | lines = content.splitlines()
60 | output = []
61 | for line in lines:
62 | items = line.split()
63 | fileid = items[0]
64 | text = " ".join(items[1:]).lower()
65 | content = "{}|{}".format(fileid, text)
66 | output.append(content)
67 | text = "\n".join(output)
68 | open("corpus/test/text", "w").write(text)
69 |
70 |
71 | def create_gender():
72 | content = open("raw/train/genders.txt").read()
73 | open("corpus/train/gender", "w").write(content)
74 | content = open("raw/test/genders.txt").read()
75 | open("corpus/test/gender", "w").write(content)
76 |
77 |
78 | def create_speaker():
79 | lines = open("raw/train/prompts.txt").read().splitlines()
80 | files = [line.split()[0] for line in lines]
81 | tmp = []
82 | for file_id in files:
83 | speaker_id = file_id.split("_")[0]
84 | content = "{} {}".format(speaker_id, file_id)
85 | tmp.append(content)
86 | content = "\n".join(tmp)
87 | open("corpus/train/speaker", "w").write(content)
88 |
89 | lines = open("raw/test/prompts.txt").read().splitlines()
90 | files = [line.split()[0] for line in lines]
91 | tmp = []
92 | for file_id in files:
93 | speaker_id = file_id.split("_")[0]
94 | content = "{} {}".format(speaker_id, file_id)
95 | tmp.append(content)
96 | content = "\n".join(tmp)
97 | open("corpus/test/speaker", "w").write(content)
98 |
99 |
100 | try:
101 | shutil.rmtree("corpus")
102 | except:
103 | pass
104 | finally:
105 | mkdir("corpus")
106 | mkdir("corpus/train")
107 | mkdir("corpus/test")
108 | create_train_waves()
109 | create_test_waves()
110 | create_train_text()
111 | create_test_text()
112 | create_gender()
113 | create_speaker()
114 |
--------------------------------------------------------------------------------
/data_format.md:
--------------------------------------------------------------------------------
1 | # Data description
2 |
3 | | Version      | v1.0.0     |
4 | |--------------|------------|
5 | | Last updated | 10/01/2018 |
6 | | Author       | Vũ Anh     |
7 |
8 | This document proposes a standard corpus structure for the automatic speech recognition (ASR) task. It is applied in the experiments of [`underthesea`](https://github.com/undertheseanlp/automatic_speech_recognition) from version 1.2.0.
9 |
10 | Sample: the [`diadiem`](https://github.com/undertheseanlp/automatic_speech_recognition/tree/sphinx_lab/data/diadiem/corpus) corpus
11 |
12 | ### Dataset
13 |
14 | Data for the speech recognition task is stored in a single folder with two subfolders, `train` and `test`.
15 |
16 | * Training data is stored in the `train` folder
17 | * Test data is stored in the `test` folder
18 |
19 | Folder structure
20 |
21 | ```
22 | .
23 | ├── train
24 | | ├── wav
25 | | | ├── train_01.wav
26 | | | ├── train_02.wav
27 | | | └── train_03.wav
28 | | ├── gender
29 | | ├── speaker
30 | | └── text
31 | └── test
32 | ├── wav
33 | | ├── test_01.wav
34 | | ├── test_02.wav
35 | | └── test_03.wav
36 | ├── gender
37 | ├── speaker
38 | └── text
39 | ```
40 |
41 | Each of the `train` and `test` folders contains a `wav` subfolder and the files `gender`, `speaker`, and `text`. The `wav` folder holds the audio files (commonly with the wav extension).
42 |
43 | The `text` file contains the transcript of each utterance together with the corresponding audio file name
44 |
45 | *Format*: `<file_id>|<text>`
46 |
47 | ```
48 | train_01|text content 01
49 | train_02|text content 02
50 | train_03|text content 03
51 | train_04|text content 04
52 | ```
53 |
54 | The `speaker` file maps each utterance to its speaker id
55 |
56 | *Format*: `<speaker_id> <file_id>`
57 |
58 | ```
59 | spk01 train_01
60 | spk01 train_02
61 | spk02 train_03
62 | spk02 train_04
63 | ```
64 |
65 | The `gender` file gives the gender of each speaker
66 |
67 | *Format*: `<speaker_id> <gender>`
68 |
69 | ```
70 | spk01 f
71 | spk02 m
72 | ```
73 |
74 | Notation:
75 |
76 | * `f` (female): the speaker is female
77 | * `m` (male): the speaker is male
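
For reference, here is a minimal loader sketch for this layout. It is a hypothetical helper (not part of the repository) and assumes exactly the folder structure and file formats described above:

```
from os.path import join


def load_split(corpus_folder, split="train"):
    folder = join(corpus_folder, split)
    # text: <file_id>|<text>, one utterance per line
    texts = {}
    for line in open(join(folder, "text"), encoding="utf-8").read().splitlines():
        file_id, text = line.split("|", 1)
        texts[file_id] = text
    # speaker: <speaker_id> <file_id>
    speaker_of = {}
    for line in open(join(folder, "speaker"), encoding="utf-8").read().splitlines():
        speaker_id, file_id = line.split()
        speaker_of[file_id] = speaker_id
    # gender: <speaker_id> <gender>, where gender is "f" or "m"
    gender_of = dict(
        line.split()
        for line in open(join(folder, "gender"), encoding="utf-8").read().splitlines())
    # audio files live in <split>/wav/<file_id>.wav
    wavs = {file_id: join(folder, "wav", file_id + ".wav") for file_id in texts}
    return texts, speaker_of, gender_of, wavs


texts, speakers, genders, wavs = load_split("data/diadiem/corpus", "train")
```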
--------------------------------------------------------------------------------
/egs/diadiem/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/__init__.py
--------------------------------------------------------------------------------
/egs/diadiem/analyze.py:
--------------------------------------------------------------------------------
1 | from model import transcript
2 | from os.path import join, dirname
3 | from extension.analyze import WERAnalyzeLogger
4 |
5 | corpus_folder = join(dirname(dirname(dirname(__file__))), "data", "diadiem",
6 | "corpus")
7 |
8 |
9 | def load_test():
10 | lines = open(join(corpus_folder, "test", "text")).read().splitlines()
11 | lines = [line.split("|") for line in lines]
12 | wavs = [line[0] for line in lines]
13 | wavs = ["{}/test/wav/{}.wav".format(corpus_folder, wav) for wav in wavs]
14 | texts = [line[1] for line in lines]
15 | return wavs, texts
16 |
17 |
18 | wavs_test, texts_test = load_test()
19 | # texts_pred = [""] * len(texts_test)
20 | texts_pred = [transcript(wav_file) for wav_file in wavs_test]
21 |
22 | log_folder = join(dirname(__file__), "analyze")
23 |
24 | WERAnalyzeLogger.log(wavs_test, texts_test, texts_pred, log_folder=log_folder)
25 |
--------------------------------------------------------------------------------
/egs/diadiem/extension/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/extension/__init__.py
--------------------------------------------------------------------------------
/egs/diadiem/extension/analyze.py:
--------------------------------------------------------------------------------
1 | import json
2 | import shutil
3 | from extension.metrics import calculate_wer
4 | from os.path import join, basename
5 | import os
6 | from underthesea.util.file_io import write
7 | import numpy as np
8 |
9 |
10 | class WERAnalyzeLogger:
11 | @staticmethod
12 | def log(wavs_test, texts_test, texts_pred, log_folder):
13 | wer = np.mean([calculate_wer(test.split(), pred.split())
14 | for test, pred in zip(texts_test, texts_pred)])
15 | wer = np.round(wer, 4)
16 | result = {
17 | "WER": wer
18 | }
19 | content = json.dumps(result, ensure_ascii=False)
20 | log_file = join(log_folder, "result.json")
21 | write(log_file, content)
22 | wav_folder = join(log_folder, "wav")
23 | try:
24 | shutil.rmtree(wav_folder)
25 | except:
26 | pass
27 | finally:
28 | os.mkdir(wav_folder)
29 | for wav in wavs_test:
30 | new_path = join(wav_folder, basename(wav))
31 | shutil.copyfile(wav, new_path)
32 | wavs_test_new_path = [join("wav", basename(wav)) for wav in wavs_test]
33 | speech_recognition = {
34 | "texts_test": texts_test,
35 | "texts_pred": texts_pred,
36 | "wavs_test": wavs_test_new_path,
37 | }
38 | content = json.dumps(speech_recognition, ensure_ascii=False)
39 | log_file = join(log_folder, "speechrecognition.json")
40 | write(log_file, content)
41 |
42 | print("Result is written in {}".format(log_file))
43 | print("WER: {}%".format(wer * 100))
44 |
--------------------------------------------------------------------------------
/egs/diadiem/extension/export.py:
--------------------------------------------------------------------------------
1 | import shutil
2 | from os.path import join
3 |
4 |
5 | class SphinxSpeechRecognitionExporter:
6 | @staticmethod
7 | def export(model, export_folder):
8 | tmp_folder = model.tmp_folder
9 | try:
10 | shutil.rmtree(join(export_folder, "etc"))
11 | except:
12 | pass
13 | finally:
14 | shutil.copytree(join(tmp_folder, "etc"),
15 | join(export_folder, "etc"))
16 |
17 | try:
18 | shutil.rmtree(join(export_folder, "model_parameters"))
19 | except:
20 | pass
21 | finally:
22 | shutil.copytree(join(tmp_folder, "model_parameters"),
23 | join(export_folder, "model_parameters"))
24 |
--------------------------------------------------------------------------------
/egs/diadiem/extension/metrics.py:
--------------------------------------------------------------------------------
1 | def calculate_wer(reference, hypothesis):
2 | """
3 | Calculation of WER with Levenshtein distance.
4 | Works only for iterables up to 254 elements (uint8).
5 | O(nm) time and space complexity.
6 |
7 | >>> calculate_wer("who is there".split(), "is there".split())
8 | 1
9 | >>> calculate_wer("who is there".split(), "".split())
10 | 3
11 | >>> calculate_wer("".split(), "who is there".split())
12 | 3
13 | """
14 | # initialisation
15 | import numpy
16 | d = numpy.zeros((len(reference) + 1) * (len(hypothesis) + 1),
17 | dtype=numpy.uint8)
18 | d = d.reshape((len(reference) + 1, len(hypothesis) + 1))
19 | for i in range(len(reference) + 1):
20 | for j in range(len(hypothesis) + 1):
21 | if i == 0:
22 | d[0][j] = j
23 | elif j == 0:
24 | d[i][0] = i
25 |
26 | # computation
27 | for i in range(1, len(reference) + 1):
28 | for j in range(1, len(hypothesis) + 1):
29 | if reference[i - 1] == hypothesis[j - 1]:
30 | d[i][j] = d[i - 1][j - 1]
31 | else:
32 | substitution = d[i - 1][j - 1] + 1
33 | insertion = d[i][j - 1] + 1
34 | deletion = d[i - 1][j] + 1
35 | d[i][j] = min(substitution, insertion, deletion)
36 |
37 | return d[len(reference)][len(hypothesis)] / float(len(reference))
38 |
39 |
40 | import unittest
41 | assertions = unittest.TestCase('__init__')
42 |
43 | if __name__ == '__main__':
44 | s = calculate_wer("khach san".split(), "khach san cua toi".split())
45 | assertions.assertAlmostEqual(s, 1)
46 | s = calculate_wer("khach san cua".split(), "khach san cua toi".split())
47 | assertions.assertAlmostEqual(s, 0.333, 3)
48 |
--------------------------------------------------------------------------------
/egs/diadiem/extension/model.py:
--------------------------------------------------------------------------------
1 | import shutil
2 | import os
3 | import text
4 |
5 |
6 | class SphinxSpeechRecognition:
7 | def __init__(self, corpus_folder, tmp_folder):
8 | print("Initial Sphinx Speech Recognition")
9 | self.corpus_folder = corpus_folder
10 | self.tmp_folder = tmp_folder
11 | try:
12 | shutil.rmtree(tmp_folder)
13 | except Exception as e:
14 | pass
15 | finally:
16 | os.mkdir(tmp_folder)
17 | os.system("cd {}; sphinxtrain -t tmp setup".format(tmp_folder))
18 | self._init_data()
19 | self._change_config()
20 | self._make_transcription()
21 | self._make_dictionary()
22 | self._make_filler()
23 | self._make_language_model()
24 |
25 | # ========================== #
26 | # Init Data
27 | # ========================== #
28 | def _init_data(self):
29 | os.system("cd {}; mkdir wav".format(self.tmp_folder))
30 |
31 | os.system("cd {}; cp -r {}/train/wav wav/train".format(self.tmp_folder,
32 | self.corpus_folder))
33 | os.system("cd {}; cp -r {}/test/wav wav/test".format(self.tmp_folder,
34 | self.corpus_folder))
35 |
36 | ids = open(
37 | "{}/train/text".format(self.corpus_folder)).read().splitlines()
38 | ids = [item.split("|")[0] for item in ids]
39 | ids = ["train/{}".format(id) for id in ids]
40 | ids.append("")
41 | content = "\n".join(ids)
42 | open(os.path.join(self.tmp_folder, "etc", "tmp_train.fileids"),
43 | "w").write(content)
44 |
45 | ids = open(
46 | "{}/test/text".format(self.corpus_folder)).read().splitlines()
47 | ids = [item.split("|")[0] for item in ids]
48 | ids = ["test/{}".format(id) for id in ids]
49 | ids.append("")
50 | content = "\n".join(ids)
51 | open(os.path.join(self.tmp_folder, "etc", "tmp_test.fileids"),
52 | "w").write(content)
53 |
54 | # ========================== #
55 | # Config
56 | # ========================== #
57 | def _change_config(self):
58 | config_file = os.path.join(self.tmp_folder, "etc", "sphinx_train.cfg")
59 | config = SphinxConfig(config_file)
60 | config.set("$CFG_BASE_DIR", "\".\"")
61 | config.set("$CFG_WAVFILE_SRATE", 8000.0)
62 | config.set("$CFG_NUM_FILT", 31)
63 | config.set("$CFG_LO_FILT", 200)
64 | config.set("$CFG_HI_FILT", 3500)
65 | config.set("$CFG_WAVFILE_TYPE", "'raw'")
66 | config.set("$CFG_LANGUAGEMODEL",
67 | "\"$CFG_LIST_DIR/$CFG_DB_NAME.lm\"")
68 | config.set("$DEC_CFG_LANGUAGEMODEL",
69 | "\"$CFG_BASE_DIR/etc/${CFG_DB_NAME}.lm\"")
70 |
71 | # ========================== #
72 | # Transcription
73 | # ========================== #
74 | def _convert_transcription(self, in_file, out_file):
75 | lines = open(in_file).read().splitlines()
76 | output = []
77 | for line in lines:
78 | fileid, word = line.split("|")
79 | phone = text.word2phone(word)
80 | content = "<s> {} </s> ({})".format(phone, fileid)
81 | output.append(content)
82 | content = "\n".join(output)
83 | open(out_file, "w").write(content)
84 |
85 | def _make_transcription(self):
86 | self._convert_transcription(
87 | "{}/train/text".format(self.corpus_folder),
88 | "{}/etc/tmp_train.transcription".format(self.tmp_folder))
89 | self._convert_transcription(
90 | "{}/test/text".format(self.corpus_folder),
91 | "{}/etc/tmp_test.transcription".format(self.tmp_folder))
92 |
93 | # ============================== #
94 | # Create dictionary and phones
95 | # ============================== #
96 | def _make_dictionary(self):
97 | lines = open(
98 | "{}/train/text".format(self.corpus_folder)).read().splitlines()
99 | phones = []
100 | for line in lines:
101 | fileid, word = line.split("|")
102 | p = text.word2phone(word).split()
103 | phones += p
104 | phones = sorted(set(phones))
105 | # create .dic files
106 | lines = []
107 | phone_units = []
108 | for p in phones:
109 | units = list(p)
110 | phone_units += units
111 | units = " ".join(units)
112 | line = "{:20s}{}".format(p, units)
113 | lines.append(line)
114 | open("{}/etc/tmp.dic".format(self.tmp_folder), "w").write(
115 | "\n".join(lines))
116 | phone_units = sorted(set(phone_units))
117 | phone_units.append("SIL")
118 | open("{}/etc/tmp.phone".format(self.tmp_folder), "w").write(
119 | "\n".join(phone_units))
120 |
121 | def _make_filler(self):
122 | fillers = ["<s>", "</s>", "<sil>"]
123 | lines = ["{:20s}SIL".format(f) for f in fillers]
124 | open("{}/etc/tmp.filler".format(self.tmp_folder), "w").write(
125 | "\n".join(lines))
126 |
127 | # ========================== #
128 | # Language Model
129 | # ========================== #
130 | def _make_cleaned_text(self):
131 | in_file = "{}/train/text".format(self.corpus_folder)
132 | out_file = "{}/etc/text".format(self.tmp_folder)
133 | lines = open(in_file).read().splitlines()
134 | output = []
135 | for line in lines:
136 | fileid, word = line.split("|")
137 | phone = text.word2phone(word)
138 | content = "<s> {} </s>".format(phone)
139 | output.append(content)
140 | content = "\n".join(output)
141 | open(out_file, "w").write(content)
142 |
143 | def _make_language_model(self):
144 | self._make_cleaned_text()
145 | etc_folder = os.path.join(self.tmp_folder, "etc")
146 | chdir = "cd {}; ".format(etc_folder)
147 | os.system(chdir + "text2wfreq < text | wfreq2vocab > vocab")
148 | os.system(chdir + "text2idngram -vocab vocab -idngram idngram < text")
149 | os.system(
150 | chdir + "idngram2lm -vocab_type 0 -idngram idngram -vocab vocab -arpa tmp.lm")
151 |
152 | def fit(self):
153 | chdir = "cd {}; ".format(self.tmp_folder)
154 | os.system(chdir + "sphinxtrain run")
155 |
156 | def predict(self, wav_file):
157 | command = "pocketsphinx_continuous -hmm {}/model_parameters/tmp.cd_cont_200 -samprate 8000 -lm {}/etc/tmp.lm -dict {}/etc/tmp.dic -infile {} -logfn yes".format(
158 | self.tmp_folder, self.tmp_folder, self.tmp_folder, wav_file)
159 | output = os.popen(command).read().strip()
160 | output = text.phone2word(output)
161 | return output
162 |
163 |
164 | class SphinxConfig:
165 | def __init__(self, config_file):
166 | self.file = config_file
167 | self.lines = open(config_file).read().splitlines()
168 |
169 | def save(self):
170 | content = "\n".join(self.lines)
171 | open(self.file, "w").write(content)
172 |
173 | def set(self, key, value):
174 | for i, line in enumerate(self.lines):
175 | if line.startswith(key):
176 | content = "{} = {};".format(key, value)
177 | self.lines[i] = content
178 | self.save()
179 |
--------------------------------------------------------------------------------
/egs/diadiem/extension/text.py:
--------------------------------------------------------------------------------
1 | rules_1 = [
2 | "aàáảãạ",
3 | "ăằắẳẵặ",
4 | "âầấẩẫậ",
5 | "eèéẻẽẹ",
6 | "êềếểễệ",
7 | "iìíỉĩị",
8 | "oòóỏõọ",
9 | "ôồốổỗộ",
10 | "ơờớởỡợ",
11 | "uùúủũụ",
12 | "ưừứửữự",
13 | "yỳýỷỹỵ"
14 | ]
15 | rules_2 = [
16 | "awă",
17 | "aaâ",
18 | "eeê",
19 | "ooô",
20 | "owơ",
21 | "uwư",
22 | "ddđ"
23 | ]
24 | w2p = {}
25 | p2w = {}
26 | for words in rules_1:
27 | original = words[0]
28 | words = words[1:]
29 | for rule in rules_2:
30 | if original == rule[2]:
31 | original = rule[0:2]
32 | tones = "fsrxj"
33 | for i, w in enumerate(words):
34 | w2p[w] = original + tones[i]
35 | for rule in rules_2:
36 | w2p[rule[2]] = rule[0:2]
37 | for key, value in w2p.items():
38 | p2w[value] = key
39 |
40 |
41 | def word2phone(word):
42 | phone = ""
43 | for w in word:
44 | if w in w2p:
45 | phone += w2p[w]
46 | else:
47 | phone += w
48 | return phone
49 |
50 |
51 | def phone2word(phone):
52 | i = 0
53 | word = ""
54 | while i < len(phone):
55 | if phone[i:i+3] in p2w:
56 | p = phone[i:i+3]
57 | word += p2w[p]
58 | i += 3
59 | elif phone[i:i+2] in p2w:
60 | p = phone[i:i+2]
61 | word += p2w[p]
62 | i += 2
63 | else:
64 | p = phone[i:i+1]
65 | word += p
66 | i += 1
67 | return word
68 |
69 | if __name__ == '__main__':
70 | tests = [
71 | ("con hoẵng", "con hoawxng"),
72 | ("lựu đạn", "luwju ddajn"),
73 | ("kiểm tra", "kieerm tra"),
74 | ("ủy ban", "ury ban"),
75 | ("cà phê", "caf phee"),
76 | ("khách sạn", "khasch sajn"),
77 | ("đúng", "ddusng"),
78 | ("xã hội", "xax hooji")
79 | ]
80 | for test in tests:
81 | assert (test[0] == phone2word(test[1]))
82 | assert (test[1] == word2phone(test[0]))
83 |
--------------------------------------------------------------------------------
/egs/diadiem/load_data.py:
--------------------------------------------------------------------------------
1 | from os.path import dirname, join
2 |
3 | corpus_folder = join(dirname(dirname(dirname(__file__))), "data", "diadiem",
4 | "corpus")
5 |
--------------------------------------------------------------------------------
/egs/diadiem/model/__init__.py:
--------------------------------------------------------------------------------
1 | from os.path import dirname
2 | import os
3 | import text
4 |
5 |
6 | def transcript(wav_file):
7 | tmp_folder = dirname(__file__)
8 | command = "pocketsphinx_continuous " \
9 | "-hmm {0}/model_parameters/tmp.cd_cont_200 " \
10 | "-samprate 8000 " \
11 | "-lm {0}/etc/tmp.lm " \
12 | "-dict {0}/etc/tmp.dic " \
13 | "-infile {1} " \
14 | "-logfn {0}/yes".format(tmp_folder, wav_file)
15 | with os.popen(command) as c:
16 | output = c.read().strip()
17 | output = text.phone2word(output)
18 | os.remove("{}/yes".format(tmp_folder))
19 | return output
20 |
--------------------------------------------------------------------------------
/egs/diadiem/model/etc/feat.params:
--------------------------------------------------------------------------------
1 | -lowerf __CFG_LO_FILT__
2 | -upperf __CFG_HI_FILT__
3 | -nfilt __CFG_NUM_FILT__
4 | -transform __CFG_TRANSFORM__
5 | -lifter __CFG_LIFTER__
6 | -feat __CFG_FEATURE__
7 | -svspec __CFG_SVSPEC__
8 | -agc __CFG_AGC__
9 | -cmn __CFG_CMN__
10 | -varnorm __CFG_VARNORM__
11 |
--------------------------------------------------------------------------------
/egs/diadiem/model/etc/idngram:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/model/etc/idngram
--------------------------------------------------------------------------------
/egs/diadiem/model/etc/tmp.dic:
--------------------------------------------------------------------------------
1 | ay a y
2 | caf c a f
3 | ddusng d d u s n g
4 | ka k a
5 | ke k e
6 | khasch k h a s c h
7 | khoong k h o o n g
8 | laji l a j i
9 | mast m a s t
10 | phee p h e e
11 | rao r a o
12 | sajn s a j n
13 | teem t e e m
14 | trajm t r a j m
15 | trowr t r o w r
16 | xa x a
--------------------------------------------------------------------------------
/egs/diadiem/model/etc/tmp.filler:
--------------------------------------------------------------------------------
1 | <s>                 SIL
2 | </s>                SIL
3 | <sil>               SIL
--------------------------------------------------------------------------------
/egs/diadiem/model/etc/tmp.lm:
--------------------------------------------------------------------------------
1 | #############################################################################
2 | ## Copyright (c) 1996, Carnegie Mellon University, Cambridge University,
3 | ## Ronald Rosenfeld and Philip Clarkson
4 | ## Version 3, Copyright (c) 2006, Carnegie Mellon University
5 | ## Contributors includes Wen Xu, Ananlada Chotimongkol,
6 | ## David Huggins-Daines, Arthur Chan and Alan Black
7 | #############################################################################
8 | =============================================================================
9 | =============== This file was produced by the CMU-Cambridge ===============
10 | =============== Statistical Language Modeling Toolkit ===============
11 | =============================================================================
12 | This is a 3-gram language model, based on a vocabulary of 18 words,
13 |   which begins "</s>", "<s>", "ay"...
14 | This is a CLOSED-vocabulary model
15 | (OOVs eliminated from training data and are forbidden in test data)
16 | Good-Turing discounting was applied.
17 | 1-gram frequency of frequency : 0
18 | 2-gram frequency of frequency : 0 0 0 0 0 0 0
19 | 3-gram frequency of frequency : 0 0 0 0 0 0 0
20 | 1-gram discounting ratios :
21 | 2-gram discounting ratios :
22 | 3-gram discounting ratios :
23 | This file is in the ARPA-standard format introduced by Doug Paul.
24 |
25 | p(wd3|wd1,wd2)= if(trigram exists) p_3(wd1,wd2,wd3)
26 | else if(bigram w1,w2 exists) bo_wt_2(w1,w2)*p(wd3|wd2)
27 | else p(wd3|w2)
28 |
29 | p(wd2|wd1)= if(bigram exists) p_2(wd1,wd2)
30 | else bo_wt_1(wd1)*p_1(wd2)
31 |
32 | All probs and back-off weights (bo_wt) are given in log10 form.
33 |
34 | Data formats:
35 |
36 | Beginning of data mark: \data\
37 | ngram 1=nr # number of 1-grams
38 | ngram 2=nr # number of 2-grams
39 | ngram 3=nr # number of 3-grams
40 |
41 | \1-grams:
42 | p_1 wd_1 bo_wt_1
43 | \2-grams:
44 | p_2 wd_1 wd_2 bo_wt_2
45 | \3-grams:
46 | p_3 wd_1 wd_2 wd_3
47 |
48 | end of data mark: \end\
49 |
50 | \data\
51 | ngram 1=18
52 | ngram 2=25
53 | ngram 3=32
54 |
55 | \1-grams:
56 | -0.5755 </s> -3.5579
57 | -0.5754 <s> -3.5587
58 | -1.6028 ay -2.6555
59 | -1.5908 caf -2.6672
60 | -1.2657 ddusng -2.8684
61 | -1.5982 ka -2.6601
62 | -1.5982 ke -2.5370
63 | -1.6066 khasch -2.6519
64 | -1.2289 khoong -2.9053
65 | -1.5817 laji -2.5534
66 | -1.5899 mast -2.6681
67 | -1.5908 phee -2.5444
68 | -1.5982 rao -2.6601
69 | -1.6066 sajn -2.5286
70 | -1.6028 teem -2.5324
71 | -1.6028 trajm -2.6555
72 | -1.5817 trowr -2.6760
73 | -1.5899 xa -2.5453
74 |
75 | \2-grams:
76 | -0.0001 </s> <s> 0.0008
77 | -1.0152 <s> caf 0.0009
78 | -0.6907 <s> ddusng 0.0004
79 | -1.0226 <s> ka 0.0009
80 | -1.0310 <s> khasch 0.0009
81 | -0.6538 <s> khoong 0.0004
82 | -1.0143 <s> mast 0.0009
83 | -1.0273 <s> trajm 0.0009
84 | -1.0053 <s> trowr 0.0000
85 | -0.0009 ay teem 0.0009
86 | -0.0009 caf phee 0.0009
87 | -0.0004 ddusng </s> 0.6900
88 | -0.0009 ka rao 0.0009
89 | -0.0009 ke </s> 1.0219
90 | -0.0009 khasch sajn 0.0009
91 | -0.0004 khoong </s> 0.6531
92 | -0.0009 laji </s> 1.0055
93 | -0.0009 mast xa 0.0009
94 | -0.0009 phee </s> 1.0145
95 | -0.0009 rao ke 0.0009
96 | -0.0009 sajn </s> 1.0303
97 | -0.0009 teem </s> 1.0266
98 | -0.0009 trajm ay 0.0009
99 | -0.0009 trowr laji 0.0009
100 | -0.0009 xa </s> 1.0136
101 |
102 | \3-grams:
103 | -1.0163 </s> <s> caf
104 | -0.6903 </s> <s> ddusng
105 | -1.0227 </s> <s> ka
106 | -1.0312 </s> <s> khasch
107 | -0.6534 </s> <s> khoong
108 | -1.0144 </s> <s> mast
109 | -1.0274 </s> <s> trajm
110 | -1.0054 </s> <s> trowr
111 | -0.0009 <s> caf phee
112 | -0.0004 <s> ddusng </s>
113 | -0.0009 <s> ka rao
114 | -0.0009 <s> khasch sajn
115 | -0.0004 <s> khoong </s>
116 | -0.0009 <s> mast xa
117 | -0.0009 <s> trajm ay
118 | -0.0009 <s> trowr laji
119 | -0.0009 ay teem </s>
120 | -0.0009 caf phee </s>
121 | -0.0004 ddusng </s> <s>
122 | -0.0009 ka rao ke
123 | -0.0009 ke </s> <s>
124 | -0.0009 khasch sajn </s>
125 | -0.0004 khoong </s> <s>
126 | -0.0009 laji </s> <s>
127 | -0.0009 mast xa </s>
128 | -0.0009 phee </s> <s>
129 | -0.0009 rao ke </s>
130 | -0.0009 sajn </s> <s>
131 | -0.0009 teem </s> <s>
132 | -0.0009 trajm ay teem
133 | -0.0009 trowr laji </s>
134 | -0.0009 xa </s> <s>
135 |
136 | \end\
137 |
--------------------------------------------------------------------------------
/egs/diadiem/model/etc/tmp.phone:
--------------------------------------------------------------------------------
1 | a
2 | c
3 | d
4 | e
5 | f
6 | g
7 | h
8 | i
9 | j
10 | k
11 | l
12 | m
13 | n
14 | o
15 | p
16 | r
17 | s
18 | t
19 | u
20 | w
21 | x
22 | y
23 | SIL
--------------------------------------------------------------------------------
/egs/diadiem/model/etc/tmp_test.fileids:
--------------------------------------------------------------------------------
1 | test/CAFPHEE001
2 | test/CAFPHEE002
3 | test/CAFPHEE003
4 | test/CAFPHEE004
5 | test/CAFPHEE005
6 | test/CAFPHEE006
7 | test/CAFPHEE007
8 | test/CAFPHEE008
9 | test/CAFPHEE009
10 | test/CAFPHEE010
11 | test/DDUSNG0001
12 | test/DDUSNG0002
13 | test/DDUSNG0003
14 | test/DDUSNG0004
15 | test/DDUSNG0005
16 | test/DDUSNG0006
17 | test/DDUSNG0007
18 | test/DDUSNG0008
19 | test/DDUSNG0009
20 | test/DDUSNG0010
21 | test/KARAOKE001
22 | test/KARAOKE002
23 | test/KARAOKE003
24 | test/KARAOKE004
25 | test/KARAOKE005
26 | test/KARAOKE006
27 | test/KARAOKE007
28 | test/KARAOKE008
29 | test/KARAOKE009
30 | test/KARAOKE010
31 | test/KHASCHSAJN001
32 | test/KHASCHSAJN002
33 | test/KHASCHSAJN003
34 | test/KHASCHSAJN004
35 | test/KHASCHSAJN005
36 | test/KHASCHSAJN006
37 | test/KHASCHSAJN007
38 | test/KHASCHSAJN008
39 | test/KHASCHSAJN009
40 | test/KHASCHSAJN010
41 | test/KHOONG0001
42 | test/KHOONG0002
43 | test/KHOONG0003
44 | test/KHOONG0004
45 | test/KHOONG0005
46 | test/KHOONG0006
47 | test/KHOONG0007
48 | test/KHOONG0008
49 | test/KHOONG0009
50 | test/KHOONG0010
51 | test/MASTXA001
52 | test/MASTXA002
53 | test/MASTXA003
54 | test/MASTXA004
55 | test/MASTXA005
56 | test/MASTXA006
57 | test/MASTXA007
58 | test/MASTXA008
59 | test/MASTXA009
60 | test/MASTXA010
61 | test/TRAJMAYTEEM001
62 | test/TRAJMAYTEEM002
63 | test/TRAJMAYTEEM003
64 | test/TRAJMAYTEEM004
65 | test/TRAJMAYTEEM005
66 | test/TRAJMAYTEEM006
67 | test/TRAJMAYTEEM007
68 | test/TRAJMAYTEEM008
69 | test/TRAJMAYTEEM009
70 | test/TRAJMAYTEEM010
71 | test/TROWRLAJI001
72 | test/TROWRLAJI002
73 | test/TROWRLAJI003
74 | test/TROWRLAJI004
75 | test/TROWRLAJI005
76 | test/TROWRLAJI006
77 | test/TROWRLAJI007
78 | test/TROWRLAJI008
79 | test/TROWRLAJI009
80 | test/TROWRLAJI010
81 |
--------------------------------------------------------------------------------
/egs/diadiem/model/etc/tmp_test.transcription:
--------------------------------------------------------------------------------
1 | <s> caf phee </s> (CAFPHEE001)
2 | <s> caf phee </s> (CAFPHEE002)
3 | <s> caf phee </s> (CAFPHEE003)
4 | <s> caf phee </s> (CAFPHEE004)
5 | <s> caf phee </s> (CAFPHEE005)
6 | <s> caf phee </s> (CAFPHEE006)
7 | <s> caf phee </s> (CAFPHEE007)
8 | <s> caf phee </s> (CAFPHEE008)
9 | <s> caf phee </s> (CAFPHEE009)
10 | <s> caf phee </s> (CAFPHEE010)
11 | <s> ddusng </s> (DDUSNG0001)
12 | <s> ddusng </s> (DDUSNG0002)
13 | <s> ddusng </s> (DDUSNG0003)
14 | <s> ddusng </s> (DDUSNG0004)
15 | <s> ddusng </s> (DDUSNG0005)
16 | <s> ddusng </s> (DDUSNG0006)
17 | <s> ddusng </s> (DDUSNG0007)
18 | <s> ddusng </s> (DDUSNG0008)
19 | <s> ddusng </s> (DDUSNG0009)
20 | <s> ddusng </s> (DDUSNG0010)
21 | <s> ka rao ke </s> (KARAOKE001)
22 | <s> ka rao ke </s> (KARAOKE002)
23 | <s> ka rao ke </s> (KARAOKE003)
24 | <s> ka rao ke </s> (KARAOKE004)
25 | <s> ka rao ke </s> (KARAOKE005)
26 | <s> ka rao ke </s> (KARAOKE006)
27 | <s> ka rao ke </s> (KARAOKE007)
28 | <s> ka rao ke </s> (KARAOKE008)
29 | <s> ka rao ke </s> (KARAOKE009)
30 | <s> ka rao ke </s> (KARAOKE010)
31 | <s> khasch sajn </s> (KHASCHSAJN001)
32 | <s> khasch sajn </s> (KHASCHSAJN002)
33 | <s> khasch sajn </s> (KHASCHSAJN003)
34 | <s> khasch sajn </s> (KHASCHSAJN004)
35 | <s> khasch sajn </s> (KHASCHSAJN005)
36 | <s> khasch sajn </s> (KHASCHSAJN006)
37 | <s> khasch sajn </s> (KHASCHSAJN007)
38 | <s> khasch sajn </s> (KHASCHSAJN008)
39 | <s> khasch sajn </s> (KHASCHSAJN009)
40 | <s> khasch sajn </s> (KHASCHSAJN010)
41 | <s> khoong </s> (KHOONG0001)
42 | <s> khoong </s> (KHOONG0002)
43 | <s> khoong </s> (KHOONG0003)
44 | <s> khoong </s> (KHOONG0004)
45 | <s> khoong </s> (KHOONG0005)
46 | <s> khoong </s> (KHOONG0006)
47 | <s> khoong </s> (KHOONG0007)
48 | <s> khoong </s> (KHOONG0008)
49 | <s> khoong </s> (KHOONG0009)
50 | <s> khoong </s> (KHOONG0010)
51 | <s> mast xa </s> (MASTXA001)
52 | <s> mast xa </s> (MASTXA002)
53 | <s> mast xa </s> (MASTXA003)
54 | <s> mast xa </s> (MASTXA004)
55 | <s> mast xa </s> (MASTXA005)
56 | <s> mast xa </s> (MASTXA006)
57 | <s> mast xa </s> (MASTXA007)
58 | <s> mast xa </s> (MASTXA008)
59 | <s> mast xa </s> (MASTXA009)
60 | <s> mast xa </s> (MASTXA010)
61 | <s> trajm ay teem </s> (TRAJMAYTEEM001)
62 | <s> trajm ay teem </s> (TRAJMAYTEEM002)
63 | <s> trajm ay teem </s> (TRAJMAYTEEM003)
64 | <s> trajm ay teem </s> (TRAJMAYTEEM004)
65 | <s> trajm ay teem </s> (TRAJMAYTEEM005)
66 | <s> trajm ay teem </s> (TRAJMAYTEEM006)
67 | <s> trajm ay teem </s> (TRAJMAYTEEM007)
68 | <s> trajm ay teem </s> (TRAJMAYTEEM008)
69 | <s> trajm ay teem </s> (TRAJMAYTEEM009)
70 | <s> trajm ay teem </s> (TRAJMAYTEEM010)
71 | <s> trowr laji </s> (TROWRLAJI001)
72 | <s> trowr laji </s> (TROWRLAJI002)
73 | <s> trowr laji </s> (TROWRLAJI003)
74 | <s> trowr laji </s> (TROWRLAJI004)
75 | <s> trowr laji </s> (TROWRLAJI005)
76 | <s> trowr laji </s> (TROWRLAJI006)
77 | <s> trowr laji </s> (TROWRLAJI007)
78 | <s> trowr laji </s> (TROWRLAJI008)
79 | <s> trowr laji </s> (TROWRLAJI009)
80 | <s> trowr laji </s> (TROWRLAJI010)
--------------------------------------------------------------------------------
/egs/diadiem/model/etc/vocab:
--------------------------------------------------------------------------------
1 | ## Vocab generated by v2 of the CMU-Cambridge Statistcal
2 | ## Language Modeling toolkit.
3 | ##
4 | ## Includes 18 words ##
5 | </s>
6 | <s>
7 | ay
8 | caf
9 | ddusng
10 | ka
11 | ke
12 | khasch
13 | khoong
14 | laji
15 | mast
16 | phee
17 | rao
18 | sajn
19 | teem
20 | trajm
21 | trowr
22 | xa
23 |
--------------------------------------------------------------------------------
/egs/diadiem/model/model_parameters/tmp.cd_cont_200/feat.params:
--------------------------------------------------------------------------------
1 | -lowerf 200
2 | -upperf 3500
3 | -nfilt 31
4 | -transform dct
5 | -lifter 22
6 | -feat 1s_c_d_dd
7 | -agc none
8 | -cmn batch
9 | -varnorm no
10 |
--------------------------------------------------------------------------------
/egs/diadiem/model/model_parameters/tmp.cd_cont_200/means:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/model/model_parameters/tmp.cd_cont_200/means
--------------------------------------------------------------------------------
/egs/diadiem/model/model_parameters/tmp.cd_cont_200/mixture_weights:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/model/model_parameters/tmp.cd_cont_200/mixture_weights
--------------------------------------------------------------------------------
/egs/diadiem/model/model_parameters/tmp.cd_cont_200/noisedict:
--------------------------------------------------------------------------------
1 | <s>                 SIL
2 | </s>                SIL
3 | <sil>               SIL
--------------------------------------------------------------------------------
/egs/diadiem/model/model_parameters/tmp.cd_cont_200/transition_matrices:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/model/model_parameters/tmp.cd_cont_200/transition_matrices
--------------------------------------------------------------------------------
/egs/diadiem/model/model_parameters/tmp.cd_cont_200/variances:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/model/model_parameters/tmp.cd_cont_200/variances
--------------------------------------------------------------------------------
/egs/diadiem/model/model_parameters/tmp.cd_cont_200_1/means:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/model/model_parameters/tmp.cd_cont_200_1/means
--------------------------------------------------------------------------------
/egs/diadiem/model/model_parameters/tmp.cd_cont_200_1/mixture_weights:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/model/model_parameters/tmp.cd_cont_200_1/mixture_weights
--------------------------------------------------------------------------------
/egs/diadiem/model/model_parameters/tmp.cd_cont_200_1/transition_matrices:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/model/model_parameters/tmp.cd_cont_200_1/transition_matrices
--------------------------------------------------------------------------------
/egs/diadiem/model/model_parameters/tmp.cd_cont_200_1/variances:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/model/model_parameters/tmp.cd_cont_200_1/variances
--------------------------------------------------------------------------------
/egs/diadiem/model/model_parameters/tmp.cd_cont_200_2/means:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/model/model_parameters/tmp.cd_cont_200_2/means
--------------------------------------------------------------------------------
/egs/diadiem/model/model_parameters/tmp.cd_cont_200_2/mixture_weights:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/model/model_parameters/tmp.cd_cont_200_2/mixture_weights
--------------------------------------------------------------------------------
/egs/diadiem/model/model_parameters/tmp.cd_cont_200_2/transition_matrices:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/model/model_parameters/tmp.cd_cont_200_2/transition_matrices
--------------------------------------------------------------------------------
/egs/diadiem/model/model_parameters/tmp.cd_cont_200_2/variances:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/model/model_parameters/tmp.cd_cont_200_2/variances
--------------------------------------------------------------------------------
/egs/diadiem/model/model_parameters/tmp.cd_cont_200_4/means:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/model/model_parameters/tmp.cd_cont_200_4/means
--------------------------------------------------------------------------------
/egs/diadiem/model/model_parameters/tmp.cd_cont_200_4/mixture_weights:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/model/model_parameters/tmp.cd_cont_200_4/mixture_weights
--------------------------------------------------------------------------------
/egs/diadiem/model/model_parameters/tmp.cd_cont_200_4/transition_matrices:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/model/model_parameters/tmp.cd_cont_200_4/transition_matrices
--------------------------------------------------------------------------------
/egs/diadiem/model/model_parameters/tmp.cd_cont_200_4/variances:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/model/model_parameters/tmp.cd_cont_200_4/variances
--------------------------------------------------------------------------------
/egs/diadiem/model/model_parameters/tmp.cd_cont_initial/means:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/model/model_parameters/tmp.cd_cont_initial/means
--------------------------------------------------------------------------------
/egs/diadiem/model/model_parameters/tmp.cd_cont_initial/mixture_weights:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/model/model_parameters/tmp.cd_cont_initial/mixture_weights
--------------------------------------------------------------------------------
/egs/diadiem/model/model_parameters/tmp.cd_cont_initial/transition_matrices:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/model/model_parameters/tmp.cd_cont_initial/transition_matrices
--------------------------------------------------------------------------------
/egs/diadiem/model/model_parameters/tmp.cd_cont_initial/variances:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/model/model_parameters/tmp.cd_cont_initial/variances
--------------------------------------------------------------------------------
/egs/diadiem/model/model_parameters/tmp.cd_cont_untied/feat.params:
--------------------------------------------------------------------------------
1 | -lowerf 200
2 | -upperf 3500
3 | -nfilt 31
4 | -transform dct
5 | -lifter 22
6 | -feat 1s_c_d_dd
7 | -agc none
8 | -cmn batch
9 | -varnorm no
10 |
--------------------------------------------------------------------------------
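Note: the feat.params above pins down the acoustic front-end: MFCCs from a 200-3500 Hz band (telephone-range audio) with 31 mel filters, liftered cepstra plus deltas and double-deltas (1s_c_d_dd), batch CMN, no AGC. A rough equivalent with the third-party python_speech_features package (an illustrative assumption; SphinxTrain does its own feature extraction, and numcep=13 is the Sphinx default, not stated in this file):

    import scipy.io.wavfile as wav
    from python_speech_features import mfcc, delta

    rate, signal = wav.read("CAFPHEE001.wav")      # 8 kHz corpus audio
    c = mfcc(signal, samplerate=rate,
             lowfreq=200, highfreq=3500,           # -lowerf / -upperf
             nfilt=31, numcep=13, ceplifter=22)    # -nfilt / -lifter
    d = delta(c, 2)                                # _d: first derivatives
    dd = delta(d, 2)                               # _dd: second derivatives
    # 1s_c_d_dd = one stream of [c, d, dd] per frame (39 dims)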
/egs/diadiem/model/model_parameters/tmp.cd_cont_untied/mdef:
--------------------------------------------------------------------------------
1 | # Generated by /usr/local/libexec/sphinxtrain/mk_mdef_gen on Fri Jan 5 10:29:45 2018
2 | 0.3
3 | 23 n_base
4 | 59 n_tri
5 | 328 n_state_map
6 | 246 n_tied_state
7 | 69 n_tied_ci_state
8 | 23 n_tied_tmat
9 | #
10 | # Columns definitions
11 | #base lft rt p attrib tmat ... state id's ...
12 | SIL - - - filler 0 0 1 2 N
13 | a - - - n/a 1 3 4 5 N
14 | c - - - n/a 2 6 7 8 N
15 | d - - - n/a 3 9 10 11 N
16 | e - - - n/a 4 12 13 14 N
17 | f - - - n/a 5 15 16 17 N
18 | g - - - n/a 6 18 19 20 N
19 | h - - - n/a 7 21 22 23 N
20 | i - - - n/a 8 24 25 26 N
21 | j - - - n/a 9 27 28 29 N
22 | k - - - n/a 10 30 31 32 N
23 | l - - - n/a 11 33 34 35 N
24 | m - - - n/a 12 36 37 38 N
25 | n - - - n/a 13 39 40 41 N
26 | o - - - n/a 14 42 43 44 N
27 | p - - - n/a 15 45 46 47 N
28 | r - - - n/a 16 48 49 50 N
29 | s - - - n/a 17 51 52 53 N
30 | t - - - n/a 18 54 55 56 N
31 | u - - - n/a 19 57 58 59 N
32 | w - - - n/a 20 60 61 62 N
33 | x - - - n/a 21 63 64 65 N
34 | y - - - n/a 22 66 67 68 N
35 | a c f i n/a 1 69 70 71 N
36 | a h s i n/a 1 72 73 74 N
37 | a k r e n/a 1 75 76 77 N
38 | a l j i n/a 1 78 79 80 N
39 | a m s i n/a 1 81 82 83 N
40 | a m y b n/a 1 84 85 86 N
41 | a r j i n/a 1 87 88 89 N
42 | a r o i n/a 1 90 91 92 N
43 | a s j i n/a 1 93 94 95 N
44 | a x SIL e n/a 1 96 97 98 N
45 | c SIL a b n/a 2 99 100 101 N
46 | c s h i n/a 2 102 103 104 N
47 | d SIL d b n/a 3 105 106 107 N
48 | d d u i n/a 3 108 109 110 N
49 | e e SIL e n/a 4 111 112 113 N
50 | e e m i n/a 4 114 115 116 N
51 | e h e i n/a 4 117 118 119 N
52 | e k SIL e n/a 4 120 121 122 N
53 | e t e i n/a 4 123 124 125 N
54 | f a p e n/a 5 126 127 128 N
55 | g n SIL e n/a 6 129 130 131 N
56 | h c s e n/a 7 132 133 134 N
57 | h k a i n/a 7 135 136 137 N
58 | h k o i n/a 7 138 139 140 N
59 | h p e i n/a 7 141 142 143 N
60 | i j SIL e n/a 8 144 145 146 N
61 | j a i i n/a 9 147 148 149 N
62 | j a m i n/a 9 150 151 152 N
63 | j a n i n/a 9 153 154 155 N
64 | k SIL a b n/a 10 156 157 158 N
65 | k SIL h b n/a 10 159 160 161 N
66 | k o e b n/a 10 162 163 164 N
67 | l r a b n/a 11 165 166 167 N
68 | m SIL a b n/a 12 168 169 170 N
69 | m e SIL e n/a 12 171 172 173 N
70 | m j a e n/a 12 174 175 176 N
71 | n j SIL e n/a 13 177 178 179 N
72 | n o g i n/a 13 180 181 182 N
73 | n s g i n/a 13 183 184 185 N
74 | o a k e n/a 14 186 187 188 N
75 | o h o i n/a 14 189 190 191 N
76 | o o n i n/a 14 192 193 194 N
77 | o r w i n/a 14 195 196 197 N
78 | p f h b n/a 15 198 199 200 N
79 | r a a b n/a 16 201 202 203 N
80 | r t a i n/a 16 204 205 206 N
81 | r t o i n/a 16 207 208 209 N
82 | r w l e n/a 16 210 211 212 N
83 | s a c i n/a 17 213 214 215 N
84 | s a t i n/a 17 216 217 218 N
85 | s h a b n/a 17 219 220 221 N
86 | s u n i n/a 17 222 223 224 N
87 | t SIL r b n/a 18 225 226 227 N
88 | t s x e n/a 18 228 229 230 N
89 | t y e b n/a 18 231 232 233 N
90 | u d s i n/a 19 234 235 236 N
91 | w o r i n/a 20 237 238 239 N
92 | x t a b n/a 21 240 241 242 N
93 | y a t e n/a 22 243 244 245 N
94 |
--------------------------------------------------------------------------------
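Note: the mdef above ties the model together: after the header counts (23 base phones, 59 seen triphones, 246 tied states), each row maps a phone in context (base, left, right, position) to a shared transition matrix and three emitting-state ids. A small sketch for pulling the header counts out of such a file (illustrative only):

    def mdef_counts(path):
        """Return the 'N name' header counts of a Sphinx mdef file."""
        counts = {}
        for line in open(path):
            parts = line.split()
            # header count lines look like '23 n_base'; this skips comments,
            # the version line ('0.3') and the per-phone rows
            if len(parts) == 2 and parts[0].isdigit():
                counts[parts[1]] = int(parts[0])
        return counts

    # mdef_counts("model_parameters/tmp.cd_cont_untied/mdef")
    # -> {'n_base': 23, 'n_tri': 59, 'n_state_map': 328, ...}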
/egs/diadiem/model/model_parameters/tmp.cd_cont_untied/means:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/model/model_parameters/tmp.cd_cont_untied/means
--------------------------------------------------------------------------------
/egs/diadiem/model/model_parameters/tmp.cd_cont_untied/mixture_weights:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/model/model_parameters/tmp.cd_cont_untied/mixture_weights
--------------------------------------------------------------------------------
/egs/diadiem/model/model_parameters/tmp.cd_cont_untied/noisedict:
--------------------------------------------------------------------------------
1 | <s> SIL
2 | </s> SIL
3 | <sil> SIL
--------------------------------------------------------------------------------
/egs/diadiem/model/model_parameters/tmp.cd_cont_untied/transition_matrices:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/model/model_parameters/tmp.cd_cont_untied/transition_matrices
--------------------------------------------------------------------------------
/egs/diadiem/model/model_parameters/tmp.cd_cont_untied/variances:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/model/model_parameters/tmp.cd_cont_untied/variances
--------------------------------------------------------------------------------
/egs/diadiem/model/model_parameters/tmp.ci_cont/feat.params:
--------------------------------------------------------------------------------
1 | -lowerf 200
2 | -upperf 3500
3 | -nfilt 31
4 | -transform dct
5 | -lifter 22
6 | -feat 1s_c_d_dd
7 | -agc none
8 | -cmn batch
9 | -varnorm no
10 |
--------------------------------------------------------------------------------
/egs/diadiem/model/model_parameters/tmp.ci_cont/mdef:
--------------------------------------------------------------------------------
1 | # Generated by /usr/local/libexec/sphinxtrain/mk_mdef_gen on Fri Jan 5 10:29:25 2018
2 | 0.3
3 | 23 n_base
4 | 0 n_tri
5 | 92 n_state_map
6 | 69 n_tied_state
7 | 69 n_tied_ci_state
8 | 23 n_tied_tmat
9 | #
10 | # Columns definitions
11 | #base lft rt p attrib tmat ... state id's ...
12 | SIL - - - filler 0 0 1 2 N
13 | a - - - n/a 1 3 4 5 N
14 | c - - - n/a 2 6 7 8 N
15 | d - - - n/a 3 9 10 11 N
16 | e - - - n/a 4 12 13 14 N
17 | f - - - n/a 5 15 16 17 N
18 | g - - - n/a 6 18 19 20 N
19 | h - - - n/a 7 21 22 23 N
20 | i - - - n/a 8 24 25 26 N
21 | j - - - n/a 9 27 28 29 N
22 | k - - - n/a 10 30 31 32 N
23 | l - - - n/a 11 33 34 35 N
24 | m - - - n/a 12 36 37 38 N
25 | n - - - n/a 13 39 40 41 N
26 | o - - - n/a 14 42 43 44 N
27 | p - - - n/a 15 45 46 47 N
28 | r - - - n/a 16 48 49 50 N
29 | s - - - n/a 17 51 52 53 N
30 | t - - - n/a 18 54 55 56 N
31 | u - - - n/a 19 57 58 59 N
32 | w - - - n/a 20 60 61 62 N
33 | x - - - n/a 21 63 64 65 N
34 | y - - - n/a 22 66 67 68 N
35 |
--------------------------------------------------------------------------------
/egs/diadiem/model/model_parameters/tmp.ci_cont/means:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/model/model_parameters/tmp.ci_cont/means
--------------------------------------------------------------------------------
/egs/diadiem/model/model_parameters/tmp.ci_cont/mixture_weights:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/model/model_parameters/tmp.ci_cont/mixture_weights
--------------------------------------------------------------------------------
/egs/diadiem/model/model_parameters/tmp.ci_cont/noisedict:
--------------------------------------------------------------------------------
1 | <s> SIL
2 | </s> SIL
3 | <sil> SIL
--------------------------------------------------------------------------------
/egs/diadiem/model/model_parameters/tmp.ci_cont/transition_matrices:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/model/model_parameters/tmp.ci_cont/transition_matrices
--------------------------------------------------------------------------------
/egs/diadiem/model/model_parameters/tmp.ci_cont/variances:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/model/model_parameters/tmp.ci_cont/variances
--------------------------------------------------------------------------------
/egs/diadiem/model/model_parameters/tmp.ci_cont_flatinitial/globalmean:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/model/model_parameters/tmp.ci_cont_flatinitial/globalmean
--------------------------------------------------------------------------------
/egs/diadiem/model/model_parameters/tmp.ci_cont_flatinitial/globalvar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/model/model_parameters/tmp.ci_cont_flatinitial/globalvar
--------------------------------------------------------------------------------
/egs/diadiem/model/model_parameters/tmp.ci_cont_flatinitial/means:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/model/model_parameters/tmp.ci_cont_flatinitial/means
--------------------------------------------------------------------------------
/egs/diadiem/model/model_parameters/tmp.ci_cont_flatinitial/mixture_weights:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/model/model_parameters/tmp.ci_cont_flatinitial/mixture_weights
--------------------------------------------------------------------------------
/egs/diadiem/model/model_parameters/tmp.ci_cont_flatinitial/transition_matrices:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/model/model_parameters/tmp.ci_cont_flatinitial/transition_matrices
--------------------------------------------------------------------------------
/egs/diadiem/model/model_parameters/tmp.ci_cont_flatinitial/variances:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/model/model_parameters/tmp.ci_cont_flatinitial/variances
--------------------------------------------------------------------------------
/egs/diadiem/model/text.py:
--------------------------------------------------------------------------------
1 | rules_1 = [
2 | "aàáảãạ",
3 | "ăằắẳẵặ",
4 | "âầấẩẫậ",
5 | "eèéẻẽẹ",
6 | "êềếểễệ",
7 | "iìíỉĩị",
8 | "oòóỏõọ",
9 | "ôồốổỗộ",
10 | "ơờớởỡợ",
11 | "uùúủũụ",
12 | "ưừứửữự",
13 | "yỳýỷỹỵ"
14 | ]
15 | rules_2 = [
16 | "awă",
17 | "aaâ",
18 | "eeê",
19 | "ooô",
20 | "owơ",
21 | "uwư",
22 | "ddđ"
23 | ]
24 | w2p = {}
25 | p2w = {}
26 | for words in rules_1:
27 | original = words[0]
28 | words = words[1:]
29 | for rule in rules_2:
30 | if original == rule[2]:
31 | original = rule[0:2]
32 | tones = "fsrxj"
33 | for i, w in enumerate(words):
34 | w2p[w] = original + tones[i]
35 | for rule in rules_2:
36 | w2p[rule[2]] = rule[0:2]
37 | for key, value in w2p.items():
38 | p2w[value] = key
39 |
40 |
41 | def word2phone(word):
42 | phone = ""
43 | for w in word:
44 | if w in w2p:
45 | phone += w2p[w]
46 | else:
47 | phone += w
48 | return phone
49 |
50 |
51 | def phone2word(phone):
52 | i = 0
53 | word = ""
54 | while i < len(phone):
55 | if phone[i:i+3] in p2w:
56 | p = phone[i:i+3]
57 | word += p2w[p]
58 | i += 3
59 | elif phone[i:i+2] in p2w:
60 | p = phone[i:i+2]
61 | word += p2w[p]
62 | i += 2
63 | else:
64 | p = phone[i:i+1]
65 | word += p
66 | i += 1
67 | return word
68 |
69 | if __name__ == '__main__':
70 | tests = [
71 | ("con hoẵng", "con hoawxng"),
72 | ("lựu đạn", "luwju ddajn"),
73 | ("kiểm tra", "kieerm tra"),
74 | ("ủy ban", "ury ban"),
75 | ("cà phê", "caf phee"),
76 | ("khách sạn", "khasch sajn"),
77 | ("đúng", "ddusng"),
78 | ("xã hội", "xax hooji")
79 | ]
80 | for test in tests:
81 | assert (test[0] == phone2word(test[1]))
82 | assert (test[1] == word2phone(test[0]))
83 |
--------------------------------------------------------------------------------
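Note: text.py implements a Telex-like romanization: each accented vowel is rewritten as its base letters (â->aa, ơ->ow, đ->dd, ...) plus a tone letter from "fsrxj" (huyền, sắc, hỏi, ngã, nặng), and phone2word greedily inverts the mapping, trying 3-character keys before 2-character ones. A quick interactive check, assuming it is run from egs/diadiem/model:

    >>> from text import word2phone, phone2word
    >>> word2phone("khách sạn")
    'khasch sajn'
    >>> phone2word("caf phee")
    'cà phê'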
/egs/diadiem/test/CAFPHEE001.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/CAFPHEE001.wav
--------------------------------------------------------------------------------
/egs/diadiem/test/CAFPHEE002.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/CAFPHEE002.wav
--------------------------------------------------------------------------------
/egs/diadiem/test/CAFPHEE003.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/CAFPHEE003.wav
--------------------------------------------------------------------------------
/egs/diadiem/test/CAFPHEE004.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/CAFPHEE004.wav
--------------------------------------------------------------------------------
/egs/diadiem/test/CAFPHEE005.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/CAFPHEE005.wav
--------------------------------------------------------------------------------
/egs/diadiem/test/CAFPHEE006.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/CAFPHEE006.wav
--------------------------------------------------------------------------------
/egs/diadiem/test/CAFPHEE007.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/CAFPHEE007.wav
--------------------------------------------------------------------------------
/egs/diadiem/test/CAFPHEE008.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/CAFPHEE008.wav
--------------------------------------------------------------------------------
/egs/diadiem/test/CAFPHEE009.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/CAFPHEE009.wav
--------------------------------------------------------------------------------
/egs/diadiem/test/CAFPHEE010.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/CAFPHEE010.wav
--------------------------------------------------------------------------------
/egs/diadiem/test/DDUSNG0001.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/DDUSNG0001.wav
--------------------------------------------------------------------------------
/egs/diadiem/test/DDUSNG0002.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/DDUSNG0002.wav
--------------------------------------------------------------------------------
/egs/diadiem/test/DDUSNG0003.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/DDUSNG0003.wav
--------------------------------------------------------------------------------
/egs/diadiem/test/DDUSNG0004.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/DDUSNG0004.wav
--------------------------------------------------------------------------------
/egs/diadiem/test/DDUSNG0005.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/DDUSNG0005.wav
--------------------------------------------------------------------------------
/egs/diadiem/test/DDUSNG0006.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/DDUSNG0006.wav
--------------------------------------------------------------------------------
/egs/diadiem/test/DDUSNG0007.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/DDUSNG0007.wav
--------------------------------------------------------------------------------
/egs/diadiem/test/DDUSNG0008.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/DDUSNG0008.wav
--------------------------------------------------------------------------------
/egs/diadiem/test/DDUSNG0009.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/DDUSNG0009.wav
--------------------------------------------------------------------------------
/egs/diadiem/test/DDUSNG0010.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/DDUSNG0010.wav
--------------------------------------------------------------------------------
/egs/diadiem/test/KARAOKE001.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/KARAOKE001.wav
--------------------------------------------------------------------------------
/egs/diadiem/test/KARAOKE002.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/KARAOKE002.wav
--------------------------------------------------------------------------------
/egs/diadiem/test/KARAOKE003.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/KARAOKE003.wav
--------------------------------------------------------------------------------
/egs/diadiem/test/KARAOKE004.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/KARAOKE004.wav
--------------------------------------------------------------------------------
/egs/diadiem/test/KARAOKE005.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/KARAOKE005.wav
--------------------------------------------------------------------------------
/egs/diadiem/test/KARAOKE006.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/KARAOKE006.wav
--------------------------------------------------------------------------------
/egs/diadiem/test/KARAOKE007.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/KARAOKE007.wav
--------------------------------------------------------------------------------
/egs/diadiem/test/KARAOKE008.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/KARAOKE008.wav
--------------------------------------------------------------------------------
/egs/diadiem/test/KARAOKE009.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/KARAOKE009.wav
--------------------------------------------------------------------------------
/egs/diadiem/test/KARAOKE010.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/KARAOKE010.wav
--------------------------------------------------------------------------------
/egs/diadiem/test/KHASCHSAJN001.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/KHASCHSAJN001.wav
--------------------------------------------------------------------------------
/egs/diadiem/test/KHASCHSAJN002.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/KHASCHSAJN002.wav
--------------------------------------------------------------------------------
/egs/diadiem/test/KHASCHSAJN003.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/KHASCHSAJN003.wav
--------------------------------------------------------------------------------
/egs/diadiem/test/KHASCHSAJN004.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/KHASCHSAJN004.wav
--------------------------------------------------------------------------------
/egs/diadiem/test/KHASCHSAJN005.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/KHASCHSAJN005.wav
--------------------------------------------------------------------------------
/egs/diadiem/test/KHASCHSAJN006.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/KHASCHSAJN006.wav
--------------------------------------------------------------------------------
/egs/diadiem/test/KHASCHSAJN007.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/KHASCHSAJN007.wav
--------------------------------------------------------------------------------
/egs/diadiem/test/KHASCHSAJN008.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/KHASCHSAJN008.wav
--------------------------------------------------------------------------------
/egs/diadiem/test/KHASCHSAJN009.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/KHASCHSAJN009.wav
--------------------------------------------------------------------------------
/egs/diadiem/test/KHASCHSAJN010.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/KHASCHSAJN010.wav
--------------------------------------------------------------------------------
/egs/diadiem/test/KHOONG0001.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/KHOONG0001.wav
--------------------------------------------------------------------------------
/egs/diadiem/test/KHOONG0002.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/KHOONG0002.wav
--------------------------------------------------------------------------------
/egs/diadiem/test/KHOONG0003.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/KHOONG0003.wav
--------------------------------------------------------------------------------
/egs/diadiem/test/KHOONG0004.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/KHOONG0004.wav
--------------------------------------------------------------------------------
/egs/diadiem/test/KHOONG0005.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/KHOONG0005.wav
--------------------------------------------------------------------------------
/egs/diadiem/test/KHOONG0006.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/KHOONG0006.wav
--------------------------------------------------------------------------------
/egs/diadiem/test/KHOONG0007.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/KHOONG0007.wav
--------------------------------------------------------------------------------
/egs/diadiem/test/KHOONG0008.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/KHOONG0008.wav
--------------------------------------------------------------------------------
/egs/diadiem/test/KHOONG0009.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/KHOONG0009.wav
--------------------------------------------------------------------------------
/egs/diadiem/test/KHOONG0010.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/KHOONG0010.wav
--------------------------------------------------------------------------------
/egs/diadiem/test/MASTXA001.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/MASTXA001.wav
--------------------------------------------------------------------------------
/egs/diadiem/test/MASTXA002.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/MASTXA002.wav
--------------------------------------------------------------------------------
/egs/diadiem/test/MASTXA003.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/MASTXA003.wav
--------------------------------------------------------------------------------
/egs/diadiem/test/MASTXA004.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/MASTXA004.wav
--------------------------------------------------------------------------------
/egs/diadiem/test/MASTXA005.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/MASTXA005.wav
--------------------------------------------------------------------------------
/egs/diadiem/test/MASTXA006.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/MASTXA006.wav
--------------------------------------------------------------------------------
/egs/diadiem/test/MASTXA007.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/MASTXA007.wav
--------------------------------------------------------------------------------
/egs/diadiem/test/MASTXA008.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/MASTXA008.wav
--------------------------------------------------------------------------------
/egs/diadiem/test/MASTXA009.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/MASTXA009.wav
--------------------------------------------------------------------------------
/egs/diadiem/test/MASTXA010.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/MASTXA010.wav
--------------------------------------------------------------------------------
/egs/diadiem/test/TRAJMAYTEEM001.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/TRAJMAYTEEM001.wav
--------------------------------------------------------------------------------
/egs/diadiem/test/TRAJMAYTEEM002.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/TRAJMAYTEEM002.wav
--------------------------------------------------------------------------------
/egs/diadiem/test/TRAJMAYTEEM003.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/TRAJMAYTEEM003.wav
--------------------------------------------------------------------------------
/egs/diadiem/test/TRAJMAYTEEM004.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/TRAJMAYTEEM004.wav
--------------------------------------------------------------------------------
/egs/diadiem/test/TRAJMAYTEEM005.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/TRAJMAYTEEM005.wav
--------------------------------------------------------------------------------
/egs/diadiem/test/TRAJMAYTEEM006.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/TRAJMAYTEEM006.wav
--------------------------------------------------------------------------------
/egs/diadiem/test/TRAJMAYTEEM007.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/TRAJMAYTEEM007.wav
--------------------------------------------------------------------------------
/egs/diadiem/test/TRAJMAYTEEM008.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/TRAJMAYTEEM008.wav
--------------------------------------------------------------------------------
/egs/diadiem/test/TRAJMAYTEEM009.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/TRAJMAYTEEM009.wav
--------------------------------------------------------------------------------
/egs/diadiem/test/TRAJMAYTEEM010.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/TRAJMAYTEEM010.wav
--------------------------------------------------------------------------------
/egs/diadiem/test/TROWRLAJI001.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/TROWRLAJI001.wav
--------------------------------------------------------------------------------
/egs/diadiem/test/TROWRLAJI002.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/TROWRLAJI002.wav
--------------------------------------------------------------------------------
/egs/diadiem/test/TROWRLAJI003.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/TROWRLAJI003.wav
--------------------------------------------------------------------------------
/egs/diadiem/test/TROWRLAJI004.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/TROWRLAJI004.wav
--------------------------------------------------------------------------------
/egs/diadiem/test/TROWRLAJI005.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/TROWRLAJI005.wav
--------------------------------------------------------------------------------
/egs/diadiem/test/TROWRLAJI006.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/TROWRLAJI006.wav
--------------------------------------------------------------------------------
/egs/diadiem/test/TROWRLAJI007.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/TROWRLAJI007.wav
--------------------------------------------------------------------------------
/egs/diadiem/test/TROWRLAJI008.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/TROWRLAJI008.wav
--------------------------------------------------------------------------------
/egs/diadiem/test/TROWRLAJI009.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/TROWRLAJI009.wav
--------------------------------------------------------------------------------
/egs/diadiem/test/TROWRLAJI010.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/diadiem/test/TROWRLAJI010.wav
--------------------------------------------------------------------------------
/egs/diadiem/test_model.py:
--------------------------------------------------------------------------------
1 | from model import transcript
2 | from os.path import join, dirname
3 | from unittest import TestCase
4 |
5 |
6 | class TestTranscript(TestCase):
7 | def test_1(self):
8 | wav = join(dirname(__file__), "test", "CAFPHEE001.wav")
9 | actual = transcript(wav)
10 | expected = "cà phê"
11 | self.assertEqual(actual, expected)
12 |
13 | def test_2(self):
14 | wav = join(dirname(__file__), "test", "KHASCHSAJN003.wav")
15 | actual = transcript(wav)
16 | expected = "khách sạn"
17 | self.assertEqual(actual, expected)
18 |
--------------------------------------------------------------------------------
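Note: the two cases above decode bundled recordings end to end, so they need a working pocketsphinx install and the trained model/ directory. One way to run them (a sketch, from inside egs/diadiem):

    import unittest
    unittest.main(module="test_model", exit=False)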
/egs/diadiem/text.py:
--------------------------------------------------------------------------------
1 | rules_1 = [
2 | "aàáảãạ",
3 | "ăằắẳẵặ",
4 | "âầấẩẫậ",
5 | "eèéẻẽẹ",
6 | "êềếểễệ",
7 | "iìíỉĩị",
8 | "oòóỏõọ",
9 | "ôồốổỗộ",
10 | "ơờớởỡợ",
11 | "uùúủũụ",
12 | "ưừứửữự",
13 | "yỳýỷỹỵ"
14 | ]
15 | rules_2 = [
16 | "awă",
17 | "aaâ",
18 | "eeê",
19 | "ooô",
20 | "owơ",
21 | "uwư",
22 | "ddđ"
23 | ]
24 | w2p = {}
25 | p2w = {}
26 | for words in rules_1:
27 | original = words[0]
28 | words = words[1:]
29 | for rule in rules_2:
30 | if original == rule[2]:
31 | original = rule[0:2]
32 | tones = "fsrxj"
33 | for i, w in enumerate(words):
34 | w2p[w] = original + tones[i]
35 | for rule in rules_2:
36 | w2p[rule[2]] = rule[0:2]
37 | for key, value in w2p.items():
38 | p2w[value] = key
39 |
40 |
41 | def word2phone(word):
42 | phone = ""
43 | for w in word:
44 | if w in w2p:
45 | phone += w2p[w]
46 | else:
47 | phone += w
48 | return phone
49 |
50 |
51 | def phone2word(phone):
52 | i = 0
53 | word = ""
54 | while i < len(phone):
55 | if phone[i:i+3] in p2w:
56 | p = phone[i:i+3]
57 | word += p2w[p]
58 | i += 3
59 | elif phone[i:i+2] in p2w:
60 | p = phone[i:i+2]
61 | word += p2w[p]
62 | i += 2
63 | else:
64 | p = phone[i:i+1]
65 | word += p
66 | i += 1
67 | return word
68 |
69 | if __name__ == '__main__':
70 | tests = [
71 | ("con hoẵng", "con hoawxng"),
72 | ("lựu đạn", "luwju ddajn"),
73 | ("kiểm tra", "kieerm tra"),
74 | ("ủy ban", "ury ban"),
75 | ("cà phê", "caf phee"),
76 | ("khách sạn", "khasch sajn"),
77 | ("đúng", "ddusng"),
78 | ("xã hội", "xax hooji")
79 | ]
80 | for test in tests:
81 | assert (test[0] == phone2word(test[1]))
82 | assert (test[1] == word2phone(test[0]))
83 |
--------------------------------------------------------------------------------
/egs/diadiem/train.py:
--------------------------------------------------------------------------------
1 | from extension.model import SphinxSpeechRecognition
2 | from extension.export import SphinxSpeechRecognitionExporter
3 | from load_data import corpus_folder
4 | from os.path import join, dirname
5 |
6 | tmp_folder = join(dirname(__file__), "tmp")
7 | export_folder = join(dirname(__file__), "model")
8 |
9 | model = SphinxSpeechRecognition(corpus_folder, tmp_folder)
10 | model.fit()
11 | SphinxSpeechRecognitionExporter.export(model, export_folder)
12 | # wav_file = join(tmp_folder, "etc", "wav", "train", "test", "CAFPHEE003.wav")
13 | # model.predict(wav_file)
14 |
--------------------------------------------------------------------------------
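Note: train.py only fits and exports; the transcript() used by the tests lives in model/. For reference, a hedged sketch of what decoding with the exported model could look like through the classic (5prealpha-era) pocketsphinx Python bindings -- the paths and the choice of the tmp.cd_cont_200 parameters are assumptions, not the repo's own wrapper:

    from pocketsphinx.pocketsphinx import Decoder

    config = Decoder.default_config()
    config.set_string("-hmm", "model/model_parameters/tmp.cd_cont_200")
    config.set_string("-lm", "model/etc/tmp.lm")
    config.set_string("-dict", "model/etc/tmp.dic")
    config.set_float("-samprate", 8000)            # corpus audio is 8 kHz
    decoder = Decoder(config)

    decoder.start_utt()
    # skip the 44-byte RIFF header; process_raw expects headerless PCM
    decoder.process_raw(open("test/CAFPHEE001.wav", "rb").read()[44:],
                        False, True)
    decoder.end_utt()
    print(decoder.hyp().hypstr)                    # e.g. "caf phee"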
/egs/vivos/README.md:
--------------------------------------------------------------------------------
1 | /home/anhv/anaconda3/envs/automatic_speech_recognition/bin/python /home/anhv/PycharmProjects/undertheseanlp/automatic_speech_recognition/egs/vivos/train.py --kaldi_folder /home/anhv/PycharmProjects/kaldi-trunk --corpus_folder /home/anhv/PycharmProjects/undertheseanlp/automatic_speech_recognition/data/vivos/corpus --nj 10 --method lda_mllt
2 |
3 | ===== Time Report =====
4 | Mono
5 | 9:25
6 | 0:0
7 | 0:25
8 | Tri1
9 | 2:38
10 | 0:0
11 | 0:24
12 | Tri2a
13 | 2:38
14 | 0:0
15 | 0:24
16 | Tri3a
17 | 2:52
18 | 24:16
19 | 0:51
20 | Total time:
21 | 44:21
22 |
23 |
24 | ===== Score Report =====
25 | Best WER
26 | %WER 79.80 [ 25926 / 32487, 245 ins, 5587 del, 20094 sub ] exp/tri3a/decode/wer_12
--------------------------------------------------------------------------------
/egs/vivos/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/vivos/__init__.py
--------------------------------------------------------------------------------
/egs/vivos/analyze.py:
--------------------------------------------------------------------------------
1 | from model import transcript
2 | from os.path import join, dirname
3 | from extension.analyze import WERAnalyzeLogger
4 |
5 | corpus_folder = join(dirname(dirname(dirname(__file__))), "data", "vivos",
6 | "corpus")
7 |
8 |
9 | def load_test():
10 | lines = open(join(corpus_folder, "test", "text")).read().splitlines()
11 | lines = [line.split("|") for line in lines]
12 | wavs = [line[0] for line in lines]
13 | wavs = ["{}/test/wav/{}.wav".format(corpus_folder, wav) for wav in wavs]
14 | texts = [line[1] for line in lines]
15 | return wavs, texts
16 |
17 |
18 | wavs_test, texts_test = load_test()
19 | # texts_pred = [""] * len(texts_test)
20 | texts_pred = [transcript(wav_file) for wav_file in wavs_test]
21 |
22 | log_folder = join(dirname(__file__), "analyze")
23 |
24 | WERAnalyzeLogger.log(wavs_test, texts_test, texts_pred, log_folder=log_folder)
--------------------------------------------------------------------------------
/egs/vivos/extension/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/vivos/extension/__init__.py
--------------------------------------------------------------------------------
/egs/vivos/extension/analyze.py:
--------------------------------------------------------------------------------
1 | import json
2 | import shutil
3 | from extension.metrics import calculate_wer
4 | from os.path import join, basename
5 | import os
6 | from underthesea.util.file_io import write
7 | import numpy as np
8 |
9 |
10 | class WERAnalyzeLogger:
11 | @staticmethod
12 | def log(wavs_test, texts_test, texts_pred, log_folder):
13 | wer = np.mean([calculate_wer(test.split(), pred.split())
14 | for test, pred in zip(texts_test, texts_pred)])
15 | wer = np.round(wer, 4)
16 | result = {
17 | "WER": wer
18 | }
19 | content = json.dumps(result, ensure_ascii=False)
20 | log_file = join(log_folder, "result.json")
21 | write(log_file, content)
22 | wav_folder = join(log_folder, "wav")
23 | try:
24 | shutil.rmtree(wav_folder)
25 | except:
26 | pass
27 | finally:
28 | os.mkdir(wav_folder)
29 | for wav in wavs_test:
30 | new_path = join(wav_folder, basename(wav))
31 | shutil.copyfile(wav, new_path)
32 | wavs_test_new_path = [join("wav", basename(wav)) for wav in wavs_test]
33 | speech_recognition = {
34 | "texts_test": texts_test,
35 | "texts_pred": texts_pred,
36 | "wavs_test": wavs_test_new_path,
37 | }
38 | content = json.dumps(speech_recognition, ensure_ascii=False)
39 | log_file = join(log_folder, "speechrecognition.json")
40 | write(log_file, content)
41 |
42 | print("Result is written in {}".format(log_file))
43 | print("WER: {}%".format(wer * 100))
44 |
--------------------------------------------------------------------------------
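Note: WERAnalyzeLogger.log computes corpus WER as the mean of per-utterance WERs, copies the test wavs next to the log, and dumps two JSON files for inspection. A minimal illustrative call (paths hypothetical; log_folder is assumed to exist):

    WERAnalyzeLogger.log(
        wavs_test=["corpus/test/wav/CAFPHEE001.wav"],
        texts_test=["cà phê"],
        texts_pred=["cà phê"],
        log_folder="analyze")
    # -> analyze/result.json             {"WER": 0.0}
    # -> analyze/wav/CAFPHEE001.wav      copied audio
    # -> analyze/speechrecognition.json  texts plus relative wav paths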
/egs/vivos/extension/cmd.sh:
--------------------------------------------------------------------------------
1 | # Setting local system jobs (local CPU - no external clusters)
2 | export train_cmd=run.pl
3 | export decode_cmd=run.pl
--------------------------------------------------------------------------------
/egs/vivos/extension/export.py:
--------------------------------------------------------------------------------
1 | import shutil
2 | from os.path import join
3 |
4 |
5 | class SphinxSpeechRecognitionExporter:
6 | @staticmethod
7 | def export(model, export_folder):
8 | tmp_folder = model.tmp_folder
9 | try:
10 | shutil.rmtree(join(export_folder, "etc"))
11 | except:
12 | pass
13 | finally:
14 | shutil.copytree(join(tmp_folder, "etc"),
15 | join(export_folder, "etc"))
16 |
17 | try:
18 | shutil.rmtree(join(export_folder, "model_parameters"))
19 | except:
20 | pass
21 | finally:
22 | shutil.copytree(join(tmp_folder, "model_parameters"),
23 | join(export_folder, "model_parameters"))
24 |
--------------------------------------------------------------------------------
/egs/vivos/extension/metrics.py:
--------------------------------------------------------------------------------
1 | def calculate_wer(reference, hypothesis):
2 | """
3 | Word error rate (WER) via Levenshtein distance, normalized by
4 | the reference length: edit_distance / len(reference).
5 | Works only for iterables up to 254 elements (uint8 table).
6 | O(nm) time and space complexity; reference must be non-empty.
7 |
8 | Examples (edit distance -> WER):
9 | "who is there" vs "is there" -> 1/3
10 | "who is there" vs "who was there" -> 1/3
11 | "who is there" vs "" -> 3/3 = 1.0
12 | "is there" vs "who is there" -> 1/2
13 | """
14 | # initialisation
15 | import numpy
16 | d = numpy.zeros((len(reference) + 1) * (len(hypothesis) + 1),
17 | dtype=numpy.uint8)
18 | d = d.reshape((len(reference) + 1, len(hypothesis) + 1))
19 | for i in range(len(reference) + 1):
20 | for j in range(len(hypothesis) + 1):
21 | if i == 0:
22 | d[0][j] = j
23 | elif j == 0:
24 | d[i][0] = i
25 |
26 | # computation
27 | for i in range(1, len(reference) + 1):
28 | for j in range(1, len(hypothesis) + 1):
29 | if reference[i - 1] == hypothesis[j - 1]:
30 | d[i][j] = d[i - 1][j - 1]
31 | else:
32 | substitution = d[i - 1][j - 1] + 1
33 | insertion = d[i][j - 1] + 1
34 | deletion = d[i - 1][j] + 1
35 | d[i][j] = min(substitution, insertion, deletion)
36 |
37 | return d[len(reference)][len(hypothesis)] / float(len(reference))
38 |
39 |
40 | import unittest
41 | assertions = unittest.TestCase('__init__')
42 |
43 | if __name__ == '__main__':
44 | s = calculate_wer("khach san".split(), "khach san cua toi".split())
45 | assertions.assertAlmostEqual(s, 1)
46 | s = calculate_wer("khach san cua".split(), "khach san cua toi".split())
47 | assertions.assertAlmostEqual(s, 0.333, 3)
48 |
--------------------------------------------------------------------------------
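Note: the uint8 DP table above silently wraps once an edit distance exceeds 255, which is why the docstring caps inputs at 254 elements. A hedged variant without that cap, rolling two rows of plain Python ints instead of the full numpy matrix:

    def calculate_wer_long(reference, hypothesis):
        # same Levenshtein DP as calculate_wer, without the uint8 limit
        prev = list(range(len(hypothesis) + 1))
        for i, ref in enumerate(reference, 1):
            cur = [i] + [0] * len(hypothesis)
            for j, hyp in enumerate(hypothesis, 1):
                cur[j] = min(prev[j - 1] + (ref != hyp),   # match/substitute
                             cur[j - 1] + 1,               # insertion
                             prev[j] + 1)                  # deletion
            prev = cur
        return prev[-1] / float(len(reference))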
/egs/vivos/extension/model_sphinx.py:
--------------------------------------------------------------------------------
1 | import shutil
2 | import os
3 | import text
4 |
5 | N = 10000
6 |
7 |
8 | class SphinxSpeechRecognition:
9 | def __init__(self, corpus_folder, tmp_folder):
10 | print("Initializing Sphinx Speech Recognition")
11 | self.corpus_folder = corpus_folder
12 | self.tmp_folder = tmp_folder
13 | try:
14 | shutil.rmtree(tmp_folder)
15 | except Exception as e:
16 | pass
17 | finally:
18 | os.mkdir(tmp_folder)
19 | os.system("cd {}; sphinxtrain -t tmp setup".format(tmp_folder))
20 | self._init_data()
21 | self._change_config()
22 | self._make_transcription()
23 | self._make_dictionary()
24 | self._make_filler()
25 | self._make_language_model()
26 |
27 | # ========================== #
28 | # Init Data
29 | # ========================== #
30 | def _init_data(self):
31 | os.system("cd {}; mkdir wav".format(self.tmp_folder))
32 | os.system("cd {}; mkdir wav/train".format(self.tmp_folder))
33 | os.system("cd {}; mkdir wav/test".format(self.tmp_folder))
34 |
35 | ids = open(
36 | "{}/train/text".format(self.corpus_folder)).read().splitlines()[:N]
37 | ids = [item.split("|")[0] for item in ids]
38 | for id in ids:
39 | shutil.copy2(
40 | "{}/train/wav/{}.wav".format(self.corpus_folder, id),
41 | "{}/wav/train/{}.wav".format(self.tmp_folder, id)
42 | )
43 |
44 | ids = ["train/{}".format(id) for id in ids]
45 | ids.append("")
46 | content = "\n".join(ids)
47 | open(os.path.join(self.tmp_folder, "etc", "tmp_train.fileids"),
48 | "w").write(content)
49 |
50 | ids = open(
51 | "{}/test/text".format(self.corpus_folder)).read().splitlines()
52 | ids = [item.split("|")[0] for item in ids]
53 | for id in ids:
54 | shutil.copy2(
55 | "{}/test/wav/{}.wav".format(self.corpus_folder, id),
56 | "{}/wav/test/{}.wav".format(self.tmp_folder, id)
57 | )
58 | ids = ["test/{}".format(id) for id in ids]
59 | ids.append("")
60 | content = "\n".join(ids)
61 | open(os.path.join(self.tmp_folder, "etc", "tmp_test.fileids"),
62 | "w").write(content)
63 |
64 | # ========================== #
65 | # Config
66 | # ========================== #
67 | def _change_config(self):
68 | config_file = os.path.join(self.tmp_folder, "etc", "sphinx_train.cfg")
69 | config = SphinxConfig(config_file)
70 | config.set("$CFG_BASE_DIR", "\".\"")
71 | config.set("$CFG_WAVFILE_SRATE", 8000.0)
72 | config.set("$CFG_NUM_FILT", 31)
73 | config.set("$CFG_LO_FILT", 200)
74 | config.set("$CFG_HI_FILT", 3500)
75 | config.set("$CFG_WAVFILE_TYPE", "'raw'")
76 | config.set("$CFG_LANGUAGEMODEL",
77 | "\"$CFG_LIST_DIR/$CFG_DB_NAME.lm\"")
78 | config.set("$DEC_CFG_LANGUAGEMODEL",
79 | "\"$CFG_BASE_DIR/etc/${CFG_DB_NAME}.lm\"")
80 |
81 | # ========================== #
82 | # Transcription
83 | # ========================== #
84 | def _convert_transcription(self, in_file, out_file):
85 | lines = open(in_file).read().splitlines()[:N]
86 | output = []
87 | for line in lines:
88 | fileid, word = line.split("|")
89 | phone = text.word2phone(word)
90 |             content = "<s> {} </s> ({})".format(phone, fileid)
91 | output.append(content)
92 | output.append("")
93 | content = "\n".join(output)
94 | open(out_file, "w").write(content)
95 |
96 | def _make_transcription(self):
97 | self._convert_transcription(
98 | "{}/train/text".format(self.corpus_folder),
99 | "{}/etc/tmp_train.transcription".format(self.tmp_folder))
100 | self._convert_transcription(
101 | "{}/test/text".format(self.corpus_folder),
102 | "{}/etc/tmp_test.transcription".format(self.tmp_folder))
103 |
104 | # ============================== #
105 | # Create dictionary and phones
106 | # ============================== #
107 | def _make_dictionary(self):
108 | lines = open(
109 | "{}/train/text".format(self.corpus_folder)).read().splitlines()[:N]
110 | phones = []
111 | for line in lines:
112 | fileid, word = line.split("|")
113 | p = text.word2phone(word).split()
114 | phones += p
115 | phones = sorted(set(phones))
116 | # create .dic files
117 | lines = []
118 | phone_units = []
119 | for p in phones:
120 | units = list(p)
121 | phone_units += units
122 | units = " ".join(units)
123 | line = "{:20s}{}".format(p, units)
124 | lines.append(line)
125 | open("{}/etc/tmp.dic".format(self.tmp_folder), "w").write(
126 | "\n".join(lines))
127 | phone_units = sorted(set(phone_units))
128 | phone_units.append("SIL")
129 | open("{}/etc/tmp.phone".format(self.tmp_folder), "w").write(
130 | "\n".join(phone_units))
131 |
132 | def _make_filler(self):
133 |         fillers = ["<s>", "</s>", "<sil>"]
134 | lines = ["{:20s}SIL".format(f) for f in fillers]
135 | open("{}/etc/tmp.filler".format(self.tmp_folder), "w").write(
136 | "\n".join(lines))
137 |
138 | # ========================== #
139 | # Language Model
140 | # ========================== #
141 | def _make_cleaned_text(self):
142 | in_file = "{}/train/text".format(self.corpus_folder)
143 | out_file = "{}/etc/text".format(self.tmp_folder)
144 | lines = open(in_file).read().splitlines()[:N]
145 | output = []
146 | for line in lines:
147 | fileid, word = line.split("|")
148 | phone = text.word2phone(word)
149 |             content = "<s> {} </s>".format(phone)
150 | output.append(content)
151 | content = "\n".join(output)
152 | open(out_file, "w").write(content)
153 |
154 | def _make_language_model(self):
155 | self._make_cleaned_text()
156 | etc_folder = os.path.join(self.tmp_folder, "etc")
157 | chdir = "cd {}; ".format(etc_folder)
158 | os.system(chdir + "text2wfreq < text | wfreq2vocab > vocab")
159 | os.system(chdir + "text2idngram -vocab vocab -idngram idngram < text")
160 | os.system(
161 | chdir + "idngram2lm -vocab_type 0 -idngram idngram -vocab vocab -arpa tmp.lm")
162 |
163 | def fit(self):
164 | chdir = "cd {}; ".format(self.tmp_folder)
165 | os.system(chdir + "sphinxtrain run")
166 |
167 | def predict(self, wav_file):
168 | command = "pocketsphinx_continuous -hmm {}/model_parameters/tmp.cd_cont_200 -samprate 8000 -lm {}/etc/tmp.lm -dict {}/etc/tmp.dic -infile {} -logfn yes".format(
169 | self.tmp_folder, self.tmp_folder, self.tmp_folder, wav_file)
170 | output = os.popen(command).read().strip()
171 | output = text.phone2word(output)
172 | return output
173 |
174 |
175 | class SphinxConfig:
176 | def __init__(self, config_file):
177 | self.file = config_file
178 | self.lines = open(config_file).read().splitlines()
179 |
180 | def save(self):
181 | content = "\n".join(self.lines)
182 | open(self.file, "w").write(content)
183 |
184 | def set(self, key, value):
185 | for i, line in enumerate(self.lines):
186 | if line.startswith(key):
187 | content = "{} = {};".format(key, value)
188 | self.lines[i] = content
189 | self.save()
190 |
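A hypothetical driver for the class above, showing the intended call order (the constructor already runs "sphinxtrain -t tmp setup" and writes the etc/ files). The corpus layout is the one _init_data expects: {corpus}/train/text with "<fileid>|<transcript>" lines and matching {corpus}/train/wav/<fileid>.wav files (likewise for test/). The corpus path mirrors load_data.py; the wav file name is made up:

    from model_sphinx import SphinxSpeechRecognition

    model = SphinxSpeechRecognition(corpus_folder="data/vivos/corpus",
                                    tmp_folder="tmp")
    model.fit()  # runs "sphinxtrain run" inside tmp/
    print(model.predict("tmp/wav/test/VIVOSDEV01_R001.wav"))  # hypothetical file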
--------------------------------------------------------------------------------
/egs/vivos/extension/path.sh:
--------------------------------------------------------------------------------
1 | # Defining Kaldi root directory
2 | export KALDI_ROOT=`pwd`/../..
3 |
4 | # Setting paths to useful tools
5 | export PATH=$PWD/utils/:$KALDI_ROOT/src/bin:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/src/fstbin/:$KALDI_ROOT/src/gmmbin/:$KALDI_ROOT/src/featbin/:$KALDI_ROOT/src/lmbin/:$KALDI_ROOT/src/sgmm2bin/:$KALDI_ROOT/src/fgmmbin/:$KALDI_ROOT/src/latbin/:$PWD:$PATH
6 |
7 | # Defining audio data directory (modify it for your installation directory!)
8 | export DATA_ROOT=`pwd`/audio
9 |
10 | # Enable SRILM
11 | . $KALDI_ROOT/tools/env.sh
12 |
13 | # Variable needed for proper data sorting
14 | export LC_ALL=C
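The LC_ALL=C export is not cosmetic: Kaldi's table code expects wav.scp, text, utt2spk, and friends to be sorted in C-locale (plain byte) order, and a locale-aware sort can disagree with that once IDs mix upper and lower case. A quick Python illustration (sorted() compares code points, i.e. byte order for ASCII, like sort under LC_ALL=C):

    ids = ["spk2_utt1", "SPK10_utt1", "SPK1_utt1"]
    print(sorted(ids))  # ['SPK10_utt1', 'SPK1_utt1', 'spk2_utt1'] -- uppercase before lowercase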
--------------------------------------------------------------------------------
/egs/vivos/extension/run_deltadelta.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | . ./path.sh || exit 1
4 | . ./cmd.sh || exit 1
5 |
6 | EXP_START=$(date +%s);
7 |
8 | nj=1 # number of parallel jobs
9 | lm_order=1 # language model order (n-gram quantity)
10 |
11 | # Safety mechanism (possible running this script with modified arguments)
12 | . utils/parse_options.sh || exit 1
13 | [[ $# -ge 1 ]] && { echo "Wrong arguments!"; exit 1; }
14 |
15 | # Removing previously created data (from last run.sh execution)
16 | rm -rf exp mfcc data/train/spk2utt data/train/cmvn.scp data/train/feats.scp data/train/split1 data/test/spk2utt data/test/cmvn.scp data/test/feats.scp data/test/split1 data/local/lang data/lang data/local/tmp data/local/dict/lexiconp.txt
17 |
18 |
19 |
20 | echo
21 | echo "===== PREPARING ACOUSTIC DATA ====="
22 | echo
23 |
24 | # Needs to be prepared by hand (or using self written scripts):
25 | #
26 | # spk2gender  [<speaker-id> <gender>]
27 | # wav.scp     [<utterance-id> <full_path_to_audio_file>]
28 | # text        [<utterance-id> <text_transcription>]
29 | # utt2spk     [<utterance-id> <speaker-id>]
30 | # corpus.txt  [<text_transcription>]
31 |
32 | # Making spk2utt files
33 | utils/utt2spk_to_spk2utt.pl data/train/utt2spk > data/train/spk2utt
34 | utils/utt2spk_to_spk2utt.pl data/test/utt2spk > data/test/spk2utt
35 |
36 |
37 | echo
38 | echo "===== FEATURES EXTRACTION ====="
39 | echo
40 |
41 | # Making feats.scp files
42 | mfccdir=mfcc
43 | # Uncomment and modify arguments in scripts below if you have any problems with data sorting
44 | # utils/validate_data_dir.sh data/train # script for checking prepared data - here: for data/train directory
45 | # utils/fix_data_dir.sh data/train # tool for data proper sorting if needed - here: for data/train directory
46 | steps/make_mfcc.sh --nj $nj --cmd "$train_cmd" data/train exp/make_mfcc/train $mfccdir
47 | steps/make_mfcc.sh --nj $nj --cmd "$train_cmd" data/test exp/make_mfcc/test $mfccdir
48 |
49 |
50 | # Making cmvn.scp files
51 | steps/compute_cmvn_stats.sh data/train exp/make_mfcc/train $mfccdir
52 | steps/compute_cmvn_stats.sh data/test exp/make_mfcc/test $mfccdir
53 |
54 | echo
55 | echo "===== PREPARING LANGUAGE DATA ====="
56 | echo
57 |
58 | # Needs to be prepared by hand (or using self written scripts):
59 | #
60 | # lexicon.txt           [<word> <phone 1> <phone 2> ...]
61 | # nonsilence_phones.txt [<phone>]
62 | # silence_phones.txt    [<phone>]
63 | # optional_silence.txt  [<phone>]
64 |
65 | # Preparing language data
66 | utils/prepare_lang.sh data/local/dict "<UNK>" data/local/lang data/lang
67 |
68 | echo
69 | echo "===== LANGUAGE MODEL CREATION ====="
70 | echo "===== MAKING lm.arpa ====="
71 | echo
72 |
73 | loc=`which ngram-count`;
74 | if [ -z $loc ]; then
75 | if uname -a | grep 64 >/dev/null; then
76 | sdir=$KALDI_ROOT/tools/srilm/bin/i686-m64
77 | else
78 | sdir=$KALDI_ROOT/tools/srilm/bin/i686
79 | fi
80 | if [ -f $sdir/ngram-count ]; then
81 | echo "Using SRILM language modelling tool from $sdir"
82 | export PATH=$PATH:$sdir
83 | else
84 | echo "SRILM toolkit is probably not installed.
85 | Instructions: tools/install_srilm.sh"
86 | exit 1
87 | fi
88 | fi
89 |
90 | local=data/local
91 | mkdir $local/tmp
92 | ngram-count -order $lm_order -write-vocab $local/tmp/vocab-full.txt -wbdiscount -text $local/corpus.txt -lm $local/tmp/lm.arpa
93 |
94 | echo
95 | echo "===== MAKING G.fst ====="
96 | echo
97 |
98 | lang=data/lang
99 | arpa2fst --disambig-symbol=#0 --read-symbol-table=$lang/words.txt $local/tmp/lm.arpa $lang/G.fst
100 |
101 | echo
102 | echo "===== MONO TRAINING ====="
103 | echo
104 |
105 | START=$(date +%s);
106 | steps/train_mono.sh --nj $nj \
107 | --cmd "$train_cmd" data/train data/lang exp/mono || exit 1
108 | END=$(date +%s);
109 | MONO_TRAINING_TIME=$((END - START))
110 |
111 | echo
112 | echo "===== MONO DECODING ====="
113 | echo
114 |
115 | START=$(date +%s);
116 | utils/mkgraph.sh --mono data/lang exp/mono exp/mono/graph || exit 1
117 | # steps/decode.sh --config conf/decode.config --nj 1 --cmd "$decode_cmd" \
118 | # exp/mono/graph data/test exp/mono/decode
119 | END=$(date +%s);
120 | MONO_DECODING_TIME=$((END - START))
121 |
122 | echo
123 | echo "===== MONO ALIGNMENT ====="
124 | echo
125 |
126 | START=$(date +%s);
127 | steps/align_si.sh --nj $nj --cmd "$train_cmd" \
128 | data/train data/lang exp/mono exp/mono_ali || exit 1
129 | END=$(date +%s);
130 | MONO_ALIGNMENT_TIME=$((END - START))
131 |
132 | echo
133 | echo "===== TRI1 (first triphone pass) TRAINING ====="
134 | echo
135 |
136 | START=$(date +%s);
137 | steps/train_deltas.sh --cmd "$train_cmd" 2500 20000 \
138 | data/train data/lang exp/mono_ali exp/tri1 || exit 1
139 | END=$(date +%s);
140 | TRI1_TRAINING_TIME=$((END - START))
141 |
142 | echo
143 | echo "===== TRI1 (first triphone pass) DECODING ====="
144 | echo
145 |
146 | START=$(date +%s);
147 | utils/mkgraph.sh data/lang exp/tri1 exp/tri1/graph || exit 1
148 | # steps/decode.sh --config conf/decode.config --nj 1 --cmd "$decode_cmd" \
149 | # exp/tri1/graph data/test exp/tri1/decode
150 | END=$(date +%s);
151 | TRI1_DECODING_TIME=$((END - START))
152 |
153 | echo
154 | echo "===== TRI1 ALIGNMENT ====="
155 | echo
156 |
157 | START=$(date +%s);
158 | steps/align_si.sh --nj $nj --cmd "$train_cmd" \
159 | data/train data/lang exp/tri1 exp/tri1_ali || exit 1;
160 | END=$(date +%s);
161 | TRI1_ALIGNMENT_TIME=$((END - START))
162 |
163 | echo
164 | echo "===== TRI2A TRAINING ====="
165 | echo
166 |
167 | START=$(date +%s);
168 | steps/train_deltas.sh --cmd "$train_cmd" 2500 20000 \
169 | data/train data/lang exp/tri1_ali exp/tri2a || exit 1
170 | END=$(date +%s);
171 | TRI2A_TRAINING_TIME=$((END - START))
172 |
173 | echo
174 | echo "===== TRI2A DECODING ====="
175 | echo
176 |
177 | START=$(date +%s);
178 | utils/mkgraph.sh data/lang exp/tri2a exp/tri2a/graph || exit 1
179 | steps/decode.sh --config conf/decode.config --nj 1 --cmd "$decode_cmd" \
180 | exp/tri2a/graph data/test exp/tri2a/decode
181 | END=$(date +%s);
182 | TRI2A_DECODING_TIME=$((END - START))
183 |
184 | echo
185 | echo "===== TRI2A ALIGNMENT ====="
186 | echo
187 |
188 | START=$(date +%s);
189 | steps/align_si.sh --nj $nj --cmd "$train_cmd" \
190 | data/train data/lang exp/tri2a exp/tri2a_ali || exit 1;
191 | END=$(date +%s);
192 | TRI2A_ALIGNMENT_TIME=$((END - START))
193 |
194 | echo
195 | echo "===== run.sh script is finished ====="
196 | echo
197 |
198 | EXP_END=$(date +%s);
199 | EXP_TIME=$((EXP_END - EXP_START))
200 |
201 | log_file='exp.log'
202 | echo "" > $log_file
203 | echo "===== Time Report =====" >> $log_file
204 | echo "Mono" >> $log_file
205 | echo $MONO_TRAINING_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file
206 | echo $MONO_DECODING_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file
207 | echo $MONO_ALIGNMENT_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file
208 |
209 | echo "Tri1" >> $log_file
210 | echo $TRI1_TRAINING_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file
211 | echo $TRI1_DECODING_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file
212 | echo $TRI1_ALIGNMENT_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file
213 |
214 | echo "Tri2a" >> $log_file
215 | echo $TRI2A_TRAINING_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file
216 | echo $TRI2A_DECODING_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file
217 | echo $TRI2A_ALIGNMENT_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file
218 |
219 | echo "Total time:" >> $log_file
220 | echo $EXP_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file
221 |
222 | echo -e "\n" >> $log_file
223 | echo "===== Score Report =====" >> $log_file
224 | echo "Best WER" >> $log_file
225 | for x in exp/*/decode*; do [ -d $x ] && [[ $x =~ "$1" ]] && grep WER $x/wer_* | utils/best_wer.sh; done >> $log_file
226 |
227 | echo -e "\n" >> $log_file
228 |
229 | cat $log_file
230 |
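The "prepared by hand" files at the top of this script can be generated from the corpus format the Sphinx recipe already uses ({corpus}/{split}/text with "<fileid>|<transcript>" lines and {corpus}/{split}/wav/<fileid>.wav). A rough sketch, assuming VIVOS-style "<speaker>_<utterance>" fileids; the helper name and output layout are illustrative, and spk2utt is still derived afterwards by utils/utt2spk_to_spk2utt.pl as above:

    import os

    def write_kaldi_data_dir(corpus, split, out_dir):
        os.makedirs(out_dir, exist_ok=True)
        lines = open(os.path.join(corpus, split, "text")).read().splitlines()
        pairs = sorted(line.split("|", 1) for line in lines if line)  # keep ids C-locale sorted
        with open(os.path.join(out_dir, "wav.scp"), "w") as wav_scp, \
             open(os.path.join(out_dir, "text"), "w") as text_file, \
             open(os.path.join(out_dir, "utt2spk"), "w") as utt2spk:
            for fileid, transcript in pairs:
                speaker = fileid.split("_")[0]  # speaker prefix before "_"
                wav_scp.write("{} {}/{}/wav/{}.wav\n".format(fileid, corpus, split, fileid))
                text_file.write("{} {}\n".format(fileid, transcript))
                utt2spk.write("{} {}\n".format(fileid, speaker))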
--------------------------------------------------------------------------------
/egs/vivos/extension/run_lda_mllt.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | . ./path.sh || exit 1
4 | . ./cmd.sh || exit 1
5 |
6 | EXP_START=$(date +%s);
7 |
8 | nj=1 # number of parallel jobs
9 | lm_order=1 # language model order (n-gram quantity)
10 |
11 | # Safety mechanism (possible running this script with modified arguments)
12 | . utils/parse_options.sh || exit 1
13 | [[ $# -ge 1 ]] && { echo "Wrong arguments!"; exit 1; }
14 |
15 | # Removing previously created data (from last run.sh execution)
16 | rm -rf exp mfcc data/train/spk2utt data/train/cmvn.scp data/train/feats.scp data/train/split1 data/test/spk2utt data/test/cmvn.scp data/test/feats.scp data/test/split1 data/local/lang data/lang data/local/tmp data/local/dict/lexiconp.txt
17 |
18 | echo
19 | echo "===== PREPARING ACOUSTIC DATA ====="
20 | echo
21 |
22 | # Needs to be prepared by hand (or using self written scripts):
23 | #
24 | # spk2gender  [<speaker-id> <gender>]
25 | # wav.scp     [<utterance-id> <full_path_to_audio_file>]
26 | # text        [<utterance-id> <text_transcription>]
27 | # utt2spk     [<utterance-id> <speaker-id>]
28 | # corpus.txt  [<text_transcription>]
29 |
30 | # Making spk2utt files
31 | utils/utt2spk_to_spk2utt.pl data/train/utt2spk > data/train/spk2utt
32 | utils/utt2spk_to_spk2utt.pl data/test/utt2spk > data/test/spk2utt
33 |
34 | echo
35 | echo "===== FEATURES EXTRACTION ====="
36 | echo
37 |
38 | # Making feats.scp files
39 | mfccdir=mfcc
40 | # Uncomment and modify arguments in scripts below if you have any problems with data sorting
41 | # utils/validate_data_dir.sh data/train # script for checking prepared data - here: for data/train directory
42 | # utils/fix_data_dir.sh data/train # tool for data proper sorting if needed - here: for data/train directory
43 | steps/make_mfcc.sh --nj $nj --cmd "$train_cmd" data/train exp/make_mfcc/train $mfccdir
44 | steps/make_mfcc.sh --nj $nj --cmd "$train_cmd" data/test exp/make_mfcc/test $mfccdir
45 |
46 | # Making cmvn.scp files
47 | steps/compute_cmvn_stats.sh data/train exp/make_mfcc/train $mfccdir
48 | steps/compute_cmvn_stats.sh data/test exp/make_mfcc/test $mfccdir
49 |
50 | echo
51 | echo "===== PREPARING LANGUAGE DATA ====="
52 | echo
53 |
54 | # Needs to be prepared by hand (or using self written scripts):
55 | #
56 | # lexicon.txt           [<word> <phone 1> <phone 2> ...]
57 | # nonsilence_phones.txt [<phone>]
58 | # silence_phones.txt    [<phone>]
59 | # optional_silence.txt  [<phone>]
60 |
61 | # Preparing language data
62 | utils/prepare_lang.sh data/local/dict "<UNK>" data/local/lang data/lang
63 |
64 | echo
65 | echo "===== LANGUAGE MODEL CREATION ====="
66 | echo "===== MAKING lm.arpa ====="
67 | echo
68 |
69 | loc=`which ngram-count`;
70 | if [ -z $loc ]; then
71 | if uname -a | grep 64 >/dev/null; then
72 | sdir=$KALDI_ROOT/tools/srilm/bin/i686-m64
73 | else
74 | sdir=$KALDI_ROOT/tools/srilm/bin/i686
75 | fi
76 | if [ -f $sdir/ngram-count ]; then
77 | echo "Using SRILM language modelling tool from $sdir"
78 | export PATH=$PATH:$sdir
79 | else
80 | echo "SRILM toolkit is probably not installed.
81 | Instructions: tools/install_srilm.sh"
82 | exit 1
83 | fi
84 | fi
85 |
86 | local=data/local
87 | mkdir $local/tmp
88 | ngram-count -order $lm_order -write-vocab $local/tmp/vocab-full.txt -wbdiscount -text $local/corpus.txt -lm $local/tmp/lm.arpa
89 |
90 | echo
91 | echo "===== MAKING G.fst ====="
92 | echo
93 |
94 | lang=data/lang
95 | arpa2fst --disambig-symbol=#0 --read-symbol-table=$lang/words.txt $local/tmp/lm.arpa $lang/G.fst
96 |
97 | echo
98 | echo "===== MONO TRAINING ====="
99 | echo
100 |
101 | START=$(date +%s);
102 | steps/train_mono.sh --nj $nj \
103 | --cmd "$train_cmd" data/train data/lang exp/mono || exit 1
104 | END=$(date +%s);
105 | MONO_TRAINING_TIME=$((END - START))
106 |
107 | echo
108 | echo "===== MONO DECODING ====="
109 | echo
110 |
111 | START=$(date +%s);
112 | utils/mkgraph.sh --mono data/lang exp/mono exp/mono/graph || exit 1
113 | END=$(date +%s);
114 | MONO_DECODING_TIME=$((END - START))
115 |
116 | echo
117 | echo "===== MONO ALIGNMENT ====="
118 | echo
119 |
120 | START=$(date +%s);
121 | steps/align_si.sh --nj $nj --cmd "$train_cmd" \
122 | data/train data/lang exp/mono exp/mono_ali || exit 1
123 | END=$(date +%s);
124 | MONO_ALIGNMENT_TIME=$((END - START))
125 |
126 | echo
127 | echo "===== TRI1 (first triphone pass) TRAINING ====="
128 | echo
129 |
130 | START=$(date +%s);
131 | steps/train_deltas.sh --cmd "$train_cmd" 2500 20000 \
132 | data/train data/lang exp/mono_ali exp/tri1 || exit 1
133 | END=$(date +%s);
134 | TRI1_TRAINING_TIME=$((END - START))
135 |
136 | echo
137 | echo "===== TRI1 (first triphone pass) DECODING ====="
138 | echo
139 |
140 | START=$(date +%s);
141 | utils/mkgraph.sh data/lang exp/tri1 exp/tri1/graph || exit 1
142 | END=$(date +%s);
143 | TRI1_DECODING_TIME=$((END - START))
144 |
145 | echo
146 | echo "===== TRI1 ALIGNMENT ====="
147 | echo
148 |
149 | START=$(date +%s);
150 | steps/align_si.sh --nj $nj --cmd "$train_cmd" \
151 | data/train data/lang exp/tri1 exp/tri1_ali || exit 1;
152 | END=$(date +%s);
153 | TRI1_ALIGNMENT_TIME=$((END - START))
154 |
155 | echo
156 | echo "===== TRI2A TRAINING ====="
157 | echo
158 |
159 | START=$(date +%s);
160 | steps/train_deltas.sh --cmd "$train_cmd" 2500 20000 \
161 | data/train data/lang exp/tri1_ali exp/tri2a || exit 1
162 | END=$(date +%s);
163 | TRI2A_TRAINING_TIME=$((END - START))
164 |
165 | echo
166 | echo "===== TRI2A DECODING ====="
167 | echo
168 |
169 | START=$(date +%s);
170 | utils/mkgraph.sh data/lang exp/tri2a exp/tri2a/graph || exit 1
171 | END=$(date +%s);
172 | TRI2A_DECODING_TIME=$((END - START))
173 |
174 | echo
175 | echo "===== TRI2A ALIGNMENT ====="
176 | echo
177 |
178 | START=$(date +%s);
179 | steps/align_si.sh --nj $nj --cmd "$train_cmd" \
180 | data/train data/lang exp/tri2a exp/tri2a_ali || exit 1;
181 | END=$(date +%s);
182 | TRI2A_ALIGNMENT_TIME=$((END - START))
183 |
184 | echo
185 | echo "===== TRI3A TRAINING ====="
186 | echo
187 |
188 | START=$(date +%s);
189 | steps/train_lda_mllt.sh --cmd "$train_cmd" 2500 20000 \
190 | data/train data/lang exp/tri2a_ali exp/tri3a || exit 1;
191 | END=$(date +%s);
192 | TRI3A_TRAINING_TIME=$((END - START))
193 |
194 | echo
195 | echo "===== TRI3A DECODING ====="
196 | echo
197 |
198 | START=$(date +%s);
199 | utils/mkgraph.sh data/lang exp/tri3a exp/tri3a/graph || exit 1
200 | steps/decode.sh --config conf/decode.config --nj 1 --cmd "$decode_cmd" \
201 | exp/tri3a/graph data/test exp/tri3a/decode
202 | END=$(date +%s);
203 | TRI3A_DECODING_TIME=$((END - START))
204 |
205 | echo
206 | echo "===== TRI3A ALIGNMENT ====="
207 | echo
208 |
209 | START=$(date +%s);
210 | steps/align_fmllr.sh --nj $nj --cmd "$train_cmd" \
211 | data/train data/lang exp/tri3a exp/tri3a_ali || exit 1;
212 | END=$(date +%s);
213 | TRI3A_ALIGNMENT_TIME=$((END - START))
214 |
215 | echo
216 | echo "===== run.sh script is finished ====="
217 | echo
218 |
219 | EXP_END=$(date +%s);
220 | EXP_TIME=$((EXP_END - EXP_START))
221 |
222 | log_file='exp.log'
223 | echo "" > $log_file
224 | echo "===== Time Report =====" >> $log_file
225 | echo "Mono" >> $log_file
226 | echo $MONO_TRAINING_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file
227 | echo $MONO_DECODING_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file
228 | echo $MONO_ALIGNMENT_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file
229 |
230 | echo "Tri1" >> $log_file
231 | echo $TRI1_TRAINING_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file
232 | echo $TRI1_DECODING_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file
233 | echo $TRI1_ALIGNMENT_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file
234 |
235 | echo "Tri2a" >> $log_file
236 | echo $TRI2A_TRAINING_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file
237 | echo $TRI2A_DECODING_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file
238 | echo $TRI2A_ALIGNMENT_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file
239 |
240 | echo "Tri3a" >> $log_file
241 | echo $TRI3A_TRAINING_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file
242 | echo $TRI3A_DECODING_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file
243 | echo $TRI3A_ALIGNMENT_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file
244 |
245 | echo "Total time:" >> $log_file
246 | echo $EXP_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file
247 |
248 | echo -e "\n" >> $log_file
249 | echo "===== Score Report =====" >> $log_file
250 | echo "Best WER" >> $log_file
251 | for x in exp/*/decode*; do [ -d $x ] && [[ $x =~ "$1" ]] && grep WER $x/wer_* | utils/best_wer.sh; done >> $log_file
252 |
253 | echo -e "\n" >> $log_file
254 |
255 | cat $log_file
256 |
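The score report at the end shells out to utils/best_wer.sh over every decode directory. A rough Python equivalent of that loop, assuming Kaldi's usual wer_N files whose first line starts with "%WER <float>":

    import glob
    import re

    best = None
    for path in glob.glob("exp/*/decode*/wer_*"):
        for line in open(path):
            match = re.search(r"%WER (\d+\.\d+)", line)
            if match:
                wer = float(match.group(1))
                if best is None or wer < best[0]:
                    best = (wer, path)
                break
    if best:
        print("Best %WER {} in {}".format(*best))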
--------------------------------------------------------------------------------
/egs/vivos/extension/run_lda_mllt_decode.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | . ./path.sh || exit 1
4 | . ./cmd.sh || exit 1
5 |
6 | EXP_START=$(date +%s);
7 |
8 | nj=1 # number of parallel jobs
9 | lm_order=1 # language model order (n-gram quantity)
10 |
11 | # Safety mechanism (possible running this script with modified arguments)
12 | . utils/parse_options.sh || exit 1
13 | [[ $# -ge 1 ]] && { echo "Wrong arguments!"; exit 1; }
14 |
15 | # Removing previously created data (from last run.sh execution)
16 | rm -rf exp mfcc data/train/spk2utt data/train/cmvn.scp data/train/feats.scp data/train/split1 data/test/spk2utt data/test/cmvn.scp data/test/feats.scp data/test/split1 data/local/lang data/lang data/local/tmp data/local/dict/lexiconp.txt
17 |
18 | echo
19 | echo "===== PREPARING ACOUSTIC DATA ====="
20 | echo
21 |
22 | # Needs to be prepared by hand (or using self written scripts):
23 | #
24 | # spk2gender  [<speaker-id> <gender>]
25 | # wav.scp     [<utterance-id> <full_path_to_audio_file>]
26 | # text        [<utterance-id> <text_transcription>]
27 | # utt2spk     [<utterance-id> <speaker-id>]
28 | # corpus.txt  [<text_transcription>]
29 |
30 | # Making spk2utt files
31 | utils/utt2spk_to_spk2utt.pl data/train/utt2spk > data/train/spk2utt
32 | utils/utt2spk_to_spk2utt.pl data/test/utt2spk > data/test/spk2utt
33 |
34 | echo
35 | echo "===== FEATURES EXTRACTION ====="
36 | echo
37 |
38 | # Making feats.scp files
39 | mfccdir=mfcc
40 | # Uncomment and modify arguments in scripts below if you have any problems with data sorting
41 | # utils/validate_data_dir.sh data/train # script for checking prepared data - here: for data/train directory
42 | # utils/fix_data_dir.sh data/train # tool for data proper sorting if needed - here: for data/train directory
43 | steps/make_mfcc.sh --nj $nj --cmd "$train_cmd" data/train exp/make_mfcc/train $mfccdir
44 | steps/make_mfcc.sh --nj $nj --cmd "$train_cmd" data/test exp/make_mfcc/test $mfccdir
45 |
46 | # Making cmvn.scp files
47 | steps/compute_cmvn_stats.sh data/train exp/make_mfcc/train $mfccdir
48 | steps/compute_cmvn_stats.sh data/test exp/make_mfcc/test $mfccdir
49 |
50 | echo
51 | echo "===== PREPARING LANGUAGE DATA ====="
52 | echo
53 |
54 | # Needs to be prepared by hand (or using self written scripts):
55 | #
56 | # lexicon.txt           [<word> <phone 1> <phone 2> ...]
57 | # nonsilence_phones.txt [<phone>]
58 | # silence_phones.txt    [<phone>]
59 | # optional_silence.txt  [<phone>]
60 |
61 | # Preparing language data
62 | utils/prepare_lang.sh data/local/dict "<UNK>" data/local/lang data/lang
63 |
64 | echo
65 | echo "===== LANGUAGE MODEL CREATION ====="
66 | echo "===== MAKING lm.arpa ====="
67 | echo
68 |
69 | loc=`which ngram-count`;
70 | if [ -z $loc ]; then
71 | if uname -a | grep 64 >/dev/null; then
72 | sdir=$KALDI_ROOT/tools/srilm/bin/i686-m64
73 | else
74 | sdir=$KALDI_ROOT/tools/srilm/bin/i686
75 | fi
76 | if [ -f $sdir/ngram-count ]; then
77 | echo "Using SRILM language modelling tool from $sdir"
78 | export PATH=$PATH:$sdir
79 | else
80 | echo "SRILM toolkit is probably not installed.
81 | Instructions: tools/install_srilm.sh"
82 | exit 1
83 | fi
84 | fi
85 |
86 | local=data/local
87 | mkdir $local/tmp
88 | ngram-count -order $lm_order -write-vocab $local/tmp/vocab-full.txt -wbdiscount -text $local/corpus.txt -lm $local/tmp/lm.arpa
89 |
90 | echo
91 | echo "===== MAKING G.fst ====="
92 | echo
93 |
94 | lang=data/lang
95 | arpa2fst --disambig-symbol=#0 --read-symbol-table=$lang/words.txt $local/tmp/lm.arpa $lang/G.fst
96 |
97 | echo
98 | echo "===== MONO TRAINING ====="
99 | echo
100 |
101 | START=$(date +%s);
102 | steps/train_mono.sh --nj $nj \
103 | --cmd "$train_cmd" data/train data/lang exp/mono || exit 1
104 | END=$(date +%s);
105 | MONO_TRAINING_TIME=$((END - START))
106 |
107 | echo
108 | echo "===== MONO DECODING ====="
109 | echo
110 |
111 | START=$(date +%s);
112 | utils/mkgraph.sh --mono data/lang exp/mono exp/mono/graph || exit 1
113 | steps/decode.sh --config conf/decode.config --nj 1 --cmd "$decode_cmd" \
114 | exp/mono/graph data/test exp/mono/decode
115 | END=$(date +%s);
116 | MONO_DECODING_TIME=$((END - START))
117 |
118 | echo
119 | echo "===== MONO ALIGNMENT ====="
120 | echo
121 |
122 | START=$(date +%s);
123 | steps/align_si.sh --nj $nj --cmd "$train_cmd" \
124 | data/train data/lang exp/mono exp/mono_ali || exit 1
125 | END=$(date +%s);
126 | MONO_ALIGNMENT_TIME=$((END - START))
127 |
128 | echo
129 | echo "===== TRI1 (first triphone pass) TRAINING ====="
130 | echo
131 |
132 | START=$(date +%s);
133 | steps/train_deltas.sh --cmd "$train_cmd" 2500 20000 \
134 | data/train data/lang exp/mono_ali exp/tri1 || exit 1
135 | END=$(date +%s);
136 | TRI1_TRAINING_TIME=$((END - START))
137 |
138 | echo
139 | echo "===== TRI1 (first triphone pass) DECODING ====="
140 | echo
141 |
142 | START=$(date +%s);
143 | utils/mkgraph.sh data/lang exp/tri1 exp/tri1/graph || exit 1
144 | steps/decode.sh --config conf/decode.config --nj 1 --cmd "$decode_cmd" \
145 | exp/tri1/graph data/test exp/tri1/decode
146 | END=$(date +%s);
147 | TRI1_DECODING_TIME=$((END - START))
148 |
149 | echo
150 | echo "===== TRI1 ALIGNMENT ====="
151 | echo
152 |
153 | START=$(date +%s);
154 | steps/align_si.sh --nj $nj --cmd "$train_cmd" \
155 | data/train data/lang exp/tri1 exp/tri1_ali || exit 1;
156 | END=$(date +%s);
157 | TRI1_ALIGNMENT_TIME=$((END - START))
158 |
159 | echo
160 | echo "===== TRI2A TRAINING ====="
161 | echo
162 |
163 | START=$(date +%s);
164 | steps/train_deltas.sh --cmd "$train_cmd" 2500 20000 \
165 | data/train data/lang exp/tri1_ali exp/tri2a || exit 1
166 | END=$(date +%s);
167 | TRI2A_TRAINING_TIME=$((END - START))
168 |
169 | echo
170 | echo "===== TRI2A DECODING ====="
171 | echo
172 |
173 | START=$(date +%s);
174 | utils/mkgraph.sh data/lang exp/tri2a exp/tri2a/graph || exit 1
175 | steps/decode.sh --config conf/decode.config --nj 1 --cmd "$decode_cmd" \
176 | exp/tri2a/graph data/test exp/tri2a/decode
177 | END=$(date +%s);
178 | TRI2A_DECODING_TIME=$((END - START))
179 |
180 | echo
181 | echo "===== TRI2A ALIGNMENT ====="
182 | echo
183 |
184 | START=$(date +%s);
185 | steps/align_si.sh --nj $nj --cmd "$train_cmd" \
186 | data/train data/lang exp/tri2a exp/tri2a_ali || exit 1;
187 | END=$(date +%s);
188 | TRI2A_ALIGNMENT_TIME=$((END - START))
189 |
190 | echo
191 | echo "===== TRI3A TRAINING ====="
192 | echo
193 |
194 | START=$(date +%s);
195 | steps/train_lda_mllt.sh --cmd "$train_cmd" 2500 20000 \
196 | data/train data/lang exp/tri2a_ali exp/tri3a || exit 1;
197 | END=$(date +%s);
198 | TRI3A_TRAINING_TIME=$((END - START))
199 |
200 | echo
201 | echo "===== TRI3A DECODING ====="
202 | echo
203 |
204 | START=$(date +%s);
205 | utils/mkgraph.sh data/lang exp/tri3a exp/tri3a/graph || exit 1
206 | steps/decode.sh --config conf/decode.config --nj 1 --cmd "$decode_cmd" \
207 | exp/tri3a/graph data/test exp/tri3a/decode
208 | END=$(date +%s);
209 | TRI3A_DECODING_TIME=$((END - START))
210 |
211 | echo
212 | echo "===== TRI3A ALIGNMENT ====="
213 | echo
214 |
215 | START=$(date +%s);
216 | steps/align_fmllr.sh --nj $nj --cmd "$train_cmd" \
217 | data/train data/lang exp/tri3a exp/tri3a_ali || exit 1;
218 | END=$(date +%s);
219 | TRI3A_ALIGNMENT_TIME=$((END - START))
220 |
221 | echo
222 | echo "===== run.sh script is finished ====="
223 | echo
224 |
225 | EXP_END=$(date +%s);
226 | EXP_TIME=$((EXP_END - EXP_START))
227 |
228 | log_file='exp.log'
229 | echo "" > $log_file
230 | echo "===== Time Report =====" >> $log_file
231 | echo "Mono" >> $log_file
232 | echo $MONO_TRAINING_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file
233 | echo $MONO_DECODING_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file
234 | echo $MONO_ALIGNMENT_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file
235 |
236 | echo "Tri1" >> $log_file
237 | echo $TRI1_TRAINING_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file
238 | echo $TRI1_DECODING_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file
239 | echo $TRI1_ALIGNMENT_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file
240 |
241 | echo "Tri2a" >> $log_file
242 | echo $TRI2A_TRAINING_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file
243 | echo $TRI2A_DECODING_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file
244 | echo $TRI2A_ALIGNMENT_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file
245 |
246 | echo "Tri3a" >> $log_file
247 | echo $TRI3A_TRAINING_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file
248 | echo $TRI3A_DECODING_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file
249 | echo $TRI3A_ALIGNMENT_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file
250 |
251 | echo "Total time:" >> $log_file
252 | echo $EXP_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file
253 |
254 | echo -e "\n" >> $log_file
255 | echo "===== Score Report =====" >> $log_file
256 | echo "Best WER" >> $log_file
257 | for x in exp/*/decode*; do [ -d $x ] && [[ $x =~ "$1" ]] && grep WER $x/wer_* | utils/best_wer.sh; done >> $log_file
258 |
259 | echo -e "\n" >> $log_file
260 |
261 | cat $log_file
262 |
--------------------------------------------------------------------------------
/egs/vivos/extension/run_sat.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | . ./path.sh || exit 1
4 | . ./cmd.sh || exit 1
5 |
6 | EXP_START=$(date +%s);
7 |
8 | nj=1 # number of parallel jobs
9 | lm_order=1 # language model order (n-gram quantity)
10 |
11 | # Safety mechanism (possible running this script with modified arguments)
12 | . utils/parse_options.sh || exit 1
13 | [[ $# -ge 1 ]] && { echo "Wrong arguments!"; exit 1; }
14 |
15 | # Removing previously created data (from last run.sh execution)
16 | rm -rf exp mfcc data/train/spk2utt data/train/cmvn.scp data/train/feats.scp data/train/split1 data/test/spk2utt data/test/cmvn.scp data/test/feats.scp data/test/split1 data/local/lang data/lang data/local/tmp data/local/dict/lexiconp.txt
17 |
18 | echo
19 | echo "===== PREPARING ACOUSTIC DATA ====="
20 | echo
21 |
22 | # Needs to be prepared by hand (or using self written scripts):
23 | #
24 | # spk2gender  [<speaker-id> <gender>]
25 | # wav.scp     [<utterance-id> <full_path_to_audio_file>]
26 | # text        [<utterance-id> <text_transcription>]
27 | # utt2spk     [<utterance-id> <speaker-id>]
28 | # corpus.txt  [<text_transcription>]
29 |
30 | # Making spk2utt files
31 | utils/utt2spk_to_spk2utt.pl data/train/utt2spk > data/train/spk2utt
32 | utils/utt2spk_to_spk2utt.pl data/test/utt2spk > data/test/spk2utt
33 |
34 | echo
35 | echo "===== FEATURES EXTRACTION ====="
36 | echo
37 |
38 | # Making feats.scp files
39 | mfccdir=mfcc
40 | # Uncomment and modify arguments in scripts below if you have any problems with data sorting
41 | # utils/validate_data_dir.sh data/train # script for checking prepared data - here: for data/train directory
42 | # utils/fix_data_dir.sh data/train # tool for data proper sorting if needed - here: for data/train directory
43 | steps/make_mfcc.sh --nj $nj --cmd "$train_cmd" data/train exp/make_mfcc/train $mfccdir
44 | steps/make_mfcc.sh --nj $nj --cmd "$train_cmd" data/test exp/make_mfcc/test $mfccdir
45 |
46 | # Making cmvn.scp files
47 | steps/compute_cmvn_stats.sh data/train exp/make_mfcc/train $mfccdir
48 | steps/compute_cmvn_stats.sh data/test exp/make_mfcc/test $mfccdir
49 |
50 | echo
51 | echo "===== PREPARING LANGUAGE DATA ====="
52 | echo
53 |
54 | # Needs to be prepared by hand (or using self written scripts):
55 | #
56 | # lexicon.txt           [<word> <phone 1> <phone 2> ...]
57 | # nonsilence_phones.txt [<phone>]
58 | # silence_phones.txt    [<phone>]
59 | # optional_silence.txt  [<phone>]
60 |
61 | # Preparing language data
62 | utils/prepare_lang.sh data/local/dict "<UNK>" data/local/lang data/lang
63 |
64 | echo
65 | echo "===== LANGUAGE MODEL CREATION ====="
66 | echo "===== MAKING lm.arpa ====="
67 | echo
68 |
69 | loc=`which ngram-count`;
70 | if [ -z $loc ]; then
71 | if uname -a | grep 64 >/dev/null; then
72 | sdir=$KALDI_ROOT/tools/srilm/bin/i686-m64
73 | else
74 | sdir=$KALDI_ROOT/tools/srilm/bin/i686
75 | fi
76 | if [ -f $sdir/ngram-count ]; then
77 | echo "Using SRILM language modelling tool from $sdir"
78 | export PATH=$PATH:$sdir
79 | else
80 | echo "SRILM toolkit is probably not installed.
81 | Instructions: tools/install_srilm.sh"
82 | exit 1
83 | fi
84 | fi
85 |
86 | local=data/local
87 | mkdir $local/tmp
88 | ngram-count -order $lm_order -write-vocab $local/tmp/vocab-full.txt -wbdiscount -text $local/corpus.txt -lm $local/tmp/lm.arpa
89 |
90 | echo
91 | echo "===== MAKING G.fst ====="
92 | echo
93 |
94 | lang=data/lang
95 | arpa2fst --disambig-symbol=#0 --read-symbol-table=$lang/words.txt $local/tmp/lm.arpa $lang/G.fst
96 |
97 | echo
98 | echo "===== MONO TRAINING ====="
99 | echo
100 |
101 | START=$(date +%s);
102 | steps/train_mono.sh --nj $nj \
103 | --cmd "$train_cmd" data/train data/lang exp/mono || exit 1
104 | END=$(date +%s);
105 | MONO_TRAINING_TIME=$((END - START))
106 |
107 | echo
108 | echo "===== MONO DECODING ====="
109 | echo
110 |
111 | START=$(date +%s);
112 | utils/mkgraph.sh --mono data/lang exp/mono exp/mono/graph || exit 1
113 |
114 | END=$(date +%s);
115 | MONO_DECODING_TIME=$((END - START))
116 |
117 | echo
118 | echo "===== MONO ALIGNMENT ====="
119 | echo
120 |
121 | START=$(date +%s);
122 | steps/align_si.sh --nj $nj --cmd "$train_cmd" \
123 | data/train data/lang exp/mono exp/mono_ali || exit 1
124 | END=$(date +%s);
125 | MONO_ALIGNMENT_TIME=$((END - START))
126 |
127 | echo
128 | echo "===== TRI1 (first triphone pass) TRAINING ====="
129 | echo
130 |
131 | START=$(date +%s);
132 | steps/train_deltas.sh --cmd "$train_cmd" 2500 20000 \
133 | data/train data/lang exp/mono_ali exp/tri1 || exit 1
134 | END=$(date +%s);
135 | TRI1_TRAINING_TIME=$((END - START))
136 |
137 | echo
138 | echo "===== TRI1 (first triphone pass) DECODING ====="
139 | echo
140 |
141 | START=$(date +%s);
142 | utils/mkgraph.sh data/lang exp/tri1 exp/tri1/graph || exit 1
143 |
144 | END=$(date +%s);
145 | TRI1_DECODING_TIME=$((END - START))
146 |
147 | echo
148 | echo "===== TRI1 ALIGNMENT ====="
149 | echo
150 |
151 | START=$(date +%s);
152 | steps/align_si.sh --nj $nj --cmd "$train_cmd" \
153 | data/train data/lang exp/tri1 exp/tri1_ali || exit 1;
154 | END=$(date +%s);
155 | TRI1_ALIGNMENT_TIME=$((END - START))
156 |
157 | echo
158 | echo "===== TRI2A TRAINING ====="
159 | echo
160 |
161 | START=$(date +%s);
162 | steps/train_deltas.sh --cmd "$train_cmd" 2500 20000 \
163 | data/train data/lang exp/tri1_ali exp/tri2a || exit 1
164 | END=$(date +%s);
165 | TRI2A_TRAINING_TIME=$((END - START))
166 |
167 | echo
168 | echo "===== TRI2A DECODING ====="
169 | echo
170 |
171 | START=$(date +%s);
172 | utils/mkgraph.sh data/lang exp/tri2a exp/tri2a/graph || exit 1
173 |
174 | END=$(date +%s);
175 | TRI2A_DECODING_TIME=$((END - START))
176 |
177 | echo
178 | echo "===== TRI2A ALIGNMENT ====="
179 | echo
180 |
181 | START=$(date +%s);
182 | steps/align_si.sh --nj $nj --cmd "$train_cmd" \
183 | data/train data/lang exp/tri2a exp/tri2a_ali || exit 1;
184 | END=$(date +%s);
185 | TRI2A_ALIGNMENT_TIME=$((END - START))
186 |
187 | echo
188 | echo "===== TRI3A TRAINING ====="
189 | echo
190 |
191 | START=$(date +%s);
192 | steps/train_lda_mllt.sh --cmd "$train_cmd" 2500 20000 \
193 | data/train data/lang exp/tri2a_ali exp/tri3a || exit 1;
194 | END=$(date +%s);
195 | TRI3A_TRAINING_TIME=$((END - START))
196 |
197 | echo
198 | echo "===== TRI3A DECODING ====="
199 | echo
200 |
201 | START=$(date +%s);
202 | utils/mkgraph.sh data/lang exp/tri3a exp/tri3a/graph || exit 1
203 |
204 | END=$(date +%s);
205 | TRI3A_DECODING_TIME=$((END - START))
206 |
207 | echo
208 | echo "===== TRI3A ALIGNMENT ====="
209 | echo
210 |
211 | START=$(date +%s);
212 | steps/align_fmllr.sh --nj $nj --cmd "$train_cmd" \
213 | data/train data/lang exp/tri3a exp/tri3a_ali || exit 1;
214 | END=$(date +%s);
215 | TRI3A_ALIGNMENT_TIME=$((END - START))
216 |
217 |
218 | echo
219 | echo "===== TRI4A TRAINING ====="
220 | echo
221 |
222 | START=$(date +%s);
223 | steps/train_sat.sh --cmd "$train_cmd" 2500 20000 \
224 | data/train data/lang exp/tri3a_ali exp/tri4a || exit 1;
225 | END=$(date +%s);
226 | TRI4A_TRAINING_TIME=$((END - START))
227 |
228 | echo
229 | echo "===== TRI4A DECODING ====="
230 | echo
231 |
232 | START=$(date +%s);
233 | utils/mkgraph.sh data/lang exp/tri4a exp/tri4a/graph || exit 1
234 |
235 | END=$(date +%s);
236 | TRI4A_DECODING_TIME=$((END - START))
237 |
238 | echo
239 | echo "===== TRI4A ALIGNMENT ====="
240 | echo
241 |
242 | START=$(date +%s);
243 | steps/align_fmllr.sh --nj $nj --cmd "$train_cmd" \
244 | data/train data/lang exp/tri4a exp/tri4a_ali || exit 1;
245 | END=$(date +%s);
246 | TRI4A_ALIGNMENT_TIME=$((END - START))
247 |
248 | echo
249 | echo "===== TRI5A TRAINING ====="
250 | echo
251 |
252 | START=$(date +%s);
253 | steps/train_sat.sh --cmd "$train_cmd" 3500 100000 \
254 | data/train data/lang exp/tri4a_ali exp/tri5a || exit 1;
255 | END=$(date +%s);
256 | TRI5A_TRAINING_TIME=$((END - START))
257 |
258 | echo
259 | echo "===== TRI5A DECODING ====="
260 | echo
261 |
262 | START=$(date +%s);
263 | utils/mkgraph.sh data/lang exp/tri5a exp/tri5a/graph || exit 1
264 | steps/decode.sh --config conf/decode.config --nj 1 --cmd "$decode_cmd" \
265 | exp/tri5a/graph data/test exp/tri5a/decode
266 | END=$(date +%s);
267 | TRI5A_DECODING_TIME=$((END - START))
268 |
269 | echo
270 | echo "===== TRI5A ALIGNMENT ====="
271 | echo
272 |
273 | START=$(date +%s);
274 | steps/align_fmllr.sh --nj $nj --cmd "$train_cmd" \
275 | data/train data/lang exp/tri5a exp/tri5a_ali || exit 1;
276 | END=$(date +%s);
277 | TRI5A_ALIGNMENT_TIME=$((END - START))
278 |
279 | echo
280 | echo "===== run.sh script is finished ====="
281 | echo
282 |
283 | EXP_END=$(date +%s);
284 | EXP_TIME=$((EXP_END - EXP_START))
285 |
286 | log_file='exp.log'
287 | echo "" > $log_file
288 | echo "===== Time Report =====" >> $log_file
289 | echo "Mono" >> $log_file
290 | echo $MONO_TRAINING_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file
291 | echo $MONO_DECODING_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file
292 | echo $MONO_ALIGNMENT_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file
293 |
294 | echo "Tri1" >> $log_file
295 | echo $TRI1_TRAINING_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file
296 | echo $TRI1_DECODING_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file
297 | echo $TRI1_ALIGNMENT_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file
298 |
299 | echo "Tri2a" >> $log_file
300 | echo $TRI2A_TRAINING_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file
301 | echo $TRI2A_DECODING_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file
302 | echo $TRI2A_ALIGNMENT_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file
303 |
304 | echo "Tri3a" >> $log_file
305 | echo $TRI3A_TRAINING_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file
306 | echo $TRI3A_DECODING_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file
307 | echo $TRI3A_ALIGNMENT_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file
308 |
309 | echo "Tri4a" >> $log_file
310 | echo $TRI4A_TRAINING_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file
311 | echo $TRI4A_DECODING_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file
312 | echo $TRI4A_ALIGNMENT_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file
313 |
314 | echo "Tri5a" >> $log_file
315 | echo $TRI5A_TRAINING_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file
316 | echo $TRI5A_DECODING_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file
317 | echo $TRI5A_ALIGNMENT_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file
318 |
319 | echo "Total time:" >> $log_file
320 | echo $EXP_TIME | awk '{print int($1/60)":"int($1%60)}' >> $log_file
321 |
322 | echo -e "\n" >> $log_file
323 | echo "===== Score Report =====" >> $log_file
324 | echo "Best WER" >> $log_file
325 | for x in exp/*/decode*; do [ -d $x ] && [[ $x =~ "$1" ]] && grep WER $x/wer_* | utils/best_wer.sh; done >> $log_file
326 |
327 | echo -e "\n" >> $log_file
328 |
329 | cat $log_file
330 |
--------------------------------------------------------------------------------
/egs/vivos/extension/text.py:
--------------------------------------------------------------------------------
1 | class PhoneConverter1:
2 |     rules_1 = [  # each entry: a base vowel followed by its five tone-marked forms
3 | "aàáảãạ",
4 | "ăằắẳẵặ",
5 | "âầấẩẫậ",
6 | "eèéẻẽẹ",
7 | "êềếểễệ",
8 | "iìíỉĩị",
9 | "oòóỏõọ",
10 | "ôồốổỗộ",
11 | "ơờớởỡợ",
12 | "uùúủũụ",
13 | "ưừứửữự",
14 | "yỳýỷỹỵ"
15 | ]
16 |     rules_2 = [  # Telex-style digraphs: "aw" -> "ă", "aa" -> "â", ..., "dd" -> "đ"
17 | "awă",
18 | "aaâ",
19 | "eeê",
20 | "ooô",
21 | "owơ",
22 | "uwư",
23 | "ddđ"
24 | ]
25 | w2p = {}
26 | p2w = {}
27 | for words in rules_1:
28 | original = words[0]
29 | words = words[1:]
30 | for rule in rules_2:
31 | if original == rule[2]:
32 | original = rule[0:2]
33 |         tones = "fsrxj"  # Telex tone letters: f=huyền, s=sắc, r=hỏi, x=ngã, j=nặng
34 | for i, w in enumerate(words):
35 | w2p[w] = original + tones[i]
36 | for rule in rules_2:
37 | w2p[rule[2]] = rule[0:2]
38 | for key, value in w2p.items():
39 | p2w[value] = key
40 |
41 | @staticmethod
42 | def word2phone(word):
43 | w2p = PhoneConverter1.w2p
44 | phone = ""
45 | for w in word:
46 | if w in w2p:
47 | phone += w2p[w]
48 | else:
49 | phone += w
50 | return phone
51 |
52 | @staticmethod
53 | def phone2word(phone):
54 | p2w = PhoneConverter1.p2w
55 | i = 0
56 | word = ""
57 | while i < len(phone):
58 | if phone[i:i+3] in p2w:
59 | p = phone[i:i+3]
60 | word += p2w[p]
61 | i += 3
62 | elif phone[i:i+2] in p2w:
63 | p = phone[i:i+2]
64 | word += p2w[p]
65 | i += 2
66 | else:
67 | p = phone[i:i+1]
68 | word += p
69 | i += 1
70 | return word
71 |
72 | if __name__ == '__main__':
73 | tests = [
74 | ("con hoẵng", "con hoawxng"),
75 | ("lựu đạn", "luwju ddajn"),
76 | ("kiểm tra", "kieerm tra"),
77 | ("ủy ban", "ury ban"),
78 | ("cà phê", "caf phee"),
79 | ("khách sạn", "khasch sajn"),
80 | ("đúng", "ddusng"),
81 | ("xã hội", "xax hooji")
82 | ]
83 | for test in tests:
84 | assert (test[0] == PhoneConverter1.phone2word(test[1]))
85 | assert (test[1] == PhoneConverter1.word2phone(test[0]))
86 |
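phone2word above greedily tries 3-character codes (vowel digraph plus tone letter) before 2-character ones, which is why the round trips in the tests are lossless. A small usage sketch, run from this folder so text.py is importable:

    from text import PhoneConverter1

    encoded = PhoneConverter1.word2phone("tiếng việt")
    print(encoded)                               # tieesng vieejt
    print(PhoneConverter1.phone2word(encoded))   # tiếng việt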
--------------------------------------------------------------------------------
/egs/vivos/extension/transcript_deltadelta.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
3 |
4 | . ./path.sh || exit 1
5 | . ./cmd.sh || exit 1
6 |
7 |
8 | model_folder=exp/tri2a
9 | transcript_folder=transcriptions
10 | output_folder=output
11 |
12 | rm -rf $output_folder
13 | mkdir $output_folder
14 |
15 | echo
16 | echo "===== AUDIO -> FEATURE VECTORS ====="
17 | echo
18 |
19 | compute-mfcc-feats --config=conf/mfcc.conf \
20 | scp:$transcript_folder/wav.scp \
21 | ark,scp:$output_folder/feats.ark,$output_folder/feats.scp
22 |
23 | add-deltas \
24 | scp:$output_folder/feats.scp \
25 | ark:$output_folder/delta-feats.ark
26 |
27 |
28 | echo
29 | echo "===== TRAINED GMM-HMM + FEATURE VECTORS -> LATTICE ====="
30 | echo
31 |
32 | gmm-latgen-faster \
33 | --word-symbol-table=$model_folder/graph/words.txt \
34 | $model_folder/final.mdl \
35 | $model_folder/graph/HCLG.fst \
36 | ark:$output_folder/delta-feats.ark \
37 | ark,t:$output_folder/lattices.ark
38 |
39 | echo
40 | echo "===== LATTICE -> BEST PATH THROUGH LATTICE ====="
41 | echo
42 |
43 | lattice-best-path \
44 | --word-symbol-table=$model_folder/graph/words.txt \
45 | ark:$output_folder/lattices.ark \
46 | ark,t:$output_folder/one-best.tra
47 |
48 | echo
49 | echo "===== BEST PATH INTEGERS -> BEST PATH WORDS ====="
50 | echo
51 |
52 | utils/int2sym.pl -f 2- \
53 | $model_folder/graph/words.txt \
54 | $output_folder/one-best.tra \
55 | > $output_folder/one-best-hypothesis.txt
56 |
57 | cat $output_folder/one-best-hypothesis.txt
58 |
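one-best-hypothesis.txt comes out as "<utt-id> <word> <word> ..." after int2sym.pl. If the Kaldi lexicon was built from the same Telex-style encoding as text.py (an assumption; this repo only shows that mapping in the Sphinx recipe), the output could be mapped back to Vietnamese orthography with a sketch like:

    from text import PhoneConverter1

    for line in open("output/one-best-hypothesis.txt"):
        parts = line.split()
        if parts:
            utt_id, words = parts[0], parts[1:]
            print(utt_id, " ".join(PhoneConverter1.phone2word(w) for w in words))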
--------------------------------------------------------------------------------
/egs/vivos/extension/transcriptions/audio/R001.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/vivos/extension/transcriptions/audio/R001.wav
--------------------------------------------------------------------------------
/egs/vivos/extension/transcriptions/audio/R002.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/vivos/extension/transcriptions/audio/R002.wav
--------------------------------------------------------------------------------
/egs/vivos/extension/transcriptions/audio/R003.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/vivos/extension/transcriptions/audio/R003.wav
--------------------------------------------------------------------------------
/egs/vivos/extension/transcriptions/audio/R004.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/vivos/extension/transcriptions/audio/R004.wav
--------------------------------------------------------------------------------
/egs/vivos/extension/transcriptions/audio/R005.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/vivos/extension/transcriptions/audio/R005.wav
--------------------------------------------------------------------------------
/egs/vivos/extension/transcriptions/audio/t1_tat_ca.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/vivos/extension/transcriptions/audio/t1_tat_ca.wav
--------------------------------------------------------------------------------
/egs/vivos/extension/transcriptions/audio/t2_tro_nen.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/vivos/extension/transcriptions/audio/t2_tro_nen.wav
--------------------------------------------------------------------------------
/egs/vivos/extension/transcriptions/wav.scp:
--------------------------------------------------------------------------------
1 | r1 ./transcriptions/audio/R001.wav
2 | r2 ./transcriptions/audio/R002.wav
3 | r3 ./transcriptions/audio/R003.wav
4 | r4 ./transcriptions/audio/R004.wav
5 | r5 ./transcriptions/audio/R005.wav
6 | t1 ./transcriptions/audio/t1_tat_ca.wav
--------------------------------------------------------------------------------
/egs/vivos/load_data.py:
--------------------------------------------------------------------------------
1 | from os.path import dirname, join
2 |
3 | corpus_folder = join(dirname(dirname(dirname(__file__))), "data", "vivos",
4 | "corpus")
5 |
--------------------------------------------------------------------------------
/egs/vivos/logs/README.md:
--------------------------------------------------------------------------------
1 | VIVOS + FPT, LDA-MLLT: 20181227_122900.md
--------------------------------------------------------------------------------
/egs/vivos/model/__init__.py:
--------------------------------------------------------------------------------
1 | from os.path import dirname
2 | import os
3 | import text
4 |
5 |
6 | def transcript(wav_file):
7 | tmp_folder = dirname(__file__)
8 | command = "pocketsphinx_continuous " \
9 | "-hmm {0}/model_parameters/tmp.cd_cont_200 " \
10 | "-samprate 8000 " \
11 | "-lm {0}/etc/tmp.lm " \
12 | "-dict {0}/etc/tmp.dic " \
13 | "-infile {1} " \
14 |               "-logfn {0}/yes".format(tmp_folder, wav_file)  # logs go to a scratch file named "yes", removed below
15 | with os.popen(command) as c:
16 | output = c.read().strip()
17 | output = text.phone2word(output)
18 | os.remove("{}/yes".format(tmp_folder))
19 | return output
20 |
--------------------------------------------------------------------------------
/egs/vivos/model/etc/feat.params:
--------------------------------------------------------------------------------
1 | -lowerf __CFG_LO_FILT__
2 | -upperf __CFG_HI_FILT__
3 | -nfilt __CFG_NUM_FILT__
4 | -transform __CFG_TRANSFORM__
5 | -lifter __CFG_LIFTER__
6 | -feat __CFG_FEATURE__
7 | -svspec __CFG_SVSPEC__
8 | -agc __CFG_AGC__
9 | -cmn __CFG_CMN__
10 | -varnorm __CFG_VARNORM__
11 |
--------------------------------------------------------------------------------
/egs/vivos/model/etc/idngram:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/vivos/model/etc/idngram
--------------------------------------------------------------------------------
/egs/vivos/model/etc/tmp.filler:
--------------------------------------------------------------------------------
1 | <s>                 SIL
2 | </s>                SIL
3 | <sil>               SIL
--------------------------------------------------------------------------------
/egs/vivos/model/etc/tmp.phone:
--------------------------------------------------------------------------------
1 | 4
2 | a
3 | b
4 | c
5 | d
6 | e
7 | f
8 | g
9 | h
10 | i
11 | j
12 | k
13 | l
14 | m
15 | n
16 | o
17 | p
18 | q
19 | r
20 | s
21 | t
22 | u
23 | v
24 | w
25 | x
26 | y
27 | SIL
--------------------------------------------------------------------------------
/egs/vivos/model/model_parameters/tmp.cd_cont_200/feat.params:
--------------------------------------------------------------------------------
1 | -lowerf 200
2 | -upperf 3500
3 | -nfilt 31
4 | -transform dct
5 | -lifter 22
6 | -feat 1s_c_d_dd
7 | -agc none
8 | -cmn batch
9 | -varnorm no
10 |
--------------------------------------------------------------------------------
/egs/vivos/model/model_parameters/tmp.cd_cont_200/means:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/vivos/model/model_parameters/tmp.cd_cont_200/means
--------------------------------------------------------------------------------
/egs/vivos/model/model_parameters/tmp.cd_cont_200/mixture_weights:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/vivos/model/model_parameters/tmp.cd_cont_200/mixture_weights
--------------------------------------------------------------------------------
/egs/vivos/model/model_parameters/tmp.cd_cont_200/noisedict:
--------------------------------------------------------------------------------
1 | <s>                 SIL
2 | </s>                SIL
3 | <sil>               SIL
--------------------------------------------------------------------------------
/egs/vivos/model/model_parameters/tmp.cd_cont_200/transition_matrices:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/vivos/model/model_parameters/tmp.cd_cont_200/transition_matrices
--------------------------------------------------------------------------------
/egs/vivos/model/model_parameters/tmp.cd_cont_200/variances:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/vivos/model/model_parameters/tmp.cd_cont_200/variances
--------------------------------------------------------------------------------
/egs/vivos/model/model_parameters/tmp.cd_cont_200_1/means:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/vivos/model/model_parameters/tmp.cd_cont_200_1/means
--------------------------------------------------------------------------------
/egs/vivos/model/model_parameters/tmp.cd_cont_200_1/mixture_weights:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/vivos/model/model_parameters/tmp.cd_cont_200_1/mixture_weights
--------------------------------------------------------------------------------
/egs/vivos/model/model_parameters/tmp.cd_cont_200_1/transition_matrices:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/vivos/model/model_parameters/tmp.cd_cont_200_1/transition_matrices
--------------------------------------------------------------------------------
/egs/vivos/model/model_parameters/tmp.cd_cont_200_1/variances:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/vivos/model/model_parameters/tmp.cd_cont_200_1/variances
--------------------------------------------------------------------------------
/egs/vivos/model/model_parameters/tmp.cd_cont_200_2/means:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/vivos/model/model_parameters/tmp.cd_cont_200_2/means
--------------------------------------------------------------------------------
/egs/vivos/model/model_parameters/tmp.cd_cont_200_2/mixture_weights:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/vivos/model/model_parameters/tmp.cd_cont_200_2/mixture_weights
--------------------------------------------------------------------------------
/egs/vivos/model/model_parameters/tmp.cd_cont_200_2/transition_matrices:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/vivos/model/model_parameters/tmp.cd_cont_200_2/transition_matrices
--------------------------------------------------------------------------------
/egs/vivos/model/model_parameters/tmp.cd_cont_200_2/variances:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/vivos/model/model_parameters/tmp.cd_cont_200_2/variances
--------------------------------------------------------------------------------
/egs/vivos/model/model_parameters/tmp.cd_cont_200_4/means:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/vivos/model/model_parameters/tmp.cd_cont_200_4/means
--------------------------------------------------------------------------------
/egs/vivos/model/model_parameters/tmp.cd_cont_200_4/mixture_weights:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/vivos/model/model_parameters/tmp.cd_cont_200_4/mixture_weights
--------------------------------------------------------------------------------
/egs/vivos/model/model_parameters/tmp.cd_cont_200_4/transition_matrices:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/vivos/model/model_parameters/tmp.cd_cont_200_4/transition_matrices
--------------------------------------------------------------------------------
/egs/vivos/model/model_parameters/tmp.cd_cont_200_4/variances:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/vivos/model/model_parameters/tmp.cd_cont_200_4/variances
--------------------------------------------------------------------------------
/egs/vivos/model/model_parameters/tmp.cd_cont_initial/means:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/vivos/model/model_parameters/tmp.cd_cont_initial/means
--------------------------------------------------------------------------------
/egs/vivos/model/model_parameters/tmp.cd_cont_initial/mixture_weights:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/vivos/model/model_parameters/tmp.cd_cont_initial/mixture_weights
--------------------------------------------------------------------------------
/egs/vivos/model/model_parameters/tmp.cd_cont_initial/transition_matrices:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/vivos/model/model_parameters/tmp.cd_cont_initial/transition_matrices
--------------------------------------------------------------------------------
/egs/vivos/model/model_parameters/tmp.cd_cont_initial/variances:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/vivos/model/model_parameters/tmp.cd_cont_initial/variances
--------------------------------------------------------------------------------
/egs/vivos/model/model_parameters/tmp.cd_cont_untied/feat.params:
--------------------------------------------------------------------------------
1 | -lowerf 200
2 | -upperf 3500
3 | -nfilt 31
4 | -transform dct
5 | -lifter 22
6 | -feat 1s_c_d_dd
7 | -agc none
8 | -cmn batch
9 | -varnorm no
10 |
--------------------------------------------------------------------------------
/egs/vivos/model/model_parameters/tmp.cd_cont_untied/means:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/vivos/model/model_parameters/tmp.cd_cont_untied/means
--------------------------------------------------------------------------------
/egs/vivos/model/model_parameters/tmp.cd_cont_untied/mixture_weights:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/vivos/model/model_parameters/tmp.cd_cont_untied/mixture_weights
--------------------------------------------------------------------------------
/egs/vivos/model/model_parameters/tmp.cd_cont_untied/noisedict:
--------------------------------------------------------------------------------
1 | SIL
2 | SIL
3 | SIL
--------------------------------------------------------------------------------
/egs/vivos/model/model_parameters/tmp.cd_cont_untied/transition_matrices:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/vivos/model/model_parameters/tmp.cd_cont_untied/transition_matrices
--------------------------------------------------------------------------------
/egs/vivos/model/model_parameters/tmp.cd_cont_untied/variances:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/vivos/model/model_parameters/tmp.cd_cont_untied/variances
--------------------------------------------------------------------------------
/egs/vivos/model/model_parameters/tmp.ci_cont/feat.params:
--------------------------------------------------------------------------------
1 | -lowerf 200
2 | -upperf 3500
3 | -nfilt 31
4 | -transform dct
5 | -lifter 22
6 | -feat 1s_c_d_dd
7 | -agc none
8 | -cmn batch
9 | -varnorm no
10 |
--------------------------------------------------------------------------------
/egs/vivos/model/model_parameters/tmp.ci_cont/mdef:
--------------------------------------------------------------------------------
1 | # Generated by /usr/local/libexec/sphinxtrain/mk_mdef_gen on Sat Jan 6 09:51:27 2018
2 | 0.3
3 | 27 n_base
4 | 0 n_tri
5 | 108 n_state_map
6 | 81 n_tied_state
7 | 81 n_tied_ci_state
8 | 27 n_tied_tmat
9 | #
10 | # Columns definitions
11 | #base lft rt p attrib tmat ... state id's ...
12 | 4 - - - n/a 0 0 1 2 N
13 | SIL - - - filler 1 3 4 5 N
14 | a - - - n/a 2 6 7 8 N
15 | b - - - n/a 3 9 10 11 N
16 | c - - - n/a 4 12 13 14 N
17 | d - - - n/a 5 15 16 17 N
18 | e - - - n/a 6 18 19 20 N
19 | f - - - n/a 7 21 22 23 N
20 | g - - - n/a 8 24 25 26 N
21 | h - - - n/a 9 27 28 29 N
22 | i - - - n/a 10 30 31 32 N
23 | j - - - n/a 11 33 34 35 N
24 | k - - - n/a 12 36 37 38 N
25 | l - - - n/a 13 39 40 41 N
26 | m - - - n/a 14 42 43 44 N
27 | n - - - n/a 15 45 46 47 N
28 | o - - - n/a 16 48 49 50 N
29 | p - - - n/a 17 51 52 53 N
30 | q - - - n/a 18 54 55 56 N
31 | r - - - n/a 19 57 58 59 N
32 | s - - - n/a 20 60 61 62 N
33 | t - - - n/a 21 63 64 65 N
34 | u - - - n/a 22 66 67 68 N
35 | v - - - n/a 23 69 70 71 N
36 | w - - - n/a 24 72 73 74 N
37 | x - - - n/a 25 75 76 77 N
38 | y - - - n/a 26 78 79 80 N
39 |
--------------------------------------------------------------------------------
/egs/vivos/model/model_parameters/tmp.ci_cont/means:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/vivos/model/model_parameters/tmp.ci_cont/means
--------------------------------------------------------------------------------
/egs/vivos/model/model_parameters/tmp.ci_cont/mixture_weights:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/vivos/model/model_parameters/tmp.ci_cont/mixture_weights
--------------------------------------------------------------------------------
/egs/vivos/model/model_parameters/tmp.ci_cont/noisedict:
--------------------------------------------------------------------------------
1 | SIL
2 | SIL
3 | SIL
--------------------------------------------------------------------------------
/egs/vivos/model/model_parameters/tmp.ci_cont/transition_matrices:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/vivos/model/model_parameters/tmp.ci_cont/transition_matrices
--------------------------------------------------------------------------------
/egs/vivos/model/model_parameters/tmp.ci_cont/variances:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/vivos/model/model_parameters/tmp.ci_cont/variances
--------------------------------------------------------------------------------
/egs/vivos/model/model_parameters/tmp.ci_cont_flatinitial/globalmean:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/vivos/model/model_parameters/tmp.ci_cont_flatinitial/globalmean
--------------------------------------------------------------------------------
/egs/vivos/model/model_parameters/tmp.ci_cont_flatinitial/globalvar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/vivos/model/model_parameters/tmp.ci_cont_flatinitial/globalvar
--------------------------------------------------------------------------------
/egs/vivos/model/model_parameters/tmp.ci_cont_flatinitial/means:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/vivos/model/model_parameters/tmp.ci_cont_flatinitial/means
--------------------------------------------------------------------------------
/egs/vivos/model/model_parameters/tmp.ci_cont_flatinitial/mixture_weights:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/vivos/model/model_parameters/tmp.ci_cont_flatinitial/mixture_weights
--------------------------------------------------------------------------------
/egs/vivos/model/model_parameters/tmp.ci_cont_flatinitial/transition_matrices:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/vivos/model/model_parameters/tmp.ci_cont_flatinitial/transition_matrices
--------------------------------------------------------------------------------
/egs/vivos/model/model_parameters/tmp.ci_cont_flatinitial/variances:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/vivos/model/model_parameters/tmp.ci_cont_flatinitial/variances
--------------------------------------------------------------------------------
/egs/vivos/model/text.py:
--------------------------------------------------------------------------------
1 | rules_1 = [
2 | "aàáảãạ",
3 | "ăằắẳẵặ",
4 | "âầấẩẫậ",
5 | "eèéẻẽẹ",
6 | "êềếểễệ",
7 | "iìíỉĩị",
8 | "oòóỏõọ",
9 | "ôồốổỗộ",
10 | "ơờớởỡợ",
11 | "uùúủũụ",
12 | "ưừứửữự",
13 | "yỳýỷỹỵ"
14 | ]
15 | rules_2 = [
16 | "awă",
17 | "aaâ",
18 | "eeê",
19 | "ooô",
20 | "owơ",
21 | "uwư",
22 | "ddđ"
23 | ]
24 | w2p = {}
25 | p2w = {}
26 | for words in rules_1:
27 | original = words[0]
28 | words = words[1:]
29 | for rule in rules_2:
30 | if original == rule[2]:
31 | original = rule[0:2]
32 | tones = "fsrxj"
33 | for i, w in enumerate(words):
34 | w2p[w] = original + tones[i]
35 | for rule in rules_2:
36 | w2p[rule[2]] = rule[0:2]
37 | for key, value in w2p.items():
38 | p2w[value] = key
39 |
40 |
41 | def word2phone(word):
42 | phone = ""
43 | for w in word:
44 | if w in w2p:
45 | phone += w2p[w]
46 | else:
47 | phone += w
48 | return phone
49 |
50 |
51 | def phone2word(phone):
52 | i = 0
53 | word = ""
54 | while i < len(phone):
55 | if phone[i:i+3] in p2w:
56 | p = phone[i:i+3]
57 | word += p2w[p]
58 | i += 3
59 | elif phone[i:i+2] in p2w:
60 | p = phone[i:i+2]
61 | word += p2w[p]
62 | i += 2
63 | else:
64 | p = phone[i:i+1]
65 | word += p
66 | i += 1
67 | return word
68 |
69 | if __name__ == '__main__':
70 | tests = [
71 | ("con hoẵng", "con hoawxng"),
72 | ("lựu đạn", "luwju ddajn"),
73 | ("kiểm tra", "kieerm tra"),
74 | ("ủy ban", "ury ban"),
75 | ("cà phê", "caf phee"),
76 | ("khách sạn", "khasch sajn"),
77 | ("đúng", "ddusng"),
78 | ("xã hội", "xax hooji")
79 | ]
80 | for test in tests:
81 | assert (test[0] == phone2word(test[1]))
82 | assert (test[1] == word2phone(test[0]))
83 |
--------------------------------------------------------------------------------
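A minimal usage sketch for the tone-mapping module above, assuming it is importable as `text` (it lives at egs/vivos/model/text.py); the expected strings come from the module's own test cases:

```python
from text import word2phone, phone2word

# Toned vowels become base letters plus a tone code (f/s/r/x/j), so the
# lexicon stays pure ASCII; rules_2 handles the non-tonal letters ă â ê ô ơ ư đ.
assert word2phone("khách sạn") == "khasch sajn"

# phone2word reverses the mapping with a longest-match scan: 3-character
# codes (letter pair + tone) are tried before 2-character ones.
assert phone2word("khasch sajn") == "khách sạn"
```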
/egs/vivos/predict.py:
--------------------------------------------------------------------------------
1 | import os
2 | import argparse
3 |
4 | parser = argparse.ArgumentParser(description='Predict the transcription of a wav file with a trained Kaldi model.')
5 | parser.add_argument('--wav', help='Path to the input wav file to predict', required=True)
6 | parser.add_argument('--kaldi_folder', help='Kaldi dir path', required=True)
7 | parser.add_argument('--model_path', help='Model path, usually exp/{model} in kaldi-trunk/egs/{result}', required=True)
8 | parser.add_argument('--utils_path', help='Kaldi utils dir path, usually two levels above model_path')
9 | parser.add_argument('--method', help='Method to predict: delta or lda_mllt', default="delta")
10 |
11 | args = parser.parse_args()
12 |
13 |
14 | def predict(kaldi_folder, wav_file, model_path, method="delta", utils_path=None):
15 |     # Model path is usually at kaldi-trunk/egs/uts_{random_int}/exp
16 | model = model_path
17 |
18 | if not os.path.exists(os.path.join(model, "final.mdl")):
19 | raise Exception("Cannot find final.mdl model file with given model path.")
20 | if not os.path.exists(os.path.join(model, "graph")):
21 | raise Exception("Cannot find graph with given model path.")
22 |
23 | if utils_path is None:
24 | utils_path = os.path.join(os.path.dirname(os.path.dirname(model)), "utils")
25 |
26 | if not os.path.exists(os.path.join(utils_path, "int2sym.pl")):
27 | raise Exception(
28 | "Cannot find int2sym.pl file with given utils path, please make sure that you are provided correctly utils_path argument")
29 |
30 | # Prepare predict dir
31 | os.system("cd {}; rm -rf predict;".format(model))
32 | os.system("cd {}; mkdir predict;".format(model))
33 | os.system("cd {}/predict; mkdir config;".format(model))
34 | os.system("cd {}/predict; mkdir experiment;".format(model))
35 | os.system("cd {}/predict; mkdir transcriptions;".format(model))
36 | os.system("cd {}/predict/experiment; mkdir triphones_deldel;".format(model))
37 |
38 | # Copy pre-trained model
39 | os.system("cd {};cp final.mdl predict/experiment/triphones_deldel/final.mdl;".format(model))
40 |
41 | os.system("cd {};cp -r graph predict/experiment/triphones_deldel/graph".format(model))
42 |
43 | os.system("cd {}/predict/config; echo '--use-energy=true \n\
44 | --sample-frequency=16000 \n\
45 | --num-mel-bins=40 \n\
46 | --frame-length=25 \n\
47 | --frame-shift=10 \n\
48 | --high-freq=0 \n\
49 | --low-freq=0 \n\
50 | --num-ceps=13 \n\
51 | --window-type=hamming' > mfcc.conf".format(model))
52 | os.system("cd {}/predict/transcriptions; echo 'result: {}' > wav.scp".format(model, wav_file))
53 | os.system("cd {}/predict/transcriptions; echo 'VIVOSDEV16 result:' > spk2utt".format(model))
54 | os.system("cd {}/predict/transcriptions; echo 'result: VIVOSDEV16' > utt2spk".format(model))
55 | # os.system("cd {}/predict/transcriptions; echo 'VIVOSDEV02-R015 result' > utt2spk".format(model))
56 |
57 | # Run predict
58 | os.system(
59 | "cd {}/predict; {}/src/featbin/compute-mfcc-feats --config=config/mfcc.conf \
60 | scp:transcriptions/wav.scp ark,scp:transcriptions/feats.ark,transcriptions/feats.scp" \
61 | .format(model, kaldi_folder))
62 |
63 | os.system(
64 | "cd {}/predict; {}/src/featbin/compute-cmvn-stats --spk2utt=ark:transcriptions/spk2utt \
65 | scp:transcriptions/feats.scp ark,scp:experiment/cmvn.ark,experiment/cmvn.scp" \
66 | .format(model, kaldi_folder))
67 |
68 | # os.system(
69 | # "cd {}/predict; {}/src/featbin/apply-cmvn --uut2spk=ark:transcriptions/utt2spk \
70 | # scp:transcriptions/feats.scp ark,scp:experiment/cmvn.ark,experiment/cmvn.scp" \
71 | # .format(model, kaldi_folder))
72 |
73 | # delta
74 | if method == "delta":
75 | # os.system("cd {}/predict; {}/src/featbin/add-deltas \
76 | # scp:transcriptions/feats.scp ark:transcriptions/delta-feats.ark" \
77 | # .format(model, kaldi_folder))
78 |
79 | # os.system("cd {}/predict; {}/src/gmmbin/gmm-latgen-faster \
80 | # --max-active=7000 --beam=13.0 --lattice_beam=6.0 --acoustic-scale=0.83333 --allow-partial=true \
81 | # --word-symbol-table=experiment/triphones_deldel/graph/words.txt \
82 | # experiment/triphones_deldel/final.mdl \
83 | # experiment/triphones_deldel/graph/HCLG.fst \
84 | # ark:transcriptions/delta-feats.ark \
85 | # ark,t:transcriptions/lattices.ark" \
86 | # .format(model, kaldi_folder))
87 | command = "cd {}/predict; {}/src/gmmbin/gmm-latgen-faster \
88 | --max-active=7000 --beam=13.0 --lattice_beam=6.0 --acoustic-scale=0.83333 --allow-partial=true \
89 | --word-symbol-table=experiment/triphones_deldel/graph/words.txt \
90 | experiment/triphones_deldel/final.mdl \
91 | experiment/triphones_deldel/graph/HCLG.fst \
92 | 'ark,s,cs:{}/src/featbin/apply-cmvn \
93 | --utt2spk=ark:transcriptions/utt2spk \
94 | scp:experiment/cmvn.scp \
95 | scp:transcriptions/feats.scp ark:- | \
96 | {}/src/featbin/add-deltas ark:- ark:- |' 'ark,t:transcriptions/lattices.ark' 'ark:|gzip -c > experiment/lat.gz'" \
97 | .format(model, kaldi_folder, kaldi_folder, kaldi_folder)
98 | os.system(command)
99 | elif method == "lda_mllt":
100 | os.system("cd {};cp final.mat predict/experiment/triphones_deldel/final.mat;".format(model))
101 |
102 | os.system("cd {}/predict; {}/src/featbin/splice-feats \
103 | scp:transcriptions/feats.scp \
104 | ark:transcriptions/splice-feats.ark".format(model, kaldi_folder))
105 | os.system("cd {}/predict; {}/src/featbin/transform-feats \
106 | experiment/triphones_deldel/final.mat \
107 | ark:transcriptions/splice-feats.ark \
108 | ark:transcriptions/splice-transform-feats.ark".format(model, kaldi_folder))
109 | os.system("cd {}/predict; {}/src/gmmbin/gmm-latgen-faster \
110 | --word-symbol-table=experiment/triphones_deldel/graph/words.txt \
111 | experiment/triphones_deldel/final.mdl experiment/triphones_deldel/graph/HCLG.fst \
112 | ark:transcriptions/splice-transform-feats.ark ark,t:transcriptions/lattices.ark" \
113 | .format(model, kaldi_folder))
114 | else:
115 | raise Exception("The given method {} is not supported yet".format(method))
116 |
117 | os.system("cd {}/predict; {}/src/latbin/lattice-best-path"
118 | " \
119 | --word-symbol-table=experiment/triphones_deldel/graph/words.txt \
120 | ark:transcriptions/lattices.ark \
121 | ark,t:transcriptions/one-best.tra" \
122 | .format(model, kaldi_folder))
123 |
124 | os.system("cd {}/predict; {}/int2sym.pl"
125 | " -f 2- {}/predict/experiment/triphones_deldel/graph/words.txt transcriptions/one-best.tra \
126 | > {}/predict/transcriptions/one-best-hypothesis.txt; echo $(<{}/predict/transcriptions/one-best-hypothesis.txt);" \
127 | .format(model, utils_path, model, model, model))
128 |
129 | result = open("{}/predict/transcriptions/one-best-hypothesis.txt".format(model)).read()
130 |     # The result is stored in model_path/predict/transcriptions/one-best-hypothesis.txt in the format "result: {predict_result}"
131 | result = result[8:]
132 | print(result)
133 | return result
134 |
135 |
136 | if __name__ == "__main__":
137 | predict(args.kaldi_folder, args.wav, args.model_path, args.method, args.utils_path)
138 |
--------------------------------------------------------------------------------
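A hedged sketch of calling predict() directly instead of through the CLI; every path below is a placeholder that must point at a real Kaldi checkout and a trained model directory containing final.mdl and graph/:

```python
from predict import predict  # assumes egs/vivos is the working directory

hypothesis = predict(
    kaldi_folder="/path/to/kaldi-trunk",   # Kaldi tree with compiled src/featbin and src/gmmbin
    wav_file="/path/to/test.wav",          # 16 kHz mono wav, matching mfcc.conf above
    model_path="/path/to/kaldi-trunk/egs/uts_443/exp/tri2a",
    method="delta",                        # or "lda_mllt"
)
print(hypothesis)
```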
/egs/vivos/predict_delta.sh:
--------------------------------------------------------------------------------
1 | # Please don't change this default config
2 | MODEL=/home/anhv/PycharmProjects/kaldi-trunk/egs/uts_443/exp/tri2a
3 | KALDI=/home/anhv/PycharmProjects/kaldi-trunk
4 | WAV=/home/anhv/PycharmProjects/undertheseanlp/automatic_speech_recognition/experiment/vivos/test/VIVOSDEV01_R034.wav
5 |
6 | # Variables
7 | # MODEL=
8 | # KALDI=
9 | # WAV=
10 |
11 | # Prepare predict dir
12 | cd $MODEL;
13 | rm -rf predict
14 | mkdir predict
15 | cd $MODEL/predict
16 | mkdir config; mkdir experiment; mkdir transcriptions
17 | cd $MODEL/predict/experiment
18 | mkdir triphones_delta
19 |
20 | # Copy pre-trained model
21 | cd $MODEL
22 | cp final.mdl predict/experiment/triphones_delta/final.mdl
23 | cp -r graph predict/experiment/triphones_delta/graph
24 |
25 | cd $MODEL/predict/config
26 | cat > mfcc.conf << EOL
27 | --use-energy=true
28 | --sample-frequency=16000
29 | --num-mel-bins=40
30 | --frame-length=25
31 | --frame-shift=10
32 | --high-freq=0
33 | --low-freq=0
34 | --num-ceps=13
35 | --window-type=hamming
36 | EOL
37 |
38 | # Prepare util
39 | cd $MODEL/predict/transcriptions
40 | echo "result: $WAV" > wav.scp
41 | echo "VIVOSDEV16 result:" > spk2utt
42 | echo "result: VIVOSDEV16" > utt2spk
43 |
44 |
45 | # Run predict
46 | cd $MODEL/predict;
47 | $KALDI/src/featbin/compute-mfcc-feats \
48 | --config=config/mfcc.conf \
49 | scp:transcriptions/wav.scp \
50 | ark,scp:transcriptions/feats.ark,transcriptions/feats.scp
51 | $KALDI/src/featbin/compute-cmvn-stats --spk2utt=ark:transcriptions/spk2utt \
52 | scp:transcriptions/feats.scp \
53 | ark,scp:experiment/cmvn.ark,experiment/cmvn.scp
54 |
55 | cd $MODEL/predict;
56 | $KALDI/src/gmmbin/gmm-latgen-faster \
57 | --max-active=7000 --beam=13.0 --lattice_beam=6.0 --acoustic-scale=0.83333 --allow-partial=true \
58 | --word-symbol-table=experiment/triphones_delta/graph/words.txt \
59 | experiment/triphones_delta/final.mdl \
60 | experiment/triphones_delta/graph/HCLG.fst \
61 | 'ark,s,cs:'$KALDI'/src/featbin/apply-cmvn \
62 | --utt2spk=ark:transcriptions/utt2spk scp:experiment/cmvn.scp scp:transcriptions/feats.scp \
63 | ark:- | '$KALDI'/src/featbin/add-deltas ark:- ark:- |' 'ark:|gzip -c > experiment/lat.JOB.gz'
64 |
65 | echo "Finish predict"
--------------------------------------------------------------------------------
/egs/vivos/preprocess.py:
--------------------------------------------------------------------------------
1 | import shutil
2 | from os import mkdir, walk
3 | from os import listdir
4 | from os.path import dirname
5 | from os.path import join
6 | import os
7 | import re
8 |
9 | def create_train_waves():
10 |
11 | waves_folder = join(dirname(dirname(dirname(__file__))), "data", "vivos",
12 | "raw","train","waves")
13 | waves_folder_2 = join(dirname(dirname(dirname(__file__))), "data", "vivos",
14 | "raw","test","waves")
15 | corpus_waves_folder = join(dirname(dirname(dirname(__file__))), "data", "vivos",
16 | "corpus","train","wav")
17 | try:
18 | shutil.rmtree(corpus_waves_folder)
19 |     except OSError:
20 | pass
21 | finally:
22 | mkdir(corpus_waves_folder)
23 | for root, dirs, files in walk(waves_folder):
24 | for dir in dirs:
25 | for f in listdir(join(waves_folder, dir)):
26 | shutil.copy(
27 | join(waves_folder, dir, f),
28 | join(corpus_waves_folder, f))
29 |
30 | for root, dirs, files in walk(waves_folder_2):
31 | for dir in dirs:
32 | for f in listdir(join(waves_folder_2, dir)):
33 | shutil.copy(
34 | join(waves_folder_2, dir, f),
35 | join(corpus_waves_folder, f))
36 |
37 |
38 | def create_test_waves():
39 | waves_folder = join(dirname(dirname(dirname(__file__))), "data", "vlsp",
40 | "wav")
41 | corpus_waves_folder = join(dirname(dirname(dirname(__file__))), "data", "vivos",
42 | "corpus","test")
43 | corpus_short_waves_folder = join(dirname(dirname(dirname(__file__))), "data", "vivos",
44 | "corpus", "test_short")
45 | try:
46 | shutil.rmtree(corpus_waves_folder)
47 | shutil.rmtree(corpus_short_waves_folder)
48 |     except OSError:
49 | pass
50 | finally:
51 | mkdir(corpus_waves_folder)
52 | mkdir(corpus_short_waves_folder)
53 | mkdir(join(corpus_short_waves_folder,"wav"))
54 |
55 | shutil.copytree(waves_folder,join(corpus_waves_folder,"wav"))
56 | files = listdir(join(corpus_waves_folder,"wav"))
57 | for file in files:
58 | os.rename(join(corpus_waves_folder,"wav",file),join(corpus_waves_folder,"wav","{}_{}".format("global",file)))
59 | list_files = listdir(join(corpus_waves_folder,"wav"))
60 | list_files.sort()
61 | for index,file in enumerate(list_files):
62 | if index < 20:
63 | shutil.copyfile(join(corpus_waves_folder,"wav",file),join(corpus_short_waves_folder,"wav",file))
64 |
65 |
66 | def create_train_text():
67 | content_path = join(dirname(dirname(dirname(__file__))), "data", "vivos",
68 | "raw","train","prompts.txt")
69 | content_path2 = join(dirname(dirname(dirname(__file__))), "data", "vivos",
70 | "raw", "test", "prompts.txt")
71 | content = open(content_path).read()
72 | content = content.replace(":", "")
73 |
74 | content2 = open(content_path2).read()
75 | content2 = content2.replace(":", "")
76 | lines = content.splitlines()
77 | lines2 = content2.splitlines()
78 | output = []
79 | for line in lines:
80 | items = line.split()
81 | fileid = items[0]
82 | text = " ".join(items[1:]).lower()
83 | content = "{}|{}".format(fileid, text)
84 | output.append(content)
85 | for line in lines2:
86 | items = line.split()
87 | fileid = items[0]
88 | text = " ".join(items[1:]).lower()
89 | content2 = "{}|{}".format(fileid, text)
90 | output.append(content2)
91 | text = "\n".join(output)
92 |
93 | content_path = join(dirname(dirname(dirname(__file__))), "data", "vivos","corpus","train", "text")
94 | open(content_path, "w").write(text)
95 |
96 |
97 | def create_test_text():
98 | content_path = join(dirname(dirname(dirname(__file__))), "data", "vlsp", "text")
99 |
100 | content = open(content_path).read()
101 | content = content.replace(":", "")
102 | lines = content.splitlines()
103 | output = []
104 | output_short = []
105 | short_counter = 0
106 | for line in lines:
107 |         m = re.match(r"^(?P<fileid>.*)\t(?P<text>.*)$", line)
108 | if m:
109 | text = m.group("text")
110 | fileid = m.group("fileid")
111 | content = "{}|{}".format("global_{}".format(fileid), text)
112 | output.append(content)
113 | if short_counter < 20:
114 | output_short.append(content)
115 | short_counter += 1
116 | text = "\n".join(output)
117 |
118 |
119 | content_path = join(dirname(dirname(dirname(__file__))), "data", "vivos", "corpus", "test", "text")
120 | open(content_path, "w").write(text)
121 |
122 | text = "\n".join(output_short)
123 |
124 | content_path = join(dirname(dirname(dirname(__file__))), "data", "vivos", "corpus", "test_short", "text")
125 | open(content_path, "w").write(text)
126 |
127 |
128 | def create_gender():
129 | content_path = join(dirname(dirname(dirname(__file__))), "data", "vivos", "raw", "train", "genders.txt")
130 | content = open(content_path).read()
131 |
132 | content_path2 = join(dirname(dirname(dirname(__file__))), "data", "vivos", "raw", "test", "genders.txt")
133 | content2 = open(content_path2).read()
134 | content = content2 + content
135 |
136 | output_path = join(dirname(dirname(dirname(__file__))), "data", "vivos", "corpus", "train", "gender")
137 | open(output_path, "w").write(content)
138 |
139 | content_test = "\n".join(["global m"])
140 |
141 | output_test_path = join(dirname(dirname(dirname(__file__))), "data", "vivos", "corpus", "test", "gender")
142 | open(output_test_path, "w").write(content_test)
143 |
144 | output_test_path = join(dirname(dirname(dirname(__file__))), "data", "vivos", "corpus", "test_short", "gender")
145 | open(output_test_path, "w").write(content_test)
146 |
147 |
148 | def create_speaker():
149 | content_path = join(dirname(dirname(dirname(__file__))), "data", "vivos", "raw", "train", "prompts.txt")
150 | content_path2 = join(dirname(dirname(dirname(__file__))), "data", "vivos", "raw", "test", "prompts.txt")
151 | lines = open(content_path).read().splitlines()
152 | files = [line.split()[0] for line in lines]
153 | tmp = []
154 |
155 | for file_id in files:
156 | speaker_id = file_id.split("_")[0]
157 | content = "{} {}".format(speaker_id, file_id)
158 | tmp.append(content)
159 |
160 | # Merge vivos test to train dir
161 | lines2 = open(content_path2).read().splitlines()
162 | files2 = [line.split()[0] for line in lines2]
163 |
164 | for file_id in files2:
165 | speaker_id = file_id.split("_")[0]
166 | content = "{} {}".format(speaker_id, file_id)
167 | tmp.append(content)
168 |
169 | tmp.sort()
170 |
171 | content = "\n".join(tmp)
172 |
173 | content_path = join(dirname(dirname(dirname(__file__))), "data", "vivos", "corpus", "train", "speaker")
174 | open(content_path, "w").write(content)
175 |
176 | lines_test_path = join(dirname(dirname(dirname(__file__))), "data", "vlsp", "text")
177 | lines_test = open(lines_test_path).read().splitlines()
178 | test_output = []
179 | short_test_output = []
180 | short_test_counter = 0
181 |
182 | for line in lines_test:
183 | # print(line)
184 |         m = re.match(r"^(?P<fileid>.*)\t(?P<text>.*)$", line)
185 | if m:
186 | # text = m.group("text")
187 | fileid = m.group("fileid")
188 | content = "global {}".format("global_{}".format(fileid))
189 |
190 | test_output.append(content)
191 | if short_test_counter < 20:
192 | short_test_output.append(content)
193 |
194 | short_test_counter+=1
195 | content_path = join(dirname(dirname(dirname(__file__))), "data", "vivos", "corpus", "test", "speaker")
196 | content = "\n".join(test_output)
197 | open(content_path, "w").write(content)
198 |
199 | content_path = join(dirname(dirname(dirname(__file__))), "data", "vivos", "corpus", "test_short", "speaker")
200 | short_content = "\n".join(short_test_output)
201 | open(content_path, "w").write(short_content)
202 |
203 | try:
204 | shutil.rmtree(join(dirname(dirname(dirname(__file__))), "data", "vivos", "corpus"))
205 |     except OSError:
206 | pass
207 | finally:
208 | mkdir(join(dirname(dirname(dirname(__file__))), "data", "vivos", "corpus"))
209 | mkdir(join(dirname(dirname(dirname(__file__))), "data", "vivos", "corpus","train"))
210 | mkdir(join(dirname(dirname(dirname(__file__))), "data", "vivos", "corpus", "test"))
211 | mkdir(join(dirname(dirname(dirname(__file__))), "data", "vivos", "corpus", "test_short"))
212 | create_train_waves()
213 | create_test_waves()
214 | create_train_text()
215 | create_test_text()
216 | create_gender()
217 | create_speaker()
218 |
--------------------------------------------------------------------------------
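preprocess.py runs its whole pipeline at import time (the try/finally block at the bottom), so a hedged way to drive it is simply to execute the file; the directory layout below is inferred from the path joins in the script, not documented anywhere:

```python
import runpy

# Expects, relative to the repository root:
#   data/vivos/raw/{train,test}/waves/, prompts.txt, genders.txt
#   data/vlsp/wav/ and data/vlsp/text
# and rebuilds data/vivos/corpus/{train,test,test_short}.
runpy.run_path("egs/vivos/preprocess.py")
```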
/egs/vivos/test/VIVOSDEV01_R003.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/vivos/test/VIVOSDEV01_R003.wav
--------------------------------------------------------------------------------
/egs/vivos/test/VIVOSDEV01_R012.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/vivos/test/VIVOSDEV01_R012.wav
--------------------------------------------------------------------------------
/egs/vivos/test/VIVOSDEV01_R027.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/vivos/test/VIVOSDEV01_R027.wav
--------------------------------------------------------------------------------
/egs/vivos/test/VIVOSDEV01_R028.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/vivos/test/VIVOSDEV01_R028.wav
--------------------------------------------------------------------------------
/egs/vivos/test/VIVOSDEV01_R034.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/vivos/test/VIVOSDEV01_R034.wav
--------------------------------------------------------------------------------
/egs/vivos/test/VIVOSDEV01_R043.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/vivos/test/VIVOSDEV01_R043.wav
--------------------------------------------------------------------------------
/egs/vivos/test/VIVOSDEV01_R044.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/vivos/test/VIVOSDEV01_R044.wav
--------------------------------------------------------------------------------
/egs/vivos/test/VIVOSDEV01_R055.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/egs/vivos/test/VIVOSDEV01_R055.wav
--------------------------------------------------------------------------------
/egs/vivos/test_model.py:
--------------------------------------------------------------------------------
1 | from model import transcript
2 | from os.path import join, dirname
3 | from unittest import TestCase
4 |
5 |
6 | class TestTranscript(TestCase):
7 | def test_1(self):
8 | wav = join(dirname(__file__), "test", "VIVOSDEV01_R003.wav")
9 | actual = transcript(wav)
10 | expected = "cà phê"
11 | self.assertEqual(actual, expected)
12 |
13 | def test_2(self):
14 | wav = join(dirname(__file__), "test", "VIVOSDEV01_R034.wav")
15 | actual = transcript(wav)
16 | expected = "khách sạn"
17 | self.assertEqual(actual, expected)
18 |
--------------------------------------------------------------------------------
/egs/vivos/text2.py:
--------------------------------------------------------------------------------
1 | rules_1 = [
2 | "aàáảãạ",
3 | "ăằắẳẵặ",
4 | "âầấẩẫậ",
5 | "eèéẻẽẹ",
6 | "êềếểễệ",
7 | "iìíỉĩị",
8 | "oòóỏõọ",
9 | "ôồốổỗộ",
10 | "ơờớởỡợ",
11 | "uùúủũụ",
12 | "ưừứửữự",
13 | "yỳýỷỹỵ"
14 | ]
15 | rules_2 = [
16 | "awă",
17 | "aaâ",
18 | "eeê",
19 | "ooô",
20 | "owơ",
21 | "uwư",
22 | "ddđ"
23 | ]
24 | w2p = {}
25 | p2w = {}
26 | for words in rules_1:
27 | original = words[0]
28 | words = words[1:]
29 | for rule in rules_2:
30 | if original == rule[2]:
31 | original = rule[0:2]
32 | tones = "fsrxj"
33 | for i, w in enumerate(words):
34 | w2p[w] = original + tones[i]
35 | for rule in rules_2:
36 | w2p[rule[2]] = rule[0:2]
37 | for key, value in w2p.items():
38 | p2w[value] = key
39 |
40 |
41 | def word2phone(word):
42 | phone = ""
43 | for w in word:
44 | if w in w2p:
45 | phone += w2p[w]
46 | else:
47 | phone += w
48 | return phone
49 |
50 |
51 | def phone2word(phone):
52 | i = 0
53 | word = ""
54 | while i < len(phone):
55 | if phone[i:i+3] in p2w:
56 | p = phone[i:i+3]
57 | word += p2w[p]
58 | i += 3
59 | elif phone[i:i+2] in p2w:
60 | p = phone[i:i+2]
61 | word += p2w[p]
62 | i += 2
63 | else:
64 | p = phone[i:i+1]
65 | word += p
66 | i += 1
67 | return word
68 |
69 | if __name__ == '__main__':
70 | tests = [
71 | ("con hoẵng", "con hoawxng"),
72 | ("lựu đạn", "luwju ddajn"),
73 | ("kiểm tra", "kieerm tra"),
74 | ("ủy ban", "ury ban"),
75 | ("cà phê", "caf phee"),
76 | ("khách sạn", "khasch sajn"),
77 | ("đúng", "ddusng"),
78 | ("xã hội", "xax hooji")
79 | ]
80 | for test in tests:
81 | assert (test[0] == phone2word(test[1]))
82 | assert (test[1] == word2phone(test[0]))
83 |
--------------------------------------------------------------------------------
/egs/vivos/train.py:
--------------------------------------------------------------------------------
1 | from egs.vivos.extension.model import KaldiSpeechRecognition
2 | from os.path import join, dirname
3 | import argparse
4 |
5 | parser = argparse.ArgumentParser(description='Train a Kaldi speech recognition model.')
6 | parser.add_argument('--kaldi_folder', help='Kaldi dir path', required=True)
7 | parser.add_argument('--corpus_folder', help='Corpus path to train', required=True)
8 | parser.add_argument('--export_path', help='Export path (will be available soon)')
9 | parser.add_argument('--nj', help='Number of parallel jobs', default=1)
10 | parser.add_argument('--method', help='Training method', default="deltadelta")
11 |
12 |
13 | args = parser.parse_args()
14 |
15 |
16 | def train(kaldi_folder, corpus_folder, export_folder=None, nj=1, method="deltadelta"):
17 |     export_folder = export_folder or join(dirname(__file__), "model")
18 | params = {
19 | "method": method,
20 | "jobs": nj,
21 | "lm_order": 1
22 | }
23 | model = KaldiSpeechRecognition(corpus_folder, kaldi_folder, params)
24 | model.fit()
25 |
26 |
27 | if __name__ == "__main__":
28 |     train(args.kaldi_folder, args.corpus_folder, args.export_path, args.nj, args.method)
--------------------------------------------------------------------------------
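A hedged sketch of the equivalent programmatic call; the paths are placeholders, and corpus_folder is assumed to be the directory produced by preprocess.py:

```python
from egs.vivos.train import train

train(
    kaldi_folder="/path/to/kaldi-trunk",
    corpus_folder="data/vivos/corpus",   # assumed output of preprocess.py
    nj=4,                                # parallel jobs
    method="deltadelta",
)
```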
/insight/vivos.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/insight/vivos.txt
--------------------------------------------------------------------------------
/insight/vlsp2018.txt:
--------------------------------------------------------------------------------
1 | # VLSP 2018 data
2 | 
3 | The VLSP 2018 dataset contains 796 sentences in total.
4 | 
5 | The data consists of 796 utterances with an average length of 40 tokens (max 104 tokens, min 0 tokens).
6 | One special utterance, id 0437, contains no speech at all; its wav file holds only the sound of motorbikes passing on the street.
7 | 
8 | Distribution of sentence lengths in the dataset:
9 |
10 | ```
11 | count 796.000000
12 | mean 40.812814
13 | std 22.313014
14 | min 0.000000
15 | 0% 0.000000
16 | 5% 9.000000
17 | 10% 13.000000
18 | 15.0% 16.000000
19 | 20% 19.000000
20 | 25% 22.000000
21 | 30.0% 25.000000
22 | 35% 28.000000
23 | 40% 31.000000
24 | 45% 34.000000
25 | 50% 38.000000
26 | 55.0% 41.000000
27 | 60.0% 46.000000
28 | 65% 49.000000
29 | 70% 53.000000
30 | 75% 58.000000
31 | 80% 62.000000
32 | 85.0% 68.000000
33 | 90% 73.000000
34 | 95% 81.000000
35 | 100% 104.000000
36 | max 104.000000
37 | ```
38 |
39 | The longest audio file is about 27 seconds, the shortest about 1 second, and the average audio file length is 9.5 seconds.
--------------------------------------------------------------------------------
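The percentile table above can be reproduced with pandas (util/eda_vlsp.py at the end of this repository does exactly this); data/vlsp/text is assumed to hold one `fileid transcript...` pair per line:

```python
import numpy as np
import pandas as pd

lines = open("data/vlsp/text").read().splitlines()
token_lengths = pd.Series([len(line.split()[1:]) for line in lines])
# 5%-step percentiles, matching the table above; count/mean/std/min/max
# are included by describe() automatically.
print(token_lengths.describe(percentiles=np.linspace(0, 1, 21)))
```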
/report/build.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | name="technique_report"
3 | pdflatex -file-line-error -interaction=nonstopmode -synctex=1 -output-format=pdf -output-directory=. $name.tex
4 | bibtex $name.aux
5 | pdflatex -file-line-error -interaction=nonstopmode -synctex=1 -output-format=pdf -output-directory=. $name.tex
6 | pdflatex -file-line-error -interaction=nonstopmode -synctex=1 -output-format=pdf -output-directory=. $name.tex
7 |
8 | rm -rf $name.blg
9 | rm -rf $name.log
10 | rm -rf $name.out
11 | rm -rf *.aux
12 | rm -rf $name.bbl
13 | rm -rf $name.synctex.gz
--------------------------------------------------------------------------------
/report/notation.tex:
--------------------------------------------------------------------------------
1 | % Tensor
2 | \DeclareMathAlphabet{\mathsfit}{\encodingdefault}{\sfdefault}{m}{sl}
3 | \SetMathAlphabet{\mathsfit}{bold}{\encodingdefault}{\sfdefault}{bx}{n}
4 | \newcommand{\tens}[1]{\bm{\mathsfit{#1}}}
5 | \def\tA{{\tens{A}}}
6 | \def\tB{{\tens{B}}}
7 | \def\tC{{\tens{C}}}
8 | \def\tD{{\tens{D}}}
9 | \def\tE{{\tens{E}}}
10 | \def\tF{{\tens{F}}}
11 | \def\tG{{\tens{G}}}
12 | \def\tH{{\tens{H}}}
13 | \def\tI{{\tens{I}}}
14 | \def\tJ{{\tens{J}}}
15 | \def\tK{{\tens{K}}}
16 | \def\tL{{\tens{L}}}
17 | \def\tM{{\tens{M}}}
18 | \def\tN{{\tens{N}}}
19 | \def\tO{{\tens{O}}}
20 | \def\tP{{\tens{P}}}
21 | \def\tQ{{\tens{Q}}}
22 | \def\tR{{\tens{R}}}
23 | \def\tS{{\tens{S}}}
24 | \def\tT{{\tens{T}}}
25 | \def\tU{{\tens{U}}}
26 | \def\tV{{\tens{V}}}
27 | \def\tW{{\tens{W}}}
28 | \def\tX{{\tens{X}}}
29 | \def\tY{{\tens{Y}}}
30 | \def\tZ{{\tens{Z}}}
31 | \def\tx{{\tens{x}}}
32 | \def\ty{{\tens{y}}}
--------------------------------------------------------------------------------
/report/technique_report.bib:
--------------------------------------------------------------------------------
1 | @article{DBLP:journals/corr/Le-Hong16,
2 | author = {Phuong Le{-}Hong},
3 | title = {Vietnamese Named Entity Recognition using Token Regular Expressions
4 | and Bidirectional Inference},
5 | journal = {CoRR},
6 | volume = {abs/1610.05652},
7 | year = {2016},
8 | url = {http://arxiv.org/abs/1610.05652},
9 | archivePrefix = {arXiv},
10 | eprint = {1610.05652},
11 | timestamp = {Wed, 07 Jun 2017 14:42:34 +0200},
12 | biburl = {https://dblp.org/rec/bib/journals/corr/Le-Hong16},
13 | bibsource = {dblp computer science bibliography, https://dblp.org}
14 | }
15 |
16 | @article{DBLP:journals/corr/abs-1708-07241,
17 | author = {Thai{-}Hoang Pham and
18 | Xuan{-}Khoai Pham and
19 | Tuan{-}Anh Nguyen and
20 | Phuong Le{-}Hong},
21 | title = {{NNVLP:} {A} Neural Network-Based Vietnamese Language Processing Toolkit},
22 | journal = {CoRR},
23 | volume = {abs/1708.07241},
24 | year = {2017},
25 | url = {http://arxiv.org/abs/1708.07241},
26 | archivePrefix = {arXiv},
27 | eprint = {1708.07241},
28 | timestamp = {Tue, 05 Sep 2017 10:03:46 +0200},
29 | biburl = {https://dblp.org/rec/bib/journals/corr/abs-1708-07241},
30 | bibsource = {dblp computer science bibliography, https://dblp.org}
31 | }
32 |
33 | @article{DBLP:journals/corr/abs-1801-01331,
34 | author = {Thanh Vu and
35 | Dat Quoc Nguyen and
36 | Dai Quoc Nguyen and
37 | Mark Dras and
38 | Mark Johnson},
39 | title = {VnCoreNLP: {A} Vietnamese Natural Language Processing Toolkit},
40 | journal = {CoRR},
41 | volume = {abs/1801.01331},
42 | year = {2018},
43 | url = {http://arxiv.org/abs/1801.01331},
44 | archivePrefix = {arXiv},
45 | eprint = {1801.01331},
46 | timestamp = {Thu, 01 Feb 2018 19:52:26 +0100},
47 | biburl = {https://dblp.org/rec/bib/journals/corr/abs-1801-01331},
48 | bibsource = {dblp computer science bibliography, https://dblp.org}
49 | }
50 |
51 | @article{DBLP:journals/corr/abs-1803-08463,
52 | author = {Pham Quang Nhat Minh},
53 | title = {A Feature-Based Model for Nested Named-Entity Recognition at {VLSP-2018}
54 | {NER} Evaluation Campaign},
55 | journal = {CoRR},
56 | volume = {abs/1803.08463},
57 | year = {2018},
58 | url = {http://arxiv.org/abs/1803.08463},
59 | archivePrefix = {arXiv},
60 | eprint = {1803.08463},
61 | timestamp = {Wed, 11 Apr 2018 11:12:46 +0200},
62 | biburl = {https://dblp.org/rec/bib/journals/corr/abs-1803-08463},
63 | bibsource = {dblp computer science bibliography, https://dblp.org}
64 | }
--------------------------------------------------------------------------------
/report/technique_report.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/report/technique_report.pdf
--------------------------------------------------------------------------------
/report/technique_report.tex:
--------------------------------------------------------------------------------
1 | \documentclass[11pt,a4paper]{article}
2 | \usepackage{acl2017}
3 | \usepackage{times}
4 | \usepackage{multirow}
5 | \usepackage{url}
6 | \usepackage{latexsym}
7 | \usepackage{graphicx}
8 | \usepackage{color}
9 | \usepackage{booktabs}
10 | \usepackage{amsmath}
11 | \usepackage[english,vietnam]{babel}
12 | \usepackage[utf8]{vietnam}
13 |
14 | \aclfinalcopy % Uncomment this line for the final submission
15 | %\def\eaclpaperid{***} % Enter the acl Paper ID here
16 |
17 | %\setlength\titlebox{5cm}
18 | % You can expand the titlebox if you need extra space
19 | % to show all the authors. Please do not make the titlebox
20 | % smaller than 5cm (the original size); we will check this
21 | % in the camera-ready version and ask you to change it back.
22 |
23 | \newcommand\BibTeX{B{\sc ib}\TeX}
24 |
25 | \title{Technical Report\\Vietnamese Speech Recognition Module\\ in underthesea}
26 |
27 | \include{notation}
28 |
29 | \author{
30 | Vũ Anh\\
31 | underthesea\\
32 | {\tt anhv.ict91@gmail.com} \\
33 | \And
34 | Lê Phi Hùng \\
35 | underthesea\\
36 | {\tt lephihungch@gmail.com} \\
37 | }
38 |
39 | \date{}
40 |
41 | \begin{document}
42 | \maketitle
43 | \begin{abstract}
44 |
45 | In this report, we describe the Vietnamese speech recognition system in underthesea. The system uses the Kaldi toolkit to build the recognition module, and the results are evaluated on the test set of VLSP 2018. All source code and documentation of the project are released as open source at \url{https://github.com/undertheseanlp/automatic_speech_recognition}
46 |
47 | \end{abstract}
48 |
49 | \section{Introduction}
50 |
51 | \section{System description}
52 | 
53 | The experiments are carried out with Kaldi\footnote{http://kaldi-asr.org/}, a speech recognition toolkit written in C++.
54 | 
55 | The process of building the speech recognition system is described below.
56 |
57 | \subsection{Preparing data and language resources}
58 | 
59 | The first task is to prepare the audio--transcript training data.
60 | This consists of audio files (usually in wav format) containing human speech, together with the corresponding transcript files.
61 | 
62 | The next task is to build the pronunciation dictionary.
63 | Put simply, the pronunciation dictionary records the pronunciation (the decomposition into phones) of each syllable.
64 | The system also needs silence phones (silence\_phones) and out-of-vocabulary (oov) words.
65 | 
66 | 
67 | Finally, data must be prepared for training the language model.
68 | The language model improves the quality of the speech recognition system by proposing the most likely options within a phrase.
69 | Consider an example in which the system must decide the missing word in the sentence \textit{Tôi đi Hà \_ mấy ngày}.
70 | With a language model, the system can easily recognize that \textit{Nội} is the most likely missing word in this sentence.
71 |
72 | \subsection{Training the Gaussian Mixture Model}
73 | 
74 | The first step is to train the acoustic model, the component that turns acoustic signals into textual data.
75 | The model is usually trained with the Gaussian Mixture Model algorithm on common acoustic feature sets such as MFCC (Mel-frequency cepstral coefficients)\footnote{For more on this feature, see \href{http://www.lrc.tnu.edu.vn/upload/collection/brief/41619_13520141527406.pdf}{So sánh hai phương pháp trích chọn đặc trưng âm thanh: Đường bao phổ (MFCC) và cao độ Pitch trong việc tìm kiếm âm nhạc theo nội dung}}. Other feature transforms include delta, LDA, MLLT, and SAT.
76 | 
77 | The second step is to train the language model.
78 |
79 | \subsection{The decoding process}
80 | 
81 | \begin{itemize}
82 | \item Build the decoding graph
83 | \item Rescore the lattice
84 | \end{itemize}
85 |
86 | \section{Evaluation}
87 | 
88 | \subsection{Datasets}
89 | 
90 | Two datasets are used: VIVOS and VLSP 2018. VIVOS is used for training, while VLSP 2018 is used to evaluate the model.
91 | 
92 | \subsection{Results}
93 |
94 | % TODO To be updated
95 |
96 | \section{Conclusion}
97 |
98 | % TODO To be updated
99 |
100 | \section{Acknowledgements}
101 | 
102 | Since our knowledge is still limited, the technical description in this report draws on \textit{Building Speech Recognition Systems with the Kaldi Toolkit}\footnote{https://engineering.jhu.edu/clsp/wp-content/uploads/sites/75/2016/06/Building-Speech-Recognition-Systems-with-the-Kaldi-Toolkit.pdf}.
103 |
104 | \bibliography{technique_report}
105 | \bibliographystyle{acl_natbib}
106 |
107 | \end{document}
--------------------------------------------------------------------------------
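As an illustration of the pronunciation-dictionary step described in the report, a minimal sketch that derives flat lexicon-style entries from the Telex codes in egs/vivos/model/text.py; this is an assumption about how entries could look, not the project's actual lexicon builder:

```python
from text import word2phone  # assumes egs/vivos/model is importable

# One "word phone phone ..." line per entry, splitting the ASCII code
# into single-character phones purely for illustration.
for word in ["cà", "phê", "khách", "sạn"]:
    print(word, " ".join(word2phone(word)))
# cà c a f
# phê p h e e
# khách k h a s c h
# sạn s a j n
```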
/tmp/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/undertheseanlp/automatic_speech_recognition/0206f9edb5c0208db17df9ee8d621d8223b8af6c/tmp/.gitkeep
--------------------------------------------------------------------------------
/util/eda_vlsp.py:
--------------------------------------------------------------------------------
1 | from os import listdir
2 | from os.path import join, dirname
3 | import pandas as pd
4 | import numpy as np
5 | import librosa
6 |
7 | ROOT_FOLDER = dirname(dirname(__file__))
8 |
9 |
10 | def stat_tokens(lines):
11 | token_lengths = [len(line.split()[1:]) for line in lines]
12 | token_lengths = pd.Series(token_lengths)
13 | print(token_lengths.describe(percentiles=np.linspace(0, 1, 21)))
14 |
15 |
16 | def stat_text():
17 | print("\nText Data:")
18 | text_file = join(ROOT_FOLDER, "data", "vlsp", "text")
19 | lines = open(text_file, "r").read().splitlines()
20 | print("VLSP 2018 DATA SET")
21 | print("\nTotal sentences:", len(lines))
22 | stat_tokens(lines)
23 |
24 |
25 | def stat_acoustic():
26 | print("\nAcoustic Data:")
27 | wav_folder = join(ROOT_FOLDER, "data", "vlsp", "wav")
28 | files = listdir(wav_folder)
29 | files = [join(wav_folder, file) for file in files]
30 | durations = [librosa.get_duration(filename=file) for file in files]
31 | durations = pd.Series(durations)
32 | print(f"Total: {durations.sum():.2f} seconds ({durations.sum() / 3600:.2f} hours)")
33 | print(durations.describe())
34 |
35 |
36 | if __name__ == '__main__':
37 | stat_text()
38 | stat_acoustic()
39 |
--------------------------------------------------------------------------------