├── s5 ├── steps ├── utils ├── conf │ ├── pitch.conf │ ├── decode.config │ ├── mfcc.conf │ ├── online_cmvn.conf │ └── mfcc_hires.conf ├── local │ ├── chain │ │ ├── run_tdnn.sh │ │ ├── compare_wer.sh │ │ └── tuning │ │ │ └── run_tdnn_1a.sh │ ├── score.sh │ ├── prepare_lm.sh │ ├── format_data.sh │ ├── run_rnnlms.sh │ ├── nnet3 │ │ ├── compare_wer.sh │ │ └── run_ivector_common.sh │ └── prepare_cv.py ├── cmd.sh ├── path.sh └── run.sh ├── vosk-inference ├── test.wav ├── tmp │ └── F0000000009_0030.wav.tmp ├── inference.py ├── server.py └── vosk_transcriber.py ├── .gitignore ├── docker ├── kaldi │ └── dockerfile └── vosk-inference │ └── dockerfile ├── README.md └── labels └── test-unique.tsv /s5/steps: -------------------------------------------------------------------------------- 1 | ../../commonvoice/s5/steps -------------------------------------------------------------------------------- /s5/utils: -------------------------------------------------------------------------------- 1 | ../../commonvoice/s5/utils -------------------------------------------------------------------------------- /s5/conf/pitch.conf: -------------------------------------------------------------------------------- 1 | --resample-frequency=8000 2 | -------------------------------------------------------------------------------- /s5/local/chain/run_tdnn.sh: -------------------------------------------------------------------------------- 1 | tuning/run_tdnn_1a.sh -------------------------------------------------------------------------------- /s5/cmd.sh: -------------------------------------------------------------------------------- 1 | train_cmd=utils/run.pl 2 | decode_cmd=utils/run.pl -------------------------------------------------------------------------------- /s5/conf/decode.config: -------------------------------------------------------------------------------- 1 | # empty config, just use the defaults. 
2 | -------------------------------------------------------------------------------- /s5/conf/mfcc.conf: -------------------------------------------------------------------------------- 1 | --use-energy=false # only non-default option. 2 | -------------------------------------------------------------------------------- /vosk-inference/test.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vistec-AI/commonvoice-th/HEAD/vosk-inference/test.wav -------------------------------------------------------------------------------- /s5/conf/online_cmvn.conf: -------------------------------------------------------------------------------- 1 | # configuration file for apply-cmvn-online, used in the script ../local/run_online_decoding.sh 2 | -------------------------------------------------------------------------------- /vosk-inference/tmp/F0000000009_0030.wav.tmp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vistec-AI/commonvoice-th/HEAD/vosk-inference/tmp/F0000000009_0030.wav.tmp -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # tmp files 2 | **/.DS_Store 3 | **/__pycache__ 4 | 5 | # SRILM download 6 | **/srilm.tar.gz 7 | 8 | # kaldi files 9 | s5/data 10 | s5/exp 11 | s5/mfcc 12 | -------------------------------------------------------------------------------- /s5/local/score.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e -o pipefail 4 | set -x 5 | steps/score_kaldi.sh "$@" 6 | steps/scoring/score_kaldi_cer.sh --stage 2 "$@" 7 | 8 | echo "$0: Done" 9 | -------------------------------------------------------------------------------- /s5/path.sh: -------------------------------------------------------------------------------- 1 | export 
KALDI_ROOT=/opt/kaldi 2 | export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH 3 | [ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 4 | . $KALDI_ROOT/tools/config/common_path.sh 5 | export LC_ALL=C 6 | 7 | # For now, don't include any of the optional dependenices of the main 8 | # librispeech recipe 9 | -------------------------------------------------------------------------------- /s5/conf/mfcc_hires.conf: -------------------------------------------------------------------------------- 1 | # config for high-resolution MFCC features, intended for neural network training 2 | # Note: we keep all cepstra, so it has the same info as filterbank features, 3 | # but MFCC is more easily compressible (because less correlated) which is why 4 | # we prefer this method. 5 | --use-energy=false # use average of log energy, not energy. 6 | --num-mel-bins=40 # similar to Google's setup. 7 | --num-ceps=40 # there is no dimensionality reduction. 8 | --low-freq=20 # low cutoff frequency for mel bins... this is high-bandwidth data, so 9 | # there might be some information at the low end. 
# Kaldi training image for the commonvoice-th recipe.
# NOTE(review): `gpu-latest` is a rolling tag; once a known-good build has been
# identified, pin it by digest (kaldiasr/kaldi@sha256:...) for reproducibility.
FROM kaldiasr/kaldi:gpu-latest

# OS packages. `apt-get update` and `apt-get install` share one layer so the
# package index can never be served from a stale build cache, and the index is
# removed in the same layer to keep the image small.
#   - libsox-fmt-mp3: makes sox able to read mp3 (Common Voice ships mp3 clips)
#   - gawk: installed together with sox in the original recipe (presumably for
#     the SRILM build -- confirm before removing)
#   - python3.8 / python3.8-distutils: taken from the deadsnakes PPA
#   - curl: used below to download get-pip.py
RUN apt-get update \
    && DEBIAN_FRONTEND=noninteractive apt-get install -y \
        curl \
        gawk \
        libsox-fmt-mp3 \
        software-properties-common \
    && add-apt-repository -y ppa:deadsnakes/ppa \
    && apt-get update \
    && DEBIAN_FRONTEND=noninteractive apt-get install -y \
        python3.8 \
        python3.8-distutils \
    && rm -rf /var/lib/apt/lists/*

# install SRILM (srilm.tar.gz must be present in the build context)
COPY srilm.tar.gz /opt/kaldi/tools/
WORKDIR /opt/kaldi/tools
RUN ./install_srilm.sh

# install pip for python3.8
# The unversioned get-pip.py only supports currently maintained Python
# releases; the versioned script is required for Python 3.8. The installer is
# deleted in the same layer so it does not end up in the image.
RUN curl -fsSL https://bootstrap.pypa.io/pip/3.8/get-pip.py -o /tmp/get-pip.py \
    && python3.8 /tmp/get-pip.py --no-cache-dir \
    && rm -f /tmp/get-pip.py

# install python dependencies used by s5/local/prepare_cv.py
# NOTE(review): pandas is unpinned; pin it once a tested version is known.
RUN python3.8 -m pip install --no-cache-dir pandas pythainlp==2.3.1

# back to workdir
WORKDIR /opt/kaldi
#!/bin/bash

# Modified from Voxforge
# by Chompakorn Chaksangchaichot
#
# Builds an n-gram language model in ARPA format with SRILM's ngram-count.
#   input : data/train/text (Kaldi text file: utterance id, then the words)
#   output: data/local/lm.arpa and data/local/vocab-full.txt
# Options (parsed by utils/parse_options.sh):
#   --order N    n-gram order of the language model (default: 3)
# Exits non-zero if the corpus cannot be built or ngram-count fails, so the
# caller can detect the failure.

. ./path.sh || exit 1;

# make a failure of any stage in the corpus pipeline below visible
set -o pipefail

echo "=== Building a language model ..."

locdata=data/local
loctmp=$locdata/tmp

echo "--- Preparing a corpus from train transcripts ..."

# Language model order
order=3

. utils/parse_options.sh

# Only the training transcripts are used for the LM corpus.
# Drop the utterance id (first field), squeeze repeated spaces and keep each
# distinct sentence once.
mkdir -p "$loctmp" || exit 1;
cut -f2- -d' ' < data/train/text | sed -e 's:[ ]\+: :g' | sort -u > "$loctmp/corpus.txt" \
  || { echo "$0: failed to build $loctmp/corpus.txt from data/train/text" >&2; exit 1; }


# Locate ngram-count: first on PATH, otherwise in the SRILM directory of the
# Kaldi tools tree.
if ! command -v ngram-count >/dev/null 2>&1; then
  if uname -a | grep 64 >/dev/null; then # some kind of 64 bit...
    sdir=$KALDI_ROOT/tools/srilm/bin/i686-m64
  else
    sdir=$KALDI_ROOT/tools/srilm/bin/i686
  fi
  if [ -f "$sdir/ngram-count" ]; then
    echo "Using SRILM tools from $sdir"
    export PATH=$PATH:$sdir
  else
    echo "You appear to not have SRILM tools installed, either on your path,"
    echo "or installed in $sdir. See tools/install_srilm.sh for installation"
    echo "instructions."
    exit 1
  fi
fi

# Witten-Bell discounting (-wbdiscount), as in the original recipe.
ngram-count -order "$order" -write-vocab "$locdata/vocab-full.txt" -wbdiscount \
  -text "$loctmp/corpus.txt" -lm "$locdata/lm.arpa" \
  || { echo "$0: ngram-count failed" >&2; exit 1; }

echo "*** Finished building the LM model!"
"""FastAPI server exposing the Vosk transcriber over HTTP."""

import os
import subprocess
import uuid
from typing import Dict, List

import aiofiles
from fastapi import FastAPI, File, HTTPException, UploadFile

from vosk_transcriber import VoskTranscriber

app = FastAPI()
model_path: str = "model"  # change this if necessary
transcriber = VoskTranscriber(model_path)

# Scratch directory (relative to the working directory) for uploads and the
# converted wav files.
TMP_DIR: str = "tmp"


def clear_audio(audio_paths: List[str]) -> None:
    """
    Delete the given files.

    Paths that do not exist are skipped so the function can be used for
    cleanup after a partially failed request.
    """
    for path in audio_paths:
        try:
            os.remove(path)
        except FileNotFoundError:
            pass


@app.get("/healthcheck")
async def healthcheck():
    """Liveness probe."""
    return {"status": "healthy"}


@app.post("/transcribe")
async def transcribe(audios: List[UploadFile] = File(...)):
    """
    Predict audio POST from front-end server using `form-data` files

    Every upload is written to a uniquely named scratch file, converted to
    mono / 16 kHz wav with ffmpeg and transcribed. The client-supplied file
    name is only used to build the key of the result, never as a path on disk,
    so concurrent requests with the same file name cannot clobber each other
    and a crafted file name cannot escape the scratch directory.

    Returns
    -------
    A mapping `tmp/NAME -> transcription` (NAME is the upload's file name with
    `.mp3` replaced by `.wav`) followed by 200.
    NOTE(review): FastAPI serializes this tuple as a two-element JSON array
    rather than using 200 as the status code; the shape is kept as is so that
    existing clients keep working.

    Raises
    ------
    HTTPException (400) when ffmpeg cannot convert an uploaded file.
    """
    os.makedirs(TMP_DIR, exist_ok=True)

    created: List[str] = []         # every scratch file, removed in `finally`
    wav_by_key: Dict[str, str] = {}  # response key -> converted wav on disk
    try:
        for audio in audios:
            token = uuid.uuid4().hex
            tmp_name = os.path.join(TMP_DIR, f"{token}.tmp")
            save_name = os.path.join(TMP_DIR, f"{token}.wav")
            created.extend([tmp_name, save_name])

            # save tmp audio file
            async with aiofiles.open(tmp_name, "wb") as f:
                await f.write(await audio.read())

            # convert to mono, 16k sampling rate
            # -nostdin / -y: never wait for interactive input
            # NOTE(review): this call blocks the event loop while ffmpeg runs
            conversion = subprocess.run(
                [
                    "ffmpeg",
                    "-nostdin",
                    "-y",
                    "-i",
                    tmp_name,
                    "-ac", "1",
                    "-ar", "16000",
                    save_name
                ],
                stdout=subprocess.PIPE
            )
            if conversion.returncode != 0 or not os.path.exists(save_name):
                raise HTTPException(
                    status_code=400,
                    detail=f"Cannot convert audio file: `{audio.filename}`"
                )

            # same key as before: tmp/NAME with .mp3 replaced by .wav
            key = f"tmp/{audio.filename}".replace(".mp3", ".wav")
            wav_by_key[key] = save_name

        # inference
        result = {
            key: transcriber.transcribe(wav)
            for key, wav in wav_by_key.items()
        }
    finally:
        # remove uploads and converted files, also when the request failed
        clear_audio(created)

    return result, 200
# Inference image: builds Kaldi (alphacep fork) and vosk-api from source on a
# manylinux base, then installs ffmpeg and the FastAPI server dependencies.
# NOTE(review): the base image is untagged (resolves to :latest); pin a tag or
# digest so that the CPython path used below cannot change underneath the build.
FROM quay.io/pypa/manylinux2010_x86_64

LABEL description="A docker image for building portable Python linux binary wheels and Kaldi" \
      maintainer="contact@alphacephei.com"

RUN yum -y update && yum -y install \
    autoconf \
    automake \
    cmake \
    devtoolset-8-libatomic-devel \
    libtool \
    && yum clean all

# Build OpenBLAS, CLAPACK, OpenFst and Kaldi.
# This is deliberately ONE layer: the object files are deleted at the end, and
# that only shrinks the image if it happens in the layer that created them.
# NOTE(review): the openfst clone is not pinned to a branch or tag.
# NOTE(review): the first sed on kaldi.mk replaces a string with itself (no-op);
# it is kept unchanged from the original recipe.
RUN cd /opt \
    && git clone -b lookahead-1.8.0 --single-branch https://github.com/alphacep/kaldi \
    && cd /opt/kaldi/tools \
    && git clone -b v0.3.13 --single-branch https://github.com/xianyi/OpenBLAS \
    && git clone -b v3.2.1 --single-branch https://github.com/alphacep/clapack \
    && make -C OpenBLAS ONLY_CBLAS=1 DYNAMIC_ARCH=1 TARGET=NEHALEM USE_LOCKING=1 USE_THREAD=0 all \
    && make -C OpenBLAS PREFIX=$(pwd)/OpenBLAS/install install \
    && mkdir -p clapack/BUILD && cd clapack/BUILD && cmake .. && make -j "$(nproc)" \
    && find . -name "*.a" | xargs cp -t ../../OpenBLAS/install/lib \
    && cd /opt/kaldi/tools \
    && git clone --single-branch https://github.com/alphacep/openfst openfst \
    && cd openfst \
    && autoreconf -i \
    && CFLAGS="-g -O3" ./configure --prefix=/opt/kaldi/tools/openfst --enable-static --enable-shared --enable-far --enable-ngram-fsts --enable-lookahead-fsts --with-pic --disable-bin \
    && make -j "$(nproc)" && make install \
    && cd /opt/kaldi/src \
    && ./configure --mathlib=OPENBLAS_CLAPACK --shared --use-cuda=no \
    && sed -i 's:-msse -msse2:-msse -msse2:g' kaldi.mk \
    && sed -i 's: -O1 : -O3 :g' kaldi.mk \
    && make -j "$(nproc)" online2 lm rnnlm \
    && find /opt/kaldi -name "*.o" -delete

# Build and install the vosk Python bindings against the Kaldi tree above.
# NOTE(review): vosk-api is cloned at its default branch head (unpinned).
RUN cd /root \
    && git clone --single-branch --depth 1 https://github.com/alphacep/vosk-api.git \
    && cd /root/vosk-api/src \
    && KALDI_ROOT=/opt/kaldi make \
    && cd /root/vosk-api/python \
    && python3.8 setup.py install

# Directory holding the scripts of the CPython 3.8 build shipped by the base
# image (pip3.8, uvicorn, ...). Override with --build-arg when the base image
# ships a different patch release.
ARG CPYTHON_BIN=/opt/_internal/cpython-3.8.12/bin

# install pip3.8
# The versioned get-pip.py is required for Python 3.8; the installer is removed
# in the same layer. `\$PATH` is escaped so that login shells extend their own
# PATH instead of a value frozen at build time.
RUN yum install -y wget \
    && yum clean all \
    && wget -O /tmp/get-pip.py https://bootstrap.pypa.io/pip/3.8/get-pip.py \
    && python3.8 /tmp/get-pip.py --no-cache-dir \
    && rm -f /tmp/get-pip.py \
    && echo "export PATH=\$PATH:${CPYTHON_BIN}" >> ~/.bash_profile

# PATH for every later build step and for the running container
# (sourcing ~/.bash_profile inside a RUN does not persist across layers)
ENV PATH=$PATH:${CPYTHON_BIN}

# install ffmpeg
# NOTE(review): the repository key and release rpm are fetched over plain http;
# switch to https if the mirror supports it.
# The original package list also named `ffmpeg-dlevel`, which does not exist
# (yum skipped it silently); only the ffmpeg binary is needed at runtime.
RUN yum install -y epel-release \
    && rpm --import http://li.nux.ro/download/nux/RPM-GPG-KEY-nux.ro \
    && rpm -Uvh http://li.nux.ro/download/nux/dextop/el6/x86_64/nux-dextop-release-0-2.el6.nux.noarch.rpm \
    && yum install -y ffmpeg \
    && yum clean all

# Python dependencies of vosk-inference/server.py
# NOTE(review): versions are unpinned.
RUN python3.8 -m pip install --no-cache-dir \
    aiofiles \
    fastapi \
    python-multipart \
    uvicorn

WORKDIR /workspace
./path.sh 5 | 6 | # This script demonstrates how you can train rnnlms, and how you can use them to 7 | # rescore the n-best lists, or lattices. 8 | # Be careful: appending things like "--mem 16G" to $decode_cmd won't always 9 | # work, it depends what $decode_cmd is. 10 | 11 | # Trains Tomas Mikolov's version, which takes roughly 5 days with the following 12 | # parameter setting. We start from the dictionary directory without silence 13 | # probabilities (with suffix "_nosp"). 14 | rm data/local/rnnlm.h300.voc40k/.error 2>/dev/null 15 | local/wsj_train_rnnlms.sh --dict-suffix "_nosp" \ 16 | --cmd "$decode_cmd --mem 16G" \ 17 | --hidden 300 --nwords 40000 --class 400 \ 18 | --direct 2000 data/local/rnnlm.h300.voc40k \ 19 | || touch data/local/rnnlm.h300.voc40k/.error & 20 | 21 | # Trains Yandex's version, which takes roughly 10 hours with the following 22 | # parameter setting. We start from the dictionary directory without silence 23 | # probabilities (with suffix "_nosp"). 24 | num_threads_rnnlm=8 25 | rm data/local/rnnlm-hs.nce20.h400.voc40k/.error 2>/dev/null 26 | local/wsj_train_rnnlms.sh --dict-suffix "_nosp" \ 27 | --rnnlm_ver faster-rnnlm --threads $num_threads_rnnlm \ 28 | --cmd "$decode_cmd --mem 8G --num-threads $num_threads_rnnlm" \ 29 | --bptt 4 --bptt-block 10 --hidden 400 --nwords 40000 --direct 2000 \ 30 | --rnnlm-options "-direct-order 4 -nce 20" \ 31 | data/local/rnnlm-hs.nce20.h400.voc40k \ 32 | || touch data/local/rnnlm-hs.nce20.h400.voc40k/.error & 33 | 34 | wait; 35 | 36 | # Rescoring. We demonstrate results on the TDNN models. 
#!/usr/bin/env bash

# Copied from egs/mini_librispeech/s5/local/chain/compare_wer.sh (commit 87d95c5efff7da3b6f04e719a96de4204a367f8b)

# this script is used for comparing decoding results between systems.
# e.g. local/chain/compare_wer.sh exp/chain/tdnn_{c,d}_sp
# For use with discriminatively trained systems you specify the epochs after a colon:
# for instance,
# local/chain/compare_wer.sh exp/chain/tdnn_c_sp exp/chain/tdnn_c_sp_smbr:{1,2,3}
#
# Output: one table row per metric and one column per system. WER is read from
# the wer_* files of the valid_dev / valid_test decode directories through
# utils/best_wer.sh; the final train/valid probabilities are read from
# log/compute_prob_{train,valid}.final.log of each system.


if [ $# == 0 ]; then
  echo "Usage: $0: [--looped] [--online] [ ... ]"
  echo "e.g.: $0 exp/chain/tdnn_{b,c}_sp"
  echo "or (with epoch numbers for discriminative training):"
  echo "$0 exp/chain/tdnn_b_sp_disc:{1,2,3}"
  exit 1
fi

echo "# $0 $*"

# Optional flags; when both are given, --looped must come before --online.
include_looped=false
if [ "$1" == "--looped" ]; then
  include_looped=true
  shift
fi
include_online=false
if [ "$1" == "--online" ]; then
  include_online=true
  shift
fi


used_epochs=false

# this function set_names is used to separate the epoch-related parts of the name
# [for discriminative training] and the regular parts of the name.
# If called with a colon-free directory name, like:
#  set_names exp/chain/tdnn_lstm1e_sp_bi_smbr
# it will set dirname=exp/chain/tdnn_lstm1e_sp_bi_smbr and epoch_infix=""
# If called with something like:
#  set_names exp/chain/tdnn_d_sp_smbr:3
# it will set dirname=exp/chain/tdnn_d_sp_smbr and epoch_infix="_epoch3"
# It must be called in the current shell (not in a command substitution),
# because it also records in $used_epochs whether any epoch was given.
set_names() {
  if [ $# != 1 ]; then
    echo "compare_wer_general.sh: internal error"
    exit 1  # exit the program
  fi
  dirname=$(echo "$1" | cut -d: -f1)
  epoch=$(echo "$1" | cut -s -d: -f2)
  if [ -z "$epoch" ]; then
    epoch_infix=""
  else
    used_epochs=true
    epoch_infix=_epoch${epoch}
  fi
}

# print_wer_row LABEL DIR_SUFFIX DECODE_PREFIX DECODE_NAME SYSTEM...
# Prints LABEL followed by the best WER of every system, taken from
#   SYSTEM-DIR + DIR_SUFFIX / DECODE_PREFIX + DECODE_NAME / wer_*
print_wer_row() {
  local label=$1 dir_suffix=$2 decode_prefix=$3 decode_name=$4
  shift 4
  local x wer
  echo -n "$label"
  for x in "$@"; do
    set_names "$x"  # sets $dirname and $epoch_infix
    wer=$(cat "${dirname}${dir_suffix}/${decode_prefix}${decode_name}"/wer_* | utils/best_wer.sh | awk '{print $2}')
    printf "% 10s" "$wer"
  done
  echo
}

# print_prob_row LABEL LOG_NAME GREP_FLAG SYSTEM...
# Prints LABEL followed by field 8 of the "Overall" line of SYSTEM/log/LOG_NAME.
# GREP_FLAG is -v to take the line without "xent" or -w to take the xent line.
print_prob_row() {
  local label=$1 log_name=$2 grep_flag=$3
  shift 3
  local x prob
  echo -n "$label"
  for x in "$@"; do
    prob=$(grep Overall "$x/log/$log_name" | grep "$grep_flag" xent | awk '{printf("%.4f", $8)}')
    printf "% 10s" "$prob"
  done
  echo
}


echo -n "# System "
for x in "$@"; do printf "% 10s" " $(basename "$x")"; done
echo

strings=(
  "#WER valid_dev "
  "#WER valid_test ")
decode_names=(valid_dev valid_test)

for n in 0 1; do
  print_wer_row "${strings[$n]}" "" decode_ "${decode_names[$n]}" "$@"
  if $include_looped; then
    print_wer_row "# [looped:] " "" decode_looped_ "${decode_names[$n]}" "$@"
  fi
  if $include_online; then
    print_wer_row "# [online:] " _online decode_ "${decode_names[$n]}" "$@"
  fi
done


if $used_epochs; then
  exit 0;  # the diagnostics aren't comparable between regular and discriminatively trained systems.
fi


print_prob_row "# Final train prob " compute_prob_train.final.log -v "$@"
print_prob_row "# Final valid prob " compute_prob_valid.final.log -v "$@"
print_prob_row "# Final train prob (xent)" compute_prob_train.final.log -w "$@"
print_prob_row "# Final valid prob (xent)" compute_prob_valid.final.log -w "$@"
| 3 | # Copied from egs/mini_librispeech/s5/local/nnet3/compare_wer.sh (commit 87d95c5efff7da3b6f04e719a96de4204a367f8b) 4 | 5 | # this script is used for comparing decoding results between systems. 6 | # e.g. local/chain/compare_wer.sh exp/chain/tdnn_{c,d}_sp 7 | # For use with discriminatively trained systems you specify the epochs after a colon: 8 | # for instance, 9 | # local/chain/compare_wer.sh exp/chain/tdnn_c_sp exp/chain/tdnn_c_sp_smbr:{1,2,3} 10 | 11 | 12 | if [ $# == 0 ]; then 13 | echo "Usage: $0: [--looped] [--online] [ ... ]" 14 | echo "e.g.: $0 exp/chain/tdnn_{b,c}_sp" 15 | echo "or (with epoch numbers for discriminative training):" 16 | echo "$0 exp/chain/tdnn_b_sp_disc:{1,2,3}" 17 | exit 1 18 | fi 19 | 20 | echo "# $0 $*" 21 | 22 | include_looped=false 23 | if [ "$1" == "--looped" ]; then 24 | include_looped=true 25 | shift 26 | fi 27 | include_online=false 28 | if [ "$1" == "--online" ]; then 29 | include_online=true 30 | shift 31 | fi 32 | 33 | 34 | used_epochs=false 35 | 36 | # this function set_names is used to separate the epoch-related parts of the name 37 | # [for discriminative training] and the regular parts of the name. 
38 | # If called with a colon-free directory name, like: 39 | # set_names exp/chain/tdnn_lstm1e_sp_bi_smbr 40 | # it will set dir=exp/chain/tdnn_lstm1e_sp_bi_smbr and epoch_infix="" 41 | # If called with something like: 42 | # set_names exp/chain/tdnn_d_sp_smbr:3 43 | # it will set dir=exp/chain/tdnn_d_sp_smbr and epoch_infix="_epoch3" 44 | 45 | 46 | set_names() { 47 | if [ $# != 1 ]; then 48 | echo "compare_wer_general.sh: internal error" 49 | exit 1 # exit the program 50 | fi 51 | dirname=$(echo $1 | cut -d: -f1) 52 | epoch=$(echo $1 | cut -s -d: -f2) 53 | if [ -z $epoch ]; then 54 | epoch_infix="" 55 | else 56 | used_epochs=true 57 | epoch_infix=_epoch${epoch} 58 | fi 59 | } 60 | 61 | 62 | 63 | echo -n "# System " 64 | for x in $*; do printf "% 10s" " $(basename $x)"; done 65 | echo 66 | 67 | strings=( 68 | "#WER dev_clean_2 (tgsmall) " 69 | "#WER dev_clean_2 (tglarge) ") 70 | 71 | for n in 0 1; do 72 | echo -n "${strings[$n]}" 73 | for x in $*; do 74 | set_names $x # sets $dirname and $epoch_infix 75 | decode_names=(tgsmall_dev_clean_2 tglarge_dev_clean_2) 76 | 77 | wer=$(cat $dirname/decode_${decode_names[$n]}/wer_* | utils/best_wer.sh | awk '{print $2}') 78 | printf "% 10s" $wer 79 | done 80 | echo 81 | if $include_looped; then 82 | echo -n "# [looped:] " 83 | for x in $*; do 84 | set_names $x # sets $dirname and $epoch_infix 85 | wer=$(cat $dirname/decode_looped_${decode_names[$n]}/wer_* | utils/best_wer.sh | awk '{print $2}') 86 | printf "% 10s" $wer 87 | done 88 | echo 89 | fi 90 | if $include_online; then 91 | echo -n "# [online:] " 92 | for x in $*; do 93 | set_names $x # sets $dirname and $epoch_infix 94 | wer=$(cat ${dirname}_online/decode_${decode_names[$n]}/wer_* | utils/best_wer.sh | awk '{print $2}') 95 | printf "% 10s" $wer 96 | done 97 | echo 98 | fi 99 | done 100 | 101 | 102 | if $used_epochs; then 103 | exit 0; # the diagnostics aren't comparable between regular and discriminatively trained systems. 
#!/usr/bin/python3.8
"""Prepare Kaldi data directories and a grapheme lexicon from Common Voice labels."""

import os
import re
from argparse import ArgumentParser, Namespace

import pandas as pd

from pythainlp.tokenize import newmm


def run_parser() -> Namespace:
    """Run argument parser"""
    parser = ArgumentParser()
    parser.add_argument("--labels-path", type=str, required=True, help="Path to labels directory")
    parser.add_argument("--data-path", type=str, required=True, help="Path to data root")
    parser.add_argument("--cv-path", type=str, required=True, help="Path to commonvoice root")
    return parser.parse_args()


def format_df(df: pd.DataFrame, data_path: str, set_name: str, commonvoice_root: str, sr: int = 16000) -> None:
    """
    Format train/dev/test dataframe and stored in data root

    Writes `wav.scp`, `utt2spk`, `spk2utt` and `text` to `{data_path}/{set_name}`.

    df: pd.DataFrame
        Labels; the columns `path` (clip file name) and `sentence` are used
    data_path: str
        Root of the Kaldi data directory
    set_name: str
        Name of the sub-directory to write (train, dev, ...)
    commonvoice_root: str
        Common Voice root; clips are read from `{commonvoice_root}/th/clips`
    sr: int
        Sampling rate sox resamples to
    """
    set_path = "{data_path}/{set_name}".format(data_path=data_path, set_name=set_name)
    os.makedirs(set_path, exist_ok=True)

    records = []
    for path, sent in zip(df["path"], df["sentence"]):
        # tokenize sentence with newmm, then squeeze whitespace so that every
        # transcript is a single line of space separated tokens
        tokenized_sent = " ".join(newmm.segment(sent.replace(".", "")))
        tokenized_sent = re.sub(r"\s+", " ", tokenized_sent).strip()

        # utterance id = clip file name without its extension
        f_id = path.replace(".wav", "").replace(".mp3", "")
        records.append((f_id, path, tokenized_sent))

    # order by utterance id (the first field of every output line) rather than
    # by file name, which can order differently once the extension is removed
    records.sort(key=lambda record: record[0])

    # write files to data/[train,dev,test]
    # utf-8 is set explicitly so the output does not depend on the locale
    with open("{set_path}/wav.scp".format(set_path=set_path), "w", encoding="utf-8") as wav_scp, \
            open("{set_path}/utt2spk".format(set_path=set_path), "w", encoding="utf-8") as utt2spk, \
            open("{set_path}/spk2utt".format(set_path=set_path), "w", encoding="utf-8") as spk2utt, \
            open("{set_path}/text".format(set_path=set_path), "w", encoding="utf-8") as text:
        for f_id, path, tokenized_sent in records:
            wav_scp.write("{f_id} sox {commonvoice_root}/th/clips/{path} -t wav -r {sr} -c 1 -b 16 - |\n".format(f_id=f_id, commonvoice_root=commonvoice_root, path=path, sr=sr))
            utt2spk.write("{f_id} {f_id}\n".format(f_id=f_id))  # we wont specify spk id here
            spk2utt.write("{f_id} {f_id}\n".format(f_id=f_id))
            text.write("{f_id} {tokenized_sent}\n".format(f_id=f_id, tokenized_sent=tokenized_sent))


def _write_lines(file_path: str, lines: list) -> None:
    """Write `lines` to `file_path` as utf-8 and close the file."""
    with open(file_path, "w", encoding="utf-8") as f:
        f.writelines(lines)


def prepare_lexicon(data_path: str) -> None:
    """
    Prepare data/local/lang directory

    Builds a grapheme lexicon from `{data_path}/train/text`: every word is
    spelled out as the sequence of its characters and every character becomes
    a non-silence phone.
    """
    with open("{data_path}/train/text".format(data_path=data_path), "r", encoding="utf-8") as f:
        # the first field is the utterance id; split() never yields empty
        # tokens, so empty transcripts cannot produce an empty lexicon entry
        words = sorted(set(w for line in f for w in line.split()[1:]))

    # NOTE(review): the second entry has no word in front of `spn`; it looks
    # like the OOV token was lost -- confirm against the OOV word passed to
    # utils/prepare_lang.sh in run.sh before changing it.
    lexicon = ["!SIL sil\n", " spn\n"] + [" ".join([word] + list(word))+"\n" for word in words]
    nonsilence_phones = [g+"\n" for g in sorted(set(char for word in words for char in word))]
    optional_silence = ["sil\n"]
    silence_phones = ["sil\n", "spn\n"]

    lang_dir = "{data_path}/local/lang".format(data_path=data_path)
    os.makedirs(lang_dir, exist_ok=True)

    _write_lines(lang_dir + "/lexicon.txt", lexicon)
    _write_lines(lang_dir + "/nonsilence_phones.txt", nonsilence_phones)
    _write_lines(lang_dir + "/optional_silence.txt", optional_silence)
    _write_lines(lang_dir + "/silence_phones.txt", silence_phones)
    _write_lines(lang_dir + "/extra_questions.txt", [])


def main(args: Namespace) -> None:
    """Format every label set found in the labels directory, then build the lexicon."""
    # label file stem -> name of the data directory to create
    set_names = {
        "train": "train",
        "dev": "dev",
        "dev-unique": "dev_unique",
        "test": "test",
        "test-unique": "test_unique",
    }

    # read every label file first so a missing file fails before anything is written
    frames = {
        set_name: pd.read_csv("{}/{}.tsv".format(args.labels_path, stem), delimiter="\t")
        for stem, set_name in set_names.items()
    }
    for set_name, frame in frames.items():
        format_df(frame, args.data_path, set_name, args.cv_path)

    prepare_lexicon(args.data_path)


if __name__ == "__main__":
    args = run_parser()
    main(args)
./cmd.sh || exit 1; 10 | 11 | # default path 12 | cv_path="/mnt/cv-corpus-7.0-2021-07-21" 13 | labels_path="/mnt/labels" 14 | data_path="data" 15 | mfccdir=mfcc 16 | 17 | njobs=$(nproc) # num jobs, default as num CPU 18 | lm_order=3 # lm order 19 | 20 | stage=0 21 | 22 | . ./utils/parse_options.sh || exit 1; 23 | 24 | 25 | if [ $stage -le 0 ]; then 26 | # prepare dataset 27 | echo "local/prepare_cv.py --labels-path $labels_path --data-path $data_path --cv-path $cv_path" 28 | local/prepare_cv.py --labels-path $labels_path --data-path $data_path --cv-path $cv_path || { echo "Fail running local/prepare_cv.py"; exit 1; } 29 | fi 30 | 31 | if [ $stage -le 1 ]; then 32 | # validate prepared data 33 | for part in train dev dev_unique test test_unique; do 34 | utils/validate_data_dir.sh --no-feats data/$part || { echo "Fail validating $part"; exit 1; } 35 | done 36 | 37 | utils/prepare_lang.sh data/local/lang '' data/local data/lang 38 | 39 | # prepare LM and format to G.fst 40 | local/prepare_lm.sh --order $lm_order || { echo "Fail preparing LM"; exit 1; } 41 | local/format_data.sh || { echo "Fail creating G.fst"; exit 1; } 42 | fi 43 | 44 | if [ $stage -le 2 ]; then 45 | # create MFCC feats 46 | for part in train dev dev_unique test test_unique; do 47 | steps/make_mfcc_pitch.sh --cmd "$train_cmd" --nj $njobs data/$part exp/make_mfcc/$part $mfccdir || { echo "Error make MFCC features"; exit 1; } 48 | steps/compute_cmvn_stats.sh data/$part exp/make_mfcc/$part $mfccdir || { echo "Error computing CMVN"; exit 1; } 49 | done 50 | 51 | # get shortest K utterances first, likely to have more accurate alignment 52 | # follows main recipe but K need to be modified (K=10000 default) 53 | # i'll use 2000 for this case as TH commonvoice is a lot smaller 54 | # utils/subset_data_dir.sh --shortest data/train 2000 data/train_2kshort || exit 1; 55 | # utils/subset_data_dir.sh data/train 20000 data/train_20k || exit 1; 56 | fi 57 | 58 | # train monophone 59 | if [ $stage -le 3 ]; then 60 | 
steps/train_mono.sh --boost-silence 1.25 --nj $njobs --cmd "$train_cmd" \ 61 | data/train data/lang exp/mono || { echo "Error training mono"; exit 1; }; 62 | ( 63 | utils/mkgraph.sh data/lang exp/mono exp/mono/graph || { echo "Error making graph for mono"; exit 1; } 64 | for testset in dev dev_unique; do 65 | steps/decode.sh --nj $njobs --cmd "$decode_cmd" exp/mono/graph \ 66 | data/$testset exp/mono/decode_$testset || { echo "Error decoding mono"; exit 1; } 67 | done 68 | )& 69 | steps/align_si.sh --boost-silence 1.25 --nj $njobs --cmd "$train_cmd" \ 70 | data/train data/lang exp/mono exp/mono_ali_train || { echo "Error aligning mono"; exit 1; } 71 | fi 72 | 73 | # train delta + delta-delta triphone 74 | if [ $stage -le 4 ]; then 75 | steps/train_deltas.sh --boost-silence 1.25 --cmd "$train_cmd" \ 76 | 2000 10000 data/train data/lang exp/mono_ali_train exp/tri1 || { echo "Error training delta tri1"; exit 1; } 77 | 78 | # decode tri1 79 | ( 80 | utils/mkgraph.sh data/lang exp/tri1 exp/tri1/graph || { echo "Error making graph for tri1"; exit 1; } 81 | for testset in dev dev_unique; do 82 | steps/decode.sh --nj $njobs --cmd "$decode_cmd" exp/tri1/graph \ 83 | data/$testset exp/tri1/decode_$testset || { echo "Error decoding tri1"; exit 1; } 84 | done 85 | )& 86 | 87 | steps/align_si.sh --nj $njobs --cmd "$train_cmd" \ 88 | data/train data/lang exp/tri1 exp/tri1_ali_train || { echo "Error aligning tri1"; exit 1; } 89 | fi 90 | 91 | # LDA+MLLT 92 | if [ $stage -le 5 ]; then 93 | steps/train_lda_mllt.sh --cmd "$train_cmd" \ 94 | --splice-opts "--left-context=3 --right-context=3" 2500 15000 \ 95 | data/train data/lang exp/tri1_ali_train exp/tri2b || { echo "Error training tri2b (LDA+MLLT)"; exit 1; } 96 | 97 | # decode LDA+MLTT 98 | utils/mkgraph.sh data/lang exp/tri2b exp/tri2b/graph || { echo "Error making graph for tri2b"; exit 1; } 99 | ( 100 | for testset in dev dev_unique; do 101 | steps/decode.sh --nj $njobs --cmd "$decode_cmd" exp/tri2b/graph \ 102 | data/$testset 
exp/tri2b/decode_$testset || { echo "Error decoding tri2b"; exit 1; } 103 | done 104 | )& 105 | 106 | # Align using tri2b 107 | steps/align_si.sh --nj $njobs --cmd "$train_cmd" --use-graphs true \ 108 | data/train data/lang exp/tri2b exp/tri2b_ali_train || { echo "Error aligning tri2b"; exit 1; } 109 | fi 110 | 111 | # tri3b, LDA+MLLT+SAT 112 | if [ $stage -le 6 ]; then 113 | steps/train_sat.sh --cmd "$train_cmd" 2500 15000 \ 114 | data/train data/lang exp/tri2b_ali_train exp/tri3b || { echo "Error training tri3b (LDA+MLLT+SAT)"; exit 1; } 115 | 116 | # decode using the tri3b model 117 | ( 118 | utils/mkgraph.sh data/lang exp/tri3b exp/tri3b/graph || { echo "Error making graph for tri3b"; exit 1; } 119 | for testset in dev dev_unique; do 120 | steps/decode_fmllr.sh --nj $njobs --cmd "$decode_cmd" \ 121 | exp/tri3b/graph data/$testset exp/tri3b/decode_$testset || { echo "Error decoding tri3b"; exit 1; } 122 | done 123 | )& 124 | fi 125 | 126 | if [ $stage -le 7 ]; then 127 | # Align utts in the full training set using the tri3b model 128 | steps/align_fmllr.sh --nj $njobs --cmd "$train_cmd" \ 129 | data/train data/lang \ 130 | exp/tri3b exp/tri3b_ali_train || { echo "Error aligning FMLLR for tri4b"; exit 1; } 131 | 132 | # train another LDA+MLLT+SAT system on the entire training set 133 | steps/train_sat.sh --cmd "$train_cmd" 4200 40000 \ 134 | data/train data/lang \ 135 | exp/tri3b_ali_train exp/tri4b || { echo "Error training tri4b"; exit 1; } 136 | 137 | # decode using the tri4b model 138 | ( 139 | utils/mkgraph.sh data/lang exp/tri4b exp/tri4b/graph || { echo "Error making graph for tri4b"; exit 1; } 140 | for testset in dev dev_unique; do 141 | steps/decode_fmllr.sh --nj $njobs --cmd "$decode_cmd" \ 142 | exp/tri4b/graph data/$testset \ 143 | exp/tri4b/decode_$testset || { echo "Error decoding tri4b"; exit 1; } 144 | done 145 | )& 146 | fi 147 | 148 | # train a chain model 149 | if [ $stage -le 8 ]; then 150 | local/chain/run_tdnn.sh --stage 0 151 | fi 152 | 
153 | # wait for jobs to finish 154 | wait 155 | -------------------------------------------------------------------------------- /s5/local/nnet3/run_ivector_common.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Adapted from egs/mini_librispeech/s5/local/nnet3/run_ivector_common.sh (commit 92c99ee51caeba4be7c5ab39ea7c1d6100f3d67b) 4 | 5 | set -euo pipefail 6 | 7 | # This script is called from local/nnet3/run_tdnn.sh and 8 | # local/chain/run_tdnn.sh (and may eventually be called by more 9 | # scripts). It contains the common feature preparation and 10 | # iVector-related parts of the script. See those scripts for examples 11 | # of usage. 12 | 13 | stage=0 14 | train_set=train 15 | test_sets="dev dev_unique test test_unique" 16 | gmm=tri3b 17 | nj=$(nproc) 18 | 19 | nnet3_affix= 20 | 21 | . ./cmd.sh 22 | . ./path.sh 23 | . utils/parse_options.sh 24 | 25 | gmm_dir=exp/${gmm} 26 | ali_dir=exp/${gmm}_ali_${train_set}_sp 27 | 28 | for f in data/${train_set}/feats.scp ${gmm_dir}/final.mdl; do 29 | if [ ! 
-f $f ]; then 30 | echo "$0: expected file $f to exist" 31 | exit 1 32 | fi 33 | done 34 | 35 | if [ $stage -le 1 ]; then 36 | # Although the nnet will be trained by high resolution data, we still have to 37 | # perturb the normal data to get the alignment _sp stands for speed-perturbed 38 | echo "$0: preparing directory for low-resolution speed-perturbed data (for alignment)" 39 | utils/data/perturb_data_dir_speed_3way.sh data/${train_set} data/${train_set}_sp 40 | echo "$0: making MFCC features for low-resolution speed-perturbed data" 41 | steps/make_mfcc_pitch.sh --cmd "$train_cmd" --nj $nj data/${train_set}_sp || exit 1; 42 | steps/compute_cmvn_stats.sh data/${train_set}_sp || exit 1; 43 | utils/fix_data_dir.sh data/${train_set}_sp 44 | fi 45 | 46 | if [ $stage -le 2 ]; then 47 | echo "$0: aligning with the perturbed low-resolution data" 48 | steps/align_fmllr.sh --nj $nj --cmd "$train_cmd" \ 49 | data/${train_set}_sp data/lang $gmm_dir $ali_dir || exit 1 50 | fi 51 | 52 | if [ $stage -le 3 ]; then 53 | # Create high-resolution MFCC features (with 40 cepstra instead of 13). 54 | # this shows how you can split across multiple file-systems. 55 | echo "$0: creating high-resolution MFCC features" 56 | mfccdir=data/${train_set}_sp_hires/data 57 | if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $mfccdir/storage ]; then 58 | utils/create_split_dir.pl /export/b1{5,6,7,8}/$USER/kaldi-data/mfcc/commonvoice-$(date +'%m_%d_%H_%M')/s5/$mfccdir/storage $mfccdir/storage 59 | fi 60 | 61 | for datadir in ${train_set}_sp ${test_sets}; do 62 | utils/copy_data_dir.sh data/$datadir data/${datadir}_hires 63 | done 64 | 65 | # do volume-perturbation on the training data prior to extracting hires 66 | # features; this helps make trained nnets more invariant to test data volume. 
67 | utils/data/perturb_data_dir_volume.sh data/${train_set}_sp_hires || exit 1; 68 | 69 | for datadir in ${train_set}_sp ${test_sets}; do 70 | steps/make_mfcc.sh --nj $nj --mfcc-config conf/mfcc_hires.conf \ 71 | --cmd "$train_cmd" data/${datadir}_hires || exit 1; 72 | steps/compute_cmvn_stats.sh data/${datadir}_hires || exit 1; 73 | utils/fix_data_dir.sh data/${datadir}_hires || exit 1; 74 | done 75 | fi 76 | 77 | if [ $stage -le 4 ]; then 78 | echo "$0: computing a subset of data to train the diagonal UBM." 79 | # We'll use about a quarter of the data. 80 | mkdir -p exp/nnet3${nnet3_affix}/diag_ubm 81 | temp_data_root=exp/nnet3${nnet3_affix}/diag_ubm 82 | 83 | num_utts_total=$(wc -l kaldi 18 | ``` 19 | 20 | ### Run docker and attach command line 21 | Once the image had been built, all you have to do is interactively attach to its bash terminal via the following command: 22 | ```bash 23 | $ docker run -it -v :/opt/kaldi/egs/commonvoice-th \ 24 | -v /labels:/mnt/labels \ 25 | -v :/mnt \ 26 | --gpus all --name bash 27 | ``` 28 | Once you finish this step, you should be in a docker container's bash terminal now 29 | 30 | ## Building Docker for inferencing via Vosk 31 | We also provide an example of how to inference a trained kaldi model using Vosk. Berore we begin, let's build Vosk docker image: 32 | ```bash 33 | $ cd docker 34 | $ docker build -t vosk-inference 35 | $ cd .. # back to root directory 36 | ``` 37 | 38 | ### Preparing Directories for Vosk Inferencing 39 | The first step is to download provided Vosk model format on this github's release. Unzip it to `vosk-inference` directory. Or you can just follow this code. 40 | ``` 41 | $ cd vosk-inference 42 | $ wget https://github.com/vistec-AI/commonvoice-th/releases/download/vosk-v1/model.zip 43 | $ unzip model.zip 44 | ``` 45 | 46 | ### Run docker and test inference script 47 | To prevent dependencies problem, the Vosk inference python script must be run inside a docker image that we just built. 
First, let's start a docker container: 48 | ```bash 49 | $ docker run -it -v PATH_TO_REPO:/workspace \ 50 | --name CONTAINER_NAME \ 51 | -p 8000:8000 \ 52 | IMAGE_TAG bash 53 | ``` 54 | Then, you will be attached to a Linux terminal inside the container. To run inference on an audio file, run: 55 | ```bash 56 | $ cd vosk-inference 57 | $ python3.8 inference.py --wav-path PATH_TO_WAV # test it with test.wav 58 | ``` 59 | **Note that the audio file must have a 16 kHz sampling rate and a single (mono) channel!** 60 | 61 | ### Instantiating Vosk Server to Process Audio Files 62 | We also provide a `fastapi` server that allows users to transcribe their own audio files via a RESTful API. To start the server, run this command **inside a docker shell** 63 | ```bash 64 | $ cd vosk-inference 65 | $ uvicorn server:app --host 0.0.0.0 --reload 66 | ``` 67 | Now, the server will be running at `http://localhost:8000`. To check that the server started correctly, browse to `http://localhost:8000/healthcheck`. If the page loads, we are good to go! 68 | 69 | #### API Endpoint 70 | The endpoint accepts form-data, where each file is attached to a form field named `audios`. See the Python example below: 71 | ```python 72 | import requests 73 | 74 | url = "http://localhost:8000/transcribe" 75 | 76 | payload={} 77 | files=[ 78 | ('audios', ('FILE_NAME.wav', open('PATH_TO_FILE.wav', 'rb'), 'audio/wav')), 79 | ... 80 | ] 81 | headers = {} 82 | 83 | response = requests.request("POST", url, headers=headers, data=payload, files=files) 84 | 85 | print(response.text) 86 | ``` 87 | 88 | ## Online Decoding with WebRTC Protocol 89 | Read more at [this repository](https://github.com/danijel3/KaldiWebrtcServer). The provided repository contains an easy way to deploy a Kaldi `tdnn-chain` model to a WebRTC server. 
90 | 91 | 92 | ## Usage 93 | To run the training pipeline, go to recipe directory and run `run.sh` script 94 | ```bash 95 | $ cd /opt/kaldi/egs/commonvoice-th/s5 96 | $ ./run.sh --stage 0 97 | ``` 98 | 99 | 100 | ## Experiment Results 101 | Here are some experiment results evaluated on dev set: 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | 164 | 165 | 166 | 167 | 168 |
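| Model | dev WER | dev CER | dev-unique WER | dev-unique CER |
|:------|--------:|--------:|---------------:|---------------:|
| mono | 79.13% | 57.31% | 77.79% | 48.97% |
| tri1 | 56.55% | 37.88% | 53.26% | 27.99% |
| tri2b | 50.64% | 32.85% | 47.38% | 21.89% |
| tri3b | 50.52% | 32.70% | 47.06% | 21.67% |
| tri4b | 46.81% | 29.47% | 43.18% | 18.05% |
| tdnn-chain | 29.15% | 14.96% | 30.84% | 8.75% |
| tdnn-chain-online | 29.02% | 14.64% | 30.41% | 8.28% |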
169 | 170 | Here is final `test` set result evaluated on `tdnn-chain` 171 | 172 | 173 | 174 | 175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | 184 | 185 | 186 | 187 | 188 | 189 | 190 | 191 | 192 | 193 | 194 | 195 | 196 | 197 | 198 | 199 | 200 | 201 | 202 | 203 | 204 | 205 | 206 | 207 | 208 | 209 | 210 | 211 | 212 | 213 | 214 | 215 | 216 | 217 | 218 | 219 | 220 | 221 | 222 | 223 | 224 | 225 | 226 | 227 | 228 | 229 | 230 | 231 | 232 | 233 |
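| Model | test WER | test CER | test-unique WER | test-unique CER |
|:------|---------:|---------:|----------------:|----------------:|
| tdnn-chain-online | 9.71% | 3.12% | 23.04% | 7.57% |
| airesearch/wav2vec2-xlsr-53-th | - | - | 13.63% | 2.81% |
| Google Web Speech API | - | - | 13.71% | 7.36% |
| Microsoft Bing Search API | - | - | 12.58% | 5.01% |
| Amazon Transcribe | - | - | 21.86% | 7.08% |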
234 | 235 | ## Author 236 | Chompakorn Chaksangchaichot 237 | -------------------------------------------------------------------------------- /s5/local/chain/tuning/run_tdnn_1a.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Adapted from egs/mini_librispeech/s5/local/chain/tuning/run_tdnn_1e.sh 4 | 5 | # local/chain/compare_wer.sh exp/chain/tdnn1a_sp exp/chain/tdnn1a_sp_online 6 | # System tdnn1a_sp tdnn1a_sp_online 7 | #WER valid_dev 4.82 4.88 8 | #WER valid_test 4.44 4.27 9 | # Final train prob -0.0579 10 | # Final valid prob -0.0718 11 | # Final train prob (xent) -1.1069 12 | # Final valid prob (xent) -1.1325 13 | 14 | set -euo pipefail 15 | 16 | # First the options that are passed through to run_ivector_common.sh 17 | # (some of which are also used in this script directly). 18 | stage=0 19 | decode_nj=$(nproc) 20 | train_set=train 21 | test_sets="dev dev_unique test test_unique" 22 | gmm=tri4b 23 | nnet3_affix= 24 | 25 | use_gpu="wait" 26 | njob_init=1 27 | njob_final=1 28 | 29 | # The rest are configs specific to this script. Most of the parameters 30 | # are just hardcoded at this level, in the commands below. 31 | affix=1a # affix for the TDNN directory name 32 | tree_affix= 33 | train_stage=-10 34 | get_egs_stage=-10 35 | decode_iter= 36 | 37 | # training options 38 | # training chunk-options 39 | chunk_width=140,100,160 40 | # we don't need extra left/right context for TDNN systems. 41 | chunk_left_context=0 42 | chunk_right_context=0 43 | common_egs_dir= 44 | xent_regularize=0.1 45 | 46 | # training options 47 | srand=123 48 | remove_egs=false 49 | reporting_email= 50 | 51 | #decode options 52 | test_online_decoding=true # if true, it will run the last decoding stage. 53 | 54 | 55 | # End configuration section. 56 | echo "$0 $@" # Print the command line for logging 57 | 58 | . ./cmd.sh 59 | . ./path.sh 60 | . ./utils/parse_options.sh 61 | 62 | if ! 
cuda-compiled; then 63 | cat <$lang/topo 115 | fi 116 | fi 117 | 118 | if [ $stage -le 11 ]; then 119 | # Get the alignments as lattices (gives the chain training more freedom). 120 | # use the same num-jobs as the alignments 121 | steps/align_fmllr_lats.sh --nj 75 --cmd "$train_cmd" ${lores_train_data_dir} \ 122 | data/lang $gmm_dir $lat_dir 123 | rm $lat_dir/fsts.*.gz # save space 124 | fi 125 | 126 | if [ $stage -le 12 ]; then 127 | # Build a tree using our new topology. We know we have alignments for the 128 | # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use 129 | # those. The num-leaves is always somewhat less than the num-leaves from 130 | # the GMM baseline. 131 | if [ -f $tree_dir/final.mdl ]; then 132 | echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 133 | exit 1; 134 | fi 135 | steps/nnet3/chain/build_tree.sh \ 136 | --frame-subsampling-factor 3 \ 137 | --context-opts "--context-width=2 --central-position=1" \ 138 | --cmd "$train_cmd" 4200 ${lores_train_data_dir} \ 139 | $lang $ali_dir $tree_dir 140 | fi 141 | 142 | 143 | if [ $stage -le 13 ]; then 144 | mkdir -p $dir 145 | echo "$0: creating neural net configs using the xconfig parser"; 146 | 147 | num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') 148 | learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) 149 | 150 | mkdir -p $dir/configs 151 | cat < $dir/configs/network.xconfig 152 | input dim=100 name=ivector 153 | input dim=40 name=input 154 | 155 | # please note that it is important to have input layer with the name=input 156 | # as the layer immediately preceding the fixed-affine-layer to enable 157 | # the use of short notation for the descriptor 158 | fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat 159 | 160 | # the first splicing is moved before the lda layer, so no splicing here 161 | relu-batchnorm-layer name=tdnn1 dim=768 162 | 
relu-batchnorm-layer name=tdnn2 dim=768 input=Append(-1,0,1) 163 | relu-batchnorm-layer name=tdnn3 dim=768 164 | relu-batchnorm-layer name=tdnn4 dim=768 input=Append(-1,0,1) 165 | relu-batchnorm-layer name=tdnn5 dim=768 166 | relu-batchnorm-layer name=tdnn6 dim=768 input=Append(-3,0,3) 167 | relu-batchnorm-layer name=tdnn7 dim=768 input=Append(-3,0,3) 168 | relu-batchnorm-layer name=tdnn8 dim=768 input=Append(-6,-3,0) 169 | 170 | ## adding the layers for chain branch 171 | relu-batchnorm-layer name=prefinal-chain dim=768 target-rms=0.5 172 | output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 173 | 174 | # adding the layers for xent branch 175 | # This block prints the configs for a separate output that will be 176 | # trained with a cross-entropy objective in the 'chain' models... this 177 | # has the effect of regularizing the hidden parts of the model. we use 178 | # 0.5 / args.xent_regularize as the learning rate factor- the factor of 179 | # 0.5 / args.xent_regularize is suitable as it means the xent 180 | # final-layer learns at a rate independent of the regularization 181 | # constant; and the 0.5 was tuned so as to make the relative progress 182 | # similar in the xent and regular final layers. 183 | relu-batchnorm-layer name=prefinal-xent input=tdnn8 dim=768 target-rms=0.5 184 | output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 185 | EOF 186 | steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ 187 | fi 188 | 189 | 190 | if [ $stage -le 14 ]; then 191 | if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then 192 | utils/create_split_dir.pl \ 193 | /export/b0{3,4,5,6}/$USER/kaldi-data/egs/commonvoice-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage 194 | fi 195 | 196 | steps/nnet3/chain/train.py --stage=$train_stage \ 197 | --cmd="$decode_cmd" \ 198 | --feat.online-ivector-dir=$train_ivector_dir \ 199 | --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ 200 | --chain.xent-regularize $xent_regularize \ 201 | --chain.leaky-hmm-coefficient=0.1 \ 202 | --chain.l2-regularize=0.00005 \ 203 | --chain.apply-deriv-weights=false \ 204 | --chain.lm-opts="--num-extra-lm-states=2000" \ 205 | --trainer.srand=$srand \ 206 | --trainer.max-param-change=2.0 \ 207 | --trainer.num-epochs=4 \ 208 | --trainer.frames-per-iter=1500000 \ 209 | --trainer.optimization.num-jobs-initial=$njob_init \ 210 | --trainer.optimization.num-jobs-final=$njob_final \ 211 | --trainer.optimization.initial-effective-lrate=0.001 \ 212 | --trainer.optimization.final-effective-lrate=0.0001 \ 213 | --trainer.optimization.shrink-value=1.0 \ 214 | --trainer.num-chunk-per-minibatch=128,64 \ 215 | --trainer.optimization.momentum=0.0 \ 216 | --egs.chunk-width=$chunk_width \ 217 | --egs.chunk-left-context=$chunk_left_context \ 218 | --egs.chunk-right-context=$chunk_right_context \ 219 | --egs.chunk-left-context-initial=0 \ 220 | --egs.chunk-right-context-final=0 \ 221 | --egs.dir="$common_egs_dir" \ 222 | --egs.opts="--frames-overlap-per-eg 0" \ 223 | --cleanup.remove-egs=$remove_egs \ 224 | --use-gpu=$use_gpu \ 225 | --reporting.email="$reporting_email" \ 226 | --feat-dir=$train_data_dir \ 227 | --tree-dir=$tree_dir \ 228 | --lat-dir=$lat_dir \ 229 | --dir=$dir || exit 1; 230 | fi 231 | 232 | if [ $stage -le 15 ]; then 233 | # Note: it's not important to give mkgraph.sh the lang directory with the 234 | # matched topology (since it gets the topology file from the model). 
235 | utils/mkgraph.sh \ 236 | --self-loop-scale 1.0 data/lang \ 237 | $tree_dir $tree_dir/graph || exit 1; 238 | fi 239 | 240 | if [ $stage -le 16 ]; then 241 | frames_per_chunk=$(echo $chunk_width | cut -d, -f1) 242 | rm $dir/.error 2>/dev/null || true 243 | 244 | for data in $test_sets; do 245 | ( 246 | nspk=$(wc -l /dev/null || true 275 | 276 | for data in $test_sets; do 277 | ( 278 | nspk=$(wc -l