├── docs ├── .nojekyll ├── build │ ├── html │ │ ├── .nojekyll │ │ ├── objects.inv │ │ ├── _static │ │ │ ├── file.png │ │ │ ├── minus.png │ │ │ ├── plus.png │ │ │ ├── css │ │ │ │ ├── fonts │ │ │ │ │ ├── lato-bold.woff │ │ │ │ │ ├── lato-bold.woff2 │ │ │ │ │ ├── lato-normal.woff │ │ │ │ │ ├── lato-normal.woff2 │ │ │ │ │ ├── Roboto-Slab-Bold.woff │ │ │ │ │ ├── Roboto-Slab-Bold.woff2 │ │ │ │ │ ├── lato-bold-italic.woff │ │ │ │ │ ├── lato-bold-italic.woff2 │ │ │ │ │ ├── Roboto-Slab-Regular.woff │ │ │ │ │ ├── Roboto-Slab-Regular.woff2 │ │ │ │ │ ├── fontawesome-webfont.eot │ │ │ │ │ ├── fontawesome-webfont.ttf │ │ │ │ │ ├── fontawesome-webfont.woff │ │ │ │ │ ├── fontawesome-webfont.woff2 │ │ │ │ │ ├── lato-normal-italic.woff │ │ │ │ │ └── lato-normal-italic.woff2 │ │ │ │ └── badge_only.css │ │ │ ├── documentation_options.js │ │ │ └── js │ │ │ │ ├── badge_only.js │ │ │ │ └── html5shiv.min.js │ │ ├── _sources │ │ │ ├── Evaluator.rst.txt │ │ │ ├── Checkpoint.rst.txt │ │ │ ├── Trainer.rst.txt │ │ │ ├── Deep Speech 2.rst.txt │ │ │ ├── Decode.rst.txt │ │ │ ├── Jasper.rst.txt │ │ │ ├── Etc.rst.txt │ │ │ ├── Interface.rst.txt │ │ │ ├── Vocabs.rst.txt │ │ │ ├── Modules.rst.txt │ │ │ ├── RNN Transducer.rst.txt │ │ │ ├── Listen Attend Spell.rst.txt │ │ │ ├── Criterion.rst.txt │ │ │ ├── Optim.rst.txt │ │ │ ├── Learning Rate Schedulers.rst.txt │ │ │ ├── Conformer.rst.txt │ │ │ ├── Data.rst.txt │ │ │ ├── index.rst.txt │ │ │ └── Speech Transformer.rst.txt │ │ └── .buildinfo │ ├── .DS_Store │ └── doctrees │ │ ├── Etc.doctree │ │ ├── Data.doctree │ │ ├── Decode.doctree │ │ ├── Jasper.doctree │ │ ├── Optim.doctree │ │ ├── Vocabs.doctree │ │ ├── index.doctree │ │ ├── Modules.doctree │ │ ├── Trainer.doctree │ │ ├── Checkpoint.doctree │ │ ├── Conformer.doctree │ │ ├── Criterion.doctree │ │ ├── Evaluator.doctree │ │ ├── Interface.doctree │ │ ├── environment.pickle │ │ ├── notes │ │ ├── intro.doctree │ │ └── Preparation.doctree │ │ ├── Deep Speech 2.doctree │ │ ├── RNN Transducer.doctree │ │ ├── Speech Transformer.doctree │ │ ├── Listen Attend Spell.doctree │ │ └── Learning Rate Schedulers.doctree ├── .DS_Store ├── objects.inv ├── _static │ ├── file.png │ ├── plus.png │ ├── minus.png │ ├── css │ │ ├── fonts │ │ │ ├── lato-bold.woff │ │ │ ├── lato-bold.woff2 │ │ │ ├── lato-normal.woff │ │ │ ├── lato-normal.woff2 │ │ │ ├── Roboto-Slab-Bold.woff │ │ │ ├── lato-bold-italic.woff │ │ │ ├── Roboto-Slab-Bold.woff2 │ │ │ ├── fontawesome-webfont.eot │ │ │ ├── fontawesome-webfont.ttf │ │ │ ├── lato-bold-italic.woff2 │ │ │ ├── lato-normal-italic.woff │ │ │ ├── Roboto-Slab-Regular.woff │ │ │ ├── Roboto-Slab-Regular.woff2 │ │ │ ├── fontawesome-webfont.woff │ │ │ ├── fontawesome-webfont.woff2 │ │ │ └── lato-normal-italic.woff2 │ │ └── badge_only.css │ ├── documentation_options.js │ └── js │ │ ├── badge_only.js │ │ └── html5shiv.min.js ├── source │ ├── Evaluator.rst │ ├── Checkpoint.rst │ ├── Trainer.rst │ ├── Deep Speech 2.rst │ ├── Decode.rst │ ├── Jasper.rst │ ├── Etc.rst │ ├── Interface.rst │ ├── Vocabs.rst │ ├── Modules.rst │ ├── RNN Transducer.rst │ ├── Listen Attend Spell.rst │ ├── Criterion.rst │ ├── Optim.rst │ ├── Learning Rate Schedulers.rst │ ├── Conformer.rst │ ├── Data.rst │ ├── index.rst │ └── Speech Transformer.rst ├── _sources │ ├── Evaluator.rst.txt │ ├── Checkpoint.rst.txt │ ├── Trainer.rst.txt │ ├── Deep Speech 2.rst.txt │ ├── Decode.rst.txt │ ├── Jasper.rst.txt │ ├── Etc.rst.txt │ ├── Interface.rst.txt │ ├── Vocabs.rst.txt │ ├── Modules.rst.txt │ ├── RNN Transducer.rst.txt │ ├── Listen Attend Spell.rst.txt │ ├── 
Criterion.rst.txt │ ├── Optim.rst.txt │ ├── Learning Rate Schedulers.rst.txt │ ├── Conformer.rst.txt │ ├── Data.rst.txt │ ├── index.rst.txt │ └── Speech Transformer.rst.txt ├── .buildinfo ├── Makefile └── make.bat ├── configs ├── eval.yaml ├── train.yaml ├── eval │ └── default.yaml ├── model │ ├── rnnt.yaml │ ├── ds2.yaml │ ├── transformer.yaml │ ├── joint-ctc-attention-transformer.yaml │ ├── las.yaml │ ├── joint-ctc-attention-las.yaml │ ├── conformer-medium.yaml │ ├── conformer-small.yaml │ └── conformer-large.yaml ├── audio │ ├── mfcc.yaml │ ├── fbank.yaml │ ├── melspectrogram.yaml │ └── spectrogram.yaml └── train │ ├── ds2_train.yaml │ ├── las_train.yaml │ └── transformer_train.yaml ├── train.txt ├── data ├── .DS_Store ├── vocab │ ├── .DS_Store │ ├── tokenizer.model │ ├── kspon_sentencepiece.model │ └── aihub_grapheme_vocabs.csv └── train_result │ ├── eval_result.csv │ ├── train_result.csv │ └── train_step_result.csv ├── dataset ├── kspon │ ├── requirements.txt │ ├── preprocess.sh │ ├── LICENSE │ └── preprocess │ │ ├── subword.py │ │ ├── character.py │ │ └── grapheme.py └── libri │ ├── prepare-libri.sh │ ├── prepare-libri.py │ └── preprocess.py ├── .gitattributes ├── .github └── ISSUE_TEMPLATE │ ├── quesion.md │ ├── bug_report.md │ └── feature_request.md ├── .gitignore ├── test ├── test_rnnt_recognize.py ├── test_conformer_recognize.py ├── model_architecture.py ├── cer_visualize.py ├── loss_visualize.py ├── test_rnnt.py ├── test_deepspeech2_recognize.py ├── test_conformer.py ├── test_las_decoder.py ├── test_jasper_recognize.py ├── test_las_recognize.py ├── test_las_encoder.py ├── test_transformer_recognize.py ├── test_jasper.py ├── test_deepspeech2.py ├── test_las.py ├── test_tri_stage_lr_scheduler.py ├── test_transformer_lr_scheduler.py ├── test_transformer.py ├── test_transformer_encoder.py └── attn_visualize.py ├── bin ├── kospeech │ ├── checkpoint │ │ └── __init__.py │ ├── optim │ │ ├── lr_scheduler │ │ │ ├── __init__.py │ │ │ ├── lr_scheduler.py │ │ │ ├── transformer_lr_scheduler.py │ │ │ └── tri_stage_lr_scheduler.py │ │ └── __init__.py │ ├── models │ │ ├── jasper │ │ │ ├── __init__.py │ │ │ └── configs.py │ │ ├── deepspeech2 │ │ │ └── __init__.py │ │ ├── rnnt │ │ │ └── __init__.py │ │ ├── transformer │ │ │ ├── __init__.py │ │ │ ├── mask.py │ │ │ ├── sublayers.py │ │ │ └── embeddings.py │ │ ├── activation.py │ │ ├── las │ │ │ └── __init__.py │ │ ├── __init__.py │ │ ├── conformer │ │ │ └── __init__.py │ │ └── modules.py │ ├── criterion │ │ ├── __init__.py │ │ ├── transducer.py │ │ ├── label_smoothed_cross_entropy.py │ │ └── joint_ctc_cross_entropy.py │ ├── data │ │ ├── __init__.py │ │ ├── audio │ │ │ └── __init__.py │ │ └── label_loader.py │ ├── evaluator │ │ ├── __init__.py │ │ └── evaluator.py │ ├── vocabs │ │ ├── __init__.py │ │ └── librispeech.py │ └── decode │ │ └── ensemble.py ├── inference.py └── eval.py ├── requirements_cssiri.txt ├── setup.py ├── etc ├── cmd 실행 코드.txt └── traintext 생성.ipynb └── README.md /docs/.nojekyll: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /docs/build/html/.nojekyll: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /configs/eval.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - audio: fbank 3 | - eval: default 
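Note on the defaults list in configs/eval.yaml just above: it follows Hydra's config-group convention, so "audio: fbank" pulls in configs/audio/fbank.yaml and "eval: default" pulls in configs/eval/default.yaml, and either group can be swapped or overridden per run. Below is a minimal sketch of that composition, assuming a recent hydra-core/omegaconf install and that bin/eval.py is the Hydra entry point; the compose call is illustrative only, not the project's own code.

# Illustrative only: composes configs/eval.yaml the way Hydra would at runtime.
# Assumes hydra-core >= 1.2 (drop version_base on older releases); config_path is
# resolved relative to the calling script, here assumed to run from the repo root.
from hydra import compose, initialize
from omegaconf import OmegaConf

with initialize(config_path="configs", version_base=None):
    # CLI equivalent (assuming bin/eval.py is the entry point): python bin/eval.py audio=mfcc
    cfg = compose(config_name="eval", overrides=["audio=mfcc"])
    print(OmegaConf.to_yaml(cfg))  # prints the merged audio + eval settings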
-------------------------------------------------------------------------------- /train.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alsrb0607/KoreanSTT/HEAD/train.txt -------------------------------------------------------------------------------- /data/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alsrb0607/KoreanSTT/HEAD/data/.DS_Store -------------------------------------------------------------------------------- /docs/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alsrb0607/KoreanSTT/HEAD/docs/.DS_Store -------------------------------------------------------------------------------- /docs/objects.inv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alsrb0607/KoreanSTT/HEAD/docs/objects.inv -------------------------------------------------------------------------------- /dataset/kspon/requirements.txt: -------------------------------------------------------------------------------- 1 | tqdm 2 | gluonnlp 3 | sentencepiece 4 | hgtk 5 | pandas 6 | -------------------------------------------------------------------------------- /data/vocab/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alsrb0607/KoreanSTT/HEAD/data/vocab/.DS_Store -------------------------------------------------------------------------------- /docs/_static/file.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alsrb0607/KoreanSTT/HEAD/docs/_static/file.png -------------------------------------------------------------------------------- /docs/_static/plus.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alsrb0607/KoreanSTT/HEAD/docs/_static/plus.png -------------------------------------------------------------------------------- /docs/build/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alsrb0607/KoreanSTT/HEAD/docs/build/.DS_Store -------------------------------------------------------------------------------- /docs/_static/minus.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alsrb0607/KoreanSTT/HEAD/docs/_static/minus.png -------------------------------------------------------------------------------- /data/vocab/tokenizer.model: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alsrb0607/KoreanSTT/HEAD/data/vocab/tokenizer.model -------------------------------------------------------------------------------- /docs/build/html/objects.inv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alsrb0607/KoreanSTT/HEAD/docs/build/html/objects.inv -------------------------------------------------------------------------------- /configs/train.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - audio: fbank 3 | - model: joint-ctc-attention-las 4 | - train: las_train 5 | -------------------------------------------------------------------------------- /docs/build/doctrees/Etc.doctree: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/alsrb0607/KoreanSTT/HEAD/docs/build/doctrees/Etc.doctree -------------------------------------------------------------------------------- /docs/build/doctrees/Data.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alsrb0607/KoreanSTT/HEAD/docs/build/doctrees/Data.doctree -------------------------------------------------------------------------------- /docs/build/doctrees/Decode.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alsrb0607/KoreanSTT/HEAD/docs/build/doctrees/Decode.doctree -------------------------------------------------------------------------------- /docs/build/doctrees/Jasper.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alsrb0607/KoreanSTT/HEAD/docs/build/doctrees/Jasper.doctree -------------------------------------------------------------------------------- /docs/build/doctrees/Optim.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alsrb0607/KoreanSTT/HEAD/docs/build/doctrees/Optim.doctree -------------------------------------------------------------------------------- /docs/build/doctrees/Vocabs.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alsrb0607/KoreanSTT/HEAD/docs/build/doctrees/Vocabs.doctree -------------------------------------------------------------------------------- /docs/build/doctrees/index.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alsrb0607/KoreanSTT/HEAD/docs/build/doctrees/index.doctree -------------------------------------------------------------------------------- /docs/build/html/_static/file.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alsrb0607/KoreanSTT/HEAD/docs/build/html/_static/file.png -------------------------------------------------------------------------------- /docs/build/html/_static/minus.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alsrb0607/KoreanSTT/HEAD/docs/build/html/_static/minus.png -------------------------------------------------------------------------------- /docs/build/html/_static/plus.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alsrb0607/KoreanSTT/HEAD/docs/build/html/_static/plus.png -------------------------------------------------------------------------------- /data/vocab/kspon_sentencepiece.model: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alsrb0607/KoreanSTT/HEAD/data/vocab/kspon_sentencepiece.model -------------------------------------------------------------------------------- /docs/build/doctrees/Modules.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alsrb0607/KoreanSTT/HEAD/docs/build/doctrees/Modules.doctree -------------------------------------------------------------------------------- /docs/build/doctrees/Trainer.doctree: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/alsrb0607/KoreanSTT/HEAD/docs/build/doctrees/Trainer.doctree -------------------------------------------------------------------------------- /data/train_result/eval_result.csv: -------------------------------------------------------------------------------- 1 | loss,cer 2 | 0.42729363597111464,1.4615384615384615 3 | 0.42627342518430084,1.4615384615384615 4 | -------------------------------------------------------------------------------- /data/train_result/train_result.csv: -------------------------------------------------------------------------------- 1 | loss,cer 2 | 0.46951467186864454,1.7254191209787042 3 | 0.4683552428329273,1.7254191209787042 4 | -------------------------------------------------------------------------------- /docs/_static/css/fonts/lato-bold.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alsrb0607/KoreanSTT/HEAD/docs/_static/css/fonts/lato-bold.woff -------------------------------------------------------------------------------- /docs/_static/css/fonts/lato-bold.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alsrb0607/KoreanSTT/HEAD/docs/_static/css/fonts/lato-bold.woff2 -------------------------------------------------------------------------------- /docs/_static/css/fonts/lato-normal.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alsrb0607/KoreanSTT/HEAD/docs/_static/css/fonts/lato-normal.woff -------------------------------------------------------------------------------- /docs/build/doctrees/Checkpoint.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alsrb0607/KoreanSTT/HEAD/docs/build/doctrees/Checkpoint.doctree -------------------------------------------------------------------------------- /docs/build/doctrees/Conformer.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alsrb0607/KoreanSTT/HEAD/docs/build/doctrees/Conformer.doctree -------------------------------------------------------------------------------- /docs/build/doctrees/Criterion.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alsrb0607/KoreanSTT/HEAD/docs/build/doctrees/Criterion.doctree -------------------------------------------------------------------------------- /docs/build/doctrees/Evaluator.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alsrb0607/KoreanSTT/HEAD/docs/build/doctrees/Evaluator.doctree -------------------------------------------------------------------------------- /docs/build/doctrees/Interface.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alsrb0607/KoreanSTT/HEAD/docs/build/doctrees/Interface.doctree -------------------------------------------------------------------------------- /docs/build/doctrees/environment.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alsrb0607/KoreanSTT/HEAD/docs/build/doctrees/environment.pickle 
-------------------------------------------------------------------------------- /docs/build/doctrees/notes/intro.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alsrb0607/KoreanSTT/HEAD/docs/build/doctrees/notes/intro.doctree -------------------------------------------------------------------------------- /docs/_static/css/fonts/lato-normal.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alsrb0607/KoreanSTT/HEAD/docs/_static/css/fonts/lato-normal.woff2 -------------------------------------------------------------------------------- /docs/build/doctrees/Deep Speech 2.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alsrb0607/KoreanSTT/HEAD/docs/build/doctrees/Deep Speech 2.doctree -------------------------------------------------------------------------------- /docs/_static/css/fonts/Roboto-Slab-Bold.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alsrb0607/KoreanSTT/HEAD/docs/_static/css/fonts/Roboto-Slab-Bold.woff -------------------------------------------------------------------------------- /docs/_static/css/fonts/lato-bold-italic.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alsrb0607/KoreanSTT/HEAD/docs/_static/css/fonts/lato-bold-italic.woff -------------------------------------------------------------------------------- /docs/build/doctrees/RNN Transducer.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alsrb0607/KoreanSTT/HEAD/docs/build/doctrees/RNN Transducer.doctree -------------------------------------------------------------------------------- /docs/_static/css/fonts/Roboto-Slab-Bold.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alsrb0607/KoreanSTT/HEAD/docs/_static/css/fonts/Roboto-Slab-Bold.woff2 -------------------------------------------------------------------------------- /docs/_static/css/fonts/fontawesome-webfont.eot: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alsrb0607/KoreanSTT/HEAD/docs/_static/css/fonts/fontawesome-webfont.eot -------------------------------------------------------------------------------- /docs/_static/css/fonts/fontawesome-webfont.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alsrb0607/KoreanSTT/HEAD/docs/_static/css/fonts/fontawesome-webfont.ttf -------------------------------------------------------------------------------- /docs/_static/css/fonts/lato-bold-italic.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alsrb0607/KoreanSTT/HEAD/docs/_static/css/fonts/lato-bold-italic.woff2 -------------------------------------------------------------------------------- /docs/_static/css/fonts/lato-normal-italic.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alsrb0607/KoreanSTT/HEAD/docs/_static/css/fonts/lato-normal-italic.woff -------------------------------------------------------------------------------- /docs/build/doctrees/Speech 
Transformer.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alsrb0607/KoreanSTT/HEAD/docs/build/doctrees/Speech Transformer.doctree -------------------------------------------------------------------------------- /docs/build/doctrees/notes/Preparation.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alsrb0607/KoreanSTT/HEAD/docs/build/doctrees/notes/Preparation.doctree -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | *.psd filter=lfs diff=lfs merge=lfs -text 2 | *.pt filter=lfs diff=lfs merge=lfs -text 3 | .pt filter=lfs diff=lfs merge=lfs -text 4 | -------------------------------------------------------------------------------- /docs/_static/css/fonts/Roboto-Slab-Regular.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alsrb0607/KoreanSTT/HEAD/docs/_static/css/fonts/Roboto-Slab-Regular.woff -------------------------------------------------------------------------------- /docs/_static/css/fonts/Roboto-Slab-Regular.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alsrb0607/KoreanSTT/HEAD/docs/_static/css/fonts/Roboto-Slab-Regular.woff2 -------------------------------------------------------------------------------- /docs/_static/css/fonts/fontawesome-webfont.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alsrb0607/KoreanSTT/HEAD/docs/_static/css/fonts/fontawesome-webfont.woff -------------------------------------------------------------------------------- /docs/_static/css/fonts/fontawesome-webfont.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alsrb0607/KoreanSTT/HEAD/docs/_static/css/fonts/fontawesome-webfont.woff2 -------------------------------------------------------------------------------- /docs/_static/css/fonts/lato-normal-italic.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alsrb0607/KoreanSTT/HEAD/docs/_static/css/fonts/lato-normal-italic.woff2 -------------------------------------------------------------------------------- /docs/build/doctrees/Listen Attend Spell.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alsrb0607/KoreanSTT/HEAD/docs/build/doctrees/Listen Attend Spell.doctree -------------------------------------------------------------------------------- /docs/build/html/_static/css/fonts/lato-bold.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alsrb0607/KoreanSTT/HEAD/docs/build/html/_static/css/fonts/lato-bold.woff -------------------------------------------------------------------------------- /docs/build/html/_static/css/fonts/lato-bold.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alsrb0607/KoreanSTT/HEAD/docs/build/html/_static/css/fonts/lato-bold.woff2 -------------------------------------------------------------------------------- /docs/build/html/_static/css/fonts/lato-normal.woff: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/alsrb0607/KoreanSTT/HEAD/docs/build/html/_static/css/fonts/lato-normal.woff -------------------------------------------------------------------------------- /docs/build/html/_static/css/fonts/lato-normal.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alsrb0607/KoreanSTT/HEAD/docs/build/html/_static/css/fonts/lato-normal.woff2 -------------------------------------------------------------------------------- /docs/build/doctrees/Learning Rate Schedulers.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alsrb0607/KoreanSTT/HEAD/docs/build/doctrees/Learning Rate Schedulers.doctree -------------------------------------------------------------------------------- /docs/build/html/_static/css/fonts/Roboto-Slab-Bold.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alsrb0607/KoreanSTT/HEAD/docs/build/html/_static/css/fonts/Roboto-Slab-Bold.woff -------------------------------------------------------------------------------- /docs/build/html/_static/css/fonts/Roboto-Slab-Bold.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alsrb0607/KoreanSTT/HEAD/docs/build/html/_static/css/fonts/Roboto-Slab-Bold.woff2 -------------------------------------------------------------------------------- /docs/build/html/_static/css/fonts/lato-bold-italic.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alsrb0607/KoreanSTT/HEAD/docs/build/html/_static/css/fonts/lato-bold-italic.woff -------------------------------------------------------------------------------- /docs/build/html/_static/css/fonts/lato-bold-italic.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alsrb0607/KoreanSTT/HEAD/docs/build/html/_static/css/fonts/lato-bold-italic.woff2 -------------------------------------------------------------------------------- /docs/build/html/_static/css/fonts/Roboto-Slab-Regular.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alsrb0607/KoreanSTT/HEAD/docs/build/html/_static/css/fonts/Roboto-Slab-Regular.woff -------------------------------------------------------------------------------- /docs/build/html/_static/css/fonts/Roboto-Slab-Regular.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alsrb0607/KoreanSTT/HEAD/docs/build/html/_static/css/fonts/Roboto-Slab-Regular.woff2 -------------------------------------------------------------------------------- /docs/build/html/_static/css/fonts/fontawesome-webfont.eot: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alsrb0607/KoreanSTT/HEAD/docs/build/html/_static/css/fonts/fontawesome-webfont.eot -------------------------------------------------------------------------------- /docs/build/html/_static/css/fonts/fontawesome-webfont.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alsrb0607/KoreanSTT/HEAD/docs/build/html/_static/css/fonts/fontawesome-webfont.ttf 
-------------------------------------------------------------------------------- /docs/build/html/_static/css/fonts/fontawesome-webfont.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alsrb0607/KoreanSTT/HEAD/docs/build/html/_static/css/fonts/fontawesome-webfont.woff -------------------------------------------------------------------------------- /docs/build/html/_static/css/fonts/fontawesome-webfont.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alsrb0607/KoreanSTT/HEAD/docs/build/html/_static/css/fonts/fontawesome-webfont.woff2 -------------------------------------------------------------------------------- /docs/build/html/_static/css/fonts/lato-normal-italic.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alsrb0607/KoreanSTT/HEAD/docs/build/html/_static/css/fonts/lato-normal-italic.woff -------------------------------------------------------------------------------- /docs/build/html/_static/css/fonts/lato-normal-italic.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alsrb0607/KoreanSTT/HEAD/docs/build/html/_static/css/fonts/lato-normal-italic.woff2 -------------------------------------------------------------------------------- /docs/source/Evaluator.rst: -------------------------------------------------------------------------------- 1 | 2 | Evaluator 3 | ===================================================== 4 | 5 | Evaluator 6 | -------------------------------------------- 7 | .. automodule:: kospeech.evaluator.evaluator 8 | :members: 9 | -------------------------------------------------------------------------------- /docs/_sources/Evaluator.rst.txt: -------------------------------------------------------------------------------- 1 | 2 | Evaluator 3 | ===================================================== 4 | 5 | Evaluator 6 | -------------------------------------------- 7 | .. automodule:: kospeech.evaluator.evaluator 8 | :members: 9 | -------------------------------------------------------------------------------- /docs/source/Checkpoint.rst: -------------------------------------------------------------------------------- 1 | 2 | Checkpoint 3 | ===================================================== 4 | 5 | Checkpoint 6 | -------------------------------------------- 7 | .. automodule:: kospeech.checkpoint.checkpoint 8 | :members: 9 | -------------------------------------------------------------------------------- /docs/_sources/Checkpoint.rst.txt: -------------------------------------------------------------------------------- 1 | 2 | Checkpoint 3 | ===================================================== 4 | 5 | Checkpoint 6 | -------------------------------------------- 7 | .. automodule:: kospeech.checkpoint.checkpoint 8 | :members: 9 | -------------------------------------------------------------------------------- /docs/source/Trainer.rst: -------------------------------------------------------------------------------- 1 | 2 | Trainer 3 | ===================================================== 4 | 5 | Supervised Trainer 6 | -------------------------------------------- 7 | .. 
automodule:: kospeech.trainer.supervised_trainer 8 | :members: 9 | -------------------------------------------------------------------------------- /docs/_sources/Trainer.rst.txt: -------------------------------------------------------------------------------- 1 | 2 | Trainer 3 | ===================================================== 4 | 5 | Supervised Trainer 6 | -------------------------------------------- 7 | .. automodule:: kospeech.trainer.supervised_trainer 8 | :members: 9 | -------------------------------------------------------------------------------- /docs/build/html/_sources/Evaluator.rst.txt: -------------------------------------------------------------------------------- 1 | 2 | Evaluator 3 | ===================================================== 4 | 5 | Evaluator 6 | -------------------------------------------- 7 | .. automodule:: kospeech.evaluator.evaluator 8 | :members: 9 | -------------------------------------------------------------------------------- /docs/.buildinfo: -------------------------------------------------------------------------------- 1 | # Sphinx build info version 1 2 | # This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done. 3 | config: 6c4ce3491335804025d402c95a850221 4 | tags: 645f666f9bcd5a90fca523b33c5a78b7 5 | -------------------------------------------------------------------------------- /docs/build/html/_sources/Checkpoint.rst.txt: -------------------------------------------------------------------------------- 1 | 2 | Checkpoint 3 | ===================================================== 4 | 5 | Checkpoint 6 | -------------------------------------------- 7 | .. automodule:: kospeech.checkpoint.checkpoint 8 | :members: 9 | -------------------------------------------------------------------------------- /docs/build/html/_sources/Trainer.rst.txt: -------------------------------------------------------------------------------- 1 | 2 | Trainer 3 | ===================================================== 4 | 5 | Supervised Trainer 6 | -------------------------------------------- 7 | .. automodule:: kospeech.trainer.supervised_trainer 8 | :members: 9 | -------------------------------------------------------------------------------- /docs/build/html/.buildinfo: -------------------------------------------------------------------------------- 1 | # Sphinx build info version 1 2 | # This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done. 3 | config: 6f417c1046c612cfb4b900ffe5d11a66 4 | tags: 645f666f9bcd5a90fca523b33c5a78b7 5 | -------------------------------------------------------------------------------- /docs/source/Deep Speech 2.rst: -------------------------------------------------------------------------------- 1 | 2 | Deep Speech 2 3 | ===================================================== 4 | 5 | Deep Speech 2 6 | -------------------------------------------- 7 | 8 | .. 
automodule:: kospeech.models.deepspeech2.model 9 | :members: 10 | -------------------------------------------------------------------------------- /configs/eval/default.yaml: -------------------------------------------------------------------------------- 1 | use_cuda: True 2 | dataset: 'kspon' 3 | dataset_path: '' 4 | transcripts_path: 'data/eval_transcript.txt' 5 | model_path: '' 6 | output_unit: 'character' 7 | batch_size: 32 8 | num_workers: 4 9 | print_every: 20 10 | decode: 'greedy' 11 | k: 3 -------------------------------------------------------------------------------- /docs/_sources/Deep Speech 2.rst.txt: -------------------------------------------------------------------------------- 1 | 2 | Deep Speech 2 3 | ===================================================== 4 | 5 | Deep Speech 2 6 | -------------------------------------------- 7 | 8 | .. automodule:: kospeech.models.deepspeech2.model 9 | :members: 10 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/quesion.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Quesion 3 | about: Question about this project 4 | title: '' 5 | labels: '' 6 | assignees: sooftware 7 | 8 | --- 9 | 10 | ## Title 11 | - 12 | 13 | ## Description 14 | 15 | ## Linked Issues 16 | 17 | - resolved # 18 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | ## Title 11 | - 12 | 13 | ## Description 14 | 15 | ## Linked Issues 16 | 17 | - resolved # 18 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | ## Title 11 | - 12 | 13 | ## Description 14 | 15 | ## Linked Issues 16 | - resolved # 17 | -------------------------------------------------------------------------------- /docs/build/html/_sources/Deep Speech 2.rst.txt: -------------------------------------------------------------------------------- 1 | 2 | Deep Speech 2 3 | ===================================================== 4 | 5 | Deep Speech 2 6 | -------------------------------------------- 7 | 8 | .. 
automodule:: kospeech.models.deepspeech2.model 9 | :members: 10 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | experiment/ 2 | experiment/* 3 | data/sample 4 | data/sample/* 5 | data/data_list/debug_list.csv 6 | data/data_list/sample_list.csv 7 | *.bin 8 | *.zip 9 | *.idea 10 | docs/training 11 | docs/training/* 12 | __pycache__/ 13 | venv/* 14 | *.pyc 15 | .idea 16 | .ipynb_checkpoints -------------------------------------------------------------------------------- /configs/model/rnnt.yaml: -------------------------------------------------------------------------------- 1 | architecture: rnnt 2 | num_encoder_layers: 4 3 | num_decoder_layers: 1 4 | encoder_hidden_state_dim: 320 5 | decoder_hidden_state_dim: 512 6 | output_dim: 512 7 | rnn_type: lstm 8 | bidirectional: True 9 | encoder_dropout_p: 0.2 10 | decoder_dropout_p: 0.2 11 | -------------------------------------------------------------------------------- /configs/model/ds2.yaml: -------------------------------------------------------------------------------- 1 | architecture: deepspeech2 2 | use_bidirectional: True 3 | hidden_dim: 1024 4 | dropout: 0.3 5 | num_encoder_layers: 3 6 | rnn_type: gru 7 | max_len: 400 8 | activation: hardtanh 9 | teacher_forcing_ratio: 1.0 10 | teacher_forcing_step: 0.0 11 | min_teacher_forcing_ratio: 1.0 12 | joint_ctc_attention: false -------------------------------------------------------------------------------- /configs/model/transformer.yaml: -------------------------------------------------------------------------------- 1 | architecture: transformer 2 | d_model: 512 3 | num_heads: 8 4 | num_encoder_layers: 12 5 | num_decoder_layers: 6 6 | dropout: 0.3 7 | ffnet_style: ff 8 | teacher_forcing_ratio: 1.0 9 | teacher_forcing_step: 0.0 10 | min_teacher_forcing_ratio: 1.0 11 | joint_ctc_attention: false 12 | max_len: 400 -------------------------------------------------------------------------------- /configs/audio/mfcc.yaml: -------------------------------------------------------------------------------- 1 | audio_extension: pcm 2 | transform_method: mfcc 3 | sample_rate: 16000 4 | frame_length: 20 5 | frame_shift: 10 6 | n_mels: 40 7 | normalize: True 8 | del_silence: True 9 | feature_extract_by: kaldi 10 | freq_mask_para: 8 11 | time_mask_num: 4 12 | freq_mask_num: 2 13 | spec_augment: True 14 | input_reverse: false -------------------------------------------------------------------------------- /configs/audio/fbank.yaml: -------------------------------------------------------------------------------- 1 | audio_extension: wav 2 | transform_method: fbank 3 | sample_rate: 16000 4 | frame_length: 20 5 | frame_shift: 10 6 | n_mels: 80 7 | normalize: True 8 | del_silence: True 9 | feature_extract_by: kaldi 10 | freq_mask_para: 18 11 | time_mask_num: 4 12 | freq_mask_num: 2 13 | spec_augment: True 14 | input_reverse: false -------------------------------------------------------------------------------- /configs/audio/melspectrogram.yaml: -------------------------------------------------------------------------------- 1 | audio_extension: pcm 2 | transform_method: mel 3 | sample_rate: 16000 4 | frame_length: 20 5 | frame_shift: 10 6 | n_mels: 80 7 | normalize: True 8 | del_silence: True 9 | feature_extract_by: kaldi 10 | freq_mask_para: 18 11 | time_mask_num: 4 12 | freq_mask_num: 2 13 | spec_augment: True 14 | input_reverse: false 
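The audio configs above (mfcc.yaml, fbank.yaml, melspectrogram.yaml) all describe a Kaldi-style front end: 16 kHz input, 20 ms windows with a 10 ms shift, 40 or 80 mel bins, plus SpecAugment masking parameters. A minimal sketch of how the fbank.yaml settings could map onto a feature extractor is shown below, assuming torchaudio's Kaldi-compatible API is the backend implied by "feature_extract_by: kaldi"; the helper is illustrative, not the project's own loader, and normalization/SpecAugment from the config would be applied as separate steps afterwards.

# Illustrative only: log-mel filter-bank extraction with the fbank.yaml settings.
# Assumes torchaudio is installed; kospeech's actual data pipeline may differ.
import torchaudio
import torchaudio.compliance.kaldi as kaldi

def extract_fbank(wav_path, n_mels=80, frame_length=20.0, frame_shift=10.0):
    waveform, sample_rate = torchaudio.load(wav_path)  # waveform: (channels, time)
    return kaldi.fbank(
        waveform,
        num_mel_bins=n_mels,
        frame_length=frame_length,   # window length in milliseconds
        frame_shift=frame_shift,     # hop length in milliseconds
        sample_frequency=sample_rate,
    )                                # -> tensor of shape (num_frames, n_mels)

features = extract_fbank("example.wav")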
-------------------------------------------------------------------------------- /configs/audio/spectrogram.yaml: -------------------------------------------------------------------------------- 1 | audio_extension: pcm 2 | transform_method: mel 3 | sample_rate: 16000 4 | frame_length: 20 5 | frame_shift: 10 6 | n_mels: 161 # Not used 7 | normalize: True 8 | del_silence: True 9 | feature_extract_by: kaldi 10 | freq_mask_para: 24 11 | time_mask_num: 4 12 | freq_mask_num: 2 13 | spec_augment: True 14 | input_reverse: false -------------------------------------------------------------------------------- /docs/source/Decode.rst: -------------------------------------------------------------------------------- 1 | 2 | Decode 3 | ===================================================== 4 | 5 | Search 6 | -------------------------------------------- 7 | .. automodule:: kospeech.decode.search 8 | :members: 9 | 10 | 11 | Ensemble 12 | -------------------------------------------- 13 | .. automodule:: kospeech.decode.ensemble 14 | :members: 15 | -------------------------------------------------------------------------------- /docs/_sources/Decode.rst.txt: -------------------------------------------------------------------------------- 1 | 2 | Decode 3 | ===================================================== 4 | 5 | Search 6 | -------------------------------------------- 7 | .. automodule:: kospeech.decode.search 8 | :members: 9 | 10 | 11 | Ensemble 12 | -------------------------------------------- 13 | .. automodule:: kospeech.decode.ensemble 14 | :members: 15 | -------------------------------------------------------------------------------- /docs/build/html/_sources/Decode.rst.txt: -------------------------------------------------------------------------------- 1 | 2 | Decode 3 | ===================================================== 4 | 5 | Search 6 | -------------------------------------------- 7 | .. automodule:: kospeech.decode.search 8 | :members: 9 | 10 | 11 | Ensemble 12 | -------------------------------------------- 13 | .. automodule:: kospeech.decode.ensemble 14 | :members: 15 | -------------------------------------------------------------------------------- /docs/source/Jasper.rst: -------------------------------------------------------------------------------- 1 | 2 | Jasper 3 | ===================================================== 4 | 5 | Jasper 6 | -------------------------------------------- 7 | 8 | .. automodule:: kospeech.models.jasper.model 9 | :members: 10 | 11 | Sublayers 12 | -------------------------------------------- 13 | 14 | .. automodule:: kospeech.models.jasper.sublayers 15 | :members: 16 | -------------------------------------------------------------------------------- /docs/_sources/Jasper.rst.txt: -------------------------------------------------------------------------------- 1 | 2 | Jasper 3 | ===================================================== 4 | 5 | Jasper 6 | -------------------------------------------- 7 | 8 | .. automodule:: kospeech.models.jasper.model 9 | :members: 10 | 11 | Sublayers 12 | -------------------------------------------- 13 | 14 | .. automodule:: kospeech.models.jasper.sublayers 15 | :members: 16 | -------------------------------------------------------------------------------- /docs/build/html/_sources/Jasper.rst.txt: -------------------------------------------------------------------------------- 1 | 2 | Jasper 3 | ===================================================== 4 | 5 | Jasper 6 | -------------------------------------------- 7 | 8 | .. 
automodule:: kospeech.models.jasper.model 9 | :members: 10 | 11 | Sublayers 12 | -------------------------------------------- 13 | 14 | .. automodule:: kospeech.models.jasper.sublayers 15 | :members: 16 | -------------------------------------------------------------------------------- /configs/model/joint-ctc-attention-transformer.yaml: -------------------------------------------------------------------------------- 1 | architecture: transformer 2 | d_model: 512 3 | num_heads: 8 4 | num_encoder_layers: 12 5 | num_decoder_layers: 6 6 | dropout: 0.3 7 | ffnet_style: ff 8 | teacher_forcing_ratio: 1.0 9 | teacher_forcing_step: 0.0 10 | min_teacher_forcing_ratio: 1.0 11 | cross_entropy_weight: 0.7 12 | ctc_weight: 0.3 13 | mask_conv: True 14 | joint_ctc_attention: True 15 | max_len: 400 -------------------------------------------------------------------------------- /docs/_static/documentation_options.js: -------------------------------------------------------------------------------- 1 | var DOCUMENTATION_OPTIONS = { 2 | URL_ROOT: document.getElementById("documentation_options").getAttribute('data-url_root'), 3 | VERSION: 'latest', 4 | LANGUAGE: 'None', 5 | COLLAPSE_INDEX: false, 6 | BUILDER: 'html', 7 | FILE_SUFFIX: '.html', 8 | LINK_SUFFIX: '.html', 9 | HAS_SOURCE: true, 10 | SOURCELINK_SUFFIX: '.txt', 11 | NAVIGATION_WITH_KEYS: false 12 | }; -------------------------------------------------------------------------------- /docs/build/html/_static/documentation_options.js: -------------------------------------------------------------------------------- 1 | var DOCUMENTATION_OPTIONS = { 2 | URL_ROOT: document.getElementById("documentation_options").getAttribute('data-url_root'), 3 | VERSION: 'latest', 4 | LANGUAGE: 'None', 5 | COLLAPSE_INDEX: false, 6 | BUILDER: 'html', 7 | FILE_SUFFIX: '.html', 8 | LINK_SUFFIX: '.html', 9 | HAS_SOURCE: true, 10 | SOURCELINK_SUFFIX: '.txt', 11 | NAVIGATION_WITH_KEYS: false 12 | }; -------------------------------------------------------------------------------- /docs/source/Etc.rst: -------------------------------------------------------------------------------- 1 | 2 | Etc 3 | ===================================================== 4 | 5 | Metrics 6 | -------------------------------------------- 7 | .. automodule:: kospeech.metrics 8 | :members: 9 | 10 | ModelBuilder 11 | -------------------------------------------- 12 | .. automodule:: kospeech.model_builder 13 | :members: 14 | 15 | Utils 16 | -------------------------------------------- 17 | .. automodule:: kospeech.utils 18 | :members: -------------------------------------------------------------------------------- /configs/model/las.yaml: -------------------------------------------------------------------------------- 1 | architecture: las 2 | use_bidirectional: True 3 | hidden_dim: 512 4 | dropout: 0.3 5 | num_heads: 4 6 | label_smoothing: 0.1 7 | num_encoder_layers: 3 8 | num_decoder_layers: 2 9 | rnn_type: lstm 10 | teacher_forcing_ratio: 1.0 11 | attn_mechanism: multi-head 12 | teacher_forcing_step: 0.0 13 | min_teacher_forcing_ratio: 1.0 14 | extractor: vgg 15 | activation: hardtanh 16 | mask_conv: false 17 | joint_ctc_attention: false 18 | max_len: 400 -------------------------------------------------------------------------------- /docs/_sources/Etc.rst.txt: -------------------------------------------------------------------------------- 1 | 2 | Etc 3 | ===================================================== 4 | 5 | Metrics 6 | -------------------------------------------- 7 | .. 
automodule:: kospeech.metrics 8 | :members: 9 | 10 | ModelBuilder 11 | -------------------------------------------- 12 | .. automodule:: kospeech.model_builder 13 | :members: 14 | 15 | Utils 16 | -------------------------------------------- 17 | .. automodule:: kospeech.utils 18 | :members: -------------------------------------------------------------------------------- /docs/build/html/_sources/Etc.rst.txt: -------------------------------------------------------------------------------- 1 | 2 | Etc 3 | ===================================================== 4 | 5 | Metrics 6 | -------------------------------------------- 7 | .. automodule:: kospeech.metrics 8 | :members: 9 | 10 | ModelBuilder 11 | -------------------------------------------- 12 | .. automodule:: kospeech.model_builder 13 | :members: 14 | 15 | Utils 16 | -------------------------------------------- 17 | .. automodule:: kospeech.utils 18 | :members: -------------------------------------------------------------------------------- /docs/source/Interface.rst: -------------------------------------------------------------------------------- 1 | 2 | Interface 3 | ===================================================== 4 | 5 | Model 6 | -------------------------------------------- 7 | 8 | .. automodule:: kospeech.models.model 9 | :members: 10 | 11 | Encoder 12 | -------------------------------------------- 13 | 14 | .. automodule:: kospeech.models.encoder 15 | :members: 16 | 17 | Decoder 18 | -------------------------------------------- 19 | 20 | .. automodule:: kospeech.models.decoder 21 | :members: 22 | 23 | -------------------------------------------------------------------------------- /docs/_sources/Interface.rst.txt: -------------------------------------------------------------------------------- 1 | 2 | Interface 3 | ===================================================== 4 | 5 | Model 6 | -------------------------------------------- 7 | 8 | .. automodule:: kospeech.models.model 9 | :members: 10 | 11 | Encoder 12 | -------------------------------------------- 13 | 14 | .. automodule:: kospeech.models.encoder 15 | :members: 16 | 17 | Decoder 18 | -------------------------------------------- 19 | 20 | .. automodule:: kospeech.models.decoder 21 | :members: 22 | 23 | -------------------------------------------------------------------------------- /docs/source/Vocabs.rst: -------------------------------------------------------------------------------- 1 | 2 | Vocabs 3 | ===================================================== 4 | 5 | Vocabulary 6 | -------------------------------------------- 7 | .. automodule:: kospeech.vocabs.__init__ 8 | :members: 9 | 10 | KsponSpeechVocabulary 11 | -------------------------------------------- 12 | .. automodule:: kospeech.vocabs.ksponspeech 13 | :members: 14 | 15 | LibriSpeechVocabulary 16 | -------------------------------------------- 17 | .. automodule:: kospeech.vocabs.librispeech 18 | :members: -------------------------------------------------------------------------------- /docs/_sources/Vocabs.rst.txt: -------------------------------------------------------------------------------- 1 | 2 | Vocabs 3 | ===================================================== 4 | 5 | Vocabulary 6 | -------------------------------------------- 7 | .. automodule:: kospeech.vocabs.__init__ 8 | :members: 9 | 10 | KsponSpeechVocabulary 11 | -------------------------------------------- 12 | .. 
automodule:: kospeech.vocabs.ksponspeech 13 | :members: 14 | 15 | LibriSpeechVocabulary 16 | -------------------------------------------- 17 | .. automodule:: kospeech.vocabs.librispeech 18 | :members: -------------------------------------------------------------------------------- /docs/build/html/_sources/Interface.rst.txt: -------------------------------------------------------------------------------- 1 | 2 | Interface 3 | ===================================================== 4 | 5 | Model 6 | -------------------------------------------- 7 | 8 | .. automodule:: kospeech.models.model 9 | :members: 10 | 11 | Encoder 12 | -------------------------------------------- 13 | 14 | .. automodule:: kospeech.models.encoder 15 | :members: 16 | 17 | Decoder 18 | -------------------------------------------- 19 | 20 | .. automodule:: kospeech.models.decoder 21 | :members: 22 | 23 | -------------------------------------------------------------------------------- /docs/source/Modules.rst: -------------------------------------------------------------------------------- 1 | 2 | Modules 3 | ===================================================== 4 | 5 | Activation 6 | -------------------------------------------- 7 | 8 | .. automodule:: kospeech.models.activation 9 | :members: 10 | 11 | Attention 12 | -------------------------------------------- 13 | 14 | .. automodule:: kospeech.models.attention 15 | :members: 16 | 17 | Convolution 18 | -------------------------------------------- 19 | 20 | .. automodule:: kospeech.models.convolution 21 | :members: 22 | 23 | -------------------------------------------------------------------------------- /configs/model/joint-ctc-attention-las.yaml: -------------------------------------------------------------------------------- 1 | architecture: las 2 | use_bidirectional: True 3 | hidden_dim: 768 4 | dropout: 0.3 5 | num_heads: 4 6 | label_smoothing: 0.1 7 | num_encoder_layers: 3 8 | num_decoder_layers: 2 9 | rnn_type: lstm 10 | teacher_forcing_ratio: 1.0 11 | attn_mechanism: multi-head 12 | teacher_forcing_step: 0.01 13 | min_teacher_forcing_ratio: 0.9 14 | extractor: vgg 15 | activation: hardtanh 16 | cross_entropy_weight: 0.7 17 | ctc_weight: 0.3 18 | mask_conv: True 19 | joint_ctc_attention: True 20 | max_len: 400 -------------------------------------------------------------------------------- /docs/_sources/Modules.rst.txt: -------------------------------------------------------------------------------- 1 | 2 | Modules 3 | ===================================================== 4 | 5 | Activation 6 | -------------------------------------------- 7 | 8 | .. automodule:: kospeech.models.activation 9 | :members: 10 | 11 | Attention 12 | -------------------------------------------- 13 | 14 | .. automodule:: kospeech.models.attention 15 | :members: 16 | 17 | Convolution 18 | -------------------------------------------- 19 | 20 | .. automodule:: kospeech.models.convolution 21 | :members: 22 | 23 | -------------------------------------------------------------------------------- /docs/build/html/_sources/Vocabs.rst.txt: -------------------------------------------------------------------------------- 1 | 2 | Vocabs 3 | ===================================================== 4 | 5 | Vocabulary 6 | -------------------------------------------- 7 | .. automodule:: kospeech.vocabs.__init__ 8 | :members: 9 | 10 | KsponSpeechVocabulary 11 | -------------------------------------------- 12 | .. 
automodule:: kospeech.vocabs.ksponspeech 13 | :members: 14 | 15 | LibriSpeechVocabulary 16 | -------------------------------------------- 17 | .. automodule:: kospeech.vocabs.librispeech 18 | :members: -------------------------------------------------------------------------------- /docs/source/RNN Transducer.rst: -------------------------------------------------------------------------------- 1 | 2 | RNN Transducer 3 | ===================================================== 4 | 5 | RNN Transducer 6 | -------------------------------------------- 7 | 8 | .. automodule:: kospeech.models.rnnt.model 9 | :members: 10 | 11 | Encoder 12 | -------------------------------------------- 13 | 14 | .. automodule:: kospeech.models.rnnt.encoder 15 | :members: 16 | 17 | Decoder 18 | -------------------------------------------- 19 | 20 | .. automodule:: kospeech.models.rnnt.decoder 21 | :members: 22 | -------------------------------------------------------------------------------- /docs/_sources/RNN Transducer.rst.txt: -------------------------------------------------------------------------------- 1 | 2 | RNN Transducer 3 | ===================================================== 4 | 5 | RNN Transducer 6 | -------------------------------------------- 7 | 8 | .. automodule:: kospeech.models.rnnt.model 9 | :members: 10 | 11 | Encoder 12 | -------------------------------------------- 13 | 14 | .. automodule:: kospeech.models.rnnt.encoder 15 | :members: 16 | 17 | Decoder 18 | -------------------------------------------- 19 | 20 | .. automodule:: kospeech.models.rnnt.decoder 21 | :members: 22 | -------------------------------------------------------------------------------- /docs/build/html/_sources/Modules.rst.txt: -------------------------------------------------------------------------------- 1 | 2 | Modules 3 | ===================================================== 4 | 5 | Activation 6 | -------------------------------------------- 7 | 8 | .. automodule:: kospeech.models.activation 9 | :members: 10 | 11 | Attention 12 | -------------------------------------------- 13 | 14 | .. automodule:: kospeech.models.attention 15 | :members: 16 | 17 | Convolution 18 | -------------------------------------------- 19 | 20 | .. automodule:: kospeech.models.convolution 21 | :members: 22 | 23 | -------------------------------------------------------------------------------- /docs/build/html/_sources/RNN Transducer.rst.txt: -------------------------------------------------------------------------------- 1 | 2 | RNN Transducer 3 | ===================================================== 4 | 5 | RNN Transducer 6 | -------------------------------------------- 7 | 8 | .. automodule:: kospeech.models.rnnt.model 9 | :members: 10 | 11 | Encoder 12 | -------------------------------------------- 13 | 14 | .. automodule:: kospeech.models.rnnt.encoder 15 | :members: 16 | 17 | Decoder 18 | -------------------------------------------- 19 | 20 | .. automodule:: kospeech.models.rnnt.decoder 21 | :members: 22 | -------------------------------------------------------------------------------- /docs/source/Listen Attend Spell.rst: -------------------------------------------------------------------------------- 1 | 2 | Listen, Attend and Spell 3 | ===================================================== 4 | 5 | ListenAttendSpell 6 | -------------------------------------------- 7 | 8 | .. automodule:: kospeech.models.las.model 9 | :members: 10 | 11 | Encoder 12 | -------------------------------------------- 13 | 14 | .. 
automodule:: kospeech.models.las.encoder 15 | :members: 16 | 17 | Decoder 18 | -------------------------------------------- 19 | 20 | .. automodule:: kospeech.models.las.decoder 21 | :members: 22 | 23 | -------------------------------------------------------------------------------- /configs/model/conformer-medium.yaml: -------------------------------------------------------------------------------- 1 | architecture: conformer 2 | teacher_forcing_step: 0.0 3 | min_teacher_forcing_ratio: 1.0 4 | joint_ctc_attention: false 5 | feed_forward_expansion_factor: 4 6 | conv_expansion_factor: 2 7 | input_dropout_p: 0.1 8 | feed_forward_dropout_p: 0.1 9 | attention_dropout_p: 0.1 10 | conv_dropout_p: 0.1 11 | decoder_dropout_p: 0.1 12 | conv_kernel_size: 31 13 | half_step_residual: True 14 | encoder_dim: 256 15 | decoder_dim: 640 16 | num_encoder_layers: 17 17 | num_decoder_layers: 1 18 | num_attention_heads: 4 19 | decoder: None 20 | -------------------------------------------------------------------------------- /configs/model/conformer-small.yaml: -------------------------------------------------------------------------------- 1 | architecture: conformer 2 | teacher_forcing_step: 0.0 3 | min_teacher_forcing_ratio: 1.0 4 | joint_ctc_attention: false 5 | feed_forward_expansion_factor: 4 6 | conv_expansion_factor: 2 7 | input_dropout_p: 0.1 8 | feed_forward_dropout_p: 0.1 9 | attention_dropout_p: 0.1 10 | conv_dropout_p: 0.1 11 | decoder_dropout_p: 0.1 12 | conv_kernel_size: 31 13 | half_step_residual: True 14 | encoder_dim: 144 15 | decoder_dim: 320 16 | num_encoder_layers: 16 17 | num_decoder_layers: 1 18 | num_attention_heads: 4 19 | decoder: None 20 | -------------------------------------------------------------------------------- /docs/_sources/Listen Attend Spell.rst.txt: -------------------------------------------------------------------------------- 1 | 2 | Listen, Attend and Spell 3 | ===================================================== 4 | 5 | ListenAttendSpell 6 | -------------------------------------------- 7 | 8 | .. automodule:: kospeech.models.las.model 9 | :members: 10 | 11 | Encoder 12 | -------------------------------------------- 13 | 14 | .. automodule:: kospeech.models.las.encoder 15 | :members: 16 | 17 | Decoder 18 | -------------------------------------------- 19 | 20 | .. automodule:: kospeech.models.las.decoder 21 | :members: 22 | 23 | -------------------------------------------------------------------------------- /docs/build/html/_sources/Listen Attend Spell.rst.txt: -------------------------------------------------------------------------------- 1 | 2 | Listen, Attend and Spell 3 | ===================================================== 4 | 5 | ListenAttendSpell 6 | -------------------------------------------- 7 | 8 | .. automodule:: kospeech.models.las.model 9 | :members: 10 | 11 | Encoder 12 | -------------------------------------------- 13 | 14 | .. automodule:: kospeech.models.las.encoder 15 | :members: 16 | 17 | Decoder 18 | -------------------------------------------- 19 | 20 | .. automodule:: kospeech.models.las.decoder 21 | :members: 22 | 23 | -------------------------------------------------------------------------------- /docs/source/Criterion.rst: -------------------------------------------------------------------------------- 1 | 2 | Criterion 3 | ===================================================== 4 | 5 | Label Smoothed Cross Entropy Loss 6 | -------------------------------------------- 7 | .. 
automodule:: kospeech.criterion.label_smoothed_cross_entropy 8 | :members: 9 | 10 | Joint CTC-CrossEntropy Loss 11 | -------------------------------------------- 12 | .. automodule:: kospeech.criterion.joint_ctc_cross_entropy 13 | :members: 14 | 15 | Transducer Loss 16 | -------------------------------------------- 17 | .. automodule:: kospeech.criterion.transducer 18 | :members: 19 | -------------------------------------------------------------------------------- /configs/model/conformer-large.yaml: -------------------------------------------------------------------------------- 1 | architecture: conformer 2 | teacher_forcing_step: 0.0 3 | min_teacher_forcing_ratio: 1.0 4 | joint_ctc_attention: false 5 | feed_forward_expansion_factor: 4 6 | conv_expansion_factor: 2 7 | input_dropout_p: 0.1 8 | feed_forward_dropout_p: 0.1 9 | attention_dropout_p: 0.1 10 | conv_dropout_p: 0.1 11 | decoder_dropout_p: 0.1 12 | conv_kernel_size: 31 13 | half_step_residual: True 14 | encoder_dim: 512 15 | decoder_dim: 640 16 | num_encoder_layers: 17 17 | num_decoder_layers: 1 18 | num_attention_heads: 8 19 | decoder_rnn_type: lstm 20 | decoder: None 21 | -------------------------------------------------------------------------------- /docs/_sources/Criterion.rst.txt: -------------------------------------------------------------------------------- 1 | 2 | Criterion 3 | ===================================================== 4 | 5 | Label Smoothed Cross Entropy Loss 6 | -------------------------------------------- 7 | .. automodule:: kospeech.criterion.label_smoothed_cross_entropy 8 | :members: 9 | 10 | Joint CTC-CrossEntropy Loss 11 | -------------------------------------------- 12 | .. automodule:: kospeech.criterion.joint_ctc_cross_entropy 13 | :members: 14 | 15 | Transducer Loss 16 | -------------------------------------------- 17 | .. automodule:: kospeech.criterion.transducer 18 | :members: 19 | -------------------------------------------------------------------------------- /docs/source/Optim.rst: -------------------------------------------------------------------------------- 1 | 2 | Optim 3 | ===================================================== 4 | 5 | Optimizer 6 | -------------------------------------------- 7 | .. automodule:: kospeech.optim.__init__ 8 | :members: 9 | 10 | RAdam 11 | -------------------------------------------- 12 | .. automodule:: kospeech.optim.radam 13 | :members: 14 | 15 | AdamP 16 | -------------------------------------------- 17 | .. automodule:: kospeech.optim.adamp 18 | :members: 19 | 20 | Novograd 21 | -------------------------------------------- 22 | .. automodule:: kospeech.optim.novograd 23 | :members: 24 | -------------------------------------------------------------------------------- /docs/build/html/_sources/Criterion.rst.txt: -------------------------------------------------------------------------------- 1 | 2 | Criterion 3 | ===================================================== 4 | 5 | Label Smoothed Cross Entropy Loss 6 | -------------------------------------------- 7 | .. automodule:: kospeech.criterion.label_smoothed_cross_entropy 8 | :members: 9 | 10 | Joint CTC-CrossEntropy Loss 11 | -------------------------------------------- 12 | .. automodule:: kospeech.criterion.joint_ctc_cross_entropy 13 | :members: 14 | 15 | Transducer Loss 16 | -------------------------------------------- 17 | .. 
automodule:: kospeech.criterion.transducer 18 | :members: 19 | -------------------------------------------------------------------------------- /docs/_sources/Optim.rst.txt: -------------------------------------------------------------------------------- 1 | 2 | Optim 3 | ===================================================== 4 | 5 | Optimizer 6 | -------------------------------------------- 7 | .. automodule:: kospeech.optim.__init__ 8 | :members: 9 | 10 | RAdam 11 | -------------------------------------------- 12 | .. automodule:: kospeech.optim.radam 13 | :members: 14 | 15 | AdamP 16 | -------------------------------------------- 17 | .. automodule:: kospeech.optim.adamp 18 | :members: 19 | 20 | Novograd 21 | -------------------------------------------- 22 | .. automodule:: kospeech.optim.novograd 23 | :members: 24 | -------------------------------------------------------------------------------- /docs/build/html/_sources/Optim.rst.txt: -------------------------------------------------------------------------------- 1 | 2 | Optim 3 | ===================================================== 4 | 5 | Optimizer 6 | -------------------------------------------- 7 | .. automodule:: kospeech.optim.__init__ 8 | :members: 9 | 10 | RAdam 11 | -------------------------------------------- 12 | .. automodule:: kospeech.optim.radam 13 | :members: 14 | 15 | AdamP 16 | -------------------------------------------- 17 | .. automodule:: kospeech.optim.adamp 18 | :members: 19 | 20 | Novograd 21 | -------------------------------------------- 22 | .. automodule:: kospeech.optim.novograd 23 | :members: 24 | -------------------------------------------------------------------------------- /docs/source/Learning Rate Schedulers.rst: -------------------------------------------------------------------------------- 1 | 2 | Learning Rate Scheduler 3 | ===================================================== 4 | 5 | Learning Rate Scheduler 6 | -------------------------------------------- 7 | .. automodule:: kospeech.optim.lr_scheduler.lr_scheduler 8 | :members: 9 | 10 | Tri Stage LR Scheduler 11 | -------------------------------------------- 12 | .. automodule:: kospeech.optim.lr_scheduler.tri_stage_lr_scheduler 13 | :members: 14 | 15 | Transformer LR Scheduler 16 | -------------------------------------------- 17 | .. 
automodule:: kospeech.optim.lr_scheduler.transformer_lr_scheduler 18 | :members: 19 | -------------------------------------------------------------------------------- /configs/train/ds2_train.yaml: -------------------------------------------------------------------------------- 1 | # data 2 | dataset: 'kspon' 3 | dataset_path: '' 4 | transcripts_path: '../../../data/transcripts.txt' 5 | output_unit: 'character' 6 | 7 | # trainer 8 | num_epochs: 20 9 | batch_size: 32 10 | save_result_every: 100 11 | checkpoint_every: 100 12 | print_every: 10 13 | mode: 'train' 14 | seed: 777 15 | resume: false 16 | 17 | # device 18 | num_workers: 4 19 | use_cuda: True 20 | 21 | # optim 22 | optimizer: 'adam' 23 | init_lr: 1e-06 24 | final_lr: 1e-06 25 | peak_lr: 1e-04 26 | init_lr_scale: 0.01 27 | final_lr_scale: 0.05 28 | max_grad_norm: 400 29 | warmup_steps: 400 30 | weight_decay: 1e-05 31 | reduction: 'mean' -------------------------------------------------------------------------------- /docs/_sources/Learning Rate Schedulers.rst.txt: -------------------------------------------------------------------------------- 1 | 2 | Learning Rate Scheduler 3 | ===================================================== 4 | 5 | Learning Rate Scheduler 6 | -------------------------------------------- 7 | .. automodule:: kospeech.optim.lr_scheduler.lr_scheduler 8 | :members: 9 | 10 | Tri Stage LR Scheduler 11 | -------------------------------------------- 12 | .. automodule:: kospeech.optim.lr_scheduler.tri_stage_lr_scheduler 13 | :members: 14 | 15 | Transformer LR Scheduler 16 | -------------------------------------------- 17 | .. automodule:: kospeech.optim.lr_scheduler.transformer_lr_scheduler 18 | :members: 19 | -------------------------------------------------------------------------------- /configs/train/las_train.yaml: -------------------------------------------------------------------------------- 1 | # data 2 | dataset: 'kspon' 3 | dataset_path: '' 4 | transcripts_path: '../../../data/transcripts.txt' 5 | output_unit: 'character' 6 | 7 | # trainer 8 | num_epochs: 20 9 | batch_size: 32 10 | save_result_every: 1000 11 | checkpoint_every: 5000 12 | print_every: 10 13 | mode: 'train' 14 | seed: 777 15 | resume: false 16 | 17 | # device 18 | num_workers: 4 19 | use_cuda: True 20 | 21 | # optim 22 | optimizer: 'adam' 23 | init_lr: 1e-06 24 | final_lr: 1e-06 25 | peak_lr: 1e-04 26 | init_lr_scale: 0.01 27 | final_lr_scale: 0.05 28 | max_grad_norm: 400 29 | warmup_steps: 400 30 | weight_decay: 1e-05 31 | reduction: 'mean' -------------------------------------------------------------------------------- /docs/build/html/_sources/Learning Rate Schedulers.rst.txt: -------------------------------------------------------------------------------- 1 | 2 | Learning Rate Scheduler 3 | ===================================================== 4 | 5 | Learning Rate Scheduler 6 | -------------------------------------------- 7 | .. automodule:: kospeech.optim.lr_scheduler.lr_scheduler 8 | :members: 9 | 10 | Tri Stage LR Scheduler 11 | -------------------------------------------- 12 | .. automodule:: kospeech.optim.lr_scheduler.tri_stage_lr_scheduler 13 | :members: 14 | 15 | Transformer LR Scheduler 16 | -------------------------------------------- 17 | .. 
automodule:: kospeech.optim.lr_scheduler.transformer_lr_scheduler 18 | :members: 19 | -------------------------------------------------------------------------------- /configs/train/transformer_train.yaml: -------------------------------------------------------------------------------- 1 | # data 2 | dataset: 'kspon' 3 | dataset_path: '' 4 | transcripts_path: '../../../data/transcripts.txt' 5 | output_unit: 'character' 6 | 7 | # trainer 8 | num_epochs: 40 9 | batch_size: 32 10 | save_result_every: 1000 11 | checkpoint_every: 5000 12 | print_every: 10 13 | mode: 'train' 14 | seed: 777 15 | resume: false 16 | 17 | # device 18 | num_workers: 4 19 | use_cuda: True 20 | 21 | # optim 22 | optimizer: 'adam' 23 | init_lr: 1e-06 24 | final_lr: 1e-07 25 | peak_lr: 1e-04 26 | init_lr_scale: 0.01 27 | final_lr_scale: 0.05 28 | max_grad_norm: 400 29 | warmup_steps: 4000 30 | weight_decay: 1e-05 31 | reduction: 'mean' 32 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | SOURCEDIR = source 8 | BUILDDIR = build 9 | 10 | # Put it first so that "make" without argument is like "make help". 11 | help: 12 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 13 | 14 | .PHONY: help Makefile 15 | 16 | # Catch-all target: route all unknown targets to Sphinx using the new 17 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 18 | %: Makefile 19 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) -------------------------------------------------------------------------------- /docs/source/Conformer.rst: -------------------------------------------------------------------------------- 1 | 2 | Conformer 3 | ===================================================== 4 | 5 | Conformer 6 | -------------------------------------------- 7 | 8 | .. automodule:: kospeech.models.conformer.model 9 | :members: 10 | 11 | Encoder 12 | -------------------------------------------- 13 | 14 | .. automodule:: kospeech.models.conformer.encoder 15 | :members: 16 | 17 | Decoder 18 | -------------------------------------------- 19 | 20 | .. automodule:: kospeech.models.rnnt.decoder 21 | :members: 22 | 23 | Modules 24 | -------------------------------------------- 25 | 26 | .. automodule:: kospeech.models.conformer.modules 27 | :members: -------------------------------------------------------------------------------- /docs/_sources/Conformer.rst.txt: -------------------------------------------------------------------------------- 1 | 2 | Conformer 3 | ===================================================== 4 | 5 | Conformer 6 | -------------------------------------------- 7 | 8 | .. automodule:: kospeech.models.conformer.model 9 | :members: 10 | 11 | Encoder 12 | -------------------------------------------- 13 | 14 | .. automodule:: kospeech.models.conformer.encoder 15 | :members: 16 | 17 | Decoder 18 | -------------------------------------------- 19 | 20 | .. automodule:: kospeech.models.rnnt.decoder 21 | :members: 22 | 23 | Modules 24 | -------------------------------------------- 25 | 26 | .. 
automodule:: kospeech.models.conformer.modules 27 | :members: -------------------------------------------------------------------------------- /docs/build/html/_sources/Conformer.rst.txt: -------------------------------------------------------------------------------- 1 | 2 | Conformer 3 | ===================================================== 4 | 5 | Conformer 6 | -------------------------------------------- 7 | 8 | .. automodule:: kospeech.models.conformer.model 9 | :members: 10 | 11 | Encoder 12 | -------------------------------------------- 13 | 14 | .. automodule:: kospeech.models.conformer.encoder 15 | :members: 16 | 17 | Decoder 18 | -------------------------------------------- 19 | 20 | .. automodule:: kospeech.models.rnnt.decoder 21 | :members: 22 | 23 | Modules 24 | -------------------------------------------- 25 | 26 | .. automodule:: kospeech.models.conformer.modules 27 | :members: -------------------------------------------------------------------------------- /test/test_rnnt_recognize.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | from kospeech.models.rnnt.model import RNNTransducer 5 | 6 | batch_size, sequence_length, dim = 3, 12345, 80 7 | 8 | cuda = torch.cuda.is_available() 9 | device = torch.device('cuda' if cuda else 'cpu') 10 | 11 | inputs = torch.rand(batch_size, sequence_length, dim).to(device) 12 | input_lengths = torch.IntTensor([12345, 12300, 12000]) 13 | 14 | model = nn.DataParallel(RNNTransducer( 15 | num_classes=10, 16 | input_dim=dim, 17 | num_encoder_layers=3, 18 | )).to(device) 19 | 20 | outputs = model.module.recognize(inputs, input_lengths) 21 | print(outputs) 22 | print(outputs.size()) 23 | print("PASS") 24 | -------------------------------------------------------------------------------- /test/test_conformer_recognize.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | from kospeech.models.conformer import Conformer 5 | 6 | batch_size, sequence_length, dim = 3, 12345, 80 7 | 8 | cuda = torch.cuda.is_available() 9 | device = torch.device('cuda' if cuda else 'cpu') 10 | 11 | inputs = torch.rand(batch_size, sequence_length, dim).to(device) 12 | input_lengths = torch.IntTensor([12345, 12300, 12000]) 13 | 14 | model = nn.DataParallel(Conformer( 15 | num_classes=10, 16 | input_dim=dim, 17 | encoder_dim=512, 18 | num_encoder_layers=3, 19 | device=device, 20 | )).to(device) 21 | 22 | outputs = model.module.recognize(inputs, input_lengths) 23 | print(outputs) 24 | print(outputs.size()) 25 | print("PASS") 26 | -------------------------------------------------------------------------------- /bin/kospeech/checkpoint/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, Soohwan Kim. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
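# Editor's illustrative sketch; not part of the original KoSpeech sources. The
# Checkpoint class re-exported below encapsulates saving and restoring training
# state. Its exact constructor and methods are not shown in this dump, so the
# equivalent plain-PyTorch pattern is sketched here instead; the helper names
# are the editor's own.
def _save_checkpoint_sketch(path, model, optimizer, epoch):
    """Persist everything needed to resume training (hypothetical helper)."""
    import torch
    torch.save(
        {"model": model.state_dict(), "optimizer": optimizer.state_dict(), "epoch": epoch},
        path,
    )


def _load_checkpoint_sketch(path, model, optimizer):
    """Restore a previously saved state and return the epoch to resume from."""
    import torch
    state = torch.load(path, map_location="cpu")
    model.load_state_dict(state["model"])
    optimizer.load_state_dict(state["optimizer"])
    return state["epoch"]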
14 | 15 | from kospeech.checkpoint.checkpoint import Checkpoint 16 | -------------------------------------------------------------------------------- /dataset/kspon/preprocess.sh: -------------------------------------------------------------------------------- 1 | # Author 2 | # Soohwan Kim, Seyoung Bae, Cheolhwang Won, Soyoung Cho, Jeongwon Kwak 3 | 4 | DATASET_PATH="SET_YOUR_DATASET_PATH" 5 | VOCAB_DEST='SET_LABELS_DESTINATION' 6 | OUTPUT_UNIT='character' # you can set character / subword / grapheme 7 | PREPROCESS_MODE='phonetic' # phonetic : 칠 십 퍼센트, spelling : 70% 8 | VOCAB_SIZE=5000 # if you use subword output unit, set vocab size 9 | 10 | echo "Pre-process KsponSpeech Dataset.." 11 | 12 | python main.py \ 13 | --dataset_path $DATASET_PATH \ 14 | --vocab_dest $VOCAB_DEST \ 15 | --output_unit $OUTPUT_UNIT \ 16 | --preprocess_mode $PREPROCESS_MODE \ 17 | --vocab_size $VOCAB_SIZE \ 18 | -------------------------------------------------------------------------------- /bin/kospeech/optim/lr_scheduler/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, Soohwan Kim. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from kospeech.optim.lr_scheduler.tri_stage_lr_scheduler import TriStageLRScheduler 16 | from kospeech.optim.lr_scheduler.transformer_lr_scheduler import TransformerLRScheduler -------------------------------------------------------------------------------- /docs/source/Data.rst: -------------------------------------------------------------------------------- 1 | 2 | Data 3 | ===================================================== 4 | 5 | Augment 6 | -------------------------------------------- 7 | .. automodule:: kospeech.data.audio.augment 8 | :members: 9 | 10 | Core 11 | -------------------------------------------- 12 | .. automodule:: kospeech.data.audio.core 13 | :members: 14 | 15 | Feature 16 | -------------------------------------------- 17 | .. automodule:: kospeech.data.audio.feature 18 | :members: 19 | 20 | Parser 21 | ------------------------------------------- 22 | .. automodule:: kospeech.data.audio.parser 23 | :members: 24 | 25 | DataLoader 26 | ------------------------------------------- 27 | .. automodule:: kospeech.data.data_loader 28 | :members: 29 | 30 | LabelLoader 31 | ------------------------------------------- 32 | .. automodule:: kospeech.data.label_loader 33 | :members: 34 | -------------------------------------------------------------------------------- /docs/_sources/Data.rst.txt: -------------------------------------------------------------------------------- 1 | 2 | Data 3 | ===================================================== 4 | 5 | Augment 6 | -------------------------------------------- 7 | .. automodule:: kospeech.data.audio.augment 8 | :members: 9 | 10 | Core 11 | -------------------------------------------- 12 | .. 
automodule:: kospeech.data.audio.core 13 | :members: 14 | 15 | Feature 16 | -------------------------------------------- 17 | .. automodule:: kospeech.data.audio.feature 18 | :members: 19 | 20 | Parser 21 | ------------------------------------------- 22 | .. automodule:: kospeech.data.audio.parser 23 | :members: 24 | 25 | DataLoader 26 | ------------------------------------------- 27 | .. automodule:: kospeech.data.data_loader 28 | :members: 29 | 30 | LabelLoader 31 | ------------------------------------------- 32 | .. automodule:: kospeech.data.label_loader 33 | :members: 34 | -------------------------------------------------------------------------------- /docs/build/html/_sources/Data.rst.txt: -------------------------------------------------------------------------------- 1 | 2 | Data 3 | ===================================================== 4 | 5 | Augment 6 | -------------------------------------------- 7 | .. automodule:: kospeech.data.audio.augment 8 | :members: 9 | 10 | Core 11 | -------------------------------------------- 12 | .. automodule:: kospeech.data.audio.core 13 | :members: 14 | 15 | Feature 16 | -------------------------------------------- 17 | .. automodule:: kospeech.data.audio.feature 18 | :members: 19 | 20 | Parser 21 | ------------------------------------------- 22 | .. automodule:: kospeech.data.audio.parser 23 | :members: 24 | 25 | DataLoader 26 | ------------------------------------------- 27 | .. automodule:: kospeech.data.data_loader 28 | :members: 29 | 30 | LabelLoader 31 | ------------------------------------------- 32 | .. automodule:: kospeech.data.label_loader 33 | :members: 34 | -------------------------------------------------------------------------------- /bin/kospeech/models/jasper/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021, Soohwan Kim. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
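# Editor's note (illustrative, not part of the original file): dataclass configs
# such as the JasperConfig defined just below are typically materialized through
# OmegaConf/Hydra, both of which appear in this project's dependencies. A hedged
# usage sketch, assuming the dataclass is importable from this package:
#
#     from omegaconf import OmegaConf
#     from kospeech.models.jasper import JasperConfig
#
#     cfg = OmegaConf.structured(JasperConfig)                           # schema-checked config
#     cfg = OmegaConf.merge(cfg, OmegaConf.from_dotlist(["version=5x3"]))
#     print(cfg.architecture, cfg.version)                               # -> jasper 5x3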
14 | 15 | 16 | from dataclasses import dataclass 17 | from kospeech.models import ModelConfig 18 | 19 | 20 | @dataclass 21 | class JasperConfig(ModelConfig): 22 | architecture: str = "jasper" 23 | version: str = "10x5" 24 | -------------------------------------------------------------------------------- /requirements_cssiri.txt: -------------------------------------------------------------------------------- 1 | antlr4-python3-runtime==4.8 2 | appdirs==1.4.4 3 | astropy==5.0 4 | audioread==2.1.9 5 | certifi==2021.10.8 6 | cffi==1.15.0 7 | charset-normalizer==2.0.9 8 | colorama==0.4.4 9 | decorator==5.1.0 10 | et-xmlfile==1.1.0 11 | idna==3.3 12 | joblib==1.1.0 13 | Levenshtein==0.16.0 14 | librosa==0.8.1 15 | llvmlite==0.37.0 16 | numba==0.54.1 17 | numpy==1.20.3 18 | omegaconf==2.1.1 19 | openpyxl==3.0.9 20 | packaging==21.3 21 | pandas==1.3.4 22 | pooch==1.5.2 23 | pycparser==2.21 24 | pyerfa==2.0.0.1 25 | pyparsing==3.0.6 26 | python-dateutil==2.8.2 27 | pytz==2021.3 28 | PyYAML==6.0 29 | rapidfuzz==1.8.3 30 | requests==2.26.0 31 | resampy==0.2.2 32 | scikit-learn==1.0.1 33 | scipy==1.7.3 34 | six==1.16.0 35 | SoundFile==0.10.3.post1 36 | threadpoolctl==3.0.0 37 | torch==1.10.0 38 | torchaudio==0.10.0 39 | tqdm==4.62.3 40 | typing_extensions==4.0.1 41 | urllib3==1.26.7 42 | wincertstore==0.2 43 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /bin/kospeech/criterion/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, Soohwan Kim. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
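# Editor's illustrative sketch, not part of the original file: the
# LabelSmoothedCrossEntropyLoss imported below implements label smoothing; this
# helper reproduces the idea with plain PyTorch so the computation is easy to
# see. The function name and exact formulation are the editor's, not KoSpeech's.
def _label_smoothed_nll_sketch(logits, targets, smoothing: float = 0.1):
    """logits: (N, C) unnormalized scores, targets: (N,) class indices."""
    import torch.nn.functional as F
    log_probs = F.log_softmax(logits, dim=-1)
    nll = -log_probs.gather(dim=-1, index=targets.unsqueeze(-1)).squeeze(-1)  # true-class NLL
    smooth = -log_probs.mean(dim=-1)                 # spreads a little mass over all classes
    return ((1.0 - smoothing) * nll + smoothing * smooth).mean()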
14 | 15 | from kospeech.criterion.label_smoothed_cross_entropy import LabelSmoothedCrossEntropyLoss 16 | from kospeech.criterion.joint_ctc_cross_entropy import JointCTCCrossEntropyLoss 17 | from kospeech.criterion.transducer import TransducerLoss 18 | -------------------------------------------------------------------------------- /docs/_static/js/badge_only.js: -------------------------------------------------------------------------------- 1 | !function(e){var t={};function r(n){if(t[n])return t[n].exports;var o=t[n]={i:n,l:!1,exports:{}};return e[n].call(o.exports,o,o.exports,r),o.l=!0,o.exports}r.m=e,r.c=t,r.d=function(e,t,n){r.o(e,t)||Object.defineProperty(e,t,{enumerable:!0,get:n})},r.r=function(e){"undefined"!=typeof Symbol&&Symbol.toStringTag&&Object.defineProperty(e,Symbol.toStringTag,{value:"Module"}),Object.defineProperty(e,"__esModule",{value:!0})},r.t=function(e,t){if(1&t&&(e=r(e)),8&t)return e;if(4&t&&"object"==typeof e&&e&&e.__esModule)return e;var n=Object.create(null);if(r.r(n),Object.defineProperty(n,"default",{enumerable:!0,value:e}),2&t&&"string"!=typeof e)for(var o in e)r.d(n,o,function(t){return e[t]}.bind(null,o));return n},r.n=function(e){var t=e&&e.__esModule?function(){return e.default}:function(){return e};return r.d(t,"a",t),t},r.o=function(e,t){return Object.prototype.hasOwnProperty.call(e,t)},r.p="",r(r.s=4)}({4:function(e,t,r){}}); -------------------------------------------------------------------------------- /docs/build/html/_static/js/badge_only.js: -------------------------------------------------------------------------------- 1 | !function(e){var t={};function r(n){if(t[n])return t[n].exports;var o=t[n]={i:n,l:!1,exports:{}};return e[n].call(o.exports,o,o.exports,r),o.l=!0,o.exports}r.m=e,r.c=t,r.d=function(e,t,n){r.o(e,t)||Object.defineProperty(e,t,{enumerable:!0,get:n})},r.r=function(e){"undefined"!=typeof Symbol&&Symbol.toStringTag&&Object.defineProperty(e,Symbol.toStringTag,{value:"Module"}),Object.defineProperty(e,"__esModule",{value:!0})},r.t=function(e,t){if(1&t&&(e=r(e)),8&t)return e;if(4&t&&"object"==typeof e&&e&&e.__esModule)return e;var n=Object.create(null);if(r.r(n),Object.defineProperty(n,"default",{enumerable:!0,value:e}),2&t&&"string"!=typeof e)for(var o in e)r.d(n,o,function(t){return e[t]}.bind(null,o));return n},r.n=function(e){var t=e&&e.__esModule?function(){return e.default}:function(){return e};return r.d(t,"a",t),t},r.o=function(e,t){return Object.prototype.hasOwnProperty.call(e,t)},r.p="",r(r.s=4)}({4:function(e,t,r){}}); -------------------------------------------------------------------------------- /bin/kospeech/data/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, Soohwan Kim. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
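# Editor's illustrative sketch, not part of the original file: the dataset and
# loader classes imported below feed models with acoustic features such as the
# 80-dimensional inputs used throughout test/. A minimal torchaudio equivalent
# of that front-end, assuming 16 kHz mono audio; the helper name is the editor's.
def _fbank_sketch(waveform, sample_rate: int = 16000):
    """waveform: (1, num_samples) float tensor -> (num_frames, 80) log-mel filterbank."""
    import torchaudio
    return torchaudio.compliance.kaldi.fbank(
        waveform,
        num_mel_bins=80,
        sample_frequency=float(sample_rate),
    )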
14 | 15 | from kospeech.data.audio.parser import SpectrogramParser 16 | from kospeech.data.label_loader import load_dataset 17 | from kospeech.data.data_loader import ( 18 | SpectrogramDataset, 19 | AudioDataLoader, 20 | MultiDataLoader, 21 | split_dataset, 22 | ) 23 | -------------------------------------------------------------------------------- /docs/source/index.rst: -------------------------------------------------------------------------------- 1 | .. KoSpeech documentation master file, created by 2 | sphinx-quickstart on Wed Mar 4 02:42:19 2020. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Welcome to KoSpeech's documentation! 7 | =========================================================== 8 | 9 | .. toctree:: 10 | :maxdepth: 1 11 | :caption: GETTING STARTED 12 | 13 | notes/intro 14 | notes/Preparation 15 | 16 | .. toctree:: 17 | :maxdepth: 1 18 | :caption: ARCHITECTURE 19 | 20 | Interface 21 | Deep Speech 2 22 | Listen Attend Spell 23 | RNN Transducer 24 | Speech Transformer 25 | Jasper 26 | Conformer 27 | Modules 28 | 29 | 30 | .. toctree:: 31 | :maxdepth: 1 32 | :caption: LIBRARY REFERENCE 33 | 34 | Checkpoint 35 | Criterion 36 | Data 37 | Decode 38 | Evaluator 39 | Optim 40 | Learning Rate Schedulers 41 | Trainer 42 | Vocabs 43 | Etc 44 | -------------------------------------------------------------------------------- /docs/_sources/index.rst.txt: -------------------------------------------------------------------------------- 1 | .. KoSpeech documentation master file, created by 2 | sphinx-quickstart on Wed Mar 4 02:42:19 2020. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Welcome to KoSpeech's documentation! 7 | =========================================================== 8 | 9 | .. toctree:: 10 | :maxdepth: 1 11 | :caption: GETTING STARTED 12 | 13 | notes/intro 14 | notes/Preparation 15 | 16 | .. toctree:: 17 | :maxdepth: 1 18 | :caption: ARCHITECTURE 19 | 20 | Interface 21 | Deep Speech 2 22 | Listen Attend Spell 23 | RNN Transducer 24 | Speech Transformer 25 | Jasper 26 | Conformer 27 | Modules 28 | 29 | 30 | .. toctree:: 31 | :maxdepth: 1 32 | :caption: LIBRARY REFERENCE 33 | 34 | Checkpoint 35 | Criterion 36 | Data 37 | Decode 38 | Evaluator 39 | Optim 40 | Learning Rate Schedulers 41 | Trainer 42 | Vocabs 43 | Etc 44 | -------------------------------------------------------------------------------- /docs/source/Speech Transformer.rst: -------------------------------------------------------------------------------- 1 | 2 | Speech Transformer 3 | ===================================================== 4 | 5 | Transformer 6 | -------------------------------------------- 7 | 8 | .. automodule:: kospeech.models.transformer.model 9 | :members: 10 | 11 | Encoder 12 | -------------------------------------------- 13 | 14 | .. automodule:: kospeech.models.transformer.encoder 15 | :members: 16 | 17 | Decoder 18 | -------------------------------------------- 19 | 20 | .. automodule:: kospeech.models.transformer.decoder 21 | :members: 22 | 23 | Sublayers 24 | -------------------------------------------- 25 | 26 | .. automodule:: kospeech.models.transformer.sublayers 27 | :members: 28 | 29 | Embeddings 30 | -------------------------------------------- 31 | 32 | .. automodule:: kospeech.models.transformer.embeddings 33 | :members: 34 | 35 | Mask 36 | -------------------------------------------- 37 | 38 | .. 
automodule:: kospeech.models.transformer.mask 39 | :members: -------------------------------------------------------------------------------- /docs/_sources/Speech Transformer.rst.txt: -------------------------------------------------------------------------------- 1 | 2 | Speech Transformer 3 | ===================================================== 4 | 5 | Transformer 6 | -------------------------------------------- 7 | 8 | .. automodule:: kospeech.models.transformer.model 9 | :members: 10 | 11 | Encoder 12 | -------------------------------------------- 13 | 14 | .. automodule:: kospeech.models.transformer.encoder 15 | :members: 16 | 17 | Decoder 18 | -------------------------------------------- 19 | 20 | .. automodule:: kospeech.models.transformer.decoder 21 | :members: 22 | 23 | Sublayers 24 | -------------------------------------------- 25 | 26 | .. automodule:: kospeech.models.transformer.sublayers 27 | :members: 28 | 29 | Embeddings 30 | -------------------------------------------- 31 | 32 | .. automodule:: kospeech.models.transformer.embeddings 33 | :members: 34 | 35 | Mask 36 | -------------------------------------------- 37 | 38 | .. automodule:: kospeech.models.transformer.mask 39 | :members: -------------------------------------------------------------------------------- /docs/build/html/_sources/index.rst.txt: -------------------------------------------------------------------------------- 1 | .. KoSpeech documentation master file, created by 2 | sphinx-quickstart on Wed Mar 4 02:42:19 2020. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Welcome to KoSpeech's documentation! 7 | =========================================================== 8 | 9 | .. toctree:: 10 | :maxdepth: 1 11 | :caption: GETTING STARTED 12 | 13 | notes/intro 14 | notes/Preparation 15 | 16 | .. toctree:: 17 | :maxdepth: 1 18 | :caption: ARCHITECTURE 19 | 20 | Interface 21 | Deep Speech 2 22 | Listen Attend Spell 23 | RNN Transducer 24 | Speech Transformer 25 | Jasper 26 | Conformer 27 | Modules 28 | 29 | 30 | .. toctree:: 31 | :maxdepth: 1 32 | :caption: LIBRARY REFERENCE 33 | 34 | Checkpoint 35 | Criterion 36 | Data 37 | Decode 38 | Evaluator 39 | Optim 40 | Learning Rate Schedulers 41 | Trainer 42 | Vocabs 43 | Etc 44 | -------------------------------------------------------------------------------- /docs/build/html/_sources/Speech Transformer.rst.txt: -------------------------------------------------------------------------------- 1 | 2 | Speech Transformer 3 | ===================================================== 4 | 5 | Transformer 6 | -------------------------------------------- 7 | 8 | .. automodule:: kospeech.models.transformer.model 9 | :members: 10 | 11 | Encoder 12 | -------------------------------------------- 13 | 14 | .. automodule:: kospeech.models.transformer.encoder 15 | :members: 16 | 17 | Decoder 18 | -------------------------------------------- 19 | 20 | .. automodule:: kospeech.models.transformer.decoder 21 | :members: 22 | 23 | Sublayers 24 | -------------------------------------------- 25 | 26 | .. automodule:: kospeech.models.transformer.sublayers 27 | :members: 28 | 29 | Embeddings 30 | -------------------------------------------- 31 | 32 | .. automodule:: kospeech.models.transformer.embeddings 33 | :members: 34 | 35 | Mask 36 | -------------------------------------------- 37 | 38 | .. 
automodule:: kospeech.models.transformer.mask 39 | :members: -------------------------------------------------------------------------------- /bin/kospeech/models/deepspeech2/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, Soohwan Kim. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from dataclasses import dataclass 16 | from kospeech.models import ModelConfig 17 | 18 | 19 | @dataclass 20 | class DeepSpeech2Config(ModelConfig): 21 | architecture: str = "deepspeech2" 22 | use_bidirectional: bool = True 23 | rnn_type: str = "gru" 24 | hidden_dim: int = 1024 25 | activation: str = "hardtanh" 26 | num_encoder_layers: int = 3 27 | -------------------------------------------------------------------------------- /bin/kospeech/evaluator/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, Soohwan Kim. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from dataclasses import dataclass 16 | 17 | 18 | @dataclass 19 | class EvalConfig: 20 | dataset: str = 'kspon' 21 | dataset_path: str = '' 22 | transcripts_path: str = '../../../data/eval_transcript.txt' 23 | model_path: str = '' 24 | output_unit: str = 'character' 25 | batch_size: int = 32 26 | num_workers: int = 4 27 | print_every: int = 20 28 | decode: str = 'greedy' 29 | k: int = 3 30 | use_cuda: bool = True -------------------------------------------------------------------------------- /bin/kospeech/models/rnnt/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021, Soohwan Kim. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
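# Editor's note (illustrative, not part of the original file): models built from
# the RNNTransducerConfig below are trained with an RNN-T (transducer) criterion.
# One readily available implementation is torchaudio.transforms.RNNTLoss
# (torchaudio is pinned to 0.10.0 in requirements_cssiri.txt); a hedged usage
# sketch, with all shapes and dtypes being the editor's assumptions:
#
#     import torch
#     import torchaudio
#
#     rnnt_loss = torchaudio.transforms.RNNTLoss(blank=0)
#     logits = torch.randn(2, 50, 11, 32, requires_grad=True)        # (B, T, U + 1, vocab)
#     targets = torch.randint(1, 32, (2, 10), dtype=torch.int32)     # (B, U), blank excluded
#     logit_lengths = torch.tensor([50, 45], dtype=torch.int32)
#     target_lengths = torch.tensor([10, 8], dtype=torch.int32)
#     loss = rnnt_loss(logits, targets, logit_lengths, target_lengths)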
14 | 15 | from dataclasses import dataclass 16 | 17 | 18 | @dataclass 19 | class RNNTransducerConfig: 20 | architecture: str = "rnnt" 21 | num_encoder_layers: int = 4 22 | num_decoder_layers: int = 1 23 | encoder_hidden_state_dim: int = 320 24 | decoder_hidden_state_dim: int = 512 25 | output_dim: int = 512 26 | rnn_type: str = "lstm" 27 | bidirectional: bool = True 28 | encoder_dropout_p: float = 0.2 29 | decoder_dropout_p: float = 0.2 30 | -------------------------------------------------------------------------------- /dataset/kspon/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Soohwan Kim 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /test/model_architecture.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, Soohwan Kim. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from kospeech.models.las.encoder import Listener 16 | from kospeech.models.las.decoder import Speller 17 | from kospeech.models.las.model import ListenAttendSpell 18 | from kospeech.models.transformer.model import SpeechTransformer 19 | 20 | 21 | encoder = Listener(80, 512, 'cpu') 22 | decoder = Speller(2038, 151, 1024, 1, 2) 23 | model = ListenAttendSpell(encoder, decoder) 24 | 25 | print(model) 26 | 27 | 28 | model = SpeechTransformer(num_classes=2038, num_encoder_layers=3, num_decoder_layers=3) 29 | print(model) 30 | -------------------------------------------------------------------------------- /test/cer_visualize.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, Soohwan Kim. All rights reserved. 
2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import pandas as pd 16 | import matplotlib.pyplot as plt 17 | 18 | RGB = (0.4157, 0.2784, 0.3333) 19 | TRAIN_RESULT_PATH = '../data/train_result/train_step_result.csv' 20 | 21 | train_result = pd.read_csv(TRAIN_RESULT_PATH, delimiter=',', encoding='cp949') 22 | losses = train_result['loss'] 23 | cers = train_result['cer'] 24 | 25 | plt.title('Visualization of training (cer)') 26 | plt.plot(cers, color=RGB, label='cers') 27 | plt.xlabel('step (unit : 1000)', fontsize='x-large') 28 | plt.ylabel('cer', fontsize='x-large') 29 | plt.show() 30 | -------------------------------------------------------------------------------- /bin/kospeech/vocabs/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, Soohwan Kim. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | class Vocabulary(object): 17 | """ 18 | Note: 19 | Do not use this class directly, use one of the sub classes. 20 | """ 21 | def __init__(self, *args, **kwargs): 22 | self.sos_id = None 23 | self.eos_id = None 24 | self.pad_id = None 25 | self.blank_id = None 26 | 27 | def label_to_string(self, labels): 28 | raise NotImplementedError 29 | 30 | 31 | from kospeech.vocabs.ksponspeech import KsponSpeechVocabulary 32 | from kospeech.vocabs.librispeech import LibriSpeechVocabulary 33 | -------------------------------------------------------------------------------- /test/loss_visualize.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, Soohwan Kim. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
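# Editor's note (illustrative, not part of the original script): the Vocabulary
# base class shown above (bin/kospeech/vocabs/__init__.py) only fixes the
# interface; a minimal concrete subclass, with a made-up id-to-character table,
# could look like this:
#
#     from kospeech.vocabs import Vocabulary
#
#     class ToyVocabulary(Vocabulary):
#         def __init__(self):
#             super().__init__()
#             self.id2char = {0: '<pad>', 1: '<sos>', 2: '<eos>', 3: 'a', 4: 'b'}
#             self.sos_id, self.eos_id, self.pad_id = 1, 2, 0
#
#         def label_to_string(self, labels):
#             specials = (self.sos_id, self.eos_id, self.pad_id)
#             return ''.join(self.id2char[int(l)] for l in labels if int(l) not in specials)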
14 | 15 | import pandas as pd 16 | import matplotlib.pyplot as plt 17 | 18 | RGB = (0.4157, 0.2784, 0.3333) 19 | TRAIN_RESULT_PATH = '../data/train_result/train_step_result.csv' 20 | 21 | 22 | train_result = pd.read_csv(TRAIN_RESULT_PATH, delimiter=',', encoding='cp949') 23 | losses = train_result['loss'] 24 | cers = train_result['cer'] 25 | 26 | 27 | plt.title('Visualization of training (loss)') 28 | plt.plot(losses, color=RGB, label='losses') 29 | plt.xlabel('step (unit : 1000)', fontsize='x-large') 30 | plt.ylabel('L2norm', fontsize='x-large') 31 | plt.show() 32 | -------------------------------------------------------------------------------- /test/test_rnnt.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | from kospeech.models.rnnt.model import RNNTransducer 5 | 6 | batch_size, sequence_length, dim = 3, 12345, 80 7 | 8 | cuda = torch.cuda.is_available() 9 | device = torch.device('cuda' if cuda else 'cpu') 10 | 11 | model = nn.DataParallel(RNNTransducer( 12 | num_classes=10, 13 | input_dim=dim, 14 | num_encoder_layers=3, 15 | )).to(device) 16 | 17 | criterion = nn.CTCLoss(blank=3, zero_infinity=True) 18 | optimizer = torch.optim.Adam(model.parameters(), lr=1e-04) 19 | 20 | for i in range(10): 21 | inputs = torch.rand(batch_size, sequence_length, dim).to(device) 22 | input_lengths = torch.IntTensor([12345, 12300, 12000]) 23 | targets = torch.LongTensor([[1, 3, 3, 3, 3, 3, 4, 5, 6, 2], 24 | [1, 3, 3, 3, 3, 3, 4, 5, 2, 0], 25 | [1, 3, 3, 3, 3, 3, 4, 2, 0, 0]]).to(device) 26 | target_lengths = torch.LongTensor([9, 8, 7]) 27 | 28 | outputs = model(inputs, input_lengths, targets, target_lengths) 29 | print(outputs.size()) 30 | print("PASS") 31 | # loss = criterion(outputs.transpose(0, 1), targets[:, 1:], output_lengths, target_lengths) 32 | # loss.backward() 33 | # optimizer.step() 34 | # print(loss) 35 | -------------------------------------------------------------------------------- /test/test_deepspeech2_recognize.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, Soohwan Kim. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
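# Editor's illustrative sketch, not part of the original test: the commented-out
# criterion call in test_rnnt.py above (and test_conformer.py below) relies on
# nn.CTCLoss, which expects time-first (T, N, C) log-probabilities plus per-sample
# lengths, hence the transpose(0, 1). Assuming the model emits batch-first
# log-probabilities and their frame lengths, the call would look like this; the
# helper name is the editor's own.
def _ctc_loss_sketch(log_probs, targets, output_lengths, target_lengths, blank: int = 3):
    """log_probs: (N, T, C) log-probabilities, targets: (N, S) label indices."""
    import torch.nn as nn
    criterion = nn.CTCLoss(blank=blank, zero_infinity=True)
    return criterion(log_probs.transpose(0, 1), targets, output_lengths, target_lengths)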
14 | 15 | import torch 16 | from kospeech.models import DeepSpeech2 17 | 18 | batch_size = 3 19 | sequence_length = 14321 20 | dimension = 80 21 | 22 | cuda = torch.cuda.is_available() 23 | device = torch.device('cuda' if cuda else 'cpu') 24 | 25 | inputs = torch.rand(batch_size, sequence_length, dimension).to(device) # BxTxD 26 | input_lengths = torch.LongTensor([14321, 14300, 13000]).to(device) 27 | 28 | print("Deep Speech 2 Model Test..") 29 | model = DeepSpeech2(num_classes=10, input_dim=dimension).to(device) 30 | output = model.recognize(inputs, input_lengths) 31 | 32 | print(output) 33 | print(output.size()) 34 | -------------------------------------------------------------------------------- /test/test_conformer.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | from kospeech.models.conformer import Conformer 5 | 6 | batch_size, sequence_length, dim = 3, 12345, 80 7 | 8 | cuda = torch.cuda.is_available() 9 | device = torch.device('cuda' if cuda else 'cpu') 10 | 11 | model = nn.DataParallel(Conformer( 12 | num_classes=10, 13 | input_dim=dim, 14 | encoder_dim=32, 15 | num_encoder_layers=3, 16 | decoder_dim=32, 17 | device=device, 18 | )).to(device) 19 | 20 | criterion = nn.CTCLoss(blank=3, zero_infinity=True) 21 | optimizer = torch.optim.Adam(model.parameters(), lr=1e-04) 22 | 23 | for i in range(10): 24 | inputs = torch.rand(batch_size, sequence_length, dim).to(device) 25 | input_lengths = torch.IntTensor([12345, 12300, 12000]) 26 | targets = torch.LongTensor([[1, 3, 3, 3, 3, 3, 4, 5, 6, 2], 27 | [1, 3, 3, 3, 3, 3, 4, 5, 2, 0], 28 | [1, 3, 3, 3, 3, 3, 4, 2, 0, 0]]).to(device) 29 | target_lengths = torch.LongTensor([9, 8, 7]) 30 | 31 | outputs = model(inputs, input_lengths, targets, target_lengths) 32 | print("PASS") 33 | # loss = criterion(outputs.transpose(0, 1), targets[:, 1:], output_lengths, target_lengths) 34 | # loss.backward() 35 | # optimizer.step() 36 | # print(loss) 37 | -------------------------------------------------------------------------------- /bin/kospeech/optim/lr_scheduler/lr_scheduler.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, Soohwan Kim. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | class LearningRateScheduler(object): 17 | """ 18 | Provides interface of learning rate scheduler. 19 | 20 | Note: 21 | Do not use this class directly, use one of the sub classes. 
22 | """ 23 | def __init__(self, optimizer, init_lr): 24 | self.optimizer = optimizer 25 | self.init_lr = init_lr 26 | 27 | def step(self, *args, **kwargs): 28 | raise NotImplementedError 29 | 30 | @staticmethod 31 | def set_lr(optimizer, lr): 32 | for g in optimizer.param_groups: 33 | g['lr'] = lr 34 | 35 | def get_lr(self): 36 | for g in self.optimizer.param_groups: 37 | return g['lr'] -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, Soohwan Kim. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from distutils.core import setup 16 | 17 | setup( 18 | name='KoSpeech', 19 | version='latest', 20 | description='Open-Source Toolkit for End-to-End Korean Speech Recognition', 21 | author='Soohwan Kim', 22 | author_email='kaki.ai@tunib.ai', 23 | url='https://github.com/sooftware/KoSpeech', 24 | install_requires=[ 25 | 'torch>=1.4.0', 26 | 'python-Levenshtein', 27 | 'librosa >= 0.7.0', 28 | 'numpy', 29 | 'pandas', 30 | 'tqdm', 31 | 'matplotlib', 32 | 'astropy', 33 | 'sentencepiece', 34 | 'hydra-core', 35 | ], 36 | keywords=['asr', 'speech_recognition', 'korean'], 37 | python_requires='>=3', 38 | ) 39 | -------------------------------------------------------------------------------- /test/test_las_decoder.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021, Soohwan Kim. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | import torch 16 | from kospeech.models.las.decoder import DecoderRNN 17 | 18 | cuda = torch.cuda.is_available() 19 | device = torch.device('cuda' if cuda else 'cpu') 20 | 21 | inputs = torch.LongTensor([[1, 1, 2], [3, 4, 2], [7, 2, 0]]) 22 | encoder_outputs = torch.rand(3, 100, 32) 23 | 24 | decoder = DecoderRNN(num_classes=10, hidden_state_dim=32, max_length=10) 25 | decoder_outputs = decoder(inputs, encoder_outputs, teacher_forcing_ratio=1.0) 26 | print("teacher_forcing_ratio=1.0 PASS") 27 | 28 | decoder = DecoderRNN(num_classes=10, hidden_state_dim=32, max_length=10) 29 | decoder_outputs = decoder(inputs, encoder_outputs, teacher_forcing_ratio=0.0) 30 | print("teacher_forcing_ratio=0.0 PASS") 31 | -------------------------------------------------------------------------------- /bin/kospeech/models/transformer/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, Soohwan Kim. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from dataclasses import dataclass 16 | from kospeech.models import ModelConfig 17 | 18 | 19 | @dataclass 20 | class TransformerConfig(ModelConfig): 21 | architecture: str = "transformer" 22 | extractor: str = "vgg" 23 | use_bidirectional: bool = True 24 | dropout: float = 0.3 25 | d_model: int = 512 26 | d_ff: int = 2048 27 | num_heads: int = 8 28 | num_encoder_layers: int = 12 29 | num_decoder_layers: int = 6 30 | 31 | 32 | @dataclass 33 | class JointCTCAttentionTransformerConfig(TransformerConfig): 34 | cross_entropy_weight: float = 0.7 35 | ctc_weight: float = 0.3 36 | mask_conv: bool = True 37 | joint_ctc_attention: bool = True 38 | -------------------------------------------------------------------------------- /data/train_result/train_step_result.csv: -------------------------------------------------------------------------------- 1 | loss,cer 2 | 0.49725567911757396,1.3785942282178933 3 | 0.4320526512272614,1.1382519868779264 4 | 0.40069148419866835,1.0359303697242361 5 | 0.38132347770707137,0.9773902462637962 6 | 0.366732018967894,0.9358039262938204 7 | 0.35402797622241,0.9017611604593339 8 | 0.342316426582415,0.8695670980202619 9 | 0.3312905454609975,0.8388701831709967 10 | 0.32115954891959464,0.8100443994182784 11 | 0.3118217632253063,0.7826462731244754 12 | 0.30338589655017695,0.756859449840871 13 | 0.2956769136612572,0.7332592623465659 14 | 0.2886054592627544,0.7115002051414936 15 | 0.28227339170816595,0.691827295638539 16 | 0.2764920089726153,0.6736116550939593 17 | 0.27115296381537046,0.6566396600963952 18 | 0.26627046029677365,0.6409298328194061 19 | 0.2618086007187791,0.6263652122748918 20 | 0.2576973859263268,0.6128638668158448 21 | 0.25388669457476953,0.6003908023350812 22 | 0.2504016133631766,0.5888744727804983 23 | 0.24708599731583045,0.5779467286271303 24 | 0.24395953697892694,0.5675731793954828 25 | 0.24105681277818689,0.5580052233482057 26 | 0.23831809159150824,0.5490285275196444 27 | 
0.23573360881928176,0.5403110411146902 28 | 0.23329234001609772,0.5319247671046639 29 | 0.23100109575536898,0.5241830523736276 30 | 0.22887273296913221,0.5169182535522262 31 | 0.2267568045013077,0.5097991735342743 32 | 0.22478250193826105,0.5030199298264065 33 | 0.22290241163138483,0.49655470625483017 34 | 0.22113228150091996,0.4905992570502586 35 | 0.21945243381020205,0.4848017214126452 36 | -------------------------------------------------------------------------------- /test/test_jasper_recognize.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021, Soohwan Kim. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import torch 16 | from kospeech.models.jasper.model import Jasper 17 | 18 | batch_size = 3 19 | sequence_length = 14321 20 | dimension = 80 21 | 22 | cuda = torch.cuda.is_available() 23 | device = torch.device('cuda' if cuda else 'cpu') 24 | 25 | inputs = torch.rand(batch_size, sequence_length, dimension).to(device) # BxTxD 26 | input_lengths = torch.LongTensor([14321, 14300, 13000]).to(device) 27 | 28 | print("Jasper 10x3 Model Test..") 29 | model = Jasper(num_classes=10, version='10x5').to(device) 30 | output = model.recognize(inputs, input_lengths) 31 | 32 | print(output) 33 | print(output.size()) 34 | 35 | print("Jasper 5x3 Model Test..") 36 | model = Jasper(num_classes=10, version='5x3').to(device) 37 | output = model.recognize(inputs, input_lengths) 38 | 39 | print(output) 40 | print(output.size()) 41 | -------------------------------------------------------------------------------- /test/test_las_recognize.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, Soohwan Kim. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
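The train_step_result.csv shown above logs one (loss, cer) pair per logging step, so it can be inspected directly with pandas and matplotlib. A minimal sketch, assuming the file sits at data/train_result/train_step_result.csv as in this repository layout:

import pandas as pd
import matplotlib.pyplot as plt

df = pd.read_csv('data/train_result/train_step_result.csv')  # columns: loss, cer

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 4))
ax1.plot(df['loss'])
ax1.set_xlabel('step')
ax1.set_ylabel('loss')
ax2.plot(df['cer'])
ax2.set_xlabel('step')
ax2.set_ylabel('cer')
plt.tight_layout()
plt.show()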
14 | 15 | import torch 16 | 17 | from kospeech.models import ListenAttendSpell 18 | from kospeech.models.las.encoder import EncoderRNN 19 | from kospeech.models.las.decoder import DecoderRNN 20 | 21 | B, T, D, H = 3, 12345, 80, 32 22 | 23 | cuda = torch.cuda.is_available() 24 | device = torch.device('cuda' if cuda else 'cpu') 25 | 26 | inputs = torch.rand(B, T, D).to(device) 27 | input_lengths = torch.IntTensor([T, T - 100, T - 1000]) 28 | targets = torch.LongTensor([[1, 1, 2], [3, 4, 2], [7, 2, 0]]) 29 | 30 | model = ListenAttendSpell( 31 | input_dim=D, 32 | num_classes=10, 33 | encoder_hidden_state_dim=H, 34 | decoder_hidden_state_dim=H << 1, 35 | bidirectional=True, 36 | max_length=10, 37 | ).to(device) 38 | 39 | outputs = model.recognize(inputs, input_lengths) 40 | print(outputs.size()) 41 | print("LAS Recognize PASS") 42 | -------------------------------------------------------------------------------- /dataset/libri/prepare-libri.sh: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Soohwan Kim, Seyoung Bae, Cheolhwang Won. 3 | # @ArXiv : KoSpeech: Open-Source Toolkit for End-to-End Korean Speech Recognition 4 | # This source code is licensed under the Apache 2.0 License license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | # $1 : DIR_TO_SAVE_DATA 8 | 9 | base_url=www.openslr.org/resources/12 10 | train_dir=train_960 11 | vocab_size=5000 12 | 13 | if [ "$#" -ne 1 ]; then 14 | echo "Usage: $0 >" 15 | exit 1 16 | fi 17 | 18 | download_dir=${1%/} 19 | 20 | echo "Data Download" 21 | for part in dev-clean test-clean dev-other test-other train-clean-100 train-clean-360 train-other-500; do 22 | url=$base_url/$part.tar.gz 23 | if ! wget -P $download_dir $url; then 24 | echo "$0: wget failed for $url" 25 | exit 1 26 | fi 27 | if ! tar -C $download_dir -xvzf $download_dir/$part.tar.gz; then 28 | echo "$0: error un-tarring archive $download_dir/$part.tar.gz" 29 | exit 1 30 | fi 31 | done 32 | 33 | echo "Merge all train packs into one" 34 | mkdir -p ${download_dir}/LibriSpeech/${train_dir}/ 35 | for part in train-clean-100 train-clean-360 train-other-500; do 36 | mv ${download_dir}/LibriSpeech/${part}/* $download_dir/LibriSpeech/${train_dir}/ 37 | done 38 | 39 | python prepare-libri.py --dataset_path $1/LibriSpeech --vocab_size $vocab_size 40 | 41 | for part in dev-clean test-clean dev-other test-other train-clean-100 train-clean-360 train-other-500; do 42 | rm $part.tar.gz 43 | done -------------------------------------------------------------------------------- /test/test_las_encoder.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, Soohwan Kim. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | import torch 16 | from kospeech.models.las.encoder import EncoderRNN 17 | 18 | inputs = torch.rand(3, 12345, 80) 19 | input_lengths = torch.IntTensor([12345, 12300, 12000]) 20 | 21 | encoder = EncoderRNN(input_dim=80, hidden_state_dim=32, joint_ctc_attention=False, extractor='vgg') 22 | encoder_outputs, encoder_log_probs, encoder_output_lengths = encoder(inputs, input_lengths) 23 | print("joint_ctc_attention=False PASS, VGGExtractor") 24 | 25 | encoder = EncoderRNN(input_dim=80, hidden_state_dim=32, joint_ctc_attention=False, extractor='ds2') 26 | encoder_outputs, encoder_output_lengths, encoder_log_probs = encoder(inputs, input_lengths) 27 | print("joint_ctc_attention=False PASS, DeepSpeech2Extractor") 28 | 29 | encoder = EncoderRNN(input_dim=80, hidden_state_dim=32, num_classes=2, joint_ctc_attention=True) 30 | encoder_outputs, encoder_output_lengths, encoder_log_probs = encoder(inputs, input_lengths) 31 | print("joint_ctc_attention=True PASS") 32 | -------------------------------------------------------------------------------- /test/test_transformer_recognize.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, Soohwan Kim. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import torch 16 | 17 | from kospeech.model_builder import build_transformer 18 | from kospeech.models import SpeechTransformer 19 | 20 | batch_size = 4 21 | seq_length = 200 22 | input_size = 80 23 | 24 | cuda = torch.cuda.is_available() 25 | device = torch.device('cuda' if cuda else 'cpu') 26 | 27 | transformer = build_transformer( 28 | num_classes=10, 29 | d_model=16, 30 | d_ff=32, 31 | num_heads=2, 32 | input_dim=input_size, 33 | num_encoder_layers=3, 34 | num_decoder_layers=2, 35 | extractor='vgg', 36 | dropout_p=0.1, 37 | device=device, 38 | pad_id=0, 39 | sos_id=1, 40 | eos_id=2, 41 | joint_ctc_attention=False, 42 | max_length=10, 43 | ) 44 | 45 | inputs = torch.FloatTensor(batch_size, seq_length, input_size).to(device) 46 | input_lengths = torch.LongTensor([seq_length, seq_length - 10, seq_length - 20, seq_length - 30]).to(device) 47 | 48 | output = transformer.module.recognize(inputs, input_lengths) 49 | print(output.size()) 50 | print("Test Transformer PASS") 51 | -------------------------------------------------------------------------------- /bin/kospeech/models/activation.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021, Soohwan Kim. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import torch.nn as nn 16 | from torch import Tensor 17 | 18 | 19 | class Swish(nn.Module): 20 | """ 21 | Swish is a smooth, non-monotonic function that consistently matches or outperforms ReLU on deep networks applied 22 | to a variety of challenging domains such as Image classification and Machine translation. 23 | """ 24 | 25 | def __init__(self): 26 | super(Swish, self).__init__() 27 | 28 | def forward(self, inputs: Tensor) -> Tensor: 29 | return inputs * inputs.sigmoid() 30 | 31 | 32 | class GLU(nn.Module): 33 | """ 34 | The gating mechanism is called Gated Linear Units (GLU), which was first introduced for natural language processing 35 | in the paper “Language Modeling with Gated Convolutional Networks” 36 | """ 37 | 38 | def __init__(self, dim: int) -> None: 39 | super(GLU, self).__init__() 40 | self.dim = dim 41 | 42 | def forward(self, inputs: Tensor) -> Tensor: 43 | outputs, gate = inputs.chunk(2, dim=self.dim) 44 | return outputs * gate.sigmoid() 45 | -------------------------------------------------------------------------------- /data/vocab/aihub_grapheme_vocabs.csv: -------------------------------------------------------------------------------- 1 | id,grpm,freq 2 | 0,,0 3 | 1,,0 4 | 2,,0 5 | 3,|,5774244 6 | 4,ᅡ,3560117 7 | 5,ᄋ,3329997 8 | 6,ᄀ,2776207 9 | 7,ᅵ,2171847 10 | 8,ᆫ,2099990 11 | 9,ᅳ,2005508 12 | 10,ᅥ,1933125 13 | 11,ᅩ,1322658 14 | 12,ᄂ,1267792 15 | 13,ᄃ,1256854 16 | 14,ᄌ,1088700 17 | 15,ᆯ,990624 18 | 16,ᄅ,940940 19 | 17,ᄉ,920582 20 | 18,ᅦ,899879 21 | 19,ᄆ,891270 22 | 20,ᄒ,802845 23 | 21,ᅢ,797078 24 | 22,.,640923 25 | 23,ᅮ,613109 26 | 24,ᆼ,601838 27 | 25,ᆨ,523735 28 | 26,ᅧ,475473 29 | 27,ᄇ,473425 30 | 28,ᆻ,467303 31 | 29,ᆷ,415188 32 | 30,ᅣ,263813 33 | 31,?,235024 34 | 32,ᄎ,213755 35 | 33,ᄁ,194941 36 | 34,ᅯ,194464 37 | 35,ᄄ,165687 38 | 36,ᅪ,153572 39 | 37,ᆭ,144828 40 | 38,ᆸ,143910 41 | 39,ᄐ,141767 42 | 40,ᅬ,117111 43 | 41,ᄍ,114604 44 | 42,ᄑ,113442 45 | 43,ᆺ,110034 46 | 44,ᇂ,107712 47 | 45,ᅭ,104403 48 | 46,ᇀ,95003 49 | 47,ᄏ,82797 50 | 48,ᅫ,58525 51 | 49,ᄊ,52923 52 | 50,ᆹ,49619 53 | 51,ᅤ,48322 54 | 52,ᅨ,45861 55 | 53,ᆽ,44775 56 | 54,ᄈ,43872 57 | 55,ᅲ,40767 58 | 56,ᅱ,36498 59 | 57,ᇁ,33446 60 | 58,ᅴ,27363 61 | 59,ᆮ,24670 62 | 60,ᆩ,10787 63 | 61,ᆾ,10627 64 | 62,ᆶ,8431 65 | 63,ᆰ,5339 66 | 64,ᆲ,3841 67 | 65,ᅰ,2907 68 | 66,C,2704 69 | 67,ᆱ,2658 70 | 68,T,2248 71 | 69,ᆬ,1971 72 | 70,S,1844 73 | 71,P,1620 74 | 72,V,1159 75 | 73,A,1156 76 | 74,M,1087 77 | 75,B,1057 78 | 76,G,940 79 | 77,D,843 80 | 78,!,841 81 | 79,O,757 82 | 80,N,748 83 | 81,R,740 84 | 82,I,735 85 | 83,K,654 86 | 84,L,626 87 | 85,E,527 88 | 86,F,496 89 | 87,J,372 90 | 88,X,366 91 | 89,U,315 92 | 90,Y,231 93 | 91,H,197 94 | 92,ᆿ,114 95 | 93,ᆴ,87 96 | 94,W,80 97 | 95,ᆪ,63 98 | 96,Q,55 99 | 97,2,27 100 | 98,1,22 101 | 99,3,13 102 | 100,ᆵ,12 103 | 101,Z,11 104 | 102,4,8 105 | 103,0,6 106 | 104,8,5 107 | 105,5,5 108 | 106,6,2 109 | 107,7,1 110 | -------------------------------------------------------------------------------- /bin/kospeech/models/las/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, 
Soohwan Kim. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from dataclasses import dataclass 16 | 17 | from kospeech.models import ModelConfig 18 | from kospeech.models.las.encoder import EncoderRNN 19 | from kospeech.models.las.decoder import DecoderRNN 20 | 21 | 22 | @dataclass 23 | class ListenAttendSpellConfig(ModelConfig): 24 | architecture: str = "las" 25 | use_bidirectional: bool = True 26 | dropout: float = 0.3 27 | num_heads: int = 4 28 | num_encoder_layers: int = 3 29 | num_decoder_layers: int = 2 30 | rnn_type: str = "lstm" 31 | hidden_dim: int = 512 32 | teacher_forcing_ratio: float = 1.0 33 | attn_mechanism: str = "multi-head" 34 | teacher_forcing_step: float = 0.01 35 | min_teacher_forcing_ratio: float = 0.9 36 | extractor: str = "vgg" 37 | activation: str = "hardtanh" 38 | mask_conv: bool = False 39 | joint_ctc_attention: bool = False 40 | 41 | 42 | @dataclass 43 | class JointCTCAttentionLASConfig(ListenAttendSpellConfig): 44 | hidden_dim: int = 768 45 | cross_entropy_weight: float = 0.7 46 | ctc_weight: float = 0.3 47 | mask_conv: bool = True 48 | joint_ctc_attention: bool = True -------------------------------------------------------------------------------- /test/test_jasper.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021, Soohwan Kim. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | import torch 16 | import torch.nn as nn 17 | 18 | from kospeech.models.jasper.model import Jasper 19 | 20 | batch_size = 3 21 | sequence_length = 14321 22 | dimension = 80 23 | 24 | cuda = torch.cuda.is_available() 25 | device = torch.device('cuda' if cuda else 'cpu') 26 | 27 | model = Jasper(num_classes=10, version='5x3').to(device) 28 | 29 | criterion = nn.CTCLoss(blank=3, zero_infinity=True) 30 | optimizer = torch.optim.Adam(model.parameters(), lr=1e-04) 31 | 32 | for i in range(10): 33 | inputs = torch.rand(batch_size, sequence_length, dimension).to(device) 34 | input_lengths = torch.IntTensor([12345, 12300, 12000]) 35 | targets = torch.LongTensor([[1, 3, 3, 3, 3, 3, 4, 5, 6, 2], 36 | [1, 3, 3, 3, 3, 3, 4, 5, 2, 0], 37 | [1, 3, 3, 3, 3, 3, 4, 2, 0, 0]]).to(device) 38 | target_lengths = torch.LongTensor([9, 8, 7]) 39 | outputs, output_lengths = model(inputs, input_lengths) 40 | 41 | loss = criterion(outputs.transpose(0, 1), targets[:, 1:], output_lengths, target_lengths) 42 | loss.backward() 43 | optimizer.step() 44 | print(loss) 45 | -------------------------------------------------------------------------------- /test/test_deepspeech2.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, Soohwan Kim. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import torch 16 | import torch.nn as nn 17 | 18 | from kospeech.models import DeepSpeech2 19 | 20 | batch_size = 3 21 | sequence_length = 14321 22 | dimension = 80 23 | 24 | cuda = torch.cuda.is_available() 25 | device = torch.device('cuda' if cuda else 'cpu') 26 | 27 | model = DeepSpeech2(num_classes=10, input_dim=dimension).to(device) 28 | 29 | criterion = nn.CTCLoss(blank=3, zero_infinity=True) 30 | optimizer = torch.optim.Adam(model.parameters(), lr=1e-04) 31 | 32 | for i in range(10): 33 | inputs = torch.rand(batch_size, sequence_length, dimension).to(device) 34 | input_lengths = torch.IntTensor([12345, 12300, 12000]) 35 | targets = torch.LongTensor([[1, 3, 3, 3, 3, 3, 4, 5, 6, 2], 36 | [1, 3, 3, 3, 3, 3, 4, 5, 2, 0], 37 | [1, 3, 3, 3, 3, 3, 4, 2, 0, 0]]).to(device) 38 | target_lengths = torch.LongTensor([9, 8, 7]) 39 | outputs, output_lengths = model(inputs, input_lengths) 40 | 41 | loss = criterion(outputs.transpose(0, 1), targets[:, 1:], output_lengths, target_lengths) 42 | loss.backward() 43 | optimizer.step() 44 | print(loss) 45 | -------------------------------------------------------------------------------- /bin/kospeech/data/audio/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, Soohwan Kim. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from dataclasses import dataclass 16 | 17 | 18 | @dataclass 19 | class AudioConfig: 20 | audio_extension: str = "pcm" 21 | sample_rate: int = 16000 22 | frame_length: int = 20 23 | frame_shift: int = 10 24 | normalize: bool = True 25 | del_silence: bool = True 26 | feature_extract_by: str = "kaldi" 27 | time_mask_num: int = 4 28 | freq_mask_num: int = 2 29 | spec_augment: bool = True 30 | input_reverse: bool = False 31 | 32 | 33 | @dataclass 34 | class FilterBankConfig(AudioConfig): 35 | transform_method: str = "fbank" 36 | n_mels: int = 80 37 | freq_mask_para: int = 18 38 | 39 | 40 | @dataclass 41 | class MelSpectrogramConfig(AudioConfig): 42 | transform_method: str = "mel" 43 | n_mels: int = 80 44 | freq_mask_para: int = 18 45 | 46 | 47 | @dataclass 48 | class MfccConfig(AudioConfig): 49 | transform_method: str = "mfcc" 50 | n_mels: int = 40 51 | freq_mask_para: int = 8 52 | 53 | 54 | @dataclass 55 | class SpectrogramConfig(AudioConfig): 56 | transform_method: str = "spectrogram" 57 | n_mels: int = 161 # Not used 58 | freq_mask_para: int = 24 59 | -------------------------------------------------------------------------------- /dataset/libri/prepare-libri.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, Soohwan Kim. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
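The AudioConfig dataclasses above only carry feature hyper-parameters (16 kHz sampling, 20 ms frames with a 10 ms shift, 80 mel bins for fbank/mel features); the extraction itself happens elsewhere in kospeech. As a rough illustration of what FilterBankConfig corresponds to when feature_extract_by is "kaldi", one could compute equivalent features with torchaudio (a sketch with a placeholder file path, not the repository's parser code):

import torchaudio

waveform, sample_rate = torchaudio.load('example.wav')  # hypothetical 16 kHz mono file

# FilterBankConfig: frame_length=20 ms, frame_shift=10 ms, n_mels=80
fbank = torchaudio.compliance.kaldi.fbank(
    waveform,
    num_mel_bins=80,
    frame_length=20.0,
    frame_shift=10.0,
    sample_frequency=sample_rate,
)
print(fbank.shape)  # (num_frames, 80): one T x D feature matrix per utterance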
14 | 15 | import argparse 16 | from preprocess import ( 17 | collect_transcripts, 18 | prepare_tokenizer, 19 | generate_transcript_file 20 | ) 21 | 22 | LIBRI_SPEECH_DATASETS = [ 23 | 'train_960', 'dev-clean', 'dev-other', 'test-clean', 'test-other' 24 | ] 25 | 26 | 27 | def _get_parser(): 28 | """ Get arguments parser """ 29 | parser = argparse.ArgumentParser(description='LibriSpeech Preprocess') 30 | parser.add_argument('--dataset_path', type=str, 31 | default='your_dataset_path', 32 | help='path of original dataset') 33 | parser.add_argument('--vocab_size', type=int, 34 | default=5000, 35 | help='size of vocab') 36 | 37 | return parser 38 | 39 | 40 | def main(): 41 | parser = _get_parser() 42 | opt = parser.parse_args() 43 | 44 | transcripts_collection = collect_transcripts(opt.dataset_path) 45 | prepare_tokenizer(transcripts_collection[0], opt.vocab_size) 46 | 47 | for idx, dataset in enumerate(LIBRI_SPEECH_DATASETS): 48 | generate_transcript_file(dataset, transcripts_collection[idx]) 49 | 50 | 51 | if __name__ == '__main__': 52 | main() -------------------------------------------------------------------------------- /test/test_las.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, Soohwan Kim. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import torch 16 | import torch.nn as nn 17 | 18 | from kospeech.models import ListenAttendSpell 19 | 20 | B, T, D, H = 3, 12345, 80, 32 21 | 22 | cuda = torch.cuda.is_available() 23 | device = torch.device('cuda' if cuda else 'cpu') 24 | 25 | model = ListenAttendSpell( 26 | input_dim=D, 27 | num_classes=10, 28 | encoder_hidden_state_dim=H, 29 | decoder_hidden_state_dim=H << 1, 30 | bidirectional=True, 31 | max_length=10, 32 | ).to(device) 33 | 34 | criterion = nn.CrossEntropyLoss(ignore_index=0, reduction='mean') 35 | optimizer = torch.optim.Adam(model.parameters(), lr=1e-04) 36 | 37 | for i in range(10): 38 | inputs = torch.rand(B, T, D).to(device) 39 | input_lengths = torch.IntTensor([12345, 12300, 12000]) 40 | targets = torch.LongTensor([[1, 3, 3, 3, 3, 3, 4, 5, 6, 2], 41 | [1, 3, 3, 3, 3, 3, 4, 5, 2, 0], 42 | [1, 3, 3, 3, 3, 3, 4, 2, 0, 0]]).to(device) 43 | outputs, output_lengths, encoder_log_probs = model(inputs, input_lengths, targets, teacher_forcing_ratio=1.0) 44 | 45 | loss = criterion(outputs.contiguous().view(-1, outputs.size(-1)), targets[:, 1:].contiguous().view(-1)) 46 | loss.backward() 47 | optimizer.step() 48 | print(loss) 49 | -------------------------------------------------------------------------------- /bin/kospeech/vocabs/librispeech.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, Soohwan Kim. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from kospeech.vocabs import Vocabulary 16 | 17 | 18 | class LibriSpeechVocabulary(Vocabulary): 19 | def __init__(self, vocab_path, model_path): 20 | super(LibriSpeechVocabulary, self).__init__() 21 | try: 22 | import sentencepiece as spm 23 | except ImportError: 24 | raise ImportError("Please install sentencepiece: `pip install sentencepiece`") 25 | self.pad_id = 0 26 | self.sos_id = 1 27 | self.eos_id = 2 28 | 29 | self.vocab_path = vocab_path 30 | 31 | self.sp = spm.SentencePieceProcessor() 32 | self.sp.Load(model_path) 33 | 34 | def __len__(self): 35 | count = 0 36 | with open(self.vocab_path, encoding='utf-8') as f: 37 | for _ in f.readlines(): 38 | count += 1 39 | 40 | return count 41 | 42 | def label_to_string(self, labels): 43 | if len(labels.shape) == 1: 44 | return self.sp.DecodeIds([l for l in labels]) 45 | 46 | sentences = list() 47 | for batch in labels: 48 | sentence = str() 49 | for label in batch: 50 | sentence = self.sp.DecodeIds([l for l in label]) 51 | sentences.append(sentence) 52 | return sentences 53 | -------------------------------------------------------------------------------- /test/test_tri_stage_lr_scheduler.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, Soohwan Kim. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
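LibriSpeechVocabulary above wraps a trained SentencePiece model: __len__ counts the lines of the vocab file and label_to_string decodes either a single label sequence or a batch of them. A hedged usage sketch, with placeholder paths for the files produced by the LibriSpeech preparation scripts:

import torch
from kospeech.vocabs.librispeech import LibriSpeechVocabulary

vocab = LibriSpeechVocabulary(
    vocab_path='data/vocab/tokenizer.vocab',  # placeholder path
    model_path='data/vocab/tokenizer.model',  # placeholder path
)

print(len(vocab))                          # number of subword units in the vocab file
labels = torch.LongTensor([1, 7, 42, 2])   # e.g. an argmax-decoded hypothesis
# decodes ids back to text; depending on the sentencepiece version the ids may
# need to be cast to plain Python ints before DecodeIds accepts them
print(vocab.label_to_string(labels))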
14 | 15 | import torch.nn as nn 16 | import matplotlib.pyplot as plt 17 | from torch import optim 18 | from kospeech.optim.__init__ import Optimizer 19 | from kospeech.optim.lr_scheduler.tri_stage_lr_scheduler import TriStageLRScheduler 20 | 21 | 22 | class Model(nn.Module): 23 | def __init__(self): 24 | super(Model, self).__init__() 25 | self.projection = nn.Linear(10, 10) 26 | 27 | def forward(self): 28 | pass 29 | 30 | 31 | INIT_LR = 1e-15 32 | PEAK_LR = 3e-04 33 | FINAL_LR = 1e-05 34 | WARMUP_STEPS = 4000 35 | MAX_GRAD_NORM = 400 36 | TOTAL_STEPS = 32000 37 | 38 | model = Model() 39 | 40 | optimizer = optim.Adam(model.parameters(), lr=INIT_LR) 41 | scheduler = TriStageLRScheduler( 42 | optimizer=optimizer, 43 | init_lr=INIT_LR, 44 | peak_lr=PEAK_LR, 45 | final_lr=FINAL_LR, 46 | warmup_steps=WARMUP_STEPS, 47 | total_steps=TOTAL_STEPS 48 | ) 49 | optimizer = Optimizer(optimizer, scheduler, WARMUP_STEPS, MAX_GRAD_NORM) 50 | lr_processes = list() 51 | 52 | for timestep in range(TOTAL_STEPS): 53 | optimizer.step(model) 54 | lr_processes.append(optimizer.get_lr()) 55 | 56 | plt.title('Test Optimizer class') 57 | plt.plot(lr_processes) 58 | plt.xlabel('timestep', fontsize='large') 59 | plt.ylabel('lr', fontsize='large') 60 | plt.show() 61 | -------------------------------------------------------------------------------- /bin/kospeech/models/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, Soohwan Kim. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from dataclasses import dataclass 16 | 17 | 18 | @dataclass 19 | class ModelConfig: 20 | architecture: str = "???" 
21 | teacher_forcing_ratio: float = 1.0 22 | teacher_forcing_step: float = 0.01 23 | min_teacher_forcing_ratio: float = 0.9 24 | dropout: float = 0.3 25 | bidirectional: bool = False 26 | joint_ctc_attention: bool = False 27 | max_len: int = 400 28 | 29 | 30 | from kospeech.models.deepspeech2.model import DeepSpeech2 31 | from kospeech.models.las.encoder import EncoderRNN 32 | from kospeech.models.las.decoder import DecoderRNN 33 | from kospeech.models.rnnt import RNNTransducerConfig 34 | from kospeech.models.rnnt.model import RNNTransducer 35 | from kospeech.models.las.model import ListenAttendSpell 36 | from kospeech.models.transformer.model import SpeechTransformer 37 | from kospeech.models.jasper.model import Jasper 38 | from kospeech.models.conformer.model import Conformer 39 | from kospeech.models.las import ListenAttendSpellConfig, JointCTCAttentionLASConfig 40 | from kospeech.models.transformer import TransformerConfig, JointCTCAttentionTransformerConfig 41 | from kospeech.models.deepspeech2 import DeepSpeech2Config 42 | from kospeech.models.jasper import JasperConfig 43 | from kospeech.models.conformer import ConformerSmallConfig, ConformerMediumConfig, ConformerLargeConfig 44 | -------------------------------------------------------------------------------- /test/test_transformer_lr_scheduler.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, Soohwan Kim. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | import torch.nn as nn 16 | import matplotlib.pyplot as plt 17 | from torch import optim 18 | from kospeech.optim.__init__ import Optimizer 19 | from kospeech.optim.lr_scheduler.transformer_lr_scheduler import TransformerLRScheduler 20 | 21 | 22 | class Model(nn.Module): 23 | def __init__(self): 24 | super(Model, self).__init__() 25 | self.projection = nn.Linear(10, 10) 26 | 27 | def forward(self): 28 | pass 29 | 30 | 31 | PEAK_LR = 1e-04 32 | FINAL_LR = 1e-07 33 | WARMUP_STEPS = 1000 34 | MAX_GRAD_NORM = 400 35 | DECAY_STEPS = 10000 36 | TOTAL_STEPS = WARMUP_STEPS + DECAY_STEPS + 10000 37 | 38 | model = Model() 39 | 40 | optimizer = optim.Adam(model.parameters(), lr=0.0) 41 | scheduler = TransformerLRScheduler( 42 | optimizer=optimizer, 43 | peak_lr=PEAK_LR, 44 | final_lr=FINAL_LR, 45 | warmup_steps=WARMUP_STEPS, 46 | decay_steps=DECAY_STEPS, 47 | final_lr_scale=0.001, 48 | ) 49 | optimizer = Optimizer(optimizer, scheduler, TOTAL_STEPS, MAX_GRAD_NORM) 50 | lr_processes = list() 51 | 52 | for timestep in range(TOTAL_STEPS): 53 | optimizer.step(model) 54 | lr_processes.append(optimizer.get_lr()) 55 | 56 | plt.title('Test Optimizer class') 57 | plt.plot(lr_processes) 58 | plt.xlabel('timestep', fontsize='large') 59 | plt.ylabel('lr', fontsize='large') 60 | plt.show() 61 | -------------------------------------------------------------------------------- /bin/kospeech/models/conformer/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021, Soohwan Kim. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
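test_transformer_lr_scheduler.py above only plots the schedule; the curve it should produce follows from TransformerLRScheduler (defined later in bin/kospeech/optim/lr_scheduler/transformer_lr_scheduler.py): a linear warmup to peak_lr over warmup_steps, then an exponential decay towards final_lr. A small sketch of that closed form, using the same constants as the test:

import math

PEAK_LR, FINAL_LR, FINAL_LR_SCALE = 1e-04, 1e-07, 0.001
WARMUP_STEPS, DECAY_STEPS = 1000, 10000

warmup_rate = PEAK_LR / WARMUP_STEPS
decay_factor = -math.log(FINAL_LR_SCALE) / DECAY_STEPS

def expected_lr(step: int) -> float:
    if step < WARMUP_STEPS:                    # stage 0: linear warmup
        return step * warmup_rate
    if step < WARMUP_STEPS + DECAY_STEPS:      # stage 1: exponential decay
        return PEAK_LR * math.exp(-decay_factor * (step - WARMUP_STEPS))
    return FINAL_LR                            # stage 2: hold final_lr

print(expected_lr(WARMUP_STEPS))                # ~= PEAK_LR
print(expected_lr(WARMUP_STEPS + DECAY_STEPS))  # ~= PEAK_LR * FINAL_LR_SCALE = 1e-07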
14 | 15 | from dataclasses import dataclass 16 | 17 | from kospeech.models import ModelConfig 18 | from kospeech.models.conformer.model import Conformer 19 | 20 | 21 | @dataclass 22 | class ConformerConfig(ModelConfig): 23 | architecture: str = "conformer" 24 | feed_forward_expansion_factor: int = 4 25 | conv_expansion_factor: int = 2 26 | input_dropout_p: float = 0.1 27 | feed_forward_dropout_p: float = 0.1 28 | attention_dropout_p: float = 0.1 29 | conv_dropout_p: float = 0.1 30 | decoder_dropout_p: float = 0.1 31 | conv_kernel_size: int = 31 32 | half_step_residual: bool = True 33 | num_decoder_layers: int = 1 34 | decoder_rnn_type: str = "lstm" 35 | decoder: str = "None" 36 | 37 | 38 | @dataclass 39 | class ConformerLargeConfig(ConformerConfig): 40 | encoder_dim: int = 512 41 | decoder_dim: int = 640 42 | num_encoder_layers: int = 17 43 | num_attention_heads: int = 8 44 | 45 | 46 | @dataclass 47 | class ConformerMediumConfig(ConformerConfig): 48 | encoder_dim: int = 256 49 | decoder_dim: int = 640 50 | num_encoder_layers: int = 16 51 | num_attention_heads: int = 4 52 | 53 | 54 | @dataclass 55 | class ConformerSmallConfig(ConformerConfig): 56 | encoder_dim: int = 144 57 | decoder_dim: int = 320 58 | num_encoder_layers: int = 16 59 | num_attention_heads: int = 4 60 | -------------------------------------------------------------------------------- /bin/kospeech/models/transformer/mask.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, Soohwan Kim. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
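The ConformerConfig variants above mirror the small/medium/large models; feeding those numbers into the Conformer class follows the same pattern as test_conformer.py earlier in this dump. A hedged sketch using the ConformerSmallConfig values (assuming the constructor accepts these keyword names, as that test suggests; num_classes is a placeholder):

import torch
from kospeech.models import Conformer

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# ConformerSmallConfig: encoder_dim=144, num_encoder_layers=16, decoder_dim=320
model = Conformer(
    num_classes=2000,     # vocabulary size (placeholder)
    input_dim=80,         # n_mels of the fbank features
    encoder_dim=144,
    num_encoder_layers=16,
    decoder_dim=320,
    device=device,
).to(device)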
14 | 15 | import torch 16 | from torch import Tensor 17 | from typing import Any, Optional 18 | 19 | 20 | def get_attn_pad_mask(inputs, input_lengths, expand_length): 21 | """ mask position is set to 1 """ 22 | 23 | def get_transformer_non_pad_mask(inputs: Tensor, input_lengths: Tensor) -> Tensor: 24 | """ Padding position is set to 0, either use input_lengths or pad_id """ 25 | batch_size = inputs.size(0) 26 | 27 | if len(inputs.size()) == 2: 28 | non_pad_mask = inputs.new_ones(inputs.size()) # B x T 29 | elif len(inputs.size()) == 3: 30 | non_pad_mask = inputs.new_ones(inputs.size()[:-1]) # B x T 31 | else: 32 | raise ValueError(f"Unsupported input shape {inputs.size()}") 33 | 34 | for i in range(batch_size): 35 | non_pad_mask[i, input_lengths[i]:] = 0 36 | 37 | return non_pad_mask 38 | 39 | non_pad_mask = get_transformer_non_pad_mask(inputs, input_lengths) 40 | pad_mask = non_pad_mask.lt(1) 41 | attn_pad_mask = pad_mask.unsqueeze(1).expand(-1, expand_length, -1) 42 | return attn_pad_mask 43 | 44 | 45 | def get_attn_subsequent_mask(seq): 46 | assert seq.dim() == 2 47 | attn_shape = [seq.size(0), seq.size(1), seq.size(1)] 48 | subsequent_mask = torch.triu(torch.ones(attn_shape), diagonal=1) 49 | 50 | if seq.is_cuda: 51 | subsequent_mask = subsequent_mask.cuda() 52 | 53 | return subsequent_mask 54 | -------------------------------------------------------------------------------- /bin/kospeech/data/label_loader.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, Soohwan Kim. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
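The two helpers in mask.py above build the padding mask (True wherever attention must be blocked past each sequence's true length) and the subsequent (causal) mask used by the transformer decoder. A small demonstration on toy tensors, shown only to make the shapes concrete:

import torch
from kospeech.models.transformer.mask import get_attn_pad_mask, get_attn_subsequent_mask

encoder_outputs = torch.rand(2, 5, 8)        # B x T x D
input_lengths = torch.LongTensor([5, 3])

pad_mask = get_attn_pad_mask(encoder_outputs, input_lengths, expand_length=5)
print(pad_mask.shape)    # torch.Size([2, 5, 5]); batch item 1 masks positions 3 and 4

targets = torch.LongTensor([[1, 4, 5, 2], [1, 4, 2, 0]])
causal_mask = get_attn_subsequent_mask(targets)
print(causal_mask.shape)  # torch.Size([2, 4, 4]); the upper triangle is 1 (masked)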
14 | 15 | from typing import Tuple 16 | import re 17 | 18 | 19 | def rule(x): 20 | # parenthesized annotations 21 | a = re.compile(r'\([^)]*\)') 22 | # punctuation 23 | b = re.compile('[^가-힣0-9 ]') 24 | x = re.sub(pattern=a, repl='', string=x) 25 | x = re.sub(pattern=b, repl='', string=x) 26 | return x 27 | 28 | def load_dataset(transcripts_path: str) -> Tuple[list, list]: 29 | """ 30 | Provides lists of audio paths and their label transcripts 31 | 32 | Args: 33 | transcripts_path (str): path of transcripts 34 | 35 | Returns: audio_paths, transcripts 36 | - **audio_paths, transcripts** (list, list): audio file paths and the matching label sequences 37 | """ 38 | audio_paths = list() 39 | transcripts = list() 40 | 41 | with open(transcripts_path, encoding='utf-8') as f: 42 | for idx, line in enumerate(f.readlines()): 43 | audio_path, korean_transcript, transcript = line.split('\t') 44 | transcript = transcript.replace('\n', '') 45 | 46 | audio_paths.append(audio_path) 47 | transcripts.append(transcript) 48 | print("Success") 49 | return audio_paths, transcripts 50 | 51 | # with open(transcripts_path) as f: 52 | # for idx, line in enumerate(f.readlines()): 53 | # audio_path, korean_transcript, transcript = line.split('\t') 54 | # transcript = transcript.replace('\n', '') 55 | 56 | # audio_paths.append(audio_path) 57 | # transcripts.append(transcript) 58 | 59 | # return audio_paths, transcripts 60 |
-------------------------------------------------------------------------------- /test/test_transformer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, Soohwan Kim. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License.
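The rule() helper in label_loader.py above strips parenthesized annotations and then every character that is not a Korean syllable, a digit, or a space. A tiny illustration (the sample sentence is made up, not taken from the dataset):

from kospeech.data.label_loader import rule

print(rule('지금 몇 시야? (웃음)'))   # the '(웃음)' annotation and the '?' are stripped
print(rule('(노이즈) 3시 30분이요'))  # digits and spaces are kept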
14 | 15 | import torch 16 | import torch.nn as nn 17 | 18 | from kospeech.model_builder import build_transformer 19 | 20 | batch_size = 4 21 | seq_length = 200 22 | target_length = 10 23 | input_size = 80 24 | 25 | cuda = torch.cuda.is_available() 26 | device = torch.device('cuda' if cuda else 'cpu') 27 | 28 | transformer = build_transformer( 29 | num_classes=10, 30 | d_model=16, 31 | d_ff=32, 32 | num_heads=2, 33 | input_dim=input_size, 34 | num_encoder_layers=3, 35 | num_decoder_layers=2, 36 | extractor='vgg', 37 | dropout_p=0.1, 38 | device=device, 39 | pad_id=0, 40 | sos_id=1, 41 | eos_id=2, 42 | joint_ctc_attention=False, 43 | max_length=10, 44 | ) 45 | 46 | criterion = nn.CrossEntropyLoss(ignore_index=0, reduction='mean') 47 | optimizer = torch.optim.Adam(transformer.parameters(), lr=1e-04) 48 | 49 | for i in range(10): 50 | inputs = torch.FloatTensor(batch_size, seq_length, input_size).to(device) 51 | input_lengths = torch.LongTensor([seq_length, seq_length - 10, seq_length - 20, seq_length - 30]) 52 | targets = torch.LongTensor([[1, 3, 3, 3, 3, 3, 4, 5, 6, 2], 53 | [1, 3, 3, 3, 3, 3, 4, 5, 2, 0], 54 | [1, 3, 3, 3, 3, 3, 4, 2, 0, 0], 55 | [1, 3, 3, 3, 3, 3, 4, 2, 0, 0]]).to(device) 56 | 57 | outputs, _, _ = transformer(inputs, input_lengths, targets) 58 | loss = criterion(outputs.contiguous().view(-1, outputs.size(-1)), targets[:, 1:].contiguous().view(-1)) 59 | loss.backward() 60 | optimizer.step() 61 | print(loss) 62 |
-------------------------------------------------------------------------------- /etc/cmd 실행 코드.txt: -------------------------------------------------------------------------------- 1 | All of the code must be run from inside the 'KoreanSTT_kospeech' folder. 2 | Also, in my case, using relative paths for the audio path occasionally caused problems, 3 | so I placed the train wav folder, the test wav folder, and the metadata folder at the same level as 'KoreanSTT_kospeech' and passed them as absolute paths. 4 | 5 | The usage formats and examples are as follows.
6 | 7 | [train] 8 | Format) python ./bin/main.py model=ds2 train=ds2_train train.dataset_path=$dataset_path 9 | Example) python ./bin/main.py model=ds2 train=ds2_train train.dataset_path="D:\code\train wav" 10 | 11 | [preprocessing] 12 | Format) python main.py --dataset_path $dataset_path --vocab_dest $vocab_dict_destination --output_unit 'character' --preprocess_mode 'phonetic' 13 | Example) python main.py --dataset_path "D:\code\train wav" --vocab_dest "D:\code" --output_unit 'character' --preprocess_mode 'phonetic' 14 | python main.py --dataset_path "D:\code\train wav" --vocab_dest "D:\kospeech-latest" --output_unit "character" --preprocess_mode 'phonetic' 15 | 16 | [inference] -> device (default='cpu') is optional 17 | Format) python ./bin/inference.py --model_path $model_path --audio_path $audio_path --device "cpu" 18 | Example) python ./bin/inference.py --model_path "G:/내 드라이브/code/kospeech-latest/outputs/2021-12-10/09-00-20/model.pt" --audio_path "../test wav/CN11RC002_CN0031_20210726.wav" --device "cpu" 19 | 20 | [inference_wer] -> dst_path (default='./outputs') and device (default='cpu') are optional 21 | Format) python ./bin/inference_wer.py --model_path $model_path --audio_path $audio_path --transcript_path $transcript_path --dst_path $result_destination --device "cpu" 22 | Example) python ./bin/inference_wer.py --model_path "G:/내 드라이브/code/kospeech-latest/outputs/2021-12-10/09-00-20/model.pt" --audio_path "D:/code/sample1/audio" --transcript_path "D:/code/sample1/transcripts.txt" --dst_path "D:/code/sample1" --device "cpu" 23 | 24 | [prediction] -> if submission is set to False, no excel file is created and the results are saved to a .txt file instead; device (default='cpu') is optional 25 | Format) python ./bin/prediction.py --model_path $model_path --audio_path $audio_path --submission 'True' --device "cpu" 26 | Example) python ./bin/prediction.py --model_path "G:\내 드라이브\code\kospeech-latest\outputs\2021-12-10\19-51-14/model.pt" --audio_path "D:/code/test wav" --submission 'True' --device "cpu" 27 | 28 | python ./bin/prediction.py --model_path "./outputs/model.pt" --audio_path "D:/code/test wav" --submission 'True' --device "cpu" 29 |
-------------------------------------------------------------------------------- /dataset/kspon/preprocess/subword.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, Soohwan Kim. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License.
14 | 15 | import os 16 | import sentencepiece as spm 17 | 18 | 19 | def train_sentencepiece(transcripts, datapath: str = './data', vocab_size: int = 5000, blank_token: str = '<blank>'):  # blank_token default is an assumption; pass the symbol your vocab actually reserves 20 | print('generate_sentencepiece_input..') 21 | 22 | if not os.path.exists(datapath): 23 | os.mkdir(datapath) 24 | 25 | with open(f'{datapath}/sentencepiece_input.txt', 'w', encoding="utf-8") as f:  # written inside datapath so the --input path below resolves 26 | for transcript in transcripts: 27 | f.write(f'{transcript}\n') 28 | 29 | spm.SentencePieceTrainer.Train( 30 | f"--input={datapath}/sentencepiece_input.txt " 31 | f"--model_prefix=kspon_sentencepiece " 32 | f"--vocab_size={vocab_size} " 33 | f"--model_type=bpe " 34 | f"--pad_id=0 " 35 | f"--bos_id=1 " 36 | f"--eos_id=2 " 37 | f"--unk_id=3 " 38 | f"--user_defined_symbols={blank_token}" 39 | ) 40 | 41 | 42 | def sentence_to_subwords(audio_paths: list, transcripts: list, datapath: str = './data'): 43 | subwords = list() 44 | 45 | print('sentence_to_subwords...') 46 | 47 | sp = spm.SentencePieceProcessor() 48 | vocab_file = "kspon_sentencepiece.model" 49 | sp.load(vocab_file) 50 | 51 | with open(f'{datapath}/transcripts.txt', 'w', encoding='utf-8') as f: 52 | for audio_path, transcript in zip(audio_paths, transcripts): 53 | audio_path = audio_path.replace('txt', 'pcm') 54 | subword_transcript = " ".join(sp.EncodeAsPieces(transcript)) 55 | subword_id_transcript = " ".join([str(item) for item in sp.EncodeAsIds(transcript)]) 56 | f.write(f'{audio_path}\t{subword_transcript}\t{subword_id_transcript}\n') 57 | 58 | return subwords 59 |
-------------------------------------------------------------------------------- /test/test_transformer_encoder.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, Soohwan Kim. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License.
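subword.py above first trains a BPE SentencePiece model (kspon_sentencepiece.model) and then re-encodes every transcript into subword pieces and ids for transcripts.txt. A hedged sketch of how the trained model behaves once train_sentencepiece() has run (the sample sentence is illustrative, and the exact split depends on the training data):

import sentencepiece as spm

sp = spm.SentencePieceProcessor()
sp.Load('kspon_sentencepiece.model')   # produced by train_sentencepiece()

sentence = '안녕하세요'                  # illustrative input
pieces = sp.EncodeAsPieces(sentence)   # subword strings written to the second column of transcripts.txt
ids = sp.EncodeAsIds(sentence)         # matching integer ids written to the third column
print(pieces, ids)
print(sp.DecodeIds(ids))               # round-trips back to the original sentence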
14 | 15 | import torch 16 | import torch.nn as nn 17 | 18 | from kospeech.model_builder import build_transformer 19 | 20 | batch_size = 4 21 | seq_length = 200 22 | target_length = 10 23 | input_size = 80 24 | 25 | cuda = torch.cuda.is_available() 26 | device = torch.device('cuda' if cuda else 'cpu') 27 | 28 | transformer = build_transformer( 29 | num_classes=10, 30 | d_model=16, 31 | d_ff=32, 32 | num_heads=2, 33 | input_dim=input_size, 34 | num_encoder_layers=3, 35 | num_decoder_layers=2, 36 | extractor='vgg', 37 | dropout_p=0.1, 38 | device=device, 39 | pad_id=0, 40 | sos_id=1, 41 | eos_id=2, 42 | joint_ctc_attention=True, 43 | max_length=10, 44 | ) 45 | transformer_encoder = transformer.module.encoder 46 | 47 | criterion = nn.CTCLoss(blank=3, zero_infinity=True) 48 | optimizer = torch.optim.Adam(transformer_encoder.parameters(), lr=1e-04) 49 | 50 | for i in range(10): 51 | inputs = torch.FloatTensor(batch_size, seq_length, input_size).to(device) 52 | input_lengths = torch.LongTensor([seq_length, seq_length - 10, seq_length - 20, seq_length - 30]) 53 | targets = torch.LongTensor([[1, 3, 3, 3, 3, 3, 4, 5, 6, 2], 54 | [1, 3, 3, 3, 3, 3, 4, 5, 2, 0], 55 | [1, 3, 3, 3, 3, 3, 4, 2, 0, 0], 56 | [1, 3, 3, 3, 3, 3, 4, 2, 0, 0]]).to(device) 57 | target_lengths = torch.LongTensor([9, 8, 7, 7]) 58 | 59 | _, output_lengths, encoder_log_probs = transformer_encoder(inputs, input_lengths) 60 | loss = criterion(encoder_log_probs.transpose(0, 1), targets[:, 1:], output_lengths, target_lengths) 61 | loss.backward() 62 | optimizer.step() 63 | print(loss) 64 | -------------------------------------------------------------------------------- /bin/kospeech/criterion/transducer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021, Soohwan Kim. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import torch 16 | import torch.nn as nn 17 | 18 | 19 | class TransducerLoss(nn.Module): 20 | """ 21 | Transducer loss module. 22 | 23 | Args: 24 | blank_id (int): blank symbol id 25 | """ 26 | 27 | def __init__(self, blank_id: int) -> None: 28 | """Construct an TransLoss object.""" 29 | super().__init__() 30 | try: 31 | from warp_rnnt import rnnt_loss 32 | except ImportError: 33 | raise ImportError("warp-rnnt is not installed. Please re-setup") 34 | self.rnnt_loss = rnnt_loss 35 | self.blank_id = blank_id 36 | 37 | def forward( 38 | self, 39 | log_probs: torch.FloatTensor, 40 | targets: torch.IntTensor, 41 | input_lengths: torch.IntTensor, 42 | target_lengths: torch.IntTensor, 43 | ) -> torch.FloatTensor: 44 | """ 45 | Compute path-aware regularization transducer loss. 
46 | 47 | Args: 48 | log_probs (torch.FloatTensor): Batch of predicted sequences (batch, maxlen_in, maxlen_out+1, odim) 49 | targets (torch.IntTensor): Batch of target sequences (batch, maxlen_out) 50 | input_lengths (torch.IntTensor): batch of lengths of predicted sequences (batch) 51 | target_lengths (torch.IntTensor): batch of lengths of target sequences (batch) 52 | 53 | Returns: 54 | loss (torch.FloatTensor): transducer loss 55 | """ 56 | 57 | return self.rnnt_loss( 58 | log_probs, 59 | targets, 60 | input_lengths, 61 | target_lengths, 62 | reduction="mean", 63 | blank=self.blank_id, 64 | gather=True, 65 | ) 66 | -------------------------------------------------------------------------------- /bin/kospeech/models/transformer/sublayers.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, Soohwan Kim. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import torch.nn as nn 16 | from torch import Tensor 17 | from kospeech.models.modules import Linear 18 | 19 | 20 | class AddNorm(nn.Module): 21 | """ 22 | Add & Normalization layer proposed in "Attention Is All You Need". 23 | Transformer employ a residual connection around each of the two sub-layers, 24 | (Multi-Head Attention & Feed-Forward) followed by layer normalization. 25 | """ 26 | def __init__(self, sublayer: nn.Module, d_model: int = 512) -> None: 27 | super(AddNorm, self).__init__() 28 | self.sublayer = sublayer 29 | self.layer_norm = nn.LayerNorm(d_model) 30 | 31 | def forward(self, *args): 32 | residual = args[0] 33 | outputs = self.sublayer(*args) 34 | 35 | if isinstance(outputs, tuple): 36 | return self.layer_norm(outputs[0] + residual), outputs[1] 37 | 38 | return self.layer_norm(outputs + residual) 39 | 40 | 41 | class PositionwiseFeedForward(nn.Module): 42 | """ 43 | Position-wise Feedforward Networks proposed in "Attention Is All You Need". 44 | Fully connected feed-forward network, which is applied to each position separately and identically. 45 | This consists of two linear transformations with a ReLU activation in between. 46 | Another way of describing this is as two convolutions with kernel size 1. 47 | """ 48 | def __init__(self, d_model: int = 512, d_ff: int = 2048, dropout_p: float = 0.3) -> None: 49 | super(PositionwiseFeedForward, self).__init__() 50 | self.feed_forward = nn.Sequential( 51 | Linear(d_model, d_ff), 52 | nn.Dropout(dropout_p), 53 | nn.ReLU(), 54 | Linear(d_ff, d_model), 55 | nn.Dropout(dropout_p), 56 | ) 57 | 58 | def forward(self, inputs: Tensor) -> Tensor: 59 | return self.feed_forward(inputs) 60 | -------------------------------------------------------------------------------- /bin/kospeech/optim/lr_scheduler/transformer_lr_scheduler.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, Soohwan Kim. All rights reserved. 
2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import math 16 | from kospeech.optim.lr_scheduler.lr_scheduler import LearningRateScheduler 17 | 18 | 19 | class TransformerLRScheduler(LearningRateScheduler): 20 | """ Transformer Learning Rate Scheduler proposed in "Attention Is All You Need" """ 21 | def __init__(self, optimizer, peak_lr, final_lr, final_lr_scale, warmup_steps, decay_steps): 22 | assert isinstance(warmup_steps, int), "warmup_steps should be inteager type" 23 | assert isinstance(decay_steps, int), "total_steps should be inteager type" 24 | 25 | super(TransformerLRScheduler, self).__init__(optimizer, 0.0) 26 | self.final_lr = final_lr 27 | self.peak_lr = peak_lr 28 | self.warmup_steps = warmup_steps 29 | self.decay_steps = decay_steps 30 | 31 | self.warmup_rate = self.peak_lr / self.warmup_steps 32 | self.decay_factor = -math.log(final_lr_scale) / self.decay_steps 33 | 34 | self.lr = self.init_lr 35 | self.update_step = 0 36 | 37 | def _decide_stage(self): 38 | if self.update_step < self.warmup_steps: 39 | return 0, self.update_step 40 | 41 | if self.warmup_steps <= self.update_step < self.warmup_steps + self.decay_steps: 42 | return 1, self.update_step - self.warmup_steps 43 | 44 | return 2, None 45 | 46 | def step(self): 47 | self.update_step += 1 48 | stage, steps_in_stage = self._decide_stage() 49 | 50 | if stage == 0: 51 | self.lr = self.update_step * self.warmup_rate 52 | elif stage == 1: 53 | self.lr = self.peak_lr * math.exp(-self.decay_factor * steps_in_stage) 54 | elif stage == 2: 55 | self.lr = self.final_lr 56 | else: 57 | raise ValueError("Undefined stage") 58 | 59 | self.set_lr(self.optimizer, self.lr) 60 | 61 | return self.lr 62 | -------------------------------------------------------------------------------- /docs/_static/js/html5shiv.min.js: -------------------------------------------------------------------------------- 1 | /** 2 | * @preserve HTML5 Shiv 3.7.3 | @afarkas @jdalton @jon_neal @rem | MIT/GPL2 Licensed 3 | */ 4 | !function(a,b){function c(a,b){var c=a.createElement("p"),d=a.getElementsByTagName("head")[0]||a.documentElement;return c.innerHTML="x",d.insertBefore(c.lastChild,d.firstChild)}function d(){var a=t.elements;return"string"==typeof a?a.split(" "):a}function e(a,b){var c=t.elements;"string"!=typeof c&&(c=c.join(" ")),"string"!=typeof a&&(a=a.join(" ")),t.elements=c+" "+a,j(b)}function f(a){var b=s[a[q]];return b||(b={},r++,a[q]=r,s[r]=b),b}function g(a,c,d){if(c||(c=b),l)return c.createElement(a);d||(d=f(c));var e;return e=d.cache[a]?d.cache[a].cloneNode():p.test(a)?(d.cache[a]=d.createElem(a)).cloneNode():d.createElem(a),!e.canHaveChildren||o.test(a)||e.tagUrn?e:d.frag.appendChild(e)}function h(a,c){if(a||(a=b),l)return a.createDocumentFragment();c=c||f(a);for(var e=c.frag.cloneNode(),g=0,h=d(),i=h.length;i>g;g++)e.createElement(h[g]);return e}function 
i(a,b){b.cache||(b.cache={},b.createElem=a.createElement,b.createFrag=a.createDocumentFragment,b.frag=b.createFrag()),a.createElement=function(c){return t.shivMethods?g(c,a,b):b.createElem(c)},a.createDocumentFragment=Function("h,f","return function(){var n=f.cloneNode(),c=n.createElement;h.shivMethods&&("+d().join().replace(/[\w\-:]+/g,function(a){return b.createElem(a),b.frag.createElement(a),'c("'+a+'")'})+");return n}")(t,b.frag)}function j(a){a||(a=b);var d=f(a);return!t.shivCSS||k||d.hasCSS||(d.hasCSS=!!c(a,"article,aside,dialog,figcaption,figure,footer,header,hgroup,main,nav,section{display:block}mark{background:#FF0;color:#000}template{display:none}")),l||i(a,d),a}var k,l,m="3.7.3-pre",n=a.html5||{},o=/^<|^(?:button|map|select|textarea|object|iframe|option|optgroup)$/i,p=/^(?:a|b|code|div|fieldset|h1|h2|h3|h4|h5|h6|i|label|li|ol|p|q|span|strong|style|table|tbody|td|th|tr|ul)$/i,q="_html5shiv",r=0,s={};!function(){try{var a=b.createElement("a");a.innerHTML="",k="hidden"in a,l=1==a.childNodes.length||function(){b.createElement("a");var a=b.createDocumentFragment();return"undefined"==typeof a.cloneNode||"undefined"==typeof a.createDocumentFragment||"undefined"==typeof a.createElement}()}catch(c){k=!0,l=!0}}();var t={elements:n.elements||"abbr article aside audio bdi canvas data datalist details dialog figcaption figure footer header hgroup main mark meter nav output picture progress section summary template time video",version:m,shivCSS:n.shivCSS!==!1,supportsUnknownElements:l,shivMethods:n.shivMethods!==!1,type:"default",shivDocument:j,createElement:g,createDocumentFragment:h,addElements:e};a.html5=t,j(b),"object"==typeof module&&module.exports&&(module.exports=t)}("undefined"!=typeof window?window:this,document); -------------------------------------------------------------------------------- /docs/build/html/_static/js/html5shiv.min.js: -------------------------------------------------------------------------------- 1 | /** 2 | * @preserve HTML5 Shiv 3.7.3 | @afarkas @jdalton @jon_neal @rem | MIT/GPL2 Licensed 3 | */ 4 | !function(a,b){function c(a,b){var c=a.createElement("p"),d=a.getElementsByTagName("head")[0]||a.documentElement;return c.innerHTML="x",d.insertBefore(c.lastChild,d.firstChild)}function d(){var a=t.elements;return"string"==typeof a?a.split(" "):a}function e(a,b){var c=t.elements;"string"!=typeof c&&(c=c.join(" ")),"string"!=typeof a&&(a=a.join(" ")),t.elements=c+" "+a,j(b)}function f(a){var b=s[a[q]];return b||(b={},r++,a[q]=r,s[r]=b),b}function g(a,c,d){if(c||(c=b),l)return c.createElement(a);d||(d=f(c));var e;return e=d.cache[a]?d.cache[a].cloneNode():p.test(a)?(d.cache[a]=d.createElem(a)).cloneNode():d.createElem(a),!e.canHaveChildren||o.test(a)||e.tagUrn?e:d.frag.appendChild(e)}function h(a,c){if(a||(a=b),l)return a.createDocumentFragment();c=c||f(a);for(var e=c.frag.cloneNode(),g=0,h=d(),i=h.length;i>g;g++)e.createElement(h[g]);return e}function i(a,b){b.cache||(b.cache={},b.createElem=a.createElement,b.createFrag=a.createDocumentFragment,b.frag=b.createFrag()),a.createElement=function(c){return t.shivMethods?g(c,a,b):b.createElem(c)},a.createDocumentFragment=Function("h,f","return function(){var n=f.cloneNode(),c=n.createElement;h.shivMethods&&("+d().join().replace(/[\w\-:]+/g,function(a){return b.createElem(a),b.frag.createElement(a),'c("'+a+'")'})+");return n}")(t,b.frag)}function j(a){a||(a=b);var 
d=f(a);return!t.shivCSS||k||d.hasCSS||(d.hasCSS=!!c(a,"article,aside,dialog,figcaption,figure,footer,header,hgroup,main,nav,section{display:block}mark{background:#FF0;color:#000}template{display:none}")),l||i(a,d),a}var k,l,m="3.7.3-pre",n=a.html5||{},o=/^<|^(?:button|map|select|textarea|object|iframe|option|optgroup)$/i,p=/^(?:a|b|code|div|fieldset|h1|h2|h3|h4|h5|h6|i|label|li|ol|p|q|span|strong|style|table|tbody|td|th|tr|ul)$/i,q="_html5shiv",r=0,s={};!function(){try{var a=b.createElement("a");a.innerHTML="",k="hidden"in a,l=1==a.childNodes.length||function(){b.createElement("a");var a=b.createDocumentFragment();return"undefined"==typeof a.cloneNode||"undefined"==typeof a.createDocumentFragment||"undefined"==typeof a.createElement}()}catch(c){k=!0,l=!0}}();var t={elements:n.elements||"abbr article aside audio bdi canvas data datalist details dialog figcaption figure footer header hgroup main mark meter nav output picture progress section summary template time video",version:m,shivCSS:n.shivCSS!==!1,supportsUnknownElements:l,shivMethods:n.shivMethods!==!1,type:"default",shivDocument:j,createElement:g,createDocumentFragment:h,addElements:e};a.html5=t,j(b),"object"==typeof module&&module.exports&&(module.exports=t)}("undefined"!=typeof window?window:this,document); -------------------------------------------------------------------------------- /bin/kospeech/models/transformer/embeddings.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, Soohwan Kim. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import math 16 | import torch 17 | import torch.nn as nn 18 | from torch import Tensor 19 | 20 | 21 | class PositionalEncoding(nn.Module): 22 | """ 23 | Positional Encoding proposed in "Attention Is All You Need". 24 | Since transformer contains no recurrence and no convolution, in order for the model to make 25 | use of the order of the sequence, we must add some positional information. 26 | 27 | "Attention Is All You Need" use sine and cosine functions of different frequencies: 28 | PE_(pos, 2i) = sin(pos / power(10000, 2i / d_model)) 29 | PE_(pos, 2i+1) = cos(pos / power(10000, 2i / d_model)) 30 | """ 31 | def __init__(self, d_model: int = 512, max_len: int = 5000) -> None: 32 | super(PositionalEncoding, self).__init__() 33 | pe = torch.zeros(max_len, d_model, requires_grad=False) 34 | position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1) 35 | div_term = torch.exp(torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model)) 36 | pe[:, 0::2] = torch.sin(position * div_term) 37 | pe[:, 1::2] = torch.cos(position * div_term) 38 | pe = pe.unsqueeze(0) 39 | self.register_buffer('pe', pe) 40 | 41 | def forward(self, length: int) -> Tensor: 42 | return self.pe[:, :length] 43 | 44 | 45 | class Embedding(nn.Module): 46 | """ 47 | Embedding layer. 
Similarly to other sequence transduction models, transformer use learned embeddings 48 | to convert the input tokens and output tokens to vectors of dimension d_model. 49 | In the embedding layers, transformer multiply those weights by sqrt(d_model) 50 | """ 51 | def __init__(self, num_embeddings: int, pad_id: int, d_model: int = 512) -> None: 52 | super(Embedding, self).__init__() 53 | self.sqrt_dim = math.sqrt(d_model) 54 | self.embedding = nn.Embedding(num_embeddings, d_model, padding_idx=pad_id) 55 | 56 | def forward(self, inputs: Tensor) -> Tensor: 57 | return self.embedding(inputs) * self.sqrt_dim 58 | -------------------------------------------------------------------------------- /bin/kospeech/models/modules.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, Soohwan Kim. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import torch 16 | import torch.nn as nn 17 | import torch.nn.init as init 18 | from torch import Tensor 19 | 20 | 21 | class ResidualConnectionModule(nn.Module): 22 | """ 23 | Residual Connection Module. 24 | outputs = (module(inputs) x module_factor + inputs x input_factor) 25 | """ 26 | def __init__( 27 | self, 28 | module: nn.Module, 29 | module_factor: float = 1.0, 30 | input_factor: float = 1.0, 31 | ) -> None: 32 | super(ResidualConnectionModule, self).__init__() 33 | self.module = module 34 | self.module_factor = module_factor 35 | self.input_factor = input_factor 36 | 37 | def forward(self, inputs: Tensor) -> Tensor: 38 | return (self.module(inputs) * self.module_factor) + (inputs * self.input_factor) 39 | 40 | 41 | class Linear(nn.Module): 42 | """ 43 | Wrapper class of torch.nn.Linear 44 | Weight initialize by xavier initialization and bias initialize to zeros. 45 | """ 46 | def __init__(self, in_features: int, out_features: int, bias: bool = True) -> None: 47 | super(Linear, self).__init__() 48 | self.linear = nn.Linear(in_features, out_features, bias=bias) 49 | init.xavier_uniform_(self.linear.weight) 50 | if bias: 51 | init.zeros_(self.linear.bias) 52 | 53 | def forward(self, x: Tensor) -> Tensor: 54 | return self.linear(x) 55 | 56 | 57 | class View(nn.Module): 58 | """ Wrapper class of torch.view() for Sequential module. """ 59 | def __init__(self, shape: tuple, contiguous: bool = False): 60 | super(View, self).__init__() 61 | self.shape = shape 62 | self.contiguous = contiguous 63 | 64 | def forward(self, inputs): 65 | if self.contiguous: 66 | inputs = inputs.contiguous() 67 | return inputs.view(*self.shape) 68 | 69 | 70 | class Transpose(nn.Module): 71 | """ Wrapper class of torch.transpose() for Sequential module. 
""" 72 | def __init__(self, shape: tuple): 73 | super(Transpose, self).__init__() 74 | self.shape = shape 75 | 76 | def forward(self, inputs: Tensor): 77 | return inputs.transpose(*self.shape) 78 | -------------------------------------------------------------------------------- /bin/kospeech/evaluator/evaluator.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, Soohwan Kim. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import queue 16 | import torch 17 | from kospeech.utils import logger 18 | from kospeech.data import AudioDataLoader 19 | from kospeech.decode.search import ( 20 | GreedySearch, 21 | BeamSearch 22 | ) 23 | 24 | 25 | class Evaluator(object): 26 | """ 27 | Class to evaluate models with given datasets. 28 | 29 | Args: 30 | dataset (kospeech.data.data_loader.SpectrogramDataset): dataset for spectrogram & script matching 31 | batch_size (int): size of batch. recommended batch size is 1. 32 | device (torch.device): device - 'cuda' or 'cpu' 33 | num_workers (int): the number of cpu cores used 34 | print_every (int): to determine whether to store training progress every N timesteps (default: 10) 35 | """ 36 | 37 | def __init__(self, dataset, vocab, batch_size=1, device=None, 38 | num_workers=1, print_every=100, decode='greedy', beam_size=None): 39 | self.dataset = dataset 40 | self.batch_size = batch_size 41 | self.device = device 42 | self.num_workers = num_workers 43 | self.print_every = print_every 44 | 45 | if decode == 'greedy': 46 | self.search = GreedySearch(vocab) 47 | 48 | elif decode == 'beam': 49 | assert beam_size > 1, "beam_size should be greater than 1. You can choose `greedy` search" 50 | self.search = BeamSearch(vocab, beam_size, batch_size) 51 | 52 | else: 53 | raise ValueError("Unsupported decode : {0}".format(decode)) 54 | 55 | def evaluate(self, model): 56 | """ Evaluate a model on given dataset and return performance. 
""" 57 | logger.info('evaluate() start') 58 | 59 | eval_queue = queue.Queue(self.num_workers << 1) 60 | eval_loader = AudioDataLoader(self.dataset, eval_queue, self.batch_size, thread_id=0, pad_id=0) 61 | eval_loader.start() 62 | 63 | cer = self.search.search(model, eval_queue, self.device, self.print_every) 64 | self.search.save_result('data/train_result/%s.csv' % type(self.decoder).__name__) 65 | 66 | logger.info('Evaluate CER: %s' % cer) 67 | logger.info('evaluate() completed') 68 | eval_loader.join() 69 | -------------------------------------------------------------------------------- /etc/traintext 생성.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 7, 6 | "id": "f9cd89df", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import json\n", 11 | "import os\n", 12 | "import re\n" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 8, 18 | "id": "0246a9a8", 19 | "metadata": {}, 20 | "outputs": [], 21 | "source": [ 22 | "def rule(x):\n", 23 | " # 괄호\n", 24 | " a = re.compile(r'\\([^)]*\\)')\n", 25 | " # 문장 부호\n", 26 | " b = re.compile('[^가-힣0-9 ]')\n", 27 | " x = re.sub(pattern=a, repl='', string= x)\n", 28 | " x = re.sub(pattern=b, repl='', string= x)\n", 29 | " return x" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 9, 35 | "id": "11b2a5db", 36 | "metadata": {}, 37 | "outputs": [], 38 | "source": [ 39 | "cnt = 0\n", 40 | "for audio_path, y in zip(os.listdir(\"D:\\코드\\[train] 음성데이터_wav\"), os.listdir(\"D:\\코드\\[train] 메타데이터_json\")):\n", 41 | " cnt +=1\n", 42 | " # if cnt == 100:\n", 43 | " # break\n", 44 | " try: \n", 45 | " with open(\"D:\\코드\\[train] 메타데이터_json\" + \"/\" + y, 'r', encoding='UTF-8') as f:\n", 46 | " json_data = json.load(f)\n", 47 | " reading = json.dumps(json_data[\"transcription\"][\"ReadingLabelText\"], ensure_ascii = False)\n", 48 | " reading = rule(reading)\n", 49 | " with open(\"D:\\코드/train.txt\", \"a\") as f:\n", 50 | " f.write('{0}\\n'.format(audio_path))\n", 51 | " # with open(\"D:\\코드/train.txt\", \"a\") as f:\n", 52 | " # f.write('{0}\\t{1}\\n'.format(audio_path, reading))\n", 53 | "\n", 54 | " except:\n", 55 | " with open(\"D:\\코드/train.txt\", \"a\") as f:\n", 56 | " f.write(\"------error error--------\\n\")\n", 57 | " print(cnt == 300000)" 58 | ] 59 | } 60 | ], 61 | "metadata": { 62 | "kernelspec": { 63 | "display_name": "Python 3 (ipykernel)", 64 | "language": "python", 65 | "name": "python3" 66 | }, 67 | "language_info": { 68 | "codemirror_mode": { 69 | "name": "ipython", 70 | "version": 3 71 | }, 72 | "file_extension": ".py", 73 | "mimetype": "text/x-python", 74 | "name": "python", 75 | "nbconvert_exporter": "python", 76 | "pygments_lexer": "ipython3", 77 | "version": "3.8.12" 78 | }, 79 | "toc": { 80 | "base_numbering": 1, 81 | "nav_menu": {}, 82 | "number_sections": true, 83 | "sideBar": true, 84 | "skip_h1_title": false, 85 | "title_cell": "Table of Contents", 86 | "title_sidebar": "Contents", 87 | "toc_cell": false, 88 | "toc_position": {}, 89 | "toc_section_display": true, 90 | "toc_window_display": false 91 | } 92 | }, 93 | "nbformat": 4, 94 | "nbformat_minor": 5 95 | } 96 | -------------------------------------------------------------------------------- /bin/kospeech/optim/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, Soohwan Kim. All rights reserved. 
2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import torch 16 | from kospeech.optim.adamp import AdamP 17 | from kospeech.optim.radam import RAdam 18 | from kospeech.optim.novograd import Novograd 19 | 20 | 21 | class Optimizer(object): 22 | """ 23 | This is wrapper classs of torch.optim.Optimizer. 24 | This class provides functionalities for learning rate scheduling and gradient norm clipping. 25 | 26 | Args: 27 | optim (torch.optim.Optimizer): optimizer object, the parameters to be optimized 28 | should be given when instantiating the object, e.g. torch.optim.Adam, torch.optim.SGD 29 | scheduler (kospeech.optim.lr_scheduler, optional): learning rate scheduler 30 | scheduler_period (int, optional): timestep with learning rate scheduler 31 | max_grad_norm (int, optional): value used for gradient norm clipping 32 | """ 33 | def __init__(self, optim, scheduler=None, scheduler_period=None, max_grad_norm=0): 34 | self.optimizer = optim 35 | self.scheduler = scheduler 36 | self.scheduler_period = scheduler_period 37 | self.max_grad_norm = max_grad_norm 38 | self.count = 0 39 | 40 | def step(self, model): 41 | if self.max_grad_norm > 0: 42 | torch.nn.utils.clip_grad_norm_(model.parameters(), self.max_grad_norm) 43 | self.optimizer.step() 44 | 45 | if self.scheduler is not None: 46 | self.update() 47 | self.count += 1 48 | 49 | if self.scheduler_period == self.count: 50 | self.scheduler = None 51 | self.scheduler_period = 0 52 | self.count = 0 53 | 54 | def set_scheduler(self, scheduler, scheduler_period): 55 | self.scheduler = scheduler 56 | self.scheduler_period = scheduler_period 57 | self.count = 0 58 | 59 | def update(self): 60 | if isinstance(self.scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau): 61 | pass 62 | else: 63 | self.scheduler.step() 64 | 65 | def zero_grad(self): 66 | self.optimizer.zero_grad() 67 | 68 | def get_lr(self): 69 | for g in self.optimizer.param_groups: 70 | return g['lr'] 71 | 72 | def set_lr(self, lr): 73 | for g in self.optimizer.param_groups: 74 | g['lr'] = lr 75 | -------------------------------------------------------------------------------- /bin/kospeech/models/jasper/configs.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021, Soohwan Kim. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | 16 | class Jasper10x5Config: 17 | def __init__(self, num_classes: int, num_blocks: int, num_sub_blocks: int) -> None: 18 | super(Jasper10x5Config, self).__init__() 19 | self.num_blocks = num_blocks 20 | self.num_sub_blocks = num_sub_blocks 21 | self.preprocess_block = { 22 | 'in_channels': 80, 23 | 'out_channels': 256, 24 | 'kernel_size': 11, 25 | 'stride': 2, 26 | 'dilation': 1, 27 | 'dropout_p': 0.2, 28 | } 29 | self.block = { 30 | 'in_channels': (256, 256, 256, 384, 384, 512, 512, 640, 640, 768), 31 | 'out_channels': (256, 256, 384, 384, 512, 512, 640, 640, 768, 768), 32 | 'kernel_size': (11, 11, 13, 13, 17, 17, 21, 21, 25, 25), 33 | 'dilation': [1] * 10, 34 | 'dropout_p': (0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.3, 0.3, 0.3, 0.3), 35 | } 36 | self.postprocess_block = { 37 | 'in_channels': (768, 896, 1024), 38 | 'out_channels': (896, 1024, num_classes), 39 | 'kernel_size': (29, 1, 1), 40 | 'dilation': (2, 1, 1), 41 | 'dropout_p': (0.4, 0.4, 0.0), 42 | } 43 | 44 | 45 | class Jasper5x3Config: 46 | def __init__(self, num_classes: int, num_blocks: int, num_sub_blocks: int) -> None: 47 | super(Jasper5x3Config, self).__init__() 48 | self.num_blocks = num_blocks 49 | self.num_sub_blocks = num_sub_blocks 50 | self.preprocess_block = { 51 | 'in_channels': 80, 52 | 'out_channels': 256, 53 | 'kernel_size': 11, 54 | 'stride': 2, 55 | 'dilation': 1, 56 | 'dropout_p': 0.2, 57 | } 58 | self.block = { 59 | 'in_channels': (256, 256, 384, 512, 640), 60 | 'out_channels': (256, 384, 512, 640, 768), 61 | 'kernel_size': (11, 13, 17, 21, 25), 62 | 'dilation': [1] * 5, 63 | 'dropout_p': (0.2, 0.2, 0.2, 0.3, 0.3), 64 | } 65 | self.postprocess_block = { 66 | 'in_channels': (768, 896, 1024), 67 | 'out_channels': (896, 1024, num_classes), 68 | 'kernel_size': (29, 1, 1), 69 | 'dilation': (2, 1, 1), 70 | 'dropout_p': (0.4, 0.4, 0.0), 71 | } 72 | -------------------------------------------------------------------------------- /bin/kospeech/optim/lr_scheduler/tri_stage_lr_scheduler.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, Soohwan Kim. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | import math 16 | from kospeech.optim.lr_scheduler.lr_scheduler import LearningRateScheduler 17 | 18 | 19 | class TriStageLRScheduler(LearningRateScheduler): 20 | """ 21 | Tri-Stage Learning Rate Scheduler 22 | Implement the learning rate scheduler in "SpecAugment" 23 | """ 24 | def __init__(self, optimizer, init_lr, peak_lr, final_lr, init_lr_scale, final_lr_scale, warmup_steps, total_steps): 25 | assert isinstance(warmup_steps, int), "warmup_steps should be inteager type" 26 | assert isinstance(total_steps, int), "total_steps should be inteager type" 27 | 28 | super(TriStageLRScheduler, self).__init__(optimizer, init_lr) 29 | self.init_lr *= init_lr_scale 30 | self.final_lr = final_lr 31 | self.peak_lr = peak_lr 32 | self.warmup_steps = warmup_steps 33 | self.hold_steps = int(total_steps >> 1) - warmup_steps 34 | self.decay_steps = int(total_steps >> 1) 35 | 36 | self.warmup_rate = (self.peak_lr - self.init_lr) / self.warmup_steps if self.warmup_steps != 0 else 0 37 | self.decay_factor = -math.log(final_lr_scale) / self.decay_steps 38 | 39 | self.lr = self.init_lr 40 | self.update_step = 0 41 | 42 | def _decide_stage(self): 43 | if self.update_step < self.warmup_steps: 44 | return 0, self.update_step 45 | 46 | offset = self.warmup_steps 47 | 48 | if self.update_step < offset + self.hold_steps: 49 | return 1, self.update_step - offset 50 | 51 | offset += self.hold_steps 52 | 53 | if self.update_step <= offset + self.decay_steps: 54 | # decay stage 55 | return 2, self.update_step - offset 56 | 57 | offset += self.decay_steps 58 | 59 | return 3, self.update_step - offset 60 | 61 | def step(self): 62 | stage, steps_in_stage = self._decide_stage() 63 | 64 | if stage == 0: 65 | self.lr = self.init_lr + self.warmup_rate * steps_in_stage 66 | elif stage == 1: 67 | self.lr = self.peak_lr 68 | elif stage == 2: 69 | self.lr = self.peak_lr * math.exp(-self.decay_factor * steps_in_stage) 70 | elif stage == 3: 71 | self.lr = self.final_lr 72 | else: 73 | raise ValueError("Undefined stage") 74 | 75 | self.set_lr(self.optimizer, self.lr) 76 | self.update_step += 1 77 | 78 | return self.lr 79 | -------------------------------------------------------------------------------- /bin/inference.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, Soohwan Kim. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | import argparse 16 | from typing_extensions import Required 17 | import torch 18 | import torch.nn as nn 19 | import numpy as np 20 | import torchaudio 21 | from torch import Tensor 22 | import os 23 | from tools import revise 24 | 25 | from kospeech.vocabs.ksponspeech import KsponSpeechVocabulary 26 | from kospeech.data.audio.core import load_audio 27 | from kospeech.models import ( 28 | SpeechTransformer, 29 | Jasper, 30 | DeepSpeech2, 31 | ListenAttendSpell, 32 | Conformer, 33 | ) 34 | 35 | 36 | def parse_audio(audio_path: str, del_silence: bool = False, audio_extension: str = 'wav') -> Tensor: 37 | signal = load_audio(audio_path, del_silence, extension=audio_extension) 38 | feature = torchaudio.compliance.kaldi.fbank( 39 | waveform=Tensor(signal).unsqueeze(0), 40 | num_mel_bins=80, 41 | frame_length=20, 42 | frame_shift=10, 43 | window_type='hamming' 44 | ).transpose(0, 1).numpy() 45 | 46 | feature -= feature.mean() 47 | feature /= np.std(feature) 48 | 49 | return torch.FloatTensor(feature).transpose(0, 1) 50 | 51 | 52 | # require (x)- > required (o) 53 | parser = argparse.ArgumentParser(description='KoSpeech') 54 | parser.add_argument('--model_path', type=str, required=True) 55 | parser.add_argument('--audio_path', type=str, required=True) 56 | parser.add_argument('--device', type=str, required=False, default='cpu') 57 | opt = parser.parse_args() 58 | 59 | # 음성 하나에 대해 inference 하는 경우 60 | feature = parse_audio(opt.audio_path, del_silence=True) 61 | input_length = torch.LongTensor([len(feature)]) 62 | vocab = KsponSpeechVocabulary('data/vocab/cssiri_character_vocabs.csv') 63 | 64 | model = torch.load(opt.model_path, map_location=lambda storage, loc: storage).to(opt.device) 65 | if isinstance(model, nn.DataParallel): 66 | model = model.module 67 | model.eval() 68 | 69 | if isinstance(model, ListenAttendSpell): 70 | model.encoder.device = opt.device 71 | model.decoder.device = opt.device 72 | 73 | y_hats = model.recognize(feature.unsqueeze(0), input_length) 74 | elif isinstance(model, DeepSpeech2): 75 | model.device = opt.device 76 | y_hats = model.recognize(feature.unsqueeze(0), input_length) 77 | elif isinstance(model, SpeechTransformer) or isinstance(model, Jasper) or isinstance(model, Conformer): 78 | y_hats = model.greedy_search(feature.unsqueeze(0), input_length) 79 | 80 | sentence = vocab.label_to_string(y_hats.cpu().detach().numpy()) 81 | print(revise(sentence)) 82 | -------------------------------------------------------------------------------- /bin/eval.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, Soohwan Kim. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | import os 16 | import hydra 17 | import warnings 18 | from hydra.core.config_store import ConfigStore 19 | from omegaconf import OmegaConf, DictConfig 20 | from kospeech.evaluator import EvalConfig 21 | from kospeech.data.audio import FilterBankConfig 22 | from kospeech.vocabs.ksponspeech import KsponSpeechVocabulary 23 | from kospeech.vocabs.librispeech import LibriSpeechVocabulary 24 | from kospeech.data.label_loader import load_dataset 25 | from kospeech.data.data_loader import SpectrogramDataset 26 | from kospeech.evaluator.evaluator import Evaluator 27 | from kospeech.utils import check_envirionment, logger 28 | from kospeech.model_builder import load_test_model 29 | 30 | 31 | def inference(config: DictConfig): 32 | device = check_envirionment(config.eval.use_cuda) 33 | model = load_test_model(config.eval, device) 34 | 35 | if config.eval.dataset == 'kspon': 36 | vocab = KsponSpeechVocabulary( 37 | f'../../../data/vocab/cssiri_{config.eval.output_unit}_vocabs.csv', output_unit=config.eval.output_unit 38 | ) 39 | elif config.eval.dataset == 'libri': 40 | vocab = LibriSpeechVocabulary('../../../data/vocab/tokenizer.vocab', 'data/vocab/tokenizer.model') 41 | else: 42 | raise ValueError("Unsupported Dataset : {0}".format(config.eval.dataset)) 43 | 44 | audio_paths, transcripts = load_dataset(config.eval.transcripts_path) 45 | 46 | testset = SpectrogramDataset(audio_paths=audio_paths, transcripts=transcripts, 47 | sos_id=vocab.sos_id, eos_id=vocab.eos_id, 48 | dataset_path=config.eval.dataset_path, config=config, spec_augment=False) 49 | 50 | evaluator = Evaluator( 51 | dataset=testset, 52 | vocab=vocab, 53 | batch_size=config.eval.batch_size, 54 | device=device, 55 | num_workers=config.eval.num_workers, 56 | print_every=config.eval.print_every, 57 | decode=config.eval.decode, 58 | beam_size=config.eval.k, 59 | ) 60 | evaluator.evaluate(model) 61 | 62 | 63 | cs = ConfigStore.instance() 64 | cs.store(group="eval", name="default", node=EvalConfig, package="eval") 65 | cs.store(group="audio", name="fbank", node=FilterBankConfig, package="audio") 66 | 67 | 68 | @hydra.main(config_path=os.path.join('..', "configs"), config_name="eval") 69 | def main(config: DictConfig) -> None: 70 | warnings.filterwarnings('ignore') 71 | logger.info(OmegaConf.to_yaml(config)) 72 | inference(config) 73 | 74 | 75 | if __name__ == '__main__': 76 | main() 77 | -------------------------------------------------------------------------------- /docs/_static/css/badge_only.css: -------------------------------------------------------------------------------- 1 | .fa:before{-webkit-font-smoothing:antialiased}.clearfix{*zoom:1}.clearfix:after,.clearfix:before{display:table;content:""}.clearfix:after{clear:both}@font-face{font-family:FontAwesome;font-style:normal;font-weight:400;src:url(fonts/fontawesome-webfont.eot?674f50d287a8c48dc19ba404d20fe713?#iefix) format("embedded-opentype"),url(fonts/fontawesome-webfont.woff2?af7ae505a9eed503f8b8e6982036873e) format("woff2"),url(fonts/fontawesome-webfont.woff?fee66e712a8a08eef5805a46892932ad) format("woff"),url(fonts/fontawesome-webfont.ttf?b06871f281fee6b241d60582ae9369b9) format("truetype"),url(fonts/fontawesome-webfont.svg?912ec66d7572ff821749319396470bde#FontAwesome) format("svg")}.fa:before{font-family:FontAwesome;font-style:normal;font-weight:400;line-height:1}.fa:before,a .fa{text-decoration:inherit}.fa:before,a .fa,li .fa{display:inline-block}li .fa-large:before{width:1.875em}ul.fas{list-style-type:none;margin-left:2em;text-indent:-.8em}ul.fas li 
.fa{width:.8em}ul.fas li .fa-large:before{vertical-align:baseline}.fa-book:before,.icon-book:before{content:"\f02d"}.fa-caret-down:before,.icon-caret-down:before{content:"\f0d7"}.fa-caret-up:before,.icon-caret-up:before{content:"\f0d8"}.fa-caret-left:before,.icon-caret-left:before{content:"\f0d9"}.fa-caret-right:before,.icon-caret-right:before{content:"\f0da"}.rst-versions{position:fixed;bottom:0;left:0;width:300px;color:#fcfcfc;background:#1f1d1d;font-family:Lato,proxima-nova,Helvetica Neue,Arial,sans-serif;z-index:400}.rst-versions a{color:#2980b9;text-decoration:none}.rst-versions .rst-badge-small{display:none}.rst-versions .rst-current-version{padding:12px;background-color:#272525;display:block;text-align:right;font-size:90%;cursor:pointer;color:#27ae60}.rst-versions .rst-current-version:after{clear:both;content:"";display:block}.rst-versions .rst-current-version .fa{color:#fcfcfc}.rst-versions .rst-current-version .fa-book,.rst-versions .rst-current-version .icon-book{float:left}.rst-versions .rst-current-version.rst-out-of-date{background-color:#e74c3c;color:#fff}.rst-versions .rst-current-version.rst-active-old-version{background-color:#f1c40f;color:#000}.rst-versions.shift-up{height:auto;max-height:100%;overflow-y:scroll}.rst-versions.shift-up .rst-other-versions{display:block}.rst-versions .rst-other-versions{font-size:90%;padding:12px;color:grey;display:none}.rst-versions .rst-other-versions hr{display:block;height:1px;border:0;margin:20px 0;padding:0;border-top:1px solid #413d3d}.rst-versions .rst-other-versions dd{display:inline-block;margin:0}.rst-versions .rst-other-versions dd a{display:inline-block;padding:6px;color:#fcfcfc}.rst-versions.rst-badge{width:auto;bottom:20px;right:20px;left:auto;border:none;max-width:300px;max-height:90%}.rst-versions.rst-badge .fa-book,.rst-versions.rst-badge .icon-book{float:none;line-height:30px}.rst-versions.rst-badge.shift-up .rst-current-version{text-align:right}.rst-versions.rst-badge.shift-up .rst-current-version .fa-book,.rst-versions.rst-badge.shift-up .rst-current-version .icon-book{float:left}.rst-versions.rst-badge>.rst-current-version{width:auto;height:30px;line-height:30px;padding:0 6px;display:block;text-align:center}@media screen and (max-width:768px){.rst-versions{width:85%;display:none}.rst-versions.shift{display:block}} -------------------------------------------------------------------------------- /docs/build/html/_static/css/badge_only.css: -------------------------------------------------------------------------------- 1 | .fa:before{-webkit-font-smoothing:antialiased}.clearfix{*zoom:1}.clearfix:after,.clearfix:before{display:table;content:""}.clearfix:after{clear:both}@font-face{font-family:FontAwesome;font-style:normal;font-weight:400;src:url(fonts/fontawesome-webfont.eot?674f50d287a8c48dc19ba404d20fe713?#iefix) format("embedded-opentype"),url(fonts/fontawesome-webfont.woff2?af7ae505a9eed503f8b8e6982036873e) format("woff2"),url(fonts/fontawesome-webfont.woff?fee66e712a8a08eef5805a46892932ad) format("woff"),url(fonts/fontawesome-webfont.ttf?b06871f281fee6b241d60582ae9369b9) format("truetype"),url(fonts/fontawesome-webfont.svg?912ec66d7572ff821749319396470bde#FontAwesome) format("svg")}.fa:before{font-family:FontAwesome;font-style:normal;font-weight:400;line-height:1}.fa:before,a .fa{text-decoration:inherit}.fa:before,a .fa,li .fa{display:inline-block}li .fa-large:before{width:1.875em}ul.fas{list-style-type:none;margin-left:2em;text-indent:-.8em}ul.fas li .fa{width:.8em}ul.fas li 
.fa-large:before{vertical-align:baseline}.fa-book:before,.icon-book:before{content:"\f02d"}.fa-caret-down:before,.icon-caret-down:before{content:"\f0d7"}.fa-caret-up:before,.icon-caret-up:before{content:"\f0d8"}.fa-caret-left:before,.icon-caret-left:before{content:"\f0d9"}.fa-caret-right:before,.icon-caret-right:before{content:"\f0da"}.rst-versions{position:fixed;bottom:0;left:0;width:300px;color:#fcfcfc;background:#1f1d1d;font-family:Lato,proxima-nova,Helvetica Neue,Arial,sans-serif;z-index:400}.rst-versions a{color:#2980b9;text-decoration:none}.rst-versions .rst-badge-small{display:none}.rst-versions .rst-current-version{padding:12px;background-color:#272525;display:block;text-align:right;font-size:90%;cursor:pointer;color:#27ae60}.rst-versions .rst-current-version:after{clear:both;content:"";display:block}.rst-versions .rst-current-version .fa{color:#fcfcfc}.rst-versions .rst-current-version .fa-book,.rst-versions .rst-current-version .icon-book{float:left}.rst-versions .rst-current-version.rst-out-of-date{background-color:#e74c3c;color:#fff}.rst-versions .rst-current-version.rst-active-old-version{background-color:#f1c40f;color:#000}.rst-versions.shift-up{height:auto;max-height:100%;overflow-y:scroll}.rst-versions.shift-up .rst-other-versions{display:block}.rst-versions .rst-other-versions{font-size:90%;padding:12px;color:grey;display:none}.rst-versions .rst-other-versions hr{display:block;height:1px;border:0;margin:20px 0;padding:0;border-top:1px solid #413d3d}.rst-versions .rst-other-versions dd{display:inline-block;margin:0}.rst-versions .rst-other-versions dd a{display:inline-block;padding:6px;color:#fcfcfc}.rst-versions.rst-badge{width:auto;bottom:20px;right:20px;left:auto;border:none;max-width:300px;max-height:90%}.rst-versions.rst-badge .fa-book,.rst-versions.rst-badge .icon-book{float:none;line-height:30px}.rst-versions.rst-badge.shift-up .rst-current-version{text-align:right}.rst-versions.rst-badge.shift-up .rst-current-version .fa-book,.rst-versions.rst-badge.shift-up .rst-current-version .icon-book{float:left}.rst-versions.rst-badge>.rst-current-version{width:auto;height:30px;line-height:30px;padding:0 6px;display:block;text-align:center}@media screen and (max-width:768px){.rst-versions{width:85%;display:none}.rst-versions.shift{display:block}} -------------------------------------------------------------------------------- /dataset/kspon/preprocess/character.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, Soohwan Kim. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | import os 16 | import pandas as pd 17 | 18 | # id/char 사전 만들기 19 | def load_label(filepath): 20 | char2id = dict() 21 | id2char = dict() 22 | 23 | ch_labels = pd.read_csv(filepath, encoding="utf-8") 24 | 25 | id_list = ch_labels["id"] 26 | char_list = ch_labels["char"] 27 | freq_list = ch_labels["freq"] 28 | 29 | for (id_, char, freq) in zip(id_list, char_list, freq_list): 30 | char2id[char] = id_ 31 | id2char[id_] = char 32 | return char2id, id2char 33 | 34 | 35 | # 문장을 id로 36 | def sentence_to_target(sentence, char2id): 37 | target = str() 38 | 39 | for ch in sentence: 40 | try: 41 | target += (str(char2id[ch]) + ' ') 42 | # 사전에 없는 경우 넘어가라 -> 그냥 묵음처리나 마찬가지. 43 | except KeyError: 44 | continue 45 | 46 | return target[:-1] 47 | 48 | 49 | # 한글 전사에서 빈번하게 사용된 상위 2000개 단어 id 사전 만들기 50 | def generate_character_labels(transcripts, labels_dest): 51 | print('create_char_labels started..') 52 | 53 | label_list = list() 54 | label_freq = list() 55 | 56 | for transcript in transcripts: 57 | for ch in transcript: 58 | if ch not in label_list: 59 | label_list.append(ch) 60 | label_freq.append(1) 61 | else: 62 | label_freq[label_list.index(ch)] += 1 63 | 64 | # sort together Using zip 65 | label_freq, label_list = zip(*sorted(zip(label_freq, label_list), reverse=True)) 66 | label = {'id': [0, 1, 2], 'char': ['', '', ''], 'freq': [0, 0, 0]} 67 | 68 | for idx, (ch, freq) in enumerate(zip(label_list, label_freq)): 69 | label['id'].append(idx + 3) 70 | label['char'].append(ch) 71 | label['freq'].append(freq) 72 | 73 | label['id'] = label['id'][:986] 74 | label['char'] = label['char'][:986] 75 | label['freq'] = label['freq'][:986] 76 | 77 | label_df = pd.DataFrame(label) 78 | label_df.to_csv(os.path.join(labels_dest, "cssiri_character_vocabs.csv"), encoding="utf-8", index=False) 79 | 80 | 81 | def generate_character_script(audio_paths, transcripts, labels_dest): 82 | print('create_script started..') 83 | char2id, id2char = load_label(os.path.join(labels_dest, "cssiri_character_vocabs.csv")) 84 | 85 | with open(os.path.join("transcripts.txt"), "w") as f: 86 | for audio_path, transcript in zip(audio_paths, transcripts): 87 | char_id_transcript = sentence_to_target(transcript, char2id) 88 | audio_path = audio_path.replace('txt', 'wav') 89 | f.write(f'{audio_path}\t{transcript}\t{char_id_transcript}\n') 90 | -------------------------------------------------------------------------------- /bin/kospeech/decode/ensemble.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, Soohwan Kim. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # Note! This code is not available !! 16 | 17 | import torch 18 | import torch.nn as nn 19 | 20 | 21 | class Ensemble(nn.Module): 22 | """ 23 | Ensemble decoding. 24 | Decodes using multiple models simultaneously, 25 | 26 | Note: 27 | Do not use this class directly, use one of the sub classes. 
28 | """ 29 | def __init__(self, models): 30 | super(Ensemble, self).__init__() 31 | self.models = models 32 | self.num_models = len(models) 33 | 34 | def forward(self, *args, **kwargs): 35 | raise NotImplementedError 36 | 37 | 38 | class BasicEnsemble(Ensemble): 39 | """ 40 | Basic ensemble decoding. 41 | 42 | Decodes using multiple models simultaneously, 43 | combining their prediction distributions by adding. 44 | All models in the ensemble must share a target characters. 45 | """ 46 | def __init__(self, models): 47 | super(BasicEnsemble, self).__init__(models) 48 | 49 | def forward(self, inputs, input_lengths): 50 | y_hats = None 51 | 52 | with torch.no_grad(): 53 | for model in self.models: 54 | if y_hats is None: 55 | y_hats = model(inputs, input_lengths, teacher_forcing_ratio=0.0) 56 | else: 57 | y_hats += model(inputs, input_lengths, teacher_forcing_ratio=0.0) 58 | 59 | return y_hats 60 | 61 | 62 | class WeightedEnsemble(Ensemble): 63 | """ 64 | Weighted ensemble decoding. 65 | 66 | Decodes using multiple models simultaneously, 67 | combining their prediction distributions by weighted sum. 68 | All models in the ensemble must share a target characters. 69 | """ 70 | def __init__(self, models, dim=128): 71 | super(WeightedEnsemble, self).__init__(models) 72 | self.meta_classifier = nn.Sequential( 73 | nn.Linear(self.num_models, dim), 74 | nn.ELU(inplace=True), 75 | nn.Linear(dim, self.num_models) 76 | ) 77 | 78 | def forward(self, inputs, input_lengths): 79 | y_hats, outputs = None, list() 80 | weights = torch.FloatTensor([1.] * self.num_models) 81 | 82 | # model`s parameters are fixed 83 | with torch.no_grad(): 84 | for model in self.models: 85 | outputs.append(model(inputs, input_lengths, teacher_forcing_ratio=0.0)) 86 | 87 | weights = self.meta_classifier(weights) 88 | 89 | for (output, weight) in zip(outputs, weights): 90 | if y_hats is None: 91 | y_hats = output * weight 92 | else: 93 | y_hats += output * weight 94 | 95 | return y_hats 96 | -------------------------------------------------------------------------------- /bin/kospeech/criterion/label_smoothed_cross_entropy.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, Soohwan Kim. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import torch 16 | import torch.nn as nn 17 | import torch.nn.functional as F 18 | from torch import Tensor 19 | 20 | 21 | class LabelSmoothedCrossEntropyLoss(nn.Module): 22 | """ 23 | Label smoothed cross entropy loss function. 
24 | 25 | Args: 26 | num_classes (int): the number of classfication 27 | ignore_index (int): Indexes that are ignored when calculating loss 28 | smoothing (float): ratio of smoothing (confidence = 1.0 - smoothing) 29 | dim (int): dimension of calculation loss 30 | reduction (str): reduction method [sum, mean] (default: sum) 31 | 32 | Inputs: logits, target 33 | logits (torch.Tensor): probability distribution value from model and it has a logarithm shape 34 | target (torch.Tensor): ground-thruth encoded to integers which directly point a word in label 35 | 36 | Returns: label_smoothed 37 | - **label_smoothed** (float): sum of loss 38 | """ 39 | def __init__( 40 | self, 41 | num_classes: int, # the number of classfication 42 | ignore_index: int, # indexes that are ignored when calcuating loss 43 | smoothing: float = 0.1, # ratio of smoothing (confidence = 1.0 - smoothing) 44 | dim: int = -1, # dimension of caculation loss 45 | reduction='sum', # reduction method [sum, mean] 46 | ) -> None: 47 | super(LabelSmoothedCrossEntropyLoss, self).__init__() 48 | self.confidence = 1.0 - smoothing 49 | self.smoothing = smoothing 50 | self.num_classes = num_classes 51 | self.dim = dim 52 | self.ignore_index = ignore_index 53 | self.reduction = reduction.lower() 54 | 55 | if self.reduction == 'sum': 56 | self.reduction_method = torch.sum 57 | elif self.reduction == 'mean': 58 | self.reduction_method = torch.mean 59 | else: 60 | raise ValueError("Unsupported reduction method {0}".format(reduction)) 61 | 62 | def forward(self, logits: Tensor, targets: Tensor) -> Tensor: 63 | if self.smoothing > 0.0: 64 | with torch.no_grad(): 65 | label_smoothed = torch.zeros_like(logits) 66 | label_smoothed.fill_(self.smoothing / (self.num_classes - 1)) 67 | label_smoothed.scatter_(1, targets.data.unsqueeze(1), self.confidence) 68 | label_smoothed[targets == self.ignore_index, :] = 0 69 | return self.reduction_method(-label_smoothed * logits) 70 | 71 | return F.cross_entropy(logits, targets, ignore_index=self.ignore_index, reduction=self.reduction) 72 | -------------------------------------------------------------------------------- /test/attn_visualize.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, Soohwan Kim. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | import torch 16 | import numpy as np 17 | import librosa 18 | import pandas as pd 19 | import matplotlib.pyplot as plt 20 | import seaborn as sns 21 | from kospeech.data.audio.core import split 22 | from kospeech.models.seq2seq.decoder import Seq2seqDecoder 23 | 24 | MODEL_PATH = '../data/checkpoints/model.pt' 25 | AUDIO_PATH = '../data/sample/KaiSpeech_000098.pcm' 26 | DEL_SILENCE = True 27 | NORMALIZE = True 28 | SAMPLE_RATE = 16000 29 | N_MELS = 80 30 | N_FFT = 320 31 | HOP_LENGTH = 160 32 | 33 | 34 | def load_audio(audio_path, del_silence): 35 | sound = np.memmap(audio_path, dtype='h', mode='r').astype('float32') 36 | 37 | if del_silence: 38 | non_silence_indices = split(sound, top_db=30) 39 | sound = np.concatenate([sound[start:end] for start, end in non_silence_indices]) 40 | 41 | sound /= 32767 # normalize audio 42 | return sound 43 | 44 | 45 | # Set your parse_audio() method used in training. 46 | def parse_audio(audio_path): 47 | sound = load_audio(audio_path, DEL_SILENCE) 48 | 49 | spectrogram = librosa.feature.melspectrogram(sound, SAMPLE_RATE, n_mels=N_MELS, n_fft=N_FFT, hop_length=HOP_LENGTH) 50 | spectrogram = librosa.amplitude_to_db(spectrogram, ref=np.max) 51 | 52 | if NORMALIZE: 53 | mean = np.mean(spectrogram) 54 | std = np.std(spectrogram) 55 | spectrogram -= mean 56 | spectrogram /= std 57 | 58 | spectrogram = spectrogram[:, ::-1] 59 | spectrogram = torch.FloatTensor(np.ascontiguousarray(np.swapaxes(spectrogram, 0, 1))) 60 | 61 | return spectrogram 62 | 63 | 64 | spectrogram = parse_audio(AUDIO_PATH) 65 | model = torch.load(MODEL_PATH) 66 | 67 | _, metadata = model(spectrogram.unsqueeze(0), torch.IntTensor([len(spectrogram)]), teacher_forcing_ratio=0.0) # D(NxT) 68 | 69 | alignments = metadata[Seq2seqDecoder.KEY_ATTN_SCORE] 70 | attention_maps = None 71 | 72 | for decode_timestep in alignments: 73 | if attention_maps is None: 74 | attention_maps = decode_timestep 75 | else: 76 | attention_maps = torch.cat([attention_maps, decode_timestep], dim=1) # NxDxT 77 | 78 | attention_maps = torch.flip(attention_maps, dims=[0, 1]) 79 | num_heads = attention_maps.size(0) 80 | 81 | f = plt.figure(figsize=(16, 6)) 82 | plt.imshow(spectrogram.transpose(0, 1), aspect='auto', origin='lower') 83 | plt.savefig("./image/spectrogram.png") 84 | 85 | for n in range(num_heads): 86 | g = plt.figure(figsize=(10, 8)) 87 | attention_map = pd.DataFrame(attention_maps[n].cpu().detach().numpy().transpose(0, 1)) 88 | attention_map = sns.heatmap(attention_map, fmt="f", cmap='viridis') 89 | attention_map.invert_yaxis() 90 | fig = attention_map.get_figure() 91 | fig.savefig("./image/head%s.png" % str(n)) 92 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |
2 | 3 | ## Korean Speech Recognition for Non-Native Speakers (Korean STT) 4 | 5 | 6 | **Building a Korean speech recognition model with kospeech** 7 | ___ 8 |
9 | 10 | This project was carried out using 11 | [kospeech](https://github.com/sooftware/kospeech), an end-to-end open-source Korean speech recognition toolkit. 12 |

13 | 14 | ## - Project Overview 15 | - This is part of a project for a hackathon in which participants built a speech recognition model from Korean speech data uttered by non-native speakers and proposed related commercialization ideas.
(Competition period: about 3 weeks) 16 | - For this reason, the data and the trained model have not been uploaded. 17 | - A model was developed with a total of 300,000 training samples, and CER and WER measured on 10,000 test samples came out to 'CER: 0.085, WER: 0.1919' (a minimal CER sketch is shown after this list). 18 | - Under an evaluation weighted 40% model performance and 60% commercialization idea, the team placed 1st in the performance category and 2nd overall among 29 participating teams, receiving an excellence award. 19 | - The original open-source code contained quite a few errors/bugs. This repository is the result of fixing them one by one, and in the process some parts may have been modified to fit the data I used. 20 | - For details on how the open-source toolkit was used and how the problems encountered along the way were resolved, see the 'kospeech (Korean STT)' category on my [blog](https://mingchin.tistory.com/152). 21 | - ./bin/inference_wer.py, ./bin/tools.py, and ./bin/prediction.py do not exist in kospeech; they are files created as needed for this project. 22 | - Files that are not strictly required but may be useful as references are placed under ./etc. 23 |
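The CER figure above is a character-level edit-distance rate. The sketch below is illustrative only; it is not the competition's official scoring code nor the exact logic in ./bin/inference_wer.py, just a minimal way to compute CER from a reference and a hypothesis string:

```
# Minimal CER sketch (illustrative only; not the project's actual scoring code).
# CER = (substitutions + deletions + insertions) / number of reference characters.
def edit_distance(ref: str, hyp: str) -> int:
    # Standard dynamic-programming Levenshtein distance over characters.
    dp = list(range(len(hyp) + 1))
    for i, r in enumerate(ref, start=1):
        prev, dp[0] = dp[0], i
        for j, h in enumerate(hyp, start=1):
            cur = min(dp[j] + 1,        # deletion
                      dp[j - 1] + 1,    # insertion
                      prev + (r != h))  # substitution (free if characters match)
            prev, dp[j] = dp[j], cur
    return dp[-1]


def cer(reference: str, hypothesis: str) -> float:
    # CER is typically computed on character sequences with spaces removed.
    reference = reference.replace(' ', '')
    hypothesis = hypothesis.replace(' ', '')
    return edit_distance(reference, hypothesis) / max(1, len(reference))


print(cer('외국인 발화 음성', '외국인 바화 음성'))  # one substitution out of seven characters
```

WER is computed the same way but over word sequences instead of characters.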

24 | 25 | ## - Module Installation 26 | ``` 27 | !pip install -r requirements_cssiri.txt 28 | ``` 29 | - Python 3.8 was used. 30 | - Among the various acoustic models provided by kospeech, ds2 (DeepSpeech2) was used. 31 | - PyTorch 1.10 is used, so if you are on a newer version you need to reinstall PyTorch separately (an example command is shown after this list). 32 | - All modules needed for preprocessing, training, inference, and saving prediction results are included. 33 |
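If a newer PyTorch is already installed, one way to pin the version mentioned above is shown below. This is only an example: the exact build (CPU-only vs. a specific CUDA version) should be chosen to match your environment.

```
!pip uninstall -y torch torchaudio
!pip install torch==1.10.0 torchaudio==0.10.0
```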

34 | 35 | ## - Preprocessing 36 | ``` 37 | !python ./dataset/kspon/main.py --dataset_path $dataset_path --vocab_dest $vocab_dict_destination --output_unit 'character' --preprocess_mode 'phonetic' 38 | ``` 39 | - Set output_unit and preprocess_mode according to your situation. 40 | - As lines 95~101 of ./dataset/kspon/preprocess/preprocess.py show, a 'train.txt' file is required at './'. The file must be written in the format 'audio file path' + '\t' + 'Korean transcription' (an example is shown after this list). 41 | - The code used to create train.txt is provided in ./etc/traintext 생성.ipynb. 42 |
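For reference, a train.txt in the format described above looks like the following. The file names and sentences are made-up examples, and the separator between the two columns is a literal tab character:

```
foreign_speech_0001.wav	안녕하세요 만나서 반갑습니다
foreign_speech_0002.wav	오늘 날씨가 정말 좋네요
```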

43 | 44 | ## - Training 45 | ``` 46 | !python ./bin/main.py model=ds2 train=ds2_train train.dataset_path=$dataset_path 47 | ``` 48 | - Training-related configs (epoch, batch_size, spec_augment, audio file extension, etc.) can be modified in ./configs/audio/fbank.yaml or ./configs/train/ds2_train.yaml (a rough sketch is shown after this section). 49 |
For a more detailed explanation, please refer to [this post](https://mingchin.tistory.com/222). 50 |
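As a rough sketch of the kind of fields involved, a training config might contain entries like the ones below. The key names are illustrative assumptions only; check the actual ./configs/train/ds2_train.yaml and ./configs/audio/fbank.yaml in this repository for the exact names and values.

```
# Illustrative sketch only - key names are assumptions, not the real config
num_epochs: 20         # number of training epochs
batch_size: 16         # batch size per step
spec_augment: True     # whether to apply SpecAugment
audio_extension: wav   # audio file extension of the dataset
```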

51 | 52 | ## - Inference 53 | 54 | Set the device option of every command according to your environment. 55 | 56 | 1. Prediction for a single audio file 57 | * Command 58 | 59 | ``` 60 | !python ./bin/inference.py --model_path $model_path --audio_path $audio_path --device "cpu" 61 | ``` 62 | * Output 63 | 64 | ``` 65 | Speech recognition result 66 | ``` 67 | 2. Prediction for a single audio file plus saving the computed CER/WER results
68 | (The results are saved to dst_path, and a transcripts.txt file containing the ground-truth labels must be passed as transcript_path. Its format must be identical to the train.txt file used for preprocessing or to the transcripts.txt used for training; see the format example after this section.) 69 | * Command 70 | ``` 71 | python ./bin/inference_wer.py --model_path $model_path --audio_path $audio_path --transcript_path $transcript_path --dst_path $result_destination --device "cpu" 72 | ``` 73 | * Output 74 | 75 | ``` 76 | Speech recognition result 77 | ``` 78 | 3. Prediction for multiple audio files (a folder) and saving the results (.txt, .xlsx) 79 | * Command 80 | ``` 81 | python ./bin/prediction.py --model_path $model_path --audio_path $audio_path --submission 'True' --device "cpu" 82 | ``` 83 | Setting 'submission = True' saves the prediction results as an .xlsx file; note that this requires a submission Excel file with 2 columns. 84 | * Output 85 | 86 | .txt and .xlsx files are created in the ./outputs folder 87 |
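For reference, the transcripts.txt written by the preprocessing step (see dataset/kspon/preprocess/grapheme.py further below) has three tab-separated fields per line: audio path, text transcription, and the space-separated label indices. The line below is a made-up illustration, not real data:
```
KsponSpeech/foreign_ko_000001.pcm	안녕하세요	45 12 7 3 28
```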

88 | 89 | ## - References 90 | - WER/CER reference: 91 | https://holianh.github.io/portfolio/Cach-tinh-WER/ 92 | - kospeech: 93 | https://github.com/sooftware/kospeech 94 | -------------------------------------------------------------------------------- /dataset/kspon/preprocess/grapheme.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, Soohwan Kim. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | #     http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import os 16 | import pandas as pd 17 | import unicodedata 18 | 19 | 20 | def load_label(filepath): 21 |     grpm2id = dict() 22 |     id2grpm = dict() 23 | 24 |     vocab_data_frame = pd.read_csv(filepath, encoding="utf-8") 25 | 26 |     id_list = vocab_data_frame["id"] 27 |     grpm_list = vocab_data_frame["grpm"] 28 | 29 |     for _id, grpm in zip(id_list, grpm_list): 30 |         grpm2id[grpm] = _id 31 |         id2grpm[_id] = grpm 32 |     return grpm2id, id2grpm 33 | 34 | 35 | def sentence_to_target(transcript, grpm2id): 36 |     target = str() 37 | 38 |     for grpm in transcript: 39 |         target += (str(grpm2id[grpm]) + ' ') 40 | 41 |     return target[:-1] 42 | 43 | 44 | def sentence_to_grapheme(audio_paths, transcripts, vocab_dest: str = './data'): 45 |     grapheme_transcripts = list() 46 | 47 |     if not os.path.exists(vocab_dest): 48 |         os.mkdir(vocab_dest) 49 | 50 |     for transcript in transcripts: 51 |         grapheme_transcripts.append(" ".join(unicodedata.normalize('NFKD', transcript).replace(' ', '|')).upper()) 52 | 53 |     generate_grapheme_labels(grapheme_transcripts, vocab_dest) 54 | 55 |     print('create_script started..') 56 |     grpm2id, id2grpm = load_label(os.path.join(vocab_dest, "aihub_labels.csv")) 57 | 58 |     with open(os.path.join(f"{vocab_dest}/transcripts.txt"), "w") as f: 59 |         for audio_path, transcript, grapheme_transcript in zip(audio_paths, transcripts, grapheme_transcripts): 60 |             audio_path = audio_path.replace('txt', 'pcm') 61 |             grpm_id_transcript = sentence_to_target(grapheme_transcript.split(), grpm2id) 62 |             f.write(f'{audio_path}\t{transcript}\t{grpm_id_transcript}\n') 63 | 64 | 65 | def generate_grapheme_labels(grapheme_transcripts, vocab_dest: str = './data'): 66 |     vocab_list = list() 67 |     vocab_freq = list() 68 | 69 |     for grapheme_transcript in grapheme_transcripts: 70 |         graphemes = grapheme_transcript.split() 71 |         for grapheme in graphemes: 72 |             if grapheme not in vocab_list: 73 |                 vocab_list.append(grapheme) 74 |                 vocab_freq.append(1) 75 |             else: 76 |                 vocab_freq[vocab_list.index(grapheme)] += 1 77 | 78 |     vocab_freq, vocab_list = zip(*sorted(zip(vocab_freq, vocab_list), reverse=True)) 79 |     vocab_dict = { 80 |         'id': [0, 1, 2], 81 |         'grpm': ['<pad>', '<sos>', '<eos>'], 82 |         'freq': [0, 0, 0] 83 |     } 84 | 85 |     for idx, (grpm, freq) in enumerate(zip(vocab_list, vocab_freq)): 86 |         vocab_dict['id'].append(idx + 3) 87 |         vocab_dict['grpm'].append(grpm) 88 |         vocab_dict['freq'].append(freq) 89 | 90 |     label_df = pd.DataFrame(vocab_dict) 91 |     label_df.to_csv(os.path.join(vocab_dest, "aihub_labels.csv"), encoding="utf-8", index=False) 92 | 
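As a quick illustration of the grapheme unit produced by sentence_to_grapheme() above (a hedged sketch; the sentence is a made-up example, not taken from the dataset): NFKD normalization decomposes each Hangul syllable into its jamo, spaces become '|', and every symbol is then separated by a single space.
```python
import unicodedata

sentence = "안녕 하세요"  # placeholder sentence for illustration only
graphemes = " ".join(unicodedata.normalize('NFKD', sentence).replace(' ', '|')).upper()
print(graphemes)
# -> ᄋ ᅡ ᆫ ᄂ ᅧ ᆼ | ᄒ ᅡ ᄉ ᅦ ᄋ ᅭ   (conjoining jamo, one vocabulary unit per symbol)
```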
-------------------------------------------------------------------------------- /bin/kospeech/criterion/joint_ctc_cross_entropy.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, Soohwan Kim. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | #     http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import torch.nn as nn 16 | from typing import Tuple 17 | from torch import Tensor 18 | 19 | from kospeech.criterion import LabelSmoothedCrossEntropyLoss 20 | 21 | 22 | class JointCTCCrossEntropyLoss(nn.Module): 23 |     """ 24 |     Provides the Joint CTC-CrossEntropy Loss function 25 | 26 |     Args: 27 |         num_classes (int): the number of classes 28 |         ignore_index (int): index that is ignored when calculating the loss 29 |         dim (int): dimension along which the loss is calculated 30 |         reduction (str): reduction method [sum, mean] (default: mean) 31 |         ctc_weight (float): weight of ctc loss 32 |         cross_entropy_weight (float): weight of cross entropy loss 33 |         blank_id (int): index of the blank token for ctc 34 |     """ 35 | 36 |     def __init__( 37 |             self, 38 |             num_classes: int, 39 |             ignore_index: int, 40 |             dim: int = -1, 41 |             reduction='mean', 42 |             ctc_weight: float = 0.3, 43 |             cross_entropy_weight: float = 0.7, 44 |             blank_id: int = None, 45 |             smoothing: float = 0.1, 46 |     ) -> None: 47 |         super(JointCTCCrossEntropyLoss, self).__init__() 48 |         self.num_classes = num_classes 49 |         self.dim = dim 50 |         self.ignore_index = ignore_index 51 |         self.reduction = reduction.lower() 52 |         self.ctc_weight = ctc_weight 53 |         self.cross_entropy_weight = cross_entropy_weight 54 |         self.ctc_loss = nn.CTCLoss(blank=blank_id, reduction=self.reduction, zero_infinity=True) 55 |         if smoothing > 0.0: 56 |             self.cross_entropy_loss = LabelSmoothedCrossEntropyLoss( 57 |                 num_classes=num_classes, 58 |                 ignore_index=ignore_index, 59 |                 smoothing=smoothing, 60 |                 reduction=reduction, 61 |                 dim=-1, 62 |             ) 63 |         else: 64 |             self.cross_entropy_loss = nn.CrossEntropyLoss(reduction=self.reduction, ignore_index=self.ignore_index) 65 | 66 |     def forward( 67 |             self, 68 |             encoder_log_probs: Tensor, 69 |             decoder_log_probs: Tensor, 70 |             output_lengths: Tensor, 71 |             targets: Tensor, 72 |             target_lengths: Tensor, 73 |     ) -> Tuple[Tensor, Tensor, Tensor]: 74 |         ctc_loss = self.ctc_loss(encoder_log_probs, targets, output_lengths, target_lengths) 75 |         cross_entropy_loss = self.cross_entropy_loss(decoder_log_probs, targets.contiguous().view(-1)) 76 |         loss = cross_entropy_loss * self.cross_entropy_weight + ctc_loss * self.ctc_weight 77 |         return loss, ctc_loss, cross_entropy_loss 78 | -------------------------------------------------------------------------------- /dataset/libri/preprocess.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, Soohwan Kim. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import os 16 | import sentencepiece as spm 17 | 18 | LIBRI_SPEECH_DATASETS = [ 19 | 'train_960', 'dev-clean', 'dev-other', 'test-clean', 'test-other' 20 | ] 21 | 22 | 23 | def collect_transcripts(dataset_path): 24 | transcripts_collection = list() 25 | 26 | for dataset in LIBRI_SPEECH_DATASETS: 27 | dataset_transcripts = list() 28 | 29 | for subfolder1 in os.listdir(os.path.join(dataset_path, dataset)): 30 | for subfolder2 in os.listdir(os.path.join(dataset_path, dataset, subfolder1)): 31 | for file in os.listdir(os.path.join(dataset_path, dataset, subfolder1, subfolder2)): 32 | if file.endswith('txt'): 33 | with open(os.path.join(dataset_path, dataset, subfolder1, subfolder2, file)) as f: 34 | for line in f.readlines(): 35 | tokens = line.split() 36 | audio = '%s.flac' % os.path.join(dataset, subfolder1, subfolder2, tokens[0]) 37 | transcript = " ".join(tokens[1:]) 38 | dataset_transcripts.append('%s|%s' % (audio, transcript)) 39 | 40 | else: 41 | continue 42 | 43 | transcripts_collection.append(dataset_transcripts) 44 | 45 | return transcripts_collection 46 | 47 | 48 | def prepare_tokenizer(train_transcripts, vocab_size): 49 | input_file = 'spm_input.txt' 50 | model_name = 'tokenizer' 51 | model_type = 'unigram' 52 | 53 | with open(input_file, 'w') as f: 54 | for transcript in train_transcripts: 55 | f.write('{}\n'.format(transcript.split('|')[-1])) 56 | 57 | input_args = '--input=%s --model_prefix=%s --vocab_size=%s --model_type=%s' 58 | cmd = input_args % (input_file, model_name, vocab_size, model_type) 59 | spm.SentencePieceTrainer.Train(cmd) 60 | 61 | 62 | def generate_transcript_file(dataset_name, transcripts): 63 | sp = spm.SentencePieceProcessor() 64 | sp.Load("tokenizer.model") 65 | 66 | with open('../../data/%s-transcript.txt' % dataset_name, 'w') as f: 67 | for transcript in transcripts: 68 | audio, transcript = transcript.split('|') 69 | text = " ".join(sp.EncodeAsPieces(transcript)) 70 | label = " ".join([str(item) for item in sp.EncodeAsIds(transcript)]) 71 | 72 | f.write('%s\t%s\t%s\n' % (audio, text, label)) 73 | 74 | 75 | def merge_train_dev_transcript_file(): 76 | merge_list = ['train_960', 'dev-clean', 'dev-other'] 77 | 78 | lines = list() 79 | 80 | for dataset in merge_list: 81 | with open('../../data/%s-transcript.txt' % dataset) as f: 82 | for line in f.readlines(): 83 | lines.append(line) 84 | 85 | with open('../../data/train.txt', 'w') as f: 86 | for line in lines: 87 | f.write('%s' % line) 88 | --------------------------------------------------------------------------------