├── docs ├── .nojekyll ├── build │ ├── html │ │ ├── .nojekyll │ │ ├── objects.inv │ │ ├── _static │ │ │ ├── file.png │ │ │ ├── minus.png │ │ │ ├── plus.png │ │ │ ├── css │ │ │ │ ├── fonts │ │ │ │ │ ├── lato-bold.woff │ │ │ │ │ ├── lato-bold.woff2 │ │ │ │ │ ├── lato-normal.woff │ │ │ │ │ ├── lato-normal.woff2 │ │ │ │ │ ├── Roboto-Slab-Bold.woff │ │ │ │ │ ├── Roboto-Slab-Bold.woff2 │ │ │ │ │ ├── lato-bold-italic.woff │ │ │ │ │ ├── lato-bold-italic.woff2 │ │ │ │ │ ├── Roboto-Slab-Regular.woff │ │ │ │ │ ├── Roboto-Slab-Regular.woff2 │ │ │ │ │ ├── fontawesome-webfont.eot │ │ │ │ │ ├── fontawesome-webfont.ttf │ │ │ │ │ ├── fontawesome-webfont.woff │ │ │ │ │ ├── fontawesome-webfont.woff2 │ │ │ │ │ ├── lato-normal-italic.woff │ │ │ │ │ └── lato-normal-italic.woff2 │ │ │ │ └── badge_only.css │ │ │ ├── documentation_options.js │ │ │ └── js │ │ │ │ ├── badge_only.js │ │ │ │ └── html5shiv.min.js │ │ ├── _sources │ │ │ ├── Evaluator.rst.txt │ │ │ ├── Checkpoint.rst.txt │ │ │ ├── Trainer.rst.txt │ │ │ ├── Deep Speech 2.rst.txt │ │ │ ├── Decode.rst.txt │ │ │ ├── Jasper.rst.txt │ │ │ ├── Etc.rst.txt │ │ │ ├── Interface.rst.txt │ │ │ ├── Vocabs.rst.txt │ │ │ ├── Modules.rst.txt │ │ │ ├── RNN Transducer.rst.txt │ │ │ ├── Listen Attend Spell.rst.txt │ │ │ ├── Criterion.rst.txt │ │ │ ├── Optim.rst.txt │ │ │ ├── Learning Rate Schedulers.rst.txt │ │ │ ├── Conformer.rst.txt │ │ │ ├── Data.rst.txt │ │ │ ├── index.rst.txt │ │ │ └── Speech Transformer.rst.txt │ │ └── .buildinfo │ ├── .DS_Store │ └── doctrees │ │ ├── Etc.doctree │ │ ├── Data.doctree │ │ ├── Decode.doctree │ │ ├── Jasper.doctree │ │ ├── Optim.doctree │ │ ├── Vocabs.doctree │ │ ├── index.doctree │ │ ├── Modules.doctree │ │ ├── Trainer.doctree │ │ ├── Checkpoint.doctree │ │ ├── Conformer.doctree │ │ ├── Criterion.doctree │ │ ├── Evaluator.doctree │ │ ├── Interface.doctree │ │ ├── environment.pickle │ │ ├── notes │ │ ├── intro.doctree │ │ └── Preparation.doctree │ │ ├── Deep Speech 2.doctree │ │ ├── RNN Transducer.doctree │ │ ├── Speech Transformer.doctree │ │ ├── Listen Attend Spell.doctree │ │ └── Learning Rate Schedulers.doctree ├── .DS_Store ├── objects.inv ├── _static │ ├── file.png │ ├── plus.png │ ├── minus.png │ ├── css │ │ ├── fonts │ │ │ ├── lato-bold.woff │ │ │ ├── lato-bold.woff2 │ │ │ ├── lato-normal.woff │ │ │ ├── lato-normal.woff2 │ │ │ ├── Roboto-Slab-Bold.woff │ │ │ ├── lato-bold-italic.woff │ │ │ ├── Roboto-Slab-Bold.woff2 │ │ │ ├── fontawesome-webfont.eot │ │ │ ├── fontawesome-webfont.ttf │ │ │ ├── lato-bold-italic.woff2 │ │ │ ├── lato-normal-italic.woff │ │ │ ├── Roboto-Slab-Regular.woff │ │ │ ├── Roboto-Slab-Regular.woff2 │ │ │ ├── fontawesome-webfont.woff │ │ │ ├── fontawesome-webfont.woff2 │ │ │ └── lato-normal-italic.woff2 │ │ └── badge_only.css │ ├── documentation_options.js │ └── js │ │ ├── badge_only.js │ │ └── html5shiv.min.js ├── source │ ├── Evaluator.rst │ ├── Checkpoint.rst │ ├── Trainer.rst │ ├── Deep Speech 2.rst │ ├── Decode.rst │ ├── Jasper.rst │ ├── Etc.rst │ ├── Interface.rst │ ├── Vocabs.rst │ ├── Modules.rst │ ├── RNN Transducer.rst │ ├── Listen Attend Spell.rst │ ├── Criterion.rst │ ├── Optim.rst │ ├── Learning Rate Schedulers.rst │ ├── Conformer.rst │ ├── Data.rst │ ├── index.rst │ └── Speech Transformer.rst ├── _sources │ ├── Evaluator.rst.txt │ ├── Checkpoint.rst.txt │ ├── Trainer.rst.txt │ ├── Deep Speech 2.rst.txt │ ├── Decode.rst.txt │ ├── Jasper.rst.txt │ ├── Etc.rst.txt │ ├── Interface.rst.txt │ ├── Vocabs.rst.txt │ ├── Modules.rst.txt │ ├── RNN Transducer.rst.txt │ ├── Listen Attend Spell.rst.txt │ ├── 
Criterion.rst.txt │ ├── Optim.rst.txt │ ├── Learning Rate Schedulers.rst.txt │ ├── Conformer.rst.txt │ ├── Data.rst.txt │ ├── index.rst.txt │ └── Speech Transformer.rst.txt ├── .buildinfo ├── Makefile └── make.bat ├── configs ├── eval.yaml ├── train.yaml ├── eval │ └── default.yaml ├── model │ ├── rnnt.yaml │ ├── ds2.yaml │ ├── transformer.yaml │ ├── joint-ctc-attention-transformer.yaml │ ├── las.yaml │ ├── joint-ctc-attention-las.yaml │ ├── conformer-medium.yaml │ ├── conformer-small.yaml │ └── conformer-large.yaml ├── audio │ ├── mfcc.yaml │ ├── fbank.yaml │ ├── melspectrogram.yaml │ └── spectrogram.yaml └── train │ ├── ds2_train.yaml │ ├── las_train.yaml │ └── transformer_train.yaml ├── train.txt ├── data ├── .DS_Store ├── vocab │ ├── .DS_Store │ ├── tokenizer.model │ ├── kspon_sentencepiece.model │ └── aihub_grapheme_vocabs.csv └── train_result │ ├── eval_result.csv │ ├── train_result.csv │ └── train_step_result.csv ├── dataset ├── kspon │ ├── requirements.txt │ ├── preprocess.sh │ ├── LICENSE │ └── preprocess │ │ ├── subword.py │ │ ├── character.py │ │ └── grapheme.py └── libri │ ├── prepare-libri.sh │ ├── prepare-libri.py │ └── preprocess.py ├── .gitattributes ├── .github └── ISSUE_TEMPLATE │ ├── quesion.md │ ├── bug_report.md │ └── feature_request.md ├── .gitignore ├── test ├── test_rnnt_recognize.py ├── test_conformer_recognize.py ├── model_architecture.py ├── cer_visualize.py ├── loss_visualize.py ├── test_rnnt.py ├── test_deepspeech2_recognize.py ├── test_conformer.py ├── test_las_decoder.py ├── test_jasper_recognize.py ├── test_las_recognize.py ├── test_las_encoder.py ├── test_transformer_recognize.py ├── test_jasper.py ├── test_deepspeech2.py ├── test_las.py ├── test_tri_stage_lr_scheduler.py ├── test_transformer_lr_scheduler.py ├── test_transformer.py ├── test_transformer_encoder.py └── attn_visualize.py ├── bin ├── kospeech │ ├── checkpoint │ │ └── __init__.py │ ├── optim │ │ ├── lr_scheduler │ │ │ ├── __init__.py │ │ │ ├── lr_scheduler.py │ │ │ ├── transformer_lr_scheduler.py │ │ │ └── tri_stage_lr_scheduler.py │ │ └── __init__.py │ ├── models │ │ ├── jasper │ │ │ ├── __init__.py │ │ │ └── configs.py │ │ ├── deepspeech2 │ │ │ └── __init__.py │ │ ├── rnnt │ │ │ └── __init__.py │ │ ├── transformer │ │ │ ├── __init__.py │ │ │ ├── mask.py │ │ │ ├── sublayers.py │ │ │ └── embeddings.py │ │ ├── activation.py │ │ ├── las │ │ │ └── __init__.py │ │ ├── __init__.py │ │ ├── conformer │ │ │ └── __init__.py │ │ └── modules.py │ ├── criterion │ │ ├── __init__.py │ │ ├── transducer.py │ │ ├── label_smoothed_cross_entropy.py │ │ └── joint_ctc_cross_entropy.py │ ├── data │ │ ├── __init__.py │ │ ├── audio │ │ │ └── __init__.py │ │ └── label_loader.py │ ├── evaluator │ │ ├── __init__.py │ │ └── evaluator.py │ ├── vocabs │ │ ├── __init__.py │ │ └── librispeech.py │ └── decode │ │ └── ensemble.py ├── inference.py └── eval.py ├── requirements_cssiri.txt ├── setup.py ├── etc ├── cmd 실행 코드.txt └── traintext 생성.ipynb └── README.md /docs/.nojekyll: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /docs/build/html/.nojekyll: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /configs/eval.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - audio: fbank 3 | - eval: default 
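Note on the defaults list in configs/eval.yaml just above: it follows Hydra's config-group convention, so "audio: fbank" pulls in configs/audio/fbank.yaml and "eval: default" pulls in configs/eval/default.yaml, and either group can be swapped or overridden per run. Below is a minimal sketch of that composition, assuming a recent hydra-core/omegaconf install and that bin/eval.py is the Hydra entry point; the compose call is illustrative only, not the project's own code.

# Illustrative only: composes configs/eval.yaml the way Hydra would at runtime.
# Assumes hydra-core >= 1.2 (drop version_base on older releases); config_path is
# resolved relative to the calling script, here assumed to run from the repo root.
from hydra import compose, initialize
from omegaconf import OmegaConf

with initialize(config_path="configs", version_base=None):
    # CLI equivalent (assuming bin/eval.py is the entry point): python bin/eval.py audio=mfcc
    cfg = compose(config_name="eval", overrides=["audio=mfcc"])
    print(OmegaConf.to_yaml(cfg))  # prints the merged audio + eval settings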
-------------------------------------------------------------------------------- /train.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alsrb0607/KoreanSTT/HEAD/train.txt -------------------------------------------------------------------------------- /data/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alsrb0607/KoreanSTT/HEAD/data/.DS_Store -------------------------------------------------------------------------------- /docs/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alsrb0607/KoreanSTT/HEAD/docs/.DS_Store -------------------------------------------------------------------------------- /docs/objects.inv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alsrb0607/KoreanSTT/HEAD/docs/objects.inv -------------------------------------------------------------------------------- /dataset/kspon/requirements.txt: -------------------------------------------------------------------------------- 1 | tqdm 2 | gluonnlp 3 | sentencepiece 4 | hgtk 5 | pandas 6 | -------------------------------------------------------------------------------- /data/vocab/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alsrb0607/KoreanSTT/HEAD/data/vocab/.DS_Store -------------------------------------------------------------------------------- /docs/_static/file.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alsrb0607/KoreanSTT/HEAD/docs/_static/file.png -------------------------------------------------------------------------------- /docs/_static/plus.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alsrb0607/KoreanSTT/HEAD/docs/_static/plus.png -------------------------------------------------------------------------------- /docs/build/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alsrb0607/KoreanSTT/HEAD/docs/build/.DS_Store -------------------------------------------------------------------------------- /docs/_static/minus.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alsrb0607/KoreanSTT/HEAD/docs/_static/minus.png -------------------------------------------------------------------------------- /data/vocab/tokenizer.model: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alsrb0607/KoreanSTT/HEAD/data/vocab/tokenizer.model -------------------------------------------------------------------------------- /docs/build/html/objects.inv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alsrb0607/KoreanSTT/HEAD/docs/build/html/objects.inv -------------------------------------------------------------------------------- /configs/train.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - audio: fbank 3 | - model: joint-ctc-attention-las 4 | - train: las_train 5 | -------------------------------------------------------------------------------- /docs/build/doctrees/Etc.doctree: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/alsrb0607/KoreanSTT/HEAD/docs/build/doctrees/Etc.doctree -------------------------------------------------------------------------------- /docs/build/doctrees/Data.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alsrb0607/KoreanSTT/HEAD/docs/build/doctrees/Data.doctree -------------------------------------------------------------------------------- /docs/build/doctrees/Decode.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alsrb0607/KoreanSTT/HEAD/docs/build/doctrees/Decode.doctree -------------------------------------------------------------------------------- /docs/build/doctrees/Jasper.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alsrb0607/KoreanSTT/HEAD/docs/build/doctrees/Jasper.doctree -------------------------------------------------------------------------------- /docs/build/doctrees/Optim.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alsrb0607/KoreanSTT/HEAD/docs/build/doctrees/Optim.doctree -------------------------------------------------------------------------------- /docs/build/doctrees/Vocabs.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alsrb0607/KoreanSTT/HEAD/docs/build/doctrees/Vocabs.doctree -------------------------------------------------------------------------------- /docs/build/doctrees/index.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alsrb0607/KoreanSTT/HEAD/docs/build/doctrees/index.doctree -------------------------------------------------------------------------------- /docs/build/html/_static/file.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alsrb0607/KoreanSTT/HEAD/docs/build/html/_static/file.png -------------------------------------------------------------------------------- /docs/build/html/_static/minus.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alsrb0607/KoreanSTT/HEAD/docs/build/html/_static/minus.png -------------------------------------------------------------------------------- /docs/build/html/_static/plus.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alsrb0607/KoreanSTT/HEAD/docs/build/html/_static/plus.png -------------------------------------------------------------------------------- /data/vocab/kspon_sentencepiece.model: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alsrb0607/KoreanSTT/HEAD/data/vocab/kspon_sentencepiece.model -------------------------------------------------------------------------------- /docs/build/doctrees/Modules.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alsrb0607/KoreanSTT/HEAD/docs/build/doctrees/Modules.doctree -------------------------------------------------------------------------------- /docs/build/doctrees/Trainer.doctree: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/alsrb0607/KoreanSTT/HEAD/docs/build/doctrees/Trainer.doctree -------------------------------------------------------------------------------- /data/train_result/eval_result.csv: -------------------------------------------------------------------------------- 1 | loss,cer 2 | 0.42729363597111464,1.4615384615384615 3 | 0.42627342518430084,1.4615384615384615 4 | -------------------------------------------------------------------------------- /data/train_result/train_result.csv: -------------------------------------------------------------------------------- 1 | loss,cer 2 | 0.46951467186864454,1.7254191209787042 3 | 0.4683552428329273,1.7254191209787042 4 | -------------------------------------------------------------------------------- /docs/_static/css/fonts/lato-bold.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alsrb0607/KoreanSTT/HEAD/docs/_static/css/fonts/lato-bold.woff -------------------------------------------------------------------------------- /docs/_static/css/fonts/lato-bold.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alsrb0607/KoreanSTT/HEAD/docs/_static/css/fonts/lato-bold.woff2 -------------------------------------------------------------------------------- /docs/_static/css/fonts/lato-normal.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alsrb0607/KoreanSTT/HEAD/docs/_static/css/fonts/lato-normal.woff -------------------------------------------------------------------------------- /docs/build/doctrees/Checkpoint.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alsrb0607/KoreanSTT/HEAD/docs/build/doctrees/Checkpoint.doctree -------------------------------------------------------------------------------- /docs/build/doctrees/Conformer.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alsrb0607/KoreanSTT/HEAD/docs/build/doctrees/Conformer.doctree -------------------------------------------------------------------------------- /docs/build/doctrees/Criterion.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alsrb0607/KoreanSTT/HEAD/docs/build/doctrees/Criterion.doctree -------------------------------------------------------------------------------- /docs/build/doctrees/Evaluator.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alsrb0607/KoreanSTT/HEAD/docs/build/doctrees/Evaluator.doctree -------------------------------------------------------------------------------- /docs/build/doctrees/Interface.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alsrb0607/KoreanSTT/HEAD/docs/build/doctrees/Interface.doctree -------------------------------------------------------------------------------- /docs/build/doctrees/environment.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alsrb0607/KoreanSTT/HEAD/docs/build/doctrees/environment.pickle 
-------------------------------------------------------------------------------- /docs/build/doctrees/notes/intro.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alsrb0607/KoreanSTT/HEAD/docs/build/doctrees/notes/intro.doctree -------------------------------------------------------------------------------- /docs/_static/css/fonts/lato-normal.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alsrb0607/KoreanSTT/HEAD/docs/_static/css/fonts/lato-normal.woff2 -------------------------------------------------------------------------------- /docs/build/doctrees/Deep Speech 2.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alsrb0607/KoreanSTT/HEAD/docs/build/doctrees/Deep Speech 2.doctree -------------------------------------------------------------------------------- /docs/_static/css/fonts/Roboto-Slab-Bold.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alsrb0607/KoreanSTT/HEAD/docs/_static/css/fonts/Roboto-Slab-Bold.woff -------------------------------------------------------------------------------- /docs/_static/css/fonts/lato-bold-italic.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alsrb0607/KoreanSTT/HEAD/docs/_static/css/fonts/lato-bold-italic.woff -------------------------------------------------------------------------------- /docs/build/doctrees/RNN Transducer.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alsrb0607/KoreanSTT/HEAD/docs/build/doctrees/RNN Transducer.doctree -------------------------------------------------------------------------------- /docs/_static/css/fonts/Roboto-Slab-Bold.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alsrb0607/KoreanSTT/HEAD/docs/_static/css/fonts/Roboto-Slab-Bold.woff2 -------------------------------------------------------------------------------- /docs/_static/css/fonts/fontawesome-webfont.eot: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alsrb0607/KoreanSTT/HEAD/docs/_static/css/fonts/fontawesome-webfont.eot -------------------------------------------------------------------------------- /docs/_static/css/fonts/fontawesome-webfont.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alsrb0607/KoreanSTT/HEAD/docs/_static/css/fonts/fontawesome-webfont.ttf -------------------------------------------------------------------------------- /docs/_static/css/fonts/lato-bold-italic.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alsrb0607/KoreanSTT/HEAD/docs/_static/css/fonts/lato-bold-italic.woff2 -------------------------------------------------------------------------------- /docs/_static/css/fonts/lato-normal-italic.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alsrb0607/KoreanSTT/HEAD/docs/_static/css/fonts/lato-normal-italic.woff -------------------------------------------------------------------------------- /docs/build/doctrees/Speech 
Transformer.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alsrb0607/KoreanSTT/HEAD/docs/build/doctrees/Speech Transformer.doctree -------------------------------------------------------------------------------- /docs/build/doctrees/notes/Preparation.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alsrb0607/KoreanSTT/HEAD/docs/build/doctrees/notes/Preparation.doctree -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | *.psd filter=lfs diff=lfs merge=lfs -text 2 | *.pt filter=lfs diff=lfs merge=lfs -text 3 | .pt filter=lfs diff=lfs merge=lfs -text 4 | -------------------------------------------------------------------------------- /docs/_static/css/fonts/Roboto-Slab-Regular.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alsrb0607/KoreanSTT/HEAD/docs/_static/css/fonts/Roboto-Slab-Regular.woff -------------------------------------------------------------------------------- /docs/_static/css/fonts/Roboto-Slab-Regular.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alsrb0607/KoreanSTT/HEAD/docs/_static/css/fonts/Roboto-Slab-Regular.woff2 -------------------------------------------------------------------------------- /docs/_static/css/fonts/fontawesome-webfont.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alsrb0607/KoreanSTT/HEAD/docs/_static/css/fonts/fontawesome-webfont.woff -------------------------------------------------------------------------------- /docs/_static/css/fonts/fontawesome-webfont.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alsrb0607/KoreanSTT/HEAD/docs/_static/css/fonts/fontawesome-webfont.woff2 -------------------------------------------------------------------------------- /docs/_static/css/fonts/lato-normal-italic.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alsrb0607/KoreanSTT/HEAD/docs/_static/css/fonts/lato-normal-italic.woff2 -------------------------------------------------------------------------------- /docs/build/doctrees/Listen Attend Spell.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alsrb0607/KoreanSTT/HEAD/docs/build/doctrees/Listen Attend Spell.doctree -------------------------------------------------------------------------------- /docs/build/html/_static/css/fonts/lato-bold.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alsrb0607/KoreanSTT/HEAD/docs/build/html/_static/css/fonts/lato-bold.woff -------------------------------------------------------------------------------- /docs/build/html/_static/css/fonts/lato-bold.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alsrb0607/KoreanSTT/HEAD/docs/build/html/_static/css/fonts/lato-bold.woff2 -------------------------------------------------------------------------------- /docs/build/html/_static/css/fonts/lato-normal.woff: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/alsrb0607/KoreanSTT/HEAD/docs/build/html/_static/css/fonts/lato-normal.woff -------------------------------------------------------------------------------- /docs/build/html/_static/css/fonts/lato-normal.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alsrb0607/KoreanSTT/HEAD/docs/build/html/_static/css/fonts/lato-normal.woff2 -------------------------------------------------------------------------------- /docs/build/doctrees/Learning Rate Schedulers.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alsrb0607/KoreanSTT/HEAD/docs/build/doctrees/Learning Rate Schedulers.doctree -------------------------------------------------------------------------------- /docs/build/html/_static/css/fonts/Roboto-Slab-Bold.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alsrb0607/KoreanSTT/HEAD/docs/build/html/_static/css/fonts/Roboto-Slab-Bold.woff -------------------------------------------------------------------------------- /docs/build/html/_static/css/fonts/Roboto-Slab-Bold.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alsrb0607/KoreanSTT/HEAD/docs/build/html/_static/css/fonts/Roboto-Slab-Bold.woff2 -------------------------------------------------------------------------------- /docs/build/html/_static/css/fonts/lato-bold-italic.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alsrb0607/KoreanSTT/HEAD/docs/build/html/_static/css/fonts/lato-bold-italic.woff -------------------------------------------------------------------------------- /docs/build/html/_static/css/fonts/lato-bold-italic.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alsrb0607/KoreanSTT/HEAD/docs/build/html/_static/css/fonts/lato-bold-italic.woff2 -------------------------------------------------------------------------------- /docs/build/html/_static/css/fonts/Roboto-Slab-Regular.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alsrb0607/KoreanSTT/HEAD/docs/build/html/_static/css/fonts/Roboto-Slab-Regular.woff -------------------------------------------------------------------------------- /docs/build/html/_static/css/fonts/Roboto-Slab-Regular.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alsrb0607/KoreanSTT/HEAD/docs/build/html/_static/css/fonts/Roboto-Slab-Regular.woff2 -------------------------------------------------------------------------------- /docs/build/html/_static/css/fonts/fontawesome-webfont.eot: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alsrb0607/KoreanSTT/HEAD/docs/build/html/_static/css/fonts/fontawesome-webfont.eot -------------------------------------------------------------------------------- /docs/build/html/_static/css/fonts/fontawesome-webfont.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alsrb0607/KoreanSTT/HEAD/docs/build/html/_static/css/fonts/fontawesome-webfont.ttf 
-------------------------------------------------------------------------------- /docs/build/html/_static/css/fonts/fontawesome-webfont.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alsrb0607/KoreanSTT/HEAD/docs/build/html/_static/css/fonts/fontawesome-webfont.woff -------------------------------------------------------------------------------- /docs/build/html/_static/css/fonts/fontawesome-webfont.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alsrb0607/KoreanSTT/HEAD/docs/build/html/_static/css/fonts/fontawesome-webfont.woff2 -------------------------------------------------------------------------------- /docs/build/html/_static/css/fonts/lato-normal-italic.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alsrb0607/KoreanSTT/HEAD/docs/build/html/_static/css/fonts/lato-normal-italic.woff -------------------------------------------------------------------------------- /docs/build/html/_static/css/fonts/lato-normal-italic.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alsrb0607/KoreanSTT/HEAD/docs/build/html/_static/css/fonts/lato-normal-italic.woff2 -------------------------------------------------------------------------------- /docs/source/Evaluator.rst: -------------------------------------------------------------------------------- 1 | 2 | Evaluator 3 | ===================================================== 4 | 5 | Evaluator 6 | -------------------------------------------- 7 | .. automodule:: kospeech.evaluator.evaluator 8 | :members: 9 | -------------------------------------------------------------------------------- /docs/_sources/Evaluator.rst.txt: -------------------------------------------------------------------------------- 1 | 2 | Evaluator 3 | ===================================================== 4 | 5 | Evaluator 6 | -------------------------------------------- 7 | .. automodule:: kospeech.evaluator.evaluator 8 | :members: 9 | -------------------------------------------------------------------------------- /docs/source/Checkpoint.rst: -------------------------------------------------------------------------------- 1 | 2 | Checkpoint 3 | ===================================================== 4 | 5 | Checkpoint 6 | -------------------------------------------- 7 | .. automodule:: kospeech.checkpoint.checkpoint 8 | :members: 9 | -------------------------------------------------------------------------------- /docs/_sources/Checkpoint.rst.txt: -------------------------------------------------------------------------------- 1 | 2 | Checkpoint 3 | ===================================================== 4 | 5 | Checkpoint 6 | -------------------------------------------- 7 | .. automodule:: kospeech.checkpoint.checkpoint 8 | :members: 9 | -------------------------------------------------------------------------------- /docs/source/Trainer.rst: -------------------------------------------------------------------------------- 1 | 2 | Trainer 3 | ===================================================== 4 | 5 | Supervised Trainer 6 | -------------------------------------------- 7 | .. 
automodule:: kospeech.trainer.supervised_trainer 8 | :members: 9 | -------------------------------------------------------------------------------- /docs/_sources/Trainer.rst.txt: -------------------------------------------------------------------------------- 1 | 2 | Trainer 3 | ===================================================== 4 | 5 | Supervised Trainer 6 | -------------------------------------------- 7 | .. automodule:: kospeech.trainer.supervised_trainer 8 | :members: 9 | -------------------------------------------------------------------------------- /docs/build/html/_sources/Evaluator.rst.txt: -------------------------------------------------------------------------------- 1 | 2 | Evaluator 3 | ===================================================== 4 | 5 | Evaluator 6 | -------------------------------------------- 7 | .. automodule:: kospeech.evaluator.evaluator 8 | :members: 9 | -------------------------------------------------------------------------------- /docs/.buildinfo: -------------------------------------------------------------------------------- 1 | # Sphinx build info version 1 2 | # This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done. 3 | config: 6c4ce3491335804025d402c95a850221 4 | tags: 645f666f9bcd5a90fca523b33c5a78b7 5 | -------------------------------------------------------------------------------- /docs/build/html/_sources/Checkpoint.rst.txt: -------------------------------------------------------------------------------- 1 | 2 | Checkpoint 3 | ===================================================== 4 | 5 | Checkpoint 6 | -------------------------------------------- 7 | .. automodule:: kospeech.checkpoint.checkpoint 8 | :members: 9 | -------------------------------------------------------------------------------- /docs/build/html/_sources/Trainer.rst.txt: -------------------------------------------------------------------------------- 1 | 2 | Trainer 3 | ===================================================== 4 | 5 | Supervised Trainer 6 | -------------------------------------------- 7 | .. automodule:: kospeech.trainer.supervised_trainer 8 | :members: 9 | -------------------------------------------------------------------------------- /docs/build/html/.buildinfo: -------------------------------------------------------------------------------- 1 | # Sphinx build info version 1 2 | # This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done. 3 | config: 6f417c1046c612cfb4b900ffe5d11a66 4 | tags: 645f666f9bcd5a90fca523b33c5a78b7 5 | -------------------------------------------------------------------------------- /docs/source/Deep Speech 2.rst: -------------------------------------------------------------------------------- 1 | 2 | Deep Speech 2 3 | ===================================================== 4 | 5 | Deep Speech 2 6 | -------------------------------------------- 7 | 8 | .. 
automodule:: kospeech.models.deepspeech2.model 9 | :members: 10 | -------------------------------------------------------------------------------- /configs/eval/default.yaml: -------------------------------------------------------------------------------- 1 | use_cuda: True 2 | dataset: 'kspon' 3 | dataset_path: '' 4 | transcripts_path: 'data/eval_transcript.txt' 5 | model_path: '' 6 | output_unit: 'character' 7 | batch_size: 32 8 | num_workers: 4 9 | print_every: 20 10 | decode: 'greedy' 11 | k: 3 -------------------------------------------------------------------------------- /docs/_sources/Deep Speech 2.rst.txt: -------------------------------------------------------------------------------- 1 | 2 | Deep Speech 2 3 | ===================================================== 4 | 5 | Deep Speech 2 6 | -------------------------------------------- 7 | 8 | .. automodule:: kospeech.models.deepspeech2.model 9 | :members: 10 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/quesion.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Quesion 3 | about: Question about this project 4 | title: '' 5 | labels: '' 6 | assignees: sooftware 7 | 8 | --- 9 | 10 | ## Title 11 | - 12 | 13 | ## Description 14 | 15 | ## Linked Issues 16 | 17 | - resolved # 18 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | ## Title 11 | - 12 | 13 | ## Description 14 | 15 | ## Linked Issues 16 | 17 | - resolved # 18 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | ## Title 11 | - 12 | 13 | ## Description 14 | 15 | ## Linked Issues 16 | - resolved # 17 | -------------------------------------------------------------------------------- /docs/build/html/_sources/Deep Speech 2.rst.txt: -------------------------------------------------------------------------------- 1 | 2 | Deep Speech 2 3 | ===================================================== 4 | 5 | Deep Speech 2 6 | -------------------------------------------- 7 | 8 | .. 
automodule:: kospeech.models.deepspeech2.model 9 | :members: 10 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | experiment/ 2 | experiment/* 3 | data/sample 4 | data/sample/* 5 | data/data_list/debug_list.csv 6 | data/data_list/sample_list.csv 7 | *.bin 8 | *.zip 9 | *.idea 10 | docs/training 11 | docs/training/* 12 | __pycache__/ 13 | venv/* 14 | *.pyc 15 | .idea 16 | .ipynb_checkpoints -------------------------------------------------------------------------------- /configs/model/rnnt.yaml: -------------------------------------------------------------------------------- 1 | architecture: rnnt 2 | num_encoder_layers: 4 3 | num_decoder_layers: 1 4 | encoder_hidden_state_dim: 320 5 | decoder_hidden_state_dim: 512 6 | output_dim: 512 7 | rnn_type: lstm 8 | bidirectional: True 9 | encoder_dropout_p: 0.2 10 | decoder_dropout_p: 0.2 11 | -------------------------------------------------------------------------------- /configs/model/ds2.yaml: -------------------------------------------------------------------------------- 1 | architecture: deepspeech2 2 | use_bidirectional: True 3 | hidden_dim: 1024 4 | dropout: 0.3 5 | num_encoder_layers: 3 6 | rnn_type: gru 7 | max_len: 400 8 | activation: hardtanh 9 | teacher_forcing_ratio: 1.0 10 | teacher_forcing_step: 0.0 11 | min_teacher_forcing_ratio: 1.0 12 | joint_ctc_attention: false -------------------------------------------------------------------------------- /configs/model/transformer.yaml: -------------------------------------------------------------------------------- 1 | architecture: transformer 2 | d_model: 512 3 | num_heads: 8 4 | num_encoder_layers: 12 5 | num_decoder_layers: 6 6 | dropout: 0.3 7 | ffnet_style: ff 8 | teacher_forcing_ratio: 1.0 9 | teacher_forcing_step: 0.0 10 | min_teacher_forcing_ratio: 1.0 11 | joint_ctc_attention: false 12 | max_len: 400 -------------------------------------------------------------------------------- /configs/audio/mfcc.yaml: -------------------------------------------------------------------------------- 1 | audio_extension: pcm 2 | transform_method: mfcc 3 | sample_rate: 16000 4 | frame_length: 20 5 | frame_shift: 10 6 | n_mels: 40 7 | normalize: True 8 | del_silence: True 9 | feature_extract_by: kaldi 10 | freq_mask_para: 8 11 | time_mask_num: 4 12 | freq_mask_num: 2 13 | spec_augment: True 14 | input_reverse: false -------------------------------------------------------------------------------- /configs/audio/fbank.yaml: -------------------------------------------------------------------------------- 1 | audio_extension: wav 2 | transform_method: fbank 3 | sample_rate: 16000 4 | frame_length: 20 5 | frame_shift: 10 6 | n_mels: 80 7 | normalize: True 8 | del_silence: True 9 | feature_extract_by: kaldi 10 | freq_mask_para: 18 11 | time_mask_num: 4 12 | freq_mask_num: 2 13 | spec_augment: True 14 | input_reverse: false -------------------------------------------------------------------------------- /configs/audio/melspectrogram.yaml: -------------------------------------------------------------------------------- 1 | audio_extension: pcm 2 | transform_method: mel 3 | sample_rate: 16000 4 | frame_length: 20 5 | frame_shift: 10 6 | n_mels: 80 7 | normalize: True 8 | del_silence: True 9 | feature_extract_by: kaldi 10 | freq_mask_para: 18 11 | time_mask_num: 4 12 | freq_mask_num: 2 13 | spec_augment: True 14 | input_reverse: false 
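The audio configs above (mfcc.yaml, fbank.yaml, melspectrogram.yaml) all describe a Kaldi-style front end: 16 kHz input, 20 ms windows with a 10 ms shift, 40 or 80 mel bins, plus SpecAugment masking parameters. A minimal sketch of how the fbank.yaml settings could map onto a feature extractor is shown below, assuming torchaudio's Kaldi-compatible API is the backend implied by "feature_extract_by: kaldi"; the helper is illustrative, not the project's own loader, and normalization/SpecAugment from the config would be applied as separate steps afterwards.

# Illustrative only: log-mel filter-bank extraction with the fbank.yaml settings.
# Assumes torchaudio is installed; kospeech's actual data pipeline may differ.
import torchaudio
import torchaudio.compliance.kaldi as kaldi

def extract_fbank(wav_path, n_mels=80, frame_length=20.0, frame_shift=10.0):
    waveform, sample_rate = torchaudio.load(wav_path)  # waveform: (channels, time)
    return kaldi.fbank(
        waveform,
        num_mel_bins=n_mels,
        frame_length=frame_length,   # window length in milliseconds
        frame_shift=frame_shift,     # hop length in milliseconds
        sample_frequency=sample_rate,
    )                                # -> tensor of shape (num_frames, n_mels)

features = extract_fbank("example.wav")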
-------------------------------------------------------------------------------- /configs/audio/spectrogram.yaml: -------------------------------------------------------------------------------- 1 | audio_extension: pcm 2 | transform_method: mel 3 | sample_rate: 16000 4 | frame_length: 20 5 | frame_shift: 10 6 | n_mels: 161 # Not used 7 | normalize: True 8 | del_silence: True 9 | feature_extract_by: kaldi 10 | freq_mask_para: 24 11 | time_mask_num: 4 12 | freq_mask_num: 2 13 | spec_augment: True 14 | input_reverse: false -------------------------------------------------------------------------------- /docs/source/Decode.rst: -------------------------------------------------------------------------------- 1 | 2 | Decode 3 | ===================================================== 4 | 5 | Search 6 | -------------------------------------------- 7 | .. automodule:: kospeech.decode.search 8 | :members: 9 | 10 | 11 | Ensemble 12 | -------------------------------------------- 13 | .. automodule:: kospeech.decode.ensemble 14 | :members: 15 | -------------------------------------------------------------------------------- /docs/_sources/Decode.rst.txt: -------------------------------------------------------------------------------- 1 | 2 | Decode 3 | ===================================================== 4 | 5 | Search 6 | -------------------------------------------- 7 | .. automodule:: kospeech.decode.search 8 | :members: 9 | 10 | 11 | Ensemble 12 | -------------------------------------------- 13 | .. automodule:: kospeech.decode.ensemble 14 | :members: 15 | -------------------------------------------------------------------------------- /docs/build/html/_sources/Decode.rst.txt: -------------------------------------------------------------------------------- 1 | 2 | Decode 3 | ===================================================== 4 | 5 | Search 6 | -------------------------------------------- 7 | .. automodule:: kospeech.decode.search 8 | :members: 9 | 10 | 11 | Ensemble 12 | -------------------------------------------- 13 | .. automodule:: kospeech.decode.ensemble 14 | :members: 15 | -------------------------------------------------------------------------------- /docs/source/Jasper.rst: -------------------------------------------------------------------------------- 1 | 2 | Jasper 3 | ===================================================== 4 | 5 | Jasper 6 | -------------------------------------------- 7 | 8 | .. automodule:: kospeech.models.jasper.model 9 | :members: 10 | 11 | Sublayers 12 | -------------------------------------------- 13 | 14 | .. automodule:: kospeech.models.jasper.sublayers 15 | :members: 16 | -------------------------------------------------------------------------------- /docs/_sources/Jasper.rst.txt: -------------------------------------------------------------------------------- 1 | 2 | Jasper 3 | ===================================================== 4 | 5 | Jasper 6 | -------------------------------------------- 7 | 8 | .. automodule:: kospeech.models.jasper.model 9 | :members: 10 | 11 | Sublayers 12 | -------------------------------------------- 13 | 14 | .. automodule:: kospeech.models.jasper.sublayers 15 | :members: 16 | -------------------------------------------------------------------------------- /docs/build/html/_sources/Jasper.rst.txt: -------------------------------------------------------------------------------- 1 | 2 | Jasper 3 | ===================================================== 4 | 5 | Jasper 6 | -------------------------------------------- 7 | 8 | .. 
automodule:: kospeech.models.jasper.model 9 | :members: 10 | 11 | Sublayers 12 | -------------------------------------------- 13 | 14 | .. automodule:: kospeech.models.jasper.sublayers 15 | :members: 16 | -------------------------------------------------------------------------------- /configs/model/joint-ctc-attention-transformer.yaml: -------------------------------------------------------------------------------- 1 | architecture: transformer 2 | d_model: 512 3 | num_heads: 8 4 | num_encoder_layers: 12 5 | num_decoder_layers: 6 6 | dropout: 0.3 7 | ffnet_style: ff 8 | teacher_forcing_ratio: 1.0 9 | teacher_forcing_step: 0.0 10 | min_teacher_forcing_ratio: 1.0 11 | cross_entropy_weight: 0.7 12 | ctc_weight: 0.3 13 | mask_conv: True 14 | joint_ctc_attention: True 15 | max_len: 400 -------------------------------------------------------------------------------- /docs/_static/documentation_options.js: -------------------------------------------------------------------------------- 1 | var DOCUMENTATION_OPTIONS = { 2 | URL_ROOT: document.getElementById("documentation_options").getAttribute('data-url_root'), 3 | VERSION: 'latest', 4 | LANGUAGE: 'None', 5 | COLLAPSE_INDEX: false, 6 | BUILDER: 'html', 7 | FILE_SUFFIX: '.html', 8 | LINK_SUFFIX: '.html', 9 | HAS_SOURCE: true, 10 | SOURCELINK_SUFFIX: '.txt', 11 | NAVIGATION_WITH_KEYS: false 12 | }; -------------------------------------------------------------------------------- /docs/build/html/_static/documentation_options.js: -------------------------------------------------------------------------------- 1 | var DOCUMENTATION_OPTIONS = { 2 | URL_ROOT: document.getElementById("documentation_options").getAttribute('data-url_root'), 3 | VERSION: 'latest', 4 | LANGUAGE: 'None', 5 | COLLAPSE_INDEX: false, 6 | BUILDER: 'html', 7 | FILE_SUFFIX: '.html', 8 | LINK_SUFFIX: '.html', 9 | HAS_SOURCE: true, 10 | SOURCELINK_SUFFIX: '.txt', 11 | NAVIGATION_WITH_KEYS: false 12 | }; -------------------------------------------------------------------------------- /docs/source/Etc.rst: -------------------------------------------------------------------------------- 1 | 2 | Etc 3 | ===================================================== 4 | 5 | Metrics 6 | -------------------------------------------- 7 | .. automodule:: kospeech.metrics 8 | :members: 9 | 10 | ModelBuilder 11 | -------------------------------------------- 12 | .. automodule:: kospeech.model_builder 13 | :members: 14 | 15 | Utils 16 | -------------------------------------------- 17 | .. automodule:: kospeech.utils 18 | :members: -------------------------------------------------------------------------------- /configs/model/las.yaml: -------------------------------------------------------------------------------- 1 | architecture: las 2 | use_bidirectional: True 3 | hidden_dim: 512 4 | dropout: 0.3 5 | num_heads: 4 6 | label_smoothing: 0.1 7 | num_encoder_layers: 3 8 | num_decoder_layers: 2 9 | rnn_type: lstm 10 | teacher_forcing_ratio: 1.0 11 | attn_mechanism: multi-head 12 | teacher_forcing_step: 0.0 13 | min_teacher_forcing_ratio: 1.0 14 | extractor: vgg 15 | activation: hardtanh 16 | mask_conv: false 17 | joint_ctc_attention: false 18 | max_len: 400 -------------------------------------------------------------------------------- /docs/_sources/Etc.rst.txt: -------------------------------------------------------------------------------- 1 | 2 | Etc 3 | ===================================================== 4 | 5 | Metrics 6 | -------------------------------------------- 7 | .. 
automodule:: kospeech.metrics 8 | :members: 9 | 10 | ModelBuilder 11 | -------------------------------------------- 12 | .. automodule:: kospeech.model_builder 13 | :members: 14 | 15 | Utils 16 | -------------------------------------------- 17 | .. automodule:: kospeech.utils 18 | :members: -------------------------------------------------------------------------------- /docs/build/html/_sources/Etc.rst.txt: -------------------------------------------------------------------------------- 1 | 2 | Etc 3 | ===================================================== 4 | 5 | Metrics 6 | -------------------------------------------- 7 | .. automodule:: kospeech.metrics 8 | :members: 9 | 10 | ModelBuilder 11 | -------------------------------------------- 12 | .. automodule:: kospeech.model_builder 13 | :members: 14 | 15 | Utils 16 | -------------------------------------------- 17 | .. automodule:: kospeech.utils 18 | :members: -------------------------------------------------------------------------------- /docs/source/Interface.rst: -------------------------------------------------------------------------------- 1 | 2 | Interface 3 | ===================================================== 4 | 5 | Model 6 | -------------------------------------------- 7 | 8 | .. automodule:: kospeech.models.model 9 | :members: 10 | 11 | Encoder 12 | -------------------------------------------- 13 | 14 | .. automodule:: kospeech.models.encoder 15 | :members: 16 | 17 | Decoder 18 | -------------------------------------------- 19 | 20 | .. automodule:: kospeech.models.decoder 21 | :members: 22 | 23 | -------------------------------------------------------------------------------- /docs/_sources/Interface.rst.txt: -------------------------------------------------------------------------------- 1 | 2 | Interface 3 | ===================================================== 4 | 5 | Model 6 | -------------------------------------------- 7 | 8 | .. automodule:: kospeech.models.model 9 | :members: 10 | 11 | Encoder 12 | -------------------------------------------- 13 | 14 | .. automodule:: kospeech.models.encoder 15 | :members: 16 | 17 | Decoder 18 | -------------------------------------------- 19 | 20 | .. automodule:: kospeech.models.decoder 21 | :members: 22 | 23 | -------------------------------------------------------------------------------- /docs/source/Vocabs.rst: -------------------------------------------------------------------------------- 1 | 2 | Vocabs 3 | ===================================================== 4 | 5 | Vocabulary 6 | -------------------------------------------- 7 | .. automodule:: kospeech.vocabs.__init__ 8 | :members: 9 | 10 | KsponSpeechVocabulary 11 | -------------------------------------------- 12 | .. automodule:: kospeech.vocabs.ksponspeech 13 | :members: 14 | 15 | LibriSpeechVocabulary 16 | -------------------------------------------- 17 | .. automodule:: kospeech.vocabs.librispeech 18 | :members: -------------------------------------------------------------------------------- /docs/_sources/Vocabs.rst.txt: -------------------------------------------------------------------------------- 1 | 2 | Vocabs 3 | ===================================================== 4 | 5 | Vocabulary 6 | -------------------------------------------- 7 | .. automodule:: kospeech.vocabs.__init__ 8 | :members: 9 | 10 | KsponSpeechVocabulary 11 | -------------------------------------------- 12 | .. 
automodule:: kospeech.vocabs.ksponspeech 13 | :members: 14 | 15 | LibriSpeechVocabulary 16 | -------------------------------------------- 17 | .. automodule:: kospeech.vocabs.librispeech 18 | :members: -------------------------------------------------------------------------------- /docs/build/html/_sources/Interface.rst.txt: -------------------------------------------------------------------------------- 1 | 2 | Interface 3 | ===================================================== 4 | 5 | Model 6 | -------------------------------------------- 7 | 8 | .. automodule:: kospeech.models.model 9 | :members: 10 | 11 | Encoder 12 | -------------------------------------------- 13 | 14 | .. automodule:: kospeech.models.encoder 15 | :members: 16 | 17 | Decoder 18 | -------------------------------------------- 19 | 20 | .. automodule:: kospeech.models.decoder 21 | :members: 22 | 23 | -------------------------------------------------------------------------------- /docs/source/Modules.rst: -------------------------------------------------------------------------------- 1 | 2 | Modules 3 | ===================================================== 4 | 5 | Activation 6 | -------------------------------------------- 7 | 8 | .. automodule:: kospeech.models.activation 9 | :members: 10 | 11 | Attention 12 | -------------------------------------------- 13 | 14 | .. automodule:: kospeech.models.attention 15 | :members: 16 | 17 | Convolution 18 | -------------------------------------------- 19 | 20 | .. automodule:: kospeech.models.convolution 21 | :members: 22 | 23 | -------------------------------------------------------------------------------- /configs/model/joint-ctc-attention-las.yaml: -------------------------------------------------------------------------------- 1 | architecture: las 2 | use_bidirectional: True 3 | hidden_dim: 768 4 | dropout: 0.3 5 | num_heads: 4 6 | label_smoothing: 0.1 7 | num_encoder_layers: 3 8 | num_decoder_layers: 2 9 | rnn_type: lstm 10 | teacher_forcing_ratio: 1.0 11 | attn_mechanism: multi-head 12 | teacher_forcing_step: 0.01 13 | min_teacher_forcing_ratio: 0.9 14 | extractor: vgg 15 | activation: hardtanh 16 | cross_entropy_weight: 0.7 17 | ctc_weight: 0.3 18 | mask_conv: True 19 | joint_ctc_attention: True 20 | max_len: 400 -------------------------------------------------------------------------------- /docs/_sources/Modules.rst.txt: -------------------------------------------------------------------------------- 1 | 2 | Modules 3 | ===================================================== 4 | 5 | Activation 6 | -------------------------------------------- 7 | 8 | .. automodule:: kospeech.models.activation 9 | :members: 10 | 11 | Attention 12 | -------------------------------------------- 13 | 14 | .. automodule:: kospeech.models.attention 15 | :members: 16 | 17 | Convolution 18 | -------------------------------------------- 19 | 20 | .. automodule:: kospeech.models.convolution 21 | :members: 22 | 23 | -------------------------------------------------------------------------------- /docs/build/html/_sources/Vocabs.rst.txt: -------------------------------------------------------------------------------- 1 | 2 | Vocabs 3 | ===================================================== 4 | 5 | Vocabulary 6 | -------------------------------------------- 7 | .. automodule:: kospeech.vocabs.__init__ 8 | :members: 9 | 10 | KsponSpeechVocabulary 11 | -------------------------------------------- 12 | .. 
automodule:: kospeech.vocabs.ksponspeech 13 | :members: 14 | 15 | LibriSpeechVocabulary 16 | -------------------------------------------- 17 | .. automodule:: kospeech.vocabs.librispeech 18 | :members: -------------------------------------------------------------------------------- /docs/source/RNN Transducer.rst: -------------------------------------------------------------------------------- 1 | 2 | RNN Transducer 3 | ===================================================== 4 | 5 | RNN Transducer 6 | -------------------------------------------- 7 | 8 | .. automodule:: kospeech.models.rnnt.model 9 | :members: 10 | 11 | Encoder 12 | -------------------------------------------- 13 | 14 | .. automodule:: kospeech.models.rnnt.encoder 15 | :members: 16 | 17 | Decoder 18 | -------------------------------------------- 19 | 20 | .. automodule:: kospeech.models.rnnt.decoder 21 | :members: 22 | -------------------------------------------------------------------------------- /docs/_sources/RNN Transducer.rst.txt: -------------------------------------------------------------------------------- 1 | 2 | RNN Transducer 3 | ===================================================== 4 | 5 | RNN Transducer 6 | -------------------------------------------- 7 | 8 | .. automodule:: kospeech.models.rnnt.model 9 | :members: 10 | 11 | Encoder 12 | -------------------------------------------- 13 | 14 | .. automodule:: kospeech.models.rnnt.encoder 15 | :members: 16 | 17 | Decoder 18 | -------------------------------------------- 19 | 20 | .. automodule:: kospeech.models.rnnt.decoder 21 | :members: 22 | -------------------------------------------------------------------------------- /docs/build/html/_sources/Modules.rst.txt: -------------------------------------------------------------------------------- 1 | 2 | Modules 3 | ===================================================== 4 | 5 | Activation 6 | -------------------------------------------- 7 | 8 | .. automodule:: kospeech.models.activation 9 | :members: 10 | 11 | Attention 12 | -------------------------------------------- 13 | 14 | .. automodule:: kospeech.models.attention 15 | :members: 16 | 17 | Convolution 18 | -------------------------------------------- 19 | 20 | .. automodule:: kospeech.models.convolution 21 | :members: 22 | 23 | -------------------------------------------------------------------------------- /docs/build/html/_sources/RNN Transducer.rst.txt: -------------------------------------------------------------------------------- 1 | 2 | RNN Transducer 3 | ===================================================== 4 | 5 | RNN Transducer 6 | -------------------------------------------- 7 | 8 | .. automodule:: kospeech.models.rnnt.model 9 | :members: 10 | 11 | Encoder 12 | -------------------------------------------- 13 | 14 | .. automodule:: kospeech.models.rnnt.encoder 15 | :members: 16 | 17 | Decoder 18 | -------------------------------------------- 19 | 20 | .. automodule:: kospeech.models.rnnt.decoder 21 | :members: 22 | -------------------------------------------------------------------------------- /docs/source/Listen Attend Spell.rst: -------------------------------------------------------------------------------- 1 | 2 | Listen, Attend and Spell 3 | ===================================================== 4 | 5 | ListenAttendSpell 6 | -------------------------------------------- 7 | 8 | .. automodule:: kospeech.models.las.model 9 | :members: 10 | 11 | Encoder 12 | -------------------------------------------- 13 | 14 | .. 
automodule:: kospeech.models.las.encoder 15 | :members: 16 | 17 | Decoder 18 | -------------------------------------------- 19 | 20 | .. automodule:: kospeech.models.las.decoder 21 | :members: 22 | 23 | -------------------------------------------------------------------------------- /configs/model/conformer-medium.yaml: -------------------------------------------------------------------------------- 1 | architecture: conformer 2 | teacher_forcing_step: 0.0 3 | min_teacher_forcing_ratio: 1.0 4 | joint_ctc_attention: false 5 | feed_forward_expansion_factor: 4 6 | conv_expansion_factor: 2 7 | input_dropout_p: 0.1 8 | feed_forward_dropout_p: 0.1 9 | attention_dropout_p: 0.1 10 | conv_dropout_p: 0.1 11 | decoder_dropout_p: 0.1 12 | conv_kernel_size: 31 13 | half_step_residual: True 14 | encoder_dim: 256 15 | decoder_dim: 640 16 | num_encoder_layers: 17 17 | num_decoder_layers: 1 18 | num_attention_heads: 4 19 | decoder: None 20 | -------------------------------------------------------------------------------- /configs/model/conformer-small.yaml: -------------------------------------------------------------------------------- 1 | architecture: conformer 2 | teacher_forcing_step: 0.0 3 | min_teacher_forcing_ratio: 1.0 4 | joint_ctc_attention: false 5 | feed_forward_expansion_factor: 4 6 | conv_expansion_factor: 2 7 | input_dropout_p: 0.1 8 | feed_forward_dropout_p: 0.1 9 | attention_dropout_p: 0.1 10 | conv_dropout_p: 0.1 11 | decoder_dropout_p: 0.1 12 | conv_kernel_size: 31 13 | half_step_residual: True 14 | encoder_dim: 144 15 | decoder_dim: 320 16 | num_encoder_layers: 16 17 | num_decoder_layers: 1 18 | num_attention_heads: 4 19 | decoder: None 20 | -------------------------------------------------------------------------------- /docs/_sources/Listen Attend Spell.rst.txt: -------------------------------------------------------------------------------- 1 | 2 | Listen, Attend and Spell 3 | ===================================================== 4 | 5 | ListenAttendSpell 6 | -------------------------------------------- 7 | 8 | .. automodule:: kospeech.models.las.model 9 | :members: 10 | 11 | Encoder 12 | -------------------------------------------- 13 | 14 | .. automodule:: kospeech.models.las.encoder 15 | :members: 16 | 17 | Decoder 18 | -------------------------------------------- 19 | 20 | .. automodule:: kospeech.models.las.decoder 21 | :members: 22 | 23 | -------------------------------------------------------------------------------- /docs/build/html/_sources/Listen Attend Spell.rst.txt: -------------------------------------------------------------------------------- 1 | 2 | Listen, Attend and Spell 3 | ===================================================== 4 | 5 | ListenAttendSpell 6 | -------------------------------------------- 7 | 8 | .. automodule:: kospeech.models.las.model 9 | :members: 10 | 11 | Encoder 12 | -------------------------------------------- 13 | 14 | .. automodule:: kospeech.models.las.encoder 15 | :members: 16 | 17 | Decoder 18 | -------------------------------------------- 19 | 20 | .. automodule:: kospeech.models.las.decoder 21 | :members: 22 | 23 | -------------------------------------------------------------------------------- /docs/source/Criterion.rst: -------------------------------------------------------------------------------- 1 | 2 | Criterion 3 | ===================================================== 4 | 5 | Label Smoothed Cross Entropy Loss 6 | -------------------------------------------- 7 | .. 
automodule:: kospeech.criterion.label_smoothed_cross_entropy 8 | :members: 9 | 10 | Joint CTC-CrossEntropy Loss 11 | -------------------------------------------- 12 | .. automodule:: kospeech.criterion.joint_ctc_cross_entropy 13 | :members: 14 | 15 | Transducer Loss 16 | -------------------------------------------- 17 | .. automodule:: kospeech.criterion.transducer 18 | :members: 19 | -------------------------------------------------------------------------------- /configs/model/conformer-large.yaml: -------------------------------------------------------------------------------- 1 | architecture: conformer 2 | teacher_forcing_step: 0.0 3 | min_teacher_forcing_ratio: 1.0 4 | joint_ctc_attention: false 5 | feed_forward_expansion_factor: 4 6 | conv_expansion_factor: 2 7 | input_dropout_p: 0.1 8 | feed_forward_dropout_p: 0.1 9 | attention_dropout_p: 0.1 10 | conv_dropout_p: 0.1 11 | decoder_dropout_p: 0.1 12 | conv_kernel_size: 31 13 | half_step_residual: True 14 | encoder_dim: 512 15 | decoder_dim: 640 16 | num_encoder_layers: 17 17 | num_decoder_layers: 1 18 | num_attention_heads: 8 19 | decoder_rnn_type: lstm 20 | decoder: None 21 | -------------------------------------------------------------------------------- /docs/_sources/Criterion.rst.txt: -------------------------------------------------------------------------------- 1 | 2 | Criterion 3 | ===================================================== 4 | 5 | Label Smoothed Cross Entropy Loss 6 | -------------------------------------------- 7 | .. automodule:: kospeech.criterion.label_smoothed_cross_entropy 8 | :members: 9 | 10 | Joint CTC-CrossEntropy Loss 11 | -------------------------------------------- 12 | .. automodule:: kospeech.criterion.joint_ctc_cross_entropy 13 | :members: 14 | 15 | Transducer Loss 16 | -------------------------------------------- 17 | .. automodule:: kospeech.criterion.transducer 18 | :members: 19 | -------------------------------------------------------------------------------- /docs/source/Optim.rst: -------------------------------------------------------------------------------- 1 | 2 | Optim 3 | ===================================================== 4 | 5 | Optimizer 6 | -------------------------------------------- 7 | .. automodule:: kospeech.optim.__init__ 8 | :members: 9 | 10 | RAdam 11 | -------------------------------------------- 12 | .. automodule:: kospeech.optim.radam 13 | :members: 14 | 15 | AdamP 16 | -------------------------------------------- 17 | .. automodule:: kospeech.optim.adamp 18 | :members: 19 | 20 | Novograd 21 | -------------------------------------------- 22 | .. automodule:: kospeech.optim.novograd 23 | :members: 24 | -------------------------------------------------------------------------------- /docs/build/html/_sources/Criterion.rst.txt: -------------------------------------------------------------------------------- 1 | 2 | Criterion 3 | ===================================================== 4 | 5 | Label Smoothed Cross Entropy Loss 6 | -------------------------------------------- 7 | .. automodule:: kospeech.criterion.label_smoothed_cross_entropy 8 | :members: 9 | 10 | Joint CTC-CrossEntropy Loss 11 | -------------------------------------------- 12 | .. automodule:: kospeech.criterion.joint_ctc_cross_entropy 13 | :members: 14 | 15 | Transducer Loss 16 | -------------------------------------------- 17 | .. 
automodule:: kospeech.criterion.transducer 18 | :members: 19 | -------------------------------------------------------------------------------- /docs/_sources/Optim.rst.txt: -------------------------------------------------------------------------------- 1 | 2 | Optim 3 | ===================================================== 4 | 5 | Optimizer 6 | -------------------------------------------- 7 | .. automodule:: kospeech.optim.__init__ 8 | :members: 9 | 10 | RAdam 11 | -------------------------------------------- 12 | .. automodule:: kospeech.optim.radam 13 | :members: 14 | 15 | AdamP 16 | -------------------------------------------- 17 | .. automodule:: kospeech.optim.adamp 18 | :members: 19 | 20 | Novograd 21 | -------------------------------------------- 22 | .. automodule:: kospeech.optim.novograd 23 | :members: 24 | -------------------------------------------------------------------------------- /docs/build/html/_sources/Optim.rst.txt: -------------------------------------------------------------------------------- 1 | 2 | Optim 3 | ===================================================== 4 | 5 | Optimizer 6 | -------------------------------------------- 7 | .. automodule:: kospeech.optim.__init__ 8 | :members: 9 | 10 | RAdam 11 | -------------------------------------------- 12 | .. automodule:: kospeech.optim.radam 13 | :members: 14 | 15 | AdamP 16 | -------------------------------------------- 17 | .. automodule:: kospeech.optim.adamp 18 | :members: 19 | 20 | Novograd 21 | -------------------------------------------- 22 | .. automodule:: kospeech.optim.novograd 23 | :members: 24 | -------------------------------------------------------------------------------- /docs/source/Learning Rate Schedulers.rst: -------------------------------------------------------------------------------- 1 | 2 | Learning Rate Scheduler 3 | ===================================================== 4 | 5 | Learning Rate Scheduler 6 | -------------------------------------------- 7 | .. automodule:: kospeech.optim.lr_scheduler.lr_scheduler 8 | :members: 9 | 10 | Tri Stage LR Scheduler 11 | -------------------------------------------- 12 | .. automodule:: kospeech.optim.lr_scheduler.tri_stage_lr_scheduler 13 | :members: 14 | 15 | Transformer LR Scheduler 16 | -------------------------------------------- 17 | .. 
automodule:: kospeech.optim.lr_scheduler.transformer_lr_scheduler 18 | :members: 19 | -------------------------------------------------------------------------------- /configs/train/ds2_train.yaml: -------------------------------------------------------------------------------- 1 | # data 2 | dataset: 'kspon' 3 | dataset_path: '' 4 | transcripts_path: '../../../data/transcripts.txt' 5 | output_unit: 'character' 6 | 7 | # trainer 8 | num_epochs: 20 9 | batch_size: 32 10 | save_result_every: 100 11 | checkpoint_every: 100 12 | print_every: 10 13 | mode: 'train' 14 | seed: 777 15 | resume: false 16 | 17 | # device 18 | num_workers: 4 19 | use_cuda: True 20 | 21 | # optim 22 | optimizer: 'adam' 23 | init_lr: 1e-06 24 | final_lr: 1e-06 25 | peak_lr: 1e-04 26 | init_lr_scale: 0.01 27 | final_lr_scale: 0.05 28 | max_grad_norm: 400 29 | warmup_steps: 400 30 | weight_decay: 1e-05 31 | reduction: 'mean' -------------------------------------------------------------------------------- /docs/_sources/Learning Rate Schedulers.rst.txt: -------------------------------------------------------------------------------- 1 | 2 | Learning Rate Scheduler 3 | ===================================================== 4 | 5 | Learning Rate Scheduler 6 | -------------------------------------------- 7 | .. automodule:: kospeech.optim.lr_scheduler.lr_scheduler 8 | :members: 9 | 10 | Tri Stage LR Scheduler 11 | -------------------------------------------- 12 | .. automodule:: kospeech.optim.lr_scheduler.tri_stage_lr_scheduler 13 | :members: 14 | 15 | Transformer LR Scheduler 16 | -------------------------------------------- 17 | .. automodule:: kospeech.optim.lr_scheduler.transformer_lr_scheduler 18 | :members: 19 | -------------------------------------------------------------------------------- /configs/train/las_train.yaml: -------------------------------------------------------------------------------- 1 | # data 2 | dataset: 'kspon' 3 | dataset_path: '' 4 | transcripts_path: '../../../data/transcripts.txt' 5 | output_unit: 'character' 6 | 7 | # trainer 8 | num_epochs: 20 9 | batch_size: 32 10 | save_result_every: 1000 11 | checkpoint_every: 5000 12 | print_every: 10 13 | mode: 'train' 14 | seed: 777 15 | resume: false 16 | 17 | # device 18 | num_workers: 4 19 | use_cuda: True 20 | 21 | # optim 22 | optimizer: 'adam' 23 | init_lr: 1e-06 24 | final_lr: 1e-06 25 | peak_lr: 1e-04 26 | init_lr_scale: 0.01 27 | final_lr_scale: 0.05 28 | max_grad_norm: 400 29 | warmup_steps: 400 30 | weight_decay: 1e-05 31 | reduction: 'mean' -------------------------------------------------------------------------------- /docs/build/html/_sources/Learning Rate Schedulers.rst.txt: -------------------------------------------------------------------------------- 1 | 2 | Learning Rate Scheduler 3 | ===================================================== 4 | 5 | Learning Rate Scheduler 6 | -------------------------------------------- 7 | .. automodule:: kospeech.optim.lr_scheduler.lr_scheduler 8 | :members: 9 | 10 | Tri Stage LR Scheduler 11 | -------------------------------------------- 12 | .. automodule:: kospeech.optim.lr_scheduler.tri_stage_lr_scheduler 13 | :members: 14 | 15 | Transformer LR Scheduler 16 | -------------------------------------------- 17 | .. 
automodule:: kospeech.optim.lr_scheduler.transformer_lr_scheduler 18 | :members: 19 | -------------------------------------------------------------------------------- /configs/train/transformer_train.yaml: -------------------------------------------------------------------------------- 1 | # data 2 | dataset: 'kspon' 3 | dataset_path: '' 4 | transcripts_path: '../../../data/transcripts.txt' 5 | output_unit: 'character' 6 | 7 | # trainer 8 | num_epochs: 40 9 | batch_size: 32 10 | save_result_every: 1000 11 | checkpoint_every: 5000 12 | print_every: 10 13 | mode: 'train' 14 | seed: 777 15 | resume: false 16 | 17 | # device 18 | num_workers: 4 19 | use_cuda: True 20 | 21 | # optim 22 | optimizer: 'adam' 23 | init_lr: 1e-06 24 | final_lr: 1e-07 25 | peak_lr: 1e-04 26 | init_lr_scale: 0.01 27 | final_lr_scale: 0.05 28 | max_grad_norm: 400 29 | warmup_steps: 4000 30 | weight_decay: 1e-05 31 | reduction: 'mean' 32 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | SOURCEDIR = source 8 | BUILDDIR = build 9 | 10 | # Put it first so that "make" without argument is like "make help". 11 | help: 12 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 13 | 14 | .PHONY: help Makefile 15 | 16 | # Catch-all target: route all unknown targets to Sphinx using the new 17 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 18 | %: Makefile 19 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) -------------------------------------------------------------------------------- /docs/source/Conformer.rst: -------------------------------------------------------------------------------- 1 | 2 | Conformer 3 | ===================================================== 4 | 5 | Conformer 6 | -------------------------------------------- 7 | 8 | .. automodule:: kospeech.models.conformer.model 9 | :members: 10 | 11 | Encoder 12 | -------------------------------------------- 13 | 14 | .. automodule:: kospeech.models.conformer.encoder 15 | :members: 16 | 17 | Decoder 18 | -------------------------------------------- 19 | 20 | .. automodule:: kospeech.models.rnnt.decoder 21 | :members: 22 | 23 | Modules 24 | -------------------------------------------- 25 | 26 | .. automodule:: kospeech.models.conformer.modules 27 | :members: -------------------------------------------------------------------------------- /docs/_sources/Conformer.rst.txt: -------------------------------------------------------------------------------- 1 | 2 | Conformer 3 | ===================================================== 4 | 5 | Conformer 6 | -------------------------------------------- 7 | 8 | .. automodule:: kospeech.models.conformer.model 9 | :members: 10 | 11 | Encoder 12 | -------------------------------------------- 13 | 14 | .. automodule:: kospeech.models.conformer.encoder 15 | :members: 16 | 17 | Decoder 18 | -------------------------------------------- 19 | 20 | .. automodule:: kospeech.models.rnnt.decoder 21 | :members: 22 | 23 | Modules 24 | -------------------------------------------- 25 | 26 | .. 
automodule:: kospeech.models.conformer.modules 27 | :members: -------------------------------------------------------------------------------- /docs/build/html/_sources/Conformer.rst.txt: -------------------------------------------------------------------------------- 1 | 2 | Conformer 3 | ===================================================== 4 | 5 | Conformer 6 | -------------------------------------------- 7 | 8 | .. automodule:: kospeech.models.conformer.model 9 | :members: 10 | 11 | Encoder 12 | -------------------------------------------- 13 | 14 | .. automodule:: kospeech.models.conformer.encoder 15 | :members: 16 | 17 | Decoder 18 | -------------------------------------------- 19 | 20 | .. automodule:: kospeech.models.rnnt.decoder 21 | :members: 22 | 23 | Modules 24 | -------------------------------------------- 25 | 26 | .. automodule:: kospeech.models.conformer.modules 27 | :members: -------------------------------------------------------------------------------- /test/test_rnnt_recognize.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | from kospeech.models.rnnt.model import RNNTransducer 5 | 6 | batch_size, sequence_length, dim = 3, 12345, 80 7 | 8 | cuda = torch.cuda.is_available() 9 | device = torch.device('cuda' if cuda else 'cpu') 10 | 11 | inputs = torch.rand(batch_size, sequence_length, dim).to(device) 12 | input_lengths = torch.IntTensor([12345, 12300, 12000]) 13 | 14 | model = nn.DataParallel(RNNTransducer( 15 | num_classes=10, 16 | input_dim=dim, 17 | num_encoder_layers=3, 18 | )).to(device) 19 | 20 | outputs = model.module.recognize(inputs, input_lengths) 21 | print(outputs) 22 | print(outputs.size()) 23 | print("PASS") 24 | -------------------------------------------------------------------------------- /test/test_conformer_recognize.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | from kospeech.models.conformer import Conformer 5 | 6 | batch_size, sequence_length, dim = 3, 12345, 80 7 | 8 | cuda = torch.cuda.is_available() 9 | device = torch.device('cuda' if cuda else 'cpu') 10 | 11 | inputs = torch.rand(batch_size, sequence_length, dim).to(device) 12 | input_lengths = torch.IntTensor([12345, 12300, 12000]) 13 | 14 | model = nn.DataParallel(Conformer( 15 | num_classes=10, 16 | input_dim=dim, 17 | encoder_dim=512, 18 | num_encoder_layers=3, 19 | device=device, 20 | )).to(device) 21 | 22 | outputs = model.module.recognize(inputs, input_lengths) 23 | print(outputs) 24 | print(outputs.size()) 25 | print("PASS") 26 | -------------------------------------------------------------------------------- /bin/kospeech/checkpoint/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, Soohwan Kim. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
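# Editor's illustrative sketch; not part of the original KoSpeech sources. The
# Checkpoint class re-exported below encapsulates saving and restoring training
# state. Its exact constructor and methods are not shown in this dump, so the
# equivalent plain-PyTorch pattern is sketched here instead; the helper names
# are the editor's own.
def _save_checkpoint_sketch(path, model, optimizer, epoch):
    """Persist everything needed to resume training (hypothetical helper)."""
    import torch
    torch.save(
        {"model": model.state_dict(), "optimizer": optimizer.state_dict(), "epoch": epoch},
        path,
    )


def _load_checkpoint_sketch(path, model, optimizer):
    """Restore a previously saved state and return the epoch to resume from."""
    import torch
    state = torch.load(path, map_location="cpu")
    model.load_state_dict(state["model"])
    optimizer.load_state_dict(state["optimizer"])
    return state["epoch"]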
14 | 15 | from kospeech.checkpoint.checkpoint import Checkpoint 16 | -------------------------------------------------------------------------------- /dataset/kspon/preprocess.sh: -------------------------------------------------------------------------------- 1 | # Author 2 | # Soohwan Kim, Seyoung Bae, Cheolhwang Won, Soyoung Cho, Jeongwon Kwak 3 | 4 | DATASET_PATH="SET_YOUR_DATASET_PATH" 5 | VOCAB_DEST='SET_LABELS_DESTINATION' 6 | OUTPUT_UNIT='character' # you can set character / subword / grapheme 7 | PREPROCESS_MODE='phonetic' # phonetic : 칠 십 퍼센트, spelling : 70% 8 | VOCAB_SIZE=5000 # if you use subword output unit, set vocab size 9 | 10 | echo "Pre-process KsponSpeech Dataset.." 11 | 12 | python main.py \ 13 | --dataset_path $DATASET_PATH \ 14 | --vocab_dest $VOCAB_DEST \ 15 | --output_unit $OUTPUT_UNIT \ 16 | --preprocess_mode $PREPROCESS_MODE \ 17 | --vocab_size $VOCAB_SIZE \ 18 | -------------------------------------------------------------------------------- /bin/kospeech/optim/lr_scheduler/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, Soohwan Kim. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from kospeech.optim.lr_scheduler.tri_stage_lr_scheduler import TriStageLRScheduler 16 | from kospeech.optim.lr_scheduler.transformer_lr_scheduler import TransformerLRScheduler -------------------------------------------------------------------------------- /docs/source/Data.rst: -------------------------------------------------------------------------------- 1 | 2 | Data 3 | ===================================================== 4 | 5 | Augment 6 | -------------------------------------------- 7 | .. automodule:: kospeech.data.audio.augment 8 | :members: 9 | 10 | Core 11 | -------------------------------------------- 12 | .. automodule:: kospeech.data.audio.core 13 | :members: 14 | 15 | Feature 16 | -------------------------------------------- 17 | .. automodule:: kospeech.data.audio.feature 18 | :members: 19 | 20 | Parser 21 | ------------------------------------------- 22 | .. automodule:: kospeech.data.audio.parser 23 | :members: 24 | 25 | DataLoader 26 | ------------------------------------------- 27 | .. automodule:: kospeech.data.data_loader 28 | :members: 29 | 30 | LabelLoader 31 | ------------------------------------------- 32 | .. automodule:: kospeech.data.label_loader 33 | :members: 34 | -------------------------------------------------------------------------------- /docs/_sources/Data.rst.txt: -------------------------------------------------------------------------------- 1 | 2 | Data 3 | ===================================================== 4 | 5 | Augment 6 | -------------------------------------------- 7 | .. automodule:: kospeech.data.audio.augment 8 | :members: 9 | 10 | Core 11 | -------------------------------------------- 12 | .. 
automodule:: kospeech.data.audio.core 13 | :members: 14 | 15 | Feature 16 | -------------------------------------------- 17 | .. automodule:: kospeech.data.audio.feature 18 | :members: 19 | 20 | Parser 21 | ------------------------------------------- 22 | .. automodule:: kospeech.data.audio.parser 23 | :members: 24 | 25 | DataLoader 26 | ------------------------------------------- 27 | .. automodule:: kospeech.data.data_loader 28 | :members: 29 | 30 | LabelLoader 31 | ------------------------------------------- 32 | .. automodule:: kospeech.data.label_loader 33 | :members: 34 | -------------------------------------------------------------------------------- /docs/build/html/_sources/Data.rst.txt: -------------------------------------------------------------------------------- 1 | 2 | Data 3 | ===================================================== 4 | 5 | Augment 6 | -------------------------------------------- 7 | .. automodule:: kospeech.data.audio.augment 8 | :members: 9 | 10 | Core 11 | -------------------------------------------- 12 | .. automodule:: kospeech.data.audio.core 13 | :members: 14 | 15 | Feature 16 | -------------------------------------------- 17 | .. automodule:: kospeech.data.audio.feature 18 | :members: 19 | 20 | Parser 21 | ------------------------------------------- 22 | .. automodule:: kospeech.data.audio.parser 23 | :members: 24 | 25 | DataLoader 26 | ------------------------------------------- 27 | .. automodule:: kospeech.data.data_loader 28 | :members: 29 | 30 | LabelLoader 31 | ------------------------------------------- 32 | .. automodule:: kospeech.data.label_loader 33 | :members: 34 | -------------------------------------------------------------------------------- /bin/kospeech/models/jasper/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021, Soohwan Kim. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
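# Editor's note (illustrative, not part of the original file): dataclass configs
# such as the JasperConfig defined just below are typically materialized through
# OmegaConf/Hydra, both of which appear in this project's dependencies. A hedged
# usage sketch, assuming the dataclass is importable from this package:
#
#     from omegaconf import OmegaConf
#     from kospeech.models.jasper import JasperConfig
#
#     cfg = OmegaConf.structured(JasperConfig)                           # schema-checked config
#     cfg = OmegaConf.merge(cfg, OmegaConf.from_dotlist(["version=5x3"]))
#     print(cfg.architecture, cfg.version)                               # -> jasper 5x3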
14 | 15 | 16 | from dataclasses import dataclass 17 | from kospeech.models import ModelConfig 18 | 19 | 20 | @dataclass 21 | class JasperConfig(ModelConfig): 22 | architecture: str = "jasper" 23 | version: str = "10x5" 24 | -------------------------------------------------------------------------------- /requirements_cssiri.txt: -------------------------------------------------------------------------------- 1 | antlr4-python3-runtime==4.8 2 | appdirs==1.4.4 3 | astropy==5.0 4 | audioread==2.1.9 5 | certifi==2021.10.8 6 | cffi==1.15.0 7 | charset-normalizer==2.0.9 8 | colorama==0.4.4 9 | decorator==5.1.0 10 | et-xmlfile==1.1.0 11 | idna==3.3 12 | joblib==1.1.0 13 | Levenshtein==0.16.0 14 | librosa==0.8.1 15 | llvmlite==0.37.0 16 | numba==0.54.1 17 | numpy==1.20.3 18 | omegaconf==2.1.1 19 | openpyxl==3.0.9 20 | packaging==21.3 21 | pandas==1.3.4 22 | pooch==1.5.2 23 | pycparser==2.21 24 | pyerfa==2.0.0.1 25 | pyparsing==3.0.6 26 | python-dateutil==2.8.2 27 | pytz==2021.3 28 | PyYAML==6.0 29 | rapidfuzz==1.8.3 30 | requests==2.26.0 31 | resampy==0.2.2 32 | scikit-learn==1.0.1 33 | scipy==1.7.3 34 | six==1.16.0 35 | SoundFile==0.10.3.post1 36 | threadpoolctl==3.0.0 37 | torch==1.10.0 38 | torchaudio==0.10.0 39 | tqdm==4.62.3 40 | typing_extensions==4.0.1 41 | urllib3==1.26.7 42 | wincertstore==0.2 43 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /bin/kospeech/criterion/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, Soohwan Kim. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
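# Editor's illustrative sketch, not part of the original file: the
# LabelSmoothedCrossEntropyLoss imported below implements label smoothing; this
# helper reproduces the idea with plain PyTorch so the computation is easy to
# see. The function name and exact formulation are the editor's, not KoSpeech's.
def _label_smoothed_nll_sketch(logits, targets, smoothing: float = 0.1):
    """logits: (N, C) unnormalized scores, targets: (N,) class indices."""
    import torch.nn.functional as F
    log_probs = F.log_softmax(logits, dim=-1)
    nll = -log_probs.gather(dim=-1, index=targets.unsqueeze(-1)).squeeze(-1)  # true-class NLL
    smooth = -log_probs.mean(dim=-1)                 # spreads a little mass over all classes
    return ((1.0 - smoothing) * nll + smoothing * smooth).mean()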
14 | 15 | from kospeech.criterion.label_smoothed_cross_entropy import LabelSmoothedCrossEntropyLoss 16 | from kospeech.criterion.joint_ctc_cross_entropy import JointCTCCrossEntropyLoss 17 | from kospeech.criterion.transducer import TransducerLoss 18 | -------------------------------------------------------------------------------- /docs/_static/js/badge_only.js: -------------------------------------------------------------------------------- 1 | !function(e){var t={};function r(n){if(t[n])return t[n].exports;var o=t[n]={i:n,l:!1,exports:{}};return e[n].call(o.exports,o,o.exports,r),o.l=!0,o.exports}r.m=e,r.c=t,r.d=function(e,t,n){r.o(e,t)||Object.defineProperty(e,t,{enumerable:!0,get:n})},r.r=function(e){"undefined"!=typeof Symbol&&Symbol.toStringTag&&Object.defineProperty(e,Symbol.toStringTag,{value:"Module"}),Object.defineProperty(e,"__esModule",{value:!0})},r.t=function(e,t){if(1&t&&(e=r(e)),8&t)return e;if(4&t&&"object"==typeof e&&e&&e.__esModule)return e;var n=Object.create(null);if(r.r(n),Object.defineProperty(n,"default",{enumerable:!0,value:e}),2&t&&"string"!=typeof e)for(var o in e)r.d(n,o,function(t){return e[t]}.bind(null,o));return n},r.n=function(e){var t=e&&e.__esModule?function(){return e.default}:function(){return e};return r.d(t,"a",t),t},r.o=function(e,t){return Object.prototype.hasOwnProperty.call(e,t)},r.p="",r(r.s=4)}({4:function(e,t,r){}}); -------------------------------------------------------------------------------- /docs/build/html/_static/js/badge_only.js: -------------------------------------------------------------------------------- 1 | !function(e){var t={};function r(n){if(t[n])return t[n].exports;var o=t[n]={i:n,l:!1,exports:{}};return e[n].call(o.exports,o,o.exports,r),o.l=!0,o.exports}r.m=e,r.c=t,r.d=function(e,t,n){r.o(e,t)||Object.defineProperty(e,t,{enumerable:!0,get:n})},r.r=function(e){"undefined"!=typeof Symbol&&Symbol.toStringTag&&Object.defineProperty(e,Symbol.toStringTag,{value:"Module"}),Object.defineProperty(e,"__esModule",{value:!0})},r.t=function(e,t){if(1&t&&(e=r(e)),8&t)return e;if(4&t&&"object"==typeof e&&e&&e.__esModule)return e;var n=Object.create(null);if(r.r(n),Object.defineProperty(n,"default",{enumerable:!0,value:e}),2&t&&"string"!=typeof e)for(var o in e)r.d(n,o,function(t){return e[t]}.bind(null,o));return n},r.n=function(e){var t=e&&e.__esModule?function(){return e.default}:function(){return e};return r.d(t,"a",t),t},r.o=function(e,t){return Object.prototype.hasOwnProperty.call(e,t)},r.p="",r(r.s=4)}({4:function(e,t,r){}}); -------------------------------------------------------------------------------- /bin/kospeech/data/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, Soohwan Kim. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
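# Editor's illustrative sketch, not part of the original file: the dataset and
# loader classes imported below feed models with acoustic features such as the
# 80-dimensional inputs used throughout test/. A minimal torchaudio equivalent
# of that front-end, assuming 16 kHz mono audio; the helper name is the editor's.
def _fbank_sketch(waveform, sample_rate: int = 16000):
    """waveform: (1, num_samples) float tensor -> (num_frames, 80) log-mel filterbank."""
    import torchaudio
    return torchaudio.compliance.kaldi.fbank(
        waveform,
        num_mel_bins=80,
        sample_frequency=float(sample_rate),
    )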
14 | 15 | from kospeech.data.audio.parser import SpectrogramParser 16 | from kospeech.data.label_loader import load_dataset 17 | from kospeech.data.data_loader import ( 18 | SpectrogramDataset, 19 | AudioDataLoader, 20 | MultiDataLoader, 21 | split_dataset, 22 | ) 23 | -------------------------------------------------------------------------------- /docs/source/index.rst: -------------------------------------------------------------------------------- 1 | .. KoSpeech documentation master file, created by 2 | sphinx-quickstart on Wed Mar 4 02:42:19 2020. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Welcome to KoSpeech's documentation! 7 | =========================================================== 8 | 9 | .. toctree:: 10 | :maxdepth: 1 11 | :caption: GETTING STARTED 12 | 13 | notes/intro 14 | notes/Preparation 15 | 16 | .. toctree:: 17 | :maxdepth: 1 18 | :caption: ARCHITECTURE 19 | 20 | Interface 21 | Deep Speech 2 22 | Listen Attend Spell 23 | RNN Transducer 24 | Speech Transformer 25 | Jasper 26 | Conformer 27 | Modules 28 | 29 | 30 | .. toctree:: 31 | :maxdepth: 1 32 | :caption: LIBRARY REFERENCE 33 | 34 | Checkpoint 35 | Criterion 36 | Data 37 | Decode 38 | Evaluator 39 | Optim 40 | Learning Rate Schedulers 41 | Trainer 42 | Vocabs 43 | Etc 44 | -------------------------------------------------------------------------------- /docs/_sources/index.rst.txt: -------------------------------------------------------------------------------- 1 | .. KoSpeech documentation master file, created by 2 | sphinx-quickstart on Wed Mar 4 02:42:19 2020. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Welcome to KoSpeech's documentation! 7 | =========================================================== 8 | 9 | .. toctree:: 10 | :maxdepth: 1 11 | :caption: GETTING STARTED 12 | 13 | notes/intro 14 | notes/Preparation 15 | 16 | .. toctree:: 17 | :maxdepth: 1 18 | :caption: ARCHITECTURE 19 | 20 | Interface 21 | Deep Speech 2 22 | Listen Attend Spell 23 | RNN Transducer 24 | Speech Transformer 25 | Jasper 26 | Conformer 27 | Modules 28 | 29 | 30 | .. toctree:: 31 | :maxdepth: 1 32 | :caption: LIBRARY REFERENCE 33 | 34 | Checkpoint 35 | Criterion 36 | Data 37 | Decode 38 | Evaluator 39 | Optim 40 | Learning Rate Schedulers 41 | Trainer 42 | Vocabs 43 | Etc 44 | -------------------------------------------------------------------------------- /docs/source/Speech Transformer.rst: -------------------------------------------------------------------------------- 1 | 2 | Speech Transformer 3 | ===================================================== 4 | 5 | Transformer 6 | -------------------------------------------- 7 | 8 | .. automodule:: kospeech.models.transformer.model 9 | :members: 10 | 11 | Encoder 12 | -------------------------------------------- 13 | 14 | .. automodule:: kospeech.models.transformer.encoder 15 | :members: 16 | 17 | Decoder 18 | -------------------------------------------- 19 | 20 | .. automodule:: kospeech.models.transformer.decoder 21 | :members: 22 | 23 | Sublayers 24 | -------------------------------------------- 25 | 26 | .. automodule:: kospeech.models.transformer.sublayers 27 | :members: 28 | 29 | Embeddings 30 | -------------------------------------------- 31 | 32 | .. automodule:: kospeech.models.transformer.embeddings 33 | :members: 34 | 35 | Mask 36 | -------------------------------------------- 37 | 38 | .. 
automodule:: kospeech.models.transformer.mask 39 | :members: -------------------------------------------------------------------------------- /docs/_sources/Speech Transformer.rst.txt: -------------------------------------------------------------------------------- 1 | 2 | Speech Transformer 3 | ===================================================== 4 | 5 | Transformer 6 | -------------------------------------------- 7 | 8 | .. automodule:: kospeech.models.transformer.model 9 | :members: 10 | 11 | Encoder 12 | -------------------------------------------- 13 | 14 | .. automodule:: kospeech.models.transformer.encoder 15 | :members: 16 | 17 | Decoder 18 | -------------------------------------------- 19 | 20 | .. automodule:: kospeech.models.transformer.decoder 21 | :members: 22 | 23 | Sublayers 24 | -------------------------------------------- 25 | 26 | .. automodule:: kospeech.models.transformer.sublayers 27 | :members: 28 | 29 | Embeddings 30 | -------------------------------------------- 31 | 32 | .. automodule:: kospeech.models.transformer.embeddings 33 | :members: 34 | 35 | Mask 36 | -------------------------------------------- 37 | 38 | .. automodule:: kospeech.models.transformer.mask 39 | :members: -------------------------------------------------------------------------------- /docs/build/html/_sources/index.rst.txt: -------------------------------------------------------------------------------- 1 | .. KoSpeech documentation master file, created by 2 | sphinx-quickstart on Wed Mar 4 02:42:19 2020. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Welcome to KoSpeech's documentation! 7 | =========================================================== 8 | 9 | .. toctree:: 10 | :maxdepth: 1 11 | :caption: GETTING STARTED 12 | 13 | notes/intro 14 | notes/Preparation 15 | 16 | .. toctree:: 17 | :maxdepth: 1 18 | :caption: ARCHITECTURE 19 | 20 | Interface 21 | Deep Speech 2 22 | Listen Attend Spell 23 | RNN Transducer 24 | Speech Transformer 25 | Jasper 26 | Conformer 27 | Modules 28 | 29 | 30 | .. toctree:: 31 | :maxdepth: 1 32 | :caption: LIBRARY REFERENCE 33 | 34 | Checkpoint 35 | Criterion 36 | Data 37 | Decode 38 | Evaluator 39 | Optim 40 | Learning Rate Schedulers 41 | Trainer 42 | Vocabs 43 | Etc 44 | -------------------------------------------------------------------------------- /docs/build/html/_sources/Speech Transformer.rst.txt: -------------------------------------------------------------------------------- 1 | 2 | Speech Transformer 3 | ===================================================== 4 | 5 | Transformer 6 | -------------------------------------------- 7 | 8 | .. automodule:: kospeech.models.transformer.model 9 | :members: 10 | 11 | Encoder 12 | -------------------------------------------- 13 | 14 | .. automodule:: kospeech.models.transformer.encoder 15 | :members: 16 | 17 | Decoder 18 | -------------------------------------------- 19 | 20 | .. automodule:: kospeech.models.transformer.decoder 21 | :members: 22 | 23 | Sublayers 24 | -------------------------------------------- 25 | 26 | .. automodule:: kospeech.models.transformer.sublayers 27 | :members: 28 | 29 | Embeddings 30 | -------------------------------------------- 31 | 32 | .. automodule:: kospeech.models.transformer.embeddings 33 | :members: 34 | 35 | Mask 36 | -------------------------------------------- 37 | 38 | .. 
automodule:: kospeech.models.transformer.mask 39 | :members: -------------------------------------------------------------------------------- /bin/kospeech/models/deepspeech2/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, Soohwan Kim. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from dataclasses import dataclass 16 | from kospeech.models import ModelConfig 17 | 18 | 19 | @dataclass 20 | class DeepSpeech2Config(ModelConfig): 21 | architecture: str = "deepspeech2" 22 | use_bidirectional: bool = True 23 | rnn_type: str = "gru" 24 | hidden_dim: int = 1024 25 | activation: str = "hardtanh" 26 | num_encoder_layers: int = 3 27 | -------------------------------------------------------------------------------- /bin/kospeech/evaluator/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, Soohwan Kim. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from dataclasses import dataclass 16 | 17 | 18 | @dataclass 19 | class EvalConfig: 20 | dataset: str = 'kspon' 21 | dataset_path: str = '' 22 | transcripts_path: str = '../../../data/eval_transcript.txt' 23 | model_path: str = '' 24 | output_unit: str = 'character' 25 | batch_size: int = 32 26 | num_workers: int = 4 27 | print_every: int = 20 28 | decode: str = 'greedy' 29 | k: int = 3 30 | use_cuda: bool = True -------------------------------------------------------------------------------- /bin/kospeech/models/rnnt/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021, Soohwan Kim. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
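# Editor's note (illustrative, not part of the original file): models built from
# the RNNTransducerConfig below are trained with an RNN-T (transducer) criterion.
# One readily available implementation is torchaudio.transforms.RNNTLoss
# (torchaudio is pinned to 0.10.0 in requirements_cssiri.txt); a hedged usage
# sketch, with all shapes and dtypes being the editor's assumptions:
#
#     import torch
#     import torchaudio
#
#     rnnt_loss = torchaudio.transforms.RNNTLoss(blank=0)
#     logits = torch.randn(2, 50, 11, 32, requires_grad=True)        # (B, T, U + 1, vocab)
#     targets = torch.randint(1, 32, (2, 10), dtype=torch.int32)     # (B, U), blank excluded
#     logit_lengths = torch.tensor([50, 45], dtype=torch.int32)
#     target_lengths = torch.tensor([10, 8], dtype=torch.int32)
#     loss = rnnt_loss(logits, targets, logit_lengths, target_lengths)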
14 | 15 | from dataclasses import dataclass 16 | 17 | 18 | @dataclass 19 | class RNNTransducerConfig: 20 | architecture: str = "rnnt" 21 | num_encoder_layers: int = 4 22 | num_decoder_layers: int = 1 23 | encoder_hidden_state_dim: int = 320 24 | decoder_hidden_state_dim: int = 512 25 | output_dim: int = 512 26 | rnn_type: str = "lstm" 27 | bidirectional: bool = True 28 | encoder_dropout_p: float = 0.2 29 | decoder_dropout_p: float = 0.2 30 | -------------------------------------------------------------------------------- /dataset/kspon/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Soohwan Kim 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /test/model_architecture.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, Soohwan Kim. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from kospeech.models.las.encoder import Listener 16 | from kospeech.models.las.decoder import Speller 17 | from kospeech.models.las.model import ListenAttendSpell 18 | from kospeech.models.transformer.model import SpeechTransformer 19 | 20 | 21 | encoder = Listener(80, 512, 'cpu') 22 | decoder = Speller(2038, 151, 1024, 1, 2) 23 | model = ListenAttendSpell(encoder, decoder) 24 | 25 | print(model) 26 | 27 | 28 | model = SpeechTransformer(num_classes=2038, num_encoder_layers=3, num_decoder_layers=3) 29 | print(model) 30 | -------------------------------------------------------------------------------- /test/cer_visualize.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, Soohwan Kim. All rights reserved. 
2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import pandas as pd 16 | import matplotlib.pyplot as plt 17 | 18 | RGB = (0.4157, 0.2784, 0.3333) 19 | TRAIN_RESULT_PATH = '../data/train_result/train_step_result.csv' 20 | 21 | train_result = pd.read_csv(TRAIN_RESULT_PATH, delimiter=',', encoding='cp949') 22 | losses = train_result['loss'] 23 | cers = train_result['cer'] 24 | 25 | plt.title('Visualization of training (cer)') 26 | plt.plot(cers, color=RGB, label='cers') 27 | plt.xlabel('step (unit : 1000)', fontsize='x-large') 28 | plt.ylabel('cer', fontsize='x-large') 29 | plt.show() 30 | -------------------------------------------------------------------------------- /bin/kospeech/vocabs/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, Soohwan Kim. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | class Vocabulary(object): 17 | """ 18 | Note: 19 | Do not use this class directly, use one of the sub classes. 20 | """ 21 | def __init__(self, *args, **kwargs): 22 | self.sos_id = None 23 | self.eos_id = None 24 | self.pad_id = None 25 | self.blank_id = None 26 | 27 | def label_to_string(self, labels): 28 | raise NotImplementedError 29 | 30 | 31 | from kospeech.vocabs.ksponspeech import KsponSpeechVocabulary 32 | from kospeech.vocabs.librispeech import LibriSpeechVocabulary 33 | -------------------------------------------------------------------------------- /test/loss_visualize.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, Soohwan Kim. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
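# Editor's note (illustrative, not part of the original script): the Vocabulary
# base class shown above (bin/kospeech/vocabs/__init__.py) only fixes the
# interface; a minimal concrete subclass, with a made-up id-to-character table,
# could look like this:
#
#     from kospeech.vocabs import Vocabulary
#
#     class ToyVocabulary(Vocabulary):
#         def __init__(self):
#             super().__init__()
#             self.id2char = {0: '<pad>', 1: '<sos>', 2: '<eos>', 3: 'a', 4: 'b'}
#             self.sos_id, self.eos_id, self.pad_id = 1, 2, 0
#
#         def label_to_string(self, labels):
#             specials = (self.sos_id, self.eos_id, self.pad_id)
#             return ''.join(self.id2char[int(l)] for l in labels if int(l) not in specials)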
14 | 15 | import pandas as pd 16 | import matplotlib.pyplot as plt 17 | 18 | RGB = (0.4157, 0.2784, 0.3333) 19 | TRAIN_RESULT_PATH = '../data/train_result/train_step_result.csv' 20 | 21 | 22 | train_result = pd.read_csv(TRAIN_RESULT_PATH, delimiter=',', encoding='cp949') 23 | losses = train_result['loss'] 24 | cers = train_result['cer'] 25 | 26 | 27 | plt.title('Visualization of training (loss)') 28 | plt.plot(losses, color=RGB, label='losses') 29 | plt.xlabel('step (unit : 1000)', fontsize='x-large') 30 | plt.ylabel('L2norm', fontsize='x-large') 31 | plt.show() 32 | -------------------------------------------------------------------------------- /test/test_rnnt.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | from kospeech.models.rnnt.model import RNNTransducer 5 | 6 | batch_size, sequence_length, dim = 3, 12345, 80 7 | 8 | cuda = torch.cuda.is_available() 9 | device = torch.device('cuda' if cuda else 'cpu') 10 | 11 | model = nn.DataParallel(RNNTransducer( 12 | num_classes=10, 13 | input_dim=dim, 14 | num_encoder_layers=3, 15 | )).to(device) 16 | 17 | criterion = nn.CTCLoss(blank=3, zero_infinity=True) 18 | optimizer = torch.optim.Adam(model.parameters(), lr=1e-04) 19 | 20 | for i in range(10): 21 | inputs = torch.rand(batch_size, sequence_length, dim).to(device) 22 | input_lengths = torch.IntTensor([12345, 12300, 12000]) 23 | targets = torch.LongTensor([[1, 3, 3, 3, 3, 3, 4, 5, 6, 2], 24 | [1, 3, 3, 3, 3, 3, 4, 5, 2, 0], 25 | [1, 3, 3, 3, 3, 3, 4, 2, 0, 0]]).to(device) 26 | target_lengths = torch.LongTensor([9, 8, 7]) 27 | 28 | outputs = model(inputs, input_lengths, targets, target_lengths) 29 | print(outputs.size()) 30 | print("PASS") 31 | # loss = criterion(outputs.transpose(0, 1), targets[:, 1:], output_lengths, target_lengths) 32 | # loss.backward() 33 | # optimizer.step() 34 | # print(loss) 35 | -------------------------------------------------------------------------------- /test/test_deepspeech2_recognize.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, Soohwan Kim. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
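# Editor's illustrative sketch, not part of the original test: the commented-out
# criterion call in test_rnnt.py above (and test_conformer.py below) relies on
# nn.CTCLoss, which expects time-first (T, N, C) log-probabilities plus per-sample
# lengths, hence the transpose(0, 1). Assuming the model emits batch-first
# log-probabilities and their frame lengths, the call would look like this; the
# helper name is the editor's own.
def _ctc_loss_sketch(log_probs, targets, output_lengths, target_lengths, blank: int = 3):
    """log_probs: (N, T, C) log-probabilities, targets: (N, S) label indices."""
    import torch.nn as nn
    criterion = nn.CTCLoss(blank=blank, zero_infinity=True)
    return criterion(log_probs.transpose(0, 1), targets, output_lengths, target_lengths)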
14 | 15 | import torch 16 | from kospeech.models import DeepSpeech2 17 | 18 | batch_size = 3 19 | sequence_length = 14321 20 | dimension = 80 21 | 22 | cuda = torch.cuda.is_available() 23 | device = torch.device('cuda' if cuda else 'cpu') 24 | 25 | inputs = torch.rand(batch_size, sequence_length, dimension).to(device) # BxTxD 26 | input_lengths = torch.LongTensor([14321, 14300, 13000]).to(device) 27 | 28 | print("Deep Speech 2 Model Test..") 29 | model = DeepSpeech2(num_classes=10, input_dim=dimension).to(device) 30 | output = model.recognize(inputs, input_lengths) 31 | 32 | print(output) 33 | print(output.size()) 34 | -------------------------------------------------------------------------------- /test/test_conformer.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | from kospeech.models.conformer import Conformer 5 | 6 | batch_size, sequence_length, dim = 3, 12345, 80 7 | 8 | cuda = torch.cuda.is_available() 9 | device = torch.device('cuda' if cuda else 'cpu') 10 | 11 | model = nn.DataParallel(Conformer( 12 | num_classes=10, 13 | input_dim=dim, 14 | encoder_dim=32, 15 | num_encoder_layers=3, 16 | decoder_dim=32, 17 | device=device, 18 | )).to(device) 19 | 20 | criterion = nn.CTCLoss(blank=3, zero_infinity=True) 21 | optimizer = torch.optim.Adam(model.parameters(), lr=1e-04) 22 | 23 | for i in range(10): 24 | inputs = torch.rand(batch_size, sequence_length, dim).to(device) 25 | input_lengths = torch.IntTensor([12345, 12300, 12000]) 26 | targets = torch.LongTensor([[1, 3, 3, 3, 3, 3, 4, 5, 6, 2], 27 | [1, 3, 3, 3, 3, 3, 4, 5, 2, 0], 28 | [1, 3, 3, 3, 3, 3, 4, 2, 0, 0]]).to(device) 29 | target_lengths = torch.LongTensor([9, 8, 7]) 30 | 31 | outputs = model(inputs, input_lengths, targets, target_lengths) 32 | print("PASS") 33 | # loss = criterion(outputs.transpose(0, 1), targets[:, 1:], output_lengths, target_lengths) 34 | # loss.backward() 35 | # optimizer.step() 36 | # print(loss) 37 | -------------------------------------------------------------------------------- /bin/kospeech/optim/lr_scheduler/lr_scheduler.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, Soohwan Kim. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | class LearningRateScheduler(object): 17 | """ 18 | Provides interface of learning rate scheduler. 19 | 20 | Note: 21 | Do not use this class directly, use one of the sub classes. 
22 | """ 23 | def __init__(self, optimizer, init_lr): 24 | self.optimizer = optimizer 25 | self.init_lr = init_lr 26 | 27 | def step(self, *args, **kwargs): 28 | raise NotImplementedError 29 | 30 | @staticmethod 31 | def set_lr(optimizer, lr): 32 | for g in optimizer.param_groups: 33 | g['lr'] = lr 34 | 35 | def get_lr(self): 36 | for g in self.optimizer.param_groups: 37 | return g['lr'] -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, Soohwan Kim. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from distutils.core import setup 16 | 17 | setup( 18 | name='KoSpeech', 19 | version='latest', 20 | description='Open-Source Toolkit for End-to-End Korean Speech Recognition', 21 | author='Soohwan Kim', 22 | author_email='kaki.ai@tunib.ai', 23 | url='https://github.com/sooftware/KoSpeech', 24 | install_requires=[ 25 | 'torch>=1.4.0', 26 | 'python-Levenshtein', 27 | 'librosa >= 0.7.0', 28 | 'numpy', 29 | 'pandas', 30 | 'tqdm', 31 | 'matplotlib', 32 | 'astropy', 33 | 'sentencepiece', 34 | 'hydra-core', 35 | ], 36 | keywords=['asr', 'speech_recognition', 'korean'], 37 | python_requires='>=3', 38 | ) 39 | -------------------------------------------------------------------------------- /test/test_las_decoder.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021, Soohwan Kim. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | import torch 16 | from kospeech.models.las.decoder import DecoderRNN 17 | 18 | cuda = torch.cuda.is_available() 19 | device = torch.device('cuda' if cuda else 'cpu') 20 | 21 | inputs = torch.LongTensor([[1, 1, 2], [3, 4, 2], [7, 2, 0]]) 22 | encoder_outputs = torch.rand(3, 100, 32) 23 | 24 | decoder = DecoderRNN(num_classes=10, hidden_state_dim=32, max_length=10) 25 | decoder_outputs = decoder(inputs, encoder_outputs, teacher_forcing_ratio=1.0) 26 | print("teacher_forcing_ratio=1.0 PASS") 27 | 28 | decoder = DecoderRNN(num_classes=10, hidden_state_dim=32, max_length=10) 29 | decoder_outputs = decoder(inputs, encoder_outputs, teacher_forcing_ratio=0.0) 30 | print("teacher_forcing_ratio=0.0 PASS") 31 | -------------------------------------------------------------------------------- /bin/kospeech/models/transformer/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, Soohwan Kim. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from dataclasses import dataclass 16 | from kospeech.models import ModelConfig 17 | 18 | 19 | @dataclass 20 | class TransformerConfig(ModelConfig): 21 | architecture: str = "transformer" 22 | extractor: str = "vgg" 23 | use_bidirectional: bool = True 24 | dropout: float = 0.3 25 | d_model: int = 512 26 | d_ff: int = 2048 27 | num_heads: int = 8 28 | num_encoder_layers: int = 12 29 | num_decoder_layers: int = 6 30 | 31 | 32 | @dataclass 33 | class JointCTCAttentionTransformerConfig(TransformerConfig): 34 | cross_entropy_weight: float = 0.7 35 | ctc_weight: float = 0.3 36 | mask_conv: bool = True 37 | joint_ctc_attention: bool = True 38 | -------------------------------------------------------------------------------- /data/train_result/train_step_result.csv: -------------------------------------------------------------------------------- 1 | loss,cer 2 | 0.49725567911757396,1.3785942282178933 3 | 0.4320526512272614,1.1382519868779264 4 | 0.40069148419866835,1.0359303697242361 5 | 0.38132347770707137,0.9773902462637962 6 | 0.366732018967894,0.9358039262938204 7 | 0.35402797622241,0.9017611604593339 8 | 0.342316426582415,0.8695670980202619 9 | 0.3312905454609975,0.8388701831709967 10 | 0.32115954891959464,0.8100443994182784 11 | 0.3118217632253063,0.7826462731244754 12 | 0.30338589655017695,0.756859449840871 13 | 0.2956769136612572,0.7332592623465659 14 | 0.2886054592627544,0.7115002051414936 15 | 0.28227339170816595,0.691827295638539 16 | 0.2764920089726153,0.6736116550939593 17 | 0.27115296381537046,0.6566396600963952 18 | 0.26627046029677365,0.6409298328194061 19 | 0.2618086007187791,0.6263652122748918 20 | 0.2576973859263268,0.6128638668158448 21 | 0.25388669457476953,0.6003908023350812 22 | 0.2504016133631766,0.5888744727804983 23 | 0.24708599731583045,0.5779467286271303 24 | 0.24395953697892694,0.5675731793954828 25 | 0.24105681277818689,0.5580052233482057 26 | 0.23831809159150824,0.5490285275196444 27 | 
0.23573360881928176,0.5403110411146902 28 | 0.23329234001609772,0.5319247671046639 29 | 0.23100109575536898,0.5241830523736276 30 | 0.22887273296913221,0.5169182535522262 31 | 0.2267568045013077,0.5097991735342743 32 | 0.22478250193826105,0.5030199298264065 33 | 0.22290241163138483,0.49655470625483017 34 | 0.22113228150091996,0.4905992570502586 35 | 0.21945243381020205,0.4848017214126452 36 | -------------------------------------------------------------------------------- /test/test_jasper_recognize.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021, Soohwan Kim. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import torch 16 | from kospeech.models.jasper.model import Jasper 17 | 18 | batch_size = 3 19 | sequence_length = 14321 20 | dimension = 80 21 | 22 | cuda = torch.cuda.is_available() 23 | device = torch.device('cuda' if cuda else 'cpu') 24 | 25 | inputs = torch.rand(batch_size, sequence_length, dimension).to(device) # BxTxD 26 | input_lengths = torch.LongTensor([14321, 14300, 13000]).to(device) 27 | 28 | print("Jasper 10x3 Model Test..") 29 | model = Jasper(num_classes=10, version='10x5').to(device) 30 | output = model.recognize(inputs, input_lengths) 31 | 32 | print(output) 33 | print(output.size()) 34 | 35 | print("Jasper 5x3 Model Test..") 36 | model = Jasper(num_classes=10, version='5x3').to(device) 37 | output = model.recognize(inputs, input_lengths) 38 | 39 | print(output) 40 | print(output.size()) 41 | -------------------------------------------------------------------------------- /test/test_las_recognize.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, Soohwan Kim. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
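The train_step_result.csv shown above logs one (loss, cer) pair per logging step, so it can be inspected directly with pandas and matplotlib. A minimal sketch, assuming the file sits at data/train_result/train_step_result.csv as in this repository layout:

import pandas as pd
import matplotlib.pyplot as plt

df = pd.read_csv('data/train_result/train_step_result.csv')  # columns: loss, cer

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 4))
ax1.plot(df['loss'])
ax1.set_xlabel('step')
ax1.set_ylabel('loss')
ax2.plot(df['cer'])
ax2.set_xlabel('step')
ax2.set_ylabel('cer')
plt.tight_layout()
plt.show()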
14 | 15 | import torch 16 | 17 | from kospeech.models import ListenAttendSpell 18 | from kospeech.models.las.encoder import EncoderRNN 19 | from kospeech.models.las.decoder import DecoderRNN 20 | 21 | B, T, D, H = 3, 12345, 80, 32 22 | 23 | cuda = torch.cuda.is_available() 24 | device = torch.device('cuda' if cuda else 'cpu') 25 | 26 | inputs = torch.rand(B, T, D).to(device) 27 | input_lengths = torch.IntTensor([T, T - 100, T - 1000]) 28 | targets = torch.LongTensor([[1, 1, 2], [3, 4, 2], [7, 2, 0]]) 29 | 30 | model = ListenAttendSpell( 31 | input_dim=D, 32 | num_classes=10, 33 | encoder_hidden_state_dim=H, 34 | decoder_hidden_state_dim=H << 1, 35 | bidirectional=True, 36 | max_length=10, 37 | ).to(device) 38 | 39 | outputs = model.recognize(inputs, input_lengths) 40 | print(outputs.size()) 41 | print("LAS Recognize PASS") 42 | -------------------------------------------------------------------------------- /dataset/libri/prepare-libri.sh: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Soohwan Kim, Seyoung Bae, Cheolhwang Won. 3 | # @ArXiv : KoSpeech: Open-Source Toolkit for End-to-End Korean Speech Recognition 4 | # This source code is licensed under the Apache 2.0 License license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | # $1 : DIR_TO_SAVE_DATA 8 | 9 | base_url=www.openslr.org/resources/12 10 | train_dir=train_960 11 | vocab_size=5000 12 | 13 | if [ "$#" -ne 1 ]; then 14 | echo "Usage: $0 >" 15 | exit 1 16 | fi 17 | 18 | download_dir=${1%/} 19 | 20 | echo "Data Download" 21 | for part in dev-clean test-clean dev-other test-other train-clean-100 train-clean-360 train-other-500; do 22 | url=$base_url/$part.tar.gz 23 | if ! wget -P $download_dir $url; then 24 | echo "$0: wget failed for $url" 25 | exit 1 26 | fi 27 | if ! tar -C $download_dir -xvzf $download_dir/$part.tar.gz; then 28 | echo "$0: error un-tarring archive $download_dir/$part.tar.gz" 29 | exit 1 30 | fi 31 | done 32 | 33 | echo "Merge all train packs into one" 34 | mkdir -p ${download_dir}/LibriSpeech/${train_dir}/ 35 | for part in train-clean-100 train-clean-360 train-other-500; do 36 | mv ${download_dir}/LibriSpeech/${part}/* $download_dir/LibriSpeech/${train_dir}/ 37 | done 38 | 39 | python prepare-libri.py --dataset_path $1/LibriSpeech --vocab_size $vocab_size 40 | 41 | for part in dev-clean test-clean dev-other test-other train-clean-100 train-clean-360 train-other-500; do 42 | rm $part.tar.gz 43 | done -------------------------------------------------------------------------------- /test/test_las_encoder.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, Soohwan Kim. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | import torch 16 | from kospeech.models.las.encoder import EncoderRNN 17 | 18 | inputs = torch.rand(3, 12345, 80) 19 | input_lengths = torch.IntTensor([12345, 12300, 12000]) 20 | 21 | encoder = EncoderRNN(input_dim=80, hidden_state_dim=32, joint_ctc_attention=False, extractor='vgg') 22 | encoder_outputs, encoder_log_probs, encoder_output_lengths = encoder(inputs, input_lengths) 23 | print("joint_ctc_attention=False PASS, VGGExtractor") 24 | 25 | encoder = EncoderRNN(input_dim=80, hidden_state_dim=32, joint_ctc_attention=False, extractor='ds2') 26 | encoder_outputs, encoder_output_lengths, encoder_log_probs = encoder(inputs, input_lengths) 27 | print("joint_ctc_attention=False PASS, DeepSpeech2Extractor") 28 | 29 | encoder = EncoderRNN(input_dim=80, hidden_state_dim=32, num_classes=2, joint_ctc_attention=True) 30 | encoder_outputs, encoder_output_lengths, encoder_log_probs = encoder(inputs, input_lengths) 31 | print("joint_ctc_attention=True PASS") 32 | -------------------------------------------------------------------------------- /test/test_transformer_recognize.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, Soohwan Kim. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import torch 16 | 17 | from kospeech.model_builder import build_transformer 18 | from kospeech.models import SpeechTransformer 19 | 20 | batch_size = 4 21 | seq_length = 200 22 | input_size = 80 23 | 24 | cuda = torch.cuda.is_available() 25 | device = torch.device('cuda' if cuda else 'cpu') 26 | 27 | transformer = build_transformer( 28 | num_classes=10, 29 | d_model=16, 30 | d_ff=32, 31 | num_heads=2, 32 | input_dim=input_size, 33 | num_encoder_layers=3, 34 | num_decoder_layers=2, 35 | extractor='vgg', 36 | dropout_p=0.1, 37 | device=device, 38 | pad_id=0, 39 | sos_id=1, 40 | eos_id=2, 41 | joint_ctc_attention=False, 42 | max_length=10, 43 | ) 44 | 45 | inputs = torch.FloatTensor(batch_size, seq_length, input_size).to(device) 46 | input_lengths = torch.LongTensor([seq_length, seq_length - 10, seq_length - 20, seq_length - 30]).to(device) 47 | 48 | output = transformer.module.recognize(inputs, input_lengths) 49 | print(output.size()) 50 | print("Test Transformer PASS") 51 | -------------------------------------------------------------------------------- /bin/kospeech/models/activation.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021, Soohwan Kim. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import torch.nn as nn 16 | from torch import Tensor 17 | 18 | 19 | class Swish(nn.Module): 20 | """ 21 | Swish is a smooth, non-monotonic function that consistently matches or outperforms ReLU on deep networks applied 22 | to a variety of challenging domains such as Image classification and Machine translation. 23 | """ 24 | 25 | def __init__(self): 26 | super(Swish, self).__init__() 27 | 28 | def forward(self, inputs: Tensor) -> Tensor: 29 | return inputs * inputs.sigmoid() 30 | 31 | 32 | class GLU(nn.Module): 33 | """ 34 | The gating mechanism is called Gated Linear Units (GLU), which was first introduced for natural language processing 35 | in the paper “Language Modeling with Gated Convolutional Networks” 36 | """ 37 | 38 | def __init__(self, dim: int) -> None: 39 | super(GLU, self).__init__() 40 | self.dim = dim 41 | 42 | def forward(self, inputs: Tensor) -> Tensor: 43 | outputs, gate = inputs.chunk(2, dim=self.dim) 44 | return outputs * gate.sigmoid() 45 | -------------------------------------------------------------------------------- /data/vocab/aihub_grapheme_vocabs.csv: -------------------------------------------------------------------------------- 1 | id,grpm,freq 2 | 0,,0 3 | 1,,0 4 | 2,,0 5 | 3,|,5774244 6 | 4,ᅡ,3560117 7 | 5,ᄋ,3329997 8 | 6,ᄀ,2776207 9 | 7,ᅵ,2171847 10 | 8,ᆫ,2099990 11 | 9,ᅳ,2005508 12 | 10,ᅥ,1933125 13 | 11,ᅩ,1322658 14 | 12,ᄂ,1267792 15 | 13,ᄃ,1256854 16 | 14,ᄌ,1088700 17 | 15,ᆯ,990624 18 | 16,ᄅ,940940 19 | 17,ᄉ,920582 20 | 18,ᅦ,899879 21 | 19,ᄆ,891270 22 | 20,ᄒ,802845 23 | 21,ᅢ,797078 24 | 22,.,640923 25 | 23,ᅮ,613109 26 | 24,ᆼ,601838 27 | 25,ᆨ,523735 28 | 26,ᅧ,475473 29 | 27,ᄇ,473425 30 | 28,ᆻ,467303 31 | 29,ᆷ,415188 32 | 30,ᅣ,263813 33 | 31,?,235024 34 | 32,ᄎ,213755 35 | 33,ᄁ,194941 36 | 34,ᅯ,194464 37 | 35,ᄄ,165687 38 | 36,ᅪ,153572 39 | 37,ᆭ,144828 40 | 38,ᆸ,143910 41 | 39,ᄐ,141767 42 | 40,ᅬ,117111 43 | 41,ᄍ,114604 44 | 42,ᄑ,113442 45 | 43,ᆺ,110034 46 | 44,ᇂ,107712 47 | 45,ᅭ,104403 48 | 46,ᇀ,95003 49 | 47,ᄏ,82797 50 | 48,ᅫ,58525 51 | 49,ᄊ,52923 52 | 50,ᆹ,49619 53 | 51,ᅤ,48322 54 | 52,ᅨ,45861 55 | 53,ᆽ,44775 56 | 54,ᄈ,43872 57 | 55,ᅲ,40767 58 | 56,ᅱ,36498 59 | 57,ᇁ,33446 60 | 58,ᅴ,27363 61 | 59,ᆮ,24670 62 | 60,ᆩ,10787 63 | 61,ᆾ,10627 64 | 62,ᆶ,8431 65 | 63,ᆰ,5339 66 | 64,ᆲ,3841 67 | 65,ᅰ,2907 68 | 66,C,2704 69 | 67,ᆱ,2658 70 | 68,T,2248 71 | 69,ᆬ,1971 72 | 70,S,1844 73 | 71,P,1620 74 | 72,V,1159 75 | 73,A,1156 76 | 74,M,1087 77 | 75,B,1057 78 | 76,G,940 79 | 77,D,843 80 | 78,!,841 81 | 79,O,757 82 | 80,N,748 83 | 81,R,740 84 | 82,I,735 85 | 83,K,654 86 | 84,L,626 87 | 85,E,527 88 | 86,F,496 89 | 87,J,372 90 | 88,X,366 91 | 89,U,315 92 | 90,Y,231 93 | 91,H,197 94 | 92,ᆿ,114 95 | 93,ᆴ,87 96 | 94,W,80 97 | 95,ᆪ,63 98 | 96,Q,55 99 | 97,2,27 100 | 98,1,22 101 | 99,3,13 102 | 100,ᆵ,12 103 | 101,Z,11 104 | 102,4,8 105 | 103,0,6 106 | 104,8,5 107 | 105,5,5 108 | 106,6,2 109 | 107,7,1 110 | -------------------------------------------------------------------------------- /bin/kospeech/models/las/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, 
Soohwan Kim. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from dataclasses import dataclass 16 | 17 | from kospeech.models import ModelConfig 18 | from kospeech.models.las.encoder import EncoderRNN 19 | from kospeech.models.las.decoder import DecoderRNN 20 | 21 | 22 | @dataclass 23 | class ListenAttendSpellConfig(ModelConfig): 24 | architecture: str = "las" 25 | use_bidirectional: bool = True 26 | dropout: float = 0.3 27 | num_heads: int = 4 28 | num_encoder_layers: int = 3 29 | num_decoder_layers: int = 2 30 | rnn_type: str = "lstm" 31 | hidden_dim: int = 512 32 | teacher_forcing_ratio: float = 1.0 33 | attn_mechanism: str = "multi-head" 34 | teacher_forcing_step: float = 0.01 35 | min_teacher_forcing_ratio: float = 0.9 36 | extractor: str = "vgg" 37 | activation: str = "hardtanh" 38 | mask_conv: bool = False 39 | joint_ctc_attention: bool = False 40 | 41 | 42 | @dataclass 43 | class JointCTCAttentionLASConfig(ListenAttendSpellConfig): 44 | hidden_dim: int = 768 45 | cross_entropy_weight: float = 0.7 46 | ctc_weight: float = 0.3 47 | mask_conv: bool = True 48 | joint_ctc_attention: bool = True -------------------------------------------------------------------------------- /test/test_jasper.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021, Soohwan Kim. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | import torch 16 | import torch.nn as nn 17 | 18 | from kospeech.models.jasper.model import Jasper 19 | 20 | batch_size = 3 21 | sequence_length = 14321 22 | dimension = 80 23 | 24 | cuda = torch.cuda.is_available() 25 | device = torch.device('cuda' if cuda else 'cpu') 26 | 27 | model = Jasper(num_classes=10, version='5x3').to(device) 28 | 29 | criterion = nn.CTCLoss(blank=3, zero_infinity=True) 30 | optimizer = torch.optim.Adam(model.parameters(), lr=1e-04) 31 | 32 | for i in range(10): 33 | inputs = torch.rand(batch_size, sequence_length, dimension).to(device) 34 | input_lengths = torch.IntTensor([12345, 12300, 12000]) 35 | targets = torch.LongTensor([[1, 3, 3, 3, 3, 3, 4, 5, 6, 2], 36 | [1, 3, 3, 3, 3, 3, 4, 5, 2, 0], 37 | [1, 3, 3, 3, 3, 3, 4, 2, 0, 0]]).to(device) 38 | target_lengths = torch.LongTensor([9, 8, 7]) 39 | outputs, output_lengths = model(inputs, input_lengths) 40 | 41 | loss = criterion(outputs.transpose(0, 1), targets[:, 1:], output_lengths, target_lengths) 42 | loss.backward() 43 | optimizer.step() 44 | print(loss) 45 | -------------------------------------------------------------------------------- /test/test_deepspeech2.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, Soohwan Kim. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import torch 16 | import torch.nn as nn 17 | 18 | from kospeech.models import DeepSpeech2 19 | 20 | batch_size = 3 21 | sequence_length = 14321 22 | dimension = 80 23 | 24 | cuda = torch.cuda.is_available() 25 | device = torch.device('cuda' if cuda else 'cpu') 26 | 27 | model = DeepSpeech2(num_classes=10, input_dim=dimension).to(device) 28 | 29 | criterion = nn.CTCLoss(blank=3, zero_infinity=True) 30 | optimizer = torch.optim.Adam(model.parameters(), lr=1e-04) 31 | 32 | for i in range(10): 33 | inputs = torch.rand(batch_size, sequence_length, dimension).to(device) 34 | input_lengths = torch.IntTensor([12345, 12300, 12000]) 35 | targets = torch.LongTensor([[1, 3, 3, 3, 3, 3, 4, 5, 6, 2], 36 | [1, 3, 3, 3, 3, 3, 4, 5, 2, 0], 37 | [1, 3, 3, 3, 3, 3, 4, 2, 0, 0]]).to(device) 38 | target_lengths = torch.LongTensor([9, 8, 7]) 39 | outputs, output_lengths = model(inputs, input_lengths) 40 | 41 | loss = criterion(outputs.transpose(0, 1), targets[:, 1:], output_lengths, target_lengths) 42 | loss.backward() 43 | optimizer.step() 44 | print(loss) 45 | -------------------------------------------------------------------------------- /bin/kospeech/data/audio/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, Soohwan Kim. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from dataclasses import dataclass 16 | 17 | 18 | @dataclass 19 | class AudioConfig: 20 | audio_extension: str = "pcm" 21 | sample_rate: int = 16000 22 | frame_length: int = 20 23 | frame_shift: int = 10 24 | normalize: bool = True 25 | del_silence: bool = True 26 | feature_extract_by: str = "kaldi" 27 | time_mask_num: int = 4 28 | freq_mask_num: int = 2 29 | spec_augment: bool = True 30 | input_reverse: bool = False 31 | 32 | 33 | @dataclass 34 | class FilterBankConfig(AudioConfig): 35 | transform_method: str = "fbank" 36 | n_mels: int = 80 37 | freq_mask_para: int = 18 38 | 39 | 40 | @dataclass 41 | class MelSpectrogramConfig(AudioConfig): 42 | transform_method: str = "mel" 43 | n_mels: int = 80 44 | freq_mask_para: int = 18 45 | 46 | 47 | @dataclass 48 | class MfccConfig(AudioConfig): 49 | transform_method: str = "mfcc" 50 | n_mels: int = 40 51 | freq_mask_para: int = 8 52 | 53 | 54 | @dataclass 55 | class SpectrogramConfig(AudioConfig): 56 | transform_method: str = "spectrogram" 57 | n_mels: int = 161 # Not used 58 | freq_mask_para: int = 24 59 | -------------------------------------------------------------------------------- /dataset/libri/prepare-libri.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, Soohwan Kim. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
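The AudioConfig dataclasses above only carry feature hyper-parameters (16 kHz sampling, 20 ms frames with a 10 ms shift, 80 mel bins for fbank/mel features); the extraction itself happens elsewhere in kospeech. As a rough illustration of what FilterBankConfig corresponds to when feature_extract_by is "kaldi", one could compute equivalent features with torchaudio (a sketch with a placeholder file path, not the repository's parser code):

import torchaudio

waveform, sample_rate = torchaudio.load('example.wav')  # hypothetical 16 kHz mono file

# FilterBankConfig: frame_length=20 ms, frame_shift=10 ms, n_mels=80
fbank = torchaudio.compliance.kaldi.fbank(
    waveform,
    num_mel_bins=80,
    frame_length=20.0,
    frame_shift=10.0,
    sample_frequency=sample_rate,
)
print(fbank.shape)  # (num_frames, 80): one T x D feature matrix per utterance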
14 | 15 | import argparse 16 | from preprocess import ( 17 | collect_transcripts, 18 | prepare_tokenizer, 19 | generate_transcript_file 20 | ) 21 | 22 | LIBRI_SPEECH_DATASETS = [ 23 | 'train_960', 'dev-clean', 'dev-other', 'test-clean', 'test-other' 24 | ] 25 | 26 | 27 | def _get_parser(): 28 | """ Get arguments parser """ 29 | parser = argparse.ArgumentParser(description='LibriSpeech Preprocess') 30 | parser.add_argument('--dataset_path', type=str, 31 | default='your_dataset_path', 32 | help='path of original dataset') 33 | parser.add_argument('--vocab_size', type=int, 34 | default=5000, 35 | help='size of vocab') 36 | 37 | return parser 38 | 39 | 40 | def main(): 41 | parser = _get_parser() 42 | opt = parser.parse_args() 43 | 44 | transcripts_collection = collect_transcripts(opt.dataset_path) 45 | prepare_tokenizer(transcripts_collection[0], opt.vocab_size) 46 | 47 | for idx, dataset in enumerate(LIBRI_SPEECH_DATASETS): 48 | generate_transcript_file(dataset, transcripts_collection[idx]) 49 | 50 | 51 | if __name__ == '__main__': 52 | main() -------------------------------------------------------------------------------- /test/test_las.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, Soohwan Kim. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import torch 16 | import torch.nn as nn 17 | 18 | from kospeech.models import ListenAttendSpell 19 | 20 | B, T, D, H = 3, 12345, 80, 32 21 | 22 | cuda = torch.cuda.is_available() 23 | device = torch.device('cuda' if cuda else 'cpu') 24 | 25 | model = ListenAttendSpell( 26 | input_dim=D, 27 | num_classes=10, 28 | encoder_hidden_state_dim=H, 29 | decoder_hidden_state_dim=H << 1, 30 | bidirectional=True, 31 | max_length=10, 32 | ).to(device) 33 | 34 | criterion = nn.CrossEntropyLoss(ignore_index=0, reduction='mean') 35 | optimizer = torch.optim.Adam(model.parameters(), lr=1e-04) 36 | 37 | for i in range(10): 38 | inputs = torch.rand(B, T, D).to(device) 39 | input_lengths = torch.IntTensor([12345, 12300, 12000]) 40 | targets = torch.LongTensor([[1, 3, 3, 3, 3, 3, 4, 5, 6, 2], 41 | [1, 3, 3, 3, 3, 3, 4, 5, 2, 0], 42 | [1, 3, 3, 3, 3, 3, 4, 2, 0, 0]]).to(device) 43 | outputs, output_lengths, encoder_log_probs = model(inputs, input_lengths, targets, teacher_forcing_ratio=1.0) 44 | 45 | loss = criterion(outputs.contiguous().view(-1, outputs.size(-1)), targets[:, 1:].contiguous().view(-1)) 46 | loss.backward() 47 | optimizer.step() 48 | print(loss) 49 | -------------------------------------------------------------------------------- /bin/kospeech/vocabs/librispeech.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, Soohwan Kim. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from kospeech.vocabs import Vocabulary 16 | 17 | 18 | class LibriSpeechVocabulary(Vocabulary): 19 | def __init__(self, vocab_path, model_path): 20 | super(LibriSpeechVocabulary, self).__init__() 21 | try: 22 | import sentencepiece as spm 23 | except ImportError: 24 | raise ImportError("Please install sentencepiece: `pip install sentencepiece`") 25 | self.pad_id = 0 26 | self.sos_id = 1 27 | self.eos_id = 2 28 | 29 | self.vocab_path = vocab_path 30 | 31 | self.sp = spm.SentencePieceProcessor() 32 | self.sp.Load(model_path) 33 | 34 | def __len__(self): 35 | count = 0 36 | with open(self.vocab_path, encoding='utf-8') as f: 37 | for _ in f.readlines(): 38 | count += 1 39 | 40 | return count 41 | 42 | def label_to_string(self, labels): 43 | if len(labels.shape) == 1: 44 | return self.sp.DecodeIds([l for l in labels]) 45 | 46 | sentences = list() 47 | for batch in labels: 48 | sentence = str() 49 | for label in batch: 50 | sentence = self.sp.DecodeIds([l for l in label]) 51 | sentences.append(sentence) 52 | return sentences 53 | -------------------------------------------------------------------------------- /test/test_tri_stage_lr_scheduler.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, Soohwan Kim. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
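LibriSpeechVocabulary above wraps a trained SentencePiece model: __len__ counts the lines of the vocab file and label_to_string decodes either a single label sequence or a batch of them. A hedged usage sketch, with placeholder paths for the files produced by the LibriSpeech preparation scripts:

import torch
from kospeech.vocabs.librispeech import LibriSpeechVocabulary

vocab = LibriSpeechVocabulary(
    vocab_path='data/vocab/tokenizer.vocab',  # placeholder path
    model_path='data/vocab/tokenizer.model',  # placeholder path
)

print(len(vocab))                          # number of subword units in the vocab file
labels = torch.LongTensor([1, 7, 42, 2])   # e.g. an argmax-decoded hypothesis
# decodes ids back to text; depending on the sentencepiece version the ids may
# need to be cast to plain Python ints before DecodeIds accepts them
print(vocab.label_to_string(labels))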
14 | 15 | import torch.nn as nn 16 | import matplotlib.pyplot as plt 17 | from torch import optim 18 | from kospeech.optim.__init__ import Optimizer 19 | from kospeech.optim.lr_scheduler.tri_stage_lr_scheduler import TriStageLRScheduler 20 | 21 | 22 | class Model(nn.Module): 23 | def __init__(self): 24 | super(Model, self).__init__() 25 | self.projection = nn.Linear(10, 10) 26 | 27 | def forward(self): 28 | pass 29 | 30 | 31 | INIT_LR = 1e-15 32 | PEAK_LR = 3e-04 33 | FINAL_LR = 1e-05 34 | WARMUP_STEPS = 4000 35 | MAX_GRAD_NORM = 400 36 | TOTAL_STEPS = 32000 37 | 38 | model = Model() 39 | 40 | optimizer = optim.Adam(model.parameters(), lr=INIT_LR) 41 | scheduler = TriStageLRScheduler( 42 | optimizer=optimizer, 43 | init_lr=INIT_LR, 44 | peak_lr=PEAK_LR, 45 | final_lr=FINAL_LR, 46 | warmup_steps=WARMUP_STEPS, 47 | total_steps=TOTAL_STEPS 48 | ) 49 | optimizer = Optimizer(optimizer, scheduler, WARMUP_STEPS, MAX_GRAD_NORM) 50 | lr_processes = list() 51 | 52 | for timestep in range(TOTAL_STEPS): 53 | optimizer.step(model) 54 | lr_processes.append(optimizer.get_lr()) 55 | 56 | plt.title('Test Optimizer class') 57 | plt.plot(lr_processes) 58 | plt.xlabel('timestep', fontsize='large') 59 | plt.ylabel('lr', fontsize='large') 60 | plt.show() 61 | -------------------------------------------------------------------------------- /bin/kospeech/models/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, Soohwan Kim. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from dataclasses import dataclass 16 | 17 | 18 | @dataclass 19 | class ModelConfig: 20 | architecture: str = "???" 
21 | teacher_forcing_ratio: float = 1.0 22 | teacher_forcing_step: float = 0.01 23 | min_teacher_forcing_ratio: float = 0.9 24 | dropout: float = 0.3 25 | bidirectional: bool = False 26 | joint_ctc_attention: bool = False 27 | max_len: int = 400 28 | 29 | 30 | from kospeech.models.deepspeech2.model import DeepSpeech2 31 | from kospeech.models.las.encoder import EncoderRNN 32 | from kospeech.models.las.decoder import DecoderRNN 33 | from kospeech.models.rnnt import RNNTransducerConfig 34 | from kospeech.models.rnnt.model import RNNTransducer 35 | from kospeech.models.las.model import ListenAttendSpell 36 | from kospeech.models.transformer.model import SpeechTransformer 37 | from kospeech.models.jasper.model import Jasper 38 | from kospeech.models.conformer.model import Conformer 39 | from kospeech.models.las import ListenAttendSpellConfig, JointCTCAttentionLASConfig 40 | from kospeech.models.transformer import TransformerConfig, JointCTCAttentionTransformerConfig 41 | from kospeech.models.deepspeech2 import DeepSpeech2Config 42 | from kospeech.models.jasper import JasperConfig 43 | from kospeech.models.conformer import ConformerSmallConfig, ConformerMediumConfig, ConformerLargeConfig 44 | -------------------------------------------------------------------------------- /test/test_transformer_lr_scheduler.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, Soohwan Kim. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | import torch.nn as nn 16 | import matplotlib.pyplot as plt 17 | from torch import optim 18 | from kospeech.optim.__init__ import Optimizer 19 | from kospeech.optim.lr_scheduler.transformer_lr_scheduler import TransformerLRScheduler 20 | 21 | 22 | class Model(nn.Module): 23 | def __init__(self): 24 | super(Model, self).__init__() 25 | self.projection = nn.Linear(10, 10) 26 | 27 | def forward(self): 28 | pass 29 | 30 | 31 | PEAK_LR = 1e-04 32 | FINAL_LR = 1e-07 33 | WARMUP_STEPS = 1000 34 | MAX_GRAD_NORM = 400 35 | DECAY_STEPS = 10000 36 | TOTAL_STEPS = WARMUP_STEPS + DECAY_STEPS + 10000 37 | 38 | model = Model() 39 | 40 | optimizer = optim.Adam(model.parameters(), lr=0.0) 41 | scheduler = TransformerLRScheduler( 42 | optimizer=optimizer, 43 | peak_lr=PEAK_LR, 44 | final_lr=FINAL_LR, 45 | warmup_steps=WARMUP_STEPS, 46 | decay_steps=DECAY_STEPS, 47 | final_lr_scale=0.001, 48 | ) 49 | optimizer = Optimizer(optimizer, scheduler, TOTAL_STEPS, MAX_GRAD_NORM) 50 | lr_processes = list() 51 | 52 | for timestep in range(TOTAL_STEPS): 53 | optimizer.step(model) 54 | lr_processes.append(optimizer.get_lr()) 55 | 56 | plt.title('Test Optimizer class') 57 | plt.plot(lr_processes) 58 | plt.xlabel('timestep', fontsize='large') 59 | plt.ylabel('lr', fontsize='large') 60 | plt.show() 61 | -------------------------------------------------------------------------------- /bin/kospeech/models/conformer/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021, Soohwan Kim. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
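test_transformer_lr_scheduler.py above only plots the schedule; the curve it should produce follows from TransformerLRScheduler (defined later in bin/kospeech/optim/lr_scheduler/transformer_lr_scheduler.py): a linear warmup to peak_lr over warmup_steps, then an exponential decay towards final_lr. A small sketch of that closed form, using the same constants as the test:

import math

PEAK_LR, FINAL_LR, FINAL_LR_SCALE = 1e-04, 1e-07, 0.001
WARMUP_STEPS, DECAY_STEPS = 1000, 10000

warmup_rate = PEAK_LR / WARMUP_STEPS
decay_factor = -math.log(FINAL_LR_SCALE) / DECAY_STEPS

def expected_lr(step: int) -> float:
    if step < WARMUP_STEPS:                    # stage 0: linear warmup
        return step * warmup_rate
    if step < WARMUP_STEPS + DECAY_STEPS:      # stage 1: exponential decay
        return PEAK_LR * math.exp(-decay_factor * (step - WARMUP_STEPS))
    return FINAL_LR                            # stage 2: hold final_lr

print(expected_lr(WARMUP_STEPS))                # ~= PEAK_LR
print(expected_lr(WARMUP_STEPS + DECAY_STEPS))  # ~= PEAK_LR * FINAL_LR_SCALE = 1e-07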
14 | 15 | from dataclasses import dataclass 16 | 17 | from kospeech.models import ModelConfig 18 | from kospeech.models.conformer.model import Conformer 19 | 20 | 21 | @dataclass 22 | class ConformerConfig(ModelConfig): 23 | architecture: str = "conformer" 24 | feed_forward_expansion_factor: int = 4 25 | conv_expansion_factor: int = 2 26 | input_dropout_p: float = 0.1 27 | feed_forward_dropout_p: float = 0.1 28 | attention_dropout_p: float = 0.1 29 | conv_dropout_p: float = 0.1 30 | decoder_dropout_p: float = 0.1 31 | conv_kernel_size: int = 31 32 | half_step_residual: bool = True 33 | num_decoder_layers: int = 1 34 | decoder_rnn_type: str = "lstm" 35 | decoder: str = "None" 36 | 37 | 38 | @dataclass 39 | class ConformerLargeConfig(ConformerConfig): 40 | encoder_dim: int = 512 41 | decoder_dim: int = 640 42 | num_encoder_layers: int = 17 43 | num_attention_heads: int = 8 44 | 45 | 46 | @dataclass 47 | class ConformerMediumConfig(ConformerConfig): 48 | encoder_dim: int = 256 49 | decoder_dim: int = 640 50 | num_encoder_layers: int = 16 51 | num_attention_heads: int = 4 52 | 53 | 54 | @dataclass 55 | class ConformerSmallConfig(ConformerConfig): 56 | encoder_dim: int = 144 57 | decoder_dim: int = 320 58 | num_encoder_layers: int = 16 59 | num_attention_heads: int = 4 60 | -------------------------------------------------------------------------------- /bin/kospeech/models/transformer/mask.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, Soohwan Kim. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
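The ConformerConfig variants above mirror the small/medium/large models; feeding those numbers into the Conformer class follows the same pattern as test_conformer.py earlier in this dump. A hedged sketch using the ConformerSmallConfig values (assuming the constructor accepts these keyword names, as that test suggests; num_classes is a placeholder):

import torch
from kospeech.models import Conformer

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# ConformerSmallConfig: encoder_dim=144, num_encoder_layers=16, decoder_dim=320
model = Conformer(
    num_classes=2000,     # vocabulary size (placeholder)
    input_dim=80,         # n_mels of the fbank features
    encoder_dim=144,
    num_encoder_layers=16,
    decoder_dim=320,
    device=device,
).to(device)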
14 | 15 | import torch 16 | from torch import Tensor 17 | from typing import Any, Optional 18 | 19 | 20 | def get_attn_pad_mask(inputs, input_lengths, expand_length): 21 | """ mask position is set to 1 """ 22 | 23 | def get_transformer_non_pad_mask(inputs: Tensor, input_lengths: Tensor) -> Tensor: 24 | """ Padding position is set to 0, either use input_lengths or pad_id """ 25 | batch_size = inputs.size(0) 26 | 27 | if len(inputs.size()) == 2: 28 | non_pad_mask = inputs.new_ones(inputs.size()) # B x T 29 | elif len(inputs.size()) == 3: 30 | non_pad_mask = inputs.new_ones(inputs.size()[:-1]) # B x T 31 | else: 32 | raise ValueError(f"Unsupported input shape {inputs.size()}") 33 | 34 | for i in range(batch_size): 35 | non_pad_mask[i, input_lengths[i]:] = 0 36 | 37 | return non_pad_mask 38 | 39 | non_pad_mask = get_transformer_non_pad_mask(inputs, input_lengths) 40 | pad_mask = non_pad_mask.lt(1) 41 | attn_pad_mask = pad_mask.unsqueeze(1).expand(-1, expand_length, -1) 42 | return attn_pad_mask 43 | 44 | 45 | def get_attn_subsequent_mask(seq): 46 | assert seq.dim() == 2 47 | attn_shape = [seq.size(0), seq.size(1), seq.size(1)] 48 | subsequent_mask = torch.triu(torch.ones(attn_shape), diagonal=1) 49 | 50 | if seq.is_cuda: 51 | subsequent_mask = subsequent_mask.cuda() 52 | 53 | return subsequent_mask 54 | -------------------------------------------------------------------------------- /bin/kospeech/data/label_loader.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, Soohwan Kim. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
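The two helpers in mask.py above build the padding mask (True wherever attention must be blocked past each sequence's true length) and the subsequent (causal) mask used by the transformer decoder. A small demonstration on toy tensors, shown only to make the shapes concrete:

import torch
from kospeech.models.transformer.mask import get_attn_pad_mask, get_attn_subsequent_mask

encoder_outputs = torch.rand(2, 5, 8)        # B x T x D
input_lengths = torch.LongTensor([5, 3])

pad_mask = get_attn_pad_mask(encoder_outputs, input_lengths, expand_length=5)
print(pad_mask.shape)    # torch.Size([2, 5, 5]); batch item 1 masks positions 3 and 4

targets = torch.LongTensor([[1, 4, 5, 2], [1, 4, 2, 0]])
causal_mask = get_attn_subsequent_mask(targets)
print(causal_mask.shape)  # torch.Size([2, 4, 4]); the upper triangle is 1 (masked)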
14 | 15 | from typing import Tuple 16 | import re 17 | 18 | 19 | def rule(x): 20 | # parenthesized annotations 21 | a = re.compile(r'\([^)]*\)') 22 | # punctuation 23 | b = re.compile('[^가-힣0-9 ]') 24 | x = re.sub(pattern=a, repl='', string=x) 25 | x = re.sub(pattern=b, repl='', string=x) 26 | return x 27 | 28 | def load_dataset(transcripts_path: str) -> Tuple[list, list]: 29 | """ 30 | Provides lists of audio paths and their label transcripts 31 | 32 | Args: 33 | transcripts_path (str): path of transcripts 34 | 35 | Returns: audio_paths, transcripts 36 | - **audio_paths, transcripts** (list, list): audio file paths and the matching label sequences 37 | """ 38 | audio_paths = list() 39 | transcripts = list() 40 | 41 | with open(transcripts_path, encoding='utf-8') as f: 42 | for idx, line in enumerate(f.readlines()): 43 | audio_path, korean_transcript, transcript = line.split('\t') 44 | transcript = transcript.replace('\n', '') 45 | 46 | audio_paths.append(audio_path) 47 | transcripts.append(transcript) 48 | print("Success") 49 | return audio_paths, transcripts 50 | 51 | # with open(transcripts_path) as f: 52 | # for idx, line in enumerate(f.readlines()): 53 | # audio_path, korean_transcript, transcript = line.split('\t') 54 | # transcript = transcript.replace('\n', '') 55 | 56 | # audio_paths.append(audio_path) 57 | # transcripts.append(transcript) 58 | 59 | # return audio_paths, transcripts 60 |
-------------------------------------------------------------------------------- /test/test_transformer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, Soohwan Kim. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License.
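The rule() helper in label_loader.py above strips parenthesized annotations and then every character that is not a Korean syllable, a digit, or a space. A tiny illustration (the sample sentence is made up, not taken from the dataset):

from kospeech.data.label_loader import rule

print(rule('지금 몇 시야? (웃음)'))   # the '(웃음)' annotation and the '?' are stripped
print(rule('(노이즈) 3시 30분이요'))  # digits and spaces are kept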
14 | 15 | import torch 16 | import torch.nn as nn 17 | 18 | from kospeech.model_builder import build_transformer 19 | 20 | batch_size = 4 21 | seq_length = 200 22 | target_length = 10 23 | input_size = 80 24 | 25 | cuda = torch.cuda.is_available() 26 | device = torch.device('cuda' if cuda else 'cpu') 27 | 28 | transformer = build_transformer( 29 | num_classes=10, 30 | d_model=16, 31 | d_ff=32, 32 | num_heads=2, 33 | input_dim=input_size, 34 | num_encoder_layers=3, 35 | num_decoder_layers=2, 36 | extractor='vgg', 37 | dropout_p=0.1, 38 | device=device, 39 | pad_id=0, 40 | sos_id=1, 41 | eos_id=2, 42 | joint_ctc_attention=False, 43 | max_length=10, 44 | ) 45 | 46 | criterion = nn.CrossEntropyLoss(ignore_index=0, reduction='mean') 47 | optimizer = torch.optim.Adam(transformer.parameters(), lr=1e-04) 48 | 49 | for i in range(10): 50 | inputs = torch.FloatTensor(batch_size, seq_length, input_size).to(device) 51 | input_lengths = torch.LongTensor([seq_length, seq_length - 10, seq_length - 20, seq_length - 30]) 52 | targets = torch.LongTensor([[1, 3, 3, 3, 3, 3, 4, 5, 6, 2], 53 | [1, 3, 3, 3, 3, 3, 4, 5, 2, 0], 54 | [1, 3, 3, 3, 3, 3, 4, 2, 0, 0], 55 | [1, 3, 3, 3, 3, 3, 4, 2, 0, 0]]).to(device) 56 | 57 | outputs, _, _ = transformer(inputs, input_lengths, targets) 58 | loss = criterion(outputs.contiguous().view(-1, outputs.size(-1)), targets[:, 1:].contiguous().view(-1)) 59 | loss.backward() 60 | optimizer.step() 61 | print(loss) 62 |
-------------------------------------------------------------------------------- /etc/cmd 실행 코드.txt: -------------------------------------------------------------------------------- 1 | All of the code must be run from inside the 'KoreanSTT_kospeech' folder. 2 | Also, in my case, using relative paths for the audio path occasionally caused problems, 3 | so I placed the train wav folder, the test wav folder, and the metadata folder at the same level as 'KoreanSTT_kospeech' and passed them as absolute paths. 4 | 5 | The usage formats and examples are as follows.
6 | 7 | [train] 8 | Format) python ./bin/main.py model=ds2 train=ds2_train train.dataset_path=$dataset_path 9 | Example) python ./bin/main.py model=ds2 train=ds2_train train.dataset_path="D:\code\train wav" 10 | 11 | [preprocessing] 12 | Format) python main.py --dataset_path $dataset_path --vocab_dest $vocab_dict_destination --output_unit 'character' --preprocess_mode 'phonetic' 13 | Example) python main.py --dataset_path "D:\code\train wav" --vocab_dest "D:\code" --output_unit 'character' --preprocess_mode 'phonetic' 14 | python main.py --dataset_path "D:\code\train wav" --vocab_dest "D:\kospeech-latest" --output_unit "character" --preprocess_mode 'phonetic' 15 | 16 | [inference] -> device (default='cpu') is optional 17 | Format) python ./bin/inference.py --model_path $model_path --audio_path $audio_path --device "cpu" 18 | Example) python ./bin/inference.py --model_path "G:/내 드라이브/code/kospeech-latest/outputs/2021-12-10/09-00-20/model.pt" --audio_path "../test wav/CN11RC002_CN0031_20210726.wav" --device "cpu" 19 | 20 | [inference_wer] -> dst_path (default='./outputs') and device (default='cpu') are optional 21 | Format) python ./bin/inference_wer.py --model_path $model_path --audio_path $audio_path --transcript_path $transcript_path --dst_path $result_destination --device "cpu" 22 | Example) python ./bin/inference_wer.py --model_path "G:/내 드라이브/code/kospeech-latest/outputs/2021-12-10/09-00-20/model.pt" --audio_path "D:/code/sample1/audio" --transcript_path "D:/code/sample1/transcripts.txt" --dst_path "D:/code/sample1" --device "cpu" 23 | 24 | [prediction] -> if submission is set to False, no excel file is created and the results are saved to a .txt file instead; device (default='cpu') is optional 25 | Format) python ./bin/prediction.py --model_path $model_path --audio_path $audio_path --submission 'True' --device "cpu" 26 | Example) python ./bin/prediction.py --model_path "G:\내 드라이브\code\kospeech-latest\outputs\2021-12-10\19-51-14/model.pt" --audio_path "D:/code/test wav" --submission 'True' --device "cpu" 27 | 28 | python ./bin/prediction.py --model_path "./outputs/model.pt" --audio_path "D:/code/test wav" --submission 'True' --device "cpu" 29 |
-------------------------------------------------------------------------------- /dataset/kspon/preprocess/subword.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, Soohwan Kim. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License.
14 | 15 | import os 16 | import sentencepiece as spm 17 | 18 | 19 | def train_sentencepiece(transcripts, datapath: str = './data', vocab_size: int = 5000, blank_token: str = '<blank>'):  # blank_token default is an assumption; pass the symbol your vocab actually reserves 20 | print('generate_sentencepiece_input..') 21 | 22 | if not os.path.exists(datapath): 23 | os.mkdir(datapath) 24 | 25 | with open(f'{datapath}/sentencepiece_input.txt', 'w', encoding="utf-8") as f:  # written inside datapath so the --input path below resolves 26 | for transcript in transcripts: 27 | f.write(f'{transcript}\n') 28 | 29 | spm.SentencePieceTrainer.Train( 30 | f"--input={datapath}/sentencepiece_input.txt " 31 | f"--model_prefix=kspon_sentencepiece " 32 | f"--vocab_size={vocab_size} " 33 | f"--model_type=bpe " 34 | f"--pad_id=0 " 35 | f"--bos_id=1 " 36 | f"--eos_id=2 " 37 | f"--unk_id=3 " 38 | f"--user_defined_symbols={blank_token}" 39 | ) 40 | 41 | 42 | def sentence_to_subwords(audio_paths: list, transcripts: list, datapath: str = './data'): 43 | subwords = list() 44 | 45 | print('sentence_to_subwords...') 46 | 47 | sp = spm.SentencePieceProcessor() 48 | vocab_file = "kspon_sentencepiece.model" 49 | sp.load(vocab_file) 50 | 51 | with open(f'{datapath}/transcripts.txt', 'w', encoding='utf-8') as f: 52 | for audio_path, transcript in zip(audio_paths, transcripts): 53 | audio_path = audio_path.replace('txt', 'pcm') 54 | subword_transcript = " ".join(sp.EncodeAsPieces(transcript)) 55 | subword_id_transcript = " ".join([str(item) for item in sp.EncodeAsIds(transcript)]) 56 | f.write(f'{audio_path}\t{subword_transcript}\t{subword_id_transcript}\n') 57 | 58 | return subwords 59 |
-------------------------------------------------------------------------------- /test/test_transformer_encoder.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, Soohwan Kim. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License.
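subword.py above first trains a BPE SentencePiece model (kspon_sentencepiece.model) and then re-encodes every transcript into subword pieces and ids for transcripts.txt. A hedged sketch of how the trained model behaves once train_sentencepiece() has run (the sample sentence is illustrative, and the exact split depends on the training data):

import sentencepiece as spm

sp = spm.SentencePieceProcessor()
sp.Load('kspon_sentencepiece.model')   # produced by train_sentencepiece()

sentence = '안녕하세요'                  # illustrative input
pieces = sp.EncodeAsPieces(sentence)   # subword strings written to the second column of transcripts.txt
ids = sp.EncodeAsIds(sentence)         # matching integer ids written to the third column
print(pieces, ids)
print(sp.DecodeIds(ids))               # round-trips back to the original sentence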
14 | 15 | import torch 16 | import torch.nn as nn 17 | 18 | from kospeech.model_builder import build_transformer 19 | 20 | batch_size = 4 21 | seq_length = 200 22 | target_length = 10 23 | input_size = 80 24 | 25 | cuda = torch.cuda.is_available() 26 | device = torch.device('cuda' if cuda else 'cpu') 27 | 28 | transformer = build_transformer( 29 | num_classes=10, 30 | d_model=16, 31 | d_ff=32, 32 | num_heads=2, 33 | input_dim=input_size, 34 | num_encoder_layers=3, 35 | num_decoder_layers=2, 36 | extractor='vgg', 37 | dropout_p=0.1, 38 | device=device, 39 | pad_id=0, 40 | sos_id=1, 41 | eos_id=2, 42 | joint_ctc_attention=True, 43 | max_length=10, 44 | ) 45 | transformer_encoder = transformer.module.encoder 46 | 47 | criterion = nn.CTCLoss(blank=3, zero_infinity=True) 48 | optimizer = torch.optim.Adam(transformer_encoder.parameters(), lr=1e-04) 49 | 50 | for i in range(10): 51 | inputs = torch.FloatTensor(batch_size, seq_length, input_size).to(device) 52 | input_lengths = torch.LongTensor([seq_length, seq_length - 10, seq_length - 20, seq_length - 30]) 53 | targets = torch.LongTensor([[1, 3, 3, 3, 3, 3, 4, 5, 6, 2], 54 | [1, 3, 3, 3, 3, 3, 4, 5, 2, 0], 55 | [1, 3, 3, 3, 3, 3, 4, 2, 0, 0], 56 | [1, 3, 3, 3, 3, 3, 4, 2, 0, 0]]).to(device) 57 | target_lengths = torch.LongTensor([9, 8, 7, 7]) 58 | 59 | _, output_lengths, encoder_log_probs = transformer_encoder(inputs, input_lengths) 60 | loss = criterion(encoder_log_probs.transpose(0, 1), targets[:, 1:], output_lengths, target_lengths) 61 | loss.backward() 62 | optimizer.step() 63 | print(loss) 64 | -------------------------------------------------------------------------------- /bin/kospeech/criterion/transducer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021, Soohwan Kim. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import torch 16 | import torch.nn as nn 17 | 18 | 19 | class TransducerLoss(nn.Module): 20 | """ 21 | Transducer loss module. 22 | 23 | Args: 24 | blank_id (int): blank symbol id 25 | """ 26 | 27 | def __init__(self, blank_id: int) -> None: 28 | """Construct an TransLoss object.""" 29 | super().__init__() 30 | try: 31 | from warp_rnnt import rnnt_loss 32 | except ImportError: 33 | raise ImportError("warp-rnnt is not installed. Please re-setup") 34 | self.rnnt_loss = rnnt_loss 35 | self.blank_id = blank_id 36 | 37 | def forward( 38 | self, 39 | log_probs: torch.FloatTensor, 40 | targets: torch.IntTensor, 41 | input_lengths: torch.IntTensor, 42 | target_lengths: torch.IntTensor, 43 | ) -> torch.FloatTensor: 44 | """ 45 | Compute path-aware regularization transducer loss. 
46 | 47 | Args: 48 | log_probs (torch.FloatTensor): Batch of predicted sequences (batch, maxlen_in, maxlen_out+1, odim) 49 | targets (torch.IntTensor): Batch of target sequences (batch, maxlen_out) 50 | input_lengths (torch.IntTensor): batch of lengths of predicted sequences (batch) 51 | target_lengths (torch.IntTensor): batch of lengths of target sequences (batch) 52 | 53 | Returns: 54 | loss (torch.FloatTensor): transducer loss 55 | """ 56 | 57 | return self.rnnt_loss( 58 | log_probs, 59 | targets, 60 | input_lengths, 61 | target_lengths, 62 | reduction="mean", 63 | blank=self.blank_id, 64 | gather=True, 65 | ) 66 | -------------------------------------------------------------------------------- /bin/kospeech/models/transformer/sublayers.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, Soohwan Kim. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import torch.nn as nn 16 | from torch import Tensor 17 | from kospeech.models.modules import Linear 18 | 19 | 20 | class AddNorm(nn.Module): 21 | """ 22 | Add & Normalization layer proposed in "Attention Is All You Need". 23 | Transformer employ a residual connection around each of the two sub-layers, 24 | (Multi-Head Attention & Feed-Forward) followed by layer normalization. 25 | """ 26 | def __init__(self, sublayer: nn.Module, d_model: int = 512) -> None: 27 | super(AddNorm, self).__init__() 28 | self.sublayer = sublayer 29 | self.layer_norm = nn.LayerNorm(d_model) 30 | 31 | def forward(self, *args): 32 | residual = args[0] 33 | outputs = self.sublayer(*args) 34 | 35 | if isinstance(outputs, tuple): 36 | return self.layer_norm(outputs[0] + residual), outputs[1] 37 | 38 | return self.layer_norm(outputs + residual) 39 | 40 | 41 | class PositionwiseFeedForward(nn.Module): 42 | """ 43 | Position-wise Feedforward Networks proposed in "Attention Is All You Need". 44 | Fully connected feed-forward network, which is applied to each position separately and identically. 45 | This consists of two linear transformations with a ReLU activation in between. 46 | Another way of describing this is as two convolutions with kernel size 1. 47 | """ 48 | def __init__(self, d_model: int = 512, d_ff: int = 2048, dropout_p: float = 0.3) -> None: 49 | super(PositionwiseFeedForward, self).__init__() 50 | self.feed_forward = nn.Sequential( 51 | Linear(d_model, d_ff), 52 | nn.Dropout(dropout_p), 53 | nn.ReLU(), 54 | Linear(d_ff, d_model), 55 | nn.Dropout(dropout_p), 56 | ) 57 | 58 | def forward(self, inputs: Tensor) -> Tensor: 59 | return self.feed_forward(inputs) 60 | -------------------------------------------------------------------------------- /bin/kospeech/optim/lr_scheduler/transformer_lr_scheduler.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, Soohwan Kim. All rights reserved. 
2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import math 16 | from kospeech.optim.lr_scheduler.lr_scheduler import LearningRateScheduler 17 | 18 | 19 | class TransformerLRScheduler(LearningRateScheduler): 20 | """ Transformer Learning Rate Scheduler proposed in "Attention Is All You Need" """ 21 | def __init__(self, optimizer, peak_lr, final_lr, final_lr_scale, warmup_steps, decay_steps): 22 | assert isinstance(warmup_steps, int), "warmup_steps should be inteager type" 23 | assert isinstance(decay_steps, int), "total_steps should be inteager type" 24 | 25 | super(TransformerLRScheduler, self).__init__(optimizer, 0.0) 26 | self.final_lr = final_lr 27 | self.peak_lr = peak_lr 28 | self.warmup_steps = warmup_steps 29 | self.decay_steps = decay_steps 30 | 31 | self.warmup_rate = self.peak_lr / self.warmup_steps 32 | self.decay_factor = -math.log(final_lr_scale) / self.decay_steps 33 | 34 | self.lr = self.init_lr 35 | self.update_step = 0 36 | 37 | def _decide_stage(self): 38 | if self.update_step < self.warmup_steps: 39 | return 0, self.update_step 40 | 41 | if self.warmup_steps <= self.update_step < self.warmup_steps + self.decay_steps: 42 | return 1, self.update_step - self.warmup_steps 43 | 44 | return 2, None 45 | 46 | def step(self): 47 | self.update_step += 1 48 | stage, steps_in_stage = self._decide_stage() 49 | 50 | if stage == 0: 51 | self.lr = self.update_step * self.warmup_rate 52 | elif stage == 1: 53 | self.lr = self.peak_lr * math.exp(-self.decay_factor * steps_in_stage) 54 | elif stage == 2: 55 | self.lr = self.final_lr 56 | else: 57 | raise ValueError("Undefined stage") 58 | 59 | self.set_lr(self.optimizer, self.lr) 60 | 61 | return self.lr 62 | -------------------------------------------------------------------------------- /docs/_static/js/html5shiv.min.js: -------------------------------------------------------------------------------- 1 | /** 2 | * @preserve HTML5 Shiv 3.7.3 | @afarkas @jdalton @jon_neal @rem | MIT/GPL2 Licensed 3 | */ 4 | !function(a,b){function c(a,b){var c=a.createElement("p"),d=a.getElementsByTagName("head")[0]||a.documentElement;return c.innerHTML="x",d.insertBefore(c.lastChild,d.firstChild)}function d(){var a=t.elements;return"string"==typeof a?a.split(" "):a}function e(a,b){var c=t.elements;"string"!=typeof c&&(c=c.join(" ")),"string"!=typeof a&&(a=a.join(" ")),t.elements=c+" "+a,j(b)}function f(a){var b=s[a[q]];return b||(b={},r++,a[q]=r,s[r]=b),b}function g(a,c,d){if(c||(c=b),l)return c.createElement(a);d||(d=f(c));var e;return e=d.cache[a]?d.cache[a].cloneNode():p.test(a)?(d.cache[a]=d.createElem(a)).cloneNode():d.createElem(a),!e.canHaveChildren||o.test(a)||e.tagUrn?e:d.frag.appendChild(e)}function h(a,c){if(a||(a=b),l)return a.createDocumentFragment();c=c||f(a);for(var e=c.frag.cloneNode(),g=0,h=d(),i=h.length;i>g;g++)e.createElement(h[g]);return e}function 
i(a,b){b.cache||(b.cache={},b.createElem=a.createElement,b.createFrag=a.createDocumentFragment,b.frag=b.createFrag()),a.createElement=function(c){return t.shivMethods?g(c,a,b):b.createElem(c)},a.createDocumentFragment=Function("h,f","return function(){var n=f.cloneNode(),c=n.createElement;h.shivMethods&&("+d().join().replace(/[\w\-:]+/g,function(a){return b.createElem(a),b.frag.createElement(a),'c("'+a+'")'})+");return n}")(t,b.frag)}function j(a){a||(a=b);var d=f(a);return!t.shivCSS||k||d.hasCSS||(d.hasCSS=!!c(a,"article,aside,dialog,figcaption,figure,footer,header,hgroup,main,nav,section{display:block}mark{background:#FF0;color:#000}template{display:none}")),l||i(a,d),a}var k,l,m="3.7.3-pre",n=a.html5||{},o=/^<|^(?:button|map|select|textarea|object|iframe|option|optgroup)$/i,p=/^(?:a|b|code|div|fieldset|h1|h2|h3|h4|h5|h6|i|label|li|ol|p|q|span|strong|style|table|tbody|td|th|tr|ul)$/i,q="_html5shiv",r=0,s={};!function(){try{var a=b.createElement("a");a.innerHTML="",k="hidden"in a,l=1==a.childNodes.length||function(){b.createElement("a");var a=b.createDocumentFragment();return"undefined"==typeof a.cloneNode||"undefined"==typeof a.createDocumentFragment||"undefined"==typeof a.createElement}()}catch(c){k=!0,l=!0}}();var t={elements:n.elements||"abbr article aside audio bdi canvas data datalist details dialog figcaption figure footer header hgroup main mark meter nav output picture progress section summary template time video",version:m,shivCSS:n.shivCSS!==!1,supportsUnknownElements:l,shivMethods:n.shivMethods!==!1,type:"default",shivDocument:j,createElement:g,createDocumentFragment:h,addElements:e};a.html5=t,j(b),"object"==typeof module&&module.exports&&(module.exports=t)}("undefined"!=typeof window?window:this,document); -------------------------------------------------------------------------------- /docs/build/html/_static/js/html5shiv.min.js: -------------------------------------------------------------------------------- 1 | /** 2 | * @preserve HTML5 Shiv 3.7.3 | @afarkas @jdalton @jon_neal @rem | MIT/GPL2 Licensed 3 | */ 4 | !function(a,b){function c(a,b){var c=a.createElement("p"),d=a.getElementsByTagName("head")[0]||a.documentElement;return c.innerHTML="x",d.insertBefore(c.lastChild,d.firstChild)}function d(){var a=t.elements;return"string"==typeof a?a.split(" "):a}function e(a,b){var c=t.elements;"string"!=typeof c&&(c=c.join(" ")),"string"!=typeof a&&(a=a.join(" ")),t.elements=c+" "+a,j(b)}function f(a){var b=s[a[q]];return b||(b={},r++,a[q]=r,s[r]=b),b}function g(a,c,d){if(c||(c=b),l)return c.createElement(a);d||(d=f(c));var e;return e=d.cache[a]?d.cache[a].cloneNode():p.test(a)?(d.cache[a]=d.createElem(a)).cloneNode():d.createElem(a),!e.canHaveChildren||o.test(a)||e.tagUrn?e:d.frag.appendChild(e)}function h(a,c){if(a||(a=b),l)return a.createDocumentFragment();c=c||f(a);for(var e=c.frag.cloneNode(),g=0,h=d(),i=h.length;i>g;g++)e.createElement(h[g]);return e}function i(a,b){b.cache||(b.cache={},b.createElem=a.createElement,b.createFrag=a.createDocumentFragment,b.frag=b.createFrag()),a.createElement=function(c){return t.shivMethods?g(c,a,b):b.createElem(c)},a.createDocumentFragment=Function("h,f","return function(){var n=f.cloneNode(),c=n.createElement;h.shivMethods&&("+d().join().replace(/[\w\-:]+/g,function(a){return b.createElem(a),b.frag.createElement(a),'c("'+a+'")'})+");return n}")(t,b.frag)}function j(a){a||(a=b);var 
d=f(a);return!t.shivCSS||k||d.hasCSS||(d.hasCSS=!!c(a,"article,aside,dialog,figcaption,figure,footer,header,hgroup,main,nav,section{display:block}mark{background:#FF0;color:#000}template{display:none}")),l||i(a,d),a}var k,l,m="3.7.3-pre",n=a.html5||{},o=/^<|^(?:button|map|select|textarea|object|iframe|option|optgroup)$/i,p=/^(?:a|b|code|div|fieldset|h1|h2|h3|h4|h5|h6|i|label|li|ol|p|q|span|strong|style|table|tbody|td|th|tr|ul)$/i,q="_html5shiv",r=0,s={};!function(){try{var a=b.createElement("a");a.innerHTML="",k="hidden"in a,l=1==a.childNodes.length||function(){b.createElement("a");var a=b.createDocumentFragment();return"undefined"==typeof a.cloneNode||"undefined"==typeof a.createDocumentFragment||"undefined"==typeof a.createElement}()}catch(c){k=!0,l=!0}}();var t={elements:n.elements||"abbr article aside audio bdi canvas data datalist details dialog figcaption figure footer header hgroup main mark meter nav output picture progress section summary template time video",version:m,shivCSS:n.shivCSS!==!1,supportsUnknownElements:l,shivMethods:n.shivMethods!==!1,type:"default",shivDocument:j,createElement:g,createDocumentFragment:h,addElements:e};a.html5=t,j(b),"object"==typeof module&&module.exports&&(module.exports=t)}("undefined"!=typeof window?window:this,document); -------------------------------------------------------------------------------- /bin/kospeech/models/transformer/embeddings.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, Soohwan Kim. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import math 16 | import torch 17 | import torch.nn as nn 18 | from torch import Tensor 19 | 20 | 21 | class PositionalEncoding(nn.Module): 22 | """ 23 | Positional Encoding proposed in "Attention Is All You Need". 24 | Since transformer contains no recurrence and no convolution, in order for the model to make 25 | use of the order of the sequence, we must add some positional information. 26 | 27 | "Attention Is All You Need" use sine and cosine functions of different frequencies: 28 | PE_(pos, 2i) = sin(pos / power(10000, 2i / d_model)) 29 | PE_(pos, 2i+1) = cos(pos / power(10000, 2i / d_model)) 30 | """ 31 | def __init__(self, d_model: int = 512, max_len: int = 5000) -> None: 32 | super(PositionalEncoding, self).__init__() 33 | pe = torch.zeros(max_len, d_model, requires_grad=False) 34 | position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1) 35 | div_term = torch.exp(torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model)) 36 | pe[:, 0::2] = torch.sin(position * div_term) 37 | pe[:, 1::2] = torch.cos(position * div_term) 38 | pe = pe.unsqueeze(0) 39 | self.register_buffer('pe', pe) 40 | 41 | def forward(self, length: int) -> Tensor: 42 | return self.pe[:, :length] 43 | 44 | 45 | class Embedding(nn.Module): 46 | """ 47 | Embedding layer. 
Similarly to other sequence transduction models, transformer use learned embeddings 48 | to convert the input tokens and output tokens to vectors of dimension d_model. 49 | In the embedding layers, transformer multiply those weights by sqrt(d_model) 50 | """ 51 | def __init__(self, num_embeddings: int, pad_id: int, d_model: int = 512) -> None: 52 | super(Embedding, self).__init__() 53 | self.sqrt_dim = math.sqrt(d_model) 54 | self.embedding = nn.Embedding(num_embeddings, d_model, padding_idx=pad_id) 55 | 56 | def forward(self, inputs: Tensor) -> Tensor: 57 | return self.embedding(inputs) * self.sqrt_dim 58 | -------------------------------------------------------------------------------- /bin/kospeech/models/modules.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, Soohwan Kim. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import torch 16 | import torch.nn as nn 17 | import torch.nn.init as init 18 | from torch import Tensor 19 | 20 | 21 | class ResidualConnectionModule(nn.Module): 22 | """ 23 | Residual Connection Module. 24 | outputs = (module(inputs) x module_factor + inputs x input_factor) 25 | """ 26 | def __init__( 27 | self, 28 | module: nn.Module, 29 | module_factor: float = 1.0, 30 | input_factor: float = 1.0, 31 | ) -> None: 32 | super(ResidualConnectionModule, self).__init__() 33 | self.module = module 34 | self.module_factor = module_factor 35 | self.input_factor = input_factor 36 | 37 | def forward(self, inputs: Tensor) -> Tensor: 38 | return (self.module(inputs) * self.module_factor) + (inputs * self.input_factor) 39 | 40 | 41 | class Linear(nn.Module): 42 | """ 43 | Wrapper class of torch.nn.Linear 44 | Weight initialize by xavier initialization and bias initialize to zeros. 45 | """ 46 | def __init__(self, in_features: int, out_features: int, bias: bool = True) -> None: 47 | super(Linear, self).__init__() 48 | self.linear = nn.Linear(in_features, out_features, bias=bias) 49 | init.xavier_uniform_(self.linear.weight) 50 | if bias: 51 | init.zeros_(self.linear.bias) 52 | 53 | def forward(self, x: Tensor) -> Tensor: 54 | return self.linear(x) 55 | 56 | 57 | class View(nn.Module): 58 | """ Wrapper class of torch.view() for Sequential module. """ 59 | def __init__(self, shape: tuple, contiguous: bool = False): 60 | super(View, self).__init__() 61 | self.shape = shape 62 | self.contiguous = contiguous 63 | 64 | def forward(self, inputs): 65 | if self.contiguous: 66 | inputs = inputs.contiguous() 67 | return inputs.view(*self.shape) 68 | 69 | 70 | class Transpose(nn.Module): 71 | """ Wrapper class of torch.transpose() for Sequential module. 
""" 72 | def __init__(self, shape: tuple): 73 | super(Transpose, self).__init__() 74 | self.shape = shape 75 | 76 | def forward(self, inputs: Tensor): 77 | return inputs.transpose(*self.shape) 78 | -------------------------------------------------------------------------------- /bin/kospeech/evaluator/evaluator.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, Soohwan Kim. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import queue 16 | import torch 17 | from kospeech.utils import logger 18 | from kospeech.data import AudioDataLoader 19 | from kospeech.decode.search import ( 20 | GreedySearch, 21 | BeamSearch 22 | ) 23 | 24 | 25 | class Evaluator(object): 26 | """ 27 | Class to evaluate models with given datasets. 28 | 29 | Args: 30 | dataset (kospeech.data.data_loader.SpectrogramDataset): dataset for spectrogram & script matching 31 | batch_size (int): size of batch. recommended batch size is 1. 32 | device (torch.device): device - 'cuda' or 'cpu' 33 | num_workers (int): the number of cpu cores used 34 | print_every (int): to determine whether to store training progress every N timesteps (default: 10) 35 | """ 36 | 37 | def __init__(self, dataset, vocab, batch_size=1, device=None, 38 | num_workers=1, print_every=100, decode='greedy', beam_size=None): 39 | self.dataset = dataset 40 | self.batch_size = batch_size 41 | self.device = device 42 | self.num_workers = num_workers 43 | self.print_every = print_every 44 | 45 | if decode == 'greedy': 46 | self.search = GreedySearch(vocab) 47 | 48 | elif decode == 'beam': 49 | assert beam_size > 1, "beam_size should be greater than 1. You can choose `greedy` search" 50 | self.search = BeamSearch(vocab, beam_size, batch_size) 51 | 52 | else: 53 | raise ValueError("Unsupported decode : {0}".format(decode)) 54 | 55 | def evaluate(self, model): 56 | """ Evaluate a model on given dataset and return performance. 
""" 57 | logger.info('evaluate() start') 58 | 59 | eval_queue = queue.Queue(self.num_workers << 1) 60 | eval_loader = AudioDataLoader(self.dataset, eval_queue, self.batch_size, thread_id=0, pad_id=0) 61 | eval_loader.start() 62 | 63 | cer = self.search.search(model, eval_queue, self.device, self.print_every) 64 | self.search.save_result('data/train_result/%s.csv' % type(self.decoder).__name__) 65 | 66 | logger.info('Evaluate CER: %s' % cer) 67 | logger.info('evaluate() completed') 68 | eval_loader.join() 69 | -------------------------------------------------------------------------------- /etc/traintext 생성.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 7, 6 | "id": "f9cd89df", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import json\n", 11 | "import os\n", 12 | "import re\n" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 8, 18 | "id": "0246a9a8", 19 | "metadata": {}, 20 | "outputs": [], 21 | "source": [ 22 | "def rule(x):\n", 23 | " # 괄호\n", 24 | " a = re.compile(r'\\([^)]*\\)')\n", 25 | " # 문장 부호\n", 26 | " b = re.compile('[^가-힣0-9 ]')\n", 27 | " x = re.sub(pattern=a, repl='', string= x)\n", 28 | " x = re.sub(pattern=b, repl='', string= x)\n", 29 | " return x" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 9, 35 | "id": "11b2a5db", 36 | "metadata": {}, 37 | "outputs": [], 38 | "source": [ 39 | "cnt = 0\n", 40 | "for audio_path, y in zip(os.listdir(\"D:\\코드\\[train] 음성데이터_wav\"), os.listdir(\"D:\\코드\\[train] 메타데이터_json\")):\n", 41 | " cnt +=1\n", 42 | " # if cnt == 100:\n", 43 | " # break\n", 44 | " try: \n", 45 | " with open(\"D:\\코드\\[train] 메타데이터_json\" + \"/\" + y, 'r', encoding='UTF-8') as f:\n", 46 | " json_data = json.load(f)\n", 47 | " reading = json.dumps(json_data[\"transcription\"][\"ReadingLabelText\"], ensure_ascii = False)\n", 48 | " reading = rule(reading)\n", 49 | " with open(\"D:\\코드/train.txt\", \"a\") as f:\n", 50 | " f.write('{0}\\n'.format(audio_path))\n", 51 | " # with open(\"D:\\코드/train.txt\", \"a\") as f:\n", 52 | " # f.write('{0}\\t{1}\\n'.format(audio_path, reading))\n", 53 | "\n", 54 | " except:\n", 55 | " with open(\"D:\\코드/train.txt\", \"a\") as f:\n", 56 | " f.write(\"------error error--------\\n\")\n", 57 | " print(cnt == 300000)" 58 | ] 59 | } 60 | ], 61 | "metadata": { 62 | "kernelspec": { 63 | "display_name": "Python 3 (ipykernel)", 64 | "language": "python", 65 | "name": "python3" 66 | }, 67 | "language_info": { 68 | "codemirror_mode": { 69 | "name": "ipython", 70 | "version": 3 71 | }, 72 | "file_extension": ".py", 73 | "mimetype": "text/x-python", 74 | "name": "python", 75 | "nbconvert_exporter": "python", 76 | "pygments_lexer": "ipython3", 77 | "version": "3.8.12" 78 | }, 79 | "toc": { 80 | "base_numbering": 1, 81 | "nav_menu": {}, 82 | "number_sections": true, 83 | "sideBar": true, 84 | "skip_h1_title": false, 85 | "title_cell": "Table of Contents", 86 | "title_sidebar": "Contents", 87 | "toc_cell": false, 88 | "toc_position": {}, 89 | "toc_section_display": true, 90 | "toc_window_display": false 91 | } 92 | }, 93 | "nbformat": 4, 94 | "nbformat_minor": 5 95 | } 96 | -------------------------------------------------------------------------------- /bin/kospeech/optim/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, Soohwan Kim. All rights reserved. 
2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import torch 16 | from kospeech.optim.adamp import AdamP 17 | from kospeech.optim.radam import RAdam 18 | from kospeech.optim.novograd import Novograd 19 | 20 | 21 | class Optimizer(object): 22 | """ 23 | This is wrapper classs of torch.optim.Optimizer. 24 | This class provides functionalities for learning rate scheduling and gradient norm clipping. 25 | 26 | Args: 27 | optim (torch.optim.Optimizer): optimizer object, the parameters to be optimized 28 | should be given when instantiating the object, e.g. torch.optim.Adam, torch.optim.SGD 29 | scheduler (kospeech.optim.lr_scheduler, optional): learning rate scheduler 30 | scheduler_period (int, optional): timestep with learning rate scheduler 31 | max_grad_norm (int, optional): value used for gradient norm clipping 32 | """ 33 | def __init__(self, optim, scheduler=None, scheduler_period=None, max_grad_norm=0): 34 | self.optimizer = optim 35 | self.scheduler = scheduler 36 | self.scheduler_period = scheduler_period 37 | self.max_grad_norm = max_grad_norm 38 | self.count = 0 39 | 40 | def step(self, model): 41 | if self.max_grad_norm > 0: 42 | torch.nn.utils.clip_grad_norm_(model.parameters(), self.max_grad_norm) 43 | self.optimizer.step() 44 | 45 | if self.scheduler is not None: 46 | self.update() 47 | self.count += 1 48 | 49 | if self.scheduler_period == self.count: 50 | self.scheduler = None 51 | self.scheduler_period = 0 52 | self.count = 0 53 | 54 | def set_scheduler(self, scheduler, scheduler_period): 55 | self.scheduler = scheduler 56 | self.scheduler_period = scheduler_period 57 | self.count = 0 58 | 59 | def update(self): 60 | if isinstance(self.scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau): 61 | pass 62 | else: 63 | self.scheduler.step() 64 | 65 | def zero_grad(self): 66 | self.optimizer.zero_grad() 67 | 68 | def get_lr(self): 69 | for g in self.optimizer.param_groups: 70 | return g['lr'] 71 | 72 | def set_lr(self, lr): 73 | for g in self.optimizer.param_groups: 74 | g['lr'] = lr 75 | -------------------------------------------------------------------------------- /bin/kospeech/models/jasper/configs.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021, Soohwan Kim. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | 16 | class Jasper10x5Config: 17 | def __init__(self, num_classes: int, num_blocks: int, num_sub_blocks: int) -> None: 18 | super(Jasper10x5Config, self).__init__() 19 | self.num_blocks = num_blocks 20 | self.num_sub_blocks = num_sub_blocks 21 | self.preprocess_block = { 22 | 'in_channels': 80, 23 | 'out_channels': 256, 24 | 'kernel_size': 11, 25 | 'stride': 2, 26 | 'dilation': 1, 27 | 'dropout_p': 0.2, 28 | } 29 | self.block = { 30 | 'in_channels': (256, 256, 256, 384, 384, 512, 512, 640, 640, 768), 31 | 'out_channels': (256, 256, 384, 384, 512, 512, 640, 640, 768, 768), 32 | 'kernel_size': (11, 11, 13, 13, 17, 17, 21, 21, 25, 25), 33 | 'dilation': [1] * 10, 34 | 'dropout_p': (0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.3, 0.3, 0.3, 0.3), 35 | } 36 | self.postprocess_block = { 37 | 'in_channels': (768, 896, 1024), 38 | 'out_channels': (896, 1024, num_classes), 39 | 'kernel_size': (29, 1, 1), 40 | 'dilation': (2, 1, 1), 41 | 'dropout_p': (0.4, 0.4, 0.0), 42 | } 43 | 44 | 45 | class Jasper5x3Config: 46 | def __init__(self, num_classes: int, num_blocks: int, num_sub_blocks: int) -> None: 47 | super(Jasper5x3Config, self).__init__() 48 | self.num_blocks = num_blocks 49 | self.num_sub_blocks = num_sub_blocks 50 | self.preprocess_block = { 51 | 'in_channels': 80, 52 | 'out_channels': 256, 53 | 'kernel_size': 11, 54 | 'stride': 2, 55 | 'dilation': 1, 56 | 'dropout_p': 0.2, 57 | } 58 | self.block = { 59 | 'in_channels': (256, 256, 384, 512, 640), 60 | 'out_channels': (256, 384, 512, 640, 768), 61 | 'kernel_size': (11, 13, 17, 21, 25), 62 | 'dilation': [1] * 5, 63 | 'dropout_p': (0.2, 0.2, 0.2, 0.3, 0.3), 64 | } 65 | self.postprocess_block = { 66 | 'in_channels': (768, 896, 1024), 67 | 'out_channels': (896, 1024, num_classes), 68 | 'kernel_size': (29, 1, 1), 69 | 'dilation': (2, 1, 1), 70 | 'dropout_p': (0.4, 0.4, 0.0), 71 | } 72 | -------------------------------------------------------------------------------- /bin/kospeech/optim/lr_scheduler/tri_stage_lr_scheduler.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, Soohwan Kim. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | import math 16 | from kospeech.optim.lr_scheduler.lr_scheduler import LearningRateScheduler 17 | 18 | 19 | class TriStageLRScheduler(LearningRateScheduler): 20 | """ 21 | Tri-Stage Learning Rate Scheduler 22 | Implement the learning rate scheduler in "SpecAugment" 23 | """ 24 | def __init__(self, optimizer, init_lr, peak_lr, final_lr, init_lr_scale, final_lr_scale, warmup_steps, total_steps): 25 | assert isinstance(warmup_steps, int), "warmup_steps should be inteager type" 26 | assert isinstance(total_steps, int), "total_steps should be inteager type" 27 | 28 | super(TriStageLRScheduler, self).__init__(optimizer, init_lr) 29 | self.init_lr *= init_lr_scale 30 | self.final_lr = final_lr 31 | self.peak_lr = peak_lr 32 | self.warmup_steps = warmup_steps 33 | self.hold_steps = int(total_steps >> 1) - warmup_steps 34 | self.decay_steps = int(total_steps >> 1) 35 | 36 | self.warmup_rate = (self.peak_lr - self.init_lr) / self.warmup_steps if self.warmup_steps != 0 else 0 37 | self.decay_factor = -math.log(final_lr_scale) / self.decay_steps 38 | 39 | self.lr = self.init_lr 40 | self.update_step = 0 41 | 42 | def _decide_stage(self): 43 | if self.update_step < self.warmup_steps: 44 | return 0, self.update_step 45 | 46 | offset = self.warmup_steps 47 | 48 | if self.update_step < offset + self.hold_steps: 49 | return 1, self.update_step - offset 50 | 51 | offset += self.hold_steps 52 | 53 | if self.update_step <= offset + self.decay_steps: 54 | # decay stage 55 | return 2, self.update_step - offset 56 | 57 | offset += self.decay_steps 58 | 59 | return 3, self.update_step - offset 60 | 61 | def step(self): 62 | stage, steps_in_stage = self._decide_stage() 63 | 64 | if stage == 0: 65 | self.lr = self.init_lr + self.warmup_rate * steps_in_stage 66 | elif stage == 1: 67 | self.lr = self.peak_lr 68 | elif stage == 2: 69 | self.lr = self.peak_lr * math.exp(-self.decay_factor * steps_in_stage) 70 | elif stage == 3: 71 | self.lr = self.final_lr 72 | else: 73 | raise ValueError("Undefined stage") 74 | 75 | self.set_lr(self.optimizer, self.lr) 76 | self.update_step += 1 77 | 78 | return self.lr 79 | -------------------------------------------------------------------------------- /bin/inference.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, Soohwan Kim. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | import argparse 16 | from typing_extensions import Required 17 | import torch 18 | import torch.nn as nn 19 | import numpy as np 20 | import torchaudio 21 | from torch import Tensor 22 | import os 23 | from tools import revise 24 | 25 | from kospeech.vocabs.ksponspeech import KsponSpeechVocabulary 26 | from kospeech.data.audio.core import load_audio 27 | from kospeech.models import ( 28 | SpeechTransformer, 29 | Jasper, 30 | DeepSpeech2, 31 | ListenAttendSpell, 32 | Conformer, 33 | ) 34 | 35 | 36 | def parse_audio(audio_path: str, del_silence: bool = False, audio_extension: str = 'wav') -> Tensor: 37 | signal = load_audio(audio_path, del_silence, extension=audio_extension) 38 | feature = torchaudio.compliance.kaldi.fbank( 39 | waveform=Tensor(signal).unsqueeze(0), 40 | num_mel_bins=80, 41 | frame_length=20, 42 | frame_shift=10, 43 | window_type='hamming' 44 | ).transpose(0, 1).numpy() 45 | 46 | feature -= feature.mean() 47 | feature /= np.std(feature) 48 | 49 | return torch.FloatTensor(feature).transpose(0, 1) 50 | 51 | 52 | # require (x)- > required (o) 53 | parser = argparse.ArgumentParser(description='KoSpeech') 54 | parser.add_argument('--model_path', type=str, required=True) 55 | parser.add_argument('--audio_path', type=str, required=True) 56 | parser.add_argument('--device', type=str, required=False, default='cpu') 57 | opt = parser.parse_args() 58 | 59 | # 음성 하나에 대해 inference 하는 경우 60 | feature = parse_audio(opt.audio_path, del_silence=True) 61 | input_length = torch.LongTensor([len(feature)]) 62 | vocab = KsponSpeechVocabulary('data/vocab/cssiri_character_vocabs.csv') 63 | 64 | model = torch.load(opt.model_path, map_location=lambda storage, loc: storage).to(opt.device) 65 | if isinstance(model, nn.DataParallel): 66 | model = model.module 67 | model.eval() 68 | 69 | if isinstance(model, ListenAttendSpell): 70 | model.encoder.device = opt.device 71 | model.decoder.device = opt.device 72 | 73 | y_hats = model.recognize(feature.unsqueeze(0), input_length) 74 | elif isinstance(model, DeepSpeech2): 75 | model.device = opt.device 76 | y_hats = model.recognize(feature.unsqueeze(0), input_length) 77 | elif isinstance(model, SpeechTransformer) or isinstance(model, Jasper) or isinstance(model, Conformer): 78 | y_hats = model.greedy_search(feature.unsqueeze(0), input_length) 79 | 80 | sentence = vocab.label_to_string(y_hats.cpu().detach().numpy()) 81 | print(revise(sentence)) 82 | -------------------------------------------------------------------------------- /bin/eval.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, Soohwan Kim. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | import os 16 | import hydra 17 | import warnings 18 | from hydra.core.config_store import ConfigStore 19 | from omegaconf import OmegaConf, DictConfig 20 | from kospeech.evaluator import EvalConfig 21 | from kospeech.data.audio import FilterBankConfig 22 | from kospeech.vocabs.ksponspeech import KsponSpeechVocabulary 23 | from kospeech.vocabs.librispeech import LibriSpeechVocabulary 24 | from kospeech.data.label_loader import load_dataset 25 | from kospeech.data.data_loader import SpectrogramDataset 26 | from kospeech.evaluator.evaluator import Evaluator 27 | from kospeech.utils import check_envirionment, logger 28 | from kospeech.model_builder import load_test_model 29 | 30 | 31 | def inference(config: DictConfig): 32 | device = check_envirionment(config.eval.use_cuda) 33 | model = load_test_model(config.eval, device) 34 | 35 | if config.eval.dataset == 'kspon': 36 | vocab = KsponSpeechVocabulary( 37 | f'../../../data/vocab/cssiri_{config.eval.output_unit}_vocabs.csv', output_unit=config.eval.output_unit 38 | ) 39 | elif config.eval.dataset == 'libri': 40 | vocab = LibriSpeechVocabulary('../../../data/vocab/tokenizer.vocab', 'data/vocab/tokenizer.model') 41 | else: 42 | raise ValueError("Unsupported Dataset : {0}".format(config.eval.dataset)) 43 | 44 | audio_paths, transcripts = load_dataset(config.eval.transcripts_path) 45 | 46 | testset = SpectrogramDataset(audio_paths=audio_paths, transcripts=transcripts, 47 | sos_id=vocab.sos_id, eos_id=vocab.eos_id, 48 | dataset_path=config.eval.dataset_path, config=config, spec_augment=False) 49 | 50 | evaluator = Evaluator( 51 | dataset=testset, 52 | vocab=vocab, 53 | batch_size=config.eval.batch_size, 54 | device=device, 55 | num_workers=config.eval.num_workers, 56 | print_every=config.eval.print_every, 57 | decode=config.eval.decode, 58 | beam_size=config.eval.k, 59 | ) 60 | evaluator.evaluate(model) 61 | 62 | 63 | cs = ConfigStore.instance() 64 | cs.store(group="eval", name="default", node=EvalConfig, package="eval") 65 | cs.store(group="audio", name="fbank", node=FilterBankConfig, package="audio") 66 | 67 | 68 | @hydra.main(config_path=os.path.join('..', "configs"), config_name="eval") 69 | def main(config: DictConfig) -> None: 70 | warnings.filterwarnings('ignore') 71 | logger.info(OmegaConf.to_yaml(config)) 72 | inference(config) 73 | 74 | 75 | if __name__ == '__main__': 76 | main() 77 | -------------------------------------------------------------------------------- /docs/_static/css/badge_only.css: -------------------------------------------------------------------------------- 1 | .fa:before{-webkit-font-smoothing:antialiased}.clearfix{*zoom:1}.clearfix:after,.clearfix:before{display:table;content:""}.clearfix:after{clear:both}@font-face{font-family:FontAwesome;font-style:normal;font-weight:400;src:url(fonts/fontawesome-webfont.eot?674f50d287a8c48dc19ba404d20fe713?#iefix) format("embedded-opentype"),url(fonts/fontawesome-webfont.woff2?af7ae505a9eed503f8b8e6982036873e) format("woff2"),url(fonts/fontawesome-webfont.woff?fee66e712a8a08eef5805a46892932ad) format("woff"),url(fonts/fontawesome-webfont.ttf?b06871f281fee6b241d60582ae9369b9) format("truetype"),url(fonts/fontawesome-webfont.svg?912ec66d7572ff821749319396470bde#FontAwesome) format("svg")}.fa:before{font-family:FontAwesome;font-style:normal;font-weight:400;line-height:1}.fa:before,a .fa{text-decoration:inherit}.fa:before,a .fa,li .fa{display:inline-block}li .fa-large:before{width:1.875em}ul.fas{list-style-type:none;margin-left:2em;text-indent:-.8em}ul.fas li 
.fa{width:.8em}ul.fas li .fa-large:before{vertical-align:baseline}.fa-book:before,.icon-book:before{content:"\f02d"}.fa-caret-down:before,.icon-caret-down:before{content:"\f0d7"}.fa-caret-up:before,.icon-caret-up:before{content:"\f0d8"}.fa-caret-left:before,.icon-caret-left:before{content:"\f0d9"}.fa-caret-right:before,.icon-caret-right:before{content:"\f0da"}.rst-versions{position:fixed;bottom:0;left:0;width:300px;color:#fcfcfc;background:#1f1d1d;font-family:Lato,proxima-nova,Helvetica Neue,Arial,sans-serif;z-index:400}.rst-versions a{color:#2980b9;text-decoration:none}.rst-versions .rst-badge-small{display:none}.rst-versions .rst-current-version{padding:12px;background-color:#272525;display:block;text-align:right;font-size:90%;cursor:pointer;color:#27ae60}.rst-versions .rst-current-version:after{clear:both;content:"";display:block}.rst-versions .rst-current-version .fa{color:#fcfcfc}.rst-versions .rst-current-version .fa-book,.rst-versions .rst-current-version .icon-book{float:left}.rst-versions .rst-current-version.rst-out-of-date{background-color:#e74c3c;color:#fff}.rst-versions .rst-current-version.rst-active-old-version{background-color:#f1c40f;color:#000}.rst-versions.shift-up{height:auto;max-height:100%;overflow-y:scroll}.rst-versions.shift-up .rst-other-versions{display:block}.rst-versions .rst-other-versions{font-size:90%;padding:12px;color:grey;display:none}.rst-versions .rst-other-versions hr{display:block;height:1px;border:0;margin:20px 0;padding:0;border-top:1px solid #413d3d}.rst-versions .rst-other-versions dd{display:inline-block;margin:0}.rst-versions .rst-other-versions dd a{display:inline-block;padding:6px;color:#fcfcfc}.rst-versions.rst-badge{width:auto;bottom:20px;right:20px;left:auto;border:none;max-width:300px;max-height:90%}.rst-versions.rst-badge .fa-book,.rst-versions.rst-badge .icon-book{float:none;line-height:30px}.rst-versions.rst-badge.shift-up .rst-current-version{text-align:right}.rst-versions.rst-badge.shift-up .rst-current-version .fa-book,.rst-versions.rst-badge.shift-up .rst-current-version .icon-book{float:left}.rst-versions.rst-badge>.rst-current-version{width:auto;height:30px;line-height:30px;padding:0 6px;display:block;text-align:center}@media screen and (max-width:768px){.rst-versions{width:85%;display:none}.rst-versions.shift{display:block}} -------------------------------------------------------------------------------- /docs/build/html/_static/css/badge_only.css: -------------------------------------------------------------------------------- 1 | .fa:before{-webkit-font-smoothing:antialiased}.clearfix{*zoom:1}.clearfix:after,.clearfix:before{display:table;content:""}.clearfix:after{clear:both}@font-face{font-family:FontAwesome;font-style:normal;font-weight:400;src:url(fonts/fontawesome-webfont.eot?674f50d287a8c48dc19ba404d20fe713?#iefix) format("embedded-opentype"),url(fonts/fontawesome-webfont.woff2?af7ae505a9eed503f8b8e6982036873e) format("woff2"),url(fonts/fontawesome-webfont.woff?fee66e712a8a08eef5805a46892932ad) format("woff"),url(fonts/fontawesome-webfont.ttf?b06871f281fee6b241d60582ae9369b9) format("truetype"),url(fonts/fontawesome-webfont.svg?912ec66d7572ff821749319396470bde#FontAwesome) format("svg")}.fa:before{font-family:FontAwesome;font-style:normal;font-weight:400;line-height:1}.fa:before,a .fa{text-decoration:inherit}.fa:before,a .fa,li .fa{display:inline-block}li .fa-large:before{width:1.875em}ul.fas{list-style-type:none;margin-left:2em;text-indent:-.8em}ul.fas li .fa{width:.8em}ul.fas li 
.fa-large:before{vertical-align:baseline}.fa-book:before,.icon-book:before{content:"\f02d"}.fa-caret-down:before,.icon-caret-down:before{content:"\f0d7"}.fa-caret-up:before,.icon-caret-up:before{content:"\f0d8"}.fa-caret-left:before,.icon-caret-left:before{content:"\f0d9"}.fa-caret-right:before,.icon-caret-right:before{content:"\f0da"}.rst-versions{position:fixed;bottom:0;left:0;width:300px;color:#fcfcfc;background:#1f1d1d;font-family:Lato,proxima-nova,Helvetica Neue,Arial,sans-serif;z-index:400}.rst-versions a{color:#2980b9;text-decoration:none}.rst-versions .rst-badge-small{display:none}.rst-versions .rst-current-version{padding:12px;background-color:#272525;display:block;text-align:right;font-size:90%;cursor:pointer;color:#27ae60}.rst-versions .rst-current-version:after{clear:both;content:"";display:block}.rst-versions .rst-current-version .fa{color:#fcfcfc}.rst-versions .rst-current-version .fa-book,.rst-versions .rst-current-version .icon-book{float:left}.rst-versions .rst-current-version.rst-out-of-date{background-color:#e74c3c;color:#fff}.rst-versions .rst-current-version.rst-active-old-version{background-color:#f1c40f;color:#000}.rst-versions.shift-up{height:auto;max-height:100%;overflow-y:scroll}.rst-versions.shift-up .rst-other-versions{display:block}.rst-versions .rst-other-versions{font-size:90%;padding:12px;color:grey;display:none}.rst-versions .rst-other-versions hr{display:block;height:1px;border:0;margin:20px 0;padding:0;border-top:1px solid #413d3d}.rst-versions .rst-other-versions dd{display:inline-block;margin:0}.rst-versions .rst-other-versions dd a{display:inline-block;padding:6px;color:#fcfcfc}.rst-versions.rst-badge{width:auto;bottom:20px;right:20px;left:auto;border:none;max-width:300px;max-height:90%}.rst-versions.rst-badge .fa-book,.rst-versions.rst-badge .icon-book{float:none;line-height:30px}.rst-versions.rst-badge.shift-up .rst-current-version{text-align:right}.rst-versions.rst-badge.shift-up .rst-current-version .fa-book,.rst-versions.rst-badge.shift-up .rst-current-version .icon-book{float:left}.rst-versions.rst-badge>.rst-current-version{width:auto;height:30px;line-height:30px;padding:0 6px;display:block;text-align:center}@media screen and (max-width:768px){.rst-versions{width:85%;display:none}.rst-versions.shift{display:block}} -------------------------------------------------------------------------------- /dataset/kspon/preprocess/character.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, Soohwan Kim. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | import os 16 | import pandas as pd 17 | 18 | # id/char 사전 만들기 19 | def load_label(filepath): 20 | char2id = dict() 21 | id2char = dict() 22 | 23 | ch_labels = pd.read_csv(filepath, encoding="utf-8") 24 | 25 | id_list = ch_labels["id"] 26 | char_list = ch_labels["char"] 27 | freq_list = ch_labels["freq"] 28 | 29 | for (id_, char, freq) in zip(id_list, char_list, freq_list): 30 | char2id[char] = id_ 31 | id2char[id_] = char 32 | return char2id, id2char 33 | 34 | 35 | # 문장을 id로 36 | def sentence_to_target(sentence, char2id): 37 | target = str() 38 | 39 | for ch in sentence: 40 | try: 41 | target += (str(char2id[ch]) + ' ') 42 | # 사전에 없는 경우 넘어가라 -> 그냥 묵음처리나 마찬가지. 43 | except KeyError: 44 | continue 45 | 46 | return target[:-1] 47 | 48 | 49 | # 한글 전사에서 빈번하게 사용된 상위 2000개 단어 id 사전 만들기 50 | def generate_character_labels(transcripts, labels_dest): 51 | print('create_char_labels started..') 52 | 53 | label_list = list() 54 | label_freq = list() 55 | 56 | for transcript in transcripts: 57 | for ch in transcript: 58 | if ch not in label_list: 59 | label_list.append(ch) 60 | label_freq.append(1) 61 | else: 62 | label_freq[label_list.index(ch)] += 1 63 | 64 | # sort together Using zip 65 | label_freq, label_list = zip(*sorted(zip(label_freq, label_list), reverse=True)) 66 | label = {'id': [0, 1, 2], 'char': ['', '', ''], 'freq': [0, 0, 0]} 67 | 68 | for idx, (ch, freq) in enumerate(zip(label_list, label_freq)): 69 | label['id'].append(idx + 3) 70 | label['char'].append(ch) 71 | label['freq'].append(freq) 72 | 73 | label['id'] = label['id'][:986] 74 | label['char'] = label['char'][:986] 75 | label['freq'] = label['freq'][:986] 76 | 77 | label_df = pd.DataFrame(label) 78 | label_df.to_csv(os.path.join(labels_dest, "cssiri_character_vocabs.csv"), encoding="utf-8", index=False) 79 | 80 | 81 | def generate_character_script(audio_paths, transcripts, labels_dest): 82 | print('create_script started..') 83 | char2id, id2char = load_label(os.path.join(labels_dest, "cssiri_character_vocabs.csv")) 84 | 85 | with open(os.path.join("transcripts.txt"), "w") as f: 86 | for audio_path, transcript in zip(audio_paths, transcripts): 87 | char_id_transcript = sentence_to_target(transcript, char2id) 88 | audio_path = audio_path.replace('txt', 'wav') 89 | f.write(f'{audio_path}\t{transcript}\t{char_id_transcript}\n') 90 | -------------------------------------------------------------------------------- /bin/kospeech/decode/ensemble.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, Soohwan Kim. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # Note! This code is not available !! 16 | 17 | import torch 18 | import torch.nn as nn 19 | 20 | 21 | class Ensemble(nn.Module): 22 | """ 23 | Ensemble decoding. 24 | Decodes using multiple models simultaneously, 25 | 26 | Note: 27 | Do not use this class directly, use one of the sub classes. 
28 | """ 29 | def __init__(self, models): 30 | super(Ensemble, self).__init__() 31 | self.models = models 32 | self.num_models = len(models) 33 | 34 | def forward(self, *args, **kwargs): 35 | raise NotImplementedError 36 | 37 | 38 | class BasicEnsemble(Ensemble): 39 | """ 40 | Basic ensemble decoding. 41 | 42 | Decodes using multiple models simultaneously, 43 | combining their prediction distributions by adding. 44 | All models in the ensemble must share a target characters. 45 | """ 46 | def __init__(self, models): 47 | super(BasicEnsemble, self).__init__(models) 48 | 49 | def forward(self, inputs, input_lengths): 50 | y_hats = None 51 | 52 | with torch.no_grad(): 53 | for model in self.models: 54 | if y_hats is None: 55 | y_hats = model(inputs, input_lengths, teacher_forcing_ratio=0.0) 56 | else: 57 | y_hats += model(inputs, input_lengths, teacher_forcing_ratio=0.0) 58 | 59 | return y_hats 60 | 61 | 62 | class WeightedEnsemble(Ensemble): 63 | """ 64 | Weighted ensemble decoding. 65 | 66 | Decodes using multiple models simultaneously, 67 | combining their prediction distributions by weighted sum. 68 | All models in the ensemble must share a target characters. 69 | """ 70 | def __init__(self, models, dim=128): 71 | super(WeightedEnsemble, self).__init__(models) 72 | self.meta_classifier = nn.Sequential( 73 | nn.Linear(self.num_models, dim), 74 | nn.ELU(inplace=True), 75 | nn.Linear(dim, self.num_models) 76 | ) 77 | 78 | def forward(self, inputs, input_lengths): 79 | y_hats, outputs = None, list() 80 | weights = torch.FloatTensor([1.] * self.num_models) 81 | 82 | # model`s parameters are fixed 83 | with torch.no_grad(): 84 | for model in self.models: 85 | outputs.append(model(inputs, input_lengths, teacher_forcing_ratio=0.0)) 86 | 87 | weights = self.meta_classifier(weights) 88 | 89 | for (output, weight) in zip(outputs, weights): 90 | if y_hats is None: 91 | y_hats = output * weight 92 | else: 93 | y_hats += output * weight 94 | 95 | return y_hats 96 | -------------------------------------------------------------------------------- /bin/kospeech/criterion/label_smoothed_cross_entropy.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, Soohwan Kim. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import torch 16 | import torch.nn as nn 17 | import torch.nn.functional as F 18 | from torch import Tensor 19 | 20 | 21 | class LabelSmoothedCrossEntropyLoss(nn.Module): 22 | """ 23 | Label smoothed cross entropy loss function. 
24 | 25 | Args: 26 | num_classes (int): the number of classfication 27 | ignore_index (int): Indexes that are ignored when calculating loss 28 | smoothing (float): ratio of smoothing (confidence = 1.0 - smoothing) 29 | dim (int): dimension of calculation loss 30 | reduction (str): reduction method [sum, mean] (default: sum) 31 | 32 | Inputs: logits, target 33 | logits (torch.Tensor): probability distribution value from model and it has a logarithm shape 34 | target (torch.Tensor): ground-thruth encoded to integers which directly point a word in label 35 | 36 | Returns: label_smoothed 37 | - **label_smoothed** (float): sum of loss 38 | """ 39 | def __init__( 40 | self, 41 | num_classes: int, # the number of classfication 42 | ignore_index: int, # indexes that are ignored when calcuating loss 43 | smoothing: float = 0.1, # ratio of smoothing (confidence = 1.0 - smoothing) 44 | dim: int = -1, # dimension of caculation loss 45 | reduction='sum', # reduction method [sum, mean] 46 | ) -> None: 47 | super(LabelSmoothedCrossEntropyLoss, self).__init__() 48 | self.confidence = 1.0 - smoothing 49 | self.smoothing = smoothing 50 | self.num_classes = num_classes 51 | self.dim = dim 52 | self.ignore_index = ignore_index 53 | self.reduction = reduction.lower() 54 | 55 | if self.reduction == 'sum': 56 | self.reduction_method = torch.sum 57 | elif self.reduction == 'mean': 58 | self.reduction_method = torch.mean 59 | else: 60 | raise ValueError("Unsupported reduction method {0}".format(reduction)) 61 | 62 | def forward(self, logits: Tensor, targets: Tensor) -> Tensor: 63 | if self.smoothing > 0.0: 64 | with torch.no_grad(): 65 | label_smoothed = torch.zeros_like(logits) 66 | label_smoothed.fill_(self.smoothing / (self.num_classes - 1)) 67 | label_smoothed.scatter_(1, targets.data.unsqueeze(1), self.confidence) 68 | label_smoothed[targets == self.ignore_index, :] = 0 69 | return self.reduction_method(-label_smoothed * logits) 70 | 71 | return F.cross_entropy(logits, targets, ignore_index=self.ignore_index, reduction=self.reduction) 72 | -------------------------------------------------------------------------------- /test/attn_visualize.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, Soohwan Kim. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | import torch 16 | import numpy as np 17 | import librosa 18 | import pandas as pd 19 | import matplotlib.pyplot as plt 20 | import seaborn as sns 21 | from kospeech.data.audio.core import split 22 | from kospeech.models.seq2seq.decoder import Seq2seqDecoder 23 | 24 | MODEL_PATH = '../data/checkpoints/model.pt' 25 | AUDIO_PATH = '../data/sample/KaiSpeech_000098.pcm' 26 | DEL_SILENCE = True 27 | NORMALIZE = True 28 | SAMPLE_RATE = 16000 29 | N_MELS = 80 30 | N_FFT = 320 31 | HOP_LENGTH = 160 32 | 33 | 34 | def load_audio(audio_path, del_silence): 35 | sound = np.memmap(audio_path, dtype='h', mode='r').astype('float32') 36 | 37 | if del_silence: 38 | non_silence_indices = split(sound, top_db=30) 39 | sound = np.concatenate([sound[start:end] for start, end in non_silence_indices]) 40 | 41 | sound /= 32767 # normalize audio 42 | return sound 43 | 44 | 45 | # Set your parse_audio() method used in training. 46 | def parse_audio(audio_path): 47 | sound = load_audio(audio_path, DEL_SILENCE) 48 | 49 | spectrogram = librosa.feature.melspectrogram(sound, SAMPLE_RATE, n_mels=N_MELS, n_fft=N_FFT, hop_length=HOP_LENGTH) 50 | spectrogram = librosa.amplitude_to_db(spectrogram, ref=np.max) 51 | 52 | if NORMALIZE: 53 | mean = np.mean(spectrogram) 54 | std = np.std(spectrogram) 55 | spectrogram -= mean 56 | spectrogram /= std 57 | 58 | spectrogram = spectrogram[:, ::-1] 59 | spectrogram = torch.FloatTensor(np.ascontiguousarray(np.swapaxes(spectrogram, 0, 1))) 60 | 61 | return spectrogram 62 | 63 | 64 | spectrogram = parse_audio(AUDIO_PATH) 65 | model = torch.load(MODEL_PATH) 66 | 67 | _, metadata = model(spectrogram.unsqueeze(0), torch.IntTensor([len(spectrogram)]), teacher_forcing_ratio=0.0) # D(NxT) 68 | 69 | alignments = metadata[Seq2seqDecoder.KEY_ATTN_SCORE] 70 | attention_maps = None 71 | 72 | for decode_timestep in alignments: 73 | if attention_maps is None: 74 | attention_maps = decode_timestep 75 | else: 76 | attention_maps = torch.cat([attention_maps, decode_timestep], dim=1) # NxDxT 77 | 78 | attention_maps = torch.flip(attention_maps, dims=[0, 1]) 79 | num_heads = attention_maps.size(0) 80 | 81 | f = plt.figure(figsize=(16, 6)) 82 | plt.imshow(spectrogram.transpose(0, 1), aspect='auto', origin='lower') 83 | plt.savefig("./image/spectrogram.png") 84 | 85 | for n in range(num_heads): 86 | g = plt.figure(figsize=(10, 8)) 87 | attention_map = pd.DataFrame(attention_maps[n].cpu().detach().numpy().transpose(0, 1)) 88 | attention_map = sns.heatmap(attention_map, fmt="f", cmap='viridis') 89 | attention_map.invert_yaxis() 90 | fig = attention_map.get_figure() 91 | fig.savefig("./image/head%s.png" % str(n)) 92 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |
2 | 3 | ## Korean Speech Recognition for Non-Native Speakers (Korean STT) 4 | 5 | 6 | **Building a Korean speech recognition model with kospeech** 7 | ___ 8 |
9 | 10 | This project was carried out using 11 | [kospeech](https://github.com/sooftware/kospeech), an end-to-end open-source Korean speech recognition toolkit. 12 |

13 | 14 | ## - Project Overview 15 | - This is part of a project for a hackathon in which participants built a speech recognition model from Korean speech data uttered by non-native speakers and proposed related commercialization ideas.
(Competition period: about 3 weeks) 16 | - For this reason, the data and the trained model have not been uploaded. 17 | - A model was developed with a total of 300,000 training samples, and CER and WER measured on 10,000 test samples came out to 'CER: 0.085, WER: 0.1919' (a minimal CER sketch is shown after this list). 18 | - Under an evaluation weighted 40% model performance and 60% commercialization idea, the team placed 1st in the performance category and 2nd overall among 29 participating teams, receiving an excellence award. 19 | - The original open-source code contained quite a few errors/bugs. This repository is the result of fixing them one by one, and in the process some parts may have been modified to fit the data I used. 20 | - For details on how the open-source toolkit was used and how the problems encountered along the way were resolved, see the 'kospeech (Korean STT)' category on my [blog](https://mingchin.tistory.com/152). 21 | - ./bin/inference_wer.py, ./bin/tools.py, and ./bin/prediction.py do not exist in kospeech; they are files created as needed for this project. 22 | - Files that are not strictly required but may be useful as references are placed under ./etc. 23 |
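The CER figure above is a character-level edit-distance rate. The sketch below is illustrative only; it is not the competition's official scoring code nor the exact logic in ./bin/inference_wer.py, just a minimal way to compute CER from a reference and a hypothesis string:

```
# Minimal CER sketch (illustrative only; not the project's actual scoring code).
# CER = (substitutions + deletions + insertions) / number of reference characters.
def edit_distance(ref: str, hyp: str) -> int:
    # Standard dynamic-programming Levenshtein distance over characters.
    dp = list(range(len(hyp) + 1))
    for i, r in enumerate(ref, start=1):
        prev, dp[0] = dp[0], i
        for j, h in enumerate(hyp, start=1):
            cur = min(dp[j] + 1,        # deletion
                      dp[j - 1] + 1,    # insertion
                      prev + (r != h))  # substitution (free if characters match)
            prev, dp[j] = dp[j], cur
    return dp[-1]


def cer(reference: str, hypothesis: str) -> float:
    # CER is typically computed on character sequences with spaces removed.
    reference = reference.replace(' ', '')
    hypothesis = hypothesis.replace(' ', '')
    return edit_distance(reference, hypothesis) / max(1, len(reference))


print(cer('외국인 발화 음성', '외국인 바화 음성'))  # one substitution out of seven characters
```

WER is computed the same way but over word sequences instead of characters.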

24 | 25 | ## - Module Installation 26 | ``` 27 | !pip install -r requirements_cssiri.txt 28 | ``` 29 | - Python 3.8 was used. 30 | - Among the various acoustic models provided by kospeech, ds2 (DeepSpeech2) was used. 31 | - PyTorch 1.10 is used, so if you are on a newer version you need to reinstall PyTorch separately (an example command is shown after this list). 32 | - All modules needed for preprocessing, training, inference, and saving prediction results are included. 33 |
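If a newer PyTorch is already installed, one way to pin the version mentioned above is shown below. This is only an example: the exact build (CPU-only vs. a specific CUDA version) should be chosen to match your environment.

```
!pip uninstall -y torch torchaudio
!pip install torch==1.10.0 torchaudio==0.10.0
```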

34 | 35 | ## - Preprocessing 36 | ``` 37 | !python ./dataset/kspon/main.py --dataset_path $dataset_path --vocab_dest $vocab_dict_destination --output_unit 'character' --preprocess_mode 'phonetic' 38 | ``` 39 | - Set output_unit and preprocess_mode according to your situation. 40 | - As lines 95~101 of ./dataset/kspon/preprocess/preprocess.py show, a 'train.txt' file is required at './'. The file must be written in the format 'audio file path' + '\t' + 'Korean transcription' (an example is shown after this list). 41 | - The code used to create train.txt is provided in ./etc/traintext 생성.ipynb. 42 |
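For reference, a train.txt in the format described above looks like the following. The file names and sentences are made-up examples, and the separator between the two columns is a literal tab character:

```
foreign_speech_0001.wav	안녕하세요 만나서 반갑습니다
foreign_speech_0002.wav	오늘 날씨가 정말 좋네요
```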

43 | 44 | ## - Training 45 | ``` 46 | !python ./bin/main.py model=ds2 train=ds2_train train.dataset_path=$dataset_path 47 | ``` 48 | - Training-related configs (epoch, batch_size, spec_augment, audio file extension, etc.) can be modified in ./configs/audio/fbank.yaml or ./configs/train/ds2_train.yaml (a rough sketch is shown after this section). 49 |
For a more detailed explanation, please refer to [this post](https://mingchin.tistory.com/222). 50 |
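As a rough sketch of the kind of fields involved, a training config might contain entries like the ones below. The key names are illustrative assumptions only; check the actual ./configs/train/ds2_train.yaml and ./configs/audio/fbank.yaml in this repository for the exact names and values.

```
# Illustrative sketch only - key names are assumptions, not the real config
num_epochs: 20         # number of training epochs
batch_size: 16         # batch size per step
spec_augment: True     # whether to apply SpecAugment
audio_extension: wav   # audio file extension of the dataset
```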

51 | 52 | ## - Inference 53 | 54 | Set the device option of every command according to your environment. 55 | 56 | 1. Prediction for a single audio file 57 | * Command 58 | 59 | ``` 60 | !python ./bin/inference.py --model_path $model_path --audio_path $audio_path --device "cpu" 61 | ``` 62 | * Output 63 | 64 | ``` 65 | Speech recognition result 66 | ``` 67 | 2. Prediction for a single audio file plus saving the computed CER/WER results
68 | (The results are saved to dst_path, and a transcripts.txt file containing the ground-truth labels must be passed as transcript_path. Its format must be identical to the train.txt file used for preprocessing or to the transcripts.txt used for training; see the format example after this section.) 69 | * Command 70 | ``` 71 | python ./bin/inference_wer.py --model_path $model_path --audio_path $audio_path --transcript_path $transcript_path --dst_path $result_destination --device "cpu" 72 | ``` 73 | * Output 74 | 75 | ``` 76 | Speech recognition result 77 | ``` 78 | 3. Prediction for multiple audio files (a folder) and saving the results (.txt, .xlsx) 79 | * Command 80 | ``` 81 | python ./bin/prediction.py --model_path $model_path --audio_path $audio_path --submission 'True' --device "cpu" 82 | ``` 83 | Setting 'submission = True' saves the prediction results as an .xlsx file; note that this requires a submission Excel file with 2 columns. 84 | * Output 85 | 86 | .txt and .xlsx files are created in the ./outputs folder 87 |
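For reference, the transcripts.txt written by the preprocessing step (see dataset/kspon/preprocess/grapheme.py further below) has three tab-separated fields per line: audio path, text transcription, and the space-separated label indices. The line below is a made-up illustration, not real data:
```
KsponSpeech/foreign_ko_000001.pcm	안녕하세요	45 12 7 3 28
```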

88 | 89 | ## - References 90 | - WER/CER reference: 91 | https://holianh.github.io/portfolio/Cach-tinh-WER/ 92 | - kospeech: 93 | https://github.com/sooftware/kospeech 94 | -------------------------------------------------------------------------------- /dataset/kspon/preprocess/grapheme.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, Soohwan Kim. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | #     http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import os 16 | import pandas as pd 17 | import unicodedata 18 | 19 | 20 | def load_label(filepath): 21 |     grpm2id = dict() 22 |     id2grpm = dict() 23 | 24 |     vocab_data_frame = pd.read_csv(filepath, encoding="utf-8") 25 | 26 |     id_list = vocab_data_frame["id"] 27 |     grpm_list = vocab_data_frame["grpm"] 28 | 29 |     for _id, grpm in zip(id_list, grpm_list): 30 |         grpm2id[grpm] = _id 31 |         id2grpm[_id] = grpm 32 |     return grpm2id, id2grpm 33 | 34 | 35 | def sentence_to_target(transcript, grpm2id): 36 |     target = str() 37 | 38 |     for grpm in transcript: 39 |         target += (str(grpm2id[grpm]) + ' ') 40 | 41 |     return target[:-1] 42 | 43 | 44 | def sentence_to_grapheme(audio_paths, transcripts, vocab_dest: str = './data'): 45 |     grapheme_transcripts = list() 46 | 47 |     if not os.path.exists(vocab_dest): 48 |         os.mkdir(vocab_dest) 49 | 50 |     for transcript in transcripts: 51 |         grapheme_transcripts.append(" ".join(unicodedata.normalize('NFKD', transcript).replace(' ', '|')).upper()) 52 | 53 |     generate_grapheme_labels(grapheme_transcripts, vocab_dest) 54 | 55 |     print('create_script started..') 56 |     grpm2id, id2grpm = load_label(os.path.join(vocab_dest, "aihub_labels.csv")) 57 | 58 |     with open(os.path.join(f"{vocab_dest}/transcripts.txt"), "w") as f: 59 |         for audio_path, transcript, grapheme_transcript in zip(audio_paths, transcripts, grapheme_transcripts): 60 |             audio_path = audio_path.replace('txt', 'pcm') 61 |             grpm_id_transcript = sentence_to_target(grapheme_transcript.split(), grpm2id) 62 |             f.write(f'{audio_path}\t{transcript}\t{grpm_id_transcript}\n') 63 | 64 | 65 | def generate_grapheme_labels(grapheme_transcripts, vocab_dest: str = './data'): 66 |     vocab_list = list() 67 |     vocab_freq = list() 68 | 69 |     for grapheme_transcript in grapheme_transcripts: 70 |         graphemes = grapheme_transcript.split() 71 |         for grapheme in graphemes: 72 |             if grapheme not in vocab_list: 73 |                 vocab_list.append(grapheme) 74 |                 vocab_freq.append(1) 75 |             else: 76 |                 vocab_freq[vocab_list.index(grapheme)] += 1 77 | 78 |     vocab_freq, vocab_list = zip(*sorted(zip(vocab_freq, vocab_list), reverse=True)) 79 |     vocab_dict = { 80 |         'id': [0, 1, 2], 81 |         'grpm': ['<pad>', '<sos>', '<eos>'], 82 |         'freq': [0, 0, 0] 83 |     } 84 | 85 |     for idx, (grpm, freq) in enumerate(zip(vocab_list, vocab_freq)): 86 |         vocab_dict['id'].append(idx + 3) 87 |         vocab_dict['grpm'].append(grpm) 88 |         vocab_dict['freq'].append(freq) 89 | 90 |     label_df = pd.DataFrame(vocab_dict) 91 |     label_df.to_csv(os.path.join(vocab_dest, "aihub_labels.csv"), encoding="utf-8", index=False) 92 | 
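As a quick illustration of the grapheme unit produced by sentence_to_grapheme() above (a hedged sketch; the sentence is a made-up example, not taken from the dataset): NFKD normalization decomposes each Hangul syllable into its jamo, spaces become '|', and every symbol is then separated by a single space.
```python
import unicodedata

sentence = "안녕 하세요"  # placeholder sentence for illustration only
graphemes = " ".join(unicodedata.normalize('NFKD', sentence).replace(' ', '|')).upper()
print(graphemes)
# -> ᄋ ᅡ ᆫ ᄂ ᅧ ᆼ | ᄒ ᅡ ᄉ ᅦ ᄋ ᅭ   (conjoining jamo, one vocabulary unit per symbol)
```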
-------------------------------------------------------------------------------- /bin/kospeech/criterion/joint_ctc_cross_entropy.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, Soohwan Kim. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | #     http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import torch.nn as nn 16 | from typing import Tuple 17 | from torch import Tensor 18 | 19 | from kospeech.criterion import LabelSmoothedCrossEntropyLoss 20 | 21 | 22 | class JointCTCCrossEntropyLoss(nn.Module): 23 |     """ 24 |     Provides the Joint CTC-CrossEntropy Loss function 25 | 26 |     Args: 27 |         num_classes (int): the number of classes 28 |         ignore_index (int): index that is ignored when calculating the loss 29 |         dim (int): dimension along which the loss is calculated 30 |         reduction (str): reduction method [sum, mean] (default: mean) 31 |         ctc_weight (float): weight of ctc loss 32 |         cross_entropy_weight (float): weight of cross entropy loss 33 |         blank_id (int): index of the blank token for ctc 34 |     """ 35 | 36 |     def __init__( 37 |             self, 38 |             num_classes: int, 39 |             ignore_index: int, 40 |             dim: int = -1, 41 |             reduction='mean', 42 |             ctc_weight: float = 0.3, 43 |             cross_entropy_weight: float = 0.7, 44 |             blank_id: int = None, 45 |             smoothing: float = 0.1, 46 |     ) -> None: 47 |         super(JointCTCCrossEntropyLoss, self).__init__() 48 |         self.num_classes = num_classes 49 |         self.dim = dim 50 |         self.ignore_index = ignore_index 51 |         self.reduction = reduction.lower() 52 |         self.ctc_weight = ctc_weight 53 |         self.cross_entropy_weight = cross_entropy_weight 54 |         self.ctc_loss = nn.CTCLoss(blank=blank_id, reduction=self.reduction, zero_infinity=True) 55 |         if smoothing > 0.0: 56 |             self.cross_entropy_loss = LabelSmoothedCrossEntropyLoss( 57 |                 num_classes=num_classes, 58 |                 ignore_index=ignore_index, 59 |                 smoothing=smoothing, 60 |                 reduction=reduction, 61 |                 dim=-1, 62 |             ) 63 |         else: 64 |             self.cross_entropy_loss = nn.CrossEntropyLoss(reduction=self.reduction, ignore_index=self.ignore_index) 65 | 66 |     def forward( 67 |             self, 68 |             encoder_log_probs: Tensor, 69 |             decoder_log_probs: Tensor, 70 |             output_lengths: Tensor, 71 |             targets: Tensor, 72 |             target_lengths: Tensor, 73 |     ) -> Tuple[Tensor, Tensor, Tensor]: 74 |         ctc_loss = self.ctc_loss(encoder_log_probs, targets, output_lengths, target_lengths) 75 |         cross_entropy_loss = self.cross_entropy_loss(decoder_log_probs, targets.contiguous().view(-1)) 76 |         loss = cross_entropy_loss * self.cross_entropy_weight + ctc_loss * self.ctc_weight 77 |         return loss, ctc_loss, cross_entropy_loss 78 | -------------------------------------------------------------------------------- /dataset/libri/preprocess.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, Soohwan Kim. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import os 16 | import sentencepiece as spm 17 | 18 | LIBRI_SPEECH_DATASETS = [ 19 | 'train_960', 'dev-clean', 'dev-other', 'test-clean', 'test-other' 20 | ] 21 | 22 | 23 | def collect_transcripts(dataset_path): 24 | transcripts_collection = list() 25 | 26 | for dataset in LIBRI_SPEECH_DATASETS: 27 | dataset_transcripts = list() 28 | 29 | for subfolder1 in os.listdir(os.path.join(dataset_path, dataset)): 30 | for subfolder2 in os.listdir(os.path.join(dataset_path, dataset, subfolder1)): 31 | for file in os.listdir(os.path.join(dataset_path, dataset, subfolder1, subfolder2)): 32 | if file.endswith('txt'): 33 | with open(os.path.join(dataset_path, dataset, subfolder1, subfolder2, file)) as f: 34 | for line in f.readlines(): 35 | tokens = line.split() 36 | audio = '%s.flac' % os.path.join(dataset, subfolder1, subfolder2, tokens[0]) 37 | transcript = " ".join(tokens[1:]) 38 | dataset_transcripts.append('%s|%s' % (audio, transcript)) 39 | 40 | else: 41 | continue 42 | 43 | transcripts_collection.append(dataset_transcripts) 44 | 45 | return transcripts_collection 46 | 47 | 48 | def prepare_tokenizer(train_transcripts, vocab_size): 49 | input_file = 'spm_input.txt' 50 | model_name = 'tokenizer' 51 | model_type = 'unigram' 52 | 53 | with open(input_file, 'w') as f: 54 | for transcript in train_transcripts: 55 | f.write('{}\n'.format(transcript.split('|')[-1])) 56 | 57 | input_args = '--input=%s --model_prefix=%s --vocab_size=%s --model_type=%s' 58 | cmd = input_args % (input_file, model_name, vocab_size, model_type) 59 | spm.SentencePieceTrainer.Train(cmd) 60 | 61 | 62 | def generate_transcript_file(dataset_name, transcripts): 63 | sp = spm.SentencePieceProcessor() 64 | sp.Load("tokenizer.model") 65 | 66 | with open('../../data/%s-transcript.txt' % dataset_name, 'w') as f: 67 | for transcript in transcripts: 68 | audio, transcript = transcript.split('|') 69 | text = " ".join(sp.EncodeAsPieces(transcript)) 70 | label = " ".join([str(item) for item in sp.EncodeAsIds(transcript)]) 71 | 72 | f.write('%s\t%s\t%s\n' % (audio, text, label)) 73 | 74 | 75 | def merge_train_dev_transcript_file(): 76 | merge_list = ['train_960', 'dev-clean', 'dev-other'] 77 | 78 | lines = list() 79 | 80 | for dataset in merge_list: 81 | with open('../../data/%s-transcript.txt' % dataset) as f: 82 | for line in f.readlines(): 83 | lines.append(line) 84 | 85 | with open('../../data/train.txt', 'w') as f: 86 | for line in lines: 87 | f.write('%s' % line) 88 | --------------------------------------------------------------------------------