├── .coveragerc ├── .github ├── stale.yml └── workflows │ └── pythonpublish.yml ├── .idea ├── StreamingTransformer.iml ├── encodings.xml ├── misc.xml ├── modules.xml ├── vcs.xml └── workspace.xml ├── LICENSE ├── README.md ├── egs ├── aishell1 │ └── asr1 │ │ ├── cmd.sh │ │ ├── conf │ │ └── decode.yaml │ │ ├── data │ │ ├── decode.sh │ │ ├── dump │ │ ├── exp │ │ ├── local │ │ ├── aishell_data_prep.sh │ │ └── download_and_untar.sh │ │ ├── path.sh │ │ ├── steps │ │ ├── train.sh │ │ ├── utils │ │ └── viterbi_decode.sh └── librispeech │ └── asr1 │ ├── cmd.sh │ ├── conf │ ├── decode.yaml │ ├── fbank.conf │ ├── gpu.conf │ ├── pitch.conf │ ├── queue.conf │ ├── slurm.conf │ ├── specaug.yaml │ └── train_streaming_transformer.yaml │ ├── decode.sh │ ├── local │ ├── data_prep.sh │ └── download_and_untar.sh │ ├── path.sh │ ├── run.sh │ ├── steps │ ├── train.sh │ ├── utils │ └── viterbi_decode.sh ├── espnet ├── __init__.py ├── __pycache__ │ └── __init__.cpython-37.pyc ├── asr │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-37.pyc │ │ └── asr_utils.cpython-37.pyc │ ├── asr_utils.py │ └── pytorch_backend │ │ ├── __init__.py │ │ ├── __pycache__ │ │ ├── __init__.cpython-37.pyc │ │ ├── asr_ddp.cpython-37.pyc │ │ ├── asr_init.cpython-37.pyc │ │ └── asr_recog.cpython-37.pyc │ │ ├── asr_ddp.py │ │ ├── asr_init.py │ │ └── asr_recog.py ├── bin │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-37.pyc │ │ └── asr_train.cpython-37.pyc │ ├── asr_recog.py │ └── asr_train.py ├── nets │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-37.pyc │ │ ├── asr_interface.cpython-37.pyc │ │ ├── ctc_prefix_score.cpython-37.pyc │ │ ├── e2e_asr_common.cpython-37.pyc │ │ ├── lm_interface.cpython-37.pyc │ │ ├── scorer_interface.cpython-37.pyc │ │ └── viterbi_align.cpython-37.pyc │ ├── ctc_prefix_score.py │ ├── e2e_asr_common.py │ ├── lm_interface.py │ ├── pytorch_backend │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-37.pyc │ │ │ ├── ctc.cpython-37.pyc │ │ │ ├── e2e_asr_transformer.cpython-37.pyc │ │ │ ├── nets_utils.cpython-37.pyc │ │ │ └── streaming_transformer.cpython-37.pyc │ │ ├── conformer │ │ │ └── encoder.py │ │ ├── conformer_aed.py │ │ ├── ctc.py │ │ ├── e2e_asr_transformer.py │ │ ├── lm │ │ │ ├── __init__.py │ │ │ ├── __pycache__ │ │ │ │ ├── __init__.cpython-37.pyc │ │ │ │ └── default.cpython-37.pyc │ │ │ └── default.py │ │ ├── nets_utils.py │ │ ├── streaming_transformer.py │ │ └── transformer │ │ │ ├── __init__.py │ │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-37.pyc │ │ │ ├── add_sos_eos.cpython-37.pyc │ │ │ ├── attention.cpython-37.pyc │ │ │ ├── decoder.cpython-37.pyc │ │ │ ├── decoder_layer.cpython-37.pyc │ │ │ ├── embedding.cpython-37.pyc │ │ │ ├── encoder.cpython-37.pyc │ │ │ ├── encoder_layer.cpython-37.pyc │ │ │ ├── initializer.cpython-37.pyc │ │ │ ├── label_smoothing_loss.cpython-37.pyc │ │ │ ├── layer_norm.cpython-37.pyc │ │ │ ├── mask.cpython-37.pyc │ │ │ ├── multi_layer_conv.cpython-37.pyc │ │ │ ├── optimizer.cpython-37.pyc │ │ │ ├── plot.cpython-37.pyc │ │ │ ├── positionwise_feed_forward.cpython-37.pyc │ │ │ ├── repeat.cpython-37.pyc │ │ │ └── subsampling.cpython-37.pyc │ │ │ ├── add_sos_eos.py │ │ │ ├── attention.py │ │ │ ├── decoder.py │ │ │ ├── decoder_layer.py │ │ │ ├── embedding.py │ │ │ ├── encoder.py │ │ │ ├── encoder_layer.py │ │ │ ├── initializer.py │ │ │ ├── label_smoothing_loss.py │ │ │ ├── layer_norm.py │ │ │ ├── mask.py │ │ │ ├── multi_layer_conv.py │ │ │ ├── optimizer.py │ │ │ ├── plot.py │ │ │ ├── positionwise_feed_forward.py │ │ │ ├── repeat.py │ │ │ └── subsampling.py │ ├── scorer_interface.py │ ├── scorers │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-37.pyc │ │ │ └── ctc.cpython-37.pyc │ │ └── ctc.py │ └── viterbi_align.py ├── transform │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-37.pyc │ │ ├── functional.cpython-37.pyc │ │ ├── spec_augment.cpython-37.pyc │ │ ├── transform_interface.cpython-37.pyc │ │ └── transformation.cpython-37.pyc │ ├── cmvn.py │ ├── functional.py │ ├── spec_augment.py │ ├── spectrogram.py │ ├── transform_interface.py │ └── transformation.py └── utils │ ├── __init__.py │ ├── __pycache__ │ ├── __init__.cpython-37.pyc │ ├── check_kwargs.cpython-37.pyc │ ├── cli_utils.cpython-37.pyc │ ├── dataset.cpython-37.pyc │ ├── deterministic_utils.cpython-37.pyc │ ├── dynamic_import.cpython-37.pyc │ ├── fill_missing_args.cpython-37.pyc │ └── io_utils.cpython-37.pyc │ ├── check_kwargs.py │ ├── cli_readers.py │ ├── cli_utils.py │ ├── cli_writers.py │ ├── dataset.py │ ├── deterministic_utils.py │ ├── dynamic_import.py │ ├── fill_missing_args.py │ ├── io_utils.py │ ├── spec_augment.py │ └── training │ ├── __init__.py │ ├── __pycache__ │ ├── __init__.cpython-37.pyc │ └── batchfy.cpython-37.pyc │ └── batchfy.py ├── setup.py ├── tools ├── Makefile └── check_install.py └── utils ├── addjson.py ├── apply-cmvn.py ├── average_checkpoints.py ├── change_yaml.py ├── compute-cmvn-stats.py ├── compute-fbank-feats.py ├── compute-stft-feats.py ├── concatjson.py ├── convert_fbank.sh ├── convert_fbank_to_wav.py ├── copy-feats.py ├── data2json.sh ├── download_from_google_drive.sh ├── dump-pcm.py ├── dump.sh ├── dump_pcm.sh ├── eval-source-separation.py ├── eval_perm_free_error.py ├── eval_source_separation.sh ├── feat-to-shape.py ├── feat_to_shape.sh ├── feats2npy.py ├── filt.py ├── free-gpu.sh ├── generate_wav.sh ├── get_yaml.py ├── json2sctm.py ├── json2text.py ├── json2trn.py ├── json2trn_mt.py ├── json2trn_wo_dict.py ├── make_fbank.sh ├── make_stft.sh ├── merge_data.py ├── merge_scp2json.py ├── mergejson.py ├── mix-mono-wav-scp.py ├── pack_model.sh ├── queue-freegpu.pl ├── recog_wav.sh ├── reduce_data_dir.sh ├── remove_longshortdata.sh ├── result2json.py ├── score_bleu.sh ├── score_sclite.sh ├── score_sclite_wo_dict.sh ├── scp2json.py ├── show_result.sh ├── split_data.py ├── splitjson.py ├── spm_decode ├── spm_encode ├── spm_train ├── stdout.pl ├── synth_wav.sh ├── text2token.py ├── text2vocabulary.py ├── translate_wav.sh ├── trim_silence.py ├── trim_silence.sh ├── trn2ctm.py ├── trn2stm.py └── update_json.sh /.coveragerc: -------------------------------------------------------------------------------- 1 | [report] 2 | # Regexes for lines to exclude from consideration 3 | exclude_lines = 4 | # Have to re-enable the standard pragma 5 | pragma: no cover 6 | if __name__ == "__main__": 7 | if __name__ == '__main__': 8 | @abstractmethod 9 | raise NotImplementedError 10 | -------------------------------------------------------------------------------- /.github/stale.yml: -------------------------------------------------------------------------------- 1 | # Number of days of inactivity before an issue becomes stale 2 | daysUntilStale: 45 3 | # Number of days of inactivity before a stale issue is closed 4 | daysUntilClose: 30 5 | # Issues with these labels will never be considered stale 6 | exemptLabels: 7 | - Roadmap 8 | - Bug 9 | # Label to use when marking an issue as stale 10 | staleLabel: Stale 11 | # Comment to post when marking an issue as stale. Set to `false` to disable 12 | markComment: > 13 | This issue has been automatically marked as stale because it has not had 14 | recent activity. It will be closed if no further activity occurs. Thank you 15 | for your contributions. 16 | unmarkComment: false 17 | # Comment to post when closing a stale issue. Set to `false` to disable 18 | closeComment: > 19 | This issue is closed. Please re-open if needed. 20 | -------------------------------------------------------------------------------- /.github/workflows/pythonpublish.yml: -------------------------------------------------------------------------------- 1 | # This workflows will upload a Python Package using Twine when a release is created 2 | # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries 3 | 4 | name: Upload Python Package 5 | 6 | on: 7 | push: 8 | tags: 9 | - 'v*' 10 | 11 | jobs: 12 | deploy: 13 | 14 | runs-on: ubuntu-latest 15 | 16 | steps: 17 | - uses: actions/checkout@v2 18 | - name: Set up Python 19 | uses: actions/setup-python@v1 20 | with: 21 | python-version: '3.8' 22 | - name: Install dependencies 23 | run: | 24 | python -m pip install --upgrade pip 25 | pip install setuptools wheel twine 26 | - name: Build and publish 27 | env: 28 | TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} 29 | TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} 30 | run: | 31 | python setup.py sdist bdist_wheel 32 | twine upload dist/* 33 | -------------------------------------------------------------------------------- /.idea/StreamingTransformer.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 11 | -------------------------------------------------------------------------------- /.idea/encodings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Streaming Transformer 2 | **This repo contains the streaming Transformer of our work ``On the Comparison of Popular End-to-End Models for Large Scale Speech Recognition``, which is based on ESPnet0.6.0. The streaming Transformer includes a streaming encoder, either chunk-based or look-ahead based, and a trigger-attention based decoder.** 3 | 4 | We will release following models and show reproducible results on Librispeech 5 | 6 | * Streaming_transformer-chunk32 with ESPnet Conv2d Encoder. (https://drive.google.com/file/d/1LSBY_vK50Jxvw_GeiYrPwRtJ0DsKU6zL/view?usp=sharing) 7 | 8 | * Streaming_transformer-chunk32 with VGG Encoder. (https://drive.google.com/file/d/12P6TsxtOCxrHezqgtk0USjSKBsYHIe7K/view?usp=sharing) 9 | 10 | * Streaming_transformer-lookahead with ESPnet Conv2d Encoder. (https://drive.google.com/file/d/1YJQaofzsk9_KsL2W9Zb42sGLRRIKRs9X/view?usp=sharing) 11 | 12 | * Streaming_transformer-lookahead with VGG Encoder. (https://drive.google.com/file/d/1LO_0pPxU5XJffqJMgtx4W4IL-Aih5m0M/view?usp=sharing) 13 | 14 | ## Results on Librispeech (beam=10) 15 | | Model | test-clean | test-other |latency |size | 16 | | -------- | -----: | :----: |:----: |:----: | 17 | | streaming_transformer-chunk32-conv2d | 2.8 | 7.5 | 640ms | 78M | 18 | | streaming_transformer-chunk32-vgg | 2.8 | 7.0| 640ms | 78M | 19 | | streaming_transformer-lookahead2-conv2d | 3.0 | 8.6| 1230ms | 78M | 20 | | streaming_transformer-lookahead2-vgg | 2.8 | 7.5 | 1230ms | 78M | 21 | 22 | 23 | 24 | 25 | ## Installation 26 | Our installation follow the installation process of ESPnet 27 | ### Step 1. setting of the environment 28 | CUDAROOT=/path/to/cuda 29 | 30 | export PATH=$CUDAROOT/bin:$PATH 31 | export LD_LIBRARY_PATH=$CUDAROOT/lib64:$LD_LIBRARY_PATH 32 | export CFLAGS="-I$CUDAROOT/include $CFLAGS" 33 | export CUDA_HOME=$CUDAROOT 34 | export CUDA_PATH=$CUDAROOT` 35 | ### Step 2. installation including Kaldi 36 | cd tools 37 | make -j 10 38 | 39 | ## Build a streaming Transformer model 40 | ### Step 1. Data Prepare 41 | cd egs/librispeech/asr1 42 | ./run.sh 43 | By default. the processed data will stored in the current directory. You can change the path by editing the scripts. 44 | ### Step 2. Viterbi decoding 45 | To train a TA based streaming Transformer, the alignments between CTC paths and transcriptions are required. In our work, we apply Viterbi decoding using the offline Transformer model. 46 | 47 | cd egs/librispeech/asr1 48 | ./viterbi_decode.sh /path/to/model 49 | 50 | 51 | ### Step 3. Train a streaming Transformer 52 | Here, we train a chunk-based streaming Transformer which is initialized with an offline Transformer provided by ESPnet. Set `enc-init` in `conf/train_streaming_transformer.yaml` to the path of your offline model. 53 | 54 | cd egs/librispeech/asr1 55 | ./train.sh 56 | 57 | If you want to train a look-ahead based streaming Transformer, set `chunk` to False and change the `left-window, right-window, dec-left-window, dec-right-window` arguments. The training log is written in `exp/streaming_transformer/train.log`. You can monitor the output through `tail -f exp/streaming_transformer/train.log` 58 | 59 | ### Step 4. Decoding 60 | Execute the following script with to decoding on test_clean and test_other sets 61 | 62 | ./decode.sh num_of_gpu job_per_gpu 63 | 64 | ### Offline Transformer Reference 65 | Regarding the offline Transformer model, Please visit [here](https://github.com/MarkWuNLP/SemanticMask) 66 | 67 | -------------------------------------------------------------------------------- /egs/aishell1/asr1/cmd.sh: -------------------------------------------------------------------------------- 1 | # ====== About run.pl, queue.pl, slurm.pl, and ssh.pl ====== 2 | # Usage: .pl [options] JOB=1: 3 | # e.g. 4 | # run.pl --mem 4G JOB=1:10 echo.JOB.log echo JOB 5 | # 6 | # Options: 7 | # --time