├── .coveragerc
├── .github
│   ├── stale.yml
│   └── workflows
│       └── pythonpublish.yml
├── .idea
│   ├── StreamingTransformer.iml
│   ├── encodings.xml
│   ├── misc.xml
│   ├── modules.xml
│   ├── vcs.xml
│   └── workspace.xml
├── LICENSE
├── README.md
├── egs
│   ├── aishell1
│   │   └── asr1
│   │       ├── cmd.sh
│   │       ├── conf
│   │       │   └── decode.yaml
│   │       ├── data
│   │       ├── decode.sh
│   │       ├── dump
│   │       ├── exp
│   │       ├── local
│   │       │   ├── aishell_data_prep.sh
│   │       │   └── download_and_untar.sh
│   │       ├── path.sh
│   │       ├── steps
│   │       ├── train.sh
│   │       ├── utils
│   │       └── viterbi_decode.sh
│   └── librispeech
│       └── asr1
│           ├── cmd.sh
│           ├── conf
│           │   ├── decode.yaml
│           │   ├── fbank.conf
│           │   ├── gpu.conf
│           │   ├── pitch.conf
│           │   ├── queue.conf
│           │   ├── slurm.conf
│           │   ├── specaug.yaml
│           │   └── train_streaming_transformer.yaml
│           ├── decode.sh
│           ├── local
│           │   ├── data_prep.sh
│           │   └── download_and_untar.sh
│           ├── path.sh
│           ├── run.sh
│           ├── steps
│           ├── train.sh
│           ├── utils
│           └── viterbi_decode.sh
├── espnet
│   ├── __init__.py
│   ├── __pycache__
│   │   └── __init__.cpython-37.pyc
│   ├── asr
│   │   ├── __init__.py
│   │   ├── __pycache__
│   │   │   ├── __init__.cpython-37.pyc
│   │   │   └── asr_utils.cpython-37.pyc
│   │   ├── asr_utils.py
│   │   └── pytorch_backend
│   │       ├── __init__.py
│   │       ├── __pycache__
│   │       │   ├── __init__.cpython-37.pyc
│   │       │   ├── asr_ddp.cpython-37.pyc
│   │       │   ├── asr_init.cpython-37.pyc
│   │       │   └── asr_recog.cpython-37.pyc
│   │       ├── asr_ddp.py
│   │       ├── asr_init.py
│   │       └── asr_recog.py
│   ├── bin
│   │   ├── __init__.py
│   │   ├── __pycache__
│   │   │   ├── __init__.cpython-37.pyc
│   │   │   └── asr_train.cpython-37.pyc
│   │   ├── asr_recog.py
│   │   └── asr_train.py
│   ├── nets
│   │   ├── __init__.py
│   │   ├── __pycache__
│   │   │   ├── __init__.cpython-37.pyc
│   │   │   ├── asr_interface.cpython-37.pyc
│   │   │   ├── ctc_prefix_score.cpython-37.pyc
│   │   │   ├── e2e_asr_common.cpython-37.pyc
│   │   │   ├── lm_interface.cpython-37.pyc
│   │   │   ├── scorer_interface.cpython-37.pyc
│   │   │   └── viterbi_align.cpython-37.pyc
│   │   ├── ctc_prefix_score.py
│   │   ├── e2e_asr_common.py
│   │   ├── lm_interface.py
│   │   ├── pytorch_backend
│   │   │   ├── __init__.py
│   │   │   ├── __pycache__
│   │   │   │   ├── __init__.cpython-37.pyc
│   │   │   │   ├── ctc.cpython-37.pyc
│   │   │   │   ├── e2e_asr_transformer.cpython-37.pyc
│   │   │   │   ├── nets_utils.cpython-37.pyc
│   │   │   │   └── streaming_transformer.cpython-37.pyc
│   │   │   ├── conformer
│   │   │   │   └── encoder.py
│   │   │   ├── conformer_aed.py
│   │   │   ├── ctc.py
│   │   │   ├── e2e_asr_transformer.py
│   │   │   ├── lm
│   │   │   │   ├── __init__.py
│   │   │   │   ├── __pycache__
│   │   │   │   │   ├── __init__.cpython-37.pyc
│   │   │   │   │   └── default.cpython-37.pyc
│   │   │   │   └── default.py
│   │   │   ├── nets_utils.py
│   │   │   ├── streaming_transformer.py
│   │   │   └── transformer
│   │   │       ├── __init__.py
│   │   │       ├── __pycache__
│   │   │       │   ├── __init__.cpython-37.pyc
│   │   │       │   ├── add_sos_eos.cpython-37.pyc
│   │   │       │   ├── attention.cpython-37.pyc
│   │   │       │   ├── decoder.cpython-37.pyc
│   │   │       │   ├── decoder_layer.cpython-37.pyc
│   │   │       │   ├── embedding.cpython-37.pyc
│   │   │       │   ├── encoder.cpython-37.pyc
│   │   │       │   ├── encoder_layer.cpython-37.pyc
│   │   │       │   ├── initializer.cpython-37.pyc
│   │   │       │   ├── label_smoothing_loss.cpython-37.pyc
│   │   │       │   ├── layer_norm.cpython-37.pyc
│   │   │       │   ├── mask.cpython-37.pyc
│   │   │       │   ├── multi_layer_conv.cpython-37.pyc
│   │   │       │   ├── optimizer.cpython-37.pyc
│   │   │       │   ├── plot.cpython-37.pyc
│   │   │       │   ├── positionwise_feed_forward.cpython-37.pyc
│   │   │       │   ├── repeat.cpython-37.pyc
│   │   │       │   └── subsampling.cpython-37.pyc
│   │   │       ├── add_sos_eos.py
│   │   │       ├── attention.py
│   │   │       ├── decoder.py
│   │   │       ├── decoder_layer.py
│   │   │       ├── embedding.py
│   │   │       ├── encoder.py
│   │   │       ├── encoder_layer.py
│   │   │       ├── initializer.py
│   │   │       ├── label_smoothing_loss.py
│   │   │       ├── layer_norm.py
│   │   │       ├── mask.py
│   │   │       ├── multi_layer_conv.py
│   │   │       ├── optimizer.py
│   │   │       ├── plot.py
│   │   │       ├── positionwise_feed_forward.py
│   │   │       ├── repeat.py
│   │   │       └── subsampling.py
│   │   ├── scorer_interface.py
│   │   ├── scorers
│   │   │   ├── __init__.py
│   │   │   ├── __pycache__
│   │   │   │   ├── __init__.cpython-37.pyc
│   │   │   │   └── ctc.cpython-37.pyc
│   │   │   └── ctc.py
│   │   └── viterbi_align.py
│   ├── transform
│   │   ├── __init__.py
│   │   ├── __pycache__
│   │   │   ├── __init__.cpython-37.pyc
│   │   │   ├── functional.cpython-37.pyc
│   │   │   ├── spec_augment.cpython-37.pyc
│   │   │   ├── transform_interface.cpython-37.pyc
│   │   │   └── transformation.cpython-37.pyc
│   │   ├── cmvn.py
│   │   ├── functional.py
│   │   ├── spec_augment.py
│   │   ├── spectrogram.py
│   │   ├── transform_interface.py
│   │   └── transformation.py
│   └── utils
│       ├── __init__.py
│       ├── __pycache__
│       │   ├── __init__.cpython-37.pyc
│       │   ├── check_kwargs.cpython-37.pyc
│       │   ├── cli_utils.cpython-37.pyc
│       │   ├── dataset.cpython-37.pyc
│       │   ├── deterministic_utils.cpython-37.pyc
│       │   ├── dynamic_import.cpython-37.pyc
│       │   ├── fill_missing_args.cpython-37.pyc
│       │   └── io_utils.cpython-37.pyc
│       ├── check_kwargs.py
│       ├── cli_readers.py
│       ├── cli_utils.py
│       ├── cli_writers.py
│       ├── dataset.py
│       ├── deterministic_utils.py
│       ├── dynamic_import.py
│       ├── fill_missing_args.py
│       ├── io_utils.py
│       ├── spec_augment.py
│       └── training
│           ├── __init__.py
│           ├── __pycache__
│           │   ├── __init__.cpython-37.pyc
│           │   └── batchfy.cpython-37.pyc
│           └── batchfy.py
├── setup.py
├── tools
│   ├── Makefile
│   └── check_install.py
└── utils
    ├── addjson.py
    ├── apply-cmvn.py
    ├── average_checkpoints.py
    ├── change_yaml.py
    ├── compute-cmvn-stats.py
    ├── compute-fbank-feats.py
    ├── compute-stft-feats.py
    ├── concatjson.py
    ├── convert_fbank.sh
    ├── convert_fbank_to_wav.py
    ├── copy-feats.py
    ├── data2json.sh
    ├── download_from_google_drive.sh
    ├── dump-pcm.py
    ├── dump.sh
    ├── dump_pcm.sh
    ├── eval-source-separation.py
    ├── eval_perm_free_error.py
    ├── eval_source_separation.sh
    ├── feat-to-shape.py
    ├── feat_to_shape.sh
    ├── feats2npy.py
    ├── filt.py
    ├── free-gpu.sh
    ├── generate_wav.sh
    ├── get_yaml.py
    ├── json2sctm.py
    ├── json2text.py
    ├── json2trn.py
    ├── json2trn_mt.py
    ├── json2trn_wo_dict.py
    ├── make_fbank.sh
    ├── make_stft.sh
    ├── merge_data.py
    ├── merge_scp2json.py
    ├── mergejson.py
    ├── mix-mono-wav-scp.py
    ├── pack_model.sh
    ├── queue-freegpu.pl
    ├── recog_wav.sh
    ├── reduce_data_dir.sh
    ├── remove_longshortdata.sh
    ├── result2json.py
    ├── score_bleu.sh
    ├── score_sclite.sh
    ├── score_sclite_wo_dict.sh
    ├── scp2json.py
    ├── show_result.sh
    ├── split_data.py
    ├── splitjson.py
    ├── spm_decode
    ├── spm_encode
    ├── spm_train
    ├── stdout.pl
    ├── synth_wav.sh
    ├── text2token.py
    ├── text2vocabulary.py
    ├── translate_wav.sh
    ├── trim_silence.py
    ├── trim_silence.sh
    ├── trn2ctm.py
    ├── trn2stm.py
    └── update_json.sh
/.coveragerc:
--------------------------------------------------------------------------------
[report]
# Regexes for lines to exclude from consideration
exclude_lines =
    # Have to re-enable the standard pragma
    pragma: no cover
    if __name__ == "__main__":
    if __name__ == '__main__':
    @abstractmethod
    raise NotImplementedError
--------------------------------------------------------------------------------
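(A minimal illustration, not a file from this repo: with the [report] config above,
coverage.py drops lines matching those regexes -- and, for lines that introduce a
block, the whole block -- from the coverage report.)

    # example.py (hypothetical)
    class Frontend:
        def transform(self, x):
            raise NotImplementedError      # matched by "raise NotImplementedError"

    if __name__ == "__main__":             # matched; the entire block is excluded
        print(Frontend())                  # pragma: no cover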
/.github/stale.yml:
--------------------------------------------------------------------------------
# Number of days of inactivity before an issue becomes stale
daysUntilStale: 45
# Number of days of inactivity before a stale issue is closed
daysUntilClose: 30
# Issues with these labels will never be considered stale
exemptLabels:
  - Roadmap
  - Bug
# Label to use when marking an issue as stale
staleLabel: Stale
# Comment to post when marking an issue as stale. Set to `false` to disable
markComment: >
  This issue has been automatically marked as stale because it has not had
  recent activity. It will be closed if no further activity occurs. Thank you
  for your contributions.
unmarkComment: false
# Comment to post when closing a stale issue. Set to `false` to disable
closeComment: >
  This issue is closed. Please re-open if needed.
--------------------------------------------------------------------------------
/.github/workflows/pythonpublish.yml:
--------------------------------------------------------------------------------
# This workflow will upload a Python Package using Twine when a release tag is pushed
# For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries

name: Upload Python Package

on:
  push:
    tags:
      - 'v*'

jobs:
  deploy:

    runs-on: ubuntu-latest

    steps:
    - uses: actions/checkout@v2
    - name: Set up Python
      uses: actions/setup-python@v1
      with:
        python-version: '3.8'
    - name: Install dependencies
      run: |
        python -m pip install --upgrade pip
        pip install setuptools wheel twine
    - name: Build and publish
      env:
        TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }}
        TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
      run: |
        python setup.py sdist bdist_wheel
        twine upload dist/*
--------------------------------------------------------------------------------
/.idea/StreamingTransformer.iml:
--------------------------------------------------------------------------------
(PyCharm project file; XML content was stripped in this dump)
--------------------------------------------------------------------------------
/.idea/encodings.xml:
--------------------------------------------------------------------------------
(XML content was stripped in this dump)
--------------------------------------------------------------------------------
/.idea/misc.xml:
--------------------------------------------------------------------------------
(XML content was stripped in this dump)
--------------------------------------------------------------------------------
/.idea/modules.xml:
--------------------------------------------------------------------------------
(XML content was stripped in this dump)
--------------------------------------------------------------------------------
/.idea/vcs.xml:
--------------------------------------------------------------------------------
(XML content was stripped in this dump)
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Streaming Transformer
**This repo contains the streaming Transformer from our work "On the Comparison of Popular End-to-End Models for Large Scale Speech Recognition", built on ESPnet 0.6.0. The streaming Transformer consists of a streaming encoder, either chunk-based or look-ahead based, and a trigger-attention based decoder.**

We will release the following models and show reproducible results on Librispeech:

* Streaming_transformer-chunk32 with ESPnet Conv2d Encoder. (https://drive.google.com/file/d/1LSBY_vK50Jxvw_GeiYrPwRtJ0DsKU6zL/view?usp=sharing)

* Streaming_transformer-chunk32 with VGG Encoder. (https://drive.google.com/file/d/12P6TsxtOCxrHezqgtk0USjSKBsYHIe7K/view?usp=sharing)

* Streaming_transformer-lookahead with ESPnet Conv2d Encoder. (https://drive.google.com/file/d/1YJQaofzsk9_KsL2W9Zb42sGLRRIKRs9X/view?usp=sharing)

* Streaming_transformer-lookahead with VGG Encoder. (https://drive.google.com/file/d/1LO_0pPxU5XJffqJMgtx4W4IL-Aih5m0M/view?usp=sharing)
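For orientation, in trigger-attention decoding the CTC output decides, per output token, how much of the encoder output the decoder may attend to: token *i* can only see encoder frames up to its CTC trigger frame, plus a configurable look-ahead. The toy mask construction below is our own sketch of that idea, not code from this repo:

    # toy_trigger_mask.py -- illustrative sketch only, not this repo's implementation
    import numpy as np

    def trigger_attention_mask(trigger_frames, t_enc, lookahead=0):
        """trigger_frames[i]: encoder frame where CTC fires output token i.
        Returns a (num_tokens, t_enc) boolean mask for decoder attention."""
        mask = np.zeros((len(trigger_frames), t_enc), dtype=bool)
        for i, tf in enumerate(trigger_frames):
            mask[i, : min(tf + 1 + lookahead, t_enc)] = True
        return mask

    # token 0 triggered at frame 3, token 1 at frame 7; 10 encoder frames
    print(trigger_attention_mask([3, 7], 10, lookahead=1).astype(int))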

## Results on Librispeech (beam=10)

| Model | test-clean (WER) | test-other (WER) | latency | size |
| :---- | :---: | :---: | :---: | :---: |
| streaming_transformer-chunk32-conv2d | 2.8 | 7.5 | 640 ms | 78M |
| streaming_transformer-chunk32-vgg | 2.8 | 7.0 | 640 ms | 78M |
| streaming_transformer-lookahead2-conv2d | 3.0 | 8.6 | 1230 ms | 78M |
| streaming_transformer-lookahead2-vgg | 2.8 | 7.5 | 1230 ms | 78M |
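As a plausibility check on the latency column (our own reading, assuming a 10 ms frame shift and 4x front-end subsampling, neither of which is stated in this README): one encoder frame then covers 40 ms, a 32-frame chunk spans 1280 ms, and a frame waits half a chunk on average:

    # latency_check.py -- back-of-the-envelope only; the frame shift and
    # subsampling factor are assumptions
    FRAME_SHIFT_MS = 10
    SUBSAMPLING = 4

    def mean_chunk_latency_ms(chunk_frames):
        chunk_span_ms = chunk_frames * SUBSAMPLING * FRAME_SHIFT_MS
        return chunk_span_ms / 2

    print(mean_chunk_latency_ms(32))  # 640.0 ms, matching chunk32 above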

## Installation
Our installation follows the installation process of ESPnet.
### Step 1. Set up the environment

    CUDAROOT=/path/to/cuda

    export PATH=$CUDAROOT/bin:$PATH
    export LD_LIBRARY_PATH=$CUDAROOT/lib64:$LD_LIBRARY_PATH
    export CFLAGS="-I$CUDAROOT/include $CFLAGS"
    export CUDA_HOME=$CUDAROOT
    export CUDA_PATH=$CUDAROOT
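As a quick sanity check (our addition, not part of the original instructions), you can confirm that PyTorch sees CUDA after exporting these variables:

    # cuda_check.py -- optional sanity check, not part of this repo
    import torch

    print(torch.__version__)
    print(torch.cuda.is_available())   # should print True on a GPU machine
    if torch.cuda.is_available():
        print(torch.cuda.get_device_name(0))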
### Step 2. Installation, including Kaldi

    cd tools
    make -j 10

After the build, you can check the installation with the bundled `tools/check_install.py` script.
## Build a streaming Transformer model
### Step 1. Data preparation

    cd egs/librispeech/asr1
    ./run.sh

By default, the processed data will be stored in the current directory. You can change the path by editing the scripts.
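ESPnet-style recipes index features and labels through `data.json` files under `dump/`. A small sketch for inspecting one utterance (the exact path below is an assumption; check your `dump/` directory for the real one):

    # peek_data.py -- illustrative; the data.json path is an assumption
    import json

    with open("dump/train_960/data.json") as f:
        utts = json.load(f)["utts"]            # ESPnet-style {"utts": {utt_id: {...}}}

    utt_id, info = next(iter(utts.items()))
    print(utt_id)
    print(info["input"][0]["shape"])           # [num_frames, feat_dim]
    print(info["output"][0]["text"])           # reference transcription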
### Step 2. Viterbi decoding
To train a trigger-attention (TA) based streaming Transformer, alignments between CTC paths and transcriptions are required. In our work, we obtain them by Viterbi decoding with the offline Transformer model:

    cd egs/librispeech/asr1
    ./viterbi_decode.sh /path/to/model

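For intuition, below is a minimal NumPy sketch of Viterbi alignment over the CTC label topology (blanks interleaved between labels, with stay/advance/skip transitions). It illustrates the idea only; the repo's actual implementation is `espnet/nets/viterbi_align.py` and differs in detail:

    # ctc_viterbi_sketch.py -- minimal illustration, not this repo's implementation
    import numpy as np

    def ctc_viterbi_align(log_probs, labels, blank=0):
        """log_probs: (T, V) frame-wise log-posteriors; labels: token id list.
        Returns the most probable CTC state id for every frame."""
        states = [blank]
        for l in labels:                      # topology: b, l1, b, l2, ..., lL, b
            states += [l, blank]
        T, S = len(log_probs), len(states)
        dp = np.full((T, S), -np.inf)
        bp = np.zeros((T, S), dtype=int)      # offset jumped to reach each state
        dp[0, 0] = log_probs[0, states[0]]
        if S > 1:
            dp[0, 1] = log_probs[0, states[1]]
        for t in range(1, T):
            for s in range(S):
                cands = [dp[t - 1, s]]                     # stay
                if s >= 1:
                    cands.append(dp[t - 1, s - 1])         # advance one state
                if s >= 2 and states[s] != blank and states[s] != states[s - 2]:
                    cands.append(dp[t - 1, s - 2])         # skip over a blank
                best = int(np.argmax(cands))
                dp[t, s] = cands[best] + log_probs[t, states[s]]
                bp[t, s] = best
        s = S - 1 if dp[-1, S - 1] >= dp[-1, S - 2] else S - 2
        path = [s]
        for t in range(T - 1, 0, -1):
            s -= bp[t, s]
            path.append(s)
        path.reverse()
        return [states[i] for i in path]
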
### Step 3. Train a streaming Transformer
Here, we train a chunk-based streaming Transformer initialized with an offline Transformer provided by ESPnet. Set `enc-init` in `conf/train_streaming_transformer.yaml` to the path of your offline model.

    cd egs/librispeech/asr1
    ./train.sh

To train a look-ahead based streaming Transformer instead, set `chunk` to False and adjust the `left-window`, `right-window`, `dec-left-window`, and `dec-right-window` arguments. The training log is written to `exp/streaming_transformer/train.log`; you can monitor it with `tail -f exp/streaming_transformer/train.log`.

### Step 4. Decoding
Execute the following script to decode the test_clean and test_other sets, where `num_of_gpu` is the number of GPUs and `job_per_gpu` the number of decoding jobs per GPU:

    ./decode.sh num_of_gpu job_per_gpu

### Offline Transformer Reference
For the offline Transformer model, please visit [here](https://github.com/MarkWuNLP/SemanticMask).
--------------------------------------------------------------------------------
/egs/aishell1/asr1/cmd.sh:
--------------------------------------------------------------------------------
# ====== About run.pl, queue.pl, slurm.pl, and ssh.pl ======
# Usage: <cmd>.pl [options] JOB=1:<nj> <log> <command...>
# e.g.
#  run.pl --mem 4G JOB=1:10 echo.JOB.log echo JOB
#
# Options:
#    --time