├── .coveragerc
├── .github
    ├── stale.yml
    └── workflows
    │   └── pythonpublish.yml
├── .idea
    ├── StreamingTransformer.iml
    ├── encodings.xml
    ├── misc.xml
    ├── modules.xml
    ├── vcs.xml
    └── workspace.xml
├── LICENSE
├── README.md
├── egs
    ├── aishell1
    │   └── asr1
    │   │   ├── cmd.sh
    │   │   ├── conf
    │   │       └── decode.yaml
    │   │   ├── data
    │   │   ├── decode.sh
    │   │   ├── dump
    │   │   ├── exp
    │   │   ├── local
    │   │       ├── aishell_data_prep.sh
    │   │       └── download_and_untar.sh
    │   │   ├── path.sh
    │   │   ├── steps
    │   │   ├── train.sh
    │   │   ├── utils
    │   │   └── viterbi_decode.sh
    └── librispeech
    │   └── asr1
    │       ├── cmd.sh
    │       ├── conf
    │           ├── decode.yaml
    │           ├── fbank.conf
    │           ├── gpu.conf
    │           ├── pitch.conf
    │           ├── queue.conf
    │           ├── slurm.conf
    │           ├── specaug.yaml
    │           └── train_streaming_transformer.yaml
    │       ├── decode.sh
    │       ├── local
    │           ├── data_prep.sh
    │           └── download_and_untar.sh
    │       ├── path.sh
    │       ├── run.sh
    │       ├── steps
    │       ├── train.sh
    │       ├── utils
    │       └── viterbi_decode.sh
├── espnet
    ├── __init__.py
    ├── __pycache__
    │   └── __init__.cpython-37.pyc
    ├── asr
    │   ├── __init__.py
    │   ├── __pycache__
    │   │   ├── __init__.cpython-37.pyc
    │   │   └── asr_utils.cpython-37.pyc
    │   ├── asr_utils.py
    │   └── pytorch_backend
    │   │   ├── __init__.py
    │   │   ├── __pycache__
    │   │       ├── __init__.cpython-37.pyc
    │   │       ├── asr_ddp.cpython-37.pyc
    │   │       ├── asr_init.cpython-37.pyc
    │   │       └── asr_recog.cpython-37.pyc
    │   │   ├── asr_ddp.py
    │   │   ├── asr_init.py
    │   │   └── asr_recog.py
    ├── bin
    │   ├── __init__.py
    │   ├── __pycache__
    │   │   ├── __init__.cpython-37.pyc
    │   │   └── asr_train.cpython-37.pyc
    │   ├── asr_recog.py
    │   └── asr_train.py
    ├── nets
    │   ├── __init__.py
    │   ├── __pycache__
    │   │   ├── __init__.cpython-37.pyc
    │   │   ├── asr_interface.cpython-37.pyc
    │   │   ├── ctc_prefix_score.cpython-37.pyc
    │   │   ├── e2e_asr_common.cpython-37.pyc
    │   │   ├── lm_interface.cpython-37.pyc
    │   │   ├── scorer_interface.cpython-37.pyc
    │   │   └── viterbi_align.cpython-37.pyc
    │   ├── ctc_prefix_score.py
    │   ├── e2e_asr_common.py
    │   ├── lm_interface.py
    │   ├── pytorch_backend
    │   │   ├── __init__.py
    │   │   ├── __pycache__
    │   │   │   ├── __init__.cpython-37.pyc
    │   │   │   ├── ctc.cpython-37.pyc
    │   │   │   ├── e2e_asr_transformer.cpython-37.pyc
    │   │   │   ├── nets_utils.cpython-37.pyc
    │   │   │   └── streaming_transformer.cpython-37.pyc
    │   │   ├── conformer
    │   │   │   └── encoder.py
    │   │   ├── conformer_aed.py
    │   │   ├── ctc.py
    │   │   ├── e2e_asr_transformer.py
    │   │   ├── lm
    │   │   │   ├── __init__.py
    │   │   │   ├── __pycache__
    │   │   │   │   ├── __init__.cpython-37.pyc
    │   │   │   │   └── default.cpython-37.pyc
    │   │   │   └── default.py
    │   │   ├── nets_utils.py
    │   │   ├── streaming_transformer.py
    │   │   └── transformer
    │   │   │   ├── __init__.py
    │   │   │   ├── __pycache__
    │   │   │       ├── __init__.cpython-37.pyc
    │   │   │       ├── add_sos_eos.cpython-37.pyc
    │   │   │       ├── attention.cpython-37.pyc
    │   │   │       ├── decoder.cpython-37.pyc
    │   │   │       ├── decoder_layer.cpython-37.pyc
    │   │   │       ├── embedding.cpython-37.pyc
    │   │   │       ├── encoder.cpython-37.pyc
    │   │   │       ├── encoder_layer.cpython-37.pyc
    │   │   │       ├── initializer.cpython-37.pyc
    │   │   │       ├── label_smoothing_loss.cpython-37.pyc
    │   │   │       ├── layer_norm.cpython-37.pyc
    │   │   │       ├── mask.cpython-37.pyc
    │   │   │       ├── multi_layer_conv.cpython-37.pyc
    │   │   │       ├── optimizer.cpython-37.pyc
    │   │   │       ├── plot.cpython-37.pyc
    │   │   │       ├── positionwise_feed_forward.cpython-37.pyc
    │   │   │       ├── repeat.cpython-37.pyc
    │   │   │       └── subsampling.cpython-37.pyc
    │   │   │   ├── add_sos_eos.py
    │   │   │   ├── attention.py
    │   │   │   ├── decoder.py
    │   │   │   ├── decoder_layer.py
    │   │   │   ├── embedding.py
    │   │   │   ├── encoder.py
    │   │   │   ├── encoder_layer.py
    │   │   │   ├── initializer.py
    │   │   │   ├── label_smoothing_loss.py
    │   │   │   ├── layer_norm.py
    │   │   │   ├── mask.py
    │   │   │   ├── multi_layer_conv.py
    │   │   │   ├── optimizer.py
    │   │   │   ├── plot.py
    │   │   │   ├── positionwise_feed_forward.py
    │   │   │   ├── repeat.py
    │   │   │   └── subsampling.py
    │   ├── scorer_interface.py
    │   ├── scorers
    │   │   ├── __init__.py
    │   │   ├── __pycache__
    │   │   │   ├── __init__.cpython-37.pyc
    │   │   │   └── ctc.cpython-37.pyc
    │   │   └── ctc.py
    │   └── viterbi_align.py
    ├── transform
    │   ├── __init__.py
    │   ├── __pycache__
    │   │   ├── __init__.cpython-37.pyc
    │   │   ├── functional.cpython-37.pyc
    │   │   ├── spec_augment.cpython-37.pyc
    │   │   ├── transform_interface.cpython-37.pyc
    │   │   └── transformation.cpython-37.pyc
    │   ├── cmvn.py
    │   ├── functional.py
    │   ├── spec_augment.py
    │   ├── spectrogram.py
    │   ├── transform_interface.py
    │   └── transformation.py
    └── utils
    │   ├── __init__.py
    │   ├── __pycache__
    │       ├── __init__.cpython-37.pyc
    │       ├── check_kwargs.cpython-37.pyc
    │       ├── cli_utils.cpython-37.pyc
    │       ├── dataset.cpython-37.pyc
    │       ├── deterministic_utils.cpython-37.pyc
    │       ├── dynamic_import.cpython-37.pyc
    │       ├── fill_missing_args.cpython-37.pyc
    │       └── io_utils.cpython-37.pyc
    │   ├── check_kwargs.py
    │   ├── cli_readers.py
    │   ├── cli_utils.py
    │   ├── cli_writers.py
    │   ├── dataset.py
    │   ├── deterministic_utils.py
    │   ├── dynamic_import.py
    │   ├── fill_missing_args.py
    │   ├── io_utils.py
    │   ├── spec_augment.py
    │   └── training
    │       ├── __init__.py
    │       ├── __pycache__
    │           ├── __init__.cpython-37.pyc
    │           └── batchfy.cpython-37.pyc
    │       └── batchfy.py
├── setup.py
├── tools
    ├── Makefile
    └── check_install.py
└── utils
    ├── addjson.py
    ├── apply-cmvn.py
    ├── average_checkpoints.py
    ├── change_yaml.py
    ├── compute-cmvn-stats.py
    ├── compute-fbank-feats.py
    ├── compute-stft-feats.py
    ├── concatjson.py
    ├── convert_fbank.sh
    ├── convert_fbank_to_wav.py
    ├── copy-feats.py
    ├── data2json.sh
    ├── download_from_google_drive.sh
    ├── dump-pcm.py
    ├── dump.sh
    ├── dump_pcm.sh
    ├── eval-source-separation.py
    ├── eval_perm_free_error.py
    ├── eval_source_separation.sh
    ├── feat-to-shape.py
    ├── feat_to_shape.sh
    ├── feats2npy.py
    ├── filt.py
    ├── free-gpu.sh
    ├── generate_wav.sh
    ├── get_yaml.py
    ├── json2sctm.py
    ├── json2text.py
    ├── json2trn.py
    ├── json2trn_mt.py
    ├── json2trn_wo_dict.py
    ├── make_fbank.sh
    ├── make_stft.sh
    ├── merge_data.py
    ├── merge_scp2json.py
    ├── mergejson.py
    ├── mix-mono-wav-scp.py
    ├── pack_model.sh
    ├── queue-freegpu.pl
    ├── recog_wav.sh
    ├── reduce_data_dir.sh
    ├── remove_longshortdata.sh
    ├── result2json.py
    ├── score_bleu.sh
    ├── score_sclite.sh
    ├── score_sclite_wo_dict.sh
    ├── scp2json.py
    ├── show_result.sh
    ├── split_data.py
    ├── splitjson.py
    ├── spm_decode
    ├── spm_encode
    ├── spm_train
    ├── stdout.pl
    ├── synth_wav.sh
    ├── text2token.py
    ├── text2vocabulary.py
    ├── translate_wav.sh
    ├── trim_silence.py
    ├── trim_silence.sh
    ├── trn2ctm.py
    ├── trn2stm.py
    └── update_json.sh


/.coveragerc:
--------------------------------------------------------------------------------
 1 | [report]
 2 | # Regexes for lines to exclude from consideration
 3 | exclude_lines =
 4 |     # Have to re-enable the standard pragma
 5 |     pragma: no cover
 6 |     if __name__ == "__main__":
 7 |     if __name__ == '__main__':
 8 |     @abstractmethod
 9 |     raise NotImplementedError
10 | 


--------------------------------------------------------------------------------
/.github/stale.yml:
--------------------------------------------------------------------------------
 1 | # Number of days of inactivity before an issue becomes stale
 2 | daysUntilStale: 45
 3 | # Number of days of inactivity before a stale issue is closed
 4 | daysUntilClose: 30
 5 | # Issues with these labels will never be considered stale
 6 | exemptLabels:
 7 |   - Roadmap
 8 |   - Bug
 9 | # Label to use when marking an issue as stale
10 | staleLabel: Stale
11 | # Comment to post when marking an issue as stale. Set to `false` to disable
12 | markComment: >
13 |   This issue has been automatically marked as stale because it has not had
14 |   recent activity. It will be closed if no further activity occurs. Thank you
15 |   for your contributions.
16 | unmarkComment: false
17 | # Comment to post when closing a stale issue. Set to `false` to disable
18 | closeComment: >
19 |   This issue is closed. Please re-open if needed.
20 | 


--------------------------------------------------------------------------------
/.github/workflows/pythonpublish.yml:
--------------------------------------------------------------------------------
 1 | # This workflows will upload a Python Package using Twine when a release is created
 2 | # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries
 3 | 
 4 | name: Upload Python Package
 5 | 
 6 | on:
 7 |   push:
 8 |     tags:
 9 |       - 'v*'
10 | 
11 | jobs:
12 |   deploy:
13 | 
14 |     runs-on: ubuntu-latest
15 | 
16 |     steps:
17 |     - uses: actions/checkout@v2
18 |     - name: Set up Python
19 |       uses: actions/setup-python@v1
20 |       with:
21 |         python-version: '3.8'
22 |     - name: Install dependencies
23 |       run: |
24 |         python -m pip install --upgrade pip
25 |         pip install setuptools wheel twine
26 |     - name: Build and publish
27 |       env:
28 |         TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }}
29 |         TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
30 |       run: |
31 |         python setup.py sdist bdist_wheel
32 |         twine upload dist/*
33 | 


--------------------------------------------------------------------------------
/.idea/StreamingTransformer.iml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <module type="PYTHON_MODULE" version="4">
 3 |   <component name="NewModuleRootManager">
 4 |     <content url="file://$MODULE_DIR$" />
 5 |     <orderEntry type="inheritedJdk" />
 6 |     <orderEntry type="sourceFolder" forTests="false" />
 7 |   </component>
 8 |   <component name="TestRunnerService">
 9 |     <option name="PROJECT_TEST_RUNNER" value="Unittests" />
10 |   </component>
11 | </module>


--------------------------------------------------------------------------------
/.idea/encodings.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project version="4">
3 |   <component name="Encoding" addBOMForNewFiles="with NO BOM" />
4 | </project>


--------------------------------------------------------------------------------
/.idea/misc.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project version="4">
3 |   <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.7" project-jdk-type="Python SDK" />
4 | </project>


--------------------------------------------------------------------------------
/.idea/modules.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project version="4">
3 |   <component name="ProjectModuleManager">
4 |     <modules>
5 |       <module fileurl="file://$PROJECT_DIR$/.idea/StreamingTransformer.iml" filepath="$PROJECT_DIR$/.idea/StreamingTransformer.iml" />
6 |     </modules>
7 |   </component>
8 | </project>


--------------------------------------------------------------------------------
/.idea/vcs.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project version="4">
3 |   <component name="VcsDirectoryMappings">
4 |     <mapping directory="$PROJECT_DIR$" vcs="Git" />
5 |   </component>
6 | </project>


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Streaming Transformer
 2 | **This repo contains the streaming Transformer of our work ``On the Comparison of Popular End-to-End Models for Large Scale Speech Recognition``, which is based on ESPnet0.6.0. The streaming Transformer includes a streaming encoder, either chunk-based or look-ahead based, and a trigger-attention based decoder.**
 3 | 
 4 | We will release following models and show reproducible results on Librispeech
 5 | 
 6 | *  Streaming_transformer-chunk32 with ESPnet Conv2d Encoder. (https://drive.google.com/file/d/1LSBY_vK50Jxvw_GeiYrPwRtJ0DsKU6zL/view?usp=sharing)
 7 | 
 8 | *  Streaming_transformer-chunk32 with VGG Encoder. (https://drive.google.com/file/d/12P6TsxtOCxrHezqgtk0USjSKBsYHIe7K/view?usp=sharing)
 9 | 
10 | *  Streaming_transformer-lookahead with ESPnet Conv2d Encoder. (https://drive.google.com/file/d/1YJQaofzsk9_KsL2W9Zb42sGLRRIKRs9X/view?usp=sharing)
11 | 
12 | *  Streaming_transformer-lookahead with VGG Encoder. (https://drive.google.com/file/d/1LO_0pPxU5XJffqJMgtx4W4IL-Aih5m0M/view?usp=sharing)
13 | 
14 | ## Results on Librispeech (beam=10)
15 | | Model        | test-clean   |  test-other  |latency  |size  |
16 | | --------   | -----:  | :----:  |:----:  |:----:  |
17 | | streaming_transformer-chunk32-conv2d     | 2.8   |   7.5  | 640ms  | 78M |
18 | | streaming_transformer-chunk32-vgg	| 2.8 | 7.0| 640ms | 78M |
19 | | streaming_transformer-lookahead2-conv2d | 3.0 | 8.6| 1230ms | 78M |
20 | | streaming_transformer-lookahead2-vgg | 2.8 | 7.5 | 1230ms | 78M  |
21 | 
22 | 
23 | 
24 | 
25 | ## Installation
26 | Our installation follow the installation process of ESPnet
27 | ### Step 1. setting of the environment
28 |     CUDAROOT=/path/to/cuda
29 |     
30 |     export PATH=$CUDAROOT/bin:$PATH
31 |     export LD_LIBRARY_PATH=$CUDAROOT/lib64:$LD_LIBRARY_PATH
32 |     export CFLAGS="-I$CUDAROOT/include $CFLAGS"
33 |     export CUDA_HOME=$CUDAROOT
34 |     export CUDA_PATH=$CUDAROOT`
35 | ### Step 2. installation including Kaldi
36 |     cd tools
37 |     make -j 10
38 |     
39 | ## Build a streaming Transformer model
40 | ### Step 1. Data Prepare
41 |     cd egs/librispeech/asr1
42 |     ./run.sh 
43 | By default. the processed data will stored in the current directory. You can change the path by editing the scripts.
44 | ### Step 2. Viterbi decoding
45 | To train a TA based streaming Transformer, the alignments between CTC paths and transcriptions are required. In our work, we apply Viterbi decoding using the offline Transformer model.
46 | 
47 |     cd egs/librispeech/asr1
48 |     ./viterbi_decode.sh /path/to/model
49 | 
50 | 
51 | ### Step 3. Train a streaming Transformer
52 | Here, we train a chunk-based streaming Transformer which is initialized with an offline Transformer provided by ESPnet. Set `enc-init` in `conf/train_streaming_transformer.yaml` to the path of your offline model.
53 | 
54 | 	cd egs/librispeech/asr1
55 | 	./train.sh
56 | 
57 | If you want to train a look-ahead based streaming Transformer, set `chunk` to False and change the `left-window, right-window, dec-left-window, dec-right-window` arguments. The training log is written in `exp/streaming_transformer/train.log`. You can monitor the output through `tail -f exp/streaming_transformer/train.log`
58 | 
59 | ### Step 4. Decoding
60 | Execute the following script with to decoding on test_clean and test_other sets
61 | 
62 | 	./decode.sh num_of_gpu job_per_gpu
63 | 
64 | ### Offline Transformer Reference
65 | Regarding the offline Transformer model, Please visit [here](https://github.com/MarkWuNLP/SemanticMask)
66 | 
67 | 


--------------------------------------------------------------------------------
/egs/aishell1/asr1/cmd.sh:
--------------------------------------------------------------------------------
 1 | # ====== About run.pl, queue.pl, slurm.pl, and ssh.pl ======
 2 | # Usage: <cmd>.pl [options] JOB=1:<nj> <log> <command...>
 3 | # e.g.
 4 | #   run.pl --mem 4G JOB=1:10 echo.JOB.log echo JOB
 5 | #
 6 | # Options:
 7 | #   --time <time>: Limit the maximum time to execute.
 8 | #   --mem <mem>: Limit the maximum memory usage.
 9 | #   -–max-jobs-run <njob>: Limit the number parallel jobs. This is ignored for non-array jobs.
10 | #   --num-threads <ngpu>: Specify the number of CPU core.
11 | #   --gpu <ngpu>: Specify the number of GPU devices.
12 | #   --config: Change the configuration file from default.
13 | #
14 | # "JOB=1:10" is used for "array jobs" and it can control the number of parallel jobs.
15 | # The left string of "=", i.e. "JOB", is replaced by <N>(Nth job) in the command and the log file name,
16 | # e.g. "echo JOB" is changed to "echo 3" for the 3rd job and "echo 8" for 8th job respectively.
17 | # Note that the number must start with a positive number, so you can't use "JOB=0:10" for example.
18 | #
19 | # run.pl, queue.pl, slurm.pl, and ssh.pl have unified interface, not depending on its backend.
20 | # These options are mapping to specific options for each backend and
21 | # it is configured by "conf/queue.conf" and "conf/slurm.conf" by default.
22 | # If jobs failed, your configuration might be wrong for your environment.
23 | #
24 | #
25 | # The official documentaion for run.pl, queue.pl, slurm.pl, and ssh.pl:
26 | #   "Parallelization in Kaldi": http://kaldi-asr.org/doc/queue.html
27 | # =========================================================~
28 | 
29 | 
30 | # Select the backend used by run.sh from "local", "sge", "slurm", or "ssh"
31 | cmd_backend='local'
32 | 
33 | # Local machine, without any Job scheduling system
34 | if [ "${cmd_backend}" = local ]; then
35 | 
36 |     # The other usage
37 |     export train_cmd="run.pl"
38 |     # Used for "*_train.py": "--gpu" is appended optionally by run.sh
39 |     export cuda_cmd="run.pl"
40 |     # Used for "*_recog.py"
41 |     export decode_cmd="run.pl"
42 | 
43 | # "qsub" (SGE, Torque, PBS, etc.)
44 | elif [ "${cmd_backend}" = sge ]; then
45 |     # The default setting is written in conf/queue.conf.
46 |     # You must change "-q g.q" for the "queue" for your environment.
47 |     # To know the "queue" names, type "qhost -q"
48 |     # Note that to use "--gpu *", you have to setup "complex_value" for the system scheduler.
49 | 
50 |     export train_cmd="queue.pl"
51 |     export cuda_cmd="queue.pl"
52 |     export decode_cmd="queue.pl"
53 | 
54 | # "sbatch" (Slurm)
55 | elif [ "${cmd_backend}" = slurm ]; then
56 |     # The default setting is written in conf/slurm.conf.
57 |     # You must change "-p cpu" and "-p gpu" for the "partion" for your environment.
58 |     # To know the "partion" names, type "sinfo".
59 |     # You can use "--gpu * " by defualt for slurm and it is interpreted as "--gres gpu:*"
60 |     # The devices are allocated exclusively using "${CUDA_VISIBLE_DEVICES}".
61 | 
62 |     export train_cmd="slurm.pl"
63 |     export cuda_cmd="slurm.pl"
64 |     export decode_cmd="slurm.pl"
65 | 
66 | elif [ "${cmd_backend}" = ssh ]; then
67 |     # You have to create ".queue/machines" to specify the host to execute jobs.
68 |     # e.g. .queue/machines
69 |     #   host1
70 |     #   host2
71 |     #   host3
72 |     # Assuming you can login them without any password, i.e. You have to set ssh keys.
73 | 
74 |     export train_cmd="ssh.pl"
75 |     export cuda_cmd="ssh.pl"
76 |     export decode_cmd="ssh.pl"
77 | 
78 | # This is an example of specifying several unique options in the JHU CLSP cluster setup.
79 | # Users can modify/add their own command options according to their cluster environments.
80 | elif [ "${cmd_backend}" = jhu ]; then
81 | 
82 |     export train_cmd="queue.pl --mem 2G"
83 |     export cuda_cmd="queue-freegpu.pl --mem 2G --gpu 1 --config conf/gpu.conf"
84 |     export decode_cmd="queue.pl --mem 4G"
85 | 
86 | else
87 |     echo "$0: Error: Unknown cmd_backend=${cmd_backend}" 1>&2
88 |     return 1
89 | fi
90 | 


--------------------------------------------------------------------------------
/egs/aishell1/asr1/conf/decode.yaml:
--------------------------------------------------------------------------------
1 | beam-size: 10
2 | penalty: 0.0
3 | maxlenratio: 0.0
4 | minlenratio: 0.0
5 | ctc-weight: 0.5
6 | lm-weight: 0.7
7 | 
8 | 


--------------------------------------------------------------------------------
/egs/aishell1/asr1/data:
--------------------------------------------------------------------------------
1 | /datablob/users/v-chengw/data/aishell/data


--------------------------------------------------------------------------------
/egs/aishell1/asr1/decode.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | . ./path.sh || exit 1
 3 | . ./cmd.sh || exit 1
 4 | dataset='dev test'
 5 | config=conf/decode.yaml
 6 | ngpu=$1
 7 | jobperGPU=$2
 8 | ((nblock=$ngpu*$jobperGPU))
 9 | startblock=1
10 | endblock=${nblock}+1
11 | model_dir=exp/transformer_12layer_specaug
12 | rnnlm=exp/train_rnnlm_pytorch_lm
13 | 
14 | #average_checkpoints.py \
15 | #    --backend pytorch \
16 | #    --snapshots ${model_dir}/results/snapshot.ep.* \
17 | #    --out ${model_dir}/results/model.last5.avg.best \
18 | #    --num 5
19 | 
20 | function run(){
21 | part=$1
22 | cgpu=$[$2+0]
23 | recog_set=$3
24 | echo $cgpu
25 | export CUDA_VISIBLE_DEVICES=$cgpu
26 | mkdir -p ${model_dir}/decode/${recog_set}/log
27 | ${decode_cmd} ${model_dir}/decode/${recog_set}/log/decode.${part}.log \
28 |     asr_recog.py \
29 |     --config ${config} \
30 |     --ngpu 1 \
31 |     --rnnlm ${rnnlm} \
32 |     --recog-json dump/${recog_set}/deltafalse/split${nblock}utt/data.${part}.json \
33 |     --model ${model_dir}/results/model.last5.avg.best \
34 |     --result-label ${model_dir}/decode/${recog_set}/data.${part}.json
35 | }
36 | 
37 | function recog(){
38 | recog_set=$1
39 | decode_dir=${model_dir}/decode/${recog_set}
40 | 
41 | splitjson.py --parts ${nblock} dump/${recog_set}/deltafalse/data.json 
42 | 
43 | for ((i=$startblock;i<$endblock;i+=${ngpu}));
44 | do
45 |     for ((j=0;j<${ngpu};j++));
46 |     do
47 |         run $(($i+$j)) $j $recog_set &
48 |     done
49 | 
50 |     if [ $(( $[$i+1]%${jobperGPU} )) -eq 0 ]
51 |     then
52 |     wait;
53 |     fi
54 | done
55 | wait
56 | score_sclite.sh ${model_dir}/decode/${recog_set} data/lang_char/train_sp_units.txt
57 | }
58 | 
59 | for recog_set in ${dataset}; do
60 |     recog $recog_set
61 | done
62 | 


--------------------------------------------------------------------------------
/egs/aishell1/asr1/dump:
--------------------------------------------------------------------------------
1 | /datablob/users/v-chengw/data/aishell/dump


--------------------------------------------------------------------------------
/egs/aishell1/asr1/exp:
--------------------------------------------------------------------------------
1 | /datablob/users/v-chengw/data/aishell/exp


--------------------------------------------------------------------------------
/egs/aishell1/asr1/local/aishell_data_prep.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # Copyright 2017 Xingyu Na
 4 | # Apache 2.0
 5 | 
 6 | . ./path.sh || exit 1;
 7 | 
 8 | if [ $# != 2 ]; then
 9 |   echo "Usage: $0 <audio-path> <text-path>"
10 |   echo " $0 /export/a05/xna/data/data_aishell/wav /export/a05/xna/data/data_aishell/transcript"
11 |   exit 1;
12 | fi
13 | 
14 | aishell_audio_dir=$1
15 | aishell_text=$2/aishell_transcript_v0.8.txt
16 | 
17 | train_dir=data/local/train
18 | dev_dir=data/local/dev
19 | test_dir=data/local/test
20 | tmp_dir=data/local/tmp
21 | 
22 | mkdir -p $train_dir
23 | mkdir -p $dev_dir
24 | mkdir -p $test_dir
25 | mkdir -p $tmp_dir
26 | 
27 | # data directory check
28 | if [ ! -d $aishell_audio_dir ] || [ ! -f $aishell_text ]; then
29 |   echo "Error: $0 requires two directory arguments"
30 |   exit 1;
31 | fi
32 | 
33 | # find wav audio file for train, dev and test resp.
34 | find $aishell_audio_dir -iname "*.wav" > $tmp_dir/wav.flist
35 | n=`cat $tmp_dir/wav.flist | wc -l`
36 | [ $n -ne 141925 ] && \
37 |   echo Warning: expected 141925 data data files, found $n
38 | 
39 | grep -i "wav/train" $tmp_dir/wav.flist > $train_dir/wav.flist || exit 1;
40 | grep -i "wav/dev" $tmp_dir/wav.flist > $dev_dir/wav.flist || exit 1;
41 | grep -i "wav/test" $tmp_dir/wav.flist > $test_dir/wav.flist || exit 1;
42 | 
43 | rm -r $tmp_dir
44 | 
45 | # Transcriptions preparation
46 | for dir in $train_dir $dev_dir $test_dir; do
47 |   echo Preparing $dir transcriptions
48 |   sed -e 's/\.wav//' $dir/wav.flist | awk -F '/' '{print $NF}' > $dir/utt.list
49 |   sed -e 's/\.wav//' $dir/wav.flist | awk -F '/' '{i=NF-1;printf("%s %s\n",$NF,$i)}' > $dir/utt2spk_all
50 |   paste -d' ' $dir/utt.list $dir/wav.flist > $dir/wav.scp_all
51 |   utils/filter_scp.pl -f 1 $dir/utt.list $aishell_text > $dir/transcripts.txt
52 |   awk '{print $1}' $dir/transcripts.txt > $dir/utt.list
53 |   utils/filter_scp.pl -f 1 $dir/utt.list $dir/utt2spk_all | sort -u > $dir/utt2spk
54 |   utils/filter_scp.pl -f 1 $dir/utt.list $dir/wav.scp_all | sort -u > $dir/wav.scp
55 |   sort -u $dir/transcripts.txt > $dir/text
56 |   utils/utt2spk_to_spk2utt.pl $dir/utt2spk > $dir/spk2utt
57 | done
58 | 
59 | mkdir -p data/train data/dev data/test
60 | 
61 | for f in spk2utt utt2spk wav.scp text; do
62 |   cp $train_dir/$f data/train/$f || exit 1;
63 |   cp $dev_dir/$f data/dev/$f || exit 1;
64 |   cp $test_dir/$f data/test/$f || exit 1;
65 | done
66 | 
67 | echo "$0: AISHELL data preparation succeeded"
68 | exit 0;
69 | 


--------------------------------------------------------------------------------
/egs/aishell1/asr1/local/download_and_untar.sh:
--------------------------------------------------------------------------------
  1 | #!/bin/bash
  2 | 
  3 | # Copyright   2014  Johns Hopkins University (author: Daniel Povey)
  4 | #             2017  Xingyu Na
  5 | # Apache 2.0
  6 | 
  7 | remove_archive=false
  8 | 
  9 | if [ "$1" == --remove-archive ]; then
 10 |   remove_archive=true
 11 |   shift
 12 | fi
 13 | 
 14 | if [ $# -ne 3 ]; then
 15 |   echo "Usage: $0 [--remove-archive] <data-base> <url-base> <corpus-part>"
 16 |   echo "e.g.: $0 /export/a05/xna/data www.openslr.org/resources/33 data_aishell"
 17 |   echo "With --remove-archive it will remove the archive after successfully un-tarring it."
 18 |   echo "<corpus-part> can be one of: data_aishell, resource_aishell."
 19 | fi
 20 | 
 21 | data=$1
 22 | url=$2
 23 | part=$3
 24 | 
 25 | if [ ! -d "$data" ]; then
 26 |   echo "$0: no such directory $data"
 27 |   exit 1;
 28 | fi
 29 | 
 30 | part_ok=false
 31 | list="data_aishell resource_aishell"
 32 | for x in $list; do
 33 |   if [ "$part" == $x ]; then part_ok=true; fi
 34 | done
 35 | if ! $part_ok; then
 36 |   echo "$0: expected <corpus-part> to be one of $list, but got '$part'"
 37 |   exit 1;
 38 | fi
 39 | 
 40 | if [ -z "$url" ]; then
 41 |   echo "$0: empty URL base."
 42 |   exit 1;
 43 | fi
 44 | 
 45 | if [ -f $data/$part/.complete ]; then
 46 |   echo "$0: data part $part was already successfully extracted, nothing to do."
 47 |   exit 0;
 48 | fi
 49 | 
 50 | # sizes of the archive files in bytes.
 51 | sizes="15582913665 1246920"
 52 | 
 53 | if [ -f $data/$part.tgz ]; then
 54 |   size=$(/bin/ls -l $data/$part.tgz | awk '{print $5}')
 55 |   size_ok=false
 56 |   for s in $sizes; do if [ $s == $size ]; then size_ok=true; fi; done
 57 |   if ! $size_ok; then
 58 |     echo "$0: removing existing file $data/$part.tgz because its size in bytes $size"
 59 |     echo "does not equal the size of one of the archives."
 60 |     rm $data/$part.tgz
 61 |   else
 62 |     echo "$data/$part.tgz exists and appears to be complete."
 63 |   fi
 64 | fi
 65 | 
 66 | if [ ! -f $data/$part.tgz ]; then
 67 |   if ! command -v wget >/dev/null; then
 68 |     echo "$0: wget is not installed."
 69 |     exit 1;
 70 |   fi
 71 |   full_url=$url/$part.tgz
 72 |   echo "$0: downloading data from $full_url.  This may take some time, please be patient."
 73 | 
 74 |   cd $data || exit 1
 75 |   if ! wget --no-check-certificate $full_url; then
 76 |     echo "$0: error executing wget $full_url"
 77 |     exit 1;
 78 |   fi
 79 | fi
 80 | 
 81 | cd $data || exit 1
 82 | 
 83 | if ! tar -xvzf $part.tgz; then
 84 |   echo "$0: error un-tarring archive $data/$part.tgz"
 85 |   exit 1;
 86 | fi
 87 | 
 88 | touch $data/$part/.complete
 89 | 
 90 | if [ $part == "data_aishell" ]; then
 91 |   cd $data/$part/wav || exit 1
 92 |   for wav in ./*.tar.gz; do
 93 |     echo "Extracting wav from $wav"
 94 |     tar -zxf $wav && rm $wav
 95 |   done
 96 | fi
 97 | 
 98 | echo "$0: Successfully downloaded and un-tarred $data/$part.tgz"
 99 | 
100 | if $remove_archive; then
101 |   echo "$0: removing $data/$part.tgz file since --remove-archive option was supplied."
102 |   rm $data/$part.tgz
103 | fi
104 | 
105 | exit 0;
106 | 


--------------------------------------------------------------------------------
/egs/aishell1/asr1/path.sh:
--------------------------------------------------------------------------------
 1 | MAIN_ROOT=$PWD/../../..
 2 | KALDI_ROOT=$MAIN_ROOT/tools/kaldi
 3 | 
 4 | [ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh
 5 | export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/tools/sctk/bin:$PWD:$PATH
 6 | [ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1
 7 | . $KALDI_ROOT/tools/config/common_path.sh
 8 | export LC_ALL=C
 9 | export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:$MAIN_ROOT/tools/chainer_ctc/ext/warp-ctc/build
10 | if [ -e $MAIN_ROOT/tools/venv/etc/profile.d/conda.sh ]; then
11 |     source $MAIN_ROOT/tools/venv/etc/profile.d/conda.sh && conda deactivate && conda activate
12 | else
13 |     source $MAIN_ROOT/tools/venv/bin/activate
14 | fi
15 | export PATH=$MAIN_ROOT/utils:$MAIN_ROOT/espnet/bin:$PATH
16 | 
17 | export OMP_NUM_THREADS=1
18 | 
19 | # NOTE(kan-bayashi): Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
20 | export PYTHONIOENCODING=UTF-8
21 | 


--------------------------------------------------------------------------------
/egs/aishell1/asr1/steps:
--------------------------------------------------------------------------------
1 | ../../../tools/kaldi/egs/wsj/s5/steps


--------------------------------------------------------------------------------
/egs/aishell1/asr1/train.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # general configuration
 3 | . ./path.sh || exit 1
 4 | . ./cmd.sh || exit 1
 5 | 
 6 | ngpu=4
 7 | debugmode=1
 8 | verbose=0
 9 | resume=
10 | 
11 | workdir=$(pwd)
12 | config=${workdir}/conf/train_streaming_transformer.yaml
13 | 
14 | expname=streaming_transformer
15 | expdir=exp
16 | exppath=${expdir}/${expname}
17 | 
18 | mkdir -p ${exppath}
19 | 
20 | ${cuda_cmd} --gpu ${ngpu} ${exppath}/train.log \
21 |     asr_train.py \
22 |     --config ${config} \
23 |     --preprocess-conf ${workdir}/conf/specaug.yaml \
24 |     --ngpu ${ngpu} \
25 |     --outdir ${exppath}/results \
26 |     --train-json dump/train_960/deltafalse/data_aligned.json \
27 |     --valid-json dump/dev/deltafalse/data_aligned.json \
28 |     --resume ${resume} \
29 |     --dict data/lang_char/train_960_unigram5000_units.txt
30 | 
31 | 


--------------------------------------------------------------------------------
/egs/aishell1/asr1/utils:
--------------------------------------------------------------------------------
1 | ../../../tools/kaldi/egs/wsj/s5/utils


--------------------------------------------------------------------------------
/egs/aishell1/asr1/viterbi_decode.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # general configuration
 3 | . ./path.sh || exit 1
 4 | train_set=train_sp
 5 | train_dev=dev
 6 | 
 7 | dumpdir=dump
 8 | model=$1
 9 | 
10 | #python ../../../espnet/bin/asr_recog.py \
11 | #	--ngpu 1 \
12 | #        --viterbi true \
13 | #        --model ${model} \
14 | #	--recog-json dump/${train_set}/deltafalse/data.json \
15 | #        --result-label dump/${train_set}/deltafalse/data_aligned_new.json
16 | 
17 | python ../../../espnet/bin/asr_recog.py \
18 |         --ngpu 1 \
19 |         --viterbi true \
20 |         --model ${model} \
21 |         --recog-json dump/${train_dev}/deltafalse/data.json \
22 |         --result-label dump/${train_dev}/deltafalse/data_aligned_new.json
23 | 
24 | 
25 | 


--------------------------------------------------------------------------------
/egs/librispeech/asr1/cmd.sh:
--------------------------------------------------------------------------------
 1 | # ====== About run.pl, queue.pl, slurm.pl, and ssh.pl ======
 2 | # Usage: <cmd>.pl [options] JOB=1:<nj> <log> <command...>
 3 | # e.g.
 4 | #   run.pl --mem 4G JOB=1:10 echo.JOB.log echo JOB
 5 | #
 6 | # Options:
 7 | #   --time <time>: Limit the maximum time to execute.
 8 | #   --mem <mem>: Limit the maximum memory usage.
 9 | #   -–max-jobs-run <njob>: Limit the number parallel jobs. This is ignored for non-array jobs.
10 | #   --num-threads <ngpu>: Specify the number of CPU core.
11 | #   --gpu <ngpu>: Specify the number of GPU devices.
12 | #   --config: Change the configuration file from default.
13 | #
14 | # "JOB=1:10" is used for "array jobs" and it can control the number of parallel jobs.
15 | # The left string of "=", i.e. "JOB", is replaced by <N>(Nth job) in the command and the log file name,
16 | # e.g. "echo JOB" is changed to "echo 3" for the 3rd job and "echo 8" for 8th job respectively.
17 | # Note that the number must start with a positive number, so you can't use "JOB=0:10" for example.
18 | #
19 | # run.pl, queue.pl, slurm.pl, and ssh.pl have unified interface, not depending on its backend.
20 | # These options are mapping to specific options for each backend and
21 | # it is configured by "conf/queue.conf" and "conf/slurm.conf" by default.
22 | # If jobs failed, your configuration might be wrong for your environment.
23 | #
24 | #
25 | # The official documentaion for run.pl, queue.pl, slurm.pl, and ssh.pl:
26 | #   "Parallelization in Kaldi": http://kaldi-asr.org/doc/queue.html
27 | # =========================================================~
28 | 
29 | 
30 | # Select the backend used by run.sh from "local", "sge", "slurm", or "ssh"
31 | cmd_backend='local'
32 | 
33 | # Local machine, without any Job scheduling system
34 | if [ "${cmd_backend}" = local ]; then
35 | 
36 |     # The other usage
37 |     export train_cmd="run.pl"
38 |     # Used for "*_train.py": "--gpu" is appended optionally by run.sh
39 |     export cuda_cmd="run.pl"
40 |     # Used for "*_recog.py"
41 |     export decode_cmd="run.pl"
42 | 
43 | # "qsub" (SGE, Torque, PBS, etc.)
44 | elif [ "${cmd_backend}" = sge ]; then
45 |     # The default setting is written in conf/queue.conf.
46 |     # You must change "-q g.q" for the "queue" for your environment.
47 |     # To know the "queue" names, type "qhost -q"
48 |     # Note that to use "--gpu *", you have to setup "complex_value" for the system scheduler.
49 | 
50 |     export train_cmd="queue.pl"
51 |     export cuda_cmd="queue.pl"
52 |     export decode_cmd="queue.pl"
53 | 
54 | # "sbatch" (Slurm)
55 | elif [ "${cmd_backend}" = slurm ]; then
56 |     # The default setting is written in conf/slurm.conf.
57 |     # You must change "-p cpu" and "-p gpu" for the "partion" for your environment.
58 |     # To know the "partion" names, type "sinfo".
59 |     # You can use "--gpu * " by defualt for slurm and it is interpreted as "--gres gpu:*"
60 |     # The devices are allocated exclusively using "${CUDA_VISIBLE_DEVICES}".
61 | 
62 |     export train_cmd="slurm.pl"
63 |     export cuda_cmd="slurm.pl"
64 |     export decode_cmd="slurm.pl"
65 | 
66 | elif [ "${cmd_backend}" = ssh ]; then
67 |     # You have to create ".queue/machines" to specify the host to execute jobs.
68 |     # e.g. .queue/machines
69 |     #   host1
70 |     #   host2
71 |     #   host3
72 |     # Assuming you can login them without any password, i.e. You have to set ssh keys.
73 | 
74 |     export train_cmd="ssh.pl"
75 |     export cuda_cmd="ssh.pl"
76 |     export decode_cmd="ssh.pl"
77 | 
78 | # This is an example of specifying several unique options in the JHU CLSP cluster setup.
79 | # Users can modify/add their own command options according to their cluster environments.
80 | elif [ "${cmd_backend}" = jhu ]; then
81 | 
82 |     export train_cmd="queue.pl --mem 2G"
83 |     export cuda_cmd="queue-freegpu.pl --mem 2G --gpu 1 --config conf/gpu.conf"
84 |     export decode_cmd="queue.pl --mem 4G"
85 | 
86 | else
87 |     echo "$0: Error: Unknown cmd_backend=${cmd_backend}" 1>&2
88 |     return 1
89 | fi
90 | 


--------------------------------------------------------------------------------
/egs/librispeech/asr1/conf/decode.yaml:
--------------------------------------------------------------------------------
 1 | lm-weight: 0.5
 2 | beam-size: 5
 3 | penalty: 2.0
 4 | maxlenratio: 0.0
 5 | minlenratio: 0.0
 6 | ctc-weight: 0.5
 7 | threshold: 0.0005
 8 | ctc-lm-weight: 0.5
 9 | prefix-decode: true
10 | 


--------------------------------------------------------------------------------
/egs/librispeech/asr1/conf/fbank.conf:
--------------------------------------------------------------------------------
1 | --sample-frequency=16000 
2 | --num-mel-bins=80
3 | 


--------------------------------------------------------------------------------
/egs/librispeech/asr1/conf/gpu.conf:
--------------------------------------------------------------------------------
 1 | # Default configuration
 2 | command qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64*
 3 | option mem=* -l mem_free=$0,ram_free=$0
 4 | option mem=0          # Do not add anything to qsub_opts
 5 | option num_threads=* -pe smp $0
 6 | option num_threads=1  # Do not add anything to qsub_opts
 7 | option max_jobs_run=* -tc $0
 8 | default gpu=0
 9 | option gpu=0
10 | option gpu=* -l 'hostname=b1[12345678]*|c*,gpu=$0' -q g.q


--------------------------------------------------------------------------------
/egs/librispeech/asr1/conf/pitch.conf:
--------------------------------------------------------------------------------
1 | --sample-frequency=16000
2 | 


--------------------------------------------------------------------------------
/egs/librispeech/asr1/conf/queue.conf:
--------------------------------------------------------------------------------
 1 | # Default configuration
 2 | command qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64*
 3 | option mem=* -l mem_free=$0,ram_free=$0
 4 | option mem=0          # Do not add anything to qsub_opts
 5 | option num_threads=* -pe smp $0
 6 | option num_threads=1  # Do not add anything to qsub_opts
 7 | option max_jobs_run=* -tc $0
 8 | default gpu=0
 9 | option gpu=0
10 | option gpu=* -l gpu=$0 -q g.q
11 | 


--------------------------------------------------------------------------------
/egs/librispeech/asr1/conf/slurm.conf:
--------------------------------------------------------------------------------
 1 | # Default configuration
 2 | command sbatch --export=PATH  --ntasks-per-node=1
 3 | option time=* --time $0
 4 | option mem=* --mem-per-cpu $0
 5 | option mem=0          # Do not add anything to qsub_opts
 6 | option num_threads=* --cpus-per-task $0 --ntasks-per-node=1
 7 | option num_threads=1 --cpus-per-task 1  --ntasks-per-node=1 # Do not add anything to qsub_opts
 8 | default gpu=0
 9 | option gpu=0 -p cpu
10 | option gpu=* -p gpu --gres=gpu:$0
11 | # note: the --max-jobs-run option is supported as a special case
12 | # by slurm.pl and you don't have to handle it in the config file.
13 | 


--------------------------------------------------------------------------------
/egs/librispeech/asr1/conf/specaug.yaml:
--------------------------------------------------------------------------------
 1 | process:
 2 |   # these three processes are a.k.a. SpecAugument
 3 |   - type: "time_warp"
 4 |     max_time_warp: 5
 5 |     inplace: true
 6 |     mode: "PIL"
 7 |   - type: "freq_mask"
 8 |     F: 30
 9 |     n_mask: 2
10 |     inplace: true
11 |     replace_with_zero: false
12 |   - type: "time_mask"
13 |     T: 40
14 |     n_mask: 2
15 |     inplace: true
16 |     replace_with_zero: false
17 | 


--------------------------------------------------------------------------------
/egs/librispeech/asr1/conf/train_streaming_transformer.yaml:
--------------------------------------------------------------------------------
 1 | # This configuration requires 4 gpus with 12GB memory
 2 | accum-grad: 1
 3 | adim: 512
 4 | aheads: 8
 5 | batch-bins: 3000000
 6 | dlayers: 6
 7 | dropout-rate: 0.1
 8 | dunits: 2048
 9 | elayers: 12
10 | epochs: 120
11 | eunits: 2048
12 | grad-clip: 5
13 | lsm-weight: 0.1
14 | model-module: espnet.nets.pytorch_backend.streaming_transformer:E2E
15 | mtlalpha: 0.3
16 | opt: noam
17 | patience: 0
18 | sortagrad: 0
19 | transformer-attn-dropout-rate: 0.0
20 | transformer-init: pytorch
21 | transformer-input-layer: conv2d
22 | transformer-length-normalized-loss: false
23 | transformer-lr: 1.0
24 | transformer-warmup-steps: 2500
25 | n-iter-processes: 0
26 | 
27 | enc-init: /path/to/model
28 | enc-init-mods: encoder,ctc,decoder
29 | 
30 | streaming: true
31 | chunk: true
32 | chunk-size: 32
33 | 


--------------------------------------------------------------------------------
/egs/librispeech/asr1/decode.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | . ./path.sh || exit 1
 3 | . ./cmd.sh || exit 1
 4 | dataset='test_clean test_other'
 5 | config=conf/decode.yaml
 6 | ngpu=$1
 7 | jobperGPU=$2
 8 | ((nblock=$ngpu*$jobperGPU))
 9 | startblock=1
10 | endblock=${nblock}+1
11 | model_dir=exp/streaming_transformer
12 | rnnlm=exp/irielm.ep11.last5.avg/rnnlm.model.best
13 | 
14 | average_checkpoints.py \
15 |     --backend pytorch \
16 |     --snapshots ${model_dir}/results/snapshot.ep.* \
17 |     --out ${model_dir}/results/model.last5.avg.best \
18 |     --num 5
19 | 
20 | function run(){
21 | part=$1
22 | cgpu=$[$2+0]
23 | recog_set=$3
24 | echo $cgpu
25 | export CUDA_VISIBLE_DEVICES=$cgpu
26 | mkdir -p ${model_dir}/decode/${recog_set}/log
27 | ${decode_cmd} ${model_dir}/decode/${recog_set}/log/decode.${part}.log \
28 |     asr_recog.py \
29 |     --config ${config} \
30 |     --ngpu 1 \
31 |     --rnnlm ${rnnlm} \
32 |     --recog-json dump/${recog_set}/deltafalse/split${nblock}utt/data_unigram5000.${part}.json \
33 |     --model ${model_dir}/results/model.last5.avg.best \
34 |     --result-label ${model_dir}/decode/${recog_set}/data.${part}.json
35 | }
36 | 
37 | function recog(){
38 | recog_set=$1
39 | decode_dir=${model_dir}/decode/${recog_set}
40 | 
41 | splitjson.py --parts ${nblock} dump/${recog_set}/deltafalse/data_unigram5000.json 
42 | 
43 | for ((i=$startblock;i<$endblock;i+=${ngpu}));
44 | do
45 |     for ((j=0;j<${ngpu};j++));
46 |     do
47 |         run $(($i+$j)) $j $recog_set &
48 |     done
49 | 
50 |     if [ $(( $[$i+1]%${jobperGPU} )) -eq 0 ]
51 |     then
52 |     wait;
53 |     fi
54 | done
55 | wait
56 | score_sclite.sh --bpe 5000 --bpemodel data/lang_char/train_unigram5000.model --wer true \
57 |     ${model_dir}/decode/${recog_set} data/lang_char/train_unigram5000_units.txt
58 | }
59 | 
60 | for recog_set in ${dataset}; do
61 |     recog $recog_set
62 | done
63 | 


--------------------------------------------------------------------------------
/egs/librispeech/asr1/local/data_prep.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # Copyright 2014  Vassil Panayotov
 4 | #           2014  Johns Hopkins University (author: Daniel Povey)
 5 | # Apache 2.0
 6 | 
 7 | if [ "$#" -ne 2 ]; then
 8 |   echo "Usage: $0 <src-dir> <dst-dir>"
 9 |   echo "e.g.: $0 /export/a15/vpanayotov/data/LibriSpeech/dev-clean data/dev-clean"
10 |   exit 1
11 | fi
12 | 
13 | src=$1
14 | dst=$2
15 | 
16 | # all utterances are FLAC compressed
17 | if ! which flac >&/dev/null; then
18 |    echo "Please install 'flac' on ALL worker nodes!"
19 |    exit 1
20 | fi
21 | 
22 | spk_file=$src/../SPEAKERS.TXT
23 | 
24 | mkdir -p $dst || exit 1
25 | 
26 | [ ! -d $src ] && echo "$0: no such directory $src" && exit 1
27 | [ ! -f $spk_file ] && echo "$0: expected file $spk_file to exist" && exit 1
28 | 
29 | 
30 | wav_scp=$dst/wav.scp; [[ -f "$wav_scp" ]] && rm $wav_scp
31 | trans=$dst/text; [[ -f "$trans" ]] && rm $trans
32 | utt2spk=$dst/utt2spk; [[ -f "$utt2spk" ]] && rm $utt2spk
33 | spk2gender=$dst/spk2gender; [[ -f $spk2gender ]] && rm $spk2gender
34 | 
35 | for reader_dir in $(find -L $src -mindepth 1 -maxdepth 1 -type d | sort); do
36 |   reader=$(basename $reader_dir)
37 |   if ! [ $reader -eq $reader ]; then  # not integer.
38 |     echo "$0: unexpected subdirectory name $reader"
39 |     exit 1
40 |   fi
41 | 
42 |   reader_gender=$(egrep "^$reader[ ]+\|" $spk_file | awk -F'|' '{gsub(/[ ]+/, ""); print tolower($2)}')
43 |   if [ "$reader_gender" != 'm' ] && [ "$reader_gender" != 'f' ]; then
44 |     echo "Unexpected gender: '$reader_gender'"
45 |     exit 1
46 |   fi
47 | 
48 |   for chapter_dir in $(find -L $reader_dir/ -mindepth 1 -maxdepth 1 -type d | sort); do
49 |     chapter=$(basename $chapter_dir)
50 |     if ! [ "$chapter" -eq "$chapter" ]; then
51 |       echo "$0: unexpected chapter-subdirectory name $chapter"
52 |       exit 1
53 |     fi
54 | 
55 |     find -L $chapter_dir/ -iname "*.flac" | sort | xargs -I% basename % .flac | \
56 |       awk -v "dir=$chapter_dir" '{printf "%s flac -c -d -s %s/%s.flac |\n", $0, dir, $0}' >>$wav_scp|| exit 1
57 | 
58 |     chapter_trans=$chapter_dir/${reader}-${chapter}.trans.txt
59 |     [ ! -f  $chapter_trans ] && echo "$0: expected file $chapter_trans to exist" && exit 1
60 |     cat $chapter_trans >>$trans
61 | 
62 |     # NOTE: For now we are using per-chapter utt2spk. That is each chapter is considered
63 |     #       to be a different speaker. This is done for simplicity and because we want
64 |     #       e.g. the CMVN to be calculated per-chapter
65 |     awk -v "reader=$reader" -v "chapter=$chapter" '{printf "%s %s-%s\n", $1, reader, chapter}' \
66 |       <$chapter_trans >>$utt2spk || exit 1
67 | 
68 |     # reader -> gender map (again using per-chapter granularity)
69 |     echo "${reader}-${chapter} $reader_gender" >>$spk2gender
70 |   done
71 | done
72 | 
73 | spk2utt=$dst/spk2utt
74 | utils/utt2spk_to_spk2utt.pl <$utt2spk >$spk2utt || exit 1
75 | 
76 | ntrans=$(wc -l <$trans)
77 | nutt2spk=$(wc -l <$utt2spk)
78 | ! [ "$ntrans" -eq "$nutt2spk" ] && \
79 |   echo "Inconsistent #transcripts($ntrans) and #utt2spk($nutt2spk)" && exit 1
80 | 
81 | utils/validate_data_dir.sh --no-feats $dst || exit 1
82 | 
83 | echo "$0: successfully prepared data in $dst"
84 | 
85 | exit 0
86 | 


--------------------------------------------------------------------------------
/egs/librispeech/asr1/local/download_and_untar.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # Copyright   2014  Johns Hopkins University (author: Daniel Povey)
 4 | # Apache 2.0
 5 | 
 6 | remove_archive=false
 7 | 
 8 | if [ "$1" == --remove-archive ]; then
 9 |   remove_archive=true
10 |   shift
11 | fi
12 | 
13 | if [ $# -ne 3 ]; then
14 |   echo "Usage: $0 [--remove-archive] <data-base> <url-base> <corpus-part>"
15 |   echo "e.g.: $0 /export/a15/vpanayotov/data www.openslr.org/resources/11 dev-clean"
16 |   echo "With --remove-archive it will remove the archive after successfully un-tarring it."
17 |   echo "<corpus-part> can be one of: dev-clean, test-clean, dev-other, test-other,"
18 |   echo "          train-clean-100, train-clean-360, train-other-500."
19 |   exit 1
20 | fi
21 | 
22 | data=$1
23 | url=$2
24 | part=$3
25 | 
26 | if [ ! -d "$data" ]; then
27 |   echo "$0: no such directory $data"
28 |   exit 1
29 | fi
30 | 
31 | part_ok=false
32 | list="dev-clean test-clean dev-other test-other train-clean-100 train-clean-360 train-other-500"
33 | for x in $list; do
34 |   if [ "$part" == $x ]; then part_ok=true; fi
35 | done
36 | if ! $part_ok; then
37 |   echo "$0: expected <corpus-part> to be one of $list, but got '$part'"
38 |   exit 1
39 | fi
40 | 
41 | if [ -z "$url" ]; then
42 |   echo "$0: empty URL base."
43 |   exit 1
44 | fi
45 | 
46 | if [ -f $data/LibriSpeech/$part/.complete ]; then
47 |   echo "$0: data part $part was already successfully extracted, nothing to do."
48 |   exit 0
49 | fi
50 | 
51 | 
52 | # sizes of the archive files in bytes.  This is some older versions.
53 | sizes_old="371012589 347390293 379743611 361838298 6420417880 23082659865 30626749128"
54 | # sizes_new is the archive file sizes of the final release.  Some of these sizes are of
55 | # things we probably won't download.
56 | sizes_new="337926286 314305928 695964615 297279345 87960560420 33373768 346663984 328757843 6387309499 23049477885 30593501606"
57 | 
58 | if [ -f $data/$part.tar.gz ]; then
59 |   size=$(/bin/ls -l $data/$part.tar.gz | awk '{print $5}')
60 |   size_ok=false
61 |   for s in $sizes_old $sizes_new; do if [ $s == $size ]; then size_ok=true; fi; done
62 |   if ! $size_ok; then
63 |     echo "$0: removing existing file $data/$part.tar.gz because its size in bytes $size"
64 |     echo "does not equal the size of one of the archives."
65 |     rm $data/$part.tar.gz
66 |   else
67 |     echo "$data/$part.tar.gz exists and appears to be complete."
68 |   fi
69 | fi
70 | 
71 | if [ ! -f $data/$part.tar.gz ]; then
72 |   if ! which wget >/dev/null; then
73 |     echo "$0: wget is not installed."
74 |     exit 1
75 |   fi
76 |   full_url=$url/$part.tar.gz
77 |   echo "$0: downloading data from $full_url.  This may take some time, please be patient."
78 | 
79 |   if ! wget -P $data --no-check-certificate $full_url; then
80 |     echo "$0: error executing wget $full_url"
81 |     exit 1
82 |   fi
83 | fi
84 | 
85 | if ! tar -C $data -xvzf $data/$part.tar.gz; then
86 |   echo "$0: error un-tarring archive $data/$part.tar.gz"
87 |   exit 1
88 | fi
89 | 
90 | touch $data/LibriSpeech/$part/.complete
91 | 
92 | echo "$0: Successfully downloaded and un-tarred $data/$part.tar.gz"
93 | 
94 | if $remove_archive; then
95 |   echo "$0: removing $data/$part.tar.gz file since --remove-archive option was supplied."
96 |   rm $data/$part.tar.gz
97 | fi
98 | 


--------------------------------------------------------------------------------
/egs/librispeech/asr1/path.sh:
--------------------------------------------------------------------------------
 1 | MAIN_ROOT=$PWD/../../..
 2 | KALDI_ROOT=$MAIN_ROOT/tools/kaldi
 3 | 
 4 | [ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh
 5 | export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/tools/sctk/bin:$PWD:$PATH
 6 | [ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1
 7 | . $KALDI_ROOT/tools/config/common_path.sh
 8 | export LC_ALL=C
 9 | export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:$MAIN_ROOT/tools/chainer_ctc/ext/warp-ctc/build
10 | if [ -e $MAIN_ROOT/tools/venv/etc/profile.d/conda.sh ]; then
11 |     source $MAIN_ROOT/tools/venv/etc/profile.d/conda.sh && conda deactivate && conda activate
12 | else
13 |     source $MAIN_ROOT/tools/venv/bin/activate
14 | fi
15 | export PATH=$MAIN_ROOT/utils:$MAIN_ROOT/espnet/bin:$PATH
16 | 
17 | export OMP_NUM_THREADS=1
18 | 
19 | # NOTE(kan-bayashi): Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
20 | export PYTHONIOENCODING=UTF-8
21 | 


--------------------------------------------------------------------------------
/egs/librispeech/asr1/steps:
--------------------------------------------------------------------------------
1 | ../../../tools/kaldi/egs/wsj/s5/steps


--------------------------------------------------------------------------------
/egs/librispeech/asr1/train.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # general configuration
 3 | . ./path.sh || exit 1
 4 | . ./cmd.sh || exit 1
 5 | 
 6 | ngpu=4
 7 | debugmode=1
 8 | verbose=0
 9 | resume=
10 | 
11 | workdir=$(pwd)
12 | config=${workdir}/conf/train_streaming_transformer.yaml
13 | 
14 | expname=streaming_transformer
15 | expdir=exp
16 | exppath=${expdir}/${expname}
17 | 
18 | mkdir -p ${exppath}
19 | 
20 | ${cuda_cmd} --gpu ${ngpu} ${exppath}/train.log \
21 |     asr_train.py \
22 |     --config ${config} \
23 |     --preprocess-conf ${workdir}/conf/specaug.yaml \
24 |     --ngpu ${ngpu} \
25 |     --outdir ${exppath}/results \
26 |     --train-json dump/train_960/deltafalse/data_aligned.json \
27 |     --valid-json dump/dev/deltafalse/data_aligned.json \
28 |     --resume ${resume} \
29 |     --dict data/lang_char/train_960_unigram5000_units.txt
30 | 
31 | 


--------------------------------------------------------------------------------
/egs/librispeech/asr1/utils:
--------------------------------------------------------------------------------
1 | ../../../tools/kaldi/egs/wsj/s5/utils


--------------------------------------------------------------------------------
/egs/librispeech/asr1/viterbi_decode.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # general configuration
 3 | . ./path.sh || exit 1
 4 | train_set=train_960
 5 | train_dev=dev
 6 | 
 7 | dumpdir=dump
 8 | model=$1
 9 | 
10 | python ../../../espnet/bin/asr_recog.py \
11 | 	--ngpu 1 \
12 |         --viterbi true \
13 |         --model ${model} \
14 | 	--recog-json dump/${train_set}/deltafalse/data_unigram5000.json \
15 |         --result-label dump/${train_set}/deltafalse/data_aligned.json
16 | 
17 | python ../../../espnet/bin/asr_recog.py \
18 |         --ngpu 1 \
19 |         --viterbi true \
20 |         --model ${model} \
21 |         --recog-json dump/${train_dev}/deltafalse/data_unigram5000.json \
22 |         --result-label dump/${train_dev}/deltafalse/data_aligned.json
23 | 
24 | 
25 | 


--------------------------------------------------------------------------------
/espnet/__init__.py:
--------------------------------------------------------------------------------
 1 | """Initialize main package."""
 2 | 
 3 | import pkg_resources
 4 | 
 5 | try:
 6 |     __version__ = pkg_resources.get_distribution("espnet").version
 7 | except Exception:
 8 |     __version__ = "(Not installed from setup.py)"
 9 | del pkg_resources
10 | 


--------------------------------------------------------------------------------
/espnet/__pycache__/__init__.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cywang97/StreamingTransformer/4b56931a311d65686d310c54cc6896a4be4f47de/espnet/__pycache__/__init__.cpython-37.pyc


--------------------------------------------------------------------------------
/espnet/asr/__init__.py:
--------------------------------------------------------------------------------
1 | """Initialize sub package."""
2 | 


--------------------------------------------------------------------------------
/espnet/asr/__pycache__/__init__.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cywang97/StreamingTransformer/4b56931a311d65686d310c54cc6896a4be4f47de/espnet/asr/__pycache__/__init__.cpython-37.pyc


--------------------------------------------------------------------------------
/espnet/asr/__pycache__/asr_utils.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cywang97/StreamingTransformer/4b56931a311d65686d310c54cc6896a4be4f47de/espnet/asr/__pycache__/asr_utils.cpython-37.pyc


--------------------------------------------------------------------------------
/espnet/asr/pytorch_backend/__init__.py:
--------------------------------------------------------------------------------
1 | """Initialize sub package."""
2 | 


--------------------------------------------------------------------------------
/espnet/asr/pytorch_backend/__pycache__/__init__.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cywang97/StreamingTransformer/4b56931a311d65686d310c54cc6896a4be4f47de/espnet/asr/pytorch_backend/__pycache__/__init__.cpython-37.pyc


--------------------------------------------------------------------------------
/espnet/asr/pytorch_backend/__pycache__/asr_ddp.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cywang97/StreamingTransformer/4b56931a311d65686d310c54cc6896a4be4f47de/espnet/asr/pytorch_backend/__pycache__/asr_ddp.cpython-37.pyc


--------------------------------------------------------------------------------
/espnet/asr/pytorch_backend/__pycache__/asr_init.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cywang97/StreamingTransformer/4b56931a311d65686d310c54cc6896a4be4f47de/espnet/asr/pytorch_backend/__pycache__/asr_init.cpython-37.pyc


--------------------------------------------------------------------------------
/espnet/asr/pytorch_backend/__pycache__/asr_recog.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cywang97/StreamingTransformer/4b56931a311d65686d310c54cc6896a4be4f47de/espnet/asr/pytorch_backend/__pycache__/asr_recog.cpython-37.pyc


--------------------------------------------------------------------------------
/espnet/bin/__init__.py:
--------------------------------------------------------------------------------
1 | """Initialize sub package."""
2 | 


--------------------------------------------------------------------------------
/espnet/bin/__pycache__/__init__.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cywang97/StreamingTransformer/4b56931a311d65686d310c54cc6896a4be4f47de/espnet/bin/__pycache__/__init__.cpython-37.pyc


--------------------------------------------------------------------------------
/espnet/bin/__pycache__/asr_train.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cywang97/StreamingTransformer/4b56931a311d65686d310c54cc6896a4be4f47de/espnet/bin/__pycache__/asr_train.cpython-37.pyc


--------------------------------------------------------------------------------
/espnet/nets/__init__.py:
--------------------------------------------------------------------------------
1 | """Initialize sub package."""
2 | 


--------------------------------------------------------------------------------
/espnet/nets/__pycache__/__init__.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cywang97/StreamingTransformer/4b56931a311d65686d310c54cc6896a4be4f47de/espnet/nets/__pycache__/__init__.cpython-37.pyc


--------------------------------------------------------------------------------
/espnet/nets/__pycache__/asr_interface.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cywang97/StreamingTransformer/4b56931a311d65686d310c54cc6896a4be4f47de/espnet/nets/__pycache__/asr_interface.cpython-37.pyc


--------------------------------------------------------------------------------
/espnet/nets/__pycache__/ctc_prefix_score.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cywang97/StreamingTransformer/4b56931a311d65686d310c54cc6896a4be4f47de/espnet/nets/__pycache__/ctc_prefix_score.cpython-37.pyc


--------------------------------------------------------------------------------
/espnet/nets/__pycache__/e2e_asr_common.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cywang97/StreamingTransformer/4b56931a311d65686d310c54cc6896a4be4f47de/espnet/nets/__pycache__/e2e_asr_common.cpython-37.pyc


--------------------------------------------------------------------------------
/espnet/nets/__pycache__/lm_interface.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cywang97/StreamingTransformer/4b56931a311d65686d310c54cc6896a4be4f47de/espnet/nets/__pycache__/lm_interface.cpython-37.pyc


--------------------------------------------------------------------------------
/espnet/nets/__pycache__/scorer_interface.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cywang97/StreamingTransformer/4b56931a311d65686d310c54cc6896a4be4f47de/espnet/nets/__pycache__/scorer_interface.cpython-37.pyc


--------------------------------------------------------------------------------
/espnet/nets/__pycache__/viterbi_align.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cywang97/StreamingTransformer/4b56931a311d65686d310c54cc6896a4be4f47de/espnet/nets/__pycache__/viterbi_align.cpython-37.pyc


--------------------------------------------------------------------------------
/espnet/nets/lm_interface.py:
--------------------------------------------------------------------------------
 1 | """Language model interface."""
 2 | 
 3 | import argparse
 4 | 
 5 | from espnet.nets.scorer_interface import ScorerInterface
 6 | from espnet.utils.dynamic_import import dynamic_import
 7 | from espnet.utils.fill_missing_args import fill_missing_args
 8 | 
 9 | 
10 | class LMInterface(ScorerInterface):
11 |     """LM Interface for ESPnet model implementation."""
12 | 
13 |     @staticmethod
14 |     def add_arguments(parser):
15 |         """Add arguments to command line argument parser."""
16 |         return parser
17 | 
18 |     @classmethod
19 |     def build(cls, n_vocab: int, **kwargs):
20 |         """Initialize this class with python-level args.
21 | 
22 |         Args:
23 |             idim (int): The number of vocabulary.
24 | 
25 |         Returns:
26 |             LMinterface: A new instance of LMInterface.
27 | 
28 |         """
29 |         # local import to avoid cyclic import in lm_train
30 |         from espnet.bin.lm_train import get_parser
31 | 
32 |         def wrap(parser):
33 |             return get_parser(parser, required=False)
34 | 
35 |         args = argparse.Namespace(**kwargs)
36 |         args = fill_missing_args(args, wrap)
37 |         args = fill_missing_args(args, cls.add_arguments)
38 |         return cls(n_vocab, args)
39 | 
40 |     def forward(self, x, t):
41 |         """Compute LM loss value from buffer sequences.
42 | 
43 |         Args:
44 |             x (torch.Tensor): Input ids. (batch, len)
45 |             t (torch.Tensor): Target ids. (batch, len)
46 | 
47 |         Returns:
48 |             tuple[torch.Tensor, torch.Tensor, torch.Tensor]: Tuple of
49 |                 loss to backward (scalar),
50 |                 negative log-likelihood of t: -log p(t) (scalar) and
51 |                 the number of elements in x (scalar)
52 | 
53 |         Notes:
54 |             The last two return values are used
55 |             in perplexity: p(t)^{-n} = exp(-log p(t) / n)
56 | 
57 |         """
58 |         raise NotImplementedError("forward method is not implemented")
59 | 
60 | 
61 | predefined_lms = {
62 |     "pytorch": {
63 |         "default": "espnet.nets.pytorch_backend.lm.default:DefaultRNNLM",
64 |         "seq_rnn": "espnet.nets.pytorch_backend.lm.seq_rnn:SequentialRNNLM",
65 |         "transformer": "espnet.nets.pytorch_backend.lm.transformer:TransformerLM",
66 |     },
67 |     "chainer": {"default": "espnet.lm.chainer_backend.lm:DefaultRNNLM"},
68 | }
69 | 
70 | 
71 | def dynamic_import_lm(module, backend):
72 |     """Import LM class dynamically.
73 | 
74 |     Args:
75 |         module (str): module_name:class_name or alias in `predefined_lms`
76 |         backend (str): NN backend. e.g., pytorch, chainer
77 | 
78 |     Returns:
79 |         type: LM class
80 | 
81 |     """
82 |     model_class = dynamic_import(module, predefined_lms.get(backend, dict()))
83 |     assert issubclass(
84 |         model_class, LMInterface
85 |     ), f"{module} does not implement LMInterface"
86 |     return model_class
87 | 


--------------------------------------------------------------------------------
/espnet/nets/pytorch_backend/__init__.py:
--------------------------------------------------------------------------------
1 | """Initialize sub package."""
2 | 


--------------------------------------------------------------------------------
/espnet/nets/pytorch_backend/__pycache__/__init__.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cywang97/StreamingTransformer/4b56931a311d65686d310c54cc6896a4be4f47de/espnet/nets/pytorch_backend/__pycache__/__init__.cpython-37.pyc


--------------------------------------------------------------------------------
/espnet/nets/pytorch_backend/__pycache__/ctc.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cywang97/StreamingTransformer/4b56931a311d65686d310c54cc6896a4be4f47de/espnet/nets/pytorch_backend/__pycache__/ctc.cpython-37.pyc


--------------------------------------------------------------------------------
/espnet/nets/pytorch_backend/__pycache__/e2e_asr_transformer.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cywang97/StreamingTransformer/4b56931a311d65686d310c54cc6896a4be4f47de/espnet/nets/pytorch_backend/__pycache__/e2e_asr_transformer.cpython-37.pyc


--------------------------------------------------------------------------------
/espnet/nets/pytorch_backend/__pycache__/nets_utils.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cywang97/StreamingTransformer/4b56931a311d65686d310c54cc6896a4be4f47de/espnet/nets/pytorch_backend/__pycache__/nets_utils.cpython-37.pyc


--------------------------------------------------------------------------------
/espnet/nets/pytorch_backend/__pycache__/streaming_transformer.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cywang97/StreamingTransformer/4b56931a311d65686d310c54cc6896a4be4f47de/espnet/nets/pytorch_backend/__pycache__/streaming_transformer.cpython-37.pyc


--------------------------------------------------------------------------------
/espnet/nets/pytorch_backend/lm/__init__.py:
--------------------------------------------------------------------------------
1 | """Initialize sub package."""
2 | 


--------------------------------------------------------------------------------
/espnet/nets/pytorch_backend/lm/__pycache__/__init__.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cywang97/StreamingTransformer/4b56931a311d65686d310c54cc6896a4be4f47de/espnet/nets/pytorch_backend/lm/__pycache__/__init__.cpython-37.pyc


--------------------------------------------------------------------------------
/espnet/nets/pytorch_backend/lm/__pycache__/default.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cywang97/StreamingTransformer/4b56931a311d65686d310c54cc6896a4be4f47de/espnet/nets/pytorch_backend/lm/__pycache__/default.cpython-37.pyc


--------------------------------------------------------------------------------
/espnet/nets/pytorch_backend/transformer/__init__.py:
--------------------------------------------------------------------------------
1 | """Initialize sub package."""
2 | 


--------------------------------------------------------------------------------
/espnet/nets/pytorch_backend/transformer/__pycache__/__init__.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cywang97/StreamingTransformer/4b56931a311d65686d310c54cc6896a4be4f47de/espnet/nets/pytorch_backend/transformer/__pycache__/__init__.cpython-37.pyc


--------------------------------------------------------------------------------
/espnet/nets/pytorch_backend/transformer/__pycache__/add_sos_eos.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cywang97/StreamingTransformer/4b56931a311d65686d310c54cc6896a4be4f47de/espnet/nets/pytorch_backend/transformer/__pycache__/add_sos_eos.cpython-37.pyc


--------------------------------------------------------------------------------
/espnet/nets/pytorch_backend/transformer/__pycache__/attention.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cywang97/StreamingTransformer/4b56931a311d65686d310c54cc6896a4be4f47de/espnet/nets/pytorch_backend/transformer/__pycache__/attention.cpython-37.pyc


--------------------------------------------------------------------------------
/espnet/nets/pytorch_backend/transformer/__pycache__/decoder.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cywang97/StreamingTransformer/4b56931a311d65686d310c54cc6896a4be4f47de/espnet/nets/pytorch_backend/transformer/__pycache__/decoder.cpython-37.pyc


--------------------------------------------------------------------------------
/espnet/nets/pytorch_backend/transformer/__pycache__/decoder_layer.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cywang97/StreamingTransformer/4b56931a311d65686d310c54cc6896a4be4f47de/espnet/nets/pytorch_backend/transformer/__pycache__/decoder_layer.cpython-37.pyc


--------------------------------------------------------------------------------
/espnet/nets/pytorch_backend/transformer/__pycache__/embedding.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cywang97/StreamingTransformer/4b56931a311d65686d310c54cc6896a4be4f47de/espnet/nets/pytorch_backend/transformer/__pycache__/embedding.cpython-37.pyc


--------------------------------------------------------------------------------
/espnet/nets/pytorch_backend/transformer/__pycache__/encoder.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cywang97/StreamingTransformer/4b56931a311d65686d310c54cc6896a4be4f47de/espnet/nets/pytorch_backend/transformer/__pycache__/encoder.cpython-37.pyc


--------------------------------------------------------------------------------
/espnet/nets/pytorch_backend/transformer/__pycache__/encoder_layer.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cywang97/StreamingTransformer/4b56931a311d65686d310c54cc6896a4be4f47de/espnet/nets/pytorch_backend/transformer/__pycache__/encoder_layer.cpython-37.pyc


--------------------------------------------------------------------------------
/espnet/nets/pytorch_backend/transformer/__pycache__/initializer.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cywang97/StreamingTransformer/4b56931a311d65686d310c54cc6896a4be4f47de/espnet/nets/pytorch_backend/transformer/__pycache__/initializer.cpython-37.pyc


--------------------------------------------------------------------------------
/espnet/nets/pytorch_backend/transformer/__pycache__/label_smoothing_loss.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cywang97/StreamingTransformer/4b56931a311d65686d310c54cc6896a4be4f47de/espnet/nets/pytorch_backend/transformer/__pycache__/label_smoothing_loss.cpython-37.pyc


--------------------------------------------------------------------------------
/espnet/nets/pytorch_backend/transformer/__pycache__/layer_norm.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cywang97/StreamingTransformer/4b56931a311d65686d310c54cc6896a4be4f47de/espnet/nets/pytorch_backend/transformer/__pycache__/layer_norm.cpython-37.pyc


--------------------------------------------------------------------------------
/espnet/nets/pytorch_backend/transformer/__pycache__/mask.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cywang97/StreamingTransformer/4b56931a311d65686d310c54cc6896a4be4f47de/espnet/nets/pytorch_backend/transformer/__pycache__/mask.cpython-37.pyc


--------------------------------------------------------------------------------
/espnet/nets/pytorch_backend/transformer/__pycache__/multi_layer_conv.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cywang97/StreamingTransformer/4b56931a311d65686d310c54cc6896a4be4f47de/espnet/nets/pytorch_backend/transformer/__pycache__/multi_layer_conv.cpython-37.pyc


--------------------------------------------------------------------------------
/espnet/nets/pytorch_backend/transformer/__pycache__/optimizer.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cywang97/StreamingTransformer/4b56931a311d65686d310c54cc6896a4be4f47de/espnet/nets/pytorch_backend/transformer/__pycache__/optimizer.cpython-37.pyc


--------------------------------------------------------------------------------
/espnet/nets/pytorch_backend/transformer/__pycache__/plot.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cywang97/StreamingTransformer/4b56931a311d65686d310c54cc6896a4be4f47de/espnet/nets/pytorch_backend/transformer/__pycache__/plot.cpython-37.pyc


--------------------------------------------------------------------------------
/espnet/nets/pytorch_backend/transformer/__pycache__/positionwise_feed_forward.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cywang97/StreamingTransformer/4b56931a311d65686d310c54cc6896a4be4f47de/espnet/nets/pytorch_backend/transformer/__pycache__/positionwise_feed_forward.cpython-37.pyc


--------------------------------------------------------------------------------
/espnet/nets/pytorch_backend/transformer/__pycache__/repeat.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cywang97/StreamingTransformer/4b56931a311d65686d310c54cc6896a4be4f47de/espnet/nets/pytorch_backend/transformer/__pycache__/repeat.cpython-37.pyc


--------------------------------------------------------------------------------
/espnet/nets/pytorch_backend/transformer/__pycache__/subsampling.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cywang97/StreamingTransformer/4b56931a311d65686d310c54cc6896a4be4f47de/espnet/nets/pytorch_backend/transformer/__pycache__/subsampling.cpython-37.pyc


--------------------------------------------------------------------------------
/espnet/nets/pytorch_backend/transformer/add_sos_eos.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | # -*- coding: utf-8 -*-
 3 | 
 4 | # Copyright 2019 Shigeki Karita
 5 | #  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)
 6 | 
 7 | """Unility funcitons for Transformer."""
 8 | 
 9 | import torch
10 | 
11 | 
12 | def add_sos_eos(ys_pad, sos, eos, ignore_id):
13 |     """Add <sos> and <eos> labels.
14 | 
15 |     :param torch.Tensor ys_pad: batch of padded target sequences (B, Lmax)
16 |     :param int sos: index of <sos>
17 |     :param int eos: index of <eeos>
18 |     :param int ignore_id: index of padding
19 |     :return: padded tensor (B, Lmax)
20 |     :rtype: torch.Tensor
21 |     :return: padded tensor (B, Lmax)
22 |     :rtype: torch.Tensor
23 |     """
24 |     from espnet.nets.pytorch_backend.nets_utils import pad_list
25 | 
26 |     _sos = ys_pad.new([sos])
27 |     _eos = ys_pad.new([eos])
28 |     ys = [y[y != ignore_id] for y in ys_pad]  # parse padded ys
29 |     ys_in = [torch.cat([_sos, y], dim=0) for y in ys]
30 |     ys_out = [torch.cat([y, _eos], dim=0) for y in ys]
31 |     return pad_list(ys_in, eos), pad_list(ys_out, ignore_id)
32 | 


--------------------------------------------------------------------------------
/espnet/nets/pytorch_backend/transformer/attention.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | # -*- coding: utf-8 -*-
 3 | 
 4 | # Copyright 2019 Shigeki Karita
 5 | #  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)
 6 | 
 7 | """Multi-Head Attention layer definition."""
 8 | 
 9 | import math
10 | import logging
11 | 
12 | import numpy
13 | import torch
14 | from torch import nn
15 | 
16 | 
17 | class MultiHeadedAttention(nn.Module):
18 |     """Multi-Head Attention layer.
19 | 
20 |     :param int n_head: the number of head s
21 |     :param int n_feat: the number of features
22 |     :param float dropout_rate: dropout rate
23 | 
24 |     """
25 | 
26 |     def __init__(self, n_head, n_feat, dropout_rate):
27 |         """Construct an MultiHeadedAttention object."""
28 |         super(MultiHeadedAttention, self).__init__()
29 |         assert n_feat % n_head == 0
30 |         # We assume d_v always equals d_k
31 |         self.d_k = n_feat // n_head
32 |         self.h = n_head
33 |         self.linear_q = nn.Linear(n_feat, n_feat)
34 |         self.linear_k = nn.Linear(n_feat, n_feat)
35 |         self.linear_v = nn.Linear(n_feat, n_feat)
36 |         self.linear_out = nn.Linear(n_feat, n_feat)
37 |         self.attn = None
38 |         self.dropout = nn.Dropout(p=dropout_rate)
39 | 
40 |     def forward(self, query, key, value, mask):
41 |         """Compute 'Scaled Dot Product Attention'.
42 | 
43 |         :param torch.Tensor query: (batch, time1, size)
44 |         :param torch.Tensor key: (batch, time2, size)
45 |         :param torch.Tensor value: (batch, time2, size)
46 |         :param torch.Tensor mask: (batch, time1, time2)
47 |         :param torch.nn.Dropout dropout:
48 |         :return torch.Tensor: attentined and transformed `value` (batch, time1, d_model)
49 |              weighted by the query dot key attention (batch, head, time1, time2)
50 |         """
51 |         n_batch = query.size(0)
52 |         q = self.linear_q(query).view(n_batch, -1, self.h, self.d_k)
53 |         k = self.linear_k(key).view(n_batch, -1, self.h, self.d_k)
54 |         v = self.linear_v(value).view(n_batch, -1, self.h, self.d_k)
55 |         q = q.transpose(1, 2)  # (batch, head, time1, d_k)
56 |         k = k.transpose(1, 2)  # (batch, head, time2, d_k)
57 |         v = v.transpose(1, 2)  # (batch, head, time2, d_k)
58 | 
59 |         scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(
60 |             self.d_k
61 |         )  # (batch, head, time1, time2)
62 |         if mask is not None:
63 |             mask = mask.unsqueeze(1).eq(0)  # (batch, 1, time1, time2)
64 |             min_value = float(
65 |                 numpy.finfo(torch.tensor(0, dtype=scores.dtype).numpy().dtype).min
66 |             )
67 |             scores = scores.masked_fill(mask, min_value)
68 |             self.attn = torch.softmax(scores, dim=-1).masked_fill(
69 |                 mask, 0.0
70 |             )  # (batch, head, time1, time2)
71 |         else:
72 |             self.attn = torch.softmax(scores, dim=-1)  # (batch, head, time1, time2)
73 | 
74 |         p_attn = self.dropout(self.attn)
75 |         x = torch.matmul(p_attn, v)  # (batch, head, time1, d_k)
76 |         x = (
77 |             x.transpose(1, 2).contiguous().view(n_batch, -1, self.h * self.d_k)
78 |         )  # (batch, time1, d_model)
79 |         return self.linear_out(x)  # (batch, time1, d_model)
80 | 


--------------------------------------------------------------------------------
/espnet/nets/pytorch_backend/transformer/embedding.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | # -*- coding: utf-8 -*-
 3 | 
 4 | # Copyright 2019 Shigeki Karita
 5 | #  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)
 6 | 
 7 | """Positonal Encoding Module."""
 8 | 
 9 | import math
10 | 
11 | import torch
12 | 
13 | 
14 | def _pre_hook(state_dict, prefix, local_metadata, strict,
15 |               missing_keys, unexpected_keys, error_msgs):
16 |     """Perform pre-hook in load_state_dict for backward compatibility.
17 | 
18 |     Note:
19 |         We saved self.pe until v.0.5.2 but we have omitted it later.
20 |         Therefore, we remove the item "pe" from `state_dict` for backward compatibility.
21 | 
22 |     """
23 |     k = prefix + "pe"
24 |     if k in state_dict:
25 |         state_dict.pop(k)
26 | 
27 | 
28 | class PositionalEncoding(torch.nn.Module):
29 |     """Positional encoding.
30 | 
31 |     :param int d_model: embedding dim
32 |     :param float dropout_rate: dropout rate
33 |     :param int max_len: maximum input length
34 | 
35 |     """
36 | 
37 |     def __init__(self, d_model, dropout_rate, max_len=5000):
38 |         """Construct an PositionalEncoding object."""
39 |         super(PositionalEncoding, self).__init__()
40 |         self.d_model = d_model
41 |         self.xscale = math.sqrt(self.d_model)
42 |         self.dropout = torch.nn.Dropout(p=dropout_rate)
43 |         self.pe = None
44 |         self.extend_pe(torch.tensor(0.0).expand(1, max_len))
45 |         self._register_load_state_dict_pre_hook(_pre_hook)
46 | 
47 |     def extend_pe(self, x):
48 |         """Reset the positional encodings."""
49 |         if self.pe is not None:
50 |             if self.pe.size(1) >= x.size(1):
51 |                 if self.pe.dtype != x.dtype or self.pe.device != x.device:
52 |                     self.pe = self.pe.to(dtype=x.dtype, device=x.device)
53 |                 return
54 |         pe = torch.zeros(x.size(1), self.d_model)
55 |         position = torch.arange(0, x.size(1), dtype=torch.float32).unsqueeze(1)
56 |         div_term = torch.exp(
57 |             torch.arange(0, self.d_model, 2, dtype=torch.float32)
58 |             * -(math.log(10000.0) / self.d_model)
59 |         )
60 |         pe[:, 0::2] = torch.sin(position * div_term)
61 |         pe[:, 1::2] = torch.cos(position * div_term)
62 |         pe = pe.unsqueeze(0)
63 |         self.pe = pe.to(device=x.device, dtype=x.dtype)
64 | 
65 |     def forward(self, x: torch.Tensor):
66 |         """Add positional encoding.
67 | 
68 |         Args:
69 |             x (torch.Tensor): Input. Its shape is (batch, time, ...)
70 | 
71 |         Returns:
72 |             torch.Tensor: Encoded tensor. Its shape is (batch, time, ...)
73 | 
74 |         """
75 |         self.extend_pe(x)
76 |         x = x * self.xscale + self.pe[:, : x.size(1)]
77 |         return self.dropout(x)
78 | 


--------------------------------------------------------------------------------
/espnet/nets/pytorch_backend/transformer/encoder_layer.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | # -*- coding: utf-8 -*-
 3 | 
 4 | # Copyright 2019 Shigeki Karita
 5 | #  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)
 6 | 
 7 | """Encoder self-attention layer definition."""
 8 | 
 9 | import torch
10 | 
11 | from torch import nn
12 | 
13 | from espnet.nets.pytorch_backend.transformer.layer_norm import LayerNorm
14 | 
15 | 
16 | class EncoderLayer(nn.Module):
17 |     """Encoder layer module.
18 | 
19 |     :param int size: input dim
20 |     :param espnet.nets.pytorch_backend.transformer.attention.
21 |         MultiHeadedAttention self_attn: self attention module
22 |     :param espnet.nets.pytorch_backend.transformer.positionwise_feed_forward.
23 |         PositionwiseFeedForward feed_forward:
24 |         feed forward module
25 |     :param float dropout_rate: dropout rate
26 |     :param bool normalize_before: whether to use layer_norm before the first block
27 |     :param bool concat_after: whether to concat attention layer's input and output
28 |         if True, additional linear will be applied.
29 |         i.e. x -> x + linear(concat(x, att(x)))
30 |         if False, no additional linear will be applied. i.e. x -> x + att(x)
31 | 
32 |     """
33 | 
34 |     def __init__(
35 |         self,
36 |         size,
37 |         self_attn,
38 |         feed_forward,
39 |         dropout_rate,
40 |         normalize_before=True,
41 |         concat_after=False,
42 |     ):
43 |         """Construct an EncoderLayer object."""
44 |         super(EncoderLayer, self).__init__()
45 |         self.self_attn = self_attn
46 |         self.feed_forward = feed_forward
47 |         self.norm1 = LayerNorm(size)
48 |         self.norm2 = LayerNorm(size)
49 |         self.dropout = nn.Dropout(dropout_rate)
50 |         self.size = size
51 |         self.normalize_before = normalize_before
52 |         self.concat_after = concat_after
53 |         if self.concat_after:
54 |             self.concat_linear = nn.Linear(size + size, size)
55 | 
56 |     def forward(self, x, mask, cache=None):
57 |         """Compute encoded features.
58 | 
59 |         :param torch.Tensor x: encoded source features (batch, max_time_in, size)
60 |         :param torch.Tensor mask: mask for x (batch, max_time_in)
61 |         :param torch.Tensor cache: cache for x (batch, max_time_in - 1, size)
62 |         :rtype: Tuple[torch.Tensor, torch.Tensor]
63 |         """
64 |         residual = x
65 |         if self.normalize_before:
66 |             x = self.norm1(x)
67 | 
68 |         if cache is None:
69 |             x_q = x
70 |         else:
71 |             assert cache.shape == (x.shape[0], x.shape[1] - 1, self.size)
72 |             x_q = x[:, -1:, :]
73 |             residual = residual[:, -1:, :]
74 |             mask = None if mask is None else mask[:, -1:, :]
75 | 
76 |         if self.concat_after:
77 |             x_concat = torch.cat((x, self.self_attn(x_q, x, x, mask)), dim=-1)
78 |             x = residual + self.concat_linear(x_concat)
79 |         else:
80 |             x = residual + self.dropout(self.self_attn(x_q, x, x, mask))
81 |         if not self.normalize_before:
82 |             x = self.norm1(x)
83 | 
84 |         residual = x
85 |         if self.normalize_before:
86 |             x = self.norm2(x)
87 |         x = residual + self.dropout(self.feed_forward(x))
88 |         if not self.normalize_before:
89 |             x = self.norm2(x)
90 | 
91 |         if cache is not None:
92 |             x = torch.cat([cache, x], dim=1)
93 | 
94 |         return x, mask
95 | 


--------------------------------------------------------------------------------
/espnet/nets/pytorch_backend/transformer/initializer.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | # -*- coding: utf-8 -*-
 3 | 
 4 | # Copyright 2019 Shigeki Karita
 5 | #  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)
 6 | 
 7 | """Parameter initialization."""
 8 | 
 9 | import torch
10 | 
11 | from espnet.nets.pytorch_backend.transformer.layer_norm import LayerNorm
12 | 
13 | 
14 | def initialize(model, init_type="pytorch"):
15 |     """Initialize Transformer module.
16 | 
17 |     :param torch.nn.Module model: transformer instance
18 |     :param str init_type: initialization type
19 |     """
20 |     if init_type == "pytorch":
21 |         return
22 | 
23 |     # weight init
24 |     for p in model.parameters():
25 |         if p.dim() > 1:
26 |             if init_type == "xavier_uniform":
27 |                 torch.nn.init.xavier_uniform_(p.data)
28 |             elif init_type == "xavier_normal":
29 |                 torch.nn.init.xavier_normal_(p.data)
30 |             elif init_type == "kaiming_uniform":
31 |                 torch.nn.init.kaiming_uniform_(p.data, nonlinearity="relu")
32 |             elif init_type == "kaiming_normal":
33 |                 torch.nn.init.kaiming_normal_(p.data, nonlinearity="relu")
34 |             else:
35 |                 raise ValueError("Unknown initialization: " + init_type)
36 |     # bias init
37 |     for p in model.parameters():
38 |         if p.dim() == 1:
39 |             p.data.zero_()
40 | 
41 |     # reset some modules with default init
42 |     for m in model.modules():
43 |         if isinstance(m, (torch.nn.Embedding, LayerNorm)):
44 |             m.reset_parameters()
45 | 


--------------------------------------------------------------------------------
/espnet/nets/pytorch_backend/transformer/label_smoothing_loss.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | # -*- coding: utf-8 -*-
 3 | 
 4 | # Copyright 2019 Shigeki Karita
 5 | #  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)
 6 | 
 7 | """Label smoothing module."""
 8 | 
 9 | import torch
10 | from torch import nn
11 | 
12 | 
13 | class LabelSmoothingLoss(nn.Module):
14 |     """Label-smoothing loss.
15 | 
16 |     :param int size: the number of class
17 |     :param int padding_idx: ignored class id
18 |     :param float smoothing: smoothing rate (0.0 means the conventional CE)
19 |     :param bool normalize_length: normalize loss by sequence length if True
20 |     :param torch.nn.Module criterion: loss function to be smoothed
21 |     """
22 | 
23 |     def __init__(
24 |         self,
25 |         size,
26 |         padding_idx,
27 |         smoothing,
28 |         normalize_length=False,
29 |         criterion=nn.KLDivLoss(reduction="none"),
30 |     ):
31 |         """Construct an LabelSmoothingLoss object."""
32 |         super(LabelSmoothingLoss, self).__init__()
33 |         self.criterion = criterion
34 |         self.padding_idx = padding_idx
35 |         self.confidence = 1.0 - smoothing
36 |         self.smoothing = smoothing
37 |         self.size = size
38 |         self.true_dist = None
39 |         self.normalize_length = normalize_length
40 | 
41 |     def forward(self, x, target):
42 |         """Compute loss between x and target.
43 | 
44 |         :param torch.Tensor x: prediction (batch, seqlen, class)
45 |         :param torch.Tensor target:
46 |             target signal masked with self.padding_id (batch, seqlen)
47 |         :return: scalar float value
48 |         :rtype torch.Tensor
49 |         """
50 |         assert x.size(2) == self.size
51 |         batch_size = x.size(0)
52 |         x = x.view(-1, self.size)
53 |         target = target.view(-1)
54 |         with torch.no_grad():
55 |             true_dist = x.clone()
56 |             true_dist.fill_(self.smoothing / (self.size - 1))
57 |             ignore = target == self.padding_idx  # (B,)
58 |             total = len(target) - ignore.sum().item()
59 |             target = target.masked_fill(ignore, 0)  # avoid -1 index
60 |             true_dist.scatter_(1, target.unsqueeze(1), self.confidence)
61 |         kl = self.criterion(torch.log_softmax(x, dim=1), true_dist)
62 |         denom = total if self.normalize_length else batch_size
63 |         return kl.masked_fill(ignore.unsqueeze(1), 0).sum() / denom
64 | 


--------------------------------------------------------------------------------
/espnet/nets/pytorch_backend/transformer/layer_norm.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | # -*- coding: utf-8 -*-
 3 | 
 4 | # Copyright 2019 Shigeki Karita
 5 | #  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)
 6 | 
 7 | """Layer normalization module."""
 8 | 
 9 | import torch
10 | 
11 | 
12 | class LayerNorm(torch.nn.LayerNorm):
13 |     """Layer normalization module.
14 | 
15 |     :param int nout: output dim size
16 |     :param int dim: dimension to be normalized
17 |     """
18 | 
19 |     def __init__(self, nout, dim=-1):
20 |         """Construct an LayerNorm object."""
21 |         super(LayerNorm, self).__init__(nout, eps=1e-12)
22 |         self.dim = dim
23 | 
24 |     def forward(self, x):
25 |         """Apply layer normalization.
26 | 
27 |         :param torch.Tensor x: input tensor
28 |         :return: layer normalized tensor
29 |         :rtype torch.Tensor
30 |         """
31 |         if self.dim == -1:
32 |             return super(LayerNorm, self).forward(x)
33 |         return super(LayerNorm, self).forward(x.transpose(1, -1)).transpose(1, -1)
34 | 


--------------------------------------------------------------------------------
/espnet/nets/pytorch_backend/transformer/mask.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Copyright 2019 Shigeki Karita
 4 | #  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)
 5 | 
 6 | """Mask module."""
 7 | 
 8 | from distutils.version import LooseVersion
 9 | 
10 | import torch
11 | 
12 | is_torch_1_2_plus = LooseVersion(torch.__version__) >= LooseVersion("1.2.0")
13 | # LooseVersion('1.2.0') == LooseVersion(torch.__version__) can't include e.g. 1.2.0+aaa
14 | is_torch_1_2 = (
15 |     LooseVersion("1.3") > LooseVersion(torch.__version__) >= LooseVersion("1.2")
16 | )
17 | datatype = torch.bool if is_torch_1_2_plus else torch.uint8
18 | 
19 | 
20 | def subsequent_mask(size, device="cpu", dtype=datatype):
21 |     """Create mask for subsequent steps (1, size, size).
22 | 
23 |     :param int size: size of mask
24 |     :param str device: "cpu" or "cuda" or torch.Tensor.device
25 |     :param torch.dtype dtype: result dtype
26 |     :rtype: torch.Tensor
27 |     >>> subsequent_mask(3)
28 |     [[1, 0, 0],
29 |      [1, 1, 0],
30 |      [1, 1, 1]]
31 |     """
32 |     if is_torch_1_2 and dtype == torch.bool:
33 |         # torch=1.2 doesn't support tril for bool tensor
34 |         ret = torch.ones(size, size, device=device, dtype=torch.uint8)
35 |         return torch.tril(ret, out=ret).type(dtype)
36 |     else:
37 |         ret = torch.ones(size, size, device=device, dtype=dtype)
38 |         return torch.tril(ret, out=ret)
39 | 
40 | 
41 | def target_mask(ys_in_pad, ignore_id):
42 |     """Create mask for decoder self-attention.
43 | 
44 |     :param torch.Tensor ys_pad: batch of padded target sequences (B, Lmax)
45 |     :param int ignore_id: index of padding
46 |     :param torch.dtype dtype: result dtype
47 |     :rtype: torch.Tensor
48 |     """
49 |     ys_mask = ys_in_pad != ignore_id
50 |     m = subsequent_mask(ys_mask.size(-1), device=ys_mask.device).unsqueeze(0)
51 |     return ys_mask.unsqueeze(-2) & m
52 | 


--------------------------------------------------------------------------------
/espnet/nets/pytorch_backend/transformer/multi_layer_conv.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | # -*- coding: utf-8 -*-
  3 | 
  4 | # Copyright 2019 Tomoki Hayashi
  5 | #  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)
  6 | 
  7 | """Layer modules for FFT block in FastSpeech (Feed-forward Transformer)."""
  8 | 
  9 | import torch
 10 | 
 11 | 
 12 | class MultiLayeredConv1d(torch.nn.Module):
 13 |     """Multi-layered conv1d for Transformer block.
 14 | 
 15 |     This is a module of multi-leyered conv1d designed
 16 |     to replace positionwise feed-forward network
 17 |     in Transforner block, which is introduced in
 18 |     `FastSpeech: Fast, Robust and Controllable Text to Speech`_.
 19 | 
 20 |     .. _`FastSpeech: Fast, Robust and Controllable Text to Speech`:
 21 |         https://arxiv.org/pdf/1905.09263.pdf
 22 | 
 23 |     """
 24 | 
 25 |     def __init__(self, in_chans, hidden_chans, kernel_size, dropout_rate):
 26 |         """Initialize MultiLayeredConv1d module.
 27 | 
 28 |         Args:
 29 |             in_chans (int): Number of input channels.
 30 |             hidden_chans (int): Number of hidden channels.
 31 |             kernel_size (int): Kernel size of conv1d.
 32 |             dropout_rate (float): Dropout rate.
 33 | 
 34 |         """
 35 |         super(MultiLayeredConv1d, self).__init__()
 36 |         self.w_1 = torch.nn.Conv1d(
 37 |             in_chans,
 38 |             hidden_chans,
 39 |             kernel_size,
 40 |             stride=1,
 41 |             padding=(kernel_size - 1) // 2,
 42 |         )
 43 |         self.w_2 = torch.nn.Conv1d(
 44 |             hidden_chans,
 45 |             in_chans,
 46 |             kernel_size,
 47 |             stride=1,
 48 |             padding=(kernel_size - 1) // 2,
 49 |         )
 50 |         self.dropout = torch.nn.Dropout(dropout_rate)
 51 | 
 52 |     def forward(self, x):
 53 |         """Calculate forward propagation.
 54 | 
 55 |         Args:
 56 |             x (Tensor): Batch of input tensors (B, ..., in_chans).
 57 | 
 58 |         Returns:
 59 |             Tensor: Batch of output tensors (B, ..., hidden_chans).
 60 | 
 61 |         """
 62 |         x = torch.relu(self.w_1(x.transpose(-1, 1))).transpose(-1, 1)
 63 |         return self.w_2(self.dropout(x).transpose(-1, 1)).transpose(-1, 1)
 64 | 
 65 | 
 66 | class Conv1dLinear(torch.nn.Module):
 67 |     """Conv1D + Linear for Transformer block.
 68 | 
 69 |     A variant of MultiLayeredConv1d, which replaces second conv-layer to linear.
 70 | 
 71 |     """
 72 | 
 73 |     def __init__(self, in_chans, hidden_chans, kernel_size, dropout_rate):
 74 |         """Initialize Conv1dLinear module.
 75 | 
 76 |         Args:
 77 |             in_chans (int): Number of input channels.
 78 |             hidden_chans (int): Number of hidden channels.
 79 |             kernel_size (int): Kernel size of conv1d.
 80 |             dropout_rate (float): Dropout rate.
 81 | 
 82 |         """
 83 |         super(Conv1dLinear, self).__init__()
 84 |         self.w_1 = torch.nn.Conv1d(
 85 |             in_chans,
 86 |             hidden_chans,
 87 |             kernel_size,
 88 |             stride=1,
 89 |             padding=(kernel_size - 1) // 2,
 90 |         )
 91 |         self.w_2 = torch.nn.Linear(hidden_chans, in_chans)
 92 |         self.dropout = torch.nn.Dropout(dropout_rate)
 93 | 
 94 |     def forward(self, x):
 95 |         """Calculate forward propagation.
 96 | 
 97 |         Args:
 98 |             x (Tensor): Batch of input tensors (B, ..., in_chans).
 99 | 
100 |         Returns:
101 |             Tensor: Batch of output tensors (B, ..., hidden_chans).
102 | 
103 |         """
104 |         x = torch.relu(self.w_1(x.transpose(-1, 1))).transpose(-1, 1)
105 |         return self.w_2(self.dropout(x))
106 | 


--------------------------------------------------------------------------------
/espnet/nets/pytorch_backend/transformer/optimizer.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | # -*- coding: utf-8 -*-
 3 | 
 4 | # Copyright 2019 Shigeki Karita
 5 | #  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)
 6 | 
 7 | """Optimizer module."""
 8 | 
 9 | import torch
10 | 
11 | 
12 | class NoamOpt(object):
13 |     """Optim wrapper that implements rate."""
14 | 
15 |     def __init__(self, model_size, factor, warmup, optimizer):
16 |         """Construct an NoamOpt object."""
17 |         self.optimizer = optimizer
18 |         self._step = 0
19 |         self.warmup = warmup
20 |         self.factor = factor
21 |         self.model_size = model_size
22 |         self._rate = 0
23 | 
24 |     @property
25 |     def param_groups(self):
26 |         """Return param_groups."""
27 |         return self.optimizer.param_groups
28 | 
29 |     def step(self):
30 |         """Update parameters and rate."""
31 |         self._step += 1
32 |         rate = self.rate()
33 |         for p in self.optimizer.param_groups:
34 |             p["lr"] = rate
35 |         self._rate = rate
36 |         self.optimizer.step()
37 | 
38 |     def rate(self, step=None):
39 |         """Implement `lrate` above."""
40 |         if step is None:
41 |             step = self._step
42 |         return (
43 |             self.factor
44 |             * self.model_size ** (-0.5)
45 |             * min(step ** (-0.5), step * self.warmup ** (-1.5))
46 |         )
47 | 
48 |     def zero_grad(self):
49 |         """Reset gradient."""
50 |         self.optimizer.zero_grad()
51 | 
52 |     def state_dict(self):
53 |         """Return state_dict."""
54 |         return {
55 |             "_step": self._step,
56 |             "warmup": self.warmup,
57 |             "factor": self.factor,
58 |             "model_size": self.model_size,
59 |             "_rate": self._rate,
60 |             "optimizer": self.optimizer.state_dict(),
61 |         }
62 | 
63 |     def load_state_dict(self, state_dict):
64 |         """Load state_dict."""
65 |         for key, value in state_dict.items():
66 |             if key == "optimizer":
67 |                 self.optimizer.load_state_dict(state_dict["optimizer"])
68 |             else:
69 |                 setattr(self, key, value)
70 | 
71 | 
72 | def get_std_opt(model, d_model, warmup, factor):
73 |     """Get standard NoamOpt."""
74 |     base = torch.optim.Adam(model.parameters(), lr=0, betas=(0.9, 0.98), eps=1e-9)
75 |     return NoamOpt(d_model, factor, warmup, base)
76 | 


--------------------------------------------------------------------------------
/espnet/nets/pytorch_backend/transformer/plot.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | # -*- coding: utf-8 -*-
 3 | 
 4 | # Copyright 2019 Shigeki Karita
 5 | #  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)
 6 | 
 7 | import logging
 8 | 
 9 | import matplotlib.pyplot as plt
10 | import numpy
11 | 
12 | from espnet.asr import asr_utils
13 | 
14 | 
15 | def _plot_and_save_attention(att_w, filename, xtokens=None, ytokens=None):
16 |     # dynamically import matplotlib due to not found error
17 |     from matplotlib.ticker import MaxNLocator
18 |     import os
19 | 
20 |     d = os.path.dirname(filename)
21 |     if not os.path.exists(d):
22 |         os.makedirs(d)
23 |     w, h = plt.figaspect(1.0 / len(att_w))
24 |     fig = plt.Figure(figsize=(w * 2, h * 2))
25 |     axes = fig.subplots(1, len(att_w))
26 |     if len(att_w) == 1:
27 |         axes = [axes]
28 |     for ax, aw in zip(axes, att_w):
29 |         # plt.subplot(1, len(att_w), h)
30 |         ax.imshow(aw.astype(numpy.float32), aspect="auto")
31 |         ax.set_xlabel("Input")
32 |         ax.set_ylabel("Output")
33 |         ax.xaxis.set_major_locator(MaxNLocator(integer=True))
34 |         ax.yaxis.set_major_locator(MaxNLocator(integer=True))
35 |         # Labels for major ticks
36 |         if xtokens is not None:
37 |             ax.set_xticks(numpy.linspace(0, len(xtokens) - 1, len(xtokens)))
38 |             ax.set_xticks(numpy.linspace(0, len(xtokens) - 1, 1), minor=True)
39 |             ax.set_xticklabels(xtokens + [""], rotation=40)
40 |         if ytokens is not None:
41 |             ax.set_yticks(numpy.linspace(0, len(ytokens) - 1, len(ytokens)))
42 |             ax.set_yticks(numpy.linspace(0, len(ytokens) - 1, 1), minor=True)
43 |             ax.set_yticklabels(ytokens + [""])
44 |     fig.tight_layout()
45 |     return fig
46 | 
47 | 
48 | def savefig(plot, filename):
49 |     plot.savefig(filename)
50 |     plt.clf()
51 | 
52 | 
53 | 


--------------------------------------------------------------------------------
/espnet/nets/pytorch_backend/transformer/positionwise_feed_forward.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | # -*- coding: utf-8 -*-
 3 | 
 4 | # Copyright 2019 Shigeki Karita
 5 | #  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)
 6 | 
 7 | """Positionwise feed forward layer definition."""
 8 | 
 9 | import torch
10 | 
11 | 
12 | class PositionwiseFeedForward(torch.nn.Module):
13 |     """Positionwise feed forward layer.
14 | 
15 |     :param int idim: input dimenstion
16 |     :param int hidden_units: number of hidden units
17 |     :param float dropout_rate: dropout rate
18 | 
19 |     """
20 | 
21 |     def __init__(self, idim, hidden_units, dropout_rate):
22 |         """Construct an PositionwiseFeedForward object."""
23 |         super(PositionwiseFeedForward, self).__init__()
24 |         self.w_1 = torch.nn.Linear(idim, hidden_units)
25 |         self.w_2 = torch.nn.Linear(hidden_units, idim)
26 |         self.dropout = torch.nn.Dropout(dropout_rate)
27 | 
28 |     def forward(self, x):
29 |         """Forward funciton."""
30 |         return self.w_2(self.dropout(torch.relu(self.w_1(x))))
31 | 


--------------------------------------------------------------------------------
/espnet/nets/pytorch_backend/transformer/repeat.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | # -*- coding: utf-8 -*-
 3 | 
 4 | # Copyright 2019 Shigeki Karita
 5 | #  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)
 6 | 
 7 | """Repeat the same layer definition."""
 8 | 
 9 | import torch
10 | 
11 | 
12 | class MultiSequential(torch.nn.Sequential):
13 |     """Multi-input multi-output torch.nn.Sequential."""
14 | 
15 |     def forward(self, *args):
16 |         """Repeat."""
17 |         for m in self:
18 |             args = m(*args)
19 |         return args
20 | 
21 | 
22 | def repeat(N, fn):
23 |     """Repeat module N times.
24 | 
25 |     :param int N: repeat time
26 |     :param function fn: function to generate module
27 |     :return: repeated modules
28 |     :rtype: MultiSequential
29 |     """
30 |     return MultiSequential(*[fn() for _ in range(N)])
31 | 


--------------------------------------------------------------------------------
/espnet/nets/scorers/__init__.py:
--------------------------------------------------------------------------------
1 | """Initialize sub package."""
2 | 


--------------------------------------------------------------------------------
/espnet/nets/scorers/__pycache__/__init__.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cywang97/StreamingTransformer/4b56931a311d65686d310c54cc6896a4be4f47de/espnet/nets/scorers/__pycache__/__init__.cpython-37.pyc


--------------------------------------------------------------------------------
/espnet/nets/scorers/__pycache__/ctc.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cywang97/StreamingTransformer/4b56931a311d65686d310c54cc6896a4be4f47de/espnet/nets/scorers/__pycache__/ctc.cpython-37.pyc


--------------------------------------------------------------------------------
/espnet/nets/scorers/ctc.py:
--------------------------------------------------------------------------------
 1 | """ScorerInterface implementation for CTC."""
 2 | 
 3 | import numpy as np
 4 | import torch
 5 | 
 6 | from espnet.nets.ctc_prefix_score import CTCPrefixScore
 7 | from espnet.nets.scorer_interface import PartialScorerInterface
 8 | 
 9 | 
10 | class CTCPrefixScorer(PartialScorerInterface):
11 |     """Decoder interface wrapper for CTCPrefixScore."""
12 | 
13 |     def __init__(self, ctc: torch.nn.Module, eos: int):
14 |         """Initialize class.
15 | 
16 |         Args:
17 |             ctc (torch.nn.Module): The CTC implementaiton.
18 |                 For example, :class:`espnet.nets.pytorch_backend.ctc.CTC`
19 |             eos (int): The end-of-sequence id.
20 | 
21 |         """
22 |         self.ctc = ctc
23 |         self.eos = eos
24 |         self.impl = None
25 | 
26 |     def init_state(self, x: torch.Tensor):
27 |         """Get an initial state for decoding.
28 | 
29 |         Args:
30 |             x (torch.Tensor): The encoded feature tensor
31 | 
32 |         Returns: initial state
33 | 
34 |         """
35 |         logp = self.ctc.log_softmax(x.unsqueeze(0)).detach().squeeze(0).cpu().numpy()
36 |         # TODO(karita): use CTCPrefixScoreTH
37 |         self.impl = CTCPrefixScore(logp, 0, self.eos, np)
38 |         return 0, self.impl.initial_state()
39 | 
40 |     def select_state(self, state, i):
41 |         """Select state with relative ids in the main beam search.
42 | 
43 |         Args:
44 |             state: Decoder state for prefix tokens
45 |             i (int): Index to select a state in the main beam search
46 | 
47 |         Returns:
48 |             state: pruned state
49 | 
50 |         """
51 |         sc, st = state
52 |         return sc[i], st[i]
53 | 
54 |     def score_partial(self, y, ids, state, x):
55 |         """Score new token.
56 | 
57 |         Args:
58 |             y (torch.Tensor): 1D prefix token
59 |             next_tokens (torch.Tensor): torch.int64 next token to score
60 |             state: decoder state for prefix tokens
61 |             x (torch.Tensor): 2D encoder feature that generates ys
62 | 
63 |         Returns:
64 |             tuple[torch.Tensor, Any]:
65 |                 Tuple of a score tensor for y that has a shape `(len(next_tokens),)`
66 |                 and next state for ys
67 | 
68 |         """
69 |         prev_score, state = state
70 |         presub_score, new_st = self.impl(y.cpu(), ids.cpu(), state)
71 |         tscore = torch.as_tensor(
72 |             presub_score - prev_score, device=x.device, dtype=x.dtype
73 |         )
74 |         return tscore, (presub_score, new_st)
75 | 


--------------------------------------------------------------------------------
/espnet/nets/viterbi_align.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import pdb
 3 | 
 4 | # logits is the probo f output of each phoneme at time t (L, T) labels x time
 5 | # sequence is the index of the phoneme expected in the output sequence. 
 6 | 
 7 | # http://deeplearning.cs.cmu.edu/slides.spring19/lec14.CTC.pdf
 8 | # slide 158
 9 | def viterbi_align(logits, sequence):
10 | 	blank_id = 0
11 | 	sequence = list(sequence)
12 | 	for i in range(len(sequence)):
13 | 		sequence.insert(2*i, blank_id)
14 | 	sequence.append(blank_id)
15 | 
16 | 	T = logits.shape[1]
17 | 	N = len(sequence)
18 | 	# print(N, T)
19 | 	if (N > T):
20 | 		raise Exception("Number of expected symbols more than the time stamps")
21 | 	
22 | 	s = np.zeros((T, N))
23 | 	bp = np.zeros((T, N), dtype=np.int)
24 | 	bp.fill(-1)
25 | 	bscr = np.zeros((T, N))
26 | 	aligned_seq1 = np.zeros((T), dtype=np.int)
27 | 	aligned_seq2 = np.zeros((T), dtype=np.int)
28 | 
29 | 	# filling S
30 | 	# print(sequence)
31 | 	for i in range(N):
32 | 		s[:, i] = logits[sequence[i]] 
33 | 
34 | 	# s = np.log(np.array([[0.1, 0.5, 0.4, 0.1, 0.2], [0.2, 0.2, 0.2, 0.3, 0.7], [0.4, 0.1, 0.1, 0.2, 0.6],\
35 | 	# 	 [0.2, 0.3, 0.3, 0.1, 0.6], [0.3, 0.1, 0.4, 0.4, 0.7]])).T
36 | 
37 | 	# base case
38 | 	bp[0, 0] = 0 # made this 0 instead of -1. 
39 | 	bp[0, 1] = 1 
40 | 	bscr[0, 0] = s[0, 0]
41 | 	bscr[0, 1] = s[0, 1]
42 | 	bscr[0, 2:] = np.NINF
43 | 
44 | 	# filling over the rest time stamps
45 | 	for t in range(1, T):
46 | 		bp[t, 0] = bp[t-1,0]
47 | 		bscr[t, 0] = bscr[t-1,0] + s[t, 0]
48 | 		bp[t, 1] = 1 if bscr[t-1,1] > bscr[t-1, 0] else 0
49 | 		bscr[t, 1] = bscr[t-1, bp[t, 1]] + s[t, 1]
50 | 
51 | 		for i in range(2, N):
52 | 			# print("going in")
53 | 			if (i%2 == 0): # blank 
54 | 				bp[t, i] = i if bscr[t-1,i] > bscr[t-1, i-1] else i-1
55 | 			else:
56 | 				if (sequence[i] == sequence[i-2]):
57 | 					bp[t, i] = i if bscr[t-1,i] > bscr[t-1, i-1] else i-1
58 | 				else:
59 | 					bp[t, i] = i if (bscr[t-1,i] > bscr[t-1, i-1] and bscr[t-1,i] > bscr[t-1, i-2]) else\
60 | 						   (i-1 if (bscr[t-1,i-1] > bscr[t-1, i] and bscr[t-1,i-1] > bscr[t-1, i-2]) else\
61 | 						   	i-2)
62 | 			bscr[t, i] = bscr[t-1, bp[t, i]] + s[t, i]
63 | 
64 | 	# print(bp.T)
65 | 	# print(np.exp(bscr).T)
66 | 
67 | 	aligned_seq1[T-1], path_score_1 = N-1, 0
68 | 	for t in range(T-1, 0, -1):
69 | 		aligned_seq1[t-1] = bp[t, aligned_seq1[t]] 
70 | 		path_score_1 += bscr[t, aligned_seq1[t]]
71 | 
72 | 	aligned_seq2[T-1], path_score_2 = N-2, 0
73 | 	for t in range(T-1, 0, -1):
74 | 		aligned_seq2[t-1] = bp[t, aligned_seq2[t]] 
75 | 		path_score_2 += bscr[t, aligned_seq2[t]]
76 | 
77 | 	aligned_seq = aligned_seq1 if (path_score_1 > path_score_2) else aligned_seq2
78 | 
79 | 	aligned_symbols_idx = []
80 | 	for i in range(len(aligned_seq)):
81 | 		if i > 0 and aligned_seq[i] == aligned_seq[i-1]:
82 |    			aligned_symbols_idx.append(0)
83 | 		else:
84 | 			aligned_symbols_idx.append(sequence[aligned_seq[i]])
85 | 	aligned_idx = np.where(np.array(aligned_symbols_idx) != 0)
86 | 
87 | 	return aligned_idx
88 | 
89 | if __name__ == "__main__":
90 | 	total_labels = 10 # including blank 
91 | 	T = 9
92 | 	sequence = [2,5]
93 | 	logits = np.random.rand(total_labels, T) #(total_labels, T)
94 | 	aligned_symbols_idx = viterbi_align(logits, sequence)
95 | 	print(aligned_symbols_idx)
96 | 


--------------------------------------------------------------------------------
/espnet/transform/__init__.py:
--------------------------------------------------------------------------------
1 | """Initialize main package."""
2 | 


--------------------------------------------------------------------------------
/espnet/transform/__pycache__/__init__.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cywang97/StreamingTransformer/4b56931a311d65686d310c54cc6896a4be4f47de/espnet/transform/__pycache__/__init__.cpython-37.pyc


--------------------------------------------------------------------------------
/espnet/transform/__pycache__/functional.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cywang97/StreamingTransformer/4b56931a311d65686d310c54cc6896a4be4f47de/espnet/transform/__pycache__/functional.cpython-37.pyc


--------------------------------------------------------------------------------
/espnet/transform/__pycache__/spec_augment.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cywang97/StreamingTransformer/4b56931a311d65686d310c54cc6896a4be4f47de/espnet/transform/__pycache__/spec_augment.cpython-37.pyc


--------------------------------------------------------------------------------
/espnet/transform/__pycache__/transform_interface.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cywang97/StreamingTransformer/4b56931a311d65686d310c54cc6896a4be4f47de/espnet/transform/__pycache__/transform_interface.cpython-37.pyc


--------------------------------------------------------------------------------
/espnet/transform/__pycache__/transformation.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cywang97/StreamingTransformer/4b56931a311d65686d310c54cc6896a4be4f47de/espnet/transform/__pycache__/transformation.cpython-37.pyc


--------------------------------------------------------------------------------
/espnet/transform/functional.py:
--------------------------------------------------------------------------------
 1 | import inspect
 2 | 
 3 | from espnet.transform.transform_interface import TransformInterface
 4 | from espnet.utils.check_kwargs import check_kwargs
 5 | 
 6 | 
 7 | class FuncTrans(TransformInterface):
 8 |     """Functional Transformation
 9 | 
10 |     WARNING:
11 |         Builtin or C/C++ functions may not work properly
12 |         because this class heavily depends on the `inspect` module.
13 | 
14 |     Usage:
15 | 
16 |     >>> def foo_bar(x, a=1, b=2):
17 |     ...     '''Foo bar
18 |     ...     :param x: input
19 |     ...     :param int a: default 1
20 |     ...     :param int b: default 2
21 |     ...     '''
22 |     ...     return x + a - b
23 | 
24 | 
25 |     >>> class FooBar(FuncTrans):
26 |     ...     _func = foo_bar
27 |     ...     __doc__ = foo_bar.__doc__
28 |     """
29 | 
30 |     _func = None
31 | 
32 |     def __init__(self, **kwargs):
33 |         self.kwargs = kwargs
34 |         check_kwargs(self.func, kwargs)
35 | 
36 |     def __call__(self, x):
37 |         return self.func(x, **self.kwargs)
38 | 
39 |     @classmethod
40 |     def add_arguments(cls, parser):
41 |         fname = cls._func.__name__.replace("_", "-")
42 |         group = parser.add_argument_group(fname + " transformation setting")
43 |         for k, v in cls.default_params().items():
44 |             # TODO(karita): get help and choices from docstring?
45 |             attr = k.replace("_", "-")
46 |             group.add_argument(f"--{fname}-{attr}", default=v, type=type(v))
47 |         return parser
48 | 
49 |     @property
50 |     def func(self):
51 |         return type(self)._func
52 | 
53 |     @classmethod
54 |     def default_params(cls):
55 |         try:
56 |             d = dict(inspect.signature(cls._func).parameters)
57 |         except ValueError:
58 |             d = dict()
59 |         return {
60 |             k: v.default for k, v in d.items() if v.default != inspect.Parameter.empty
61 |         }
62 | 
63 |     def __repr__(self):
64 |         params = self.default_params()
65 |         params.update(**self.kwargs)
66 |         ret = self.__class__.__name__ + "("
67 |         if len(params) == 0:
68 |             return ret + ")"
69 |         for k, v in params.items():
70 |             ret += "{}={}, ".format(k, v)
71 |         return ret[:-2] + ")"
72 | 


--------------------------------------------------------------------------------
/espnet/transform/transform_interface.py:
--------------------------------------------------------------------------------
 1 | # TODO(karita): add this to all the transform impl.
 2 | class TransformInterface:
 3 |     """Transform Interface"""
 4 | 
 5 |     def __call__(self, x):
 6 |         raise NotImplementedError("__call__ method is not implemented")
 7 | 
 8 |     @classmethod
 9 |     def add_arguments(cls, parser):
10 |         return parser
11 | 
12 |     def __repr__(self):
13 |         return self.__class__.__name__ + "()"
14 | 
15 | 
16 | class Identity(TransformInterface):
17 |     """Identity Function"""
18 | 
19 |     def __call__(self, x):
20 |         return x
21 | 


--------------------------------------------------------------------------------
/espnet/utils/__init__.py:
--------------------------------------------------------------------------------
1 | """Initialize sub package."""
2 | 


--------------------------------------------------------------------------------
/espnet/utils/__pycache__/__init__.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cywang97/StreamingTransformer/4b56931a311d65686d310c54cc6896a4be4f47de/espnet/utils/__pycache__/__init__.cpython-37.pyc


--------------------------------------------------------------------------------
/espnet/utils/__pycache__/check_kwargs.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cywang97/StreamingTransformer/4b56931a311d65686d310c54cc6896a4be4f47de/espnet/utils/__pycache__/check_kwargs.cpython-37.pyc


--------------------------------------------------------------------------------
/espnet/utils/__pycache__/cli_utils.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cywang97/StreamingTransformer/4b56931a311d65686d310c54cc6896a4be4f47de/espnet/utils/__pycache__/cli_utils.cpython-37.pyc


--------------------------------------------------------------------------------
/espnet/utils/__pycache__/dataset.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cywang97/StreamingTransformer/4b56931a311d65686d310c54cc6896a4be4f47de/espnet/utils/__pycache__/dataset.cpython-37.pyc


--------------------------------------------------------------------------------
/espnet/utils/__pycache__/deterministic_utils.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cywang97/StreamingTransformer/4b56931a311d65686d310c54cc6896a4be4f47de/espnet/utils/__pycache__/deterministic_utils.cpython-37.pyc


--------------------------------------------------------------------------------
/espnet/utils/__pycache__/dynamic_import.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cywang97/StreamingTransformer/4b56931a311d65686d310c54cc6896a4be4f47de/espnet/utils/__pycache__/dynamic_import.cpython-37.pyc


--------------------------------------------------------------------------------
/espnet/utils/__pycache__/fill_missing_args.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cywang97/StreamingTransformer/4b56931a311d65686d310c54cc6896a4be4f47de/espnet/utils/__pycache__/fill_missing_args.cpython-37.pyc


--------------------------------------------------------------------------------
/espnet/utils/__pycache__/io_utils.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cywang97/StreamingTransformer/4b56931a311d65686d310c54cc6896a4be4f47de/espnet/utils/__pycache__/io_utils.cpython-37.pyc


--------------------------------------------------------------------------------
/espnet/utils/check_kwargs.py:
--------------------------------------------------------------------------------
 1 | import inspect
 2 | 
 3 | 
 4 | def check_kwargs(func, kwargs, name=None):
 5 |     """check kwargs are valid for func
 6 | 
 7 |     If kwargs are invalid, raise TypeError as same as python default
 8 |     :param function func: function to be validated
 9 |     :param dict kwargs: keyword arguments for func
10 |     :param str name: name used in TypeError (default is func name)
11 |     """
12 |     try:
13 |         params = inspect.signature(func).parameters
14 |     except ValueError:
15 |         return
16 |     if name is None:
17 |         name = func.__name__
18 |     for k in kwargs.keys():
19 |         if k not in params:
20 |             raise TypeError(f"{name}() got an unexpected keyword argument '{k}'")
21 | 


--------------------------------------------------------------------------------
/espnet/utils/cli_utils.py:
--------------------------------------------------------------------------------
 1 | from collections.abc import Sequence
 2 | from distutils.util import strtobool as dist_strtobool
 3 | import sys
 4 | 
 5 | import numpy
 6 | 
 7 | 
 8 | def strtobool(x):
 9 |     # distutils.util.strtobool returns integer, but it's confusing,
10 |     return bool(dist_strtobool(x))
11 | 
12 | 
13 | def get_commandline_args():
14 |     extra_chars = [
15 |         " ",
16 |         ";",
17 |         "&",
18 |         "(",
19 |         ")",
20 |         "|",
21 |         "^",
22 |         "<",
23 |         ">",
24 |         "?",
25 |         "*",
26 |         "[",
27 |         "]",
28 |         "$",
29 |         "`",
30 |         '"',
31 |         "\\",
32 |         "!",
33 |         "{",
34 |         "}",
35 |     ]
36 | 
37 |     # Escape the extra characters for shell
38 |     argv = [
39 |         arg.replace("'", "'\\''")
40 |         if all(char not in arg for char in extra_chars)
41 |         else "'" + arg.replace("'", "'\\''") + "'"
42 |         for arg in sys.argv
43 |     ]
44 | 
45 |     return sys.executable + " " + " ".join(argv)
46 | 
47 | 
48 | def is_scipy_wav_style(value):
49 |     # If Tuple[int, numpy.ndarray] or not
50 |     return (
51 |         isinstance(value, Sequence)
52 |         and len(value) == 2
53 |         and isinstance(value[0], int)
54 |         and isinstance(value[1], numpy.ndarray)
55 |     )
56 | 
57 | 
58 | def assert_scipy_wav_style(value):
59 |     assert is_scipy_wav_style(
60 |         value
61 |     ), "Must be Tuple[int, numpy.ndarray], but got {}".format(
62 |         type(value)
63 |         if not isinstance(value, Sequence)
64 |         else "{}[{}]".format(type(value), ", ".join(str(type(v)) for v in value))
65 |     )
66 | 


--------------------------------------------------------------------------------
/espnet/utils/dataset.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | # Copyright 2017 Johns Hopkins University (Shinji Watanabe)
 4 | # Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)
 5 | 
 6 | """pytorch dataset and dataloader implementation for chainer training."""
 7 | 
 8 | import torch
 9 | import torch.utils.data
10 | 
11 | 
12 | class TransformDataset(torch.utils.data.Dataset):
13 |     """Transform Dataset for pytorch backend.
14 | 
15 |     Args:
16 |         data: list object from make_batchset
17 |         transfrom: transform function
18 | 
19 |     """
20 | 
21 |     def __init__(self, data, transform):
22 |         """Init function."""
23 |         super(TransformDataset).__init__()
24 |         self.data = data
25 |         self.transform = transform
26 | 
27 |     def __len__(self):
28 |         """Len function."""
29 |         return len(self.data)
30 | 
31 |     def __getitem__(self, idx):
32 |         """[] operator."""
33 |         return self.transform(self.data[idx])
34 | 
35 | 
36 | class ChainerDataLoader(object):
37 |     """Pytorch dataloader in chainer style.
38 | 
39 |     Args:
40 |         all args for torch.utils.data.dataloader.Dataloader
41 | 
42 |     """
43 | 
44 |     def __init__(self, **kwargs):
45 |         """Init function."""
46 |         self.loader = torch.utils.data.dataloader.DataLoader(**kwargs)
47 |         self.len = len(kwargs["dataset"])
48 |         self.current_position = 0
49 |         self.epoch = 0
50 |         self.iter = None
51 |         self.kwargs = kwargs
52 | 
53 |     def next(self):
54 |         """Implement next function."""
55 |         if self.iter is None:
56 |             self.iter = iter(self.loader)
57 |         try:
58 |             ret = next(self.iter)
59 |         except StopIteration:
60 |             self.iter = None
61 |             return self.next()
62 |         self.current_position += 1
63 |         if self.current_position == self.len:
64 |             self.epoch = self.epoch + 1
65 |             self.current_position = 0
66 |         return ret
67 | 
68 |     def __iter__(self):
69 |         """Implement iter function."""
70 |         for batch in self.loader:
71 |             yield batch
72 | 
73 |     @property
74 |     def epoch_detail(self):
75 |         """Epoch_detail required by chainer."""
76 |         return self.epoch + self.current_position / self.len
77 | 
78 |     def serialize(self, serializer):
79 |         """Serialize and deserialize function."""
80 |         epoch = serializer("epoch", self.epoch)
81 |         current_position = serializer("current_position", self.current_position)
82 |         self.epoch = epoch
83 |         self.current_position = current_position
84 | 
85 |     def start_shuffle(self):
86 |         """Shuffle function for sortagrad."""
87 |         self.kwargs["shuffle"] = True
88 |         self.loader = torch.utils.data.dataloader.DataLoader(**self.kwargs)
89 | 
90 |     def finalize(self):
91 |         """Implement finalize function."""
92 |         del self.loader
93 | 


--------------------------------------------------------------------------------
/espnet/utils/deterministic_utils.py:
--------------------------------------------------------------------------------
 1 | import logging
 2 | import torch
 3 | 
 4 | 
 5 | def set_deterministic_pytorch(args):
 6 |     """Ensures pytorch produces deterministic results depending on the program arguments
 7 | 
 8 |     :param Namespace args: The program arguments
 9 |     """
10 |     # seed setting
11 |     torch.manual_seed(args.seed)
12 | 
13 |     # debug mode setting
14 |     # 0 would be fastest, but 1 seems to be reasonable
15 |     # considering reproducibility
16 |     # remove type check
17 |     torch.backends.cudnn.deterministic = True
18 |     torch.backends.cudnn.benchmark = (
19 |         False  # https://github.com/pytorch/pytorch/issues/6351
20 |     )
21 |     if args.debugmode < 2:
22 |         logging.info("torch type check is disabled")
23 |     # use deterministic computation or not
24 |     if args.debugmode < 1:
25 |         torch.backends.cudnn.deterministic = False
26 |         torch.backends.cudnn.benchmark = True
27 |         logging.info("torch cudnn deterministic is disabled")
28 | 


--------------------------------------------------------------------------------
/espnet/utils/dynamic_import.py:
--------------------------------------------------------------------------------
 1 | import importlib
 2 | 
 3 | 
 4 | def dynamic_import(import_path, alias=dict()):
 5 |     """dynamic import module and class
 6 | 
 7 |     :param str import_path: syntax 'module_name:class_name'
 8 |         e.g., 'espnet.transform.add_deltas:AddDeltas'
 9 |     :param dict alias: shortcut for registered class
10 |     :return: imported class
11 |     """
12 |     if import_path not in alias and ":" not in import_path:
13 |         raise ValueError(
14 |             "import_path should be one of {} or "
15 |             'include ":", e.g. "espnet.transform.add_deltas:AddDeltas" : '
16 |             "{}".format(set(alias), import_path)
17 |         )
18 |     if ":" not in import_path:
19 |         import_path = alias[import_path]
20 | 
21 |     module_name, objname = import_path.split(":")
22 |     m = importlib.import_module(module_name)
23 |     return getattr(m, objname)
24 | 


--------------------------------------------------------------------------------
/espnet/utils/fill_missing_args.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Copyright 2018 Nagoya University (Tomoki Hayashi)
 4 | #  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)
 5 | 
 6 | import argparse
 7 | import logging
 8 | 
 9 | 
10 | def fill_missing_args(args, add_arguments):
11 |     """Fill missing arguments in args.
12 | 
13 |     Args:
14 |         args (Namespace or None): Namesapce containing hyperparameters.
15 |         add_arguments (function): Function to add arguments.
16 | 
17 |     Returns:
18 |         Namespace: Arguments whose missing ones are filled with default value.
19 | 
20 |     Examples:
21 |         >>> from argparse import Namespace
22 |         >>> from espnet.nets.pytorch_backend.e2e_tts_tacotron2 import Tacotron2
23 |         >>> args = Namespace()
24 |         >>> fill_missing_args(args, Tacotron2.add_arguments_fn)
25 |         Namespace(aconv_chans=32, aconv_filts=15, adim=512, atype='location', ...)
26 | 
27 |     """
28 |     # check argument type
29 |     assert isinstance(args, argparse.Namespace) or args is None
30 |     assert callable(add_arguments)
31 | 
32 |     # get default arguments
33 |     default_args, _ = add_arguments(argparse.ArgumentParser()).parse_known_args()
34 | 
35 |     # convert to dict
36 |     args = {} if args is None else vars(args)
37 |     default_args = vars(default_args)
38 | 
39 |     for key, value in default_args.items():
40 |         if key not in args:
41 |             logging.info(
42 |                 'attribute "%s" does not exist. use default %s.' % (key, str(value))
43 |             )
44 |             args[key] = value
45 | 
46 |     return argparse.Namespace(**args)
47 | 


--------------------------------------------------------------------------------
/espnet/utils/training/__init__.py:
--------------------------------------------------------------------------------
1 | """Initialize sub package."""
2 | 


--------------------------------------------------------------------------------
/espnet/utils/training/__pycache__/__init__.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cywang97/StreamingTransformer/4b56931a311d65686d310c54cc6896a4be4f47de/espnet/utils/training/__pycache__/__init__.cpython-37.pyc


--------------------------------------------------------------------------------
/espnet/utils/training/__pycache__/batchfy.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cywang97/StreamingTransformer/4b56931a311d65686d310c54cc6896a4be4f47de/espnet/utils/training/__pycache__/batchfy.cpython-37.pyc


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | from distutils.version import LooseVersion
  3 | import os
  4 | import pip
  5 | from setuptools import find_packages
  6 | from setuptools import setup
  7 | import sys
  8 | 
  9 | 
 10 | if LooseVersion(sys.version) < LooseVersion('3.6'):
 11 |     raise RuntimeError(
 12 |         'ESPnet requires Python>=3.6, '
 13 |         'but your Python is {}'.format(sys.version))
 14 | if LooseVersion(pip.__version__) < LooseVersion('19'):
 15 |     raise RuntimeError(
 16 |         'pip>=19.0.0 is required, but your pip is {}. '
 17 |         'Try again after "pip install -U pip"'.format(pip.__version__))
 18 | 
 19 | requirements = {
 20 |     'install': [
 21 |         # 'torch==1.0.1',  # Installation from anaconda is recommended for PyTorch
 22 | 	'chainer==6.0.0',
 23 |         # 'cupy==6.0.0',  # Do not install cupy as default
 24 |         'setuptools>=38.5.1',
 25 |         'scipy>=1.3.0',
 26 |         'librosa>=0.7.0',
 27 |         'soundfile>=0.10.2',
 28 |         'inflect>=1.0.0',
 29 |         'unidecode>=1.0.22',
 30 |         'editdistance==0.5.2',
 31 |         'h5py==2.9.0',
 32 |         'tensorboardX>=1.8',
 33 |         'pillow>=6.1.0',
 34 |         'nara_wpe>=0.0.5',
 35 |         'museval>=0.2.1',
 36 |         'pystoi>=0.2.2',
 37 |         'kaldiio>=2.13.8',
 38 |         'matplotlib>=3.1.0',
 39 |         'funcsigs>=1.0.2',  # A backport of inspect.signature for python2
 40 |         'configargparse>=0.14.0',
 41 |         'PyYAML>=5.1.2',
 42 |         'sentencepiece>=0.1.82',
 43 |         'pysptk>=0.1.17',
 44 |         'nltk>=3.4.5',
 45 |         'nnmnkwii',
 46 |         'jaconv',
 47 |         'g2p_en',
 48 |         'torch_complex@git+https://github.com/kamo-naoyuki/pytorch_complex.git',
 49 |         'pytorch_wpe@git+https://github.com/nttcslab-sp/dnn_wpe.git',
 50 |     ],
 51 |     'setup': [
 52 |         'numpy',
 53 |         'pytest-runner'
 54 |     ],
 55 |     'test': [
 56 |         'pytest>=3.3.0',
 57 |         'pytest-pythonpath>=0.7.3',
 58 |         'pytest-cov>=2.7.1',
 59 |         'hacking>=1.1.0',
 60 |         'mock>=2.0.0',
 61 |         'autopep8>=1.3.3',
 62 |         'jsondiff>=1.2.0',
 63 |         'flake8>=3.7.8',
 64 |         'flake8-docstrings>=1.3.1'
 65 |     ],
 66 |     'doc': [
 67 |         'Sphinx==2.1.2',
 68 |         'sphinx-rtd-theme>=0.2.4',
 69 |         'sphinx-argparse>=0.2.5',
 70 |         'commonmark==0.8.1',
 71 |         'recommonmark>=0.4.0',
 72 |         'travis-sphinx>=2.0.1',
 73 |         'nbsphinx>=0.4.2'
 74 |     ]}
 75 | install_requires = requirements['install']
 76 | setup_requires = requirements['setup']
 77 | tests_require = requirements['test']
 78 | extras_require = {k: v for k, v in requirements.items()
 79 |                   if k not in ['install', 'setup']}
 80 | 
 81 | dirname = os.path.dirname(__file__)
 82 | setup(name='StreamingTransformer',
 83 |       version='0.1.0',
 84 |       url='http://github.com/cywang97/StreamingTransformer',
 85 |       long_description=open(os.path.join(dirname, 'README.md'),
 86 |                             encoding='utf-8').read(),
 87 |       license='Apache Software License',
 88 |       install_requires=install_requires,
 89 |       setup_requires=setup_requires,
 90 |       tests_require=tests_require,
 91 |       extras_require=extras_require,
 92 |       classifiers=[
 93 |           'Programming Language :: Python',
 94 |           'Programming Language :: Python :: 3.7',
 95 |           'Development Status :: 5 - Production/Stable',
 96 |           'Intended Audience :: Science/Research',
 97 |           'Operating System :: POSIX :: Linux',
 98 |           'License :: OSI Approved :: Apache Software License',
 99 |           'Topic :: Software Development :: Libraries :: Python Modules'],
100 |       )
101 | 


--------------------------------------------------------------------------------
/utils/average_checkpoints.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | # -*- coding: utf-8 -*-
 3 | 
 4 | import argparse
 5 | import json
 6 | import os
 7 | 
 8 | import numpy as np
 9 | 
10 | 
11 | def main():
12 |     if args.log is not None:
13 |         with open(args.log) as f:
14 |             logs = json.load(f)
15 |         val_scores = []
16 |         for log in logs:
17 |             if "validation/main/acc" in log.keys():
18 |                 val_scores += [[log["epoch"], log["validation/main/acc"]]]
19 |             elif "val_perplexity" in log.keys():
20 |                 val_scores += [[log["epoch"], 1 / log["val_perplexity"]]]
21 |             elif "validation/main/loss" in log.keys():
22 |                 val_scores += [[log["epoch"], -log["validation/main/loss"]]]
23 |         if len(val_scores) == 0:
24 |             raise ValueError(
25 |                 "`validation/main/acc` or `val_perplexity` is not found in log."
26 |             )
27 |         val_scores = np.array(val_scores)
28 |         sort_idx = np.argsort(val_scores[:, -1])
29 |         sorted_val_scores = val_scores[sort_idx][::-1]
30 |         print("best val scores = " + str(sorted_val_scores[: args.num, 1]))
31 |         print(
32 |             "selected epochs = "
33 |             + str(sorted_val_scores[: args.num, 0].astype(np.int64))
34 |         )
35 |         last = [
36 |             os.path.dirname(args.snapshots[0]) + "/snapshot.ep.%d" % (int(epoch))
37 |             for epoch in sorted_val_scores[: args.num, 0]
38 |         ]
39 |     else:
40 |         last = sorted(args.snapshots, key=os.path.getmtime)
41 |         last = last[-args.num :]
42 |     print("average over", last)
43 |     avg = None
44 | 
45 |     import torch
46 | 
47 |     # sum
48 |     for path in last:
49 |         states = torch.load(path, map_location=torch.device("cpu"))
50 |         if 'state_dict' in states:
51 |             states = states['state_dict']
52 |         if 'model' in states:
53 |             states = states['model']
54 |         if avg is None:
55 |             avg = states
56 |         else:
57 |             for k in avg.keys():
58 |                 avg[k] += states[k]
59 | 
60 |     # average
61 |     for k in avg.keys():
62 |         if avg[k] is not None:
63 |             avg[k] /= args.num
64 | 
65 |     torch.save(avg, args.out)
66 | 
67 | 
68 | def get_parser():
69 |     parser = argparse.ArgumentParser(description="average models from snapshot")
70 |     parser.add_argument("--snapshots", required=True, type=str, nargs="+")
71 |     parser.add_argument("--out", required=True, type=str)
72 |     parser.add_argument("--num", default=10, type=int)
73 |     parser.add_argument("--backend", default="chainer", type=str)
74 |     parser.add_argument("--log", default=None, type=str, nargs="?")
75 |     return parser
76 | 
77 | 
78 | if __name__ == "__main__":
79 |     args = get_parser().parse_args()
80 |     main()
81 | 


--------------------------------------------------------------------------------
/utils/change_yaml.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | import argparse
  3 | from pathlib import Path
  4 | 
  5 | import yaml
  6 | 
  7 | 
  8 | def get_parser():
  9 |     parser = argparse.ArgumentParser(
 10 |         description="change specified attributes of a YAML file",
 11 |         formatter_class=argparse.ArgumentDefaultsHelpFormatter,
 12 |     )
 13 | 
 14 |     egroup = parser.add_mutually_exclusive_group()
 15 |     parser.add_argument("inyaml", nargs="?")
 16 |     egroup.add_argument("-o", "--outyaml")
 17 |     egroup.add_argument("--outdir")
 18 |     parser.add_argument(
 19 |         "-a",
 20 |         "--arg",
 21 |         action="append",
 22 |         default=[],
 23 |         help="e.g -a a.b.c=4 -> {'a': {'b': {'c': 4}}}",
 24 |     )
 25 |     parser.add_argument(
 26 |         "-d",
 27 |         "--delete",
 28 |         action="append",
 29 |         default=[],
 30 |         help='e.g -d a -> "a" is removed from the input yaml',
 31 |     )
 32 |     return parser
 33 | 
 34 | 
 35 | def main():
 36 |     args = get_parser().parse_args()
 37 | 
 38 |     if args.inyaml is None:
 39 |         indict = {}
 40 |     else:
 41 |         with open(args.inyaml, "r") as f:
 42 |             indict = yaml.load(f, Loader=yaml.Loader)
 43 |         if indict is None:
 44 |             indict = {}
 45 | 
 46 |     if args.outyaml is None:
 47 |         # Auto naming from arguments
 48 |         eles = []
 49 |         if args.inyaml is not None:
 50 |             p = Path(args.inyaml)
 51 |             if args.outdir is None:
 52 |                 outdir = p.parent
 53 |             else:
 54 |                 outdir = Path(args.outdir)
 55 |             eles.append(str(outdir / p.stem))
 56 | 
 57 |         table = str.maketrans("{}[]()", "%%__--", " |&;#*?~\"'\\")
 58 |         for arg in args.delete:
 59 |             value = arg.translate(table)
 60 |             eles.append("del-" + value)
 61 |         for arg in args.arg:
 62 |             if "=" not in arg:
 63 |                 raise RuntimeError(f'"{arg}" does\'t include "="')
 64 |             key, value = arg.split("=")
 65 |             key = key.translate(table)
 66 |             value = value.translate(table)
 67 |             eles.append(key + value)
 68 | 
 69 |         outyaml = "_".join(eles)
 70 |         if outyaml == "":
 71 |             outyaml = "config"
 72 |         outyaml += ".yaml"
 73 |         if args.inyaml == outyaml:
 74 |             p = Path(args.outyaml)
 75 |             outyaml = p.parent / (p.stem + ".2" + p.suffix)
 76 | 
 77 |         outyaml = Path(outyaml)
 78 |     else:
 79 |         outyaml = Path(args.outyaml)
 80 | 
 81 |     for arg in args.delete + args.arg:
 82 |         if "=" in arg:
 83 |             key, value = arg.split("=")
 84 |             if not value.strip() == "":
 85 |                 value = yaml.load(value, Loader=yaml.Loader)
 86 |         else:
 87 |             key = arg
 88 |             value = None
 89 | 
 90 |         keys = key.split(".")
 91 |         d = indict
 92 |         for idx, k in enumerate(keys):
 93 |             if idx == len(keys) - 1:
 94 |                 if isinstance(d, (tuple, list)):
 95 |                     k = int(k)
 96 |                     if k >= len(d):
 97 |                         d += type(d)(None for _ in range(k - len(d) + 1))
 98 |                 if value is not None:
 99 |                     d[k] = value
100 |                 else:
101 |                     del d[k]
102 |             else:
103 |                 if isinstance(d, (tuple, list)):
104 |                     k = int(k)
105 |                     if k >= len(d):
106 |                         d += type(d)(None for _ in range(k - len(d) + 1))
107 |                 elif isinstance(d, dict):
108 |                     if k not in d:
109 |                         d[k] = {}
110 |                 if not isinstance(d[k], (dict, tuple, list)):
111 |                     d[k] = {}
112 |                 d = d[k]
113 | 
114 |     outyaml.parent.mkdir(parents=True, exist_ok=True)
115 |     with outyaml.open("w") as f:
116 |         yaml.dump(indict, f, Dumper=yaml.Dumper, indent=4, sort_keys=False)
117 |     print(outyaml)
118 | 
119 | 
120 | if __name__ == "__main__":
121 |     main()
122 | 


--------------------------------------------------------------------------------
/utils/compute-stft-feats.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | 
  3 | # Copyright 2018 Nagoya University (Tomoki Hayashi)
  4 | #  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)
  5 | 
  6 | import argparse
  7 | from distutils.util import strtobool
  8 | import logging
  9 | 
 10 | import kaldiio
 11 | import numpy
 12 | 
 13 | from espnet.transform.spectrogram import spectrogram
 14 | from espnet.utils.cli_utils import get_commandline_args
 15 | from espnet.utils.cli_writers import file_writer_helper
 16 | from espnet2.utils.types import int_or_none
 17 | 
 18 | 
 19 | def get_parser():
 20 |     parser = argparse.ArgumentParser(
 21 |         description="compute STFT feature from WAV",
 22 |         formatter_class=argparse.ArgumentDefaultsHelpFormatter,
 23 |     )
 24 |     parser.add_argument("--n_fft", type=int, default=1024, help="FFT length in point")
 25 |     parser.add_argument(
 26 |         "--n_shift", type=int, default=512, help="Shift length in point"
 27 |     )
 28 |     parser.add_argument(
 29 |         "--win_length",
 30 |         type=int_or_none,
 31 |         default=None,
 32 |         nargs="?",
 33 |         help="Analisys window length in point",
 34 |     )
 35 |     parser.add_argument(
 36 |         "--window",
 37 |         type=str,
 38 |         default="hann",
 39 |         choices=["hann", "hamming"],
 40 |         help="Type of window",
 41 |     )
 42 |     parser.add_argument(
 43 |         "--write-num-frames", type=str, help="Specify wspecifer for utt2num_frames"
 44 |     )
 45 |     parser.add_argument(
 46 |         "--filetype",
 47 |         type=str,
 48 |         default="mat",
 49 |         choices=["mat", "hdf5"],
 50 |         help="Specify the file format. " '"mat" is the matrix format in kaldi',
 51 |     )
 52 |     parser.add_argument(
 53 |         "--compress", type=strtobool, default=False, help="Save in compressed format"
 54 |     )
 55 |     parser.add_argument(
 56 |         "--compression-method",
 57 |         type=int,
 58 |         default=2,
 59 |         help="Specify the method(if mat) or " "gzip-level(if hdf5)",
 60 |     )
 61 |     parser.add_argument("--verbose", "-V", default=0, type=int, help="Verbose option")
 62 |     parser.add_argument(
 63 |         "--normalize",
 64 |         choices=[1, 16, 24, 32],
 65 |         type=int,
 66 |         default=None,
 67 |         help="Give the bit depth of the PCM, "
 68 |         "then normalizes data to scale in [-1,1]",
 69 |     )
 70 |     parser.add_argument("rspecifier", type=str, help="WAV scp file")
 71 |     parser.add_argument(
 72 |         "--segments",
 73 |         type=str,
 74 |         help="segments-file format: each line is either"
 75 |         "<segment-id> <recording-id> <start-time> <end-time>"
 76 |         "e.g. call-861225-A-0050-0065 call-861225-A 5.0 6.5",
 77 |     )
 78 |     parser.add_argument("wspecifier", type=str, help="Write specifier")
 79 |     return parser
 80 | 
 81 | 
 82 | def main():
 83 |     parser = get_parser()
 84 |     args = parser.parse_args()
 85 | 
 86 |     logfmt = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
 87 |     if args.verbose > 0:
 88 |         logging.basicConfig(level=logging.INFO, format=logfmt)
 89 |     else:
 90 |         logging.basicConfig(level=logging.WARN, format=logfmt)
 91 |     logging.info(get_commandline_args())
 92 | 
 93 |     with kaldiio.ReadHelper(
 94 |         args.rspecifier, segments=args.segments
 95 |     ) as reader, file_writer_helper(
 96 |         args.wspecifier,
 97 |         filetype=args.filetype,
 98 |         write_num_frames=args.write_num_frames,
 99 |         compress=args.compress,
100 |         compression_method=args.compression_method,
101 |     ) as writer:
102 |         for utt_id, (_, array) in reader:
103 |             array = array.astype(numpy.float32)
104 |             if args.normalize is not None and args.normalize != 1:
105 |                 array = array / (1 << (args.normalize - 1))
106 |             spc = spectrogram(
107 |                 x=array,
108 |                 n_fft=args.n_fft,
109 |                 n_shift=args.n_shift,
110 |                 win_length=args.win_length,
111 |                 window=args.window,
112 |             )
113 |             writer[utt_id] = spc
114 | 
115 | 
116 | if __name__ == "__main__":
117 |     main()
118 | 


--------------------------------------------------------------------------------
/utils/concatjson.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | # encoding: utf-8
 3 | 
 4 | # Copyright 2017 Johns Hopkins University (Shinji Watanabe)
 5 | #  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)
 6 | 
 7 | from __future__ import print_function
 8 | from __future__ import unicode_literals
 9 | 
10 | import argparse
11 | import codecs
12 | import json
13 | import logging
14 | import sys
15 | 
16 | from espnet.utils.cli_utils import get_commandline_args
17 | 
18 | is_python2 = sys.version_info[0] == 2
19 | 
20 | 
21 | def get_parser():
22 |     parser = argparse.ArgumentParser(
23 |         description="concatenate json files",
24 |         formatter_class=argparse.ArgumentDefaultsHelpFormatter,
25 |     )
26 |     parser.add_argument("jsons", type=str, nargs="+", help="json files")
27 |     return parser
28 | 
29 | 
30 | if __name__ == "__main__":
31 |     args = get_parser().parse_args()
32 | 
33 |     # logging info
34 |     logfmt = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
35 |     logging.basicConfig(level=logging.INFO, format=logfmt)
36 |     logging.info(get_commandline_args())
37 | 
38 |     # make intersection set for utterance keys
39 |     js = {}
40 |     for x in args.jsons:
41 |         with codecs.open(x, encoding="utf-8") as f:
42 |             j = json.load(f)
43 |         ks = j["utts"].keys()
44 |         logging.debug(x + ": has " + str(len(ks)) + " utterances")
45 |         js.update(j["utts"])
46 |     logging.info("new json has " + str(len(js.keys())) + " utterances")
47 | 
48 |     # ensure "ensure_ascii=False", which is a bug
49 |     jsonstring = json.dumps(
50 |         {"utts": js},
51 |         indent=4,
52 |         sort_keys=True,
53 |         ensure_ascii=False,
54 |         separators=(",", ": "),
55 |     )
56 |     sys.stdout = codecs.getwriter("utf-8")(
57 |         sys.stdout if is_python2 else sys.stdout.buffer
58 |     )
59 |     print(jsonstring)
60 | 


--------------------------------------------------------------------------------
/utils/convert_fbank.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # Set bash to 'debug' mode, it will exit on :
 3 | # -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands',
 4 | set -e
 5 | set -u
 6 | set -o pipefail
 7 | 
 8 | 
 9 | # Copyright 2018 Nagoya University (Tomoki Hayashi)
10 | #  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)
11 | 
12 | # Begin configuration section.
13 | nj=4
14 | fs=22050
15 | fmax=
16 | fmin=
17 | n_fft=1024
18 | n_shift=512
19 | win_length=
20 | n_mels=
21 | iters=64
22 | cmd=run.pl
23 | help_message=$(cat <<EOF
24 | Usage: $0 [options] <data-dir> [<log-dir> [<fbank-dir>] ]
25 | e.g.: $0 data/train exp/griffin_lim/train wav
26 | Note: <log-dir> defaults to <data-dir>/log, and <fbank-dir> defaults to <data-dir>/data
27 | Options:
28 |   --nj <nj>                  # number of parallel jobs
29 |   --fs <fs>                  # sampling rate
30 |   --fmax <fmax>              # maximum frequency
31 |   --fmin <fmin>              # minimum frequency
32 |   --n_fft <n_fft>            # number of FFT points (default=1024)
33 |   --n_shift <n_shift>        # shift size in point (default=256)
34 |   --win_length <win_length>  # window length in point (default=)
35 |   --n_mels <n_mels>          # number of mel basis (default=80)
36 |   --iters <iters>            # number of Griffin-lim iterations (default=64)
37 |   --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs.
38 | EOF
39 | )
40 | # End configuration section.
41 | 
42 | echo "$0 $*"  # Print the command line for logging
43 | 
44 | . parse_options.sh || exit 1;
45 | 
46 | if [ $# -lt 1 ] || [ $# -gt 3 ]; then
47 |     echo "${help_message}"
48 |     exit 1;
49 | fi
50 | 
51 | set -euo pipefail
52 | 
53 | data=$1
54 | if [ $# -ge 2 ]; then
55 |   logdir=$2
56 | else
57 |   logdir=${data}/log
58 | fi
59 | if [ $# -ge 3 ]; then
60 |   wavdir=$3
61 | else
62 |   wavdir=${data}/data
63 | fi
64 | 
65 | # use "name" as part of name of the archive.
66 | name=$(basename ${data})
67 | 
68 | mkdir -p ${wavdir} || exit 1;
69 | mkdir -p ${logdir} || exit 1;
70 | 
71 | scp=${data}/feats.scp
72 | 
73 | split_scps=""
74 | for n in $(seq ${nj}); do
75 |     split_scps="$split_scps $logdir/feats.$n.scp"
76 | done
77 | 
78 | utils/split_scp.pl ${scp} ${split_scps} || exit 1;
79 | 
80 | ${cmd} JOB=1:${nj} ${logdir}/griffin_lim_${name}.JOB.log \
81 |     convert_fbank_to_wav.py \
82 |         --fs ${fs} \
83 |         --fmax ${fmax} \
84 |         --fmin ${fmin} \
85 |         --win_length ${win_length} \
86 |         --n_fft ${n_fft} \
87 |         --n_shift ${n_shift} \
88 |         --n_mels ${n_mels} \
89 |         --iters ${iters} \
90 |         scp:${logdir}/feats.JOB.scp \
91 |         ${wavdir}
92 | 
93 | rm ${logdir}/feats.*.scp 2>/dev/null
94 | 
95 | echo "Succeeded creating wav for $name"
96 | 


--------------------------------------------------------------------------------
/utils/copy-feats.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | import argparse
  3 | from distutils.util import strtobool
  4 | import logging
  5 | 
  6 | from espnet.transform.transformation import Transformation
  7 | from espnet.utils.cli_readers import file_reader_helper
  8 | from espnet.utils.cli_utils import get_commandline_args
  9 | from espnet.utils.cli_utils import is_scipy_wav_style
 10 | from espnet.utils.cli_writers import file_writer_helper
 11 | 
 12 | 
 13 | def get_parser():
 14 |     parser = argparse.ArgumentParser(
 15 |         description="copy feature with preprocessing",
 16 |         formatter_class=argparse.ArgumentDefaultsHelpFormatter,
 17 |     )
 18 | 
 19 |     parser.add_argument("--verbose", "-V", default=0, type=int, help="Verbose option")
 20 |     parser.add_argument(
 21 |         "--in-filetype",
 22 |         type=str,
 23 |         default="mat",
 24 |         choices=["mat", "hdf5", "sound.hdf5", "sound"],
 25 |         help="Specify the file format for the rspecifier. "
 26 |         '"mat" is the matrix format in kaldi',
 27 |     )
 28 |     parser.add_argument(
 29 |         "--out-filetype",
 30 |         type=str,
 31 |         default="mat",
 32 |         choices=["mat", "hdf5", "sound.hdf5", "sound"],
 33 |         help="Specify the file format for the wspecifier. "
 34 |         '"mat" is the matrix format in kaldi',
 35 |     )
 36 |     parser.add_argument(
 37 |         "--write-num-frames", type=str, help="Specify wspecifer for utt2num_frames"
 38 |     )
 39 |     parser.add_argument(
 40 |         "--compress", type=strtobool, default=False, help="Save in compressed format"
 41 |     )
 42 |     parser.add_argument(
 43 |         "--compression-method",
 44 |         type=int,
 45 |         default=2,
 46 |         help="Specify the method(if mat) or " "gzip-level(if hdf5)",
 47 |     )
 48 |     parser.add_argument(
 49 |         "--preprocess-conf",
 50 |         type=str,
 51 |         default=None,
 52 |         help="The configuration file for the pre-processing",
 53 |     )
 54 |     parser.add_argument(
 55 |         "rspecifier", type=str, help="Read specifier for feats. e.g. ark:some.ark"
 56 |     )
 57 |     parser.add_argument(
 58 |         "wspecifier", type=str, help="Write specifier. e.g. ark:some.ark"
 59 |     )
 60 |     return parser
 61 | 
 62 | 
 63 | def main():
 64 |     parser = get_parser()
 65 |     args = parser.parse_args()
 66 | 
 67 |     # logging info
 68 |     logfmt = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
 69 |     if args.verbose > 0:
 70 |         logging.basicConfig(level=logging.INFO, format=logfmt)
 71 |     else:
 72 |         logging.basicConfig(level=logging.WARN, format=logfmt)
 73 |     logging.info(get_commandline_args())
 74 | 
 75 |     if args.preprocess_conf is not None:
 76 |         preprocessing = Transformation(args.preprocess_conf)
 77 |         logging.info("Apply preprocessing: {}".format(preprocessing))
 78 |     else:
 79 |         preprocessing = None
 80 | 
 81 |     with file_writer_helper(
 82 |         args.wspecifier,
 83 |         filetype=args.out_filetype,
 84 |         write_num_frames=args.write_num_frames,
 85 |         compress=args.compress,
 86 |         compression_method=args.compression_method,
 87 |     ) as writer:
 88 |         for utt, mat in file_reader_helper(args.rspecifier, args.in_filetype):
 89 |             if is_scipy_wav_style(mat):
 90 |                 # If data is sound file, then got as Tuple[int, ndarray]
 91 |                 rate, mat = mat
 92 | 
 93 |             if preprocessing is not None:
 94 |                 mat = preprocessing(mat, uttid_list=utt)
 95 | 
 96 |             # shape = (Time, Channel)
 97 |             if args.out_filetype in ["sound.hdf5", "sound"]:
 98 |                 # Write Tuple[int, numpy.ndarray] (scipy style)
 99 |                 writer[utt] = (rate, mat)
100 |             else:
101 |                 writer[utt] = mat
102 | 
103 | 
104 | if __name__ == "__main__":
105 |     main()
106 | 


--------------------------------------------------------------------------------
/utils/download_from_google_drive.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # Download zipfile from google drive
 4 | 
 5 | # Copyright 2019 Tomoki Hayashi
 6 | #  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)
 7 | 
 8 | share_url=$1
 9 | download_dir=${2:-"downloads"}
10 | file_ext=${3:-"zip"}
11 | 
12 | if [ "$1" = "--help" ] || [ $# -lt 1 ] || [ $# -gt 3 ]; then
13 |    echo "Usage: $0 <share-url> [<download_dir> <file_ext>]";
14 |    echo "e.g.: $0 https://drive.google.com/open?id=1zF88bRNbJhw9hNBq3NrDg8vnGGibREmg downloads zip"
15 |    echo "Options:"
16 |    echo "    <download_dir>: directory to save downloaded file. (Default=downloads)"
17 |    echo "    <file_ext>: file extension of the file to be downloaded. (Default=zip)"
18 |    if [ "$1" = "--help" ]; then
19 |        exit 0;
20 |    fi
21 |    exit 1;
22 | fi
23 | 
24 | [ ! -e "${download_dir}" ] && mkdir -p "${download_dir}"
25 | tmp=$(mktemp "${download_dir}/XXXXXX.${file_ext}")
26 | 
27 | # file id in google drive can be obtain from sharing link
28 | # ref: https://qiita.com/namakemono/items/c963e75e0af3f7eed732
29 | file_id=$(echo "${share_url}" | cut -d"=" -f 2)
30 | 
31 | # define decompressor
32 | decompress () {
33 |     filename=$1
34 |     decompress_dir=$2
35 |     if echo "${filename}" | grep -q ".zip"; then
36 |         unzip "${filename}" -d "${decompress_dir}"
37 |     elif echo "${filename}" | grep -q -e ".tar" -e ".tar.gz" -e ".tgz"; then
38 |         tar xvzf "${filename}" -C "${decompress_dir}"
39 |     else
40 |         echo "Unsupported file extension." >&2 && exit 1
41 |     fi
42 | }
43 | 
44 | # Try-catch like processing
45 | (
46 |     wget "https://drive.google.com/uc?export=download&id=${file_id}" -O "${tmp}"
47 |     decompress "${tmp}" "${download_dir}"
48 | ) || {
49 |     # Do not allow error from here
50 |     set -e
51 |     # sometimes, wget from google drive is failed due to virus check confirmation
52 |     # to avoid it, we need to do some tricky processings
53 |     # see https://stackoverflow.com/questions/20665881/direct-download-from-google-drive-using-google-drive-api
54 |     curl -c /tmp/cookies "https://drive.google.com/uc?export=download&id=${file_id}" > /tmp/intermezzo.html
55 |     postfix=$(perl -nle 'print $& while m{uc-download-link" [^>]* href="\K[^"]*}g' /tmp/intermezzo.html | sed 's/\&amp;/\&/g')
56 |     curl -L -b /tmp/cookies "https://drive.google.com${postfix}" > "${tmp}"
57 |     decompress "${tmp}" "${download_dir}"
58 | }
59 | 
60 | # remove tmpfiles
61 | rm "${tmp}"
62 | echo "Sucessfully downloaded zip file from ${share_url}"
63 | 


--------------------------------------------------------------------------------
/utils/dump.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # Copyright 2017 Nagoya University (Tomoki Hayashi)
 4 | #  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)
 5 | 
 6 | echo "$0 $*"  # Print the command line for logging
 7 | . ./path.sh
 8 | 
 9 | cmd=run.pl
10 | do_delta=false
11 | nj=1
12 | verbose=0
13 | compress=true
14 | write_utt2num_frames=true
15 | filetype='mat'  # mat or hdf5
16 | help_message="Usage: $0 <scp> <cmvnark> <logdir> <dumpdir>"
17 | 
18 | . utils/parse_options.sh
19 | 
20 | scp=$1
21 | cvmnark=$2
22 | logdir=$3
23 | dumpdir=$4
24 | 
25 | if [ $# != 4 ]; then
26 |     echo "${help_message}"
27 |     exit 1;
28 | fi
29 | 
30 | set -euo pipefail
31 | 
32 | mkdir -p ${logdir}
33 | mkdir -p ${dumpdir}
34 | 
35 | dumpdir=$(perl -e '($dir,$pwd)= @ARGV; if($dir!~m:^/:) { $dir = "$pwd/$dir"; } print $dir; ' ${dumpdir} ${PWD})
36 | 
37 | for n in $(seq ${nj}); do
38 |     # the next command does nothing unless $dumpdir/storage/ exists, see
39 |     # utils/create_data_link.pl for more info.
40 |     utils/create_data_link.pl ${dumpdir}/feats.${n}.ark
41 | done
42 | 
43 | if ${write_utt2num_frames}; then
44 |     write_num_frames_opt="--write-num-frames=ark,t:$dumpdir/utt2num_frames.JOB"
45 | else
46 |     write_num_frames_opt=
47 | fi
48 | 
49 | # split scp file
50 | split_scps=""
51 | for n in $(seq ${nj}); do
52 |     split_scps="$split_scps $logdir/feats.$n.scp"
53 | done
54 | 
55 | utils/split_scp.pl ${scp} ${split_scps} || exit 1;
56 | 
57 | # dump features
58 | if ${do_delta}; then
59 |     ${cmd} JOB=1:${nj} ${logdir}/dump_feature.JOB.log \
60 |         apply-cmvn --norm-vars=true ${cvmnark} scp:${logdir}/feats.JOB.scp ark:- \| \
61 |         add-deltas ark:- ark:- \| \
62 |         copy-feats.py --verbose ${verbose} --out-filetype ${filetype} \
63 |             --compress=${compress} --compression-method=2 ${write_num_frames_opt} \
64 |             ark:- ark,scp:${dumpdir}/feats.JOB.ark,${dumpdir}/feats.JOB.scp \
65 |         || exit 1
66 | else
67 |     ${cmd} JOB=1:${nj} ${logdir}/dump_feature.JOB.log \
68 |         apply-cmvn --norm-vars=true ${cvmnark} scp:${logdir}/feats.JOB.scp ark:- \| \
69 |         copy-feats.py --verbose ${verbose} --out-filetype ${filetype} \
70 |             --compress=${compress} --compression-method=2 ${write_num_frames_opt} \
71 |             ark:- ark,scp:${dumpdir}/feats.JOB.ark,${dumpdir}/feats.JOB.scp \
72 |         || exit 1
73 | fi
74 | 
75 | # concatenate scp files
76 | for n in $(seq ${nj}); do
77 |     cat ${dumpdir}/feats.${n}.scp || exit 1;
78 | done > ${dumpdir}/feats.scp || exit 1
79 | 
80 | if ${write_utt2num_frames}; then
81 |     for n in $(seq ${nj}); do
82 |         cat ${dumpdir}/utt2num_frames.${n} || exit 1;
83 |     done > ${dumpdir}/utt2num_frames || exit 1
84 |     rm ${dumpdir}/utt2num_frames.* 2>/dev/null
85 | fi
86 | 
87 | # Write the filetype, this will be used for data2json.sh
88 | echo ${filetype} > ${dumpdir}/filetype
89 | 
90 | 
91 | # remove temp scps
92 | rm ${logdir}/feats.*.scp 2>/dev/null
93 | if [ ${verbose} -eq 1 ]; then
94 |     echo "Succeeded dumping features for training"
95 | fi
96 | 


--------------------------------------------------------------------------------
/utils/eval_source_separation.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | echo "$0 $*" >&2 # Print the command line for logging
 4 | 
 5 | nj=10
 6 | cmd=run.pl
 7 | evaltypes="SDR STOI ESTOI PESQ"
 8 | permutation=true
 9 | # Use museval.metrics.bss_eval_images or museval.metrics.bss_eval_source
10 | bss_eval_images=true
11 | bss_eval_version=v3
12 | 
13 | help_message=$(cat << EOF
14 | Usage: $0 reffiles enffiles <dir>
15 |     e.g. $0 reference.scp enhanced.scp outdir
16 | 
17 | And also supporting multiple sources:
18 |     e.g. $0 "ref1.scp,ref2.scp" "enh1.scp,enh2.scp" outdir
19 | 
20 | Options:
21 |   --nj <nj>                                        # number of parallel jobs
22 |   --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs.
23 | EOF
24 | )
25 | 
26 | . ./path.sh
27 | . utils/parse_options.sh
28 | 
29 | if [ $# != 3 ]; then
30 |     echo "${help_message}" 1>&2
31 |     exit 1;
32 | fi
33 | 
34 | set -euo pipefail
35 | 
36 | IFS=, read -r -a reffiles <<<$1
37 | IFS=, read -r -a enhfiles <<<$2
38 | dir=$3
39 | logdir=${dir}/log
40 | mkdir -p ${logdir}
41 | 
42 | split_scps=""
43 | for n in $(seq ${nj}); do
44 |     split_scps="${split_scps} ${logdir}/key.${n}.scp"
45 | done
46 | 
47 | # Split the first reference
48 | utils/split_scp.pl ${reffiles[0]} ${split_scps} || exit 1;
49 | 
50 | ${cmd} JOB=1:${nj} ${logdir}/eval-enhanced-speech.JOB.log \
51 |     eval-source-separation.py \
52 |     --ref "${reffiles[@]}" --enh "${enhfiles[@]}" \
53 |     --keylist ${logdir}/key.JOB.scp \
54 |     --out ${logdir}/JOB \
55 |     --evaltypes ${evaltypes} \
56 |     --permutation ${permutation} \
57 |     --bss-eval-images ${bss_eval_images} \
58 |     --bss-eval-version ${bss_eval_version}
59 | 
60 | 
61 | for t in ${evaltypes/SDR/SDR ISR SIR SAR}; do
62 |     for i in $(seq 1 ${nj}); do
63 |         cat ${logdir}/${i}/${t}
64 |     done > ${dir}/${t}
65 | 
66 |     # Calculate the mean over files
67 |     python << EOF > ${dir}/mean_${t}
68 | with open('${dir}/${t}', 'r') as f:
69 |     values = []
70 |     for l in f:
71 |         vs = l.rstrip().split(None)[1:]
72 |         values.append(sum(map(float, vs)) / len(vs))
73 |     mean = sum(values) / len(values)
74 | print(mean)
75 | EOF
76 | 
77 | done
78 | 


--------------------------------------------------------------------------------
/utils/feat-to-shape.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | import argparse
 3 | import logging
 4 | import sys
 5 | 
 6 | from espnet.transform.transformation import Transformation
 7 | from espnet.utils.cli_readers import file_reader_helper
 8 | from espnet.utils.cli_utils import get_commandline_args
 9 | from espnet.utils.cli_utils import is_scipy_wav_style
10 | 
11 | 
12 | def get_parser():
13 |     parser = argparse.ArgumentParser(
14 |         description="convert feature to its shape",
15 |         formatter_class=argparse.ArgumentDefaultsHelpFormatter,
16 |     )
17 |     parser.add_argument("--verbose", "-V", default=0, type=int, help="Verbose option")
18 |     parser.add_argument(
19 |         "--filetype",
20 |         type=str,
21 |         default="mat",
22 |         choices=["mat", "hdf5", "sound.hdf5", "sound"],
23 |         help="Specify the file format for the rspecifier. "
24 |         '"mat" is the matrix format in kaldi',
25 |     )
26 |     parser.add_argument(
27 |         "--preprocess-conf",
28 |         type=str,
29 |         default=None,
30 |         help="The configuration file for the pre-processing",
31 |     )
32 |     parser.add_argument(
33 |         "rspecifier", type=str, help="Read specifier for feats. e.g. ark:some.ark"
34 |     )
35 |     parser.add_argument(
36 |         "out",
37 |         nargs="?",
38 |         type=argparse.FileType("w"),
39 |         default=sys.stdout,
40 |         help="The output filename. " "If omitted, then output to sys.stdout",
41 |     )
42 |     return parser
43 | 
44 | 
45 | def main():
46 |     parser = get_parser()
47 |     args = parser.parse_args()
48 | 
49 |     # logging info
50 |     logfmt = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
51 |     if args.verbose > 0:
52 |         logging.basicConfig(level=logging.INFO, format=logfmt)
53 |     else:
54 |         logging.basicConfig(level=logging.WARN, format=logfmt)
55 |     logging.info(get_commandline_args())
56 | 
57 |     if args.preprocess_conf is not None:
58 |         preprocessing = Transformation(args.preprocess_conf)
59 |         logging.info("Apply preprocessing: {}".format(preprocessing))
60 |     else:
61 |         preprocessing = None
62 | 
63 |     # There are no necessary for matrix without preprocessing,
64 |     # so change to file_reader_helper to return shape.
65 |     # This make sense only with filetype="hdf5".
66 |     for utt, mat in file_reader_helper(
67 |         args.rspecifier, args.filetype, return_shape=preprocessing is None
68 |     ):
69 |         if preprocessing is not None:
70 |             if is_scipy_wav_style(mat):
71 |                 # If data is sound file, then got as Tuple[int, ndarray]
72 |                 rate, mat = mat
73 |             mat = preprocessing(mat, uttid_list=utt)
74 |             shape_str = ",".join(map(str, mat.shape))
75 |         else:
76 |             if len(mat) == 2 and isinstance(mat[1], tuple):
77 |                 # If data is sound file, Tuple[int, Tuple[int, ...]]
78 |                 rate, mat = mat
79 |             shape_str = ",".join(map(str, mat))
80 |         args.out.write("{} {}\n".format(utt, shape_str))
81 | 
82 | 
83 | if __name__ == "__main__":
84 |     main()
85 | 


--------------------------------------------------------------------------------
/utils/feat_to_shape.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # Begin configuration section.
 4 | nj=4
 5 | cmd=run.pl
 6 | verbose=0
 7 | filetype=""
 8 | preprocess_conf=""
 9 | # End configuration section.
10 | 
11 | help_message=$(cat << EOF
12 | Usage: $0 [options] <input-scp> <output-scp> [<log-dir>]
13 | e.g.: $0 data/train/feats.scp data/train/shape.scp data/train/log
14 | Options:
15 |   --nj <nj>                                        # number of parallel jobs
16 |   --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs.
17 |   --filetype <mat|hdf5|sound.hdf5>                 # Specify the format of feats file
18 |   --preprocess-conf <json>                         # Apply preprocess to feats when creating shape.scp
19 |   --verbose <num>                                  # Default: 0
20 | EOF
21 | )
22 | 
23 | echo "$0 $*" 1>&2 # Print the command line for logging
24 | 
25 | . parse_options.sh || exit 1;
26 | 
27 | if [ $# -lt 2 ] || [ $# -gt 3 ]; then
28 |     echo "${help_message}" 1>&2
29 |     exit 1;
30 | fi
31 | 
32 | set -euo pipefail
33 | 
34 | scp=$1
35 | outscp=$2
36 | data=$(dirname ${scp})
37 | if [ $# -eq 3 ]; then
38 |   logdir=$3
39 | else
40 |   logdir=${data}/log
41 | fi
42 | mkdir -p ${logdir}
43 | 
44 | nj=$((nj<$(<"${scp}" wc -l)?nj:$(<"${scp}" wc -l)))
45 | split_scps=""
46 | for n in $(seq ${nj}); do
47 |     split_scps="${split_scps} ${logdir}/feats.${n}.scp"
48 | done
49 | 
50 | utils/split_scp.pl ${scp} ${split_scps}
51 | 
52 | if [ -n "${preprocess_conf}" ]; then
53 |     preprocess_opt="--preprocess-conf ${preprocess_conf}"
54 | else
55 |     preprocess_opt=""
56 | fi
57 | if [ -n "${filetype}" ]; then
58 |     filetype_opt="--filetype ${filetype}"
59 | else
60 |     filetype_opt=""
61 | fi
62 | 
63 | ${cmd} JOB=1:${nj} ${logdir}/feat_to_shape.JOB.log \
64 |     feat-to-shape.py --verbose ${verbose} ${preprocess_opt} ${filetype_opt} \
65 |     scp:${logdir}/feats.JOB.scp ${logdir}/shape.JOB.scp
66 | 
67 | # concatenate the .scp files together.
68 | for n in $(seq ${nj}); do
69 |     cat ${logdir}/shape.${n}.scp
70 | done > ${outscp}
71 | 
72 | rm -f ${logdir}/feats.*.scp 2>/dev/null
73 | 


--------------------------------------------------------------------------------
/utils/feats2npy.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | #  coding: utf-8
 3 | 
 4 | import argparse
 5 | from kaldiio import ReadHelper
 6 | import numpy as np
 7 | import os
 8 | from os.path import join
 9 | import sys
10 | 
11 | 
12 | def get_parser():
13 |     parser = argparse.ArgumentParser(
14 |         description="Convet kaldi-style features to numpy arrays",
15 |         formatter_class=argparse.ArgumentDefaultsHelpFormatter,
16 |     )
17 |     parser.add_argument("scp_file", type=str, help="scp file")
18 |     parser.add_argument("out_dir", type=str, help="output directory")
19 |     return parser
20 | 
21 | 
22 | if __name__ == "__main__":
23 |     args = get_parser().parse_args(sys.argv[1:])
24 |     os.makedirs(args.out_dir, exist_ok=True)
25 |     with ReadHelper(f"scp:{args.scp_file}") as f:
26 |         for utt_id, arr in f:
27 |             out_path = join(args.out_dir, f"{utt_id}-feats.npy")
28 |             np.save(out_path, arr, allow_pickle=False)
29 |     sys.exit(0)
30 | 


--------------------------------------------------------------------------------
/utils/filt.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | # Apache 2.0
 4 | from __future__ import print_function
 5 | from __future__ import unicode_literals
 6 | 
 7 | import argparse
 8 | import codecs
 9 | import sys
10 | 
11 | is_python2 = sys.version_info[0] == 2
12 | 
13 | 
14 | def get_parser():
15 |     parser = argparse.ArgumentParser(
16 |         description="filter words in a text file",
17 |         formatter_class=argparse.ArgumentDefaultsHelpFormatter,
18 |     )
19 |     parser.add_argument(
20 |         "--exclude",
21 |         "-v",
22 |         dest="exclude",
23 |         action="store_true",
24 |         help="exclude filter words",
25 |     )
26 |     parser.add_argument("filt", type=str, help="filter list")
27 |     parser.add_argument("infile", type=str, help="input file")
28 |     return parser
29 | 
30 | 
31 | def main(args):
32 |     args = get_parser().parse_args(args)
33 |     filter_file(args.infile, args.filt, args.exclude)
34 | 
35 | 
36 | def filter_file(infile, filt, exclude):
37 |     vocab = set()
38 |     with codecs.open(filt, "r", encoding="utf-8") as vocabfile:
39 |         for line in vocabfile:
40 |             vocab.add(line.strip())
41 | 
42 |     sys.stdout = codecs.getwriter("utf-8")(
43 |         sys.stdout if is_python2 else sys.stdout.buffer
44 |     )
45 |     with codecs.open(infile, "r", encoding="utf-8") as textfile:
46 |         for line in textfile:
47 |             if exclude:
48 |                 print(
49 |                     " ".join(
50 |                         map(
51 |                             lambda word: word if word not in vocab else "",
52 |                             line.strip().split(),
53 |                         )
54 |                     )
55 |                 )
56 |             else:
57 |                 print(
58 |                     " ".join(
59 |                         map(
60 |                             lambda word: word if word in vocab else "<UNK>",
61 |                             line.strip().split(),
62 |                         )
63 |                     )
64 |                 )
65 | 
66 | 
67 | if __name__ == "__main__":
68 |     main(sys.argv[1:])
69 | 


--------------------------------------------------------------------------------
/utils/free-gpu.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | # Author: Gaurav Kumar
 3 | 
 4 | 
 5 | # Usage: e.g.
 6 | # % free-gpu.sh -n 2
 7 | # 1,2
 8 | 
 9 | # Allow requests for multiple GPUs
10 | # (Optional) defaults to 1
11 | req_gpus=1
12 | while getopts ':n:' opt; do
13 |   case ${opt} in
14 |     n)
15 |       req_gpus=${OPTARG}
16 |       ;;
17 |     :)
18 |       echo "Option -${OPTARG} requires an argument." >&2
19 |       exit 1
20 |       ;;
21 |     *)
22 |       echo "Option -${OPTARG} is not supported" >&2
23 |       exit 1
24 |       ;;
25 |   esac
26 | done
27 | 
28 | # Number of free GPUs on a machine
29 | n_gpus=$(lspci | grep -i "nvidia" | grep -c -v "Audio")
30 | 
31 | # Return -1 if there are no GPUs on the machine
32 | # or if the requested number of GPUs exceed
33 | # the number of GPUs installed.
34 | if [ ${n_gpus} -eq 0 ] || [ ${req_gpus} -gt ${n_gpus} ]; then
35 |   echo "-1"
36 |   exit 1
37 | fi
38 | 
39 | # shellcheck disable=SC2026
40 | f_gpu=$(nvidia-smi | sed -e '1,/Processes/d' \
41 |   | tail -n+3 | head -n-1 | awk '{print $2}' \
42 |   | awk -v ng=${n_gpus} 'BEGIN{for (n=0;n<ng;++n){g[n] = 1}} {delete g[$1];} END{for (i in g) print i}' \
43 |   | tail -n ${req_gpus})
44 | 
45 | # return -1 if not enough free GPUs were found
46 | if [[ $(echo ${f_gpu} | grep -v '^$' | wc -w) -ne ${req_gpus} ]]; then
47 |   echo "-1"
48 |   exit 1
49 | else
50 |   echo ${f_gpu} | sed 's: :,:g'
51 | fi
52 | 


--------------------------------------------------------------------------------
/utils/generate_wav.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # Copyright 2018 Nagoya University (Tomoki Hayashi)
 4 | #  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)
 5 | 
 6 | # Begin configuration section.
 7 | nj=2
 8 | fs=22050
 9 | n_fft=1024
10 | n_shift=256
11 | cmd=run.pl
12 | help_message=$(cat <<EOF
13 | Usage:
14 |   $0 [options] <model-path> <data-dir> [<log-dir> [<fbank-dir>] ]
15 | Example:
16 |   $0 ljspeech.wavenet.ns.v1/checkpoint-1000000.pkl data/train exp/wavenet_vocoder/train wav
17 | Note:
18 |   <log-dir> defaults to <data-dir>/log, and <fbank-dir> defaults to <data-dir>/data
19 | Options:
20 |   --nj <nj>             # number of parallel jobs
21 |   --fs <fs>             # sampling rate (default=22050)
22 |   --n_fft <n_fft>       # number of FFT points (default=1024)
23 |   --n_shift <n_shift>   # shift size in point (default=256)
24 |   --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs.
25 | EOF
26 | )
27 | # End configuration section.
28 | 
29 | echo "$0 $*"  # Print the command line for logging
30 | 
31 | . parse_options.sh || exit 1;
32 | 
33 | if [ $# -lt 2 ] || [ $# -gt 4 ]; then
34 |     echo "${help_message}"
35 |     exit 1;
36 | fi
37 | 
38 | model=$1
39 | data=$2
40 | if [ $# -ge 3 ]; then
41 |   logdir=$3
42 | else
43 |   logdir=${data}/log
44 | fi
45 | if [ $# -ge 4 ]; then
46 |   wavdir=$4
47 | else
48 |   wavdir=${data}/data
49 | fi
50 | 
51 | # use "name" as part of name of the archive.
52 | name=$(basename ${data})
53 | 
54 | mkdir -p ${wavdir} || exit 1;
55 | mkdir -p ${logdir} || exit 1;
56 | 
57 | scp=${data}/feats.scp
58 | 
59 | split_scps=""
60 | for n in $(seq ${nj}); do
61 |     split_scps="$split_scps $logdir/feats.${n}.scp"
62 | done
63 | 
64 | utils/split_scp.pl ${scp} ${split_scps} || exit 1;
65 | 
66 | ${cmd} JOB=1:${nj} ${logdir}/generate_with_wavenet_${name}.JOB.log \
67 |     generate_wav_from_fbank.py \
68 |         --model ${model} \
69 |         --fs ${fs} \
70 |         --n_fft ${n_fft} \
71 |         --n_shift ${n_shift} \
72 |         scp:${logdir}/feats.JOB.scp \
73 |         ${wavdir}
74 | 
75 | rm ${logdir}/feats.*.scp 2>/dev/null
76 | 
77 | echo "Succeeded creating wav for ${name}"
78 | 


--------------------------------------------------------------------------------
/utils/get_yaml.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | import argparse
 3 | 
 4 | import yaml
 5 | 
 6 | 
 7 | def get_parser():
 8 |     parser = argparse.ArgumentParser(
 9 |         description="get a specified attribute from a YAML file",
10 |         formatter_class=argparse.ArgumentDefaultsHelpFormatter,
11 |     )
12 |     parser.add_argument("inyaml")
13 |     parser.add_argument(
14 |         "attr", help='foo.bar will access yaml.load(inyaml)["foo"]["bar"]'
15 |     )
16 |     return parser
17 | 
18 | 
19 | def main():
20 |     args = get_parser().parse_args()
21 |     with open(args.inyaml, "r") as f:
22 |         indict = yaml.load(f, Loader=yaml.Loader)
23 | 
24 |     try:
25 |         for attr in args.attr.split("."):
26 |             if attr.isdigit():
27 |                 attr = int(attr)
28 |             indict = indict[attr]
29 |         print(indict)
30 |     except KeyError:
31 |         # print nothing
32 |         # sys.exit(1)
33 |         pass
34 | 
35 | 
36 | if __name__ == "__main__":
37 |     main()
38 | 


--------------------------------------------------------------------------------
/utils/json2sctm.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/python
  2 | # -*- coding: utf-8 -*-
  3 | 
  4 | import argparse
  5 | import os
  6 | import subprocess
  7 | import sys
  8 | 
  9 | 
 10 | is_python2 = sys.version_info[0] == 2
 11 | 
 12 | 
 13 | def get_parser():
 14 |     parser = argparse.ArgumentParser(description="convert json to sctm")
 15 |     parser.add_argument("json", type=str, default=None, nargs="?", help="input trn")
 16 |     parser.add_argument("dict", type=str, help="dict")
 17 |     parser.add_argument(
 18 |         "--num-spkrs", type=int, default=1, nargs="?", help="number of speakers"
 19 |     )
 20 |     parser.add_argument("--refs", type=str, nargs="*", help="ref for all speakers")
 21 |     parser.add_argument("--hyps", type=str, nargs="*", help="hyp for all outputs")
 22 |     parser.add_argument("--orig-stm", type=str, nargs="?", help="orig stm")
 23 |     parser.add_argument("--stm", type=str, default=None, nargs="+", help="output stm")
 24 |     parser.add_argument("--ctm", type=str, default=None, nargs="+", help="output ctm")
 25 |     parser.add_argument(
 26 |         "--bpe", type=str, default=None, nargs="?", help="BPE model if applicable"
 27 |     )
 28 |     return parser
 29 | 
 30 | 
 31 | def main(args):
 32 |     from utils import json2trn
 33 |     from utils import trn2ctm
 34 |     from utils import trn2stm
 35 | 
 36 |     parser = get_parser()
 37 |     args = parser.parse_args(args)
 38 |     if args.refs is None:
 39 |         refs = ["ref_tmp.trn"]
 40 |         del_ref = True
 41 |     else:
 42 |         refs = args.refs
 43 |         del_ref = False
 44 |     if args.hyps is None:
 45 |         hyps = ["hyp_tmp.trn"]
 46 |         del_hyp = True
 47 |     else:
 48 |         hyps = args.hyps
 49 |         del_hyp = False
 50 |     json2trn.convert(args.json, args.dict, refs, hyps, args.num_spkrs)
 51 |     for trn in refs + hyps:
 52 |         # We don't remove non-lang-syms because kaldi already removes them when scoring
 53 |         call_args = ["sed", "-i.bak2", "-r", "s/<blank> //g", trn]
 54 |         subprocess.check_call(call_args)
 55 |         if args.bpe is not None:
 56 |             with open(wrd_name(trn), "w") as out:
 57 |                 with open(trn, "r") as spm_in:
 58 |                     sed_args = ["sed", "-e", "s/▁/ /g"]
 59 |                     sed = subprocess.Popen(sed_args, stdout=out, stdin=subprocess.PIPE)
 60 |                     spm_args = [
 61 |                         "spm_decode",
 62 |                         "--model=" + args.bpe,
 63 |                         "--input_format=piece",
 64 |                     ]
 65 |                     subprocess.Popen(spm_args, stdin=spm_in)
 66 |                     sed.communicate()
 67 |         else:
 68 |             call_args = [
 69 |                 "sed",
 70 |                 "-e",
 71 |                 "s/ //g",
 72 |                 "-e",
 73 |                 "s/(/ (/",
 74 |                 "-e",
 75 |                 "s/<space>/ /g",
 76 |                 trn,
 77 |             ]
 78 |             with open(wrd_name(trn), "w") as out:
 79 |                 sed = subprocess.Popen(call_args, stdout=out)
 80 |                 sed.communicate()
 81 |     for trn, stm in zip(refs, args.stm):
 82 |         trn2stm.convert(wrd_name(trn), stm, args.orig_stm)
 83 |     if del_ref:
 84 |         os.remove(refs[0])
 85 |         os.remove(refs[0] + ".bak2")
 86 |         os.remove(wrd_name(refs[0]))
 87 | 
 88 |     for trn, ctm in zip(hyps, args.ctm):
 89 |         trn2ctm.convert(wrd_name(trn), ctm)
 90 |     if del_hyp:
 91 |         os.remove(hyps[0])
 92 |         os.remove(hyps[0] + ".bak2")
 93 |         os.remove(wrd_name(hyps[0]))
 94 | 
 95 | 
 96 | def wrd_name(trn):
 97 |     split = trn.split(".")
 98 |     return ".".join(split[:-1]) + ".wrd." + split[-1]
 99 | 
100 | 
101 | if __name__ == "__main__":
102 |     main(sys.argv[1:])
103 | 


--------------------------------------------------------------------------------
/utils/json2text.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | # encoding: utf-8
 3 | 
 4 | # Copyright 2017 Johns Hopkins University (Shinji Watanabe)
 5 | #  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)
 6 | from __future__ import unicode_literals
 7 | 
 8 | import argparse
 9 | import codecs
10 | import json
11 | import logging
12 | 
13 | from espnet.utils.cli_utils import get_commandline_args
14 | 
15 | 
16 | def get_parser():
17 |     parser = argparse.ArgumentParser(
18 |         description="convert ASR recognized json to text",
19 |         formatter_class=argparse.ArgumentDefaultsHelpFormatter,
20 |     )
21 |     parser.add_argument("json", type=str, help="json files")
22 |     parser.add_argument("dict", type=str, help="dict")
23 |     parser.add_argument("ref", type=str, help="ref")
24 |     parser.add_argument("hyp", type=str, help="hyp")
25 |     return parser
26 | 
27 | 
28 | if __name__ == "__main__":
29 |     args = get_parser().parse_args()
30 | 
31 |     # logging info
32 |     logfmt = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
33 |     logging.basicConfig(level=logging.INFO, format=logfmt)
34 |     logging.info(get_commandline_args())
35 | 
36 |     logging.info("reading %s", args.json)
37 |     with codecs.open(args.json, "r", encoding="utf-8") as f:
38 |         j = json.load(f)
39 | 
40 |     logging.info("reading %s", args.dict)
41 |     with codecs.open(args.dict, "r", encoding="utf-8") as f:
42 |         dictionary = f.readlines()
43 |     char_list = [entry.split(" ")[0] for entry in dictionary]
44 |     char_list.insert(0, "<blank>")
45 |     char_list.append("<eos>")
46 |     # print([x.encode('utf-8') for x in char_list])
47 | 
48 |     logging.info("writing hyp trn to %s", args.hyp)
49 |     logging.info("writing ref trn to %s", args.ref)
50 |     h = codecs.open(args.hyp, "w", encoding="utf-8")
51 |     r = codecs.open(args.ref, "w", encoding="utf-8")
52 | 
53 |     for x in j["utts"]:
54 |         seq = [
55 |             char_list[int(i)] for i in j["utts"][x]["output"][0]["rec_tokenid"].split()
56 |         ]
57 |         h.write(x + " " + " ".join(seq).replace("<eos>", "") + "\n")
58 | 
59 |         if "tokenid" in j["utts"][x]["output"][0].keys():
60 |             seq = [
61 |                 char_list[int(i)] for i in j["utts"][x]["output"][0]["tokenid"].split()
62 |             ]
63 |             r.write(x + " " + " ".join(seq).replace("<eos>", "") + "\n")
64 | 


--------------------------------------------------------------------------------
/utils/json2trn.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | # encoding: utf-8
  3 | 
  4 | # Copyright 2017 Johns Hopkins University (Shinji Watanabe)
  5 | #           2018 Xuankai Chang (Shanghai Jiao Tong University)
  6 | #  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)
  7 | 
  8 | import argparse
  9 | import codecs
 10 | import json
 11 | import logging
 12 | import sys
 13 | 
 14 | from espnet.utils.cli_utils import get_commandline_args
 15 | 
 16 | 
 17 | def get_parser():
 18 |     parser = argparse.ArgumentParser(
 19 |         description="convert a json to a transcription file with a token dictionary",
 20 |         formatter_class=argparse.ArgumentDefaultsHelpFormatter,
 21 |     )
 22 |     parser.add_argument("json", type=str, help="json files")
 23 |     parser.add_argument("dict", type=str, help="dict")
 24 |     parser.add_argument("--num-spkrs", type=int, default=1, help="number of speakers")
 25 |     parser.add_argument("--refs", type=str, nargs="+", help="ref for all speakers")
 26 |     parser.add_argument("--hyps", type=str, nargs="+", help="hyp for all outputs")
 27 |     return parser
 28 | 
 29 | 
 30 | def main(args):
 31 |     args = get_parser().parse_args(args)
 32 |     convert(args.json, args.dict, args.refs, args.hyps, args.num_spkrs)
 33 | 
 34 | 
 35 | def convert(jsonf, dic, refs, hyps, num_spkrs=1):
 36 |     n_ref = len(refs)
 37 |     n_hyp = len(hyps)
 38 |     assert n_ref == n_hyp
 39 |     assert n_ref == num_spkrs
 40 | 
 41 |     # logging info
 42 |     logfmt = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
 43 |     logging.basicConfig(level=logging.INFO, format=logfmt)
 44 |     logging.info(get_commandline_args())
 45 | 
 46 |     logging.info("reading %s", jsonf)
 47 |     with codecs.open(jsonf, "r", encoding="utf-8") as f:
 48 |         j = json.load(f)
 49 | 
 50 |     logging.info("reading %s", dic)
 51 |     with codecs.open(dic, "r", encoding="utf-8") as f:
 52 |         dictionary = f.readlines()
 53 |     char_list = [entry.split(" ")[0] for entry in dictionary]
 54 |     char_list.insert(0, "<blank>")
 55 |     char_list.append("<eos>")
 56 | 
 57 |     for ns in range(num_spkrs):
 58 |         hyp_file = codecs.open(hyps[ns], "w", encoding="utf-8")
 59 |         ref_file = codecs.open(refs[ns], "w", encoding="utf-8")
 60 | 
 61 |         for x in j["utts"]:
 62 |             # recognition hypothesis
 63 |             if num_spkrs == 1:
 64 |                 seq = [
 65 |                     char_list[int(i)]
 66 |                     for i in j["utts"][x]["output"][0]["rec_tokenid"].split()
 67 |                 ]
 68 |             else:
 69 |                 seq = [
 70 |                     char_list[int(i)]
 71 |                     for i in j["utts"][x]["output"][ns][0]["rec_tokenid"].split()
 72 |                 ]
 73 |             # In the recognition hypothesis,
 74 |             # the <eos> symbol is usually attached in the last part of the sentence
 75 |             # and it is removed below.
 76 |             hyp_file.write(" ".join(seq).replace("<eos>", "")),
 77 |             hyp_file.write(
 78 |                 " (" + j["utts"][x]["utt2spk"].replace("-", "_") + "-" + x + ")\n"
 79 |             )
 80 | 
 81 |             # reference
 82 |             if num_spkrs == 1:
 83 |                 seq = j["utts"][x]["output"][0]["token"]
 84 |             else:
 85 |                 seq = j["utts"][x]["output"][ns][0]["token"]
 86 |             # Unlike the recognition hypothesis,
 87 |             # the reference is directly generated from a token without dictionary
 88 |             # to avoid to include <unk> symbols in the reference to make scoring normal.
 89 |             # The detailed discussion can be found at
 90 |             # https://github.com/espnet/espnet/issues/993
 91 |             ref_file.write(
 92 |                 seq + " (" + j["utts"][x]["utt2spk"].replace("-", "_") + "-" + x + ")\n"
 93 |             )
 94 | 
 95 |         hyp_file.close()
 96 |         ref_file.close()
 97 | 
 98 | 
 99 | if __name__ == "__main__":
100 |     main(sys.argv[1:])
101 | 


--------------------------------------------------------------------------------
/utils/json2trn_wo_dict.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | # encoding: utf-8
 3 | 
 4 | # Copyright 2019 Okayama University (Katsuki Inoue)
 5 | #  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)
 6 | 
 7 | import argparse
 8 | import codecs
 9 | import json
10 | import logging
11 | import sys
12 | 
13 | from espnet.utils.cli_utils import get_commandline_args
14 | 
15 | 
16 | def get_parser():
17 |     parser = argparse.ArgumentParser(
18 |         description="convert a json to a transcription file with a token dictionary",
19 |         formatter_class=argparse.ArgumentDefaultsHelpFormatter,
20 |     )
21 |     parser.add_argument("json", type=str, help="json files")
22 |     parser.add_argument("--num-spkrs", type=int, default=1, help="number of speakers")
23 |     parser.add_argument("--refs", type=str, nargs="+", help="ref for all speakers")
24 |     parser.add_argument("--hyps", type=str, nargs="+", help="hyp for all outputs")
25 |     return parser
26 | 
27 | 
28 | def main(args):
29 |     args = get_parser().parse_args(args)
30 |     convert(args.json, args.refs, args.hyps, args.num_spkrs)
31 | 
32 | 
33 | def convert(jsonf, refs, hyps, num_spkrs=1):
34 |     n_ref = len(refs)
35 |     n_hyp = len(hyps)
36 |     assert n_ref == n_hyp
37 |     assert n_ref == num_spkrs
38 | 
39 |     # logging info
40 |     logfmt = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
41 |     logging.basicConfig(level=logging.INFO, format=logfmt)
42 |     logging.info(get_commandline_args())
43 | 
44 |     logging.info("reading %s", jsonf)
45 |     with codecs.open(jsonf, "r", encoding="utf-8") as f:
46 |         j = json.load(f)
47 | 
48 |     for ns in range(num_spkrs):
49 |         hyp_file = codecs.open(hyps[ns], "w", encoding="utf-8")
50 |         ref_file = codecs.open(refs[ns], "w", encoding="utf-8")
51 | 
52 |         for x in j["utts"]:
53 |             # recognition hypothesis
54 |             if num_spkrs == 1:
55 |                 seq = j["utts"][x]["output"][0]["rec_text"].replace("<eos>", "")
56 |             else:
57 |                 seq = j["utts"][x]["output"][ns][0]["rec_text"].replace("<eos>", "")
58 |             # In the recognition hypothesis,
59 |             # the <eos> symbol is usually attached in the last part of the sentence
60 |             # and it is removed below.
61 |             hyp_file.write(seq)
62 |             hyp_file.write(" (" + x.replace("-", "_") + ")\n")
63 | 
64 |             # reference
65 |             if num_spkrs == 1:
66 |                 seq = j["utts"][x]["output"][0]["text"]
67 |             else:
68 |                 seq = j["utts"][x]["output"][ns][0]["text"]
69 |             # Unlike the recognition hypothesis,
70 |             # the reference is directly generated from a token without dictionary
71 |             # to avoid to include <unk> symbols in the reference to make scoring normal.
72 |             # The detailed discussion can be found at
73 |             # https://github.com/espnet/espnet/issues/993
74 |             ref_file.write(seq + " (" + x.replace("-", "_") + ")\n")
75 | 
76 |         hyp_file.close()
77 |         ref_file.close()
78 | 
79 | 
80 | if __name__ == "__main__":
81 |     main(sys.argv[1:])
82 | 


--------------------------------------------------------------------------------
/utils/merge_data.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python2
 2 | # encoding: utf-8
 3 | 
 4 | import argparse
 5 | import json
 6 | import os
 7 | 
 8 | if __name__ == '__main__':
 9 |     parser = argparse.ArgumentParser()
10 |     parser.add_argument('--parts', '-p', type=int, required=True,
11 |                         help='Number of subparts to be merged')
12 |     parser.add_argument('--result-dir', type=str, required=True,
13 |                         help='json file dir')
14 |     parser.add_argument('--result-label', type=str, required=True,
15 |                         help='path to save json file')
16 |     parser.add_argument('--offset', type=int, default=0,
17 |                         help='offset of file name')
18 |     args = parser.parse_args()
19 | 
20 |     new_js = {}
21 |     for i in range(args.parts):
22 |         file_name = os.path.join(args.result_dir, 'data.{}.json'.format(i + args.offset))
23 |         if not os.path.exists(file_name):
24 |             continue
25 |         js = json.load(open(file_name, 'rb'))['utts']
26 |         new_js.update(js)
27 | 
28 |     print('After merged:', len(new_js))
29 |     with open(args.result_label, 'wb') as f:
30 |         f.write(json.dumps({'utts': new_js}, indent=4, sort_keys=True).encode('utf_8'))
31 | 


--------------------------------------------------------------------------------
/utils/mix-mono-wav-scp.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | import argparse
 3 | import io
 4 | import sys
 5 | 
 6 | PY2 = sys.version_info[0] == 2
 7 | 
 8 | if PY2:
 9 |     from itertools import izip_longest as zip_longest
10 | else:
11 |     from itertools import zip_longest
12 | 
13 | 
14 | def get_parser():
15 |     parser = argparse.ArgumentParser(
16 |         formatter_class=argparse.ArgumentDefaultsHelpFormatter,
17 |         description="Mixing wav.scp files into a multi-channel wav.scp " "using sox.",
18 |     )
19 |     parser.add_argument("scp", type=str, nargs="+", help="Give wav.scp")
20 |     parser.add_argument(
21 |         "out",
22 |         nargs="?",
23 |         type=argparse.FileType("w"),
24 |         default=sys.stdout,
25 |         help="The output filename. " "If omitted, then output to sys.stdout",
26 |     )
27 |     return parser
28 | 
29 | 
30 | def main():
31 |     parser = get_parser()
32 |     args = parser.parse_args()
33 | 
34 |     fscps = [io.open(scp, "r", encoding="utf-8") for scp in args.scp]
35 |     for linenum, lines in enumerate(zip_longest(*fscps)):
36 |         keys = []
37 |         wavs = []
38 | 
39 |         for line, scp in zip(lines, args.scp):
40 |             if line is None:
41 |                 raise RuntimeError("Numbers of line mismatch")
42 | 
43 |             sps = line.split(" ", 1)
44 |             if len(sps) != 2:
45 |                 raise RuntimeError(
46 |                     'Invalid line is found: {}, line {}: "{}" '.format(
47 |                         scp, linenum, line
48 |                     )
49 |                 )
50 |             key, wav = sps
51 |             keys.append(key)
52 |             wavs.append(wav.strip())
53 | 
54 |         if not all(k == keys[0] for k in keys):
55 |             raise RuntimeError(
56 |                 "The ids mismatch. Hint; the input files must be "
57 |                 "sorted and must have same ids: {}".format(keys)
58 |             )
59 | 
60 |         args.out.write(
61 |             "{} sox -M {} -c {} -t wav - |\n".format(
62 |                 keys[0], " ".join("{}".format(w) for w in wavs), len(fscps)
63 |             )
64 |         )
65 | 
66 | 
67 | if __name__ == "__main__":
68 |     main()
69 | 


--------------------------------------------------------------------------------
/utils/pack_model.sh:
--------------------------------------------------------------------------------
  1 | #!/bin/bash
  2 | 
  3 | # Copyright 2019 Johns Hopkins University (Shinji Watanabe)
  4 | #  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)
  5 | 
  6 | [ -f ./path.sh ] && . ./path.sh
  7 | 
  8 | results=""
  9 | # e.g., "exp/tr_it_pytorch_train/decode_dt_it_decode/result.wrd.txt
 10 | #        exp/tr_it_pytorch_train/decode_et_it_decode/result.wrd.txt"'
 11 | lm=""
 12 | dict=""
 13 | etc=""
 14 | outfile="model"
 15 | preprocess_conf=""
 16 | 
 17 | help_message=$(cat <<EOF
 18 | Usage: $0 --lm <lm> --dict <dict> <tr_conf> <dec_conf> <cmvn> <e2e>, for example:
 19 | <lm>:       exp/train_rnnlm/rnnlm.model.best
 20 | <dict>:     data/lang_char
 21 | <tr_conf>:  conf/train.yaml
 22 | <dec_conf>: conf/decode.yaml
 23 | <cmvn>:     data/tr_it/cmvn.ark
 24 | <e2e>:      exp/tr_it_pytorch_train/results/model.last10.avg.best
 25 | EOF
 26 | )
 27 | . utils/parse_options.sh
 28 | 
 29 | if [ $# != 4 ]; then
 30 |     echo "${help_message}"
 31 |     exit 1
 32 | fi
 33 | 
 34 | tr_conf=$1
 35 | dec_conf=$2
 36 | cmvn=$3
 37 | e2e=$4
 38 | 
 39 | echo "  - Model files (archived to ${outfile}.tar.gz by \`\$ pack_model.sh\`)"
 40 | echo "    - model link: (put the model link manually. please contact Shinji Watanabe <shinjiw@ieee.org> if you want a web storage to put your files)"
 41 | 
 42 | # configs
 43 | if [ -e ${tr_conf} ]; then
 44 |     tar cfh ${outfile}.tar ${tr_conf}
 45 |     echo -n "    - training config file: \`"
 46 |     echo ${tr_conf} | sed -e "s/$/\`/"
 47 | else
 48 |     echo "missing ${tr_conf}"
 49 |     exit 1
 50 | fi
 51 | if [ -e ${dec_conf} ]; then
 52 |     tar rfh ${outfile}.tar ${dec_conf}
 53 |     echo -n "    - decoding config file: \`"
 54 |     echo ${dec_conf} | sed -e "s/$/\`/"
 55 | else
 56 |     echo "missing ${dec_conf}"
 57 |     exit 1
 58 | fi
 59 | # NOTE(kan-bayashi): preprocess conf is optional
 60 | if [ -n "${preprocess_conf}" ]; then
 61 |     tar rfh ${outfile}.tar ${preprocess_conf}
 62 |     echo -n "    - preprocess config file: \`"
 63 |     echo ${preprocess_conf} | sed -e "s/$/\`/"
 64 | fi
 65 | 
 66 | # cmvn
 67 | if [ -e ${cmvn} ]; then
 68 |     tar rfh ${outfile}.tar ${cmvn}
 69 |     echo -n "    - cmvn file: \`"
 70 |     echo ${cmvn} | sed -e "s/$/\`/"
 71 | else
 72 |     echo "missing ${cmvn}"
 73 |     exit 1
 74 | fi
 75 | 
 76 | # e2e
 77 | if [ -e ${e2e} ]; then
 78 |     tar rfh ${outfile}.tar ${e2e}
 79 |     echo -n "    - e2e file: \`"
 80 |     echo ${e2e} | sed -e "s/$/\`/"
 81 | 
 82 |     e2e_conf=$(dirname ${e2e})/model.json
 83 |     if [ ! -e ${e2e_conf} ]; then
 84 | 	echo missing ${e2e_conf}
 85 | 	exit 1
 86 |     else
 87 | 	echo -n "    - e2e JSON file: \`"
 88 | 	echo ${e2e_conf} | sed -e "s/$/\`/"
 89 | 	tar rfh ${outfile}.tar ${e2e_conf}
 90 |     fi
 91 | else
 92 |     echo "missing ${e2e}"
 93 |     exit 1
 94 | fi
 95 | 
 96 | # lm
 97 | if [ -n "${lm}" ]; then
 98 |     if [ -e ${lm} ]; then
 99 | 	tar rfh ${outfile}.tar ${lm}
100 | 	echo -n "    - lm file: \`"
101 | 	echo ${lm} | sed -e "s/$/\`/"
102 | 
103 | 	lm_conf=$(dirname ${lm})/model.json
104 | 	if [ ! -e ${lm_conf} ]; then
105 | 	    echo missing ${lm_conf}
106 | 	    exit 1
107 | 	else
108 | 	    echo -n "    - lm JSON file: \`"
109 | 	    echo ${lm_conf} | sed -e "s/$/\`/"
110 | 	    tar rfh ${outfile}.tar ${lm_conf}
111 | 	fi
112 |     else
113 | 	echo "missing ${lm}"
114 | 	exit 1
115 |     fi
116 | fi
117 | 
118 | # dict
119 | if [ -n "${dict}" ]; then
120 |     if [ -e ${dict} ]; then
121 | 	tar rfh ${outfile}.tar ${dict}
122 | 	echo -n "    - dict file: \`"
123 | 	echo ${dict} | sed -e "s/$/\`/"
124 |     else
125 | 	echo "missing ${dict}"
126 | 	exit 1
127 |     fi
128 | fi
129 | 
130 | # etc
131 | for x in ${etc}; do
132 |     if [ -e ${x} ]; then
133 | 	tar rfh ${outfile}.tar ${x}
134 | 	echo -n "    - etc file: \`"
135 | 	echo ${x} | sed -e "s/$/\`/"
136 |     else
137 | 	echo "missing ${x}"
138 | 	exit 1
139 |     fi
140 | done
141 | 
142 | # finally compress the tar file
143 | gzip -f ${outfile}.tar
144 | 
145 | # results
146 | if [ -n "${results}" ]; then
147 |     echo "  - Results (paste them by yourself or obtained by \`\$ pack_model.sh --results <results>\`)"
148 |     echo "\`\`\`"
149 | fi
150 | for x in ${results}; do
151 |     if [ -e ${x} ]; then
152 | 	echo "${x}"
153 | 	grep -e Avg -e SPKR -m 2 ${x}
154 |     else
155 | 	echo "missing ${x}"
156 | 	exit 1
157 |     fi
158 | done
159 | if [ -n "${results}" ]; then
160 |     echo "\`\`\`"
161 | fi
162 | 
163 | exit 0
164 | 


--------------------------------------------------------------------------------
/utils/reduce_data_dir.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # koried, 10/29/2012
 4 | 
 5 | # Reduce a data set based on a list of turn-ids
 6 | 
 7 | help_message="usage: $0 srcdir turnlist destdir"
 8 | 
 9 | if [ $1 == "--help" ]; then
10 |     echo "${help_message}"
11 |     exit 0;
12 | fi
13 | 
14 | if [ $# != 3 ]; then
15 |     echo "${help_message}"
16 |     exit 1;
17 | fi
18 | 
19 | srcdir=$1
20 | reclist=$2
21 | destdir=$3
22 | 
23 | if [ ! -f ${srcdir}/utt2spk ]; then
24 | echo "$0: no such file $srcdir/utt2spk"
25 | exit 1;
26 | fi
27 | 
28 | function do_filtering {
29 | # assumes the utt2spk and spk2utt files already exist.
30 | 	[ -f ${srcdir}/feats.scp ] && utils/filter_scp.pl ${destdir}/utt2spk <${srcdir}/feats.scp >${destdir}/feats.scp
31 | 	[ -f ${srcdir}/wav.scp ] && utils/filter_scp.pl ${destdir}/utt2spk <${srcdir}/wav.scp >${destdir}/wav.scp
32 | 	[ -f ${srcdir}/text ] && utils/filter_scp.pl ${destdir}/utt2spk <${srcdir}/text >${destdir}/text
33 | 	[ -f ${srcdir}/utt2num_frames ] && utils/filter_scp.pl ${destdir}/utt2spk <${srcdir}/utt2num_frames >${destdir}/utt2num_frames
34 | 	[ -f ${srcdir}/spk2gender ] && utils/filter_scp.pl ${destdir}/spk2utt <${srcdir}/spk2gender >${destdir}/spk2gender
35 | 	[ -f ${srcdir}/cmvn.scp ] && utils/filter_scp.pl ${destdir}/spk2utt <${srcdir}/cmvn.scp >${destdir}/cmvn.scp
36 | 	if [ -f ${srcdir}/segments ]; then
37 | 		utils/filter_scp.pl ${destdir}/utt2spk <${srcdir}/segments >${destdir}/segments
38 | 		awk '{print $2;}' ${destdir}/segments | sort | uniq > ${destdir}/reco # recordings.
39 | 		# The next line would override the command above for wav.scp, which would be incorrect.
40 | 		[ -f ${srcdir}/wav.scp ] && utils/filter_scp.pl ${destdir}/reco <${srcdir}/wav.scp >${destdir}/wav.scp
41 | 		[ -f ${srcdir}/reco2file_and_channel ] && \
42 | 			utils/filter_scp.pl ${destdir}/reco <${srcdir}/reco2file_and_channel >${destdir}/reco2file_and_channel
43 | 		
44 | 		# Filter the STM file for proper sclite scoring (this will also remove the comments lines)
45 | 		[ -f ${srcdir}/stm ] && utils/filter_scp.pl ${destdir}/reco < ${srcdir}/stm > ${destdir}/stm
46 | 		rm ${destdir}/reco
47 | 	fi
48 | 	srcutts=$(wc -l < ${srcdir}/utt2spk)
49 | 	destutts=$(wc -l < ${destdir}/utt2spk)
50 | 	echo "Reduced #utt from $srcutts to $destutts"
51 | }
52 | 
53 | mkdir -p ${destdir}
54 | 
55 | # filter the utt2spk based on the set of recordings
56 | utils/filter_scp.pl ${reclist} < ${srcdir}/utt2spk > ${destdir}/utt2spk
57 | 
58 | utils/utt2spk_to_spk2utt.pl < ${destdir}/utt2spk > ${destdir}/spk2utt
59 | do_filtering;
60 | 


--------------------------------------------------------------------------------
/utils/remove_longshortdata.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # Copyright 2017 Johns Hopkins University (Shinji Watanabe)
 4 | #  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)
 5 | 
 6 | . ./path.sh
 7 | 
 8 | maxframes=2000
 9 | minframes=10
10 | maxchars=200
11 | minchars=0
12 | nlsyms=""
13 | no_feat=false
14 | 
15 | help_message="usage: $0 olddatadir newdatadir"
16 | 
17 | . utils/parse_options.sh || exit 1;
18 | 
19 | if [ $# != 2 ]; then
20 |     echo "${help_message}"
21 |     exit 1;
22 | fi
23 | 
24 | sdir=$1
25 | odir=$2
26 | mkdir -p ${odir}/tmp
27 | 
28 | if [ ${no_feat} = true ]; then
29 |     # for machine translation
30 |     cut -d' ' -f 1 ${sdir}/text > ${odir}/tmp/reclist1
31 | else
32 |     echo "extract utterances having less than $maxframes or more than $minframes frames"
33 |     utils/data/get_utt2num_frames.sh ${sdir}
34 |     < ${sdir}/utt2num_frames  awk -v maxframes="$maxframes" '{ if ($2 < maxframes) print }' \
35 |         | awk -v minframes="$minframes" '{ if ($2 > minframes) print }' \
36 |         | awk '{print $1}' > ${odir}/tmp/reclist1
37 | fi
38 | 
39 | echo "extract utterances having less than $maxchars or more than $minchars characters"
40 | # counting number of chars. Use (NF - 1) instead of NF to exclude the utterance ID column
41 | if [ -z ${nlsyms} ]; then
42 | text2token.py -s 1 -n 1 ${sdir}/text \
43 |     | awk -v maxchars="$maxchars" '{ if (NF - 1 < maxchars) print }' \
44 |     | awk -v minchars="$minchars" '{ if (NF - 1 > minchars) print }' \
45 |     | awk '{print $1}' > ${odir}/tmp/reclist2
46 | else
47 | text2token.py -l ${nlsyms} -s 1 -n 1 ${sdir}/text \
48 |     | awk -v maxchars="$maxchars" '{ if (NF - 1 < maxchars) print }' \
49 |     | awk -v minchars="$minchars" '{ if (NF - 1 > minchars) print }' \
50 |     | awk '{print $1}' > ${odir}/tmp/reclist2
51 | fi
52 | 
53 | # extract common lines
54 | comm -12 <(sort ${odir}/tmp/reclist1) <(sort ${odir}/tmp/reclist2) > ${odir}/tmp/reclist
55 | 
56 | reduce_data_dir.sh ${sdir} ${odir}/tmp/reclist ${odir}
57 | utils/fix_data_dir.sh ${odir}
58 | 
59 | oldnum=$(wc -l ${sdir}/feats.scp | awk '{print $1}')
60 | newnum=$(wc -l ${odir}/feats.scp | awk '{print $1}')
61 | echo "change from $oldnum to $newnum"
62 | 


--------------------------------------------------------------------------------
/utils/result2json.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | # encoding: utf-8
 3 | 
 4 | # Copyright 2017 Johns Hopkins University (Shinji Watanabe)
 5 | #           2018 Xuankai Chang (Shanghai Jiao Tong University)
 6 | #  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)
 7 | from __future__ import print_function
 8 | from __future__ import unicode_literals
 9 | 
10 | import argparse
11 | import codecs
12 | import json
13 | import re
14 | import sys
15 | 
16 | is_python2 = sys.version_info[0] == 2
17 | 
18 | 
19 | def get_parser():
20 |     parser = argparse.ArgumentParser(
21 |         description="convert sclite's result.txt file to json"
22 |     )
23 |     parser.add_argument("--key", "-k", type=str, help="key")
24 |     return parser
25 | 
26 | 
27 | if __name__ == "__main__":
28 |     parser = get_parser()
29 |     args = parser.parse_args()
30 | 
31 |     key = re.findall(r"r\d+h\d+", args.key)[0]
32 | 
33 |     re_id = r"^id: "
34 |     re_strings = {
35 |         "Speaker": r"^Speaker sentences",
36 |         "Scores": r"^Scores: ",
37 |         "REF": r"^REF: ",
38 |         "HYP": r"^HYP: ",
39 |     }
40 |     re_id = re.compile(re_id)
41 |     re_patterns = {}
42 |     for p in re_strings.keys():
43 |         re_patterns[p] = re.compile(re_strings[p])
44 | 
45 |     ret = {}
46 |     tmp_id = None
47 |     tmp_ret = {}
48 | 
49 |     sys.stdin = codecs.getreader("utf-8")(sys.stdin if is_python2 else sys.stdin.buffer)
50 |     sys.stdout = codecs.getwriter("utf-8")(
51 |         sys.stdout if is_python2 else sys.stdout.buffer
52 |     )
53 |     line = sys.stdin.readline()
54 |     while line:
55 |         x = line.rstrip()
56 |         x_split = x.split()
57 | 
58 |         if re_id.match(x):
59 |             if tmp_id:
60 |                 ret[tmp_id] = {key: tmp_ret}
61 |                 tmp_ret = {}
62 |             tmp_id = x_split[1]
63 |         for p in re_patterns.keys():
64 |             if re_patterns[p].match(x):
65 |                 tmp_ret[p] = " ".join(x_split[1:])
66 |         line = sys.stdin.readline()
67 | 
68 |     if tmp_ret != {}:
69 |         ret[tmp_id] = {key: tmp_ret}
70 | 
71 |     all_l = {"utts": ret}
72 |     # ensure "ensure_ascii=False", which is a bug
73 |     jsonstring = json.dumps(
74 |         all_l, indent=4, ensure_ascii=False, sort_keys=True, separators=(",", ": ")
75 |     )
76 |     print(jsonstring)
77 | 


--------------------------------------------------------------------------------
/utils/score_bleu.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # Copyright 2019 Kyoto University (Hirofumi Inaguma)
 4 | #  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)
 5 | 
 6 | export LC_ALL=C
 7 | 
 8 | . ./path.sh
 9 | 
10 | nlsyms=""
11 | bpe=""
12 | bpemodel=""
13 | filter=""
14 | case=lc
15 | set=""
16 | 
17 | . utils/parse_options.sh
18 | 
19 | if [ $# -lt 3 ]; then
20 |     echo "Usage: $0 <decode-dir> <tgt_lang> <dict-tgt> <dict-src>";
21 |     exit 1;
22 | fi
23 | 
24 | dir=$1
25 | tgt_lang=$2
26 | dic_tgt=$3
27 | dic_src=$4
28 | 
29 | concatjson.py ${dir}/data.*.json > ${dir}/data.json
30 | json2trn_mt.py ${dir}/data.json ${dic_tgt} --refs ${dir}/ref.trn.org \
31 |     --hyps ${dir}/hyp.trn.org --srcs ${dir}/src.trn.org --dict-src ${dic_src}
32 | 
33 | # remove uttterance id
34 | perl -pe 's/\([^\)]+\)//g;' ${dir}/ref.trn.org > ${dir}/ref.trn
35 | perl -pe 's/\([^\)]+\)//g;' ${dir}/hyp.trn.org > ${dir}/hyp.trn
36 | perl -pe 's/\([^\)]+\)//g;' ${dir}/src.trn.org > ${dir}/src.trn
37 | 
38 | if [ -n "$bpe" ]; then
39 |     spm_decode --model=${bpemodel} --input_format=piece < ${dir}/ref.trn | sed -e "s/▁/ /g" > ${dir}/ref.wrd.trn
40 |     spm_decode --model=${bpemodel} --input_format=piece < ${dir}/hyp.trn | sed -e "s/▁/ /g" > ${dir}/hyp.wrd.trn
41 |     spm_decode --model=${bpemodel} --input_format=piece < ${dir}/src.trn | sed -e "s/▁/ /g" > ${dir}/src.wrd.trn
42 | else
43 |     sed -e "s/ //g" -e "s/(/ (/" -e "s/<space>/ /g" -e "s/>/> /g" ${dir}/ref.trn > ${dir}/ref.wrd.trn
44 |     sed -e "s/ //g" -e "s/(/ (/" -e "s/<space>/ /g" -e "s/>/> /g" ${dir}/hyp.trn > ${dir}/hyp.wrd.trn
45 |     sed -e "s/ //g" -e "s/(/ (/" -e "s/<space>/ /g" -e "s/>/> /g" ${dir}/src.trn > ${dir}/src.wrd.trn
46 | fi
47 | 
48 | # detokenize
49 | detokenizer.perl -l ${tgt_lang} -q < ${dir}/ref.wrd.trn > ${dir}/ref.wrd.trn.detok
50 | detokenizer.perl -l ${tgt_lang} -q < ${dir}/hyp.wrd.trn > ${dir}/hyp.wrd.trn.detok
51 | detokenizer.perl -l ${tgt_lang} -q < ${dir}/src.wrd.trn > ${dir}/src.wrd.trn.detok
52 | 
53 | # remove language IDs
54 | if [ -n "${nlsyms}" ]; then
55 |     cp ${dir}/ref.wrd.trn.detok ${dir}/ref.wrd.trn.detok.tmp
56 |     cp ${dir}/hyp.wrd.trn.detok ${dir}/hyp.wrd.trn.detok.tmp
57 |     cp ${dir}/src.wrd.trn.detok ${dir}/src.wrd.trn.detok.tmp
58 |     filt.py -v $nlsyms ${dir}/ref.wrd.trn.detok.tmp > ${dir}/ref.wrd.trn.detok
59 |     filt.py -v $nlsyms ${dir}/hyp.wrd.trn.detok.tmp > ${dir}/hyp.wrd.trn.detok
60 |     filt.py -v $nlsyms ${dir}/src.wrd.trn.detok.tmp > ${dir}/src.wrd.trn.detok
61 | fi
62 | if [ -n "${filter}" ]; then
63 |     sed -i.bak3 -f ${filter} ${dir}/hyp.wrd.trn.detok
64 |     sed -i.bak3 -f ${filter} ${dir}/ref.wrd.trn.detok
65 |     sed -i.bak3 -f ${filter} ${dir}/src.wrd.trn.detok
66 | fi
67 | # NOTE: this must be performed after detokenization so that punctuation marks are not removed
68 | 
69 | if [ ${case} = tc ]; then
70 |     echo ${set} > ${dir}/result.tc.txt
71 |     multi-bleu-detok.perl ${dir}/ref.wrd.trn.detok < ${dir}/hyp.wrd.trn.detok >> ${dir}/result.tc.txt
72 |     echo "write a case-sensitive BLEU result in ${dir}/result.tc.txt"
73 |     cat ${dir}/result.tc.txt
74 | else
75 |     echo ${set} > ${dir}/result.lc.txt
76 |     multi-bleu-detok.perl -lc ${dir}/ref.wrd.trn.detok < ${dir}/hyp.wrd.trn.detok > ${dir}/result.lc.txt
77 |     echo "write a case-insensitive BLEU result in ${dir}/result.lc.txt"
78 |     cat ${dir}/result.lc.txt
79 | fi
80 | 
81 | # TODO(hirofumi): add TER & METEOR metrics here
82 | 


--------------------------------------------------------------------------------
/utils/score_sclite_wo_dict.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # Copyright 2019 Okayama University (Katsuki Inoue)
 4 | #  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)
 5 | 
 6 | [ -f ./path.sh ] && . ./path.sh
 7 | 
 8 | wer=false
 9 | num_spkrs=1
10 | help_message="Usage: $0 <data-dir>"
11 | 
12 | . utils/parse_options.sh
13 | 
14 | if [ $# != 1 ]; then
15 |     echo "${help_message}"
16 |     exit 1;
17 | fi
18 | 
19 | dir=$1
20 | 
21 | concatjson.py ${dir}/data.*.json > ${dir}/data.json
22 | 
23 | if [ $num_spkrs -eq 1 ]; then
24 |     json2trn_wo_dict.py ${dir}/data.json --num-spkrs ${num_spkrs} --refs ${dir}/ref_org.wrd.trn --hyps ${dir}/hyp_org.wrd.trn
25 |    
26 |     cat < ${dir}/hyp_org.wrd.trn | sed -e 's/▁//' | sed -e 's/▁/ /g' > ${dir}/hyp.wrd.trn
27 |     cat < ${dir}/ref_org.wrd.trn | sed -e 's/\.//g' -e 's/\,//g' > ${dir}/ref.wrd.trn
28 | 
29 |     cat < ${dir}/hyp.wrd.trn | awk -v FS='' '{a=0;for(i=1;i<=NF;i++){if($i=="("){a=1};if(a==0){printf("%s ",$i)}else{printf("%s",$i)}}printf("\n")}' > ${dir}/hyp.trn
30 |     cat < ${dir}/ref.wrd.trn | awk -v FS='' '{a=0;for(i=1;i<=NF;i++){if($i=="("){a=1};if(a==0){printf("%s ",$i)}else{printf("%s",$i)}}printf("\n")}' > ${dir}/ref.trn
31 | 
32 |     sclite -r ${dir}/ref.trn trn -h ${dir}/hyp.trn -i rm -o all stdout > ${dir}/result.txt
33 |     echo "write a CER result in ${dir}/result.txt"
34 |     grep -e Avg -e SPKR -m 2 ${dir}/result.txt
35 |     
36 |     if ${wer}; then
37 |         sclite -r ${dir}/ref.wrd.trn trn -h ${dir}/hyp.wrd.trn -i rm -o all stdout > ${dir}/result.wrd.txt
38 |         echo "write a WER result in ${dir}/result.wrd.txt"
39 |         grep -e Avg -e SPKR -m 2 ${dir}/result.wrd.txt
40 |         
41 |         sclite -r ${dir}/ref_org.wrd.trn trn -h ${dir}/hyp.wrd.trn trn -i rm -o all stdout > ${dir}/result_w_punc.wrd.txt
42 |         echo "write a WER result in ${dir}/result_w_punc.wrd.txt"
43 |         grep -e Avg -e SPKR -m 2 ${dir}/result_w_punc.wrd.txt
44 | 
45 |     fi
46 | fi
47 | 
48 | 
49 | 


--------------------------------------------------------------------------------
/utils/scp2json.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | # encoding: utf-8
 3 | 
 4 | # Copyright 2017 Johns Hopkins University (Shinji Watanabe)
 5 | #  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)
 6 | from __future__ import print_function
 7 | from __future__ import unicode_literals
 8 | 
 9 | import argparse
10 | import codecs
11 | import json
12 | import sys
13 | 
14 | is_python2 = sys.version_info[0] == 2
15 | 
16 | 
17 | def get_parser():
18 |     parser = argparse.ArgumentParser(
19 |         description="convert scp to json",
20 |         formatter_class=argparse.ArgumentDefaultsHelpFormatter,
21 |     )
22 |     parser.add_argument("--key", "-k", type=str, help="key")
23 |     return parser
24 | 
25 | 
26 | if __name__ == "__main__":
27 |     parser = get_parser()
28 |     args = parser.parse_args()
29 | 
30 |     new_line = {}
31 |     sys.stdin = codecs.getreader("utf-8")(sys.stdin if is_python2 else sys.stdin.buffer)
32 |     sys.stdout = codecs.getwriter("utf-8")(
33 |         sys.stdout if is_python2 else sys.stdout.buffer
34 |     )
35 |     line = sys.stdin.readline()
36 |     while line:
37 |         x = line.rstrip().split()
38 |         v = {args.key: " ".join(x[1:])}
39 |         new_line[x[0]] = v
40 |         line = sys.stdin.readline()
41 | 
42 |     all_l = {"utts": new_line}
43 | 
44 |     # ensure "ensure_ascii=False", which is a bug
45 |     jsonstring = json.dumps(
46 |         all_l, indent=4, ensure_ascii=False, sort_keys=True, separators=(",", ": ")
47 |     )
48 |     print(jsonstring)
49 | 


--------------------------------------------------------------------------------
/utils/show_result.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | mindepth=0
 3 | maxdepth=1
 4 | 
 5 | . utils/parse_options.sh
 6 | 
 7 | if [ $# -gt 1 ]; then
 8 |     echo "Usage: $0 --mindepth 0 --maxdepth 1 [exp]" 1>&2
 9 |     echo ""
10 |     echo "Show the system environments and the evaluation results in Markdown format."
11 |     echo 'The default of <exp> is "exp/".'
12 |     exit 1
13 | fi
14 | 
15 | [ -f ./path.sh ] && . ./path.sh
16 | set -euo pipefail
17 | if [ $# -eq 1 ]; then
18 |     exp=$1
19 | else
20 |     exp=exp
21 | fi
22 | 
23 | 
24 | cat << EOF
25 | <!-- Generated by $0 -->
26 | # RESULTS
27 | ## Environments
28 | - date: \`$(LC_ALL=C date)\`
29 | EOF
30 | 
31 | python << EOF
32 | import sys, espnet, chainer, torch
33 | pyversion = sys.version.replace('\n', ' ')
34 | 
35 | print(f"""- python version: \`{pyversion}\`
36 | - espnet version: \`espnet {espnet.__version__}\`
37 | - chainer version: \`chainer {chainer.__version__}\`
38 | - pytorch version: \`pytorch {torch.__version__}\`""")
39 | EOF
40 | 
41 | cat << EOF
42 | - Git hash: \`$(git rev-parse HEAD)\`
43 |   - Commit date: \`$(git log -1 --format='%cd')\`
44 | 
45 | EOF
46 | 
47 | while IFS= read -r expdir; do
48 |     if ls ${expdir}/decode_*/result.txt &> /dev/null; then
49 |     # 1. Show the result table
50 |     cat << EOF
51 | ## $(basename ${expdir})
52 | ### CER
53 | 
54 | |dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err|
55 | |---|---|---|---|---|---|---|---|---|
56 | EOF
57 |         grep -e Avg ${expdir}/decode_*/result.txt \
58 |             | sed -e "s#${expdir}/\([^/]*\)/result.txt:#|\1#g" \
59 |             | sed -e 's#Sum/Avg##g' | tr '|' ' ' | tr -s ' ' '|'
60 |         echo
61 | 
62 |         # 2. Show the result table for WER
63 |         if ls ${expdir}/decode_*/result.wrd.txt &> /dev/null; then
64 |             cat << EOF
65 | ### WER
66 | 
67 | |dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err|
68 | |---|---|---|---|---|---|---|---|---|
69 | EOF
70 |             grep -e Avg ${expdir}/decode_*/result.wrd.txt \
71 |                 | sed -e "s#${expdir}/\([^/]*\)/result.wrd.txt:#|\1#g" \
72 |                 | sed -e 's#Sum/Avg##g' | tr '|' ' ' | tr -s ' ' '|'
73 |             echo
74 |         fi
75 |     fi
76 | done < <(find ${exp} -mindepth ${mindepth} -maxdepth ${maxdepth} -type d)
77 | 


--------------------------------------------------------------------------------
/utils/split_data.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python2
 2 | # encoding: utf-8
 3 | 
 4 | import argparse
 5 | import json
 6 | import os
 7 | 
 8 | def write_json(js, json_file):
 9 |     with open(json_file, 'wb') as f:
10 |         f.write(json.dumps({'utts': js}, indent=4, sort_keys=True).encode('utf_8'))
11 | 
12 | if __name__ == '__main__':
13 |     parser = argparse.ArgumentParser()
14 |     parser.add_argument('--parts', '-p', type=int, required=True,
15 |                         help='Number of subparts to be prepared', default=0)
16 |     parser.add_argument('--json', type=str, required=True,
17 |                         help='json file')
18 |     parser.add_argument('--datadir', type=str, required=True,
19 |                         help='path to save json file')
20 |     parser.add_argument('--offset', type=int, default=0,
21 |                         help='offset of file name')
22 |     args = parser.parse_args()
23 | 
24 |     js = json.load(open(args.json, 'rb'))['utts']
25 |     js = list(sorted(js.items(), key=lambda x: -x[1]['input'][0]['shape'][0]))
26 | 
27 |     new_js = [[] for _ in range(args.parts)]
28 |     for i, j in enumerate(js):
29 |         new_js[i % args.parts].append(j)
30 | 
31 |     file_name_prefix = args.json.split('/')[-1].split('.')[0]
32 |     for i, new_j in enumerate(new_js):
33 |         print('Part {}: {}'.format(i, len(new_j)))
34 |         file_name = file_name_prefix + '_{}.json'.format(i + args.offset)
35 |         write_json(dict(new_j), os.path.join(args.datadir, file_name))


--------------------------------------------------------------------------------
/utils/splitjson.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | # encoding: utf-8
 3 | 
 4 | # Copyright 2017 Johns Hopkins University (Shinji Watanabe)
 5 | #  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)
 6 | 
 7 | from __future__ import division
 8 | from __future__ import print_function
 9 | from __future__ import unicode_literals
10 | 
11 | import argparse
12 | import codecs
13 | import json
14 | import logging
15 | import os
16 | import sys
17 | 
18 | import numpy as np
19 | 
20 | from espnet.utils.cli_utils import get_commandline_args
21 | 
22 | 
23 | def get_parser():
24 |     parser = argparse.ArgumentParser(
25 |         description="split a json file for parallel processing",
26 |         formatter_class=argparse.ArgumentDefaultsHelpFormatter,
27 |     )
28 |     parser.add_argument("json", type=str, help="json file")
29 |     parser.add_argument(
30 |         "--parts", "-p", type=int, help="Number of subparts to be prepared", default=0
31 |     )
32 |     return parser
33 | 
34 | 
35 | if __name__ == "__main__":
36 |     args = get_parser().parse_args()
37 | 
38 |     # logging info
39 |     logging.basicConfig(
40 |         level=logging.INFO,
41 |         format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
42 |     )
43 |     logging.info(get_commandline_args())
44 | 
45 |     # check directory
46 |     filename = os.path.basename(args.json).split(".")[0]
47 |     dirname = os.path.dirname(args.json)
48 |     dirname = "{}/split{}utt".format(dirname, args.parts)
49 |     if not os.path.exists(dirname):
50 |         os.makedirs(dirname)
51 | 
52 |     # load json and split keys
53 |     j = json.load(codecs.open(args.json, "r", encoding="utf-8"))
54 |     utt_ids = sorted(list(j["utts"].keys()))
55 |     logging.info("number of utterances = %d" % len(utt_ids))
56 |     if len(utt_ids) < args.parts:
57 |         logging.error("#utterances < #splits. Use smaller split number.")
58 |         sys.exit(1)
59 |     utt_id_lists = np.array_split(utt_ids, args.parts)
60 |     utt_id_lists = [utt_id_list.tolist() for utt_id_list in utt_id_lists]
61 | 
62 |     for i, utt_id_list in enumerate(utt_id_lists):
63 |         new_dic = dict()
64 |         for utt_id in utt_id_list:
65 |             new_dic[utt_id] = j["utts"][utt_id]
66 |         jsonstring = json.dumps(
67 |             {"utts": new_dic},
68 |             indent=4,
69 |             ensure_ascii=False,
70 |             sort_keys=True,
71 |             separators=(",", ": "),
72 |         )
73 |         fl = "{}/{}.{}.json".format(dirname, filename, i + 1)
74 |         sys.stdout = codecs.open(fl, "w+", encoding="utf-8")
75 |         print(jsonstring)
76 |         sys.stdout.close()
77 | 


--------------------------------------------------------------------------------
/utils/spm_decode:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # Copyright (c) Facebook, Inc. and its affiliates.
 3 | # All rights reserved.
 4 | #
 5 | # This source code is licensed under the license found in the
 6 | # https://github.com/pytorch/fairseq/blob/master/LICENSE
 7 | 
 8 | from __future__ import absolute_import, division, print_function, unicode_literals
 9 | 
10 | import argparse
11 | import sys
12 | 
13 | import sentencepiece as spm
14 | 
15 | 
16 | def main():
17 |     parser = argparse.ArgumentParser()
18 |     parser.add_argument("--model", required=True,
19 |                         help="sentencepiece model to use for decoding")
20 |     parser.add_argument("--input", default=None, help="input file to decode")
21 |     parser.add_argument("--input_format", choices=["piece", "id"], default="piece")
22 |     args = parser.parse_args()
23 | 
24 |     sp = spm.SentencePieceProcessor()
25 |     sp.Load(args.model)
26 | 
27 |     if args.input_format == "piece":
28 |         def decode(l):
29 |             return "".join(sp.DecodePieces(l))
30 |     elif args.input_format == "id":
31 |         def decode(l):
32 |             return "".join(sp.DecodeIds(l))
33 |     else:
34 |         raise NotImplementedError
35 | 
36 |     def tok2int(tok):
37 |         # remap reference-side <unk> (represented as <<unk>>) to 0
38 |         return int(tok) if tok != "<<unk>>" else 0
39 | 
40 |     if args.input is None:
41 |         h = sys.stdin
42 |     else:
43 |         h = open(args.input, "r", encoding="utf-8")
44 |     for line in h:
45 |         print(decode(line.split()))
46 | 
47 | 
48 | if __name__ == "__main__":
49 |     main()
50 | 


--------------------------------------------------------------------------------
/utils/spm_encode:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # Copyright (c) Facebook, Inc. and its affiliates.
  3 | # All rights reserved.
  4 | #
  5 | # This source code is licensed under the license found in
  6 | # https://github.com/pytorch/fairseq/blob/master/LICENSE
  7 | 
  8 | from __future__ import absolute_import, division, print_function, unicode_literals
  9 | 
 10 | import argparse
 11 | import contextlib
 12 | import sys
 13 | 
 14 | import sentencepiece as spm
 15 | 
 16 | 
 17 | def main():
 18 |     parser = argparse.ArgumentParser()
 19 |     parser.add_argument("--model", required=True,
 20 |                         help="sentencepiece model to use for encoding")
 21 |     parser.add_argument("--inputs", nargs="+", default=['-'],
 22 |                         help="input files to filter/encode")
 23 |     parser.add_argument("--outputs", nargs="+", default=['-'],
 24 |                         help="path to save encoded outputs")
 25 |     parser.add_argument("--output_format", choices=["piece", "id"], default="piece")
 26 |     parser.add_argument("--min-len", type=int, metavar="N",
 27 |                         help="filter sentence pairs with fewer than N tokens")
 28 |     parser.add_argument("--max-len", type=int, metavar="N",
 29 |                         help="filter sentence pairs with more than N tokens")
 30 |     args = parser.parse_args()
 31 | 
 32 |     assert len(args.inputs) == len(args.outputs), \
 33 |         "number of input and output paths should match"
 34 | 
 35 |     sp = spm.SentencePieceProcessor()
 36 |     sp.Load(args.model)
 37 | 
 38 |     if args.output_format == "piece":
 39 |         def encode(l):
 40 |             return sp.EncodeAsPieces(l)
 41 |     elif args.output_format == "id":
 42 |         def encode(l):
 43 |             return list(map(str, sp.EncodeAsIds(l)))
 44 |     else:
 45 |         raise NotImplementedError
 46 | 
 47 |     if args.min_len is not None or args.max_len is not None:
 48 |         def valid(line):
 49 |             return (
 50 |                 (args.min_len is None or len(line) >= args.min_len) and
 51 |                 (args.max_len is None or len(line) <= args.max_len)
 52 |             )
 53 |     else:
 54 |         def valid(lines):
 55 |             return True
 56 | 
 57 |     with contextlib.ExitStack() as stack:
 58 |         inputs = [
 59 |             stack.enter_context(open(input, "r", encoding="utf-8"))
 60 |             if input != "-" else sys.stdin
 61 |             for input in args.inputs
 62 |         ]
 63 |         outputs = [
 64 |             stack.enter_context(open(output, "w", encoding="utf-8"))
 65 |             if output != "-" else sys.stdout
 66 |             for output in args.outputs
 67 |         ]
 68 | 
 69 |         stats = {
 70 |             "num_empty": 0,
 71 |             "num_filtered": 0,
 72 |         }
 73 | 
 74 |         def encode_line(line):
 75 |             line = line.strip()
 76 |             if len(line) > 0:
 77 |                 line = encode(line)
 78 |                 if valid(line):
 79 |                     return line
 80 |                 else:
 81 |                     stats["num_filtered"] += 1
 82 |             else:
 83 |                 stats["num_empty"] += 1
 84 |             return None
 85 | 
 86 |         for i, lines in enumerate(zip(*inputs), start=1):
 87 |             enc_lines = list(map(encode_line, lines))
 88 |             if not any(enc_line is None for enc_line in enc_lines):
 89 |                 for enc_line, output_h in zip(enc_lines, outputs):
 90 |                     print(" ".join(enc_line), file=output_h)
 91 |             if i % 10000 == 0:
 92 |                 print("processed {} lines".format(i), file=sys.stderr)
 93 | 
 94 |         print("skipped {} empty lines".format(stats["num_empty"]), file=sys.stderr)
 95 |         print("filtered {} lines".format(stats["num_filtered"]), file=sys.stderr)
 96 | 
 97 | 
 98 | if __name__ == "__main__":
 99 |     main()
100 | 


--------------------------------------------------------------------------------
/utils/spm_train:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | # Copyright (c) Facebook, Inc. and its affiliates.
 3 | # All rights reserved.
 4 | #
 5 | # This source code is licensed under the license found in the
 6 | # https://github.com/pytorch/fairseq/blob/master/LICENSE
 7 | import sys
 8 | 
 9 | import sentencepiece as spm
10 | 
11 | 
12 | if __name__ == "__main__":
13 |     spm.SentencePieceTrainer.Train(" ".join(sys.argv[1:]))
14 | 


--------------------------------------------------------------------------------
/utils/text2vocabulary.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | # Copyright 2018 Mitsubishi Electric Research Laboratories (Takaaki Hori)
 4 | #  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)
 5 | 
 6 | import argparse
 7 | import codecs
 8 | import logging
 9 | import six
10 | import sys
11 | 
12 | is_python2 = sys.version_info[0] == 2
13 | 
14 | 
15 | def get_parser():
16 |     parser = argparse.ArgumentParser(
17 |         description="create a vocabulary file from text files",
18 |         formatter_class=argparse.ArgumentDefaultsHelpFormatter,
19 |     )
20 |     parser.add_argument(
21 |         "--output", "-o", default="", type=str, help="output a vocabulary file"
22 |     )
23 |     parser.add_argument("--cutoff", "-c", default=0, type=int, help="cut-off frequency")
24 |     parser.add_argument(
25 |         "--vocabsize", "-s", default=20000, type=int, help="vocabulary size"
26 |     )
27 |     parser.add_argument("text_files", nargs="*", help="input text files")
28 |     return parser
29 | 
30 | 
31 | if __name__ == "__main__":
32 |     parser = get_parser()
33 |     args = parser.parse_args()
34 | 
35 |     # count the word occurrences
36 |     counts = {}
37 |     exclude = ["<sos>", "<eos>", "<unk>"]
38 |     if len(args.text_files) == 0:
39 |         args.text_files.append("-")
40 |     for fn in args.text_files:
41 |         fd = (
42 |             codecs.open(fn, "r", encoding="utf-8")
43 |             if fn != "-"
44 |             else codecs.getreader("utf-8")(
45 |                 sys.stdin if is_python2 else sys.stdin.buffer
46 |             )
47 |         )
48 |         for ln in fd.readlines():
49 |             for tok in ln.split():
50 |                 if tok not in exclude:
51 |                     if tok not in counts:
52 |                         counts[tok] = 1
53 |                     else:
54 |                         counts[tok] += 1
55 |         if fn != "-":
56 |             fd.close()
57 | 
58 |     # limit the vocabulary size
59 |     total_count = sum(counts.values())
60 |     invocab_count = 0
61 |     vocabulary = []
62 |     for w, c in sorted(counts.items(), key=lambda x: -x[1]):
63 |         if c <= args.cutoff:
64 |             break
65 |         if len(vocabulary) >= args.vocabsize:
66 |             break
67 |         vocabulary.append(w)
68 |         invocab_count += c
69 | 
70 |     logging.warning(
71 |         "OOV rate = %.2f %%" % (float(total_count - invocab_count) / total_count * 100)
72 |     )
73 |     # write the vocabulary
74 |     fd = (
75 |         codecs.open(args.output, "w", encoding="utf-8")
76 |         if args.output
77 |         else codecs.getwriter("utf-8")(sys.stdout if is_python2 else sys.stdout.buffer)
78 |     )
79 |     six.print_("<unk> 1", file=fd)
80 |     for n, w in enumerate(sorted(vocabulary)):
81 |         six.print_("%s %d" % (w, n + 2), file=fd)
82 |     if args.output:
83 |         fd.close()
84 | 


--------------------------------------------------------------------------------
/utils/trim_silence.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | # -*- coding: utf-8 -*-
  3 | 
  4 | # Copyright 2018 Nagoya University (Tomoki Hayashi)
  5 | #  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)
  6 | 
  7 | from __future__ import division
  8 | 
  9 | import argparse
 10 | import codecs
 11 | import logging
 12 | import os
 13 | 
 14 | import kaldiio
 15 | import librosa
 16 | import matplotlib.pyplot as plt
 17 | import numpy
 18 | 
 19 | from espnet.utils.cli_utils import get_commandline_args
 20 | 
 21 | 
 22 | def _time_to_str(time_idx):
 23 |     time_idx = time_idx * 10 ** 4
 24 |     return "%06d" % time_idx
 25 | 
 26 | 
 27 | def get_parser():
 28 |     parser = argparse.ArgumentParser(
 29 |         description="Trim slience with simple power thresholding "
 30 |         "and make segments file.",
 31 |         formatter_class=argparse.ArgumentDefaultsHelpFormatter,
 32 |     )
 33 |     parser.add_argument("--fs", type=int, help="Sampling frequency")
 34 |     parser.add_argument(
 35 |         "--threshold", type=float, default=60, help="Threshold in decibels"
 36 |     )
 37 |     parser.add_argument(
 38 |         "--win_length", type=int, default=1024, help="Analisys window length in point"
 39 |     )
 40 |     parser.add_argument(
 41 |         "--shift_length", type=int, default=256, help="Shift length in point"
 42 |     )
 43 |     parser.add_argument(
 44 |         "--min_silence", type=float, default=0.01, help="minimum silence length"
 45 |     )
 46 |     parser.add_argument(
 47 |         "--figdir", type=str, default="figs", help="Directory to save figures"
 48 |     )
 49 |     parser.add_argument("--verbose", "-V", default=0, type=int, help="Verbose option")
 50 |     parser.add_argument(
 51 |         "--normalize",
 52 |         choices=[1, 16, 24, 32],
 53 |         type=int,
 54 |         default=None,
 55 |         help="Give the bit depth of the PCM, "
 56 |         "then normalizes data to scale in [-1,1]",
 57 |     )
 58 |     parser.add_argument("rspecifier", type=str, help="WAV scp file")
 59 |     parser.add_argument("wspecifier", type=str, help="Segments file")
 60 | 
 61 |     return parser
 62 | 
 63 | 
 64 | def main():
 65 |     parser = get_parser()
 66 |     args = parser.parse_args()
 67 | 
 68 |     # set logger
 69 |     logfmt = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
 70 |     if args.verbose > 0:
 71 |         logging.basicConfig(level=logging.INFO, format=logfmt)
 72 |     else:
 73 |         logging.basicConfig(level=logging.WARN, format=logfmt)
 74 |     logging.info(get_commandline_args())
 75 | 
 76 |     if not os.path.exists(args.figdir):
 77 |         os.makedirs(args.figdir)
 78 | 
 79 |     with kaldiio.ReadHelper(args.rspecifier) as reader, codecs.open(
 80 |         args.wspecifier, "w", encoding="utf-8"
 81 |     ) as f:
 82 |         for utt_id, (rate, array) in reader:
 83 |             assert rate == args.fs
 84 |             array = array.astype(numpy.float32)
 85 |             if args.normalize is not None and args.normalize != 1:
 86 |                 array = array / (1 << (args.normalize - 1))
 87 |             array_trim, idx = librosa.effects.trim(
 88 |                 y=array,
 89 |                 top_db=args.threshold,
 90 |                 frame_length=args.win_length,
 91 |                 hop_length=args.shift_length,
 92 |             )
 93 |             start, end = idx / args.fs
 94 | 
 95 |             # save figure
 96 |             plt.subplot(2, 1, 1)
 97 |             plt.plot(array)
 98 |             plt.title("Original")
 99 |             plt.subplot(2, 1, 2)
100 |             plt.plot(array_trim)
101 |             plt.title("Trim")
102 |             plt.tight_layout()
103 |             plt.savefig(args.figdir + "/" + utt_id + ".png")
104 |             plt.close()
105 | 
106 |             # added minimum silence part
107 |             start = max(0.0, start - args.min_silence)
108 |             end = min(len(array) / args.fs, end + args.min_silence)
109 | 
110 |             # write to segments file
111 |             segment = "%s %s %f %f\n" % (utt_id, utt_id, start, end)
112 |             f.write(segment)
113 | 
114 | 
115 | if __name__ == "__main__":
116 |     main()
117 | 


--------------------------------------------------------------------------------
/utils/trim_silence.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # Copyright 2018 Nagoya University (Tomoki Hayashi)
 4 | #  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)
 5 | 
 6 | fs=16000
 7 | win_length=1024
 8 | shift_length=256
 9 | threshold=60
10 | min_silence=0.01
11 | normalize=16
12 | cmd=run.pl
13 | nj=32
14 | 
15 | help_message=$(cat <<EOF
16 | Usage: $0 [options] <data-dir> <log-dir>
17 | e.g.: $0 data/train exp/trim_silence/train
18 | Options:
19 |   --fs <fs>                      # sampling frequency (default=16000)
20 |   --win_length <win_length>      # window length in point (default=1024)
21 |   --shift_length <shift_length>  # shift length in point (default=256)
22 |   --threshold <threshold>        # power threshold in db (default=60)
23 |   --min_silence <sec>            # minimum silence lenght in sec (default=0.01)
24 |   --normalize <bit>              # audio bit (default=16)
25 |   --cmd <cmd>                    # how to run jobs (default=run.pl)
26 |   --nj <nj>                      # number of parallel jobs (default=32)
27 | EOF
28 | )
29 | 
30 | . utils/parse_options.sh || exit 1;
31 | 
32 | if [ ! $# -eq 2 ]; then
33 |     echo "${help_message}"
34 |     exit 1;
35 | fi
36 | 
37 | set -euo pipefail
38 | data=$1
39 | logdir=$2
40 | 
41 | tmpdir=$(mktemp -d ${data}/tmp-XXXX)
42 | split_scps=""
43 | for n in $(seq ${nj}); do
44 |     split_scps="${split_scps} ${tmpdir}/wav.${n}.scp"
45 | done
46 | utils/split_scp.pl ${data}/wav.scp ${split_scps} || exit 1;
47 | 
48 | # make segments file describing start and end time
49 | ${cmd} JOB=1:${nj} ${logdir}/trim_silence.JOB.log \
50 |     MPLBACKEND=Agg trim_silence.py \
51 |         --fs ${fs} \
52 |         --win_length ${win_length} \
53 |         --shift_length ${shift_length} \
54 |         --threshold ${threshold} \
55 |         --min_silence ${min_silence} \
56 |         --normalize ${normalize} \
57 |         --figdir ${logdir}/figs \
58 |         scp:${tmpdir}/wav.JOB.scp \
59 |         ${tmpdir}/segments.JOB
60 | 
61 | # concatenate segments
62 | for n in $(seq ${nj}); do
63 |     cat ${tmpdir}/segments.${n} || exit 1;
64 | done > ${data}/segments || exit 1
65 | rm -rf ${tmpdir}
66 | 
67 | # check
68 | utils/validate_data_dir.sh --no-feats ${data}
69 | echo "Successfully trimed silence part."
70 | 


--------------------------------------------------------------------------------
/utils/trn2ctm.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python
 2 | 
 3 | import argparse
 4 | import codecs
 5 | import math
 6 | import re
 7 | import sys
 8 | 
 9 | is_python2 = sys.version_info[0] == 2
10 | 
11 | 
12 | def get_parser():
13 |     parser = argparse.ArgumentParser(description="convert trn to ctm")
14 |     parser.add_argument("trn", type=str, default=None, nargs="?", help="input trn")
15 |     parser.add_argument("ctm", type=str, default=None, nargs="?", help="output ctm")
16 |     return parser
17 | 
18 | 
19 | def main(args):
20 |     args = get_parser().parse_args(args)
21 |     convert(args.trn, args.ctm)
22 | 
23 | 
24 | def convert(trn=None, ctm=None):
25 |     if trn is not None:
26 |         with codecs.open(trn, "r", encoding="utf-8") as trn:
27 |             content = trn.readlines()
28 |     else:
29 |         trn = codecs.getreader("utf-8")(sys.stdin if is_python2 else sys.stdin.buffer)
30 |         content = trn.readlines()
31 |     split_content = []
32 |     for i, line in enumerate(content):
33 |         idx = line.rindex("(")
34 |         split = [line[:idx].strip().upper(), line[idx + 1 :].strip()[:-1]]
35 |         while "((" in split[0]:
36 |             split[0] = split[0].replace("((", "(")
37 |         while "  " in split[0]:
38 |             split[0] = split[0].replace("  ", " ")
39 |         segm_info = re.split("[-_]", split[1])
40 |         segm_info = [s.strip() for s in segm_info]
41 |         col1 = segm_info[0] + "_" + segm_info[1]
42 |         col2 = segm_info[2]
43 |         start_time_int = int(segm_info[6])
44 |         end_time_int = int(segm_info[7])
45 |         diff_int = end_time_int - start_time_int
46 |         word_split = split[0].split(" ")
47 |         word_split = list(
48 |             filter(lambda x: len(x) > 0 and any([c != " " for c in x]), word_split)
49 |         )
50 |         if len(word_split) > 0:
51 |             step_int = int(math.floor(float(diff_int) / len(word_split)))
52 |             step = str(step_int)
53 |             for j, word in enumerate(word_split):
54 |                 start_time = str(int(start_time_int + step_int * j))
55 |                 col3 = (
56 |                     (start_time[:-2] if len(start_time) > 2 else "0")
57 |                     + "."
58 |                     + (start_time[-2:] if len(start_time) > 1 else "00")
59 |                 )
60 |                 if j == len(word_split) - 1:
61 |                     diff = str(int(end_time_int - int(start_time)))
62 |                 else:
63 |                     diff = step
64 |                 col4 = (diff[:-2] if len(diff) > 2 else "0") + "." + diff[-2:]
65 |                 segm_info = [col1, col2, col3, col4]
66 |                 split_content.append(" ".join(segm_info) + "  " + word)
67 |     if ctm is not None:
68 |         sys.stdout = codecs.open(ctm, "w", encoding="utf-8")
69 |     else:
70 |         sys.stdout = codecs.getwriter("utf-8")(
71 |             sys.stdout if is_python2 else sys.stdout.buffer
72 |         )
73 |     for c_line in split_content:
74 |         print(c_line)
75 | 
76 | 
77 | if __name__ == "__main__":
78 |     main(sys.argv[1:])
79 | 


--------------------------------------------------------------------------------
/utils/trn2stm.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python
 2 | 
 3 | import argparse
 4 | import codecs
 5 | import re
 6 | import sys
 7 | 
 8 | is_python2 = sys.version_info[0] == 2
 9 | 
10 | 
11 | def get_parser():
12 |     parser = argparse.ArgumentParser(description="convert trn to stm")
13 |     parser.add_argument(
14 |         "--orig-stm",
15 |         type=str,
16 |         default=None,
17 |         nargs="?",
18 |         help="Original stm file to add additional information to the generated one",
19 |     )
20 |     parser.add_argument("trn", type=str, default=None, nargs="?", help="input trn")
21 |     parser.add_argument("stm", type=str, default=None, nargs="?", help="output stm")
22 |     return parser
23 | 
24 | 
25 | def main(args):
26 |     args = get_parser().parse_args(args)
27 |     convert(args.trn, args.stm, args.orig_stm)
28 | 
29 | 
30 | def convert(trn=None, stm=None, orig_stm=None):
31 |     if orig_stm is not None:
32 |         with codecs.open(orig_stm, "r", encoding="utf-8") as orig_stm:
33 |             orig_content = orig_stm.readlines()
34 |             has_orig = True
35 |             header = []
36 |             content = []
37 |             for line in orig_content:
38 |                 (header if line.startswith(";;") else content).append(line.strip())
39 |             del orig_content
40 |             content = [x.split(" ") for x in content]
41 |             mapping = {}
42 |             for x in content:
43 |                 mapping[x[2]] = x[5]
44 |             del content
45 |     else:
46 |         has_orig = False
47 |         header = None
48 |         mapping = None
49 | 
50 |     if trn is not None:
51 |         with codecs.open(trn, "r", encoding="utf-8") as trn:
52 |             content = trn.readlines()
53 |     else:
54 |         trn = codecs.getreader("utf-8")(sys.stdin if is_python2 else sys.stdin.buffer)
55 |         content = trn.readlines()
56 | 
57 |     for i, line in enumerate(content):
58 |         idx = line.rindex("(")
59 |         split = [line[:idx].strip().upper() + " ", line[idx + 1 :].strip()[:-1]]
60 |         while "((" in split[0]:
61 |             split[0] = split[0].replace("((", "(")
62 |         while "  " in split[0]:
63 |             split[0] = split[0].replace("  ", " ")
64 |         segm_info = re.split("[-_]", split[1])
65 |         segm_info = [s.strip() for s in segm_info]
66 |         col1 = segm_info[0] + "_" + segm_info[1]
67 |         col2 = segm_info[2]
68 |         col3 = segm_info[3] + "_" + segm_info[4] + "_" + segm_info[5]
69 |         start_time = str(int(segm_info[6]))
70 |         end_time = str(int(segm_info[7]))
71 |         col4 = (
72 |             (start_time[:-2] if len(start_time) > 2 else "0")
73 |             + "."
74 |             + (start_time[-2:] if len(start_time) > 1 else "00")
75 |         )
76 |         col5 = (
77 |             (end_time[:-2] if len(end_time) > 2 else "0")
78 |             + "."
79 |             + (end_time[-2:] if len(end_time) > 1 else "00")
80 |         )
81 |         col6 = mapping[col3] if has_orig else ""
82 |         segm_info = [col1, col2, col3, col4, col5, col6]
83 |         content[i] = " ".join(segm_info) + "  " + split[0]
84 |     if stm is not None:
85 |         sys.stdout = codecs.open(stm, "w", encoding="utf-8")
86 |     else:
87 |         sys.stdout = codecs.getwriter("utf-8")(
88 |             sys.stdout if is_python2 else sys.stdout.buffer
89 |         )
90 |     if has_orig:
91 |         for h_line in header:
92 |             print(h_line)
93 |     for c_line in content:
94 |         print(c_line)
95 | 
96 | 
97 | if __name__ == "__main__":
98 |     main(sys.argv[1:])
99 | 


--------------------------------------------------------------------------------
/utils/update_json.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # Copyright 2020 Kyoto University (Hirofumi Inaguma)
 4 | #  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)
 5 | 
 6 | echo "$0 $*" >&2 # Print the command line for logging
 7 | . ./path.sh
 8 | 
 9 | nlsyms=""
10 | oov="<unk>"
11 | bpecode=""
12 | verbose=0
13 | 
14 | text=""
15 | multilingual=false
16 | 
17 | help_message=$(cat << EOF
18 | Usage: $0 <json> <data-dir> <dict>
19 | e.g. $0 data/train data/lang_1char/train_units.txt
20 | Options:
21 |   --oov <oov-word>                                 # Default: <unk>
22 |   --verbose <num>                                  # Default: 0
23 | EOF
24 | )
25 | . utils/parse_options.sh
26 | 
27 | if [ $# != 3 ]; then
28 |     echo "${help_message}" 1>&2
29 |     exit 1;
30 | fi
31 | 
32 | set -euo pipefail
33 | 
34 | json=$1
35 | dir=$2
36 | dic=$3
37 | json_dir=$(dirname ${json})
38 | tmpdir=$(mktemp -d ${dir}/tmp-XXXXX)
39 | trap 'rm -rf ${tmpdir}' EXIT
40 | 
41 | if [ -z ${text} ]; then
42 |     text=${dir}/text
43 | fi
44 | 
45 | # 2. Create scp files for outputs
46 | mkdir -p ${tmpdir}/output
47 | if [ -n "${bpecode}" ]; then
48 |     if [ ${multilingual} = true ]; then
49 |         # remove a space before the language ID
50 |         paste -d " " <(awk '{print $1}' ${text}) <(cut -f 2- -d" " ${text} \
51 |             | spm_encode --model=${bpecode} --output_format=piece | cut -f 2- -d" ") \
52 |             > ${tmpdir}/output/token.scp
53 |     else
54 |         paste -d " " <(awk '{print $1}' ${text}) <(cut -f 2- -d" " ${text} \
55 |             | spm_encode --model=${bpecode} --output_format=piece) \
56 |             > ${tmpdir}/output/token.scp
57 |     fi
58 | elif [ -n "${nlsyms}" ]; then
59 |     text2token.py -s 1 -n 1 -l ${nlsyms} ${text} > ${tmpdir}/output/token.scp
60 | else
61 |     text2token.py -s 1 -n 1 ${text} > ${tmpdir}/output/token.scp
62 | fi
63 | < ${tmpdir}/output/token.scp utils/sym2int.pl --map-oov ${oov} -f 2- ${dic} > ${tmpdir}/output/tokenid.scp
64 | awk '{print $1 " " NF-1}' ${tmpdir}/output/tokenid.scp > ${tmpdir}/output/olen.scp
65 | # +2 comes from CTC blank and EOS
66 | vocsize=$(tail -n 1 ${dic} | awk '{print $2}')
67 | odim=$(echo "$vocsize + 2" | bc)
68 | awk -v odim=${odim} '{print $1 " " odim}' ${text} > ${tmpdir}/output/odim.scp
69 | 
70 | cat ${text} > ${tmpdir}/output/text.scp
71 | 
72 | 
73 | # 4. Create JSON files from each scp files
74 | rm -f ${tmpdir}/*/*.json
75 | for x in "${tmpdir}"/output/*.scp; do
76 |     k=$(basename ${x} .scp)
77 |     < ${x} scp2json.py --key ${k} > ${tmpdir}/output/${k}.json
78 | done
79 | 
80 | # add to json
81 | addjson.py --verbose ${verbose} -i false \
82 |   ${json} ${tmpdir}/output/text.json ${tmpdir}/output/token.json ${tmpdir}/output/tokenid.json ${tmpdir}/output/olen.json ${tmpdir}/output/odim.json > ${tmpdir}/data.json
83 | mkdir -p ${json_dir}/.backup
84 | echo "json updated. original json is kept in ${json_dir}/.backup."
85 | cp ${json} ${json_dir}/.backup/"$(basename ${json})"
86 | cp ${tmpdir}/data.json ${json}
87 | 
88 | rm -fr ${tmpdir}
89 | 


--------------------------------------------------------------------------------