├── test ├── __init__.py └── test.py ├── .gitignore ├── requirements.txt ├── setup.cfg ├── asr_evaluation ├── __init__.py ├── __main__.py └── asr_evaluation.py ├── .readthedocs.yml ├── docs ├── modules.rst ├── asr_evaluation.rst ├── index.rst ├── Makefile └── conf.py ├── .circleci └── config.yml ├── .devcontainer ├── library-scripts │ ├── README.md │ └── common-debian.sh ├── Dockerfile ├── devcontainer.json └── base.Dockerfile ├── setup.py ├── .travis.yml ├── README.md └── LICENSE /test/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | asr_evaluation.egg-info 2 | dist 3 | build -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | edit_distance 2 | termcolor 3 | flake8 4 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [flake8] 2 | ignore = E302,E305,E701 3 | max-line-length = 120 -------------------------------------------------------------------------------- /asr_evaluation/__init__.py: -------------------------------------------------------------------------------- 1 | # from asr_evaluation.asr_evaluation import * 2 | -------------------------------------------------------------------------------- /.readthedocs.yml: -------------------------------------------------------------------------------- 1 | 2 | requirements_file: requirements.txt 3 | 4 | python: 5 | version: 3 6 | -------------------------------------------------------------------------------- /docs/modules.rst: -------------------------------------------------------------------------------- 1 | asr_evaluation 2 | ============== 3 | 4 | .. toctree:: 5 | :maxdepth: 4 6 | 7 | asr_evaluation 8 | -------------------------------------------------------------------------------- /.circleci/config.yml: -------------------------------------------------------------------------------- 1 | version: 2.1 2 | 3 | orbs: 4 | python: circleci/python@0.2.1 5 | 6 | jobs: 7 | build-and-test: 8 | executor: python/default 9 | steps: 10 | - checkout 11 | - python/install-deps 12 | - run: 13 | command: python setup.py test 14 | name: Unit tests 15 | - run: 16 | command: flake8 asr_evaluation 17 | name: Style check 18 | 19 | workflows: 20 | main: 21 | jobs: 22 | - build-and-test 23 | -------------------------------------------------------------------------------- /docs/asr_evaluation.rst: -------------------------------------------------------------------------------- 1 | asr_evaluation package 2 | ====================== 3 | 4 | Submodules 5 | ---------- 6 | 7 | asr_evaluation.asr_evaluation module 8 | ------------------------------------ 9 | 10 | .. automodule:: asr_evaluation.asr_evaluation 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | 16 | Module contents 17 | --------------- 18 | 19 | .. 
automodule:: asr_evaluation.asr_evaluation 20 | :members: 21 | :undoc-members: 22 | :show-inheritance: 23 | -------------------------------------------------------------------------------- /.devcontainer/library-scripts/README.md: -------------------------------------------------------------------------------- 1 | # Warning: Folder contents may be replaced 2 | 3 | The contents of this folder will be automatically replaced with a file of the same name in the [vscode-dev-containers](https://github.com/microsoft/vscode-dev-containers) repository's [script-library folder](https://github.com/microsoft/vscode-dev-containers/tree/master/script-library) whenever the repository is packaged. 4 | 5 | To retain your edits, move the file to a different location. You may also delete the files if they are not needed. -------------------------------------------------------------------------------- /.devcontainer/Dockerfile: -------------------------------------------------------------------------------- 1 | ARG VARIANT=3 2 | FROM mcr.microsoft.com/vscode/devcontainers/python:${VARIANT} 3 | 4 | # [Optional] If your pip requirements rarely change, uncomment this section to add them to the image. 5 | # COPY requirements.txt /tmp/pip-tmp/ 6 | # RUN pip3 --disable-pip-version-check --no-cache-dir install -r /tmp/pip-tmp/requirements.txt \ 7 | # && rm -rf /tmp/pip-tmp 8 | 9 | # [Optional] Uncomment this section to install additional OS packages. 10 | # RUN apt-get update \ 11 | # && export DEBIAN_FRONTEND=noninteractive \ 12 | # && apt-get -y install --no-install-recommends 13 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. asr-evaluation documentation master file, created by 2 | sphinx-quickstart on Sat Jan 7 19:32:10 2017. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | asr-evaluation documentation 7 | ========================================== 8 | 9 | .. toctree:: 10 | :maxdepth: 4 11 | 12 | .. automodule:: asr_evaluation.asr_evaluation 13 | :members: 14 | :undoc-members: 15 | :show-inheritance: 16 | 17 | 18 | Indices and tables 19 | ================== 20 | 21 | * :ref:`genindex` 22 | * :ref:`modindex` 23 | * :ref:`search` 24 | 25 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | setup( 4 | name='asr_evaluation', 5 | version='2.0.5', 6 | author='Ben Lambert', 7 | author_email='blambert@gmail.com', 8 | packages=['asr_evaluation'], 9 | license='LICENSE.txt', 10 | description='Evaluating ASR (automatic speech recognition) hypotheses, i.e. 
computing word error rate.', 11 | install_requires=['edit_distance', 'termcolor'], 12 | test_suite='test.test.TestASREvaluation', 13 | long_description=open('README.md').read(), 14 | long_description_content_type="text/markdown", 15 | entry_points={ 16 | 'console_scripts': [ 17 | 'wer = asr_evaluation.__main__:main' 18 | ] 19 | }, 20 | keywords=['word', 'error', 'rate', 'asr', 'speech', 'recognition'], 21 | classifiers=[ 22 | "Development Status :: 5 - Production/Stable", 23 | "Environment :: Console", 24 | "Programming Language :: Python", 25 | "Topic :: Scientific/Engineering :: Artificial Intelligence", 26 | "Topic :: Text Processing", 27 | "Topic :: Utilities", 28 | "License :: OSI Approved :: Apache Software License" 29 | ] 30 | ) 31 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - '2.7' 4 | - '3.4' 5 | - '3.5' 6 | - '3.6' 7 | - '3.7' 8 | - '3.8' 9 | matrix: 10 | fast_finish: true 11 | install: 12 | - pip install coverage codecov flake8 13 | script: 14 | - python setup.py test 15 | - flake8 asr_evaluation 16 | - python -m coverage run --source asr_evaluation setup.py test 17 | after_success: 18 | - codecov 19 | deploy: 20 | provider: pypi 21 | user: benlambert 22 | password: 23 | secure: AorFwvGFVx1yLdG5xZvHE2Zl6z0PQ9extTB81t9mxD+ke4KM5PZbMKr3mwejK5jYvsyfKz7kIaGUhHopZtk4oN+XoQgDVrW0QEu/JVSOmXjNRalmG90ujehoSU6w22Wl/rljcJoh9q+NknJq229SeYT2SwP9KqSiXHyr/SpCKORoRcXd26lUAXu1rp2wpcYFg0oCNo7NG5bnc+T+d7qaqbjD1mpHdwmlC/SMuN3tzHek+sG6k537NukhgDynRHHYdAhumMYbSgKWicZUHxk7dMK+5d6iR7KtQxE6gVA6OZt7HbF5JLVzfRlWFwow+tk608Qfyp275+j6t6fGnVnANjpA3Tmdd5xGJSY+riYTImE8znzEIkH4CdhlRBNmu/w3pRXrSBzGXaPQCc6l0i+j+1NOihXAL5doklMF92PFUuUProoKFPfm+YSAYwYkNMdIgUeCTpq9cnvV5pXykeaPZEovWVtMvPsTIRArLkzxHVDeCjfsKefkXxtaeIpUofyFnYuMZaZLuTpKSRZ8AhnjEu+bANPNWlB2vl+aw3rhzYxoqrOw6rh+AfyAlRBkTK2gp8UmOeFrGCBVXrjnPtMUSxa+Gxl+GTx0imeFEb1Kw7fCRtEiJ2ag1dVVK3UnrZiPfa/IzMRxliwXEyYwmDcK1DM1mIUka6/1KoxRP2oppl8= 24 | on: 25 | tags: true 26 | python: '3.7' 27 | -------------------------------------------------------------------------------- /.devcontainer/devcontainer.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "Python 3", 3 | "build": { 4 | "dockerfile": "Dockerfile", 5 | "context": "..", 6 | "args": { "VARIANT": "3.8" } 7 | }, 8 | 9 | // Set *default* container specific settings.json values on container create. 10 | "settings": { 11 | "terminal.integrated.shell.linux": "/bin/bash", 12 | "python.pythonPath": "/usr/local/bin/python", 13 | "python.linting.enabled": true, 14 | "python.linting.pylintEnabled": true, 15 | "python.formatting.autopep8Path": "/usr/local/py-utils/bin/autopep8", 16 | "python.formatting.blackPath": "/usr/local/py-utils/bin/black", 17 | "python.formatting.yapfPath": "/usr/local/py-utils/bin/yapf", 18 | "python.linting.banditPath": "/usr/local/py-utils/bin/bandit", 19 | "python.linting.flake8Path": "/usr/local/py-utils/bin/flake8", 20 | "python.linting.mypyPath": "/usr/local/py-utils/bin/mypy", 21 | "python.linting.pycodestylePath": "/usr/local/py-utils/bin/pycodestyle", 22 | "python.linting.pydocstylePath": "/usr/local/py-utils/bin/pydocstyle", 23 | "python.linting.pylintPath": "/usr/local/py-utils/bin/pylint" 24 | }, 25 | 26 | // Add the IDs of extensions you want installed when the container is created. 
27 | "extensions": [ 28 | "ms-python.python" 29 | ] 30 | 31 | // Use 'forwardPorts' to make a list of ports inside the container available locally. 32 | // "forwardPorts": [], 33 | 34 | // Use 'postCreateCommand' to run commands after the container is created. 35 | // "postCreateCommand": "pip3 install --user -r requirements.txt", 36 | 37 | // Uncomment to connect as a non-root user. See https://aka.ms/vscode-remote/containers/non-root. 38 | // "remoteUser": "vscode" 39 | } 40 | -------------------------------------------------------------------------------- /test/test.py: -------------------------------------------------------------------------------- 1 | # Copyright 2013-2018 Ben Lambert 2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | """ 16 | Unit tests for asr_evaluation. 17 | """ 18 | from __future__ import division 19 | 20 | import sys 21 | import unittest 22 | 23 | from asr_evaluation import __main__ 24 | 25 | # Note these tests aren't checking for correctness. They are simply 26 | # exercising all the command line options to make sure we don't get errors 27 | # simply by running them. 28 | 29 | class TestASREvaluation(unittest.TestCase): 30 | """...""" 31 | 32 | def testing(self): 33 | """...""" 34 | self.assertTrue(True) 35 | 36 | def test_cli1(self): 37 | sys.argv = ['evaluate.py', 'requirements.txt', 'setup.py', '-c', '-m', '0', '-i'] 38 | __main__.main() 39 | 40 | def test_cli2(self): 41 | sys.argv = ['evaluate.py', 'requirements.txtssssss', 'setup.py', '-c', '-m', '0', '-i'] 42 | with self.assertRaises(SystemExit): 43 | __main__.main() 44 | 45 | def test_cli3(self): 46 | sys.argv = ['evaluate.py', 'requirements.txt', 'setup.py', '-c', '-m', '0'] 47 | __main__.main() 48 | 49 | def test_cli4(self): 50 | sys.argv = ['evaluate.py', 'requirements.txt', 'setup.py'] 51 | __main__.main() 52 | 53 | def test_cli5(self): 54 | sys.argv = ['evaluate.py', 'setup.py', 'setup.py'] 55 | __main__.main() 56 | 57 | def test_cli6(self): 58 | sys.argv = ['evaluate.py', 'setup.py', 'setup.py', '-a'] 59 | __main__.main() 60 | 61 | def test_cli7(self): 62 | sys.argv = ['evaluate.py', 'setup.py', 'setup.py', '-e'] 63 | __main__.main() 64 | 65 | def test_cli8(self): 66 | sys.argv = ['evaluate.py', 'requirements.txt', 'requirements.txt', '-id'] 67 | __main__.main() 68 | -------------------------------------------------------------------------------- /.devcontainer/base.Dockerfile: -------------------------------------------------------------------------------- 1 | # Update the VARIANT arg in devcontainer.json to pick a Python version: 3, 3.8, 3.7, 3.6 2 | ARG VARIANT=3 3 | FROM python:${VARIANT} 4 | 5 | # Options for common setup script 6 | ARG INSTALL_ZSH="true" 7 | ARG UPGRADE_PACKAGES="true" 8 | ARG USERNAME=vscode 9 | ARG USER_UID=1000 10 | ARG USER_GID=$USER_UID 11 | 12 | # Install needed packages and setup non-root user. Use a separate RUN statement to add your own dependencies. 
13 | COPY .devcontainer/library-scripts/*.sh /tmp/library-scripts/ 14 | RUN apt-get update && export DEBIAN_FRONTEND=noninteractive \ 15 | # Remove imagemagick due to https://security-tracker.debian.org/tracker/CVE-2019-10131 16 | && apt-get purge -y imagemagick imagemagick-6-common \ 17 | # Install common packages, non-root user 18 | && /bin/bash /tmp/library-scripts/common-debian.sh "${INSTALL_ZSH}" "${USERNAME}" "${USER_UID}" "${USER_GID}" "${UPGRADE_PACKAGES}" \ 19 | # Clean up 20 | && apt-get autoremove -y && apt-get clean -y && rm -rf /var/lib/apt/lists/* /tmp/library-scripts 21 | 22 | # Setup default python tools in a venv via pipx to avoid conflicts 23 | ARG DEFAULT_UTILS="\ 24 | pylint \ 25 | flake8 \ 26 | autopep8 \ 27 | black \ 28 | yapf \ 29 | mypy \ 30 | pydocstyle \ 31 | pycodestyle \ 32 | bandit \ 33 | virtualenv" 34 | ENV PIPX_HOME=/usr/local/py-utils 35 | ENV PIPX_BIN_DIR=${PIPX_HOME}/bin 36 | ENV PATH=${PATH}:${PIPX_BIN_DIR} 37 | RUN mkdir -p ${PIPX_BIN_DIR} \ 38 | && export PYTHONUSERBASE=/tmp/pip-tmp \ 39 | && pip3 install --disable-pip-version-check --no-warn-script-location --no-cache-dir --user pipx \ 40 | && /tmp/pip-tmp/bin/pipx install --pip-args=--no-cache-dir pipx \ 41 | && echo "${DEFAULT_UTILS}" | xargs -n 1 /tmp/pip-tmp/bin/pipx install --system-site-packages --pip-args=--no-cache-dir --pip-args=--force-reinstall \ 42 | && chown -R ${USER_UID}:${USER_GID} ${PIPX_HOME} \ 43 | && rm -rf /tmp/pip-tmp 44 | 45 | # [Optional] If your pip requirements rarely change, uncomment this section to add them to the image. 46 | # COPY requirements.txt /tmp/pip-tmp/ 47 | # RUN pip3 --disable-pip-version-check --no-cache-dir install -r /tmp/pip-tmp/requirements.txt \ 48 | # && rm -rf /tmp/pip-tmp 49 | 50 | # [Optional] Uncomment this section to install additional OS packages. 51 | # RUN apt-get update \ 52 | # && export DEBIAN_FRONTEND=noninteractive \ 53 | # && apt-get -y install --no-install-recommends 54 | 55 | -------------------------------------------------------------------------------- /asr_evaluation/__main__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Copyright 2017-2018 Ben Lambert 4 | 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | """ 18 | Contains the main method for the CLI. 19 | """ 20 | 21 | import argparse 22 | 23 | # For some reason Python 2 and Python 3 disagree about how to import this. 
24 | try: 25 | from asr_evaluation.asr_evaluation import main as other_main 26 | except Exception: 27 | from asr_evaluation import main as other_main 28 | 29 | def get_parser(): 30 | """Parse the CLI args.""" 31 | parser = argparse.ArgumentParser(description='Evaluate an ASR transcript against a reference transcript.') 32 | parser.add_argument('ref', type=argparse.FileType('r'), help='Reference transcript filename') 33 | parser.add_argument('hyp', type=argparse.FileType('r'), help='ASR hypothesis filename') 34 | print_args = parser.add_mutually_exclusive_group() 35 | print_args.add_argument('-i', '--print-instances', action='store_true', 36 | help='Print all individual sentences and their errors.') 37 | print_args.add_argument('-r', '--print-errors', action='store_true', 38 | help='Print all individual sentences that contain errors.') 39 | parser.add_argument('--head-ids', action='store_true', 40 | help='Hypothesis and reference files have ids in the first token? (Kaldi format)') 41 | parser.add_argument('-id', '--tail-ids', '--has-ids', action='store_true', 42 | help='Hypothesis and reference files have ids in the last token? (Sphinx format)') 43 | parser.add_argument('-c', '--confusions', action='store_true', help='Print tables of which words were confused.') 44 | parser.add_argument('-p', '--print-wer-vs-length', action='store_true', 45 | help='Print table of average WER grouped by reference sentence length.') 46 | parser.add_argument('-m', '--min-word-count', type=int, default=1, metavar='count', 47 | help='Minimum word count to show a word in confusions (default 1).') 48 | parser.add_argument('-a', '--case-insensitive', action='store_true', 49 | help='Down-case the text before running the evaluation.') 50 | parser.add_argument('-e', '--remove-empty-refs', action='store_true', 51 | help='Skip over any examples where the reference is empty.') 52 | 53 | return parser 54 | 55 | def main(): 56 | """Run the program.""" 57 | parser = get_parser() 58 | args = parser.parse_args() 59 | other_main(args) 60 | 61 | if __name__ == "__main__": 62 | main() 63 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | asr_evaluation 2 | ============== 3 | [![Build Status](https://travis-ci.org/belambert/asr-evaluation.svg?branch=main)](https://travis-ci.org/belambert/asr-evaluation) 4 | [![PyPI version](https://badge.fury.io/py/asr_evaluation.svg)](https://badge.fury.io/py/asr_evaluation) 5 | [![codecov](https://codecov.io/gh/belambert/asr-evaluation/branch/main/graph/badge.svg)](https://codecov.io/gh/belambert/asr-evaluation) 6 | 7 | Python module for evaluating ASR hypotheses (i.e. word error rate and word 8 | recognition rate). 9 | 10 | This module depends on the [edit-distance](https://github.com/belambert/edit-distance) 11 | project for computing edit distances between arbitrary sequences. 12 | 13 | The formatting of this program's output is loosely modeled on the 14 | align.c program commonly used within the Sphinx ASR community. 15 | Evaluation may run a bit faster if neither instances nor confusions are printed. 16 | 17 | Please let me know if you have any comments, questions, or problems.
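As a quick, concrete illustration, here is a minimal word error rate (WER) calculation in plain Python. The example sentences are invented, and this sketch is not the package's own implementation (asr_evaluation delegates the alignment to edit_distance.SequenceMatcher); it simply spells out the metric described in the Output section below.

```
# Illustrative sketch only: WER = (substitutions + deletions + insertions) / reference length,
# computed with a plain Levenshtein alignment over word lists. Assumes a non-empty reference.

def word_error_rate(ref, hyp):
    ref, hyp = ref.split(), hyp.split()
    # dp[i][j] = edit distance between the first i reference words and the first j hypothesis words
    dp = [[0] * (len(hyp) + 1) for _ in range(len(ref) + 1)]
    for i in range(len(ref) + 1):
        dp[i][0] = i          # deleting i reference words
    for j in range(len(hyp) + 1):
        dp[0][j] = j          # inserting j hypothesis words
    for i in range(1, len(ref) + 1):
        for j in range(1, len(hyp) + 1):
            cost = 0 if ref[i - 1] == hyp[j - 1] else 1
            dp[i][j] = min(dp[i - 1][j] + 1,          # deletion
                           dp[i][j - 1] + 1,          # insertion
                           dp[i - 1][j - 1] + cost)   # match or substitution
    return dp[len(ref)][len(hyp)] / len(ref)

# One substitution ("white" -> "light") in a four-word reference gives WER = 1/4 = 0.25.
print(word_error_rate("the white cat sat", "the light cat sat"))
```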
18 | 19 | Output 20 | ------ 21 | The program outputs three standard measurements: 22 | - [Word error rate (WER)](https://en.wikipedia.org/wiki/Word_error_rate) 23 | - Word recognition rate (the number of _matched_ words in the alignment divided by the number of words in the reference). 24 | - Sentence error rate (SER) (the number of incorrect sentences divided by the total number of sentences). 25 | 26 | 27 | Installing & uninstalling 28 | ------------------------- 29 | The easiest way to install is using pip: 30 | 31 | pip install asr-evaluation 32 | 33 | Alternatively, you can clone this git repo and install using setuptools: 34 | 35 | git clone git@github.com:belambert/asr-evaluation.git 36 | cd asr-evaluation 37 | python setup.py install 38 | 39 | To uninstall with pip: 40 | 41 | pip uninstall asr-evaluation 42 | 43 | 44 | Command line usage 45 | ------------------ 46 | For command line usage, see: 47 | ``` 48 | wer --help 49 | ``` 50 | 51 | It should display something like this: 52 | 53 | ``` 54 | usage: wer [-h] [-i | -r] [--head-ids] [-id] [-c] [-p] [-m count] [-a] [-e] 55 | ref hyp 56 | 57 | Evaluate an ASR transcript against a reference transcript. 58 | 59 | positional arguments: 60 | ref Reference transcript filename 61 | hyp ASR hypothesis filename 62 | 63 | optional arguments: 64 | -h, --help show this help message and exit 65 | -i, --print-instances 66 | Print all individual sentences and their errors. 67 | -r, --print-errors Print all individual sentences that contain errors. 68 | --head-ids Hypothesis and reference files have ids in the first 69 | token? (Kaldi format) 70 | -id, --tail-ids, --has-ids 71 | Hypothesis and reference files have ids in the last 72 | token? (Sphinx format) 73 | -c, --confusions Print tables of which words were confused. 74 | -p, --print-wer-vs-length 75 | Print table of average WER grouped by reference 76 | sentence length. 77 | -m count, --min-word-count count 78 | Minimum word count to show a word in confusions. 79 | -a, --case-insensitive 80 | Down-case the text before running the evaluation. 81 | -e, --remove-empty-refs 82 | Skip over any examples where the reference is empty. 83 | ``` 84 | 85 | Contributing and code of conduct 86 | -------------------------------- 87 | For contributions, it's best to use GitHub issues and pull requests. Proper 88 | testing and documentation are suggested. 89 | 90 | Contributors are expected to behave reasonably, as specified by 91 | the [Contributor Covenant](http://contributor-covenant.org/version/1/4/) 92 | -------------------------------------------------------------------------------- /.devcontainer/library-scripts/common-debian.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | #------------------------------------------------------------------------------------------------------------- 3 | # Copyright (c) Microsoft Corporation. All rights reserved. 4 | # Licensed under the MIT License. See https://go.microsoft.com/fwlink/?linkid=2090316 for license information. 5 | #------------------------------------------------------------------------------------------------------------- 6 | 7 | # Syntax: ./common-debian.sh [install zsh flag] [username] [user UID] [user GID] [upgrade packages flag] 8 | 9 | INSTALL_ZSH=${1:-"true"} 10 | USERNAME=${2:-"vscode"} 11 | USER_UID=${3:-1000} 12 | USER_GID=${4:-1000} 13 | UPGRADE_PACKAGES=${5:-"true"} 14 | 15 | set -e 16 | 17 | if [ "$(id -u)" -ne 0 ]; then 18 | echo -e 'Script must be run as root.
Use sudo, su, or add "USER root" to your Dockerfile before running this script.' 19 | exit 1 20 | fi 21 | 22 | # Treat a user name of "none" as root 23 | if [ "${USERNAME}" = "none" ] || [ "${USERNAME}" = "root" ]; then 24 | USERNAME=root 25 | USER_UID=0 26 | USER_GID=0 27 | fi 28 | 29 | # Load markers to see which steps have already run 30 | MARKER_FILE="/usr/local/etc/vscode-dev-containers/common" 31 | if [ -f "${MARKER_FILE}" ]; then 32 | echo "Marker file found:" 33 | cat "${MARKER_FILE}" 34 | source "${MARKER_FILE}" 35 | fi 36 | 37 | # Ensure apt is in non-interactive to avoid prompts 38 | export DEBIAN_FRONTEND=noninteractive 39 | 40 | # Function to call apt-get if needed 41 | apt-get-update-if-needed() 42 | { 43 | if [ ! -d "/var/lib/apt/lists" ] || [ "$(ls /var/lib/apt/lists/ | wc -l)" = "0" ]; then 44 | echo "Running apt-get update..." 45 | apt-get update 46 | else 47 | echo "Skipping apt-get update." 48 | fi 49 | } 50 | 51 | # Run install apt-utils to avoid debconf warning then verify presence of other common developer tools and dependencies 52 | if [ "${PACKAGES_ALREADY_INSTALLED}" != "true" ]; then 53 | apt-get-update-if-needed 54 | 55 | PACKAGE_LIST="apt-utils \ 56 | git \ 57 | openssh-client \ 58 | less \ 59 | iproute2 \ 60 | procps \ 61 | curl \ 62 | wget \ 63 | unzip \ 64 | nano \ 65 | jq \ 66 | lsb-release \ 67 | ca-certificates \ 68 | apt-transport-https \ 69 | dialog \ 70 | gnupg2 \ 71 | libc6 \ 72 | libgcc1 \ 73 | libgssapi-krb5-2 \ 74 | libicu[0-9][0-9] \ 75 | liblttng-ust0 \ 76 | libstdc++6 \ 77 | zlib1g \ 78 | locales \ 79 | sudo" 80 | 81 | # Install libssl1.1 if available 82 | if [[ ! -z $(apt-cache --names-only search ^libssl1.1$) ]]; then 83 | PACKAGE_LIST="${PACKAGE_LIST} libssl1.1" 84 | fi 85 | 86 | # Install appropriate version of libssl1.0.x if available 87 | LIBSSL=$(dpkg-query -f '${db:Status-Abbrev}\t${binary:Package}\n' -W 'libssl1\.0\.?' 2>&1 || echo '') 88 | if [ "$(echo "$LIBSSL" | grep -o 'libssl1\.0\.[0-9]:' | uniq | sort | wc -l)" -eq 0 ]; then 89 | if [[ ! -z $(apt-cache --names-only search ^libssl1.0.2$) ]]; then 90 | # Debian 9 91 | PACKAGE_LIST="${PACKAGE_LIST} libssl1.0.2" 92 | elif [[ ! -z $(apt-cache --names-only search ^libssl1.0.0$) ]]; then 93 | # Ubuntu 18.04, 16.04, earlier 94 | PACKAGE_LIST="${PACKAGE_LIST} libssl1.0.0" 95 | fi 96 | fi 97 | 98 | echo "Packages to verify are installed: ${PACKAGE_LIST}" 99 | apt-get -y install --no-install-recommends ${PACKAGE_LIST} 2> >( grep -v 'debconf: delaying package configuration, since apt-utils is not installed' >&2 ) 100 | 101 | PACKAGES_ALREADY_INSTALLED="true" 102 | fi 103 | 104 | # Get to latest versions of all packages 105 | if [ "${UPGRADE_PACKAGES}" = "true" ]; then 106 | apt-get-update-if-needed 107 | apt-get -y upgrade --no-install-recommends 108 | apt-get autoremove -y 109 | fi 110 | 111 | # Ensure at least the en_US.UTF-8 UTF-8 locale is available. 112 | # Common need for both applications and things like the agnoster ZSH theme. 113 | if [ "${LOCALE_ALREADY_SET}" != "true" ]; then 114 | echo "en_US.UTF-8 UTF-8" >> /etc/locale.gen 115 | locale-gen 116 | LOCALE_ALREADY_SET="true" 117 | fi 118 | 119 | # Create or update a non-root user to match UID/GID - see https://aka.ms/vscode-remote/containers/non-root-user. 
120 | if id -u $USERNAME > /dev/null 2>&1; then 121 | # User exists, update if needed 122 | if [ "$USER_GID" != "$(id -G $USERNAME)" ]; then 123 | groupmod --gid $USER_GID $USERNAME 124 | usermod --gid $USER_GID $USERNAME 125 | fi 126 | if [ "$USER_UID" != "$(id -u $USERNAME)" ]; then 127 | usermod --uid $USER_UID $USERNAME 128 | fi 129 | else 130 | # Create user 131 | groupadd --gid $USER_GID $USERNAME 132 | useradd -s /bin/bash --uid $USER_UID --gid $USER_GID -m $USERNAME 133 | fi 134 | 135 | # Add add sudo support for non-root user 136 | if [ "${EXISTING_NON_ROOT_USER}" != "${USERNAME}" ]; then 137 | echo $USERNAME ALL=\(root\) NOPASSWD:ALL > /etc/sudoers.d/$USERNAME 138 | chmod 0440 /etc/sudoers.d/$USERNAME 139 | EXISTING_NON_ROOT_USER="${USERNAME}" 140 | fi 141 | 142 | # Ensure ~/.local/bin is in the PATH for root and non-root users for bash. (zsh is later) 143 | if [ "${DOT_LOCAL_ALREADY_ADDED}" != "true" ]; then 144 | echo "export PATH=\$PATH:\$HOME/.local/bin" | tee -a /root/.bashrc >> /home/$USERNAME/.bashrc 145 | chown $USER_UID:$USER_GID /home/$USERNAME/.bashrc 146 | DOT_LOCAL_ALREADY_ADDED="true" 147 | fi 148 | 149 | # Optionally install and configure zsh 150 | if [ "${INSTALL_ZSH}" = "true" ] && [ ! -d "/root/.oh-my-zsh" ] && [ "${ZSH_ALREADY_INSTALLED}" != "true" ]; then 151 | apt-get-update-if-needed 152 | apt-get install -y zsh 153 | curl -fsSLo- https://raw.githubusercontent.com/robbyrussell/oh-my-zsh/master/tools/install.sh | bash 2>&1 154 | echo "export PATH=\$PATH:\$HOME/.local/bin" >> /root/.zshrc 155 | if [ "${USERNAME}" != "root" ]; then 156 | cp -fR /root/.oh-my-zsh /home/$USERNAME 157 | cp -f /root/.zshrc /home/$USERNAME 158 | sed -i -e "s/\/root\/.oh-my-zsh/\/home\/$USERNAME\/.oh-my-zsh/g" /home/$USERNAME/.zshrc 159 | chown -R $USER_UID:$USER_GID /home/$USERNAME/.oh-my-zsh /home/$USERNAME/.zshrc 160 | fi 161 | ZSH_ALREADY_INSTALLED="true" 162 | fi 163 | 164 | # Write marker file 165 | mkdir -p "$(dirname "${MARKER_FILE}")" 166 | echo -e "\ 167 | PACKAGES_ALREADY_INSTALLED=${PACKAGES_ALREADY_INSTALLED}\n\ 168 | LOCALE_ALREADY_SET=${LOCALE_ALREADY_SET}\n\ 169 | EXISTING_NON_ROOT_USER=${EXISTING_NON_ROOT_USER}\n\ 170 | DOT_LOCAL_ALREADY_ADDED=${DOT_LOCAL_ALREADY_ADDED}\n\ 171 | ZSH_ALREADY_INSTALLED=${ZSH_ALREADY_INSTALLED}" > "${MARKER_FILE}" 172 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | PAPER = 8 | BUILDDIR = _build 9 | 10 | # User-friendly check for sphinx-build 11 | ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1) 12 | $(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don\'t have Sphinx installed, grab it from http://sphinx-doc.org/) 13 | endif 14 | 15 | # Internal variables. 16 | PAPEROPT_a4 = -D latex_paper_size=a4 17 | PAPEROPT_letter = -D latex_paper_size=letter 18 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 19 | # the i18n builder cannot share the environment and doctrees with the others 20 | I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 
21 | 22 | .PHONY: help 23 | help: 24 | @echo "Please use \`make ' where is one of" 25 | @echo " html to make standalone HTML files" 26 | @echo " dirhtml to make HTML files named index.html in directories" 27 | @echo " singlehtml to make a single large HTML file" 28 | @echo " pickle to make pickle files" 29 | @echo " json to make JSON files" 30 | @echo " htmlhelp to make HTML files and a HTML help project" 31 | @echo " qthelp to make HTML files and a qthelp project" 32 | @echo " applehelp to make an Apple Help Book" 33 | @echo " devhelp to make HTML files and a Devhelp project" 34 | @echo " epub to make an epub" 35 | @echo " epub3 to make an epub3" 36 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 37 | @echo " latexpdf to make LaTeX files and run them through pdflatex" 38 | @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" 39 | @echo " text to make text files" 40 | @echo " man to make manual pages" 41 | @echo " texinfo to make Texinfo files" 42 | @echo " info to make Texinfo files and run them through makeinfo" 43 | @echo " gettext to make PO message catalogs" 44 | @echo " changes to make an overview of all changed/added/deprecated items" 45 | @echo " xml to make Docutils-native XML files" 46 | @echo " pseudoxml to make pseudoxml-XML files for display purposes" 47 | @echo " linkcheck to check all external links for integrity" 48 | @echo " doctest to run all doctests embedded in the documentation (if enabled)" 49 | @echo " coverage to run coverage check of the documentation (if enabled)" 50 | @echo " dummy to check syntax errors of document sources" 51 | 52 | .PHONY: clean 53 | clean: 54 | rm -rf $(BUILDDIR)/* 55 | 56 | .PHONY: html 57 | html: 58 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 59 | @echo 60 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 61 | 62 | .PHONY: dirhtml 63 | dirhtml: 64 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml 65 | @echo 66 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." 67 | 68 | .PHONY: singlehtml 69 | singlehtml: 70 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml 71 | @echo 72 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." 73 | 74 | .PHONY: pickle 75 | pickle: 76 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle 77 | @echo 78 | @echo "Build finished; now you can process the pickle files." 79 | 80 | .PHONY: json 81 | json: 82 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json 83 | @echo 84 | @echo "Build finished; now you can process the JSON files." 85 | 86 | .PHONY: htmlhelp 87 | htmlhelp: 88 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp 89 | @echo 90 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 91 | ".hhp project file in $(BUILDDIR)/htmlhelp." 92 | 93 | .PHONY: qthelp 94 | qthelp: 95 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp 96 | @echo 97 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 98 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:" 99 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/asr-evaluation.qhcp" 100 | @echo "To view the help file:" 101 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/asr-evaluation.qhc" 102 | 103 | .PHONY: applehelp 104 | applehelp: 105 | $(SPHINXBUILD) -b applehelp $(ALLSPHINXOPTS) $(BUILDDIR)/applehelp 106 | @echo 107 | @echo "Build finished. The help book is in $(BUILDDIR)/applehelp." 108 | @echo "N.B. 
You won't be able to view it unless you put it in" \ 109 | "~/Library/Documentation/Help or install it in your application" \ 110 | "bundle." 111 | 112 | .PHONY: devhelp 113 | devhelp: 114 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp 115 | @echo 116 | @echo "Build finished." 117 | @echo "To view the help file:" 118 | @echo "# mkdir -p $$HOME/.local/share/devhelp/asr-evaluation" 119 | @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/asr-evaluation" 120 | @echo "# devhelp" 121 | 122 | .PHONY: epub 123 | epub: 124 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub 125 | @echo 126 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub." 127 | 128 | .PHONY: epub3 129 | epub3: 130 | $(SPHINXBUILD) -b epub3 $(ALLSPHINXOPTS) $(BUILDDIR)/epub3 131 | @echo 132 | @echo "Build finished. The epub3 file is in $(BUILDDIR)/epub3." 133 | 134 | .PHONY: latex 135 | latex: 136 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 137 | @echo 138 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 139 | @echo "Run \`make' in that directory to run these through (pdf)latex" \ 140 | "(use \`make latexpdf' here to do that automatically)." 141 | 142 | .PHONY: latexpdf 143 | latexpdf: 144 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 145 | @echo "Running LaTeX files through pdflatex..." 146 | $(MAKE) -C $(BUILDDIR)/latex all-pdf 147 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 148 | 149 | .PHONY: latexpdfja 150 | latexpdfja: 151 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 152 | @echo "Running LaTeX files through platex and dvipdfmx..." 153 | $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja 154 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 155 | 156 | .PHONY: text 157 | text: 158 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text 159 | @echo 160 | @echo "Build finished. The text files are in $(BUILDDIR)/text." 161 | 162 | .PHONY: man 163 | man: 164 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man 165 | @echo 166 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man." 167 | 168 | .PHONY: texinfo 169 | texinfo: 170 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 171 | @echo 172 | @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." 173 | @echo "Run \`make' in that directory to run these through makeinfo" \ 174 | "(use \`make info' here to do that automatically)." 175 | 176 | .PHONY: info 177 | info: 178 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 179 | @echo "Running Texinfo files through makeinfo..." 180 | make -C $(BUILDDIR)/texinfo info 181 | @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." 182 | 183 | .PHONY: gettext 184 | gettext: 185 | $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale 186 | @echo 187 | @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." 188 | 189 | .PHONY: changes 190 | changes: 191 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 192 | @echo 193 | @echo "The overview file is in $(BUILDDIR)/changes." 194 | 195 | .PHONY: linkcheck 196 | linkcheck: 197 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 198 | @echo 199 | @echo "Link check complete; look for any errors in the above output " \ 200 | "or in $(BUILDDIR)/linkcheck/output.txt." 
201 | 202 | .PHONY: doctest 203 | doctest: 204 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 205 | @echo "Testing of doctests in the sources finished, look at the " \ 206 | "results in $(BUILDDIR)/doctest/output.txt." 207 | 208 | .PHONY: coverage 209 | coverage: 210 | $(SPHINXBUILD) -b coverage $(ALLSPHINXOPTS) $(BUILDDIR)/coverage 211 | @echo "Testing of coverage in the sources finished, look at the " \ 212 | "results in $(BUILDDIR)/coverage/python.txt." 213 | 214 | .PHONY: xml 215 | xml: 216 | $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml 217 | @echo 218 | @echo "Build finished. The XML files are in $(BUILDDIR)/xml." 219 | 220 | .PHONY: pseudoxml 221 | pseudoxml: 222 | $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml 223 | @echo 224 | @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." 225 | 226 | .PHONY: dummy 227 | dummy: 228 | $(SPHINXBUILD) -b dummy $(ALLSPHINXOPTS) $(BUILDDIR)/dummy 229 | @echo 230 | @echo "Build finished. Dummy builder generates no files." 231 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # asr-evaluation documentation build configuration file, created by 4 | # sphinx-quickstart on Sat Jan 7 19:32:10 2017. 5 | # 6 | # This file is execfile()d with the current directory set to its 7 | # containing dir. 8 | # 9 | # Note that not all possible configuration values are present in this 10 | # autogenerated file. 11 | # 12 | # All configuration values have a default; values that are commented out 13 | # serve to show the default. 14 | 15 | import sys 16 | import os 17 | 18 | # If extensions (or modules to document with autodoc) are in another directory, 19 | # add these directories to sys.path here. If the directory is relative to the 20 | # documentation root, use os.path.abspath to make it absolute, like shown here. 21 | sys.path.insert(0, os.path.abspath('../src')) 22 | 23 | # -- General configuration ------------------------------------------------ 24 | 25 | # If your documentation needs a minimal Sphinx version, state it here. 26 | #needs_sphinx = '1.0' 27 | 28 | # Add any Sphinx extension module names here, as strings. They can be 29 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 30 | # ones. 31 | extensions = [ 32 | 'sphinx.ext.autodoc', 33 | 'sphinx.ext.intersphinx', 34 | 'sphinx.ext.todo', 35 | 'sphinx.ext.coverage', 36 | ] 37 | 38 | # Add any paths that contain templates here, relative to this directory. 39 | templates_path = ['_templates'] 40 | 41 | # The suffix(es) of source filenames. 42 | # You can specify multiple suffix as a list of string: 43 | # source_suffix = ['.rst', '.md'] 44 | source_suffix = '.rst' 45 | 46 | # The encoding of source files. 47 | #source_encoding = 'utf-8-sig' 48 | 49 | # The master toctree document. 50 | master_doc = 'index' 51 | 52 | # General information about the project. 53 | project = u'asr-evaluation' 54 | copyright = u'2017, Ben Lambert' 55 | author = u'Ben Lambert' 56 | 57 | # The version info for the project you're documenting, acts as replacement for 58 | # |version| and |release|, also used in various other places throughout the 59 | # built documents. 60 | # 61 | # The short X.Y version. 62 | version = u'0.2.3' 63 | # The full version, including alpha/beta/rc tags. 64 | release = u'0.2.3' 65 | 66 | # The language for content autogenerated by Sphinx. 
Refer to documentation 67 | # for a list of supported languages. 68 | # 69 | # This is also used if you do content translation via gettext catalogs. 70 | # Usually you set "language" from the command line for these cases. 71 | language = None 72 | 73 | # There are two options for replacing |today|: either, you set today to some 74 | # non-false value, then it is used: 75 | #today = '' 76 | # Else, today_fmt is used as the format for a strftime call. 77 | #today_fmt = '%B %d, %Y' 78 | 79 | # List of patterns, relative to source directory, that match files and 80 | # directories to ignore when looking for source files. 81 | # This patterns also effect to html_static_path and html_extra_path 82 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] 83 | 84 | # The reST default role (used for this markup: `text`) to use for all 85 | # documents. 86 | #default_role = None 87 | 88 | # If true, '()' will be appended to :func: etc. cross-reference text. 89 | #add_function_parentheses = True 90 | 91 | # If true, the current module name will be prepended to all description 92 | # unit titles (such as .. function::). 93 | #add_module_names = True 94 | 95 | # If true, sectionauthor and moduleauthor directives will be shown in the 96 | # output. They are ignored by default. 97 | #show_authors = False 98 | 99 | # The name of the Pygments (syntax highlighting) style to use. 100 | pygments_style = 'sphinx' 101 | 102 | # A list of ignored prefixes for module index sorting. 103 | #modindex_common_prefix = [] 104 | 105 | # If true, keep warnings as "system message" paragraphs in the built documents. 106 | #keep_warnings = False 107 | 108 | # If true, `todo` and `todoList` produce output, else they produce nothing. 109 | todo_include_todos = True 110 | 111 | 112 | # -- Options for HTML output ---------------------------------------------- 113 | 114 | # The theme to use for HTML and HTML Help pages. See the documentation for 115 | # a list of builtin themes. 116 | html_theme = 'classic' 117 | 118 | # Theme options are theme-specific and customize the look and feel of a theme 119 | # further. For a list of options available for each theme, see the 120 | # documentation. 121 | #html_theme_options = {} 122 | 123 | # Add any paths that contain custom themes here, relative to this directory. 124 | #html_theme_path = [] 125 | 126 | # The name for this set of Sphinx documents. 127 | # " v documentation" by default. 128 | #html_title = u'asr-evaluation v0.2.3' 129 | 130 | # A shorter title for the navigation bar. Default is the same as html_title. 131 | #html_short_title = None 132 | 133 | # The name of an image file (relative to this directory) to place at the top 134 | # of the sidebar. 135 | #html_logo = None 136 | 137 | # The name of an image file (relative to this directory) to use as a favicon of 138 | # the docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 139 | # pixels large. 140 | #html_favicon = None 141 | 142 | # Add any paths that contain custom static files (such as style sheets) here, 143 | # relative to this directory. They are copied after the builtin static files, 144 | # so a file named "default.css" will overwrite the builtin "default.css". 145 | html_static_path = ['_static'] 146 | 147 | # Add any extra paths that contain custom files (such as robots.txt or 148 | # .htaccess) here, relative to this directory. These files are copied 149 | # directly to the root of the documentation. 
150 | #html_extra_path = [] 151 | 152 | # If not None, a 'Last updated on:' timestamp is inserted at every page 153 | # bottom, using the given strftime format. 154 | # The empty string is equivalent to '%b %d, %Y'. 155 | #html_last_updated_fmt = None 156 | 157 | # If true, SmartyPants will be used to convert quotes and dashes to 158 | # typographically correct entities. 159 | #html_use_smartypants = True 160 | 161 | # Custom sidebar templates, maps document names to template names. 162 | #html_sidebars = {} 163 | 164 | # Additional templates that should be rendered to pages, maps page names to 165 | # template names. 166 | #html_additional_pages = {} 167 | 168 | # If false, no module index is generated. 169 | #html_domain_indices = True 170 | 171 | # If false, no index is generated. 172 | #html_use_index = True 173 | 174 | # If true, the index is split into individual pages for each letter. 175 | #html_split_index = False 176 | 177 | # If true, links to the reST sources are added to the pages. 178 | #html_show_sourcelink = True 179 | 180 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. 181 | #html_show_sphinx = True 182 | 183 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. 184 | #html_show_copyright = True 185 | 186 | # If true, an OpenSearch description file will be output, and all pages will 187 | # contain a tag referring to it. The value of this option must be the 188 | # base URL from which the finished HTML is served. 189 | #html_use_opensearch = '' 190 | 191 | # This is the file name suffix for HTML files (e.g. ".xhtml"). 192 | #html_file_suffix = None 193 | 194 | # Language to be used for generating the HTML full-text search index. 195 | # Sphinx supports the following languages: 196 | # 'da', 'de', 'en', 'es', 'fi', 'fr', 'hu', 'it', 'ja' 197 | # 'nl', 'no', 'pt', 'ro', 'ru', 'sv', 'tr', 'zh' 198 | #html_search_language = 'en' 199 | 200 | # A dictionary with options for the search language support, empty by default. 201 | # 'ja' uses this config value. 202 | # 'zh' user can custom change `jieba` dictionary path. 203 | #html_search_options = {'type': 'default'} 204 | 205 | # The name of a javascript file (relative to the configuration directory) that 206 | # implements a search results scorer. If empty, the default will be used. 207 | #html_search_scorer = 'scorer.js' 208 | 209 | # Output file base name for HTML help builder. 210 | htmlhelp_basename = 'asr-evaluationdoc' 211 | 212 | # -- Options for LaTeX output --------------------------------------------- 213 | 214 | latex_elements = { 215 | # The paper size ('letterpaper' or 'a4paper'). 216 | #'papersize': 'letterpaper', 217 | 218 | # The font size ('10pt', '11pt' or '12pt'). 219 | #'pointsize': '10pt', 220 | 221 | # Additional stuff for the LaTeX preamble. 222 | #'preamble': '', 223 | 224 | # Latex figure (float) alignment 225 | #'figure_align': 'htbp', 226 | } 227 | 228 | # Grouping the document tree into LaTeX files. List of tuples 229 | # (source start file, target name, title, 230 | # author, documentclass [howto, manual, or own class]). 231 | latex_documents = [ 232 | (master_doc, 'asr-evaluation.tex', u'asr-evaluation Documentation', 233 | u'Ben Lambert', 'manual'), 234 | ] 235 | 236 | # The name of an image file (relative to this directory) to place at the top of 237 | # the title page. 238 | #latex_logo = None 239 | 240 | # For "manual" documents, if this is true, then toplevel headings are parts, 241 | # not chapters. 
242 | #latex_use_parts = False 243 | 244 | # If true, show page references after internal links. 245 | #latex_show_pagerefs = False 246 | 247 | # If true, show URL addresses after external links. 248 | #latex_show_urls = False 249 | 250 | # Documents to append as an appendix to all manuals. 251 | #latex_appendices = [] 252 | 253 | # If false, no module index is generated. 254 | #latex_domain_indices = True 255 | 256 | 257 | # -- Options for manual page output --------------------------------------- 258 | 259 | # One entry per manual page. List of tuples 260 | # (source start file, name, description, authors, manual section). 261 | man_pages = [ 262 | (master_doc, 'asr-evaluation', u'asr-evaluation Documentation', 263 | [author], 1) 264 | ] 265 | 266 | # If true, show URL addresses after external links. 267 | #man_show_urls = False 268 | 269 | 270 | # -- Options for Texinfo output ------------------------------------------- 271 | 272 | # Grouping the document tree into Texinfo files. List of tuples 273 | # (source start file, target name, title, author, 274 | # dir menu entry, description, category) 275 | texinfo_documents = [ 276 | (master_doc, 'asr-evaluation', u'asr-evaluation Documentation', 277 | author, 'asr-evaluation', 'One line description of project.', 278 | 'Miscellaneous'), 279 | ] 280 | 281 | # Documents to append as an appendix to all manuals. 282 | #texinfo_appendices = [] 283 | 284 | # If false, no module index is generated. 285 | #texinfo_domain_indices = True 286 | 287 | # How to display URL addresses: 'footnote', 'no', or 'inline'. 288 | #texinfo_show_urls = 'footnote' 289 | 290 | # If true, do not generate a @detailmenu in the "Top" node's menu. 291 | #texinfo_no_detailmenu = False 292 | 293 | 294 | # Example configuration for intersphinx: refer to the Python standard library. 295 | intersphinx_mapping = {'https://docs.python.org/3': None} 296 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 
29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright 2013-2018 Ben Lambert 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /asr_evaluation/asr_evaluation.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017-2018 Ben Lambert 2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | """ 17 | Primary code for computing word error rate and other metrics from ASR output. 18 | """ 19 | from __future__ import division 20 | 21 | from functools import reduce 22 | from collections import defaultdict 23 | from edit_distance import SequenceMatcher 24 | 25 | from termcolor import colored 26 | 27 | # Some defaults 28 | print_instances_p = False 29 | print_errors_p = False 30 | files_head_ids = False 31 | files_tail_ids = False 32 | confusions = False 33 | min_count = 0 34 | wer_vs_length_p = True 35 | 36 | # For keeping track of the total number of tokens, errors, and matches 37 | ref_token_count = 0 38 | error_count = 0 39 | match_count = 0 40 | counter = 0 41 | sent_error_count = 0 42 | 43 | # For keeping track of word error rates by sentence length; 44 | # this is so we can see if performance is better/worse for longer 45 | # and/or shorter sentences 46 | lengths = [] 47 | error_rates = [] 48 | wer_bins = defaultdict(list) 49 | wer_vs_length = defaultdict(list) 50 | # Tables for keeping track of which words get confused with one another 51 | insertion_table = defaultdict(int) 52 | deletion_table = defaultdict(int) 53 | substitution_table = defaultdict(int) 54 | # These are the edit_distance opcodes that are considered 'errors' 55 | error_codes = ['replace', 'delete', 'insert'] 56 | 57 | 58 | # TODO - rename this function. Move some of it into evaluate.py? 59 | def main(args): 60 | """Main method - this reads the hyp and ref files, and creates 61 | edit_distance.SequenceMatcher objects to compute the edit distance. 62 | All the necessary statistics are collected, and results are 63 | printed as specified by the command line options. 64 | 65 | This function does not check that the reference and 66 | hypothesis files have the same number of lines. It will stop after the 67 | shorter one runs out of lines. This should be easy to fix...
68 | """ 69 | global counter 70 | set_global_variables(args) 71 | 72 | counter = 0 73 | # Loop through each line of the reference and hyp file 74 | for ref_line, hyp_line in zip(args.ref, args.hyp): 75 | processed_p = process_line_pair(ref_line, hyp_line, case_insensitive=args.case_insensitive, 76 | remove_empty_refs=args.remove_empty_refs) 77 | if processed_p: 78 | counter += 1 79 | if confusions: 80 | print_confusions() 81 | if wer_vs_length_p: 82 | print_wer_vs_length() 83 | # Compute WER and WRR 84 | if ref_token_count > 0: 85 | wrr = match_count / ref_token_count 86 | wer = error_count / ref_token_count 87 | else: 88 | wrr = 0.0 89 | wer = 0.0 90 | # Compute SER 91 | ser = sent_error_count / counter if counter > 0 else 0.0 92 | print('Sentence count: {}'.format(counter)) 93 | print('WER: {:10.3%} ({:10d} / {:10d})'.format(wer, error_count, ref_token_count)) 94 | print('WRR: {:10.3%} ({:10d} / {:10d})'.format(wrr, match_count, ref_token_count)) 95 | print('SER: {:10.3%} ({:10d} / {:10d})'.format(ser, sent_error_count, counter)) 96 | 97 | 98 | def process_line_pair(ref_line, hyp_line, case_insensitive=False, remove_empty_refs=False): 99 | """Given a pair of strings corresponding to a reference and hypothesis, 100 | compute the edit distance, print if desired, and keep track of results 101 | in global variables. 102 | 103 | Return true if the pair was counted, false if the pair was not counted due 104 | to an empty reference string.""" 105 | # I don't believe these all need to be global. In any case, they shouldn't be. 106 | global error_count 107 | global match_count 108 | global ref_token_count 109 | global sent_error_count 110 | 111 | # Split into tokens by whitespace 112 | ref = ref_line.split() 113 | hyp = hyp_line.split() 114 | id_ = None 115 | 116 | # If the files have IDs, then split the ID off from the text 117 | if files_head_ids: 118 | id_ = ref[0] 119 | ref, hyp = remove_head_id(ref, hyp) 120 | elif files_tail_ids: 121 | id_ = ref[-1] 122 | ref, hyp = remove_tail_id(ref, hyp) 123 | 124 | if case_insensitive: 125 | ref = list(map(str.lower, ref)) 126 | hyp = list(map(str.lower, hyp)) 127 | if remove_empty_refs and len(ref) == 0: 128 | return False 129 | 130 | # Create an object to get the edit distance, and then retrieve the 131 | # relevant counts that we need. 
132 | sm = SequenceMatcher(a=ref, b=hyp) 133 | errors = get_error_count(sm) 134 | matches = get_match_count(sm) 135 | ref_length = len(ref) 136 | 137 | # Increment the total counts we're tracking 138 | error_count += errors 139 | match_count += matches 140 | ref_token_count += ref_length 141 | 142 | if errors != 0: 143 | sent_error_count += 1 144 | 145 | # If we're keeping track of which words get mixed up with which others, call track_confusions 146 | if confusions: 147 | track_confusions(sm, ref, hyp) 148 | 149 | # If we're printing instances, do it here (in roughly the align.c format) 150 | if print_instances_p or (print_errors_p and errors != 0): 151 | print_instances(ref, hyp, sm, id_=id_) 152 | 153 | # Keep track of the individual error rates, and reference lengths, so we 154 | # can compute average WERs by sentence length 155 | lengths.append(ref_length) 156 | error_rate = errors * 1.0 / len(ref) if len(ref) > 0 else float("inf") 157 | error_rates.append(error_rate) 158 | wer_bins[len(ref)].append(error_rate) 159 | return True 160 | 161 | def set_global_variables(args): 162 | """Copy argparse args into global variables.""" 163 | global print_instances_p 164 | global print_errors_p 165 | global files_head_ids 166 | global files_tail_ids 167 | global confusions 168 | global min_count 169 | global wer_vs_length_p 170 | # Put the command line options into global variables. 171 | print_instances_p = args.print_instances 172 | print_errors_p = args.print_errors 173 | files_head_ids = args.head_ids 174 | files_tail_ids = args.tail_ids 175 | confusions = args.confusions 176 | min_count = args.min_word_count 177 | wer_vs_length_p = args.print_wer_vs_length 178 | 179 | def remove_head_id(ref, hyp): 180 | """Assumes that the ID is the begin token of the string which is common 181 | in Kaldi but not in Sphinx.""" 182 | ref_id = ref[0] 183 | hyp_id = hyp[0] 184 | if ref_id != hyp_id: 185 | print('Reference and hypothesis IDs do not match! ' 186 | 'ref="{}" hyp="{}"\n' 187 | 'File lines in hyp file should match those in the ref file.'.format(ref_id, hyp_id)) 188 | exit(-1) 189 | ref = ref[1:] 190 | hyp = hyp[1:] 191 | return ref, hyp 192 | 193 | def remove_tail_id(ref, hyp): 194 | """Assumes that the ID is the final token of the string which is common 195 | in Sphinx but not in Kaldi.""" 196 | ref_id = ref[-1] 197 | hyp_id = hyp[-1] 198 | if ref_id != hyp_id: 199 | print('Reference and hypothesis IDs do not match! 
' 200 | 'ref="{}" hyp="{}"\n' 201 | 'File lines in hyp file should match those in the ref file.'.format(ref_id, hyp_id)) 202 | exit(-1) 203 | ref = ref[:-1] 204 | hyp = hyp[:-1] 205 | return ref, hyp 206 | 207 | def print_instances(ref, hyp, sm, id_=None): 208 | """Print a single instance of a ref/hyp pair.""" 209 | print_diff(sm, ref, hyp) 210 | if id_: 211 | print(('SENTENCE {0:d} {1!s}'.format(counter + 1, id_))) 212 | else: 213 | print('SENTENCE {0:d}'.format(counter + 1)) 214 | # Handle cases where the reference is empty without dying 215 | if len(ref) != 0: 216 | correct_rate = sm.matches() / len(ref) 217 | error_rate = sm.distance() / len(ref) 218 | elif sm.matches() == 0: 219 | correct_rate = 1.0 220 | error_rate = 0.0 221 | else: 222 | correct_rate = 0.0 223 | error_rate = sm.matches() 224 | print('Correct = {0:6.1%} {1:3d} ({2:6d})'.format(correct_rate, sm.matches(), len(ref))) 225 | print('Errors = {0:6.1%} {1:3d} ({2:6d})'.format(error_rate, sm.distance(), len(ref))) 226 | 227 | def track_confusions(sm, seq1, seq2): 228 | """Keep track of the errors in a global variable, given a sequence matcher.""" 229 | opcodes = sm.get_opcodes() 230 | for tag, i1, i2, j1, j2 in opcodes: 231 | if tag == 'insert': 232 | for i in range(j1, j2): 233 | word = seq2[i] 234 | insertion_table[word] += 1 235 | elif tag == 'delete': 236 | for i in range(i1, i2): 237 | word = seq1[i] 238 | deletion_table[word] += 1 239 | elif tag == 'replace': 240 | for w1 in seq1[i1:i2]: 241 | for w2 in seq2[j1:j2]: 242 | key = (w1, w2) 243 | substitution_table[key] += 1 244 | 245 | def print_confusions(): 246 | """Print the confused words that we found, grouped by insertions, deletions 247 | and substitutions.""" 248 | if len(insertion_table) > 0: 249 | print('INSERTIONS:') 250 | for item in sorted(list(insertion_table.items()), key=lambda x: x[1], reverse=True): 251 | if item[1] >= min_count: 252 | print('{0:20s} {1:10d}'.format(*item)) 253 | if len(deletion_table) > 0: 254 | print('DELETIONS:') 255 | for item in sorted(list(deletion_table.items()), key=lambda x: x[1], reverse=True): 256 | if item[1] >= min_count: 257 | print('{0:20s} {1:10d}'.format(*item)) 258 | if len(substitution_table) > 0: 259 | print('SUBSTITUTIONS:') 260 | for [w1, w2], count in sorted(list(substitution_table.items()), key=lambda x: x[1], reverse=True): 261 | if count >= min_count: 262 | print('{0:20s} -> {1:20s} {2:10d}'.format(w1, w2, count)) 263 | 264 | # TODO - For some reason I was getting two different counts depending on how I count the matches, 265 | # so do an assertion in this code to make sure we're getting matching counts. 266 | # This might slow things down. 267 | def get_match_count(sm): 268 | "Return the number of matches, given a sequence matcher object." 269 | matches = None 270 | matches1 = sm.matches() 271 | matching_blocks = sm.get_matching_blocks() 272 | matches2 = reduce(lambda x, y: x + y, [x[2] for x in matching_blocks], 0) 273 | assert matches1 == matches2 274 | matches = matches1 275 | return matches 276 | 277 | 278 | def get_error_count(sm): 279 | """Return the number of errors (insertions, deletions, and substitutions), 280 | given a sequence matcher object.""" 281 | opcodes = sm.get_opcodes() 282 | errors = [x for x in opcodes if x[0] in error_codes] 283 | error_lengths = [max(x[2] - x[1], x[4] - x[3]) for x in errors] 284 | return reduce(lambda x, y: x + y, error_lengths, 0) 285 | 286 | # TODO - This is long and ugly. Perhaps we can break it up?
287 | # It would make more sense for this to just return the two strings... 288 | def print_diff(sm, seq1, seq2, prefix1='REF:', prefix2='HYP:', suffix1=None, suffix2=None): 289 | """Given a sequence matcher and the two sequences, print a Sphinx-style 290 | 'diff' of the two.""" 291 | ref_tokens = [] 292 | hyp_tokens = [] 293 | opcodes = sm.get_opcodes() 294 | for tag, i1, i2, j1, j2 in opcodes: 295 | # If they are equal, do nothing except lowercase them 296 | if tag == 'equal': 297 | for i in range(i1, i2): 298 | ref_tokens.append(seq1[i].lower()) 299 | for i in range(j1, j2): 300 | hyp_tokens.append(seq2[i].lower()) 301 | # For insertions and deletions, put a '***' filler on the missing side, and 302 | # make the other side all caps 303 | elif tag == 'delete': 304 | for i in range(i1, i2): 305 | ref_token = colored(seq1[i].upper(), 'red') 306 | ref_tokens.append(ref_token) 307 | for i in range(i1, i2): 308 | hyp_token = colored('*' * len(seq1[i]), 'red') 309 | hyp_tokens.append(hyp_token) 310 | elif tag == 'insert': 311 | for i in range(j1, j2): 312 | ref_token = colored('*' * len(seq2[i]), 'red') 313 | ref_tokens.append(ref_token) 314 | for i in range(j1, j2): 315 | hyp_token = colored(seq2[i].upper(), 'red') 316 | hyp_tokens.append(hyp_token) 317 | # More complicated logic for a substitution 318 | elif tag == 'replace': 319 | seq1_len = i2 - i1 320 | seq2_len = j2 - j1 321 | # Get a list of tokens for each 322 | s1 = list(map(str.upper, seq1[i1:i2])) 323 | s2 = list(map(str.upper, seq2[j1:j2])) 324 | # Pad the two lists with False values to get them to the same length 325 | if seq1_len > seq2_len: 326 | for i in range(0, seq1_len - seq2_len): 327 | s2.append(False) 328 | if seq1_len < seq2_len: 329 | for i in range(0, seq2_len - seq1_len): 330 | s1.append(False) 331 | assert len(s1) == len(s2) 332 | # Pair up words with their substitutions, or fillers 333 | for i in range(0, len(s1)): 334 | w1 = s1[i] 335 | w2 = s2[i] 336 | # If we have two words, make them the same length 337 | if w1 and w2: 338 | if len(w1) > len(w2): 339 | s2[i] = w2 + ' ' * (len(w1) - len(w2)) 340 | elif len(w1) < len(w2): 341 | s1[i] = w1 + ' ' * (len(w2) - len(w1)) 342 | # Otherwise, create an empty filler word of the right width 343 | if not w1: 344 | s1[i] = '*' * len(w2) 345 | if not w2: 346 | s2[i] = '*' * len(w1) 347 | s1 = map(lambda x: colored(x, 'red'), s1) 348 | s2 = map(lambda x: colored(x, 'red'), s2) 349 | ref_tokens += s1 350 | hyp_tokens += s2 351 | if prefix1: ref_tokens.insert(0, prefix1) 352 | if prefix2: hyp_tokens.insert(0, prefix2) 353 | if suffix1: ref_tokens.append(suffix1) 354 | if suffix2: hyp_tokens.append(suffix2) 355 | print(' '.join(ref_tokens)) 356 | print(' '.join(hyp_tokens)) 357 | 358 | def mean(seq): 359 | """Return the average of the elements of a sequence.""" 360 | return float(sum(seq)) / len(seq) if len(seq) > 0 else float('nan') 361 | 362 | def print_wer_vs_length(): 363 | """Print the average word error rate for each sentence length.""" 364 | avg_wers = {length: mean(wers) for length, wers in wer_bins.items()} 365 | for length, avg_wer in sorted(avg_wers.items(), key=lambda x: (x[1], x[0])): 366 | print('{0:5d} {1:f}'.format(length, avg_wer)) 367 | print('') 368 | --------------------------------------------------------------------------------
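To make the counting above concrete, here is a minimal sketch (not part of the repository) of how the opcode handling in get_error_count and get_match_count turns into a word error rate for a single ref/hyp pair. The toy sentences and variable names are invented purely for illustration; only edit_distance.SequenceMatcher and the methods already used in the module are assumed.

from edit_distance import SequenceMatcher

# Hypothetical example pair; any whitespace-tokenized ref/hyp works.
ref = 'the quick brown fox'.split()
hyp = 'the quack brown fox jumps'.split()

sm = SequenceMatcher(a=ref, b=hyp)

# Same rule as get_error_count(): every non-'equal' opcode contributes
# the length of its longer side.
errors = sum(max(i2 - i1, j2 - j1)
             for tag, i1, i2, j1, j2 in sm.get_opcodes()
             if tag in ('replace', 'delete', 'insert'))

wer = errors / len(ref)        # word error rate, as printed by main()
wrr = sm.matches() / len(ref)  # word recognition rate
print('WER: {:.3%}  WRR: {:.3%}'.format(wer, wrr))
# For this pair the edit distance is 2 (one substitution, one insertion),
# so this should report a WER of 50% and a WRR of 75%.

Run over the hyp and ref files line by line, this is essentially what main() aggregates across every sentence before printing the overall WER, WRR, and SER.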