├── test ├── __init__.py └── test.py ├── .gitignore ├── requirements.txt ├── setup.cfg ├── asr_evaluation ├── __init__.py ├── __main__.py └── asr_evaluation.py ├── .readthedocs.yml ├── docs ├── modules.rst ├── asr_evaluation.rst ├── index.rst ├── Makefile └── conf.py ├── .circleci └── config.yml ├── .devcontainer ├── library-scripts │ ├── README.md │ └── common-debian.sh ├── Dockerfile ├── devcontainer.json └── base.Dockerfile ├── setup.py ├── .travis.yml ├── README.md └── LICENSE /test/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | asr_evaluation.egg-info 2 | dist 3 | build -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | edit_distance 2 | termcolor 3 | flake8 4 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [flake8] 2 | ignore = E302,E305,E701 3 | max-line-length = 120 -------------------------------------------------------------------------------- /asr_evaluation/__init__.py: -------------------------------------------------------------------------------- 1 | # from asr_evaluation.asr_evaluation import * 2 | -------------------------------------------------------------------------------- /.readthedocs.yml: -------------------------------------------------------------------------------- 1 | 2 | requirements_file: requirements.txt 3 | 4 | python: 5 | version: 3 6 | -------------------------------------------------------------------------------- /docs/modules.rst: -------------------------------------------------------------------------------- 1 | asr_evaluation 2 | ============== 3 | 4 | .. toctree:: 5 | :maxdepth: 4 6 | 7 | asr_evaluation 8 | -------------------------------------------------------------------------------- /.circleci/config.yml: -------------------------------------------------------------------------------- 1 | version: 2.1 2 | 3 | orbs: 4 | python: circleci/python@0.2.1 5 | 6 | jobs: 7 | build-and-test: 8 | executor: python/default 9 | steps: 10 | - checkout 11 | - python/install-deps 12 | - run: 13 | command: python setup.py test 14 | name: Unit tests 15 | - run: 16 | command: flake8 asr_evaluation 17 | name: Style check 18 | 19 | workflows: 20 | main: 21 | jobs: 22 | - build-and-test 23 | -------------------------------------------------------------------------------- /docs/asr_evaluation.rst: -------------------------------------------------------------------------------- 1 | asr_evaluation package 2 | ====================== 3 | 4 | Submodules 5 | ---------- 6 | 7 | asr_evaluation.asr_evaluation module 8 | ------------------------------------ 9 | 10 | .. automodule:: asr_evaluation.asr_evaluation 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | 16 | Module contents 17 | --------------- 18 | 19 | .. 
automodule:: asr_evaluation.asr_evaluation 20 | :members: 21 | :undoc-members: 22 | :show-inheritance: 23 | -------------------------------------------------------------------------------- /.devcontainer/library-scripts/README.md: -------------------------------------------------------------------------------- 1 | # Warning: Folder contents may be replaced 2 | 3 | The contents of this folder will be automatically replaced with a file of the same name in the [vscode-dev-containers](https://github.com/microsoft/vscode-dev-containers) repository's [script-library folder](https://github.com/microsoft/vscode-dev-containers/tree/master/script-library) whenever the repository is packaged. 4 | 5 | To retain your edits, move the file to a different location. You may also delete the files if they are not needed. -------------------------------------------------------------------------------- /.devcontainer/Dockerfile: -------------------------------------------------------------------------------- 1 | ARG VARIANT=3 2 | FROM mcr.microsoft.com/vscode/devcontainers/python:${VARIANT} 3 | 4 | # [Optional] If your pip requirements rarely change, uncomment this section to add them to the image. 5 | # COPY requirements.txt /tmp/pip-tmp/ 6 | # RUN pip3 --disable-pip-version-check --no-cache-dir install -r /tmp/pip-tmp/requirements.txt \ 7 | # && rm -rf /tmp/pip-tmp 8 | 9 | # [Optional] Uncomment this section to install additional OS packages. 10 | # RUN apt-get update \ 11 | # && export DEBIAN_FRONTEND=noninteractive \ 12 | # && apt-get -y install --no-install-recommends 13 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. asr-evaluation documentation master file, created by 2 | sphinx-quickstart on Sat Jan 7 19:32:10 2017. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | asr-evaluation documentation 7 | ========================================== 8 | 9 | .. toctree:: 10 | :maxdepth: 4 11 | 12 | .. automodule:: asr_evaluation.asr_evaluation 13 | :members: 14 | :undoc-members: 15 | :show-inheritance: 16 | 17 | 18 | Indices and tables 19 | ================== 20 | 21 | * :ref:`genindex` 22 | * :ref:`modindex` 23 | * :ref:`search` 24 | 25 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | setup( 4 | name='asr_evaluation', 5 | version='2.0.5', 6 | author='Ben Lambert', 7 | author_email='blambert@gmail.com', 8 | packages=['asr_evaluation'], 9 | license='LICENSE.txt', 10 | description='Evaluating ASR (automatic speech recognition) hypotheses, i.e. 
computing word error rate.', 11 | install_requires=['edit_distance', 'termcolor'], 12 | test_suite='test.test.TestASREvaluation', 13 | long_description=open('README.md').read(), 14 | long_description_content_type="text/markdown", 15 | entry_points={ 16 | 'console_scripts': [ 17 | 'wer = asr_evaluation.__main__:main' 18 | ] 19 | }, 20 | keywords=['word', 'error', 'rate', 'asr', 'speech', 'recognition'], 21 | classifiers=[ 22 | "Development Status :: 5 - Production/Stable", 23 | "Environment :: Console", 24 | "Programming Language :: Python", 25 | "Topic :: Scientific/Engineering :: Artificial Intelligence", 26 | "Topic :: Text Processing", 27 | "Topic :: Utilities", 28 | "License :: OSI Approved :: Apache Software License" 29 | ] 30 | ) 31 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - '2.7' 4 | - '3.4' 5 | - '3.5' 6 | - '3.6' 7 | - '3.7' 8 | - '3.8' 9 | matrix: 10 | fast_finish: true 11 | install: 12 | - pip install coverage codecov flake8 13 | script: 14 | - python setup.py test 15 | - flake8 asr_evaluation 16 | - python -m coverage run --source asr_evaluation setup.py test 17 | after_success: 18 | - codecov 19 | deploy: 20 | provider: pypi 21 | user: benlambert 22 | password: 23 | secure: AorFwvGFVx1yLdG5xZvHE2Zl6z0PQ9extTB81t9mxD+ke4KM5PZbMKr3mwejK5jYvsyfKz7kIaGUhHopZtk4oN+XoQgDVrW0QEu/JVSOmXjNRalmG90ujehoSU6w22Wl/rljcJoh9q+NknJq229SeYT2SwP9KqSiXHyr/SpCKORoRcXd26lUAXu1rp2wpcYFg0oCNo7NG5bnc+T+d7qaqbjD1mpHdwmlC/SMuN3tzHek+sG6k537NukhgDynRHHYdAhumMYbSgKWicZUHxk7dMK+5d6iR7KtQxE6gVA6OZt7HbF5JLVzfRlWFwow+tk608Qfyp275+j6t6fGnVnANjpA3Tmdd5xGJSY+riYTImE8znzEIkH4CdhlRBNmu/w3pRXrSBzGXaPQCc6l0i+j+1NOihXAL5doklMF92PFUuUProoKFPfm+YSAYwYkNMdIgUeCTpq9cnvV5pXykeaPZEovWVtMvPsTIRArLkzxHVDeCjfsKefkXxtaeIpUofyFnYuMZaZLuTpKSRZ8AhnjEu+bANPNWlB2vl+aw3rhzYxoqrOw6rh+AfyAlRBkTK2gp8UmOeFrGCBVXrjnPtMUSxa+Gxl+GTx0imeFEb1Kw7fCRtEiJ2ag1dVVK3UnrZiPfa/IzMRxliwXEyYwmDcK1DM1mIUka6/1KoxRP2oppl8= 24 | on: 25 | tags: true 26 | python: '3.7' 27 | -------------------------------------------------------------------------------- /.devcontainer/devcontainer.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "Python 3", 3 | "build": { 4 | "dockerfile": "Dockerfile", 5 | "context": "..", 6 | "args": { "VARIANT": "3.8" } 7 | }, 8 | 9 | // Set *default* container specific settings.json values on container create. 10 | "settings": { 11 | "terminal.integrated.shell.linux": "/bin/bash", 12 | "python.pythonPath": "/usr/local/bin/python", 13 | "python.linting.enabled": true, 14 | "python.linting.pylintEnabled": true, 15 | "python.formatting.autopep8Path": "/usr/local/py-utils/bin/autopep8", 16 | "python.formatting.blackPath": "/usr/local/py-utils/bin/black", 17 | "python.formatting.yapfPath": "/usr/local/py-utils/bin/yapf", 18 | "python.linting.banditPath": "/usr/local/py-utils/bin/bandit", 19 | "python.linting.flake8Path": "/usr/local/py-utils/bin/flake8", 20 | "python.linting.mypyPath": "/usr/local/py-utils/bin/mypy", 21 | "python.linting.pycodestylePath": "/usr/local/py-utils/bin/pycodestyle", 22 | "python.linting.pydocstylePath": "/usr/local/py-utils/bin/pydocstyle", 23 | "python.linting.pylintPath": "/usr/local/py-utils/bin/pylint" 24 | }, 25 | 26 | // Add the IDs of extensions you want installed when the container is created. 
27 | "extensions": [ 28 | "ms-python.python" 29 | ] 30 | 31 | // Use 'forwardPorts' to make a list of ports inside the container available locally. 32 | // "forwardPorts": [], 33 | 34 | // Use 'postCreateCommand' to run commands after the container is created. 35 | // "postCreateCommand": "pip3 install --user -r requirements.txt", 36 | 37 | // Uncomment to connect as a non-root user. See https://aka.ms/vscode-remote/containers/non-root. 38 | // "remoteUser": "vscode" 39 | } 40 | -------------------------------------------------------------------------------- /test/test.py: -------------------------------------------------------------------------------- 1 | # Copyright 2013-2018 Ben Lambert 2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | """ 16 | Unit tests for asr_evaluation. 17 | """ 18 | from __future__ import division 19 | 20 | import sys 21 | import unittest 22 | 23 | from asr_evaluation import __main__ 24 | 25 | # Note these tests aren't checking for correctness. They are simply 26 | # exercising all the command line options to make sure we don't get errors 27 | # simply by running them. 28 | 29 | class TestASREvaluation(unittest.TestCase): 30 | """...""" 31 | 32 | def testing(self): 33 | """...""" 34 | self.assertTrue(True) 35 | 36 | def test_cli1(self): 37 | sys.argv = ['evaluate.py', 'requirements.txt', 'setup.py', '-c', '-m', '0', '-i'] 38 | __main__.main() 39 | 40 | def test_cli2(self): 41 | sys.argv = ['evaluate.py', 'requirements.txtssssss', 'setup.py', '-c', '-m', '0', '-i'] 42 | with self.assertRaises(SystemExit): 43 | __main__.main() 44 | 45 | def test_cli3(self): 46 | sys.argv = ['evaluate.py', 'requirements.txt', 'setup.py', '-c', '-m', '0'] 47 | __main__.main() 48 | 49 | def test_cli4(self): 50 | sys.argv = ['evaluate.py', 'requirements.txt', 'setup.py'] 51 | __main__.main() 52 | 53 | def test_cli5(self): 54 | sys.argv = ['evaluate.py', 'setup.py', 'setup.py'] 55 | __main__.main() 56 | 57 | def test_cli6(self): 58 | sys.argv = ['evaluate.py', 'setup.py', 'setup.py', '-a'] 59 | __main__.main() 60 | 61 | def test_cli7(self): 62 | sys.argv = ['evaluate.py', 'setup.py', 'setup.py', '-e'] 63 | __main__.main() 64 | 65 | def test_cli8(self): 66 | sys.argv = ['evaluate.py', 'requirements.txt', 'requirements.txt', '-id'] 67 | __main__.main() 68 | -------------------------------------------------------------------------------- /.devcontainer/base.Dockerfile: -------------------------------------------------------------------------------- 1 | # Update the VARIANT arg in devcontainer.json to pick a Python version: 3, 3.8, 3.7, 3.6 2 | ARG VARIANT=3 3 | FROM python:${VARIANT} 4 | 5 | # Options for common setup script 6 | ARG INSTALL_ZSH="true" 7 | ARG UPGRADE_PACKAGES="true" 8 | ARG USERNAME=vscode 9 | ARG USER_UID=1000 10 | ARG USER_GID=$USER_UID 11 | 12 | # Install needed packages and setup non-root user. Use a separate RUN statement to add your own dependencies. 
13 | COPY .devcontainer/library-scripts/*.sh /tmp/library-scripts/ 14 | RUN apt-get update && export DEBIAN_FRONTEND=noninteractive \ 15 | # Remove imagemagick due to https://security-tracker.debian.org/tracker/CVE-2019-10131 16 | && apt-get purge -y imagemagick imagemagick-6-common \ 17 | # Install common packages, non-root user 18 | && /bin/bash /tmp/library-scripts/common-debian.sh "${INSTALL_ZSH}" "${USERNAME}" "${USER_UID}" "${USER_GID}" "${UPGRADE_PACKAGES}" \ 19 | # Clean up 20 | && apt-get autoremove -y && apt-get clean -y && rm -rf /var/lib/apt/lists/* /tmp/library-scripts 21 | 22 | # Setup default python tools in a venv via pipx to avoid conflicts 23 | ARG DEFAULT_UTILS="\ 24 | pylint \ 25 | flake8 \ 26 | autopep8 \ 27 | black \ 28 | yapf \ 29 | mypy \ 30 | pydocstyle \ 31 | pycodestyle \ 32 | bandit \ 33 | virtualenv" 34 | ENV PIPX_HOME=/usr/local/py-utils 35 | ENV PIPX_BIN_DIR=${PIPX_HOME}/bin 36 | ENV PATH=${PATH}:${PIPX_BIN_DIR} 37 | RUN mkdir -p ${PIPX_BIN_DIR} \ 38 | && export PYTHONUSERBASE=/tmp/pip-tmp \ 39 | && pip3 install --disable-pip-version-check --no-warn-script-location --no-cache-dir --user pipx \ 40 | && /tmp/pip-tmp/bin/pipx install --pip-args=--no-cache-dir pipx \ 41 | && echo "${DEFAULT_UTILS}" | xargs -n 1 /tmp/pip-tmp/bin/pipx install --system-site-packages --pip-args=--no-cache-dir --pip-args=--force-reinstall \ 42 | && chown -R ${USER_UID}:${USER_GID} ${PIPX_HOME} \ 43 | && rm -rf /tmp/pip-tmp 44 | 45 | # [Optional] If your pip requirements rarely change, uncomment this section to add them to the image. 46 | # COPY requirements.txt /tmp/pip-tmp/ 47 | # RUN pip3 --disable-pip-version-check --no-cache-dir install -r /tmp/pip-tmp/requirements.txt \ 48 | # && rm -rf /tmp/pip-tmp 49 | 50 | # [Optional] Uncomment this section to install additional OS packages. 51 | # RUN apt-get update \ 52 | # && export DEBIAN_FRONTEND=noninteractive \ 53 | # && apt-get -y install --no-install-recommends 54 | 55 | -------------------------------------------------------------------------------- /asr_evaluation/__main__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Copyright 2017-2018 Ben Lambert 4 | 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | """ 18 | Contains the main method for the CLI. 19 | """ 20 | 21 | import argparse 22 | 23 | # For some reason Python 2 and Python 3 disagree about how to import this. 
24 | try: 25 | from asr_evaluation.asr_evaluation import main as other_main 26 | except Exception: 27 | from asr_evaluation import main as other_main 28 | 29 | def get_parser(): 30 | """Parse the CLI args.""" 31 | parser = argparse.ArgumentParser(description='Evaluate an ASR transcript against a reference transcript.') 32 | parser.add_argument('ref', type=argparse.FileType('r'), help='Reference transcript filename') 33 | parser.add_argument('hyp', type=argparse.FileType('r'), help='ASR hypothesis filename') 34 | print_args = parser.add_mutually_exclusive_group() 35 | print_args.add_argument('-i', '--print-instances', action='store_true', 36 | help='Print all individual sentences and their errors.') 37 | print_args.add_argument('-r', '--print-errors', action='store_true', 38 | help='Print all individual sentences that contain errors.') 39 | parser.add_argument('--head-ids', action='store_true', 40 | help='Hypothesis and reference files have ids in the first token? (Kaldi format)') 41 | parser.add_argument('-id', '--tail-ids', '--has-ids', action='store_true', 42 | help='Hypothesis and reference files have ids in the last token? (Sphinx format)') 43 | parser.add_argument('-c', '--confusions', action='store_true', help='Print tables of which words were confused.') 44 | parser.add_argument('-p', '--print-wer-vs-length', action='store_true', 45 | help='Print table of average WER grouped by reference sentence length.') 46 | parser.add_argument('-m', '--min-word-count', type=int, default=1, metavar='count', 47 | help='Minimum word count to show a word in confusions (default 1).') 48 | parser.add_argument('-a', '--case-insensitive', action='store_true', 49 | help='Down-case the text before running the evaluation.') 50 | parser.add_argument('-e', '--remove-empty-refs', action='store_true', 51 | help='Skip over any examples where the reference is empty.') 52 | 53 | return parser 54 | 55 | def main(): 56 | """Run the program.""" 57 | parser = get_parser() 58 | args = parser.parse_args() 59 | other_main(args) 60 | 61 | if __name__ == "__main__": 62 | main() 63 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | asr_evaluation 2 | ============== 3 | [![Build Status](https://travis-ci.org/belambert/asr-evaluation.svg?branch=main)](https://travis-ci.org/belambert/asr-evaluation) 4 | [![PyPI version](https://badge.fury.io/py/asr_evaluation.svg)](https://badge.fury.io/py/asr_evaluation) 5 | [![codecov](https://codecov.io/gh/belambert/asr-evaluation/branch/main/graph/badge.svg)](https://codecov.io/gh/belambert/asr-evaluation) 6 | 7 | Python module for evaluating ASR hypotheses (i.e. word error rate and word 8 | recognition rate). 9 | 10 | This module depends on the [edit-distance](https://github.com/belambert/edit-distance) 11 | project for computing edit distances between arbitrary sequences. 12 | 13 | The formatting of this program's output is loosely modeled on the 14 | align.c program commonly used within the Sphinx ASR community. 15 | Evaluation may run a bit faster if neither instances nor confusions are printed. 16 | 17 | Please let me know if you have any comments, questions, or problems.
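As a quick, concrete illustration, here is a minimal word error rate (WER) calculation in plain Python. The example sentences are invented, and this sketch is not the package's own implementation (asr_evaluation delegates the alignment to edit_distance.SequenceMatcher); it simply spells out the metric described in the Output section below.

```
# Illustrative sketch only: WER = (substitutions + deletions + insertions) / reference length,
# computed with a plain Levenshtein alignment over word lists. Assumes a non-empty reference.

def word_error_rate(ref, hyp):
    ref, hyp = ref.split(), hyp.split()
    # dp[i][j] = edit distance between the first i reference words and the first j hypothesis words
    dp = [[0] * (len(hyp) + 1) for _ in range(len(ref) + 1)]
    for i in range(len(ref) + 1):
        dp[i][0] = i          # deleting i reference words
    for j in range(len(hyp) + 1):
        dp[0][j] = j          # inserting j hypothesis words
    for i in range(1, len(ref) + 1):
        for j in range(1, len(hyp) + 1):
            cost = 0 if ref[i - 1] == hyp[j - 1] else 1
            dp[i][j] = min(dp[i - 1][j] + 1,          # deletion
                           dp[i][j - 1] + 1,          # insertion
                           dp[i - 1][j - 1] + cost)   # match or substitution
    return dp[len(ref)][len(hyp)] / len(ref)

# One substitution ("white" -> "light") in a four-word reference gives WER = 1/4 = 0.25.
print(word_error_rate("the white cat sat", "the light cat sat"))
```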
18 | 19 | Output 20 | ------ 21 | The program outputs three standard measurements: 22 | - [Word error rate (WER)](https://en.wikipedia.org/wiki/Word_error_rate) 23 | - Word recognition rate (the number of _matched_ words in the alignment divided by the number of words in the reference). 24 | - Sentence error rate (SER) (the number of incorrect sentences divided by the total number of sentences). 25 | 26 | 27 | Installing & uninstalling 28 | ------------------------- 29 | The easiest way to install is using pip: 30 | 31 | pip install asr-evaluation 32 | 33 | Alternatively, you can clone this git repo and install using setuptools: 34 | 35 | git clone git@github.com:belambert/asr-evaluation.git 36 | cd asr-evaluation 37 | python setup.py install 38 | 39 | To uninstall with pip: 40 | 41 | pip uninstall asr-evaluation 42 | 43 | 44 | Command line usage 45 | ------------------ 46 | For command line usage, see: 47 | ``` 48 | wer --help 49 | ``` 50 | 51 | It should display something like this: 52 | 53 | ``` 54 | usage: wer [-h] [-i | -r] [--head-ids] [-id] [-c] [-p] [-m count] [-a] [-e] 55 | ref hyp 56 | 57 | Evaluate an ASR transcript against a reference transcript. 58 | 59 | positional arguments: 60 | ref Reference transcript filename 61 | hyp ASR hypothesis filename 62 | 63 | optional arguments: 64 | -h, --help show this help message and exit 65 | -i, --print-instances 66 | Print all individual sentences and their errors. 67 | -r, --print-errors Print all individual sentences that contain errors. 68 | --head-ids Hypothesis and reference files have ids in the first 69 | token? (Kaldi format) 70 | -id, --tail-ids, --has-ids 71 | Hypothesis and reference files have ids in the last 72 | token? (Sphinx format) 73 | -c, --confusions Print tables of which words were confused. 74 | -p, --print-wer-vs-length 75 | Print table of average WER grouped by reference 76 | sentence length. 77 | -m count, --min-word-count count 78 | Minimum word count to show a word in confusions. 79 | -a, --case-insensitive 80 | Down-case the text before running the evaluation. 81 | -e, --remove-empty-refs 82 | Skip over any examples where the reference is empty. 83 | ``` 84 | 85 | Contributing and code of conduct 86 | -------------------------------- 87 | For contributions, it's best to use GitHub issues and pull requests. Proper 88 | testing and documentation are suggested. 89 | 90 | Contributors are expected to behave reasonably, as specified by 91 | the [Contributor Covenant](http://contributor-covenant.org/version/1/4/) 92 | -------------------------------------------------------------------------------- /.devcontainer/library-scripts/common-debian.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | #------------------------------------------------------------------------------------------------------------- 3 | # Copyright (c) Microsoft Corporation. All rights reserved. 4 | # Licensed under the MIT License. See https://go.microsoft.com/fwlink/?linkid=2090316 for license information. 5 | #------------------------------------------------------------------------------------------------------------- 6 | 7 | # Syntax: ./common-debian.sh [install zsh flag] [username] [user UID] [user GID] [upgrade packages flag] 8 | 9 | INSTALL_ZSH=${1:-"true"} 10 | USERNAME=${2:-"vscode"} 11 | USER_UID=${3:-1000} 12 | USER_GID=${4:-1000} 13 | UPGRADE_PACKAGES=${5:-"true"} 14 | 15 | set -e 16 | 17 | if [ "$(id -u)" -ne 0 ]; then 18 | echo -e 'Script must be run as root.
Use sudo, su, or add "USER root" to your Dockerfile before running this script.' 19 | exit 1 20 | fi 21 | 22 | # Treat a user name of "none" as root 23 | if [ "${USERNAME}" = "none" ] || [ "${USERNAME}" = "root" ]; then 24 | USERNAME=root 25 | USER_UID=0 26 | USER_GID=0 27 | fi 28 | 29 | # Load markers to see which steps have already run 30 | MARKER_FILE="/usr/local/etc/vscode-dev-containers/common" 31 | if [ -f "${MARKER_FILE}" ]; then 32 | echo "Marker file found:" 33 | cat "${MARKER_FILE}" 34 | source "${MARKER_FILE}" 35 | fi 36 | 37 | # Ensure apt is in non-interactive to avoid prompts 38 | export DEBIAN_FRONTEND=noninteractive 39 | 40 | # Function to call apt-get if needed 41 | apt-get-update-if-needed() 42 | { 43 | if [ ! -d "/var/lib/apt/lists" ] || [ "$(ls /var/lib/apt/lists/ | wc -l)" = "0" ]; then 44 | echo "Running apt-get update..." 45 | apt-get update 46 | else 47 | echo "Skipping apt-get update." 48 | fi 49 | } 50 | 51 | # Run install apt-utils to avoid debconf warning then verify presence of other common developer tools and dependencies 52 | if [ "${PACKAGES_ALREADY_INSTALLED}" != "true" ]; then 53 | apt-get-update-if-needed 54 | 55 | PACKAGE_LIST="apt-utils \ 56 | git \ 57 | openssh-client \ 58 | less \ 59 | iproute2 \ 60 | procps \ 61 | curl \ 62 | wget \ 63 | unzip \ 64 | nano \ 65 | jq \ 66 | lsb-release \ 67 | ca-certificates \ 68 | apt-transport-https \ 69 | dialog \ 70 | gnupg2 \ 71 | libc6 \ 72 | libgcc1 \ 73 | libgssapi-krb5-2 \ 74 | libicu[0-9][0-9] \ 75 | liblttng-ust0 \ 76 | libstdc++6 \ 77 | zlib1g \ 78 | locales \ 79 | sudo" 80 | 81 | # Install libssl1.1 if available 82 | if [[ ! -z $(apt-cache --names-only search ^libssl1.1$) ]]; then 83 | PACKAGE_LIST="${PACKAGE_LIST} libssl1.1" 84 | fi 85 | 86 | # Install appropriate version of libssl1.0.x if available 87 | LIBSSL=$(dpkg-query -f '${db:Status-Abbrev}\t${binary:Package}\n' -W 'libssl1\.0\.?' 2>&1 || echo '') 88 | if [ "$(echo "$LIBSSL" | grep -o 'libssl1\.0\.[0-9]:' | uniq | sort | wc -l)" -eq 0 ]; then 89 | if [[ ! -z $(apt-cache --names-only search ^libssl1.0.2$) ]]; then 90 | # Debian 9 91 | PACKAGE_LIST="${PACKAGE_LIST} libssl1.0.2" 92 | elif [[ ! -z $(apt-cache --names-only search ^libssl1.0.0$) ]]; then 93 | # Ubuntu 18.04, 16.04, earlier 94 | PACKAGE_LIST="${PACKAGE_LIST} libssl1.0.0" 95 | fi 96 | fi 97 | 98 | echo "Packages to verify are installed: ${PACKAGE_LIST}" 99 | apt-get -y install --no-install-recommends ${PACKAGE_LIST} 2> >( grep -v 'debconf: delaying package configuration, since apt-utils is not installed' >&2 ) 100 | 101 | PACKAGES_ALREADY_INSTALLED="true" 102 | fi 103 | 104 | # Get to latest versions of all packages 105 | if [ "${UPGRADE_PACKAGES}" = "true" ]; then 106 | apt-get-update-if-needed 107 | apt-get -y upgrade --no-install-recommends 108 | apt-get autoremove -y 109 | fi 110 | 111 | # Ensure at least the en_US.UTF-8 UTF-8 locale is available. 112 | # Common need for both applications and things like the agnoster ZSH theme. 113 | if [ "${LOCALE_ALREADY_SET}" != "true" ]; then 114 | echo "en_US.UTF-8 UTF-8" >> /etc/locale.gen 115 | locale-gen 116 | LOCALE_ALREADY_SET="true" 117 | fi 118 | 119 | # Create or update a non-root user to match UID/GID - see https://aka.ms/vscode-remote/containers/non-root-user. 
120 | if id -u $USERNAME > /dev/null 2>&1; then 121 | # User exists, update if needed 122 | if [ "$USER_GID" != "$(id -G $USERNAME)" ]; then 123 | groupmod --gid $USER_GID $USERNAME 124 | usermod --gid $USER_GID $USERNAME 125 | fi 126 | if [ "$USER_UID" != "$(id -u $USERNAME)" ]; then 127 | usermod --uid $USER_UID $USERNAME 128 | fi 129 | else 130 | # Create user 131 | groupadd --gid $USER_GID $USERNAME 132 | useradd -s /bin/bash --uid $USER_UID --gid $USER_GID -m $USERNAME 133 | fi 134 | 135 | # Add add sudo support for non-root user 136 | if [ "${EXISTING_NON_ROOT_USER}" != "${USERNAME}" ]; then 137 | echo $USERNAME ALL=\(root\) NOPASSWD:ALL > /etc/sudoers.d/$USERNAME 138 | chmod 0440 /etc/sudoers.d/$USERNAME 139 | EXISTING_NON_ROOT_USER="${USERNAME}" 140 | fi 141 | 142 | # Ensure ~/.local/bin is in the PATH for root and non-root users for bash. (zsh is later) 143 | if [ "${DOT_LOCAL_ALREADY_ADDED}" != "true" ]; then 144 | echo "export PATH=\$PATH:\$HOME/.local/bin" | tee -a /root/.bashrc >> /home/$USERNAME/.bashrc 145 | chown $USER_UID:$USER_GID /home/$USERNAME/.bashrc 146 | DOT_LOCAL_ALREADY_ADDED="true" 147 | fi 148 | 149 | # Optionally install and configure zsh 150 | if [ "${INSTALL_ZSH}" = "true" ] && [ ! -d "/root/.oh-my-zsh" ] && [ "${ZSH_ALREADY_INSTALLED}" != "true" ]; then 151 | apt-get-update-if-needed 152 | apt-get install -y zsh 153 | curl -fsSLo- https://raw.githubusercontent.com/robbyrussell/oh-my-zsh/master/tools/install.sh | bash 2>&1 154 | echo "export PATH=\$PATH:\$HOME/.local/bin" >> /root/.zshrc 155 | if [ "${USERNAME}" != "root" ]; then 156 | cp -fR /root/.oh-my-zsh /home/$USERNAME 157 | cp -f /root/.zshrc /home/$USERNAME 158 | sed -i -e "s/\/root\/.oh-my-zsh/\/home\/$USERNAME\/.oh-my-zsh/g" /home/$USERNAME/.zshrc 159 | chown -R $USER_UID:$USER_GID /home/$USERNAME/.oh-my-zsh /home/$USERNAME/.zshrc 160 | fi 161 | ZSH_ALREADY_INSTALLED="true" 162 | fi 163 | 164 | # Write marker file 165 | mkdir -p "$(dirname "${MARKER_FILE}")" 166 | echo -e "\ 167 | PACKAGES_ALREADY_INSTALLED=${PACKAGES_ALREADY_INSTALLED}\n\ 168 | LOCALE_ALREADY_SET=${LOCALE_ALREADY_SET}\n\ 169 | EXISTING_NON_ROOT_USER=${EXISTING_NON_ROOT_USER}\n\ 170 | DOT_LOCAL_ALREADY_ADDED=${DOT_LOCAL_ALREADY_ADDED}\n\ 171 | ZSH_ALREADY_INSTALLED=${ZSH_ALREADY_INSTALLED}" > "${MARKER_FILE}" 172 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | PAPER = 8 | BUILDDIR = _build 9 | 10 | # User-friendly check for sphinx-build 11 | ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1) 12 | $(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don\'t have Sphinx installed, grab it from http://sphinx-doc.org/) 13 | endif 14 | 15 | # Internal variables. 16 | PAPEROPT_a4 = -D latex_paper_size=a4 17 | PAPEROPT_letter = -D latex_paper_size=letter 18 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 19 | # the i18n builder cannot share the environment and doctrees with the others 20 | I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 
21 | 22 | .PHONY: help 23 | help: 24 | @echo "Please use \`make ' where is one of" 25 | @echo " html to make standalone HTML files" 26 | @echo " dirhtml to make HTML files named index.html in directories" 27 | @echo " singlehtml to make a single large HTML file" 28 | @echo " pickle to make pickle files" 29 | @echo " json to make JSON files" 30 | @echo " htmlhelp to make HTML files and a HTML help project" 31 | @echo " qthelp to make HTML files and a qthelp project" 32 | @echo " applehelp to make an Apple Help Book" 33 | @echo " devhelp to make HTML files and a Devhelp project" 34 | @echo " epub to make an epub" 35 | @echo " epub3 to make an epub3" 36 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 37 | @echo " latexpdf to make LaTeX files and run them through pdflatex" 38 | @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" 39 | @echo " text to make text files" 40 | @echo " man to make manual pages" 41 | @echo " texinfo to make Texinfo files" 42 | @echo " info to make Texinfo files and run them through makeinfo" 43 | @echo " gettext to make PO message catalogs" 44 | @echo " changes to make an overview of all changed/added/deprecated items" 45 | @echo " xml to make Docutils-native XML files" 46 | @echo " pseudoxml to make pseudoxml-XML files for display purposes" 47 | @echo " linkcheck to check all external links for integrity" 48 | @echo " doctest to run all doctests embedded in the documentation (if enabled)" 49 | @echo " coverage to run coverage check of the documentation (if enabled)" 50 | @echo " dummy to check syntax errors of document sources" 51 | 52 | .PHONY: clean 53 | clean: 54 | rm -rf $(BUILDDIR)/* 55 | 56 | .PHONY: html 57 | html: 58 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 59 | @echo 60 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 61 | 62 | .PHONY: dirhtml 63 | dirhtml: 64 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml 65 | @echo 66 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." 67 | 68 | .PHONY: singlehtml 69 | singlehtml: 70 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml 71 | @echo 72 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." 73 | 74 | .PHONY: pickle 75 | pickle: 76 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle 77 | @echo 78 | @echo "Build finished; now you can process the pickle files." 79 | 80 | .PHONY: json 81 | json: 82 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json 83 | @echo 84 | @echo "Build finished; now you can process the JSON files." 85 | 86 | .PHONY: htmlhelp 87 | htmlhelp: 88 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp 89 | @echo 90 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 91 | ".hhp project file in $(BUILDDIR)/htmlhelp." 92 | 93 | .PHONY: qthelp 94 | qthelp: 95 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp 96 | @echo 97 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 98 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:" 99 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/asr-evaluation.qhcp" 100 | @echo "To view the help file:" 101 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/asr-evaluation.qhc" 102 | 103 | .PHONY: applehelp 104 | applehelp: 105 | $(SPHINXBUILD) -b applehelp $(ALLSPHINXOPTS) $(BUILDDIR)/applehelp 106 | @echo 107 | @echo "Build finished. The help book is in $(BUILDDIR)/applehelp." 108 | @echo "N.B. 
You won't be able to view it unless you put it in" \ 109 | "~/Library/Documentation/Help or install it in your application" \ 110 | "bundle." 111 | 112 | .PHONY: devhelp 113 | devhelp: 114 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp 115 | @echo 116 | @echo "Build finished." 117 | @echo "To view the help file:" 118 | @echo "# mkdir -p $$HOME/.local/share/devhelp/asr-evaluation" 119 | @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/asr-evaluation" 120 | @echo "# devhelp" 121 | 122 | .PHONY: epub 123 | epub: 124 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub 125 | @echo 126 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub." 127 | 128 | .PHONY: epub3 129 | epub3: 130 | $(SPHINXBUILD) -b epub3 $(ALLSPHINXOPTS) $(BUILDDIR)/epub3 131 | @echo 132 | @echo "Build finished. The epub3 file is in $(BUILDDIR)/epub3." 133 | 134 | .PHONY: latex 135 | latex: 136 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 137 | @echo 138 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 139 | @echo "Run \`make' in that directory to run these through (pdf)latex" \ 140 | "(use \`make latexpdf' here to do that automatically)." 141 | 142 | .PHONY: latexpdf 143 | latexpdf: 144 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 145 | @echo "Running LaTeX files through pdflatex..." 146 | $(MAKE) -C $(BUILDDIR)/latex all-pdf 147 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 148 | 149 | .PHONY: latexpdfja 150 | latexpdfja: 151 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 152 | @echo "Running LaTeX files through platex and dvipdfmx..." 153 | $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja 154 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 155 | 156 | .PHONY: text 157 | text: 158 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text 159 | @echo 160 | @echo "Build finished. The text files are in $(BUILDDIR)/text." 161 | 162 | .PHONY: man 163 | man: 164 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man 165 | @echo 166 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man." 167 | 168 | .PHONY: texinfo 169 | texinfo: 170 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 171 | @echo 172 | @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." 173 | @echo "Run \`make' in that directory to run these through makeinfo" \ 174 | "(use \`make info' here to do that automatically)." 175 | 176 | .PHONY: info 177 | info: 178 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 179 | @echo "Running Texinfo files through makeinfo..." 180 | make -C $(BUILDDIR)/texinfo info 181 | @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." 182 | 183 | .PHONY: gettext 184 | gettext: 185 | $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale 186 | @echo 187 | @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." 188 | 189 | .PHONY: changes 190 | changes: 191 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 192 | @echo 193 | @echo "The overview file is in $(BUILDDIR)/changes." 194 | 195 | .PHONY: linkcheck 196 | linkcheck: 197 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 198 | @echo 199 | @echo "Link check complete; look for any errors in the above output " \ 200 | "or in $(BUILDDIR)/linkcheck/output.txt." 
201 | 202 | .PHONY: doctest 203 | doctest: 204 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 205 | @echo "Testing of doctests in the sources finished, look at the " \ 206 | "results in $(BUILDDIR)/doctest/output.txt." 207 | 208 | .PHONY: coverage 209 | coverage: 210 | $(SPHINXBUILD) -b coverage $(ALLSPHINXOPTS) $(BUILDDIR)/coverage 211 | @echo "Testing of coverage in the sources finished, look at the " \ 212 | "results in $(BUILDDIR)/coverage/python.txt." 213 | 214 | .PHONY: xml 215 | xml: 216 | $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml 217 | @echo 218 | @echo "Build finished. The XML files are in $(BUILDDIR)/xml." 219 | 220 | .PHONY: pseudoxml 221 | pseudoxml: 222 | $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml 223 | @echo 224 | @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." 225 | 226 | .PHONY: dummy 227 | dummy: 228 | $(SPHINXBUILD) -b dummy $(ALLSPHINXOPTS) $(BUILDDIR)/dummy 229 | @echo 230 | @echo "Build finished. Dummy builder generates no files." 231 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # asr-evaluation documentation build configuration file, created by 4 | # sphinx-quickstart on Sat Jan 7 19:32:10 2017. 5 | # 6 | # This file is execfile()d with the current directory set to its 7 | # containing dir. 8 | # 9 | # Note that not all possible configuration values are present in this 10 | # autogenerated file. 11 | # 12 | # All configuration values have a default; values that are commented out 13 | # serve to show the default. 14 | 15 | import sys 16 | import os 17 | 18 | # If extensions (or modules to document with autodoc) are in another directory, 19 | # add these directories to sys.path here. If the directory is relative to the 20 | # documentation root, use os.path.abspath to make it absolute, like shown here. 21 | sys.path.insert(0, os.path.abspath('../src')) 22 | 23 | # -- General configuration ------------------------------------------------ 24 | 25 | # If your documentation needs a minimal Sphinx version, state it here. 26 | #needs_sphinx = '1.0' 27 | 28 | # Add any Sphinx extension module names here, as strings. They can be 29 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 30 | # ones. 31 | extensions = [ 32 | 'sphinx.ext.autodoc', 33 | 'sphinx.ext.intersphinx', 34 | 'sphinx.ext.todo', 35 | 'sphinx.ext.coverage', 36 | ] 37 | 38 | # Add any paths that contain templates here, relative to this directory. 39 | templates_path = ['_templates'] 40 | 41 | # The suffix(es) of source filenames. 42 | # You can specify multiple suffix as a list of string: 43 | # source_suffix = ['.rst', '.md'] 44 | source_suffix = '.rst' 45 | 46 | # The encoding of source files. 47 | #source_encoding = 'utf-8-sig' 48 | 49 | # The master toctree document. 50 | master_doc = 'index' 51 | 52 | # General information about the project. 53 | project = u'asr-evaluation' 54 | copyright = u'2017, Ben Lambert' 55 | author = u'Ben Lambert' 56 | 57 | # The version info for the project you're documenting, acts as replacement for 58 | # |version| and |release|, also used in various other places throughout the 59 | # built documents. 60 | # 61 | # The short X.Y version. 62 | version = u'0.2.3' 63 | # The full version, including alpha/beta/rc tags. 64 | release = u'0.2.3' 65 | 66 | # The language for content autogenerated by Sphinx. 
Refer to documentation 67 | # for a list of supported languages. 68 | # 69 | # This is also used if you do content translation via gettext catalogs. 70 | # Usually you set "language" from the command line for these cases. 71 | language = None 72 | 73 | # There are two options for replacing |today|: either, you set today to some 74 | # non-false value, then it is used: 75 | #today = '' 76 | # Else, today_fmt is used as the format for a strftime call. 77 | #today_fmt = '%B %d, %Y' 78 | 79 | # List of patterns, relative to source directory, that match files and 80 | # directories to ignore when looking for source files. 81 | # This patterns also effect to html_static_path and html_extra_path 82 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] 83 | 84 | # The reST default role (used for this markup: `text`) to use for all 85 | # documents. 86 | #default_role = None 87 | 88 | # If true, '()' will be appended to :func: etc. cross-reference text. 89 | #add_function_parentheses = True 90 | 91 | # If true, the current module name will be prepended to all description 92 | # unit titles (such as .. function::). 93 | #add_module_names = True 94 | 95 | # If true, sectionauthor and moduleauthor directives will be shown in the 96 | # output. They are ignored by default. 97 | #show_authors = False 98 | 99 | # The name of the Pygments (syntax highlighting) style to use. 100 | pygments_style = 'sphinx' 101 | 102 | # A list of ignored prefixes for module index sorting. 103 | #modindex_common_prefix = [] 104 | 105 | # If true, keep warnings as "system message" paragraphs in the built documents. 106 | #keep_warnings = False 107 | 108 | # If true, `todo` and `todoList` produce output, else they produce nothing. 109 | todo_include_todos = True 110 | 111 | 112 | # -- Options for HTML output ---------------------------------------------- 113 | 114 | # The theme to use for HTML and HTML Help pages. See the documentation for 115 | # a list of builtin themes. 116 | html_theme = 'classic' 117 | 118 | # Theme options are theme-specific and customize the look and feel of a theme 119 | # further. For a list of options available for each theme, see the 120 | # documentation. 121 | #html_theme_options = {} 122 | 123 | # Add any paths that contain custom themes here, relative to this directory. 124 | #html_theme_path = [] 125 | 126 | # The name for this set of Sphinx documents. 127 | # " v documentation" by default. 128 | #html_title = u'asr-evaluation v0.2.3' 129 | 130 | # A shorter title for the navigation bar. Default is the same as html_title. 131 | #html_short_title = None 132 | 133 | # The name of an image file (relative to this directory) to place at the top 134 | # of the sidebar. 135 | #html_logo = None 136 | 137 | # The name of an image file (relative to this directory) to use as a favicon of 138 | # the docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 139 | # pixels large. 140 | #html_favicon = None 141 | 142 | # Add any paths that contain custom static files (such as style sheets) here, 143 | # relative to this directory. They are copied after the builtin static files, 144 | # so a file named "default.css" will overwrite the builtin "default.css". 145 | html_static_path = ['_static'] 146 | 147 | # Add any extra paths that contain custom files (such as robots.txt or 148 | # .htaccess) here, relative to this directory. These files are copied 149 | # directly to the root of the documentation. 
150 | #html_extra_path = [] 151 | 152 | # If not None, a 'Last updated on:' timestamp is inserted at every page 153 | # bottom, using the given strftime format. 154 | # The empty string is equivalent to '%b %d, %Y'. 155 | #html_last_updated_fmt = None 156 | 157 | # If true, SmartyPants will be used to convert quotes and dashes to 158 | # typographically correct entities. 159 | #html_use_smartypants = True 160 | 161 | # Custom sidebar templates, maps document names to template names. 162 | #html_sidebars = {} 163 | 164 | # Additional templates that should be rendered to pages, maps page names to 165 | # template names. 166 | #html_additional_pages = {} 167 | 168 | # If false, no module index is generated. 169 | #html_domain_indices = True 170 | 171 | # If false, no index is generated. 172 | #html_use_index = True 173 | 174 | # If true, the index is split into individual pages for each letter. 175 | #html_split_index = False 176 | 177 | # If true, links to the reST sources are added to the pages. 178 | #html_show_sourcelink = True 179 | 180 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. 181 | #html_show_sphinx = True 182 | 183 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. 184 | #html_show_copyright = True 185 | 186 | # If true, an OpenSearch description file will be output, and all pages will 187 | # contain a tag referring to it. The value of this option must be the 188 | # base URL from which the finished HTML is served. 189 | #html_use_opensearch = '' 190 | 191 | # This is the file name suffix for HTML files (e.g. ".xhtml"). 192 | #html_file_suffix = None 193 | 194 | # Language to be used for generating the HTML full-text search index. 195 | # Sphinx supports the following languages: 196 | # 'da', 'de', 'en', 'es', 'fi', 'fr', 'hu', 'it', 'ja' 197 | # 'nl', 'no', 'pt', 'ro', 'ru', 'sv', 'tr', 'zh' 198 | #html_search_language = 'en' 199 | 200 | # A dictionary with options for the search language support, empty by default. 201 | # 'ja' uses this config value. 202 | # 'zh' user can custom change `jieba` dictionary path. 203 | #html_search_options = {'type': 'default'} 204 | 205 | # The name of a javascript file (relative to the configuration directory) that 206 | # implements a search results scorer. If empty, the default will be used. 207 | #html_search_scorer = 'scorer.js' 208 | 209 | # Output file base name for HTML help builder. 210 | htmlhelp_basename = 'asr-evaluationdoc' 211 | 212 | # -- Options for LaTeX output --------------------------------------------- 213 | 214 | latex_elements = { 215 | # The paper size ('letterpaper' or 'a4paper'). 216 | #'papersize': 'letterpaper', 217 | 218 | # The font size ('10pt', '11pt' or '12pt'). 219 | #'pointsize': '10pt', 220 | 221 | # Additional stuff for the LaTeX preamble. 222 | #'preamble': '', 223 | 224 | # Latex figure (float) alignment 225 | #'figure_align': 'htbp', 226 | } 227 | 228 | # Grouping the document tree into LaTeX files. List of tuples 229 | # (source start file, target name, title, 230 | # author, documentclass [howto, manual, or own class]). 231 | latex_documents = [ 232 | (master_doc, 'asr-evaluation.tex', u'asr-evaluation Documentation', 233 | u'Ben Lambert', 'manual'), 234 | ] 235 | 236 | # The name of an image file (relative to this directory) to place at the top of 237 | # the title page. 238 | #latex_logo = None 239 | 240 | # For "manual" documents, if this is true, then toplevel headings are parts, 241 | # not chapters. 
242 | #latex_use_parts = False 243 | 244 | # If true, show page references after internal links. 245 | #latex_show_pagerefs = False 246 | 247 | # If true, show URL addresses after external links. 248 | #latex_show_urls = False 249 | 250 | # Documents to append as an appendix to all manuals. 251 | #latex_appendices = [] 252 | 253 | # If false, no module index is generated. 254 | #latex_domain_indices = True 255 | 256 | 257 | # -- Options for manual page output --------------------------------------- 258 | 259 | # One entry per manual page. List of tuples 260 | # (source start file, name, description, authors, manual section). 261 | man_pages = [ 262 | (master_doc, 'asr-evaluation', u'asr-evaluation Documentation', 263 | [author], 1) 264 | ] 265 | 266 | # If true, show URL addresses after external links. 267 | #man_show_urls = False 268 | 269 | 270 | # -- Options for Texinfo output ------------------------------------------- 271 | 272 | # Grouping the document tree into Texinfo files. List of tuples 273 | # (source start file, target name, title, author, 274 | # dir menu entry, description, category) 275 | texinfo_documents = [ 276 | (master_doc, 'asr-evaluation', u'asr-evaluation Documentation', 277 | author, 'asr-evaluation', 'One line description of project.', 278 | 'Miscellaneous'), 279 | ] 280 | 281 | # Documents to append as an appendix to all manuals. 282 | #texinfo_appendices = [] 283 | 284 | # If false, no module index is generated. 285 | #texinfo_domain_indices = True 286 | 287 | # How to display URL addresses: 'footnote', 'no', or 'inline'. 288 | #texinfo_show_urls = 'footnote' 289 | 290 | # If true, do not generate a @detailmenu in the "Top" node's menu. 291 | #texinfo_no_detailmenu = False 292 | 293 | 294 | # Example configuration for intersphinx: refer to the Python standard library. 295 | intersphinx_mapping = {'https://docs.python.org/3': None} 296 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 
29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright 2013-2018 Ben Lambert 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /asr_evaluation/asr_evaluation.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017-2018 Ben Lambert 2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | """ 17 | Primary code for computing word error rate and other metrics from ASR output. 18 | """ 19 | from __future__ import division 20 | 21 | from functools import reduce 22 | from collections import defaultdict 23 | from edit_distance import SequenceMatcher 24 | 25 | from termcolor import colored 26 | 27 | # Some defaults 28 | print_instances_p = False 29 | print_errors_p = False 30 | files_head_ids = False 31 | files_tail_ids = False 32 | confusions = False 33 | min_count = 0 34 | wer_vs_length_p = True 35 | 36 | # For keeping track of the total number of tokens, errors, and matches 37 | ref_token_count = 0 38 | error_count = 0 39 | match_count = 0 40 | counter = 0 41 | sent_error_count = 0 42 | 43 | # For keeping track of word error rates by sentence length; 44 | # this is so we can see if performance is better/worse for longer 45 | # and/or shorter sentences 46 | lengths = [] 47 | error_rates = [] 48 | wer_bins = defaultdict(list) 49 | wer_vs_length = defaultdict(list) 50 | # Tables for keeping track of which words get confused with one another 51 | insertion_table = defaultdict(int) 52 | deletion_table = defaultdict(int) 53 | substitution_table = defaultdict(int) 54 | # These are the edit_distance opcodes that are considered 'errors' 55 | error_codes = ['replace', 'delete', 'insert'] 56 | 57 | 58 | # TODO - rename this function. Move some of it into evaluate.py? 59 | def main(args): 60 | """Main method - this reads the hyp and ref files, and creates 61 | edit_distance.SequenceMatcher objects to compute the edit distance. 62 | All the necessary statistics are collected, and results are 63 | printed as specified by the command line options. 64 | 65 | This function does not check that the reference and 66 | hypothesis files have the same number of lines. It will stop after the 67 | shorter one runs out of lines. This should be easy to fix...
68 | """ 69 | global counter 70 | set_global_variables(args) 71 | 72 | counter = 0 73 | # Loop through each line of the reference and hyp file 74 | for ref_line, hyp_line in zip(args.ref, args.hyp): 75 | processed_p = process_line_pair(ref_line, hyp_line, case_insensitive=args.case_insensitive, 76 | remove_empty_refs=args.remove_empty_refs) 77 | if processed_p: 78 | counter += 1 79 | if confusions: 80 | print_confusions() 81 | if wer_vs_length_p: 82 | print_wer_vs_length() 83 | # Compute WER and WRR 84 | if ref_token_count > 0: 85 | wrr = match_count / ref_token_count 86 | wer = error_count / ref_token_count 87 | else: 88 | wrr = 0.0 89 | wer = 0.0 90 | # Compute SER 91 | ser = sent_error_count / counter if counter > 0 else 0.0 92 | print('Sentence count: {}'.format(counter)) 93 | print('WER: {:10.3%} ({:10d} / {:10d})'.format(wer, error_count, ref_token_count)) 94 | print('WRR: {:10.3%} ({:10d} / {:10d})'.format(wrr, match_count, ref_token_count)) 95 | print('SER: {:10.3%} ({:10d} / {:10d})'.format(ser, sent_error_count, counter)) 96 | 97 | 98 | def process_line_pair(ref_line, hyp_line, case_insensitive=False, remove_empty_refs=False): 99 | """Given a pair of strings corresponding to a reference and hypothesis, 100 | compute the edit distance, print if desired, and keep track of results 101 | in global variables. 102 | 103 | Return true if the pair was counted, false if the pair was not counted due 104 | to an empty reference string.""" 105 | # I don't believe these all need to be global. In any case, they shouldn't be. 106 | global error_count 107 | global match_count 108 | global ref_token_count 109 | global sent_error_count 110 | 111 | # Split into tokens by whitespace 112 | ref = ref_line.split() 113 | hyp = hyp_line.split() 114 | id_ = None 115 | 116 | # If the files have IDs, then split the ID off from the text 117 | if files_head_ids: 118 | id_ = ref[0] 119 | ref, hyp = remove_head_id(ref, hyp) 120 | elif files_tail_ids: 121 | id_ = ref[-1] 122 | ref, hyp = remove_tail_id(ref, hyp) 123 | 124 | if case_insensitive: 125 | ref = list(map(str.lower, ref)) 126 | hyp = list(map(str.lower, hyp)) 127 | if remove_empty_refs and len(ref) == 0: 128 | return False 129 | 130 | # Create an object to get the edit distance, and then retrieve the 131 | # relevant counts that we need. 
132 | sm = SequenceMatcher(a=ref, b=hyp) 133 | errors = get_error_count(sm) 134 | matches = get_match_count(sm) 135 | ref_length = len(ref) 136 | 137 | # Increment the total counts we're tracking 138 | error_count += errors 139 | match_count += matches 140 | ref_token_count += ref_length 141 | 142 | if errors != 0: 143 | sent_error_count += 1 144 | 145 | # If we're keeping track of which words get mixed up with which others, call track_confusions 146 | if confusions: 147 | track_confusions(sm, ref, hyp) 148 | 149 | # If we're printing instances, do it here (in roughly the align.c format) 150 | if print_instances_p or (print_errors_p and errors != 0): 151 | print_instances(ref, hyp, sm, id_=id_) 152 | 153 | # Keep track of the individual error rates, and reference lengths, so we 154 | # can compute average WERs by sentence length 155 | lengths.append(ref_length) 156 | error_rate = errors * 1.0 / len(ref) if len(ref) > 0 else float("inf") 157 | error_rates.append(error_rate) 158 | wer_bins[len(ref)].append(error_rate) 159 | return True 160 | 161 | def set_global_variables(args): 162 | """Copy argparse args into global variables.""" 163 | global print_instances_p 164 | global print_errors_p 165 | global files_head_ids 166 | global files_tail_ids 167 | global confusions 168 | global min_count 169 | global wer_vs_length_p 170 | # Put the command line options into global variables. 171 | print_instances_p = args.print_instances 172 | print_errors_p = args.print_errors 173 | files_head_ids = args.head_ids 174 | files_tail_ids = args.tail_ids 175 | confusions = args.confusions 176 | min_count = args.min_word_count 177 | wer_vs_length_p = args.print_wer_vs_length 178 | 179 | def remove_head_id(ref, hyp): 180 | """Assumes that the ID is the begin token of the string which is common 181 | in Kaldi but not in Sphinx.""" 182 | ref_id = ref[0] 183 | hyp_id = hyp[0] 184 | if ref_id != hyp_id: 185 | print('Reference and hypothesis IDs do not match! ' 186 | 'ref="{}" hyp="{}"\n' 187 | 'File lines in hyp file should match those in the ref file.'.format(ref_id, hyp_id)) 188 | exit(-1) 189 | ref = ref[1:] 190 | hyp = hyp[1:] 191 | return ref, hyp 192 | 193 | def remove_tail_id(ref, hyp): 194 | """Assumes that the ID is the final token of the string which is common 195 | in Sphinx but not in Kaldi.""" 196 | ref_id = ref[-1] 197 | hyp_id = hyp[-1] 198 | if ref_id != hyp_id: 199 | print('Reference and hypothesis IDs do not match! 
' 200 | 'ref="{}" hyp="{}"\n' 201 | 'File lines in hyp file should match those in the ref file.'.format(ref_id, hyp_id)) 202 | exit(-1) 203 | ref = ref[:-1] 204 | hyp = hyp[:-1] 205 | return ref, hyp 206 | 207 | def print_instances(ref, hyp, sm, id_=None): 208 | """Print a single instance of a ref/hyp pair.""" 209 | print_diff(sm, ref, hyp) 210 | if id_: 211 | print(('SENTENCE {0:d} {1!s}'.format(counter + 1, id_))) 212 | else: 213 | print('SENTENCE {0:d}'.format(counter + 1)) 214 | # Handle cases where the reference is empty without dying 215 | if len(ref) != 0: 216 | correct_rate = sm.matches() / len(ref) 217 | error_rate = sm.distance() / len(ref) 218 | elif sm.matches() == 0: 219 | correct_rate = 1.0 220 | error_rate = 0.0 221 | else: 222 | correct_rate = 0.0 223 | error_rate = sm.matches() 224 | print('Correct = {0:6.1%} {1:3d} ({2:6d})'.format(correct_rate, sm.matches(), len(ref))) 225 | print('Errors = {0:6.1%} {1:3d} ({2:6d})'.format(error_rate, sm.distance(), len(ref))) 226 | 227 | def track_confusions(sm, seq1, seq2): 228 | """Keep track of the errors in a global variable, given a sequence matcher.""" 229 | opcodes = sm.get_opcodes() 230 | for tag, i1, i2, j1, j2 in opcodes: 231 | if tag == 'insert': 232 | for i in range(j1, j2): 233 | word = seq2[i] 234 | insertion_table[word] += 1 235 | elif tag == 'delete': 236 | for i in range(i1, i2): 237 | word = seq1[i] 238 | deletion_table[word] += 1 239 | elif tag == 'replace': 240 | for w1 in seq1[i1:i2]: 241 | for w2 in seq2[j1:j2]: 242 | key = (w1, w2) 243 | substitution_table[key] += 1 244 | 245 | def print_confusions(): 246 | """Print the confused words that we found, grouped by insertions, deletions 247 | and substitutions.""" 248 | if len(insertion_table) > 0: 249 | print('INSERTIONS:') 250 | for item in sorted(list(insertion_table.items()), key=lambda x: x[1], reverse=True): 251 | if item[1] >= min_count: 252 | print('{0:20s} {1:10d}'.format(*item)) 253 | if len(deletion_table) > 0: 254 | print('DELETIONS:') 255 | for item in sorted(list(deletion_table.items()), key=lambda x: x[1], reverse=True): 256 | if item[1] >= min_count: 257 | print('{0:20s} {1:10d}'.format(*item)) 258 | if len(substitution_table) > 0: 259 | print('SUBSTITUTIONS:') 260 | for [w1, w2], count in sorted(list(substitution_table.items()), key=lambda x: x[1], reverse=True): 261 | if count >= min_count: 262 | print('{0:20s} -> {1:20s} {2:10d}'.format(w1, w2, count)) 263 | 264 | # TODO - For some reason I was getting two different counts depending on how I count the matches, 265 | # so do an assertion in this code to make sure we're getting matching counts. 266 | # This might slow things down. 267 | def get_match_count(sm): 268 | "Return the number of matches, given a sequence matcher object." 269 | matches = None 270 | matches1 = sm.matches() 271 | matching_blocks = sm.get_matching_blocks() 272 | matches2 = reduce(lambda x, y: x + y, [x[2] for x in matching_blocks], 0) 273 | assert matches1 == matches2 274 | matches = matches1 275 | return matches 276 | 277 | 278 | def get_error_count(sm): 279 | """Return the number of errors (insertions, deletions, and substitutions), 280 | given a sequence matcher object.""" 281 | opcodes = sm.get_opcodes() 282 | errors = [x for x in opcodes if x[0] in error_codes] 283 | error_lengths = [max(x[2] - x[1], x[4] - x[3]) for x in errors] 284 | return reduce(lambda x, y: x + y, error_lengths, 0) 285 | 286 | # TODO - This is long and ugly. Perhaps we can break it up?
287 | # It would make more sense for this to just return the two strings... 288 | def print_diff(sm, seq1, seq2, prefix1='REF:', prefix2='HYP:', suffix1=None, suffix2=None): 289 | """Given a sequence matcher and the two sequences, print a Sphinx-style 290 | 'diff' of the two.""" 291 | ref_tokens = [] 292 | hyp_tokens = [] 293 | opcodes = sm.get_opcodes() 294 | for tag, i1, i2, j1, j2 in opcodes: 295 | # If they are equal, do nothing except lowercase them 296 | if tag == 'equal': 297 | for i in range(i1, i2): 298 | ref_tokens.append(seq1[i].lower()) 299 | for i in range(j1, j2): 300 | hyp_tokens.append(seq2[i].lower()) 301 | # For insertions and deletions, put a '***' filler on the missing side, and 302 | # make the other side all caps 303 | elif tag == 'delete': 304 | for i in range(i1, i2): 305 | ref_token = colored(seq1[i].upper(), 'red') 306 | ref_tokens.append(ref_token) 307 | for i in range(i1, i2): 308 | hyp_token = colored('*' * len(seq1[i]), 'red') 309 | hyp_tokens.append(hyp_token) 310 | elif tag == 'insert': 311 | for i in range(j1, j2): 312 | ref_token = colored('*' * len(seq2[i]), 'red') 313 | ref_tokens.append(ref_token) 314 | for i in range(j1, j2): 315 | hyp_token = colored(seq2[i].upper(), 'red') 316 | hyp_tokens.append(hyp_token) 317 | # More complicated logic for a substitution 318 | elif tag == 'replace': 319 | seq1_len = i2 - i1 320 | seq2_len = j2 - j1 321 | # Get a list of tokens for each 322 | s1 = list(map(str.upper, seq1[i1:i2])) 323 | s2 = list(map(str.upper, seq2[j1:j2])) 324 | # Pad the two lists with False values to get them to the same length 325 | if seq1_len > seq2_len: 326 | for i in range(0, seq1_len - seq2_len): 327 | s2.append(False) 328 | if seq1_len < seq2_len: 329 | for i in range(0, seq2_len - seq1_len): 330 | s1.append(False) 331 | assert len(s1) == len(s2) 332 | # Pair up words with their substitutions, or fillers 333 | for i in range(0, len(s1)): 334 | w1 = s1[i] 335 | w2 = s2[i] 336 | # If we have two words, make them the same length 337 | if w1 and w2: 338 | if len(w1) > len(w2): 339 | s2[i] = w2 + ' ' * (len(w1) - len(w2)) 340 | elif len(w1) < len(w2): 341 | s1[i] = w1 + ' ' * (len(w2) - len(w1)) 342 | # Otherwise, create an empty filler word of the right width 343 | if not w1: 344 | s1[i] = '*' * len(w2) 345 | if not w2: 346 | s2[i] = '*' * len(w1) 347 | s1 = map(lambda x: colored(x, 'red'), s1) 348 | s2 = map(lambda x: colored(x, 'red'), s2) 349 | ref_tokens += s1 350 | hyp_tokens += s2 351 | if prefix1: ref_tokens.insert(0, prefix1) 352 | if prefix2: hyp_tokens.insert(0, prefix2) 353 | if suffix1: ref_tokens.append(suffix1) 354 | if suffix2: hyp_tokens.append(suffix2) 355 | print(' '.join(ref_tokens)) 356 | print(' '.join(hyp_tokens)) 357 | 358 | def mean(seq): 359 | """Return the average of the elements of a sequence.""" 360 | return float(sum(seq)) / len(seq) if len(seq) > 0 else float('nan') 361 | 362 | def print_wer_vs_length(): 363 | """Print the average word error rate for each sentence length.""" 364 | avg_wers = {length: mean(wers) for length, wers in wer_bins.items()} 365 | for length, avg_wer in sorted(avg_wers.items(), key=lambda x: (x[1], x[0])): 366 | print('{0:5d} {1:f}'.format(length, avg_wer)) 367 | print('') 368 | --------------------------------------------------------------------------------
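To make the counting above concrete, here is a minimal sketch (not part of the repository) of how the opcode handling in get_error_count and get_match_count turns into a word error rate for a single ref/hyp pair. The toy sentences and variable names are invented purely for illustration; only edit_distance.SequenceMatcher and the methods already used in the module are assumed.

from edit_distance import SequenceMatcher

# Hypothetical example pair; any whitespace-tokenized ref/hyp works.
ref = 'the quick brown fox'.split()
hyp = 'the quack brown fox jumps'.split()

sm = SequenceMatcher(a=ref, b=hyp)

# Same rule as get_error_count(): every non-'equal' opcode contributes
# the length of its longer side.
errors = sum(max(i2 - i1, j2 - j1)
             for tag, i1, i2, j1, j2 in sm.get_opcodes()
             if tag in ('replace', 'delete', 'insert'))

wer = errors / len(ref)        # word error rate, as printed by main()
wrr = sm.matches() / len(ref)  # word recognition rate
print('WER: {:.3%}  WRR: {:.3%}'.format(wer, wrr))
# For this pair the edit distance is 2 (one substitution, one insertion),
# so this should report a WER of 50% and a WRR of 75%.

Run over the hyp and ref files line by line, this is essentially what main() aggregates across every sentence before printing the overall WER, WRR, and SER.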