├── .github └── workflows │ ├── build-wheels-aarch64.yaml │ ├── build-wheels-macos.yaml │ ├── build-wheels-win32.yaml │ ├── build-wheels.yaml │ ├── build_conda_macos.yml │ ├── build_conda_ubuntu.yml │ ├── build_conda_windows.yml │ ├── test-pip-install.yaml │ └── test.yaml ├── .gitignore ├── .pre-commit-config.yaml ├── CMakeLists.txt ├── LICENSE ├── MANIFEST.in ├── README.md ├── cmake ├── Modules │ ├── FetchContent.cmake │ ├── FetchContent │ │ └── CMakeLists.cmake.in │ └── README.md ├── __init__.py └── pybind11.cmake ├── extensions ├── kaldi_align.cpp ├── kaldi_align.h └── kaldialign.cpp ├── kaldialign └── __init__.py ├── scripts ├── build_conda.sh └── conda │ └── kaldialign │ └── meta.yaml ├── setup.py └── tests └── test_align.py /.github/workflows/build-wheels-aarch64.yaml: -------------------------------------------------------------------------------- 1 | name: build-wheels-aarch64 2 | 3 | on: 4 | push: 5 | branches: 6 | - wheel 7 | tags: 8 | - '*' 9 | 10 | workflow_dispatch: 11 | 12 | concurrency: 13 | group: build-wheels-aarch64-${{ github.ref }} 14 | cancel-in-progress: true 15 | 16 | jobs: 17 | build_wheels_aarch64: 18 | name: ${{ matrix.python-version }} 19 | runs-on: ${{ matrix.os }} 20 | strategy: 21 | fail-fast: false 22 | matrix: 23 | os: [ubuntu-22.04-arm] 24 | python-version: ["cp37", "cp38", "cp39", "cp310", "cp311", "cp312", "cp313"] 25 | 26 | steps: 27 | - uses: actions/checkout@v4 28 | 29 | # see https://cibuildwheel.readthedocs.io/en/stable/changelog/ 30 | # for a list of versions 31 | - name: Build wheels 32 | uses: pypa/cibuildwheel@v2.21.3 33 | env: 34 | CIBW_BEFORE_BUILD: "pip install -U numpy" 35 | CIBW_SKIP: "cp27-* cp35-* cp36-* *-win32 pp* *-musllinux* *-manylinux_i686" 36 | CIBW_BUILD: "${{ matrix.python-version}}-* " 37 | CIBW_BUILD_VERBOSITY: 3 38 | CIBW_ARCHS_LINUX: aarch64 39 | 40 | - name: Display wheels 41 | shell: bash 42 | run: | 43 | ls -lh ./wheelhouse/ 44 | 45 | - uses: actions/upload-artifact@v4 46 | with: 47 | name: 
wheel-${{ matrix.python-version }} 48 | path: ./wheelhouse/*.whl 49 | 50 | - name: Publish wheels to PyPI 51 | env: 52 | TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} 53 | TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} 54 | run: | 55 | python3 -m pip install --upgrade pip 56 | python3 -m pip install wheel twine==5.0.0 setuptools 57 | 58 | twine upload ./wheelhouse/*.whl 59 | -------------------------------------------------------------------------------- /.github/workflows/build-wheels-macos.yaml: -------------------------------------------------------------------------------- 1 | name: build-wheels-macos 2 | 3 | on: 4 | push: 5 | branches: 6 | - wheel 7 | tags: 8 | - '*' 9 | 10 | workflow_dispatch: 11 | 12 | concurrency: 13 | group: build-wheels-macos-${{ github.ref }} 14 | cancel-in-progress: true 15 | 16 | jobs: 17 | build_wheels: 18 | name: ${{ matrix.python-version }} 19 | runs-on: ${{ matrix.os }} 20 | strategy: 21 | fail-fast: false 22 | matrix: 23 | os: [macos-13] 24 | python-version: ["cp38", "cp39", "cp310", "cp311", "cp312", "cp313"] 25 | 26 | steps: 27 | - uses: actions/checkout@v4 28 | 29 | - name: Build wheels 30 | uses: pypa/cibuildwheel@v2.21.3 31 | env: 32 | CIBW_BUILD: "${{ matrix.python-version}}-* " 33 | CIBW_BEFORE_BUILD: "pip install -U numpy" 34 | CIBW_ENVIRONMENT: KALDIALIGN_CMAKE_ARGS="-DCMAKE_OSX_ARCHITECTURES='arm64;x86_64'" 35 | CIBW_ARCHS: "universal2" 36 | CIBW_BUILD_VERBOSITY: 3 37 | 38 | # Don't repair macOS wheels 39 | CIBW_REPAIR_WHEEL_COMMAND_MACOS: "" 40 | 41 | - name: Display wheels 42 | shell: bash 43 | run: | 44 | ls -lh ./wheelhouse/ 45 | 46 | - uses: actions/upload-artifact@v4 47 | with: 48 | name: wheel-${{ matrix.python-version }} 49 | path: ./wheelhouse/*.whl 50 | 51 | - name: Publish wheels to PyPI 52 | env: 53 | TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} 54 | TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} 55 | run: | 56 | opts='--break-system-packages' 57 | v=${{ matrix.python-version }} 58 | if [[ $v == cp38 || $v == cp39 
]]; then 59 | opts='' 60 | fi 61 | 62 | python3 -m pip install $opts --upgrade pip 63 | python3 -m pip install $opts wheel twine==5.0.0 setuptools 64 | 65 | twine upload ./wheelhouse/*.whl 66 | -------------------------------------------------------------------------------- /.github/workflows/build-wheels-win32.yaml: -------------------------------------------------------------------------------- 1 | name: build-wheels-win32 2 | 3 | on: 4 | push: 5 | branches: 6 | - wheel 7 | tags: 8 | - '*' 9 | 10 | workflow_dispatch: 11 | 12 | concurrency: 13 | group: build-wheels-win32-${{ github.ref }} 14 | cancel-in-progress: true 15 | 16 | jobs: 17 | build_wheels_win32: 18 | name: ${{ matrix.python-version }} 19 | runs-on: ${{ matrix.os }} 20 | strategy: 21 | fail-fast: false 22 | matrix: 23 | os: [windows-latest] 24 | python-version: ["cp37", "cp38", "cp39", "cp310", "cp311", "cp312", "cp313"] 25 | 26 | steps: 27 | - uses: actions/checkout@v4 28 | 29 | # see https://cibuildwheel.readthedocs.io/en/stable/changelog/ 30 | # for a list of versions 31 | - name: Build wheels 32 | uses: pypa/cibuildwheel@v2.21.3 33 | env: 34 | CIBW_BEFORE_BUILD: "pip install -U numpy" 35 | CIBW_BUILD: "${{ matrix.python-version}}-* " 36 | CIBW_ENVIRONMENT: KALDIALIGN_CMAKE_ARGS="-A Win32" 37 | CIBW_SKIP: "*-win_amd64" 38 | CIBW_BUILD_VERBOSITY: 3 39 | 40 | - name: Display wheels 41 | shell: bash 42 | run: | 43 | ls -lh ./wheelhouse/ 44 | 45 | - uses: actions/upload-artifact@v4 46 | with: 47 | name: wheel-${{ matrix.python-version }} 48 | path: ./wheelhouse/*.whl 49 | 50 | - name: Publish wheels to PyPI 51 | env: 52 | TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} 53 | TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} 54 | run: | 55 | python3 -m pip install --upgrade pip 56 | python3 -m pip install wheel twine==5.0.0 setuptools 57 | 58 | twine upload ./wheelhouse/*.whl 59 | -------------------------------------------------------------------------------- /.github/workflows/build-wheels.yaml: 
-------------------------------------------------------------------------------- 1 | name: build-wheels 2 | 3 | on: 4 | push: 5 | branches: 6 | - wheel 7 | - wheel-python3.13 8 | tags: 9 | - '*' 10 | 11 | workflow_dispatch: 12 | 13 | concurrency: 14 | group: build-wheels-${{ github.ref }} 15 | cancel-in-progress: true 16 | 17 | jobs: 18 | build_wheels: 19 | name: ${{ matrix.os }} ${{ matrix.python-version }} 20 | runs-on: ${{ matrix.os }} 21 | strategy: 22 | fail-fast: false 23 | matrix: 24 | os: [ubuntu-latest, windows-latest] 25 | python-version: ["cp37", "cp38", "cp39", "cp310", "cp311", "cp312", "cp313"] 26 | 27 | steps: 28 | - uses: actions/checkout@v4 29 | 30 | # see https://cibuildwheel.readthedocs.io/en/stable/changelog/ 31 | # for a list of versions 32 | - name: Build wheels 33 | uses: pypa/cibuildwheel@v2.21.3 34 | env: 35 | CIBW_BUILD: "${{ matrix.python-version}}-* " 36 | CIBW_SKIP: "cp27-* cp35-* cp36-* *-win32 pp* *-musllinux* *-manylinux_i686" 37 | CIBW_BUILD_VERBOSITY: 3 38 | 39 | - name: Display wheels 40 | shell: bash 41 | run: | 42 | ls -lh ./wheelhouse/ 43 | 44 | - uses: actions/upload-artifact@v4 45 | with: 46 | name: wheel-${{ matrix.os }}-${{ matrix.python-version }} 47 | path: ./wheelhouse/*.whl 48 | 49 | - name: Build sdist 50 | if: matrix.os == 'ubuntu-latest' && matrix.python-version == 'cp38' # matrix values are cpXY tags; must match the "Publish sdist" condition 51 | shell: bash 52 | run: | 53 | python3 setup.py sdist 54 | ls -lh dist/* 55 | 56 | - name: Publish wheels to PyPI 57 | env: 58 | TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} 59 | TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} 60 | shell: bash 61 | run: | 62 | python3 -m pip install --upgrade pip 63 | python3 -m pip install wheel twine==5.0.0 setuptools 64 | 65 | twine upload ./wheelhouse/*.whl 66 | 67 | 68 | - name: Publish sdist to PyPI 69 | if: matrix.os == 'ubuntu-latest' && matrix.python-version == 'cp38' 70 | env: 71 | TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} 72 | TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} 73 | run: | 74 | twine upload
dist/kaldialign-*.tar.gz 75 | -------------------------------------------------------------------------------- /.github/workflows/build_conda_macos.yml: -------------------------------------------------------------------------------- 1 | name: build_conda_macos 2 | 3 | on: 4 | push: 5 | tags: 6 | - '*' 7 | branches: 8 | - conda 9 | workflow_dispatch: 10 | 11 | concurrency: 12 | group: build_conda_macos-${{ github.ref }} 13 | cancel-in-progress: true 14 | 15 | jobs: 16 | build_conda_macos: 17 | name: ${{ matrix.python-version }} 18 | runs-on: ${{ matrix.os }} 19 | strategy: 20 | fail-fast: false 21 | matrix: 22 | os: [macos-12] 23 | python-version: ["3.6", "3.7", "3.8", "3.9", "3.10", "3.11", "3.12"] 24 | 25 | steps: 26 | # refer to https://github.com/actions/checkout 27 | - uses: actions/checkout@v4 28 | with: 29 | fetch-depth: 0 30 | 31 | - uses: conda-incubator/setup-miniconda@v2 32 | with: 33 | auto-update-conda: true 34 | python-version: ${{ matrix.python-version }} 35 | channels: conda-forge 36 | activate-environment: kaldialign 37 | 38 | - name: Display Python version 39 | shell: bash -l {0} 40 | run: | 41 | python3 -c "import sys; print(sys.version)" 42 | which python3 43 | 44 | - name: Install conda dependencies 45 | shell: bash -l {0} 46 | run: | 47 | conda install -y -q anaconda-client 48 | conda install -y -q conda-build 49 | 50 | - name: Display conda info 51 | shell: bash -l {0} 52 | run: | 53 | which conda 54 | conda env list 55 | conda info 56 | 57 | - name: Build kaldialign 58 | shell: bash -l {0} 59 | env: 60 | KALDIALIGN_CONDA_TOKEN: ${{ secrets.KALDIALIGN_CONDA_TOKEN }} 61 | run: | 62 | ./scripts/build_conda.sh 63 | 64 | - name: Display generated files 65 | run: | 66 | ls -lh /usr/local/miniconda/envs/kaldialign/conda-bld/osx-64 67 | 68 | - name: Upload generated files 69 | uses: actions/upload-artifact@v4 # same major version as the wheel workflows; artifact names are unique per matrix entry 70 | with: 71 | name: cpu-python-${{ matrix.python-version }}-${{ matrix.os }} 72 | path:
/usr/local/miniconda/envs/kaldialign/conda-bld/osx-64/*.tar.bz2 73 | -------------------------------------------------------------------------------- /.github/workflows/build_conda_ubuntu.yml: -------------------------------------------------------------------------------- 1 | name: build_conda_ubuntu 2 | 3 | on: 4 | push: 5 | tags: 6 | - '*' 7 | branches: 8 | - conda 9 | workflow_dispatch: 10 | 11 | concurrency: 12 | group: build_conda_ubuntu-${{ github.ref }} 13 | cancel-in-progress: true 14 | 15 | jobs: 16 | build_conda_ubuntu: 17 | name: ${{ matrix.python-version }} 18 | runs-on: ${{ matrix.os }} 19 | strategy: 20 | fail-fast: false 21 | matrix: 22 | os: [ubuntu-20.04] 23 | python-version: ["3.6", "3.7", "3.8", "3.9", "3.10", "3.11", "3.12"] 24 | 25 | steps: 26 | # refer to https://github.com/actions/checkout 27 | - uses: actions/checkout@v4 28 | with: 29 | fetch-depth: 0 30 | 31 | - uses: conda-incubator/setup-miniconda@v2 32 | with: 33 | auto-update-conda: true 34 | python-version: ${{ matrix.python-version }} 35 | channels: conda-forge 36 | activate-environment: kaldialign 37 | 38 | - name: Display Python version 39 | shell: bash -l {0} 40 | run: | 41 | python3 -c "import sys; print(sys.version)" 42 | which python3 43 | 44 | - name: Install conda dependencies 45 | shell: bash -l {0} 46 | run: | 47 | conda install -y -q anaconda-client 48 | conda install -y -q conda-build 49 | conda install -y -q numpy 50 | 51 | - name: Display conda info 52 | shell: bash -l {0} 53 | run: | 54 | which conda 55 | conda env list 56 | conda info 57 | nproc 58 | 59 | - name: Build kaldialign 60 | shell: bash -l {0} 61 | env: 62 | KALDIALIGN_CONDA_TOKEN: ${{ secrets.KALDIALIGN_CONDA_TOKEN }} 63 | run: | 64 | ./scripts/build_conda.sh 65 | 66 | - name: Display generated files 67 | run: | 68 | ls -lh /usr/share/miniconda/envs/kaldialign/conda-bld/linux-64 69 | 70 | - name: Upload generated files 71 | uses: actions/upload-artifact@v4 # same major version as the wheel workflows; artifact names are unique per matrix entry 72 | with: 73 | name: python-${{
matrix.python-version }}-${{ matrix.os }} 74 | path: /usr/share/miniconda/envs/kaldialign/conda-bld/linux-64/*.tar.bz2 75 | -------------------------------------------------------------------------------- /.github/workflows/build_conda_windows.yml: -------------------------------------------------------------------------------- 1 | name: build_conda_windows 2 | 3 | on: 4 | push: 5 | tags: 6 | - '*' 7 | branches: 8 | - conda 9 | 10 | workflow_dispatch: 11 | 12 | concurrency: 13 | group: build_conda_windows-${{ github.ref }} 14 | cancel-in-progress: true 15 | 16 | jobs: 17 | build_conda_windows: 18 | name: ${{ matrix.python-version }} 19 | runs-on: ${{ matrix.os }} 20 | strategy: 21 | fail-fast: false 22 | matrix: 23 | os: [windows-2019] 24 | python-version: ["3.6", "3.7", "3.8", "3.9", "3.10", "3.11", "3.12"] 25 | 26 | steps: 27 | # refer to https://github.com/actions/checkout 28 | - uses: actions/checkout@v4 29 | with: 30 | fetch-depth: 0 31 | 32 | - uses: conda-incubator/setup-miniconda@v2 33 | with: 34 | auto-update-conda: true 35 | python-version: ${{ matrix.python-version }} 36 | channels: conda-forge 37 | activate-environment: kaldialign 38 | 39 | - name: Display Python version 40 | shell: bash -l {0} 41 | run: | 42 | python -c "import sys; print(sys.version)" 43 | which python 44 | 45 | - name: Install conda dependencies 46 | shell: bash -l {0} 47 | run: | 48 | conda install -y -q anaconda-client 49 | conda install -y -q conda-build 50 | 51 | - name: Display conda info 52 | shell: bash -l {0} 53 | run: | 54 | which conda 55 | conda env list 56 | conda info 57 | which python 58 | 59 | - name: Build kaldialign 60 | shell: bash -l {0} 61 | env: 62 | KALDIALIGN_CONDA_TOKEN: ${{ secrets.KALDIALIGN_CONDA_TOKEN }} 63 | run: | 64 | ./scripts/build_conda.sh 65 | 66 | - name: Display generated files 67 | shell: bash -l {0} 68 | run: | 69 | ls -lh /c/Miniconda/envs/kaldialign/conda-bld 70 | ls -lh /c/Miniconda/envs/kaldialign/conda-bld/*/* 71 | ls -lh 
/c/Miniconda/envs/kaldialign/conda-bld/win-64/* 72 | 73 | - name: Upload generated files 74 | uses: actions/upload-artifact@v4 # same major version as the wheel workflows; artifact names are unique per matrix entry 75 | with: 76 | name: python-${{ matrix.python-version }}-windows-2019 77 | path: c:/Miniconda/envs/kaldialign/conda-bld/win-64/*.tar.bz2 78 | -------------------------------------------------------------------------------- /.github/workflows/test-pip-install.yaml: -------------------------------------------------------------------------------- 1 | name: test-pip-install 2 | 3 | on: 4 | push: 5 | branches: 6 | - nightly 7 | schedule: 8 | # minute (0-59) 9 | # hour (0-23) 10 | # day of the month (1-31) 11 | # month (1-12) 12 | # day of the week (0-6) 13 | # nightly test at 22:50 UTC time every day 14 | - cron: "50 22 * * *" 15 | 16 | workflow_dispatch: 17 | 18 | concurrency: 19 | group: test_pip_install-${{ github.ref }} 20 | cancel-in-progress: true 21 | 22 | permissions: 23 | contents: read 24 | 25 | jobs: 26 | test_pip_install: 27 | name: ${{ matrix.os }} ${{ matrix.python-version }} 28 | runs-on: ${{ matrix.os }} 29 | strategy: 30 | fail-fast: false 31 | matrix: 32 | os: [ubuntu-latest, macos-latest, windows-latest] 33 | python-version: ["3.7", "3.8", "3.9", "3.10", "3.11", "3.12"] 34 | exclude: 35 | - os: macos-latest 36 | python-version: "3.7" 37 | 38 | steps: 39 | - uses: actions/checkout@v4 40 | with: 41 | fetch-depth: 0 42 | 43 | - name: Setup Python ${{ matrix.python-version }} 44 | uses: actions/setup-python@v5 45 | with: 46 | python-version: ${{ matrix.python-version }} 47 | 48 | - name: Display Python version 49 | run: python -c "import sys; print(sys.version)" 50 | 51 | - name: Install kaldialign 52 | shell: bash 53 | run: | 54 | pip3 install --verbose 'kaldialign[test]' 55 | 56 | - name: Run test 57 | shell: bash 58 | run: | 59 | cd tests 60 | python3 -c "import kaldialign; print(kaldialign.__file__)" 61 | python3 -c "import kaldialign; print(kaldialign.__version__)" 62 | pytest .
63 | 64 | -------------------------------------------------------------------------------- /.github/workflows/test.yaml: -------------------------------------------------------------------------------- 1 | name: Run test 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | pull_request: 8 | branches: 9 | - main 10 | 11 | workflow_dispatch: 12 | 13 | jobs: 14 | run-test: 15 | runs-on: ${{ matrix.os }} 16 | strategy: 17 | fail-fast: false 18 | matrix: 19 | os: [ubuntu-latest, macos-latest, windows-latest] 20 | python-version: ["3.7", "3.8", "3.9", "3.10", "3.11", "3.12"] 21 | exclude: 22 | - os: macos-latest 23 | python-version: "3.7" 24 | 25 | steps: 26 | - uses: actions/checkout@v4 27 | 28 | # see https://github.com/microsoft/setup-msbuild 29 | - name: Add msbuild to PATH 30 | if: startsWith(matrix.os, 'windows') 31 | uses: microsoft/setup-msbuild@v1.0.2 32 | 33 | - name: Setup Python 34 | uses: actions/setup-python@v5 35 | with: 36 | python-version: ${{ matrix.python-version }} 37 | 38 | - name: Install kaldialign 39 | shell: bash 40 | run: | 41 | python3 -m pip install --verbose '.[test]' 42 | 43 | - name: Test 44 | shell: bash 45 | run: | 46 | pytest -vv ./tests/test_align.py 47 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/pre-commit-hooks 3 | rev: v4.5.0 4 | hooks: 5 | - id: check-ast 6 | - id: check-executables-have-shebangs 7 | - id: end-of-file-fixer 8 | - id: mixed-line-ending 9 | - id: trailing-whitespace 10 | 11 | - repo: https://github.com/PyCQA/flake8 12 | rev: 7.0.0 13 | hooks: 14 | - id: flake8 15 | args: ['--select=E9,F63,F7,F82'] 16 | 17 | - repo: https://github.com/pycqa/isort 18 | rev: 5.13.2 19 | hooks: 20 | - id: isort 21 | args: [--profile=black] 22 | 23 | - repo: https://github.com/psf/black 24 | rev: 24.2.0 25 | hooks: 26 | - id: black 27 | additional_dependencies: ['click==8.0.1'] 28 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | if (CMAKE_VERSION VERSION_GREATER_EQUAL "4.0.0") 2 | set(CMAKE_POLICY_VERSION_MINIMUM 3.5) 3 | endif() 4 | if("x${CMAKE_SOURCE_DIR}" STREQUAL "x${CMAKE_BINARY_DIR}") 5 | message(FATAL_ERROR "\ 6 | In-source build is not a good practice. 7 | Please use: 8 | mkdir build 9 | cd build 10 | cmake .. 
11 | to build this project" 12 | ) 13 | endif() 14 | 15 | cmake_minimum_required(VERSION 3.8 FATAL_ERROR) 16 | 17 | project(kaldialign CXX) 18 | 19 | # Please remember to also change line 3 of ./scripts/conda/kaldialign/meta.yaml 20 | set(KALDIALIGN_VERSION "0.9.2") 21 | 22 | if(NOT CMAKE_BUILD_TYPE) 23 | set(CMAKE_BUILD_TYPE Release) 24 | endif() 25 | 26 | list(APPEND CMAKE_MODULE_PATH ${CMAKE_SOURCE_DIR}/cmake) 27 | include(pybind11) 28 | 29 | pybind11_add_module(_kaldialign 30 | ./extensions/kaldi_align.cpp 31 | ./extensions/kaldialign.cpp 32 | ) 33 | 34 | install(TARGETS _kaldialign 35 | DESTINATION ../ 36 | ) 37 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 
25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. 
You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. 
(Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include LICENSE 2 | include extensions/* 3 | include tests/* 4 | include CMakeLists.txt 5 | recursive-include cmake *.* 6 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # kaldialign 2 | 3 | A small package that exposes edit distance computation functions from [Kaldi](https://github.com/kaldi-asr/kaldi). It uses the original Kaldi code and wraps it using pybind11. 
4 | 5 | ## Installation 6 | 7 | ```bash 8 | conda install -c kaldialign kaldialign 9 | ``` 10 | 11 | or 12 | 13 | ```bash 14 | pip install --verbose kaldialign 15 | ``` 16 | 17 | or 18 | 19 | ```bash 20 | pip install --verbose -U git+https://github.com/pzelasko/kaldialign.git 21 | ``` 22 | 23 | or 24 | 25 | ```bash 26 | git clone https://github.com/pzelasko/kaldialign.git 27 | cd kaldialign 28 | python3 -m pip install --verbose . 29 | ``` 30 | 31 | ## Examples 32 | 33 | ### Alignment 34 | 35 | `align(ref, hyp, epsilon)` - used to obtain the alignment between two string sequences. `epsilon` should be a null symbol (indicating deletion/insertion) that doesn't exist in either sequence. 36 | 37 | ```python 38 | from kaldialign import align 39 | 40 | EPS = '*' 41 | a = ['a', 'b', 'c'] 42 | b = ['a', 's', 'x', 'c'] 43 | ali = align(a, b, EPS) 44 | assert ali == [('a', 'a'), ('b', 's'), (EPS, 'x'), ('c', 'c')] 45 | ``` 46 | 47 | ### Edit distance 48 | 49 | `edit_distance(ref, hyp)` - used to obtain the total edit distance, as well as the number of insertions, deletions and substitutions. 50 | 51 | ```python 52 | from kaldialign import edit_distance 53 | 54 | a = ['a', 'b', 'c'] 55 | b = ['a', 's', 'x', 'c'] 56 | results = edit_distance(a, b) 57 | assert results == { 58 | 'ins': 1, 59 | 'del': 0, 60 | 'sub': 1, 61 | 'total': 2 62 | } 63 | ``` 64 | 65 | For alignment and edit distance, you can pass `sclite_mode=True` to compute WER or alignments 66 | based on SCLITE style weights, i.e., insertion/deletion cost 3 and substitution cost 4. 67 | 68 | ### Bootstrapping method to extract WER 95% confidence intervals 69 | 70 | `bootstrap_wer_ci(ref, hyp, hyp2=None)` - obtain the 95% confidence intervals for WER using the Bisani and Ney bootstrapping method.
71 | 72 | ```python 73 | from kaldialign import bootstrap_wer_ci 74 | 75 | ref = [ 76 | ("a", "b", "c"), 77 | ("d", "e", "f"), 78 | ] 79 | hyp = [ 80 | ("a", "b", "d"), 81 | ("e", "f", "f"), 82 | ] 83 | ans = bootstrap_wer_ci(ref, hyp) 84 | assert ans["wer"] == 0.4989 85 | assert ans["ci95"] == 0.2312 86 | assert ans["ci95min"] == 0.2678 87 | assert ans["ci95max"] == 0.7301 88 | ``` 89 | 90 | It also supports providing hypotheses from system 1 and system 2 to compute the probability of S2 improving over S1: 91 | 92 | ```python 93 | from kaldialign import bootstrap_wer_ci 94 | 95 | ref = [ 96 | ("a", "b", "c"), 97 | ("d", "e", "f"), 98 | ] 99 | hyp = [ 100 | ("a", "b", "d"), 101 | ("e", "f", "f"), 102 | ] 103 | hyp2 = [ 104 | ("a", "b", "c"), 105 | ("e", "e", "f"), 106 | ] 107 | ans = bootstrap_wer_ci(ref, hyp, hyp2) 108 | 109 | s = ans["system1"] 110 | assert s["wer"] == 0.4989 111 | assert s["ci95"] == 0.2312 112 | assert s["ci95min"] == 0.2678 113 | assert s["ci95max"] == 0.7301 114 | 115 | s = ans["system2"] 116 | assert s["wer"] == 0.1656 117 | assert s["ci95"] == 0.2312 118 | assert s["ci95min"] == -0.0656 119 | assert s["ci95max"] == 0.3968 120 | 121 | assert ans["p_s2_improv_over_s1"] == 1.0 122 | ``` 123 | 124 | ## Motivation 125 | 126 | The need for this arose from the fact that practically all implementations of the Levenshtein distance have slight differences, making it impossible to use a different scoring tool than Kaldi and get the same error rate results. This package copies code from Kaldi directly and wraps it using pybind11, avoiding the issue altogether. 127 | -------------------------------------------------------------------------------- /cmake/Modules/FetchContent.cmake: -------------------------------------------------------------------------------- 1 | # Distributed under the OSI-approved BSD 3-Clause License. See accompanying 2 | # file Copyright.txt or https://cmake.org/licensing for details.
3 | 4 | #[=======================================================================[.rst: 5 | FetchContent 6 | ------------------ 7 | 8 | .. only:: html 9 | 10 | .. contents:: 11 | 12 | Overview 13 | ^^^^^^^^ 14 | 15 | This module enables populating content at configure time via any method 16 | supported by the :module:`ExternalProject` module. Whereas 17 | :command:`ExternalProject_Add` downloads at build time, the 18 | ``FetchContent`` module makes content available immediately, allowing the 19 | configure step to use the content in commands like :command:`add_subdirectory`, 20 | :command:`include` or :command:`file` operations. 21 | 22 | Content population details would normally be defined separately from the 23 | command that performs the actual population. Projects should also 24 | check whether the content has already been populated somewhere else in the 25 | project hierarchy. Typical usage would look something like this: 26 | 27 | .. code-block:: cmake 28 | 29 | FetchContent_Declare( 30 | googletest 31 | GIT_REPOSITORY https://github.com/google/googletest.git 32 | GIT_TAG release-1.8.0 33 | ) 34 | 35 | FetchContent_GetProperties(googletest) 36 | if(NOT googletest_POPULATED) 37 | FetchContent_Populate(googletest) 38 | add_subdirectory(${googletest_SOURCE_DIR} ${googletest_BINARY_DIR}) 39 | endif() 40 | 41 | When using the above pattern with a hierarchical project arrangement, 42 | projects at higher levels in the hierarchy are able to define or override 43 | the population details of content specified anywhere lower in the project 44 | hierarchy. The ability to detect whether content has already been 45 | populated ensures that even if multiple child projects want certain content 46 | to be available, the first one to populate it wins. The other child project 47 | can simply make use of the already available content instead of repeating 48 | the population for itself. See the 49 | :ref:`Examples <fetch-content-examples>` section which demonstrates 50 | this scenario.
51 | 52 | The ``FetchContent`` module also supports defining and populating 53 | content in a single call, with no check for whether the content has been 54 | populated elsewhere in the project already. This is a more low level 55 | operation and would not normally be the way the module is used, but it is 56 | sometimes useful as part of implementing some higher level feature or to 57 | populate some content in CMake's script mode. 58 | 59 | 60 | Declaring Content Details 61 | ^^^^^^^^^^^^^^^^^^^^^^^^^ 62 | 63 | .. command:: FetchContent_Declare 64 | 65 | .. code-block:: cmake 66 | 67 | FetchContent_Declare(<name> <contentOptions>...) 68 | 69 | The ``FetchContent_Declare()`` function records the options that describe 70 | how to populate the specified content, but if such details have already 71 | been recorded earlier in this project (regardless of where in the project 72 | hierarchy), this and all later calls for the same content ``<name>`` are 73 | ignored. This "first to record, wins" approach is what allows hierarchical 74 | projects to have parent projects override content details of child projects. 75 | 76 | The content ``<name>`` can be any string without spaces, but good practice 77 | would be to use only letters, numbers and underscores. The name will be 78 | treated case-insensitively and it should be obvious for the content it 79 | represents, often being the name of the child project or the value given 80 | to its top level :command:`project` command (if it is a CMake project). 81 | For well-known public projects, the name should generally be the official 82 | name of the project. Choosing an unusual name makes it unlikely that other 83 | projects needing that same content will use the same name, leading to 84 | the content being populated multiple times. 85 | 86 | The ``<contentOptions>`` can be any of the download or update/patch options 87 | that the :command:`ExternalProject_Add` command understands.
The configure, 88 | build, install and test steps are explicitly disabled and therefore options 89 | related to them will be ignored. In most cases, ``<contentOptions>`` will 90 | just be a couple of options defining the download method and method-specific 91 | details like a commit tag or archive hash. For example: 92 | 93 | .. code-block:: cmake 94 | 95 | FetchContent_Declare( 96 | googletest 97 | GIT_REPOSITORY https://github.com/google/googletest.git 98 | GIT_TAG release-1.8.0 99 | ) 100 | 101 | FetchContent_Declare( 102 | myCompanyIcons 103 | URL https://intranet.mycompany.com/assets/iconset_1.12.tar.gz 104 | URL_HASH MD5=5588a7b18261c20068beabfb4f530b87 105 | ) 106 | 107 | FetchContent_Declare( 108 | myCompanyCertificates 109 | SVN_REPOSITORY svn+ssh://svn.mycompany.com/srv/svn/trunk/certs 110 | SVN_REVISION -r12345 111 | ) 112 | 113 | Populating The Content 114 | ^^^^^^^^^^^^^^^^^^^^^^ 115 | 116 | .. command:: FetchContent_Populate 117 | 118 | .. code-block:: cmake 119 | 120 | FetchContent_Populate( <name> ) 121 | 122 | In most cases, the only argument given to ``FetchContent_Populate()`` is the 123 | ``<name>``. When used this way, the command assumes the content details have 124 | been recorded by an earlier call to :command:`FetchContent_Declare`. The 125 | details are stored in a global property, so they are unaffected by things 126 | like variable or directory scope. Therefore, it doesn't matter where in the 127 | project the details were previously declared, as long as they have been 128 | declared before the call to ``FetchContent_Populate()``. Those saved details 129 | are then used to construct a call to :command:`ExternalProject_Add` in a 130 | private sub-build to perform the content population immediately. The 131 | implementation of ``ExternalProject_Add()`` ensures that if the content has 132 | already been populated in a previous CMake run, that content will be reused 133 | rather than repopulating them again.
For the common case where population 134 | involves downloading content, the cost of the download is only paid once. 135 | 136 | An internal global property records when a particular content population 137 | request has been processed. If ``FetchContent_Populate()`` is called more 138 | than once for the same content name within a configure run, the second call 139 | will halt with an error. Projects can and should check whether content 140 | population has already been processed with the 141 | :command:`FetchContent_GetProperties` command before calling 142 | ``FetchContent_Populate()``. 143 | 144 | ``FetchContent_Populate()`` will set three variables in the scope of the 145 | caller; ``<lcName>_POPULATED``, ``<lcName>_SOURCE_DIR`` and 146 | ``<lcName>_BINARY_DIR``, where ``<lcName>`` is the lowercased ``<name>``. 147 | ``<lcName>_POPULATED`` will always be set to ``True`` by the call. 148 | ``<lcName>_SOURCE_DIR`` is the location where the 149 | content can be found upon return (it will have already been populated), while 150 | ``<lcName>_BINARY_DIR`` is a directory intended for use as a corresponding 151 | build directory. The main use case for the two directory variables is to 152 | call :command:`add_subdirectory` immediately after population, i.e.: 153 | 154 | .. code-block:: cmake 155 | 156 | FetchContent_Populate(FooBar ...) 157 | add_subdirectory(${foobar_SOURCE_DIR} ${foobar_BINARY_DIR}) 158 | 159 | The values of the three variables can also be retrieved from anywhere in the 160 | project hierarchy using the :command:`FetchContent_GetProperties` command. 161 | 162 | A number of cache variables influence the behavior of all content population 163 | performed using details saved from a :command:`FetchContent_Declare` call: 164 | 165 | ``FETCHCONTENT_BASE_DIR`` 166 | In most cases, the saved details do not specify any options relating to the 167 | directories to use for the internal sub-build, final source and build areas.
168 | It is generally best to leave these decisions up to the ``FetchContent`` 169 | module to handle on the project's behalf. The ``FETCHCONTENT_BASE_DIR`` 170 | cache variable controls the point under which all content population 171 | directories are collected, but in most cases developers would not need to 172 | change this. The default location is ``${CMAKE_BINARY_DIR}/_deps``, but if 173 | developers change this value, they should aim to keep the path short and 174 | just below the top level of the build tree to avoid running into path 175 | length problems on Windows. 176 | 177 | ``FETCHCONTENT_QUIET`` 178 | The logging output during population can be quite verbose, making the 179 | configure stage quite noisy. This cache option (``ON`` by default) hides 180 | all population output unless an error is encountered. If experiencing 181 | problems with hung downloads, temporarily switching this option off may 182 | help diagnose which content population is causing the issue. 183 | 184 | ``FETCHCONTENT_FULLY_DISCONNECTED`` 185 | When this option is enabled, no attempt is made to download or update 186 | any content. It is assumed that all content has already been populated in 187 | a previous run or the source directories have been pointed at existing 188 | contents the developer has provided manually (using options described 189 | further below). When the developer knows that no changes have been made to 190 | any content details, turning this option ``ON`` can significantly speed up 191 | the configure stage. It is ``OFF`` by default. 192 | 193 | ``FETCHCONTENT_UPDATES_DISCONNECTED`` 194 | This is a less severe download/update control compared to 195 | ``FETCHCONTENT_FULLY_DISCONNECTED``. Instead of bypassing all download and 196 | update logic, the ``FETCHCONTENT_UPDATES_DISCONNECTED`` only disables the 197 | update stage. Therefore, if content has not been downloaded previously, 198 | it will still be downloaded when this option is enabled. 
This can speed up 199 | the configure stage, but not as much as 200 | ``FETCHCONTENT_FULLY_DISCONNECTED``. It is ``OFF`` by default. 201 | 202 | In addition to the above cache variables, the following cache variables are 203 | also defined for each content name (``<ucName>`` is the uppercased value of 204 | ``<name>``): 205 | 206 | ``FETCHCONTENT_SOURCE_DIR_<ucName>`` 207 | If this is set, no download or update steps are performed for the specified 208 | content and the ``<lcName>_SOURCE_DIR`` variable returned to the caller is 209 | pointed at this location. This gives developers a way to have a separate 210 | checkout of the content that they can modify freely without interference 211 | from the build. The build simply uses that existing source, but it still 212 | defines ``<lcName>_BINARY_DIR`` to point inside its own build area. 213 | Developers are strongly encouraged to use this mechanism rather than 214 | editing the sources populated in the default location, as changes to 215 | sources in the default location can be lost when content population details 216 | are changed by the project. 217 | 218 | ``FETCHCONTENT_UPDATES_DISCONNECTED_<ucName>`` 219 | This is the per-content equivalent of 220 | ``FETCHCONTENT_UPDATES_DISCONNECTED``. If the global option or this option 221 | is ``ON``, then updates will be disabled for the named content. 222 | Disabling updates for individual content can be useful for content whose 223 | details rarely change, while still leaving other frequently changing 224 | content with updates enabled. 225 | 226 | 227 | The ``FetchContent_Populate()`` command also supports a syntax allowing the 228 | content details to be specified directly rather than using any saved 229 | details. This is more low-level and use of this form is generally to be 230 | avoided in favour of using saved content details as outlined above.
231 | Nevertheless, in certain situations it can be useful to invoke the content 232 | population as an isolated operation (typically as part of implementing some 233 | other higher level feature or when using CMake in script mode): 234 | 235 | .. code-block:: cmake 236 | 237 | FetchContent_Populate( <name> 238 | [QUIET] 239 | [SUBBUILD_DIR <subBuildDir>] 240 | [SOURCE_DIR <srcDir>] 241 | [BINARY_DIR <binDir>] 242 | ... 243 | ) 244 | 245 | This form has a number of key differences to that where only ``<name>`` is 246 | provided: 247 | 248 | - All required population details are assumed to have been provided directly 249 | in the call to ``FetchContent_Populate()``. Any saved details for 250 | ``<name>`` are ignored. 251 | - No check is made for whether content for ``<name>`` has already been 252 | populated. 253 | - No global property is set to record that the population has occurred. 254 | - No global properties record the source or binary directories used for the 255 | populated content. 256 | - The ``FETCHCONTENT_FULLY_DISCONNECTED`` and 257 | ``FETCHCONTENT_UPDATES_DISCONNECTED`` cache variables are ignored. 258 | 259 | The ``<lcName>_SOURCE_DIR`` and ``<lcName>_BINARY_DIR`` variables are still 260 | returned to the caller, but since these locations are not stored as global 261 | properties when this form is used, they are only available to the calling 262 | scope and below rather than the entire project hierarchy. No 263 | ``<lcName>_POPULATED`` variable is set in the caller's scope with this form. 264 | 265 | The supported options for ``FetchContent_Populate()`` are the same as those 266 | for :command:`FetchContent_Declare()`. Those few options shown just 267 | above are either specific to ``FetchContent_Populate()`` or their behavior is 268 | slightly modified from how :command:`ExternalProject_Add` treats them. 269 | 270 | ``QUIET`` 271 | The ``QUIET`` option can be given to hide the output associated with 272 | populating the specified content.
If the population fails, the output will 273 | be shown regardless of whether this option was given or not so that the 274 | cause of the failure can be diagnosed. The global ``FETCHCONTENT_QUIET`` 275 | cache variable has no effect on ``FetchContent_Populate()`` calls where the 276 | content details are provided directly. 277 | 278 | ``SUBBUILD_DIR`` 279 | The ``SUBBUILD_DIR`` argument can be provided to change the location of the 280 | sub-build created to perform the population. The default value is 281 | ``${CMAKE_CURRENT_BINARY_DIR}/<lcName>-subbuild`` and it would be unusual 282 | to need to override this default. If a relative path is specified, it will 283 | be interpreted as relative to :variable:`CMAKE_CURRENT_BINARY_DIR`. 284 | 285 | ``SOURCE_DIR``, ``BINARY_DIR`` 286 | The ``SOURCE_DIR`` and ``BINARY_DIR`` arguments are supported by 287 | :command:`ExternalProject_Add`, but different default values are used by 288 | ``FetchContent_Populate()``. ``SOURCE_DIR`` defaults to 289 | ``${CMAKE_CURRENT_BINARY_DIR}/<lcName>-src`` and ``BINARY_DIR`` defaults to 290 | ``${CMAKE_CURRENT_BINARY_DIR}/<lcName>-build``. If a relative path is 291 | specified, it will be interpreted as relative to 292 | :variable:`CMAKE_CURRENT_BINARY_DIR`. 293 | 294 | In addition to the above explicit options, any other unrecognized options are 295 | passed through unmodified to :command:`ExternalProject_Add` to perform the 296 | download, patch and update steps. The following options are explicitly 297 | prohibited (they are disabled by the ``FetchContent_Populate()`` command): 298 | 299 | - ``CONFIGURE_COMMAND`` 300 | - ``BUILD_COMMAND`` 301 | - ``INSTALL_COMMAND`` 302 | - ``TEST_COMMAND`` 303 | 304 | If using ``FetchContent_Populate()`` within CMake's script mode, be aware 305 | that the implementation sets up a sub-build which therefore requires a CMake 306 | generator and build tool to be available.
If these cannot be found by 307 | default, then the :variable:`CMAKE_GENERATOR` and/or 308 | :variable:`CMAKE_MAKE_PROGRAM` variables will need to be set appropriately 309 | on the command line invoking the script. 310 | 311 | 312 | Retrieve Population Properties 313 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 314 | 315 | .. command:: FetchContent_GetProperties 316 | 317 | When using saved content details, a call to :command:`FetchContent_Populate` 318 | records information in global properties which can be queried at any time. 319 | This information includes the source and binary directories associated with 320 | the content and also whether or not the content population has been processed 321 | during the current configure run. 322 | 323 | .. code-block:: cmake 324 | 325 | FetchContent_GetProperties( <name> 326 | [SOURCE_DIR <srcDirVar>] 327 | [BINARY_DIR <binDirVar>] 328 | [POPULATED <doneVar>] 329 | ) 330 | 331 | The ``SOURCE_DIR``, ``BINARY_DIR`` and ``POPULATED`` options can be used to 332 | specify which properties should be retrieved. Each option accepts a value 333 | which is the name of the variable in which to store that property. Most of 334 | the time though, only ``<name>`` is given, in which case the call will then 335 | set the same variables as a call to 336 | :command:`FetchContent_Populate(name) <FetchContent_Populate>`. This allows 337 | the following canonical pattern to be used, which ensures that the relevant 338 | variables will always be defined regardless of whether or not the population 339 | has been performed elsewhere in the project already: 340 | 341 | .. code-block:: cmake 342 | 343 | FetchContent_GetProperties(foobar) 344 | if(NOT foobar_POPULATED) 345 | FetchContent_Populate(foobar) 346 | 347 | # Set any custom variables, etc.
here, then 348 | # populate the content as part of this build 349 | 350 | add_subdirectory(${foobar_SOURCE_DIR} ${foobar_BINARY_DIR}) 351 | endif() 352 | 353 | The above pattern allows other parts of the overall project hierarchy to 354 | re-use the same content and ensure that it is only populated once. 355 | 356 | 357 | .. _`fetch-content-examples`: 358 | 359 | Examples 360 | ^^^^^^^^ 361 | 362 | Consider a project hierarchy where ``projA`` is the top level project and it 363 | depends on projects ``projB`` and ``projC``. Both ``projB`` and ``projC`` 364 | can be built standalone and they also both depend on another project 365 | ``projD``. For simplicity, this example will assume that all four projects 366 | are available on a company git server. The ``CMakeLists.txt`` of each project 367 | might have sections like the following: 368 | 369 | *projA*: 370 | 371 | .. code-block:: cmake 372 | 373 | include(FetchContent) 374 | FetchContent_Declare( 375 | projB 376 | GIT_REPOSITORY git@mycompany.com/git/projB.git 377 | GIT_TAG 4a89dc7e24ff212a7b5167bef7ab079d 378 | ) 379 | FetchContent_Declare( 380 | projC 381 | GIT_REPOSITORY git@mycompany.com/git/projC.git 382 | GIT_TAG 4ad4016bd1d8d5412d135cf8ceea1bb9 383 | ) 384 | FetchContent_Declare( 385 | projD 386 | GIT_REPOSITORY git@mycompany.com/git/projD.git 387 | GIT_TAG origin/integrationBranch 388 | ) 389 | 390 | FetchContent_GetProperties(projB) 391 | if(NOT projb_POPULATED) 392 | FetchContent_Populate(projB) 393 | add_subdirectory(${projb_SOURCE_DIR} ${projb_BINARY_DIR}) 394 | endif() 395 | 396 | FetchContent_GetProperties(projC) 397 | if(NOT projc_POPULATED) 398 | FetchContent_Populate(projC) 399 | add_subdirectory(${projc_SOURCE_DIR} ${projc_BINARY_DIR}) 400 | endif() 401 | 402 | *projB*: 403 | 404 | .. 
code-block:: cmake 405 | 406 | include(FetchContent) 407 | FetchContent_Declare( 408 | projD 409 | GIT_REPOSITORY git@mycompany.com/git/projD.git 410 | GIT_TAG 20b415f9034bbd2a2e8216e9a5c9e632 411 | ) 412 | 413 | FetchContent_GetProperties(projD) 414 | if(NOT projd_POPULATED) 415 | FetchContent_Populate(projD) 416 | add_subdirectory(${projd_SOURCE_DIR} ${projd_BINARY_DIR}) 417 | endif() 418 | 419 | 420 | *projC*: 421 | 422 | .. code-block:: cmake 423 | 424 | include(FetchContent) 425 | FetchContent_Declare( 426 | projD 427 | GIT_REPOSITORY git@mycompany.com/git/projD.git 428 | GIT_TAG 7d9a17ad2c962aa13e2fbb8043fb6b8a 429 | ) 430 | 431 | FetchContent_GetProperties(projD) 432 | if(NOT projd_POPULATED) 433 | FetchContent_Populate(projD) 434 | add_subdirectory(${projd_SOURCE_DIR} ${projd_BINARY_DIR}) 435 | endif() 436 | 437 | A few key points should be noted in the above: 438 | 439 | - ``projB`` and ``projC`` define different content details for ``projD``, 440 | but ``projA`` also defines a set of content details for ``projD`` and 441 | because ``projA`` will define them first, the details from ``projB`` and 442 | ``projC`` will not be used. The override details defined by ``projA`` 443 | are not required to match either of those from ``projB`` or ``projC``, but 444 | it is up to the higher level project to ensure that the details it does 445 | define still make sense for the child projects. 446 | - While ``projA`` defined content details for ``projD``, it did not need 447 | to explicitly call ``FetchContent_Populate(projD)`` itself. Instead, it 448 | leaves that to a child project to do (in this case it will be ``projB`` 449 | since it is added to the build ahead of ``projC``). If ``projA`` needed to 450 | customize how the ``projD`` content was brought into the build as well 451 | (e.g. define some CMake variables before calling 452 | :command:`add_subdirectory` after populating), it would do the call to 453 | ``FetchContent_Populate()``, etc. 
just as it did for the ``projB`` and 454 | ``projC`` content. For higher level projects, it is usually enough to 455 | just define the override content details and leave the actual population 456 | to the child projects. This saves repeating the same thing at each level 457 | of the project hierarchy unnecessarily. 458 | - Even though ``projA`` is the top level project in this example, it still 459 | checks whether ``projB`` and ``projC`` have already been populated before 460 | going ahead to do those populations. This makes ``projA`` able to be more 461 | easily incorporated as a child of some other higher level project in the 462 | future if required. Always protect a call to 463 | :command:`FetchContent_Populate` with a check to 464 | :command:`FetchContent_GetProperties`, even in what may be considered a top 465 | level project at the time. 466 | 467 | 468 | The following example demonstrates how one might download and unpack a 469 | firmware tarball using CMake's :manual:`script mode <cmake(1)>`. The call to 470 | :command:`FetchContent_Populate` specifies all the content details and the 471 | unpacked firmware will be placed in a ``firmware`` directory below the 472 | current working directory. 473 | 474 | *getFirmware.cmake*: 475 | 476 | ..
code-block:: cmake 477 | 478 | # NOTE: Intended to be run in script mode with cmake -P 479 | include(FetchContent) 480 | FetchContent_Populate( 481 | firmware 482 | URL https://mycompany.com/assets/firmware-1.23-arm.tar.gz 483 | URL_HASH MD5=68247684da89b608d466253762b0ff11 484 | SOURCE_DIR firmware 485 | ) 486 | 487 | #]=======================================================================] 488 | 489 | 490 | set(__FetchContent_privateDir "${CMAKE_CURRENT_LIST_DIR}/FetchContent") 491 | 492 | #======================================================================= 493 | # Recording and retrieving content details for later population 494 | #======================================================================= 495 | 496 | # Internal use, projects must not call this directly. It is 497 | # intended for use by FetchContent_Declare() only. 498 | # 499 | # Sets a content-specific global property (not meant for use 500 | # outside of functions defined here in this file) which can later 501 | # be retrieved using __FetchContent_getSavedDetails() with just the 502 | # same content name. If there is already a value stored in the 503 | # property, it is left unchanged and this call has no effect. 504 | # This allows parent projects to define the content details, 505 | # overriding anything a child project may try to set (properties 506 | # are not cached between runs, so the first thing to set it in a 507 | # build will be in control). 
function(__FetchContent_declareDetails contentName)

  # Content names are treated case-insensitively, so the property key is
  # always built from the lowercased name.
  string(TOLOWER ${contentName} lcName)
  set(detailsProp "_FetchContent_${lcName}_savedDetails")

  # "First to record, wins": once the property has been defined, any later
  # call for the same content returns without touching it.
  get_property(detailsRecorded GLOBAL PROPERTY ${detailsProp} DEFINED)
  if(detailsRecorded)
    return()
  endif()

  define_property(GLOBAL PROPERTY ${detailsProp}
    BRIEF_DOCS "Internal implementation detail of FetchContent_Populate()"
    FULL_DOCS "Details used by FetchContent_Populate() for ${contentName}"
  )
  set_property(GLOBAL PROPERTY ${detailsProp} ${ARGN})

endfunction()


# Internal use, projects must not call this directly.
# NOTE(review): presumably called from FetchContent_Populate(), whose
# implementation is outside this section -- confirm.
#
# Counterpart of __FetchContent_declareDetails(): looks up the details
# saved for contentName and stores them in the caller's variable named by
# outVar. Halts with a fatal error if nothing was ever recorded for that
# content name.
function(__FetchContent_getSavedDetails contentName outVar)

  string(TOLOWER ${contentName} lcName)
  set(detailsProp "_FetchContent_${lcName}_savedDetails")

  get_property(detailsRecorded GLOBAL PROPERTY ${detailsProp} DEFINED)
  if(NOT detailsRecorded)
    message(FATAL_ERROR "No content details recorded for ${contentName}")
  endif()

  get_property(savedDetails GLOBAL PROPERTY ${detailsProp})
  set(${outVar} "${savedDetails}" PARENT_SCOPE)

endfunction()


# Saves the population details of the content and supplies defaults for
# SOURCE_DIR and BINARY_DIR (both placed under FETCHCONTENT_BASE_DIR).
function(FetchContent_Declare contentName)

  # SVN_REPOSITORY is the only keyword inspected here. Everything else
  # lands in DECL_UNPARSED_ARGUMENTS and is stored as given.
  cmake_parse_arguments(DECL "" "SVN_REPOSITORY" "" ${ARGN})

  string(TOLOWER ${contentName} lcName)

  unset(srcSuffix)
  unset(svnArgs)
  if(DECL_SVN_REPOSITORY)
    # Append a hash of the svn repository URL to the source dir. If the URL
    # ever changes, the download would otherwise fail because it tries to
    # checkout/update rather than switch the old URL to the new one. Only
    # the first 7 characters of the hash are kept so the source path stays
    # short (path length limits can be a problem on Windows).
    string(SHA1 repoHash ${DECL_SVN_REPOSITORY})
    string(SUBSTRING ${repoHash} 0 7 shortHash)
    set(srcSuffix "-${shortHash}")
    set(svnArgs SVN_REPOSITORY ${DECL_SVN_REPOSITORY})
  endif()

  __FetchContent_declareDetails(
    ${lcName}
    SOURCE_DIR "${FETCHCONTENT_BASE_DIR}/${lcName}-src${srcSuffix}"
    BINARY_DIR "${FETCHCONTENT_BASE_DIR}/${lcName}-build"
    ${svnArgs}
    # Caller-supplied options go last so they can override the defaults above
    ${DECL_UNPARSED_ARGUMENTS}
  )

endfunction()


#=======================================================================
# Set/get whether the specified content has been populated yet.
# The setter also records the source and binary dirs used.
#=======================================================================

# Internal use, projects must not call this directly. It is
# intended for use by the FetchContent_Populate() function to
# record when FetchContent_Populate() is called for a particular
# content name.
590 | function(__FetchContent_setPopulated contentName sourceDir binaryDir) 591 | # Records three global properties: _FetchContent_<lowercased name>_sourceDir, _binaryDir and _populated. 592 | string(TOLOWER ${contentName} contentNameLower) 593 | set(prefix "_FetchContent_${contentNameLower}") 594 | 595 | set(propertyName "${prefix}_sourceDir") 596 | define_property(GLOBAL PROPERTY ${propertyName} 597 | BRIEF_DOCS "Internal implementation detail of FetchContent_Populate()" 598 | FULL_DOCS "Details used by FetchContent_Populate() for ${contentName}" 599 | ) 600 | set_property(GLOBAL PROPERTY ${propertyName} ${sourceDir}) 601 | 602 | set(propertyName "${prefix}_binaryDir") 603 | define_property(GLOBAL PROPERTY ${propertyName} 604 | BRIEF_DOCS "Internal implementation detail of FetchContent_Populate()" 605 | FULL_DOCS "Details used by FetchContent_Populate() for ${contentName}" 606 | ) 607 | set_property(GLOBAL PROPERTY ${propertyName} ${binaryDir}) 608 | # FetchContent_GetProperties() reports POPULATED by testing whether this property is DEFINED. 609 | set(propertyName "${prefix}_populated") 610 | define_property(GLOBAL PROPERTY ${propertyName} 611 | BRIEF_DOCS "Internal implementation detail of FetchContent_Populate()" 612 | FULL_DOCS "Details used by FetchContent_Populate() for ${contentName}" 613 | ) 614 | set_property(GLOBAL PROPERTY ${propertyName} True) 615 | 616 | endfunction() 617 | 618 | 619 | # Set variables in the calling scope for any of the retrievable 620 | # properties. If no specific properties are requested, variables 621 | # will be set for all retrievable properties. 622 | # 623 | # This function is intended to also be used by projects as the canonical 624 | # way to detect whether they should call FetchContent_Populate() 625 | # and pull the populated source into the build with add_subdirectory(), 626 | # if they are using the populated content in that way.
627 | function(FetchContent_GetProperties contentName) 628 | 629 | string(TOLOWER ${contentName} contentNameLower) 630 | # Each keyword takes the name of the variable to set in the caller's scope. 631 | set(options "") 632 | set(oneValueArgs SOURCE_DIR BINARY_DIR POPULATED) 633 | set(multiValueArgs "") 634 | 635 | cmake_parse_arguments(ARG "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) 636 | 637 | if(NOT ARG_SOURCE_DIR AND 638 | NOT ARG_BINARY_DIR AND 639 | NOT ARG_POPULATED) 640 | # No specific properties requested, provide them all 641 | set(ARG_SOURCE_DIR ${contentNameLower}_SOURCE_DIR) 642 | set(ARG_BINARY_DIR ${contentNameLower}_BINARY_DIR) 643 | set(ARG_POPULATED ${contentNameLower}_POPULATED) 644 | endif() 645 | 646 | set(prefix "_FetchContent_${contentNameLower}") 647 | # The two directory variables are set only when a value has been recorded for them. 648 | if(ARG_SOURCE_DIR) 649 | set(propertyName "${prefix}_sourceDir") 650 | get_property(value GLOBAL PROPERTY ${propertyName}) 651 | if(value) 652 | set(${ARG_SOURCE_DIR} ${value} PARENT_SCOPE) 653 | endif() 654 | endif() 655 | 656 | if(ARG_BINARY_DIR) 657 | set(propertyName "${prefix}_binaryDir") 658 | get_property(value GLOBAL PROPERTY ${propertyName}) 659 | if(value) 660 | set(${ARG_BINARY_DIR} ${value} PARENT_SCOPE) 661 | endif() 662 | endif() 663 | # The POPULATED variable is always set, to the result of the DEFINED query on the _populated property. 664 | if(ARG_POPULATED) 665 | set(propertyName "${prefix}_populated") 666 | get_property(value GLOBAL PROPERTY ${propertyName} DEFINED) 667 | set(${ARG_POPULATED} ${value} PARENT_SCOPE) 668 | endif() 669 | 670 | endfunction() 671 | 672 | 673 | #======================================================================= 674 | # Performing the population 675 | #======================================================================= 676 | 677 | # The value of contentName will always have been lowercased by the caller. 678 | # All other arguments are assumed to be options that are understood by 679 | # ExternalProject_Add(), except for QUIET and SUBBUILD_DIR.
680 | function(__FetchContent_directPopulate contentName) 681 | # Writes a sub-project into SUBBUILD_DIR from CMakeLists.cmake.in, then configures and builds it with the current CMake. 682 | set(options 683 | QUIET 684 | ) 685 | set(oneValueArgs 686 | SUBBUILD_DIR 687 | SOURCE_DIR 688 | BINARY_DIR 689 | # Prevent the following from being passed through 690 | CONFIGURE_COMMAND 691 | BUILD_COMMAND 692 | INSTALL_COMMAND 693 | TEST_COMMAND 694 | ) 695 | set(multiValueArgs "") 696 | 697 | cmake_parse_arguments(ARG "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) 698 | # Relative SUBBUILD_DIR/SOURCE_DIR/BINARY_DIR values are resolved against CMAKE_CURRENT_BINARY_DIR. 699 | if(NOT ARG_SUBBUILD_DIR) 700 | message(FATAL_ERROR "Internal error: SUBBUILD_DIR not set") 701 | elseif(NOT IS_ABSOLUTE "${ARG_SUBBUILD_DIR}") 702 | set(ARG_SUBBUILD_DIR "${CMAKE_CURRENT_BINARY_DIR}/${ARG_SUBBUILD_DIR}") 703 | endif() 704 | 705 | if(NOT ARG_SOURCE_DIR) 706 | message(FATAL_ERROR "Internal error: SOURCE_DIR not set") 707 | elseif(NOT IS_ABSOLUTE "${ARG_SOURCE_DIR}") 708 | set(ARG_SOURCE_DIR "${CMAKE_CURRENT_BINARY_DIR}/${ARG_SOURCE_DIR}") 709 | endif() 710 | 711 | if(NOT ARG_BINARY_DIR) 712 | message(FATAL_ERROR "Internal error: BINARY_DIR not set") 713 | elseif(NOT IS_ABSOLUTE "${ARG_BINARY_DIR}") 714 | set(ARG_BINARY_DIR "${CMAKE_CURRENT_BINARY_DIR}/${ARG_BINARY_DIR}") 715 | endif() 716 | 717 | # Ensure the caller can know where to find the source and build directories 718 | # with some convenient variables. Doing this here ensures the caller sees 719 | # the correct result in the case where the default values are overridden by 720 | # the content details set by the project. 721 | set(${contentName}_SOURCE_DIR "${ARG_SOURCE_DIR}" PARENT_SCOPE) 722 | set(${contentName}_BINARY_DIR "${ARG_BINARY_DIR}" PARENT_SCOPE) 723 | 724 | # The unparsed arguments may contain spaces, so build up ARG_EXTRA 725 | # in such a way that it correctly substitutes into the generated 726 | # CMakeLists.txt file with each argument quoted.
727 | unset(ARG_EXTRA) 728 | foreach(arg IN LISTS ARG_UNPARSED_ARGUMENTS) 729 | set(ARG_EXTRA "${ARG_EXTRA} \"${arg}\"") 730 | endforeach() 731 | 732 | # Hide output if requested, but save it to a variable in case there's an 733 | # error so we can show the output upon failure. When not quiet, don't 734 | # capture the output to a variable because the user may want to see the 735 | # output as it happens (e.g. progress during long downloads). Combine both 736 | # stdout and stderr in the one capture variable so the output stays in order. 737 | if (ARG_QUIET) 738 | set(outputOptions 739 | OUTPUT_VARIABLE capturedOutput 740 | ERROR_VARIABLE capturedOutput 741 | ) 742 | else() 743 | set(capturedOutput) 744 | set(outputOptions) 745 | message(STATUS "Populating ${contentName}") 746 | endif() 747 | # Forward the current generator, platform, toolset and make program to the sub-build. 748 | if(CMAKE_GENERATOR) 749 | set(generatorOpts "-G${CMAKE_GENERATOR}") 750 | if(CMAKE_GENERATOR_PLATFORM) 751 | list(APPEND generatorOpts "-A${CMAKE_GENERATOR_PLATFORM}") 752 | endif() 753 | if(CMAKE_GENERATOR_TOOLSET) 754 | list(APPEND generatorOpts "-T${CMAKE_GENERATOR_TOOLSET}") 755 | endif() 756 | 757 | if(CMAKE_MAKE_PROGRAM) 758 | list(APPEND generatorOpts "-DCMAKE_MAKE_PROGRAM:FILEPATH=${CMAKE_MAKE_PROGRAM}") 759 | endif() 760 | 761 | else() 762 | # Likely we've been invoked via CMake's script mode where no 763 | # generator is set (and hence CMAKE_MAKE_PROGRAM could not be 764 | # trusted even if provided). We will have to rely on being 765 | # able to find the default generator and build tool. 766 | unset(generatorOpts) 767 | endif() 768 | 769 | # Create and build a separate CMake project to carry out the population. 770 | # If we've already previously done these steps, they will not cause 771 | # anything to be updated, so extra rebuilds of the project won't occur. 772 | # Make sure to pass through CMAKE_MAKE_PROGRAM in case the main project 773 | # has this set to something not findable on the PATH.
774 | configure_file("${__FetchContent_privateDir}/CMakeLists.cmake.in" 775 | "${ARG_SUBBUILD_DIR}/CMakeLists.txt") 776 | execute_process( 777 | COMMAND ${CMAKE_COMMAND} ${generatorOpts} . 778 | RESULT_VARIABLE result 779 | ${outputOptions} 780 | WORKING_DIRECTORY "${ARG_SUBBUILD_DIR}" 781 | ) 782 | if(result) 783 | if(capturedOutput) 784 | message("${capturedOutput}") 785 | endif() 786 | message(FATAL_ERROR "CMake step for ${contentName} failed: ${result}") 787 | endif() 788 | execute_process( 789 | COMMAND ${CMAKE_COMMAND} --build . 790 | RESULT_VARIABLE result 791 | ${outputOptions} 792 | WORKING_DIRECTORY "${ARG_SUBBUILD_DIR}" 793 | ) 794 | if(result) 795 | if(capturedOutput) 796 | message("${capturedOutput}") 797 | endif() 798 | message(FATAL_ERROR "Build step for ${contentName} failed: ${result}") 799 | endif() 800 | 801 | endfunction() 802 | 803 | # Cache options and base directory read by FetchContent_Populate() and FetchContent_Declare(). 804 | option(FETCHCONTENT_FULLY_DISCONNECTED "Disables all attempts to download or update content and assumes source dirs already exist") 805 | option(FETCHCONTENT_UPDATES_DISCONNECTED "Enables UPDATE_DISCONNECTED behavior for all content population") 806 | option(FETCHCONTENT_QUIET "Enables QUIET option for all content population" ON) 807 | set(FETCHCONTENT_BASE_DIR "${CMAKE_BINARY_DIR}/_deps" CACHE PATH "Directory under which to collect all populated content") 808 | 809 | # Populate the specified content using details stored from 810 | # an earlier call to FetchContent_Declare().
811 | function(FetchContent_Populate contentName) 812 | # Two forms: with extra arguments the content is populated directly from them; with only a name the details saved by FetchContent_Declare() are used. 813 | if(NOT contentName) 814 | message(FATAL_ERROR "Empty contentName not allowed for FetchContent_Populate()") 815 | endif() 816 | 817 | string(TOLOWER ${contentName} contentNameLower) 818 | 819 | if(ARGN) 820 | # This is the direct population form with details fully specified 821 | # as part of the call, so we already have everything we need 822 | __FetchContent_directPopulate( 823 | ${contentNameLower} 824 | SUBBUILD_DIR "${CMAKE_CURRENT_BINARY_DIR}/${contentNameLower}-subbuild" 825 | SOURCE_DIR "${CMAKE_CURRENT_BINARY_DIR}/${contentNameLower}-src" 826 | BINARY_DIR "${CMAKE_CURRENT_BINARY_DIR}/${contentNameLower}-build" 827 | ${ARGN} # Could override any of the above ..._DIR variables 828 | ) 829 | 830 | # Pass source and binary dir variables back to the caller 831 | set(${contentNameLower}_SOURCE_DIR "${${contentNameLower}_SOURCE_DIR}" PARENT_SCOPE) 832 | set(${contentNameLower}_BINARY_DIR "${${contentNameLower}_BINARY_DIR}" PARENT_SCOPE) 833 | 834 | # Don't set global properties, or record that we did this population, since 835 | # this was a direct call outside of the normal declared details form. 836 | # We only want to save values in the global properties for content that 837 | # honours the hierarchical details mechanism so that projects are not 838 | # robbed of the ability to override details set in nested projects. 839 | return() 840 | endif() 841 | 842 | # No details provided, so assume they were saved from an earlier call 843 | # to FetchContent_Declare(). Do a check that we haven't already 844 | # populated this content before in case the caller forgot to check.
845 | FetchContent_GetProperties(${contentName}) 846 | if(${contentNameLower}_POPULATED) 847 | message(FATAL_ERROR "Content ${contentName} already populated in ${${contentNameLower}_SOURCE_DIR}") 848 | endif() 849 | # A non-empty FETCHCONTENT_SOURCE_DIR_<UPPERCASE NAME> cache entry takes priority over any download. 850 | string(TOUPPER ${contentName} contentNameUpper) 851 | set(FETCHCONTENT_SOURCE_DIR_${contentNameUpper} 852 | "${FETCHCONTENT_SOURCE_DIR_${contentNameUpper}}" 853 | CACHE PATH "When not empty, overrides where to find pre-populated content for ${contentName}") 854 | 855 | if(FETCHCONTENT_SOURCE_DIR_${contentNameUpper}) 856 | # The source directory has been explicitly provided in the cache, 857 | # so no population is required 858 | set(${contentNameLower}_SOURCE_DIR "${FETCHCONTENT_SOURCE_DIR_${contentNameUpper}}") 859 | set(${contentNameLower}_BINARY_DIR "${FETCHCONTENT_BASE_DIR}/${contentNameLower}-build") 860 | 861 | elseif(FETCHCONTENT_FULLY_DISCONNECTED) 862 | # Bypass population and assume source is already there from a previous run 863 | set(${contentNameLower}_SOURCE_DIR "${FETCHCONTENT_BASE_DIR}/${contentNameLower}-src") 864 | set(${contentNameLower}_BINARY_DIR "${FETCHCONTENT_BASE_DIR}/${contentNameLower}-build") 865 | # NOTE(review): this branch always uses the default -src/-build locations; SOURCE_DIR/BINARY_DIR overrides stored by FetchContent_Declare() are only read in the else() branch below. 866 | else() 867 | # Support both a global "disconnect all updates" and a per-content 868 | # update test (either one being set disables updates for this content).
869 | option(FETCHCONTENT_UPDATES_DISCONNECTED_${contentNameUpper} 870 | "Enables UPDATE_DISCONNECTED behavior just for population of ${contentName}") 871 | if(FETCHCONTENT_UPDATES_DISCONNECTED OR 872 | FETCHCONTENT_UPDATES_DISCONNECTED_${contentNameUpper}) 873 | set(disconnectUpdates True) 874 | else() 875 | set(disconnectUpdates False) 876 | endif() 877 | 878 | if(FETCHCONTENT_QUIET) 879 | set(quietFlag QUIET) 880 | else() 881 | unset(quietFlag) 882 | endif() 883 | 884 | __FetchContent_getSavedDetails(${contentName} contentDetails) 885 | if("${contentDetails}" STREQUAL "") 886 | message(FATAL_ERROR "No details have been set for content: ${contentName}") 887 | endif() 888 | 889 | __FetchContent_directPopulate( 890 | ${contentNameLower} 891 | ${quietFlag} 892 | UPDATE_DISCONNECTED ${disconnectUpdates} 893 | SUBBUILD_DIR "${FETCHCONTENT_BASE_DIR}/${contentNameLower}-subbuild" 894 | SOURCE_DIR "${FETCHCONTENT_BASE_DIR}/${contentNameLower}-src" 895 | BINARY_DIR "${FETCHCONTENT_BASE_DIR}/${contentNameLower}-build" 896 | # Put the saved details last so they can override any of 897 | # the options we set above (this can include SOURCE_DIR or 898 | # BINARY_DIR) 899 | ${contentDetails} 900 | ) 901 | endif() 902 | 903 | __FetchContent_setPopulated( 904 | ${contentName} 905 | ${${contentNameLower}_SOURCE_DIR} 906 | ${${contentNameLower}_BINARY_DIR} 907 | ) 908 | 909 | # Pass variables back to the caller. The variables passed back here 910 | # must match what FetchContent_GetProperties() sets when it is called 911 | # with just the content name.
912 | set(${contentNameLower}_SOURCE_DIR "${${contentNameLower}_SOURCE_DIR}" PARENT_SCOPE) 913 | set(${contentNameLower}_BINARY_DIR "${${contentNameLower}_BINARY_DIR}" PARENT_SCOPE) 914 | set(${contentNameLower}_POPULATED True PARENT_SCOPE) 915 | 916 | endfunction() 917 | -------------------------------------------------------------------------------- /cmake/Modules/FetchContent/CMakeLists.cmake.in: -------------------------------------------------------------------------------- 1 | # Distributed under the OSI-approved BSD 3-Clause License. See accompanying 2 | # file Copyright.txt or https://cmake.org/licensing for details. 3 | 4 | cmake_minimum_required(VERSION ${CMAKE_VERSION}) 5 | 6 | # We name the project and the target for the ExternalProject_Add() call 7 | # to something that will highlight to the user what we are working on if 8 | # something goes wrong and an error message is produced. 9 | 10 | project(${contentName}-populate NONE) 11 | # The configure, build, install and test commands are set to empty strings below; this sub-project only carries out the population. 12 | include(ExternalProject) 13 | ExternalProject_Add(${contentName}-populate 14 | ${ARG_EXTRA} 15 | SOURCE_DIR "${ARG_SOURCE_DIR}" 16 | BINARY_DIR "${ARG_BINARY_DIR}" 17 | CONFIGURE_COMMAND "" 18 | BUILD_COMMAND "" 19 | INSTALL_COMMAND "" 20 | TEST_COMMAND "" 21 | ) 22 | -------------------------------------------------------------------------------- /cmake/Modules/README.md: -------------------------------------------------------------------------------- 1 | 2 | ## FetchContent 3 | 4 | `FetchContent.cmake` and `FetchContent/CMakeLists.cmake.in` 5 | are copied from `cmake/3.11.0/share/cmake-3.11/Modules`.
6 | -------------------------------------------------------------------------------- /cmake/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pzelasko/kaldialign/951315b57dd98ff5042ca56ab91310c2354b5662/cmake/__init__.py -------------------------------------------------------------------------------- /cmake/pybind11.cmake: -------------------------------------------------------------------------------- 1 | function(download_pybind11) 2 | if(CMAKE_VERSION VERSION_LESS 3.11) 3 | list(APPEND CMAKE_MODULE_PATH ${CMAKE_SOURCE_DIR}/cmake/Modules) 4 | endif() 5 | # CMake older than 3.11 does not ship FetchContent, so the copy vendored in cmake/Modules is put on the module path. 6 | include(FetchContent) 7 | 8 | set(pybind11_URL "https://github.com/pybind/pybind11/archive/refs/tags/v2.10.2.tar.gz") 9 | set(pybind11_HASH "SHA256=93bd1e625e43e03028a3ea7389bba5d3f9f2596abc074b068e70f4ef9b1314ae") 10 | 11 | FetchContent_Declare(pybind11 12 | URL ${pybind11_URL} 13 | URL_HASH ${pybind11_HASH} 14 | ) 15 | # Populate only once: FetchContent_Populate() stops with a fatal error if the content is already populated. 16 | FetchContent_GetProperties(pybind11) 17 | if(NOT pybind11_POPULATED) 18 | message(STATUS "Downloading pybind11 ${pybind11_URL}") 19 | FetchContent_Populate(pybind11) 20 | endif() 21 | message(STATUS "pybind11 is downloaded to ${pybind11_SOURCE_DIR}") 22 | add_subdirectory(${pybind11_SOURCE_DIR} ${pybind11_BINARY_DIR} EXCLUDE_FROM_ALL) 23 | endfunction() 24 | 25 | download_pybind11() 26 | -------------------------------------------------------------------------------- /extensions/kaldi_align.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include "kaldi_align.h" 3 | // Returns the number of errors (ins + del + sub) on a minimum-cost alignment of hyp to ref; the per-type counts are written through ins/del/sub when those pointers are non-null. 4 | int LevenshteinEditDistance(const std::vector &ref, 5 | const std::vector &hyp, 6 | const bool sclite_mode, 7 | int *ins, int *del, int *sub) { 8 | int ins_cost, del_cost, sub_cost; 9 | if (sclite_mode) { 10 | ins_cost = INS_COST_SCLITE; 11 | del_cost = DEL_COST_SCLITE; 12 | sub_cost = SUB_COST_SCLITE; 13 | } else { 14 | ins_cost = INS_COST; 15 | del_cost = DEL_COST; 16 | sub_cost =
SUB_COST; 17 | } 18 | // e is the row for the previous hypothesis position, cur_e the row being filled; both are indexed by reference position. 19 | // temp sequence to remember error type and stats. 20 | std::vector e(ref.size()+1); 21 | std::vector cur_e(ref.size()+1); 22 | // initialize the first hypothesis aligned to the reference at each 23 | // position:[hyp_index =0][ref_index] 24 | for (size_t i =0; i < e.size(); i ++) { 25 | e[i].ins_num = 0; 26 | e[i].sub_num = 0; 27 | e[i].del_num = i; 28 | e[i].total_num = i; 29 | e[i].total_cost = i*del_cost; 30 | } 31 | 32 | // for other alignments 33 | for (size_t hyp_index = 1; hyp_index <= hyp.size(); hyp_index ++) { 34 | cur_e[0] = e[0]; 35 | cur_e[0].ins_num++; 36 | cur_e[0].total_num++; 37 | cur_e[0].total_cost += ins_cost; 38 | for (size_t ref_index = 1; ref_index <= ref.size(); ref_index ++) { 39 | int ins_err = e[ref_index].total_cost + ins_cost; 40 | int del_err = cur_e[ref_index-1].total_cost + del_cost; 41 | int sub_err = e[ref_index-1].total_cost; 42 | if (hyp[hyp_index-1] != ref[ref_index-1]) 43 | sub_err += sub_cost; 44 | // Substitution/match is taken only when strictly cheapest; otherwise deletion if strictly cheaper than insertion, else insertion. 45 | if (sub_err < ins_err && sub_err < del_err) { 46 | cur_e[ref_index] = e[ref_index-1]; 47 | if (hyp[hyp_index-1] != ref[ref_index-1]) { 48 | cur_e[ref_index].sub_num++; // substitution error should be increased 49 | cur_e[ref_index].total_num++; 50 | } 51 | cur_e[ref_index].total_cost = sub_err; 52 | } else if (del_err < ins_err) { 53 | cur_e[ref_index] = cur_e[ref_index-1]; 54 | cur_e[ref_index].total_cost = del_err; 55 | cur_e[ref_index].del_num++; // deletion number is increased. 56 | cur_e[ref_index].total_num++; 57 | } else { 58 | cur_e[ref_index] = e[ref_index]; 59 | cur_e[ref_index].total_cost = ins_err; 60 | cur_e[ref_index].ins_num++; // insertion number is increased. 61 | cur_e[ref_index].total_num++; 62 | } 63 | } 64 | e = cur_e; // alternate for the next recursion.
65 | } 66 | size_t ref_index = e.size()-1; 67 | if (ins != nullptr) { 68 | *ins = e[ref_index].ins_num; 69 | } 70 | if (del != nullptr) { 71 | *del = e[ref_index].del_num; 72 | } 73 | if (sub != nullptr) { 74 | *sub = e[ref_index].sub_num; 75 | } 76 | return e[ref_index].total_num; 77 | } 78 | // Aligns a with b. *output receives the aligned symbol pairs in order, with eps_symbol standing in for a gap; the return value is the minimum alignment cost. 79 | 80 | int LevenshteinAlignment(const std::vector &a, 81 | const std::vector &b, 82 | int eps_symbol, 83 | const bool sclite_mode, 84 | std::vector > *output) { 85 | // Check inputs: 86 | { 87 | assert(output != NULL); 88 | for (size_t i = 0; i < a.size(); i++) assert(a[i] != eps_symbol); 89 | for (size_t i = 0; i < b.size(); i++) assert(b[i] != eps_symbol); 90 | } 91 | output->clear(); 92 | 93 | int ins_cost, del_cost, sub_cost; 94 | if (sclite_mode) { 95 | ins_cost = INS_COST_SCLITE; 96 | del_cost = DEL_COST_SCLITE; 97 | sub_cost = SUB_COST_SCLITE; 98 | } else { 99 | ins_cost = INS_COST; 100 | del_cost = DEL_COST; 101 | sub_cost = SUB_COST; 102 | } 103 | 104 | // this is very memory-inefficiently implemented using a vector of vectors. 105 | size_t M = a.size(), N = b.size(); 106 | size_t m, n; 107 | std::vector > e(M+1); 108 | for (m = 0; m <=M; m++) e[m].resize(N+1); 109 | for (n = 0; n <= N; n++) 110 | e[0][n] = n*ins_cost; 111 | for (m = 1; m <= M; m++) { 112 | e[m][0] = e[m-1][0] + del_cost; 113 | for (n = 1; n <= N; n++) { 114 | int sub_or_ok = e[m-1][n-1] + (a[m-1] == b[n-1] ? 0 : sub_cost); 115 | int del = e[m-1][n] + del_cost; // assumes a == ref, b == hyp. 116 | int ins = e[m][n-1] + ins_cost; 117 | e[m][n] = std::min(sub_or_ok, std::min(del, ins)); 118 | } 119 | } 120 | // get time-reversed output first: trace back. 121 | m = M; 122 | n = N; 123 | while (m != 0 || n != 0) { 124 | size_t last_m, last_n; 125 | if (m == 0) { 126 | last_m = m; 127 | last_n = n-1; 128 | } else if (n == 0) { 129 | last_m = m-1; 130 | last_n = n; 131 | } else { 132 | int sub_or_ok = e[m-1][n-1] + (a[m-1] == b[n-1] ?
0 : sub_cost); 133 | int del = e[m-1][n] + del_cost; // assumes a == ref, b == hyp. 134 | int ins = e[m][n-1] + ins_cost; 135 | // choose sub_or_ok if all else equal. 136 | if (sub_or_ok < std::min(del, ins)) { 137 | last_m = m-1; 138 | last_n = n-1; 139 | } else { 140 | if (del < ins) { // choose del over ins if equal. 141 | last_m = m-1; 142 | last_n = n; 143 | } else { 144 | last_m = m; 145 | last_n = n-1; 146 | } 147 | } 148 | } 149 | int a_sym, b_sym; 150 | a_sym = (last_m == m ? eps_symbol : a[last_m]); 151 | b_sym = (last_n == n ? eps_symbol : b[last_n]); 152 | output->push_back(std::make_pair(a_sym, b_sym)); 153 | m = last_m; 154 | n = last_n; 155 | } 156 | ReverseVector(output); 157 | return e[M][N]; 158 | } 159 | 160 | namespace internal { 161 | 162 | std::vector> GetEdits( 163 | const std::vector> &refs, 164 | const std::vector> &hyps 165 | ) { 166 | std::vector> ans; 167 | for (int i = 0; i != refs.size(); ++i) { 168 | const auto &ref = refs[i]; 169 | const auto dist = LevenshteinEditDistance(ref, hyps[i], false, nullptr, nullptr, nullptr); 170 | ans.emplace_back(dist, ref.size()); 171 | } 172 | return ans; 173 | } 174 | 175 | std::pair GetBootstrapWerInterval( 176 | const std::vector> &edit_sym_per_hyp, 177 | const int replications, 178 | const unsigned int seed) 179 | { 180 | std::mt19937 rng{seed}; 181 | std::uniform_int_distribution<> dist{0, static_cast(edit_sym_per_hyp.size()) - 1}; 182 | 183 | double wer_accum = 0.0, wer_mult_accum = 0.0; 184 | for (int i = 0; i != replications; ++i) { 185 | int num_sym = 0, num_errs = 0; 186 | for (int j = 0; j != edit_sym_per_hyp.size(); ++j) { 187 | const auto selected = dist(rng); 188 | const auto &nerr_nsym = edit_sym_per_hyp[selected]; 189 | num_errs += nerr_nsym.first; 190 | num_sym += nerr_nsym.second; 191 | } 192 | const double wer_rep = static_cast(num_errs) / num_sym; 193 | wer_accum += wer_rep; 194 | wer_mult_accum += std::pow(wer_rep, 2); 195 | } 196 | 197 | const double mean = wer_accum / 
replications; 198 | const double _tmp = wer_mult_accum / replications - std::pow(mean, 2); 199 | double interval = 0.0; 200 | if (_tmp > 0) { 201 | interval = 1.96 * std::sqrt(_tmp); 202 | } 203 | return std::make_pair(mean, interval); 204 | } 205 | 206 | double GetPImprov( 207 | const std::vector> &edit_sym_per_hyp, 208 | const std::vector> &edit_sym_per_hyp2, 209 | const int replications, 210 | const unsigned int seed 211 | ) { 212 | std::mt19937 rng{seed}; 213 | std::uniform_int_distribution<> dist{0, static_cast(edit_sym_per_hyp.size()) - 1}; 214 | 215 | double improv_accum = 0.0; 216 | for (int i = 0; i != replications; ++i) { 217 | int num_errs = 0; 218 | for (int j = 0; j != edit_sym_per_hyp.size(); ++j) { 219 | const auto selected = dist(rng); 220 | num_errs += edit_sym_per_hyp[selected].first - edit_sym_per_hyp2[selected].first; 221 | } 222 | if (num_errs > 0) { 223 | improv_accum += 1; 224 | } 225 | } 226 | 227 | return improv_accum / replications; 228 | } 229 | 230 | } 231 | -------------------------------------------------------------------------------- /extensions/kaldi_align.h: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #define INS_COST 1 7 | #define DEL_COST 1 8 | #define SUB_COST 1 9 | 10 | #define INS_COST_SCLITE 3 11 | #define DEL_COST_SCLITE 3 12 | #define SUB_COST_SCLITE 4 13 | 14 | /// Reverses the contents of a vector. 15 | template 16 | inline void ReverseVector(std::vector *vec) { 17 | assert(vec != NULL); 18 | size_t sz = vec->size(); 19 | for (size_t i = 0; i < sz/2; i++) 20 | std::swap( (*vec)[i], (*vec)[sz-1-i]); 21 | } 22 | 23 | struct error_stats { 24 | int ins_num; 25 | int del_num; 26 | int sub_num; 27 | int total_num; // total number of errors. 28 | int total_cost; // minimum total cost to the current alignment. 29 | }; 30 | // Note that both hyp and ref should not contain noise word in 31 | // the following implementation. 
32 | 33 | // Returns the number of errors between ref and hyp; ins/del/sub receive the per-type counts when non-null. 34 | int LevenshteinEditDistance(const std::vector &ref, 35 | const std::vector &hyp, 36 | const bool sclite_mode, 37 | int *ins, int *del, int *sub); 38 | 39 | // Fills *output with aligned (a, b) symbol pairs, eps_symbol marking a gap, and returns the alignment cost. 40 | int LevenshteinAlignment(const std::vector &a, 41 | const std::vector &b, 42 | int eps_symbol, 43 | const bool sclite_mode, 44 | std::vector > *output); 45 | 46 | // Bootstrap helpers exposed to Python through the bindings in kaldialign.cpp. 47 | namespace internal{ 48 | std::vector> GetEdits( 49 | const std::vector> &refs, 50 | const std::vector> &hyps 51 | ); 52 | 53 | std::pair GetBootstrapWerInterval( 54 | const std::vector> &edit_sym_per_hyp, 55 | const int replications, 56 | const unsigned int seed 57 | ); 58 | 59 | double GetPImprov( 60 | const std::vector> &edit_sym_per_hyp, 61 | const std::vector> &edit_sym_per_hyp2, 62 | const int replications, 63 | const unsigned int seed 64 | ); 65 | } 66 | -------------------------------------------------------------------------------- /extensions/kaldialign.cpp: -------------------------------------------------------------------------------- 1 | #include "kaldi_align.h" 2 | #include "pybind11/pybind11.h" 3 | #include "pybind11/stl.h" 4 | namespace py = pybind11; 5 | // Thin wrappers that turn the C++ results into Python objects. 6 | static py::dict EditDistance(const std::vector &a, 7 | const std::vector &b, 8 | const bool sclite_mode) { 9 | int ins; 10 | int del; 11 | int sub; 12 | // ins/del/sub are always written by LevenshteinEditDistance() because non-null pointers are passed. 13 | int total = LevenshteinEditDistance(a, b, sclite_mode, &ins, &del, &sub); 14 | py::dict ans; 15 | ans["ins"] = ins; 16 | ans["del"] = del; 17 | ans["sub"] = sub; 18 | ans["total"] = total; 19 | return ans; 20 | } 21 | 22 | static std::vector> 23 | Align(const std::vector &a, const std::vector &b, int eps_symbol, const bool sclite_mode) { 24 | std::vector> ans; 25 | LevenshteinAlignment(a, b, eps_symbol, sclite_mode, &ans); 26 | return ans; 27 | } 28 | 29 | static std::vector> GetEdits( 30 | const std::vector> &refs, 31 | const std::vector> &hyps 32 | ) { 33 | return internal::GetEdits(refs, hyps); 34 | } 35 | 36 | static py::tuple GetBootstrapWerInterval( 37 | const std::vector> &edit_sym_per_hyp, 38 |
const int replications, 39 | const unsigned int seed 40 | ) { 41 | const auto ans = internal::GetBootstrapWerInterval(edit_sym_per_hyp, replications, seed); 42 | return py::make_tuple(ans.first, ans.second); 43 | } 44 | 45 | static double GetPImprov( 46 | const std::vector> &edit_sym_per_hyp, 47 | const std::vector> &edit_sym_per_hyp2, 48 | const int replications, 49 | const unsigned int seed 50 | ) { 51 | return internal::GetPImprov(edit_sym_per_hyp, edit_sym_per_hyp2, replications, seed); 52 | } 53 | 54 | PYBIND11_MODULE(_kaldialign, m) { 55 | m.doc() = "Python wrapper for kaldialign"; 56 | m.def("edit_distance", &EditDistance, py::arg("a"), py::arg("b"), py::arg("sclite_mode") = false); 57 | m.def("align", &Align, py::arg("a"), py::arg("b"), py::arg("eps_symbol"), py::arg("sclite_mode") = false); 58 | m.def("_get_edits", &GetEdits, py::arg("refs"), py::arg("hyps")); 59 | m.def("_get_boostrap_wer_interval", &GetBootstrapWerInterval, py::arg("edit_sym_per_hyp"), py::arg("replications") = 10000, py::arg("seed") = 0); 60 | m.def("_get_p_improv", &GetPImprov, py::arg("edit_sym_per_hyp"), py::arg("edit_sym_per_hyp2"), py::arg("replications") = 10000, py::arg("seed") = 0); 61 | } 62 | -------------------------------------------------------------------------------- /kaldialign/__init__.py: -------------------------------------------------------------------------------- 1 | import math 2 | import random 3 | from typing import Dict, Iterable, List, Optional, Sequence, Tuple, TypeVar, Union 4 | 5 | import _kaldialign 6 | 7 | Symbol = TypeVar("Symbol") 8 | 9 | 10 | def edit_distance( 11 | ref: Iterable[Symbol], hyp: Iterable[Symbol], sclite_mode: bool = False 12 | ) -> Dict[str, Union[int, float]]: 13 | """ 14 | Compute the edit distance between sequences ``ref`` and ``hyp``. 15 | Both sequences can be strings or lists of strings or ints. 16 | 17 | Optional ``sclite_mode`` sets INS/DEL/SUB costs to 3/3/4 for 18 | compatibility with sclite tool. 
19 | 20 | Returns a dict with keys: 21 | * ``ins`` -- the number of insertions (in ``hyp`` vs ``ref``) 22 | * ``del`` -- the number of deletions (in ``hyp`` vs ``ref``) 23 | * ``sub`` -- the number of substitutions 24 | * ``total`` -- total number of errors 25 | * ``ref_len`` -- the number of symbols in ``ref`` 26 | * ``err_rate`` -- the error rate (total number of errors divided by ``ref_len``) 27 | """ 28 | int2sym = dict(enumerate(sorted(set(ref) | set(hyp)))) 29 | sym2int = {v: k for k, v in int2sym.items()} 30 | 31 | refi: List[int] = [] 32 | hypi: List[int] = [] 33 | for sym in ref: 34 | refi.append(sym2int[sym]) 35 | 36 | for sym in hyp: 37 | hypi.append(sym2int[sym]) 38 | 39 | ans = _kaldialign.edit_distance(refi, hypi, sclite_mode) 40 | ans["ref_len"] = len(refi) 41 | try: 42 | ans["err_rate"] = ans["total"] / len(refi) 43 | except ZeroDivisionError: 44 | if ans["total"] == 0: 45 | ans["err_rate"] = 0.0 46 | else: 47 | ans["err_rate"] = float("inf") 48 | return ans 49 | 50 | 51 | def align( 52 | ref: Iterable[Symbol], 53 | hyp: Iterable[Symbol], 54 | eps_symbol: Symbol, 55 | sclite_mode: bool = False, 56 | ) -> List[Tuple[Symbol, Symbol]]: 57 | """ 58 | Compute the alignment between sequences ``ref`` and ``hyp``. 59 | Both sequences can be strings or lists of strings or ints. 60 | 61 | ``eps_symbol`` is used as a blank symbol to indicate insertion or deletion. 62 | 63 | Optional ``sclite_mode`` sets INS/DEL/SUB costs to 3/3/4 for 64 | compatibility with sclite tool. 65 | 66 | Returns a list of pairs of alignment symbols. The presence of ``eps_symbol`` 67 | in the first pair index indicates insertion, and in the second pair index, deletion. 68 | Mismatched symbols indicate substitution. 
69 | """ 70 | int2sym = dict(enumerate(sorted(set(ref) | set(hyp) | {eps_symbol}))) 71 | sym2int = {v: k for k, v in int2sym.items()} 72 | 73 | ai: List[int] = [] 74 | bi: List[int] = [] 75 | 76 | for sym in ref: 77 | ai.append(sym2int[sym]) 78 | 79 | for sym in hyp: 80 | bi.append(sym2int[sym]) 81 | 82 | eps_int = sym2int[eps_symbol] 83 | alignment: List[Tuple[int, int]] = _kaldialign.align(ai, bi, eps_int, sclite_mode) 84 | 85 | ali = [] 86 | for idx in range(len(alignment)): 87 | ali.append((int2sym[alignment[idx][0]], int2sym[alignment[idx][1]])) 88 | 89 | return ali 90 | 91 | 92 | def bootstrap_wer_ci( 93 | refs: Sequence[Sequence[Symbol]], 94 | hyps: Sequence[Sequence[Symbol]], 95 | hyps2: Optional[Sequence[Sequence[Symbol]]] = None, 96 | replications: int = 10000, 97 | seed: int = 0, 98 | ) -> Dict: 99 | """ 100 | Compute a boostrapping of WER to extract the 95% confidence interval (CI) 101 | using the bootstrap method of Bisani and Ney [1]. 102 | The implementation is based on Kaldi's ``compute-wer-bootci`` script [2]. 103 | 104 | Args: 105 | refs: A list of reference sequences (str, list[str], list[list[[int]]) 106 | hyps: A list of hypothesis sequences from system1 (str, list[str], list[list[int]]) 107 | hyps2: A list of hypothesis sequences from system2 (str, list[str], list[list[int]]). 108 | When provided, we'll compute CI for both systems as well as the probability 109 | of system2 improving over system1. 110 | replications: The number of replications to use for bootstrapping. 111 | seed: The random seed to reproduce the results. 112 | 113 | Returns: 114 | A dict with results. When scoring a single system (``hyp2_seqs=None``), the keys are: 115 | - "wer" (mean WER estimate), 116 | - "ci95" (95% confidence interval size), 117 | - "ci95min" (95% confidence interval lower bound) 118 | - "ci95max" (95% confidence interval upper bound) 119 | When scoring two systems, the keys are "system1", "system2", and "p_s2_improv_over_s1". 
120 | The first two keys contain dicts as described for the single-system case, and the last key's 121 | value is a float in the range [0, 1]. 122 | 123 | [1] Bisani, M., & Ney, H. (2004, May). Bootstrap estimates for confidence intervals in ASR performance evaluation. 124 | In 2004 IEEE International Conference on Acoustics, Speech, and Signal Processing (Vol. 1, pp. I-409). IEEE. 125 | 126 | [2] https://github.com/kaldi-asr/kaldi/blob/master/src/bin/compute-wer-bootci.cc 127 | """ 128 | from _kaldialign import _get_boostrap_wer_interval, _get_edits, _get_p_improv 129 | 130 | assert len(hyps) == len( 131 | refs 132 | ), f"Inconsistent number of reference ({len(refs)}) and hypothesis ({len(hyps)}) sequences." 133 | assert replications > 0, "The number of replications must be greater than 0." 134 | assert seed >= 0, "The seed must be 0 or greater." 135 | assert not isinstance(refs, str) and not isinstance( 136 | hyps, str 137 | ), "The input must be a list of strings or list of lists of ints." 138 | 139 | refs, hyps, hyps2 = _convert_to_int(refs, hyps, hyps2) 140 | 141 | edit_sym_per_hyp = _get_edits(refs, hyps) 142 | mean, interval = _get_boostrap_wer_interval( 143 | edit_sym_per_hyp, replications=replications, seed=seed 144 | ) 145 | ans1 = _build_results(mean, interval) 146 | if hyps2 is None: 147 | return ans1 148 | 149 | assert len(hyps2) == len( 150 | refs 151 | ), f"Inconsistent number of reference ({len(refs)}) and hypothesis ({len(hyps2)}) sequences for the second system (hyp2_seqs)." 
152 | edit_sym_per_hyp2 = _get_edits(refs, hyps2) 153 | mean2, interval2 = _get_boostrap_wer_interval( 154 | edit_sym_per_hyp2, replications=replications, seed=seed 155 | ) 156 | p_improv = _get_p_improv( 157 | edit_sym_per_hyp, edit_sym_per_hyp2, replications=replications, seed=seed 158 | ) 159 | return { 160 | "system1": ans1, 161 | "system2": _build_results(mean2, interval2), 162 | "p_s2_improv_over_s1": p_improv, 163 | } 164 | 165 | 166 | def _build_results(mean: float, interval: float) -> Dict[str, float]: 167 | return { 168 | "wer": mean, 169 | "ci95": interval, 170 | "ci95min": mean - interval, 171 | "ci95max": mean + interval, 172 | } 173 | 174 | 175 | def _convert_to_int( 176 | ref: Sequence[Sequence[Symbol]], 177 | hyp: Sequence[Sequence[Symbol]], 178 | hyp2: Sequence[Sequence[Symbol]] = None, 179 | ) -> Tuple[List[List[Symbol]], ...]: 180 | sources = [ref, hyp] 181 | if hyp2 is not None: 182 | sources.append(hyp2) 183 | 184 | symbols = sorted( 185 | set(symbol for source in sources for seq in source for symbol in seq) 186 | ) 187 | int2sym = dict(enumerate(symbols)) 188 | sym2int = {v: k for k, v in int2sym.items()} 189 | 190 | ints = [[[sym2int[item] for item in seq] for seq in source] for source in sources] 191 | if hyp2 is None: 192 | ints.append(None) 193 | return tuple(ints) 194 | -------------------------------------------------------------------------------- /scripts/build_conda.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # 3 | # The following environment variables are supposed to be set by users 4 | # 5 | # - KALDIALIGN_CONDA_TOKEN 6 | # If not set, auto upload to anaconda.org is disabled. 7 | # 8 | # Its value is from https://anaconda.org/kaldialign/settings/access 9 | # (You need to login as user kaldialign to see its value) 10 | # 11 | set -e 12 | export CONDA_BUILD=1 13 | 14 | cur_dir=$(cd $(dirname $BASH_SOURCE) && pwd) 15 | kaldialign_dir=$(cd $cur_dir/.. 
&& pwd) 16 | 17 | cd $kaldialign_dir 18 | 19 | export KALDIALIGN_ROOT_DIR=$kaldialign_dir 20 | echo "KALDIALIGN_DIR: $KALDIALIGN_ROOT_DIR" 21 | 22 | KALDIALIGN_PYTHON_VERSION=$(python -c "import sys; print('.'.join(sys.version.split('.')[:2]))") 23 | 24 | # Example value: 3.8 25 | export KALDIALIGN_PYTHON_VERSION 26 | 27 | if [ -z $KALDIALIGN_CONDA_TOKEN ]; then 28 | echo "Auto upload to anaconda.org is disabled since KALDIALIGN_CONDA_TOKEN is not set" 29 | conda build --no-test --no-anaconda-upload ./scripts/conda/kaldialign 30 | else 31 | conda build --no-test --token $KALDIALIGN_CONDA_TOKEN ./scripts/conda/kaldialign 32 | fi 33 | -------------------------------------------------------------------------------- /scripts/conda/kaldialign/meta.yaml: -------------------------------------------------------------------------------- 1 | package: 2 | name: kaldialign 3 | version: "0.9.2" 4 | 5 | source: 6 | path: "{{ environ.get('KALDIALIGN_ROOT_DIR') }}" 7 | 8 | build: 9 | number: 0 10 | string: py{{ environ.get('KALDIALIGN_PYTHON_VERSION') }} 11 | script: {{ PYTHON }} setup.py install --single-version-externally-managed --record=record.txt 12 | 13 | requirements: 14 | build: 15 | - {{ compiler('c') }} # [win] 16 | - {{ compiler('cxx') }} # [win] 17 | 18 | host: 19 | - anaconda-client 20 | - conda-build 21 | - cmake 22 | - python 23 | run: 24 | - python 25 | 26 | about: 27 | home: https://github.com/pzelasko/kaldialign 28 | license: Apache V2 29 | license_file: LICENSE 30 | summary: Python wrappers for Kaldi Levenshtein's distance and alignment code. 31 | description: | 32 | A small package that exposes edit distance computation functions from Kaldi. 33 | It uses the original Kaldi code and wraps it using pybind11. 
34 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import os 4 | import platform 5 | import re 6 | import sys 7 | from pathlib import Path 8 | 9 | import setuptools 10 | from setuptools.command.build_ext import build_ext 11 | 12 | cur_dir = os.path.dirname(os.path.abspath(__file__)) 13 | 14 | 15 | def is_windows(): 16 | return platform.system() == "Windows" 17 | 18 | 19 | def cmake_extension(name, *args, **kwargs) -> setuptools.Extension: 20 | kwargs["language"] = "c++" 21 | sources = [] 22 | return setuptools.Extension(name, sources, *args, **kwargs) 23 | 24 | 25 | class BuildExtension(build_ext): 26 | def build_extension(self, ext: setuptools.extension.Extension): 27 | build_dir = self.build_temp 28 | os.makedirs(build_dir, exist_ok=True) 29 | 30 | # build/lib.linux-x86_64-3.8 31 | os.makedirs(self.build_lib, exist_ok=True) 32 | 33 | kaldialign_dir = os.path.dirname(os.path.abspath(__file__)) 34 | install_dir = Path(self.build_lib).resolve() / "kaldialign" 35 | 36 | cmake_args = os.environ.get("KALDIALIGN_CMAKE_ARGS", "") 37 | make_args = os.environ.get("KALDIALIGN_MAKE_ARGS", "") 38 | system_make_args = os.environ.get("MAKEFLAGS", "") 39 | 40 | if cmake_args == "": 41 | cmake_args = "-DCMAKE_BUILD_TYPE=Release" 42 | 43 | extra_cmake_args = f" -DCMAKE_INSTALL_PREFIX={install_dir}" 44 | 45 | if make_args == "" and system_make_args == "": 46 | print("For fast compilation, run:") 47 | print('export KALDIALIGN_MAKE_ARGS="-j"; python setup.py install') 48 | make_args = "-j4" 49 | print("Setting make_args to '-j4'") 50 | 51 | if "PYTHON_EXECUTABLE" not in cmake_args: 52 | print(f"Setting PYTHON_EXECUTABLE to {sys.executable}") 53 | cmake_args += f" -DPYTHON_EXECUTABLE={sys.executable}" 54 | 55 | cmake_args += extra_cmake_args 56 | 57 | if not is_windows(): 58 | build_cmd = f""" 59 | cd {self.build_temp} 60 | 
61 | cmake {cmake_args} {kaldialign_dir} 62 | 63 | make {make_args} install 64 | """ 65 | print(f"build command is:\n{build_cmd}") 66 | 67 | ret = os.system(build_cmd) 68 | if ret != 0: 69 | raise Exception( 70 | "\nBuild kaldialign failed. Please check the error message.\n" 71 | "You can ask for help by creating an issue on GitHub.\n" 72 | "\nClick:\n" 73 | " https://github.com/pzelasko/kaldialign/issues/new\n" 74 | ) 75 | return 76 | 77 | # for windows 78 | build_cmd = f""" 79 | cmake {cmake_args} -B {self.build_temp} -S {cur_dir} 80 | cmake --build {self.build_temp} --target install --config Release -- -m 81 | """ 82 | print(f"build command is:\n{build_cmd}") 83 | 84 | ret = os.system(f"cmake {cmake_args} -B {self.build_temp} -S {cur_dir}") 85 | if ret != 0: 86 | raise Exception("Failed to build kaldialign") 87 | 88 | ret = os.system( 89 | f"cmake --build {self.build_temp} --target install --config Release -- -m" 90 | ) 91 | if ret != 0: 92 | raise Exception("Failed to build kaldialign") 93 | 94 | 95 | def read_long_description(): 96 | with open("README.md", encoding="utf8") as f: 97 | readme = f.read() 98 | return readme 99 | 100 | 101 | def get_package_version(): 102 | with open("CMakeLists.txt") as f: 103 | content = f.read() 104 | 105 | latest_version = re.search(r"set\(KALDIALIGN_VERSION (.*)\)", content).group(1) 106 | latest_version = latest_version.strip('"') 107 | return latest_version 108 | 109 | 110 | with open("kaldialign/__init__.py", "a") as f: 111 | f.write(f"__version__ = '{get_package_version()}'\n") 112 | 113 | 114 | setuptools.setup( 115 | name="kaldialign", 116 | version=get_package_version(), 117 | author="Piotr Żelasko", 118 | author_email="pzelasko@jhu.edu", 119 | package_dir={ 120 | "kaldialign": "kaldialign", 121 | }, 122 | packages=["kaldialign"], 123 | url="https://github.com/pzelasko/kaldialign", 124 | description="Kaldi alignment methods wrapped into Python", 125 | long_description=read_long_description(), 126 | 
long_description_content_type="text/markdown", 127 | ext_modules=[cmake_extension("_kaldialign")], 128 | cmdclass={"build_ext": BuildExtension}, 129 | extras_require={"test": ["pytest"]}, 130 | keywords=[ 131 | "natural language processing", 132 | "speech recognition", 133 | "machine learning", 134 | ], 135 | classifiers=[ 136 | "Programming Language :: Python :: 3.6", 137 | "Programming Language :: Python :: 3.7", 138 | "Programming Language :: Python :: 3.8", 139 | "Programming Language :: Python :: 3.9", 140 | "Programming Language :: Python :: 3.10", 141 | "Programming Language :: Python :: 3.11", 142 | "Programming Language :: Python :: 3.12", 143 | "Programming Language :: Python :: 3.13", 144 | "Development Status :: 5 - Production/Stable", 145 | "Intended Audience :: Developers", 146 | "License :: OSI Approved :: Apache Software License", 147 | "Operating System :: OS Independent", 148 | "Topic :: Software Development :: Libraries :: Python Modules", 149 | "Topic :: Text Processing :: Linguistic", 150 | "Topic :: Scientific/Engineering :: Artificial Intelligence", 151 | "Topic :: Scientific/Engineering :: Mathematics", 152 | ], 153 | zip_safe=False, 154 | license="Apache licensed, as found in the LICENSE file", 155 | ) 156 | 157 | # remove the line __dev_version__ from k2/python/k2/__init__.py 158 | with open("kaldialign/__init__.py", "r") as f: 159 | lines = f.readlines() 160 | 161 | with open("kaldialign/__init__.py", "w") as f: 162 | for line in lines: 163 | if "__version__" not in line: 164 | f.write(line) 165 | -------------------------------------------------------------------------------- /tests/test_align.py: -------------------------------------------------------------------------------- 1 | from functools import partial 2 | 3 | import pytest 4 | 5 | from kaldialign import align, bootstrap_wer_ci, edit_distance 6 | 7 | EPS = "*" 8 | 9 | 10 | def test_align(): 11 | a = ["a", "b", "c"] 12 | b = ["a", "s", "x", "c"] 13 | ali = align(a, b, EPS) 14 | 
assert ali == [("a", "a"), ("b", "s"), (EPS, "x"), ("c", "c")] 15 | dist = edit_distance(a, b) 16 | assert dist == { 17 | "ins": 1, 18 | "del": 0, 19 | "sub": 1, 20 | "total": 2, 21 | "ref_len": 3, 22 | "err_rate": 2 / 3, 23 | } 24 | 25 | a = ["a", "b"] 26 | b = ["b", "c"] 27 | ali = align(a, b, EPS) 28 | assert ali == [("a", EPS), ("b", "b"), (EPS, "c")] 29 | dist = edit_distance(a, b) 30 | assert dist == { 31 | "ins": 1, 32 | "del": 1, 33 | "sub": 0, 34 | "total": 2, 35 | "ref_len": 2, 36 | "err_rate": 1.0, 37 | } 38 | 39 | a = ["A", "B", "C"] 40 | b = ["D", "C", "A"] 41 | ali = align(a, b, EPS) 42 | assert ali == [("A", "D"), ("B", EPS), ("C", "C"), (EPS, "A")] 43 | dist = edit_distance(a, b) 44 | assert dist == { 45 | "ins": 1, 46 | "del": 1, 47 | "sub": 1, 48 | "total": 3, 49 | "ref_len": 3, 50 | "err_rate": 1.0, 51 | } 52 | 53 | a = ["A", "B", "C", "D"] 54 | b = ["C", "E", "D", "F"] 55 | ali = align(a, b, EPS) 56 | assert ali == [ 57 | ("A", EPS), 58 | ("B", EPS), 59 | ("C", "C"), 60 | (EPS, "E"), 61 | ("D", "D"), 62 | (EPS, "F"), 63 | ] 64 | dist = edit_distance(a, b) 65 | assert dist == { 66 | "ins": 2, 67 | "del": 2, 68 | "sub": 0, 69 | "total": 4, 70 | "ref_len": 4, 71 | "err_rate": 1.0, 72 | } 73 | 74 | 75 | def test_edit_distance(): 76 | a = ["a", "b", "c"] 77 | b = ["a", "s", "x", "c"] 78 | results = edit_distance(a, b) 79 | assert results == { 80 | "ins": 1, 81 | "del": 0, 82 | "sub": 1, 83 | "total": 2, 84 | "ref_len": 3, 85 | "err_rate": 2 / 3, 86 | } 87 | 88 | 89 | def test_edit_distance_zero_len_ref_zero_err(): 90 | a = [] 91 | b = [] 92 | results = edit_distance(a, b) 93 | assert results == { 94 | "ins": 0, 95 | "del": 0, 96 | "sub": 0, 97 | "total": 0, 98 | "ref_len": 0, 99 | "err_rate": 0, 100 | } 101 | 102 | 103 | def test_edit_distance_zero_len_ref_with_err(): 104 | a = [] 105 | b = ["a"] 106 | results = edit_distance(a, b) 107 | assert results == { 108 | "ins": 1, 109 | "del": 0, 110 | "sub": 0, 111 | "total": 1, 112 | "ref_len": 0, 113 | 
"err_rate": float("inf"), 114 | } 115 | 116 | 117 | def test_edit_distance_sclite(): 118 | a = ["a", "b"] 119 | b = ["b", "c"] 120 | results = edit_distance(a, b, sclite_mode=True) 121 | assert results == { 122 | "ins": 1, 123 | "del": 1, 124 | "sub": 0, 125 | "total": 2, 126 | "ref_len": 2, 127 | "err_rate": 1.0, 128 | } 129 | 130 | 131 | approx = partial(pytest.approx, abs=3e-3) 132 | 133 | 134 | def test_bootstrap_wer_ci_1system(): 135 | ref = [ 136 | ("a", "b", "c"), 137 | ("d", "e", "f"), 138 | ] 139 | 140 | hyp = [ 141 | ("a", "b", "d"), 142 | ("e", "f", "f"), 143 | ] 144 | 145 | ans = bootstrap_wer_ci(ref, hyp) 146 | print(ans) 147 | 148 | assert ans["wer"] == approx(0.50) 149 | assert ans["ci95"] == approx(0.23) 150 | assert ans["ci95min"] == approx(0.269) 151 | assert ans["ci95max"] == approx(0.731) 152 | 153 | 154 | def test_bootstrap_wer_ci_2system(): 155 | ref = [ 156 | ("a", "b", "c"), 157 | ("d", "e", "f"), 158 | ] 159 | 160 | hyp = [ 161 | ("a", "b", "d"), 162 | ("e", "f", "f"), 163 | ] 164 | 165 | hyp2 = [ 166 | ("a", "b", "c"), 167 | ("e", "e", "f"), 168 | ] 169 | 170 | ans = bootstrap_wer_ci(ref, hyp, hyp2) 171 | print(ans) 172 | 173 | s = ans["system1"] 174 | assert s["wer"] == approx(0.50) 175 | assert s["ci95"] == approx(0.23) 176 | assert s["ci95min"] == approx(0.269) 177 | assert s["ci95max"] == approx(0.731) 178 | 179 | s = ans["system2"] 180 | assert s["wer"] == approx(0.166) 181 | assert s["ci95"] == approx(0.231) 182 | assert s["ci95min"] == approx(-0.064) 183 | assert s["ci95max"] == approx(0.397) 184 | 185 | assert ans["p_s2_improv_over_s1"] == 1.0 186 | 187 | --------------------------------------------------------------------------------