├── .clang-format ├── .github └── workflows │ ├── CI.yml │ ├── copyright-update.yml │ ├── deploy.yml │ └── links_fail_fast.yml ├── .gitignore ├── .gitmodules ├── CMakeLists.txt ├── CONTRIBUTING.md ├── Dockerfile ├── LICENSE ├── README.md ├── docs ├── NLP-Format.md ├── Synonyms-Format.md ├── Usage.md └── json_log_schema.json ├── sample_data ├── aaddf.txt ├── abcdef.txt └── synonyms.rules.txt ├── src ├── AdaptedComposition.cpp ├── AdaptedComposition.h ├── AlignmentTraversor.cpp ├── AlignmentTraversor.h ├── Ctm.cpp ├── Ctm.h ├── FstFileLoader.cpp ├── FstFileLoader.h ├── FstLoader.cpp ├── FstLoader.h ├── IComposition.h ├── Nlp.cpp ├── Nlp.h ├── OneBestFstLoader.cpp ├── OneBestFstLoader.h ├── PathHeap.cpp ├── PathHeap.h ├── StandardComposition.cpp ├── StandardComposition.h ├── SynonymEngine.cpp ├── SynonymEngine.h ├── Walker.cpp ├── Walker.h ├── fast-d.cpp ├── fast-d.h ├── fstalign.cpp ├── fstalign.h ├── json_logging.h ├── logging.cpp ├── logging.h ├── main.cpp ├── utilities.cpp ├── utilities.h ├── version.h ├── wer.cpp └── wer.h ├── test ├── CMakeLists.txt ├── compose-tests-utils.h ├── compose-tests.cc ├── data │ ├── align_1.aligned.punc_case.nlp │ ├── align_1.hyp.ctm │ ├── align_1.hyp.punc_case.ctm │ ├── align_1.norm.json │ ├── align_1.ref.aligned.nlp │ ├── align_1.ref.nlp │ ├── align_2.hyp.ctm │ ├── align_2.norm.json │ ├── align_2.ref.aligned.nlp │ ├── align_2.ref.aligned.std.nlp │ ├── align_2.ref.nlp │ ├── align_3.hyp.ctm │ ├── align_3.norm.json │ ├── align_3.ref.aligned.nlp │ ├── align_3.ref.nlp │ ├── align_4.hyp1.ctm │ ├── align_4.hyp2.ctm │ ├── align_4.norm.json │ ├── align_4.ref.aligned1.nlp │ ├── align_4.ref.aligned2.nlp │ ├── align_4.ref.nlp │ ├── align_5.hyp1.ctm │ ├── align_5.hyp2.ctm │ ├── align_5.ref.aligned1-2.nlp │ ├── align_5.ref.aligned1.nlp │ ├── align_5.ref.aligned2-a2.nlp │ ├── align_5.ref.aligned2.nlp │ ├── align_5.ref.nlp │ ├── align_6.hyp.ctm │ ├── align_6.ref.aligned.nlp │ ├── align_6.ref.nlp │ ├── empty.hyp.ctm │ ├── empty.hyp.nlp │ ├── 
empty.hyp.txt │ ├── empty.ref.txt │ ├── fstalign-50.hyp.txt │ ├── fstalign-50.new.sbs.txt │ ├── fstalign-50.ref.txt │ ├── noise.hyp1.ctm │ ├── noise.hyp2.ctm │ ├── noise_1.hyp1.aligned │ ├── noise_1.hyp2.aligned │ ├── noise_1.ref.nlp │ ├── oracle_1.hyp.fst │ ├── oracle_1.ref.txt │ ├── oracle_1.symbols.txt │ ├── short.aligned.case.nlp │ ├── short.aligned.nlp │ ├── short.aligned.punc.nlp │ ├── short.aligned.punc_case.nlp │ ├── short.aligned.strict.nlp │ ├── short.hyp.nlp │ ├── short.hyp.txt │ ├── short.ref.nlp │ ├── short.sbs.txt │ ├── short_punc.hyp.nlp │ ├── short_punc.ref.nlp │ ├── short_punc.wer_tag.json │ ├── speaker_1.hyp.txt │ ├── speaker_1.ref.nlp │ ├── speaker_2.hyp.txt │ ├── speaker_2.ref.nlp │ ├── syn_1.hyp.adapted.sbs │ ├── syn_1.hyp.sbs │ ├── syn_1.hyp.txt │ ├── syn_1.ref.txt │ ├── syn_10.hyp.txt │ ├── syn_10.ref.txt │ ├── syn_2.hyp.txt │ ├── syn_2.ref.txt │ ├── syn_3.hyp.txt │ ├── syn_3.ref.txt │ ├── syn_4.hyp.txt │ ├── syn_4.ref.txt │ ├── syn_5.hyp.txt │ ├── syn_5.ref.txt │ ├── syn_6.hyp.txt │ ├── syn_6.ref.txt │ ├── syn_7.hyp.txt │ ├── syn_7.hyp2.txt │ ├── syn_7.hyp3.txt │ ├── syn_7.hyp4.txt │ ├── syn_7.norm.json │ ├── syn_7.ref.nlp │ ├── syn_7.synonym.rules.txt │ ├── syn_7_ref4.nlp │ ├── syn_8.hyp.ctm │ ├── syn_8.ref.nlp │ ├── syn_9.hyp.txt │ ├── syn_9.ref.txt │ ├── syn_9.synonym.rules.txt │ ├── syn_compound_1.hyp.txt │ ├── syn_compound_1.ref.txt │ ├── syn_compound_2.hyp.txt │ ├── syn_compound_2.ref.txt │ ├── test1.hyp.txt │ ├── test1.ref.txt │ ├── twenty.aligned.punc_case.nlp │ ├── twenty.hyp-a2.sbs │ ├── twenty.hyp.punc_case.txt │ ├── twenty.hyp.sbs │ ├── twenty.hyp.txt │ ├── twenty.norm.json │ ├── twenty.ref.nlp │ ├── twenty.ref.testing.nlp │ ├── twenty.ref.testing.norm.json │ ├── wer_utf.hyp.txt │ └── wer_utf.ref.txt ├── fast-d-tests.cc ├── fstalign_Test.cc └── test-utilties.h └── tools ├── README.md ├── gather_runtime_metrics.sh ├── generate_wer_test_data.pl ├── images ├── 120_short_files.png ├── 120_vs_130_ram.png ├── 120_vs_130_runtime.png └── 
130_short_files.png └── sbs2fst.py /.clang-format: -------------------------------------------------------------------------------- 1 | --- 2 | Language: Cpp 3 | # BasedOnStyle: Google 4 | AccessModifierOffset: -1 5 | AlignAfterOpenBracket: Align 6 | AlignConsecutiveAssignments: false 7 | AlignConsecutiveDeclarations: false 8 | AlignEscapedNewlines: Left 9 | AlignOperands: true 10 | AlignTrailingComments: true 11 | AllowAllParametersOfDeclarationOnNextLine: true 12 | AllowShortBlocksOnASingleLine: false 13 | AllowShortCaseLabelsOnASingleLine: false 14 | AllowShortFunctionsOnASingleLine: All 15 | AllowShortIfStatementsOnASingleLine: true 16 | AllowShortLoopsOnASingleLine: true 17 | AlwaysBreakAfterDefinitionReturnType: None 18 | AlwaysBreakAfterReturnType: None 19 | AlwaysBreakBeforeMultilineStrings: true 20 | AlwaysBreakTemplateDeclarations: true 21 | BinPackArguments: true 22 | BinPackParameters: true 23 | BraceWrapping: 24 | AfterClass: false 25 | AfterControlStatement: false 26 | AfterEnum: false 27 | AfterFunction: false 28 | AfterNamespace: false 29 | AfterObjCDeclaration: false 30 | AfterStruct: false 31 | AfterUnion: false 32 | AfterExternBlock: false 33 | BeforeCatch: false 34 | BeforeElse: false 35 | IndentBraces: false 36 | SplitEmptyFunction: true 37 | SplitEmptyRecord: true 38 | SplitEmptyNamespace: true 39 | BreakBeforeBinaryOperators: None 40 | BreakBeforeBraces: Attach 41 | BreakBeforeInheritanceComma: false 42 | BreakBeforeTernaryOperators: true 43 | BreakConstructorInitializersBeforeComma: false 44 | BreakConstructorInitializers: BeforeColon 45 | BreakAfterJavaFieldAnnotations: false 46 | BreakStringLiterals: true 47 | ColumnLimit: 120 48 | CommentPragmas: '^ IWYU pragma:' 49 | CompactNamespaces: false 50 | ConstructorInitializerAllOnOneLineOrOnePerLine: true 51 | ConstructorInitializerIndentWidth: 4 52 | ContinuationIndentWidth: 4 53 | Cpp11BracedListStyle: true 54 | DerivePointerAlignment: true 55 | DisableFormat: false 56 | 
ExperimentalAutoDetectBinPacking: false 57 | FixNamespaceComments: true 58 | ForEachMacros: 59 | - foreach 60 | - Q_FOREACH 61 | - BOOST_FOREACH 62 | IncludeBlocks: Preserve 63 | IncludeCategories: 64 | - Regex: '^' 65 | Priority: 2 66 | - Regex: '^<.*\.h>' 67 | Priority: 1 68 | - Regex: '^<.*' 69 | Priority: 2 70 | - Regex: '.*' 71 | Priority: 3 72 | IncludeIsMainRegex: '([-_](test|unittest))?$' 73 | IndentCaseLabels: true 74 | IndentPPDirectives: None 75 | IndentWidth: 2 76 | IndentWrappedFunctionNames: false 77 | JavaScriptQuotes: Leave 78 | JavaScriptWrapImports: true 79 | KeepEmptyLinesAtTheStartOfBlocks: false 80 | MacroBlockBegin: '' 81 | MacroBlockEnd: '' 82 | MaxEmptyLinesToKeep: 1 83 | NamespaceIndentation: None 84 | ObjCBlockIndentWidth: 2 85 | ObjCSpaceAfterProperty: false 86 | ObjCSpaceBeforeProtocolList: false 87 | PenaltyBreakAssignment: 2 88 | PenaltyBreakBeforeFirstCallParameter: 1 89 | PenaltyBreakComment: 300 90 | PenaltyBreakFirstLessLess: 120 91 | PenaltyBreakString: 1000 92 | PenaltyExcessCharacter: 1000000 93 | PenaltyReturnTypeOnItsOwnLine: 200 94 | PointerAlignment: Left 95 | ReflowComments: true 96 | SortIncludes: true 97 | SortUsingDeclarations: true 98 | SpaceAfterCStyleCast: false 99 | SpaceAfterTemplateKeyword: true 100 | SpaceBeforeAssignmentOperators: true 101 | SpaceBeforeParens: ControlStatements 102 | SpaceInEmptyParentheses: false 103 | SpacesBeforeTrailingComments: 2 104 | SpacesInAngles: false 105 | SpacesInContainerLiterals: true 106 | SpacesInCStyleCastParentheses: false 107 | SpacesInParentheses: false 108 | SpacesInSquareBrackets: false 109 | Standard: Auto 110 | TabWidth: 8 111 | UseTab: Never 112 | ... 
113 | 114 | -------------------------------------------------------------------------------- /.github/workflows/CI.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | # Trigger the workflow on a push event or manually from the Actions tab 4 | on: [push, workflow_dispatch] 5 | 6 | jobs: 7 | # This workflow has one job 8 | # First we build the docker container, then we run CI tests 9 | build: 10 | runs-on: ubuntu-latest 11 | env: 12 | DOCKER_IMAGE: fstalign 13 | 14 | steps: 15 | - name: Checkout repository and submodules 16 | uses: actions/checkout@v2 17 | with: 18 | submodules: recursive 19 | 20 | - name: Build the docker container 21 | run: docker build . -f Dockerfile -t ${DOCKER_IMAGE} 22 | 23 | - name: Run CI tests 24 | run: docker run --rm -t ${DOCKER_IMAGE} bash -c '(cd build && make test)' 25 | -------------------------------------------------------------------------------- /.github/workflows/copyright-update.yml: -------------------------------------------------------------------------------- 1 | name: Update copyright year(s) in license file 2 | 3 | on: 4 | workflow_dispatch: 5 | schedule: 6 | - cron: "0 3 1 1 *" # 03:00 AM on January 1 7 | 8 | jobs: 9 | update-license-year: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - uses: actions/checkout@v3 13 | with: 14 | fetch-depth: 0 15 | - uses: FantasticFiasco/action-update-license-year@771ff9afdc55b09e1fb649cf03e312d0cf86b4a6 16 | with: 17 | token: ${{ secrets.GITHUB_TOKEN }} 18 | transform: (?\d{4})+-?(\d{4})? 
19 | -------------------------------------------------------------------------------- /.github/workflows/deploy.yml: -------------------------------------------------------------------------------- 1 | name: Publish Docker image 2 | on: 3 | release: 4 | types: [published] 5 | jobs: 6 | push_to_registry: 7 | name: Push Docker image to GitHub Packages 8 | runs-on: ubuntu-latest 9 | steps: 10 | - 11 | name: Check out the repo 12 | uses: actions/checkout@v2 13 | with: 14 | submodules: recursive 15 | 16 | - 17 | name: Prepare tags 18 | id: prep 19 | run: | 20 | DOCKER_IMAGE=revdotcom/fstalign 21 | VERSION=develop 22 | if [[ $GITHUB_REF == refs/tags/* ]]; then 23 | VERSION=${GITHUB_REF#refs/tags/} 24 | fi 25 | TAGS="${DOCKER_IMAGE}:${VERSION}" 26 | echo ::set-output name=version::${VERSION} 27 | echo ::set-output name=tags::${TAGS} 28 | echo ::set-output name=created::$(date -u +'%Y-%m-%dT%H:%M:%SZ') 29 | 30 | - 31 | name: Set up Docker Buildx 32 | uses: docker/setup-buildx-action@v1 33 | 34 | - 35 | name: Login to DockerHub 36 | uses: docker/login-action@v1 37 | with: 38 | username: ${{ secrets.DOCKERHUB_USERNAME }} 39 | password: ${{ secrets.DOCKERHUB_TOKEN }} 40 | 41 | - 42 | name: Build and push 43 | uses: docker/build-push-action@v2 44 | with: 45 | context: . 
46 | platforms: linux/amd64 47 | push: true 48 | tags: | 49 | revdotcom/fstalign:latest 50 | ${{ steps.prep.outputs.tags }} 51 | -------------------------------------------------------------------------------- /.github/workflows/links_fail_fast.yml: -------------------------------------------------------------------------------- 1 | name: Broken Link Checker 2 | 3 | on: 4 | push: 5 | pull_request: 6 | 7 | jobs: 8 | linkChecker: 9 | runs-on: ubuntu-latest 10 | steps: 11 | - uses: actions/checkout@v3 12 | 13 | - name: Link Checker 14 | uses: lycheeverse/lychee-action@e1ef974431881438bf594f458e332b099fd33bb5 #v1.4.1 https://github.com/lycheeverse/lychee-action#security-tip 15 | with: 16 | args: --verbose --no-progress './**/*.md' './**/*.json' './**/*.cpp' './**/*.h' './**/*.cc' 17 | fail: true 18 | env: 19 | GITHUB_TOKEN: ${{secrets.GITHUB_TOKEN}} 20 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # OS generated files # 2 | ###################### 3 | .DS_Store 4 | .DS_Store? 
5 | ._* 6 | .Spotlight-V100 7 | .Trashes 8 | ehthumbs.db 9 | Thumbs.db 10 | *.o 11 | *.pyc 12 | *.swp 13 | 14 | build 15 | 16 | *.dSYM 17 | *.vscode* 18 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "third-party/CLI11"] 2 | path = third-party/CLI11 3 | url = https://github.com/CLIUtils/CLI11.git 4 | [submodule "third-party/strtk"] 5 | path = third-party/strtk 6 | url = https://github.com/ArashPartow/strtk.git 7 | [submodule "third-party/jsoncpp"] 8 | path = third-party/jsoncpp 9 | url = https://github.com/open-source-parsers/jsoncpp.git 10 | [submodule "third-party/csv"] 11 | path = third-party/csv 12 | url = https://github.com/ben-strasser/fast-cpp-csv-parser.git 13 | [submodule "third-party/catch2"] 14 | path = third-party/catch2 15 | url = https://github.com/catchorg/Catch2.git 16 | [submodule "third-party/spdlog"] 17 | path = third-party/spdlog 18 | url = https://github.com/gabime/spdlog.git 19 | [submodule "third-party/inih"] 20 | path = third-party/inih 21 | url = https://github.com/benhoyt/inih.git 22 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.7) 2 | 3 | project(fstalign LANGUAGES CXX C) 4 | 5 | include(GNUInstallDirs) 6 | 7 | if(NOT CMAKE_BUILD_TYPE) 8 | set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE) 9 | endif() 10 | message(STATUS "Build type: ${CMAKE_BUILD_TYPE}") 11 | 12 | enable_testing() 13 | 14 | set(CMAKE_CXX_STANDARD 14) 15 | set(CMAKE_CXX_STANDARD_REQUIRED ON) 16 | 17 | if(DEFINED ENV{OPENFST_ROOT}) 18 | set(OPENFST_ROOT $ENV{OPENFST_ROOT} CACHE STRING "Path to OpenFST") 19 | endif() 20 | message(STATUS "OpenFST root: ${OPENFST_ROOT}") 21 | 22 | set(FSTALIGN_INCLUDES 23 | 
${PROJECT_SOURCE_DIR}/third-party/spdlog/include 24 | ${PROJECT_SOURCE_DIR}/third-party/CLI11/include 25 | ${PROJECT_SOURCE_DIR}/src 26 | ${PROJECT_SOURCE_DIR}/third-party 27 | ${PROJECT_SOURCE_DIR}/third-party/inih 28 | ${PROJECT_SOURCE_DIR}/third-party/inih/cpp 29 | ) 30 | 31 | find_package(Threads REQUIRED) 32 | 33 | set(FSTALIGN_LIBRARIES 34 | jsoncpp_lib_static 35 | ${PROJECT_SOURCE_DIR}/third-party/inih/ini.c 36 | ) 37 | 38 | set(OPENFST_INCLUDES 39 | ${OPENFST_ROOT}/include 40 | ) 41 | 42 | if(DYNAMIC_OPENFST) 43 | set(OPENFST_LIBRARIES 44 | ${OPENFST_ROOT}/lib/libfst.so 45 | ) 46 | else() 47 | set(OPENFST_LIBRARIES 48 | ${OPENFST_ROOT}/lib/libfst.a -ldl 49 | ) 50 | endif() 51 | 52 | add_library(fstaligner-common 53 | src/fstalign.cpp 54 | src/wer.cpp 55 | src/fast-d.cpp 56 | src/AdaptedComposition.cpp 57 | src/StandardComposition.cpp 58 | src/AlignmentTraversor.cpp 59 | src/Ctm.cpp 60 | src/FstLoader.cpp 61 | src/FstFileLoader.cpp 62 | src/logging.cpp 63 | src/Nlp.cpp 64 | src/OneBestFstLoader.cpp 65 | src/PathHeap.cpp 66 | src/SynonymEngine.cpp 67 | src/utilities.cpp 68 | src/Walker.cpp 69 | third-party/inih/cpp/INIReader.cpp 70 | ) 71 | 72 | list(APPEND CMAKE_PREFIX_PATH "/usr/local/opt/icu4c") # for Mac users 73 | find_package(ICU REQUIRED COMPONENTS uc) 74 | 75 | target_link_libraries(fstaligner-common 76 | Threads::Threads 77 | ${FSTALIGN_LIBRARIES} 78 | ${FST_KALDI_LIBRARIES} 79 | ${ICU_LIBRARIES} 80 | ) 81 | 82 | add_subdirectory(third-party/jsoncpp) 83 | add_subdirectory(third-party/catch2) 84 | 85 | add_executable(fstalign src/main.cpp) 86 | 87 | include_directories(fstalign 88 | ${FSTALIGN_INCLUDES} 89 | ${OPENFST_INCLUDES} 90 | ${ICU_INCLUDE_DIRS} 91 | ) 92 | 93 | target_link_libraries(fstalign 94 | fstaligner-common 95 | ${CMAKE_DL_LIBS} 96 | ${FSTALIGN_LIBRARIES} 97 | ${OPENFST_LIBRARIES} 98 | ) 99 | 100 | add_subdirectory(test) 101 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: 
-------------------------------------------------------------------------------- 1 | # Contributing Guidelines 2 | Thank you for your interest in contributing to our project. Whether it's a bug report, new feature, correction, or additional 3 | documentation, we greatly value feedback and contributions from our community. 4 | 5 | 6 | ## Reporting Bugs/Feature Requests 7 | Please use the GitHub issue tracker to report bugs or suggest features. 8 | 9 | When filing an issue, please check [existing open](https://github.com/revdotcom/fstalign/issues), or recently closed, issues to make sure somebody else hasn't already reported the issue. Please try to include as much information as you can. Details like these are incredibly useful: 10 | 11 | * A reproducible test case or series of steps 12 | * The environment/deployment of our code being used 13 | * The version of our code being used 14 | * Any modifications you've made relevant to the bug 15 | 16 | 17 | ## Contributing via Pull Requests 18 | This project follows the [Gitflow workflow](https://www.atlassian.com/git/tutorials/comparing-workflows/gitflow-workflow) for contributions. Before sending us a pull request, please ensure that: 19 | 20 | 1. You are working against the latest source on the *develop* branch. 21 | 2. You check existing open, and recently merged, pull requests to make sure someone else hasn't addressed the problem already. 22 | 3. You open an issue to discuss any significant work - we would hate for your time to be wasted. 23 | 24 | To send us a pull request, please: 25 | 26 | 1. Fork the repository. 27 | 2. Modify the source. 28 | 3. Ensure local tests pass. 29 | 4. Commit to your fork using clear commit messages. 30 | 5. Send us a pull request. 31 | 6. Pay attention to any automated CI failures reported in the pull request, and stay involved in the conversation. 
32 | 33 | GitHub provides additional document on [forking a repository](https://help.github.com/articles/fork-a-repo/) and 34 | [creating a pull request](https://help.github.com/articles/creating-a-pull-request/). 35 | 36 | 37 | ## Code of Conduct 38 | This project has adopted the [Contributor Covenant Code of Conduct](https://www.contributor-covenant.org/version/1/4/code-of-conduct/). 39 | 40 | 41 | ## Licensing 42 | See the [LICENSE](https://github.com/revdotcom/fstalign/blob/develop/LICENSE) file for our project's licensing. We will ask you to confirm the licensing of your contribution. 43 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # Using kaldi image for pre-built OpenFST, version is 1.7.2 2 | FROM kaldiasr/kaldi:cpu-debian10-2024-07-29 as kaldi-base 3 | 4 | FROM debian:11 5 | 6 | COPY --from=kaldi-base /opt/kaldi/tools/openfst /opt/openfst 7 | ENV OPENFST_ROOT /opt/openfst 8 | 9 | ARG JOBS=4 10 | 11 | RUN apt-get update && \ 12 | apt-get upgrade -y && \ 13 | apt-get -y install \ 14 | cmake \ 15 | g++ \ 16 | libicu-dev 17 | 18 | RUN mkdir /fstalign 19 | COPY CMakeLists.txt /fstalign/CMakeLists.txt 20 | COPY src /fstalign/src 21 | COPY test /fstalign/test 22 | COPY third-party /fstalign/third-party 23 | COPY sample_data /fstalign/sample_data 24 | 25 | WORKDIR /fstalign 26 | 27 | RUN mkdir -p /fstalign/build && \ 28 | cd /fstalign/build && \ 29 | rm -rf * && \ 30 | cmake .. 
-DOPENFST_ROOT="${OPENFST_ROOT}" -DDYNAMIC_OPENFST=ON && \ 31 | make -j${JOBS} VERBOSE=1 && \ 32 | mkdir -p /fstalign/bin && \ 33 | cp /fstalign/build/fstalign /fstalign/bin && \ 34 | strip /fstalign/bin/* 35 | 36 | COPY tools /fstalign/tools 37 | 38 | ENV PATH \ 39 | /fstalign/bin/:\ 40 | $PATH 41 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ![CI](https://github.com/revdotcom/fstalign/workflows/CI/badge.svg) 2 | [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0) 3 | 4 | # fstalign 5 | - [Overview](#Overview) 6 | - [What's new in 2.0](#What's-new-in-2.0) 7 | - [Installation](#Installation) 8 | * [Dependencies](#Dependencies) 9 | * [Build](#Build) 10 | * [Docker](#Docker) 11 | - [Documentation](#Documentation) 12 | 13 | ## Overview 14 | `fstalign` is a tool for creating alignment between two sequences of tokens (here out referred to as “reference” and “hypothesis”). It has two key functions: computing word error rate (WER) and aligning [NLP-formatted](https://github.com/revdotcom/fstalign/blob/develop/docs/NLP-Format.md) references with CTM hypotheses. 15 | 16 | Due to its use of OpenFST and lazy algorithms for text-based alignment, `fstalign` is efficient for calculating WER while also providing significant flexibility for different measurement features and error analysis. 17 | 18 | ## What's new in 2.0 19 | 20 | Version 2.0 introduces two major changes: 21 | 1. A new method to traverse the composition graph, which dramatically improves the overall speed, especially when the sequences are long contain many errors. 22 | We have files that took 25 minutes to align before that can now take about 7 seconds. This is especially noticeable with the adapted composition (the default). 23 | 1. Some smarts were introduced when --use-case and --use-punctuation are enabled. 
24 | Now, by default, punctuation symbols can only be substituted by other punctuation symbols (or deleted/inserted). 25 | Also, words that differ only by the first letter case will be preffered for substitution. 26 | 27 | 28 | Here's an example of the 1.x behavior and the 2.0 version 29 | ``` 30 | ==> v1.x sbs.txt <== 31 | ref_token hyp_token IsErr Class Wer_Tag_Entities 32 | Welcome Welcome ###322_###| 33 | back back 34 | to to 35 | another another 36 | episode episode ###323_###| 37 | of of 38 | Podcasts Podcast ERR ###324_###| 39 | in and ERR 40 | Color Color ###167_###|###325_###| 41 | : of ERR 42 | The the ERR 43 | Podcast Podcast ###168_###|###326_###| 44 | . . 45 | I I 46 | 47 | ==> v2.0 sbs.txt <== 48 | ref_token hyp_token IsErr Class Wer_Tag_Entities 49 | Welcome Welcome ###322_###| 50 | back back 51 | to to 52 | another another 53 | episode episode ###323_###| 54 | of of 55 | Podcasts Podcast ERR ###324_###| 56 | in and ERR 57 | Color Color ###167_###|###325_###| 58 | of ERR 59 | : ERR 60 | The the ERR 61 | Podcast Podcast ###168_###|###326_###| 62 | ``` 63 | The confusion between `:` and `of` is not longer allowed. 64 | 65 | Also, here's how favoring or not the substitution based on case-insensitive comparison, while still counting it as an error, looks like: 66 | ``` 67 | ==> v1.x sbs.txt <== 68 | ref_token hyp_token IsErr Class Wer_Tag_Entities 69 | shorten shorten ###801_###| 70 | It's it's ERR 71 | Berry Barry ERR ###785_###|###788_###|###802_###| 72 | . . 73 | Just Just 74 | Yeah like ERR ###805_###| 75 | . ERR 76 | Like ERR 77 | , ERR 78 | I I ###809_###| 79 | have have 80 | a a 81 | nickname nickname 82 | 83 | ==> v2.0 sbs.txt <== 84 | ref_token hyp_token IsErr Class Wer_Tag_Entities 85 | It's it's ERR 86 | Berry Barry ERR ###785_###|###788_###|###802_###| 87 | . . 88 | Just Just 89 | Yeah ERR ###805_###| 90 | . 
ERR 91 | Like like ERR 92 | , ERR 93 | I I ###809_###| 94 | have have 95 | a a 96 | nickname nickname 97 | ``` 98 | Here, `Like <-> like` substitution is favored. While this generally won't change the WER value itself (although it can), it will improve the timing alignments. 99 | 100 | 101 | These behavior, as well as the beam size (that has a default value of 50.0) can be controlled with the following new parameters: 102 | ``` 103 | --disable-strict-punctuation 104 | Disable strict punctuation alignment (which prevents punctuation aligning with words). 105 | --disable-favored-subs Disable favored substitutions (which makes alignment favor substitutions between words which differ only by case). 106 | --favored-sub-cost FLOAT Cost for favored substitutions (e.g., case diff). Default: 0.1 107 | ``` 108 | 109 | ## Installation 110 | 111 | ### Dependencies 112 | We use git submodules to manage third-party dependencies. Initialize and update submodules before proceeding to the main build steps. 113 | ``` 114 | git submodule update --init --recursive 115 | ``` 116 | 117 | This will pull the current dependencies: 118 | - catch2 - for unit testing 119 | - spdlog - for logging 120 | - CLI11 - for CLI construction 121 | - csv - for CTM and NLP input parsing 122 | - jsoncpp - for JSON output construction 123 | - strtk - for various string utilities 124 | 125 | Additionally, we have dependencies outside of the third-party submodules: 126 | - OpenFST - currently provided to the build system by settings the $OPENFST_ROOT environment variable or during the CMake command via `-DOPENFST_ROOT`. 127 | 128 | ### Build 129 | The current build framework is CMake. Install CMake following the instructions here (https://cmake.org/install/). 130 | 131 | To build fstalign, run: 132 | ``` 133 | mkdir build && cd build 134 | cmake .. 
-DOPENFST_ROOT="" -DDYNAMIC_OPENFST=ON 135 | make 136 | ``` 137 | 138 | Note: `-DDYNAMIC_OPENFST=ON` is needed if OpenFST at `OPENFST_ROOT` is compiled as shared libraries. Otherwise static libraries are assumed. 139 | 140 | Finally, tests can be run using: 141 | ``` 142 | make test 143 | ``` 144 | 145 | ### Docker 146 | 147 | The fstalign docker image is hosted on Docker Hub and can be easily pulled and run: 148 | ``` 149 | docker pull revdotcom/fstalign 150 | docker run --rm -it revdotcom/fstalign 151 | ``` 152 | 153 | See https://hub.docker.com/r/revdotcom/fstalign/tags for the available versions/tags to pull. If you desire to run the tool on local files you can mount local directories with the `-v` flag of the `docker run` command. 154 | 155 | From inside the container: 156 | ``` 157 | /fstalign/build/fstalign --help 158 | ``` 159 | 160 | For development you can also build the docker image locally using: 161 | ``` 162 | docker build . -t fstalign-dev 163 | ``` 164 | 165 | ## Documentation 166 | For more information on how to use `fstalign` see our [documentation](https://github.com/revdotcom/fstalign/blob/develop/docs/Usage.md) for more details. 167 | -------------------------------------------------------------------------------- /docs/NLP-Format.md: -------------------------------------------------------------------------------- 1 | # NLP Format 2 | NLP files are `.csv` inspired, pipe-separated text files that contain token and metadata information of a transcript. Each line of a file represents a single transcript token and the metadata associated with it. 3 | 4 | | Column | Description | 5 | | ----------- | ----------- | 6 | | Column 1: token | A single token in the transcript. These are typically single words or multiple words with hyphens in between. | 7 | | Column 2: speaker | A unique ID that associates this token to a specific speaker in an audio. | 8 | | Column 3: ts | A float representing the time in seconds that starts of the token’s utterance. 
| 9 | | Column 4: endTs | A float representing the time in seconds that ends of the token’s utterance. | 10 | | Column 5: punctuation | A punctuation character that is included at the end of a token that is used when reconstructing the transcript. Example punctuation: `",", ";", ".", "!"`. These will be ignored from WER token matching. | 11 | | Column 6: case | A two letter code to denominate the which of four possible casings for this token:
UC - Denotes a token that has the first character in uppercase and every other character lowercase
LC - Denotes a token that has every character in lowercase
CA - Denotes a token that has every character in uppercase
MC - Denotes a token that doesn’t follow the previous rules. This is the case when upper- and lowercase characters are mixed throughout the token | 12 | | Column 7: tags | Displays one of the several entity tags that are listed in wer_tags in long form - such that the displayed entity here is in the form `ID:ENTITY_CLASS`. If normalization is used, only entities in this column can be normalized. | 13 | | Column 8: wer_tags | A list of entity tags that are associated with this token. In this field, only entity IDs should be present. The specific ENTITY_CLASS for each ID can be extracted from an accompanying wer_tags sidecar json. | 14 | 15 | Example: 16 | ``` 17 | token|speaker|ts|endTs|punctuation|case|tags|wer_tags 18 | Good|0||||UC|[]|[] 19 | morning|0||||LC|['5:TIME']|['5'] 20 | and|0||||LC|[]|[] 21 | welcome|0||||LC|[]|[] 22 | to|0||||LC|[]|[] 23 | the|0||||LC|['6:DATE']|['6'] 24 | first|0||||LC|['6:DATE']|['6'] 25 | quarter|0||||LC|['6:DATE']|['6'] 26 | 2020|0||||CA|['0:YEAR']|['0', '1', '6'] 27 | NexGEn|0||||MC|['7:ORG']|['7'] 28 | ``` 29 | 30 | ## WER tag sidecar 31 | 32 | WER tag sidecar files contain accompanying info for tokens in an NLP file. The 33 | keys are IDs corresponding to tokens in the NLP file `wer_tags` column. The 34 | objects under the keys are information about the token. 35 | 36 | Example: 37 | ``` 38 | { 39 | '0': {'entity_type': 'YEAR'}, 40 | '1': {'entity_type': 'CARDINAL'}, 41 | '6': {'entity_type': 'SPACY>TIME'}, 42 | } 43 | ``` 44 | -------------------------------------------------------------------------------- /docs/Synonyms-Format.md: -------------------------------------------------------------------------------- 1 | # Synonyms File Format 2 | Synonyms allow for reference words to be equivalent to similar forms (determined by the user) for error counting. They are accepted for any input formats and passed into the tool via the `--syn ` flag. 
3 | 4 | The file structure is a simple text file where each line is a synonym and each synonym is separated by a pipe where the left hand side is the reference version of the term and the right hand side is the accepted hypothesis alternative. 5 | 6 | ``` 7 | format : LHSRHS 8 | where: 9 | LHS : space-delimited words to match in the original reference text 10 | RHS : semi-colon-delimited list of space-delimited words to consider as equivalent expressions to the LHS 11 | ``` 12 | 13 | Note that there is no built in symmetry, so synonyms must be doubly specified for symmetrical equivalence (example below illustrates this). Empty lines or lines starting with '#' are ignored. 14 | 15 | Example: 16 | ``` 17 | i am | i'm 18 | i'm | i am 19 | okay | ok 20 | ok | okay 21 | ``` 22 | 23 | A full example of a synonyms file is available in the repository under `sample_data/synonyms.rules.txt`. 24 | -------------------------------------------------------------------------------- /docs/json_log_schema.json: -------------------------------------------------------------------------------- 1 | { 2 | "definitions": { 3 | "transcript_wer": { 4 | "title": "Transcript WER", 5 | "type": "object", 6 | "properties": { 7 | "wer": { 8 | "title": "WER", 9 | "type": "object", 10 | "properties": { 11 | "bestWER": "#/definitions/wer_result", 12 | "classWER": { 13 | : "#/definitions/wer_result" 14 | }, 15 | "speakerWER": { 16 | : "#/definitions/wer_result" 17 | }, 18 | "speakerSwitchWER": "#/definitions/wer_result", 19 | "unigrams": { 20 | : "#/definitions/pr_result" 21 | }, 22 | "bigrams": { 23 | : "#/definitions/pr_result" 24 | } 25 | } 26 | } 27 | } 28 | }, 29 | "wer_result": { 30 | "title": "WER Result", 31 | "type": "object", 32 | "properties": { 33 | "insertions": { 34 | "title": "Insertions", 35 | "type": "integer" 36 | }, 37 | "deletions": { 38 | "title": "Deletions", 39 | "type": "integer" 40 | }, 41 | "substitutions": { 42 | "title": "Substitutions", 43 | "type": "integer" 44 | }, 45 | 
"numErrors": { 46 | "title": "Number of errors", 47 | "type": "integer" 48 | }, 49 | "numWordsInReference": { 50 | "title": "Number of words in reference", 51 | "type": "integer" 52 | }, 53 | "wer": { 54 | "title": "WER", 55 | "type": "float" 56 | }, 57 | "meta": { 58 | "title": "Metadata", 59 | "type": "object" 60 | }, 61 | }, 62 | }, 63 | "pr_result": { 64 | "title": "Precision Recall Result", 65 | "type": "object", 66 | "properties": { 67 | "insertions": { 68 | "title": "Insertions", 69 | "type": "integer" 70 | }, 71 | "deletions": { 72 | "title": "Deletions", 73 | "type": "integer" 74 | }, 75 | "substitutions_fp": { 76 | "title": "Substitutions that were false positives.", 77 | "type": "integer" 78 | }, 79 | "substitutions_fn": { 80 | "title": "Substitutions that were false negatives.", 81 | "type": "integer" 82 | }, 83 | "correct": { 84 | "title": "Correct", 85 | "type": "integer" 86 | }, 87 | "precision": { 88 | "title": "Precision", 89 | "type": "float" 90 | }, 91 | "recall": { 92 | "title": "Recall", 93 | "type": "float" 94 | } 95 | } 96 | } 97 | } 98 | } 99 | -------------------------------------------------------------------------------- /sample_data/aaddf.txt: -------------------------------------------------------------------------------- 1 | a a d d f 2 | -------------------------------------------------------------------------------- /sample_data/abcdef.txt: -------------------------------------------------------------------------------- 1 | a b c d e f 2 | -------------------------------------------------------------------------------- /sample_data/synonyms.rules.txt: -------------------------------------------------------------------------------- 1 | # format : LHSRHS 2 | # where: 3 | # LHS : space-delimited words to match in the original reference text 4 | # RHS : semi-colon-delimited list of space-delimited words to consider as equivalent expressions to the LHS 5 | # 6 | # This is non-recursive and single-pass only. 
7 | # By default, there won't be an automatic symetry: 8 | # if you want a->b and b->a, you need to specify both 9 | # 10 | # Empty lines or lines starting with '#' are ignored 11 | 12 | # To Be contractions - present 13 | i am | i'm 14 | i'm | i am 15 | you are | you're 16 | you're | you are 17 | he is | he's 18 | he's | he is 19 | she is | she's 20 | she's | she is 21 | it is | it's 22 | it's | it is 23 | we're | we are 24 | we are | we're 25 | they are | they're 26 | 27 | # To Be contractions - future 28 | i will |i'll 29 | i'll |i will 30 | you will |you'll 31 | you'll |you will 32 | he will |he'll 33 | he'll |he will 34 | she will |she'll 35 | she'll |she will 36 | it will |it'll 37 | it'll |it will 38 | we will |we'll 39 | we'll |we will 40 | they will |they'll 41 | they'll |they will 42 | 43 | 44 | okay | ok ; 'kay 45 | ok | okay ; 'kay 46 | 'kay | okay ; ok 47 | 48 | til | until ; 'til 49 | 'til | until ; til 50 | until | 'til ; til 51 | 52 | awhile | a while 53 | a while | awhile 54 | 55 | lotta | lot of 56 | lot of | lotta 57 | sorta | sort of 58 | sort of | sorta 59 | 60 | dunno | don't know ; do not know 61 | don't know | dunno 62 | do not know | dunno 63 | 64 | lemme | let me 65 | let me | lemme 66 | 67 | let's | let us 68 | let us | let's 69 | 70 | # TODO: can't -> cannot ? 71 | # TODO: To Have ? 
72 | must've | must have 73 | must have | must've 74 | would've | would have 75 | would have | would've 76 | should've | should have 77 | should have | should've ; shoulda 78 | might've | might have 79 | might have | might've 80 | could've | could have 81 | could have | could've 82 | i'd | i had ; i would 83 | i had | i'd 84 | i would | i'd 85 | you'd | you had ; you would 86 | you had | you'd 87 | you would | you'd 88 | he'd | he had ; he would 89 | he had | he'd 90 | he would | he'd 91 | she'd | she had ; she would 92 | she had | she'd 93 | she would | she'd 94 | they'd | they had ; they would 95 | they had | they'd 96 | they would | they'd 97 | i've | i have 98 | i have | i've 99 | they've | they have 100 | they have | they've 101 | you've | you have 102 | you have | you've 103 | 104 | 105 | 'cause | cause ; because 106 | because | 'cause ; cause 107 | gonna | going to 108 | going to | gonna 109 | wanna | want to 110 | want to | wanna 111 | kinda | kind of 112 | kind of | kinda 113 | gotta | got to 114 | got to | gotta 115 | 'em | them 116 | them | 'em 117 | all right | alright 118 | alright | all right 119 | 120 | give me | gimme 121 | gimme | give me 122 | shoulda | should have 123 | out of | outta 124 | outta | out of 125 | what are you | whatcha 126 | whatcha | what are you 127 | 128 | | 129 | | 130 | | 131 | | 132 | | 133 | | 134 | | 135 | | 136 | | 137 | | 138 | | 139 | | 140 | | 141 | | 142 | | 143 | | 144 | | 145 | | 146 | | 147 | | 148 | | 149 | | 150 | | 151 | | 152 | -------------------------------------------------------------------------------- /src/AdaptedComposition.h: -------------------------------------------------------------------------------- 1 | /* 2 | AdaptedComposition.h 3 | JP Robichaud (jp@rev.com) 4 | 2021 5 | 6 | */ 7 | 8 | #ifndef __ADAPTEDCOMPOSITION_H__ 9 | #define __ADAPTEDCOMPOSITION_H__ 10 | 11 | #include 12 | #include 13 | #include 14 | #include "IComposition.h" 15 | #include "utilities.h" 16 | #include "fstalign.h" 17 | 18 | 
using namespace std; 19 | 20 | typedef fst::Fst::StateId StateId; 21 | 22 | typedef pair StatePair; 23 | struct key_hash : public std::unary_function { 24 | std::size_t operator()(const StatePair &k) const { return std::get<0>(k) ^ std::get<1>(k); } 25 | }; 26 | 27 | // A hash function used to hash a pair of any kind, useful for unordered_map 28 | struct hash_pair { 29 | template 30 | size_t operator()(const pair &p) const { 31 | auto hash1 = hash{}(p.first); 32 | auto hash2 = hash{}(p.second); 33 | return hash1 ^ hash2; 34 | } 35 | }; 36 | 37 | /* 38 | * Calculates edit distance between two FSTs through manual single-step composition. 39 | * Optimizes the search space of the composed graph by greedily expanding composition states. 40 | * It is notably faster than the StandardCompositionFst alternative. 41 | * (in beta) 42 | */ 43 | class AdaptedCompositionFst : public IComposition { 44 | protected: 45 | map composed_states; 46 | map reversed_composed_states; 47 | 48 | set> entity_exit_states; 49 | 50 | StateId current_composed_next_state_id = 0; 51 | 52 | const fst::SymbolTable *symbols_; 53 | std::vector synonyms_label_ids; 54 | std::vector entity_label_ids; 55 | 56 | int dbg_count = 0; 57 | 58 | // possible optimizations : limit to const FST or limit to StdVectorFst 59 | const fst::StdFst &fstA_; 60 | const fst::StdFst &fstB_; 61 | // Add members to store options 62 | bool strict_punctuation_ = false; 63 | std::unordered_set punctuation_ids_; 64 | // Favored substitutions 65 | bool use_favored_substitutions_ = false; 66 | float favored_substitution_cost_ = 0.1f; 67 | std::vector favorable_substitution_map_; 68 | 69 | StateId GetOrCreateComposedState(StateId a, StateId b); 70 | bool IsEntityLabel(int labelId); 71 | bool IsSynonymLabel(int labelId); 72 | bool IsEntityReacheable(int target_entity_label_id, StateId refA, StateId refB); 73 | 74 | public: 75 | AdaptedCompositionFst(const fst::StdFst &fstA, const fst::StdFst &fstB); 76 | AdaptedCompositionFst(const 
fst::StdFst &fstA, const fst::StdFst &fstB, const SymbolTable &symbols); 77 | AdaptedCompositionFst(const fst::StdFst &fstA, const fst::StdFst &fstB, const SymbolTable &symbols, const AlignerOptions& options); 78 | ~AdaptedCompositionFst(); 79 | 80 | StateId Start(); 81 | fst::Fst::Weight Final(StateId stateId); 82 | bool TryGetArcsAtState(StateId fromStateId, vector *out_vector); 83 | 84 | // a and b are in the incoming graph referencials 85 | bool DoesComposedStateExist(StateId a, StateId b); 86 | 87 | // a is in the composed-graph referencial 88 | bool DoesComposedStateExist(StateId a); 89 | 90 | void SetSymbols(const fst::SymbolTable *symbols); 91 | 92 | void DebugComposedGraph(); 93 | }; 94 | 95 | #endif -------------------------------------------------------------------------------- /src/AlignmentTraversor.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | AlignmentTraversor.cpp 3 | JP Robichaud (jp@rev.com) 4 | 2021 5 | 6 | */ 7 | #include "AlignmentTraversor.h" 8 | 9 | AlignmentTraversor::AlignmentTraversor(wer_alignment &topLevel) : root(topLevel) { 10 | currentPosInRoot = -1; 11 | currentSubclass = nullptr; 12 | } 13 | 14 | void AlignmentTraversor::Restart() { 15 | currentPosInRoot = -1; 16 | currentSubclass = nullptr; 17 | currentPosInSubclass = -1; 18 | } 19 | 20 | bool AlignmentTraversor::NextTriple(triple &triple) { 21 | if (currentSubclass == nullptr) { 22 | // we're not in a subclass, we're consuming the root alignment content, 23 | // let's move to the next word 24 | currentPosInRoot++; 25 | if (currentPosInRoot >= root.tokens.size()) { 26 | return false; 27 | } 28 | 29 | auto tk = root.tokens[currentPosInRoot]; 30 | if (isEntityLabel(tk.first)) { 31 | // handle class 32 | currentPosInSubclass = -1; 33 | // find subclass spWERA from within the root 34 | for (auto &a : root.label_alignments) { 35 | if (a.classLabel == tk.first) { 36 | currentSubclass = &a; 37 | break; 38 | } 39 | } 40 | // 
currentSubclass = nullptr; // fixme 41 | return NextTriple(triple); 42 | } 43 | 44 | triple.classLabel = TK_GLOBAL_CLASS; 45 | triple.ref = tk.first; 46 | triple.hyp = tk.second; 47 | 48 | return true; 49 | } else { 50 | currentPosInSubclass++; 51 | if (currentPosInSubclass == 0 && currentSubclass->tokens.size() == 0 && 52 | currentSubclass->classLabel.find("FALLBACK") != std::string::npos) { 53 | triple.classLabel = currentSubclass->classLabel; 54 | triple.ref = NOOP; 55 | triple.hyp = NOOP; 56 | return true; 57 | } 58 | if (currentPosInSubclass >= currentSubclass->tokens.size()) { 59 | // we're done here... 60 | currentSubclass = nullptr; 61 | currentPosInSubclass = -1; 62 | return NextTriple(triple); 63 | } 64 | 65 | auto tk = currentSubclass->tokens[currentPosInSubclass]; 66 | triple.classLabel = currentSubclass->classLabel; 67 | triple.ref = tk.first; 68 | triple.hyp = tk.second; 69 | return true; 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /src/AlignmentTraversor.h: -------------------------------------------------------------------------------- 1 | /* 2 | AlignmentTraversor.h 3 | JP Robichaud (jp@rev.com) 4 | 2021 5 | 6 | */ 7 | 8 | #ifndef __ATRAVERSOR_H__ 9 | #define __ATRAVERSOR_H__ 10 | 11 | #include "utilities.h" 12 | 13 | struct triple { 14 | string ref; 15 | string hyp; 16 | string classLabel; 17 | }; 18 | 19 | class AlignmentTraversor { 20 | public: 21 | AlignmentTraversor(wer_alignment &topLevel); 22 | bool NextTriple(triple &triple); 23 | void Restart(); 24 | 25 | private: 26 | wer_alignment &root; 27 | int currentPosInRoot = -1; 28 | int currentPosInSubclass; 29 | wer_alignment *currentSubclass; 30 | }; 31 | 32 | #endif // __ATRAVERSOR_H__ 33 | -------------------------------------------------------------------------------- /src/Ctm.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * 3 | * Ctm.cpp 4 | * 5 | * JP Robichaud (jp@rev.com) 6 | 2018 7 
| * 8 | */ 9 | 10 | #include "Ctm.h" 11 | 12 | #include 13 | 14 | using namespace std; 15 | using namespace fst; 16 | 17 | /*************************************** 18 | CTM FST Loader Class Start 19 | ***************************************/ 20 | CtmFstLoader::CtmFstLoader(vector &records, bool use_case) : FstLoader() { 21 | { 22 | mCtmRows = records; 23 | mUseCase = use_case; 24 | for (auto &row : mCtmRows) { 25 | std::string token = std::string(row.word); 26 | if (!mUseCase) { 27 | token = UnicodeLowercase(row.word); 28 | } 29 | mToken.push_back(token); 30 | } 31 | } 32 | } 33 | 34 | CtmFstLoader::~CtmFstLoader() { 35 | // TODO Auto-generated destructor stub 36 | } 37 | void CtmFstLoader::addToSymbolTable(SymbolTable &symbol) const { 38 | for (auto &s : mToken) { 39 | AddSymbolIfNeeded(symbol, s); 40 | } 41 | } 42 | 43 | StdVectorFst CtmFstLoader::convertToFst(const SymbolTable &symbol, std::vector map) const { 44 | auto logger = logger::GetOrCreateLogger("ctmloader"); 45 | // 46 | StdVectorFst transducer; 47 | logger->debug("creating transducer for CTM"); 48 | 49 | transducer.AddState(); 50 | transducer.SetStart(0); 51 | 52 | int prevState = 0; 53 | int nextState = 1; 54 | int wc = 0; 55 | int map_sz = map.size(); 56 | for (TokenType::const_iterator i = mToken.begin(); i != mToken.end(); ++i) { 57 | std::string token = *i; 58 | if (!mUseCase) { 59 | token = UnicodeLowercase(token); 60 | } 61 | transducer.AddState(); 62 | 63 | if (map_sz > wc && map[wc] > 0) { 64 | transducer.AddArc(prevState, StdArc(symbol.Find(token), symbol.Find(token), 1.0f, nextState)); 65 | } else { 66 | transducer.AddArc(prevState, StdArc(symbol.Find(token), symbol.Find(token), 0.0f, nextState)); 67 | } 68 | 69 | prevState = nextState; 70 | nextState++; 71 | wc++; 72 | } 73 | 74 | transducer.SetFinal(prevState, 0.0f); 75 | return transducer; 76 | } 77 | 78 | std::vector CtmFstLoader::convertToIntVector(fst::SymbolTable &symbol) const { 79 | auto logger = 
logger::GetOrCreateLogger("ctmloader"); 80 | std::vector vect; 81 | addToSymbolTable(symbol); 82 | int sz = mToken.size(); 83 | logger->info("creating std::vector for CTM for {} tokens", sz); 84 | vect.reserve(sz); 85 | 86 | FstAlignOption options; 87 | for (TokenType::const_iterator i = mToken.begin(); i != mToken.end(); ++i) { 88 | std::string token = *i; 89 | int token_sym = symbol.Find(token); 90 | if (token_sym == -1) { 91 | token_sym = symbol.Find(options.symUnk); 92 | } 93 | vect.emplace_back(token_sym); 94 | } 95 | 96 | return vect; 97 | } 98 | 99 | /*************************************** 100 | CTM FST Loader Class End 101 | ***************************************/ 102 | 103 | /*************************************** 104 | CTM Reader Class Start 105 | ***************************************/ 106 | CtmReader::CtmReader() {} 107 | 108 | vector read_from_disk_no_conf(const string &filename) { 109 | vector vect; 110 | io::CSVReader<5, io::trim_chars<' ', '\t'>, io::no_quote_escape<' '>, io::throw_on_overflow, io::empty_line_comment> 111 | input_ctm(filename); 112 | 113 | input_ctm.set_header("audiofile", "channel", "start", "duration", "word"); 114 | 115 | string audiofile, channel, start, duration, word, confidence; 116 | while (input_ctm.read_row(audiofile, channel, start, duration, word)) { 117 | RawCtmRecord record; 118 | record.recording = audiofile; 119 | record.channel = channel; 120 | record.start_time_secs = stof(start); 121 | record.duration_secs = stof(duration); 122 | record.word = word; 123 | record.confidence = 1; 124 | vect.push_back(record); 125 | } 126 | 127 | return vect; 128 | } 129 | 130 | vector read_from_disk_with_conf(const string &filename) { 131 | vector vect; 132 | io::CSVReader<6, io::trim_chars<' ', '\t'>, io::no_quote_escape<' '>, io::throw_on_overflow, io::empty_line_comment> 133 | input_ctm(filename); 134 | 135 | input_ctm.set_header("audiofile", "channel", "start", "duration", "word", "confidence"); 136 | 137 | string audiofile, 
channel, start, duration, word, confidence; 138 | while (input_ctm.read_row(audiofile, channel, start, duration, word, confidence)) { 139 | RawCtmRecord record; 140 | record.recording = audiofile; 141 | record.channel = channel; 142 | record.start_time_secs = stof(start); 143 | record.duration_secs = stof(duration); 144 | record.word = word; 145 | record.confidence = stof(confidence); 146 | vect.push_back(record); 147 | } 148 | 149 | return vect; 150 | } 151 | 152 | vector CtmReader::read_from_disk(const string &filename) { 153 | ifstream ctm_peek(filename); 154 | string first_line; 155 | if (!std::getline(ctm_peek, first_line)) { 156 | vector vect; 157 | return vect; 158 | } 159 | 160 | int sz = 1; 161 | char lastChar = 'x'; 162 | 163 | for (auto &c : first_line) { 164 | if (c == ' ' || c == '\t') { 165 | if (lastChar != ' ' && lastChar != '\t') { 166 | sz++; 167 | } 168 | } 169 | 170 | lastChar = c; 171 | } 172 | 173 | // Minimum CTM columns should be: audiofile, channel, start, duration, word 174 | // Sixth confidence score column is optional 175 | bool hasConf = sz > 5 ? 
true : false; 176 | 177 | if (hasConf) { 178 | return read_from_disk_with_conf(filename); 179 | } else { 180 | return read_from_disk_no_conf(filename); 181 | } 182 | } 183 | 184 | /*************************************** 185 | CTM Reader Class End 186 | ***************************************/ 187 | -------------------------------------------------------------------------------- /src/Ctm.h: -------------------------------------------------------------------------------- 1 | /* 2 | * 3 | * Ctm.h 4 | * 5 | * JP Robichaud (jp@rev.com) 6 | * (C) 2018 7 | * 8 | */ 9 | 10 | #ifndef __CTM_H__ 11 | #define __CTM_H__ 12 | 13 | #include "FstLoader.h" 14 | #include "utilities.h" 15 | 16 | using namespace std; 17 | using namespace fst; 18 | 19 | struct RawCtmRecord { 20 | string recording; 21 | string channel; 22 | float start_time_secs; 23 | float duration_secs; 24 | string word; 25 | float confidence; 26 | }; 27 | 28 | class CtmFstLoader : public FstLoader { 29 | public: 30 | CtmFstLoader(std::vector &records, bool use_case = false); 31 | ~CtmFstLoader(); 32 | vector mCtmRows; 33 | virtual void addToSymbolTable(fst::SymbolTable &symbol) const; 34 | virtual fst::StdVectorFst convertToFst(const fst::SymbolTable &symbol, std::vector map) const; 35 | virtual std::vector convertToIntVector(fst::SymbolTable &symbol) const; 36 | virtual const std::string &getToken(int index) const { return mToken.at(index); } 37 | private: 38 | bool mUseCase; 39 | }; 40 | 41 | class CtmReader { 42 | public: 43 | CtmReader(); 44 | vector read_from_disk(const std::string &filename); 45 | }; 46 | 47 | #endif // __CTM_H__ 48 | -------------------------------------------------------------------------------- /src/FstFileLoader.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * FstFileLoader.cpp 3 | */ 4 | #include "FstFileLoader.h" 5 | 6 | FstFileLoader::FstFileLoader(std::string filename) : FstLoader(), filename_(filename) {} 7 | 8 | void 
FstFileLoader::addToSymbolTable(fst::SymbolTable& symbol) const { return; } 9 | 10 | fst::StdVectorFst FstFileLoader::convertToFst(const fst::SymbolTable& symbol, std::vector map) const { 11 | auto logger = logger::GetOrCreateLogger("FstFileLoader"); 12 | fst::StdVectorFst* transducer = fst::StdVectorFst::Read(filename_); 13 | logger->info("Total FST has {} states.", transducer->NumStates()); 14 | return (*transducer); 15 | } 16 | 17 | std::vector FstFileLoader::convertToIntVector(fst::SymbolTable& symbol) const { 18 | auto logger = logger::GetOrCreateLogger("FstFileLoader"); 19 | std::vector vect; 20 | logger->error("convertToIntVector isn't implemented for FST inputs"); 21 | vect.reserve(0); 22 | vect.resize(0); 23 | return vect; 24 | } 25 | 26 | FstFileLoader::~FstFileLoader() {} 27 | -------------------------------------------------------------------------------- /src/FstFileLoader.h: -------------------------------------------------------------------------------- 1 | /* 2 | * FstFileLoader.h 3 | * 4 | * FstLoader for loading a serialized FST from disk. 
5 | * 6 | * Quinn McNamara (quinn@rev.com) 7 | * 2020 8 | */ 9 | 10 | #ifndef FstFileLoader_H_ 11 | #define FstFileLoader_H_ 12 | 13 | #include 14 | #include 15 | #include 16 | 17 | #include "FstLoader.h" 18 | #include "utilities.h" 19 | 20 | class FstFileLoader : public FstLoader { 21 | public: 22 | FstFileLoader(std::string filename); 23 | ~FstFileLoader(); 24 | 25 | virtual void addToSymbolTable(fst::SymbolTable &symbol) const; 26 | virtual fst::StdVectorFst convertToFst(const fst::SymbolTable &symbol, std::vector map) const; 27 | virtual std::vector convertToIntVector(fst::SymbolTable &symbol) const; 28 | 29 | private: 30 | std::string filename_; 31 | }; 32 | 33 | #endif /* FstFileLoader_H_ */ 34 | -------------------------------------------------------------------------------- /src/FstLoader.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | FstLoader.cpp 3 | JP Robichaud (jp@rev.com) 4 | 2018 5 | 6 | */ 7 | 8 | #include "FstLoader.h" 9 | #include "utilities.h" 10 | 11 | FstLoader::FstLoader() { 12 | // TODO Auto-generated constructor stub 13 | } 14 | 15 | FstLoader::~FstLoader() { 16 | // TODO Auto-generated destructor stub 17 | } 18 | 19 | void FstLoader::AddSymbolIfNeeded(fst::SymbolTable &symbol, std::string str_value) { 20 | if (symbol.Find(str_value) == -1) { 21 | symbol.AddSymbol(str_value); 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /src/FstLoader.h: -------------------------------------------------------------------------------- 1 | /* 2 | FstLoader.h 3 | JP Robichaud (jp@rev.com) 4 | 2018 5 | 6 | */ 7 | 8 | #ifndef __FSTLOADER_H_ 9 | #define __FSTLOADER_H_ 10 | 11 | #include 12 | #include "utilities.h" 13 | 14 | class FstLoader { 15 | protected: 16 | typedef std::vector TokenType; 17 | TokenType mToken; 18 | 19 | public: 20 | FstLoader(); 21 | virtual ~FstLoader(); 22 | virtual void addToSymbolTable(fst::SymbolTable &symbol) const = 0; 23 | static 
void AddSymbolIfNeeded(fst::SymbolTable &symbol, std::string str_value); 24 | virtual fst::StdVectorFst convertToFst(const fst::SymbolTable &symbol, std::vector map) const = 0; 25 | virtual std::vector convertToIntVector(fst::SymbolTable &symbol) const = 0; 26 | 27 | static std::unique_ptr MakeReferenceLoader(const std::string& ref_filename, 28 | const std::string& wer_sidecar_filename, 29 | const std::string& json_norm_filename, 30 | bool use_punctuation, 31 | bool use_case, 32 | bool symbols_file_included); 33 | 34 | static std::unique_ptr MakeHypothesisLoader(const std::string& hyp_filename, 35 | const std::string& hyp_json_norm_filename, 36 | bool use_punctuation, 37 | bool use_case, 38 | bool symbols_file_included); 39 | 40 | 41 | }; 42 | 43 | #endif /* __FSTLOADER_H_ */ 44 | -------------------------------------------------------------------------------- /src/IComposition.h: -------------------------------------------------------------------------------- 1 | /* 2 | ICompostion.h 3 | JP Robichaud (jp@rev.com) 4 | 2021 5 | 6 | Custom interface to encapsulate various composition strategies 7 | 8 | */ 9 | 10 | #ifndef __ICOMPOSITION_H_ 11 | #define __ICOMPOSITION_H_ 12 | 13 | #include "utilities.h" 14 | typedef fst::Fst::StateId StateId; 15 | 16 | class IComposition : public fst::VectorFst { 17 | protected: 18 | float insertion_cost = 1; 19 | float deletion_cost = 1; 20 | float substitution_cost = 1.5; 21 | std::shared_ptr logger_; 22 | 23 | fst::SymbolTable *symbols_; 24 | 25 | // TODO: make this settable/configurable 26 | int ins_label_id_ = 1; 27 | int del_label_id_ = 2; 28 | int sub_label_id_ = 3; 29 | 30 | public: 31 | IComposition() {} 32 | IComposition(const fst::StdFst &fstA, const fst::StdFst &fstB) {} 33 | IComposition(const fst::StdFst &fstA, const fst::StdFst &fstB, SymbolTable &symbols) {} 34 | 35 | virtual ~IComposition() {} 36 | virtual StateId Start() = 0; 37 | virtual fst::Fst::Weight Final(StateId stateId) = 0; 38 | virtual bool 
TryGetArcsAtState(StateId fromStateId, vector *out_vector) = 0; 39 | }; 40 | 41 | #endif /*__ICOMPOSITION_H_ */ 42 | -------------------------------------------------------------------------------- /src/Nlp.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Nlp.h 3 | * 4 | * Created on: 2018-04-23 5 | * Author: JP Robichaud (jp@rev.com) 6 | */ 7 | #ifndef NLP_H_ 8 | #define NLP_H_ 9 | 10 | #include 11 | 12 | #include 13 | 14 | #include "FstLoader.h" 15 | 16 | using namespace std; 17 | using namespace fst; 18 | 19 | struct WerTagEntry { 20 | string tag_id; 21 | string entity_type; 22 | }; 23 | 24 | struct RawNlpRecord { 25 | string token; 26 | string speakerId; 27 | string punctuation; 28 | string prepunctuation; 29 | string ts; 30 | string endTs; 31 | string casing; 32 | string labels; 33 | string best_label; 34 | string best_label_id; 35 | vector wer_tags; 36 | string confidence; 37 | }; 38 | 39 | class NlpReader { 40 | public: 41 | NlpReader(); 42 | virtual ~NlpReader(); 43 | vector read_from_disk(const std::string &filename); 44 | string GetBestLabel(std::string &labels); 45 | vector GetWerTags(std::string &wer_tags_str); 46 | string GetLabelId(std::string &label); 47 | }; 48 | 49 | class NlpFstLoader : public FstLoader { 50 | public: 51 | NlpFstLoader(std::vector &records, Json::Value normalization, Json::Value wer_sidecar, bool processLabels, bool use_punctuation = false, bool use_case = false); 52 | NlpFstLoader(std::vector &records, Json::Value normalization, Json::Value wer_sidecar); 53 | virtual ~NlpFstLoader(); 54 | virtual void addToSymbolTable(fst::SymbolTable &symbol) const; 55 | virtual fst::StdVectorFst convertToFst(const fst::SymbolTable &symbol, std::vector map) const; 56 | virtual std::vector convertToIntVector(fst::SymbolTable &symbol) const; 57 | 58 | int GetProperSymbolId(const fst::SymbolTable &symbol, string token, string symUnk) const; 59 | vector mNlpRows; 60 | vector mSpeakers; 61 | Json::Value 
mJsonNorm; 62 | Json::Value mWerSidecar; 63 | virtual const std::string &getToken(int index) const { return mToken.at(index); } 64 | private: 65 | bool mUseCase; 66 | }; 67 | 68 | #endif /* NLP_H_ */ 69 | -------------------------------------------------------------------------------- /src/OneBestFstLoader.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * OneBestFstLoader.cpp 3 | * JP Robichaud (jp@rev.com) 4 | * 2018 5 | */ 6 | 7 | #include "OneBestFstLoader.h" 8 | 9 | #include 10 | #include 11 | 12 | #include "utilities.h" 13 | 14 | // empty constructor 15 | OneBestFstLoader::OneBestFstLoader(bool use_case) : FstLoader() { 16 | mUseCase = use_case; 17 | } 18 | 19 | void OneBestFstLoader::BuildFromString(const std::string content) { 20 | std::istringstream mystream(content); 21 | std::copy(std::istream_iterator(mystream), std::istream_iterator(), 22 | std::back_inserter(mToken)); 23 | } 24 | 25 | void OneBestFstLoader::LoadTextFile(const std::string filename) { 26 | std::ifstream stream(filename); 27 | 28 | if (!stream.is_open()) throw std::runtime_error("Cannot open input file"); 29 | 30 | std::copy(std::istream_iterator(stream), std::istream_iterator(), 31 | std::back_inserter(mToken)); 32 | 33 | stream.close(); 34 | } 35 | 36 | void OneBestFstLoader::addToSymbolTable(fst::SymbolTable &symbol) const { 37 | for (TokenType::const_iterator i = mToken.begin(); i != mToken.end(); ++i) { 38 | std::string token = *i; 39 | if (!mUseCase) { 40 | token = UnicodeLowercase(token); 41 | } 42 | // fst::kNoSymbol 43 | if (symbol.Find(token) == -1) { 44 | symbol.AddSymbol(token); 45 | } 46 | } 47 | } 48 | 49 | fst::StdVectorFst OneBestFstLoader::convertToFst(const fst::SymbolTable &symbol, std::vector map) const { 50 | auto logger = logger::GetOrCreateLogger("OneBestFstLoader"); 51 | 52 | FstAlignOption options; 53 | int eps_sym = symbol.Find(options.symEps); 54 | 55 | fst::StdVectorFst transducer; 56 | 57 | 
transducer.AddState(); 58 | transducer.SetStart(0); 59 | 60 | int prevState = 0; 61 | int nextState = 1; 62 | int map_sz = map.size(); 63 | int wc = 0; 64 | for (TokenType::const_iterator i = mToken.begin(); i != mToken.end(); ++i) { 65 | std::string token = *i; 66 | if (!mUseCase) { 67 | token = UnicodeLowercase(token); 68 | } 69 | transducer.AddState(); 70 | 71 | int tk_idx = symbol.Find(token); 72 | if (tk_idx < 0) { 73 | logger->trace("we found an invalid token [{}] at token position {} which gave a label id of {}", token, (wc + 1), 74 | tk_idx); 75 | } 76 | if (map_sz > wc && map[wc] > 0) { 77 | transducer.AddArc(prevState, fst::StdArc(tk_idx, tk_idx, 1.0f, nextState)); 78 | } else { 79 | transducer.AddArc(prevState, fst::StdArc(tk_idx, tk_idx, 0.0f, nextState)); 80 | } 81 | 82 | prevState = nextState; 83 | nextState++; 84 | wc++; 85 | } 86 | 87 | int realFinal = transducer.AddState(); 88 | transducer.AddArc(prevState, fst::StdArc(eps_sym, eps_sym, 0.0f, realFinal)); 89 | transducer.SetFinal(realFinal, StdFst::Weight::One()); 90 | return transducer; 91 | } 92 | 93 | std::vector OneBestFstLoader::convertToIntVector(fst::SymbolTable &symbol) const { 94 | auto logger = logger::GetOrCreateLogger("OneBestFstLoader"); 95 | std::vector vect; 96 | addToSymbolTable(symbol); 97 | int sz = mToken.size(); 98 | logger->info("creating std::vector for OneBestFstLoader for {} tokens", sz); 99 | vect.reserve(sz); 100 | 101 | FstAlignOption options; 102 | for (TokenType::const_iterator i = mToken.begin(); i != mToken.end(); ++i) { 103 | std::string token = *i; 104 | if (!mUseCase) { 105 | token = UnicodeLowercase(token); 106 | } 107 | int token_sym = symbol.Find(token); 108 | if (token_sym == -1) { 109 | token_sym = symbol.Find(options.symUnk); 110 | } 111 | vect.emplace_back(token_sym); 112 | } 113 | 114 | return vect; 115 | } 116 | 117 | OneBestFstLoader::~OneBestFstLoader() { 118 | // TODO Auto-generated destructor stub 119 | } 120 | 
-------------------------------------------------------------------------------- /src/OneBestFstLoader.h: -------------------------------------------------------------------------------- 1 | /* 2 | * OneBestFstLoader.h 3 | * JP Robichaud (jp@rev.com) 4 | * 2018 5 | */ 6 | 7 | #ifndef ONEBESTFSTLOADER_H_ 8 | #define ONEBESTFSTLOADER_H_ 9 | 10 | #include "FstLoader.h" 11 | 12 | class OneBestFstLoader : public FstLoader { 13 | public: 14 | OneBestFstLoader(bool use_case = false); 15 | virtual ~OneBestFstLoader(); 16 | void LoadTextFile(const std::string filename); 17 | void BuildFromString(const std::string content); 18 | 19 | virtual void addToSymbolTable(fst::SymbolTable &symbol) const; 20 | virtual fst::StdVectorFst convertToFst(const fst::SymbolTable &symbol, std::vector map) const; 21 | virtual const std::string &getToken(int index) const { return mToken.at(index); } 22 | virtual std::vector convertToIntVector(fst::SymbolTable &symbol) const; 23 | int TokensSize() { return mToken.size(); } 24 | private: 25 | bool mUseCase; 26 | }; 27 | 28 | #endif /* ONEBESTFSTLOADER_H_ */ 29 | -------------------------------------------------------------------------------- /src/PathHeap.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | PathHeap.cpp 3 | JP Robichaud (jp@rev.com) 4 | 2018 5 | 6 | */ 7 | 8 | #include "PathHeap.h" 9 | 10 | using namespace std; 11 | using namespace fst; 12 | 13 | PathHeap::PathHeap() { 14 | // creating the set 15 | } 16 | 17 | void PathHeap::insert(shared_ptr entry) { 18 | // just add it to the set, leaving to the comparator to do its job 19 | heap.insert(entry); 20 | } 21 | 22 | shared_ptr PathHeap::removeFirst() { 23 | // we want to take the 1st element and remove it 24 | auto logbookIter = heap.begin(); 25 | auto currentState_ptr = *logbookIter; 26 | heap.erase(logbookIter); 27 | return currentState_ptr; 28 | } 29 | 30 | int PathHeap::size() { return heap.size(); } 31 | 32 | shared_ptr 
PathHeap::GetBestWerCandidate() { 33 | set, shortlistComparatorSharedPtr>::iterator iter = heap.begin(); 34 | 35 | shared_ptr best = nullptr; 36 | float bestWer = std::numeric_limits::quiet_NaN(); 37 | while (iter != heap.end()) { 38 | auto entry = *iter; 39 | float local_wer = (float)entry->numErrors / (float)entry->numWords; 40 | 41 | if (best == nullptr) { 42 | best = entry; 43 | bestWer = local_wer; 44 | continue; 45 | } 46 | 47 | if (local_wer < bestWer) { 48 | bestWer = local_wer; 49 | best = entry; 50 | } 51 | 52 | iter++; 53 | } 54 | 55 | return best; 56 | } 57 | 58 | int PathHeap::prune(int targetSz) { 59 | set, shortlistComparatorSharedPtr>::iterator iter = heap.begin(); 60 | float wer0, wer_last; 61 | int sz = heap.size(); 62 | for (int i = 0; i < targetSz && i < sz; i++) { 63 | float local_wer = (float)(*iter)->numErrors / ((float)(*iter)->numWords); 64 | if (i == 0) { 65 | wer0 = local_wer; 66 | } 67 | 68 | wer_last = local_wer; 69 | 70 | iter++; 71 | } 72 | 73 | auto last_wer_index = iter; 74 | last_wer_index--; 75 | auto logger = logger::GetOrCreateLogger("pathheap"); 76 | // logger->set_level(spdlog::level::debug); 77 | logger->debug("==== pruning starting ====="); 78 | logger->debug("pruning to {} items -> top wer was {} and last wer was {}. 
We have {} items in the heap.", targetSz, 79 | wer0, wer_last, heap.size()); 80 | 81 | /* TODO: make sure we don't prune paths that have the same length/error-count 82 | as the last one kept at 'targetSz' 83 | */ 84 | 85 | int numErrorsWithoutInsertions = (*last_wer_index)->numErrors - (*last_wer_index)->numInsert; 86 | int pruned = 0; 87 | while (iter != heap.end()) { 88 | auto p = *iter; 89 | float local_wer = (float)(*iter)->numErrors / ((float)(*iter)->numWords); 90 | logger->debug( 91 | "candidate for prunung: wer0 {4:.4f}, wer_last {0:.4f} {2} words, current candidate {1:.4f}, {3} words", 92 | wer_last, local_wer, (*last_wer_index)->numWords, (*iter)->numWords, wer0); 93 | 94 | int localCoreErr = (*iter)->numErrors - (*iter)->numInsert; 95 | /* various strategies : 96 | bool pruneMe = (*last_wer_index)->numErrors * 1.2 < (*iter)->numErrors; -> slow on larger files 97 | bool pruneMe = (*last_wer_index)->numErrors * 1.1 < (*iter)->numErrors; -> slightly better on larger files 98 | bool pruneMe = numErrorsWithoutInsertions * 1.1 < localCoreErr; --> a bit agressive 99 | bool pruneMe = (*last_wer_index)->numErrors + 20 < (*iter)->numErrors; --> seems to work resonably well 100 | */ 101 | // TODO: make this '20' configurable. 
Also consider using (numErrors - 102 | // numInsertion) + 20 103 | bool pruneMe = (*last_wer_index)->numErrors + 20 < (*iter)->numErrors; 104 | logger->debug("{} + 20 < {} = {}", numErrorsWithoutInsertions, localCoreErr, pruneMe); 105 | if (pruneMe) { 106 | heap.erase(iter++); 107 | pruned++; 108 | } else { 109 | iter++; 110 | } 111 | } 112 | logger->debug("after pruning we have {} items in the heap", heap.size()); 113 | logger->debug("-----"); 114 | 115 | return pruned; 116 | } 117 | 118 | int PathHeap::prune_relative(float beam_width) { 119 | if (heap.empty()) { 120 | return 0; 121 | } 122 | 123 | auto logger = logger::GetOrCreateLogger("pathheap"); 124 | size_t initial_size = heap.size(); 125 | 126 | // Find the best costSoFar in the current heap 127 | // Note: The heap is ordered by the complex shortlistComparatorSharedPtr, 128 | // so the first element isn't necessarily the one with the lowest costSoFar. 129 | // We need to iterate to find the minimum costSoFar. 130 | float best_cost = std::numeric_limits::max(); 131 | for (const auto& entry : heap) { 132 | if (entry->costSoFar < best_cost) { 133 | best_cost = entry->costSoFar; 134 | } 135 | } 136 | 137 | float cost_threshold = best_cost + beam_width; 138 | 139 | logger->debug("==== Relative pruning starting (Beam: {}) =====", beam_width); 140 | logger->debug("Initial size: {}, Best cost: {:.4f}, Threshold: {:.4f}", 141 | initial_size, best_cost, cost_threshold); 142 | 143 | int pruned_count = 0; 144 | auto iter = heap.begin(); 145 | while (iter != heap.end()) { 146 | // Check if the current entry's cost exceeds the threshold 147 | if ((*iter)->costSoFar > cost_threshold) { 148 | // Remove the element and advance the iterator 149 | iter = heap.erase(iter); 150 | pruned_count++; 151 | } else { 152 | // Otherwise, just advance the iterator 153 | ++iter; 154 | } 155 | } 156 | 157 | logger->debug("After relative pruning: {} items remain ({} pruned)", heap.size(), pruned_count); 158 | logger->debug("-----\n"); 159 
| 160 | return pruned_count; 161 | } 162 | -------------------------------------------------------------------------------- /src/PathHeap.h: -------------------------------------------------------------------------------- 1 | /* 2 | PathHeap.h 3 | JP Robichaud (jp@rev.com) 4 | 2018 5 | 6 | */ 7 | 8 | #ifndef __PATH_HEAP_H__ 9 | #define __PATH_HEAP_H__ 10 | 11 | #include 12 | 13 | #include "utilities.h" 14 | 15 | using namespace std; 16 | using namespace fst; 17 | 18 | typedef struct ShortlistEntry ShortlistEntry; 19 | typedef struct ShortlistEntry* SLE; 20 | typedef struct MyArc MyArc; 21 | typedef struct MyArc* MyArcPtr; 22 | typedef shared_ptr spSLE; 23 | 24 | struct MyArc { 25 | int ilabel = 0; 26 | int olabel = 0; 27 | float weight = 0.0; 28 | int nextstate = 0; 29 | }; 30 | 31 | struct ShortlistEntry { 32 | int currentState = 0; 33 | int whereTo = 0; 34 | int numErrors = 0; 35 | int numWords = 0; 36 | int numInsert = 0; 37 | double costToGoThere = 0; 38 | float costSoFar = 0; 39 | MyArc local_arc; 40 | shared_ptr linkToHere = nullptr; 41 | }; 42 | 43 | struct shortlistComparatorSharedPtr { 44 | bool operator()(const shared_ptr& a, const shared_ptr& b) { 45 | if (a->numWords == b->numWords) { 46 | if (a->numErrors == b->numErrors) { 47 | if (a->costSoFar == b->costSoFar) { 48 | return a->currentState < b->currentState; 49 | } 50 | 51 | return a->costSoFar < b->costSoFar; 52 | } 53 | 54 | return a->numErrors < b->numErrors; 55 | } 56 | 57 | return a->numWords < b->numWords; 58 | } 59 | }; 60 | 61 | class PathHeap { 62 | public: 63 | PathHeap(); 64 | void insert(std::shared_ptr entry); 65 | shared_ptr removeFirst(); 66 | int prune(int targetSz); 67 | int prune_relative(float beam_width); 68 | int size(); 69 | std::shared_ptr GetBestWerCandidate(); 70 | int pruningErrorOffset = 20; 71 | bool pruningIncludeInsInThreshold = true; 72 | 73 | private: 74 | set, shortlistComparatorSharedPtr> heap; 75 | }; 76 | #endif // __PATH_HEAP_H__ 77 | 
-------------------------------------------------------------------------------- /src/StandardComposition.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * 3 | * StandardComposition.cpp 4 | * 5 | * JP Robichaud (jp@rev.com) 6 | * 2021 7 | * 8 | */ 9 | 10 | #include "StandardComposition.h" 11 | #include 12 | #include 13 | #include // For numeric_limits 14 | 15 | using fst::StdArc; 16 | using fst::StdVectorFst; 17 | using fst::SymbolTable; 18 | using fst::TropicalWeight; 19 | using fst::SymbolTableIterator; 20 | using fst::ArcIterator; 21 | using fst::StateIterator; 22 | using fst::FstWriteOptions; 23 | using fst::kNoStateId; 24 | // StateId is defined via typedef in IComposition.h 25 | using std::vector; 26 | using std::string; 27 | using std::ofstream; 28 | 29 | // --- Constructors (ensure symbols_ is initialized) --- 30 | StandardCompositionFst::StandardCompositionFst(const fst::StdFst &fstA, const fst::StdFst &fstB) 31 | : StandardCompositionFst(fstA, fstB, *(fstA.InputSymbols()), AlignerOptions()) // Use default AlignerOptions 32 | { 33 | if (fstA.InputSymbols() == nullptr) { 34 | throw std::runtime_error("StandardCompositionFst requires symbol table. 
Attach symbols to fstA or provide explicitly."); 35 | } 36 | } 37 | 38 | StandardCompositionFst::StandardCompositionFst(const fst::StdFst &fstA, const fst::StdFst &fstB, const SymbolTable &symbols) 39 | : StandardCompositionFst(fstA, fstB, symbols, AlignerOptions()) {} // Use default AlignerOptions 40 | 41 | StandardCompositionFst::StandardCompositionFst(const fst::StdFst &fstA, const fst::StdFst &fstB, const SymbolTable &symbols, const AlignerOptions& options) 42 | : strict_punctuation_(options.strict_punctuation), 43 | punctuation_ids_(options.punctuation_ids), 44 | symbols_(symbols), // Initialize member reference 45 | use_favored_substitutions_(options.use_favored_substitutions), 46 | favored_substitution_cost_(options.favored_substitution_cost), 47 | favorable_substitution_map_(options.favorable_substitution_map) 48 | { 49 | auto logger_ = logger::GetOrCreateLogger("StandardCompositionFst"); 50 | logger_->set_level(spdlog::level::info); 51 | 52 | logger_->info("Starting Standard Composition. 
fstA.Start: {}, fstB.Start: {}", fstA.Start(), fstB.Start()); 53 | 54 | FstAlignOption fst_options; // Assuming this defines symSub, symDel, symIns strings 55 | auto sub_label_id_ = symbols_.Find(fst_options.symSub); 56 | auto del_label_id_ = symbols_.Find(fst_options.symDel); 57 | auto ins_label_id_ = symbols_.Find(fst_options.symIns); 58 | if (sub_label_id_ == fst::kNoSymbol || del_label_id_ == fst::kNoSymbol || ins_label_id_ == fst::kNoSymbol) { 59 | logger_->error("Could not find special edit symbols ('{}', '{}', '{}') in the symbol table.", fst_options.symSub, fst_options.symDel, fst_options.symIns); 60 | throw std::runtime_error("Missing special symbols in symbol table."); 61 | } 62 | 63 | // --- Create Edit FSTs (halfEdit1, halfEdit2) --- 64 | StdVectorFst halfEdit1; 65 | StdVectorFst halfEdit2; 66 | halfEdit1.SetInputSymbols(&symbols_); 67 | halfEdit1.SetOutputSymbols(&symbols_); 68 | halfEdit1.AddState(); 69 | halfEdit1.SetStart(0); 70 | halfEdit1.SetFinal(0, TropicalWeight::One()); 71 | halfEdit1.AddArc(0, StdArc(0, ins_label_id_, insertion_cost / 2, 0)); // eps:ins 72 | 73 | halfEdit2.SetInputSymbols(&symbols_); 74 | halfEdit2.SetOutputSymbols(&symbols_); 75 | halfEdit2.AddState(); 76 | halfEdit2.SetStart(0); 77 | halfEdit2.SetFinal(0, TropicalWeight::One()); 78 | halfEdit2.AddArc(0, StdArc(del_label_id_, 0, deletion_cost / 2, 0)); // del:eps 79 | 80 | for (SymbolTableIterator siter(symbols_); !siter.Done(); siter.Next()) { 81 | int64_t sid = siter.Value(); 82 | if (sid == 0 || sid == ins_label_id_ || sid == del_label_id_ || sid == sub_label_id_) { 83 | continue; 84 | } 85 | 86 | auto sym_tk = symbols_.Find(sid); 87 | bool isClassLabel = isEntityLabel(sym_tk); 88 | // Simplified: Check for entity labels if needed (same as develop version) 89 | // bool isClassLabel = false; // isEntityLabel(symbols_.Find(sid)); in develop 90 | if (isClassLabel) { 91 | // Same handling as in develop version 92 | logger_->info("Token class label found for {}", 
symbols.Find(sid)); 93 | halfEdit1.AddArc(0, StdArc(sid, sid, 0, 0)); 94 | halfEdit1.AddArc(0, StdArc(sid, del_label_id_, -deletion_cost / 2, 0)); 95 | halfEdit2.AddArc(0, StdArc(sid, sid, 0, 0)); 96 | } else { 97 | // Standard symbol edits - exactly as in develop version 98 | halfEdit1.AddArc(0, StdArc(sid, sid, 0, 0)); // id:id 99 | halfEdit1.AddArc(0, StdArc(sid, sub_label_id_, substitution_cost / 2, 0)); // id:sub 100 | halfEdit1.AddArc(0, StdArc(sid, del_label_id_, deletion_cost / 2, 0)); // id:del 101 | 102 | halfEdit2.AddArc(0, StdArc(sid, sid, 0, 0)); // id:id 103 | halfEdit2.AddArc(0, StdArc(sub_label_id_, sid, substitution_cost / 2, 0)); // sub:id 104 | halfEdit2.AddArc(0, StdArc(ins_label_id_, sid, insertion_cost / 2, 0)); // ins:id 105 | } 106 | } 107 | 108 | logger_->info("Created halfEdit FSTs with self-loops"); 109 | 110 | // Step 1: Determinize the reference FST 111 | StdVectorFst detRefFst; 112 | Determinize(fstA, &detRefFst); 113 | logger_->info("Determinized fstA has {} states", detRefFst.NumStates()); 114 | 115 | // Step 2: Compose the first half 116 | StdVectorFst halfCompose1; 117 | logger_->info("Composing detRefFst o halfEdit1"); 118 | Compose(detRefFst, halfEdit1, &halfCompose1); 119 | logger_->debug("halfCompose1 has {} states", halfCompose1.NumStates()); 120 | 121 | // Check if first composition worked 122 | if (halfCompose1.NumStates() == 0) { 123 | logger_->warn("halfCompose1 (ref o edits) produced an FST with 0 states"); 124 | logger_->warn("halEdit1 was:"); 125 | printFst("fstalign", &halfEdit1, &symbols_); 126 | return; 127 | } 128 | 129 | // Sort for composition 130 | ArcSort(&halfCompose1, fst::StdOLabelCompare()); 131 | 132 | if (halfCompose1.NumStates() < 100) { 133 | printFst("fstalign", &halfCompose1, &symbols_); 134 | } 135 | 136 | // Step 3: Compose the second half 137 | StdVectorFst halfCompose2; 138 | logger_->info("Composing halfEdit2 o fstB"); 139 | Compose(halfEdit2, fstB, &halfCompose2); 140 | 
// Start state of the lazily-composed edit FST.
StateId StandardCompositionFst::Start() {
  return fstC_->Start();
}

// Final weight of `stateId` in the composed FST.
fst::Fst::Weight StandardCompositionFst::Final(StateId stateId) {
  return fstC_->Final(stateId);
}

// Appends every outgoing arc of `fromStateId` to *out_vector and returns true.
// NOTE(review): the vector is deliberately NOT cleared first (clear() is
// commented out) — callers appear to rely on append semantics; confirm.
bool StandardCompositionFst::TryGetArcsAtState(StateId fromStateId, vector *out_vector) {
  assert(out_vector != NULL);
  // out_vector->clear();

  for (ArcIterator aiter(*fstC_, fromStateId); !aiter.Done(); aiter.Next()) {
    const fst::StdArc &arc = aiter.Value();
    out_vector->push_back(arc);
  }

  return true;
}

// fstC_ is a unique_ptr, so no manual cleanup is needed.
StandardCompositionFst::~StandardCompositionFst() {}

// Materializes the lazy composition into a StdVectorFst (full, non-lazy
// composition) and writes it, with both symbol tables, to `debug_filename`.
// Only use on *small* graphs — see the warning in StandardComposition.h.
void StandardCompositionFst::DebugComposedGraph(string debug_filename) {
  StdVectorFst composedFst(*fstC_);
  ofstream outfile(debug_filename);
  FstWriteOptions wopts;
  composedFst.SetInputSymbols(&symbols_);
  composedFst.SetOutputSymbols(&symbols_);
  wopts.write_isymbols = true;
  wopts.write_osymbols = true;
  wopts.write_header = true;
  composedFst.Write(outfile, wopts);
}
/*
 *
 * SynonymEngine.h
 *
 * JP Robichaud (jp@rev.com)
 * 2018
 *
 */
#ifndef __SYN_ENGINE_H
#define __SYN_ENGINE_H
#include 

#include "utilities.h"

using namespace std;
using namespace fst;

// Left-hand side of a synonym rule: a token sequence to match.
typedef vector SynKey;
// Right-hand side of a synonym rule: the alternative token sequences.
typedef vector> SynVals;

struct SynonymOptions {
  bool disable_cutoffs = false;        // NOTE(review): semantics inferred from name — disables cutoff-word synonyms; confirm in SynonymEngine.cpp
  bool disable_hyphen_ignore = false;  // NOTE(review): presumably disables hyphen-variant handling; confirm
};

// Parses synonym rewrite rules (see docs/Synonyms-Format.md) and applies them
// to an FST so equivalent token sequences can align at no/low cost.
class SynonymEngine {
 public:
  SynonymEngine(SynonymOptions syn_opts);

  // Reads rule lines from `filename` and parses them into `synonyms`.
  void LoadFile(string filename);
  SynKey GetKeyFromString(string lhs);
  SynVals GetValuesFromStrings(string rhs);
  void ParseStrings(vector lines);
  // NOTE(review): presumably adds synonym arcs to `fst`, extending `symbol`
  // with any new tokens — confirm against SynonymEngine.cpp.
  void ApplyToFst(StdVectorFst &fst, SymbolTable &symbol);
  void GenerateSynFromSymbolTable(SymbolTable &symbol);

 protected:
  SynonymOptions opts_;
  map synonyms;  // parsed rules, keyed by LHS token sequence
  std::shared_ptr logger_;
};

#endif  // __SYN_ENGINE_H
// Returns the smallest of the three values.
// The int& signature is kept for compatibility with existing call sites in
// this file; the hand-rolled comparison chain is replaced with the
// standard-library idiom (equivalent result, including ties).
int min(int &a, int &b, int &c) {
  return std::min(a, std::min(b, c));
}
// Computes the Levenshtein (edit) distance between seqA and seqB and fills
// mapA/mapB with per-token flags: 1 = token matched its counterpart during
// the best alignment, -1 = token took part in an error (sub/ins/del).
// Memory-hungry: every DP row is kept in all_distances so the backtracking
// pass can re-read it — on the order of seqA.size()*seqB.size()*sizeof(int).
int GetEditDistance(std::vector &seqA, std::vector &mapA, std::vector &seqB, std::vector &mapB) {
  int lengthA = seqA.size();
  int lengthB = seqB.size();

  if (lengthA > lengthB) {
    // make sure seqA is always the shortest
    return GetEditDistance(seqB, mapB, seqA, mapA);
  }

  // Initialize both maps to -1 ("error"); positions are flipped to 1 when a
  // match is found during backtracking.
  mapA.reserve(seqA.size());
  mapA.resize(seqA.size(), -1);  // resize() sets all position to the given value, -1
  mapB.reserve(seqB.size());
  mapB.resize(seqB.size(), -1);

  // Degenerate cases: distance to an empty sequence is the other's length.
  if (seqA.size() == 0) {
    return seqB.size();
  } else if (seqB.size() == 0) {
    return seqA.size();
  }

  // let's keep two rows, dig all the way to the end to get the distance, then
  // let's try to backtrack and get the edits from there, recomputing the rows again

  // NOTE(review): variable-length arrays are a gcc/clang extension in C++.
  // `distance` is the DP row being computed, `distancePrev` the row above it.
  int distance[lengthA + 1];
  int distancePrev[lengthA + 1];
  for (int i = 0; i <= lengthA; ++i) {
    distance[i] = i;
  }

  // TODO: we should maybe optimize this to be a 2D uint array?
  std::vector> all_distances;

#if debug_map
  print_vect(std::string("seqA: "), seqA.data(), lengthA);
  print_vect(std::string("seqB: "), seqB.data(), lengthB);
#endif
  // Forward pass: classic row-by-row Levenshtein; each row is saved into
  // all_distances before being overwritten so backtracking can re-read it.
  for (int j = 1; j <= lengthB; ++j) {
    all_distances.push_back(std::vector(distance, distance + lengthA + 1));
    // for (int x = 0; x <= lengthA; ++x) {
    //   distancePrev[x] = distance[x];
    // }

    std::copy(distance, distance + lengthA + 1, distancePrev);

#if debug_map
    print_vect(std::string("d: "), distance, lengthA + 1);
#endif

    int prev_diag = distance[0], prev_diag_save;
    ++distance[0];

    for (int i = 1; i <= lengthA; ++i) {
      prev_diag_save = distance[i];
      if (seqA[i - 1] == seqB[j - 1]) {
        distance[i] = prev_diag;
      } else {
        // cheapest of west (deletion) / north (insertion) / diag (sub), +1
        distance[i] = min(distance[i - 1], distance[i], prev_diag) + 1;
      }
      prev_diag = prev_diag_save;
    }
  }
  all_distances.push_back(std::vector(distance, distance + lengthA + 1));
#if debug_map
  print_vect(std::string("d: "), distance, lengthA + 1);
#endif

  int edit_distance = distance[lengthA];

  // now, we want to backtrack the computation and trace, row, by row,
  // the path

  int current_pos = lengthA;
  int seqB_track = lengthB;

  while (current_pos > 0 && seqB_track >= 0) {
    int current_pos_score = distance[current_pos];
#if debug_map
    std::cout << "starting iter" << std::endl;
    std::cout << "current_pos = " << current_pos << std::endl;
    std::cout << "current_pos_score = " << current_pos_score << std::endl;
    std::cout << "seqB_track = " << seqB_track << std::endl;
    print_vect(std::string("mapA:"), mapA.data(), lengthA);
    print_vect(std::string("mapB:"), mapB.data(), lengthB);
    print_vect(std::string("dP: "), distancePrev, lengthA + 1);
    print_vect(std::string("d : "), distance, lengthA + 1);
#endif

    int token_a = seqA[current_pos - 1];
    int token_b = seqB[seqB_track - 1];

    // DP-matrix neighbours of the current cell: north-west, north, west.
    int nw = distancePrev[current_pos - 1];
    int n = distancePrev[current_pos];
    int w = distance[current_pos - 1];

    bool is_sub = false;
    bool is_match = false;
    bool is_del = false;
    bool is_ins = false;

#if debug_map
    std::cout << "checking " << token_a << " vs " << token_b << std::endl;
    std::cout << "nw = " << nw << std::endl;
    std::cout << "n = " << n << std::endl;
    std::cout << "w = " << w << std::endl;
#endif

    int min_path_score = min(nw, n, w);
    if (min_path_score == nw) {
      // the upper-left diagonal is the best path
      is_sub = true;
      if (token_a == token_b) {
        // we have a caracter match
        mapA[current_pos - 1] = 1;
        mapB[seqB_track - 1] = 1;
        is_match = true;
        is_sub = false;
      }
#if debug_map
      std::cout << "S(" << token_a << "|" << token_b << ") (matched " << is_match << "), (sub " << is_sub << ")"
                << std::endl;
#endif
      // going up-left, next time we need to be one position on the left
      // to read the distance
      current_pos--;
      seqB_track--;
    } else if (min_path_score == w) {
      // this is a deletion, going left
      current_pos--;
      is_del = true;
#if debug_map
      std::cout << "D(" << token_a << ")" << std::endl;
#endif
    } else {
      // this is an insertion, going north doesn't
      // change the current_position we read in the distance vector
      seqB_track--;
      is_ins = true;
#if debug_map
      std::cout << "I(" << token_b << ")" << std::endl;
#endif
    }

    if (current_pos < 0 || seqB_track == 0) {
      // We reach to a point where any position left
      // are errors (either insertions or deletions).
      // There's no point in analyzing these values.
      break;
    }

    // ok, we stop reflecting on the best path to take, now we need to update
    // the distance vectors. Step 1, distancePrev becomes the new distance
    // now, we only have to go up to current_pos, because everything on the
    // right will get ignored. Actually, we only have to compute the two values
    // above

    for (int x = 0; x <= lengthA; x++) {
      distance[x] = distancePrev[x];
    }

    // Step 2, let's look at the row above the one we are now.
    // NOTE(review): the `if (false)` branch below is dead code (the condition
    // is literally false) — an in-place row recomputation that was abandoned;
    // the live path simply restores the saved row from all_distances.
    if (false) {
      token_b = seqB[seqB_track - 1];
      int token_b_prime = seqB[seqB_track - 2];
      for (int x = current_pos; x > 0; --x) {
        token_a = seqA[x - 1];

        if (token_a == token_b) {
          distancePrev[x - 1] = distance[x];
          distancePrev[x] = distance[x] + 1;
        } else {
          distancePrev[x - 1] = distance[x - 1] - 1;
          distancePrev[x] = distance[x] - 1;
        }
      }

    } else {
      auto v = all_distances[seqB_track - 1];
      for (int j = 0; j <= lengthA; ++j) {
        distancePrev[j] = v[j];
      }
    }
  }

#if debug_map
  std::cout << "starting iter" << std::endl;
  std::cout << "current_pos = " << current_pos << std::endl;
  std::cout << "current_pos_score = 0" << std::endl;
  std::cout << "seqB_track = " << seqB_track << std::endl;
  print_vect(std::string("mapA:"), mapA.data(), lengthA);
  print_vect(std::string("mapB:"), mapB.data(), lengthB);
  print_vect(std::string("dP: "), distancePrev, lengthA + 1);
  print_vect(std::string("d : "), distance, lengthA + 1);
#endif

  return edit_distance;
}
// Returns true if `map` (entries: 1 = token matched, <= 0 = error, as filled
// by GetEditDistance) contains a run of consecutive errors strictly longer
// than `streak_cutoff`.
//
// Fixes two defects in the previous version:
//  - a streak was only evaluated when it was terminated by a match, so an
//    error streak at the very END of the map was never detected;
//  - `seq_cnt` was declared but never used (removed), and the loop index was
//    a signed int compared against map.size().
bool MapContainsErrorStreaks(std::vector<int> map, int streak_cutoff) {
  int bad_match_seq_cnt = 0;  // length of the current run of errors
  for (size_t x = 0; x < map.size(); x++) {
    if (map[x] <= 0) {
      bad_match_seq_cnt++;
      // Report as soon as the run exceeds the cutoff; this also catches a
      // trailing run that is never followed by a match.
      if (bad_match_seq_cnt > streak_cutoff) {
        return true;
      }
    } else {
      bad_match_seq_cnt = 0;
    }
  }
  return false;
}
// Represent information associated with a reference or hypothesis token
struct Token {
  string token;           // surface form of the token
  float start_ts=0.0;     // start timestamp; presumably seconds (CTM convention) — confirm at fill site
  float end_ts=0.0;       // end timestamp, same unit as start_ts
  float duration=0.0;     // token duration
  float confidence=-1.0;  // recognizer confidence; -1.0 means "not available"
  string speaker;         // speaker id/label; empty when unknown
};
// Bag of knobs controlling alignment/composition behavior, passed through
// HandleWer/HandleAlign down to the composition classes.
struct AlignerOptions {
  int speaker_switch_context_size;  // NOTE(review): no default — callers must initialize
  int numBests = 20;                // number of best paths to extract
  int heapPruningTarget = 20;       // target heap size for PathHeap pruning
  int pr_threshold = 0;
  string symbols_filename = "";
  string composition_approach = "adapted";  // presumably "adapted" vs "standard" composition — confirm in main.cpp
  bool record_case_stats;           // NOTE(review): no default — callers must initialize
  bool levenstein_first_pass = false;
  int levenstein_maximum_error_streak = 100;
  float relative_beam_width = 50.0;  // beam for PathHeap::prune_relative
  bool strict_punctuation = true;
  std::unordered_set punctuation_ids;
  // Favored substitutions
  bool use_favored_substitutions = true;
  float favored_substitution_cost = 0.1f;
  std::vector favorable_substitution_map;  // Map ID -> favored partner ID (-1 if none)
};
return instance; 25 | } 26 | 27 | JsonLogger(JsonLogger const&) = delete; 28 | void operator=(JsonLogger const&) = delete; 29 | }; 30 | 31 | } // namespace jsonLogger 32 | 33 | #endif // __JSONLOGGING_H__ 34 | -------------------------------------------------------------------------------- /src/logging.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | logging.cpp 3 | JP Robichaud (jp@rev.com) 4 | 2018 5 | 6 | */ 7 | 8 | #include "logging.h" 9 | 10 | #include 11 | #include 12 | #include 13 | 14 | namespace logger { 15 | 16 | std::string CONSOLE_LOGGER_NAME = "console"; 17 | 18 | std::vector sinks; 19 | 20 | void InitLoggers(std::string logfilename) { 21 | sinks.push_back(std::make_shared()); 22 | if (logfilename.size() > 0) { 23 | auto filesink = std::make_shared(logfilename); 24 | sinks.push_back(filesink); 25 | } 26 | 27 | spdlog::set_level(spdlog::level::info); 28 | auto console = std::make_shared(CONSOLE_LOGGER_NAME, begin(sinks), end(sinks)); 29 | spdlog::register_logger(console); 30 | 31 | // auto console = spd::stdout_color_mt(CONSOLE_LOGGER_NAME); 32 | console->info("loggers initialized"); 33 | console->flush_on(spdlog::level::info); 34 | // in general, we can have the utc offset, but for stdout, let's be lean a bit 35 | spdlog::set_pattern("[%^+++%$] [%H:%M:%S %z] [thread %t] [%n] %v"); 36 | console->set_pattern("[%^+++%$] [%H:%M:%S] [%n] %v"); 37 | 38 | // todo : define extra loggers for individual components and read their levels 39 | // from a trc.cfg 40 | } 41 | 42 | std::shared_ptr GetOrCreateLogger(std::string name) { 43 | auto log = spdlog::get(name); 44 | 45 | if (log == nullptr) { 46 | // since we'll go to stdout, we'll avoid the utc offset 47 | // log = spdlog::stdout_color_mt(name); 48 | log = std::make_shared(name, begin(sinks), end(sinks)); 49 | spdlog::register_logger(log); 50 | log->flush_on(spdlog::level::info); 51 | log->set_pattern("[%^+++%$] [%H:%M:%S] [%n] %v"); 52 | } 53 | 54 | return log; 
55 | } 56 | 57 | std::shared_ptr GetLogger(std::string name) { 58 | auto log = spdlog::get(name); 59 | 60 | if (log == nullptr) { 61 | log = spdlog::get(CONSOLE_LOGGER_NAME); 62 | log->error( 63 | "The requested logger name [{}] wasn't found in the registery, using " 64 | "[{}] instead", 65 | name, CONSOLE_LOGGER_NAME); 66 | } 67 | 68 | return log; 69 | } 70 | 71 | void CloseLoggers() { 72 | // closing everything 73 | spdlog::drop_all(); 74 | } 75 | } // namespace logger 76 | -------------------------------------------------------------------------------- /src/logging.h: -------------------------------------------------------------------------------- 1 | /* 2 | logging.h 3 | JP Robichaud (jp@rev.com) 4 | 2018 5 | 6 | */ 7 | 8 | #ifndef __LOGGING_H__ 9 | #define __LOGGING_H__ 10 | 11 | #include 12 | 13 | #define HERE_FMT "{}:{:d}: " 14 | #define HERE2 __FILE__, __LINE__ 15 | #define HEREF2 __FUNCTION__, __LINE__ 16 | 17 | namespace logger { 18 | 19 | namespace spd = spdlog; 20 | 21 | void InitLoggers(std::string logfilename); 22 | std::shared_ptr GetOrCreateLogger(std::string name); 23 | std::shared_ptr GetLogger(std::string name); 24 | void CloseLoggers(); 25 | } // namespace logger 26 | 27 | #endif // __LOGGING_H__ 28 | -------------------------------------------------------------------------------- /src/utilities.cpp: -------------------------------------------------------------------------------- 1 | 2 | #include "utilities.h" 3 | 4 | #include 5 | 6 | // controlling the graph 7 | const std::string EPSILON = ""; 8 | const std::string INS = ""; 9 | const std::string DEL = ""; 10 | 11 | // for the sticthing process 12 | const std::string TK_GLOBAL_CLASS = "global"; 13 | 14 | // for fallback deletions 15 | const std::string NOOP = "!!noop-token!!"; 16 | 17 | void printFst(const fst::StdFst *fst, const fst::SymbolTable *symbol) { printFst("console", fst, symbol); } 18 | 19 | void printFst(std::string loggerName, const fst::StdFst *fst, const fst::SymbolTable *symbol) 
{ 20 | auto log = logger::GetOrCreateLogger(loggerName); 21 | if (log->should_log(spdlog::level::info)) { 22 | for (fst::StateIterator siter(*fst); !siter.Done(); siter.Next()) { 23 | fst::StdFst::StateId stateId = siter.Value(); 24 | float end_state_weight = fst->Final(stateId).Value(); 25 | 26 | for (fst::ArcIterator aiter(*fst, stateId); !aiter.Done(); aiter.Next()) { 27 | const fst::StdArc &arc = aiter.Value(); 28 | 29 | std::stringstream ss; 30 | std::stringstream ss1; 31 | ss << arc.ilabel << "/" << symbol->Find(arc.ilabel); 32 | std::string ilabel = ss.str(); 33 | 34 | ss1 << arc.olabel << "/" << symbol->Find(arc.olabel); 35 | std::string olabel = ss1.str(); 36 | 37 | log->info("{}\t{}\t{}\t{}\t{}", stateId, arc.nextstate, ilabel, olabel, arc.weight.Value()); 38 | } 39 | 40 | if (end_state_weight != numeric_limits::infinity() && end_state_weight != 0) { 41 | log->info("{}", stateId); 42 | } 43 | } 44 | } 45 | } 46 | 47 | template 48 | void splitString(const std::string &str, char delimiter, StringFunction f) { 49 | std::size_t from = 0; 50 | for (std::size_t i = 0; i < str.size(); ++i) { 51 | if (str[i] == delimiter) { 52 | f(str, from, i); 53 | from = i + 1; 54 | } 55 | } 56 | if (from <= str.size()) { 57 | f(str, from, str.size()); 58 | } 59 | } 60 | 61 | struct iequal { 62 | bool operator()(int c1, int c2) const { return std::toupper(c1) == std::toupper(c2); } 63 | }; 64 | 65 | bool iequals(const std::string &str1, const std::string &str2) { 66 | if (str1.size() != str2.size()) { 67 | return false; 68 | } 69 | 70 | if (str1 == str2) { 71 | return true; 72 | } 73 | 74 | return std::equal(str1.begin(), str1.end(), str2.begin(), iequal()); 75 | } 76 | 77 | // trim from start (in place) 78 | void ltrim(std::string &s) { 79 | s.erase(s.begin(), std::find_if(s.begin(), s.end(), [](int ch) { return !std::isspace(ch); })); 80 | } 81 | 82 | // trim from end (in place) 83 | void rtrim(std::string &s) { 84 | s.erase(std::find_if(s.rbegin(), s.rend(), [](int ch) { 
return !std::isspace(ch); }).base(), s.end()); 85 | } 86 | 87 | // trim from both ends (in place) 88 | void trim(std::string &s) { 89 | ltrim(s); 90 | rtrim(s); 91 | } 92 | 93 | // trim from both ends (copying) 94 | std::string trim_copy(std::string s) { 95 | trim(s); 96 | return s; 97 | } 98 | 99 | template 100 | Iter splitStringIter(const std::string &s, const std::string &delim, Iter out) { 101 | if (delim.empty()) { 102 | *out++ = s; 103 | return out; 104 | } 105 | size_t a = 0, b = s.find(delim); 106 | for (; b != std::string::npos; a = b + delim.length(), b = s.find(delim, a)) { 107 | *out++ = std::move(s.substr(a, b - a)); 108 | } 109 | *out++ = std::move(s.substr(a, s.length() - a)); 110 | return out; 111 | } 112 | 113 | std::string string_join(const std::vector &elements, const char *const separator) { 114 | switch (elements.size()) { 115 | case 0: 116 | return ""; 117 | case 1: 118 | return elements[0]; 119 | default: 120 | std::ostringstream os; 121 | std::copy(elements.begin(), elements.end() - 1, std::ostream_iterator(os, separator)); 122 | os << *elements.rbegin(); 123 | return os.str(); 124 | } 125 | } 126 | 127 | bool isValidNgram(const string &token) { 128 | if ((token.find(INS) != string::npos) || (token.find(DEL) != string::npos) || (token.find(EPSILON) != string::npos) || 129 | (token.find("___") != string::npos)) { 130 | return false; 131 | } else { 132 | return true; 133 | } 134 | } 135 | 136 | unordered_set get_bigrams(wer_alignment &topAlignment) { 137 | string bigram_ref = ""; 138 | string bigram_hyp = ""; 139 | unordered_set all_bigrams; 140 | vector bi_words; 141 | 142 | // Create a list of all tokens, flattening entity tokens 143 | vector> flattened_tokens; 144 | for (auto &tokens : topAlignment.tokens) { 145 | // handle entity labels 146 | if (isEntityLabel(tokens.first)) { 147 | auto class_label = tokens.first; 148 | 149 | for (auto &label_alignment : topAlignment.label_alignments) { 150 | if (label_alignment.classLabel == class_label) 
{ 151 | for (auto &labelTokens : label_alignment.tokens) { 152 | flattened_tokens.push_back(labelTokens); 153 | } 154 | } 155 | } 156 | } else { 157 | flattened_tokens.push_back(tokens); 158 | } 159 | } 160 | 161 | for (auto it = flattened_tokens.begin(); it != std::prev(flattened_tokens.end()); ++it) { 162 | bi_words = {it->first, std::next(it)->first}; 163 | bigram_ref = string_join(bi_words, " "); 164 | // cout << it - topAlignment->tokens.begin() << " : "<< bigram_ref << " (" << it->first << " " << 165 | // std::next(it)->first <<" )" << endl; 166 | if (isValidNgram(bigram_ref)) { 167 | topAlignment.ref_bigrams[bigram_ref] += 1; 168 | all_bigrams.insert(bigram_ref); 169 | } 170 | bi_words = {it->second, std::next(it)->second}; 171 | bigram_hyp = string_join(bi_words, " "); 172 | // cout << it - topAlignment->tokens.begin() << " : "<< bigram_hyp << " (" << it->second << " " << 173 | // std::next(it)->second <<" )" << endl; 174 | if (isValidNgram(bigram_hyp)) { 175 | topAlignment.hyp_bigrams[bigram_hyp] += 1; 176 | all_bigrams.insert(bigram_hyp); 177 | } 178 | 179 | topAlignment.bigram_tokens.push_back(std::make_pair(bigram_ref, bigram_hyp)); 180 | } 181 | return all_bigrams; 182 | } 183 | 184 | bool isEntityLabel(const string &token) { return token.find("___") == 0 ? true : false; } 185 | 186 | bool isSynonymLabel(const string &token) { 187 | // return token.find("___SYN-") == 0 ? true : false; 188 | return (token.find("___") == 0 && token.find("_SYN_") != std::string::npos) ? 
true : false; 189 | } 190 | 191 | bool IsNoisecodeToken(const string &token) { return token.find("<") == 0 && token.find(">") == token.length() - 1; } 192 | 193 | string getLabelIdFromToken(const string &token) { 194 | if (!isEntityLabel(token)) { 195 | return ""; 196 | } 197 | // Example label: ___0_CONTRACTION___ 198 | 199 | // Trim the ___ from the start and end of the label string 200 | auto label_id = token.substr(3, token.size() - 6); 201 | 202 | // Isolate the ID at the start of the label, separated by _ 203 | int p = label_id.find("_"); 204 | if (p > 0) { 205 | label_id = label_id.substr(0, p); 206 | } 207 | 208 | return label_id; 209 | } 210 | 211 | std::string GetEnv(const std::string &var, const std::string default_value) { 212 | const char *val = std::getenv(var.c_str()); 213 | if (val == nullptr) { // invalid to assign nullptr to std::string 214 | return default_value; 215 | } else { 216 | return val; 217 | } 218 | } 219 | 220 | // going from ___23_ORDINAL___ to ORDINAL 221 | string GetLabelNameFromClassLabel(string classLabel) { 222 | string label_id = classLabel.substr(3, classLabel.size() - 6); 223 | string label = label_id.substr(label_id.find("_") + 1); 224 | return label; 225 | } 226 | 227 | string GetClassLabel(string best_label) { 228 | if (best_label == "") { 229 | return ""; 230 | } 231 | 232 | string classlabel = string("___" + best_label + "___"); 233 | std::replace(classlabel.begin(), classlabel.end(), ':', '_'); 234 | return classlabel; 235 | } 236 | 237 | string UnicodeLowercase(string token) { 238 | icu::UnicodeString utoken = icu::UnicodeString::fromUTF8(token); 239 | std::string lower_cased; 240 | utoken.toLower().toUTF8String(lower_cased); 241 | return lower_cased; 242 | } 243 | 244 | bool EndsWithCaseInsensitive(const string &value, const string &ending) { 245 | if (ending.size() > value.size()) { 246 | return false; 247 | } 248 | return equal(ending.rbegin(), ending.rend(), value.rbegin(), 249 | [](const char a, const char b) { 
return tolower(a) == tolower(b); }); 250 | } 251 | -------------------------------------------------------------------------------- /src/utilities.h: -------------------------------------------------------------------------------- 1 | /* 2 | * utilities.h 3 | * 4 | * Created on: 2018-04-23 5 | * Author: JP Robichaud (jp@rev.com) 6 | */ 7 | 8 | #ifndef UTILITIES_H_ 9 | #define UTILITIES_H_ 10 | 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | 27 | #include 28 | 29 | #include "logging.h" 30 | // #include "FstLoader.h" 31 | 32 | #define quote(x) #x 33 | 34 | using namespace std; 35 | using namespace fst; 36 | 37 | extern const string EPSILON; // = "ε"; 38 | extern const string INS; // = "ε"; 39 | extern const string DEL; // = "ε"; 40 | 41 | extern const string TK_GLOBAL_CLASS; // "global"; 42 | extern const string NOOP; 43 | 44 | typedef struct wer_alignment wer_alignment; 45 | typedef struct wer_alignment *WERAp; 46 | typedef shared_ptr spWERA; 47 | 48 | typedef float precision_t; 49 | typedef float recall_t; 50 | 51 | typedef unordered_map bigrams; 52 | 53 | typedef struct gram_error_counter { 54 | int correct = 0; 55 | int del = 0; 56 | int subst_fp = 0; 57 | int subst_fn = 0; 58 | int ins = 0; 59 | precision_t precision = 0.0f; 60 | recall_t recall = 0.0f; 61 | gram_error_counter(int c, int d, int sfp, int sfn, int i) : correct(c), del(d), subst_fp(sfp), subst_fn(sfn), ins(i) {} 62 | } gram_error_counter; 63 | 64 | struct wer_alignment { 65 | string classLabel; 66 | // int numErrors; 67 | int insertions = 0; 68 | int deletions = 0; 69 | int substitutions = 0; 70 | int numWordsInReference = 0; 71 | int numWordsInHypothesis = 0; 72 | 73 | vector ref_words; 74 | vector hyp_words; 75 | 76 | // we could perhaps get rid of these using or in sub_words 77 | vector del_words; 78 | vector ins_words; 79 | 
vector> sub_words; 80 | 81 | precision_t precision; 82 | recall_t recall; 83 | // map> unigram_stats; 84 | vector> unigram_stats; 85 | 86 | // map, pair> 87 | vector> bigrams_stats; 88 | bigrams ref_bigrams; 89 | bigrams hyp_bigrams; 90 | vector> bigram_tokens; 91 | 92 | vector> tokens; 93 | vector label_alignments; 94 | int NumErrors() { return insertions + substitutions + deletions; } 95 | /* can return infinity if numWordsInReference == 0 and numWordsInHypothesis > 0 */ 96 | float WER() const { 97 | if (numWordsInReference > 0) { 98 | return (float)(insertions + deletions + substitutions) / (float)numWordsInReference; 99 | } 100 | 101 | if (numWordsInHypothesis > 0) { 102 | return numeric_limits::infinity(); 103 | } 104 | 105 | return 0; 106 | } 107 | 108 | void Reverse() { 109 | std::reverse(ref_words.begin(), ref_words.end()); 110 | std::reverse(hyp_words.begin(), hyp_words.end()); 111 | std::reverse(ins_words.begin(), ins_words.end()); 112 | std::reverse(del_words.begin(), del_words.end()); 113 | std::reverse(sub_words.begin(), sub_words.end()); 114 | std::reverse(tokens.begin(), tokens.end()); 115 | for (auto &a : label_alignments) { 116 | a.Reverse(); 117 | } 118 | } 119 | }; 120 | 121 | struct FstAlignOption { 122 | bool bForceEnterAndExit; 123 | 124 | float corCost; 125 | float insCost; 126 | float delCost; 127 | float subCost; 128 | 129 | string symEps; 130 | string symOov; 131 | string symIns; 132 | string symDel; 133 | string symSub; 134 | string symInaud; 135 | string symSil; 136 | string symUnk; 137 | 138 | int eps_idx; 139 | int oov_idx; 140 | int ins_idx; 141 | int del_idx; 142 | int sub_idx; 143 | int inaud_idx; 144 | int sil_idx; 145 | int unk_idx; 146 | 147 | FstAlignOption() 148 | : bForceEnterAndExit(false), 149 | corCost(0.0f), 150 | insCost(3.0f), 151 | delCost(3.0f), 152 | subCost(4.0f), 153 | symEps(""), 154 | symOov(""), 155 | symIns(""), 156 | symDel(""), 157 | symSub(""), 158 | symInaud(""), 159 | symSil(""), 160 | symUnk("") {} 161 | 
162 | void RegisterSymbols(fst::SymbolTable &symbol) { 163 | // int noSym = fst::kNoSymbol; 164 | int noSym = -1; 165 | eps_idx = symbol.Find(symEps); 166 | if (eps_idx == noSym) { 167 | eps_idx = symbol.AddSymbol(symEps); 168 | } 169 | 170 | oov_idx = symbol.Find(symOov); 171 | if (oov_idx == noSym) { 172 | oov_idx = symbol.AddSymbol(symOov); 173 | } 174 | 175 | ins_idx = symbol.Find(symIns); 176 | if (ins_idx == noSym) { 177 | ins_idx = symbol.AddSymbol(symIns); 178 | } 179 | 180 | del_idx = symbol.Find(symDel); 181 | if (del_idx == noSym) { 182 | del_idx = symbol.AddSymbol(symDel); 183 | } 184 | 185 | sub_idx = symbol.Find(symSub); 186 | if (sub_idx == noSym) { 187 | sub_idx = symbol.AddSymbol(symSub); 188 | } 189 | 190 | inaud_idx = symbol.Find(symInaud); 191 | if (inaud_idx == noSym) { 192 | inaud_idx = symbol.AddSymbol(symInaud); 193 | } 194 | 195 | sil_idx = symbol.Find(symSil); 196 | if (sil_idx == noSym) { 197 | sil_idx = symbol.AddSymbol(symSil); 198 | } 199 | 200 | unk_idx = symbol.Find(symUnk); 201 | if (unk_idx == noSym) { 202 | unk_idx = symbol.AddSymbol(symUnk); 203 | } 204 | } 205 | }; 206 | /* printing FST on the console (or the specified logger) */ 207 | void printFst(const fst::StdFst *fst, const fst::SymbolTable *symbol); 208 | void printFst(string loggerName, const fst::StdFst *fst, const fst::SymbolTable *symbol); 209 | 210 | // from StackOverflow : nice way to get a function call when delimiters on a 211 | // string are matched 212 | template 213 | void splitString(const string &str, char delimiter, StringFunction f); 214 | 215 | bool EndsWithCaseInsensitive(const string &value, const string &ending); 216 | bool iequals(const std::string &, const std::string &); 217 | 218 | // string manip 219 | void ltrim(std::string &s); 220 | void rtrim(std::string &s); 221 | void trim(std::string &s); 222 | std::string trim_copy(std::string s); 223 | 224 | template 225 | Iter splitStringIter(const std::string &s, const std::string &delim, Iter out); 226 | 
227 | std::string string_join(const std::vector &elements, const char *const separator); 228 | 229 | unordered_set get_bigrams(wer_alignment &topAlignment); 230 | bool isValidNgram(const string &token); 231 | bool isEntityLabel(const string &token); 232 | bool isSynonymLabel(const string &token); 233 | bool IsNoisecodeToken(const string &token); 234 | string getLabelIdFromToken(const string &token); 235 | std::string GetEnv(const std::string &var, const std::string default_value); 236 | 237 | // going from ___23_ORDINAL___ to ORDINAL 238 | string GetLabelNameFromClassLabel(string classLabel); 239 | 240 | string GetClassLabel(string best_label); 241 | 242 | string UnicodeLowercase(string token); 243 | 244 | #endif // UTILITIES_H_ 245 | -------------------------------------------------------------------------------- /src/version.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #define FSTALIGNER_VERSION_MAJOR 2 4 | #define FSTALIGNER_VERSION_MINOR 0 5 | #define FSTALIGNER_VERSION_PATCH 0 6 | -------------------------------------------------------------------------------- /src/wer.h: -------------------------------------------------------------------------------- 1 | /* 2 | * wer.h 3 | * 4 | * Collection of functions specific to the WER subcommand. 
5 | * 6 | * Quinn McNamara (quinn@rev.com) 7 | * 2021 8 | */ 9 | #include "fstalign.h" 10 | #include "json_logging.h" 11 | 12 | using namespace std; 13 | 14 | struct WerResult { 15 | int insertions; 16 | int deletions; 17 | int substitutions; 18 | int numWordsInReference; 19 | int numWordsInHypothesis; 20 | int NumErrors() { return insertions + substitutions + deletions; } 21 | /* can return infinity if numWordsInReference == 0 and numWordsInHypothesis > 0 */ 22 | float WER() { 23 | if (numWordsInReference > 0) { 24 | return (float)(insertions + deletions + substitutions) / (float)numWordsInReference; 25 | } 26 | 27 | if (numWordsInHypothesis > 0) { 28 | return numeric_limits::infinity(); 29 | } 30 | 31 | return -nanf(""); 32 | } 33 | }; 34 | 35 | vector GetSpeakerSwitchIndices(const vector& stitches); 36 | 37 | // These methods record different WER analyses to JSON 38 | void RecordWerResult(Json::Value &json, WerResult wr); 39 | void RecordWer(wer_alignment& topAlignment); 40 | void RecordSpeakerWer(const vector& stitches); 41 | void RecordSpeakerSwitchWer(const vector& stitches, int speaker_switch_context_size); 42 | void RecordSentenceWer(const vector& stitches); 43 | void RecordTagWer(const vector& stitches); 44 | void RecordCaseWer(const vector& aligned_stitches); 45 | 46 | // Adds PR metrics to topAlignment 47 | void CalculatePrecisionRecall(wer_alignment &topAlignment, int threshold); 48 | 49 | typedef vector> ErrorGroups; 50 | 51 | void AddErrorGroup(ErrorGroups &groups, size_t &line, string &ref, string &hyp); 52 | void WriteSbs(wer_alignment &topAlignment, const vector& stitches, string sbs_filename, const vector extra_ref_columns, const vector extra_hyp_columns); 53 | void JsonLogUnigramBigramStats(wer_alignment &topAlignment); 54 | -------------------------------------------------------------------------------- /test/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.5) 2 
| 3 | find_package(Threads REQUIRED) 4 | 5 | include_directories( 6 | ${CMAKE_INSTALL_PREFIX}/include 7 | ${FSTALIGN_INCLUDES} 8 | ${OPENFST_INCLUDES} 9 | ${PROJECT_SOURCE_DIR}/test 10 | ${PROJECT_SOURCE_DIR} 11 | ${CMAKE_DL_LIBS} 12 | ) 13 | 14 | link_libraries( 15 | ${OPENFST_LIBRARIES} 16 | fstaligner-common 17 | ) 18 | 19 | add_executable(fstalign_Test fstalign_Test.cc) 20 | target_link_libraries(fstalign_Test Threads::Threads) 21 | 22 | add_test(NAME fstalign_Test 23 | COMMAND $ 24 | WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}/build) 25 | 26 | add_executable(compose-tests compose-tests.cc) 27 | target_link_libraries(compose-tests Threads::Threads) 28 | 29 | add_test(NAME compose-tests 30 | COMMAND $ 31 | WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}/build) 32 | 33 | add_executable(fast-d-tests fast-d-tests.cc) 34 | target_link_libraries(fast-d-tests Threads::Threads) 35 | 36 | add_test(NAME fast-d-tests 37 | COMMAND $ 38 | WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}/build) 39 | -------------------------------------------------------------------------------- /test/compose-tests-utils.h: -------------------------------------------------------------------------------- 1 | 2 | #ifndef __COMPOSE_TEST_UTILITIES_H__ 3 | #define __COMPOSE_TEST_UTILITIES_H__ 1 4 | 5 | #include "src/OneBestFstLoader.h" 6 | #include "src/AdaptedComposition.h" 7 | #include "src/logging.h" 8 | 9 | // helper methods 10 | 11 | StdVectorFst GetFstFromString(SymbolTable *symbols, const std::string str) { 12 | OneBestFstLoader loader; 13 | loader.BuildFromString(str); 14 | loader.addToSymbolTable(*symbols); 15 | std::vector map; 16 | return loader.convertToFst(*symbols, map); 17 | } 18 | 19 | StdVectorFst GetStdFstA() { 20 | StdVectorFst a; 21 | a.AddState(); // 0 22 | a.AddState(); // 1 23 | a.AddState(); // 2 24 | a.AddState(); // 3 25 | 26 | a.SetStart(0); 27 | a.SetFinal((StateId)3, StdArc::Weight::One()); 28 | // Arc constructor args: ilabel, olabel, weight, dest state ID. 
29 | 30 | // 0 -> 1:1 -> 1 -> 2:2 -> 2 -> 3:3 31 | 32 | a.AddArc(0, StdArc(1, 1, 0, 1)); 33 | a.AddArc(1, StdArc(2, 2, 0, 2)); 34 | a.AddArc(2, StdArc(3, 3, 0, 3)); 35 | 36 | return a; 37 | } 38 | 39 | StdVectorFst GetStdFstB() { 40 | StdVectorFst a; 41 | a.AddState(); // 0 42 | a.AddState(); // 1 43 | a.AddState(); // 2 44 | a.AddState(); // 3 45 | 46 | a.SetStart(0); 47 | a.SetFinal((StateId)3, StdArc::Weight::One()); 48 | // Arc constructor args: ilabel, olabel, weight, dest state ID. 49 | 50 | // 0 -> 1:1 -> 1 -> 2:2 -> 2 -> 3:3 51 | 52 | a.AddArc(0, StdArc(1, 4, 0, 1)); 53 | a.AddArc(1, StdArc(2, 5, 0, 2)); 54 | a.AddArc(2, StdArc(3, 6, 0, 3)); 55 | 56 | return a; 57 | } 58 | 59 | 60 | #endif -------------------------------------------------------------------------------- /test/compose-tests.cc: -------------------------------------------------------------------------------- 1 | #define CATCH_CONFIG_MAIN 2 | #include 3 | #include 4 | #include "../third-party/catch2/single_include/catch2/catch.hpp" 5 | #include "compose-tests-utils.h" 6 | #include "test-utilties.h" 7 | 8 | using Catch::Matchers::Contains; 9 | 10 | #include "src/AdaptedComposition.h" 11 | #include "src/logging.h" 12 | 13 | // there just to setup the loggers 14 | TEST_CASE("STATIC_REQUIRE showcase", "[traits]") { 15 | logger::InitLoggers(""); 16 | STATIC_REQUIRE(std::is_void::value); 17 | STATIC_REQUIRE_FALSE(std::is_void::value); 18 | } 19 | 20 | // TODO: add degenerated case, where all words in CTM are or no words at all are available 21 | TEST_CASE("CheckEntity") { 22 | SECTION("synonyms") { 23 | REQUIRE(isSynonymLabel("___100000_SYN_1-1___")); 24 | REQUIRE(isEntityLabel("___100000_SYN_1-1___")); 25 | 26 | REQUIRE(isSynonymLabel("___90_CARDINAL___") == false); 27 | REQUIRE(isEntityLabel("___90_CARDINAL___")); 28 | REQUIRE(isEntityLabel("___90___")); 29 | REQUIRE(isEntityLabel("__90__") == false); 30 | 31 | REQUIRE(isSynonymLabel("___100000_syn_1-1___") == false); 32 | 
REQUIRE(isSynonymLabel("___100000SYN_1-1___") == false); 33 | } 34 | } 35 | TEST_CASE("composition()") { 36 | SECTION("simple1") { 37 | auto logger = logger::GetOrCreateLogger("simple1"); 38 | logger->info("starting"); 39 | 40 | fst::StdVectorFst a = GetStdFstA(); 41 | fst::StdVectorFst b = GetStdFstB(); 42 | AdaptedCompositionFst composer(a, b); 43 | 44 | REQUIRE(composer.Start() == 0); 45 | 46 | auto s = composer.Start(); 47 | vector arcs; 48 | bool ret_status = composer.TryGetArcsAtState(s, &arcs); 49 | 50 | REQUIRE(ret_status); 51 | REQUIRE(arcs.size() == 3); 52 | 53 | REQUIRE(true); 54 | } 55 | 56 | SECTION("perfect match") { 57 | auto logger = logger::GetOrCreateLogger("perfect match"); 58 | logger->info("starting"); 59 | 60 | SymbolTable symbols; 61 | symbols.AddSymbol(""); 62 | symbols.AddSymbol(""); 63 | symbols.AddSymbol(""); 64 | symbols.AddSymbol(""); 65 | 66 | auto a = GetFstFromString(&symbols, "this is a test"); 67 | auto b = GetFstFromString(&symbols, "this is a test"); 68 | 69 | logger->info("symbols has {} entries, fst has {} states", symbols.NumSymbols(), a.NumStates()); 70 | 71 | AdaptedCompositionFst composer(a, b); 72 | auto s = composer.Start(); 73 | REQUIRE(s == 0); 74 | 75 | // given that we have a match for each words, we should always have 1 arc per state and one composed state per pair 76 | // of input arcs (0,0) -> 0 (1,1) -> 1 (2,2) -> 2 (3,3) -> 3 77 | int current_state = s; 78 | for (int i = 0; i < 7; i++) { 79 | vector arcs_leaving_state; 80 | bool ret_status = composer.TryGetArcsAtState(current_state, &arcs_leaving_state); 81 | logger->info("({}) from state {}, we have {} arcs leaving with a ret-status {}", i, current_state, 82 | arcs_leaving_state.size(), ret_status); 83 | REQUIRE(ret_status); 84 | 85 | if (i == 6) { 86 | // final state 87 | REQUIRE(arcs_leaving_state.size() == 0); 88 | logger->info("({}) we expect composed state id {} to have a weight one One()", i, current_state); 89 | REQUIRE(composer.Final(current_state) == 
StdFst::Weight::One()); 90 | } else { 91 | if (i >= 4) { 92 | REQUIRE(arcs_leaving_state.size() == 1); 93 | } else { 94 | REQUIRE(arcs_leaving_state.size() == 3); 95 | } 96 | for (vector::iterator iter = arcs_leaving_state.begin(); iter != arcs_leaving_state.end(); ++iter) { 97 | const fst::StdArc arc = *iter; 98 | logger->info("({}) arc leaving state {} to {} with label {}/{} ({}/{})", i, current_state, arc.nextstate, 99 | arc.ilabel, arc.olabel, symbols.Find(arc.ilabel), symbols.Find(arc.olabel)); 100 | 101 | logger->info("({}) we expect composed state id {} to have a weight one Zero()", i, current_state); 102 | REQUIRE(composer.Final(current_state) == StdFst::Weight::Zero()); 103 | 104 | current_state = arc.nextstate; 105 | } 106 | } 107 | } 108 | } 109 | 110 | SECTION("deletion at the end") { 111 | auto logger = logger::GetOrCreateLogger("deletions"); 112 | logger->info("starting"); 113 | 114 | SymbolTable symbols; 115 | symbols.AddSymbol(""); 116 | symbols.AddSymbol(""); 117 | symbols.AddSymbol(""); 118 | symbols.AddSymbol(""); 119 | 120 | auto a = GetFstFromString(&symbols, "this is a test with some extra words at the end"); 121 | auto b = GetFstFromString(&symbols, "this is a test"); 122 | 123 | logger->info("symbols has {} entries, fst has {} states", symbols.NumSymbols(), a.NumStates()); 124 | 125 | AdaptedCompositionFst composer(a, b); 126 | auto s = composer.Start(); 127 | REQUIRE(s == 0); 128 | 129 | // given that we have a match for each words, we should always have 1 arc per state and one composed state per pair 130 | // of input arcs (0,0) -> 0 (1,1) -> 1 (2,2) -> 2 (3,3) -> 3 131 | int current_state = s; 132 | 133 | // The test here is to check that we can reach a final node where the word "end" is deleted. 
134 | std::queue states_to_process; 135 | std::set states_explored; 136 | 137 | states_to_process.push(s); 138 | 139 | vector arcs_leaving_state; 140 | bool found_deleted_end = false; 141 | while (states_to_process.size() > 0) { 142 | current_state = states_to_process.front(); 143 | states_to_process.pop(); 144 | 145 | if (states_explored.find(current_state) != states_explored.end()) { 146 | continue; 147 | } 148 | states_explored.insert(current_state); 149 | 150 | bool ret_status = composer.TryGetArcsAtState(current_state, &arcs_leaving_state); 151 | logger->info("from state {}, we have {} arcs leaving with a ret-status {}", current_state, 152 | arcs_leaving_state.size(), ret_status); 153 | REQUIRE(ret_status); 154 | 155 | for (vector::iterator iter = arcs_leaving_state.begin(); iter != arcs_leaving_state.end(); ++iter) { 156 | const fst::StdArc arc = *iter; 157 | logger->info("arc leaving state {} to {} with label {}/{} ({}/{})", current_state, arc.nextstate, arc.ilabel, 158 | arc.olabel, symbols.Find(arc.ilabel), symbols.Find(arc.olabel)); 159 | 160 | if (arc.nextstate != current_state && states_explored.find(arc.nextstate) == states_explored.end()) { 161 | states_to_process.push(arc.nextstate); 162 | } 163 | 164 | if (symbols.Find(arc.ilabel) == "end" && arc.olabel == 0) { 165 | found_deleted_end = true; 166 | } 167 | } 168 | } 169 | 170 | REQUIRE(found_deleted_end); 171 | } 172 | } 173 | -------------------------------------------------------------------------------- /test/data/align_1.aligned.punc_case.nlp: -------------------------------------------------------------------------------- 1 | token|speaker|ts|endTs|punctuation|prepunctuation|case|tags|wer_tags|oldTs|oldEndTs|ali_comment|confidence 2 | a|1|1.0000|2.0000|||CA|[]|[]|||sub(A)| 3 | b|1|3.0000|4.0000|||LC|[]|[]|||| 4 | c|1|5.0000|6.0000|||LC|[]|[]|||| 5 | d|1|7.0000|8.0000|,||LC|[]|[]|||| 6 | ,|1|7.0000|8.0000|||||[]|||| 7 | |1|9.0000|10.0000|.||LC|['0:FALLBACK']|[]|||| 8 | 
.|1|11.0000|12.0000|||||[]|||| 9 | e|1|||||LC|[]|[]|||del| 10 | f|1|13.0000|14.0000|||LC|[]|[]|||| 11 | g|1|15.0000|16.0000|||LC|[]|[]|||| 12 | h|1|17.0000|18.0000|||LC|[]|[]|||| 13 | |1|||,||LC|[]|[]|||del| 14 | ,|1|||||||[]|||del| 15 | i|1|21.0000|22.0000|||LC|[]|[]|||sub(I)| 16 | j|1|23.0000|24.0000|||LC|[]|[]|||sub(J)| 17 | -------------------------------------------------------------------------------- /test/data/align_1.hyp.ctm: -------------------------------------------------------------------------------- 1 | recording.wav 1 1 1 a 2 | recording.wav 1 3 1 b 3 | recording.wav 1 5 1 c 4 | recording.wav 1 7 1 d 5 | recording.wav 1 9 1 6 | recording.wav 1 11 1 e 7 | recording.wav 1 13 1 f 8 | recording.wav 1 15 1 g 9 | recording.wav 1 17 1 h 10 | recording.wav 1 21 1 i 11 | recording.wav 1 23 1 j -------------------------------------------------------------------------------- /test/data/align_1.hyp.punc_case.ctm: -------------------------------------------------------------------------------- 1 | recording.wav 1 1 1 A 2 | recording.wav 1 3 1 b 3 | recording.wav 1 5 1 c 4 | recording.wav 1 7 1 d 5 | recording.wav 1 7 1 , 6 | recording.wav 1 9 1 7 | recording.wav 1 11 1 e 8 | recording.wav 1 11 1 . 
9 | recording.wav 1 13 1 f 10 | recording.wav 1 15 1 g 11 | recording.wav 1 17 1 h 12 | recording.wav 1 21 1 I 13 | recording.wav 1 23 1 J 14 | -------------------------------------------------------------------------------- /test/data/align_1.norm.json: -------------------------------------------------------------------------------- 1 | { 2 | "0": { 3 | "candidates": [ 4 | { 5 | "probability": 1.0, 6 | "verbalization": [ 7 | "" 8 | ] 9 | } 10 | ], 11 | "class": "FALLBACK" 12 | } 13 | } -------------------------------------------------------------------------------- /test/data/align_1.ref.aligned.nlp: -------------------------------------------------------------------------------- 1 | token|speaker|ts|endTs|punctuation|prepunctuation|case|tags|wer_tags|oldTs|oldEndTs|ali_comment|confidence 2 | a|1|1.0000|2.0000|||CA|[]|[]|||| 3 | b|1|3.0000|4.0000|||LC|[]|[]|||| 4 | c|1|5.0000|6.0000|||LC|[]|[]|||| 5 | d|1|7.0000|8.0000|,||LC|[]|[]|||| 6 | |1|9.0000|10.0000|.||LC|['0:FALLBACK']|[]|||| 7 | e|1|11.0000|12.0000|||LC|[]|[]|||| 8 | f|1|13.0000|14.0000|||LC|[]|[]|||| 9 | g|1|15.0000|16.0000|||LC|[]|[]|||| 10 | h|1|17.0000|18.0000|||LC|[]|[]|||| 11 | |1|||,||LC|[]|[]|||del| 12 | i|1|21.0000|22.0000|||LC|[]|[]|||| 13 | j|1|23.0000|24.0000|||LC|[]|[]|||| 14 | -------------------------------------------------------------------------------- /test/data/align_1.ref.nlp: -------------------------------------------------------------------------------- 1 | token|speaker|ts|endTs|punctuation|case|tags 2 | a|1||||CA|[] 3 | b|1||||LC|[] 4 | c|1||||LC|[] 5 | d|1|||,|LC|[] 6 | |1|||.|LC|['0:FALLBACK'] 7 | e|1||||LC|[] 8 | f|1||||LC|[] 9 | g|1||||LC|[] 10 | h|1||||LC|[] 11 | |1|||,|LC|[] 12 | i|1||||LC|[] 13 | j|1||||LC|[] 14 | -------------------------------------------------------------------------------- /test/data/align_2.hyp.ctm: -------------------------------------------------------------------------------- 1 | recording.wav 1 1 1 a 2 | recording.wav 1 3 1 b 3 | recording.wav 1 5 
1 c 4 | recording.wav 1 7 1 d 5 | recording.wav 1 9 1 i'll 6 | recording.wav 1 11 1 shakespeare 7 | recording.wav 1 13 1 j 8 | recording.wav 1 15 1 k 9 | recording.wav 1 17 1 l 10 | -------------------------------------------------------------------------------- /test/data/align_2.norm.json: -------------------------------------------------------------------------------- 1 | { 2 | "0": { 3 | "candidates": [ 4 | { 5 | "probability": 0.5, 6 | "verbalization": [ 7 | "William Shakespeare" 8 | ] 9 | }, 10 | { 11 | "probability": 0.5, 12 | "verbalization": [ 13 | "Will Shakespeare" 14 | ] 15 | } 16 | ], 17 | "class": "NAME" 18 | } 19 | } -------------------------------------------------------------------------------- /test/data/align_2.ref.aligned.nlp: -------------------------------------------------------------------------------- 1 | token|speaker|ts|endTs|punctuation|prepunctuation|case|tags|wer_tags|oldTs|oldEndTs|ali_comment|confidence 2 | a|1|1.0000|2.0000|||CA|[]|[]|||| 3 | b|1|3.0000|4.0000|||LC|[]|[]|||| 4 | c|1|5.0000|6.0000|||LC|[]|[]|||| 5 | d|1|7.0000|8.0000|,||LC|[]|[]|||| 6 | i|1|9.0000|10.0000|||LC|[]|[]|||sub(i'll)| 7 | will|1|||||LC|['0:NAME']|[]|||del,direct| 8 | shakespeare|1|11.0000|12.0000|||LC|['0:NAME']|[]|||,direct| 9 | j|1|13.0000|14.0000|||LC|[]|[]|||| 10 | k|1|15.0000|16.0000|||LC|[]|[]|||| 11 | l|1|17.0000|18.0000|||LC|[]|[]|||| 12 | -------------------------------------------------------------------------------- /test/data/align_2.ref.aligned.std.nlp: -------------------------------------------------------------------------------- 1 | token|speaker|ts|endTs|punctuation|prepunctuation|case|tags|wer_tags|oldTs|oldEndTs|ali_comment|confidence 2 | a|1|1.0000|2.0000|||CA|[]|[]|||| 3 | b|1|3.0000|4.0000|||LC|[]|[]|||| 4 | c|1|5.0000|6.0000|||LC|[]|[]|||| 5 | d|1|7.0000|8.0000|,||LC|[]|[]|||| 6 | i|1|||||LC|[]|[]|||del| 7 | will|1|9.0000|10.0000|||LC|['0:NAME']|[]|||sub(i'll),direct| 8 | shakespeare|1|11.0000|12.0000|||LC|['0:NAME']|[]|||,direct| 9 
| j|1|13.0000|14.0000|||LC|[]|[]|||| 10 | k|1|15.0000|16.0000|||LC|[]|[]|||| 11 | l|1|17.0000|18.0000|||LC|[]|[]|||| 12 | -------------------------------------------------------------------------------- /test/data/align_2.ref.nlp: -------------------------------------------------------------------------------- 1 | token|speaker|ts|endTs|punctuation|case|tags 2 | a|1||||CA|[] 3 | b|1||||LC|[] 4 | c|1||||LC|[] 5 | d|1|||,|LC|[] 6 | i|1||||LC|[] 7 | will|1||||LC|['0:NAME'] 8 | shakespeare|1||||LC|['0:NAME'] 9 | j|1||||LC|[] 10 | k|1||||LC|[] 11 | l|1||||LC|[] 12 | -------------------------------------------------------------------------------- /test/data/align_3.hyp.ctm: -------------------------------------------------------------------------------- 1 | recording.wav 1 1 1 a 2 | recording.wav 1 3 1 b 3 | recording.wav 1 5 1 c 4 | recording.wav 1 7 1 d 5 | recording.wav 1 9 1 want 6 | recording.wav 1 10 1 to 7 | recording.wav 1 11 1 e 8 | recording.wav 1 13 1 f 9 | recording.wav 1 15 1 g 10 | recording.wav 1 17 1 h 11 | recording.wav 1 21 1 i 12 | recording.wav 1 23 1 j -------------------------------------------------------------------------------- /test/data/align_3.norm.json: -------------------------------------------------------------------------------- 1 | { 2 | "0": { 3 | "candidates": [ 4 | { 5 | "probability": 1.0, 6 | "verbalization": [ 7 | "dont use" 8 | ] 9 | } 10 | ], 11 | "class": "FALLBACK" 12 | } 13 | } -------------------------------------------------------------------------------- /test/data/align_3.ref.aligned.nlp: -------------------------------------------------------------------------------- 1 | token|speaker|ts|endTs|punctuation|prepunctuation|case|tags|wer_tags|oldTs|oldEndTs|ali_comment|confidence 2 | a|1|1.0000|2.0000|||CA|[]|[]|||| 3 | b|1|3.0000|4.0000|||LC|[]|[]|||| 4 | c|1|5.0000|6.0000|||LC|[]|[]|||| 5 | d|1|7.0000|8.0000|,||LC|[]|[]|||| 6 | want|1|9.0000|10.0000|||LC|['0:FALLBACK']|[]|||,push_last| 7 | 
to|1|10.0000|11.0000|.||LC|['0:FALLBACK']|[]|||,push_last| 8 | e|1|11.0000|12.0000|||LC|[]|[]|||| 9 | f|1|13.0000|14.0000|||LC|[]|[]|||| 10 | g|1|15.0000|16.0000|||LC|[]|[]|||| 11 | h|1|17.0000|18.0000|||LC|[]|[]|||| 12 | |1|||,||LC|[]|[]|||del| 13 | i|1|21.0000|22.0000|||LC|[]|[]|||| 14 | j|1|23.0000|24.0000|||LC|[]|[]|||| 15 | -------------------------------------------------------------------------------- /test/data/align_3.ref.nlp: -------------------------------------------------------------------------------- 1 | token|speaker|ts|endTs|punctuation|case|tags 2 | a|1||||CA|[] 3 | b|1||||LC|[] 4 | c|1||||LC|[] 5 | d|1|||,|LC|[] 6 | wanna|1|||.|LC|['0:FALLBACK'] 7 | e|1||||LC|[] 8 | f|1||||LC|[] 9 | g|1||||LC|[] 10 | h|1||||LC|[] 11 | |1|||,|LC|[] 12 | i|1||||LC|[] 13 | j|1||||LC|[] 14 | -------------------------------------------------------------------------------- /test/data/align_4.hyp1.ctm: -------------------------------------------------------------------------------- 1 | recording.wav 1 1 1 a 2 | recording.wav 1 3 1 b 3 | recording.wav 1 5 1 c 4 | recording.wav 1 7 1 d 5 | recording.wav 1 9 1 nineteen 6 | recording.wav 1 10 1 uh 7 | recording.wav 1 11 1 eighty 8 | recording.wav 1 12 1 eight 9 | recording.wav 1 13 1 e 10 | recording.wav 1 14 1 f 11 | recording.wav 1 15 1 g 12 | recording.wav 1 17 1 h 13 | recording.wav 1 21 1 i 14 | recording.wav 1 23 1 j -------------------------------------------------------------------------------- /test/data/align_4.hyp2.ctm: -------------------------------------------------------------------------------- 1 | recording.wav 1 1 1 a 2 | recording.wav 1 3 1 b 3 | recording.wav 1 5 1 c 4 | recording.wav 1 7 1 d 5 | recording.wav 1 9 1 nineteen 6 | recording.wav 1 11 1 eighty 7 | recording.wav 1 13 1 e 8 | recording.wav 1 14 1 f 9 | recording.wav 1 15 1 g 10 | recording.wav 1 17 1 h 11 | recording.wav 1 21 1 i 12 | recording.wav 1 23 1 j -------------------------------------------------------------------------------- 
/test/data/align_4.norm.json: -------------------------------------------------------------------------------- 1 | { 2 | "0": { 3 | "candidates": [ 4 | { 5 | "probability": 1.0, 6 | "verbalization": [ 7 | "nineteen", 8 | "eighty", 9 | "eight" 10 | ] 11 | } 12 | ], 13 | "class": "CARDINAL" 14 | } 15 | } -------------------------------------------------------------------------------- /test/data/align_4.ref.aligned1.nlp: -------------------------------------------------------------------------------- 1 | token|speaker|ts|endTs|punctuation|prepunctuation|case|tags|wer_tags|oldTs|oldEndTs|ali_comment|confidence 2 | a|1|1.0000|2.0000|||CA|[]|[]|||| 3 | b|1|3.0000|4.0000|||LC|[]|[]|||| 4 | c|1|5.0000|6.0000|||LC|[]|[]|||| 5 | d|1|7.0000|8.0000|,||LC|[]|[]|||| 6 | nineteen|1|9.0000|10.0000|||LC|['0:CARDINAL']|[]|||,push_last| 7 | eighty|1|11.0000|12.0000|||LC|['0:CARDINAL']|[]|||,push_last| 8 | eight|1|12.0000|13.0000|||LC|['0:CARDINAL']|[]|||,push_last| 9 | e|1|13.0000|14.0000|||LC|[]|[]|||| 10 | f|1|14.0000|15.0000|||LC|[]|[]|||| 11 | g|1|15.0000|16.0000|||LC|[]|[]|||| 12 | h|1|17.0000|18.0000|||LC|[]|[]|||| 13 | i|1|21.0000|22.0000|||LC|[]|[]|||| 14 | j|1|23.0000|24.0000|||LC|[]|[]|||| 15 | -------------------------------------------------------------------------------- /test/data/align_4.ref.aligned2.nlp: -------------------------------------------------------------------------------- 1 | token|speaker|ts|endTs|punctuation|prepunctuation|case|tags|wer_tags|oldTs|oldEndTs|ali_comment|confidence 2 | a|1|1.0000|2.0000|||CA|[]|[]|||| 3 | b|1|3.0000|4.0000|||LC|[]|[]|||| 4 | c|1|5.0000|6.0000|||LC|[]|[]|||| 5 | d|1|7.0000|8.0000|,||LC|[]|[]|||| 6 | nineteen|1|9.0000|10.0000|||LC|['0:CARDINAL']|[]|||,push_last| 7 | eighty|1|11.0000|12.0000|||LC|['0:CARDINAL']|[]|||,push_last| 8 | eight|1|||||LC|['0:CARDINAL']|[]|||del,push_last| 9 | e|1|13.0000|14.0000|||LC|[]|[]|||| 10 | f|1|14.0000|15.0000|||LC|[]|[]|||| 11 | g|1|15.0000|16.0000|||LC|[]|[]|||| 12 | 
h|1|17.0000|18.0000|||LC|[]|[]|||| 13 | i|1|21.0000|22.0000|||LC|[]|[]|||| 14 | j|1|23.0000|24.0000|||LC|[]|[]|||| 15 | -------------------------------------------------------------------------------- /test/data/align_4.ref.nlp: -------------------------------------------------------------------------------- 1 | token|speaker|ts|endTs|punctuation|case|tags 2 | a|1||||CA|[] 3 | b|1||||LC|[] 4 | c|1||||LC|[] 5 | d|1|||,|LC|[] 6 | 1988|1||||LC|['0:CARDINAL'] 7 | e|1||||LC|[] 8 | f|1||||LC|[] 9 | g|1||||LC|[] 10 | h|1||||LC|[] 11 | i|1||||LC|[] 12 | j|1||||LC|[] 13 | -------------------------------------------------------------------------------- /test/data/align_5.hyp1.ctm: -------------------------------------------------------------------------------- 1 | recording.wav 1 1 1 a 2 | recording.wav 1 3 1 b 3 | recording.wav 1 5 1 c 4 | recording.wav 1 7 1 d 5 | recording.wav 1 9 1 want 6 | recording.wav 1 10 1 uh 7 | recording.wav 1 11 1 to 8 | recording.wav 1 13 1 e 9 | recording.wav 1 14 1 f 10 | recording.wav 1 15 1 g 11 | recording.wav 1 17 1 h 12 | recording.wav 1 21 1 i 13 | recording.wav 1 23 1 j -------------------------------------------------------------------------------- /test/data/align_5.hyp2.ctm: -------------------------------------------------------------------------------- 1 | recording.wav 1 1 1 a 2 | recording.wav 1 3 1 b 3 | recording.wav 1 5 1 c 4 | recording.wav 1 7 1 d 5 | recording.wav 1 9 1 want 6 | recording.wav 1 13 1 e 7 | recording.wav 1 14 1 f 8 | recording.wav 1 15 1 g 9 | recording.wav 1 17 1 h 10 | recording.wav 1 21 1 i 11 | recording.wav 1 23 1 j -------------------------------------------------------------------------------- /test/data/align_5.ref.aligned1-2.nlp: -------------------------------------------------------------------------------- 1 | token|speaker|ts|endTs|punctuation|prepunctuation|case|tags|wer_tags|oldTs|oldEndTs|ali_comment|confidence 2 | a|1|1.0000|2.0000|||CA|[]|[]|||| 3 | b|1|3.0000|4.0000|||LC|[]|[]|||| 4 | 
c|1|5.0000|6.0000|||LC|[]|[]|||| 5 | d|1|7.0000|8.0000|,||LC|[]|[]|||| 6 | wanna|1|11.0000|12.0000|||LC|[]|[]|||sub(to)| 7 | e|1|13.0000|14.0000|||LC|[]|[]|||| 8 | f|1|14.0000|15.0000|||LC|[]|[]|||| 9 | g|1|15.0000|16.0000|||LC|[]|[]|||| 10 | h|1|17.0000|18.0000|||LC|[]|[]|||| 11 | i|1|21.0000|22.0000|||LC|[]|[]|||| 12 | j|1|23.0000|24.0000|||LC|[]|[]|||| 13 | -------------------------------------------------------------------------------- /test/data/align_5.ref.aligned1.nlp: -------------------------------------------------------------------------------- 1 | token|speaker|ts|endTs|punctuation|prepunctuation|case|tags|wer_tags|oldTs|oldEndTs|ali_comment|confidence 2 | a|1|1.0000|2.0000|||CA|[]|[]|||| 3 | b|1|3.0000|4.0000|||LC|[]|[]|||| 4 | c|1|5.0000|6.0000|||LC|[]|[]|||| 5 | d|1|7.0000|8.0000|,||LC|[]|[]|||| 6 | want|1|9.0000|10.0000|||LC|[]|[]|||,push_last| 7 | to|1|11.0000|12.0000|||LC|[]|[]|||,push_last| 8 | e|1|13.0000|14.0000|||LC|[]|[]|||| 9 | f|1|14.0000|15.0000|||LC|[]|[]|||| 10 | g|1|15.0000|16.0000|||LC|[]|[]|||| 11 | h|1|17.0000|18.0000|||LC|[]|[]|||| 12 | i|1|21.0000|22.0000|||LC|[]|[]|||| 13 | j|1|23.0000|24.0000|||LC|[]|[]|||| 14 | -------------------------------------------------------------------------------- /test/data/align_5.ref.aligned2-a2.nlp: -------------------------------------------------------------------------------- 1 | token|speaker|ts|endTs|punctuation|prepunctuation|case|tags|wer_tags|oldTs|oldEndTs|ali_comment|confidence 2 | a|1|1.0000|2.0000|||CA|[]|[]|||| 3 | b|1|3.0000|4.0000|||LC|[]|[]|||| 4 | c|1|5.0000|6.0000|||LC|[]|[]|||| 5 | d|1|7.0000|8.0000|,||LC|[]|[]|||| 6 | wanna|1|9.0000|10.0000|||LC|[]|[]|||sub(want)| 7 | e|1|13.0000|14.0000|||LC|[]|[]|||| 8 | f|1|14.0000|15.0000|||LC|[]|[]|||| 9 | g|1|15.0000|16.0000|||LC|[]|[]|||| 10 | h|1|17.0000|18.0000|||LC|[]|[]|||| 11 | i|1|21.0000|22.0000|||LC|[]|[]|||| 12 | j|1|23.0000|24.0000|||LC|[]|[]|||| 13 | 
-------------------------------------------------------------------------------- /test/data/align_5.ref.aligned2.nlp: -------------------------------------------------------------------------------- 1 | token|speaker|ts|endTs|punctuation|prepunctuation|case|tags|wer_tags|oldTs|oldEndTs|ali_comment|confidence 2 | a|1|1.0000|2.0000|||CA|[]|[]|||| 3 | b|1|3.0000|4.0000|||LC|[]|[]|||| 4 | c|1|5.0000|6.0000|||LC|[]|[]|||| 5 | d|1|7.0000|8.0000|,||LC|[]|[]|||| 6 | want|1|9.0000|10.0000|||LC|[]|[]|||,push_last| 7 | to|1|||||LC|[]|[]|||del,push_last| 8 | e|1|13.0000|14.0000|||LC|[]|[]|||| 9 | f|1|14.0000|15.0000|||LC|[]|[]|||| 10 | g|1|15.0000|16.0000|||LC|[]|[]|||| 11 | h|1|17.0000|18.0000|||LC|[]|[]|||| 12 | i|1|21.0000|22.0000|||LC|[]|[]|||| 13 | j|1|23.0000|24.0000|||LC|[]|[]|||| 14 | -------------------------------------------------------------------------------- /test/data/align_5.ref.nlp: -------------------------------------------------------------------------------- 1 | token|speaker|ts|endTs|punctuation|case|tags 2 | a|1||||CA|[] 3 | b|1||||LC|[] 4 | c|1||||LC|[] 5 | d|1|||,|LC|[] 6 | wanna|1||||LC|[] 7 | e|1||||LC|[] 8 | f|1||||LC|[] 9 | g|1||||LC|[] 10 | h|1||||LC|[] 11 | i|1||||LC|[] 12 | j|1||||LC|[] 13 | -------------------------------------------------------------------------------- /test/data/align_6.hyp.ctm: -------------------------------------------------------------------------------- 1 | recording.wav 1 1 1 a 2 | recording.wav 1 3 1 b 3 | recording.wav 1 5 1 c 4 | recording.wav 1 7 1 d 5 | recording.wav 1 9 1 want 6 | recording.wav 1 10 1 uh 7 | recording.wav 1 11 1 to 8 | recording.wav 1 13 1 e 9 | recording.wav 1 14 1 f 10 | recording.wav 1 15 1 g 11 | recording.wav 1 17 1 h 12 | recording.wav 1 21 1 i 13 | recording.wav 1 23 1 j -------------------------------------------------------------------------------- /test/data/align_6.ref.aligned.nlp: -------------------------------------------------------------------------------- 1 | 
token|speaker|ts|endTs|punctuation|prepunctuation|case|tags|wer_tags|oldTs|oldEndTs|ali_comment|confidence 2 | a|1|1.0000|2.0000||...|CA|[]|[]|||| 3 | b|1|3.0000|4.0000|||LC|[]|[]|||| 4 | c|1|5.0000|6.0000|||LC|[]|[]|||| 5 | d|1|7.0000|8.0000|,||LC|[]|[]|||| 6 | wanna|1|11.0000|12.0000|||LC|[]|[]|||sub(to)| 7 | e|1|13.0000|14.0000|||LC|[]|[]|||| 8 | f|1|14.0000|15.0000||!|LC|[]|[]|||| 9 | g|1|15.0000|16.0000|||LC|[]|[]|||| 10 | h|1|17.0000|18.0000|||LC|[]|[]|||| 11 | i|1|21.0000|22.0000|||LC|[]|[]|||| 12 | j|1|23.0000|24.0000|||LC|[]|[]|||| 13 | -------------------------------------------------------------------------------- /test/data/align_6.ref.nlp: -------------------------------------------------------------------------------- 1 | token|speaker|ts|endTs|punctuation|prepunctuation|case|tags 2 | a|1||||...|CA|[] 3 | b|1|||||LC|[] 4 | c|1|||||LC|[] 5 | d|1|||,||LC|[] 6 | wanna|1|||||LC|[] 7 | e|1|||||LC|[] 8 | f|1||||!|LC|[] 9 | g|1|||||LC|[] 10 | h|1|||||LC|[] 11 | i|1|||||LC|[] 12 | j|1|||||LC|[] 13 | -------------------------------------------------------------------------------- /test/data/empty.hyp.ctm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/revdotcom/fstalign/82dec1dabec06a53b7f3c28b51dcd1b0a2dd2f1d/test/data/empty.hyp.ctm -------------------------------------------------------------------------------- /test/data/empty.hyp.nlp: -------------------------------------------------------------------------------- 1 | token|speaker|ts|endTs|punctuation|case|tags 2 | -------------------------------------------------------------------------------- /test/data/empty.hyp.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/revdotcom/fstalign/82dec1dabec06a53b7f3c28b51dcd1b0a2dd2f1d/test/data/empty.hyp.txt -------------------------------------------------------------------------------- /test/data/empty.ref.txt: 
-------------------------------------------------------------------------------- 1 | not really empty 2 | -------------------------------------------------------------------------------- /test/data/fstalign-50.hyp.txt: -------------------------------------------------------------------------------- 1 | su capital es la ciudad de pau 2 | -------------------------------------------------------------------------------- /test/data/fstalign-50.new.sbs.txt: -------------------------------------------------------------------------------- 1 | ref_token hyp_token IsErr Class Wer_Tag_Entities 2 | su su 3 | capital capital 4 | es es 5 | la la 6 | ciudad ciudad 7 | de de 8 | palu pau ERR 9 | ------------------------------------------------------------ 10 | Line Group 11 | 8 palu <-> pau 12 | -------------------------------------------------------------------------------- /test/data/fstalign-50.ref.txt: -------------------------------------------------------------------------------- 1 | su capital es la ciudad de palu 2 | -------------------------------------------------------------------------------- /test/data/noise.hyp1.ctm: -------------------------------------------------------------------------------- 1 | recording.wav 1 1 1 a 2 | recording.wav 1 3 1 b 3 | recording.wav 1 5 1 c 4 | recording.wav 1 7 1 d 5 | recording.wav 1 11 1 e 6 | recording.wav 1 13 1 f 7 | recording.wav 1 15 1 g 8 | recording.wav 1 17 1 h 9 | recording.wav 1 21 1 i 10 | recording.wav 1 23 1 j 11 | -------------------------------------------------------------------------------- /test/data/noise.hyp2.ctm: -------------------------------------------------------------------------------- 1 | recording.wav 1 1 1 a 2 | recording.wav 1 3 1 b 3 | recording.wav 1 5 1 c 4 | recording.wav 1 7 1 d 5 | recording.wav 1 9 1 6 | recording.wav 1 11 1 e 7 | recording.wav 1 13 1 f 8 | recording.wav 1 15 1 g 9 | recording.wav 1 17 1 h 10 | recording.wav 1 19 1 11 | recording.wav 1 21 1 i 12 | recording.wav 1 23 1 j 13 | 
-------------------------------------------------------------------------------- /test/data/noise_1.hyp1.aligned: -------------------------------------------------------------------------------- 1 | token|speaker|ts|endTs|punctuation|prepunctuation|case|tags|wer_tags|oldTs|oldEndTs|ali_comment|confidence 2 | a|1|1.0000|2.0000|||CA|[]|[]|||| 3 | b|1|3.0000|4.0000|||LC|[]|[]|||| 4 | c|1|5.0000|6.0000|||LC|[]|[]|||| 5 | d|1|7.0000|8.0000|,||LC|[]|[]|||| 6 | |1|||,||LC|[]|[]|||del| 7 | e|1|11.0000|12.0000|||LC|[]|[]|||| 8 | F|1|13.0000|14.0000|||LC|[]|[]|||| 9 | G|1|15.0000|16.0000|||LC|[]|[]|||| 10 | h|1|17.0000|18.0000|||LC|[]|[]|||| 11 | |1|||,||LC|[]|[]|||del| 12 | i|1|21.0000|22.0000|||LC|[]|[]|||| 13 | j|1|23.0000|24.0000|||LC|[]|[]|||| 14 | -------------------------------------------------------------------------------- /test/data/noise_1.hyp2.aligned: -------------------------------------------------------------------------------- 1 | token|speaker|ts|endTs|punctuation|prepunctuation|case|tags|wer_tags|oldTs|oldEndTs|ali_comment|confidence 2 | a|1|1.0000|2.0000|||CA|[]|[]|||| 3 | b|1|3.0000|4.0000|||LC|[]|[]|||| 4 | c|1|5.0000|6.0000|||LC|[]|[]|||| 5 | d|1|7.0000|8.0000|,||LC|[]|[]|||| 6 | |1|9.0000|10.0000|,||LC|[]|[]|||| 7 | e|1|11.0000|12.0000|||LC|[]|[]|||| 8 | F|1|13.0000|14.0000|||LC|[]|[]|||| 9 | G|1|15.0000|16.0000|||LC|[]|[]|||| 10 | h|1|17.0000|18.0000|||LC|[]|[]|||| 11 | |1|19.0000|20.0000|,||LC|[]|[]|||| 12 | i|1|21.0000|22.0000|||LC|[]|[]|||| 13 | j|1|23.0000|24.0000|||LC|[]|[]|||| 14 | -------------------------------------------------------------------------------- /test/data/noise_1.ref.nlp: -------------------------------------------------------------------------------- 1 | token|speaker|ts|endTs|punctuation|case|tags 2 | a|1||||CA|[] 3 | b|1||||LC|[] 4 | c|1||||LC|[] 5 | d|1|||,|LC|[] 6 | |1|||,|LC|[] 7 | e|1||||LC|[] 8 | F|1||||LC|[] 9 | G|1||||LC|[] 10 | h|1||||LC|[] 11 | |1|||,|LC|[] 12 | i|1||||LC|[] 13 | j|1||||LC|[] 14 | 
-------------------------------------------------------------------------------- /test/data/oracle_1.hyp.fst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/revdotcom/fstalign/82dec1dabec06a53b7f3c28b51dcd1b0a2dd2f1d/test/data/oracle_1.hyp.fst -------------------------------------------------------------------------------- /test/data/oracle_1.ref.txt: -------------------------------------------------------------------------------- 1 | this is a test and a very good one 2 | -------------------------------------------------------------------------------- /test/data/oracle_1.symbols.txt: -------------------------------------------------------------------------------- 1 | 0 2 | 1 3 | 2 4 | 3 5 | 4 6 | 6 7 | 5 8 | 7 9 | this 8 10 | these 9 11 | is 10 12 | his 11 13 | a 12 14 | isa 13 15 | test 14 16 | and 15 17 | very 16 18 | good 17 19 | one 18 20 | -------------------------------------------------------------------------------- /test/data/short.aligned.case.nlp: -------------------------------------------------------------------------------- 1 | token|speaker|ts|endTs|punctuation|prepunctuation|case|tags|wer_tags|oldTs|oldEndTs|ali_comment|confidence 2 | |2|0.0000|0.0000|||LC|[]|[]|||| 3 | Yeah|1|0.0000|0.0000|,||UC|[]|[]|||| 4 | yeah|1|||,||LC|[]|[]|||del| 5 | right|1|0.0000|0.0000|.||LC|[]|[]|||| 6 | Yeah|1|0.0000|0.0000|,||UC|[]|[]|||sub(I'll)| 7 | all|1|0.0000|0.0000|||LC|[]|[]|||sub(do)| 8 | right|1|||,||LC|[]|[]|||del| 9 | probably|1|||||LC|[]|[]|||del| 10 | just|1|0.0000|0.0000|||LC|[]|[]|||| 11 | that|1|0.0000|0.0000|.||LC|[]|[]|||| 12 | Are|3|0.0000|0.0000|||UC|[]|[]|||| 13 | there|3|0.0000|0.0000|||LC|[]|[]|||| 14 | any|3|0.0000|0.0000|||LC|[]|[]|||| 15 | visuals|3|0.0000|0.0000|||LC|[]|[]|||| 16 | that|3|0.0000|0.0000|||LC|[]|[]|||| 17 | come|3|0.0000|0.0000|||LC|[]|[]|||| 18 | to|3|0.0000|0.0000|||LC|[]|[]|||| 19 | mind|3|0.0000|0.0000|||LC|[]|[]|||| 20 | or|3|0.0000|0.0000|||LC|[]|[]|||| 
21 | Yeah|1|0.0000|0.0000|,||UC|[]|[]|||| 22 | sure|1|0.0000|0.0000|.||LC|[]|[]|||| 23 | When|1|0.0000|0.0000|||UC|[]|[]|||| 24 | I|1|0.0000|0.0000|||CA|[]|[]|||| 25 | hear|1|0.0000|0.0000|||LC|[]|[]|||| 26 | Foobar|1|0.0000|0.0000|,||UC|[]|['1', '2']|||| 27 | I|1|0.0000|0.0000|||CA|[]|[]|||| 28 | think|1|0.0000|0.0000|||LC|[]|[]|||| 29 | about|1|0.0000|0.0000|||LC|[]|[]|||| 30 | just|1|0.0000|0.0000|||LC|[]|[]|||| 31 | that|1|0.0000|0.0000|:||LC|[]|[]|||| 32 | foo|1|0.0000|0.0000|||LC|[]|[]|||sub(Foobar)| 33 | a|1|0.0000|0.0000|||LC|[]|[]|||| 34 | -------------------------------------------------------------------------------- /test/data/short.aligned.nlp: -------------------------------------------------------------------------------- 1 | token|speaker|ts|endTs|punctuation|prepunctuation|case|tags|wer_tags|oldTs|oldEndTs|ali_comment|confidence 2 | |2|0.0000|0.0000|||LC|[]|[]|||| 3 | Yeah|1|||,||UC|[]|[]|||del| 4 | yeah|1|0.0000|0.0000|,||LC|[]|[]|||| 5 | right|1|0.0000|0.0000|.||LC|[]|[]|||| 6 | Yeah|1|||,||UC|[]|[]|||del| 7 | alright|1|0.0000|0.0000|,||LC|[]|[]|||sub(i'll),split_worst| 8 | probably|1|0.0000|0.0000|||LC|[]|[]|||sub(do)| 9 | just|1|0.0000|0.0000|||LC|[]|[]|||| 10 | that|1|0.0000|0.0000|.||LC|[]|[]|||| 11 | Are|3|0.0000|0.0000|||UC|[]|[]|||| 12 | there|3|0.0000|0.0000|||LC|[]|[]|||| 13 | any|3|0.0000|0.0000|||LC|[]|[]|||| 14 | visuals|3|0.0000|0.0000|||LC|[]|[]|||| 15 | that|3|0.0000|0.0000|||LC|[]|[]|||| 16 | come|3|0.0000|0.0000|||LC|[]|[]|||| 17 | to|3|0.0000|0.0000|||LC|[]|[]|||| 18 | mind|3|0.0000|0.0000|||LC|[]|[]|||| 19 | or|3|0.0000|0.0000|||LC|[]|[]|||| 20 | Yeah|1|0.0000|0.0000|,||UC|[]|[]|||| 21 | sure|1|0.0000|0.0000|.||LC|[]|[]|||| 22 | When|1|0.0000|0.0000|||UC|[]|[]|||| 23 | I|1|0.0000|0.0000|||CA|[]|[]|||| 24 | hear|1|0.0000|0.0000|||LC|[]|[]|||| 25 | Foobar|1|0.0000|0.0000|,||UC|[]|[]|||| 26 | I|1|0.0000|0.0000|||CA|[]|[]|||| 27 | think|1|0.0000|0.0000|||LC|[]|[]|||| 28 | about|1|0.0000|0.0000|||LC|[]|[]|||| 29 | 
just|1|0.0000|0.0000|||LC|[]|[]|||| 30 | that|1|0.0000|0.0000|:||LC|[]|[]|||| 31 | foo|1|0.0000|0.0000|||LC|[]|[]|||sub(foobar)| 32 | a|1|0.0000|0.0000|||LC|[]|[]|||| 33 | -------------------------------------------------------------------------------- /test/data/short.aligned.punc.nlp: -------------------------------------------------------------------------------- 1 | token|speaker|ts|endTs|punctuation|prepunctuation|case|tags|wer_tags|oldTs|oldEndTs|ali_comment|confidence 2 | |2|0.0000|0.0000|||LC|[]|[]|||| 3 | Yeah|1|0.0000|0.0000|,||UC|[]|[]|||| 4 | ,|1|0.0000|0.0000|||||[]|||| 5 | yeah|1|||,||LC|[]|[]|||del| 6 | ,|1|||||||[]|||del| 7 | right|1|0.0000|0.0000|.||LC|[]|[]|||| 8 | .|1|||||||[]|||del| 9 | Yeah|1|0.0000|0.0000|,||UC|[]|[]|||sub(i'll)| 10 | ,|1|||||||[]|||del| 11 | all|1|0.0000|0.0000|||LC|[]|[]|||sub(do)| 12 | right|1|||,||LC|[]|[]|||del| 13 | ,|1|||||||[]|||del| 14 | probably|1|||||LC|[]|[]|||del| 15 | just|1|0.0000|0.0000|||LC|[]|[]|||| 16 | that|1|0.0000|0.0000|.||LC|[]|[]|||| 17 | .|1|0.0000|0.0000|||||[]|||sub(?)| 18 | Are|3|0.0000|0.0000|||UC|[]|[]|||| 19 | there|3|0.0000|0.0000|||LC|[]|[]|||| 20 | any|3|0.0000|0.0000|||LC|[]|[]|||| 21 | visuals|3|0.0000|0.0000|||LC|[]|[]|||| 22 | that|3|0.0000|0.0000|||LC|[]|[]|||| 23 | come|3|0.0000|0.0000|||LC|[]|[]|||| 24 | to|3|0.0000|0.0000|||LC|[]|[]|||| 25 | mind|3|0.0000|0.0000|||LC|[]|[]|||| 26 | or|3|0.0000|0.0000|||LC|[]|[]|||| 27 | Yeah|1|0.0000|0.0000|,||UC|[]|[]|||| 28 | ,|1|0.0000|0.0000|||||[]|||| 29 | sure|1|0.0000|0.0000|.||LC|[]|[]|||| 30 | .|1|0.0000|0.0000|||||[]|||| 31 | When|1|0.0000|0.0000|||UC|[]|[]|||| 32 | I|1|0.0000|0.0000|||CA|[]|[]|||| 33 | hear|1|0.0000|0.0000|||LC|[]|[]|||| 34 | Foobar|1|0.0000|0.0000|,||UC|[]|['1', '2']|||| 35 | ,|1|0.0000|0.0000|||||[]|||| 36 | I|1|0.0000|0.0000|||CA|[]|[]|||| 37 | think|1|0.0000|0.0000|||LC|[]|[]|||| 38 | about|1|0.0000|0.0000|||LC|[]|[]|||| 39 | just|1|0.0000|0.0000|||LC|[]|[]|||| 40 | that|1|0.0000|0.0000|:||LC|[]|[]|||| 41 | 
:|1|0.0000|0.0000|||||[]|||| 42 | foo|1|0.0000|0.0000|||LC|[]|[]|||sub(foobar)| 43 | a|1|0.0000|0.0000|||LC|[]|[]|||| 44 | -------------------------------------------------------------------------------- /test/data/short.aligned.punc_case.nlp: -------------------------------------------------------------------------------- 1 | token|speaker|ts|endTs|punctuation|prepunctuation|case|tags|wer_tags|oldTs|oldEndTs|ali_comment|confidence 2 | |2|0.0000|0.0000|||LC|[]|[]|||| 3 | Yeah|1|0.0000|0.0000|,||UC|[]|[]|||| 4 | ,|1|0.0000|0.0000|||||[]|||| 5 | yeah|1|||,||LC|[]|[]|||del| 6 | ,|1|||||||[]|||del| 7 | right|1|0.0000|0.0000|.||LC|[]|[]|||| 8 | .|1|||||||[]|||del| 9 | Yeah|1|0.0000|0.0000|,||UC|[]|[]|||sub(I'll)| 10 | ,|1|||||||[]|||del| 11 | all|1|0.0000|0.0000|||LC|[]|[]|||sub(do)| 12 | right|1|||,||LC|[]|[]|||del| 13 | ,|1|||||||[]|||del| 14 | probably|1|||||LC|[]|[]|||del| 15 | just|1|0.0000|0.0000|||LC|[]|[]|||| 16 | that|1|0.0000|0.0000|.||LC|[]|[]|||| 17 | .|1|0.0000|0.0000|||||[]|||sub(?)| 18 | Are|3|0.0000|0.0000|||UC|[]|[]|||| 19 | there|3|0.0000|0.0000|||LC|[]|[]|||| 20 | any|3|0.0000|0.0000|||LC|[]|[]|||| 21 | visuals|3|0.0000|0.0000|||LC|[]|[]|||| 22 | that|3|0.0000|0.0000|||LC|[]|[]|||| 23 | come|3|0.0000|0.0000|||LC|[]|[]|||| 24 | to|3|0.0000|0.0000|||LC|[]|[]|||| 25 | mind|3|0.0000|0.0000|||LC|[]|[]|||| 26 | or|3|0.0000|0.0000|||LC|[]|[]|||| 27 | Yeah|1|0.0000|0.0000|,||UC|[]|[]|||| 28 | ,|1|0.0000|0.0000|||||[]|||| 29 | sure|1|0.0000|0.0000|.||LC|[]|[]|||| 30 | .|1|0.0000|0.0000|||||[]|||| 31 | When|1|0.0000|0.0000|||UC|[]|[]|||| 32 | I|1|0.0000|0.0000|||CA|[]|[]|||| 33 | hear|1|0.0000|0.0000|||LC|[]|[]|||| 34 | Foobar|1|0.0000|0.0000|,||UC|[]|['1', '2']|||| 35 | ,|1|0.0000|0.0000|||||[]|||| 36 | I|1|0.0000|0.0000|||CA|[]|[]|||| 37 | think|1|0.0000|0.0000|||LC|[]|[]|||| 38 | about|1|0.0000|0.0000|||LC|[]|[]|||| 39 | just|1|0.0000|0.0000|||LC|[]|[]|||| 40 | that|1|0.0000|0.0000|:||LC|[]|[]|||| 41 | :|1|0.0000|0.0000|||||[]|||| 42 | 
foo|1|0.0000|0.0000|||LC|[]|[]|||sub(Foobar)| 43 | a|1|0.0000|0.0000|||LC|[]|[]|||| 44 | -------------------------------------------------------------------------------- /test/data/short.aligned.strict.nlp: -------------------------------------------------------------------------------- 1 | token|speaker|ts|endTs|punctuation|prepunctuation|case|tags|wer_tags|oldTs|oldEndTs|ali_comment|confidence 2 | |2|0.0000|0.0000|||LC|[]|[]|||| 3 | Yeah|1|0.0000|0.0000|,||UC|[]|[]|||| 4 | yeah|1|||,||LC|[]|[]|||del| 5 | right|1|0.0000|0.0000|.||LC|[]|[]|||| 6 | Yeah|1|0.0000|0.0000|,||UC|[]|[]|||sub(i'll)| 7 | all|1|0.0000|0.0000|||LC|[]|[]|||sub(do)| 8 | right|1|||,||LC|[]|[]|||del| 9 | probably|1|||||LC|[]|[]|||del| 10 | just|1|0.0000|0.0000|||LC|[]|[]|||| 11 | that|1|0.0000|0.0000|.||LC|[]|[]|||| 12 | Are|3|0.0000|0.0000|||UC|[]|[]|||| 13 | there|3|0.0000|0.0000|||LC|[]|[]|||| 14 | any|3|0.0000|0.0000|||LC|[]|[]|||| 15 | visuals|3|0.0000|0.0000|||LC|[]|[]|||| 16 | that|3|0.0000|0.0000|||LC|[]|[]|||| 17 | come|3|0.0000|0.0000|||LC|[]|[]|||| 18 | to|3|0.0000|0.0000|||LC|[]|[]|||| 19 | mind|3|0.0000|0.0000|||LC|[]|[]|||| 20 | or|3|0.0000|0.0000|||LC|[]|[]|||| 21 | Yeah|1|0.0000|0.0000|,||UC|[]|[]|||| 22 | sure|1|0.0000|0.0000|.||LC|[]|[]|||| 23 | When|1|0.0000|0.0000|||UC|[]|[]|||| 24 | I|1|0.0000|0.0000|||CA|[]|[]|||| 25 | hear|1|0.0000|0.0000|||LC|[]|[]|||| 26 | Foobar|1|0.0000|0.0000|,||UC|[]|[]|||| 27 | I|1|0.0000|0.0000|||CA|[]|[]|||| 28 | think|1|0.0000|0.0000|||LC|[]|[]|||| 29 | about|1|0.0000|0.0000|||LC|[]|[]|||| 30 | just|1|0.0000|0.0000|||LC|[]|[]|||| 31 | that|1|0.0000|0.0000|:||LC|[]|[]|||| 32 | foo|1|0.0000|0.0000|||LC|[]|[]|||sub(foobar)| 33 | a|1|0.0000|0.0000|||LC|[]|[]|||| 34 | -------------------------------------------------------------------------------- /test/data/short.hyp.nlp: -------------------------------------------------------------------------------- 1 | token|speaker|ts|endTs|punctuation|case|tags 2 | |2||||LC|[] 3 | Yeah|1|||,|UC|[] 4 | 
right|1||||LC|[] 5 | I'll|1||||UC|[] 6 | do|1||||LC|[] 7 | just|1||||LC|[] 8 | that|1|||.|LC|[] 9 | Are|3||||UC|[] 10 | there|3||||LC|[] 11 | any|3||||LC|[] 12 | visuals|3||||LC|[] 13 | that|3||||LC|[] 14 | come|3||||LC|[] 15 | to|3||||LC|[] 16 | mind|3||||LC|[] 17 | or|3|||?|LC|[] 18 | Yeah|1|||,|UC|[] 19 | sure|1|||.|LC|[] 20 | When|1||||UC|[] 21 | I|1||||CA|[] 22 | hear|1||||LC|[] 23 | Foobar|1|||,|UC|[] 24 | I|1||||CA|[] 25 | think|1||||LC|[] 26 | about|1||||LC|[] 27 | just|1||||LC|[] 28 | that|1|||:|LC|[] 29 | Foobar|1|||,|UC|[] 30 | a|1||||LC|[] 31 | -------------------------------------------------------------------------------- /test/data/short.hyp.txt: -------------------------------------------------------------------------------- 1 | Yeah right I'll do just that Are there any visuals that come to mind or Yeah sure When I hear Foobar I think about just that Foobar a -------------------------------------------------------------------------------- /test/data/short.ref.nlp: -------------------------------------------------------------------------------- 1 | token|speaker|ts|endTs|punctuation|case|tags 2 | |2||||LC|[] 3 | Yeah|1|||,|UC|[] 4 | yeah|1|||,|LC|[] 5 | right|1|||.|LC|[] 6 | Yeah|1|||,|UC|[] 7 | all|1||||LC|[] 8 | right|1|||,|LC|[] 9 | probably|1||||LC|[] 10 | just|1||||LC|[] 11 | that|1|||.|LC|[] 12 | Are|3||||UC|[] 13 | there|3||||LC|[] 14 | any|3||||LC|[] 15 | visuals|3||||LC|[] 16 | that|3||||LC|[] 17 | come|3||||LC|[] 18 | to|3||||LC|[] 19 | mind|3||||LC|[] 20 | or-|3||||LC|[] 21 | Yeah|1|||,|UC|[] 22 | sure|1|||.|LC|[] 23 | When|1||||UC|[] 24 | I|1||||CA|[] 25 | hear|1||||LC|[] 26 | Foobar|1|||,|UC|[] 27 | I|1||||CA|[] 28 | think|1||||LC|[] 29 | about|1||||LC|[] 30 | just|1||||LC|[] 31 | that|1|||:|LC|[] 32 | foo|1||||LC|[] 33 | a|1||||LC|[] 34 | -------------------------------------------------------------------------------- /test/data/short.sbs.txt: -------------------------------------------------------------------------------- 1 | 
ref_token hyp_token IsErr Class Wer_Tag_Entities 2 | 3 | Yeah Yeah 4 | , , 5 | yeah ERR 6 | , ERR 7 | right right 8 | . ERR 9 | Yeah I'll ERR 10 | , ERR 11 | all do ERR 12 | right ERR 13 | , ERR 14 | probably ERR 15 | just just 16 | that that 17 | . ? ERR 18 | Are Are 19 | there there 20 | any any 21 | visuals visuals 22 | that that 23 | come come 24 | to to 25 | mind mind 26 | or or ___100002_SYN_1-1___ 27 | ? ERR 28 | Yeah Yeah 29 | , , 30 | sure sure 31 | . . 32 | When When 33 | I I 34 | hear hear 35 | Foobar Foobar ###1_PROPER_NOUN###|###2_SPACY>ORG###| 36 | , , 37 | I I 38 | think think 39 | about about 40 | just just 41 | that that 42 | : : 43 | foo Foobar ERR 44 | , ERR 45 | a a 46 | ------------------------------------------------------------ 47 | Line Group 48 | 5 yeah , <-> *** 49 | 8 . Yeah , all right , probably <-> I'll do 50 | 17 . <-> ? 51 | 27 *** <-> ? 52 | 43 foo <-> Foobar , 53 | -------------------------------------------------------------------------------- /test/data/short_punc.hyp.nlp: -------------------------------------------------------------------------------- 1 | token|speaker|ts|endTs|punctuation|case|tags 2 | |2||||LC|[] 3 | Yeah|1|||,|UC|[] 4 | right|1||||LC|[] 5 | I'll|1||||UC|[] 6 | do|1||||LC|[] 7 | just|1||||LC|[] 8 | that|1|||?|LC|[] 9 | Are|3||||UC|[] 10 | there|3||||LC|[] 11 | any|3||||LC|[] 12 | visuals|3||||LC|[] 13 | that|3||||LC|[] 14 | come|3||||LC|[] 15 | to|3||||LC|[] 16 | mind|3||||LC|[] 17 | or|3|||?|LC|[] 18 | Yeah|1|||,|UC|[] 19 | sure|1|||.|LC|[] 20 | When|1||||UC|[] 21 | I|1||||CA|[] 22 | hear|1||||LC|[] 23 | Foobar|1|||,|UC|[] 24 | I|1||||CA|[] 25 | think|1||||LC|[] 26 | about|1||||LC|[] 27 | just|1||||LC|[] 28 | that|1|||:|LC|[] 29 | Foobar|1|||,|UC|[] 30 | a|1||||LC|[] 31 | -------------------------------------------------------------------------------- /test/data/short_punc.ref.nlp: -------------------------------------------------------------------------------- 1 | 
token|speaker|ts|endTs|punctuation|case|tags|wer_tags 2 | |2||||LC|[]|[] 3 | Yeah|1|||,|UC|[]|[] 4 | yeah|1|||,|LC|[]|[] 5 | right|1|||.|LC|[]|[] 6 | Yeah|1|||,|UC|[]|[] 7 | all|1||||LC|[]|[] 8 | right|1|||,|LC|[]|[] 9 | probably|1||||LC|[]|[] 10 | just|1||||LC|[]|[] 11 | that|1|||.|LC|[]|[] 12 | Are|3||||UC|[]|[] 13 | there|3||||LC|[]|[] 14 | any|3||||LC|[]|[] 15 | visuals|3||||LC|[]|[] 16 | that|3||||LC|[]|[] 17 | come|3||||LC|[]|[] 18 | to|3||||LC|[]|[] 19 | mind|3||||LC|[]|[] 20 | or-|3||||LC|[]|[] 21 | Yeah|1|||,|UC|[]|[] 22 | sure|1|||.|LC|[]|[] 23 | When|1||||UC|[]|[] 24 | I|1||||CA|[]|[] 25 | hear|1||||LC|[]|[] 26 | Foobar|1|||,|UC|[]|['1', '2'] 27 | I|1||||CA|[]|[] 28 | think|1||||LC|[]|[] 29 | about|1||||LC|[]|[] 30 | just|1||||LC|[]|[] 31 | that|1|||:|LC|[]|[] 32 | foo|1||||LC|[]|[] 33 | a|1||||LC|[]|[] 34 | -------------------------------------------------------------------------------- /test/data/short_punc.wer_tag.json: -------------------------------------------------------------------------------- 1 | { 2 | "1": { 3 | "entity_type": "PROPER_NOUN" 4 | }, 5 | "2": { 6 | "entity_type": "SPACY>ORG" 7 | } 8 | } 9 | -------------------------------------------------------------------------------- /test/data/speaker_1.hyp.txt: -------------------------------------------------------------------------------- 1 | a b c d e e f g h i j it is uh a b c d f g h i 2 | -------------------------------------------------------------------------------- /test/data/speaker_1.ref.nlp: -------------------------------------------------------------------------------- 1 | token|speaker|ts|endTs|punctuation|case|tags 2 | a|1||||| 3 | b|1||||| 4 | c|1||||| 5 | d|1||||| 6 | e|1||||| 7 | f|1||||| 8 | g|1||||| 9 | h|1||||| 10 | i|1||||| 11 | j|1||||| 12 | um|2||||| 13 | it's|2||||| 14 | a|2||||| 15 | b|2||||| 16 | c|2||||| 17 | d|2||||| 18 | e|2||||| 19 | f|2||||| 20 | g|2||||| 21 | h|2||||| 22 | -------------------------------------------------------------------------------- 
/test/data/speaker_2.hyp.txt: -------------------------------------------------------------------------------- 1 | a b a d e f f g h i k l m n n n o p q r -------------------------------------------------------------------------------- /test/data/speaker_2.ref.nlp: -------------------------------------------------------------------------------- 1 | token|speaker|ts|endTs|punctuation|case|tags 2 | a|1||||| 3 | b|1||||| 4 | c|2||||| 5 | d|1||||| 6 | e|1||||| 7 | f|1||||| 8 | g|1||||| 9 | h|1||||| 10 | i|3||||| 11 | j|3||||| 12 | k|3||||| 13 | l|3||||| 14 | m|3||||| 15 | n|3||||| 16 | o|2||||| 17 | p|2||||| 18 | q|2||||| 19 | r|2||||| 20 | s|2||||| 21 | -------------------------------------------------------------------------------- /test/data/syn_1.hyp.adapted.sbs: -------------------------------------------------------------------------------- 1 | ref_token hyp_token IsErr Class Wer_Tag_Entities 2 | we we'll ERR 3 | will ERR 4 | have have 5 | a a 6 | nice nice 7 | evening evening 8 | um ERR 9 | no no 10 | matter matter 11 | what what 12 | will will 13 | happen happen 14 | it ERR 15 | um is ERR 16 | it's uh ERR 17 | a a 18 | good good 19 | opportunity opportunity 20 | to to 21 | do ERR 22 | this this 23 | you'll you'll 24 | uh ERR 25 | see see 26 | ------------------------------------------------------------ 27 | Line Group 28 | 2 we will <-> we'll 29 | 8 *** <-> um 30 | 14 um it's <-> it is uh 31 | 21 do <-> *** 32 | 24 *** <-> uh 33 | -------------------------------------------------------------------------------- /test/data/syn_1.hyp.sbs: -------------------------------------------------------------------------------- 1 | ref_token hyp_token IsErr Class Wer_Tag_Entities 2 | we ERR 3 | will we'll ERR 4 | have have 5 | a a 6 | nice nice 7 | evening evening 8 | um ERR 9 | no no 10 | matter matter 11 | what what 12 | will will 13 | happen happen 14 | it ERR 15 | um is ERR 16 | it's uh ERR 17 | a a 18 | good good 19 | opportunity opportunity 20 | to to 21 | do ERR 22 | 
this this 23 | you'll you'll 24 | uh ERR 25 | see see 26 | ------------------------------------------------------------ 27 | Line Group 28 | 2 we will <-> we'll 29 | 8 *** <-> um 30 | 14 um it's <-> it is uh 31 | 21 do <-> *** 32 | 24 *** <-> uh 33 | -------------------------------------------------------------------------------- /test/data/syn_1.hyp.txt: -------------------------------------------------------------------------------- 1 | we'll have a nice evening um no matter what will happen it is uh a good opportunity to this you'll uh see 2 | -------------------------------------------------------------------------------- /test/data/syn_1.ref.txt: -------------------------------------------------------------------------------- 1 | we will have a nice evening no matter what will happen um it's a good opportunity to do this you'll see 2 | -------------------------------------------------------------------------------- /test/data/syn_10.hyp.txt: -------------------------------------------------------------------------------- 1 | she will have a great evening 2 | -------------------------------------------------------------------------------- /test/data/syn_10.ref.txt: -------------------------------------------------------------------------------- 1 | she- will have a great evening 2 | -------------------------------------------------------------------------------- /test/data/syn_2.hyp.txt: -------------------------------------------------------------------------------- 1 | we'll 2 | -------------------------------------------------------------------------------- /test/data/syn_2.ref.txt: -------------------------------------------------------------------------------- 1 | we will 2 | -------------------------------------------------------------------------------- /test/data/syn_3.hyp.txt: -------------------------------------------------------------------------------- 1 | we will 2 | -------------------------------------------------------------------------------- 
/test/data/syn_3.ref.txt: -------------------------------------------------------------------------------- 1 | we'll 2 | -------------------------------------------------------------------------------- /test/data/syn_4.hyp.txt: -------------------------------------------------------------------------------- 1 | no 2 | -------------------------------------------------------------------------------- /test/data/syn_4.ref.txt: -------------------------------------------------------------------------------- 1 | we will 2 | -------------------------------------------------------------------------------- /test/data/syn_5.hyp.txt: -------------------------------------------------------------------------------- 1 | will 2 | -------------------------------------------------------------------------------- /test/data/syn_5.ref.txt: -------------------------------------------------------------------------------- 1 | we'll 2 | -------------------------------------------------------------------------------- /test/data/syn_6.hyp.txt: -------------------------------------------------------------------------------- 1 | this is what saying 2 | -------------------------------------------------------------------------------- /test/data/syn_6.ref.txt: -------------------------------------------------------------------------------- 1 | this- is what she's saying 2 | -------------------------------------------------------------------------------- /test/data/syn_7.hyp.txt: -------------------------------------------------------------------------------- 1 | it costs ten bricks of bricks dollars but i will verify what we could have done differently for the second day 2 | -------------------------------------------------------------------------------- /test/data/syn_7.hyp2.txt: -------------------------------------------------------------------------------- 1 | it costs bricks of bricks dollars but i will verify what we could have done differently for the second day 2 | 
-------------------------------------------------------------------------------- /test/data/syn_7.hyp3.txt: -------------------------------------------------------------------------------- 1 | it costs ten bricks of bricks but i will verify what we could have done differently for the second day 2 | -------------------------------------------------------------------------------- /test/data/syn_7.hyp4.txt: -------------------------------------------------------------------------------- 1 | it costs -------------------------------------------------------------------------------- /test/data/syn_7.norm.json: -------------------------------------------------------------------------------- 1 | { 2 | "0": { 3 | "candidates": [ 4 | { 5 | "probability": 0.33, 6 | "verbalization": [ 7 | "ten", 8 | "billions" 9 | ] 10 | }, 11 | { 12 | "probability": 0.33, 13 | "verbalization": [ 14 | "ten", 15 | "billions", 16 | "dollars" 17 | ] 18 | }, 19 | { 20 | "probability": 0.33, 21 | "verbalization": [ 22 | ] 23 | } 24 | ], 25 | "class": "MONEY" 26 | }, 27 | "1": { 28 | "candidates": [ 29 | { 30 | "probability": 1.0, 31 | "verbalization": [ 32 | "second" 33 | ] 34 | } 35 | ], 36 | "class": "ORDINAL" 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /test/data/syn_7.ref.nlp: -------------------------------------------------------------------------------- 1 | token|speaker|ts|endTs|punctuation|case|tags 2 | It|1|0.06999999999999999||,|UC|[] 3 | costs|1||||LC|[] 4 | $10|1||||LC|['0:MONEY'] 5 | billions|1||||LC|['0:MONEY'] 6 | but |1|||…|LC|[] 7 | i'll|2||||UC|[] 8 | verify|2||||LC|[] 9 | what|2||||LC|[] 10 | we|2||||LC|[] 11 | could|2||||LC|[] 12 | have|2|||.|LC|[] 13 | done|2||||UC|[] 14 | differently-|2||||LC|[] 15 | for|2||||LC|[] 16 | the|2||||LC|[] 17 | 2nd|2||||LC|['1:ORDINAL'] 18 | -------------------------------------------------------------------------------- /test/data/syn_7.synonym.rules.txt: 
-------------------------------------------------------------------------------- 1 | # format : LHSRHS 2 | # where: 3 | # LHS : space-delimited words to match in the original reference text 4 | # RHS : semi-colon-delimited list of space-delimited words to consider as equivalent expressions to the LHS 5 | # 6 | # This is non-recursive and single-pass only. 7 | # By default, there won't be an automatic symetry: 8 | # if you want a-> and b->a, you need to specify both 9 | # 10 | # Empty lines or lines starting with '#' are ignored 11 | 12 | 13 | ## FOR TESTING ONLY 14 | ten billions | ten bricks of bricks 15 | 16 | # To Be contractions - present 17 | i am | i'm 18 | i'm | i am 19 | you are | you're 20 | you're | you are 21 | he is | he's 22 | he's | he is 23 | she is | she's 24 | she's | she is 25 | it is | it's 26 | it's | it is 27 | we're | we are 28 | we are | we're 29 | they are | they're 30 | 31 | # To Be contractions - future 32 | i will |i'll 33 | i'll |i will 34 | you will |you'll 35 | you'll |you will 36 | he will |he'll 37 | he'll |he will 38 | she will |she'll 39 | she'll |she will 40 | it will |it'll 41 | it'll |it will 42 | we will |we'll 43 | we'll |we will 44 | they will |they'll 45 | they'll |they will 46 | 47 | i'd |i had ; i would 48 | 49 | # TODO: can't -> cannot ? 50 | # TODO: To Have ? 51 | # TODO: which other contractions? 
52 | -------------------------------------------------------------------------------- /test/data/syn_7_ref4.nlp: -------------------------------------------------------------------------------- 1 | token|speaker|ts|endTs|punctuation|case|tags 2 | It|1|0.06999999999999999||,|UC|[] 3 | costs|1||||LC|[] 4 | $10|1||||LC|['0:MONEY'] 5 | billions|1||||LC|['0:MONEY'] 6 | -------------------------------------------------------------------------------- /test/data/syn_8.hyp.ctm: -------------------------------------------------------------------------------- 1 | recording.wav A 0.00 0.36 we 1.00 2 | recording.wav A 0.36 0.12 will 1.00 3 | recording.wav A 0.48 0.30 see 1.00 4 | recording.wav A 0.75 0.95 that 1.00 5 | recording.wav A 1.05 0.21 that 1.00 6 | recording.wav A 1.26 0.18 was 0.99 7 | recording.wav A 1.44 0.09 an 1.00 8 | recording.wav A 1.53 0.96 accident 1.00 9 | -------------------------------------------------------------------------------- /test/data/syn_8.ref.nlp: -------------------------------------------------------------------------------- 1 | token|speaker|ts|endTs|punctuation|case|tags 2 | We'll|1|0.15|||UC|[] 3 | see|1||||LC|[] 4 | that|1||||LC|[] 5 | was|1||||LC|[] 6 | an|1||||LC|[] 7 | accident|1|||.|LC|[] 8 | -------------------------------------------------------------------------------- /test/data/syn_9.hyp.txt: -------------------------------------------------------------------------------- 1 | he will have a great evening -------------------------------------------------------------------------------- /test/data/syn_9.ref.txt: -------------------------------------------------------------------------------- 1 | she- will have a great evening -------------------------------------------------------------------------------- /test/data/syn_9.synonym.rules.txt: -------------------------------------------------------------------------------- 1 | # format : LHSRHS 2 | # where: 3 | # LHS : space-delimited words to match in the original reference text 4 | 
# RHS : semi-colon-delimited list of space-delimited words to consider as equivalent expressions to the LHS 5 | # 6 | # This is non-recursive and single-pass only. 7 | # By default, there won't be an automatic symetry: 8 | # if you want a-> and b->a, you need to specify both 9 | # 10 | # Empty lines or lines starting with '#' are ignored 11 | 12 | 13 | ## FOR TESTING ONLY 14 | she- | he 15 | -------------------------------------------------------------------------------- /test/data/syn_compound_1.hyp.txt: -------------------------------------------------------------------------------- 1 | things are going to be next level next quarter -------------------------------------------------------------------------------- /test/data/syn_compound_1.ref.txt: -------------------------------------------------------------------------------- 1 | things are going to be next-level next quarter -------------------------------------------------------------------------------- /test/data/syn_compound_2.hyp.txt: -------------------------------------------------------------------------------- 1 | that is are long-term view on politics -------------------------------------------------------------------------------- /test/data/syn_compound_2.ref.txt: -------------------------------------------------------------------------------- 1 | what is our long term view on politics -------------------------------------------------------------------------------- /test/data/test1.hyp.txt: -------------------------------------------------------------------------------- 1 | we'll have victory it is a sure thing we must only calculate p i number 2 | we'll have victory it is a sure thing we must only calculate pee eye number 3 | we will have victory it's a sure thing we must only calculate p i number 4 | we will have victory it's a sure thing we must only calculate p i number 5 | we will have victory it's a sure thing we must only calculate p i number 6 | 
-------------------------------------------------------------------------------- /test/data/test1.ref.txt: -------------------------------------------------------------------------------- 1 | we will have victory marc it's a sure thing we must only calculate p i number 2 | we will have victory it's a sure thing we must only calculate p i number 3 | we will have victory it's a sure thing we must only calculate p i number 4 | we will have victory it's a sure thing we must only calculate p i number 5 | we will have victory it's a sure thing we must only calculate p i number 6 | -------------------------------------------------------------------------------- /test/data/twenty.aligned.punc_case.nlp: -------------------------------------------------------------------------------- 1 | token|speaker|ts|endTs|punctuation|prepunctuation|case|tags|wer_tags|oldTs|oldEndTs|ali_comment|confidence 2 | 20|2|||||CA|['1:CARDINAL']|['1']|84.6600|85.0600|del| 3 | in|2|0.0000|0.0000|||LC|[]|[]|89.1600|89.2800|| 4 | twenty|2|0.0000|0.0000|||CA|['0:YEAR']|['0', '2']|89.7400|89.9900|sub(Twenty),push_last| 5 | twenty|2|0.0000|0.0000|||LC|['0:YEAR']|['0', '2']|89.7400|89.9900|sub(tHiRtY),push_last| 6 | is|2|0.0000|0.0000|||LC|[]|[]|89.1600|89.2800|| 7 | one|2|0.0000|0.0000|||CA|['3:CARDINAL']|['3']|89.7400|89.9900|,push_last| 8 | twenty|2|0.0000|0.0000|||LC|['3:CARDINAL']|['3']|89.7400|89.9900|sub(two),push_last| 9 | three|2|0.0000|0.0000|||LC|['3:CARDINAL']|['3']|89.7400|89.9900|,push_last| 10 | -------------------------------------------------------------------------------- /test/data/twenty.hyp-a2.sbs: -------------------------------------------------------------------------------- 1 | ref_token hyp_token IsErr Class Wer_Tag_Entities 2 | 20 ERR ___1_CARDINAL___ 3 | in in 4 | twenty twenty ___2_YEAR___ 5 | twenty thirty ERR ___2_YEAR___ 6 | is is 7 | one one ___3_CARDINAL___ 8 | twenty twenty ___3_CARDINAL___ 9 | two ERR ___3_CARDINAL___ 10 | three three ___3_CARDINAL___ 11 | 
------------------------------------------------------------ 12 | Line Group 13 | 2 20 <-> *** 14 | 5 twenty <-> thirty 15 | 9 *** <-> two 16 | -------------------------------------------------------------------------------- /test/data/twenty.hyp.punc_case.txt: -------------------------------------------------------------------------------- 1 | in Twenty tHiRtY , is one TWENTY two three 2 | 3 | -------------------------------------------------------------------------------- /test/data/twenty.hyp.sbs: -------------------------------------------------------------------------------- 1 | ref_token hyp_token IsErr Class Wer_Tag_Entities 2 | twenty ERR ___1_CARDINAL___ 3 | in in 4 | twenty twenty ___2_YEAR___ 5 | twenty thirty ERR ___2_YEAR___ 6 | is is 7 | one one ___3_CARDINAL___ 8 | twenty twenty ___3_CARDINAL___ 9 | two ERR ___3_CARDINAL___ 10 | three three ___3_CARDINAL___ 11 | ------------------------------------------------------------ 12 | Line Group 13 | 2 twenty <-> *** 14 | 5 twenty <-> thirty 15 | 9 *** <-> two 16 | -------------------------------------------------------------------------------- /test/data/twenty.hyp.txt: -------------------------------------------------------------------------------- 1 | in twenty thirty is one twenty two three 2 | 3 | -------------------------------------------------------------------------------- /test/data/twenty.norm.json: -------------------------------------------------------------------------------- 1 | { 2 | "1": { 3 | "candidates": [ 4 | { 5 | "probability": 1.0, 6 | "verbalization": [ 7 | "twenty" 8 | ] 9 | } 10 | ], 11 | "class": "CARDINAL" 12 | }, 13 | "2": { 14 | "candidates": [ 15 | { 16 | "probability": 0.3333333333333333, 17 | "verbalization": [ 18 | "twenty", 19 | "twenty" 20 | ] 21 | }, 22 | { 23 | "probability": 0.3333333333333333, 24 | "verbalization": [ 25 | "two", 26 | "thousand", 27 | "twenty" 28 | ] 29 | }, 30 | { 31 | "probability": 0.3333333333333333, 32 | "verbalization": [ 33 | "two", 34 | 
"thousand", 35 | "and", 36 | "twenty" 37 | ] 38 | } 39 | ], 40 | "class": "YEAR" 41 | }, 42 | "3": { 43 | "candidates": [ 44 | { 45 | "probability": 0.2, 46 | "verbalization": [ 47 | "one", 48 | "twenty", 49 | "three" 50 | ] 51 | }, 52 | { 53 | "probability": 0.2, 54 | "verbalization": [ 55 | "a", 56 | "hundred", 57 | "twenty", 58 | "three" 59 | ] 60 | }, 61 | { 62 | "probability": 0.2, 63 | "verbalization": [ 64 | "one", 65 | "hundred", 66 | "twenty", 67 | "three" 68 | ] 69 | }, 70 | { 71 | "probability": 0.2, 72 | "verbalization": [ 73 | "a", 74 | "hundred", 75 | "and", 76 | "twenty", 77 | "three" 78 | ] 79 | }, 80 | { 81 | "probability": 0.2, 82 | "verbalization": [ 83 | "one", 84 | "hundred", 85 | "and", 86 | "twenty", 87 | "three" 88 | ] 89 | } 90 | ], 91 | "class": "CARDINAL" 92 | } 93 | } -------------------------------------------------------------------------------- /test/data/twenty.ref.nlp: -------------------------------------------------------------------------------- 1 | token|speaker|ts|endTs|punctuation|case|tags|oldTs|oldEndTs|ali_comment 2 | 20|2|84.6600|85.0600||CA|['1:CARDINAL']|||,push_last 3 | in|2|89.1600|89.2800||LC|[]||| 4 | 2020|2|89.7400|89.9900||CA|['2:YEAR']|||,push_last 5 | is|2|89.1600|89.2800||LC|[]||| 6 | 123|2|89.7400|89.9900||CA|['3:CARDINAL']|||,push_last 7 | -------------------------------------------------------------------------------- /test/data/twenty.ref.testing.nlp: -------------------------------------------------------------------------------- 1 | token|speaker|ts|endTs|punctuation|case|tags|wer_tags 2 | 20|2|84.6600|85.0600||CA|['1:CARDINAL']|['1'] 3 | in|2|89.1600|89.2800||LC|[]|[] 4 | 2020|2|89.7400|89.9900||CA|['0:YEAR']|['0', '2'] 5 | is|2|89.1600|89.2800||LC|[]|[] 6 | 123|2|89.7400|89.9900||CA|['3:CARDINAL']|['3'] 7 | -------------------------------------------------------------------------------- /test/data/twenty.ref.testing.norm.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "0": { 3 | "candidates": [ 4 | { 5 | "probability": 0.8699402786994028, 6 | "verbalization": [ 7 | "twenty", 8 | "twenty" 9 | ] 10 | }, 11 | { 12 | "probability": 0.032183145321831454, 13 | "verbalization": [ 14 | "two", 15 | "thousand", 16 | "and", 17 | "twenty" 18 | ] 19 | }, 20 | { 21 | "probability": 0.09787657597876576, 22 | "verbalization": [ 23 | "two", 24 | "thousand", 25 | "twenty" 26 | ] 27 | } 28 | ], 29 | "class": "YEAR" 30 | }, 31 | "1": { 32 | "candidates": [ 33 | { 34 | "probability": 1.0, 35 | "verbalization": [ 36 | "twenty" 37 | ] 38 | } 39 | ], 40 | "class": "CARDINAL" 41 | }, 42 | "3": { 43 | "candidates": [ 44 | { 45 | "probability": 0.06962025316455696, 46 | "verbalization": [ 47 | "a", 48 | "hundred", 49 | "and", 50 | "twenty", 51 | "three" 52 | ] 53 | }, 54 | { 55 | "probability": 0.09177215189873418, 56 | "verbalization": [ 57 | "a", 58 | "hundred", 59 | "twenty", 60 | "three" 61 | ] 62 | }, 63 | { 64 | "probability": 0.012658227848101266, 65 | "verbalization": [ 66 | "one", 67 | "hundred", 68 | "and", 69 | "twenty", 70 | "three" 71 | ] 72 | }, 73 | { 74 | "probability": 0.0189873417721519, 75 | "verbalization": [ 76 | "one", 77 | "hundred", 78 | "twenty", 79 | "three" 80 | ] 81 | }, 82 | { 83 | "probability": 0.8069620253164557, 84 | "verbalization": [ 85 | "one", 86 | "twenty", 87 | "three" 88 | ] 89 | } 90 | ], 91 | "class": "CARDINAL" 92 | } 93 | } -------------------------------------------------------------------------------- /test/data/wer_utf.hyp.txt: -------------------------------------------------------------------------------- 1 | ça va va bien aujourd'hui éte inutile Êtes -------------------------------------------------------------------------------- /test/data/wer_utf.ref.txt: -------------------------------------------------------------------------------- 1 | Ça va bien aujourd'hui étÉ inutile êtes 
-------------------------------------------------------------------------------- /test/fast-d-tests.cc: -------------------------------------------------------------------------------- 1 | #define CATCH_CONFIG_MAIN 2 | #include /* srand, rand */ 3 | #include /* time */ 4 | #include "../third-party/catch2/single_include/catch2/catch.hpp" 5 | #include "src/fast-d.h" 6 | typedef std::vector vint; 7 | 8 | TEST_CASE("simple-edits-counts") { 9 | vint a = {1, 2, 3, 4, 5}; 10 | vint b = {1, 2, 8, 4, 5}; 11 | 12 | REQUIRE(GetEditDistance(a, b) == 1); 13 | // edit distance should be symetric 14 | REQUIRE(GetEditDistance(b, a) == 1); 15 | 16 | // distance of oneself with oneself should be 0 17 | REQUIRE(GetEditDistance(a, a) == 0); 18 | REQUIRE(GetEditDistance(b, b) == 0); 19 | } 20 | 21 | TEST_CASE("boundaries-edits-counts") { 22 | vint a = {1, 2, 3, 4, 5}; 23 | vint b = {}; 24 | 25 | REQUIRE(GetEditDistance(a, b) == 5); 26 | // edit distance should be symetric 27 | REQUIRE(GetEditDistance(b, a) == 5); 28 | 29 | // distance of oneself with oneself should be 0 30 | REQUIRE(GetEditDistance(a, a) == 0); 31 | REQUIRE(GetEditDistance(b, b) == 0); 32 | } 33 | 34 | TEST_CASE("just-one-target-edits-counts") { 35 | vint a = {1, 2, 3, 4, 5}; 36 | vint b1 = {1}; 37 | vint b2 = {2}; 38 | vint b3 = {3}; 39 | vint b4 = {4}; 40 | vint b5 = {5}; 41 | 42 | REQUIRE(GetEditDistance(a, b1) == 4); 43 | REQUIRE(GetEditDistance(a, b2) == 4); 44 | REQUIRE(GetEditDistance(a, b3) == 4); 45 | REQUIRE(GetEditDistance(a, b4) == 4); 46 | REQUIRE(GetEditDistance(a, b5) == 4); 47 | } 48 | 49 | TEST_CASE("single-edits-counts") { 50 | vint a = {1, 2, 3, 4, 5}; 51 | vint b = {1, 1, 2, 3, 4, 5}; 52 | 53 | REQUIRE(GetEditDistance(b, a) == 1); 54 | REQUIRE(GetEditDistance(a, b) == 1); 55 | } 56 | 57 | TEST_CASE("left-insert") { 58 | vint va = {1, 2, 3, 4, 5}; 59 | vint vb = {8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 1, 2, 3, 4, 5}; 60 | vint mapA; 61 | vint mapB; 62 | 63 | REQUIRE(GetEditDistance(va, mapA, vb, mapB) == 10); 
64 | 65 | vint vb1 = {8, 8, 8, 8, 8, 1, 8, 8, 8, 8, 1, 2, 3, 4, 5}; 66 | REQUIRE(GetEditDistance(va, mapA, vb1, mapB) == 10); 67 | } 68 | 69 | TEST_CASE("map-test-A") { 70 | vint a = {1, 2, 3, 4, 5}; 71 | vint b = {1, 2, 3, 4, 5}; 72 | vint mapA = {-1, -1, -1, -1, -1}; 73 | vint mapB = {-1, -1, -1, -1, -1}; 74 | 75 | REQUIRE(GetEditDistance(a, mapA, b, mapB) == 0); 76 | for (int i = 0; i < 5; i++) { 77 | REQUIRE(mapA[i] == 1); 78 | REQUIRE(mapB[i] == 1); 79 | } 80 | } 81 | 82 | TEST_CASE("map-test-B") { 83 | vint a = {1, 2, 3, 4, 5}; 84 | vint b = {1, 4, 5}; 85 | vint mapA = {-1, -1, -1, -1, -1}; 86 | vint mapB = {-1, 1, -1}; 87 | 88 | REQUIRE(GetEditDistance(a, mapA, b, mapB) == 2); 89 | REQUIRE(mapA[0] == 1); 90 | REQUIRE(mapA[1] == -1); 91 | REQUIRE(mapA[2] == -1); 92 | REQUIRE(mapA[3] == 1); 93 | REQUIRE(mapA[4] == 1); 94 | 95 | REQUIRE(mapB[0] == 1); 96 | REQUIRE(mapB[1] == 1); 97 | REQUIRE(mapB[2] == 1); 98 | } 99 | 100 | TEST_CASE("map-test-C") { 101 | vint a = {1, 2, 3, 4, 5}; 102 | vint b = {10, 20, 30, 40, 50}; 103 | vint mapA = {-1, -1, -1, -1, -1}; 104 | // vint mapB = {-1, -1, -1, -1, -1}; 105 | vint mapB; 106 | 107 | REQUIRE(GetEditDistance(a, mapA, b, mapB) == 5); 108 | for (int i = 0; i < 5; i++) { 109 | REQUIRE(mapA[i] == -1); 110 | REQUIRE(mapB[i] == -1); 111 | } 112 | } 113 | 114 | TEST_CASE("map-repeat") { 115 | vint a = {1, 2, 3, 4, 5, 1, 2, 7, 8, 1, 2, 3}; 116 | vint b = {1, 2, 3, 5, 9, 1, 2, 7, 8, 1, 2}; 117 | vint mapA; 118 | vint mapB; 119 | 120 | int dist = GetEditDistance(a, mapA, b, mapB); 121 | REQUIRE(mapA.size() == a.size()); 122 | REQUIRE(mapB.size() == b.size()); 123 | 124 | for (int i = 0; i < a.size(); i++) { 125 | std::cout << " a[" << i << "] = " << a[i] << ", mapA[" << i << "] = " << mapA[i] << std::endl; 126 | } 127 | for (int i = 0; i < b.size(); i++) { 128 | std::cout << " b[" << i << "] = " << b[i] << ", mapB[" << i << "] = " << mapB[i] << std::endl; 129 | } 130 | 131 | REQUIRE(dist == 3); 132 | } 133 | 134 | 
TEST_CASE("test-long-seq") { 135 | srand(time(NULL)); 136 | int ins_rate = 20; // over 1k, so 2% 137 | int del_rate = 20; // over 1k, so 2% 138 | int sub_rate = 50; // over 1k, so 5% 139 | 140 | int retries_left = 5; 141 | int number_of_edits = 0; 142 | int edit_distance = 0; 143 | 144 | // for stochastic reasons, it is possible that the naive 145 | // ins + sub + del count gets off a little. We give ourselves 146 | // few attempts to validate that this test passes. 147 | while (retries_left-- > 0) { 148 | vint a; 149 | vint b; 150 | vint mapA; 151 | vint mapB; 152 | 153 | number_of_edits = 0; 154 | int num_ins = 0; 155 | int num_del = 0; 156 | int num_sub = 0; 157 | 158 | for (int i = 0; i < 1000; i++) { 159 | int ai = rand() % 32000 + rand() % 32000 + rand() % 32000 + rand() % 32000 + 1; 160 | a.push_back(ai); 161 | 162 | // if you want to debug the test 163 | // std::cout << "a[" << i << "] = " << a[i] << std::endl; 164 | int extra_char = a[i] + rand() % 32000 + 40000; 165 | 166 | int f = rand() % 1000; 167 | if (f < ins_rate) { 168 | b.push_back(extra_char); 169 | b.push_back(a[i]); 170 | number_of_edits++; 171 | num_ins++; 172 | } else if (f < ins_rate + del_rate) { 173 | // let's skip this one 174 | number_of_edits++; 175 | num_del++; 176 | } else if (f < ins_rate + del_rate + sub_rate) { 177 | b.push_back(extra_char); 178 | number_of_edits++; 179 | num_sub++; 180 | } else { 181 | b.push_back(a[i]); 182 | } 183 | } 184 | 185 | // if you want to debug the test 186 | // for (int j = 0; j < b.size(); j++) { 187 | // std::cout << "b[" << j << "] = " << b[j] << std::endl; 188 | // } 189 | 190 | std::cout << " We have " << num_ins << " insertions, " << num_del << " deletions and " << num_sub 191 | << " substitution for a total of " << number_of_edits << " edits" << std::endl; 192 | 193 | edit_distance = GetEditDistance(a, mapA, b, mapB); 194 | if (edit_distance != number_of_edits) { 195 | std::cout << "a= " << a[0]; 196 | for (int i = 1; i < a.size(); i++) { 197 | 
std::cout << " " << a[i]; 198 | } 199 | std::cout << std::endl; 200 | std::cout << "b= " << b[0]; 201 | for (int i = 1; i < b.size(); i++) { 202 | std::cout << " " << b[i]; 203 | } 204 | std::cout << std::endl; 205 | continue; 206 | } else { 207 | break; 208 | } 209 | } 210 | 211 | REQUIRE(edit_distance == number_of_edits); 212 | } 213 | 214 | TEST_CASE("test-long-seq-editonly") { 215 | srand(time(NULL)); 216 | int ins_rate = 20; // over 1k, so 2% 217 | int del_rate = 20; // over 1k, so 2% 218 | int sub_rate = 50; // over 1k, so 5% 219 | 220 | int retries_left = 5; 221 | int number_of_edits = 0; 222 | int edit_distance = 0; 223 | 224 | // for stochastic reasons, it is possible that the naive 225 | // ins + sub + del count gets off a little. We give ourselves 226 | // few attempts to validate that this test passes. 227 | while (retries_left-- > 0) { 228 | vint a; 229 | vint b; 230 | vint mapA; 231 | vint mapB; 232 | 233 | number_of_edits = 0; 234 | int num_ins = 0; 235 | int num_del = 0; 236 | int num_sub = 0; 237 | 238 | for (int i = 0; i < 1000; i++) { 239 | int ai = rand() % 32000 + rand() % 32000 + rand() % 32000 + rand() % 32000 + 1; 240 | a.push_back(ai); 241 | 242 | // if you want to debug the test 243 | // std::cout << "a[" << i << "] = " << a[i] << std::endl; 244 | int extra_char = a[i] + rand() % 32000 + 40000; 245 | 246 | int f = rand() % 1000; 247 | if (f < ins_rate) { 248 | b.push_back(extra_char); 249 | b.push_back(a[i]); 250 | number_of_edits++; 251 | num_ins++; 252 | } else if (f < ins_rate + del_rate) { 253 | // let's skip this one 254 | number_of_edits++; 255 | num_del++; 256 | } else if (f < ins_rate + del_rate + sub_rate) { 257 | b.push_back(extra_char); 258 | number_of_edits++; 259 | num_sub++; 260 | } else { 261 | b.push_back(a[i]); 262 | } 263 | } 264 | 265 | // if you want to debug the test 266 | // for (int j = 0; j < b.size(); j++) { 267 | // std::cout << "b[" << j << "] = " << b[j] << std::endl; 268 | // } 269 | 270 | std::cout << " We 
have " << num_ins << " insertions, " << num_del << " deletions and " << num_sub 271 | << " substitution for a total of " << number_of_edits << " edits" << std::endl; 272 | 273 | edit_distance = GetEditDistanceOnly(a, b); 274 | if (edit_distance != number_of_edits) { 275 | std::cout << "a= " << a[0]; 276 | for (int i = 1; i < a.size(); i++) { 277 | std::cout << " " << a[i]; 278 | } 279 | std::cout << std::endl; 280 | std::cout << "b= " << b[0]; 281 | for (int i = 1; i < b.size(); i++) { 282 | std::cout << " " << b[i]; 283 | } 284 | std::cout << std::endl; 285 | continue; 286 | } else { 287 | break; 288 | } 289 | } 290 | 291 | REQUIRE(edit_distance == number_of_edits); 292 | } -------------------------------------------------------------------------------- /test/test-utilties.h: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include "src/logging.h" 7 | #ifndef __TEST_UTILITIES_H__ 8 | #define __TEST_UTILITIES_H__ 1 9 | const std::string TEST_BINARY = "./fstalign"; 10 | const std::string TEST_DATA = "../test/data/"; 11 | const std::string TEST_SYNONYMS = "../sample_data/synonyms.rules.txt"; 12 | #ifdef WINDOWS 13 | #include 14 | #define GetCurrentDir _getcwd 15 | #else 16 | #include 17 | #define GetCurrentDir getcwd 18 | #endif 19 | 20 | void pclose_test(FILE *fp) { 21 | int status = pclose(fp); 22 | if (status != 0) { 23 | throw std::runtime_error("exit status non-zero!"); 24 | } 25 | } 26 | 27 | std::string get_current_dir() { 28 | char buff[FILENAME_MAX]; // create string buffer to hold path 29 | GetCurrentDir(buff, FILENAME_MAX); 30 | std::string current_working_dir(buff); 31 | return current_working_dir; 32 | } 33 | 34 | // Executes a specific shell command, and returns a string containing the output 35 | std::string exec(const std::string &cmd) { 36 | const size_t length = 256; 37 | std::array buffer; 38 | 39 | std::shared_ptr pipe{popen(cmd.c_str(), "r"), 
pclose_test}; 40 | if (!pipe) { 41 | throw std::runtime_error("popen() failed!"); 42 | } 43 | 44 | std::string result; 45 | while (!feof(pipe.get())) { 46 | if (fgets(buffer.data(), length, pipe.get()) != nullptr) result += buffer.data(); 47 | } 48 | result = result + "\nCommand:\n" + cmd + "\n"; 49 | 50 | return result; 51 | } 52 | 53 | // Generates a specific fstalign command given certain flag values 54 | std::string command(const char *subcommand, const char *approach, const char *reference, const char *hypothesis, 55 | const std::string output_sbs = "", const std::string output_nlp = "", 56 | const std::string synonyms = "", const char *refJson = nullptr, const bool disableCutoffs = false, 57 | const int speakerSwitchContextSize = -1, const std::string extraFlags = "") { 58 | const auto ref = std::string{"--ref "} + TEST_DATA + reference; 59 | const auto hyp = std::string{"--hyp "} + TEST_DATA + hypothesis; 60 | 61 | auto cmd = std::string{TEST_BINARY} + " " + subcommand + " " + approach + " " + ref + " " + hyp; 62 | // useful for debugging test 63 | // auto logger = logger::GetOrCreateLogger("main()"); 64 | // logger->info("final command is {}", cmd); 65 | 66 | if (!synonyms.empty()) { 67 | cmd = cmd + " --syn " + synonyms; 68 | } 69 | if (refJson != nullptr) { 70 | cmd = cmd + " --ref-json " + TEST_DATA + refJson; 71 | } 72 | 73 | if (disableCutoffs) { 74 | cmd += " --disable-cutoffs"; 75 | } 76 | 77 | if (speakerSwitchContextSize > 0) { 78 | cmd += " --speaker-switch-context " + std::to_string(speakerSwitchContextSize); 79 | } 80 | 81 | if (!output_sbs.empty()) { 82 | cmd = cmd + " --output-sbs " + output_sbs; 83 | } 84 | 85 | if (!output_nlp.empty()) { 86 | cmd = cmd + " --output-nlp " + output_nlp; 87 | } 88 | 89 | if (!extraFlags.empty()) { 90 | cmd += " " + extraFlags; 91 | } 92 | return cmd; 93 | } 94 | 95 | // Compares two test files for exact equality 96 | bool compareFiles(const std::string &p1, const std::string &p2) { 97 | std::ifstream f1(p1, 
std::ifstream::binary | std::ifstream::ate); 98 | std::ifstream f2(p2, std::ifstream::binary | std::ifstream::ate); 99 | 100 | // useful for debugging test 101 | auto logger = logger::GetOrCreateLogger("main()"); 102 | // logger->info("comparing {} with {}", p1, p2); 103 | 104 | if (f1.fail() || f2.fail()) { 105 | logger->info("comparing {} with {}", p1, p2); 106 | logger->info("some file can't be opened"); 107 | return false; // file problem 108 | } 109 | 110 | if (f1.tellg() != f2.tellg()) { 111 | logger->info("comparing {} with {}", p1, p2); 112 | logger->info("files sizes don't match {}, vs {}", f1.tellg(), f2.tellg()); 113 | return false; // size mismatch 114 | } 115 | 116 | // seek back to beginning and use std::equal to compare contents 117 | f1.seekg(0, std::ifstream::beg); 118 | f2.seekg(0, std::ifstream::beg); 119 | return std::equal(std::istreambuf_iterator(f1.rdbuf()), std::istreambuf_iterator(), 120 | std::istreambuf_iterator(f2.rdbuf())); 121 | } 122 | 123 | #endif 124 | -------------------------------------------------------------------------------- /tools/README.md: -------------------------------------------------------------------------------- 1 | # Tools 2 | A collection of miscellaneous tools to support the fstalign project. 3 | 4 | ## generate_wer_test_data.pl 5 | A simple perl script to generate synthetic transcripts with a targeted word error rate. Outputs will be written as plain text to `ref.out` and `hyp.out`. 6 | The script contains settings to generate specific INS/DEL/SUB error frequencies, in addition to target reference transcript length. This is useful for testing the WER behavior of `fstalign` and also the performance of the algorithm when hit with edge case scenarios (e.g. 80% deletion rate). 
7 | 8 | Example usage: 9 | `perl generate_wer_test_data.pl --ins_fract 0.2 --del_fract 0.3 --sub_fract 0.2 --ref_length 1000 --oref ref.out --ohyp hyp.out` 10 | 11 | Example output: 12 | ``` 13 | writing to [ref.out] 14 | writing to [hyp.out] 15 | 181 INS 16 | 316 DEL 17 | 205 SUB 18 | expected WER 0.702 19 | ``` 20 | 21 | NOTE: this script provides an approximate WER, the algorithm could use some fine tuning to be exact. 22 | 23 | ## gather_runtime_metrics.sh 24 | A simple bash script that is meant for benchmarking the resource (RAM and runtime) consumption of fstalign across different transcript settings (length, WER). It uses the `generate_wer_test_data.pl` to generate fake transcripts with a suite of hard-coded settings and runs them through fstalign, recording the resource usage to a CSV. 25 | 26 | Example usage: 27 | `bash gather_runtime_metrics.sh output_for_this_release.csv` 28 | 29 | ## sbs2fst.py 30 | A python interface to simplify the conversion of a side-by-side file, generated from fstalign's `--output-sbs` flag, into [files that can be used to produce an FST using OpenFST](https://www.openfst.org/twiki/bin/view/FST/FstQuickTour). 31 | 32 | Example usage: 33 | 34 | `python sbs2fst.py sbs_file.txt fst_file_name` 35 | 36 | The output will be two files: `fst_file_name.fst` which will describe the FST in the AT&T FSM format used by OpenFST, and `fst_file_name.txt` which contains the complete list of symbols in the FST. 37 | 38 | The additional flags can be passed into the python script to add metadata that fstalign uses for tracking performance. These are useful to understand when fstalign picks tokens that are: only in the side-by-side's `ref_token` column (labeled by the `--left` flag), only in the side-by-side's `hyp_token` column (labeled by the `--right` flag), or in both columns because the `ref_token` and `hyp_token` agree (labeled by the `--gold` flag). 
39 | 40 | Example usage: 41 | 42 | `python sbs2fst.py --tag --left VERBATIM --right NONVERBATIM --gold AGREEMENT sbs_file.txt fst_file_name` 43 | 44 | The output will produce an FST with tags indicating tokens that were only in the `ref_token` with `VERBATIM`, tokens that were only in the `hyp_token` with `NONVERBATIM`, and tokens that were in both columns with `AGREEMENT`. 45 | 46 | ### Compiling the FST 47 | Once you have used `sbs2fst.py` to produce the `.txt` and `.fst` files, you *must* then compile the FST before passing it into fstalign. An example command can be found below: 48 | 49 | `fstcompile --isymbols=${SYMBOLS} --osymbols=${SYMBOLS} ${TXT_FST} ${COMPILED_FST}` 50 | 51 | where `SYMBOLS` is the `.txt` file produced by `sbs2fst.py`, `TXT_FST` is the `.fst` file, and `COMPILED_FST` is a new `.fst` file that contains the binary FST usable by fstalign. 52 | 53 | Example usage: 54 | ```bash 55 | python sbs2fst.py --tag --left VERBATIM --right NONVERBATIM --gold AGREEMENT sbs_file.txt fst_file_name 56 | fstcompile --isymbols=fst_file_name.txt --osymbols=fst_file_name.txt fst_file_name.fst fst_file_name.compiled.fst 57 | ``` 58 | You can now use `fst_file_name.compiled.fst` in fstalign with the corresponding symbols file as follows: 59 | ```bash 60 | fstalign --ref fst_file_name.compiled.fst --symbols fst_file_name.txt ... 61 | ``` 62 | 63 | Note that when you use `sbs2fst.py` to produce a "tagged" FST with the `--tag` flag, fstalign will aggregate WER metrics for each of the specified tags (`--left`, `--right`, and `--gold`) in the JSON log file specified by fstalign's `--json-log` flag. 
64 | 65 | -------------------------------------------------------------------------------- /tools/gather_runtime_metrics.sh: -------------------------------------------------------------------------------- 1 | # Script to gather runtime metrics on fstalign binary 2 | 3 | benchmark_settings() { 4 | local outdir=$1 # directory to write refs, hyps, and stats to 5 | local ref_length=$2 # target number of words when making a synthetic reference 6 | local num_repeats=$3 # number of trials to run for this benchmark 7 | local ins_rate=$4 # target insertion rate when making a synthetic hypothesis 8 | local del_rate=$5 # target deletion rate when making a synthetic hypothesis 9 | local sub_rate=$6 # target substitution rate when making a synthetic hypothesis 10 | local outcsv=$7 # output to write comma separated stats to 11 | 12 | for i in $(seq $num_repeats); do 13 | perl generate_wer_test_data.pl --ins_fract $ins_rate \ 14 | --del_fract $del_rate \ 15 | --sub_fract $sub_rate \ 16 | --ref_length $ref_length \ 17 | --oref "${outdir}/ref${i}.txt" \ 18 | --ohyp "${outdir}/hyp${i}.txt" 19 | 20 | /usr/bin/time -v fstalign wer --ref "${outdir}/ref${i}.txt" \ 21 | --hyp "${outdir}/hyp${i}.txt" 2> "${outdir}/stats${i}.txt" 22 | 23 | runtime=$(grep "Elapsed (wall clock) time" "${outdir}/stats${i}.txt" | awk 'NF>1{print $NF}') 24 | ram=$(grep "Maximum resident set size" "${outdir}/stats${i}.txt" | awk 'NF>1{print $NF}') 25 | 26 | echo "${ref_length},${ins_rate},${del_rate},${sub_rate},${runtime},${ram}" >> "${outcsv}" 27 | done 28 | } 29 | 30 | main() { 31 | echo "$0 $@" # Print the command line for logging 32 | 33 | local outcsv=$1 34 | 35 | echo "length,ins,del,sub,runtime,ram" >> "${outcsv}" 36 | 37 | # Stage 1: medium transcripts, different WER 38 | dir="temp" 39 | mkdir "${dir}" 40 | benchmark_settings "${dir}" 1000 5 0.1 0.1 0.1 "${outcsv}" 41 | benchmark_settings "${dir}" 1000 5 0.2 0.2 0.2 "${outcsv}" 42 | benchmark_settings "${dir}" 1000 5 0.3 0.3 0.3 "${outcsv}" 43 | 
benchmark_settings "${dir}" 1000 5 0.1 0.1 0.4 "${outcsv}" 44 | benchmark_settings "${dir}" 1000 5 0.1 0.4 0.1 "${outcsv}" 45 | benchmark_settings "${dir}" 1000 5 0.4 0.1 0.1 "${outcsv}" 46 | 47 | # Stage 2: single WER, different length transcripts 48 | benchmark_settings "${dir}" 100 10 0.1 0.1 0.1 "${outcsv}" 49 | benchmark_settings "${dir}" 200 10 0.1 0.1 0.1 "${outcsv}" 50 | benchmark_settings "${dir}" 400 10 0.1 0.1 0.1 "${outcsv}" 51 | benchmark_settings "${dir}" 800 5 0.1 0.1 0.1 "${outcsv}" 52 | benchmark_settings "${dir}" 2000 5 0.1 0.1 0.1 "${outcsv}" 53 | benchmark_settings "${dir}" 4000 2 0.1 0.1 0.1 "${outcsv}" 54 | benchmark_settings "${dir}" 8000 2 0.1 0.1 0.1 "${outcsv}" 55 | benchmark_settings "${dir}" 16000 2 0.1 0.1 0.1 "${outcsv}" 56 | benchmark_settings "${dir}" 32000 2 0.1 0.1 0.1 "${outcsv}" 57 | } 58 | 59 | main "$@" 60 | -------------------------------------------------------------------------------- /tools/generate_wer_test_data.pl: -------------------------------------------------------------------------------- 1 | use strict; 2 | use Getopt::Long; 3 | 4 | my $in_ref; 5 | my $out_ref = "ref.out"; 6 | my $out_hyp = "hyp.out"; 7 | my $ins_fract = 0.1; 8 | my $del_fract = 0.1; 9 | my $sub_fract = 0.1; 10 | my $ref_length = 1000; 11 | 12 | my $voc_length = 10000; 13 | 14 | my $rc = GetOptions( 15 | "iref=s" =>\$in_ref, 16 | "oref=s" =>\$out_ref, 17 | "ohyp=s" =>\$out_hyp, 18 | "ins_fract=f" => \$ins_fract, 19 | "del_fract=f" => \$del_fract, 20 | "sub_fract=f" => \$sub_fract, 21 | "ref_length=i" => \$ref_length, 22 | "voc_length=i" => \$voc_length, 23 | ); 24 | 25 | die "check your commandline!\n" if(!$rc); 26 | 27 | # just making things slightly easier 28 | our %words; 29 | our $word_mass =0; 30 | my @ref_words; 31 | my @hyp_words; 32 | 33 | if(!defined($in_ref)){ 34 | for(my $i = 0; $i < $voc_length; $i++) 35 | { 36 | my $w = sprintf("w%06d", $i); 37 | # fix to get a non-uniform distribution 38 | my $mass = 1; 39 | $words{$w} = $mass; 40 | 
$word_mass += $mass; 41 | } 42 | 43 | for(my $i =0; $i < $ref_length; $i++){ 44 | my $w = select_word(); 45 | push(@ref_words, $w); 46 | } 47 | } else { 48 | open(FF, "<$in_ref") || die "couldn't open [$in_ref] for reading!"; 49 | while(my $l = <FF>){ 50 | chomp($l); 51 | $l=~s/\s*$//; 52 | my @wds = split(/\s+/, $l); 53 | foreach (@wds){ 54 | $words{$_}++; 55 | $word_mass += 1; 56 | push(@ref_words, $_); 57 | } 58 | } 59 | 60 | $ref_length = scalar(@ref_words); 61 | } 62 | 63 | my $num_ins = 0; 64 | my $num_del = 0; 65 | my $num_sub = 0; 66 | my $i = 0; 67 | my $last_was_del = 0; 68 | 69 | my $ins_thres = $ins_fract; 70 | my $del_thres = $ins_thres + $del_fract; 71 | my $sub_thres = $del_thres + $sub_fract; 72 | my $owed_ins = 0; 73 | 74 | # Algorithm is as follows: 75 | # Because "word error rate" is defined as a rate respective to the number of 76 | # reference words, we sample for an "error" while looping over a reference word 77 | # counter. The only thing we need to do is avoid consecutive INS+DEL or DEL+INS, 78 | # because these will be counted as SUB. Thus, for every INS sampled, we add a 79 | # ref word after the INS to avoid INS+DEL, or add to a counter to owed_ins if 80 | # a DEL just happened. 
81 | 82 | while($i < $ref_length) 83 | { 84 | my $r = rand(); 85 | my $rw = $ref_words[$i]; 86 | 87 | if($r <= $ins_thres) 88 | { 89 | if($last_was_del){ 90 | # let's not insert after a deletion, this looks like 91 | # a substitution 92 | $owed_ins++; 93 | 94 | # Add in a reference word to keep sampling 95 | push(@hyp_words, $rw); 96 | $i++; 97 | next; 98 | } else { 99 | # safe to insert, add an inserted word and also 100 | # add the reference word we are sampling 101 | my $ins_w = select_word(); 102 | push(@hyp_words, $ins_w); 103 | $num_ins++; 104 | 105 | push(@hyp_words, $rw); 106 | $i++; 107 | $last_was_del = 0; 108 | } 109 | } elsif($r < $del_thres){ 110 | $num_del++; 111 | $i++; 112 | $last_was_del = 1; 113 | } elsif($r < $sub_thres){ 114 | my $sub_w = select_word(); 115 | while($sub_w eq $rw) 116 | { 117 | $sub_w = select_word(); 118 | } 119 | 120 | $num_sub++; 121 | push(@hyp_words, $sub_w); 122 | $i++; 123 | $last_was_del = 0; 124 | } else { 125 | if(!$last_was_del){ 126 | # clean out the buffer of owed insertions 127 | while($owed_ins > 0){ 128 | my $ins_w = select_word(); 129 | push(@hyp_words, $ins_w); 130 | $num_ins++; 131 | $owed_ins--; 132 | } 133 | } 134 | 135 | $i++; 136 | # phew... a correct word... 
137 | push(@hyp_words, $rw); 138 | $last_was_del = 0; 139 | } 140 | } 141 | 142 | if(defined($out_ref)){ 143 | dump_words($out_ref, \@ref_words); 144 | } 145 | 146 | if(defined($out_hyp)){ 147 | dump_words($out_hyp, \@hyp_words); 148 | } 149 | 150 | print "$num_ins INS\n"; 151 | print "$num_del DEL\n"; 152 | print "$num_sub SUB\n"; 153 | printf "expected WER %.3f\n", ($num_ins + $num_del + $num_sub) / $ref_length; 154 | 155 | sub dump_words{ 156 | my ($ofn, $aref) = @_; 157 | 158 | print "writing to [$ofn]\n"; 159 | open(OUT, ">$ofn") || die "couldn't open [$ofn] for writing!"; 160 | foreach (@$aref){ 161 | print OUT $_," "; 162 | } 163 | print OUT "\n"; 164 | close(OUT); 165 | 166 | } 167 | 168 | 169 | 170 | 171 | sub select_word { 172 | my $r = int(rand($word_mass)); 173 | my $w; 174 | 175 | my $cur_sum = 0; 176 | while ( my ( $key, $value ) = each %words ) { 177 | $w = $key; 178 | $cur_sum += $value; 179 | if($r <= $cur_sum){ 180 | last; 181 | } 182 | } 183 | 184 | return $w; 185 | } -------------------------------------------------------------------------------- /tools/images/120_short_files.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/revdotcom/fstalign/82dec1dabec06a53b7f3c28b51dcd1b0a2dd2f1d/tools/images/120_short_files.png -------------------------------------------------------------------------------- /tools/images/120_vs_130_ram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/revdotcom/fstalign/82dec1dabec06a53b7f3c28b51dcd1b0a2dd2f1d/tools/images/120_vs_130_ram.png -------------------------------------------------------------------------------- /tools/images/120_vs_130_runtime.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/revdotcom/fstalign/82dec1dabec06a53b7f3c28b51dcd1b0a2dd2f1d/tools/images/120_vs_130_runtime.png 
-------------------------------------------------------------------------------- /tools/images/130_short_files.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/revdotcom/fstalign/82dec1dabec06a53b7f3c28b51dcd1b0a2dd2f1d/tools/images/130_short_files.png --------------------------------------------------------------------------------