├── .clang-format ├── .github └── workflows │ ├── CI.yml │ ├── copyright-update.yml │ ├── deploy.yml │ └── links_fail_fast.yml ├── .gitignore ├── .gitmodules ├── CMakeLists.txt ├── CONTRIBUTING.md ├── Dockerfile ├── LICENSE ├── README.md ├── docs ├── NLP-Format.md ├── Synonyms-Format.md ├── Usage.md └── json_log_schema.json ├── sample_data ├── aaddf.txt ├── abcdef.txt └── synonyms.rules.txt ├── src ├── AdaptedComposition.cpp ├── AdaptedComposition.h ├── AlignmentTraversor.cpp ├── AlignmentTraversor.h ├── Ctm.cpp ├── Ctm.h ├── FstFileLoader.cpp ├── FstFileLoader.h ├── FstLoader.cpp ├── FstLoader.h ├── IComposition.h ├── Nlp.cpp ├── Nlp.h ├── OneBestFstLoader.cpp ├── OneBestFstLoader.h ├── PathHeap.cpp ├── PathHeap.h ├── StandardComposition.cpp ├── StandardComposition.h ├── SynonymEngine.cpp ├── SynonymEngine.h ├── Walker.cpp ├── Walker.h ├── fast-d.cpp ├── fast-d.h ├── fstalign.cpp ├── fstalign.h ├── json_logging.h ├── logging.cpp ├── logging.h ├── main.cpp ├── utilities.cpp ├── utilities.h ├── version.h ├── wer.cpp └── wer.h ├── test ├── CMakeLists.txt ├── compose-tests-utils.h ├── compose-tests.cc ├── data │ ├── align_1.aligned.punc_case.nlp │ ├── align_1.hyp.ctm │ ├── align_1.hyp.punc_case.ctm │ ├── align_1.norm.json │ ├── align_1.ref.aligned.nlp │ ├── align_1.ref.nlp │ ├── align_2.hyp.ctm │ ├── align_2.norm.json │ ├── align_2.ref.aligned.nlp │ ├── align_2.ref.aligned.std.nlp │ ├── align_2.ref.nlp │ ├── align_3.hyp.ctm │ ├── align_3.norm.json │ ├── align_3.ref.aligned.nlp │ ├── align_3.ref.nlp │ ├── align_4.hyp1.ctm │ ├── align_4.hyp2.ctm │ ├── align_4.norm.json │ ├── align_4.ref.aligned1.nlp │ ├── align_4.ref.aligned2.nlp │ ├── align_4.ref.nlp │ ├── align_5.hyp1.ctm │ ├── align_5.hyp2.ctm │ ├── align_5.ref.aligned1-2.nlp │ ├── align_5.ref.aligned1.nlp │ ├── align_5.ref.aligned2-a2.nlp │ ├── align_5.ref.aligned2.nlp │ ├── align_5.ref.nlp │ ├── align_6.hyp.ctm │ ├── align_6.ref.aligned.nlp │ ├── align_6.ref.nlp │ ├── empty.hyp.ctm │ ├── empty.hyp.nlp │ ├── 
empty.hyp.txt │ ├── empty.ref.txt │ ├── fstalign-50.hyp.txt │ ├── fstalign-50.new.sbs.txt │ ├── fstalign-50.ref.txt │ ├── noise.hyp1.ctm │ ├── noise.hyp2.ctm │ ├── noise_1.hyp1.aligned │ ├── noise_1.hyp2.aligned │ ├── noise_1.ref.nlp │ ├── oracle_1.hyp.fst │ ├── oracle_1.ref.txt │ ├── oracle_1.symbols.txt │ ├── short.aligned.case.nlp │ ├── short.aligned.nlp │ ├── short.aligned.punc.nlp │ ├── short.aligned.punc_case.nlp │ ├── short.aligned.strict.nlp │ ├── short.hyp.nlp │ ├── short.hyp.txt │ ├── short.ref.nlp │ ├── short.sbs.txt │ ├── short_punc.hyp.nlp │ ├── short_punc.ref.nlp │ ├── short_punc.wer_tag.json │ ├── speaker_1.hyp.txt │ ├── speaker_1.ref.nlp │ ├── speaker_2.hyp.txt │ ├── speaker_2.ref.nlp │ ├── syn_1.hyp.adapted.sbs │ ├── syn_1.hyp.sbs │ ├── syn_1.hyp.txt │ ├── syn_1.ref.txt │ ├── syn_10.hyp.txt │ ├── syn_10.ref.txt │ ├── syn_2.hyp.txt │ ├── syn_2.ref.txt │ ├── syn_3.hyp.txt │ ├── syn_3.ref.txt │ ├── syn_4.hyp.txt │ ├── syn_4.ref.txt │ ├── syn_5.hyp.txt │ ├── syn_5.ref.txt │ ├── syn_6.hyp.txt │ ├── syn_6.ref.txt │ ├── syn_7.hyp.txt │ ├── syn_7.hyp2.txt │ ├── syn_7.hyp3.txt │ ├── syn_7.hyp4.txt │ ├── syn_7.norm.json │ ├── syn_7.ref.nlp │ ├── syn_7.synonym.rules.txt │ ├── syn_7_ref4.nlp │ ├── syn_8.hyp.ctm │ ├── syn_8.ref.nlp │ ├── syn_9.hyp.txt │ ├── syn_9.ref.txt │ ├── syn_9.synonym.rules.txt │ ├── syn_compound_1.hyp.txt │ ├── syn_compound_1.ref.txt │ ├── syn_compound_2.hyp.txt │ ├── syn_compound_2.ref.txt │ ├── test1.hyp.txt │ ├── test1.ref.txt │ ├── twenty.aligned.punc_case.nlp │ ├── twenty.hyp-a2.sbs │ ├── twenty.hyp.punc_case.txt │ ├── twenty.hyp.sbs │ ├── twenty.hyp.txt │ ├── twenty.norm.json │ ├── twenty.ref.nlp │ ├── twenty.ref.testing.nlp │ ├── twenty.ref.testing.norm.json │ ├── wer_utf.hyp.txt │ └── wer_utf.ref.txt ├── fast-d-tests.cc ├── fstalign_Test.cc └── test-utilties.h └── tools ├── README.md ├── gather_runtime_metrics.sh ├── generate_wer_test_data.pl ├── images ├── 120_short_files.png ├── 120_vs_130_ram.png ├── 120_vs_130_runtime.png └── 
130_short_files.png └── sbs2fst.py /.clang-format: -------------------------------------------------------------------------------- 1 | --- 2 | Language: Cpp 3 | # BasedOnStyle: Google 4 | AccessModifierOffset: -1 5 | AlignAfterOpenBracket: Align 6 | AlignConsecutiveAssignments: false 7 | AlignConsecutiveDeclarations: false 8 | AlignEscapedNewlines: Left 9 | AlignOperands: true 10 | AlignTrailingComments: true 11 | AllowAllParametersOfDeclarationOnNextLine: true 12 | AllowShortBlocksOnASingleLine: false 13 | AllowShortCaseLabelsOnASingleLine: false 14 | AllowShortFunctionsOnASingleLine: All 15 | AllowShortIfStatementsOnASingleLine: true 16 | AllowShortLoopsOnASingleLine: true 17 | AlwaysBreakAfterDefinitionReturnType: None 18 | AlwaysBreakAfterReturnType: None 19 | AlwaysBreakBeforeMultilineStrings: true 20 | AlwaysBreakTemplateDeclarations: true 21 | BinPackArguments: true 22 | BinPackParameters: true 23 | BraceWrapping: 24 | AfterClass: false 25 | AfterControlStatement: false 26 | AfterEnum: false 27 | AfterFunction: false 28 | AfterNamespace: false 29 | AfterObjCDeclaration: false 30 | AfterStruct: false 31 | AfterUnion: false 32 | AfterExternBlock: false 33 | BeforeCatch: false 34 | BeforeElse: false 35 | IndentBraces: false 36 | SplitEmptyFunction: true 37 | SplitEmptyRecord: true 38 | SplitEmptyNamespace: true 39 | BreakBeforeBinaryOperators: None 40 | BreakBeforeBraces: Attach 41 | BreakBeforeInheritanceComma: false 42 | BreakBeforeTernaryOperators: true 43 | BreakConstructorInitializersBeforeComma: false 44 | BreakConstructorInitializers: BeforeColon 45 | BreakAfterJavaFieldAnnotations: false 46 | BreakStringLiterals: true 47 | ColumnLimit: 120 48 | CommentPragmas: '^ IWYU pragma:' 49 | CompactNamespaces: false 50 | ConstructorInitializerAllOnOneLineOrOnePerLine: true 51 | ConstructorInitializerIndentWidth: 4 52 | ContinuationIndentWidth: 4 53 | Cpp11BracedListStyle: true 54 | DerivePointerAlignment: true 55 | DisableFormat: false 56 | 
ExperimentalAutoDetectBinPacking: false 57 | FixNamespaceComments: true 58 | ForEachMacros: 59 | - foreach 60 | - Q_FOREACH 61 | - BOOST_FOREACH 62 | IncludeBlocks: Preserve 63 | IncludeCategories: 64 | - Regex: '^' 65 | Priority: 2 66 | - Regex: '^<.*\.h>' 67 | Priority: 1 68 | - Regex: '^<.*' 69 | Priority: 2 70 | - Regex: '.*' 71 | Priority: 3 72 | IncludeIsMainRegex: '([-_](test|unittest))?$' 73 | IndentCaseLabels: true 74 | IndentPPDirectives: None 75 | IndentWidth: 2 76 | IndentWrappedFunctionNames: false 77 | JavaScriptQuotes: Leave 78 | JavaScriptWrapImports: true 79 | KeepEmptyLinesAtTheStartOfBlocks: false 80 | MacroBlockBegin: '' 81 | MacroBlockEnd: '' 82 | MaxEmptyLinesToKeep: 1 83 | NamespaceIndentation: None 84 | ObjCBlockIndentWidth: 2 85 | ObjCSpaceAfterProperty: false 86 | ObjCSpaceBeforeProtocolList: false 87 | PenaltyBreakAssignment: 2 88 | PenaltyBreakBeforeFirstCallParameter: 1 89 | PenaltyBreakComment: 300 90 | PenaltyBreakFirstLessLess: 120 91 | PenaltyBreakString: 1000 92 | PenaltyExcessCharacter: 1000000 93 | PenaltyReturnTypeOnItsOwnLine: 200 94 | PointerAlignment: Left 95 | ReflowComments: true 96 | SortIncludes: true 97 | SortUsingDeclarations: true 98 | SpaceAfterCStyleCast: false 99 | SpaceAfterTemplateKeyword: true 100 | SpaceBeforeAssignmentOperators: true 101 | SpaceBeforeParens: ControlStatements 102 | SpaceInEmptyParentheses: false 103 | SpacesBeforeTrailingComments: 2 104 | SpacesInAngles: false 105 | SpacesInContainerLiterals: true 106 | SpacesInCStyleCastParentheses: false 107 | SpacesInParentheses: false 108 | SpacesInSquareBrackets: false 109 | Standard: Auto 110 | TabWidth: 8 111 | UseTab: Never 112 | ... 
113 | 114 | -------------------------------------------------------------------------------- /.github/workflows/CI.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | # Trigger the workflow on a push event or manually from the Actions tab 4 | on: [push, workflow_dispatch] 5 | 6 | jobs: 7 | # This workflow has one job 8 | # First we build the docker container, then we run CI tests 9 | build: 10 | runs-on: ubuntu-latest 11 | env: 12 | DOCKER_IMAGE: fstalign 13 | 14 | steps: 15 | - name: Checkout repository and submodules 16 | uses: actions/checkout@v2 17 | with: 18 | submodules: recursive 19 | 20 | - name: Build the docker container 21 | run: docker build . -f Dockerfile -t ${DOCKER_IMAGE} 22 | 23 | - name: Run CI tests 24 | run: docker run --rm -t ${DOCKER_IMAGE} bash -c '(cd build && make test)' 25 | -------------------------------------------------------------------------------- /.github/workflows/copyright-update.yml: -------------------------------------------------------------------------------- 1 | name: Update copyright year(s) in license file 2 | 3 | on: 4 | workflow_dispatch: 5 | schedule: 6 | - cron: "0 3 1 1 *" # 03:00 AM on January 1 7 | 8 | jobs: 9 | update-license-year: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - uses: actions/checkout@v3 13 | with: 14 | fetch-depth: 0 15 | - uses: FantasticFiasco/action-update-license-year@771ff9afdc55b09e1fb649cf03e312d0cf86b4a6 16 | with: 17 | token: ${{ secrets.GITHUB_TOKEN }} 18 | transform: (?\d{4})+-?(\d{4})? 
19 | -------------------------------------------------------------------------------- /.github/workflows/deploy.yml: -------------------------------------------------------------------------------- 1 | name: Publish Docker image 2 | on: 3 | release: 4 | types: [published] 5 | jobs: 6 | push_to_registry: 7 | name: Push Docker image to GitHub Packages 8 | runs-on: ubuntu-latest 9 | steps: 10 | - 11 | name: Check out the repo 12 | uses: actions/checkout@v2 13 | with: 14 | submodules: recursive 15 | 16 | - 17 | name: Prepare tags 18 | id: prep 19 | run: | 20 | DOCKER_IMAGE=revdotcom/fstalign 21 | VERSION=develop 22 | if [[ $GITHUB_REF == refs/tags/* ]]; then 23 | VERSION=${GITHUB_REF#refs/tags/} 24 | fi 25 | TAGS="${DOCKER_IMAGE}:${VERSION}" 26 | echo ::set-output name=version::${VERSION} 27 | echo ::set-output name=tags::${TAGS} 28 | echo ::set-output name=created::$(date -u +'%Y-%m-%dT%H:%M:%SZ') 29 | 30 | - 31 | name: Set up Docker Buildx 32 | uses: docker/setup-buildx-action@v1 33 | 34 | - 35 | name: Login to DockerHub 36 | uses: docker/login-action@v1 37 | with: 38 | username: ${{ secrets.DOCKERHUB_USERNAME }} 39 | password: ${{ secrets.DOCKERHUB_TOKEN }} 40 | 41 | - 42 | name: Build and push 43 | uses: docker/build-push-action@v2 44 | with: 45 | context: . 
46 | platforms: linux/amd64 47 | push: true 48 | tags: | 49 | revdotcom/fstalign:latest 50 | ${{ steps.prep.outputs.tags }} 51 | -------------------------------------------------------------------------------- /.github/workflows/links_fail_fast.yml: -------------------------------------------------------------------------------- 1 | name: Broken Link Checker 2 | 3 | on: 4 | push: 5 | pull_request: 6 | 7 | jobs: 8 | linkChecker: 9 | runs-on: ubuntu-latest 10 | steps: 11 | - uses: actions/checkout@v3 12 | 13 | - name: Link Checker 14 | uses: lycheeverse/lychee-action@e1ef974431881438bf594f458e332b099fd33bb5 #v1.4.1 https://github.com/lycheeverse/lychee-action#security-tip 15 | with: 16 | args: --verbose --no-progress './**/*.md' './**/*.json' './**/*.cpp' './**/*.h' './**/*.cc' 17 | fail: true 18 | env: 19 | GITHUB_TOKEN: ${{secrets.GITHUB_TOKEN}} 20 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # OS generated files # 2 | ###################### 3 | .DS_Store 4 | .DS_Store? 
5 | ._* 6 | .Spotlight-V100 7 | .Trashes 8 | ehthumbs.db 9 | Thumbs.db 10 | *.o 11 | *.pyc 12 | *.swp 13 | 14 | build 15 | 16 | *.dSYM 17 | *.vscode* 18 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "third-party/CLI11"] 2 | path = third-party/CLI11 3 | url = https://github.com/CLIUtils/CLI11.git 4 | [submodule "third-party/strtk"] 5 | path = third-party/strtk 6 | url = https://github.com/ArashPartow/strtk.git 7 | [submodule "third-party/jsoncpp"] 8 | path = third-party/jsoncpp 9 | url = https://github.com/open-source-parsers/jsoncpp.git 10 | [submodule "third-party/csv"] 11 | path = third-party/csv 12 | url = https://github.com/ben-strasser/fast-cpp-csv-parser.git 13 | [submodule "third-party/catch2"] 14 | path = third-party/catch2 15 | url = https://github.com/catchorg/Catch2.git 16 | [submodule "third-party/spdlog"] 17 | path = third-party/spdlog 18 | url = https://github.com/gabime/spdlog.git 19 | [submodule "third-party/inih"] 20 | path = third-party/inih 21 | url = https://github.com/benhoyt/inih.git 22 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.7) 2 | 3 | project(fstalign LANGUAGES CXX C) 4 | 5 | include(GNUInstallDirs) 6 | 7 | if(NOT CMAKE_BUILD_TYPE) 8 | set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE) 9 | endif() 10 | message(STATUS "Build type: ${CMAKE_BUILD_TYPE}") 11 | 12 | enable_testing() 13 | 14 | set(CMAKE_CXX_STANDARD 14) 15 | set(CMAKE_CXX_STANDARD_REQUIRED ON) 16 | 17 | if(DEFINED ENV{OPENFST_ROOT}) 18 | set(OPENFST_ROOT $ENV{OPENFST_ROOT} CACHE STRING "Path to OpenFST") 19 | endif() 20 | message(STATUS "OpenFST root: ${OPENFST_ROOT}") 21 | 22 | set(FSTALIGN_INCLUDES 23 | 
${PROJECT_SOURCE_DIR}/third-party/spdlog/include 24 | ${PROJECT_SOURCE_DIR}/third-party/CLI11/include 25 | ${PROJECT_SOURCE_DIR}/src 26 | ${PROJECT_SOURCE_DIR}/third-party 27 | ${PROJECT_SOURCE_DIR}/third-party/inih 28 | ${PROJECT_SOURCE_DIR}/third-party/inih/cpp 29 | ) 30 | 31 | find_package(Threads REQUIRED) 32 | 33 | set(FSTALIGN_LIBRARIES 34 | jsoncpp_lib_static 35 | ${PROJECT_SOURCE_DIR}/third-party/inih/ini.c 36 | ) 37 | 38 | set(OPENFST_INCLUDES 39 | ${OPENFST_ROOT}/include 40 | ) 41 | 42 | if(DYNAMIC_OPENFST) 43 | set(OPENFST_LIBRARIES 44 | ${OPENFST_ROOT}/lib/libfst.so 45 | ) 46 | else() 47 | set(OPENFST_LIBRARIES 48 | ${OPENFST_ROOT}/lib/libfst.a -ldl 49 | ) 50 | endif() 51 | 52 | add_library(fstaligner-common 53 | src/fstalign.cpp 54 | src/wer.cpp 55 | src/fast-d.cpp 56 | src/AdaptedComposition.cpp 57 | src/StandardComposition.cpp 58 | src/AlignmentTraversor.cpp 59 | src/Ctm.cpp 60 | src/FstLoader.cpp 61 | src/FstFileLoader.cpp 62 | src/logging.cpp 63 | src/Nlp.cpp 64 | src/OneBestFstLoader.cpp 65 | src/PathHeap.cpp 66 | src/SynonymEngine.cpp 67 | src/utilities.cpp 68 | src/Walker.cpp 69 | third-party/inih/cpp/INIReader.cpp 70 | ) 71 | 72 | list(APPEND CMAKE_PREFIX_PATH "/usr/local/opt/icu4c") # for Mac users 73 | find_package(ICU REQUIRED COMPONENTS uc) 74 | 75 | target_link_libraries(fstaligner-common 76 | Threads::Threads 77 | ${FSTALIGN_LIBRARIES} 78 | ${FST_KALDI_LIBRARIES} 79 | ${ICU_LIBRARIES} 80 | ) 81 | 82 | add_subdirectory(third-party/jsoncpp) 83 | add_subdirectory(third-party/catch2) 84 | 85 | add_executable(fstalign src/main.cpp) 86 | 87 | include_directories(fstalign 88 | ${FSTALIGN_INCLUDES} 89 | ${OPENFST_INCLUDES} 90 | ${ICU_INCLUDE_DIRS} 91 | ) 92 | 93 | target_link_libraries(fstalign 94 | fstaligner-common 95 | ${CMAKE_DL_LIBS} 96 | ${FSTALIGN_LIBRARIES} 97 | ${OPENFST_LIBRARIES} 98 | ) 99 | 100 | add_subdirectory(test) 101 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: 
-------------------------------------------------------------------------------- 1 | # Contributing Guidelines 2 | Thank you for your interest in contributing to our project. Whether it's a bug report, new feature, correction, or additional 3 | documentation, we greatly value feedback and contributions from our community. 4 | 5 | 6 | ## Reporting Bugs/Feature Requests 7 | Please use the GitHub issue tracker to report bugs or suggest features. 8 | 9 | When filing an issue, please check [existing open](https://github.com/revdotcom/fstalign/issues), or recently closed, issues to make sure somebody else hasn't already reported the issue. Please try to include as much information as you can. Details like these are incredibly useful: 10 | 11 | * A reproducible test case or series of steps 12 | * The environment/deployment of our code being used 13 | * The version of our code being used 14 | * Any modifications you've made relevant to the bug 15 | 16 | 17 | ## Contributing via Pull Requests 18 | This project follows the [Gitflow workflow](https://www.atlassian.com/git/tutorials/comparing-workflows/gitflow-workflow) for contributions. Before sending us a pull request, please ensure that: 19 | 20 | 1. You are working against the latest source on the *develop* branch. 21 | 2. You check existing open, and recently merged, pull requests to make sure someone else hasn't addressed the problem already. 22 | 3. You open an issue to discuss any significant work - we would hate for your time to be wasted. 23 | 24 | To send us a pull request, please: 25 | 26 | 1. Fork the repository. 27 | 2. Modify the source. 28 | 3. Ensure local tests pass. 29 | 4. Commit to your fork using clear commit messages. 30 | 5. Send us a pull request. 31 | 6. Pay attention to any automated CI failures reported in the pull request, and stay involved in the conversation. 
32 | 33 | GitHub provides additional document on [forking a repository](https://help.github.com/articles/fork-a-repo/) and 34 | [creating a pull request](https://help.github.com/articles/creating-a-pull-request/). 35 | 36 | 37 | ## Code of Conduct 38 | This project has adopted the [Contributor Covenant Code of Conduct](https://www.contributor-covenant.org/version/1/4/code-of-conduct/). 39 | 40 | 41 | ## Licensing 42 | See the [LICENSE](https://github.com/revdotcom/fstalign/blob/develop/LICENSE) file for our project's licensing. We will ask you to confirm the licensing of your contribution. 43 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # Using kaldi image for pre-built OpenFST, version is 1.7.2 2 | FROM kaldiasr/kaldi:cpu-debian10-2024-07-29 as kaldi-base 3 | 4 | FROM debian:11 5 | 6 | COPY --from=kaldi-base /opt/kaldi/tools/openfst /opt/openfst 7 | ENV OPENFST_ROOT /opt/openfst 8 | 9 | ARG JOBS=4 10 | 11 | RUN apt-get update && \ 12 | apt-get upgrade -y && \ 13 | apt-get -y install \ 14 | cmake \ 15 | g++ \ 16 | libicu-dev 17 | 18 | RUN mkdir /fstalign 19 | COPY CMakeLists.txt /fstalign/CMakeLists.txt 20 | COPY src /fstalign/src 21 | COPY test /fstalign/test 22 | COPY third-party /fstalign/third-party 23 | COPY sample_data /fstalign/sample_data 24 | 25 | WORKDIR /fstalign 26 | 27 | RUN mkdir -p /fstalign/build && \ 28 | cd /fstalign/build && \ 29 | rm -rf * && \ 30 | cmake .. 
-DOPENFST_ROOT="${OPENFST_ROOT}" -DDYNAMIC_OPENFST=ON && \ 31 | make -j${JOBS} VERBOSE=1 && \ 32 | mkdir -p /fstalign/bin && \ 33 | cp /fstalign/build/fstalign /fstalign/bin && \ 34 | strip /fstalign/bin/* 35 | 36 | COPY tools /fstalign/tools 37 | 38 | ENV PATH \ 39 | /fstalign/bin/:\ 40 | $PATH 41 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ![CI](https://github.com/revdotcom/fstalign/workflows/CI/badge.svg) 2 | [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0) 3 | 4 | # fstalign 5 | - [Overview](#Overview) 6 | - [What's new in 2.0](#What's-new-in-2.0) 7 | - [Installation](#Installation) 8 | * [Dependencies](#Dependencies) 9 | * [Build](#Build) 10 | * [Docker](#Docker) 11 | - [Documentation](#Documentation) 12 | 13 | ## Overview 14 | `fstalign` is a tool for creating alignment between two sequences of tokens (here out referred to as “reference” and “hypothesis”). It has two key functions: computing word error rate (WER) and aligning [NLP-formatted](https://github.com/revdotcom/fstalign/blob/develop/docs/NLP-Format.md) references with CTM hypotheses. 15 | 16 | Due to its use of OpenFST and lazy algorithms for text-based alignment, `fstalign` is efficient for calculating WER while also providing significant flexibility for different measurement features and error analysis. 17 | 18 | ## What's new in 2.0 19 | 20 | Version 2.0 introduces two major changes: 21 | 1. A new method to traverse the composition graph, which dramatically improves the overall speed, especially when the sequences are long contain many errors. 22 | We have files that took 25 minutes to align before that can now take about 7 seconds. This is especially noticeable with the adapted composition (the default). 23 | 1. Some smarts were introduced when --use-case and --use-punctuation are enabled. 
24 | Now, by default, punctuation symbols can only be substituted by other punctuation symbols (or deleted/inserted). 25 | Also, words that differ only by the first letter case will be preffered for substitution. 26 | 27 | 28 | Here's an example of the 1.x behavior and the 2.0 version 29 | ``` 30 | ==> v1.x sbs.txt <== 31 | ref_token hyp_token IsErr Class Wer_Tag_Entities 32 | Welcome Welcome ###322_###| 33 | back back 34 | to to 35 | another another 36 | episode episode ###323_###| 37 | of of 38 | Podcasts Podcast ERR ###324_###| 39 | in and ERR 40 | Color Color ###167_###|###325_###| 41 | : of ERR 42 | The the ERR 43 | Podcast Podcast ###168_###|###326_###| 44 | . . 45 | I I 46 | 47 | ==> v2.0 sbs.txt <== 48 | ref_token hyp_token IsErr Class Wer_Tag_Entities 49 | Welcome Welcome ###322_###| 50 | back back 51 | to to 52 | another another 53 | episode episode ###323_###| 54 | of of 55 | Podcasts Podcast ERR ###324_###| 56 | in and ERR 57 | Color Color ###167_###|###325_###| 58 | of ERR 59 | : ERR 60 | The the ERR 61 | Podcast Podcast ###168_###|###326_###| 62 | ``` 63 | The confusion between `:` and `of` is not longer allowed. 64 | 65 | Also, here's how favoring or not the substitution based on case-insensitive comparison, while still counting it as an error, looks like: 66 | ``` 67 | ==> v1.x sbs.txt <== 68 | ref_token hyp_token IsErr Class Wer_Tag_Entities 69 | shorten shorten ###801_###| 70 | It's it's ERR 71 | Berry Barry ERR ###785_###|###788_###|###802_###| 72 | . . 73 | Just Just 74 | Yeah like ERR ###805_###| 75 | . ERR 76 | Like ERR 77 | , ERR 78 | I I ###809_###| 79 | have have 80 | a a 81 | nickname nickname 82 | 83 | ==> v2.0 sbs.txt <== 84 | ref_token hyp_token IsErr Class Wer_Tag_Entities 85 | It's it's ERR 86 | Berry Barry ERR ###785_###|###788_###|###802_###| 87 | . . 88 | Just Just 89 | Yeah ERR ###805_###| 90 | . 
ERR 91 | Like like ERR 92 | , ERR 93 | I I ###809_###| 94 | have have 95 | a a 96 | nickname nickname 97 | ``` 98 | Here, `Like <-> like` substitution is favored. While this generally won't change the WER value itself (although it can), it will improve the timing alignments. 99 | 100 | 101 | These behavior, as well as the beam size (that has a default value of 50.0) can be controlled with the following new parameters: 102 | ``` 103 | --disable-strict-punctuation 104 | Disable strict punctuation alignment (which prevents punctuation aligning with words). 105 | --disable-favored-subs Disable favored substitutions (which makes alignment favor substitutions between words which differ only by case). 106 | --favored-sub-cost FLOAT Cost for favored substitutions (e.g., case diff). Default: 0.1 107 | ``` 108 | 109 | ## Installation 110 | 111 | ### Dependencies 112 | We use git submodules to manage third-party dependencies. Initialize and update submodules before proceeding to the main build steps. 113 | ``` 114 | git submodule update --init --recursive 115 | ``` 116 | 117 | This will pull the current dependencies: 118 | - catch2 - for unit testing 119 | - spdlog - for logging 120 | - CLI11 - for CLI construction 121 | - csv - for CTM and NLP input parsing 122 | - jsoncpp - for JSON output construction 123 | - strtk - for various string utilities 124 | 125 | Additionally, we have dependencies outside of the third-party submodules: 126 | - OpenFST - currently provided to the build system by settings the $OPENFST_ROOT environment variable or during the CMake command via `-DOPENFST_ROOT`. 127 | 128 | ### Build 129 | The current build framework is CMake. Install CMake following the instructions here (https://cmake.org/install/). 130 | 131 | To build fstalign, run: 132 | ``` 133 | mkdir build && cd build 134 | cmake .. 
-DOPENFST_ROOT="" -DDYNAMIC_OPENFST=ON 135 | make 136 | ``` 137 | 138 | Note: `-DDYNAMIC_OPENFST=ON` is needed if OpenFST at `OPENFST_ROOT` is compiled as shared libraries. Otherwise static libraries are assumed. 139 | 140 | Finally, tests can be run using: 141 | ``` 142 | make test 143 | ``` 144 | 145 | ### Docker 146 | 147 | The fstalign docker image is hosted on Docker Hub and can be easily pulled and run: 148 | ``` 149 | docker pull revdotcom/fstalign 150 | docker run --rm -it revdotcom/fstalign 151 | ``` 152 | 153 | See https://hub.docker.com/r/revdotcom/fstalign/tags for the available versions/tags to pull. If you desire to run the tool on local files you can mount local directories with the `-v` flag of the `docker run` command. 154 | 155 | From inside the container: 156 | ``` 157 | /fstalign/build/fstalign --help 158 | ``` 159 | 160 | For development you can also build the docker image locally using: 161 | ``` 162 | docker build . -t fstalign-dev 163 | ``` 164 | 165 | ## Documentation 166 | For more information on how to use `fstalign` see our [documentation](https://github.com/revdotcom/fstalign/blob/develop/docs/Usage.md) for more details. 167 | -------------------------------------------------------------------------------- /docs/NLP-Format.md: -------------------------------------------------------------------------------- 1 | # NLP Format 2 | NLP files are `.csv` inspired, pipe-separated text files that contain token and metadata information of a transcript. Each line of a file represents a single transcript token and the metadata associated with it. 3 | 4 | | Column | Description | 5 | | ----------- | ----------- | 6 | | Column 1: token | A single token in the transcript. These are typically single words or multiple words with hyphens in between. | 7 | | Column 2: speaker | A unique ID that associates this token to a specific speaker in an audio. | 8 | | Column 3: ts | A float representing the time in seconds that starts of the token’s utterance. 
| 9 | | Column 4: endTs | A float representing the time in seconds that ends of the token’s utterance. | 10 | | Column 5: punctuation | A punctuation character that is included at the end of a token that is used when reconstructing the transcript. Example punctuation: `",", ";", ".", "!"`. These will be ignored from WER token matching. | 11 | | Column 6: case | A two letter code to denominate the which of four possible casings for this token:
UC - Denotes a token that has the first character in uppercase and every other character lowercase
LC - Denotes a token that has every character in lowercase
CA - Denotes a token that has every character in uppercase
MC - Denotes a token that doesn’t follow the previous rules. This is the case when upper- and lowercase characters are mixed throughout the token | 12 | | Column 7: tags | Displays one of the several entity tags that are listed in wer_tags in long form - such that the displayed entity here is in the form `ID:ENTITY_CLASS`. If normalization is used, only entities in this column can be normalized. | 13 | | Column 8: wer_tags | A list of entity tags that are associated with this token. In this field, only entity IDs should be present. The specific ENTITY_CLASS for each ID can be extracted from an accompanying wer_tags sidecar json. | 14 | 15 | Example: 16 | ``` 17 | token|speaker|ts|endTs|punctuation|case|tags|wer_tags 18 | Good|0||||UC|[]|[] 19 | morning|0||||LC|['5:TIME']|['5'] 20 | and|0||||LC|[]|[] 21 | welcome|0||||LC|[]|[] 22 | to|0||||LC|[]|[] 23 | the|0||||LC|['6:DATE']|['6'] 24 | first|0||||LC|['6:DATE']|['6'] 25 | quarter|0||||LC|['6:DATE']|['6'] 26 | 2020|0||||CA|['0:YEAR']|['0', '1', '6'] 27 | NexGEn|0||||MC|['7:ORG']|['7'] 28 | ``` 29 | 30 | ## WER tag sidecar 31 | 32 | WER tag sidecar files contain accompanying info for tokens in an NLP file. The 33 | keys are IDs corresponding to tokens in the NLP file `wer_tags` column. The 34 | objects under the keys are information about the token. 35 | 36 | Example: 37 | ``` 38 | { 39 | '0': {'entity_type': 'YEAR'}, 40 | '1': {'entity_type': 'CARDINAL'}, 41 | '6': {'entity_type': 'SPACY>TIME'}, 42 | } 43 | ``` 44 | -------------------------------------------------------------------------------- /docs/Synonyms-Format.md: -------------------------------------------------------------------------------- 1 | # Synonyms File Format 2 | Synonyms allow for reference words to be equivalent to similar forms (determined by the user) for error counting. They are accepted for any input formats and passed into the tool via the `--syn ` flag. 
3 | 4 | The file structure is a simple text file where each line is a synonym and each synonym is separated by a pipe where the left hand side is the reference version of the term and the right hand side is the accepted hypothesis alternative. 5 | 6 | ``` 7 | format : LHSRHS 8 | where: 9 | LHS : space-delimited words to match in the original reference text 10 | RHS : semi-colon-delimited list of space-delimited words to consider as equivalent expressions to the LHS 11 | ``` 12 | 13 | Note that there is no built in symmetry, so synonyms must be doubly specified for symmetrical equivalence (example below illustrates this). Empty lines or lines starting with '#' are ignored. 14 | 15 | Example: 16 | ``` 17 | i am | i'm 18 | i'm | i am 19 | okay | ok 20 | ok | okay 21 | ``` 22 | 23 | A full example of a synonyms file is available in the repository under `sample_data/synonyms.rules.txt`. 24 | -------------------------------------------------------------------------------- /docs/json_log_schema.json: -------------------------------------------------------------------------------- 1 | { 2 | "definitions": { 3 | "transcript_wer": { 4 | "title": "Transcript WER", 5 | "type": "object", 6 | "properties": { 7 | "wer": { 8 | "title": "WER", 9 | "type": "object", 10 | "properties": { 11 | "bestWER": "#/definitions/wer_result", 12 | "classWER": { 13 | : "#/definitions/wer_result" 14 | }, 15 | "speakerWER": { 16 | : "#/definitions/wer_result" 17 | }, 18 | "speakerSwitchWER": "#/definitions/wer_result", 19 | "unigrams": { 20 | : "#/definitions/pr_result" 21 | }, 22 | "bigrams": { 23 | : "#/definitions/pr_result" 24 | } 25 | } 26 | } 27 | } 28 | }, 29 | "wer_result": { 30 | "title": "WER Result", 31 | "type": "object", 32 | "properties": { 33 | "insertions": { 34 | "title": "Insertions", 35 | "type": "integer" 36 | }, 37 | "deletions": { 38 | "title": "Deletions", 39 | "type": "integer" 40 | }, 41 | "substitutions": { 42 | "title": "Substitutions", 43 | "type": "integer" 44 | }, 45 | 
"numErrors": { 46 | "title": "Number of errors", 47 | "type": "integer" 48 | }, 49 | "numWordsInReference": { 50 | "title": "Number of words in reference", 51 | "type": "integer" 52 | }, 53 | "wer": { 54 | "title": "WER", 55 | "type": "float" 56 | }, 57 | "meta": { 58 | "title": "Metadata", 59 | "type": "object" 60 | }, 61 | }, 62 | }, 63 | "pr_result": { 64 | "title": "Precision Recall Result", 65 | "type": "object", 66 | "properties": { 67 | "insertions": { 68 | "title": "Insertions", 69 | "type": "integer" 70 | }, 71 | "deletions": { 72 | "title": "Deletions", 73 | "type": "integer" 74 | }, 75 | "substitutions_fp": { 76 | "title": "Substitutions that were false positives.", 77 | "type": "integer" 78 | }, 79 | "substitutions_fn": { 80 | "title": "Substitutions that were false negatives.", 81 | "type": "integer" 82 | }, 83 | "correct": { 84 | "title": "Correct", 85 | "type": "integer" 86 | }, 87 | "precision": { 88 | "title": "Precision", 89 | "type": "float" 90 | }, 91 | "recall": { 92 | "title": "Recall", 93 | "type": "float" 94 | } 95 | } 96 | } 97 | } 98 | } 99 | -------------------------------------------------------------------------------- /sample_data/aaddf.txt: -------------------------------------------------------------------------------- 1 | a a d d f 2 | -------------------------------------------------------------------------------- /sample_data/abcdef.txt: -------------------------------------------------------------------------------- 1 | a b c d e f 2 | -------------------------------------------------------------------------------- /sample_data/synonyms.rules.txt: -------------------------------------------------------------------------------- 1 | # format : LHSRHS 2 | # where: 3 | # LHS : space-delimited words to match in the original reference text 4 | # RHS : semi-colon-delimited list of space-delimited words to consider as equivalent expressions to the LHS 5 | # 6 | # This is non-recursive and single-pass only. 
7 | # By default, there won't be an automatic symetry: 8 | # if you want a->b and b->a, you need to specify both 9 | # 10 | # Empty lines or lines starting with '#' are ignored 11 | 12 | # To Be contractions - present 13 | i am | i'm 14 | i'm | i am 15 | you are | you're 16 | you're | you are 17 | he is | he's 18 | he's | he is 19 | she is | she's 20 | she's | she is 21 | it is | it's 22 | it's | it is 23 | we're | we are 24 | we are | we're 25 | they are | they're 26 | 27 | # To Be contractions - future 28 | i will |i'll 29 | i'll |i will 30 | you will |you'll 31 | you'll |you will 32 | he will |he'll 33 | he'll |he will 34 | she will |she'll 35 | she'll |she will 36 | it will |it'll 37 | it'll |it will 38 | we will |we'll 39 | we'll |we will 40 | they will |they'll 41 | they'll |they will 42 | 43 | 44 | okay | ok ; 'kay 45 | ok | okay ; 'kay 46 | 'kay | okay ; ok 47 | 48 | til | until ; 'til 49 | 'til | until ; til 50 | until | 'til ; til 51 | 52 | awhile | a while 53 | a while | awhile 54 | 55 | lotta | lot of 56 | lot of | lotta 57 | sorta | sort of 58 | sort of | sorta 59 | 60 | dunno | don't know ; do not know 61 | don't know | dunno 62 | do not know | dunno 63 | 64 | lemme | let me 65 | let me | lemme 66 | 67 | let's | let us 68 | let us | let's 69 | 70 | # TODO: can't -> cannot ? 71 | # TODO: To Have ? 
72 | must've | must have 73 | must have | must've 74 | would've | would have 75 | would have | would've 76 | should've | should have 77 | should have | should've ; shoulda 78 | might've | might have 79 | might have | might've 80 | could've | could have 81 | could have | could've 82 | i'd | i had ; i would 83 | i had | i'd 84 | i would | i'd 85 | you'd | you had ; you would 86 | you had | you'd 87 | you would | you'd 88 | he'd | he had ; he would 89 | he had | he'd 90 | he would | he'd 91 | she'd | she had ; she would 92 | she had | she'd 93 | she would | she'd 94 | they'd | they had ; they would 95 | they had | they'd 96 | they would | they'd 97 | i've | i have 98 | i have | i've 99 | they've | they have 100 | they have | they've 101 | you've | you have 102 | you have | you've 103 | 104 | 105 | 'cause | cause ; because 106 | because | 'cause ; cause 107 | gonna | going to 108 | going to | gonna 109 | wanna | want to 110 | want to | wanna 111 | kinda | kind of 112 | kind of | kinda 113 | gotta | got to 114 | got to | gotta 115 | 'em | them 116 | them | 'em 117 | all right | alright 118 | alright | all right 119 | 120 | give me | gimme 121 | gimme | give me 122 | shoulda | should have 123 | out of | outta 124 | outta | out of 125 | what are you | whatcha 126 | whatcha | what are you 127 | 128 | | 129 | | 130 | | 131 | | 132 | | 133 | | 134 | | 135 | | 136 | | 137 | | 138 | | 139 | | 140 | | 141 | | 142 | | 143 | | 144 | | 145 | | 146 | | 147 | | 148 | | 149 | | 150 | | 151 | | 152 | -------------------------------------------------------------------------------- /src/AdaptedComposition.h: -------------------------------------------------------------------------------- 1 | /* 2 | AdaptedComposition.h 3 | JP Robichaud (jp@rev.com) 4 | 2021 5 | 6 | */ 7 | 8 | #ifndef __ADAPTEDCOMPOSITION_H__ 9 | #define __ADAPTEDCOMPOSITION_H__ 10 | 11 | #include 12 | #include 13 | #include 14 | #include "IComposition.h" 15 | #include "utilities.h" 16 | #include "fstalign.h" 17 | 18 | 
using namespace std; 19 | 20 | typedef fst::Fst::StateId StateId; 21 | 22 | typedef pair StatePair; 23 | struct key_hash : public std::unary_function { 24 | std::size_t operator()(const StatePair &k) const { return std::get<0>(k) ^ std::get<1>(k); } 25 | }; 26 | 27 | // A hash function used to hash a pair of any kind, useful for unordered_map 28 | struct hash_pair { 29 | template 30 | size_t operator()(const pair &p) const { 31 | auto hash1 = hash{}(p.first); 32 | auto hash2 = hash{}(p.second); 33 | return hash1 ^ hash2; 34 | } 35 | }; 36 | 37 | /* 38 | * Calculates edit distance between two FSTs through manual single-step composition. 39 | * Optimizes the search space of the composed graph by greedily expanding composition states. 40 | * It is notably faster than the StandardCompositionFst alternative. 41 | * (in beta) 42 | */ 43 | class AdaptedCompositionFst : public IComposition { 44 | protected: 45 | map composed_states; 46 | map reversed_composed_states; 47 | 48 | set> entity_exit_states; 49 | 50 | StateId current_composed_next_state_id = 0; 51 | 52 | const fst::SymbolTable *symbols_; 53 | std::vector synonyms_label_ids; 54 | std::vector entity_label_ids; 55 | 56 | int dbg_count = 0; 57 | 58 | // possible optimizations : limit to const FST or limit to StdVectorFst 59 | const fst::StdFst &fstA_; 60 | const fst::StdFst &fstB_; 61 | // Add members to store options 62 | bool strict_punctuation_ = false; 63 | std::unordered_set punctuation_ids_; 64 | // Favored substitutions 65 | bool use_favored_substitutions_ = false; 66 | float favored_substitution_cost_ = 0.1f; 67 | std::vector favorable_substitution_map_; 68 | 69 | StateId GetOrCreateComposedState(StateId a, StateId b); 70 | bool IsEntityLabel(int labelId); 71 | bool IsSynonymLabel(int labelId); 72 | bool IsEntityReacheable(int target_entity_label_id, StateId refA, StateId refB); 73 | 74 | public: 75 | AdaptedCompositionFst(const fst::StdFst &fstA, const fst::StdFst &fstB); 76 | AdaptedCompositionFst(const 
fst::StdFst &fstA, const fst::StdFst &fstB, const SymbolTable &symbols); 77 | AdaptedCompositionFst(const fst::StdFst &fstA, const fst::StdFst &fstB, const SymbolTable &symbols, const AlignerOptions& options); 78 | ~AdaptedCompositionFst(); 79 | 80 | StateId Start(); 81 | fst::Fst::Weight Final(StateId stateId); 82 | bool TryGetArcsAtState(StateId fromStateId, vector *out_vector); 83 | 84 | // a and b are in the incoming graph referencials 85 | bool DoesComposedStateExist(StateId a, StateId b); 86 | 87 | // a is in the composed-graph referencial 88 | bool DoesComposedStateExist(StateId a); 89 | 90 | void SetSymbols(const fst::SymbolTable *symbols); 91 | 92 | void DebugComposedGraph(); 93 | }; 94 | 95 | #endif -------------------------------------------------------------------------------- /src/AlignmentTraversor.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | AlignmentTraversor.cpp 3 | JP Robichaud (jp@rev.com) 4 | 2021 5 | 6 | */ 7 | #include "AlignmentTraversor.h" 8 | 9 | AlignmentTraversor::AlignmentTraversor(wer_alignment &topLevel) : root(topLevel) { 10 | currentPosInRoot = -1; 11 | currentSubclass = nullptr; 12 | } 13 | 14 | void AlignmentTraversor::Restart() { 15 | currentPosInRoot = -1; 16 | currentSubclass = nullptr; 17 | currentPosInSubclass = -1; 18 | } 19 | 20 | bool AlignmentTraversor::NextTriple(triple &triple) { 21 | if (currentSubclass == nullptr) { 22 | // we're not in a subclass, we're consuming the root alignment content, 23 | // let's move to the next word 24 | currentPosInRoot++; 25 | if (currentPosInRoot >= root.tokens.size()) { 26 | return false; 27 | } 28 | 29 | auto tk = root.tokens[currentPosInRoot]; 30 | if (isEntityLabel(tk.first)) { 31 | // handle class 32 | currentPosInSubclass = -1; 33 | // find subclass spWERA from within the root 34 | for (auto &a : root.label_alignments) { 35 | if (a.classLabel == tk.first) { 36 | currentSubclass = &a; 37 | break; 38 | } 39 | } 40 | // 
currentSubclass = nullptr; // fixme 41 | return NextTriple(triple); 42 | } 43 | 44 | triple.classLabel = TK_GLOBAL_CLASS; 45 | triple.ref = tk.first; 46 | triple.hyp = tk.second; 47 | 48 | return true; 49 | } else { 50 | currentPosInSubclass++; 51 | if (currentPosInSubclass == 0 && currentSubclass->tokens.size() == 0 && 52 | currentSubclass->classLabel.find("FALLBACK") != std::string::npos) { 53 | triple.classLabel = currentSubclass->classLabel; 54 | triple.ref = NOOP; 55 | triple.hyp = NOOP; 56 | return true; 57 | } 58 | if (currentPosInSubclass >= currentSubclass->tokens.size()) { 59 | // we're done here... 60 | currentSubclass = nullptr; 61 | currentPosInSubclass = -1; 62 | return NextTriple(triple); 63 | } 64 | 65 | auto tk = currentSubclass->tokens[currentPosInSubclass]; 66 | triple.classLabel = currentSubclass->classLabel; 67 | triple.ref = tk.first; 68 | triple.hyp = tk.second; 69 | return true; 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /src/AlignmentTraversor.h: -------------------------------------------------------------------------------- 1 | /* 2 | AlignmentTraversor.h 3 | JP Robichaud (jp@rev.com) 4 | 2021 5 | 6 | */ 7 | 8 | #ifndef __ATRAVERSOR_H__ 9 | #define __ATRAVERSOR_H__ 10 | 11 | #include "utilities.h" 12 | 13 | struct triple { 14 | string ref; 15 | string hyp; 16 | string classLabel; 17 | }; 18 | 19 | class AlignmentTraversor { 20 | public: 21 | AlignmentTraversor(wer_alignment &topLevel); 22 | bool NextTriple(triple &triple); 23 | void Restart(); 24 | 25 | private: 26 | wer_alignment &root; 27 | int currentPosInRoot = -1; 28 | int currentPosInSubclass; 29 | wer_alignment *currentSubclass; 30 | }; 31 | 32 | #endif // __ATRAVERSOR_H__ 33 | -------------------------------------------------------------------------------- /src/Ctm.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * 3 | * Ctm.cpp 4 | * 5 | * JP Robichaud (jp@rev.com) 6 | 2018 7 
| * 8 | */ 9 | 10 | #include "Ctm.h" 11 | 12 | #include 13 | 14 | using namespace std; 15 | using namespace fst; 16 | 17 | /*************************************** 18 | CTM FST Loader Class Start 19 | ***************************************/ 20 | CtmFstLoader::CtmFstLoader(vector &records, bool use_case) : FstLoader() { 21 | { 22 | mCtmRows = records; 23 | mUseCase = use_case; 24 | for (auto &row : mCtmRows) { 25 | std::string token = std::string(row.word); 26 | if (!mUseCase) { 27 | token = UnicodeLowercase(row.word); 28 | } 29 | mToken.push_back(token); 30 | } 31 | } 32 | } 33 | 34 | CtmFstLoader::~CtmFstLoader() { 35 | // TODO Auto-generated destructor stub 36 | } 37 | void CtmFstLoader::addToSymbolTable(SymbolTable &symbol) const { 38 | for (auto &s : mToken) { 39 | AddSymbolIfNeeded(symbol, s); 40 | } 41 | } 42 | 43 | StdVectorFst CtmFstLoader::convertToFst(const SymbolTable &symbol, std::vector map) const { 44 | auto logger = logger::GetOrCreateLogger("ctmloader"); 45 | // 46 | StdVectorFst transducer; 47 | logger->debug("creating transducer for CTM"); 48 | 49 | transducer.AddState(); 50 | transducer.SetStart(0); 51 | 52 | int prevState = 0; 53 | int nextState = 1; 54 | int wc = 0; 55 | int map_sz = map.size(); 56 | for (TokenType::const_iterator i = mToken.begin(); i != mToken.end(); ++i) { 57 | std::string token = *i; 58 | if (!mUseCase) { 59 | token = UnicodeLowercase(token); 60 | } 61 | transducer.AddState(); 62 | 63 | if (map_sz > wc && map[wc] > 0) { 64 | transducer.AddArc(prevState, StdArc(symbol.Find(token), symbol.Find(token), 1.0f, nextState)); 65 | } else { 66 | transducer.AddArc(prevState, StdArc(symbol.Find(token), symbol.Find(token), 0.0f, nextState)); 67 | } 68 | 69 | prevState = nextState; 70 | nextState++; 71 | wc++; 72 | } 73 | 74 | transducer.SetFinal(prevState, 0.0f); 75 | return transducer; 76 | } 77 | 78 | std::vector CtmFstLoader::convertToIntVector(fst::SymbolTable &symbol) const { 79 | auto logger = 
logger::GetOrCreateLogger("ctmloader"); 80 | std::vector vect; 81 | addToSymbolTable(symbol); 82 | int sz = mToken.size(); 83 | logger->info("creating std::vector for CTM for {} tokens", sz); 84 | vect.reserve(sz); 85 | 86 | FstAlignOption options; 87 | for (TokenType::const_iterator i = mToken.begin(); i != mToken.end(); ++i) { 88 | std::string token = *i; 89 | int token_sym = symbol.Find(token); 90 | if (token_sym == -1) { 91 | token_sym = symbol.Find(options.symUnk); 92 | } 93 | vect.emplace_back(token_sym); 94 | } 95 | 96 | return vect; 97 | } 98 | 99 | /*************************************** 100 | CTM FST Loader Class End 101 | ***************************************/ 102 | 103 | /*************************************** 104 | CTM Reader Class Start 105 | ***************************************/ 106 | CtmReader::CtmReader() {} 107 | 108 | vector read_from_disk_no_conf(const string &filename) { 109 | vector vect; 110 | io::CSVReader<5, io::trim_chars<' ', '\t'>, io::no_quote_escape<' '>, io::throw_on_overflow, io::empty_line_comment> 111 | input_ctm(filename); 112 | 113 | input_ctm.set_header("audiofile", "channel", "start", "duration", "word"); 114 | 115 | string audiofile, channel, start, duration, word, confidence; 116 | while (input_ctm.read_row(audiofile, channel, start, duration, word)) { 117 | RawCtmRecord record; 118 | record.recording = audiofile; 119 | record.channel = channel; 120 | record.start_time_secs = stof(start); 121 | record.duration_secs = stof(duration); 122 | record.word = word; 123 | record.confidence = 1; 124 | vect.push_back(record); 125 | } 126 | 127 | return vect; 128 | } 129 | 130 | vector read_from_disk_with_conf(const string &filename) { 131 | vector vect; 132 | io::CSVReader<6, io::trim_chars<' ', '\t'>, io::no_quote_escape<' '>, io::throw_on_overflow, io::empty_line_comment> 133 | input_ctm(filename); 134 | 135 | input_ctm.set_header("audiofile", "channel", "start", "duration", "word", "confidence"); 136 | 137 | string audiofile, 
channel, start, duration, word, confidence; 138 | while (input_ctm.read_row(audiofile, channel, start, duration, word, confidence)) { 139 | RawCtmRecord record; 140 | record.recording = audiofile; 141 | record.channel = channel; 142 | record.start_time_secs = stof(start); 143 | record.duration_secs = stof(duration); 144 | record.word = word; 145 | record.confidence = stof(confidence); 146 | vect.push_back(record); 147 | } 148 | 149 | return vect; 150 | } 151 | 152 | vector CtmReader::read_from_disk(const string &filename) { 153 | ifstream ctm_peek(filename); 154 | string first_line; 155 | if (!std::getline(ctm_peek, first_line)) { 156 | vector vect; 157 | return vect; 158 | } 159 | 160 | int sz = 1; 161 | char lastChar = 'x'; 162 | 163 | for (auto &c : first_line) { 164 | if (c == ' ' || c == '\t') { 165 | if (lastChar != ' ' && lastChar != '\t') { 166 | sz++; 167 | } 168 | } 169 | 170 | lastChar = c; 171 | } 172 | 173 | // Minimum CTM columns should be: audiofile, channel, start, duration, word 174 | // Sixth confidence score column is optional 175 | bool hasConf = sz > 5 ? 
true : false; 176 | 177 | if (hasConf) { 178 | return read_from_disk_with_conf(filename); 179 | } else { 180 | return read_from_disk_no_conf(filename); 181 | } 182 | } 183 | 184 | /*************************************** 185 | CTM Reader Class End 186 | ***************************************/ 187 | -------------------------------------------------------------------------------- /src/Ctm.h: -------------------------------------------------------------------------------- 1 | /* 2 | * 3 | * Ctm.h 4 | * 5 | * JP Robichaud (jp@rev.com) 6 | * (C) 2018 7 | * 8 | */ 9 | 10 | #ifndef __CTM_H__ 11 | #define __CTM_H__ 12 | 13 | #include "FstLoader.h" 14 | #include "utilities.h" 15 | 16 | using namespace std; 17 | using namespace fst; 18 | 19 | struct RawCtmRecord { 20 | string recording; 21 | string channel; 22 | float start_time_secs; 23 | float duration_secs; 24 | string word; 25 | float confidence; 26 | }; 27 | 28 | class CtmFstLoader : public FstLoader { 29 | public: 30 | CtmFstLoader(std::vector &records, bool use_case = false); 31 | ~CtmFstLoader(); 32 | vector mCtmRows; 33 | virtual void addToSymbolTable(fst::SymbolTable &symbol) const; 34 | virtual fst::StdVectorFst convertToFst(const fst::SymbolTable &symbol, std::vector map) const; 35 | virtual std::vector convertToIntVector(fst::SymbolTable &symbol) const; 36 | virtual const std::string &getToken(int index) const { return mToken.at(index); } 37 | private: 38 | bool mUseCase; 39 | }; 40 | 41 | class CtmReader { 42 | public: 43 | CtmReader(); 44 | vector read_from_disk(const std::string &filename); 45 | }; 46 | 47 | #endif // __CTM_H__ 48 | -------------------------------------------------------------------------------- /src/FstFileLoader.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * FstFileLoader.cpp 3 | */ 4 | #include "FstFileLoader.h" 5 | 6 | FstFileLoader::FstFileLoader(std::string filename) : FstLoader(), filename_(filename) {} 7 | 8 | void 
FstFileLoader::addToSymbolTable(fst::SymbolTable& symbol) const { return; } 9 | 10 | fst::StdVectorFst FstFileLoader::convertToFst(const fst::SymbolTable& symbol, std::vector map) const { 11 | auto logger = logger::GetOrCreateLogger("FstFileLoader"); 12 | fst::StdVectorFst* transducer = fst::StdVectorFst::Read(filename_); 13 | logger->info("Total FST has {} states.", transducer->NumStates()); 14 | return (*transducer); 15 | } 16 | 17 | std::vector FstFileLoader::convertToIntVector(fst::SymbolTable& symbol) const { 18 | auto logger = logger::GetOrCreateLogger("FstFileLoader"); 19 | std::vector vect; 20 | logger->error("convertToIntVector isn't implemented for FST inputs"); 21 | vect.reserve(0); 22 | vect.resize(0); 23 | return vect; 24 | } 25 | 26 | FstFileLoader::~FstFileLoader() {} 27 | -------------------------------------------------------------------------------- /src/FstFileLoader.h: -------------------------------------------------------------------------------- 1 | /* 2 | * FstFileLoader.h 3 | * 4 | * FstLoader for loading a serialized FST from disk. 
5 | * 6 | * Quinn McNamara (quinn@rev.com) 7 | * 2020 8 | */ 9 | 10 | #ifndef FstFileLoader_H_ 11 | #define FstFileLoader_H_ 12 | 13 | #include 14 | #include 15 | #include 16 | 17 | #include "FstLoader.h" 18 | #include "utilities.h" 19 | 20 | class FstFileLoader : public FstLoader { 21 | public: 22 | FstFileLoader(std::string filename); 23 | ~FstFileLoader(); 24 | 25 | virtual void addToSymbolTable(fst::SymbolTable &symbol) const; 26 | virtual fst::StdVectorFst convertToFst(const fst::SymbolTable &symbol, std::vector map) const; 27 | virtual std::vector convertToIntVector(fst::SymbolTable &symbol) const; 28 | 29 | private: 30 | std::string filename_; 31 | }; 32 | 33 | #endif /* FstFileLoader_H_ */ 34 | -------------------------------------------------------------------------------- /src/FstLoader.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | FstLoader.cpp 3 | JP Robichaud (jp@rev.com) 4 | 2018 5 | 6 | */ 7 | 8 | #include "FstLoader.h" 9 | #include "utilities.h" 10 | 11 | FstLoader::FstLoader() { 12 | // TODO Auto-generated constructor stub 13 | } 14 | 15 | FstLoader::~FstLoader() { 16 | // TODO Auto-generated destructor stub 17 | } 18 | 19 | void FstLoader::AddSymbolIfNeeded(fst::SymbolTable &symbol, std::string str_value) { 20 | if (symbol.Find(str_value) == -1) { 21 | symbol.AddSymbol(str_value); 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /src/FstLoader.h: -------------------------------------------------------------------------------- 1 | /* 2 | FstLoader.h 3 | JP Robichaud (jp@rev.com) 4 | 2018 5 | 6 | */ 7 | 8 | #ifndef __FSTLOADER_H_ 9 | #define __FSTLOADER_H_ 10 | 11 | #include 12 | #include "utilities.h" 13 | 14 | class FstLoader { 15 | protected: 16 | typedef std::vector TokenType; 17 | TokenType mToken; 18 | 19 | public: 20 | FstLoader(); 21 | virtual ~FstLoader(); 22 | virtual void addToSymbolTable(fst::SymbolTable &symbol) const = 0; 23 | static 
void AddSymbolIfNeeded(fst::SymbolTable &symbol, std::string str_value); 24 | virtual fst::StdVectorFst convertToFst(const fst::SymbolTable &symbol, std::vector map) const = 0; 25 | virtual std::vector convertToIntVector(fst::SymbolTable &symbol) const = 0; 26 | 27 | static std::unique_ptr MakeReferenceLoader(const std::string& ref_filename, 28 | const std::string& wer_sidecar_filename, 29 | const std::string& json_norm_filename, 30 | bool use_punctuation, 31 | bool use_case, 32 | bool symbols_file_included); 33 | 34 | static std::unique_ptr MakeHypothesisLoader(const std::string& hyp_filename, 35 | const std::string& hyp_json_norm_filename, 36 | bool use_punctuation, 37 | bool use_case, 38 | bool symbols_file_included); 39 | 40 | 41 | }; 42 | 43 | #endif /* __FSTLOADER_H_ */ 44 | -------------------------------------------------------------------------------- /src/IComposition.h: -------------------------------------------------------------------------------- 1 | /* 2 | ICompostion.h 3 | JP Robichaud (jp@rev.com) 4 | 2021 5 | 6 | Custom interface to encapsulate various composition strategies 7 | 8 | */ 9 | 10 | #ifndef __ICOMPOSITION_H_ 11 | #define __ICOMPOSITION_H_ 12 | 13 | #include "utilities.h" 14 | typedef fst::Fst::StateId StateId; 15 | 16 | class IComposition : public fst::VectorFst { 17 | protected: 18 | float insertion_cost = 1; 19 | float deletion_cost = 1; 20 | float substitution_cost = 1.5; 21 | std::shared_ptr logger_; 22 | 23 | fst::SymbolTable *symbols_; 24 | 25 | // TODO: make this settable/configurable 26 | int ins_label_id_ = 1; 27 | int del_label_id_ = 2; 28 | int sub_label_id_ = 3; 29 | 30 | public: 31 | IComposition() {} 32 | IComposition(const fst::StdFst &fstA, const fst::StdFst &fstB) {} 33 | IComposition(const fst::StdFst &fstA, const fst::StdFst &fstB, SymbolTable &symbols) {} 34 | 35 | virtual ~IComposition() {} 36 | virtual StateId Start() = 0; 37 | virtual fst::Fst::Weight Final(StateId stateId) = 0; 38 | virtual bool 
TryGetArcsAtState(StateId fromStateId, vector *out_vector) = 0; 39 | }; 40 | 41 | #endif /*__ICOMPOSITION_H_ */ 42 | -------------------------------------------------------------------------------- /src/Nlp.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Nlp.h 3 | * 4 | * Created on: 2018-04-23 5 | * Author: JP Robichaud (jp@rev.com) 6 | */ 7 | #ifndef NLP_H_ 8 | #define NLP_H_ 9 | 10 | #include 11 | 12 | #include 13 | 14 | #include "FstLoader.h" 15 | 16 | using namespace std; 17 | using namespace fst; 18 | 19 | struct WerTagEntry { 20 | string tag_id; 21 | string entity_type; 22 | }; 23 | 24 | struct RawNlpRecord { 25 | string token; 26 | string speakerId; 27 | string punctuation; 28 | string prepunctuation; 29 | string ts; 30 | string endTs; 31 | string casing; 32 | string labels; 33 | string best_label; 34 | string best_label_id; 35 | vector wer_tags; 36 | string confidence; 37 | }; 38 | 39 | class NlpReader { 40 | public: 41 | NlpReader(); 42 | virtual ~NlpReader(); 43 | vector read_from_disk(const std::string &filename); 44 | string GetBestLabel(std::string &labels); 45 | vector GetWerTags(std::string &wer_tags_str); 46 | string GetLabelId(std::string &label); 47 | }; 48 | 49 | class NlpFstLoader : public FstLoader { 50 | public: 51 | NlpFstLoader(std::vector &records, Json::Value normalization, Json::Value wer_sidecar, bool processLabels, bool use_punctuation = false, bool use_case = false); 52 | NlpFstLoader(std::vector &records, Json::Value normalization, Json::Value wer_sidecar); 53 | virtual ~NlpFstLoader(); 54 | virtual void addToSymbolTable(fst::SymbolTable &symbol) const; 55 | virtual fst::StdVectorFst convertToFst(const fst::SymbolTable &symbol, std::vector map) const; 56 | virtual std::vector convertToIntVector(fst::SymbolTable &symbol) const; 57 | 58 | int GetProperSymbolId(const fst::SymbolTable &symbol, string token, string symUnk) const; 59 | vector mNlpRows; 60 | vector mSpeakers; 61 | Json::Value 
mJsonNorm; 62 | Json::Value mWerSidecar; 63 | virtual const std::string &getToken(int index) const { return mToken.at(index); } 64 | private: 65 | bool mUseCase; 66 | }; 67 | 68 | #endif /* NLP_H_ */ 69 | -------------------------------------------------------------------------------- /src/OneBestFstLoader.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * OneBestFstLoader.cpp 3 | * JP Robichaud (jp@rev.com) 4 | * 2018 5 | */ 6 | 7 | #include "OneBestFstLoader.h" 8 | 9 | #include 10 | #include 11 | 12 | #include "utilities.h" 13 | 14 | // empty constructor 15 | OneBestFstLoader::OneBestFstLoader(bool use_case) : FstLoader() { 16 | mUseCase = use_case; 17 | } 18 | 19 | void OneBestFstLoader::BuildFromString(const std::string content) { 20 | std::istringstream mystream(content); 21 | std::copy(std::istream_iterator(mystream), std::istream_iterator(), 22 | std::back_inserter(mToken)); 23 | } 24 | 25 | void OneBestFstLoader::LoadTextFile(const std::string filename) { 26 | std::ifstream stream(filename); 27 | 28 | if (!stream.is_open()) throw std::runtime_error("Cannot open input file"); 29 | 30 | std::copy(std::istream_iterator(stream), std::istream_iterator(), 31 | std::back_inserter(mToken)); 32 | 33 | stream.close(); 34 | } 35 | 36 | void OneBestFstLoader::addToSymbolTable(fst::SymbolTable &symbol) const { 37 | for (TokenType::const_iterator i = mToken.begin(); i != mToken.end(); ++i) { 38 | std::string token = *i; 39 | if (!mUseCase) { 40 | token = UnicodeLowercase(token); 41 | } 42 | // fst::kNoSymbol 43 | if (symbol.Find(token) == -1) { 44 | symbol.AddSymbol(token); 45 | } 46 | } 47 | } 48 | 49 | fst::StdVectorFst OneBestFstLoader::convertToFst(const fst::SymbolTable &symbol, std::vector map) const { 50 | auto logger = logger::GetOrCreateLogger("OneBestFstLoader"); 51 | 52 | FstAlignOption options; 53 | int eps_sym = symbol.Find(options.symEps); 54 | 55 | fst::StdVectorFst transducer; 56 | 57 | 
transducer.AddState(); 58 | transducer.SetStart(0); 59 | 60 | int prevState = 0; 61 | int nextState = 1; 62 | int map_sz = map.size(); 63 | int wc = 0; 64 | for (TokenType::const_iterator i = mToken.begin(); i != mToken.end(); ++i) { 65 | std::string token = *i; 66 | if (!mUseCase) { 67 | token = UnicodeLowercase(token); 68 | } 69 | transducer.AddState(); 70 | 71 | int tk_idx = symbol.Find(token); 72 | if (tk_idx < 0) { 73 | logger->trace("we found an invalid token [{}] at token position {} which gave a label id of {}", token, (wc + 1), 74 | tk_idx); 75 | } 76 | if (map_sz > wc && map[wc] > 0) { 77 | transducer.AddArc(prevState, fst::StdArc(tk_idx, tk_idx, 1.0f, nextState)); 78 | } else { 79 | transducer.AddArc(prevState, fst::StdArc(tk_idx, tk_idx, 0.0f, nextState)); 80 | } 81 | 82 | prevState = nextState; 83 | nextState++; 84 | wc++; 85 | } 86 | 87 | int realFinal = transducer.AddState(); 88 | transducer.AddArc(prevState, fst::StdArc(eps_sym, eps_sym, 0.0f, realFinal)); 89 | transducer.SetFinal(realFinal, StdFst::Weight::One()); 90 | return transducer; 91 | } 92 | 93 | std::vector OneBestFstLoader::convertToIntVector(fst::SymbolTable &symbol) const { 94 | auto logger = logger::GetOrCreateLogger("OneBestFstLoader"); 95 | std::vector vect; 96 | addToSymbolTable(symbol); 97 | int sz = mToken.size(); 98 | logger->info("creating std::vector for OneBestFstLoader for {} tokens", sz); 99 | vect.reserve(sz); 100 | 101 | FstAlignOption options; 102 | for (TokenType::const_iterator i = mToken.begin(); i != mToken.end(); ++i) { 103 | std::string token = *i; 104 | if (!mUseCase) { 105 | token = UnicodeLowercase(token); 106 | } 107 | int token_sym = symbol.Find(token); 108 | if (token_sym == -1) { 109 | token_sym = symbol.Find(options.symUnk); 110 | } 111 | vect.emplace_back(token_sym); 112 | } 113 | 114 | return vect; 115 | } 116 | 117 | OneBestFstLoader::~OneBestFstLoader() { 118 | // TODO Auto-generated destructor stub 119 | } 120 | 
-------------------------------------------------------------------------------- /src/OneBestFstLoader.h: -------------------------------------------------------------------------------- 1 | /* 2 | * OneBestFstLoader.h 3 | * JP Robichaud (jp@rev.com) 4 | * 2018 5 | */ 6 | 7 | #ifndef ONEBESTFSTLOADER_H_ 8 | #define ONEBESTFSTLOADER_H_ 9 | 10 | #include "FstLoader.h" 11 | 12 | class OneBestFstLoader : public FstLoader { 13 | public: 14 | OneBestFstLoader(bool use_case = false); 15 | virtual ~OneBestFstLoader(); 16 | void LoadTextFile(const std::string filename); 17 | void BuildFromString(const std::string content); 18 | 19 | virtual void addToSymbolTable(fst::SymbolTable &symbol) const; 20 | virtual fst::StdVectorFst convertToFst(const fst::SymbolTable &symbol, std::vector map) const; 21 | virtual const std::string &getToken(int index) const { return mToken.at(index); } 22 | virtual std::vector convertToIntVector(fst::SymbolTable &symbol) const; 23 | int TokensSize() { return mToken.size(); } 24 | private: 25 | bool mUseCase; 26 | }; 27 | 28 | #endif /* ONEBESTFSTLOADER_H_ */ 29 | -------------------------------------------------------------------------------- /src/PathHeap.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | PathHeap.cpp 3 | JP Robichaud (jp@rev.com) 4 | 2018 5 | 6 | */ 7 | 8 | #include "PathHeap.h" 9 | 10 | using namespace std; 11 | using namespace fst; 12 | 13 | PathHeap::PathHeap() { 14 | // creating the set 15 | } 16 | 17 | void PathHeap::insert(shared_ptr entry) { 18 | // just add it to the set, leaving to the comparator to do its job 19 | heap.insert(entry); 20 | } 21 | 22 | shared_ptr PathHeap::removeFirst() { 23 | // we want to take the 1st element and remove it 24 | auto logbookIter = heap.begin(); 25 | auto currentState_ptr = *logbookIter; 26 | heap.erase(logbookIter); 27 | return currentState_ptr; 28 | } 29 | 30 | int PathHeap::size() { return heap.size(); } 31 | 32 | shared_ptr 
PathHeap::GetBestWerCandidate() { 33 | set, shortlistComparatorSharedPtr>::iterator iter = heap.begin(); 34 | 35 | shared_ptr best = nullptr; 36 | float bestWer = std::numeric_limits::quiet_NaN(); 37 | while (iter != heap.end()) { 38 | auto entry = *iter; 39 | float local_wer = (float)entry->numErrors / (float)entry->numWords; 40 | 41 | if (best == nullptr) { 42 | best = entry; 43 | bestWer = local_wer; 44 | continue; 45 | } 46 | 47 | if (local_wer < bestWer) { 48 | bestWer = local_wer; 49 | best = entry; 50 | } 51 | 52 | iter++; 53 | } 54 | 55 | return best; 56 | } 57 | 58 | int PathHeap::prune(int targetSz) { 59 | set, shortlistComparatorSharedPtr>::iterator iter = heap.begin(); 60 | float wer0, wer_last; 61 | int sz = heap.size(); 62 | for (int i = 0; i < targetSz && i < sz; i++) { 63 | float local_wer = (float)(*iter)->numErrors / ((float)(*iter)->numWords); 64 | if (i == 0) { 65 | wer0 = local_wer; 66 | } 67 | 68 | wer_last = local_wer; 69 | 70 | iter++; 71 | } 72 | 73 | auto last_wer_index = iter; 74 | last_wer_index--; 75 | auto logger = logger::GetOrCreateLogger("pathheap"); 76 | // logger->set_level(spdlog::level::debug); 77 | logger->debug("==== pruning starting ====="); 78 | logger->debug("pruning to {} items -> top wer was {} and last wer was {}. 
We have {} items in the heap.", targetSz, 79 | wer0, wer_last, heap.size()); 80 | 81 | /* TODO: make sure we don't prune paths that have the same length/error-count 82 | as the last one kept at 'targetSz' 83 | */ 84 | 85 | int numErrorsWithoutInsertions = (*last_wer_index)->numErrors - (*last_wer_index)->numInsert; 86 | int pruned = 0; 87 | while (iter != heap.end()) { 88 | auto p = *iter; 89 | float local_wer = (float)(*iter)->numErrors / ((float)(*iter)->numWords); 90 | logger->debug( 91 | "candidate for prunung: wer0 {4:.4f}, wer_last {0:.4f} {2} words, current candidate {1:.4f}, {3} words", 92 | wer_last, local_wer, (*last_wer_index)->numWords, (*iter)->numWords, wer0); 93 | 94 | int localCoreErr = (*iter)->numErrors - (*iter)->numInsert; 95 | /* various strategies : 96 | bool pruneMe = (*last_wer_index)->numErrors * 1.2 < (*iter)->numErrors; -> slow on larger files 97 | bool pruneMe = (*last_wer_index)->numErrors * 1.1 < (*iter)->numErrors; -> slightly better on larger files 98 | bool pruneMe = numErrorsWithoutInsertions * 1.1 < localCoreErr; --> a bit agressive 99 | bool pruneMe = (*last_wer_index)->numErrors + 20 < (*iter)->numErrors; --> seems to work resonably well 100 | */ 101 | // TODO: make this '20' configurable. 
Also consider using (numErrors - 102 | // numInsertion) + 20 103 | bool pruneMe = (*last_wer_index)->numErrors + 20 < (*iter)->numErrors; 104 | logger->debug("{} + 20 < {} = {}", numErrorsWithoutInsertions, localCoreErr, pruneMe); 105 | if (pruneMe) { 106 | heap.erase(iter++); 107 | pruned++; 108 | } else { 109 | iter++; 110 | } 111 | } 112 | logger->debug("after pruning we have {} items in the heap", heap.size()); 113 | logger->debug("-----"); 114 | 115 | return pruned; 116 | } 117 | 118 | int PathHeap::prune_relative(float beam_width) { 119 | if (heap.empty()) { 120 | return 0; 121 | } 122 | 123 | auto logger = logger::GetOrCreateLogger("pathheap"); 124 | size_t initial_size = heap.size(); 125 | 126 | // Find the best costSoFar in the current heap 127 | // Note: The heap is ordered by the complex shortlistComparatorSharedPtr, 128 | // so the first element isn't necessarily the one with the lowest costSoFar. 129 | // We need to iterate to find the minimum costSoFar. 130 | float best_cost = std::numeric_limits::max(); 131 | for (const auto& entry : heap) { 132 | if (entry->costSoFar < best_cost) { 133 | best_cost = entry->costSoFar; 134 | } 135 | } 136 | 137 | float cost_threshold = best_cost + beam_width; 138 | 139 | logger->debug("==== Relative pruning starting (Beam: {}) =====", beam_width); 140 | logger->debug("Initial size: {}, Best cost: {:.4f}, Threshold: {:.4f}", 141 | initial_size, best_cost, cost_threshold); 142 | 143 | int pruned_count = 0; 144 | auto iter = heap.begin(); 145 | while (iter != heap.end()) { 146 | // Check if the current entry's cost exceeds the threshold 147 | if ((*iter)->costSoFar > cost_threshold) { 148 | // Remove the element and advance the iterator 149 | iter = heap.erase(iter); 150 | pruned_count++; 151 | } else { 152 | // Otherwise, just advance the iterator 153 | ++iter; 154 | } 155 | } 156 | 157 | logger->debug("After relative pruning: {} items remain ({} pruned)", heap.size(), pruned_count); 158 | logger->debug("-----\n"); 159 
| 160 | return pruned_count; 161 | } 162 | -------------------------------------------------------------------------------- /src/PathHeap.h: -------------------------------------------------------------------------------- 1 | /* 2 | PathHeap.h 3 | JP Robichaud (jp@rev.com) 4 | 2018 5 | 6 | */ 7 | 8 | #ifndef __PATH_HEAP_H__ 9 | #define __PATH_HEAP_H__ 10 | 11 | #include 12 | 13 | #include "utilities.h" 14 | 15 | using namespace std; 16 | using namespace fst; 17 | 18 | typedef struct ShortlistEntry ShortlistEntry; 19 | typedef struct ShortlistEntry* SLE; 20 | typedef struct MyArc MyArc; 21 | typedef struct MyArc* MyArcPtr; 22 | typedef shared_ptr spSLE; 23 | 24 | struct MyArc { 25 | int ilabel = 0; 26 | int olabel = 0; 27 | float weight = 0.0; 28 | int nextstate = 0; 29 | }; 30 | 31 | struct ShortlistEntry { 32 | int currentState = 0; 33 | int whereTo = 0; 34 | int numErrors = 0; 35 | int numWords = 0; 36 | int numInsert = 0; 37 | double costToGoThere = 0; 38 | float costSoFar = 0; 39 | MyArc local_arc; 40 | shared_ptr linkToHere = nullptr; 41 | }; 42 | 43 | struct shortlistComparatorSharedPtr { 44 | bool operator()(const shared_ptr& a, const shared_ptr& b) { 45 | if (a->numWords == b->numWords) { 46 | if (a->numErrors == b->numErrors) { 47 | if (a->costSoFar == b->costSoFar) { 48 | return a->currentState < b->currentState; 49 | } 50 | 51 | return a->costSoFar < b->costSoFar; 52 | } 53 | 54 | return a->numErrors < b->numErrors; 55 | } 56 | 57 | return a->numWords < b->numWords; 58 | } 59 | }; 60 | 61 | class PathHeap { 62 | public: 63 | PathHeap(); 64 | void insert(std::shared_ptr entry); 65 | shared_ptr removeFirst(); 66 | int prune(int targetSz); 67 | int prune_relative(float beam_width); 68 | int size(); 69 | std::shared_ptr GetBestWerCandidate(); 70 | int pruningErrorOffset = 20; 71 | bool pruningIncludeInsInThreshold = true; 72 | 73 | private: 74 | set, shortlistComparatorSharedPtr> heap; 75 | }; 76 | #endif // __PATH_HEAP_H__ 77 | 
-------------------------------------------------------------------------------- /src/StandardComposition.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * 3 | * StandardComposition.cpp 4 | * 5 | * JP Robichaud (jp@rev.com) 6 | * 2021 7 | * 8 | */ 9 | 10 | #include "StandardComposition.h" 11 | #include 12 | #include 13 | #include // For numeric_limits 14 | 15 | using fst::StdArc; 16 | using fst::StdVectorFst; 17 | using fst::SymbolTable; 18 | using fst::TropicalWeight; 19 | using fst::SymbolTableIterator; 20 | using fst::ArcIterator; 21 | using fst::StateIterator; 22 | using fst::FstWriteOptions; 23 | using fst::kNoStateId; 24 | // StateId is defined via typedef in IComposition.h 25 | using std::vector; 26 | using std::string; 27 | using std::ofstream; 28 | 29 | // --- Constructors (ensure symbols_ is initialized) --- 30 | StandardCompositionFst::StandardCompositionFst(const fst::StdFst &fstA, const fst::StdFst &fstB) 31 | : StandardCompositionFst(fstA, fstB, *(fstA.InputSymbols()), AlignerOptions()) // Use default AlignerOptions 32 | { 33 | if (fstA.InputSymbols() == nullptr) { 34 | throw std::runtime_error("StandardCompositionFst requires symbol table. 
Attach symbols to fstA or provide explicitly."); 35 | } 36 | } 37 | 38 | StandardCompositionFst::StandardCompositionFst(const fst::StdFst &fstA, const fst::StdFst &fstB, const SymbolTable &symbols) 39 | : StandardCompositionFst(fstA, fstB, symbols, AlignerOptions()) {} // Use default AlignerOptions 40 | 41 | StandardCompositionFst::StandardCompositionFst(const fst::StdFst &fstA, const fst::StdFst &fstB, const SymbolTable &symbols, const AlignerOptions& options) 42 | : strict_punctuation_(options.strict_punctuation), 43 | punctuation_ids_(options.punctuation_ids), 44 | symbols_(symbols), // Initialize member reference 45 | use_favored_substitutions_(options.use_favored_substitutions), 46 | favored_substitution_cost_(options.favored_substitution_cost), 47 | favorable_substitution_map_(options.favorable_substitution_map) 48 | { 49 | auto logger_ = logger::GetOrCreateLogger("StandardCompositionFst"); 50 | logger_->set_level(spdlog::level::info); 51 | 52 | logger_->info("Starting Standard Composition. 
fstA.Start: {}, fstB.Start: {}", fstA.Start(), fstB.Start()); 53 | 54 | FstAlignOption fst_options; // Assuming this defines symSub, symDel, symIns strings 55 | auto sub_label_id_ = symbols_.Find(fst_options.symSub); 56 | auto del_label_id_ = symbols_.Find(fst_options.symDel); 57 | auto ins_label_id_ = symbols_.Find(fst_options.symIns); 58 | if (sub_label_id_ == fst::kNoSymbol || del_label_id_ == fst::kNoSymbol || ins_label_id_ == fst::kNoSymbol) { 59 | logger_->error("Could not find special edit symbols ('{}', '{}', '{}') in the symbol table.", fst_options.symSub, fst_options.symDel, fst_options.symIns); 60 | throw std::runtime_error("Missing special symbols in symbol table."); 61 | } 62 | 63 | // --- Create Edit FSTs (halfEdit1, halfEdit2) --- 64 | StdVectorFst halfEdit1; 65 | StdVectorFst halfEdit2; 66 | halfEdit1.SetInputSymbols(&symbols_); 67 | halfEdit1.SetOutputSymbols(&symbols_); 68 | halfEdit1.AddState(); 69 | halfEdit1.SetStart(0); 70 | halfEdit1.SetFinal(0, TropicalWeight::One()); 71 | halfEdit1.AddArc(0, StdArc(0, ins_label_id_, insertion_cost / 2, 0)); // eps:ins 72 | 73 | halfEdit2.SetInputSymbols(&symbols_); 74 | halfEdit2.SetOutputSymbols(&symbols_); 75 | halfEdit2.AddState(); 76 | halfEdit2.SetStart(0); 77 | halfEdit2.SetFinal(0, TropicalWeight::One()); 78 | halfEdit2.AddArc(0, StdArc(del_label_id_, 0, deletion_cost / 2, 0)); // del:eps 79 | 80 | for (SymbolTableIterator siter(symbols_); !siter.Done(); siter.Next()) { 81 | int64_t sid = siter.Value(); 82 | if (sid == 0 || sid == ins_label_id_ || sid == del_label_id_ || sid == sub_label_id_) { 83 | continue; 84 | } 85 | 86 | auto sym_tk = symbols_.Find(sid); 87 | bool isClassLabel = isEntityLabel(sym_tk); 88 | // Simplified: Check for entity labels if needed (same as develop version) 89 | // bool isClassLabel = false; // isEntityLabel(symbols_.Find(sid)); in develop 90 | if (isClassLabel) { 91 | // Same handling as in develop version 92 | logger_->info("Token class label found for {}", 
symbols.Find(sid)); 93 | halfEdit1.AddArc(0, StdArc(sid, sid, 0, 0)); 94 | halfEdit1.AddArc(0, StdArc(sid, del_label_id_, -deletion_cost / 2, 0)); 95 | halfEdit2.AddArc(0, StdArc(sid, sid, 0, 0)); 96 | } else { 97 | // Standard symbol edits - exactly as in develop version 98 | halfEdit1.AddArc(0, StdArc(sid, sid, 0, 0)); // id:id 99 | halfEdit1.AddArc(0, StdArc(sid, sub_label_id_, substitution_cost / 2, 0)); // id:sub 100 | halfEdit1.AddArc(0, StdArc(sid, del_label_id_, deletion_cost / 2, 0)); // id:del 101 | 102 | halfEdit2.AddArc(0, StdArc(sid, sid, 0, 0)); // id:id 103 | halfEdit2.AddArc(0, StdArc(sub_label_id_, sid, substitution_cost / 2, 0)); // sub:id 104 | halfEdit2.AddArc(0, StdArc(ins_label_id_, sid, insertion_cost / 2, 0)); // ins:id 105 | } 106 | } 107 | 108 | logger_->info("Created halfEdit FSTs with self-loops"); 109 | 110 | // Step 1: Determinize the reference FST 111 | StdVectorFst detRefFst; 112 | Determinize(fstA, &detRefFst); 113 | logger_->info("Determinized fstA has {} states", detRefFst.NumStates()); 114 | 115 | // Step 2: Compose the first half 116 | StdVectorFst halfCompose1; 117 | logger_->info("Composing detRefFst o halfEdit1"); 118 | Compose(detRefFst, halfEdit1, &halfCompose1); 119 | logger_->debug("halfCompose1 has {} states", halfCompose1.NumStates()); 120 | 121 | // Check if first composition worked 122 | if (halfCompose1.NumStates() == 0) { 123 | logger_->warn("halfCompose1 (ref o edits) produced an FST with 0 states"); 124 | logger_->warn("halEdit1 was:"); 125 | printFst("fstalign", &halfEdit1, &symbols_); 126 | return; 127 | } 128 | 129 | // Sort for composition 130 | ArcSort(&halfCompose1, fst::StdOLabelCompare()); 131 | 132 | if (halfCompose1.NumStates() < 100) { 133 | printFst("fstalign", &halfCompose1, &symbols_); 134 | } 135 | 136 | // Step 3: Compose the second half 137 | StdVectorFst halfCompose2; 138 | logger_->info("Composing halfEdit2 o fstB"); 139 | Compose(halfEdit2, fstB, &halfCompose2); 140 | 
// Start state of the lazily-composed edit FST.
StateId StandardCompositionFst::Start() {
  return fstC_->Start();
}

// Final weight of `stateId` in the composed FST.
fst::Fst::Weight StandardCompositionFst::Final(StateId stateId) {
  return fstC_->Final(stateId);
}

// Appends every outgoing arc of `fromStateId` to *out_vector and returns true.
// NOTE(review): the vector is deliberately NOT cleared first (clear() is
// commented out) — callers appear to rely on append semantics; confirm.
bool StandardCompositionFst::TryGetArcsAtState(StateId fromStateId, vector *out_vector) {
  assert(out_vector != NULL);
  // out_vector->clear();

  for (ArcIterator aiter(*fstC_, fromStateId); !aiter.Done(); aiter.Next()) {
    const fst::StdArc &arc = aiter.Value();
    out_vector->push_back(arc);
  }

  return true;
}

// fstC_ is a unique_ptr, so no manual cleanup is needed.
StandardCompositionFst::~StandardCompositionFst() {}

// Materializes the lazy composition into a StdVectorFst (full, non-lazy
// composition) and writes it, with both symbol tables, to `debug_filename`.
// Only use on *small* graphs — see the warning in StandardComposition.h.
void StandardCompositionFst::DebugComposedGraph(string debug_filename) {
  StdVectorFst composedFst(*fstC_);
  ofstream outfile(debug_filename);
  FstWriteOptions wopts;
  composedFst.SetInputSymbols(&symbols_);
  composedFst.SetOutputSymbols(&symbols_);
  wopts.write_isymbols = true;
  wopts.write_osymbols = true;
  wopts.write_header = true;
  composedFst.Write(outfile, wopts);
}
/*
 *
 * SynonymEngine.h
 *
 * JP Robichaud (jp@rev.com)
 * 2018
 *
 */
#ifndef __SYN_ENGINE_H
#define __SYN_ENGINE_H
#include 

#include "utilities.h"

using namespace std;
using namespace fst;

// Left-hand side of a synonym rule: a token sequence to match.
typedef vector SynKey;
// Right-hand side of a synonym rule: the alternative token sequences.
typedef vector> SynVals;

struct SynonymOptions {
  bool disable_cutoffs = false;        // NOTE(review): semantics inferred from name — disables cutoff-word synonyms; confirm in SynonymEngine.cpp
  bool disable_hyphen_ignore = false;  // NOTE(review): presumably disables hyphen-variant handling; confirm
};

// Parses synonym rewrite rules (see docs/Synonyms-Format.md) and applies them
// to an FST so equivalent token sequences can align at no/low cost.
class SynonymEngine {
 public:
  SynonymEngine(SynonymOptions syn_opts);

  // Reads rule lines from `filename` and parses them into `synonyms`.
  void LoadFile(string filename);
  SynKey GetKeyFromString(string lhs);
  SynVals GetValuesFromStrings(string rhs);
  void ParseStrings(vector lines);
  // NOTE(review): presumably adds synonym arcs to `fst`, extending `symbol`
  // with any new tokens — confirm against SynonymEngine.cpp.
  void ApplyToFst(StdVectorFst &fst, SymbolTable &symbol);
  void GenerateSynFromSymbolTable(SymbolTable &symbol);

 protected:
  SynonymOptions opts_;
  map synonyms;  // parsed rules, keyed by LHS token sequence
  std::shared_ptr logger_;
};

#endif  // __SYN_ENGINE_H
// Returns the smallest of the three values.
// The int& signature is kept for compatibility with existing call sites in
// this file; the hand-rolled comparison chain is replaced with the
// standard-library idiom (equivalent result, including ties).
int min(int &a, int &b, int &c) {
  return std::min(a, std::min(b, c));
}
// Computes the Levenshtein (edit) distance between seqA and seqB and fills
// mapA/mapB with per-token flags: 1 = token matched its counterpart during
// the best alignment, -1 = token took part in an error (sub/ins/del).
// Memory-hungry: every DP row is kept in all_distances so the backtracking
// pass can re-read it — on the order of seqA.size()*seqB.size()*sizeof(int).
int GetEditDistance(std::vector &seqA, std::vector &mapA, std::vector &seqB, std::vector &mapB) {
  int lengthA = seqA.size();
  int lengthB = seqB.size();

  if (lengthA > lengthB) {
    // make sure seqA is always the shortest
    return GetEditDistance(seqB, mapB, seqA, mapA);
  }

  // Initialize both maps to -1 ("error"); positions are flipped to 1 when a
  // match is found during backtracking.
  mapA.reserve(seqA.size());
  mapA.resize(seqA.size(), -1);  // resize() sets all position to the given value, -1
  mapB.reserve(seqB.size());
  mapB.resize(seqB.size(), -1);

  // Degenerate cases: distance to an empty sequence is the other's length.
  if (seqA.size() == 0) {
    return seqB.size();
  } else if (seqB.size() == 0) {
    return seqA.size();
  }

  // let's keep two rows, dig all the way to the end to get the distance, then
  // let's try to backtrack and get the edits from there, recomputing the rows again

  // NOTE(review): variable-length arrays are a gcc/clang extension in C++.
  // `distance` is the DP row being computed, `distancePrev` the row above it.
  int distance[lengthA + 1];
  int distancePrev[lengthA + 1];
  for (int i = 0; i <= lengthA; ++i) {
    distance[i] = i;
  }

  // TODO: we should maybe optimize this to be a 2D uint array?
  std::vector> all_distances;

#if debug_map
  print_vect(std::string("seqA: "), seqA.data(), lengthA);
  print_vect(std::string("seqB: "), seqB.data(), lengthB);
#endif
  // Forward pass: classic row-by-row Levenshtein; each row is saved into
  // all_distances before being overwritten so backtracking can re-read it.
  for (int j = 1; j <= lengthB; ++j) {
    all_distances.push_back(std::vector(distance, distance + lengthA + 1));
    // for (int x = 0; x <= lengthA; ++x) {
    //   distancePrev[x] = distance[x];
    // }

    std::copy(distance, distance + lengthA + 1, distancePrev);

#if debug_map
    print_vect(std::string("d: "), distance, lengthA + 1);
#endif

    int prev_diag = distance[0], prev_diag_save;
    ++distance[0];

    for (int i = 1; i <= lengthA; ++i) {
      prev_diag_save = distance[i];
      if (seqA[i - 1] == seqB[j - 1]) {
        distance[i] = prev_diag;
      } else {
        // cheapest of west (deletion) / north (insertion) / diag (sub), +1
        distance[i] = min(distance[i - 1], distance[i], prev_diag) + 1;
      }
      prev_diag = prev_diag_save;
    }
  }
  all_distances.push_back(std::vector(distance, distance + lengthA + 1));
#if debug_map
  print_vect(std::string("d: "), distance, lengthA + 1);
#endif

  int edit_distance = distance[lengthA];

  // now, we want to backtrack the computation and trace, row, by row,
  // the path

  int current_pos = lengthA;
  int seqB_track = lengthB;

  while (current_pos > 0 && seqB_track >= 0) {
    int current_pos_score = distance[current_pos];
#if debug_map
    std::cout << "starting iter" << std::endl;
    std::cout << "current_pos = " << current_pos << std::endl;
    std::cout << "current_pos_score = " << current_pos_score << std::endl;
    std::cout << "seqB_track = " << seqB_track << std::endl;
    print_vect(std::string("mapA:"), mapA.data(), lengthA);
    print_vect(std::string("mapB:"), mapB.data(), lengthB);
    print_vect(std::string("dP: "), distancePrev, lengthA + 1);
    print_vect(std::string("d : "), distance, lengthA + 1);
#endif

    int token_a = seqA[current_pos - 1];
    int token_b = seqB[seqB_track - 1];

    // DP-matrix neighbours of the current cell: north-west, north, west.
    int nw = distancePrev[current_pos - 1];
    int n = distancePrev[current_pos];
    int w = distance[current_pos - 1];

    bool is_sub = false;
    bool is_match = false;
    bool is_del = false;
    bool is_ins = false;

#if debug_map
    std::cout << "checking " << token_a << " vs " << token_b << std::endl;
    std::cout << "nw = " << nw << std::endl;
    std::cout << "n = " << n << std::endl;
    std::cout << "w = " << w << std::endl;
#endif

    int min_path_score = min(nw, n, w);
    if (min_path_score == nw) {
      // the upper-left diagonal is the best path
      is_sub = true;
      if (token_a == token_b) {
        // we have a caracter match
        mapA[current_pos - 1] = 1;
        mapB[seqB_track - 1] = 1;
        is_match = true;
        is_sub = false;
      }
#if debug_map
      std::cout << "S(" << token_a << "|" << token_b << ") (matched " << is_match << "), (sub " << is_sub << ")"
                << std::endl;
#endif
      // going up-left, next time we need to be one position on the left
      // to read the distance
      current_pos--;
      seqB_track--;
    } else if (min_path_score == w) {
      // this is a deletion, going left
      current_pos--;
      is_del = true;
#if debug_map
      std::cout << "D(" << token_a << ")" << std::endl;
#endif
    } else {
      // this is an insertion, going north doesn't
      // change the current_position we read in the distance vector
      seqB_track--;
      is_ins = true;
#if debug_map
      std::cout << "I(" << token_b << ")" << std::endl;
#endif
    }

    if (current_pos < 0 || seqB_track == 0) {
      // We reach to a point where any position left
      // are errors (either insertions or deletions).
      // There's no point in analyzing these values.
      break;
    }

    // ok, we stop reflecting on the best path to take, now we need to update
    // the distance vectors. Step 1, distancePrev becomes the new distance
    // now, we only have to go up to current_pos, because everything on the
    // right will get ignored. Actually, we only have to compute the two values
    // above

    for (int x = 0; x <= lengthA; x++) {
      distance[x] = distancePrev[x];
    }

    // Step 2, let's look at the row above the one we are now.
    // NOTE(review): the `if (false)` branch below is dead code (the condition
    // is literally false) — an in-place row recomputation that was abandoned;
    // the live path simply restores the saved row from all_distances.
    if (false) {
      token_b = seqB[seqB_track - 1];
      int token_b_prime = seqB[seqB_track - 2];
      for (int x = current_pos; x > 0; --x) {
        token_a = seqA[x - 1];

        if (token_a == token_b) {
          distancePrev[x - 1] = distance[x];
          distancePrev[x] = distance[x] + 1;
        } else {
          distancePrev[x - 1] = distance[x - 1] - 1;
          distancePrev[x] = distance[x] - 1;
        }
      }

    } else {
      auto v = all_distances[seqB_track - 1];
      for (int j = 0; j <= lengthA; ++j) {
        distancePrev[j] = v[j];
      }
    }
  }

#if debug_map
  std::cout << "starting iter" << std::endl;
  std::cout << "current_pos = " << current_pos << std::endl;
  std::cout << "current_pos_score = 0" << std::endl;
  std::cout << "seqB_track = " << seqB_track << std::endl;
  print_vect(std::string("mapA:"), mapA.data(), lengthA);
  print_vect(std::string("mapB:"), mapB.data(), lengthB);
  print_vect(std::string("dP: "), distancePrev, lengthA + 1);
  print_vect(std::string("d : "), distance, lengthA + 1);
#endif

  return edit_distance;
}
// Returns true if `map` (entries: 1 = token matched, <= 0 = error, as filled
// by GetEditDistance) contains a run of consecutive errors strictly longer
// than `streak_cutoff`.
//
// Fixes two defects in the previous version:
//  - a streak was only evaluated when it was terminated by a match, so an
//    error streak at the very END of the map was never detected;
//  - `seq_cnt` was declared but never used (removed), and the loop index was
//    a signed int compared against map.size().
bool MapContainsErrorStreaks(std::vector<int> map, int streak_cutoff) {
  int bad_match_seq_cnt = 0;  // length of the current run of errors
  for (size_t x = 0; x < map.size(); x++) {
    if (map[x] <= 0) {
      bad_match_seq_cnt++;
      // Report as soon as the run exceeds the cutoff; this also catches a
      // trailing run that is never followed by a match.
      if (bad_match_seq_cnt > streak_cutoff) {
        return true;
      }
    } else {
      bad_match_seq_cnt = 0;
    }
  }
  return false;
}
// Represent information associated with a reference or hypothesis token
struct Token {
  string token;           // surface form of the token
  float start_ts=0.0;     // start timestamp; presumably seconds (CTM convention) — confirm at fill site
  float end_ts=0.0;       // end timestamp, same unit as start_ts
  float duration=0.0;     // token duration
  float confidence=-1.0;  // recognizer confidence; -1.0 means "not available"
  string speaker;         // speaker id/label; empty when unknown
};
// Bag of knobs controlling alignment/composition behavior, passed through
// HandleWer/HandleAlign down to the composition classes.
struct AlignerOptions {
  int speaker_switch_context_size;  // NOTE(review): no default — callers must initialize
  int numBests = 20;                // number of best paths to extract
  int heapPruningTarget = 20;       // target heap size for PathHeap pruning
  int pr_threshold = 0;
  string symbols_filename = "";
  string composition_approach = "adapted";  // presumably "adapted" vs "standard" composition — confirm in main.cpp
  bool record_case_stats;           // NOTE(review): no default — callers must initialize
  bool levenstein_first_pass = false;
  int levenstein_maximum_error_streak = 100;
  float relative_beam_width = 50.0;  // beam for PathHeap::prune_relative
  bool strict_punctuation = true;
  std::unordered_set punctuation_ids;
  // Favored substitutions
  bool use_favored_substitutions = true;
  float favored_substitution_cost = 0.1f;
  std::vector favorable_substitution_map;  // Map ID -> favored partner ID (-1 if none)
};
return instance; 25 | } 26 | 27 | JsonLogger(JsonLogger const&) = delete; 28 | void operator=(JsonLogger const&) = delete; 29 | }; 30 | 31 | } // namespace jsonLogger 32 | 33 | #endif // __JSONLOGGING_H__ 34 | -------------------------------------------------------------------------------- /src/logging.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | logging.cpp 3 | JP Robichaud (jp@rev.com) 4 | 2018 5 | 6 | */ 7 | 8 | #include "logging.h" 9 | 10 | #include 11 | #include 12 | #include 13 | 14 | namespace logger { 15 | 16 | std::string CONSOLE_LOGGER_NAME = "console"; 17 | 18 | std::vector sinks; 19 | 20 | void InitLoggers(std::string logfilename) { 21 | sinks.push_back(std::make_shared()); 22 | if (logfilename.size() > 0) { 23 | auto filesink = std::make_shared(logfilename); 24 | sinks.push_back(filesink); 25 | } 26 | 27 | spdlog::set_level(spdlog::level::info); 28 | auto console = std::make_shared(CONSOLE_LOGGER_NAME, begin(sinks), end(sinks)); 29 | spdlog::register_logger(console); 30 | 31 | // auto console = spd::stdout_color_mt(CONSOLE_LOGGER_NAME); 32 | console->info("loggers initialized"); 33 | console->flush_on(spdlog::level::info); 34 | // in general, we can have the utc offset, but for stdout, let's be lean a bit 35 | spdlog::set_pattern("[%^+++%$] [%H:%M:%S %z] [thread %t] [%n] %v"); 36 | console->set_pattern("[%^+++%$] [%H:%M:%S] [%n] %v"); 37 | 38 | // todo : define extra loggers for individual components and read their levels 39 | // from a trc.cfg 40 | } 41 | 42 | std::shared_ptr GetOrCreateLogger(std::string name) { 43 | auto log = spdlog::get(name); 44 | 45 | if (log == nullptr) { 46 | // since we'll go to stdout, we'll avoid the utc offset 47 | // log = spdlog::stdout_color_mt(name); 48 | log = std::make_shared(name, begin(sinks), end(sinks)); 49 | spdlog::register_logger(log); 50 | log->flush_on(spdlog::level::info); 51 | log->set_pattern("[%^+++%$] [%H:%M:%S] [%n] %v"); 52 | } 53 | 54 | return log; 
55 | } 56 | 57 | std::shared_ptr GetLogger(std::string name) { 58 | auto log = spdlog::get(name); 59 | 60 | if (log == nullptr) { 61 | log = spdlog::get(CONSOLE_LOGGER_NAME); 62 | log->error( 63 | "The requested logger name [{}] wasn't found in the registery, using " 64 | "[{}] instead", 65 | name, CONSOLE_LOGGER_NAME); 66 | } 67 | 68 | return log; 69 | } 70 | 71 | void CloseLoggers() { 72 | // closing everything 73 | spdlog::drop_all(); 74 | } 75 | } // namespace logger 76 | -------------------------------------------------------------------------------- /src/logging.h: -------------------------------------------------------------------------------- 1 | /* 2 | logging.h 3 | JP Robichaud (jp@rev.com) 4 | 2018 5 | 6 | */ 7 | 8 | #ifndef __LOGGING_H__ 9 | #define __LOGGING_H__ 10 | 11 | #include 12 | 13 | #define HERE_FMT "{}:{:d}: " 14 | #define HERE2 __FILE__, __LINE__ 15 | #define HEREF2 __FUNCTION__, __LINE__ 16 | 17 | namespace logger { 18 | 19 | namespace spd = spdlog; 20 | 21 | void InitLoggers(std::string logfilename); 22 | std::shared_ptr GetOrCreateLogger(std::string name); 23 | std::shared_ptr GetLogger(std::string name); 24 | void CloseLoggers(); 25 | } // namespace logger 26 | 27 | #endif // __LOGGING_H__ 28 | -------------------------------------------------------------------------------- /src/utilities.cpp: -------------------------------------------------------------------------------- 1 | 2 | #include "utilities.h" 3 | 4 | #include 5 | 6 | // controlling the graph 7 | const std::string EPSILON = ""; 8 | const std::string INS = ""; 9 | const std::string DEL = ""; 10 | 11 | // for the sticthing process 12 | const std::string TK_GLOBAL_CLASS = "global"; 13 | 14 | // for fallback deletions 15 | const std::string NOOP = "!!noop-token!!"; 16 | 17 | void printFst(const fst::StdFst *fst, const fst::SymbolTable *symbol) { printFst("console", fst, symbol); } 18 | 19 | void printFst(std::string loggerName, const fst::StdFst *fst, const fst::SymbolTable *symbol) 
{ 20 | auto log = logger::GetOrCreateLogger(loggerName); 21 | if (log->should_log(spdlog::level::info)) { 22 | for (fst::StateIterator siter(*fst); !siter.Done(); siter.Next()) { 23 | fst::StdFst::StateId stateId = siter.Value(); 24 | float end_state_weight = fst->Final(stateId).Value(); 25 | 26 | for (fst::ArcIterator aiter(*fst, stateId); !aiter.Done(); aiter.Next()) { 27 | const fst::StdArc &arc = aiter.Value(); 28 | 29 | std::stringstream ss; 30 | std::stringstream ss1; 31 | ss << arc.ilabel << "/" << symbol->Find(arc.ilabel); 32 | std::string ilabel = ss.str(); 33 | 34 | ss1 << arc.olabel << "/" << symbol->Find(arc.olabel); 35 | std::string olabel = ss1.str(); 36 | 37 | log->info("{}\t{}\t{}\t{}\t{}", stateId, arc.nextstate, ilabel, olabel, arc.weight.Value()); 38 | } 39 | 40 | if (end_state_weight != numeric_limits::infinity() && end_state_weight != 0) { 41 | log->info("{}", stateId); 42 | } 43 | } 44 | } 45 | } 46 | 47 | template 48 | void splitString(const std::string &str, char delimiter, StringFunction f) { 49 | std::size_t from = 0; 50 | for (std::size_t i = 0; i < str.size(); ++i) { 51 | if (str[i] == delimiter) { 52 | f(str, from, i); 53 | from = i + 1; 54 | } 55 | } 56 | if (from <= str.size()) { 57 | f(str, from, str.size()); 58 | } 59 | } 60 | 61 | struct iequal { 62 | bool operator()(int c1, int c2) const { return std::toupper(c1) == std::toupper(c2); } 63 | }; 64 | 65 | bool iequals(const std::string &str1, const std::string &str2) { 66 | if (str1.size() != str2.size()) { 67 | return false; 68 | } 69 | 70 | if (str1 == str2) { 71 | return true; 72 | } 73 | 74 | return std::equal(str1.begin(), str1.end(), str2.begin(), iequal()); 75 | } 76 | 77 | // trim from start (in place) 78 | void ltrim(std::string &s) { 79 | s.erase(s.begin(), std::find_if(s.begin(), s.end(), [](int ch) { return !std::isspace(ch); })); 80 | } 81 | 82 | // trim from end (in place) 83 | void rtrim(std::string &s) { 84 | s.erase(std::find_if(s.rbegin(), s.rend(), [](int ch) { 
return !std::isspace(ch); }).base(), s.end()); 85 | } 86 | 87 | // trim from both ends (in place) 88 | void trim(std::string &s) { 89 | ltrim(s); 90 | rtrim(s); 91 | } 92 | 93 | // trim from both ends (copying) 94 | std::string trim_copy(std::string s) { 95 | trim(s); 96 | return s; 97 | } 98 | 99 | template 100 | Iter splitStringIter(const std::string &s, const std::string &delim, Iter out) { 101 | if (delim.empty()) { 102 | *out++ = s; 103 | return out; 104 | } 105 | size_t a = 0, b = s.find(delim); 106 | for (; b != std::string::npos; a = b + delim.length(), b = s.find(delim, a)) { 107 | *out++ = std::move(s.substr(a, b - a)); 108 | } 109 | *out++ = std::move(s.substr(a, s.length() - a)); 110 | return out; 111 | } 112 | 113 | std::string string_join(const std::vector &elements, const char *const separator) { 114 | switch (elements.size()) { 115 | case 0: 116 | return ""; 117 | case 1: 118 | return elements[0]; 119 | default: 120 | std::ostringstream os; 121 | std::copy(elements.begin(), elements.end() - 1, std::ostream_iterator(os, separator)); 122 | os << *elements.rbegin(); 123 | return os.str(); 124 | } 125 | } 126 | 127 | bool isValidNgram(const string &token) { 128 | if ((token.find(INS) != string::npos) || (token.find(DEL) != string::npos) || (token.find(EPSILON) != string::npos) || 129 | (token.find("___") != string::npos)) { 130 | return false; 131 | } else { 132 | return true; 133 | } 134 | } 135 | 136 | unordered_set get_bigrams(wer_alignment &topAlignment) { 137 | string bigram_ref = ""; 138 | string bigram_hyp = ""; 139 | unordered_set all_bigrams; 140 | vector bi_words; 141 | 142 | // Create a list of all tokens, flattening entity tokens 143 | vector> flattened_tokens; 144 | for (auto &tokens : topAlignment.tokens) { 145 | // handle entity labels 146 | if (isEntityLabel(tokens.first)) { 147 | auto class_label = tokens.first; 148 | 149 | for (auto &label_alignment : topAlignment.label_alignments) { 150 | if (label_alignment.classLabel == class_label) 
{ 151 | for (auto &labelTokens : label_alignment.tokens) { 152 | flattened_tokens.push_back(labelTokens); 153 | } 154 | } 155 | } 156 | } else { 157 | flattened_tokens.push_back(tokens); 158 | } 159 | } 160 | 161 | for (auto it = flattened_tokens.begin(); it != std::prev(flattened_tokens.end()); ++it) { 162 | bi_words = {it->first, std::next(it)->first}; 163 | bigram_ref = string_join(bi_words, " "); 164 | // cout << it - topAlignment->tokens.begin() << " : "<< bigram_ref << " (" << it->first << " " << 165 | // std::next(it)->first <<" )" << endl; 166 | if (isValidNgram(bigram_ref)) { 167 | topAlignment.ref_bigrams[bigram_ref] += 1; 168 | all_bigrams.insert(bigram_ref); 169 | } 170 | bi_words = {it->second, std::next(it)->second}; 171 | bigram_hyp = string_join(bi_words, " "); 172 | // cout << it - topAlignment->tokens.begin() << " : "<< bigram_hyp << " (" << it->second << " " << 173 | // std::next(it)->second <<" )" << endl; 174 | if (isValidNgram(bigram_hyp)) { 175 | topAlignment.hyp_bigrams[bigram_hyp] += 1; 176 | all_bigrams.insert(bigram_hyp); 177 | } 178 | 179 | topAlignment.bigram_tokens.push_back(std::make_pair(bigram_ref, bigram_hyp)); 180 | } 181 | return all_bigrams; 182 | } 183 | 184 | bool isEntityLabel(const string &token) { return token.find("___") == 0 ? true : false; } 185 | 186 | bool isSynonymLabel(const string &token) { 187 | // return token.find("___SYN-") == 0 ? true : false; 188 | return (token.find("___") == 0 && token.find("_SYN_") != std::string::npos) ? 
true : false; 189 | } 190 | 191 | bool IsNoisecodeToken(const string &token) { return token.find("<") == 0 && token.find(">") == token.length() - 1; } 192 | 193 | string getLabelIdFromToken(const string &token) { 194 | if (!isEntityLabel(token)) { 195 | return ""; 196 | } 197 | // Example label: ___0_CONTRACTION___ 198 | 199 | // Trim the ___ from the start and end of the label string 200 | auto label_id = token.substr(3, token.size() - 6); 201 | 202 | // Isolate the ID at the start of the label, separated by _ 203 | int p = label_id.find("_"); 204 | if (p > 0) { 205 | label_id = label_id.substr(0, p); 206 | } 207 | 208 | return label_id; 209 | } 210 | 211 | std::string GetEnv(const std::string &var, const std::string default_value) { 212 | const char *val = std::getenv(var.c_str()); 213 | if (val == nullptr) { // invalid to assign nullptr to std::string 214 | return default_value; 215 | } else { 216 | return val; 217 | } 218 | } 219 | 220 | // going from ___23_ORDINAL___ to ORDINAL 221 | string GetLabelNameFromClassLabel(string classLabel) { 222 | string label_id = classLabel.substr(3, classLabel.size() - 6); 223 | string label = label_id.substr(label_id.find("_") + 1); 224 | return label; 225 | } 226 | 227 | string GetClassLabel(string best_label) { 228 | if (best_label == "") { 229 | return ""; 230 | } 231 | 232 | string classlabel = string("___" + best_label + "___"); 233 | std::replace(classlabel.begin(), classlabel.end(), ':', '_'); 234 | return classlabel; 235 | } 236 | 237 | string UnicodeLowercase(string token) { 238 | icu::UnicodeString utoken = icu::UnicodeString::fromUTF8(token); 239 | std::string lower_cased; 240 | utoken.toLower().toUTF8String(lower_cased); 241 | return lower_cased; 242 | } 243 | 244 | bool EndsWithCaseInsensitive(const string &value, const string &ending) { 245 | if (ending.size() > value.size()) { 246 | return false; 247 | } 248 | return equal(ending.rbegin(), ending.rend(), value.rbegin(), 249 | [](const char a, const char b) { 
return tolower(a) == tolower(b); }); 250 | } 251 | -------------------------------------------------------------------------------- /src/utilities.h: -------------------------------------------------------------------------------- 1 | /* 2 | * utilities.h 3 | * 4 | * Created on: 2018-04-23 5 | * Author: JP Robichaud (jp@rev.com) 6 | */ 7 | 8 | #ifndef UTILITIES_H_ 9 | #define UTILITIES_H_ 10 | 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | 27 | #include 28 | 29 | #include "logging.h" 30 | // #include "FstLoader.h" 31 | 32 | #define quote(x) #x 33 | 34 | using namespace std; 35 | using namespace fst; 36 | 37 | extern const string EPSILON; // = "ε"; 38 | extern const string INS; // = "ε"; 39 | extern const string DEL; // = "ε"; 40 | 41 | extern const string TK_GLOBAL_CLASS; // "global"; 42 | extern const string NOOP; 43 | 44 | typedef struct wer_alignment wer_alignment; 45 | typedef struct wer_alignment *WERAp; 46 | typedef shared_ptr spWERA; 47 | 48 | typedef float precision_t; 49 | typedef float recall_t; 50 | 51 | typedef unordered_map bigrams; 52 | 53 | typedef struct gram_error_counter { 54 | int correct = 0; 55 | int del = 0; 56 | int subst_fp = 0; 57 | int subst_fn = 0; 58 | int ins = 0; 59 | precision_t precision = 0.0f; 60 | recall_t recall = 0.0f; 61 | gram_error_counter(int c, int d, int sfp, int sfn, int i) : correct(c), del(d), subst_fp(sfp), subst_fn(sfn), ins(i) {} 62 | } gram_error_counter; 63 | 64 | struct wer_alignment { 65 | string classLabel; 66 | // int numErrors; 67 | int insertions = 0; 68 | int deletions = 0; 69 | int substitutions = 0; 70 | int numWordsInReference = 0; 71 | int numWordsInHypothesis = 0; 72 | 73 | vector ref_words; 74 | vector hyp_words; 75 | 76 | // we could perhaps get rid of these using or in sub_words 77 | vector del_words; 78 | vector ins_words; 79 | 
vector> sub_words; 80 | 81 | precision_t precision; 82 | recall_t recall; 83 | // map> unigram_stats; 84 | vector> unigram_stats; 85 | 86 | // map, pair> 87 | vector> bigrams_stats; 88 | bigrams ref_bigrams; 89 | bigrams hyp_bigrams; 90 | vector> bigram_tokens; 91 | 92 | vector> tokens; 93 | vector label_alignments; 94 | int NumErrors() { return insertions + substitutions + deletions; } 95 | /* can return infinity if numWordsInReference == 0 and numWordsInHypothesis > 0 */ 96 | float WER() const { 97 | if (numWordsInReference > 0) { 98 | return (float)(insertions + deletions + substitutions) / (float)numWordsInReference; 99 | } 100 | 101 | if (numWordsInHypothesis > 0) { 102 | return numeric_limits::infinity(); 103 | } 104 | 105 | return 0; 106 | } 107 | 108 | void Reverse() { 109 | std::reverse(ref_words.begin(), ref_words.end()); 110 | std::reverse(hyp_words.begin(), hyp_words.end()); 111 | std::reverse(ins_words.begin(), ins_words.end()); 112 | std::reverse(del_words.begin(), del_words.end()); 113 | std::reverse(sub_words.begin(), sub_words.end()); 114 | std::reverse(tokens.begin(), tokens.end()); 115 | for (auto &a : label_alignments) { 116 | a.Reverse(); 117 | } 118 | } 119 | }; 120 | 121 | struct FstAlignOption { 122 | bool bForceEnterAndExit; 123 | 124 | float corCost; 125 | float insCost; 126 | float delCost; 127 | float subCost; 128 | 129 | string symEps; 130 | string symOov; 131 | string symIns; 132 | string symDel; 133 | string symSub; 134 | string symInaud; 135 | string symSil; 136 | string symUnk; 137 | 138 | int eps_idx; 139 | int oov_idx; 140 | int ins_idx; 141 | int del_idx; 142 | int sub_idx; 143 | int inaud_idx; 144 | int sil_idx; 145 | int unk_idx; 146 | 147 | FstAlignOption() 148 | : bForceEnterAndExit(false), 149 | corCost(0.0f), 150 | insCost(3.0f), 151 | delCost(3.0f), 152 | subCost(4.0f), 153 | symEps(""), 154 | symOov(""), 155 | symIns(""), 156 | symDel(""), 157 | symSub(""), 158 | symInaud(""), 159 | symSil(""), 160 | symUnk("") {} 161 | 
162 | void RegisterSymbols(fst::SymbolTable &symbol) { 163 | // int noSym = fst::kNoSymbol; 164 | int noSym = -1; 165 | eps_idx = symbol.Find(symEps); 166 | if (eps_idx == noSym) { 167 | eps_idx = symbol.AddSymbol(symEps); 168 | } 169 | 170 | oov_idx = symbol.Find(symOov); 171 | if (oov_idx == noSym) { 172 | oov_idx = symbol.AddSymbol(symOov); 173 | } 174 | 175 | ins_idx = symbol.Find(symIns); 176 | if (ins_idx == noSym) { 177 | ins_idx = symbol.AddSymbol(symIns); 178 | } 179 | 180 | del_idx = symbol.Find(symDel); 181 | if (del_idx == noSym) { 182 | del_idx = symbol.AddSymbol(symDel); 183 | } 184 | 185 | sub_idx = symbol.Find(symSub); 186 | if (sub_idx == noSym) { 187 | sub_idx = symbol.AddSymbol(symSub); 188 | } 189 | 190 | inaud_idx = symbol.Find(symInaud); 191 | if (inaud_idx == noSym) { 192 | inaud_idx = symbol.AddSymbol(symInaud); 193 | } 194 | 195 | sil_idx = symbol.Find(symSil); 196 | if (sil_idx == noSym) { 197 | sil_idx = symbol.AddSymbol(symSil); 198 | } 199 | 200 | unk_idx = symbol.Find(symUnk); 201 | if (unk_idx == noSym) { 202 | unk_idx = symbol.AddSymbol(symUnk); 203 | } 204 | } 205 | }; 206 | /* printing FST on the console (or the specified logger) */ 207 | void printFst(const fst::StdFst *fst, const fst::SymbolTable *symbol); 208 | void printFst(string loggerName, const fst::StdFst *fst, const fst::SymbolTable *symbol); 209 | 210 | // from StackOverflow : nice way to get a function call when delimiters on a 211 | // string are matched 212 | template 213 | void splitString(const string &str, char delimiter, StringFunction f); 214 | 215 | bool EndsWithCaseInsensitive(const string &value, const string &ending); 216 | bool iequals(const std::string &, const std::string &); 217 | 218 | // string manip 219 | void ltrim(std::string &s); 220 | void rtrim(std::string &s); 221 | void trim(std::string &s); 222 | std::string trim_copy(std::string s); 223 | 224 | template 225 | Iter splitStringIter(const std::string &s, const std::string &delim, Iter out); 226 | 
227 | std::string string_join(const std::vector &elements, const char *const separator); 228 | 229 | unordered_set get_bigrams(wer_alignment &topAlignment); 230 | bool isValidNgram(const string &token); 231 | bool isEntityLabel(const string &token); 232 | bool isSynonymLabel(const string &token); 233 | bool IsNoisecodeToken(const string &token); 234 | string getLabelIdFromToken(const string &token); 235 | std::string GetEnv(const std::string &var, const std::string default_value); 236 | 237 | // going from ___23_ORDINAL___ to ORDINAL 238 | string GetLabelNameFromClassLabel(string classLabel); 239 | 240 | string GetClassLabel(string best_label); 241 | 242 | string UnicodeLowercase(string token); 243 | 244 | #endif // UTILITIES_H_ 245 | -------------------------------------------------------------------------------- /src/version.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #define FSTALIGNER_VERSION_MAJOR 2 4 | #define FSTALIGNER_VERSION_MINOR 0 5 | #define FSTALIGNER_VERSION_PATCH 0 6 | -------------------------------------------------------------------------------- /src/wer.h: -------------------------------------------------------------------------------- 1 | /* 2 | * wer.h 3 | * 4 | * Collection of functions specific to the WER subcommand. 
5 | * 6 | * Quinn McNamara (quinn@rev.com) 7 | * 2021 8 | */ 9 | #include "fstalign.h" 10 | #include "json_logging.h" 11 | 12 | using namespace std; 13 | 14 | struct WerResult { 15 | int insertions; 16 | int deletions; 17 | int substitutions; 18 | int numWordsInReference; 19 | int numWordsInHypothesis; 20 | int NumErrors() { return insertions + substitutions + deletions; } 21 | /* can return infinity if numWordsInReference == 0 and numWordsInHypothesis > 0 */ 22 | float WER() { 23 | if (numWordsInReference > 0) { 24 | return (float)(insertions + deletions + substitutions) / (float)numWordsInReference; 25 | } 26 | 27 | if (numWordsInHypothesis > 0) { 28 | return numeric_limits::infinity(); 29 | } 30 | 31 | return -nanf(""); 32 | } 33 | }; 34 | 35 | vector GetSpeakerSwitchIndices(const vector& stitches); 36 | 37 | // These methods record different WER analyses to JSON 38 | void RecordWerResult(Json::Value &json, WerResult wr); 39 | void RecordWer(wer_alignment& topAlignment); 40 | void RecordSpeakerWer(const vector& stitches); 41 | void RecordSpeakerSwitchWer(const vector& stitches, int speaker_switch_context_size); 42 | void RecordSentenceWer(const vector& stitches); 43 | void RecordTagWer(const vector& stitches); 44 | void RecordCaseWer(const vector& aligned_stitches); 45 | 46 | // Adds PR metrics to topAlignment 47 | void CalculatePrecisionRecall(wer_alignment &topAlignment, int threshold); 48 | 49 | typedef vector> ErrorGroups; 50 | 51 | void AddErrorGroup(ErrorGroups &groups, size_t &line, string &ref, string &hyp); 52 | void WriteSbs(wer_alignment &topAlignment, const vector& stitches, string sbs_filename, const vector extra_ref_columns, const vector extra_hyp_columns); 53 | void JsonLogUnigramBigramStats(wer_alignment &topAlignment); 54 | -------------------------------------------------------------------------------- /test/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.5) 2 
| 3 | find_package(Threads REQUIRED) 4 | 5 | include_directories( 6 | ${CMAKE_INSTALL_PREFIX}/include 7 | ${FSTALIGN_INCLUDES} 8 | ${OPENFST_INCLUDES} 9 | ${PROJECT_SOURCE_DIR}/test 10 | ${PROJECT_SOURCE_DIR} 11 | ${CMAKE_DL_LIBS} 12 | ) 13 | 14 | link_libraries( 15 | ${OPENFST_LIBRARIES} 16 | fstaligner-common 17 | ) 18 | 19 | add_executable(fstalign_Test fstalign_Test.cc) 20 | target_link_libraries(fstalign_Test Threads::Threads) 21 | 22 | add_test(NAME fstalign_Test 23 | COMMAND $ 24 | WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}/build) 25 | 26 | add_executable(compose-tests compose-tests.cc) 27 | target_link_libraries(compose-tests Threads::Threads) 28 | 29 | add_test(NAME compose-tests 30 | COMMAND $ 31 | WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}/build) 32 | 33 | add_executable(fast-d-tests fast-d-tests.cc) 34 | target_link_libraries(fast-d-tests Threads::Threads) 35 | 36 | add_test(NAME fast-d-tests 37 | COMMAND $ 38 | WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}/build) 39 | -------------------------------------------------------------------------------- /test/compose-tests-utils.h: -------------------------------------------------------------------------------- 1 | 2 | #ifndef __COMPOSE_TEST_UTILITIES_H__ 3 | #define __COMPOSE_TEST_UTILITIES_H__ 1 4 | 5 | #include "src/OneBestFstLoader.h" 6 | #include "src/AdaptedComposition.h" 7 | #include "src/logging.h" 8 | 9 | // helper methods 10 | 11 | StdVectorFst GetFstFromString(SymbolTable *symbols, const std::string str) { 12 | OneBestFstLoader loader; 13 | loader.BuildFromString(str); 14 | loader.addToSymbolTable(*symbols); 15 | std::vector map; 16 | return loader.convertToFst(*symbols, map); 17 | } 18 | 19 | StdVectorFst GetStdFstA() { 20 | StdVectorFst a; 21 | a.AddState(); // 0 22 | a.AddState(); // 1 23 | a.AddState(); // 2 24 | a.AddState(); // 3 25 | 26 | a.SetStart(0); 27 | a.SetFinal((StateId)3, StdArc::Weight::One()); 28 | // Arc constructor args: ilabel, olabel, weight, dest state ID. 
29 | 30 | // 0 -> 1:1 -> 1 -> 2:2 -> 2 -> 3:3 31 | 32 | a.AddArc(0, StdArc(1, 1, 0, 1)); 33 | a.AddArc(1, StdArc(2, 2, 0, 2)); 34 | a.AddArc(2, StdArc(3, 3, 0, 3)); 35 | 36 | return a; 37 | } 38 | 39 | StdVectorFst GetStdFstB() { 40 | StdVectorFst a; 41 | a.AddState(); // 0 42 | a.AddState(); // 1 43 | a.AddState(); // 2 44 | a.AddState(); // 3 45 | 46 | a.SetStart(0); 47 | a.SetFinal((StateId)3, StdArc::Weight::One()); 48 | // Arc constructor args: ilabel, olabel, weight, dest state ID. 49 | 50 | // 0 -> 1:1 -> 1 -> 2:2 -> 2 -> 3:3 51 | 52 | a.AddArc(0, StdArc(1, 4, 0, 1)); 53 | a.AddArc(1, StdArc(2, 5, 0, 2)); 54 | a.AddArc(2, StdArc(3, 6, 0, 3)); 55 | 56 | return a; 57 | } 58 | 59 | 60 | #endif -------------------------------------------------------------------------------- /test/compose-tests.cc: -------------------------------------------------------------------------------- 1 | #define CATCH_CONFIG_MAIN 2 | #include 3 | #include 4 | #include "../third-party/catch2/single_include/catch2/catch.hpp" 5 | #include "compose-tests-utils.h" 6 | #include "test-utilties.h" 7 | 8 | using Catch::Matchers::Contains; 9 | 10 | #include "src/AdaptedComposition.h" 11 | #include "src/logging.h" 12 | 13 | // there just to setup the loggers 14 | TEST_CASE("STATIC_REQUIRE showcase", "[traits]") { 15 | logger::InitLoggers(""); 16 | STATIC_REQUIRE(std::is_void::value); 17 | STATIC_REQUIRE_FALSE(std::is_void::value); 18 | } 19 | 20 | // TODO: add degenerated case, where all words in CTM are or no words at all are available 21 | TEST_CASE("CheckEntity") { 22 | SECTION("synonyms") { 23 | REQUIRE(isSynonymLabel("___100000_SYN_1-1___")); 24 | REQUIRE(isEntityLabel("___100000_SYN_1-1___")); 25 | 26 | REQUIRE(isSynonymLabel("___90_CARDINAL___") == false); 27 | REQUIRE(isEntityLabel("___90_CARDINAL___")); 28 | REQUIRE(isEntityLabel("___90___")); 29 | REQUIRE(isEntityLabel("__90__") == false); 30 | 31 | REQUIRE(isSynonymLabel("___100000_syn_1-1___") == false); 32 | 
REQUIRE(isSynonymLabel("___100000SYN_1-1___") == false); 33 | } 34 | } 35 | TEST_CASE("composition()") { 36 | SECTION("simple1") { 37 | auto logger = logger::GetOrCreateLogger("simple1"); 38 | logger->info("starting"); 39 | 40 | fst::StdVectorFst a = GetStdFstA(); 41 | fst::StdVectorFst b = GetStdFstB(); 42 | AdaptedCompositionFst composer(a, b); 43 | 44 | REQUIRE(composer.Start() == 0); 45 | 46 | auto s = composer.Start(); 47 | vector arcs; 48 | bool ret_status = composer.TryGetArcsAtState(s, &arcs); 49 | 50 | REQUIRE(ret_status); 51 | REQUIRE(arcs.size() == 3); 52 | 53 | REQUIRE(true); 54 | } 55 | 56 | SECTION("perfect match") { 57 | auto logger = logger::GetOrCreateLogger("perfect match"); 58 | logger->info("starting"); 59 | 60 | SymbolTable symbols; 61 | symbols.AddSymbol(""); 62 | symbols.AddSymbol(""); 63 | symbols.AddSymbol(""); 64 | symbols.AddSymbol(""); 65 | 66 | auto a = GetFstFromString(&symbols, "this is a test"); 67 | auto b = GetFstFromString(&symbols, "this is a test"); 68 | 69 | logger->info("symbols has {} entries, fst has {} states", symbols.NumSymbols(), a.NumStates()); 70 | 71 | AdaptedCompositionFst composer(a, b); 72 | auto s = composer.Start(); 73 | REQUIRE(s == 0); 74 | 75 | // given that we have a match for each words, we should always have 1 arc per state and one composed state per pair 76 | // of input arcs (0,0) -> 0 (1,1) -> 1 (2,2) -> 2 (3,3) -> 3 77 | int current_state = s; 78 | for (int i = 0; i < 7; i++) { 79 | vector arcs_leaving_state; 80 | bool ret_status = composer.TryGetArcsAtState(current_state, &arcs_leaving_state); 81 | logger->info("({}) from state {}, we have {} arcs leaving with a ret-status {}", i, current_state, 82 | arcs_leaving_state.size(), ret_status); 83 | REQUIRE(ret_status); 84 | 85 | if (i == 6) { 86 | // final state 87 | REQUIRE(arcs_leaving_state.size() == 0); 88 | logger->info("({}) we expect composed state id {} to have a weight one One()", i, current_state); 89 | REQUIRE(composer.Final(current_state) == 
StdFst::Weight::One()); 90 | } else { 91 | if (i >= 4) { 92 | REQUIRE(arcs_leaving_state.size() == 1); 93 | } else { 94 | REQUIRE(arcs_leaving_state.size() == 3); 95 | } 96 | for (vector::iterator iter = arcs_leaving_state.begin(); iter != arcs_leaving_state.end(); ++iter) { 97 | const fst::StdArc arc = *iter; 98 | logger->info("({}) arc leaving state {} to {} with label {}/{} ({}/{})", i, current_state, arc.nextstate, 99 | arc.ilabel, arc.olabel, symbols.Find(arc.ilabel), symbols.Find(arc.olabel)); 100 | 101 | logger->info("({}) we expect composed state id {} to have a weight one Zero()", i, current_state); 102 | REQUIRE(composer.Final(current_state) == StdFst::Weight::Zero()); 103 | 104 | current_state = arc.nextstate; 105 | } 106 | } 107 | } 108 | } 109 | 110 | SECTION("deletion at the end") { 111 | auto logger = logger::GetOrCreateLogger("deletions"); 112 | logger->info("starting"); 113 | 114 | SymbolTable symbols; 115 | symbols.AddSymbol(""); 116 | symbols.AddSymbol(""); 117 | symbols.AddSymbol(""); 118 | symbols.AddSymbol(""); 119 | 120 | auto a = GetFstFromString(&symbols, "this is a test with some extra words at the end"); 121 | auto b = GetFstFromString(&symbols, "this is a test"); 122 | 123 | logger->info("symbols has {} entries, fst has {} states", symbols.NumSymbols(), a.NumStates()); 124 | 125 | AdaptedCompositionFst composer(a, b); 126 | auto s = composer.Start(); 127 | REQUIRE(s == 0); 128 | 129 | // given that we have a match for each words, we should always have 1 arc per state and one composed state per pair 130 | // of input arcs (0,0) -> 0 (1,1) -> 1 (2,2) -> 2 (3,3) -> 3 131 | int current_state = s; 132 | 133 | // The test here is to check that we can reach a final node where the word "end" is deleted. 
134 | std::queue states_to_process; 135 | std::set states_explored; 136 | 137 | states_to_process.push(s); 138 | 139 | vector arcs_leaving_state; 140 | bool found_deleted_end = false; 141 | while (states_to_process.size() > 0) { 142 | current_state = states_to_process.front(); 143 | states_to_process.pop(); 144 | 145 | if (states_explored.find(current_state) != states_explored.end()) { 146 | continue; 147 | } 148 | states_explored.insert(current_state); 149 | 150 | bool ret_status = composer.TryGetArcsAtState(current_state, &arcs_leaving_state); 151 | logger->info("from state {}, we have {} arcs leaving with a ret-status {}", current_state, 152 | arcs_leaving_state.size(), ret_status); 153 | REQUIRE(ret_status); 154 | 155 | for (vector::iterator iter = arcs_leaving_state.begin(); iter != arcs_leaving_state.end(); ++iter) { 156 | const fst::StdArc arc = *iter; 157 | logger->info("arc leaving state {} to {} with label {}/{} ({}/{})", current_state, arc.nextstate, arc.ilabel, 158 | arc.olabel, symbols.Find(arc.ilabel), symbols.Find(arc.olabel)); 159 | 160 | if (arc.nextstate != current_state && states_explored.find(arc.nextstate) == states_explored.end()) { 161 | states_to_process.push(arc.nextstate); 162 | } 163 | 164 | if (symbols.Find(arc.ilabel) == "end" && arc.olabel == 0) { 165 | found_deleted_end = true; 166 | } 167 | } 168 | } 169 | 170 | REQUIRE(found_deleted_end); 171 | } 172 | } 173 | -------------------------------------------------------------------------------- /test/data/align_1.aligned.punc_case.nlp: -------------------------------------------------------------------------------- 1 | token|speaker|ts|endTs|punctuation|prepunctuation|case|tags|wer_tags|oldTs|oldEndTs|ali_comment|confidence 2 | a|1|1.0000|2.0000|||CA|[]|[]|||sub(A)| 3 | b|1|3.0000|4.0000|||LC|[]|[]|||| 4 | c|1|5.0000|6.0000|||LC|[]|[]|||| 5 | d|1|7.0000|8.0000|,||LC|[]|[]|||| 6 | ,|1|7.0000|8.0000|||||[]|||| 7 | |1|9.0000|10.0000|.||LC|['0:FALLBACK']|[]|||| 8 | 
.|1|11.0000|12.0000|||||[]|||| 9 | e|1|||||LC|[]|[]|||del| 10 | f|1|13.0000|14.0000|||LC|[]|[]|||| 11 | g|1|15.0000|16.0000|||LC|[]|[]|||| 12 | h|1|17.0000|18.0000|||LC|[]|[]|||| 13 | |1|||,||LC|[]|[]|||del| 14 | ,|1|||||||[]|||del| 15 | i|1|21.0000|22.0000|||LC|[]|[]|||sub(I)| 16 | j|1|23.0000|24.0000|||LC|[]|[]|||sub(J)| 17 | -------------------------------------------------------------------------------- /test/data/align_1.hyp.ctm: -------------------------------------------------------------------------------- 1 | recording.wav 1 1 1 a 2 | recording.wav 1 3 1 b 3 | recording.wav 1 5 1 c 4 | recording.wav 1 7 1 d 5 | recording.wav 1 9 1 6 | recording.wav 1 11 1 e 7 | recording.wav 1 13 1 f 8 | recording.wav 1 15 1 g 9 | recording.wav 1 17 1 h 10 | recording.wav 1 21 1 i 11 | recording.wav 1 23 1 j -------------------------------------------------------------------------------- /test/data/align_1.hyp.punc_case.ctm: -------------------------------------------------------------------------------- 1 | recording.wav 1 1 1 A 2 | recording.wav 1 3 1 b 3 | recording.wav 1 5 1 c 4 | recording.wav 1 7 1 d 5 | recording.wav 1 7 1 , 6 | recording.wav 1 9 1 7 | recording.wav 1 11 1 e 8 | recording.wav 1 11 1 . 
9 | recording.wav 1 13 1 f 10 | recording.wav 1 15 1 g 11 | recording.wav 1 17 1 h 12 | recording.wav 1 21 1 I 13 | recording.wav 1 23 1 J 14 | -------------------------------------------------------------------------------- /test/data/align_1.norm.json: -------------------------------------------------------------------------------- 1 | { 2 | "0": { 3 | "candidates": [ 4 | { 5 | "probability": 1.0, 6 | "verbalization": [ 7 | "" 8 | ] 9 | } 10 | ], 11 | "class": "FALLBACK" 12 | } 13 | } -------------------------------------------------------------------------------- /test/data/align_1.ref.aligned.nlp: -------------------------------------------------------------------------------- 1 | token|speaker|ts|endTs|punctuation|prepunctuation|case|tags|wer_tags|oldTs|oldEndTs|ali_comment|confidence 2 | a|1|1.0000|2.0000|||CA|[]|[]|||| 3 | b|1|3.0000|4.0000|||LC|[]|[]|||| 4 | c|1|5.0000|6.0000|||LC|[]|[]|||| 5 | d|1|7.0000|8.0000|,||LC|[]|[]|||| 6 | |1|9.0000|10.0000|.||LC|['0:FALLBACK']|[]|||| 7 | e|1|11.0000|12.0000|||LC|[]|[]|||| 8 | f|1|13.0000|14.0000|||LC|[]|[]|||| 9 | g|1|15.0000|16.0000|||LC|[]|[]|||| 10 | h|1|17.0000|18.0000|||LC|[]|[]|||| 11 | |1|||,||LC|[]|[]|||del| 12 | i|1|21.0000|22.0000|||LC|[]|[]|||| 13 | j|1|23.0000|24.0000|||LC|[]|[]|||| 14 | -------------------------------------------------------------------------------- /test/data/align_1.ref.nlp: -------------------------------------------------------------------------------- 1 | token|speaker|ts|endTs|punctuation|case|tags 2 | a|1||||CA|[] 3 | b|1||||LC|[] 4 | c|1||||LC|[] 5 | d|1|||,|LC|[] 6 | |1|||.|LC|['0:FALLBACK'] 7 | e|1||||LC|[] 8 | f|1||||LC|[] 9 | g|1||||LC|[] 10 | h|1||||LC|[] 11 | |1|||,|LC|[] 12 | i|1||||LC|[] 13 | j|1||||LC|[] 14 | -------------------------------------------------------------------------------- /test/data/align_2.hyp.ctm: -------------------------------------------------------------------------------- 1 | recording.wav 1 1 1 a 2 | recording.wav 1 3 1 b 3 | recording.wav 1 5 
1 c 4 | recording.wav 1 7 1 d 5 | recording.wav 1 9 1 i'll 6 | recording.wav 1 11 1 shakespeare 7 | recording.wav 1 13 1 j 8 | recording.wav 1 15 1 k 9 | recording.wav 1 17 1 l 10 | -------------------------------------------------------------------------------- /test/data/align_2.norm.json: -------------------------------------------------------------------------------- 1 | { 2 | "0": { 3 | "candidates": [ 4 | { 5 | "probability": 0.5, 6 | "verbalization": [ 7 | "William Shakespeare" 8 | ] 9 | }, 10 | { 11 | "probability": 0.5, 12 | "verbalization": [ 13 | "Will Shakespeare" 14 | ] 15 | } 16 | ], 17 | "class": "NAME" 18 | } 19 | } -------------------------------------------------------------------------------- /test/data/align_2.ref.aligned.nlp: -------------------------------------------------------------------------------- 1 | token|speaker|ts|endTs|punctuation|prepunctuation|case|tags|wer_tags|oldTs|oldEndTs|ali_comment|confidence 2 | a|1|1.0000|2.0000|||CA|[]|[]|||| 3 | b|1|3.0000|4.0000|||LC|[]|[]|||| 4 | c|1|5.0000|6.0000|||LC|[]|[]|||| 5 | d|1|7.0000|8.0000|,||LC|[]|[]|||| 6 | i|1|9.0000|10.0000|||LC|[]|[]|||sub(i'll)| 7 | will|1|||||LC|['0:NAME']|[]|||del,direct| 8 | shakespeare|1|11.0000|12.0000|||LC|['0:NAME']|[]|||,direct| 9 | j|1|13.0000|14.0000|||LC|[]|[]|||| 10 | k|1|15.0000|16.0000|||LC|[]|[]|||| 11 | l|1|17.0000|18.0000|||LC|[]|[]|||| 12 | -------------------------------------------------------------------------------- /test/data/align_2.ref.aligned.std.nlp: -------------------------------------------------------------------------------- 1 | token|speaker|ts|endTs|punctuation|prepunctuation|case|tags|wer_tags|oldTs|oldEndTs|ali_comment|confidence 2 | a|1|1.0000|2.0000|||CA|[]|[]|||| 3 | b|1|3.0000|4.0000|||LC|[]|[]|||| 4 | c|1|5.0000|6.0000|||LC|[]|[]|||| 5 | d|1|7.0000|8.0000|,||LC|[]|[]|||| 6 | i|1|||||LC|[]|[]|||del| 7 | will|1|9.0000|10.0000|||LC|['0:NAME']|[]|||sub(i'll),direct| 8 | shakespeare|1|11.0000|12.0000|||LC|['0:NAME']|[]|||,direct| 9 
| j|1|13.0000|14.0000|||LC|[]|[]|||| 10 | k|1|15.0000|16.0000|||LC|[]|[]|||| 11 | l|1|17.0000|18.0000|||LC|[]|[]|||| 12 | -------------------------------------------------------------------------------- /test/data/align_2.ref.nlp: -------------------------------------------------------------------------------- 1 | token|speaker|ts|endTs|punctuation|case|tags 2 | a|1||||CA|[] 3 | b|1||||LC|[] 4 | c|1||||LC|[] 5 | d|1|||,|LC|[] 6 | i|1||||LC|[] 7 | will|1||||LC|['0:NAME'] 8 | shakespeare|1||||LC|['0:NAME'] 9 | j|1||||LC|[] 10 | k|1||||LC|[] 11 | l|1||||LC|[] 12 | -------------------------------------------------------------------------------- /test/data/align_3.hyp.ctm: -------------------------------------------------------------------------------- 1 | recording.wav 1 1 1 a 2 | recording.wav 1 3 1 b 3 | recording.wav 1 5 1 c 4 | recording.wav 1 7 1 d 5 | recording.wav 1 9 1 want 6 | recording.wav 1 10 1 to 7 | recording.wav 1 11 1 e 8 | recording.wav 1 13 1 f 9 | recording.wav 1 15 1 g 10 | recording.wav 1 17 1 h 11 | recording.wav 1 21 1 i 12 | recording.wav 1 23 1 j -------------------------------------------------------------------------------- /test/data/align_3.norm.json: -------------------------------------------------------------------------------- 1 | { 2 | "0": { 3 | "candidates": [ 4 | { 5 | "probability": 1.0, 6 | "verbalization": [ 7 | "dont use" 8 | ] 9 | } 10 | ], 11 | "class": "FALLBACK" 12 | } 13 | } -------------------------------------------------------------------------------- /test/data/align_3.ref.aligned.nlp: -------------------------------------------------------------------------------- 1 | token|speaker|ts|endTs|punctuation|prepunctuation|case|tags|wer_tags|oldTs|oldEndTs|ali_comment|confidence 2 | a|1|1.0000|2.0000|||CA|[]|[]|||| 3 | b|1|3.0000|4.0000|||LC|[]|[]|||| 4 | c|1|5.0000|6.0000|||LC|[]|[]|||| 5 | d|1|7.0000|8.0000|,||LC|[]|[]|||| 6 | want|1|9.0000|10.0000|||LC|['0:FALLBACK']|[]|||,push_last| 7 | 
to|1|10.0000|11.0000|.||LC|['0:FALLBACK']|[]|||,push_last| 8 | e|1|11.0000|12.0000|||LC|[]|[]|||| 9 | f|1|13.0000|14.0000|||LC|[]|[]|||| 10 | g|1|15.0000|16.0000|||LC|[]|[]|||| 11 | h|1|17.0000|18.0000|||LC|[]|[]|||| 12 | |1|||,||LC|[]|[]|||del| 13 | i|1|21.0000|22.0000|||LC|[]|[]|||| 14 | j|1|23.0000|24.0000|||LC|[]|[]|||| 15 | -------------------------------------------------------------------------------- /test/data/align_3.ref.nlp: -------------------------------------------------------------------------------- 1 | token|speaker|ts|endTs|punctuation|case|tags 2 | a|1||||CA|[] 3 | b|1||||LC|[] 4 | c|1||||LC|[] 5 | d|1|||,|LC|[] 6 | wanna|1|||.|LC|['0:FALLBACK'] 7 | e|1||||LC|[] 8 | f|1||||LC|[] 9 | g|1||||LC|[] 10 | h|1||||LC|[] 11 | |1|||,|LC|[] 12 | i|1||||LC|[] 13 | j|1||||LC|[] 14 | -------------------------------------------------------------------------------- /test/data/align_4.hyp1.ctm: -------------------------------------------------------------------------------- 1 | recording.wav 1 1 1 a 2 | recording.wav 1 3 1 b 3 | recording.wav 1 5 1 c 4 | recording.wav 1 7 1 d 5 | recording.wav 1 9 1 nineteen 6 | recording.wav 1 10 1 uh 7 | recording.wav 1 11 1 eighty 8 | recording.wav 1 12 1 eight 9 | recording.wav 1 13 1 e 10 | recording.wav 1 14 1 f 11 | recording.wav 1 15 1 g 12 | recording.wav 1 17 1 h 13 | recording.wav 1 21 1 i 14 | recording.wav 1 23 1 j -------------------------------------------------------------------------------- /test/data/align_4.hyp2.ctm: -------------------------------------------------------------------------------- 1 | recording.wav 1 1 1 a 2 | recording.wav 1 3 1 b 3 | recording.wav 1 5 1 c 4 | recording.wav 1 7 1 d 5 | recording.wav 1 9 1 nineteen 6 | recording.wav 1 11 1 eighty 7 | recording.wav 1 13 1 e 8 | recording.wav 1 14 1 f 9 | recording.wav 1 15 1 g 10 | recording.wav 1 17 1 h 11 | recording.wav 1 21 1 i 12 | recording.wav 1 23 1 j -------------------------------------------------------------------------------- 
/test/data/align_4.norm.json: -------------------------------------------------------------------------------- 1 | { 2 | "0": { 3 | "candidates": [ 4 | { 5 | "probability": 1.0, 6 | "verbalization": [ 7 | "nineteen", 8 | "eighty", 9 | "eight" 10 | ] 11 | } 12 | ], 13 | "class": "CARDINAL" 14 | } 15 | } -------------------------------------------------------------------------------- /test/data/align_4.ref.aligned1.nlp: -------------------------------------------------------------------------------- 1 | token|speaker|ts|endTs|punctuation|prepunctuation|case|tags|wer_tags|oldTs|oldEndTs|ali_comment|confidence 2 | a|1|1.0000|2.0000|||CA|[]|[]|||| 3 | b|1|3.0000|4.0000|||LC|[]|[]|||| 4 | c|1|5.0000|6.0000|||LC|[]|[]|||| 5 | d|1|7.0000|8.0000|,||LC|[]|[]|||| 6 | nineteen|1|9.0000|10.0000|||LC|['0:CARDINAL']|[]|||,push_last| 7 | eighty|1|11.0000|12.0000|||LC|['0:CARDINAL']|[]|||,push_last| 8 | eight|1|12.0000|13.0000|||LC|['0:CARDINAL']|[]|||,push_last| 9 | e|1|13.0000|14.0000|||LC|[]|[]|||| 10 | f|1|14.0000|15.0000|||LC|[]|[]|||| 11 | g|1|15.0000|16.0000|||LC|[]|[]|||| 12 | h|1|17.0000|18.0000|||LC|[]|[]|||| 13 | i|1|21.0000|22.0000|||LC|[]|[]|||| 14 | j|1|23.0000|24.0000|||LC|[]|[]|||| 15 | -------------------------------------------------------------------------------- /test/data/align_4.ref.aligned2.nlp: -------------------------------------------------------------------------------- 1 | token|speaker|ts|endTs|punctuation|prepunctuation|case|tags|wer_tags|oldTs|oldEndTs|ali_comment|confidence 2 | a|1|1.0000|2.0000|||CA|[]|[]|||| 3 | b|1|3.0000|4.0000|||LC|[]|[]|||| 4 | c|1|5.0000|6.0000|||LC|[]|[]|||| 5 | d|1|7.0000|8.0000|,||LC|[]|[]|||| 6 | nineteen|1|9.0000|10.0000|||LC|['0:CARDINAL']|[]|||,push_last| 7 | eighty|1|11.0000|12.0000|||LC|['0:CARDINAL']|[]|||,push_last| 8 | eight|1|||||LC|['0:CARDINAL']|[]|||del,push_last| 9 | e|1|13.0000|14.0000|||LC|[]|[]|||| 10 | f|1|14.0000|15.0000|||LC|[]|[]|||| 11 | g|1|15.0000|16.0000|||LC|[]|[]|||| 12 | 
h|1|17.0000|18.0000|||LC|[]|[]|||| 13 | i|1|21.0000|22.0000|||LC|[]|[]|||| 14 | j|1|23.0000|24.0000|||LC|[]|[]|||| 15 | -------------------------------------------------------------------------------- /test/data/align_4.ref.nlp: -------------------------------------------------------------------------------- 1 | token|speaker|ts|endTs|punctuation|case|tags 2 | a|1||||CA|[] 3 | b|1||||LC|[] 4 | c|1||||LC|[] 5 | d|1|||,|LC|[] 6 | 1988|1||||LC|['0:CARDINAL'] 7 | e|1||||LC|[] 8 | f|1||||LC|[] 9 | g|1||||LC|[] 10 | h|1||||LC|[] 11 | i|1||||LC|[] 12 | j|1||||LC|[] 13 | -------------------------------------------------------------------------------- /test/data/align_5.hyp1.ctm: -------------------------------------------------------------------------------- 1 | recording.wav 1 1 1 a 2 | recording.wav 1 3 1 b 3 | recording.wav 1 5 1 c 4 | recording.wav 1 7 1 d 5 | recording.wav 1 9 1 want 6 | recording.wav 1 10 1 uh 7 | recording.wav 1 11 1 to 8 | recording.wav 1 13 1 e 9 | recording.wav 1 14 1 f 10 | recording.wav 1 15 1 g 11 | recording.wav 1 17 1 h 12 | recording.wav 1 21 1 i 13 | recording.wav 1 23 1 j -------------------------------------------------------------------------------- /test/data/align_5.hyp2.ctm: -------------------------------------------------------------------------------- 1 | recording.wav 1 1 1 a 2 | recording.wav 1 3 1 b 3 | recording.wav 1 5 1 c 4 | recording.wav 1 7 1 d 5 | recording.wav 1 9 1 want 6 | recording.wav 1 13 1 e 7 | recording.wav 1 14 1 f 8 | recording.wav 1 15 1 g 9 | recording.wav 1 17 1 h 10 | recording.wav 1 21 1 i 11 | recording.wav 1 23 1 j -------------------------------------------------------------------------------- /test/data/align_5.ref.aligned1-2.nlp: -------------------------------------------------------------------------------- 1 | token|speaker|ts|endTs|punctuation|prepunctuation|case|tags|wer_tags|oldTs|oldEndTs|ali_comment|confidence 2 | a|1|1.0000|2.0000|||CA|[]|[]|||| 3 | b|1|3.0000|4.0000|||LC|[]|[]|||| 4 | 
c|1|5.0000|6.0000|||LC|[]|[]|||| 5 | d|1|7.0000|8.0000|,||LC|[]|[]|||| 6 | wanna|1|11.0000|12.0000|||LC|[]|[]|||sub(to)| 7 | e|1|13.0000|14.0000|||LC|[]|[]|||| 8 | f|1|14.0000|15.0000|||LC|[]|[]|||| 9 | g|1|15.0000|16.0000|||LC|[]|[]|||| 10 | h|1|17.0000|18.0000|||LC|[]|[]|||| 11 | i|1|21.0000|22.0000|||LC|[]|[]|||| 12 | j|1|23.0000|24.0000|||LC|[]|[]|||| 13 | -------------------------------------------------------------------------------- /test/data/align_5.ref.aligned1.nlp: -------------------------------------------------------------------------------- 1 | token|speaker|ts|endTs|punctuation|prepunctuation|case|tags|wer_tags|oldTs|oldEndTs|ali_comment|confidence 2 | a|1|1.0000|2.0000|||CA|[]|[]|||| 3 | b|1|3.0000|4.0000|||LC|[]|[]|||| 4 | c|1|5.0000|6.0000|||LC|[]|[]|||| 5 | d|1|7.0000|8.0000|,||LC|[]|[]|||| 6 | want|1|9.0000|10.0000|||LC|[]|[]|||,push_last| 7 | to|1|11.0000|12.0000|||LC|[]|[]|||,push_last| 8 | e|1|13.0000|14.0000|||LC|[]|[]|||| 9 | f|1|14.0000|15.0000|||LC|[]|[]|||| 10 | g|1|15.0000|16.0000|||LC|[]|[]|||| 11 | h|1|17.0000|18.0000|||LC|[]|[]|||| 12 | i|1|21.0000|22.0000|||LC|[]|[]|||| 13 | j|1|23.0000|24.0000|||LC|[]|[]|||| 14 | -------------------------------------------------------------------------------- /test/data/align_5.ref.aligned2-a2.nlp: -------------------------------------------------------------------------------- 1 | token|speaker|ts|endTs|punctuation|prepunctuation|case|tags|wer_tags|oldTs|oldEndTs|ali_comment|confidence 2 | a|1|1.0000|2.0000|||CA|[]|[]|||| 3 | b|1|3.0000|4.0000|||LC|[]|[]|||| 4 | c|1|5.0000|6.0000|||LC|[]|[]|||| 5 | d|1|7.0000|8.0000|,||LC|[]|[]|||| 6 | wanna|1|9.0000|10.0000|||LC|[]|[]|||sub(want)| 7 | e|1|13.0000|14.0000|||LC|[]|[]|||| 8 | f|1|14.0000|15.0000|||LC|[]|[]|||| 9 | g|1|15.0000|16.0000|||LC|[]|[]|||| 10 | h|1|17.0000|18.0000|||LC|[]|[]|||| 11 | i|1|21.0000|22.0000|||LC|[]|[]|||| 12 | j|1|23.0000|24.0000|||LC|[]|[]|||| 13 | 
-------------------------------------------------------------------------------- /test/data/align_5.ref.aligned2.nlp: -------------------------------------------------------------------------------- 1 | token|speaker|ts|endTs|punctuation|prepunctuation|case|tags|wer_tags|oldTs|oldEndTs|ali_comment|confidence 2 | a|1|1.0000|2.0000|||CA|[]|[]|||| 3 | b|1|3.0000|4.0000|||LC|[]|[]|||| 4 | c|1|5.0000|6.0000|||LC|[]|[]|||| 5 | d|1|7.0000|8.0000|,||LC|[]|[]|||| 6 | want|1|9.0000|10.0000|||LC|[]|[]|||,push_last| 7 | to|1|||||LC|[]|[]|||del,push_last| 8 | e|1|13.0000|14.0000|||LC|[]|[]|||| 9 | f|1|14.0000|15.0000|||LC|[]|[]|||| 10 | g|1|15.0000|16.0000|||LC|[]|[]|||| 11 | h|1|17.0000|18.0000|||LC|[]|[]|||| 12 | i|1|21.0000|22.0000|||LC|[]|[]|||| 13 | j|1|23.0000|24.0000|||LC|[]|[]|||| 14 | -------------------------------------------------------------------------------- /test/data/align_5.ref.nlp: -------------------------------------------------------------------------------- 1 | token|speaker|ts|endTs|punctuation|case|tags 2 | a|1||||CA|[] 3 | b|1||||LC|[] 4 | c|1||||LC|[] 5 | d|1|||,|LC|[] 6 | wanna|1||||LC|[] 7 | e|1||||LC|[] 8 | f|1||||LC|[] 9 | g|1||||LC|[] 10 | h|1||||LC|[] 11 | i|1||||LC|[] 12 | j|1||||LC|[] 13 | -------------------------------------------------------------------------------- /test/data/align_6.hyp.ctm: -------------------------------------------------------------------------------- 1 | recording.wav 1 1 1 a 2 | recording.wav 1 3 1 b 3 | recording.wav 1 5 1 c 4 | recording.wav 1 7 1 d 5 | recording.wav 1 9 1 want 6 | recording.wav 1 10 1 uh 7 | recording.wav 1 11 1 to 8 | recording.wav 1 13 1 e 9 | recording.wav 1 14 1 f 10 | recording.wav 1 15 1 g 11 | recording.wav 1 17 1 h 12 | recording.wav 1 21 1 i 13 | recording.wav 1 23 1 j -------------------------------------------------------------------------------- /test/data/align_6.ref.aligned.nlp: -------------------------------------------------------------------------------- 1 | 
token|speaker|ts|endTs|punctuation|prepunctuation|case|tags|wer_tags|oldTs|oldEndTs|ali_comment|confidence 2 | a|1|1.0000|2.0000||...|CA|[]|[]|||| 3 | b|1|3.0000|4.0000|||LC|[]|[]|||| 4 | c|1|5.0000|6.0000|||LC|[]|[]|||| 5 | d|1|7.0000|8.0000|,||LC|[]|[]|||| 6 | wanna|1|11.0000|12.0000|||LC|[]|[]|||sub(to)| 7 | e|1|13.0000|14.0000|||LC|[]|[]|||| 8 | f|1|14.0000|15.0000||!|LC|[]|[]|||| 9 | g|1|15.0000|16.0000|||LC|[]|[]|||| 10 | h|1|17.0000|18.0000|||LC|[]|[]|||| 11 | i|1|21.0000|22.0000|||LC|[]|[]|||| 12 | j|1|23.0000|24.0000|||LC|[]|[]|||| 13 | -------------------------------------------------------------------------------- /test/data/align_6.ref.nlp: -------------------------------------------------------------------------------- 1 | token|speaker|ts|endTs|punctuation|prepunctuation|case|tags 2 | a|1||||...|CA|[] 3 | b|1|||||LC|[] 4 | c|1|||||LC|[] 5 | d|1|||,||LC|[] 6 | wanna|1|||||LC|[] 7 | e|1|||||LC|[] 8 | f|1||||!|LC|[] 9 | g|1|||||LC|[] 10 | h|1|||||LC|[] 11 | i|1|||||LC|[] 12 | j|1|||||LC|[] 13 | -------------------------------------------------------------------------------- /test/data/empty.hyp.ctm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/revdotcom/fstalign/82dec1dabec06a53b7f3c28b51dcd1b0a2dd2f1d/test/data/empty.hyp.ctm -------------------------------------------------------------------------------- /test/data/empty.hyp.nlp: -------------------------------------------------------------------------------- 1 | token|speaker|ts|endTs|punctuation|case|tags 2 | -------------------------------------------------------------------------------- /test/data/empty.hyp.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/revdotcom/fstalign/82dec1dabec06a53b7f3c28b51dcd1b0a2dd2f1d/test/data/empty.hyp.txt -------------------------------------------------------------------------------- /test/data/empty.ref.txt: 
-------------------------------------------------------------------------------- 1 | not really empty 2 | -------------------------------------------------------------------------------- /test/data/fstalign-50.hyp.txt: -------------------------------------------------------------------------------- 1 | su capital es la ciudad de pau 2 | -------------------------------------------------------------------------------- /test/data/fstalign-50.new.sbs.txt: -------------------------------------------------------------------------------- 1 | ref_token hyp_token IsErr Class Wer_Tag_Entities 2 | su su 3 | capital capital 4 | es es 5 | la la 6 | ciudad ciudad 7 | de de 8 | palu pau ERR 9 | ------------------------------------------------------------ 10 | Line Group 11 | 8 palu <-> pau 12 | -------------------------------------------------------------------------------- /test/data/fstalign-50.ref.txt: -------------------------------------------------------------------------------- 1 | su capital es la ciudad de palu 2 | -------------------------------------------------------------------------------- /test/data/noise.hyp1.ctm: -------------------------------------------------------------------------------- 1 | recording.wav 1 1 1 a 2 | recording.wav 1 3 1 b 3 | recording.wav 1 5 1 c 4 | recording.wav 1 7 1 d 5 | recording.wav 1 11 1 e 6 | recording.wav 1 13 1 f 7 | recording.wav 1 15 1 g 8 | recording.wav 1 17 1 h 9 | recording.wav 1 21 1 i 10 | recording.wav 1 23 1 j 11 | -------------------------------------------------------------------------------- /test/data/noise.hyp2.ctm: -------------------------------------------------------------------------------- 1 | recording.wav 1 1 1 a 2 | recording.wav 1 3 1 b 3 | recording.wav 1 5 1 c 4 | recording.wav 1 7 1 d 5 | recording.wav 1 9 1 6 | recording.wav 1 11 1 e 7 | recording.wav 1 13 1 f 8 | recording.wav 1 15 1 g 9 | recording.wav 1 17 1 h 10 | recording.wav 1 19 1 11 | recording.wav 1 21 1 i 12 | recording.wav 1 23 1 j 13 | 
-------------------------------------------------------------------------------- /test/data/noise_1.hyp1.aligned: -------------------------------------------------------------------------------- 1 | token|speaker|ts|endTs|punctuation|prepunctuation|case|tags|wer_tags|oldTs|oldEndTs|ali_comment|confidence 2 | a|1|1.0000|2.0000|||CA|[]|[]|||| 3 | b|1|3.0000|4.0000|||LC|[]|[]|||| 4 | c|1|5.0000|6.0000|||LC|[]|[]|||| 5 | d|1|7.0000|8.0000|,||LC|[]|[]|||| 6 | |1|||,||LC|[]|[]|||del| 7 | e|1|11.0000|12.0000|||LC|[]|[]|||| 8 | F|1|13.0000|14.0000|||LC|[]|[]|||| 9 | G|1|15.0000|16.0000|||LC|[]|[]|||| 10 | h|1|17.0000|18.0000|||LC|[]|[]|||| 11 | |1|||,||LC|[]|[]|||del| 12 | i|1|21.0000|22.0000|||LC|[]|[]|||| 13 | j|1|23.0000|24.0000|||LC|[]|[]|||| 14 | -------------------------------------------------------------------------------- /test/data/noise_1.hyp2.aligned: -------------------------------------------------------------------------------- 1 | token|speaker|ts|endTs|punctuation|prepunctuation|case|tags|wer_tags|oldTs|oldEndTs|ali_comment|confidence 2 | a|1|1.0000|2.0000|||CA|[]|[]|||| 3 | b|1|3.0000|4.0000|||LC|[]|[]|||| 4 | c|1|5.0000|6.0000|||LC|[]|[]|||| 5 | d|1|7.0000|8.0000|,||LC|[]|[]|||| 6 | |1|9.0000|10.0000|,||LC|[]|[]|||| 7 | e|1|11.0000|12.0000|||LC|[]|[]|||| 8 | F|1|13.0000|14.0000|||LC|[]|[]|||| 9 | G|1|15.0000|16.0000|||LC|[]|[]|||| 10 | h|1|17.0000|18.0000|||LC|[]|[]|||| 11 | |1|19.0000|20.0000|,||LC|[]|[]|||| 12 | i|1|21.0000|22.0000|||LC|[]|[]|||| 13 | j|1|23.0000|24.0000|||LC|[]|[]|||| 14 | -------------------------------------------------------------------------------- /test/data/noise_1.ref.nlp: -------------------------------------------------------------------------------- 1 | token|speaker|ts|endTs|punctuation|case|tags 2 | a|1||||CA|[] 3 | b|1||||LC|[] 4 | c|1||||LC|[] 5 | d|1|||,|LC|[] 6 | |1|||,|LC|[] 7 | e|1||||LC|[] 8 | F|1||||LC|[] 9 | G|1||||LC|[] 10 | h|1||||LC|[] 11 | |1|||,|LC|[] 12 | i|1||||LC|[] 13 | j|1||||LC|[] 14 | 
-------------------------------------------------------------------------------- /test/data/oracle_1.hyp.fst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/revdotcom/fstalign/82dec1dabec06a53b7f3c28b51dcd1b0a2dd2f1d/test/data/oracle_1.hyp.fst -------------------------------------------------------------------------------- /test/data/oracle_1.ref.txt: -------------------------------------------------------------------------------- 1 | this is a test and a very good one 2 | -------------------------------------------------------------------------------- /test/data/oracle_1.symbols.txt: -------------------------------------------------------------------------------- 1 | 0 2 | 1 3 | 2 4 | 3 5 | 4 6 | 6 7 | 5 8 | 7 9 | this 8 10 | these 9 11 | is 10 12 | his 11 13 | a 12 14 | isa 13 15 | test 14 16 | and 15 17 | very 16 18 | good 17 19 | one 18 20 | -------------------------------------------------------------------------------- /test/data/short.aligned.case.nlp: -------------------------------------------------------------------------------- 1 | token|speaker|ts|endTs|punctuation|prepunctuation|case|tags|wer_tags|oldTs|oldEndTs|ali_comment|confidence 2 | |2|0.0000|0.0000|||LC|[]|[]|||| 3 | Yeah|1|0.0000|0.0000|,||UC|[]|[]|||| 4 | yeah|1|||,||LC|[]|[]|||del| 5 | right|1|0.0000|0.0000|.||LC|[]|[]|||| 6 | Yeah|1|0.0000|0.0000|,||UC|[]|[]|||sub(I'll)| 7 | all|1|0.0000|0.0000|||LC|[]|[]|||sub(do)| 8 | right|1|||,||LC|[]|[]|||del| 9 | probably|1|||||LC|[]|[]|||del| 10 | just|1|0.0000|0.0000|||LC|[]|[]|||| 11 | that|1|0.0000|0.0000|.||LC|[]|[]|||| 12 | Are|3|0.0000|0.0000|||UC|[]|[]|||| 13 | there|3|0.0000|0.0000|||LC|[]|[]|||| 14 | any|3|0.0000|0.0000|||LC|[]|[]|||| 15 | visuals|3|0.0000|0.0000|||LC|[]|[]|||| 16 | that|3|0.0000|0.0000|||LC|[]|[]|||| 17 | come|3|0.0000|0.0000|||LC|[]|[]|||| 18 | to|3|0.0000|0.0000|||LC|[]|[]|||| 19 | mind|3|0.0000|0.0000|||LC|[]|[]|||| 20 | or|3|0.0000|0.0000|||LC|[]|[]|||| 
21 | Yeah|1|0.0000|0.0000|,||UC|[]|[]|||| 22 | sure|1|0.0000|0.0000|.||LC|[]|[]|||| 23 | When|1|0.0000|0.0000|||UC|[]|[]|||| 24 | I|1|0.0000|0.0000|||CA|[]|[]|||| 25 | hear|1|0.0000|0.0000|||LC|[]|[]|||| 26 | Foobar|1|0.0000|0.0000|,||UC|[]|['1', '2']|||| 27 | I|1|0.0000|0.0000|||CA|[]|[]|||| 28 | think|1|0.0000|0.0000|||LC|[]|[]|||| 29 | about|1|0.0000|0.0000|||LC|[]|[]|||| 30 | just|1|0.0000|0.0000|||LC|[]|[]|||| 31 | that|1|0.0000|0.0000|:||LC|[]|[]|||| 32 | foo|1|0.0000|0.0000|||LC|[]|[]|||sub(Foobar)| 33 | a|1|0.0000|0.0000|||LC|[]|[]|||| 34 | -------------------------------------------------------------------------------- /test/data/short.aligned.nlp: -------------------------------------------------------------------------------- 1 | token|speaker|ts|endTs|punctuation|prepunctuation|case|tags|wer_tags|oldTs|oldEndTs|ali_comment|confidence 2 | |2|0.0000|0.0000|||LC|[]|[]|||| 3 | Yeah|1|||,||UC|[]|[]|||del| 4 | yeah|1|0.0000|0.0000|,||LC|[]|[]|||| 5 | right|1|0.0000|0.0000|.||LC|[]|[]|||| 6 | Yeah|1|||,||UC|[]|[]|||del| 7 | alright|1|0.0000|0.0000|,||LC|[]|[]|||sub(i'll),split_worst| 8 | probably|1|0.0000|0.0000|||LC|[]|[]|||sub(do)| 9 | just|1|0.0000|0.0000|||LC|[]|[]|||| 10 | that|1|0.0000|0.0000|.||LC|[]|[]|||| 11 | Are|3|0.0000|0.0000|||UC|[]|[]|||| 12 | there|3|0.0000|0.0000|||LC|[]|[]|||| 13 | any|3|0.0000|0.0000|||LC|[]|[]|||| 14 | visuals|3|0.0000|0.0000|||LC|[]|[]|||| 15 | that|3|0.0000|0.0000|||LC|[]|[]|||| 16 | come|3|0.0000|0.0000|||LC|[]|[]|||| 17 | to|3|0.0000|0.0000|||LC|[]|[]|||| 18 | mind|3|0.0000|0.0000|||LC|[]|[]|||| 19 | or|3|0.0000|0.0000|||LC|[]|[]|||| 20 | Yeah|1|0.0000|0.0000|,||UC|[]|[]|||| 21 | sure|1|0.0000|0.0000|.||LC|[]|[]|||| 22 | When|1|0.0000|0.0000|||UC|[]|[]|||| 23 | I|1|0.0000|0.0000|||CA|[]|[]|||| 24 | hear|1|0.0000|0.0000|||LC|[]|[]|||| 25 | Foobar|1|0.0000|0.0000|,||UC|[]|[]|||| 26 | I|1|0.0000|0.0000|||CA|[]|[]|||| 27 | think|1|0.0000|0.0000|||LC|[]|[]|||| 28 | about|1|0.0000|0.0000|||LC|[]|[]|||| 29 | 
just|1|0.0000|0.0000|||LC|[]|[]|||| 30 | that|1|0.0000|0.0000|:||LC|[]|[]|||| 31 | foo|1|0.0000|0.0000|||LC|[]|[]|||sub(foobar)| 32 | a|1|0.0000|0.0000|||LC|[]|[]|||| 33 | -------------------------------------------------------------------------------- /test/data/short.aligned.punc.nlp: -------------------------------------------------------------------------------- 1 | token|speaker|ts|endTs|punctuation|prepunctuation|case|tags|wer_tags|oldTs|oldEndTs|ali_comment|confidence 2 | |2|0.0000|0.0000|||LC|[]|[]|||| 3 | Yeah|1|0.0000|0.0000|,||UC|[]|[]|||| 4 | ,|1|0.0000|0.0000|||||[]|||| 5 | yeah|1|||,||LC|[]|[]|||del| 6 | ,|1|||||||[]|||del| 7 | right|1|0.0000|0.0000|.||LC|[]|[]|||| 8 | .|1|||||||[]|||del| 9 | Yeah|1|0.0000|0.0000|,||UC|[]|[]|||sub(i'll)| 10 | ,|1|||||||[]|||del| 11 | all|1|0.0000|0.0000|||LC|[]|[]|||sub(do)| 12 | right|1|||,||LC|[]|[]|||del| 13 | ,|1|||||||[]|||del| 14 | probably|1|||||LC|[]|[]|||del| 15 | just|1|0.0000|0.0000|||LC|[]|[]|||| 16 | that|1|0.0000|0.0000|.||LC|[]|[]|||| 17 | .|1|0.0000|0.0000|||||[]|||sub(?)| 18 | Are|3|0.0000|0.0000|||UC|[]|[]|||| 19 | there|3|0.0000|0.0000|||LC|[]|[]|||| 20 | any|3|0.0000|0.0000|||LC|[]|[]|||| 21 | visuals|3|0.0000|0.0000|||LC|[]|[]|||| 22 | that|3|0.0000|0.0000|||LC|[]|[]|||| 23 | come|3|0.0000|0.0000|||LC|[]|[]|||| 24 | to|3|0.0000|0.0000|||LC|[]|[]|||| 25 | mind|3|0.0000|0.0000|||LC|[]|[]|||| 26 | or|3|0.0000|0.0000|||LC|[]|[]|||| 27 | Yeah|1|0.0000|0.0000|,||UC|[]|[]|||| 28 | ,|1|0.0000|0.0000|||||[]|||| 29 | sure|1|0.0000|0.0000|.||LC|[]|[]|||| 30 | .|1|0.0000|0.0000|||||[]|||| 31 | When|1|0.0000|0.0000|||UC|[]|[]|||| 32 | I|1|0.0000|0.0000|||CA|[]|[]|||| 33 | hear|1|0.0000|0.0000|||LC|[]|[]|||| 34 | Foobar|1|0.0000|0.0000|,||UC|[]|['1', '2']|||| 35 | ,|1|0.0000|0.0000|||||[]|||| 36 | I|1|0.0000|0.0000|||CA|[]|[]|||| 37 | think|1|0.0000|0.0000|||LC|[]|[]|||| 38 | about|1|0.0000|0.0000|||LC|[]|[]|||| 39 | just|1|0.0000|0.0000|||LC|[]|[]|||| 40 | that|1|0.0000|0.0000|:||LC|[]|[]|||| 41 | 
:|1|0.0000|0.0000|||||[]|||| 42 | foo|1|0.0000|0.0000|||LC|[]|[]|||sub(foobar)| 43 | a|1|0.0000|0.0000|||LC|[]|[]|||| 44 | -------------------------------------------------------------------------------- /test/data/short.aligned.punc_case.nlp: -------------------------------------------------------------------------------- 1 | token|speaker|ts|endTs|punctuation|prepunctuation|case|tags|wer_tags|oldTs|oldEndTs|ali_comment|confidence 2 | |2|0.0000|0.0000|||LC|[]|[]|||| 3 | Yeah|1|0.0000|0.0000|,||UC|[]|[]|||| 4 | ,|1|0.0000|0.0000|||||[]|||| 5 | yeah|1|||,||LC|[]|[]|||del| 6 | ,|1|||||||[]|||del| 7 | right|1|0.0000|0.0000|.||LC|[]|[]|||| 8 | .|1|||||||[]|||del| 9 | Yeah|1|0.0000|0.0000|,||UC|[]|[]|||sub(I'll)| 10 | ,|1|||||||[]|||del| 11 | all|1|0.0000|0.0000|||LC|[]|[]|||sub(do)| 12 | right|1|||,||LC|[]|[]|||del| 13 | ,|1|||||||[]|||del| 14 | probably|1|||||LC|[]|[]|||del| 15 | just|1|0.0000|0.0000|||LC|[]|[]|||| 16 | that|1|0.0000|0.0000|.||LC|[]|[]|||| 17 | .|1|0.0000|0.0000|||||[]|||sub(?)| 18 | Are|3|0.0000|0.0000|||UC|[]|[]|||| 19 | there|3|0.0000|0.0000|||LC|[]|[]|||| 20 | any|3|0.0000|0.0000|||LC|[]|[]|||| 21 | visuals|3|0.0000|0.0000|||LC|[]|[]|||| 22 | that|3|0.0000|0.0000|||LC|[]|[]|||| 23 | come|3|0.0000|0.0000|||LC|[]|[]|||| 24 | to|3|0.0000|0.0000|||LC|[]|[]|||| 25 | mind|3|0.0000|0.0000|||LC|[]|[]|||| 26 | or|3|0.0000|0.0000|||LC|[]|[]|||| 27 | Yeah|1|0.0000|0.0000|,||UC|[]|[]|||| 28 | ,|1|0.0000|0.0000|||||[]|||| 29 | sure|1|0.0000|0.0000|.||LC|[]|[]|||| 30 | .|1|0.0000|0.0000|||||[]|||| 31 | When|1|0.0000|0.0000|||UC|[]|[]|||| 32 | I|1|0.0000|0.0000|||CA|[]|[]|||| 33 | hear|1|0.0000|0.0000|||LC|[]|[]|||| 34 | Foobar|1|0.0000|0.0000|,||UC|[]|['1', '2']|||| 35 | ,|1|0.0000|0.0000|||||[]|||| 36 | I|1|0.0000|0.0000|||CA|[]|[]|||| 37 | think|1|0.0000|0.0000|||LC|[]|[]|||| 38 | about|1|0.0000|0.0000|||LC|[]|[]|||| 39 | just|1|0.0000|0.0000|||LC|[]|[]|||| 40 | that|1|0.0000|0.0000|:||LC|[]|[]|||| 41 | :|1|0.0000|0.0000|||||[]|||| 42 | 
foo|1|0.0000|0.0000|||LC|[]|[]|||sub(Foobar)| 43 | a|1|0.0000|0.0000|||LC|[]|[]|||| 44 | -------------------------------------------------------------------------------- /test/data/short.aligned.strict.nlp: -------------------------------------------------------------------------------- 1 | token|speaker|ts|endTs|punctuation|prepunctuation|case|tags|wer_tags|oldTs|oldEndTs|ali_comment|confidence 2 | |2|0.0000|0.0000|||LC|[]|[]|||| 3 | Yeah|1|0.0000|0.0000|,||UC|[]|[]|||| 4 | yeah|1|||,||LC|[]|[]|||del| 5 | right|1|0.0000|0.0000|.||LC|[]|[]|||| 6 | Yeah|1|0.0000|0.0000|,||UC|[]|[]|||sub(i'll)| 7 | all|1|0.0000|0.0000|||LC|[]|[]|||sub(do)| 8 | right|1|||,||LC|[]|[]|||del| 9 | probably|1|||||LC|[]|[]|||del| 10 | just|1|0.0000|0.0000|||LC|[]|[]|||| 11 | that|1|0.0000|0.0000|.||LC|[]|[]|||| 12 | Are|3|0.0000|0.0000|||UC|[]|[]|||| 13 | there|3|0.0000|0.0000|||LC|[]|[]|||| 14 | any|3|0.0000|0.0000|||LC|[]|[]|||| 15 | visuals|3|0.0000|0.0000|||LC|[]|[]|||| 16 | that|3|0.0000|0.0000|||LC|[]|[]|||| 17 | come|3|0.0000|0.0000|||LC|[]|[]|||| 18 | to|3|0.0000|0.0000|||LC|[]|[]|||| 19 | mind|3|0.0000|0.0000|||LC|[]|[]|||| 20 | or|3|0.0000|0.0000|||LC|[]|[]|||| 21 | Yeah|1|0.0000|0.0000|,||UC|[]|[]|||| 22 | sure|1|0.0000|0.0000|.||LC|[]|[]|||| 23 | When|1|0.0000|0.0000|||UC|[]|[]|||| 24 | I|1|0.0000|0.0000|||CA|[]|[]|||| 25 | hear|1|0.0000|0.0000|||LC|[]|[]|||| 26 | Foobar|1|0.0000|0.0000|,||UC|[]|[]|||| 27 | I|1|0.0000|0.0000|||CA|[]|[]|||| 28 | think|1|0.0000|0.0000|||LC|[]|[]|||| 29 | about|1|0.0000|0.0000|||LC|[]|[]|||| 30 | just|1|0.0000|0.0000|||LC|[]|[]|||| 31 | that|1|0.0000|0.0000|:||LC|[]|[]|||| 32 | foo|1|0.0000|0.0000|||LC|[]|[]|||sub(foobar)| 33 | a|1|0.0000|0.0000|||LC|[]|[]|||| 34 | -------------------------------------------------------------------------------- /test/data/short.hyp.nlp: -------------------------------------------------------------------------------- 1 | token|speaker|ts|endTs|punctuation|case|tags 2 | |2||||LC|[] 3 | Yeah|1|||,|UC|[] 4 | 
right|1||||LC|[] 5 | I'll|1||||UC|[] 6 | do|1||||LC|[] 7 | just|1||||LC|[] 8 | that|1|||.|LC|[] 9 | Are|3||||UC|[] 10 | there|3||||LC|[] 11 | any|3||||LC|[] 12 | visuals|3||||LC|[] 13 | that|3||||LC|[] 14 | come|3||||LC|[] 15 | to|3||||LC|[] 16 | mind|3||||LC|[] 17 | or|3|||?|LC|[] 18 | Yeah|1|||,|UC|[] 19 | sure|1|||.|LC|[] 20 | When|1||||UC|[] 21 | I|1||||CA|[] 22 | hear|1||||LC|[] 23 | Foobar|1|||,|UC|[] 24 | I|1||||CA|[] 25 | think|1||||LC|[] 26 | about|1||||LC|[] 27 | just|1||||LC|[] 28 | that|1|||:|LC|[] 29 | Foobar|1|||,|UC|[] 30 | a|1||||LC|[] 31 | -------------------------------------------------------------------------------- /test/data/short.hyp.txt: -------------------------------------------------------------------------------- 1 | Yeah right I'll do just that Are there any visuals that come to mind or Yeah sure When I hear Foobar I think about just that Foobar a -------------------------------------------------------------------------------- /test/data/short.ref.nlp: -------------------------------------------------------------------------------- 1 | token|speaker|ts|endTs|punctuation|case|tags 2 | |2||||LC|[] 3 | Yeah|1|||,|UC|[] 4 | yeah|1|||,|LC|[] 5 | right|1|||.|LC|[] 6 | Yeah|1|||,|UC|[] 7 | all|1||||LC|[] 8 | right|1|||,|LC|[] 9 | probably|1||||LC|[] 10 | just|1||||LC|[] 11 | that|1|||.|LC|[] 12 | Are|3||||UC|[] 13 | there|3||||LC|[] 14 | any|3||||LC|[] 15 | visuals|3||||LC|[] 16 | that|3||||LC|[] 17 | come|3||||LC|[] 18 | to|3||||LC|[] 19 | mind|3||||LC|[] 20 | or-|3||||LC|[] 21 | Yeah|1|||,|UC|[] 22 | sure|1|||.|LC|[] 23 | When|1||||UC|[] 24 | I|1||||CA|[] 25 | hear|1||||LC|[] 26 | Foobar|1|||,|UC|[] 27 | I|1||||CA|[] 28 | think|1||||LC|[] 29 | about|1||||LC|[] 30 | just|1||||LC|[] 31 | that|1|||:|LC|[] 32 | foo|1||||LC|[] 33 | a|1||||LC|[] 34 | -------------------------------------------------------------------------------- /test/data/short.sbs.txt: -------------------------------------------------------------------------------- 1 | 
ref_token hyp_token IsErr Class Wer_Tag_Entities 2 | 3 | Yeah Yeah 4 | , , 5 | yeah ERR 6 | , ERR 7 | right right 8 | . ERR 9 | Yeah I'll ERR 10 | , ERR 11 | all do ERR 12 | right ERR 13 | , ERR 14 | probably ERR 15 | just just 16 | that that 17 | . ? ERR 18 | Are Are 19 | there there 20 | any any 21 | visuals visuals 22 | that that 23 | come come 24 | to to 25 | mind mind 26 | or or ___100002_SYN_1-1___ 27 | ? ERR 28 | Yeah Yeah 29 | , , 30 | sure sure 31 | . . 32 | When When 33 | I I 34 | hear hear 35 | Foobar Foobar ###1_PROPER_NOUN###|###2_SPACY>ORG###| 36 | , , 37 | I I 38 | think think 39 | about about 40 | just just 41 | that that 42 | : : 43 | foo Foobar ERR 44 | , ERR 45 | a a 46 | ------------------------------------------------------------ 47 | Line Group 48 | 5 yeah , <-> *** 49 | 8 . Yeah , all right , probably <-> I'll do 50 | 17 . <-> ? 51 | 27 *** <-> ? 52 | 43 foo <-> Foobar , 53 | -------------------------------------------------------------------------------- /test/data/short_punc.hyp.nlp: -------------------------------------------------------------------------------- 1 | token|speaker|ts|endTs|punctuation|case|tags 2 | |2||||LC|[] 3 | Yeah|1|||,|UC|[] 4 | right|1||||LC|[] 5 | I'll|1||||UC|[] 6 | do|1||||LC|[] 7 | just|1||||LC|[] 8 | that|1|||?|LC|[] 9 | Are|3||||UC|[] 10 | there|3||||LC|[] 11 | any|3||||LC|[] 12 | visuals|3||||LC|[] 13 | that|3||||LC|[] 14 | come|3||||LC|[] 15 | to|3||||LC|[] 16 | mind|3||||LC|[] 17 | or|3|||?|LC|[] 18 | Yeah|1|||,|UC|[] 19 | sure|1|||.|LC|[] 20 | When|1||||UC|[] 21 | I|1||||CA|[] 22 | hear|1||||LC|[] 23 | Foobar|1|||,|UC|[] 24 | I|1||||CA|[] 25 | think|1||||LC|[] 26 | about|1||||LC|[] 27 | just|1||||LC|[] 28 | that|1|||:|LC|[] 29 | Foobar|1|||,|UC|[] 30 | a|1||||LC|[] 31 | -------------------------------------------------------------------------------- /test/data/short_punc.ref.nlp: -------------------------------------------------------------------------------- 1 | 
token|speaker|ts|endTs|punctuation|case|tags|wer_tags 2 | |2||||LC|[]|[] 3 | Yeah|1|||,|UC|[]|[] 4 | yeah|1|||,|LC|[]|[] 5 | right|1|||.|LC|[]|[] 6 | Yeah|1|||,|UC|[]|[] 7 | all|1||||LC|[]|[] 8 | right|1|||,|LC|[]|[] 9 | probably|1||||LC|[]|[] 10 | just|1||||LC|[]|[] 11 | that|1|||.|LC|[]|[] 12 | Are|3||||UC|[]|[] 13 | there|3||||LC|[]|[] 14 | any|3||||LC|[]|[] 15 | visuals|3||||LC|[]|[] 16 | that|3||||LC|[]|[] 17 | come|3||||LC|[]|[] 18 | to|3||||LC|[]|[] 19 | mind|3||||LC|[]|[] 20 | or-|3||||LC|[]|[] 21 | Yeah|1|||,|UC|[]|[] 22 | sure|1|||.|LC|[]|[] 23 | When|1||||UC|[]|[] 24 | I|1||||CA|[]|[] 25 | hear|1||||LC|[]|[] 26 | Foobar|1|||,|UC|[]|['1', '2'] 27 | I|1||||CA|[]|[] 28 | think|1||||LC|[]|[] 29 | about|1||||LC|[]|[] 30 | just|1||||LC|[]|[] 31 | that|1|||:|LC|[]|[] 32 | foo|1||||LC|[]|[] 33 | a|1||||LC|[]|[] 34 | -------------------------------------------------------------------------------- /test/data/short_punc.wer_tag.json: -------------------------------------------------------------------------------- 1 | { 2 | "1": { 3 | "entity_type": "PROPER_NOUN" 4 | }, 5 | "2": { 6 | "entity_type": "SPACY>ORG" 7 | } 8 | } 9 | -------------------------------------------------------------------------------- /test/data/speaker_1.hyp.txt: -------------------------------------------------------------------------------- 1 | a b c d e e f g h i j it is uh a b c d f g h i 2 | -------------------------------------------------------------------------------- /test/data/speaker_1.ref.nlp: -------------------------------------------------------------------------------- 1 | token|speaker|ts|endTs|punctuation|case|tags 2 | a|1||||| 3 | b|1||||| 4 | c|1||||| 5 | d|1||||| 6 | e|1||||| 7 | f|1||||| 8 | g|1||||| 9 | h|1||||| 10 | i|1||||| 11 | j|1||||| 12 | um|2||||| 13 | it's|2||||| 14 | a|2||||| 15 | b|2||||| 16 | c|2||||| 17 | d|2||||| 18 | e|2||||| 19 | f|2||||| 20 | g|2||||| 21 | h|2||||| 22 | -------------------------------------------------------------------------------- 
/test/data/speaker_2.hyp.txt: -------------------------------------------------------------------------------- 1 | a b a d e f f g h i k l m n n n o p q r -------------------------------------------------------------------------------- /test/data/speaker_2.ref.nlp: -------------------------------------------------------------------------------- 1 | token|speaker|ts|endTs|punctuation|case|tags 2 | a|1||||| 3 | b|1||||| 4 | c|2||||| 5 | d|1||||| 6 | e|1||||| 7 | f|1||||| 8 | g|1||||| 9 | h|1||||| 10 | i|3||||| 11 | j|3||||| 12 | k|3||||| 13 | l|3||||| 14 | m|3||||| 15 | n|3||||| 16 | o|2||||| 17 | p|2||||| 18 | q|2||||| 19 | r|2||||| 20 | s|2||||| 21 | -------------------------------------------------------------------------------- /test/data/syn_1.hyp.adapted.sbs: -------------------------------------------------------------------------------- 1 | ref_token hyp_token IsErr Class Wer_Tag_Entities 2 | we we'll ERR 3 | will ERR 4 | have have 5 | a a 6 | nice nice 7 | evening evening 8 | um ERR 9 | no no 10 | matter matter 11 | what what 12 | will will 13 | happen happen 14 | it ERR 15 | um is ERR 16 | it's uh ERR 17 | a a 18 | good good 19 | opportunity opportunity 20 | to to 21 | do ERR 22 | this this 23 | you'll you'll 24 | uh ERR 25 | see see 26 | ------------------------------------------------------------ 27 | Line Group 28 | 2 we will <-> we'll 29 | 8 *** <-> um 30 | 14 um it's <-> it is uh 31 | 21 do <-> *** 32 | 24 *** <-> uh 33 | -------------------------------------------------------------------------------- /test/data/syn_1.hyp.sbs: -------------------------------------------------------------------------------- 1 | ref_token hyp_token IsErr Class Wer_Tag_Entities 2 | we ERR 3 | will we'll ERR 4 | have have 5 | a a 6 | nice nice 7 | evening evening 8 | um ERR 9 | no no 10 | matter matter 11 | what what 12 | will will 13 | happen happen 14 | it ERR 15 | um is ERR 16 | it's uh ERR 17 | a a 18 | good good 19 | opportunity opportunity 20 | to to 21 | do ERR 22 | 
this this 23 | you'll you'll 24 | uh ERR 25 | see see 26 | ------------------------------------------------------------ 27 | Line Group 28 | 2 we will <-> we'll 29 | 8 *** <-> um 30 | 14 um it's <-> it is uh 31 | 21 do <-> *** 32 | 24 *** <-> uh 33 | -------------------------------------------------------------------------------- /test/data/syn_1.hyp.txt: -------------------------------------------------------------------------------- 1 | we'll have a nice evening um no matter what will happen it is uh a good opportunity to this you'll uh see 2 | -------------------------------------------------------------------------------- /test/data/syn_1.ref.txt: -------------------------------------------------------------------------------- 1 | we will have a nice evening no matter what will happen um it's a good opportunity to do this you'll see 2 | -------------------------------------------------------------------------------- /test/data/syn_10.hyp.txt: -------------------------------------------------------------------------------- 1 | she will have a great evening 2 | -------------------------------------------------------------------------------- /test/data/syn_10.ref.txt: -------------------------------------------------------------------------------- 1 | she- will have a great evening 2 | -------------------------------------------------------------------------------- /test/data/syn_2.hyp.txt: -------------------------------------------------------------------------------- 1 | we'll 2 | -------------------------------------------------------------------------------- /test/data/syn_2.ref.txt: -------------------------------------------------------------------------------- 1 | we will 2 | -------------------------------------------------------------------------------- /test/data/syn_3.hyp.txt: -------------------------------------------------------------------------------- 1 | we will 2 | -------------------------------------------------------------------------------- 
/test/data/syn_3.ref.txt: -------------------------------------------------------------------------------- 1 | we'll 2 | -------------------------------------------------------------------------------- /test/data/syn_4.hyp.txt: -------------------------------------------------------------------------------- 1 | no 2 | -------------------------------------------------------------------------------- /test/data/syn_4.ref.txt: -------------------------------------------------------------------------------- 1 | we will 2 | -------------------------------------------------------------------------------- /test/data/syn_5.hyp.txt: -------------------------------------------------------------------------------- 1 | will 2 | -------------------------------------------------------------------------------- /test/data/syn_5.ref.txt: -------------------------------------------------------------------------------- 1 | we'll 2 | -------------------------------------------------------------------------------- /test/data/syn_6.hyp.txt: -------------------------------------------------------------------------------- 1 | this is what saying 2 | -------------------------------------------------------------------------------- /test/data/syn_6.ref.txt: -------------------------------------------------------------------------------- 1 | this- is what she's saying 2 | -------------------------------------------------------------------------------- /test/data/syn_7.hyp.txt: -------------------------------------------------------------------------------- 1 | it costs ten bricks of bricks dollars but i will verify what we could have done differently for the second day 2 | -------------------------------------------------------------------------------- /test/data/syn_7.hyp2.txt: -------------------------------------------------------------------------------- 1 | it costs bricks of bricks dollars but i will verify what we could have done differently for the second day 2 | 
-------------------------------------------------------------------------------- /test/data/syn_7.hyp3.txt: -------------------------------------------------------------------------------- 1 | it costs ten bricks of bricks but i will verify what we could have done differently for the second day 2 | -------------------------------------------------------------------------------- /test/data/syn_7.hyp4.txt: -------------------------------------------------------------------------------- 1 | it costs -------------------------------------------------------------------------------- /test/data/syn_7.norm.json: -------------------------------------------------------------------------------- 1 | { 2 | "0": { 3 | "candidates": [ 4 | { 5 | "probability": 0.33, 6 | "verbalization": [ 7 | "ten", 8 | "billions" 9 | ] 10 | }, 11 | { 12 | "probability": 0.33, 13 | "verbalization": [ 14 | "ten", 15 | "billions", 16 | "dollars" 17 | ] 18 | }, 19 | { 20 | "probability": 0.33, 21 | "verbalization": [ 22 | ] 23 | } 24 | ], 25 | "class": "MONEY" 26 | }, 27 | "1": { 28 | "candidates": [ 29 | { 30 | "probability": 1.0, 31 | "verbalization": [ 32 | "second" 33 | ] 34 | } 35 | ], 36 | "class": "ORDINAL" 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /test/data/syn_7.ref.nlp: -------------------------------------------------------------------------------- 1 | token|speaker|ts|endTs|punctuation|case|tags 2 | It|1|0.06999999999999999||,|UC|[] 3 | costs|1||||LC|[] 4 | $10|1||||LC|['0:MONEY'] 5 | billions|1||||LC|['0:MONEY'] 6 | but |1|||…|LC|[] 7 | i'll|2||||UC|[] 8 | verify|2||||LC|[] 9 | what|2||||LC|[] 10 | we|2||||LC|[] 11 | could|2||||LC|[] 12 | have|2|||.|LC|[] 13 | done|2||||UC|[] 14 | differently-|2||||LC|[] 15 | for|2||||LC|[] 16 | the|2||||LC|[] 17 | 2nd|2||||LC|['1:ORDINAL'] 18 | -------------------------------------------------------------------------------- /test/data/syn_7.synonym.rules.txt: 
-------------------------------------------------------------------------------- 1 | # format : LHSRHS 2 | # where: 3 | # LHS : space-delimited words to match in the original reference text 4 | # RHS : semi-colon-delimited list of space-delimited words to consider as equivalent expressions to the LHS 5 | # 6 | # This is non-recursive and single-pass only. 7 | # By default, there won't be an automatic symetry: 8 | # if you want a-> and b->a, you need to specify both 9 | # 10 | # Empty lines or lines starting with '#' are ignored 11 | 12 | 13 | ## FOR TESTING ONLY 14 | ten billions | ten bricks of bricks 15 | 16 | # To Be contractions - present 17 | i am | i'm 18 | i'm | i am 19 | you are | you're 20 | you're | you are 21 | he is | he's 22 | he's | he is 23 | she is | she's 24 | she's | she is 25 | it is | it's 26 | it's | it is 27 | we're | we are 28 | we are | we're 29 | they are | they're 30 | 31 | # To Be contractions - future 32 | i will |i'll 33 | i'll |i will 34 | you will |you'll 35 | you'll |you will 36 | he will |he'll 37 | he'll |he will 38 | she will |she'll 39 | she'll |she will 40 | it will |it'll 41 | it'll |it will 42 | we will |we'll 43 | we'll |we will 44 | they will |they'll 45 | they'll |they will 46 | 47 | i'd |i had ; i would 48 | 49 | # TODO: can't -> cannot ? 50 | # TODO: To Have ? 51 | # TODO: which other contractions? 
52 | -------------------------------------------------------------------------------- /test/data/syn_7_ref4.nlp: -------------------------------------------------------------------------------- 1 | token|speaker|ts|endTs|punctuation|case|tags 2 | It|1|0.06999999999999999||,|UC|[] 3 | costs|1||||LC|[] 4 | $10|1||||LC|['0:MONEY'] 5 | billions|1||||LC|['0:MONEY'] 6 | -------------------------------------------------------------------------------- /test/data/syn_8.hyp.ctm: -------------------------------------------------------------------------------- 1 | recording.wav A 0.00 0.36 we 1.00 2 | recording.wav A 0.36 0.12 will 1.00 3 | recording.wav A 0.48 0.30 see 1.00 4 | recording.wav A 0.75 0.95 that 1.00 5 | recording.wav A 1.05 0.21 that 1.00 6 | recording.wav A 1.26 0.18 was 0.99 7 | recording.wav A 1.44 0.09 an 1.00 8 | recording.wav A 1.53 0.96 accident 1.00 9 | -------------------------------------------------------------------------------- /test/data/syn_8.ref.nlp: -------------------------------------------------------------------------------- 1 | token|speaker|ts|endTs|punctuation|case|tags 2 | We'll|1|0.15|||UC|[] 3 | see|1||||LC|[] 4 | that|1||||LC|[] 5 | was|1||||LC|[] 6 | an|1||||LC|[] 7 | accident|1|||.|LC|[] 8 | -------------------------------------------------------------------------------- /test/data/syn_9.hyp.txt: -------------------------------------------------------------------------------- 1 | he will have a great evening -------------------------------------------------------------------------------- /test/data/syn_9.ref.txt: -------------------------------------------------------------------------------- 1 | she- will have a great evening -------------------------------------------------------------------------------- /test/data/syn_9.synonym.rules.txt: -------------------------------------------------------------------------------- 1 | # format : LHSRHS 2 | # where: 3 | # LHS : space-delimited words to match in the original reference text 4 | 
# RHS : semi-colon-delimited list of space-delimited words to consider as equivalent expressions to the LHS 5 | # 6 | # This is non-recursive and single-pass only. 7 | # By default, there won't be an automatic symetry: 8 | # if you want a-> and b->a, you need to specify both 9 | # 10 | # Empty lines or lines starting with '#' are ignored 11 | 12 | 13 | ## FOR TESTING ONLY 14 | she- | he 15 | -------------------------------------------------------------------------------- /test/data/syn_compound_1.hyp.txt: -------------------------------------------------------------------------------- 1 | things are going to be next level next quarter -------------------------------------------------------------------------------- /test/data/syn_compound_1.ref.txt: -------------------------------------------------------------------------------- 1 | things are going to be next-level next quarter -------------------------------------------------------------------------------- /test/data/syn_compound_2.hyp.txt: -------------------------------------------------------------------------------- 1 | that is are long-term view on politics -------------------------------------------------------------------------------- /test/data/syn_compound_2.ref.txt: -------------------------------------------------------------------------------- 1 | what is our long term view on politics -------------------------------------------------------------------------------- /test/data/test1.hyp.txt: -------------------------------------------------------------------------------- 1 | we'll have victory it is a sure thing we must only calculate p i number 2 | we'll have victory it is a sure thing we must only calculate pee eye number 3 | we will have victory it's a sure thing we must only calculate p i number 4 | we will have victory it's a sure thing we must only calculate p i number 5 | we will have victory it's a sure thing we must only calculate p i number 6 | 
-------------------------------------------------------------------------------- /test/data/test1.ref.txt: -------------------------------------------------------------------------------- 1 | we will have victory marc it's a sure thing we must only calculate p i number 2 | we will have victory it's a sure thing we must only calculate p i number 3 | we will have victory it's a sure thing we must only calculate p i number 4 | we will have victory it's a sure thing we must only calculate p i number 5 | we will have victory it's a sure thing we must only calculate p i number 6 | -------------------------------------------------------------------------------- /test/data/twenty.aligned.punc_case.nlp: -------------------------------------------------------------------------------- 1 | token|speaker|ts|endTs|punctuation|prepunctuation|case|tags|wer_tags|oldTs|oldEndTs|ali_comment|confidence 2 | 20|2|||||CA|['1:CARDINAL']|['1']|84.6600|85.0600|del| 3 | in|2|0.0000|0.0000|||LC|[]|[]|89.1600|89.2800|| 4 | twenty|2|0.0000|0.0000|||CA|['0:YEAR']|['0', '2']|89.7400|89.9900|sub(Twenty),push_last| 5 | twenty|2|0.0000|0.0000|||LC|['0:YEAR']|['0', '2']|89.7400|89.9900|sub(tHiRtY),push_last| 6 | is|2|0.0000|0.0000|||LC|[]|[]|89.1600|89.2800|| 7 | one|2|0.0000|0.0000|||CA|['3:CARDINAL']|['3']|89.7400|89.9900|,push_last| 8 | twenty|2|0.0000|0.0000|||LC|['3:CARDINAL']|['3']|89.7400|89.9900|sub(two),push_last| 9 | three|2|0.0000|0.0000|||LC|['3:CARDINAL']|['3']|89.7400|89.9900|,push_last| 10 | -------------------------------------------------------------------------------- /test/data/twenty.hyp-a2.sbs: -------------------------------------------------------------------------------- 1 | ref_token hyp_token IsErr Class Wer_Tag_Entities 2 | 20 ERR ___1_CARDINAL___ 3 | in in 4 | twenty twenty ___2_YEAR___ 5 | twenty thirty ERR ___2_YEAR___ 6 | is is 7 | one one ___3_CARDINAL___ 8 | twenty twenty ___3_CARDINAL___ 9 | two ERR ___3_CARDINAL___ 10 | three three ___3_CARDINAL___ 11 | 
------------------------------------------------------------ 12 | Line Group 13 | 2 20 <-> *** 14 | 5 twenty <-> thirty 15 | 9 *** <-> two 16 | -------------------------------------------------------------------------------- /test/data/twenty.hyp.punc_case.txt: -------------------------------------------------------------------------------- 1 | in Twenty tHiRtY , is one TWENTY two three 2 | 3 | -------------------------------------------------------------------------------- /test/data/twenty.hyp.sbs: -------------------------------------------------------------------------------- 1 | ref_token hyp_token IsErr Class Wer_Tag_Entities 2 | twenty ERR ___1_CARDINAL___ 3 | in in 4 | twenty twenty ___2_YEAR___ 5 | twenty thirty ERR ___2_YEAR___ 6 | is is 7 | one one ___3_CARDINAL___ 8 | twenty twenty ___3_CARDINAL___ 9 | two ERR ___3_CARDINAL___ 10 | three three ___3_CARDINAL___ 11 | ------------------------------------------------------------ 12 | Line Group 13 | 2 twenty <-> *** 14 | 5 twenty <-> thirty 15 | 9 *** <-> two 16 | -------------------------------------------------------------------------------- /test/data/twenty.hyp.txt: -------------------------------------------------------------------------------- 1 | in twenty thirty is one twenty two three 2 | 3 | -------------------------------------------------------------------------------- /test/data/twenty.norm.json: -------------------------------------------------------------------------------- 1 | { 2 | "1": { 3 | "candidates": [ 4 | { 5 | "probability": 1.0, 6 | "verbalization": [ 7 | "twenty" 8 | ] 9 | } 10 | ], 11 | "class": "CARDINAL" 12 | }, 13 | "2": { 14 | "candidates": [ 15 | { 16 | "probability": 0.3333333333333333, 17 | "verbalization": [ 18 | "twenty", 19 | "twenty" 20 | ] 21 | }, 22 | { 23 | "probability": 0.3333333333333333, 24 | "verbalization": [ 25 | "two", 26 | "thousand", 27 | "twenty" 28 | ] 29 | }, 30 | { 31 | "probability": 0.3333333333333333, 32 | "verbalization": [ 33 | "two", 34 | 
"thousand", 35 | "and", 36 | "twenty" 37 | ] 38 | } 39 | ], 40 | "class": "YEAR" 41 | }, 42 | "3": { 43 | "candidates": [ 44 | { 45 | "probability": 0.2, 46 | "verbalization": [ 47 | "one", 48 | "twenty", 49 | "three" 50 | ] 51 | }, 52 | { 53 | "probability": 0.2, 54 | "verbalization": [ 55 | "a", 56 | "hundred", 57 | "twenty", 58 | "three" 59 | ] 60 | }, 61 | { 62 | "probability": 0.2, 63 | "verbalization": [ 64 | "one", 65 | "hundred", 66 | "twenty", 67 | "three" 68 | ] 69 | }, 70 | { 71 | "probability": 0.2, 72 | "verbalization": [ 73 | "a", 74 | "hundred", 75 | "and", 76 | "twenty", 77 | "three" 78 | ] 79 | }, 80 | { 81 | "probability": 0.2, 82 | "verbalization": [ 83 | "one", 84 | "hundred", 85 | "and", 86 | "twenty", 87 | "three" 88 | ] 89 | } 90 | ], 91 | "class": "CARDINAL" 92 | } 93 | } -------------------------------------------------------------------------------- /test/data/twenty.ref.nlp: -------------------------------------------------------------------------------- 1 | token|speaker|ts|endTs|punctuation|case|tags|oldTs|oldEndTs|ali_comment 2 | 20|2|84.6600|85.0600||CA|['1:CARDINAL']|||,push_last 3 | in|2|89.1600|89.2800||LC|[]||| 4 | 2020|2|89.7400|89.9900||CA|['2:YEAR']|||,push_last 5 | is|2|89.1600|89.2800||LC|[]||| 6 | 123|2|89.7400|89.9900||CA|['3:CARDINAL']|||,push_last 7 | -------------------------------------------------------------------------------- /test/data/twenty.ref.testing.nlp: -------------------------------------------------------------------------------- 1 | token|speaker|ts|endTs|punctuation|case|tags|wer_tags 2 | 20|2|84.6600|85.0600||CA|['1:CARDINAL']|['1'] 3 | in|2|89.1600|89.2800||LC|[]|[] 4 | 2020|2|89.7400|89.9900||CA|['0:YEAR']|['0', '2'] 5 | is|2|89.1600|89.2800||LC|[]|[] 6 | 123|2|89.7400|89.9900||CA|['3:CARDINAL']|['3'] 7 | -------------------------------------------------------------------------------- /test/data/twenty.ref.testing.norm.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "0": { 3 | "candidates": [ 4 | { 5 | "probability": 0.8699402786994028, 6 | "verbalization": [ 7 | "twenty", 8 | "twenty" 9 | ] 10 | }, 11 | { 12 | "probability": 0.032183145321831454, 13 | "verbalization": [ 14 | "two", 15 | "thousand", 16 | "and", 17 | "twenty" 18 | ] 19 | }, 20 | { 21 | "probability": 0.09787657597876576, 22 | "verbalization": [ 23 | "two", 24 | "thousand", 25 | "twenty" 26 | ] 27 | } 28 | ], 29 | "class": "YEAR" 30 | }, 31 | "1": { 32 | "candidates": [ 33 | { 34 | "probability": 1.0, 35 | "verbalization": [ 36 | "twenty" 37 | ] 38 | } 39 | ], 40 | "class": "CARDINAL" 41 | }, 42 | "3": { 43 | "candidates": [ 44 | { 45 | "probability": 0.06962025316455696, 46 | "verbalization": [ 47 | "a", 48 | "hundred", 49 | "and", 50 | "twenty", 51 | "three" 52 | ] 53 | }, 54 | { 55 | "probability": 0.09177215189873418, 56 | "verbalization": [ 57 | "a", 58 | "hundred", 59 | "twenty", 60 | "three" 61 | ] 62 | }, 63 | { 64 | "probability": 0.012658227848101266, 65 | "verbalization": [ 66 | "one", 67 | "hundred", 68 | "and", 69 | "twenty", 70 | "three" 71 | ] 72 | }, 73 | { 74 | "probability": 0.0189873417721519, 75 | "verbalization": [ 76 | "one", 77 | "hundred", 78 | "twenty", 79 | "three" 80 | ] 81 | }, 82 | { 83 | "probability": 0.8069620253164557, 84 | "verbalization": [ 85 | "one", 86 | "twenty", 87 | "three" 88 | ] 89 | } 90 | ], 91 | "class": "CARDINAL" 92 | } 93 | } -------------------------------------------------------------------------------- /test/data/wer_utf.hyp.txt: -------------------------------------------------------------------------------- 1 | ça va va bien aujourd'hui éte inutile Êtes -------------------------------------------------------------------------------- /test/data/wer_utf.ref.txt: -------------------------------------------------------------------------------- 1 | Ça va bien aujourd'hui étÉ inutile êtes 
-------------------------------------------------------------------------------- /test/fast-d-tests.cc: -------------------------------------------------------------------------------- 1 | #define CATCH_CONFIG_MAIN 2 | #include /* srand, rand */ 3 | #include /* time */ 4 | #include "../third-party/catch2/single_include/catch2/catch.hpp" 5 | #include "src/fast-d.h" 6 | typedef std::vector vint; 7 | 8 | TEST_CASE("simple-edits-counts") { 9 | vint a = {1, 2, 3, 4, 5}; 10 | vint b = {1, 2, 8, 4, 5}; 11 | 12 | REQUIRE(GetEditDistance(a, b) == 1); 13 | // edit distance should be symetric 14 | REQUIRE(GetEditDistance(b, a) == 1); 15 | 16 | // distance of oneself with oneself should be 0 17 | REQUIRE(GetEditDistance(a, a) == 0); 18 | REQUIRE(GetEditDistance(b, b) == 0); 19 | } 20 | 21 | TEST_CASE("boundaries-edits-counts") { 22 | vint a = {1, 2, 3, 4, 5}; 23 | vint b = {}; 24 | 25 | REQUIRE(GetEditDistance(a, b) == 5); 26 | // edit distance should be symetric 27 | REQUIRE(GetEditDistance(b, a) == 5); 28 | 29 | // distance of oneself with oneself should be 0 30 | REQUIRE(GetEditDistance(a, a) == 0); 31 | REQUIRE(GetEditDistance(b, b) == 0); 32 | } 33 | 34 | TEST_CASE("just-one-target-edits-counts") { 35 | vint a = {1, 2, 3, 4, 5}; 36 | vint b1 = {1}; 37 | vint b2 = {2}; 38 | vint b3 = {3}; 39 | vint b4 = {4}; 40 | vint b5 = {5}; 41 | 42 | REQUIRE(GetEditDistance(a, b1) == 4); 43 | REQUIRE(GetEditDistance(a, b2) == 4); 44 | REQUIRE(GetEditDistance(a, b3) == 4); 45 | REQUIRE(GetEditDistance(a, b4) == 4); 46 | REQUIRE(GetEditDistance(a, b5) == 4); 47 | } 48 | 49 | TEST_CASE("single-edits-counts") { 50 | vint a = {1, 2, 3, 4, 5}; 51 | vint b = {1, 1, 2, 3, 4, 5}; 52 | 53 | REQUIRE(GetEditDistance(b, a) == 1); 54 | REQUIRE(GetEditDistance(a, b) == 1); 55 | } 56 | 57 | TEST_CASE("left-insert") { 58 | vint va = {1, 2, 3, 4, 5}; 59 | vint vb = {8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 1, 2, 3, 4, 5}; 60 | vint mapA; 61 | vint mapB; 62 | 63 | REQUIRE(GetEditDistance(va, mapA, vb, mapB) == 10); 
64 | 65 | vint vb1 = {8, 8, 8, 8, 8, 1, 8, 8, 8, 8, 1, 2, 3, 4, 5}; 66 | REQUIRE(GetEditDistance(va, mapA, vb1, mapB) == 10); 67 | } 68 | 69 | TEST_CASE("map-test-A") { 70 | vint a = {1, 2, 3, 4, 5}; 71 | vint b = {1, 2, 3, 4, 5}; 72 | vint mapA = {-1, -1, -1, -1, -1}; 73 | vint mapB = {-1, -1, -1, -1, -1}; 74 | 75 | REQUIRE(GetEditDistance(a, mapA, b, mapB) == 0); 76 | for (int i = 0; i < 5; i++) { 77 | REQUIRE(mapA[i] == 1); 78 | REQUIRE(mapB[i] == 1); 79 | } 80 | } 81 | 82 | TEST_CASE("map-test-B") { 83 | vint a = {1, 2, 3, 4, 5}; 84 | vint b = {1, 4, 5}; 85 | vint mapA = {-1, -1, -1, -1, -1}; 86 | vint mapB = {-1, 1, -1}; 87 | 88 | REQUIRE(GetEditDistance(a, mapA, b, mapB) == 2); 89 | REQUIRE(mapA[0] == 1); 90 | REQUIRE(mapA[1] == -1); 91 | REQUIRE(mapA[2] == -1); 92 | REQUIRE(mapA[3] == 1); 93 | REQUIRE(mapA[4] == 1); 94 | 95 | REQUIRE(mapB[0] == 1); 96 | REQUIRE(mapB[1] == 1); 97 | REQUIRE(mapB[2] == 1); 98 | } 99 | 100 | TEST_CASE("map-test-C") { 101 | vint a = {1, 2, 3, 4, 5}; 102 | vint b = {10, 20, 30, 40, 50}; 103 | vint mapA = {-1, -1, -1, -1, -1}; 104 | // vint mapB = {-1, -1, -1, -1, -1}; 105 | vint mapB; 106 | 107 | REQUIRE(GetEditDistance(a, mapA, b, mapB) == 5); 108 | for (int i = 0; i < 5; i++) { 109 | REQUIRE(mapA[i] == -1); 110 | REQUIRE(mapB[i] == -1); 111 | } 112 | } 113 | 114 | TEST_CASE("map-repeat") { 115 | vint a = {1, 2, 3, 4, 5, 1, 2, 7, 8, 1, 2, 3}; 116 | vint b = {1, 2, 3, 5, 9, 1, 2, 7, 8, 1, 2}; 117 | vint mapA; 118 | vint mapB; 119 | 120 | int dist = GetEditDistance(a, mapA, b, mapB); 121 | REQUIRE(mapA.size() == a.size()); 122 | REQUIRE(mapB.size() == b.size()); 123 | 124 | for (int i = 0; i < a.size(); i++) { 125 | std::cout << " a[" << i << "] = " << a[i] << ", mapA[" << i << "] = " << mapA[i] << std::endl; 126 | } 127 | for (int i = 0; i < b.size(); i++) { 128 | std::cout << " b[" << i << "] = " << b[i] << ", mapB[" << i << "] = " << mapB[i] << std::endl; 129 | } 130 | 131 | REQUIRE(dist == 3); 132 | } 133 | 134 | 
TEST_CASE("test-long-seq") { 135 | srand(time(NULL)); 136 | int ins_rate = 20; // over 1k, so 2% 137 | int del_rate = 20; // over 1k, so 2% 138 | int sub_rate = 50; // over 1k, so 5% 139 | 140 | int retries_left = 5; 141 | int number_of_edits = 0; 142 | int edit_distance = 0; 143 | 144 | // for stochastic reasons, it is possible that the naive 145 | // ins + sub + del count gets off a little. We give ourselves 146 | // few attempts to validate that this test passes. 147 | while (retries_left-- > 0) { 148 | vint a; 149 | vint b; 150 | vint mapA; 151 | vint mapB; 152 | 153 | number_of_edits = 0; 154 | int num_ins = 0; 155 | int num_del = 0; 156 | int num_sub = 0; 157 | 158 | for (int i = 0; i < 1000; i++) { 159 | int ai = rand() % 32000 + rand() % 32000 + rand() % 32000 + rand() % 32000 + 1; 160 | a.push_back(ai); 161 | 162 | // if you want to debug the test 163 | // std::cout << "a[" << i << "] = " << a[i] << std::endl; 164 | int extra_char = a[i] + rand() % 32000 + 40000; 165 | 166 | int f = rand() % 1000; 167 | if (f < ins_rate) { 168 | b.push_back(extra_char); 169 | b.push_back(a[i]); 170 | number_of_edits++; 171 | num_ins++; 172 | } else if (f < ins_rate + del_rate) { 173 | // let's skip this one 174 | number_of_edits++; 175 | num_del++; 176 | } else if (f < ins_rate + del_rate + sub_rate) { 177 | b.push_back(extra_char); 178 | number_of_edits++; 179 | num_sub++; 180 | } else { 181 | b.push_back(a[i]); 182 | } 183 | } 184 | 185 | // if you want to debug the test 186 | // for (int j = 0; j < b.size(); j++) { 187 | // std::cout << "b[" << j << "] = " << b[j] << std::endl; 188 | // } 189 | 190 | std::cout << " We have " << num_ins << " insertions, " << num_del << " deletions and " << num_sub 191 | << " substitution for a total of " << number_of_edits << " edits" << std::endl; 192 | 193 | edit_distance = GetEditDistance(a, mapA, b, mapB); 194 | if (edit_distance != number_of_edits) { 195 | std::cout << "a= " << a[0]; 196 | for (int i = 1; i < a.size(); i++) { 197 | 
std::cout << " " << a[i]; 198 | } 199 | std::cout << std::endl; 200 | std::cout << "b= " << b[0]; 201 | for (int i = 1; i < b.size(); i++) { 202 | std::cout << " " << b[i]; 203 | } 204 | std::cout << std::endl; 205 | continue; 206 | } else { 207 | break; 208 | } 209 | } 210 | 211 | REQUIRE(edit_distance == number_of_edits); 212 | } 213 | 214 | TEST_CASE("test-long-seq-editonly") { 215 | srand(time(NULL)); 216 | int ins_rate = 20; // over 1k, so 2% 217 | int del_rate = 20; // over 1k, so 2% 218 | int sub_rate = 50; // over 1k, so 5% 219 | 220 | int retries_left = 5; 221 | int number_of_edits = 0; 222 | int edit_distance = 0; 223 | 224 | // for stochastic reasons, it is possible that the naive 225 | // ins + sub + del count gets off a little. We give ourselves 226 | // few attempts to validate that this test passes. 227 | while (retries_left-- > 0) { 228 | vint a; 229 | vint b; 230 | vint mapA; 231 | vint mapB; 232 | 233 | number_of_edits = 0; 234 | int num_ins = 0; 235 | int num_del = 0; 236 | int num_sub = 0; 237 | 238 | for (int i = 0; i < 1000; i++) { 239 | int ai = rand() % 32000 + rand() % 32000 + rand() % 32000 + rand() % 32000 + 1; 240 | a.push_back(ai); 241 | 242 | // if you want to debug the test 243 | // std::cout << "a[" << i << "] = " << a[i] << std::endl; 244 | int extra_char = a[i] + rand() % 32000 + 40000; 245 | 246 | int f = rand() % 1000; 247 | if (f < ins_rate) { 248 | b.push_back(extra_char); 249 | b.push_back(a[i]); 250 | number_of_edits++; 251 | num_ins++; 252 | } else if (f < ins_rate + del_rate) { 253 | // let's skip this one 254 | number_of_edits++; 255 | num_del++; 256 | } else if (f < ins_rate + del_rate + sub_rate) { 257 | b.push_back(extra_char); 258 | number_of_edits++; 259 | num_sub++; 260 | } else { 261 | b.push_back(a[i]); 262 | } 263 | } 264 | 265 | // if you want to debug the test 266 | // for (int j = 0; j < b.size(); j++) { 267 | // std::cout << "b[" << j << "] = " << b[j] << std::endl; 268 | // } 269 | 270 | std::cout << " We 
have " << num_ins << " insertions, " << num_del << " deletions and " << num_sub 271 | << " substitution for a total of " << number_of_edits << " edits" << std::endl; 272 | 273 | edit_distance = GetEditDistanceOnly(a, b); 274 | if (edit_distance != number_of_edits) { 275 | std::cout << "a= " << a[0]; 276 | for (int i = 1; i < a.size(); i++) { 277 | std::cout << " " << a[i]; 278 | } 279 | std::cout << std::endl; 280 | std::cout << "b= " << b[0]; 281 | for (int i = 1; i < b.size(); i++) { 282 | std::cout << " " << b[i]; 283 | } 284 | std::cout << std::endl; 285 | continue; 286 | } else { 287 | break; 288 | } 289 | } 290 | 291 | REQUIRE(edit_distance == number_of_edits); 292 | } -------------------------------------------------------------------------------- /test/test-utilties.h: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include "src/logging.h" 7 | #ifndef __TEST_UTILITIES_H__ 8 | #define __TEST_UTILITIES_H__ 1 9 | const std::string TEST_BINARY = "./fstalign"; 10 | const std::string TEST_DATA = "../test/data/"; 11 | const std::string TEST_SYNONYMS = "../sample_data/synonyms.rules.txt"; 12 | #ifdef WINDOWS 13 | #include 14 | #define GetCurrentDir _getcwd 15 | #else 16 | #include 17 | #define GetCurrentDir getcwd 18 | #endif 19 | 20 | void pclose_test(FILE *fp) { 21 | int status = pclose(fp); 22 | if (status != 0) { 23 | throw std::runtime_error("exit status non-zero!"); 24 | } 25 | } 26 | 27 | std::string get_current_dir() { 28 | char buff[FILENAME_MAX]; // create string buffer to hold path 29 | GetCurrentDir(buff, FILENAME_MAX); 30 | std::string current_working_dir(buff); 31 | return current_working_dir; 32 | } 33 | 34 | // Executes a specific shell command, and returns a string containing the output 35 | std::string exec(const std::string &cmd) { 36 | const size_t length = 256; 37 | std::array buffer; 38 | 39 | std::shared_ptr pipe{popen(cmd.c_str(), "r"), 
pclose_test}; 40 | if (!pipe) { 41 | throw std::runtime_error("popen() failed!"); 42 | } 43 | 44 | std::string result; 45 | while (!feof(pipe.get())) { 46 | if (fgets(buffer.data(), length, pipe.get()) != nullptr) result += buffer.data(); 47 | } 48 | result = result + "\nCommand:\n" + cmd + "\n"; 49 | 50 | return result; 51 | } 52 | 53 | // Generates a specific fstalign command given certain flag values 54 | std::string command(const char *subcommand, const char *approach, const char *reference, const char *hypothesis, 55 | const std::string output_sbs = "", const std::string output_nlp = "", 56 | const std::string synonyms = "", const char *refJson = nullptr, const bool disableCutoffs = false, 57 | const int speakerSwitchContextSize = -1, const std::string extraFlags = "") { 58 | const auto ref = std::string{"--ref "} + TEST_DATA + reference; 59 | const auto hyp = std::string{"--hyp "} + TEST_DATA + hypothesis; 60 | 61 | auto cmd = std::string{TEST_BINARY} + " " + subcommand + " " + approach + " " + ref + " " + hyp; 62 | // useful for debugging test 63 | // auto logger = logger::GetOrCreateLogger("main()"); 64 | // logger->info("final command is {}", cmd); 65 | 66 | if (!synonyms.empty()) { 67 | cmd = cmd + " --syn " + synonyms; 68 | } 69 | if (refJson != nullptr) { 70 | cmd = cmd + " --ref-json " + TEST_DATA + refJson; 71 | } 72 | 73 | if (disableCutoffs) { 74 | cmd += " --disable-cutoffs"; 75 | } 76 | 77 | if (speakerSwitchContextSize > 0) { 78 | cmd += " --speaker-switch-context " + std::to_string(speakerSwitchContextSize); 79 | } 80 | 81 | if (!output_sbs.empty()) { 82 | cmd = cmd + " --output-sbs " + output_sbs; 83 | } 84 | 85 | if (!output_nlp.empty()) { 86 | cmd = cmd + " --output-nlp " + output_nlp; 87 | } 88 | 89 | if (!extraFlags.empty()) { 90 | cmd += " " + extraFlags; 91 | } 92 | return cmd; 93 | } 94 | 95 | // Compares two test files for exact equality 96 | bool compareFiles(const std::string &p1, const std::string &p2) { 97 | std::ifstream f1(p1, 
std::ifstream::binary | std::ifstream::ate); 98 | std::ifstream f2(p2, std::ifstream::binary | std::ifstream::ate); 99 | 100 | // useful for debugging test 101 | auto logger = logger::GetOrCreateLogger("main()"); 102 | // logger->info("comparing {} with {}", p1, p2); 103 | 104 | if (f1.fail() || f2.fail()) { 105 | logger->info("comparing {} with {}", p1, p2); 106 | logger->info("some file can't be opened"); 107 | return false; // file problem 108 | } 109 | 110 | if (f1.tellg() != f2.tellg()) { 111 | logger->info("comparing {} with {}", p1, p2); 112 | logger->info("files sizes don't match {}, vs {}", f1.tellg(), f2.tellg()); 113 | return false; // size mismatch 114 | } 115 | 116 | // seek back to beginning and use std::equal to compare contents 117 | f1.seekg(0, std::ifstream::beg); 118 | f2.seekg(0, std::ifstream::beg); 119 | return std::equal(std::istreambuf_iterator(f1.rdbuf()), std::istreambuf_iterator(), 120 | std::istreambuf_iterator(f2.rdbuf())); 121 | } 122 | 123 | #endif 124 | -------------------------------------------------------------------------------- /tools/README.md: -------------------------------------------------------------------------------- 1 | # Tools 2 | A collection of miscellaneous tools to support the fstalign project. 3 | 4 | ## generate_wer_test_data.pl 5 | A simple perl script to generate synthetic transcripts with a targeted word error rate. Outputs will be written as plain text to `ref.out` and `hyp.out`. 6 | The script contains settings to generate specific INS/DEL/SUB error frequencies, in addition to target reference transcript length. This is useful for testing the WER behavior of `fstalign` and also the performance of the algorithm when hit with edge case scenarios (e.g. 80% deletion rate). 
7 | 8 | Example usage: 9 | `perl generate_wer_test_data.pl --ins_fract 0.2 --del_fract 0.3 --sub_fract 0.2 --ref_length 1000 --oref ref.out --ohyp hyp.out` 10 | 11 | Example output: 12 | ``` 13 | writing to [ref.out] 14 | writing to [hyp.out] 15 | 181 INS 16 | 316 DEL 17 | 205 SUB 18 | expected WER 0.702 19 | ``` 20 | 21 | NOTE: this script provides an approximate WER, the algorithm could use some fine tuning to be exact. 22 | 23 | ## gather_runtime_metrics.sh 24 | A simple bash script that is meant for benchmarking the resource (RAM and runtime) consumption of fstalign across different transcript settings (length, WER). It uses the `generate_wer_test_data.pl` to generate fake transcripts with a suite of hard-coded settings and runs them through fstalign, recording the resource usage to a CSV. 25 | 26 | Example usage: 27 | `bash gather_runtime_metrics.sh output_for_this_release.csv` 28 | 29 | ## sbs2fst.py 30 | A python interface to simplify the conversion of a side-by-side file, generated from fstalign's `--output-sbs` flag, into [files that can be used to produce an FST using OpenFST](https://www.openfst.org/twiki/bin/view/FST/FstQuickTour). 31 | 32 | Example usage: 33 | 34 | `python sbs2fst.py sbs_file.txt fst_file_name` 35 | 36 | The output will be two files: `fst_file_name.fst` which will describe the FST in the AT&T FSM format used by OpenFST, and `fst_file_name.txt` which contains the complete list of symbols in the FST. 37 | 38 | The additional flags can be passed into the python script to add metadata that fstalign uses for tracking performance. These are useful to understand when fstalign picks tokens that are: only in the side-by-side's `ref_token` column (labeled by the `--left` flag), only in the side-by-side's `hyp_token` column (labeled by the `--right` flag), or in both columns because the `ref_token` and `hyp_token` agree (labeled by the `--gold` flag). 
39 | 40 | Example usage: 41 | 42 | `python sbs2fst.py --tag --left VERBATIM --right NONVERBATIM --gold AGREEMENT sbs_file.txt fst_file_name` 43 | 44 | The output will produce an FST with tags indicating tokens that were only in the `ref_token` with `VERBATIM`, tokens that were only in the `hyp_token` with `NONVERBATIM`, and tokens that were in both columns with `AGREEMENT`. 45 | 46 | ### Compiling the FST 47 | Once you have used `sbs2fst.py` to produce the `.txt` and `.fst` files, you *must* then compile the FST before passing it into fstalign. An example command can be found below: 48 | 49 | `fstcompile --isymbols=${SYMBOLS} --osymbols=${SYMBOLS} ${TXT_FST} ${COMPILED_FST}` 50 | 51 | where `SYMBOLS` is the `.txt` file produced by `sbs2fst.py`, `TXT_FST` is the `.fst` file, and `COMPILED_FST` is a new `.fst` file that contains the binary FST usable by fstalign. 52 | 53 | Example usage: 54 | ```bash 55 | python sbs2fst.py --tag --left VERBATIM --right NONVERBATIM --gold AGREEMENT sbs_file.txt fst_file_name 56 | fstcompile --isymbols=fst_file_name.txt --osymbols=fst_file_name.txt fst_file_name.fst fst_file_name.compiled.fst 57 | ``` 58 | You can now use `fst_file_name.compiled.fst` in fstalign with the corresponding symbols file as follows: 59 | ```bash 60 | fstalign --ref fst_file_name.compiled.fst --symbols fst_file_name.txt ... 61 | ``` 62 | 63 | Note that when you use `sbs2fst.py` to produce a "tagged" FST with the `--tag` flag, fstalign will aggregate WER metrics for each of the specified tags (`--left`, `--right`, and `--gold`) in the JSON log file specified by fstalign's `--json-log` flag. 
64 | 65 | -------------------------------------------------------------------------------- /tools/gather_runtime_metrics.sh: -------------------------------------------------------------------------------- 1 | # Script to gather runtime metrics on fstalign binary 2 | 3 | benchmark_settings() { 4 | local outdir=$1 # directory to write refs, hyps, and stats to 5 | local ref_length=$2 # target number of words when making a synthetic reference 6 | local num_repeats=$3 # number of trials to run for this benchmark 7 | local ins_rate=$4 # target insertion rate when making a synthetic hypothesis 8 | local del_rate=$5 # target deletion rate when making a synthetic hypothesis 9 | local sub_rate=$6 # target substitution rate when making a synthetic hypothesis 10 | local outcsv=$7 # output to write comma separated stats to 11 | 12 | for i in $(seq $num_repeats); do 13 | perl generate_wer_test_data.pl --ins_fract $ins_rate \ 14 | --del_fract $del_rate \ 15 | --sub_fract $sub_rate \ 16 | --ref_length $ref_length \ 17 | --oref "${outdir}/ref${i}.txt" \ 18 | --ohyp "${outdir}/hyp${i}.txt" 19 | 20 | /usr/bin/time -v fstalign wer --ref "${outdir}/ref${i}.txt" \ 21 | --hyp "${outdir}/hyp${i}.txt" 2> "${outdir}/stats${i}.txt" 22 | 23 | runtime=$(grep "Elapsed (wall clock) time" "${outdir}/stats${i}.txt" | awk 'NF>1{print $NF}') 24 | ram=$(grep "Maximum resident set size" "${outdir}/stats${i}.txt" | awk 'NF>1{print $NF}') 25 | 26 | echo "${ref_length},${ins_rate},${del_rate},${sub_rate},${runtime},${ram}" >> "${outcsv}" 27 | done 28 | } 29 | 30 | main() { 31 | echo "$0 $@" # Print the command line for logging 32 | 33 | local outcsv=$1 34 | 35 | echo "length,ins,del,sub,runtime,ram" >> "${outcsv}" 36 | 37 | # Stage 1: medium transcripts, different WER 38 | dir="temp" 39 | mkdir "${dir}" 40 | benchmark_settings "${dir}" 1000 5 0.1 0.1 0.1 "${outcsv}" 41 | benchmark_settings "${dir}" 1000 5 0.2 0.2 0.2 "${outcsv}" 42 | benchmark_settings "${dir}" 1000 5 0.3 0.3 0.3 "${outcsv}" 43 | 
benchmark_settings "${dir}" 1000 5 0.1 0.1 0.4 "${outcsv}" 44 | benchmark_settings "${dir}" 1000 5 0.1 0.4 0.1 "${outcsv}" 45 | benchmark_settings "${dir}" 1000 5 0.4 0.1 0.1 "${outcsv}" 46 | 47 | # Stage 2: single WER, different length transcripts 48 | benchmark_settings "${dir}" 100 10 0.1 0.1 0.1 "${outcsv}" 49 | benchmark_settings "${dir}" 200 10 0.1 0.1 0.1 "${outcsv}" 50 | benchmark_settings "${dir}" 400 10 0.1 0.1 0.1 "${outcsv}" 51 | benchmark_settings "${dir}" 800 5 0.1 0.1 0.1 "${outcsv}" 52 | benchmark_settings "${dir}" 2000 5 0.1 0.1 0.1 "${outcsv}" 53 | benchmark_settings "${dir}" 4000 2 0.1 0.1 0.1 "${outcsv}" 54 | benchmark_settings "${dir}" 8000 2 0.1 0.1 0.1 "${outcsv}" 55 | benchmark_settings "${dir}" 16000 2 0.1 0.1 0.1 "${outcsv}" 56 | benchmark_settings "${dir}" 32000 2 0.1 0.1 0.1 "${outcsv}" 57 | } 58 | 59 | main "$@" 60 | -------------------------------------------------------------------------------- /tools/generate_wer_test_data.pl: -------------------------------------------------------------------------------- 1 | use strict; 2 | use Getopt::Long; 3 | 4 | my $in_ref; 5 | my $out_ref = "ref.out"; 6 | my $out_hyp = "hyp.out"; 7 | my $ins_fract = 0.1; 8 | my $del_fract = 0.1; 9 | my $sub_fract = 0.1; 10 | my $ref_length = 1000; 11 | 12 | my $voc_length = 10000; 13 | 14 | my $rc = GetOptions( 15 | "iref=s" =>\$in_ref, 16 | "oref=s" =>\$out_ref, 17 | "ohyp=s" =>\$out_hyp, 18 | "ins_fract=f" => \$ins_fract, 19 | "del_fract=f" => \$del_fract, 20 | "sub_fract=f" => \$sub_fract, 21 | "ref_length=i" => \$ref_length, 22 | "voc_length=i" => \$voc_length, 23 | ); 24 | 25 | die "check your commandline!\n" if(!$rc); 26 | 27 | # just making things slightly easier 28 | our %words; 29 | our $word_mass =0; 30 | my @ref_words; 31 | my @hyp_words; 32 | 33 | if(!defined($in_ref)){ 34 | for(my $i = 0; $i < $voc_length; $i++) 35 | { 36 | my $w = sprintf("w%06d", $i); 37 | # fix to get a non-uniform distribution 38 | my $mass = 1; 39 | $words{$w} = $mass; 40 | 
$word_mass += $mass; 41 | } 42 | 43 | for(my $i =0; $i < $ref_length; $i++){ 44 | my $w = select_word(); 45 | push(@ref_words, $w); 46 | } 47 | } else { 48 | open(FF, "<$in_ref") || die "couldn't open [$in_ref] for reading!"; 49 | while(my $l = <FF>){ 50 | chomp($l); 51 | $l=~s/\s*$//; 52 | my @wds = split(/\s+/, $l); 53 | foreach (@wds){ 54 | $words{$_}++; 55 | $word_mass += 1; 56 | push(@ref_words, $_); 57 | } 58 | } 59 | 60 | $ref_length = scalar(@ref_words); 61 | } 62 | 63 | my $num_ins = 0; 64 | my $num_del = 0; 65 | my $num_sub = 0; 66 | my $i = 0; 67 | my $last_was_del = 0; 68 | 69 | my $ins_thres = $ins_fract; 70 | my $del_thres = $ins_thres + $del_fract; 71 | my $sub_thres = $del_thres + $sub_fract; 72 | my $owed_ins = 0; 73 | 74 | # Algorithm is as follows: 75 | # Because "word error rate" is defined as a rate respective to the number of 76 | # reference words, we sample for an "error" while looping over a reference word 77 | # counter. The only thing we need to do is avoid consecutive INS+DEL or DEL+INS, 78 | # because these will be counted as SUB. Thus, for every INS sampled, we add a 79 | # ref word after the INS to avoid INS+DEL, or add to a counter to owed_ins if 80 | # a DEL just happened. 
81 | 82 | while($i < $ref_length) 83 | { 84 | my $r = rand(); 85 | my $rw = $ref_words[$i]; 86 | 87 | if($r <= $ins_thres) 88 | { 89 | if($last_was_del){ 90 | # let's not insert after a deletion, this looks like 91 | # a substitution 92 | $owed_ins++; 93 | 94 | # Add in a reference word to keep sampling 95 | push(@hyp_words, $rw); 96 | $i++; 97 | next; 98 | } else { 99 | # safe to insert, add an inserted word and also 100 | # add the reference word we are sampling 101 | my $ins_w = select_word(); 102 | push(@hyp_words, $ins_w); 103 | $num_ins++; 104 | 105 | push(@hyp_words, $rw); 106 | $i++; 107 | $last_was_del = 0; 108 | } 109 | } elsif($r < $del_thres){ 110 | $num_del++; 111 | $i++; 112 | $last_was_del = 1; 113 | } elsif($r < $sub_thres){ 114 | my $sub_w = select_word(); 115 | while($sub_w eq $rw) 116 | { 117 | $sub_w = select_word(); 118 | } 119 | 120 | $num_sub++; 121 | push(@hyp_words, $sub_w); 122 | $i++; 123 | $last_was_del = 0; 124 | } else { 125 | if(!$last_was_del){ 126 | # clean out the buffer of owed insertions 127 | while($owed_ins > 0){ 128 | my $ins_w = select_word(); 129 | push(@hyp_words, $ins_w); 130 | $num_ins++; 131 | $owed_ins--; 132 | } 133 | } 134 | 135 | $i++; 136 | # phew... a correct word... 
137 | push(@hyp_words, $rw); 138 | $last_was_del = 0; 139 | } 140 | } 141 | 142 | if(defined($out_ref)){ 143 | dump_words($out_ref, \@ref_words); 144 | } 145 | 146 | if(defined($out_hyp)){ 147 | dump_words($out_hyp, \@hyp_words); 148 | } 149 | 150 | print "$num_ins INS\n"; 151 | print "$num_del DEL\n"; 152 | print "$num_sub SUB\n"; 153 | printf "expected WER %.3f\n", ($num_ins + $num_del + $num_sub) / $ref_length; 154 | 155 | sub dump_words{ 156 | my ($ofn, $aref) = @_; 157 | 158 | print "writing to [$ofn]\n"; 159 | open(OUT, ">$ofn") || die "couldn't open [$ofn] for writing!"; 160 | foreach (@$aref){ 161 | print OUT $_," "; 162 | } 163 | print OUT "\n"; 164 | close(OUT); 165 | 166 | } 167 | 168 | 169 | 170 | 171 | sub select_word { 172 | my $r = int(rand($word_mass)); 173 | my $w; 174 | 175 | my $cur_sum = 0; 176 | while ( my ( $key, $value ) = each %words ) { 177 | $w = $key; 178 | $cur_sum += $value; 179 | if($r <= $cur_sum){ 180 | last; 181 | } 182 | } 183 | 184 | return $w; 185 | } -------------------------------------------------------------------------------- /tools/images/120_short_files.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/revdotcom/fstalign/82dec1dabec06a53b7f3c28b51dcd1b0a2dd2f1d/tools/images/120_short_files.png -------------------------------------------------------------------------------- /tools/images/120_vs_130_ram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/revdotcom/fstalign/82dec1dabec06a53b7f3c28b51dcd1b0a2dd2f1d/tools/images/120_vs_130_ram.png -------------------------------------------------------------------------------- /tools/images/120_vs_130_runtime.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/revdotcom/fstalign/82dec1dabec06a53b7f3c28b51dcd1b0a2dd2f1d/tools/images/120_vs_130_runtime.png 
-------------------------------------------------------------------------------- /tools/images/130_short_files.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/revdotcom/fstalign/82dec1dabec06a53b7f3c28b51dcd1b0a2dd2f1d/tools/images/130_short_files.png --------------------------------------------------------------------------------