├── .gitignore ├── .pre-commit-config.yaml ├── .secrets.baseline ├── CONTRIBUTING.md ├── Experiments.md ├── From_WER_and_RIL_to_MER_and_WIL_improved_evaluatio.pdf ├── LICENSE ├── README.md ├── analyze.py ├── config.ini.sample ├── config.py ├── experiment.py ├── models.py ├── optional_analyze_with_sclite.py ├── requirements.txt ├── sample-files ├── ibuprofen.wav ├── lipitor.wav ├── reference_transcriptions.csv ├── stt_transcriptions.csv ├── tylenol.wav ├── vicodin.wav ├── wer_details.csv └── wer_summary.json ├── test_config.py └── transcribe.py /.gitignore: -------------------------------------------------------------------------------- 1 | config.ini 2 | config_* 3 | reference_transcriptions.csv 4 | reference_transcriptions_* 5 | stt_transcriptions.csv 6 | wer_details.csv 7 | wer_summary.json 8 | wer_word_accuracy.csv 9 | audio_input* 10 | *.wav 11 | *.mp3 12 | __pycache__ 13 | .DS_Store 14 | experiments_* 15 | output* 16 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | # This is an example configuration to enable detect-secrets in the pre-commit hook. 2 | # Add this file to the root folder of your repository. 3 | # 4 | # Read pre-commit hook framework https://pre-commit.com/ for more details about the structure of config yaml file and how git pre-commit would invoke each hook. 5 | # 6 | # This line indicates we will use the hook from ibm/detect-secrets to run scan during committing phase. 7 | # Whitewater/whitewater-detect-secrets would sync code to ibm/detect-secrets upon merge. 8 | repos: 9 | - repo: https://github.com/ibm/detect-secrets 10 | # If you desire to use a specific version of detect-secrets, you can replace `master` with other git revisions such as branch, tag or commit sha. 
11 | # You are encouraged to use static refs such as tags, instead of branch names 12 | # 13 | # Running "pre-commit autoupdate" will automatically update rev to the latest tag 14 | rev: 0.13.1+ibm.46.dss 15 | hooks: 16 | - id: detect-secrets # pragma: whitelist secret 17 | # Add options for the detect-secrets-hook binary. You can run `detect-secrets-hook --help` to list all possible options. 18 | # You may also run `pre-commit run detect-secrets` to preview the scan result. 19 | # With "--baseline" but without "--use-all-plugins", pre-commit scans with just the plugins in the baseline file 20 | # With "--baseline" and "--use-all-plugins", pre-commit scans with all available plugins 21 | # Add "--fail-on-non-audited" to fail pre-commit for unaudited potential secrets 22 | args: [--baseline, .secrets.baseline, --use-all-plugins ] 23 | -------------------------------------------------------------------------------- /.secrets.baseline: -------------------------------------------------------------------------------- 1 | { 2 | "exclude": { 3 | "files": "^.secrets.baseline$", 4 | "lines": null 5 | }, 6 | "generated_at": "2021-11-29T21:06:43Z", 7 | "plugins_used": [ 8 | { 9 | "name": "AWSKeyDetector" 10 | }, 11 | { 12 | "name": "ArtifactoryDetector" 13 | }, 14 | { 15 | "name": "AzureStorageKeyDetector" 16 | }, 17 | { 18 | "base64_limit": 4.5, 19 | "name": "Base64HighEntropyString" 20 | }, 21 | { 22 | "name": "BasicAuthDetector" 23 | }, 24 | { 25 | "name": "BoxDetector" 26 | }, 27 | { 28 | "name": "CloudantDetector" 29 | }, 30 | { 31 | "ghe_instance": "github.ibm.com", 32 | "name": "GheDetector" 33 | }, 34 | { 35 | "name": "GitHubTokenDetector" 36 | }, 37 | { 38 | "hex_limit": 3, 39 | "name": "HexHighEntropyString" 40 | }, 41 | { 42 | "name": "IbmCloudIamDetector" 43 | }, 44 | { 45 | "name": "IbmCosHmacDetector" 46 | }, 47 | { 48 | "name": "JwtTokenDetector" 49 | }, 50 | { 51 | "keyword_exclude": null, 52 | "name": "KeywordDetector" 53 | }, 54 | { 55 | "name": "MailchimpDetector" 56 | },
57 | { 58 | "name": "NpmDetector" 59 | }, 60 | { 61 | "name": "PrivateKeyDetector" 62 | }, 63 | { 64 | "name": "SlackDetector" 65 | }, 66 | { 67 | "name": "SoftlayerDetector" 68 | }, 69 | { 70 | "name": "SquareOAuthDetector" 71 | }, 72 | { 73 | "name": "StripeDetector" 74 | }, 75 | { 76 | "name": "TwilioKeyDetector" 77 | } 78 | ], 79 | "results": {}, 80 | "version": "0.13.1+ibm.46.dss", 81 | "word_list": { 82 | "file": null, 83 | "hash": null 84 | } 85 | } 86 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing 2 | Thanks for your interest in this tool! We are glad you have found it and hope that you can use and improve it. Contributions are welcome! 3 | 4 | ## How to contribute an issue 5 | We welcome issues and enhancement requests on our [Issues Tracker](https://github.com/IBM/watson-stt-wer-python/issues). 6 | 7 | When reporting an issue, please be as specific as possible including: 8 | * Python version & OS 9 | * Error message(s) encountered (if any) 10 | * Configuration values 11 | 12 | If you have a problem using our tool, please open an issue! 13 | 14 | ## How to contribute code 15 | We use a Pull Request process with a required review from a maintainer. 16 | 17 | Before submitting your pull request, consider the following questions: 18 | * Has the documentation been updated to support this change? 19 | * Is the change backward-compatible to older configuration files? 20 | * Are new dependencies introduced, and are they included in the `requirements.txt`? 21 | * Does the change introduce complexity for novice users? Are optional features clearly marked optional? 22 | * Are debugging `print` statements removed? 23 | * Does the change use descriptive variable names? 
24 | 25 | Please also review the commits: 26 | * Include an issue number in the commit message, to link the commit, issue, and pull request 27 | * Include a descriptive message 28 | * Sign the commits to certify your Developer Certificate of Origin (DCO) 29 | 30 | For example, use: `git commit -sm "#issue_number - "` 31 | 32 | # Code of Conduct 33 | ## Our Pledge 34 | 35 | We as members, contributors, and leaders pledge to make participation in our 36 | community a harassment-free experience for everyone, regardless of age, body 37 | size, visible or invisible disability, ethnicity, sex characteristics, gender 38 | identity and expression, level of experience, education, socio-economic status, 39 | nationality, personal appearance, race, caste, color, religion, or sexual 40 | identity and orientation. 41 | 42 | We pledge to act and interact in ways that contribute to an open, welcoming, 43 | diverse, inclusive, and healthy community. 44 | 45 | ## Our Standards 46 | 47 | Examples of behavior that contributes to a positive environment for our 48 | community include: 49 | 50 | * Demonstrating empathy and kindness toward other people 51 | * Being respectful of differing opinions, viewpoints, and experiences 52 | * Giving and gracefully accepting constructive feedback 53 | * Accepting responsibility and apologizing to those affected by our mistakes, 54 | and learning from the experience 55 | * Focusing on what is best not just for us as individuals, but for the overall 56 | community 57 | 58 | Examples of unacceptable behavior include: 59 | 60 | * The use of sexualized language or imagery, and sexual attention or advances of 61 | any kind 62 | * Trolling, insulting or derogatory comments, and personal or political attacks 63 | * Public or private harassment 64 | * Publishing others' private information, such as a physical or email address, 65 | without their explicit permission 66 | * Other conduct which could reasonably be considered inappropriate in a 67 | professional
setting 68 | 69 | ## Attribution 70 | 71 | This Code of Conduct is adapted from the [Contributor Covenant](https://www.contributor-covenant.org/), 72 | version 2.1, available at 73 | https://www.contributor-covenant.org/version/2/1/code_of_conduct.html. -------------------------------------------------------------------------------- /Experiments.md: -------------------------------------------------------------------------------- 1 | # Fine Tuning Your Configuration 2 | 3 | In its default state, Watson STT provides very accurate transcriptions across a wide range of words, accents, and audio interference. However, should you want to get under the hood and fine-tune things, Watson STT has you covered. With settings for speech sensitivity, background audio suppression, and customization model bias, to name a few, Watson STT gives you the ability to optimize your solution to fit your business and customer needs. 4 | 5 | ## Sounds great, how do I optimize the fine tuning, though? 6 | Getting that perfect balance can be tricky. Manually cycling through parameter values, or taking guesses at optimal settings, takes time and will often not produce the best results for your solution. To know for sure which settings provide the best transcriptions, you'll want to perform a grid search on the settings you care about. When complete, you'll be able to make a confident decision on what to set each parameter to, knowing that those decisions will verifiably lead to better transcriptions for your use case. 7 | 8 | ## What is grid search? 9 | Grid search is a fancy term for cycling through all combinations of various parameter values to find the optimal ones. In the case of an STT grid search, the optimal values are the ones that produce the most accurate transcriptions for a given test set. During grid search, a given iteration will set the parameters to specific values, run a test set of audio files through STT, and analyze the results for accuracy.
It then increments one of the parameters and starts a new iteration. After all possible combinations of values have been attempted and analyzed, results are provided showing the accuracy of each iteration and the values used during that iteration. 10 | 11 | ## This can be automated, right? 12 | Of course! In fact, it already has been! Within the https://github.com/IBM/watson-stt-wer-python repo there is a script called `experiment.py` to facilitate grid search against Watson STT. Here's how it works: 13 | 14 | Clone the repo to your local machine 15 | 16 | ```git clone git@github.com:IBM/watson-stt-wer-python.git``` 17 | 18 | Follow the installation instructions in the repository's README - https://github.com/IBM/watson-stt-wer-python#installation 19 | 20 | Select audio files to use as a test set. Note that the more files you include, the more representative the results will be, but also the longer the grid search will take. Store this test set in a unique directory. 21 | 22 | Create a reference file containing audio filenames and their associated correct transcriptions. For example: 23 | ``` 24 | Audio File Name,Reference 25 | ./sample-files/sample_audio/vicodin.wav,I will prescribe you some Vicodin 26 | ./sample-files/sample_audio/lipitor.wav,"To deal with your bad cholesterol, Lipitor would be a good option" 27 | ./sample-files/sample_audio/tylenol.wav,Take two Tylenol for your fever 28 | ./sample-files/sample_audio/ibuprofen.wav,Ibuprofen is good for your muscle aches 29 | ``` 30 | 31 | Update the `config.ini` file with: 32 | 1. The `apikey`, `service_url`, and `base_model_name` of the service to optimize. 33 | 1. If using a custom model, include the `language_model_id` and/or `acoustic_model_id`. 34 | 1. The `reference_transcriptions_file` you created. 35 | 1. The `audio_file_folder` containing your test set of audio files. 36 | 1. The filename (`stt_transcriptions_file`) where transcripts should be stored. 37 | 1. The parameters you wish to optimize.
If you would like to omit parameters from the grid search, set their `_min` and `_max` values to the same value, which will be used in each iteration. 38 | 1. `sds_*` controls the `speech_detector_sensitivity` parameter 39 | 1. `bias_*` controls the `character_insertion_bias` parameter 40 | 1. `cust_weight_*` controls the `customization_weight` parameter 41 | 1. `bas_*` controls the `background_audio_suppression` parameter 42 | 1. `end_of_phrase_silence_time_*` controls the `end_of_phrase_silence_time` parameter 43 | For example, the following will iterate through `customization_weight` from `0` to `0.3` at `0.1` increments, while keeping the other parameters static: 44 | ``` 45 | [Experiments] 46 | sds_min=0.7 47 | sds_max=0.7 48 | sds_step=0.1 49 | bias_min=0 50 | bias_max=0.0 51 | bias_step=0.1 52 | cust_weight_min=0 53 | cust_weight_max=0.3 54 | cust_weight_step=0.1 55 | bas_min=0 56 | bas_max=0.0 57 | bas_step=0.1 58 | end_of_phrase_silence_time_min=0 59 | end_of_phrase_silence_time_max=0.0 60 | end_of_phrase_silence_time_step=0.1 61 | ``` 62 | 63 | Run the grid search 64 | ``` 65 | python experiment.py --config_file config.ini --log_level INFO 66 | 67 | 2023-05-04 11:13:34,607 - INFO - Running Experiment -- Character Insertion Bias: 0.0, Customization Weight: 0.0, Speech Detector Sensitivity: 0.7, Background Audio Suppression: 0.0 68 | 2023-05-04 11:13:44,665 - INFO - Completed transcribing 4 files out of 4 69 | 2023-05-04 11:13:44,666 - INFO - Wrote transcriptions for 4 audio files to ./sample-files/bias_0.0_weight_0.0_sds_0.7_bas_0.0/stt_transcriptions.csv 70 | 2023-05-04 11:13:44,676 - INFO - Updated ./sample-files/bias_0.0_weight_0.0_sds_0.7_bas_0.0/stt_transcriptions.csv with reference transcriptions 71 | 2023-05-04 11:13:44,680 - INFO - Created ctm file - ./sample-files/bias_0.0_weight_0.0_sds_0.7_bas_0.0/stt_transcriptions.ctm 72 | 2023-05-04 11:13:44,687 - INFO - Created stm file - 
./sample-files/bias_0.0_weight_0.0_sds_0.7_bas_0.0/stt_transcriptions.stm 73 | 2023-05-04 11:13:44,709 - INFO - Created summary file - /Users/gecock/Desktop/watson-stt-wer-python-fork/watson-stt-wer-python/sample-files/bias_0.0_weight_0.0_sds_0.7_bas_0.0/sclite_wer_summary.json 74 | 2023-05-04 11:13:44,716 - INFO - Experiment Complete 75 | 76 | 2023-05-04 11:13:44,716 - INFO - Running Experiment -- Character Insertion Bias: 0.0, Customization Weight: 0.1, Speech Detector Sensitivity: 0.7, Background Audio Suppression: 0.0 77 | 2023-05-04 11:13:56,141 - INFO - Completed transcribing 4 files out of 4 78 | ... 79 | 2023-05-04 11:14:13,783 - INFO - 80 | | | task | Substitutions | Deletions | Insertions | Word Error Rate | Sentence Error Rate | Total Words | Total Sentences | 81 | |---:|:------------------------------------|----------------:|------------:|-------------:|------------------:|----------------------:|--------------:|------------------:| 82 | | 0 | bias_0.0_weight_0.0_sds_0.7_bas_0.0 | 16.1 | 0 | 16.1 | 32.3 | 100 | 31 | 4 | 83 | | 1 | bias_0.0_weight_0.1_sds_0.7_bas_0.0 | 16.1 | 0 | 16.1 | 32.3 | 100 | 31 | 4 | 84 | | 2 | bias_0.0_weight_0.3_sds_0.7_bas_0.0 | 16.1 | 3.2 | 12.9 | 32.3 | 100 | 31 | 4 | 85 | | 3 | bias_0.0_weight_0.2_sds_0.7_bas_0.0 | 16.1 | 0 | 16.1 | 32.3 | 100 | 31 | 4 | 86 | ``` 87 | 88 | View the results in the file `all_summaries.csv` to see a concise view of all iterations, or, view the details of each iteration by looking at the set of directories created in the format `bias__weight__sds__bas_` 89 | 90 | -------------------------------------------------------------------------------- /From_WER_and_RIL_to_MER_and_WIL_improved_evaluatio.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/watson-stt-wer-python/661c329230c588f71046228ffbca4737f5d19d1f/From_WER_and_RIL_to_MER_and_WIL_improved_evaluatio.pdf 
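The parameter sweep described in Experiments.md can be sketched in a few lines of Python. This is a hypothetical illustration of the grid-search loop (not the actual implementation inside `experiment.py`); the `frange` helper and the `grid` bounds mirror the `_min`/`_max`/`_step` values from the sample config above.

```python
import itertools

def frange(lo, hi, step):
    """Inclusive float range, mirroring the _min/_max/_step config values."""
    vals, v = [], lo
    while v <= hi + 1e-9:          # small epsilon guards against float drift
        vals.append(round(v, 10))
        v += step
    return vals

# Bounds taken from the sample [Experiments] config: only
# customization_weight varies; the other parameters are held static.
grid = {
    "speech_detector_sensitivity": frange(0.7, 0.7, 0.1),
    "character_insertion_bias": frange(0.0, 0.0, 0.1),
    "customization_weight": frange(0.0, 0.3, 0.1),
    "background_audio_suppression": frange(0.0, 0.0, 0.1),
}

for combo in itertools.product(*grid.values()):
    params = dict(zip(grid.keys(), combo))
    # Each iteration would transcribe the test set with these params,
    # then score the transcripts for WER (omitted in this sketch).
    print(params)
```

With the sample bounds this yields four iterations, matching the four experiment directories shown in the log output above.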
-------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 
39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # STT-WER-Python 2 | Utilities for 3 | * [Transcribing](#transcription) a set of audio files with Speech to Text (STT) 4 | * [Analyzing](#analysis) the error rate of the STT transcription against a known-good transcription 5 | * [Experimenting](#experimenting) with various parameters to find optimal values 6 | 7 | ## More documentation 8 | This readme describes the tools in depth. 
For more information on use cases and methodology, please see the following articles: 9 | * [New Python Scripts to Measure Word Error Rate on Watson Speech to Text](https://medium.com/@marconoel/new-python-scripts-to-measure-word-error-rate-on-watson-speech-to-text-77ecaa513f60): How to use these tools, including a YouTube video demonstration 10 | * [New Speech Testing Utilities for Conversational AI Projects](https://medium.com/ibm-watson-speech-services/new-speech-testing-utilities-for-conversational-ai-projects-bf73debe19be): Describes recipe for using Text to Speech to "bootstrap" testing data 11 | * [Data Collection and Training for Speech Projects](https://medium.com/ibm-data-ai/data-collection-and-training-for-speech-projects-22004c3e84fb): How to collect test data from human voices. 12 | * [How to Train your Speech to Text Dragon](https://medium.com/ibm-watson/watson-speech-to-text-how-to-train-your-own-speech-dragon-part-1-data-collection-and-fdd8cea4f4b8) 13 | * [A mental model for Speech to Text training](https://medium.com/ibm-watson-speech-services/a-mental-model-for-speech-to-text-training-8c56a4105e25) 14 | 15 | You may also find useful: 16 | * [TTS-Python](https://github.com/IBM/watson-tts-python) - companion tooling for IBM Text to Speech 17 | 18 | ## Installation 19 | Requires Python 3.x installation. 20 | 21 | All of the watson-stt-wer-python dependencies are installed at once with `pip`: 22 | 23 | ``` 24 | pip install -r requirements.txt 25 | ``` 26 | 27 | **Note:** If receiving an SSL Certificate error (CERTIFICATE_VERIFY_FAILED) when running the python scripts, try the following commands to tell python to use the system certificate store. 
28 | 29 | **_Windows_** 30 | ``` 31 | pip install --trusted-host pypi.org --trusted-host files.pythonhosted.org python-certifi-win32 32 | ``` 33 | 34 | **_MacOS_** 35 | 36 | Open a terminal and change to the location of your Python installation to execute `Install Certificates.command`, for example: 37 | ``` 38 | cd "/Applications/Python 3.6" 39 | "./Install Certificates.command" 40 | ``` 41 | 42 | ## Setup 43 | Create a copy of `config.ini.sample`. You'll modify this file in subsequent steps. 44 | ``` 45 | cp config.ini.sample config.ini 46 | ``` 47 | 48 | Each sub-section below describes the configuration parameters it needs. 49 | 50 | # Generic Command-Line parameters 51 | 52 | `--config_file` or `-c` is the configuration file to be used. The default is `config.ini` 53 | 54 | `--log_level` or `-ll` is the log level to be used when running the script. Supported levels are as follows: 55 | - `ERROR` -- Print out only when things fail. 56 | - `WARN` -- Print out cautions and when things fail. 57 | - `INFO` -- (Default) Print out useful status, cautions, and when things fail. 58 | - `DEBUG` -- Print out every possible message. 59 | 60 | # Transcription 61 | Uses the IBM Watson Speech to Text service to transcribe a folder full of audio files. Creates a CSV with transcriptions. 62 | 63 | 64 | 65 | ## Setup 66 | Update the parameters in your `config.ini` file. 67 | 68 | Required configuration parameters: 69 | * apikey - API key for your Speech to Text instance 70 | * service_url - Reference URL for your Speech to Text instance 71 | * base_model_name - Base model for Speech to Text transcription 72 | 73 | Optional configuration parameters: 74 | * max_threads - Maximum number of threads to use with `transcribe.py` to improve performance.
75 | * language_model_id - Language model customization ID (comment out to use base model) 76 | * acoustic_model_id - Acoustic model customization ID (comment out to use base model) 77 | * grammar_name - Grammar name (comment out to use base model) 78 | * stt_transcriptions_file - Output file for Speech to Text transcriptions 79 | * audio_file_folder - Input directory containing your audio files 80 | * reference_transcriptions_file - Reference file for manually transcribed audio files ("labeled data" or "ground truth"). If present, will be merged into `stt_transcriptions_file` as a "Reference" column 81 | * stemming - If True, pre-processing stems words with the Porter stemmer. Stemming will treat the singular and plural forms of a word as equivalent, rather than as a word error. 82 | 83 | 84 | 85 | ## Execution 86 | Assuming your configuration is in `config.ini`, transcribe all the audio files in the `audio_file_folder` parameter via the following command: 87 | 88 | ``` 89 | python transcribe.py --config_file config.ini --log_level DEBUG 90 | ``` 91 | 92 | See [Generic Command Line Parameters](#generic-command-line-parameters) for more details. 93 | 94 | ## Output 95 | Transcriptions will be stored in a CSV file named by the `stt_transcriptions_file` parameter, with a format like below: 96 | 97 | Audio File|Transcription 98 | -----|----- 99 | file1.wav|The quick brown fox 100 | file2.wav|jumped over the lazy dog 101 | 102 | A third column, "Reference", will be included with the reference transcription if a `reference_transcriptions_file` is provided as a source. 103 | 104 | # Analysis 105 | A simple Python package to approximate the Word Error Rate (WER), Match Error Rate (MER), Word Information Lost (WIL) and Word Information Preserved (WIP) of one or more transcripts. 106 | 107 | ## Setup 108 | Your config file must have references for the `reference_transcriptions_file` and `stt_transcriptions_file` properties.
109 |
110 | * **Reference file** (`reference_transcriptions_file`) is a CSV file with at least the columns `Audio File Name` and `Reference`. The `Reference` is the actual transcription of the audio file (also known as the "ground truth" or "labeled data"). NOTE: Make sure the audio file name includes the full path (e.g. `./audio1.wav`)
111 | * **Hypothesis file** (`stt_transcriptions_file`) is a CSV file with at least the columns `Audio File Name` and `Hypothesis`. The `Hypothesis` is the transcription of the audio file by the Speech to Text engine. The `transcribe.py` script can create this file.
112 |
113 | ## Results
114 | * **Details** (`details_file`) is a CSV file with a row for each audio sample, including the reference and hypothesis transcriptions and the specific transcription errors
115 | * **Summary** (`summary_file`) is a JSON file with metrics for total transcriptions and overall word and sentence error rates.
116 | * **Accuracy** (`word_accuracy_file`) is a CSV file with per-word accuracy statistics
117 |
118 | ## Metrics (Definitions)
119 | - WER (word error rate), commonly used in ASR assessment, measures the cost of restoring the output word sequence to the original input sequence.
120 | - MER (match error rate) is the proportion of I/O word matches which are errors.
121 | - WIL (word information lost) is a simple approximation to the proportion of word information lost. It overcomes the problems associated with the RIL (relative information lost) measure that was proposed half a century ago.
122 |
123 | ## Background on supporting library
124 | Repo of the Python module JIWER: https://pypi.org/project/jiwer/
125 |
126 | It computes the minimum-edit distance between the ground-truth sentence and the hypothesis sentence of a speech-to-text API.
127 | The minimum-edit distance is calculated using the Python C module python-Levenshtein.
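To make the metric concrete, here is a minimal, dependency-free sketch of the word-level minimum-edit-distance computation that underlies WER (illustrative only; `analyze.py` relies on the jiwer library itself):

```python
def wer(reference: str, hypothesis: str) -> float:
    """Word error rate: word-level Levenshtein distance / reference length."""
    ref, hyp = reference.split(), hypothesis.split()
    # d[i][j] = edit distance between the first i reference words
    # and the first j hypothesis words
    d = [[0] * (len(hyp) + 1) for _ in range(len(ref) + 1)]
    for i in range(len(ref) + 1):
        d[i][0] = i
    for j in range(len(hyp) + 1):
        d[0][j] = j
    for i in range(1, len(ref) + 1):
        for j in range(1, len(hyp) + 1):
            cost = 0 if ref[i - 1] == hyp[j - 1] else 1
            d[i][j] = min(d[i - 1][j] + 1,          # deletion
                          d[i][j - 1] + 1,          # insertion
                          d[i - 1][j - 1] + cost)   # substitution or match
    return d[len(ref)][len(hyp)] / len(ref)

# One substitution out of four reference words -> WER = 0.25
print(wer("the quick brown fox", "the quick brown box"))
```

jiwer additionally applies text normalization (case, punctuation, whitespace) before alignment, which this sketch omits.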
128 |
129 | ## Execution
130 |
131 | ```
132 | python analyze.py --config_file config.ini --log_level DEBUG
133 | ```
134 |
135 | See [Generic Command Line Parameters](#generic-command-line-parameters) for more details.
136 |
137 | # Analysis with sclite
138 | This repo provides a wrapper script, `optional_analyze_with_sclite.py`, to run `sclite`, an open source tool designed to evaluate STT transcription results. `sclite` goes beyond regular WER and SER reporting to provide reports such as Confusion Pairs, which shows exactly which words were substituted with what, and Text Alignment, which shows the inline differences between the reference and transcribed texts. For more information about the output of `optional_analyze_with_sclite.py`, see the Results sub-section below. For more information about `sclite`, see https://people.csail.mit.edu/joe/sctk-1.2/doc/sclite.htm#sclite_name_0.
139 |
140 | ## Setup
141 | 1. `reference_transcriptions_file` and `stt_transcriptions_file` must be populated in `config.ini` and exist on the filesystem.
142 | 1. `sclite_directory` must be uncommented and populated with the directory that holds the `sclite` executable.
143 | 1. To install `sclite`, follow the instructions at https://github.com/usnistgov/SCTK#sctk-basic-installation
144 |
145 | ## Execution
146 |
147 | ```
148 | python optional_analyze_with_sclite.py --config_file config.ini --log_level INFO
149 | ```
150 | See [Generic Command Line Parameters](#generic-command-line-parameters) for more details.
151 |
152 | ## Results
153 | 1. `sclite_wer_summary.json` -- A concise summary of metrics
154 | 1. `*.sys` -- A summary file showing the number of words, sentences, deletions, insertions, substitutions, word error rate, and sentence error rate.
155 | 1. `*.prf` -- A text alignment file that shows, for each audio file, the reference text and transcribed text, and for each word whether it was inserted, deleted, substituted, or correct.
156 | 1.
`*.dtl` -- A detail file showing confusion pairs and which specific words were inserted, deleted, or substituted.
157 |
158 | There will also be the following two files that were created for use by `sclite` but are not direct outputs of `sclite`:
159 | 1. `*.ctm` -- A file containing a line for each transcribed word of each audio file
160 | 1. `*.stm` -- A file containing a reformatted version of the `reference_transcriptions_file` that `sclite` uses for evaluation
161 |
162 | # Experimenting
163 | Use the `experiment.py` script to execute a series of transcription/analysis experiments to optimize Speech to Text parameters.
164 |
165 | ## Setup
166 |
167 | Follow the setup for [Transcribing](#transcription).
168 |
169 | Follow the setup for [Analyzing](#analysis).
170 |
171 | The following parameters in `[Experiments]` all have a `*_min` and `*_max` variant to specify the lower and upper limit, respectively, for the corresponding `[SpeechToText]` parameter, and a `*_step` variant to specify the amount by which to increase that parameter in each experiment:
172 | 1. `sds_*` controls the `speech_detector_sensitivity` parameter
173 | 1. `bias_*` controls the `character_insertion_bias` parameter
174 | 1. `cust_weight_*` controls the `customization_weight` parameter
175 | 1. `bas_*` controls the `background_audio_suppression` parameter
176 | 1. `end_of_phrase_silence_time_*` controls the `end_of_phrase_silence_time` parameter
177 |
178 | Note: If you want to use `sclite` for analysis of each experiment, be sure to configure `sclite_directory` under the `[ErrorRateOutput]` section.
179 |
180 | ## Execution
181 |
182 | ```
183 | python experiment.py --config_file config.ini --log_level INFO
184 | ```
185 |
186 | See [Generic Command Line Parameters](#generic-command-line-parameters) for more details.
187 |
188 | ## Results
189 | Each experiment creates a unique directory based on the parameters of that experiment, in the format `bias__weight__sds__bas_`.
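The `*_min`/`*_max`/`*_step` sweep described under Setup amounts to a grid of parameter combinations, one output directory per combination. A minimal sketch with hypothetical ranges (the real values come from the `[Experiments]` section of `config.ini`, and the real directory-name format may differ):

```python
import itertools

def frange(lo, hi, step):
    """Inclusive float range, mirroring the *_min/*_max/*_step settings."""
    vals = []
    v = lo
    while v <= hi + 1e-9:          # tolerance guards against float drift
        vals.append(round(v, 3))
        v += step
    return vals

# Hypothetical ranges for two of the sweepable parameters.
sds_values = frange(0.3, 0.7, 0.2)     # speech_detector_sensitivity
bias_values = frange(-0.2, 0.2, 0.2)   # character_insertion_bias

# One experiment (and one output directory) per combination.
for sds, bias in itertools.product(sds_values, bias_values):
    out_dir = f"bias_{bias}_sds_{sds}"
    print(out_dir)
```

Adding `cust_weight_*`, `bas_*`, and `end_of_phrase_silence_time_*` ranges simply extends the `itertools.product` call, so the number of experiments grows multiplicatively with each swept parameter.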
190 |
191 | For each experiment, the output files from [Transcribing](#transcription) and [Analyzing](#analysis) will be created in its unique output directory.
192 |
193 | A final file called `all_summaries.csv` will be created, containing the summary of all experiments in a single CSV.
194 |
195 | # Model training
196 | The `models.py` script has wrappers for many model-related tasks, including creating models, updating training contents, getting model details, and training models.
197 |
198 | ## Setup
199 | Update the parameters in your `config.ini` file.
200 |
201 | Required configuration parameters:
202 | * apikey - API key for your Speech to Text instance
203 | * service_url - Reference URL for your Speech to Text instance
204 | * base_model_name - Base model for Speech to Text transcription
205 |
206 | ## Execution
207 | For general help, execute:
208 | ```
209 | python models.py
210 | ```
211 |
212 | The script requires a type (one of `base_model`, `custom_model`, `corpus`, `word`, `grammar`) and an operation (one of `list`, `get`, `create`, `update`, `delete`).
213 | The script optionally takes a config file as an argument with `-c config_file_name_goes_here`; otherwise it uses the default file `config.ini`, which contains the connection details for your Speech to Text instance.
214 | Depending on the specified operation, the script also accepts a name, description, and file for an associated resource. For instance, a new custom model should have a name and description, and a corpus should have a name and an associated file.
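The connection details the scripts read from `config.ini` can be loaded with Python's standard `configparser` module. A minimal sketch; the inline sample is illustrative (key names follow the Setup list above, and the `[SpeechToText]` section name follows the conventions this README describes):

```python
import configparser

# Illustrative config contents; the real scripts call config.read("config.ini").
sample = """
[SpeechToText]
apikey = YOUR_API_KEY
service_url = https://api.us-south.speech-to-text.watson.cloud.ibm.com
base_model_name = en-US_NarrowbandModel
"""

config = configparser.ConfigParser()
config.read_string(sample)

# The three required parameters from the Setup section.
apikey = config.get("SpeechToText", "apikey")
service_url = config.get("SpeechToText", "service_url")
base_model = config.get("SpeechToText", "base_model_name")
print(base_model)
```

Keeping one config file per custom model (e.g. `config.ini.model1`, passed with `-c`) is what lets the examples below target a specific `customization_id`.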
215 |
216 | ## Examples
217 |
218 | List all base models:
219 | ```
220 | python models.py -o list -t base_model
221 | ```
222 |
223 | List all custom models:
224 | ```
225 | python models.py -o list -t custom_model
226 | ```
227 |
228 | Create a custom model:
229 | ```
230 | python models.py -o create -t custom_model -n "model1" -d "my first model"
231 | ```
232 |
233 | Add a corpus file for a custom model (the custom model's customization_id is stored in `config.ini.model1`, and `corpus1.txt` contains the corpus contents):
234 | ```
235 | python models.py -c config.ini.model1 -o create -n "corpus1" -f "corpus1.txt" -t corpus
236 | ```
237 |
238 | Create corpora for all corpus files in a directory (the filename will be used for the corpus name):
239 | ```
240 | python models.py -c config.ini.model1 -o create -t corpus -dir corpus-dir
241 | ```
242 |
243 | List all corpora for a custom model (the custom model's customization_id is stored in `config.ini.model1`):
244 | ```
245 | python models.py -c config.ini.model1 -o list -t corpus
246 | ```
247 |
248 | Train a custom model (the custom model's customization_id is stored in `config.ini.model1`):
249 | ```
250 | python models.py -c config.ini.model1 -o update -t custom_model
251 | ```
252 |
253 | Note that some parameter combinations are not possible. All supported operations wrap the SDK methods documented at https://cloud.ibm.com/apidocs/speech-to-text.
254 |
255 | # Sample setup for organizing multiple experiments
256 | Instructions for creating a directory structure to organize input and output files for experiments across multiple models. Creating a new directory structure is recommended for each new model being experimented with or tested. A sample `MemberID` model is shown.
257 | 1. Start from the root of the WER tool directory, `cd WATSON-STT-WER-PYTHON`
258 | 1. Create a project directory, `mkdir -p `
259 | 1. e.g. `mkdir -p ClientName-data`
260 | 1. Create an audio directory, `mkdir -p /audios/